diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,62820 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 8966, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0, + "eval_loss": 2.3125102519989014, + "eval_runtime": 636.0298, + "eval_samples_per_second": 99.225, + "eval_steps_per_second": 1.552, + "step": 0 + }, + { + "epoch": 0.00022306491188935982, + "grad_norm": 447.7063293457031, + "learning_rate": 0.0, + "loss": 2.6996, + "step": 1 + }, + { + "epoch": 0.00044612982377871963, + "grad_norm": 410.13433837890625, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.5839, + "step": 2 + }, + { + "epoch": 0.0006691947356680794, + "grad_norm": 514.7770385742188, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.538, + "step": 3 + }, + { + "epoch": 0.0008922596475574393, + "grad_norm": 73.62993621826172, + "learning_rate": 6.000000000000001e-07, + "loss": 2.5648, + "step": 4 + }, + { + "epoch": 0.001115324559446799, + "grad_norm": 35.405982971191406, + "learning_rate": 8.000000000000001e-07, + "loss": 2.5318, + "step": 5 + }, + { + "epoch": 0.0013383894713361588, + "grad_norm": 31.43743133544922, + "learning_rate": 1.0000000000000002e-06, + "loss": 2.4902, + "step": 6 + }, + { + "epoch": 0.0015614543832255187, + "grad_norm": 32.07968521118164, + "learning_rate": 1.2000000000000002e-06, + "loss": 2.4386, + "step": 7 + }, + { + "epoch": 0.0017845192951148785, + "grad_norm": 19.430936813354492, + "learning_rate": 1.4000000000000001e-06, + "loss": 2.0901, + "step": 8 + }, + { + "epoch": 0.002007584207004238, + "grad_norm": 20.800960540771484, + "learning_rate": 1.6000000000000001e-06, + "loss": 2.1737, + "step": 9 + }, + { + "epoch": 0.002230649118893598, + "grad_norm": 20.56642723083496, + "learning_rate": 1.8000000000000001e-06, + "loss": 2.1823, + "step": 10 + }, + { + "epoch": 0.002453714030782958, + "grad_norm": 20.692852020263672, + "learning_rate": 2.0000000000000003e-06, + "loss": 2.1278, + "step": 11 + }, + { + "epoch": 0.0026767789426723177, + "grad_norm": 14.481375694274902, + "learning_rate": 2.2e-06, + "loss": 1.9787, + "step": 12 + }, + { + "epoch": 0.0028998438545616775, + "grad_norm": 13.366188049316406, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.9635, + "step": 13 + }, + { + "epoch": 0.0031229087664510374, + "grad_norm": 14.910027503967285, + "learning_rate": 2.6e-06, + "loss": 1.8838, + "step": 14 + }, + { + "epoch": 0.0033459736783403972, + "grad_norm": 13.10051441192627, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.8825, + "step": 15 + }, + { + "epoch": 0.003569038590229757, + "grad_norm": 14.625020027160645, + "learning_rate": 3e-06, + "loss": 1.6607, + "step": 16 + }, + { + "epoch": 0.0037921035021191165, + "grad_norm": 9.08655834197998, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.5911, + "step": 17 + }, + { + "epoch": 0.004015168414008476, + "grad_norm": 8.972038269042969, + "learning_rate": 3.4000000000000005e-06, + "loss": 1.5538, + "step": 18 + }, + { + "epoch": 0.004238233325897837, + "grad_norm": 7.949400901794434, + "learning_rate": 3.6000000000000003e-06, + "loss": 1.5478, + "step": 19 + }, + { + "epoch": 0.004461298237787196, + "grad_norm": 7.048157215118408, + "learning_rate": 3.8000000000000005e-06, + "loss": 1.3381, + "step": 20 + }, + { + "epoch": 0.004684363149676556, + "grad_norm": 7.008754730224609, + "learning_rate": 4.000000000000001e-06, + "loss": 1.3275, + "step": 21 + }, + { + "epoch": 0.004907428061565916, + "grad_norm": 6.20928955078125, + "learning_rate": 4.2000000000000004e-06, + "loss": 1.2469, + "step": 22 + }, + { + "epoch": 0.005130492973455275, + "grad_norm": 4.8463029861450195, + "learning_rate": 4.4e-06, + "loss": 1.1167, + "step": 23 + }, + { + "epoch": 0.005353557885344635, + "grad_norm": 20.94325065612793, + "learning_rate": 4.600000000000001e-06, + "loss": 1.1391, + "step": 24 + }, + { + "epoch": 0.005576622797233995, + "grad_norm": 3.783233404159546, + "learning_rate": 4.800000000000001e-06, + "loss": 1.0692, + "step": 25 + }, + { + "epoch": 0.005799687709123355, + "grad_norm": 3.780400276184082, + "learning_rate": 5e-06, + "loss": 1.0715, + "step": 26 + }, + { + "epoch": 0.0060227526210127145, + "grad_norm": 3.059216260910034, + "learning_rate": 5.2e-06, + "loss": 1.0057, + "step": 27 + }, + { + "epoch": 0.006245817532902075, + "grad_norm": 2.7420809268951416, + "learning_rate": 5.400000000000001e-06, + "loss": 0.9421, + "step": 28 + }, + { + "epoch": 0.006468882444791434, + "grad_norm": 2.8878915309906006, + "learning_rate": 5.600000000000001e-06, + "loss": 0.9625, + "step": 29 + }, + { + "epoch": 0.0066919473566807944, + "grad_norm": 2.9568190574645996, + "learning_rate": 5.8e-06, + "loss": 0.924, + "step": 30 + }, + { + "epoch": 0.006915012268570154, + "grad_norm": 2.239628314971924, + "learning_rate": 6e-06, + "loss": 0.8886, + "step": 31 + }, + { + "epoch": 0.007138077180459514, + "grad_norm": 2.2312073707580566, + "learning_rate": 6.200000000000001e-06, + "loss": 0.9105, + "step": 32 + }, + { + "epoch": 0.0073611420923488735, + "grad_norm": 1.7348324060440063, + "learning_rate": 6.4000000000000006e-06, + "loss": 0.8685, + "step": 33 + }, + { + "epoch": 0.007584207004238233, + "grad_norm": 1.3611754179000854, + "learning_rate": 6.600000000000001e-06, + "loss": 0.8088, + "step": 34 + }, + { + "epoch": 0.007807271916127593, + "grad_norm": 1.2034807205200195, + "learning_rate": 6.800000000000001e-06, + "loss": 0.8101, + "step": 35 + }, + { + "epoch": 0.008030336828016953, + "grad_norm": 1.3573353290557861, + "learning_rate": 7e-06, + "loss": 0.8048, + "step": 36 + }, + { + "epoch": 0.008253401739906312, + "grad_norm": 1.4660226106643677, + "learning_rate": 7.2000000000000005e-06, + "loss": 0.7663, + "step": 37 + }, + { + "epoch": 0.008476466651795673, + "grad_norm": 1.4917473793029785, + "learning_rate": 7.4e-06, + "loss": 0.7847, + "step": 38 + }, + { + "epoch": 0.008699531563685033, + "grad_norm": 0.9017640352249146, + "learning_rate": 7.600000000000001e-06, + "loss": 0.7472, + "step": 39 + }, + { + "epoch": 0.008922596475574392, + "grad_norm": 0.9448667764663696, + "learning_rate": 7.800000000000002e-06, + "loss": 0.7384, + "step": 40 + }, + { + "epoch": 0.009145661387463751, + "grad_norm": 1.0112793445587158, + "learning_rate": 8.000000000000001e-06, + "loss": 0.735, + "step": 41 + }, + { + "epoch": 0.009368726299353113, + "grad_norm": 0.8147673606872559, + "learning_rate": 8.2e-06, + "loss": 0.7463, + "step": 42 + }, + { + "epoch": 0.009591791211242472, + "grad_norm": 0.8109986782073975, + "learning_rate": 8.400000000000001e-06, + "loss": 0.7275, + "step": 43 + }, + { + "epoch": 0.009814856123131831, + "grad_norm": 0.6843619346618652, + "learning_rate": 8.6e-06, + "loss": 0.7413, + "step": 44 + }, + { + "epoch": 0.01003792103502119, + "grad_norm": 0.5661330223083496, + "learning_rate": 8.8e-06, + "loss": 0.7226, + "step": 45 + }, + { + "epoch": 0.01026098594691055, + "grad_norm": 0.8250797986984253, + "learning_rate": 9e-06, + "loss": 0.7099, + "step": 46 + }, + { + "epoch": 0.010484050858799911, + "grad_norm": 0.5656530857086182, + "learning_rate": 9.200000000000002e-06, + "loss": 0.7145, + "step": 47 + }, + { + "epoch": 0.01070711577068927, + "grad_norm": 0.7327658534049988, + "learning_rate": 9.4e-06, + "loss": 0.716, + "step": 48 + }, + { + "epoch": 0.01093018068257863, + "grad_norm": 0.5881125330924988, + "learning_rate": 9.600000000000001e-06, + "loss": 0.6747, + "step": 49 + }, + { + "epoch": 0.01115324559446799, + "grad_norm": 0.5523539185523987, + "learning_rate": 9.800000000000001e-06, + "loss": 0.6521, + "step": 50 + }, + { + "epoch": 0.01137631050635735, + "grad_norm": 0.5446374416351318, + "learning_rate": 1e-05, + "loss": 0.6752, + "step": 51 + }, + { + "epoch": 0.01159937541824671, + "grad_norm": 0.475178599357605, + "learning_rate": 1.02e-05, + "loss": 0.7108, + "step": 52 + }, + { + "epoch": 0.01182244033013607, + "grad_norm": 0.7163774967193604, + "learning_rate": 1.04e-05, + "loss": 0.6717, + "step": 53 + }, + { + "epoch": 0.012045505242025429, + "grad_norm": 0.4185207784175873, + "learning_rate": 1.0600000000000002e-05, + "loss": 0.6645, + "step": 54 + }, + { + "epoch": 0.012268570153914788, + "grad_norm": 0.43035364151000977, + "learning_rate": 1.0800000000000002e-05, + "loss": 0.6814, + "step": 55 + }, + { + "epoch": 0.01249163506580415, + "grad_norm": 0.37698522210121155, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.7271, + "step": 56 + }, + { + "epoch": 0.012714699977693509, + "grad_norm": 0.41325655579566956, + "learning_rate": 1.1200000000000001e-05, + "loss": 0.6718, + "step": 57 + }, + { + "epoch": 0.012937764889582868, + "grad_norm": 0.4455505907535553, + "learning_rate": 1.14e-05, + "loss": 0.6461, + "step": 58 + }, + { + "epoch": 0.013160829801472228, + "grad_norm": 0.39668968319892883, + "learning_rate": 1.16e-05, + "loss": 0.6744, + "step": 59 + }, + { + "epoch": 0.013383894713361589, + "grad_norm": 0.43382027745246887, + "learning_rate": 1.18e-05, + "loss": 0.686, + "step": 60 + }, + { + "epoch": 0.013606959625250948, + "grad_norm": 0.3178131878376007, + "learning_rate": 1.2e-05, + "loss": 0.6461, + "step": 61 + }, + { + "epoch": 0.013830024537140308, + "grad_norm": 0.36169150471687317, + "learning_rate": 1.22e-05, + "loss": 0.6521, + "step": 62 + }, + { + "epoch": 0.014053089449029667, + "grad_norm": 0.3932245671749115, + "learning_rate": 1.2400000000000002e-05, + "loss": 0.7118, + "step": 63 + }, + { + "epoch": 0.014276154360919028, + "grad_norm": 0.3924449384212494, + "learning_rate": 1.2600000000000001e-05, + "loss": 0.6723, + "step": 64 + }, + { + "epoch": 0.014499219272808388, + "grad_norm": 0.32354405522346497, + "learning_rate": 1.2800000000000001e-05, + "loss": 0.6589, + "step": 65 + }, + { + "epoch": 0.014722284184697747, + "grad_norm": 0.3272797465324402, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.6413, + "step": 66 + }, + { + "epoch": 0.014945349096587107, + "grad_norm": 0.36967551708221436, + "learning_rate": 1.3200000000000002e-05, + "loss": 0.6432, + "step": 67 + }, + { + "epoch": 0.015168414008476466, + "grad_norm": 0.2998528480529785, + "learning_rate": 1.3400000000000002e-05, + "loss": 0.6543, + "step": 68 + }, + { + "epoch": 0.015391478920365827, + "grad_norm": 0.30827096104621887, + "learning_rate": 1.3600000000000002e-05, + "loss": 0.6715, + "step": 69 + }, + { + "epoch": 0.015614543832255186, + "grad_norm": 0.3031168580055237, + "learning_rate": 1.38e-05, + "loss": 0.6504, + "step": 70 + }, + { + "epoch": 0.015837608744144548, + "grad_norm": 0.3063329756259918, + "learning_rate": 1.4e-05, + "loss": 0.6214, + "step": 71 + }, + { + "epoch": 0.016060673656033905, + "grad_norm": 0.2707573175430298, + "learning_rate": 1.4200000000000001e-05, + "loss": 0.6347, + "step": 72 + }, + { + "epoch": 0.016283738567923266, + "grad_norm": 0.3051629066467285, + "learning_rate": 1.4400000000000001e-05, + "loss": 0.6318, + "step": 73 + }, + { + "epoch": 0.016506803479812624, + "grad_norm": 0.3260219991207123, + "learning_rate": 1.46e-05, + "loss": 0.6219, + "step": 74 + }, + { + "epoch": 0.016729868391701985, + "grad_norm": 0.2733941376209259, + "learning_rate": 1.48e-05, + "loss": 0.6384, + "step": 75 + }, + { + "epoch": 0.016952933303591346, + "grad_norm": 0.35662564635276794, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.622, + "step": 76 + }, + { + "epoch": 0.017175998215480704, + "grad_norm": 0.31423047184944153, + "learning_rate": 1.5200000000000002e-05, + "loss": 0.6344, + "step": 77 + }, + { + "epoch": 0.017399063127370065, + "grad_norm": 0.3120841681957245, + "learning_rate": 1.54e-05, + "loss": 0.6566, + "step": 78 + }, + { + "epoch": 0.017622128039259423, + "grad_norm": 0.30338573455810547, + "learning_rate": 1.5600000000000003e-05, + "loss": 0.6537, + "step": 79 + }, + { + "epoch": 0.017845192951148784, + "grad_norm": 0.3202762007713318, + "learning_rate": 1.58e-05, + "loss": 0.6705, + "step": 80 + }, + { + "epoch": 0.018068257863038145, + "grad_norm": 0.3163682520389557, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.6175, + "step": 81 + }, + { + "epoch": 0.018291322774927503, + "grad_norm": 0.2831353545188904, + "learning_rate": 1.62e-05, + "loss": 0.6301, + "step": 82 + }, + { + "epoch": 0.018514387686816864, + "grad_norm": 0.267448365688324, + "learning_rate": 1.64e-05, + "loss": 0.6224, + "step": 83 + }, + { + "epoch": 0.018737452598706225, + "grad_norm": 0.2999647259712219, + "learning_rate": 1.66e-05, + "loss": 0.6294, + "step": 84 + }, + { + "epoch": 0.018960517510595583, + "grad_norm": 0.27424749732017517, + "learning_rate": 1.6800000000000002e-05, + "loss": 0.5998, + "step": 85 + }, + { + "epoch": 0.019183582422484944, + "grad_norm": 0.2582665979862213, + "learning_rate": 1.7e-05, + "loss": 0.6043, + "step": 86 + }, + { + "epoch": 0.0194066473343743, + "grad_norm": 0.2448103129863739, + "learning_rate": 1.72e-05, + "loss": 0.631, + "step": 87 + }, + { + "epoch": 0.019629712246263663, + "grad_norm": 0.25323718786239624, + "learning_rate": 1.7400000000000003e-05, + "loss": 0.6248, + "step": 88 + }, + { + "epoch": 0.019852777158153024, + "grad_norm": 0.32291799783706665, + "learning_rate": 1.76e-05, + "loss": 0.5797, + "step": 89 + }, + { + "epoch": 0.02007584207004238, + "grad_norm": 0.3439197242259979, + "learning_rate": 1.7800000000000002e-05, + "loss": 0.6141, + "step": 90 + }, + { + "epoch": 0.020298906981931743, + "grad_norm": 0.30653074383735657, + "learning_rate": 1.8e-05, + "loss": 0.6226, + "step": 91 + }, + { + "epoch": 0.0205219718938211, + "grad_norm": 0.5994735956192017, + "learning_rate": 1.8200000000000002e-05, + "loss": 0.614, + "step": 92 + }, + { + "epoch": 0.02074503680571046, + "grad_norm": 0.30260932445526123, + "learning_rate": 1.8400000000000003e-05, + "loss": 0.6224, + "step": 93 + }, + { + "epoch": 0.020968101717599823, + "grad_norm": 0.2914034426212311, + "learning_rate": 1.86e-05, + "loss": 0.6201, + "step": 94 + }, + { + "epoch": 0.02119116662948918, + "grad_norm": 0.28529268503189087, + "learning_rate": 1.88e-05, + "loss": 0.6046, + "step": 95 + }, + { + "epoch": 0.02141423154137854, + "grad_norm": 0.26552823185920715, + "learning_rate": 1.9e-05, + "loss": 0.6058, + "step": 96 + }, + { + "epoch": 0.021637296453267903, + "grad_norm": 0.4168465733528137, + "learning_rate": 1.9200000000000003e-05, + "loss": 0.6289, + "step": 97 + }, + { + "epoch": 0.02186036136515726, + "grad_norm": 0.40763556957244873, + "learning_rate": 1.94e-05, + "loss": 0.6113, + "step": 98 + }, + { + "epoch": 0.02208342627704662, + "grad_norm": 0.29686489701271057, + "learning_rate": 1.9600000000000002e-05, + "loss": 0.6288, + "step": 99 + }, + { + "epoch": 0.02230649118893598, + "grad_norm": 0.27713605761528015, + "learning_rate": 1.98e-05, + "loss": 0.6404, + "step": 100 + }, + { + "epoch": 0.02252955610082534, + "grad_norm": 0.2593631148338318, + "learning_rate": 2e-05, + "loss": 0.5652, + "step": 101 + }, + { + "epoch": 0.0227526210127147, + "grad_norm": 0.2575121819972992, + "learning_rate": 1.999999972306855e-05, + "loss": 0.6062, + "step": 102 + }, + { + "epoch": 0.02297568592460406, + "grad_norm": 0.24929626286029816, + "learning_rate": 1.999999889227421e-05, + "loss": 0.6336, + "step": 103 + }, + { + "epoch": 0.02319875083649342, + "grad_norm": 0.2693648040294647, + "learning_rate": 1.9999997507617033e-05, + "loss": 0.6611, + "step": 104 + }, + { + "epoch": 0.023421815748382778, + "grad_norm": 0.5212802290916443, + "learning_rate": 1.9999995569097088e-05, + "loss": 0.6162, + "step": 105 + }, + { + "epoch": 0.02364488066027214, + "grad_norm": 0.25279495120048523, + "learning_rate": 1.999999307671449e-05, + "loss": 0.6062, + "step": 106 + }, + { + "epoch": 0.0238679455721615, + "grad_norm": 0.2529296278953552, + "learning_rate": 1.999999003046937e-05, + "loss": 0.6164, + "step": 107 + }, + { + "epoch": 0.024091010484050858, + "grad_norm": 0.3054715096950531, + "learning_rate": 1.9999986430361896e-05, + "loss": 0.5984, + "step": 108 + }, + { + "epoch": 0.02431407539594022, + "grad_norm": 0.26101136207580566, + "learning_rate": 1.9999982276392274e-05, + "loss": 0.5809, + "step": 109 + }, + { + "epoch": 0.024537140307829577, + "grad_norm": 0.26034069061279297, + "learning_rate": 1.9999977568560734e-05, + "loss": 0.6026, + "step": 110 + }, + { + "epoch": 0.024760205219718938, + "grad_norm": 0.2570051848888397, + "learning_rate": 1.999997230686753e-05, + "loss": 0.6299, + "step": 111 + }, + { + "epoch": 0.0249832701316083, + "grad_norm": 0.2621450424194336, + "learning_rate": 1.999996649131296e-05, + "loss": 0.6141, + "step": 112 + }, + { + "epoch": 0.025206335043497657, + "grad_norm": 0.2425510734319687, + "learning_rate": 1.999996012189734e-05, + "loss": 0.6184, + "step": 113 + }, + { + "epoch": 0.025429399955387018, + "grad_norm": 0.28203898668289185, + "learning_rate": 1.999995319862103e-05, + "loss": 0.5922, + "step": 114 + }, + { + "epoch": 0.02565246486727638, + "grad_norm": 0.2976488173007965, + "learning_rate": 1.9999945721484407e-05, + "loss": 0.586, + "step": 115 + }, + { + "epoch": 0.025875529779165737, + "grad_norm": 0.27894967794418335, + "learning_rate": 1.999993769048789e-05, + "loss": 0.6099, + "step": 116 + }, + { + "epoch": 0.026098594691055098, + "grad_norm": 0.2924571931362152, + "learning_rate": 1.999992910563192e-05, + "loss": 0.5646, + "step": 117 + }, + { + "epoch": 0.026321659602944456, + "grad_norm": 0.2446456104516983, + "learning_rate": 1.9999919966916976e-05, + "loss": 0.5722, + "step": 118 + }, + { + "epoch": 0.026544724514833817, + "grad_norm": 0.23295633494853973, + "learning_rate": 1.9999910274343562e-05, + "loss": 0.5806, + "step": 119 + }, + { + "epoch": 0.026767789426723178, + "grad_norm": 0.24767954647541046, + "learning_rate": 1.999990002791221e-05, + "loss": 0.6003, + "step": 120 + }, + { + "epoch": 0.026990854338612535, + "grad_norm": 0.25390875339508057, + "learning_rate": 1.99998892276235e-05, + "loss": 0.5798, + "step": 121 + }, + { + "epoch": 0.027213919250501897, + "grad_norm": 0.22989511489868164, + "learning_rate": 1.999987787347802e-05, + "loss": 0.5873, + "step": 122 + }, + { + "epoch": 0.027436984162391254, + "grad_norm": 0.4493498206138611, + "learning_rate": 1.99998659654764e-05, + "loss": 0.5994, + "step": 123 + }, + { + "epoch": 0.027660049074280615, + "grad_norm": 0.2416534572839737, + "learning_rate": 1.99998535036193e-05, + "loss": 0.5787, + "step": 124 + }, + { + "epoch": 0.027883113986169977, + "grad_norm": 0.23413562774658203, + "learning_rate": 1.9999840487907414e-05, + "loss": 0.608, + "step": 125 + }, + { + "epoch": 0.028106178898059334, + "grad_norm": 0.28015846014022827, + "learning_rate": 1.9999826918341462e-05, + "loss": 0.6034, + "step": 126 + }, + { + "epoch": 0.028329243809948695, + "grad_norm": 0.2494441717863083, + "learning_rate": 1.999981279492219e-05, + "loss": 0.6256, + "step": 127 + }, + { + "epoch": 0.028552308721838057, + "grad_norm": 0.3094983398914337, + "learning_rate": 1.9999798117650385e-05, + "loss": 0.6351, + "step": 128 + }, + { + "epoch": 0.028775373633727414, + "grad_norm": 0.24032524228096008, + "learning_rate": 1.9999782886526863e-05, + "loss": 0.5703, + "step": 129 + }, + { + "epoch": 0.028998438545616775, + "grad_norm": 0.24950377643108368, + "learning_rate": 1.9999767101552458e-05, + "loss": 0.5952, + "step": 130 + }, + { + "epoch": 0.029221503457506133, + "grad_norm": 0.24877335131168365, + "learning_rate": 1.999975076272805e-05, + "loss": 0.6143, + "step": 131 + }, + { + "epoch": 0.029444568369395494, + "grad_norm": 0.2579665184020996, + "learning_rate": 1.999973387005455e-05, + "loss": 0.6063, + "step": 132 + }, + { + "epoch": 0.029667633281284855, + "grad_norm": 0.2793569564819336, + "learning_rate": 1.9999716423532884e-05, + "loss": 0.613, + "step": 133 + }, + { + "epoch": 0.029890698193174213, + "grad_norm": 0.23061081767082214, + "learning_rate": 1.999969842316402e-05, + "loss": 0.5918, + "step": 134 + }, + { + "epoch": 0.030113763105063574, + "grad_norm": 0.2353406399488449, + "learning_rate": 1.999967986894896e-05, + "loss": 0.585, + "step": 135 + }, + { + "epoch": 0.030336828016952932, + "grad_norm": 0.2254764884710312, + "learning_rate": 1.9999660760888722e-05, + "loss": 0.611, + "step": 136 + }, + { + "epoch": 0.030559892928842293, + "grad_norm": 0.24640028178691864, + "learning_rate": 1.9999641098984378e-05, + "loss": 0.5911, + "step": 137 + }, + { + "epoch": 0.030782957840731654, + "grad_norm": 0.22932305932044983, + "learning_rate": 1.9999620883237004e-05, + "loss": 0.5999, + "step": 138 + }, + { + "epoch": 0.031006022752621012, + "grad_norm": 0.2379560023546219, + "learning_rate": 1.999960011364773e-05, + "loss": 0.5987, + "step": 139 + }, + { + "epoch": 0.031229087664510373, + "grad_norm": 0.22063873708248138, + "learning_rate": 1.99995787902177e-05, + "loss": 0.5867, + "step": 140 + }, + { + "epoch": 0.03145215257639973, + "grad_norm": 0.23651117086410522, + "learning_rate": 1.9999556912948096e-05, + "loss": 0.5809, + "step": 141 + }, + { + "epoch": 0.031675217488289095, + "grad_norm": 0.2621421813964844, + "learning_rate": 1.9999534481840134e-05, + "loss": 0.5935, + "step": 142 + }, + { + "epoch": 0.03189828240017845, + "grad_norm": 0.24506062269210815, + "learning_rate": 1.9999511496895047e-05, + "loss": 0.5931, + "step": 143 + }, + { + "epoch": 0.03212134731206781, + "grad_norm": 0.2224111109972, + "learning_rate": 1.999948795811412e-05, + "loss": 0.6156, + "step": 144 + }, + { + "epoch": 0.03234441222395717, + "grad_norm": 0.23910044133663177, + "learning_rate": 1.9999463865498644e-05, + "loss": 0.5839, + "step": 145 + }, + { + "epoch": 0.03256747713584653, + "grad_norm": 0.2555723786354065, + "learning_rate": 1.9999439219049964e-05, + "loss": 0.6474, + "step": 146 + }, + { + "epoch": 0.03279054204773589, + "grad_norm": 0.25059348344802856, + "learning_rate": 1.9999414018769442e-05, + "loss": 0.5846, + "step": 147 + }, + { + "epoch": 0.03301360695962525, + "grad_norm": 0.22700609266757965, + "learning_rate": 1.9999388264658467e-05, + "loss": 0.5867, + "step": 148 + }, + { + "epoch": 0.03323667187151461, + "grad_norm": 0.25858354568481445, + "learning_rate": 1.9999361956718476e-05, + "loss": 0.5984, + "step": 149 + }, + { + "epoch": 0.03345973678340397, + "grad_norm": 0.2813456356525421, + "learning_rate": 1.9999335094950922e-05, + "loss": 0.5943, + "step": 150 + }, + { + "epoch": 0.03368280169529333, + "grad_norm": 0.288655549287796, + "learning_rate": 1.9999307679357293e-05, + "loss": 0.5975, + "step": 151 + }, + { + "epoch": 0.03390586660718269, + "grad_norm": 0.23721511662006378, + "learning_rate": 1.9999279709939102e-05, + "loss": 0.5637, + "step": 152 + }, + { + "epoch": 0.03412893151907205, + "grad_norm": 0.23953364789485931, + "learning_rate": 1.999925118669791e-05, + "loss": 0.5693, + "step": 153 + }, + { + "epoch": 0.03435199643096141, + "grad_norm": 0.23194071650505066, + "learning_rate": 1.9999222109635283e-05, + "loss": 0.5446, + "step": 154 + }, + { + "epoch": 0.03457506134285077, + "grad_norm": 0.2160234898328781, + "learning_rate": 1.999919247875284e-05, + "loss": 0.5812, + "step": 155 + }, + { + "epoch": 0.03479812625474013, + "grad_norm": 0.22934173047542572, + "learning_rate": 1.999916229405222e-05, + "loss": 0.6149, + "step": 156 + }, + { + "epoch": 0.03502119116662949, + "grad_norm": 0.24892570078372955, + "learning_rate": 1.999913155553509e-05, + "loss": 0.6053, + "step": 157 + }, + { + "epoch": 0.035244256078518846, + "grad_norm": 0.2235739678144455, + "learning_rate": 1.9999100263203165e-05, + "loss": 0.5799, + "step": 158 + }, + { + "epoch": 0.03546732099040821, + "grad_norm": 0.21852736175060272, + "learning_rate": 1.9999068417058168e-05, + "loss": 0.5793, + "step": 159 + }, + { + "epoch": 0.03569038590229757, + "grad_norm": 0.269131600856781, + "learning_rate": 1.9999036017101864e-05, + "loss": 0.5817, + "step": 160 + }, + { + "epoch": 0.035913450814186926, + "grad_norm": 0.22516575455665588, + "learning_rate": 1.999900306333605e-05, + "loss": 0.6, + "step": 161 + }, + { + "epoch": 0.03613651572607629, + "grad_norm": 0.24313850700855255, + "learning_rate": 1.999896955576255e-05, + "loss": 0.5738, + "step": 162 + }, + { + "epoch": 0.03635958063796565, + "grad_norm": 0.26203158497810364, + "learning_rate": 1.999893549438322e-05, + "loss": 0.5687, + "step": 163 + }, + { + "epoch": 0.036582645549855006, + "grad_norm": 0.23275348544120789, + "learning_rate": 1.9998900879199948e-05, + "loss": 0.5698, + "step": 164 + }, + { + "epoch": 0.03680571046174437, + "grad_norm": 0.2560306489467621, + "learning_rate": 1.9998865710214646e-05, + "loss": 0.5877, + "step": 165 + }, + { + "epoch": 0.03702877537363373, + "grad_norm": 0.2175760716199875, + "learning_rate": 1.999882998742927e-05, + "loss": 0.5797, + "step": 166 + }, + { + "epoch": 0.037251840285523086, + "grad_norm": 0.22381433844566345, + "learning_rate": 1.999879371084579e-05, + "loss": 0.5642, + "step": 167 + }, + { + "epoch": 0.03747490519741245, + "grad_norm": 0.2363629788160324, + "learning_rate": 1.9998756880466224e-05, + "loss": 0.5651, + "step": 168 + }, + { + "epoch": 0.03769797010930181, + "grad_norm": 0.22718748450279236, + "learning_rate": 1.9998719496292603e-05, + "loss": 0.5867, + "step": 169 + }, + { + "epoch": 0.037921035021191166, + "grad_norm": 0.23127657175064087, + "learning_rate": 1.9998681558327005e-05, + "loss": 0.5724, + "step": 170 + }, + { + "epoch": 0.03814409993308052, + "grad_norm": 0.23175480961799622, + "learning_rate": 1.9998643066571527e-05, + "loss": 0.5919, + "step": 171 + }, + { + "epoch": 0.03836716484496989, + "grad_norm": 0.22871741652488708, + "learning_rate": 1.9998604021028304e-05, + "loss": 0.5653, + "step": 172 + }, + { + "epoch": 0.038590229756859246, + "grad_norm": 0.21524189412593842, + "learning_rate": 1.999856442169949e-05, + "loss": 0.6123, + "step": 173 + }, + { + "epoch": 0.0388132946687486, + "grad_norm": 0.23523908853530884, + "learning_rate": 1.999852426858729e-05, + "loss": 0.5931, + "step": 174 + }, + { + "epoch": 0.03903635958063797, + "grad_norm": 0.23540736734867096, + "learning_rate": 1.9998483561693926e-05, + "loss": 0.5614, + "step": 175 + }, + { + "epoch": 0.039259424492527326, + "grad_norm": 0.22590956091880798, + "learning_rate": 1.999844230102164e-05, + "loss": 0.5917, + "step": 176 + }, + { + "epoch": 0.03948248940441668, + "grad_norm": 0.22635167837142944, + "learning_rate": 1.999840048657273e-05, + "loss": 0.5886, + "step": 177 + }, + { + "epoch": 0.03970555431630605, + "grad_norm": 0.24712443351745605, + "learning_rate": 1.9998358118349513e-05, + "loss": 0.5936, + "step": 178 + }, + { + "epoch": 0.039928619228195406, + "grad_norm": 0.266133576631546, + "learning_rate": 1.999831519635433e-05, + "loss": 0.5662, + "step": 179 + }, + { + "epoch": 0.04015168414008476, + "grad_norm": 0.2172316312789917, + "learning_rate": 1.9998271720589558e-05, + "loss": 0.5679, + "step": 180 + }, + { + "epoch": 0.04037474905197413, + "grad_norm": 0.3194250464439392, + "learning_rate": 1.999822769105761e-05, + "loss": 0.6105, + "step": 181 + }, + { + "epoch": 0.040597813963863486, + "grad_norm": 0.2604449391365051, + "learning_rate": 1.9998183107760915e-05, + "loss": 0.5942, + "step": 182 + }, + { + "epoch": 0.04082087887575284, + "grad_norm": 0.26620209217071533, + "learning_rate": 1.9998137970701955e-05, + "loss": 0.5859, + "step": 183 + }, + { + "epoch": 0.0410439437876422, + "grad_norm": 0.2345753312110901, + "learning_rate": 1.9998092279883215e-05, + "loss": 0.5939, + "step": 184 + }, + { + "epoch": 0.041267008699531565, + "grad_norm": 0.22213764488697052, + "learning_rate": 1.999804603530724e-05, + "loss": 0.5815, + "step": 185 + }, + { + "epoch": 0.04149007361142092, + "grad_norm": 0.23276378214359283, + "learning_rate": 1.9997999236976587e-05, + "loss": 0.5842, + "step": 186 + }, + { + "epoch": 0.04171313852331028, + "grad_norm": 0.22480995953083038, + "learning_rate": 1.9997951884893843e-05, + "loss": 0.5897, + "step": 187 + }, + { + "epoch": 0.041936203435199645, + "grad_norm": 0.22761783003807068, + "learning_rate": 1.9997903979061635e-05, + "loss": 0.5873, + "step": 188 + }, + { + "epoch": 0.042159268347089, + "grad_norm": 0.27698761224746704, + "learning_rate": 1.9997855519482614e-05, + "loss": 0.5933, + "step": 189 + }, + { + "epoch": 0.04238233325897836, + "grad_norm": 0.2002975046634674, + "learning_rate": 1.9997806506159466e-05, + "loss": 0.5698, + "step": 190 + }, + { + "epoch": 0.042605398170867725, + "grad_norm": 0.233673095703125, + "learning_rate": 1.9997756939094905e-05, + "loss": 0.5765, + "step": 191 + }, + { + "epoch": 0.04282846308275708, + "grad_norm": 0.26376616954803467, + "learning_rate": 1.999770681829168e-05, + "loss": 0.5741, + "step": 192 + }, + { + "epoch": 0.04305152799464644, + "grad_norm": 0.22886492311954498, + "learning_rate": 1.9997656143752556e-05, + "loss": 0.5709, + "step": 193 + }, + { + "epoch": 0.043274592906535805, + "grad_norm": 0.3004070520401001, + "learning_rate": 1.9997604915480352e-05, + "loss": 0.5514, + "step": 194 + }, + { + "epoch": 0.04349765781842516, + "grad_norm": 0.22209997475147247, + "learning_rate": 1.99975531334779e-05, + "loss": 0.6095, + "step": 195 + }, + { + "epoch": 0.04372072273031452, + "grad_norm": 0.20531293749809265, + "learning_rate": 1.9997500797748067e-05, + "loss": 0.5618, + "step": 196 + }, + { + "epoch": 0.04394378764220388, + "grad_norm": 0.20507480204105377, + "learning_rate": 1.9997447908293753e-05, + "loss": 0.5465, + "step": 197 + }, + { + "epoch": 0.04416685255409324, + "grad_norm": 0.22681620717048645, + "learning_rate": 1.999739446511789e-05, + "loss": 0.5546, + "step": 198 + }, + { + "epoch": 0.0443899174659826, + "grad_norm": 0.21772046387195587, + "learning_rate": 1.999734046822343e-05, + "loss": 0.5934, + "step": 199 + }, + { + "epoch": 0.04461298237787196, + "grad_norm": 0.22321215271949768, + "learning_rate": 1.9997285917613375e-05, + "loss": 0.579, + "step": 200 + }, + { + "epoch": 0.04483604728976132, + "grad_norm": 0.21166378259658813, + "learning_rate": 1.9997230813290737e-05, + "loss": 0.5978, + "step": 201 + }, + { + "epoch": 0.04505911220165068, + "grad_norm": 0.23722383379936218, + "learning_rate": 1.999717515525857e-05, + "loss": 0.5747, + "step": 202 + }, + { + "epoch": 0.04528217711354004, + "grad_norm": 0.36909613013267517, + "learning_rate": 1.9997118943519962e-05, + "loss": 0.5417, + "step": 203 + }, + { + "epoch": 0.0455052420254294, + "grad_norm": 0.23573465645313263, + "learning_rate": 1.9997062178078023e-05, + "loss": 0.5971, + "step": 204 + }, + { + "epoch": 0.04572830693731876, + "grad_norm": 0.25214946269989014, + "learning_rate": 1.9997004858935894e-05, + "loss": 0.5852, + "step": 205 + }, + { + "epoch": 0.04595137184920812, + "grad_norm": 0.20637395977973938, + "learning_rate": 1.9996946986096754e-05, + "loss": 0.5394, + "step": 206 + }, + { + "epoch": 0.046174436761097476, + "grad_norm": 0.21749068796634674, + "learning_rate": 1.9996888559563804e-05, + "loss": 0.5662, + "step": 207 + }, + { + "epoch": 0.04639750167298684, + "grad_norm": 0.2306276559829712, + "learning_rate": 1.9996829579340284e-05, + "loss": 0.6038, + "step": 208 + }, + { + "epoch": 0.0466205665848762, + "grad_norm": 0.22346408665180206, + "learning_rate": 1.999677004542946e-05, + "loss": 0.6024, + "step": 209 + }, + { + "epoch": 0.046843631496765556, + "grad_norm": 0.22103498876094818, + "learning_rate": 1.9996709957834627e-05, + "loss": 0.5805, + "step": 210 + }, + { + "epoch": 0.04706669640865492, + "grad_norm": 0.2120116651058197, + "learning_rate": 1.9996649316559118e-05, + "loss": 0.5712, + "step": 211 + }, + { + "epoch": 0.04728976132054428, + "grad_norm": 0.24006275832653046, + "learning_rate": 1.9996588121606286e-05, + "loss": 0.5487, + "step": 212 + }, + { + "epoch": 0.047512826232433636, + "grad_norm": 0.2095099836587906, + "learning_rate": 1.9996526372979522e-05, + "loss": 0.555, + "step": 213 + }, + { + "epoch": 0.047735891144323, + "grad_norm": 0.23283496499061584, + "learning_rate": 1.999646407068225e-05, + "loss": 0.6025, + "step": 214 + }, + { + "epoch": 0.04795895605621236, + "grad_norm": 0.2015821784734726, + "learning_rate": 1.9996401214717912e-05, + "loss": 0.5725, + "step": 215 + }, + { + "epoch": 0.048182020968101716, + "grad_norm": 0.2395429164171219, + "learning_rate": 1.999633780509e-05, + "loss": 0.5549, + "step": 216 + }, + { + "epoch": 0.04840508587999108, + "grad_norm": 0.2157035768032074, + "learning_rate": 1.9996273841802017e-05, + "loss": 0.5644, + "step": 217 + }, + { + "epoch": 0.04862815079188044, + "grad_norm": 0.2156548947095871, + "learning_rate": 1.9996209324857516e-05, + "loss": 0.5901, + "step": 218 + }, + { + "epoch": 0.048851215703769796, + "grad_norm": 0.2213698923587799, + "learning_rate": 1.999614425426006e-05, + "loss": 0.5463, + "step": 219 + }, + { + "epoch": 0.049074280615659154, + "grad_norm": 0.21965420246124268, + "learning_rate": 1.9996078630013253e-05, + "loss": 0.5741, + "step": 220 + }, + { + "epoch": 0.04929734552754852, + "grad_norm": 0.213240385055542, + "learning_rate": 1.999601245212074e-05, + "loss": 0.5698, + "step": 221 + }, + { + "epoch": 0.049520410439437876, + "grad_norm": 0.21078208088874817, + "learning_rate": 1.9995945720586177e-05, + "loss": 0.5923, + "step": 222 + }, + { + "epoch": 0.04974347535132723, + "grad_norm": 0.21629688143730164, + "learning_rate": 1.9995878435413264e-05, + "loss": 0.5438, + "step": 223 + }, + { + "epoch": 0.0499665402632166, + "grad_norm": 0.22872765362262726, + "learning_rate": 1.9995810596605725e-05, + "loss": 0.5543, + "step": 224 + }, + { + "epoch": 0.050189605175105956, + "grad_norm": 0.2012244164943695, + "learning_rate": 1.999574220416732e-05, + "loss": 0.557, + "step": 225 + }, + { + "epoch": 0.05041267008699531, + "grad_norm": 0.21912412345409393, + "learning_rate": 1.9995673258101837e-05, + "loss": 0.5816, + "step": 226 + }, + { + "epoch": 0.05063573499888468, + "grad_norm": 0.22358618676662445, + "learning_rate": 1.999560375841309e-05, + "loss": 0.6222, + "step": 227 + }, + { + "epoch": 0.050858799910774036, + "grad_norm": 0.22544872760772705, + "learning_rate": 1.9995533705104936e-05, + "loss": 0.5481, + "step": 228 + }, + { + "epoch": 0.05108186482266339, + "grad_norm": 0.20495697855949402, + "learning_rate": 1.999546309818125e-05, + "loss": 0.5371, + "step": 229 + }, + { + "epoch": 0.05130492973455276, + "grad_norm": 0.22267979383468628, + "learning_rate": 1.9995391937645944e-05, + "loss": 0.5916, + "step": 230 + }, + { + "epoch": 0.051527994646442116, + "grad_norm": 0.200364887714386, + "learning_rate": 1.9995320223502958e-05, + "loss": 0.541, + "step": 231 + }, + { + "epoch": 0.05175105955833147, + "grad_norm": 0.20090313255786896, + "learning_rate": 1.9995247955756267e-05, + "loss": 0.565, + "step": 232 + }, + { + "epoch": 0.05197412447022083, + "grad_norm": 5.10443639755249, + "learning_rate": 1.9995175134409868e-05, + "loss": 0.6056, + "step": 233 + }, + { + "epoch": 0.052197189382110196, + "grad_norm": 0.30669254064559937, + "learning_rate": 1.99951017594678e-05, + "loss": 0.589, + "step": 234 + }, + { + "epoch": 0.05242025429399955, + "grad_norm": 0.21791155636310577, + "learning_rate": 1.9995027830934125e-05, + "loss": 0.5438, + "step": 235 + }, + { + "epoch": 0.05264331920588891, + "grad_norm": 0.25966677069664, + "learning_rate": 1.9994953348812937e-05, + "loss": 0.582, + "step": 236 + }, + { + "epoch": 0.052866384117778276, + "grad_norm": 0.24935361742973328, + "learning_rate": 1.9994878313108362e-05, + "loss": 0.5934, + "step": 237 + }, + { + "epoch": 0.05308944902966763, + "grad_norm": 0.22276830673217773, + "learning_rate": 1.9994802723824557e-05, + "loss": 0.5845, + "step": 238 + }, + { + "epoch": 0.05331251394155699, + "grad_norm": 0.2349478304386139, + "learning_rate": 1.9994726580965704e-05, + "loss": 0.5716, + "step": 239 + }, + { + "epoch": 0.053535578853446356, + "grad_norm": 0.22332008183002472, + "learning_rate": 1.9994649884536026e-05, + "loss": 0.6009, + "step": 240 + }, + { + "epoch": 0.05375864376533571, + "grad_norm": 0.19660501182079315, + "learning_rate": 1.9994572634539767e-05, + "loss": 0.5574, + "step": 241 + }, + { + "epoch": 0.05398170867722507, + "grad_norm": 0.2085440754890442, + "learning_rate": 1.999449483098121e-05, + "loss": 0.5676, + "step": 242 + }, + { + "epoch": 0.054204773589114436, + "grad_norm": 0.21077819168567657, + "learning_rate": 1.999441647386466e-05, + "loss": 0.5507, + "step": 243 + }, + { + "epoch": 0.05442783850100379, + "grad_norm": 0.2927658259868622, + "learning_rate": 1.9994337563194457e-05, + "loss": 0.5834, + "step": 244 + }, + { + "epoch": 0.05465090341289315, + "grad_norm": 0.22845080494880676, + "learning_rate": 1.9994258098974974e-05, + "loss": 0.5524, + "step": 245 + }, + { + "epoch": 0.05487396832478251, + "grad_norm": 0.24501702189445496, + "learning_rate": 1.999417808121061e-05, + "loss": 0.5376, + "step": 246 + }, + { + "epoch": 0.05509703323667187, + "grad_norm": 0.19732032716274261, + "learning_rate": 1.99940975099058e-05, + "loss": 0.5697, + "step": 247 + }, + { + "epoch": 0.05532009814856123, + "grad_norm": 0.21217837929725647, + "learning_rate": 1.9994016385065005e-05, + "loss": 0.5941, + "step": 248 + }, + { + "epoch": 0.05554316306045059, + "grad_norm": 0.25333619117736816, + "learning_rate": 1.999393470669272e-05, + "loss": 0.5482, + "step": 249 + }, + { + "epoch": 0.05576622797233995, + "grad_norm": 0.21620452404022217, + "learning_rate": 1.9993852474793457e-05, + "loss": 0.5712, + "step": 250 + }, + { + "epoch": 0.05598929288422931, + "grad_norm": 0.20523187518119812, + "learning_rate": 1.9993769689371788e-05, + "loss": 0.5676, + "step": 251 + }, + { + "epoch": 0.05621235779611867, + "grad_norm": 0.20364375412464142, + "learning_rate": 1.999368635043229e-05, + "loss": 0.5764, + "step": 252 + }, + { + "epoch": 0.05643542270800803, + "grad_norm": 0.2179926186800003, + "learning_rate": 1.9993602457979574e-05, + "loss": 0.5583, + "step": 253 + }, + { + "epoch": 0.05665848761989739, + "grad_norm": 0.23844636976718903, + "learning_rate": 1.9993518012018297e-05, + "loss": 0.5768, + "step": 254 + }, + { + "epoch": 0.05688155253178675, + "grad_norm": 0.21450263261795044, + "learning_rate": 1.9993433012553128e-05, + "loss": 0.5814, + "step": 255 + }, + { + "epoch": 0.05710461744367611, + "grad_norm": 0.22201555967330933, + "learning_rate": 1.9993347459588777e-05, + "loss": 0.5565, + "step": 256 + }, + { + "epoch": 0.05732768235556547, + "grad_norm": 0.2182476669549942, + "learning_rate": 1.9993261353129988e-05, + "loss": 0.5818, + "step": 257 + }, + { + "epoch": 0.05755074726745483, + "grad_norm": 0.1971082240343094, + "learning_rate": 1.9993174693181517e-05, + "loss": 0.5686, + "step": 258 + }, + { + "epoch": 0.057773812179344186, + "grad_norm": 0.20535312592983246, + "learning_rate": 1.999308747974818e-05, + "loss": 0.5687, + "step": 259 + }, + { + "epoch": 0.05799687709123355, + "grad_norm": 0.218933567404747, + "learning_rate": 1.9992999712834794e-05, + "loss": 0.5769, + "step": 260 + }, + { + "epoch": 0.05821994200312291, + "grad_norm": 0.2132030874490738, + "learning_rate": 1.9992911392446227e-05, + "loss": 0.596, + "step": 261 + }, + { + "epoch": 0.058443006915012266, + "grad_norm": 0.19926026463508606, + "learning_rate": 1.999282251858737e-05, + "loss": 0.558, + "step": 262 + }, + { + "epoch": 0.05866607182690163, + "grad_norm": 0.2089562714099884, + "learning_rate": 1.9992733091263144e-05, + "loss": 0.582, + "step": 263 + }, + { + "epoch": 0.05888913673879099, + "grad_norm": 0.20722773671150208, + "learning_rate": 1.9992643110478504e-05, + "loss": 0.5758, + "step": 264 + }, + { + "epoch": 0.059112201650680346, + "grad_norm": 0.21631532907485962, + "learning_rate": 1.999255257623843e-05, + "loss": 0.5725, + "step": 265 + }, + { + "epoch": 0.05933526656256971, + "grad_norm": 0.21997545659542084, + "learning_rate": 1.999246148854794e-05, + "loss": 0.5915, + "step": 266 + }, + { + "epoch": 0.05955833147445907, + "grad_norm": 0.4292175769805908, + "learning_rate": 1.9992369847412076e-05, + "loss": 0.5891, + "step": 267 + }, + { + "epoch": 0.059781396386348426, + "grad_norm": 0.2055254876613617, + "learning_rate": 1.9992277652835918e-05, + "loss": 0.5219, + "step": 268 + }, + { + "epoch": 0.06000446129823779, + "grad_norm": 0.20326492190361023, + "learning_rate": 1.9992184904824566e-05, + "loss": 0.5462, + "step": 269 + }, + { + "epoch": 0.06022752621012715, + "grad_norm": 0.19965752959251404, + "learning_rate": 1.9992091603383164e-05, + "loss": 0.5688, + "step": 270 + }, + { + "epoch": 0.060450591122016506, + "grad_norm": 0.19592702388763428, + "learning_rate": 1.9991997748516872e-05, + "loss": 0.5556, + "step": 271 + }, + { + "epoch": 0.060673656033905864, + "grad_norm": 0.2036285698413849, + "learning_rate": 1.9991903340230898e-05, + "loss": 0.5452, + "step": 272 + }, + { + "epoch": 0.06089672094579523, + "grad_norm": 0.21509170532226562, + "learning_rate": 1.9991808378530465e-05, + "loss": 0.5682, + "step": 273 + }, + { + "epoch": 0.061119785857684586, + "grad_norm": 0.2015625238418579, + "learning_rate": 1.9991712863420832e-05, + "loss": 0.5846, + "step": 274 + }, + { + "epoch": 0.061342850769573944, + "grad_norm": 0.21707095205783844, + "learning_rate": 1.9991616794907286e-05, + "loss": 0.5666, + "step": 275 + }, + { + "epoch": 0.06156591568146331, + "grad_norm": 0.1988460123538971, + "learning_rate": 1.9991520172995158e-05, + "loss": 0.5875, + "step": 276 + }, + { + "epoch": 0.061788980593352666, + "grad_norm": 0.19290049374103546, + "learning_rate": 1.999142299768979e-05, + "loss": 0.5497, + "step": 277 + }, + { + "epoch": 0.062012045505242024, + "grad_norm": 0.2069414258003235, + "learning_rate": 1.9991325268996567e-05, + "loss": 0.5438, + "step": 278 + }, + { + "epoch": 0.06223511041713139, + "grad_norm": 0.22346992790699005, + "learning_rate": 1.9991226986920906e-05, + "loss": 0.5668, + "step": 279 + }, + { + "epoch": 0.062458175329020746, + "grad_norm": 0.21194760501384735, + "learning_rate": 1.9991128151468247e-05, + "loss": 0.5957, + "step": 280 + }, + { + "epoch": 0.06268124024091011, + "grad_norm": 0.209646537899971, + "learning_rate": 1.9991028762644063e-05, + "loss": 0.5748, + "step": 281 + }, + { + "epoch": 0.06290430515279946, + "grad_norm": 0.20566052198410034, + "learning_rate": 1.9990928820453858e-05, + "loss": 0.5506, + "step": 282 + }, + { + "epoch": 0.06312737006468883, + "grad_norm": 0.21374207735061646, + "learning_rate": 1.999082832490317e-05, + "loss": 0.5697, + "step": 283 + }, + { + "epoch": 0.06335043497657819, + "grad_norm": 0.21012777090072632, + "learning_rate": 1.999072727599757e-05, + "loss": 0.5112, + "step": 284 + }, + { + "epoch": 0.06357349988846754, + "grad_norm": 0.19954928755760193, + "learning_rate": 1.9990625673742644e-05, + "loss": 0.579, + "step": 285 + }, + { + "epoch": 0.0637965648003569, + "grad_norm": 0.20394836366176605, + "learning_rate": 1.9990523518144027e-05, + "loss": 0.5425, + "step": 286 + }, + { + "epoch": 0.06401962971224627, + "grad_norm": 0.20405294001102448, + "learning_rate": 1.9990420809207375e-05, + "loss": 0.5538, + "step": 287 + }, + { + "epoch": 0.06424269462413562, + "grad_norm": 0.24065051972866058, + "learning_rate": 1.9990317546938373e-05, + "loss": 0.5786, + "step": 288 + }, + { + "epoch": 0.06446575953602499, + "grad_norm": 0.20158305764198303, + "learning_rate": 1.9990213731342747e-05, + "loss": 0.5475, + "step": 289 + }, + { + "epoch": 0.06468882444791434, + "grad_norm": 0.29271385073661804, + "learning_rate": 1.9990109362426243e-05, + "loss": 0.5468, + "step": 290 + }, + { + "epoch": 0.0649118893598037, + "grad_norm": 0.2596181333065033, + "learning_rate": 1.999000444019464e-05, + "loss": 0.5632, + "step": 291 + }, + { + "epoch": 0.06513495427169307, + "grad_norm": 0.20416077971458435, + "learning_rate": 1.9989898964653753e-05, + "loss": 0.5755, + "step": 292 + }, + { + "epoch": 0.06535801918358242, + "grad_norm": 0.24204829335212708, + "learning_rate": 1.998979293580942e-05, + "loss": 0.5679, + "step": 293 + }, + { + "epoch": 0.06558108409547178, + "grad_norm": 0.20215743780136108, + "learning_rate": 1.9989686353667522e-05, + "loss": 0.5596, + "step": 294 + }, + { + "epoch": 0.06580414900736115, + "grad_norm": 0.24796119332313538, + "learning_rate": 1.998957921823395e-05, + "loss": 0.546, + "step": 295 + }, + { + "epoch": 0.0660272139192505, + "grad_norm": 0.22253303229808807, + "learning_rate": 1.9989471529514647e-05, + "loss": 0.5606, + "step": 296 + }, + { + "epoch": 0.06625027883113986, + "grad_norm": 0.19948336482048035, + "learning_rate": 1.9989363287515577e-05, + "loss": 0.596, + "step": 297 + }, + { + "epoch": 0.06647334374302923, + "grad_norm": 0.26588407158851624, + "learning_rate": 1.9989254492242727e-05, + "loss": 0.5347, + "step": 298 + }, + { + "epoch": 0.06669640865491858, + "grad_norm": 0.2680055797100067, + "learning_rate": 1.9989145143702132e-05, + "loss": 0.588, + "step": 299 + }, + { + "epoch": 0.06691947356680794, + "grad_norm": 0.2872098684310913, + "learning_rate": 1.9989035241899844e-05, + "loss": 0.5261, + "step": 300 + }, + { + "epoch": 0.0671425384786973, + "grad_norm": 0.19267728924751282, + "learning_rate": 1.998892478684195e-05, + "loss": 0.5807, + "step": 301 + }, + { + "epoch": 0.06736560339058666, + "grad_norm": 0.20785115659236908, + "learning_rate": 1.9988813778534568e-05, + "loss": 0.5972, + "step": 302 + }, + { + "epoch": 0.06758866830247602, + "grad_norm": 0.1926824450492859, + "learning_rate": 1.998870221698385e-05, + "loss": 0.5495, + "step": 303 + }, + { + "epoch": 0.06781173321436539, + "grad_norm": 0.21504642069339752, + "learning_rate": 1.9988590102195968e-05, + "loss": 0.5593, + "step": 304 + }, + { + "epoch": 0.06803479812625474, + "grad_norm": 0.22837965190410614, + "learning_rate": 1.9988477434177136e-05, + "loss": 0.5749, + "step": 305 + }, + { + "epoch": 0.0682578630381441, + "grad_norm": 0.27670159935951233, + "learning_rate": 1.9988364212933595e-05, + "loss": 0.5474, + "step": 306 + }, + { + "epoch": 0.06848092795003347, + "grad_norm": 0.1969708949327469, + "learning_rate": 1.9988250438471612e-05, + "loss": 0.573, + "step": 307 + }, + { + "epoch": 0.06870399286192282, + "grad_norm": 0.19591858983039856, + "learning_rate": 1.9988136110797494e-05, + "loss": 0.5677, + "step": 308 + }, + { + "epoch": 0.06892705777381218, + "grad_norm": 0.1888752281665802, + "learning_rate": 1.998802122991757e-05, + "loss": 0.5485, + "step": 309 + }, + { + "epoch": 0.06915012268570155, + "grad_norm": 0.20352420210838318, + "learning_rate": 1.9987905795838204e-05, + "loss": 0.5635, + "step": 310 + }, + { + "epoch": 0.0693731875975909, + "grad_norm": 0.19486026465892792, + "learning_rate": 1.9987789808565785e-05, + "loss": 0.5624, + "step": 311 + }, + { + "epoch": 0.06959625250948026, + "grad_norm": 0.20236323773860931, + "learning_rate": 1.9987673268106742e-05, + "loss": 0.5817, + "step": 312 + }, + { + "epoch": 0.06981931742136963, + "grad_norm": 0.2420659214258194, + "learning_rate": 1.998755617446753e-05, + "loss": 0.5586, + "step": 313 + }, + { + "epoch": 0.07004238233325898, + "grad_norm": 0.21799051761627197, + "learning_rate": 1.9987438527654633e-05, + "loss": 0.5918, + "step": 314 + }, + { + "epoch": 0.07026544724514834, + "grad_norm": 0.19442883133888245, + "learning_rate": 1.9987320327674566e-05, + "loss": 0.5621, + "step": 315 + }, + { + "epoch": 0.07048851215703769, + "grad_norm": 0.19582122564315796, + "learning_rate": 1.9987201574533876e-05, + "loss": 0.5432, + "step": 316 + }, + { + "epoch": 0.07071157706892706, + "grad_norm": 0.1984792798757553, + "learning_rate": 1.998708226823914e-05, + "loss": 0.5532, + "step": 317 + }, + { + "epoch": 0.07093464198081642, + "grad_norm": 0.2259417176246643, + "learning_rate": 1.9986962408796972e-05, + "loss": 0.5676, + "step": 318 + }, + { + "epoch": 0.07115770689270577, + "grad_norm": 0.2008512020111084, + "learning_rate": 1.9986841996213998e-05, + "loss": 0.5499, + "step": 319 + }, + { + "epoch": 0.07138077180459514, + "grad_norm": 0.19798852503299713, + "learning_rate": 1.99867210304969e-05, + "loss": 0.564, + "step": 320 + }, + { + "epoch": 0.0716038367164845, + "grad_norm": 0.21567919850349426, + "learning_rate": 1.998659951165237e-05, + "loss": 0.5701, + "step": 321 + }, + { + "epoch": 0.07182690162837385, + "grad_norm": 0.21192528307437897, + "learning_rate": 1.998647743968714e-05, + "loss": 0.595, + "step": 322 + }, + { + "epoch": 0.07204996654026322, + "grad_norm": 0.1853858083486557, + "learning_rate": 1.9986354814607974e-05, + "loss": 0.5546, + "step": 323 + }, + { + "epoch": 0.07227303145215258, + "grad_norm": 0.19545771181583405, + "learning_rate": 1.998623163642166e-05, + "loss": 0.5602, + "step": 324 + }, + { + "epoch": 0.07249609636404193, + "grad_norm": 0.20142099261283875, + "learning_rate": 1.998610790513502e-05, + "loss": 0.5589, + "step": 325 + }, + { + "epoch": 0.0727191612759313, + "grad_norm": 0.1894591748714447, + "learning_rate": 1.9985983620754914e-05, + "loss": 0.5855, + "step": 326 + }, + { + "epoch": 0.07294222618782066, + "grad_norm": 0.19726653397083282, + "learning_rate": 1.998585878328822e-05, + "loss": 0.559, + "step": 327 + }, + { + "epoch": 0.07316529109971001, + "grad_norm": 0.19552524387836456, + "learning_rate": 1.998573339274185e-05, + "loss": 0.5643, + "step": 328 + }, + { + "epoch": 0.07338835601159938, + "grad_norm": 0.2093891054391861, + "learning_rate": 1.9985607449122754e-05, + "loss": 0.5661, + "step": 329 + }, + { + "epoch": 0.07361142092348874, + "grad_norm": 0.2397080659866333, + "learning_rate": 1.9985480952437902e-05, + "loss": 0.565, + "step": 330 + }, + { + "epoch": 0.07383448583537809, + "grad_norm": 0.21570439636707306, + "learning_rate": 1.998535390269431e-05, + "loss": 0.5505, + "step": 331 + }, + { + "epoch": 0.07405755074726746, + "grad_norm": 0.18352028727531433, + "learning_rate": 1.9985226299899006e-05, + "loss": 0.5501, + "step": 332 + }, + { + "epoch": 0.07428061565915682, + "grad_norm": 0.20006653666496277, + "learning_rate": 1.9985098144059058e-05, + "loss": 0.5559, + "step": 333 + }, + { + "epoch": 0.07450368057104617, + "grad_norm": 0.22780755162239075, + "learning_rate": 1.998496943518157e-05, + "loss": 0.5886, + "step": 334 + }, + { + "epoch": 0.07472674548293554, + "grad_norm": 0.18735693395137787, + "learning_rate": 1.9984840173273662e-05, + "loss": 0.5714, + "step": 335 + }, + { + "epoch": 0.0749498103948249, + "grad_norm": 0.1882167011499405, + "learning_rate": 1.99847103583425e-05, + "loss": 0.5586, + "step": 336 + }, + { + "epoch": 0.07517287530671425, + "grad_norm": 0.20951317250728607, + "learning_rate": 1.9984579990395274e-05, + "loss": 0.5286, + "step": 337 + }, + { + "epoch": 0.07539594021860362, + "grad_norm": 0.2166478931903839, + "learning_rate": 1.9984449069439197e-05, + "loss": 0.5661, + "step": 338 + }, + { + "epoch": 0.07561900513049297, + "grad_norm": 0.191757470369339, + "learning_rate": 1.998431759548153e-05, + "loss": 0.5431, + "step": 339 + }, + { + "epoch": 0.07584207004238233, + "grad_norm": 0.2109043151140213, + "learning_rate": 1.998418556852955e-05, + "loss": 0.5642, + "step": 340 + }, + { + "epoch": 0.0760651349542717, + "grad_norm": 0.2524346709251404, + "learning_rate": 1.9984052988590573e-05, + "loss": 0.5505, + "step": 341 + }, + { + "epoch": 0.07628819986616105, + "grad_norm": 0.22443033754825592, + "learning_rate": 1.9983919855671937e-05, + "loss": 0.5454, + "step": 342 + }, + { + "epoch": 0.07651126477805041, + "grad_norm": 0.1956530660390854, + "learning_rate": 1.9983786169781017e-05, + "loss": 0.5546, + "step": 343 + }, + { + "epoch": 0.07673432968993978, + "grad_norm": 0.20710930228233337, + "learning_rate": 1.9983651930925217e-05, + "loss": 0.5554, + "step": 344 + }, + { + "epoch": 0.07695739460182913, + "grad_norm": 0.2150043398141861, + "learning_rate": 1.9983517139111978e-05, + "loss": 0.5434, + "step": 345 + }, + { + "epoch": 0.07718045951371849, + "grad_norm": 0.21276871860027313, + "learning_rate": 1.998338179434876e-05, + "loss": 0.5623, + "step": 346 + }, + { + "epoch": 0.07740352442560786, + "grad_norm": 0.20740285515785217, + "learning_rate": 1.998324589664306e-05, + "loss": 0.5804, + "step": 347 + }, + { + "epoch": 0.0776265893374972, + "grad_norm": 0.20988604426383972, + "learning_rate": 1.99831094460024e-05, + "loss": 0.5742, + "step": 348 + }, + { + "epoch": 0.07784965424938657, + "grad_norm": 0.19990280270576477, + "learning_rate": 1.9982972442434346e-05, + "loss": 0.5397, + "step": 349 + }, + { + "epoch": 0.07807271916127594, + "grad_norm": 0.295187383890152, + "learning_rate": 1.9982834885946482e-05, + "loss": 0.5713, + "step": 350 + }, + { + "epoch": 0.07829578407316529, + "grad_norm": 0.20176567137241364, + "learning_rate": 1.998269677654643e-05, + "loss": 0.5548, + "step": 351 + }, + { + "epoch": 0.07851884898505465, + "grad_norm": 0.20890064537525177, + "learning_rate": 1.9982558114241837e-05, + "loss": 0.5739, + "step": 352 + }, + { + "epoch": 0.07874191389694402, + "grad_norm": 0.2795037031173706, + "learning_rate": 1.998241889904038e-05, + "loss": 0.5343, + "step": 353 + }, + { + "epoch": 0.07896497880883337, + "grad_norm": 0.22098244726657867, + "learning_rate": 1.9982279130949775e-05, + "loss": 0.517, + "step": 354 + }, + { + "epoch": 0.07918804372072273, + "grad_norm": 0.20403707027435303, + "learning_rate": 1.998213880997776e-05, + "loss": 0.5599, + "step": 355 + }, + { + "epoch": 0.0794111086326121, + "grad_norm": 0.2201310694217682, + "learning_rate": 1.9981997936132107e-05, + "loss": 0.5348, + "step": 356 + }, + { + "epoch": 0.07963417354450145, + "grad_norm": 0.22818854451179504, + "learning_rate": 1.998185650942062e-05, + "loss": 0.551, + "step": 357 + }, + { + "epoch": 0.07985723845639081, + "grad_norm": 0.23051077127456665, + "learning_rate": 1.9981714529851127e-05, + "loss": 0.5926, + "step": 358 + }, + { + "epoch": 0.08008030336828018, + "grad_norm": 0.21485844254493713, + "learning_rate": 1.99815719974315e-05, + "loss": 0.5801, + "step": 359 + }, + { + "epoch": 0.08030336828016953, + "grad_norm": 0.1935802698135376, + "learning_rate": 1.998142891216963e-05, + "loss": 0.5411, + "step": 360 + }, + { + "epoch": 0.08052643319205889, + "grad_norm": 0.1971837282180786, + "learning_rate": 1.998128527407344e-05, + "loss": 0.5805, + "step": 361 + }, + { + "epoch": 0.08074949810394826, + "grad_norm": 0.20717322826385498, + "learning_rate": 1.9981141083150886e-05, + "loss": 0.5493, + "step": 362 + }, + { + "epoch": 0.0809725630158376, + "grad_norm": 0.2011585831642151, + "learning_rate": 1.9980996339409957e-05, + "loss": 0.5579, + "step": 363 + }, + { + "epoch": 0.08119562792772697, + "grad_norm": 0.19211354851722717, + "learning_rate": 1.9980851042858664e-05, + "loss": 0.5383, + "step": 364 + }, + { + "epoch": 0.08141869283961632, + "grad_norm": 0.21699954569339752, + "learning_rate": 1.998070519350506e-05, + "loss": 0.5524, + "step": 365 + }, + { + "epoch": 0.08164175775150569, + "grad_norm": 0.20210997760295868, + "learning_rate": 1.9980558791357222e-05, + "loss": 0.5738, + "step": 366 + }, + { + "epoch": 0.08186482266339505, + "grad_norm": 0.18855418264865875, + "learning_rate": 1.9980411836423256e-05, + "loss": 0.547, + "step": 367 + }, + { + "epoch": 0.0820878875752844, + "grad_norm": 0.21640720963478088, + "learning_rate": 1.9980264328711305e-05, + "loss": 0.5964, + "step": 368 + }, + { + "epoch": 0.08231095248717377, + "grad_norm": 0.2280052751302719, + "learning_rate": 1.9980116268229536e-05, + "loss": 0.561, + "step": 369 + }, + { + "epoch": 0.08253401739906313, + "grad_norm": 0.1860683411359787, + "learning_rate": 1.9979967654986155e-05, + "loss": 0.545, + "step": 370 + }, + { + "epoch": 0.08275708231095248, + "grad_norm": 0.19923582673072815, + "learning_rate": 1.9979818488989383e-05, + "loss": 0.5421, + "step": 371 + }, + { + "epoch": 0.08298014722284185, + "grad_norm": 0.23478665947914124, + "learning_rate": 1.997966877024749e-05, + "loss": 0.5652, + "step": 372 + }, + { + "epoch": 0.08320321213473121, + "grad_norm": 0.2126697599887848, + "learning_rate": 1.9979518498768768e-05, + "loss": 0.5599, + "step": 373 + }, + { + "epoch": 0.08342627704662056, + "grad_norm": 0.20854973793029785, + "learning_rate": 1.9979367674561535e-05, + "loss": 0.5425, + "step": 374 + }, + { + "epoch": 0.08364934195850993, + "grad_norm": 0.20865046977996826, + "learning_rate": 1.9979216297634148e-05, + "loss": 0.5529, + "step": 375 + }, + { + "epoch": 0.08387240687039929, + "grad_norm": 0.24369299411773682, + "learning_rate": 1.997906436799499e-05, + "loss": 0.535, + "step": 376 + }, + { + "epoch": 0.08409547178228864, + "grad_norm": 0.2064153105020523, + "learning_rate": 1.9978911885652475e-05, + "loss": 0.5488, + "step": 377 + }, + { + "epoch": 0.084318536694178, + "grad_norm": 0.21306705474853516, + "learning_rate": 1.997875885061505e-05, + "loss": 0.5627, + "step": 378 + }, + { + "epoch": 0.08454160160606737, + "grad_norm": 0.19575418531894684, + "learning_rate": 1.9978605262891196e-05, + "loss": 0.5565, + "step": 379 + }, + { + "epoch": 0.08476466651795672, + "grad_norm": 0.19842910766601562, + "learning_rate": 1.9978451122489412e-05, + "loss": 0.5709, + "step": 380 + }, + { + "epoch": 0.08498773142984609, + "grad_norm": 0.20483329892158508, + "learning_rate": 1.9978296429418237e-05, + "loss": 0.5773, + "step": 381 + }, + { + "epoch": 0.08521079634173545, + "grad_norm": 0.22734715044498444, + "learning_rate": 1.997814118368624e-05, + "loss": 0.5469, + "step": 382 + }, + { + "epoch": 0.0854338612536248, + "grad_norm": 0.19727206230163574, + "learning_rate": 1.997798538530202e-05, + "loss": 0.5541, + "step": 383 + }, + { + "epoch": 0.08565692616551417, + "grad_norm": 0.19417184591293335, + "learning_rate": 1.9977829034274205e-05, + "loss": 0.5513, + "step": 384 + }, + { + "epoch": 0.08587999107740353, + "grad_norm": 0.2037704885005951, + "learning_rate": 1.9977672130611454e-05, + "loss": 0.5567, + "step": 385 + }, + { + "epoch": 0.08610305598929288, + "grad_norm": 0.18121762573719025, + "learning_rate": 1.997751467432246e-05, + "loss": 0.5357, + "step": 386 + }, + { + "epoch": 0.08632612090118225, + "grad_norm": 0.27105584740638733, + "learning_rate": 1.997735666541594e-05, + "loss": 0.5654, + "step": 387 + }, + { + "epoch": 0.08654918581307161, + "grad_norm": 0.20133094489574432, + "learning_rate": 1.997719810390065e-05, + "loss": 0.5349, + "step": 388 + }, + { + "epoch": 0.08677225072496096, + "grad_norm": 0.18842002749443054, + "learning_rate": 1.997703898978537e-05, + "loss": 0.5352, + "step": 389 + }, + { + "epoch": 0.08699531563685033, + "grad_norm": 0.2452889233827591, + "learning_rate": 1.9976879323078913e-05, + "loss": 0.5899, + "step": 390 + }, + { + "epoch": 0.08721838054873968, + "grad_norm": 0.2256636619567871, + "learning_rate": 1.9976719103790118e-05, + "loss": 0.5115, + "step": 391 + }, + { + "epoch": 0.08744144546062904, + "grad_norm": 0.1972501426935196, + "learning_rate": 1.9976558331927868e-05, + "loss": 0.5523, + "step": 392 + }, + { + "epoch": 0.0876645103725184, + "grad_norm": 0.2026263177394867, + "learning_rate": 1.9976397007501062e-05, + "loss": 0.5758, + "step": 393 + }, + { + "epoch": 0.08788757528440776, + "grad_norm": 0.20572836697101593, + "learning_rate": 1.9976235130518632e-05, + "loss": 0.5429, + "step": 394 + }, + { + "epoch": 0.08811064019629712, + "grad_norm": 0.18851211667060852, + "learning_rate": 1.997607270098955e-05, + "loss": 0.5616, + "step": 395 + }, + { + "epoch": 0.08833370510818649, + "grad_norm": 0.20447294414043427, + "learning_rate": 1.9975909718922806e-05, + "loss": 0.5527, + "step": 396 + }, + { + "epoch": 0.08855677002007584, + "grad_norm": 0.19188085198402405, + "learning_rate": 1.997574618432744e-05, + "loss": 0.558, + "step": 397 + }, + { + "epoch": 0.0887798349319652, + "grad_norm": 0.19854480028152466, + "learning_rate": 1.997558209721249e-05, + "loss": 0.5756, + "step": 398 + }, + { + "epoch": 0.08900289984385457, + "grad_norm": 0.19832177460193634, + "learning_rate": 1.997541745758706e-05, + "loss": 0.569, + "step": 399 + }, + { + "epoch": 0.08922596475574392, + "grad_norm": 0.20165055990219116, + "learning_rate": 1.9975252265460265e-05, + "loss": 0.5376, + "step": 400 + }, + { + "epoch": 0.08944902966763328, + "grad_norm": 0.18997038900852203, + "learning_rate": 1.997508652084125e-05, + "loss": 0.5745, + "step": 401 + }, + { + "epoch": 0.08967209457952265, + "grad_norm": 0.19646713137626648, + "learning_rate": 1.9974920223739195e-05, + "loss": 0.5454, + "step": 402 + }, + { + "epoch": 0.089895159491412, + "grad_norm": 0.1976052224636078, + "learning_rate": 1.997475337416332e-05, + "loss": 0.537, + "step": 403 + }, + { + "epoch": 0.09011822440330136, + "grad_norm": 0.2045581340789795, + "learning_rate": 1.9974585972122857e-05, + "loss": 0.5405, + "step": 404 + }, + { + "epoch": 0.09034128931519073, + "grad_norm": 0.18631087243556976, + "learning_rate": 1.9974418017627076e-05, + "loss": 0.6034, + "step": 405 + }, + { + "epoch": 0.09056435422708008, + "grad_norm": 0.2022215723991394, + "learning_rate": 1.9974249510685285e-05, + "loss": 0.5682, + "step": 406 + }, + { + "epoch": 0.09078741913896944, + "grad_norm": 0.19569364190101624, + "learning_rate": 1.9974080451306816e-05, + "loss": 0.5739, + "step": 407 + }, + { + "epoch": 0.0910104840508588, + "grad_norm": 0.20746709406375885, + "learning_rate": 1.9973910839501035e-05, + "loss": 0.55, + "step": 408 + }, + { + "epoch": 0.09123354896274816, + "grad_norm": 0.21757768094539642, + "learning_rate": 1.997374067527733e-05, + "loss": 0.5404, + "step": 409 + }, + { + "epoch": 0.09145661387463752, + "grad_norm": 0.1881277859210968, + "learning_rate": 1.997356995864513e-05, + "loss": 0.5435, + "step": 410 + }, + { + "epoch": 0.09167967878652689, + "grad_norm": 0.1868787705898285, + "learning_rate": 1.9973398689613892e-05, + "loss": 0.5373, + "step": 411 + }, + { + "epoch": 0.09190274369841624, + "grad_norm": 0.1938547044992447, + "learning_rate": 1.9973226868193096e-05, + "loss": 0.5846, + "step": 412 + }, + { + "epoch": 0.0921258086103056, + "grad_norm": 0.19920659065246582, + "learning_rate": 1.9973054494392265e-05, + "loss": 0.5615, + "step": 413 + }, + { + "epoch": 0.09234887352219495, + "grad_norm": 0.18801911175251007, + "learning_rate": 1.997288156822094e-05, + "loss": 0.5675, + "step": 414 + }, + { + "epoch": 0.09257193843408432, + "grad_norm": 0.1990530639886856, + "learning_rate": 1.9972708089688705e-05, + "loss": 0.5626, + "step": 415 + }, + { + "epoch": 0.09279500334597368, + "grad_norm": 0.18555153906345367, + "learning_rate": 1.9972534058805163e-05, + "loss": 0.557, + "step": 416 + }, + { + "epoch": 0.09301806825786303, + "grad_norm": 0.20318473875522614, + "learning_rate": 1.9972359475579953e-05, + "loss": 0.5196, + "step": 417 + }, + { + "epoch": 0.0932411331697524, + "grad_norm": 0.20757220685482025, + "learning_rate": 1.997218434002275e-05, + "loss": 0.5454, + "step": 418 + }, + { + "epoch": 0.09346419808164176, + "grad_norm": 0.20588824152946472, + "learning_rate": 1.997200865214325e-05, + "loss": 0.5347, + "step": 419 + }, + { + "epoch": 0.09368726299353111, + "grad_norm": 0.20199880003929138, + "learning_rate": 1.9971832411951186e-05, + "loss": 0.5531, + "step": 420 + }, + { + "epoch": 0.09391032790542048, + "grad_norm": 0.19605985283851624, + "learning_rate": 1.9971655619456313e-05, + "loss": 0.5263, + "step": 421 + }, + { + "epoch": 0.09413339281730984, + "grad_norm": 0.19716261327266693, + "learning_rate": 1.997147827466843e-05, + "loss": 0.5722, + "step": 422 + }, + { + "epoch": 0.09435645772919919, + "grad_norm": 0.20400184392929077, + "learning_rate": 1.997130037759736e-05, + "loss": 0.5629, + "step": 423 + }, + { + "epoch": 0.09457952264108856, + "grad_norm": 0.203161358833313, + "learning_rate": 1.997112192825295e-05, + "loss": 0.5538, + "step": 424 + }, + { + "epoch": 0.09480258755297792, + "grad_norm": 0.20870056748390198, + "learning_rate": 1.997094292664509e-05, + "loss": 0.5548, + "step": 425 + }, + { + "epoch": 0.09502565246486727, + "grad_norm": 0.1856376975774765, + "learning_rate": 1.9970763372783687e-05, + "loss": 0.5193, + "step": 426 + }, + { + "epoch": 0.09524871737675664, + "grad_norm": 0.1954246610403061, + "learning_rate": 1.997058326667869e-05, + "loss": 0.5336, + "step": 427 + }, + { + "epoch": 0.095471782288646, + "grad_norm": 0.1996607482433319, + "learning_rate": 1.9970402608340076e-05, + "loss": 0.5708, + "step": 428 + }, + { + "epoch": 0.09569484720053535, + "grad_norm": 0.21878129243850708, + "learning_rate": 1.9970221397777848e-05, + "loss": 0.5794, + "step": 429 + }, + { + "epoch": 0.09591791211242472, + "grad_norm": 0.19142527878284454, + "learning_rate": 1.9970039635002044e-05, + "loss": 0.5221, + "step": 430 + }, + { + "epoch": 0.09614097702431408, + "grad_norm": 0.18871352076530457, + "learning_rate": 1.996985732002273e-05, + "loss": 0.5321, + "step": 431 + }, + { + "epoch": 0.09636404193620343, + "grad_norm": 0.1942790001630783, + "learning_rate": 1.996967445285001e-05, + "loss": 0.5269, + "step": 432 + }, + { + "epoch": 0.0965871068480928, + "grad_norm": 0.19900670647621155, + "learning_rate": 1.9969491033494e-05, + "loss": 0.6048, + "step": 433 + }, + { + "epoch": 0.09681017175998216, + "grad_norm": 0.193121075630188, + "learning_rate": 1.9969307061964873e-05, + "loss": 0.5613, + "step": 434 + }, + { + "epoch": 0.09703323667187151, + "grad_norm": 0.1859043687582016, + "learning_rate": 1.9969122538272807e-05, + "loss": 0.5657, + "step": 435 + }, + { + "epoch": 0.09725630158376088, + "grad_norm": 0.18597924709320068, + "learning_rate": 1.9968937462428028e-05, + "loss": 0.5336, + "step": 436 + }, + { + "epoch": 0.09747936649565024, + "grad_norm": 0.17796443402767181, + "learning_rate": 1.9968751834440783e-05, + "loss": 0.5611, + "step": 437 + }, + { + "epoch": 0.09770243140753959, + "grad_norm": 0.1958586871623993, + "learning_rate": 1.9968565654321356e-05, + "loss": 0.5377, + "step": 438 + }, + { + "epoch": 0.09792549631942896, + "grad_norm": 0.21395692229270935, + "learning_rate": 1.996837892208006e-05, + "loss": 0.5347, + "step": 439 + }, + { + "epoch": 0.09814856123131831, + "grad_norm": 0.19348031282424927, + "learning_rate": 1.9968191637727235e-05, + "loss": 0.5576, + "step": 440 + }, + { + "epoch": 0.09837162614320767, + "grad_norm": 0.19345401227474213, + "learning_rate": 1.9968003801273253e-05, + "loss": 0.5697, + "step": 441 + }, + { + "epoch": 0.09859469105509704, + "grad_norm": 0.18203531205654144, + "learning_rate": 1.9967815412728523e-05, + "loss": 0.5416, + "step": 442 + }, + { + "epoch": 0.09881775596698639, + "grad_norm": 0.20109198987483978, + "learning_rate": 1.9967626472103472e-05, + "loss": 0.5514, + "step": 443 + }, + { + "epoch": 0.09904082087887575, + "grad_norm": 0.18818293511867523, + "learning_rate": 1.996743697940857e-05, + "loss": 0.5535, + "step": 444 + }, + { + "epoch": 0.09926388579076512, + "grad_norm": 0.18718019127845764, + "learning_rate": 1.996724693465431e-05, + "loss": 0.5595, + "step": 445 + }, + { + "epoch": 0.09948695070265447, + "grad_norm": 0.18856103718280792, + "learning_rate": 1.9967056337851217e-05, + "loss": 0.5364, + "step": 446 + }, + { + "epoch": 0.09971001561454383, + "grad_norm": 0.18582871556282043, + "learning_rate": 1.996686518900985e-05, + "loss": 0.5632, + "step": 447 + }, + { + "epoch": 0.0999330805264332, + "grad_norm": 0.19166669249534607, + "learning_rate": 1.9966673488140794e-05, + "loss": 0.5727, + "step": 448 + }, + { + "epoch": 0.10015614543832255, + "grad_norm": 0.18631108105182648, + "learning_rate": 1.9966481235254667e-05, + "loss": 0.5472, + "step": 449 + }, + { + "epoch": 0.10037921035021191, + "grad_norm": 0.18970704078674316, + "learning_rate": 1.996628843036212e-05, + "loss": 0.5355, + "step": 450 + }, + { + "epoch": 0.10060227526210128, + "grad_norm": 0.18836888670921326, + "learning_rate": 1.9966095073473828e-05, + "loss": 0.5309, + "step": 451 + }, + { + "epoch": 0.10082534017399063, + "grad_norm": 0.1924007087945938, + "learning_rate": 1.99659011646005e-05, + "loss": 0.5213, + "step": 452 + }, + { + "epoch": 0.10104840508587999, + "grad_norm": 0.19710196554660797, + "learning_rate": 1.996570670375288e-05, + "loss": 0.5333, + "step": 453 + }, + { + "epoch": 0.10127146999776936, + "grad_norm": 0.1823819875717163, + "learning_rate": 1.9965511690941737e-05, + "loss": 0.5378, + "step": 454 + }, + { + "epoch": 0.1014945349096587, + "grad_norm": 0.21200565993785858, + "learning_rate": 1.9965316126177867e-05, + "loss": 0.5594, + "step": 455 + }, + { + "epoch": 0.10171759982154807, + "grad_norm": 0.18618617951869965, + "learning_rate": 1.9965120009472106e-05, + "loss": 0.5317, + "step": 456 + }, + { + "epoch": 0.10194066473343744, + "grad_norm": 0.19600717723369598, + "learning_rate": 1.996492334083532e-05, + "loss": 0.5612, + "step": 457 + }, + { + "epoch": 0.10216372964532679, + "grad_norm": 0.1988876461982727, + "learning_rate": 1.9964726120278394e-05, + "loss": 0.5709, + "step": 458 + }, + { + "epoch": 0.10238679455721615, + "grad_norm": 0.18938012421131134, + "learning_rate": 1.9964528347812255e-05, + "loss": 0.5413, + "step": 459 + }, + { + "epoch": 0.10260985946910552, + "grad_norm": 0.18799841403961182, + "learning_rate": 1.9964330023447854e-05, + "loss": 0.5534, + "step": 460 + }, + { + "epoch": 0.10283292438099487, + "grad_norm": 0.229729562997818, + "learning_rate": 1.9964131147196185e-05, + "loss": 0.5471, + "step": 461 + }, + { + "epoch": 0.10305598929288423, + "grad_norm": 0.19921550154685974, + "learning_rate": 1.9963931719068253e-05, + "loss": 0.5419, + "step": 462 + }, + { + "epoch": 0.1032790542047736, + "grad_norm": 0.19583772122859955, + "learning_rate": 1.9963731739075106e-05, + "loss": 0.5412, + "step": 463 + }, + { + "epoch": 0.10350211911666295, + "grad_norm": 0.19680215418338776, + "learning_rate": 1.996353120722782e-05, + "loss": 0.5477, + "step": 464 + }, + { + "epoch": 0.10372518402855231, + "grad_norm": 0.20173701643943787, + "learning_rate": 1.9963330123537507e-05, + "loss": 0.5523, + "step": 465 + }, + { + "epoch": 0.10394824894044166, + "grad_norm": 0.23238573968410492, + "learning_rate": 1.9963128488015294e-05, + "loss": 0.559, + "step": 466 + }, + { + "epoch": 0.10417131385233103, + "grad_norm": 0.22626003623008728, + "learning_rate": 1.996292630067236e-05, + "loss": 0.526, + "step": 467 + }, + { + "epoch": 0.10439437876422039, + "grad_norm": 0.21178299188613892, + "learning_rate": 1.9962723561519893e-05, + "loss": 0.5301, + "step": 468 + }, + { + "epoch": 0.10461744367610974, + "grad_norm": 0.1937963217496872, + "learning_rate": 1.996252027056913e-05, + "loss": 0.5449, + "step": 469 + }, + { + "epoch": 0.1048405085879991, + "grad_norm": 0.18095934391021729, + "learning_rate": 1.996231642783133e-05, + "loss": 0.5676, + "step": 470 + }, + { + "epoch": 0.10506357349988847, + "grad_norm": 0.2064565271139145, + "learning_rate": 1.9962112033317776e-05, + "loss": 0.5157, + "step": 471 + }, + { + "epoch": 0.10528663841177782, + "grad_norm": 0.1964641511440277, + "learning_rate": 1.9961907087039796e-05, + "loss": 0.6127, + "step": 472 + }, + { + "epoch": 0.10550970332366719, + "grad_norm": 0.17736569046974182, + "learning_rate": 1.996170158900874e-05, + "loss": 0.545, + "step": 473 + }, + { + "epoch": 0.10573276823555655, + "grad_norm": 0.1836784929037094, + "learning_rate": 1.9961495539235985e-05, + "loss": 0.5525, + "step": 474 + }, + { + "epoch": 0.1059558331474459, + "grad_norm": 0.2029949426651001, + "learning_rate": 1.996128893773295e-05, + "loss": 0.5558, + "step": 475 + }, + { + "epoch": 0.10617889805933527, + "grad_norm": 0.207282155752182, + "learning_rate": 1.9961081784511073e-05, + "loss": 0.5789, + "step": 476 + }, + { + "epoch": 0.10640196297122463, + "grad_norm": 0.20277360081672668, + "learning_rate": 1.9960874079581828e-05, + "loss": 0.5555, + "step": 477 + }, + { + "epoch": 0.10662502788311398, + "grad_norm": 0.21180874109268188, + "learning_rate": 1.996066582295672e-05, + "loss": 0.574, + "step": 478 + }, + { + "epoch": 0.10684809279500335, + "grad_norm": 0.22067199647426605, + "learning_rate": 1.996045701464729e-05, + "loss": 0.5641, + "step": 479 + }, + { + "epoch": 0.10707115770689271, + "grad_norm": 0.2735600471496582, + "learning_rate": 1.9960247654665088e-05, + "loss": 0.5473, + "step": 480 + }, + { + "epoch": 0.10729422261878206, + "grad_norm": 0.19656887650489807, + "learning_rate": 1.9960037743021723e-05, + "loss": 0.5614, + "step": 481 + }, + { + "epoch": 0.10751728753067143, + "grad_norm": 0.19436019659042358, + "learning_rate": 1.9959827279728815e-05, + "loss": 0.5554, + "step": 482 + }, + { + "epoch": 0.10774035244256079, + "grad_norm": 0.1874009072780609, + "learning_rate": 1.9959616264798022e-05, + "loss": 0.5723, + "step": 483 + }, + { + "epoch": 0.10796341735445014, + "grad_norm": 0.2946108877658844, + "learning_rate": 1.9959404698241037e-05, + "loss": 0.5572, + "step": 484 + }, + { + "epoch": 0.1081864822663395, + "grad_norm": 0.1900031566619873, + "learning_rate": 1.9959192580069567e-05, + "loss": 0.5629, + "step": 485 + }, + { + "epoch": 0.10840954717822887, + "grad_norm": 0.2330961972475052, + "learning_rate": 1.9958979910295367e-05, + "loss": 0.5725, + "step": 486 + }, + { + "epoch": 0.10863261209011822, + "grad_norm": 0.23831723630428314, + "learning_rate": 1.9958766688930215e-05, + "loss": 0.5549, + "step": 487 + }, + { + "epoch": 0.10885567700200759, + "grad_norm": 0.2095993012189865, + "learning_rate": 1.9958552915985923e-05, + "loss": 0.541, + "step": 488 + }, + { + "epoch": 0.10907874191389694, + "grad_norm": 0.18837721645832062, + "learning_rate": 1.9958338591474327e-05, + "loss": 0.5245, + "step": 489 + }, + { + "epoch": 0.1093018068257863, + "grad_norm": 0.19427752494812012, + "learning_rate": 1.99581237154073e-05, + "loss": 0.5442, + "step": 490 + }, + { + "epoch": 0.10952487173767567, + "grad_norm": 0.19493944942951202, + "learning_rate": 1.9957908287796743e-05, + "loss": 0.5559, + "step": 491 + }, + { + "epoch": 0.10974793664956502, + "grad_norm": 0.18494555354118347, + "learning_rate": 1.9957692308654586e-05, + "loss": 0.5427, + "step": 492 + }, + { + "epoch": 0.10997100156145438, + "grad_norm": 0.19978608191013336, + "learning_rate": 1.9957475777992794e-05, + "loss": 0.5391, + "step": 493 + }, + { + "epoch": 0.11019406647334375, + "grad_norm": 0.18339702486991882, + "learning_rate": 1.9957258695823358e-05, + "loss": 0.5454, + "step": 494 + }, + { + "epoch": 0.1104171313852331, + "grad_norm": 0.18613296747207642, + "learning_rate": 1.99570410621583e-05, + "loss": 0.5371, + "step": 495 + }, + { + "epoch": 0.11064019629712246, + "grad_norm": 0.19776608049869537, + "learning_rate": 1.9956822877009676e-05, + "loss": 0.5405, + "step": 496 + }, + { + "epoch": 0.11086326120901183, + "grad_norm": 0.1891903132200241, + "learning_rate": 1.9956604140389574e-05, + "loss": 0.5755, + "step": 497 + }, + { + "epoch": 0.11108632612090118, + "grad_norm": 0.20654836297035217, + "learning_rate": 1.9956384852310102e-05, + "loss": 0.5471, + "step": 498 + }, + { + "epoch": 0.11130939103279054, + "grad_norm": 0.18682821094989777, + "learning_rate": 1.995616501278341e-05, + "loss": 0.5479, + "step": 499 + }, + { + "epoch": 0.1115324559446799, + "grad_norm": 0.20868845283985138, + "learning_rate": 1.995594462182167e-05, + "loss": 0.5405, + "step": 500 + }, + { + "epoch": 0.11175552085656926, + "grad_norm": 0.20793192088603973, + "learning_rate": 1.9955723679437093e-05, + "loss": 0.5468, + "step": 501 + }, + { + "epoch": 0.11197858576845862, + "grad_norm": 0.18734110891819, + "learning_rate": 1.9955502185641915e-05, + "loss": 0.5336, + "step": 502 + }, + { + "epoch": 0.11220165068034799, + "grad_norm": 0.18563862144947052, + "learning_rate": 1.9955280140448404e-05, + "loss": 0.549, + "step": 503 + }, + { + "epoch": 0.11242471559223734, + "grad_norm": 0.19853924214839935, + "learning_rate": 1.9955057543868858e-05, + "loss": 0.5494, + "step": 504 + }, + { + "epoch": 0.1126477805041267, + "grad_norm": 0.2054789960384369, + "learning_rate": 1.9954834395915604e-05, + "loss": 0.6006, + "step": 505 + }, + { + "epoch": 0.11287084541601607, + "grad_norm": 0.1880260407924652, + "learning_rate": 1.9954610696601e-05, + "loss": 0.5664, + "step": 506 + }, + { + "epoch": 0.11309391032790542, + "grad_norm": 0.20359358191490173, + "learning_rate": 1.9954386445937444e-05, + "loss": 0.5387, + "step": 507 + }, + { + "epoch": 0.11331697523979478, + "grad_norm": 0.20446328818798065, + "learning_rate": 1.995416164393735e-05, + "loss": 0.6078, + "step": 508 + }, + { + "epoch": 0.11354004015168415, + "grad_norm": 0.20027638971805573, + "learning_rate": 1.9953936290613166e-05, + "loss": 0.4875, + "step": 509 + }, + { + "epoch": 0.1137631050635735, + "grad_norm": 0.19979824125766754, + "learning_rate": 1.9953710385977382e-05, + "loss": 0.5599, + "step": 510 + }, + { + "epoch": 0.11398616997546286, + "grad_norm": 0.17714561522006989, + "learning_rate": 1.9953483930042503e-05, + "loss": 0.536, + "step": 511 + }, + { + "epoch": 0.11420923488735223, + "grad_norm": 0.21002604067325592, + "learning_rate": 1.9953256922821075e-05, + "loss": 0.5388, + "step": 512 + }, + { + "epoch": 0.11443229979924158, + "grad_norm": 0.19017495214939117, + "learning_rate": 1.995302936432567e-05, + "loss": 0.5474, + "step": 513 + }, + { + "epoch": 0.11465536471113094, + "grad_norm": 0.1896909773349762, + "learning_rate": 1.995280125456889e-05, + "loss": 0.5577, + "step": 514 + }, + { + "epoch": 0.11487842962302029, + "grad_norm": 0.20575307309627533, + "learning_rate": 1.9952572593563375e-05, + "loss": 0.5204, + "step": 515 + }, + { + "epoch": 0.11510149453490966, + "grad_norm": 0.19565477967262268, + "learning_rate": 1.9952343381321785e-05, + "loss": 0.5971, + "step": 516 + }, + { + "epoch": 0.11532455944679902, + "grad_norm": 0.1984451860189438, + "learning_rate": 1.995211361785681e-05, + "loss": 0.5531, + "step": 517 + }, + { + "epoch": 0.11554762435868837, + "grad_norm": 0.18850469589233398, + "learning_rate": 1.9951883303181184e-05, + "loss": 0.5419, + "step": 518 + }, + { + "epoch": 0.11577068927057774, + "grad_norm": 0.22787964344024658, + "learning_rate": 1.9951652437307664e-05, + "loss": 0.5403, + "step": 519 + }, + { + "epoch": 0.1159937541824671, + "grad_norm": 0.29980340600013733, + "learning_rate": 1.995142102024903e-05, + "loss": 0.5365, + "step": 520 + }, + { + "epoch": 0.11621681909435645, + "grad_norm": 0.17774684727191925, + "learning_rate": 1.995118905201811e-05, + "loss": 0.545, + "step": 521 + }, + { + "epoch": 0.11643988400624582, + "grad_norm": 0.20445318520069122, + "learning_rate": 1.995095653262774e-05, + "loss": 0.5883, + "step": 522 + }, + { + "epoch": 0.11666294891813518, + "grad_norm": 0.1800977885723114, + "learning_rate": 1.9950723462090803e-05, + "loss": 0.5579, + "step": 523 + }, + { + "epoch": 0.11688601383002453, + "grad_norm": 0.18807321786880493, + "learning_rate": 1.9950489840420207e-05, + "loss": 0.5295, + "step": 524 + }, + { + "epoch": 0.1171090787419139, + "grad_norm": 0.17663832008838654, + "learning_rate": 1.9950255667628894e-05, + "loss": 0.5377, + "step": 525 + }, + { + "epoch": 0.11733214365380326, + "grad_norm": 0.18556755781173706, + "learning_rate": 1.9950020943729834e-05, + "loss": 0.5314, + "step": 526 + }, + { + "epoch": 0.11755520856569261, + "grad_norm": 0.18376345932483673, + "learning_rate": 1.994978566873602e-05, + "loss": 0.5383, + "step": 527 + }, + { + "epoch": 0.11777827347758198, + "grad_norm": 0.18771792948246002, + "learning_rate": 1.9949549842660495e-05, + "loss": 0.5273, + "step": 528 + }, + { + "epoch": 0.11800133838947134, + "grad_norm": 0.1931321620941162, + "learning_rate": 1.9949313465516312e-05, + "loss": 0.5692, + "step": 529 + }, + { + "epoch": 0.11822440330136069, + "grad_norm": 0.18806159496307373, + "learning_rate": 1.9949076537316566e-05, + "loss": 0.5375, + "step": 530 + }, + { + "epoch": 0.11844746821325006, + "grad_norm": 0.18746712803840637, + "learning_rate": 1.9948839058074383e-05, + "loss": 0.5418, + "step": 531 + }, + { + "epoch": 0.11867053312513942, + "grad_norm": 0.18869024515151978, + "learning_rate": 1.9948601027802908e-05, + "loss": 0.5059, + "step": 532 + }, + { + "epoch": 0.11889359803702877, + "grad_norm": 0.20482954382896423, + "learning_rate": 1.994836244651533e-05, + "loss": 0.5359, + "step": 533 + }, + { + "epoch": 0.11911666294891814, + "grad_norm": 0.17584972083568573, + "learning_rate": 1.9948123314224862e-05, + "loss": 0.5221, + "step": 534 + }, + { + "epoch": 0.1193397278608075, + "grad_norm": 0.1878448724746704, + "learning_rate": 1.994788363094475e-05, + "loss": 0.5358, + "step": 535 + }, + { + "epoch": 0.11956279277269685, + "grad_norm": 0.17549312114715576, + "learning_rate": 1.9947643396688266e-05, + "loss": 0.5391, + "step": 536 + }, + { + "epoch": 0.11978585768458622, + "grad_norm": 0.18577958643436432, + "learning_rate": 1.9947402611468714e-05, + "loss": 0.5606, + "step": 537 + }, + { + "epoch": 0.12000892259647558, + "grad_norm": 0.17562542855739594, + "learning_rate": 1.994716127529944e-05, + "loss": 0.5186, + "step": 538 + }, + { + "epoch": 0.12023198750836493, + "grad_norm": 0.18019866943359375, + "learning_rate": 1.9946919388193803e-05, + "loss": 0.524, + "step": 539 + }, + { + "epoch": 0.1204550524202543, + "grad_norm": 0.18850797414779663, + "learning_rate": 1.99466769501652e-05, + "loss": 0.5496, + "step": 540 + }, + { + "epoch": 0.12067811733214365, + "grad_norm": 0.1817145198583603, + "learning_rate": 1.9946433961227062e-05, + "loss": 0.5303, + "step": 541 + }, + { + "epoch": 0.12090118224403301, + "grad_norm": 0.2111491858959198, + "learning_rate": 1.9946190421392845e-05, + "loss": 0.5526, + "step": 542 + }, + { + "epoch": 0.12112424715592238, + "grad_norm": 0.294716477394104, + "learning_rate": 1.9945946330676036e-05, + "loss": 0.5172, + "step": 543 + }, + { + "epoch": 0.12134731206781173, + "grad_norm": 0.21202930808067322, + "learning_rate": 1.994570168909016e-05, + "loss": 0.5295, + "step": 544 + }, + { + "epoch": 0.12157037697970109, + "grad_norm": 0.18554560840129852, + "learning_rate": 1.9945456496648763e-05, + "loss": 0.5347, + "step": 545 + }, + { + "epoch": 0.12179344189159046, + "grad_norm": 0.18445701897144318, + "learning_rate": 1.9945210753365426e-05, + "loss": 0.5433, + "step": 546 + }, + { + "epoch": 0.12201650680347981, + "grad_norm": 0.2190207540988922, + "learning_rate": 1.9944964459253757e-05, + "loss": 0.5342, + "step": 547 + }, + { + "epoch": 0.12223957171536917, + "grad_norm": 0.19571848213672638, + "learning_rate": 1.99447176143274e-05, + "loss": 0.5675, + "step": 548 + }, + { + "epoch": 0.12246263662725854, + "grad_norm": 0.1853405386209488, + "learning_rate": 1.994447021860003e-05, + "loss": 0.5857, + "step": 549 + }, + { + "epoch": 0.12268570153914789, + "grad_norm": 0.19547483325004578, + "learning_rate": 1.9944222272085344e-05, + "loss": 0.5208, + "step": 550 + }, + { + "epoch": 0.12290876645103725, + "grad_norm": 0.20552557706832886, + "learning_rate": 1.994397377479708e-05, + "loss": 0.5518, + "step": 551 + }, + { + "epoch": 0.12313183136292662, + "grad_norm": 0.21979011595249176, + "learning_rate": 1.9943724726748996e-05, + "loss": 0.5658, + "step": 552 + }, + { + "epoch": 0.12335489627481597, + "grad_norm": 0.18478159606456757, + "learning_rate": 1.994347512795489e-05, + "loss": 0.5599, + "step": 553 + }, + { + "epoch": 0.12357796118670533, + "grad_norm": 0.20098286867141724, + "learning_rate": 1.9943224978428582e-05, + "loss": 0.5455, + "step": 554 + }, + { + "epoch": 0.1238010260985947, + "grad_norm": 0.2217499017715454, + "learning_rate": 1.994297427818393e-05, + "loss": 0.5512, + "step": 555 + }, + { + "epoch": 0.12402409101048405, + "grad_norm": 0.17720763385295868, + "learning_rate": 1.9942723027234817e-05, + "loss": 0.5295, + "step": 556 + }, + { + "epoch": 0.12424715592237341, + "grad_norm": 0.1762504130601883, + "learning_rate": 1.9942471225595162e-05, + "loss": 0.5361, + "step": 557 + }, + { + "epoch": 0.12447022083426278, + "grad_norm": 0.2265862226486206, + "learning_rate": 1.994221887327891e-05, + "loss": 0.5669, + "step": 558 + }, + { + "epoch": 0.12469328574615213, + "grad_norm": 0.19906531274318695, + "learning_rate": 1.994196597030004e-05, + "loss": 0.5302, + "step": 559 + }, + { + "epoch": 0.12491635065804149, + "grad_norm": 0.17008185386657715, + "learning_rate": 1.9941712516672553e-05, + "loss": 0.5333, + "step": 560 + }, + { + "epoch": 0.12513941556993086, + "grad_norm": 0.18816471099853516, + "learning_rate": 1.994145851241049e-05, + "loss": 0.5679, + "step": 561 + }, + { + "epoch": 0.12536248048182022, + "grad_norm": 0.19027023017406464, + "learning_rate": 1.9941203957527927e-05, + "loss": 0.5507, + "step": 562 + }, + { + "epoch": 0.12558554539370956, + "grad_norm": 0.17551641166210175, + "learning_rate": 1.994094885203895e-05, + "loss": 0.5311, + "step": 563 + }, + { + "epoch": 0.12580861030559892, + "grad_norm": 0.1766747236251831, + "learning_rate": 1.9940693195957696e-05, + "loss": 0.5279, + "step": 564 + }, + { + "epoch": 0.1260316752174883, + "grad_norm": 0.1826571673154831, + "learning_rate": 1.9940436989298322e-05, + "loss": 0.5195, + "step": 565 + }, + { + "epoch": 0.12625474012937765, + "grad_norm": 0.18142053484916687, + "learning_rate": 1.9940180232075025e-05, + "loss": 0.5678, + "step": 566 + }, + { + "epoch": 0.12647780504126702, + "grad_norm": 0.21495957672595978, + "learning_rate": 1.993992292430202e-05, + "loss": 0.5393, + "step": 567 + }, + { + "epoch": 0.12670086995315638, + "grad_norm": 0.18905828893184662, + "learning_rate": 1.9939665065993556e-05, + "loss": 0.5507, + "step": 568 + }, + { + "epoch": 0.12692393486504572, + "grad_norm": 0.19366420805454254, + "learning_rate": 1.9939406657163916e-05, + "loss": 0.5458, + "step": 569 + }, + { + "epoch": 0.12714699977693508, + "grad_norm": 0.20456644892692566, + "learning_rate": 1.9939147697827415e-05, + "loss": 0.5532, + "step": 570 + }, + { + "epoch": 0.12737006468882445, + "grad_norm": 0.20060132443904877, + "learning_rate": 1.9938888187998397e-05, + "loss": 0.5865, + "step": 571 + }, + { + "epoch": 0.1275931296007138, + "grad_norm": 0.18896770477294922, + "learning_rate": 1.9938628127691232e-05, + "loss": 0.5372, + "step": 572 + }, + { + "epoch": 0.12781619451260318, + "grad_norm": 0.1824081540107727, + "learning_rate": 1.9938367516920323e-05, + "loss": 0.5232, + "step": 573 + }, + { + "epoch": 0.12803925942449254, + "grad_norm": 0.4999215602874756, + "learning_rate": 1.993810635570011e-05, + "loss": 0.5323, + "step": 574 + }, + { + "epoch": 0.12826232433638188, + "grad_norm": 0.23097532987594604, + "learning_rate": 1.993784464404505e-05, + "loss": 0.544, + "step": 575 + }, + { + "epoch": 0.12848538924827124, + "grad_norm": 0.1975661814212799, + "learning_rate": 1.993758238196964e-05, + "loss": 0.5411, + "step": 576 + }, + { + "epoch": 0.1287084541601606, + "grad_norm": 0.18706879019737244, + "learning_rate": 1.9937319569488414e-05, + "loss": 0.5386, + "step": 577 + }, + { + "epoch": 0.12893151907204997, + "grad_norm": 0.19232505559921265, + "learning_rate": 1.993705620661592e-05, + "loss": 0.5327, + "step": 578 + }, + { + "epoch": 0.12915458398393934, + "grad_norm": 0.20714089274406433, + "learning_rate": 1.9936792293366744e-05, + "loss": 0.5381, + "step": 579 + }, + { + "epoch": 0.12937764889582867, + "grad_norm": 0.18375153839588165, + "learning_rate": 1.993652782975551e-05, + "loss": 0.5664, + "step": 580 + }, + { + "epoch": 0.12960071380771804, + "grad_norm": 0.19149671494960785, + "learning_rate": 1.993626281579686e-05, + "loss": 0.5265, + "step": 581 + }, + { + "epoch": 0.1298237787196074, + "grad_norm": 0.19362886250019073, + "learning_rate": 1.9935997251505473e-05, + "loss": 0.517, + "step": 582 + }, + { + "epoch": 0.13004684363149677, + "grad_norm": 0.19237902760505676, + "learning_rate": 1.993573113689606e-05, + "loss": 0.529, + "step": 583 + }, + { + "epoch": 0.13026990854338613, + "grad_norm": 0.2098451405763626, + "learning_rate": 1.9935464471983354e-05, + "loss": 0.536, + "step": 584 + }, + { + "epoch": 0.1304929734552755, + "grad_norm": 0.17832371592521667, + "learning_rate": 1.993519725678213e-05, + "loss": 0.528, + "step": 585 + }, + { + "epoch": 0.13071603836716483, + "grad_norm": 0.19082553684711456, + "learning_rate": 1.9934929491307194e-05, + "loss": 0.5208, + "step": 586 + }, + { + "epoch": 0.1309391032790542, + "grad_norm": 0.19741079211235046, + "learning_rate": 1.9934661175573363e-05, + "loss": 0.5605, + "step": 587 + }, + { + "epoch": 0.13116216819094356, + "grad_norm": 0.17371487617492676, + "learning_rate": 1.9934392309595504e-05, + "loss": 0.5462, + "step": 588 + }, + { + "epoch": 0.13138523310283293, + "grad_norm": 0.17552132904529572, + "learning_rate": 1.9934122893388512e-05, + "loss": 0.5343, + "step": 589 + }, + { + "epoch": 0.1316082980147223, + "grad_norm": 0.19698654115200043, + "learning_rate": 1.9933852926967305e-05, + "loss": 0.5711, + "step": 590 + }, + { + "epoch": 0.13183136292661166, + "grad_norm": 0.19045893847942352, + "learning_rate": 1.993358241034684e-05, + "loss": 0.5601, + "step": 591 + }, + { + "epoch": 0.132054427838501, + "grad_norm": 0.17538850009441376, + "learning_rate": 1.9933311343542094e-05, + "loss": 0.5403, + "step": 592 + }, + { + "epoch": 0.13227749275039036, + "grad_norm": 0.1708664894104004, + "learning_rate": 1.9933039726568078e-05, + "loss": 0.5346, + "step": 593 + }, + { + "epoch": 0.13250055766227972, + "grad_norm": 0.1795656532049179, + "learning_rate": 1.9932767559439844e-05, + "loss": 0.5516, + "step": 594 + }, + { + "epoch": 0.1327236225741691, + "grad_norm": 0.18116529285907745, + "learning_rate": 1.9932494842172465e-05, + "loss": 0.5311, + "step": 595 + }, + { + "epoch": 0.13294668748605845, + "grad_norm": 0.20366770029067993, + "learning_rate": 1.9932221574781043e-05, + "loss": 0.5546, + "step": 596 + }, + { + "epoch": 0.13316975239794782, + "grad_norm": 0.1805567592382431, + "learning_rate": 1.9931947757280713e-05, + "loss": 0.5754, + "step": 597 + }, + { + "epoch": 0.13339281730983715, + "grad_norm": 0.1815255880355835, + "learning_rate": 1.9931673389686642e-05, + "loss": 0.5172, + "step": 598 + }, + { + "epoch": 0.13361588222172652, + "grad_norm": 0.1993352770805359, + "learning_rate": 1.9931398472014024e-05, + "loss": 0.5483, + "step": 599 + }, + { + "epoch": 0.13383894713361588, + "grad_norm": 0.1812392920255661, + "learning_rate": 1.993112300427809e-05, + "loss": 0.5418, + "step": 600 + }, + { + "epoch": 0.13406201204550525, + "grad_norm": 0.23115472495555878, + "learning_rate": 1.9930846986494098e-05, + "loss": 0.5661, + "step": 601 + }, + { + "epoch": 0.1342850769573946, + "grad_norm": 0.1799275279045105, + "learning_rate": 1.9930570418677327e-05, + "loss": 0.5536, + "step": 602 + }, + { + "epoch": 0.13450814186928395, + "grad_norm": 0.19140706956386566, + "learning_rate": 1.9930293300843103e-05, + "loss": 0.5405, + "step": 603 + }, + { + "epoch": 0.1347312067811733, + "grad_norm": 0.20953412353992462, + "learning_rate": 1.993001563300677e-05, + "loss": 0.5215, + "step": 604 + }, + { + "epoch": 0.13495427169306268, + "grad_norm": 0.18991845846176147, + "learning_rate": 1.992973741518371e-05, + "loss": 0.5577, + "step": 605 + }, + { + "epoch": 0.13517733660495204, + "grad_norm": 0.18286292254924774, + "learning_rate": 1.9929458647389333e-05, + "loss": 0.5531, + "step": 606 + }, + { + "epoch": 0.1354004015168414, + "grad_norm": 0.1848326176404953, + "learning_rate": 1.9929179329639075e-05, + "loss": 0.5387, + "step": 607 + }, + { + "epoch": 0.13562346642873077, + "grad_norm": 0.17560309171676636, + "learning_rate": 1.9928899461948407e-05, + "loss": 0.5444, + "step": 608 + }, + { + "epoch": 0.1358465313406201, + "grad_norm": 0.18911704421043396, + "learning_rate": 1.9928619044332837e-05, + "loss": 0.506, + "step": 609 + }, + { + "epoch": 0.13606959625250947, + "grad_norm": 0.19542188942432404, + "learning_rate": 1.9928338076807888e-05, + "loss": 0.5222, + "step": 610 + }, + { + "epoch": 0.13629266116439884, + "grad_norm": 0.16921466588974, + "learning_rate": 1.9928056559389123e-05, + "loss": 0.5095, + "step": 611 + }, + { + "epoch": 0.1365157260762882, + "grad_norm": 0.19137442111968994, + "learning_rate": 1.9927774492092137e-05, + "loss": 0.5504, + "step": 612 + }, + { + "epoch": 0.13673879098817757, + "grad_norm": 0.2050313800573349, + "learning_rate": 1.9927491874932553e-05, + "loss": 0.5695, + "step": 613 + }, + { + "epoch": 0.13696185590006693, + "grad_norm": 0.18225258588790894, + "learning_rate": 1.992720870792602e-05, + "loss": 0.5286, + "step": 614 + }, + { + "epoch": 0.13718492081195627, + "grad_norm": 0.18813620507717133, + "learning_rate": 1.9926924991088226e-05, + "loss": 0.5362, + "step": 615 + }, + { + "epoch": 0.13740798572384563, + "grad_norm": 0.21695192158222198, + "learning_rate": 1.9926640724434882e-05, + "loss": 0.5548, + "step": 616 + }, + { + "epoch": 0.137631050635735, + "grad_norm": 0.1780216544866562, + "learning_rate": 1.9926355907981735e-05, + "loss": 0.547, + "step": 617 + }, + { + "epoch": 0.13785411554762436, + "grad_norm": 0.182960644364357, + "learning_rate": 1.9926070541744557e-05, + "loss": 0.524, + "step": 618 + }, + { + "epoch": 0.13807718045951373, + "grad_norm": 0.18580903112888336, + "learning_rate": 1.9925784625739157e-05, + "loss": 0.5515, + "step": 619 + }, + { + "epoch": 0.1383002453714031, + "grad_norm": 0.17631803452968597, + "learning_rate": 1.9925498159981368e-05, + "loss": 0.5269, + "step": 620 + }, + { + "epoch": 0.13852331028329243, + "grad_norm": 0.1774953305721283, + "learning_rate": 1.9925211144487057e-05, + "loss": 0.5279, + "step": 621 + }, + { + "epoch": 0.1387463751951818, + "grad_norm": 0.1902550458908081, + "learning_rate": 1.992492357927212e-05, + "loss": 0.5199, + "step": 622 + }, + { + "epoch": 0.13896944010707116, + "grad_norm": 0.18148526549339294, + "learning_rate": 1.9924635464352486e-05, + "loss": 0.5273, + "step": 623 + }, + { + "epoch": 0.13919250501896052, + "grad_norm": 0.18067635595798492, + "learning_rate": 1.9924346799744108e-05, + "loss": 0.5431, + "step": 624 + }, + { + "epoch": 0.1394155699308499, + "grad_norm": 0.1752486228942871, + "learning_rate": 1.992405758546298e-05, + "loss": 0.5012, + "step": 625 + }, + { + "epoch": 0.13963863484273925, + "grad_norm": 0.18866072595119476, + "learning_rate": 1.992376782152512e-05, + "loss": 0.5408, + "step": 626 + }, + { + "epoch": 0.1398616997546286, + "grad_norm": 0.21433256566524506, + "learning_rate": 1.9923477507946573e-05, + "loss": 0.5625, + "step": 627 + }, + { + "epoch": 0.14008476466651795, + "grad_norm": 0.17377375066280365, + "learning_rate": 1.9923186644743425e-05, + "loss": 0.516, + "step": 628 + }, + { + "epoch": 0.14030782957840732, + "grad_norm": 0.19306115806102753, + "learning_rate": 1.9922895231931775e-05, + "loss": 0.5397, + "step": 629 + }, + { + "epoch": 0.14053089449029668, + "grad_norm": 0.18144343793392181, + "learning_rate": 1.992260326952777e-05, + "loss": 0.5215, + "step": 630 + }, + { + "epoch": 0.14075395940218605, + "grad_norm": 0.19684460759162903, + "learning_rate": 1.9922310757547584e-05, + "loss": 0.5472, + "step": 631 + }, + { + "epoch": 0.14097702431407538, + "grad_norm": 0.18824423849582672, + "learning_rate": 1.9922017696007413e-05, + "loss": 0.5595, + "step": 632 + }, + { + "epoch": 0.14120008922596475, + "grad_norm": 0.18633319437503815, + "learning_rate": 1.992172408492349e-05, + "loss": 0.568, + "step": 633 + }, + { + "epoch": 0.1414231541378541, + "grad_norm": 0.18547959625720978, + "learning_rate": 1.9921429924312074e-05, + "loss": 0.5421, + "step": 634 + }, + { + "epoch": 0.14164621904974348, + "grad_norm": 0.16735389828681946, + "learning_rate": 1.9921135214189466e-05, + "loss": 0.499, + "step": 635 + }, + { + "epoch": 0.14186928396163284, + "grad_norm": 0.1912708729505539, + "learning_rate": 1.992083995457198e-05, + "loss": 0.5655, + "step": 636 + }, + { + "epoch": 0.1420923488735222, + "grad_norm": 0.19804726541042328, + "learning_rate": 1.9920544145475975e-05, + "loss": 0.5639, + "step": 637 + }, + { + "epoch": 0.14231541378541154, + "grad_norm": 0.17582711577415466, + "learning_rate": 1.992024778691783e-05, + "loss": 0.5577, + "step": 638 + }, + { + "epoch": 0.1425384786973009, + "grad_norm": 0.18506725132465363, + "learning_rate": 1.9919950878913962e-05, + "loss": 0.5287, + "step": 639 + }, + { + "epoch": 0.14276154360919027, + "grad_norm": 0.1914191097021103, + "learning_rate": 1.9919653421480816e-05, + "loss": 0.5125, + "step": 640 + }, + { + "epoch": 0.14298460852107964, + "grad_norm": 0.17707392573356628, + "learning_rate": 1.9919355414634864e-05, + "loss": 0.5351, + "step": 641 + }, + { + "epoch": 0.143207673432969, + "grad_norm": 0.1675226092338562, + "learning_rate": 1.9919056858392618e-05, + "loss": 0.5408, + "step": 642 + }, + { + "epoch": 0.14343073834485837, + "grad_norm": 0.18883782625198364, + "learning_rate": 1.9918757752770607e-05, + "loss": 0.5387, + "step": 643 + }, + { + "epoch": 0.1436538032567477, + "grad_norm": 0.18036174774169922, + "learning_rate": 1.99184580977854e-05, + "loss": 0.5444, + "step": 644 + }, + { + "epoch": 0.14387686816863707, + "grad_norm": 0.1817016750574112, + "learning_rate": 1.99181578934536e-05, + "loss": 0.527, + "step": 645 + }, + { + "epoch": 0.14409993308052643, + "grad_norm": 0.18899092078208923, + "learning_rate": 1.991785713979182e-05, + "loss": 0.5245, + "step": 646 + }, + { + "epoch": 0.1443229979924158, + "grad_norm": 0.18643809854984283, + "learning_rate": 1.991755583681673e-05, + "loss": 0.5247, + "step": 647 + }, + { + "epoch": 0.14454606290430516, + "grad_norm": 0.1903519332408905, + "learning_rate": 1.9917253984545014e-05, + "loss": 0.5497, + "step": 648 + }, + { + "epoch": 0.14476912781619453, + "grad_norm": 0.1695372760295868, + "learning_rate": 1.991695158299339e-05, + "loss": 0.5535, + "step": 649 + }, + { + "epoch": 0.14499219272808386, + "grad_norm": 0.19261977076530457, + "learning_rate": 1.9916648632178605e-05, + "loss": 0.5324, + "step": 650 + }, + { + "epoch": 0.14521525763997323, + "grad_norm": 0.1795564889907837, + "learning_rate": 1.9916345132117442e-05, + "loss": 0.5316, + "step": 651 + }, + { + "epoch": 0.1454383225518626, + "grad_norm": 0.27002382278442383, + "learning_rate": 1.9916041082826713e-05, + "loss": 0.5521, + "step": 652 + }, + { + "epoch": 0.14566138746375196, + "grad_norm": 0.17605867981910706, + "learning_rate": 1.9915736484323246e-05, + "loss": 0.5228, + "step": 653 + }, + { + "epoch": 0.14588445237564132, + "grad_norm": 0.1808420866727829, + "learning_rate": 1.9915431336623928e-05, + "loss": 0.5593, + "step": 654 + }, + { + "epoch": 0.14610751728753066, + "grad_norm": 0.189579576253891, + "learning_rate": 1.991512563974565e-05, + "loss": 0.5405, + "step": 655 + }, + { + "epoch": 0.14633058219942002, + "grad_norm": 0.1883007138967514, + "learning_rate": 1.9914819393705342e-05, + "loss": 0.5571, + "step": 656 + }, + { + "epoch": 0.1465536471113094, + "grad_norm": 0.18864606320858002, + "learning_rate": 1.9914512598519972e-05, + "loss": 0.519, + "step": 657 + }, + { + "epoch": 0.14677671202319875, + "grad_norm": 0.17059919238090515, + "learning_rate": 1.9914205254206527e-05, + "loss": 0.533, + "step": 658 + }, + { + "epoch": 0.14699977693508812, + "grad_norm": 0.4270278215408325, + "learning_rate": 1.9913897360782036e-05, + "loss": 0.5282, + "step": 659 + }, + { + "epoch": 0.14722284184697748, + "grad_norm": 0.18411633372306824, + "learning_rate": 1.9913588918263545e-05, + "loss": 0.5128, + "step": 660 + }, + { + "epoch": 0.14744590675886682, + "grad_norm": 0.2003640979528427, + "learning_rate": 1.9913279926668146e-05, + "loss": 0.5213, + "step": 661 + }, + { + "epoch": 0.14766897167075618, + "grad_norm": 0.18532606959342957, + "learning_rate": 1.9912970386012943e-05, + "loss": 0.498, + "step": 662 + }, + { + "epoch": 0.14789203658264555, + "grad_norm": 0.1674346923828125, + "learning_rate": 1.9912660296315083e-05, + "loss": 0.5383, + "step": 663 + }, + { + "epoch": 0.1481151014945349, + "grad_norm": 0.19120754301548004, + "learning_rate": 1.9912349657591748e-05, + "loss": 0.5301, + "step": 664 + }, + { + "epoch": 0.14833816640642428, + "grad_norm": 0.178116112947464, + "learning_rate": 1.9912038469860135e-05, + "loss": 0.5583, + "step": 665 + }, + { + "epoch": 0.14856123131831364, + "grad_norm": 0.1826133131980896, + "learning_rate": 1.9911726733137484e-05, + "loss": 0.5188, + "step": 666 + }, + { + "epoch": 0.14878429623020298, + "grad_norm": 0.18221695721149445, + "learning_rate": 1.991141444744106e-05, + "loss": 0.5236, + "step": 667 + }, + { + "epoch": 0.14900736114209234, + "grad_norm": 0.18043336272239685, + "learning_rate": 1.9911101612788157e-05, + "loss": 0.5201, + "step": 668 + }, + { + "epoch": 0.1492304260539817, + "grad_norm": 0.19176125526428223, + "learning_rate": 1.9910788229196104e-05, + "loss": 0.5501, + "step": 669 + }, + { + "epoch": 0.14945349096587107, + "grad_norm": 0.18294784426689148, + "learning_rate": 1.9910474296682256e-05, + "loss": 0.5304, + "step": 670 + }, + { + "epoch": 0.14967655587776044, + "grad_norm": 0.19021780788898468, + "learning_rate": 1.9910159815264e-05, + "loss": 0.5561, + "step": 671 + }, + { + "epoch": 0.1498996207896498, + "grad_norm": 0.18545284867286682, + "learning_rate": 1.9909844784958762e-05, + "loss": 0.536, + "step": 672 + }, + { + "epoch": 0.15012268570153914, + "grad_norm": 0.2078470140695572, + "learning_rate": 1.990952920578398e-05, + "loss": 0.513, + "step": 673 + }, + { + "epoch": 0.1503457506134285, + "grad_norm": 0.20093394815921783, + "learning_rate": 1.9909213077757138e-05, + "loss": 0.5406, + "step": 674 + }, + { + "epoch": 0.15056881552531787, + "grad_norm": 0.212370365858078, + "learning_rate": 1.9908896400895745e-05, + "loss": 0.5108, + "step": 675 + }, + { + "epoch": 0.15079188043720723, + "grad_norm": 0.1903519630432129, + "learning_rate": 1.990857917521734e-05, + "loss": 0.5163, + "step": 676 + }, + { + "epoch": 0.1510149453490966, + "grad_norm": 0.18388496339321136, + "learning_rate": 1.9908261400739494e-05, + "loss": 0.5361, + "step": 677 + }, + { + "epoch": 0.15123801026098593, + "grad_norm": 0.1817573606967926, + "learning_rate": 1.9907943077479802e-05, + "loss": 0.5582, + "step": 678 + }, + { + "epoch": 0.1514610751728753, + "grad_norm": 0.19696584343910217, + "learning_rate": 1.9907624205455903e-05, + "loss": 0.5326, + "step": 679 + }, + { + "epoch": 0.15168414008476466, + "grad_norm": 0.17569434642791748, + "learning_rate": 1.990730478468545e-05, + "loss": 0.5073, + "step": 680 + }, + { + "epoch": 0.15190720499665403, + "grad_norm": 0.1812848299741745, + "learning_rate": 1.9906984815186142e-05, + "loss": 0.539, + "step": 681 + }, + { + "epoch": 0.1521302699085434, + "grad_norm": 0.1694687008857727, + "learning_rate": 1.9906664296975696e-05, + "loss": 0.5286, + "step": 682 + }, + { + "epoch": 0.15235333482043276, + "grad_norm": 0.18631073832511902, + "learning_rate": 1.990634323007187e-05, + "loss": 0.5807, + "step": 683 + }, + { + "epoch": 0.1525763997323221, + "grad_norm": 0.18517658114433289, + "learning_rate": 1.9906021614492438e-05, + "loss": 0.5393, + "step": 684 + }, + { + "epoch": 0.15279946464421146, + "grad_norm": 0.19088414311408997, + "learning_rate": 1.990569945025522e-05, + "loss": 0.5583, + "step": 685 + }, + { + "epoch": 0.15302252955610082, + "grad_norm": 0.1954737901687622, + "learning_rate": 1.9905376737378056e-05, + "loss": 0.5348, + "step": 686 + }, + { + "epoch": 0.1532455944679902, + "grad_norm": 0.1780342310667038, + "learning_rate": 1.990505347587882e-05, + "loss": 0.5083, + "step": 687 + }, + { + "epoch": 0.15346865937987955, + "grad_norm": 0.18818046152591705, + "learning_rate": 1.9904729665775417e-05, + "loss": 0.51, + "step": 688 + }, + { + "epoch": 0.15369172429176892, + "grad_norm": 0.1797480434179306, + "learning_rate": 1.990440530708578e-05, + "loss": 0.5114, + "step": 689 + }, + { + "epoch": 0.15391478920365825, + "grad_norm": 0.18315070867538452, + "learning_rate": 1.9904080399827883e-05, + "loss": 0.5322, + "step": 690 + }, + { + "epoch": 0.15413785411554762, + "grad_norm": 0.2024601846933365, + "learning_rate": 1.990375494401971e-05, + "loss": 0.5198, + "step": 691 + }, + { + "epoch": 0.15436091902743698, + "grad_norm": 0.2151622325181961, + "learning_rate": 1.990342893967929e-05, + "loss": 0.5308, + "step": 692 + }, + { + "epoch": 0.15458398393932635, + "grad_norm": 0.1977054476737976, + "learning_rate": 1.990310238682468e-05, + "loss": 0.579, + "step": 693 + }, + { + "epoch": 0.1548070488512157, + "grad_norm": 0.1899496465921402, + "learning_rate": 1.990277528547397e-05, + "loss": 0.5435, + "step": 694 + }, + { + "epoch": 0.15503011376310508, + "grad_norm": 0.18425041437149048, + "learning_rate": 1.9902447635645273e-05, + "loss": 0.5582, + "step": 695 + }, + { + "epoch": 0.1552531786749944, + "grad_norm": 0.1868348866701126, + "learning_rate": 1.9902119437356737e-05, + "loss": 0.5208, + "step": 696 + }, + { + "epoch": 0.15547624358688378, + "grad_norm": 0.19318504631519318, + "learning_rate": 1.990179069062654e-05, + "loss": 0.581, + "step": 697 + }, + { + "epoch": 0.15569930849877314, + "grad_norm": 0.17516325414180756, + "learning_rate": 1.990146139547289e-05, + "loss": 0.522, + "step": 698 + }, + { + "epoch": 0.1559223734106625, + "grad_norm": 0.18644174933433533, + "learning_rate": 1.990113155191402e-05, + "loss": 0.5361, + "step": 699 + }, + { + "epoch": 0.15614543832255187, + "grad_norm": 0.204257071018219, + "learning_rate": 1.9900801159968207e-05, + "loss": 0.5216, + "step": 700 + }, + { + "epoch": 0.15636850323444124, + "grad_norm": 0.19151191413402557, + "learning_rate": 1.990047021965375e-05, + "loss": 0.5476, + "step": 701 + }, + { + "epoch": 0.15659156814633057, + "grad_norm": 0.18692530691623688, + "learning_rate": 1.9900138730988976e-05, + "loss": 0.5516, + "step": 702 + }, + { + "epoch": 0.15681463305821994, + "grad_norm": 0.19439953565597534, + "learning_rate": 1.9899806693992242e-05, + "loss": 0.5404, + "step": 703 + }, + { + "epoch": 0.1570376979701093, + "grad_norm": 0.1809748411178589, + "learning_rate": 1.989947410868194e-05, + "loss": 0.5571, + "step": 704 + }, + { + "epoch": 0.15726076288199867, + "grad_norm": 0.19498126208782196, + "learning_rate": 1.9899140975076495e-05, + "loss": 0.5121, + "step": 705 + }, + { + "epoch": 0.15748382779388803, + "grad_norm": 0.19838570058345795, + "learning_rate": 1.9898807293194352e-05, + "loss": 0.5535, + "step": 706 + }, + { + "epoch": 0.15770689270577737, + "grad_norm": 0.18156638741493225, + "learning_rate": 1.9898473063054e-05, + "loss": 0.5275, + "step": 707 + }, + { + "epoch": 0.15792995761766673, + "grad_norm": 0.1888674944639206, + "learning_rate": 1.989813828467394e-05, + "loss": 0.5274, + "step": 708 + }, + { + "epoch": 0.1581530225295561, + "grad_norm": 0.2016175538301468, + "learning_rate": 1.9897802958072722e-05, + "loss": 0.5057, + "step": 709 + }, + { + "epoch": 0.15837608744144546, + "grad_norm": 0.21129021048545837, + "learning_rate": 1.989746708326892e-05, + "loss": 0.5691, + "step": 710 + }, + { + "epoch": 0.15859915235333483, + "grad_norm": 0.20351482927799225, + "learning_rate": 1.9897130660281127e-05, + "loss": 0.5513, + "step": 711 + }, + { + "epoch": 0.1588222172652242, + "grad_norm": 0.2297467589378357, + "learning_rate": 1.9896793689127988e-05, + "loss": 0.5618, + "step": 712 + }, + { + "epoch": 0.15904528217711353, + "grad_norm": 0.222783625125885, + "learning_rate": 1.989645616982816e-05, + "loss": 0.5336, + "step": 713 + }, + { + "epoch": 0.1592683470890029, + "grad_norm": 0.1896630972623825, + "learning_rate": 1.9896118102400334e-05, + "loss": 0.5562, + "step": 714 + }, + { + "epoch": 0.15949141200089226, + "grad_norm": 0.188043013215065, + "learning_rate": 1.989577948686324e-05, + "loss": 0.497, + "step": 715 + }, + { + "epoch": 0.15971447691278162, + "grad_norm": 0.21135394275188446, + "learning_rate": 1.9895440323235635e-05, + "loss": 0.5714, + "step": 716 + }, + { + "epoch": 0.159937541824671, + "grad_norm": 0.1962571144104004, + "learning_rate": 1.98951006115363e-05, + "loss": 0.5686, + "step": 717 + }, + { + "epoch": 0.16016060673656035, + "grad_norm": 0.196267768740654, + "learning_rate": 1.9894760351784047e-05, + "loss": 0.557, + "step": 718 + }, + { + "epoch": 0.1603836716484497, + "grad_norm": 0.1771324872970581, + "learning_rate": 1.9894419543997724e-05, + "loss": 0.5492, + "step": 719 + }, + { + "epoch": 0.16060673656033905, + "grad_norm": 0.17055559158325195, + "learning_rate": 1.9894078188196213e-05, + "loss": 0.5311, + "step": 720 + }, + { + "epoch": 0.16082980147222842, + "grad_norm": 0.18929804861545563, + "learning_rate": 1.9893736284398414e-05, + "loss": 0.5388, + "step": 721 + }, + { + "epoch": 0.16105286638411778, + "grad_norm": 0.1930275857448578, + "learning_rate": 1.9893393832623266e-05, + "loss": 0.554, + "step": 722 + }, + { + "epoch": 0.16127593129600715, + "grad_norm": 0.1900511384010315, + "learning_rate": 1.9893050832889734e-05, + "loss": 0.5446, + "step": 723 + }, + { + "epoch": 0.1614989962078965, + "grad_norm": 0.17943377792835236, + "learning_rate": 1.9892707285216816e-05, + "loss": 0.541, + "step": 724 + }, + { + "epoch": 0.16172206111978585, + "grad_norm": 0.18837152421474457, + "learning_rate": 1.9892363189623546e-05, + "loss": 0.5424, + "step": 725 + }, + { + "epoch": 0.1619451260316752, + "grad_norm": 0.1987512707710266, + "learning_rate": 1.989201854612897e-05, + "loss": 0.5412, + "step": 726 + }, + { + "epoch": 0.16216819094356458, + "grad_norm": 0.1869790405035019, + "learning_rate": 1.9891673354752192e-05, + "loss": 0.5139, + "step": 727 + }, + { + "epoch": 0.16239125585545394, + "grad_norm": 0.17572949826717377, + "learning_rate": 1.9891327615512315e-05, + "loss": 0.5137, + "step": 728 + }, + { + "epoch": 0.1626143207673433, + "grad_norm": 0.17914965748786926, + "learning_rate": 1.9890981328428502e-05, + "loss": 0.5416, + "step": 729 + }, + { + "epoch": 0.16283738567923264, + "grad_norm": 0.17955482006072998, + "learning_rate": 1.989063449351992e-05, + "loss": 0.5307, + "step": 730 + }, + { + "epoch": 0.163060450591122, + "grad_norm": 0.17228074371814728, + "learning_rate": 1.9890287110805787e-05, + "loss": 0.5179, + "step": 731 + }, + { + "epoch": 0.16328351550301137, + "grad_norm": 0.19471096992492676, + "learning_rate": 1.9889939180305343e-05, + "loss": 0.5787, + "step": 732 + }, + { + "epoch": 0.16350658041490074, + "grad_norm": 0.19749003648757935, + "learning_rate": 1.9889590702037857e-05, + "loss": 0.5369, + "step": 733 + }, + { + "epoch": 0.1637296453267901, + "grad_norm": 0.17162089049816132, + "learning_rate": 1.9889241676022628e-05, + "loss": 0.5426, + "step": 734 + }, + { + "epoch": 0.16395271023867947, + "grad_norm": 0.18129977583885193, + "learning_rate": 1.988889210227899e-05, + "loss": 0.5297, + "step": 735 + }, + { + "epoch": 0.1641757751505688, + "grad_norm": 0.18625618517398834, + "learning_rate": 1.9888541980826307e-05, + "loss": 0.5169, + "step": 736 + }, + { + "epoch": 0.16439884006245817, + "grad_norm": 0.21191106736660004, + "learning_rate": 1.9888191311683966e-05, + "loss": 0.5322, + "step": 737 + }, + { + "epoch": 0.16462190497434753, + "grad_norm": 0.4791422486305237, + "learning_rate": 1.988784009487139e-05, + "loss": 0.5285, + "step": 738 + }, + { + "epoch": 0.1648449698862369, + "grad_norm": 0.18190808594226837, + "learning_rate": 1.9887488330408033e-05, + "loss": 0.5627, + "step": 739 + }, + { + "epoch": 0.16506803479812626, + "grad_norm": 0.17037086188793182, + "learning_rate": 1.9887136018313374e-05, + "loss": 0.5329, + "step": 740 + }, + { + "epoch": 0.16529109971001563, + "grad_norm": 0.18794186413288116, + "learning_rate": 1.9886783158606934e-05, + "loss": 0.5469, + "step": 741 + }, + { + "epoch": 0.16551416462190496, + "grad_norm": 0.19635480642318726, + "learning_rate": 1.9886429751308252e-05, + "loss": 0.5707, + "step": 742 + }, + { + "epoch": 0.16573722953379433, + "grad_norm": 0.1907324194908142, + "learning_rate": 1.9886075796436902e-05, + "loss": 0.5676, + "step": 743 + }, + { + "epoch": 0.1659602944456837, + "grad_norm": 0.1943301409482956, + "learning_rate": 1.9885721294012487e-05, + "loss": 0.5653, + "step": 744 + }, + { + "epoch": 0.16618335935757306, + "grad_norm": 0.19706295430660248, + "learning_rate": 1.9885366244054646e-05, + "loss": 0.5258, + "step": 745 + }, + { + "epoch": 0.16640642426946242, + "grad_norm": 0.2110588550567627, + "learning_rate": 1.9885010646583038e-05, + "loss": 0.5509, + "step": 746 + }, + { + "epoch": 0.1666294891813518, + "grad_norm": 0.18079644441604614, + "learning_rate": 1.988465450161736e-05, + "loss": 0.5299, + "step": 747 + }, + { + "epoch": 0.16685255409324112, + "grad_norm": 0.2338915318250656, + "learning_rate": 1.988429780917734e-05, + "loss": 0.5637, + "step": 748 + }, + { + "epoch": 0.1670756190051305, + "grad_norm": 0.18777087330818176, + "learning_rate": 1.9883940569282737e-05, + "loss": 0.5502, + "step": 749 + }, + { + "epoch": 0.16729868391701985, + "grad_norm": 0.17644049227237701, + "learning_rate": 1.988358278195333e-05, + "loss": 0.5477, + "step": 750 + }, + { + "epoch": 0.16752174882890922, + "grad_norm": 0.1912970095872879, + "learning_rate": 1.9883224447208936e-05, + "loss": 0.5447, + "step": 751 + }, + { + "epoch": 0.16774481374079858, + "grad_norm": 0.1746956706047058, + "learning_rate": 1.9882865565069408e-05, + "loss": 0.5425, + "step": 752 + }, + { + "epoch": 0.16796787865268792, + "grad_norm": 0.18779419362545013, + "learning_rate": 1.9882506135554614e-05, + "loss": 0.5329, + "step": 753 + }, + { + "epoch": 0.16819094356457728, + "grad_norm": 0.21110400557518005, + "learning_rate": 1.9882146158684473e-05, + "loss": 0.5421, + "step": 754 + }, + { + "epoch": 0.16841400847646665, + "grad_norm": 0.17395779490470886, + "learning_rate": 1.9881785634478915e-05, + "loss": 0.5321, + "step": 755 + }, + { + "epoch": 0.168637073388356, + "grad_norm": 0.1749841570854187, + "learning_rate": 1.988142456295791e-05, + "loss": 0.522, + "step": 756 + }, + { + "epoch": 0.16886013830024538, + "grad_norm": 0.17398926615715027, + "learning_rate": 1.988106294414145e-05, + "loss": 0.5563, + "step": 757 + }, + { + "epoch": 0.16908320321213474, + "grad_norm": 0.1813582479953766, + "learning_rate": 1.9880700778049575e-05, + "loss": 0.5242, + "step": 758 + }, + { + "epoch": 0.16930626812402408, + "grad_norm": 0.18003934621810913, + "learning_rate": 1.9880338064702337e-05, + "loss": 0.5468, + "step": 759 + }, + { + "epoch": 0.16952933303591344, + "grad_norm": 0.17160138487815857, + "learning_rate": 1.9879974804119827e-05, + "loss": 0.5321, + "step": 760 + }, + { + "epoch": 0.1697523979478028, + "grad_norm": 0.19145694375038147, + "learning_rate": 1.9879610996322168e-05, + "loss": 0.5365, + "step": 761 + }, + { + "epoch": 0.16997546285969217, + "grad_norm": 0.18664413690567017, + "learning_rate": 1.9879246641329505e-05, + "loss": 0.5278, + "step": 762 + }, + { + "epoch": 0.17019852777158154, + "grad_norm": 0.18776154518127441, + "learning_rate": 1.987888173916202e-05, + "loss": 0.5639, + "step": 763 + }, + { + "epoch": 0.1704215926834709, + "grad_norm": 0.17712879180908203, + "learning_rate": 1.9878516289839923e-05, + "loss": 0.5119, + "step": 764 + }, + { + "epoch": 0.17064465759536024, + "grad_norm": 0.1806483119726181, + "learning_rate": 1.9878150293383457e-05, + "loss": 0.5507, + "step": 765 + }, + { + "epoch": 0.1708677225072496, + "grad_norm": 0.18238821625709534, + "learning_rate": 1.9877783749812892e-05, + "loss": 0.5581, + "step": 766 + }, + { + "epoch": 0.17109078741913897, + "grad_norm": 0.17094913125038147, + "learning_rate": 1.9877416659148525e-05, + "loss": 0.5502, + "step": 767 + }, + { + "epoch": 0.17131385233102833, + "grad_norm": 0.18648453056812286, + "learning_rate": 1.9877049021410696e-05, + "loss": 0.5568, + "step": 768 + }, + { + "epoch": 0.1715369172429177, + "grad_norm": 0.20364215970039368, + "learning_rate": 1.9876680836619762e-05, + "loss": 0.5233, + "step": 769 + }, + { + "epoch": 0.17175998215480706, + "grad_norm": 0.17506776750087738, + "learning_rate": 1.9876312104796117e-05, + "loss": 0.5479, + "step": 770 + }, + { + "epoch": 0.1719830470666964, + "grad_norm": 0.17632229626178741, + "learning_rate": 1.9875942825960183e-05, + "loss": 0.5241, + "step": 771 + }, + { + "epoch": 0.17220611197858576, + "grad_norm": 0.18557725846767426, + "learning_rate": 1.9875573000132414e-05, + "loss": 0.5794, + "step": 772 + }, + { + "epoch": 0.17242917689047513, + "grad_norm": 0.1729423552751541, + "learning_rate": 1.987520262733329e-05, + "loss": 0.5247, + "step": 773 + }, + { + "epoch": 0.1726522418023645, + "grad_norm": 0.18643899261951447, + "learning_rate": 1.9874831707583328e-05, + "loss": 0.5305, + "step": 774 + }, + { + "epoch": 0.17287530671425386, + "grad_norm": 0.17992724478244781, + "learning_rate": 1.987446024090307e-05, + "loss": 0.5548, + "step": 775 + }, + { + "epoch": 0.17309837162614322, + "grad_norm": 0.19330288469791412, + "learning_rate": 1.9874088227313093e-05, + "loss": 0.5151, + "step": 776 + }, + { + "epoch": 0.17332143653803256, + "grad_norm": 0.18407322466373444, + "learning_rate": 1.9873715666834e-05, + "loss": 0.5262, + "step": 777 + }, + { + "epoch": 0.17354450144992192, + "grad_norm": 0.17531730234622955, + "learning_rate": 1.987334255948642e-05, + "loss": 0.51, + "step": 778 + }, + { + "epoch": 0.1737675663618113, + "grad_norm": 0.1791767179965973, + "learning_rate": 1.987296890529103e-05, + "loss": 0.5217, + "step": 779 + }, + { + "epoch": 0.17399063127370065, + "grad_norm": 0.17693190276622772, + "learning_rate": 1.9872594704268516e-05, + "loss": 0.5346, + "step": 780 + }, + { + "epoch": 0.17421369618559002, + "grad_norm": 0.17644071578979492, + "learning_rate": 1.9872219956439607e-05, + "loss": 0.5335, + "step": 781 + }, + { + "epoch": 0.17443676109747935, + "grad_norm": 0.17572622001171112, + "learning_rate": 1.987184466182506e-05, + "loss": 0.5302, + "step": 782 + }, + { + "epoch": 0.17465982600936872, + "grad_norm": 0.18111130595207214, + "learning_rate": 1.987146882044565e-05, + "loss": 0.5205, + "step": 783 + }, + { + "epoch": 0.17488289092125808, + "grad_norm": 0.17094580829143524, + "learning_rate": 1.987109243232221e-05, + "loss": 0.527, + "step": 784 + }, + { + "epoch": 0.17510595583314745, + "grad_norm": 0.1917412430047989, + "learning_rate": 1.9870715497475583e-05, + "loss": 0.5289, + "step": 785 + }, + { + "epoch": 0.1753290207450368, + "grad_norm": 0.19271767139434814, + "learning_rate": 1.9870338015926634e-05, + "loss": 0.5123, + "step": 786 + }, + { + "epoch": 0.17555208565692618, + "grad_norm": 0.19731736183166504, + "learning_rate": 1.9869959987696282e-05, + "loss": 0.543, + "step": 787 + }, + { + "epoch": 0.1757751505688155, + "grad_norm": 0.172173410654068, + "learning_rate": 1.9869581412805462e-05, + "loss": 0.5211, + "step": 788 + }, + { + "epoch": 0.17599821548070488, + "grad_norm": 0.1778416931629181, + "learning_rate": 1.9869202291275144e-05, + "loss": 0.5168, + "step": 789 + }, + { + "epoch": 0.17622128039259424, + "grad_norm": 0.1906319111585617, + "learning_rate": 1.986882262312632e-05, + "loss": 0.5502, + "step": 790 + }, + { + "epoch": 0.1764443453044836, + "grad_norm": 0.18600904941558838, + "learning_rate": 1.986844240838002e-05, + "loss": 0.5093, + "step": 791 + }, + { + "epoch": 0.17666741021637297, + "grad_norm": 0.17453651130199432, + "learning_rate": 1.986806164705731e-05, + "loss": 0.5316, + "step": 792 + }, + { + "epoch": 0.17689047512826234, + "grad_norm": 0.18647123873233795, + "learning_rate": 1.9867680339179268e-05, + "loss": 0.5293, + "step": 793 + }, + { + "epoch": 0.17711354004015167, + "grad_norm": 0.18260005116462708, + "learning_rate": 1.9867298484767022e-05, + "loss": 0.5429, + "step": 794 + }, + { + "epoch": 0.17733660495204104, + "grad_norm": 0.1722402721643448, + "learning_rate": 1.9866916083841715e-05, + "loss": 0.5211, + "step": 795 + }, + { + "epoch": 0.1775596698639304, + "grad_norm": 0.16579583287239075, + "learning_rate": 1.9866533136424537e-05, + "loss": 0.5173, + "step": 796 + }, + { + "epoch": 0.17778273477581977, + "grad_norm": 0.18849937617778778, + "learning_rate": 1.9866149642536683e-05, + "loss": 0.5482, + "step": 797 + }, + { + "epoch": 0.17800579968770913, + "grad_norm": 0.1786874532699585, + "learning_rate": 1.98657656021994e-05, + "loss": 0.5203, + "step": 798 + }, + { + "epoch": 0.1782288645995985, + "grad_norm": 0.1848941594362259, + "learning_rate": 1.986538101543397e-05, + "loss": 0.5248, + "step": 799 + }, + { + "epoch": 0.17845192951148783, + "grad_norm": 0.1833941787481308, + "learning_rate": 1.9864995882261674e-05, + "loss": 0.5192, + "step": 800 + }, + { + "epoch": 0.1786749944233772, + "grad_norm": 0.1813460886478424, + "learning_rate": 1.9864610202703858e-05, + "loss": 0.5307, + "step": 801 + }, + { + "epoch": 0.17889805933526656, + "grad_norm": 0.18154440820217133, + "learning_rate": 1.9864223976781876e-05, + "loss": 0.5468, + "step": 802 + }, + { + "epoch": 0.17912112424715593, + "grad_norm": 0.18413223326206207, + "learning_rate": 1.9863837204517124e-05, + "loss": 0.5376, + "step": 803 + }, + { + "epoch": 0.1793441891590453, + "grad_norm": 0.17483587563037872, + "learning_rate": 1.986344988593102e-05, + "loss": 0.5333, + "step": 804 + }, + { + "epoch": 0.17956725407093463, + "grad_norm": 0.17205438017845154, + "learning_rate": 1.9863062021045017e-05, + "loss": 0.4933, + "step": 805 + }, + { + "epoch": 0.179790318982824, + "grad_norm": 0.17578768730163574, + "learning_rate": 1.98626736098806e-05, + "loss": 0.5153, + "step": 806 + }, + { + "epoch": 0.18001338389471336, + "grad_norm": 0.1773952841758728, + "learning_rate": 1.9862284652459275e-05, + "loss": 0.5265, + "step": 807 + }, + { + "epoch": 0.18023644880660272, + "grad_norm": 0.17616188526153564, + "learning_rate": 1.9861895148802594e-05, + "loss": 0.5438, + "step": 808 + }, + { + "epoch": 0.1804595137184921, + "grad_norm": 0.1791033297777176, + "learning_rate": 1.9861505098932127e-05, + "loss": 0.5294, + "step": 809 + }, + { + "epoch": 0.18068257863038145, + "grad_norm": 0.18261539936065674, + "learning_rate": 1.986111450286947e-05, + "loss": 0.548, + "step": 810 + }, + { + "epoch": 0.1809056435422708, + "grad_norm": 0.1767009198665619, + "learning_rate": 1.986072336063627e-05, + "loss": 0.5278, + "step": 811 + }, + { + "epoch": 0.18112870845416015, + "grad_norm": 0.19362005591392517, + "learning_rate": 1.9860331672254182e-05, + "loss": 0.5206, + "step": 812 + }, + { + "epoch": 0.18135177336604952, + "grad_norm": 0.185451477766037, + "learning_rate": 1.98599394377449e-05, + "loss": 0.5513, + "step": 813 + }, + { + "epoch": 0.18157483827793888, + "grad_norm": 0.17266923189163208, + "learning_rate": 1.985954665713015e-05, + "loss": 0.5648, + "step": 814 + }, + { + "epoch": 0.18179790318982825, + "grad_norm": 0.170868918299675, + "learning_rate": 1.9859153330431692e-05, + "loss": 0.5343, + "step": 815 + }, + { + "epoch": 0.1820209681017176, + "grad_norm": 0.17769230902194977, + "learning_rate": 1.98587594576713e-05, + "loss": 0.5539, + "step": 816 + }, + { + "epoch": 0.18224403301360695, + "grad_norm": 0.16811443865299225, + "learning_rate": 1.9858365038870803e-05, + "loss": 0.5209, + "step": 817 + }, + { + "epoch": 0.1824670979254963, + "grad_norm": 0.1751745492219925, + "learning_rate": 1.985797007405203e-05, + "loss": 0.5509, + "step": 818 + }, + { + "epoch": 0.18269016283738568, + "grad_norm": 0.1694861203432083, + "learning_rate": 1.985757456323687e-05, + "loss": 0.5505, + "step": 819 + }, + { + "epoch": 0.18291322774927504, + "grad_norm": 0.1734897792339325, + "learning_rate": 1.985717850644722e-05, + "loss": 0.5022, + "step": 820 + }, + { + "epoch": 0.1831362926611644, + "grad_norm": 0.18104127049446106, + "learning_rate": 1.9856781903705026e-05, + "loss": 0.5434, + "step": 821 + }, + { + "epoch": 0.18335935757305377, + "grad_norm": 0.17157487571239471, + "learning_rate": 1.9856384755032245e-05, + "loss": 0.5356, + "step": 822 + }, + { + "epoch": 0.1835824224849431, + "grad_norm": 0.1781257838010788, + "learning_rate": 1.985598706045088e-05, + "loss": 0.522, + "step": 823 + }, + { + "epoch": 0.18380548739683247, + "grad_norm": 0.16910432279109955, + "learning_rate": 1.985558881998295e-05, + "loss": 0.5324, + "step": 824 + }, + { + "epoch": 0.18402855230872184, + "grad_norm": 0.180936798453331, + "learning_rate": 1.985519003365052e-05, + "loss": 0.5307, + "step": 825 + }, + { + "epoch": 0.1842516172206112, + "grad_norm": 0.1872517466545105, + "learning_rate": 1.9854790701475676e-05, + "loss": 0.5667, + "step": 826 + }, + { + "epoch": 0.18447468213250057, + "grad_norm": 0.16592343151569366, + "learning_rate": 1.985439082348053e-05, + "loss": 0.4938, + "step": 827 + }, + { + "epoch": 0.1846977470443899, + "grad_norm": 0.18184059858322144, + "learning_rate": 1.9853990399687237e-05, + "loss": 0.536, + "step": 828 + }, + { + "epoch": 0.18492081195627927, + "grad_norm": 0.1687706708908081, + "learning_rate": 1.985358943011797e-05, + "loss": 0.4902, + "step": 829 + }, + { + "epoch": 0.18514387686816863, + "grad_norm": 0.18062466382980347, + "learning_rate": 1.985318791479494e-05, + "loss": 0.5235, + "step": 830 + }, + { + "epoch": 0.185366941780058, + "grad_norm": 0.1865355372428894, + "learning_rate": 1.985278585374038e-05, + "loss": 0.5439, + "step": 831 + }, + { + "epoch": 0.18559000669194736, + "grad_norm": 0.21087662875652313, + "learning_rate": 1.985238324697657e-05, + "loss": 0.5338, + "step": 832 + }, + { + "epoch": 0.18581307160383673, + "grad_norm": 0.18593360483646393, + "learning_rate": 1.9851980094525795e-05, + "loss": 0.5139, + "step": 833 + }, + { + "epoch": 0.18603613651572606, + "grad_norm": 0.17105735838413239, + "learning_rate": 1.9851576396410395e-05, + "loss": 0.5385, + "step": 834 + }, + { + "epoch": 0.18625920142761543, + "grad_norm": 0.1764969825744629, + "learning_rate": 1.9851172152652722e-05, + "loss": 0.5381, + "step": 835 + }, + { + "epoch": 0.1864822663395048, + "grad_norm": 0.211869478225708, + "learning_rate": 1.985076736327517e-05, + "loss": 0.512, + "step": 836 + }, + { + "epoch": 0.18670533125139416, + "grad_norm": 0.17822885513305664, + "learning_rate": 1.9850362028300162e-05, + "loss": 0.5394, + "step": 837 + }, + { + "epoch": 0.18692839616328352, + "grad_norm": 0.17853499948978424, + "learning_rate": 1.9849956147750137e-05, + "loss": 0.5273, + "step": 838 + }, + { + "epoch": 0.1871514610751729, + "grad_norm": 0.1717289835214615, + "learning_rate": 1.9849549721647586e-05, + "loss": 0.5126, + "step": 839 + }, + { + "epoch": 0.18737452598706222, + "grad_norm": 0.167042076587677, + "learning_rate": 1.9849142750015014e-05, + "loss": 0.5154, + "step": 840 + }, + { + "epoch": 0.1875975908989516, + "grad_norm": 0.1893247812986374, + "learning_rate": 1.9848735232874966e-05, + "loss": 0.5205, + "step": 841 + }, + { + "epoch": 0.18782065581084095, + "grad_norm": 0.18898561596870422, + "learning_rate": 1.984832717025001e-05, + "loss": 0.5513, + "step": 842 + }, + { + "epoch": 0.18804372072273032, + "grad_norm": 0.18172813951969147, + "learning_rate": 1.984791856216274e-05, + "loss": 0.5056, + "step": 843 + }, + { + "epoch": 0.18826678563461968, + "grad_norm": 0.1696682870388031, + "learning_rate": 1.98475094086358e-05, + "loss": 0.5313, + "step": 844 + }, + { + "epoch": 0.18848985054650905, + "grad_norm": 0.1731012910604477, + "learning_rate": 1.9847099709691843e-05, + "loss": 0.5108, + "step": 845 + }, + { + "epoch": 0.18871291545839838, + "grad_norm": 0.18047916889190674, + "learning_rate": 1.9846689465353563e-05, + "loss": 0.5108, + "step": 846 + }, + { + "epoch": 0.18893598037028775, + "grad_norm": 0.17469045519828796, + "learning_rate": 1.9846278675643684e-05, + "loss": 0.5273, + "step": 847 + }, + { + "epoch": 0.1891590452821771, + "grad_norm": 0.18713539838790894, + "learning_rate": 1.9845867340584957e-05, + "loss": 0.5587, + "step": 848 + }, + { + "epoch": 0.18938211019406648, + "grad_norm": 0.2040695995092392, + "learning_rate": 1.984545546020016e-05, + "loss": 0.5651, + "step": 849 + }, + { + "epoch": 0.18960517510595584, + "grad_norm": 0.24084708094596863, + "learning_rate": 1.984504303451211e-05, + "loss": 0.5186, + "step": 850 + }, + { + "epoch": 0.1898282400178452, + "grad_norm": 0.18628886342048645, + "learning_rate": 1.9844630063543655e-05, + "loss": 0.5091, + "step": 851 + }, + { + "epoch": 0.19005130492973454, + "grad_norm": 0.18985594809055328, + "learning_rate": 1.9844216547317656e-05, + "loss": 0.5457, + "step": 852 + }, + { + "epoch": 0.1902743698416239, + "grad_norm": 0.1807604879140854, + "learning_rate": 1.9843802485857028e-05, + "loss": 0.5137, + "step": 853 + }, + { + "epoch": 0.19049743475351327, + "grad_norm": 0.1864684522151947, + "learning_rate": 1.984338787918469e-05, + "loss": 0.5232, + "step": 854 + }, + { + "epoch": 0.19072049966540264, + "grad_norm": 0.17398470640182495, + "learning_rate": 1.984297272732362e-05, + "loss": 0.5207, + "step": 855 + }, + { + "epoch": 0.190943564577292, + "grad_norm": 0.21071763336658478, + "learning_rate": 1.9842557030296804e-05, + "loss": 0.5032, + "step": 856 + }, + { + "epoch": 0.19116662948918134, + "grad_norm": 0.17372627556324005, + "learning_rate": 1.9842140788127264e-05, + "loss": 0.4992, + "step": 857 + }, + { + "epoch": 0.1913896944010707, + "grad_norm": 0.18683859705924988, + "learning_rate": 1.9841724000838064e-05, + "loss": 0.5342, + "step": 858 + }, + { + "epoch": 0.19161275931296007, + "grad_norm": 0.18670214712619781, + "learning_rate": 1.9841306668452275e-05, + "loss": 0.521, + "step": 859 + }, + { + "epoch": 0.19183582422484943, + "grad_norm": 0.18776099383831024, + "learning_rate": 1.9840888790993023e-05, + "loss": 0.5164, + "step": 860 + }, + { + "epoch": 0.1920588891367388, + "grad_norm": 0.20680485665798187, + "learning_rate": 1.9840470368483448e-05, + "loss": 0.5433, + "step": 861 + }, + { + "epoch": 0.19228195404862816, + "grad_norm": 0.18562763929367065, + "learning_rate": 1.9840051400946724e-05, + "loss": 0.5225, + "step": 862 + }, + { + "epoch": 0.1925050189605175, + "grad_norm": 0.1958203911781311, + "learning_rate": 1.9839631888406055e-05, + "loss": 0.5333, + "step": 863 + }, + { + "epoch": 0.19272808387240686, + "grad_norm": 0.1797972470521927, + "learning_rate": 1.9839211830884682e-05, + "loss": 0.5259, + "step": 864 + }, + { + "epoch": 0.19295114878429623, + "grad_norm": 0.18006913363933563, + "learning_rate": 1.9838791228405866e-05, + "loss": 0.5355, + "step": 865 + }, + { + "epoch": 0.1931742136961856, + "grad_norm": 0.19286584854125977, + "learning_rate": 1.9838370080992902e-05, + "loss": 0.5548, + "step": 866 + }, + { + "epoch": 0.19339727860807496, + "grad_norm": 0.17611972987651825, + "learning_rate": 1.9837948388669118e-05, + "loss": 0.4975, + "step": 867 + }, + { + "epoch": 0.19362034351996432, + "grad_norm": 0.17204061150550842, + "learning_rate": 1.983752615145787e-05, + "loss": 0.5353, + "step": 868 + }, + { + "epoch": 0.19384340843185366, + "grad_norm": 0.17773252725601196, + "learning_rate": 1.9837103369382542e-05, + "loss": 0.5621, + "step": 869 + }, + { + "epoch": 0.19406647334374302, + "grad_norm": 0.17241743206977844, + "learning_rate": 1.983668004246655e-05, + "loss": 0.5514, + "step": 870 + }, + { + "epoch": 0.1942895382556324, + "grad_norm": 0.1765352487564087, + "learning_rate": 1.9836256170733343e-05, + "loss": 0.5262, + "step": 871 + }, + { + "epoch": 0.19451260316752175, + "grad_norm": 0.18487049639225006, + "learning_rate": 1.98358317542064e-05, + "loss": 0.4941, + "step": 872 + }, + { + "epoch": 0.19473566807941112, + "grad_norm": 0.17895649373531342, + "learning_rate": 1.983540679290922e-05, + "loss": 0.5501, + "step": 873 + }, + { + "epoch": 0.19495873299130048, + "grad_norm": 0.1671830266714096, + "learning_rate": 1.9834981286865343e-05, + "loss": 0.5207, + "step": 874 + }, + { + "epoch": 0.19518179790318982, + "grad_norm": 0.18288585543632507, + "learning_rate": 1.9834555236098344e-05, + "loss": 0.5405, + "step": 875 + }, + { + "epoch": 0.19540486281507918, + "grad_norm": 0.2788625657558441, + "learning_rate": 1.983412864063181e-05, + "loss": 0.5029, + "step": 876 + }, + { + "epoch": 0.19562792772696855, + "grad_norm": 0.17516101896762848, + "learning_rate": 1.983370150048938e-05, + "loss": 0.5181, + "step": 877 + }, + { + "epoch": 0.1958509926388579, + "grad_norm": 0.18724878132343292, + "learning_rate": 1.9833273815694695e-05, + "loss": 0.5399, + "step": 878 + }, + { + "epoch": 0.19607405755074728, + "grad_norm": 0.1845855563879013, + "learning_rate": 1.9832845586271456e-05, + "loss": 0.5493, + "step": 879 + }, + { + "epoch": 0.19629712246263661, + "grad_norm": 0.1939295083284378, + "learning_rate": 1.9832416812243377e-05, + "loss": 0.5453, + "step": 880 + }, + { + "epoch": 0.19652018737452598, + "grad_norm": 0.17018291354179382, + "learning_rate": 1.9831987493634207e-05, + "loss": 0.5096, + "step": 881 + }, + { + "epoch": 0.19674325228641534, + "grad_norm": 0.18090112507343292, + "learning_rate": 1.9831557630467725e-05, + "loss": 0.5519, + "step": 882 + }, + { + "epoch": 0.1969663171983047, + "grad_norm": 0.19264590740203857, + "learning_rate": 1.983112722276774e-05, + "loss": 0.5469, + "step": 883 + }, + { + "epoch": 0.19718938211019407, + "grad_norm": 0.185777947306633, + "learning_rate": 1.9830696270558084e-05, + "loss": 0.5484, + "step": 884 + }, + { + "epoch": 0.19741244702208344, + "grad_norm": 0.15953023731708527, + "learning_rate": 1.9830264773862633e-05, + "loss": 0.5139, + "step": 885 + }, + { + "epoch": 0.19763551193397277, + "grad_norm": 0.17809630930423737, + "learning_rate": 1.9829832732705284e-05, + "loss": 0.5178, + "step": 886 + }, + { + "epoch": 0.19785857684586214, + "grad_norm": 0.20061112940311432, + "learning_rate": 1.982940014710997e-05, + "loss": 0.5393, + "step": 887 + }, + { + "epoch": 0.1980816417577515, + "grad_norm": 0.1748671978712082, + "learning_rate": 1.9828967017100642e-05, + "loss": 0.5332, + "step": 888 + }, + { + "epoch": 0.19830470666964087, + "grad_norm": 0.1713806539773941, + "learning_rate": 1.9828533342701296e-05, + "loss": 0.5165, + "step": 889 + }, + { + "epoch": 0.19852777158153023, + "grad_norm": 0.16470858454704285, + "learning_rate": 1.9828099123935948e-05, + "loss": 0.5133, + "step": 890 + }, + { + "epoch": 0.1987508364934196, + "grad_norm": 0.17374449968338013, + "learning_rate": 1.9827664360828647e-05, + "loss": 0.5475, + "step": 891 + }, + { + "epoch": 0.19897390140530893, + "grad_norm": 0.1877843141555786, + "learning_rate": 1.982722905340348e-05, + "loss": 0.5253, + "step": 892 + }, + { + "epoch": 0.1991969663171983, + "grad_norm": 0.17328353226184845, + "learning_rate": 1.982679320168455e-05, + "loss": 0.5219, + "step": 893 + }, + { + "epoch": 0.19942003122908766, + "grad_norm": 0.17716079950332642, + "learning_rate": 1.9826356805696e-05, + "loss": 0.535, + "step": 894 + }, + { + "epoch": 0.19964309614097703, + "grad_norm": 0.17511911690235138, + "learning_rate": 1.9825919865462004e-05, + "loss": 0.548, + "step": 895 + }, + { + "epoch": 0.1998661610528664, + "grad_norm": 0.1711902767419815, + "learning_rate": 1.9825482381006752e-05, + "loss": 0.5402, + "step": 896 + }, + { + "epoch": 0.20008922596475576, + "grad_norm": 0.17732638120651245, + "learning_rate": 1.9825044352354482e-05, + "loss": 0.5672, + "step": 897 + }, + { + "epoch": 0.2003122908766451, + "grad_norm": 0.17266635596752167, + "learning_rate": 1.9824605779529456e-05, + "loss": 0.5312, + "step": 898 + }, + { + "epoch": 0.20053535578853446, + "grad_norm": 0.16797249019145966, + "learning_rate": 1.982416666255596e-05, + "loss": 0.5294, + "step": 899 + }, + { + "epoch": 0.20075842070042382, + "grad_norm": 0.17062917351722717, + "learning_rate": 1.9823727001458318e-05, + "loss": 0.51, + "step": 900 + }, + { + "epoch": 0.2009814856123132, + "grad_norm": 0.17725898325443268, + "learning_rate": 1.9823286796260887e-05, + "loss": 0.5284, + "step": 901 + }, + { + "epoch": 0.20120455052420255, + "grad_norm": 0.17662313580513, + "learning_rate": 1.9822846046988037e-05, + "loss": 0.515, + "step": 902 + }, + { + "epoch": 0.2014276154360919, + "grad_norm": 0.17648808658123016, + "learning_rate": 1.9822404753664183e-05, + "loss": 0.5437, + "step": 903 + }, + { + "epoch": 0.20165068034798125, + "grad_norm": 0.17179900407791138, + "learning_rate": 1.982196291631377e-05, + "loss": 0.5332, + "step": 904 + }, + { + "epoch": 0.20187374525987062, + "grad_norm": 0.19034990668296814, + "learning_rate": 1.982152053496127e-05, + "loss": 0.5279, + "step": 905 + }, + { + "epoch": 0.20209681017175998, + "grad_norm": 0.17066267132759094, + "learning_rate": 1.9821077609631184e-05, + "loss": 0.5473, + "step": 906 + }, + { + "epoch": 0.20231987508364935, + "grad_norm": 0.1806066334247589, + "learning_rate": 1.982063414034804e-05, + "loss": 0.5495, + "step": 907 + }, + { + "epoch": 0.2025429399955387, + "grad_norm": 0.17627598345279694, + "learning_rate": 1.9820190127136403e-05, + "loss": 0.5469, + "step": 908 + }, + { + "epoch": 0.20276600490742805, + "grad_norm": 0.18282422423362732, + "learning_rate": 1.9819745570020867e-05, + "loss": 0.5228, + "step": 909 + }, + { + "epoch": 0.2029890698193174, + "grad_norm": 0.17409084737300873, + "learning_rate": 1.981930046902605e-05, + "loss": 0.5183, + "step": 910 + }, + { + "epoch": 0.20321213473120678, + "grad_norm": 0.1846904307603836, + "learning_rate": 1.9818854824176612e-05, + "loss": 0.5198, + "step": 911 + }, + { + "epoch": 0.20343519964309614, + "grad_norm": 0.18149109184741974, + "learning_rate": 1.9818408635497224e-05, + "loss": 0.5078, + "step": 912 + }, + { + "epoch": 0.2036582645549855, + "grad_norm": 0.18114425241947174, + "learning_rate": 1.981796190301261e-05, + "loss": 0.531, + "step": 913 + }, + { + "epoch": 0.20388132946687487, + "grad_norm": 0.1718929558992386, + "learning_rate": 1.981751462674751e-05, + "loss": 0.4939, + "step": 914 + }, + { + "epoch": 0.2041043943787642, + "grad_norm": 0.1930830329656601, + "learning_rate": 1.9817066806726695e-05, + "loss": 0.5055, + "step": 915 + }, + { + "epoch": 0.20432745929065357, + "grad_norm": 0.17051468789577484, + "learning_rate": 1.9816618442974964e-05, + "loss": 0.5058, + "step": 916 + }, + { + "epoch": 0.20455052420254294, + "grad_norm": 0.1774812787771225, + "learning_rate": 1.9816169535517157e-05, + "loss": 0.5341, + "step": 917 + }, + { + "epoch": 0.2047735891144323, + "grad_norm": 0.17879648506641388, + "learning_rate": 1.9815720084378134e-05, + "loss": 0.534, + "step": 918 + }, + { + "epoch": 0.20499665402632167, + "grad_norm": 0.1766035556793213, + "learning_rate": 1.9815270089582795e-05, + "loss": 0.5407, + "step": 919 + }, + { + "epoch": 0.20521971893821103, + "grad_norm": 0.17892096936702728, + "learning_rate": 1.981481955115605e-05, + "loss": 0.4676, + "step": 920 + }, + { + "epoch": 0.20544278385010037, + "grad_norm": 0.17289894819259644, + "learning_rate": 1.9814368469122866e-05, + "loss": 0.5416, + "step": 921 + }, + { + "epoch": 0.20566584876198973, + "grad_norm": 0.1802949607372284, + "learning_rate": 1.981391684350822e-05, + "loss": 0.5425, + "step": 922 + }, + { + "epoch": 0.2058889136738791, + "grad_norm": 0.17635071277618408, + "learning_rate": 1.9813464674337126e-05, + "loss": 0.5294, + "step": 923 + }, + { + "epoch": 0.20611197858576846, + "grad_norm": 0.1723651885986328, + "learning_rate": 1.981301196163463e-05, + "loss": 0.5045, + "step": 924 + }, + { + "epoch": 0.20633504349765783, + "grad_norm": 0.18130990862846375, + "learning_rate": 1.9812558705425805e-05, + "loss": 0.5264, + "step": 925 + }, + { + "epoch": 0.2065581084095472, + "grad_norm": 0.1879456490278244, + "learning_rate": 1.9812104905735756e-05, + "loss": 0.5215, + "step": 926 + }, + { + "epoch": 0.20678117332143653, + "grad_norm": 0.1742536425590515, + "learning_rate": 1.9811650562589616e-05, + "loss": 0.5093, + "step": 927 + }, + { + "epoch": 0.2070042382333259, + "grad_norm": 0.18259446322917938, + "learning_rate": 1.981119567601255e-05, + "loss": 0.5528, + "step": 928 + }, + { + "epoch": 0.20722730314521526, + "grad_norm": 0.19506299495697021, + "learning_rate": 1.9810740246029755e-05, + "loss": 0.5338, + "step": 929 + }, + { + "epoch": 0.20745036805710462, + "grad_norm": 0.166092187166214, + "learning_rate": 1.981028427266645e-05, + "loss": 0.5308, + "step": 930 + }, + { + "epoch": 0.207673432968994, + "grad_norm": 0.18397627770900726, + "learning_rate": 1.980982775594789e-05, + "loss": 0.5593, + "step": 931 + }, + { + "epoch": 0.20789649788088332, + "grad_norm": 0.18404512107372284, + "learning_rate": 1.980937069589937e-05, + "loss": 0.5004, + "step": 932 + }, + { + "epoch": 0.2081195627927727, + "grad_norm": 0.1727455109357834, + "learning_rate": 1.9808913092546195e-05, + "loss": 0.5245, + "step": 933 + }, + { + "epoch": 0.20834262770466205, + "grad_norm": 0.18410643935203552, + "learning_rate": 1.980845494591371e-05, + "loss": 0.532, + "step": 934 + }, + { + "epoch": 0.20856569261655142, + "grad_norm": 0.19003835320472717, + "learning_rate": 1.9807996256027296e-05, + "loss": 0.5129, + "step": 935 + }, + { + "epoch": 0.20878875752844078, + "grad_norm": 0.23560728132724762, + "learning_rate": 1.980753702291235e-05, + "loss": 0.5484, + "step": 936 + }, + { + "epoch": 0.20901182244033015, + "grad_norm": 0.17955288290977478, + "learning_rate": 1.9807077246594316e-05, + "loss": 0.5198, + "step": 937 + }, + { + "epoch": 0.20923488735221948, + "grad_norm": 0.19580209255218506, + "learning_rate": 1.9806616927098653e-05, + "loss": 0.5023, + "step": 938 + }, + { + "epoch": 0.20945795226410885, + "grad_norm": 0.17852741479873657, + "learning_rate": 1.9806156064450855e-05, + "loss": 0.5101, + "step": 939 + }, + { + "epoch": 0.2096810171759982, + "grad_norm": 0.17126716673374176, + "learning_rate": 1.9805694658676458e-05, + "loss": 0.5415, + "step": 940 + }, + { + "epoch": 0.20990408208788758, + "grad_norm": 0.16866669058799744, + "learning_rate": 1.9805232709801008e-05, + "loss": 0.5066, + "step": 941 + }, + { + "epoch": 0.21012714699977694, + "grad_norm": 0.17179332673549652, + "learning_rate": 1.9804770217850093e-05, + "loss": 0.5059, + "step": 942 + }, + { + "epoch": 0.2103502119116663, + "grad_norm": 0.1836715042591095, + "learning_rate": 1.9804307182849326e-05, + "loss": 0.5291, + "step": 943 + }, + { + "epoch": 0.21057327682355564, + "grad_norm": 0.2429589480161667, + "learning_rate": 1.980384360482436e-05, + "loss": 0.5282, + "step": 944 + }, + { + "epoch": 0.210796341735445, + "grad_norm": 0.17777171730995178, + "learning_rate": 1.9803379483800866e-05, + "loss": 0.5338, + "step": 945 + }, + { + "epoch": 0.21101940664733437, + "grad_norm": 0.20419363677501678, + "learning_rate": 1.9802914819804546e-05, + "loss": 0.5345, + "step": 946 + }, + { + "epoch": 0.21124247155922374, + "grad_norm": 0.23102112114429474, + "learning_rate": 1.9802449612861144e-05, + "loss": 0.5272, + "step": 947 + }, + { + "epoch": 0.2114655364711131, + "grad_norm": 0.17385543882846832, + "learning_rate": 1.9801983862996423e-05, + "loss": 0.5206, + "step": 948 + }, + { + "epoch": 0.21168860138300247, + "grad_norm": 0.20227134227752686, + "learning_rate": 1.980151757023618e-05, + "loss": 0.5362, + "step": 949 + }, + { + "epoch": 0.2119116662948918, + "grad_norm": 0.3727608025074005, + "learning_rate": 1.9801050734606236e-05, + "loss": 0.511, + "step": 950 + }, + { + "epoch": 0.21213473120678117, + "grad_norm": 0.17483538389205933, + "learning_rate": 1.9800583356132453e-05, + "loss": 0.5251, + "step": 951 + }, + { + "epoch": 0.21235779611867053, + "grad_norm": 0.17400579154491425, + "learning_rate": 1.9800115434840716e-05, + "loss": 0.541, + "step": 952 + }, + { + "epoch": 0.2125808610305599, + "grad_norm": 0.16707202792167664, + "learning_rate": 1.979964697075694e-05, + "loss": 0.5391, + "step": 953 + }, + { + "epoch": 0.21280392594244926, + "grad_norm": 0.16636815667152405, + "learning_rate": 1.9799177963907074e-05, + "loss": 0.5434, + "step": 954 + }, + { + "epoch": 0.2130269908543386, + "grad_norm": 0.17174044251441956, + "learning_rate": 1.9798708414317095e-05, + "loss": 0.5389, + "step": 955 + }, + { + "epoch": 0.21325005576622796, + "grad_norm": 0.16985206305980682, + "learning_rate": 1.9798238322013002e-05, + "loss": 0.5327, + "step": 956 + }, + { + "epoch": 0.21347312067811733, + "grad_norm": 0.17486560344696045, + "learning_rate": 1.9797767687020843e-05, + "loss": 0.5428, + "step": 957 + }, + { + "epoch": 0.2136961855900067, + "grad_norm": 0.17041227221488953, + "learning_rate": 1.9797296509366678e-05, + "loss": 0.4995, + "step": 958 + }, + { + "epoch": 0.21391925050189606, + "grad_norm": 0.1798069328069687, + "learning_rate": 1.97968247890766e-05, + "loss": 0.5364, + "step": 959 + }, + { + "epoch": 0.21414231541378542, + "grad_norm": 0.17648300528526306, + "learning_rate": 1.9796352526176746e-05, + "loss": 0.5317, + "step": 960 + }, + { + "epoch": 0.21436538032567476, + "grad_norm": 0.16647587716579437, + "learning_rate": 1.9795879720693264e-05, + "loss": 0.4989, + "step": 961 + }, + { + "epoch": 0.21458844523756412, + "grad_norm": 0.17619404196739197, + "learning_rate": 1.9795406372652345e-05, + "loss": 0.5123, + "step": 962 + }, + { + "epoch": 0.2148115101494535, + "grad_norm": 0.1789608895778656, + "learning_rate": 1.979493248208021e-05, + "loss": 0.5421, + "step": 963 + }, + { + "epoch": 0.21503457506134285, + "grad_norm": 0.16893987357616425, + "learning_rate": 1.97944580490031e-05, + "loss": 0.5226, + "step": 964 + }, + { + "epoch": 0.21525763997323222, + "grad_norm": 0.1653861552476883, + "learning_rate": 1.9793983073447288e-05, + "loss": 0.5221, + "step": 965 + }, + { + "epoch": 0.21548070488512158, + "grad_norm": 0.17093954980373383, + "learning_rate": 1.9793507555439092e-05, + "loss": 0.535, + "step": 966 + }, + { + "epoch": 0.21570376979701092, + "grad_norm": 0.18254542350769043, + "learning_rate": 1.9793031495004845e-05, + "loss": 0.5585, + "step": 967 + }, + { + "epoch": 0.21592683470890028, + "grad_norm": 0.19674773514270782, + "learning_rate": 1.9792554892170908e-05, + "loss": 0.5159, + "step": 968 + }, + { + "epoch": 0.21614989962078965, + "grad_norm": 0.17958855628967285, + "learning_rate": 1.9792077746963686e-05, + "loss": 0.5185, + "step": 969 + }, + { + "epoch": 0.216372964532679, + "grad_norm": 0.1741204559803009, + "learning_rate": 1.9791600059409606e-05, + "loss": 0.5325, + "step": 970 + }, + { + "epoch": 0.21659602944456838, + "grad_norm": 0.1755409985780716, + "learning_rate": 1.9791121829535122e-05, + "loss": 0.5005, + "step": 971 + }, + { + "epoch": 0.21681909435645774, + "grad_norm": 0.17128707468509674, + "learning_rate": 1.979064305736672e-05, + "loss": 0.5345, + "step": 972 + }, + { + "epoch": 0.21704215926834708, + "grad_norm": 0.17565475404262543, + "learning_rate": 1.9790163742930922e-05, + "loss": 0.4964, + "step": 973 + }, + { + "epoch": 0.21726522418023644, + "grad_norm": 0.1766713559627533, + "learning_rate": 1.978968388625427e-05, + "loss": 0.5151, + "step": 974 + }, + { + "epoch": 0.2174882890921258, + "grad_norm": 0.1699419617652893, + "learning_rate": 1.9789203487363352e-05, + "loss": 0.5365, + "step": 975 + }, + { + "epoch": 0.21771135400401517, + "grad_norm": 0.20117329061031342, + "learning_rate": 1.978872254628476e-05, + "loss": 0.4942, + "step": 976 + }, + { + "epoch": 0.21793441891590454, + "grad_norm": 0.17031805217266083, + "learning_rate": 1.9788241063045147e-05, + "loss": 0.5262, + "step": 977 + }, + { + "epoch": 0.21815748382779387, + "grad_norm": 0.17150211334228516, + "learning_rate": 1.9787759037671172e-05, + "loss": 0.5169, + "step": 978 + }, + { + "epoch": 0.21838054873968324, + "grad_norm": 0.17386960983276367, + "learning_rate": 1.978727647018953e-05, + "loss": 0.5464, + "step": 979 + }, + { + "epoch": 0.2186036136515726, + "grad_norm": 0.17337769269943237, + "learning_rate": 1.9786793360626956e-05, + "loss": 0.5217, + "step": 980 + }, + { + "epoch": 0.21882667856346197, + "grad_norm": 0.17415766417980194, + "learning_rate": 1.9786309709010204e-05, + "loss": 0.5222, + "step": 981 + }, + { + "epoch": 0.21904974347535133, + "grad_norm": 0.17464500665664673, + "learning_rate": 1.978582551536606e-05, + "loss": 0.504, + "step": 982 + }, + { + "epoch": 0.2192728083872407, + "grad_norm": 0.1703118085861206, + "learning_rate": 1.9785340779721348e-05, + "loss": 0.5419, + "step": 983 + }, + { + "epoch": 0.21949587329913003, + "grad_norm": 0.1777229905128479, + "learning_rate": 1.9784855502102908e-05, + "loss": 0.5396, + "step": 984 + }, + { + "epoch": 0.2197189382110194, + "grad_norm": 0.1821225881576538, + "learning_rate": 1.978436968253762e-05, + "loss": 0.5214, + "step": 985 + }, + { + "epoch": 0.21994200312290876, + "grad_norm": 0.17865097522735596, + "learning_rate": 1.9783883321052394e-05, + "loss": 0.5354, + "step": 986 + }, + { + "epoch": 0.22016506803479813, + "grad_norm": 0.17980031669139862, + "learning_rate": 1.978339641767417e-05, + "loss": 0.5461, + "step": 987 + }, + { + "epoch": 0.2203881329466875, + "grad_norm": 0.18906496465206146, + "learning_rate": 1.9782908972429906e-05, + "loss": 0.5466, + "step": 988 + }, + { + "epoch": 0.22061119785857686, + "grad_norm": 0.20680966973304749, + "learning_rate": 1.978242098534661e-05, + "loss": 0.4944, + "step": 989 + }, + { + "epoch": 0.2208342627704662, + "grad_norm": 0.18064817786216736, + "learning_rate": 1.978193245645131e-05, + "loss": 0.5418, + "step": 990 + }, + { + "epoch": 0.22105732768235556, + "grad_norm": 0.18148301541805267, + "learning_rate": 1.978144338577105e-05, + "loss": 0.5348, + "step": 991 + }, + { + "epoch": 0.22128039259424492, + "grad_norm": 0.18182723224163055, + "learning_rate": 1.9780953773332933e-05, + "loss": 0.5159, + "step": 992 + }, + { + "epoch": 0.2215034575061343, + "grad_norm": 0.17886720597743988, + "learning_rate": 1.9780463619164073e-05, + "loss": 0.5211, + "step": 993 + }, + { + "epoch": 0.22172652241802365, + "grad_norm": 0.1731138974428177, + "learning_rate": 1.9779972923291615e-05, + "loss": 0.5386, + "step": 994 + }, + { + "epoch": 0.22194958732991302, + "grad_norm": 0.17276941239833832, + "learning_rate": 1.977948168574274e-05, + "loss": 0.553, + "step": 995 + }, + { + "epoch": 0.22217265224180235, + "grad_norm": 0.1833122968673706, + "learning_rate": 1.977898990654465e-05, + "loss": 0.5082, + "step": 996 + }, + { + "epoch": 0.22239571715369172, + "grad_norm": 0.17474150657653809, + "learning_rate": 1.9778497585724586e-05, + "loss": 0.5167, + "step": 997 + }, + { + "epoch": 0.22261878206558108, + "grad_norm": 0.18032675981521606, + "learning_rate": 1.977800472330982e-05, + "loss": 0.5351, + "step": 998 + }, + { + "epoch": 0.22284184697747045, + "grad_norm": 0.19136174023151398, + "learning_rate": 1.9777511319327645e-05, + "loss": 0.5387, + "step": 999 + }, + { + "epoch": 0.2230649118893598, + "grad_norm": 0.17949000000953674, + "learning_rate": 1.977701737380539e-05, + "loss": 0.5132, + "step": 1000 + }, + { + "epoch": 0.22328797680124918, + "grad_norm": 0.16305360198020935, + "learning_rate": 1.9776522886770413e-05, + "loss": 0.4798, + "step": 1001 + }, + { + "epoch": 0.22351104171313851, + "grad_norm": 0.18127582967281342, + "learning_rate": 1.9776027858250102e-05, + "loss": 0.5314, + "step": 1002 + }, + { + "epoch": 0.22373410662502788, + "grad_norm": 0.17198516428470612, + "learning_rate": 1.9775532288271876e-05, + "loss": 0.5356, + "step": 1003 + }, + { + "epoch": 0.22395717153691724, + "grad_norm": 0.17598840594291687, + "learning_rate": 1.9775036176863178e-05, + "loss": 0.5243, + "step": 1004 + }, + { + "epoch": 0.2241802364488066, + "grad_norm": 0.17470763623714447, + "learning_rate": 1.977453952405149e-05, + "loss": 0.5138, + "step": 1005 + }, + { + "epoch": 0.22440330136069597, + "grad_norm": 0.17710572481155396, + "learning_rate": 1.977404232986432e-05, + "loss": 0.5103, + "step": 1006 + }, + { + "epoch": 0.2246263662725853, + "grad_norm": 0.16758602857589722, + "learning_rate": 1.9773544594329202e-05, + "loss": 0.5685, + "step": 1007 + }, + { + "epoch": 0.22484943118447467, + "grad_norm": 0.17261147499084473, + "learning_rate": 1.977304631747371e-05, + "loss": 0.5149, + "step": 1008 + }, + { + "epoch": 0.22507249609636404, + "grad_norm": 0.1722407191991806, + "learning_rate": 1.9772547499325437e-05, + "loss": 0.5277, + "step": 1009 + }, + { + "epoch": 0.2252955610082534, + "grad_norm": 0.17808939516544342, + "learning_rate": 1.9772048139912012e-05, + "loss": 0.5392, + "step": 1010 + }, + { + "epoch": 0.22551862592014277, + "grad_norm": 0.17333589494228363, + "learning_rate": 1.9771548239261088e-05, + "loss": 0.5347, + "step": 1011 + }, + { + "epoch": 0.22574169083203213, + "grad_norm": 0.1775379180908203, + "learning_rate": 1.9771047797400363e-05, + "loss": 0.5243, + "step": 1012 + }, + { + "epoch": 0.22596475574392147, + "grad_norm": 0.19107598066329956, + "learning_rate": 1.9770546814357546e-05, + "loss": 0.5464, + "step": 1013 + }, + { + "epoch": 0.22618782065581083, + "grad_norm": 0.1707477569580078, + "learning_rate": 1.9770045290160388e-05, + "loss": 0.5337, + "step": 1014 + }, + { + "epoch": 0.2264108855677002, + "grad_norm": 0.19580954313278198, + "learning_rate": 1.9769543224836668e-05, + "loss": 0.5138, + "step": 1015 + }, + { + "epoch": 0.22663395047958956, + "grad_norm": 0.17244374752044678, + "learning_rate": 1.9769040618414187e-05, + "loss": 0.5232, + "step": 1016 + }, + { + "epoch": 0.22685701539147893, + "grad_norm": 0.16292127966880798, + "learning_rate": 1.9768537470920788e-05, + "loss": 0.4989, + "step": 1017 + }, + { + "epoch": 0.2270800803033683, + "grad_norm": 0.18515698611736298, + "learning_rate": 1.9768033782384338e-05, + "loss": 0.5316, + "step": 1018 + }, + { + "epoch": 0.22730314521525763, + "grad_norm": 0.16262215375900269, + "learning_rate": 1.9767529552832732e-05, + "loss": 0.4855, + "step": 1019 + }, + { + "epoch": 0.227526210127147, + "grad_norm": 0.17309491336345673, + "learning_rate": 1.9767024782293902e-05, + "loss": 0.5041, + "step": 1020 + }, + { + "epoch": 0.22774927503903636, + "grad_norm": 0.1719563603401184, + "learning_rate": 1.9766519470795803e-05, + "loss": 0.5412, + "step": 1021 + }, + { + "epoch": 0.22797233995092572, + "grad_norm": 0.2674558460712433, + "learning_rate": 1.9766013618366417e-05, + "loss": 0.5274, + "step": 1022 + }, + { + "epoch": 0.2281954048628151, + "grad_norm": 0.16945339739322662, + "learning_rate": 1.9765507225033772e-05, + "loss": 0.5034, + "step": 1023 + }, + { + "epoch": 0.22841846977470445, + "grad_norm": 0.19735541939735413, + "learning_rate": 1.9765000290825908e-05, + "loss": 0.5059, + "step": 1024 + }, + { + "epoch": 0.2286415346865938, + "grad_norm": 0.17197129130363464, + "learning_rate": 1.97644928157709e-05, + "loss": 0.5133, + "step": 1025 + }, + { + "epoch": 0.22886459959848315, + "grad_norm": 0.18425902724266052, + "learning_rate": 1.976398479989686e-05, + "loss": 0.5062, + "step": 1026 + }, + { + "epoch": 0.22908766451037252, + "grad_norm": 0.4941766858100891, + "learning_rate": 1.9763476243231924e-05, + "loss": 0.535, + "step": 1027 + }, + { + "epoch": 0.22931072942226188, + "grad_norm": 0.16707998514175415, + "learning_rate": 1.976296714580426e-05, + "loss": 0.5177, + "step": 1028 + }, + { + "epoch": 0.22953379433415125, + "grad_norm": 0.173675537109375, + "learning_rate": 1.9762457507642066e-05, + "loss": 0.5234, + "step": 1029 + }, + { + "epoch": 0.22975685924604058, + "grad_norm": 0.18508677184581757, + "learning_rate": 1.9761947328773565e-05, + "loss": 0.5073, + "step": 1030 + }, + { + "epoch": 0.22997992415792995, + "grad_norm": 0.2566058039665222, + "learning_rate": 1.9761436609227016e-05, + "loss": 0.5176, + "step": 1031 + }, + { + "epoch": 0.2302029890698193, + "grad_norm": 0.1702575534582138, + "learning_rate": 1.9760925349030704e-05, + "loss": 0.5106, + "step": 1032 + }, + { + "epoch": 0.23042605398170868, + "grad_norm": 0.18858478963375092, + "learning_rate": 1.976041354821295e-05, + "loss": 0.5052, + "step": 1033 + }, + { + "epoch": 0.23064911889359804, + "grad_norm": 0.191171795129776, + "learning_rate": 1.9759901206802098e-05, + "loss": 0.5643, + "step": 1034 + }, + { + "epoch": 0.2308721838054874, + "grad_norm": 0.17536719143390656, + "learning_rate": 1.9759388324826523e-05, + "loss": 0.5344, + "step": 1035 + }, + { + "epoch": 0.23109524871737674, + "grad_norm": 0.21609671413898468, + "learning_rate": 1.9758874902314634e-05, + "loss": 0.5197, + "step": 1036 + }, + { + "epoch": 0.2313183136292661, + "grad_norm": 0.16850803792476654, + "learning_rate": 1.9758360939294867e-05, + "loss": 0.4967, + "step": 1037 + }, + { + "epoch": 0.23154137854115547, + "grad_norm": 0.19448642432689667, + "learning_rate": 1.9757846435795688e-05, + "loss": 0.5171, + "step": 1038 + }, + { + "epoch": 0.23176444345304484, + "grad_norm": 0.19304804503917694, + "learning_rate": 1.9757331391845596e-05, + "loss": 0.5491, + "step": 1039 + }, + { + "epoch": 0.2319875083649342, + "grad_norm": 0.17683899402618408, + "learning_rate": 1.975681580747312e-05, + "loss": 0.535, + "step": 1040 + }, + { + "epoch": 0.23221057327682357, + "grad_norm": 0.17681746184825897, + "learning_rate": 1.9756299682706804e-05, + "loss": 0.5053, + "step": 1041 + }, + { + "epoch": 0.2324336381887129, + "grad_norm": 0.17382597923278809, + "learning_rate": 1.9755783017575244e-05, + "loss": 0.5063, + "step": 1042 + }, + { + "epoch": 0.23265670310060227, + "grad_norm": 0.17841431498527527, + "learning_rate": 1.9755265812107053e-05, + "loss": 0.4845, + "step": 1043 + }, + { + "epoch": 0.23287976801249163, + "grad_norm": 0.18297810852527618, + "learning_rate": 1.9754748066330883e-05, + "loss": 0.5418, + "step": 1044 + }, + { + "epoch": 0.233102832924381, + "grad_norm": 0.19104409217834473, + "learning_rate": 1.97542297802754e-05, + "loss": 0.5417, + "step": 1045 + }, + { + "epoch": 0.23332589783627036, + "grad_norm": 0.1772020310163498, + "learning_rate": 1.975371095396932e-05, + "loss": 0.5296, + "step": 1046 + }, + { + "epoch": 0.23354896274815973, + "grad_norm": 0.18861500918865204, + "learning_rate": 1.9753191587441372e-05, + "loss": 0.5667, + "step": 1047 + }, + { + "epoch": 0.23377202766004906, + "grad_norm": 0.17586645483970642, + "learning_rate": 1.9752671680720324e-05, + "loss": 0.5001, + "step": 1048 + }, + { + "epoch": 0.23399509257193843, + "grad_norm": 0.7259854674339294, + "learning_rate": 1.975215123383497e-05, + "loss": 0.5456, + "step": 1049 + }, + { + "epoch": 0.2342181574838278, + "grad_norm": 0.18736739456653595, + "learning_rate": 1.9751630246814136e-05, + "loss": 0.5231, + "step": 1050 + }, + { + "epoch": 0.23444122239571716, + "grad_norm": 0.2538264989852905, + "learning_rate": 1.9751108719686683e-05, + "loss": 0.5387, + "step": 1051 + }, + { + "epoch": 0.23466428730760652, + "grad_norm": 0.17714229226112366, + "learning_rate": 1.9750586652481492e-05, + "loss": 0.5076, + "step": 1052 + }, + { + "epoch": 0.23488735221949586, + "grad_norm": 0.3667065501213074, + "learning_rate": 1.9750064045227474e-05, + "loss": 0.5432, + "step": 1053 + }, + { + "epoch": 0.23511041713138522, + "grad_norm": 0.18610940873622894, + "learning_rate": 1.9749540897953584e-05, + "loss": 0.5307, + "step": 1054 + }, + { + "epoch": 0.2353334820432746, + "grad_norm": 0.18711121380329132, + "learning_rate": 1.974901721068879e-05, + "loss": 0.5256, + "step": 1055 + }, + { + "epoch": 0.23555654695516395, + "grad_norm": 0.17803776264190674, + "learning_rate": 1.97484929834621e-05, + "loss": 0.5076, + "step": 1056 + }, + { + "epoch": 0.23577961186705332, + "grad_norm": 0.19948653876781464, + "learning_rate": 1.9747968216302545e-05, + "loss": 0.5185, + "step": 1057 + }, + { + "epoch": 0.23600267677894268, + "grad_norm": 0.3181793987751007, + "learning_rate": 1.9747442909239198e-05, + "loss": 0.4874, + "step": 1058 + }, + { + "epoch": 0.23622574169083202, + "grad_norm": 0.1865629106760025, + "learning_rate": 1.9746917062301146e-05, + "loss": 0.52, + "step": 1059 + }, + { + "epoch": 0.23644880660272138, + "grad_norm": 0.23274224996566772, + "learning_rate": 1.9746390675517514e-05, + "loss": 0.496, + "step": 1060 + }, + { + "epoch": 0.23667187151461075, + "grad_norm": 0.2044648379087448, + "learning_rate": 1.974586374891746e-05, + "loss": 0.5173, + "step": 1061 + }, + { + "epoch": 0.2368949364265001, + "grad_norm": 0.18317201733589172, + "learning_rate": 1.974533628253017e-05, + "loss": 0.5282, + "step": 1062 + }, + { + "epoch": 0.23711800133838948, + "grad_norm": 0.18429698050022125, + "learning_rate": 1.9744808276384858e-05, + "loss": 0.5395, + "step": 1063 + }, + { + "epoch": 0.23734106625027884, + "grad_norm": 0.20926502346992493, + "learning_rate": 1.9744279730510764e-05, + "loss": 0.5111, + "step": 1064 + }, + { + "epoch": 0.23756413116216818, + "grad_norm": 0.17823931574821472, + "learning_rate": 1.974375064493716e-05, + "loss": 0.5258, + "step": 1065 + }, + { + "epoch": 0.23778719607405754, + "grad_norm": 0.19109368324279785, + "learning_rate": 1.9743221019693362e-05, + "loss": 0.5512, + "step": 1066 + }, + { + "epoch": 0.2380102609859469, + "grad_norm": 0.21680758893489838, + "learning_rate": 1.9742690854808692e-05, + "loss": 0.4951, + "step": 1067 + }, + { + "epoch": 0.23823332589783627, + "grad_norm": 0.1768484115600586, + "learning_rate": 1.974216015031252e-05, + "loss": 0.534, + "step": 1068 + }, + { + "epoch": 0.23845639080972564, + "grad_norm": 0.18384598195552826, + "learning_rate": 1.974162890623424e-05, + "loss": 0.502, + "step": 1069 + }, + { + "epoch": 0.238679455721615, + "grad_norm": 0.17585726082324982, + "learning_rate": 1.974109712260327e-05, + "loss": 0.5177, + "step": 1070 + }, + { + "epoch": 0.23890252063350434, + "grad_norm": 0.17469032108783722, + "learning_rate": 1.9740564799449073e-05, + "loss": 0.553, + "step": 1071 + }, + { + "epoch": 0.2391255855453937, + "grad_norm": 0.21859519183635712, + "learning_rate": 1.9740031936801122e-05, + "loss": 0.5204, + "step": 1072 + }, + { + "epoch": 0.23934865045728307, + "grad_norm": 0.36654403805732727, + "learning_rate": 1.9739498534688936e-05, + "loss": 0.5375, + "step": 1073 + }, + { + "epoch": 0.23957171536917243, + "grad_norm": 0.17866800725460052, + "learning_rate": 1.973896459314206e-05, + "loss": 0.5477, + "step": 1074 + }, + { + "epoch": 0.2397947802810618, + "grad_norm": 0.1822461485862732, + "learning_rate": 1.973843011219006e-05, + "loss": 0.5197, + "step": 1075 + }, + { + "epoch": 0.24001784519295116, + "grad_norm": 0.17180079221725464, + "learning_rate": 1.9737895091862545e-05, + "loss": 0.5269, + "step": 1076 + }, + { + "epoch": 0.2402409101048405, + "grad_norm": 0.17935633659362793, + "learning_rate": 1.9737359532189147e-05, + "loss": 0.5279, + "step": 1077 + }, + { + "epoch": 0.24046397501672986, + "grad_norm": 0.22954648733139038, + "learning_rate": 1.9736823433199524e-05, + "loss": 0.5199, + "step": 1078 + }, + { + "epoch": 0.24068703992861923, + "grad_norm": 0.17607101798057556, + "learning_rate": 1.973628679492338e-05, + "loss": 0.5121, + "step": 1079 + }, + { + "epoch": 0.2409101048405086, + "grad_norm": 0.17458245158195496, + "learning_rate": 1.9735749617390422e-05, + "loss": 0.5363, + "step": 1080 + }, + { + "epoch": 0.24113316975239796, + "grad_norm": 0.17682591080665588, + "learning_rate": 1.9735211900630414e-05, + "loss": 0.5254, + "step": 1081 + }, + { + "epoch": 0.2413562346642873, + "grad_norm": 0.17350362241268158, + "learning_rate": 1.9734673644673133e-05, + "loss": 0.5381, + "step": 1082 + }, + { + "epoch": 0.24157929957617666, + "grad_norm": 0.17231930792331696, + "learning_rate": 1.973413484954839e-05, + "loss": 0.5118, + "step": 1083 + }, + { + "epoch": 0.24180236448806602, + "grad_norm": 0.21497975289821625, + "learning_rate": 1.9733595515286032e-05, + "loss": 0.5353, + "step": 1084 + }, + { + "epoch": 0.2420254293999554, + "grad_norm": 0.1669948548078537, + "learning_rate": 1.9733055641915926e-05, + "loss": 0.5216, + "step": 1085 + }, + { + "epoch": 0.24224849431184475, + "grad_norm": 0.18229244649410248, + "learning_rate": 1.9732515229467973e-05, + "loss": 0.5379, + "step": 1086 + }, + { + "epoch": 0.24247155922373412, + "grad_norm": 0.17432808876037598, + "learning_rate": 1.973197427797211e-05, + "loss": 0.5461, + "step": 1087 + }, + { + "epoch": 0.24269462413562345, + "grad_norm": 0.18571536242961884, + "learning_rate": 1.9731432787458294e-05, + "loss": 0.5469, + "step": 1088 + }, + { + "epoch": 0.24291768904751282, + "grad_norm": 0.18052545189857483, + "learning_rate": 1.9730890757956517e-05, + "loss": 0.5416, + "step": 1089 + }, + { + "epoch": 0.24314075395940218, + "grad_norm": 0.23118536174297333, + "learning_rate": 1.97303481894968e-05, + "loss": 0.5063, + "step": 1090 + }, + { + "epoch": 0.24336381887129155, + "grad_norm": 0.16191843152046204, + "learning_rate": 1.9729805082109194e-05, + "loss": 0.5178, + "step": 1091 + }, + { + "epoch": 0.2435868837831809, + "grad_norm": 0.17042016983032227, + "learning_rate": 1.9729261435823782e-05, + "loss": 0.5024, + "step": 1092 + }, + { + "epoch": 0.24380994869507028, + "grad_norm": 0.21003969013690948, + "learning_rate": 1.972871725067067e-05, + "loss": 0.5106, + "step": 1093 + }, + { + "epoch": 0.24403301360695961, + "grad_norm": 0.22672854363918304, + "learning_rate": 1.972817252668e-05, + "loss": 0.5309, + "step": 1094 + }, + { + "epoch": 0.24425607851884898, + "grad_norm": 0.16994351148605347, + "learning_rate": 1.9727627263881942e-05, + "loss": 0.5023, + "step": 1095 + }, + { + "epoch": 0.24447914343073834, + "grad_norm": 0.17019368708133698, + "learning_rate": 1.9727081462306697e-05, + "loss": 0.5316, + "step": 1096 + }, + { + "epoch": 0.2447022083426277, + "grad_norm": 0.17145198583602905, + "learning_rate": 1.97265351219845e-05, + "loss": 0.5148, + "step": 1097 + }, + { + "epoch": 0.24492527325451707, + "grad_norm": 0.20565351843833923, + "learning_rate": 1.9725988242945598e-05, + "loss": 0.5445, + "step": 1098 + }, + { + "epoch": 0.24514833816640644, + "grad_norm": 0.1741577833890915, + "learning_rate": 1.9725440825220296e-05, + "loss": 0.4958, + "step": 1099 + }, + { + "epoch": 0.24537140307829577, + "grad_norm": 0.18705230951309204, + "learning_rate": 1.9724892868838902e-05, + "loss": 0.5105, + "step": 1100 + }, + { + "epoch": 0.24559446799018514, + "grad_norm": 0.5166819095611572, + "learning_rate": 1.9724344373831768e-05, + "loss": 0.5414, + "step": 1101 + }, + { + "epoch": 0.2458175329020745, + "grad_norm": 0.17351533472537994, + "learning_rate": 1.9723795340229274e-05, + "loss": 0.5024, + "step": 1102 + }, + { + "epoch": 0.24604059781396387, + "grad_norm": 0.1658518761396408, + "learning_rate": 1.972324576806183e-05, + "loss": 0.4977, + "step": 1103 + }, + { + "epoch": 0.24626366272585323, + "grad_norm": 0.16890370845794678, + "learning_rate": 1.972269565735987e-05, + "loss": 0.5359, + "step": 1104 + }, + { + "epoch": 0.24648672763774257, + "grad_norm": 0.18016333878040314, + "learning_rate": 1.9722145008153873e-05, + "loss": 0.5394, + "step": 1105 + }, + { + "epoch": 0.24670979254963193, + "grad_norm": 0.16652169823646545, + "learning_rate": 1.9721593820474326e-05, + "loss": 0.5089, + "step": 1106 + }, + { + "epoch": 0.2469328574615213, + "grad_norm": 0.1999325454235077, + "learning_rate": 1.9721042094351764e-05, + "loss": 0.5541, + "step": 1107 + }, + { + "epoch": 0.24715592237341066, + "grad_norm": 0.16372385621070862, + "learning_rate": 1.972048982981674e-05, + "loss": 0.4942, + "step": 1108 + }, + { + "epoch": 0.24737898728530003, + "grad_norm": 0.20245228707790375, + "learning_rate": 1.971993702689985e-05, + "loss": 0.5034, + "step": 1109 + }, + { + "epoch": 0.2476020521971894, + "grad_norm": 0.17487917840480804, + "learning_rate": 1.97193836856317e-05, + "loss": 0.5231, + "step": 1110 + }, + { + "epoch": 0.24782511710907873, + "grad_norm": 0.1696334332227707, + "learning_rate": 1.971882980604295e-05, + "loss": 0.5141, + "step": 1111 + }, + { + "epoch": 0.2480481820209681, + "grad_norm": 0.17021594941616058, + "learning_rate": 1.971827538816427e-05, + "loss": 0.5081, + "step": 1112 + }, + { + "epoch": 0.24827124693285746, + "grad_norm": 0.18088696897029877, + "learning_rate": 1.9717720432026367e-05, + "loss": 0.5743, + "step": 1113 + }, + { + "epoch": 0.24849431184474682, + "grad_norm": 0.17647258937358856, + "learning_rate": 1.9717164937659984e-05, + "loss": 0.5289, + "step": 1114 + }, + { + "epoch": 0.2487173767566362, + "grad_norm": 0.17265263199806213, + "learning_rate": 1.971660890509588e-05, + "loss": 0.5296, + "step": 1115 + }, + { + "epoch": 0.24894044166852555, + "grad_norm": 0.16816379129886627, + "learning_rate": 1.971605233436485e-05, + "loss": 0.5263, + "step": 1116 + }, + { + "epoch": 0.2491635065804149, + "grad_norm": 0.17548470199108124, + "learning_rate": 1.9715495225497736e-05, + "loss": 0.5315, + "step": 1117 + }, + { + "epoch": 0.24938657149230425, + "grad_norm": 0.1775428056716919, + "learning_rate": 1.9714937578525374e-05, + "loss": 0.5227, + "step": 1118 + }, + { + "epoch": 0.24960963640419362, + "grad_norm": 0.17629674077033997, + "learning_rate": 1.971437939347866e-05, + "loss": 0.5475, + "step": 1119 + }, + { + "epoch": 0.24983270131608298, + "grad_norm": 0.20265498757362366, + "learning_rate": 1.9713820670388518e-05, + "loss": 0.5415, + "step": 1120 + }, + { + "epoch": 0.2500557662279723, + "grad_norm": 0.17958010733127594, + "learning_rate": 1.9713261409285876e-05, + "loss": 0.5491, + "step": 1121 + }, + { + "epoch": 0.2502788311398617, + "grad_norm": 0.1667289137840271, + "learning_rate": 1.9712701610201723e-05, + "loss": 0.5319, + "step": 1122 + }, + { + "epoch": 0.25050189605175105, + "grad_norm": 0.17537854611873627, + "learning_rate": 1.9712141273167058e-05, + "loss": 0.5033, + "step": 1123 + }, + { + "epoch": 0.25072496096364044, + "grad_norm": 0.1665065735578537, + "learning_rate": 1.9711580398212918e-05, + "loss": 0.5043, + "step": 1124 + }, + { + "epoch": 0.2509480258755298, + "grad_norm": 0.17893299460411072, + "learning_rate": 1.9711018985370366e-05, + "loss": 0.5424, + "step": 1125 + }, + { + "epoch": 0.2511710907874191, + "grad_norm": 0.17293697595596313, + "learning_rate": 1.97104570346705e-05, + "loss": 0.546, + "step": 1126 + }, + { + "epoch": 0.2513941556993085, + "grad_norm": 0.16615547239780426, + "learning_rate": 1.970989454614444e-05, + "loss": 0.5163, + "step": 1127 + }, + { + "epoch": 0.25161722061119784, + "grad_norm": 0.16944670677185059, + "learning_rate": 1.9709331519823343e-05, + "loss": 0.5471, + "step": 1128 + }, + { + "epoch": 0.25184028552308724, + "grad_norm": 0.179295152425766, + "learning_rate": 1.9708767955738394e-05, + "loss": 0.5381, + "step": 1129 + }, + { + "epoch": 0.2520633504349766, + "grad_norm": 0.1665600687265396, + "learning_rate": 1.9708203853920803e-05, + "loss": 0.4863, + "step": 1130 + }, + { + "epoch": 0.2522864153468659, + "grad_norm": 0.1918114423751831, + "learning_rate": 1.970763921440182e-05, + "loss": 0.4875, + "step": 1131 + }, + { + "epoch": 0.2525094802587553, + "grad_norm": 0.16761860251426697, + "learning_rate": 1.9707074037212707e-05, + "loss": 0.5314, + "step": 1132 + }, + { + "epoch": 0.25273254517064464, + "grad_norm": 0.19442524015903473, + "learning_rate": 1.970650832238478e-05, + "loss": 0.5186, + "step": 1133 + }, + { + "epoch": 0.25295561008253403, + "grad_norm": 0.17174071073532104, + "learning_rate": 1.9705942069949362e-05, + "loss": 0.5327, + "step": 1134 + }, + { + "epoch": 0.25317867499442337, + "grad_norm": 0.18218085169792175, + "learning_rate": 1.970537527993782e-05, + "loss": 0.5665, + "step": 1135 + }, + { + "epoch": 0.25340173990631276, + "grad_norm": 0.17595386505126953, + "learning_rate": 1.9704807952381542e-05, + "loss": 0.5581, + "step": 1136 + }, + { + "epoch": 0.2536248048182021, + "grad_norm": 0.20507752895355225, + "learning_rate": 1.9704240087311963e-05, + "loss": 0.541, + "step": 1137 + }, + { + "epoch": 0.25384786973009144, + "grad_norm": 0.1604541391134262, + "learning_rate": 1.970367168476052e-05, + "loss": 0.5223, + "step": 1138 + }, + { + "epoch": 0.25407093464198083, + "grad_norm": 0.16794823110103607, + "learning_rate": 1.9703102744758703e-05, + "loss": 0.5444, + "step": 1139 + }, + { + "epoch": 0.25429399955387016, + "grad_norm": 0.16887861490249634, + "learning_rate": 1.9702533267338015e-05, + "loss": 0.5237, + "step": 1140 + }, + { + "epoch": 0.25451706446575956, + "grad_norm": 0.16295503079891205, + "learning_rate": 1.970196325253001e-05, + "loss": 0.5293, + "step": 1141 + }, + { + "epoch": 0.2547401293776489, + "grad_norm": 0.16665813326835632, + "learning_rate": 1.9701392700366247e-05, + "loss": 0.5104, + "step": 1142 + }, + { + "epoch": 0.25496319428953823, + "grad_norm": 0.16648006439208984, + "learning_rate": 1.970082161087834e-05, + "loss": 0.5001, + "step": 1143 + }, + { + "epoch": 0.2551862592014276, + "grad_norm": 0.16400828957557678, + "learning_rate": 1.9700249984097907e-05, + "loss": 0.5106, + "step": 1144 + }, + { + "epoch": 0.25540932411331696, + "grad_norm": 0.16578614711761475, + "learning_rate": 1.969967782005661e-05, + "loss": 0.5134, + "step": 1145 + }, + { + "epoch": 0.25563238902520635, + "grad_norm": 0.16847828030586243, + "learning_rate": 1.9699105118786145e-05, + "loss": 0.4994, + "step": 1146 + }, + { + "epoch": 0.2558554539370957, + "grad_norm": 0.18443261086940765, + "learning_rate": 1.9698531880318228e-05, + "loss": 0.5136, + "step": 1147 + }, + { + "epoch": 0.2560785188489851, + "grad_norm": 0.17291708290576935, + "learning_rate": 1.969795810468461e-05, + "loss": 0.5243, + "step": 1148 + }, + { + "epoch": 0.2563015837608744, + "grad_norm": 0.18102054297924042, + "learning_rate": 1.9697383791917068e-05, + "loss": 0.4966, + "step": 1149 + }, + { + "epoch": 0.25652464867276376, + "grad_norm": 0.17706115543842316, + "learning_rate": 1.9696808942047414e-05, + "loss": 0.5332, + "step": 1150 + }, + { + "epoch": 0.25674771358465315, + "grad_norm": 0.1936006247997284, + "learning_rate": 1.9696233555107484e-05, + "loss": 0.5325, + "step": 1151 + }, + { + "epoch": 0.2569707784965425, + "grad_norm": 0.19508236646652222, + "learning_rate": 1.969565763112915e-05, + "loss": 0.509, + "step": 1152 + }, + { + "epoch": 0.2571938434084319, + "grad_norm": 0.16369056701660156, + "learning_rate": 1.9695081170144306e-05, + "loss": 0.5188, + "step": 1153 + }, + { + "epoch": 0.2574169083203212, + "grad_norm": 0.16955684125423431, + "learning_rate": 1.9694504172184885e-05, + "loss": 0.5316, + "step": 1154 + }, + { + "epoch": 0.25763997323221055, + "grad_norm": 0.16252164542675018, + "learning_rate": 1.969392663728284e-05, + "loss": 0.5291, + "step": 1155 + }, + { + "epoch": 0.25786303814409994, + "grad_norm": 0.17399340867996216, + "learning_rate": 1.969334856547016e-05, + "loss": 0.5153, + "step": 1156 + }, + { + "epoch": 0.2580861030559893, + "grad_norm": 0.17638690769672394, + "learning_rate": 1.9692769956778867e-05, + "loss": 0.5128, + "step": 1157 + }, + { + "epoch": 0.2583091679678787, + "grad_norm": 0.1711435467004776, + "learning_rate": 1.9692190811241e-05, + "loss": 0.5205, + "step": 1158 + }, + { + "epoch": 0.258532232879768, + "grad_norm": 0.18419131636619568, + "learning_rate": 1.9691611128888643e-05, + "loss": 0.5312, + "step": 1159 + }, + { + "epoch": 0.25875529779165735, + "grad_norm": 0.1925041675567627, + "learning_rate": 1.9691030909753894e-05, + "loss": 0.5479, + "step": 1160 + }, + { + "epoch": 0.25897836270354674, + "grad_norm": 0.1986900418996811, + "learning_rate": 1.9690450153868895e-05, + "loss": 0.5095, + "step": 1161 + }, + { + "epoch": 0.2592014276154361, + "grad_norm": 0.17978021502494812, + "learning_rate": 1.9689868861265816e-05, + "loss": 0.5015, + "step": 1162 + }, + { + "epoch": 0.25942449252732547, + "grad_norm": 0.16919931769371033, + "learning_rate": 1.9689287031976845e-05, + "loss": 0.5227, + "step": 1163 + }, + { + "epoch": 0.2596475574392148, + "grad_norm": 0.17485311627388, + "learning_rate": 1.9688704666034208e-05, + "loss": 0.5386, + "step": 1164 + }, + { + "epoch": 0.2598706223511042, + "grad_norm": 0.17683085799217224, + "learning_rate": 1.9688121763470165e-05, + "loss": 0.5201, + "step": 1165 + }, + { + "epoch": 0.26009368726299353, + "grad_norm": 0.18902894854545593, + "learning_rate": 1.9687538324316997e-05, + "loss": 0.51, + "step": 1166 + }, + { + "epoch": 0.26031675217488287, + "grad_norm": 0.17308862507343292, + "learning_rate": 1.968695434860702e-05, + "loss": 0.5112, + "step": 1167 + }, + { + "epoch": 0.26053981708677226, + "grad_norm": 0.17456986010074615, + "learning_rate": 1.9686369836372577e-05, + "loss": 0.5182, + "step": 1168 + }, + { + "epoch": 0.2607628819986616, + "grad_norm": 0.17276804149150848, + "learning_rate": 1.9685784787646044e-05, + "loss": 0.5389, + "step": 1169 + }, + { + "epoch": 0.260985946910551, + "grad_norm": 0.1710672229528427, + "learning_rate": 1.9685199202459824e-05, + "loss": 0.4948, + "step": 1170 + }, + { + "epoch": 0.26120901182244033, + "grad_norm": 0.18158847093582153, + "learning_rate": 1.9684613080846347e-05, + "loss": 0.5332, + "step": 1171 + }, + { + "epoch": 0.26143207673432967, + "grad_norm": 0.17782646417617798, + "learning_rate": 1.968402642283808e-05, + "loss": 0.5519, + "step": 1172 + }, + { + "epoch": 0.26165514164621906, + "grad_norm": 0.1716913878917694, + "learning_rate": 1.9683439228467515e-05, + "loss": 0.5074, + "step": 1173 + }, + { + "epoch": 0.2618782065581084, + "grad_norm": 0.1818932741880417, + "learning_rate": 1.9682851497767175e-05, + "loss": 0.5401, + "step": 1174 + }, + { + "epoch": 0.2621012714699978, + "grad_norm": 0.1722068190574646, + "learning_rate": 1.9682263230769612e-05, + "loss": 0.52, + "step": 1175 + }, + { + "epoch": 0.2623243363818871, + "grad_norm": 0.17454420030117035, + "learning_rate": 1.9681674427507405e-05, + "loss": 0.5063, + "step": 1176 + }, + { + "epoch": 0.2625474012937765, + "grad_norm": 0.1726730465888977, + "learning_rate": 1.9681085088013174e-05, + "loss": 0.5343, + "step": 1177 + }, + { + "epoch": 0.26277046620566585, + "grad_norm": 0.1802458018064499, + "learning_rate": 1.9680495212319547e-05, + "loss": 0.5218, + "step": 1178 + }, + { + "epoch": 0.2629935311175552, + "grad_norm": 0.1708204448223114, + "learning_rate": 1.9679904800459205e-05, + "loss": 0.4943, + "step": 1179 + }, + { + "epoch": 0.2632165960294446, + "grad_norm": 0.1644791066646576, + "learning_rate": 1.9679313852464846e-05, + "loss": 0.534, + "step": 1180 + }, + { + "epoch": 0.2634396609413339, + "grad_norm": 0.16891272366046906, + "learning_rate": 1.9678722368369203e-05, + "loss": 0.519, + "step": 1181 + }, + { + "epoch": 0.2636627258532233, + "grad_norm": 0.1720377802848816, + "learning_rate": 1.9678130348205032e-05, + "loss": 0.5362, + "step": 1182 + }, + { + "epoch": 0.26388579076511265, + "grad_norm": 0.17827892303466797, + "learning_rate": 1.9677537792005124e-05, + "loss": 0.5387, + "step": 1183 + }, + { + "epoch": 0.264108855677002, + "grad_norm": 0.17692813277244568, + "learning_rate": 1.96769446998023e-05, + "loss": 0.5051, + "step": 1184 + }, + { + "epoch": 0.2643319205888914, + "grad_norm": 0.17577511072158813, + "learning_rate": 1.9676351071629405e-05, + "loss": 0.5162, + "step": 1185 + }, + { + "epoch": 0.2645549855007807, + "grad_norm": 0.1805581897497177, + "learning_rate": 1.9675756907519325e-05, + "loss": 0.5528, + "step": 1186 + }, + { + "epoch": 0.2647780504126701, + "grad_norm": 0.17259716987609863, + "learning_rate": 1.967516220750496e-05, + "loss": 0.5381, + "step": 1187 + }, + { + "epoch": 0.26500111532455944, + "grad_norm": 0.16992929577827454, + "learning_rate": 1.9674566971619256e-05, + "loss": 0.51, + "step": 1188 + }, + { + "epoch": 0.2652241802364488, + "grad_norm": 0.16945737600326538, + "learning_rate": 1.9673971199895177e-05, + "loss": 0.4987, + "step": 1189 + }, + { + "epoch": 0.2654472451483382, + "grad_norm": 0.16880926489830017, + "learning_rate": 1.967337489236572e-05, + "loss": 0.5188, + "step": 1190 + }, + { + "epoch": 0.2656703100602275, + "grad_norm": 0.17628756165504456, + "learning_rate": 1.9672778049063915e-05, + "loss": 0.5346, + "step": 1191 + }, + { + "epoch": 0.2658933749721169, + "grad_norm": 0.17406663298606873, + "learning_rate": 1.967218067002282e-05, + "loss": 0.5493, + "step": 1192 + }, + { + "epoch": 0.26611643988400624, + "grad_norm": 0.17142999172210693, + "learning_rate": 1.9671582755275515e-05, + "loss": 0.5256, + "step": 1193 + }, + { + "epoch": 0.26633950479589563, + "grad_norm": 0.17752200365066528, + "learning_rate": 1.9670984304855125e-05, + "loss": 0.5237, + "step": 1194 + }, + { + "epoch": 0.26656256970778497, + "grad_norm": 0.17670097947120667, + "learning_rate": 1.9670385318794785e-05, + "loss": 0.5226, + "step": 1195 + }, + { + "epoch": 0.2667856346196743, + "grad_norm": 0.1656324565410614, + "learning_rate": 1.966978579712768e-05, + "loss": 0.494, + "step": 1196 + }, + { + "epoch": 0.2670086995315637, + "grad_norm": 0.17369796335697174, + "learning_rate": 1.966918573988701e-05, + "loss": 0.506, + "step": 1197 + }, + { + "epoch": 0.26723176444345303, + "grad_norm": 0.17040349543094635, + "learning_rate": 1.9668585147106017e-05, + "loss": 0.5112, + "step": 1198 + }, + { + "epoch": 0.2674548293553424, + "grad_norm": 0.1750401258468628, + "learning_rate": 1.9667984018817957e-05, + "loss": 0.5465, + "step": 1199 + }, + { + "epoch": 0.26767789426723176, + "grad_norm": 0.16083772480487823, + "learning_rate": 1.9667382355056128e-05, + "loss": 0.5105, + "step": 1200 + }, + { + "epoch": 0.2679009591791211, + "grad_norm": 0.16108831763267517, + "learning_rate": 1.9666780155853854e-05, + "loss": 0.512, + "step": 1201 + }, + { + "epoch": 0.2681240240910105, + "grad_norm": 0.17286653816699982, + "learning_rate": 1.966617742124449e-05, + "loss": 0.5686, + "step": 1202 + }, + { + "epoch": 0.26834708900289983, + "grad_norm": 0.17905832827091217, + "learning_rate": 1.9665574151261418e-05, + "loss": 0.5423, + "step": 1203 + }, + { + "epoch": 0.2685701539147892, + "grad_norm": 0.16681291162967682, + "learning_rate": 1.966497034593805e-05, + "loss": 0.5041, + "step": 1204 + }, + { + "epoch": 0.26879321882667856, + "grad_norm": 0.17184089124202728, + "learning_rate": 1.9664366005307828e-05, + "loss": 0.5501, + "step": 1205 + }, + { + "epoch": 0.2690162837385679, + "grad_norm": 0.19392690062522888, + "learning_rate": 1.9663761129404228e-05, + "loss": 0.5398, + "step": 1206 + }, + { + "epoch": 0.2692393486504573, + "grad_norm": 0.16529332101345062, + "learning_rate": 1.9663155718260746e-05, + "loss": 0.5086, + "step": 1207 + }, + { + "epoch": 0.2694624135623466, + "grad_norm": 0.1649094521999359, + "learning_rate": 1.966254977191092e-05, + "loss": 0.515, + "step": 1208 + }, + { + "epoch": 0.269685478474236, + "grad_norm": 0.1709623634815216, + "learning_rate": 1.9661943290388302e-05, + "loss": 0.5315, + "step": 1209 + }, + { + "epoch": 0.26990854338612535, + "grad_norm": 0.164725661277771, + "learning_rate": 1.9661336273726496e-05, + "loss": 0.4817, + "step": 1210 + }, + { + "epoch": 0.27013160829801475, + "grad_norm": 0.1715349704027176, + "learning_rate": 1.966072872195911e-05, + "loss": 0.4706, + "step": 1211 + }, + { + "epoch": 0.2703546732099041, + "grad_norm": 0.1747862696647644, + "learning_rate": 1.9660120635119798e-05, + "loss": 0.5055, + "step": 1212 + }, + { + "epoch": 0.2705777381217934, + "grad_norm": 0.17992718517780304, + "learning_rate": 1.9659512013242245e-05, + "loss": 0.4946, + "step": 1213 + }, + { + "epoch": 0.2708008030336828, + "grad_norm": 0.1895056813955307, + "learning_rate": 1.9658902856360153e-05, + "loss": 0.5258, + "step": 1214 + }, + { + "epoch": 0.27102386794557215, + "grad_norm": 0.19161482155323029, + "learning_rate": 1.9658293164507265e-05, + "loss": 0.5206, + "step": 1215 + }, + { + "epoch": 0.27124693285746154, + "grad_norm": 0.1860935389995575, + "learning_rate": 1.965768293771735e-05, + "loss": 0.5484, + "step": 1216 + }, + { + "epoch": 0.2714699977693509, + "grad_norm": 0.17763131856918335, + "learning_rate": 1.9657072176024202e-05, + "loss": 0.5029, + "step": 1217 + }, + { + "epoch": 0.2716930626812402, + "grad_norm": 0.18652020394802094, + "learning_rate": 1.9656460879461652e-05, + "loss": 0.5161, + "step": 1218 + }, + { + "epoch": 0.2719161275931296, + "grad_norm": 0.2644646465778351, + "learning_rate": 1.965584904806356e-05, + "loss": 0.5371, + "step": 1219 + }, + { + "epoch": 0.27213919250501895, + "grad_norm": 0.1769624799489975, + "learning_rate": 1.9655236681863806e-05, + "loss": 0.4892, + "step": 1220 + }, + { + "epoch": 0.27236225741690834, + "grad_norm": 0.1740611046552658, + "learning_rate": 1.9654623780896313e-05, + "loss": 0.5162, + "step": 1221 + }, + { + "epoch": 0.2725853223287977, + "grad_norm": 0.16314013302326202, + "learning_rate": 1.9654010345195026e-05, + "loss": 0.5304, + "step": 1222 + }, + { + "epoch": 0.27280838724068707, + "grad_norm": 0.18545354902744293, + "learning_rate": 1.9653396374793915e-05, + "loss": 0.5181, + "step": 1223 + }, + { + "epoch": 0.2730314521525764, + "grad_norm": 0.17202220857143402, + "learning_rate": 1.9652781869726993e-05, + "loss": 0.5486, + "step": 1224 + }, + { + "epoch": 0.27325451706446574, + "grad_norm": 0.18883365392684937, + "learning_rate": 1.9652166830028295e-05, + "loss": 0.5565, + "step": 1225 + }, + { + "epoch": 0.27347758197635513, + "grad_norm": 0.17693057656288147, + "learning_rate": 1.9651551255731884e-05, + "loss": 0.5275, + "step": 1226 + }, + { + "epoch": 0.27370064688824447, + "grad_norm": 0.17243415117263794, + "learning_rate": 1.9650935146871848e-05, + "loss": 0.5037, + "step": 1227 + }, + { + "epoch": 0.27392371180013386, + "grad_norm": 0.18462933599948883, + "learning_rate": 1.9650318503482323e-05, + "loss": 0.5573, + "step": 1228 + }, + { + "epoch": 0.2741467767120232, + "grad_norm": 0.3310684263706207, + "learning_rate": 1.964970132559745e-05, + "loss": 0.5186, + "step": 1229 + }, + { + "epoch": 0.27436984162391254, + "grad_norm": 0.16663436591625214, + "learning_rate": 1.964908361325142e-05, + "loss": 0.5067, + "step": 1230 + }, + { + "epoch": 0.27459290653580193, + "grad_norm": 0.16989050805568695, + "learning_rate": 1.964846536647845e-05, + "loss": 0.5084, + "step": 1231 + }, + { + "epoch": 0.27481597144769127, + "grad_norm": 0.16925394535064697, + "learning_rate": 1.9647846585312775e-05, + "loss": 0.5348, + "step": 1232 + }, + { + "epoch": 0.27503903635958066, + "grad_norm": 0.176070898771286, + "learning_rate": 1.9647227269788665e-05, + "loss": 0.5249, + "step": 1233 + }, + { + "epoch": 0.27526210127147, + "grad_norm": 0.16656488180160522, + "learning_rate": 1.9646607419940428e-05, + "loss": 0.5408, + "step": 1234 + }, + { + "epoch": 0.27548516618335933, + "grad_norm": 0.16989517211914062, + "learning_rate": 1.964598703580239e-05, + "loss": 0.5033, + "step": 1235 + }, + { + "epoch": 0.2757082310952487, + "grad_norm": 0.18006569147109985, + "learning_rate": 1.9645366117408918e-05, + "loss": 0.5354, + "step": 1236 + }, + { + "epoch": 0.27593129600713806, + "grad_norm": 0.17241814732551575, + "learning_rate": 1.9644744664794394e-05, + "loss": 0.508, + "step": 1237 + }, + { + "epoch": 0.27615436091902745, + "grad_norm": 0.20181423425674438, + "learning_rate": 1.9644122677993246e-05, + "loss": 0.4536, + "step": 1238 + }, + { + "epoch": 0.2763774258309168, + "grad_norm": 0.17486101388931274, + "learning_rate": 1.964350015703992e-05, + "loss": 0.5597, + "step": 1239 + }, + { + "epoch": 0.2766004907428062, + "grad_norm": 0.1774255484342575, + "learning_rate": 1.9642877101968894e-05, + "loss": 0.5511, + "step": 1240 + }, + { + "epoch": 0.2768235556546955, + "grad_norm": 0.3290056586265564, + "learning_rate": 1.964225351281468e-05, + "loss": 0.5278, + "step": 1241 + }, + { + "epoch": 0.27704662056658486, + "grad_norm": 0.20791974663734436, + "learning_rate": 1.9641629389611813e-05, + "loss": 0.5261, + "step": 1242 + }, + { + "epoch": 0.27726968547847425, + "grad_norm": 0.17645849287509918, + "learning_rate": 1.9641004732394862e-05, + "loss": 0.526, + "step": 1243 + }, + { + "epoch": 0.2774927503903636, + "grad_norm": 0.16956675052642822, + "learning_rate": 1.9640379541198425e-05, + "loss": 0.5489, + "step": 1244 + }, + { + "epoch": 0.277715815302253, + "grad_norm": 0.18473365902900696, + "learning_rate": 1.9639753816057128e-05, + "loss": 0.5422, + "step": 1245 + }, + { + "epoch": 0.2779388802141423, + "grad_norm": 0.262015700340271, + "learning_rate": 1.9639127557005627e-05, + "loss": 0.5031, + "step": 1246 + }, + { + "epoch": 0.27816194512603165, + "grad_norm": 0.19820185005664825, + "learning_rate": 1.963850076407861e-05, + "loss": 0.5132, + "step": 1247 + }, + { + "epoch": 0.27838501003792104, + "grad_norm": 0.1682923436164856, + "learning_rate": 1.9637873437310795e-05, + "loss": 0.5214, + "step": 1248 + }, + { + "epoch": 0.2786080749498104, + "grad_norm": 0.17070676386356354, + "learning_rate": 1.9637245576736923e-05, + "loss": 0.5368, + "step": 1249 + }, + { + "epoch": 0.2788311398616998, + "grad_norm": 0.17165376245975494, + "learning_rate": 1.9636617182391768e-05, + "loss": 0.5282, + "step": 1250 + }, + { + "epoch": 0.2790542047735891, + "grad_norm": 0.17272816598415375, + "learning_rate": 1.963598825431014e-05, + "loss": 0.5657, + "step": 1251 + }, + { + "epoch": 0.2792772696854785, + "grad_norm": 0.1659235805273056, + "learning_rate": 1.9635358792526865e-05, + "loss": 0.5181, + "step": 1252 + }, + { + "epoch": 0.27950033459736784, + "grad_norm": 0.1700238287448883, + "learning_rate": 1.9634728797076818e-05, + "loss": 0.5194, + "step": 1253 + }, + { + "epoch": 0.2797233995092572, + "grad_norm": 0.1710444688796997, + "learning_rate": 1.9634098267994882e-05, + "loss": 0.5405, + "step": 1254 + }, + { + "epoch": 0.27994646442114657, + "grad_norm": 0.1748325079679489, + "learning_rate": 1.9633467205315983e-05, + "loss": 0.5295, + "step": 1255 + }, + { + "epoch": 0.2801695293330359, + "grad_norm": 0.1600925624370575, + "learning_rate": 1.9632835609075072e-05, + "loss": 0.5448, + "step": 1256 + }, + { + "epoch": 0.2803925942449253, + "grad_norm": 0.1937401294708252, + "learning_rate": 1.9632203479307132e-05, + "loss": 0.5145, + "step": 1257 + }, + { + "epoch": 0.28061565915681463, + "grad_norm": 0.1888452023267746, + "learning_rate": 1.9631570816047176e-05, + "loss": 0.5376, + "step": 1258 + }, + { + "epoch": 0.28083872406870397, + "grad_norm": 0.16786231100559235, + "learning_rate": 1.963093761933024e-05, + "loss": 0.5336, + "step": 1259 + }, + { + "epoch": 0.28106178898059336, + "grad_norm": 0.16962790489196777, + "learning_rate": 1.9630303889191406e-05, + "loss": 0.5161, + "step": 1260 + }, + { + "epoch": 0.2812848538924827, + "grad_norm": 0.17393000423908234, + "learning_rate": 1.9629669625665757e-05, + "loss": 0.5098, + "step": 1261 + }, + { + "epoch": 0.2815079188043721, + "grad_norm": 0.18124370276927948, + "learning_rate": 1.9629034828788435e-05, + "loss": 0.5155, + "step": 1262 + }, + { + "epoch": 0.28173098371626143, + "grad_norm": 0.16687680780887604, + "learning_rate": 1.962839949859459e-05, + "loss": 0.5189, + "step": 1263 + }, + { + "epoch": 0.28195404862815077, + "grad_norm": 0.1622602790594101, + "learning_rate": 1.9627763635119423e-05, + "loss": 0.4974, + "step": 1264 + }, + { + "epoch": 0.28217711354004016, + "grad_norm": 0.16674260795116425, + "learning_rate": 1.9627127238398142e-05, + "loss": 0.4923, + "step": 1265 + }, + { + "epoch": 0.2824001784519295, + "grad_norm": 0.1815263032913208, + "learning_rate": 1.9626490308465996e-05, + "loss": 0.5048, + "step": 1266 + }, + { + "epoch": 0.2826232433638189, + "grad_norm": 0.17753565311431885, + "learning_rate": 1.9625852845358265e-05, + "loss": 0.5326, + "step": 1267 + }, + { + "epoch": 0.2828463082757082, + "grad_norm": 0.1820516288280487, + "learning_rate": 1.9625214849110253e-05, + "loss": 0.5289, + "step": 1268 + }, + { + "epoch": 0.2830693731875976, + "grad_norm": 0.16515739262104034, + "learning_rate": 1.9624576319757302e-05, + "loss": 0.5159, + "step": 1269 + }, + { + "epoch": 0.28329243809948695, + "grad_norm": 0.17564083635807037, + "learning_rate": 1.9623937257334767e-05, + "loss": 0.5052, + "step": 1270 + }, + { + "epoch": 0.2835155030113763, + "grad_norm": 0.18046805262565613, + "learning_rate": 1.9623297661878054e-05, + "loss": 0.5349, + "step": 1271 + }, + { + "epoch": 0.2837385679232657, + "grad_norm": 0.16942784190177917, + "learning_rate": 1.9622657533422583e-05, + "loss": 0.4924, + "step": 1272 + }, + { + "epoch": 0.283961632835155, + "grad_norm": 0.17093469202518463, + "learning_rate": 1.9622016872003807e-05, + "loss": 0.5261, + "step": 1273 + }, + { + "epoch": 0.2841846977470444, + "grad_norm": 0.17606058716773987, + "learning_rate": 1.9621375677657217e-05, + "loss": 0.51, + "step": 1274 + }, + { + "epoch": 0.28440776265893375, + "grad_norm": 0.16897493600845337, + "learning_rate": 1.9620733950418316e-05, + "loss": 0.5058, + "step": 1275 + }, + { + "epoch": 0.2846308275708231, + "grad_norm": 0.18536695837974548, + "learning_rate": 1.9620091690322654e-05, + "loss": 0.512, + "step": 1276 + }, + { + "epoch": 0.2848538924827125, + "grad_norm": 0.17734606564044952, + "learning_rate": 1.96194488974058e-05, + "loss": 0.5311, + "step": 1277 + }, + { + "epoch": 0.2850769573946018, + "grad_norm": 0.18116462230682373, + "learning_rate": 1.9618805571703356e-05, + "loss": 0.5153, + "step": 1278 + }, + { + "epoch": 0.2853000223064912, + "grad_norm": 0.17743848264217377, + "learning_rate": 1.961816171325096e-05, + "loss": 0.5238, + "step": 1279 + }, + { + "epoch": 0.28552308721838054, + "grad_norm": 0.16778843104839325, + "learning_rate": 1.961751732208426e-05, + "loss": 0.505, + "step": 1280 + }, + { + "epoch": 0.2857461521302699, + "grad_norm": 0.17177841067314148, + "learning_rate": 1.961687239823896e-05, + "loss": 0.5075, + "step": 1281 + }, + { + "epoch": 0.2859692170421593, + "grad_norm": 0.18460653722286224, + "learning_rate": 1.9616226941750775e-05, + "loss": 0.5109, + "step": 1282 + }, + { + "epoch": 0.2861922819540486, + "grad_norm": 0.16801568865776062, + "learning_rate": 1.961558095265545e-05, + "loss": 0.552, + "step": 1283 + }, + { + "epoch": 0.286415346865938, + "grad_norm": 0.19550864398479462, + "learning_rate": 1.961493443098877e-05, + "loss": 0.5138, + "step": 1284 + }, + { + "epoch": 0.28663841177782734, + "grad_norm": 0.17915892601013184, + "learning_rate": 1.9614287376786537e-05, + "loss": 0.5267, + "step": 1285 + }, + { + "epoch": 0.28686147668971673, + "grad_norm": 0.17141470313072205, + "learning_rate": 1.9613639790084596e-05, + "loss": 0.5398, + "step": 1286 + }, + { + "epoch": 0.28708454160160607, + "grad_norm": 0.16843633353710175, + "learning_rate": 1.9612991670918808e-05, + "loss": 0.5224, + "step": 1287 + }, + { + "epoch": 0.2873076065134954, + "grad_norm": 0.17116384208202362, + "learning_rate": 1.9612343019325077e-05, + "loss": 0.506, + "step": 1288 + }, + { + "epoch": 0.2875306714253848, + "grad_norm": 0.19337520003318787, + "learning_rate": 1.9611693835339323e-05, + "loss": 0.5417, + "step": 1289 + }, + { + "epoch": 0.28775373633727414, + "grad_norm": 0.1831638365983963, + "learning_rate": 1.9611044118997507e-05, + "loss": 0.5487, + "step": 1290 + }, + { + "epoch": 0.2879768012491635, + "grad_norm": 0.1741098165512085, + "learning_rate": 1.961039387033561e-05, + "loss": 0.5098, + "step": 1291 + }, + { + "epoch": 0.28819986616105286, + "grad_norm": 0.16796158254146576, + "learning_rate": 1.960974308938965e-05, + "loss": 0.5105, + "step": 1292 + }, + { + "epoch": 0.2884229310729422, + "grad_norm": 0.16848038136959076, + "learning_rate": 1.9609091776195667e-05, + "loss": 0.5106, + "step": 1293 + }, + { + "epoch": 0.2886459959848316, + "grad_norm": 0.1563376486301422, + "learning_rate": 1.960843993078974e-05, + "loss": 0.4927, + "step": 1294 + }, + { + "epoch": 0.28886906089672093, + "grad_norm": 0.16741539537906647, + "learning_rate": 1.9607787553207972e-05, + "loss": 0.5241, + "step": 1295 + }, + { + "epoch": 0.2890921258086103, + "grad_norm": 0.16913025081157684, + "learning_rate": 1.9607134643486492e-05, + "loss": 0.5183, + "step": 1296 + }, + { + "epoch": 0.28931519072049966, + "grad_norm": 0.17226742208003998, + "learning_rate": 1.9606481201661466e-05, + "loss": 0.5105, + "step": 1297 + }, + { + "epoch": 0.28953825563238905, + "grad_norm": 0.17816194891929626, + "learning_rate": 1.960582722776908e-05, + "loss": 0.5352, + "step": 1298 + }, + { + "epoch": 0.2897613205442784, + "grad_norm": 0.16423411667346954, + "learning_rate": 1.9605172721845564e-05, + "loss": 0.5303, + "step": 1299 + }, + { + "epoch": 0.2899843854561677, + "grad_norm": 0.22330845892429352, + "learning_rate": 1.9604517683927156e-05, + "loss": 0.5115, + "step": 1300 + }, + { + "epoch": 0.2902074503680571, + "grad_norm": 0.16385138034820557, + "learning_rate": 1.960386211405015e-05, + "loss": 0.5375, + "step": 1301 + }, + { + "epoch": 0.29043051527994646, + "grad_norm": 0.17269232869148254, + "learning_rate": 1.960320601225085e-05, + "loss": 0.5291, + "step": 1302 + }, + { + "epoch": 0.29065358019183585, + "grad_norm": 0.1625886857509613, + "learning_rate": 1.9602549378565592e-05, + "loss": 0.4982, + "step": 1303 + }, + { + "epoch": 0.2908766451037252, + "grad_norm": 0.17353162169456482, + "learning_rate": 1.9601892213030746e-05, + "loss": 0.5109, + "step": 1304 + }, + { + "epoch": 0.2910997100156145, + "grad_norm": 0.16863249242305756, + "learning_rate": 1.9601234515682712e-05, + "loss": 0.5409, + "step": 1305 + }, + { + "epoch": 0.2913227749275039, + "grad_norm": 0.17428503930568695, + "learning_rate": 1.960057628655792e-05, + "loss": 0.5155, + "step": 1306 + }, + { + "epoch": 0.29154583983939325, + "grad_norm": 0.1825830489397049, + "learning_rate": 1.9599917525692816e-05, + "loss": 0.5355, + "step": 1307 + }, + { + "epoch": 0.29176890475128264, + "grad_norm": 0.8213825821876526, + "learning_rate": 1.95992582331239e-05, + "loss": 0.5366, + "step": 1308 + }, + { + "epoch": 0.291991969663172, + "grad_norm": 0.16370530426502228, + "learning_rate": 1.959859840888768e-05, + "loss": 0.5071, + "step": 1309 + }, + { + "epoch": 0.2922150345750613, + "grad_norm": 0.17610913515090942, + "learning_rate": 1.95979380530207e-05, + "loss": 0.5346, + "step": 1310 + }, + { + "epoch": 0.2924380994869507, + "grad_norm": 0.16303309798240662, + "learning_rate": 1.959727716555954e-05, + "loss": 0.535, + "step": 1311 + }, + { + "epoch": 0.29266116439884005, + "grad_norm": 0.17250092327594757, + "learning_rate": 1.9596615746540798e-05, + "loss": 0.5322, + "step": 1312 + }, + { + "epoch": 0.29288422931072944, + "grad_norm": 0.18915972113609314, + "learning_rate": 1.959595379600111e-05, + "loss": 0.5293, + "step": 1313 + }, + { + "epoch": 0.2931072942226188, + "grad_norm": 0.17842963337898254, + "learning_rate": 1.9595291313977144e-05, + "loss": 0.5197, + "step": 1314 + }, + { + "epoch": 0.29333035913450817, + "grad_norm": 0.1693105846643448, + "learning_rate": 1.959462830050559e-05, + "loss": 0.5044, + "step": 1315 + }, + { + "epoch": 0.2935534240463975, + "grad_norm": 0.23653189837932587, + "learning_rate": 1.959396475562316e-05, + "loss": 0.5286, + "step": 1316 + }, + { + "epoch": 0.29377648895828684, + "grad_norm": 0.1704518347978592, + "learning_rate": 1.9593300679366622e-05, + "loss": 0.5288, + "step": 1317 + }, + { + "epoch": 0.29399955387017623, + "grad_norm": 0.169080451130867, + "learning_rate": 1.9592636071772745e-05, + "loss": 0.5336, + "step": 1318 + }, + { + "epoch": 0.29422261878206557, + "grad_norm": 0.2256423681974411, + "learning_rate": 1.959197093287834e-05, + "loss": 0.5253, + "step": 1319 + }, + { + "epoch": 0.29444568369395496, + "grad_norm": 0.1699950098991394, + "learning_rate": 1.9591305262720252e-05, + "loss": 0.5308, + "step": 1320 + }, + { + "epoch": 0.2946687486058443, + "grad_norm": 0.17314709722995758, + "learning_rate": 1.9590639061335345e-05, + "loss": 0.5477, + "step": 1321 + }, + { + "epoch": 0.29489181351773364, + "grad_norm": 0.16141551733016968, + "learning_rate": 1.958997232876052e-05, + "loss": 0.5207, + "step": 1322 + }, + { + "epoch": 0.29511487842962303, + "grad_norm": 0.15994872152805328, + "learning_rate": 1.9589305065032705e-05, + "loss": 0.5142, + "step": 1323 + }, + { + "epoch": 0.29533794334151237, + "grad_norm": 0.16060283780097961, + "learning_rate": 1.9588637270188852e-05, + "loss": 0.4967, + "step": 1324 + }, + { + "epoch": 0.29556100825340176, + "grad_norm": 0.17949143052101135, + "learning_rate": 1.9587968944265955e-05, + "loss": 0.5484, + "step": 1325 + }, + { + "epoch": 0.2957840731652911, + "grad_norm": 0.17476752400398254, + "learning_rate": 1.958730008730103e-05, + "loss": 0.5466, + "step": 1326 + }, + { + "epoch": 0.2960071380771805, + "grad_norm": 0.205107182264328, + "learning_rate": 1.9586630699331115e-05, + "loss": 0.5147, + "step": 1327 + }, + { + "epoch": 0.2962302029890698, + "grad_norm": 0.17714425921440125, + "learning_rate": 1.9585960780393293e-05, + "loss": 0.5142, + "step": 1328 + }, + { + "epoch": 0.29645326790095916, + "grad_norm": 0.17877231538295746, + "learning_rate": 1.9585290330524663e-05, + "loss": 0.5073, + "step": 1329 + }, + { + "epoch": 0.29667633281284855, + "grad_norm": 0.16752929985523224, + "learning_rate": 1.958461934976236e-05, + "loss": 0.5381, + "step": 1330 + }, + { + "epoch": 0.2968993977247379, + "grad_norm": 0.17550905048847198, + "learning_rate": 1.9583947838143553e-05, + "loss": 0.5185, + "step": 1331 + }, + { + "epoch": 0.2971224626366273, + "grad_norm": 0.17885231971740723, + "learning_rate": 1.958327579570542e-05, + "loss": 0.5279, + "step": 1332 + }, + { + "epoch": 0.2973455275485166, + "grad_norm": 0.16726696491241455, + "learning_rate": 1.95826032224852e-05, + "loss": 0.5253, + "step": 1333 + }, + { + "epoch": 0.29756859246040596, + "grad_norm": 0.16935260593891144, + "learning_rate": 1.9581930118520135e-05, + "loss": 0.5311, + "step": 1334 + }, + { + "epoch": 0.29779165737229535, + "grad_norm": 0.22187648713588715, + "learning_rate": 1.9581256483847505e-05, + "loss": 0.5124, + "step": 1335 + }, + { + "epoch": 0.2980147222841847, + "grad_norm": 0.17254653573036194, + "learning_rate": 1.9580582318504623e-05, + "loss": 0.5113, + "step": 1336 + }, + { + "epoch": 0.2982377871960741, + "grad_norm": 0.1686115860939026, + "learning_rate": 1.9579907622528827e-05, + "loss": 0.516, + "step": 1337 + }, + { + "epoch": 0.2984608521079634, + "grad_norm": 0.17233097553253174, + "learning_rate": 1.9579232395957492e-05, + "loss": 0.5135, + "step": 1338 + }, + { + "epoch": 0.29868391701985275, + "grad_norm": 0.1767144799232483, + "learning_rate": 1.957855663882801e-05, + "loss": 0.5113, + "step": 1339 + }, + { + "epoch": 0.29890698193174214, + "grad_norm": 0.16990788280963898, + "learning_rate": 1.9577880351177803e-05, + "loss": 0.5213, + "step": 1340 + }, + { + "epoch": 0.2991300468436315, + "grad_norm": 0.16583338379859924, + "learning_rate": 1.957720353304434e-05, + "loss": 0.5059, + "step": 1341 + }, + { + "epoch": 0.2993531117555209, + "grad_norm": 0.17070691287517548, + "learning_rate": 1.95765261844651e-05, + "loss": 0.5154, + "step": 1342 + }, + { + "epoch": 0.2995761766674102, + "grad_norm": 0.18274636566638947, + "learning_rate": 1.9575848305477606e-05, + "loss": 0.5367, + "step": 1343 + }, + { + "epoch": 0.2997992415792996, + "grad_norm": 0.16276118159294128, + "learning_rate": 1.957516989611939e-05, + "loss": 0.5156, + "step": 1344 + }, + { + "epoch": 0.30002230649118894, + "grad_norm": 0.16200782358646393, + "learning_rate": 1.9574490956428045e-05, + "loss": 0.5201, + "step": 1345 + }, + { + "epoch": 0.3002453714030783, + "grad_norm": 0.18693317472934723, + "learning_rate": 1.9573811486441158e-05, + "loss": 0.5181, + "step": 1346 + }, + { + "epoch": 0.30046843631496767, + "grad_norm": 0.17908775806427002, + "learning_rate": 1.9573131486196372e-05, + "loss": 0.5012, + "step": 1347 + }, + { + "epoch": 0.300691501226857, + "grad_norm": 0.16340862214565277, + "learning_rate": 1.9572450955731346e-05, + "loss": 0.4951, + "step": 1348 + }, + { + "epoch": 0.3009145661387464, + "grad_norm": 0.16997461020946503, + "learning_rate": 1.957176989508377e-05, + "loss": 0.5248, + "step": 1349 + }, + { + "epoch": 0.30113763105063573, + "grad_norm": 0.1896587759256363, + "learning_rate": 1.9571088304291376e-05, + "loss": 0.5129, + "step": 1350 + }, + { + "epoch": 0.30136069596252507, + "grad_norm": 0.1624515950679779, + "learning_rate": 1.95704061833919e-05, + "loss": 0.4872, + "step": 1351 + }, + { + "epoch": 0.30158376087441446, + "grad_norm": 0.16828244924545288, + "learning_rate": 1.956972353242313e-05, + "loss": 0.5291, + "step": 1352 + }, + { + "epoch": 0.3018068257863038, + "grad_norm": 0.17992524802684784, + "learning_rate": 1.9569040351422882e-05, + "loss": 0.5396, + "step": 1353 + }, + { + "epoch": 0.3020298906981932, + "grad_norm": 0.17174522578716278, + "learning_rate": 1.956835664042898e-05, + "loss": 0.523, + "step": 1354 + }, + { + "epoch": 0.30225295561008253, + "grad_norm": 0.18021540343761444, + "learning_rate": 1.9567672399479304e-05, + "loss": 0.5091, + "step": 1355 + }, + { + "epoch": 0.30247602052197187, + "grad_norm": 0.1889013797044754, + "learning_rate": 1.9566987628611748e-05, + "loss": 0.521, + "step": 1356 + }, + { + "epoch": 0.30269908543386126, + "grad_norm": 0.1710296869277954, + "learning_rate": 1.9566302327864233e-05, + "loss": 0.51, + "step": 1357 + }, + { + "epoch": 0.3029221503457506, + "grad_norm": 0.16925543546676636, + "learning_rate": 1.9565616497274725e-05, + "loss": 0.5194, + "step": 1358 + }, + { + "epoch": 0.30314521525764, + "grad_norm": 0.16558070480823517, + "learning_rate": 1.95649301368812e-05, + "loss": 0.5284, + "step": 1359 + }, + { + "epoch": 0.3033682801695293, + "grad_norm": 0.1646072119474411, + "learning_rate": 1.9564243246721686e-05, + "loss": 0.53, + "step": 1360 + }, + { + "epoch": 0.3035913450814187, + "grad_norm": 0.17670351266860962, + "learning_rate": 1.9563555826834214e-05, + "loss": 0.527, + "step": 1361 + }, + { + "epoch": 0.30381440999330805, + "grad_norm": 0.17414048314094543, + "learning_rate": 1.9562867877256867e-05, + "loss": 0.5318, + "step": 1362 + }, + { + "epoch": 0.3040374749051974, + "grad_norm": 0.18004649877548218, + "learning_rate": 1.956217939802774e-05, + "loss": 0.5149, + "step": 1363 + }, + { + "epoch": 0.3042605398170868, + "grad_norm": 0.16560295224189758, + "learning_rate": 1.9561490389184973e-05, + "loss": 0.5093, + "step": 1364 + }, + { + "epoch": 0.3044836047289761, + "grad_norm": 0.1967393010854721, + "learning_rate": 1.956080085076672e-05, + "loss": 0.499, + "step": 1365 + }, + { + "epoch": 0.3047066696408655, + "grad_norm": 0.17638468742370605, + "learning_rate": 1.956011078281118e-05, + "loss": 0.5287, + "step": 1366 + }, + { + "epoch": 0.30492973455275485, + "grad_norm": 0.18389096856117249, + "learning_rate": 1.955942018535657e-05, + "loss": 0.5263, + "step": 1367 + }, + { + "epoch": 0.3051527994646442, + "grad_norm": 0.16795802116394043, + "learning_rate": 1.9558729058441135e-05, + "loss": 0.4909, + "step": 1368 + }, + { + "epoch": 0.3053758643765336, + "grad_norm": 0.1886921525001526, + "learning_rate": 1.955803740210316e-05, + "loss": 0.4929, + "step": 1369 + }, + { + "epoch": 0.3055989292884229, + "grad_norm": 0.15743811428546906, + "learning_rate": 1.9557345216380953e-05, + "loss": 0.5252, + "step": 1370 + }, + { + "epoch": 0.3058219942003123, + "grad_norm": 0.18187056481838226, + "learning_rate": 1.955665250131285e-05, + "loss": 0.5307, + "step": 1371 + }, + { + "epoch": 0.30604505911220165, + "grad_norm": 0.17258834838867188, + "learning_rate": 1.9555959256937214e-05, + "loss": 0.5007, + "step": 1372 + }, + { + "epoch": 0.30626812402409104, + "grad_norm": 0.16746965050697327, + "learning_rate": 1.9555265483292446e-05, + "loss": 0.5174, + "step": 1373 + }, + { + "epoch": 0.3064911889359804, + "grad_norm": 0.191069096326828, + "learning_rate": 1.955457118041697e-05, + "loss": 0.5462, + "step": 1374 + }, + { + "epoch": 0.3067142538478697, + "grad_norm": 0.17369060218334198, + "learning_rate": 1.9553876348349242e-05, + "loss": 0.5175, + "step": 1375 + }, + { + "epoch": 0.3069373187597591, + "grad_norm": 0.16620078682899475, + "learning_rate": 1.9553180987127748e-05, + "loss": 0.5298, + "step": 1376 + }, + { + "epoch": 0.30716038367164844, + "grad_norm": 0.22137194871902466, + "learning_rate": 1.9552485096790996e-05, + "loss": 0.5135, + "step": 1377 + }, + { + "epoch": 0.30738344858353783, + "grad_norm": 0.20380550622940063, + "learning_rate": 1.9551788677377535e-05, + "loss": 0.5304, + "step": 1378 + }, + { + "epoch": 0.30760651349542717, + "grad_norm": 0.16844195127487183, + "learning_rate": 1.955109172892593e-05, + "loss": 0.55, + "step": 1379 + }, + { + "epoch": 0.3078295784073165, + "grad_norm": 0.1768990010023117, + "learning_rate": 1.955039425147479e-05, + "loss": 0.5254, + "step": 1380 + }, + { + "epoch": 0.3080526433192059, + "grad_norm": 0.16573172807693481, + "learning_rate": 1.954969624506274e-05, + "loss": 0.5195, + "step": 1381 + }, + { + "epoch": 0.30827570823109524, + "grad_norm": 0.16543026268482208, + "learning_rate": 1.9548997709728443e-05, + "loss": 0.5243, + "step": 1382 + }, + { + "epoch": 0.30849877314298463, + "grad_norm": 0.17528237402439117, + "learning_rate": 1.9548298645510587e-05, + "loss": 0.5133, + "step": 1383 + }, + { + "epoch": 0.30872183805487396, + "grad_norm": 0.16765229403972626, + "learning_rate": 1.954759905244789e-05, + "loss": 0.5185, + "step": 1384 + }, + { + "epoch": 0.3089449029667633, + "grad_norm": 0.17179544270038605, + "learning_rate": 1.9546898930579102e-05, + "loss": 0.5156, + "step": 1385 + }, + { + "epoch": 0.3091679678786527, + "grad_norm": 0.17194928228855133, + "learning_rate": 1.9546198279942997e-05, + "loss": 0.5134, + "step": 1386 + }, + { + "epoch": 0.30939103279054203, + "grad_norm": 0.16621114313602448, + "learning_rate": 1.9545497100578382e-05, + "loss": 0.5101, + "step": 1387 + }, + { + "epoch": 0.3096140977024314, + "grad_norm": 0.16408509016036987, + "learning_rate": 1.9544795392524096e-05, + "loss": 0.515, + "step": 1388 + }, + { + "epoch": 0.30983716261432076, + "grad_norm": 0.16583041846752167, + "learning_rate": 1.9544093155819004e-05, + "loss": 0.4974, + "step": 1389 + }, + { + "epoch": 0.31006022752621015, + "grad_norm": 0.16977885365486145, + "learning_rate": 1.9543390390502e-05, + "loss": 0.5229, + "step": 1390 + }, + { + "epoch": 0.3102832924380995, + "grad_norm": 0.1626800149679184, + "learning_rate": 1.9542687096611998e-05, + "loss": 0.5166, + "step": 1391 + }, + { + "epoch": 0.3105063573499888, + "grad_norm": 0.17305392026901245, + "learning_rate": 1.9541983274187964e-05, + "loss": 0.516, + "step": 1392 + }, + { + "epoch": 0.3107294222618782, + "grad_norm": 0.18632498383522034, + "learning_rate": 1.9541278923268872e-05, + "loss": 0.4985, + "step": 1393 + }, + { + "epoch": 0.31095248717376756, + "grad_norm": 0.18472731113433838, + "learning_rate": 1.9540574043893738e-05, + "loss": 0.5304, + "step": 1394 + }, + { + "epoch": 0.31117555208565695, + "grad_norm": 0.1734933704137802, + "learning_rate": 1.9539868636101602e-05, + "loss": 0.5523, + "step": 1395 + }, + { + "epoch": 0.3113986169975463, + "grad_norm": 0.17965291440486908, + "learning_rate": 1.9539162699931534e-05, + "loss": 0.532, + "step": 1396 + }, + { + "epoch": 0.3116216819094356, + "grad_norm": 0.1630621999502182, + "learning_rate": 1.9538456235422625e-05, + "loss": 0.5351, + "step": 1397 + }, + { + "epoch": 0.311844746821325, + "grad_norm": 0.17468804121017456, + "learning_rate": 1.9537749242614016e-05, + "loss": 0.5144, + "step": 1398 + }, + { + "epoch": 0.31206781173321435, + "grad_norm": 0.17356379330158234, + "learning_rate": 1.9537041721544862e-05, + "loss": 0.5124, + "step": 1399 + }, + { + "epoch": 0.31229087664510374, + "grad_norm": 0.1623128354549408, + "learning_rate": 1.953633367225434e-05, + "loss": 0.5256, + "step": 1400 + }, + { + "epoch": 0.3125139415569931, + "grad_norm": 0.17314434051513672, + "learning_rate": 1.9535625094781677e-05, + "loss": 0.5231, + "step": 1401 + }, + { + "epoch": 0.3127370064688825, + "grad_norm": 0.17369012534618378, + "learning_rate": 1.9534915989166115e-05, + "loss": 0.5266, + "step": 1402 + }, + { + "epoch": 0.3129600713807718, + "grad_norm": 0.16464915871620178, + "learning_rate": 1.9534206355446927e-05, + "loss": 0.4962, + "step": 1403 + }, + { + "epoch": 0.31318313629266115, + "grad_norm": 0.16123609244823456, + "learning_rate": 1.953349619366342e-05, + "loss": 0.4932, + "step": 1404 + }, + { + "epoch": 0.31340620120455054, + "grad_norm": 0.16396626830101013, + "learning_rate": 1.9532785503854926e-05, + "loss": 0.5103, + "step": 1405 + }, + { + "epoch": 0.3136292661164399, + "grad_norm": 0.1750822216272354, + "learning_rate": 1.9532074286060805e-05, + "loss": 0.5108, + "step": 1406 + }, + { + "epoch": 0.31385233102832927, + "grad_norm": 0.1627611666917801, + "learning_rate": 1.953136254032045e-05, + "loss": 0.4933, + "step": 1407 + }, + { + "epoch": 0.3140753959402186, + "grad_norm": 0.17482301592826843, + "learning_rate": 1.9530650266673286e-05, + "loss": 0.5179, + "step": 1408 + }, + { + "epoch": 0.31429846085210794, + "grad_norm": 0.17025405168533325, + "learning_rate": 1.952993746515876e-05, + "loss": 0.4968, + "step": 1409 + }, + { + "epoch": 0.31452152576399733, + "grad_norm": 0.17483878135681152, + "learning_rate": 1.9529224135816348e-05, + "loss": 0.5124, + "step": 1410 + }, + { + "epoch": 0.31474459067588667, + "grad_norm": 0.1743018627166748, + "learning_rate": 1.9528510278685568e-05, + "loss": 0.5207, + "step": 1411 + }, + { + "epoch": 0.31496765558777606, + "grad_norm": 0.16575628519058228, + "learning_rate": 1.9527795893805947e-05, + "loss": 0.4846, + "step": 1412 + }, + { + "epoch": 0.3151907204996654, + "grad_norm": 0.16664400696754456, + "learning_rate": 1.952708098121706e-05, + "loss": 0.5097, + "step": 1413 + }, + { + "epoch": 0.31541378541155474, + "grad_norm": 0.17265918850898743, + "learning_rate": 1.9526365540958497e-05, + "loss": 0.5002, + "step": 1414 + }, + { + "epoch": 0.31563685032344413, + "grad_norm": 0.16979828476905823, + "learning_rate": 1.952564957306989e-05, + "loss": 0.5172, + "step": 1415 + }, + { + "epoch": 0.31585991523533347, + "grad_norm": 0.17120909690856934, + "learning_rate": 1.952493307759089e-05, + "loss": 0.5204, + "step": 1416 + }, + { + "epoch": 0.31608298014722286, + "grad_norm": 0.15589144825935364, + "learning_rate": 1.9524216054561186e-05, + "loss": 0.5001, + "step": 1417 + }, + { + "epoch": 0.3163060450591122, + "grad_norm": 0.1638038605451584, + "learning_rate": 1.9523498504020486e-05, + "loss": 0.495, + "step": 1418 + }, + { + "epoch": 0.3165291099710016, + "grad_norm": 0.2510809600353241, + "learning_rate": 1.952278042600853e-05, + "loss": 0.5177, + "step": 1419 + }, + { + "epoch": 0.3167521748828909, + "grad_norm": 0.1806640475988388, + "learning_rate": 1.9522061820565093e-05, + "loss": 0.5187, + "step": 1420 + }, + { + "epoch": 0.31697523979478026, + "grad_norm": 0.16829638183116913, + "learning_rate": 1.9521342687729977e-05, + "loss": 0.5275, + "step": 1421 + }, + { + "epoch": 0.31719830470666965, + "grad_norm": 0.18520011007785797, + "learning_rate": 1.9520623027543015e-05, + "loss": 0.5654, + "step": 1422 + }, + { + "epoch": 0.317421369618559, + "grad_norm": 0.17533186078071594, + "learning_rate": 1.951990284004406e-05, + "loss": 0.5102, + "step": 1423 + }, + { + "epoch": 0.3176444345304484, + "grad_norm": 0.17360709607601166, + "learning_rate": 1.9519182125273e-05, + "loss": 0.5196, + "step": 1424 + }, + { + "epoch": 0.3178674994423377, + "grad_norm": 0.17495203018188477, + "learning_rate": 1.951846088326976e-05, + "loss": 0.5416, + "step": 1425 + }, + { + "epoch": 0.31809056435422706, + "grad_norm": 0.17361585795879364, + "learning_rate": 1.9517739114074282e-05, + "loss": 0.53, + "step": 1426 + }, + { + "epoch": 0.31831362926611645, + "grad_norm": 0.17399337887763977, + "learning_rate": 1.9517016817726542e-05, + "loss": 0.5207, + "step": 1427 + }, + { + "epoch": 0.3185366941780058, + "grad_norm": 0.16763678193092346, + "learning_rate": 1.9516293994266548e-05, + "loss": 0.5059, + "step": 1428 + }, + { + "epoch": 0.3187597590898952, + "grad_norm": 0.16563831269741058, + "learning_rate": 1.951557064373433e-05, + "loss": 0.5079, + "step": 1429 + }, + { + "epoch": 0.3189828240017845, + "grad_norm": 0.1833125650882721, + "learning_rate": 1.951484676616996e-05, + "loss": 0.5181, + "step": 1430 + }, + { + "epoch": 0.31920588891367385, + "grad_norm": 0.15749512612819672, + "learning_rate": 1.951412236161352e-05, + "loss": 0.4967, + "step": 1431 + }, + { + "epoch": 0.31942895382556324, + "grad_norm": 0.17580753564834595, + "learning_rate": 1.9513397430105137e-05, + "loss": 0.5238, + "step": 1432 + }, + { + "epoch": 0.3196520187374526, + "grad_norm": 0.16896769404411316, + "learning_rate": 1.9512671971684963e-05, + "loss": 0.5218, + "step": 1433 + }, + { + "epoch": 0.319875083649342, + "grad_norm": 0.1761835813522339, + "learning_rate": 1.951194598639318e-05, + "loss": 0.5183, + "step": 1434 + }, + { + "epoch": 0.3200981485612313, + "grad_norm": 0.1777181178331375, + "learning_rate": 1.9511219474269992e-05, + "loss": 0.5283, + "step": 1435 + }, + { + "epoch": 0.3203212134731207, + "grad_norm": 0.17135201394557953, + "learning_rate": 1.9510492435355647e-05, + "loss": 0.5183, + "step": 1436 + }, + { + "epoch": 0.32054427838501004, + "grad_norm": 0.18420204520225525, + "learning_rate": 1.9509764869690407e-05, + "loss": 0.5239, + "step": 1437 + }, + { + "epoch": 0.3207673432968994, + "grad_norm": 0.17093034088611603, + "learning_rate": 1.9509036777314568e-05, + "loss": 0.508, + "step": 1438 + }, + { + "epoch": 0.32099040820878877, + "grad_norm": 0.18275004625320435, + "learning_rate": 1.9508308158268458e-05, + "loss": 0.5257, + "step": 1439 + }, + { + "epoch": 0.3212134731206781, + "grad_norm": 0.19184084236621857, + "learning_rate": 1.950757901259243e-05, + "loss": 0.5138, + "step": 1440 + }, + { + "epoch": 0.3214365380325675, + "grad_norm": 0.19820280373096466, + "learning_rate": 1.9506849340326876e-05, + "loss": 0.5406, + "step": 1441 + }, + { + "epoch": 0.32165960294445683, + "grad_norm": 0.1850280463695526, + "learning_rate": 1.9506119141512204e-05, + "loss": 0.5009, + "step": 1442 + }, + { + "epoch": 0.32188266785634617, + "grad_norm": 0.17693570256233215, + "learning_rate": 1.9505388416188854e-05, + "loss": 0.5502, + "step": 1443 + }, + { + "epoch": 0.32210573276823556, + "grad_norm": 0.16360151767730713, + "learning_rate": 1.9504657164397307e-05, + "loss": 0.5215, + "step": 1444 + }, + { + "epoch": 0.3223287976801249, + "grad_norm": 0.1807732880115509, + "learning_rate": 1.950392538617806e-05, + "loss": 0.5024, + "step": 1445 + }, + { + "epoch": 0.3225518625920143, + "grad_norm": 0.17271289229393005, + "learning_rate": 1.950319308157164e-05, + "loss": 0.4997, + "step": 1446 + }, + { + "epoch": 0.32277492750390363, + "grad_norm": 0.16827355325222015, + "learning_rate": 1.950246025061861e-05, + "loss": 0.5178, + "step": 1447 + }, + { + "epoch": 0.322997992415793, + "grad_norm": 0.17736375331878662, + "learning_rate": 1.950172689335956e-05, + "loss": 0.494, + "step": 1448 + }, + { + "epoch": 0.32322105732768236, + "grad_norm": 0.16899362206459045, + "learning_rate": 1.9500993009835106e-05, + "loss": 0.4701, + "step": 1449 + }, + { + "epoch": 0.3234441222395717, + "grad_norm": 0.1732366383075714, + "learning_rate": 1.9500258600085894e-05, + "loss": 0.4774, + "step": 1450 + }, + { + "epoch": 0.3236671871514611, + "grad_norm": 0.16916102170944214, + "learning_rate": 1.9499523664152603e-05, + "loss": 0.5222, + "step": 1451 + }, + { + "epoch": 0.3238902520633504, + "grad_norm": 0.18570592999458313, + "learning_rate": 1.9498788202075936e-05, + "loss": 0.5611, + "step": 1452 + }, + { + "epoch": 0.3241133169752398, + "grad_norm": 0.1690537929534912, + "learning_rate": 1.9498052213896627e-05, + "loss": 0.5055, + "step": 1453 + }, + { + "epoch": 0.32433638188712915, + "grad_norm": 0.17589417099952698, + "learning_rate": 1.9497315699655447e-05, + "loss": 0.539, + "step": 1454 + }, + { + "epoch": 0.3245594467990185, + "grad_norm": 0.1848466843366623, + "learning_rate": 1.949657865939318e-05, + "loss": 0.5337, + "step": 1455 + }, + { + "epoch": 0.3247825117109079, + "grad_norm": 0.17046886682510376, + "learning_rate": 1.949584109315065e-05, + "loss": 0.507, + "step": 1456 + }, + { + "epoch": 0.3250055766227972, + "grad_norm": 0.17708532512187958, + "learning_rate": 1.9495103000968708e-05, + "loss": 0.5377, + "step": 1457 + }, + { + "epoch": 0.3252286415346866, + "grad_norm": 0.18529073894023895, + "learning_rate": 1.9494364382888236e-05, + "loss": 0.5136, + "step": 1458 + }, + { + "epoch": 0.32545170644657595, + "grad_norm": 0.16627921164035797, + "learning_rate": 1.9493625238950143e-05, + "loss": 0.5149, + "step": 1459 + }, + { + "epoch": 0.3256747713584653, + "grad_norm": 0.1823386400938034, + "learning_rate": 1.949288556919537e-05, + "loss": 0.5422, + "step": 1460 + }, + { + "epoch": 0.3258978362703547, + "grad_norm": 0.17173448204994202, + "learning_rate": 1.949214537366488e-05, + "loss": 0.5315, + "step": 1461 + }, + { + "epoch": 0.326120901182244, + "grad_norm": 0.20931746065616608, + "learning_rate": 1.949140465239967e-05, + "loss": 0.525, + "step": 1462 + }, + { + "epoch": 0.3263439660941334, + "grad_norm": 0.3969743549823761, + "learning_rate": 1.9490663405440765e-05, + "loss": 0.5193, + "step": 1463 + }, + { + "epoch": 0.32656703100602275, + "grad_norm": 0.17721976339817047, + "learning_rate": 1.9489921632829227e-05, + "loss": 0.5115, + "step": 1464 + }, + { + "epoch": 0.32679009591791214, + "grad_norm": 0.16409623622894287, + "learning_rate": 1.948917933460613e-05, + "loss": 0.5123, + "step": 1465 + }, + { + "epoch": 0.3270131608298015, + "grad_norm": 0.16853661835193634, + "learning_rate": 1.9488436510812594e-05, + "loss": 0.5269, + "step": 1466 + }, + { + "epoch": 0.3272362257416908, + "grad_norm": 0.1875690519809723, + "learning_rate": 1.948769316148976e-05, + "loss": 0.4727, + "step": 1467 + }, + { + "epoch": 0.3274592906535802, + "grad_norm": 0.1777825504541397, + "learning_rate": 1.9486949286678798e-05, + "loss": 0.5095, + "step": 1468 + }, + { + "epoch": 0.32768235556546954, + "grad_norm": 0.17113567888736725, + "learning_rate": 1.948620488642091e-05, + "loss": 0.5244, + "step": 1469 + }, + { + "epoch": 0.32790542047735893, + "grad_norm": 0.16337715089321136, + "learning_rate": 1.9485459960757325e-05, + "loss": 0.5194, + "step": 1470 + }, + { + "epoch": 0.32812848538924827, + "grad_norm": 0.1793193519115448, + "learning_rate": 1.9484714509729305e-05, + "loss": 0.527, + "step": 1471 + }, + { + "epoch": 0.3283515503011376, + "grad_norm": 0.16597363352775574, + "learning_rate": 1.948396853337813e-05, + "loss": 0.493, + "step": 1472 + }, + { + "epoch": 0.328574615213027, + "grad_norm": 0.16836529970169067, + "learning_rate": 1.9483222031745118e-05, + "loss": 0.4932, + "step": 1473 + }, + { + "epoch": 0.32879768012491634, + "grad_norm": 0.17120935022830963, + "learning_rate": 1.9482475004871622e-05, + "loss": 0.5156, + "step": 1474 + }, + { + "epoch": 0.32902074503680573, + "grad_norm": 0.1692550629377365, + "learning_rate": 1.9481727452799013e-05, + "loss": 0.5089, + "step": 1475 + }, + { + "epoch": 0.32924380994869507, + "grad_norm": 0.16897635161876678, + "learning_rate": 1.9480979375568694e-05, + "loss": 0.5042, + "step": 1476 + }, + { + "epoch": 0.32946687486058446, + "grad_norm": 0.16120664775371552, + "learning_rate": 1.9480230773222102e-05, + "loss": 0.5105, + "step": 1477 + }, + { + "epoch": 0.3296899397724738, + "grad_norm": 0.17156098783016205, + "learning_rate": 1.9479481645800694e-05, + "loss": 0.5217, + "step": 1478 + }, + { + "epoch": 0.32991300468436313, + "grad_norm": 0.17012807726860046, + "learning_rate": 1.9478731993345965e-05, + "loss": 0.4947, + "step": 1479 + }, + { + "epoch": 0.3301360695962525, + "grad_norm": 0.16123747825622559, + "learning_rate": 1.9477981815899435e-05, + "loss": 0.5181, + "step": 1480 + }, + { + "epoch": 0.33035913450814186, + "grad_norm": 0.16029803454875946, + "learning_rate": 1.947723111350265e-05, + "loss": 0.5089, + "step": 1481 + }, + { + "epoch": 0.33058219942003125, + "grad_norm": 0.17824719846248627, + "learning_rate": 1.9476479886197198e-05, + "loss": 0.5245, + "step": 1482 + }, + { + "epoch": 0.3308052643319206, + "grad_norm": 0.1661938726902008, + "learning_rate": 1.9475728134024675e-05, + "loss": 0.4971, + "step": 1483 + }, + { + "epoch": 0.3310283292438099, + "grad_norm": 0.16781049966812134, + "learning_rate": 1.9474975857026727e-05, + "loss": 0.5187, + "step": 1484 + }, + { + "epoch": 0.3312513941556993, + "grad_norm": 0.20398850739002228, + "learning_rate": 1.9474223055245014e-05, + "loss": 0.5087, + "step": 1485 + }, + { + "epoch": 0.33147445906758866, + "grad_norm": 0.19590437412261963, + "learning_rate": 1.9473469728721233e-05, + "loss": 0.5193, + "step": 1486 + }, + { + "epoch": 0.33169752397947805, + "grad_norm": 0.16896989941596985, + "learning_rate": 1.947271587749711e-05, + "loss": 0.5272, + "step": 1487 + }, + { + "epoch": 0.3319205888913674, + "grad_norm": 0.16621822118759155, + "learning_rate": 1.9471961501614395e-05, + "loss": 0.5356, + "step": 1488 + }, + { + "epoch": 0.3321436538032567, + "grad_norm": 0.16570885479450226, + "learning_rate": 1.947120660111487e-05, + "loss": 0.5105, + "step": 1489 + }, + { + "epoch": 0.3323667187151461, + "grad_norm": 0.1804710477590561, + "learning_rate": 1.9470451176040343e-05, + "loss": 0.5037, + "step": 1490 + }, + { + "epoch": 0.33258978362703545, + "grad_norm": 0.16383785009384155, + "learning_rate": 1.9469695226432667e-05, + "loss": 0.4991, + "step": 1491 + }, + { + "epoch": 0.33281284853892484, + "grad_norm": 0.16517230868339539, + "learning_rate": 1.9468938752333698e-05, + "loss": 0.4858, + "step": 1492 + }, + { + "epoch": 0.3330359134508142, + "grad_norm": 0.1879924088716507, + "learning_rate": 1.946818175378534e-05, + "loss": 0.5003, + "step": 1493 + }, + { + "epoch": 0.3332589783627036, + "grad_norm": 0.16963818669319153, + "learning_rate": 1.9467424230829514e-05, + "loss": 0.5081, + "step": 1494 + }, + { + "epoch": 0.3334820432745929, + "grad_norm": 0.1686849743127823, + "learning_rate": 1.946666618350819e-05, + "loss": 0.5384, + "step": 1495 + }, + { + "epoch": 0.33370510818648225, + "grad_norm": 0.18511471152305603, + "learning_rate": 1.946590761186334e-05, + "loss": 0.5387, + "step": 1496 + }, + { + "epoch": 0.33392817309837164, + "grad_norm": 0.15684032440185547, + "learning_rate": 1.9465148515936986e-05, + "loss": 0.5051, + "step": 1497 + }, + { + "epoch": 0.334151238010261, + "grad_norm": 0.18369467556476593, + "learning_rate": 1.9464388895771165e-05, + "loss": 0.4973, + "step": 1498 + }, + { + "epoch": 0.33437430292215037, + "grad_norm": 0.17524507641792297, + "learning_rate": 1.9463628751407957e-05, + "loss": 0.5204, + "step": 1499 + }, + { + "epoch": 0.3345973678340397, + "grad_norm": 0.15974605083465576, + "learning_rate": 1.946286808288946e-05, + "loss": 0.5128, + "step": 1500 + }, + { + "epoch": 0.33482043274592904, + "grad_norm": 0.1720893234014511, + "learning_rate": 1.9462106890257805e-05, + "loss": 0.531, + "step": 1501 + }, + { + "epoch": 0.33504349765781843, + "grad_norm": 0.1723415106534958, + "learning_rate": 1.946134517355515e-05, + "loss": 0.5344, + "step": 1502 + }, + { + "epoch": 0.33526656256970777, + "grad_norm": 0.1691267043352127, + "learning_rate": 1.9460582932823685e-05, + "loss": 0.4693, + "step": 1503 + }, + { + "epoch": 0.33548962748159716, + "grad_norm": 0.18036657571792603, + "learning_rate": 1.945982016810563e-05, + "loss": 0.5175, + "step": 1504 + }, + { + "epoch": 0.3357126923934865, + "grad_norm": 0.16378861665725708, + "learning_rate": 1.9459056879443227e-05, + "loss": 0.4967, + "step": 1505 + }, + { + "epoch": 0.33593575730537584, + "grad_norm": 0.16954581439495087, + "learning_rate": 1.9458293066878754e-05, + "loss": 0.5024, + "step": 1506 + }, + { + "epoch": 0.33615882221726523, + "grad_norm": 0.16542184352874756, + "learning_rate": 1.9457528730454516e-05, + "loss": 0.525, + "step": 1507 + }, + { + "epoch": 0.33638188712915457, + "grad_norm": 0.16997484862804413, + "learning_rate": 1.9456763870212853e-05, + "loss": 0.5177, + "step": 1508 + }, + { + "epoch": 0.33660495204104396, + "grad_norm": 0.1820513904094696, + "learning_rate": 1.945599848619611e-05, + "loss": 0.5173, + "step": 1509 + }, + { + "epoch": 0.3368280169529333, + "grad_norm": 0.1801905781030655, + "learning_rate": 1.94552325784467e-05, + "loss": 0.4951, + "step": 1510 + }, + { + "epoch": 0.3370510818648227, + "grad_norm": 0.16981808841228485, + "learning_rate": 1.9454466147007032e-05, + "loss": 0.5359, + "step": 1511 + }, + { + "epoch": 0.337274146776712, + "grad_norm": 0.16826239228248596, + "learning_rate": 1.9453699191919557e-05, + "loss": 0.4852, + "step": 1512 + }, + { + "epoch": 0.33749721168860136, + "grad_norm": 0.16905133426189423, + "learning_rate": 1.9452931713226752e-05, + "loss": 0.502, + "step": 1513 + }, + { + "epoch": 0.33772027660049075, + "grad_norm": 0.3108493685722351, + "learning_rate": 1.945216371097113e-05, + "loss": 0.5025, + "step": 1514 + }, + { + "epoch": 0.3379433415123801, + "grad_norm": 0.16754211485385895, + "learning_rate": 1.9451395185195224e-05, + "loss": 0.4913, + "step": 1515 + }, + { + "epoch": 0.3381664064242695, + "grad_norm": 0.16909672319889069, + "learning_rate": 1.9450626135941603e-05, + "loss": 0.5489, + "step": 1516 + }, + { + "epoch": 0.3383894713361588, + "grad_norm": 0.17022110521793365, + "learning_rate": 1.944985656325286e-05, + "loss": 0.5266, + "step": 1517 + }, + { + "epoch": 0.33861253624804816, + "grad_norm": 0.16641078889369965, + "learning_rate": 1.9449086467171615e-05, + "loss": 0.4913, + "step": 1518 + }, + { + "epoch": 0.33883560115993755, + "grad_norm": 0.16541585326194763, + "learning_rate": 1.9448315847740527e-05, + "loss": 0.5116, + "step": 1519 + }, + { + "epoch": 0.3390586660718269, + "grad_norm": 0.175320103764534, + "learning_rate": 1.9447544705002273e-05, + "loss": 0.506, + "step": 1520 + }, + { + "epoch": 0.3392817309837163, + "grad_norm": 0.18363285064697266, + "learning_rate": 1.9446773038999566e-05, + "loss": 0.5119, + "step": 1521 + }, + { + "epoch": 0.3395047958956056, + "grad_norm": 0.17000706493854523, + "learning_rate": 1.944600084977515e-05, + "loss": 0.5286, + "step": 1522 + }, + { + "epoch": 0.339727860807495, + "grad_norm": 0.20967601239681244, + "learning_rate": 1.9445228137371784e-05, + "loss": 0.5366, + "step": 1523 + }, + { + "epoch": 0.33995092571938434, + "grad_norm": 0.1751982420682907, + "learning_rate": 1.9444454901832273e-05, + "loss": 0.5298, + "step": 1524 + }, + { + "epoch": 0.3401739906312737, + "grad_norm": 0.16722087562084198, + "learning_rate": 1.944368114319944e-05, + "loss": 0.5248, + "step": 1525 + }, + { + "epoch": 0.3403970555431631, + "grad_norm": 0.16639317572116852, + "learning_rate": 1.9442906861516143e-05, + "loss": 0.4947, + "step": 1526 + }, + { + "epoch": 0.3406201204550524, + "grad_norm": 0.18173882365226746, + "learning_rate": 1.9442132056825268e-05, + "loss": 0.5198, + "step": 1527 + }, + { + "epoch": 0.3408431853669418, + "grad_norm": 0.16811014711856842, + "learning_rate": 1.9441356729169725e-05, + "loss": 0.5193, + "step": 1528 + }, + { + "epoch": 0.34106625027883114, + "grad_norm": 0.18385176360607147, + "learning_rate": 1.944058087859246e-05, + "loss": 0.5419, + "step": 1529 + }, + { + "epoch": 0.3412893151907205, + "grad_norm": 0.18335555493831635, + "learning_rate": 1.9439804505136437e-05, + "loss": 0.5442, + "step": 1530 + }, + { + "epoch": 0.34151238010260987, + "grad_norm": 0.16866059601306915, + "learning_rate": 1.9439027608844665e-05, + "loss": 0.4893, + "step": 1531 + }, + { + "epoch": 0.3417354450144992, + "grad_norm": 0.16770686209201813, + "learning_rate": 1.9438250189760168e-05, + "loss": 0.5276, + "step": 1532 + }, + { + "epoch": 0.3419585099263886, + "grad_norm": 0.17907945811748505, + "learning_rate": 1.943747224792601e-05, + "loss": 0.5221, + "step": 1533 + }, + { + "epoch": 0.34218157483827794, + "grad_norm": 0.16425661742687225, + "learning_rate": 1.9436693783385273e-05, + "loss": 0.527, + "step": 1534 + }, + { + "epoch": 0.3424046397501673, + "grad_norm": 0.20484143495559692, + "learning_rate": 1.9435914796181077e-05, + "loss": 0.5294, + "step": 1535 + }, + { + "epoch": 0.34262770466205666, + "grad_norm": 0.171040877699852, + "learning_rate": 1.9435135286356563e-05, + "loss": 0.5317, + "step": 1536 + }, + { + "epoch": 0.342850769573946, + "grad_norm": 0.1566307246685028, + "learning_rate": 1.943435525395491e-05, + "loss": 0.4908, + "step": 1537 + }, + { + "epoch": 0.3430738344858354, + "grad_norm": 0.15999655425548553, + "learning_rate": 1.9433574699019315e-05, + "loss": 0.5211, + "step": 1538 + }, + { + "epoch": 0.34329689939772473, + "grad_norm": 0.16879580914974213, + "learning_rate": 1.9432793621593013e-05, + "loss": 0.5473, + "step": 1539 + }, + { + "epoch": 0.3435199643096141, + "grad_norm": 0.16014404594898224, + "learning_rate": 1.943201202171927e-05, + "loss": 0.5369, + "step": 1540 + }, + { + "epoch": 0.34374302922150346, + "grad_norm": 0.1585661619901657, + "learning_rate": 1.943122989944137e-05, + "loss": 0.5203, + "step": 1541 + }, + { + "epoch": 0.3439660941333928, + "grad_norm": 0.16618762910366058, + "learning_rate": 1.943044725480263e-05, + "loss": 0.5163, + "step": 1542 + }, + { + "epoch": 0.3441891590452822, + "grad_norm": 0.1790398806333542, + "learning_rate": 1.9429664087846407e-05, + "loss": 0.492, + "step": 1543 + }, + { + "epoch": 0.3444122239571715, + "grad_norm": 0.17689375579357147, + "learning_rate": 1.9428880398616065e-05, + "loss": 0.5009, + "step": 1544 + }, + { + "epoch": 0.3446352888690609, + "grad_norm": 0.16719898581504822, + "learning_rate": 1.942809618715502e-05, + "loss": 0.5064, + "step": 1545 + }, + { + "epoch": 0.34485835378095026, + "grad_norm": 0.1700020283460617, + "learning_rate": 1.9427311453506705e-05, + "loss": 0.5229, + "step": 1546 + }, + { + "epoch": 0.3450814186928396, + "grad_norm": 0.17998361587524414, + "learning_rate": 1.9426526197714582e-05, + "loss": 0.5188, + "step": 1547 + }, + { + "epoch": 0.345304483604729, + "grad_norm": 0.17831408977508545, + "learning_rate": 1.9425740419822138e-05, + "loss": 0.4967, + "step": 1548 + }, + { + "epoch": 0.3455275485166183, + "grad_norm": 0.16397793591022491, + "learning_rate": 1.9424954119872904e-05, + "loss": 0.5319, + "step": 1549 + }, + { + "epoch": 0.3457506134285077, + "grad_norm": 0.17712676525115967, + "learning_rate": 1.9424167297910425e-05, + "loss": 0.5104, + "step": 1550 + }, + { + "epoch": 0.34597367834039705, + "grad_norm": 0.19361375272274017, + "learning_rate": 1.9423379953978277e-05, + "loss": 0.5113, + "step": 1551 + }, + { + "epoch": 0.34619674325228644, + "grad_norm": 0.1744556725025177, + "learning_rate": 1.9422592088120074e-05, + "loss": 0.5331, + "step": 1552 + }, + { + "epoch": 0.3464198081641758, + "grad_norm": 0.1747075766324997, + "learning_rate": 1.9421803700379454e-05, + "loss": 0.5257, + "step": 1553 + }, + { + "epoch": 0.3466428730760651, + "grad_norm": 0.17446283996105194, + "learning_rate": 1.9421014790800074e-05, + "loss": 0.5133, + "step": 1554 + }, + { + "epoch": 0.3468659379879545, + "grad_norm": 0.1626337766647339, + "learning_rate": 1.9420225359425637e-05, + "loss": 0.4812, + "step": 1555 + }, + { + "epoch": 0.34708900289984385, + "grad_norm": 0.16959097981452942, + "learning_rate": 1.9419435406299863e-05, + "loss": 0.4954, + "step": 1556 + }, + { + "epoch": 0.34731206781173324, + "grad_norm": 0.17018267512321472, + "learning_rate": 1.9418644931466507e-05, + "loss": 0.4785, + "step": 1557 + }, + { + "epoch": 0.3475351327236226, + "grad_norm": 0.18296095728874207, + "learning_rate": 1.9417853934969347e-05, + "loss": 0.5244, + "step": 1558 + }, + { + "epoch": 0.3477581976355119, + "grad_norm": 0.1640445590019226, + "learning_rate": 1.9417062416852198e-05, + "loss": 0.5273, + "step": 1559 + }, + { + "epoch": 0.3479812625474013, + "grad_norm": 0.1624521166086197, + "learning_rate": 1.9416270377158896e-05, + "loss": 0.4939, + "step": 1560 + }, + { + "epoch": 0.34820432745929064, + "grad_norm": 0.16685132682323456, + "learning_rate": 1.941547781593331e-05, + "loss": 0.5118, + "step": 1561 + }, + { + "epoch": 0.34842739237118003, + "grad_norm": 0.1717056930065155, + "learning_rate": 1.9414684733219334e-05, + "loss": 0.5156, + "step": 1562 + }, + { + "epoch": 0.34865045728306937, + "grad_norm": 0.18991652131080627, + "learning_rate": 1.94138911290609e-05, + "loss": 0.5346, + "step": 1563 + }, + { + "epoch": 0.3488735221949587, + "grad_norm": 0.16688503324985504, + "learning_rate": 1.941309700350196e-05, + "loss": 0.5333, + "step": 1564 + }, + { + "epoch": 0.3490965871068481, + "grad_norm": 0.17251865565776825, + "learning_rate": 1.9412302356586494e-05, + "loss": 0.5113, + "step": 1565 + }, + { + "epoch": 0.34931965201873744, + "grad_norm": 0.15765444934368134, + "learning_rate": 1.941150718835852e-05, + "loss": 0.5305, + "step": 1566 + }, + { + "epoch": 0.34954271693062683, + "grad_norm": 0.18435880541801453, + "learning_rate": 1.9410711498862077e-05, + "loss": 0.5087, + "step": 1567 + }, + { + "epoch": 0.34976578184251617, + "grad_norm": 0.16191725432872772, + "learning_rate": 1.9409915288141235e-05, + "loss": 0.5001, + "step": 1568 + }, + { + "epoch": 0.34998884675440556, + "grad_norm": 0.19742515683174133, + "learning_rate": 1.9409118556240095e-05, + "loss": 0.5232, + "step": 1569 + }, + { + "epoch": 0.3502119116662949, + "grad_norm": 0.16742932796478271, + "learning_rate": 1.940832130320278e-05, + "loss": 0.5227, + "step": 1570 + }, + { + "epoch": 0.35043497657818423, + "grad_norm": 0.17970915138721466, + "learning_rate": 1.9407523529073455e-05, + "loss": 0.5228, + "step": 1571 + }, + { + "epoch": 0.3506580414900736, + "grad_norm": 0.16648298501968384, + "learning_rate": 1.9406725233896297e-05, + "loss": 0.4987, + "step": 1572 + }, + { + "epoch": 0.35088110640196296, + "grad_norm": 0.1718263030052185, + "learning_rate": 1.940592641771553e-05, + "loss": 0.5256, + "step": 1573 + }, + { + "epoch": 0.35110417131385235, + "grad_norm": 0.17099805176258087, + "learning_rate": 1.9405127080575387e-05, + "loss": 0.5371, + "step": 1574 + }, + { + "epoch": 0.3513272362257417, + "grad_norm": 0.16639132797718048, + "learning_rate": 1.9404327222520147e-05, + "loss": 0.5055, + "step": 1575 + }, + { + "epoch": 0.351550301137631, + "grad_norm": 0.15915724635124207, + "learning_rate": 1.9403526843594115e-05, + "loss": 0.5123, + "step": 1576 + }, + { + "epoch": 0.3517733660495204, + "grad_norm": 0.190689817070961, + "learning_rate": 1.9402725943841608e-05, + "loss": 0.5149, + "step": 1577 + }, + { + "epoch": 0.35199643096140976, + "grad_norm": 0.1728559285402298, + "learning_rate": 1.9401924523306998e-05, + "loss": 0.518, + "step": 1578 + }, + { + "epoch": 0.35221949587329915, + "grad_norm": 0.15996624529361725, + "learning_rate": 1.9401122582034664e-05, + "loss": 0.4949, + "step": 1579 + }, + { + "epoch": 0.3524425607851885, + "grad_norm": 0.17822036147117615, + "learning_rate": 1.940032012006903e-05, + "loss": 0.5012, + "step": 1580 + }, + { + "epoch": 0.3526656256970778, + "grad_norm": 0.18252968788146973, + "learning_rate": 1.9399517137454534e-05, + "loss": 0.54, + "step": 1581 + }, + { + "epoch": 0.3528886906089672, + "grad_norm": 0.1667163223028183, + "learning_rate": 1.939871363423566e-05, + "loss": 0.5176, + "step": 1582 + }, + { + "epoch": 0.35311175552085655, + "grad_norm": 0.17513629794120789, + "learning_rate": 1.9397909610456897e-05, + "loss": 0.5182, + "step": 1583 + }, + { + "epoch": 0.35333482043274594, + "grad_norm": 0.16966688632965088, + "learning_rate": 1.939710506616279e-05, + "loss": 0.5154, + "step": 1584 + }, + { + "epoch": 0.3535578853446353, + "grad_norm": 0.17325864732265472, + "learning_rate": 1.9396300001397888e-05, + "loss": 0.5321, + "step": 1585 + }, + { + "epoch": 0.3537809502565247, + "grad_norm": 0.17322127521038055, + "learning_rate": 1.939549441620679e-05, + "loss": 0.5206, + "step": 1586 + }, + { + "epoch": 0.354004015168414, + "grad_norm": 0.16276715695858002, + "learning_rate": 1.9394688310634114e-05, + "loss": 0.4874, + "step": 1587 + }, + { + "epoch": 0.35422708008030335, + "grad_norm": 0.1607387810945511, + "learning_rate": 1.93938816847245e-05, + "loss": 0.4813, + "step": 1588 + }, + { + "epoch": 0.35445014499219274, + "grad_norm": 0.16621682047843933, + "learning_rate": 1.939307453852263e-05, + "loss": 0.533, + "step": 1589 + }, + { + "epoch": 0.3546732099040821, + "grad_norm": 0.165096253156662, + "learning_rate": 1.9392266872073207e-05, + "loss": 0.5115, + "step": 1590 + }, + { + "epoch": 0.35489627481597147, + "grad_norm": 0.17987790703773499, + "learning_rate": 1.9391458685420966e-05, + "loss": 0.5439, + "step": 1591 + }, + { + "epoch": 0.3551193397278608, + "grad_norm": 0.1641106754541397, + "learning_rate": 1.939064997861067e-05, + "loss": 0.462, + "step": 1592 + }, + { + "epoch": 0.35534240463975014, + "grad_norm": 0.23700258135795593, + "learning_rate": 1.9389840751687105e-05, + "loss": 0.4945, + "step": 1593 + }, + { + "epoch": 0.35556546955163953, + "grad_norm": 0.1738569736480713, + "learning_rate": 1.9389031004695095e-05, + "loss": 0.5247, + "step": 1594 + }, + { + "epoch": 0.35578853446352887, + "grad_norm": 0.16523152589797974, + "learning_rate": 1.9388220737679493e-05, + "loss": 0.5231, + "step": 1595 + }, + { + "epoch": 0.35601159937541826, + "grad_norm": 0.1748376339673996, + "learning_rate": 1.9387409950685167e-05, + "loss": 0.5048, + "step": 1596 + }, + { + "epoch": 0.3562346642873076, + "grad_norm": 0.20013664662837982, + "learning_rate": 1.938659864375703e-05, + "loss": 0.5284, + "step": 1597 + }, + { + "epoch": 0.356457729199197, + "grad_norm": 0.18472377955913544, + "learning_rate": 1.938578681694002e-05, + "loss": 0.5063, + "step": 1598 + }, + { + "epoch": 0.35668079411108633, + "grad_norm": 0.17718833684921265, + "learning_rate": 1.9384974470279093e-05, + "loss": 0.5156, + "step": 1599 + }, + { + "epoch": 0.35690385902297567, + "grad_norm": 0.16550204157829285, + "learning_rate": 1.938416160381925e-05, + "loss": 0.487, + "step": 1600 + }, + { + "epoch": 0.35712692393486506, + "grad_norm": 0.17269675433635712, + "learning_rate": 1.93833482176055e-05, + "loss": 0.4989, + "step": 1601 + }, + { + "epoch": 0.3573499888467544, + "grad_norm": 0.17201650142669678, + "learning_rate": 1.938253431168291e-05, + "loss": 0.5075, + "step": 1602 + }, + { + "epoch": 0.3575730537586438, + "grad_norm": 0.17147138714790344, + "learning_rate": 1.938171988609655e-05, + "loss": 0.4967, + "step": 1603 + }, + { + "epoch": 0.3577961186705331, + "grad_norm": 0.20010094344615936, + "learning_rate": 1.938090494089153e-05, + "loss": 0.5335, + "step": 1604 + }, + { + "epoch": 0.35801918358242246, + "grad_norm": 0.19643662869930267, + "learning_rate": 1.9380089476112985e-05, + "loss": 0.586, + "step": 1605 + }, + { + "epoch": 0.35824224849431185, + "grad_norm": 0.16737660765647888, + "learning_rate": 1.937927349180608e-05, + "loss": 0.5203, + "step": 1606 + }, + { + "epoch": 0.3584653134062012, + "grad_norm": 0.1626511663198471, + "learning_rate": 1.9378456988016015e-05, + "loss": 0.5057, + "step": 1607 + }, + { + "epoch": 0.3586883783180906, + "grad_norm": 0.18112361431121826, + "learning_rate": 1.9377639964788005e-05, + "loss": 0.5709, + "step": 1608 + }, + { + "epoch": 0.3589114432299799, + "grad_norm": 0.1701660305261612, + "learning_rate": 1.937682242216731e-05, + "loss": 0.5427, + "step": 1609 + }, + { + "epoch": 0.35913450814186926, + "grad_norm": 0.16982769966125488, + "learning_rate": 1.9376004360199202e-05, + "loss": 0.5414, + "step": 1610 + }, + { + "epoch": 0.35935757305375865, + "grad_norm": 0.1677013635635376, + "learning_rate": 1.9375185778928997e-05, + "loss": 0.5189, + "step": 1611 + }, + { + "epoch": 0.359580637965648, + "grad_norm": 0.16802898049354553, + "learning_rate": 1.9374366678402032e-05, + "loss": 0.5359, + "step": 1612 + }, + { + "epoch": 0.3598037028775374, + "grad_norm": 0.1629040390253067, + "learning_rate": 1.9373547058663674e-05, + "loss": 0.5239, + "step": 1613 + }, + { + "epoch": 0.3600267677894267, + "grad_norm": 0.15942612290382385, + "learning_rate": 1.9372726919759318e-05, + "loss": 0.4927, + "step": 1614 + }, + { + "epoch": 0.3602498327013161, + "grad_norm": 0.16228771209716797, + "learning_rate": 1.9371906261734387e-05, + "loss": 0.4795, + "step": 1615 + }, + { + "epoch": 0.36047289761320545, + "grad_norm": 0.18663759529590607, + "learning_rate": 1.9371085084634337e-05, + "loss": 0.4523, + "step": 1616 + }, + { + "epoch": 0.3606959625250948, + "grad_norm": 0.1616244614124298, + "learning_rate": 1.9370263388504647e-05, + "loss": 0.5221, + "step": 1617 + }, + { + "epoch": 0.3609190274369842, + "grad_norm": 0.16261187195777893, + "learning_rate": 1.936944117339083e-05, + "loss": 0.507, + "step": 1618 + }, + { + "epoch": 0.3611420923488735, + "grad_norm": 0.16541439294815063, + "learning_rate": 1.9368618439338424e-05, + "loss": 0.5318, + "step": 1619 + }, + { + "epoch": 0.3613651572607629, + "grad_norm": 0.17135834693908691, + "learning_rate": 1.9367795186392996e-05, + "loss": 0.5071, + "step": 1620 + }, + { + "epoch": 0.36158822217265224, + "grad_norm": 0.1599961221218109, + "learning_rate": 1.936697141460015e-05, + "loss": 0.5031, + "step": 1621 + }, + { + "epoch": 0.3618112870845416, + "grad_norm": 0.17239384353160858, + "learning_rate": 1.9366147124005504e-05, + "loss": 0.5138, + "step": 1622 + }, + { + "epoch": 0.36203435199643097, + "grad_norm": 0.1636582612991333, + "learning_rate": 1.9365322314654714e-05, + "loss": 0.5197, + "step": 1623 + }, + { + "epoch": 0.3622574169083203, + "grad_norm": 0.17068350315093994, + "learning_rate": 1.9364496986593463e-05, + "loss": 0.4886, + "step": 1624 + }, + { + "epoch": 0.3624804818202097, + "grad_norm": 0.17033697664737701, + "learning_rate": 1.9363671139867467e-05, + "loss": 0.5232, + "step": 1625 + }, + { + "epoch": 0.36270354673209904, + "grad_norm": 0.16417935490608215, + "learning_rate": 1.936284477452246e-05, + "loss": 0.5384, + "step": 1626 + }, + { + "epoch": 0.36292661164398843, + "grad_norm": 0.1705051213502884, + "learning_rate": 1.9362017890604215e-05, + "loss": 0.517, + "step": 1627 + }, + { + "epoch": 0.36314967655587777, + "grad_norm": 0.17601191997528076, + "learning_rate": 1.9361190488158535e-05, + "loss": 0.5141, + "step": 1628 + }, + { + "epoch": 0.3633727414677671, + "grad_norm": 0.173202782869339, + "learning_rate": 1.936036256723124e-05, + "loss": 0.5133, + "step": 1629 + }, + { + "epoch": 0.3635958063796565, + "grad_norm": 0.1655133217573166, + "learning_rate": 1.935953412786818e-05, + "loss": 0.5278, + "step": 1630 + }, + { + "epoch": 0.36381887129154583, + "grad_norm": 0.1649170219898224, + "learning_rate": 1.9358705170115253e-05, + "loss": 0.5101, + "step": 1631 + }, + { + "epoch": 0.3640419362034352, + "grad_norm": 0.4769546389579773, + "learning_rate": 1.9357875694018364e-05, + "loss": 0.5325, + "step": 1632 + }, + { + "epoch": 0.36426500111532456, + "grad_norm": 0.16675199568271637, + "learning_rate": 1.9357045699623452e-05, + "loss": 0.5373, + "step": 1633 + }, + { + "epoch": 0.3644880660272139, + "grad_norm": 0.18537083268165588, + "learning_rate": 1.9356215186976496e-05, + "loss": 0.5077, + "step": 1634 + }, + { + "epoch": 0.3647111309391033, + "grad_norm": 0.1646578013896942, + "learning_rate": 1.935538415612349e-05, + "loss": 0.5163, + "step": 1635 + }, + { + "epoch": 0.3649341958509926, + "grad_norm": 0.17558851838111877, + "learning_rate": 1.935455260711046e-05, + "loss": 0.5269, + "step": 1636 + }, + { + "epoch": 0.365157260762882, + "grad_norm": 0.1750616431236267, + "learning_rate": 1.9353720539983462e-05, + "loss": 0.5185, + "step": 1637 + }, + { + "epoch": 0.36538032567477136, + "grad_norm": 0.1686229407787323, + "learning_rate": 1.9352887954788583e-05, + "loss": 0.5033, + "step": 1638 + }, + { + "epoch": 0.3656033905866607, + "grad_norm": 0.17237605154514313, + "learning_rate": 1.935205485157194e-05, + "loss": 0.5071, + "step": 1639 + }, + { + "epoch": 0.3658264554985501, + "grad_norm": 0.17209577560424805, + "learning_rate": 1.9351221230379673e-05, + "loss": 0.5049, + "step": 1640 + }, + { + "epoch": 0.3660495204104394, + "grad_norm": 0.1662733256816864, + "learning_rate": 1.9350387091257952e-05, + "loss": 0.5399, + "step": 1641 + }, + { + "epoch": 0.3662725853223288, + "grad_norm": 0.17135699093341827, + "learning_rate": 1.9349552434252976e-05, + "loss": 0.5175, + "step": 1642 + }, + { + "epoch": 0.36649565023421815, + "grad_norm": 0.17738810181617737, + "learning_rate": 1.9348717259410975e-05, + "loss": 0.5464, + "step": 1643 + }, + { + "epoch": 0.36671871514610754, + "grad_norm": 0.17101503908634186, + "learning_rate": 1.9347881566778208e-05, + "loss": 0.5162, + "step": 1644 + }, + { + "epoch": 0.3669417800579969, + "grad_norm": 0.16841377317905426, + "learning_rate": 1.934704535640096e-05, + "loss": 0.5244, + "step": 1645 + }, + { + "epoch": 0.3671648449698862, + "grad_norm": 0.17288081347942352, + "learning_rate": 1.9346208628325543e-05, + "loss": 0.5487, + "step": 1646 + }, + { + "epoch": 0.3673879098817756, + "grad_norm": 0.1731618493795395, + "learning_rate": 1.93453713825983e-05, + "loss": 0.4924, + "step": 1647 + }, + { + "epoch": 0.36761097479366495, + "grad_norm": 0.18122443556785583, + "learning_rate": 1.934453361926561e-05, + "loss": 0.5105, + "step": 1648 + }, + { + "epoch": 0.36783403970555434, + "grad_norm": 0.17434357106685638, + "learning_rate": 1.9343695338373866e-05, + "loss": 0.5185, + "step": 1649 + }, + { + "epoch": 0.3680571046174437, + "grad_norm": 0.19195139408111572, + "learning_rate": 1.93428565399695e-05, + "loss": 0.5074, + "step": 1650 + }, + { + "epoch": 0.368280169529333, + "grad_norm": 0.17529140412807465, + "learning_rate": 1.9342017224098974e-05, + "loss": 0.5355, + "step": 1651 + }, + { + "epoch": 0.3685032344412224, + "grad_norm": 0.16739797592163086, + "learning_rate": 1.9341177390808768e-05, + "loss": 0.5128, + "step": 1652 + }, + { + "epoch": 0.36872629935311174, + "grad_norm": 0.17157401144504547, + "learning_rate": 1.9340337040145397e-05, + "loss": 0.5117, + "step": 1653 + }, + { + "epoch": 0.36894936426500113, + "grad_norm": 0.166087806224823, + "learning_rate": 1.933949617215541e-05, + "loss": 0.5182, + "step": 1654 + }, + { + "epoch": 0.36917242917689047, + "grad_norm": 0.17426824569702148, + "learning_rate": 1.9338654786885377e-05, + "loss": 0.5141, + "step": 1655 + }, + { + "epoch": 0.3693954940887798, + "grad_norm": 0.18022432923316956, + "learning_rate": 1.93378128843819e-05, + "loss": 0.5097, + "step": 1656 + }, + { + "epoch": 0.3696185590006692, + "grad_norm": 0.1715417206287384, + "learning_rate": 1.933697046469161e-05, + "loss": 0.525, + "step": 1657 + }, + { + "epoch": 0.36984162391255854, + "grad_norm": 0.17939765751361847, + "learning_rate": 1.9336127527861158e-05, + "loss": 0.5166, + "step": 1658 + }, + { + "epoch": 0.37006468882444793, + "grad_norm": 0.1856374740600586, + "learning_rate": 1.9335284073937242e-05, + "loss": 0.5078, + "step": 1659 + }, + { + "epoch": 0.37028775373633727, + "grad_norm": 0.16677772998809814, + "learning_rate": 1.9334440102966567e-05, + "loss": 0.5427, + "step": 1660 + }, + { + "epoch": 0.37051081864822666, + "grad_norm": 0.17052651941776276, + "learning_rate": 1.933359561499589e-05, + "loss": 0.4976, + "step": 1661 + }, + { + "epoch": 0.370733883560116, + "grad_norm": 0.180454283952713, + "learning_rate": 1.9332750610071972e-05, + "loss": 0.4807, + "step": 1662 + }, + { + "epoch": 0.37095694847200533, + "grad_norm": 0.16270607709884644, + "learning_rate": 1.9331905088241623e-05, + "loss": 0.5079, + "step": 1663 + }, + { + "epoch": 0.3711800133838947, + "grad_norm": 0.1687788963317871, + "learning_rate": 1.9331059049551668e-05, + "loss": 0.5208, + "step": 1664 + }, + { + "epoch": 0.37140307829578406, + "grad_norm": 0.16753818094730377, + "learning_rate": 1.933021249404897e-05, + "loss": 0.4991, + "step": 1665 + }, + { + "epoch": 0.37162614320767345, + "grad_norm": 0.1763564944267273, + "learning_rate": 1.9329365421780414e-05, + "loss": 0.5321, + "step": 1666 + }, + { + "epoch": 0.3718492081195628, + "grad_norm": 0.1733461618423462, + "learning_rate": 1.932851783279292e-05, + "loss": 0.5331, + "step": 1667 + }, + { + "epoch": 0.3720722730314521, + "grad_norm": 0.1711835116147995, + "learning_rate": 1.9327669727133424e-05, + "loss": 0.4764, + "step": 1668 + }, + { + "epoch": 0.3722953379433415, + "grad_norm": 0.190887913107872, + "learning_rate": 1.932682110484891e-05, + "loss": 0.5254, + "step": 1669 + }, + { + "epoch": 0.37251840285523086, + "grad_norm": 0.16770336031913757, + "learning_rate": 1.9325971965986373e-05, + "loss": 0.4989, + "step": 1670 + }, + { + "epoch": 0.37274146776712025, + "grad_norm": 0.2075570970773697, + "learning_rate": 1.9325122310592846e-05, + "loss": 0.515, + "step": 1671 + }, + { + "epoch": 0.3729645326790096, + "grad_norm": 0.17692680656909943, + "learning_rate": 1.9324272138715388e-05, + "loss": 0.5442, + "step": 1672 + }, + { + "epoch": 0.373187597590899, + "grad_norm": 0.17124991118907928, + "learning_rate": 1.932342145040109e-05, + "loss": 0.5066, + "step": 1673 + }, + { + "epoch": 0.3734106625027883, + "grad_norm": 0.16621904075145721, + "learning_rate": 1.932257024569706e-05, + "loss": 0.5066, + "step": 1674 + }, + { + "epoch": 0.37363372741467765, + "grad_norm": 0.17721469700336456, + "learning_rate": 1.932171852465045e-05, + "loss": 0.4789, + "step": 1675 + }, + { + "epoch": 0.37385679232656704, + "grad_norm": 0.16972602903842926, + "learning_rate": 1.9320866287308433e-05, + "loss": 0.4922, + "step": 1676 + }, + { + "epoch": 0.3740798572384564, + "grad_norm": 0.17205440998077393, + "learning_rate": 1.9320013533718208e-05, + "loss": 0.4909, + "step": 1677 + }, + { + "epoch": 0.3743029221503458, + "grad_norm": 0.16761687397956848, + "learning_rate": 1.9319160263927013e-05, + "loss": 0.513, + "step": 1678 + }, + { + "epoch": 0.3745259870622351, + "grad_norm": 0.16278418898582458, + "learning_rate": 1.93183064779821e-05, + "loss": 0.508, + "step": 1679 + }, + { + "epoch": 0.37474905197412445, + "grad_norm": 0.17359983921051025, + "learning_rate": 1.931745217593076e-05, + "loss": 0.5472, + "step": 1680 + }, + { + "epoch": 0.37497211688601384, + "grad_norm": 0.16624867916107178, + "learning_rate": 1.931659735782031e-05, + "loss": 0.53, + "step": 1681 + }, + { + "epoch": 0.3751951817979032, + "grad_norm": 0.16128243505954742, + "learning_rate": 1.9315742023698095e-05, + "loss": 0.474, + "step": 1682 + }, + { + "epoch": 0.37541824670979257, + "grad_norm": 0.19452917575836182, + "learning_rate": 1.9314886173611487e-05, + "loss": 0.5527, + "step": 1683 + }, + { + "epoch": 0.3756413116216819, + "grad_norm": 0.18000726401805878, + "learning_rate": 1.931402980760789e-05, + "loss": 0.5185, + "step": 1684 + }, + { + "epoch": 0.37586437653357124, + "grad_norm": 0.1593390852212906, + "learning_rate": 1.9313172925734736e-05, + "loss": 0.503, + "step": 1685 + }, + { + "epoch": 0.37608744144546064, + "grad_norm": 0.1637982428073883, + "learning_rate": 1.931231552803948e-05, + "loss": 0.5011, + "step": 1686 + }, + { + "epoch": 0.37631050635734997, + "grad_norm": 0.18159234523773193, + "learning_rate": 1.931145761456962e-05, + "loss": 0.501, + "step": 1687 + }, + { + "epoch": 0.37653357126923936, + "grad_norm": 0.17507293820381165, + "learning_rate": 1.9310599185372657e-05, + "loss": 0.5219, + "step": 1688 + }, + { + "epoch": 0.3767566361811287, + "grad_norm": 0.16377007961273193, + "learning_rate": 1.9309740240496152e-05, + "loss": 0.5096, + "step": 1689 + }, + { + "epoch": 0.3769797010930181, + "grad_norm": 0.171824112534523, + "learning_rate": 1.930888077998767e-05, + "loss": 0.5366, + "step": 1690 + }, + { + "epoch": 0.37720276600490743, + "grad_norm": 0.18775586783885956, + "learning_rate": 1.9308020803894813e-05, + "loss": 0.5018, + "step": 1691 + }, + { + "epoch": 0.37742583091679677, + "grad_norm": 0.1640700101852417, + "learning_rate": 1.9307160312265216e-05, + "loss": 0.5159, + "step": 1692 + }, + { + "epoch": 0.37764889582868616, + "grad_norm": 0.1758948117494583, + "learning_rate": 1.9306299305146535e-05, + "loss": 0.5171, + "step": 1693 + }, + { + "epoch": 0.3778719607405755, + "grad_norm": 0.1625639647245407, + "learning_rate": 1.9305437782586463e-05, + "loss": 0.4989, + "step": 1694 + }, + { + "epoch": 0.3780950256524649, + "grad_norm": 0.15426993370056152, + "learning_rate": 1.9304575744632708e-05, + "loss": 0.4781, + "step": 1695 + }, + { + "epoch": 0.3783180905643542, + "grad_norm": 0.18176917731761932, + "learning_rate": 1.9303713191333025e-05, + "loss": 0.5307, + "step": 1696 + }, + { + "epoch": 0.37854115547624356, + "grad_norm": 0.16359803080558777, + "learning_rate": 1.930285012273518e-05, + "loss": 0.5067, + "step": 1697 + }, + { + "epoch": 0.37876422038813296, + "grad_norm": 0.16793343424797058, + "learning_rate": 1.930198653888698e-05, + "loss": 0.5234, + "step": 1698 + }, + { + "epoch": 0.3789872853000223, + "grad_norm": 0.17032268643379211, + "learning_rate": 1.930112243983625e-05, + "loss": 0.5521, + "step": 1699 + }, + { + "epoch": 0.3792103502119117, + "grad_norm": 0.18838275969028473, + "learning_rate": 1.930025782563086e-05, + "loss": 0.5203, + "step": 1700 + }, + { + "epoch": 0.379433415123801, + "grad_norm": 0.1813000738620758, + "learning_rate": 1.9299392696318683e-05, + "loss": 0.5299, + "step": 1701 + }, + { + "epoch": 0.3796564800356904, + "grad_norm": 0.15768998861312866, + "learning_rate": 1.9298527051947645e-05, + "loss": 0.4802, + "step": 1702 + }, + { + "epoch": 0.37987954494757975, + "grad_norm": 0.15893371403217316, + "learning_rate": 1.9297660892565692e-05, + "loss": 0.4939, + "step": 1703 + }, + { + "epoch": 0.3801026098594691, + "grad_norm": 0.16448427736759186, + "learning_rate": 1.929679421822079e-05, + "loss": 0.503, + "step": 1704 + }, + { + "epoch": 0.3803256747713585, + "grad_norm": 0.1799025982618332, + "learning_rate": 1.9295927028960947e-05, + "loss": 0.5146, + "step": 1705 + }, + { + "epoch": 0.3805487396832478, + "grad_norm": 0.17270612716674805, + "learning_rate": 1.9295059324834193e-05, + "loss": 0.5472, + "step": 1706 + }, + { + "epoch": 0.3807718045951372, + "grad_norm": 0.1640714704990387, + "learning_rate": 1.9294191105888586e-05, + "loss": 0.4847, + "step": 1707 + }, + { + "epoch": 0.38099486950702655, + "grad_norm": 0.16522546112537384, + "learning_rate": 1.9293322372172207e-05, + "loss": 0.4752, + "step": 1708 + }, + { + "epoch": 0.3812179344189159, + "grad_norm": 0.17741802334785461, + "learning_rate": 1.9292453123733184e-05, + "loss": 0.5246, + "step": 1709 + }, + { + "epoch": 0.3814409993308053, + "grad_norm": 0.3004036545753479, + "learning_rate": 1.9291583360619653e-05, + "loss": 0.5032, + "step": 1710 + }, + { + "epoch": 0.3816640642426946, + "grad_norm": 0.17262016236782074, + "learning_rate": 1.9290713082879786e-05, + "loss": 0.5208, + "step": 1711 + }, + { + "epoch": 0.381887129154584, + "grad_norm": 0.1745826005935669, + "learning_rate": 1.928984229056179e-05, + "loss": 0.5245, + "step": 1712 + }, + { + "epoch": 0.38211019406647334, + "grad_norm": 0.19315177202224731, + "learning_rate": 1.9288970983713893e-05, + "loss": 0.5029, + "step": 1713 + }, + { + "epoch": 0.3823332589783627, + "grad_norm": 0.17079593241214752, + "learning_rate": 1.9288099162384354e-05, + "loss": 0.5204, + "step": 1714 + }, + { + "epoch": 0.38255632389025207, + "grad_norm": 0.17363616824150085, + "learning_rate": 1.9287226826621457e-05, + "loss": 0.5195, + "step": 1715 + }, + { + "epoch": 0.3827793888021414, + "grad_norm": 0.1668912172317505, + "learning_rate": 1.928635397647352e-05, + "loss": 0.5259, + "step": 1716 + }, + { + "epoch": 0.3830024537140308, + "grad_norm": 0.162687286734581, + "learning_rate": 1.9285480611988886e-05, + "loss": 0.5135, + "step": 1717 + }, + { + "epoch": 0.38322551862592014, + "grad_norm": 0.1736724078655243, + "learning_rate": 1.9284606733215925e-05, + "loss": 0.4855, + "step": 1718 + }, + { + "epoch": 0.38344858353780953, + "grad_norm": 0.16885711252689362, + "learning_rate": 1.9283732340203045e-05, + "loss": 0.4934, + "step": 1719 + }, + { + "epoch": 0.38367164844969887, + "grad_norm": 0.17074720561504364, + "learning_rate": 1.928285743299867e-05, + "loss": 0.5167, + "step": 1720 + }, + { + "epoch": 0.3838947133615882, + "grad_norm": 0.17049984633922577, + "learning_rate": 1.9281982011651257e-05, + "loss": 0.498, + "step": 1721 + }, + { + "epoch": 0.3841177782734776, + "grad_norm": 0.21770413219928741, + "learning_rate": 1.9281106076209296e-05, + "loss": 0.4869, + "step": 1722 + }, + { + "epoch": 0.38434084318536693, + "grad_norm": 0.17190971970558167, + "learning_rate": 1.9280229626721302e-05, + "loss": 0.5476, + "step": 1723 + }, + { + "epoch": 0.3845639080972563, + "grad_norm": 0.20358806848526, + "learning_rate": 1.9279352663235813e-05, + "loss": 0.5074, + "step": 1724 + }, + { + "epoch": 0.38478697300914566, + "grad_norm": 0.19548457860946655, + "learning_rate": 1.9278475185801404e-05, + "loss": 0.5246, + "step": 1725 + }, + { + "epoch": 0.385010037921035, + "grad_norm": 0.18699929118156433, + "learning_rate": 1.9277597194466674e-05, + "loss": 0.5267, + "step": 1726 + }, + { + "epoch": 0.3852331028329244, + "grad_norm": 0.16484257578849792, + "learning_rate": 1.9276718689280258e-05, + "loss": 0.5108, + "step": 1727 + }, + { + "epoch": 0.3854561677448137, + "grad_norm": 0.16136226058006287, + "learning_rate": 1.9275839670290804e-05, + "loss": 0.4973, + "step": 1728 + }, + { + "epoch": 0.3856792326567031, + "grad_norm": 0.16811032593250275, + "learning_rate": 1.9274960137547002e-05, + "loss": 0.4971, + "step": 1729 + }, + { + "epoch": 0.38590229756859246, + "grad_norm": 0.17258323729038239, + "learning_rate": 1.9274080091097568e-05, + "loss": 0.5513, + "step": 1730 + }, + { + "epoch": 0.3861253624804818, + "grad_norm": 0.16782884299755096, + "learning_rate": 1.927319953099124e-05, + "loss": 0.5032, + "step": 1731 + }, + { + "epoch": 0.3863484273923712, + "grad_norm": 0.1599043756723404, + "learning_rate": 1.9272318457276792e-05, + "loss": 0.5117, + "step": 1732 + }, + { + "epoch": 0.3865714923042605, + "grad_norm": 0.18118049204349518, + "learning_rate": 1.9271436870003022e-05, + "loss": 0.5476, + "step": 1733 + }, + { + "epoch": 0.3867945572161499, + "grad_norm": 0.16125431656837463, + "learning_rate": 1.927055476921876e-05, + "loss": 0.496, + "step": 1734 + }, + { + "epoch": 0.38701762212803925, + "grad_norm": 0.17001016438007355, + "learning_rate": 1.9269672154972863e-05, + "loss": 0.5274, + "step": 1735 + }, + { + "epoch": 0.38724068703992864, + "grad_norm": 0.18982788920402527, + "learning_rate": 1.9268789027314208e-05, + "loss": 0.5423, + "step": 1736 + }, + { + "epoch": 0.387463751951818, + "grad_norm": 0.20157238841056824, + "learning_rate": 1.9267905386291716e-05, + "loss": 0.5204, + "step": 1737 + }, + { + "epoch": 0.3876868168637073, + "grad_norm": 0.16580817103385925, + "learning_rate": 1.926702123195433e-05, + "loss": 0.523, + "step": 1738 + }, + { + "epoch": 0.3879098817755967, + "grad_norm": 0.1722583770751953, + "learning_rate": 1.926613656435101e-05, + "loss": 0.4689, + "step": 1739 + }, + { + "epoch": 0.38813294668748605, + "grad_norm": 0.16248932480812073, + "learning_rate": 1.9265251383530765e-05, + "loss": 0.4838, + "step": 1740 + }, + { + "epoch": 0.38835601159937544, + "grad_norm": 0.16789276897907257, + "learning_rate": 1.9264365689542616e-05, + "loss": 0.5191, + "step": 1741 + }, + { + "epoch": 0.3885790765112648, + "grad_norm": 0.16488412022590637, + "learning_rate": 1.926347948243562e-05, + "loss": 0.5124, + "step": 1742 + }, + { + "epoch": 0.3888021414231541, + "grad_norm": 0.17527227103710175, + "learning_rate": 1.926259276225886e-05, + "loss": 0.5042, + "step": 1743 + }, + { + "epoch": 0.3890252063350435, + "grad_norm": 0.16100718080997467, + "learning_rate": 1.926170552906145e-05, + "loss": 0.5103, + "step": 1744 + }, + { + "epoch": 0.38924827124693284, + "grad_norm": 0.1704183667898178, + "learning_rate": 1.926081778289253e-05, + "loss": 0.5252, + "step": 1745 + }, + { + "epoch": 0.38947133615882223, + "grad_norm": 0.15948736667633057, + "learning_rate": 1.9259929523801266e-05, + "loss": 0.4972, + "step": 1746 + }, + { + "epoch": 0.38969440107071157, + "grad_norm": 0.15783725678920746, + "learning_rate": 1.9259040751836858e-05, + "loss": 0.4862, + "step": 1747 + }, + { + "epoch": 0.38991746598260096, + "grad_norm": 0.22207237780094147, + "learning_rate": 1.9258151467048533e-05, + "loss": 0.5058, + "step": 1748 + }, + { + "epoch": 0.3901405308944903, + "grad_norm": 0.18270133435726166, + "learning_rate": 1.9257261669485544e-05, + "loss": 0.4872, + "step": 1749 + }, + { + "epoch": 0.39036359580637964, + "grad_norm": 0.15779659152030945, + "learning_rate": 1.925637135919717e-05, + "loss": 0.4972, + "step": 1750 + }, + { + "epoch": 0.39058666071826903, + "grad_norm": 0.16197752952575684, + "learning_rate": 1.9255480536232728e-05, + "loss": 0.5001, + "step": 1751 + }, + { + "epoch": 0.39080972563015837, + "grad_norm": 0.1598835587501526, + "learning_rate": 1.9254589200641556e-05, + "loss": 0.4991, + "step": 1752 + }, + { + "epoch": 0.39103279054204776, + "grad_norm": 0.1754835844039917, + "learning_rate": 1.925369735247302e-05, + "loss": 0.4858, + "step": 1753 + }, + { + "epoch": 0.3912558554539371, + "grad_norm": 0.17721553146839142, + "learning_rate": 1.9252804991776513e-05, + "loss": 0.4954, + "step": 1754 + }, + { + "epoch": 0.39147892036582643, + "grad_norm": 0.16600117087364197, + "learning_rate": 1.9251912118601466e-05, + "loss": 0.5006, + "step": 1755 + }, + { + "epoch": 0.3917019852777158, + "grad_norm": 0.18240424990653992, + "learning_rate": 1.925101873299733e-05, + "loss": 0.5563, + "step": 1756 + }, + { + "epoch": 0.39192505018960516, + "grad_norm": 0.2154020071029663, + "learning_rate": 1.9250124835013583e-05, + "loss": 0.5213, + "step": 1757 + }, + { + "epoch": 0.39214811510149455, + "grad_norm": 0.17148853838443756, + "learning_rate": 1.9249230424699735e-05, + "loss": 0.5378, + "step": 1758 + }, + { + "epoch": 0.3923711800133839, + "grad_norm": 0.17164389789104462, + "learning_rate": 1.9248335502105328e-05, + "loss": 0.5272, + "step": 1759 + }, + { + "epoch": 0.39259424492527323, + "grad_norm": 0.16211992502212524, + "learning_rate": 1.924744006727993e-05, + "loss": 0.5543, + "step": 1760 + }, + { + "epoch": 0.3928173098371626, + "grad_norm": 0.17015716433525085, + "learning_rate": 1.924654412027313e-05, + "loss": 0.5281, + "step": 1761 + }, + { + "epoch": 0.39304037474905196, + "grad_norm": 0.15979216992855072, + "learning_rate": 1.924564766113455e-05, + "loss": 0.4938, + "step": 1762 + }, + { + "epoch": 0.39326343966094135, + "grad_norm": 0.16782401502132416, + "learning_rate": 1.924475068991385e-05, + "loss": 0.4876, + "step": 1763 + }, + { + "epoch": 0.3934865045728307, + "grad_norm": 0.1688111275434494, + "learning_rate": 1.9243853206660703e-05, + "loss": 0.503, + "step": 1764 + }, + { + "epoch": 0.3937095694847201, + "grad_norm": 0.16509543359279633, + "learning_rate": 1.924295521142482e-05, + "loss": 0.4955, + "step": 1765 + }, + { + "epoch": 0.3939326343966094, + "grad_norm": 0.17914824187755585, + "learning_rate": 1.9242056704255935e-05, + "loss": 0.5334, + "step": 1766 + }, + { + "epoch": 0.39415569930849875, + "grad_norm": 0.18346786499023438, + "learning_rate": 1.9241157685203817e-05, + "loss": 0.5135, + "step": 1767 + }, + { + "epoch": 0.39437876422038814, + "grad_norm": 0.18024152517318726, + "learning_rate": 1.9240258154318257e-05, + "loss": 0.5284, + "step": 1768 + }, + { + "epoch": 0.3946018291322775, + "grad_norm": 0.17404595017433167, + "learning_rate": 1.923935811164908e-05, + "loss": 0.5069, + "step": 1769 + }, + { + "epoch": 0.3948248940441669, + "grad_norm": 0.15942560136318207, + "learning_rate": 1.9238457557246128e-05, + "loss": 0.5034, + "step": 1770 + }, + { + "epoch": 0.3950479589560562, + "grad_norm": 0.15909984707832336, + "learning_rate": 1.9237556491159285e-05, + "loss": 0.4762, + "step": 1771 + }, + { + "epoch": 0.39527102386794555, + "grad_norm": 0.16344551742076874, + "learning_rate": 1.9236654913438456e-05, + "loss": 0.5243, + "step": 1772 + }, + { + "epoch": 0.39549408877983494, + "grad_norm": 0.1672784835100174, + "learning_rate": 1.923575282413358e-05, + "loss": 0.5057, + "step": 1773 + }, + { + "epoch": 0.3957171536917243, + "grad_norm": 0.1585850566625595, + "learning_rate": 1.9234850223294613e-05, + "loss": 0.5074, + "step": 1774 + }, + { + "epoch": 0.39594021860361367, + "grad_norm": 0.17053529620170593, + "learning_rate": 1.9233947110971556e-05, + "loss": 0.5556, + "step": 1775 + }, + { + "epoch": 0.396163283515503, + "grad_norm": 0.16313889622688293, + "learning_rate": 1.9233043487214423e-05, + "loss": 0.491, + "step": 1776 + }, + { + "epoch": 0.3963863484273924, + "grad_norm": 0.15941911935806274, + "learning_rate": 1.9232139352073265e-05, + "loss": 0.4862, + "step": 1777 + }, + { + "epoch": 0.39660941333928174, + "grad_norm": 0.1699916571378708, + "learning_rate": 1.9231234705598153e-05, + "loss": 0.542, + "step": 1778 + }, + { + "epoch": 0.3968324782511711, + "grad_norm": 0.17430098354816437, + "learning_rate": 1.9230329547839196e-05, + "loss": 0.5006, + "step": 1779 + }, + { + "epoch": 0.39705554316306046, + "grad_norm": 0.16042235493659973, + "learning_rate": 1.9229423878846535e-05, + "loss": 0.5087, + "step": 1780 + }, + { + "epoch": 0.3972786080749498, + "grad_norm": 0.1637001633644104, + "learning_rate": 1.9228517698670316e-05, + "loss": 0.4966, + "step": 1781 + }, + { + "epoch": 0.3975016729868392, + "grad_norm": 0.1627058982849121, + "learning_rate": 1.922761100736074e-05, + "loss": 0.4882, + "step": 1782 + }, + { + "epoch": 0.39772473789872853, + "grad_norm": 0.1667163074016571, + "learning_rate": 1.9226703804968022e-05, + "loss": 0.5295, + "step": 1783 + }, + { + "epoch": 0.39794780281061787, + "grad_norm": 0.16532334685325623, + "learning_rate": 1.9225796091542412e-05, + "loss": 0.5062, + "step": 1784 + }, + { + "epoch": 0.39817086772250726, + "grad_norm": 0.16788703203201294, + "learning_rate": 1.9224887867134178e-05, + "loss": 0.5276, + "step": 1785 + }, + { + "epoch": 0.3983939326343966, + "grad_norm": 0.17480318248271942, + "learning_rate": 1.9223979131793627e-05, + "loss": 0.52, + "step": 1786 + }, + { + "epoch": 0.398616997546286, + "grad_norm": 0.1601472944021225, + "learning_rate": 1.9223069885571094e-05, + "loss": 0.5263, + "step": 1787 + }, + { + "epoch": 0.3988400624581753, + "grad_norm": 0.16812476515769958, + "learning_rate": 1.9222160128516932e-05, + "loss": 0.4831, + "step": 1788 + }, + { + "epoch": 0.39906312737006466, + "grad_norm": 0.1809302121400833, + "learning_rate": 1.9221249860681537e-05, + "loss": 0.4944, + "step": 1789 + }, + { + "epoch": 0.39928619228195406, + "grad_norm": 0.16555048525333405, + "learning_rate": 1.9220339082115317e-05, + "loss": 0.4851, + "step": 1790 + }, + { + "epoch": 0.3995092571938434, + "grad_norm": 0.16044898331165314, + "learning_rate": 1.9219427792868722e-05, + "loss": 0.5377, + "step": 1791 + }, + { + "epoch": 0.3997323221057328, + "grad_norm": 0.16331616044044495, + "learning_rate": 1.921851599299222e-05, + "loss": 0.4978, + "step": 1792 + }, + { + "epoch": 0.3999553870176221, + "grad_norm": 0.16315311193466187, + "learning_rate": 1.9217603682536315e-05, + "loss": 0.51, + "step": 1793 + }, + { + "epoch": 0.4001784519295115, + "grad_norm": 0.16615985333919525, + "learning_rate": 1.9216690861551544e-05, + "loss": 0.5343, + "step": 1794 + }, + { + "epoch": 0.40040151684140085, + "grad_norm": 0.1581142395734787, + "learning_rate": 1.9215777530088452e-05, + "loss": 0.5276, + "step": 1795 + }, + { + "epoch": 0.4006245817532902, + "grad_norm": 0.25778672099113464, + "learning_rate": 1.9214863688197634e-05, + "loss": 0.5265, + "step": 1796 + }, + { + "epoch": 0.4008476466651796, + "grad_norm": 0.21218818426132202, + "learning_rate": 1.92139493359297e-05, + "loss": 0.4929, + "step": 1797 + }, + { + "epoch": 0.4010707115770689, + "grad_norm": 0.1664579063653946, + "learning_rate": 1.9213034473335293e-05, + "loss": 0.5298, + "step": 1798 + }, + { + "epoch": 0.4012937764889583, + "grad_norm": 0.158608078956604, + "learning_rate": 1.9212119100465084e-05, + "loss": 0.513, + "step": 1799 + }, + { + "epoch": 0.40151684140084765, + "grad_norm": 0.15767832100391388, + "learning_rate": 1.9211203217369774e-05, + "loss": 0.5037, + "step": 1800 + }, + { + "epoch": 0.401739906312737, + "grad_norm": 0.1640487164258957, + "learning_rate": 1.921028682410009e-05, + "loss": 0.5102, + "step": 1801 + }, + { + "epoch": 0.4019629712246264, + "grad_norm": 0.1980050951242447, + "learning_rate": 1.9209369920706783e-05, + "loss": 0.4926, + "step": 1802 + }, + { + "epoch": 0.4021860361365157, + "grad_norm": 0.16443881392478943, + "learning_rate": 1.9208452507240642e-05, + "loss": 0.5165, + "step": 1803 + }, + { + "epoch": 0.4024091010484051, + "grad_norm": 0.17739985883235931, + "learning_rate": 1.920753458375248e-05, + "loss": 0.5141, + "step": 1804 + }, + { + "epoch": 0.40263216596029444, + "grad_norm": 0.1755398064851761, + "learning_rate": 1.9206616150293132e-05, + "loss": 0.5279, + "step": 1805 + }, + { + "epoch": 0.4028552308721838, + "grad_norm": 0.16583852469921112, + "learning_rate": 1.9205697206913473e-05, + "loss": 0.4838, + "step": 1806 + }, + { + "epoch": 0.40307829578407317, + "grad_norm": 0.1888515055179596, + "learning_rate": 1.9204777753664397e-05, + "loss": 0.5381, + "step": 1807 + }, + { + "epoch": 0.4033013606959625, + "grad_norm": 0.17295043170452118, + "learning_rate": 1.9203857790596826e-05, + "loss": 0.5062, + "step": 1808 + }, + { + "epoch": 0.4035244256078519, + "grad_norm": 0.16970674693584442, + "learning_rate": 1.9202937317761713e-05, + "loss": 0.5138, + "step": 1809 + }, + { + "epoch": 0.40374749051974124, + "grad_norm": 0.16952745616436005, + "learning_rate": 1.9202016335210047e-05, + "loss": 0.4829, + "step": 1810 + }, + { + "epoch": 0.40397055543163063, + "grad_norm": 0.16743586957454681, + "learning_rate": 1.9201094842992832e-05, + "loss": 0.5085, + "step": 1811 + }, + { + "epoch": 0.40419362034351997, + "grad_norm": 0.16140130162239075, + "learning_rate": 1.9200172841161108e-05, + "loss": 0.4983, + "step": 1812 + }, + { + "epoch": 0.4044166852554093, + "grad_norm": 0.1676797717809677, + "learning_rate": 1.9199250329765943e-05, + "loss": 0.5055, + "step": 1813 + }, + { + "epoch": 0.4046397501672987, + "grad_norm": 0.1718011051416397, + "learning_rate": 1.9198327308858427e-05, + "loss": 0.5021, + "step": 1814 + }, + { + "epoch": 0.40486281507918803, + "grad_norm": 0.1699821949005127, + "learning_rate": 1.9197403778489684e-05, + "loss": 0.5312, + "step": 1815 + }, + { + "epoch": 0.4050858799910774, + "grad_norm": 0.16701120138168335, + "learning_rate": 1.9196479738710865e-05, + "loss": 0.5028, + "step": 1816 + }, + { + "epoch": 0.40530894490296676, + "grad_norm": 0.17370650172233582, + "learning_rate": 1.9195555189573153e-05, + "loss": 0.5075, + "step": 1817 + }, + { + "epoch": 0.4055320098148561, + "grad_norm": 0.16744117438793182, + "learning_rate": 1.919463013112775e-05, + "loss": 0.4972, + "step": 1818 + }, + { + "epoch": 0.4057550747267455, + "grad_norm": 0.1703552007675171, + "learning_rate": 1.9193704563425896e-05, + "loss": 0.5215, + "step": 1819 + }, + { + "epoch": 0.4059781396386348, + "grad_norm": 0.1808227002620697, + "learning_rate": 1.919277848651885e-05, + "loss": 0.5048, + "step": 1820 + }, + { + "epoch": 0.4062012045505242, + "grad_norm": 0.17191959917545319, + "learning_rate": 1.9191851900457905e-05, + "loss": 0.5293, + "step": 1821 + }, + { + "epoch": 0.40642426946241356, + "grad_norm": 0.16464729607105255, + "learning_rate": 1.9190924805294388e-05, + "loss": 0.531, + "step": 1822 + }, + { + "epoch": 0.40664733437430295, + "grad_norm": 0.17217296361923218, + "learning_rate": 1.9189997201079638e-05, + "loss": 0.5221, + "step": 1823 + }, + { + "epoch": 0.4068703992861923, + "grad_norm": 0.15952859818935394, + "learning_rate": 1.918906908786504e-05, + "loss": 0.4898, + "step": 1824 + }, + { + "epoch": 0.4070934641980816, + "grad_norm": 0.15873339772224426, + "learning_rate": 1.9188140465701987e-05, + "loss": 0.4771, + "step": 1825 + }, + { + "epoch": 0.407316529109971, + "grad_norm": 0.16631244122982025, + "learning_rate": 1.9187211334641923e-05, + "loss": 0.5216, + "step": 1826 + }, + { + "epoch": 0.40753959402186035, + "grad_norm": 0.19430842995643616, + "learning_rate": 1.918628169473631e-05, + "loss": 0.532, + "step": 1827 + }, + { + "epoch": 0.40776265893374974, + "grad_norm": 0.1647733449935913, + "learning_rate": 1.9185351546036625e-05, + "loss": 0.4991, + "step": 1828 + }, + { + "epoch": 0.4079857238456391, + "grad_norm": 0.17242863774299622, + "learning_rate": 1.9184420888594398e-05, + "loss": 0.5095, + "step": 1829 + }, + { + "epoch": 0.4082087887575284, + "grad_norm": 0.15754511952400208, + "learning_rate": 1.9183489722461167e-05, + "loss": 0.5185, + "step": 1830 + }, + { + "epoch": 0.4084318536694178, + "grad_norm": 0.16371309757232666, + "learning_rate": 1.918255804768851e-05, + "loss": 0.5146, + "step": 1831 + }, + { + "epoch": 0.40865491858130715, + "grad_norm": 0.17599989473819733, + "learning_rate": 1.918162586432803e-05, + "loss": 0.5509, + "step": 1832 + }, + { + "epoch": 0.40887798349319654, + "grad_norm": 0.17950280010700226, + "learning_rate": 1.9180693172431353e-05, + "loss": 0.5236, + "step": 1833 + }, + { + "epoch": 0.4091010484050859, + "grad_norm": 0.1845085769891739, + "learning_rate": 1.917975997205014e-05, + "loss": 0.5155, + "step": 1834 + }, + { + "epoch": 0.4093241133169752, + "grad_norm": 0.16801810264587402, + "learning_rate": 1.9178826263236076e-05, + "loss": 0.5265, + "step": 1835 + }, + { + "epoch": 0.4095471782288646, + "grad_norm": 0.16314861178398132, + "learning_rate": 1.9177892046040875e-05, + "loss": 0.4922, + "step": 1836 + }, + { + "epoch": 0.40977024314075394, + "grad_norm": 0.16442064940929413, + "learning_rate": 1.9176957320516287e-05, + "loss": 0.5004, + "step": 1837 + }, + { + "epoch": 0.40999330805264333, + "grad_norm": 0.16156339645385742, + "learning_rate": 1.917602208671407e-05, + "loss": 0.5172, + "step": 1838 + }, + { + "epoch": 0.41021637296453267, + "grad_norm": 0.17251324653625488, + "learning_rate": 1.9175086344686035e-05, + "loss": 0.5432, + "step": 1839 + }, + { + "epoch": 0.41043943787642206, + "grad_norm": 0.1891004890203476, + "learning_rate": 1.9174150094484e-05, + "loss": 0.5276, + "step": 1840 + }, + { + "epoch": 0.4106625027883114, + "grad_norm": 0.1750495731830597, + "learning_rate": 1.917321333615983e-05, + "loss": 0.548, + "step": 1841 + }, + { + "epoch": 0.41088556770020074, + "grad_norm": 0.1698615849018097, + "learning_rate": 1.91722760697654e-05, + "loss": 0.5276, + "step": 1842 + }, + { + "epoch": 0.41110863261209013, + "grad_norm": 0.16396887600421906, + "learning_rate": 1.917133829535263e-05, + "loss": 0.5211, + "step": 1843 + }, + { + "epoch": 0.41133169752397947, + "grad_norm": 0.18432539701461792, + "learning_rate": 1.917040001297345e-05, + "loss": 0.5276, + "step": 1844 + }, + { + "epoch": 0.41155476243586886, + "grad_norm": 0.17017246782779694, + "learning_rate": 1.9169461222679836e-05, + "loss": 0.5312, + "step": 1845 + }, + { + "epoch": 0.4117778273477582, + "grad_norm": 0.16993820667266846, + "learning_rate": 1.9168521924523782e-05, + "loss": 0.4981, + "step": 1846 + }, + { + "epoch": 0.41200089225964753, + "grad_norm": 0.1594426929950714, + "learning_rate": 1.916758211855731e-05, + "loss": 0.5162, + "step": 1847 + }, + { + "epoch": 0.4122239571715369, + "grad_norm": 0.16228000819683075, + "learning_rate": 1.9166641804832474e-05, + "loss": 0.5086, + "step": 1848 + }, + { + "epoch": 0.41244702208342626, + "grad_norm": 0.2065616101026535, + "learning_rate": 1.9165700983401354e-05, + "loss": 0.4916, + "step": 1849 + }, + { + "epoch": 0.41267008699531565, + "grad_norm": 0.16846513748168945, + "learning_rate": 1.916475965431606e-05, + "loss": 0.5246, + "step": 1850 + }, + { + "epoch": 0.412893151907205, + "grad_norm": 0.19557945430278778, + "learning_rate": 1.9163817817628728e-05, + "loss": 0.5208, + "step": 1851 + }, + { + "epoch": 0.4131162168190944, + "grad_norm": 0.17081156373023987, + "learning_rate": 1.916287547339152e-05, + "loss": 0.5534, + "step": 1852 + }, + { + "epoch": 0.4133392817309837, + "grad_norm": 0.16580162942409515, + "learning_rate": 1.9161932621656634e-05, + "loss": 0.5152, + "step": 1853 + }, + { + "epoch": 0.41356234664287306, + "grad_norm": 0.18416714668273926, + "learning_rate": 1.9160989262476288e-05, + "loss": 0.5064, + "step": 1854 + }, + { + "epoch": 0.41378541155476245, + "grad_norm": 0.16161711513996124, + "learning_rate": 1.916004539590273e-05, + "loss": 0.5008, + "step": 1855 + }, + { + "epoch": 0.4140084764666518, + "grad_norm": 0.17786547541618347, + "learning_rate": 1.9159101021988244e-05, + "loss": 0.5168, + "step": 1856 + }, + { + "epoch": 0.4142315413785412, + "grad_norm": 0.15692083537578583, + "learning_rate": 1.9158156140785125e-05, + "loss": 0.5026, + "step": 1857 + }, + { + "epoch": 0.4144546062904305, + "grad_norm": 0.22050227224826813, + "learning_rate": 1.9157210752345713e-05, + "loss": 0.4936, + "step": 1858 + }, + { + "epoch": 0.41467767120231985, + "grad_norm": 0.20180164277553558, + "learning_rate": 1.915626485672237e-05, + "loss": 0.5347, + "step": 1859 + }, + { + "epoch": 0.41490073611420925, + "grad_norm": 0.156357541680336, + "learning_rate": 1.9155318453967483e-05, + "loss": 0.4816, + "step": 1860 + }, + { + "epoch": 0.4151238010260986, + "grad_norm": 0.16347983479499817, + "learning_rate": 1.9154371544133472e-05, + "loss": 0.5032, + "step": 1861 + }, + { + "epoch": 0.415346865937988, + "grad_norm": 0.16571176052093506, + "learning_rate": 1.9153424127272783e-05, + "loss": 0.4875, + "step": 1862 + }, + { + "epoch": 0.4155699308498773, + "grad_norm": 0.16103731095790863, + "learning_rate": 1.9152476203437884e-05, + "loss": 0.5266, + "step": 1863 + }, + { + "epoch": 0.41579299576176665, + "grad_norm": 0.17397384345531464, + "learning_rate": 1.915152777268128e-05, + "loss": 0.5114, + "step": 1864 + }, + { + "epoch": 0.41601606067365604, + "grad_norm": 0.16579222679138184, + "learning_rate": 1.9150578835055507e-05, + "loss": 0.5219, + "step": 1865 + }, + { + "epoch": 0.4162391255855454, + "grad_norm": 0.1724756360054016, + "learning_rate": 1.914962939061312e-05, + "loss": 0.4942, + "step": 1866 + }, + { + "epoch": 0.41646219049743477, + "grad_norm": 0.16709351539611816, + "learning_rate": 1.9148679439406704e-05, + "loss": 0.514, + "step": 1867 + }, + { + "epoch": 0.4166852554093241, + "grad_norm": 0.15628811717033386, + "learning_rate": 1.914772898148887e-05, + "loss": 0.5153, + "step": 1868 + }, + { + "epoch": 0.4169083203212135, + "grad_norm": 0.16881687939167023, + "learning_rate": 1.914677801691226e-05, + "loss": 0.4924, + "step": 1869 + }, + { + "epoch": 0.41713138523310284, + "grad_norm": 0.16054266691207886, + "learning_rate": 1.9145826545729555e-05, + "loss": 0.5238, + "step": 1870 + }, + { + "epoch": 0.4173544501449922, + "grad_norm": 0.1636224240064621, + "learning_rate": 1.9144874567993446e-05, + "loss": 0.4813, + "step": 1871 + }, + { + "epoch": 0.41757751505688157, + "grad_norm": 0.16189956665039062, + "learning_rate": 1.9143922083756656e-05, + "loss": 0.5211, + "step": 1872 + }, + { + "epoch": 0.4178005799687709, + "grad_norm": 0.16065020859241486, + "learning_rate": 1.9142969093071944e-05, + "loss": 0.5116, + "step": 1873 + }, + { + "epoch": 0.4180236448806603, + "grad_norm": 0.17001986503601074, + "learning_rate": 1.9142015595992096e-05, + "loss": 0.5202, + "step": 1874 + }, + { + "epoch": 0.41824670979254963, + "grad_norm": 0.16956806182861328, + "learning_rate": 1.9141061592569913e-05, + "loss": 0.4941, + "step": 1875 + }, + { + "epoch": 0.41846977470443897, + "grad_norm": 0.16168902814388275, + "learning_rate": 1.9140107082858243e-05, + "loss": 0.5134, + "step": 1876 + }, + { + "epoch": 0.41869283961632836, + "grad_norm": 0.16588400304317474, + "learning_rate": 1.9139152066909948e-05, + "loss": 0.4899, + "step": 1877 + }, + { + "epoch": 0.4189159045282177, + "grad_norm": 0.16289415955543518, + "learning_rate": 1.9138196544777925e-05, + "loss": 0.482, + "step": 1878 + }, + { + "epoch": 0.4191389694401071, + "grad_norm": 0.17892742156982422, + "learning_rate": 1.9137240516515094e-05, + "loss": 0.5178, + "step": 1879 + }, + { + "epoch": 0.4193620343519964, + "grad_norm": 0.17155860364437103, + "learning_rate": 1.913628398217441e-05, + "loss": 0.5099, + "step": 1880 + }, + { + "epoch": 0.41958509926388576, + "grad_norm": 0.1801706701517105, + "learning_rate": 1.913532694180885e-05, + "loss": 0.4966, + "step": 1881 + }, + { + "epoch": 0.41980816417577516, + "grad_norm": 0.16115552186965942, + "learning_rate": 1.9134369395471416e-05, + "loss": 0.4839, + "step": 1882 + }, + { + "epoch": 0.4200312290876645, + "grad_norm": 0.16582123935222626, + "learning_rate": 1.913341134321515e-05, + "loss": 0.4719, + "step": 1883 + }, + { + "epoch": 0.4202542939995539, + "grad_norm": 0.16009938716888428, + "learning_rate": 1.9132452785093113e-05, + "loss": 0.5096, + "step": 1884 + }, + { + "epoch": 0.4204773589114432, + "grad_norm": 0.17301099002361298, + "learning_rate": 1.9131493721158395e-05, + "loss": 0.5295, + "step": 1885 + }, + { + "epoch": 0.4207004238233326, + "grad_norm": 0.16522350907325745, + "learning_rate": 1.9130534151464116e-05, + "loss": 0.5386, + "step": 1886 + }, + { + "epoch": 0.42092348873522195, + "grad_norm": 0.1624733954668045, + "learning_rate": 1.9129574076063423e-05, + "loss": 0.5226, + "step": 1887 + }, + { + "epoch": 0.4211465536471113, + "grad_norm": 0.16734281182289124, + "learning_rate": 1.9128613495009487e-05, + "loss": 0.5367, + "step": 1888 + }, + { + "epoch": 0.4213696185590007, + "grad_norm": 0.15870822966098785, + "learning_rate": 1.912765240835552e-05, + "loss": 0.4597, + "step": 1889 + }, + { + "epoch": 0.42159268347089, + "grad_norm": 0.16024811565876007, + "learning_rate": 1.912669081615474e-05, + "loss": 0.4794, + "step": 1890 + }, + { + "epoch": 0.4218157483827794, + "grad_norm": 0.16861672699451447, + "learning_rate": 1.912572871846042e-05, + "loss": 0.4918, + "step": 1891 + }, + { + "epoch": 0.42203881329466875, + "grad_norm": 0.16772149503231049, + "learning_rate": 1.9124766115325837e-05, + "loss": 0.5262, + "step": 1892 + }, + { + "epoch": 0.4222618782065581, + "grad_norm": 0.16915294528007507, + "learning_rate": 1.912380300680431e-05, + "loss": 0.4926, + "step": 1893 + }, + { + "epoch": 0.4224849431184475, + "grad_norm": 0.16086746752262115, + "learning_rate": 1.912283939294918e-05, + "loss": 0.5091, + "step": 1894 + }, + { + "epoch": 0.4227080080303368, + "grad_norm": 0.18025851249694824, + "learning_rate": 1.912187527381382e-05, + "loss": 0.5147, + "step": 1895 + }, + { + "epoch": 0.4229310729422262, + "grad_norm": 0.15781170129776, + "learning_rate": 1.9120910649451632e-05, + "loss": 0.501, + "step": 1896 + }, + { + "epoch": 0.42315413785411554, + "grad_norm": 0.15790554881095886, + "learning_rate": 1.9119945519916036e-05, + "loss": 0.4963, + "step": 1897 + }, + { + "epoch": 0.42337720276600493, + "grad_norm": 0.16987471282482147, + "learning_rate": 1.9118979885260493e-05, + "loss": 0.5148, + "step": 1898 + }, + { + "epoch": 0.42360026767789427, + "grad_norm": 0.16438226401805878, + "learning_rate": 1.9118013745538483e-05, + "loss": 0.5348, + "step": 1899 + }, + { + "epoch": 0.4238233325897836, + "grad_norm": 0.18299153447151184, + "learning_rate": 1.9117047100803513e-05, + "loss": 0.5055, + "step": 1900 + }, + { + "epoch": 0.424046397501673, + "grad_norm": 0.18117006123065948, + "learning_rate": 1.911607995110913e-05, + "loss": 0.517, + "step": 1901 + }, + { + "epoch": 0.42426946241356234, + "grad_norm": 0.19496211409568787, + "learning_rate": 1.9115112296508896e-05, + "loss": 0.5216, + "step": 1902 + }, + { + "epoch": 0.42449252732545173, + "grad_norm": 0.17908519506454468, + "learning_rate": 1.9114144137056406e-05, + "loss": 0.5386, + "step": 1903 + }, + { + "epoch": 0.42471559223734107, + "grad_norm": 0.18684661388397217, + "learning_rate": 1.9113175472805284e-05, + "loss": 0.5341, + "step": 1904 + }, + { + "epoch": 0.4249386571492304, + "grad_norm": 0.1662161499261856, + "learning_rate": 1.9112206303809183e-05, + "loss": 0.4824, + "step": 1905 + }, + { + "epoch": 0.4251617220611198, + "grad_norm": 0.1687028408050537, + "learning_rate": 1.9111236630121775e-05, + "loss": 0.5026, + "step": 1906 + }, + { + "epoch": 0.42538478697300913, + "grad_norm": 0.20177949965000153, + "learning_rate": 1.9110266451796772e-05, + "loss": 0.5112, + "step": 1907 + }, + { + "epoch": 0.4256078518848985, + "grad_norm": 0.16266430914402008, + "learning_rate": 1.9109295768887907e-05, + "loss": 0.5011, + "step": 1908 + }, + { + "epoch": 0.42583091679678786, + "grad_norm": 0.1639285683631897, + "learning_rate": 1.910832458144894e-05, + "loss": 0.5147, + "step": 1909 + }, + { + "epoch": 0.4260539817086772, + "grad_norm": 0.16959701478481293, + "learning_rate": 1.9107352889533667e-05, + "loss": 0.513, + "step": 1910 + }, + { + "epoch": 0.4262770466205666, + "grad_norm": 0.16716282069683075, + "learning_rate": 1.9106380693195903e-05, + "loss": 0.5023, + "step": 1911 + }, + { + "epoch": 0.42650011153245593, + "grad_norm": 0.16967961192131042, + "learning_rate": 1.9105407992489495e-05, + "loss": 0.5185, + "step": 1912 + }, + { + "epoch": 0.4267231764443453, + "grad_norm": 0.16723939776420593, + "learning_rate": 1.9104434787468316e-05, + "loss": 0.5013, + "step": 1913 + }, + { + "epoch": 0.42694624135623466, + "grad_norm": 0.16064821183681488, + "learning_rate": 1.9103461078186268e-05, + "loss": 0.5109, + "step": 1914 + }, + { + "epoch": 0.42716930626812405, + "grad_norm": 0.17482173442840576, + "learning_rate": 1.9102486864697285e-05, + "loss": 0.5148, + "step": 1915 + }, + { + "epoch": 0.4273923711800134, + "grad_norm": 0.1638958603143692, + "learning_rate": 1.910151214705532e-05, + "loss": 0.527, + "step": 1916 + }, + { + "epoch": 0.4276154360919027, + "grad_norm": 0.18999075889587402, + "learning_rate": 1.9100536925314363e-05, + "loss": 0.5, + "step": 1917 + }, + { + "epoch": 0.4278385010037921, + "grad_norm": 0.18148307502269745, + "learning_rate": 1.9099561199528425e-05, + "loss": 0.4862, + "step": 1918 + }, + { + "epoch": 0.42806156591568145, + "grad_norm": 0.17788200080394745, + "learning_rate": 1.909858496975155e-05, + "loss": 0.4991, + "step": 1919 + }, + { + "epoch": 0.42828463082757084, + "grad_norm": 0.1629716157913208, + "learning_rate": 1.9097608236037813e-05, + "loss": 0.5292, + "step": 1920 + }, + { + "epoch": 0.4285076957394602, + "grad_norm": 0.1686851680278778, + "learning_rate": 1.9096630998441298e-05, + "loss": 0.5156, + "step": 1921 + }, + { + "epoch": 0.4287307606513495, + "grad_norm": 0.1803978681564331, + "learning_rate": 1.909565325701614e-05, + "loss": 0.5193, + "step": 1922 + }, + { + "epoch": 0.4289538255632389, + "grad_norm": 0.17601296305656433, + "learning_rate": 1.9094675011816496e-05, + "loss": 0.5193, + "step": 1923 + }, + { + "epoch": 0.42917689047512825, + "grad_norm": 0.17354577779769897, + "learning_rate": 1.9093696262896535e-05, + "loss": 0.5055, + "step": 1924 + }, + { + "epoch": 0.42939995538701764, + "grad_norm": 0.1659156084060669, + "learning_rate": 1.9092717010310476e-05, + "loss": 0.5232, + "step": 1925 + }, + { + "epoch": 0.429623020298907, + "grad_norm": 0.1605677753686905, + "learning_rate": 1.909173725411255e-05, + "loss": 0.4916, + "step": 1926 + }, + { + "epoch": 0.42984608521079637, + "grad_norm": 0.17374806106090546, + "learning_rate": 1.9090756994357035e-05, + "loss": 0.5259, + "step": 1927 + }, + { + "epoch": 0.4300691501226857, + "grad_norm": 0.16589613258838654, + "learning_rate": 1.9089776231098204e-05, + "loss": 0.5192, + "step": 1928 + }, + { + "epoch": 0.43029221503457504, + "grad_norm": 0.16348575055599213, + "learning_rate": 1.9088794964390395e-05, + "loss": 0.4963, + "step": 1929 + }, + { + "epoch": 0.43051527994646444, + "grad_norm": 0.16494914889335632, + "learning_rate": 1.9087813194287948e-05, + "loss": 0.5359, + "step": 1930 + }, + { + "epoch": 0.43073834485835377, + "grad_norm": 0.16134381294250488, + "learning_rate": 1.9086830920845242e-05, + "loss": 0.5087, + "step": 1931 + }, + { + "epoch": 0.43096140977024316, + "grad_norm": 0.16159029304981232, + "learning_rate": 1.908584814411668e-05, + "loss": 0.5242, + "step": 1932 + }, + { + "epoch": 0.4311844746821325, + "grad_norm": 0.1648871749639511, + "learning_rate": 1.9084864864156696e-05, + "loss": 0.5069, + "step": 1933 + }, + { + "epoch": 0.43140753959402184, + "grad_norm": 0.17623218894004822, + "learning_rate": 1.9083881081019752e-05, + "loss": 0.5049, + "step": 1934 + }, + { + "epoch": 0.43163060450591123, + "grad_norm": 0.166509211063385, + "learning_rate": 1.9082896794760327e-05, + "loss": 0.4966, + "step": 1935 + }, + { + "epoch": 0.43185366941780057, + "grad_norm": 0.15902282297611237, + "learning_rate": 1.908191200543295e-05, + "loss": 0.5343, + "step": 1936 + }, + { + "epoch": 0.43207673432968996, + "grad_norm": 0.16570866107940674, + "learning_rate": 1.908092671309216e-05, + "loss": 0.4797, + "step": 1937 + }, + { + "epoch": 0.4322997992415793, + "grad_norm": 0.15843339264392853, + "learning_rate": 1.9079940917792524e-05, + "loss": 0.5359, + "step": 1938 + }, + { + "epoch": 0.43252286415346863, + "grad_norm": 0.16115322709083557, + "learning_rate": 1.9078954619588645e-05, + "loss": 0.5032, + "step": 1939 + }, + { + "epoch": 0.432745929065358, + "grad_norm": 0.16374099254608154, + "learning_rate": 1.9077967818535153e-05, + "loss": 0.4998, + "step": 1940 + }, + { + "epoch": 0.43296899397724736, + "grad_norm": 0.17378082871437073, + "learning_rate": 1.9076980514686695e-05, + "loss": 0.5403, + "step": 1941 + }, + { + "epoch": 0.43319205888913676, + "grad_norm": 0.16688857972621918, + "learning_rate": 1.9075992708097965e-05, + "loss": 0.5103, + "step": 1942 + }, + { + "epoch": 0.4334151238010261, + "grad_norm": 0.16233094036579132, + "learning_rate": 1.9075004398823665e-05, + "loss": 0.5045, + "step": 1943 + }, + { + "epoch": 0.4336381887129155, + "grad_norm": 0.16693229973316193, + "learning_rate": 1.907401558691854e-05, + "loss": 0.5167, + "step": 1944 + }, + { + "epoch": 0.4338612536248048, + "grad_norm": 0.16341574490070343, + "learning_rate": 1.9073026272437353e-05, + "loss": 0.5001, + "step": 1945 + }, + { + "epoch": 0.43408431853669416, + "grad_norm": 0.16043853759765625, + "learning_rate": 1.90720364554349e-05, + "loss": 0.5109, + "step": 1946 + }, + { + "epoch": 0.43430738344858355, + "grad_norm": 0.15996608138084412, + "learning_rate": 1.9071046135966e-05, + "loss": 0.4826, + "step": 1947 + }, + { + "epoch": 0.4345304483604729, + "grad_norm": 0.15902268886566162, + "learning_rate": 1.9070055314085508e-05, + "loss": 0.4989, + "step": 1948 + }, + { + "epoch": 0.4347535132723623, + "grad_norm": 0.16526652872562408, + "learning_rate": 1.9069063989848298e-05, + "loss": 0.5209, + "step": 1949 + }, + { + "epoch": 0.4349765781842516, + "grad_norm": 0.16759642958641052, + "learning_rate": 1.9068072163309282e-05, + "loss": 0.5246, + "step": 1950 + }, + { + "epoch": 0.43519964309614095, + "grad_norm": 0.17135794460773468, + "learning_rate": 1.9067079834523387e-05, + "loss": 0.5198, + "step": 1951 + }, + { + "epoch": 0.43542270800803035, + "grad_norm": 0.16336077451705933, + "learning_rate": 1.9066087003545576e-05, + "loss": 0.5098, + "step": 1952 + }, + { + "epoch": 0.4356457729199197, + "grad_norm": 0.1512858122587204, + "learning_rate": 1.9065093670430836e-05, + "loss": 0.4843, + "step": 1953 + }, + { + "epoch": 0.4358688378318091, + "grad_norm": 0.17053045332431793, + "learning_rate": 1.9064099835234188e-05, + "loss": 0.527, + "step": 1954 + }, + { + "epoch": 0.4360919027436984, + "grad_norm": 0.16126702725887299, + "learning_rate": 1.9063105498010678e-05, + "loss": 0.5222, + "step": 1955 + }, + { + "epoch": 0.43631496765558775, + "grad_norm": 0.1633397489786148, + "learning_rate": 1.9062110658815375e-05, + "loss": 0.5124, + "step": 1956 + }, + { + "epoch": 0.43653803256747714, + "grad_norm": 0.18700194358825684, + "learning_rate": 1.9061115317703384e-05, + "loss": 0.5123, + "step": 1957 + }, + { + "epoch": 0.4367610974793665, + "grad_norm": 0.1907283067703247, + "learning_rate": 1.9060119474729826e-05, + "loss": 0.5223, + "step": 1958 + }, + { + "epoch": 0.43698416239125587, + "grad_norm": 0.1634737253189087, + "learning_rate": 1.9059123129949865e-05, + "loss": 0.4879, + "step": 1959 + }, + { + "epoch": 0.4372072273031452, + "grad_norm": 0.18314914405345917, + "learning_rate": 1.9058126283418675e-05, + "loss": 0.5018, + "step": 1960 + }, + { + "epoch": 0.4374302922150346, + "grad_norm": 0.1680321991443634, + "learning_rate": 1.9057128935191477e-05, + "loss": 0.4975, + "step": 1961 + }, + { + "epoch": 0.43765335712692394, + "grad_norm": 0.1635400801897049, + "learning_rate": 1.9056131085323506e-05, + "loss": 0.4895, + "step": 1962 + }, + { + "epoch": 0.4378764220388133, + "grad_norm": 0.15011341869831085, + "learning_rate": 1.905513273387003e-05, + "loss": 0.4765, + "step": 1963 + }, + { + "epoch": 0.43809948695070267, + "grad_norm": 0.16991405189037323, + "learning_rate": 1.9054133880886348e-05, + "loss": 0.518, + "step": 1964 + }, + { + "epoch": 0.438322551862592, + "grad_norm": 0.17523062229156494, + "learning_rate": 1.9053134526427777e-05, + "loss": 0.4764, + "step": 1965 + }, + { + "epoch": 0.4385456167744814, + "grad_norm": 0.1695917248725891, + "learning_rate": 1.905213467054967e-05, + "loss": 0.5111, + "step": 1966 + }, + { + "epoch": 0.43876868168637073, + "grad_norm": 0.16954153776168823, + "learning_rate": 1.90511343133074e-05, + "loss": 0.505, + "step": 1967 + }, + { + "epoch": 0.43899174659826007, + "grad_norm": 0.3032020628452301, + "learning_rate": 1.905013345475638e-05, + "loss": 0.5037, + "step": 1968 + }, + { + "epoch": 0.43921481151014946, + "grad_norm": 0.16867168247699738, + "learning_rate": 1.9049132094952046e-05, + "loss": 0.5596, + "step": 1969 + }, + { + "epoch": 0.4394378764220388, + "grad_norm": 0.16780449450016022, + "learning_rate": 1.904813023394985e-05, + "loss": 0.4937, + "step": 1970 + }, + { + "epoch": 0.4396609413339282, + "grad_norm": 0.19413304328918457, + "learning_rate": 1.904712787180529e-05, + "loss": 0.4823, + "step": 1971 + }, + { + "epoch": 0.4398840062458175, + "grad_norm": 0.16119445860385895, + "learning_rate": 1.9046125008573876e-05, + "loss": 0.5451, + "step": 1972 + }, + { + "epoch": 0.4401070711577069, + "grad_norm": 0.1607835441827774, + "learning_rate": 1.904512164431116e-05, + "loss": 0.5066, + "step": 1973 + }, + { + "epoch": 0.44033013606959626, + "grad_norm": 0.1697167158126831, + "learning_rate": 1.9044117779072708e-05, + "loss": 0.5166, + "step": 1974 + }, + { + "epoch": 0.4405532009814856, + "grad_norm": 0.16189232468605042, + "learning_rate": 1.9043113412914128e-05, + "loss": 0.4968, + "step": 1975 + }, + { + "epoch": 0.440776265893375, + "grad_norm": 0.1717143952846527, + "learning_rate": 1.904210854589104e-05, + "loss": 0.481, + "step": 1976 + }, + { + "epoch": 0.4409993308052643, + "grad_norm": 0.17401723563671112, + "learning_rate": 1.9041103178059107e-05, + "loss": 0.526, + "step": 1977 + }, + { + "epoch": 0.4412223957171537, + "grad_norm": 0.16598494350910187, + "learning_rate": 1.9040097309474007e-05, + "loss": 0.5195, + "step": 1978 + }, + { + "epoch": 0.44144546062904305, + "grad_norm": 0.21129997074604034, + "learning_rate": 1.9039090940191455e-05, + "loss": 0.5096, + "step": 1979 + }, + { + "epoch": 0.4416685255409324, + "grad_norm": 0.17546051740646362, + "learning_rate": 1.9038084070267186e-05, + "loss": 0.5313, + "step": 1980 + }, + { + "epoch": 0.4418915904528218, + "grad_norm": 0.15967412292957306, + "learning_rate": 1.9037076699756973e-05, + "loss": 0.5183, + "step": 1981 + }, + { + "epoch": 0.4421146553647111, + "grad_norm": 0.17214882373809814, + "learning_rate": 1.9036068828716603e-05, + "loss": 0.5091, + "step": 1982 + }, + { + "epoch": 0.4423377202766005, + "grad_norm": 0.184258371591568, + "learning_rate": 1.9035060457201904e-05, + "loss": 0.4785, + "step": 1983 + }, + { + "epoch": 0.44256078518848985, + "grad_norm": 0.1570109874010086, + "learning_rate": 1.9034051585268725e-05, + "loss": 0.515, + "step": 1984 + }, + { + "epoch": 0.4427838501003792, + "grad_norm": 0.17051473259925842, + "learning_rate": 1.903304221297294e-05, + "loss": 0.5031, + "step": 1985 + }, + { + "epoch": 0.4430069150122686, + "grad_norm": 0.19998317956924438, + "learning_rate": 1.903203234037046e-05, + "loss": 0.499, + "step": 1986 + }, + { + "epoch": 0.4432299799241579, + "grad_norm": 0.16017475724220276, + "learning_rate": 1.9031021967517213e-05, + "loss": 0.5237, + "step": 1987 + }, + { + "epoch": 0.4434530448360473, + "grad_norm": 0.18142235279083252, + "learning_rate": 1.9030011094469164e-05, + "loss": 0.5373, + "step": 1988 + }, + { + "epoch": 0.44367610974793664, + "grad_norm": 0.16957718133926392, + "learning_rate": 1.90289997212823e-05, + "loss": 0.5038, + "step": 1989 + }, + { + "epoch": 0.44389917465982603, + "grad_norm": 0.16631929576396942, + "learning_rate": 1.9027987848012635e-05, + "loss": 0.4873, + "step": 1990 + }, + { + "epoch": 0.44412223957171537, + "grad_norm": 0.16362012922763824, + "learning_rate": 1.9026975474716215e-05, + "loss": 0.5204, + "step": 1991 + }, + { + "epoch": 0.4443453044836047, + "grad_norm": 0.39765313267707825, + "learning_rate": 1.902596260144911e-05, + "loss": 0.4736, + "step": 1992 + }, + { + "epoch": 0.4445683693954941, + "grad_norm": 0.17083793878555298, + "learning_rate": 1.9024949228267423e-05, + "loss": 0.5152, + "step": 1993 + }, + { + "epoch": 0.44479143430738344, + "grad_norm": 0.1833147555589676, + "learning_rate": 1.902393535522728e-05, + "loss": 0.536, + "step": 1994 + }, + { + "epoch": 0.44501449921927283, + "grad_norm": 0.17427003383636475, + "learning_rate": 1.902292098238483e-05, + "loss": 0.514, + "step": 1995 + }, + { + "epoch": 0.44523756413116217, + "grad_norm": 0.15868180990219116, + "learning_rate": 1.902190610979626e-05, + "loss": 0.4869, + "step": 1996 + }, + { + "epoch": 0.4454606290430515, + "grad_norm": 0.17407076060771942, + "learning_rate": 1.9020890737517783e-05, + "loss": 0.5134, + "step": 1997 + }, + { + "epoch": 0.4456836939549409, + "grad_norm": 0.16776099801063538, + "learning_rate": 1.901987486560563e-05, + "loss": 0.5079, + "step": 1998 + }, + { + "epoch": 0.44590675886683023, + "grad_norm": 0.2472338080406189, + "learning_rate": 1.9018858494116074e-05, + "loss": 0.5128, + "step": 1999 + }, + { + "epoch": 0.4461298237787196, + "grad_norm": 0.16303667426109314, + "learning_rate": 1.90178416231054e-05, + "loss": 0.5225, + "step": 2000 + }, + { + "epoch": 0.44635288869060896, + "grad_norm": 0.17127907276153564, + "learning_rate": 1.901682425262993e-05, + "loss": 0.5092, + "step": 2001 + }, + { + "epoch": 0.44657595360249835, + "grad_norm": 0.17813526093959808, + "learning_rate": 1.9015806382746018e-05, + "loss": 0.5, + "step": 2002 + }, + { + "epoch": 0.4467990185143877, + "grad_norm": 0.1653776615858078, + "learning_rate": 1.901478801351004e-05, + "loss": 0.5051, + "step": 2003 + }, + { + "epoch": 0.44702208342627703, + "grad_norm": 0.16295598447322845, + "learning_rate": 1.9013769144978392e-05, + "loss": 0.4955, + "step": 2004 + }, + { + "epoch": 0.4472451483381664, + "grad_norm": 0.18259268999099731, + "learning_rate": 1.901274977720751e-05, + "loss": 0.5504, + "step": 2005 + }, + { + "epoch": 0.44746821325005576, + "grad_norm": 0.19138063490390778, + "learning_rate": 1.9011729910253856e-05, + "loss": 0.5129, + "step": 2006 + }, + { + "epoch": 0.44769127816194515, + "grad_norm": 0.1588236689567566, + "learning_rate": 1.9010709544173913e-05, + "loss": 0.5078, + "step": 2007 + }, + { + "epoch": 0.4479143430738345, + "grad_norm": 0.162478968501091, + "learning_rate": 1.900968867902419e-05, + "loss": 0.5121, + "step": 2008 + }, + { + "epoch": 0.4481374079857238, + "grad_norm": 0.17752927541732788, + "learning_rate": 1.900866731486124e-05, + "loss": 0.4983, + "step": 2009 + }, + { + "epoch": 0.4483604728976132, + "grad_norm": 0.17127980291843414, + "learning_rate": 1.900764545174163e-05, + "loss": 0.5347, + "step": 2010 + }, + { + "epoch": 0.44858353780950255, + "grad_norm": 0.1686343103647232, + "learning_rate": 1.900662308972195e-05, + "loss": 0.5107, + "step": 2011 + }, + { + "epoch": 0.44880660272139195, + "grad_norm": 0.16008520126342773, + "learning_rate": 1.9005600228858832e-05, + "loss": 0.5036, + "step": 2012 + }, + { + "epoch": 0.4490296676332813, + "grad_norm": 0.16494229435920715, + "learning_rate": 1.9004576869208922e-05, + "loss": 0.5135, + "step": 2013 + }, + { + "epoch": 0.4492527325451706, + "grad_norm": 0.17318038642406464, + "learning_rate": 1.9003553010828906e-05, + "loss": 0.5279, + "step": 2014 + }, + { + "epoch": 0.44947579745706, + "grad_norm": 0.252083420753479, + "learning_rate": 1.9002528653775492e-05, + "loss": 0.5176, + "step": 2015 + }, + { + "epoch": 0.44969886236894935, + "grad_norm": 0.1737111508846283, + "learning_rate": 1.900150379810541e-05, + "loss": 0.5051, + "step": 2016 + }, + { + "epoch": 0.44992192728083874, + "grad_norm": 0.172331303358078, + "learning_rate": 1.9000478443875427e-05, + "loss": 0.5161, + "step": 2017 + }, + { + "epoch": 0.4501449921927281, + "grad_norm": 0.17037436366081238, + "learning_rate": 1.899945259114233e-05, + "loss": 0.5282, + "step": 2018 + }, + { + "epoch": 0.45036805710461747, + "grad_norm": 0.16774620115756989, + "learning_rate": 1.8998426239962945e-05, + "loss": 0.5056, + "step": 2019 + }, + { + "epoch": 0.4505911220165068, + "grad_norm": 0.17856718599796295, + "learning_rate": 1.899739939039411e-05, + "loss": 0.514, + "step": 2020 + }, + { + "epoch": 0.45081418692839614, + "grad_norm": 0.18140466511249542, + "learning_rate": 1.89963720424927e-05, + "loss": 0.5372, + "step": 2021 + }, + { + "epoch": 0.45103725184028554, + "grad_norm": 0.1641564816236496, + "learning_rate": 1.8995344196315618e-05, + "loss": 0.5205, + "step": 2022 + }, + { + "epoch": 0.4512603167521749, + "grad_norm": 0.16866321861743927, + "learning_rate": 1.899431585191979e-05, + "loss": 0.4906, + "step": 2023 + }, + { + "epoch": 0.45148338166406426, + "grad_norm": 0.16418704390525818, + "learning_rate": 1.8993287009362175e-05, + "loss": 0.5275, + "step": 2024 + }, + { + "epoch": 0.4517064465759536, + "grad_norm": 0.15996740758419037, + "learning_rate": 1.8992257668699756e-05, + "loss": 0.5124, + "step": 2025 + }, + { + "epoch": 0.45192951148784294, + "grad_norm": 0.16520540416240692, + "learning_rate": 1.899122782998954e-05, + "loss": 0.5045, + "step": 2026 + }, + { + "epoch": 0.45215257639973233, + "grad_norm": 0.16536462306976318, + "learning_rate": 1.8990197493288575e-05, + "loss": 0.5003, + "step": 2027 + }, + { + "epoch": 0.45237564131162167, + "grad_norm": 0.16777758300304413, + "learning_rate": 1.8989166658653916e-05, + "loss": 0.5443, + "step": 2028 + }, + { + "epoch": 0.45259870622351106, + "grad_norm": 0.16215604543685913, + "learning_rate": 1.8988135326142668e-05, + "loss": 0.4965, + "step": 2029 + }, + { + "epoch": 0.4528217711354004, + "grad_norm": 0.1622023582458496, + "learning_rate": 1.8987103495811947e-05, + "loss": 0.4606, + "step": 2030 + }, + { + "epoch": 0.45304483604728973, + "grad_norm": 0.16461379826068878, + "learning_rate": 1.8986071167718902e-05, + "loss": 0.5273, + "step": 2031 + }, + { + "epoch": 0.4532679009591791, + "grad_norm": 0.16976390779018402, + "learning_rate": 1.8985038341920715e-05, + "loss": 0.4889, + "step": 2032 + }, + { + "epoch": 0.45349096587106846, + "grad_norm": 0.1569637656211853, + "learning_rate": 1.898400501847458e-05, + "loss": 0.4828, + "step": 2033 + }, + { + "epoch": 0.45371403078295786, + "grad_norm": 0.16640910506248474, + "learning_rate": 1.898297119743774e-05, + "loss": 0.5351, + "step": 2034 + }, + { + "epoch": 0.4539370956948472, + "grad_norm": 0.1644599735736847, + "learning_rate": 1.898193687886745e-05, + "loss": 0.4869, + "step": 2035 + }, + { + "epoch": 0.4541601606067366, + "grad_norm": 0.16732025146484375, + "learning_rate": 1.8980902062820997e-05, + "loss": 0.4893, + "step": 2036 + }, + { + "epoch": 0.4543832255186259, + "grad_norm": 0.16019105911254883, + "learning_rate": 1.8979866749355694e-05, + "loss": 0.5038, + "step": 2037 + }, + { + "epoch": 0.45460629043051526, + "grad_norm": 0.17239058017730713, + "learning_rate": 1.8978830938528884e-05, + "loss": 0.5106, + "step": 2038 + }, + { + "epoch": 0.45482935534240465, + "grad_norm": 0.15983308851718903, + "learning_rate": 1.8977794630397942e-05, + "loss": 0.5163, + "step": 2039 + }, + { + "epoch": 0.455052420254294, + "grad_norm": 0.16848912835121155, + "learning_rate": 1.8976757825020255e-05, + "loss": 0.5462, + "step": 2040 + }, + { + "epoch": 0.4552754851661834, + "grad_norm": 0.15303179621696472, + "learning_rate": 1.897572052245326e-05, + "loss": 0.4594, + "step": 2041 + }, + { + "epoch": 0.4554985500780727, + "grad_norm": 0.17675776779651642, + "learning_rate": 1.8974682722754397e-05, + "loss": 0.5385, + "step": 2042 + }, + { + "epoch": 0.45572161498996205, + "grad_norm": 0.17349328100681305, + "learning_rate": 1.8973644425981154e-05, + "loss": 0.4953, + "step": 2043 + }, + { + "epoch": 0.45594467990185145, + "grad_norm": 0.15753738582134247, + "learning_rate": 1.897260563219104e-05, + "loss": 0.4851, + "step": 2044 + }, + { + "epoch": 0.4561677448137408, + "grad_norm": 0.16278420388698578, + "learning_rate": 1.897156634144158e-05, + "loss": 0.4994, + "step": 2045 + }, + { + "epoch": 0.4563908097256302, + "grad_norm": 0.1592012643814087, + "learning_rate": 1.8970526553790346e-05, + "loss": 0.513, + "step": 2046 + }, + { + "epoch": 0.4566138746375195, + "grad_norm": 0.18133483827114105, + "learning_rate": 1.8969486269294922e-05, + "loss": 0.4734, + "step": 2047 + }, + { + "epoch": 0.4568369395494089, + "grad_norm": 0.170905202627182, + "learning_rate": 1.8968445488012933e-05, + "loss": 0.5401, + "step": 2048 + }, + { + "epoch": 0.45706000446129824, + "grad_norm": 0.16522808372974396, + "learning_rate": 1.8967404210002014e-05, + "loss": 0.5008, + "step": 2049 + }, + { + "epoch": 0.4572830693731876, + "grad_norm": 0.18945086002349854, + "learning_rate": 1.8966362435319845e-05, + "loss": 0.5064, + "step": 2050 + }, + { + "epoch": 0.45750613428507697, + "grad_norm": 0.17545610666275024, + "learning_rate": 1.8965320164024123e-05, + "loss": 0.5105, + "step": 2051 + }, + { + "epoch": 0.4577291991969663, + "grad_norm": 0.1666799783706665, + "learning_rate": 1.8964277396172577e-05, + "loss": 0.5233, + "step": 2052 + }, + { + "epoch": 0.4579522641088557, + "grad_norm": 0.16869474947452545, + "learning_rate": 1.896323413182296e-05, + "loss": 0.4921, + "step": 2053 + }, + { + "epoch": 0.45817532902074504, + "grad_norm": 0.17108272016048431, + "learning_rate": 1.8962190371033057e-05, + "loss": 0.5358, + "step": 2054 + }, + { + "epoch": 0.4583983939326344, + "grad_norm": 0.16527149081230164, + "learning_rate": 1.8961146113860676e-05, + "loss": 0.5033, + "step": 2055 + }, + { + "epoch": 0.45862145884452377, + "grad_norm": 0.17335473001003265, + "learning_rate": 1.8960101360363656e-05, + "loss": 0.5156, + "step": 2056 + }, + { + "epoch": 0.4588445237564131, + "grad_norm": 0.17064844071865082, + "learning_rate": 1.895905611059986e-05, + "loss": 0.5118, + "step": 2057 + }, + { + "epoch": 0.4590675886683025, + "grad_norm": 0.16565892100334167, + "learning_rate": 1.8958010364627183e-05, + "loss": 0.5152, + "step": 2058 + }, + { + "epoch": 0.45929065358019183, + "grad_norm": 0.1769896149635315, + "learning_rate": 1.8956964122503546e-05, + "loss": 0.5483, + "step": 2059 + }, + { + "epoch": 0.45951371849208117, + "grad_norm": 0.1574466973543167, + "learning_rate": 1.895591738428689e-05, + "loss": 0.5205, + "step": 2060 + }, + { + "epoch": 0.45973678340397056, + "grad_norm": 0.16593651473522186, + "learning_rate": 1.8954870150035195e-05, + "loss": 0.4863, + "step": 2061 + }, + { + "epoch": 0.4599598483158599, + "grad_norm": 0.16003793478012085, + "learning_rate": 1.8953822419806468e-05, + "loss": 0.5051, + "step": 2062 + }, + { + "epoch": 0.4601829132277493, + "grad_norm": 0.17091605067253113, + "learning_rate": 1.895277419365873e-05, + "loss": 0.5037, + "step": 2063 + }, + { + "epoch": 0.4604059781396386, + "grad_norm": 0.17469583451747894, + "learning_rate": 1.895172547165004e-05, + "loss": 0.4878, + "step": 2064 + }, + { + "epoch": 0.460629043051528, + "grad_norm": 0.16121211647987366, + "learning_rate": 1.895067625383849e-05, + "loss": 0.4768, + "step": 2065 + }, + { + "epoch": 0.46085210796341736, + "grad_norm": 0.1591392159461975, + "learning_rate": 1.894962654028218e-05, + "loss": 0.5138, + "step": 2066 + }, + { + "epoch": 0.4610751728753067, + "grad_norm": 1.2021454572677612, + "learning_rate": 1.8948576331039264e-05, + "loss": 0.5438, + "step": 2067 + }, + { + "epoch": 0.4612982377871961, + "grad_norm": 0.1606753021478653, + "learning_rate": 1.8947525626167896e-05, + "loss": 0.5376, + "step": 2068 + }, + { + "epoch": 0.4615213026990854, + "grad_norm": 0.17104879021644592, + "learning_rate": 1.894647442572628e-05, + "loss": 0.5164, + "step": 2069 + }, + { + "epoch": 0.4617443676109748, + "grad_norm": 0.16374671459197998, + "learning_rate": 1.8945422729772633e-05, + "loss": 0.516, + "step": 2070 + }, + { + "epoch": 0.46196743252286415, + "grad_norm": 0.16128143668174744, + "learning_rate": 1.8944370538365206e-05, + "loss": 0.5157, + "step": 2071 + }, + { + "epoch": 0.4621904974347535, + "grad_norm": 0.16720320284366608, + "learning_rate": 1.8943317851562278e-05, + "loss": 0.5291, + "step": 2072 + }, + { + "epoch": 0.4624135623466429, + "grad_norm": 0.1879906952381134, + "learning_rate": 1.8942264669422154e-05, + "loss": 0.5145, + "step": 2073 + }, + { + "epoch": 0.4626366272585322, + "grad_norm": 0.17528733611106873, + "learning_rate": 1.894121099200316e-05, + "loss": 0.5235, + "step": 2074 + }, + { + "epoch": 0.4628596921704216, + "grad_norm": 0.16449770331382751, + "learning_rate": 1.894015681936366e-05, + "loss": 0.5165, + "step": 2075 + }, + { + "epoch": 0.46308275708231095, + "grad_norm": 0.16646511852741241, + "learning_rate": 1.8939102151562036e-05, + "loss": 0.5093, + "step": 2076 + }, + { + "epoch": 0.46330582199420034, + "grad_norm": 0.17490622401237488, + "learning_rate": 1.893804698865671e-05, + "loss": 0.5385, + "step": 2077 + }, + { + "epoch": 0.4635288869060897, + "grad_norm": 0.2023608237504959, + "learning_rate": 1.893699133070612e-05, + "loss": 0.4972, + "step": 2078 + }, + { + "epoch": 0.463751951817979, + "grad_norm": 0.16244065761566162, + "learning_rate": 1.893593517776873e-05, + "loss": 0.5269, + "step": 2079 + }, + { + "epoch": 0.4639750167298684, + "grad_norm": 0.16335059702396393, + "learning_rate": 1.8934878529903043e-05, + "loss": 0.5107, + "step": 2080 + }, + { + "epoch": 0.46419808164175774, + "grad_norm": 0.16115811467170715, + "learning_rate": 1.8933821387167582e-05, + "loss": 0.5214, + "step": 2081 + }, + { + "epoch": 0.46442114655364714, + "grad_norm": 0.1833171248435974, + "learning_rate": 1.8932763749620894e-05, + "loss": 0.4664, + "step": 2082 + }, + { + "epoch": 0.46464421146553647, + "grad_norm": 0.1721174716949463, + "learning_rate": 1.893170561732156e-05, + "loss": 0.5224, + "step": 2083 + }, + { + "epoch": 0.4648672763774258, + "grad_norm": 0.16398632526397705, + "learning_rate": 1.8930646990328188e-05, + "loss": 0.4968, + "step": 2084 + }, + { + "epoch": 0.4650903412893152, + "grad_norm": 0.17112642526626587, + "learning_rate": 1.892958786869941e-05, + "loss": 0.5241, + "step": 2085 + }, + { + "epoch": 0.46531340620120454, + "grad_norm": 0.18161650002002716, + "learning_rate": 1.8928528252493884e-05, + "loss": 0.5047, + "step": 2086 + }, + { + "epoch": 0.46553647111309393, + "grad_norm": 0.1646048128604889, + "learning_rate": 1.8927468141770304e-05, + "loss": 0.5271, + "step": 2087 + }, + { + "epoch": 0.46575953602498327, + "grad_norm": 0.16542619466781616, + "learning_rate": 1.8926407536587378e-05, + "loss": 0.5338, + "step": 2088 + }, + { + "epoch": 0.4659826009368726, + "grad_norm": 0.17052631080150604, + "learning_rate": 1.8925346437003856e-05, + "loss": 0.5276, + "step": 2089 + }, + { + "epoch": 0.466205665848762, + "grad_norm": 0.17085763812065125, + "learning_rate": 1.8924284843078503e-05, + "loss": 0.5277, + "step": 2090 + }, + { + "epoch": 0.46642873076065133, + "grad_norm": 0.1689939796924591, + "learning_rate": 1.8923222754870124e-05, + "loss": 0.4889, + "step": 2091 + }, + { + "epoch": 0.4666517956725407, + "grad_norm": 0.16189821064472198, + "learning_rate": 1.8922160172437535e-05, + "loss": 0.5009, + "step": 2092 + }, + { + "epoch": 0.46687486058443006, + "grad_norm": 0.16525843739509583, + "learning_rate": 1.89210970958396e-05, + "loss": 0.5073, + "step": 2093 + }, + { + "epoch": 0.46709792549631945, + "grad_norm": 0.159906804561615, + "learning_rate": 1.8920033525135184e-05, + "loss": 0.5107, + "step": 2094 + }, + { + "epoch": 0.4673209904082088, + "grad_norm": 0.15631826221942902, + "learning_rate": 1.8918969460383205e-05, + "loss": 0.4839, + "step": 2095 + }, + { + "epoch": 0.46754405532009813, + "grad_norm": 0.1734917014837265, + "learning_rate": 1.8917904901642593e-05, + "loss": 0.5121, + "step": 2096 + }, + { + "epoch": 0.4677671202319875, + "grad_norm": 0.17475415766239166, + "learning_rate": 1.8916839848972315e-05, + "loss": 0.5198, + "step": 2097 + }, + { + "epoch": 0.46799018514387686, + "grad_norm": 0.15618795156478882, + "learning_rate": 1.8915774302431357e-05, + "loss": 0.4733, + "step": 2098 + }, + { + "epoch": 0.46821325005576625, + "grad_norm": 0.17231950163841248, + "learning_rate": 1.8914708262078735e-05, + "loss": 0.4975, + "step": 2099 + }, + { + "epoch": 0.4684363149676556, + "grad_norm": 0.1630786657333374, + "learning_rate": 1.891364172797349e-05, + "loss": 0.4906, + "step": 2100 + }, + { + "epoch": 0.4686593798795449, + "grad_norm": 0.20263253152370453, + "learning_rate": 1.89125747001747e-05, + "loss": 0.4961, + "step": 2101 + }, + { + "epoch": 0.4688824447914343, + "grad_norm": 0.1711629182100296, + "learning_rate": 1.891150717874146e-05, + "loss": 0.5197, + "step": 2102 + }, + { + "epoch": 0.46910550970332365, + "grad_norm": 0.15791350603103638, + "learning_rate": 1.89104391637329e-05, + "loss": 0.5151, + "step": 2103 + }, + { + "epoch": 0.46932857461521305, + "grad_norm": 0.1648074984550476, + "learning_rate": 1.890937065520817e-05, + "loss": 0.5206, + "step": 2104 + }, + { + "epoch": 0.4695516395271024, + "grad_norm": 0.16917914152145386, + "learning_rate": 1.8908301653226448e-05, + "loss": 0.5308, + "step": 2105 + }, + { + "epoch": 0.4697747044389917, + "grad_norm": 0.16751410067081451, + "learning_rate": 1.8907232157846946e-05, + "loss": 0.512, + "step": 2106 + }, + { + "epoch": 0.4699977693508811, + "grad_norm": 0.16895310580730438, + "learning_rate": 1.89061621691289e-05, + "loss": 0.5134, + "step": 2107 + }, + { + "epoch": 0.47022083426277045, + "grad_norm": 0.1659642457962036, + "learning_rate": 1.8905091687131567e-05, + "loss": 0.5125, + "step": 2108 + }, + { + "epoch": 0.47044389917465984, + "grad_norm": 0.18306776881217957, + "learning_rate": 1.8904020711914243e-05, + "loss": 0.5404, + "step": 2109 + }, + { + "epoch": 0.4706669640865492, + "grad_norm": 0.15722155570983887, + "learning_rate": 1.8902949243536245e-05, + "loss": 0.4929, + "step": 2110 + }, + { + "epoch": 0.47089002899843857, + "grad_norm": 0.17024749517440796, + "learning_rate": 1.8901877282056916e-05, + "loss": 0.5129, + "step": 2111 + }, + { + "epoch": 0.4711130939103279, + "grad_norm": 0.1668287068605423, + "learning_rate": 1.8900804827535626e-05, + "loss": 0.5386, + "step": 2112 + }, + { + "epoch": 0.47133615882221724, + "grad_norm": 0.17657139897346497, + "learning_rate": 1.8899731880031778e-05, + "loss": 0.4682, + "step": 2113 + }, + { + "epoch": 0.47155922373410664, + "grad_norm": 0.17689798772335052, + "learning_rate": 1.8898658439604798e-05, + "loss": 0.4814, + "step": 2114 + }, + { + "epoch": 0.471782288645996, + "grad_norm": 0.16755974292755127, + "learning_rate": 1.8897584506314137e-05, + "loss": 0.4956, + "step": 2115 + }, + { + "epoch": 0.47200535355788537, + "grad_norm": 0.16352280974388123, + "learning_rate": 1.8896510080219277e-05, + "loss": 0.5034, + "step": 2116 + }, + { + "epoch": 0.4722284184697747, + "grad_norm": 0.16078925132751465, + "learning_rate": 1.889543516137973e-05, + "loss": 0.5017, + "step": 2117 + }, + { + "epoch": 0.47245148338166404, + "grad_norm": 0.17038793861865997, + "learning_rate": 1.8894359749855027e-05, + "loss": 0.5128, + "step": 2118 + }, + { + "epoch": 0.47267454829355343, + "grad_norm": 0.16477574408054352, + "learning_rate": 1.8893283845704733e-05, + "loss": 0.4765, + "step": 2119 + }, + { + "epoch": 0.47289761320544277, + "grad_norm": 0.18243131041526794, + "learning_rate": 1.889220744898844e-05, + "loss": 0.5266, + "step": 2120 + }, + { + "epoch": 0.47312067811733216, + "grad_norm": 0.15715399384498596, + "learning_rate": 1.8891130559765763e-05, + "loss": 0.5092, + "step": 2121 + }, + { + "epoch": 0.4733437430292215, + "grad_norm": 0.16996978223323822, + "learning_rate": 1.8890053178096353e-05, + "loss": 0.5166, + "step": 2122 + }, + { + "epoch": 0.4735668079411109, + "grad_norm": 0.16575470566749573, + "learning_rate": 1.888897530403987e-05, + "loss": 0.5212, + "step": 2123 + }, + { + "epoch": 0.4737898728530002, + "grad_norm": 0.1607031673192978, + "learning_rate": 1.8887896937656028e-05, + "loss": 0.5067, + "step": 2124 + }, + { + "epoch": 0.47401293776488956, + "grad_norm": 0.157185897231102, + "learning_rate": 1.8886818079004545e-05, + "loss": 0.4801, + "step": 2125 + }, + { + "epoch": 0.47423600267677896, + "grad_norm": 0.1661345213651657, + "learning_rate": 1.8885738728145173e-05, + "loss": 0.5212, + "step": 2126 + }, + { + "epoch": 0.4744590675886683, + "grad_norm": 0.17212405800819397, + "learning_rate": 1.8884658885137698e-05, + "loss": 0.4872, + "step": 2127 + }, + { + "epoch": 0.4746821325005577, + "grad_norm": 0.1681511402130127, + "learning_rate": 1.8883578550041925e-05, + "loss": 0.5266, + "step": 2128 + }, + { + "epoch": 0.474905197412447, + "grad_norm": 0.15793408453464508, + "learning_rate": 1.8882497722917697e-05, + "loss": 0.505, + "step": 2129 + }, + { + "epoch": 0.47512826232433636, + "grad_norm": 0.1688978672027588, + "learning_rate": 1.8881416403824867e-05, + "loss": 0.5005, + "step": 2130 + }, + { + "epoch": 0.47535132723622575, + "grad_norm": 0.17982251942157745, + "learning_rate": 1.8880334592823333e-05, + "loss": 0.5177, + "step": 2131 + }, + { + "epoch": 0.4755743921481151, + "grad_norm": 0.15668730437755585, + "learning_rate": 1.8879252289973008e-05, + "loss": 0.5076, + "step": 2132 + }, + { + "epoch": 0.4757974570600045, + "grad_norm": 0.15427713096141815, + "learning_rate": 1.8878169495333843e-05, + "loss": 0.4916, + "step": 2133 + }, + { + "epoch": 0.4760205219718938, + "grad_norm": 0.1573878973722458, + "learning_rate": 1.88770862089658e-05, + "loss": 0.4851, + "step": 2134 + }, + { + "epoch": 0.47624358688378315, + "grad_norm": 0.16141542792320251, + "learning_rate": 1.887600243092889e-05, + "loss": 0.5065, + "step": 2135 + }, + { + "epoch": 0.47646665179567255, + "grad_norm": 0.1874970644712448, + "learning_rate": 1.8874918161283127e-05, + "loss": 0.4987, + "step": 2136 + }, + { + "epoch": 0.4766897167075619, + "grad_norm": 0.1556268036365509, + "learning_rate": 1.887383340008857e-05, + "loss": 0.5001, + "step": 2137 + }, + { + "epoch": 0.4769127816194513, + "grad_norm": 0.16177254915237427, + "learning_rate": 1.8872748147405303e-05, + "loss": 0.5145, + "step": 2138 + }, + { + "epoch": 0.4771358465313406, + "grad_norm": 0.16272678971290588, + "learning_rate": 1.8871662403293434e-05, + "loss": 0.5111, + "step": 2139 + }, + { + "epoch": 0.47735891144323, + "grad_norm": 0.157213494181633, + "learning_rate": 1.8870576167813096e-05, + "loss": 0.4975, + "step": 2140 + }, + { + "epoch": 0.47758197635511934, + "grad_norm": 0.16583600640296936, + "learning_rate": 1.886948944102445e-05, + "loss": 0.4835, + "step": 2141 + }, + { + "epoch": 0.4778050412670087, + "grad_norm": 0.15528440475463867, + "learning_rate": 1.8868402222987687e-05, + "loss": 0.4821, + "step": 2142 + }, + { + "epoch": 0.47802810617889807, + "grad_norm": 0.1678980439901352, + "learning_rate": 1.8867314513763023e-05, + "loss": 0.5126, + "step": 2143 + }, + { + "epoch": 0.4782511710907874, + "grad_norm": 0.1719445437192917, + "learning_rate": 1.886622631341071e-05, + "loss": 0.5278, + "step": 2144 + }, + { + "epoch": 0.4784742360026768, + "grad_norm": 0.16177958250045776, + "learning_rate": 1.886513762199101e-05, + "loss": 0.4716, + "step": 2145 + }, + { + "epoch": 0.47869730091456614, + "grad_norm": 0.17055903375148773, + "learning_rate": 1.886404843956422e-05, + "loss": 0.5155, + "step": 2146 + }, + { + "epoch": 0.4789203658264555, + "grad_norm": 0.16013142466545105, + "learning_rate": 1.8862958766190673e-05, + "loss": 0.4882, + "step": 2147 + }, + { + "epoch": 0.47914343073834487, + "grad_norm": 0.15844453871250153, + "learning_rate": 1.886186860193072e-05, + "loss": 0.5125, + "step": 2148 + }, + { + "epoch": 0.4793664956502342, + "grad_norm": 0.16520345211029053, + "learning_rate": 1.886077794684474e-05, + "loss": 0.5355, + "step": 2149 + }, + { + "epoch": 0.4795895605621236, + "grad_norm": 0.16560572385787964, + "learning_rate": 1.885968680099314e-05, + "loss": 0.5011, + "step": 2150 + }, + { + "epoch": 0.47981262547401293, + "grad_norm": 0.16460593044757843, + "learning_rate": 1.885859516443636e-05, + "loss": 0.5186, + "step": 2151 + }, + { + "epoch": 0.4800356903859023, + "grad_norm": 0.16416500508785248, + "learning_rate": 1.885750303723485e-05, + "loss": 0.5121, + "step": 2152 + }, + { + "epoch": 0.48025875529779166, + "grad_norm": 0.1615086942911148, + "learning_rate": 1.8856410419449108e-05, + "loss": 0.5291, + "step": 2153 + }, + { + "epoch": 0.480481820209681, + "grad_norm": 0.16628290712833405, + "learning_rate": 1.885531731113965e-05, + "loss": 0.5166, + "step": 2154 + }, + { + "epoch": 0.4807048851215704, + "grad_norm": 0.212614044547081, + "learning_rate": 1.8854223712367017e-05, + "loss": 0.4687, + "step": 2155 + }, + { + "epoch": 0.48092795003345973, + "grad_norm": 0.16417112946510315, + "learning_rate": 1.8853129623191775e-05, + "loss": 0.4839, + "step": 2156 + }, + { + "epoch": 0.4811510149453491, + "grad_norm": 0.16616222262382507, + "learning_rate": 1.8852035043674534e-05, + "loss": 0.5013, + "step": 2157 + }, + { + "epoch": 0.48137407985723846, + "grad_norm": 0.16397857666015625, + "learning_rate": 1.8850939973875907e-05, + "loss": 0.5163, + "step": 2158 + }, + { + "epoch": 0.4815971447691278, + "grad_norm": 0.1688777208328247, + "learning_rate": 1.8849844413856548e-05, + "loss": 0.5154, + "step": 2159 + }, + { + "epoch": 0.4818202096810172, + "grad_norm": 0.16248802840709686, + "learning_rate": 1.884874836367714e-05, + "loss": 0.49, + "step": 2160 + }, + { + "epoch": 0.4820432745929065, + "grad_norm": 0.17419035732746124, + "learning_rate": 1.8847651823398385e-05, + "loss": 0.5105, + "step": 2161 + }, + { + "epoch": 0.4822663395047959, + "grad_norm": 0.16489480435848236, + "learning_rate": 1.884655479308102e-05, + "loss": 0.4967, + "step": 2162 + }, + { + "epoch": 0.48248940441668525, + "grad_norm": 0.16695787012577057, + "learning_rate": 1.8845457272785802e-05, + "loss": 0.5316, + "step": 2163 + }, + { + "epoch": 0.4827124693285746, + "grad_norm": 0.1630323827266693, + "learning_rate": 1.884435926257352e-05, + "loss": 0.5144, + "step": 2164 + }, + { + "epoch": 0.482935534240464, + "grad_norm": 0.16861893236637115, + "learning_rate": 1.8843260762504985e-05, + "loss": 0.5402, + "step": 2165 + }, + { + "epoch": 0.4831585991523533, + "grad_norm": 0.16794002056121826, + "learning_rate": 1.884216177264105e-05, + "loss": 0.5581, + "step": 2166 + }, + { + "epoch": 0.4833816640642427, + "grad_norm": 0.16783180832862854, + "learning_rate": 1.8841062293042572e-05, + "loss": 0.4884, + "step": 2167 + }, + { + "epoch": 0.48360472897613205, + "grad_norm": 0.16696153581142426, + "learning_rate": 1.8839962323770455e-05, + "loss": 0.4893, + "step": 2168 + }, + { + "epoch": 0.48382779388802144, + "grad_norm": 0.17243477702140808, + "learning_rate": 1.8838861864885617e-05, + "loss": 0.5134, + "step": 2169 + }, + { + "epoch": 0.4840508587999108, + "grad_norm": 0.16844888031482697, + "learning_rate": 1.883776091644901e-05, + "loss": 0.5203, + "step": 2170 + }, + { + "epoch": 0.4842739237118001, + "grad_norm": 0.16011419892311096, + "learning_rate": 1.8836659478521614e-05, + "loss": 0.4837, + "step": 2171 + }, + { + "epoch": 0.4844969886236895, + "grad_norm": 0.16117213666439056, + "learning_rate": 1.883555755116443e-05, + "loss": 0.5124, + "step": 2172 + }, + { + "epoch": 0.48472005353557884, + "grad_norm": 0.15577594935894012, + "learning_rate": 1.883445513443849e-05, + "loss": 0.4939, + "step": 2173 + }, + { + "epoch": 0.48494311844746824, + "grad_norm": 0.17128697037696838, + "learning_rate": 1.883335222840485e-05, + "loss": 0.5331, + "step": 2174 + }, + { + "epoch": 0.4851661833593576, + "grad_norm": 0.16078589856624603, + "learning_rate": 1.8832248833124606e-05, + "loss": 0.5348, + "step": 2175 + }, + { + "epoch": 0.4853892482712469, + "grad_norm": 0.17158381640911102, + "learning_rate": 1.8831144948658863e-05, + "loss": 0.5016, + "step": 2176 + }, + { + "epoch": 0.4856123131831363, + "grad_norm": 0.17395475506782532, + "learning_rate": 1.883004057506876e-05, + "loss": 0.5145, + "step": 2177 + }, + { + "epoch": 0.48583537809502564, + "grad_norm": 0.15187759697437286, + "learning_rate": 1.882893571241547e-05, + "loss": 0.4747, + "step": 2178 + }, + { + "epoch": 0.48605844300691503, + "grad_norm": 0.1744384616613388, + "learning_rate": 1.8827830360760184e-05, + "loss": 0.488, + "step": 2179 + }, + { + "epoch": 0.48628150791880437, + "grad_norm": 0.16714079678058624, + "learning_rate": 1.8826724520164118e-05, + "loss": 0.5117, + "step": 2180 + }, + { + "epoch": 0.4865045728306937, + "grad_norm": 0.1541585475206375, + "learning_rate": 1.8825618190688534e-05, + "loss": 0.4808, + "step": 2181 + }, + { + "epoch": 0.4867276377425831, + "grad_norm": 0.16642670333385468, + "learning_rate": 1.8824511372394694e-05, + "loss": 0.5223, + "step": 2182 + }, + { + "epoch": 0.48695070265447243, + "grad_norm": 0.16418053209781647, + "learning_rate": 1.8823404065343904e-05, + "loss": 0.5022, + "step": 2183 + }, + { + "epoch": 0.4871737675663618, + "grad_norm": 0.1558452993631363, + "learning_rate": 1.88222962695975e-05, + "loss": 0.5009, + "step": 2184 + }, + { + "epoch": 0.48739683247825116, + "grad_norm": 0.16112567484378815, + "learning_rate": 1.8821187985216835e-05, + "loss": 0.4851, + "step": 2185 + }, + { + "epoch": 0.48761989739014056, + "grad_norm": 0.15951795876026154, + "learning_rate": 1.8820079212263287e-05, + "loss": 0.5364, + "step": 2186 + }, + { + "epoch": 0.4878429623020299, + "grad_norm": 0.15660803020000458, + "learning_rate": 1.8818969950798274e-05, + "loss": 0.4687, + "step": 2187 + }, + { + "epoch": 0.48806602721391923, + "grad_norm": 0.16717955470085144, + "learning_rate": 1.881786020088323e-05, + "loss": 0.5184, + "step": 2188 + }, + { + "epoch": 0.4882890921258086, + "grad_norm": 0.4413386583328247, + "learning_rate": 1.8816749962579625e-05, + "loss": 0.49, + "step": 2189 + }, + { + "epoch": 0.48851215703769796, + "grad_norm": 0.18018092215061188, + "learning_rate": 1.8815639235948945e-05, + "loss": 0.4948, + "step": 2190 + }, + { + "epoch": 0.48873522194958735, + "grad_norm": 0.17363835871219635, + "learning_rate": 1.881452802105271e-05, + "loss": 0.5255, + "step": 2191 + }, + { + "epoch": 0.4889582868614767, + "grad_norm": 0.16417300701141357, + "learning_rate": 1.8813416317952474e-05, + "loss": 0.4984, + "step": 2192 + }, + { + "epoch": 0.489181351773366, + "grad_norm": 0.1698407083749771, + "learning_rate": 1.8812304126709797e-05, + "loss": 0.5155, + "step": 2193 + }, + { + "epoch": 0.4894044166852554, + "grad_norm": 0.16547353565692902, + "learning_rate": 1.881119144738629e-05, + "loss": 0.504, + "step": 2194 + }, + { + "epoch": 0.48962748159714475, + "grad_norm": 0.15916848182678223, + "learning_rate": 1.8810078280043574e-05, + "loss": 0.5158, + "step": 2195 + }, + { + "epoch": 0.48985054650903415, + "grad_norm": 0.16655400395393372, + "learning_rate": 1.8808964624743303e-05, + "loss": 0.521, + "step": 2196 + }, + { + "epoch": 0.4900736114209235, + "grad_norm": 0.16791512072086334, + "learning_rate": 1.8807850481547165e-05, + "loss": 0.5327, + "step": 2197 + }, + { + "epoch": 0.4902966763328129, + "grad_norm": 0.1612214297056198, + "learning_rate": 1.880673585051686e-05, + "loss": 0.5074, + "step": 2198 + }, + { + "epoch": 0.4905197412447022, + "grad_norm": 0.33009010553359985, + "learning_rate": 1.880562073171413e-05, + "loss": 0.4991, + "step": 2199 + }, + { + "epoch": 0.49074280615659155, + "grad_norm": 0.17632457613945007, + "learning_rate": 1.8804505125200732e-05, + "loss": 0.5094, + "step": 2200 + }, + { + "epoch": 0.49096587106848094, + "grad_norm": 0.17380337417125702, + "learning_rate": 1.8803389031038462e-05, + "loss": 0.5132, + "step": 2201 + }, + { + "epoch": 0.4911889359803703, + "grad_norm": 0.16013850271701813, + "learning_rate": 1.880227244928913e-05, + "loss": 0.4987, + "step": 2202 + }, + { + "epoch": 0.49141200089225967, + "grad_norm": 0.18471701443195343, + "learning_rate": 1.8801155380014578e-05, + "loss": 0.5203, + "step": 2203 + }, + { + "epoch": 0.491635065804149, + "grad_norm": 0.16928328573703766, + "learning_rate": 1.8800037823276683e-05, + "loss": 0.4868, + "step": 2204 + }, + { + "epoch": 0.49185813071603834, + "grad_norm": 0.1749187409877777, + "learning_rate": 1.8798919779137337e-05, + "loss": 0.5049, + "step": 2205 + }, + { + "epoch": 0.49208119562792774, + "grad_norm": 0.173324853181839, + "learning_rate": 1.8797801247658465e-05, + "loss": 0.4963, + "step": 2206 + }, + { + "epoch": 0.4923042605398171, + "grad_norm": 0.16650435328483582, + "learning_rate": 1.8796682228902024e-05, + "loss": 0.5232, + "step": 2207 + }, + { + "epoch": 0.49252732545170647, + "grad_norm": 0.16675227880477905, + "learning_rate": 1.8795562722929986e-05, + "loss": 0.4887, + "step": 2208 + }, + { + "epoch": 0.4927503903635958, + "grad_norm": 0.20501257479190826, + "learning_rate": 1.8794442729804356e-05, + "loss": 0.5045, + "step": 2209 + }, + { + "epoch": 0.49297345527548514, + "grad_norm": 0.15837359428405762, + "learning_rate": 1.879332224958717e-05, + "loss": 0.4704, + "step": 2210 + }, + { + "epoch": 0.49319652018737453, + "grad_norm": 0.15580782294273376, + "learning_rate": 1.8792201282340485e-05, + "loss": 0.4753, + "step": 2211 + }, + { + "epoch": 0.49341958509926387, + "grad_norm": 0.16155748069286346, + "learning_rate": 1.879107982812639e-05, + "loss": 0.5178, + "step": 2212 + }, + { + "epoch": 0.49364265001115326, + "grad_norm": 0.17078126966953278, + "learning_rate": 1.8789957887006994e-05, + "loss": 0.5208, + "step": 2213 + }, + { + "epoch": 0.4938657149230426, + "grad_norm": 0.16327013075351715, + "learning_rate": 1.8788835459044438e-05, + "loss": 0.5446, + "step": 2214 + }, + { + "epoch": 0.494088779834932, + "grad_norm": 0.1604888141155243, + "learning_rate": 1.878771254430089e-05, + "loss": 0.5142, + "step": 2215 + }, + { + "epoch": 0.4943118447468213, + "grad_norm": 0.16108398139476776, + "learning_rate": 1.8786589142838548e-05, + "loss": 0.517, + "step": 2216 + }, + { + "epoch": 0.49453490965871066, + "grad_norm": 0.15508361160755157, + "learning_rate": 1.8785465254719625e-05, + "loss": 0.4782, + "step": 2217 + }, + { + "epoch": 0.49475797457060006, + "grad_norm": 0.1612912118434906, + "learning_rate": 1.878434088000638e-05, + "loss": 0.5104, + "step": 2218 + }, + { + "epoch": 0.4949810394824894, + "grad_norm": 0.1641056090593338, + "learning_rate": 1.8783216018761075e-05, + "loss": 0.5227, + "step": 2219 + }, + { + "epoch": 0.4952041043943788, + "grad_norm": 0.16892564296722412, + "learning_rate": 1.878209067104602e-05, + "loss": 0.5142, + "step": 2220 + }, + { + "epoch": 0.4954271693062681, + "grad_norm": 0.18777111172676086, + "learning_rate": 1.8780964836923545e-05, + "loss": 0.5091, + "step": 2221 + }, + { + "epoch": 0.49565023421815746, + "grad_norm": 0.15524406731128693, + "learning_rate": 1.8779838516455998e-05, + "loss": 0.5155, + "step": 2222 + }, + { + "epoch": 0.49587329913004685, + "grad_norm": 0.1688852459192276, + "learning_rate": 1.877871170970577e-05, + "loss": 0.5266, + "step": 2223 + }, + { + "epoch": 0.4960963640419362, + "grad_norm": 0.2029954195022583, + "learning_rate": 1.8777584416735268e-05, + "loss": 0.4751, + "step": 2224 + }, + { + "epoch": 0.4963194289538256, + "grad_norm": 0.168426051735878, + "learning_rate": 1.8776456637606926e-05, + "loss": 0.5263, + "step": 2225 + }, + { + "epoch": 0.4965424938657149, + "grad_norm": 0.18985538184642792, + "learning_rate": 1.877532837238321e-05, + "loss": 0.528, + "step": 2226 + }, + { + "epoch": 0.4967655587776043, + "grad_norm": 0.15908297896385193, + "learning_rate": 1.8774199621126605e-05, + "loss": 0.5226, + "step": 2227 + }, + { + "epoch": 0.49698862368949365, + "grad_norm": 0.15700525045394897, + "learning_rate": 1.8773070383899638e-05, + "loss": 0.4857, + "step": 2228 + }, + { + "epoch": 0.497211688601383, + "grad_norm": 0.17015953361988068, + "learning_rate": 1.877194066076485e-05, + "loss": 0.488, + "step": 2229 + }, + { + "epoch": 0.4974347535132724, + "grad_norm": 0.21846015751361847, + "learning_rate": 1.8770810451784806e-05, + "loss": 0.5305, + "step": 2230 + }, + { + "epoch": 0.4976578184251617, + "grad_norm": 0.16297823190689087, + "learning_rate": 1.8769679757022114e-05, + "loss": 0.494, + "step": 2231 + }, + { + "epoch": 0.4978808833370511, + "grad_norm": 0.1758735030889511, + "learning_rate": 1.876854857653939e-05, + "loss": 0.5394, + "step": 2232 + }, + { + "epoch": 0.49810394824894044, + "grad_norm": 0.1673547923564911, + "learning_rate": 1.876741691039929e-05, + "loss": 0.5035, + "step": 2233 + }, + { + "epoch": 0.4983270131608298, + "grad_norm": 0.16478639841079712, + "learning_rate": 1.8766284758664487e-05, + "loss": 0.5, + "step": 2234 + }, + { + "epoch": 0.49855007807271917, + "grad_norm": 0.16910696029663086, + "learning_rate": 1.8765152121397697e-05, + "loss": 0.5087, + "step": 2235 + }, + { + "epoch": 0.4987731429846085, + "grad_norm": 0.16947272419929504, + "learning_rate": 1.876401899866165e-05, + "loss": 0.5064, + "step": 2236 + }, + { + "epoch": 0.4989962078964979, + "grad_norm": 0.16172346472740173, + "learning_rate": 1.87628853905191e-05, + "loss": 0.4901, + "step": 2237 + }, + { + "epoch": 0.49921927280838724, + "grad_norm": 0.16061736643314362, + "learning_rate": 1.8761751297032838e-05, + "loss": 0.5038, + "step": 2238 + }, + { + "epoch": 0.4994423377202766, + "grad_norm": 0.15990641713142395, + "learning_rate": 1.8760616718265676e-05, + "loss": 0.4854, + "step": 2239 + }, + { + "epoch": 0.49966540263216597, + "grad_norm": 0.16735920310020447, + "learning_rate": 1.875948165428045e-05, + "loss": 0.5027, + "step": 2240 + }, + { + "epoch": 0.4998884675440553, + "grad_norm": 0.16122539341449738, + "learning_rate": 1.8758346105140033e-05, + "loss": 0.4922, + "step": 2241 + }, + { + "epoch": 0.5001115324559446, + "grad_norm": 0.1718963235616684, + "learning_rate": 1.8757210070907315e-05, + "loss": 0.5138, + "step": 2242 + }, + { + "epoch": 0.500334597367834, + "grad_norm": 0.17143313586711884, + "learning_rate": 1.875607355164522e-05, + "loss": 0.5308, + "step": 2243 + }, + { + "epoch": 0.5005576622797234, + "grad_norm": 0.15493686497211456, + "learning_rate": 1.875493654741669e-05, + "loss": 0.5243, + "step": 2244 + }, + { + "epoch": 0.5007807271916127, + "grad_norm": 0.162288635969162, + "learning_rate": 1.8753799058284707e-05, + "loss": 0.5052, + "step": 2245 + }, + { + "epoch": 0.5010037921035021, + "grad_norm": 0.17301948368549347, + "learning_rate": 1.8752661084312268e-05, + "loss": 0.5119, + "step": 2246 + }, + { + "epoch": 0.5012268570153915, + "grad_norm": 0.16729313135147095, + "learning_rate": 1.8751522625562405e-05, + "loss": 0.5372, + "step": 2247 + }, + { + "epoch": 0.5014499219272809, + "grad_norm": 0.16277892887592316, + "learning_rate": 1.8750383682098166e-05, + "loss": 0.4847, + "step": 2248 + }, + { + "epoch": 0.5016729868391702, + "grad_norm": 0.16257484257221222, + "learning_rate": 1.8749244253982633e-05, + "loss": 0.5105, + "step": 2249 + }, + { + "epoch": 0.5018960517510596, + "grad_norm": 0.16304615139961243, + "learning_rate": 1.8748104341278924e-05, + "loss": 0.5073, + "step": 2250 + }, + { + "epoch": 0.502119116662949, + "grad_norm": 0.1586243063211441, + "learning_rate": 1.874696394405017e-05, + "loss": 0.5096, + "step": 2251 + }, + { + "epoch": 0.5023421815748382, + "grad_norm": 0.16425400972366333, + "learning_rate": 1.874582306235953e-05, + "loss": 0.4817, + "step": 2252 + }, + { + "epoch": 0.5025652464867276, + "grad_norm": 0.16808679699897766, + "learning_rate": 1.874468169627019e-05, + "loss": 0.4866, + "step": 2253 + }, + { + "epoch": 0.502788311398617, + "grad_norm": 0.1656448394060135, + "learning_rate": 1.8743539845845378e-05, + "loss": 0.4903, + "step": 2254 + }, + { + "epoch": 0.5030113763105064, + "grad_norm": 0.16811887919902802, + "learning_rate": 1.8742397511148328e-05, + "loss": 0.4954, + "step": 2255 + }, + { + "epoch": 0.5032344412223957, + "grad_norm": 0.17424499988555908, + "learning_rate": 1.8741254692242315e-05, + "loss": 0.5301, + "step": 2256 + }, + { + "epoch": 0.5034575061342851, + "grad_norm": 0.17697711288928986, + "learning_rate": 1.874011138919063e-05, + "loss": 0.5103, + "step": 2257 + }, + { + "epoch": 0.5036805710461745, + "grad_norm": 0.16454057395458221, + "learning_rate": 1.8738967602056597e-05, + "loss": 0.499, + "step": 2258 + }, + { + "epoch": 0.5039036359580638, + "grad_norm": 0.16936853528022766, + "learning_rate": 1.873782333090357e-05, + "loss": 0.5029, + "step": 2259 + }, + { + "epoch": 0.5041267008699531, + "grad_norm": 0.16321606934070587, + "learning_rate": 1.873667857579492e-05, + "loss": 0.5043, + "step": 2260 + }, + { + "epoch": 0.5043497657818425, + "grad_norm": 0.17789475619792938, + "learning_rate": 1.873553333679406e-05, + "loss": 0.5206, + "step": 2261 + }, + { + "epoch": 0.5045728306937318, + "grad_norm": 0.17053046822547913, + "learning_rate": 1.8734387613964414e-05, + "loss": 0.5241, + "step": 2262 + }, + { + "epoch": 0.5047958956056212, + "grad_norm": 0.16408482193946838, + "learning_rate": 1.8733241407369438e-05, + "loss": 0.4999, + "step": 2263 + }, + { + "epoch": 0.5050189605175106, + "grad_norm": 0.1780211627483368, + "learning_rate": 1.873209471707262e-05, + "loss": 0.5014, + "step": 2264 + }, + { + "epoch": 0.5052420254294, + "grad_norm": 0.17059922218322754, + "learning_rate": 1.873094754313747e-05, + "loss": 0.5315, + "step": 2265 + }, + { + "epoch": 0.5054650903412893, + "grad_norm": 0.17030328512191772, + "learning_rate": 1.8729799885627528e-05, + "loss": 0.5295, + "step": 2266 + }, + { + "epoch": 0.5056881552531787, + "grad_norm": 0.1972958743572235, + "learning_rate": 1.872865174460635e-05, + "loss": 0.5094, + "step": 2267 + }, + { + "epoch": 0.5059112201650681, + "grad_norm": 0.1663295030593872, + "learning_rate": 1.8727503120137537e-05, + "loss": 0.5023, + "step": 2268 + }, + { + "epoch": 0.5061342850769573, + "grad_norm": 0.1621520221233368, + "learning_rate": 1.87263540122847e-05, + "loss": 0.5141, + "step": 2269 + }, + { + "epoch": 0.5063573499888467, + "grad_norm": 0.35805073380470276, + "learning_rate": 1.872520442111149e-05, + "loss": 0.539, + "step": 2270 + }, + { + "epoch": 0.5065804149007361, + "grad_norm": 0.16250286996364594, + "learning_rate": 1.8724054346681573e-05, + "loss": 0.5173, + "step": 2271 + }, + { + "epoch": 0.5068034798126255, + "grad_norm": 0.16895325481891632, + "learning_rate": 1.872290378905865e-05, + "loss": 0.5041, + "step": 2272 + }, + { + "epoch": 0.5070265447245148, + "grad_norm": 0.1690344661474228, + "learning_rate": 1.872175274830645e-05, + "loss": 0.5337, + "step": 2273 + }, + { + "epoch": 0.5072496096364042, + "grad_norm": 0.15611621737480164, + "learning_rate": 1.8720601224488716e-05, + "loss": 0.4819, + "step": 2274 + }, + { + "epoch": 0.5074726745482936, + "grad_norm": 0.18844662606716156, + "learning_rate": 1.871944921766923e-05, + "loss": 0.5082, + "step": 2275 + }, + { + "epoch": 0.5076957394601829, + "grad_norm": 0.16826370358467102, + "learning_rate": 1.8718296727911803e-05, + "loss": 0.4914, + "step": 2276 + }, + { + "epoch": 0.5079188043720723, + "grad_norm": 0.15561175346374512, + "learning_rate": 1.871714375528026e-05, + "loss": 0.5147, + "step": 2277 + }, + { + "epoch": 0.5081418692839617, + "grad_norm": 0.1746017336845398, + "learning_rate": 1.8715990299838463e-05, + "loss": 0.5224, + "step": 2278 + }, + { + "epoch": 0.508364934195851, + "grad_norm": 0.1721627563238144, + "learning_rate": 1.8714836361650303e-05, + "loss": 0.509, + "step": 2279 + }, + { + "epoch": 0.5085879991077403, + "grad_norm": 0.15273146331310272, + "learning_rate": 1.871368194077968e-05, + "loss": 0.4933, + "step": 2280 + }, + { + "epoch": 0.5088110640196297, + "grad_norm": 0.17017610371112823, + "learning_rate": 1.8712527037290546e-05, + "loss": 0.518, + "step": 2281 + }, + { + "epoch": 0.5090341289315191, + "grad_norm": 0.16407892107963562, + "learning_rate": 1.8711371651246854e-05, + "loss": 0.5152, + "step": 2282 + }, + { + "epoch": 0.5092571938434084, + "grad_norm": 0.18274806439876556, + "learning_rate": 1.8710215782712606e-05, + "loss": 0.4959, + "step": 2283 + }, + { + "epoch": 0.5094802587552978, + "grad_norm": 0.17501875758171082, + "learning_rate": 1.870905943175182e-05, + "loss": 0.5372, + "step": 2284 + }, + { + "epoch": 0.5097033236671872, + "grad_norm": 0.1640838235616684, + "learning_rate": 1.870790259842854e-05, + "loss": 0.5102, + "step": 2285 + }, + { + "epoch": 0.5099263885790765, + "grad_norm": 0.15836408734321594, + "learning_rate": 1.870674528280684e-05, + "loss": 0.5194, + "step": 2286 + }, + { + "epoch": 0.5101494534909659, + "grad_norm": 0.16688112914562225, + "learning_rate": 1.8705587484950815e-05, + "loss": 0.5077, + "step": 2287 + }, + { + "epoch": 0.5103725184028552, + "grad_norm": 0.16494499146938324, + "learning_rate": 1.8704429204924598e-05, + "loss": 0.4954, + "step": 2288 + }, + { + "epoch": 0.5105955833147446, + "grad_norm": 0.1580001264810562, + "learning_rate": 1.8703270442792337e-05, + "loss": 0.507, + "step": 2289 + }, + { + "epoch": 0.5108186482266339, + "grad_norm": 0.17867796123027802, + "learning_rate": 1.8702111198618213e-05, + "loss": 0.5081, + "step": 2290 + }, + { + "epoch": 0.5110417131385233, + "grad_norm": 0.28131747245788574, + "learning_rate": 1.8700951472466435e-05, + "loss": 0.5188, + "step": 2291 + }, + { + "epoch": 0.5112647780504127, + "grad_norm": 0.1570468544960022, + "learning_rate": 1.869979126440123e-05, + "loss": 0.4992, + "step": 2292 + }, + { + "epoch": 0.511487842962302, + "grad_norm": 0.1686072200536728, + "learning_rate": 1.8698630574486862e-05, + "loss": 0.4951, + "step": 2293 + }, + { + "epoch": 0.5117109078741914, + "grad_norm": 0.1702122986316681, + "learning_rate": 1.869746940278762e-05, + "loss": 0.4965, + "step": 2294 + }, + { + "epoch": 0.5119339727860808, + "grad_norm": 0.1632847636938095, + "learning_rate": 1.8696307749367807e-05, + "loss": 0.48, + "step": 2295 + }, + { + "epoch": 0.5121570376979702, + "grad_norm": 0.20230402052402496, + "learning_rate": 1.8695145614291773e-05, + "loss": 0.5262, + "step": 2296 + }, + { + "epoch": 0.5123801026098594, + "grad_norm": 0.17471125721931458, + "learning_rate": 1.8693982997623877e-05, + "loss": 0.5207, + "step": 2297 + }, + { + "epoch": 0.5126031675217488, + "grad_norm": 0.17141500115394592, + "learning_rate": 1.869281989942852e-05, + "loss": 0.4995, + "step": 2298 + }, + { + "epoch": 0.5128262324336382, + "grad_norm": 0.1641875058412552, + "learning_rate": 1.8691656319770112e-05, + "loss": 0.4773, + "step": 2299 + }, + { + "epoch": 0.5130492973455275, + "grad_norm": 0.1655300408601761, + "learning_rate": 1.8690492258713107e-05, + "loss": 0.5171, + "step": 2300 + }, + { + "epoch": 0.5132723622574169, + "grad_norm": 0.15635208785533905, + "learning_rate": 1.8689327716321975e-05, + "loss": 0.5005, + "step": 2301 + }, + { + "epoch": 0.5134954271693063, + "grad_norm": 0.1540510356426239, + "learning_rate": 1.8688162692661214e-05, + "loss": 0.4925, + "step": 2302 + }, + { + "epoch": 0.5137184920811956, + "grad_norm": 0.16273614764213562, + "learning_rate": 1.8686997187795354e-05, + "loss": 0.5153, + "step": 2303 + }, + { + "epoch": 0.513941556993085, + "grad_norm": 0.16257858276367188, + "learning_rate": 1.8685831201788945e-05, + "loss": 0.495, + "step": 2304 + }, + { + "epoch": 0.5141646219049744, + "grad_norm": 0.1771703064441681, + "learning_rate": 1.8684664734706572e-05, + "loss": 0.5499, + "step": 2305 + }, + { + "epoch": 0.5143876868168638, + "grad_norm": 0.17417144775390625, + "learning_rate": 1.8683497786612834e-05, + "loss": 0.4777, + "step": 2306 + }, + { + "epoch": 0.514610751728753, + "grad_norm": 0.2703874707221985, + "learning_rate": 1.8682330357572368e-05, + "loss": 0.5032, + "step": 2307 + }, + { + "epoch": 0.5148338166406424, + "grad_norm": 0.16738004982471466, + "learning_rate": 1.8681162447649834e-05, + "loss": 0.514, + "step": 2308 + }, + { + "epoch": 0.5150568815525318, + "grad_norm": 0.17614148557186127, + "learning_rate": 1.8679994056909915e-05, + "loss": 0.5193, + "step": 2309 + }, + { + "epoch": 0.5152799464644211, + "grad_norm": 0.1655927300453186, + "learning_rate": 1.8678825185417328e-05, + "loss": 0.5253, + "step": 2310 + }, + { + "epoch": 0.5155030113763105, + "grad_norm": 0.16509908437728882, + "learning_rate": 1.867765583323681e-05, + "loss": 0.5108, + "step": 2311 + }, + { + "epoch": 0.5157260762881999, + "grad_norm": 0.17742401361465454, + "learning_rate": 1.8676486000433123e-05, + "loss": 0.5024, + "step": 2312 + }, + { + "epoch": 0.5159491412000893, + "grad_norm": 0.1574956476688385, + "learning_rate": 1.8675315687071068e-05, + "loss": 0.4944, + "step": 2313 + }, + { + "epoch": 0.5161722061119786, + "grad_norm": 0.16360540688037872, + "learning_rate": 1.867414489321546e-05, + "loss": 0.5054, + "step": 2314 + }, + { + "epoch": 0.516395271023868, + "grad_norm": 0.1697058230638504, + "learning_rate": 1.8672973618931144e-05, + "loss": 0.5198, + "step": 2315 + }, + { + "epoch": 0.5166183359357573, + "grad_norm": 0.16391722857952118, + "learning_rate": 1.8671801864282996e-05, + "loss": 0.5138, + "step": 2316 + }, + { + "epoch": 0.5168414008476466, + "grad_norm": 0.1696222871541977, + "learning_rate": 1.867062962933591e-05, + "loss": 0.4929, + "step": 2317 + }, + { + "epoch": 0.517064465759536, + "grad_norm": 0.16633786261081696, + "learning_rate": 1.8669456914154817e-05, + "loss": 0.4723, + "step": 2318 + }, + { + "epoch": 0.5172875306714254, + "grad_norm": 0.16019189357757568, + "learning_rate": 1.8668283718804664e-05, + "loss": 0.5093, + "step": 2319 + }, + { + "epoch": 0.5175105955833147, + "grad_norm": 0.1600850522518158, + "learning_rate": 1.8667110043350435e-05, + "loss": 0.4998, + "step": 2320 + }, + { + "epoch": 0.5177336604952041, + "grad_norm": 0.1658647060394287, + "learning_rate": 1.8665935887857136e-05, + "loss": 0.5201, + "step": 2321 + }, + { + "epoch": 0.5179567254070935, + "grad_norm": 0.16224202513694763, + "learning_rate": 1.8664761252389795e-05, + "loss": 0.4979, + "step": 2322 + }, + { + "epoch": 0.5181797903189829, + "grad_norm": 0.16525104641914368, + "learning_rate": 1.866358613701347e-05, + "loss": 0.5168, + "step": 2323 + }, + { + "epoch": 0.5184028552308722, + "grad_norm": 0.16819415986537933, + "learning_rate": 1.866241054179325e-05, + "loss": 0.5189, + "step": 2324 + }, + { + "epoch": 0.5186259201427615, + "grad_norm": 0.1534615010023117, + "learning_rate": 1.8661234466794246e-05, + "loss": 0.4935, + "step": 2325 + }, + { + "epoch": 0.5188489850546509, + "grad_norm": 0.15495775640010834, + "learning_rate": 1.8660057912081598e-05, + "loss": 0.5039, + "step": 2326 + }, + { + "epoch": 0.5190720499665402, + "grad_norm": 0.16145430505275726, + "learning_rate": 1.8658880877720467e-05, + "loss": 0.4758, + "step": 2327 + }, + { + "epoch": 0.5192951148784296, + "grad_norm": 0.16488440334796906, + "learning_rate": 1.8657703363776044e-05, + "loss": 0.5183, + "step": 2328 + }, + { + "epoch": 0.519518179790319, + "grad_norm": 0.16661033034324646, + "learning_rate": 1.8656525370313553e-05, + "loss": 0.519, + "step": 2329 + }, + { + "epoch": 0.5197412447022084, + "grad_norm": 0.15961341559886932, + "learning_rate": 1.8655346897398234e-05, + "loss": 0.478, + "step": 2330 + }, + { + "epoch": 0.5199643096140977, + "grad_norm": 0.15573342144489288, + "learning_rate": 1.865416794509536e-05, + "loss": 0.4843, + "step": 2331 + }, + { + "epoch": 0.5201873745259871, + "grad_norm": 0.16535112261772156, + "learning_rate": 1.8652988513470227e-05, + "loss": 0.4943, + "step": 2332 + }, + { + "epoch": 0.5204104394378765, + "grad_norm": 0.16046664118766785, + "learning_rate": 1.865180860258816e-05, + "loss": 0.5445, + "step": 2333 + }, + { + "epoch": 0.5206335043497657, + "grad_norm": 0.16381436586380005, + "learning_rate": 1.8650628212514516e-05, + "loss": 0.4956, + "step": 2334 + }, + { + "epoch": 0.5208565692616551, + "grad_norm": 0.17565996944904327, + "learning_rate": 1.864944734331466e-05, + "loss": 0.5186, + "step": 2335 + }, + { + "epoch": 0.5210796341735445, + "grad_norm": 0.15597450733184814, + "learning_rate": 1.8648265995054005e-05, + "loss": 0.4993, + "step": 2336 + }, + { + "epoch": 0.5213026990854338, + "grad_norm": 0.22045518457889557, + "learning_rate": 1.8647084167797982e-05, + "loss": 0.4938, + "step": 2337 + }, + { + "epoch": 0.5215257639973232, + "grad_norm": 0.14948628842830658, + "learning_rate": 1.8645901861612044e-05, + "loss": 0.4637, + "step": 2338 + }, + { + "epoch": 0.5217488289092126, + "grad_norm": 0.16708412766456604, + "learning_rate": 1.8644719076561675e-05, + "loss": 0.5093, + "step": 2339 + }, + { + "epoch": 0.521971893821102, + "grad_norm": 0.1535186767578125, + "learning_rate": 1.8643535812712386e-05, + "loss": 0.4894, + "step": 2340 + }, + { + "epoch": 0.5221949587329913, + "grad_norm": 0.19612246751785278, + "learning_rate": 1.8642352070129715e-05, + "loss": 0.4719, + "step": 2341 + }, + { + "epoch": 0.5224180236448807, + "grad_norm": 0.1691182553768158, + "learning_rate": 1.8641167848879225e-05, + "loss": 0.5119, + "step": 2342 + }, + { + "epoch": 0.52264108855677, + "grad_norm": 0.16633757948875427, + "learning_rate": 1.86399831490265e-05, + "loss": 0.4941, + "step": 2343 + }, + { + "epoch": 0.5228641534686593, + "grad_norm": 0.15868477523326874, + "learning_rate": 1.8638797970637162e-05, + "loss": 0.5114, + "step": 2344 + }, + { + "epoch": 0.5230872183805487, + "grad_norm": 0.1649598479270935, + "learning_rate": 1.8637612313776856e-05, + "loss": 0.5446, + "step": 2345 + }, + { + "epoch": 0.5233102832924381, + "grad_norm": 0.1706569939851761, + "learning_rate": 1.8636426178511246e-05, + "loss": 0.509, + "step": 2346 + }, + { + "epoch": 0.5235333482043275, + "grad_norm": 0.16140028834342957, + "learning_rate": 1.8635239564906026e-05, + "loss": 0.4963, + "step": 2347 + }, + { + "epoch": 0.5237564131162168, + "grad_norm": 0.16061624884605408, + "learning_rate": 1.8634052473026925e-05, + "loss": 0.5149, + "step": 2348 + }, + { + "epoch": 0.5239794780281062, + "grad_norm": 0.1699933260679245, + "learning_rate": 1.8632864902939684e-05, + "loss": 0.5436, + "step": 2349 + }, + { + "epoch": 0.5242025429399956, + "grad_norm": 0.16812016069889069, + "learning_rate": 1.8631676854710082e-05, + "loss": 0.5179, + "step": 2350 + }, + { + "epoch": 0.5244256078518849, + "grad_norm": 0.1655721366405487, + "learning_rate": 1.8630488328403924e-05, + "loss": 0.5069, + "step": 2351 + }, + { + "epoch": 0.5246486727637742, + "grad_norm": 0.28273990750312805, + "learning_rate": 1.8629299324087032e-05, + "loss": 0.5207, + "step": 2352 + }, + { + "epoch": 0.5248717376756636, + "grad_norm": 0.167250394821167, + "learning_rate": 1.8628109841825263e-05, + "loss": 0.4995, + "step": 2353 + }, + { + "epoch": 0.525094802587553, + "grad_norm": 0.15988002717494965, + "learning_rate": 1.8626919881684497e-05, + "loss": 0.5331, + "step": 2354 + }, + { + "epoch": 0.5253178674994423, + "grad_norm": 0.18735967576503754, + "learning_rate": 1.8625729443730643e-05, + "loss": 0.508, + "step": 2355 + }, + { + "epoch": 0.5255409324113317, + "grad_norm": 0.16121627390384674, + "learning_rate": 1.8624538528029638e-05, + "loss": 0.4926, + "step": 2356 + }, + { + "epoch": 0.5257639973232211, + "grad_norm": 0.16314919292926788, + "learning_rate": 1.8623347134647437e-05, + "loss": 0.4936, + "step": 2357 + }, + { + "epoch": 0.5259870622351104, + "grad_norm": 0.16220073401927948, + "learning_rate": 1.862215526365003e-05, + "loss": 0.4866, + "step": 2358 + }, + { + "epoch": 0.5262101271469998, + "grad_norm": 0.18198350071907043, + "learning_rate": 1.8620962915103425e-05, + "loss": 0.4969, + "step": 2359 + }, + { + "epoch": 0.5264331920588892, + "grad_norm": 0.16916455328464508, + "learning_rate": 1.8619770089073665e-05, + "loss": 0.4872, + "step": 2360 + }, + { + "epoch": 0.5266562569707784, + "grad_norm": 0.19399681687355042, + "learning_rate": 1.861857678562682e-05, + "loss": 0.5037, + "step": 2361 + }, + { + "epoch": 0.5268793218826678, + "grad_norm": 0.16876184940338135, + "learning_rate": 1.8617383004828978e-05, + "loss": 0.5265, + "step": 2362 + }, + { + "epoch": 0.5271023867945572, + "grad_norm": 0.17060886323451996, + "learning_rate": 1.8616188746746262e-05, + "loss": 0.5247, + "step": 2363 + }, + { + "epoch": 0.5273254517064466, + "grad_norm": 0.17065522074699402, + "learning_rate": 1.8614994011444812e-05, + "loss": 0.5258, + "step": 2364 + }, + { + "epoch": 0.5275485166183359, + "grad_norm": 0.41077789664268494, + "learning_rate": 1.8613798798990806e-05, + "loss": 0.4971, + "step": 2365 + }, + { + "epoch": 0.5277715815302253, + "grad_norm": 0.1597769409418106, + "learning_rate": 1.8612603109450437e-05, + "loss": 0.4971, + "step": 2366 + }, + { + "epoch": 0.5279946464421147, + "grad_norm": 0.15872445702552795, + "learning_rate": 1.8611406942889934e-05, + "loss": 0.4987, + "step": 2367 + }, + { + "epoch": 0.528217711354004, + "grad_norm": 0.1601647585630417, + "learning_rate": 1.861021029937555e-05, + "loss": 0.4791, + "step": 2368 + }, + { + "epoch": 0.5284407762658934, + "grad_norm": 0.1533002108335495, + "learning_rate": 1.8609013178973555e-05, + "loss": 0.4842, + "step": 2369 + }, + { + "epoch": 0.5286638411777828, + "grad_norm": 0.16213175654411316, + "learning_rate": 1.8607815581750257e-05, + "loss": 0.5063, + "step": 2370 + }, + { + "epoch": 0.5288869060896721, + "grad_norm": 0.1895373910665512, + "learning_rate": 1.860661750777199e-05, + "loss": 0.4885, + "step": 2371 + }, + { + "epoch": 0.5291099710015614, + "grad_norm": 0.1578717827796936, + "learning_rate": 1.8605418957105105e-05, + "loss": 0.4541, + "step": 2372 + }, + { + "epoch": 0.5293330359134508, + "grad_norm": 0.17269465327262878, + "learning_rate": 1.8604219929815987e-05, + "loss": 0.5234, + "step": 2373 + }, + { + "epoch": 0.5295561008253402, + "grad_norm": 0.16912835836410522, + "learning_rate": 1.860302042597105e-05, + "loss": 0.5295, + "step": 2374 + }, + { + "epoch": 0.5297791657372295, + "grad_norm": 0.16457562148571014, + "learning_rate": 1.8601820445636722e-05, + "loss": 0.5096, + "step": 2375 + }, + { + "epoch": 0.5300022306491189, + "grad_norm": 0.17353565990924835, + "learning_rate": 1.860061998887947e-05, + "loss": 0.5086, + "step": 2376 + }, + { + "epoch": 0.5302252955610083, + "grad_norm": 0.1719389110803604, + "learning_rate": 1.859941905576579e-05, + "loss": 0.5045, + "step": 2377 + }, + { + "epoch": 0.5304483604728976, + "grad_norm": 0.15863987803459167, + "learning_rate": 1.8598217646362183e-05, + "loss": 0.5007, + "step": 2378 + }, + { + "epoch": 0.530671425384787, + "grad_norm": 0.16614408791065216, + "learning_rate": 1.85970157607352e-05, + "loss": 0.5123, + "step": 2379 + }, + { + "epoch": 0.5308944902966763, + "grad_norm": 0.1657867580652237, + "learning_rate": 1.859581339895141e-05, + "loss": 0.5237, + "step": 2380 + }, + { + "epoch": 0.5311175552085657, + "grad_norm": 0.1731244921684265, + "learning_rate": 1.85946105610774e-05, + "loss": 0.5407, + "step": 2381 + }, + { + "epoch": 0.531340620120455, + "grad_norm": 0.15216992795467377, + "learning_rate": 1.85934072471798e-05, + "loss": 0.4709, + "step": 2382 + }, + { + "epoch": 0.5315636850323444, + "grad_norm": 0.1583755612373352, + "learning_rate": 1.8592203457325248e-05, + "loss": 0.491, + "step": 2383 + }, + { + "epoch": 0.5317867499442338, + "grad_norm": 0.16302180290222168, + "learning_rate": 1.859099919158042e-05, + "loss": 0.4984, + "step": 2384 + }, + { + "epoch": 0.5320098148561231, + "grad_norm": 0.3964548110961914, + "learning_rate": 1.858979445001202e-05, + "loss": 0.5208, + "step": 2385 + }, + { + "epoch": 0.5322328797680125, + "grad_norm": 0.1690683513879776, + "learning_rate": 1.8588589232686768e-05, + "loss": 0.4932, + "step": 2386 + }, + { + "epoch": 0.5324559446799019, + "grad_norm": 0.18451258540153503, + "learning_rate": 1.8587383539671424e-05, + "loss": 0.5261, + "step": 2387 + }, + { + "epoch": 0.5326790095917913, + "grad_norm": 0.18741649389266968, + "learning_rate": 1.858617737103276e-05, + "loss": 0.499, + "step": 2388 + }, + { + "epoch": 0.5329020745036805, + "grad_norm": 0.16601239144802094, + "learning_rate": 1.8584970726837587e-05, + "loss": 0.5176, + "step": 2389 + }, + { + "epoch": 0.5331251394155699, + "grad_norm": 0.16434918344020844, + "learning_rate": 1.858376360715273e-05, + "loss": 0.497, + "step": 2390 + }, + { + "epoch": 0.5333482043274593, + "grad_norm": 0.16978149116039276, + "learning_rate": 1.8582556012045053e-05, + "loss": 0.4946, + "step": 2391 + }, + { + "epoch": 0.5335712692393486, + "grad_norm": 0.17810384929180145, + "learning_rate": 1.8581347941581438e-05, + "loss": 0.5223, + "step": 2392 + }, + { + "epoch": 0.533794334151238, + "grad_norm": 0.1722012311220169, + "learning_rate": 1.8580139395828795e-05, + "loss": 0.5082, + "step": 2393 + }, + { + "epoch": 0.5340173990631274, + "grad_norm": 0.16442382335662842, + "learning_rate": 1.857893037485406e-05, + "loss": 0.4838, + "step": 2394 + }, + { + "epoch": 0.5342404639750167, + "grad_norm": 0.17343981564044952, + "learning_rate": 1.8577720878724195e-05, + "loss": 0.5024, + "step": 2395 + }, + { + "epoch": 0.5344635288869061, + "grad_norm": 0.1579180806875229, + "learning_rate": 1.8576510907506192e-05, + "loss": 0.5168, + "step": 2396 + }, + { + "epoch": 0.5346865937987955, + "grad_norm": 0.1650589406490326, + "learning_rate": 1.8575300461267073e-05, + "loss": 0.5063, + "step": 2397 + }, + { + "epoch": 0.5349096587106849, + "grad_norm": 0.17452824115753174, + "learning_rate": 1.8574089540073868e-05, + "loss": 0.5272, + "step": 2398 + }, + { + "epoch": 0.5351327236225741, + "grad_norm": 0.16344332695007324, + "learning_rate": 1.8572878143993652e-05, + "loss": 0.4876, + "step": 2399 + }, + { + "epoch": 0.5353557885344635, + "grad_norm": 0.16399069130420685, + "learning_rate": 1.857166627309352e-05, + "loss": 0.4929, + "step": 2400 + }, + { + "epoch": 0.5355788534463529, + "grad_norm": 0.1590677946805954, + "learning_rate": 1.857045392744059e-05, + "loss": 0.4672, + "step": 2401 + }, + { + "epoch": 0.5358019183582422, + "grad_norm": 0.15974955260753632, + "learning_rate": 1.8569241107102014e-05, + "loss": 0.504, + "step": 2402 + }, + { + "epoch": 0.5360249832701316, + "grad_norm": 0.15898270905017853, + "learning_rate": 1.856802781214496e-05, + "loss": 0.5031, + "step": 2403 + }, + { + "epoch": 0.536248048182021, + "grad_norm": 0.15949919819831848, + "learning_rate": 1.856681404263663e-05, + "loss": 0.4957, + "step": 2404 + }, + { + "epoch": 0.5364711130939104, + "grad_norm": 0.1708550751209259, + "learning_rate": 1.8565599798644253e-05, + "loss": 0.5325, + "step": 2405 + }, + { + "epoch": 0.5366941780057997, + "grad_norm": 0.1631123423576355, + "learning_rate": 1.856438508023508e-05, + "loss": 0.5044, + "step": 2406 + }, + { + "epoch": 0.536917242917689, + "grad_norm": 0.17329680919647217, + "learning_rate": 1.8563169887476386e-05, + "loss": 0.4905, + "step": 2407 + }, + { + "epoch": 0.5371403078295784, + "grad_norm": 0.1555010825395584, + "learning_rate": 1.8561954220435483e-05, + "loss": 0.4896, + "step": 2408 + }, + { + "epoch": 0.5373633727414677, + "grad_norm": 0.1617012768983841, + "learning_rate": 1.85607380791797e-05, + "loss": 0.5199, + "step": 2409 + }, + { + "epoch": 0.5375864376533571, + "grad_norm": 0.16019612550735474, + "learning_rate": 1.8559521463776388e-05, + "loss": 0.5331, + "step": 2410 + }, + { + "epoch": 0.5378095025652465, + "grad_norm": 0.15812602639198303, + "learning_rate": 1.855830437429294e-05, + "loss": 0.4946, + "step": 2411 + }, + { + "epoch": 0.5380325674771358, + "grad_norm": 0.16577614843845367, + "learning_rate": 1.8557086810796756e-05, + "loss": 0.5049, + "step": 2412 + }, + { + "epoch": 0.5382556323890252, + "grad_norm": 3.4729576110839844, + "learning_rate": 1.8555868773355283e-05, + "loss": 0.5572, + "step": 2413 + }, + { + "epoch": 0.5384786973009146, + "grad_norm": 0.177010640501976, + "learning_rate": 1.8554650262035975e-05, + "loss": 0.4871, + "step": 2414 + }, + { + "epoch": 0.538701762212804, + "grad_norm": 0.16966521739959717, + "learning_rate": 1.8553431276906328e-05, + "loss": 0.5043, + "step": 2415 + }, + { + "epoch": 0.5389248271246933, + "grad_norm": 0.16035056114196777, + "learning_rate": 1.855221181803385e-05, + "loss": 0.4951, + "step": 2416 + }, + { + "epoch": 0.5391478920365826, + "grad_norm": 0.1640123724937439, + "learning_rate": 1.8550991885486093e-05, + "loss": 0.5175, + "step": 2417 + }, + { + "epoch": 0.539370956948472, + "grad_norm": 0.16451038420200348, + "learning_rate": 1.8549771479330612e-05, + "loss": 0.5192, + "step": 2418 + }, + { + "epoch": 0.5395940218603613, + "grad_norm": 0.16037864983081818, + "learning_rate": 1.8548550599635007e-05, + "loss": 0.5174, + "step": 2419 + }, + { + "epoch": 0.5398170867722507, + "grad_norm": 0.1533394753932953, + "learning_rate": 1.85473292464669e-05, + "loss": 0.502, + "step": 2420 + }, + { + "epoch": 0.5400401516841401, + "grad_norm": 0.16923432052135468, + "learning_rate": 1.854610741989393e-05, + "loss": 0.4917, + "step": 2421 + }, + { + "epoch": 0.5402632165960295, + "grad_norm": 0.16241200268268585, + "learning_rate": 1.8544885119983774e-05, + "loss": 0.4922, + "step": 2422 + }, + { + "epoch": 0.5404862815079188, + "grad_norm": 0.1572006493806839, + "learning_rate": 1.8543662346804138e-05, + "loss": 0.4771, + "step": 2423 + }, + { + "epoch": 0.5407093464198082, + "grad_norm": 0.18193307518959045, + "learning_rate": 1.8542439100422733e-05, + "loss": 0.512, + "step": 2424 + }, + { + "epoch": 0.5409324113316976, + "grad_norm": 0.1868167519569397, + "learning_rate": 1.8541215380907317e-05, + "loss": 0.5112, + "step": 2425 + }, + { + "epoch": 0.5411554762435868, + "grad_norm": 0.1707518994808197, + "learning_rate": 1.8539991188325664e-05, + "loss": 0.4923, + "step": 2426 + }, + { + "epoch": 0.5413785411554762, + "grad_norm": 0.16489239037036896, + "learning_rate": 1.8538766522745587e-05, + "loss": 0.4787, + "step": 2427 + }, + { + "epoch": 0.5416016060673656, + "grad_norm": 0.16489636898040771, + "learning_rate": 1.8537541384234906e-05, + "loss": 0.4966, + "step": 2428 + }, + { + "epoch": 0.541824670979255, + "grad_norm": 0.17048819363117218, + "learning_rate": 1.853631577286148e-05, + "loss": 0.5079, + "step": 2429 + }, + { + "epoch": 0.5420477358911443, + "grad_norm": 0.16793492436408997, + "learning_rate": 1.853508968869319e-05, + "loss": 0.5122, + "step": 2430 + }, + { + "epoch": 0.5422708008030337, + "grad_norm": 0.16388513147830963, + "learning_rate": 1.8533863131797948e-05, + "loss": 0.4653, + "step": 2431 + }, + { + "epoch": 0.5424938657149231, + "grad_norm": 0.16700832545757294, + "learning_rate": 1.853263610224368e-05, + "loss": 0.5208, + "step": 2432 + }, + { + "epoch": 0.5427169306268124, + "grad_norm": 0.17072373628616333, + "learning_rate": 1.8531408600098356e-05, + "loss": 0.5032, + "step": 2433 + }, + { + "epoch": 0.5429399955387018, + "grad_norm": 0.16046811640262604, + "learning_rate": 1.8530180625429958e-05, + "loss": 0.515, + "step": 2434 + }, + { + "epoch": 0.5431630604505912, + "grad_norm": 0.16855685412883759, + "learning_rate": 1.8528952178306504e-05, + "loss": 0.4915, + "step": 2435 + }, + { + "epoch": 0.5433861253624804, + "grad_norm": 0.17628896236419678, + "learning_rate": 1.8527723258796025e-05, + "loss": 0.5221, + "step": 2436 + }, + { + "epoch": 0.5436091902743698, + "grad_norm": 0.1593201607465744, + "learning_rate": 1.852649386696659e-05, + "loss": 0.5027, + "step": 2437 + }, + { + "epoch": 0.5438322551862592, + "grad_norm": 0.18933804333209991, + "learning_rate": 1.852526400288629e-05, + "loss": 0.5187, + "step": 2438 + }, + { + "epoch": 0.5440553200981486, + "grad_norm": 0.16613252460956573, + "learning_rate": 1.852403366662325e-05, + "loss": 0.4939, + "step": 2439 + }, + { + "epoch": 0.5442783850100379, + "grad_norm": 0.16360655426979065, + "learning_rate": 1.85228028582456e-05, + "loss": 0.4835, + "step": 2440 + }, + { + "epoch": 0.5445014499219273, + "grad_norm": 0.1624102145433426, + "learning_rate": 1.8521571577821522e-05, + "loss": 0.5308, + "step": 2441 + }, + { + "epoch": 0.5447245148338167, + "grad_norm": 0.16573922336101532, + "learning_rate": 1.8520339825419204e-05, + "loss": 0.5148, + "step": 2442 + }, + { + "epoch": 0.544947579745706, + "grad_norm": 0.17061501741409302, + "learning_rate": 1.8519107601106875e-05, + "loss": 0.5025, + "step": 2443 + }, + { + "epoch": 0.5451706446575953, + "grad_norm": 0.16382652521133423, + "learning_rate": 1.851787490495278e-05, + "loss": 0.5214, + "step": 2444 + }, + { + "epoch": 0.5453937095694847, + "grad_norm": 0.16731955111026764, + "learning_rate": 1.8516641737025187e-05, + "loss": 0.4915, + "step": 2445 + }, + { + "epoch": 0.5456167744813741, + "grad_norm": 0.1665150374174118, + "learning_rate": 1.8515408097392408e-05, + "loss": 0.4585, + "step": 2446 + }, + { + "epoch": 0.5458398393932634, + "grad_norm": 0.16348014771938324, + "learning_rate": 1.851417398612276e-05, + "loss": 0.5042, + "step": 2447 + }, + { + "epoch": 0.5460629043051528, + "grad_norm": 0.1641550064086914, + "learning_rate": 1.85129394032846e-05, + "loss": 0.5103, + "step": 2448 + }, + { + "epoch": 0.5462859692170422, + "grad_norm": 0.16449302434921265, + "learning_rate": 1.8511704348946314e-05, + "loss": 0.5043, + "step": 2449 + }, + { + "epoch": 0.5465090341289315, + "grad_norm": 0.1623837649822235, + "learning_rate": 1.85104688231763e-05, + "loss": 0.4831, + "step": 2450 + }, + { + "epoch": 0.5467320990408209, + "grad_norm": 0.16712717711925507, + "learning_rate": 1.8509232826042983e-05, + "loss": 0.5146, + "step": 2451 + }, + { + "epoch": 0.5469551639527103, + "grad_norm": 0.14933447539806366, + "learning_rate": 1.850799635761483e-05, + "loss": 0.5011, + "step": 2452 + }, + { + "epoch": 0.5471782288645995, + "grad_norm": 0.17392055690288544, + "learning_rate": 1.8506759417960322e-05, + "loss": 0.482, + "step": 2453 + }, + { + "epoch": 0.5474012937764889, + "grad_norm": 0.1616058647632599, + "learning_rate": 1.850552200714797e-05, + "loss": 0.5152, + "step": 2454 + }, + { + "epoch": 0.5476243586883783, + "grad_norm": 0.16296276450157166, + "learning_rate": 1.8504284125246304e-05, + "loss": 0.5073, + "step": 2455 + }, + { + "epoch": 0.5478474236002677, + "grad_norm": 0.16247029602527618, + "learning_rate": 1.850304577232389e-05, + "loss": 0.5194, + "step": 2456 + }, + { + "epoch": 0.548070488512157, + "grad_norm": 0.16784507036209106, + "learning_rate": 1.8501806948449316e-05, + "loss": 0.5212, + "step": 2457 + }, + { + "epoch": 0.5482935534240464, + "grad_norm": 0.1538800299167633, + "learning_rate": 1.8500567653691192e-05, + "loss": 0.4907, + "step": 2458 + }, + { + "epoch": 0.5485166183359358, + "grad_norm": 0.1586543768644333, + "learning_rate": 1.8499327888118163e-05, + "loss": 0.4873, + "step": 2459 + }, + { + "epoch": 0.5487396832478251, + "grad_norm": 0.15387628972530365, + "learning_rate": 1.8498087651798893e-05, + "loss": 0.5102, + "step": 2460 + }, + { + "epoch": 0.5489627481597145, + "grad_norm": 0.15858915448188782, + "learning_rate": 1.8496846944802072e-05, + "loss": 0.4983, + "step": 2461 + }, + { + "epoch": 0.5491858130716039, + "grad_norm": 0.15983784198760986, + "learning_rate": 1.849560576719642e-05, + "loss": 0.4728, + "step": 2462 + }, + { + "epoch": 0.5494088779834932, + "grad_norm": 0.1605166792869568, + "learning_rate": 1.849436411905068e-05, + "loss": 0.5083, + "step": 2463 + }, + { + "epoch": 0.5496319428953825, + "grad_norm": 0.15921124815940857, + "learning_rate": 1.8493122000433628e-05, + "loss": 0.5054, + "step": 2464 + }, + { + "epoch": 0.5498550078072719, + "grad_norm": 0.15854284167289734, + "learning_rate": 1.849187941141405e-05, + "loss": 0.5111, + "step": 2465 + }, + { + "epoch": 0.5500780727191613, + "grad_norm": 0.16846506297588348, + "learning_rate": 1.8490636352060778e-05, + "loss": 0.526, + "step": 2466 + }, + { + "epoch": 0.5503011376310506, + "grad_norm": 0.1619185209274292, + "learning_rate": 1.8489392822442657e-05, + "loss": 0.4965, + "step": 2467 + }, + { + "epoch": 0.55052420254294, + "grad_norm": 0.17305007576942444, + "learning_rate": 1.8488148822628557e-05, + "loss": 0.5171, + "step": 2468 + }, + { + "epoch": 0.5507472674548294, + "grad_norm": 0.16538777947425842, + "learning_rate": 1.8486904352687384e-05, + "loss": 0.5187, + "step": 2469 + }, + { + "epoch": 0.5509703323667187, + "grad_norm": 0.1914980560541153, + "learning_rate": 1.8485659412688065e-05, + "loss": 0.5105, + "step": 2470 + }, + { + "epoch": 0.551193397278608, + "grad_norm": 0.16565662622451782, + "learning_rate": 1.8484414002699552e-05, + "loss": 0.4949, + "step": 2471 + }, + { + "epoch": 0.5514164621904974, + "grad_norm": 0.17171315848827362, + "learning_rate": 1.848316812279082e-05, + "loss": 0.507, + "step": 2472 + }, + { + "epoch": 0.5516395271023868, + "grad_norm": 0.1598774790763855, + "learning_rate": 1.8481921773030878e-05, + "loss": 0.5101, + "step": 2473 + }, + { + "epoch": 0.5518625920142761, + "grad_norm": 0.16791561245918274, + "learning_rate": 1.8480674953488752e-05, + "loss": 0.4902, + "step": 2474 + }, + { + "epoch": 0.5520856569261655, + "grad_norm": 0.17411862313747406, + "learning_rate": 1.8479427664233505e-05, + "loss": 0.5017, + "step": 2475 + }, + { + "epoch": 0.5523087218380549, + "grad_norm": 0.16108182072639465, + "learning_rate": 1.8478179905334213e-05, + "loss": 0.4886, + "step": 2476 + }, + { + "epoch": 0.5525317867499442, + "grad_norm": 0.1858782172203064, + "learning_rate": 1.847693167685999e-05, + "loss": 0.5047, + "step": 2477 + }, + { + "epoch": 0.5527548516618336, + "grad_norm": 0.17761607468128204, + "learning_rate": 1.847568297887997e-05, + "loss": 0.5233, + "step": 2478 + }, + { + "epoch": 0.552977916573723, + "grad_norm": 0.1658019870519638, + "learning_rate": 1.8474433811463307e-05, + "loss": 0.5263, + "step": 2479 + }, + { + "epoch": 0.5532009814856124, + "grad_norm": 1.4999321699142456, + "learning_rate": 1.84731841746792e-05, + "loss": 0.5359, + "step": 2480 + }, + { + "epoch": 0.5534240463975016, + "grad_norm": 0.17992182075977325, + "learning_rate": 1.847193406859685e-05, + "loss": 0.5033, + "step": 2481 + }, + { + "epoch": 0.553647111309391, + "grad_norm": 0.17310257256031036, + "learning_rate": 1.8470683493285503e-05, + "loss": 0.4905, + "step": 2482 + }, + { + "epoch": 0.5538701762212804, + "grad_norm": 0.16551616787910461, + "learning_rate": 1.846943244881442e-05, + "loss": 0.4925, + "step": 2483 + }, + { + "epoch": 0.5540932411331697, + "grad_norm": 0.17117410898208618, + "learning_rate": 1.846818093525289e-05, + "loss": 0.5128, + "step": 2484 + }, + { + "epoch": 0.5543163060450591, + "grad_norm": 0.15640629827976227, + "learning_rate": 1.8466928952670242e-05, + "loss": 0.5054, + "step": 2485 + }, + { + "epoch": 0.5545393709569485, + "grad_norm": 0.16013699769973755, + "learning_rate": 1.8465676501135804e-05, + "loss": 0.4966, + "step": 2486 + }, + { + "epoch": 0.5547624358688378, + "grad_norm": 0.16586218774318695, + "learning_rate": 1.846442358071895e-05, + "loss": 0.506, + "step": 2487 + }, + { + "epoch": 0.5549855007807272, + "grad_norm": 0.1749541014432907, + "learning_rate": 1.8463170191489075e-05, + "loss": 0.4733, + "step": 2488 + }, + { + "epoch": 0.5552085656926166, + "grad_norm": 0.16143982112407684, + "learning_rate": 1.84619163335156e-05, + "loss": 0.5014, + "step": 2489 + }, + { + "epoch": 0.555431630604506, + "grad_norm": 0.1761699765920639, + "learning_rate": 1.846066200686797e-05, + "loss": 0.5305, + "step": 2490 + }, + { + "epoch": 0.5556546955163952, + "grad_norm": 0.25962182879447937, + "learning_rate": 1.8459407211615658e-05, + "loss": 0.5047, + "step": 2491 + }, + { + "epoch": 0.5558777604282846, + "grad_norm": 0.15808863937854767, + "learning_rate": 1.8458151947828165e-05, + "loss": 0.5079, + "step": 2492 + }, + { + "epoch": 0.556100825340174, + "grad_norm": 0.1679726392030716, + "learning_rate": 1.8456896215575013e-05, + "loss": 0.5097, + "step": 2493 + }, + { + "epoch": 0.5563238902520633, + "grad_norm": 0.15282638370990753, + "learning_rate": 1.845564001492575e-05, + "loss": 0.5196, + "step": 2494 + }, + { + "epoch": 0.5565469551639527, + "grad_norm": 0.1774146854877472, + "learning_rate": 1.8454383345949954e-05, + "loss": 0.5227, + "step": 2495 + }, + { + "epoch": 0.5567700200758421, + "grad_norm": 0.1605686992406845, + "learning_rate": 1.8453126208717235e-05, + "loss": 0.4751, + "step": 2496 + }, + { + "epoch": 0.5569930849877315, + "grad_norm": 0.16172072291374207, + "learning_rate": 1.845186860329721e-05, + "loss": 0.5158, + "step": 2497 + }, + { + "epoch": 0.5572161498996208, + "grad_norm": 0.16925646364688873, + "learning_rate": 1.8450610529759535e-05, + "loss": 0.5148, + "step": 2498 + }, + { + "epoch": 0.5574392148115102, + "grad_norm": 0.16925497353076935, + "learning_rate": 1.8449351988173894e-05, + "loss": 0.4947, + "step": 2499 + }, + { + "epoch": 0.5576622797233995, + "grad_norm": 1.354637861251831, + "learning_rate": 1.8448092978609993e-05, + "loss": 0.5136, + "step": 2500 + }, + { + "epoch": 0.5578853446352888, + "grad_norm": 0.16736604273319244, + "learning_rate": 1.844683350113756e-05, + "loss": 0.5389, + "step": 2501 + }, + { + "epoch": 0.5581084095471782, + "grad_norm": 0.16498203575611115, + "learning_rate": 1.8445573555826355e-05, + "loss": 0.5018, + "step": 2502 + }, + { + "epoch": 0.5583314744590676, + "grad_norm": 0.17059291899204254, + "learning_rate": 1.8444313142746164e-05, + "loss": 0.5282, + "step": 2503 + }, + { + "epoch": 0.558554539370957, + "grad_norm": 0.21622738242149353, + "learning_rate": 1.844305226196679e-05, + "loss": 0.4923, + "step": 2504 + }, + { + "epoch": 0.5587776042828463, + "grad_norm": 0.16687920689582825, + "learning_rate": 1.844179091355808e-05, + "loss": 0.4998, + "step": 2505 + }, + { + "epoch": 0.5590006691947357, + "grad_norm": 0.1704476922750473, + "learning_rate": 1.8440529097589885e-05, + "loss": 0.5267, + "step": 2506 + }, + { + "epoch": 0.5592237341066251, + "grad_norm": 0.16384254395961761, + "learning_rate": 1.8439266814132092e-05, + "loss": 0.5053, + "step": 2507 + }, + { + "epoch": 0.5594467990185144, + "grad_norm": 0.16077467799186707, + "learning_rate": 1.843800406325462e-05, + "loss": 0.5151, + "step": 2508 + }, + { + "epoch": 0.5596698639304037, + "grad_norm": 0.16944189369678497, + "learning_rate": 1.843674084502741e-05, + "loss": 0.5054, + "step": 2509 + }, + { + "epoch": 0.5598929288422931, + "grad_norm": 0.16107740998268127, + "learning_rate": 1.8435477159520418e-05, + "loss": 0.4861, + "step": 2510 + }, + { + "epoch": 0.5601159937541824, + "grad_norm": 0.17457795143127441, + "learning_rate": 1.843421300680364e-05, + "loss": 0.486, + "step": 2511 + }, + { + "epoch": 0.5603390586660718, + "grad_norm": 0.16926448047161102, + "learning_rate": 1.8432948386947092e-05, + "loss": 0.5238, + "step": 2512 + }, + { + "epoch": 0.5605621235779612, + "grad_norm": 0.16914579272270203, + "learning_rate": 1.8431683300020817e-05, + "loss": 0.5021, + "step": 2513 + }, + { + "epoch": 0.5607851884898506, + "grad_norm": 0.16727939248085022, + "learning_rate": 1.8430417746094886e-05, + "loss": 0.53, + "step": 2514 + }, + { + "epoch": 0.5610082534017399, + "grad_norm": 0.1788937747478485, + "learning_rate": 1.842915172523939e-05, + "loss": 0.5226, + "step": 2515 + }, + { + "epoch": 0.5612313183136293, + "grad_norm": 0.17947039008140564, + "learning_rate": 1.8427885237524446e-05, + "loss": 0.4914, + "step": 2516 + }, + { + "epoch": 0.5614543832255187, + "grad_norm": 0.17127928137779236, + "learning_rate": 1.842661828302021e-05, + "loss": 0.4802, + "step": 2517 + }, + { + "epoch": 0.5616774481374079, + "grad_norm": 0.20038791000843048, + "learning_rate": 1.8425350861796845e-05, + "loss": 0.4911, + "step": 2518 + }, + { + "epoch": 0.5619005130492973, + "grad_norm": 0.16238313913345337, + "learning_rate": 1.842408297392455e-05, + "loss": 0.492, + "step": 2519 + }, + { + "epoch": 0.5621235779611867, + "grad_norm": 0.17273001372814178, + "learning_rate": 1.8422814619473556e-05, + "loss": 0.555, + "step": 2520 + }, + { + "epoch": 0.5623466428730761, + "grad_norm": 0.19861721992492676, + "learning_rate": 1.84215457985141e-05, + "loss": 0.5245, + "step": 2521 + }, + { + "epoch": 0.5625697077849654, + "grad_norm": 0.15922409296035767, + "learning_rate": 1.8420276511116467e-05, + "loss": 0.4968, + "step": 2522 + }, + { + "epoch": 0.5627927726968548, + "grad_norm": 0.15657752752304077, + "learning_rate": 1.8419006757350956e-05, + "loss": 0.4923, + "step": 2523 + }, + { + "epoch": 0.5630158376087442, + "grad_norm": 0.1599670946598053, + "learning_rate": 1.8417736537287893e-05, + "loss": 0.5381, + "step": 2524 + }, + { + "epoch": 0.5632389025206335, + "grad_norm": 0.16177695989608765, + "learning_rate": 1.841646585099763e-05, + "loss": 0.5025, + "step": 2525 + }, + { + "epoch": 0.5634619674325229, + "grad_norm": 0.17016616463661194, + "learning_rate": 1.8415194698550548e-05, + "loss": 0.5024, + "step": 2526 + }, + { + "epoch": 0.5636850323444123, + "grad_norm": 0.15452872216701508, + "learning_rate": 1.8413923080017047e-05, + "loss": 0.4838, + "step": 2527 + }, + { + "epoch": 0.5639080972563015, + "grad_norm": 0.17372727394104004, + "learning_rate": 1.8412650995467564e-05, + "loss": 0.5272, + "step": 2528 + }, + { + "epoch": 0.5641311621681909, + "grad_norm": 0.20483483374118805, + "learning_rate": 1.8411378444972548e-05, + "loss": 0.4984, + "step": 2529 + }, + { + "epoch": 0.5643542270800803, + "grad_norm": 0.16677002608776093, + "learning_rate": 1.8410105428602485e-05, + "loss": 0.5217, + "step": 2530 + }, + { + "epoch": 0.5645772919919697, + "grad_norm": 0.16841678321361542, + "learning_rate": 1.840883194642788e-05, + "loss": 0.5331, + "step": 2531 + }, + { + "epoch": 0.564800356903859, + "grad_norm": 0.16345760226249695, + "learning_rate": 1.8407557998519273e-05, + "loss": 0.5372, + "step": 2532 + }, + { + "epoch": 0.5650234218157484, + "grad_norm": 0.16982464492321014, + "learning_rate": 1.840628358494721e-05, + "loss": 0.4795, + "step": 2533 + }, + { + "epoch": 0.5652464867276378, + "grad_norm": 0.16752153635025024, + "learning_rate": 1.840500870578229e-05, + "loss": 0.5088, + "step": 2534 + }, + { + "epoch": 0.5654695516395271, + "grad_norm": 0.16022369265556335, + "learning_rate": 1.840373336109512e-05, + "loss": 0.4676, + "step": 2535 + }, + { + "epoch": 0.5656926165514164, + "grad_norm": 0.15749748051166534, + "learning_rate": 1.8402457550956336e-05, + "loss": 0.5325, + "step": 2536 + }, + { + "epoch": 0.5659156814633058, + "grad_norm": 0.16572564840316772, + "learning_rate": 1.8401181275436596e-05, + "loss": 0.4972, + "step": 2537 + }, + { + "epoch": 0.5661387463751952, + "grad_norm": 0.15949216485023499, + "learning_rate": 1.839990453460659e-05, + "loss": 0.5088, + "step": 2538 + }, + { + "epoch": 0.5663618112870845, + "grad_norm": 0.15577349066734314, + "learning_rate": 1.8398627328537037e-05, + "loss": 0.4871, + "step": 2539 + }, + { + "epoch": 0.5665848761989739, + "grad_norm": 0.22663244605064392, + "learning_rate": 1.839734965729867e-05, + "loss": 0.51, + "step": 2540 + }, + { + "epoch": 0.5668079411108633, + "grad_norm": 0.16689926385879517, + "learning_rate": 1.8396071520962256e-05, + "loss": 0.4998, + "step": 2541 + }, + { + "epoch": 0.5670310060227526, + "grad_norm": 0.16619545221328735, + "learning_rate": 1.8394792919598592e-05, + "loss": 0.5208, + "step": 2542 + }, + { + "epoch": 0.567254070934642, + "grad_norm": 0.15518589317798615, + "learning_rate": 1.8393513853278492e-05, + "loss": 0.4967, + "step": 2543 + }, + { + "epoch": 0.5674771358465314, + "grad_norm": 0.19833387434482574, + "learning_rate": 1.8392234322072792e-05, + "loss": 0.5029, + "step": 2544 + }, + { + "epoch": 0.5677002007584206, + "grad_norm": 0.15093770623207092, + "learning_rate": 1.839095432605237e-05, + "loss": 0.5017, + "step": 2545 + }, + { + "epoch": 0.56792326567031, + "grad_norm": 0.19351163506507874, + "learning_rate": 1.8389673865288114e-05, + "loss": 0.494, + "step": 2546 + }, + { + "epoch": 0.5681463305821994, + "grad_norm": 0.16114504635334015, + "learning_rate": 1.8388392939850946e-05, + "loss": 0.5118, + "step": 2547 + }, + { + "epoch": 0.5683693954940888, + "grad_norm": 0.16146999597549438, + "learning_rate": 1.8387111549811812e-05, + "loss": 0.4732, + "step": 2548 + }, + { + "epoch": 0.5685924604059781, + "grad_norm": 0.1651589721441269, + "learning_rate": 1.8385829695241687e-05, + "loss": 0.5086, + "step": 2549 + }, + { + "epoch": 0.5688155253178675, + "grad_norm": 0.15596236288547516, + "learning_rate": 1.838454737621156e-05, + "loss": 0.5017, + "step": 2550 + }, + { + "epoch": 0.5690385902297569, + "grad_norm": 0.1615191251039505, + "learning_rate": 1.838326459279246e-05, + "loss": 0.5212, + "step": 2551 + }, + { + "epoch": 0.5692616551416462, + "grad_norm": 0.16675913333892822, + "learning_rate": 1.8381981345055435e-05, + "loss": 0.5229, + "step": 2552 + }, + { + "epoch": 0.5694847200535356, + "grad_norm": 0.16236376762390137, + "learning_rate": 1.8380697633071558e-05, + "loss": 0.4955, + "step": 2553 + }, + { + "epoch": 0.569707784965425, + "grad_norm": 0.17616188526153564, + "learning_rate": 1.837941345691193e-05, + "loss": 0.5067, + "step": 2554 + }, + { + "epoch": 0.5699308498773143, + "grad_norm": 0.16186662018299103, + "learning_rate": 1.8378128816647676e-05, + "loss": 0.5054, + "step": 2555 + }, + { + "epoch": 0.5701539147892036, + "grad_norm": 0.1674317866563797, + "learning_rate": 1.8376843712349946e-05, + "loss": 0.5009, + "step": 2556 + }, + { + "epoch": 0.570376979701093, + "grad_norm": 0.17086781561374664, + "learning_rate": 1.837555814408992e-05, + "loss": 0.4998, + "step": 2557 + }, + { + "epoch": 0.5706000446129824, + "grad_norm": 0.1624806523323059, + "learning_rate": 1.8374272111938797e-05, + "loss": 0.519, + "step": 2558 + }, + { + "epoch": 0.5708231095248717, + "grad_norm": 0.1701088398694992, + "learning_rate": 1.837298561596781e-05, + "loss": 0.5173, + "step": 2559 + }, + { + "epoch": 0.5710461744367611, + "grad_norm": 0.15940968692302704, + "learning_rate": 1.8371698656248212e-05, + "loss": 0.515, + "step": 2560 + }, + { + "epoch": 0.5712692393486505, + "grad_norm": 0.17150092124938965, + "learning_rate": 1.837041123285128e-05, + "loss": 0.4826, + "step": 2561 + }, + { + "epoch": 0.5714923042605398, + "grad_norm": 0.16845233738422394, + "learning_rate": 1.836912334584833e-05, + "loss": 0.5018, + "step": 2562 + }, + { + "epoch": 0.5717153691724292, + "grad_norm": 0.18033871054649353, + "learning_rate": 1.8367834995310676e-05, + "loss": 0.5177, + "step": 2563 + }, + { + "epoch": 0.5719384340843185, + "grad_norm": 0.17002245783805847, + "learning_rate": 1.8366546181309686e-05, + "loss": 0.5287, + "step": 2564 + }, + { + "epoch": 0.5721614989962079, + "grad_norm": 0.16821357607841492, + "learning_rate": 1.836525690391674e-05, + "loss": 0.522, + "step": 2565 + }, + { + "epoch": 0.5723845639080972, + "grad_norm": 0.1578892022371292, + "learning_rate": 1.836396716320325e-05, + "loss": 0.4929, + "step": 2566 + }, + { + "epoch": 0.5726076288199866, + "grad_norm": 0.15691736340522766, + "learning_rate": 1.836267695924065e-05, + "loss": 0.4775, + "step": 2567 + }, + { + "epoch": 0.572830693731876, + "grad_norm": 0.16604529321193695, + "learning_rate": 1.8361386292100394e-05, + "loss": 0.4854, + "step": 2568 + }, + { + "epoch": 0.5730537586437653, + "grad_norm": 0.16472390294075012, + "learning_rate": 1.8360095161853966e-05, + "loss": 0.486, + "step": 2569 + }, + { + "epoch": 0.5732768235556547, + "grad_norm": 0.1708393096923828, + "learning_rate": 1.8358803568572885e-05, + "loss": 0.5086, + "step": 2570 + }, + { + "epoch": 0.5734998884675441, + "grad_norm": 0.1598411649465561, + "learning_rate": 1.8357511512328683e-05, + "loss": 0.5071, + "step": 2571 + }, + { + "epoch": 0.5737229533794335, + "grad_norm": 0.19416102766990662, + "learning_rate": 1.8356218993192922e-05, + "loss": 0.5092, + "step": 2572 + }, + { + "epoch": 0.5739460182913227, + "grad_norm": 0.1696127951145172, + "learning_rate": 1.835492601123719e-05, + "loss": 0.5157, + "step": 2573 + }, + { + "epoch": 0.5741690832032121, + "grad_norm": 0.16586486995220184, + "learning_rate": 1.8353632566533102e-05, + "loss": 0.4948, + "step": 2574 + }, + { + "epoch": 0.5743921481151015, + "grad_norm": 0.17883430421352386, + "learning_rate": 1.8352338659152296e-05, + "loss": 0.4989, + "step": 2575 + }, + { + "epoch": 0.5746152130269908, + "grad_norm": 0.15909375250339508, + "learning_rate": 1.8351044289166435e-05, + "loss": 0.4892, + "step": 2576 + }, + { + "epoch": 0.5748382779388802, + "grad_norm": 0.17203538119792938, + "learning_rate": 1.834974945664721e-05, + "loss": 0.5039, + "step": 2577 + }, + { + "epoch": 0.5750613428507696, + "grad_norm": 0.17230457067489624, + "learning_rate": 1.834845416166634e-05, + "loss": 0.5102, + "step": 2578 + }, + { + "epoch": 0.575284407762659, + "grad_norm": 0.16863304376602173, + "learning_rate": 1.8347158404295566e-05, + "loss": 0.5365, + "step": 2579 + }, + { + "epoch": 0.5755074726745483, + "grad_norm": 0.15985195338726044, + "learning_rate": 1.8345862184606653e-05, + "loss": 0.4991, + "step": 2580 + }, + { + "epoch": 0.5757305375864377, + "grad_norm": 0.16000472009181976, + "learning_rate": 1.8344565502671396e-05, + "loss": 0.4898, + "step": 2581 + }, + { + "epoch": 0.575953602498327, + "grad_norm": 0.16451792418956757, + "learning_rate": 1.8343268358561607e-05, + "loss": 0.5382, + "step": 2582 + }, + { + "epoch": 0.5761766674102163, + "grad_norm": 0.16820542514324188, + "learning_rate": 1.834197075234914e-05, + "loss": 0.4993, + "step": 2583 + }, + { + "epoch": 0.5763997323221057, + "grad_norm": 0.16512157022953033, + "learning_rate": 1.834067268410586e-05, + "loss": 0.5139, + "step": 2584 + }, + { + "epoch": 0.5766227972339951, + "grad_norm": 0.16488297283649445, + "learning_rate": 1.833937415390366e-05, + "loss": 0.5371, + "step": 2585 + }, + { + "epoch": 0.5768458621458844, + "grad_norm": 0.1621064841747284, + "learning_rate": 1.8338075161814462e-05, + "loss": 0.5218, + "step": 2586 + }, + { + "epoch": 0.5770689270577738, + "grad_norm": 0.1790938526391983, + "learning_rate": 1.8336775707910214e-05, + "loss": 0.5357, + "step": 2587 + }, + { + "epoch": 0.5772919919696632, + "grad_norm": 0.16097491979599, + "learning_rate": 1.8335475792262888e-05, + "loss": 0.4874, + "step": 2588 + }, + { + "epoch": 0.5775150568815526, + "grad_norm": 0.17577522993087769, + "learning_rate": 1.8334175414944476e-05, + "loss": 0.5097, + "step": 2589 + }, + { + "epoch": 0.5777381217934419, + "grad_norm": 0.17199388146400452, + "learning_rate": 1.833287457602701e-05, + "loss": 0.4969, + "step": 2590 + }, + { + "epoch": 0.5779611867053313, + "grad_norm": 0.16499276459217072, + "learning_rate": 1.833157327558253e-05, + "loss": 0.5207, + "step": 2591 + }, + { + "epoch": 0.5781842516172206, + "grad_norm": 0.16708426177501678, + "learning_rate": 1.8330271513683118e-05, + "loss": 0.5077, + "step": 2592 + }, + { + "epoch": 0.5784073165291099, + "grad_norm": 0.1643514633178711, + "learning_rate": 1.8328969290400867e-05, + "loss": 0.4884, + "step": 2593 + }, + { + "epoch": 0.5786303814409993, + "grad_norm": 0.1548931747674942, + "learning_rate": 1.832766660580791e-05, + "loss": 0.5047, + "step": 2594 + }, + { + "epoch": 0.5788534463528887, + "grad_norm": 0.16089920699596405, + "learning_rate": 1.832636345997639e-05, + "loss": 0.4916, + "step": 2595 + }, + { + "epoch": 0.5790765112647781, + "grad_norm": 0.15845981240272522, + "learning_rate": 1.8325059852978485e-05, + "loss": 0.4832, + "step": 2596 + }, + { + "epoch": 0.5792995761766674, + "grad_norm": 0.1613757461309433, + "learning_rate": 1.83237557848864e-05, + "loss": 0.5192, + "step": 2597 + }, + { + "epoch": 0.5795226410885568, + "grad_norm": 0.15937237441539764, + "learning_rate": 1.8322451255772365e-05, + "loss": 0.5028, + "step": 2598 + }, + { + "epoch": 0.5797457060004462, + "grad_norm": 0.16132612526416779, + "learning_rate": 1.8321146265708627e-05, + "loss": 0.4948, + "step": 2599 + }, + { + "epoch": 0.5799687709123355, + "grad_norm": 0.18276172876358032, + "learning_rate": 1.8319840814767463e-05, + "loss": 0.4845, + "step": 2600 + }, + { + "epoch": 0.5801918358242248, + "grad_norm": 0.1661817729473114, + "learning_rate": 1.8318534903021182e-05, + "loss": 0.5344, + "step": 2601 + }, + { + "epoch": 0.5804149007361142, + "grad_norm": 0.15862146019935608, + "learning_rate": 1.8317228530542117e-05, + "loss": 0.5043, + "step": 2602 + }, + { + "epoch": 0.5806379656480035, + "grad_norm": 0.15994656085968018, + "learning_rate": 1.8315921697402618e-05, + "loss": 0.4765, + "step": 2603 + }, + { + "epoch": 0.5808610305598929, + "grad_norm": 0.16988401114940643, + "learning_rate": 1.8314614403675063e-05, + "loss": 0.5194, + "step": 2604 + }, + { + "epoch": 0.5810840954717823, + "grad_norm": 0.16828812658786774, + "learning_rate": 1.831330664943186e-05, + "loss": 0.5357, + "step": 2605 + }, + { + "epoch": 0.5813071603836717, + "grad_norm": 0.19693851470947266, + "learning_rate": 1.8311998434745445e-05, + "loss": 0.4976, + "step": 2606 + }, + { + "epoch": 0.581530225295561, + "grad_norm": 0.1689445823431015, + "learning_rate": 1.831068975968827e-05, + "loss": 0.4992, + "step": 2607 + }, + { + "epoch": 0.5817532902074504, + "grad_norm": 0.15751276910305023, + "learning_rate": 1.830938062433282e-05, + "loss": 0.4711, + "step": 2608 + }, + { + "epoch": 0.5819763551193398, + "grad_norm": 0.16275669634342194, + "learning_rate": 1.8308071028751608e-05, + "loss": 0.5184, + "step": 2609 + }, + { + "epoch": 0.582199420031229, + "grad_norm": 0.16536571085453033, + "learning_rate": 1.8306760973017158e-05, + "loss": 0.5172, + "step": 2610 + }, + { + "epoch": 0.5824224849431184, + "grad_norm": 0.18836647272109985, + "learning_rate": 1.830545045720203e-05, + "loss": 0.5019, + "step": 2611 + }, + { + "epoch": 0.5826455498550078, + "grad_norm": 0.16014382243156433, + "learning_rate": 1.830413948137882e-05, + "loss": 0.5182, + "step": 2612 + }, + { + "epoch": 0.5828686147668972, + "grad_norm": 0.1588478833436966, + "learning_rate": 1.8302828045620128e-05, + "loss": 0.5072, + "step": 2613 + }, + { + "epoch": 0.5830916796787865, + "grad_norm": 0.17403388023376465, + "learning_rate": 1.830151614999859e-05, + "loss": 0.5093, + "step": 2614 + }, + { + "epoch": 0.5833147445906759, + "grad_norm": 0.17456629872322083, + "learning_rate": 1.830020379458687e-05, + "loss": 0.4922, + "step": 2615 + }, + { + "epoch": 0.5835378095025653, + "grad_norm": 0.15623332560062408, + "learning_rate": 1.829889097945765e-05, + "loss": 0.5133, + "step": 2616 + }, + { + "epoch": 0.5837608744144546, + "grad_norm": 0.18376080691814423, + "learning_rate": 1.8297577704683653e-05, + "loss": 0.5154, + "step": 2617 + }, + { + "epoch": 0.583983939326344, + "grad_norm": 0.17060859501361847, + "learning_rate": 1.8296263970337602e-05, + "loss": 0.5058, + "step": 2618 + }, + { + "epoch": 0.5842070042382334, + "grad_norm": 0.1654299646615982, + "learning_rate": 1.829494977649227e-05, + "loss": 0.5072, + "step": 2619 + }, + { + "epoch": 0.5844300691501226, + "grad_norm": 0.1547902524471283, + "learning_rate": 1.829363512322044e-05, + "loss": 0.4967, + "step": 2620 + }, + { + "epoch": 0.584653134062012, + "grad_norm": 0.1600826382637024, + "learning_rate": 1.829232001059493e-05, + "loss": 0.5396, + "step": 2621 + }, + { + "epoch": 0.5848761989739014, + "grad_norm": 0.16009269654750824, + "learning_rate": 1.8291004438688578e-05, + "loss": 0.525, + "step": 2622 + }, + { + "epoch": 0.5850992638857908, + "grad_norm": 0.2055627703666687, + "learning_rate": 1.8289688407574246e-05, + "loss": 0.4993, + "step": 2623 + }, + { + "epoch": 0.5853223287976801, + "grad_norm": 0.1680591106414795, + "learning_rate": 1.8288371917324827e-05, + "loss": 0.5184, + "step": 2624 + }, + { + "epoch": 0.5855453937095695, + "grad_norm": 0.1637965738773346, + "learning_rate": 1.828705496801323e-05, + "loss": 0.5205, + "step": 2625 + }, + { + "epoch": 0.5857684586214589, + "grad_norm": 0.172703817486763, + "learning_rate": 1.828573755971241e-05, + "loss": 0.5201, + "step": 2626 + }, + { + "epoch": 0.5859915235333482, + "grad_norm": 0.18580362200737, + "learning_rate": 1.8284419692495316e-05, + "loss": 0.5246, + "step": 2627 + }, + { + "epoch": 0.5862145884452375, + "grad_norm": 0.16274447739124298, + "learning_rate": 1.8283101366434954e-05, + "loss": 0.5172, + "step": 2628 + }, + { + "epoch": 0.5864376533571269, + "grad_norm": 0.16170893609523773, + "learning_rate": 1.8281782581604334e-05, + "loss": 0.506, + "step": 2629 + }, + { + "epoch": 0.5866607182690163, + "grad_norm": 0.15339437127113342, + "learning_rate": 1.82804633380765e-05, + "loss": 0.4706, + "step": 2630 + }, + { + "epoch": 0.5868837831809056, + "grad_norm": 0.17161330580711365, + "learning_rate": 1.827914363592452e-05, + "loss": 0.5217, + "step": 2631 + }, + { + "epoch": 0.587106848092795, + "grad_norm": 0.16379369795322418, + "learning_rate": 1.8277823475221485e-05, + "loss": 0.4954, + "step": 2632 + }, + { + "epoch": 0.5873299130046844, + "grad_norm": 0.16465184092521667, + "learning_rate": 1.827650285604052e-05, + "loss": 0.4995, + "step": 2633 + }, + { + "epoch": 0.5875529779165737, + "grad_norm": 0.1681264042854309, + "learning_rate": 1.8275181778454767e-05, + "loss": 0.5391, + "step": 2634 + }, + { + "epoch": 0.5877760428284631, + "grad_norm": 0.17729119956493378, + "learning_rate": 1.827386024253739e-05, + "loss": 0.5313, + "step": 2635 + }, + { + "epoch": 0.5879991077403525, + "grad_norm": 0.1703905314207077, + "learning_rate": 1.8272538248361592e-05, + "loss": 0.5407, + "step": 2636 + }, + { + "epoch": 0.5882221726522417, + "grad_norm": 0.1546456664800644, + "learning_rate": 1.8271215796000588e-05, + "loss": 0.4965, + "step": 2637 + }, + { + "epoch": 0.5884452375641311, + "grad_norm": 0.15894514322280884, + "learning_rate": 1.8269892885527624e-05, + "loss": 0.498, + "step": 2638 + }, + { + "epoch": 0.5886683024760205, + "grad_norm": 0.15367206931114197, + "learning_rate": 1.826856951701597e-05, + "loss": 0.482, + "step": 2639 + }, + { + "epoch": 0.5888913673879099, + "grad_norm": 0.16719990968704224, + "learning_rate": 1.826724569053893e-05, + "loss": 0.5287, + "step": 2640 + }, + { + "epoch": 0.5891144322997992, + "grad_norm": 0.1742510050535202, + "learning_rate": 1.8265921406169816e-05, + "loss": 0.4773, + "step": 2641 + }, + { + "epoch": 0.5893374972116886, + "grad_norm": 0.16305537521839142, + "learning_rate": 1.8264596663981985e-05, + "loss": 0.47, + "step": 2642 + }, + { + "epoch": 0.589560562123578, + "grad_norm": 0.1746242344379425, + "learning_rate": 1.82632714640488e-05, + "loss": 0.4978, + "step": 2643 + }, + { + "epoch": 0.5897836270354673, + "grad_norm": 0.16970300674438477, + "learning_rate": 1.8261945806443666e-05, + "loss": 0.5225, + "step": 2644 + }, + { + "epoch": 0.5900066919473567, + "grad_norm": 0.16862879693508148, + "learning_rate": 1.826061969124e-05, + "loss": 0.5299, + "step": 2645 + }, + { + "epoch": 0.5902297568592461, + "grad_norm": 0.1616392582654953, + "learning_rate": 1.825929311851126e-05, + "loss": 0.5237, + "step": 2646 + }, + { + "epoch": 0.5904528217711354, + "grad_norm": 0.15673169493675232, + "learning_rate": 1.8257966088330907e-05, + "loss": 0.5005, + "step": 2647 + }, + { + "epoch": 0.5906758866830247, + "grad_norm": 0.16055789589881897, + "learning_rate": 1.825663860077245e-05, + "loss": 0.5179, + "step": 2648 + }, + { + "epoch": 0.5908989515949141, + "grad_norm": 0.1535293161869049, + "learning_rate": 1.8255310655909414e-05, + "loss": 0.4926, + "step": 2649 + }, + { + "epoch": 0.5911220165068035, + "grad_norm": 0.16450564563274384, + "learning_rate": 1.8253982253815343e-05, + "loss": 0.5016, + "step": 2650 + }, + { + "epoch": 0.5913450814186928, + "grad_norm": 0.1581798493862152, + "learning_rate": 1.8252653394563814e-05, + "loss": 0.5031, + "step": 2651 + }, + { + "epoch": 0.5915681463305822, + "grad_norm": 0.16955383121967316, + "learning_rate": 1.825132407822843e-05, + "loss": 0.5408, + "step": 2652 + }, + { + "epoch": 0.5917912112424716, + "grad_norm": 0.16490530967712402, + "learning_rate": 1.8249994304882818e-05, + "loss": 0.5352, + "step": 2653 + }, + { + "epoch": 0.592014276154361, + "grad_norm": 0.1716136932373047, + "learning_rate": 1.8248664074600626e-05, + "loss": 0.4936, + "step": 2654 + }, + { + "epoch": 0.5922373410662503, + "grad_norm": 0.16612227261066437, + "learning_rate": 1.8247333387455534e-05, + "loss": 0.4886, + "step": 2655 + }, + { + "epoch": 0.5924604059781396, + "grad_norm": 0.16571937501430511, + "learning_rate": 1.8246002243521234e-05, + "loss": 0.5171, + "step": 2656 + }, + { + "epoch": 0.592683470890029, + "grad_norm": 0.15452441573143005, + "learning_rate": 1.8244670642871464e-05, + "loss": 0.4825, + "step": 2657 + }, + { + "epoch": 0.5929065358019183, + "grad_norm": 0.16785788536071777, + "learning_rate": 1.8243338585579974e-05, + "loss": 0.4993, + "step": 2658 + }, + { + "epoch": 0.5931296007138077, + "grad_norm": 0.16040126979351044, + "learning_rate": 1.824200607172054e-05, + "loss": 0.498, + "step": 2659 + }, + { + "epoch": 0.5933526656256971, + "grad_norm": 0.16780081391334534, + "learning_rate": 1.8240673101366963e-05, + "loss": 0.509, + "step": 2660 + }, + { + "epoch": 0.5935757305375864, + "grad_norm": 0.16159792244434357, + "learning_rate": 1.823933967459308e-05, + "loss": 0.5054, + "step": 2661 + }, + { + "epoch": 0.5937987954494758, + "grad_norm": 0.1585383266210556, + "learning_rate": 1.823800579147273e-05, + "loss": 0.4861, + "step": 2662 + }, + { + "epoch": 0.5940218603613652, + "grad_norm": 0.1630701869726181, + "learning_rate": 1.8236671452079805e-05, + "loss": 0.5226, + "step": 2663 + }, + { + "epoch": 0.5942449252732546, + "grad_norm": 0.16565611958503723, + "learning_rate": 1.8235336656488203e-05, + "loss": 0.4905, + "step": 2664 + }, + { + "epoch": 0.5944679901851438, + "grad_norm": 0.15455248951911926, + "learning_rate": 1.8234001404771856e-05, + "loss": 0.4844, + "step": 2665 + }, + { + "epoch": 0.5946910550970332, + "grad_norm": 0.1679016500711441, + "learning_rate": 1.8232665697004713e-05, + "loss": 0.5312, + "step": 2666 + }, + { + "epoch": 0.5949141200089226, + "grad_norm": 0.16260258853435516, + "learning_rate": 1.823132953326076e-05, + "loss": 0.5106, + "step": 2667 + }, + { + "epoch": 0.5951371849208119, + "grad_norm": 0.15242820978164673, + "learning_rate": 1.8229992913614004e-05, + "loss": 0.4641, + "step": 2668 + }, + { + "epoch": 0.5953602498327013, + "grad_norm": 0.17028586566448212, + "learning_rate": 1.822865583813847e-05, + "loss": 0.5183, + "step": 2669 + }, + { + "epoch": 0.5955833147445907, + "grad_norm": 0.14964601397514343, + "learning_rate": 1.8227318306908216e-05, + "loss": 0.4843, + "step": 2670 + }, + { + "epoch": 0.5958063796564801, + "grad_norm": 0.1675204187631607, + "learning_rate": 1.822598031999732e-05, + "loss": 0.5098, + "step": 2671 + }, + { + "epoch": 0.5960294445683694, + "grad_norm": 0.15445132553577423, + "learning_rate": 1.822464187747989e-05, + "loss": 0.4759, + "step": 2672 + }, + { + "epoch": 0.5962525094802588, + "grad_norm": 0.1613750010728836, + "learning_rate": 1.822330297943006e-05, + "loss": 0.5154, + "step": 2673 + }, + { + "epoch": 0.5964755743921482, + "grad_norm": 0.16264407336711884, + "learning_rate": 1.8221963625921984e-05, + "loss": 0.4758, + "step": 2674 + }, + { + "epoch": 0.5966986393040374, + "grad_norm": 0.16100256145000458, + "learning_rate": 1.8220623817029843e-05, + "loss": 0.4946, + "step": 2675 + }, + { + "epoch": 0.5969217042159268, + "grad_norm": 0.1642381250858307, + "learning_rate": 1.8219283552827847e-05, + "loss": 0.5029, + "step": 2676 + }, + { + "epoch": 0.5971447691278162, + "grad_norm": 0.16491542756557465, + "learning_rate": 1.8217942833390227e-05, + "loss": 0.5077, + "step": 2677 + }, + { + "epoch": 0.5973678340397055, + "grad_norm": 0.1611475944519043, + "learning_rate": 1.821660165879124e-05, + "loss": 0.5346, + "step": 2678 + }, + { + "epoch": 0.5975908989515949, + "grad_norm": 0.16811859607696533, + "learning_rate": 1.8215260029105166e-05, + "loss": 0.5173, + "step": 2679 + }, + { + "epoch": 0.5978139638634843, + "grad_norm": 0.17261561751365662, + "learning_rate": 1.8213917944406315e-05, + "loss": 0.5142, + "step": 2680 + }, + { + "epoch": 0.5980370287753737, + "grad_norm": 0.1671370267868042, + "learning_rate": 1.8212575404769023e-05, + "loss": 0.5289, + "step": 2681 + }, + { + "epoch": 0.598260093687263, + "grad_norm": 0.16574646532535553, + "learning_rate": 1.8211232410267645e-05, + "loss": 0.5179, + "step": 2682 + }, + { + "epoch": 0.5984831585991524, + "grad_norm": 0.1583874672651291, + "learning_rate": 1.8209888960976565e-05, + "loss": 0.503, + "step": 2683 + }, + { + "epoch": 0.5987062235110417, + "grad_norm": 0.15771937370300293, + "learning_rate": 1.8208545056970193e-05, + "loss": 0.4811, + "step": 2684 + }, + { + "epoch": 0.598929288422931, + "grad_norm": 0.16496342420578003, + "learning_rate": 1.820720069832296e-05, + "loss": 0.5178, + "step": 2685 + }, + { + "epoch": 0.5991523533348204, + "grad_norm": 0.1662297397851944, + "learning_rate": 1.820585588510933e-05, + "loss": 0.5286, + "step": 2686 + }, + { + "epoch": 0.5993754182467098, + "grad_norm": 0.15326498448848724, + "learning_rate": 1.8204510617403785e-05, + "loss": 0.4983, + "step": 2687 + }, + { + "epoch": 0.5995984831585992, + "grad_norm": 0.1545604169368744, + "learning_rate": 1.820316489528083e-05, + "loss": 0.4746, + "step": 2688 + }, + { + "epoch": 0.5998215480704885, + "grad_norm": 0.16388210654258728, + "learning_rate": 1.8201818718815004e-05, + "loss": 0.4766, + "step": 2689 + }, + { + "epoch": 0.6000446129823779, + "grad_norm": 0.16617290675640106, + "learning_rate": 1.820047208808087e-05, + "loss": 0.5301, + "step": 2690 + }, + { + "epoch": 0.6002676778942673, + "grad_norm": 0.16141341626644135, + "learning_rate": 1.8199125003153e-05, + "loss": 0.5095, + "step": 2691 + }, + { + "epoch": 0.6004907428061566, + "grad_norm": 0.1575475037097931, + "learning_rate": 1.8197777464106022e-05, + "loss": 0.5164, + "step": 2692 + }, + { + "epoch": 0.600713807718046, + "grad_norm": 0.1886652559041977, + "learning_rate": 1.8196429471014558e-05, + "loss": 0.4922, + "step": 2693 + }, + { + "epoch": 0.6009368726299353, + "grad_norm": 0.16036200523376465, + "learning_rate": 1.8195081023953268e-05, + "loss": 0.5116, + "step": 2694 + }, + { + "epoch": 0.6011599375418246, + "grad_norm": 0.16269893944263458, + "learning_rate": 1.8193732122996847e-05, + "loss": 0.5135, + "step": 2695 + }, + { + "epoch": 0.601383002453714, + "grad_norm": 0.1868923157453537, + "learning_rate": 1.819238276822e-05, + "loss": 0.5052, + "step": 2696 + }, + { + "epoch": 0.6016060673656034, + "grad_norm": 0.16269947588443756, + "learning_rate": 1.8191032959697464e-05, + "loss": 0.4829, + "step": 2697 + }, + { + "epoch": 0.6018291322774928, + "grad_norm": 0.1679493635892868, + "learning_rate": 1.8189682697504e-05, + "loss": 0.4967, + "step": 2698 + }, + { + "epoch": 0.6020521971893821, + "grad_norm": 0.16471365094184875, + "learning_rate": 1.8188331981714386e-05, + "loss": 0.5189, + "step": 2699 + }, + { + "epoch": 0.6022752621012715, + "grad_norm": 0.16562478244304657, + "learning_rate": 1.8186980812403448e-05, + "loss": 0.5217, + "step": 2700 + }, + { + "epoch": 0.6024983270131609, + "grad_norm": 0.16145823895931244, + "learning_rate": 1.818562918964601e-05, + "loss": 0.5008, + "step": 2701 + }, + { + "epoch": 0.6027213919250501, + "grad_norm": 0.17072793841362, + "learning_rate": 1.8184277113516938e-05, + "loss": 0.5302, + "step": 2702 + }, + { + "epoch": 0.6029444568369395, + "grad_norm": 0.16824592649936676, + "learning_rate": 1.8182924584091122e-05, + "loss": 0.5358, + "step": 2703 + }, + { + "epoch": 0.6031675217488289, + "grad_norm": 0.1661488264799118, + "learning_rate": 1.8181571601443465e-05, + "loss": 0.5391, + "step": 2704 + }, + { + "epoch": 0.6033905866607183, + "grad_norm": 0.15754079818725586, + "learning_rate": 1.8180218165648913e-05, + "loss": 0.5013, + "step": 2705 + }, + { + "epoch": 0.6036136515726076, + "grad_norm": 0.1813340038061142, + "learning_rate": 1.817886427678242e-05, + "loss": 0.5301, + "step": 2706 + }, + { + "epoch": 0.603836716484497, + "grad_norm": 0.16588057577610016, + "learning_rate": 1.817750993491898e-05, + "loss": 0.4924, + "step": 2707 + }, + { + "epoch": 0.6040597813963864, + "grad_norm": 0.17572402954101562, + "learning_rate": 1.8176155140133596e-05, + "loss": 0.5075, + "step": 2708 + }, + { + "epoch": 0.6042828463082757, + "grad_norm": 0.157828688621521, + "learning_rate": 1.8174799892501315e-05, + "loss": 0.5133, + "step": 2709 + }, + { + "epoch": 0.6045059112201651, + "grad_norm": 0.1617002636194229, + "learning_rate": 1.817344419209719e-05, + "loss": 0.5092, + "step": 2710 + }, + { + "epoch": 0.6047289761320545, + "grad_norm": 0.1876181662082672, + "learning_rate": 1.817208803899632e-05, + "loss": 0.5326, + "step": 2711 + }, + { + "epoch": 0.6049520410439437, + "grad_norm": 0.15636108815670013, + "learning_rate": 1.8170731433273802e-05, + "loss": 0.4922, + "step": 2712 + }, + { + "epoch": 0.6051751059558331, + "grad_norm": 0.15785710513591766, + "learning_rate": 1.8169374375004784e-05, + "loss": 0.4956, + "step": 2713 + }, + { + "epoch": 0.6053981708677225, + "grad_norm": 0.17369434237480164, + "learning_rate": 1.8168016864264426e-05, + "loss": 0.493, + "step": 2714 + }, + { + "epoch": 0.6056212357796119, + "grad_norm": 0.16383057832717896, + "learning_rate": 1.8166658901127915e-05, + "loss": 0.4638, + "step": 2715 + }, + { + "epoch": 0.6058443006915012, + "grad_norm": 0.17268022894859314, + "learning_rate": 1.8165300485670464e-05, + "loss": 0.5056, + "step": 2716 + }, + { + "epoch": 0.6060673656033906, + "grad_norm": 0.1691250205039978, + "learning_rate": 1.8163941617967313e-05, + "loss": 0.5161, + "step": 2717 + }, + { + "epoch": 0.60629043051528, + "grad_norm": 0.1628870815038681, + "learning_rate": 1.8162582298093715e-05, + "loss": 0.5015, + "step": 2718 + }, + { + "epoch": 0.6065134954271693, + "grad_norm": 0.17182950675487518, + "learning_rate": 1.816122252612497e-05, + "loss": 0.5237, + "step": 2719 + }, + { + "epoch": 0.6067365603390587, + "grad_norm": 0.17092610895633698, + "learning_rate": 1.8159862302136386e-05, + "loss": 0.4941, + "step": 2720 + }, + { + "epoch": 0.606959625250948, + "grad_norm": 0.16498810052871704, + "learning_rate": 1.8158501626203298e-05, + "loss": 0.5197, + "step": 2721 + }, + { + "epoch": 0.6071826901628374, + "grad_norm": 0.16107720136642456, + "learning_rate": 1.815714049840107e-05, + "loss": 0.5186, + "step": 2722 + }, + { + "epoch": 0.6074057550747267, + "grad_norm": 0.1599961370229721, + "learning_rate": 1.8155778918805095e-05, + "loss": 0.5035, + "step": 2723 + }, + { + "epoch": 0.6076288199866161, + "grad_norm": 0.16911983489990234, + "learning_rate": 1.815441688749078e-05, + "loss": 0.5009, + "step": 2724 + }, + { + "epoch": 0.6078518848985055, + "grad_norm": 0.16482600569725037, + "learning_rate": 1.8153054404533562e-05, + "loss": 0.4991, + "step": 2725 + }, + { + "epoch": 0.6080749498103948, + "grad_norm": 0.16007374227046967, + "learning_rate": 1.8151691470008906e-05, + "loss": 0.4837, + "step": 2726 + }, + { + "epoch": 0.6082980147222842, + "grad_norm": 0.16472531855106354, + "learning_rate": 1.81503280839923e-05, + "loss": 0.4997, + "step": 2727 + }, + { + "epoch": 0.6085210796341736, + "grad_norm": 0.1605815589427948, + "learning_rate": 1.814896424655926e-05, + "loss": 0.5011, + "step": 2728 + }, + { + "epoch": 0.608744144546063, + "grad_norm": 0.16523852944374084, + "learning_rate": 1.814759995778532e-05, + "loss": 0.4947, + "step": 2729 + }, + { + "epoch": 0.6089672094579522, + "grad_norm": 0.1722632497549057, + "learning_rate": 1.8146235217746043e-05, + "loss": 0.495, + "step": 2730 + }, + { + "epoch": 0.6091902743698416, + "grad_norm": 0.16873040795326233, + "learning_rate": 1.8144870026517018e-05, + "loss": 0.5273, + "step": 2731 + }, + { + "epoch": 0.609413339281731, + "grad_norm": 0.16078028082847595, + "learning_rate": 1.8143504384173858e-05, + "loss": 0.5051, + "step": 2732 + }, + { + "epoch": 0.6096364041936203, + "grad_norm": 0.18267033994197845, + "learning_rate": 1.8142138290792202e-05, + "loss": 0.4946, + "step": 2733 + }, + { + "epoch": 0.6098594691055097, + "grad_norm": 0.17000767588615417, + "learning_rate": 1.814077174644771e-05, + "loss": 0.5079, + "step": 2734 + }, + { + "epoch": 0.6100825340173991, + "grad_norm": 0.1608540564775467, + "learning_rate": 1.813940475121607e-05, + "loss": 0.4912, + "step": 2735 + }, + { + "epoch": 0.6103055989292884, + "grad_norm": 0.160196915268898, + "learning_rate": 1.8138037305172997e-05, + "loss": 0.5315, + "step": 2736 + }, + { + "epoch": 0.6105286638411778, + "grad_norm": 0.16966816782951355, + "learning_rate": 1.813666940839423e-05, + "loss": 0.532, + "step": 2737 + }, + { + "epoch": 0.6107517287530672, + "grad_norm": 0.19911563396453857, + "learning_rate": 1.8135301060955525e-05, + "loss": 0.4847, + "step": 2738 + }, + { + "epoch": 0.6109747936649566, + "grad_norm": 0.16148284077644348, + "learning_rate": 1.8133932262932678e-05, + "loss": 0.4756, + "step": 2739 + }, + { + "epoch": 0.6111978585768458, + "grad_norm": 0.16302482783794403, + "learning_rate": 1.8132563014401497e-05, + "loss": 0.5021, + "step": 2740 + }, + { + "epoch": 0.6114209234887352, + "grad_norm": 0.1716173142194748, + "learning_rate": 1.813119331543782e-05, + "loss": 0.5303, + "step": 2741 + }, + { + "epoch": 0.6116439884006246, + "grad_norm": 0.16563881933689117, + "learning_rate": 1.812982316611751e-05, + "loss": 0.4991, + "step": 2742 + }, + { + "epoch": 0.6118670533125139, + "grad_norm": 0.1720925122499466, + "learning_rate": 1.812845256651645e-05, + "loss": 0.5026, + "step": 2743 + }, + { + "epoch": 0.6120901182244033, + "grad_norm": 0.16051127016544342, + "learning_rate": 1.8127081516710565e-05, + "loss": 0.4837, + "step": 2744 + }, + { + "epoch": 0.6123131831362927, + "grad_norm": 0.16201861202716827, + "learning_rate": 1.8125710016775778e-05, + "loss": 0.5161, + "step": 2745 + }, + { + "epoch": 0.6125362480481821, + "grad_norm": 0.17014898359775543, + "learning_rate": 1.812433806678806e-05, + "loss": 0.5346, + "step": 2746 + }, + { + "epoch": 0.6127593129600714, + "grad_norm": 0.1703520119190216, + "learning_rate": 1.8122965666823398e-05, + "loss": 0.5182, + "step": 2747 + }, + { + "epoch": 0.6129823778719607, + "grad_norm": 0.19158673286437988, + "learning_rate": 1.8121592816957797e-05, + "loss": 0.5043, + "step": 2748 + }, + { + "epoch": 0.6132054427838501, + "grad_norm": 0.14746567606925964, + "learning_rate": 1.8120219517267302e-05, + "loss": 0.4841, + "step": 2749 + }, + { + "epoch": 0.6134285076957394, + "grad_norm": 0.16160626709461212, + "learning_rate": 1.811884576782797e-05, + "loss": 0.5158, + "step": 2750 + }, + { + "epoch": 0.6136515726076288, + "grad_norm": 0.17975735664367676, + "learning_rate": 1.8117471568715893e-05, + "loss": 0.5235, + "step": 2751 + }, + { + "epoch": 0.6138746375195182, + "grad_norm": 0.17119070887565613, + "learning_rate": 1.8116096920007177e-05, + "loss": 0.4951, + "step": 2752 + }, + { + "epoch": 0.6140977024314075, + "grad_norm": 0.16311194002628326, + "learning_rate": 1.8114721821777964e-05, + "loss": 0.499, + "step": 2753 + }, + { + "epoch": 0.6143207673432969, + "grad_norm": 0.16315460205078125, + "learning_rate": 1.811334627410441e-05, + "loss": 0.51, + "step": 2754 + }, + { + "epoch": 0.6145438322551863, + "grad_norm": 0.44980496168136597, + "learning_rate": 1.81119702770627e-05, + "loss": 0.5005, + "step": 2755 + }, + { + "epoch": 0.6147668971670757, + "grad_norm": 0.17961294949054718, + "learning_rate": 1.8110593830729057e-05, + "loss": 0.5204, + "step": 2756 + }, + { + "epoch": 0.614989962078965, + "grad_norm": 0.16227351129055023, + "learning_rate": 1.8109216935179712e-05, + "loss": 0.5081, + "step": 2757 + }, + { + "epoch": 0.6152130269908543, + "grad_norm": 0.1761593520641327, + "learning_rate": 1.810783959049092e-05, + "loss": 0.5163, + "step": 2758 + }, + { + "epoch": 0.6154360919027437, + "grad_norm": 0.17095595598220825, + "learning_rate": 1.810646179673897e-05, + "loss": 0.4948, + "step": 2759 + }, + { + "epoch": 0.615659156814633, + "grad_norm": 0.16665005683898926, + "learning_rate": 1.8105083554000175e-05, + "loss": 0.4937, + "step": 2760 + }, + { + "epoch": 0.6158822217265224, + "grad_norm": 0.16761796176433563, + "learning_rate": 1.810370486235087e-05, + "loss": 0.5081, + "step": 2761 + }, + { + "epoch": 0.6161052866384118, + "grad_norm": 0.17292195558547974, + "learning_rate": 1.8102325721867417e-05, + "loss": 0.5027, + "step": 2762 + }, + { + "epoch": 0.6163283515503012, + "grad_norm": 0.1606137752532959, + "learning_rate": 1.8100946132626197e-05, + "loss": 0.4867, + "step": 2763 + }, + { + "epoch": 0.6165514164621905, + "grad_norm": 0.1619354635477066, + "learning_rate": 1.8099566094703626e-05, + "loss": 0.474, + "step": 2764 + }, + { + "epoch": 0.6167744813740799, + "grad_norm": 0.16491201519966125, + "learning_rate": 1.8098185608176132e-05, + "loss": 0.4953, + "step": 2765 + }, + { + "epoch": 0.6169975462859693, + "grad_norm": 0.16283756494522095, + "learning_rate": 1.8096804673120183e-05, + "loss": 0.4934, + "step": 2766 + }, + { + "epoch": 0.6172206111978585, + "grad_norm": 0.15283644199371338, + "learning_rate": 1.809542328961226e-05, + "loss": 0.4691, + "step": 2767 + }, + { + "epoch": 0.6174436761097479, + "grad_norm": 0.16907426714897156, + "learning_rate": 1.809404145772887e-05, + "loss": 0.4901, + "step": 2768 + }, + { + "epoch": 0.6176667410216373, + "grad_norm": 0.16500405967235565, + "learning_rate": 1.8092659177546554e-05, + "loss": 0.5042, + "step": 2769 + }, + { + "epoch": 0.6178898059335266, + "grad_norm": 0.16616129875183105, + "learning_rate": 1.8091276449141868e-05, + "loss": 0.528, + "step": 2770 + }, + { + "epoch": 0.618112870845416, + "grad_norm": 0.15396897494792938, + "learning_rate": 1.8089893272591393e-05, + "loss": 0.4797, + "step": 2771 + }, + { + "epoch": 0.6183359357573054, + "grad_norm": 0.1576898992061615, + "learning_rate": 1.8088509647971744e-05, + "loss": 0.5007, + "step": 2772 + }, + { + "epoch": 0.6185590006691948, + "grad_norm": 0.1703861653804779, + "learning_rate": 1.808712557535955e-05, + "loss": 0.5114, + "step": 2773 + }, + { + "epoch": 0.6187820655810841, + "grad_norm": 0.15141689777374268, + "learning_rate": 1.8085741054831472e-05, + "loss": 0.457, + "step": 2774 + }, + { + "epoch": 0.6190051304929735, + "grad_norm": 0.19050109386444092, + "learning_rate": 1.8084356086464197e-05, + "loss": 0.474, + "step": 2775 + }, + { + "epoch": 0.6192281954048628, + "grad_norm": 0.17825506627559662, + "learning_rate": 1.8082970670334425e-05, + "loss": 0.5164, + "step": 2776 + }, + { + "epoch": 0.6194512603167521, + "grad_norm": 0.16141526401042938, + "learning_rate": 1.8081584806518897e-05, + "loss": 0.5025, + "step": 2777 + }, + { + "epoch": 0.6196743252286415, + "grad_norm": 0.23704519867897034, + "learning_rate": 1.8080198495094364e-05, + "loss": 0.5315, + "step": 2778 + }, + { + "epoch": 0.6198973901405309, + "grad_norm": 0.15994718670845032, + "learning_rate": 1.8078811736137612e-05, + "loss": 0.4853, + "step": 2779 + }, + { + "epoch": 0.6201204550524203, + "grad_norm": 0.1649598479270935, + "learning_rate": 1.807742452972545e-05, + "loss": 0.5262, + "step": 2780 + }, + { + "epoch": 0.6203435199643096, + "grad_norm": 0.14894191920757294, + "learning_rate": 1.8076036875934707e-05, + "loss": 0.4816, + "step": 2781 + }, + { + "epoch": 0.620566584876199, + "grad_norm": 0.17157211899757385, + "learning_rate": 1.807464877484224e-05, + "loss": 0.4847, + "step": 2782 + }, + { + "epoch": 0.6207896497880884, + "grad_norm": 0.1663542538881302, + "learning_rate": 1.8073260226524937e-05, + "loss": 0.4931, + "step": 2783 + }, + { + "epoch": 0.6210127146999777, + "grad_norm": 0.1610024869441986, + "learning_rate": 1.8071871231059695e-05, + "loss": 0.4924, + "step": 2784 + }, + { + "epoch": 0.621235779611867, + "grad_norm": 0.16384142637252808, + "learning_rate": 1.807048178852345e-05, + "loss": 0.5143, + "step": 2785 + }, + { + "epoch": 0.6214588445237564, + "grad_norm": 0.16362221539020538, + "learning_rate": 1.8069091898993162e-05, + "loss": 0.5172, + "step": 2786 + }, + { + "epoch": 0.6216819094356457, + "grad_norm": 0.19173026084899902, + "learning_rate": 1.8067701562545808e-05, + "loss": 0.512, + "step": 2787 + }, + { + "epoch": 0.6219049743475351, + "grad_norm": 0.15911833941936493, + "learning_rate": 1.8066310779258393e-05, + "loss": 0.4874, + "step": 2788 + }, + { + "epoch": 0.6221280392594245, + "grad_norm": 0.15391753613948822, + "learning_rate": 1.8064919549207946e-05, + "loss": 0.4966, + "step": 2789 + }, + { + "epoch": 0.6223511041713139, + "grad_norm": 0.21308940649032593, + "learning_rate": 1.8063527872471523e-05, + "loss": 0.5172, + "step": 2790 + }, + { + "epoch": 0.6225741690832032, + "grad_norm": 0.15762938559055328, + "learning_rate": 1.8062135749126208e-05, + "loss": 0.4904, + "step": 2791 + }, + { + "epoch": 0.6227972339950926, + "grad_norm": 0.18538956344127655, + "learning_rate": 1.80607431792491e-05, + "loss": 0.4907, + "step": 2792 + }, + { + "epoch": 0.623020298906982, + "grad_norm": 0.1694374829530716, + "learning_rate": 1.8059350162917333e-05, + "loss": 0.4892, + "step": 2793 + }, + { + "epoch": 0.6232433638188712, + "grad_norm": 0.1632799506187439, + "learning_rate": 1.8057956700208055e-05, + "loss": 0.531, + "step": 2794 + }, + { + "epoch": 0.6234664287307606, + "grad_norm": 0.19030988216400146, + "learning_rate": 1.805656279119845e-05, + "loss": 0.4801, + "step": 2795 + }, + { + "epoch": 0.62368949364265, + "grad_norm": 0.17578838765621185, + "learning_rate": 1.8055168435965722e-05, + "loss": 0.5129, + "step": 2796 + }, + { + "epoch": 0.6239125585545394, + "grad_norm": 0.17337745428085327, + "learning_rate": 1.8053773634587095e-05, + "loss": 0.5252, + "step": 2797 + }, + { + "epoch": 0.6241356234664287, + "grad_norm": 0.16400404274463654, + "learning_rate": 1.8052378387139827e-05, + "loss": 0.4977, + "step": 2798 + }, + { + "epoch": 0.6243586883783181, + "grad_norm": 0.16884362697601318, + "learning_rate": 1.8050982693701188e-05, + "loss": 0.5209, + "step": 2799 + }, + { + "epoch": 0.6245817532902075, + "grad_norm": 0.16655333340168, + "learning_rate": 1.8049586554348487e-05, + "loss": 0.5236, + "step": 2800 + }, + { + "epoch": 0.6248048182020968, + "grad_norm": 0.1815658062696457, + "learning_rate": 1.804818996915905e-05, + "loss": 0.5155, + "step": 2801 + }, + { + "epoch": 0.6250278831139862, + "grad_norm": 0.1637655794620514, + "learning_rate": 1.8046792938210226e-05, + "loss": 0.5108, + "step": 2802 + }, + { + "epoch": 0.6252509480258756, + "grad_norm": 0.16214872896671295, + "learning_rate": 1.804539546157939e-05, + "loss": 0.5015, + "step": 2803 + }, + { + "epoch": 0.625474012937765, + "grad_norm": 0.1624768078327179, + "learning_rate": 1.804399753934395e-05, + "loss": 0.4852, + "step": 2804 + }, + { + "epoch": 0.6256970778496542, + "grad_norm": 0.1509588062763214, + "learning_rate": 1.8042599171581322e-05, + "loss": 0.4895, + "step": 2805 + }, + { + "epoch": 0.6259201427615436, + "grad_norm": 0.15983150899410248, + "learning_rate": 1.804120035836897e-05, + "loss": 0.5066, + "step": 2806 + }, + { + "epoch": 0.626143207673433, + "grad_norm": 0.1639653593301773, + "learning_rate": 1.8039801099784356e-05, + "loss": 0.5081, + "step": 2807 + }, + { + "epoch": 0.6263662725853223, + "grad_norm": 0.16054677963256836, + "learning_rate": 1.8038401395904984e-05, + "loss": 0.5019, + "step": 2808 + }, + { + "epoch": 0.6265893374972117, + "grad_norm": 0.16557928919792175, + "learning_rate": 1.8037001246808382e-05, + "loss": 0.5085, + "step": 2809 + }, + { + "epoch": 0.6268124024091011, + "grad_norm": 0.17172253131866455, + "learning_rate": 1.8035600652572093e-05, + "loss": 0.5069, + "step": 2810 + }, + { + "epoch": 0.6270354673209904, + "grad_norm": 0.16289275884628296, + "learning_rate": 1.80341996132737e-05, + "loss": 0.5054, + "step": 2811 + }, + { + "epoch": 0.6272585322328798, + "grad_norm": 0.159224271774292, + "learning_rate": 1.8032798128990788e-05, + "loss": 0.4884, + "step": 2812 + }, + { + "epoch": 0.6274815971447691, + "grad_norm": 0.1516553908586502, + "learning_rate": 1.803139619980099e-05, + "loss": 0.4823, + "step": 2813 + }, + { + "epoch": 0.6277046620566585, + "grad_norm": 0.164667546749115, + "learning_rate": 1.802999382578195e-05, + "loss": 0.5184, + "step": 2814 + }, + { + "epoch": 0.6279277269685478, + "grad_norm": 0.17175255715847015, + "learning_rate": 1.8028591007011343e-05, + "loss": 0.5166, + "step": 2815 + }, + { + "epoch": 0.6281507918804372, + "grad_norm": 0.18747776746749878, + "learning_rate": 1.8027187743566867e-05, + "loss": 0.499, + "step": 2816 + }, + { + "epoch": 0.6283738567923266, + "grad_norm": 0.16748382151126862, + "learning_rate": 1.8025784035526235e-05, + "loss": 0.5099, + "step": 2817 + }, + { + "epoch": 0.6285969217042159, + "grad_norm": 0.19813144207000732, + "learning_rate": 1.80243798829672e-05, + "loss": 0.5045, + "step": 2818 + }, + { + "epoch": 0.6288199866161053, + "grad_norm": 0.16501356661319733, + "learning_rate": 1.8022975285967534e-05, + "loss": 0.514, + "step": 2819 + }, + { + "epoch": 0.6290430515279947, + "grad_norm": 0.17181000113487244, + "learning_rate": 1.8021570244605028e-05, + "loss": 0.4963, + "step": 2820 + }, + { + "epoch": 0.6292661164398841, + "grad_norm": 0.16481012105941772, + "learning_rate": 1.8020164758957505e-05, + "loss": 0.4831, + "step": 2821 + }, + { + "epoch": 0.6294891813517733, + "grad_norm": 0.1659911572933197, + "learning_rate": 1.8018758829102808e-05, + "loss": 0.5026, + "step": 2822 + }, + { + "epoch": 0.6297122462636627, + "grad_norm": 0.18394367396831512, + "learning_rate": 1.8017352455118812e-05, + "loss": 0.5072, + "step": 2823 + }, + { + "epoch": 0.6299353111755521, + "grad_norm": 0.1666680872440338, + "learning_rate": 1.80159456370834e-05, + "loss": 0.4986, + "step": 2824 + }, + { + "epoch": 0.6301583760874414, + "grad_norm": 0.184769406914711, + "learning_rate": 1.80145383750745e-05, + "loss": 0.4863, + "step": 2825 + }, + { + "epoch": 0.6303814409993308, + "grad_norm": 0.15709228813648224, + "learning_rate": 1.801313066917005e-05, + "loss": 0.4945, + "step": 2826 + }, + { + "epoch": 0.6306045059112202, + "grad_norm": 0.16040046513080597, + "learning_rate": 1.801172251944802e-05, + "loss": 0.4704, + "step": 2827 + }, + { + "epoch": 0.6308275708231095, + "grad_norm": 0.174714133143425, + "learning_rate": 1.8010313925986398e-05, + "loss": 0.5224, + "step": 2828 + }, + { + "epoch": 0.6310506357349989, + "grad_norm": 0.17879654467105865, + "learning_rate": 1.8008904888863206e-05, + "loss": 0.5168, + "step": 2829 + }, + { + "epoch": 0.6312737006468883, + "grad_norm": 0.16159847378730774, + "learning_rate": 1.8007495408156483e-05, + "loss": 0.4905, + "step": 2830 + }, + { + "epoch": 0.6314967655587777, + "grad_norm": 0.16279365122318268, + "learning_rate": 1.8006085483944295e-05, + "loss": 0.5004, + "step": 2831 + }, + { + "epoch": 0.6317198304706669, + "grad_norm": 0.16124996542930603, + "learning_rate": 1.800467511630473e-05, + "loss": 0.498, + "step": 2832 + }, + { + "epoch": 0.6319428953825563, + "grad_norm": 0.16358453035354614, + "learning_rate": 1.800326430531591e-05, + "loss": 0.5172, + "step": 2833 + }, + { + "epoch": 0.6321659602944457, + "grad_norm": 0.16300733387470245, + "learning_rate": 1.8001853051055967e-05, + "loss": 0.5024, + "step": 2834 + }, + { + "epoch": 0.632389025206335, + "grad_norm": 0.1758897304534912, + "learning_rate": 1.8000441353603072e-05, + "loss": 0.4946, + "step": 2835 + }, + { + "epoch": 0.6326120901182244, + "grad_norm": 0.1550527960062027, + "learning_rate": 1.7999029213035408e-05, + "loss": 0.5027, + "step": 2836 + }, + { + "epoch": 0.6328351550301138, + "grad_norm": 0.15276572108268738, + "learning_rate": 1.799761662943119e-05, + "loss": 0.4968, + "step": 2837 + }, + { + "epoch": 0.6330582199420032, + "grad_norm": 0.16600914299488068, + "learning_rate": 1.7996203602868657e-05, + "loss": 0.5278, + "step": 2838 + }, + { + "epoch": 0.6332812848538925, + "grad_norm": 0.16788353025913239, + "learning_rate": 1.799479013342607e-05, + "loss": 0.5071, + "step": 2839 + }, + { + "epoch": 0.6335043497657818, + "grad_norm": 0.16952337324619293, + "learning_rate": 1.7993376221181716e-05, + "loss": 0.4852, + "step": 2840 + }, + { + "epoch": 0.6337274146776712, + "grad_norm": 0.28554677963256836, + "learning_rate": 1.7991961866213907e-05, + "loss": 0.474, + "step": 2841 + }, + { + "epoch": 0.6339504795895605, + "grad_norm": 0.17595677077770233, + "learning_rate": 1.7990547068600977e-05, + "loss": 0.5045, + "step": 2842 + }, + { + "epoch": 0.6341735445014499, + "grad_norm": 0.1631106734275818, + "learning_rate": 1.798913182842129e-05, + "loss": 0.4943, + "step": 2843 + }, + { + "epoch": 0.6343966094133393, + "grad_norm": 0.1630406528711319, + "learning_rate": 1.7987716145753226e-05, + "loss": 0.5095, + "step": 2844 + }, + { + "epoch": 0.6346196743252286, + "grad_norm": 0.17511357367038727, + "learning_rate": 1.7986300020675198e-05, + "loss": 0.5203, + "step": 2845 + }, + { + "epoch": 0.634842739237118, + "grad_norm": 0.1971133053302765, + "learning_rate": 1.798488345326564e-05, + "loss": 0.5061, + "step": 2846 + }, + { + "epoch": 0.6350658041490074, + "grad_norm": 0.15673868358135223, + "learning_rate": 1.7983466443603008e-05, + "loss": 0.4957, + "step": 2847 + }, + { + "epoch": 0.6352888690608968, + "grad_norm": 0.1608821600675583, + "learning_rate": 1.798204899176579e-05, + "loss": 0.4848, + "step": 2848 + }, + { + "epoch": 0.635511933972786, + "grad_norm": 0.15686407685279846, + "learning_rate": 1.7980631097832485e-05, + "loss": 0.4986, + "step": 2849 + }, + { + "epoch": 0.6357349988846754, + "grad_norm": 0.16254571080207825, + "learning_rate": 1.797921276188163e-05, + "loss": 0.5255, + "step": 2850 + }, + { + "epoch": 0.6359580637965648, + "grad_norm": 0.17136751115322113, + "learning_rate": 1.7977793983991785e-05, + "loss": 0.5029, + "step": 2851 + }, + { + "epoch": 0.6361811287084541, + "grad_norm": 0.2001720815896988, + "learning_rate": 1.7976374764241523e-05, + "loss": 0.5208, + "step": 2852 + }, + { + "epoch": 0.6364041936203435, + "grad_norm": 0.15888839960098267, + "learning_rate": 1.7974955102709457e-05, + "loss": 0.5125, + "step": 2853 + }, + { + "epoch": 0.6366272585322329, + "grad_norm": 0.1562851518392563, + "learning_rate": 1.797353499947421e-05, + "loss": 0.4647, + "step": 2854 + }, + { + "epoch": 0.6368503234441223, + "grad_norm": 0.157870814204216, + "learning_rate": 1.7972114454614436e-05, + "loss": 0.5072, + "step": 2855 + }, + { + "epoch": 0.6370733883560116, + "grad_norm": 0.16168496012687683, + "learning_rate": 1.7970693468208823e-05, + "loss": 0.4946, + "step": 2856 + }, + { + "epoch": 0.637296453267901, + "grad_norm": 0.23463092744350433, + "learning_rate": 1.796927204033607e-05, + "loss": 0.5073, + "step": 2857 + }, + { + "epoch": 0.6375195181797904, + "grad_norm": 0.18455378711223602, + "learning_rate": 1.7967850171074896e-05, + "loss": 0.5278, + "step": 2858 + }, + { + "epoch": 0.6377425830916796, + "grad_norm": 0.16380822658538818, + "learning_rate": 1.796642786050406e-05, + "loss": 0.5048, + "step": 2859 + }, + { + "epoch": 0.637965648003569, + "grad_norm": 0.15988902747631073, + "learning_rate": 1.7965005108702342e-05, + "loss": 0.4787, + "step": 2860 + }, + { + "epoch": 0.6381887129154584, + "grad_norm": 0.16252587735652924, + "learning_rate": 1.796358191574854e-05, + "loss": 0.5075, + "step": 2861 + }, + { + "epoch": 0.6384117778273477, + "grad_norm": 0.1922154724597931, + "learning_rate": 1.7962158281721475e-05, + "loss": 0.4616, + "step": 2862 + }, + { + "epoch": 0.6386348427392371, + "grad_norm": 0.17007192969322205, + "learning_rate": 1.7960734206700002e-05, + "loss": 0.4741, + "step": 2863 + }, + { + "epoch": 0.6388579076511265, + "grad_norm": 0.16017115116119385, + "learning_rate": 1.7959309690762992e-05, + "loss": 0.4757, + "step": 2864 + }, + { + "epoch": 0.6390809725630159, + "grad_norm": 0.16466043889522552, + "learning_rate": 1.795788473398935e-05, + "loss": 0.4856, + "step": 2865 + }, + { + "epoch": 0.6393040374749052, + "grad_norm": 0.16355857253074646, + "learning_rate": 1.795645933645799e-05, + "loss": 0.5055, + "step": 2866 + }, + { + "epoch": 0.6395271023867946, + "grad_norm": 0.1593533307313919, + "learning_rate": 1.7955033498247863e-05, + "loss": 0.5103, + "step": 2867 + }, + { + "epoch": 0.639750167298684, + "grad_norm": 0.16536985337734222, + "learning_rate": 1.7953607219437942e-05, + "loss": 0.5029, + "step": 2868 + }, + { + "epoch": 0.6399732322105732, + "grad_norm": 0.1590854823589325, + "learning_rate": 1.7952180500107225e-05, + "loss": 0.4687, + "step": 2869 + }, + { + "epoch": 0.6401962971224626, + "grad_norm": 0.16659674048423767, + "learning_rate": 1.7950753340334734e-05, + "loss": 0.4999, + "step": 2870 + }, + { + "epoch": 0.640419362034352, + "grad_norm": 0.17274489998817444, + "learning_rate": 1.7949325740199507e-05, + "loss": 0.4995, + "step": 2871 + }, + { + "epoch": 0.6406424269462414, + "grad_norm": 0.16115543246269226, + "learning_rate": 1.7947897699780616e-05, + "loss": 0.484, + "step": 2872 + }, + { + "epoch": 0.6408654918581307, + "grad_norm": 0.16735753417015076, + "learning_rate": 1.7946469219157158e-05, + "loss": 0.4917, + "step": 2873 + }, + { + "epoch": 0.6410885567700201, + "grad_norm": 0.1667783111333847, + "learning_rate": 1.7945040298408248e-05, + "loss": 0.5106, + "step": 2874 + }, + { + "epoch": 0.6413116216819095, + "grad_norm": 0.16274797916412354, + "learning_rate": 1.794361093761303e-05, + "loss": 0.5006, + "step": 2875 + }, + { + "epoch": 0.6415346865937988, + "grad_norm": 0.15851566195487976, + "learning_rate": 1.7942181136850672e-05, + "loss": 0.5007, + "step": 2876 + }, + { + "epoch": 0.6417577515056881, + "grad_norm": 0.16172711551189423, + "learning_rate": 1.7940750896200363e-05, + "loss": 0.5043, + "step": 2877 + }, + { + "epoch": 0.6419808164175775, + "grad_norm": 0.18239612877368927, + "learning_rate": 1.7939320215741322e-05, + "loss": 0.4888, + "step": 2878 + }, + { + "epoch": 0.6422038813294669, + "grad_norm": 0.17973902821540833, + "learning_rate": 1.7937889095552787e-05, + "loss": 0.4813, + "step": 2879 + }, + { + "epoch": 0.6424269462413562, + "grad_norm": 0.1610574722290039, + "learning_rate": 1.7936457535714023e-05, + "loss": 0.5138, + "step": 2880 + }, + { + "epoch": 0.6426500111532456, + "grad_norm": 0.16427603363990784, + "learning_rate": 1.7935025536304317e-05, + "loss": 0.483, + "step": 2881 + }, + { + "epoch": 0.642873076065135, + "grad_norm": 0.18190504610538483, + "learning_rate": 1.7933593097402983e-05, + "loss": 0.4968, + "step": 2882 + }, + { + "epoch": 0.6430961409770243, + "grad_norm": 0.16263769567012787, + "learning_rate": 1.793216021908936e-05, + "loss": 0.5245, + "step": 2883 + }, + { + "epoch": 0.6433192058889137, + "grad_norm": 0.16960637271404266, + "learning_rate": 1.793072690144281e-05, + "loss": 0.5353, + "step": 2884 + }, + { + "epoch": 0.6435422708008031, + "grad_norm": 0.1611735224723816, + "learning_rate": 1.7929293144542715e-05, + "loss": 0.4701, + "step": 2885 + }, + { + "epoch": 0.6437653357126923, + "grad_norm": 0.16312386095523834, + "learning_rate": 1.792785894846849e-05, + "loss": 0.5167, + "step": 2886 + }, + { + "epoch": 0.6439884006245817, + "grad_norm": 0.17161010205745697, + "learning_rate": 1.7926424313299568e-05, + "loss": 0.5092, + "step": 2887 + }, + { + "epoch": 0.6442114655364711, + "grad_norm": 0.1609477698802948, + "learning_rate": 1.7924989239115407e-05, + "loss": 0.5295, + "step": 2888 + }, + { + "epoch": 0.6444345304483605, + "grad_norm": 0.1607387214899063, + "learning_rate": 1.7923553725995494e-05, + "loss": 0.5205, + "step": 2889 + }, + { + "epoch": 0.6446575953602498, + "grad_norm": 0.16367650032043457, + "learning_rate": 1.7922117774019333e-05, + "loss": 0.4875, + "step": 2890 + }, + { + "epoch": 0.6448806602721392, + "grad_norm": 0.17014965415000916, + "learning_rate": 1.7920681383266458e-05, + "loss": 0.5218, + "step": 2891 + }, + { + "epoch": 0.6451037251840286, + "grad_norm": 0.1545102298259735, + "learning_rate": 1.7919244553816426e-05, + "loss": 0.4954, + "step": 2892 + }, + { + "epoch": 0.6453267900959179, + "grad_norm": 0.2512567937374115, + "learning_rate": 1.7917807285748817e-05, + "loss": 0.5173, + "step": 2893 + }, + { + "epoch": 0.6455498550078073, + "grad_norm": 0.17359793186187744, + "learning_rate": 1.7916369579143235e-05, + "loss": 0.5253, + "step": 2894 + }, + { + "epoch": 0.6457729199196967, + "grad_norm": 0.17602702975273132, + "learning_rate": 1.7914931434079305e-05, + "loss": 0.492, + "step": 2895 + }, + { + "epoch": 0.645995984831586, + "grad_norm": 0.1598963886499405, + "learning_rate": 1.791349285063669e-05, + "loss": 0.5071, + "step": 2896 + }, + { + "epoch": 0.6462190497434753, + "grad_norm": 0.16161389648914337, + "learning_rate": 1.7912053828895064e-05, + "loss": 0.4896, + "step": 2897 + }, + { + "epoch": 0.6464421146553647, + "grad_norm": 0.16500675678253174, + "learning_rate": 1.7910614368934127e-05, + "loss": 0.5142, + "step": 2898 + }, + { + "epoch": 0.6466651795672541, + "grad_norm": 0.15490639209747314, + "learning_rate": 1.7909174470833604e-05, + "loss": 0.4708, + "step": 2899 + }, + { + "epoch": 0.6468882444791434, + "grad_norm": 0.1680765300989151, + "learning_rate": 1.7907734134673252e-05, + "loss": 0.4978, + "step": 2900 + }, + { + "epoch": 0.6471113093910328, + "grad_norm": 0.16419318318367004, + "learning_rate": 1.790629336053284e-05, + "loss": 0.5106, + "step": 2901 + }, + { + "epoch": 0.6473343743029222, + "grad_norm": 0.1587122082710266, + "learning_rate": 1.790485214849217e-05, + "loss": 0.456, + "step": 2902 + }, + { + "epoch": 0.6475574392148115, + "grad_norm": 0.17150086164474487, + "learning_rate": 1.7903410498631063e-05, + "loss": 0.5066, + "step": 2903 + }, + { + "epoch": 0.6477805041267009, + "grad_norm": 0.16548244655132294, + "learning_rate": 1.790196841102937e-05, + "loss": 0.5297, + "step": 2904 + }, + { + "epoch": 0.6480035690385902, + "grad_norm": 0.1526126116514206, + "learning_rate": 1.790052588576696e-05, + "loss": 0.4693, + "step": 2905 + }, + { + "epoch": 0.6482266339504796, + "grad_norm": 0.16352921724319458, + "learning_rate": 1.7899082922923732e-05, + "loss": 0.5291, + "step": 2906 + }, + { + "epoch": 0.6484496988623689, + "grad_norm": 0.16485454142093658, + "learning_rate": 1.78976395225796e-05, + "loss": 0.4752, + "step": 2907 + }, + { + "epoch": 0.6486727637742583, + "grad_norm": 0.1610778123140335, + "learning_rate": 1.7896195684814516e-05, + "loss": 0.479, + "step": 2908 + }, + { + "epoch": 0.6488958286861477, + "grad_norm": 0.16191574931144714, + "learning_rate": 1.7894751409708447e-05, + "loss": 0.5059, + "step": 2909 + }, + { + "epoch": 0.649118893598037, + "grad_norm": 0.1598249077796936, + "learning_rate": 1.7893306697341385e-05, + "loss": 0.487, + "step": 2910 + }, + { + "epoch": 0.6493419585099264, + "grad_norm": 0.1673557460308075, + "learning_rate": 1.7891861547793345e-05, + "loss": 0.4926, + "step": 2911 + }, + { + "epoch": 0.6495650234218158, + "grad_norm": 0.16955086588859558, + "learning_rate": 1.789041596114437e-05, + "loss": 0.4912, + "step": 2912 + }, + { + "epoch": 0.6497880883337052, + "grad_norm": 0.15351925790309906, + "learning_rate": 1.788896993747453e-05, + "loss": 0.4758, + "step": 2913 + }, + { + "epoch": 0.6500111532455944, + "grad_norm": 0.16994906961917877, + "learning_rate": 1.7887523476863907e-05, + "loss": 0.5038, + "step": 2914 + }, + { + "epoch": 0.6502342181574838, + "grad_norm": 0.16765102744102478, + "learning_rate": 1.7886076579392622e-05, + "loss": 0.4787, + "step": 2915 + }, + { + "epoch": 0.6504572830693732, + "grad_norm": 0.16875889897346497, + "learning_rate": 1.7884629245140812e-05, + "loss": 0.5267, + "step": 2916 + }, + { + "epoch": 0.6506803479812625, + "grad_norm": 0.16783274710178375, + "learning_rate": 1.7883181474188637e-05, + "loss": 0.4913, + "step": 2917 + }, + { + "epoch": 0.6509034128931519, + "grad_norm": 0.16633883118629456, + "learning_rate": 1.7881733266616284e-05, + "loss": 0.5019, + "step": 2918 + }, + { + "epoch": 0.6511264778050413, + "grad_norm": 0.1654990315437317, + "learning_rate": 1.7880284622503966e-05, + "loss": 0.5033, + "step": 2919 + }, + { + "epoch": 0.6513495427169306, + "grad_norm": 0.16043652594089508, + "learning_rate": 1.7878835541931915e-05, + "loss": 0.4957, + "step": 2920 + }, + { + "epoch": 0.65157260762882, + "grad_norm": 0.16924645006656647, + "learning_rate": 1.7877386024980392e-05, + "loss": 0.5223, + "step": 2921 + }, + { + "epoch": 0.6517956725407094, + "grad_norm": 0.1635693907737732, + "learning_rate": 1.7875936071729682e-05, + "loss": 0.4908, + "step": 2922 + }, + { + "epoch": 0.6520187374525988, + "grad_norm": 0.1630011945962906, + "learning_rate": 1.7874485682260087e-05, + "loss": 0.5138, + "step": 2923 + }, + { + "epoch": 0.652241802364488, + "grad_norm": 0.16332125663757324, + "learning_rate": 1.7873034856651944e-05, + "loss": 0.5119, + "step": 2924 + }, + { + "epoch": 0.6524648672763774, + "grad_norm": 0.1802121102809906, + "learning_rate": 1.787158359498561e-05, + "loss": 0.4766, + "step": 2925 + }, + { + "epoch": 0.6526879321882668, + "grad_norm": 0.16062189638614655, + "learning_rate": 1.7870131897341458e-05, + "loss": 0.4747, + "step": 2926 + }, + { + "epoch": 0.6529109971001561, + "grad_norm": 0.16367454826831818, + "learning_rate": 1.7868679763799898e-05, + "loss": 0.4698, + "step": 2927 + }, + { + "epoch": 0.6531340620120455, + "grad_norm": 0.16663846373558044, + "learning_rate": 1.786722719444136e-05, + "loss": 0.4903, + "step": 2928 + }, + { + "epoch": 0.6533571269239349, + "grad_norm": 0.16041219234466553, + "learning_rate": 1.786577418934629e-05, + "loss": 0.496, + "step": 2929 + }, + { + "epoch": 0.6535801918358243, + "grad_norm": 0.1660057008266449, + "learning_rate": 1.7864320748595168e-05, + "loss": 0.497, + "step": 2930 + }, + { + "epoch": 0.6538032567477136, + "grad_norm": 0.17890667915344238, + "learning_rate": 1.7862866872268493e-05, + "loss": 0.5183, + "step": 2931 + }, + { + "epoch": 0.654026321659603, + "grad_norm": 0.15913154184818268, + "learning_rate": 1.7861412560446794e-05, + "loss": 0.4827, + "step": 2932 + }, + { + "epoch": 0.6542493865714923, + "grad_norm": 0.16374540328979492, + "learning_rate": 1.7859957813210614e-05, + "loss": 0.5119, + "step": 2933 + }, + { + "epoch": 0.6544724514833816, + "grad_norm": 0.1748073399066925, + "learning_rate": 1.7858502630640533e-05, + "loss": 0.493, + "step": 2934 + }, + { + "epoch": 0.654695516395271, + "grad_norm": 0.1679995357990265, + "learning_rate": 1.7857047012817144e-05, + "loss": 0.4878, + "step": 2935 + }, + { + "epoch": 0.6549185813071604, + "grad_norm": 0.16782502830028534, + "learning_rate": 1.7855590959821068e-05, + "loss": 0.4942, + "step": 2936 + }, + { + "epoch": 0.6551416462190497, + "grad_norm": 0.1666112244129181, + "learning_rate": 1.785413447173295e-05, + "loss": 0.5109, + "step": 2937 + }, + { + "epoch": 0.6553647111309391, + "grad_norm": 0.16338278353214264, + "learning_rate": 1.785267754863346e-05, + "loss": 0.509, + "step": 2938 + }, + { + "epoch": 0.6555877760428285, + "grad_norm": 0.16831496357917786, + "learning_rate": 1.7851220190603295e-05, + "loss": 0.4933, + "step": 2939 + }, + { + "epoch": 0.6558108409547179, + "grad_norm": 0.16522802412509918, + "learning_rate": 1.7849762397723168e-05, + "loss": 0.5204, + "step": 2940 + }, + { + "epoch": 0.6560339058666071, + "grad_norm": 0.1579723209142685, + "learning_rate": 1.7848304170073822e-05, + "loss": 0.4747, + "step": 2941 + }, + { + "epoch": 0.6562569707784965, + "grad_norm": 0.16369852423667908, + "learning_rate": 1.784684550773602e-05, + "loss": 0.4763, + "step": 2942 + }, + { + "epoch": 0.6564800356903859, + "grad_norm": 0.16621820628643036, + "learning_rate": 1.7845386410790558e-05, + "loss": 0.5113, + "step": 2943 + }, + { + "epoch": 0.6567031006022752, + "grad_norm": 0.161569282412529, + "learning_rate": 1.784392687931825e-05, + "loss": 0.4899, + "step": 2944 + }, + { + "epoch": 0.6569261655141646, + "grad_norm": 0.1616351306438446, + "learning_rate": 1.7842466913399928e-05, + "loss": 0.4879, + "step": 2945 + }, + { + "epoch": 0.657149230426054, + "grad_norm": 0.16426199674606323, + "learning_rate": 1.7841006513116456e-05, + "loss": 0.5265, + "step": 2946 + }, + { + "epoch": 0.6573722953379434, + "grad_norm": 0.1592702567577362, + "learning_rate": 1.7839545678548727e-05, + "loss": 0.5036, + "step": 2947 + }, + { + "epoch": 0.6575953602498327, + "grad_norm": 0.15806902945041656, + "learning_rate": 1.7838084409777637e-05, + "loss": 0.4954, + "step": 2948 + }, + { + "epoch": 0.6578184251617221, + "grad_norm": 0.17478401958942413, + "learning_rate": 1.7836622706884138e-05, + "loss": 0.5046, + "step": 2949 + }, + { + "epoch": 0.6580414900736115, + "grad_norm": 0.16117317974567413, + "learning_rate": 1.7835160569949174e-05, + "loss": 0.4925, + "step": 2950 + }, + { + "epoch": 0.6582645549855007, + "grad_norm": 0.1660463809967041, + "learning_rate": 1.783369799905373e-05, + "loss": 0.5003, + "step": 2951 + }, + { + "epoch": 0.6584876198973901, + "grad_norm": 0.1700252741575241, + "learning_rate": 1.7832234994278822e-05, + "loss": 0.5169, + "step": 2952 + }, + { + "epoch": 0.6587106848092795, + "grad_norm": 0.17790904641151428, + "learning_rate": 1.7830771555705468e-05, + "loss": 0.5088, + "step": 2953 + }, + { + "epoch": 0.6589337497211689, + "grad_norm": 0.16493864357471466, + "learning_rate": 1.782930768341473e-05, + "loss": 0.5082, + "step": 2954 + }, + { + "epoch": 0.6591568146330582, + "grad_norm": 0.16437609493732452, + "learning_rate": 1.7827843377487683e-05, + "loss": 0.4984, + "step": 2955 + }, + { + "epoch": 0.6593798795449476, + "grad_norm": 0.16439390182495117, + "learning_rate": 1.7826378638005432e-05, + "loss": 0.5005, + "step": 2956 + }, + { + "epoch": 0.659602944456837, + "grad_norm": 0.15785926580429077, + "learning_rate": 1.78249134650491e-05, + "loss": 0.5005, + "step": 2957 + }, + { + "epoch": 0.6598260093687263, + "grad_norm": 0.20435695350170135, + "learning_rate": 1.782344785869984e-05, + "loss": 0.4711, + "step": 2958 + }, + { + "epoch": 0.6600490742806157, + "grad_norm": 0.15359720587730408, + "learning_rate": 1.7821981819038828e-05, + "loss": 0.4738, + "step": 2959 + }, + { + "epoch": 0.660272139192505, + "grad_norm": 0.17044633626937866, + "learning_rate": 1.7820515346147262e-05, + "loss": 0.4951, + "step": 2960 + }, + { + "epoch": 0.6604952041043943, + "grad_norm": 0.17282849550247192, + "learning_rate": 1.781904844010636e-05, + "loss": 0.496, + "step": 2961 + }, + { + "epoch": 0.6607182690162837, + "grad_norm": 0.1486678570508957, + "learning_rate": 1.7817581100997374e-05, + "loss": 0.4671, + "step": 2962 + }, + { + "epoch": 0.6609413339281731, + "grad_norm": 0.16091682016849518, + "learning_rate": 1.781611332890157e-05, + "loss": 0.4824, + "step": 2963 + }, + { + "epoch": 0.6611643988400625, + "grad_norm": 0.15881048142910004, + "learning_rate": 1.7814645123900246e-05, + "loss": 0.4864, + "step": 2964 + }, + { + "epoch": 0.6613874637519518, + "grad_norm": 0.17700685560703278, + "learning_rate": 1.781317648607472e-05, + "loss": 0.4962, + "step": 2965 + }, + { + "epoch": 0.6616105286638412, + "grad_norm": 0.15473395586013794, + "learning_rate": 1.781170741550633e-05, + "loss": 0.4737, + "step": 2966 + }, + { + "epoch": 0.6618335935757306, + "grad_norm": 0.1670142114162445, + "learning_rate": 1.781023791227645e-05, + "loss": 0.5105, + "step": 2967 + }, + { + "epoch": 0.6620566584876199, + "grad_norm": 0.18345218896865845, + "learning_rate": 1.780876797646646e-05, + "loss": 0.5086, + "step": 2968 + }, + { + "epoch": 0.6622797233995092, + "grad_norm": 0.16229334473609924, + "learning_rate": 1.7807297608157784e-05, + "loss": 0.5007, + "step": 2969 + }, + { + "epoch": 0.6625027883113986, + "grad_norm": 0.16335418820381165, + "learning_rate": 1.7805826807431856e-05, + "loss": 0.4976, + "step": 2970 + }, + { + "epoch": 0.662725853223288, + "grad_norm": 0.16784057021141052, + "learning_rate": 1.780435557437014e-05, + "loss": 0.5106, + "step": 2971 + }, + { + "epoch": 0.6629489181351773, + "grad_norm": 0.16552112996578217, + "learning_rate": 1.7802883909054118e-05, + "loss": 0.5087, + "step": 2972 + }, + { + "epoch": 0.6631719830470667, + "grad_norm": 0.206945538520813, + "learning_rate": 1.7801411811565308e-05, + "loss": 0.4753, + "step": 2973 + }, + { + "epoch": 0.6633950479589561, + "grad_norm": 0.1575002670288086, + "learning_rate": 1.7799939281985236e-05, + "loss": 0.4875, + "step": 2974 + }, + { + "epoch": 0.6636181128708454, + "grad_norm": 0.1623086929321289, + "learning_rate": 1.7798466320395463e-05, + "loss": 0.4933, + "step": 2975 + }, + { + "epoch": 0.6638411777827348, + "grad_norm": 0.1668468862771988, + "learning_rate": 1.779699292687757e-05, + "loss": 0.462, + "step": 2976 + }, + { + "epoch": 0.6640642426946242, + "grad_norm": 0.43826133012771606, + "learning_rate": 1.7795519101513166e-05, + "loss": 0.5073, + "step": 2977 + }, + { + "epoch": 0.6642873076065134, + "grad_norm": 0.18710243701934814, + "learning_rate": 1.779404484438388e-05, + "loss": 0.5103, + "step": 2978 + }, + { + "epoch": 0.6645103725184028, + "grad_norm": 0.17884770035743713, + "learning_rate": 1.7792570155571358e-05, + "loss": 0.5219, + "step": 2979 + }, + { + "epoch": 0.6647334374302922, + "grad_norm": 0.1652306616306305, + "learning_rate": 1.7791095035157288e-05, + "loss": 0.513, + "step": 2980 + }, + { + "epoch": 0.6649565023421816, + "grad_norm": 0.175328329205513, + "learning_rate": 1.7789619483223367e-05, + "loss": 0.5064, + "step": 2981 + }, + { + "epoch": 0.6651795672540709, + "grad_norm": 0.15750913321971893, + "learning_rate": 1.7788143499851318e-05, + "loss": 0.4608, + "step": 2982 + }, + { + "epoch": 0.6654026321659603, + "grad_norm": 0.15582314133644104, + "learning_rate": 1.7786667085122895e-05, + "loss": 0.4793, + "step": 2983 + }, + { + "epoch": 0.6656256970778497, + "grad_norm": 0.18893486261367798, + "learning_rate": 1.7785190239119864e-05, + "loss": 0.5277, + "step": 2984 + }, + { + "epoch": 0.665848761989739, + "grad_norm": 0.16801688075065613, + "learning_rate": 1.7783712961924032e-05, + "loss": 0.5281, + "step": 2985 + }, + { + "epoch": 0.6660718269016284, + "grad_norm": 0.16510803997516632, + "learning_rate": 1.778223525361721e-05, + "loss": 0.4675, + "step": 2986 + }, + { + "epoch": 0.6662948918135178, + "grad_norm": 0.16860422492027283, + "learning_rate": 1.778075711428125e-05, + "loss": 0.4712, + "step": 2987 + }, + { + "epoch": 0.6665179567254071, + "grad_norm": 0.17091190814971924, + "learning_rate": 1.777927854399802e-05, + "loss": 0.5208, + "step": 2988 + }, + { + "epoch": 0.6667410216372964, + "grad_norm": 0.19528479874134064, + "learning_rate": 1.7777799542849408e-05, + "loss": 0.482, + "step": 2989 + }, + { + "epoch": 0.6669640865491858, + "grad_norm": 0.16672208905220032, + "learning_rate": 1.7776320110917334e-05, + "loss": 0.5002, + "step": 2990 + }, + { + "epoch": 0.6671871514610752, + "grad_norm": 0.1595972180366516, + "learning_rate": 1.777484024828374e-05, + "loss": 0.4868, + "step": 2991 + }, + { + "epoch": 0.6674102163729645, + "grad_norm": 0.16304026544094086, + "learning_rate": 1.7773359955030583e-05, + "loss": 0.4839, + "step": 2992 + }, + { + "epoch": 0.6676332812848539, + "grad_norm": 0.17252400517463684, + "learning_rate": 1.7771879231239857e-05, + "loss": 0.4737, + "step": 2993 + }, + { + "epoch": 0.6678563461967433, + "grad_norm": 0.17409226298332214, + "learning_rate": 1.777039807699357e-05, + "loss": 0.5152, + "step": 2994 + }, + { + "epoch": 0.6680794111086326, + "grad_norm": 0.15897680819034576, + "learning_rate": 1.7768916492373763e-05, + "loss": 0.5081, + "step": 2995 + }, + { + "epoch": 0.668302476020522, + "grad_norm": 0.15727804601192474, + "learning_rate": 1.7767434477462493e-05, + "loss": 0.4902, + "step": 2996 + }, + { + "epoch": 0.6685255409324113, + "grad_norm": 0.16594363749027252, + "learning_rate": 1.776595203234184e-05, + "loss": 0.4982, + "step": 2997 + }, + { + "epoch": 0.6687486058443007, + "grad_norm": 0.16226732730865479, + "learning_rate": 1.7764469157093916e-05, + "loss": 0.5304, + "step": 2998 + }, + { + "epoch": 0.66897167075619, + "grad_norm": 0.15772786736488342, + "learning_rate": 1.7762985851800846e-05, + "loss": 0.4707, + "step": 2999 + }, + { + "epoch": 0.6691947356680794, + "grad_norm": 0.19541241228580475, + "learning_rate": 1.776150211654479e-05, + "loss": 0.4826, + "step": 3000 + }, + { + "epoch": 0.6694178005799688, + "grad_norm": 0.17381907999515533, + "learning_rate": 1.7760017951407924e-05, + "loss": 0.5304, + "step": 3001 + }, + { + "epoch": 0.6696408654918581, + "grad_norm": 0.16170580685138702, + "learning_rate": 1.7758533356472454e-05, + "loss": 0.4722, + "step": 3002 + }, + { + "epoch": 0.6698639304037475, + "grad_norm": 0.16046100854873657, + "learning_rate": 1.7757048331820604e-05, + "loss": 0.4852, + "step": 3003 + }, + { + "epoch": 0.6700869953156369, + "grad_norm": 0.17672637104988098, + "learning_rate": 1.775556287753462e-05, + "loss": 0.5, + "step": 3004 + }, + { + "epoch": 0.6703100602275263, + "grad_norm": 0.15894585847854614, + "learning_rate": 1.7754076993696784e-05, + "loss": 0.4819, + "step": 3005 + }, + { + "epoch": 0.6705331251394155, + "grad_norm": 0.16397017240524292, + "learning_rate": 1.7752590680389382e-05, + "loss": 0.5348, + "step": 3006 + }, + { + "epoch": 0.6707561900513049, + "grad_norm": 0.17470906674861908, + "learning_rate": 1.7751103937694748e-05, + "loss": 0.5056, + "step": 3007 + }, + { + "epoch": 0.6709792549631943, + "grad_norm": 0.16147972643375397, + "learning_rate": 1.774961676569522e-05, + "loss": 0.4845, + "step": 3008 + }, + { + "epoch": 0.6712023198750836, + "grad_norm": 0.15075084567070007, + "learning_rate": 1.774812916447317e-05, + "loss": 0.4873, + "step": 3009 + }, + { + "epoch": 0.671425384786973, + "grad_norm": 0.1605006605386734, + "learning_rate": 1.774664113411099e-05, + "loss": 0.4885, + "step": 3010 + }, + { + "epoch": 0.6716484496988624, + "grad_norm": 0.17120316624641418, + "learning_rate": 1.7745152674691093e-05, + "loss": 0.4952, + "step": 3011 + }, + { + "epoch": 0.6718715146107517, + "grad_norm": 0.17164325714111328, + "learning_rate": 1.774366378629592e-05, + "loss": 0.5225, + "step": 3012 + }, + { + "epoch": 0.6720945795226411, + "grad_norm": 0.16912591457366943, + "learning_rate": 1.774217446900794e-05, + "loss": 0.5165, + "step": 3013 + }, + { + "epoch": 0.6723176444345305, + "grad_norm": 0.1592075079679489, + "learning_rate": 1.7740684722909638e-05, + "loss": 0.481, + "step": 3014 + }, + { + "epoch": 0.6725407093464199, + "grad_norm": 0.15714260935783386, + "learning_rate": 1.7739194548083526e-05, + "loss": 0.5122, + "step": 3015 + }, + { + "epoch": 0.6727637742583091, + "grad_norm": 0.15991204977035522, + "learning_rate": 1.7737703944612135e-05, + "loss": 0.5006, + "step": 3016 + }, + { + "epoch": 0.6729868391701985, + "grad_norm": 0.15693789720535278, + "learning_rate": 1.7736212912578028e-05, + "loss": 0.4867, + "step": 3017 + }, + { + "epoch": 0.6732099040820879, + "grad_norm": 0.15840266644954681, + "learning_rate": 1.773472145206379e-05, + "loss": 0.4937, + "step": 3018 + }, + { + "epoch": 0.6734329689939772, + "grad_norm": 0.16329781711101532, + "learning_rate": 1.7733229563152024e-05, + "loss": 0.489, + "step": 3019 + }, + { + "epoch": 0.6736560339058666, + "grad_norm": 0.17248262465000153, + "learning_rate": 1.7731737245925357e-05, + "loss": 0.5112, + "step": 3020 + }, + { + "epoch": 0.673879098817756, + "grad_norm": 0.1815134584903717, + "learning_rate": 1.7730244500466454e-05, + "loss": 0.5027, + "step": 3021 + }, + { + "epoch": 0.6741021637296454, + "grad_norm": 0.17082563042640686, + "learning_rate": 1.772875132685798e-05, + "loss": 0.5018, + "step": 3022 + }, + { + "epoch": 0.6743252286415347, + "grad_norm": 0.17142242193222046, + "learning_rate": 1.772725772518264e-05, + "loss": 0.5106, + "step": 3023 + }, + { + "epoch": 0.674548293553424, + "grad_norm": 0.17516618967056274, + "learning_rate": 1.7725763695523166e-05, + "loss": 0.4743, + "step": 3024 + }, + { + "epoch": 0.6747713584653134, + "grad_norm": 0.16038702428340912, + "learning_rate": 1.77242692379623e-05, + "loss": 0.5071, + "step": 3025 + }, + { + "epoch": 0.6749944233772027, + "grad_norm": 0.16445225477218628, + "learning_rate": 1.7722774352582816e-05, + "loss": 0.4992, + "step": 3026 + }, + { + "epoch": 0.6752174882890921, + "grad_norm": 0.16889688372612, + "learning_rate": 1.772127903946751e-05, + "loss": 0.4909, + "step": 3027 + }, + { + "epoch": 0.6754405532009815, + "grad_norm": 0.17117194831371307, + "learning_rate": 1.77197832986992e-05, + "loss": 0.4927, + "step": 3028 + }, + { + "epoch": 0.6756636181128709, + "grad_norm": 0.16765987873077393, + "learning_rate": 1.7718287130360733e-05, + "loss": 0.5065, + "step": 3029 + }, + { + "epoch": 0.6758866830247602, + "grad_norm": 0.1618220955133438, + "learning_rate": 1.7716790534534977e-05, + "loss": 0.4931, + "step": 3030 + }, + { + "epoch": 0.6761097479366496, + "grad_norm": 0.16639363765716553, + "learning_rate": 1.7715293511304815e-05, + "loss": 0.5044, + "step": 3031 + }, + { + "epoch": 0.676332812848539, + "grad_norm": 0.167351633310318, + "learning_rate": 1.7713796060753173e-05, + "loss": 0.5188, + "step": 3032 + }, + { + "epoch": 0.6765558777604282, + "grad_norm": 0.1656593531370163, + "learning_rate": 1.771229818296298e-05, + "loss": 0.5271, + "step": 3033 + }, + { + "epoch": 0.6767789426723176, + "grad_norm": 0.168874591588974, + "learning_rate": 1.7710799878017203e-05, + "loss": 0.4947, + "step": 3034 + }, + { + "epoch": 0.677002007584207, + "grad_norm": 0.16307704150676727, + "learning_rate": 1.7709301145998827e-05, + "loss": 0.5071, + "step": 3035 + }, + { + "epoch": 0.6772250724960963, + "grad_norm": 0.1524697244167328, + "learning_rate": 1.7707801986990857e-05, + "loss": 0.497, + "step": 3036 + }, + { + "epoch": 0.6774481374079857, + "grad_norm": 0.16599062085151672, + "learning_rate": 1.7706302401076327e-05, + "loss": 0.4726, + "step": 3037 + }, + { + "epoch": 0.6776712023198751, + "grad_norm": 0.16413775086402893, + "learning_rate": 1.77048023883383e-05, + "loss": 0.527, + "step": 3038 + }, + { + "epoch": 0.6778942672317645, + "grad_norm": 0.1584494262933731, + "learning_rate": 1.770330194885985e-05, + "loss": 0.5095, + "step": 3039 + }, + { + "epoch": 0.6781173321436538, + "grad_norm": 0.15526416897773743, + "learning_rate": 1.7701801082724084e-05, + "loss": 0.4914, + "step": 3040 + }, + { + "epoch": 0.6783403970555432, + "grad_norm": 0.16402754187583923, + "learning_rate": 1.7700299790014126e-05, + "loss": 0.5184, + "step": 3041 + }, + { + "epoch": 0.6785634619674326, + "grad_norm": 0.16508348286151886, + "learning_rate": 1.769879807081313e-05, + "loss": 0.4783, + "step": 3042 + }, + { + "epoch": 0.6787865268793218, + "grad_norm": 0.15416789054870605, + "learning_rate": 1.769729592520427e-05, + "loss": 0.4818, + "step": 3043 + }, + { + "epoch": 0.6790095917912112, + "grad_norm": 0.27602240443229675, + "learning_rate": 1.769579335327074e-05, + "loss": 0.49, + "step": 3044 + }, + { + "epoch": 0.6792326567031006, + "grad_norm": 0.15309108793735504, + "learning_rate": 1.7694290355095768e-05, + "loss": 0.4778, + "step": 3045 + }, + { + "epoch": 0.67945572161499, + "grad_norm": 0.15830327570438385, + "learning_rate": 1.76927869307626e-05, + "loss": 0.4909, + "step": 3046 + }, + { + "epoch": 0.6796787865268793, + "grad_norm": 0.16737417876720428, + "learning_rate": 1.76912830803545e-05, + "loss": 0.4642, + "step": 3047 + }, + { + "epoch": 0.6799018514387687, + "grad_norm": 0.16713353991508484, + "learning_rate": 1.7689778803954764e-05, + "loss": 0.5076, + "step": 3048 + }, + { + "epoch": 0.6801249163506581, + "grad_norm": 0.15644113719463348, + "learning_rate": 1.7688274101646702e-05, + "loss": 0.4746, + "step": 3049 + }, + { + "epoch": 0.6803479812625474, + "grad_norm": 0.1714516133069992, + "learning_rate": 1.7686768973513663e-05, + "loss": 0.4924, + "step": 3050 + }, + { + "epoch": 0.6805710461744368, + "grad_norm": 0.1625642329454422, + "learning_rate": 1.7685263419639008e-05, + "loss": 0.5103, + "step": 3051 + }, + { + "epoch": 0.6807941110863261, + "grad_norm": 0.15343210101127625, + "learning_rate": 1.768375744010612e-05, + "loss": 0.4871, + "step": 3052 + }, + { + "epoch": 0.6810171759982154, + "grad_norm": 0.1622699499130249, + "learning_rate": 1.7682251034998413e-05, + "loss": 0.5093, + "step": 3053 + }, + { + "epoch": 0.6812402409101048, + "grad_norm": 0.15701012313365936, + "learning_rate": 1.768074420439932e-05, + "loss": 0.5052, + "step": 3054 + }, + { + "epoch": 0.6814633058219942, + "grad_norm": 0.1647741198539734, + "learning_rate": 1.76792369483923e-05, + "loss": 0.528, + "step": 3055 + }, + { + "epoch": 0.6816863707338836, + "grad_norm": 0.16413739323616028, + "learning_rate": 1.7677729267060836e-05, + "loss": 0.5029, + "step": 3056 + }, + { + "epoch": 0.6819094356457729, + "grad_norm": 0.1663079708814621, + "learning_rate": 1.7676221160488426e-05, + "loss": 0.4961, + "step": 3057 + }, + { + "epoch": 0.6821325005576623, + "grad_norm": 0.1598016768693924, + "learning_rate": 1.7674712628758603e-05, + "loss": 0.5201, + "step": 3058 + }, + { + "epoch": 0.6823555654695517, + "grad_norm": 0.18910476565361023, + "learning_rate": 1.767320367195492e-05, + "loss": 0.5203, + "step": 3059 + }, + { + "epoch": 0.682578630381441, + "grad_norm": 0.1525489091873169, + "learning_rate": 1.767169429016095e-05, + "loss": 0.4609, + "step": 3060 + }, + { + "epoch": 0.6828016952933303, + "grad_norm": 0.22233475744724274, + "learning_rate": 1.7670184483460296e-05, + "loss": 0.4936, + "step": 3061 + }, + { + "epoch": 0.6830247602052197, + "grad_norm": 0.16714778542518616, + "learning_rate": 1.766867425193658e-05, + "loss": 0.5248, + "step": 3062 + }, + { + "epoch": 0.6832478251171091, + "grad_norm": 0.1673208624124527, + "learning_rate": 1.766716359567344e-05, + "loss": 0.5056, + "step": 3063 + }, + { + "epoch": 0.6834708900289984, + "grad_norm": 0.1797683835029602, + "learning_rate": 1.7665652514754554e-05, + "loss": 0.5039, + "step": 3064 + }, + { + "epoch": 0.6836939549408878, + "grad_norm": 0.1645476371049881, + "learning_rate": 1.7664141009263614e-05, + "loss": 0.4907, + "step": 3065 + }, + { + "epoch": 0.6839170198527772, + "grad_norm": 0.15605735778808594, + "learning_rate": 1.7662629079284336e-05, + "loss": 0.4821, + "step": 3066 + }, + { + "epoch": 0.6841400847646665, + "grad_norm": 0.16608816385269165, + "learning_rate": 1.7661116724900456e-05, + "loss": 0.5028, + "step": 3067 + }, + { + "epoch": 0.6843631496765559, + "grad_norm": 0.16926267743110657, + "learning_rate": 1.7659603946195746e-05, + "loss": 0.4933, + "step": 3068 + }, + { + "epoch": 0.6845862145884453, + "grad_norm": 0.15846210718154907, + "learning_rate": 1.7658090743253985e-05, + "loss": 0.4991, + "step": 3069 + }, + { + "epoch": 0.6848092795003345, + "grad_norm": 0.16848038136959076, + "learning_rate": 1.7656577116158988e-05, + "loss": 0.5002, + "step": 3070 + }, + { + "epoch": 0.6850323444122239, + "grad_norm": 0.16113083064556122, + "learning_rate": 1.765506306499459e-05, + "loss": 0.4999, + "step": 3071 + }, + { + "epoch": 0.6852554093241133, + "grad_norm": 0.1538555920124054, + "learning_rate": 1.7653548589844648e-05, + "loss": 0.4812, + "step": 3072 + }, + { + "epoch": 0.6854784742360027, + "grad_norm": 0.16473166644573212, + "learning_rate": 1.765203369079304e-05, + "loss": 0.4885, + "step": 3073 + }, + { + "epoch": 0.685701539147892, + "grad_norm": 0.15544745326042175, + "learning_rate": 1.765051836792367e-05, + "loss": 0.4901, + "step": 3074 + }, + { + "epoch": 0.6859246040597814, + "grad_norm": 0.1728777289390564, + "learning_rate": 1.764900262132048e-05, + "loss": 0.4578, + "step": 3075 + }, + { + "epoch": 0.6861476689716708, + "grad_norm": 0.16069582104682922, + "learning_rate": 1.76474864510674e-05, + "loss": 0.4904, + "step": 3076 + }, + { + "epoch": 0.6863707338835601, + "grad_norm": 0.15537656843662262, + "learning_rate": 1.764596985724842e-05, + "loss": 0.4899, + "step": 3077 + }, + { + "epoch": 0.6865937987954495, + "grad_norm": 0.16955626010894775, + "learning_rate": 1.7644452839947536e-05, + "loss": 0.5011, + "step": 3078 + }, + { + "epoch": 0.6868168637073389, + "grad_norm": 0.16193504631519318, + "learning_rate": 1.7642935399248765e-05, + "loss": 0.5128, + "step": 3079 + }, + { + "epoch": 0.6870399286192282, + "grad_norm": 0.1631263643503189, + "learning_rate": 1.7641417535236155e-05, + "loss": 0.5026, + "step": 3080 + }, + { + "epoch": 0.6872629935311175, + "grad_norm": 0.18346811830997467, + "learning_rate": 1.7639899247993775e-05, + "loss": 0.4857, + "step": 3081 + }, + { + "epoch": 0.6874860584430069, + "grad_norm": 0.16757658123970032, + "learning_rate": 1.7638380537605722e-05, + "loss": 0.4935, + "step": 3082 + }, + { + "epoch": 0.6877091233548963, + "grad_norm": 0.18027780950069427, + "learning_rate": 1.7636861404156106e-05, + "loss": 0.5448, + "step": 3083 + }, + { + "epoch": 0.6879321882667856, + "grad_norm": 0.17672498524188995, + "learning_rate": 1.763534184772907e-05, + "loss": 0.5054, + "step": 3084 + }, + { + "epoch": 0.688155253178675, + "grad_norm": 0.15495164692401886, + "learning_rate": 1.763382186840877e-05, + "loss": 0.4778, + "step": 3085 + }, + { + "epoch": 0.6883783180905644, + "grad_norm": 0.16750416159629822, + "learning_rate": 1.76323014662794e-05, + "loss": 0.5147, + "step": 3086 + }, + { + "epoch": 0.6886013830024537, + "grad_norm": 0.16243582963943481, + "learning_rate": 1.763078064142516e-05, + "loss": 0.5029, + "step": 3087 + }, + { + "epoch": 0.688824447914343, + "grad_norm": 0.1765555739402771, + "learning_rate": 1.7629259393930292e-05, + "loss": 0.5402, + "step": 3088 + }, + { + "epoch": 0.6890475128262324, + "grad_norm": 0.17327693104743958, + "learning_rate": 1.7627737723879048e-05, + "loss": 0.5223, + "step": 3089 + }, + { + "epoch": 0.6892705777381218, + "grad_norm": 0.16204944252967834, + "learning_rate": 1.762621563135571e-05, + "loss": 0.4926, + "step": 3090 + }, + { + "epoch": 0.6894936426500111, + "grad_norm": 0.16001787781715393, + "learning_rate": 1.762469311644458e-05, + "loss": 0.5038, + "step": 3091 + }, + { + "epoch": 0.6897167075619005, + "grad_norm": 0.15672995150089264, + "learning_rate": 1.7623170179229982e-05, + "loss": 0.4752, + "step": 3092 + }, + { + "epoch": 0.6899397724737899, + "grad_norm": 0.16134774684906006, + "learning_rate": 1.7621646819796264e-05, + "loss": 0.4911, + "step": 3093 + }, + { + "epoch": 0.6901628373856792, + "grad_norm": 0.18240399658679962, + "learning_rate": 1.762012303822781e-05, + "loss": 0.4956, + "step": 3094 + }, + { + "epoch": 0.6903859022975686, + "grad_norm": 0.16389527916908264, + "learning_rate": 1.761859883460901e-05, + "loss": 0.4713, + "step": 3095 + }, + { + "epoch": 0.690608967209458, + "grad_norm": 0.16408023238182068, + "learning_rate": 1.761707420902428e-05, + "loss": 0.4925, + "step": 3096 + }, + { + "epoch": 0.6908320321213474, + "grad_norm": 0.16707156598567963, + "learning_rate": 1.761554916155807e-05, + "loss": 0.5094, + "step": 3097 + }, + { + "epoch": 0.6910550970332366, + "grad_norm": 0.1580830216407776, + "learning_rate": 1.7614023692294838e-05, + "loss": 0.4896, + "step": 3098 + }, + { + "epoch": 0.691278161945126, + "grad_norm": 0.1949886530637741, + "learning_rate": 1.7612497801319084e-05, + "loss": 0.4919, + "step": 3099 + }, + { + "epoch": 0.6915012268570154, + "grad_norm": 0.1662502884864807, + "learning_rate": 1.7610971488715315e-05, + "loss": 0.5104, + "step": 3100 + }, + { + "epoch": 0.6917242917689047, + "grad_norm": 0.15772640705108643, + "learning_rate": 1.760944475456807e-05, + "loss": 0.4699, + "step": 3101 + }, + { + "epoch": 0.6919473566807941, + "grad_norm": 0.17815501987934113, + "learning_rate": 1.760791759896191e-05, + "loss": 0.5133, + "step": 3102 + }, + { + "epoch": 0.6921704215926835, + "grad_norm": 0.16980375349521637, + "learning_rate": 1.760639002198142e-05, + "loss": 0.4874, + "step": 3103 + }, + { + "epoch": 0.6923934865045729, + "grad_norm": 0.16098888218402863, + "learning_rate": 1.7604862023711204e-05, + "loss": 0.508, + "step": 3104 + }, + { + "epoch": 0.6926165514164622, + "grad_norm": 0.16175471246242523, + "learning_rate": 1.760333360423589e-05, + "loss": 0.4963, + "step": 3105 + }, + { + "epoch": 0.6928396163283516, + "grad_norm": 0.1580919772386551, + "learning_rate": 1.7601804763640137e-05, + "loss": 0.4674, + "step": 3106 + }, + { + "epoch": 0.693062681240241, + "grad_norm": 0.15538008511066437, + "learning_rate": 1.7600275502008618e-05, + "loss": 0.4907, + "step": 3107 + }, + { + "epoch": 0.6932857461521302, + "grad_norm": 0.15990300476551056, + "learning_rate": 1.7598745819426034e-05, + "loss": 0.5035, + "step": 3108 + }, + { + "epoch": 0.6935088110640196, + "grad_norm": 0.1725892871618271, + "learning_rate": 1.759721571597711e-05, + "loss": 0.5196, + "step": 3109 + }, + { + "epoch": 0.693731875975909, + "grad_norm": 0.16250944137573242, + "learning_rate": 1.7595685191746586e-05, + "loss": 0.4907, + "step": 3110 + }, + { + "epoch": 0.6939549408877983, + "grad_norm": 0.16884207725524902, + "learning_rate": 1.759415424681924e-05, + "loss": 0.535, + "step": 3111 + }, + { + "epoch": 0.6941780057996877, + "grad_norm": 0.1701684594154358, + "learning_rate": 1.7592622881279867e-05, + "loss": 0.4878, + "step": 3112 + }, + { + "epoch": 0.6944010707115771, + "grad_norm": 0.17077569663524628, + "learning_rate": 1.7591091095213277e-05, + "loss": 0.4851, + "step": 3113 + }, + { + "epoch": 0.6946241356234665, + "grad_norm": 0.33712977170944214, + "learning_rate": 1.758955888870431e-05, + "loss": 0.476, + "step": 3114 + }, + { + "epoch": 0.6948472005353558, + "grad_norm": 0.16534623503684998, + "learning_rate": 1.7588026261837833e-05, + "loss": 0.4962, + "step": 3115 + }, + { + "epoch": 0.6950702654472452, + "grad_norm": 0.17375873029232025, + "learning_rate": 1.758649321469873e-05, + "loss": 0.4883, + "step": 3116 + }, + { + "epoch": 0.6952933303591345, + "grad_norm": 0.16607031226158142, + "learning_rate": 1.758495974737191e-05, + "loss": 0.4727, + "step": 3117 + }, + { + "epoch": 0.6955163952710238, + "grad_norm": 0.16579467058181763, + "learning_rate": 1.7583425859942312e-05, + "loss": 0.5156, + "step": 3118 + }, + { + "epoch": 0.6957394601829132, + "grad_norm": 0.16067442297935486, + "learning_rate": 1.7581891552494886e-05, + "loss": 0.4797, + "step": 3119 + }, + { + "epoch": 0.6959625250948026, + "grad_norm": 0.16372907161712646, + "learning_rate": 1.7580356825114616e-05, + "loss": 0.4918, + "step": 3120 + }, + { + "epoch": 0.696185590006692, + "grad_norm": 0.1595798134803772, + "learning_rate": 1.75788216778865e-05, + "loss": 0.4963, + "step": 3121 + }, + { + "epoch": 0.6964086549185813, + "grad_norm": 0.16394098103046417, + "learning_rate": 1.757728611089557e-05, + "loss": 0.5189, + "step": 3122 + }, + { + "epoch": 0.6966317198304707, + "grad_norm": 0.16349278390407562, + "learning_rate": 1.757575012422687e-05, + "loss": 0.5392, + "step": 3123 + }, + { + "epoch": 0.6968547847423601, + "grad_norm": 0.16474361717700958, + "learning_rate": 1.7574213717965473e-05, + "loss": 0.498, + "step": 3124 + }, + { + "epoch": 0.6970778496542493, + "grad_norm": 0.16759975254535675, + "learning_rate": 1.757267689219648e-05, + "loss": 0.4909, + "step": 3125 + }, + { + "epoch": 0.6973009145661387, + "grad_norm": 0.18048445880413055, + "learning_rate": 1.7571139647005004e-05, + "loss": 0.5313, + "step": 3126 + }, + { + "epoch": 0.6975239794780281, + "grad_norm": 0.21055570244789124, + "learning_rate": 1.7569601982476194e-05, + "loss": 0.4773, + "step": 3127 + }, + { + "epoch": 0.6977470443899174, + "grad_norm": 0.16457606852054596, + "learning_rate": 1.7568063898695205e-05, + "loss": 0.4817, + "step": 3128 + }, + { + "epoch": 0.6979701093018068, + "grad_norm": 0.15786704421043396, + "learning_rate": 1.7566525395747237e-05, + "loss": 0.4793, + "step": 3129 + }, + { + "epoch": 0.6981931742136962, + "grad_norm": 0.17408685386180878, + "learning_rate": 1.7564986473717498e-05, + "loss": 0.5238, + "step": 3130 + }, + { + "epoch": 0.6984162391255856, + "grad_norm": 0.1875070482492447, + "learning_rate": 1.7563447132691222e-05, + "loss": 0.5133, + "step": 3131 + }, + { + "epoch": 0.6986393040374749, + "grad_norm": 0.15555784106254578, + "learning_rate": 1.7561907372753665e-05, + "loss": 0.4851, + "step": 3132 + }, + { + "epoch": 0.6988623689493643, + "grad_norm": 0.15845829248428345, + "learning_rate": 1.756036719399011e-05, + "loss": 0.4989, + "step": 3133 + }, + { + "epoch": 0.6990854338612537, + "grad_norm": 0.20709070563316345, + "learning_rate": 1.7558826596485866e-05, + "loss": 0.4939, + "step": 3134 + }, + { + "epoch": 0.6993084987731429, + "grad_norm": 0.17586678266525269, + "learning_rate": 1.755728558032626e-05, + "loss": 0.5028, + "step": 3135 + }, + { + "epoch": 0.6995315636850323, + "grad_norm": 0.16700804233551025, + "learning_rate": 1.7555744145596638e-05, + "loss": 0.4955, + "step": 3136 + }, + { + "epoch": 0.6997546285969217, + "grad_norm": 0.17724819481372833, + "learning_rate": 1.755420229238238e-05, + "loss": 0.4571, + "step": 3137 + }, + { + "epoch": 0.6999776935088111, + "grad_norm": 0.17517662048339844, + "learning_rate": 1.755266002076888e-05, + "loss": 0.5144, + "step": 3138 + }, + { + "epoch": 0.7002007584207004, + "grad_norm": 0.16020509600639343, + "learning_rate": 1.755111733084156e-05, + "loss": 0.4523, + "step": 3139 + }, + { + "epoch": 0.7004238233325898, + "grad_norm": 0.15964041650295258, + "learning_rate": 1.7549574222685864e-05, + "loss": 0.4695, + "step": 3140 + }, + { + "epoch": 0.7006468882444792, + "grad_norm": 0.17395976185798645, + "learning_rate": 1.754803069638726e-05, + "loss": 0.515, + "step": 3141 + }, + { + "epoch": 0.7008699531563685, + "grad_norm": 0.15984676778316498, + "learning_rate": 1.7546486752031237e-05, + "loss": 0.4909, + "step": 3142 + }, + { + "epoch": 0.7010930180682579, + "grad_norm": 0.16784314811229706, + "learning_rate": 1.7544942389703305e-05, + "loss": 0.4973, + "step": 3143 + }, + { + "epoch": 0.7013160829801472, + "grad_norm": 0.16454213857650757, + "learning_rate": 1.754339760948901e-05, + "loss": 0.503, + "step": 3144 + }, + { + "epoch": 0.7015391478920365, + "grad_norm": 0.1578417867422104, + "learning_rate": 1.7541852411473902e-05, + "loss": 0.4632, + "step": 3145 + }, + { + "epoch": 0.7017622128039259, + "grad_norm": 0.16670000553131104, + "learning_rate": 1.7540306795743566e-05, + "loss": 0.4804, + "step": 3146 + }, + { + "epoch": 0.7019852777158153, + "grad_norm": 0.15188740193843842, + "learning_rate": 1.753876076238361e-05, + "loss": 0.4884, + "step": 3147 + }, + { + "epoch": 0.7022083426277047, + "grad_norm": 0.16512979567050934, + "learning_rate": 1.7537214311479663e-05, + "loss": 0.5002, + "step": 3148 + }, + { + "epoch": 0.702431407539594, + "grad_norm": 0.15965288877487183, + "learning_rate": 1.7535667443117377e-05, + "loss": 0.4885, + "step": 3149 + }, + { + "epoch": 0.7026544724514834, + "grad_norm": 0.1552652269601822, + "learning_rate": 1.7534120157382425e-05, + "loss": 0.4955, + "step": 3150 + }, + { + "epoch": 0.7028775373633728, + "grad_norm": 0.16739974915981293, + "learning_rate": 1.7532572454360506e-05, + "loss": 0.4902, + "step": 3151 + }, + { + "epoch": 0.703100602275262, + "grad_norm": 0.18262702226638794, + "learning_rate": 1.7531024334137348e-05, + "loss": 0.5283, + "step": 3152 + }, + { + "epoch": 0.7033236671871514, + "grad_norm": 0.15477107465267181, + "learning_rate": 1.7529475796798686e-05, + "loss": 0.4969, + "step": 3153 + }, + { + "epoch": 0.7035467320990408, + "grad_norm": 0.16841940581798553, + "learning_rate": 1.7527926842430295e-05, + "loss": 0.5351, + "step": 3154 + }, + { + "epoch": 0.7037697970109302, + "grad_norm": 0.17480523884296417, + "learning_rate": 1.7526377471117963e-05, + "loss": 0.5279, + "step": 3155 + }, + { + "epoch": 0.7039928619228195, + "grad_norm": 0.1635589897632599, + "learning_rate": 1.75248276829475e-05, + "loss": 0.5076, + "step": 3156 + }, + { + "epoch": 0.7042159268347089, + "grad_norm": 0.18227113783359528, + "learning_rate": 1.7523277478004747e-05, + "loss": 0.4927, + "step": 3157 + }, + { + "epoch": 0.7044389917465983, + "grad_norm": 0.17305073142051697, + "learning_rate": 1.7521726856375568e-05, + "loss": 0.4967, + "step": 3158 + }, + { + "epoch": 0.7046620566584876, + "grad_norm": 0.16051623225212097, + "learning_rate": 1.7520175818145838e-05, + "loss": 0.5031, + "step": 3159 + }, + { + "epoch": 0.704885121570377, + "grad_norm": 0.16289980709552765, + "learning_rate": 1.751862436340147e-05, + "loss": 0.5101, + "step": 3160 + }, + { + "epoch": 0.7051081864822664, + "grad_norm": 0.16630062460899353, + "learning_rate": 1.751707249222839e-05, + "loss": 0.4928, + "step": 3161 + }, + { + "epoch": 0.7053312513941556, + "grad_norm": 0.154288649559021, + "learning_rate": 1.7515520204712552e-05, + "loss": 0.4921, + "step": 3162 + }, + { + "epoch": 0.705554316306045, + "grad_norm": 0.1542695015668869, + "learning_rate": 1.751396750093993e-05, + "loss": 0.4929, + "step": 3163 + }, + { + "epoch": 0.7057773812179344, + "grad_norm": 0.1700357347726822, + "learning_rate": 1.7512414380996524e-05, + "loss": 0.5014, + "step": 3164 + }, + { + "epoch": 0.7060004461298238, + "grad_norm": 0.1531635820865631, + "learning_rate": 1.7510860844968355e-05, + "loss": 0.4764, + "step": 3165 + }, + { + "epoch": 0.7062235110417131, + "grad_norm": 0.1919977366924286, + "learning_rate": 1.7509306892941464e-05, + "loss": 0.5453, + "step": 3166 + }, + { + "epoch": 0.7064465759536025, + "grad_norm": 0.16512922942638397, + "learning_rate": 1.7507752525001924e-05, + "loss": 0.5015, + "step": 3167 + }, + { + "epoch": 0.7066696408654919, + "grad_norm": 0.1677495837211609, + "learning_rate": 1.7506197741235822e-05, + "loss": 0.5164, + "step": 3168 + }, + { + "epoch": 0.7068927057773812, + "grad_norm": 0.15319664776325226, + "learning_rate": 1.7504642541729273e-05, + "loss": 0.4815, + "step": 3169 + }, + { + "epoch": 0.7071157706892706, + "grad_norm": 0.16099566221237183, + "learning_rate": 1.7503086926568416e-05, + "loss": 0.4966, + "step": 3170 + }, + { + "epoch": 0.70733883560116, + "grad_norm": 0.15989680588245392, + "learning_rate": 1.750153089583941e-05, + "loss": 0.4949, + "step": 3171 + }, + { + "epoch": 0.7075619005130493, + "grad_norm": 0.16961508989334106, + "learning_rate": 1.7499974449628433e-05, + "loss": 0.4916, + "step": 3172 + }, + { + "epoch": 0.7077849654249386, + "grad_norm": 0.1632029265165329, + "learning_rate": 1.74984175880217e-05, + "loss": 0.4921, + "step": 3173 + }, + { + "epoch": 0.708008030336828, + "grad_norm": 0.1647975593805313, + "learning_rate": 1.7496860311105426e-05, + "loss": 0.4635, + "step": 3174 + }, + { + "epoch": 0.7082310952487174, + "grad_norm": 0.15906676650047302, + "learning_rate": 1.7495302618965874e-05, + "loss": 0.4686, + "step": 3175 + }, + { + "epoch": 0.7084541601606067, + "grad_norm": 0.16585782170295715, + "learning_rate": 1.7493744511689316e-05, + "loss": 0.5159, + "step": 3176 + }, + { + "epoch": 0.7086772250724961, + "grad_norm": 0.16121666133403778, + "learning_rate": 1.7492185989362052e-05, + "loss": 0.5, + "step": 3177 + }, + { + "epoch": 0.7089002899843855, + "grad_norm": 0.1639820635318756, + "learning_rate": 1.7490627052070394e-05, + "loss": 0.4959, + "step": 3178 + }, + { + "epoch": 0.7091233548962749, + "grad_norm": 0.17450456321239471, + "learning_rate": 1.74890676999007e-05, + "loss": 0.4932, + "step": 3179 + }, + { + "epoch": 0.7093464198081642, + "grad_norm": 0.1635378748178482, + "learning_rate": 1.7487507932939324e-05, + "loss": 0.5107, + "step": 3180 + }, + { + "epoch": 0.7095694847200535, + "grad_norm": 0.16000831127166748, + "learning_rate": 1.7485947751272657e-05, + "loss": 0.462, + "step": 3181 + }, + { + "epoch": 0.7097925496319429, + "grad_norm": 0.16015185415744781, + "learning_rate": 1.748438715498712e-05, + "loss": 0.4934, + "step": 3182 + }, + { + "epoch": 0.7100156145438322, + "grad_norm": 0.1680067628622055, + "learning_rate": 1.7482826144169144e-05, + "loss": 0.5074, + "step": 3183 + }, + { + "epoch": 0.7102386794557216, + "grad_norm": 0.16120266914367676, + "learning_rate": 1.7481264718905187e-05, + "loss": 0.4853, + "step": 3184 + }, + { + "epoch": 0.710461744367611, + "grad_norm": 0.1535530686378479, + "learning_rate": 1.747970287928173e-05, + "loss": 0.4722, + "step": 3185 + }, + { + "epoch": 0.7106848092795003, + "grad_norm": 0.1595955789089203, + "learning_rate": 1.747814062538528e-05, + "loss": 0.5031, + "step": 3186 + }, + { + "epoch": 0.7109078741913897, + "grad_norm": 0.16170883178710938, + "learning_rate": 1.7476577957302358e-05, + "loss": 0.4947, + "step": 3187 + }, + { + "epoch": 0.7111309391032791, + "grad_norm": 0.16378958523273468, + "learning_rate": 1.747501487511952e-05, + "loss": 0.4722, + "step": 3188 + }, + { + "epoch": 0.7113540040151685, + "grad_norm": 0.17210274934768677, + "learning_rate": 1.7473451378923344e-05, + "loss": 0.5183, + "step": 3189 + }, + { + "epoch": 0.7115770689270577, + "grad_norm": 0.15549586713314056, + "learning_rate": 1.7471887468800416e-05, + "loss": 0.48, + "step": 3190 + }, + { + "epoch": 0.7118001338389471, + "grad_norm": 0.17286068201065063, + "learning_rate": 1.747032314483736e-05, + "loss": 0.4768, + "step": 3191 + }, + { + "epoch": 0.7120231987508365, + "grad_norm": 0.1790837049484253, + "learning_rate": 1.746875840712082e-05, + "loss": 0.5181, + "step": 3192 + }, + { + "epoch": 0.7122462636627258, + "grad_norm": 0.1648358702659607, + "learning_rate": 1.746719325573746e-05, + "loss": 0.508, + "step": 3193 + }, + { + "epoch": 0.7124693285746152, + "grad_norm": 0.16890966892242432, + "learning_rate": 1.7465627690773964e-05, + "loss": 0.4928, + "step": 3194 + }, + { + "epoch": 0.7126923934865046, + "grad_norm": 0.1561020165681839, + "learning_rate": 1.7464061712317047e-05, + "loss": 0.4783, + "step": 3195 + }, + { + "epoch": 0.712915458398394, + "grad_norm": 0.17456288635730743, + "learning_rate": 1.7462495320453442e-05, + "loss": 0.5203, + "step": 3196 + }, + { + "epoch": 0.7131385233102833, + "grad_norm": 0.16500715911388397, + "learning_rate": 1.7460928515269902e-05, + "loss": 0.5259, + "step": 3197 + }, + { + "epoch": 0.7133615882221727, + "grad_norm": 0.1663879156112671, + "learning_rate": 1.7459361296853217e-05, + "loss": 0.5204, + "step": 3198 + }, + { + "epoch": 0.713584653134062, + "grad_norm": 0.1611357033252716, + "learning_rate": 1.745779366529018e-05, + "loss": 0.498, + "step": 3199 + }, + { + "epoch": 0.7138077180459513, + "grad_norm": 0.1556338518857956, + "learning_rate": 1.7456225620667613e-05, + "loss": 0.4753, + "step": 3200 + }, + { + "epoch": 0.7140307829578407, + "grad_norm": 0.16986200213432312, + "learning_rate": 1.7454657163072372e-05, + "loss": 0.5075, + "step": 3201 + }, + { + "epoch": 0.7142538478697301, + "grad_norm": 0.16040728986263275, + "learning_rate": 1.7453088292591327e-05, + "loss": 0.5016, + "step": 3202 + }, + { + "epoch": 0.7144769127816194, + "grad_norm": 0.16356582939624786, + "learning_rate": 1.7451519009311368e-05, + "loss": 0.5118, + "step": 3203 + }, + { + "epoch": 0.7146999776935088, + "grad_norm": 0.16113781929016113, + "learning_rate": 1.744994931331942e-05, + "loss": 0.4869, + "step": 3204 + }, + { + "epoch": 0.7149230426053982, + "grad_norm": 0.17623494565486908, + "learning_rate": 1.744837920470241e-05, + "loss": 0.5247, + "step": 3205 + }, + { + "epoch": 0.7151461075172876, + "grad_norm": 0.16005289554595947, + "learning_rate": 1.744680868354731e-05, + "loss": 0.4722, + "step": 3206 + }, + { + "epoch": 0.7153691724291769, + "grad_norm": 0.18001532554626465, + "learning_rate": 1.7445237749941106e-05, + "loss": 0.4879, + "step": 3207 + }, + { + "epoch": 0.7155922373410663, + "grad_norm": 0.1617603451013565, + "learning_rate": 1.74436664039708e-05, + "loss": 0.4962, + "step": 3208 + }, + { + "epoch": 0.7158153022529556, + "grad_norm": 0.16146957874298096, + "learning_rate": 1.7442094645723425e-05, + "loss": 0.4849, + "step": 3209 + }, + { + "epoch": 0.7160383671648449, + "grad_norm": 0.16159506142139435, + "learning_rate": 1.744052247528604e-05, + "loss": 0.4814, + "step": 3210 + }, + { + "epoch": 0.7162614320767343, + "grad_norm": 0.16017811000347137, + "learning_rate": 1.7438949892745717e-05, + "loss": 0.4779, + "step": 3211 + }, + { + "epoch": 0.7164844969886237, + "grad_norm": 0.16643457114696503, + "learning_rate": 1.7437376898189554e-05, + "loss": 0.5058, + "step": 3212 + }, + { + "epoch": 0.7167075619005131, + "grad_norm": 0.174330934882164, + "learning_rate": 1.7435803491704674e-05, + "loss": 0.5037, + "step": 3213 + }, + { + "epoch": 0.7169306268124024, + "grad_norm": 0.16516099870204926, + "learning_rate": 1.7434229673378226e-05, + "loss": 0.4929, + "step": 3214 + }, + { + "epoch": 0.7171536917242918, + "grad_norm": 0.16369006037712097, + "learning_rate": 1.7432655443297377e-05, + "loss": 0.4788, + "step": 3215 + }, + { + "epoch": 0.7173767566361812, + "grad_norm": 0.19648444652557373, + "learning_rate": 1.7431080801549313e-05, + "loss": 0.4808, + "step": 3216 + }, + { + "epoch": 0.7175998215480704, + "grad_norm": 0.16290031373500824, + "learning_rate": 1.742950574822125e-05, + "loss": 0.4811, + "step": 3217 + }, + { + "epoch": 0.7178228864599598, + "grad_norm": 0.15613336861133575, + "learning_rate": 1.7427930283400428e-05, + "loss": 0.4948, + "step": 3218 + }, + { + "epoch": 0.7180459513718492, + "grad_norm": 0.17377032339572906, + "learning_rate": 1.7426354407174102e-05, + "loss": 0.5127, + "step": 3219 + }, + { + "epoch": 0.7182690162837385, + "grad_norm": 0.16334526240825653, + "learning_rate": 1.7424778119629556e-05, + "loss": 0.4865, + "step": 3220 + }, + { + "epoch": 0.7184920811956279, + "grad_norm": 0.15451814234256744, + "learning_rate": 1.7423201420854092e-05, + "loss": 0.4711, + "step": 3221 + }, + { + "epoch": 0.7187151461075173, + "grad_norm": 0.17903545498847961, + "learning_rate": 1.7421624310935043e-05, + "loss": 0.5265, + "step": 3222 + }, + { + "epoch": 0.7189382110194067, + "grad_norm": 0.15851649641990662, + "learning_rate": 1.7420046789959754e-05, + "loss": 0.5125, + "step": 3223 + }, + { + "epoch": 0.719161275931296, + "grad_norm": 0.15990275144577026, + "learning_rate": 1.74184688580156e-05, + "loss": 0.4983, + "step": 3224 + }, + { + "epoch": 0.7193843408431854, + "grad_norm": 0.16905120015144348, + "learning_rate": 1.7416890515189977e-05, + "loss": 0.514, + "step": 3225 + }, + { + "epoch": 0.7196074057550748, + "grad_norm": 0.16614043712615967, + "learning_rate": 1.74153117615703e-05, + "loss": 0.5071, + "step": 3226 + }, + { + "epoch": 0.719830470666964, + "grad_norm": 0.1659470647573471, + "learning_rate": 1.741373259724402e-05, + "loss": 0.5101, + "step": 3227 + }, + { + "epoch": 0.7200535355788534, + "grad_norm": 0.17081058025360107, + "learning_rate": 1.7412153022298587e-05, + "loss": 0.5345, + "step": 3228 + }, + { + "epoch": 0.7202766004907428, + "grad_norm": 0.17581294476985931, + "learning_rate": 1.74105730368215e-05, + "loss": 0.5119, + "step": 3229 + }, + { + "epoch": 0.7204996654026322, + "grad_norm": 0.15835914015769958, + "learning_rate": 1.7408992640900263e-05, + "loss": 0.5, + "step": 3230 + }, + { + "epoch": 0.7207227303145215, + "grad_norm": 0.16518062353134155, + "learning_rate": 1.740741183462241e-05, + "loss": 0.4791, + "step": 3231 + }, + { + "epoch": 0.7209457952264109, + "grad_norm": 0.19645792245864868, + "learning_rate": 1.7405830618075494e-05, + "loss": 0.4851, + "step": 3232 + }, + { + "epoch": 0.7211688601383003, + "grad_norm": 0.1701032668352127, + "learning_rate": 1.7404248991347093e-05, + "loss": 0.5165, + "step": 3233 + }, + { + "epoch": 0.7213919250501896, + "grad_norm": 0.16348305344581604, + "learning_rate": 1.740266695452481e-05, + "loss": 0.5236, + "step": 3234 + }, + { + "epoch": 0.721614989962079, + "grad_norm": 0.1572495847940445, + "learning_rate": 1.7401084507696263e-05, + "loss": 0.5141, + "step": 3235 + }, + { + "epoch": 0.7218380548739683, + "grad_norm": 0.16454187035560608, + "learning_rate": 1.7399501650949107e-05, + "loss": 0.4714, + "step": 3236 + }, + { + "epoch": 0.7220611197858576, + "grad_norm": 0.17999160289764404, + "learning_rate": 1.7397918384371003e-05, + "loss": 0.5035, + "step": 3237 + }, + { + "epoch": 0.722284184697747, + "grad_norm": 0.16409459710121155, + "learning_rate": 1.739633470804964e-05, + "loss": 0.4903, + "step": 3238 + }, + { + "epoch": 0.7225072496096364, + "grad_norm": 0.15939798951148987, + "learning_rate": 1.739475062207274e-05, + "loss": 0.4968, + "step": 3239 + }, + { + "epoch": 0.7227303145215258, + "grad_norm": 0.1598650962114334, + "learning_rate": 1.7393166126528035e-05, + "loss": 0.4829, + "step": 3240 + }, + { + "epoch": 0.7229533794334151, + "grad_norm": 0.1741870641708374, + "learning_rate": 1.7391581221503286e-05, + "loss": 0.5268, + "step": 3241 + }, + { + "epoch": 0.7231764443453045, + "grad_norm": 0.16262130439281464, + "learning_rate": 1.7389995907086273e-05, + "loss": 0.4814, + "step": 3242 + }, + { + "epoch": 0.7233995092571939, + "grad_norm": 0.16130922734737396, + "learning_rate": 1.73884101833648e-05, + "loss": 0.48, + "step": 3243 + }, + { + "epoch": 0.7236225741690832, + "grad_norm": 0.167790949344635, + "learning_rate": 1.7386824050426697e-05, + "loss": 0.4774, + "step": 3244 + }, + { + "epoch": 0.7238456390809725, + "grad_norm": 0.16130533814430237, + "learning_rate": 1.7385237508359812e-05, + "loss": 0.4765, + "step": 3245 + }, + { + "epoch": 0.7240687039928619, + "grad_norm": 0.16216439008712769, + "learning_rate": 1.7383650557252023e-05, + "loss": 0.5197, + "step": 3246 + }, + { + "epoch": 0.7242917689047513, + "grad_norm": 0.17525294423103333, + "learning_rate": 1.7382063197191218e-05, + "loss": 0.5063, + "step": 3247 + }, + { + "epoch": 0.7245148338166406, + "grad_norm": 0.16659271717071533, + "learning_rate": 1.738047542826532e-05, + "loss": 0.5077, + "step": 3248 + }, + { + "epoch": 0.72473789872853, + "grad_norm": 0.15952268242835999, + "learning_rate": 1.7378887250562268e-05, + "loss": 0.4991, + "step": 3249 + }, + { + "epoch": 0.7249609636404194, + "grad_norm": 0.16580569744110107, + "learning_rate": 1.737729866417002e-05, + "loss": 0.5026, + "step": 3250 + }, + { + "epoch": 0.7251840285523087, + "grad_norm": 0.16428150236606598, + "learning_rate": 1.7375709669176572e-05, + "loss": 0.4993, + "step": 3251 + }, + { + "epoch": 0.7254070934641981, + "grad_norm": 0.17543014883995056, + "learning_rate": 1.7374120265669927e-05, + "loss": 0.483, + "step": 3252 + }, + { + "epoch": 0.7256301583760875, + "grad_norm": 0.16254562139511108, + "learning_rate": 1.7372530453738113e-05, + "loss": 0.5051, + "step": 3253 + }, + { + "epoch": 0.7258532232879769, + "grad_norm": 0.16587981581687927, + "learning_rate": 1.737094023346919e-05, + "loss": 0.4717, + "step": 3254 + }, + { + "epoch": 0.7260762881998661, + "grad_norm": 0.15644344687461853, + "learning_rate": 1.7369349604951233e-05, + "loss": 0.4918, + "step": 3255 + }, + { + "epoch": 0.7262993531117555, + "grad_norm": 0.16663451492786407, + "learning_rate": 1.736775856827234e-05, + "loss": 0.4973, + "step": 3256 + }, + { + "epoch": 0.7265224180236449, + "grad_norm": 0.16686449944972992, + "learning_rate": 1.736616712352063e-05, + "loss": 0.5115, + "step": 3257 + }, + { + "epoch": 0.7267454829355342, + "grad_norm": 0.15312035381793976, + "learning_rate": 1.736457527078425e-05, + "loss": 0.4498, + "step": 3258 + }, + { + "epoch": 0.7269685478474236, + "grad_norm": 0.17026790976524353, + "learning_rate": 1.7362983010151368e-05, + "loss": 0.4958, + "step": 3259 + }, + { + "epoch": 0.727191612759313, + "grad_norm": 0.16290387511253357, + "learning_rate": 1.7361390341710173e-05, + "loss": 0.4785, + "step": 3260 + }, + { + "epoch": 0.7274146776712023, + "grad_norm": 0.15380245447158813, + "learning_rate": 1.7359797265548876e-05, + "loss": 0.466, + "step": 3261 + }, + { + "epoch": 0.7276377425830917, + "grad_norm": 0.1699349284172058, + "learning_rate": 1.7358203781755707e-05, + "loss": 0.4858, + "step": 3262 + }, + { + "epoch": 0.727860807494981, + "grad_norm": 0.16431613266468048, + "learning_rate": 1.735660989041893e-05, + "loss": 0.5011, + "step": 3263 + }, + { + "epoch": 0.7280838724068704, + "grad_norm": 0.16305620968341827, + "learning_rate": 1.735501559162682e-05, + "loss": 0.5167, + "step": 3264 + }, + { + "epoch": 0.7283069373187597, + "grad_norm": 0.15818987786769867, + "learning_rate": 1.7353420885467688e-05, + "loss": 0.4953, + "step": 3265 + }, + { + "epoch": 0.7285300022306491, + "grad_norm": 0.164206400513649, + "learning_rate": 1.7351825772029847e-05, + "loss": 0.5128, + "step": 3266 + }, + { + "epoch": 0.7287530671425385, + "grad_norm": 0.15800419449806213, + "learning_rate": 1.7350230251401653e-05, + "loss": 0.4654, + "step": 3267 + }, + { + "epoch": 0.7289761320544278, + "grad_norm": 0.16068902611732483, + "learning_rate": 1.734863432367147e-05, + "loss": 0.4997, + "step": 3268 + }, + { + "epoch": 0.7291991969663172, + "grad_norm": 0.16787229478359222, + "learning_rate": 1.7347037988927696e-05, + "loss": 0.5244, + "step": 3269 + }, + { + "epoch": 0.7294222618782066, + "grad_norm": 0.1621689349412918, + "learning_rate": 1.7345441247258743e-05, + "loss": 0.4855, + "step": 3270 + }, + { + "epoch": 0.729645326790096, + "grad_norm": 0.16722342371940613, + "learning_rate": 1.734384409875305e-05, + "loss": 0.4992, + "step": 3271 + }, + { + "epoch": 0.7298683917019853, + "grad_norm": 0.15855719149112701, + "learning_rate": 1.7342246543499074e-05, + "loss": 0.5074, + "step": 3272 + }, + { + "epoch": 0.7300914566138746, + "grad_norm": 0.17619627714157104, + "learning_rate": 1.7340648581585296e-05, + "loss": 0.4834, + "step": 3273 + }, + { + "epoch": 0.730314521525764, + "grad_norm": 0.17650535702705383, + "learning_rate": 1.7339050213100233e-05, + "loss": 0.5393, + "step": 3274 + }, + { + "epoch": 0.7305375864376533, + "grad_norm": 0.17073291540145874, + "learning_rate": 1.73374514381324e-05, + "loss": 0.4873, + "step": 3275 + }, + { + "epoch": 0.7307606513495427, + "grad_norm": 0.1601572334766388, + "learning_rate": 1.733585225677035e-05, + "loss": 0.485, + "step": 3276 + }, + { + "epoch": 0.7309837162614321, + "grad_norm": 0.1590786874294281, + "learning_rate": 1.7334252669102665e-05, + "loss": 0.4668, + "step": 3277 + }, + { + "epoch": 0.7312067811733214, + "grad_norm": 0.15466643869876862, + "learning_rate": 1.7332652675217928e-05, + "loss": 0.4476, + "step": 3278 + }, + { + "epoch": 0.7314298460852108, + "grad_norm": 0.15521705150604248, + "learning_rate": 1.733105227520476e-05, + "loss": 0.4878, + "step": 3279 + }, + { + "epoch": 0.7316529109971002, + "grad_norm": 0.21078747510910034, + "learning_rate": 1.7329451469151807e-05, + "loss": 0.5203, + "step": 3280 + }, + { + "epoch": 0.7318759759089896, + "grad_norm": 0.1662643849849701, + "learning_rate": 1.7327850257147724e-05, + "loss": 0.5099, + "step": 3281 + }, + { + "epoch": 0.7320990408208788, + "grad_norm": 0.16159231960773468, + "learning_rate": 1.73262486392812e-05, + "loss": 0.4944, + "step": 3282 + }, + { + "epoch": 0.7323221057327682, + "grad_norm": 0.16965213418006897, + "learning_rate": 1.7324646615640947e-05, + "loss": 0.4966, + "step": 3283 + }, + { + "epoch": 0.7325451706446576, + "grad_norm": 0.1666761040687561, + "learning_rate": 1.732304418631569e-05, + "loss": 0.4846, + "step": 3284 + }, + { + "epoch": 0.7327682355565469, + "grad_norm": 0.16207394003868103, + "learning_rate": 1.7321441351394178e-05, + "loss": 0.5124, + "step": 3285 + }, + { + "epoch": 0.7329913004684363, + "grad_norm": 0.15828841924667358, + "learning_rate": 1.7319838110965192e-05, + "loss": 0.4887, + "step": 3286 + }, + { + "epoch": 0.7332143653803257, + "grad_norm": 0.16760720312595367, + "learning_rate": 1.731823446511753e-05, + "loss": 0.4715, + "step": 3287 + }, + { + "epoch": 0.7334374302922151, + "grad_norm": 0.1632707566022873, + "learning_rate": 1.7316630413940005e-05, + "loss": 0.4803, + "step": 3288 + }, + { + "epoch": 0.7336604952041044, + "grad_norm": 0.1794753223657608, + "learning_rate": 1.7315025957521468e-05, + "loss": 0.5025, + "step": 3289 + }, + { + "epoch": 0.7338835601159938, + "grad_norm": 0.1697871834039688, + "learning_rate": 1.7313421095950778e-05, + "loss": 0.4725, + "step": 3290 + }, + { + "epoch": 0.7341066250278832, + "grad_norm": 0.16839033365249634, + "learning_rate": 1.7311815829316826e-05, + "loss": 0.5406, + "step": 3291 + }, + { + "epoch": 0.7343296899397724, + "grad_norm": 0.16735753417015076, + "learning_rate": 1.731021015770852e-05, + "loss": 0.5238, + "step": 3292 + }, + { + "epoch": 0.7345527548516618, + "grad_norm": 0.16878098249435425, + "learning_rate": 1.7308604081214793e-05, + "loss": 0.4978, + "step": 3293 + }, + { + "epoch": 0.7347758197635512, + "grad_norm": 0.16095402836799622, + "learning_rate": 1.7306997599924597e-05, + "loss": 0.4696, + "step": 3294 + }, + { + "epoch": 0.7349988846754405, + "grad_norm": 0.16328908503055573, + "learning_rate": 1.730539071392691e-05, + "loss": 0.4965, + "step": 3295 + }, + { + "epoch": 0.7352219495873299, + "grad_norm": 0.1627175509929657, + "learning_rate": 1.7303783423310735e-05, + "loss": 0.4822, + "step": 3296 + }, + { + "epoch": 0.7354450144992193, + "grad_norm": 0.1619403213262558, + "learning_rate": 1.730217572816509e-05, + "loss": 0.5027, + "step": 3297 + }, + { + "epoch": 0.7356680794111087, + "grad_norm": 0.16253367066383362, + "learning_rate": 1.7300567628579025e-05, + "loss": 0.5028, + "step": 3298 + }, + { + "epoch": 0.735891144322998, + "grad_norm": 0.16807517409324646, + "learning_rate": 1.72989591246416e-05, + "loss": 0.4756, + "step": 3299 + }, + { + "epoch": 0.7361142092348874, + "grad_norm": 0.18981721997261047, + "learning_rate": 1.7297350216441903e-05, + "loss": 0.5209, + "step": 3300 + }, + { + "epoch": 0.7363372741467767, + "grad_norm": 0.1598469614982605, + "learning_rate": 1.7295740904069053e-05, + "loss": 0.4872, + "step": 3301 + }, + { + "epoch": 0.736560339058666, + "grad_norm": 0.20748665928840637, + "learning_rate": 1.7294131187612176e-05, + "loss": 0.4603, + "step": 3302 + }, + { + "epoch": 0.7367834039705554, + "grad_norm": 0.16444922983646393, + "learning_rate": 1.7292521067160434e-05, + "loss": 0.4772, + "step": 3303 + }, + { + "epoch": 0.7370064688824448, + "grad_norm": 0.16779246926307678, + "learning_rate": 1.7290910542803004e-05, + "loss": 0.4948, + "step": 3304 + }, + { + "epoch": 0.7372295337943342, + "grad_norm": 0.16438250243663788, + "learning_rate": 1.7289299614629083e-05, + "loss": 0.5103, + "step": 3305 + }, + { + "epoch": 0.7374525987062235, + "grad_norm": 0.17737194895744324, + "learning_rate": 1.7287688282727903e-05, + "loss": 0.4994, + "step": 3306 + }, + { + "epoch": 0.7376756636181129, + "grad_norm": 0.15918989479541779, + "learning_rate": 1.7286076547188703e-05, + "loss": 0.4978, + "step": 3307 + }, + { + "epoch": 0.7378987285300023, + "grad_norm": 0.3185868561267853, + "learning_rate": 1.728446440810075e-05, + "loss": 0.489, + "step": 3308 + }, + { + "epoch": 0.7381217934418915, + "grad_norm": 0.1764698624610901, + "learning_rate": 1.728285186555334e-05, + "loss": 0.506, + "step": 3309 + }, + { + "epoch": 0.7383448583537809, + "grad_norm": 0.16116134822368622, + "learning_rate": 1.7281238919635784e-05, + "loss": 0.519, + "step": 3310 + }, + { + "epoch": 0.7385679232656703, + "grad_norm": 0.1670210361480713, + "learning_rate": 1.7279625570437413e-05, + "loss": 0.5228, + "step": 3311 + }, + { + "epoch": 0.7387909881775596, + "grad_norm": 0.16115038096904755, + "learning_rate": 1.7278011818047588e-05, + "loss": 0.4882, + "step": 3312 + }, + { + "epoch": 0.739014053089449, + "grad_norm": 0.16242891550064087, + "learning_rate": 1.7276397662555685e-05, + "loss": 0.5139, + "step": 3313 + }, + { + "epoch": 0.7392371180013384, + "grad_norm": 0.16696135699748993, + "learning_rate": 1.7274783104051112e-05, + "loss": 0.521, + "step": 3314 + }, + { + "epoch": 0.7394601829132278, + "grad_norm": 0.1544819325208664, + "learning_rate": 1.727316814262329e-05, + "loss": 0.4765, + "step": 3315 + }, + { + "epoch": 0.7396832478251171, + "grad_norm": 0.16446231305599213, + "learning_rate": 1.727155277836167e-05, + "loss": 0.4683, + "step": 3316 + }, + { + "epoch": 0.7399063127370065, + "grad_norm": 0.1666785031557083, + "learning_rate": 1.7269937011355713e-05, + "loss": 0.503, + "step": 3317 + }, + { + "epoch": 0.7401293776488959, + "grad_norm": 0.16166886687278748, + "learning_rate": 1.7268320841694915e-05, + "loss": 0.5014, + "step": 3318 + }, + { + "epoch": 0.7403524425607851, + "grad_norm": 0.27264273166656494, + "learning_rate": 1.7266704269468786e-05, + "loss": 0.4834, + "step": 3319 + }, + { + "epoch": 0.7405755074726745, + "grad_norm": 0.16175110638141632, + "learning_rate": 1.7265087294766872e-05, + "loss": 0.4926, + "step": 3320 + }, + { + "epoch": 0.7407985723845639, + "grad_norm": 0.1768079400062561, + "learning_rate": 1.726346991767872e-05, + "loss": 0.4933, + "step": 3321 + }, + { + "epoch": 0.7410216372964533, + "grad_norm": 0.15689828991889954, + "learning_rate": 1.7261852138293918e-05, + "loss": 0.4928, + "step": 3322 + }, + { + "epoch": 0.7412447022083426, + "grad_norm": 0.16128939390182495, + "learning_rate": 1.7260233956702062e-05, + "loss": 0.4739, + "step": 3323 + }, + { + "epoch": 0.741467767120232, + "grad_norm": 0.16142840683460236, + "learning_rate": 1.7258615372992783e-05, + "loss": 0.5123, + "step": 3324 + }, + { + "epoch": 0.7416908320321214, + "grad_norm": 0.15940620005130768, + "learning_rate": 1.7256996387255725e-05, + "loss": 0.5013, + "step": 3325 + }, + { + "epoch": 0.7419138969440107, + "grad_norm": 0.16134943068027496, + "learning_rate": 1.7255376999580557e-05, + "loss": 0.4651, + "step": 3326 + }, + { + "epoch": 0.7421369618559001, + "grad_norm": 0.15544810891151428, + "learning_rate": 1.7253757210056978e-05, + "loss": 0.4657, + "step": 3327 + }, + { + "epoch": 0.7423600267677894, + "grad_norm": 0.15278422832489014, + "learning_rate": 1.7252137018774694e-05, + "loss": 0.4604, + "step": 3328 + }, + { + "epoch": 0.7425830916796788, + "grad_norm": 0.15711228549480438, + "learning_rate": 1.7250516425823443e-05, + "loss": 0.4903, + "step": 3329 + }, + { + "epoch": 0.7428061565915681, + "grad_norm": 0.15838393568992615, + "learning_rate": 1.7248895431292988e-05, + "loss": 0.484, + "step": 3330 + }, + { + "epoch": 0.7430292215034575, + "grad_norm": 0.15732166171073914, + "learning_rate": 1.72472740352731e-05, + "loss": 0.4637, + "step": 3331 + }, + { + "epoch": 0.7432522864153469, + "grad_norm": 0.16680875420570374, + "learning_rate": 1.7245652237853593e-05, + "loss": 0.4999, + "step": 3332 + }, + { + "epoch": 0.7434753513272362, + "grad_norm": 0.16169287264347076, + "learning_rate": 1.7244030039124287e-05, + "loss": 0.4829, + "step": 3333 + }, + { + "epoch": 0.7436984162391256, + "grad_norm": 0.15638568997383118, + "learning_rate": 1.7242407439175035e-05, + "loss": 0.4731, + "step": 3334 + }, + { + "epoch": 0.743921481151015, + "grad_norm": 0.1748877912759781, + "learning_rate": 1.72407844380957e-05, + "loss": 0.4979, + "step": 3335 + }, + { + "epoch": 0.7441445460629043, + "grad_norm": 0.15920549631118774, + "learning_rate": 1.7239161035976175e-05, + "loss": 0.5024, + "step": 3336 + }, + { + "epoch": 0.7443676109747936, + "grad_norm": 0.16807498037815094, + "learning_rate": 1.7237537232906376e-05, + "loss": 0.5414, + "step": 3337 + }, + { + "epoch": 0.744590675886683, + "grad_norm": 0.16192768514156342, + "learning_rate": 1.723591302897624e-05, + "loss": 0.4883, + "step": 3338 + }, + { + "epoch": 0.7448137407985724, + "grad_norm": 0.15893089771270752, + "learning_rate": 1.7234288424275726e-05, + "loss": 0.5002, + "step": 3339 + }, + { + "epoch": 0.7450368057104617, + "grad_norm": 0.1622776985168457, + "learning_rate": 1.7232663418894812e-05, + "loss": 0.4859, + "step": 3340 + }, + { + "epoch": 0.7452598706223511, + "grad_norm": 0.1694253832101822, + "learning_rate": 1.72310380129235e-05, + "loss": 0.4903, + "step": 3341 + }, + { + "epoch": 0.7454829355342405, + "grad_norm": 0.1649332493543625, + "learning_rate": 1.722941220645182e-05, + "loss": 0.5145, + "step": 3342 + }, + { + "epoch": 0.7457060004461298, + "grad_norm": 0.19938278198242188, + "learning_rate": 1.722778599956982e-05, + "loss": 0.4949, + "step": 3343 + }, + { + "epoch": 0.7459290653580192, + "grad_norm": 0.15563727915287018, + "learning_rate": 1.7226159392367564e-05, + "loss": 0.4911, + "step": 3344 + }, + { + "epoch": 0.7461521302699086, + "grad_norm": 0.15972435474395752, + "learning_rate": 1.7224532384935148e-05, + "loss": 0.4748, + "step": 3345 + }, + { + "epoch": 0.746375195181798, + "grad_norm": 0.16170111298561096, + "learning_rate": 1.722290497736268e-05, + "loss": 0.4701, + "step": 3346 + }, + { + "epoch": 0.7465982600936872, + "grad_norm": 0.1537921279668808, + "learning_rate": 1.7221277169740305e-05, + "loss": 0.478, + "step": 3347 + }, + { + "epoch": 0.7468213250055766, + "grad_norm": 0.15656088292598724, + "learning_rate": 1.7219648962158174e-05, + "loss": 0.4851, + "step": 3348 + }, + { + "epoch": 0.747044389917466, + "grad_norm": 0.16441093385219574, + "learning_rate": 1.7218020354706473e-05, + "loss": 0.4846, + "step": 3349 + }, + { + "epoch": 0.7472674548293553, + "grad_norm": 0.17135939002037048, + "learning_rate": 1.72163913474754e-05, + "loss": 0.5044, + "step": 3350 + }, + { + "epoch": 0.7474905197412447, + "grad_norm": 0.17070244252681732, + "learning_rate": 1.721476194055518e-05, + "loss": 0.537, + "step": 3351 + }, + { + "epoch": 0.7477135846531341, + "grad_norm": 0.1651678830385208, + "learning_rate": 1.7213132134036063e-05, + "loss": 0.5161, + "step": 3352 + }, + { + "epoch": 0.7479366495650234, + "grad_norm": 0.16997648775577545, + "learning_rate": 1.7211501928008317e-05, + "loss": 0.5026, + "step": 3353 + }, + { + "epoch": 0.7481597144769128, + "grad_norm": 0.17357714474201202, + "learning_rate": 1.7209871322562232e-05, + "loss": 0.4899, + "step": 3354 + }, + { + "epoch": 0.7483827793888022, + "grad_norm": 0.16555210947990417, + "learning_rate": 1.7208240317788115e-05, + "loss": 0.5176, + "step": 3355 + }, + { + "epoch": 0.7486058443006915, + "grad_norm": 0.15799009799957275, + "learning_rate": 1.7206608913776315e-05, + "loss": 0.4923, + "step": 3356 + }, + { + "epoch": 0.7488289092125808, + "grad_norm": 0.15853382647037506, + "learning_rate": 1.720497711061718e-05, + "loss": 0.4881, + "step": 3357 + }, + { + "epoch": 0.7490519741244702, + "grad_norm": 0.15760891139507294, + "learning_rate": 1.720334490840109e-05, + "loss": 0.5008, + "step": 3358 + }, + { + "epoch": 0.7492750390363596, + "grad_norm": 0.16732452809810638, + "learning_rate": 1.720171230721845e-05, + "loss": 0.4923, + "step": 3359 + }, + { + "epoch": 0.7494981039482489, + "grad_norm": 0.17124591767787933, + "learning_rate": 1.7200079307159677e-05, + "loss": 0.5272, + "step": 3360 + }, + { + "epoch": 0.7497211688601383, + "grad_norm": 0.16751034557819366, + "learning_rate": 1.7198445908315226e-05, + "loss": 0.4801, + "step": 3361 + }, + { + "epoch": 0.7499442337720277, + "grad_norm": 0.16137580573558807, + "learning_rate": 1.719681211077556e-05, + "loss": 0.5371, + "step": 3362 + }, + { + "epoch": 0.7501672986839171, + "grad_norm": 0.16350311040878296, + "learning_rate": 1.7195177914631172e-05, + "loss": 0.5152, + "step": 3363 + }, + { + "epoch": 0.7503903635958064, + "grad_norm": 0.1560794562101364, + "learning_rate": 1.719354331997257e-05, + "loss": 0.4788, + "step": 3364 + }, + { + "epoch": 0.7506134285076957, + "grad_norm": 0.16271278262138367, + "learning_rate": 1.7191908326890288e-05, + "loss": 0.4975, + "step": 3365 + }, + { + "epoch": 0.7508364934195851, + "grad_norm": 0.21892762184143066, + "learning_rate": 1.7190272935474883e-05, + "loss": 0.511, + "step": 3366 + }, + { + "epoch": 0.7510595583314744, + "grad_norm": 0.1879514753818512, + "learning_rate": 1.7188637145816937e-05, + "loss": 0.5277, + "step": 3367 + }, + { + "epoch": 0.7512826232433638, + "grad_norm": 0.17167915403842926, + "learning_rate": 1.718700095800705e-05, + "loss": 0.5128, + "step": 3368 + }, + { + "epoch": 0.7515056881552532, + "grad_norm": 0.16950075328350067, + "learning_rate": 1.718536437213584e-05, + "loss": 0.5175, + "step": 3369 + }, + { + "epoch": 0.7517287530671425, + "grad_norm": 0.1571718156337738, + "learning_rate": 1.718372738829395e-05, + "loss": 0.489, + "step": 3370 + }, + { + "epoch": 0.7519518179790319, + "grad_norm": 0.161586731672287, + "learning_rate": 1.718209000657205e-05, + "loss": 0.5112, + "step": 3371 + }, + { + "epoch": 0.7521748828909213, + "grad_norm": 0.16906411945819855, + "learning_rate": 1.718045222706083e-05, + "loss": 0.5171, + "step": 3372 + }, + { + "epoch": 0.7523979478028107, + "grad_norm": 0.17176932096481323, + "learning_rate": 1.7178814049851e-05, + "loss": 0.5352, + "step": 3373 + }, + { + "epoch": 0.7526210127146999, + "grad_norm": 0.15188109874725342, + "learning_rate": 1.717717547503329e-05, + "loss": 0.4857, + "step": 3374 + }, + { + "epoch": 0.7528440776265893, + "grad_norm": 0.1867418736219406, + "learning_rate": 1.7175536502698456e-05, + "loss": 0.5199, + "step": 3375 + }, + { + "epoch": 0.7530671425384787, + "grad_norm": 0.1905105859041214, + "learning_rate": 1.7173897132937274e-05, + "loss": 0.5077, + "step": 3376 + }, + { + "epoch": 0.753290207450368, + "grad_norm": 0.16345377266407013, + "learning_rate": 1.7172257365840544e-05, + "loss": 0.5362, + "step": 3377 + }, + { + "epoch": 0.7535132723622574, + "grad_norm": 0.17045968770980835, + "learning_rate": 1.7170617201499083e-05, + "loss": 0.5057, + "step": 3378 + }, + { + "epoch": 0.7537363372741468, + "grad_norm": 0.16535595059394836, + "learning_rate": 1.716897664000374e-05, + "loss": 0.5141, + "step": 3379 + }, + { + "epoch": 0.7539594021860362, + "grad_norm": 0.16590653359889984, + "learning_rate": 1.716733568144538e-05, + "loss": 0.5146, + "step": 3380 + }, + { + "epoch": 0.7541824670979255, + "grad_norm": 0.156137153506279, + "learning_rate": 1.716569432591488e-05, + "loss": 0.4984, + "step": 3381 + }, + { + "epoch": 0.7544055320098149, + "grad_norm": 0.17076677083969116, + "learning_rate": 1.7164052573503155e-05, + "loss": 0.4968, + "step": 3382 + }, + { + "epoch": 0.7546285969217043, + "grad_norm": 0.15686576068401337, + "learning_rate": 1.7162410424301132e-05, + "loss": 0.4868, + "step": 3383 + }, + { + "epoch": 0.7548516618335935, + "grad_norm": 0.19442743062973022, + "learning_rate": 1.716076787839977e-05, + "loss": 0.5058, + "step": 3384 + }, + { + "epoch": 0.7550747267454829, + "grad_norm": 0.16424842178821564, + "learning_rate": 1.715912493589004e-05, + "loss": 0.4857, + "step": 3385 + }, + { + "epoch": 0.7552977916573723, + "grad_norm": 0.6959996819496155, + "learning_rate": 1.7157481596862936e-05, + "loss": 0.5091, + "step": 3386 + }, + { + "epoch": 0.7555208565692616, + "grad_norm": 0.17369645833969116, + "learning_rate": 1.7155837861409482e-05, + "loss": 0.5116, + "step": 3387 + }, + { + "epoch": 0.755743921481151, + "grad_norm": 0.2050577700138092, + "learning_rate": 1.7154193729620713e-05, + "loss": 0.4982, + "step": 3388 + }, + { + "epoch": 0.7559669863930404, + "grad_norm": 0.15558819472789764, + "learning_rate": 1.7152549201587695e-05, + "loss": 0.4873, + "step": 3389 + }, + { + "epoch": 0.7561900513049298, + "grad_norm": 0.16767403483390808, + "learning_rate": 1.715090427740151e-05, + "loss": 0.4806, + "step": 3390 + }, + { + "epoch": 0.7564131162168191, + "grad_norm": 0.1653711050748825, + "learning_rate": 1.714925895715326e-05, + "loss": 0.4858, + "step": 3391 + }, + { + "epoch": 0.7566361811287085, + "grad_norm": 0.1628393530845642, + "learning_rate": 1.7147613240934087e-05, + "loss": 0.5036, + "step": 3392 + }, + { + "epoch": 0.7568592460405978, + "grad_norm": 0.1690330058336258, + "learning_rate": 1.714596712883513e-05, + "loss": 0.4788, + "step": 3393 + }, + { + "epoch": 0.7570823109524871, + "grad_norm": 0.18115870654582977, + "learning_rate": 1.714432062094756e-05, + "loss": 0.5009, + "step": 3394 + }, + { + "epoch": 0.7573053758643765, + "grad_norm": 0.17084982991218567, + "learning_rate": 1.7142673717362578e-05, + "loss": 0.5281, + "step": 3395 + }, + { + "epoch": 0.7575284407762659, + "grad_norm": 0.16751186549663544, + "learning_rate": 1.7141026418171396e-05, + "loss": 0.5377, + "step": 3396 + }, + { + "epoch": 0.7577515056881553, + "grad_norm": 0.16510246694087982, + "learning_rate": 1.713937872346525e-05, + "loss": 0.4995, + "step": 3397 + }, + { + "epoch": 0.7579745706000446, + "grad_norm": 0.1689106822013855, + "learning_rate": 1.7137730633335404e-05, + "loss": 0.4891, + "step": 3398 + }, + { + "epoch": 0.758197635511934, + "grad_norm": 0.16398201882839203, + "learning_rate": 1.7136082147873136e-05, + "loss": 0.5013, + "step": 3399 + }, + { + "epoch": 0.7584207004238234, + "grad_norm": 0.16415871679782867, + "learning_rate": 1.713443326716975e-05, + "loss": 0.5343, + "step": 3400 + }, + { + "epoch": 0.7586437653357126, + "grad_norm": 0.164589986205101, + "learning_rate": 1.7132783991316577e-05, + "loss": 0.4897, + "step": 3401 + }, + { + "epoch": 0.758866830247602, + "grad_norm": 0.17511430382728577, + "learning_rate": 1.7131134320404953e-05, + "loss": 0.5088, + "step": 3402 + }, + { + "epoch": 0.7590898951594914, + "grad_norm": 0.1833367496728897, + "learning_rate": 1.7129484254526257e-05, + "loss": 0.4811, + "step": 3403 + }, + { + "epoch": 0.7593129600713808, + "grad_norm": 0.16160368919372559, + "learning_rate": 1.7127833793771874e-05, + "loss": 0.508, + "step": 3404 + }, + { + "epoch": 0.7595360249832701, + "grad_norm": 0.1855868101119995, + "learning_rate": 1.7126182938233228e-05, + "loss": 0.5091, + "step": 3405 + }, + { + "epoch": 0.7597590898951595, + "grad_norm": 0.16717424988746643, + "learning_rate": 1.7124531688001735e-05, + "loss": 0.482, + "step": 3406 + }, + { + "epoch": 0.7599821548070489, + "grad_norm": 0.1635364592075348, + "learning_rate": 1.7122880043168872e-05, + "loss": 0.4706, + "step": 3407 + }, + { + "epoch": 0.7602052197189382, + "grad_norm": 0.15802796185016632, + "learning_rate": 1.71212280038261e-05, + "loss": 0.4835, + "step": 3408 + }, + { + "epoch": 0.7604282846308276, + "grad_norm": 0.16960306465625763, + "learning_rate": 1.7119575570064926e-05, + "loss": 0.474, + "step": 3409 + }, + { + "epoch": 0.760651349542717, + "grad_norm": 0.16367004811763763, + "learning_rate": 1.7117922741976878e-05, + "loss": 0.5132, + "step": 3410 + }, + { + "epoch": 0.7608744144546062, + "grad_norm": 0.1700059324502945, + "learning_rate": 1.7116269519653493e-05, + "loss": 0.5124, + "step": 3411 + }, + { + "epoch": 0.7610974793664956, + "grad_norm": 0.19023141264915466, + "learning_rate": 1.711461590318634e-05, + "loss": 0.487, + "step": 3412 + }, + { + "epoch": 0.761320544278385, + "grad_norm": 0.16813622415065765, + "learning_rate": 1.7112961892667003e-05, + "loss": 0.4749, + "step": 3413 + }, + { + "epoch": 0.7615436091902744, + "grad_norm": 0.16135910153388977, + "learning_rate": 1.7111307488187096e-05, + "loss": 0.5041, + "step": 3414 + }, + { + "epoch": 0.7617666741021637, + "grad_norm": 0.16579851508140564, + "learning_rate": 1.710965268983825e-05, + "loss": 0.5062, + "step": 3415 + }, + { + "epoch": 0.7619897390140531, + "grad_norm": 0.15735271573066711, + "learning_rate": 1.7107997497712113e-05, + "loss": 0.5071, + "step": 3416 + }, + { + "epoch": 0.7622128039259425, + "grad_norm": 0.1686030477285385, + "learning_rate": 1.7106341911900365e-05, + "loss": 0.5271, + "step": 3417 + }, + { + "epoch": 0.7624358688378318, + "grad_norm": 0.17071577906608582, + "learning_rate": 1.7104685932494704e-05, + "loss": 0.5179, + "step": 3418 + }, + { + "epoch": 0.7626589337497212, + "grad_norm": 0.15663312375545502, + "learning_rate": 1.7103029559586843e-05, + "loss": 0.4928, + "step": 3419 + }, + { + "epoch": 0.7628819986616105, + "grad_norm": 0.17715679109096527, + "learning_rate": 1.7101372793268526e-05, + "loss": 0.5371, + "step": 3420 + }, + { + "epoch": 0.7631050635734999, + "grad_norm": 0.1651579737663269, + "learning_rate": 1.709971563363151e-05, + "loss": 0.5072, + "step": 3421 + }, + { + "epoch": 0.7633281284853892, + "grad_norm": 0.15921546518802643, + "learning_rate": 1.7098058080767587e-05, + "loss": 0.5008, + "step": 3422 + }, + { + "epoch": 0.7635511933972786, + "grad_norm": 0.17586684226989746, + "learning_rate": 1.709640013476856e-05, + "loss": 0.4962, + "step": 3423 + }, + { + "epoch": 0.763774258309168, + "grad_norm": 0.17664553225040436, + "learning_rate": 1.7094741795726254e-05, + "loss": 0.5107, + "step": 3424 + }, + { + "epoch": 0.7639973232210573, + "grad_norm": 0.1621769219636917, + "learning_rate": 1.7093083063732518e-05, + "loss": 0.5017, + "step": 3425 + }, + { + "epoch": 0.7642203881329467, + "grad_norm": 0.1619473695755005, + "learning_rate": 1.7091423938879227e-05, + "loss": 0.4858, + "step": 3426 + }, + { + "epoch": 0.7644434530448361, + "grad_norm": 0.1741994023323059, + "learning_rate": 1.7089764421258272e-05, + "loss": 0.5226, + "step": 3427 + }, + { + "epoch": 0.7646665179567254, + "grad_norm": 0.16964323818683624, + "learning_rate": 1.7088104510961564e-05, + "loss": 0.4987, + "step": 3428 + }, + { + "epoch": 0.7648895828686147, + "grad_norm": 0.1797408014535904, + "learning_rate": 1.7086444208081047e-05, + "loss": 0.4833, + "step": 3429 + }, + { + "epoch": 0.7651126477805041, + "grad_norm": 0.19941550493240356, + "learning_rate": 1.708478351270867e-05, + "loss": 0.5138, + "step": 3430 + }, + { + "epoch": 0.7653357126923935, + "grad_norm": 0.17348411679267883, + "learning_rate": 1.708312242493642e-05, + "loss": 0.4866, + "step": 3431 + }, + { + "epoch": 0.7655587776042828, + "grad_norm": 0.1664038747549057, + "learning_rate": 1.7081460944856294e-05, + "loss": 0.5124, + "step": 3432 + }, + { + "epoch": 0.7657818425161722, + "grad_norm": 0.15917468070983887, + "learning_rate": 1.7079799072560318e-05, + "loss": 0.5066, + "step": 3433 + }, + { + "epoch": 0.7660049074280616, + "grad_norm": 0.16369706392288208, + "learning_rate": 1.7078136808140532e-05, + "loss": 0.4903, + "step": 3434 + }, + { + "epoch": 0.7662279723399509, + "grad_norm": 0.16175441443920135, + "learning_rate": 1.707647415168901e-05, + "loss": 0.525, + "step": 3435 + }, + { + "epoch": 0.7664510372518403, + "grad_norm": 0.19323702156543732, + "learning_rate": 1.707481110329783e-05, + "loss": 0.4865, + "step": 3436 + }, + { + "epoch": 0.7666741021637297, + "grad_norm": 0.15811483561992645, + "learning_rate": 1.707314766305912e-05, + "loss": 0.4709, + "step": 3437 + }, + { + "epoch": 0.7668971670756191, + "grad_norm": 0.1613859087228775, + "learning_rate": 1.707148383106499e-05, + "loss": 0.4969, + "step": 3438 + }, + { + "epoch": 0.7671202319875083, + "grad_norm": 0.16382627189159393, + "learning_rate": 1.706981960740761e-05, + "loss": 0.4893, + "step": 3439 + }, + { + "epoch": 0.7673432968993977, + "grad_norm": 0.17395590245723724, + "learning_rate": 1.706815499217915e-05, + "loss": 0.5073, + "step": 3440 + }, + { + "epoch": 0.7675663618112871, + "grad_norm": 0.16161273419857025, + "learning_rate": 1.7066489985471802e-05, + "loss": 0.4766, + "step": 3441 + }, + { + "epoch": 0.7677894267231764, + "grad_norm": 0.16916361451148987, + "learning_rate": 1.706482458737779e-05, + "loss": 0.5102, + "step": 3442 + }, + { + "epoch": 0.7680124916350658, + "grad_norm": 0.16515444219112396, + "learning_rate": 1.7063158797989355e-05, + "loss": 0.5006, + "step": 3443 + }, + { + "epoch": 0.7682355565469552, + "grad_norm": 0.16712456941604614, + "learning_rate": 1.7061492617398755e-05, + "loss": 0.4757, + "step": 3444 + }, + { + "epoch": 0.7684586214588445, + "grad_norm": 0.1649584323167801, + "learning_rate": 1.7059826045698275e-05, + "loss": 0.4979, + "step": 3445 + }, + { + "epoch": 0.7686816863707339, + "grad_norm": 0.16432853043079376, + "learning_rate": 1.7058159082980223e-05, + "loss": 0.5155, + "step": 3446 + }, + { + "epoch": 0.7689047512826233, + "grad_norm": 0.15966151654720306, + "learning_rate": 1.7056491729336917e-05, + "loss": 0.4908, + "step": 3447 + }, + { + "epoch": 0.7691278161945126, + "grad_norm": 0.1597587913274765, + "learning_rate": 1.7054823984860716e-05, + "loss": 0.4912, + "step": 3448 + }, + { + "epoch": 0.7693508811064019, + "grad_norm": 0.16364127397537231, + "learning_rate": 1.705315584964399e-05, + "loss": 0.5045, + "step": 3449 + }, + { + "epoch": 0.7695739460182913, + "grad_norm": 0.15808305144309998, + "learning_rate": 1.7051487323779122e-05, + "loss": 0.5025, + "step": 3450 + }, + { + "epoch": 0.7697970109301807, + "grad_norm": 0.173845574259758, + "learning_rate": 1.704981840735853e-05, + "loss": 0.5166, + "step": 3451 + }, + { + "epoch": 0.77002007584207, + "grad_norm": 0.1586831957101822, + "learning_rate": 1.7048149100474653e-05, + "loss": 0.4758, + "step": 3452 + }, + { + "epoch": 0.7702431407539594, + "grad_norm": 0.16501812636852264, + "learning_rate": 1.704647940321994e-05, + "loss": 0.5051, + "step": 3453 + }, + { + "epoch": 0.7704662056658488, + "grad_norm": 0.17648212611675262, + "learning_rate": 1.704480931568688e-05, + "loss": 0.5014, + "step": 3454 + }, + { + "epoch": 0.7706892705777382, + "grad_norm": 0.15847481787204742, + "learning_rate": 1.704313883796796e-05, + "loss": 0.5246, + "step": 3455 + }, + { + "epoch": 0.7709123354896275, + "grad_norm": 0.17660297453403473, + "learning_rate": 1.704146797015571e-05, + "loss": 0.5028, + "step": 3456 + }, + { + "epoch": 0.7711354004015168, + "grad_norm": 0.1632460057735443, + "learning_rate": 1.7039796712342672e-05, + "loss": 0.4647, + "step": 3457 + }, + { + "epoch": 0.7713584653134062, + "grad_norm": 0.15266932547092438, + "learning_rate": 1.7038125064621408e-05, + "loss": 0.4829, + "step": 3458 + }, + { + "epoch": 0.7715815302252955, + "grad_norm": 0.16995325684547424, + "learning_rate": 1.703645302708451e-05, + "loss": 0.5057, + "step": 3459 + }, + { + "epoch": 0.7718045951371849, + "grad_norm": 0.15925444662570953, + "learning_rate": 1.703478059982458e-05, + "loss": 0.5128, + "step": 3460 + }, + { + "epoch": 0.7720276600490743, + "grad_norm": 0.17199598252773285, + "learning_rate": 1.703310778293425e-05, + "loss": 0.5088, + "step": 3461 + }, + { + "epoch": 0.7722507249609636, + "grad_norm": 0.17313992977142334, + "learning_rate": 1.7031434576506173e-05, + "loss": 0.4796, + "step": 3462 + }, + { + "epoch": 0.772473789872853, + "grad_norm": 0.1665591597557068, + "learning_rate": 1.7029760980633016e-05, + "loss": 0.4978, + "step": 3463 + }, + { + "epoch": 0.7726968547847424, + "grad_norm": 0.16291576623916626, + "learning_rate": 1.7028086995407477e-05, + "loss": 0.4892, + "step": 3464 + }, + { + "epoch": 0.7729199196966318, + "grad_norm": 0.15629801154136658, + "learning_rate": 1.7026412620922276e-05, + "loss": 0.483, + "step": 3465 + }, + { + "epoch": 0.773142984608521, + "grad_norm": 0.16080623865127563, + "learning_rate": 1.702473785727014e-05, + "loss": 0.4968, + "step": 3466 + }, + { + "epoch": 0.7733660495204104, + "grad_norm": 0.16567450761795044, + "learning_rate": 1.702306270454384e-05, + "loss": 0.5023, + "step": 3467 + }, + { + "epoch": 0.7735891144322998, + "grad_norm": 0.16630107164382935, + "learning_rate": 1.702138716283615e-05, + "loss": 0.4554, + "step": 3468 + }, + { + "epoch": 0.7738121793441891, + "grad_norm": 0.1635141223669052, + "learning_rate": 1.7019711232239872e-05, + "loss": 0.4947, + "step": 3469 + }, + { + "epoch": 0.7740352442560785, + "grad_norm": 0.16662491858005524, + "learning_rate": 1.7018034912847826e-05, + "loss": 0.4947, + "step": 3470 + }, + { + "epoch": 0.7742583091679679, + "grad_norm": 0.16234828531742096, + "learning_rate": 1.7016358204752865e-05, + "loss": 0.5424, + "step": 3471 + }, + { + "epoch": 0.7744813740798573, + "grad_norm": 0.16206470131874084, + "learning_rate": 1.701468110804785e-05, + "loss": 0.4761, + "step": 3472 + }, + { + "epoch": 0.7747044389917466, + "grad_norm": 0.1729600876569748, + "learning_rate": 1.7013003622825674e-05, + "loss": 0.4867, + "step": 3473 + }, + { + "epoch": 0.774927503903636, + "grad_norm": 0.15577882528305054, + "learning_rate": 1.7011325749179245e-05, + "loss": 0.4935, + "step": 3474 + }, + { + "epoch": 0.7751505688155254, + "grad_norm": 0.16553224623203278, + "learning_rate": 1.7009647487201492e-05, + "loss": 0.5052, + "step": 3475 + }, + { + "epoch": 0.7753736337274146, + "grad_norm": 0.16988618671894073, + "learning_rate": 1.700796883698536e-05, + "loss": 0.4999, + "step": 3476 + }, + { + "epoch": 0.775596698639304, + "grad_norm": 0.15535961091518402, + "learning_rate": 1.7006289798623842e-05, + "loss": 0.494, + "step": 3477 + }, + { + "epoch": 0.7758197635511934, + "grad_norm": 0.16982175409793854, + "learning_rate": 1.700461037220992e-05, + "loss": 0.5057, + "step": 3478 + }, + { + "epoch": 0.7760428284630828, + "grad_norm": 0.17564809322357178, + "learning_rate": 1.7002930557836615e-05, + "loss": 0.5088, + "step": 3479 + }, + { + "epoch": 0.7762658933749721, + "grad_norm": 0.16570429503917694, + "learning_rate": 1.7001250355596967e-05, + "loss": 0.5223, + "step": 3480 + }, + { + "epoch": 0.7764889582868615, + "grad_norm": 0.16374748945236206, + "learning_rate": 1.6999569765584035e-05, + "loss": 0.4533, + "step": 3481 + }, + { + "epoch": 0.7767120231987509, + "grad_norm": 0.15578052401542664, + "learning_rate": 1.69978887878909e-05, + "loss": 0.4797, + "step": 3482 + }, + { + "epoch": 0.7769350881106402, + "grad_norm": 0.15627166628837585, + "learning_rate": 1.6996207422610664e-05, + "loss": 0.4523, + "step": 3483 + }, + { + "epoch": 0.7771581530225296, + "grad_norm": 0.1620151549577713, + "learning_rate": 1.6994525669836453e-05, + "loss": 0.4999, + "step": 3484 + }, + { + "epoch": 0.7773812179344189, + "grad_norm": 0.15806102752685547, + "learning_rate": 1.6992843529661413e-05, + "loss": 0.5013, + "step": 3485 + }, + { + "epoch": 0.7776042828463082, + "grad_norm": 0.25901806354522705, + "learning_rate": 1.6991161002178712e-05, + "loss": 0.5003, + "step": 3486 + }, + { + "epoch": 0.7778273477581976, + "grad_norm": 0.15611301362514496, + "learning_rate": 1.698947808748154e-05, + "loss": 0.5079, + "step": 3487 + }, + { + "epoch": 0.778050412670087, + "grad_norm": 0.1656789630651474, + "learning_rate": 1.6987794785663107e-05, + "loss": 0.4983, + "step": 3488 + }, + { + "epoch": 0.7782734775819764, + "grad_norm": 0.16347582638263702, + "learning_rate": 1.698611109681664e-05, + "loss": 0.4887, + "step": 3489 + }, + { + "epoch": 0.7784965424938657, + "grad_norm": 0.15885215997695923, + "learning_rate": 1.69844270210354e-05, + "loss": 0.4837, + "step": 3490 + }, + { + "epoch": 0.7787196074057551, + "grad_norm": 0.17082704603672028, + "learning_rate": 1.698274255841265e-05, + "loss": 0.5165, + "step": 3491 + }, + { + "epoch": 0.7789426723176445, + "grad_norm": 0.16731634736061096, + "learning_rate": 1.6981057709041703e-05, + "loss": 0.485, + "step": 3492 + }, + { + "epoch": 0.7791657372295338, + "grad_norm": 0.16583065688610077, + "learning_rate": 1.697937247301586e-05, + "loss": 0.5117, + "step": 3493 + }, + { + "epoch": 0.7793888021414231, + "grad_norm": 0.16518862545490265, + "learning_rate": 1.6977686850428475e-05, + "loss": 0.5141, + "step": 3494 + }, + { + "epoch": 0.7796118670533125, + "grad_norm": 0.15372084081172943, + "learning_rate": 1.69760008413729e-05, + "loss": 0.4787, + "step": 3495 + }, + { + "epoch": 0.7798349319652019, + "grad_norm": 0.16989074647426605, + "learning_rate": 1.6974314445942514e-05, + "loss": 0.4906, + "step": 3496 + }, + { + "epoch": 0.7800579968770912, + "grad_norm": 0.16484175622463226, + "learning_rate": 1.697262766423072e-05, + "loss": 0.5, + "step": 3497 + }, + { + "epoch": 0.7802810617889806, + "grad_norm": 0.16868092119693756, + "learning_rate": 1.6970940496330953e-05, + "loss": 0.5106, + "step": 3498 + }, + { + "epoch": 0.78050412670087, + "grad_norm": 0.16630426049232483, + "learning_rate": 1.6969252942336648e-05, + "loss": 0.505, + "step": 3499 + }, + { + "epoch": 0.7807271916127593, + "grad_norm": 0.15219101309776306, + "learning_rate": 1.696756500234128e-05, + "loss": 0.4698, + "step": 3500 + }, + { + "epoch": 0.7809502565246487, + "grad_norm": 0.16838675737380981, + "learning_rate": 1.6965876676438334e-05, + "loss": 0.4897, + "step": 3501 + }, + { + "epoch": 0.7811733214365381, + "grad_norm": 0.15935415029525757, + "learning_rate": 1.696418796472132e-05, + "loss": 0.508, + "step": 3502 + }, + { + "epoch": 0.7813963863484273, + "grad_norm": 0.16184543073177338, + "learning_rate": 1.696249886728377e-05, + "loss": 0.5083, + "step": 3503 + }, + { + "epoch": 0.7816194512603167, + "grad_norm": 0.15746904909610748, + "learning_rate": 1.6960809384219237e-05, + "loss": 0.4662, + "step": 3504 + }, + { + "epoch": 0.7818425161722061, + "grad_norm": 0.1662943959236145, + "learning_rate": 1.6959119515621295e-05, + "loss": 0.4853, + "step": 3505 + }, + { + "epoch": 0.7820655810840955, + "grad_norm": 0.16866953670978546, + "learning_rate": 1.695742926158354e-05, + "loss": 0.4919, + "step": 3506 + }, + { + "epoch": 0.7822886459959848, + "grad_norm": 0.1712365448474884, + "learning_rate": 1.695573862219959e-05, + "loss": 0.4882, + "step": 3507 + }, + { + "epoch": 0.7825117109078742, + "grad_norm": 0.17081387341022491, + "learning_rate": 1.6954047597563078e-05, + "loss": 0.5068, + "step": 3508 + }, + { + "epoch": 0.7827347758197636, + "grad_norm": 0.16168633103370667, + "learning_rate": 1.695235618776767e-05, + "loss": 0.5119, + "step": 3509 + }, + { + "epoch": 0.7829578407316529, + "grad_norm": 0.2024863213300705, + "learning_rate": 1.6950664392907042e-05, + "loss": 0.5039, + "step": 3510 + }, + { + "epoch": 0.7831809056435423, + "grad_norm": 0.1648394763469696, + "learning_rate": 1.6948972213074902e-05, + "loss": 0.5304, + "step": 3511 + }, + { + "epoch": 0.7834039705554317, + "grad_norm": 0.15931276977062225, + "learning_rate": 1.6947279648364966e-05, + "loss": 0.5197, + "step": 3512 + }, + { + "epoch": 0.783627035467321, + "grad_norm": 0.15827037394046783, + "learning_rate": 1.6945586698870985e-05, + "loss": 0.513, + "step": 3513 + }, + { + "epoch": 0.7838501003792103, + "grad_norm": 0.15963312983512878, + "learning_rate": 1.694389336468672e-05, + "loss": 0.5096, + "step": 3514 + }, + { + "epoch": 0.7840731652910997, + "grad_norm": 0.1624821275472641, + "learning_rate": 1.694219964590597e-05, + "loss": 0.5086, + "step": 3515 + }, + { + "epoch": 0.7842962302029891, + "grad_norm": 0.15603917837142944, + "learning_rate": 1.694050554262253e-05, + "loss": 0.499, + "step": 3516 + }, + { + "epoch": 0.7845192951148784, + "grad_norm": 0.16541822254657745, + "learning_rate": 1.6938811054930237e-05, + "loss": 0.4843, + "step": 3517 + }, + { + "epoch": 0.7847423600267678, + "grad_norm": 0.16056667268276215, + "learning_rate": 1.693711618292294e-05, + "loss": 0.4992, + "step": 3518 + }, + { + "epoch": 0.7849654249386572, + "grad_norm": 0.15387259423732758, + "learning_rate": 1.693542092669451e-05, + "loss": 0.5056, + "step": 3519 + }, + { + "epoch": 0.7851884898505465, + "grad_norm": 0.15904875099658966, + "learning_rate": 1.6933725286338846e-05, + "loss": 0.4718, + "step": 3520 + }, + { + "epoch": 0.7854115547624358, + "grad_norm": 0.16445663571357727, + "learning_rate": 1.693202926194986e-05, + "loss": 0.4752, + "step": 3521 + }, + { + "epoch": 0.7856346196743252, + "grad_norm": 0.16833317279815674, + "learning_rate": 1.693033285362149e-05, + "loss": 0.4626, + "step": 3522 + }, + { + "epoch": 0.7858576845862146, + "grad_norm": 0.17628587782382965, + "learning_rate": 1.692863606144769e-05, + "loss": 0.5001, + "step": 3523 + }, + { + "epoch": 0.7860807494981039, + "grad_norm": 0.16575254499912262, + "learning_rate": 1.692693888552245e-05, + "loss": 0.4987, + "step": 3524 + }, + { + "epoch": 0.7863038144099933, + "grad_norm": 0.1609877198934555, + "learning_rate": 1.6925241325939756e-05, + "loss": 0.4897, + "step": 3525 + }, + { + "epoch": 0.7865268793218827, + "grad_norm": 0.1630864441394806, + "learning_rate": 1.6923543382793636e-05, + "loss": 0.473, + "step": 3526 + }, + { + "epoch": 0.786749944233772, + "grad_norm": 0.16511790454387665, + "learning_rate": 1.6921845056178133e-05, + "loss": 0.511, + "step": 3527 + }, + { + "epoch": 0.7869730091456614, + "grad_norm": 0.15955112874507904, + "learning_rate": 1.6920146346187312e-05, + "loss": 0.4702, + "step": 3528 + }, + { + "epoch": 0.7871960740575508, + "grad_norm": 0.16080255806446075, + "learning_rate": 1.691844725291526e-05, + "loss": 0.4897, + "step": 3529 + }, + { + "epoch": 0.7874191389694402, + "grad_norm": 0.15729734301567078, + "learning_rate": 1.6916747776456074e-05, + "loss": 0.477, + "step": 3530 + }, + { + "epoch": 0.7876422038813294, + "grad_norm": 0.18572328984737396, + "learning_rate": 1.691504791690389e-05, + "loss": 0.4953, + "step": 3531 + }, + { + "epoch": 0.7878652687932188, + "grad_norm": 0.17920143902301788, + "learning_rate": 1.6913347674352855e-05, + "loss": 0.4932, + "step": 3532 + }, + { + "epoch": 0.7880883337051082, + "grad_norm": 0.14651532471179962, + "learning_rate": 1.691164704889714e-05, + "loss": 0.4531, + "step": 3533 + }, + { + "epoch": 0.7883113986169975, + "grad_norm": 0.16487324237823486, + "learning_rate": 1.6909946040630935e-05, + "loss": 0.487, + "step": 3534 + }, + { + "epoch": 0.7885344635288869, + "grad_norm": 0.16055698692798615, + "learning_rate": 1.6908244649648455e-05, + "loss": 0.4754, + "step": 3535 + }, + { + "epoch": 0.7887575284407763, + "grad_norm": 0.1739615648984909, + "learning_rate": 1.690654287604393e-05, + "loss": 0.5076, + "step": 3536 + }, + { + "epoch": 0.7889805933526656, + "grad_norm": 0.17668306827545166, + "learning_rate": 1.690484071991162e-05, + "loss": 0.5177, + "step": 3537 + }, + { + "epoch": 0.789203658264555, + "grad_norm": 0.17033684253692627, + "learning_rate": 1.690313818134579e-05, + "loss": 0.4884, + "step": 3538 + }, + { + "epoch": 0.7894267231764444, + "grad_norm": 0.19986379146575928, + "learning_rate": 1.690143526044075e-05, + "loss": 0.4885, + "step": 3539 + }, + { + "epoch": 0.7896497880883337, + "grad_norm": 0.17267881333827972, + "learning_rate": 1.6899731957290814e-05, + "loss": 0.498, + "step": 3540 + }, + { + "epoch": 0.789872853000223, + "grad_norm": 0.15866464376449585, + "learning_rate": 1.689802827199032e-05, + "loss": 0.4856, + "step": 3541 + }, + { + "epoch": 0.7900959179121124, + "grad_norm": 0.17956510186195374, + "learning_rate": 1.689632420463363e-05, + "loss": 0.5046, + "step": 3542 + }, + { + "epoch": 0.7903189828240018, + "grad_norm": 0.17033924162387848, + "learning_rate": 1.6894619755315127e-05, + "loss": 0.5272, + "step": 3543 + }, + { + "epoch": 0.7905420477358911, + "grad_norm": 0.15526439249515533, + "learning_rate": 1.6892914924129212e-05, + "loss": 0.4858, + "step": 3544 + }, + { + "epoch": 0.7907651126477805, + "grad_norm": 0.16875073313713074, + "learning_rate": 1.689120971117031e-05, + "loss": 0.4957, + "step": 3545 + }, + { + "epoch": 0.7909881775596699, + "grad_norm": 0.16168349981307983, + "learning_rate": 1.6889504116532868e-05, + "loss": 0.5041, + "step": 3546 + }, + { + "epoch": 0.7912112424715593, + "grad_norm": 0.1654532253742218, + "learning_rate": 1.688779814031135e-05, + "loss": 0.5032, + "step": 3547 + }, + { + "epoch": 0.7914343073834486, + "grad_norm": 0.16922220587730408, + "learning_rate": 1.6886091782600248e-05, + "loss": 0.5098, + "step": 3548 + }, + { + "epoch": 0.791657372295338, + "grad_norm": 0.17434757947921753, + "learning_rate": 1.6884385043494064e-05, + "loss": 0.5232, + "step": 3549 + }, + { + "epoch": 0.7918804372072273, + "grad_norm": 0.1557532101869583, + "learning_rate": 1.688267792308733e-05, + "loss": 0.4865, + "step": 3550 + }, + { + "epoch": 0.7921035021191166, + "grad_norm": 0.1586635261774063, + "learning_rate": 1.6880970421474604e-05, + "loss": 0.4967, + "step": 3551 + }, + { + "epoch": 0.792326567031006, + "grad_norm": 0.16100507974624634, + "learning_rate": 1.6879262538750453e-05, + "loss": 0.4804, + "step": 3552 + }, + { + "epoch": 0.7925496319428954, + "grad_norm": 0.15817134082317352, + "learning_rate": 1.6877554275009467e-05, + "loss": 0.5066, + "step": 3553 + }, + { + "epoch": 0.7927726968547848, + "grad_norm": 0.15588125586509705, + "learning_rate": 1.6875845630346265e-05, + "loss": 0.5074, + "step": 3554 + }, + { + "epoch": 0.7929957617666741, + "grad_norm": 0.29957878589630127, + "learning_rate": 1.687413660485548e-05, + "loss": 0.4962, + "step": 3555 + }, + { + "epoch": 0.7932188266785635, + "grad_norm": 0.15556854009628296, + "learning_rate": 1.6872427198631772e-05, + "loss": 0.4746, + "step": 3556 + }, + { + "epoch": 0.7934418915904529, + "grad_norm": 0.1643342226743698, + "learning_rate": 1.6870717411769818e-05, + "loss": 0.5085, + "step": 3557 + }, + { + "epoch": 0.7936649565023421, + "grad_norm": 0.16817674040794373, + "learning_rate": 1.686900724436431e-05, + "loss": 0.5078, + "step": 3558 + }, + { + "epoch": 0.7938880214142315, + "grad_norm": 0.17203262448310852, + "learning_rate": 1.6867296696509978e-05, + "loss": 0.4775, + "step": 3559 + }, + { + "epoch": 0.7941110863261209, + "grad_norm": 0.1592828631401062, + "learning_rate": 1.6865585768301556e-05, + "loss": 0.487, + "step": 3560 + }, + { + "epoch": 0.7943341512380102, + "grad_norm": 0.15609320998191833, + "learning_rate": 1.6863874459833806e-05, + "loss": 0.4696, + "step": 3561 + }, + { + "epoch": 0.7945572161498996, + "grad_norm": 0.17543616890907288, + "learning_rate": 1.6862162771201515e-05, + "loss": 0.5098, + "step": 3562 + }, + { + "epoch": 0.794780281061789, + "grad_norm": 0.17788106203079224, + "learning_rate": 1.6860450702499486e-05, + "loss": 0.4826, + "step": 3563 + }, + { + "epoch": 0.7950033459736784, + "grad_norm": 0.16301631927490234, + "learning_rate": 1.685873825382254e-05, + "loss": 0.5087, + "step": 3564 + }, + { + "epoch": 0.7952264108855677, + "grad_norm": 0.16601480543613434, + "learning_rate": 1.685702542526553e-05, + "loss": 0.4624, + "step": 3565 + }, + { + "epoch": 0.7954494757974571, + "grad_norm": 0.7922313809394836, + "learning_rate": 1.6855312216923316e-05, + "loss": 0.5045, + "step": 3566 + }, + { + "epoch": 0.7956725407093465, + "grad_norm": 0.2716256380081177, + "learning_rate": 1.685359862889079e-05, + "loss": 0.5, + "step": 3567 + }, + { + "epoch": 0.7958956056212357, + "grad_norm": 0.1633540540933609, + "learning_rate": 1.6851884661262864e-05, + "loss": 0.4944, + "step": 3568 + }, + { + "epoch": 0.7961186705331251, + "grad_norm": 0.17535462975502014, + "learning_rate": 1.6850170314134465e-05, + "loss": 0.5251, + "step": 3569 + }, + { + "epoch": 0.7963417354450145, + "grad_norm": 0.18369325995445251, + "learning_rate": 1.6848455587600542e-05, + "loss": 0.5043, + "step": 3570 + }, + { + "epoch": 0.7965648003569039, + "grad_norm": 0.15560777485370636, + "learning_rate": 1.6846740481756072e-05, + "loss": 0.4686, + "step": 3571 + }, + { + "epoch": 0.7967878652687932, + "grad_norm": 0.18801093101501465, + "learning_rate": 1.6845024996696047e-05, + "loss": 0.4946, + "step": 3572 + }, + { + "epoch": 0.7970109301806826, + "grad_norm": 0.17000144720077515, + "learning_rate": 1.684330913251548e-05, + "loss": 0.5, + "step": 3573 + }, + { + "epoch": 0.797233995092572, + "grad_norm": 0.16179999709129333, + "learning_rate": 1.6841592889309405e-05, + "loss": 0.4769, + "step": 3574 + }, + { + "epoch": 0.7974570600044613, + "grad_norm": 0.16142041981220245, + "learning_rate": 1.6839876267172883e-05, + "loss": 0.4644, + "step": 3575 + }, + { + "epoch": 0.7976801249163507, + "grad_norm": 0.16446426510810852, + "learning_rate": 1.683815926620099e-05, + "loss": 0.4835, + "step": 3576 + }, + { + "epoch": 0.79790318982824, + "grad_norm": 0.16954201459884644, + "learning_rate": 1.6836441886488822e-05, + "loss": 0.5151, + "step": 3577 + }, + { + "epoch": 0.7981262547401293, + "grad_norm": 0.16908320784568787, + "learning_rate": 1.6834724128131496e-05, + "loss": 0.4714, + "step": 3578 + }, + { + "epoch": 0.7983493196520187, + "grad_norm": 0.1670169085264206, + "learning_rate": 1.683300599122416e-05, + "loss": 0.5004, + "step": 3579 + }, + { + "epoch": 0.7985723845639081, + "grad_norm": 0.17246994376182556, + "learning_rate": 1.683128747586197e-05, + "loss": 0.5066, + "step": 3580 + }, + { + "epoch": 0.7987954494757975, + "grad_norm": 0.16627082228660583, + "learning_rate": 1.6829568582140108e-05, + "loss": 0.4813, + "step": 3581 + }, + { + "epoch": 0.7990185143876868, + "grad_norm": 0.16429170966148376, + "learning_rate": 1.6827849310153778e-05, + "loss": 0.4916, + "step": 3582 + }, + { + "epoch": 0.7992415792995762, + "grad_norm": 0.1685648411512375, + "learning_rate": 1.6826129659998204e-05, + "loss": 0.4821, + "step": 3583 + }, + { + "epoch": 0.7994646442114656, + "grad_norm": 0.1623144894838333, + "learning_rate": 1.6824409631768633e-05, + "loss": 0.5092, + "step": 3584 + }, + { + "epoch": 0.7996877091233549, + "grad_norm": 0.16186010837554932, + "learning_rate": 1.6822689225560328e-05, + "loss": 0.4647, + "step": 3585 + }, + { + "epoch": 0.7999107740352442, + "grad_norm": 0.15835130214691162, + "learning_rate": 1.682096844146858e-05, + "loss": 0.5139, + "step": 3586 + }, + { + "epoch": 0.8001338389471336, + "grad_norm": 0.1640879362821579, + "learning_rate": 1.681924727958869e-05, + "loss": 0.4815, + "step": 3587 + }, + { + "epoch": 0.800356903859023, + "grad_norm": 0.18087539076805115, + "learning_rate": 1.681752574001599e-05, + "loss": 0.5057, + "step": 3588 + }, + { + "epoch": 0.8005799687709123, + "grad_norm": 0.16318152844905853, + "learning_rate": 1.6815803822845834e-05, + "loss": 0.4945, + "step": 3589 + }, + { + "epoch": 0.8008030336828017, + "grad_norm": 0.16909107565879822, + "learning_rate": 1.681408152817359e-05, + "loss": 0.5123, + "step": 3590 + }, + { + "epoch": 0.8010260985946911, + "grad_norm": 0.16913791000843048, + "learning_rate": 1.6812358856094652e-05, + "loss": 0.4933, + "step": 3591 + }, + { + "epoch": 0.8012491635065804, + "grad_norm": 0.16670222580432892, + "learning_rate": 1.681063580670442e-05, + "loss": 0.5109, + "step": 3592 + }, + { + "epoch": 0.8014722284184698, + "grad_norm": 0.16838552057743073, + "learning_rate": 1.680891238009834e-05, + "loss": 0.4926, + "step": 3593 + }, + { + "epoch": 0.8016952933303592, + "grad_norm": 0.15705366432666779, + "learning_rate": 1.6807188576371864e-05, + "loss": 0.4762, + "step": 3594 + }, + { + "epoch": 0.8019183582422484, + "grad_norm": 0.15505351126194, + "learning_rate": 1.6805464395620465e-05, + "loss": 0.4769, + "step": 3595 + }, + { + "epoch": 0.8021414231541378, + "grad_norm": 0.17056378722190857, + "learning_rate": 1.6803739837939642e-05, + "loss": 0.5038, + "step": 3596 + }, + { + "epoch": 0.8023644880660272, + "grad_norm": 0.15908926725387573, + "learning_rate": 1.6802014903424905e-05, + "loss": 0.5079, + "step": 3597 + }, + { + "epoch": 0.8025875529779166, + "grad_norm": 0.16009296476840973, + "learning_rate": 1.68002895921718e-05, + "loss": 0.497, + "step": 3598 + }, + { + "epoch": 0.8028106178898059, + "grad_norm": 0.17202340066432953, + "learning_rate": 1.6798563904275882e-05, + "loss": 0.5097, + "step": 3599 + }, + { + "epoch": 0.8030336828016953, + "grad_norm": 0.16179756820201874, + "learning_rate": 1.679683783983273e-05, + "loss": 0.4992, + "step": 3600 + }, + { + "epoch": 0.8032567477135847, + "grad_norm": 0.16165940463542938, + "learning_rate": 1.6795111398937944e-05, + "loss": 0.4959, + "step": 3601 + }, + { + "epoch": 0.803479812625474, + "grad_norm": 0.17612482607364655, + "learning_rate": 1.679338458168714e-05, + "loss": 0.4937, + "step": 3602 + }, + { + "epoch": 0.8037028775373634, + "grad_norm": 0.16228753328323364, + "learning_rate": 1.679165738817597e-05, + "loss": 0.5287, + "step": 3603 + }, + { + "epoch": 0.8039259424492528, + "grad_norm": 0.17843377590179443, + "learning_rate": 1.6789929818500096e-05, + "loss": 0.4851, + "step": 3604 + }, + { + "epoch": 0.8041490073611421, + "grad_norm": 0.1637219339609146, + "learning_rate": 1.6788201872755196e-05, + "loss": 0.4633, + "step": 3605 + }, + { + "epoch": 0.8043720722730314, + "grad_norm": 0.16961325705051422, + "learning_rate": 1.6786473551036978e-05, + "loss": 0.5084, + "step": 3606 + }, + { + "epoch": 0.8045951371849208, + "grad_norm": 0.15785901248455048, + "learning_rate": 1.6784744853441167e-05, + "loss": 0.4855, + "step": 3607 + }, + { + "epoch": 0.8048182020968102, + "grad_norm": 0.1613706648349762, + "learning_rate": 1.6783015780063503e-05, + "loss": 0.4674, + "step": 3608 + }, + { + "epoch": 0.8050412670086995, + "grad_norm": 0.15770243108272552, + "learning_rate": 1.678128633099976e-05, + "loss": 0.4802, + "step": 3609 + }, + { + "epoch": 0.8052643319205889, + "grad_norm": 0.16045016050338745, + "learning_rate": 1.677955650634573e-05, + "loss": 0.4965, + "step": 3610 + }, + { + "epoch": 0.8054873968324783, + "grad_norm": 0.15433759987354279, + "learning_rate": 1.6777826306197208e-05, + "loss": 0.4752, + "step": 3611 + }, + { + "epoch": 0.8057104617443676, + "grad_norm": 0.16770930588245392, + "learning_rate": 1.6776095730650034e-05, + "loss": 0.5025, + "step": 3612 + }, + { + "epoch": 0.805933526656257, + "grad_norm": 0.15930598974227905, + "learning_rate": 1.6774364779800057e-05, + "loss": 0.4995, + "step": 3613 + }, + { + "epoch": 0.8061565915681463, + "grad_norm": 0.16089414060115814, + "learning_rate": 1.6772633453743142e-05, + "loss": 0.4973, + "step": 3614 + }, + { + "epoch": 0.8063796564800357, + "grad_norm": 0.1723753660917282, + "learning_rate": 1.6770901752575186e-05, + "loss": 0.5267, + "step": 3615 + }, + { + "epoch": 0.806602721391925, + "grad_norm": 0.15985791385173798, + "learning_rate": 1.6769169676392103e-05, + "loss": 0.4873, + "step": 3616 + }, + { + "epoch": 0.8068257863038144, + "grad_norm": 0.15705527365207672, + "learning_rate": 1.676743722528982e-05, + "loss": 0.48, + "step": 3617 + }, + { + "epoch": 0.8070488512157038, + "grad_norm": 0.17080652713775635, + "learning_rate": 1.6765704399364297e-05, + "loss": 0.507, + "step": 3618 + }, + { + "epoch": 0.8072719161275931, + "grad_norm": 0.16177797317504883, + "learning_rate": 1.6763971198711505e-05, + "loss": 0.5168, + "step": 3619 + }, + { + "epoch": 0.8074949810394825, + "grad_norm": 0.15763692557811737, + "learning_rate": 1.6762237623427445e-05, + "loss": 0.4736, + "step": 3620 + }, + { + "epoch": 0.8077180459513719, + "grad_norm": 0.15657658874988556, + "learning_rate": 1.6760503673608123e-05, + "loss": 0.479, + "step": 3621 + }, + { + "epoch": 0.8079411108632613, + "grad_norm": 0.15933813154697418, + "learning_rate": 1.6758769349349586e-05, + "loss": 0.519, + "step": 3622 + }, + { + "epoch": 0.8081641757751505, + "grad_norm": 0.173330619931221, + "learning_rate": 1.675703465074789e-05, + "loss": 0.5056, + "step": 3623 + }, + { + "epoch": 0.8083872406870399, + "grad_norm": 0.16727612912654877, + "learning_rate": 1.6755299577899107e-05, + "loss": 0.4949, + "step": 3624 + }, + { + "epoch": 0.8086103055989293, + "grad_norm": 0.23466536402702332, + "learning_rate": 1.6753564130899343e-05, + "loss": 0.5203, + "step": 3625 + }, + { + "epoch": 0.8088333705108186, + "grad_norm": 0.16051769256591797, + "learning_rate": 1.6751828309844714e-05, + "loss": 0.4907, + "step": 3626 + }, + { + "epoch": 0.809056435422708, + "grad_norm": 0.16254782676696777, + "learning_rate": 1.6750092114831368e-05, + "loss": 0.4976, + "step": 3627 + }, + { + "epoch": 0.8092795003345974, + "grad_norm": 0.1722305417060852, + "learning_rate": 1.6748355545955456e-05, + "loss": 0.4932, + "step": 3628 + }, + { + "epoch": 0.8095025652464868, + "grad_norm": 0.16842518746852875, + "learning_rate": 1.6746618603313165e-05, + "loss": 0.4934, + "step": 3629 + }, + { + "epoch": 0.8097256301583761, + "grad_norm": 0.16397298872470856, + "learning_rate": 1.67448812870007e-05, + "loss": 0.4824, + "step": 3630 + }, + { + "epoch": 0.8099486950702655, + "grad_norm": 0.15751919150352478, + "learning_rate": 1.674314359711428e-05, + "loss": 0.5057, + "step": 3631 + }, + { + "epoch": 0.8101717599821548, + "grad_norm": 0.15799839794635773, + "learning_rate": 1.6741405533750154e-05, + "loss": 0.4941, + "step": 3632 + }, + { + "epoch": 0.8103948248940441, + "grad_norm": 0.16038256883621216, + "learning_rate": 1.6739667097004583e-05, + "loss": 0.4765, + "step": 3633 + }, + { + "epoch": 0.8106178898059335, + "grad_norm": 0.18798737227916718, + "learning_rate": 1.6737928286973852e-05, + "loss": 0.5119, + "step": 3634 + }, + { + "epoch": 0.8108409547178229, + "grad_norm": 0.1568623036146164, + "learning_rate": 1.673618910375427e-05, + "loss": 0.4881, + "step": 3635 + }, + { + "epoch": 0.8110640196297122, + "grad_norm": 0.16038067638874054, + "learning_rate": 1.6734449547442165e-05, + "loss": 0.491, + "step": 3636 + }, + { + "epoch": 0.8112870845416016, + "grad_norm": 0.1699703484773636, + "learning_rate": 1.6732709618133882e-05, + "loss": 0.4772, + "step": 3637 + }, + { + "epoch": 0.811510149453491, + "grad_norm": 0.16090309619903564, + "learning_rate": 1.673096931592579e-05, + "loss": 0.4703, + "step": 3638 + }, + { + "epoch": 0.8117332143653804, + "grad_norm": 0.16035638749599457, + "learning_rate": 1.672922864091428e-05, + "loss": 0.5076, + "step": 3639 + }, + { + "epoch": 0.8119562792772697, + "grad_norm": 0.16750063002109528, + "learning_rate": 1.6727487593195757e-05, + "loss": 0.4961, + "step": 3640 + }, + { + "epoch": 0.812179344189159, + "grad_norm": 0.15547265112400055, + "learning_rate": 1.6725746172866652e-05, + "loss": 0.4759, + "step": 3641 + }, + { + "epoch": 0.8124024091010484, + "grad_norm": 0.15655042231082916, + "learning_rate": 1.672400438002342e-05, + "loss": 0.511, + "step": 3642 + }, + { + "epoch": 0.8126254740129377, + "grad_norm": 0.1612492948770523, + "learning_rate": 1.6722262214762527e-05, + "loss": 0.4901, + "step": 3643 + }, + { + "epoch": 0.8128485389248271, + "grad_norm": 0.15349607169628143, + "learning_rate": 1.6720519677180472e-05, + "loss": 0.5094, + "step": 3644 + }, + { + "epoch": 0.8130716038367165, + "grad_norm": 0.16423405706882477, + "learning_rate": 1.671877676737376e-05, + "loss": 0.513, + "step": 3645 + }, + { + "epoch": 0.8132946687486059, + "grad_norm": 0.1583424061536789, + "learning_rate": 1.671703348543893e-05, + "loss": 0.4795, + "step": 3646 + }, + { + "epoch": 0.8135177336604952, + "grad_norm": 0.16294583678245544, + "learning_rate": 1.671528983147253e-05, + "loss": 0.5019, + "step": 3647 + }, + { + "epoch": 0.8137407985723846, + "grad_norm": 0.1648416668176651, + "learning_rate": 1.671354580557114e-05, + "loss": 0.4879, + "step": 3648 + }, + { + "epoch": 0.813963863484274, + "grad_norm": 0.16462133824825287, + "learning_rate": 1.6711801407831356e-05, + "loss": 0.4911, + "step": 3649 + }, + { + "epoch": 0.8141869283961632, + "grad_norm": 0.26390138268470764, + "learning_rate": 1.671005663834979e-05, + "loss": 0.4822, + "step": 3650 + }, + { + "epoch": 0.8144099933080526, + "grad_norm": 0.17741946876049042, + "learning_rate": 1.670831149722308e-05, + "loss": 0.5237, + "step": 3651 + }, + { + "epoch": 0.814633058219942, + "grad_norm": 0.18031084537506104, + "learning_rate": 1.670656598454788e-05, + "loss": 0.4956, + "step": 3652 + }, + { + "epoch": 0.8148561231318313, + "grad_norm": 0.1602051854133606, + "learning_rate": 1.670482010042087e-05, + "loss": 0.4695, + "step": 3653 + }, + { + "epoch": 0.8150791880437207, + "grad_norm": 0.15887601673603058, + "learning_rate": 1.670307384493875e-05, + "loss": 0.4907, + "step": 3654 + }, + { + "epoch": 0.8153022529556101, + "grad_norm": 0.16547198593616486, + "learning_rate": 1.6701327218198234e-05, + "loss": 0.4944, + "step": 3655 + }, + { + "epoch": 0.8155253178674995, + "grad_norm": 0.16335895657539368, + "learning_rate": 1.6699580220296065e-05, + "loss": 0.484, + "step": 3656 + }, + { + "epoch": 0.8157483827793888, + "grad_norm": 0.1715383529663086, + "learning_rate": 1.6697832851329002e-05, + "loss": 0.5086, + "step": 3657 + }, + { + "epoch": 0.8159714476912782, + "grad_norm": 0.16271516680717468, + "learning_rate": 1.6696085111393825e-05, + "loss": 0.4771, + "step": 3658 + }, + { + "epoch": 0.8161945126031676, + "grad_norm": 0.16480712592601776, + "learning_rate": 1.6694337000587334e-05, + "loss": 0.5099, + "step": 3659 + }, + { + "epoch": 0.8164175775150568, + "grad_norm": 0.15898552536964417, + "learning_rate": 1.669258851900635e-05, + "loss": 0.4987, + "step": 3660 + }, + { + "epoch": 0.8166406424269462, + "grad_norm": 0.16049543023109436, + "learning_rate": 1.6690839666747717e-05, + "loss": 0.4762, + "step": 3661 + }, + { + "epoch": 0.8168637073388356, + "grad_norm": 0.15958286821842194, + "learning_rate": 1.6689090443908296e-05, + "loss": 0.4827, + "step": 3662 + }, + { + "epoch": 0.817086772250725, + "grad_norm": 0.16123978793621063, + "learning_rate": 1.668734085058497e-05, + "loss": 0.4803, + "step": 3663 + }, + { + "epoch": 0.8173098371626143, + "grad_norm": 0.1554545909166336, + "learning_rate": 1.668559088687464e-05, + "loss": 0.4813, + "step": 3664 + }, + { + "epoch": 0.8175329020745037, + "grad_norm": 0.15960079431533813, + "learning_rate": 1.6683840552874235e-05, + "loss": 0.4947, + "step": 3665 + }, + { + "epoch": 0.8177559669863931, + "grad_norm": 0.15964864194393158, + "learning_rate": 1.6682089848680698e-05, + "loss": 0.4815, + "step": 3666 + }, + { + "epoch": 0.8179790318982824, + "grad_norm": 0.16264641284942627, + "learning_rate": 1.6680338774390993e-05, + "loss": 0.509, + "step": 3667 + }, + { + "epoch": 0.8182020968101718, + "grad_norm": 0.164497509598732, + "learning_rate": 1.6678587330102103e-05, + "loss": 0.5247, + "step": 3668 + }, + { + "epoch": 0.8184251617220611, + "grad_norm": 0.16061222553253174, + "learning_rate": 1.667683551591104e-05, + "loss": 0.5161, + "step": 3669 + }, + { + "epoch": 0.8186482266339504, + "grad_norm": 0.19114501774311066, + "learning_rate": 1.6675083331914823e-05, + "loss": 0.5119, + "step": 3670 + }, + { + "epoch": 0.8188712915458398, + "grad_norm": 0.18399201333522797, + "learning_rate": 1.6673330778210508e-05, + "loss": 0.4907, + "step": 3671 + }, + { + "epoch": 0.8190943564577292, + "grad_norm": 0.15932810306549072, + "learning_rate": 1.6671577854895153e-05, + "loss": 0.5102, + "step": 3672 + }, + { + "epoch": 0.8193174213696186, + "grad_norm": 0.1565459966659546, + "learning_rate": 1.6669824562065856e-05, + "loss": 0.4752, + "step": 3673 + }, + { + "epoch": 0.8195404862815079, + "grad_norm": 0.1646818071603775, + "learning_rate": 1.6668070899819714e-05, + "loss": 0.5099, + "step": 3674 + }, + { + "epoch": 0.8197635511933973, + "grad_norm": 0.1591091752052307, + "learning_rate": 1.6666316868253867e-05, + "loss": 0.4969, + "step": 3675 + }, + { + "epoch": 0.8199866161052867, + "grad_norm": 0.16319642961025238, + "learning_rate": 1.6664562467465455e-05, + "loss": 0.5129, + "step": 3676 + }, + { + "epoch": 0.820209681017176, + "grad_norm": 0.16143767535686493, + "learning_rate": 1.6662807697551654e-05, + "loss": 0.4952, + "step": 3677 + }, + { + "epoch": 0.8204327459290653, + "grad_norm": 0.15746799111366272, + "learning_rate": 1.666105255860965e-05, + "loss": 0.4947, + "step": 3678 + }, + { + "epoch": 0.8206558108409547, + "grad_norm": 0.16585499048233032, + "learning_rate": 1.6659297050736657e-05, + "loss": 0.5134, + "step": 3679 + }, + { + "epoch": 0.8208788757528441, + "grad_norm": 0.16004452109336853, + "learning_rate": 1.6657541174029902e-05, + "loss": 0.4995, + "step": 3680 + }, + { + "epoch": 0.8211019406647334, + "grad_norm": 0.1682739406824112, + "learning_rate": 1.665578492858664e-05, + "loss": 0.4796, + "step": 3681 + }, + { + "epoch": 0.8213250055766228, + "grad_norm": 0.1618606299161911, + "learning_rate": 1.6654028314504147e-05, + "loss": 0.4849, + "step": 3682 + }, + { + "epoch": 0.8215480704885122, + "grad_norm": 0.1632058471441269, + "learning_rate": 1.6652271331879706e-05, + "loss": 0.499, + "step": 3683 + }, + { + "epoch": 0.8217711354004015, + "grad_norm": 0.16575373709201813, + "learning_rate": 1.665051398081064e-05, + "loss": 0.5193, + "step": 3684 + }, + { + "epoch": 0.8219942003122909, + "grad_norm": 0.16228437423706055, + "learning_rate": 1.664875626139427e-05, + "loss": 0.5065, + "step": 3685 + }, + { + "epoch": 0.8222172652241803, + "grad_norm": 0.1525840312242508, + "learning_rate": 1.6646998173727955e-05, + "loss": 0.4904, + "step": 3686 + }, + { + "epoch": 0.8224403301360695, + "grad_norm": 0.16036051511764526, + "learning_rate": 1.6645239717909074e-05, + "loss": 0.4798, + "step": 3687 + }, + { + "epoch": 0.8226633950479589, + "grad_norm": 0.16049040853977203, + "learning_rate": 1.6643480894035015e-05, + "loss": 0.4985, + "step": 3688 + }, + { + "epoch": 0.8228864599598483, + "grad_norm": 0.16121259331703186, + "learning_rate": 1.6641721702203196e-05, + "loss": 0.4586, + "step": 3689 + }, + { + "epoch": 0.8231095248717377, + "grad_norm": 0.1610342562198639, + "learning_rate": 1.663996214251105e-05, + "loss": 0.4781, + "step": 3690 + }, + { + "epoch": 0.823332589783627, + "grad_norm": 0.17599108815193176, + "learning_rate": 1.6638202215056036e-05, + "loss": 0.509, + "step": 3691 + }, + { + "epoch": 0.8235556546955164, + "grad_norm": 0.15026211738586426, + "learning_rate": 1.6636441919935627e-05, + "loss": 0.4718, + "step": 3692 + }, + { + "epoch": 0.8237787196074058, + "grad_norm": 0.1567082703113556, + "learning_rate": 1.6634681257247314e-05, + "loss": 0.5084, + "step": 3693 + }, + { + "epoch": 0.8240017845192951, + "grad_norm": 0.1553567349910736, + "learning_rate": 1.6632920227088628e-05, + "loss": 0.481, + "step": 3694 + }, + { + "epoch": 0.8242248494311845, + "grad_norm": 0.16551898419857025, + "learning_rate": 1.663115882955709e-05, + "loss": 0.5145, + "step": 3695 + }, + { + "epoch": 0.8244479143430739, + "grad_norm": 0.1742102950811386, + "learning_rate": 1.6629397064750267e-05, + "loss": 0.5073, + "step": 3696 + }, + { + "epoch": 0.8246709792549632, + "grad_norm": 0.16208507120609283, + "learning_rate": 1.6627634932765735e-05, + "loss": 0.4637, + "step": 3697 + }, + { + "epoch": 0.8248940441668525, + "grad_norm": 0.1664377748966217, + "learning_rate": 1.662587243370109e-05, + "loss": 0.4927, + "step": 3698 + }, + { + "epoch": 0.8251171090787419, + "grad_norm": 0.16273631155490875, + "learning_rate": 1.662410956765395e-05, + "loss": 0.4861, + "step": 3699 + }, + { + "epoch": 0.8253401739906313, + "grad_norm": 0.16939525306224823, + "learning_rate": 1.6622346334721956e-05, + "loss": 0.4973, + "step": 3700 + }, + { + "epoch": 0.8255632389025206, + "grad_norm": 0.16064801812171936, + "learning_rate": 1.6620582735002762e-05, + "loss": 0.4815, + "step": 3701 + }, + { + "epoch": 0.82578630381441, + "grad_norm": 0.17662890255451202, + "learning_rate": 1.6618818768594058e-05, + "loss": 0.4851, + "step": 3702 + }, + { + "epoch": 0.8260093687262994, + "grad_norm": 0.16551104187965393, + "learning_rate": 1.6617054435593535e-05, + "loss": 0.4872, + "step": 3703 + }, + { + "epoch": 0.8262324336381888, + "grad_norm": 0.16171106696128845, + "learning_rate": 1.6615289736098912e-05, + "loss": 0.4801, + "step": 3704 + }, + { + "epoch": 0.826455498550078, + "grad_norm": 0.15718747675418854, + "learning_rate": 1.6613524670207933e-05, + "loss": 0.4687, + "step": 3705 + }, + { + "epoch": 0.8266785634619674, + "grad_norm": 0.16042965650558472, + "learning_rate": 1.6611759238018356e-05, + "loss": 0.5026, + "step": 3706 + }, + { + "epoch": 0.8269016283738568, + "grad_norm": 0.1648973822593689, + "learning_rate": 1.660999343962796e-05, + "loss": 0.5002, + "step": 3707 + }, + { + "epoch": 0.8271246932857461, + "grad_norm": 0.16398411989212036, + "learning_rate": 1.6608227275134555e-05, + "loss": 0.4601, + "step": 3708 + }, + { + "epoch": 0.8273477581976355, + "grad_norm": 0.16151902079582214, + "learning_rate": 1.6606460744635952e-05, + "loss": 0.5171, + "step": 3709 + }, + { + "epoch": 0.8275708231095249, + "grad_norm": 0.15724126994609833, + "learning_rate": 1.660469384823e-05, + "loss": 0.4794, + "step": 3710 + }, + { + "epoch": 0.8277938880214142, + "grad_norm": 0.16321682929992676, + "learning_rate": 1.6602926586014555e-05, + "loss": 0.4987, + "step": 3711 + }, + { + "epoch": 0.8280169529333036, + "grad_norm": 0.17783483862876892, + "learning_rate": 1.66011589580875e-05, + "loss": 0.4795, + "step": 3712 + }, + { + "epoch": 0.828240017845193, + "grad_norm": 0.16518917679786682, + "learning_rate": 1.659939096454674e-05, + "loss": 0.5131, + "step": 3713 + }, + { + "epoch": 0.8284630827570824, + "grad_norm": 0.1636318564414978, + "learning_rate": 1.6597622605490198e-05, + "loss": 0.4772, + "step": 3714 + }, + { + "epoch": 0.8286861476689716, + "grad_norm": 0.16061849892139435, + "learning_rate": 1.6595853881015814e-05, + "loss": 0.5001, + "step": 3715 + }, + { + "epoch": 0.828909212580861, + "grad_norm": 0.15872059762477875, + "learning_rate": 1.6594084791221554e-05, + "loss": 0.4987, + "step": 3716 + }, + { + "epoch": 0.8291322774927504, + "grad_norm": 0.15593315660953522, + "learning_rate": 1.65923153362054e-05, + "loss": 0.4752, + "step": 3717 + }, + { + "epoch": 0.8293553424046397, + "grad_norm": 0.15845991671085358, + "learning_rate": 1.6590545516065353e-05, + "loss": 0.5206, + "step": 3718 + }, + { + "epoch": 0.8295784073165291, + "grad_norm": 0.15479490160942078, + "learning_rate": 1.658877533089944e-05, + "loss": 0.4706, + "step": 3719 + }, + { + "epoch": 0.8298014722284185, + "grad_norm": 0.16462849080562592, + "learning_rate": 1.6587004780805704e-05, + "loss": 0.4816, + "step": 3720 + }, + { + "epoch": 0.8300245371403079, + "grad_norm": 0.17479833960533142, + "learning_rate": 1.658523386588221e-05, + "loss": 0.5083, + "step": 3721 + }, + { + "epoch": 0.8302476020521972, + "grad_norm": 0.16387352347373962, + "learning_rate": 1.658346258622704e-05, + "loss": 0.4897, + "step": 3722 + }, + { + "epoch": 0.8304706669640866, + "grad_norm": 0.16821224987506866, + "learning_rate": 1.6581690941938307e-05, + "loss": 0.4928, + "step": 3723 + }, + { + "epoch": 0.830693731875976, + "grad_norm": 0.15833254158496857, + "learning_rate": 1.657991893311412e-05, + "loss": 0.4523, + "step": 3724 + }, + { + "epoch": 0.8309167967878652, + "grad_norm": 0.15844541788101196, + "learning_rate": 1.657814655985264e-05, + "loss": 0.4989, + "step": 3725 + }, + { + "epoch": 0.8311398616997546, + "grad_norm": 0.16405586898326874, + "learning_rate": 1.657637382225202e-05, + "loss": 0.512, + "step": 3726 + }, + { + "epoch": 0.831362926611644, + "grad_norm": 0.16231343150138855, + "learning_rate": 1.6574600720410455e-05, + "loss": 0.4856, + "step": 3727 + }, + { + "epoch": 0.8315859915235333, + "grad_norm": 0.16474612057209015, + "learning_rate": 1.6572827254426145e-05, + "loss": 0.495, + "step": 3728 + }, + { + "epoch": 0.8318090564354227, + "grad_norm": 0.16462446749210358, + "learning_rate": 1.6571053424397316e-05, + "loss": 0.4931, + "step": 3729 + }, + { + "epoch": 0.8320321213473121, + "grad_norm": 0.1530153751373291, + "learning_rate": 1.6569279230422215e-05, + "loss": 0.4752, + "step": 3730 + }, + { + "epoch": 0.8322551862592015, + "grad_norm": 0.15689024329185486, + "learning_rate": 1.656750467259911e-05, + "loss": 0.4899, + "step": 3731 + }, + { + "epoch": 0.8324782511710908, + "grad_norm": 0.17177921533584595, + "learning_rate": 1.656572975102628e-05, + "loss": 0.4941, + "step": 3732 + }, + { + "epoch": 0.8327013160829801, + "grad_norm": 0.15630176663398743, + "learning_rate": 1.6563954465802042e-05, + "loss": 0.4768, + "step": 3733 + }, + { + "epoch": 0.8329243809948695, + "grad_norm": 0.17884239554405212, + "learning_rate": 1.6562178817024713e-05, + "loss": 0.5076, + "step": 3734 + }, + { + "epoch": 0.8331474459067588, + "grad_norm": 0.15849219262599945, + "learning_rate": 1.6560402804792644e-05, + "loss": 0.4948, + "step": 3735 + }, + { + "epoch": 0.8333705108186482, + "grad_norm": 0.15310537815093994, + "learning_rate": 1.65586264292042e-05, + "loss": 0.467, + "step": 3736 + }, + { + "epoch": 0.8335935757305376, + "grad_norm": 0.1629456728696823, + "learning_rate": 1.6556849690357776e-05, + "loss": 0.4794, + "step": 3737 + }, + { + "epoch": 0.833816640642427, + "grad_norm": 0.16876748204231262, + "learning_rate": 1.6555072588351765e-05, + "loss": 0.497, + "step": 3738 + }, + { + "epoch": 0.8340397055543163, + "grad_norm": 0.17010155320167542, + "learning_rate": 1.6553295123284605e-05, + "loss": 0.5199, + "step": 3739 + }, + { + "epoch": 0.8342627704662057, + "grad_norm": 0.16115908324718475, + "learning_rate": 1.6551517295254732e-05, + "loss": 0.5094, + "step": 3740 + }, + { + "epoch": 0.8344858353780951, + "grad_norm": 0.18651996552944183, + "learning_rate": 1.6549739104360627e-05, + "loss": 0.4963, + "step": 3741 + }, + { + "epoch": 0.8347089002899843, + "grad_norm": 0.16856126487255096, + "learning_rate": 1.6547960550700766e-05, + "loss": 0.4729, + "step": 3742 + }, + { + "epoch": 0.8349319652018737, + "grad_norm": 0.16244658827781677, + "learning_rate": 1.6546181634373666e-05, + "loss": 0.4964, + "step": 3743 + }, + { + "epoch": 0.8351550301137631, + "grad_norm": 0.15980856120586395, + "learning_rate": 1.654440235547785e-05, + "loss": 0.4741, + "step": 3744 + }, + { + "epoch": 0.8353780950256524, + "grad_norm": 0.15810216963291168, + "learning_rate": 1.6542622714111865e-05, + "loss": 0.4905, + "step": 3745 + }, + { + "epoch": 0.8356011599375418, + "grad_norm": 0.15649911761283875, + "learning_rate": 1.654084271037428e-05, + "loss": 0.4883, + "step": 3746 + }, + { + "epoch": 0.8358242248494312, + "grad_norm": 0.16376326978206635, + "learning_rate": 1.653906234436368e-05, + "loss": 0.4788, + "step": 3747 + }, + { + "epoch": 0.8360472897613206, + "grad_norm": 0.17131556570529938, + "learning_rate": 1.6537281616178674e-05, + "loss": 0.5121, + "step": 3748 + }, + { + "epoch": 0.8362703546732099, + "grad_norm": 0.1694851517677307, + "learning_rate": 1.6535500525917893e-05, + "loss": 0.5165, + "step": 3749 + }, + { + "epoch": 0.8364934195850993, + "grad_norm": 0.16306069493293762, + "learning_rate": 1.653371907367998e-05, + "loss": 0.493, + "step": 3750 + }, + { + "epoch": 0.8367164844969887, + "grad_norm": 0.1630067676305771, + "learning_rate": 1.6531937259563612e-05, + "loss": 0.4992, + "step": 3751 + }, + { + "epoch": 0.8369395494088779, + "grad_norm": 0.15753702819347382, + "learning_rate": 1.6530155083667468e-05, + "loss": 0.4767, + "step": 3752 + }, + { + "epoch": 0.8371626143207673, + "grad_norm": 0.1651187390089035, + "learning_rate": 1.6528372546090258e-05, + "loss": 0.4987, + "step": 3753 + }, + { + "epoch": 0.8373856792326567, + "grad_norm": 0.1615830808877945, + "learning_rate": 1.6526589646930712e-05, + "loss": 0.4932, + "step": 3754 + }, + { + "epoch": 0.8376087441445461, + "grad_norm": 0.1610192209482193, + "learning_rate": 1.6524806386287578e-05, + "loss": 0.5062, + "step": 3755 + }, + { + "epoch": 0.8378318090564354, + "grad_norm": 0.16058041155338287, + "learning_rate": 1.652302276425962e-05, + "loss": 0.4937, + "step": 3756 + }, + { + "epoch": 0.8380548739683248, + "grad_norm": 0.15800607204437256, + "learning_rate": 1.6521238780945635e-05, + "loss": 0.4814, + "step": 3757 + }, + { + "epoch": 0.8382779388802142, + "grad_norm": 0.16751538217067719, + "learning_rate": 1.6519454436444423e-05, + "loss": 0.4904, + "step": 3758 + }, + { + "epoch": 0.8385010037921035, + "grad_norm": 0.16399426758289337, + "learning_rate": 1.651766973085482e-05, + "loss": 0.5086, + "step": 3759 + }, + { + "epoch": 0.8387240687039929, + "grad_norm": 0.16619375348091125, + "learning_rate": 1.6515884664275663e-05, + "loss": 0.5192, + "step": 3760 + }, + { + "epoch": 0.8389471336158822, + "grad_norm": 0.1645645648241043, + "learning_rate": 1.651409923680583e-05, + "loss": 0.4894, + "step": 3761 + }, + { + "epoch": 0.8391701985277715, + "grad_norm": 0.16611702740192413, + "learning_rate": 1.6512313448544207e-05, + "loss": 0.4858, + "step": 3762 + }, + { + "epoch": 0.8393932634396609, + "grad_norm": 0.15403629839420319, + "learning_rate": 1.6510527299589696e-05, + "loss": 0.4844, + "step": 3763 + }, + { + "epoch": 0.8396163283515503, + "grad_norm": 0.15380257368087769, + "learning_rate": 1.6508740790041236e-05, + "loss": 0.491, + "step": 3764 + }, + { + "epoch": 0.8398393932634397, + "grad_norm": 0.1628074049949646, + "learning_rate": 1.650695391999777e-05, + "loss": 0.4666, + "step": 3765 + }, + { + "epoch": 0.840062458175329, + "grad_norm": 0.16377408802509308, + "learning_rate": 1.650516668955826e-05, + "loss": 0.4695, + "step": 3766 + }, + { + "epoch": 0.8402855230872184, + "grad_norm": 0.16199228167533875, + "learning_rate": 1.6503379098821705e-05, + "loss": 0.48, + "step": 3767 + }, + { + "epoch": 0.8405085879991078, + "grad_norm": 0.17420238256454468, + "learning_rate": 1.6501591147887108e-05, + "loss": 0.4983, + "step": 3768 + }, + { + "epoch": 0.840731652910997, + "grad_norm": 0.1635866016149521, + "learning_rate": 1.649980283685349e-05, + "loss": 0.4877, + "step": 3769 + }, + { + "epoch": 0.8409547178228864, + "grad_norm": 0.16857163608074188, + "learning_rate": 1.6498014165819908e-05, + "loss": 0.4956, + "step": 3770 + }, + { + "epoch": 0.8411777827347758, + "grad_norm": 0.17237527668476105, + "learning_rate": 1.649622513488543e-05, + "loss": 0.515, + "step": 3771 + }, + { + "epoch": 0.8414008476466652, + "grad_norm": 0.1601458191871643, + "learning_rate": 1.6494435744149142e-05, + "loss": 0.4839, + "step": 3772 + }, + { + "epoch": 0.8416239125585545, + "grad_norm": 0.1580028235912323, + "learning_rate": 1.6492645993710148e-05, + "loss": 0.4529, + "step": 3773 + }, + { + "epoch": 0.8418469774704439, + "grad_norm": 0.16191108524799347, + "learning_rate": 1.649085588366758e-05, + "loss": 0.495, + "step": 3774 + }, + { + "epoch": 0.8420700423823333, + "grad_norm": 0.16524077951908112, + "learning_rate": 1.6489065414120583e-05, + "loss": 0.4832, + "step": 3775 + }, + { + "epoch": 0.8422931072942226, + "grad_norm": 0.16424553096294403, + "learning_rate": 1.6487274585168327e-05, + "loss": 0.4811, + "step": 3776 + }, + { + "epoch": 0.842516172206112, + "grad_norm": 0.16102652251720428, + "learning_rate": 1.6485483396909997e-05, + "loss": 0.486, + "step": 3777 + }, + { + "epoch": 0.8427392371180014, + "grad_norm": 0.16395264863967896, + "learning_rate": 1.64836918494448e-05, + "loss": 0.4956, + "step": 3778 + }, + { + "epoch": 0.8429623020298908, + "grad_norm": 0.17286796867847443, + "learning_rate": 1.6481899942871967e-05, + "loss": 0.4859, + "step": 3779 + }, + { + "epoch": 0.84318536694178, + "grad_norm": 0.16825571656227112, + "learning_rate": 1.648010767729074e-05, + "loss": 0.4873, + "step": 3780 + }, + { + "epoch": 0.8434084318536694, + "grad_norm": 0.14970429241657257, + "learning_rate": 1.647831505280039e-05, + "loss": 0.4725, + "step": 3781 + }, + { + "epoch": 0.8436314967655588, + "grad_norm": 0.1689714640378952, + "learning_rate": 1.64765220695002e-05, + "loss": 0.4865, + "step": 3782 + }, + { + "epoch": 0.8438545616774481, + "grad_norm": 0.18679800629615784, + "learning_rate": 1.647472872748948e-05, + "loss": 0.5144, + "step": 3783 + }, + { + "epoch": 0.8440776265893375, + "grad_norm": 0.15957824885845184, + "learning_rate": 1.6472935026867555e-05, + "loss": 0.4969, + "step": 3784 + }, + { + "epoch": 0.8443006915012269, + "grad_norm": 0.1743471622467041, + "learning_rate": 1.6471140967733772e-05, + "loss": 0.4945, + "step": 3785 + }, + { + "epoch": 0.8445237564131162, + "grad_norm": 0.17243602871894836, + "learning_rate": 1.64693465501875e-05, + "loss": 0.4963, + "step": 3786 + }, + { + "epoch": 0.8447468213250056, + "grad_norm": 0.1854105442762375, + "learning_rate": 1.646755177432812e-05, + "loss": 0.4655, + "step": 3787 + }, + { + "epoch": 0.844969886236895, + "grad_norm": 0.1698131114244461, + "learning_rate": 1.6465756640255038e-05, + "loss": 0.5208, + "step": 3788 + }, + { + "epoch": 0.8451929511487843, + "grad_norm": 0.17987042665481567, + "learning_rate": 1.6463961148067685e-05, + "loss": 0.5031, + "step": 3789 + }, + { + "epoch": 0.8454160160606736, + "grad_norm": 0.15817251801490784, + "learning_rate": 1.6462165297865503e-05, + "loss": 0.4892, + "step": 3790 + }, + { + "epoch": 0.845639080972563, + "grad_norm": 0.16425161063671112, + "learning_rate": 1.6460369089747956e-05, + "loss": 0.4867, + "step": 3791 + }, + { + "epoch": 0.8458621458844524, + "grad_norm": 0.164134219288826, + "learning_rate": 1.6458572523814535e-05, + "loss": 0.5063, + "step": 3792 + }, + { + "epoch": 0.8460852107963417, + "grad_norm": 0.1651495099067688, + "learning_rate": 1.6456775600164737e-05, + "loss": 0.5168, + "step": 3793 + }, + { + "epoch": 0.8463082757082311, + "grad_norm": 0.1596038043498993, + "learning_rate": 1.6454978318898093e-05, + "loss": 0.4948, + "step": 3794 + }, + { + "epoch": 0.8465313406201205, + "grad_norm": 0.16954916715621948, + "learning_rate": 1.645318068011415e-05, + "loss": 0.5364, + "step": 3795 + }, + { + "epoch": 0.8467544055320099, + "grad_norm": 0.16513592004776, + "learning_rate": 1.6451382683912468e-05, + "loss": 0.4905, + "step": 3796 + }, + { + "epoch": 0.8469774704438991, + "grad_norm": 0.17096330225467682, + "learning_rate": 1.6449584330392627e-05, + "loss": 0.4687, + "step": 3797 + }, + { + "epoch": 0.8472005353557885, + "grad_norm": 0.16846898198127747, + "learning_rate": 1.644778561965424e-05, + "loss": 0.4803, + "step": 3798 + }, + { + "epoch": 0.8474236002676779, + "grad_norm": 0.16746586561203003, + "learning_rate": 1.644598655179693e-05, + "loss": 0.5015, + "step": 3799 + }, + { + "epoch": 0.8476466651795672, + "grad_norm": 0.16072294116020203, + "learning_rate": 1.6444187126920334e-05, + "loss": 0.4993, + "step": 3800 + }, + { + "epoch": 0.8478697300914566, + "grad_norm": 0.18298223614692688, + "learning_rate": 1.644238734512412e-05, + "loss": 0.4923, + "step": 3801 + }, + { + "epoch": 0.848092795003346, + "grad_norm": 0.1531648337841034, + "learning_rate": 1.6440587206507972e-05, + "loss": 0.4731, + "step": 3802 + }, + { + "epoch": 0.8483158599152353, + "grad_norm": 0.16385571658611298, + "learning_rate": 1.6438786711171588e-05, + "loss": 0.4718, + "step": 3803 + }, + { + "epoch": 0.8485389248271247, + "grad_norm": 0.15524934232234955, + "learning_rate": 1.6436985859214698e-05, + "loss": 0.4886, + "step": 3804 + }, + { + "epoch": 0.8487619897390141, + "grad_norm": 0.16199147701263428, + "learning_rate": 1.643518465073704e-05, + "loss": 0.5056, + "step": 3805 + }, + { + "epoch": 0.8489850546509035, + "grad_norm": 0.17070268094539642, + "learning_rate": 1.6433383085838378e-05, + "loss": 0.5157, + "step": 3806 + }, + { + "epoch": 0.8492081195627927, + "grad_norm": 0.19035401940345764, + "learning_rate": 1.643158116461849e-05, + "loss": 0.5135, + "step": 3807 + }, + { + "epoch": 0.8494311844746821, + "grad_norm": 0.16606462001800537, + "learning_rate": 1.6429778887177182e-05, + "loss": 0.5058, + "step": 3808 + }, + { + "epoch": 0.8496542493865715, + "grad_norm": 0.16338834166526794, + "learning_rate": 1.6427976253614275e-05, + "loss": 0.4914, + "step": 3809 + }, + { + "epoch": 0.8498773142984608, + "grad_norm": 0.16145442426204681, + "learning_rate": 1.6426173264029614e-05, + "loss": 0.4999, + "step": 3810 + }, + { + "epoch": 0.8501003792103502, + "grad_norm": 0.16404959559440613, + "learning_rate": 1.642436991852305e-05, + "loss": 0.5014, + "step": 3811 + }, + { + "epoch": 0.8503234441222396, + "grad_norm": 0.16108281910419464, + "learning_rate": 1.642256621719447e-05, + "loss": 0.4702, + "step": 3812 + }, + { + "epoch": 0.850546509034129, + "grad_norm": 0.15342526137828827, + "learning_rate": 1.642076216014377e-05, + "loss": 0.4686, + "step": 3813 + }, + { + "epoch": 0.8507695739460183, + "grad_norm": 0.16050684452056885, + "learning_rate": 1.6418957747470877e-05, + "loss": 0.4729, + "step": 3814 + }, + { + "epoch": 0.8509926388579077, + "grad_norm": 0.17533640563488007, + "learning_rate": 1.641715297927573e-05, + "loss": 0.4926, + "step": 3815 + }, + { + "epoch": 0.851215703769797, + "grad_norm": 0.16416087746620178, + "learning_rate": 1.641534785565828e-05, + "loss": 0.513, + "step": 3816 + }, + { + "epoch": 0.8514387686816863, + "grad_norm": 0.1623440384864807, + "learning_rate": 1.6413542376718513e-05, + "loss": 0.49, + "step": 3817 + }, + { + "epoch": 0.8516618335935757, + "grad_norm": 0.1671840250492096, + "learning_rate": 1.641173654255643e-05, + "loss": 0.5044, + "step": 3818 + }, + { + "epoch": 0.8518848985054651, + "grad_norm": 0.1557224541902542, + "learning_rate": 1.640993035327204e-05, + "loss": 0.4984, + "step": 3819 + }, + { + "epoch": 0.8521079634173544, + "grad_norm": 0.15791340172290802, + "learning_rate": 1.6408123808965392e-05, + "loss": 0.4801, + "step": 3820 + }, + { + "epoch": 0.8523310283292438, + "grad_norm": 0.1637008786201477, + "learning_rate": 1.6406316909736536e-05, + "loss": 0.4989, + "step": 3821 + }, + { + "epoch": 0.8525540932411332, + "grad_norm": 0.16228072345256805, + "learning_rate": 1.6404509655685555e-05, + "loss": 0.4568, + "step": 3822 + }, + { + "epoch": 0.8527771581530226, + "grad_norm": 0.16301074624061584, + "learning_rate": 1.640270204691254e-05, + "loss": 0.4741, + "step": 3823 + }, + { + "epoch": 0.8530002230649119, + "grad_norm": 0.15945035219192505, + "learning_rate": 1.6400894083517612e-05, + "loss": 0.4885, + "step": 3824 + }, + { + "epoch": 0.8532232879768012, + "grad_norm": 0.16819296777248383, + "learning_rate": 1.639908576560091e-05, + "loss": 0.5147, + "step": 3825 + }, + { + "epoch": 0.8534463528886906, + "grad_norm": 0.16270814836025238, + "learning_rate": 1.6397277093262583e-05, + "loss": 0.5147, + "step": 3826 + }, + { + "epoch": 0.8536694178005799, + "grad_norm": 0.16252903640270233, + "learning_rate": 1.6395468066602812e-05, + "loss": 0.5019, + "step": 3827 + }, + { + "epoch": 0.8538924827124693, + "grad_norm": 0.16933606564998627, + "learning_rate": 1.6393658685721787e-05, + "loss": 0.5085, + "step": 3828 + }, + { + "epoch": 0.8541155476243587, + "grad_norm": 0.17316611111164093, + "learning_rate": 1.639184895071973e-05, + "loss": 0.5141, + "step": 3829 + }, + { + "epoch": 0.8543386125362481, + "grad_norm": 0.17270614206790924, + "learning_rate": 1.6390038861696868e-05, + "loss": 0.4894, + "step": 3830 + }, + { + "epoch": 0.8545616774481374, + "grad_norm": 0.1740892380475998, + "learning_rate": 1.638822841875346e-05, + "loss": 0.5061, + "step": 3831 + }, + { + "epoch": 0.8547847423600268, + "grad_norm": 0.18361349403858185, + "learning_rate": 1.638641762198978e-05, + "loss": 0.4868, + "step": 3832 + }, + { + "epoch": 0.8550078072719162, + "grad_norm": 0.18127956986427307, + "learning_rate": 1.638460647150612e-05, + "loss": 0.5208, + "step": 3833 + }, + { + "epoch": 0.8552308721838054, + "grad_norm": 0.1610213816165924, + "learning_rate": 1.6382794967402792e-05, + "loss": 0.4749, + "step": 3834 + }, + { + "epoch": 0.8554539370956948, + "grad_norm": 0.16226479411125183, + "learning_rate": 1.638098310978013e-05, + "loss": 0.4872, + "step": 3835 + }, + { + "epoch": 0.8556770020075842, + "grad_norm": 0.15979908406734467, + "learning_rate": 1.6379170898738483e-05, + "loss": 0.5036, + "step": 3836 + }, + { + "epoch": 0.8559000669194735, + "grad_norm": 0.1549946367740631, + "learning_rate": 1.6377358334378228e-05, + "loss": 0.4858, + "step": 3837 + }, + { + "epoch": 0.8561231318313629, + "grad_norm": 0.16230742633342743, + "learning_rate": 1.6375545416799756e-05, + "loss": 0.5084, + "step": 3838 + }, + { + "epoch": 0.8563461967432523, + "grad_norm": 0.15997640788555145, + "learning_rate": 1.6373732146103466e-05, + "loss": 0.5162, + "step": 3839 + }, + { + "epoch": 0.8565692616551417, + "grad_norm": 0.1616683304309845, + "learning_rate": 1.6371918522389804e-05, + "loss": 0.471, + "step": 3840 + }, + { + "epoch": 0.856792326567031, + "grad_norm": 0.15914194285869598, + "learning_rate": 1.637010454575921e-05, + "loss": 0.5039, + "step": 3841 + }, + { + "epoch": 0.8570153914789204, + "grad_norm": 0.16176219284534454, + "learning_rate": 1.636829021631216e-05, + "loss": 0.4903, + "step": 3842 + }, + { + "epoch": 0.8572384563908098, + "grad_norm": 0.1591506451368332, + "learning_rate": 1.636647553414914e-05, + "loss": 0.4833, + "step": 3843 + }, + { + "epoch": 0.857461521302699, + "grad_norm": 0.16955386102199554, + "learning_rate": 1.6364660499370656e-05, + "loss": 0.5088, + "step": 3844 + }, + { + "epoch": 0.8576845862145884, + "grad_norm": 0.15826305747032166, + "learning_rate": 1.636284511207724e-05, + "loss": 0.4985, + "step": 3845 + }, + { + "epoch": 0.8579076511264778, + "grad_norm": 0.16054044663906097, + "learning_rate": 1.6361029372369433e-05, + "loss": 0.4991, + "step": 3846 + }, + { + "epoch": 0.8581307160383672, + "grad_norm": 0.16432225704193115, + "learning_rate": 1.6359213280347814e-05, + "loss": 0.4927, + "step": 3847 + }, + { + "epoch": 0.8583537809502565, + "grad_norm": 0.17022813856601715, + "learning_rate": 1.6357396836112957e-05, + "loss": 0.4863, + "step": 3848 + }, + { + "epoch": 0.8585768458621459, + "grad_norm": 0.15264475345611572, + "learning_rate": 1.6355580039765478e-05, + "loss": 0.4891, + "step": 3849 + }, + { + "epoch": 0.8587999107740353, + "grad_norm": 0.1790938675403595, + "learning_rate": 1.6353762891405993e-05, + "loss": 0.517, + "step": 3850 + }, + { + "epoch": 0.8590229756859246, + "grad_norm": 0.15989623963832855, + "learning_rate": 1.6351945391135154e-05, + "loss": 0.4684, + "step": 3851 + }, + { + "epoch": 0.859246040597814, + "grad_norm": 0.16801711916923523, + "learning_rate": 1.6350127539053626e-05, + "loss": 0.4782, + "step": 3852 + }, + { + "epoch": 0.8594691055097033, + "grad_norm": 0.15932013094425201, + "learning_rate": 1.634830933526209e-05, + "loss": 0.4805, + "step": 3853 + }, + { + "epoch": 0.8596921704215927, + "grad_norm": 0.16254423558712006, + "learning_rate": 1.6346490779861252e-05, + "loss": 0.5063, + "step": 3854 + }, + { + "epoch": 0.859915235333482, + "grad_norm": 0.1786786913871765, + "learning_rate": 1.634467187295183e-05, + "loss": 0.5457, + "step": 3855 + }, + { + "epoch": 0.8601383002453714, + "grad_norm": 0.15343046188354492, + "learning_rate": 1.6342852614634575e-05, + "loss": 0.4681, + "step": 3856 + }, + { + "epoch": 0.8603613651572608, + "grad_norm": 0.16010713577270508, + "learning_rate": 1.634103300501024e-05, + "loss": 0.5131, + "step": 3857 + }, + { + "epoch": 0.8605844300691501, + "grad_norm": 0.1694883406162262, + "learning_rate": 1.6339213044179612e-05, + "loss": 0.5117, + "step": 3858 + }, + { + "epoch": 0.8608074949810395, + "grad_norm": 0.1592147946357727, + "learning_rate": 1.6337392732243488e-05, + "loss": 0.5026, + "step": 3859 + }, + { + "epoch": 0.8610305598929289, + "grad_norm": 0.15784434974193573, + "learning_rate": 1.6335572069302694e-05, + "loss": 0.4705, + "step": 3860 + }, + { + "epoch": 0.8612536248048182, + "grad_norm": 0.15071116387844086, + "learning_rate": 1.6333751055458065e-05, + "loss": 0.4781, + "step": 3861 + }, + { + "epoch": 0.8614766897167075, + "grad_norm": 0.16125838458538055, + "learning_rate": 1.6331929690810464e-05, + "loss": 0.5079, + "step": 3862 + }, + { + "epoch": 0.8616997546285969, + "grad_norm": 0.16050268709659576, + "learning_rate": 1.6330107975460764e-05, + "loss": 0.4983, + "step": 3863 + }, + { + "epoch": 0.8619228195404863, + "grad_norm": 0.15100105106830597, + "learning_rate": 1.632828590950987e-05, + "loss": 0.46, + "step": 3864 + }, + { + "epoch": 0.8621458844523756, + "grad_norm": 0.17107254266738892, + "learning_rate": 1.632646349305869e-05, + "loss": 0.4708, + "step": 3865 + }, + { + "epoch": 0.862368949364265, + "grad_norm": 0.15665309131145477, + "learning_rate": 1.6324640726208172e-05, + "loss": 0.4786, + "step": 3866 + }, + { + "epoch": 0.8625920142761544, + "grad_norm": 0.1617649793624878, + "learning_rate": 1.6322817609059267e-05, + "loss": 0.5078, + "step": 3867 + }, + { + "epoch": 0.8628150791880437, + "grad_norm": 0.15603747963905334, + "learning_rate": 1.6320994141712948e-05, + "loss": 0.4939, + "step": 3868 + }, + { + "epoch": 0.8630381440999331, + "grad_norm": 0.14941559731960297, + "learning_rate": 1.6319170324270212e-05, + "loss": 0.4637, + "step": 3869 + }, + { + "epoch": 0.8632612090118225, + "grad_norm": 0.1626354604959488, + "learning_rate": 1.631734615683208e-05, + "loss": 0.4715, + "step": 3870 + }, + { + "epoch": 0.8634842739237119, + "grad_norm": 0.15691153705120087, + "learning_rate": 1.6315521639499573e-05, + "loss": 0.5017, + "step": 3871 + }, + { + "epoch": 0.8637073388356011, + "grad_norm": 0.19187089800834656, + "learning_rate": 1.6313696772373754e-05, + "loss": 0.5369, + "step": 3872 + }, + { + "epoch": 0.8639304037474905, + "grad_norm": 0.16823086142539978, + "learning_rate": 1.6311871555555696e-05, + "loss": 0.4892, + "step": 3873 + }, + { + "epoch": 0.8641534686593799, + "grad_norm": 0.17205961048603058, + "learning_rate": 1.6310045989146486e-05, + "loss": 0.5472, + "step": 3874 + }, + { + "epoch": 0.8643765335712692, + "grad_norm": 0.1610354632139206, + "learning_rate": 1.6308220073247237e-05, + "loss": 0.4887, + "step": 3875 + }, + { + "epoch": 0.8645995984831586, + "grad_norm": 0.21832773089408875, + "learning_rate": 1.6306393807959078e-05, + "loss": 0.5081, + "step": 3876 + }, + { + "epoch": 0.864822663395048, + "grad_norm": 0.17104685306549072, + "learning_rate": 1.6304567193383164e-05, + "loss": 0.5179, + "step": 3877 + }, + { + "epoch": 0.8650457283069373, + "grad_norm": 0.15571469068527222, + "learning_rate": 1.6302740229620662e-05, + "loss": 0.4773, + "step": 3878 + }, + { + "epoch": 0.8652687932188267, + "grad_norm": 0.16199520230293274, + "learning_rate": 1.630091291677276e-05, + "loss": 0.4972, + "step": 3879 + }, + { + "epoch": 0.865491858130716, + "grad_norm": 0.17316444218158722, + "learning_rate": 1.6299085254940664e-05, + "loss": 0.4961, + "step": 3880 + }, + { + "epoch": 0.8657149230426054, + "grad_norm": 0.16597145795822144, + "learning_rate": 1.6297257244225602e-05, + "loss": 0.4893, + "step": 3881 + }, + { + "epoch": 0.8659379879544947, + "grad_norm": 0.1571418046951294, + "learning_rate": 1.6295428884728827e-05, + "loss": 0.4957, + "step": 3882 + }, + { + "epoch": 0.8661610528663841, + "grad_norm": 0.15437713265419006, + "learning_rate": 1.62936001765516e-05, + "loss": 0.4912, + "step": 3883 + }, + { + "epoch": 0.8663841177782735, + "grad_norm": 0.15803179144859314, + "learning_rate": 1.6291771119795202e-05, + "loss": 0.5012, + "step": 3884 + }, + { + "epoch": 0.8666071826901628, + "grad_norm": 0.1526332050561905, + "learning_rate": 1.628994171456095e-05, + "loss": 0.48, + "step": 3885 + }, + { + "epoch": 0.8668302476020522, + "grad_norm": 0.16080142557621002, + "learning_rate": 1.628811196095016e-05, + "loss": 0.4885, + "step": 3886 + }, + { + "epoch": 0.8670533125139416, + "grad_norm": 0.16212552785873413, + "learning_rate": 1.628628185906417e-05, + "loss": 0.5049, + "step": 3887 + }, + { + "epoch": 0.867276377425831, + "grad_norm": 0.1618081033229828, + "learning_rate": 1.6284451409004352e-05, + "loss": 0.4955, + "step": 3888 + }, + { + "epoch": 0.8674994423377203, + "grad_norm": 0.15760111808776855, + "learning_rate": 1.628262061087208e-05, + "loss": 0.4808, + "step": 3889 + }, + { + "epoch": 0.8677225072496096, + "grad_norm": 0.15690281987190247, + "learning_rate": 1.6280789464768765e-05, + "loss": 0.5198, + "step": 3890 + }, + { + "epoch": 0.867945572161499, + "grad_norm": 0.17230992019176483, + "learning_rate": 1.6278957970795818e-05, + "loss": 0.4762, + "step": 3891 + }, + { + "epoch": 0.8681686370733883, + "grad_norm": 0.15698418021202087, + "learning_rate": 1.6277126129054687e-05, + "loss": 0.4969, + "step": 3892 + }, + { + "epoch": 0.8683917019852777, + "grad_norm": 0.15693382918834686, + "learning_rate": 1.6275293939646822e-05, + "loss": 0.4856, + "step": 3893 + }, + { + "epoch": 0.8686147668971671, + "grad_norm": 0.1605808138847351, + "learning_rate": 1.6273461402673706e-05, + "loss": 0.4739, + "step": 3894 + }, + { + "epoch": 0.8688378318090564, + "grad_norm": 0.2007637321949005, + "learning_rate": 1.6271628518236836e-05, + "loss": 0.5115, + "step": 3895 + }, + { + "epoch": 0.8690608967209458, + "grad_norm": 0.17741620540618896, + "learning_rate": 1.6269795286437728e-05, + "loss": 0.5034, + "step": 3896 + }, + { + "epoch": 0.8692839616328352, + "grad_norm": 0.1685972362756729, + "learning_rate": 1.6267961707377923e-05, + "loss": 0.5316, + "step": 3897 + }, + { + "epoch": 0.8695070265447246, + "grad_norm": 0.16251643002033234, + "learning_rate": 1.6266127781158965e-05, + "loss": 0.5018, + "step": 3898 + }, + { + "epoch": 0.8697300914566138, + "grad_norm": 0.15942230820655823, + "learning_rate": 1.626429350788244e-05, + "loss": 0.4799, + "step": 3899 + }, + { + "epoch": 0.8699531563685032, + "grad_norm": 0.16205990314483643, + "learning_rate": 1.6262458887649933e-05, + "loss": 0.4721, + "step": 3900 + }, + { + "epoch": 0.8701762212803926, + "grad_norm": 0.16467221081256866, + "learning_rate": 1.6260623920563062e-05, + "loss": 0.5137, + "step": 3901 + }, + { + "epoch": 0.8703992861922819, + "grad_norm": 0.16359251737594604, + "learning_rate": 1.6258788606723457e-05, + "loss": 0.5063, + "step": 3902 + }, + { + "epoch": 0.8706223511041713, + "grad_norm": 0.16454778611660004, + "learning_rate": 1.625695294623277e-05, + "loss": 0.4847, + "step": 3903 + }, + { + "epoch": 0.8708454160160607, + "grad_norm": 0.1625732183456421, + "learning_rate": 1.625511693919267e-05, + "loss": 0.5324, + "step": 3904 + }, + { + "epoch": 0.8710684809279501, + "grad_norm": 0.16343548893928528, + "learning_rate": 1.625328058570485e-05, + "loss": 0.5012, + "step": 3905 + }, + { + "epoch": 0.8712915458398394, + "grad_norm": 0.15868264436721802, + "learning_rate": 1.6251443885871013e-05, + "loss": 0.4921, + "step": 3906 + }, + { + "epoch": 0.8715146107517288, + "grad_norm": 0.1692667156457901, + "learning_rate": 1.6249606839792897e-05, + "loss": 0.4861, + "step": 3907 + }, + { + "epoch": 0.8717376756636182, + "grad_norm": 0.15825539827346802, + "learning_rate": 1.6247769447572235e-05, + "loss": 0.4828, + "step": 3908 + }, + { + "epoch": 0.8719607405755074, + "grad_norm": 0.1585260033607483, + "learning_rate": 1.6245931709310806e-05, + "loss": 0.4885, + "step": 3909 + }, + { + "epoch": 0.8721838054873968, + "grad_norm": 0.1593310832977295, + "learning_rate": 1.624409362511039e-05, + "loss": 0.5158, + "step": 3910 + }, + { + "epoch": 0.8724068703992862, + "grad_norm": 0.15775549411773682, + "learning_rate": 1.624225519507279e-05, + "loss": 0.4812, + "step": 3911 + }, + { + "epoch": 0.8726299353111755, + "grad_norm": 0.16018901765346527, + "learning_rate": 1.624041641929983e-05, + "loss": 0.5123, + "step": 3912 + }, + { + "epoch": 0.8728530002230649, + "grad_norm": 0.16512146592140198, + "learning_rate": 1.6238577297893357e-05, + "loss": 0.4886, + "step": 3913 + }, + { + "epoch": 0.8730760651349543, + "grad_norm": 0.16052314639091492, + "learning_rate": 1.6236737830955233e-05, + "loss": 0.5078, + "step": 3914 + }, + { + "epoch": 0.8732991300468437, + "grad_norm": 0.16342321038246155, + "learning_rate": 1.6234898018587336e-05, + "loss": 0.5151, + "step": 3915 + }, + { + "epoch": 0.873522194958733, + "grad_norm": 0.16125398874282837, + "learning_rate": 1.6233057860891566e-05, + "loss": 0.4786, + "step": 3916 + }, + { + "epoch": 0.8737452598706223, + "grad_norm": 0.17200647294521332, + "learning_rate": 1.623121735796985e-05, + "loss": 0.5013, + "step": 3917 + }, + { + "epoch": 0.8739683247825117, + "grad_norm": 0.17798694968223572, + "learning_rate": 1.6229376509924116e-05, + "loss": 0.5257, + "step": 3918 + }, + { + "epoch": 0.874191389694401, + "grad_norm": 0.15707524120807648, + "learning_rate": 1.6227535316856326e-05, + "loss": 0.4889, + "step": 3919 + }, + { + "epoch": 0.8744144546062904, + "grad_norm": 0.16776537895202637, + "learning_rate": 1.622569377886846e-05, + "loss": 0.5196, + "step": 3920 + }, + { + "epoch": 0.8746375195181798, + "grad_norm": 0.16779324412345886, + "learning_rate": 1.622385189606251e-05, + "loss": 0.512, + "step": 3921 + }, + { + "epoch": 0.8748605844300692, + "grad_norm": 0.16673165559768677, + "learning_rate": 1.622200966854049e-05, + "loss": 0.4888, + "step": 3922 + }, + { + "epoch": 0.8750836493419585, + "grad_norm": 0.18797388672828674, + "learning_rate": 1.622016709640444e-05, + "loss": 0.4789, + "step": 3923 + }, + { + "epoch": 0.8753067142538479, + "grad_norm": 0.21027544140815735, + "learning_rate": 1.621832417975641e-05, + "loss": 0.4818, + "step": 3924 + }, + { + "epoch": 0.8755297791657373, + "grad_norm": 0.16859135031700134, + "learning_rate": 1.621648091869847e-05, + "loss": 0.4949, + "step": 3925 + }, + { + "epoch": 0.8757528440776265, + "grad_norm": 0.1632431447505951, + "learning_rate": 1.6214637313332714e-05, + "loss": 0.4869, + "step": 3926 + }, + { + "epoch": 0.8759759089895159, + "grad_norm": 0.1545240879058838, + "learning_rate": 1.6212793363761253e-05, + "loss": 0.495, + "step": 3927 + }, + { + "epoch": 0.8761989739014053, + "grad_norm": 0.162541463971138, + "learning_rate": 1.621094907008621e-05, + "loss": 0.507, + "step": 3928 + }, + { + "epoch": 0.8764220388132947, + "grad_norm": 0.16767984628677368, + "learning_rate": 1.6209104432409745e-05, + "loss": 0.5098, + "step": 3929 + }, + { + "epoch": 0.876645103725184, + "grad_norm": 0.17964600026607513, + "learning_rate": 1.6207259450834022e-05, + "loss": 0.5055, + "step": 3930 + }, + { + "epoch": 0.8768681686370734, + "grad_norm": 0.16593998670578003, + "learning_rate": 1.620541412546122e-05, + "loss": 0.4944, + "step": 3931 + }, + { + "epoch": 0.8770912335489628, + "grad_norm": 0.15954278409481049, + "learning_rate": 1.6203568456393554e-05, + "loss": 0.5158, + "step": 3932 + }, + { + "epoch": 0.8773142984608521, + "grad_norm": 0.1770762950181961, + "learning_rate": 1.620172244373324e-05, + "loss": 0.4886, + "step": 3933 + }, + { + "epoch": 0.8775373633727415, + "grad_norm": 0.16538506746292114, + "learning_rate": 1.619987608758253e-05, + "loss": 0.4563, + "step": 3934 + }, + { + "epoch": 0.8777604282846309, + "grad_norm": 0.1733943372964859, + "learning_rate": 1.6198029388043685e-05, + "loss": 0.5131, + "step": 3935 + }, + { + "epoch": 0.8779834931965201, + "grad_norm": 0.15919508039951324, + "learning_rate": 1.619618234521898e-05, + "loss": 0.4904, + "step": 3936 + }, + { + "epoch": 0.8782065581084095, + "grad_norm": 0.1546212136745453, + "learning_rate": 1.6194334959210726e-05, + "loss": 0.4827, + "step": 3937 + }, + { + "epoch": 0.8784296230202989, + "grad_norm": 0.16514717042446136, + "learning_rate": 1.6192487230121236e-05, + "loss": 0.501, + "step": 3938 + }, + { + "epoch": 0.8786526879321883, + "grad_norm": 0.1485379934310913, + "learning_rate": 1.6190639158052852e-05, + "loss": 0.47, + "step": 3939 + }, + { + "epoch": 0.8788757528440776, + "grad_norm": 0.1519124060869217, + "learning_rate": 1.618879074310793e-05, + "loss": 0.484, + "step": 3940 + }, + { + "epoch": 0.879098817755967, + "grad_norm": 0.16282908618450165, + "learning_rate": 1.618694198538885e-05, + "loss": 0.5071, + "step": 3941 + }, + { + "epoch": 0.8793218826678564, + "grad_norm": 0.16473978757858276, + "learning_rate": 1.6185092884998e-05, + "loss": 0.5203, + "step": 3942 + }, + { + "epoch": 0.8795449475797457, + "grad_norm": 0.15753906965255737, + "learning_rate": 1.6183243442037807e-05, + "loss": 0.4989, + "step": 3943 + }, + { + "epoch": 0.879768012491635, + "grad_norm": 0.1654537469148636, + "learning_rate": 1.6181393656610693e-05, + "loss": 0.4749, + "step": 3944 + }, + { + "epoch": 0.8799910774035244, + "grad_norm": 0.17788641154766083, + "learning_rate": 1.6179543528819116e-05, + "loss": 0.4961, + "step": 3945 + }, + { + "epoch": 0.8802141423154138, + "grad_norm": 0.16031092405319214, + "learning_rate": 1.617769305876555e-05, + "loss": 0.5006, + "step": 3946 + }, + { + "epoch": 0.8804372072273031, + "grad_norm": 0.16216473281383514, + "learning_rate": 1.6175842246552484e-05, + "loss": 0.5012, + "step": 3947 + }, + { + "epoch": 0.8806602721391925, + "grad_norm": 0.3736167848110199, + "learning_rate": 1.6173991092282424e-05, + "loss": 0.481, + "step": 3948 + }, + { + "epoch": 0.8808833370510819, + "grad_norm": 0.16511203348636627, + "learning_rate": 1.6172139596057902e-05, + "loss": 0.4837, + "step": 3949 + }, + { + "epoch": 0.8811064019629712, + "grad_norm": 0.16273178160190582, + "learning_rate": 1.6170287757981468e-05, + "loss": 0.494, + "step": 3950 + }, + { + "epoch": 0.8813294668748606, + "grad_norm": 0.17184849083423615, + "learning_rate": 1.616843557815568e-05, + "loss": 0.5257, + "step": 3951 + }, + { + "epoch": 0.88155253178675, + "grad_norm": 0.16385000944137573, + "learning_rate": 1.6166583056683132e-05, + "loss": 0.5086, + "step": 3952 + }, + { + "epoch": 0.8817755966986393, + "grad_norm": 0.16628234088420868, + "learning_rate": 1.6164730193666423e-05, + "loss": 0.5035, + "step": 3953 + }, + { + "epoch": 0.8819986616105286, + "grad_norm": 0.16098076105117798, + "learning_rate": 1.616287698920818e-05, + "loss": 0.4979, + "step": 3954 + }, + { + "epoch": 0.882221726522418, + "grad_norm": 0.16197021305561066, + "learning_rate": 1.6161023443411044e-05, + "loss": 0.4883, + "step": 3955 + }, + { + "epoch": 0.8824447914343074, + "grad_norm": 0.16163140535354614, + "learning_rate": 1.6159169556377672e-05, + "loss": 0.4899, + "step": 3956 + }, + { + "epoch": 0.8826678563461967, + "grad_norm": 0.16022367775440216, + "learning_rate": 1.615731532821075e-05, + "loss": 0.4832, + "step": 3957 + }, + { + "epoch": 0.8828909212580861, + "grad_norm": 0.16164539754390717, + "learning_rate": 1.615546075901297e-05, + "loss": 0.5016, + "step": 3958 + }, + { + "epoch": 0.8831139861699755, + "grad_norm": 0.17649579048156738, + "learning_rate": 1.615360584888706e-05, + "loss": 0.4918, + "step": 3959 + }, + { + "epoch": 0.8833370510818648, + "grad_norm": 0.16985541582107544, + "learning_rate": 1.6151750597935746e-05, + "loss": 0.4947, + "step": 3960 + }, + { + "epoch": 0.8835601159937542, + "grad_norm": 0.1681845337152481, + "learning_rate": 1.6149895006261788e-05, + "loss": 0.5169, + "step": 3961 + }, + { + "epoch": 0.8837831809056436, + "grad_norm": 0.1974424421787262, + "learning_rate": 1.6148039073967964e-05, + "loss": 0.5077, + "step": 3962 + }, + { + "epoch": 0.884006245817533, + "grad_norm": 0.1644642949104309, + "learning_rate": 1.614618280115706e-05, + "loss": 0.4981, + "step": 3963 + }, + { + "epoch": 0.8842293107294222, + "grad_norm": 0.16741390526294708, + "learning_rate": 1.6144326187931893e-05, + "loss": 0.4821, + "step": 3964 + }, + { + "epoch": 0.8844523756413116, + "grad_norm": 0.17722152173519135, + "learning_rate": 1.614246923439529e-05, + "loss": 0.4782, + "step": 3965 + }, + { + "epoch": 0.884675440553201, + "grad_norm": 0.17287255823612213, + "learning_rate": 1.6140611940650104e-05, + "loss": 0.4996, + "step": 3966 + }, + { + "epoch": 0.8848985054650903, + "grad_norm": 0.17982237040996552, + "learning_rate": 1.6138754306799206e-05, + "loss": 0.4567, + "step": 3967 + }, + { + "epoch": 0.8851215703769797, + "grad_norm": 0.1712944060564041, + "learning_rate": 1.6136896332945474e-05, + "loss": 0.4789, + "step": 3968 + }, + { + "epoch": 0.8853446352888691, + "grad_norm": 0.16616348922252655, + "learning_rate": 1.6135038019191823e-05, + "loss": 0.509, + "step": 3969 + }, + { + "epoch": 0.8855677002007584, + "grad_norm": 0.16956603527069092, + "learning_rate": 1.6133179365641178e-05, + "loss": 0.4906, + "step": 3970 + }, + { + "epoch": 0.8857907651126478, + "grad_norm": 0.15731406211853027, + "learning_rate": 1.613132037239648e-05, + "loss": 0.4911, + "step": 3971 + }, + { + "epoch": 0.8860138300245372, + "grad_norm": 0.16865961253643036, + "learning_rate": 1.6129461039560693e-05, + "loss": 0.5072, + "step": 3972 + }, + { + "epoch": 0.8862368949364265, + "grad_norm": 0.16574735939502716, + "learning_rate": 1.6127601367236793e-05, + "loss": 0.4807, + "step": 3973 + }, + { + "epoch": 0.8864599598483158, + "grad_norm": 0.16178300976753235, + "learning_rate": 1.6125741355527788e-05, + "loss": 0.5138, + "step": 3974 + }, + { + "epoch": 0.8866830247602052, + "grad_norm": 0.163814976811409, + "learning_rate": 1.6123881004536696e-05, + "loss": 0.4999, + "step": 3975 + }, + { + "epoch": 0.8869060896720946, + "grad_norm": 0.1648682951927185, + "learning_rate": 1.612202031436655e-05, + "loss": 0.4621, + "step": 3976 + }, + { + "epoch": 0.8871291545839839, + "grad_norm": 0.1537158489227295, + "learning_rate": 1.6120159285120417e-05, + "loss": 0.4678, + "step": 3977 + }, + { + "epoch": 0.8873522194958733, + "grad_norm": 0.17030583322048187, + "learning_rate": 1.6118297916901357e-05, + "loss": 0.5142, + "step": 3978 + }, + { + "epoch": 0.8875752844077627, + "grad_norm": 0.15777969360351562, + "learning_rate": 1.6116436209812476e-05, + "loss": 0.4857, + "step": 3979 + }, + { + "epoch": 0.8877983493196521, + "grad_norm": 0.16635335981845856, + "learning_rate": 1.6114574163956883e-05, + "loss": 0.4945, + "step": 3980 + }, + { + "epoch": 0.8880214142315414, + "grad_norm": 0.16205914318561554, + "learning_rate": 1.611271177943771e-05, + "loss": 0.5013, + "step": 3981 + }, + { + "epoch": 0.8882444791434307, + "grad_norm": 0.16518031060695648, + "learning_rate": 1.6110849056358112e-05, + "loss": 0.492, + "step": 3982 + }, + { + "epoch": 0.8884675440553201, + "grad_norm": 0.17391857504844666, + "learning_rate": 1.610898599482125e-05, + "loss": 0.5147, + "step": 3983 + }, + { + "epoch": 0.8886906089672094, + "grad_norm": 0.16679394245147705, + "learning_rate": 1.610712259493032e-05, + "loss": 0.5079, + "step": 3984 + }, + { + "epoch": 0.8889136738790988, + "grad_norm": 0.15235210955142975, + "learning_rate": 1.6105258856788525e-05, + "loss": 0.4616, + "step": 3985 + }, + { + "epoch": 0.8891367387909882, + "grad_norm": 0.15558315813541412, + "learning_rate": 1.6103394780499088e-05, + "loss": 0.5129, + "step": 3986 + }, + { + "epoch": 0.8893598037028775, + "grad_norm": 0.16309772431850433, + "learning_rate": 1.610153036616526e-05, + "loss": 0.4981, + "step": 3987 + }, + { + "epoch": 0.8895828686147669, + "grad_norm": 0.1611076146364212, + "learning_rate": 1.60996656138903e-05, + "loss": 0.4858, + "step": 3988 + }, + { + "epoch": 0.8898059335266563, + "grad_norm": 0.16238349676132202, + "learning_rate": 1.6097800523777487e-05, + "loss": 0.5327, + "step": 3989 + }, + { + "epoch": 0.8900289984385457, + "grad_norm": 0.1624278724193573, + "learning_rate": 1.6095935095930125e-05, + "loss": 0.4988, + "step": 3990 + }, + { + "epoch": 0.8902520633504349, + "grad_norm": 0.16069476306438446, + "learning_rate": 1.609406933045153e-05, + "loss": 0.5061, + "step": 3991 + }, + { + "epoch": 0.8904751282623243, + "grad_norm": 0.17323002219200134, + "learning_rate": 1.6092203227445046e-05, + "loss": 0.5126, + "step": 3992 + }, + { + "epoch": 0.8906981931742137, + "grad_norm": 0.16106821596622467, + "learning_rate": 1.6090336787014028e-05, + "loss": 0.5075, + "step": 3993 + }, + { + "epoch": 0.890921258086103, + "grad_norm": 0.20618608593940735, + "learning_rate": 1.6088470009261846e-05, + "loss": 0.4934, + "step": 3994 + }, + { + "epoch": 0.8911443229979924, + "grad_norm": 0.15415968000888824, + "learning_rate": 1.6086602894291895e-05, + "loss": 0.4685, + "step": 3995 + }, + { + "epoch": 0.8913673879098818, + "grad_norm": 0.16283094882965088, + "learning_rate": 1.608473544220759e-05, + "loss": 0.4893, + "step": 3996 + }, + { + "epoch": 0.8915904528217712, + "grad_norm": 0.15671685338020325, + "learning_rate": 1.6082867653112365e-05, + "loss": 0.4866, + "step": 3997 + }, + { + "epoch": 0.8918135177336605, + "grad_norm": 0.14689774811267853, + "learning_rate": 1.6080999527109665e-05, + "loss": 0.4596, + "step": 3998 + }, + { + "epoch": 0.8920365826455499, + "grad_norm": 0.1579175591468811, + "learning_rate": 1.6079131064302958e-05, + "loss": 0.5135, + "step": 3999 + }, + { + "epoch": 0.8922596475574393, + "grad_norm": 0.19000330567359924, + "learning_rate": 1.6077262264795735e-05, + "loss": 0.4955, + "step": 4000 + }, + { + "epoch": 0.8924827124693285, + "grad_norm": 0.17095667123794556, + "learning_rate": 1.6075393128691497e-05, + "loss": 0.492, + "step": 4001 + }, + { + "epoch": 0.8927057773812179, + "grad_norm": 0.1627168506383896, + "learning_rate": 1.6073523656093778e-05, + "loss": 0.4954, + "step": 4002 + }, + { + "epoch": 0.8929288422931073, + "grad_norm": 0.16945117712020874, + "learning_rate": 1.6071653847106113e-05, + "loss": 0.5073, + "step": 4003 + }, + { + "epoch": 0.8931519072049967, + "grad_norm": 0.16621045768260956, + "learning_rate": 1.6069783701832066e-05, + "loss": 0.509, + "step": 4004 + }, + { + "epoch": 0.893374972116886, + "grad_norm": 0.16034524142742157, + "learning_rate": 1.6067913220375216e-05, + "loss": 0.4839, + "step": 4005 + }, + { + "epoch": 0.8935980370287754, + "grad_norm": 0.15669786930084229, + "learning_rate": 1.6066042402839163e-05, + "loss": 0.4953, + "step": 4006 + }, + { + "epoch": 0.8938211019406648, + "grad_norm": 0.15480557084083557, + "learning_rate": 1.606417124932752e-05, + "loss": 0.459, + "step": 4007 + }, + { + "epoch": 0.8940441668525541, + "grad_norm": 0.15904684364795685, + "learning_rate": 1.6062299759943938e-05, + "loss": 0.4977, + "step": 4008 + }, + { + "epoch": 0.8942672317644434, + "grad_norm": 0.16506803035736084, + "learning_rate": 1.6060427934792056e-05, + "loss": 0.4792, + "step": 4009 + }, + { + "epoch": 0.8944902966763328, + "grad_norm": 0.16208292543888092, + "learning_rate": 1.6058555773975552e-05, + "loss": 0.5113, + "step": 4010 + }, + { + "epoch": 0.8947133615882221, + "grad_norm": 0.15986153483390808, + "learning_rate": 1.6056683277598123e-05, + "loss": 0.4839, + "step": 4011 + }, + { + "epoch": 0.8949364265001115, + "grad_norm": 0.17647914588451385, + "learning_rate": 1.6054810445763474e-05, + "loss": 0.4845, + "step": 4012 + }, + { + "epoch": 0.8951594914120009, + "grad_norm": 0.16568545997142792, + "learning_rate": 1.6052937278575338e-05, + "loss": 0.5114, + "step": 4013 + }, + { + "epoch": 0.8953825563238903, + "grad_norm": 0.16345763206481934, + "learning_rate": 1.605106377613746e-05, + "loss": 0.5004, + "step": 4014 + }, + { + "epoch": 0.8956056212357796, + "grad_norm": 0.15674430131912231, + "learning_rate": 1.6049189938553606e-05, + "loss": 0.4745, + "step": 4015 + }, + { + "epoch": 0.895828686147669, + "grad_norm": 0.25220704078674316, + "learning_rate": 1.6047315765927566e-05, + "loss": 0.4988, + "step": 4016 + }, + { + "epoch": 0.8960517510595584, + "grad_norm": 0.16052506864070892, + "learning_rate": 1.6045441258363138e-05, + "loss": 0.468, + "step": 4017 + }, + { + "epoch": 0.8962748159714476, + "grad_norm": 0.1760246306657791, + "learning_rate": 1.6043566415964145e-05, + "loss": 0.5213, + "step": 4018 + }, + { + "epoch": 0.896497880883337, + "grad_norm": 0.16761453449726105, + "learning_rate": 1.6041691238834426e-05, + "loss": 0.5223, + "step": 4019 + }, + { + "epoch": 0.8967209457952264, + "grad_norm": 0.1628294140100479, + "learning_rate": 1.6039815727077845e-05, + "loss": 0.4642, + "step": 4020 + }, + { + "epoch": 0.8969440107071158, + "grad_norm": 0.16300807893276215, + "learning_rate": 1.6037939880798277e-05, + "loss": 0.4907, + "step": 4021 + }, + { + "epoch": 0.8971670756190051, + "grad_norm": 0.16601742804050446, + "learning_rate": 1.603606370009962e-05, + "loss": 0.465, + "step": 4022 + }, + { + "epoch": 0.8973901405308945, + "grad_norm": 0.16860070824623108, + "learning_rate": 1.6034187185085783e-05, + "loss": 0.4817, + "step": 4023 + }, + { + "epoch": 0.8976132054427839, + "grad_norm": 0.16450795531272888, + "learning_rate": 1.6032310335860706e-05, + "loss": 0.4991, + "step": 4024 + }, + { + "epoch": 0.8978362703546732, + "grad_norm": 0.18083292245864868, + "learning_rate": 1.603043315252834e-05, + "loss": 0.49, + "step": 4025 + }, + { + "epoch": 0.8980593352665626, + "grad_norm": 0.16025912761688232, + "learning_rate": 1.6028555635192648e-05, + "loss": 0.492, + "step": 4026 + }, + { + "epoch": 0.898282400178452, + "grad_norm": 0.17619994282722473, + "learning_rate": 1.6026677783957626e-05, + "loss": 0.5244, + "step": 4027 + }, + { + "epoch": 0.8985054650903412, + "grad_norm": 0.16059327125549316, + "learning_rate": 1.602479959892728e-05, + "loss": 0.4875, + "step": 4028 + }, + { + "epoch": 0.8987285300022306, + "grad_norm": 0.166759192943573, + "learning_rate": 1.6022921080205634e-05, + "loss": 0.4953, + "step": 4029 + }, + { + "epoch": 0.89895159491412, + "grad_norm": 0.17170590162277222, + "learning_rate": 1.602104222789673e-05, + "loss": 0.5209, + "step": 4030 + }, + { + "epoch": 0.8991746598260094, + "grad_norm": 0.17768704891204834, + "learning_rate": 1.601916304210464e-05, + "loss": 0.4861, + "step": 4031 + }, + { + "epoch": 0.8993977247378987, + "grad_norm": 0.1798562854528427, + "learning_rate": 1.6017283522933432e-05, + "loss": 0.5049, + "step": 4032 + }, + { + "epoch": 0.8996207896497881, + "grad_norm": 0.15688154101371765, + "learning_rate": 1.6015403670487216e-05, + "loss": 0.4527, + "step": 4033 + }, + { + "epoch": 0.8998438545616775, + "grad_norm": 0.1658952832221985, + "learning_rate": 1.6013523484870107e-05, + "loss": 0.4687, + "step": 4034 + }, + { + "epoch": 0.9000669194735668, + "grad_norm": 0.20666684210300446, + "learning_rate": 1.6011642966186237e-05, + "loss": 0.4883, + "step": 4035 + }, + { + "epoch": 0.9002899843854562, + "grad_norm": 0.1706281453371048, + "learning_rate": 1.600976211453977e-05, + "loss": 0.4989, + "step": 4036 + }, + { + "epoch": 0.9005130492973455, + "grad_norm": 0.16239850223064423, + "learning_rate": 1.600788093003487e-05, + "loss": 0.4944, + "step": 4037 + }, + { + "epoch": 0.9007361142092349, + "grad_norm": 0.17332103848457336, + "learning_rate": 1.6005999412775736e-05, + "loss": 0.5225, + "step": 4038 + }, + { + "epoch": 0.9009591791211242, + "grad_norm": 0.16437414288520813, + "learning_rate": 1.600411756286657e-05, + "loss": 0.5197, + "step": 4039 + }, + { + "epoch": 0.9011822440330136, + "grad_norm": 0.1621236801147461, + "learning_rate": 1.6002235380411614e-05, + "loss": 0.4943, + "step": 4040 + }, + { + "epoch": 0.901405308944903, + "grad_norm": 0.17484545707702637, + "learning_rate": 1.60003528655151e-05, + "loss": 0.5032, + "step": 4041 + }, + { + "epoch": 0.9016283738567923, + "grad_norm": 0.16205301880836487, + "learning_rate": 1.5998470018281303e-05, + "loss": 0.4848, + "step": 4042 + }, + { + "epoch": 0.9018514387686817, + "grad_norm": 0.15417253971099854, + "learning_rate": 1.5996586838814505e-05, + "loss": 0.4656, + "step": 4043 + }, + { + "epoch": 0.9020745036805711, + "grad_norm": 0.15824946761131287, + "learning_rate": 1.5994703327219008e-05, + "loss": 0.4926, + "step": 4044 + }, + { + "epoch": 0.9022975685924604, + "grad_norm": 0.17363391816616058, + "learning_rate": 1.5992819483599132e-05, + "loss": 0.5254, + "step": 4045 + }, + { + "epoch": 0.9025206335043497, + "grad_norm": 0.16660279035568237, + "learning_rate": 1.599093530805922e-05, + "loss": 0.5396, + "step": 4046 + }, + { + "epoch": 0.9027436984162391, + "grad_norm": 0.16047632694244385, + "learning_rate": 1.5989050800703622e-05, + "loss": 0.4782, + "step": 4047 + }, + { + "epoch": 0.9029667633281285, + "grad_norm": 0.1667635589838028, + "learning_rate": 1.5987165961636718e-05, + "loss": 0.5084, + "step": 4048 + }, + { + "epoch": 0.9031898282400178, + "grad_norm": 0.16391973197460175, + "learning_rate": 1.5985280790962903e-05, + "loss": 0.4967, + "step": 4049 + }, + { + "epoch": 0.9034128931519072, + "grad_norm": 0.15799559652805328, + "learning_rate": 1.598339528878659e-05, + "loss": 0.4929, + "step": 4050 + }, + { + "epoch": 0.9036359580637966, + "grad_norm": 0.15577222406864166, + "learning_rate": 1.5981509455212207e-05, + "loss": 0.4766, + "step": 4051 + }, + { + "epoch": 0.9038590229756859, + "grad_norm": 0.15642912685871124, + "learning_rate": 1.5979623290344207e-05, + "loss": 0.4714, + "step": 4052 + }, + { + "epoch": 0.9040820878875753, + "grad_norm": 0.1563492864370346, + "learning_rate": 1.5977736794287057e-05, + "loss": 0.4891, + "step": 4053 + }, + { + "epoch": 0.9043051527994647, + "grad_norm": 0.16570942103862762, + "learning_rate": 1.597584996714524e-05, + "loss": 0.4857, + "step": 4054 + }, + { + "epoch": 0.904528217711354, + "grad_norm": 0.16158944368362427, + "learning_rate": 1.5973962809023258e-05, + "loss": 0.487, + "step": 4055 + }, + { + "epoch": 0.9047512826232433, + "grad_norm": 0.16446934640407562, + "learning_rate": 1.5972075320025643e-05, + "loss": 0.5012, + "step": 4056 + }, + { + "epoch": 0.9049743475351327, + "grad_norm": 0.20039531588554382, + "learning_rate": 1.597018750025693e-05, + "loss": 0.5092, + "step": 4057 + }, + { + "epoch": 0.9051974124470221, + "grad_norm": 0.16635029017925262, + "learning_rate": 1.5968299349821678e-05, + "loss": 0.5425, + "step": 4058 + }, + { + "epoch": 0.9054204773589114, + "grad_norm": 0.162612184882164, + "learning_rate": 1.596641086882447e-05, + "loss": 0.4561, + "step": 4059 + }, + { + "epoch": 0.9056435422708008, + "grad_norm": 0.15874888002872467, + "learning_rate": 1.5964522057369897e-05, + "loss": 0.503, + "step": 4060 + }, + { + "epoch": 0.9058666071826902, + "grad_norm": 0.16984137892723083, + "learning_rate": 1.596263291556257e-05, + "loss": 0.4931, + "step": 4061 + }, + { + "epoch": 0.9060896720945795, + "grad_norm": 0.15440818667411804, + "learning_rate": 1.5960743443507128e-05, + "loss": 0.4834, + "step": 4062 + }, + { + "epoch": 0.9063127370064689, + "grad_norm": 0.17957209050655365, + "learning_rate": 1.595885364130822e-05, + "loss": 0.5049, + "step": 4063 + }, + { + "epoch": 0.9065358019183583, + "grad_norm": 0.15549324452877045, + "learning_rate": 1.5956963509070513e-05, + "loss": 0.4733, + "step": 4064 + }, + { + "epoch": 0.9067588668302476, + "grad_norm": 0.1653992384672165, + "learning_rate": 1.59550730468987e-05, + "loss": 0.4925, + "step": 4065 + }, + { + "epoch": 0.9069819317421369, + "grad_norm": 0.1693269908428192, + "learning_rate": 1.5953182254897478e-05, + "loss": 0.4908, + "step": 4066 + }, + { + "epoch": 0.9072049966540263, + "grad_norm": 0.16473425924777985, + "learning_rate": 1.5951291133171577e-05, + "loss": 0.4741, + "step": 4067 + }, + { + "epoch": 0.9074280615659157, + "grad_norm": 0.16339989006519318, + "learning_rate": 1.5949399681825738e-05, + "loss": 0.4574, + "step": 4068 + }, + { + "epoch": 0.907651126477805, + "grad_norm": 0.16389763355255127, + "learning_rate": 1.5947507900964723e-05, + "loss": 0.4951, + "step": 4069 + }, + { + "epoch": 0.9078741913896944, + "grad_norm": 0.16346803307533264, + "learning_rate": 1.594561579069331e-05, + "loss": 0.4794, + "step": 4070 + }, + { + "epoch": 0.9080972563015838, + "grad_norm": 0.16627097129821777, + "learning_rate": 1.5943723351116293e-05, + "loss": 0.4726, + "step": 4071 + }, + { + "epoch": 0.9083203212134732, + "grad_norm": 0.17151188850402832, + "learning_rate": 1.5941830582338488e-05, + "loss": 0.5246, + "step": 4072 + }, + { + "epoch": 0.9085433861253625, + "grad_norm": 0.15683230757713318, + "learning_rate": 1.593993748446473e-05, + "loss": 0.4771, + "step": 4073 + }, + { + "epoch": 0.9087664510372518, + "grad_norm": 0.1879899501800537, + "learning_rate": 1.5938044057599873e-05, + "loss": 0.5102, + "step": 4074 + }, + { + "epoch": 0.9089895159491412, + "grad_norm": 0.16234296560287476, + "learning_rate": 1.593615030184878e-05, + "loss": 0.4906, + "step": 4075 + }, + { + "epoch": 0.9092125808610305, + "grad_norm": 0.15749451518058777, + "learning_rate": 1.593425621731635e-05, + "loss": 0.4867, + "step": 4076 + }, + { + "epoch": 0.9094356457729199, + "grad_norm": 0.1519862711429596, + "learning_rate": 1.593236180410748e-05, + "loss": 0.4644, + "step": 4077 + }, + { + "epoch": 0.9096587106848093, + "grad_norm": 0.20612524449825287, + "learning_rate": 1.5930467062327096e-05, + "loss": 0.5017, + "step": 4078 + }, + { + "epoch": 0.9098817755966987, + "grad_norm": 0.16134196519851685, + "learning_rate": 1.5928571992080142e-05, + "loss": 0.4801, + "step": 4079 + }, + { + "epoch": 0.910104840508588, + "grad_norm": 0.15148447453975677, + "learning_rate": 1.592667659347158e-05, + "loss": 0.4697, + "step": 4080 + }, + { + "epoch": 0.9103279054204774, + "grad_norm": 0.1598382443189621, + "learning_rate": 1.5924780866606387e-05, + "loss": 0.4976, + "step": 4081 + }, + { + "epoch": 0.9105509703323668, + "grad_norm": 0.15935340523719788, + "learning_rate": 1.592288481158956e-05, + "loss": 0.4806, + "step": 4082 + }, + { + "epoch": 0.910774035244256, + "grad_norm": 0.16446979343891144, + "learning_rate": 1.5920988428526117e-05, + "loss": 0.4966, + "step": 4083 + }, + { + "epoch": 0.9109971001561454, + "grad_norm": 0.15920083224773407, + "learning_rate": 1.591909171752109e-05, + "loss": 0.4917, + "step": 4084 + }, + { + "epoch": 0.9112201650680348, + "grad_norm": 0.16009603440761566, + "learning_rate": 1.5917194678679532e-05, + "loss": 0.5169, + "step": 4085 + }, + { + "epoch": 0.9114432299799241, + "grad_norm": 0.17751926183700562, + "learning_rate": 1.5915297312106513e-05, + "loss": 0.4804, + "step": 4086 + }, + { + "epoch": 0.9116662948918135, + "grad_norm": 0.18244434893131256, + "learning_rate": 1.5913399617907116e-05, + "loss": 0.487, + "step": 4087 + }, + { + "epoch": 0.9118893598037029, + "grad_norm": 0.15786734223365784, + "learning_rate": 1.5911501596186455e-05, + "loss": 0.4986, + "step": 4088 + }, + { + "epoch": 0.9121124247155923, + "grad_norm": 0.15740512311458588, + "learning_rate": 1.5909603247049654e-05, + "loss": 0.4895, + "step": 4089 + }, + { + "epoch": 0.9123354896274816, + "grad_norm": 0.1518784463405609, + "learning_rate": 1.5907704570601845e-05, + "loss": 0.4522, + "step": 4090 + }, + { + "epoch": 0.912558554539371, + "grad_norm": 0.1830586940050125, + "learning_rate": 1.59058055669482e-05, + "loss": 0.513, + "step": 4091 + }, + { + "epoch": 0.9127816194512604, + "grad_norm": 0.16128897666931152, + "learning_rate": 1.5903906236193892e-05, + "loss": 0.4706, + "step": 4092 + }, + { + "epoch": 0.9130046843631496, + "grad_norm": 0.17136983573436737, + "learning_rate": 1.5902006578444123e-05, + "loss": 0.4867, + "step": 4093 + }, + { + "epoch": 0.913227749275039, + "grad_norm": 0.15806463360786438, + "learning_rate": 1.59001065938041e-05, + "loss": 0.5066, + "step": 4094 + }, + { + "epoch": 0.9134508141869284, + "grad_norm": 0.17267145216464996, + "learning_rate": 1.5898206282379063e-05, + "loss": 0.4805, + "step": 4095 + }, + { + "epoch": 0.9136738790988178, + "grad_norm": 0.15600056946277618, + "learning_rate": 1.5896305644274262e-05, + "loss": 0.4865, + "step": 4096 + }, + { + "epoch": 0.9138969440107071, + "grad_norm": 0.1608215868473053, + "learning_rate": 1.5894404679594963e-05, + "loss": 0.5102, + "step": 4097 + }, + { + "epoch": 0.9141200089225965, + "grad_norm": 0.15523095428943634, + "learning_rate": 1.5892503388446456e-05, + "loss": 0.4642, + "step": 4098 + }, + { + "epoch": 0.9143430738344859, + "grad_norm": 0.16346514225006104, + "learning_rate": 1.589060177093405e-05, + "loss": 0.4655, + "step": 4099 + }, + { + "epoch": 0.9145661387463752, + "grad_norm": 0.1508352905511856, + "learning_rate": 1.588869982716306e-05, + "loss": 0.4826, + "step": 4100 + }, + { + "epoch": 0.9147892036582645, + "grad_norm": 0.1622077375650406, + "learning_rate": 1.5886797557238832e-05, + "loss": 0.4876, + "step": 4101 + }, + { + "epoch": 0.9150122685701539, + "grad_norm": 0.17161275446414948, + "learning_rate": 1.588489496126673e-05, + "loss": 0.4963, + "step": 4102 + }, + { + "epoch": 0.9152353334820432, + "grad_norm": 0.15936551988124847, + "learning_rate": 1.5882992039352122e-05, + "loss": 0.4768, + "step": 4103 + }, + { + "epoch": 0.9154583983939326, + "grad_norm": 0.1664397269487381, + "learning_rate": 1.588108879160041e-05, + "loss": 0.4927, + "step": 4104 + }, + { + "epoch": 0.915681463305822, + "grad_norm": 0.16500705480575562, + "learning_rate": 1.5879185218117012e-05, + "loss": 0.4909, + "step": 4105 + }, + { + "epoch": 0.9159045282177114, + "grad_norm": 0.16480299830436707, + "learning_rate": 1.5877281319007352e-05, + "loss": 0.468, + "step": 4106 + }, + { + "epoch": 0.9161275931296007, + "grad_norm": 0.17092856764793396, + "learning_rate": 1.5875377094376883e-05, + "loss": 0.5188, + "step": 4107 + }, + { + "epoch": 0.9163506580414901, + "grad_norm": 0.16929516196250916, + "learning_rate": 1.5873472544331073e-05, + "loss": 0.4932, + "step": 4108 + }, + { + "epoch": 0.9165737229533795, + "grad_norm": 0.1531393676996231, + "learning_rate": 1.5871567668975406e-05, + "loss": 0.4512, + "step": 4109 + }, + { + "epoch": 0.9167967878652687, + "grad_norm": 0.15918409824371338, + "learning_rate": 1.586966246841539e-05, + "loss": 0.5019, + "step": 4110 + }, + { + "epoch": 0.9170198527771581, + "grad_norm": 0.19668515026569366, + "learning_rate": 1.5867756942756548e-05, + "loss": 0.5106, + "step": 4111 + }, + { + "epoch": 0.9172429176890475, + "grad_norm": 0.15807564556598663, + "learning_rate": 1.5865851092104414e-05, + "loss": 0.4738, + "step": 4112 + }, + { + "epoch": 0.9174659826009369, + "grad_norm": 0.16922055184841156, + "learning_rate": 1.586394491656455e-05, + "loss": 0.5086, + "step": 4113 + }, + { + "epoch": 0.9176890475128262, + "grad_norm": 0.18552662432193756, + "learning_rate": 1.586203841624253e-05, + "loss": 0.5246, + "step": 4114 + }, + { + "epoch": 0.9179121124247156, + "grad_norm": 0.16202445328235626, + "learning_rate": 1.5860131591243945e-05, + "loss": 0.5113, + "step": 4115 + }, + { + "epoch": 0.918135177336605, + "grad_norm": 0.16352488100528717, + "learning_rate": 1.5858224441674416e-05, + "loss": 0.4739, + "step": 4116 + }, + { + "epoch": 0.9183582422484943, + "grad_norm": 0.16634705662727356, + "learning_rate": 1.5856316967639566e-05, + "loss": 0.5221, + "step": 4117 + }, + { + "epoch": 0.9185813071603837, + "grad_norm": 0.16568778455257416, + "learning_rate": 1.5854409169245043e-05, + "loss": 0.4476, + "step": 4118 + }, + { + "epoch": 0.9188043720722731, + "grad_norm": 0.160440593957901, + "learning_rate": 1.5852501046596516e-05, + "loss": 0.5054, + "step": 4119 + }, + { + "epoch": 0.9190274369841623, + "grad_norm": 0.1603170782327652, + "learning_rate": 1.5850592599799668e-05, + "loss": 0.4947, + "step": 4120 + }, + { + "epoch": 0.9192505018960517, + "grad_norm": 0.16584816575050354, + "learning_rate": 1.5848683828960195e-05, + "loss": 0.5014, + "step": 4121 + }, + { + "epoch": 0.9194735668079411, + "grad_norm": 0.1588207632303238, + "learning_rate": 1.584677473418383e-05, + "loss": 0.4973, + "step": 4122 + }, + { + "epoch": 0.9196966317198305, + "grad_norm": 0.1574103981256485, + "learning_rate": 1.5844865315576296e-05, + "loss": 0.5214, + "step": 4123 + }, + { + "epoch": 0.9199196966317198, + "grad_norm": 0.17080260813236237, + "learning_rate": 1.584295557324336e-05, + "loss": 0.5067, + "step": 4124 + }, + { + "epoch": 0.9201427615436092, + "grad_norm": 0.15937574207782745, + "learning_rate": 1.584104550729079e-05, + "loss": 0.4971, + "step": 4125 + }, + { + "epoch": 0.9203658264554986, + "grad_norm": 0.15593832731246948, + "learning_rate": 1.5839135117824375e-05, + "loss": 0.4844, + "step": 4126 + }, + { + "epoch": 0.9205888913673879, + "grad_norm": 0.18794505298137665, + "learning_rate": 1.583722440494993e-05, + "loss": 0.5135, + "step": 4127 + }, + { + "epoch": 0.9208119562792773, + "grad_norm": 0.183371901512146, + "learning_rate": 1.5835313368773276e-05, + "loss": 0.4861, + "step": 4128 + }, + { + "epoch": 0.9210350211911666, + "grad_norm": 0.153816357254982, + "learning_rate": 1.583340200940027e-05, + "loss": 0.4921, + "step": 4129 + }, + { + "epoch": 0.921258086103056, + "grad_norm": 0.16074557602405548, + "learning_rate": 1.583149032693676e-05, + "loss": 0.4566, + "step": 4130 + }, + { + "epoch": 0.9214811510149453, + "grad_norm": 0.15913258492946625, + "learning_rate": 1.5829578321488636e-05, + "loss": 0.4838, + "step": 4131 + }, + { + "epoch": 0.9217042159268347, + "grad_norm": 0.15266257524490356, + "learning_rate": 1.58276659931618e-05, + "loss": 0.4704, + "step": 4132 + }, + { + "epoch": 0.9219272808387241, + "grad_norm": 0.15182676911354065, + "learning_rate": 1.5825753342062155e-05, + "loss": 0.4825, + "step": 4133 + }, + { + "epoch": 0.9221503457506134, + "grad_norm": 0.15525878965854645, + "learning_rate": 1.582384036829565e-05, + "loss": 0.4636, + "step": 4134 + }, + { + "epoch": 0.9223734106625028, + "grad_norm": 0.15554243326187134, + "learning_rate": 1.582192707196823e-05, + "loss": 0.4945, + "step": 4135 + }, + { + "epoch": 0.9225964755743922, + "grad_norm": 0.15928377211093903, + "learning_rate": 1.582001345318587e-05, + "loss": 0.4772, + "step": 4136 + }, + { + "epoch": 0.9228195404862815, + "grad_norm": 0.15914230048656464, + "learning_rate": 1.581809951205455e-05, + "loss": 0.4723, + "step": 4137 + }, + { + "epoch": 0.9230426053981708, + "grad_norm": 0.16770337522029877, + "learning_rate": 1.581618524868029e-05, + "loss": 0.4579, + "step": 4138 + }, + { + "epoch": 0.9232656703100602, + "grad_norm": 0.16558465361595154, + "learning_rate": 1.58142706631691e-05, + "loss": 0.5186, + "step": 4139 + }, + { + "epoch": 0.9234887352219496, + "grad_norm": 0.16763773560523987, + "learning_rate": 1.5812355755627028e-05, + "loss": 0.4887, + "step": 4140 + }, + { + "epoch": 0.9237118001338389, + "grad_norm": 0.1562553197145462, + "learning_rate": 1.5810440526160133e-05, + "loss": 0.4953, + "step": 4141 + }, + { + "epoch": 0.9239348650457283, + "grad_norm": 0.1571418195962906, + "learning_rate": 1.5808524974874493e-05, + "loss": 0.4753, + "step": 4142 + }, + { + "epoch": 0.9241579299576177, + "grad_norm": 0.17362381517887115, + "learning_rate": 1.5806609101876203e-05, + "loss": 0.4704, + "step": 4143 + }, + { + "epoch": 0.924380994869507, + "grad_norm": 0.1619691401720047, + "learning_rate": 1.580469290727138e-05, + "loss": 0.476, + "step": 4144 + }, + { + "epoch": 0.9246040597813964, + "grad_norm": 0.16508683562278748, + "learning_rate": 1.5802776391166146e-05, + "loss": 0.5118, + "step": 4145 + }, + { + "epoch": 0.9248271246932858, + "grad_norm": 0.16421370208263397, + "learning_rate": 1.5800859553666655e-05, + "loss": 0.4876, + "step": 4146 + }, + { + "epoch": 0.9250501896051752, + "grad_norm": 0.16035978496074677, + "learning_rate": 1.5798942394879073e-05, + "loss": 0.5048, + "step": 4147 + }, + { + "epoch": 0.9252732545170644, + "grad_norm": 0.16965007781982422, + "learning_rate": 1.5797024914909584e-05, + "loss": 0.4932, + "step": 4148 + }, + { + "epoch": 0.9254963194289538, + "grad_norm": 0.16695380210876465, + "learning_rate": 1.5795107113864393e-05, + "loss": 0.5188, + "step": 4149 + }, + { + "epoch": 0.9257193843408432, + "grad_norm": 0.1638556569814682, + "learning_rate": 1.5793188991849717e-05, + "loss": 0.4764, + "step": 4150 + }, + { + "epoch": 0.9259424492527325, + "grad_norm": 0.16190040111541748, + "learning_rate": 1.579127054897179e-05, + "loss": 0.4877, + "step": 4151 + }, + { + "epoch": 0.9261655141646219, + "grad_norm": 0.16219079494476318, + "learning_rate": 1.5789351785336874e-05, + "loss": 0.4823, + "step": 4152 + }, + { + "epoch": 0.9263885790765113, + "grad_norm": 0.15557697415351868, + "learning_rate": 1.5787432701051242e-05, + "loss": 0.4826, + "step": 4153 + }, + { + "epoch": 0.9266116439884007, + "grad_norm": 0.15617480874061584, + "learning_rate": 1.578551329622118e-05, + "loss": 0.5012, + "step": 4154 + }, + { + "epoch": 0.92683470890029, + "grad_norm": 0.16417647898197174, + "learning_rate": 1.5783593570953e-05, + "loss": 0.4692, + "step": 4155 + }, + { + "epoch": 0.9270577738121794, + "grad_norm": 0.16066023707389832, + "learning_rate": 1.578167352535303e-05, + "loss": 0.4932, + "step": 4156 + }, + { + "epoch": 0.9272808387240687, + "grad_norm": 0.15020349621772766, + "learning_rate": 1.577975315952761e-05, + "loss": 0.4593, + "step": 4157 + }, + { + "epoch": 0.927503903635958, + "grad_norm": 0.15949909389019012, + "learning_rate": 1.57778324735831e-05, + "loss": 0.4952, + "step": 4158 + }, + { + "epoch": 0.9277269685478474, + "grad_norm": 0.169538676738739, + "learning_rate": 1.577591146762589e-05, + "loss": 0.4989, + "step": 4159 + }, + { + "epoch": 0.9279500334597368, + "grad_norm": 0.1595795601606369, + "learning_rate": 1.5773990141762366e-05, + "loss": 0.4886, + "step": 4160 + }, + { + "epoch": 0.9281730983716261, + "grad_norm": 0.17213213443756104, + "learning_rate": 1.577206849609895e-05, + "loss": 0.4383, + "step": 4161 + }, + { + "epoch": 0.9283961632835155, + "grad_norm": 0.15668156743049622, + "learning_rate": 1.5770146530742075e-05, + "loss": 0.4672, + "step": 4162 + }, + { + "epoch": 0.9286192281954049, + "grad_norm": 0.15910851955413818, + "learning_rate": 1.576822424579819e-05, + "loss": 0.5136, + "step": 4163 + }, + { + "epoch": 0.9288422931072943, + "grad_norm": 0.1524379998445511, + "learning_rate": 1.5766301641373755e-05, + "loss": 0.4752, + "step": 4164 + }, + { + "epoch": 0.9290653580191836, + "grad_norm": 0.1620722860097885, + "learning_rate": 1.5764378717575272e-05, + "loss": 0.46, + "step": 4165 + }, + { + "epoch": 0.9292884229310729, + "grad_norm": 0.16785915195941925, + "learning_rate": 1.576245547450923e-05, + "loss": 0.5097, + "step": 4166 + }, + { + "epoch": 0.9295114878429623, + "grad_norm": 0.15819083154201508, + "learning_rate": 1.5760531912282163e-05, + "loss": 0.5042, + "step": 4167 + }, + { + "epoch": 0.9297345527548516, + "grad_norm": 0.15982572734355927, + "learning_rate": 1.57586080310006e-05, + "loss": 0.4832, + "step": 4168 + }, + { + "epoch": 0.929957617666741, + "grad_norm": 0.16882368922233582, + "learning_rate": 1.57566838307711e-05, + "loss": 0.495, + "step": 4169 + }, + { + "epoch": 0.9301806825786304, + "grad_norm": 0.15450328588485718, + "learning_rate": 1.575475931170024e-05, + "loss": 0.4831, + "step": 4170 + }, + { + "epoch": 0.9304037474905198, + "grad_norm": 0.15884611010551453, + "learning_rate": 1.575283447389461e-05, + "loss": 0.4447, + "step": 4171 + }, + { + "epoch": 0.9306268124024091, + "grad_norm": 0.16536371409893036, + "learning_rate": 1.575090931746082e-05, + "loss": 0.4708, + "step": 4172 + }, + { + "epoch": 0.9308498773142985, + "grad_norm": 0.16749803721904755, + "learning_rate": 1.57489838425055e-05, + "loss": 0.4927, + "step": 4173 + }, + { + "epoch": 0.9310729422261879, + "grad_norm": 0.17306271195411682, + "learning_rate": 1.5747058049135286e-05, + "loss": 0.5355, + "step": 4174 + }, + { + "epoch": 0.9312960071380771, + "grad_norm": 0.1781659871339798, + "learning_rate": 1.5745131937456853e-05, + "loss": 0.5054, + "step": 4175 + }, + { + "epoch": 0.9315190720499665, + "grad_norm": 0.16959606111049652, + "learning_rate": 1.5743205507576873e-05, + "loss": 0.4799, + "step": 4176 + }, + { + "epoch": 0.9317421369618559, + "grad_norm": 0.16488516330718994, + "learning_rate": 1.5741278759602045e-05, + "loss": 0.4835, + "step": 4177 + }, + { + "epoch": 0.9319652018737452, + "grad_norm": 0.1639404445886612, + "learning_rate": 1.5739351693639085e-05, + "loss": 0.4757, + "step": 4178 + }, + { + "epoch": 0.9321882667856346, + "grad_norm": 0.17086337506771088, + "learning_rate": 1.573742430979473e-05, + "loss": 0.4982, + "step": 4179 + }, + { + "epoch": 0.932411331697524, + "grad_norm": 0.162103071808815, + "learning_rate": 1.5735496608175722e-05, + "loss": 0.4593, + "step": 4180 + }, + { + "epoch": 0.9326343966094134, + "grad_norm": 0.1637817919254303, + "learning_rate": 1.5733568588888835e-05, + "loss": 0.4776, + "step": 4181 + }, + { + "epoch": 0.9328574615213027, + "grad_norm": 0.1821950078010559, + "learning_rate": 1.5731640252040857e-05, + "loss": 0.5159, + "step": 4182 + }, + { + "epoch": 0.9330805264331921, + "grad_norm": 0.21807077527046204, + "learning_rate": 1.5729711597738587e-05, + "loss": 0.4721, + "step": 4183 + }, + { + "epoch": 0.9333035913450815, + "grad_norm": 0.1628894805908203, + "learning_rate": 1.5727782626088844e-05, + "loss": 0.5033, + "step": 4184 + }, + { + "epoch": 0.9335266562569707, + "grad_norm": 0.158598393201828, + "learning_rate": 1.5725853337198476e-05, + "loss": 0.4743, + "step": 4185 + }, + { + "epoch": 0.9337497211688601, + "grad_norm": 0.16272245347499847, + "learning_rate": 1.5723923731174327e-05, + "loss": 0.4952, + "step": 4186 + }, + { + "epoch": 0.9339727860807495, + "grad_norm": 0.16067437827587128, + "learning_rate": 1.5721993808123283e-05, + "loss": 0.4879, + "step": 4187 + }, + { + "epoch": 0.9341958509926389, + "grad_norm": 0.16409751772880554, + "learning_rate": 1.5720063568152222e-05, + "loss": 0.5051, + "step": 4188 + }, + { + "epoch": 0.9344189159045282, + "grad_norm": 0.15659625828266144, + "learning_rate": 1.5718133011368065e-05, + "loss": 0.4497, + "step": 4189 + }, + { + "epoch": 0.9346419808164176, + "grad_norm": 0.17328283190727234, + "learning_rate": 1.5716202137877732e-05, + "loss": 0.4927, + "step": 4190 + }, + { + "epoch": 0.934865045728307, + "grad_norm": 0.15970586240291595, + "learning_rate": 1.5714270947788168e-05, + "loss": 0.4706, + "step": 4191 + }, + { + "epoch": 0.9350881106401963, + "grad_norm": 0.16526605188846588, + "learning_rate": 1.5712339441206335e-05, + "loss": 0.5266, + "step": 4192 + }, + { + "epoch": 0.9353111755520856, + "grad_norm": 0.16756314039230347, + "learning_rate": 1.5710407618239215e-05, + "loss": 0.4818, + "step": 4193 + }, + { + "epoch": 0.935534240463975, + "grad_norm": 0.16548283398151398, + "learning_rate": 1.57084754789938e-05, + "loss": 0.4586, + "step": 4194 + }, + { + "epoch": 0.9357573053758643, + "grad_norm": 0.1704016625881195, + "learning_rate": 1.57065430235771e-05, + "loss": 0.4999, + "step": 4195 + }, + { + "epoch": 0.9359803702877537, + "grad_norm": 0.16027678549289703, + "learning_rate": 1.5704610252096158e-05, + "loss": 0.5155, + "step": 4196 + }, + { + "epoch": 0.9362034351996431, + "grad_norm": 0.16171659529209137, + "learning_rate": 1.5702677164658013e-05, + "loss": 0.482, + "step": 4197 + }, + { + "epoch": 0.9364265001115325, + "grad_norm": 0.15903575718402863, + "learning_rate": 1.5700743761369735e-05, + "loss": 0.4862, + "step": 4198 + }, + { + "epoch": 0.9366495650234218, + "grad_norm": 0.16277752816677094, + "learning_rate": 1.569881004233841e-05, + "loss": 0.5011, + "step": 4199 + }, + { + "epoch": 0.9368726299353112, + "grad_norm": 0.18112027645111084, + "learning_rate": 1.5696876007671137e-05, + "loss": 0.5124, + "step": 4200 + }, + { + "epoch": 0.9370956948472006, + "grad_norm": 0.1702015995979309, + "learning_rate": 1.5694941657475037e-05, + "loss": 0.5233, + "step": 4201 + }, + { + "epoch": 0.9373187597590898, + "grad_norm": 0.15752531588077545, + "learning_rate": 1.5693006991857248e-05, + "loss": 0.486, + "step": 4202 + }, + { + "epoch": 0.9375418246709792, + "grad_norm": 0.16015774011611938, + "learning_rate": 1.5691072010924915e-05, + "loss": 0.4816, + "step": 4203 + }, + { + "epoch": 0.9377648895828686, + "grad_norm": 0.17005962133407593, + "learning_rate": 1.568913671478522e-05, + "loss": 0.4986, + "step": 4204 + }, + { + "epoch": 0.937987954494758, + "grad_norm": 0.15905898809432983, + "learning_rate": 1.5687201103545343e-05, + "loss": 0.5031, + "step": 4205 + }, + { + "epoch": 0.9382110194066473, + "grad_norm": 0.16087795794010162, + "learning_rate": 1.56852651773125e-05, + "loss": 0.4957, + "step": 4206 + }, + { + "epoch": 0.9384340843185367, + "grad_norm": 0.16135592758655548, + "learning_rate": 1.5683328936193908e-05, + "loss": 0.5094, + "step": 4207 + }, + { + "epoch": 0.9386571492304261, + "grad_norm": 0.18367910385131836, + "learning_rate": 1.568139238029681e-05, + "loss": 0.515, + "step": 4208 + }, + { + "epoch": 0.9388802141423154, + "grad_norm": 0.16034136712551117, + "learning_rate": 1.5679455509728468e-05, + "loss": 0.5168, + "step": 4209 + }, + { + "epoch": 0.9391032790542048, + "grad_norm": 0.17193713784217834, + "learning_rate": 1.567751832459615e-05, + "loss": 0.5078, + "step": 4210 + }, + { + "epoch": 0.9393263439660942, + "grad_norm": 0.15431612730026245, + "learning_rate": 1.5675580825007158e-05, + "loss": 0.4718, + "step": 4211 + }, + { + "epoch": 0.9395494088779834, + "grad_norm": 0.16114309430122375, + "learning_rate": 1.5673643011068796e-05, + "loss": 0.4779, + "step": 4212 + }, + { + "epoch": 0.9397724737898728, + "grad_norm": 0.15799474716186523, + "learning_rate": 1.5671704882888396e-05, + "loss": 0.5086, + "step": 4213 + }, + { + "epoch": 0.9399955387017622, + "grad_norm": 0.1653827577829361, + "learning_rate": 1.5669766440573302e-05, + "loss": 0.488, + "step": 4214 + }, + { + "epoch": 0.9402186036136516, + "grad_norm": 0.16851937770843506, + "learning_rate": 1.566782768423088e-05, + "loss": 0.4806, + "step": 4215 + }, + { + "epoch": 0.9404416685255409, + "grad_norm": 0.17142115533351898, + "learning_rate": 1.566588861396851e-05, + "loss": 0.5131, + "step": 4216 + }, + { + "epoch": 0.9406647334374303, + "grad_norm": 0.15409086644649506, + "learning_rate": 1.5663949229893587e-05, + "loss": 0.4624, + "step": 4217 + }, + { + "epoch": 0.9408877983493197, + "grad_norm": 0.16042175889015198, + "learning_rate": 1.566200953211353e-05, + "loss": 0.4816, + "step": 4218 + }, + { + "epoch": 0.941110863261209, + "grad_norm": 0.15703439712524414, + "learning_rate": 1.5660069520735766e-05, + "loss": 0.504, + "step": 4219 + }, + { + "epoch": 0.9413339281730984, + "grad_norm": 0.16937057673931122, + "learning_rate": 1.565812919586775e-05, + "loss": 0.4766, + "step": 4220 + }, + { + "epoch": 0.9415569930849877, + "grad_norm": 0.17867791652679443, + "learning_rate": 1.565618855761695e-05, + "loss": 0.4872, + "step": 4221 + }, + { + "epoch": 0.9417800579968771, + "grad_norm": 0.15998175740242004, + "learning_rate": 1.5654247606090846e-05, + "loss": 0.4844, + "step": 4222 + }, + { + "epoch": 0.9420031229087664, + "grad_norm": 0.15649773180484772, + "learning_rate": 1.5652306341396943e-05, + "loss": 0.4774, + "step": 4223 + }, + { + "epoch": 0.9422261878206558, + "grad_norm": 0.15873044729232788, + "learning_rate": 1.5650364763642764e-05, + "loss": 0.5239, + "step": 4224 + }, + { + "epoch": 0.9424492527325452, + "grad_norm": 0.15709929168224335, + "learning_rate": 1.564842287293584e-05, + "loss": 0.4875, + "step": 4225 + }, + { + "epoch": 0.9426723176444345, + "grad_norm": 0.1562758833169937, + "learning_rate": 1.5646480669383726e-05, + "loss": 0.5132, + "step": 4226 + }, + { + "epoch": 0.9428953825563239, + "grad_norm": 0.162166565656662, + "learning_rate": 1.5644538153093995e-05, + "loss": 0.4901, + "step": 4227 + }, + { + "epoch": 0.9431184474682133, + "grad_norm": 0.1618107706308365, + "learning_rate": 1.564259532417424e-05, + "loss": 0.4722, + "step": 4228 + }, + { + "epoch": 0.9433415123801027, + "grad_norm": 0.15450115501880646, + "learning_rate": 1.5640652182732057e-05, + "loss": 0.4498, + "step": 4229 + }, + { + "epoch": 0.943564577291992, + "grad_norm": 0.1606828272342682, + "learning_rate": 1.563870872887508e-05, + "loss": 0.4798, + "step": 4230 + }, + { + "epoch": 0.9437876422038813, + "grad_norm": 0.15939275920391083, + "learning_rate": 1.5636764962710936e-05, + "loss": 0.4773, + "step": 4231 + }, + { + "epoch": 0.9440107071157707, + "grad_norm": 0.16131429374217987, + "learning_rate": 1.5634820884347303e-05, + "loss": 0.4861, + "step": 4232 + }, + { + "epoch": 0.94423377202766, + "grad_norm": 0.15643377602100372, + "learning_rate": 1.563287649389184e-05, + "loss": 0.5172, + "step": 4233 + }, + { + "epoch": 0.9444568369395494, + "grad_norm": 0.1604384183883667, + "learning_rate": 1.5630931791452246e-05, + "loss": 0.4808, + "step": 4234 + }, + { + "epoch": 0.9446799018514388, + "grad_norm": 0.15961483120918274, + "learning_rate": 1.5628986777136223e-05, + "loss": 0.4648, + "step": 4235 + }, + { + "epoch": 0.9449029667633281, + "grad_norm": 0.16450265049934387, + "learning_rate": 1.562704145105151e-05, + "loss": 0.5006, + "step": 4236 + }, + { + "epoch": 0.9451260316752175, + "grad_norm": 0.1608218103647232, + "learning_rate": 1.5625095813305847e-05, + "loss": 0.4983, + "step": 4237 + }, + { + "epoch": 0.9453490965871069, + "grad_norm": 0.1569865345954895, + "learning_rate": 1.5623149864006993e-05, + "loss": 0.4969, + "step": 4238 + }, + { + "epoch": 0.9455721614989963, + "grad_norm": 0.1610192060470581, + "learning_rate": 1.5621203603262727e-05, + "loss": 0.4842, + "step": 4239 + }, + { + "epoch": 0.9457952264108855, + "grad_norm": 0.18263404071331024, + "learning_rate": 1.561925703118085e-05, + "loss": 0.5053, + "step": 4240 + }, + { + "epoch": 0.9460182913227749, + "grad_norm": 0.16298796236515045, + "learning_rate": 1.561731014786917e-05, + "loss": 0.4906, + "step": 4241 + }, + { + "epoch": 0.9462413562346643, + "grad_norm": 0.16036361455917358, + "learning_rate": 1.5615362953435517e-05, + "loss": 0.4647, + "step": 4242 + }, + { + "epoch": 0.9464644211465536, + "grad_norm": 0.16408760845661163, + "learning_rate": 1.5613415447987743e-05, + "loss": 0.4836, + "step": 4243 + }, + { + "epoch": 0.946687486058443, + "grad_norm": 0.1685844510793686, + "learning_rate": 1.5611467631633713e-05, + "loss": 0.5509, + "step": 4244 + }, + { + "epoch": 0.9469105509703324, + "grad_norm": 0.16026711463928223, + "learning_rate": 1.5609519504481306e-05, + "loss": 0.4926, + "step": 4245 + }, + { + "epoch": 0.9471336158822218, + "grad_norm": 0.19063352048397064, + "learning_rate": 1.560757106663843e-05, + "loss": 0.4975, + "step": 4246 + }, + { + "epoch": 0.9473566807941111, + "grad_norm": 0.1648816168308258, + "learning_rate": 1.560562231821299e-05, + "loss": 0.5142, + "step": 4247 + }, + { + "epoch": 0.9475797457060005, + "grad_norm": 0.1545959860086441, + "learning_rate": 1.5603673259312927e-05, + "loss": 0.4862, + "step": 4248 + }, + { + "epoch": 0.9478028106178898, + "grad_norm": 0.15768404304981232, + "learning_rate": 1.5601723890046188e-05, + "loss": 0.4708, + "step": 4249 + }, + { + "epoch": 0.9480258755297791, + "grad_norm": 0.1632533222436905, + "learning_rate": 1.5599774210520747e-05, + "loss": 0.5113, + "step": 4250 + }, + { + "epoch": 0.9482489404416685, + "grad_norm": 0.1614227592945099, + "learning_rate": 1.5597824220844583e-05, + "loss": 0.5081, + "step": 4251 + }, + { + "epoch": 0.9484720053535579, + "grad_norm": 0.15559621155261993, + "learning_rate": 1.55958739211257e-05, + "loss": 0.5012, + "step": 4252 + }, + { + "epoch": 0.9486950702654472, + "grad_norm": 0.16258159279823303, + "learning_rate": 1.5593923311472127e-05, + "loss": 0.5018, + "step": 4253 + }, + { + "epoch": 0.9489181351773366, + "grad_norm": 0.15811897814273834, + "learning_rate": 1.559197239199189e-05, + "loss": 0.4694, + "step": 4254 + }, + { + "epoch": 0.949141200089226, + "grad_norm": 0.15983614325523376, + "learning_rate": 1.5590021162793047e-05, + "loss": 0.4758, + "step": 4255 + }, + { + "epoch": 0.9493642650011154, + "grad_norm": 0.16571156680583954, + "learning_rate": 1.558806962398367e-05, + "loss": 0.4952, + "step": 4256 + }, + { + "epoch": 0.9495873299130047, + "grad_norm": 0.17233605682849884, + "learning_rate": 1.5586117775671844e-05, + "loss": 0.5252, + "step": 4257 + }, + { + "epoch": 0.949810394824894, + "grad_norm": 0.17330588400363922, + "learning_rate": 1.558416561796568e-05, + "loss": 0.5099, + "step": 4258 + }, + { + "epoch": 0.9500334597367834, + "grad_norm": 0.15399448573589325, + "learning_rate": 1.5582213150973296e-05, + "loss": 0.4697, + "step": 4259 + }, + { + "epoch": 0.9502565246486727, + "grad_norm": 0.17017517983913422, + "learning_rate": 1.5580260374802837e-05, + "loss": 0.4981, + "step": 4260 + }, + { + "epoch": 0.9504795895605621, + "grad_norm": 0.1677882969379425, + "learning_rate": 1.5578307289562457e-05, + "loss": 0.5042, + "step": 4261 + }, + { + "epoch": 0.9507026544724515, + "grad_norm": 0.15946677327156067, + "learning_rate": 1.557635389536033e-05, + "loss": 0.4697, + "step": 4262 + }, + { + "epoch": 0.9509257193843409, + "grad_norm": 1.8394982814788818, + "learning_rate": 1.557440019230465e-05, + "loss": 0.5449, + "step": 4263 + }, + { + "epoch": 0.9511487842962302, + "grad_norm": 0.16612844169139862, + "learning_rate": 1.5572446180503618e-05, + "loss": 0.4826, + "step": 4264 + }, + { + "epoch": 0.9513718492081196, + "grad_norm": 0.15885433554649353, + "learning_rate": 1.557049186006547e-05, + "loss": 0.4837, + "step": 4265 + }, + { + "epoch": 0.951594914120009, + "grad_norm": 0.16099439561367035, + "learning_rate": 1.5568537231098438e-05, + "loss": 0.4863, + "step": 4266 + }, + { + "epoch": 0.9518179790318982, + "grad_norm": 0.15327222645282745, + "learning_rate": 1.5566582293710787e-05, + "loss": 0.4865, + "step": 4267 + }, + { + "epoch": 0.9520410439437876, + "grad_norm": 0.1621992439031601, + "learning_rate": 1.5564627048010797e-05, + "loss": 0.5082, + "step": 4268 + }, + { + "epoch": 0.952264108855677, + "grad_norm": 0.16586080193519592, + "learning_rate": 1.5562671494106756e-05, + "loss": 0.5066, + "step": 4269 + }, + { + "epoch": 0.9524871737675663, + "grad_norm": 0.16953197121620178, + "learning_rate": 1.5560715632106976e-05, + "loss": 0.5127, + "step": 4270 + }, + { + "epoch": 0.9527102386794557, + "grad_norm": 0.1806926429271698, + "learning_rate": 1.555875946211979e-05, + "loss": 0.5091, + "step": 4271 + }, + { + "epoch": 0.9529333035913451, + "grad_norm": 0.16707123816013336, + "learning_rate": 1.5556802984253534e-05, + "loss": 0.5048, + "step": 4272 + }, + { + "epoch": 0.9531563685032345, + "grad_norm": 0.29071560502052307, + "learning_rate": 1.5554846198616576e-05, + "loss": 0.4998, + "step": 4273 + }, + { + "epoch": 0.9533794334151238, + "grad_norm": 0.19042329490184784, + "learning_rate": 1.5552889105317296e-05, + "loss": 0.4883, + "step": 4274 + }, + { + "epoch": 0.9536024983270132, + "grad_norm": 0.1610180288553238, + "learning_rate": 1.555093170446409e-05, + "loss": 0.4816, + "step": 4275 + }, + { + "epoch": 0.9538255632389026, + "grad_norm": 0.16079892218112946, + "learning_rate": 1.5548973996165365e-05, + "loss": 0.5118, + "step": 4276 + }, + { + "epoch": 0.9540486281507918, + "grad_norm": 0.1768450289964676, + "learning_rate": 1.5547015980529558e-05, + "loss": 0.5249, + "step": 4277 + }, + { + "epoch": 0.9542716930626812, + "grad_norm": 0.1556175947189331, + "learning_rate": 1.5545057657665115e-05, + "loss": 0.5081, + "step": 4278 + }, + { + "epoch": 0.9544947579745706, + "grad_norm": 0.16020707786083221, + "learning_rate": 1.5543099027680496e-05, + "loss": 0.4846, + "step": 4279 + }, + { + "epoch": 0.95471782288646, + "grad_norm": 0.1518724262714386, + "learning_rate": 1.554114009068419e-05, + "loss": 0.4866, + "step": 4280 + }, + { + "epoch": 0.9549408877983493, + "grad_norm": 0.1620427668094635, + "learning_rate": 1.5539180846784686e-05, + "loss": 0.5054, + "step": 4281 + }, + { + "epoch": 0.9551639527102387, + "grad_norm": 0.1567380130290985, + "learning_rate": 1.5537221296090506e-05, + "loss": 0.4857, + "step": 4282 + }, + { + "epoch": 0.9553870176221281, + "grad_norm": 0.1572207808494568, + "learning_rate": 1.553526143871018e-05, + "loss": 0.4803, + "step": 4283 + }, + { + "epoch": 0.9556100825340174, + "grad_norm": 0.1630953699350357, + "learning_rate": 1.553330127475226e-05, + "loss": 0.5076, + "step": 4284 + }, + { + "epoch": 0.9558331474459068, + "grad_norm": 0.16126613318920135, + "learning_rate": 1.5531340804325303e-05, + "loss": 0.4566, + "step": 4285 + }, + { + "epoch": 0.9560562123577961, + "grad_norm": 0.16429363191127777, + "learning_rate": 1.5529380027537904e-05, + "loss": 0.4905, + "step": 4286 + }, + { + "epoch": 0.9562792772696854, + "grad_norm": 0.16202637553215027, + "learning_rate": 1.5527418944498656e-05, + "loss": 0.4962, + "step": 4287 + }, + { + "epoch": 0.9565023421815748, + "grad_norm": 0.16903194785118103, + "learning_rate": 1.5525457555316177e-05, + "loss": 0.4917, + "step": 4288 + }, + { + "epoch": 0.9567254070934642, + "grad_norm": 0.16751469671726227, + "learning_rate": 1.5523495860099102e-05, + "loss": 0.4762, + "step": 4289 + }, + { + "epoch": 0.9569484720053536, + "grad_norm": 0.16918997466564178, + "learning_rate": 1.5521533858956085e-05, + "loss": 0.5005, + "step": 4290 + }, + { + "epoch": 0.9571715369172429, + "grad_norm": 0.268764466047287, + "learning_rate": 1.551957155199579e-05, + "loss": 0.4734, + "step": 4291 + }, + { + "epoch": 0.9573946018291323, + "grad_norm": 0.1583995223045349, + "learning_rate": 1.55176089393269e-05, + "loss": 0.4776, + "step": 4292 + }, + { + "epoch": 0.9576176667410217, + "grad_norm": 0.16260802745819092, + "learning_rate": 1.5515646021058124e-05, + "loss": 0.487, + "step": 4293 + }, + { + "epoch": 0.957840731652911, + "grad_norm": 0.16519910097122192, + "learning_rate": 1.5513682797298172e-05, + "loss": 0.4916, + "step": 4294 + }, + { + "epoch": 0.9580637965648003, + "grad_norm": 0.15430989861488342, + "learning_rate": 1.551171926815579e-05, + "loss": 0.4645, + "step": 4295 + }, + { + "epoch": 0.9582868614766897, + "grad_norm": 0.15664781630039215, + "learning_rate": 1.5509755433739723e-05, + "loss": 0.4824, + "step": 4296 + }, + { + "epoch": 0.9585099263885791, + "grad_norm": 0.16775622963905334, + "learning_rate": 1.550779129415874e-05, + "loss": 0.4993, + "step": 4297 + }, + { + "epoch": 0.9587329913004684, + "grad_norm": 0.163412943482399, + "learning_rate": 1.550582684952163e-05, + "loss": 0.5176, + "step": 4298 + }, + { + "epoch": 0.9589560562123578, + "grad_norm": 0.1565362513065338, + "learning_rate": 1.5503862099937198e-05, + "loss": 0.4667, + "step": 4299 + }, + { + "epoch": 0.9591791211242472, + "grad_norm": 0.16091248393058777, + "learning_rate": 1.550189704551426e-05, + "loss": 0.4823, + "step": 4300 + }, + { + "epoch": 0.9594021860361365, + "grad_norm": 0.1619873195886612, + "learning_rate": 1.5499931686361658e-05, + "loss": 0.4753, + "step": 4301 + }, + { + "epoch": 0.9596252509480259, + "grad_norm": 0.18374527990818024, + "learning_rate": 1.549796602258824e-05, + "loss": 0.465, + "step": 4302 + }, + { + "epoch": 0.9598483158599153, + "grad_norm": 0.16035234928131104, + "learning_rate": 1.549600005430288e-05, + "loss": 0.4913, + "step": 4303 + }, + { + "epoch": 0.9600713807718046, + "grad_norm": 0.16635017096996307, + "learning_rate": 1.549403378161447e-05, + "loss": 0.4869, + "step": 4304 + }, + { + "epoch": 0.9602944456836939, + "grad_norm": 0.16243912279605865, + "learning_rate": 1.5492067204631908e-05, + "loss": 0.4787, + "step": 4305 + }, + { + "epoch": 0.9605175105955833, + "grad_norm": 0.1565091758966446, + "learning_rate": 1.5490100323464118e-05, + "loss": 0.4805, + "step": 4306 + }, + { + "epoch": 0.9607405755074727, + "grad_norm": 0.1620541661977768, + "learning_rate": 1.5488133138220038e-05, + "loss": 0.5016, + "step": 4307 + }, + { + "epoch": 0.960963640419362, + "grad_norm": 0.16399456560611725, + "learning_rate": 1.5486165649008623e-05, + "loss": 0.4977, + "step": 4308 + }, + { + "epoch": 0.9611867053312514, + "grad_norm": 0.1687907576560974, + "learning_rate": 1.5484197855938847e-05, + "loss": 0.4966, + "step": 4309 + }, + { + "epoch": 0.9614097702431408, + "grad_norm": 0.16494713723659515, + "learning_rate": 1.548222975911969e-05, + "loss": 0.5044, + "step": 4310 + }, + { + "epoch": 0.9616328351550301, + "grad_norm": 0.16072864830493927, + "learning_rate": 1.5480261358660172e-05, + "loss": 0.4969, + "step": 4311 + }, + { + "epoch": 0.9618559000669195, + "grad_norm": 0.1517607718706131, + "learning_rate": 1.5478292654669304e-05, + "loss": 0.4789, + "step": 4312 + }, + { + "epoch": 0.9620789649788088, + "grad_norm": 0.15903200209140778, + "learning_rate": 1.547632364725613e-05, + "loss": 0.4877, + "step": 4313 + }, + { + "epoch": 0.9623020298906982, + "grad_norm": 0.2090669870376587, + "learning_rate": 1.5474354336529706e-05, + "loss": 0.4699, + "step": 4314 + }, + { + "epoch": 0.9625250948025875, + "grad_norm": 0.17890144884586334, + "learning_rate": 1.5472384722599102e-05, + "loss": 0.4921, + "step": 4315 + }, + { + "epoch": 0.9627481597144769, + "grad_norm": 0.1629069298505783, + "learning_rate": 1.547041480557341e-05, + "loss": 0.5049, + "step": 4316 + }, + { + "epoch": 0.9629712246263663, + "grad_norm": 0.1475018411874771, + "learning_rate": 1.5468444585561736e-05, + "loss": 0.4795, + "step": 4317 + }, + { + "epoch": 0.9631942895382556, + "grad_norm": 0.15612593293190002, + "learning_rate": 1.54664740626732e-05, + "loss": 0.4402, + "step": 4318 + }, + { + "epoch": 0.963417354450145, + "grad_norm": 0.15412123501300812, + "learning_rate": 1.546450323701695e-05, + "loss": 0.4493, + "step": 4319 + }, + { + "epoch": 0.9636404193620344, + "grad_norm": 0.16448107361793518, + "learning_rate": 1.5462532108702134e-05, + "loss": 0.4979, + "step": 4320 + }, + { + "epoch": 0.9638634842739238, + "grad_norm": 0.16529002785682678, + "learning_rate": 1.546056067783793e-05, + "loss": 0.5127, + "step": 4321 + }, + { + "epoch": 0.964086549185813, + "grad_norm": 0.15807853639125824, + "learning_rate": 1.545858894453353e-05, + "loss": 0.4638, + "step": 4322 + }, + { + "epoch": 0.9643096140977024, + "grad_norm": 0.16824275255203247, + "learning_rate": 1.5456616908898134e-05, + "loss": 0.4877, + "step": 4323 + }, + { + "epoch": 0.9645326790095918, + "grad_norm": 0.15887348353862762, + "learning_rate": 1.5454644571040973e-05, + "loss": 0.4688, + "step": 4324 + }, + { + "epoch": 0.9647557439214811, + "grad_norm": 0.16340267658233643, + "learning_rate": 1.545267193107128e-05, + "loss": 0.4779, + "step": 4325 + }, + { + "epoch": 0.9649788088333705, + "grad_norm": 0.16982871294021606, + "learning_rate": 1.545069898909832e-05, + "loss": 0.5058, + "step": 4326 + }, + { + "epoch": 0.9652018737452599, + "grad_norm": 0.15784378349781036, + "learning_rate": 1.544872574523137e-05, + "loss": 0.4563, + "step": 4327 + }, + { + "epoch": 0.9654249386571492, + "grad_norm": 0.15926995873451233, + "learning_rate": 1.5446752199579703e-05, + "loss": 0.4893, + "step": 4328 + }, + { + "epoch": 0.9656480035690386, + "grad_norm": 0.16018277406692505, + "learning_rate": 1.544477835225265e-05, + "loss": 0.4876, + "step": 4329 + }, + { + "epoch": 0.965871068480928, + "grad_norm": 0.16839507222175598, + "learning_rate": 1.544280420335951e-05, + "loss": 0.5184, + "step": 4330 + }, + { + "epoch": 0.9660941333928174, + "grad_norm": 0.1582273244857788, + "learning_rate": 1.5440829753009646e-05, + "loss": 0.4895, + "step": 4331 + }, + { + "epoch": 0.9663171983047066, + "grad_norm": 0.15812306106090546, + "learning_rate": 1.5438855001312402e-05, + "loss": 0.5244, + "step": 4332 + }, + { + "epoch": 0.966540263216596, + "grad_norm": 0.15881675481796265, + "learning_rate": 1.5436879948377157e-05, + "loss": 0.4541, + "step": 4333 + }, + { + "epoch": 0.9667633281284854, + "grad_norm": 0.1649298518896103, + "learning_rate": 1.5434904594313303e-05, + "loss": 0.4898, + "step": 4334 + }, + { + "epoch": 0.9669863930403747, + "grad_norm": 0.1670350432395935, + "learning_rate": 1.5432928939230243e-05, + "loss": 0.4961, + "step": 4335 + }, + { + "epoch": 0.9672094579522641, + "grad_norm": 0.17490597069263458, + "learning_rate": 1.5430952983237404e-05, + "loss": 0.4996, + "step": 4336 + }, + { + "epoch": 0.9674325228641535, + "grad_norm": 0.18037396669387817, + "learning_rate": 1.542897672644423e-05, + "loss": 0.4856, + "step": 4337 + }, + { + "epoch": 0.9676555877760429, + "grad_norm": 0.16085660457611084, + "learning_rate": 1.5427000168960172e-05, + "loss": 0.4741, + "step": 4338 + }, + { + "epoch": 0.9678786526879322, + "grad_norm": 0.16227804124355316, + "learning_rate": 1.5425023310894707e-05, + "loss": 0.4858, + "step": 4339 + }, + { + "epoch": 0.9681017175998216, + "grad_norm": 0.21843907237052917, + "learning_rate": 1.5423046152357328e-05, + "loss": 0.4705, + "step": 4340 + }, + { + "epoch": 0.968324782511711, + "grad_norm": 0.16902711987495422, + "learning_rate": 1.542106869345754e-05, + "loss": 0.4673, + "step": 4341 + }, + { + "epoch": 0.9685478474236002, + "grad_norm": 0.15176647901535034, + "learning_rate": 1.5419090934304865e-05, + "loss": 0.477, + "step": 4342 + }, + { + "epoch": 0.9687709123354896, + "grad_norm": 0.15946295857429504, + "learning_rate": 1.5417112875008854e-05, + "loss": 0.4768, + "step": 4343 + }, + { + "epoch": 0.968993977247379, + "grad_norm": 0.1705644428730011, + "learning_rate": 1.5415134515679053e-05, + "loss": 0.4806, + "step": 4344 + }, + { + "epoch": 0.9692170421592683, + "grad_norm": 0.1598884016275406, + "learning_rate": 1.541315585642504e-05, + "loss": 0.5018, + "step": 4345 + }, + { + "epoch": 0.9694401070711577, + "grad_norm": 0.15806621313095093, + "learning_rate": 1.54111768973564e-05, + "loss": 0.4854, + "step": 4346 + }, + { + "epoch": 0.9696631719830471, + "grad_norm": 0.15730160474777222, + "learning_rate": 1.5409197638582753e-05, + "loss": 0.499, + "step": 4347 + }, + { + "epoch": 0.9698862368949365, + "grad_norm": 0.16367729008197784, + "learning_rate": 1.540721808021371e-05, + "loss": 0.4784, + "step": 4348 + }, + { + "epoch": 0.9701093018068258, + "grad_norm": 0.16696509718894958, + "learning_rate": 1.5405238222358925e-05, + "loss": 0.4887, + "step": 4349 + }, + { + "epoch": 0.9703323667187151, + "grad_norm": 0.1621687263250351, + "learning_rate": 1.5403258065128042e-05, + "loss": 0.5113, + "step": 4350 + }, + { + "epoch": 0.9705554316306045, + "grad_norm": 0.1656588315963745, + "learning_rate": 1.5401277608630742e-05, + "loss": 0.5021, + "step": 4351 + }, + { + "epoch": 0.9707784965424938, + "grad_norm": 0.17137381434440613, + "learning_rate": 1.539929685297671e-05, + "loss": 0.4965, + "step": 4352 + }, + { + "epoch": 0.9710015614543832, + "grad_norm": 0.16810309886932373, + "learning_rate": 1.5397315798275654e-05, + "loss": 0.4961, + "step": 4353 + }, + { + "epoch": 0.9712246263662726, + "grad_norm": 0.17059317231178284, + "learning_rate": 1.5395334444637306e-05, + "loss": 0.4708, + "step": 4354 + }, + { + "epoch": 0.971447691278162, + "grad_norm": 0.16224807500839233, + "learning_rate": 1.539335279217139e-05, + "loss": 0.4593, + "step": 4355 + }, + { + "epoch": 0.9716707561900513, + "grad_norm": 0.17209084331989288, + "learning_rate": 1.5391370840987674e-05, + "loss": 0.4959, + "step": 4356 + }, + { + "epoch": 0.9718938211019407, + "grad_norm": 0.16699501872062683, + "learning_rate": 1.5389388591195928e-05, + "loss": 0.4897, + "step": 4357 + }, + { + "epoch": 0.9721168860138301, + "grad_norm": 0.15935075283050537, + "learning_rate": 1.538740604290594e-05, + "loss": 0.4833, + "step": 4358 + }, + { + "epoch": 0.9723399509257193, + "grad_norm": 0.16760936379432678, + "learning_rate": 1.538542319622752e-05, + "loss": 0.4707, + "step": 4359 + }, + { + "epoch": 0.9725630158376087, + "grad_norm": 0.19774286448955536, + "learning_rate": 1.5383440051270486e-05, + "loss": 0.4799, + "step": 4360 + }, + { + "epoch": 0.9727860807494981, + "grad_norm": 0.1727793663740158, + "learning_rate": 1.5381456608144677e-05, + "loss": 0.4761, + "step": 4361 + }, + { + "epoch": 0.9730091456613874, + "grad_norm": 0.1611672341823578, + "learning_rate": 1.5379472866959954e-05, + "loss": 0.4984, + "step": 4362 + }, + { + "epoch": 0.9732322105732768, + "grad_norm": 0.15909035503864288, + "learning_rate": 1.537748882782618e-05, + "loss": 0.487, + "step": 4363 + }, + { + "epoch": 0.9734552754851662, + "grad_norm": 0.17754100263118744, + "learning_rate": 1.5375504490853255e-05, + "loss": 0.4782, + "step": 4364 + }, + { + "epoch": 0.9736783403970556, + "grad_norm": 0.19527114927768707, + "learning_rate": 1.5373519856151077e-05, + "loss": 0.4783, + "step": 4365 + }, + { + "epoch": 0.9739014053089449, + "grad_norm": 0.1601899415254593, + "learning_rate": 1.5371534923829562e-05, + "loss": 0.5042, + "step": 4366 + }, + { + "epoch": 0.9741244702208343, + "grad_norm": 0.16773121058940887, + "learning_rate": 1.536954969399866e-05, + "loss": 0.5025, + "step": 4367 + }, + { + "epoch": 0.9743475351327237, + "grad_norm": 0.1618761271238327, + "learning_rate": 1.5367564166768322e-05, + "loss": 0.5125, + "step": 4368 + }, + { + "epoch": 0.9745706000446129, + "grad_norm": 0.1555018573999405, + "learning_rate": 1.5365578342248515e-05, + "loss": 0.4754, + "step": 4369 + }, + { + "epoch": 0.9747936649565023, + "grad_norm": 0.16692039370536804, + "learning_rate": 1.5363592220549227e-05, + "loss": 0.5025, + "step": 4370 + }, + { + "epoch": 0.9750167298683917, + "grad_norm": 0.16272681951522827, + "learning_rate": 1.5361605801780465e-05, + "loss": 0.513, + "step": 4371 + }, + { + "epoch": 0.9752397947802811, + "grad_norm": 0.17153267562389374, + "learning_rate": 1.535961908605225e-05, + "loss": 0.4716, + "step": 4372 + }, + { + "epoch": 0.9754628596921704, + "grad_norm": 0.162004753947258, + "learning_rate": 1.5357632073474614e-05, + "loss": 0.4752, + "step": 4373 + }, + { + "epoch": 0.9756859246040598, + "grad_norm": 0.17653700709342957, + "learning_rate": 1.535564476415761e-05, + "loss": 0.4875, + "step": 4374 + }, + { + "epoch": 0.9759089895159492, + "grad_norm": 0.16345418989658356, + "learning_rate": 1.535365715821132e-05, + "loss": 0.5036, + "step": 4375 + }, + { + "epoch": 0.9761320544278385, + "grad_norm": 0.16799689829349518, + "learning_rate": 1.535166925574581e-05, + "loss": 0.5035, + "step": 4376 + }, + { + "epoch": 0.9763551193397279, + "grad_norm": 0.204716295003891, + "learning_rate": 1.53496810568712e-05, + "loss": 0.501, + "step": 4377 + }, + { + "epoch": 0.9765781842516172, + "grad_norm": 0.15521782636642456, + "learning_rate": 1.53476925616976e-05, + "loss": 0.4864, + "step": 4378 + }, + { + "epoch": 0.9768012491635066, + "grad_norm": 0.16439463198184967, + "learning_rate": 1.5345703770335147e-05, + "loss": 0.491, + "step": 4379 + }, + { + "epoch": 0.9770243140753959, + "grad_norm": 0.16255317628383636, + "learning_rate": 1.5343714682893997e-05, + "loss": 0.5108, + "step": 4380 + }, + { + "epoch": 0.9772473789872853, + "grad_norm": 0.15555396676063538, + "learning_rate": 1.534172529948431e-05, + "loss": 0.4506, + "step": 4381 + }, + { + "epoch": 0.9774704438991747, + "grad_norm": 0.16245706379413605, + "learning_rate": 1.5339735620216275e-05, + "loss": 0.4892, + "step": 4382 + }, + { + "epoch": 0.977693508811064, + "grad_norm": 0.16190017759799957, + "learning_rate": 1.5337745645200097e-05, + "loss": 0.4716, + "step": 4383 + }, + { + "epoch": 0.9779165737229534, + "grad_norm": 0.1614699810743332, + "learning_rate": 1.5335755374545985e-05, + "loss": 0.5041, + "step": 4384 + }, + { + "epoch": 0.9781396386348428, + "grad_norm": 0.1543981432914734, + "learning_rate": 1.533376480836418e-05, + "loss": 0.4837, + "step": 4385 + }, + { + "epoch": 0.978362703546732, + "grad_norm": 0.15510593354701996, + "learning_rate": 1.5331773946764928e-05, + "loss": 0.5073, + "step": 4386 + }, + { + "epoch": 0.9785857684586214, + "grad_norm": 0.16036750376224518, + "learning_rate": 1.5329782789858495e-05, + "loss": 0.511, + "step": 4387 + }, + { + "epoch": 0.9788088333705108, + "grad_norm": 0.1586884707212448, + "learning_rate": 1.532779133775517e-05, + "loss": 0.4783, + "step": 4388 + }, + { + "epoch": 0.9790318982824002, + "grad_norm": 0.1595693677663803, + "learning_rate": 1.5325799590565247e-05, + "loss": 0.4847, + "step": 4389 + }, + { + "epoch": 0.9792549631942895, + "grad_norm": 0.1676463484764099, + "learning_rate": 1.532380754839904e-05, + "loss": 0.4857, + "step": 4390 + }, + { + "epoch": 0.9794780281061789, + "grad_norm": 0.1569330245256424, + "learning_rate": 1.532181521136688e-05, + "loss": 0.5038, + "step": 4391 + }, + { + "epoch": 0.9797010930180683, + "grad_norm": 0.15425747632980347, + "learning_rate": 1.5319822579579125e-05, + "loss": 0.4687, + "step": 4392 + }, + { + "epoch": 0.9799241579299576, + "grad_norm": 0.1658424586057663, + "learning_rate": 1.5317829653146127e-05, + "loss": 0.4727, + "step": 4393 + }, + { + "epoch": 0.980147222841847, + "grad_norm": 0.16950277984142303, + "learning_rate": 1.5315836432178275e-05, + "loss": 0.5056, + "step": 4394 + }, + { + "epoch": 0.9803702877537364, + "grad_norm": 0.1796022653579712, + "learning_rate": 1.5313842916785965e-05, + "loss": 0.5062, + "step": 4395 + }, + { + "epoch": 0.9805933526656258, + "grad_norm": 0.16499702632427216, + "learning_rate": 1.5311849107079603e-05, + "loss": 0.5142, + "step": 4396 + }, + { + "epoch": 0.980816417577515, + "grad_norm": 0.1709393560886383, + "learning_rate": 1.5309855003169632e-05, + "loss": 0.5001, + "step": 4397 + }, + { + "epoch": 0.9810394824894044, + "grad_norm": 0.16077296435832977, + "learning_rate": 1.5307860605166487e-05, + "loss": 0.4908, + "step": 4398 + }, + { + "epoch": 0.9812625474012938, + "grad_norm": 0.17582206428050995, + "learning_rate": 1.5305865913180633e-05, + "loss": 0.4844, + "step": 4399 + }, + { + "epoch": 0.9814856123131831, + "grad_norm": 0.16168825328350067, + "learning_rate": 1.5303870927322552e-05, + "loss": 0.4856, + "step": 4400 + }, + { + "epoch": 0.9817086772250725, + "grad_norm": 0.15929754078388214, + "learning_rate": 1.5301875647702732e-05, + "loss": 0.4878, + "step": 4401 + }, + { + "epoch": 0.9819317421369619, + "grad_norm": 0.16380812227725983, + "learning_rate": 1.5299880074431693e-05, + "loss": 0.5009, + "step": 4402 + }, + { + "epoch": 0.9821548070488512, + "grad_norm": 0.17140789330005646, + "learning_rate": 1.5297884207619957e-05, + "loss": 0.4604, + "step": 4403 + }, + { + "epoch": 0.9823778719607406, + "grad_norm": 0.17358756065368652, + "learning_rate": 1.5295888047378064e-05, + "loss": 0.4905, + "step": 4404 + }, + { + "epoch": 0.98260093687263, + "grad_norm": 0.16201473772525787, + "learning_rate": 1.5293891593816583e-05, + "loss": 0.5156, + "step": 4405 + }, + { + "epoch": 0.9828240017845193, + "grad_norm": 0.16091175377368927, + "learning_rate": 1.529189484704608e-05, + "loss": 0.4883, + "step": 4406 + }, + { + "epoch": 0.9830470666964086, + "grad_norm": 0.16278314590454102, + "learning_rate": 1.528989780717716e-05, + "loss": 0.4727, + "step": 4407 + }, + { + "epoch": 0.983270131608298, + "grad_norm": 0.17446744441986084, + "learning_rate": 1.5287900474320422e-05, + "loss": 0.4824, + "step": 4408 + }, + { + "epoch": 0.9834931965201874, + "grad_norm": 0.3287087678909302, + "learning_rate": 1.5285902848586495e-05, + "loss": 0.49, + "step": 4409 + }, + { + "epoch": 0.9837162614320767, + "grad_norm": 0.16612185537815094, + "learning_rate": 1.5283904930086017e-05, + "loss": 0.4911, + "step": 4410 + }, + { + "epoch": 0.9839393263439661, + "grad_norm": 0.16220030188560486, + "learning_rate": 1.528190671892965e-05, + "loss": 0.4537, + "step": 4411 + }, + { + "epoch": 0.9841623912558555, + "grad_norm": 0.1631203591823578, + "learning_rate": 1.5279908215228058e-05, + "loss": 0.4725, + "step": 4412 + }, + { + "epoch": 0.9843854561677449, + "grad_norm": 0.15897974371910095, + "learning_rate": 1.5277909419091942e-05, + "loss": 0.4794, + "step": 4413 + }, + { + "epoch": 0.9846085210796341, + "grad_norm": 0.18077795207500458, + "learning_rate": 1.5275910330632e-05, + "loss": 0.4764, + "step": 4414 + }, + { + "epoch": 0.9848315859915235, + "grad_norm": 0.1682848483324051, + "learning_rate": 1.5273910949958963e-05, + "loss": 0.5024, + "step": 4415 + }, + { + "epoch": 0.9850546509034129, + "grad_norm": 0.1584632247686386, + "learning_rate": 1.527191127718356e-05, + "loss": 0.5007, + "step": 4416 + }, + { + "epoch": 0.9852777158153022, + "grad_norm": 0.1735336184501648, + "learning_rate": 1.5269911312416547e-05, + "loss": 0.4887, + "step": 4417 + }, + { + "epoch": 0.9855007807271916, + "grad_norm": 0.16184300184249878, + "learning_rate": 1.5267911055768697e-05, + "loss": 0.5064, + "step": 4418 + }, + { + "epoch": 0.985723845639081, + "grad_norm": 0.15283328294754028, + "learning_rate": 1.5265910507350797e-05, + "loss": 0.5012, + "step": 4419 + }, + { + "epoch": 0.9859469105509703, + "grad_norm": 0.1493072211742401, + "learning_rate": 1.526390966727365e-05, + "loss": 0.4763, + "step": 4420 + }, + { + "epoch": 0.9861699754628597, + "grad_norm": 0.15723635256290436, + "learning_rate": 1.526190853564807e-05, + "loss": 0.4893, + "step": 4421 + }, + { + "epoch": 0.9863930403747491, + "grad_norm": 0.17012642323970795, + "learning_rate": 1.52599071125849e-05, + "loss": 0.4905, + "step": 4422 + }, + { + "epoch": 0.9866161052866385, + "grad_norm": 0.15649664402008057, + "learning_rate": 1.5257905398194988e-05, + "loss": 0.4841, + "step": 4423 + }, + { + "epoch": 0.9868391701985277, + "grad_norm": 0.1696402132511139, + "learning_rate": 1.5255903392589204e-05, + "loss": 0.4792, + "step": 4424 + }, + { + "epoch": 0.9870622351104171, + "grad_norm": 0.17853881418704987, + "learning_rate": 1.5253901095878423e-05, + "loss": 0.5112, + "step": 4425 + }, + { + "epoch": 0.9872853000223065, + "grad_norm": 0.1598142683506012, + "learning_rate": 1.5251898508173558e-05, + "loss": 0.4907, + "step": 4426 + }, + { + "epoch": 0.9875083649341958, + "grad_norm": 0.1565473973751068, + "learning_rate": 1.5249895629585511e-05, + "loss": 0.4922, + "step": 4427 + }, + { + "epoch": 0.9877314298460852, + "grad_norm": 0.16049052774906158, + "learning_rate": 1.5247892460225226e-05, + "loss": 0.4947, + "step": 4428 + }, + { + "epoch": 0.9879544947579746, + "grad_norm": 0.16537557542324066, + "learning_rate": 1.5245889000203644e-05, + "loss": 0.4547, + "step": 4429 + }, + { + "epoch": 0.988177559669864, + "grad_norm": 0.16105584800243378, + "learning_rate": 1.5243885249631732e-05, + "loss": 0.4697, + "step": 4430 + }, + { + "epoch": 0.9884006245817533, + "grad_norm": 0.5197194814682007, + "learning_rate": 1.5241881208620468e-05, + "loss": 0.4723, + "step": 4431 + }, + { + "epoch": 0.9886236894936427, + "grad_norm": 0.15987588465213776, + "learning_rate": 1.5239876877280852e-05, + "loss": 0.5388, + "step": 4432 + }, + { + "epoch": 0.988846754405532, + "grad_norm": 0.16223685443401337, + "learning_rate": 1.5237872255723894e-05, + "loss": 0.5185, + "step": 4433 + }, + { + "epoch": 0.9890698193174213, + "grad_norm": 0.16801123321056366, + "learning_rate": 1.5235867344060622e-05, + "loss": 0.4825, + "step": 4434 + }, + { + "epoch": 0.9892928842293107, + "grad_norm": 0.16588987410068512, + "learning_rate": 1.523386214240208e-05, + "loss": 0.4933, + "step": 4435 + }, + { + "epoch": 0.9895159491412001, + "grad_norm": 0.1642870157957077, + "learning_rate": 1.5231856650859334e-05, + "loss": 0.4572, + "step": 4436 + }, + { + "epoch": 0.9897390140530894, + "grad_norm": 0.1569276601076126, + "learning_rate": 1.5229850869543454e-05, + "loss": 0.4844, + "step": 4437 + }, + { + "epoch": 0.9899620789649788, + "grad_norm": 0.16232363879680634, + "learning_rate": 1.5227844798565538e-05, + "loss": 0.5041, + "step": 4438 + }, + { + "epoch": 0.9901851438768682, + "grad_norm": 0.18178559839725494, + "learning_rate": 1.5225838438036693e-05, + "loss": 0.5203, + "step": 4439 + }, + { + "epoch": 0.9904082087887576, + "grad_norm": 0.1483062505722046, + "learning_rate": 1.5223831788068039e-05, + "loss": 0.4601, + "step": 4440 + }, + { + "epoch": 0.9906312737006469, + "grad_norm": 0.16904570162296295, + "learning_rate": 1.5221824848770728e-05, + "loss": 0.4692, + "step": 4441 + }, + { + "epoch": 0.9908543386125362, + "grad_norm": 0.16170957684516907, + "learning_rate": 1.5219817620255906e-05, + "loss": 0.476, + "step": 4442 + }, + { + "epoch": 0.9910774035244256, + "grad_norm": 0.15581966936588287, + "learning_rate": 1.521781010263475e-05, + "loss": 0.4685, + "step": 4443 + }, + { + "epoch": 0.9913004684363149, + "grad_norm": 0.17382118105888367, + "learning_rate": 1.521580229601845e-05, + "loss": 0.511, + "step": 4444 + }, + { + "epoch": 0.9915235333482043, + "grad_norm": 0.16696421802043915, + "learning_rate": 1.521379420051821e-05, + "loss": 0.5072, + "step": 4445 + }, + { + "epoch": 0.9917465982600937, + "grad_norm": 0.16701167821884155, + "learning_rate": 1.521178581624525e-05, + "loss": 0.521, + "step": 4446 + }, + { + "epoch": 0.9919696631719831, + "grad_norm": 0.16650201380252838, + "learning_rate": 1.520977714331081e-05, + "loss": 0.4945, + "step": 4447 + }, + { + "epoch": 0.9921927280838724, + "grad_norm": 0.1575930416584015, + "learning_rate": 1.5207768181826138e-05, + "loss": 0.4668, + "step": 4448 + }, + { + "epoch": 0.9924157929957618, + "grad_norm": 0.16714414954185486, + "learning_rate": 1.5205758931902507e-05, + "loss": 0.4739, + "step": 4449 + }, + { + "epoch": 0.9926388579076512, + "grad_norm": 0.1591663658618927, + "learning_rate": 1.5203749393651204e-05, + "loss": 0.5024, + "step": 4450 + }, + { + "epoch": 0.9928619228195404, + "grad_norm": 0.16330404579639435, + "learning_rate": 1.5201739567183525e-05, + "loss": 0.5104, + "step": 4451 + }, + { + "epoch": 0.9930849877314298, + "grad_norm": 0.16072112321853638, + "learning_rate": 1.519972945261079e-05, + "loss": 0.4722, + "step": 4452 + }, + { + "epoch": 0.9933080526433192, + "grad_norm": 0.1594439148902893, + "learning_rate": 1.5197719050044328e-05, + "loss": 0.4951, + "step": 4453 + }, + { + "epoch": 0.9935311175552086, + "grad_norm": 0.17731733620166779, + "learning_rate": 1.519570835959549e-05, + "loss": 0.4799, + "step": 4454 + }, + { + "epoch": 0.9937541824670979, + "grad_norm": 0.1658770889043808, + "learning_rate": 1.5193697381375641e-05, + "loss": 0.471, + "step": 4455 + }, + { + "epoch": 0.9939772473789873, + "grad_norm": 0.168108731508255, + "learning_rate": 1.5191686115496163e-05, + "loss": 0.4856, + "step": 4456 + }, + { + "epoch": 0.9942003122908767, + "grad_norm": 0.1631343513727188, + "learning_rate": 1.5189674562068448e-05, + "loss": 0.4795, + "step": 4457 + }, + { + "epoch": 0.994423377202766, + "grad_norm": 0.17591005563735962, + "learning_rate": 1.5187662721203916e-05, + "loss": 0.4431, + "step": 4458 + }, + { + "epoch": 0.9946464421146554, + "grad_norm": 0.16236771643161774, + "learning_rate": 1.5185650593013984e-05, + "loss": 0.5097, + "step": 4459 + }, + { + "epoch": 0.9948695070265448, + "grad_norm": 0.15711717307567596, + "learning_rate": 1.5183638177610109e-05, + "loss": 0.4896, + "step": 4460 + }, + { + "epoch": 0.995092571938434, + "grad_norm": 0.16357070207595825, + "learning_rate": 1.5181625475103744e-05, + "loss": 0.4878, + "step": 4461 + }, + { + "epoch": 0.9953156368503234, + "grad_norm": 0.1608823835849762, + "learning_rate": 1.5179612485606366e-05, + "loss": 0.4924, + "step": 4462 + }, + { + "epoch": 0.9955387017622128, + "grad_norm": 0.1610344648361206, + "learning_rate": 1.5177599209229468e-05, + "loss": 0.4564, + "step": 4463 + }, + { + "epoch": 0.9957617666741022, + "grad_norm": 0.1570533961057663, + "learning_rate": 1.5175585646084557e-05, + "loss": 0.4665, + "step": 4464 + }, + { + "epoch": 0.9959848315859915, + "grad_norm": 0.16878454387187958, + "learning_rate": 1.5173571796283155e-05, + "loss": 0.4832, + "step": 4465 + }, + { + "epoch": 0.9962078964978809, + "grad_norm": 0.15934909880161285, + "learning_rate": 1.5171557659936806e-05, + "loss": 0.4629, + "step": 4466 + }, + { + "epoch": 0.9964309614097703, + "grad_norm": 0.17097942531108856, + "learning_rate": 1.5169543237157062e-05, + "loss": 0.5009, + "step": 4467 + }, + { + "epoch": 0.9966540263216596, + "grad_norm": 0.1563706398010254, + "learning_rate": 1.5167528528055498e-05, + "loss": 0.4864, + "step": 4468 + }, + { + "epoch": 0.996877091233549, + "grad_norm": 0.16761906445026398, + "learning_rate": 1.5165513532743696e-05, + "loss": 0.4688, + "step": 4469 + }, + { + "epoch": 0.9971001561454383, + "grad_norm": 0.16788896918296814, + "learning_rate": 1.5163498251333267e-05, + "loss": 0.4975, + "step": 4470 + }, + { + "epoch": 0.9973232210573277, + "grad_norm": 0.16318921744823456, + "learning_rate": 1.516148268393582e-05, + "loss": 0.483, + "step": 4471 + }, + { + "epoch": 0.997546285969217, + "grad_norm": 0.1533585637807846, + "learning_rate": 1.5159466830662997e-05, + "loss": 0.4888, + "step": 4472 + }, + { + "epoch": 0.9977693508811064, + "grad_norm": 0.16081759333610535, + "learning_rate": 1.515745069162645e-05, + "loss": 0.4997, + "step": 4473 + }, + { + "epoch": 0.9979924157929958, + "grad_norm": 0.17346571385860443, + "learning_rate": 1.5155434266937836e-05, + "loss": 0.4918, + "step": 4474 + }, + { + "epoch": 0.9982154807048851, + "grad_norm": 0.16051939129829407, + "learning_rate": 1.515341755670885e-05, + "loss": 0.4911, + "step": 4475 + }, + { + "epoch": 0.9984385456167745, + "grad_norm": 0.15489095449447632, + "learning_rate": 1.5151400561051177e-05, + "loss": 0.4573, + "step": 4476 + }, + { + "epoch": 0.9986616105286639, + "grad_norm": 0.17051677405834198, + "learning_rate": 1.5149383280076544e-05, + "loss": 0.47, + "step": 4477 + }, + { + "epoch": 0.9988846754405531, + "grad_norm": 0.16898037493228912, + "learning_rate": 1.5147365713896669e-05, + "loss": 0.5265, + "step": 4478 + }, + { + "epoch": 0.9991077403524425, + "grad_norm": 0.1665852814912796, + "learning_rate": 1.5145347862623303e-05, + "loss": 0.4967, + "step": 4479 + }, + { + "epoch": 0.9993308052643319, + "grad_norm": 0.16627469658851624, + "learning_rate": 1.5143329726368205e-05, + "loss": 0.4755, + "step": 4480 + }, + { + "epoch": 0.9995538701762213, + "grad_norm": 0.15166249871253967, + "learning_rate": 1.5141311305243158e-05, + "loss": 0.4762, + "step": 4481 + }, + { + "epoch": 0.9997769350881106, + "grad_norm": 0.15809613466262817, + "learning_rate": 1.513929259935995e-05, + "loss": 0.4825, + "step": 4482 + }, + { + "epoch": 1.0, + "grad_norm": 0.24375846982002258, + "learning_rate": 1.5137273608830387e-05, + "loss": 0.4902, + "step": 4483 + }, + { + "epoch": 1.0, + "eval_loss": 0.3229251801967621, + "eval_runtime": 666.2343, + "eval_samples_per_second": 94.726, + "eval_steps_per_second": 1.481, + "step": 4483 + }, + { + "epoch": 1.0002230649118893, + "grad_norm": 0.2046872079372406, + "learning_rate": 1.5135254333766302e-05, + "loss": 0.5093, + "step": 4484 + }, + { + "epoch": 1.0004461298237788, + "grad_norm": 0.16689886152744293, + "learning_rate": 1.5133234774279526e-05, + "loss": 0.5118, + "step": 4485 + }, + { + "epoch": 1.000669194735668, + "grad_norm": 0.16698423027992249, + "learning_rate": 1.5131214930481922e-05, + "loss": 0.4917, + "step": 4486 + }, + { + "epoch": 1.0008922596475573, + "grad_norm": 0.17248550057411194, + "learning_rate": 1.5129194802485354e-05, + "loss": 0.4693, + "step": 4487 + }, + { + "epoch": 1.0011153245594469, + "grad_norm": 0.16011400520801544, + "learning_rate": 1.5127174390401717e-05, + "loss": 0.4788, + "step": 4488 + }, + { + "epoch": 1.0013383894713361, + "grad_norm": 0.16476082801818848, + "learning_rate": 1.512515369434291e-05, + "loss": 0.4723, + "step": 4489 + }, + { + "epoch": 1.0015614543832254, + "grad_norm": 0.1672191321849823, + "learning_rate": 1.5123132714420856e-05, + "loss": 0.484, + "step": 4490 + }, + { + "epoch": 1.001784519295115, + "grad_norm": 0.15264827013015747, + "learning_rate": 1.5121111450747483e-05, + "loss": 0.4577, + "step": 4491 + }, + { + "epoch": 1.0020075842070042, + "grad_norm": 0.15963797271251678, + "learning_rate": 1.5119089903434746e-05, + "loss": 0.4866, + "step": 4492 + }, + { + "epoch": 1.0022306491188937, + "grad_norm": 0.16446684300899506, + "learning_rate": 1.511706807259461e-05, + "loss": 0.4976, + "step": 4493 + }, + { + "epoch": 1.002453714030783, + "grad_norm": 0.1584995537996292, + "learning_rate": 1.5115045958339056e-05, + "loss": 0.4921, + "step": 4494 + }, + { + "epoch": 1.0026767789426723, + "grad_norm": 0.16608034074306488, + "learning_rate": 1.5113023560780083e-05, + "loss": 0.4946, + "step": 4495 + }, + { + "epoch": 1.0028998438545618, + "grad_norm": 0.15869465470314026, + "learning_rate": 1.5111000880029703e-05, + "loss": 0.5022, + "step": 4496 + }, + { + "epoch": 1.003122908766451, + "grad_norm": 0.16816754639148712, + "learning_rate": 1.5108977916199941e-05, + "loss": 0.4791, + "step": 4497 + }, + { + "epoch": 1.0033459736783403, + "grad_norm": 0.17300903797149658, + "learning_rate": 1.5106954669402849e-05, + "loss": 0.4817, + "step": 4498 + }, + { + "epoch": 1.0035690385902298, + "grad_norm": 0.17333939671516418, + "learning_rate": 1.510493113975048e-05, + "loss": 0.4671, + "step": 4499 + }, + { + "epoch": 1.0037921035021191, + "grad_norm": 0.2043878585100174, + "learning_rate": 1.5102907327354919e-05, + "loss": 0.5001, + "step": 4500 + }, + { + "epoch": 1.0040151684140084, + "grad_norm": 0.15702618658542633, + "learning_rate": 1.5100883232328247e-05, + "loss": 0.5172, + "step": 4501 + }, + { + "epoch": 1.004238233325898, + "grad_norm": 0.1576346904039383, + "learning_rate": 1.5098858854782576e-05, + "loss": 0.4813, + "step": 4502 + }, + { + "epoch": 1.0044612982377872, + "grad_norm": 0.171005517244339, + "learning_rate": 1.509683419483003e-05, + "loss": 0.5054, + "step": 4503 + }, + { + "epoch": 1.0046843631496765, + "grad_norm": 0.16706731915473938, + "learning_rate": 1.5094809252582744e-05, + "loss": 0.4791, + "step": 4504 + }, + { + "epoch": 1.004907428061566, + "grad_norm": 0.1601940393447876, + "learning_rate": 1.5092784028152878e-05, + "loss": 0.4788, + "step": 4505 + }, + { + "epoch": 1.0051304929734552, + "grad_norm": 0.1597285270690918, + "learning_rate": 1.5090758521652596e-05, + "loss": 0.4669, + "step": 4506 + }, + { + "epoch": 1.0053535578853445, + "grad_norm": 0.16775481402873993, + "learning_rate": 1.5088732733194085e-05, + "loss": 0.5091, + "step": 4507 + }, + { + "epoch": 1.005576622797234, + "grad_norm": 0.16889511048793793, + "learning_rate": 1.5086706662889544e-05, + "loss": 0.5208, + "step": 4508 + }, + { + "epoch": 1.0057996877091233, + "grad_norm": 0.1600525975227356, + "learning_rate": 1.5084680310851192e-05, + "loss": 0.4861, + "step": 4509 + }, + { + "epoch": 1.0060227526210128, + "grad_norm": 0.18192386627197266, + "learning_rate": 1.508265367719126e-05, + "loss": 0.486, + "step": 4510 + }, + { + "epoch": 1.006245817532902, + "grad_norm": 0.1748298555612564, + "learning_rate": 1.5080626762021997e-05, + "loss": 0.4623, + "step": 4511 + }, + { + "epoch": 1.0064688824447914, + "grad_norm": 0.1619860827922821, + "learning_rate": 1.5078599565455666e-05, + "loss": 0.4707, + "step": 4512 + }, + { + "epoch": 1.0066919473566809, + "grad_norm": 0.16041533648967743, + "learning_rate": 1.5076572087604544e-05, + "loss": 0.4663, + "step": 4513 + }, + { + "epoch": 1.0069150122685702, + "grad_norm": 0.1654667854309082, + "learning_rate": 1.5074544328580928e-05, + "loss": 0.4746, + "step": 4514 + }, + { + "epoch": 1.0071380771804594, + "grad_norm": 0.1631074994802475, + "learning_rate": 1.5072516288497127e-05, + "loss": 0.508, + "step": 4515 + }, + { + "epoch": 1.007361142092349, + "grad_norm": 0.16078722476959229, + "learning_rate": 1.5070487967465466e-05, + "loss": 0.5053, + "step": 4516 + }, + { + "epoch": 1.0075842070042382, + "grad_norm": 0.15384818613529205, + "learning_rate": 1.5068459365598286e-05, + "loss": 0.4896, + "step": 4517 + }, + { + "epoch": 1.0078072719161275, + "grad_norm": 0.15592217445373535, + "learning_rate": 1.5066430483007949e-05, + "loss": 0.4911, + "step": 4518 + }, + { + "epoch": 1.008030336828017, + "grad_norm": 0.1650250107049942, + "learning_rate": 1.5064401319806817e-05, + "loss": 0.4914, + "step": 4519 + }, + { + "epoch": 1.0082534017399063, + "grad_norm": 0.16215115785598755, + "learning_rate": 1.5062371876107286e-05, + "loss": 0.468, + "step": 4520 + }, + { + "epoch": 1.0084764666517956, + "grad_norm": 0.1609305888414383, + "learning_rate": 1.5060342152021757e-05, + "loss": 0.5136, + "step": 4521 + }, + { + "epoch": 1.008699531563685, + "grad_norm": 0.15757685899734497, + "learning_rate": 1.505831214766265e-05, + "loss": 0.4816, + "step": 4522 + }, + { + "epoch": 1.0089225964755744, + "grad_norm": 0.15898853540420532, + "learning_rate": 1.5056281863142394e-05, + "loss": 0.4704, + "step": 4523 + }, + { + "epoch": 1.0091456613874636, + "grad_norm": 0.16032522916793823, + "learning_rate": 1.5054251298573447e-05, + "loss": 0.4712, + "step": 4524 + }, + { + "epoch": 1.0093687262993531, + "grad_norm": 0.1658545881509781, + "learning_rate": 1.5052220454068267e-05, + "loss": 0.4975, + "step": 4525 + }, + { + "epoch": 1.0095917912112424, + "grad_norm": 0.15979249775409698, + "learning_rate": 1.5050189329739344e-05, + "loss": 0.4768, + "step": 4526 + }, + { + "epoch": 1.009814856123132, + "grad_norm": 0.16390205919742584, + "learning_rate": 1.5048157925699162e-05, + "loss": 0.5083, + "step": 4527 + }, + { + "epoch": 1.0100379210350212, + "grad_norm": 0.16372700035572052, + "learning_rate": 1.5046126242060247e-05, + "loss": 0.4919, + "step": 4528 + }, + { + "epoch": 1.0102609859469105, + "grad_norm": 0.16318677365779877, + "learning_rate": 1.5044094278935115e-05, + "loss": 0.4736, + "step": 4529 + }, + { + "epoch": 1.0104840508588, + "grad_norm": 0.1591407060623169, + "learning_rate": 1.5042062036436315e-05, + "loss": 0.4911, + "step": 4530 + }, + { + "epoch": 1.0107071157706893, + "grad_norm": 0.16406004130840302, + "learning_rate": 1.5040029514676402e-05, + "loss": 0.4897, + "step": 4531 + }, + { + "epoch": 1.0109301806825786, + "grad_norm": 0.15959376096725464, + "learning_rate": 1.5037996713767956e-05, + "loss": 0.4638, + "step": 4532 + }, + { + "epoch": 1.011153245594468, + "grad_norm": 0.15884444117546082, + "learning_rate": 1.5035963633823553e-05, + "loss": 0.4431, + "step": 4533 + }, + { + "epoch": 1.0113763105063573, + "grad_norm": 0.15193885564804077, + "learning_rate": 1.5033930274955813e-05, + "loss": 0.4828, + "step": 4534 + }, + { + "epoch": 1.0115993754182466, + "grad_norm": 0.16269952058792114, + "learning_rate": 1.503189663727735e-05, + "loss": 0.4977, + "step": 4535 + }, + { + "epoch": 1.0118224403301361, + "grad_norm": 0.1688687652349472, + "learning_rate": 1.5029862720900794e-05, + "loss": 0.4798, + "step": 4536 + }, + { + "epoch": 1.0120455052420254, + "grad_norm": 0.1625138372182846, + "learning_rate": 1.5027828525938809e-05, + "loss": 0.4732, + "step": 4537 + }, + { + "epoch": 1.0122685701539147, + "grad_norm": 0.162174254655838, + "learning_rate": 1.5025794052504048e-05, + "loss": 0.4912, + "step": 4538 + }, + { + "epoch": 1.0124916350658042, + "grad_norm": 0.1722794622182846, + "learning_rate": 1.5023759300709201e-05, + "loss": 0.5365, + "step": 4539 + }, + { + "epoch": 1.0127146999776935, + "grad_norm": 0.18794307112693787, + "learning_rate": 1.5021724270666962e-05, + "loss": 0.4874, + "step": 4540 + }, + { + "epoch": 1.0129377648895828, + "grad_norm": 0.16088199615478516, + "learning_rate": 1.5019688962490047e-05, + "loss": 0.4658, + "step": 4541 + }, + { + "epoch": 1.0131608298014723, + "grad_norm": 0.1688738763332367, + "learning_rate": 1.501765337629118e-05, + "loss": 0.4869, + "step": 4542 + }, + { + "epoch": 1.0133838947133615, + "grad_norm": 0.2059110552072525, + "learning_rate": 1.5015617512183109e-05, + "loss": 0.4952, + "step": 4543 + }, + { + "epoch": 1.013606959625251, + "grad_norm": 0.1552867442369461, + "learning_rate": 1.5013581370278587e-05, + "loss": 0.4761, + "step": 4544 + }, + { + "epoch": 1.0138300245371403, + "grad_norm": 0.15534162521362305, + "learning_rate": 1.5011544950690397e-05, + "loss": 0.4751, + "step": 4545 + }, + { + "epoch": 1.0140530894490296, + "grad_norm": 0.162929505109787, + "learning_rate": 1.5009508253531321e-05, + "loss": 0.5241, + "step": 4546 + }, + { + "epoch": 1.0142761543609191, + "grad_norm": 0.16057923436164856, + "learning_rate": 1.5007471278914167e-05, + "loss": 0.4944, + "step": 4547 + }, + { + "epoch": 1.0144992192728084, + "grad_norm": 0.18289563059806824, + "learning_rate": 1.5005434026951755e-05, + "loss": 0.4852, + "step": 4548 + }, + { + "epoch": 1.0147222841846977, + "grad_norm": 0.1580429971218109, + "learning_rate": 1.5003396497756923e-05, + "loss": 0.4776, + "step": 4549 + }, + { + "epoch": 1.0149453490965872, + "grad_norm": 0.1565399467945099, + "learning_rate": 1.5001358691442517e-05, + "loss": 0.4785, + "step": 4550 + }, + { + "epoch": 1.0151684140084765, + "grad_norm": 0.16028526425361633, + "learning_rate": 1.4999320608121411e-05, + "loss": 0.4797, + "step": 4551 + }, + { + "epoch": 1.0153914789203657, + "grad_norm": 0.16666477918624878, + "learning_rate": 1.499728224790648e-05, + "loss": 0.5112, + "step": 4552 + }, + { + "epoch": 1.0156145438322552, + "grad_norm": 0.164764866232872, + "learning_rate": 1.4995243610910625e-05, + "loss": 0.4843, + "step": 4553 + }, + { + "epoch": 1.0158376087441445, + "grad_norm": 0.16802887618541718, + "learning_rate": 1.4993204697246758e-05, + "loss": 0.4642, + "step": 4554 + }, + { + "epoch": 1.0160606736560338, + "grad_norm": 0.16424837708473206, + "learning_rate": 1.4991165507027802e-05, + "loss": 0.471, + "step": 4555 + }, + { + "epoch": 1.0162837385679233, + "grad_norm": 0.16172267496585846, + "learning_rate": 1.498912604036671e-05, + "loss": 0.4747, + "step": 4556 + }, + { + "epoch": 1.0165068034798126, + "grad_norm": 0.1546371579170227, + "learning_rate": 1.4987086297376431e-05, + "loss": 0.4624, + "step": 4557 + }, + { + "epoch": 1.0167298683917019, + "grad_norm": 0.15725232660770416, + "learning_rate": 1.4985046278169949e-05, + "loss": 0.4799, + "step": 4558 + }, + { + "epoch": 1.0169529333035914, + "grad_norm": 0.1614525318145752, + "learning_rate": 1.4983005982860241e-05, + "loss": 0.4652, + "step": 4559 + }, + { + "epoch": 1.0171759982154807, + "grad_norm": 0.1634308397769928, + "learning_rate": 1.498096541156032e-05, + "loss": 0.4742, + "step": 4560 + }, + { + "epoch": 1.0173990631273702, + "grad_norm": 0.16403678059577942, + "learning_rate": 1.4978924564383202e-05, + "loss": 0.4956, + "step": 4561 + }, + { + "epoch": 1.0176221280392594, + "grad_norm": 0.16014313697814941, + "learning_rate": 1.4976883441441924e-05, + "loss": 0.4956, + "step": 4562 + }, + { + "epoch": 1.0178451929511487, + "grad_norm": 0.16341127455234528, + "learning_rate": 1.4974842042849533e-05, + "loss": 0.5086, + "step": 4563 + }, + { + "epoch": 1.0180682578630382, + "grad_norm": 0.16314633190631866, + "learning_rate": 1.4972800368719098e-05, + "loss": 0.4681, + "step": 4564 + }, + { + "epoch": 1.0182913227749275, + "grad_norm": 0.16120873391628265, + "learning_rate": 1.4970758419163698e-05, + "loss": 0.4839, + "step": 4565 + }, + { + "epoch": 1.0185143876868168, + "grad_norm": 0.15808890759944916, + "learning_rate": 1.4968716194296429e-05, + "loss": 0.4711, + "step": 4566 + }, + { + "epoch": 1.0187374525987063, + "grad_norm": 0.1576492190361023, + "learning_rate": 1.4966673694230406e-05, + "loss": 0.4767, + "step": 4567 + }, + { + "epoch": 1.0189605175105956, + "grad_norm": 0.17056535184383392, + "learning_rate": 1.4964630919078747e-05, + "loss": 0.4551, + "step": 4568 + }, + { + "epoch": 1.0191835824224849, + "grad_norm": 0.15599535405635834, + "learning_rate": 1.4962587868954603e-05, + "loss": 0.462, + "step": 4569 + }, + { + "epoch": 1.0194066473343744, + "grad_norm": 0.16496673226356506, + "learning_rate": 1.4960544543971125e-05, + "loss": 0.4903, + "step": 4570 + }, + { + "epoch": 1.0196297122462636, + "grad_norm": 0.16336064040660858, + "learning_rate": 1.4958500944241488e-05, + "loss": 0.4804, + "step": 4571 + }, + { + "epoch": 1.019852777158153, + "grad_norm": 0.1574566662311554, + "learning_rate": 1.4956457069878875e-05, + "loss": 0.4405, + "step": 4572 + }, + { + "epoch": 1.0200758420700424, + "grad_norm": 0.1587173491716385, + "learning_rate": 1.4954412920996498e-05, + "loss": 0.4729, + "step": 4573 + }, + { + "epoch": 1.0202989069819317, + "grad_norm": 0.16297802329063416, + "learning_rate": 1.4952368497707566e-05, + "loss": 0.475, + "step": 4574 + }, + { + "epoch": 1.020521971893821, + "grad_norm": 0.16890546679496765, + "learning_rate": 1.4950323800125314e-05, + "loss": 0.4715, + "step": 4575 + }, + { + "epoch": 1.0207450368057105, + "grad_norm": 0.16996879875659943, + "learning_rate": 1.4948278828362991e-05, + "loss": 0.4772, + "step": 4576 + }, + { + "epoch": 1.0209681017175998, + "grad_norm": 0.16053706407546997, + "learning_rate": 1.4946233582533865e-05, + "loss": 0.4763, + "step": 4577 + }, + { + "epoch": 1.0211911666294893, + "grad_norm": 0.15968559682369232, + "learning_rate": 1.4944188062751207e-05, + "loss": 0.4593, + "step": 4578 + }, + { + "epoch": 1.0214142315413786, + "grad_norm": 0.15733575820922852, + "learning_rate": 1.4942142269128312e-05, + "loss": 0.462, + "step": 4579 + }, + { + "epoch": 1.0216372964532678, + "grad_norm": 0.1631694734096527, + "learning_rate": 1.4940096201778493e-05, + "loss": 0.4845, + "step": 4580 + }, + { + "epoch": 1.0218603613651573, + "grad_norm": 0.15469685196876526, + "learning_rate": 1.4938049860815072e-05, + "loss": 0.4726, + "step": 4581 + }, + { + "epoch": 1.0220834262770466, + "grad_norm": 0.16130223870277405, + "learning_rate": 1.4936003246351386e-05, + "loss": 0.4966, + "step": 4582 + }, + { + "epoch": 1.022306491188936, + "grad_norm": 0.19383056461811066, + "learning_rate": 1.4933956358500794e-05, + "loss": 0.5019, + "step": 4583 + }, + { + "epoch": 1.0225295561008254, + "grad_norm": 0.16322484612464905, + "learning_rate": 1.4931909197376664e-05, + "loss": 0.4344, + "step": 4584 + }, + { + "epoch": 1.0227526210127147, + "grad_norm": 0.15319214761257172, + "learning_rate": 1.4929861763092378e-05, + "loss": 0.475, + "step": 4585 + }, + { + "epoch": 1.022975685924604, + "grad_norm": 0.16148288547992706, + "learning_rate": 1.4927814055761336e-05, + "loss": 0.5031, + "step": 4586 + }, + { + "epoch": 1.0231987508364935, + "grad_norm": 0.17277774214744568, + "learning_rate": 1.4925766075496955e-05, + "loss": 0.5255, + "step": 4587 + }, + { + "epoch": 1.0234218157483828, + "grad_norm": 0.16420382261276245, + "learning_rate": 1.4923717822412666e-05, + "loss": 0.4811, + "step": 4588 + }, + { + "epoch": 1.023644880660272, + "grad_norm": 0.16079513728618622, + "learning_rate": 1.4921669296621912e-05, + "loss": 0.4742, + "step": 4589 + }, + { + "epoch": 1.0238679455721615, + "grad_norm": 0.1617536097764969, + "learning_rate": 1.4919620498238154e-05, + "loss": 0.4818, + "step": 4590 + }, + { + "epoch": 1.0240910104840508, + "grad_norm": 0.15949992835521698, + "learning_rate": 1.4917571427374866e-05, + "loss": 0.4697, + "step": 4591 + }, + { + "epoch": 1.0243140753959403, + "grad_norm": 0.1637139767408371, + "learning_rate": 1.4915522084145542e-05, + "loss": 0.4507, + "step": 4592 + }, + { + "epoch": 1.0245371403078296, + "grad_norm": 0.16350442171096802, + "learning_rate": 1.4913472468663681e-05, + "loss": 0.4676, + "step": 4593 + }, + { + "epoch": 1.024760205219719, + "grad_norm": 0.1604386270046234, + "learning_rate": 1.4911422581042812e-05, + "loss": 0.4941, + "step": 4594 + }, + { + "epoch": 1.0249832701316084, + "grad_norm": 0.16599829494953156, + "learning_rate": 1.4909372421396464e-05, + "loss": 0.486, + "step": 4595 + }, + { + "epoch": 1.0252063350434977, + "grad_norm": 0.24195000529289246, + "learning_rate": 1.4907321989838192e-05, + "loss": 0.4824, + "step": 4596 + }, + { + "epoch": 1.025429399955387, + "grad_norm": 0.1621350198984146, + "learning_rate": 1.4905271286481557e-05, + "loss": 0.4602, + "step": 4597 + }, + { + "epoch": 1.0256524648672765, + "grad_norm": 0.16584086418151855, + "learning_rate": 1.4903220311440147e-05, + "loss": 0.4581, + "step": 4598 + }, + { + "epoch": 1.0258755297791657, + "grad_norm": 0.16808323562145233, + "learning_rate": 1.4901169064827552e-05, + "loss": 0.4756, + "step": 4599 + }, + { + "epoch": 1.026098594691055, + "grad_norm": 0.15748214721679688, + "learning_rate": 1.4899117546757383e-05, + "loss": 0.4366, + "step": 4600 + }, + { + "epoch": 1.0263216596029445, + "grad_norm": 0.18983392417430878, + "learning_rate": 1.489706575734327e-05, + "loss": 0.4447, + "step": 4601 + }, + { + "epoch": 1.0265447245148338, + "grad_norm": 0.16951000690460205, + "learning_rate": 1.4895013696698847e-05, + "loss": 0.4608, + "step": 4602 + }, + { + "epoch": 1.026767789426723, + "grad_norm": 0.1630699634552002, + "learning_rate": 1.4892961364937779e-05, + "loss": 0.4773, + "step": 4603 + }, + { + "epoch": 1.0269908543386126, + "grad_norm": 0.15806427597999573, + "learning_rate": 1.4890908762173731e-05, + "loss": 0.4579, + "step": 4604 + }, + { + "epoch": 1.0272139192505019, + "grad_norm": 0.16593055427074432, + "learning_rate": 1.4888855888520393e-05, + "loss": 0.4622, + "step": 4605 + }, + { + "epoch": 1.0274369841623912, + "grad_norm": 0.16501964628696442, + "learning_rate": 1.488680274409146e-05, + "loss": 0.4793, + "step": 4606 + }, + { + "epoch": 1.0276600490742807, + "grad_norm": 0.1590343415737152, + "learning_rate": 1.4884749329000654e-05, + "loss": 0.4562, + "step": 4607 + }, + { + "epoch": 1.02788311398617, + "grad_norm": 0.1620016247034073, + "learning_rate": 1.4882695643361702e-05, + "loss": 0.4836, + "step": 4608 + }, + { + "epoch": 1.0281061788980594, + "grad_norm": 0.1608029156923294, + "learning_rate": 1.4880641687288356e-05, + "loss": 0.4819, + "step": 4609 + }, + { + "epoch": 1.0283292438099487, + "grad_norm": 0.18065601587295532, + "learning_rate": 1.4878587460894369e-05, + "loss": 0.4966, + "step": 4610 + }, + { + "epoch": 1.028552308721838, + "grad_norm": 0.16342510282993317, + "learning_rate": 1.4876532964293522e-05, + "loss": 0.504, + "step": 4611 + }, + { + "epoch": 1.0287753736337275, + "grad_norm": 0.16731272637844086, + "learning_rate": 1.4874478197599605e-05, + "loss": 0.4517, + "step": 4612 + }, + { + "epoch": 1.0289984385456168, + "grad_norm": 0.17315874993801117, + "learning_rate": 1.4872423160926424e-05, + "loss": 0.4704, + "step": 4613 + }, + { + "epoch": 1.029221503457506, + "grad_norm": 0.16523095965385437, + "learning_rate": 1.48703678543878e-05, + "loss": 0.4943, + "step": 4614 + }, + { + "epoch": 1.0294445683693956, + "grad_norm": 0.1586511880159378, + "learning_rate": 1.4868312278097568e-05, + "loss": 0.486, + "step": 4615 + }, + { + "epoch": 1.0296676332812849, + "grad_norm": 0.167024627327919, + "learning_rate": 1.4866256432169577e-05, + "loss": 0.4898, + "step": 4616 + }, + { + "epoch": 1.0298906981931741, + "grad_norm": 0.16019749641418457, + "learning_rate": 1.4864200316717698e-05, + "loss": 0.4738, + "step": 4617 + }, + { + "epoch": 1.0301137631050636, + "grad_norm": 0.16836729645729065, + "learning_rate": 1.4862143931855806e-05, + "loss": 0.4597, + "step": 4618 + }, + { + "epoch": 1.030336828016953, + "grad_norm": 0.1572437733411789, + "learning_rate": 1.4860087277697798e-05, + "loss": 0.495, + "step": 4619 + }, + { + "epoch": 1.0305598929288422, + "grad_norm": 0.1620069444179535, + "learning_rate": 1.4858030354357588e-05, + "loss": 0.4681, + "step": 4620 + }, + { + "epoch": 1.0307829578407317, + "grad_norm": 0.1601821482181549, + "learning_rate": 1.4855973161949097e-05, + "loss": 0.4803, + "step": 4621 + }, + { + "epoch": 1.031006022752621, + "grad_norm": 0.16902416944503784, + "learning_rate": 1.485391570058627e-05, + "loss": 0.479, + "step": 4622 + }, + { + "epoch": 1.0312290876645103, + "grad_norm": 0.15522709488868713, + "learning_rate": 1.4851857970383057e-05, + "loss": 0.4737, + "step": 4623 + }, + { + "epoch": 1.0314521525763998, + "grad_norm": 0.15457487106323242, + "learning_rate": 1.484979997145343e-05, + "loss": 0.4663, + "step": 4624 + }, + { + "epoch": 1.031675217488289, + "grad_norm": 0.16563165187835693, + "learning_rate": 1.4847741703911376e-05, + "loss": 0.4754, + "step": 4625 + }, + { + "epoch": 1.0318982824001786, + "grad_norm": 0.16526126861572266, + "learning_rate": 1.4845683167870891e-05, + "loss": 0.4743, + "step": 4626 + }, + { + "epoch": 1.0321213473120678, + "grad_norm": 0.1640041321516037, + "learning_rate": 1.4843624363445992e-05, + "loss": 0.4995, + "step": 4627 + }, + { + "epoch": 1.0323444122239571, + "grad_norm": 0.17919553816318512, + "learning_rate": 1.484156529075071e-05, + "loss": 0.4622, + "step": 4628 + }, + { + "epoch": 1.0325674771358466, + "grad_norm": 0.1707811802625656, + "learning_rate": 1.4839505949899084e-05, + "loss": 0.5257, + "step": 4629 + }, + { + "epoch": 1.032790542047736, + "grad_norm": 0.16861692070960999, + "learning_rate": 1.4837446341005179e-05, + "loss": 0.4661, + "step": 4630 + }, + { + "epoch": 1.0330136069596252, + "grad_norm": 0.15601199865341187, + "learning_rate": 1.4835386464183066e-05, + "loss": 0.4704, + "step": 4631 + }, + { + "epoch": 1.0332366718715147, + "grad_norm": 0.16105563938617706, + "learning_rate": 1.4833326319546837e-05, + "loss": 0.4822, + "step": 4632 + }, + { + "epoch": 1.033459736783404, + "grad_norm": 0.1516101360321045, + "learning_rate": 1.483126590721059e-05, + "loss": 0.4812, + "step": 4633 + }, + { + "epoch": 1.0336828016952933, + "grad_norm": 0.16206029057502747, + "learning_rate": 1.4829205227288451e-05, + "loss": 0.4827, + "step": 4634 + }, + { + "epoch": 1.0339058666071828, + "grad_norm": 0.16983060538768768, + "learning_rate": 1.4827144279894547e-05, + "loss": 0.4507, + "step": 4635 + }, + { + "epoch": 1.034128931519072, + "grad_norm": 0.17058612406253815, + "learning_rate": 1.4825083065143029e-05, + "loss": 0.4606, + "step": 4636 + }, + { + "epoch": 1.0343519964309613, + "grad_norm": 0.15379010140895844, + "learning_rate": 1.482302158314806e-05, + "loss": 0.4365, + "step": 4637 + }, + { + "epoch": 1.0345750613428508, + "grad_norm": 0.1577761024236679, + "learning_rate": 1.4820959834023821e-05, + "loss": 0.4678, + "step": 4638 + }, + { + "epoch": 1.03479812625474, + "grad_norm": 0.16535353660583496, + "learning_rate": 1.4818897817884499e-05, + "loss": 0.4994, + "step": 4639 + }, + { + "epoch": 1.0350211911666294, + "grad_norm": 0.16018177568912506, + "learning_rate": 1.48168355348443e-05, + "loss": 0.486, + "step": 4640 + }, + { + "epoch": 1.0352442560785189, + "grad_norm": 0.16746951639652252, + "learning_rate": 1.4814772985017456e-05, + "loss": 0.4665, + "step": 4641 + }, + { + "epoch": 1.0354673209904082, + "grad_norm": 0.1544388234615326, + "learning_rate": 1.4812710168518195e-05, + "loss": 0.4707, + "step": 4642 + }, + { + "epoch": 1.0356903859022977, + "grad_norm": 0.15881603956222534, + "learning_rate": 1.4810647085460771e-05, + "loss": 0.4727, + "step": 4643 + }, + { + "epoch": 1.035913450814187, + "grad_norm": 0.15742497146129608, + "learning_rate": 1.4808583735959453e-05, + "loss": 0.4864, + "step": 4644 + }, + { + "epoch": 1.0361365157260762, + "grad_norm": 0.15278148651123047, + "learning_rate": 1.4806520120128518e-05, + "loss": 0.4586, + "step": 4645 + }, + { + "epoch": 1.0363595806379657, + "grad_norm": 0.16195416450500488, + "learning_rate": 1.4804456238082266e-05, + "loss": 0.4532, + "step": 4646 + }, + { + "epoch": 1.036582645549855, + "grad_norm": 0.17061328887939453, + "learning_rate": 1.480239208993501e-05, + "loss": 0.4568, + "step": 4647 + }, + { + "epoch": 1.0368057104617443, + "grad_norm": 0.16437166929244995, + "learning_rate": 1.4800327675801065e-05, + "loss": 0.4751, + "step": 4648 + }, + { + "epoch": 1.0370287753736338, + "grad_norm": 0.15695516765117645, + "learning_rate": 1.4798262995794784e-05, + "loss": 0.4701, + "step": 4649 + }, + { + "epoch": 1.037251840285523, + "grad_norm": 0.16812454164028168, + "learning_rate": 1.479619805003051e-05, + "loss": 0.4543, + "step": 4650 + }, + { + "epoch": 1.0374749051974124, + "grad_norm": 0.16520899534225464, + "learning_rate": 1.4794132838622624e-05, + "loss": 0.4576, + "step": 4651 + }, + { + "epoch": 1.0376979701093019, + "grad_norm": 0.155088871717453, + "learning_rate": 1.4792067361685503e-05, + "loss": 0.4775, + "step": 4652 + }, + { + "epoch": 1.0379210350211912, + "grad_norm": 0.15887859463691711, + "learning_rate": 1.4790001619333547e-05, + "loss": 0.4642, + "step": 4653 + }, + { + "epoch": 1.0381440999330804, + "grad_norm": 0.17838329076766968, + "learning_rate": 1.4787935611681174e-05, + "loss": 0.4821, + "step": 4654 + }, + { + "epoch": 1.03836716484497, + "grad_norm": 0.15603458881378174, + "learning_rate": 1.4785869338842807e-05, + "loss": 0.4581, + "step": 4655 + }, + { + "epoch": 1.0385902297568592, + "grad_norm": 0.16940052807331085, + "learning_rate": 1.4783802800932894e-05, + "loss": 0.5005, + "step": 4656 + }, + { + "epoch": 1.0388132946687485, + "grad_norm": 0.1598014533519745, + "learning_rate": 1.4781735998065886e-05, + "loss": 0.4866, + "step": 4657 + }, + { + "epoch": 1.039036359580638, + "grad_norm": 0.1550016701221466, + "learning_rate": 1.4779668930356265e-05, + "loss": 0.4531, + "step": 4658 + }, + { + "epoch": 1.0392594244925273, + "grad_norm": 0.17037729918956757, + "learning_rate": 1.4777601597918511e-05, + "loss": 0.4867, + "step": 4659 + }, + { + "epoch": 1.0394824894044168, + "grad_norm": 0.15960454940795898, + "learning_rate": 1.477553400086713e-05, + "loss": 0.4768, + "step": 4660 + }, + { + "epoch": 1.039705554316306, + "grad_norm": 0.17182455956935883, + "learning_rate": 1.4773466139316634e-05, + "loss": 0.4833, + "step": 4661 + }, + { + "epoch": 1.0399286192281954, + "grad_norm": 0.1634978950023651, + "learning_rate": 1.4771398013381559e-05, + "loss": 0.4591, + "step": 4662 + }, + { + "epoch": 1.0401516841400849, + "grad_norm": 0.1541091501712799, + "learning_rate": 1.476932962317645e-05, + "loss": 0.4611, + "step": 4663 + }, + { + "epoch": 1.0403747490519741, + "grad_norm": 0.16876503825187683, + "learning_rate": 1.4767260968815864e-05, + "loss": 0.497, + "step": 4664 + }, + { + "epoch": 1.0405978139638634, + "grad_norm": 0.16533160209655762, + "learning_rate": 1.4765192050414378e-05, + "loss": 0.4818, + "step": 4665 + }, + { + "epoch": 1.040820878875753, + "grad_norm": 0.1572985202074051, + "learning_rate": 1.4763122868086584e-05, + "loss": 0.4748, + "step": 4666 + }, + { + "epoch": 1.0410439437876422, + "grad_norm": 0.16402465105056763, + "learning_rate": 1.4761053421947082e-05, + "loss": 0.4819, + "step": 4667 + }, + { + "epoch": 1.0412670086995315, + "grad_norm": 0.15840837359428406, + "learning_rate": 1.4758983712110494e-05, + "loss": 0.4781, + "step": 4668 + }, + { + "epoch": 1.041490073611421, + "grad_norm": 0.1562443971633911, + "learning_rate": 1.4756913738691451e-05, + "loss": 0.4784, + "step": 4669 + }, + { + "epoch": 1.0417131385233103, + "grad_norm": 0.1598626971244812, + "learning_rate": 1.4754843501804607e-05, + "loss": 0.4846, + "step": 4670 + }, + { + "epoch": 1.0419362034351995, + "grad_norm": 0.1586054414510727, + "learning_rate": 1.4752773001564617e-05, + "loss": 0.4819, + "step": 4671 + }, + { + "epoch": 1.042159268347089, + "grad_norm": 0.16802629828453064, + "learning_rate": 1.4750702238086164e-05, + "loss": 0.4851, + "step": 4672 + }, + { + "epoch": 1.0423823332589783, + "grad_norm": 0.16435806453227997, + "learning_rate": 1.474863121148394e-05, + "loss": 0.4617, + "step": 4673 + }, + { + "epoch": 1.0426053981708678, + "grad_norm": 0.1635981649160385, + "learning_rate": 1.4746559921872645e-05, + "loss": 0.4708, + "step": 4674 + }, + { + "epoch": 1.0428284630827571, + "grad_norm": 0.1575925201177597, + "learning_rate": 1.4744488369367007e-05, + "loss": 0.4662, + "step": 4675 + }, + { + "epoch": 1.0430515279946464, + "grad_norm": 0.16302283108234406, + "learning_rate": 1.474241655408176e-05, + "loss": 0.4632, + "step": 4676 + }, + { + "epoch": 1.043274592906536, + "grad_norm": 0.1619601547718048, + "learning_rate": 1.4740344476131652e-05, + "loss": 0.4489, + "step": 4677 + }, + { + "epoch": 1.0434976578184252, + "grad_norm": 0.16791771352291107, + "learning_rate": 1.4738272135631448e-05, + "loss": 0.5023, + "step": 4678 + }, + { + "epoch": 1.0437207227303145, + "grad_norm": 0.16237159073352814, + "learning_rate": 1.4736199532695929e-05, + "loss": 0.4603, + "step": 4679 + }, + { + "epoch": 1.043943787642204, + "grad_norm": 0.1556091606616974, + "learning_rate": 1.473412666743989e-05, + "loss": 0.4466, + "step": 4680 + }, + { + "epoch": 1.0441668525540932, + "grad_norm": 0.15769942104816437, + "learning_rate": 1.4732053539978138e-05, + "loss": 0.4532, + "step": 4681 + }, + { + "epoch": 1.0443899174659825, + "grad_norm": 0.15660084784030914, + "learning_rate": 1.472998015042549e-05, + "loss": 0.4862, + "step": 4682 + }, + { + "epoch": 1.044612982377872, + "grad_norm": 0.15516437590122223, + "learning_rate": 1.4727906498896793e-05, + "loss": 0.4756, + "step": 4683 + }, + { + "epoch": 1.0448360472897613, + "grad_norm": 0.16473138332366943, + "learning_rate": 1.4725832585506891e-05, + "loss": 0.4937, + "step": 4684 + }, + { + "epoch": 1.0450591122016506, + "grad_norm": 0.15583591163158417, + "learning_rate": 1.4723758410370654e-05, + "loss": 0.4732, + "step": 4685 + }, + { + "epoch": 1.04528217711354, + "grad_norm": 0.15941470861434937, + "learning_rate": 1.4721683973602965e-05, + "loss": 0.4457, + "step": 4686 + }, + { + "epoch": 1.0455052420254294, + "grad_norm": 0.1659325659275055, + "learning_rate": 1.4719609275318715e-05, + "loss": 0.4989, + "step": 4687 + }, + { + "epoch": 1.0457283069373187, + "grad_norm": 0.1708594560623169, + "learning_rate": 1.4717534315632817e-05, + "loss": 0.4833, + "step": 4688 + }, + { + "epoch": 1.0459513718492082, + "grad_norm": 0.1607704609632492, + "learning_rate": 1.4715459094660194e-05, + "loss": 0.4419, + "step": 4689 + }, + { + "epoch": 1.0461744367610974, + "grad_norm": 0.16482210159301758, + "learning_rate": 1.4713383612515786e-05, + "loss": 0.4686, + "step": 4690 + }, + { + "epoch": 1.046397501672987, + "grad_norm": 0.16981108486652374, + "learning_rate": 1.4711307869314544e-05, + "loss": 0.4961, + "step": 4691 + }, + { + "epoch": 1.0466205665848762, + "grad_norm": 0.16742509603500366, + "learning_rate": 1.4709231865171436e-05, + "loss": 0.4972, + "step": 4692 + }, + { + "epoch": 1.0468436314967655, + "grad_norm": 0.16613337397575378, + "learning_rate": 1.4707155600201447e-05, + "loss": 0.4803, + "step": 4693 + }, + { + "epoch": 1.047066696408655, + "grad_norm": 0.18546496331691742, + "learning_rate": 1.470507907451957e-05, + "loss": 0.4678, + "step": 4694 + }, + { + "epoch": 1.0472897613205443, + "grad_norm": 0.1629878729581833, + "learning_rate": 1.4703002288240818e-05, + "loss": 0.4479, + "step": 4695 + }, + { + "epoch": 1.0475128262324336, + "grad_norm": 0.1560726761817932, + "learning_rate": 1.4700925241480217e-05, + "loss": 0.451, + "step": 4696 + }, + { + "epoch": 1.047735891144323, + "grad_norm": 0.458696573972702, + "learning_rate": 1.4698847934352804e-05, + "loss": 0.493, + "step": 4697 + }, + { + "epoch": 1.0479589560562124, + "grad_norm": 0.1607166975736618, + "learning_rate": 1.4696770366973639e-05, + "loss": 0.4688, + "step": 4698 + }, + { + "epoch": 1.0481820209681016, + "grad_norm": 0.15131288766860962, + "learning_rate": 1.4694692539457784e-05, + "loss": 0.4551, + "step": 4699 + }, + { + "epoch": 1.0484050858799911, + "grad_norm": 0.16862881183624268, + "learning_rate": 1.4692614451920328e-05, + "loss": 0.4579, + "step": 4700 + }, + { + "epoch": 1.0486281507918804, + "grad_norm": 0.16511371731758118, + "learning_rate": 1.4690536104476364e-05, + "loss": 0.4887, + "step": 4701 + }, + { + "epoch": 1.0488512157037697, + "grad_norm": 0.16970515251159668, + "learning_rate": 1.4688457497241006e-05, + "loss": 0.4494, + "step": 4702 + }, + { + "epoch": 1.0490742806156592, + "grad_norm": 0.16722868382930756, + "learning_rate": 1.4686378630329382e-05, + "loss": 0.4781, + "step": 4703 + }, + { + "epoch": 1.0492973455275485, + "grad_norm": 0.16638034582138062, + "learning_rate": 1.4684299503856627e-05, + "loss": 0.4705, + "step": 4704 + }, + { + "epoch": 1.0495204104394378, + "grad_norm": 0.18224425613880157, + "learning_rate": 1.4682220117937904e-05, + "loss": 0.4902, + "step": 4705 + }, + { + "epoch": 1.0497434753513273, + "grad_norm": 0.15631026029586792, + "learning_rate": 1.4680140472688376e-05, + "loss": 0.448, + "step": 4706 + }, + { + "epoch": 1.0499665402632166, + "grad_norm": 0.15947161614894867, + "learning_rate": 1.4678060568223232e-05, + "loss": 0.453, + "step": 4707 + }, + { + "epoch": 1.050189605175106, + "grad_norm": 0.15899132192134857, + "learning_rate": 1.4675980404657666e-05, + "loss": 0.4594, + "step": 4708 + }, + { + "epoch": 1.0504126700869953, + "grad_norm": 0.1654270440340042, + "learning_rate": 1.4673899982106892e-05, + "loss": 0.482, + "step": 4709 + }, + { + "epoch": 1.0506357349988846, + "grad_norm": 0.16609270870685577, + "learning_rate": 1.4671819300686136e-05, + "loss": 0.5132, + "step": 4710 + }, + { + "epoch": 1.0508587999107741, + "grad_norm": 0.15867236256599426, + "learning_rate": 1.4669738360510643e-05, + "loss": 0.4499, + "step": 4711 + }, + { + "epoch": 1.0510818648226634, + "grad_norm": 0.1547602266073227, + "learning_rate": 1.4667657161695663e-05, + "loss": 0.4438, + "step": 4712 + }, + { + "epoch": 1.0513049297345527, + "grad_norm": 0.1628340631723404, + "learning_rate": 1.4665575704356472e-05, + "loss": 0.4922, + "step": 4713 + }, + { + "epoch": 1.0515279946464422, + "grad_norm": 0.15342377126216888, + "learning_rate": 1.4663493988608348e-05, + "loss": 0.4448, + "step": 4714 + }, + { + "epoch": 1.0517510595583315, + "grad_norm": 0.15994024276733398, + "learning_rate": 1.4661412014566594e-05, + "loss": 0.4655, + "step": 4715 + }, + { + "epoch": 1.0519741244702208, + "grad_norm": 0.17778268456459045, + "learning_rate": 1.4659329782346518e-05, + "loss": 0.5056, + "step": 4716 + }, + { + "epoch": 1.0521971893821103, + "grad_norm": 0.16450022161006927, + "learning_rate": 1.4657247292063455e-05, + "loss": 0.4914, + "step": 4717 + }, + { + "epoch": 1.0524202542939995, + "grad_norm": 0.15438216924667358, + "learning_rate": 1.4655164543832738e-05, + "loss": 0.4513, + "step": 4718 + }, + { + "epoch": 1.0526433192058888, + "grad_norm": 0.165638267993927, + "learning_rate": 1.4653081537769729e-05, + "loss": 0.487, + "step": 4719 + }, + { + "epoch": 1.0528663841177783, + "grad_norm": 0.1668267548084259, + "learning_rate": 1.4650998273989794e-05, + "loss": 0.4936, + "step": 4720 + }, + { + "epoch": 1.0530894490296676, + "grad_norm": 0.19577254354953766, + "learning_rate": 1.464891475260832e-05, + "loss": 0.4849, + "step": 4721 + }, + { + "epoch": 1.053312513941557, + "grad_norm": 0.16912342607975006, + "learning_rate": 1.4646830973740703e-05, + "loss": 0.4712, + "step": 4722 + }, + { + "epoch": 1.0535355788534464, + "grad_norm": 0.17454881966114044, + "learning_rate": 1.4644746937502356e-05, + "loss": 0.4945, + "step": 4723 + }, + { + "epoch": 1.0537586437653357, + "grad_norm": 0.16234014928340912, + "learning_rate": 1.464266264400871e-05, + "loss": 0.4644, + "step": 4724 + }, + { + "epoch": 1.0539817086772252, + "grad_norm": 0.16444821655750275, + "learning_rate": 1.46405780933752e-05, + "loss": 0.4698, + "step": 4725 + }, + { + "epoch": 1.0542047735891145, + "grad_norm": 0.15134121477603912, + "learning_rate": 1.4638493285717286e-05, + "loss": 0.4545, + "step": 4726 + }, + { + "epoch": 1.0544278385010037, + "grad_norm": 0.16439802944660187, + "learning_rate": 1.4636408221150436e-05, + "loss": 0.4881, + "step": 4727 + }, + { + "epoch": 1.0546509034128932, + "grad_norm": 0.16219298541545868, + "learning_rate": 1.4634322899790137e-05, + "loss": 0.4501, + "step": 4728 + }, + { + "epoch": 1.0548739683247825, + "grad_norm": 0.1483144313097, + "learning_rate": 1.463223732175188e-05, + "loss": 0.4461, + "step": 4729 + }, + { + "epoch": 1.0550970332366718, + "grad_norm": 0.16766944527626038, + "learning_rate": 1.4630151487151188e-05, + "loss": 0.4742, + "step": 4730 + }, + { + "epoch": 1.0553200981485613, + "grad_norm": 0.17085810005664825, + "learning_rate": 1.4628065396103576e-05, + "loss": 0.4917, + "step": 4731 + }, + { + "epoch": 1.0555431630604506, + "grad_norm": 0.18476352095603943, + "learning_rate": 1.4625979048724594e-05, + "loss": 0.4518, + "step": 4732 + }, + { + "epoch": 1.0557662279723399, + "grad_norm": 0.1589956432580948, + "learning_rate": 1.4623892445129792e-05, + "loss": 0.4732, + "step": 4733 + }, + { + "epoch": 1.0559892928842294, + "grad_norm": 0.18277813494205475, + "learning_rate": 1.4621805585434744e-05, + "loss": 0.4752, + "step": 4734 + }, + { + "epoch": 1.0562123577961187, + "grad_norm": 0.15718622505664825, + "learning_rate": 1.4619718469755029e-05, + "loss": 0.4815, + "step": 4735 + }, + { + "epoch": 1.056435422708008, + "grad_norm": 0.1619989573955536, + "learning_rate": 1.4617631098206244e-05, + "loss": 0.4595, + "step": 4736 + }, + { + "epoch": 1.0566584876198974, + "grad_norm": 0.16704005002975464, + "learning_rate": 1.4615543470904005e-05, + "loss": 0.4769, + "step": 4737 + }, + { + "epoch": 1.0568815525317867, + "grad_norm": 0.16695329546928406, + "learning_rate": 1.4613455587963934e-05, + "loss": 0.4868, + "step": 4738 + }, + { + "epoch": 1.057104617443676, + "grad_norm": 0.165815070271492, + "learning_rate": 1.4611367449501674e-05, + "loss": 0.4601, + "step": 4739 + }, + { + "epoch": 1.0573276823555655, + "grad_norm": 0.1685144156217575, + "learning_rate": 1.4609279055632878e-05, + "loss": 0.4848, + "step": 4740 + }, + { + "epoch": 1.0575507472674548, + "grad_norm": 0.16440360248088837, + "learning_rate": 1.4607190406473214e-05, + "loss": 0.4729, + "step": 4741 + }, + { + "epoch": 1.0577738121793443, + "grad_norm": 0.16086186468601227, + "learning_rate": 1.4605101502138363e-05, + "loss": 0.4719, + "step": 4742 + }, + { + "epoch": 1.0579968770912336, + "grad_norm": 0.16252481937408447, + "learning_rate": 1.4603012342744027e-05, + "loss": 0.4739, + "step": 4743 + }, + { + "epoch": 1.0582199420031229, + "grad_norm": 0.15817517042160034, + "learning_rate": 1.4600922928405911e-05, + "loss": 0.4982, + "step": 4744 + }, + { + "epoch": 1.0584430069150124, + "grad_norm": 0.15263493359088898, + "learning_rate": 1.4598833259239746e-05, + "loss": 0.4645, + "step": 4745 + }, + { + "epoch": 1.0586660718269016, + "grad_norm": 0.2680174708366394, + "learning_rate": 1.4596743335361263e-05, + "loss": 0.4876, + "step": 4746 + }, + { + "epoch": 1.058889136738791, + "grad_norm": 0.16394802927970886, + "learning_rate": 1.4594653156886222e-05, + "loss": 0.4812, + "step": 4747 + }, + { + "epoch": 1.0591122016506804, + "grad_norm": 0.1608896255493164, + "learning_rate": 1.4592562723930385e-05, + "loss": 0.4755, + "step": 4748 + }, + { + "epoch": 1.0593352665625697, + "grad_norm": 0.17238877713680267, + "learning_rate": 1.459047203660954e-05, + "loss": 0.4926, + "step": 4749 + }, + { + "epoch": 1.059558331474459, + "grad_norm": 0.16521191596984863, + "learning_rate": 1.4588381095039474e-05, + "loss": 0.4937, + "step": 4750 + }, + { + "epoch": 1.0597813963863485, + "grad_norm": 0.1620710790157318, + "learning_rate": 1.4586289899336003e-05, + "loss": 0.4339, + "step": 4751 + }, + { + "epoch": 1.0600044612982378, + "grad_norm": 0.15963472425937653, + "learning_rate": 1.4584198449614947e-05, + "loss": 0.4546, + "step": 4752 + }, + { + "epoch": 1.060227526210127, + "grad_norm": 0.17011800408363342, + "learning_rate": 1.4582106745992149e-05, + "loss": 0.4761, + "step": 4753 + }, + { + "epoch": 1.0604505911220166, + "grad_norm": 0.15755783021450043, + "learning_rate": 1.4580014788583452e-05, + "loss": 0.4621, + "step": 4754 + }, + { + "epoch": 1.0606736560339058, + "grad_norm": 0.1612185388803482, + "learning_rate": 1.457792257750473e-05, + "loss": 0.4526, + "step": 4755 + }, + { + "epoch": 1.0608967209457951, + "grad_norm": 0.17329202592372894, + "learning_rate": 1.4575830112871855e-05, + "loss": 0.472, + "step": 4756 + }, + { + "epoch": 1.0611197858576846, + "grad_norm": 0.16672907769680023, + "learning_rate": 1.4573737394800729e-05, + "loss": 0.4898, + "step": 4757 + }, + { + "epoch": 1.061342850769574, + "grad_norm": 0.17075766623020172, + "learning_rate": 1.4571644423407257e-05, + "loss": 0.474, + "step": 4758 + }, + { + "epoch": 1.0615659156814634, + "grad_norm": 0.15704765915870667, + "learning_rate": 1.4569551198807357e-05, + "loss": 0.4951, + "step": 4759 + }, + { + "epoch": 1.0617889805933527, + "grad_norm": 0.1504676192998886, + "learning_rate": 1.4567457721116971e-05, + "loss": 0.4597, + "step": 4760 + }, + { + "epoch": 1.062012045505242, + "grad_norm": 0.15645162761211395, + "learning_rate": 1.4565363990452046e-05, + "loss": 0.4548, + "step": 4761 + }, + { + "epoch": 1.0622351104171315, + "grad_norm": 0.16380253434181213, + "learning_rate": 1.4563270006928544e-05, + "loss": 0.4675, + "step": 4762 + }, + { + "epoch": 1.0624581753290208, + "grad_norm": 0.16168953478336334, + "learning_rate": 1.4561175770662446e-05, + "loss": 0.5015, + "step": 4763 + }, + { + "epoch": 1.06268124024091, + "grad_norm": 0.16128864884376526, + "learning_rate": 1.4559081281769742e-05, + "loss": 0.4827, + "step": 4764 + }, + { + "epoch": 1.0629043051527995, + "grad_norm": 0.1598096489906311, + "learning_rate": 1.4556986540366437e-05, + "loss": 0.4598, + "step": 4765 + }, + { + "epoch": 1.0631273700646888, + "grad_norm": 0.1796981692314148, + "learning_rate": 1.4554891546568557e-05, + "loss": 0.4803, + "step": 4766 + }, + { + "epoch": 1.063350434976578, + "grad_norm": 0.15178261697292328, + "learning_rate": 1.4552796300492129e-05, + "loss": 0.4228, + "step": 4767 + }, + { + "epoch": 1.0635734998884676, + "grad_norm": 0.15565361082553864, + "learning_rate": 1.4550700802253203e-05, + "loss": 0.484, + "step": 4768 + }, + { + "epoch": 1.063796564800357, + "grad_norm": 0.16248026490211487, + "learning_rate": 1.4548605051967843e-05, + "loss": 0.448, + "step": 4769 + }, + { + "epoch": 1.0640196297122462, + "grad_norm": 0.17214354872703552, + "learning_rate": 1.4546509049752122e-05, + "loss": 0.466, + "step": 4770 + }, + { + "epoch": 1.0642426946241357, + "grad_norm": 0.1753188818693161, + "learning_rate": 1.4544412795722135e-05, + "loss": 0.4861, + "step": 4771 + }, + { + "epoch": 1.064465759536025, + "grad_norm": 0.16392984986305237, + "learning_rate": 1.4542316289993976e-05, + "loss": 0.4584, + "step": 4772 + }, + { + "epoch": 1.0646888244479142, + "grad_norm": 0.16576439142227173, + "learning_rate": 1.4540219532683774e-05, + "loss": 0.456, + "step": 4773 + }, + { + "epoch": 1.0649118893598037, + "grad_norm": 0.17842838168144226, + "learning_rate": 1.4538122523907651e-05, + "loss": 0.471, + "step": 4774 + }, + { + "epoch": 1.065134954271693, + "grad_norm": 0.15883475542068481, + "learning_rate": 1.4536025263781762e-05, + "loss": 0.4868, + "step": 4775 + }, + { + "epoch": 1.0653580191835825, + "grad_norm": 0.15571808815002441, + "learning_rate": 1.4533927752422256e-05, + "loss": 0.4742, + "step": 4776 + }, + { + "epoch": 1.0655810840954718, + "grad_norm": 0.16916531324386597, + "learning_rate": 1.4531829989945315e-05, + "loss": 0.4646, + "step": 4777 + }, + { + "epoch": 1.065804149007361, + "grad_norm": 0.15617609024047852, + "learning_rate": 1.4529731976467119e-05, + "loss": 0.4569, + "step": 4778 + }, + { + "epoch": 1.0660272139192506, + "grad_norm": 0.15956276655197144, + "learning_rate": 1.4527633712103875e-05, + "loss": 0.4739, + "step": 4779 + }, + { + "epoch": 1.0662502788311399, + "grad_norm": 0.1674896627664566, + "learning_rate": 1.4525535196971797e-05, + "loss": 0.4993, + "step": 4780 + }, + { + "epoch": 1.0664733437430292, + "grad_norm": 0.14718051254749298, + "learning_rate": 1.4523436431187112e-05, + "loss": 0.4433, + "step": 4781 + }, + { + "epoch": 1.0666964086549187, + "grad_norm": 0.16965600848197937, + "learning_rate": 1.4521337414866064e-05, + "loss": 0.496, + "step": 4782 + }, + { + "epoch": 1.066919473566808, + "grad_norm": 0.16213834285736084, + "learning_rate": 1.451923814812491e-05, + "loss": 0.4385, + "step": 4783 + }, + { + "epoch": 1.0671425384786972, + "grad_norm": 0.16705723106861115, + "learning_rate": 1.451713863107992e-05, + "loss": 0.4893, + "step": 4784 + }, + { + "epoch": 1.0673656033905867, + "grad_norm": 0.16360625624656677, + "learning_rate": 1.451503886384738e-05, + "loss": 0.5065, + "step": 4785 + }, + { + "epoch": 1.067588668302476, + "grad_norm": 0.1604512631893158, + "learning_rate": 1.4512938846543583e-05, + "loss": 0.4597, + "step": 4786 + }, + { + "epoch": 1.0678117332143653, + "grad_norm": 0.16134783625602722, + "learning_rate": 1.4510838579284849e-05, + "loss": 0.467, + "step": 4787 + }, + { + "epoch": 1.0680347981262548, + "grad_norm": 0.1614135503768921, + "learning_rate": 1.4508738062187497e-05, + "loss": 0.482, + "step": 4788 + }, + { + "epoch": 1.068257863038144, + "grad_norm": 0.16068898141384125, + "learning_rate": 1.4506637295367872e-05, + "loss": 0.4557, + "step": 4789 + }, + { + "epoch": 1.0684809279500334, + "grad_norm": 0.18150994181632996, + "learning_rate": 1.4504536278942327e-05, + "loss": 0.4796, + "step": 4790 + }, + { + "epoch": 1.0687039928619229, + "grad_norm": 0.16428856551647186, + "learning_rate": 1.4502435013027225e-05, + "loss": 0.478, + "step": 4791 + }, + { + "epoch": 1.0689270577738121, + "grad_norm": 0.16501575708389282, + "learning_rate": 1.4500333497738955e-05, + "loss": 0.4575, + "step": 4792 + }, + { + "epoch": 1.0691501226857016, + "grad_norm": 0.16125361621379852, + "learning_rate": 1.4498231733193904e-05, + "loss": 0.4713, + "step": 4793 + }, + { + "epoch": 1.069373187597591, + "grad_norm": 0.15834558010101318, + "learning_rate": 1.4496129719508486e-05, + "loss": 0.4721, + "step": 4794 + }, + { + "epoch": 1.0695962525094802, + "grad_norm": 0.17870576679706573, + "learning_rate": 1.449402745679912e-05, + "loss": 0.4885, + "step": 4795 + }, + { + "epoch": 1.0698193174213697, + "grad_norm": 0.15810585021972656, + "learning_rate": 1.4491924945182248e-05, + "loss": 0.4698, + "step": 4796 + }, + { + "epoch": 1.070042382333259, + "grad_norm": 0.17196185886859894, + "learning_rate": 1.4489822184774317e-05, + "loss": 0.4946, + "step": 4797 + }, + { + "epoch": 1.0702654472451483, + "grad_norm": 0.1619616448879242, + "learning_rate": 1.448771917569179e-05, + "loss": 0.4697, + "step": 4798 + }, + { + "epoch": 1.0704885121570378, + "grad_norm": 0.1760883778333664, + "learning_rate": 1.4485615918051146e-05, + "loss": 0.4533, + "step": 4799 + }, + { + "epoch": 1.070711577068927, + "grad_norm": 0.15732388198375702, + "learning_rate": 1.448351241196888e-05, + "loss": 0.4639, + "step": 4800 + }, + { + "epoch": 1.0709346419808163, + "grad_norm": 0.1595136672258377, + "learning_rate": 1.448140865756149e-05, + "loss": 0.48, + "step": 4801 + }, + { + "epoch": 1.0711577068927058, + "grad_norm": 0.19920584559440613, + "learning_rate": 1.44793046549455e-05, + "loss": 0.4647, + "step": 4802 + }, + { + "epoch": 1.0713807718045951, + "grad_norm": 0.16085761785507202, + "learning_rate": 1.4477200404237446e-05, + "loss": 0.4788, + "step": 4803 + }, + { + "epoch": 1.0716038367164844, + "grad_norm": 0.15879055857658386, + "learning_rate": 1.4475095905553867e-05, + "loss": 0.4805, + "step": 4804 + }, + { + "epoch": 1.071826901628374, + "grad_norm": 0.16476532816886902, + "learning_rate": 1.4472991159011329e-05, + "loss": 0.4996, + "step": 4805 + }, + { + "epoch": 1.0720499665402632, + "grad_norm": 0.1692475974559784, + "learning_rate": 1.4470886164726403e-05, + "loss": 0.464, + "step": 4806 + }, + { + "epoch": 1.0722730314521525, + "grad_norm": 0.16641223430633545, + "learning_rate": 1.4468780922815679e-05, + "loss": 0.4698, + "step": 4807 + }, + { + "epoch": 1.072496096364042, + "grad_norm": 0.15899412333965302, + "learning_rate": 1.4466675433395758e-05, + "loss": 0.4688, + "step": 4808 + }, + { + "epoch": 1.0727191612759313, + "grad_norm": 0.1671571284532547, + "learning_rate": 1.4464569696583256e-05, + "loss": 0.4948, + "step": 4809 + }, + { + "epoch": 1.0729422261878208, + "grad_norm": 0.16346615552902222, + "learning_rate": 1.4462463712494799e-05, + "loss": 0.4682, + "step": 4810 + }, + { + "epoch": 1.07316529109971, + "grad_norm": 0.16411162912845612, + "learning_rate": 1.4460357481247035e-05, + "loss": 0.4704, + "step": 4811 + }, + { + "epoch": 1.0733883560115993, + "grad_norm": 0.16444380581378937, + "learning_rate": 1.4458251002956612e-05, + "loss": 0.478, + "step": 4812 + }, + { + "epoch": 1.0736114209234888, + "grad_norm": 0.15539826452732086, + "learning_rate": 1.4456144277740207e-05, + "loss": 0.4827, + "step": 4813 + }, + { + "epoch": 1.073834485835378, + "grad_norm": 0.16206075251102448, + "learning_rate": 1.4454037305714501e-05, + "loss": 0.4651, + "step": 4814 + }, + { + "epoch": 1.0740575507472674, + "grad_norm": 0.1546122431755066, + "learning_rate": 1.4451930086996193e-05, + "loss": 0.4668, + "step": 4815 + }, + { + "epoch": 1.074280615659157, + "grad_norm": 0.15941190719604492, + "learning_rate": 1.4449822621701992e-05, + "loss": 0.4674, + "step": 4816 + }, + { + "epoch": 1.0745036805710462, + "grad_norm": 0.16339150071144104, + "learning_rate": 1.4447714909948624e-05, + "loss": 0.4957, + "step": 4817 + }, + { + "epoch": 1.0747267454829355, + "grad_norm": 0.15850695967674255, + "learning_rate": 1.4445606951852828e-05, + "loss": 0.487, + "step": 4818 + }, + { + "epoch": 1.074949810394825, + "grad_norm": 0.16682858765125275, + "learning_rate": 1.4443498747531358e-05, + "loss": 0.4677, + "step": 4819 + }, + { + "epoch": 1.0751728753067142, + "grad_norm": 0.1573875993490219, + "learning_rate": 1.444139029710097e-05, + "loss": 0.4452, + "step": 4820 + }, + { + "epoch": 1.0753959402186035, + "grad_norm": 0.1620924174785614, + "learning_rate": 1.4439281600678455e-05, + "loss": 0.4767, + "step": 4821 + }, + { + "epoch": 1.075619005130493, + "grad_norm": 0.15519385039806366, + "learning_rate": 1.4437172658380598e-05, + "loss": 0.4571, + "step": 4822 + }, + { + "epoch": 1.0758420700423823, + "grad_norm": 0.17758627235889435, + "learning_rate": 1.443506347032421e-05, + "loss": 0.4767, + "step": 4823 + }, + { + "epoch": 1.0760651349542716, + "grad_norm": 0.15553906559944153, + "learning_rate": 1.4432954036626108e-05, + "loss": 0.4659, + "step": 4824 + }, + { + "epoch": 1.076288199866161, + "grad_norm": 0.15930281579494476, + "learning_rate": 1.4430844357403126e-05, + "loss": 0.4579, + "step": 4825 + }, + { + "epoch": 1.0765112647780504, + "grad_norm": 0.15948422253131866, + "learning_rate": 1.4428734432772115e-05, + "loss": 0.4682, + "step": 4826 + }, + { + "epoch": 1.0767343296899399, + "grad_norm": 0.15884706377983093, + "learning_rate": 1.4426624262849932e-05, + "loss": 0.469, + "step": 4827 + }, + { + "epoch": 1.0769573946018292, + "grad_norm": 0.15958735346794128, + "learning_rate": 1.4424513847753455e-05, + "loss": 0.4576, + "step": 4828 + }, + { + "epoch": 1.0771804595137184, + "grad_norm": 0.16710686683654785, + "learning_rate": 1.4422403187599565e-05, + "loss": 0.471, + "step": 4829 + }, + { + "epoch": 1.077403524425608, + "grad_norm": 0.2897154688835144, + "learning_rate": 1.4420292282505173e-05, + "loss": 0.4901, + "step": 4830 + }, + { + "epoch": 1.0776265893374972, + "grad_norm": 0.1648949235677719, + "learning_rate": 1.4418181132587185e-05, + "loss": 0.4867, + "step": 4831 + }, + { + "epoch": 1.0778496542493865, + "grad_norm": 0.1587132066488266, + "learning_rate": 1.4416069737962538e-05, + "loss": 0.4554, + "step": 4832 + }, + { + "epoch": 1.078072719161276, + "grad_norm": 0.3479141891002655, + "learning_rate": 1.4413958098748171e-05, + "loss": 0.4814, + "step": 4833 + }, + { + "epoch": 1.0782957840731653, + "grad_norm": 0.15964049100875854, + "learning_rate": 1.441184621506104e-05, + "loss": 0.4694, + "step": 4834 + }, + { + "epoch": 1.0785188489850546, + "grad_norm": 0.16396582126617432, + "learning_rate": 1.440973408701811e-05, + "loss": 0.4892, + "step": 4835 + }, + { + "epoch": 1.078741913896944, + "grad_norm": 0.1593598872423172, + "learning_rate": 1.4407621714736372e-05, + "loss": 0.4485, + "step": 4836 + }, + { + "epoch": 1.0789649788088334, + "grad_norm": 0.17762166261672974, + "learning_rate": 1.4405509098332818e-05, + "loss": 0.4358, + "step": 4837 + }, + { + "epoch": 1.0791880437207226, + "grad_norm": 0.15934491157531738, + "learning_rate": 1.4403396237924456e-05, + "loss": 0.4734, + "step": 4838 + }, + { + "epoch": 1.0794111086326121, + "grad_norm": 0.18883134424686432, + "learning_rate": 1.4401283133628315e-05, + "loss": 0.4488, + "step": 4839 + }, + { + "epoch": 1.0796341735445014, + "grad_norm": 0.1616184413433075, + "learning_rate": 1.4399169785561426e-05, + "loss": 0.465, + "step": 4840 + }, + { + "epoch": 1.0798572384563907, + "grad_norm": 0.16874286532402039, + "learning_rate": 1.4397056193840842e-05, + "loss": 0.5069, + "step": 4841 + }, + { + "epoch": 1.0800803033682802, + "grad_norm": 0.18162856996059418, + "learning_rate": 1.4394942358583627e-05, + "loss": 0.4929, + "step": 4842 + }, + { + "epoch": 1.0803033682801695, + "grad_norm": 0.15963618457317352, + "learning_rate": 1.4392828279906866e-05, + "loss": 0.46, + "step": 4843 + }, + { + "epoch": 1.080526433192059, + "grad_norm": 0.17380237579345703, + "learning_rate": 1.4390713957927636e-05, + "loss": 0.4929, + "step": 4844 + }, + { + "epoch": 1.0807494981039483, + "grad_norm": 0.174495667219162, + "learning_rate": 1.4388599392763052e-05, + "loss": 0.4626, + "step": 4845 + }, + { + "epoch": 1.0809725630158376, + "grad_norm": 0.17126008868217468, + "learning_rate": 1.4386484584530225e-05, + "loss": 0.47, + "step": 4846 + }, + { + "epoch": 1.081195627927727, + "grad_norm": 0.15579789876937866, + "learning_rate": 1.4384369533346292e-05, + "loss": 0.4565, + "step": 4847 + }, + { + "epoch": 1.0814186928396163, + "grad_norm": 0.16992245614528656, + "learning_rate": 1.4382254239328392e-05, + "loss": 0.4646, + "step": 4848 + }, + { + "epoch": 1.0816417577515056, + "grad_norm": 0.1682998389005661, + "learning_rate": 1.4380138702593691e-05, + "loss": 0.4807, + "step": 4849 + }, + { + "epoch": 1.0818648226633951, + "grad_norm": 0.16254080832004547, + "learning_rate": 1.4378022923259354e-05, + "loss": 0.4604, + "step": 4850 + }, + { + "epoch": 1.0820878875752844, + "grad_norm": 0.16904620826244354, + "learning_rate": 1.4375906901442568e-05, + "loss": 0.5068, + "step": 4851 + }, + { + "epoch": 1.0823109524871737, + "grad_norm": 0.15982915461063385, + "learning_rate": 1.4373790637260534e-05, + "loss": 0.4757, + "step": 4852 + }, + { + "epoch": 1.0825340173990632, + "grad_norm": 0.1598198115825653, + "learning_rate": 1.4371674130830462e-05, + "loss": 0.4596, + "step": 4853 + }, + { + "epoch": 1.0827570823109525, + "grad_norm": 0.2856222689151764, + "learning_rate": 1.4369557382269577e-05, + "loss": 0.4618, + "step": 4854 + }, + { + "epoch": 1.0829801472228417, + "grad_norm": 0.1609204113483429, + "learning_rate": 1.4367440391695118e-05, + "loss": 0.4814, + "step": 4855 + }, + { + "epoch": 1.0832032121347313, + "grad_norm": 0.1572166234254837, + "learning_rate": 1.4365323159224341e-05, + "loss": 0.4768, + "step": 4856 + }, + { + "epoch": 1.0834262770466205, + "grad_norm": 0.15718881785869598, + "learning_rate": 1.4363205684974504e-05, + "loss": 0.4594, + "step": 4857 + }, + { + "epoch": 1.0836493419585098, + "grad_norm": 0.1571321189403534, + "learning_rate": 1.4361087969062895e-05, + "loss": 0.4705, + "step": 4858 + }, + { + "epoch": 1.0838724068703993, + "grad_norm": 0.16006889939308167, + "learning_rate": 1.4358970011606798e-05, + "loss": 0.4549, + "step": 4859 + }, + { + "epoch": 1.0840954717822886, + "grad_norm": 0.1665477454662323, + "learning_rate": 1.4356851812723524e-05, + "loss": 0.4657, + "step": 4860 + }, + { + "epoch": 1.084318536694178, + "grad_norm": 0.16529056429862976, + "learning_rate": 1.4354733372530391e-05, + "loss": 0.476, + "step": 4861 + }, + { + "epoch": 1.0845416016060674, + "grad_norm": 0.1675388067960739, + "learning_rate": 1.4352614691144733e-05, + "loss": 0.4721, + "step": 4862 + }, + { + "epoch": 1.0847646665179567, + "grad_norm": 0.16424338519573212, + "learning_rate": 1.4350495768683893e-05, + "loss": 0.4856, + "step": 4863 + }, + { + "epoch": 1.0849877314298462, + "grad_norm": 0.16360746324062347, + "learning_rate": 1.4348376605265233e-05, + "loss": 0.4916, + "step": 4864 + }, + { + "epoch": 1.0852107963417355, + "grad_norm": 0.16066889464855194, + "learning_rate": 1.4346257201006122e-05, + "loss": 0.4606, + "step": 4865 + }, + { + "epoch": 1.0854338612536247, + "grad_norm": 0.15210098028182983, + "learning_rate": 1.434413755602395e-05, + "loss": 0.4684, + "step": 4866 + }, + { + "epoch": 1.0856569261655142, + "grad_norm": 0.16423378884792328, + "learning_rate": 1.4342017670436113e-05, + "loss": 0.4654, + "step": 4867 + }, + { + "epoch": 1.0858799910774035, + "grad_norm": 0.191256582736969, + "learning_rate": 1.4339897544360026e-05, + "loss": 0.4729, + "step": 4868 + }, + { + "epoch": 1.0861030559892928, + "grad_norm": 0.15907546877861023, + "learning_rate": 1.4337777177913113e-05, + "loss": 0.4545, + "step": 4869 + }, + { + "epoch": 1.0863261209011823, + "grad_norm": 0.15440797805786133, + "learning_rate": 1.4335656571212814e-05, + "loss": 0.4816, + "step": 4870 + }, + { + "epoch": 1.0865491858130716, + "grad_norm": 0.16571897268295288, + "learning_rate": 1.4333535724376585e-05, + "loss": 0.4498, + "step": 4871 + }, + { + "epoch": 1.0867722507249609, + "grad_norm": 0.16138556599617004, + "learning_rate": 1.4331414637521885e-05, + "loss": 0.4541, + "step": 4872 + }, + { + "epoch": 1.0869953156368504, + "grad_norm": 0.17464113235473633, + "learning_rate": 1.4329293310766196e-05, + "loss": 0.4997, + "step": 4873 + }, + { + "epoch": 1.0872183805487396, + "grad_norm": 0.15958289802074432, + "learning_rate": 1.4327171744227015e-05, + "loss": 0.4351, + "step": 4874 + }, + { + "epoch": 1.087441445460629, + "grad_norm": 0.17836567759513855, + "learning_rate": 1.432504993802184e-05, + "loss": 0.4626, + "step": 4875 + }, + { + "epoch": 1.0876645103725184, + "grad_norm": 0.16179955005645752, + "learning_rate": 1.4322927892268195e-05, + "loss": 0.4931, + "step": 4876 + }, + { + "epoch": 1.0878875752844077, + "grad_norm": 0.16194845736026764, + "learning_rate": 1.4320805607083611e-05, + "loss": 0.4619, + "step": 4877 + }, + { + "epoch": 1.0881106401962972, + "grad_norm": 0.16176071763038635, + "learning_rate": 1.4318683082585634e-05, + "loss": 0.4798, + "step": 4878 + }, + { + "epoch": 1.0883337051081865, + "grad_norm": 0.15933561325073242, + "learning_rate": 1.4316560318891823e-05, + "loss": 0.4635, + "step": 4879 + }, + { + "epoch": 1.0885567700200758, + "grad_norm": 0.16447357833385468, + "learning_rate": 1.431443731611975e-05, + "loss": 0.4747, + "step": 4880 + }, + { + "epoch": 1.0887798349319653, + "grad_norm": 0.1632988601922989, + "learning_rate": 1.4312314074386998e-05, + "loss": 0.489, + "step": 4881 + }, + { + "epoch": 1.0890028998438546, + "grad_norm": 0.16720198094844818, + "learning_rate": 1.4310190593811167e-05, + "loss": 0.4814, + "step": 4882 + }, + { + "epoch": 1.0892259647557438, + "grad_norm": 0.16806739568710327, + "learning_rate": 1.4308066874509869e-05, + "loss": 0.4566, + "step": 4883 + }, + { + "epoch": 1.0894490296676334, + "grad_norm": 0.1616521179676056, + "learning_rate": 1.430594291660073e-05, + "loss": 0.4875, + "step": 4884 + }, + { + "epoch": 1.0896720945795226, + "grad_norm": 0.15476809442043304, + "learning_rate": 1.4303818720201386e-05, + "loss": 0.4632, + "step": 4885 + }, + { + "epoch": 1.089895159491412, + "grad_norm": 0.15796570479869843, + "learning_rate": 1.4301694285429489e-05, + "loss": 0.4558, + "step": 4886 + }, + { + "epoch": 1.0901182244033014, + "grad_norm": 0.1628834307193756, + "learning_rate": 1.4299569612402701e-05, + "loss": 0.4578, + "step": 4887 + }, + { + "epoch": 1.0903412893151907, + "grad_norm": 0.16493961215019226, + "learning_rate": 1.4297444701238706e-05, + "loss": 0.5188, + "step": 4888 + }, + { + "epoch": 1.09056435422708, + "grad_norm": 0.17024171352386475, + "learning_rate": 1.4295319552055191e-05, + "loss": 0.4797, + "step": 4889 + }, + { + "epoch": 1.0907874191389695, + "grad_norm": 0.16179481148719788, + "learning_rate": 1.4293194164969859e-05, + "loss": 0.4894, + "step": 4890 + }, + { + "epoch": 1.0910104840508588, + "grad_norm": 0.15970705449581146, + "learning_rate": 1.429106854010043e-05, + "loss": 0.4698, + "step": 4891 + }, + { + "epoch": 1.091233548962748, + "grad_norm": 0.15802475810050964, + "learning_rate": 1.4288942677564634e-05, + "loss": 0.4585, + "step": 4892 + }, + { + "epoch": 1.0914566138746375, + "grad_norm": 0.16076651215553284, + "learning_rate": 1.4286816577480211e-05, + "loss": 0.4643, + "step": 4893 + }, + { + "epoch": 1.0916796787865268, + "grad_norm": 0.1617365926504135, + "learning_rate": 1.4284690239964925e-05, + "loss": 0.4608, + "step": 4894 + }, + { + "epoch": 1.0919027436984163, + "grad_norm": 0.1737692803144455, + "learning_rate": 1.428256366513654e-05, + "loss": 0.5016, + "step": 4895 + }, + { + "epoch": 1.0921258086103056, + "grad_norm": 0.16984876990318298, + "learning_rate": 1.428043685311284e-05, + "loss": 0.476, + "step": 4896 + }, + { + "epoch": 1.092348873522195, + "grad_norm": 0.16375143826007843, + "learning_rate": 1.427830980401162e-05, + "loss": 0.485, + "step": 4897 + }, + { + "epoch": 1.0925719384340844, + "grad_norm": 0.16099539399147034, + "learning_rate": 1.4276182517950696e-05, + "loss": 0.4743, + "step": 4898 + }, + { + "epoch": 1.0927950033459737, + "grad_norm": 0.16580551862716675, + "learning_rate": 1.4274054995047884e-05, + "loss": 0.473, + "step": 4899 + }, + { + "epoch": 1.093018068257863, + "grad_norm": 0.15540550649166107, + "learning_rate": 1.427192723542102e-05, + "loss": 0.4353, + "step": 4900 + }, + { + "epoch": 1.0932411331697525, + "grad_norm": 0.1589297354221344, + "learning_rate": 1.4269799239187956e-05, + "loss": 0.4637, + "step": 4901 + }, + { + "epoch": 1.0934641980816417, + "grad_norm": 0.16535155475139618, + "learning_rate": 1.4267671006466552e-05, + "loss": 0.4492, + "step": 4902 + }, + { + "epoch": 1.093687262993531, + "grad_norm": 0.15984977781772614, + "learning_rate": 1.4265542537374684e-05, + "loss": 0.4703, + "step": 4903 + }, + { + "epoch": 1.0939103279054205, + "grad_norm": 0.15200082957744598, + "learning_rate": 1.4263413832030237e-05, + "loss": 0.4483, + "step": 4904 + }, + { + "epoch": 1.0941333928173098, + "grad_norm": 0.21631982922554016, + "learning_rate": 1.4261284890551115e-05, + "loss": 0.4881, + "step": 4905 + }, + { + "epoch": 1.094356457729199, + "grad_norm": 0.16515322029590607, + "learning_rate": 1.4259155713055231e-05, + "loss": 0.4788, + "step": 4906 + }, + { + "epoch": 1.0945795226410886, + "grad_norm": 0.16507336497306824, + "learning_rate": 1.4257026299660511e-05, + "loss": 0.4749, + "step": 4907 + }, + { + "epoch": 1.0948025875529779, + "grad_norm": 0.17329774796962738, + "learning_rate": 1.4254896650484897e-05, + "loss": 0.4708, + "step": 4908 + }, + { + "epoch": 1.0950256524648672, + "grad_norm": 0.15935118496418, + "learning_rate": 1.4252766765646344e-05, + "loss": 0.4405, + "step": 4909 + }, + { + "epoch": 1.0952487173767567, + "grad_norm": 0.16520404815673828, + "learning_rate": 1.4250636645262813e-05, + "loss": 0.4546, + "step": 4910 + }, + { + "epoch": 1.095471782288646, + "grad_norm": 0.16655127704143524, + "learning_rate": 1.424850628945229e-05, + "loss": 0.4838, + "step": 4911 + }, + { + "epoch": 1.0956948472005354, + "grad_norm": 0.17478324472904205, + "learning_rate": 1.4246375698332764e-05, + "loss": 0.4932, + "step": 4912 + }, + { + "epoch": 1.0959179121124247, + "grad_norm": 0.15752072632312775, + "learning_rate": 1.4244244872022244e-05, + "loss": 0.4386, + "step": 4913 + }, + { + "epoch": 1.096140977024314, + "grad_norm": 0.16019755601882935, + "learning_rate": 1.424211381063874e-05, + "loss": 0.4474, + "step": 4914 + }, + { + "epoch": 1.0963640419362035, + "grad_norm": 0.1732664704322815, + "learning_rate": 1.4239982514300294e-05, + "loss": 0.4423, + "step": 4915 + }, + { + "epoch": 1.0965871068480928, + "grad_norm": 0.16661664843559265, + "learning_rate": 1.4237850983124943e-05, + "loss": 0.5198, + "step": 4916 + }, + { + "epoch": 1.096810171759982, + "grad_norm": 0.15834182500839233, + "learning_rate": 1.4235719217230751e-05, + "loss": 0.4784, + "step": 4917 + }, + { + "epoch": 1.0970332366718716, + "grad_norm": 0.16510586440563202, + "learning_rate": 1.423358721673578e-05, + "loss": 0.4843, + "step": 4918 + }, + { + "epoch": 1.0972563015837609, + "grad_norm": 0.16284050047397614, + "learning_rate": 1.4231454981758122e-05, + "loss": 0.4559, + "step": 4919 + }, + { + "epoch": 1.0974793664956501, + "grad_norm": 0.1576804369688034, + "learning_rate": 1.422932251241587e-05, + "loss": 0.4855, + "step": 4920 + }, + { + "epoch": 1.0977024314075396, + "grad_norm": 0.17963336408138275, + "learning_rate": 1.4227189808827131e-05, + "loss": 0.456, + "step": 4921 + }, + { + "epoch": 1.097925496319429, + "grad_norm": 0.16884389519691467, + "learning_rate": 1.4225056871110032e-05, + "loss": 0.4513, + "step": 4922 + }, + { + "epoch": 1.0981485612313182, + "grad_norm": 0.16066482663154602, + "learning_rate": 1.422292369938271e-05, + "loss": 0.4753, + "step": 4923 + }, + { + "epoch": 1.0983716261432077, + "grad_norm": 0.16328173875808716, + "learning_rate": 1.4220790293763307e-05, + "loss": 0.4872, + "step": 4924 + }, + { + "epoch": 1.098594691055097, + "grad_norm": 0.16611479222774506, + "learning_rate": 1.4218656654369987e-05, + "loss": 0.4599, + "step": 4925 + }, + { + "epoch": 1.0988177559669863, + "grad_norm": 0.15916502475738525, + "learning_rate": 1.4216522781320928e-05, + "loss": 0.4709, + "step": 4926 + }, + { + "epoch": 1.0990408208788758, + "grad_norm": 0.18872345983982086, + "learning_rate": 1.4214388674734309e-05, + "loss": 0.469, + "step": 4927 + }, + { + "epoch": 1.099263885790765, + "grad_norm": 0.1624644547700882, + "learning_rate": 1.421225433472834e-05, + "loss": 0.4793, + "step": 4928 + }, + { + "epoch": 1.0994869507026546, + "grad_norm": 0.16488322615623474, + "learning_rate": 1.4210119761421228e-05, + "loss": 0.4534, + "step": 4929 + }, + { + "epoch": 1.0997100156145438, + "grad_norm": 0.15583010017871857, + "learning_rate": 1.4207984954931204e-05, + "loss": 0.4851, + "step": 4930 + }, + { + "epoch": 1.0999330805264331, + "grad_norm": 0.1624946892261505, + "learning_rate": 1.4205849915376501e-05, + "loss": 0.4915, + "step": 4931 + }, + { + "epoch": 1.1001561454383226, + "grad_norm": 0.15822146832942963, + "learning_rate": 1.4203714642875377e-05, + "loss": 0.4684, + "step": 4932 + }, + { + "epoch": 1.100379210350212, + "grad_norm": 0.16068744659423828, + "learning_rate": 1.420157913754609e-05, + "loss": 0.4555, + "step": 4933 + }, + { + "epoch": 1.1006022752621012, + "grad_norm": 0.18929767608642578, + "learning_rate": 1.4199443399506922e-05, + "loss": 0.4551, + "step": 4934 + }, + { + "epoch": 1.1008253401739907, + "grad_norm": 0.1583186835050583, + "learning_rate": 1.4197307428876164e-05, + "loss": 0.4449, + "step": 4935 + }, + { + "epoch": 1.10104840508588, + "grad_norm": 0.16161227226257324, + "learning_rate": 1.4195171225772117e-05, + "loss": 0.4557, + "step": 4936 + }, + { + "epoch": 1.1012714699977693, + "grad_norm": 0.16375921666622162, + "learning_rate": 1.4193034790313101e-05, + "loss": 0.4627, + "step": 4937 + }, + { + "epoch": 1.1014945349096588, + "grad_norm": 0.17420132458209991, + "learning_rate": 1.4190898122617443e-05, + "loss": 0.4736, + "step": 4938 + }, + { + "epoch": 1.101717599821548, + "grad_norm": 0.16097956895828247, + "learning_rate": 1.4188761222803482e-05, + "loss": 0.4511, + "step": 4939 + }, + { + "epoch": 1.1019406647334375, + "grad_norm": 0.16940614581108093, + "learning_rate": 1.4186624090989578e-05, + "loss": 0.4763, + "step": 4940 + }, + { + "epoch": 1.1021637296453268, + "grad_norm": 0.1643071174621582, + "learning_rate": 1.4184486727294098e-05, + "loss": 0.4904, + "step": 4941 + }, + { + "epoch": 1.102386794557216, + "grad_norm": 0.1605340987443924, + "learning_rate": 1.418234913183542e-05, + "loss": 0.4621, + "step": 4942 + }, + { + "epoch": 1.1026098594691056, + "grad_norm": 0.15705011785030365, + "learning_rate": 1.4180211304731941e-05, + "loss": 0.4779, + "step": 4943 + }, + { + "epoch": 1.102832924380995, + "grad_norm": 0.164706289768219, + "learning_rate": 1.4178073246102062e-05, + "loss": 0.4685, + "step": 4944 + }, + { + "epoch": 1.1030559892928842, + "grad_norm": 0.1530647873878479, + "learning_rate": 1.417593495606421e-05, + "loss": 0.4643, + "step": 4945 + }, + { + "epoch": 1.1032790542047737, + "grad_norm": 0.1623847335577011, + "learning_rate": 1.4173796434736808e-05, + "loss": 0.4606, + "step": 4946 + }, + { + "epoch": 1.103502119116663, + "grad_norm": 0.17122423648834229, + "learning_rate": 1.4171657682238309e-05, + "loss": 0.4649, + "step": 4947 + }, + { + "epoch": 1.1037251840285522, + "grad_norm": 0.15987126529216766, + "learning_rate": 1.4169518698687164e-05, + "loss": 0.469, + "step": 4948 + }, + { + "epoch": 1.1039482489404417, + "grad_norm": 0.1600853055715561, + "learning_rate": 1.416737948420185e-05, + "loss": 0.4782, + "step": 4949 + }, + { + "epoch": 1.104171313852331, + "grad_norm": 0.15356458723545074, + "learning_rate": 1.4165240038900843e-05, + "loss": 0.4474, + "step": 4950 + }, + { + "epoch": 1.1043943787642203, + "grad_norm": 0.15699787437915802, + "learning_rate": 1.4163100362902642e-05, + "loss": 0.4525, + "step": 4951 + }, + { + "epoch": 1.1046174436761098, + "grad_norm": 0.161276713013649, + "learning_rate": 1.4160960456325757e-05, + "loss": 0.4644, + "step": 4952 + }, + { + "epoch": 1.104840508587999, + "grad_norm": 0.17264524102210999, + "learning_rate": 1.4158820319288709e-05, + "loss": 0.4881, + "step": 4953 + }, + { + "epoch": 1.1050635734998884, + "grad_norm": 0.15942956507205963, + "learning_rate": 1.4156679951910031e-05, + "loss": 0.4431, + "step": 4954 + }, + { + "epoch": 1.1052866384117779, + "grad_norm": 0.16725444793701172, + "learning_rate": 1.415453935430827e-05, + "loss": 0.5334, + "step": 4955 + }, + { + "epoch": 1.1055097033236672, + "grad_norm": 0.15742219984531403, + "learning_rate": 1.4152398526601987e-05, + "loss": 0.4648, + "step": 4956 + }, + { + "epoch": 1.1057327682355567, + "grad_norm": 0.16164308786392212, + "learning_rate": 1.4150257468909753e-05, + "loss": 0.4746, + "step": 4957 + }, + { + "epoch": 1.105955833147446, + "grad_norm": 0.16626529395580292, + "learning_rate": 1.4148116181350155e-05, + "loss": 0.4763, + "step": 4958 + }, + { + "epoch": 1.1061788980593352, + "grad_norm": 0.16360588371753693, + "learning_rate": 1.4145974664041793e-05, + "loss": 0.4983, + "step": 4959 + }, + { + "epoch": 1.1064019629712247, + "grad_norm": 0.16253133118152618, + "learning_rate": 1.4143832917103271e-05, + "loss": 0.4742, + "step": 4960 + }, + { + "epoch": 1.106625027883114, + "grad_norm": 0.16334842145442963, + "learning_rate": 1.4141690940653217e-05, + "loss": 0.4921, + "step": 4961 + }, + { + "epoch": 1.1068480927950033, + "grad_norm": 0.1619054228067398, + "learning_rate": 1.4139548734810267e-05, + "loss": 0.485, + "step": 4962 + }, + { + "epoch": 1.1070711577068928, + "grad_norm": 0.16352368891239166, + "learning_rate": 1.4137406299693068e-05, + "loss": 0.4702, + "step": 4963 + }, + { + "epoch": 1.107294222618782, + "grad_norm": 0.16347824037075043, + "learning_rate": 1.4135263635420287e-05, + "loss": 0.48, + "step": 4964 + }, + { + "epoch": 1.1075172875306714, + "grad_norm": 0.15994015336036682, + "learning_rate": 1.4133120742110591e-05, + "loss": 0.477, + "step": 4965 + }, + { + "epoch": 1.1077403524425609, + "grad_norm": 0.172930508852005, + "learning_rate": 1.4130977619882673e-05, + "loss": 0.4882, + "step": 4966 + }, + { + "epoch": 1.1079634173544501, + "grad_norm": 0.16305923461914062, + "learning_rate": 1.4128834268855224e-05, + "loss": 0.4737, + "step": 4967 + }, + { + "epoch": 1.1081864822663394, + "grad_norm": 0.1601942926645279, + "learning_rate": 1.4126690689146967e-05, + "loss": 0.4876, + "step": 4968 + }, + { + "epoch": 1.108409547178229, + "grad_norm": 0.16627256572246552, + "learning_rate": 1.4124546880876617e-05, + "loss": 0.4928, + "step": 4969 + }, + { + "epoch": 1.1086326120901182, + "grad_norm": 0.16845837235450745, + "learning_rate": 1.4122402844162921e-05, + "loss": 0.4756, + "step": 4970 + }, + { + "epoch": 1.1088556770020075, + "grad_norm": 0.1565047651529312, + "learning_rate": 1.412025857912462e-05, + "loss": 0.4661, + "step": 4971 + }, + { + "epoch": 1.109078741913897, + "grad_norm": 0.15977446734905243, + "learning_rate": 1.4118114085880484e-05, + "loss": 0.4466, + "step": 4972 + }, + { + "epoch": 1.1093018068257863, + "grad_norm": 0.16255097091197968, + "learning_rate": 1.4115969364549288e-05, + "loss": 0.4677, + "step": 4973 + }, + { + "epoch": 1.1095248717376758, + "grad_norm": 0.15899838507175446, + "learning_rate": 1.4113824415249812e-05, + "loss": 0.4786, + "step": 4974 + }, + { + "epoch": 1.109747936649565, + "grad_norm": 0.15630389750003815, + "learning_rate": 1.4111679238100868e-05, + "loss": 0.4657, + "step": 4975 + }, + { + "epoch": 1.1099710015614543, + "grad_norm": 0.16901795566082, + "learning_rate": 1.4109533833221263e-05, + "loss": 0.4582, + "step": 4976 + }, + { + "epoch": 1.1101940664733438, + "grad_norm": 0.15667371451854706, + "learning_rate": 1.4107388200729824e-05, + "loss": 0.4686, + "step": 4977 + }, + { + "epoch": 1.1104171313852331, + "grad_norm": 0.1538134068250656, + "learning_rate": 1.4105242340745388e-05, + "loss": 0.4591, + "step": 4978 + }, + { + "epoch": 1.1106401962971224, + "grad_norm": 0.15832480788230896, + "learning_rate": 1.4103096253386812e-05, + "loss": 0.4598, + "step": 4979 + }, + { + "epoch": 1.110863261209012, + "grad_norm": 0.16176483035087585, + "learning_rate": 1.4100949938772953e-05, + "loss": 0.4959, + "step": 4980 + }, + { + "epoch": 1.1110863261209012, + "grad_norm": 0.15992896258831024, + "learning_rate": 1.4098803397022694e-05, + "loss": 0.4693, + "step": 4981 + }, + { + "epoch": 1.1113093910327905, + "grad_norm": 0.16486084461212158, + "learning_rate": 1.4096656628254916e-05, + "loss": 0.4687, + "step": 4982 + }, + { + "epoch": 1.11153245594468, + "grad_norm": 0.16968096792697906, + "learning_rate": 1.4094509632588528e-05, + "loss": 0.4648, + "step": 4983 + }, + { + "epoch": 1.1117555208565693, + "grad_norm": 0.1609363853931427, + "learning_rate": 1.409236241014244e-05, + "loss": 0.47, + "step": 4984 + }, + { + "epoch": 1.1119785857684585, + "grad_norm": 0.16171255707740784, + "learning_rate": 1.409021496103558e-05, + "loss": 0.4538, + "step": 4985 + }, + { + "epoch": 1.112201650680348, + "grad_norm": 0.1664259135723114, + "learning_rate": 1.4088067285386885e-05, + "loss": 0.4717, + "step": 4986 + }, + { + "epoch": 1.1124247155922373, + "grad_norm": 0.16782160103321075, + "learning_rate": 1.4085919383315311e-05, + "loss": 0.472, + "step": 4987 + }, + { + "epoch": 1.1126477805041266, + "grad_norm": 0.17342595756053925, + "learning_rate": 1.408377125493982e-05, + "loss": 0.5161, + "step": 4988 + }, + { + "epoch": 1.112870845416016, + "grad_norm": 0.17580246925354004, + "learning_rate": 1.408162290037939e-05, + "loss": 0.4874, + "step": 4989 + }, + { + "epoch": 1.1130939103279054, + "grad_norm": 0.16201059520244598, + "learning_rate": 1.4079474319753007e-05, + "loss": 0.4597, + "step": 4990 + }, + { + "epoch": 1.113316975239795, + "grad_norm": 0.1731184720993042, + "learning_rate": 1.4077325513179676e-05, + "loss": 0.5251, + "step": 4991 + }, + { + "epoch": 1.1135400401516842, + "grad_norm": 0.14824289083480835, + "learning_rate": 1.407517648077841e-05, + "loss": 0.4159, + "step": 4992 + }, + { + "epoch": 1.1137631050635735, + "grad_norm": 0.1582590788602829, + "learning_rate": 1.4073027222668236e-05, + "loss": 0.4813, + "step": 4993 + }, + { + "epoch": 1.113986169975463, + "grad_norm": 0.15586505830287933, + "learning_rate": 1.4070877738968196e-05, + "loss": 0.4634, + "step": 4994 + }, + { + "epoch": 1.1142092348873522, + "grad_norm": 0.15916606783866882, + "learning_rate": 1.4068728029797338e-05, + "loss": 0.4608, + "step": 4995 + }, + { + "epoch": 1.1144322997992415, + "grad_norm": 0.15859851241111755, + "learning_rate": 1.4066578095274732e-05, + "loss": 0.4701, + "step": 4996 + }, + { + "epoch": 1.114655364711131, + "grad_norm": 0.1668933480978012, + "learning_rate": 1.406442793551945e-05, + "loss": 0.4823, + "step": 4997 + }, + { + "epoch": 1.1148784296230203, + "grad_norm": 0.15839633345603943, + "learning_rate": 1.406227755065058e-05, + "loss": 0.4411, + "step": 4998 + }, + { + "epoch": 1.1151014945349096, + "grad_norm": 0.16563774645328522, + "learning_rate": 1.4060126940787228e-05, + "loss": 0.5158, + "step": 4999 + }, + { + "epoch": 1.115324559446799, + "grad_norm": 0.16948749125003815, + "learning_rate": 1.4057976106048509e-05, + "loss": 0.4735, + "step": 5000 + }, + { + "epoch": 1.1155476243586884, + "grad_norm": 0.15439462661743164, + "learning_rate": 1.4055825046553544e-05, + "loss": 0.4646, + "step": 5001 + }, + { + "epoch": 1.1157706892705777, + "grad_norm": 0.22990640997886658, + "learning_rate": 1.4053673762421478e-05, + "loss": 0.4645, + "step": 5002 + }, + { + "epoch": 1.1159937541824672, + "grad_norm": 0.16069011390209198, + "learning_rate": 1.4051522253771458e-05, + "loss": 0.4598, + "step": 5003 + }, + { + "epoch": 1.1162168190943564, + "grad_norm": 0.15775886178016663, + "learning_rate": 1.4049370520722657e-05, + "loss": 0.468, + "step": 5004 + }, + { + "epoch": 1.1164398840062457, + "grad_norm": 0.16639457643032074, + "learning_rate": 1.4047218563394238e-05, + "loss": 0.5046, + "step": 5005 + }, + { + "epoch": 1.1166629489181352, + "grad_norm": 0.16088278591632843, + "learning_rate": 1.40450663819054e-05, + "loss": 0.4801, + "step": 5006 + }, + { + "epoch": 1.1168860138300245, + "grad_norm": 0.16275745630264282, + "learning_rate": 1.404291397637534e-05, + "loss": 0.4568, + "step": 5007 + }, + { + "epoch": 1.117109078741914, + "grad_norm": 0.16248852014541626, + "learning_rate": 1.4040761346923275e-05, + "loss": 0.4617, + "step": 5008 + }, + { + "epoch": 1.1173321436538033, + "grad_norm": 0.15882933139801025, + "learning_rate": 1.4038608493668428e-05, + "loss": 0.4553, + "step": 5009 + }, + { + "epoch": 1.1175552085656926, + "grad_norm": 0.15445846319198608, + "learning_rate": 1.4036455416730038e-05, + "loss": 0.4623, + "step": 5010 + }, + { + "epoch": 1.117778273477582, + "grad_norm": 0.16165289282798767, + "learning_rate": 1.4034302116227358e-05, + "loss": 0.4522, + "step": 5011 + }, + { + "epoch": 1.1180013383894714, + "grad_norm": 0.16482336819171906, + "learning_rate": 1.4032148592279649e-05, + "loss": 0.4897, + "step": 5012 + }, + { + "epoch": 1.1182244033013606, + "grad_norm": 0.16417334973812103, + "learning_rate": 1.4029994845006187e-05, + "loss": 0.459, + "step": 5013 + }, + { + "epoch": 1.1184474682132501, + "grad_norm": 0.16407686471939087, + "learning_rate": 1.4027840874526262e-05, + "loss": 0.4656, + "step": 5014 + }, + { + "epoch": 1.1186705331251394, + "grad_norm": 0.1576424241065979, + "learning_rate": 1.4025686680959174e-05, + "loss": 0.4284, + "step": 5015 + }, + { + "epoch": 1.1188935980370287, + "grad_norm": 0.15783734619617462, + "learning_rate": 1.4023532264424233e-05, + "loss": 0.4609, + "step": 5016 + }, + { + "epoch": 1.1191166629489182, + "grad_norm": 0.1539250910282135, + "learning_rate": 1.4021377625040768e-05, + "loss": 0.4507, + "step": 5017 + }, + { + "epoch": 1.1193397278608075, + "grad_norm": 0.16160206496715546, + "learning_rate": 1.4019222762928113e-05, + "loss": 0.456, + "step": 5018 + }, + { + "epoch": 1.1195627927726968, + "grad_norm": 0.15390300750732422, + "learning_rate": 1.4017067678205623e-05, + "loss": 0.4637, + "step": 5019 + }, + { + "epoch": 1.1197858576845863, + "grad_norm": 0.15983295440673828, + "learning_rate": 1.4014912370992653e-05, + "loss": 0.4809, + "step": 5020 + }, + { + "epoch": 1.1200089225964756, + "grad_norm": 0.16199488937854767, + "learning_rate": 1.4012756841408583e-05, + "loss": 0.4433, + "step": 5021 + }, + { + "epoch": 1.1202319875083648, + "grad_norm": 0.1602371335029602, + "learning_rate": 1.4010601089572794e-05, + "loss": 0.4472, + "step": 5022 + }, + { + "epoch": 1.1204550524202543, + "grad_norm": 0.16461622714996338, + "learning_rate": 1.4008445115604694e-05, + "loss": 0.4709, + "step": 5023 + }, + { + "epoch": 1.1206781173321436, + "grad_norm": 0.16210167109966278, + "learning_rate": 1.4006288919623687e-05, + "loss": 0.4569, + "step": 5024 + }, + { + "epoch": 1.1209011822440331, + "grad_norm": 0.15823470056056976, + "learning_rate": 1.4004132501749198e-05, + "loss": 0.4794, + "step": 5025 + }, + { + "epoch": 1.1211242471559224, + "grad_norm": 0.155581995844841, + "learning_rate": 1.4001975862100668e-05, + "loss": 0.4442, + "step": 5026 + }, + { + "epoch": 1.1213473120678117, + "grad_norm": 0.1608058661222458, + "learning_rate": 1.3999819000797539e-05, + "loss": 0.4535, + "step": 5027 + }, + { + "epoch": 1.1215703769797012, + "grad_norm": 0.16367916762828827, + "learning_rate": 1.3997661917959273e-05, + "loss": 0.4581, + "step": 5028 + }, + { + "epoch": 1.1217934418915905, + "grad_norm": 0.1618059128522873, + "learning_rate": 1.3995504613705344e-05, + "loss": 0.464, + "step": 5029 + }, + { + "epoch": 1.1220165068034798, + "grad_norm": 0.15621179342269897, + "learning_rate": 1.3993347088155237e-05, + "loss": 0.4573, + "step": 5030 + }, + { + "epoch": 1.1222395717153693, + "grad_norm": 0.1720859855413437, + "learning_rate": 1.399118934142845e-05, + "loss": 0.4854, + "step": 5031 + }, + { + "epoch": 1.1224626366272585, + "grad_norm": 0.16994187235832214, + "learning_rate": 1.3989031373644491e-05, + "loss": 0.5074, + "step": 5032 + }, + { + "epoch": 1.1226857015391478, + "grad_norm": 0.20645354688167572, + "learning_rate": 1.3986873184922882e-05, + "loss": 0.4498, + "step": 5033 + }, + { + "epoch": 1.1229087664510373, + "grad_norm": 0.1670864075422287, + "learning_rate": 1.3984714775383159e-05, + "loss": 0.4719, + "step": 5034 + }, + { + "epoch": 1.1231318313629266, + "grad_norm": 0.1660386323928833, + "learning_rate": 1.3982556145144866e-05, + "loss": 0.4869, + "step": 5035 + }, + { + "epoch": 1.1233548962748159, + "grad_norm": 0.16415423154830933, + "learning_rate": 1.3980397294327563e-05, + "loss": 0.4806, + "step": 5036 + }, + { + "epoch": 1.1235779611867054, + "grad_norm": 0.16121268272399902, + "learning_rate": 1.3978238223050817e-05, + "loss": 0.4677, + "step": 5037 + }, + { + "epoch": 1.1238010260985947, + "grad_norm": 0.16271083056926727, + "learning_rate": 1.3976078931434219e-05, + "loss": 0.4749, + "step": 5038 + }, + { + "epoch": 1.124024091010484, + "grad_norm": 0.16719701886177063, + "learning_rate": 1.3973919419597354e-05, + "loss": 0.4537, + "step": 5039 + }, + { + "epoch": 1.1242471559223735, + "grad_norm": 0.15598838031291962, + "learning_rate": 1.3971759687659841e-05, + "loss": 0.4623, + "step": 5040 + }, + { + "epoch": 1.1244702208342627, + "grad_norm": 0.1647387146949768, + "learning_rate": 1.3969599735741288e-05, + "loss": 0.4879, + "step": 5041 + }, + { + "epoch": 1.1246932857461522, + "grad_norm": 0.15755842626094818, + "learning_rate": 1.3967439563961334e-05, + "loss": 0.4564, + "step": 5042 + }, + { + "epoch": 1.1249163506580415, + "grad_norm": 0.1570240706205368, + "learning_rate": 1.396527917243962e-05, + "loss": 0.4593, + "step": 5043 + }, + { + "epoch": 1.1251394155699308, + "grad_norm": 0.6927107572555542, + "learning_rate": 1.3963118561295803e-05, + "loss": 0.4905, + "step": 5044 + }, + { + "epoch": 1.1253624804818203, + "grad_norm": 0.17146988213062286, + "learning_rate": 1.3960957730649551e-05, + "loss": 0.4754, + "step": 5045 + }, + { + "epoch": 1.1255855453937096, + "grad_norm": 0.16707734763622284, + "learning_rate": 1.3958796680620545e-05, + "loss": 0.4578, + "step": 5046 + }, + { + "epoch": 1.1258086103055989, + "grad_norm": 0.15594236552715302, + "learning_rate": 1.3956635411328478e-05, + "loss": 0.4556, + "step": 5047 + }, + { + "epoch": 1.1260316752174884, + "grad_norm": 0.15944088995456696, + "learning_rate": 1.395447392289305e-05, + "loss": 0.4478, + "step": 5048 + }, + { + "epoch": 1.1262547401293777, + "grad_norm": 0.16137385368347168, + "learning_rate": 1.3952312215433987e-05, + "loss": 0.4922, + "step": 5049 + }, + { + "epoch": 1.126477805041267, + "grad_norm": 0.18752697110176086, + "learning_rate": 1.3950150289071007e-05, + "loss": 0.4641, + "step": 5050 + }, + { + "epoch": 1.1267008699531564, + "grad_norm": 0.16448596119880676, + "learning_rate": 1.394798814392386e-05, + "loss": 0.475, + "step": 5051 + }, + { + "epoch": 1.1269239348650457, + "grad_norm": 0.16754111647605896, + "learning_rate": 1.3945825780112294e-05, + "loss": 0.4709, + "step": 5052 + }, + { + "epoch": 1.127146999776935, + "grad_norm": 0.165597066283226, + "learning_rate": 1.394366319775608e-05, + "loss": 0.4761, + "step": 5053 + }, + { + "epoch": 1.1273700646888245, + "grad_norm": 0.16928669810295105, + "learning_rate": 1.3941500396974984e-05, + "loss": 0.5111, + "step": 5054 + }, + { + "epoch": 1.1275931296007138, + "grad_norm": 0.16309335827827454, + "learning_rate": 1.3939337377888808e-05, + "loss": 0.4638, + "step": 5055 + }, + { + "epoch": 1.127816194512603, + "grad_norm": 0.16362158954143524, + "learning_rate": 1.3937174140617349e-05, + "loss": 0.4496, + "step": 5056 + }, + { + "epoch": 1.1280392594244926, + "grad_norm": 0.15970171988010406, + "learning_rate": 1.3935010685280417e-05, + "loss": 0.4583, + "step": 5057 + }, + { + "epoch": 1.1282623243363818, + "grad_norm": 0.16845880448818207, + "learning_rate": 1.3932847011997846e-05, + "loss": 0.4705, + "step": 5058 + }, + { + "epoch": 1.1284853892482714, + "grad_norm": 0.1621864140033722, + "learning_rate": 1.3930683120889463e-05, + "loss": 0.4659, + "step": 5059 + }, + { + "epoch": 1.1287084541601606, + "grad_norm": 0.16252627968788147, + "learning_rate": 1.3928519012075128e-05, + "loss": 0.4628, + "step": 5060 + }, + { + "epoch": 1.12893151907205, + "grad_norm": 0.1627832055091858, + "learning_rate": 1.3926354685674697e-05, + "loss": 0.4617, + "step": 5061 + }, + { + "epoch": 1.1291545839839394, + "grad_norm": 0.1639028936624527, + "learning_rate": 1.3924190141808048e-05, + "loss": 0.4626, + "step": 5062 + }, + { + "epoch": 1.1293776488958287, + "grad_norm": 0.1629188507795334, + "learning_rate": 1.3922025380595061e-05, + "loss": 0.4905, + "step": 5063 + }, + { + "epoch": 1.129600713807718, + "grad_norm": 0.16841430962085724, + "learning_rate": 1.391986040215564e-05, + "loss": 0.4512, + "step": 5064 + }, + { + "epoch": 1.1298237787196075, + "grad_norm": 0.1598411500453949, + "learning_rate": 1.3917695206609693e-05, + "loss": 0.4453, + "step": 5065 + }, + { + "epoch": 1.1300468436314968, + "grad_norm": 0.15308506786823273, + "learning_rate": 1.3915529794077142e-05, + "loss": 0.4563, + "step": 5066 + }, + { + "epoch": 1.130269908543386, + "grad_norm": 0.16865313053131104, + "learning_rate": 1.3913364164677922e-05, + "loss": 0.4587, + "step": 5067 + }, + { + "epoch": 1.1304929734552756, + "grad_norm": 0.1975971907377243, + "learning_rate": 1.3911198318531977e-05, + "loss": 0.4523, + "step": 5068 + }, + { + "epoch": 1.1307160383671648, + "grad_norm": 0.1655057817697525, + "learning_rate": 1.3909032255759267e-05, + "loss": 0.4446, + "step": 5069 + }, + { + "epoch": 1.1309391032790541, + "grad_norm": 0.16097813844680786, + "learning_rate": 1.3906865976479766e-05, + "loss": 0.4828, + "step": 5070 + }, + { + "epoch": 1.1311621681909436, + "grad_norm": 0.15927664935588837, + "learning_rate": 1.3904699480813446e-05, + "loss": 0.4715, + "step": 5071 + }, + { + "epoch": 1.131385233102833, + "grad_norm": 0.15957841277122498, + "learning_rate": 1.3902532768880313e-05, + "loss": 0.4638, + "step": 5072 + }, + { + "epoch": 1.1316082980147222, + "grad_norm": 0.16548919677734375, + "learning_rate": 1.3900365840800363e-05, + "loss": 0.495, + "step": 5073 + }, + { + "epoch": 1.1318313629266117, + "grad_norm": 0.1837669163942337, + "learning_rate": 1.3898198696693621e-05, + "loss": 0.4866, + "step": 5074 + }, + { + "epoch": 1.132054427838501, + "grad_norm": 0.16433456540107727, + "learning_rate": 1.3896031336680111e-05, + "loss": 0.4637, + "step": 5075 + }, + { + "epoch": 1.1322774927503905, + "grad_norm": 0.15830080211162567, + "learning_rate": 1.3893863760879882e-05, + "loss": 0.4644, + "step": 5076 + }, + { + "epoch": 1.1325005576622797, + "grad_norm": 0.16347894072532654, + "learning_rate": 1.3891695969412982e-05, + "loss": 0.4764, + "step": 5077 + }, + { + "epoch": 1.132723622574169, + "grad_norm": 0.15911467373371124, + "learning_rate": 1.388952796239948e-05, + "loss": 0.4585, + "step": 5078 + }, + { + "epoch": 1.1329466874860585, + "grad_norm": 0.16895873844623566, + "learning_rate": 1.3887359739959455e-05, + "loss": 0.4787, + "step": 5079 + }, + { + "epoch": 1.1331697523979478, + "grad_norm": 0.16661864519119263, + "learning_rate": 1.3885191302212993e-05, + "loss": 0.4987, + "step": 5080 + }, + { + "epoch": 1.133392817309837, + "grad_norm": 0.16188743710517883, + "learning_rate": 1.38830226492802e-05, + "loss": 0.4428, + "step": 5081 + }, + { + "epoch": 1.1336158822217266, + "grad_norm": 0.1691635102033615, + "learning_rate": 1.3880853781281187e-05, + "loss": 0.4713, + "step": 5082 + }, + { + "epoch": 1.1338389471336159, + "grad_norm": 0.16617491841316223, + "learning_rate": 1.387868469833608e-05, + "loss": 0.4687, + "step": 5083 + }, + { + "epoch": 1.1340620120455052, + "grad_norm": 0.17212171852588654, + "learning_rate": 1.3876515400565016e-05, + "loss": 0.4912, + "step": 5084 + }, + { + "epoch": 1.1342850769573947, + "grad_norm": 0.1617995649576187, + "learning_rate": 1.3874345888088145e-05, + "loss": 0.4807, + "step": 5085 + }, + { + "epoch": 1.134508141869284, + "grad_norm": 0.16481629014015198, + "learning_rate": 1.3872176161025627e-05, + "loss": 0.4652, + "step": 5086 + }, + { + "epoch": 1.1347312067811732, + "grad_norm": 0.16548612713813782, + "learning_rate": 1.3870006219497642e-05, + "loss": 0.4517, + "step": 5087 + }, + { + "epoch": 1.1349542716930627, + "grad_norm": 0.17424030601978302, + "learning_rate": 1.3867836063624363e-05, + "loss": 0.4804, + "step": 5088 + }, + { + "epoch": 1.135177336604952, + "grad_norm": 0.17292533814907074, + "learning_rate": 1.3865665693525994e-05, + "loss": 0.4776, + "step": 5089 + }, + { + "epoch": 1.1354004015168413, + "grad_norm": 0.1664893478155136, + "learning_rate": 1.3863495109322744e-05, + "loss": 0.4653, + "step": 5090 + }, + { + "epoch": 1.1356234664287308, + "grad_norm": 0.16548456251621246, + "learning_rate": 1.3861324311134832e-05, + "loss": 0.4709, + "step": 5091 + }, + { + "epoch": 1.13584653134062, + "grad_norm": 0.1643359214067459, + "learning_rate": 1.3859153299082493e-05, + "loss": 0.4361, + "step": 5092 + }, + { + "epoch": 1.1360695962525096, + "grad_norm": 0.17173811793327332, + "learning_rate": 1.3856982073285965e-05, + "loss": 0.4482, + "step": 5093 + }, + { + "epoch": 1.1362926611643989, + "grad_norm": 0.1580163687467575, + "learning_rate": 1.3854810633865512e-05, + "loss": 0.4395, + "step": 5094 + }, + { + "epoch": 1.1365157260762881, + "grad_norm": 0.16304674744606018, + "learning_rate": 1.3852638980941398e-05, + "loss": 0.4754, + "step": 5095 + }, + { + "epoch": 1.1367387909881776, + "grad_norm": 0.16688407957553864, + "learning_rate": 1.38504671146339e-05, + "loss": 0.5081, + "step": 5096 + }, + { + "epoch": 1.136961855900067, + "grad_norm": 0.16496771574020386, + "learning_rate": 1.3848295035063317e-05, + "loss": 0.4537, + "step": 5097 + }, + { + "epoch": 1.1371849208119562, + "grad_norm": 0.16779771447181702, + "learning_rate": 1.3846122742349946e-05, + "loss": 0.4608, + "step": 5098 + }, + { + "epoch": 1.1374079857238457, + "grad_norm": 0.1593242734670639, + "learning_rate": 1.3843950236614103e-05, + "loss": 0.4761, + "step": 5099 + }, + { + "epoch": 1.137631050635735, + "grad_norm": 0.16632351279258728, + "learning_rate": 1.384177751797612e-05, + "loss": 0.4723, + "step": 5100 + }, + { + "epoch": 1.1378541155476243, + "grad_norm": 0.16760534048080444, + "learning_rate": 1.383960458655633e-05, + "loss": 0.4509, + "step": 5101 + }, + { + "epoch": 1.1380771804595138, + "grad_norm": 0.16486233472824097, + "learning_rate": 1.3837431442475089e-05, + "loss": 0.4806, + "step": 5102 + }, + { + "epoch": 1.138300245371403, + "grad_norm": 0.1571461409330368, + "learning_rate": 1.3835258085852752e-05, + "loss": 0.4564, + "step": 5103 + }, + { + "epoch": 1.1385233102832923, + "grad_norm": 0.16537711024284363, + "learning_rate": 1.38330845168097e-05, + "loss": 0.4556, + "step": 5104 + }, + { + "epoch": 1.1387463751951818, + "grad_norm": 0.1652860939502716, + "learning_rate": 1.3830910735466313e-05, + "loss": 0.447, + "step": 5105 + }, + { + "epoch": 1.1389694401070711, + "grad_norm": 0.15692059695720673, + "learning_rate": 1.3828736741942998e-05, + "loss": 0.4566, + "step": 5106 + }, + { + "epoch": 1.1391925050189604, + "grad_norm": 0.16122755408287048, + "learning_rate": 1.3826562536360155e-05, + "loss": 0.4692, + "step": 5107 + }, + { + "epoch": 1.13941556993085, + "grad_norm": 0.1586706042289734, + "learning_rate": 1.382438811883821e-05, + "loss": 0.4316, + "step": 5108 + }, + { + "epoch": 1.1396386348427392, + "grad_norm": 0.16200071573257446, + "learning_rate": 1.3822213489497594e-05, + "loss": 0.4704, + "step": 5109 + }, + { + "epoch": 1.1398616997546287, + "grad_norm": 0.5188888311386108, + "learning_rate": 1.3820038648458748e-05, + "loss": 0.49, + "step": 5110 + }, + { + "epoch": 1.140084764666518, + "grad_norm": 0.15934044122695923, + "learning_rate": 1.3817863595842138e-05, + "loss": 0.4456, + "step": 5111 + }, + { + "epoch": 1.1403078295784073, + "grad_norm": 0.17257168889045715, + "learning_rate": 1.3815688331768224e-05, + "loss": 0.4655, + "step": 5112 + }, + { + "epoch": 1.1405308944902968, + "grad_norm": 0.16849681735038757, + "learning_rate": 1.3813512856357491e-05, + "loss": 0.4461, + "step": 5113 + }, + { + "epoch": 1.140753959402186, + "grad_norm": 0.16747885942459106, + "learning_rate": 1.3811337169730428e-05, + "loss": 0.473, + "step": 5114 + }, + { + "epoch": 1.1409770243140753, + "grad_norm": 0.1711420714855194, + "learning_rate": 1.3809161272007536e-05, + "loss": 0.4851, + "step": 5115 + }, + { + "epoch": 1.1412000892259648, + "grad_norm": 0.1751970797777176, + "learning_rate": 1.3806985163309334e-05, + "loss": 0.4955, + "step": 5116 + }, + { + "epoch": 1.1414231541378541, + "grad_norm": 0.16653122007846832, + "learning_rate": 1.3804808843756348e-05, + "loss": 0.4697, + "step": 5117 + }, + { + "epoch": 1.1416462190497434, + "grad_norm": 0.158418208360672, + "learning_rate": 1.3802632313469111e-05, + "loss": 0.433, + "step": 5118 + }, + { + "epoch": 1.141869283961633, + "grad_norm": 0.16599524021148682, + "learning_rate": 1.3800455572568182e-05, + "loss": 0.49, + "step": 5119 + }, + { + "epoch": 1.1420923488735222, + "grad_norm": 0.17085126042366028, + "learning_rate": 1.3798278621174113e-05, + "loss": 0.4914, + "step": 5120 + }, + { + "epoch": 1.1423154137854115, + "grad_norm": 0.16169953346252441, + "learning_rate": 1.3796101459407485e-05, + "loss": 0.4851, + "step": 5121 + }, + { + "epoch": 1.142538478697301, + "grad_norm": 0.19225676357746124, + "learning_rate": 1.3793924087388876e-05, + "loss": 0.4554, + "step": 5122 + }, + { + "epoch": 1.1427615436091902, + "grad_norm": 0.15610942244529724, + "learning_rate": 1.379174650523889e-05, + "loss": 0.4436, + "step": 5123 + }, + { + "epoch": 1.1429846085210795, + "grad_norm": 0.16388073563575745, + "learning_rate": 1.3789568713078129e-05, + "loss": 0.4647, + "step": 5124 + }, + { + "epoch": 1.143207673432969, + "grad_norm": 0.15931613743305206, + "learning_rate": 1.3787390711027217e-05, + "loss": 0.4707, + "step": 5125 + }, + { + "epoch": 1.1434307383448583, + "grad_norm": 0.1648622304201126, + "learning_rate": 1.3785212499206783e-05, + "loss": 0.4628, + "step": 5126 + }, + { + "epoch": 1.1436538032567478, + "grad_norm": 0.16240182518959045, + "learning_rate": 1.3783034077737472e-05, + "loss": 0.4735, + "step": 5127 + }, + { + "epoch": 1.143876868168637, + "grad_norm": 0.15895162522792816, + "learning_rate": 1.3780855446739937e-05, + "loss": 0.4587, + "step": 5128 + }, + { + "epoch": 1.1440999330805264, + "grad_norm": 0.1622496396303177, + "learning_rate": 1.3778676606334844e-05, + "loss": 0.4541, + "step": 5129 + }, + { + "epoch": 1.1443229979924159, + "grad_norm": 0.15852504968643188, + "learning_rate": 1.3776497556642874e-05, + "loss": 0.4546, + "step": 5130 + }, + { + "epoch": 1.1445460629043052, + "grad_norm": 0.16722345352172852, + "learning_rate": 1.377431829778471e-05, + "loss": 0.4754, + "step": 5131 + }, + { + "epoch": 1.1447691278161944, + "grad_norm": 0.1600501835346222, + "learning_rate": 1.377213882988106e-05, + "loss": 0.4851, + "step": 5132 + }, + { + "epoch": 1.144992192728084, + "grad_norm": 0.1717759370803833, + "learning_rate": 1.3769959153052634e-05, + "loss": 0.4581, + "step": 5133 + }, + { + "epoch": 1.1452152576399732, + "grad_norm": 0.1694609820842743, + "learning_rate": 1.3767779267420158e-05, + "loss": 0.4584, + "step": 5134 + }, + { + "epoch": 1.1454383225518625, + "grad_norm": 0.16004528105258942, + "learning_rate": 1.3765599173104362e-05, + "loss": 0.4768, + "step": 5135 + }, + { + "epoch": 1.145661387463752, + "grad_norm": 0.15713921189308167, + "learning_rate": 1.3763418870225999e-05, + "loss": 0.4515, + "step": 5136 + }, + { + "epoch": 1.1458844523756413, + "grad_norm": 0.16554328799247742, + "learning_rate": 1.3761238358905826e-05, + "loss": 0.4856, + "step": 5137 + }, + { + "epoch": 1.1461075172875306, + "grad_norm": 0.1595989465713501, + "learning_rate": 1.3759057639264614e-05, + "loss": 0.4695, + "step": 5138 + }, + { + "epoch": 1.14633058219942, + "grad_norm": 0.16599664092063904, + "learning_rate": 1.3756876711423143e-05, + "loss": 0.4821, + "step": 5139 + }, + { + "epoch": 1.1465536471113094, + "grad_norm": 0.15603865683078766, + "learning_rate": 1.3754695575502211e-05, + "loss": 0.4469, + "step": 5140 + }, + { + "epoch": 1.1467767120231986, + "grad_norm": 0.16003453731536865, + "learning_rate": 1.3752514231622617e-05, + "loss": 0.4619, + "step": 5141 + }, + { + "epoch": 1.1469997769350881, + "grad_norm": 0.15717166662216187, + "learning_rate": 1.375033267990518e-05, + "loss": 0.4585, + "step": 5142 + }, + { + "epoch": 1.1472228418469774, + "grad_norm": 0.15859000384807587, + "learning_rate": 1.374815092047073e-05, + "loss": 0.4446, + "step": 5143 + }, + { + "epoch": 1.147445906758867, + "grad_norm": 0.16028450429439545, + "learning_rate": 1.3745968953440105e-05, + "loss": 0.4512, + "step": 5144 + }, + { + "epoch": 1.1476689716707562, + "grad_norm": 0.1540936380624771, + "learning_rate": 1.3743786778934158e-05, + "loss": 0.4286, + "step": 5145 + }, + { + "epoch": 1.1478920365826455, + "grad_norm": 0.15674646198749542, + "learning_rate": 1.3741604397073748e-05, + "loss": 0.469, + "step": 5146 + }, + { + "epoch": 1.148115101494535, + "grad_norm": 0.16240215301513672, + "learning_rate": 1.3739421807979753e-05, + "loss": 0.4609, + "step": 5147 + }, + { + "epoch": 1.1483381664064243, + "grad_norm": 0.1656564623117447, + "learning_rate": 1.3737239011773054e-05, + "loss": 0.4829, + "step": 5148 + }, + { + "epoch": 1.1485612313183136, + "grad_norm": 0.16485963761806488, + "learning_rate": 1.3735056008574551e-05, + "loss": 0.4494, + "step": 5149 + }, + { + "epoch": 1.148784296230203, + "grad_norm": 0.16284818947315216, + "learning_rate": 1.3732872798505153e-05, + "loss": 0.4504, + "step": 5150 + }, + { + "epoch": 1.1490073611420923, + "grad_norm": 0.21305973827838898, + "learning_rate": 1.373068938168578e-05, + "loss": 0.4496, + "step": 5151 + }, + { + "epoch": 1.1492304260539816, + "grad_norm": 0.16297850012779236, + "learning_rate": 1.3728505758237358e-05, + "loss": 0.4799, + "step": 5152 + }, + { + "epoch": 1.1494534909658711, + "grad_norm": 0.16585685312747955, + "learning_rate": 1.3726321928280837e-05, + "loss": 0.4608, + "step": 5153 + }, + { + "epoch": 1.1496765558777604, + "grad_norm": 0.16437405347824097, + "learning_rate": 1.3724137891937167e-05, + "loss": 0.4812, + "step": 5154 + }, + { + "epoch": 1.14989962078965, + "grad_norm": 0.15935295820236206, + "learning_rate": 1.3721953649327316e-05, + "loss": 0.4633, + "step": 5155 + }, + { + "epoch": 1.1501226857015392, + "grad_norm": 0.15900611877441406, + "learning_rate": 1.3719769200572258e-05, + "loss": 0.4401, + "step": 5156 + }, + { + "epoch": 1.1503457506134285, + "grad_norm": 0.172005295753479, + "learning_rate": 1.3717584545792983e-05, + "loss": 0.4634, + "step": 5157 + }, + { + "epoch": 1.1505688155253178, + "grad_norm": 0.16333134472370148, + "learning_rate": 1.3715399685110492e-05, + "loss": 0.4391, + "step": 5158 + }, + { + "epoch": 1.1507918804372073, + "grad_norm": 0.16098088026046753, + "learning_rate": 1.3713214618645796e-05, + "loss": 0.4484, + "step": 5159 + }, + { + "epoch": 1.1510149453490965, + "grad_norm": 0.17146125435829163, + "learning_rate": 1.3711029346519917e-05, + "loss": 0.4606, + "step": 5160 + }, + { + "epoch": 1.151238010260986, + "grad_norm": 0.16574114561080933, + "learning_rate": 1.3708843868853889e-05, + "loss": 0.4862, + "step": 5161 + }, + { + "epoch": 1.1514610751728753, + "grad_norm": 0.168543741106987, + "learning_rate": 1.370665818576876e-05, + "loss": 0.4608, + "step": 5162 + }, + { + "epoch": 1.1516841400847646, + "grad_norm": 0.16033907234668732, + "learning_rate": 1.3704472297385583e-05, + "loss": 0.439, + "step": 5163 + }, + { + "epoch": 1.151907204996654, + "grad_norm": 0.16206419467926025, + "learning_rate": 1.3702286203825429e-05, + "loss": 0.4707, + "step": 5164 + }, + { + "epoch": 1.1521302699085434, + "grad_norm": 0.160027876496315, + "learning_rate": 1.3700099905209374e-05, + "loss": 0.4603, + "step": 5165 + }, + { + "epoch": 1.1523533348204327, + "grad_norm": 0.17743688821792603, + "learning_rate": 1.3697913401658516e-05, + "loss": 0.5053, + "step": 5166 + }, + { + "epoch": 1.1525763997323222, + "grad_norm": 0.15926334261894226, + "learning_rate": 1.3695726693293951e-05, + "loss": 0.4685, + "step": 5167 + }, + { + "epoch": 1.1527994646442115, + "grad_norm": 0.18017716705799103, + "learning_rate": 1.3693539780236798e-05, + "loss": 0.4864, + "step": 5168 + }, + { + "epoch": 1.1530225295561007, + "grad_norm": 0.1655382215976715, + "learning_rate": 1.3691352662608175e-05, + "loss": 0.4607, + "step": 5169 + }, + { + "epoch": 1.1532455944679902, + "grad_norm": 0.16363778710365295, + "learning_rate": 1.3689165340529222e-05, + "loss": 0.4425, + "step": 5170 + }, + { + "epoch": 1.1534686593798795, + "grad_norm": 0.1638006567955017, + "learning_rate": 1.3686977814121087e-05, + "loss": 0.4428, + "step": 5171 + }, + { + "epoch": 1.153691724291769, + "grad_norm": 0.1594667136669159, + "learning_rate": 1.368479008350493e-05, + "loss": 0.4417, + "step": 5172 + }, + { + "epoch": 1.1539147892036583, + "grad_norm": 0.1636561006307602, + "learning_rate": 1.3682602148801917e-05, + "loss": 0.4602, + "step": 5173 + }, + { + "epoch": 1.1541378541155476, + "grad_norm": 0.1705152690410614, + "learning_rate": 1.3680414010133237e-05, + "loss": 0.4494, + "step": 5174 + }, + { + "epoch": 1.1543609190274369, + "grad_norm": 0.1863928884267807, + "learning_rate": 1.3678225667620075e-05, + "loss": 0.4592, + "step": 5175 + }, + { + "epoch": 1.1545839839393264, + "grad_norm": 0.1695355474948883, + "learning_rate": 1.3676037121383638e-05, + "loss": 0.5064, + "step": 5176 + }, + { + "epoch": 1.1548070488512157, + "grad_norm": 0.16180136799812317, + "learning_rate": 1.3673848371545145e-05, + "loss": 0.473, + "step": 5177 + }, + { + "epoch": 1.1550301137631052, + "grad_norm": 0.18443214893341064, + "learning_rate": 1.3671659418225815e-05, + "loss": 0.4872, + "step": 5178 + }, + { + "epoch": 1.1552531786749944, + "grad_norm": 0.16797620058059692, + "learning_rate": 1.3669470261546896e-05, + "loss": 0.4538, + "step": 5179 + }, + { + "epoch": 1.1554762435868837, + "grad_norm": 0.16609326004981995, + "learning_rate": 1.3667280901629627e-05, + "loss": 0.5077, + "step": 5180 + }, + { + "epoch": 1.1556993084987732, + "grad_norm": 0.16129672527313232, + "learning_rate": 1.3665091338595277e-05, + "loss": 0.4544, + "step": 5181 + }, + { + "epoch": 1.1559223734106625, + "grad_norm": 0.16736097633838654, + "learning_rate": 1.3662901572565114e-05, + "loss": 0.4633, + "step": 5182 + }, + { + "epoch": 1.1561454383225518, + "grad_norm": 0.21419832110404968, + "learning_rate": 1.3660711603660422e-05, + "loss": 0.4508, + "step": 5183 + }, + { + "epoch": 1.1563685032344413, + "grad_norm": 0.16937695443630219, + "learning_rate": 1.3658521432002494e-05, + "loss": 0.4729, + "step": 5184 + }, + { + "epoch": 1.1565915681463306, + "grad_norm": 0.15548932552337646, + "learning_rate": 1.3656331057712637e-05, + "loss": 0.4829, + "step": 5185 + }, + { + "epoch": 1.1568146330582199, + "grad_norm": 0.17126090824604034, + "learning_rate": 1.3654140480912164e-05, + "loss": 0.4715, + "step": 5186 + }, + { + "epoch": 1.1570376979701094, + "grad_norm": 0.18009397387504578, + "learning_rate": 1.3651949701722407e-05, + "loss": 0.4883, + "step": 5187 + }, + { + "epoch": 1.1572607628819986, + "grad_norm": 0.171475350856781, + "learning_rate": 1.3649758720264705e-05, + "loss": 0.4411, + "step": 5188 + }, + { + "epoch": 1.1574838277938881, + "grad_norm": 0.17869600653648376, + "learning_rate": 1.3647567536660407e-05, + "loss": 0.4841, + "step": 5189 + }, + { + "epoch": 1.1577068927057774, + "grad_norm": 0.1822163164615631, + "learning_rate": 1.3645376151030871e-05, + "loss": 0.4595, + "step": 5190 + }, + { + "epoch": 1.1579299576176667, + "grad_norm": 0.17729108035564423, + "learning_rate": 1.3643184563497479e-05, + "loss": 0.4605, + "step": 5191 + }, + { + "epoch": 1.158153022529556, + "grad_norm": 0.16725416481494904, + "learning_rate": 1.3640992774181605e-05, + "loss": 0.4375, + "step": 5192 + }, + { + "epoch": 1.1583760874414455, + "grad_norm": 0.17714278399944305, + "learning_rate": 1.3638800783204653e-05, + "loss": 0.4944, + "step": 5193 + }, + { + "epoch": 1.1585991523533348, + "grad_norm": 0.159885436296463, + "learning_rate": 1.3636608590688019e-05, + "loss": 0.4793, + "step": 5194 + }, + { + "epoch": 1.1588222172652243, + "grad_norm": 0.16276371479034424, + "learning_rate": 1.363441619675313e-05, + "loss": 0.4872, + "step": 5195 + }, + { + "epoch": 1.1590452821771136, + "grad_norm": 0.16415032744407654, + "learning_rate": 1.3632223601521409e-05, + "loss": 0.4632, + "step": 5196 + }, + { + "epoch": 1.1592683470890028, + "grad_norm": 0.1709449738264084, + "learning_rate": 1.3630030805114297e-05, + "loss": 0.485, + "step": 5197 + }, + { + "epoch": 1.1594914120008923, + "grad_norm": 0.16452427208423615, + "learning_rate": 1.3627837807653249e-05, + "loss": 0.4306, + "step": 5198 + }, + { + "epoch": 1.1597144769127816, + "grad_norm": 0.18321935832500458, + "learning_rate": 1.3625644609259716e-05, + "loss": 0.4982, + "step": 5199 + }, + { + "epoch": 1.159937541824671, + "grad_norm": 0.1770065277814865, + "learning_rate": 1.3623451210055186e-05, + "loss": 0.496, + "step": 5200 + }, + { + "epoch": 1.1601606067365604, + "grad_norm": 0.16729894280433655, + "learning_rate": 1.3621257610161129e-05, + "loss": 0.4844, + "step": 5201 + }, + { + "epoch": 1.1603836716484497, + "grad_norm": 0.1584504395723343, + "learning_rate": 1.3619063809699054e-05, + "loss": 0.4801, + "step": 5202 + }, + { + "epoch": 1.160606736560339, + "grad_norm": 0.15807606279850006, + "learning_rate": 1.3616869808790453e-05, + "loss": 0.4623, + "step": 5203 + }, + { + "epoch": 1.1608298014722285, + "grad_norm": 0.15968340635299683, + "learning_rate": 1.3614675607556857e-05, + "loss": 0.4693, + "step": 5204 + }, + { + "epoch": 1.1610528663841178, + "grad_norm": 0.16744199395179749, + "learning_rate": 1.3612481206119786e-05, + "loss": 0.4821, + "step": 5205 + }, + { + "epoch": 1.1612759312960073, + "grad_norm": 0.16137133538722992, + "learning_rate": 1.3610286604600782e-05, + "loss": 0.4757, + "step": 5206 + }, + { + "epoch": 1.1614989962078965, + "grad_norm": 0.1617872714996338, + "learning_rate": 1.3608091803121397e-05, + "loss": 0.4703, + "step": 5207 + }, + { + "epoch": 1.1617220611197858, + "grad_norm": 0.16734254360198975, + "learning_rate": 1.360589680180319e-05, + "loss": 0.4706, + "step": 5208 + }, + { + "epoch": 1.161945126031675, + "grad_norm": 0.163192480802536, + "learning_rate": 1.3603701600767741e-05, + "loss": 0.4722, + "step": 5209 + }, + { + "epoch": 1.1621681909435646, + "grad_norm": 0.16038168966770172, + "learning_rate": 1.3601506200136624e-05, + "loss": 0.4443, + "step": 5210 + }, + { + "epoch": 1.1623912558554539, + "grad_norm": 0.16467641294002533, + "learning_rate": 1.3599310600031443e-05, + "loss": 0.4465, + "step": 5211 + }, + { + "epoch": 1.1626143207673434, + "grad_norm": 0.1640377789735794, + "learning_rate": 1.3597114800573799e-05, + "loss": 0.4722, + "step": 5212 + }, + { + "epoch": 1.1628373856792327, + "grad_norm": 0.16441082954406738, + "learning_rate": 1.359491880188531e-05, + "loss": 0.4617, + "step": 5213 + }, + { + "epoch": 1.163060450591122, + "grad_norm": 0.16120347380638123, + "learning_rate": 1.3592722604087604e-05, + "loss": 0.4464, + "step": 5214 + }, + { + "epoch": 1.1632835155030115, + "grad_norm": 0.17573657631874084, + "learning_rate": 1.3590526207302324e-05, + "loss": 0.5029, + "step": 5215 + }, + { + "epoch": 1.1635065804149007, + "grad_norm": 0.16203901171684265, + "learning_rate": 1.3588329611651117e-05, + "loss": 0.4654, + "step": 5216 + }, + { + "epoch": 1.16372964532679, + "grad_norm": 0.1684379279613495, + "learning_rate": 1.3586132817255644e-05, + "loss": 0.4706, + "step": 5217 + }, + { + "epoch": 1.1639527102386795, + "grad_norm": 0.15993818640708923, + "learning_rate": 1.3583935824237576e-05, + "loss": 0.4587, + "step": 5218 + }, + { + "epoch": 1.1641757751505688, + "grad_norm": 0.1687501221895218, + "learning_rate": 1.35817386327186e-05, + "loss": 0.4467, + "step": 5219 + }, + { + "epoch": 1.164398840062458, + "grad_norm": 0.16527238488197327, + "learning_rate": 1.3579541242820407e-05, + "loss": 0.4617, + "step": 5220 + }, + { + "epoch": 1.1646219049743476, + "grad_norm": 0.16829682886600494, + "learning_rate": 1.3577343654664705e-05, + "loss": 0.4582, + "step": 5221 + }, + { + "epoch": 1.1648449698862369, + "grad_norm": 0.186698317527771, + "learning_rate": 1.3575145868373207e-05, + "loss": 0.4878, + "step": 5222 + }, + { + "epoch": 1.1650680347981264, + "grad_norm": 0.19950494170188904, + "learning_rate": 1.3572947884067644e-05, + "loss": 0.4636, + "step": 5223 + }, + { + "epoch": 1.1652910997100157, + "grad_norm": 0.16333967447280884, + "learning_rate": 1.3570749701869751e-05, + "loss": 0.4772, + "step": 5224 + }, + { + "epoch": 1.165514164621905, + "grad_norm": 0.16258838772773743, + "learning_rate": 1.3568551321901282e-05, + "loss": 0.4968, + "step": 5225 + }, + { + "epoch": 1.1657372295337942, + "grad_norm": 0.18643935024738312, + "learning_rate": 1.356635274428399e-05, + "loss": 0.4937, + "step": 5226 + }, + { + "epoch": 1.1659602944456837, + "grad_norm": 0.1727738380432129, + "learning_rate": 1.3564153969139654e-05, + "loss": 0.4915, + "step": 5227 + }, + { + "epoch": 1.166183359357573, + "grad_norm": 0.16102762520313263, + "learning_rate": 1.3561954996590047e-05, + "loss": 0.4534, + "step": 5228 + }, + { + "epoch": 1.1664064242694625, + "grad_norm": 0.17399117350578308, + "learning_rate": 1.3559755826756968e-05, + "loss": 0.4775, + "step": 5229 + }, + { + "epoch": 1.1666294891813518, + "grad_norm": 0.1659899652004242, + "learning_rate": 1.3557556459762223e-05, + "loss": 0.4624, + "step": 5230 + }, + { + "epoch": 1.166852554093241, + "grad_norm": 0.17996659874916077, + "learning_rate": 1.3555356895727618e-05, + "loss": 0.4938, + "step": 5231 + }, + { + "epoch": 1.1670756190051306, + "grad_norm": 0.16586771607398987, + "learning_rate": 1.355315713477499e-05, + "loss": 0.4788, + "step": 5232 + }, + { + "epoch": 1.1672986839170199, + "grad_norm": 0.1771279126405716, + "learning_rate": 1.3550957177026164e-05, + "loss": 0.4752, + "step": 5233 + }, + { + "epoch": 1.1675217488289091, + "grad_norm": 0.17474165558815002, + "learning_rate": 1.3548757022602997e-05, + "loss": 0.4734, + "step": 5234 + }, + { + "epoch": 1.1677448137407986, + "grad_norm": 0.16958124935626984, + "learning_rate": 1.3546556671627341e-05, + "loss": 0.4713, + "step": 5235 + }, + { + "epoch": 1.167967878652688, + "grad_norm": 0.16799892485141754, + "learning_rate": 1.354435612422107e-05, + "loss": 0.4604, + "step": 5236 + }, + { + "epoch": 1.1681909435645772, + "grad_norm": 0.17070333659648895, + "learning_rate": 1.3542155380506059e-05, + "loss": 0.4706, + "step": 5237 + }, + { + "epoch": 1.1684140084764667, + "grad_norm": 0.1612580418586731, + "learning_rate": 1.3539954440604206e-05, + "loss": 0.4619, + "step": 5238 + }, + { + "epoch": 1.168637073388356, + "grad_norm": 0.16792915761470795, + "learning_rate": 1.3537753304637406e-05, + "loss": 0.4564, + "step": 5239 + }, + { + "epoch": 1.1688601383002455, + "grad_norm": 0.16356982290744781, + "learning_rate": 1.3535551972727577e-05, + "loss": 0.4843, + "step": 5240 + }, + { + "epoch": 1.1690832032121348, + "grad_norm": 0.161970853805542, + "learning_rate": 1.3533350444996636e-05, + "loss": 0.4573, + "step": 5241 + }, + { + "epoch": 1.169306268124024, + "grad_norm": 0.17388132214546204, + "learning_rate": 1.3531148721566525e-05, + "loss": 0.4739, + "step": 5242 + }, + { + "epoch": 1.1695293330359133, + "grad_norm": 0.1602068692445755, + "learning_rate": 1.3528946802559184e-05, + "loss": 0.4584, + "step": 5243 + }, + { + "epoch": 1.1697523979478028, + "grad_norm": 0.1629067063331604, + "learning_rate": 1.3526744688096574e-05, + "loss": 0.4653, + "step": 5244 + }, + { + "epoch": 1.1699754628596921, + "grad_norm": 0.16808199882507324, + "learning_rate": 1.3524542378300658e-05, + "loss": 0.4588, + "step": 5245 + }, + { + "epoch": 1.1701985277715816, + "grad_norm": 0.17957061529159546, + "learning_rate": 1.3522339873293416e-05, + "loss": 0.49, + "step": 5246 + }, + { + "epoch": 1.170421592683471, + "grad_norm": 0.15982255339622498, + "learning_rate": 1.3520137173196832e-05, + "loss": 0.4456, + "step": 5247 + }, + { + "epoch": 1.1706446575953602, + "grad_norm": 0.16950003802776337, + "learning_rate": 1.3517934278132909e-05, + "loss": 0.4802, + "step": 5248 + }, + { + "epoch": 1.1708677225072497, + "grad_norm": 0.16196782886981964, + "learning_rate": 1.351573118822366e-05, + "loss": 0.4905, + "step": 5249 + }, + { + "epoch": 1.171090787419139, + "grad_norm": 0.16096629202365875, + "learning_rate": 1.3513527903591101e-05, + "loss": 0.4829, + "step": 5250 + }, + { + "epoch": 1.1713138523310282, + "grad_norm": 0.18132293224334717, + "learning_rate": 1.3511324424357269e-05, + "loss": 0.485, + "step": 5251 + }, + { + "epoch": 1.1715369172429178, + "grad_norm": 0.18736086785793304, + "learning_rate": 1.3509120750644198e-05, + "loss": 0.4569, + "step": 5252 + }, + { + "epoch": 1.171759982154807, + "grad_norm": 0.1668289601802826, + "learning_rate": 1.350691688257395e-05, + "loss": 0.4785, + "step": 5253 + }, + { + "epoch": 1.1719830470666963, + "grad_norm": 0.16315414011478424, + "learning_rate": 1.3504712820268584e-05, + "loss": 0.4545, + "step": 5254 + }, + { + "epoch": 1.1722061119785858, + "grad_norm": 0.1799056977033615, + "learning_rate": 1.3502508563850179e-05, + "loss": 0.5051, + "step": 5255 + }, + { + "epoch": 1.172429176890475, + "grad_norm": 0.16646715998649597, + "learning_rate": 1.3500304113440815e-05, + "loss": 0.4531, + "step": 5256 + }, + { + "epoch": 1.1726522418023646, + "grad_norm": 0.16029830276966095, + "learning_rate": 1.3498099469162594e-05, + "loss": 0.4614, + "step": 5257 + }, + { + "epoch": 1.1728753067142539, + "grad_norm": 0.16910234093666077, + "learning_rate": 1.3495894631137618e-05, + "loss": 0.484, + "step": 5258 + }, + { + "epoch": 1.1730983716261432, + "grad_norm": 0.16509772837162018, + "learning_rate": 1.349368959948801e-05, + "loss": 0.4458, + "step": 5259 + }, + { + "epoch": 1.1733214365380324, + "grad_norm": 0.16832312941551208, + "learning_rate": 1.3491484374335893e-05, + "loss": 0.4574, + "step": 5260 + }, + { + "epoch": 1.173544501449922, + "grad_norm": 0.15936195850372314, + "learning_rate": 1.3489278955803409e-05, + "loss": 0.4441, + "step": 5261 + }, + { + "epoch": 1.1737675663618112, + "grad_norm": 0.17124436795711517, + "learning_rate": 1.348707334401271e-05, + "loss": 0.4525, + "step": 5262 + }, + { + "epoch": 1.1739906312737007, + "grad_norm": 0.16587725281715393, + "learning_rate": 1.3484867539085952e-05, + "loss": 0.4683, + "step": 5263 + }, + { + "epoch": 1.17421369618559, + "grad_norm": 0.16889216005802155, + "learning_rate": 1.3482661541145315e-05, + "loss": 0.4646, + "step": 5264 + }, + { + "epoch": 1.1744367610974793, + "grad_norm": 0.16621263325214386, + "learning_rate": 1.3480455350312968e-05, + "loss": 0.4614, + "step": 5265 + }, + { + "epoch": 1.1746598260093688, + "grad_norm": 0.1614251285791397, + "learning_rate": 1.3478248966711115e-05, + "loss": 0.4549, + "step": 5266 + }, + { + "epoch": 1.174882890921258, + "grad_norm": 0.16062335669994354, + "learning_rate": 1.3476042390461954e-05, + "loss": 0.4592, + "step": 5267 + }, + { + "epoch": 1.1751059558331474, + "grad_norm": 0.16389133036136627, + "learning_rate": 1.3473835621687701e-05, + "loss": 0.4582, + "step": 5268 + }, + { + "epoch": 1.1753290207450369, + "grad_norm": 0.16292013227939606, + "learning_rate": 1.3471628660510576e-05, + "loss": 0.4444, + "step": 5269 + }, + { + "epoch": 1.1755520856569261, + "grad_norm": 0.16953033208847046, + "learning_rate": 1.3469421507052824e-05, + "loss": 0.4735, + "step": 5270 + }, + { + "epoch": 1.1757751505688154, + "grad_norm": 0.1629514843225479, + "learning_rate": 1.346721416143668e-05, + "loss": 0.4534, + "step": 5271 + }, + { + "epoch": 1.175998215480705, + "grad_norm": 0.1695055365562439, + "learning_rate": 1.346500662378441e-05, + "loss": 0.4498, + "step": 5272 + }, + { + "epoch": 1.1762212803925942, + "grad_norm": 0.1747894585132599, + "learning_rate": 1.3462798894218278e-05, + "loss": 0.4773, + "step": 5273 + }, + { + "epoch": 1.1764443453044837, + "grad_norm": 0.16201353073120117, + "learning_rate": 1.3460590972860561e-05, + "loss": 0.4401, + "step": 5274 + }, + { + "epoch": 1.176667410216373, + "grad_norm": 0.16511163115501404, + "learning_rate": 1.3458382859833545e-05, + "loss": 0.4652, + "step": 5275 + }, + { + "epoch": 1.1768904751282623, + "grad_norm": 0.17246244847774506, + "learning_rate": 1.3456174555259535e-05, + "loss": 0.4619, + "step": 5276 + }, + { + "epoch": 1.1771135400401516, + "grad_norm": 0.16072800755500793, + "learning_rate": 1.3453966059260836e-05, + "loss": 0.473, + "step": 5277 + }, + { + "epoch": 1.177336604952041, + "grad_norm": 0.16221876442432404, + "learning_rate": 1.345175737195977e-05, + "loss": 0.4588, + "step": 5278 + }, + { + "epoch": 1.1775596698639303, + "grad_norm": 0.16264645755290985, + "learning_rate": 1.344954849347867e-05, + "loss": 0.4536, + "step": 5279 + }, + { + "epoch": 1.1777827347758199, + "grad_norm": 0.1796528697013855, + "learning_rate": 1.3447339423939876e-05, + "loss": 0.4775, + "step": 5280 + }, + { + "epoch": 1.1780057996877091, + "grad_norm": 0.1555919647216797, + "learning_rate": 1.3445130163465739e-05, + "loss": 0.4576, + "step": 5281 + }, + { + "epoch": 1.1782288645995984, + "grad_norm": 0.16730302572250366, + "learning_rate": 1.3442920712178622e-05, + "loss": 0.4591, + "step": 5282 + }, + { + "epoch": 1.178451929511488, + "grad_norm": 0.16881461441516876, + "learning_rate": 1.3440711070200903e-05, + "loss": 0.4516, + "step": 5283 + }, + { + "epoch": 1.1786749944233772, + "grad_norm": 0.1790027767419815, + "learning_rate": 1.3438501237654958e-05, + "loss": 0.4627, + "step": 5284 + }, + { + "epoch": 1.1788980593352665, + "grad_norm": 0.1679370403289795, + "learning_rate": 1.3436291214663186e-05, + "loss": 0.4759, + "step": 5285 + }, + { + "epoch": 1.179121124247156, + "grad_norm": 0.17167186737060547, + "learning_rate": 1.3434081001347992e-05, + "loss": 0.4687, + "step": 5286 + }, + { + "epoch": 1.1793441891590453, + "grad_norm": 0.1699121594429016, + "learning_rate": 1.3431870597831792e-05, + "loss": 0.4634, + "step": 5287 + }, + { + "epoch": 1.1795672540709345, + "grad_norm": 0.15923888981342316, + "learning_rate": 1.3429660004237008e-05, + "loss": 0.4293, + "step": 5288 + }, + { + "epoch": 1.179790318982824, + "grad_norm": 0.15723690390586853, + "learning_rate": 1.3427449220686085e-05, + "loss": 0.4476, + "step": 5289 + }, + { + "epoch": 1.1800133838947133, + "grad_norm": 0.16163286566734314, + "learning_rate": 1.342523824730146e-05, + "loss": 0.4554, + "step": 5290 + }, + { + "epoch": 1.1802364488066028, + "grad_norm": 0.16615921258926392, + "learning_rate": 1.3423027084205597e-05, + "loss": 0.4751, + "step": 5291 + }, + { + "epoch": 1.1804595137184921, + "grad_norm": 0.1613457053899765, + "learning_rate": 1.3420815731520965e-05, + "loss": 0.462, + "step": 5292 + }, + { + "epoch": 1.1806825786303814, + "grad_norm": 0.16468282043933868, + "learning_rate": 1.3418604189370034e-05, + "loss": 0.4773, + "step": 5293 + }, + { + "epoch": 1.1809056435422707, + "grad_norm": 0.1656079739332199, + "learning_rate": 1.3416392457875302e-05, + "loss": 0.4593, + "step": 5294 + }, + { + "epoch": 1.1811287084541602, + "grad_norm": 0.15908241271972656, + "learning_rate": 1.3414180537159265e-05, + "loss": 0.4527, + "step": 5295 + }, + { + "epoch": 1.1813517733660495, + "grad_norm": 0.181619331240654, + "learning_rate": 1.3411968427344438e-05, + "loss": 0.4779, + "step": 5296 + }, + { + "epoch": 1.181574838277939, + "grad_norm": 0.16857735812664032, + "learning_rate": 1.3409756128553331e-05, + "loss": 0.4994, + "step": 5297 + }, + { + "epoch": 1.1817979031898282, + "grad_norm": 0.16610047221183777, + "learning_rate": 1.3407543640908485e-05, + "loss": 0.4669, + "step": 5298 + }, + { + "epoch": 1.1820209681017175, + "grad_norm": 0.16683466732501984, + "learning_rate": 1.3405330964532437e-05, + "loss": 0.4855, + "step": 5299 + }, + { + "epoch": 1.182244033013607, + "grad_norm": 0.1574050635099411, + "learning_rate": 1.340311809954774e-05, + "loss": 0.4539, + "step": 5300 + }, + { + "epoch": 1.1824670979254963, + "grad_norm": 0.16250169277191162, + "learning_rate": 1.3400905046076955e-05, + "loss": 0.4827, + "step": 5301 + }, + { + "epoch": 1.1826901628373856, + "grad_norm": 0.179072305560112, + "learning_rate": 1.3398691804242658e-05, + "loss": 0.4821, + "step": 5302 + }, + { + "epoch": 1.182913227749275, + "grad_norm": 0.15977934002876282, + "learning_rate": 1.339647837416743e-05, + "loss": 0.436, + "step": 5303 + }, + { + "epoch": 1.1831362926611644, + "grad_norm": 0.16551627218723297, + "learning_rate": 1.3394264755973864e-05, + "loss": 0.473, + "step": 5304 + }, + { + "epoch": 1.1833593575730537, + "grad_norm": 0.16227102279663086, + "learning_rate": 1.3392050949784566e-05, + "loss": 0.4693, + "step": 5305 + }, + { + "epoch": 1.1835824224849432, + "grad_norm": 0.16538846492767334, + "learning_rate": 1.3389836955722149e-05, + "loss": 0.4523, + "step": 5306 + }, + { + "epoch": 1.1838054873968324, + "grad_norm": 0.1663748174905777, + "learning_rate": 1.3387622773909237e-05, + "loss": 0.4656, + "step": 5307 + }, + { + "epoch": 1.184028552308722, + "grad_norm": 0.1712309867143631, + "learning_rate": 1.338540840446847e-05, + "loss": 0.4611, + "step": 5308 + }, + { + "epoch": 1.1842516172206112, + "grad_norm": 0.1841256320476532, + "learning_rate": 1.3383193847522487e-05, + "loss": 0.4955, + "step": 5309 + }, + { + "epoch": 1.1844746821325005, + "grad_norm": 0.151309996843338, + "learning_rate": 1.338097910319395e-05, + "loss": 0.4323, + "step": 5310 + }, + { + "epoch": 1.1846977470443898, + "grad_norm": 0.16778849065303802, + "learning_rate": 1.337876417160552e-05, + "loss": 0.4633, + "step": 5311 + }, + { + "epoch": 1.1849208119562793, + "grad_norm": 0.16986416280269623, + "learning_rate": 1.3376549052879883e-05, + "loss": 0.4243, + "step": 5312 + }, + { + "epoch": 1.1851438768681686, + "grad_norm": 0.1710985153913498, + "learning_rate": 1.3374333747139714e-05, + "loss": 0.4569, + "step": 5313 + }, + { + "epoch": 1.185366941780058, + "grad_norm": 0.1675567626953125, + "learning_rate": 1.3372118254507718e-05, + "loss": 0.4747, + "step": 5314 + }, + { + "epoch": 1.1855900066919474, + "grad_norm": 0.19594267010688782, + "learning_rate": 1.3369902575106604e-05, + "loss": 0.4645, + "step": 5315 + }, + { + "epoch": 1.1858130716038366, + "grad_norm": 0.1636292189359665, + "learning_rate": 1.3367686709059084e-05, + "loss": 0.4467, + "step": 5316 + }, + { + "epoch": 1.1860361365157261, + "grad_norm": 0.16308671236038208, + "learning_rate": 1.3365470656487896e-05, + "loss": 0.4704, + "step": 5317 + }, + { + "epoch": 1.1862592014276154, + "grad_norm": 0.16027814149856567, + "learning_rate": 1.3363254417515769e-05, + "loss": 0.4726, + "step": 5318 + }, + { + "epoch": 1.1864822663395047, + "grad_norm": 0.16369383037090302, + "learning_rate": 1.336103799226546e-05, + "loss": 0.4467, + "step": 5319 + }, + { + "epoch": 1.1867053312513942, + "grad_norm": 0.1744934618473053, + "learning_rate": 1.335882138085972e-05, + "loss": 0.4677, + "step": 5320 + }, + { + "epoch": 1.1869283961632835, + "grad_norm": 0.17109525203704834, + "learning_rate": 1.3356604583421331e-05, + "loss": 0.4587, + "step": 5321 + }, + { + "epoch": 1.1871514610751728, + "grad_norm": 0.17207783460617065, + "learning_rate": 1.335438760007306e-05, + "loss": 0.4468, + "step": 5322 + }, + { + "epoch": 1.1873745259870623, + "grad_norm": 0.16258476674556732, + "learning_rate": 1.3352170430937707e-05, + "loss": 0.4498, + "step": 5323 + }, + { + "epoch": 1.1875975908989516, + "grad_norm": 0.1659199595451355, + "learning_rate": 1.334995307613807e-05, + "loss": 0.4492, + "step": 5324 + }, + { + "epoch": 1.187820655810841, + "grad_norm": 0.18148742616176605, + "learning_rate": 1.3347735535796957e-05, + "loss": 0.4785, + "step": 5325 + }, + { + "epoch": 1.1880437207227303, + "grad_norm": 0.16647182404994965, + "learning_rate": 1.3345517810037194e-05, + "loss": 0.4375, + "step": 5326 + }, + { + "epoch": 1.1882667856346196, + "grad_norm": 0.16515585780143738, + "learning_rate": 1.334329989898161e-05, + "loss": 0.4653, + "step": 5327 + }, + { + "epoch": 1.1884898505465091, + "grad_norm": 0.161190927028656, + "learning_rate": 1.3341081802753046e-05, + "loss": 0.4448, + "step": 5328 + }, + { + "epoch": 1.1887129154583984, + "grad_norm": 0.1584840565919876, + "learning_rate": 1.333886352147436e-05, + "loss": 0.446, + "step": 5329 + }, + { + "epoch": 1.1889359803702877, + "grad_norm": 0.16086257994174957, + "learning_rate": 1.3336645055268405e-05, + "loss": 0.461, + "step": 5330 + }, + { + "epoch": 1.1891590452821772, + "grad_norm": 0.17658580839633942, + "learning_rate": 1.333442640425806e-05, + "loss": 0.4905, + "step": 5331 + }, + { + "epoch": 1.1893821101940665, + "grad_norm": 0.17238235473632812, + "learning_rate": 1.3332207568566209e-05, + "loss": 0.4966, + "step": 5332 + }, + { + "epoch": 1.1896051751059558, + "grad_norm": 0.16364935040473938, + "learning_rate": 1.332998854831574e-05, + "loss": 0.4554, + "step": 5333 + }, + { + "epoch": 1.1898282400178453, + "grad_norm": 0.19050312042236328, + "learning_rate": 1.3327769343629559e-05, + "loss": 0.4485, + "step": 5334 + }, + { + "epoch": 1.1900513049297345, + "grad_norm": 0.17412003874778748, + "learning_rate": 1.3325549954630579e-05, + "loss": 0.4776, + "step": 5335 + }, + { + "epoch": 1.1902743698416238, + "grad_norm": 0.16760744154453278, + "learning_rate": 1.3323330381441723e-05, + "loss": 0.4481, + "step": 5336 + }, + { + "epoch": 1.1904974347535133, + "grad_norm": 0.16635169088840485, + "learning_rate": 1.3321110624185927e-05, + "loss": 0.4572, + "step": 5337 + }, + { + "epoch": 1.1907204996654026, + "grad_norm": 0.1623852401971817, + "learning_rate": 1.3318890682986135e-05, + "loss": 0.455, + "step": 5338 + }, + { + "epoch": 1.190943564577292, + "grad_norm": 0.1643582582473755, + "learning_rate": 1.3316670557965299e-05, + "loss": 0.4356, + "step": 5339 + }, + { + "epoch": 1.1911666294891814, + "grad_norm": 0.1604517102241516, + "learning_rate": 1.3314450249246385e-05, + "loss": 0.4334, + "step": 5340 + }, + { + "epoch": 1.1913896944010707, + "grad_norm": 0.16572065651416779, + "learning_rate": 1.3312229756952366e-05, + "loss": 0.467, + "step": 5341 + }, + { + "epoch": 1.1916127593129602, + "grad_norm": 0.1680530309677124, + "learning_rate": 1.3310009081206232e-05, + "loss": 0.4546, + "step": 5342 + }, + { + "epoch": 1.1918358242248495, + "grad_norm": 0.16786667704582214, + "learning_rate": 1.330778822213097e-05, + "loss": 0.4519, + "step": 5343 + }, + { + "epoch": 1.1920588891367387, + "grad_norm": 0.17252503335475922, + "learning_rate": 1.3305567179849594e-05, + "loss": 0.4748, + "step": 5344 + }, + { + "epoch": 1.1922819540486282, + "grad_norm": 0.16872383654117584, + "learning_rate": 1.3303345954485113e-05, + "loss": 0.4529, + "step": 5345 + }, + { + "epoch": 1.1925050189605175, + "grad_norm": 0.1684456765651703, + "learning_rate": 1.330112454616055e-05, + "loss": 0.4696, + "step": 5346 + }, + { + "epoch": 1.1927280838724068, + "grad_norm": 0.16428877413272858, + "learning_rate": 1.3298902954998951e-05, + "loss": 0.4595, + "step": 5347 + }, + { + "epoch": 1.1929511487842963, + "grad_norm": 0.17616313695907593, + "learning_rate": 1.329668118112335e-05, + "loss": 0.4696, + "step": 5348 + }, + { + "epoch": 1.1931742136961856, + "grad_norm": 0.18709328770637512, + "learning_rate": 1.3294459224656813e-05, + "loss": 0.4847, + "step": 5349 + }, + { + "epoch": 1.1933972786080749, + "grad_norm": 0.17944732308387756, + "learning_rate": 1.3292237085722396e-05, + "loss": 0.4338, + "step": 5350 + }, + { + "epoch": 1.1936203435199644, + "grad_norm": 0.1728639304637909, + "learning_rate": 1.3290014764443186e-05, + "loss": 0.4658, + "step": 5351 + }, + { + "epoch": 1.1938434084318537, + "grad_norm": 0.17052049934864044, + "learning_rate": 1.328779226094226e-05, + "loss": 0.4922, + "step": 5352 + }, + { + "epoch": 1.194066473343743, + "grad_norm": 0.16665810346603394, + "learning_rate": 1.3285569575342719e-05, + "loss": 0.485, + "step": 5353 + }, + { + "epoch": 1.1942895382556324, + "grad_norm": 0.17398186028003693, + "learning_rate": 1.3283346707767666e-05, + "loss": 0.4587, + "step": 5354 + }, + { + "epoch": 1.1945126031675217, + "grad_norm": 0.1634979397058487, + "learning_rate": 1.3281123658340222e-05, + "loss": 0.4309, + "step": 5355 + }, + { + "epoch": 1.194735668079411, + "grad_norm": 0.20972603559494019, + "learning_rate": 1.3278900427183507e-05, + "loss": 0.4808, + "step": 5356 + }, + { + "epoch": 1.1949587329913005, + "grad_norm": 0.17813637852668762, + "learning_rate": 1.3276677014420665e-05, + "loss": 0.4556, + "step": 5357 + }, + { + "epoch": 1.1951817979031898, + "grad_norm": 0.18965846300125122, + "learning_rate": 1.3274453420174835e-05, + "loss": 0.4716, + "step": 5358 + }, + { + "epoch": 1.1954048628150793, + "grad_norm": 0.16441969573497772, + "learning_rate": 1.3272229644569182e-05, + "loss": 0.4379, + "step": 5359 + }, + { + "epoch": 1.1956279277269686, + "grad_norm": 0.17030787467956543, + "learning_rate": 1.3270005687726864e-05, + "loss": 0.4506, + "step": 5360 + }, + { + "epoch": 1.1958509926388579, + "grad_norm": 0.1666693389415741, + "learning_rate": 1.3267781549771064e-05, + "loss": 0.471, + "step": 5361 + }, + { + "epoch": 1.1960740575507474, + "grad_norm": 0.1696898639202118, + "learning_rate": 1.3265557230824967e-05, + "loss": 0.4808, + "step": 5362 + }, + { + "epoch": 1.1962971224626366, + "grad_norm": 0.17153112590312958, + "learning_rate": 1.326333273101177e-05, + "loss": 0.4762, + "step": 5363 + }, + { + "epoch": 1.196520187374526, + "grad_norm": 0.1648848056793213, + "learning_rate": 1.3261108050454674e-05, + "loss": 0.4448, + "step": 5364 + }, + { + "epoch": 1.1967432522864154, + "grad_norm": 0.17086121439933777, + "learning_rate": 1.3258883189276906e-05, + "loss": 0.4848, + "step": 5365 + }, + { + "epoch": 1.1969663171983047, + "grad_norm": 0.16545124351978302, + "learning_rate": 1.3256658147601686e-05, + "loss": 0.4797, + "step": 5366 + }, + { + "epoch": 1.197189382110194, + "grad_norm": 0.19573906064033508, + "learning_rate": 1.3254432925552252e-05, + "loss": 0.4813, + "step": 5367 + }, + { + "epoch": 1.1974124470220835, + "grad_norm": 0.16083821654319763, + "learning_rate": 1.3252207523251854e-05, + "loss": 0.4509, + "step": 5368 + }, + { + "epoch": 1.1976355119339728, + "grad_norm": 0.1647178679704666, + "learning_rate": 1.3249981940823742e-05, + "loss": 0.4506, + "step": 5369 + }, + { + "epoch": 1.197858576845862, + "grad_norm": 0.16676411032676697, + "learning_rate": 1.3247756178391192e-05, + "loss": 0.4696, + "step": 5370 + }, + { + "epoch": 1.1980816417577516, + "grad_norm": 0.16701491177082062, + "learning_rate": 1.3245530236077474e-05, + "loss": 0.4642, + "step": 5371 + }, + { + "epoch": 1.1983047066696408, + "grad_norm": 0.16053996980190277, + "learning_rate": 1.3243304114005878e-05, + "loss": 0.449, + "step": 5372 + }, + { + "epoch": 1.1985277715815301, + "grad_norm": 0.1547413319349289, + "learning_rate": 1.3241077812299694e-05, + "loss": 0.4496, + "step": 5373 + }, + { + "epoch": 1.1987508364934196, + "grad_norm": 0.16424560546875, + "learning_rate": 1.3238851331082237e-05, + "loss": 0.4802, + "step": 5374 + }, + { + "epoch": 1.198973901405309, + "grad_norm": 0.1648808866739273, + "learning_rate": 1.3236624670476819e-05, + "loss": 0.4574, + "step": 5375 + }, + { + "epoch": 1.1991969663171984, + "grad_norm": 0.1601562201976776, + "learning_rate": 1.323439783060677e-05, + "loss": 0.4553, + "step": 5376 + }, + { + "epoch": 1.1994200312290877, + "grad_norm": 0.16455237567424774, + "learning_rate": 1.323217081159542e-05, + "loss": 0.4666, + "step": 5377 + }, + { + "epoch": 1.199643096140977, + "grad_norm": 0.16123172640800476, + "learning_rate": 1.3229943613566118e-05, + "loss": 0.4797, + "step": 5378 + }, + { + "epoch": 1.1998661610528665, + "grad_norm": 0.18106834590435028, + "learning_rate": 1.3227716236642226e-05, + "loss": 0.476, + "step": 5379 + }, + { + "epoch": 1.2000892259647558, + "grad_norm": 0.1630832701921463, + "learning_rate": 1.3225488680947103e-05, + "loss": 0.4995, + "step": 5380 + }, + { + "epoch": 1.200312290876645, + "grad_norm": 0.16625766456127167, + "learning_rate": 1.322326094660413e-05, + "loss": 0.4662, + "step": 5381 + }, + { + "epoch": 1.2005353557885345, + "grad_norm": 0.16396591067314148, + "learning_rate": 1.3221033033736688e-05, + "loss": 0.4631, + "step": 5382 + }, + { + "epoch": 1.2007584207004238, + "grad_norm": 0.161235049366951, + "learning_rate": 1.321880494246818e-05, + "loss": 0.4481, + "step": 5383 + }, + { + "epoch": 1.200981485612313, + "grad_norm": 0.16487468779087067, + "learning_rate": 1.3216576672922002e-05, + "loss": 0.463, + "step": 5384 + }, + { + "epoch": 1.2012045505242026, + "grad_norm": 0.16097383201122284, + "learning_rate": 1.3214348225221578e-05, + "loss": 0.4517, + "step": 5385 + }, + { + "epoch": 1.201427615436092, + "grad_norm": 0.1607978343963623, + "learning_rate": 1.3212119599490327e-05, + "loss": 0.4775, + "step": 5386 + }, + { + "epoch": 1.2016506803479812, + "grad_norm": 0.1695454865694046, + "learning_rate": 1.3209890795851693e-05, + "loss": 0.4667, + "step": 5387 + }, + { + "epoch": 1.2018737452598707, + "grad_norm": 0.16572801768779755, + "learning_rate": 1.3207661814429112e-05, + "loss": 0.4623, + "step": 5388 + }, + { + "epoch": 1.20209681017176, + "grad_norm": 0.1595952957868576, + "learning_rate": 1.3205432655346044e-05, + "loss": 0.4801, + "step": 5389 + }, + { + "epoch": 1.2023198750836492, + "grad_norm": 0.16465139389038086, + "learning_rate": 1.3203203318725951e-05, + "loss": 0.4801, + "step": 5390 + }, + { + "epoch": 1.2025429399955387, + "grad_norm": 0.1634162813425064, + "learning_rate": 1.320097380469231e-05, + "loss": 0.4807, + "step": 5391 + }, + { + "epoch": 1.202766004907428, + "grad_norm": 0.16548092663288116, + "learning_rate": 1.3198744113368604e-05, + "loss": 0.4569, + "step": 5392 + }, + { + "epoch": 1.2029890698193175, + "grad_norm": 0.16287754476070404, + "learning_rate": 1.319651424487833e-05, + "loss": 0.4539, + "step": 5393 + }, + { + "epoch": 1.2032121347312068, + "grad_norm": 0.16865308582782745, + "learning_rate": 1.3194284199344987e-05, + "loss": 0.4555, + "step": 5394 + }, + { + "epoch": 1.203435199643096, + "grad_norm": 0.16417443752288818, + "learning_rate": 1.3192053976892097e-05, + "loss": 0.4449, + "step": 5395 + }, + { + "epoch": 1.2036582645549856, + "grad_norm": 0.17047269642353058, + "learning_rate": 1.3189823577643175e-05, + "loss": 0.4669, + "step": 5396 + }, + { + "epoch": 1.2038813294668749, + "grad_norm": 0.15792153775691986, + "learning_rate": 1.3187593001721762e-05, + "loss": 0.4303, + "step": 5397 + }, + { + "epoch": 1.2041043943787642, + "grad_norm": 0.16310684382915497, + "learning_rate": 1.3185362249251395e-05, + "loss": 0.4373, + "step": 5398 + }, + { + "epoch": 1.2043274592906537, + "grad_norm": 0.15764662623405457, + "learning_rate": 1.3183131320355629e-05, + "loss": 0.4409, + "step": 5399 + }, + { + "epoch": 1.204550524202543, + "grad_norm": 0.17336583137512207, + "learning_rate": 1.3180900215158028e-05, + "loss": 0.4655, + "step": 5400 + }, + { + "epoch": 1.2047735891144322, + "grad_norm": 0.1825001984834671, + "learning_rate": 1.3178668933782166e-05, + "loss": 0.4676, + "step": 5401 + }, + { + "epoch": 1.2049966540263217, + "grad_norm": 0.16924050450325012, + "learning_rate": 1.3176437476351625e-05, + "loss": 0.4762, + "step": 5402 + }, + { + "epoch": 1.205219718938211, + "grad_norm": 0.163119837641716, + "learning_rate": 1.3174205842989993e-05, + "loss": 0.4078, + "step": 5403 + }, + { + "epoch": 1.2054427838501003, + "grad_norm": 0.16703581809997559, + "learning_rate": 1.3171974033820878e-05, + "loss": 0.4769, + "step": 5404 + }, + { + "epoch": 1.2056658487619898, + "grad_norm": 0.1679922491312027, + "learning_rate": 1.3169742048967886e-05, + "loss": 0.4776, + "step": 5405 + }, + { + "epoch": 1.205888913673879, + "grad_norm": 0.16275057196617126, + "learning_rate": 1.3167509888554641e-05, + "loss": 0.4646, + "step": 5406 + }, + { + "epoch": 1.2061119785857684, + "grad_norm": 0.16767756640911102, + "learning_rate": 1.3165277552704774e-05, + "loss": 0.4401, + "step": 5407 + }, + { + "epoch": 1.2063350434976579, + "grad_norm": 0.16466595232486725, + "learning_rate": 1.3163045041541929e-05, + "loss": 0.4626, + "step": 5408 + }, + { + "epoch": 1.2065581084095471, + "grad_norm": 0.17687217891216278, + "learning_rate": 1.316081235518975e-05, + "loss": 0.4553, + "step": 5409 + }, + { + "epoch": 1.2067811733214366, + "grad_norm": 0.16307489573955536, + "learning_rate": 1.3158579493771901e-05, + "loss": 0.4444, + "step": 5410 + }, + { + "epoch": 1.207004238233326, + "grad_norm": 0.21108053624629974, + "learning_rate": 1.3156346457412051e-05, + "loss": 0.4845, + "step": 5411 + }, + { + "epoch": 1.2072273031452152, + "grad_norm": 0.1662529855966568, + "learning_rate": 1.3154113246233881e-05, + "loss": 0.4673, + "step": 5412 + }, + { + "epoch": 1.2074503680571047, + "grad_norm": 0.1835554838180542, + "learning_rate": 1.3151879860361078e-05, + "loss": 0.4679, + "step": 5413 + }, + { + "epoch": 1.207673432968994, + "grad_norm": 0.16841189563274384, + "learning_rate": 1.3149646299917342e-05, + "loss": 0.4932, + "step": 5414 + }, + { + "epoch": 1.2078964978808833, + "grad_norm": 0.15623950958251953, + "learning_rate": 1.3147412565026385e-05, + "loss": 0.4396, + "step": 5415 + }, + { + "epoch": 1.2081195627927728, + "grad_norm": 0.1630774587392807, + "learning_rate": 1.3145178655811921e-05, + "loss": 0.459, + "step": 5416 + }, + { + "epoch": 1.208342627704662, + "grad_norm": 0.16008590161800385, + "learning_rate": 1.3142944572397677e-05, + "loss": 0.4662, + "step": 5417 + }, + { + "epoch": 1.2085656926165513, + "grad_norm": 0.16384388506412506, + "learning_rate": 1.3140710314907392e-05, + "loss": 0.4481, + "step": 5418 + }, + { + "epoch": 1.2087887575284408, + "grad_norm": 0.1759367287158966, + "learning_rate": 1.3138475883464818e-05, + "loss": 0.4766, + "step": 5419 + }, + { + "epoch": 1.2090118224403301, + "grad_norm": 0.1597670316696167, + "learning_rate": 1.3136241278193704e-05, + "loss": 0.4521, + "step": 5420 + }, + { + "epoch": 1.2092348873522194, + "grad_norm": 0.1790318489074707, + "learning_rate": 1.3134006499217824e-05, + "loss": 0.4404, + "step": 5421 + }, + { + "epoch": 1.209457952264109, + "grad_norm": 0.16232487559318542, + "learning_rate": 1.3131771546660947e-05, + "loss": 0.4436, + "step": 5422 + }, + { + "epoch": 1.2096810171759982, + "grad_norm": 0.16369551420211792, + "learning_rate": 1.3129536420646862e-05, + "loss": 0.4754, + "step": 5423 + }, + { + "epoch": 1.2099040820878875, + "grad_norm": 0.1614258736371994, + "learning_rate": 1.3127301121299367e-05, + "loss": 0.4436, + "step": 5424 + }, + { + "epoch": 1.210127146999777, + "grad_norm": 0.163519024848938, + "learning_rate": 1.3125065648742263e-05, + "loss": 0.4443, + "step": 5425 + }, + { + "epoch": 1.2103502119116663, + "grad_norm": 0.16275057196617126, + "learning_rate": 1.3122830003099364e-05, + "loss": 0.4632, + "step": 5426 + }, + { + "epoch": 1.2105732768235558, + "grad_norm": 0.1647331267595291, + "learning_rate": 1.3120594184494499e-05, + "loss": 0.4644, + "step": 5427 + }, + { + "epoch": 1.210796341735445, + "grad_norm": 0.16043558716773987, + "learning_rate": 1.3118358193051499e-05, + "loss": 0.4691, + "step": 5428 + }, + { + "epoch": 1.2110194066473343, + "grad_norm": 0.1672472357749939, + "learning_rate": 1.3116122028894206e-05, + "loss": 0.4663, + "step": 5429 + }, + { + "epoch": 1.2112424715592238, + "grad_norm": 0.16601301729679108, + "learning_rate": 1.3113885692146473e-05, + "loss": 0.4594, + "step": 5430 + }, + { + "epoch": 1.211465536471113, + "grad_norm": 0.16756746172904968, + "learning_rate": 1.3111649182932163e-05, + "loss": 0.4555, + "step": 5431 + }, + { + "epoch": 1.2116886013830024, + "grad_norm": 0.1739458590745926, + "learning_rate": 1.3109412501375149e-05, + "loss": 0.4703, + "step": 5432 + }, + { + "epoch": 1.2119116662948919, + "grad_norm": 0.16856876015663147, + "learning_rate": 1.3107175647599311e-05, + "loss": 0.448, + "step": 5433 + }, + { + "epoch": 1.2121347312067812, + "grad_norm": 0.16722862422466278, + "learning_rate": 1.3104938621728542e-05, + "loss": 0.4598, + "step": 5434 + }, + { + "epoch": 1.2123577961186704, + "grad_norm": 0.16905224323272705, + "learning_rate": 1.310270142388674e-05, + "loss": 0.4782, + "step": 5435 + }, + { + "epoch": 1.21258086103056, + "grad_norm": 0.16216522455215454, + "learning_rate": 1.3100464054197819e-05, + "loss": 0.4755, + "step": 5436 + }, + { + "epoch": 1.2128039259424492, + "grad_norm": 0.17112210392951965, + "learning_rate": 1.3098226512785695e-05, + "loss": 0.4774, + "step": 5437 + }, + { + "epoch": 1.2130269908543385, + "grad_norm": 0.16165569424629211, + "learning_rate": 1.3095988799774296e-05, + "loss": 0.4723, + "step": 5438 + }, + { + "epoch": 1.213250055766228, + "grad_norm": 0.16415071487426758, + "learning_rate": 1.3093750915287565e-05, + "loss": 0.4649, + "step": 5439 + }, + { + "epoch": 1.2134731206781173, + "grad_norm": 0.23170824348926544, + "learning_rate": 1.3091512859449447e-05, + "loss": 0.4754, + "step": 5440 + }, + { + "epoch": 1.2136961855900066, + "grad_norm": 0.15743504464626312, + "learning_rate": 1.30892746323839e-05, + "loss": 0.4367, + "step": 5441 + }, + { + "epoch": 1.213919250501896, + "grad_norm": 0.1775827705860138, + "learning_rate": 1.3087036234214892e-05, + "loss": 0.468, + "step": 5442 + }, + { + "epoch": 1.2141423154137854, + "grad_norm": 0.16368676722049713, + "learning_rate": 1.3084797665066398e-05, + "loss": 0.4635, + "step": 5443 + }, + { + "epoch": 1.2143653803256749, + "grad_norm": 0.15797144174575806, + "learning_rate": 1.3082558925062406e-05, + "loss": 0.4378, + "step": 5444 + }, + { + "epoch": 1.2145884452375642, + "grad_norm": 0.16582882404327393, + "learning_rate": 1.308032001432691e-05, + "loss": 0.4508, + "step": 5445 + }, + { + "epoch": 1.2148115101494534, + "grad_norm": 0.16783961653709412, + "learning_rate": 1.307808093298392e-05, + "loss": 0.478, + "step": 5446 + }, + { + "epoch": 1.215034575061343, + "grad_norm": 0.29538553953170776, + "learning_rate": 1.307584168115744e-05, + "loss": 0.4617, + "step": 5447 + }, + { + "epoch": 1.2152576399732322, + "grad_norm": 0.16198082268238068, + "learning_rate": 1.3073602258971503e-05, + "loss": 0.4605, + "step": 5448 + }, + { + "epoch": 1.2154807048851215, + "grad_norm": 0.16145728528499603, + "learning_rate": 1.3071362666550136e-05, + "loss": 0.4702, + "step": 5449 + }, + { + "epoch": 1.215703769797011, + "grad_norm": 0.16520316898822784, + "learning_rate": 1.3069122904017389e-05, + "loss": 0.4913, + "step": 5450 + }, + { + "epoch": 1.2159268347089003, + "grad_norm": 0.1647755205631256, + "learning_rate": 1.3066882971497308e-05, + "loss": 0.4538, + "step": 5451 + }, + { + "epoch": 1.2161498996207896, + "grad_norm": 0.16983206570148468, + "learning_rate": 1.3064642869113955e-05, + "loss": 0.4531, + "step": 5452 + }, + { + "epoch": 1.216372964532679, + "grad_norm": 0.16914002597332, + "learning_rate": 1.3062402596991407e-05, + "loss": 0.4643, + "step": 5453 + }, + { + "epoch": 1.2165960294445683, + "grad_norm": 0.16433201730251312, + "learning_rate": 1.3060162155253738e-05, + "loss": 0.4398, + "step": 5454 + }, + { + "epoch": 1.2168190943564579, + "grad_norm": 0.1646953821182251, + "learning_rate": 1.305792154402504e-05, + "loss": 0.4704, + "step": 5455 + }, + { + "epoch": 1.2170421592683471, + "grad_norm": 0.1791263222694397, + "learning_rate": 1.3055680763429411e-05, + "loss": 0.4338, + "step": 5456 + }, + { + "epoch": 1.2172652241802364, + "grad_norm": 0.16176985204219818, + "learning_rate": 1.305343981359096e-05, + "loss": 0.4534, + "step": 5457 + }, + { + "epoch": 1.2174882890921257, + "grad_norm": 0.16253705322742462, + "learning_rate": 1.3051198694633804e-05, + "loss": 0.4728, + "step": 5458 + }, + { + "epoch": 1.2177113540040152, + "grad_norm": 0.1789017915725708, + "learning_rate": 1.3048957406682074e-05, + "loss": 0.4333, + "step": 5459 + }, + { + "epoch": 1.2179344189159045, + "grad_norm": 0.16050660610198975, + "learning_rate": 1.3046715949859902e-05, + "loss": 0.4614, + "step": 5460 + }, + { + "epoch": 1.218157483827794, + "grad_norm": 0.15737925469875336, + "learning_rate": 1.3044474324291438e-05, + "loss": 0.4558, + "step": 5461 + }, + { + "epoch": 1.2183805487396833, + "grad_norm": 0.16488178074359894, + "learning_rate": 1.3042232530100833e-05, + "loss": 0.4804, + "step": 5462 + }, + { + "epoch": 1.2186036136515725, + "grad_norm": 0.16783639788627625, + "learning_rate": 1.3039990567412255e-05, + "loss": 0.4564, + "step": 5463 + }, + { + "epoch": 1.218826678563462, + "grad_norm": 0.16743387281894684, + "learning_rate": 1.303774843634988e-05, + "loss": 0.4568, + "step": 5464 + }, + { + "epoch": 1.2190497434753513, + "grad_norm": 0.16670465469360352, + "learning_rate": 1.3035506137037883e-05, + "loss": 0.4416, + "step": 5465 + }, + { + "epoch": 1.2192728083872406, + "grad_norm": 0.17037998139858246, + "learning_rate": 1.3033263669600466e-05, + "loss": 0.4753, + "step": 5466 + }, + { + "epoch": 1.2194958732991301, + "grad_norm": 0.17123369872570038, + "learning_rate": 1.3031021034161827e-05, + "loss": 0.4718, + "step": 5467 + }, + { + "epoch": 1.2197189382110194, + "grad_norm": 0.17543406784534454, + "learning_rate": 1.3028778230846178e-05, + "loss": 0.4564, + "step": 5468 + }, + { + "epoch": 1.2199420031229087, + "grad_norm": 0.17404866218566895, + "learning_rate": 1.3026535259777734e-05, + "loss": 0.4716, + "step": 5469 + }, + { + "epoch": 1.2201650680347982, + "grad_norm": 0.1693383902311325, + "learning_rate": 1.3024292121080735e-05, + "loss": 0.4759, + "step": 5470 + }, + { + "epoch": 1.2203881329466875, + "grad_norm": 0.18779967725276947, + "learning_rate": 1.3022048814879412e-05, + "loss": 0.4784, + "step": 5471 + }, + { + "epoch": 1.220611197858577, + "grad_norm": 0.16473285853862762, + "learning_rate": 1.3019805341298017e-05, + "loss": 0.4334, + "step": 5472 + }, + { + "epoch": 1.2208342627704662, + "grad_norm": 0.1686456799507141, + "learning_rate": 1.3017561700460806e-05, + "loss": 0.4744, + "step": 5473 + }, + { + "epoch": 1.2210573276823555, + "grad_norm": 0.16365700960159302, + "learning_rate": 1.301531789249205e-05, + "loss": 0.4711, + "step": 5474 + }, + { + "epoch": 1.2212803925942448, + "grad_norm": 0.16734614968299866, + "learning_rate": 1.3013073917516018e-05, + "loss": 0.4512, + "step": 5475 + }, + { + "epoch": 1.2215034575061343, + "grad_norm": 0.1621614247560501, + "learning_rate": 1.3010829775657001e-05, + "loss": 0.4579, + "step": 5476 + }, + { + "epoch": 1.2217265224180236, + "grad_norm": 0.16041643917560577, + "learning_rate": 1.3008585467039291e-05, + "loss": 0.4723, + "step": 5477 + }, + { + "epoch": 1.221949587329913, + "grad_norm": 0.17315927147865295, + "learning_rate": 1.3006340991787196e-05, + "loss": 0.4888, + "step": 5478 + }, + { + "epoch": 1.2221726522418024, + "grad_norm": 0.16364838182926178, + "learning_rate": 1.3004096350025024e-05, + "loss": 0.4456, + "step": 5479 + }, + { + "epoch": 1.2223957171536917, + "grad_norm": 0.17080283164978027, + "learning_rate": 1.30018515418771e-05, + "loss": 0.4545, + "step": 5480 + }, + { + "epoch": 1.2226187820655812, + "grad_norm": 0.1685846894979477, + "learning_rate": 1.2999606567467753e-05, + "loss": 0.4709, + "step": 5481 + }, + { + "epoch": 1.2228418469774704, + "grad_norm": 0.16784535348415375, + "learning_rate": 1.2997361426921331e-05, + "loss": 0.4753, + "step": 5482 + }, + { + "epoch": 1.2230649118893597, + "grad_norm": 0.17074494063854218, + "learning_rate": 1.2995116120362175e-05, + "loss": 0.4519, + "step": 5483 + }, + { + "epoch": 1.2232879768012492, + "grad_norm": 0.15669210255146027, + "learning_rate": 1.2992870647914648e-05, + "loss": 0.4205, + "step": 5484 + }, + { + "epoch": 1.2235110417131385, + "grad_norm": 0.16939307749271393, + "learning_rate": 1.2990625009703122e-05, + "loss": 0.4685, + "step": 5485 + }, + { + "epoch": 1.2237341066250278, + "grad_norm": 0.1645343005657196, + "learning_rate": 1.298837920585197e-05, + "loss": 0.4713, + "step": 5486 + }, + { + "epoch": 1.2239571715369173, + "grad_norm": 0.16123943030834198, + "learning_rate": 1.298613323648558e-05, + "loss": 0.4592, + "step": 5487 + }, + { + "epoch": 1.2241802364488066, + "grad_norm": 0.16125601530075073, + "learning_rate": 1.2983887101728342e-05, + "loss": 0.4522, + "step": 5488 + }, + { + "epoch": 1.224403301360696, + "grad_norm": 0.16092932224273682, + "learning_rate": 1.2981640801704676e-05, + "loss": 0.4469, + "step": 5489 + }, + { + "epoch": 1.2246263662725854, + "grad_norm": 0.16154557466506958, + "learning_rate": 1.2979394336538982e-05, + "loss": 0.5035, + "step": 5490 + }, + { + "epoch": 1.2248494311844746, + "grad_norm": 0.16762235760688782, + "learning_rate": 1.2977147706355688e-05, + "loss": 0.4505, + "step": 5491 + }, + { + "epoch": 1.225072496096364, + "grad_norm": 0.16036982834339142, + "learning_rate": 1.297490091127923e-05, + "loss": 0.4635, + "step": 5492 + }, + { + "epoch": 1.2252955610082534, + "grad_norm": 0.16005462408065796, + "learning_rate": 1.2972653951434046e-05, + "loss": 0.4745, + "step": 5493 + }, + { + "epoch": 1.2255186259201427, + "grad_norm": 0.16705186665058136, + "learning_rate": 1.2970406826944589e-05, + "loss": 0.4684, + "step": 5494 + }, + { + "epoch": 1.2257416908320322, + "grad_norm": 0.16716639697551727, + "learning_rate": 1.2968159537935312e-05, + "loss": 0.4612, + "step": 5495 + }, + { + "epoch": 1.2259647557439215, + "grad_norm": 0.16546767950057983, + "learning_rate": 1.2965912084530693e-05, + "loss": 0.4797, + "step": 5496 + }, + { + "epoch": 1.2261878206558108, + "grad_norm": 0.16606946289539337, + "learning_rate": 1.2963664466855207e-05, + "loss": 0.4695, + "step": 5497 + }, + { + "epoch": 1.2264108855677003, + "grad_norm": 0.16966325044631958, + "learning_rate": 1.2961416685033339e-05, + "loss": 0.4498, + "step": 5498 + }, + { + "epoch": 1.2266339504795896, + "grad_norm": 0.17357441782951355, + "learning_rate": 1.2959168739189587e-05, + "loss": 0.4576, + "step": 5499 + }, + { + "epoch": 1.2268570153914788, + "grad_norm": 0.16445501148700714, + "learning_rate": 1.2956920629448458e-05, + "loss": 0.4371, + "step": 5500 + }, + { + "epoch": 1.2270800803033683, + "grad_norm": 0.16668792068958282, + "learning_rate": 1.295467235593446e-05, + "loss": 0.4698, + "step": 5501 + }, + { + "epoch": 1.2273031452152576, + "grad_norm": 0.15939384698867798, + "learning_rate": 1.2952423918772128e-05, + "loss": 0.4261, + "step": 5502 + }, + { + "epoch": 1.227526210127147, + "grad_norm": 0.1705656349658966, + "learning_rate": 1.2950175318085983e-05, + "loss": 0.441, + "step": 5503 + }, + { + "epoch": 1.2277492750390364, + "grad_norm": 0.16186708211898804, + "learning_rate": 1.2947926554000574e-05, + "loss": 0.4795, + "step": 5504 + }, + { + "epoch": 1.2279723399509257, + "grad_norm": 0.16865457594394684, + "learning_rate": 1.2945677626640447e-05, + "loss": 0.4653, + "step": 5505 + }, + { + "epoch": 1.2281954048628152, + "grad_norm": 0.16898195445537567, + "learning_rate": 1.2943428536130167e-05, + "loss": 0.4422, + "step": 5506 + }, + { + "epoch": 1.2284184697747045, + "grad_norm": 0.15895049273967743, + "learning_rate": 1.2941179282594298e-05, + "loss": 0.4435, + "step": 5507 + }, + { + "epoch": 1.2286415346865938, + "grad_norm": 0.17526674270629883, + "learning_rate": 1.293892986615742e-05, + "loss": 0.4486, + "step": 5508 + }, + { + "epoch": 1.228864599598483, + "grad_norm": 0.16472922265529633, + "learning_rate": 1.2936680286944118e-05, + "loss": 0.4414, + "step": 5509 + }, + { + "epoch": 1.2290876645103725, + "grad_norm": 0.16342167556285858, + "learning_rate": 1.2934430545078991e-05, + "loss": 0.4708, + "step": 5510 + }, + { + "epoch": 1.2293107294222618, + "grad_norm": 0.16461318731307983, + "learning_rate": 1.2932180640686643e-05, + "loss": 0.4545, + "step": 5511 + }, + { + "epoch": 1.2295337943341513, + "grad_norm": 0.16927462816238403, + "learning_rate": 1.2929930573891685e-05, + "loss": 0.4573, + "step": 5512 + }, + { + "epoch": 1.2297568592460406, + "grad_norm": 0.1817016750574112, + "learning_rate": 1.2927680344818741e-05, + "loss": 0.442, + "step": 5513 + }, + { + "epoch": 1.22997992415793, + "grad_norm": 0.16440905630588531, + "learning_rate": 1.2925429953592446e-05, + "loss": 0.4534, + "step": 5514 + }, + { + "epoch": 1.2302029890698194, + "grad_norm": 0.16801615059375763, + "learning_rate": 1.2923179400337435e-05, + "loss": 0.447, + "step": 5515 + }, + { + "epoch": 1.2304260539817087, + "grad_norm": 0.16331814229488373, + "learning_rate": 1.2920928685178365e-05, + "loss": 0.4432, + "step": 5516 + }, + { + "epoch": 1.230649118893598, + "grad_norm": 0.17522062361240387, + "learning_rate": 1.291867780823989e-05, + "loss": 0.497, + "step": 5517 + }, + { + "epoch": 1.2308721838054875, + "grad_norm": 0.1681208312511444, + "learning_rate": 1.2916426769646677e-05, + "loss": 0.4688, + "step": 5518 + }, + { + "epoch": 1.2310952487173767, + "grad_norm": 0.16323919594287872, + "learning_rate": 1.2914175569523408e-05, + "loss": 0.4562, + "step": 5519 + }, + { + "epoch": 1.231318313629266, + "grad_norm": 0.17886684834957123, + "learning_rate": 1.291192420799476e-05, + "loss": 0.4333, + "step": 5520 + }, + { + "epoch": 1.2315413785411555, + "grad_norm": 0.17314215004444122, + "learning_rate": 1.2909672685185437e-05, + "loss": 0.4517, + "step": 5521 + }, + { + "epoch": 1.2317644434530448, + "grad_norm": 0.16686271131038666, + "learning_rate": 1.2907421001220138e-05, + "loss": 0.4842, + "step": 5522 + }, + { + "epoch": 1.2319875083649343, + "grad_norm": 0.16551433503627777, + "learning_rate": 1.2905169156223574e-05, + "loss": 0.4726, + "step": 5523 + }, + { + "epoch": 1.2322105732768236, + "grad_norm": 0.17935457825660706, + "learning_rate": 1.2902917150320468e-05, + "loss": 0.4438, + "step": 5524 + }, + { + "epoch": 1.2324336381887129, + "grad_norm": 0.1675693243741989, + "learning_rate": 1.2900664983635551e-05, + "loss": 0.4471, + "step": 5525 + }, + { + "epoch": 1.2326567031006022, + "grad_norm": 0.1640770435333252, + "learning_rate": 1.289841265629356e-05, + "loss": 0.4245, + "step": 5526 + }, + { + "epoch": 1.2328797680124917, + "grad_norm": 0.17131870985031128, + "learning_rate": 1.2896160168419245e-05, + "loss": 0.4767, + "step": 5527 + }, + { + "epoch": 1.233102832924381, + "grad_norm": 0.1778288334608078, + "learning_rate": 1.2893907520137364e-05, + "loss": 0.4754, + "step": 5528 + }, + { + "epoch": 1.2333258978362704, + "grad_norm": 0.16755534708499908, + "learning_rate": 1.2891654711572678e-05, + "loss": 0.4649, + "step": 5529 + }, + { + "epoch": 1.2335489627481597, + "grad_norm": 0.1766786277294159, + "learning_rate": 1.2889401742849965e-05, + "loss": 0.5005, + "step": 5530 + }, + { + "epoch": 1.233772027660049, + "grad_norm": 0.16578464210033417, + "learning_rate": 1.2887148614094012e-05, + "loss": 0.4371, + "step": 5531 + }, + { + "epoch": 1.2339950925719385, + "grad_norm": 0.17723850905895233, + "learning_rate": 1.2884895325429605e-05, + "loss": 0.4787, + "step": 5532 + }, + { + "epoch": 1.2342181574838278, + "grad_norm": 0.171020045876503, + "learning_rate": 1.288264187698155e-05, + "loss": 0.4576, + "step": 5533 + }, + { + "epoch": 1.234441222395717, + "grad_norm": 0.16916728019714355, + "learning_rate": 1.2880388268874653e-05, + "loss": 0.4739, + "step": 5534 + }, + { + "epoch": 1.2346642873076066, + "grad_norm": 0.16481377184391022, + "learning_rate": 1.2878134501233736e-05, + "loss": 0.4456, + "step": 5535 + }, + { + "epoch": 1.2348873522194959, + "grad_norm": 0.3519321382045746, + "learning_rate": 1.2875880574183628e-05, + "loss": 0.4754, + "step": 5536 + }, + { + "epoch": 1.2351104171313851, + "grad_norm": 0.23968984186649323, + "learning_rate": 1.2873626487849162e-05, + "loss": 0.4661, + "step": 5537 + }, + { + "epoch": 1.2353334820432746, + "grad_norm": 0.16674065589904785, + "learning_rate": 1.2871372242355186e-05, + "loss": 0.4611, + "step": 5538 + }, + { + "epoch": 1.235556546955164, + "grad_norm": 0.17423000931739807, + "learning_rate": 1.2869117837826553e-05, + "loss": 0.4456, + "step": 5539 + }, + { + "epoch": 1.2357796118670534, + "grad_norm": 0.16467906534671783, + "learning_rate": 1.2866863274388128e-05, + "loss": 0.459, + "step": 5540 + }, + { + "epoch": 1.2360026767789427, + "grad_norm": 0.1630530208349228, + "learning_rate": 1.2864608552164779e-05, + "loss": 0.4274, + "step": 5541 + }, + { + "epoch": 1.236225741690832, + "grad_norm": 0.1718243807554245, + "learning_rate": 1.2862353671281392e-05, + "loss": 0.4572, + "step": 5542 + }, + { + "epoch": 1.2364488066027213, + "grad_norm": 0.1649494469165802, + "learning_rate": 1.2860098631862852e-05, + "loss": 0.4365, + "step": 5543 + }, + { + "epoch": 1.2366718715146108, + "grad_norm": 0.16701866686344147, + "learning_rate": 1.2857843434034058e-05, + "loss": 0.4559, + "step": 5544 + }, + { + "epoch": 1.2368949364265, + "grad_norm": 0.16802479326725006, + "learning_rate": 1.2855588077919921e-05, + "loss": 0.4636, + "step": 5545 + }, + { + "epoch": 1.2371180013383896, + "grad_norm": 0.17907430231571198, + "learning_rate": 1.2853332563645353e-05, + "loss": 0.4747, + "step": 5546 + }, + { + "epoch": 1.2373410662502788, + "grad_norm": 0.16412149369716644, + "learning_rate": 1.2851076891335277e-05, + "loss": 0.4492, + "step": 5547 + }, + { + "epoch": 1.2375641311621681, + "grad_norm": 0.16632318496704102, + "learning_rate": 1.2848821061114629e-05, + "loss": 0.4628, + "step": 5548 + }, + { + "epoch": 1.2377871960740576, + "grad_norm": 0.1785164624452591, + "learning_rate": 1.2846565073108355e-05, + "loss": 0.4821, + "step": 5549 + }, + { + "epoch": 1.238010260985947, + "grad_norm": 0.1653890609741211, + "learning_rate": 1.2844308927441397e-05, + "loss": 0.435, + "step": 5550 + }, + { + "epoch": 1.2382333258978362, + "grad_norm": 0.18326538801193237, + "learning_rate": 1.2842052624238724e-05, + "loss": 0.4691, + "step": 5551 + }, + { + "epoch": 1.2384563908097257, + "grad_norm": 0.16218526661396027, + "learning_rate": 1.2839796163625296e-05, + "loss": 0.441, + "step": 5552 + }, + { + "epoch": 1.238679455721615, + "grad_norm": 0.17150120437145233, + "learning_rate": 1.2837539545726095e-05, + "loss": 0.4533, + "step": 5553 + }, + { + "epoch": 1.2389025206335043, + "grad_norm": 0.17477098107337952, + "learning_rate": 1.2835282770666101e-05, + "loss": 0.4898, + "step": 5554 + }, + { + "epoch": 1.2391255855453938, + "grad_norm": 0.16403955221176147, + "learning_rate": 1.2833025838570318e-05, + "loss": 0.4591, + "step": 5555 + }, + { + "epoch": 1.239348650457283, + "grad_norm": 0.17669036984443665, + "learning_rate": 1.283076874956374e-05, + "loss": 0.4727, + "step": 5556 + }, + { + "epoch": 1.2395717153691725, + "grad_norm": 0.17886574566364288, + "learning_rate": 1.2828511503771386e-05, + "loss": 0.4824, + "step": 5557 + }, + { + "epoch": 1.2397947802810618, + "grad_norm": 0.1615883857011795, + "learning_rate": 1.282625410131827e-05, + "loss": 0.4573, + "step": 5558 + }, + { + "epoch": 1.240017845192951, + "grad_norm": 0.16335685551166534, + "learning_rate": 1.2823996542329426e-05, + "loss": 0.4624, + "step": 5559 + }, + { + "epoch": 1.2402409101048404, + "grad_norm": 0.17322902381420135, + "learning_rate": 1.2821738826929888e-05, + "loss": 0.4607, + "step": 5560 + }, + { + "epoch": 1.24046397501673, + "grad_norm": 0.17016318440437317, + "learning_rate": 1.2819480955244705e-05, + "loss": 0.4522, + "step": 5561 + }, + { + "epoch": 1.2406870399286192, + "grad_norm": 0.16096848249435425, + "learning_rate": 1.2817222927398932e-05, + "loss": 0.4475, + "step": 5562 + }, + { + "epoch": 1.2409101048405087, + "grad_norm": 0.166842982172966, + "learning_rate": 1.281496474351763e-05, + "loss": 0.4731, + "step": 5563 + }, + { + "epoch": 1.241133169752398, + "grad_norm": 0.17023105919361115, + "learning_rate": 1.2812706403725876e-05, + "loss": 0.4626, + "step": 5564 + }, + { + "epoch": 1.2413562346642872, + "grad_norm": 0.1609751284122467, + "learning_rate": 1.2810447908148748e-05, + "loss": 0.4751, + "step": 5565 + }, + { + "epoch": 1.2415792995761767, + "grad_norm": 0.22108225524425507, + "learning_rate": 1.2808189256911336e-05, + "loss": 0.4497, + "step": 5566 + }, + { + "epoch": 1.241802364488066, + "grad_norm": 0.1703910231590271, + "learning_rate": 1.2805930450138742e-05, + "loss": 0.4731, + "step": 5567 + }, + { + "epoch": 1.2420254293999553, + "grad_norm": 0.16368292272090912, + "learning_rate": 1.2803671487956063e-05, + "loss": 0.459, + "step": 5568 + }, + { + "epoch": 1.2422484943118448, + "grad_norm": 0.1646466702222824, + "learning_rate": 1.2801412370488427e-05, + "loss": 0.4756, + "step": 5569 + }, + { + "epoch": 1.242471559223734, + "grad_norm": 0.16485071182250977, + "learning_rate": 1.279915309786095e-05, + "loss": 0.479, + "step": 5570 + }, + { + "epoch": 1.2426946241356234, + "grad_norm": 0.17181314527988434, + "learning_rate": 1.2796893670198767e-05, + "loss": 0.4802, + "step": 5571 + }, + { + "epoch": 1.2429176890475129, + "grad_norm": 0.17581303417682648, + "learning_rate": 1.279463408762702e-05, + "loss": 0.4747, + "step": 5572 + }, + { + "epoch": 1.2431407539594022, + "grad_norm": 0.16636666655540466, + "learning_rate": 1.2792374350270858e-05, + "loss": 0.443, + "step": 5573 + }, + { + "epoch": 1.2433638188712917, + "grad_norm": 0.1754438728094101, + "learning_rate": 1.2790114458255441e-05, + "loss": 0.4555, + "step": 5574 + }, + { + "epoch": 1.243586883783181, + "grad_norm": 0.16133657097816467, + "learning_rate": 1.2787854411705935e-05, + "loss": 0.4426, + "step": 5575 + }, + { + "epoch": 1.2438099486950702, + "grad_norm": 0.15894077718257904, + "learning_rate": 1.2785594210747513e-05, + "loss": 0.4511, + "step": 5576 + }, + { + "epoch": 1.2440330136069595, + "grad_norm": 0.1614711582660675, + "learning_rate": 1.2783333855505364e-05, + "loss": 0.4712, + "step": 5577 + }, + { + "epoch": 1.244256078518849, + "grad_norm": 0.16365355253219604, + "learning_rate": 1.2781073346104677e-05, + "loss": 0.4399, + "step": 5578 + }, + { + "epoch": 1.2444791434307383, + "grad_norm": 0.16568869352340698, + "learning_rate": 1.2778812682670654e-05, + "loss": 0.4695, + "step": 5579 + }, + { + "epoch": 1.2447022083426278, + "grad_norm": 0.16856862604618073, + "learning_rate": 1.2776551865328503e-05, + "loss": 0.4528, + "step": 5580 + }, + { + "epoch": 1.244925273254517, + "grad_norm": 0.17370723187923431, + "learning_rate": 1.277429089420345e-05, + "loss": 0.4822, + "step": 5581 + }, + { + "epoch": 1.2451483381664064, + "grad_norm": 0.16506825387477875, + "learning_rate": 1.277202976942071e-05, + "loss": 0.4354, + "step": 5582 + }, + { + "epoch": 1.2453714030782959, + "grad_norm": 0.16328351199626923, + "learning_rate": 1.276976849110553e-05, + "loss": 0.4509, + "step": 5583 + }, + { + "epoch": 1.2455944679901851, + "grad_norm": 0.16474324464797974, + "learning_rate": 1.2767507059383144e-05, + "loss": 0.4789, + "step": 5584 + }, + { + "epoch": 1.2458175329020744, + "grad_norm": 0.16100145876407623, + "learning_rate": 1.2765245474378814e-05, + "loss": 0.442, + "step": 5585 + }, + { + "epoch": 1.246040597813964, + "grad_norm": 0.1651872992515564, + "learning_rate": 1.2762983736217792e-05, + "loss": 0.4363, + "step": 5586 + }, + { + "epoch": 1.2462636627258532, + "grad_norm": 0.1605336219072342, + "learning_rate": 1.2760721845025353e-05, + "loss": 0.4722, + "step": 5587 + }, + { + "epoch": 1.2464867276377425, + "grad_norm": 0.16939808428287506, + "learning_rate": 1.2758459800926768e-05, + "loss": 0.4738, + "step": 5588 + }, + { + "epoch": 1.246709792549632, + "grad_norm": 0.16064578294754028, + "learning_rate": 1.2756197604047333e-05, + "loss": 0.4487, + "step": 5589 + }, + { + "epoch": 1.2469328574615213, + "grad_norm": 0.17054161429405212, + "learning_rate": 1.2753935254512332e-05, + "loss": 0.4863, + "step": 5590 + }, + { + "epoch": 1.2471559223734108, + "grad_norm": 0.16378219425678253, + "learning_rate": 1.2751672752447079e-05, + "loss": 0.4373, + "step": 5591 + }, + { + "epoch": 1.2473789872853, + "grad_norm": 0.16274695098400116, + "learning_rate": 1.2749410097976878e-05, + "loss": 0.4425, + "step": 5592 + }, + { + "epoch": 1.2476020521971893, + "grad_norm": 0.16811704635620117, + "learning_rate": 1.2747147291227053e-05, + "loss": 0.4589, + "step": 5593 + }, + { + "epoch": 1.2478251171090786, + "grad_norm": 0.1644984930753708, + "learning_rate": 1.2744884332322926e-05, + "loss": 0.4524, + "step": 5594 + }, + { + "epoch": 1.2480481820209681, + "grad_norm": 0.17755252122879028, + "learning_rate": 1.2742621221389846e-05, + "loss": 0.4453, + "step": 5595 + }, + { + "epoch": 1.2482712469328574, + "grad_norm": 0.17471030354499817, + "learning_rate": 1.2740357958553144e-05, + "loss": 0.5054, + "step": 5596 + }, + { + "epoch": 1.248494311844747, + "grad_norm": 0.16155916452407837, + "learning_rate": 1.2738094543938187e-05, + "loss": 0.4663, + "step": 5597 + }, + { + "epoch": 1.2487173767566362, + "grad_norm": 0.16668613255023956, + "learning_rate": 1.2735830977670325e-05, + "loss": 0.4664, + "step": 5598 + }, + { + "epoch": 1.2489404416685255, + "grad_norm": 0.17190469801425934, + "learning_rate": 1.2733567259874937e-05, + "loss": 0.4645, + "step": 5599 + }, + { + "epoch": 1.249163506580415, + "grad_norm": 0.17003987729549408, + "learning_rate": 1.2731303390677399e-05, + "loss": 0.4662, + "step": 5600 + }, + { + "epoch": 1.2493865714923043, + "grad_norm": 0.175228089094162, + "learning_rate": 1.2729039370203098e-05, + "loss": 0.4609, + "step": 5601 + }, + { + "epoch": 1.2496096364041935, + "grad_norm": 0.169545978307724, + "learning_rate": 1.2726775198577432e-05, + "loss": 0.4838, + "step": 5602 + }, + { + "epoch": 1.249832701316083, + "grad_norm": 0.16817337274551392, + "learning_rate": 1.2724510875925802e-05, + "loss": 0.4786, + "step": 5603 + }, + { + "epoch": 1.2500557662279723, + "grad_norm": 0.16374395787715912, + "learning_rate": 1.2722246402373624e-05, + "loss": 0.4848, + "step": 5604 + }, + { + "epoch": 1.2502788311398616, + "grad_norm": 0.16421671211719513, + "learning_rate": 1.2719981778046313e-05, + "loss": 0.4712, + "step": 5605 + }, + { + "epoch": 1.250501896051751, + "grad_norm": 0.16157956421375275, + "learning_rate": 1.2717717003069305e-05, + "loss": 0.4392, + "step": 5606 + }, + { + "epoch": 1.2507249609636404, + "grad_norm": 0.1583690643310547, + "learning_rate": 1.271545207756803e-05, + "loss": 0.4425, + "step": 5607 + }, + { + "epoch": 1.25094802587553, + "grad_norm": 0.18596141040325165, + "learning_rate": 1.2713187001667943e-05, + "loss": 0.4751, + "step": 5608 + }, + { + "epoch": 1.2511710907874192, + "grad_norm": 0.19196903705596924, + "learning_rate": 1.2710921775494494e-05, + "loss": 0.4827, + "step": 5609 + }, + { + "epoch": 1.2513941556993085, + "grad_norm": 0.16318334639072418, + "learning_rate": 1.270865639917314e-05, + "loss": 0.4548, + "step": 5610 + }, + { + "epoch": 1.2516172206111977, + "grad_norm": 0.17583735287189484, + "learning_rate": 1.270639087282936e-05, + "loss": 0.4829, + "step": 5611 + }, + { + "epoch": 1.2518402855230872, + "grad_norm": 0.19173632562160492, + "learning_rate": 1.2704125196588628e-05, + "loss": 0.4773, + "step": 5612 + }, + { + "epoch": 1.2520633504349765, + "grad_norm": 0.15936903655529022, + "learning_rate": 1.2701859370576432e-05, + "loss": 0.428, + "step": 5613 + }, + { + "epoch": 1.252286415346866, + "grad_norm": 0.16663479804992676, + "learning_rate": 1.2699593394918273e-05, + "loss": 0.4285, + "step": 5614 + }, + { + "epoch": 1.2525094802587553, + "grad_norm": 0.16250313818454742, + "learning_rate": 1.2697327269739646e-05, + "loss": 0.471, + "step": 5615 + }, + { + "epoch": 1.2527325451706446, + "grad_norm": 0.16274987161159515, + "learning_rate": 1.2695060995166069e-05, + "loss": 0.4566, + "step": 5616 + }, + { + "epoch": 1.252955610082534, + "grad_norm": 0.17902643978595734, + "learning_rate": 1.2692794571323064e-05, + "loss": 0.4694, + "step": 5617 + }, + { + "epoch": 1.2531786749944234, + "grad_norm": 0.16219644248485565, + "learning_rate": 1.2690527998336153e-05, + "loss": 0.4996, + "step": 5618 + }, + { + "epoch": 1.2534017399063129, + "grad_norm": 0.16951128840446472, + "learning_rate": 1.2688261276330882e-05, + "loss": 0.4908, + "step": 5619 + }, + { + "epoch": 1.2536248048182022, + "grad_norm": 0.1658773273229599, + "learning_rate": 1.2685994405432788e-05, + "loss": 0.4776, + "step": 5620 + }, + { + "epoch": 1.2538478697300914, + "grad_norm": 0.15743979811668396, + "learning_rate": 1.268372738576743e-05, + "loss": 0.46, + "step": 5621 + }, + { + "epoch": 1.2540709346419807, + "grad_norm": 0.16387997567653656, + "learning_rate": 1.2681460217460365e-05, + "loss": 0.4833, + "step": 5622 + }, + { + "epoch": 1.2542939995538702, + "grad_norm": 0.16733065247535706, + "learning_rate": 1.2679192900637172e-05, + "loss": 0.4621, + "step": 5623 + }, + { + "epoch": 1.2545170644657595, + "grad_norm": 0.16208809614181519, + "learning_rate": 1.267692543542342e-05, + "loss": 0.4688, + "step": 5624 + }, + { + "epoch": 1.254740129377649, + "grad_norm": 0.16228267550468445, + "learning_rate": 1.2674657821944699e-05, + "loss": 0.4499, + "step": 5625 + }, + { + "epoch": 1.2549631942895383, + "grad_norm": 0.17010217905044556, + "learning_rate": 1.2672390060326603e-05, + "loss": 0.4411, + "step": 5626 + }, + { + "epoch": 1.2551862592014276, + "grad_norm": 0.1624649167060852, + "learning_rate": 1.2670122150694737e-05, + "loss": 0.451, + "step": 5627 + }, + { + "epoch": 1.2554093241133168, + "grad_norm": 0.16551491618156433, + "learning_rate": 1.2667854093174707e-05, + "loss": 0.4502, + "step": 5628 + }, + { + "epoch": 1.2556323890252064, + "grad_norm": 0.16004303097724915, + "learning_rate": 1.266558588789214e-05, + "loss": 0.4371, + "step": 5629 + }, + { + "epoch": 1.2558554539370956, + "grad_norm": 0.1660148799419403, + "learning_rate": 1.2663317534972656e-05, + "loss": 0.4518, + "step": 5630 + }, + { + "epoch": 1.2560785188489851, + "grad_norm": 0.16411980986595154, + "learning_rate": 1.2661049034541897e-05, + "loss": 0.4644, + "step": 5631 + }, + { + "epoch": 1.2563015837608744, + "grad_norm": 0.16674858331680298, + "learning_rate": 1.2658780386725503e-05, + "loss": 0.4384, + "step": 5632 + }, + { + "epoch": 1.2565246486727637, + "grad_norm": 0.17275111377239227, + "learning_rate": 1.2656511591649125e-05, + "loss": 0.4698, + "step": 5633 + }, + { + "epoch": 1.2567477135846532, + "grad_norm": 0.16666240990161896, + "learning_rate": 1.2654242649438426e-05, + "loss": 0.4706, + "step": 5634 + }, + { + "epoch": 1.2569707784965425, + "grad_norm": 0.1748340129852295, + "learning_rate": 1.2651973560219073e-05, + "loss": 0.4442, + "step": 5635 + }, + { + "epoch": 1.257193843408432, + "grad_norm": 0.16753262281417847, + "learning_rate": 1.2649704324116745e-05, + "loss": 0.4586, + "step": 5636 + }, + { + "epoch": 1.2574169083203213, + "grad_norm": 0.1677563339471817, + "learning_rate": 1.264743494125712e-05, + "loss": 0.4699, + "step": 5637 + }, + { + "epoch": 1.2576399732322106, + "grad_norm": 0.16369731724262238, + "learning_rate": 1.2645165411765899e-05, + "loss": 0.4658, + "step": 5638 + }, + { + "epoch": 1.2578630381440998, + "grad_norm": 0.18121913075447083, + "learning_rate": 1.2642895735768775e-05, + "loss": 0.453, + "step": 5639 + }, + { + "epoch": 1.2580861030559893, + "grad_norm": 0.16915792226791382, + "learning_rate": 1.2640625913391464e-05, + "loss": 0.4496, + "step": 5640 + }, + { + "epoch": 1.2583091679678786, + "grad_norm": 0.17231988906860352, + "learning_rate": 1.2638355944759678e-05, + "loss": 0.4594, + "step": 5641 + }, + { + "epoch": 1.2585322328797681, + "grad_norm": 0.17405405640602112, + "learning_rate": 1.2636085829999145e-05, + "loss": 0.4665, + "step": 5642 + }, + { + "epoch": 1.2587552977916574, + "grad_norm": 0.17699532210826874, + "learning_rate": 1.2633815569235594e-05, + "loss": 0.4856, + "step": 5643 + }, + { + "epoch": 1.2589783627035467, + "grad_norm": 0.16303865611553192, + "learning_rate": 1.2631545162594773e-05, + "loss": 0.4475, + "step": 5644 + }, + { + "epoch": 1.259201427615436, + "grad_norm": 0.40841978788375854, + "learning_rate": 1.2629274610202427e-05, + "loss": 0.4419, + "step": 5645 + }, + { + "epoch": 1.2594244925273255, + "grad_norm": 0.17015258967876434, + "learning_rate": 1.2627003912184315e-05, + "loss": 0.4642, + "step": 5646 + }, + { + "epoch": 1.2596475574392147, + "grad_norm": 0.16850386559963226, + "learning_rate": 1.26247330686662e-05, + "loss": 0.4751, + "step": 5647 + }, + { + "epoch": 1.2598706223511043, + "grad_norm": 0.16403812170028687, + "learning_rate": 1.2622462079773859e-05, + "loss": 0.4555, + "step": 5648 + }, + { + "epoch": 1.2600936872629935, + "grad_norm": 0.16475041210651398, + "learning_rate": 1.2620190945633069e-05, + "loss": 0.4456, + "step": 5649 + }, + { + "epoch": 1.2603167521748828, + "grad_norm": 0.1695244163274765, + "learning_rate": 1.2617919666369627e-05, + "loss": 0.4489, + "step": 5650 + }, + { + "epoch": 1.2605398170867723, + "grad_norm": 0.16356348991394043, + "learning_rate": 1.2615648242109324e-05, + "loss": 0.4545, + "step": 5651 + }, + { + "epoch": 1.2607628819986616, + "grad_norm": 0.17045846581459045, + "learning_rate": 1.2613376672977968e-05, + "loss": 0.4734, + "step": 5652 + }, + { + "epoch": 1.260985946910551, + "grad_norm": 0.15884579718112946, + "learning_rate": 1.2611104959101374e-05, + "loss": 0.4357, + "step": 5653 + }, + { + "epoch": 1.2612090118224404, + "grad_norm": 0.16921289265155792, + "learning_rate": 1.2608833100605361e-05, + "loss": 0.4714, + "step": 5654 + }, + { + "epoch": 1.2614320767343297, + "grad_norm": 0.16710084676742554, + "learning_rate": 1.2606561097615764e-05, + "loss": 0.4882, + "step": 5655 + }, + { + "epoch": 1.261655141646219, + "grad_norm": 0.16774149239063263, + "learning_rate": 1.2604288950258414e-05, + "loss": 0.4501, + "step": 5656 + }, + { + "epoch": 1.2618782065581085, + "grad_norm": 0.16943363845348358, + "learning_rate": 1.2602016658659167e-05, + "loss": 0.4764, + "step": 5657 + }, + { + "epoch": 1.2621012714699977, + "grad_norm": 0.16601496934890747, + "learning_rate": 1.2599744222943864e-05, + "loss": 0.4549, + "step": 5658 + }, + { + "epoch": 1.2623243363818872, + "grad_norm": 0.1612405627965927, + "learning_rate": 1.2597471643238372e-05, + "loss": 0.4451, + "step": 5659 + }, + { + "epoch": 1.2625474012937765, + "grad_norm": 0.16680100560188293, + "learning_rate": 1.2595198919668566e-05, + "loss": 0.475, + "step": 5660 + }, + { + "epoch": 1.2627704662056658, + "grad_norm": 0.16711308062076569, + "learning_rate": 1.2592926052360316e-05, + "loss": 0.4574, + "step": 5661 + }, + { + "epoch": 1.262993531117555, + "grad_norm": 0.16317816078662872, + "learning_rate": 1.259065304143951e-05, + "loss": 0.4345, + "step": 5662 + }, + { + "epoch": 1.2632165960294446, + "grad_norm": 0.16551434993743896, + "learning_rate": 1.2588379887032048e-05, + "loss": 0.4706, + "step": 5663 + }, + { + "epoch": 1.2634396609413339, + "grad_norm": 0.16414450109004974, + "learning_rate": 1.2586106589263823e-05, + "loss": 0.4574, + "step": 5664 + }, + { + "epoch": 1.2636627258532234, + "grad_norm": 0.17107480764389038, + "learning_rate": 1.2583833148260749e-05, + "loss": 0.4698, + "step": 5665 + }, + { + "epoch": 1.2638857907651126, + "grad_norm": 0.17316211760044098, + "learning_rate": 1.258155956414874e-05, + "loss": 0.4772, + "step": 5666 + }, + { + "epoch": 1.264108855677002, + "grad_norm": 0.16382986307144165, + "learning_rate": 1.2579285837053722e-05, + "loss": 0.4463, + "step": 5667 + }, + { + "epoch": 1.2643319205888914, + "grad_norm": 0.20516705513000488, + "learning_rate": 1.2577011967101636e-05, + "loss": 0.4539, + "step": 5668 + }, + { + "epoch": 1.2645549855007807, + "grad_norm": 0.17769816517829895, + "learning_rate": 1.2574737954418412e-05, + "loss": 0.4889, + "step": 5669 + }, + { + "epoch": 1.2647780504126702, + "grad_norm": 0.16899065673351288, + "learning_rate": 1.2572463799130008e-05, + "loss": 0.476, + "step": 5670 + }, + { + "epoch": 1.2650011153245595, + "grad_norm": 0.16549617052078247, + "learning_rate": 1.2570189501362375e-05, + "loss": 0.4493, + "step": 5671 + }, + { + "epoch": 1.2652241802364488, + "grad_norm": 0.16648399829864502, + "learning_rate": 1.2567915061241483e-05, + "loss": 0.4375, + "step": 5672 + }, + { + "epoch": 1.265447245148338, + "grad_norm": 0.16941656172275543, + "learning_rate": 1.2565640478893299e-05, + "loss": 0.4543, + "step": 5673 + }, + { + "epoch": 1.2656703100602276, + "grad_norm": 0.1804044395685196, + "learning_rate": 1.2563365754443808e-05, + "loss": 0.4722, + "step": 5674 + }, + { + "epoch": 1.2658933749721168, + "grad_norm": 0.18117646872997284, + "learning_rate": 1.2561090888018996e-05, + "loss": 0.4869, + "step": 5675 + }, + { + "epoch": 1.2661164398840064, + "grad_norm": 0.16993004083633423, + "learning_rate": 1.2558815879744865e-05, + "loss": 0.4665, + "step": 5676 + }, + { + "epoch": 1.2663395047958956, + "grad_norm": 0.19133198261260986, + "learning_rate": 1.255654072974741e-05, + "loss": 0.4619, + "step": 5677 + }, + { + "epoch": 1.266562569707785, + "grad_norm": 0.17450137436389923, + "learning_rate": 1.2554265438152653e-05, + "loss": 0.4638, + "step": 5678 + }, + { + "epoch": 1.2667856346196742, + "grad_norm": 0.1640671044588089, + "learning_rate": 1.2551990005086604e-05, + "loss": 0.4364, + "step": 5679 + }, + { + "epoch": 1.2670086995315637, + "grad_norm": 0.16710147261619568, + "learning_rate": 1.2549714430675299e-05, + "loss": 0.4479, + "step": 5680 + }, + { + "epoch": 1.267231764443453, + "grad_norm": 0.17442373931407928, + "learning_rate": 1.2547438715044769e-05, + "loss": 0.4517, + "step": 5681 + }, + { + "epoch": 1.2674548293553425, + "grad_norm": 0.18162380158901215, + "learning_rate": 1.254516285832106e-05, + "loss": 0.4829, + "step": 5682 + }, + { + "epoch": 1.2676778942672318, + "grad_norm": 0.17129097878932953, + "learning_rate": 1.2542886860630221e-05, + "loss": 0.4519, + "step": 5683 + }, + { + "epoch": 1.267900959179121, + "grad_norm": 0.1645333468914032, + "learning_rate": 1.2540610722098314e-05, + "loss": 0.4544, + "step": 5684 + }, + { + "epoch": 1.2681240240910105, + "grad_norm": 0.21202479302883148, + "learning_rate": 1.2538334442851403e-05, + "loss": 0.5017, + "step": 5685 + }, + { + "epoch": 1.2683470890028998, + "grad_norm": 0.21118725836277008, + "learning_rate": 1.2536058023015563e-05, + "loss": 0.477, + "step": 5686 + }, + { + "epoch": 1.2685701539147893, + "grad_norm": 0.16621573269367218, + "learning_rate": 1.2533781462716879e-05, + "loss": 0.4427, + "step": 5687 + }, + { + "epoch": 1.2687932188266786, + "grad_norm": 0.20093737542629242, + "learning_rate": 1.2531504762081437e-05, + "loss": 0.4873, + "step": 5688 + }, + { + "epoch": 1.269016283738568, + "grad_norm": 0.1732548326253891, + "learning_rate": 1.2529227921235342e-05, + "loss": 0.4745, + "step": 5689 + }, + { + "epoch": 1.2692393486504572, + "grad_norm": 0.1718747317790985, + "learning_rate": 1.252695094030469e-05, + "loss": 0.4478, + "step": 5690 + }, + { + "epoch": 1.2694624135623467, + "grad_norm": 0.1892472505569458, + "learning_rate": 1.2524673819415602e-05, + "loss": 0.457, + "step": 5691 + }, + { + "epoch": 1.269685478474236, + "grad_norm": 0.16968894004821777, + "learning_rate": 1.2522396558694197e-05, + "loss": 0.4734, + "step": 5692 + }, + { + "epoch": 1.2699085433861255, + "grad_norm": 0.1685362160205841, + "learning_rate": 1.2520119158266606e-05, + "loss": 0.4233, + "step": 5693 + }, + { + "epoch": 1.2701316082980147, + "grad_norm": 0.16388767957687378, + "learning_rate": 1.2517841618258961e-05, + "loss": 0.4135, + "step": 5694 + }, + { + "epoch": 1.270354673209904, + "grad_norm": 0.16109982132911682, + "learning_rate": 1.251556393879741e-05, + "loss": 0.4484, + "step": 5695 + }, + { + "epoch": 1.2705777381217933, + "grad_norm": 0.18354761600494385, + "learning_rate": 1.2513286120008105e-05, + "loss": 0.4358, + "step": 5696 + }, + { + "epoch": 1.2708008030336828, + "grad_norm": 0.17148545384407043, + "learning_rate": 1.2511008162017209e-05, + "loss": 0.4646, + "step": 5697 + }, + { + "epoch": 1.271023867945572, + "grad_norm": 0.17020155489444733, + "learning_rate": 1.2508730064950881e-05, + "loss": 0.4587, + "step": 5698 + }, + { + "epoch": 1.2712469328574616, + "grad_norm": 0.16151095926761627, + "learning_rate": 1.2506451828935303e-05, + "loss": 0.4886, + "step": 5699 + }, + { + "epoch": 1.2714699977693509, + "grad_norm": 0.1669510155916214, + "learning_rate": 1.2504173454096658e-05, + "loss": 0.4432, + "step": 5700 + }, + { + "epoch": 1.2716930626812402, + "grad_norm": 0.16603168845176697, + "learning_rate": 1.2501894940561133e-05, + "loss": 0.4556, + "step": 5701 + }, + { + "epoch": 1.2719161275931297, + "grad_norm": 0.1668505221605301, + "learning_rate": 1.249961628845493e-05, + "loss": 0.477, + "step": 5702 + }, + { + "epoch": 1.272139192505019, + "grad_norm": 0.1614041030406952, + "learning_rate": 1.2497337497904251e-05, + "loss": 0.4273, + "step": 5703 + }, + { + "epoch": 1.2723622574169084, + "grad_norm": 0.1707865595817566, + "learning_rate": 1.2495058569035316e-05, + "loss": 0.4562, + "step": 5704 + }, + { + "epoch": 1.2725853223287977, + "grad_norm": 0.17685820162296295, + "learning_rate": 1.249277950197434e-05, + "loss": 0.4693, + "step": 5705 + }, + { + "epoch": 1.272808387240687, + "grad_norm": 0.16811825335025787, + "learning_rate": 1.2490500296847558e-05, + "loss": 0.4576, + "step": 5706 + }, + { + "epoch": 1.2730314521525763, + "grad_norm": 0.17165428400039673, + "learning_rate": 1.2488220953781201e-05, + "loss": 0.4851, + "step": 5707 + }, + { + "epoch": 1.2732545170644658, + "grad_norm": 0.1754395067691803, + "learning_rate": 1.2485941472901519e-05, + "loss": 0.4895, + "step": 5708 + }, + { + "epoch": 1.273477581976355, + "grad_norm": 0.1676994264125824, + "learning_rate": 1.2483661854334756e-05, + "loss": 0.4675, + "step": 5709 + }, + { + "epoch": 1.2737006468882446, + "grad_norm": 0.16332073509693146, + "learning_rate": 1.2481382098207181e-05, + "loss": 0.4444, + "step": 5710 + }, + { + "epoch": 1.2739237118001339, + "grad_norm": 0.175141841173172, + "learning_rate": 1.2479102204645057e-05, + "loss": 0.4934, + "step": 5711 + }, + { + "epoch": 1.2741467767120231, + "grad_norm": 0.17090824246406555, + "learning_rate": 1.247682217377466e-05, + "loss": 0.4588, + "step": 5712 + }, + { + "epoch": 1.2743698416239124, + "grad_norm": 0.1759280413389206, + "learning_rate": 1.2474542005722265e-05, + "loss": 0.4487, + "step": 5713 + }, + { + "epoch": 1.274592906535802, + "grad_norm": 0.16140590608119965, + "learning_rate": 1.2472261700614174e-05, + "loss": 0.451, + "step": 5714 + }, + { + "epoch": 1.2748159714476912, + "grad_norm": 0.19568322598934174, + "learning_rate": 1.2469981258576676e-05, + "loss": 0.4726, + "step": 5715 + }, + { + "epoch": 1.2750390363595807, + "grad_norm": 0.16794992983341217, + "learning_rate": 1.246770067973608e-05, + "loss": 0.465, + "step": 5716 + }, + { + "epoch": 1.27526210127147, + "grad_norm": 0.17148984968662262, + "learning_rate": 1.2465419964218696e-05, + "loss": 0.4809, + "step": 5717 + }, + { + "epoch": 1.2754851661833593, + "grad_norm": 0.17300093173980713, + "learning_rate": 1.2463139112150851e-05, + "loss": 0.4417, + "step": 5718 + }, + { + "epoch": 1.2757082310952488, + "grad_norm": 0.16525490581989288, + "learning_rate": 1.2460858123658863e-05, + "loss": 0.4744, + "step": 5719 + }, + { + "epoch": 1.275931296007138, + "grad_norm": 0.16058532893657684, + "learning_rate": 1.2458576998869076e-05, + "loss": 0.4494, + "step": 5720 + }, + { + "epoch": 1.2761543609190276, + "grad_norm": 0.16562692821025848, + "learning_rate": 1.2456295737907828e-05, + "loss": 0.3971, + "step": 5721 + }, + { + "epoch": 1.2763774258309168, + "grad_norm": 0.1694319099187851, + "learning_rate": 1.2454014340901472e-05, + "loss": 0.4956, + "step": 5722 + }, + { + "epoch": 1.2766004907428061, + "grad_norm": 0.1706772893667221, + "learning_rate": 1.2451732807976367e-05, + "loss": 0.487, + "step": 5723 + }, + { + "epoch": 1.2768235556546954, + "grad_norm": 0.16685868799686432, + "learning_rate": 1.2449451139258875e-05, + "loss": 0.4671, + "step": 5724 + }, + { + "epoch": 1.277046620566585, + "grad_norm": 0.16977933049201965, + "learning_rate": 1.2447169334875374e-05, + "loss": 0.4633, + "step": 5725 + }, + { + "epoch": 1.2772696854784742, + "grad_norm": 0.1995961219072342, + "learning_rate": 1.2444887394952237e-05, + "loss": 0.4676, + "step": 5726 + }, + { + "epoch": 1.2774927503903637, + "grad_norm": 0.17159555852413177, + "learning_rate": 1.2442605319615862e-05, + "loss": 0.4884, + "step": 5727 + }, + { + "epoch": 1.277715815302253, + "grad_norm": 0.18190377950668335, + "learning_rate": 1.2440323108992635e-05, + "loss": 0.4802, + "step": 5728 + }, + { + "epoch": 1.2779388802141423, + "grad_norm": 0.16413584351539612, + "learning_rate": 1.2438040763208967e-05, + "loss": 0.4475, + "step": 5729 + }, + { + "epoch": 1.2781619451260315, + "grad_norm": 0.17521817982196808, + "learning_rate": 1.2435758282391266e-05, + "loss": 0.4559, + "step": 5730 + }, + { + "epoch": 1.278385010037921, + "grad_norm": 0.17562466859817505, + "learning_rate": 1.243347566666595e-05, + "loss": 0.4614, + "step": 5731 + }, + { + "epoch": 1.2786080749498103, + "grad_norm": 0.18439187109470367, + "learning_rate": 1.2431192916159442e-05, + "loss": 0.4739, + "step": 5732 + }, + { + "epoch": 1.2788311398616998, + "grad_norm": 0.17046132683753967, + "learning_rate": 1.242891003099818e-05, + "loss": 0.4653, + "step": 5733 + }, + { + "epoch": 1.279054204773589, + "grad_norm": 0.17075121402740479, + "learning_rate": 1.24266270113086e-05, + "loss": 0.5025, + "step": 5734 + }, + { + "epoch": 1.2792772696854784, + "grad_norm": 0.1661718189716339, + "learning_rate": 1.2424343857217153e-05, + "loss": 0.4574, + "step": 5735 + }, + { + "epoch": 1.279500334597368, + "grad_norm": 0.1734541356563568, + "learning_rate": 1.2422060568850293e-05, + "loss": 0.4579, + "step": 5736 + }, + { + "epoch": 1.2797233995092572, + "grad_norm": 0.16650691628456116, + "learning_rate": 1.2419777146334486e-05, + "loss": 0.4793, + "step": 5737 + }, + { + "epoch": 1.2799464644211467, + "grad_norm": 0.16996720433235168, + "learning_rate": 1.2417493589796199e-05, + "loss": 0.4679, + "step": 5738 + }, + { + "epoch": 1.280169529333036, + "grad_norm": 0.1680038571357727, + "learning_rate": 1.2415209899361908e-05, + "loss": 0.4842, + "step": 5739 + }, + { + "epoch": 1.2803925942449252, + "grad_norm": 0.21281017363071442, + "learning_rate": 1.2412926075158103e-05, + "loss": 0.4534, + "step": 5740 + }, + { + "epoch": 1.2806156591568145, + "grad_norm": 0.16885453462600708, + "learning_rate": 1.241064211731127e-05, + "loss": 0.4758, + "step": 5741 + }, + { + "epoch": 1.280838724068704, + "grad_norm": 0.19396549463272095, + "learning_rate": 1.2408358025947917e-05, + "loss": 0.4729, + "step": 5742 + }, + { + "epoch": 1.2810617889805933, + "grad_norm": 0.1660171002149582, + "learning_rate": 1.2406073801194546e-05, + "loss": 0.4574, + "step": 5743 + }, + { + "epoch": 1.2812848538924828, + "grad_norm": 0.17650169134140015, + "learning_rate": 1.2403789443177672e-05, + "loss": 0.452, + "step": 5744 + }, + { + "epoch": 1.281507918804372, + "grad_norm": 0.16995863616466522, + "learning_rate": 1.240150495202382e-05, + "loss": 0.4556, + "step": 5745 + }, + { + "epoch": 1.2817309837162614, + "grad_norm": 0.17916269600391388, + "learning_rate": 1.2399220327859516e-05, + "loss": 0.4595, + "step": 5746 + }, + { + "epoch": 1.2819540486281507, + "grad_norm": 0.16572338342666626, + "learning_rate": 1.2396935570811299e-05, + "loss": 0.4408, + "step": 5747 + }, + { + "epoch": 1.2821771135400402, + "grad_norm": 0.1570359319448471, + "learning_rate": 1.2394650681005713e-05, + "loss": 0.4357, + "step": 5748 + }, + { + "epoch": 1.2824001784519294, + "grad_norm": 0.1629701554775238, + "learning_rate": 1.239236565856931e-05, + "loss": 0.4481, + "step": 5749 + }, + { + "epoch": 1.282623243363819, + "grad_norm": 0.17372570931911469, + "learning_rate": 1.2390080503628647e-05, + "loss": 0.4717, + "step": 5750 + }, + { + "epoch": 1.2828463082757082, + "grad_norm": 0.1692957580089569, + "learning_rate": 1.2387795216310292e-05, + "loss": 0.4665, + "step": 5751 + }, + { + "epoch": 1.2830693731875975, + "grad_norm": 0.16883119940757751, + "learning_rate": 1.2385509796740818e-05, + "loss": 0.4566, + "step": 5752 + }, + { + "epoch": 1.283292438099487, + "grad_norm": 0.1655278503894806, + "learning_rate": 1.2383224245046805e-05, + "loss": 0.4482, + "step": 5753 + }, + { + "epoch": 1.2835155030113763, + "grad_norm": 0.17099110782146454, + "learning_rate": 1.2380938561354846e-05, + "loss": 0.4733, + "step": 5754 + }, + { + "epoch": 1.2837385679232658, + "grad_norm": 0.16379962861537933, + "learning_rate": 1.2378652745791528e-05, + "loss": 0.4337, + "step": 5755 + }, + { + "epoch": 1.283961632835155, + "grad_norm": 0.1781388372182846, + "learning_rate": 1.237636679848346e-05, + "loss": 0.4642, + "step": 5756 + }, + { + "epoch": 1.2841846977470444, + "grad_norm": 0.16018837690353394, + "learning_rate": 1.2374080719557253e-05, + "loss": 0.4488, + "step": 5757 + }, + { + "epoch": 1.2844077626589336, + "grad_norm": 0.17385733127593994, + "learning_rate": 1.237179450913952e-05, + "loss": 0.4448, + "step": 5758 + }, + { + "epoch": 1.2846308275708231, + "grad_norm": 0.1600496470928192, + "learning_rate": 1.236950816735689e-05, + "loss": 0.4523, + "step": 5759 + }, + { + "epoch": 1.2848538924827124, + "grad_norm": 0.17114561796188354, + "learning_rate": 1.2367221694335992e-05, + "loss": 0.4717, + "step": 5760 + }, + { + "epoch": 1.285076957394602, + "grad_norm": 0.17317497730255127, + "learning_rate": 1.2364935090203464e-05, + "loss": 0.4571, + "step": 5761 + }, + { + "epoch": 1.2853000223064912, + "grad_norm": 0.18465475738048553, + "learning_rate": 1.2362648355085958e-05, + "loss": 0.4622, + "step": 5762 + }, + { + "epoch": 1.2855230872183805, + "grad_norm": 0.16968859732151031, + "learning_rate": 1.2360361489110123e-05, + "loss": 0.445, + "step": 5763 + }, + { + "epoch": 1.2857461521302698, + "grad_norm": 0.16804584860801697, + "learning_rate": 1.235807449240262e-05, + "loss": 0.4482, + "step": 5764 + }, + { + "epoch": 1.2859692170421593, + "grad_norm": 0.170896977186203, + "learning_rate": 1.2355787365090122e-05, + "loss": 0.4517, + "step": 5765 + }, + { + "epoch": 1.2861922819540486, + "grad_norm": 0.1710241287946701, + "learning_rate": 1.2353500107299299e-05, + "loss": 0.4884, + "step": 5766 + }, + { + "epoch": 1.286415346865938, + "grad_norm": 0.16201786696910858, + "learning_rate": 1.2351212719156835e-05, + "loss": 0.4542, + "step": 5767 + }, + { + "epoch": 1.2866384117778273, + "grad_norm": 0.16651326417922974, + "learning_rate": 1.234892520078942e-05, + "loss": 0.4649, + "step": 5768 + }, + { + "epoch": 1.2868614766897166, + "grad_norm": 0.16634796559810638, + "learning_rate": 1.2346637552323757e-05, + "loss": 0.4793, + "step": 5769 + }, + { + "epoch": 1.2870845416016061, + "grad_norm": 0.16899818181991577, + "learning_rate": 1.2344349773886542e-05, + "loss": 0.4633, + "step": 5770 + }, + { + "epoch": 1.2873076065134954, + "grad_norm": 0.17024937272071838, + "learning_rate": 1.2342061865604492e-05, + "loss": 0.4471, + "step": 5771 + }, + { + "epoch": 1.287530671425385, + "grad_norm": 0.1707090586423874, + "learning_rate": 1.2339773827604322e-05, + "loss": 0.4758, + "step": 5772 + }, + { + "epoch": 1.2877537363372742, + "grad_norm": 0.17313364148139954, + "learning_rate": 1.2337485660012757e-05, + "loss": 0.4858, + "step": 5773 + }, + { + "epoch": 1.2879768012491635, + "grad_norm": 0.1700849086046219, + "learning_rate": 1.2335197362956537e-05, + "loss": 0.4505, + "step": 5774 + }, + { + "epoch": 1.2881998661610528, + "grad_norm": 0.17922791838645935, + "learning_rate": 1.2332908936562395e-05, + "loss": 0.4559, + "step": 5775 + }, + { + "epoch": 1.2884229310729423, + "grad_norm": 0.1638413369655609, + "learning_rate": 1.2330620380957086e-05, + "loss": 0.4526, + "step": 5776 + }, + { + "epoch": 1.2886459959848315, + "grad_norm": 0.15971173346042633, + "learning_rate": 1.2328331696267357e-05, + "loss": 0.4386, + "step": 5777 + }, + { + "epoch": 1.288869060896721, + "grad_norm": 0.17769648134708405, + "learning_rate": 1.2326042882619973e-05, + "loss": 0.4659, + "step": 5778 + }, + { + "epoch": 1.2890921258086103, + "grad_norm": 0.16623780131340027, + "learning_rate": 1.2323753940141704e-05, + "loss": 0.4593, + "step": 5779 + }, + { + "epoch": 1.2893151907204996, + "grad_norm": 0.16500690579414368, + "learning_rate": 1.2321464868959326e-05, + "loss": 0.45, + "step": 5780 + }, + { + "epoch": 1.289538255632389, + "grad_norm": 0.17217601835727692, + "learning_rate": 1.2319175669199619e-05, + "loss": 0.4747, + "step": 5781 + }, + { + "epoch": 1.2897613205442784, + "grad_norm": 0.16614340245723724, + "learning_rate": 1.2316886340989375e-05, + "loss": 0.4685, + "step": 5782 + }, + { + "epoch": 1.2899843854561677, + "grad_norm": 0.1841314285993576, + "learning_rate": 1.231459688445539e-05, + "loss": 0.4537, + "step": 5783 + }, + { + "epoch": 1.2902074503680572, + "grad_norm": 0.1670687049627304, + "learning_rate": 1.231230729972447e-05, + "loss": 0.4763, + "step": 5784 + }, + { + "epoch": 1.2904305152799465, + "grad_norm": 0.1700257658958435, + "learning_rate": 1.2310017586923431e-05, + "loss": 0.4698, + "step": 5785 + }, + { + "epoch": 1.2906535801918357, + "grad_norm": 0.17322078347206116, + "learning_rate": 1.2307727746179085e-05, + "loss": 0.4422, + "step": 5786 + }, + { + "epoch": 1.2908766451037252, + "grad_norm": 0.18490198254585266, + "learning_rate": 1.230543777761826e-05, + "loss": 0.451, + "step": 5787 + }, + { + "epoch": 1.2910997100156145, + "grad_norm": 0.17593975365161896, + "learning_rate": 1.2303147681367788e-05, + "loss": 0.4806, + "step": 5788 + }, + { + "epoch": 1.291322774927504, + "grad_norm": 0.16735313832759857, + "learning_rate": 1.2300857457554513e-05, + "loss": 0.4551, + "step": 5789 + }, + { + "epoch": 1.2915458398393933, + "grad_norm": 0.18818075954914093, + "learning_rate": 1.2298567106305277e-05, + "loss": 0.4722, + "step": 5790 + }, + { + "epoch": 1.2917689047512826, + "grad_norm": 0.33603474497795105, + "learning_rate": 1.2296276627746938e-05, + "loss": 0.4789, + "step": 5791 + }, + { + "epoch": 1.2919919696631719, + "grad_norm": 0.16247035562992096, + "learning_rate": 1.2293986022006353e-05, + "loss": 0.4493, + "step": 5792 + }, + { + "epoch": 1.2922150345750614, + "grad_norm": 0.17141158878803253, + "learning_rate": 1.2291695289210395e-05, + "loss": 0.4718, + "step": 5793 + }, + { + "epoch": 1.2924380994869507, + "grad_norm": 0.15925908088684082, + "learning_rate": 1.2289404429485932e-05, + "loss": 0.4805, + "step": 5794 + }, + { + "epoch": 1.2926611643988402, + "grad_norm": 0.2104969471693039, + "learning_rate": 1.2287113442959854e-05, + "loss": 0.4718, + "step": 5795 + }, + { + "epoch": 1.2928842293107294, + "grad_norm": 0.1778918355703354, + "learning_rate": 1.2284822329759047e-05, + "loss": 0.4708, + "step": 5796 + }, + { + "epoch": 1.2931072942226187, + "grad_norm": 0.1711834967136383, + "learning_rate": 1.2282531090010408e-05, + "loss": 0.4613, + "step": 5797 + }, + { + "epoch": 1.2933303591345082, + "grad_norm": 0.17444384098052979, + "learning_rate": 1.2280239723840836e-05, + "loss": 0.4477, + "step": 5798 + }, + { + "epoch": 1.2935534240463975, + "grad_norm": 0.16771963238716125, + "learning_rate": 1.2277948231377247e-05, + "loss": 0.4698, + "step": 5799 + }, + { + "epoch": 1.2937764889582868, + "grad_norm": 0.1844215989112854, + "learning_rate": 1.2275656612746556e-05, + "loss": 0.4681, + "step": 5800 + }, + { + "epoch": 1.2939995538701763, + "grad_norm": 0.1627340018749237, + "learning_rate": 1.227336486807569e-05, + "loss": 0.4758, + "step": 5801 + }, + { + "epoch": 1.2942226187820656, + "grad_norm": 0.17234186828136444, + "learning_rate": 1.2271072997491573e-05, + "loss": 0.4646, + "step": 5802 + }, + { + "epoch": 1.2944456836939549, + "grad_norm": 0.169833242893219, + "learning_rate": 1.2268781001121151e-05, + "loss": 0.4709, + "step": 5803 + }, + { + "epoch": 1.2946687486058444, + "grad_norm": 0.16857881844043732, + "learning_rate": 1.2266488879091365e-05, + "loss": 0.484, + "step": 5804 + }, + { + "epoch": 1.2948918135177336, + "grad_norm": 0.16517959535121918, + "learning_rate": 1.2264196631529166e-05, + "loss": 0.4608, + "step": 5805 + }, + { + "epoch": 1.2951148784296231, + "grad_norm": 0.16706417500972748, + "learning_rate": 1.226190425856152e-05, + "loss": 0.4541, + "step": 5806 + }, + { + "epoch": 1.2953379433415124, + "grad_norm": 0.16569514572620392, + "learning_rate": 1.2259611760315381e-05, + "loss": 0.4393, + "step": 5807 + }, + { + "epoch": 1.2955610082534017, + "grad_norm": 0.17674359679222107, + "learning_rate": 1.2257319136917735e-05, + "loss": 0.4869, + "step": 5808 + }, + { + "epoch": 1.295784073165291, + "grad_norm": 0.1684064269065857, + "learning_rate": 1.2255026388495554e-05, + "loss": 0.488, + "step": 5809 + }, + { + "epoch": 1.2960071380771805, + "grad_norm": 0.16627109050750732, + "learning_rate": 1.2252733515175829e-05, + "loss": 0.4577, + "step": 5810 + }, + { + "epoch": 1.2962302029890698, + "grad_norm": 0.18663783371448517, + "learning_rate": 1.2250440517085549e-05, + "loss": 0.4512, + "step": 5811 + }, + { + "epoch": 1.2964532679009593, + "grad_norm": 0.17597997188568115, + "learning_rate": 1.2248147394351719e-05, + "loss": 0.4482, + "step": 5812 + }, + { + "epoch": 1.2966763328128486, + "grad_norm": 0.1624067723751068, + "learning_rate": 1.2245854147101344e-05, + "loss": 0.4809, + "step": 5813 + }, + { + "epoch": 1.2968993977247378, + "grad_norm": 0.1702115684747696, + "learning_rate": 1.2243560775461441e-05, + "loss": 0.4572, + "step": 5814 + }, + { + "epoch": 1.2971224626366273, + "grad_norm": 0.17795614898204803, + "learning_rate": 1.2241267279559029e-05, + "loss": 0.4668, + "step": 5815 + }, + { + "epoch": 1.2973455275485166, + "grad_norm": 0.17128905653953552, + "learning_rate": 1.2238973659521136e-05, + "loss": 0.4649, + "step": 5816 + }, + { + "epoch": 1.297568592460406, + "grad_norm": 0.1639898121356964, + "learning_rate": 1.2236679915474799e-05, + "loss": 0.4741, + "step": 5817 + }, + { + "epoch": 1.2977916573722954, + "grad_norm": 0.17181125283241272, + "learning_rate": 1.2234386047547057e-05, + "loss": 0.4569, + "step": 5818 + }, + { + "epoch": 1.2980147222841847, + "grad_norm": 0.17386600375175476, + "learning_rate": 1.2232092055864961e-05, + "loss": 0.4536, + "step": 5819 + }, + { + "epoch": 1.298237787196074, + "grad_norm": 0.1769663542509079, + "learning_rate": 1.222979794055557e-05, + "loss": 0.461, + "step": 5820 + }, + { + "epoch": 1.2984608521079635, + "grad_norm": 0.17835234105587006, + "learning_rate": 1.2227503701745942e-05, + "loss": 0.4535, + "step": 5821 + }, + { + "epoch": 1.2986839170198528, + "grad_norm": 0.1780080497264862, + "learning_rate": 1.2225209339563144e-05, + "loss": 0.4532, + "step": 5822 + }, + { + "epoch": 1.2989069819317423, + "grad_norm": 0.1722138375043869, + "learning_rate": 1.2222914854134261e-05, + "loss": 0.4626, + "step": 5823 + }, + { + "epoch": 1.2991300468436315, + "grad_norm": 0.1659894585609436, + "learning_rate": 1.2220620245586365e-05, + "loss": 0.4466, + "step": 5824 + }, + { + "epoch": 1.2993531117555208, + "grad_norm": 0.17937776446342468, + "learning_rate": 1.2218325514046557e-05, + "loss": 0.4572, + "step": 5825 + }, + { + "epoch": 1.29957617666741, + "grad_norm": 0.18704891204833984, + "learning_rate": 1.2216030659641924e-05, + "loss": 0.4745, + "step": 5826 + }, + { + "epoch": 1.2997992415792996, + "grad_norm": 0.17715615034103394, + "learning_rate": 1.2213735682499578e-05, + "loss": 0.4536, + "step": 5827 + }, + { + "epoch": 1.3000223064911889, + "grad_norm": 0.1589982509613037, + "learning_rate": 1.2211440582746619e-05, + "loss": 0.4619, + "step": 5828 + }, + { + "epoch": 1.3002453714030784, + "grad_norm": 0.1834399253129959, + "learning_rate": 1.2209145360510175e-05, + "loss": 0.457, + "step": 5829 + }, + { + "epoch": 1.3004684363149677, + "grad_norm": 0.16908270120620728, + "learning_rate": 1.2206850015917362e-05, + "loss": 0.4432, + "step": 5830 + }, + { + "epoch": 1.300691501226857, + "grad_norm": 0.16872212290763855, + "learning_rate": 1.2204554549095316e-05, + "loss": 0.4378, + "step": 5831 + }, + { + "epoch": 1.3009145661387465, + "grad_norm": 0.1816396862268448, + "learning_rate": 1.2202258960171167e-05, + "loss": 0.4645, + "step": 5832 + }, + { + "epoch": 1.3011376310506357, + "grad_norm": 0.17055541276931763, + "learning_rate": 1.219996324927207e-05, + "loss": 0.4567, + "step": 5833 + }, + { + "epoch": 1.301360695962525, + "grad_norm": 0.17526254057884216, + "learning_rate": 1.2197667416525165e-05, + "loss": 0.4335, + "step": 5834 + }, + { + "epoch": 1.3015837608744145, + "grad_norm": 0.17466293275356293, + "learning_rate": 1.2195371462057619e-05, + "loss": 0.4645, + "step": 5835 + }, + { + "epoch": 1.3018068257863038, + "grad_norm": 0.1705324649810791, + "learning_rate": 1.2193075385996589e-05, + "loss": 0.4782, + "step": 5836 + }, + { + "epoch": 1.302029890698193, + "grad_norm": 0.1765318214893341, + "learning_rate": 1.2190779188469248e-05, + "loss": 0.4651, + "step": 5837 + }, + { + "epoch": 1.3022529556100826, + "grad_norm": 0.18065306544303894, + "learning_rate": 1.2188482869602778e-05, + "loss": 0.4507, + "step": 5838 + }, + { + "epoch": 1.3024760205219719, + "grad_norm": 0.17382916808128357, + "learning_rate": 1.2186186429524358e-05, + "loss": 0.4621, + "step": 5839 + }, + { + "epoch": 1.3026990854338614, + "grad_norm": 0.17227432131767273, + "learning_rate": 1.2183889868361185e-05, + "loss": 0.4493, + "step": 5840 + }, + { + "epoch": 1.3029221503457507, + "grad_norm": 0.16566970944404602, + "learning_rate": 1.218159318624045e-05, + "loss": 0.4613, + "step": 5841 + }, + { + "epoch": 1.30314521525764, + "grad_norm": 0.16630716621875763, + "learning_rate": 1.2179296383289366e-05, + "loss": 0.4703, + "step": 5842 + }, + { + "epoch": 1.3033682801695292, + "grad_norm": 0.16445283591747284, + "learning_rate": 1.2176999459635137e-05, + "loss": 0.471, + "step": 5843 + }, + { + "epoch": 1.3035913450814187, + "grad_norm": 0.1610761433839798, + "learning_rate": 1.2174702415404987e-05, + "loss": 0.4669, + "step": 5844 + }, + { + "epoch": 1.303814409993308, + "grad_norm": 0.16646409034729004, + "learning_rate": 1.2172405250726134e-05, + "loss": 0.4691, + "step": 5845 + }, + { + "epoch": 1.3040374749051975, + "grad_norm": 0.173641636967659, + "learning_rate": 1.2170107965725815e-05, + "loss": 0.4525, + "step": 5846 + }, + { + "epoch": 1.3042605398170868, + "grad_norm": 0.16962803900241852, + "learning_rate": 1.2167810560531266e-05, + "loss": 0.4491, + "step": 5847 + }, + { + "epoch": 1.304483604728976, + "grad_norm": 0.1649630218744278, + "learning_rate": 1.2165513035269733e-05, + "loss": 0.4409, + "step": 5848 + }, + { + "epoch": 1.3047066696408656, + "grad_norm": 0.1694600135087967, + "learning_rate": 1.2163215390068466e-05, + "loss": 0.47, + "step": 5849 + }, + { + "epoch": 1.3049297345527548, + "grad_norm": 0.1770717054605484, + "learning_rate": 1.2160917625054721e-05, + "loss": 0.4647, + "step": 5850 + }, + { + "epoch": 1.3051527994646441, + "grad_norm": 0.15664999186992645, + "learning_rate": 1.2158619740355767e-05, + "loss": 0.4371, + "step": 5851 + }, + { + "epoch": 1.3053758643765336, + "grad_norm": 0.16437092423439026, + "learning_rate": 1.2156321736098877e-05, + "loss": 0.436, + "step": 5852 + }, + { + "epoch": 1.305598929288423, + "grad_norm": 0.17147205770015717, + "learning_rate": 1.2154023612411321e-05, + "loss": 0.4696, + "step": 5853 + }, + { + "epoch": 1.3058219942003122, + "grad_norm": 0.1643034666776657, + "learning_rate": 1.215172536942039e-05, + "loss": 0.472, + "step": 5854 + }, + { + "epoch": 1.3060450591122017, + "grad_norm": 0.1705443561077118, + "learning_rate": 1.2149427007253372e-05, + "loss": 0.4422, + "step": 5855 + }, + { + "epoch": 1.306268124024091, + "grad_norm": 0.1692255437374115, + "learning_rate": 1.2147128526037568e-05, + "loss": 0.4583, + "step": 5856 + }, + { + "epoch": 1.3064911889359805, + "grad_norm": 0.17892007529735565, + "learning_rate": 1.2144829925900278e-05, + "loss": 0.4834, + "step": 5857 + }, + { + "epoch": 1.3067142538478698, + "grad_norm": 0.1718159317970276, + "learning_rate": 1.2142531206968815e-05, + "loss": 0.4576, + "step": 5858 + }, + { + "epoch": 1.306937318759759, + "grad_norm": 0.16335871815681458, + "learning_rate": 1.21402323693705e-05, + "loss": 0.4709, + "step": 5859 + }, + { + "epoch": 1.3071603836716483, + "grad_norm": 0.1720331460237503, + "learning_rate": 1.2137933413232651e-05, + "loss": 0.4536, + "step": 5860 + }, + { + "epoch": 1.3073834485835378, + "grad_norm": 0.17654761672019958, + "learning_rate": 1.2135634338682605e-05, + "loss": 0.4706, + "step": 5861 + }, + { + "epoch": 1.3076065134954271, + "grad_norm": 0.16412043571472168, + "learning_rate": 1.2133335145847691e-05, + "loss": 0.4914, + "step": 5862 + }, + { + "epoch": 1.3078295784073166, + "grad_norm": 0.17055629193782806, + "learning_rate": 1.213103583485526e-05, + "loss": 0.4666, + "step": 5863 + }, + { + "epoch": 1.308052643319206, + "grad_norm": 0.174026221036911, + "learning_rate": 1.2128736405832657e-05, + "loss": 0.46, + "step": 5864 + }, + { + "epoch": 1.3082757082310952, + "grad_norm": 0.17043223977088928, + "learning_rate": 1.2126436858907244e-05, + "loss": 0.4658, + "step": 5865 + }, + { + "epoch": 1.3084987731429847, + "grad_norm": 0.1644662320613861, + "learning_rate": 1.212413719420638e-05, + "loss": 0.4527, + "step": 5866 + }, + { + "epoch": 1.308721838054874, + "grad_norm": 0.1671665757894516, + "learning_rate": 1.212183741185744e-05, + "loss": 0.4617, + "step": 5867 + }, + { + "epoch": 1.3089449029667632, + "grad_norm": 0.2064938247203827, + "learning_rate": 1.2119537511987794e-05, + "loss": 0.4554, + "step": 5868 + }, + { + "epoch": 1.3091679678786527, + "grad_norm": 0.1725645214319229, + "learning_rate": 1.211723749472483e-05, + "loss": 0.4534, + "step": 5869 + }, + { + "epoch": 1.309391032790542, + "grad_norm": 0.16695371270179749, + "learning_rate": 1.2114937360195935e-05, + "loss": 0.4517, + "step": 5870 + }, + { + "epoch": 1.3096140977024313, + "grad_norm": 0.18011373281478882, + "learning_rate": 1.2112637108528505e-05, + "loss": 0.4576, + "step": 5871 + }, + { + "epoch": 1.3098371626143208, + "grad_norm": 0.17157688736915588, + "learning_rate": 1.2110336739849944e-05, + "loss": 0.438, + "step": 5872 + }, + { + "epoch": 1.31006022752621, + "grad_norm": 0.17350567877292633, + "learning_rate": 1.2108036254287658e-05, + "loss": 0.4647, + "step": 5873 + }, + { + "epoch": 1.3102832924380996, + "grad_norm": 0.16434237360954285, + "learning_rate": 1.2105735651969066e-05, + "loss": 0.4603, + "step": 5874 + }, + { + "epoch": 1.3105063573499889, + "grad_norm": 0.16745516657829285, + "learning_rate": 1.2103434933021587e-05, + "loss": 0.4585, + "step": 5875 + }, + { + "epoch": 1.3107294222618782, + "grad_norm": 0.16241510212421417, + "learning_rate": 1.2101134097572654e-05, + "loss": 0.4415, + "step": 5876 + }, + { + "epoch": 1.3109524871737674, + "grad_norm": 0.16632577776908875, + "learning_rate": 1.2098833145749691e-05, + "loss": 0.4715, + "step": 5877 + }, + { + "epoch": 1.311175552085657, + "grad_norm": 0.17647215723991394, + "learning_rate": 1.2096532077680153e-05, + "loss": 0.491, + "step": 5878 + }, + { + "epoch": 1.3113986169975462, + "grad_norm": 0.1738002449274063, + "learning_rate": 1.2094230893491475e-05, + "loss": 0.4731, + "step": 5879 + }, + { + "epoch": 1.3116216819094357, + "grad_norm": 0.16479554772377014, + "learning_rate": 1.2091929593311122e-05, + "loss": 0.4787, + "step": 5880 + }, + { + "epoch": 1.311844746821325, + "grad_norm": 0.16516366600990295, + "learning_rate": 1.2089628177266545e-05, + "loss": 0.4551, + "step": 5881 + }, + { + "epoch": 1.3120678117332143, + "grad_norm": 0.1664283722639084, + "learning_rate": 1.2087326645485218e-05, + "loss": 0.4516, + "step": 5882 + }, + { + "epoch": 1.3122908766451038, + "grad_norm": 0.23761501908302307, + "learning_rate": 1.208502499809461e-05, + "loss": 0.4679, + "step": 5883 + }, + { + "epoch": 1.312513941556993, + "grad_norm": 0.17012745141983032, + "learning_rate": 1.2082723235222205e-05, + "loss": 0.4618, + "step": 5884 + }, + { + "epoch": 1.3127370064688826, + "grad_norm": 0.1753227859735489, + "learning_rate": 1.2080421356995484e-05, + "loss": 0.4671, + "step": 5885 + }, + { + "epoch": 1.3129600713807719, + "grad_norm": 0.17254190146923065, + "learning_rate": 1.2078119363541942e-05, + "loss": 0.4379, + "step": 5886 + }, + { + "epoch": 1.3131831362926611, + "grad_norm": 0.1624683141708374, + "learning_rate": 1.2075817254989078e-05, + "loss": 0.4361, + "step": 5887 + }, + { + "epoch": 1.3134062012045504, + "grad_norm": 0.16935352981090546, + "learning_rate": 1.2073515031464397e-05, + "loss": 0.4528, + "step": 5888 + }, + { + "epoch": 1.31362926611644, + "grad_norm": 0.1754865050315857, + "learning_rate": 1.207121269309541e-05, + "loss": 0.4503, + "step": 5889 + }, + { + "epoch": 1.3138523310283292, + "grad_norm": 0.1585618406534195, + "learning_rate": 1.2068910240009636e-05, + "loss": 0.4401, + "step": 5890 + }, + { + "epoch": 1.3140753959402187, + "grad_norm": 0.18017402291297913, + "learning_rate": 1.20666076723346e-05, + "loss": 0.4595, + "step": 5891 + }, + { + "epoch": 1.314298460852108, + "grad_norm": 0.1608768105506897, + "learning_rate": 1.206430499019783e-05, + "loss": 0.4414, + "step": 5892 + }, + { + "epoch": 1.3145215257639973, + "grad_norm": 0.1631273776292801, + "learning_rate": 1.2062002193726867e-05, + "loss": 0.4567, + "step": 5893 + }, + { + "epoch": 1.3147445906758866, + "grad_norm": 0.16995559632778168, + "learning_rate": 1.2059699283049249e-05, + "loss": 0.4611, + "step": 5894 + }, + { + "epoch": 1.314967655587776, + "grad_norm": 0.1610070914030075, + "learning_rate": 1.2057396258292533e-05, + "loss": 0.4271, + "step": 5895 + }, + { + "epoch": 1.3151907204996653, + "grad_norm": 0.16440489888191223, + "learning_rate": 1.2055093119584264e-05, + "loss": 0.4531, + "step": 5896 + }, + { + "epoch": 1.3154137854115548, + "grad_norm": 0.16929922997951508, + "learning_rate": 1.2052789867052018e-05, + "loss": 0.4426, + "step": 5897 + }, + { + "epoch": 1.3156368503234441, + "grad_norm": 0.17286795377731323, + "learning_rate": 1.2050486500823352e-05, + "loss": 0.4584, + "step": 5898 + }, + { + "epoch": 1.3158599152353334, + "grad_norm": 0.1717979460954666, + "learning_rate": 1.2048183021025847e-05, + "loss": 0.461, + "step": 5899 + }, + { + "epoch": 1.316082980147223, + "grad_norm": 0.15612578392028809, + "learning_rate": 1.2045879427787084e-05, + "loss": 0.4466, + "step": 5900 + }, + { + "epoch": 1.3163060450591122, + "grad_norm": 0.17022433876991272, + "learning_rate": 1.2043575721234649e-05, + "loss": 0.4376, + "step": 5901 + }, + { + "epoch": 1.3165291099710017, + "grad_norm": 0.16973432898521423, + "learning_rate": 1.2041271901496136e-05, + "loss": 0.4606, + "step": 5902 + }, + { + "epoch": 1.316752174882891, + "grad_norm": 0.1685478538274765, + "learning_rate": 1.2038967968699143e-05, + "loss": 0.4621, + "step": 5903 + }, + { + "epoch": 1.3169752397947803, + "grad_norm": 0.16581124067306519, + "learning_rate": 1.2036663922971279e-05, + "loss": 0.4652, + "step": 5904 + }, + { + "epoch": 1.3171983047066695, + "grad_norm": 0.17322222888469696, + "learning_rate": 1.2034359764440156e-05, + "loss": 0.5025, + "step": 5905 + }, + { + "epoch": 1.317421369618559, + "grad_norm": 0.16943678259849548, + "learning_rate": 1.2032055493233394e-05, + "loss": 0.4508, + "step": 5906 + }, + { + "epoch": 1.3176444345304483, + "grad_norm": 0.17897944152355194, + "learning_rate": 1.2029751109478614e-05, + "loss": 0.4623, + "step": 5907 + }, + { + "epoch": 1.3178674994423378, + "grad_norm": 0.1740075647830963, + "learning_rate": 1.2027446613303454e-05, + "loss": 0.4835, + "step": 5908 + }, + { + "epoch": 1.3180905643542271, + "grad_norm": 0.173087015748024, + "learning_rate": 1.2025142004835541e-05, + "loss": 0.4713, + "step": 5909 + }, + { + "epoch": 1.3183136292661164, + "grad_norm": 0.18074947595596313, + "learning_rate": 1.2022837284202531e-05, + "loss": 0.4621, + "step": 5910 + }, + { + "epoch": 1.3185366941780057, + "grad_norm": 0.16880744695663452, + "learning_rate": 1.2020532451532063e-05, + "loss": 0.4486, + "step": 5911 + }, + { + "epoch": 1.3187597590898952, + "grad_norm": 0.16226506233215332, + "learning_rate": 1.2018227506951802e-05, + "loss": 0.4502, + "step": 5912 + }, + { + "epoch": 1.3189828240017845, + "grad_norm": 0.16763652861118317, + "learning_rate": 1.2015922450589405e-05, + "loss": 0.4588, + "step": 5913 + }, + { + "epoch": 1.319205888913674, + "grad_norm": 0.1670703887939453, + "learning_rate": 1.2013617282572545e-05, + "loss": 0.4398, + "step": 5914 + }, + { + "epoch": 1.3194289538255632, + "grad_norm": 0.18015524744987488, + "learning_rate": 1.201131200302889e-05, + "loss": 0.4625, + "step": 5915 + }, + { + "epoch": 1.3196520187374525, + "grad_norm": 0.16982007026672363, + "learning_rate": 1.2009006612086128e-05, + "loss": 0.4619, + "step": 5916 + }, + { + "epoch": 1.319875083649342, + "grad_norm": 0.16795918345451355, + "learning_rate": 1.2006701109871936e-05, + "loss": 0.4577, + "step": 5917 + }, + { + "epoch": 1.3200981485612313, + "grad_norm": 0.17438872158527374, + "learning_rate": 1.2004395496514021e-05, + "loss": 0.4668, + "step": 5918 + }, + { + "epoch": 1.3203212134731208, + "grad_norm": 0.17492419481277466, + "learning_rate": 1.2002089772140071e-05, + "loss": 0.4592, + "step": 5919 + }, + { + "epoch": 1.32054427838501, + "grad_norm": 0.17449866235256195, + "learning_rate": 1.19997839368778e-05, + "loss": 0.4632, + "step": 5920 + }, + { + "epoch": 1.3207673432968994, + "grad_norm": 0.16483739018440247, + "learning_rate": 1.199747799085491e-05, + "loss": 0.4496, + "step": 5921 + }, + { + "epoch": 1.3209904082087887, + "grad_norm": 0.19091255962848663, + "learning_rate": 1.1995171934199128e-05, + "loss": 0.4647, + "step": 5922 + }, + { + "epoch": 1.3212134731206782, + "grad_norm": 0.17579780519008636, + "learning_rate": 1.1992865767038172e-05, + "loss": 0.4545, + "step": 5923 + }, + { + "epoch": 1.3214365380325674, + "grad_norm": 0.17818951606750488, + "learning_rate": 1.1990559489499776e-05, + "loss": 0.4806, + "step": 5924 + }, + { + "epoch": 1.321659602944457, + "grad_norm": 0.17060208320617676, + "learning_rate": 1.1988253101711675e-05, + "loss": 0.4435, + "step": 5925 + }, + { + "epoch": 1.3218826678563462, + "grad_norm": 0.17483805119991302, + "learning_rate": 1.1985946603801608e-05, + "loss": 0.4901, + "step": 5926 + }, + { + "epoch": 1.3221057327682355, + "grad_norm": 0.17470861971378326, + "learning_rate": 1.198363999589733e-05, + "loss": 0.462, + "step": 5927 + }, + { + "epoch": 1.3223287976801248, + "grad_norm": 0.16998319327831268, + "learning_rate": 1.1981333278126585e-05, + "loss": 0.4469, + "step": 5928 + }, + { + "epoch": 1.3225518625920143, + "grad_norm": 0.1669076532125473, + "learning_rate": 1.1979026450617147e-05, + "loss": 0.4422, + "step": 5929 + }, + { + "epoch": 1.3227749275039036, + "grad_norm": 0.17433854937553406, + "learning_rate": 1.197671951349677e-05, + "loss": 0.4572, + "step": 5930 + }, + { + "epoch": 1.322997992415793, + "grad_norm": 0.16878628730773926, + "learning_rate": 1.1974412466893237e-05, + "loss": 0.4354, + "step": 5931 + }, + { + "epoch": 1.3232210573276824, + "grad_norm": 0.17208078503608704, + "learning_rate": 1.1972105310934318e-05, + "loss": 0.4149, + "step": 5932 + }, + { + "epoch": 1.3234441222395716, + "grad_norm": 0.1662035584449768, + "learning_rate": 1.1969798045747805e-05, + "loss": 0.4224, + "step": 5933 + }, + { + "epoch": 1.3236671871514611, + "grad_norm": 0.16633039712905884, + "learning_rate": 1.1967490671461484e-05, + "loss": 0.4621, + "step": 5934 + }, + { + "epoch": 1.3238902520633504, + "grad_norm": 0.17780262231826782, + "learning_rate": 1.1965183188203154e-05, + "loss": 0.4969, + "step": 5935 + }, + { + "epoch": 1.32411331697524, + "grad_norm": 0.16385456919670105, + "learning_rate": 1.1962875596100616e-05, + "loss": 0.4476, + "step": 5936 + }, + { + "epoch": 1.3243363818871292, + "grad_norm": 0.17453844845294952, + "learning_rate": 1.1960567895281682e-05, + "loss": 0.4784, + "step": 5937 + }, + { + "epoch": 1.3245594467990185, + "grad_norm": 0.17324872314929962, + "learning_rate": 1.1958260085874165e-05, + "loss": 0.4734, + "step": 5938 + }, + { + "epoch": 1.3247825117109078, + "grad_norm": 0.1686837375164032, + "learning_rate": 1.1955952168005889e-05, + "loss": 0.4497, + "step": 5939 + }, + { + "epoch": 1.3250055766227973, + "grad_norm": 0.17261843383312225, + "learning_rate": 1.1953644141804675e-05, + "loss": 0.4805, + "step": 5940 + }, + { + "epoch": 1.3252286415346866, + "grad_norm": 0.16724753379821777, + "learning_rate": 1.1951336007398362e-05, + "loss": 0.4568, + "step": 5941 + }, + { + "epoch": 1.325451706446576, + "grad_norm": 0.17275574803352356, + "learning_rate": 1.1949027764914786e-05, + "loss": 0.4567, + "step": 5942 + }, + { + "epoch": 1.3256747713584653, + "grad_norm": 0.16790451109409332, + "learning_rate": 1.194671941448179e-05, + "loss": 0.4821, + "step": 5943 + }, + { + "epoch": 1.3258978362703546, + "grad_norm": 0.16740332543849945, + "learning_rate": 1.1944410956227233e-05, + "loss": 0.4712, + "step": 5944 + }, + { + "epoch": 1.326120901182244, + "grad_norm": 0.185842826962471, + "learning_rate": 1.1942102390278961e-05, + "loss": 0.4639, + "step": 5945 + }, + { + "epoch": 1.3263439660941334, + "grad_norm": 0.1872578263282776, + "learning_rate": 1.1939793716764845e-05, + "loss": 0.4601, + "step": 5946 + }, + { + "epoch": 1.3265670310060227, + "grad_norm": 0.17680978775024414, + "learning_rate": 1.1937484935812749e-05, + "loss": 0.4536, + "step": 5947 + }, + { + "epoch": 1.3267900959179122, + "grad_norm": 0.1672886312007904, + "learning_rate": 1.1935176047550552e-05, + "loss": 0.4573, + "step": 5948 + }, + { + "epoch": 1.3270131608298015, + "grad_norm": 0.16590574383735657, + "learning_rate": 1.1932867052106132e-05, + "loss": 0.4702, + "step": 5949 + }, + { + "epoch": 1.3272362257416908, + "grad_norm": 0.15358559787273407, + "learning_rate": 1.1930557949607378e-05, + "loss": 0.4202, + "step": 5950 + }, + { + "epoch": 1.3274592906535803, + "grad_norm": 0.16293473541736603, + "learning_rate": 1.1928248740182177e-05, + "loss": 0.4528, + "step": 5951 + }, + { + "epoch": 1.3276823555654695, + "grad_norm": 0.16601158678531647, + "learning_rate": 1.1925939423958437e-05, + "loss": 0.4641, + "step": 5952 + }, + { + "epoch": 1.327905420477359, + "grad_norm": 0.16637404263019562, + "learning_rate": 1.1923630001064052e-05, + "loss": 0.4606, + "step": 5953 + }, + { + "epoch": 1.3281284853892483, + "grad_norm": 0.16997739672660828, + "learning_rate": 1.1921320471626939e-05, + "loss": 0.4688, + "step": 5954 + }, + { + "epoch": 1.3283515503011376, + "grad_norm": 0.1634935885667801, + "learning_rate": 1.1919010835775015e-05, + "loss": 0.4367, + "step": 5955 + }, + { + "epoch": 1.3285746152130269, + "grad_norm": 0.16118724644184113, + "learning_rate": 1.1916701093636196e-05, + "loss": 0.4367, + "step": 5956 + }, + { + "epoch": 1.3287976801249164, + "grad_norm": 0.16823935508728027, + "learning_rate": 1.1914391245338417e-05, + "loss": 0.4566, + "step": 5957 + }, + { + "epoch": 1.3290207450368057, + "grad_norm": 0.16638781130313873, + "learning_rate": 1.1912081291009608e-05, + "loss": 0.4533, + "step": 5958 + }, + { + "epoch": 1.3292438099486952, + "grad_norm": 0.20548874139785767, + "learning_rate": 1.1909771230777709e-05, + "loss": 0.4482, + "step": 5959 + }, + { + "epoch": 1.3294668748605845, + "grad_norm": 0.16751278936862946, + "learning_rate": 1.1907461064770667e-05, + "loss": 0.4552, + "step": 5960 + }, + { + "epoch": 1.3296899397724737, + "grad_norm": 0.17895007133483887, + "learning_rate": 1.1905150793116433e-05, + "loss": 0.4638, + "step": 5961 + }, + { + "epoch": 1.329913004684363, + "grad_norm": 0.17109131813049316, + "learning_rate": 1.1902840415942964e-05, + "loss": 0.4386, + "step": 5962 + }, + { + "epoch": 1.3301360695962525, + "grad_norm": 0.1729149967432022, + "learning_rate": 1.1900529933378224e-05, + "loss": 0.4626, + "step": 5963 + }, + { + "epoch": 1.3303591345081418, + "grad_norm": 0.16538968682289124, + "learning_rate": 1.189821934555018e-05, + "loss": 0.4514, + "step": 5964 + }, + { + "epoch": 1.3305821994200313, + "grad_norm": 0.1783122420310974, + "learning_rate": 1.189590865258681e-05, + "loss": 0.4652, + "step": 5965 + }, + { + "epoch": 1.3308052643319206, + "grad_norm": 0.15844860672950745, + "learning_rate": 1.1893597854616092e-05, + "loss": 0.4402, + "step": 5966 + }, + { + "epoch": 1.3310283292438099, + "grad_norm": 0.1638982594013214, + "learning_rate": 1.1891286951766014e-05, + "loss": 0.4608, + "step": 5967 + }, + { + "epoch": 1.3312513941556994, + "grad_norm": 0.1644224226474762, + "learning_rate": 1.1888975944164567e-05, + "loss": 0.4526, + "step": 5968 + }, + { + "epoch": 1.3314744590675887, + "grad_norm": 0.20200717449188232, + "learning_rate": 1.1886664831939751e-05, + "loss": 0.4611, + "step": 5969 + }, + { + "epoch": 1.3316975239794782, + "grad_norm": 0.1918911337852478, + "learning_rate": 1.188435361521957e-05, + "loss": 0.4704, + "step": 5970 + }, + { + "epoch": 1.3319205888913674, + "grad_norm": 0.16691647469997406, + "learning_rate": 1.1882042294132032e-05, + "loss": 0.4787, + "step": 5971 + }, + { + "epoch": 1.3321436538032567, + "grad_norm": 0.16211196780204773, + "learning_rate": 1.1879730868805153e-05, + "loss": 0.4535, + "step": 5972 + }, + { + "epoch": 1.332366718715146, + "grad_norm": 0.16488417983055115, + "learning_rate": 1.1877419339366953e-05, + "loss": 0.4461, + "step": 5973 + }, + { + "epoch": 1.3325897836270355, + "grad_norm": 0.16110685467720032, + "learning_rate": 1.1875107705945461e-05, + "loss": 0.444, + "step": 5974 + }, + { + "epoch": 1.3328128485389248, + "grad_norm": 0.16213123500347137, + "learning_rate": 1.187279596866871e-05, + "loss": 0.4324, + "step": 5975 + }, + { + "epoch": 1.3330359134508143, + "grad_norm": 0.17590108513832092, + "learning_rate": 1.1870484127664739e-05, + "loss": 0.4416, + "step": 5976 + }, + { + "epoch": 1.3332589783627036, + "grad_norm": 0.20271827280521393, + "learning_rate": 1.186817218306159e-05, + "loss": 0.4514, + "step": 5977 + }, + { + "epoch": 1.3334820432745929, + "grad_norm": 0.17544665932655334, + "learning_rate": 1.1865860134987317e-05, + "loss": 0.4788, + "step": 5978 + }, + { + "epoch": 1.3337051081864821, + "grad_norm": 0.2024955302476883, + "learning_rate": 1.1863547983569967e-05, + "loss": 0.4804, + "step": 5979 + }, + { + "epoch": 1.3339281730983716, + "grad_norm": 0.17373384535312653, + "learning_rate": 1.1861235728937613e-05, + "loss": 0.4485, + "step": 5980 + }, + { + "epoch": 1.334151238010261, + "grad_norm": 0.17016762495040894, + "learning_rate": 1.1858923371218314e-05, + "loss": 0.4399, + "step": 5981 + }, + { + "epoch": 1.3343743029221504, + "grad_norm": 0.167991504073143, + "learning_rate": 1.1856610910540149e-05, + "loss": 0.4612, + "step": 5982 + }, + { + "epoch": 1.3345973678340397, + "grad_norm": 0.1790834218263626, + "learning_rate": 1.185429834703119e-05, + "loss": 0.456, + "step": 5983 + }, + { + "epoch": 1.334820432745929, + "grad_norm": 0.16934290528297424, + "learning_rate": 1.1851985680819526e-05, + "loss": 0.4717, + "step": 5984 + }, + { + "epoch": 1.3350434976578185, + "grad_norm": 0.1672845035791397, + "learning_rate": 1.1849672912033245e-05, + "loss": 0.4748, + "step": 5985 + }, + { + "epoch": 1.3352665625697078, + "grad_norm": 0.16489911079406738, + "learning_rate": 1.1847360040800442e-05, + "loss": 0.416, + "step": 5986 + }, + { + "epoch": 1.3354896274815973, + "grad_norm": 0.18953464925289154, + "learning_rate": 1.184504706724922e-05, + "loss": 0.4601, + "step": 5987 + }, + { + "epoch": 1.3357126923934866, + "grad_norm": 0.16453106701374054, + "learning_rate": 1.1842733991507687e-05, + "loss": 0.4417, + "step": 5988 + }, + { + "epoch": 1.3359357573053758, + "grad_norm": 0.1774827390909195, + "learning_rate": 1.1840420813703955e-05, + "loss": 0.4459, + "step": 5989 + }, + { + "epoch": 1.3361588222172651, + "grad_norm": 0.1909363716840744, + "learning_rate": 1.183810753396614e-05, + "loss": 0.4695, + "step": 5990 + }, + { + "epoch": 1.3363818871291546, + "grad_norm": 0.18591512739658356, + "learning_rate": 1.1835794152422366e-05, + "loss": 0.4579, + "step": 5991 + }, + { + "epoch": 1.336604952041044, + "grad_norm": 0.1766308695077896, + "learning_rate": 1.1833480669200765e-05, + "loss": 0.4595, + "step": 5992 + }, + { + "epoch": 1.3368280169529334, + "grad_norm": 0.16689802706241608, + "learning_rate": 1.1831167084429474e-05, + "loss": 0.4371, + "step": 5993 + }, + { + "epoch": 1.3370510818648227, + "grad_norm": 0.180791437625885, + "learning_rate": 1.1828853398236629e-05, + "loss": 0.4773, + "step": 5994 + }, + { + "epoch": 1.337274146776712, + "grad_norm": 0.18029746413230896, + "learning_rate": 1.182653961075038e-05, + "loss": 0.4281, + "step": 5995 + }, + { + "epoch": 1.3374972116886013, + "grad_norm": 0.16131596267223358, + "learning_rate": 1.1824225722098877e-05, + "loss": 0.4479, + "step": 5996 + }, + { + "epoch": 1.3377202766004908, + "grad_norm": 0.16809871792793274, + "learning_rate": 1.182191173241028e-05, + "loss": 0.4473, + "step": 5997 + }, + { + "epoch": 1.33794334151238, + "grad_norm": 0.17205321788787842, + "learning_rate": 1.181959764181275e-05, + "loss": 0.4357, + "step": 5998 + }, + { + "epoch": 1.3381664064242695, + "grad_norm": 0.17919965088367462, + "learning_rate": 1.1817283450434459e-05, + "loss": 0.4892, + "step": 5999 + }, + { + "epoch": 1.3383894713361588, + "grad_norm": 0.16656439006328583, + "learning_rate": 1.1814969158403575e-05, + "loss": 0.4686, + "step": 6000 + }, + { + "epoch": 1.338612536248048, + "grad_norm": 0.16457730531692505, + "learning_rate": 1.1812654765848285e-05, + "loss": 0.4346, + "step": 6001 + }, + { + "epoch": 1.3388356011599376, + "grad_norm": 0.17078131437301636, + "learning_rate": 1.1810340272896772e-05, + "loss": 0.4567, + "step": 6002 + }, + { + "epoch": 1.3390586660718269, + "grad_norm": 0.16368982195854187, + "learning_rate": 1.1808025679677229e-05, + "loss": 0.451, + "step": 6003 + }, + { + "epoch": 1.3392817309837164, + "grad_norm": 0.16397084295749664, + "learning_rate": 1.1805710986317846e-05, + "loss": 0.4541, + "step": 6004 + }, + { + "epoch": 1.3395047958956057, + "grad_norm": 0.16752174496650696, + "learning_rate": 1.1803396192946835e-05, + "loss": 0.4714, + "step": 6005 + }, + { + "epoch": 1.339727860807495, + "grad_norm": 0.17410169541835785, + "learning_rate": 1.1801081299692396e-05, + "loss": 0.4782, + "step": 6006 + }, + { + "epoch": 1.3399509257193842, + "grad_norm": 0.16793891787528992, + "learning_rate": 1.1798766306682746e-05, + "loss": 0.4733, + "step": 6007 + }, + { + "epoch": 1.3401739906312737, + "grad_norm": 0.17633196711540222, + "learning_rate": 1.1796451214046106e-05, + "loss": 0.4668, + "step": 6008 + }, + { + "epoch": 1.340397055543163, + "grad_norm": 0.17554394900798798, + "learning_rate": 1.1794136021910694e-05, + "loss": 0.4375, + "step": 6009 + }, + { + "epoch": 1.3406201204550525, + "grad_norm": 0.1725650429725647, + "learning_rate": 1.1791820730404746e-05, + "loss": 0.46, + "step": 6010 + }, + { + "epoch": 1.3408431853669418, + "grad_norm": 0.17213909327983856, + "learning_rate": 1.1789505339656493e-05, + "loss": 0.463, + "step": 6011 + }, + { + "epoch": 1.341066250278831, + "grad_norm": 0.17620781064033508, + "learning_rate": 1.1787189849794178e-05, + "loss": 0.4817, + "step": 6012 + }, + { + "epoch": 1.3412893151907204, + "grad_norm": 0.1637295037508011, + "learning_rate": 1.1784874260946048e-05, + "loss": 0.4883, + "step": 6013 + }, + { + "epoch": 1.3415123801026099, + "grad_norm": 0.17048433423042297, + "learning_rate": 1.1782558573240355e-05, + "loss": 0.4333, + "step": 6014 + }, + { + "epoch": 1.3417354450144992, + "grad_norm": 0.17557942867279053, + "learning_rate": 1.1780242786805353e-05, + "loss": 0.4704, + "step": 6015 + }, + { + "epoch": 1.3419585099263887, + "grad_norm": 0.17155835032463074, + "learning_rate": 1.177792690176931e-05, + "loss": 0.4627, + "step": 6016 + }, + { + "epoch": 1.342181574838278, + "grad_norm": 0.1626436561346054, + "learning_rate": 1.177561091826049e-05, + "loss": 0.4703, + "step": 6017 + }, + { + "epoch": 1.3424046397501672, + "grad_norm": 0.18032924830913544, + "learning_rate": 1.177329483640717e-05, + "loss": 0.4727, + "step": 6018 + }, + { + "epoch": 1.3426277046620567, + "grad_norm": 0.18225564062595367, + "learning_rate": 1.1770978656337626e-05, + "loss": 0.4737, + "step": 6019 + }, + { + "epoch": 1.342850769573946, + "grad_norm": 0.22427327930927277, + "learning_rate": 1.1768662378180144e-05, + "loss": 0.4353, + "step": 6020 + }, + { + "epoch": 1.3430738344858355, + "grad_norm": 0.16713419556617737, + "learning_rate": 1.1766346002063017e-05, + "loss": 0.4674, + "step": 6021 + }, + { + "epoch": 1.3432968993977248, + "grad_norm": 0.17510871589183807, + "learning_rate": 1.1764029528114533e-05, + "loss": 0.4907, + "step": 6022 + }, + { + "epoch": 1.343519964309614, + "grad_norm": 0.20499172806739807, + "learning_rate": 1.1761712956463003e-05, + "loss": 0.4798, + "step": 6023 + }, + { + "epoch": 1.3437430292215033, + "grad_norm": 0.17217415571212769, + "learning_rate": 1.1759396287236721e-05, + "loss": 0.4642, + "step": 6024 + }, + { + "epoch": 1.3439660941333929, + "grad_norm": 0.17448726296424866, + "learning_rate": 1.1757079520564012e-05, + "loss": 0.4569, + "step": 6025 + }, + { + "epoch": 1.3441891590452821, + "grad_norm": 0.16537372767925262, + "learning_rate": 1.1754762656573182e-05, + "loss": 0.4348, + "step": 6026 + }, + { + "epoch": 1.3444122239571716, + "grad_norm": 0.16384288668632507, + "learning_rate": 1.1752445695392563e-05, + "loss": 0.444, + "step": 6027 + }, + { + "epoch": 1.344635288869061, + "grad_norm": 0.25420817732810974, + "learning_rate": 1.1750128637150473e-05, + "loss": 0.4489, + "step": 6028 + }, + { + "epoch": 1.3448583537809502, + "grad_norm": 0.17444314062595367, + "learning_rate": 1.1747811481975253e-05, + "loss": 0.4641, + "step": 6029 + }, + { + "epoch": 1.3450814186928395, + "grad_norm": 0.17278996109962463, + "learning_rate": 1.1745494229995237e-05, + "loss": 0.4622, + "step": 6030 + }, + { + "epoch": 1.345304483604729, + "grad_norm": 0.1730216145515442, + "learning_rate": 1.1743176881338773e-05, + "loss": 0.4413, + "step": 6031 + }, + { + "epoch": 1.3455275485166183, + "grad_norm": 0.1769610494375229, + "learning_rate": 1.174085943613421e-05, + "loss": 0.4739, + "step": 6032 + }, + { + "epoch": 1.3457506134285078, + "grad_norm": 0.17104336619377136, + "learning_rate": 1.1738541894509898e-05, + "loss": 0.4513, + "step": 6033 + }, + { + "epoch": 1.345973678340397, + "grad_norm": 0.1742696762084961, + "learning_rate": 1.17362242565942e-05, + "loss": 0.4549, + "step": 6034 + }, + { + "epoch": 1.3461967432522863, + "grad_norm": 0.16949966549873352, + "learning_rate": 1.1733906522515483e-05, + "loss": 0.4773, + "step": 6035 + }, + { + "epoch": 1.3464198081641758, + "grad_norm": 0.17838764190673828, + "learning_rate": 1.1731588692402114e-05, + "loss": 0.47, + "step": 6036 + }, + { + "epoch": 1.3466428730760651, + "grad_norm": 0.1655292510986328, + "learning_rate": 1.1729270766382474e-05, + "loss": 0.4577, + "step": 6037 + }, + { + "epoch": 1.3468659379879546, + "grad_norm": 0.1620989590883255, + "learning_rate": 1.172695274458494e-05, + "loss": 0.427, + "step": 6038 + }, + { + "epoch": 1.347089002899844, + "grad_norm": 0.15767668187618256, + "learning_rate": 1.1724634627137896e-05, + "loss": 0.4419, + "step": 6039 + }, + { + "epoch": 1.3473120678117332, + "grad_norm": 0.17484217882156372, + "learning_rate": 1.1722316414169743e-05, + "loss": 0.4221, + "step": 6040 + }, + { + "epoch": 1.3475351327236225, + "grad_norm": 0.16977369785308838, + "learning_rate": 1.1719998105808871e-05, + "loss": 0.4639, + "step": 6041 + }, + { + "epoch": 1.347758197635512, + "grad_norm": 0.16105391085147858, + "learning_rate": 1.1717679702183687e-05, + "loss": 0.4707, + "step": 6042 + }, + { + "epoch": 1.3479812625474012, + "grad_norm": 0.16509099304676056, + "learning_rate": 1.1715361203422595e-05, + "loss": 0.4378, + "step": 6043 + }, + { + "epoch": 1.3482043274592908, + "grad_norm": 0.18078747391700745, + "learning_rate": 1.1713042609654008e-05, + "loss": 0.4565, + "step": 6044 + }, + { + "epoch": 1.34842739237118, + "grad_norm": 0.17105768620967865, + "learning_rate": 1.1710723921006348e-05, + "loss": 0.4574, + "step": 6045 + }, + { + "epoch": 1.3486504572830693, + "grad_norm": 0.17831863462924957, + "learning_rate": 1.1708405137608036e-05, + "loss": 0.4789, + "step": 6046 + }, + { + "epoch": 1.3488735221949586, + "grad_norm": 0.17648382484912872, + "learning_rate": 1.1706086259587499e-05, + "loss": 0.4757, + "step": 6047 + }, + { + "epoch": 1.349096587106848, + "grad_norm": 0.1652226746082306, + "learning_rate": 1.1703767287073177e-05, + "loss": 0.4559, + "step": 6048 + }, + { + "epoch": 1.3493196520187374, + "grad_norm": 0.16818653047084808, + "learning_rate": 1.1701448220193503e-05, + "loss": 0.4739, + "step": 6049 + }, + { + "epoch": 1.3495427169306269, + "grad_norm": 0.17042328417301178, + "learning_rate": 1.1699129059076925e-05, + "loss": 0.4512, + "step": 6050 + }, + { + "epoch": 1.3497657818425162, + "grad_norm": 0.16678601503372192, + "learning_rate": 1.1696809803851891e-05, + "loss": 0.4454, + "step": 6051 + }, + { + "epoch": 1.3499888467544054, + "grad_norm": 0.1808800995349884, + "learning_rate": 1.1694490454646858e-05, + "loss": 0.4656, + "step": 6052 + }, + { + "epoch": 1.350211911666295, + "grad_norm": 0.17242035269737244, + "learning_rate": 1.1692171011590282e-05, + "loss": 0.4635, + "step": 6053 + }, + { + "epoch": 1.3504349765781842, + "grad_norm": 0.16750234365463257, + "learning_rate": 1.1689851474810636e-05, + "loss": 0.4627, + "step": 6054 + }, + { + "epoch": 1.3506580414900737, + "grad_norm": 0.1674799621105194, + "learning_rate": 1.1687531844436382e-05, + "loss": 0.4452, + "step": 6055 + }, + { + "epoch": 1.350881106401963, + "grad_norm": 0.17459098994731903, + "learning_rate": 1.1685212120596001e-05, + "loss": 0.467, + "step": 6056 + }, + { + "epoch": 1.3511041713138523, + "grad_norm": 0.17721140384674072, + "learning_rate": 1.168289230341797e-05, + "loss": 0.4788, + "step": 6057 + }, + { + "epoch": 1.3513272362257416, + "grad_norm": 0.167550191283226, + "learning_rate": 1.168057239303078e-05, + "loss": 0.4488, + "step": 6058 + }, + { + "epoch": 1.351550301137631, + "grad_norm": 0.17352843284606934, + "learning_rate": 1.1678252389562919e-05, + "loss": 0.4582, + "step": 6059 + }, + { + "epoch": 1.3517733660495204, + "grad_norm": 0.1822769194841385, + "learning_rate": 1.1675932293142882e-05, + "loss": 0.4597, + "step": 6060 + }, + { + "epoch": 1.3519964309614099, + "grad_norm": 0.16711196303367615, + "learning_rate": 1.1673612103899176e-05, + "loss": 0.4624, + "step": 6061 + }, + { + "epoch": 1.3522194958732991, + "grad_norm": 0.16815169155597687, + "learning_rate": 1.16712918219603e-05, + "loss": 0.4416, + "step": 6062 + }, + { + "epoch": 1.3524425607851884, + "grad_norm": 0.17060011625289917, + "learning_rate": 1.1668971447454775e-05, + "loss": 0.4472, + "step": 6063 + }, + { + "epoch": 1.3526656256970777, + "grad_norm": 0.1819947212934494, + "learning_rate": 1.1666650980511112e-05, + "loss": 0.4791, + "step": 6064 + }, + { + "epoch": 1.3528886906089672, + "grad_norm": 0.16703422367572784, + "learning_rate": 1.1664330421257835e-05, + "loss": 0.4596, + "step": 6065 + }, + { + "epoch": 1.3531117555208565, + "grad_norm": 0.16398312151432037, + "learning_rate": 1.1662009769823466e-05, + "loss": 0.46, + "step": 6066 + }, + { + "epoch": 1.353334820432746, + "grad_norm": 0.177035391330719, + "learning_rate": 1.1659689026336544e-05, + "loss": 0.4566, + "step": 6067 + }, + { + "epoch": 1.3535578853446353, + "grad_norm": 0.17809878289699554, + "learning_rate": 1.1657368190925602e-05, + "loss": 0.4687, + "step": 6068 + }, + { + "epoch": 1.3537809502565246, + "grad_norm": 0.17178352177143097, + "learning_rate": 1.1655047263719188e-05, + "loss": 0.4609, + "step": 6069 + }, + { + "epoch": 1.354004015168414, + "grad_norm": 0.16717985272407532, + "learning_rate": 1.1652726244845843e-05, + "loss": 0.4341, + "step": 6070 + }, + { + "epoch": 1.3542270800803033, + "grad_norm": 0.1680830717086792, + "learning_rate": 1.1650405134434122e-05, + "loss": 0.4286, + "step": 6071 + }, + { + "epoch": 1.3544501449921929, + "grad_norm": 0.1840554177761078, + "learning_rate": 1.1648083932612584e-05, + "loss": 0.4781, + "step": 6072 + }, + { + "epoch": 1.3546732099040821, + "grad_norm": 0.16866901516914368, + "learning_rate": 1.164576263950979e-05, + "loss": 0.4588, + "step": 6073 + }, + { + "epoch": 1.3548962748159714, + "grad_norm": 0.18350256979465485, + "learning_rate": 1.164344125525431e-05, + "loss": 0.4828, + "step": 6074 + }, + { + "epoch": 1.3551193397278607, + "grad_norm": 0.18490628898143768, + "learning_rate": 1.1641119779974717e-05, + "loss": 0.4127, + "step": 6075 + }, + { + "epoch": 1.3553424046397502, + "grad_norm": 0.1628909409046173, + "learning_rate": 1.1638798213799585e-05, + "loss": 0.4403, + "step": 6076 + }, + { + "epoch": 1.3555654695516395, + "grad_norm": 0.17124617099761963, + "learning_rate": 1.1636476556857502e-05, + "loss": 0.4638, + "step": 6077 + }, + { + "epoch": 1.355788534463529, + "grad_norm": 0.18702422082424164, + "learning_rate": 1.1634154809277052e-05, + "loss": 0.4675, + "step": 6078 + }, + { + "epoch": 1.3560115993754183, + "grad_norm": 0.174610435962677, + "learning_rate": 1.1631832971186827e-05, + "loss": 0.448, + "step": 6079 + }, + { + "epoch": 1.3562346642873075, + "grad_norm": 0.165745347738266, + "learning_rate": 1.162951104271543e-05, + "loss": 0.4722, + "step": 6080 + }, + { + "epoch": 1.356457729199197, + "grad_norm": 0.1741677075624466, + "learning_rate": 1.162718902399146e-05, + "loss": 0.4478, + "step": 6081 + }, + { + "epoch": 1.3566807941110863, + "grad_norm": 0.17158982157707214, + "learning_rate": 1.162486691514353e-05, + "loss": 0.4594, + "step": 6082 + }, + { + "epoch": 1.3569038590229756, + "grad_norm": 0.16993384063243866, + "learning_rate": 1.1622544716300245e-05, + "loss": 0.4304, + "step": 6083 + }, + { + "epoch": 1.3571269239348651, + "grad_norm": 0.1746159791946411, + "learning_rate": 1.1620222427590232e-05, + "loss": 0.4395, + "step": 6084 + }, + { + "epoch": 1.3573499888467544, + "grad_norm": 0.18195772171020508, + "learning_rate": 1.1617900049142105e-05, + "loss": 0.449, + "step": 6085 + }, + { + "epoch": 1.3575730537586437, + "grad_norm": 0.19646844267845154, + "learning_rate": 1.16155775810845e-05, + "loss": 0.4391, + "step": 6086 + }, + { + "epoch": 1.3577961186705332, + "grad_norm": 0.17097879946231842, + "learning_rate": 1.1613255023546043e-05, + "loss": 0.4744, + "step": 6087 + }, + { + "epoch": 1.3580191835824225, + "grad_norm": 0.18478180468082428, + "learning_rate": 1.1610932376655377e-05, + "loss": 0.5241, + "step": 6088 + }, + { + "epoch": 1.358242248494312, + "grad_norm": 0.16517724096775055, + "learning_rate": 1.1608609640541142e-05, + "loss": 0.465, + "step": 6089 + }, + { + "epoch": 1.3584653134062012, + "grad_norm": 0.17419113218784332, + "learning_rate": 1.1606286815331988e-05, + "loss": 0.4501, + "step": 6090 + }, + { + "epoch": 1.3586883783180905, + "grad_norm": 0.18028919398784637, + "learning_rate": 1.1603963901156563e-05, + "loss": 0.5103, + "step": 6091 + }, + { + "epoch": 1.3589114432299798, + "grad_norm": 0.17037071287631989, + "learning_rate": 1.1601640898143529e-05, + "loss": 0.48, + "step": 6092 + }, + { + "epoch": 1.3591345081418693, + "grad_norm": 0.1665482372045517, + "learning_rate": 1.1599317806421548e-05, + "loss": 0.4826, + "step": 6093 + }, + { + "epoch": 1.3593575730537586, + "grad_norm": 0.18023498356342316, + "learning_rate": 1.1596994626119287e-05, + "loss": 0.4615, + "step": 6094 + }, + { + "epoch": 1.359580637965648, + "grad_norm": 0.16920949518680573, + "learning_rate": 1.159467135736542e-05, + "loss": 0.4777, + "step": 6095 + }, + { + "epoch": 1.3598037028775374, + "grad_norm": 0.17280833423137665, + "learning_rate": 1.1592348000288618e-05, + "loss": 0.4691, + "step": 6096 + }, + { + "epoch": 1.3600267677894267, + "grad_norm": 0.17000465095043182, + "learning_rate": 1.1590024555017571e-05, + "loss": 0.44, + "step": 6097 + }, + { + "epoch": 1.3602498327013162, + "grad_norm": 0.1656549870967865, + "learning_rate": 1.1587701021680959e-05, + "loss": 0.4266, + "step": 6098 + }, + { + "epoch": 1.3604728976132054, + "grad_norm": 0.1633346974849701, + "learning_rate": 1.1585377400407483e-05, + "loss": 0.3994, + "step": 6099 + }, + { + "epoch": 1.3606959625250947, + "grad_norm": 0.1661108434200287, + "learning_rate": 1.1583053691325829e-05, + "loss": 0.4668, + "step": 6100 + }, + { + "epoch": 1.3609190274369842, + "grad_norm": 0.17027215659618378, + "learning_rate": 1.1580729894564706e-05, + "loss": 0.4536, + "step": 6101 + }, + { + "epoch": 1.3611420923488735, + "grad_norm": 0.17399293184280396, + "learning_rate": 1.1578406010252818e-05, + "loss": 0.4718, + "step": 6102 + }, + { + "epoch": 1.3613651572607628, + "grad_norm": 0.16790112853050232, + "learning_rate": 1.1576082038518876e-05, + "loss": 0.4487, + "step": 6103 + }, + { + "epoch": 1.3615882221726523, + "grad_norm": 0.16512347757816315, + "learning_rate": 1.15737579794916e-05, + "loss": 0.4497, + "step": 6104 + }, + { + "epoch": 1.3618112870845416, + "grad_norm": 0.16861163079738617, + "learning_rate": 1.1571433833299703e-05, + "loss": 0.4552, + "step": 6105 + }, + { + "epoch": 1.362034351996431, + "grad_norm": 0.16993777453899384, + "learning_rate": 1.156910960007192e-05, + "loss": 0.4651, + "step": 6106 + }, + { + "epoch": 1.3622574169083204, + "grad_norm": 0.16158044338226318, + "learning_rate": 1.1566785279936972e-05, + "loss": 0.4361, + "step": 6107 + }, + { + "epoch": 1.3624804818202096, + "grad_norm": 0.17330992221832275, + "learning_rate": 1.1564460873023604e-05, + "loss": 0.4651, + "step": 6108 + }, + { + "epoch": 1.362703546732099, + "grad_norm": 0.16832031309604645, + "learning_rate": 1.156213637946055e-05, + "loss": 0.4802, + "step": 6109 + }, + { + "epoch": 1.3629266116439884, + "grad_norm": 0.17535291612148285, + "learning_rate": 1.1559811799376557e-05, + "loss": 0.4598, + "step": 6110 + }, + { + "epoch": 1.3631496765558777, + "grad_norm": 0.17599309980869293, + "learning_rate": 1.1557487132900376e-05, + "loss": 0.4568, + "step": 6111 + }, + { + "epoch": 1.3633727414677672, + "grad_norm": 0.18158310651779175, + "learning_rate": 1.155516238016076e-05, + "loss": 0.4597, + "step": 6112 + }, + { + "epoch": 1.3635958063796565, + "grad_norm": 0.1722884625196457, + "learning_rate": 1.1552837541286468e-05, + "loss": 0.4703, + "step": 6113 + }, + { + "epoch": 1.3638188712915458, + "grad_norm": 0.16803938150405884, + "learning_rate": 1.1550512616406269e-05, + "loss": 0.4524, + "step": 6114 + }, + { + "epoch": 1.3640419362034353, + "grad_norm": 0.16398029029369354, + "learning_rate": 1.1548187605648923e-05, + "loss": 0.476, + "step": 6115 + }, + { + "epoch": 1.3642650011153246, + "grad_norm": 0.17081895470619202, + "learning_rate": 1.1545862509143212e-05, + "loss": 0.4794, + "step": 6116 + }, + { + "epoch": 1.3644880660272138, + "grad_norm": 0.16784706711769104, + "learning_rate": 1.1543537327017911e-05, + "loss": 0.4522, + "step": 6117 + }, + { + "epoch": 1.3647111309391033, + "grad_norm": 0.1711539328098297, + "learning_rate": 1.1541212059401806e-05, + "loss": 0.4607, + "step": 6118 + }, + { + "epoch": 1.3649341958509926, + "grad_norm": 0.17196914553642273, + "learning_rate": 1.1538886706423678e-05, + "loss": 0.4689, + "step": 6119 + }, + { + "epoch": 1.365157260762882, + "grad_norm": 0.17912612855434418, + "learning_rate": 1.1536561268212328e-05, + "loss": 0.4616, + "step": 6120 + }, + { + "epoch": 1.3653803256747714, + "grad_norm": 0.16970542073249817, + "learning_rate": 1.1534235744896547e-05, + "loss": 0.4481, + "step": 6121 + }, + { + "epoch": 1.3656033905866607, + "grad_norm": 0.17101642489433289, + "learning_rate": 1.153191013660514e-05, + "loss": 0.4507, + "step": 6122 + }, + { + "epoch": 1.3658264554985502, + "grad_norm": 0.16778020560741425, + "learning_rate": 1.1529584443466915e-05, + "loss": 0.4473, + "step": 6123 + }, + { + "epoch": 1.3660495204104395, + "grad_norm": 0.16892312467098236, + "learning_rate": 1.1527258665610681e-05, + "loss": 0.4837, + "step": 6124 + }, + { + "epoch": 1.3662725853223288, + "grad_norm": 0.1817074865102768, + "learning_rate": 1.1524932803165254e-05, + "loss": 0.4611, + "step": 6125 + }, + { + "epoch": 1.366495650234218, + "grad_norm": 0.1750434935092926, + "learning_rate": 1.1522606856259457e-05, + "loss": 0.4877, + "step": 6126 + }, + { + "epoch": 1.3667187151461075, + "grad_norm": 0.16866067051887512, + "learning_rate": 1.1520280825022116e-05, + "loss": 0.4593, + "step": 6127 + }, + { + "epoch": 1.3669417800579968, + "grad_norm": 0.16479606926441193, + "learning_rate": 1.1517954709582058e-05, + "loss": 0.4685, + "step": 6128 + }, + { + "epoch": 1.3671648449698863, + "grad_norm": 0.16955013573169708, + "learning_rate": 1.1515628510068122e-05, + "loss": 0.4905, + "step": 6129 + }, + { + "epoch": 1.3673879098817756, + "grad_norm": 0.1888909786939621, + "learning_rate": 1.1513302226609144e-05, + "loss": 0.4382, + "step": 6130 + }, + { + "epoch": 1.367610974793665, + "grad_norm": 0.179367333650589, + "learning_rate": 1.151097585933397e-05, + "loss": 0.4522, + "step": 6131 + }, + { + "epoch": 1.3678340397055544, + "grad_norm": 0.16989760100841522, + "learning_rate": 1.1508649408371448e-05, + "loss": 0.4614, + "step": 6132 + }, + { + "epoch": 1.3680571046174437, + "grad_norm": 0.17462711036205292, + "learning_rate": 1.1506322873850434e-05, + "loss": 0.4512, + "step": 6133 + }, + { + "epoch": 1.368280169529333, + "grad_norm": 0.18676964938640594, + "learning_rate": 1.1503996255899783e-05, + "loss": 0.4778, + "step": 6134 + }, + { + "epoch": 1.3685032344412225, + "grad_norm": 0.1641889065504074, + "learning_rate": 1.1501669554648359e-05, + "loss": 0.4562, + "step": 6135 + }, + { + "epoch": 1.3687262993531117, + "grad_norm": 0.1635216772556305, + "learning_rate": 1.1499342770225028e-05, + "loss": 0.456, + "step": 6136 + }, + { + "epoch": 1.368949364265001, + "grad_norm": 0.20571352541446686, + "learning_rate": 1.1497015902758663e-05, + "loss": 0.4625, + "step": 6137 + }, + { + "epoch": 1.3691724291768905, + "grad_norm": 0.1723473221063614, + "learning_rate": 1.1494688952378141e-05, + "loss": 0.4592, + "step": 6138 + }, + { + "epoch": 1.3693954940887798, + "grad_norm": 0.167448028922081, + "learning_rate": 1.1492361919212345e-05, + "loss": 0.454, + "step": 6139 + }, + { + "epoch": 1.3696185590006693, + "grad_norm": 0.16840708255767822, + "learning_rate": 1.1490034803390157e-05, + "loss": 0.4703, + "step": 6140 + }, + { + "epoch": 1.3698416239125586, + "grad_norm": 0.17156104743480682, + "learning_rate": 1.148770760504047e-05, + "loss": 0.4603, + "step": 6141 + }, + { + "epoch": 1.3700646888244479, + "grad_norm": 0.16438238322734833, + "learning_rate": 1.1485380324292175e-05, + "loss": 0.4521, + "step": 6142 + }, + { + "epoch": 1.3702877537363372, + "grad_norm": 0.17488059401512146, + "learning_rate": 1.1483052961274177e-05, + "loss": 0.4845, + "step": 6143 + }, + { + "epoch": 1.3705108186482267, + "grad_norm": 0.16744007170200348, + "learning_rate": 1.1480725516115374e-05, + "loss": 0.441, + "step": 6144 + }, + { + "epoch": 1.370733883560116, + "grad_norm": 0.16667571663856506, + "learning_rate": 1.1478397988944683e-05, + "loss": 0.4253, + "step": 6145 + }, + { + "epoch": 1.3709569484720054, + "grad_norm": 0.1638866513967514, + "learning_rate": 1.1476070379891009e-05, + "loss": 0.4539, + "step": 6146 + }, + { + "epoch": 1.3711800133838947, + "grad_norm": 0.2004345804452896, + "learning_rate": 1.1473742689083271e-05, + "loss": 0.4615, + "step": 6147 + }, + { + "epoch": 1.371403078295784, + "grad_norm": 0.17425598204135895, + "learning_rate": 1.1471414916650397e-05, + "loss": 0.4428, + "step": 6148 + }, + { + "epoch": 1.3716261432076735, + "grad_norm": 0.1777925044298172, + "learning_rate": 1.1469087062721305e-05, + "loss": 0.4741, + "step": 6149 + }, + { + "epoch": 1.3718492081195628, + "grad_norm": 0.17382198572158813, + "learning_rate": 1.146675912742493e-05, + "loss": 0.4744, + "step": 6150 + }, + { + "epoch": 1.372072273031452, + "grad_norm": 0.18798965215682983, + "learning_rate": 1.146443111089021e-05, + "loss": 0.4216, + "step": 6151 + }, + { + "epoch": 1.3722953379433416, + "grad_norm": 0.1665268987417221, + "learning_rate": 1.1462103013246086e-05, + "loss": 0.4669, + "step": 6152 + }, + { + "epoch": 1.3725184028552309, + "grad_norm": 0.1658693253993988, + "learning_rate": 1.1459774834621498e-05, + "loss": 0.444, + "step": 6153 + }, + { + "epoch": 1.3727414677671201, + "grad_norm": 0.16875915229320526, + "learning_rate": 1.1457446575145397e-05, + "loss": 0.46, + "step": 6154 + }, + { + "epoch": 1.3729645326790096, + "grad_norm": 0.17175830900669098, + "learning_rate": 1.1455118234946737e-05, + "loss": 0.4879, + "step": 6155 + }, + { + "epoch": 1.373187597590899, + "grad_norm": 0.17912839353084564, + "learning_rate": 1.1452789814154475e-05, + "loss": 0.4488, + "step": 6156 + }, + { + "epoch": 1.3734106625027884, + "grad_norm": 0.16743507981300354, + "learning_rate": 1.1450461312897576e-05, + "loss": 0.4565, + "step": 6157 + }, + { + "epoch": 1.3736337274146777, + "grad_norm": 0.1700303554534912, + "learning_rate": 1.1448132731305005e-05, + "loss": 0.4278, + "step": 6158 + }, + { + "epoch": 1.373856792326567, + "grad_norm": 0.17981837689876556, + "learning_rate": 1.1445804069505735e-05, + "loss": 0.4379, + "step": 6159 + }, + { + "epoch": 1.3740798572384563, + "grad_norm": 0.172749325633049, + "learning_rate": 1.1443475327628739e-05, + "loss": 0.4344, + "step": 6160 + }, + { + "epoch": 1.3743029221503458, + "grad_norm": 0.16664694249629974, + "learning_rate": 1.1441146505803003e-05, + "loss": 0.4574, + "step": 6161 + }, + { + "epoch": 1.374525987062235, + "grad_norm": 0.1706552803516388, + "learning_rate": 1.1438817604157506e-05, + "loss": 0.4525, + "step": 6162 + }, + { + "epoch": 1.3747490519741246, + "grad_norm": 0.17497579753398895, + "learning_rate": 1.1436488622821243e-05, + "loss": 0.4882, + "step": 6163 + }, + { + "epoch": 1.3749721168860138, + "grad_norm": 0.17119908332824707, + "learning_rate": 1.14341595619232e-05, + "loss": 0.4728, + "step": 6164 + }, + { + "epoch": 1.3751951817979031, + "grad_norm": 0.1758640706539154, + "learning_rate": 1.143183042159238e-05, + "loss": 0.4205, + "step": 6165 + }, + { + "epoch": 1.3754182467097926, + "grad_norm": 0.17280808091163635, + "learning_rate": 1.1429501201957785e-05, + "loss": 0.4964, + "step": 6166 + }, + { + "epoch": 1.375641311621682, + "grad_norm": 0.17379097640514374, + "learning_rate": 1.1427171903148425e-05, + "loss": 0.4624, + "step": 6167 + }, + { + "epoch": 1.3758643765335712, + "grad_norm": 0.17071013152599335, + "learning_rate": 1.14248425252933e-05, + "loss": 0.4503, + "step": 6168 + }, + { + "epoch": 1.3760874414454607, + "grad_norm": 0.18627150356769562, + "learning_rate": 1.1422513068521442e-05, + "loss": 0.4476, + "step": 6169 + }, + { + "epoch": 1.37631050635735, + "grad_norm": 0.1711094081401825, + "learning_rate": 1.1420183532961855e-05, + "loss": 0.445, + "step": 6170 + }, + { + "epoch": 1.3765335712692393, + "grad_norm": 0.16976723074913025, + "learning_rate": 1.1417853918743576e-05, + "loss": 0.4638, + "step": 6171 + }, + { + "epoch": 1.3767566361811288, + "grad_norm": 0.15716324746608734, + "learning_rate": 1.1415524225995624e-05, + "loss": 0.4555, + "step": 6172 + }, + { + "epoch": 1.376979701093018, + "grad_norm": 0.17595480382442474, + "learning_rate": 1.1413194454847041e-05, + "loss": 0.4777, + "step": 6173 + }, + { + "epoch": 1.3772027660049075, + "grad_norm": 0.1665569245815277, + "learning_rate": 1.1410864605426856e-05, + "loss": 0.4478, + "step": 6174 + }, + { + "epoch": 1.3774258309167968, + "grad_norm": 0.16966520249843597, + "learning_rate": 1.1408534677864119e-05, + "loss": 0.4626, + "step": 6175 + }, + { + "epoch": 1.377648895828686, + "grad_norm": 0.17505811154842377, + "learning_rate": 1.1406204672287867e-05, + "loss": 0.4595, + "step": 6176 + }, + { + "epoch": 1.3778719607405754, + "grad_norm": 0.1676858365535736, + "learning_rate": 1.1403874588827156e-05, + "loss": 0.4451, + "step": 6177 + }, + { + "epoch": 1.3780950256524649, + "grad_norm": 0.1657474786043167, + "learning_rate": 1.1401544427611037e-05, + "loss": 0.4268, + "step": 6178 + }, + { + "epoch": 1.3783180905643542, + "grad_norm": 0.16758085787296295, + "learning_rate": 1.1399214188768574e-05, + "loss": 0.4783, + "step": 6179 + }, + { + "epoch": 1.3785411554762437, + "grad_norm": 0.17600210011005402, + "learning_rate": 1.139688387242883e-05, + "loss": 0.4546, + "step": 6180 + }, + { + "epoch": 1.378764220388133, + "grad_norm": 0.17735408246517181, + "learning_rate": 1.1394553478720868e-05, + "loss": 0.4663, + "step": 6181 + }, + { + "epoch": 1.3789872853000222, + "grad_norm": 0.17437157034873962, + "learning_rate": 1.1392223007773764e-05, + "loss": 0.4914, + "step": 6182 + }, + { + "epoch": 1.3792103502119117, + "grad_norm": 0.16675308346748352, + "learning_rate": 1.1389892459716589e-05, + "loss": 0.463, + "step": 6183 + }, + { + "epoch": 1.379433415123801, + "grad_norm": 0.17255185544490814, + "learning_rate": 1.138756183467843e-05, + "loss": 0.4716, + "step": 6184 + }, + { + "epoch": 1.3796564800356905, + "grad_norm": 0.16526752710342407, + "learning_rate": 1.1385231132788368e-05, + "loss": 0.427, + "step": 6185 + }, + { + "epoch": 1.3798795449475798, + "grad_norm": 0.16648603975772858, + "learning_rate": 1.1382900354175494e-05, + "loss": 0.4393, + "step": 6186 + }, + { + "epoch": 1.380102609859469, + "grad_norm": 0.16615265607833862, + "learning_rate": 1.1380569498968896e-05, + "loss": 0.4493, + "step": 6187 + }, + { + "epoch": 1.3803256747713584, + "grad_norm": 0.19119910895824432, + "learning_rate": 1.1378238567297677e-05, + "loss": 0.4583, + "step": 6188 + }, + { + "epoch": 1.3805487396832479, + "grad_norm": 0.17033104598522186, + "learning_rate": 1.1375907559290935e-05, + "loss": 0.4919, + "step": 6189 + }, + { + "epoch": 1.3807718045951372, + "grad_norm": 0.16820746660232544, + "learning_rate": 1.1373576475077778e-05, + "loss": 0.4316, + "step": 6190 + }, + { + "epoch": 1.3809948695070267, + "grad_norm": 0.17084772884845734, + "learning_rate": 1.1371245314787318e-05, + "loss": 0.4225, + "step": 6191 + }, + { + "epoch": 1.381217934418916, + "grad_norm": 0.18454383313655853, + "learning_rate": 1.1368914078548666e-05, + "loss": 0.4675, + "step": 6192 + }, + { + "epoch": 1.3814409993308052, + "grad_norm": 0.3558218777179718, + "learning_rate": 1.136658276649094e-05, + "loss": 0.4507, + "step": 6193 + }, + { + "epoch": 1.3816640642426945, + "grad_norm": 0.18921050429344177, + "learning_rate": 1.1364251378743266e-05, + "loss": 0.4656, + "step": 6194 + }, + { + "epoch": 1.381887129154584, + "grad_norm": 0.1709054410457611, + "learning_rate": 1.136191991543477e-05, + "loss": 0.468, + "step": 6195 + }, + { + "epoch": 1.3821101940664733, + "grad_norm": 0.17301930487155914, + "learning_rate": 1.1359588376694577e-05, + "loss": 0.4486, + "step": 6196 + }, + { + "epoch": 1.3823332589783628, + "grad_norm": 0.17260199785232544, + "learning_rate": 1.1357256762651834e-05, + "loss": 0.4671, + "step": 6197 + }, + { + "epoch": 1.382556323890252, + "grad_norm": 0.16965550184249878, + "learning_rate": 1.135492507343567e-05, + "loss": 0.4657, + "step": 6198 + }, + { + "epoch": 1.3827793888021414, + "grad_norm": 0.17701901495456696, + "learning_rate": 1.1352593309175233e-05, + "loss": 0.4698, + "step": 6199 + }, + { + "epoch": 1.3830024537140309, + "grad_norm": 0.16689814627170563, + "learning_rate": 1.135026146999967e-05, + "loss": 0.4606, + "step": 6200 + }, + { + "epoch": 1.3832255186259201, + "grad_norm": 0.1691475808620453, + "learning_rate": 1.1347929556038135e-05, + "loss": 0.4287, + "step": 6201 + }, + { + "epoch": 1.3834485835378096, + "grad_norm": 0.17036226391792297, + "learning_rate": 1.1345597567419782e-05, + "loss": 0.44, + "step": 6202 + }, + { + "epoch": 1.383671648449699, + "grad_norm": 0.1688498854637146, + "learning_rate": 1.1343265504273773e-05, + "loss": 0.4621, + "step": 6203 + }, + { + "epoch": 1.3838947133615882, + "grad_norm": 0.17137567698955536, + "learning_rate": 1.1340933366729268e-05, + "loss": 0.4415, + "step": 6204 + }, + { + "epoch": 1.3841177782734775, + "grad_norm": 0.17283524572849274, + "learning_rate": 1.1338601154915441e-05, + "loss": 0.4303, + "step": 6205 + }, + { + "epoch": 1.384340843185367, + "grad_norm": 0.1761488914489746, + "learning_rate": 1.1336268868961459e-05, + "loss": 0.4892, + "step": 6206 + }, + { + "epoch": 1.3845639080972563, + "grad_norm": 0.1702503263950348, + "learning_rate": 1.1333936508996503e-05, + "loss": 0.4523, + "step": 6207 + }, + { + "epoch": 1.3847869730091458, + "grad_norm": 0.17347446084022522, + "learning_rate": 1.1331604075149753e-05, + "loss": 0.4681, + "step": 6208 + }, + { + "epoch": 1.385010037921035, + "grad_norm": 0.18203677237033844, + "learning_rate": 1.1329271567550394e-05, + "loss": 0.4697, + "step": 6209 + }, + { + "epoch": 1.3852331028329243, + "grad_norm": 0.168579563498497, + "learning_rate": 1.132693898632761e-05, + "loss": 0.4579, + "step": 6210 + }, + { + "epoch": 1.3854561677448136, + "grad_norm": 0.17098036408424377, + "learning_rate": 1.1324606331610602e-05, + "loss": 0.4451, + "step": 6211 + }, + { + "epoch": 1.3856792326567031, + "grad_norm": 0.1700953096151352, + "learning_rate": 1.1322273603528562e-05, + "loss": 0.4446, + "step": 6212 + }, + { + "epoch": 1.3859022975685924, + "grad_norm": 0.18041159212589264, + "learning_rate": 1.1319940802210692e-05, + "loss": 0.4943, + "step": 6213 + }, + { + "epoch": 1.386125362480482, + "grad_norm": 0.17734865844249725, + "learning_rate": 1.1317607927786201e-05, + "loss": 0.4507, + "step": 6214 + }, + { + "epoch": 1.3863484273923712, + "grad_norm": 0.18012134730815887, + "learning_rate": 1.131527498038429e-05, + "loss": 0.4572, + "step": 6215 + }, + { + "epoch": 1.3865714923042605, + "grad_norm": 0.17000438272953033, + "learning_rate": 1.131294196013418e-05, + "loss": 0.4896, + "step": 6216 + }, + { + "epoch": 1.38679455721615, + "grad_norm": 0.16048485040664673, + "learning_rate": 1.1310608867165082e-05, + "loss": 0.4398, + "step": 6217 + }, + { + "epoch": 1.3870176221280393, + "grad_norm": 0.17833036184310913, + "learning_rate": 1.1308275701606226e-05, + "loss": 0.4706, + "step": 6218 + }, + { + "epoch": 1.3872406870399288, + "grad_norm": 0.17137986421585083, + "learning_rate": 1.130594246358683e-05, + "loss": 0.4862, + "step": 6219 + }, + { + "epoch": 1.387463751951818, + "grad_norm": 0.1729787439107895, + "learning_rate": 1.1303609153236127e-05, + "loss": 0.4607, + "step": 6220 + }, + { + "epoch": 1.3876868168637073, + "grad_norm": 0.16867367923259735, + "learning_rate": 1.1301275770683344e-05, + "loss": 0.4671, + "step": 6221 + }, + { + "epoch": 1.3879098817755966, + "grad_norm": 0.17126032710075378, + "learning_rate": 1.1298942316057731e-05, + "loss": 0.4162, + "step": 6222 + }, + { + "epoch": 1.388132946687486, + "grad_norm": 0.16363003849983215, + "learning_rate": 1.1296608789488515e-05, + "loss": 0.4292, + "step": 6223 + }, + { + "epoch": 1.3883560115993754, + "grad_norm": 0.16574527323246002, + "learning_rate": 1.1294275191104952e-05, + "loss": 0.4622, + "step": 6224 + }, + { + "epoch": 1.3885790765112649, + "grad_norm": 0.16721156239509583, + "learning_rate": 1.1291941521036286e-05, + "loss": 0.4569, + "step": 6225 + }, + { + "epoch": 1.3888021414231542, + "grad_norm": 0.16934387385845184, + "learning_rate": 1.1289607779411775e-05, + "loss": 0.4488, + "step": 6226 + }, + { + "epoch": 1.3890252063350434, + "grad_norm": 0.16877540946006775, + "learning_rate": 1.1287273966360673e-05, + "loss": 0.456, + "step": 6227 + }, + { + "epoch": 1.3892482712469327, + "grad_norm": 0.17397183179855347, + "learning_rate": 1.1284940082012238e-05, + "loss": 0.4727, + "step": 6228 + }, + { + "epoch": 1.3894713361588222, + "grad_norm": 0.1647840440273285, + "learning_rate": 1.128260612649574e-05, + "loss": 0.4497, + "step": 6229 + }, + { + "epoch": 1.3896944010707115, + "grad_norm": 0.1644977629184723, + "learning_rate": 1.1280272099940446e-05, + "loss": 0.4395, + "step": 6230 + }, + { + "epoch": 1.389917465982601, + "grad_norm": 0.17503903806209564, + "learning_rate": 1.1277938002475633e-05, + "loss": 0.4515, + "step": 6231 + }, + { + "epoch": 1.3901405308944903, + "grad_norm": 0.16988563537597656, + "learning_rate": 1.127560383423057e-05, + "loss": 0.4365, + "step": 6232 + }, + { + "epoch": 1.3903635958063796, + "grad_norm": 0.1652899980545044, + "learning_rate": 1.1273269595334547e-05, + "loss": 0.4484, + "step": 6233 + }, + { + "epoch": 1.390586660718269, + "grad_norm": 0.1692708283662796, + "learning_rate": 1.1270935285916842e-05, + "loss": 0.4472, + "step": 6234 + }, + { + "epoch": 1.3908097256301584, + "grad_norm": 0.16943638026714325, + "learning_rate": 1.1268600906106749e-05, + "loss": 0.4454, + "step": 6235 + }, + { + "epoch": 1.3910327905420479, + "grad_norm": 0.1718982458114624, + "learning_rate": 1.1266266456033555e-05, + "loss": 0.4305, + "step": 6236 + }, + { + "epoch": 1.3912558554539372, + "grad_norm": 0.17046236991882324, + "learning_rate": 1.1263931935826561e-05, + "loss": 0.4383, + "step": 6237 + }, + { + "epoch": 1.3914789203658264, + "grad_norm": 0.17139141261577606, + "learning_rate": 1.1261597345615064e-05, + "loss": 0.4437, + "step": 6238 + }, + { + "epoch": 1.3917019852777157, + "grad_norm": 0.1709447205066681, + "learning_rate": 1.1259262685528376e-05, + "loss": 0.4938, + "step": 6239 + }, + { + "epoch": 1.3919250501896052, + "grad_norm": 0.16337740421295166, + "learning_rate": 1.1256927955695793e-05, + "loss": 0.4651, + "step": 6240 + }, + { + "epoch": 1.3921481151014945, + "grad_norm": 0.17332367599010468, + "learning_rate": 1.1254593156246638e-05, + "loss": 0.4803, + "step": 6241 + }, + { + "epoch": 1.392371180013384, + "grad_norm": 0.17104537785053253, + "learning_rate": 1.1252258287310219e-05, + "loss": 0.4717, + "step": 6242 + }, + { + "epoch": 1.3925942449252733, + "grad_norm": 0.16978146135807037, + "learning_rate": 1.1249923349015859e-05, + "loss": 0.4999, + "step": 6243 + }, + { + "epoch": 1.3928173098371626, + "grad_norm": 0.17217592895030975, + "learning_rate": 1.1247588341492884e-05, + "loss": 0.4738, + "step": 6244 + }, + { + "epoch": 1.3930403747490518, + "grad_norm": 0.17651988565921783, + "learning_rate": 1.1245253264870616e-05, + "loss": 0.441, + "step": 6245 + }, + { + "epoch": 1.3932634396609413, + "grad_norm": 0.16616612672805786, + "learning_rate": 1.1242918119278395e-05, + "loss": 0.4334, + "step": 6246 + }, + { + "epoch": 1.3934865045728306, + "grad_norm": 0.17374052107334137, + "learning_rate": 1.1240582904845542e-05, + "loss": 0.4464, + "step": 6247 + }, + { + "epoch": 1.3937095694847201, + "grad_norm": 0.17393873631954193, + "learning_rate": 1.1238247621701413e-05, + "loss": 0.4438, + "step": 6248 + }, + { + "epoch": 1.3939326343966094, + "grad_norm": 0.17229264974594116, + "learning_rate": 1.1235912269975335e-05, + "loss": 0.4775, + "step": 6249 + }, + { + "epoch": 1.3941556993084987, + "grad_norm": 0.17282813787460327, + "learning_rate": 1.1233576849796666e-05, + "loss": 0.454, + "step": 6250 + }, + { + "epoch": 1.3943787642203882, + "grad_norm": 0.18351981043815613, + "learning_rate": 1.1231241361294747e-05, + "loss": 0.4702, + "step": 6251 + }, + { + "epoch": 1.3946018291322775, + "grad_norm": 0.17423385381698608, + "learning_rate": 1.122890580459894e-05, + "loss": 0.4506, + "step": 6252 + }, + { + "epoch": 1.394824894044167, + "grad_norm": 0.17099998891353607, + "learning_rate": 1.1226570179838596e-05, + "loss": 0.4509, + "step": 6253 + }, + { + "epoch": 1.3950479589560563, + "grad_norm": 0.16235356032848358, + "learning_rate": 1.1224234487143085e-05, + "loss": 0.4243, + "step": 6254 + }, + { + "epoch": 1.3952710238679455, + "grad_norm": 0.17439930140972137, + "learning_rate": 1.1221898726641762e-05, + "loss": 0.4701, + "step": 6255 + }, + { + "epoch": 1.3954940887798348, + "grad_norm": 0.16977347433567047, + "learning_rate": 1.1219562898464006e-05, + "loss": 0.4516, + "step": 6256 + }, + { + "epoch": 1.3957171536917243, + "grad_norm": 0.16299669444561005, + "learning_rate": 1.1217227002739181e-05, + "loss": 0.4579, + "step": 6257 + }, + { + "epoch": 1.3959402186036136, + "grad_norm": 0.17073673009872437, + "learning_rate": 1.1214891039596673e-05, + "loss": 0.4989, + "step": 6258 + }, + { + "epoch": 1.3961632835155031, + "grad_norm": 0.16990354657173157, + "learning_rate": 1.1212555009165852e-05, + "loss": 0.436, + "step": 6259 + }, + { + "epoch": 1.3963863484273924, + "grad_norm": 0.16700886189937592, + "learning_rate": 1.1210218911576112e-05, + "loss": 0.4324, + "step": 6260 + }, + { + "epoch": 1.3966094133392817, + "grad_norm": 0.16864052414894104, + "learning_rate": 1.1207882746956834e-05, + "loss": 0.4834, + "step": 6261 + }, + { + "epoch": 1.396832478251171, + "grad_norm": 0.16238147020339966, + "learning_rate": 1.1205546515437413e-05, + "loss": 0.4447, + "step": 6262 + }, + { + "epoch": 1.3970555431630605, + "grad_norm": 0.1640692502260208, + "learning_rate": 1.120321021714724e-05, + "loss": 0.4539, + "step": 6263 + }, + { + "epoch": 1.3972786080749497, + "grad_norm": 0.16666129231452942, + "learning_rate": 1.1200873852215717e-05, + "loss": 0.4434, + "step": 6264 + }, + { + "epoch": 1.3975016729868392, + "grad_norm": 0.16352206468582153, + "learning_rate": 1.1198537420772249e-05, + "loss": 0.4345, + "step": 6265 + }, + { + "epoch": 1.3977247378987285, + "grad_norm": 0.16906216740608215, + "learning_rate": 1.1196200922946237e-05, + "loss": 0.4753, + "step": 6266 + }, + { + "epoch": 1.3979478028106178, + "grad_norm": 0.16937586665153503, + "learning_rate": 1.1193864358867097e-05, + "loss": 0.4545, + "step": 6267 + }, + { + "epoch": 1.3981708677225073, + "grad_norm": 0.17257355153560638, + "learning_rate": 1.1191527728664235e-05, + "loss": 0.4722, + "step": 6268 + }, + { + "epoch": 1.3983939326343966, + "grad_norm": 0.1705016791820526, + "learning_rate": 1.1189191032467074e-05, + "loss": 0.4647, + "step": 6269 + }, + { + "epoch": 1.398616997546286, + "grad_norm": 0.23135609924793243, + "learning_rate": 1.1186854270405035e-05, + "loss": 0.4725, + "step": 6270 + }, + { + "epoch": 1.3988400624581754, + "grad_norm": 0.17285971343517303, + "learning_rate": 1.1184517442607538e-05, + "loss": 0.4284, + "step": 6271 + }, + { + "epoch": 1.3990631273700647, + "grad_norm": 0.17679868638515472, + "learning_rate": 1.1182180549204013e-05, + "loss": 0.4412, + "step": 6272 + }, + { + "epoch": 1.399286192281954, + "grad_norm": 0.16253626346588135, + "learning_rate": 1.1179843590323897e-05, + "loss": 0.4321, + "step": 6273 + }, + { + "epoch": 1.3995092571938434, + "grad_norm": 0.1678762137889862, + "learning_rate": 1.1177506566096619e-05, + "loss": 0.483, + "step": 6274 + }, + { + "epoch": 1.3997323221057327, + "grad_norm": 0.16577322781085968, + "learning_rate": 1.1175169476651622e-05, + "loss": 0.4419, + "step": 6275 + }, + { + "epoch": 1.3999553870176222, + "grad_norm": 0.16397406160831451, + "learning_rate": 1.1172832322118346e-05, + "loss": 0.4529, + "step": 6276 + }, + { + "epoch": 1.4001784519295115, + "grad_norm": 0.17259271442890167, + "learning_rate": 1.1170495102626238e-05, + "loss": 0.4763, + "step": 6277 + }, + { + "epoch": 1.4004015168414008, + "grad_norm": 0.16409693658351898, + "learning_rate": 1.116815781830475e-05, + "loss": 0.4737, + "step": 6278 + }, + { + "epoch": 1.40062458175329, + "grad_norm": 0.17240269482135773, + "learning_rate": 1.1165820469283333e-05, + "loss": 0.4708, + "step": 6279 + }, + { + "epoch": 1.4008476466651796, + "grad_norm": 0.16498403251171112, + "learning_rate": 1.1163483055691447e-05, + "loss": 0.4387, + "step": 6280 + }, + { + "epoch": 1.4010707115770689, + "grad_norm": 0.16875067353248596, + "learning_rate": 1.116114557765855e-05, + "loss": 0.4768, + "step": 6281 + }, + { + "epoch": 1.4012937764889584, + "grad_norm": 0.16360358893871307, + "learning_rate": 1.1158808035314105e-05, + "loss": 0.4598, + "step": 6282 + }, + { + "epoch": 1.4015168414008476, + "grad_norm": 0.1615404486656189, + "learning_rate": 1.1156470428787582e-05, + "loss": 0.4529, + "step": 6283 + }, + { + "epoch": 1.401739906312737, + "grad_norm": 0.16733388602733612, + "learning_rate": 1.1154132758208456e-05, + "loss": 0.4564, + "step": 6284 + }, + { + "epoch": 1.4019629712246264, + "grad_norm": 0.176735982298851, + "learning_rate": 1.1151795023706194e-05, + "loss": 0.4371, + "step": 6285 + }, + { + "epoch": 1.4021860361365157, + "grad_norm": 0.17406433820724487, + "learning_rate": 1.1149457225410281e-05, + "loss": 0.4602, + "step": 6286 + }, + { + "epoch": 1.4024091010484052, + "grad_norm": 0.18209131062030792, + "learning_rate": 1.1147119363450197e-05, + "loss": 0.4559, + "step": 6287 + }, + { + "epoch": 1.4026321659602945, + "grad_norm": 0.17238366603851318, + "learning_rate": 1.1144781437955426e-05, + "loss": 0.4725, + "step": 6288 + }, + { + "epoch": 1.4028552308721838, + "grad_norm": 0.16568414866924286, + "learning_rate": 1.1142443449055455e-05, + "loss": 0.4291, + "step": 6289 + }, + { + "epoch": 1.403078295784073, + "grad_norm": 0.17432232201099396, + "learning_rate": 1.1140105396879783e-05, + "loss": 0.4827, + "step": 6290 + }, + { + "epoch": 1.4033013606959626, + "grad_norm": 0.17041410505771637, + "learning_rate": 1.11377672815579e-05, + "loss": 0.4523, + "step": 6291 + }, + { + "epoch": 1.4035244256078518, + "grad_norm": 0.1681598722934723, + "learning_rate": 1.113542910321931e-05, + "loss": 0.4588, + "step": 6292 + }, + { + "epoch": 1.4037474905197413, + "grad_norm": 0.16634954512119293, + "learning_rate": 1.1133090861993514e-05, + "loss": 0.4306, + "step": 6293 + }, + { + "epoch": 1.4039705554316306, + "grad_norm": 0.1695157140493393, + "learning_rate": 1.113075255801002e-05, + "loss": 0.4532, + "step": 6294 + }, + { + "epoch": 1.40419362034352, + "grad_norm": 0.16797398030757904, + "learning_rate": 1.1128414191398333e-05, + "loss": 0.4452, + "step": 6295 + }, + { + "epoch": 1.4044166852554092, + "grad_norm": 0.1665017306804657, + "learning_rate": 1.1126075762287972e-05, + "loss": 0.4523, + "step": 6296 + }, + { + "epoch": 1.4046397501672987, + "grad_norm": 0.17001719772815704, + "learning_rate": 1.1123737270808452e-05, + "loss": 0.4472, + "step": 6297 + }, + { + "epoch": 1.404862815079188, + "grad_norm": 0.16899511218070984, + "learning_rate": 1.1121398717089294e-05, + "loss": 0.4753, + "step": 6298 + }, + { + "epoch": 1.4050858799910775, + "grad_norm": 0.16389401257038116, + "learning_rate": 1.1119060101260021e-05, + "loss": 0.4494, + "step": 6299 + }, + { + "epoch": 1.4053089449029668, + "grad_norm": 0.16568145155906677, + "learning_rate": 1.1116721423450158e-05, + "loss": 0.4509, + "step": 6300 + }, + { + "epoch": 1.405532009814856, + "grad_norm": 0.16839419305324554, + "learning_rate": 1.1114382683789241e-05, + "loss": 0.4425, + "step": 6301 + }, + { + "epoch": 1.4057550747267455, + "grad_norm": 0.17624156177043915, + "learning_rate": 1.1112043882406802e-05, + "loss": 0.4668, + "step": 6302 + }, + { + "epoch": 1.4059781396386348, + "grad_norm": 0.2432551085948944, + "learning_rate": 1.1109705019432378e-05, + "loss": 0.4504, + "step": 6303 + }, + { + "epoch": 1.4062012045505243, + "grad_norm": 0.17527322471141815, + "learning_rate": 1.1107366094995506e-05, + "loss": 0.4734, + "step": 6304 + }, + { + "epoch": 1.4064242694624136, + "grad_norm": 0.16600951552391052, + "learning_rate": 1.1105027109225737e-05, + "loss": 0.4789, + "step": 6305 + }, + { + "epoch": 1.406647334374303, + "grad_norm": 0.20400017499923706, + "learning_rate": 1.1102688062252614e-05, + "loss": 0.4688, + "step": 6306 + }, + { + "epoch": 1.4068703992861922, + "grad_norm": 0.17171353101730347, + "learning_rate": 1.110034895420569e-05, + "loss": 0.4374, + "step": 6307 + }, + { + "epoch": 1.4070934641980817, + "grad_norm": 0.1854102462530136, + "learning_rate": 1.1098009785214523e-05, + "loss": 0.4255, + "step": 6308 + }, + { + "epoch": 1.407316529109971, + "grad_norm": 0.16316324472427368, + "learning_rate": 1.1095670555408662e-05, + "loss": 0.4676, + "step": 6309 + }, + { + "epoch": 1.4075395940218605, + "grad_norm": 0.18751972913742065, + "learning_rate": 1.1093331264917676e-05, + "loss": 0.4734, + "step": 6310 + }, + { + "epoch": 1.4077626589337497, + "grad_norm": 0.22429490089416504, + "learning_rate": 1.1090991913871128e-05, + "loss": 0.444, + "step": 6311 + }, + { + "epoch": 1.407985723845639, + "grad_norm": 0.16278603672981262, + "learning_rate": 1.1088652502398585e-05, + "loss": 0.4528, + "step": 6312 + }, + { + "epoch": 1.4082087887575283, + "grad_norm": 0.1643955558538437, + "learning_rate": 1.108631303062962e-05, + "loss": 0.4646, + "step": 6313 + }, + { + "epoch": 1.4084318536694178, + "grad_norm": 0.17145782709121704, + "learning_rate": 1.1083973498693802e-05, + "loss": 0.4592, + "step": 6314 + }, + { + "epoch": 1.408654918581307, + "grad_norm": 0.18193364143371582, + "learning_rate": 1.1081633906720714e-05, + "loss": 0.4932, + "step": 6315 + }, + { + "epoch": 1.4088779834931966, + "grad_norm": 0.17122429609298706, + "learning_rate": 1.1079294254839941e-05, + "loss": 0.4708, + "step": 6316 + }, + { + "epoch": 1.4091010484050859, + "grad_norm": 0.1604917347431183, + "learning_rate": 1.1076954543181058e-05, + "loss": 0.4659, + "step": 6317 + }, + { + "epoch": 1.4093241133169752, + "grad_norm": 0.1739075481891632, + "learning_rate": 1.1074614771873661e-05, + "loss": 0.4714, + "step": 6318 + }, + { + "epoch": 1.4095471782288647, + "grad_norm": 0.16061022877693176, + "learning_rate": 1.1072274941047336e-05, + "loss": 0.4414, + "step": 6319 + }, + { + "epoch": 1.409770243140754, + "grad_norm": 0.1809171438217163, + "learning_rate": 1.1069935050831683e-05, + "loss": 0.4467, + "step": 6320 + }, + { + "epoch": 1.4099933080526434, + "grad_norm": 0.1696557253599167, + "learning_rate": 1.1067595101356295e-05, + "loss": 0.462, + "step": 6321 + }, + { + "epoch": 1.4102163729645327, + "grad_norm": 0.1734270453453064, + "learning_rate": 1.1065255092750774e-05, + "loss": 0.4859, + "step": 6322 + }, + { + "epoch": 1.410439437876422, + "grad_norm": 0.1748056411743164, + "learning_rate": 1.1062915025144727e-05, + "loss": 0.4709, + "step": 6323 + }, + { + "epoch": 1.4106625027883113, + "grad_norm": 0.17737674713134766, + "learning_rate": 1.106057489866776e-05, + "loss": 0.4875, + "step": 6324 + }, + { + "epoch": 1.4108855677002008, + "grad_norm": 0.16855676472187042, + "learning_rate": 1.105823471344948e-05, + "loss": 0.4713, + "step": 6325 + }, + { + "epoch": 1.41110863261209, + "grad_norm": 0.17489537596702576, + "learning_rate": 1.105589446961951e-05, + "loss": 0.4678, + "step": 6326 + }, + { + "epoch": 1.4113316975239796, + "grad_norm": 0.19364696741104126, + "learning_rate": 1.1053554167307458e-05, + "loss": 0.4732, + "step": 6327 + }, + { + "epoch": 1.4115547624358689, + "grad_norm": 0.17721161246299744, + "learning_rate": 1.1051213806642951e-05, + "loss": 0.4753, + "step": 6328 + }, + { + "epoch": 1.4117778273477581, + "grad_norm": 0.17709079384803772, + "learning_rate": 1.1048873387755615e-05, + "loss": 0.4453, + "step": 6329 + }, + { + "epoch": 1.4120008922596474, + "grad_norm": 0.1669922173023224, + "learning_rate": 1.1046532910775068e-05, + "loss": 0.4639, + "step": 6330 + }, + { + "epoch": 1.412223957171537, + "grad_norm": 0.17192141711711884, + "learning_rate": 1.1044192375830946e-05, + "loss": 0.4559, + "step": 6331 + }, + { + "epoch": 1.4124470220834262, + "grad_norm": 0.16646385192871094, + "learning_rate": 1.1041851783052882e-05, + "loss": 0.4393, + "step": 6332 + }, + { + "epoch": 1.4126700869953157, + "grad_norm": 0.1777653992176056, + "learning_rate": 1.1039511132570516e-05, + "loss": 0.4681, + "step": 6333 + }, + { + "epoch": 1.412893151907205, + "grad_norm": 0.17389416694641113, + "learning_rate": 1.1037170424513482e-05, + "loss": 0.4644, + "step": 6334 + }, + { + "epoch": 1.4131162168190943, + "grad_norm": 0.17686106264591217, + "learning_rate": 1.1034829659011426e-05, + "loss": 0.4946, + "step": 6335 + }, + { + "epoch": 1.4133392817309838, + "grad_norm": 0.17192257940769196, + "learning_rate": 1.1032488836193994e-05, + "loss": 0.4583, + "step": 6336 + }, + { + "epoch": 1.413562346642873, + "grad_norm": 0.17798608541488647, + "learning_rate": 1.1030147956190835e-05, + "loss": 0.453, + "step": 6337 + }, + { + "epoch": 1.4137854115547626, + "grad_norm": 0.18982993066310883, + "learning_rate": 1.1027807019131605e-05, + "loss": 0.448, + "step": 6338 + }, + { + "epoch": 1.4140084764666518, + "grad_norm": 0.17006950080394745, + "learning_rate": 1.1025466025145955e-05, + "loss": 0.4625, + "step": 6339 + }, + { + "epoch": 1.4142315413785411, + "grad_norm": 0.16911807656288147, + "learning_rate": 1.1023124974363546e-05, + "loss": 0.4514, + "step": 6340 + }, + { + "epoch": 1.4144546062904304, + "grad_norm": 0.16757598519325256, + "learning_rate": 1.1020783866914042e-05, + "loss": 0.4415, + "step": 6341 + }, + { + "epoch": 1.41467767120232, + "grad_norm": 0.18296495079994202, + "learning_rate": 1.1018442702927104e-05, + "loss": 0.4786, + "step": 6342 + }, + { + "epoch": 1.4149007361142092, + "grad_norm": 0.16517338156700134, + "learning_rate": 1.1016101482532404e-05, + "loss": 0.432, + "step": 6343 + }, + { + "epoch": 1.4151238010260987, + "grad_norm": 0.16575907170772552, + "learning_rate": 1.1013760205859611e-05, + "loss": 0.4495, + "step": 6344 + }, + { + "epoch": 1.415346865937988, + "grad_norm": 0.17554599046707153, + "learning_rate": 1.1011418873038404e-05, + "loss": 0.4354, + "step": 6345 + }, + { + "epoch": 1.4155699308498773, + "grad_norm": 0.17527008056640625, + "learning_rate": 1.1009077484198456e-05, + "loss": 0.4729, + "step": 6346 + }, + { + "epoch": 1.4157929957617665, + "grad_norm": 0.1661120504140854, + "learning_rate": 1.100673603946945e-05, + "loss": 0.456, + "step": 6347 + }, + { + "epoch": 1.416016060673656, + "grad_norm": 0.16820774972438812, + "learning_rate": 1.1004394538981069e-05, + "loss": 0.4675, + "step": 6348 + }, + { + "epoch": 1.4162391255855453, + "grad_norm": 0.16874277591705322, + "learning_rate": 1.1002052982863001e-05, + "loss": 0.4414, + "step": 6349 + }, + { + "epoch": 1.4164621904974348, + "grad_norm": 0.17940542101860046, + "learning_rate": 1.0999711371244936e-05, + "loss": 0.4615, + "step": 6350 + }, + { + "epoch": 1.416685255409324, + "grad_norm": 0.1683221310377121, + "learning_rate": 1.0997369704256566e-05, + "loss": 0.4633, + "step": 6351 + }, + { + "epoch": 1.4169083203212134, + "grad_norm": 0.17308497428894043, + "learning_rate": 1.0995027982027588e-05, + "loss": 0.4399, + "step": 6352 + }, + { + "epoch": 1.417131385233103, + "grad_norm": 0.17420804500579834, + "learning_rate": 1.0992686204687701e-05, + "loss": 0.4701, + "step": 6353 + }, + { + "epoch": 1.4173544501449922, + "grad_norm": 0.1695612072944641, + "learning_rate": 1.0990344372366611e-05, + "loss": 0.4307, + "step": 6354 + }, + { + "epoch": 1.4175775150568817, + "grad_norm": 0.17334359884262085, + "learning_rate": 1.0988002485194016e-05, + "loss": 0.4673, + "step": 6355 + }, + { + "epoch": 1.417800579968771, + "grad_norm": 0.1705588549375534, + "learning_rate": 1.0985660543299632e-05, + "loss": 0.4589, + "step": 6356 + }, + { + "epoch": 1.4180236448806602, + "grad_norm": 0.1716470867395401, + "learning_rate": 1.0983318546813164e-05, + "loss": 0.4645, + "step": 6357 + }, + { + "epoch": 1.4182467097925495, + "grad_norm": 0.17242403328418732, + "learning_rate": 1.098097649586433e-05, + "loss": 0.4401, + "step": 6358 + }, + { + "epoch": 1.418469774704439, + "grad_norm": 0.1679268628358841, + "learning_rate": 1.0978634390582847e-05, + "loss": 0.46, + "step": 6359 + }, + { + "epoch": 1.4186928396163283, + "grad_norm": 0.17091570794582367, + "learning_rate": 1.0976292231098435e-05, + "loss": 0.4361, + "step": 6360 + }, + { + "epoch": 1.4189159045282178, + "grad_norm": 0.17040546238422394, + "learning_rate": 1.0973950017540823e-05, + "loss": 0.4295, + "step": 6361 + }, + { + "epoch": 1.419138969440107, + "grad_norm": 0.17108914256095886, + "learning_rate": 1.0971607750039727e-05, + "loss": 0.4581, + "step": 6362 + }, + { + "epoch": 1.4193620343519964, + "grad_norm": 0.16708308458328247, + "learning_rate": 1.0969265428724887e-05, + "loss": 0.455, + "step": 6363 + }, + { + "epoch": 1.4195850992638857, + "grad_norm": 0.16952864825725555, + "learning_rate": 1.0966923053726025e-05, + "loss": 0.4411, + "step": 6364 + }, + { + "epoch": 1.4198081641757752, + "grad_norm": 0.16820164024829865, + "learning_rate": 1.0964580625172887e-05, + "loss": 0.4328, + "step": 6365 + }, + { + "epoch": 1.4200312290876644, + "grad_norm": 0.16284674406051636, + "learning_rate": 1.0962238143195203e-05, + "loss": 0.4227, + "step": 6366 + }, + { + "epoch": 1.420254293999554, + "grad_norm": 0.16655023396015167, + "learning_rate": 1.0959895607922722e-05, + "loss": 0.4568, + "step": 6367 + }, + { + "epoch": 1.4204773589114432, + "grad_norm": 0.17234466969966888, + "learning_rate": 1.095755301948518e-05, + "loss": 0.4756, + "step": 6368 + }, + { + "epoch": 1.4207004238233325, + "grad_norm": 0.1787555068731308, + "learning_rate": 1.0955210378012331e-05, + "loss": 0.4822, + "step": 6369 + }, + { + "epoch": 1.420923488735222, + "grad_norm": 0.1660604625940323, + "learning_rate": 1.0952867683633922e-05, + "loss": 0.4712, + "step": 6370 + }, + { + "epoch": 1.4211465536471113, + "grad_norm": 0.19277963042259216, + "learning_rate": 1.0950524936479708e-05, + "loss": 0.4813, + "step": 6371 + }, + { + "epoch": 1.4213696185590008, + "grad_norm": 0.16839838027954102, + "learning_rate": 1.0948182136679442e-05, + "loss": 0.411, + "step": 6372 + }, + { + "epoch": 1.42159268347089, + "grad_norm": 0.18013301491737366, + "learning_rate": 1.0945839284362885e-05, + "loss": 0.4256, + "step": 6373 + }, + { + "epoch": 1.4218157483827794, + "grad_norm": 0.16745387017726898, + "learning_rate": 1.09434963796598e-05, + "loss": 0.4408, + "step": 6374 + }, + { + "epoch": 1.4220388132946686, + "grad_norm": 0.16723443567752838, + "learning_rate": 1.094115342269995e-05, + "loss": 0.4702, + "step": 6375 + }, + { + "epoch": 1.4222618782065581, + "grad_norm": 0.17772428691387177, + "learning_rate": 1.0938810413613103e-05, + "loss": 0.4401, + "step": 6376 + }, + { + "epoch": 1.4224849431184474, + "grad_norm": 0.17869889736175537, + "learning_rate": 1.0936467352529032e-05, + "loss": 0.4536, + "step": 6377 + }, + { + "epoch": 1.422708008030337, + "grad_norm": 0.18365752696990967, + "learning_rate": 1.0934124239577506e-05, + "loss": 0.4603, + "step": 6378 + }, + { + "epoch": 1.4229310729422262, + "grad_norm": 0.18287013471126556, + "learning_rate": 1.0931781074888306e-05, + "loss": 0.4501, + "step": 6379 + }, + { + "epoch": 1.4231541378541155, + "grad_norm": 0.21559575200080872, + "learning_rate": 1.0929437858591207e-05, + "loss": 0.4447, + "step": 6380 + }, + { + "epoch": 1.423377202766005, + "grad_norm": 0.189785435795784, + "learning_rate": 1.0927094590815992e-05, + "loss": 0.4608, + "step": 6381 + }, + { + "epoch": 1.4236002676778943, + "grad_norm": 0.17252789437770844, + "learning_rate": 1.092475127169245e-05, + "loss": 0.4806, + "step": 6382 + }, + { + "epoch": 1.4238233325897836, + "grad_norm": 0.16337281465530396, + "learning_rate": 1.0922407901350365e-05, + "loss": 0.4523, + "step": 6383 + }, + { + "epoch": 1.424046397501673, + "grad_norm": 0.17701326310634613, + "learning_rate": 1.0920064479919527e-05, + "loss": 0.4629, + "step": 6384 + }, + { + "epoch": 1.4242694624135623, + "grad_norm": 0.17202292382717133, + "learning_rate": 1.0917721007529731e-05, + "loss": 0.4646, + "step": 6385 + }, + { + "epoch": 1.4244925273254516, + "grad_norm": 0.18118809163570404, + "learning_rate": 1.0915377484310774e-05, + "loss": 0.4814, + "step": 6386 + }, + { + "epoch": 1.4247155922373411, + "grad_norm": 0.1883123368024826, + "learning_rate": 1.0913033910392452e-05, + "loss": 0.4767, + "step": 6387 + }, + { + "epoch": 1.4249386571492304, + "grad_norm": 0.16664288938045502, + "learning_rate": 1.0910690285904573e-05, + "loss": 0.429, + "step": 6388 + }, + { + "epoch": 1.42516172206112, + "grad_norm": 0.17694927752017975, + "learning_rate": 1.0908346610976934e-05, + "loss": 0.4506, + "step": 6389 + }, + { + "epoch": 1.4253847869730092, + "grad_norm": 0.17796604335308075, + "learning_rate": 1.0906002885739348e-05, + "loss": 0.4564, + "step": 6390 + }, + { + "epoch": 1.4256078518848985, + "grad_norm": 0.1945108026266098, + "learning_rate": 1.090365911032162e-05, + "loss": 0.4479, + "step": 6391 + }, + { + "epoch": 1.4258309167967878, + "grad_norm": 0.1725914031267166, + "learning_rate": 1.0901315284853566e-05, + "loss": 0.4606, + "step": 6392 + }, + { + "epoch": 1.4260539817086773, + "grad_norm": 0.16722865402698517, + "learning_rate": 1.0898971409465006e-05, + "loss": 0.4595, + "step": 6393 + }, + { + "epoch": 1.4262770466205665, + "grad_norm": 0.1750846952199936, + "learning_rate": 1.0896627484285752e-05, + "loss": 0.4485, + "step": 6394 + }, + { + "epoch": 1.426500111532456, + "grad_norm": 0.1740426868200302, + "learning_rate": 1.0894283509445629e-05, + "loss": 0.4627, + "step": 6395 + }, + { + "epoch": 1.4267231764443453, + "grad_norm": 0.17253148555755615, + "learning_rate": 1.0891939485074459e-05, + "loss": 0.4481, + "step": 6396 + }, + { + "epoch": 1.4269462413562346, + "grad_norm": 0.17242968082427979, + "learning_rate": 1.088959541130207e-05, + "loss": 0.4573, + "step": 6397 + }, + { + "epoch": 1.427169306268124, + "grad_norm": 0.1693577617406845, + "learning_rate": 1.0887251288258291e-05, + "loss": 0.461, + "step": 6398 + }, + { + "epoch": 1.4273923711800134, + "grad_norm": 0.17257948219776154, + "learning_rate": 1.0884907116072956e-05, + "loss": 0.4735, + "step": 6399 + }, + { + "epoch": 1.4276154360919027, + "grad_norm": 0.15882723033428192, + "learning_rate": 1.0882562894875897e-05, + "loss": 0.4495, + "step": 6400 + }, + { + "epoch": 1.4278385010037922, + "grad_norm": 0.16930413246154785, + "learning_rate": 1.0880218624796954e-05, + "loss": 0.4321, + "step": 6401 + }, + { + "epoch": 1.4280615659156815, + "grad_norm": 0.1744142770767212, + "learning_rate": 1.0877874305965968e-05, + "loss": 0.4472, + "step": 6402 + }, + { + "epoch": 1.4282846308275707, + "grad_norm": 0.1769794076681137, + "learning_rate": 1.0875529938512779e-05, + "loss": 0.475, + "step": 6403 + }, + { + "epoch": 1.4285076957394602, + "grad_norm": 0.1765398234128952, + "learning_rate": 1.0873185522567236e-05, + "loss": 0.4595, + "step": 6404 + }, + { + "epoch": 1.4287307606513495, + "grad_norm": 0.1690848469734192, + "learning_rate": 1.0870841058259185e-05, + "loss": 0.4647, + "step": 6405 + }, + { + "epoch": 1.428953825563239, + "grad_norm": 0.17777849733829498, + "learning_rate": 1.086849654571848e-05, + "loss": 0.4644, + "step": 6406 + }, + { + "epoch": 1.4291768904751283, + "grad_norm": 0.17765332758426666, + "learning_rate": 1.0866151985074973e-05, + "loss": 0.4528, + "step": 6407 + }, + { + "epoch": 1.4293999553870176, + "grad_norm": 0.1740536242723465, + "learning_rate": 1.0863807376458516e-05, + "loss": 0.4675, + "step": 6408 + }, + { + "epoch": 1.4296230202989069, + "grad_norm": 0.16334398090839386, + "learning_rate": 1.0861462719998981e-05, + "loss": 0.4409, + "step": 6409 + }, + { + "epoch": 1.4298460852107964, + "grad_norm": 0.17333824932575226, + "learning_rate": 1.0859118015826216e-05, + "loss": 0.4712, + "step": 6410 + }, + { + "epoch": 1.4300691501226857, + "grad_norm": 0.17034272849559784, + "learning_rate": 1.0856773264070092e-05, + "loss": 0.4635, + "step": 6411 + }, + { + "epoch": 1.4302922150345752, + "grad_norm": 0.16784878075122833, + "learning_rate": 1.0854428464860476e-05, + "loss": 0.4438, + "step": 6412 + }, + { + "epoch": 1.4305152799464644, + "grad_norm": 0.17363719642162323, + "learning_rate": 1.0852083618327239e-05, + "loss": 0.4789, + "step": 6413 + }, + { + "epoch": 1.4307383448583537, + "grad_norm": 0.16398392617702484, + "learning_rate": 1.084973872460025e-05, + "loss": 0.4559, + "step": 6414 + }, + { + "epoch": 1.4309614097702432, + "grad_norm": 0.17007873952388763, + "learning_rate": 1.0847393783809383e-05, + "loss": 0.468, + "step": 6415 + }, + { + "epoch": 1.4311844746821325, + "grad_norm": 0.1761980801820755, + "learning_rate": 1.084504879608452e-05, + "loss": 0.4525, + "step": 6416 + }, + { + "epoch": 1.4314075395940218, + "grad_norm": 0.17042046785354614, + "learning_rate": 1.084270376155554e-05, + "loss": 0.4498, + "step": 6417 + }, + { + "epoch": 1.4316306045059113, + "grad_norm": 0.1663273125886917, + "learning_rate": 1.0840358680352324e-05, + "loss": 0.4453, + "step": 6418 + }, + { + "epoch": 1.4318536694178006, + "grad_norm": 0.1743524819612503, + "learning_rate": 1.0838013552604758e-05, + "loss": 0.481, + "step": 6419 + }, + { + "epoch": 1.4320767343296898, + "grad_norm": 0.16921429336071014, + "learning_rate": 1.083566837844273e-05, + "loss": 0.4284, + "step": 6420 + }, + { + "epoch": 1.4322997992415794, + "grad_norm": 0.1793074607849121, + "learning_rate": 1.083332315799613e-05, + "loss": 0.4842, + "step": 6421 + }, + { + "epoch": 1.4325228641534686, + "grad_norm": 0.16839145123958588, + "learning_rate": 1.0830977891394853e-05, + "loss": 0.4509, + "step": 6422 + }, + { + "epoch": 1.4327459290653581, + "grad_norm": 0.1707042008638382, + "learning_rate": 1.0828632578768794e-05, + "loss": 0.446, + "step": 6423 + }, + { + "epoch": 1.4329689939772474, + "grad_norm": 0.16985002160072327, + "learning_rate": 1.0826287220247851e-05, + "loss": 0.4856, + "step": 6424 + }, + { + "epoch": 1.4331920588891367, + "grad_norm": 0.17573486268520355, + "learning_rate": 1.082394181596192e-05, + "loss": 0.4552, + "step": 6425 + }, + { + "epoch": 1.433415123801026, + "grad_norm": 0.20836280286312103, + "learning_rate": 1.0821596366040911e-05, + "loss": 0.4532, + "step": 6426 + }, + { + "epoch": 1.4336381887129155, + "grad_norm": 0.17805512249469757, + "learning_rate": 1.0819250870614729e-05, + "loss": 0.462, + "step": 6427 + }, + { + "epoch": 1.4338612536248048, + "grad_norm": 0.17473873496055603, + "learning_rate": 1.081690532981328e-05, + "loss": 0.444, + "step": 6428 + }, + { + "epoch": 1.4340843185366943, + "grad_norm": 0.17013612389564514, + "learning_rate": 1.081455974376647e-05, + "loss": 0.4578, + "step": 6429 + }, + { + "epoch": 1.4343073834485836, + "grad_norm": 0.16665178537368774, + "learning_rate": 1.0812214112604224e-05, + "loss": 0.432, + "step": 6430 + }, + { + "epoch": 1.4345304483604728, + "grad_norm": 0.1708722859621048, + "learning_rate": 1.080986843645645e-05, + "loss": 0.4473, + "step": 6431 + }, + { + "epoch": 1.4347535132723623, + "grad_norm": 0.17868392169475555, + "learning_rate": 1.0807522715453067e-05, + "loss": 0.4696, + "step": 6432 + }, + { + "epoch": 1.4349765781842516, + "grad_norm": 0.17720571160316467, + "learning_rate": 1.0805176949723997e-05, + "loss": 0.4708, + "step": 6433 + }, + { + "epoch": 1.435199643096141, + "grad_norm": 0.1707199364900589, + "learning_rate": 1.080283113939916e-05, + "loss": 0.4668, + "step": 6434 + }, + { + "epoch": 1.4354227080080304, + "grad_norm": 0.1737671196460724, + "learning_rate": 1.0800485284608488e-05, + "loss": 0.4539, + "step": 6435 + }, + { + "epoch": 1.4356457729199197, + "grad_norm": 0.1636429876089096, + "learning_rate": 1.0798139385481903e-05, + "loss": 0.4321, + "step": 6436 + }, + { + "epoch": 1.435868837831809, + "grad_norm": 0.1818714439868927, + "learning_rate": 1.079579344214934e-05, + "loss": 0.4698, + "step": 6437 + }, + { + "epoch": 1.4360919027436985, + "grad_norm": 0.17777089774608612, + "learning_rate": 1.0793447454740731e-05, + "loss": 0.4674, + "step": 6438 + }, + { + "epoch": 1.4363149676555877, + "grad_norm": 0.16908809542655945, + "learning_rate": 1.079110142338601e-05, + "loss": 0.4584, + "step": 6439 + }, + { + "epoch": 1.4365380325674773, + "grad_norm": 0.1798996478319168, + "learning_rate": 1.0788755348215114e-05, + "loss": 0.4565, + "step": 6440 + }, + { + "epoch": 1.4367610974793665, + "grad_norm": 0.1786443144083023, + "learning_rate": 1.0786409229357991e-05, + "loss": 0.4672, + "step": 6441 + }, + { + "epoch": 1.4369841623912558, + "grad_norm": 0.1697261929512024, + "learning_rate": 1.0784063066944572e-05, + "loss": 0.4342, + "step": 6442 + }, + { + "epoch": 1.437207227303145, + "grad_norm": 0.1702648103237152, + "learning_rate": 1.0781716861104812e-05, + "loss": 0.4487, + "step": 6443 + }, + { + "epoch": 1.4374302922150346, + "grad_norm": 0.17532885074615479, + "learning_rate": 1.0779370611968652e-05, + "loss": 0.4431, + "step": 6444 + }, + { + "epoch": 1.4376533571269239, + "grad_norm": 0.17340442538261414, + "learning_rate": 1.0777024319666048e-05, + "loss": 0.4404, + "step": 6445 + }, + { + "epoch": 1.4378764220388134, + "grad_norm": 0.1587846279144287, + "learning_rate": 1.0774677984326946e-05, + "loss": 0.4275, + "step": 6446 + }, + { + "epoch": 1.4380994869507027, + "grad_norm": 0.1758798211812973, + "learning_rate": 1.0772331606081308e-05, + "loss": 0.462, + "step": 6447 + }, + { + "epoch": 1.438322551862592, + "grad_norm": 0.16627365350723267, + "learning_rate": 1.0769985185059087e-05, + "loss": 0.4244, + "step": 6448 + }, + { + "epoch": 1.4385456167744815, + "grad_norm": 0.17225994169712067, + "learning_rate": 1.0767638721390242e-05, + "loss": 0.4572, + "step": 6449 + }, + { + "epoch": 1.4387686816863707, + "grad_norm": 0.1741783767938614, + "learning_rate": 1.0765292215204738e-05, + "loss": 0.45, + "step": 6450 + }, + { + "epoch": 1.43899174659826, + "grad_norm": 0.16373707354068756, + "learning_rate": 1.0762945666632534e-05, + "loss": 0.4508, + "step": 6451 + }, + { + "epoch": 1.4392148115101495, + "grad_norm": 0.18814869225025177, + "learning_rate": 1.0760599075803601e-05, + "loss": 0.5056, + "step": 6452 + }, + { + "epoch": 1.4394378764220388, + "grad_norm": 0.17694085836410522, + "learning_rate": 1.0758252442847907e-05, + "loss": 0.4423, + "step": 6453 + }, + { + "epoch": 1.439660941333928, + "grad_norm": 0.17024989426136017, + "learning_rate": 1.0755905767895425e-05, + "loss": 0.4288, + "step": 6454 + }, + { + "epoch": 1.4398840062458176, + "grad_norm": 0.1731920838356018, + "learning_rate": 1.0753559051076123e-05, + "loss": 0.4909, + "step": 6455 + }, + { + "epoch": 1.4401070711577069, + "grad_norm": 0.17619867622852325, + "learning_rate": 1.0751212292519983e-05, + "loss": 0.4567, + "step": 6456 + }, + { + "epoch": 1.4403301360695964, + "grad_norm": 0.512635350227356, + "learning_rate": 1.0748865492356981e-05, + "loss": 0.4643, + "step": 6457 + }, + { + "epoch": 1.4405532009814856, + "grad_norm": 0.17129187285900116, + "learning_rate": 1.0746518650717097e-05, + "loss": 0.4453, + "step": 6458 + }, + { + "epoch": 1.440776265893375, + "grad_norm": 0.16431821882724762, + "learning_rate": 1.0744171767730315e-05, + "loss": 0.4289, + "step": 6459 + }, + { + "epoch": 1.4409993308052642, + "grad_norm": 0.18094895780086517, + "learning_rate": 1.0741824843526619e-05, + "loss": 0.4711, + "step": 6460 + }, + { + "epoch": 1.4412223957171537, + "grad_norm": 0.19894392788410187, + "learning_rate": 1.0739477878235996e-05, + "loss": 0.4643, + "step": 6461 + }, + { + "epoch": 1.441445460629043, + "grad_norm": 0.17579229176044464, + "learning_rate": 1.073713087198844e-05, + "loss": 0.4529, + "step": 6462 + }, + { + "epoch": 1.4416685255409325, + "grad_norm": 0.1695978045463562, + "learning_rate": 1.0734783824913935e-05, + "loss": 0.4772, + "step": 6463 + }, + { + "epoch": 1.4418915904528218, + "grad_norm": 0.17056889832019806, + "learning_rate": 1.0732436737142482e-05, + "loss": 0.4648, + "step": 6464 + }, + { + "epoch": 1.442114655364711, + "grad_norm": 0.18249176442623138, + "learning_rate": 1.0730089608804074e-05, + "loss": 0.4555, + "step": 6465 + }, + { + "epoch": 1.4423377202766006, + "grad_norm": 0.2158237099647522, + "learning_rate": 1.0727742440028712e-05, + "loss": 0.4276, + "step": 6466 + }, + { + "epoch": 1.4425607851884898, + "grad_norm": 0.17240867018699646, + "learning_rate": 1.0725395230946396e-05, + "loss": 0.4629, + "step": 6467 + }, + { + "epoch": 1.4427838501003791, + "grad_norm": 0.16894541680812836, + "learning_rate": 1.072304798168713e-05, + "loss": 0.4522, + "step": 6468 + }, + { + "epoch": 1.4430069150122686, + "grad_norm": 0.17741340398788452, + "learning_rate": 1.0720700692380918e-05, + "loss": 0.4489, + "step": 6469 + }, + { + "epoch": 1.443229979924158, + "grad_norm": 0.16370341181755066, + "learning_rate": 1.0718353363157767e-05, + "loss": 0.4731, + "step": 6470 + }, + { + "epoch": 1.4434530448360472, + "grad_norm": 0.1746392399072647, + "learning_rate": 1.0716005994147694e-05, + "loss": 0.4832, + "step": 6471 + }, + { + "epoch": 1.4436761097479367, + "grad_norm": 0.16757871210575104, + "learning_rate": 1.0713658585480697e-05, + "loss": 0.4523, + "step": 6472 + }, + { + "epoch": 1.443899174659826, + "grad_norm": 0.18170872330665588, + "learning_rate": 1.0711311137286804e-05, + "loss": 0.4357, + "step": 6473 + }, + { + "epoch": 1.4441222395717155, + "grad_norm": 0.17265670001506805, + "learning_rate": 1.0708963649696023e-05, + "loss": 0.4685, + "step": 6474 + }, + { + "epoch": 1.4443453044836048, + "grad_norm": 0.15775421261787415, + "learning_rate": 1.0706616122838379e-05, + "loss": 0.427, + "step": 6475 + }, + { + "epoch": 1.444568369395494, + "grad_norm": 0.18243922293186188, + "learning_rate": 1.0704268556843884e-05, + "loss": 0.4614, + "step": 6476 + }, + { + "epoch": 1.4447914343073833, + "grad_norm": 0.17497234046459198, + "learning_rate": 1.0701920951842568e-05, + "loss": 0.4814, + "step": 6477 + }, + { + "epoch": 1.4450144992192728, + "grad_norm": 0.1762491762638092, + "learning_rate": 1.0699573307964457e-05, + "loss": 0.458, + "step": 6478 + }, + { + "epoch": 1.4452375641311621, + "grad_norm": 0.1725545972585678, + "learning_rate": 1.0697225625339573e-05, + "loss": 0.4371, + "step": 6479 + }, + { + "epoch": 1.4454606290430516, + "grad_norm": 0.9475487470626831, + "learning_rate": 1.0694877904097952e-05, + "loss": 0.4609, + "step": 6480 + }, + { + "epoch": 1.445683693954941, + "grad_norm": 0.21081972122192383, + "learning_rate": 1.0692530144369615e-05, + "loss": 0.4515, + "step": 6481 + }, + { + "epoch": 1.4459067588668302, + "grad_norm": 0.1702289879322052, + "learning_rate": 1.0690182346284608e-05, + "loss": 0.4586, + "step": 6482 + }, + { + "epoch": 1.4461298237787197, + "grad_norm": 0.18808738887310028, + "learning_rate": 1.0687834509972958e-05, + "loss": 0.4688, + "step": 6483 + }, + { + "epoch": 1.446352888690609, + "grad_norm": 0.17759791016578674, + "learning_rate": 1.068548663556471e-05, + "loss": 0.4544, + "step": 6484 + }, + { + "epoch": 1.4465759536024985, + "grad_norm": 0.1792079657316208, + "learning_rate": 1.0683138723189897e-05, + "loss": 0.4475, + "step": 6485 + }, + { + "epoch": 1.4467990185143877, + "grad_norm": 0.17630550265312195, + "learning_rate": 1.0680790772978566e-05, + "loss": 0.4504, + "step": 6486 + }, + { + "epoch": 1.447022083426277, + "grad_norm": 0.18616117537021637, + "learning_rate": 1.0678442785060758e-05, + "loss": 0.4455, + "step": 6487 + }, + { + "epoch": 1.4472451483381663, + "grad_norm": 0.18077711760997772, + "learning_rate": 1.0676094759566524e-05, + "loss": 0.4937, + "step": 6488 + }, + { + "epoch": 1.4474682132500558, + "grad_norm": 0.1756681501865387, + "learning_rate": 1.0673746696625906e-05, + "loss": 0.4585, + "step": 6489 + }, + { + "epoch": 1.447691278161945, + "grad_norm": 0.17320303618907928, + "learning_rate": 1.0671398596368961e-05, + "loss": 0.4564, + "step": 6490 + }, + { + "epoch": 1.4479143430738346, + "grad_norm": 0.17476066946983337, + "learning_rate": 1.0669050458925736e-05, + "loss": 0.4621, + "step": 6491 + }, + { + "epoch": 1.4481374079857239, + "grad_norm": 0.1720237284898758, + "learning_rate": 1.0666702284426289e-05, + "loss": 0.4483, + "step": 6492 + }, + { + "epoch": 1.4483604728976132, + "grad_norm": 0.1799459308385849, + "learning_rate": 1.0664354073000676e-05, + "loss": 0.4825, + "step": 6493 + }, + { + "epoch": 1.4485835378095024, + "grad_norm": 0.173868328332901, + "learning_rate": 1.0662005824778957e-05, + "loss": 0.4586, + "step": 6494 + }, + { + "epoch": 1.448806602721392, + "grad_norm": 0.17507266998291016, + "learning_rate": 1.0659657539891189e-05, + "loss": 0.453, + "step": 6495 + }, + { + "epoch": 1.4490296676332812, + "grad_norm": 0.17579588294029236, + "learning_rate": 1.0657309218467437e-05, + "loss": 0.4606, + "step": 6496 + }, + { + "epoch": 1.4492527325451707, + "grad_norm": 0.18230777978897095, + "learning_rate": 1.0654960860637766e-05, + "loss": 0.4721, + "step": 6497 + }, + { + "epoch": 1.44947579745706, + "grad_norm": 0.1772591918706894, + "learning_rate": 1.0652612466532242e-05, + "loss": 0.4642, + "step": 6498 + }, + { + "epoch": 1.4496988623689493, + "grad_norm": 0.17671100795269012, + "learning_rate": 1.0650264036280935e-05, + "loss": 0.4526, + "step": 6499 + }, + { + "epoch": 1.4499219272808388, + "grad_norm": 0.1718316227197647, + "learning_rate": 1.0647915570013916e-05, + "loss": 0.4592, + "step": 6500 + }, + { + "epoch": 1.450144992192728, + "grad_norm": 0.17270374298095703, + "learning_rate": 1.0645567067861257e-05, + "loss": 0.4722, + "step": 6501 + }, + { + "epoch": 1.4503680571046176, + "grad_norm": 0.1727593094110489, + "learning_rate": 1.0643218529953032e-05, + "loss": 0.4519, + "step": 6502 + }, + { + "epoch": 1.4505911220165069, + "grad_norm": 0.18215766549110413, + "learning_rate": 1.0640869956419321e-05, + "loss": 0.4584, + "step": 6503 + }, + { + "epoch": 1.4508141869283961, + "grad_norm": 0.17123596370220184, + "learning_rate": 1.0638521347390198e-05, + "loss": 0.4829, + "step": 6504 + }, + { + "epoch": 1.4510372518402854, + "grad_norm": 0.17143850028514862, + "learning_rate": 1.063617270299575e-05, + "loss": 0.4703, + "step": 6505 + }, + { + "epoch": 1.451260316752175, + "grad_norm": 0.17444929480552673, + "learning_rate": 1.0633824023366053e-05, + "loss": 0.4424, + "step": 6506 + }, + { + "epoch": 1.4514833816640642, + "grad_norm": 0.17457221448421478, + "learning_rate": 1.0631475308631196e-05, + "loss": 0.4759, + "step": 6507 + }, + { + "epoch": 1.4517064465759537, + "grad_norm": 0.16844333708286285, + "learning_rate": 1.0629126558921264e-05, + "loss": 0.4622, + "step": 6508 + }, + { + "epoch": 1.451929511487843, + "grad_norm": 0.17886194586753845, + "learning_rate": 1.0626777774366347e-05, + "loss": 0.4502, + "step": 6509 + }, + { + "epoch": 1.4521525763997323, + "grad_norm": 0.17846544086933136, + "learning_rate": 1.062442895509653e-05, + "loss": 0.4472, + "step": 6510 + }, + { + "epoch": 1.4523756413116216, + "grad_norm": 0.19240468740463257, + "learning_rate": 1.0622080101241914e-05, + "loss": 0.4895, + "step": 6511 + }, + { + "epoch": 1.452598706223511, + "grad_norm": 0.17172129452228546, + "learning_rate": 1.061973121293259e-05, + "loss": 0.4443, + "step": 6512 + }, + { + "epoch": 1.4528217711354003, + "grad_norm": 0.16445989906787872, + "learning_rate": 1.0617382290298649e-05, + "loss": 0.4089, + "step": 6513 + }, + { + "epoch": 1.4530448360472898, + "grad_norm": 0.1770135909318924, + "learning_rate": 1.0615033333470194e-05, + "loss": 0.4726, + "step": 6514 + }, + { + "epoch": 1.4532679009591791, + "grad_norm": 0.16954918205738068, + "learning_rate": 1.0612684342577326e-05, + "loss": 0.4364, + "step": 6515 + }, + { + "epoch": 1.4534909658710684, + "grad_norm": 0.16754356026649475, + "learning_rate": 1.0610335317750144e-05, + "loss": 0.433, + "step": 6516 + }, + { + "epoch": 1.453714030782958, + "grad_norm": 0.17707470059394836, + "learning_rate": 1.0607986259118752e-05, + "loss": 0.4817, + "step": 6517 + }, + { + "epoch": 1.4539370956948472, + "grad_norm": 0.16972193121910095, + "learning_rate": 1.0605637166813261e-05, + "loss": 0.4376, + "step": 6518 + }, + { + "epoch": 1.4541601606067367, + "grad_norm": 0.1755881905555725, + "learning_rate": 1.0603288040963768e-05, + "loss": 0.4354, + "step": 6519 + }, + { + "epoch": 1.454383225518626, + "grad_norm": 0.17727094888687134, + "learning_rate": 1.0600938881700394e-05, + "loss": 0.4519, + "step": 6520 + }, + { + "epoch": 1.4546062904305153, + "grad_norm": 0.17010506987571716, + "learning_rate": 1.059858968915324e-05, + "loss": 0.4579, + "step": 6521 + }, + { + "epoch": 1.4548293553424045, + "grad_norm": 0.1777375340461731, + "learning_rate": 1.0596240463452427e-05, + "loss": 0.4646, + "step": 6522 + }, + { + "epoch": 1.455052420254294, + "grad_norm": 0.17490847408771515, + "learning_rate": 1.0593891204728064e-05, + "loss": 0.4906, + "step": 6523 + }, + { + "epoch": 1.4552754851661833, + "grad_norm": 0.16298337280750275, + "learning_rate": 1.0591541913110273e-05, + "loss": 0.4104, + "step": 6524 + }, + { + "epoch": 1.4554985500780728, + "grad_norm": 0.1771763265132904, + "learning_rate": 1.0589192588729167e-05, + "loss": 0.4829, + "step": 6525 + }, + { + "epoch": 1.455721614989962, + "grad_norm": 0.16941291093826294, + "learning_rate": 1.0586843231714874e-05, + "loss": 0.4469, + "step": 6526 + }, + { + "epoch": 1.4559446799018514, + "grad_norm": 0.16429558396339417, + "learning_rate": 1.0584493842197505e-05, + "loss": 0.4358, + "step": 6527 + }, + { + "epoch": 1.4561677448137407, + "grad_norm": 0.18524731695652008, + "learning_rate": 1.0582144420307196e-05, + "loss": 0.4476, + "step": 6528 + }, + { + "epoch": 1.4563908097256302, + "grad_norm": 0.18424685299396515, + "learning_rate": 1.0579794966174064e-05, + "loss": 0.4621, + "step": 6529 + }, + { + "epoch": 1.4566138746375195, + "grad_norm": 0.17425791919231415, + "learning_rate": 1.0577445479928239e-05, + "loss": 0.4229, + "step": 6530 + }, + { + "epoch": 1.456836939549409, + "grad_norm": 0.1773643046617508, + "learning_rate": 1.0575095961699856e-05, + "loss": 0.4825, + "step": 6531 + }, + { + "epoch": 1.4570600044612982, + "grad_norm": 0.1699049323797226, + "learning_rate": 1.0572746411619034e-05, + "loss": 0.448, + "step": 6532 + }, + { + "epoch": 1.4572830693731875, + "grad_norm": 0.1901503950357437, + "learning_rate": 1.0570396829815919e-05, + "loss": 0.4498, + "step": 6533 + }, + { + "epoch": 1.457506134285077, + "grad_norm": 0.1757887601852417, + "learning_rate": 1.0568047216420636e-05, + "loss": 0.4576, + "step": 6534 + }, + { + "epoch": 1.4577291991969663, + "grad_norm": 0.17276814579963684, + "learning_rate": 1.056569757156333e-05, + "loss": 0.4685, + "step": 6535 + }, + { + "epoch": 1.4579522641088558, + "grad_norm": 0.1706877201795578, + "learning_rate": 1.0563347895374127e-05, + "loss": 0.4404, + "step": 6536 + }, + { + "epoch": 1.458175329020745, + "grad_norm": 0.1756119430065155, + "learning_rate": 1.056099818798318e-05, + "loss": 0.485, + "step": 6537 + }, + { + "epoch": 1.4583983939326344, + "grad_norm": 0.17187528312206268, + "learning_rate": 1.0558648449520621e-05, + "loss": 0.4528, + "step": 6538 + }, + { + "epoch": 1.4586214588445237, + "grad_norm": 0.17733196914196014, + "learning_rate": 1.05562986801166e-05, + "loss": 0.4607, + "step": 6539 + }, + { + "epoch": 1.4588445237564132, + "grad_norm": 0.1726623773574829, + "learning_rate": 1.0553948879901255e-05, + "loss": 0.4593, + "step": 6540 + }, + { + "epoch": 1.4590675886683024, + "grad_norm": 0.1699986606836319, + "learning_rate": 1.0551599049004738e-05, + "loss": 0.4614, + "step": 6541 + }, + { + "epoch": 1.459290653580192, + "grad_norm": 0.17713840305805206, + "learning_rate": 1.0549249187557196e-05, + "loss": 0.4911, + "step": 6542 + }, + { + "epoch": 1.4595137184920812, + "grad_norm": 0.17454893887043, + "learning_rate": 1.054689929568878e-05, + "loss": 0.469, + "step": 6543 + }, + { + "epoch": 1.4597367834039705, + "grad_norm": 0.17392531037330627, + "learning_rate": 1.054454937352964e-05, + "loss": 0.4355, + "step": 6544 + }, + { + "epoch": 1.4599598483158598, + "grad_norm": 0.16687026619911194, + "learning_rate": 1.0542199421209931e-05, + "loss": 0.456, + "step": 6545 + }, + { + "epoch": 1.4601829132277493, + "grad_norm": 0.18140020966529846, + "learning_rate": 1.0539849438859806e-05, + "loss": 0.4495, + "step": 6546 + }, + { + "epoch": 1.4604059781396386, + "grad_norm": 0.1847231388092041, + "learning_rate": 1.0537499426609426e-05, + "loss": 0.4382, + "step": 6547 + }, + { + "epoch": 1.460629043051528, + "grad_norm": 0.17485037446022034, + "learning_rate": 1.0535149384588943e-05, + "loss": 0.428, + "step": 6548 + }, + { + "epoch": 1.4608521079634174, + "grad_norm": 0.17149090766906738, + "learning_rate": 1.0532799312928525e-05, + "loss": 0.4624, + "step": 6549 + }, + { + "epoch": 1.4610751728753066, + "grad_norm": 0.17815500497817993, + "learning_rate": 1.0530449211758327e-05, + "loss": 0.4581, + "step": 6550 + }, + { + "epoch": 1.4612982377871961, + "grad_norm": 0.16746407747268677, + "learning_rate": 1.0528099081208514e-05, + "loss": 0.4839, + "step": 6551 + }, + { + "epoch": 1.4615213026990854, + "grad_norm": 0.17022733390331268, + "learning_rate": 1.0525748921409256e-05, + "loss": 0.4636, + "step": 6552 + }, + { + "epoch": 1.461744367610975, + "grad_norm": 0.1741877943277359, + "learning_rate": 1.0523398732490712e-05, + "loss": 0.4602, + "step": 6553 + }, + { + "epoch": 1.4619674325228642, + "grad_norm": 0.16914743185043335, + "learning_rate": 1.0521048514583057e-05, + "loss": 0.4616, + "step": 6554 + }, + { + "epoch": 1.4621904974347535, + "grad_norm": 0.16964775323867798, + "learning_rate": 1.0518698267816454e-05, + "loss": 0.4738, + "step": 6555 + }, + { + "epoch": 1.4624135623466428, + "grad_norm": 0.17116379737854004, + "learning_rate": 1.0516347992321081e-05, + "loss": 0.4588, + "step": 6556 + }, + { + "epoch": 1.4626366272585323, + "grad_norm": 0.22291259467601776, + "learning_rate": 1.0513997688227107e-05, + "loss": 0.4697, + "step": 6557 + }, + { + "epoch": 1.4628596921704216, + "grad_norm": 0.18003083765506744, + "learning_rate": 1.051164735566471e-05, + "loss": 0.4628, + "step": 6558 + }, + { + "epoch": 1.463082757082311, + "grad_norm": 0.1763797551393509, + "learning_rate": 1.050929699476406e-05, + "loss": 0.4565, + "step": 6559 + }, + { + "epoch": 1.4633058219942003, + "grad_norm": 0.1819460391998291, + "learning_rate": 1.0506946605655342e-05, + "loss": 0.483, + "step": 6560 + }, + { + "epoch": 1.4635288869060896, + "grad_norm": 0.18002116680145264, + "learning_rate": 1.050459618846873e-05, + "loss": 0.4485, + "step": 6561 + }, + { + "epoch": 1.463751951817979, + "grad_norm": 0.1870141327381134, + "learning_rate": 1.0502245743334409e-05, + "loss": 0.4761, + "step": 6562 + }, + { + "epoch": 1.4639750167298684, + "grad_norm": 0.16674816608428955, + "learning_rate": 1.0499895270382558e-05, + "loss": 0.4615, + "step": 6563 + }, + { + "epoch": 1.4641980816417577, + "grad_norm": 0.18010953068733215, + "learning_rate": 1.0497544769743362e-05, + "loss": 0.4677, + "step": 6564 + }, + { + "epoch": 1.4644211465536472, + "grad_norm": 0.17383818328380585, + "learning_rate": 1.049519424154701e-05, + "loss": 0.4165, + "step": 6565 + }, + { + "epoch": 1.4646442114655365, + "grad_norm": 0.17807599902153015, + "learning_rate": 1.0492843685923684e-05, + "loss": 0.4671, + "step": 6566 + }, + { + "epoch": 1.4648672763774258, + "grad_norm": 0.1714266836643219, + "learning_rate": 1.0490493103003573e-05, + "loss": 0.4445, + "step": 6567 + }, + { + "epoch": 1.4650903412893153, + "grad_norm": 0.18563859164714813, + "learning_rate": 1.048814249291687e-05, + "loss": 0.4694, + "step": 6568 + }, + { + "epoch": 1.4653134062012045, + "grad_norm": 0.1768602728843689, + "learning_rate": 1.0485791855793768e-05, + "loss": 0.4529, + "step": 6569 + }, + { + "epoch": 1.465536471113094, + "grad_norm": 0.16756857931613922, + "learning_rate": 1.0483441191764453e-05, + "loss": 0.4739, + "step": 6570 + }, + { + "epoch": 1.4657595360249833, + "grad_norm": 0.16969642043113708, + "learning_rate": 1.0481090500959125e-05, + "loss": 0.4815, + "step": 6571 + }, + { + "epoch": 1.4659826009368726, + "grad_norm": 0.17080920934677124, + "learning_rate": 1.047873978350798e-05, + "loss": 0.476, + "step": 6572 + }, + { + "epoch": 1.4662056658487619, + "grad_norm": 0.17539383471012115, + "learning_rate": 1.0476389039541214e-05, + "loss": 0.4739, + "step": 6573 + }, + { + "epoch": 1.4664287307606514, + "grad_norm": 0.1617891639471054, + "learning_rate": 1.0474038269189026e-05, + "loss": 0.4419, + "step": 6574 + }, + { + "epoch": 1.4666517956725407, + "grad_norm": 0.1763634830713272, + "learning_rate": 1.0471687472581617e-05, + "loss": 0.4513, + "step": 6575 + }, + { + "epoch": 1.4668748605844302, + "grad_norm": 0.16819307208061218, + "learning_rate": 1.046933664984919e-05, + "loss": 0.4533, + "step": 6576 + }, + { + "epoch": 1.4670979254963195, + "grad_norm": 0.16723337769508362, + "learning_rate": 1.0466985801121948e-05, + "loss": 0.4588, + "step": 6577 + }, + { + "epoch": 1.4673209904082087, + "grad_norm": 0.16584204137325287, + "learning_rate": 1.046463492653009e-05, + "loss": 0.432, + "step": 6578 + }, + { + "epoch": 1.467544055320098, + "grad_norm": 0.1695747673511505, + "learning_rate": 1.0462284026203831e-05, + "loss": 0.4584, + "step": 6579 + }, + { + "epoch": 1.4677671202319875, + "grad_norm": 0.17204824090003967, + "learning_rate": 1.0459933100273371e-05, + "loss": 0.4653, + "step": 6580 + }, + { + "epoch": 1.4679901851438768, + "grad_norm": 0.18302413821220398, + "learning_rate": 1.0457582148868928e-05, + "loss": 0.4231, + "step": 6581 + }, + { + "epoch": 1.4682132500557663, + "grad_norm": 0.17187362909317017, + "learning_rate": 1.0455231172120704e-05, + "loss": 0.4444, + "step": 6582 + }, + { + "epoch": 1.4684363149676556, + "grad_norm": 0.16787424683570862, + "learning_rate": 1.0452880170158914e-05, + "loss": 0.4393, + "step": 6583 + }, + { + "epoch": 1.4686593798795449, + "grad_norm": 0.16989830136299133, + "learning_rate": 1.0450529143113772e-05, + "loss": 0.4475, + "step": 6584 + }, + { + "epoch": 1.4688824447914344, + "grad_norm": 0.17312617599964142, + "learning_rate": 1.0448178091115493e-05, + "loss": 0.4664, + "step": 6585 + }, + { + "epoch": 1.4691055097033237, + "grad_norm": 0.16683322191238403, + "learning_rate": 1.0445827014294292e-05, + "loss": 0.4654, + "step": 6586 + }, + { + "epoch": 1.4693285746152132, + "grad_norm": 0.18341569602489471, + "learning_rate": 1.0443475912780386e-05, + "loss": 0.4659, + "step": 6587 + }, + { + "epoch": 1.4695516395271024, + "grad_norm": 0.18148088455200195, + "learning_rate": 1.0441124786703996e-05, + "loss": 0.4771, + "step": 6588 + }, + { + "epoch": 1.4697747044389917, + "grad_norm": 0.1746566742658615, + "learning_rate": 1.0438773636195336e-05, + "loss": 0.4601, + "step": 6589 + }, + { + "epoch": 1.469997769350881, + "grad_norm": 0.1989150047302246, + "learning_rate": 1.0436422461384636e-05, + "loss": 0.4603, + "step": 6590 + }, + { + "epoch": 1.4702208342627705, + "grad_norm": 0.1698281466960907, + "learning_rate": 1.0434071262402114e-05, + "loss": 0.4584, + "step": 6591 + }, + { + "epoch": 1.4704438991746598, + "grad_norm": 0.1744074821472168, + "learning_rate": 1.0431720039377998e-05, + "loss": 0.4839, + "step": 6592 + }, + { + "epoch": 1.4706669640865493, + "grad_norm": 0.1633480042219162, + "learning_rate": 1.0429368792442507e-05, + "loss": 0.4426, + "step": 6593 + }, + { + "epoch": 1.4708900289984386, + "grad_norm": 0.17857415974140167, + "learning_rate": 1.0427017521725873e-05, + "loss": 0.4595, + "step": 6594 + }, + { + "epoch": 1.4711130939103279, + "grad_norm": 0.1769183874130249, + "learning_rate": 1.0424666227358323e-05, + "loss": 0.4862, + "step": 6595 + }, + { + "epoch": 1.4713361588222171, + "grad_norm": 0.16599570214748383, + "learning_rate": 1.0422314909470082e-05, + "loss": 0.4185, + "step": 6596 + }, + { + "epoch": 1.4715592237341066, + "grad_norm": 0.16501422226428986, + "learning_rate": 1.0419963568191389e-05, + "loss": 0.4314, + "step": 6597 + }, + { + "epoch": 1.471782288645996, + "grad_norm": 0.17242339253425598, + "learning_rate": 1.041761220365247e-05, + "loss": 0.4411, + "step": 6598 + }, + { + "epoch": 1.4720053535578854, + "grad_norm": 0.1672823429107666, + "learning_rate": 1.0415260815983561e-05, + "loss": 0.4531, + "step": 6599 + }, + { + "epoch": 1.4722284184697747, + "grad_norm": 0.1693340539932251, + "learning_rate": 1.0412909405314896e-05, + "loss": 0.4494, + "step": 6600 + }, + { + "epoch": 1.472451483381664, + "grad_norm": 0.17069166898727417, + "learning_rate": 1.0410557971776711e-05, + "loss": 0.4587, + "step": 6601 + }, + { + "epoch": 1.4726745482935535, + "grad_norm": 0.1749400794506073, + "learning_rate": 1.0408206515499242e-05, + "loss": 0.4258, + "step": 6602 + }, + { + "epoch": 1.4728976132054428, + "grad_norm": 0.17911866307258606, + "learning_rate": 1.0405855036612728e-05, + "loss": 0.4718, + "step": 6603 + }, + { + "epoch": 1.4731206781173323, + "grad_norm": 0.1690555214881897, + "learning_rate": 1.040350353524741e-05, + "loss": 0.4594, + "step": 6604 + }, + { + "epoch": 1.4733437430292216, + "grad_norm": 0.1977461576461792, + "learning_rate": 1.0401152011533531e-05, + "loss": 0.4651, + "step": 6605 + }, + { + "epoch": 1.4735668079411108, + "grad_norm": 0.17112137377262115, + "learning_rate": 1.0398800465601327e-05, + "loss": 0.47, + "step": 6606 + }, + { + "epoch": 1.4737898728530001, + "grad_norm": 0.20035414397716522, + "learning_rate": 1.0396448897581043e-05, + "loss": 0.4539, + "step": 6607 + }, + { + "epoch": 1.4740129377648896, + "grad_norm": 0.1666097790002823, + "learning_rate": 1.0394097307602928e-05, + "loss": 0.4328, + "step": 6608 + }, + { + "epoch": 1.474236002676779, + "grad_norm": 0.17964059114456177, + "learning_rate": 1.0391745695797226e-05, + "loss": 0.4684, + "step": 6609 + }, + { + "epoch": 1.4744590675886684, + "grad_norm": 0.1727062463760376, + "learning_rate": 1.038939406229418e-05, + "loss": 0.435, + "step": 6610 + }, + { + "epoch": 1.4746821325005577, + "grad_norm": 0.18273860216140747, + "learning_rate": 1.0387042407224046e-05, + "loss": 0.4723, + "step": 6611 + }, + { + "epoch": 1.474905197412447, + "grad_norm": 0.1780747026205063, + "learning_rate": 1.0384690730717065e-05, + "loss": 0.4544, + "step": 6612 + }, + { + "epoch": 1.4751282623243362, + "grad_norm": 0.17232657968997955, + "learning_rate": 1.0382339032903492e-05, + "loss": 0.4507, + "step": 6613 + }, + { + "epoch": 1.4753513272362258, + "grad_norm": 0.17787334322929382, + "learning_rate": 1.037998731391358e-05, + "loss": 0.4654, + "step": 6614 + }, + { + "epoch": 1.475574392148115, + "grad_norm": 0.17883604764938354, + "learning_rate": 1.0377635573877581e-05, + "loss": 0.4569, + "step": 6615 + }, + { + "epoch": 1.4757974570600045, + "grad_norm": 0.172362819314003, + "learning_rate": 1.0375283812925745e-05, + "loss": 0.4433, + "step": 6616 + }, + { + "epoch": 1.4760205219718938, + "grad_norm": 0.17884163558483124, + "learning_rate": 1.037293203118833e-05, + "loss": 0.437, + "step": 6617 + }, + { + "epoch": 1.476243586883783, + "grad_norm": 0.1667267084121704, + "learning_rate": 1.0370580228795597e-05, + "loss": 0.4556, + "step": 6618 + }, + { + "epoch": 1.4764666517956726, + "grad_norm": 0.17479191720485687, + "learning_rate": 1.0368228405877799e-05, + "loss": 0.4462, + "step": 6619 + }, + { + "epoch": 1.4766897167075619, + "grad_norm": 0.1751541793346405, + "learning_rate": 1.0365876562565195e-05, + "loss": 0.4504, + "step": 6620 + }, + { + "epoch": 1.4769127816194514, + "grad_norm": 0.20716704428195953, + "learning_rate": 1.0363524698988045e-05, + "loss": 0.4626, + "step": 6621 + }, + { + "epoch": 1.4771358465313407, + "grad_norm": 0.16398191452026367, + "learning_rate": 1.0361172815276615e-05, + "loss": 0.4581, + "step": 6622 + }, + { + "epoch": 1.47735891144323, + "grad_norm": 0.17340189218521118, + "learning_rate": 1.0358820911561157e-05, + "loss": 0.4472, + "step": 6623 + }, + { + "epoch": 1.4775819763551192, + "grad_norm": 0.18412639200687408, + "learning_rate": 1.0356468987971944e-05, + "loss": 0.4323, + "step": 6624 + }, + { + "epoch": 1.4778050412670087, + "grad_norm": 0.16925156116485596, + "learning_rate": 1.0354117044639232e-05, + "loss": 0.4324, + "step": 6625 + }, + { + "epoch": 1.478028106178898, + "grad_norm": 0.1772768348455429, + "learning_rate": 1.0351765081693294e-05, + "loss": 0.4574, + "step": 6626 + }, + { + "epoch": 1.4782511710907875, + "grad_norm": 0.1698581427335739, + "learning_rate": 1.0349413099264391e-05, + "loss": 0.4748, + "step": 6627 + }, + { + "epoch": 1.4784742360026768, + "grad_norm": 0.1687650978565216, + "learning_rate": 1.0347061097482794e-05, + "loss": 0.4226, + "step": 6628 + }, + { + "epoch": 1.478697300914566, + "grad_norm": 0.18025682866573334, + "learning_rate": 1.034470907647877e-05, + "loss": 0.4615, + "step": 6629 + }, + { + "epoch": 1.4789203658264554, + "grad_norm": 0.18218332529067993, + "learning_rate": 1.034235703638259e-05, + "loss": 0.4393, + "step": 6630 + }, + { + "epoch": 1.4791434307383449, + "grad_norm": 0.17049945890903473, + "learning_rate": 1.0340004977324521e-05, + "loss": 0.4613, + "step": 6631 + }, + { + "epoch": 1.4793664956502341, + "grad_norm": 0.18157726526260376, + "learning_rate": 1.033765289943484e-05, + "loss": 0.4811, + "step": 6632 + }, + { + "epoch": 1.4795895605621237, + "grad_norm": 0.17979174852371216, + "learning_rate": 1.0335300802843815e-05, + "loss": 0.4478, + "step": 6633 + }, + { + "epoch": 1.479812625474013, + "grad_norm": 0.17775587737560272, + "learning_rate": 1.0332948687681725e-05, + "loss": 0.4669, + "step": 6634 + }, + { + "epoch": 1.4800356903859022, + "grad_norm": 0.1723344624042511, + "learning_rate": 1.0330596554078842e-05, + "loss": 0.4577, + "step": 6635 + }, + { + "epoch": 1.4802587552977917, + "grad_norm": 0.1759558767080307, + "learning_rate": 1.0328244402165442e-05, + "loss": 0.476, + "step": 6636 + }, + { + "epoch": 1.480481820209681, + "grad_norm": 0.1862598955631256, + "learning_rate": 1.0325892232071803e-05, + "loss": 0.464, + "step": 6637 + }, + { + "epoch": 1.4807048851215705, + "grad_norm": 0.18910667300224304, + "learning_rate": 1.0323540043928199e-05, + "loss": 0.4197, + "step": 6638 + }, + { + "epoch": 1.4809279500334598, + "grad_norm": 0.16868862509727478, + "learning_rate": 1.0321187837864917e-05, + "loss": 0.4334, + "step": 6639 + }, + { + "epoch": 1.481151014945349, + "grad_norm": 0.17516660690307617, + "learning_rate": 1.0318835614012228e-05, + "loss": 0.4503, + "step": 6640 + }, + { + "epoch": 1.4813740798572383, + "grad_norm": 0.1748761534690857, + "learning_rate": 1.0316483372500422e-05, + "loss": 0.4656, + "step": 6641 + }, + { + "epoch": 1.4815971447691278, + "grad_norm": 0.17153500020503998, + "learning_rate": 1.0314131113459772e-05, + "loss": 0.4646, + "step": 6642 + }, + { + "epoch": 1.4818202096810171, + "grad_norm": 0.1690903753042221, + "learning_rate": 1.0311778837020565e-05, + "loss": 0.4384, + "step": 6643 + }, + { + "epoch": 1.4820432745929066, + "grad_norm": 0.17170549929141998, + "learning_rate": 1.0309426543313086e-05, + "loss": 0.4606, + "step": 6644 + }, + { + "epoch": 1.482266339504796, + "grad_norm": 0.20019586384296417, + "learning_rate": 1.030707423246762e-05, + "loss": 0.4458, + "step": 6645 + }, + { + "epoch": 1.4824894044166852, + "grad_norm": 0.17187346518039703, + "learning_rate": 1.0304721904614447e-05, + "loss": 0.4777, + "step": 6646 + }, + { + "epoch": 1.4827124693285745, + "grad_norm": 0.17592963576316833, + "learning_rate": 1.0302369559883862e-05, + "loss": 0.462, + "step": 6647 + }, + { + "epoch": 1.482935534240464, + "grad_norm": 0.18362174928188324, + "learning_rate": 1.0300017198406148e-05, + "loss": 0.4851, + "step": 6648 + }, + { + "epoch": 1.4831585991523533, + "grad_norm": 0.17138110101222992, + "learning_rate": 1.0297664820311593e-05, + "loss": 0.5046, + "step": 6649 + }, + { + "epoch": 1.4833816640642428, + "grad_norm": 0.16969837248325348, + "learning_rate": 1.029531242573049e-05, + "loss": 0.4376, + "step": 6650 + }, + { + "epoch": 1.483604728976132, + "grad_norm": 0.1832205057144165, + "learning_rate": 1.0292960014793126e-05, + "loss": 0.4376, + "step": 6651 + }, + { + "epoch": 1.4838277938880213, + "grad_norm": 0.1777479201555252, + "learning_rate": 1.0290607587629795e-05, + "loss": 0.4616, + "step": 6652 + }, + { + "epoch": 1.4840508587999108, + "grad_norm": 0.18245477974414825, + "learning_rate": 1.0288255144370784e-05, + "loss": 0.4699, + "step": 6653 + }, + { + "epoch": 1.4842739237118001, + "grad_norm": 0.16267704963684082, + "learning_rate": 1.0285902685146394e-05, + "loss": 0.4358, + "step": 6654 + }, + { + "epoch": 1.4844969886236896, + "grad_norm": 0.1723002940416336, + "learning_rate": 1.0283550210086913e-05, + "loss": 0.4621, + "step": 6655 + }, + { + "epoch": 1.484720053535579, + "grad_norm": 0.17394044995307922, + "learning_rate": 1.028119771932264e-05, + "loss": 0.4456, + "step": 6656 + }, + { + "epoch": 1.4849431184474682, + "grad_norm": 0.1774415820837021, + "learning_rate": 1.0278845212983865e-05, + "loss": 0.4761, + "step": 6657 + }, + { + "epoch": 1.4851661833593575, + "grad_norm": 0.16867241263389587, + "learning_rate": 1.0276492691200893e-05, + "loss": 0.482, + "step": 6658 + }, + { + "epoch": 1.485389248271247, + "grad_norm": 0.1685202419757843, + "learning_rate": 1.027414015410401e-05, + "loss": 0.45, + "step": 6659 + }, + { + "epoch": 1.4856123131831362, + "grad_norm": 0.1738312542438507, + "learning_rate": 1.0271787601823526e-05, + "loss": 0.4591, + "step": 6660 + }, + { + "epoch": 1.4858353780950257, + "grad_norm": 0.17828047275543213, + "learning_rate": 1.0269435034489733e-05, + "loss": 0.4261, + "step": 6661 + }, + { + "epoch": 1.486058443006915, + "grad_norm": 0.164927139878273, + "learning_rate": 1.0267082452232935e-05, + "loss": 0.4383, + "step": 6662 + }, + { + "epoch": 1.4862815079188043, + "grad_norm": 0.17446818947792053, + "learning_rate": 1.0264729855183426e-05, + "loss": 0.461, + "step": 6663 + }, + { + "epoch": 1.4865045728306936, + "grad_norm": 0.1646289974451065, + "learning_rate": 1.0262377243471517e-05, + "loss": 0.4335, + "step": 6664 + }, + { + "epoch": 1.486727637742583, + "grad_norm": 0.19351711869239807, + "learning_rate": 1.0260024617227504e-05, + "loss": 0.4697, + "step": 6665 + }, + { + "epoch": 1.4869507026544724, + "grad_norm": 0.18421606719493866, + "learning_rate": 1.0257671976581694e-05, + "loss": 0.4531, + "step": 6666 + }, + { + "epoch": 1.4871737675663619, + "grad_norm": 0.17079128324985504, + "learning_rate": 1.0255319321664386e-05, + "loss": 0.4531, + "step": 6667 + }, + { + "epoch": 1.4873968324782512, + "grad_norm": 0.16973423957824707, + "learning_rate": 1.0252966652605889e-05, + "loss": 0.4358, + "step": 6668 + }, + { + "epoch": 1.4876198973901404, + "grad_norm": 0.17064902186393738, + "learning_rate": 1.0250613969536507e-05, + "loss": 0.4843, + "step": 6669 + }, + { + "epoch": 1.48784296230203, + "grad_norm": 0.1614328771829605, + "learning_rate": 1.0248261272586549e-05, + "loss": 0.4197, + "step": 6670 + }, + { + "epoch": 1.4880660272139192, + "grad_norm": 0.1676885336637497, + "learning_rate": 1.0245908561886319e-05, + "loss": 0.464, + "step": 6671 + }, + { + "epoch": 1.4882890921258087, + "grad_norm": 0.17245353758335114, + "learning_rate": 1.0243555837566124e-05, + "loss": 0.4368, + "step": 6672 + }, + { + "epoch": 1.488512157037698, + "grad_norm": 0.184434175491333, + "learning_rate": 1.0241203099756279e-05, + "loss": 0.4419, + "step": 6673 + }, + { + "epoch": 1.4887352219495873, + "grad_norm": 0.17373715341091156, + "learning_rate": 1.0238850348587088e-05, + "loss": 0.471, + "step": 6674 + }, + { + "epoch": 1.4889582868614766, + "grad_norm": 0.1708296537399292, + "learning_rate": 1.0236497584188862e-05, + "loss": 0.4468, + "step": 6675 + }, + { + "epoch": 1.489181351773366, + "grad_norm": 0.18406598269939423, + "learning_rate": 1.023414480669191e-05, + "loss": 0.4622, + "step": 6676 + }, + { + "epoch": 1.4894044166852554, + "grad_norm": 0.17563220858573914, + "learning_rate": 1.0231792016226546e-05, + "loss": 0.4529, + "step": 6677 + }, + { + "epoch": 1.4896274815971449, + "grad_norm": 0.21065136790275574, + "learning_rate": 1.0229439212923084e-05, + "loss": 0.4668, + "step": 6678 + }, + { + "epoch": 1.4898505465090341, + "grad_norm": 0.18098492920398712, + "learning_rate": 1.0227086396911837e-05, + "loss": 0.4669, + "step": 6679 + }, + { + "epoch": 1.4900736114209234, + "grad_norm": 0.18423466384410858, + "learning_rate": 1.0224733568323111e-05, + "loss": 0.4815, + "step": 6680 + }, + { + "epoch": 1.490296676332813, + "grad_norm": 0.17648786306381226, + "learning_rate": 1.022238072728723e-05, + "loss": 0.4566, + "step": 6681 + }, + { + "epoch": 1.4905197412447022, + "grad_norm": 0.17524605989456177, + "learning_rate": 1.0220027873934506e-05, + "loss": 0.4466, + "step": 6682 + }, + { + "epoch": 1.4907428061565915, + "grad_norm": 0.18514925241470337, + "learning_rate": 1.021767500839525e-05, + "loss": 0.4545, + "step": 6683 + }, + { + "epoch": 1.490965871068481, + "grad_norm": 0.17596104741096497, + "learning_rate": 1.021532213079979e-05, + "loss": 0.4637, + "step": 6684 + }, + { + "epoch": 1.4911889359803703, + "grad_norm": 0.1722065955400467, + "learning_rate": 1.021296924127843e-05, + "loss": 0.4471, + "step": 6685 + }, + { + "epoch": 1.4914120008922596, + "grad_norm": 0.1769905835390091, + "learning_rate": 1.0210616339961497e-05, + "loss": 0.4665, + "step": 6686 + }, + { + "epoch": 1.491635065804149, + "grad_norm": 0.1704041212797165, + "learning_rate": 1.0208263426979304e-05, + "loss": 0.437, + "step": 6687 + }, + { + "epoch": 1.4918581307160383, + "grad_norm": 0.1706121265888214, + "learning_rate": 1.0205910502462174e-05, + "loss": 0.4533, + "step": 6688 + }, + { + "epoch": 1.4920811956279278, + "grad_norm": 0.1680435687303543, + "learning_rate": 1.0203557566540425e-05, + "loss": 0.4465, + "step": 6689 + }, + { + "epoch": 1.4923042605398171, + "grad_norm": 0.17137907445430756, + "learning_rate": 1.0201204619344378e-05, + "loss": 0.4712, + "step": 6690 + }, + { + "epoch": 1.4925273254517064, + "grad_norm": 0.17346073687076569, + "learning_rate": 1.0198851661004352e-05, + "loss": 0.4396, + "step": 6691 + }, + { + "epoch": 1.4927503903635957, + "grad_norm": 0.1783093512058258, + "learning_rate": 1.0196498691650671e-05, + "loss": 0.453, + "step": 6692 + }, + { + "epoch": 1.4929734552754852, + "grad_norm": 0.16575607657432556, + "learning_rate": 1.0194145711413656e-05, + "loss": 0.4234, + "step": 6693 + }, + { + "epoch": 1.4931965201873745, + "grad_norm": 0.17617058753967285, + "learning_rate": 1.0191792720423632e-05, + "loss": 0.4275, + "step": 6694 + }, + { + "epoch": 1.493419585099264, + "grad_norm": 0.17400598526000977, + "learning_rate": 1.0189439718810919e-05, + "loss": 0.4667, + "step": 6695 + }, + { + "epoch": 1.4936426500111533, + "grad_norm": 0.17919796705245972, + "learning_rate": 1.0187086706705844e-05, + "loss": 0.4669, + "step": 6696 + }, + { + "epoch": 1.4938657149230425, + "grad_norm": 0.17682035267353058, + "learning_rate": 1.0184733684238728e-05, + "loss": 0.4918, + "step": 6697 + }, + { + "epoch": 1.494088779834932, + "grad_norm": 0.16982607543468475, + "learning_rate": 1.01823806515399e-05, + "loss": 0.4626, + "step": 6698 + }, + { + "epoch": 1.4943118447468213, + "grad_norm": 0.17292605340480804, + "learning_rate": 1.0180027608739684e-05, + "loss": 0.4648, + "step": 6699 + }, + { + "epoch": 1.4945349096587106, + "grad_norm": 0.16893380880355835, + "learning_rate": 1.017767455596841e-05, + "loss": 0.4285, + "step": 6700 + }, + { + "epoch": 1.4947579745706001, + "grad_norm": 0.17039963603019714, + "learning_rate": 1.0175321493356396e-05, + "loss": 0.4589, + "step": 6701 + }, + { + "epoch": 1.4949810394824894, + "grad_norm": 0.1750868260860443, + "learning_rate": 1.0172968421033977e-05, + "loss": 0.4703, + "step": 6702 + }, + { + "epoch": 1.4952041043943787, + "grad_norm": 0.17685000598430634, + "learning_rate": 1.017061533913148e-05, + "loss": 0.4585, + "step": 6703 + }, + { + "epoch": 1.4954271693062682, + "grad_norm": 0.1752578467130661, + "learning_rate": 1.0168262247779231e-05, + "loss": 0.4583, + "step": 6704 + }, + { + "epoch": 1.4956502342181575, + "grad_norm": 0.16632848978042603, + "learning_rate": 1.0165909147107563e-05, + "loss": 0.4684, + "step": 6705 + }, + { + "epoch": 1.495873299130047, + "grad_norm": 0.17605751752853394, + "learning_rate": 1.0163556037246798e-05, + "loss": 0.4747, + "step": 6706 + }, + { + "epoch": 1.4960963640419362, + "grad_norm": 0.16805213689804077, + "learning_rate": 1.0161202918327276e-05, + "loss": 0.4272, + "step": 6707 + }, + { + "epoch": 1.4963194289538255, + "grad_norm": 0.25550636649131775, + "learning_rate": 1.0158849790479318e-05, + "loss": 0.4756, + "step": 6708 + }, + { + "epoch": 1.4965424938657148, + "grad_norm": 0.1749117523431778, + "learning_rate": 1.015649665383326e-05, + "loss": 0.4746, + "step": 6709 + }, + { + "epoch": 1.4967655587776043, + "grad_norm": 0.17432990670204163, + "learning_rate": 1.0154143508519434e-05, + "loss": 0.4707, + "step": 6710 + }, + { + "epoch": 1.4969886236894936, + "grad_norm": 0.16568134725093842, + "learning_rate": 1.0151790354668171e-05, + "loss": 0.4359, + "step": 6711 + }, + { + "epoch": 1.497211688601383, + "grad_norm": 0.17027510702610016, + "learning_rate": 1.0149437192409803e-05, + "loss": 0.4352, + "step": 6712 + }, + { + "epoch": 1.4974347535132724, + "grad_norm": 0.17918628454208374, + "learning_rate": 1.0147084021874664e-05, + "loss": 0.4749, + "step": 6713 + }, + { + "epoch": 1.4976578184251617, + "grad_norm": 0.1729065179824829, + "learning_rate": 1.0144730843193086e-05, + "loss": 0.4465, + "step": 6714 + }, + { + "epoch": 1.4978808833370512, + "grad_norm": 0.17687968909740448, + "learning_rate": 1.0142377656495405e-05, + "loss": 0.4861, + "step": 6715 + }, + { + "epoch": 1.4981039482489404, + "grad_norm": 0.17104387283325195, + "learning_rate": 1.0140024461911955e-05, + "loss": 0.4529, + "step": 6716 + }, + { + "epoch": 1.4983270131608297, + "grad_norm": 0.1739518642425537, + "learning_rate": 1.0137671259573066e-05, + "loss": 0.4493, + "step": 6717 + }, + { + "epoch": 1.4985500780727192, + "grad_norm": 0.17664003372192383, + "learning_rate": 1.013531804960908e-05, + "loss": 0.4567, + "step": 6718 + }, + { + "epoch": 1.4987731429846085, + "grad_norm": 0.16971804201602936, + "learning_rate": 1.0132964832150325e-05, + "loss": 0.4546, + "step": 6719 + }, + { + "epoch": 1.4989962078964978, + "grad_norm": 0.17096221446990967, + "learning_rate": 1.0130611607327144e-05, + "loss": 0.4399, + "step": 6720 + }, + { + "epoch": 1.4992192728083873, + "grad_norm": 0.16578656435012817, + "learning_rate": 1.0128258375269868e-05, + "loss": 0.4558, + "step": 6721 + }, + { + "epoch": 1.4994423377202766, + "grad_norm": 0.17407840490341187, + "learning_rate": 1.012590513610884e-05, + "loss": 0.4345, + "step": 6722 + }, + { + "epoch": 1.499665402632166, + "grad_norm": 0.16804394125938416, + "learning_rate": 1.012355188997439e-05, + "loss": 0.4497, + "step": 6723 + }, + { + "epoch": 1.4998884675440554, + "grad_norm": 0.1964014172554016, + "learning_rate": 1.0121198636996862e-05, + "loss": 0.4412, + "step": 6724 + }, + { + "epoch": 1.5001115324559446, + "grad_norm": 0.17465969920158386, + "learning_rate": 1.011884537730659e-05, + "loss": 0.4605, + "step": 6725 + }, + { + "epoch": 1.500334597367834, + "grad_norm": 0.17849335074424744, + "learning_rate": 1.0116492111033916e-05, + "loss": 0.4766, + "step": 6726 + }, + { + "epoch": 1.5005576622797234, + "grad_norm": 0.1719314008951187, + "learning_rate": 1.0114138838309171e-05, + "loss": 0.4745, + "step": 6727 + }, + { + "epoch": 1.5007807271916127, + "grad_norm": 0.17707185447216034, + "learning_rate": 1.0111785559262703e-05, + "loss": 0.4544, + "step": 6728 + }, + { + "epoch": 1.5010037921035022, + "grad_norm": 0.18315830826759338, + "learning_rate": 1.0109432274024846e-05, + "loss": 0.4607, + "step": 6729 + }, + { + "epoch": 1.5012268570153915, + "grad_norm": 0.18333736062049866, + "learning_rate": 1.0107078982725942e-05, + "loss": 0.4835, + "step": 6730 + }, + { + "epoch": 1.5014499219272808, + "grad_norm": 0.1704811453819275, + "learning_rate": 1.010472568549633e-05, + "loss": 0.4337, + "step": 6731 + }, + { + "epoch": 1.50167298683917, + "grad_norm": 0.1818595677614212, + "learning_rate": 1.0102372382466352e-05, + "loss": 0.46, + "step": 6732 + }, + { + "epoch": 1.5018960517510596, + "grad_norm": 0.16688323020935059, + "learning_rate": 1.0100019073766344e-05, + "loss": 0.4584, + "step": 6733 + }, + { + "epoch": 1.502119116662949, + "grad_norm": 0.17898161709308624, + "learning_rate": 1.0097665759526654e-05, + "loss": 0.4605, + "step": 6734 + }, + { + "epoch": 1.5023421815748383, + "grad_norm": 0.16883964836597443, + "learning_rate": 1.009531243987762e-05, + "loss": 0.4334, + "step": 6735 + }, + { + "epoch": 1.5025652464867276, + "grad_norm": 0.16763442754745483, + "learning_rate": 1.009295911494958e-05, + "loss": 0.4353, + "step": 6736 + }, + { + "epoch": 1.502788311398617, + "grad_norm": 0.1733693778514862, + "learning_rate": 1.0090605784872884e-05, + "loss": 0.4385, + "step": 6737 + }, + { + "epoch": 1.5030113763105064, + "grad_norm": 0.17827312648296356, + "learning_rate": 1.0088252449777866e-05, + "loss": 0.4436, + "step": 6738 + }, + { + "epoch": 1.5032344412223957, + "grad_norm": 0.17768245935440063, + "learning_rate": 1.0085899109794874e-05, + "loss": 0.4745, + "step": 6739 + }, + { + "epoch": 1.5034575061342852, + "grad_norm": 0.1763843595981598, + "learning_rate": 1.0083545765054248e-05, + "loss": 0.4604, + "step": 6740 + }, + { + "epoch": 1.5036805710461745, + "grad_norm": 0.17313605546951294, + "learning_rate": 1.0081192415686334e-05, + "loss": 0.4513, + "step": 6741 + }, + { + "epoch": 1.5039036359580638, + "grad_norm": 0.17623473703861237, + "learning_rate": 1.0078839061821473e-05, + "loss": 0.4541, + "step": 6742 + }, + { + "epoch": 1.504126700869953, + "grad_norm": 0.17384295165538788, + "learning_rate": 1.007648570359001e-05, + "loss": 0.454, + "step": 6743 + }, + { + "epoch": 1.5043497657818425, + "grad_norm": 0.1829373836517334, + "learning_rate": 1.0074132341122282e-05, + "loss": 0.4691, + "step": 6744 + }, + { + "epoch": 1.5045728306937318, + "grad_norm": 0.17881907522678375, + "learning_rate": 1.0071778974548642e-05, + "loss": 0.4705, + "step": 6745 + }, + { + "epoch": 1.5047958956056213, + "grad_norm": 0.17440833151340485, + "learning_rate": 1.0069425603999432e-05, + "loss": 0.45, + "step": 6746 + }, + { + "epoch": 1.5050189605175106, + "grad_norm": 0.17206740379333496, + "learning_rate": 1.0067072229604995e-05, + "loss": 0.4508, + "step": 6747 + }, + { + "epoch": 1.5052420254293999, + "grad_norm": 0.18030904233455658, + "learning_rate": 1.0064718851495674e-05, + "loss": 0.4786, + "step": 6748 + }, + { + "epoch": 1.5054650903412892, + "grad_norm": 0.17595484852790833, + "learning_rate": 1.0062365469801818e-05, + "loss": 0.4772, + "step": 6749 + }, + { + "epoch": 1.5056881552531787, + "grad_norm": 0.1742616444826126, + "learning_rate": 1.006001208465377e-05, + "loss": 0.4564, + "step": 6750 + }, + { + "epoch": 1.5059112201650682, + "grad_norm": 0.16961626708507538, + "learning_rate": 1.0057658696181875e-05, + "loss": 0.4514, + "step": 6751 + }, + { + "epoch": 1.5061342850769575, + "grad_norm": 0.16909094154834747, + "learning_rate": 1.0055305304516477e-05, + "loss": 0.4647, + "step": 6752 + }, + { + "epoch": 1.5063573499888467, + "grad_norm": 0.168659508228302, + "learning_rate": 1.0052951909787927e-05, + "loss": 0.4883, + "step": 6753 + }, + { + "epoch": 1.506580414900736, + "grad_norm": 0.16884328424930573, + "learning_rate": 1.0050598512126562e-05, + "loss": 0.4683, + "step": 6754 + }, + { + "epoch": 1.5068034798126255, + "grad_norm": 0.17461951076984406, + "learning_rate": 1.0048245111662735e-05, + "loss": 0.455, + "step": 6755 + }, + { + "epoch": 1.5070265447245148, + "grad_norm": 0.1754961460828781, + "learning_rate": 1.0045891708526796e-05, + "loss": 0.4833, + "step": 6756 + }, + { + "epoch": 1.5072496096364043, + "grad_norm": 0.16815195977687836, + "learning_rate": 1.0043538302849078e-05, + "loss": 0.4338, + "step": 6757 + }, + { + "epoch": 1.5074726745482936, + "grad_norm": 0.1777064949274063, + "learning_rate": 1.004118489475994e-05, + "loss": 0.4582, + "step": 6758 + }, + { + "epoch": 1.5076957394601829, + "grad_norm": 0.17062366008758545, + "learning_rate": 1.0038831484389719e-05, + "loss": 0.4427, + "step": 6759 + }, + { + "epoch": 1.5079188043720722, + "grad_norm": 0.16902603209018707, + "learning_rate": 1.003647807186877e-05, + "loss": 0.464, + "step": 6760 + }, + { + "epoch": 1.5081418692839617, + "grad_norm": 0.4289887547492981, + "learning_rate": 1.0034124657327433e-05, + "loss": 0.4714, + "step": 6761 + }, + { + "epoch": 1.5083649341958512, + "grad_norm": 0.17374208569526672, + "learning_rate": 1.003177124089606e-05, + "loss": 0.4568, + "step": 6762 + }, + { + "epoch": 1.5085879991077404, + "grad_norm": 0.2121153473854065, + "learning_rate": 1.0029417822704995e-05, + "loss": 0.4444, + "step": 6763 + }, + { + "epoch": 1.5088110640196297, + "grad_norm": 0.17344814538955688, + "learning_rate": 1.0027064402884585e-05, + "loss": 0.4663, + "step": 6764 + }, + { + "epoch": 1.509034128931519, + "grad_norm": 0.18848566710948944, + "learning_rate": 1.0024710981565177e-05, + "loss": 0.4632, + "step": 6765 + }, + { + "epoch": 1.5092571938434083, + "grad_norm": 0.16708870232105255, + "learning_rate": 1.002235755887712e-05, + "loss": 0.4484, + "step": 6766 + }, + { + "epoch": 1.5094802587552978, + "grad_norm": 0.17515549063682556, + "learning_rate": 1.0020004134950765e-05, + "loss": 0.4862, + "step": 6767 + }, + { + "epoch": 1.5097033236671873, + "grad_norm": 0.17192357778549194, + "learning_rate": 1.001765070991645e-05, + "loss": 0.4601, + "step": 6768 + }, + { + "epoch": 1.5099263885790766, + "grad_norm": 0.1708642542362213, + "learning_rate": 1.001529728390453e-05, + "loss": 0.4686, + "step": 6769 + }, + { + "epoch": 1.5101494534909659, + "grad_norm": 0.17151105403900146, + "learning_rate": 1.0012943857045349e-05, + "loss": 0.459, + "step": 6770 + }, + { + "epoch": 1.5103725184028551, + "grad_norm": 0.173813134431839, + "learning_rate": 1.001059042946926e-05, + "loss": 0.4437, + "step": 6771 + }, + { + "epoch": 1.5105955833147446, + "grad_norm": 0.17130842804908752, + "learning_rate": 1.0008237001306602e-05, + "loss": 0.457, + "step": 6772 + }, + { + "epoch": 1.510818648226634, + "grad_norm": 0.17134280502796173, + "learning_rate": 1.000588357268773e-05, + "loss": 0.4546, + "step": 6773 + }, + { + "epoch": 1.5110417131385234, + "grad_norm": 0.17797818779945374, + "learning_rate": 1.0003530143742986e-05, + "loss": 0.4637, + "step": 6774 + }, + { + "epoch": 1.5112647780504127, + "grad_norm": 0.16602249443531036, + "learning_rate": 1.000117671460272e-05, + "loss": 0.4494, + "step": 6775 + }, + { + "epoch": 1.511487842962302, + "grad_norm": 0.16833782196044922, + "learning_rate": 9.998823285397282e-06, + "loss": 0.4414, + "step": 6776 + }, + { + "epoch": 1.5117109078741913, + "grad_norm": 0.1781986951828003, + "learning_rate": 9.996469856257018e-06, + "loss": 0.444, + "step": 6777 + }, + { + "epoch": 1.5119339727860808, + "grad_norm": 0.16520027816295624, + "learning_rate": 9.994116427312274e-06, + "loss": 0.4289, + "step": 6778 + }, + { + "epoch": 1.5121570376979703, + "grad_norm": 0.17205528914928436, + "learning_rate": 9.991762998693401e-06, + "loss": 0.4745, + "step": 6779 + }, + { + "epoch": 1.5123801026098596, + "grad_norm": 0.17844898998737335, + "learning_rate": 9.989409570530743e-06, + "loss": 0.4685, + "step": 6780 + }, + { + "epoch": 1.5126031675217488, + "grad_norm": 0.1733841598033905, + "learning_rate": 9.987056142954653e-06, + "loss": 0.4505, + "step": 6781 + }, + { + "epoch": 1.5128262324336381, + "grad_norm": 0.16828158497810364, + "learning_rate": 9.984702716095472e-06, + "loss": 0.4303, + "step": 6782 + }, + { + "epoch": 1.5130492973455274, + "grad_norm": 0.18432192504405975, + "learning_rate": 9.982349290083553e-06, + "loss": 0.4657, + "step": 6783 + }, + { + "epoch": 1.513272362257417, + "grad_norm": 0.17612218856811523, + "learning_rate": 9.979995865049239e-06, + "loss": 0.4531, + "step": 6784 + }, + { + "epoch": 1.5134954271693064, + "grad_norm": 0.1782546043395996, + "learning_rate": 9.977642441122881e-06, + "loss": 0.4442, + "step": 6785 + }, + { + "epoch": 1.5137184920811957, + "grad_norm": 0.17371070384979248, + "learning_rate": 9.975289018434826e-06, + "loss": 0.4636, + "step": 6786 + }, + { + "epoch": 1.513941556993085, + "grad_norm": 0.16479544341564178, + "learning_rate": 9.972935597115417e-06, + "loss": 0.4454, + "step": 6787 + }, + { + "epoch": 1.5141646219049743, + "grad_norm": 0.18058109283447266, + "learning_rate": 9.970582177295009e-06, + "loss": 0.4914, + "step": 6788 + }, + { + "epoch": 1.5143876868168638, + "grad_norm": 0.17538684606552124, + "learning_rate": 9.968228759103943e-06, + "loss": 0.4258, + "step": 6789 + }, + { + "epoch": 1.514610751728753, + "grad_norm": 0.17010897397994995, + "learning_rate": 9.965875342672572e-06, + "loss": 0.4544, + "step": 6790 + }, + { + "epoch": 1.5148338166406425, + "grad_norm": 0.1833350658416748, + "learning_rate": 9.963521928131234e-06, + "loss": 0.4608, + "step": 6791 + }, + { + "epoch": 1.5150568815525318, + "grad_norm": 0.1769610494375229, + "learning_rate": 9.961168515610283e-06, + "loss": 0.4679, + "step": 6792 + }, + { + "epoch": 1.515279946464421, + "grad_norm": 0.17435282468795776, + "learning_rate": 9.958815105240064e-06, + "loss": 0.4737, + "step": 6793 + }, + { + "epoch": 1.5155030113763104, + "grad_norm": 0.1680775284767151, + "learning_rate": 9.956461697150925e-06, + "loss": 0.4635, + "step": 6794 + }, + { + "epoch": 1.5157260762881999, + "grad_norm": 0.17000484466552734, + "learning_rate": 9.954108291473207e-06, + "loss": 0.4558, + "step": 6795 + }, + { + "epoch": 1.5159491412000894, + "grad_norm": 0.17273864150047302, + "learning_rate": 9.951754888337265e-06, + "loss": 0.4472, + "step": 6796 + }, + { + "epoch": 1.5161722061119787, + "grad_norm": 0.18519042432308197, + "learning_rate": 9.949401487873438e-06, + "loss": 0.455, + "step": 6797 + }, + { + "epoch": 1.516395271023868, + "grad_norm": 0.17533376812934875, + "learning_rate": 9.947048090212076e-06, + "loss": 0.4684, + "step": 6798 + }, + { + "epoch": 1.5166183359357572, + "grad_norm": 0.1752437800168991, + "learning_rate": 9.944694695483523e-06, + "loss": 0.4625, + "step": 6799 + }, + { + "epoch": 1.5168414008476465, + "grad_norm": 0.17211464047431946, + "learning_rate": 9.942341303818128e-06, + "loss": 0.443, + "step": 6800 + }, + { + "epoch": 1.517064465759536, + "grad_norm": 0.170721635222435, + "learning_rate": 9.939987915346232e-06, + "loss": 0.4211, + "step": 6801 + }, + { + "epoch": 1.5172875306714255, + "grad_norm": 0.17440038919448853, + "learning_rate": 9.937634530198184e-06, + "loss": 0.456, + "step": 6802 + }, + { + "epoch": 1.5175105955833148, + "grad_norm": 0.17179426550865173, + "learning_rate": 9.935281148504326e-06, + "loss": 0.4487, + "step": 6803 + }, + { + "epoch": 1.517733660495204, + "grad_norm": 0.17573867738246918, + "learning_rate": 9.932927770395007e-06, + "loss": 0.4663, + "step": 6804 + }, + { + "epoch": 1.5179567254070934, + "grad_norm": 0.18278174102306366, + "learning_rate": 9.93057439600057e-06, + "loss": 0.4471, + "step": 6805 + }, + { + "epoch": 1.5181797903189829, + "grad_norm": 0.17256884276866913, + "learning_rate": 9.92822102545136e-06, + "loss": 0.4675, + "step": 6806 + }, + { + "epoch": 1.5184028552308722, + "grad_norm": 0.19855371117591858, + "learning_rate": 9.925867658877718e-06, + "loss": 0.4661, + "step": 6807 + }, + { + "epoch": 1.5186259201427617, + "grad_norm": 0.1643659472465515, + "learning_rate": 9.923514296409992e-06, + "loss": 0.4457, + "step": 6808 + }, + { + "epoch": 1.518848985054651, + "grad_norm": 0.16748277842998505, + "learning_rate": 9.921160938178529e-06, + "loss": 0.4558, + "step": 6809 + }, + { + "epoch": 1.5190720499665402, + "grad_norm": 0.16805437207221985, + "learning_rate": 9.918807584313666e-06, + "loss": 0.4283, + "step": 6810 + }, + { + "epoch": 1.5192951148784295, + "grad_norm": 0.18906736373901367, + "learning_rate": 9.916454234945752e-06, + "loss": 0.4675, + "step": 6811 + }, + { + "epoch": 1.519518179790319, + "grad_norm": 0.19310304522514343, + "learning_rate": 9.914100890205124e-06, + "loss": 0.4689, + "step": 6812 + }, + { + "epoch": 1.5197412447022085, + "grad_norm": 0.17019645869731903, + "learning_rate": 9.911747550222134e-06, + "loss": 0.4309, + "step": 6813 + }, + { + "epoch": 1.5199643096140978, + "grad_norm": 0.16500838100910187, + "learning_rate": 9.909394215127118e-06, + "loss": 0.4364, + "step": 6814 + }, + { + "epoch": 1.520187374525987, + "grad_norm": 0.1711840033531189, + "learning_rate": 9.90704088505042e-06, + "loss": 0.4439, + "step": 6815 + }, + { + "epoch": 1.5204104394378763, + "grad_norm": 0.17483307421207428, + "learning_rate": 9.904687560122381e-06, + "loss": 0.493, + "step": 6816 + }, + { + "epoch": 1.5206335043497656, + "grad_norm": 0.17699526250362396, + "learning_rate": 9.90233424047335e-06, + "loss": 0.4446, + "step": 6817 + }, + { + "epoch": 1.5208565692616551, + "grad_norm": 0.18029628694057465, + "learning_rate": 9.89998092623366e-06, + "loss": 0.4676, + "step": 6818 + }, + { + "epoch": 1.5210796341735446, + "grad_norm": 0.17293018102645874, + "learning_rate": 9.897627617533653e-06, + "loss": 0.4501, + "step": 6819 + }, + { + "epoch": 1.521302699085434, + "grad_norm": 0.17556585371494293, + "learning_rate": 9.895274314503674e-06, + "loss": 0.4454, + "step": 6820 + }, + { + "epoch": 1.5215257639973232, + "grad_norm": 0.1644999384880066, + "learning_rate": 9.892921017274062e-06, + "loss": 0.4197, + "step": 6821 + }, + { + "epoch": 1.5217488289092125, + "grad_norm": 0.17591945827007294, + "learning_rate": 9.890567725975159e-06, + "loss": 0.4593, + "step": 6822 + }, + { + "epoch": 1.521971893821102, + "grad_norm": 0.1673215925693512, + "learning_rate": 9.888214440737302e-06, + "loss": 0.4442, + "step": 6823 + }, + { + "epoch": 1.5221949587329913, + "grad_norm": 0.16372933983802795, + "learning_rate": 9.885861161690832e-06, + "loss": 0.4252, + "step": 6824 + }, + { + "epoch": 1.5224180236448808, + "grad_norm": 0.17661777138710022, + "learning_rate": 9.88350788896609e-06, + "loss": 0.4593, + "step": 6825 + }, + { + "epoch": 1.52264108855677, + "grad_norm": 0.17217707633972168, + "learning_rate": 9.881154622693415e-06, + "loss": 0.4418, + "step": 6826 + }, + { + "epoch": 1.5228641534686593, + "grad_norm": 0.1641775518655777, + "learning_rate": 9.878801363003143e-06, + "loss": 0.4624, + "step": 6827 + }, + { + "epoch": 1.5230872183805486, + "grad_norm": 0.17330925166606903, + "learning_rate": 9.876448110025615e-06, + "loss": 0.4916, + "step": 6828 + }, + { + "epoch": 1.5233102832924381, + "grad_norm": 0.17335113883018494, + "learning_rate": 9.874094863891166e-06, + "loss": 0.4581, + "step": 6829 + }, + { + "epoch": 1.5235333482043276, + "grad_norm": 0.18039341270923615, + "learning_rate": 9.871741624730134e-06, + "loss": 0.4441, + "step": 6830 + }, + { + "epoch": 1.523756413116217, + "grad_norm": 0.18012328445911407, + "learning_rate": 9.869388392672861e-06, + "loss": 0.4618, + "step": 6831 + }, + { + "epoch": 1.5239794780281062, + "grad_norm": 0.17947714030742645, + "learning_rate": 9.867035167849678e-06, + "loss": 0.4905, + "step": 6832 + }, + { + "epoch": 1.5242025429399955, + "grad_norm": 0.18116816878318787, + "learning_rate": 9.864681950390927e-06, + "loss": 0.4649, + "step": 6833 + }, + { + "epoch": 1.5244256078518847, + "grad_norm": 0.1693393439054489, + "learning_rate": 9.862328740426938e-06, + "loss": 0.458, + "step": 6834 + }, + { + "epoch": 1.5246486727637742, + "grad_norm": 0.17401763796806335, + "learning_rate": 9.85997553808805e-06, + "loss": 0.4719, + "step": 6835 + }, + { + "epoch": 1.5248717376756638, + "grad_norm": 0.17534461617469788, + "learning_rate": 9.857622343504597e-06, + "loss": 0.4504, + "step": 6836 + }, + { + "epoch": 1.525094802587553, + "grad_norm": 0.17880570888519287, + "learning_rate": 9.855269156806916e-06, + "loss": 0.4832, + "step": 6837 + }, + { + "epoch": 1.5253178674994423, + "grad_norm": 0.16782738268375397, + "learning_rate": 9.852915978125337e-06, + "loss": 0.46, + "step": 6838 + }, + { + "epoch": 1.5255409324113316, + "grad_norm": 0.18301822245121002, + "learning_rate": 9.850562807590199e-06, + "loss": 0.4459, + "step": 6839 + }, + { + "epoch": 1.525763997323221, + "grad_norm": 0.1692187637090683, + "learning_rate": 9.84820964533183e-06, + "loss": 0.4465, + "step": 6840 + }, + { + "epoch": 1.5259870622351104, + "grad_norm": 0.17732323706150055, + "learning_rate": 9.845856491480569e-06, + "loss": 0.4374, + "step": 6841 + }, + { + "epoch": 1.5262101271469999, + "grad_norm": 0.17886339128017426, + "learning_rate": 9.843503346166741e-06, + "loss": 0.445, + "step": 6842 + }, + { + "epoch": 1.5264331920588892, + "grad_norm": 0.17064787447452545, + "learning_rate": 9.841150209520686e-06, + "loss": 0.4343, + "step": 6843 + }, + { + "epoch": 1.5266562569707784, + "grad_norm": 0.18215104937553406, + "learning_rate": 9.83879708167273e-06, + "loss": 0.4515, + "step": 6844 + }, + { + "epoch": 1.5268793218826677, + "grad_norm": 0.17810726165771484, + "learning_rate": 9.836443962753205e-06, + "loss": 0.4746, + "step": 6845 + }, + { + "epoch": 1.5271023867945572, + "grad_norm": 0.17213033139705658, + "learning_rate": 9.834090852892442e-06, + "loss": 0.4731, + "step": 6846 + }, + { + "epoch": 1.5273254517064467, + "grad_norm": 0.18000519275665283, + "learning_rate": 9.831737752220772e-06, + "loss": 0.4725, + "step": 6847 + }, + { + "epoch": 1.527548516618336, + "grad_norm": 0.17720220983028412, + "learning_rate": 9.829384660868523e-06, + "loss": 0.449, + "step": 6848 + }, + { + "epoch": 1.5277715815302253, + "grad_norm": 0.1757756471633911, + "learning_rate": 9.827031578966026e-06, + "loss": 0.4492, + "step": 6849 + }, + { + "epoch": 1.5279946464421146, + "grad_norm": 0.18418079614639282, + "learning_rate": 9.824678506643607e-06, + "loss": 0.4514, + "step": 6850 + }, + { + "epoch": 1.5282177113540039, + "grad_norm": 0.16821002960205078, + "learning_rate": 9.822325444031594e-06, + "loss": 0.4331, + "step": 6851 + }, + { + "epoch": 1.5284407762658934, + "grad_norm": 0.16792722046375275, + "learning_rate": 9.819972391260318e-06, + "loss": 0.4373, + "step": 6852 + }, + { + "epoch": 1.5286638411777829, + "grad_norm": 0.18326304852962494, + "learning_rate": 9.8176193484601e-06, + "loss": 0.4577, + "step": 6853 + }, + { + "epoch": 1.5288869060896721, + "grad_norm": 0.1714770793914795, + "learning_rate": 9.815266315761275e-06, + "loss": 0.437, + "step": 6854 + }, + { + "epoch": 1.5291099710015614, + "grad_norm": 0.16261571645736694, + "learning_rate": 9.81291329329416e-06, + "loss": 0.406, + "step": 6855 + }, + { + "epoch": 1.5293330359134507, + "grad_norm": 0.17615249752998352, + "learning_rate": 9.810560281189085e-06, + "loss": 0.4704, + "step": 6856 + }, + { + "epoch": 1.5295561008253402, + "grad_norm": 0.1748465746641159, + "learning_rate": 9.80820727957637e-06, + "loss": 0.4763, + "step": 6857 + }, + { + "epoch": 1.5297791657372295, + "grad_norm": 0.1718483865261078, + "learning_rate": 9.805854288586346e-06, + "loss": 0.4574, + "step": 6858 + }, + { + "epoch": 1.530002230649119, + "grad_norm": 0.17926479876041412, + "learning_rate": 9.803501308349332e-06, + "loss": 0.4556, + "step": 6859 + }, + { + "epoch": 1.5302252955610083, + "grad_norm": 0.17003634572029114, + "learning_rate": 9.801148338995652e-06, + "loss": 0.4553, + "step": 6860 + }, + { + "epoch": 1.5304483604728976, + "grad_norm": 0.17237353324890137, + "learning_rate": 9.798795380655625e-06, + "loss": 0.4521, + "step": 6861 + }, + { + "epoch": 1.5306714253847868, + "grad_norm": 0.18084143102169037, + "learning_rate": 9.796442433459578e-06, + "loss": 0.4592, + "step": 6862 + }, + { + "epoch": 1.5308944902966763, + "grad_norm": 0.18167221546173096, + "learning_rate": 9.794089497537827e-06, + "loss": 0.4735, + "step": 6863 + }, + { + "epoch": 1.5311175552085659, + "grad_norm": 0.18819572031497955, + "learning_rate": 9.791736573020697e-06, + "loss": 0.4906, + "step": 6864 + }, + { + "epoch": 1.5313406201204551, + "grad_norm": 0.17070408165454865, + "learning_rate": 9.789383660038506e-06, + "loss": 0.4242, + "step": 6865 + }, + { + "epoch": 1.5315636850323444, + "grad_norm": 0.17565836012363434, + "learning_rate": 9.787030758721573e-06, + "loss": 0.4431, + "step": 6866 + }, + { + "epoch": 1.5317867499442337, + "grad_norm": 0.174460306763649, + "learning_rate": 9.784677869200215e-06, + "loss": 0.4487, + "step": 6867 + }, + { + "epoch": 1.532009814856123, + "grad_norm": 0.17642372846603394, + "learning_rate": 9.782324991604751e-06, + "loss": 0.469, + "step": 6868 + }, + { + "epoch": 1.5322328797680125, + "grad_norm": 0.17292480170726776, + "learning_rate": 9.779972126065497e-06, + "loss": 0.4408, + "step": 6869 + }, + { + "epoch": 1.532455944679902, + "grad_norm": 0.17120680212974548, + "learning_rate": 9.777619272712774e-06, + "loss": 0.474, + "step": 6870 + }, + { + "epoch": 1.5326790095917913, + "grad_norm": 0.16610749065876007, + "learning_rate": 9.77526643167689e-06, + "loss": 0.4486, + "step": 6871 + }, + { + "epoch": 1.5329020745036805, + "grad_norm": 0.17615914344787598, + "learning_rate": 9.772913603088166e-06, + "loss": 0.4654, + "step": 6872 + }, + { + "epoch": 1.5331251394155698, + "grad_norm": 0.1731085181236267, + "learning_rate": 9.770560787076918e-06, + "loss": 0.4459, + "step": 6873 + }, + { + "epoch": 1.5333482043274593, + "grad_norm": 0.17174161970615387, + "learning_rate": 9.768207983773455e-06, + "loss": 0.4443, + "step": 6874 + }, + { + "epoch": 1.5335712692393486, + "grad_norm": 0.17707736790180206, + "learning_rate": 9.765855193308092e-06, + "loss": 0.4707, + "step": 6875 + }, + { + "epoch": 1.5337943341512381, + "grad_norm": 0.17120012640953064, + "learning_rate": 9.763502415811141e-06, + "loss": 0.4599, + "step": 6876 + }, + { + "epoch": 1.5340173990631274, + "grad_norm": 0.18111848831176758, + "learning_rate": 9.761149651412915e-06, + "loss": 0.4354, + "step": 6877 + }, + { + "epoch": 1.5342404639750167, + "grad_norm": 0.17889343202114105, + "learning_rate": 9.758796900243723e-06, + "loss": 0.4521, + "step": 6878 + }, + { + "epoch": 1.534463528886906, + "grad_norm": 0.1871626377105713, + "learning_rate": 9.756444162433876e-06, + "loss": 0.4711, + "step": 6879 + }, + { + "epoch": 1.5346865937987955, + "grad_norm": 0.17596866190433502, + "learning_rate": 9.754091438113683e-06, + "loss": 0.4566, + "step": 6880 + }, + { + "epoch": 1.534909658710685, + "grad_norm": 0.17852075397968292, + "learning_rate": 9.751738727413453e-06, + "loss": 0.4763, + "step": 6881 + }, + { + "epoch": 1.5351327236225742, + "grad_norm": 0.191118523478508, + "learning_rate": 9.749386030463493e-06, + "loss": 0.4396, + "step": 6882 + }, + { + "epoch": 1.5353557885344635, + "grad_norm": 0.19689539074897766, + "learning_rate": 9.747033347394113e-06, + "loss": 0.4421, + "step": 6883 + }, + { + "epoch": 1.5355788534463528, + "grad_norm": 0.16877755522727966, + "learning_rate": 9.744680678335614e-06, + "loss": 0.4185, + "step": 6884 + }, + { + "epoch": 1.535801918358242, + "grad_norm": 0.1971082091331482, + "learning_rate": 9.74232802341831e-06, + "loss": 0.4537, + "step": 6885 + }, + { + "epoch": 1.5360249832701316, + "grad_norm": 0.1687203049659729, + "learning_rate": 9.739975382772498e-06, + "loss": 0.4524, + "step": 6886 + }, + { + "epoch": 1.536248048182021, + "grad_norm": 0.169671893119812, + "learning_rate": 9.737622756528485e-06, + "loss": 0.4467, + "step": 6887 + }, + { + "epoch": 1.5364711130939104, + "grad_norm": 0.1722356528043747, + "learning_rate": 9.735270144816574e-06, + "loss": 0.4771, + "step": 6888 + }, + { + "epoch": 1.5366941780057997, + "grad_norm": 0.17518217861652374, + "learning_rate": 9.732917547767069e-06, + "loss": 0.4543, + "step": 6889 + }, + { + "epoch": 1.536917242917689, + "grad_norm": 0.17109224200248718, + "learning_rate": 9.730564965510269e-06, + "loss": 0.4401, + "step": 6890 + }, + { + "epoch": 1.5371403078295784, + "grad_norm": 0.1737949699163437, + "learning_rate": 9.728212398176476e-06, + "loss": 0.4415, + "step": 6891 + }, + { + "epoch": 1.5373633727414677, + "grad_norm": 0.16947439312934875, + "learning_rate": 9.72585984589599e-06, + "loss": 0.4723, + "step": 6892 + }, + { + "epoch": 1.5375864376533572, + "grad_norm": 0.1718679517507553, + "learning_rate": 9.723507308799109e-06, + "loss": 0.4842, + "step": 6893 + }, + { + "epoch": 1.5378095025652465, + "grad_norm": 0.1752775013446808, + "learning_rate": 9.721154787016135e-06, + "loss": 0.4461, + "step": 6894 + }, + { + "epoch": 1.5380325674771358, + "grad_norm": 0.17731130123138428, + "learning_rate": 9.71880228067736e-06, + "loss": 0.4566, + "step": 6895 + }, + { + "epoch": 1.538255632389025, + "grad_norm": 0.22506393492221832, + "learning_rate": 9.716449789913087e-06, + "loss": 0.4486, + "step": 6896 + }, + { + "epoch": 1.5384786973009146, + "grad_norm": 0.17493079602718353, + "learning_rate": 9.714097314853606e-06, + "loss": 0.439, + "step": 6897 + }, + { + "epoch": 1.538701762212804, + "grad_norm": 0.16993321478366852, + "learning_rate": 9.711744855629218e-06, + "loss": 0.4549, + "step": 6898 + }, + { + "epoch": 1.5389248271246934, + "grad_norm": 0.17897512018680573, + "learning_rate": 9.709392412370212e-06, + "loss": 0.4452, + "step": 6899 + }, + { + "epoch": 1.5391478920365826, + "grad_norm": 0.16903938353061676, + "learning_rate": 9.707039985206878e-06, + "loss": 0.4665, + "step": 6900 + }, + { + "epoch": 1.539370956948472, + "grad_norm": 0.17775236070156097, + "learning_rate": 9.704687574269515e-06, + "loss": 0.4702, + "step": 6901 + }, + { + "epoch": 1.5395940218603612, + "grad_norm": 0.17940306663513184, + "learning_rate": 9.70233517968841e-06, + "loss": 0.4671, + "step": 6902 + }, + { + "epoch": 1.5398170867722507, + "grad_norm": 0.163747638463974, + "learning_rate": 9.699982801593857e-06, + "loss": 0.4533, + "step": 6903 + }, + { + "epoch": 1.5400401516841402, + "grad_norm": 0.1675698310136795, + "learning_rate": 9.697630440116141e-06, + "loss": 0.4422, + "step": 6904 + }, + { + "epoch": 1.5402632165960295, + "grad_norm": 0.1684923619031906, + "learning_rate": 9.695278095385558e-06, + "loss": 0.4439, + "step": 6905 + }, + { + "epoch": 1.5404862815079188, + "grad_norm": 0.16581715643405914, + "learning_rate": 9.692925767532386e-06, + "loss": 0.4288, + "step": 6906 + }, + { + "epoch": 1.540709346419808, + "grad_norm": 0.1779187023639679, + "learning_rate": 9.690573456686919e-06, + "loss": 0.4589, + "step": 6907 + }, + { + "epoch": 1.5409324113316976, + "grad_norm": 0.17237865924835205, + "learning_rate": 9.688221162979438e-06, + "loss": 0.4595, + "step": 6908 + }, + { + "epoch": 1.5411554762435868, + "grad_norm": 0.17078691720962524, + "learning_rate": 9.685868886540233e-06, + "loss": 0.4417, + "step": 6909 + }, + { + "epoch": 1.5413785411554763, + "grad_norm": 0.16685281693935394, + "learning_rate": 9.683516627499583e-06, + "loss": 0.4311, + "step": 6910 + }, + { + "epoch": 1.5416016060673656, + "grad_norm": 0.17877590656280518, + "learning_rate": 9.681164385987776e-06, + "loss": 0.4475, + "step": 6911 + }, + { + "epoch": 1.541824670979255, + "grad_norm": 0.17893154919147491, + "learning_rate": 9.678812162135087e-06, + "loss": 0.4585, + "step": 6912 + }, + { + "epoch": 1.5420477358911442, + "grad_norm": 0.17963162064552307, + "learning_rate": 9.676459956071804e-06, + "loss": 0.4643, + "step": 6913 + }, + { + "epoch": 1.5422708008030337, + "grad_norm": 0.16504304111003876, + "learning_rate": 9.674107767928202e-06, + "loss": 0.4181, + "step": 6914 + }, + { + "epoch": 1.5424938657149232, + "grad_norm": 0.17094488441944122, + "learning_rate": 9.671755597834563e-06, + "loss": 0.4739, + "step": 6915 + }, + { + "epoch": 1.5427169306268125, + "grad_norm": 0.1811668872833252, + "learning_rate": 9.669403445921162e-06, + "loss": 0.4549, + "step": 6916 + }, + { + "epoch": 1.5429399955387018, + "grad_norm": 0.17800799012184143, + "learning_rate": 9.667051312318277e-06, + "loss": 0.4655, + "step": 6917 + }, + { + "epoch": 1.543163060450591, + "grad_norm": 0.1688285917043686, + "learning_rate": 9.664699197156188e-06, + "loss": 0.4442, + "step": 6918 + }, + { + "epoch": 1.5433861253624803, + "grad_norm": 0.17787446081638336, + "learning_rate": 9.662347100565163e-06, + "loss": 0.4689, + "step": 6919 + }, + { + "epoch": 1.5436091902743698, + "grad_norm": 0.1820237636566162, + "learning_rate": 9.659995022675482e-06, + "loss": 0.453, + "step": 6920 + }, + { + "epoch": 1.5438322551862593, + "grad_norm": 0.18955709040164948, + "learning_rate": 9.657642963617414e-06, + "loss": 0.4653, + "step": 6921 + }, + { + "epoch": 1.5440553200981486, + "grad_norm": 0.17111030220985413, + "learning_rate": 9.655290923521235e-06, + "loss": 0.4436, + "step": 6922 + }, + { + "epoch": 1.544278385010038, + "grad_norm": 0.17172935605049133, + "learning_rate": 9.65293890251721e-06, + "loss": 0.4361, + "step": 6923 + }, + { + "epoch": 1.5445014499219272, + "grad_norm": 0.18299220502376556, + "learning_rate": 9.650586900735612e-06, + "loss": 0.4775, + "step": 6924 + }, + { + "epoch": 1.5447245148338167, + "grad_norm": 0.18452848494052887, + "learning_rate": 9.648234918306709e-06, + "loss": 0.4646, + "step": 6925 + }, + { + "epoch": 1.544947579745706, + "grad_norm": 0.17104355990886688, + "learning_rate": 9.645882955360771e-06, + "loss": 0.454, + "step": 6926 + }, + { + "epoch": 1.5451706446575955, + "grad_norm": 0.18376219272613525, + "learning_rate": 9.643531012028061e-06, + "loss": 0.4683, + "step": 6927 + }, + { + "epoch": 1.5453937095694847, + "grad_norm": 0.17010129988193512, + "learning_rate": 9.641179088438845e-06, + "loss": 0.4417, + "step": 6928 + }, + { + "epoch": 1.545616774481374, + "grad_norm": 0.167456716299057, + "learning_rate": 9.63882718472339e-06, + "loss": 0.4114, + "step": 6929 + }, + { + "epoch": 1.5458398393932633, + "grad_norm": 0.17169076204299927, + "learning_rate": 9.636475301011957e-06, + "loss": 0.4526, + "step": 6930 + }, + { + "epoch": 1.5460629043051528, + "grad_norm": 0.16744768619537354, + "learning_rate": 9.634123437434807e-06, + "loss": 0.4615, + "step": 6931 + }, + { + "epoch": 1.5462859692170423, + "grad_norm": 0.1830861121416092, + "learning_rate": 9.631771594122205e-06, + "loss": 0.4554, + "step": 6932 + }, + { + "epoch": 1.5465090341289316, + "grad_norm": 0.17061203718185425, + "learning_rate": 9.629419771204405e-06, + "loss": 0.4317, + "step": 6933 + }, + { + "epoch": 1.5467320990408209, + "grad_norm": 0.1750810146331787, + "learning_rate": 9.627067968811671e-06, + "loss": 0.4648, + "step": 6934 + }, + { + "epoch": 1.5469551639527102, + "grad_norm": 0.16664697229862213, + "learning_rate": 9.624716187074258e-06, + "loss": 0.4555, + "step": 6935 + }, + { + "epoch": 1.5471782288645994, + "grad_norm": 0.17463943362236023, + "learning_rate": 9.622364426122424e-06, + "loss": 0.4334, + "step": 6936 + }, + { + "epoch": 1.547401293776489, + "grad_norm": 0.182994544506073, + "learning_rate": 9.620012686086423e-06, + "loss": 0.4657, + "step": 6937 + }, + { + "epoch": 1.5476243586883784, + "grad_norm": 0.18054361641407013, + "learning_rate": 9.61766096709651e-06, + "loss": 0.4595, + "step": 6938 + }, + { + "epoch": 1.5478474236002677, + "grad_norm": 0.17179542779922485, + "learning_rate": 9.615309269282938e-06, + "loss": 0.473, + "step": 6939 + }, + { + "epoch": 1.548070488512157, + "grad_norm": 0.17718704044818878, + "learning_rate": 9.612957592775957e-06, + "loss": 0.4708, + "step": 6940 + }, + { + "epoch": 1.5482935534240463, + "grad_norm": 0.17120683193206787, + "learning_rate": 9.610605937705823e-06, + "loss": 0.4432, + "step": 6941 + }, + { + "epoch": 1.5485166183359358, + "grad_norm": 0.17965561151504517, + "learning_rate": 9.608254304202776e-06, + "loss": 0.439, + "step": 6942 + }, + { + "epoch": 1.548739683247825, + "grad_norm": 0.17135334014892578, + "learning_rate": 9.605902692397074e-06, + "loss": 0.4597, + "step": 6943 + }, + { + "epoch": 1.5489627481597146, + "grad_norm": 0.1764613837003708, + "learning_rate": 9.603551102418958e-06, + "loss": 0.4478, + "step": 6944 + }, + { + "epoch": 1.5491858130716039, + "grad_norm": 0.16766582429409027, + "learning_rate": 9.601199534398678e-06, + "loss": 0.425, + "step": 6945 + }, + { + "epoch": 1.5494088779834931, + "grad_norm": 0.1771164983510971, + "learning_rate": 9.598847988466472e-06, + "loss": 0.457, + "step": 6946 + }, + { + "epoch": 1.5496319428953824, + "grad_norm": 0.1649523228406906, + "learning_rate": 9.596496464752593e-06, + "loss": 0.4563, + "step": 6947 + }, + { + "epoch": 1.549855007807272, + "grad_norm": 0.17230331897735596, + "learning_rate": 9.594144963387274e-06, + "loss": 0.4587, + "step": 6948 + }, + { + "epoch": 1.5500780727191614, + "grad_norm": 0.17549771070480347, + "learning_rate": 9.591793484500762e-06, + "loss": 0.4719, + "step": 6949 + }, + { + "epoch": 1.5503011376310507, + "grad_norm": 0.17885257303714752, + "learning_rate": 9.589442028223292e-06, + "loss": 0.4479, + "step": 6950 + }, + { + "epoch": 1.55052420254294, + "grad_norm": 0.1839398443698883, + "learning_rate": 9.587090594685108e-06, + "loss": 0.4631, + "step": 6951 + }, + { + "epoch": 1.5507472674548293, + "grad_norm": 0.17899379134178162, + "learning_rate": 9.584739184016442e-06, + "loss": 0.4705, + "step": 6952 + }, + { + "epoch": 1.5509703323667186, + "grad_norm": 0.17247112095355988, + "learning_rate": 9.582387796347534e-06, + "loss": 0.4653, + "step": 6953 + }, + { + "epoch": 1.551193397278608, + "grad_norm": 0.17476417124271393, + "learning_rate": 9.580036431808614e-06, + "loss": 0.447, + "step": 6954 + }, + { + "epoch": 1.5514164621904976, + "grad_norm": 0.17558352649211884, + "learning_rate": 9.57768509052992e-06, + "loss": 0.4607, + "step": 6955 + }, + { + "epoch": 1.5516395271023868, + "grad_norm": 0.1909688413143158, + "learning_rate": 9.575333772641682e-06, + "loss": 0.4619, + "step": 6956 + }, + { + "epoch": 1.5518625920142761, + "grad_norm": 0.1743474304676056, + "learning_rate": 9.572982478274128e-06, + "loss": 0.443, + "step": 6957 + }, + { + "epoch": 1.5520856569261654, + "grad_norm": 0.18644817173480988, + "learning_rate": 9.570631207557495e-06, + "loss": 0.4533, + "step": 6958 + }, + { + "epoch": 1.552308721838055, + "grad_norm": 0.17602680623531342, + "learning_rate": 9.568279960622004e-06, + "loss": 0.4394, + "step": 6959 + }, + { + "epoch": 1.5525317867499442, + "grad_norm": 0.17043206095695496, + "learning_rate": 9.565928737597886e-06, + "loss": 0.4551, + "step": 6960 + }, + { + "epoch": 1.5527548516618337, + "grad_norm": 0.18989066779613495, + "learning_rate": 9.563577538615363e-06, + "loss": 0.471, + "step": 6961 + }, + { + "epoch": 1.552977916573723, + "grad_norm": 0.19321848452091217, + "learning_rate": 9.561226363804663e-06, + "loss": 0.4739, + "step": 6962 + }, + { + "epoch": 1.5532009814856123, + "grad_norm": 0.17869152128696442, + "learning_rate": 9.558875213296006e-06, + "loss": 0.4409, + "step": 6963 + }, + { + "epoch": 1.5534240463975015, + "grad_norm": 0.17526346445083618, + "learning_rate": 9.556524087219616e-06, + "loss": 0.4514, + "step": 6964 + }, + { + "epoch": 1.553647111309391, + "grad_norm": 0.17142999172210693, + "learning_rate": 9.55417298570571e-06, + "loss": 0.4427, + "step": 6965 + }, + { + "epoch": 1.5538701762212805, + "grad_norm": 0.17005646228790283, + "learning_rate": 9.551821908884507e-06, + "loss": 0.4434, + "step": 6966 + }, + { + "epoch": 1.5540932411331698, + "grad_norm": 0.1687459945678711, + "learning_rate": 9.549470856886228e-06, + "loss": 0.4651, + "step": 6967 + }, + { + "epoch": 1.554316306045059, + "grad_norm": 0.1726951152086258, + "learning_rate": 9.547119829841088e-06, + "loss": 0.4582, + "step": 6968 + }, + { + "epoch": 1.5545393709569484, + "grad_norm": 0.17231762409210205, + "learning_rate": 9.544768827879297e-06, + "loss": 0.4472, + "step": 6969 + }, + { + "epoch": 1.5547624358688377, + "grad_norm": 0.179609015583992, + "learning_rate": 9.542417851131076e-06, + "loss": 0.4563, + "step": 6970 + }, + { + "epoch": 1.5549855007807272, + "grad_norm": 0.17300112545490265, + "learning_rate": 9.540066899726627e-06, + "loss": 0.4245, + "step": 6971 + }, + { + "epoch": 1.5552085656926167, + "grad_norm": 0.19064456224441528, + "learning_rate": 9.537715973796172e-06, + "loss": 0.4519, + "step": 6972 + }, + { + "epoch": 1.555431630604506, + "grad_norm": 0.1847352236509323, + "learning_rate": 9.535365073469911e-06, + "loss": 0.4784, + "step": 6973 + }, + { + "epoch": 1.5556546955163952, + "grad_norm": 0.18160337209701538, + "learning_rate": 9.533014198878057e-06, + "loss": 0.4533, + "step": 6974 + }, + { + "epoch": 1.5558777604282845, + "grad_norm": 0.1749866008758545, + "learning_rate": 9.530663350150812e-06, + "loss": 0.4607, + "step": 6975 + }, + { + "epoch": 1.556100825340174, + "grad_norm": 0.1792685091495514, + "learning_rate": 9.528312527418384e-06, + "loss": 0.4598, + "step": 6976 + }, + { + "epoch": 1.5563238902520633, + "grad_norm": 0.16535772383213043, + "learning_rate": 9.525961730810974e-06, + "loss": 0.4733, + "step": 6977 + }, + { + "epoch": 1.5565469551639528, + "grad_norm": 0.17856188118457794, + "learning_rate": 9.523610960458786e-06, + "loss": 0.4724, + "step": 6978 + }, + { + "epoch": 1.556770020075842, + "grad_norm": 0.1673804074525833, + "learning_rate": 9.521260216492025e-06, + "loss": 0.4281, + "step": 6979 + }, + { + "epoch": 1.5569930849877314, + "grad_norm": 0.17213106155395508, + "learning_rate": 9.518909499040878e-06, + "loss": 0.4652, + "step": 6980 + }, + { + "epoch": 1.5572161498996206, + "grad_norm": 0.1793312132358551, + "learning_rate": 9.51655880823555e-06, + "loss": 0.4645, + "step": 6981 + }, + { + "epoch": 1.5574392148115102, + "grad_norm": 0.17523737251758575, + "learning_rate": 9.514208144206237e-06, + "loss": 0.4459, + "step": 6982 + }, + { + "epoch": 1.5576622797233997, + "grad_norm": 0.17327353358268738, + "learning_rate": 9.511857507083132e-06, + "loss": 0.4531, + "step": 6983 + }, + { + "epoch": 1.557885344635289, + "grad_norm": 0.3019886016845703, + "learning_rate": 9.509506896996432e-06, + "loss": 0.4868, + "step": 6984 + }, + { + "epoch": 1.5581084095471782, + "grad_norm": 0.16475124657154083, + "learning_rate": 9.50715631407632e-06, + "loss": 0.4522, + "step": 6985 + }, + { + "epoch": 1.5583314744590675, + "grad_norm": 0.17215771973133087, + "learning_rate": 9.504805758452996e-06, + "loss": 0.4741, + "step": 6986 + }, + { + "epoch": 1.558554539370957, + "grad_norm": 0.17033734917640686, + "learning_rate": 9.502455230256641e-06, + "loss": 0.4406, + "step": 6987 + }, + { + "epoch": 1.5587776042828463, + "grad_norm": 0.17442695796489716, + "learning_rate": 9.500104729617447e-06, + "loss": 0.4473, + "step": 6988 + }, + { + "epoch": 1.5590006691947358, + "grad_norm": 0.17998719215393066, + "learning_rate": 9.497754256665596e-06, + "loss": 0.4747, + "step": 6989 + }, + { + "epoch": 1.559223734106625, + "grad_norm": 0.18134324252605438, + "learning_rate": 9.495403811531274e-06, + "loss": 0.457, + "step": 6990 + }, + { + "epoch": 1.5594467990185144, + "grad_norm": 0.16917896270751953, + "learning_rate": 9.493053394344663e-06, + "loss": 0.4701, + "step": 6991 + }, + { + "epoch": 1.5596698639304036, + "grad_norm": 0.18709082901477814, + "learning_rate": 9.490703005235945e-06, + "loss": 0.4577, + "step": 6992 + }, + { + "epoch": 1.5598929288422931, + "grad_norm": 0.17913807928562164, + "learning_rate": 9.488352644335296e-06, + "loss": 0.4391, + "step": 6993 + }, + { + "epoch": 1.5601159937541824, + "grad_norm": 0.24650318920612335, + "learning_rate": 9.486002311772898e-06, + "loss": 0.4355, + "step": 6994 + }, + { + "epoch": 1.560339058666072, + "grad_norm": 0.17876996099948883, + "learning_rate": 9.483652007678924e-06, + "loss": 0.4741, + "step": 6995 + }, + { + "epoch": 1.5605621235779612, + "grad_norm": 0.1874217391014099, + "learning_rate": 9.48130173218355e-06, + "loss": 0.4516, + "step": 6996 + }, + { + "epoch": 1.5607851884898505, + "grad_norm": 0.1787550449371338, + "learning_rate": 9.478951485416948e-06, + "loss": 0.4784, + "step": 6997 + }, + { + "epoch": 1.5610082534017398, + "grad_norm": 0.17699971795082092, + "learning_rate": 9.476601267509293e-06, + "loss": 0.4708, + "step": 6998 + }, + { + "epoch": 1.5612313183136293, + "grad_norm": 0.17713846266269684, + "learning_rate": 9.474251078590749e-06, + "loss": 0.4404, + "step": 6999 + }, + { + "epoch": 1.5614543832255188, + "grad_norm": 0.1823769062757492, + "learning_rate": 9.47190091879149e-06, + "loss": 0.428, + "step": 7000 + }, + { + "epoch": 1.561677448137408, + "grad_norm": 0.17455317080020905, + "learning_rate": 9.469550788241678e-06, + "loss": 0.4416, + "step": 7001 + }, + { + "epoch": 1.5619005130492973, + "grad_norm": 0.164621502161026, + "learning_rate": 9.467200687071478e-06, + "loss": 0.4429, + "step": 7002 + }, + { + "epoch": 1.5621235779611866, + "grad_norm": 0.1848084181547165, + "learning_rate": 9.464850615411059e-06, + "loss": 0.5004, + "step": 7003 + }, + { + "epoch": 1.5623466428730761, + "grad_norm": 0.20320458710193634, + "learning_rate": 9.462500573390578e-06, + "loss": 0.4704, + "step": 7004 + }, + { + "epoch": 1.5625697077849654, + "grad_norm": 0.16972646117210388, + "learning_rate": 9.460150561140196e-06, + "loss": 0.451, + "step": 7005 + }, + { + "epoch": 1.562792772696855, + "grad_norm": 0.16521897912025452, + "learning_rate": 9.457800578790072e-06, + "loss": 0.4457, + "step": 7006 + }, + { + "epoch": 1.5630158376087442, + "grad_norm": 0.18121238052845, + "learning_rate": 9.455450626470363e-06, + "loss": 0.4887, + "step": 7007 + }, + { + "epoch": 1.5632389025206335, + "grad_norm": 0.17698867619037628, + "learning_rate": 9.453100704311222e-06, + "loss": 0.4559, + "step": 7008 + }, + { + "epoch": 1.5634619674325227, + "grad_norm": 0.18639512360095978, + "learning_rate": 9.450750812442808e-06, + "loss": 0.4504, + "step": 7009 + }, + { + "epoch": 1.5636850323444123, + "grad_norm": 0.17181695997714996, + "learning_rate": 9.448400950995265e-06, + "loss": 0.4383, + "step": 7010 + }, + { + "epoch": 1.5639080972563015, + "grad_norm": 0.1778135597705841, + "learning_rate": 9.446051120098749e-06, + "loss": 0.4787, + "step": 7011 + }, + { + "epoch": 1.564131162168191, + "grad_norm": 0.17463596165180206, + "learning_rate": 9.443701319883404e-06, + "loss": 0.4513, + "step": 7012 + }, + { + "epoch": 1.5643542270800803, + "grad_norm": 0.1755073219537735, + "learning_rate": 9.441351550479382e-06, + "loss": 0.472, + "step": 7013 + }, + { + "epoch": 1.5645772919919696, + "grad_norm": 0.17314688861370087, + "learning_rate": 9.439001812016824e-06, + "loss": 0.4824, + "step": 7014 + }, + { + "epoch": 1.5648003569038589, + "grad_norm": 0.17931319773197174, + "learning_rate": 9.436652104625875e-06, + "loss": 0.487, + "step": 7015 + }, + { + "epoch": 1.5650234218157484, + "grad_norm": 0.1748175173997879, + "learning_rate": 9.434302428436674e-06, + "loss": 0.4322, + "step": 7016 + }, + { + "epoch": 1.5652464867276379, + "grad_norm": 0.1719309538602829, + "learning_rate": 9.431952783579365e-06, + "loss": 0.4584, + "step": 7017 + }, + { + "epoch": 1.5654695516395272, + "grad_norm": 0.1643352508544922, + "learning_rate": 9.429603170184085e-06, + "loss": 0.4204, + "step": 7018 + }, + { + "epoch": 1.5656926165514164, + "grad_norm": 0.17811031639575958, + "learning_rate": 9.427253588380968e-06, + "loss": 0.4827, + "step": 7019 + }, + { + "epoch": 1.5659156814633057, + "grad_norm": 0.18726174533367157, + "learning_rate": 9.42490403830015e-06, + "loss": 0.4466, + "step": 7020 + }, + { + "epoch": 1.5661387463751952, + "grad_norm": 0.1743079274892807, + "learning_rate": 9.422554520071763e-06, + "loss": 0.4595, + "step": 7021 + }, + { + "epoch": 1.5663618112870845, + "grad_norm": 0.16648052632808685, + "learning_rate": 9.42020503382594e-06, + "loss": 0.4399, + "step": 7022 + }, + { + "epoch": 1.566584876198974, + "grad_norm": 0.1800967901945114, + "learning_rate": 9.417855579692807e-06, + "loss": 0.4602, + "step": 7023 + }, + { + "epoch": 1.5668079411108633, + "grad_norm": 0.17495980858802795, + "learning_rate": 9.415506157802497e-06, + "loss": 0.45, + "step": 7024 + }, + { + "epoch": 1.5670310060227526, + "grad_norm": 0.1846977323293686, + "learning_rate": 9.41315676828513e-06, + "loss": 0.4677, + "step": 7025 + }, + { + "epoch": 1.5672540709346419, + "grad_norm": 0.1642792671918869, + "learning_rate": 9.410807411270834e-06, + "loss": 0.452, + "step": 7026 + }, + { + "epoch": 1.5674771358465314, + "grad_norm": 0.18164919316768646, + "learning_rate": 9.40845808688973e-06, + "loss": 0.457, + "step": 7027 + }, + { + "epoch": 1.5677002007584206, + "grad_norm": 0.18115800619125366, + "learning_rate": 9.40610879527194e-06, + "loss": 0.4561, + "step": 7028 + }, + { + "epoch": 1.5679232656703102, + "grad_norm": 0.17622902989387512, + "learning_rate": 9.403759536547577e-06, + "loss": 0.4436, + "step": 7029 + }, + { + "epoch": 1.5681463305821994, + "grad_norm": 0.17321671545505524, + "learning_rate": 9.401410310846762e-06, + "loss": 0.4635, + "step": 7030 + }, + { + "epoch": 1.5683693954940887, + "grad_norm": 0.16843074560165405, + "learning_rate": 9.39906111829961e-06, + "loss": 0.4266, + "step": 7031 + }, + { + "epoch": 1.568592460405978, + "grad_norm": 0.17982402443885803, + "learning_rate": 9.396711959036234e-06, + "loss": 0.4572, + "step": 7032 + }, + { + "epoch": 1.5688155253178675, + "grad_norm": 0.16617652773857117, + "learning_rate": 9.394362833186742e-06, + "loss": 0.4548, + "step": 7033 + }, + { + "epoch": 1.569038590229757, + "grad_norm": 0.18477341532707214, + "learning_rate": 9.39201374088125e-06, + "loss": 0.4708, + "step": 7034 + }, + { + "epoch": 1.5692616551416463, + "grad_norm": 0.1808331459760666, + "learning_rate": 9.389664682249859e-06, + "loss": 0.4721, + "step": 7035 + }, + { + "epoch": 1.5694847200535356, + "grad_norm": 0.16547119617462158, + "learning_rate": 9.387315657422677e-06, + "loss": 0.4477, + "step": 7036 + }, + { + "epoch": 1.5697077849654248, + "grad_norm": 0.19338512420654297, + "learning_rate": 9.384966666529807e-06, + "loss": 0.4559, + "step": 7037 + }, + { + "epoch": 1.5699308498773143, + "grad_norm": 0.1708751618862152, + "learning_rate": 9.382617709701355e-06, + "loss": 0.4593, + "step": 7038 + }, + { + "epoch": 1.5701539147892036, + "grad_norm": 0.17655450105667114, + "learning_rate": 9.380268787067416e-06, + "loss": 0.452, + "step": 7039 + }, + { + "epoch": 1.5703769797010931, + "grad_norm": 0.1719549298286438, + "learning_rate": 9.377919898758089e-06, + "loss": 0.452, + "step": 7040 + }, + { + "epoch": 1.5706000446129824, + "grad_norm": 0.17467504739761353, + "learning_rate": 9.375571044903469e-06, + "loss": 0.4699, + "step": 7041 + }, + { + "epoch": 1.5708231095248717, + "grad_norm": 0.17232942581176758, + "learning_rate": 9.373222225633655e-06, + "loss": 0.4696, + "step": 7042 + }, + { + "epoch": 1.571046174436761, + "grad_norm": 0.1764511615037918, + "learning_rate": 9.370873441078738e-06, + "loss": 0.4655, + "step": 7043 + }, + { + "epoch": 1.5712692393486505, + "grad_norm": 0.16992317140102386, + "learning_rate": 9.368524691368804e-06, + "loss": 0.4342, + "step": 7044 + }, + { + "epoch": 1.5714923042605398, + "grad_norm": 0.1786929816007614, + "learning_rate": 9.366175976633949e-06, + "loss": 0.4513, + "step": 7045 + }, + { + "epoch": 1.5717153691724293, + "grad_norm": 0.1684352606534958, + "learning_rate": 9.363827297004252e-06, + "loss": 0.4698, + "step": 7046 + }, + { + "epoch": 1.5719384340843185, + "grad_norm": 0.17656061053276062, + "learning_rate": 9.3614786526098e-06, + "loss": 0.4762, + "step": 7047 + }, + { + "epoch": 1.5721614989962078, + "grad_norm": 0.1755649596452713, + "learning_rate": 9.359130043580679e-06, + "loss": 0.471, + "step": 7048 + }, + { + "epoch": 1.572384563908097, + "grad_norm": 0.1698499619960785, + "learning_rate": 9.356781470046968e-06, + "loss": 0.4455, + "step": 7049 + }, + { + "epoch": 1.5726076288199866, + "grad_norm": 0.1733773797750473, + "learning_rate": 9.354432932138743e-06, + "loss": 0.4295, + "step": 7050 + }, + { + "epoch": 1.5728306937318761, + "grad_norm": 0.1755225956439972, + "learning_rate": 9.352084429986085e-06, + "loss": 0.4365, + "step": 7051 + }, + { + "epoch": 1.5730537586437654, + "grad_norm": 0.16997897624969482, + "learning_rate": 9.349735963719065e-06, + "loss": 0.439, + "step": 7052 + }, + { + "epoch": 1.5732768235556547, + "grad_norm": 0.17118623852729797, + "learning_rate": 9.34738753346776e-06, + "loss": 0.4616, + "step": 7053 + }, + { + "epoch": 1.573499888467544, + "grad_norm": 0.1747768670320511, + "learning_rate": 9.345039139362235e-06, + "loss": 0.4594, + "step": 7054 + }, + { + "epoch": 1.5737229533794335, + "grad_norm": 0.17374879121780396, + "learning_rate": 9.342690781532566e-06, + "loss": 0.461, + "step": 7055 + }, + { + "epoch": 1.5739460182913227, + "grad_norm": 0.18289963901042938, + "learning_rate": 9.340342460108813e-06, + "loss": 0.4668, + "step": 7056 + }, + { + "epoch": 1.5741690832032122, + "grad_norm": 0.17302124202251434, + "learning_rate": 9.337994175221047e-06, + "loss": 0.4441, + "step": 7057 + }, + { + "epoch": 1.5743921481151015, + "grad_norm": 0.17973129451274872, + "learning_rate": 9.335645926999324e-06, + "loss": 0.4488, + "step": 7058 + }, + { + "epoch": 1.5746152130269908, + "grad_norm": 0.17324143648147583, + "learning_rate": 9.333297715573713e-06, + "loss": 0.4394, + "step": 7059 + }, + { + "epoch": 1.57483827793888, + "grad_norm": 0.17785955965518951, + "learning_rate": 9.330949541074265e-06, + "loss": 0.4533, + "step": 7060 + }, + { + "epoch": 1.5750613428507696, + "grad_norm": 0.17939788103103638, + "learning_rate": 9.328601403631044e-06, + "loss": 0.4592, + "step": 7061 + }, + { + "epoch": 1.575284407762659, + "grad_norm": 0.18555541336536407, + "learning_rate": 9.326253303374099e-06, + "loss": 0.4838, + "step": 7062 + }, + { + "epoch": 1.5755074726745484, + "grad_norm": 0.17354725301265717, + "learning_rate": 9.323905240433481e-06, + "loss": 0.4543, + "step": 7063 + }, + { + "epoch": 1.5757305375864377, + "grad_norm": 0.18157054483890533, + "learning_rate": 9.321557214939247e-06, + "loss": 0.444, + "step": 7064 + }, + { + "epoch": 1.575953602498327, + "grad_norm": 0.1770789921283722, + "learning_rate": 9.319209227021439e-06, + "loss": 0.4877, + "step": 7065 + }, + { + "epoch": 1.5761766674102162, + "grad_norm": 0.18698160350322723, + "learning_rate": 9.316861276810105e-06, + "loss": 0.453, + "step": 7066 + }, + { + "epoch": 1.5763997323221057, + "grad_norm": 0.1865304410457611, + "learning_rate": 9.314513364435295e-06, + "loss": 0.4649, + "step": 7067 + }, + { + "epoch": 1.5766227972339952, + "grad_norm": 0.18025344610214233, + "learning_rate": 9.312165490027044e-06, + "loss": 0.4874, + "step": 7068 + }, + { + "epoch": 1.5768458621458845, + "grad_norm": 0.17585685849189758, + "learning_rate": 9.309817653715395e-06, + "loss": 0.4716, + "step": 7069 + }, + { + "epoch": 1.5770689270577738, + "grad_norm": 0.17630046606063843, + "learning_rate": 9.307469855630386e-06, + "loss": 0.4836, + "step": 7070 + }, + { + "epoch": 1.577291991969663, + "grad_norm": 0.17560921609401703, + "learning_rate": 9.305122095902055e-06, + "loss": 0.4392, + "step": 7071 + }, + { + "epoch": 1.5775150568815526, + "grad_norm": 0.17391790449619293, + "learning_rate": 9.30277437466043e-06, + "loss": 0.4587, + "step": 7072 + }, + { + "epoch": 1.5777381217934419, + "grad_norm": 0.17293858528137207, + "learning_rate": 9.30042669203555e-06, + "loss": 0.4473, + "step": 7073 + }, + { + "epoch": 1.5779611867053314, + "grad_norm": 0.17845462262630463, + "learning_rate": 9.298079048157434e-06, + "loss": 0.4684, + "step": 7074 + }, + { + "epoch": 1.5781842516172206, + "grad_norm": 0.18190234899520874, + "learning_rate": 9.295731443156121e-06, + "loss": 0.4561, + "step": 7075 + }, + { + "epoch": 1.57840731652911, + "grad_norm": 0.1686115711927414, + "learning_rate": 9.293383877161628e-06, + "loss": 0.4418, + "step": 7076 + }, + { + "epoch": 1.5786303814409992, + "grad_norm": 0.17196792364120483, + "learning_rate": 9.291036350303982e-06, + "loss": 0.4569, + "step": 7077 + }, + { + "epoch": 1.5788534463528887, + "grad_norm": 0.17236146330833435, + "learning_rate": 9.288688862713201e-06, + "loss": 0.4466, + "step": 7078 + }, + { + "epoch": 1.5790765112647782, + "grad_norm": 0.1745366007089615, + "learning_rate": 9.286341414519307e-06, + "loss": 0.4356, + "step": 7079 + }, + { + "epoch": 1.5792995761766675, + "grad_norm": 0.1981036514043808, + "learning_rate": 9.283994005852313e-06, + "loss": 0.4698, + "step": 7080 + }, + { + "epoch": 1.5795226410885568, + "grad_norm": 0.17840775847434998, + "learning_rate": 9.281646636842235e-06, + "loss": 0.4553, + "step": 7081 + }, + { + "epoch": 1.579745706000446, + "grad_norm": 0.17592334747314453, + "learning_rate": 9.279299307619085e-06, + "loss": 0.4468, + "step": 7082 + }, + { + "epoch": 1.5799687709123353, + "grad_norm": 0.17644402384757996, + "learning_rate": 9.276952018312874e-06, + "loss": 0.4373, + "step": 7083 + }, + { + "epoch": 1.5801918358242248, + "grad_norm": 0.18005765974521637, + "learning_rate": 9.274604769053605e-06, + "loss": 0.4825, + "step": 7084 + }, + { + "epoch": 1.5804149007361143, + "grad_norm": 0.17240746319293976, + "learning_rate": 9.272257559971291e-06, + "loss": 0.4562, + "step": 7085 + }, + { + "epoch": 1.5806379656480036, + "grad_norm": 0.171810120344162, + "learning_rate": 9.269910391195929e-06, + "loss": 0.4314, + "step": 7086 + }, + { + "epoch": 1.580861030559893, + "grad_norm": 0.33604612946510315, + "learning_rate": 9.26756326285752e-06, + "loss": 0.4709, + "step": 7087 + }, + { + "epoch": 1.5810840954717822, + "grad_norm": 0.18000468611717224, + "learning_rate": 9.265216175086068e-06, + "loss": 0.4858, + "step": 7088 + }, + { + "epoch": 1.5813071603836717, + "grad_norm": 0.17121154069900513, + "learning_rate": 9.262869128011565e-06, + "loss": 0.4492, + "step": 7089 + }, + { + "epoch": 1.581530225295561, + "grad_norm": 0.1873648315668106, + "learning_rate": 9.260522121764007e-06, + "loss": 0.4501, + "step": 7090 + }, + { + "epoch": 1.5817532902074505, + "grad_norm": 0.17154662311077118, + "learning_rate": 9.258175156473383e-06, + "loss": 0.4237, + "step": 7091 + }, + { + "epoch": 1.5819763551193398, + "grad_norm": 0.17864510416984558, + "learning_rate": 9.255828232269689e-06, + "loss": 0.4674, + "step": 7092 + }, + { + "epoch": 1.582199420031229, + "grad_norm": 0.18545754253864288, + "learning_rate": 9.253481349282906e-06, + "loss": 0.4657, + "step": 7093 + }, + { + "epoch": 1.5824224849431183, + "grad_norm": 0.18193696439266205, + "learning_rate": 9.251134507643022e-06, + "loss": 0.4538, + "step": 7094 + }, + { + "epoch": 1.5826455498550078, + "grad_norm": 0.17298151552677155, + "learning_rate": 9.248787707480018e-06, + "loss": 0.4696, + "step": 7095 + }, + { + "epoch": 1.5828686147668973, + "grad_norm": 0.17081047594547272, + "learning_rate": 9.24644094892388e-06, + "loss": 0.4593, + "step": 7096 + }, + { + "epoch": 1.5830916796787866, + "grad_norm": 0.18091976642608643, + "learning_rate": 9.244094232104578e-06, + "loss": 0.4611, + "step": 7097 + }, + { + "epoch": 1.583314744590676, + "grad_norm": 0.29800283908843994, + "learning_rate": 9.241747557152096e-06, + "loss": 0.4455, + "step": 7098 + }, + { + "epoch": 1.5835378095025652, + "grad_norm": 0.16630631685256958, + "learning_rate": 9.239400924196402e-06, + "loss": 0.4664, + "step": 7099 + }, + { + "epoch": 1.5837608744144545, + "grad_norm": 0.17668049037456512, + "learning_rate": 9.23705433336747e-06, + "loss": 0.4666, + "step": 7100 + }, + { + "epoch": 1.583983939326344, + "grad_norm": 0.17897532880306244, + "learning_rate": 9.234707784795266e-06, + "loss": 0.4586, + "step": 7101 + }, + { + "epoch": 1.5842070042382335, + "grad_norm": 0.18220888078212738, + "learning_rate": 9.232361278609761e-06, + "loss": 0.459, + "step": 7102 + }, + { + "epoch": 1.5844300691501227, + "grad_norm": 0.17005829513072968, + "learning_rate": 9.230014814940917e-06, + "loss": 0.4506, + "step": 7103 + }, + { + "epoch": 1.584653134062012, + "grad_norm": 0.17794610559940338, + "learning_rate": 9.227668393918695e-06, + "loss": 0.4894, + "step": 7104 + }, + { + "epoch": 1.5848761989739013, + "grad_norm": 0.1713520735502243, + "learning_rate": 9.225322015673055e-06, + "loss": 0.4762, + "step": 7105 + }, + { + "epoch": 1.5850992638857908, + "grad_norm": 0.17363131046295166, + "learning_rate": 9.222975680333956e-06, + "loss": 0.453, + "step": 7106 + }, + { + "epoch": 1.58532232879768, + "grad_norm": 0.1799049973487854, + "learning_rate": 9.22062938803135e-06, + "loss": 0.4692, + "step": 7107 + }, + { + "epoch": 1.5855453937095696, + "grad_norm": 0.1882503479719162, + "learning_rate": 9.218283138895192e-06, + "loss": 0.4721, + "step": 7108 + }, + { + "epoch": 1.5857684586214589, + "grad_norm": 0.19091439247131348, + "learning_rate": 9.215936933055431e-06, + "loss": 0.471, + "step": 7109 + }, + { + "epoch": 1.5859915235333482, + "grad_norm": 0.1782108098268509, + "learning_rate": 9.213590770642014e-06, + "loss": 0.4753, + "step": 7110 + }, + { + "epoch": 1.5862145884452374, + "grad_norm": 0.17361801862716675, + "learning_rate": 9.211244651784888e-06, + "loss": 0.4706, + "step": 7111 + }, + { + "epoch": 1.586437653357127, + "grad_norm": 0.18161118030548096, + "learning_rate": 9.208898576613993e-06, + "loss": 0.457, + "step": 7112 + }, + { + "epoch": 1.5866607182690164, + "grad_norm": 0.16798540949821472, + "learning_rate": 9.206552545259274e-06, + "loss": 0.4232, + "step": 7113 + }, + { + "epoch": 1.5868837831809057, + "grad_norm": 0.1804846078157425, + "learning_rate": 9.204206557850661e-06, + "loss": 0.4703, + "step": 7114 + }, + { + "epoch": 1.587106848092795, + "grad_norm": 0.17677569389343262, + "learning_rate": 9.201860614518098e-06, + "loss": 0.4451, + "step": 7115 + }, + { + "epoch": 1.5873299130046843, + "grad_norm": 0.18691909313201904, + "learning_rate": 9.199514715391514e-06, + "loss": 0.4525, + "step": 7116 + }, + { + "epoch": 1.5875529779165736, + "grad_norm": 0.1880151480436325, + "learning_rate": 9.197168860600843e-06, + "loss": 0.4887, + "step": 7117 + }, + { + "epoch": 1.587776042828463, + "grad_norm": 0.18004381656646729, + "learning_rate": 9.194823050276007e-06, + "loss": 0.4763, + "step": 7118 + }, + { + "epoch": 1.5879991077403526, + "grad_norm": 0.17934048175811768, + "learning_rate": 9.192477284546937e-06, + "loss": 0.4896, + "step": 7119 + }, + { + "epoch": 1.5882221726522419, + "grad_norm": 0.17603585124015808, + "learning_rate": 9.190131563543551e-06, + "loss": 0.4514, + "step": 7120 + }, + { + "epoch": 1.5884452375641311, + "grad_norm": 0.1763220578432083, + "learning_rate": 9.187785887395778e-06, + "loss": 0.4501, + "step": 7121 + }, + { + "epoch": 1.5886683024760204, + "grad_norm": 0.16422604024410248, + "learning_rate": 9.18544025623353e-06, + "loss": 0.4382, + "step": 7122 + }, + { + "epoch": 1.58889136738791, + "grad_norm": 0.17664875090122223, + "learning_rate": 9.183094670186724e-06, + "loss": 0.4802, + "step": 7123 + }, + { + "epoch": 1.5891144322997992, + "grad_norm": 0.1742987334728241, + "learning_rate": 9.180749129385273e-06, + "loss": 0.4307, + "step": 7124 + }, + { + "epoch": 1.5893374972116887, + "grad_norm": 0.16630713641643524, + "learning_rate": 9.17840363395909e-06, + "loss": 0.4245, + "step": 7125 + }, + { + "epoch": 1.589560562123578, + "grad_norm": 0.17244166135787964, + "learning_rate": 9.176058184038081e-06, + "loss": 0.4488, + "step": 7126 + }, + { + "epoch": 1.5897836270354673, + "grad_norm": 0.1764003038406372, + "learning_rate": 9.17371277975215e-06, + "loss": 0.4711, + "step": 7127 + }, + { + "epoch": 1.5900066919473566, + "grad_norm": 0.18569590151309967, + "learning_rate": 9.171367421231207e-06, + "loss": 0.4765, + "step": 7128 + }, + { + "epoch": 1.590229756859246, + "grad_norm": 0.19138166308403015, + "learning_rate": 9.169022108605147e-06, + "loss": 0.4742, + "step": 7129 + }, + { + "epoch": 1.5904528217711356, + "grad_norm": 0.17266172170639038, + "learning_rate": 9.16667684200387e-06, + "loss": 0.4525, + "step": 7130 + }, + { + "epoch": 1.5906758866830248, + "grad_norm": 0.17976059019565582, + "learning_rate": 9.16433162155727e-06, + "loss": 0.4669, + "step": 7131 + }, + { + "epoch": 1.5908989515949141, + "grad_norm": 0.16930918395519257, + "learning_rate": 9.161986447395244e-06, + "loss": 0.4463, + "step": 7132 + }, + { + "epoch": 1.5911220165068034, + "grad_norm": 0.185069739818573, + "learning_rate": 9.159641319647676e-06, + "loss": 0.4553, + "step": 7133 + }, + { + "epoch": 1.5913450814186927, + "grad_norm": 0.17120788991451263, + "learning_rate": 9.157296238444462e-06, + "loss": 0.4564, + "step": 7134 + }, + { + "epoch": 1.5915681463305822, + "grad_norm": 0.18028856813907623, + "learning_rate": 9.15495120391548e-06, + "loss": 0.4903, + "step": 7135 + }, + { + "epoch": 1.5917912112424717, + "grad_norm": 0.18500107526779175, + "learning_rate": 9.152606216190619e-06, + "loss": 0.4839, + "step": 7136 + }, + { + "epoch": 1.592014276154361, + "grad_norm": 0.17206281423568726, + "learning_rate": 9.150261275399752e-06, + "loss": 0.4445, + "step": 7137 + }, + { + "epoch": 1.5922373410662503, + "grad_norm": 0.19422174990177155, + "learning_rate": 9.147916381672763e-06, + "loss": 0.4412, + "step": 7138 + }, + { + "epoch": 1.5924604059781395, + "grad_norm": 0.17535541951656342, + "learning_rate": 9.145571535139524e-06, + "loss": 0.4682, + "step": 7139 + }, + { + "epoch": 1.592683470890029, + "grad_norm": 0.17184211313724518, + "learning_rate": 9.14322673592991e-06, + "loss": 0.4383, + "step": 7140 + }, + { + "epoch": 1.5929065358019183, + "grad_norm": 0.1792580634355545, + "learning_rate": 9.140881984173786e-06, + "loss": 0.4486, + "step": 7141 + }, + { + "epoch": 1.5931296007138078, + "grad_norm": 0.17893850803375244, + "learning_rate": 9.138537280001025e-06, + "loss": 0.4494, + "step": 7142 + }, + { + "epoch": 1.593352665625697, + "grad_norm": 0.17815670371055603, + "learning_rate": 9.136192623541487e-06, + "loss": 0.46, + "step": 7143 + }, + { + "epoch": 1.5935757305375864, + "grad_norm": 0.17325416207313538, + "learning_rate": 9.133848014925032e-06, + "loss": 0.4582, + "step": 7144 + }, + { + "epoch": 1.5937987954494757, + "grad_norm": 0.1770549863576889, + "learning_rate": 9.131503454281526e-06, + "loss": 0.4393, + "step": 7145 + }, + { + "epoch": 1.5940218603613652, + "grad_norm": 0.1701727956533432, + "learning_rate": 9.129158941740818e-06, + "loss": 0.4728, + "step": 7146 + }, + { + "epoch": 1.5942449252732547, + "grad_norm": 0.1813841164112091, + "learning_rate": 9.12681447743277e-06, + "loss": 0.4432, + "step": 7147 + }, + { + "epoch": 1.594467990185144, + "grad_norm": 0.17347143590450287, + "learning_rate": 9.124470061487225e-06, + "loss": 0.4405, + "step": 7148 + }, + { + "epoch": 1.5946910550970332, + "grad_norm": 0.18103331327438354, + "learning_rate": 9.122125694034039e-06, + "loss": 0.4811, + "step": 7149 + }, + { + "epoch": 1.5949141200089225, + "grad_norm": 0.17634287476539612, + "learning_rate": 9.11978137520305e-06, + "loss": 0.4625, + "step": 7150 + }, + { + "epoch": 1.5951371849208118, + "grad_norm": 0.16531561315059662, + "learning_rate": 9.117437105124107e-06, + "loss": 0.4206, + "step": 7151 + }, + { + "epoch": 1.5953602498327013, + "grad_norm": 0.1829378753900528, + "learning_rate": 9.11509288392705e-06, + "loss": 0.4665, + "step": 7152 + }, + { + "epoch": 1.5955833147445908, + "grad_norm": 0.17331163585186005, + "learning_rate": 9.112748711741712e-06, + "loss": 0.4393, + "step": 7153 + }, + { + "epoch": 1.59580637965648, + "grad_norm": 0.1706630289554596, + "learning_rate": 9.110404588697935e-06, + "loss": 0.46, + "step": 7154 + }, + { + "epoch": 1.5960294445683694, + "grad_norm": 0.16561025381088257, + "learning_rate": 9.108060514925546e-06, + "loss": 0.4287, + "step": 7155 + }, + { + "epoch": 1.5962525094802587, + "grad_norm": 0.17239882051944733, + "learning_rate": 9.105716490554377e-06, + "loss": 0.466, + "step": 7156 + }, + { + "epoch": 1.5964755743921482, + "grad_norm": 0.16943053901195526, + "learning_rate": 9.103372515714252e-06, + "loss": 0.4275, + "step": 7157 + }, + { + "epoch": 1.5966986393040374, + "grad_norm": 0.1701379269361496, + "learning_rate": 9.101028590534999e-06, + "loss": 0.4459, + "step": 7158 + }, + { + "epoch": 1.596921704215927, + "grad_norm": 0.18113826215267181, + "learning_rate": 9.098684715146435e-06, + "loss": 0.454, + "step": 7159 + }, + { + "epoch": 1.5971447691278162, + "grad_norm": 0.19422270357608795, + "learning_rate": 9.096340889678384e-06, + "loss": 0.4594, + "step": 7160 + }, + { + "epoch": 1.5973678340397055, + "grad_norm": 0.18085762858390808, + "learning_rate": 9.093997114260658e-06, + "loss": 0.4853, + "step": 7161 + }, + { + "epoch": 1.5975908989515948, + "grad_norm": 0.18114502727985382, + "learning_rate": 9.09165338902307e-06, + "loss": 0.4688, + "step": 7162 + }, + { + "epoch": 1.5978139638634843, + "grad_norm": 0.18122133612632751, + "learning_rate": 9.089309714095432e-06, + "loss": 0.4679, + "step": 7163 + }, + { + "epoch": 1.5980370287753738, + "grad_norm": 0.18571965396404266, + "learning_rate": 9.086966089607551e-06, + "loss": 0.48, + "step": 7164 + }, + { + "epoch": 1.598260093687263, + "grad_norm": 0.1749122440814972, + "learning_rate": 9.084622515689228e-06, + "loss": 0.4706, + "step": 7165 + }, + { + "epoch": 1.5984831585991524, + "grad_norm": 0.17465496063232422, + "learning_rate": 9.082278992470272e-06, + "loss": 0.4586, + "step": 7166 + }, + { + "epoch": 1.5987062235110416, + "grad_norm": 0.1802026480436325, + "learning_rate": 9.079935520080474e-06, + "loss": 0.4348, + "step": 7167 + }, + { + "epoch": 1.598929288422931, + "grad_norm": 0.17339572310447693, + "learning_rate": 9.077592098649639e-06, + "loss": 0.4688, + "step": 7168 + }, + { + "epoch": 1.5991523533348204, + "grad_norm": 0.17421960830688477, + "learning_rate": 9.075248728307551e-06, + "loss": 0.4779, + "step": 7169 + }, + { + "epoch": 1.59937541824671, + "grad_norm": 0.17133331298828125, + "learning_rate": 9.07290540918401e-06, + "loss": 0.451, + "step": 7170 + }, + { + "epoch": 1.5995984831585992, + "grad_norm": 0.1657906174659729, + "learning_rate": 9.070562141408795e-06, + "loss": 0.4296, + "step": 7171 + }, + { + "epoch": 1.5998215480704885, + "grad_norm": 0.17014503479003906, + "learning_rate": 9.068218925111695e-06, + "loss": 0.4302, + "step": 7172 + }, + { + "epoch": 1.6000446129823778, + "grad_norm": 0.1789960116147995, + "learning_rate": 9.065875760422496e-06, + "loss": 0.4808, + "step": 7173 + }, + { + "epoch": 1.6002676778942673, + "grad_norm": 0.16898800432682037, + "learning_rate": 9.06353264747097e-06, + "loss": 0.4614, + "step": 7174 + }, + { + "epoch": 1.6004907428061566, + "grad_norm": 0.1819867193698883, + "learning_rate": 9.0611895863869e-06, + "loss": 0.4687, + "step": 7175 + }, + { + "epoch": 1.600713807718046, + "grad_norm": 0.17777277529239655, + "learning_rate": 9.058846577300052e-06, + "loss": 0.4452, + "step": 7176 + }, + { + "epoch": 1.6009368726299353, + "grad_norm": 0.1771070510149002, + "learning_rate": 9.056503620340204e-06, + "loss": 0.4647, + "step": 7177 + }, + { + "epoch": 1.6011599375418246, + "grad_norm": 0.17987768352031708, + "learning_rate": 9.054160715637117e-06, + "loss": 0.465, + "step": 7178 + }, + { + "epoch": 1.601383002453714, + "grad_norm": 0.17189104855060577, + "learning_rate": 9.051817863320563e-06, + "loss": 0.46, + "step": 7179 + }, + { + "epoch": 1.6016060673656034, + "grad_norm": 0.19388031959533691, + "learning_rate": 9.049475063520295e-06, + "loss": 0.4347, + "step": 7180 + }, + { + "epoch": 1.601829132277493, + "grad_norm": 0.1758044809103012, + "learning_rate": 9.047132316366082e-06, + "loss": 0.4477, + "step": 7181 + }, + { + "epoch": 1.6020521971893822, + "grad_norm": 0.19193986058235168, + "learning_rate": 9.044789621987672e-06, + "loss": 0.4698, + "step": 7182 + }, + { + "epoch": 1.6022752621012715, + "grad_norm": 0.1996065378189087, + "learning_rate": 9.042446980514822e-06, + "loss": 0.4733, + "step": 7183 + }, + { + "epoch": 1.6024983270131608, + "grad_norm": 0.1759694218635559, + "learning_rate": 9.040104392077283e-06, + "loss": 0.4534, + "step": 7184 + }, + { + "epoch": 1.60272139192505, + "grad_norm": 0.1798923909664154, + "learning_rate": 9.0377618568048e-06, + "loss": 0.4792, + "step": 7185 + }, + { + "epoch": 1.6029444568369395, + "grad_norm": 0.17134787142276764, + "learning_rate": 9.035419374827116e-06, + "loss": 0.4889, + "step": 7186 + }, + { + "epoch": 1.603167521748829, + "grad_norm": 0.2020411342382431, + "learning_rate": 9.033076946273978e-06, + "loss": 0.4891, + "step": 7187 + }, + { + "epoch": 1.6033905866607183, + "grad_norm": 0.1749039590358734, + "learning_rate": 9.030734571275118e-06, + "loss": 0.4535, + "step": 7188 + }, + { + "epoch": 1.6036136515726076, + "grad_norm": 0.1836901307106018, + "learning_rate": 9.028392249960276e-06, + "loss": 0.4793, + "step": 7189 + }, + { + "epoch": 1.6038367164844969, + "grad_norm": 0.19300808012485504, + "learning_rate": 9.02604998245918e-06, + "loss": 0.4434, + "step": 7190 + }, + { + "epoch": 1.6040597813963864, + "grad_norm": 0.18210625648498535, + "learning_rate": 9.023707768901567e-06, + "loss": 0.4591, + "step": 7191 + }, + { + "epoch": 1.6042828463082757, + "grad_norm": 0.17498520016670227, + "learning_rate": 9.021365609417155e-06, + "loss": 0.4658, + "step": 7192 + }, + { + "epoch": 1.6045059112201652, + "grad_norm": 0.17925825715065002, + "learning_rate": 9.019023504135671e-06, + "loss": 0.4617, + "step": 7193 + }, + { + "epoch": 1.6047289761320545, + "grad_norm": 0.18746887147426605, + "learning_rate": 9.01668145318684e-06, + "loss": 0.4819, + "step": 7194 + }, + { + "epoch": 1.6049520410439437, + "grad_norm": 0.16884252429008484, + "learning_rate": 9.014339456700371e-06, + "loss": 0.4457, + "step": 7195 + }, + { + "epoch": 1.605175105955833, + "grad_norm": 0.1717928946018219, + "learning_rate": 9.011997514805986e-06, + "loss": 0.4507, + "step": 7196 + }, + { + "epoch": 1.6053981708677225, + "grad_norm": 0.18760330975055695, + "learning_rate": 9.009655627633392e-06, + "loss": 0.4447, + "step": 7197 + }, + { + "epoch": 1.605621235779612, + "grad_norm": 0.16959816217422485, + "learning_rate": 9.0073137953123e-06, + "loss": 0.4185, + "step": 7198 + }, + { + "epoch": 1.6058443006915013, + "grad_norm": 0.17206130921840668, + "learning_rate": 9.004972017972414e-06, + "loss": 0.4572, + "step": 7199 + }, + { + "epoch": 1.6060673656033906, + "grad_norm": 0.1713947355747223, + "learning_rate": 9.002630295743437e-06, + "loss": 0.4656, + "step": 7200 + }, + { + "epoch": 1.6062904305152799, + "grad_norm": 0.1734878569841385, + "learning_rate": 9.000288628755065e-06, + "loss": 0.4543, + "step": 7201 + }, + { + "epoch": 1.6065134954271691, + "grad_norm": 0.17771008610725403, + "learning_rate": 8.997947017137e-06, + "loss": 0.4749, + "step": 7202 + }, + { + "epoch": 1.6067365603390587, + "grad_norm": 0.17664694786071777, + "learning_rate": 8.995605461018933e-06, + "loss": 0.4472, + "step": 7203 + }, + { + "epoch": 1.6069596252509482, + "grad_norm": 0.18092039227485657, + "learning_rate": 8.993263960530552e-06, + "loss": 0.4716, + "step": 7204 + }, + { + "epoch": 1.6071826901628374, + "grad_norm": 0.17921680212020874, + "learning_rate": 8.990922515801546e-06, + "loss": 0.4736, + "step": 7205 + }, + { + "epoch": 1.6074057550747267, + "grad_norm": 0.17247629165649414, + "learning_rate": 8.9885811269616e-06, + "loss": 0.4575, + "step": 7206 + }, + { + "epoch": 1.607628819986616, + "grad_norm": 0.1867935061454773, + "learning_rate": 8.986239794140389e-06, + "loss": 0.4509, + "step": 7207 + }, + { + "epoch": 1.6078518848985055, + "grad_norm": 0.18020030856132507, + "learning_rate": 8.983898517467598e-06, + "loss": 0.4526, + "step": 7208 + }, + { + "epoch": 1.6080749498103948, + "grad_norm": 0.17290444672107697, + "learning_rate": 8.981557297072897e-06, + "loss": 0.4358, + "step": 7209 + }, + { + "epoch": 1.6082980147222843, + "grad_norm": 0.17369163036346436, + "learning_rate": 8.979216133085961e-06, + "loss": 0.4485, + "step": 7210 + }, + { + "epoch": 1.6085210796341736, + "grad_norm": 0.17620067298412323, + "learning_rate": 8.976875025636455e-06, + "loss": 0.4515, + "step": 7211 + }, + { + "epoch": 1.6087441445460628, + "grad_norm": 0.17683733999729156, + "learning_rate": 8.974533974854046e-06, + "loss": 0.4434, + "step": 7212 + }, + { + "epoch": 1.6089672094579521, + "grad_norm": 0.16844935715198517, + "learning_rate": 8.972192980868397e-06, + "loss": 0.4471, + "step": 7213 + }, + { + "epoch": 1.6091902743698416, + "grad_norm": 0.17559292912483215, + "learning_rate": 8.969852043809165e-06, + "loss": 0.4755, + "step": 7214 + }, + { + "epoch": 1.6094133392817311, + "grad_norm": 0.17093756794929504, + "learning_rate": 8.967511163806008e-06, + "loss": 0.4571, + "step": 7215 + }, + { + "epoch": 1.6096364041936204, + "grad_norm": 0.17171628773212433, + "learning_rate": 8.965170340988574e-06, + "loss": 0.4461, + "step": 7216 + }, + { + "epoch": 1.6098594691055097, + "grad_norm": 0.17218218743801117, + "learning_rate": 8.96282957548652e-06, + "loss": 0.4595, + "step": 7217 + }, + { + "epoch": 1.610082534017399, + "grad_norm": 0.17221598327159882, + "learning_rate": 8.960488867429486e-06, + "loss": 0.4469, + "step": 7218 + }, + { + "epoch": 1.6103055989292883, + "grad_norm": 0.19169965386390686, + "learning_rate": 8.958148216947118e-06, + "loss": 0.4863, + "step": 7219 + }, + { + "epoch": 1.6105286638411778, + "grad_norm": 0.19341188669204712, + "learning_rate": 8.955807624169054e-06, + "loss": 0.482, + "step": 7220 + }, + { + "epoch": 1.6107517287530673, + "grad_norm": 0.17554320394992828, + "learning_rate": 8.953467089224934e-06, + "loss": 0.4396, + "step": 7221 + }, + { + "epoch": 1.6109747936649566, + "grad_norm": 0.17117850482463837, + "learning_rate": 8.951126612244387e-06, + "loss": 0.4319, + "step": 7222 + }, + { + "epoch": 1.6111978585768458, + "grad_norm": 0.17589138448238373, + "learning_rate": 8.948786193357049e-06, + "loss": 0.4553, + "step": 7223 + }, + { + "epoch": 1.6114209234887351, + "grad_norm": 0.265455961227417, + "learning_rate": 8.946445832692545e-06, + "loss": 0.481, + "step": 7224 + }, + { + "epoch": 1.6116439884006246, + "grad_norm": 0.17754699289798737, + "learning_rate": 8.944105530380495e-06, + "loss": 0.4489, + "step": 7225 + }, + { + "epoch": 1.611867053312514, + "grad_norm": 0.19480863213539124, + "learning_rate": 8.941765286550523e-06, + "loss": 0.4515, + "step": 7226 + }, + { + "epoch": 1.6120901182244034, + "grad_norm": 0.16930724680423737, + "learning_rate": 8.939425101332245e-06, + "loss": 0.4376, + "step": 7227 + }, + { + "epoch": 1.6123131831362927, + "grad_norm": 0.1783130019903183, + "learning_rate": 8.937084974855278e-06, + "loss": 0.4679, + "step": 7228 + }, + { + "epoch": 1.612536248048182, + "grad_norm": 0.19291211664676666, + "learning_rate": 8.934744907249229e-06, + "loss": 0.481, + "step": 7229 + }, + { + "epoch": 1.6127593129600712, + "grad_norm": 0.1795196533203125, + "learning_rate": 8.93240489864371e-06, + "loss": 0.4683, + "step": 7230 + }, + { + "epoch": 1.6129823778719607, + "grad_norm": 0.17331638932228088, + "learning_rate": 8.930064949168322e-06, + "loss": 0.4553, + "step": 7231 + }, + { + "epoch": 1.6132054427838503, + "grad_norm": 0.15906356275081635, + "learning_rate": 8.927725058952669e-06, + "loss": 0.4416, + "step": 7232 + }, + { + "epoch": 1.6134285076957395, + "grad_norm": 0.17841282486915588, + "learning_rate": 8.925385228126344e-06, + "loss": 0.4671, + "step": 7233 + }, + { + "epoch": 1.6136515726076288, + "grad_norm": 0.17484787106513977, + "learning_rate": 8.923045456818947e-06, + "loss": 0.4748, + "step": 7234 + }, + { + "epoch": 1.613874637519518, + "grad_norm": 0.17049816250801086, + "learning_rate": 8.920705745160064e-06, + "loss": 0.4464, + "step": 7235 + }, + { + "epoch": 1.6140977024314074, + "grad_norm": 0.17688851058483124, + "learning_rate": 8.918366093279288e-06, + "loss": 0.4538, + "step": 7236 + }, + { + "epoch": 1.6143207673432969, + "grad_norm": 0.19764314591884613, + "learning_rate": 8.916026501306203e-06, + "loss": 0.464, + "step": 7237 + }, + { + "epoch": 1.6145438322551864, + "grad_norm": 0.16911235451698303, + "learning_rate": 8.913686969370386e-06, + "loss": 0.4505, + "step": 7238 + }, + { + "epoch": 1.6147668971670757, + "grad_norm": 0.1755746304988861, + "learning_rate": 8.911347497601418e-06, + "loss": 0.4726, + "step": 7239 + }, + { + "epoch": 1.614989962078965, + "grad_norm": 0.17547662556171417, + "learning_rate": 8.909008086128874e-06, + "loss": 0.4596, + "step": 7240 + }, + { + "epoch": 1.6152130269908542, + "grad_norm": 0.18209370970726013, + "learning_rate": 8.906668735082327e-06, + "loss": 0.4692, + "step": 7241 + }, + { + "epoch": 1.6154360919027437, + "grad_norm": 0.17402005195617676, + "learning_rate": 8.90432944459134e-06, + "loss": 0.4477, + "step": 7242 + }, + { + "epoch": 1.615659156814633, + "grad_norm": 0.1721334308385849, + "learning_rate": 8.901990214785483e-06, + "loss": 0.4463, + "step": 7243 + }, + { + "epoch": 1.6158822217265225, + "grad_norm": 0.17670831084251404, + "learning_rate": 8.899651045794313e-06, + "loss": 0.4583, + "step": 7244 + }, + { + "epoch": 1.6161052866384118, + "grad_norm": 0.1660810261964798, + "learning_rate": 8.89731193774739e-06, + "loss": 0.4553, + "step": 7245 + }, + { + "epoch": 1.616328351550301, + "grad_norm": 0.16802763938903809, + "learning_rate": 8.894972890774266e-06, + "loss": 0.4426, + "step": 7246 + }, + { + "epoch": 1.6165514164621904, + "grad_norm": 0.18203771114349365, + "learning_rate": 8.892633905004498e-06, + "loss": 0.4281, + "step": 7247 + }, + { + "epoch": 1.6167744813740799, + "grad_norm": 0.17793646454811096, + "learning_rate": 8.890294980567626e-06, + "loss": 0.4463, + "step": 7248 + }, + { + "epoch": 1.6169975462859694, + "grad_norm": 0.1768350899219513, + "learning_rate": 8.887956117593201e-06, + "loss": 0.4461, + "step": 7249 + }, + { + "epoch": 1.6172206111978586, + "grad_norm": 0.16968591511249542, + "learning_rate": 8.88561731621076e-06, + "loss": 0.4256, + "step": 7250 + }, + { + "epoch": 1.617443676109748, + "grad_norm": 0.17799051105976105, + "learning_rate": 8.883278576549843e-06, + "loss": 0.4425, + "step": 7251 + }, + { + "epoch": 1.6176667410216372, + "grad_norm": 0.1699870377779007, + "learning_rate": 8.88093989873998e-06, + "loss": 0.4579, + "step": 7252 + }, + { + "epoch": 1.6178898059335265, + "grad_norm": 0.17788825929164886, + "learning_rate": 8.87860128291071e-06, + "loss": 0.4816, + "step": 7253 + }, + { + "epoch": 1.618112870845416, + "grad_norm": 0.1790701150894165, + "learning_rate": 8.87626272919155e-06, + "loss": 0.4347, + "step": 7254 + }, + { + "epoch": 1.6183359357573055, + "grad_norm": 0.1901506632566452, + "learning_rate": 8.87392423771203e-06, + "loss": 0.4522, + "step": 7255 + }, + { + "epoch": 1.6185590006691948, + "grad_norm": 0.19295074045658112, + "learning_rate": 8.871585808601669e-06, + "loss": 0.4651, + "step": 7256 + }, + { + "epoch": 1.618782065581084, + "grad_norm": 0.1766098290681839, + "learning_rate": 8.869247441989983e-06, + "loss": 0.4137, + "step": 7257 + }, + { + "epoch": 1.6190051304929733, + "grad_norm": 0.17081737518310547, + "learning_rate": 8.866909138006488e-06, + "loss": 0.4277, + "step": 7258 + }, + { + "epoch": 1.6192281954048628, + "grad_norm": 0.17560650408267975, + "learning_rate": 8.864570896780691e-06, + "loss": 0.4675, + "step": 7259 + }, + { + "epoch": 1.6194512603167521, + "grad_norm": 0.1765887439250946, + "learning_rate": 8.862232718442101e-06, + "loss": 0.4536, + "step": 7260 + }, + { + "epoch": 1.6196743252286416, + "grad_norm": 0.18248333036899567, + "learning_rate": 8.85989460312022e-06, + "loss": 0.4778, + "step": 7261 + }, + { + "epoch": 1.619897390140531, + "grad_norm": 0.17278814315795898, + "learning_rate": 8.857556550944548e-06, + "loss": 0.4378, + "step": 7262 + }, + { + "epoch": 1.6201204550524202, + "grad_norm": 0.18780216574668884, + "learning_rate": 8.855218562044579e-06, + "loss": 0.4768, + "step": 7263 + }, + { + "epoch": 1.6203435199643095, + "grad_norm": 0.16610752046108246, + "learning_rate": 8.852880636549808e-06, + "loss": 0.4362, + "step": 7264 + }, + { + "epoch": 1.620566584876199, + "grad_norm": 0.16919295489788055, + "learning_rate": 8.85054277458972e-06, + "loss": 0.4394, + "step": 7265 + }, + { + "epoch": 1.6207896497880885, + "grad_norm": 0.16546227037906647, + "learning_rate": 8.848204976293807e-06, + "loss": 0.4484, + "step": 7266 + }, + { + "epoch": 1.6210127146999778, + "grad_norm": 0.1768079698085785, + "learning_rate": 8.845867241791548e-06, + "loss": 0.445, + "step": 7267 + }, + { + "epoch": 1.621235779611867, + "grad_norm": 0.17911723256111145, + "learning_rate": 8.84352957121242e-06, + "loss": 0.4643, + "step": 7268 + }, + { + "epoch": 1.6214588445237563, + "grad_norm": 0.17884330451488495, + "learning_rate": 8.841191964685896e-06, + "loss": 0.4687, + "step": 7269 + }, + { + "epoch": 1.6216819094356456, + "grad_norm": 0.17899781465530396, + "learning_rate": 8.838854422341454e-06, + "loss": 0.4641, + "step": 7270 + }, + { + "epoch": 1.6219049743475351, + "grad_norm": 0.17130422592163086, + "learning_rate": 8.836516944308555e-06, + "loss": 0.4413, + "step": 7271 + }, + { + "epoch": 1.6221280392594246, + "grad_norm": 0.17263583838939667, + "learning_rate": 8.834179530716669e-06, + "loss": 0.4521, + "step": 7272 + }, + { + "epoch": 1.622351104171314, + "grad_norm": 0.17574861645698547, + "learning_rate": 8.831842181695251e-06, + "loss": 0.4695, + "step": 7273 + }, + { + "epoch": 1.6225741690832032, + "grad_norm": 0.1733783781528473, + "learning_rate": 8.829504897373764e-06, + "loss": 0.4458, + "step": 7274 + }, + { + "epoch": 1.6227972339950925, + "grad_norm": 0.18253105878829956, + "learning_rate": 8.827167677881656e-06, + "loss": 0.4412, + "step": 7275 + }, + { + "epoch": 1.623020298906982, + "grad_norm": 0.1744835376739502, + "learning_rate": 8.824830523348383e-06, + "loss": 0.441, + "step": 7276 + }, + { + "epoch": 1.6232433638188712, + "grad_norm": 0.17317108809947968, + "learning_rate": 8.822493433903383e-06, + "loss": 0.4819, + "step": 7277 + }, + { + "epoch": 1.6234664287307607, + "grad_norm": 0.17538447678089142, + "learning_rate": 8.820156409676105e-06, + "loss": 0.4303, + "step": 7278 + }, + { + "epoch": 1.62368949364265, + "grad_norm": 0.18321259319782257, + "learning_rate": 8.817819450795989e-06, + "loss": 0.4616, + "step": 7279 + }, + { + "epoch": 1.6239125585545393, + "grad_norm": 0.17948691546916962, + "learning_rate": 8.815482557392463e-06, + "loss": 0.4734, + "step": 7280 + }, + { + "epoch": 1.6241356234664286, + "grad_norm": 0.17915484309196472, + "learning_rate": 8.81314572959497e-06, + "loss": 0.4497, + "step": 7281 + }, + { + "epoch": 1.624358688378318, + "grad_norm": 0.18481531739234924, + "learning_rate": 8.810808967532927e-06, + "loss": 0.4729, + "step": 7282 + }, + { + "epoch": 1.6245817532902076, + "grad_norm": 0.18924424052238464, + "learning_rate": 8.808472271335767e-06, + "loss": 0.4762, + "step": 7283 + }, + { + "epoch": 1.6248048182020969, + "grad_norm": 0.1721714437007904, + "learning_rate": 8.806135641132907e-06, + "loss": 0.4682, + "step": 7284 + }, + { + "epoch": 1.6250278831139862, + "grad_norm": 0.1784234195947647, + "learning_rate": 8.803799077053765e-06, + "loss": 0.465, + "step": 7285 + }, + { + "epoch": 1.6252509480258754, + "grad_norm": 0.18713349103927612, + "learning_rate": 8.801462579227751e-06, + "loss": 0.4549, + "step": 7286 + }, + { + "epoch": 1.625474012937765, + "grad_norm": 0.1756950169801712, + "learning_rate": 8.799126147784284e-06, + "loss": 0.4389, + "step": 7287 + }, + { + "epoch": 1.6256970778496542, + "grad_norm": 0.16715659201145172, + "learning_rate": 8.796789782852761e-06, + "loss": 0.4456, + "step": 7288 + }, + { + "epoch": 1.6259201427615437, + "grad_norm": 0.1801193356513977, + "learning_rate": 8.79445348456259e-06, + "loss": 0.4601, + "step": 7289 + }, + { + "epoch": 1.626143207673433, + "grad_norm": 0.18265050649642944, + "learning_rate": 8.792117253043166e-06, + "loss": 0.4617, + "step": 7290 + }, + { + "epoch": 1.6263662725853223, + "grad_norm": 0.17601865530014038, + "learning_rate": 8.78978108842389e-06, + "loss": 0.4572, + "step": 7291 + }, + { + "epoch": 1.6265893374972116, + "grad_norm": 0.1800207793712616, + "learning_rate": 8.787444990834146e-06, + "loss": 0.4626, + "step": 7292 + }, + { + "epoch": 1.626812402409101, + "grad_norm": 0.18082986772060394, + "learning_rate": 8.78510896040333e-06, + "loss": 0.4583, + "step": 7293 + }, + { + "epoch": 1.6270354673209904, + "grad_norm": 0.1825123131275177, + "learning_rate": 8.782772997260819e-06, + "loss": 0.4604, + "step": 7294 + }, + { + "epoch": 1.6272585322328799, + "grad_norm": 0.17616558074951172, + "learning_rate": 8.780437101535997e-06, + "loss": 0.442, + "step": 7295 + }, + { + "epoch": 1.6274815971447691, + "grad_norm": 0.17222437262535095, + "learning_rate": 8.778101273358238e-06, + "loss": 0.4374, + "step": 7296 + }, + { + "epoch": 1.6277046620566584, + "grad_norm": 0.1752903312444687, + "learning_rate": 8.775765512856919e-06, + "loss": 0.4703, + "step": 7297 + }, + { + "epoch": 1.6279277269685477, + "grad_norm": 0.17624272406101227, + "learning_rate": 8.773429820161404e-06, + "loss": 0.4692, + "step": 7298 + }, + { + "epoch": 1.6281507918804372, + "grad_norm": 0.17555177211761475, + "learning_rate": 8.771094195401062e-06, + "loss": 0.4517, + "step": 7299 + }, + { + "epoch": 1.6283738567923267, + "grad_norm": 0.1727064996957779, + "learning_rate": 8.768758638705253e-06, + "loss": 0.4617, + "step": 7300 + }, + { + "epoch": 1.628596921704216, + "grad_norm": 0.17664147913455963, + "learning_rate": 8.766423150203335e-06, + "loss": 0.4565, + "step": 7301 + }, + { + "epoch": 1.6288199866161053, + "grad_norm": 0.17871947586536407, + "learning_rate": 8.764087730024667e-06, + "loss": 0.464, + "step": 7302 + }, + { + "epoch": 1.6290430515279946, + "grad_norm": 0.18316639959812164, + "learning_rate": 8.761752378298589e-06, + "loss": 0.4474, + "step": 7303 + }, + { + "epoch": 1.629266116439884, + "grad_norm": 0.19442254304885864, + "learning_rate": 8.759417095154456e-06, + "loss": 0.4353, + "step": 7304 + }, + { + "epoch": 1.6294891813517733, + "grad_norm": 0.17314781248569489, + "learning_rate": 8.757081880721612e-06, + "loss": 0.4544, + "step": 7305 + }, + { + "epoch": 1.6297122462636628, + "grad_norm": 0.1879189908504486, + "learning_rate": 8.754746735129385e-06, + "loss": 0.4588, + "step": 7306 + }, + { + "epoch": 1.6299353111755521, + "grad_norm": 0.17803287506103516, + "learning_rate": 8.752411658507121e-06, + "loss": 0.4526, + "step": 7307 + }, + { + "epoch": 1.6301583760874414, + "grad_norm": 0.18903419375419617, + "learning_rate": 8.750076650984143e-06, + "loss": 0.4412, + "step": 7308 + }, + { + "epoch": 1.6303814409993307, + "grad_norm": 0.17319492995738983, + "learning_rate": 8.747741712689786e-06, + "loss": 0.4506, + "step": 7309 + }, + { + "epoch": 1.6306045059112202, + "grad_norm": 0.1812223494052887, + "learning_rate": 8.745406843753369e-06, + "loss": 0.4256, + "step": 7310 + }, + { + "epoch": 1.6308275708231095, + "grad_norm": 0.1832951158285141, + "learning_rate": 8.743072044304212e-06, + "loss": 0.4721, + "step": 7311 + }, + { + "epoch": 1.631050635734999, + "grad_norm": 0.183165043592453, + "learning_rate": 8.74073731447163e-06, + "loss": 0.4675, + "step": 7312 + }, + { + "epoch": 1.6312737006468883, + "grad_norm": 0.18091800808906555, + "learning_rate": 8.738402654384939e-06, + "loss": 0.4438, + "step": 7313 + }, + { + "epoch": 1.6314967655587775, + "grad_norm": 0.1746785044670105, + "learning_rate": 8.736068064173444e-06, + "loss": 0.4539, + "step": 7314 + }, + { + "epoch": 1.6317198304706668, + "grad_norm": 0.18262261152267456, + "learning_rate": 8.733733543966449e-06, + "loss": 0.4514, + "step": 7315 + }, + { + "epoch": 1.6319428953825563, + "grad_norm": 0.18014779686927795, + "learning_rate": 8.731399093893256e-06, + "loss": 0.4688, + "step": 7316 + }, + { + "epoch": 1.6321659602944458, + "grad_norm": 0.1781376600265503, + "learning_rate": 8.729064714083163e-06, + "loss": 0.4552, + "step": 7317 + }, + { + "epoch": 1.632389025206335, + "grad_norm": 0.17916658520698547, + "learning_rate": 8.726730404665458e-06, + "loss": 0.4463, + "step": 7318 + }, + { + "epoch": 1.6326120901182244, + "grad_norm": 0.17721021175384521, + "learning_rate": 8.724396165769435e-06, + "loss": 0.4552, + "step": 7319 + }, + { + "epoch": 1.6328351550301137, + "grad_norm": 0.18291765451431274, + "learning_rate": 8.722061997524374e-06, + "loss": 0.4492, + "step": 7320 + }, + { + "epoch": 1.6330582199420032, + "grad_norm": 0.19074088335037231, + "learning_rate": 8.719727900059559e-06, + "loss": 0.4784, + "step": 7321 + }, + { + "epoch": 1.6332812848538925, + "grad_norm": 0.18184469640254974, + "learning_rate": 8.717393873504265e-06, + "loss": 0.4602, + "step": 7322 + }, + { + "epoch": 1.633504349765782, + "grad_norm": 0.17274357378482819, + "learning_rate": 8.715059917987766e-06, + "loss": 0.4394, + "step": 7323 + }, + { + "epoch": 1.6337274146776712, + "grad_norm": 0.1714925616979599, + "learning_rate": 8.712726033639334e-06, + "loss": 0.4277, + "step": 7324 + }, + { + "epoch": 1.6339504795895605, + "grad_norm": 0.17422372102737427, + "learning_rate": 8.710392220588229e-06, + "loss": 0.4585, + "step": 7325 + }, + { + "epoch": 1.6341735445014498, + "grad_norm": 0.17235322296619415, + "learning_rate": 8.708058478963717e-06, + "loss": 0.4489, + "step": 7326 + }, + { + "epoch": 1.6343966094133393, + "grad_norm": 0.17422306537628174, + "learning_rate": 8.70572480889505e-06, + "loss": 0.4624, + "step": 7327 + }, + { + "epoch": 1.6346196743252286, + "grad_norm": 0.1853763461112976, + "learning_rate": 8.703391210511486e-06, + "loss": 0.4723, + "step": 7328 + }, + { + "epoch": 1.634842739237118, + "grad_norm": 0.17920438945293427, + "learning_rate": 8.701057683942274e-06, + "loss": 0.4596, + "step": 7329 + }, + { + "epoch": 1.6350658041490074, + "grad_norm": 0.17522959411144257, + "learning_rate": 8.698724229316658e-06, + "loss": 0.4501, + "step": 7330 + }, + { + "epoch": 1.6352888690608967, + "grad_norm": 0.17898008227348328, + "learning_rate": 8.696390846763877e-06, + "loss": 0.4416, + "step": 7331 + }, + { + "epoch": 1.635511933972786, + "grad_norm": 0.1786876916885376, + "learning_rate": 8.694057536413175e-06, + "loss": 0.4541, + "step": 7332 + }, + { + "epoch": 1.6357349988846754, + "grad_norm": 0.18402035534381866, + "learning_rate": 8.691724298393777e-06, + "loss": 0.4781, + "step": 7333 + }, + { + "epoch": 1.635958063796565, + "grad_norm": 0.18904945254325867, + "learning_rate": 8.68939113283492e-06, + "loss": 0.4541, + "step": 7334 + }, + { + "epoch": 1.6361811287084542, + "grad_norm": 0.18246184289455414, + "learning_rate": 8.687058039865823e-06, + "loss": 0.4711, + "step": 7335 + }, + { + "epoch": 1.6364041936203435, + "grad_norm": 0.1694568693637848, + "learning_rate": 8.684725019615714e-06, + "loss": 0.4648, + "step": 7336 + }, + { + "epoch": 1.6366272585322328, + "grad_norm": 0.17736688256263733, + "learning_rate": 8.682392072213804e-06, + "loss": 0.4192, + "step": 7337 + }, + { + "epoch": 1.6368503234441223, + "grad_norm": 0.16740672290325165, + "learning_rate": 8.680059197789311e-06, + "loss": 0.4591, + "step": 7338 + }, + { + "epoch": 1.6370733883560116, + "grad_norm": 0.17872267961502075, + "learning_rate": 8.67772639647144e-06, + "loss": 0.4453, + "step": 7339 + }, + { + "epoch": 1.637296453267901, + "grad_norm": 0.18077607452869415, + "learning_rate": 8.675393668389402e-06, + "loss": 0.459, + "step": 7340 + }, + { + "epoch": 1.6375195181797904, + "grad_norm": 0.17609618604183197, + "learning_rate": 8.67306101367239e-06, + "loss": 0.4774, + "step": 7341 + }, + { + "epoch": 1.6377425830916796, + "grad_norm": 0.17567932605743408, + "learning_rate": 8.670728432449608e-06, + "loss": 0.4577, + "step": 7342 + }, + { + "epoch": 1.637965648003569, + "grad_norm": 0.18669098615646362, + "learning_rate": 8.66839592485025e-06, + "loss": 0.4321, + "step": 7343 + }, + { + "epoch": 1.6381887129154584, + "grad_norm": 0.1745385229587555, + "learning_rate": 8.666063491003499e-06, + "loss": 0.4595, + "step": 7344 + }, + { + "epoch": 1.6384117778273477, + "grad_norm": 0.16913901269435883, + "learning_rate": 8.663731131038544e-06, + "loss": 0.4179, + "step": 7345 + }, + { + "epoch": 1.6386348427392372, + "grad_norm": 0.16662278771400452, + "learning_rate": 8.661398845084562e-06, + "loss": 0.4333, + "step": 7346 + }, + { + "epoch": 1.6388579076511265, + "grad_norm": 0.17218153178691864, + "learning_rate": 8.659066633270736e-06, + "loss": 0.4324, + "step": 7347 + }, + { + "epoch": 1.6390809725630158, + "grad_norm": 0.1764010488986969, + "learning_rate": 8.65673449572623e-06, + "loss": 0.44, + "step": 7348 + }, + { + "epoch": 1.639304037474905, + "grad_norm": 0.17202350497245789, + "learning_rate": 8.65440243258022e-06, + "loss": 0.4611, + "step": 7349 + }, + { + "epoch": 1.6395271023867946, + "grad_norm": 0.17550015449523926, + "learning_rate": 8.652070443961866e-06, + "loss": 0.4641, + "step": 7350 + }, + { + "epoch": 1.639750167298684, + "grad_norm": 0.18475106358528137, + "learning_rate": 8.649738530000333e-06, + "loss": 0.4537, + "step": 7351 + }, + { + "epoch": 1.6399732322105733, + "grad_norm": 0.1760861724615097, + "learning_rate": 8.647406690824769e-06, + "loss": 0.4216, + "step": 7352 + }, + { + "epoch": 1.6401962971224626, + "grad_norm": 0.20794068276882172, + "learning_rate": 8.645074926564334e-06, + "loss": 0.4492, + "step": 7353 + }, + { + "epoch": 1.640419362034352, + "grad_norm": 0.1916939616203308, + "learning_rate": 8.64274323734817e-06, + "loss": 0.4492, + "step": 7354 + }, + { + "epoch": 1.6406424269462414, + "grad_norm": 0.17501096427440643, + "learning_rate": 8.640411623305425e-06, + "loss": 0.4365, + "step": 7355 + }, + { + "epoch": 1.6408654918581307, + "grad_norm": 0.1728835254907608, + "learning_rate": 8.638080084565235e-06, + "loss": 0.444, + "step": 7356 + }, + { + "epoch": 1.6410885567700202, + "grad_norm": 0.17348189651966095, + "learning_rate": 8.63574862125674e-06, + "loss": 0.4614, + "step": 7357 + }, + { + "epoch": 1.6413116216819095, + "grad_norm": 0.17011727392673492, + "learning_rate": 8.633417233509063e-06, + "loss": 0.4539, + "step": 7358 + }, + { + "epoch": 1.6415346865937988, + "grad_norm": 0.17248336970806122, + "learning_rate": 8.63108592145134e-06, + "loss": 0.4545, + "step": 7359 + }, + { + "epoch": 1.641757751505688, + "grad_norm": 0.17574246227741241, + "learning_rate": 8.628754685212685e-06, + "loss": 0.458, + "step": 7360 + }, + { + "epoch": 1.6419808164175775, + "grad_norm": 0.19359445571899414, + "learning_rate": 8.626423524922224e-06, + "loss": 0.4388, + "step": 7361 + }, + { + "epoch": 1.642203881329467, + "grad_norm": 0.17245686054229736, + "learning_rate": 8.624092440709066e-06, + "loss": 0.4345, + "step": 7362 + }, + { + "epoch": 1.6424269462413563, + "grad_norm": 0.180564284324646, + "learning_rate": 8.621761432702325e-06, + "loss": 0.4665, + "step": 7363 + }, + { + "epoch": 1.6426500111532456, + "grad_norm": 0.18473738431930542, + "learning_rate": 8.619430501031106e-06, + "loss": 0.4364, + "step": 7364 + }, + { + "epoch": 1.6428730760651349, + "grad_norm": 0.18806539475917816, + "learning_rate": 8.617099645824509e-06, + "loss": 0.4498, + "step": 7365 + }, + { + "epoch": 1.6430961409770242, + "grad_norm": 0.1734282672405243, + "learning_rate": 8.614768867211634e-06, + "loss": 0.4782, + "step": 7366 + }, + { + "epoch": 1.6433192058889137, + "grad_norm": 0.183370441198349, + "learning_rate": 8.612438165321571e-06, + "loss": 0.4892, + "step": 7367 + }, + { + "epoch": 1.6435422708008032, + "grad_norm": 0.1682674139738083, + "learning_rate": 8.610107540283411e-06, + "loss": 0.4256, + "step": 7368 + }, + { + "epoch": 1.6437653357126925, + "grad_norm": 0.17661243677139282, + "learning_rate": 8.607776992226238e-06, + "loss": 0.4693, + "step": 7369 + }, + { + "epoch": 1.6439884006245817, + "grad_norm": 0.2739085257053375, + "learning_rate": 8.605446521279132e-06, + "loss": 0.461, + "step": 7370 + }, + { + "epoch": 1.644211465536471, + "grad_norm": 0.1764136105775833, + "learning_rate": 8.60311612757117e-06, + "loss": 0.4833, + "step": 7371 + }, + { + "epoch": 1.6444345304483605, + "grad_norm": 0.17037895321846008, + "learning_rate": 8.600785811231426e-06, + "loss": 0.474, + "step": 7372 + }, + { + "epoch": 1.6446575953602498, + "grad_norm": 0.17914576828479767, + "learning_rate": 8.598455572388961e-06, + "loss": 0.4404, + "step": 7373 + }, + { + "epoch": 1.6448806602721393, + "grad_norm": 0.1834004670381546, + "learning_rate": 8.596125411172846e-06, + "loss": 0.4738, + "step": 7374 + }, + { + "epoch": 1.6451037251840286, + "grad_norm": 0.542524516582489, + "learning_rate": 8.593795327712135e-06, + "loss": 0.4519, + "step": 7375 + }, + { + "epoch": 1.6453267900959179, + "grad_norm": 0.17706745862960815, + "learning_rate": 8.591465322135886e-06, + "loss": 0.47, + "step": 7376 + }, + { + "epoch": 1.6455498550078071, + "grad_norm": 0.18335482478141785, + "learning_rate": 8.589135394573146e-06, + "loss": 0.4763, + "step": 7377 + }, + { + "epoch": 1.6457729199196967, + "grad_norm": 0.19544199109077454, + "learning_rate": 8.586805545152962e-06, + "loss": 0.4459, + "step": 7378 + }, + { + "epoch": 1.6459959848315862, + "grad_norm": 0.19434663653373718, + "learning_rate": 8.584475774004374e-06, + "loss": 0.4611, + "step": 7379 + }, + { + "epoch": 1.6462190497434754, + "grad_norm": 0.17542743682861328, + "learning_rate": 8.582146081256428e-06, + "loss": 0.4435, + "step": 7380 + }, + { + "epoch": 1.6464421146553647, + "grad_norm": 0.17821446061134338, + "learning_rate": 8.579816467038144e-06, + "loss": 0.4674, + "step": 7381 + }, + { + "epoch": 1.646665179567254, + "grad_norm": 0.16441945731639862, + "learning_rate": 8.577486931478563e-06, + "loss": 0.4263, + "step": 7382 + }, + { + "epoch": 1.6468882444791433, + "grad_norm": 0.17425528168678284, + "learning_rate": 8.5751574747067e-06, + "loss": 0.4491, + "step": 7383 + }, + { + "epoch": 1.6471113093910328, + "grad_norm": 0.17746862769126892, + "learning_rate": 8.572828096851577e-06, + "loss": 0.4633, + "step": 7384 + }, + { + "epoch": 1.6473343743029223, + "grad_norm": 0.16490298509597778, + "learning_rate": 8.570498798042217e-06, + "loss": 0.4116, + "step": 7385 + }, + { + "epoch": 1.6475574392148116, + "grad_norm": 0.17692579329013824, + "learning_rate": 8.568169578407624e-06, + "loss": 0.4585, + "step": 7386 + }, + { + "epoch": 1.6477805041267009, + "grad_norm": 0.18186244368553162, + "learning_rate": 8.565840438076805e-06, + "loss": 0.4792, + "step": 7387 + }, + { + "epoch": 1.6480035690385901, + "grad_norm": 0.16964338719844818, + "learning_rate": 8.563511377178764e-06, + "loss": 0.4254, + "step": 7388 + }, + { + "epoch": 1.6482266339504796, + "grad_norm": 0.17917568981647491, + "learning_rate": 8.561182395842497e-06, + "loss": 0.4823, + "step": 7389 + }, + { + "epoch": 1.648449698862369, + "grad_norm": 0.1826728731393814, + "learning_rate": 8.558853494197e-06, + "loss": 0.4297, + "step": 7390 + }, + { + "epoch": 1.6486727637742584, + "grad_norm": 0.17334018647670746, + "learning_rate": 8.556524672371263e-06, + "loss": 0.4342, + "step": 7391 + }, + { + "epoch": 1.6488958286861477, + "grad_norm": 0.17298614978790283, + "learning_rate": 8.55419593049427e-06, + "loss": 0.4605, + "step": 7392 + }, + { + "epoch": 1.649118893598037, + "grad_norm": 0.1751953512430191, + "learning_rate": 8.551867268694999e-06, + "loss": 0.4426, + "step": 7393 + }, + { + "epoch": 1.6493419585099263, + "grad_norm": 0.17890918254852295, + "learning_rate": 8.54953868710243e-06, + "loss": 0.4479, + "step": 7394 + }, + { + "epoch": 1.6495650234218158, + "grad_norm": 0.17455795407295227, + "learning_rate": 8.54721018584553e-06, + "loss": 0.4452, + "step": 7395 + }, + { + "epoch": 1.6497880883337053, + "grad_norm": 0.1724436730146408, + "learning_rate": 8.54488176505327e-06, + "loss": 0.4326, + "step": 7396 + }, + { + "epoch": 1.6500111532455946, + "grad_norm": 0.17377914488315582, + "learning_rate": 8.542553424854608e-06, + "loss": 0.4589, + "step": 7397 + }, + { + "epoch": 1.6502342181574838, + "grad_norm": 0.17500680685043335, + "learning_rate": 8.540225165378509e-06, + "loss": 0.432, + "step": 7398 + }, + { + "epoch": 1.6504572830693731, + "grad_norm": 0.17203043401241302, + "learning_rate": 8.53789698675392e-06, + "loss": 0.4801, + "step": 7399 + }, + { + "epoch": 1.6506803479812624, + "grad_norm": 0.17373989522457123, + "learning_rate": 8.535568889109794e-06, + "loss": 0.4431, + "step": 7400 + }, + { + "epoch": 1.650903412893152, + "grad_norm": 0.1792883723974228, + "learning_rate": 8.533240872575073e-06, + "loss": 0.4531, + "step": 7401 + }, + { + "epoch": 1.6511264778050414, + "grad_norm": 0.17690402269363403, + "learning_rate": 8.530912937278702e-06, + "loss": 0.4548, + "step": 7402 + }, + { + "epoch": 1.6513495427169307, + "grad_norm": 0.17443451285362244, + "learning_rate": 8.52858508334961e-06, + "loss": 0.4478, + "step": 7403 + }, + { + "epoch": 1.65157260762882, + "grad_norm": 0.1743832528591156, + "learning_rate": 8.526257310916734e-06, + "loss": 0.4732, + "step": 7404 + }, + { + "epoch": 1.6517956725407092, + "grad_norm": 0.1756555140018463, + "learning_rate": 8.523929620108996e-06, + "loss": 0.4439, + "step": 7405 + }, + { + "epoch": 1.6520187374525988, + "grad_norm": 0.19035564363002777, + "learning_rate": 8.521602011055324e-06, + "loss": 0.4677, + "step": 7406 + }, + { + "epoch": 1.652241802364488, + "grad_norm": 0.18924179673194885, + "learning_rate": 8.519274483884627e-06, + "loss": 0.463, + "step": 7407 + }, + { + "epoch": 1.6524648672763775, + "grad_norm": 0.18173234164714813, + "learning_rate": 8.516947038725826e-06, + "loss": 0.4318, + "step": 7408 + }, + { + "epoch": 1.6526879321882668, + "grad_norm": 0.175716370344162, + "learning_rate": 8.514619675707828e-06, + "loss": 0.4292, + "step": 7409 + }, + { + "epoch": 1.652910997100156, + "grad_norm": 0.17146100103855133, + "learning_rate": 8.512292394959533e-06, + "loss": 0.4241, + "step": 7410 + }, + { + "epoch": 1.6531340620120454, + "grad_norm": 0.17284603416919708, + "learning_rate": 8.509965196609846e-06, + "loss": 0.4442, + "step": 7411 + }, + { + "epoch": 1.6533571269239349, + "grad_norm": 0.18271994590759277, + "learning_rate": 8.507638080787657e-06, + "loss": 0.451, + "step": 7412 + }, + { + "epoch": 1.6535801918358244, + "grad_norm": 0.17891302704811096, + "learning_rate": 8.50531104762186e-06, + "loss": 0.4496, + "step": 7413 + }, + { + "epoch": 1.6538032567477137, + "grad_norm": 0.19851596653461456, + "learning_rate": 8.502984097241338e-06, + "loss": 0.4715, + "step": 7414 + }, + { + "epoch": 1.654026321659603, + "grad_norm": 0.18337850272655487, + "learning_rate": 8.500657229774975e-06, + "loss": 0.439, + "step": 7415 + }, + { + "epoch": 1.6542493865714922, + "grad_norm": 0.18898166716098785, + "learning_rate": 8.498330445351643e-06, + "loss": 0.4641, + "step": 7416 + }, + { + "epoch": 1.6544724514833815, + "grad_norm": 0.18804045021533966, + "learning_rate": 8.49600374410022e-06, + "loss": 0.4488, + "step": 7417 + }, + { + "epoch": 1.654695516395271, + "grad_norm": 0.18279941380023956, + "learning_rate": 8.493677126149569e-06, + "loss": 0.4395, + "step": 7418 + }, + { + "epoch": 1.6549185813071605, + "grad_norm": 0.18200235068798065, + "learning_rate": 8.491350591628554e-06, + "loss": 0.4475, + "step": 7419 + }, + { + "epoch": 1.6551416462190498, + "grad_norm": 0.1727295070886612, + "learning_rate": 8.489024140666032e-06, + "loss": 0.4627, + "step": 7420 + }, + { + "epoch": 1.655364711130939, + "grad_norm": 0.17785604298114777, + "learning_rate": 8.486697773390859e-06, + "loss": 0.4605, + "step": 7421 + }, + { + "epoch": 1.6555877760428284, + "grad_norm": 0.18455497920513153, + "learning_rate": 8.48437148993188e-06, + "loss": 0.4463, + "step": 7422 + }, + { + "epoch": 1.6558108409547179, + "grad_norm": 0.17284157872200012, + "learning_rate": 8.482045290417946e-06, + "loss": 0.4735, + "step": 7423 + }, + { + "epoch": 1.6560339058666071, + "grad_norm": 0.1757441759109497, + "learning_rate": 8.479719174977887e-06, + "loss": 0.4296, + "step": 7424 + }, + { + "epoch": 1.6562569707784967, + "grad_norm": 0.17652331292629242, + "learning_rate": 8.477393143740546e-06, + "loss": 0.4289, + "step": 7425 + }, + { + "epoch": 1.656480035690386, + "grad_norm": 0.17844471335411072, + "learning_rate": 8.475067196834749e-06, + "loss": 0.4624, + "step": 7426 + }, + { + "epoch": 1.6567031006022752, + "grad_norm": 0.1853104829788208, + "learning_rate": 8.472741334389322e-06, + "loss": 0.444, + "step": 7427 + }, + { + "epoch": 1.6569261655141645, + "grad_norm": 0.20148655772209167, + "learning_rate": 8.47041555653309e-06, + "loss": 0.4412, + "step": 7428 + }, + { + "epoch": 1.657149230426054, + "grad_norm": 0.17949336767196655, + "learning_rate": 8.468089863394864e-06, + "loss": 0.4799, + "step": 7429 + }, + { + "epoch": 1.6573722953379435, + "grad_norm": 0.1833929866552353, + "learning_rate": 8.465764255103457e-06, + "loss": 0.459, + "step": 7430 + }, + { + "epoch": 1.6575953602498328, + "grad_norm": 0.16851405799388885, + "learning_rate": 8.463438731787677e-06, + "loss": 0.4509, + "step": 7431 + }, + { + "epoch": 1.657818425161722, + "grad_norm": 0.17039094865322113, + "learning_rate": 8.461113293576325e-06, + "loss": 0.4591, + "step": 7432 + }, + { + "epoch": 1.6580414900736113, + "grad_norm": 0.1713404804468155, + "learning_rate": 8.4587879405982e-06, + "loss": 0.4469, + "step": 7433 + }, + { + "epoch": 1.6582645549855006, + "grad_norm": 0.1776912361383438, + "learning_rate": 8.456462672982092e-06, + "loss": 0.4526, + "step": 7434 + }, + { + "epoch": 1.6584876198973901, + "grad_norm": 0.1807934045791626, + "learning_rate": 8.45413749085679e-06, + "loss": 0.4689, + "step": 7435 + }, + { + "epoch": 1.6587106848092796, + "grad_norm": 0.17492172122001648, + "learning_rate": 8.451812394351078e-06, + "loss": 0.4605, + "step": 7436 + }, + { + "epoch": 1.658933749721169, + "grad_norm": 0.1770438402891159, + "learning_rate": 8.449487383593734e-06, + "loss": 0.4603, + "step": 7437 + }, + { + "epoch": 1.6591568146330582, + "grad_norm": 0.18041646480560303, + "learning_rate": 8.447162458713534e-06, + "loss": 0.4534, + "step": 7438 + }, + { + "epoch": 1.6593798795449475, + "grad_norm": 0.1849108189344406, + "learning_rate": 8.444837619839243e-06, + "loss": 0.4547, + "step": 7439 + }, + { + "epoch": 1.659602944456837, + "grad_norm": 0.18359196186065674, + "learning_rate": 8.442512867099627e-06, + "loss": 0.4545, + "step": 7440 + }, + { + "epoch": 1.6598260093687263, + "grad_norm": 0.19102846086025238, + "learning_rate": 8.440188200623445e-06, + "loss": 0.4257, + "step": 7441 + }, + { + "epoch": 1.6600490742806158, + "grad_norm": 0.1707092970609665, + "learning_rate": 8.437863620539454e-06, + "loss": 0.4307, + "step": 7442 + }, + { + "epoch": 1.660272139192505, + "grad_norm": 0.18710510432720184, + "learning_rate": 8.4355391269764e-06, + "loss": 0.4453, + "step": 7443 + }, + { + "epoch": 1.6604952041043943, + "grad_norm": 0.18393000960350037, + "learning_rate": 8.43321472006303e-06, + "loss": 0.448, + "step": 7444 + }, + { + "epoch": 1.6607182690162836, + "grad_norm": 0.16592559218406677, + "learning_rate": 8.430890399928085e-06, + "loss": 0.4243, + "step": 7445 + }, + { + "epoch": 1.6609413339281731, + "grad_norm": 0.16804459691047668, + "learning_rate": 8.4285661667003e-06, + "loss": 0.4357, + "step": 7446 + }, + { + "epoch": 1.6611643988400626, + "grad_norm": 0.1712721884250641, + "learning_rate": 8.426242020508405e-06, + "loss": 0.4403, + "step": 7447 + }, + { + "epoch": 1.661387463751952, + "grad_norm": 0.17568182945251465, + "learning_rate": 8.423917961481124e-06, + "loss": 0.4489, + "step": 7448 + }, + { + "epoch": 1.6616105286638412, + "grad_norm": 0.16803157329559326, + "learning_rate": 8.421593989747184e-06, + "loss": 0.4293, + "step": 7449 + }, + { + "epoch": 1.6618335935757305, + "grad_norm": 0.18901732563972473, + "learning_rate": 8.419270105435294e-06, + "loss": 0.4637, + "step": 7450 + }, + { + "epoch": 1.6620566584876197, + "grad_norm": 0.18891791999340057, + "learning_rate": 8.416946308674173e-06, + "loss": 0.4602, + "step": 7451 + }, + { + "epoch": 1.6622797233995092, + "grad_norm": 0.17820779979228973, + "learning_rate": 8.414622599592518e-06, + "loss": 0.4555, + "step": 7452 + }, + { + "epoch": 1.6625027883113987, + "grad_norm": 0.18127723038196564, + "learning_rate": 8.41229897831904e-06, + "loss": 0.4512, + "step": 7453 + }, + { + "epoch": 1.662725853223288, + "grad_norm": 0.18046575784683228, + "learning_rate": 8.40997544498243e-06, + "loss": 0.4635, + "step": 7454 + }, + { + "epoch": 1.6629489181351773, + "grad_norm": 0.1841399371623993, + "learning_rate": 8.407651999711383e-06, + "loss": 0.4605, + "step": 7455 + }, + { + "epoch": 1.6631719830470666, + "grad_norm": 0.17511752247810364, + "learning_rate": 8.405328642634582e-06, + "loss": 0.4308, + "step": 7456 + }, + { + "epoch": 1.663395047958956, + "grad_norm": 0.17428848147392273, + "learning_rate": 8.403005373880713e-06, + "loss": 0.4426, + "step": 7457 + }, + { + "epoch": 1.6636181128708454, + "grad_norm": 0.18597403168678284, + "learning_rate": 8.400682193578451e-06, + "loss": 0.4476, + "step": 7458 + }, + { + "epoch": 1.6638411777827349, + "grad_norm": 0.17569150030612946, + "learning_rate": 8.398359101856471e-06, + "loss": 0.4161, + "step": 7459 + }, + { + "epoch": 1.6640642426946242, + "grad_norm": 0.17648978531360626, + "learning_rate": 8.396036098843438e-06, + "loss": 0.4596, + "step": 7460 + }, + { + "epoch": 1.6642873076065134, + "grad_norm": 0.1786419153213501, + "learning_rate": 8.393713184668015e-06, + "loss": 0.4623, + "step": 7461 + }, + { + "epoch": 1.6645103725184027, + "grad_norm": 0.18259434401988983, + "learning_rate": 8.391390359458858e-06, + "loss": 0.4703, + "step": 7462 + }, + { + "epoch": 1.6647334374302922, + "grad_norm": 0.18008796870708466, + "learning_rate": 8.389067623344625e-06, + "loss": 0.464, + "step": 7463 + }, + { + "epoch": 1.6649565023421817, + "grad_norm": 0.20690587162971497, + "learning_rate": 8.386744976453958e-06, + "loss": 0.457, + "step": 7464 + }, + { + "epoch": 1.665179567254071, + "grad_norm": 0.21626955270767212, + "learning_rate": 8.384422418915503e-06, + "loss": 0.4171, + "step": 7465 + }, + { + "epoch": 1.6654026321659603, + "grad_norm": 0.16983038187026978, + "learning_rate": 8.382099950857895e-06, + "loss": 0.434, + "step": 7466 + }, + { + "epoch": 1.6656256970778496, + "grad_norm": 0.18319158256053925, + "learning_rate": 8.379777572409771e-06, + "loss": 0.4808, + "step": 7467 + }, + { + "epoch": 1.6658487619897389, + "grad_norm": 0.2683338522911072, + "learning_rate": 8.377455283699758e-06, + "loss": 0.4772, + "step": 7468 + }, + { + "epoch": 1.6660718269016284, + "grad_norm": 0.18146146833896637, + "learning_rate": 8.375133084856475e-06, + "loss": 0.4226, + "step": 7469 + }, + { + "epoch": 1.6662948918135179, + "grad_norm": 0.17817704379558563, + "learning_rate": 8.372810976008543e-06, + "loss": 0.4272, + "step": 7470 + }, + { + "epoch": 1.6665179567254071, + "grad_norm": 0.18885549902915955, + "learning_rate": 8.370488957284574e-06, + "loss": 0.4722, + "step": 7471 + }, + { + "epoch": 1.6667410216372964, + "grad_norm": 0.18117155134677887, + "learning_rate": 8.368167028813176e-06, + "loss": 0.4377, + "step": 7472 + }, + { + "epoch": 1.6669640865491857, + "grad_norm": 0.19117340445518494, + "learning_rate": 8.365845190722955e-06, + "loss": 0.4563, + "step": 7473 + }, + { + "epoch": 1.6671871514610752, + "grad_norm": 0.1751892864704132, + "learning_rate": 8.363523443142503e-06, + "loss": 0.4433, + "step": 7474 + }, + { + "epoch": 1.6674102163729645, + "grad_norm": 0.18443763256072998, + "learning_rate": 8.36120178620042e-06, + "loss": 0.439, + "step": 7475 + }, + { + "epoch": 1.667633281284854, + "grad_norm": 0.18026113510131836, + "learning_rate": 8.358880220025288e-06, + "loss": 0.4275, + "step": 7476 + }, + { + "epoch": 1.6678563461967433, + "grad_norm": 0.17562150955200195, + "learning_rate": 8.356558744745695e-06, + "loss": 0.4685, + "step": 7477 + }, + { + "epoch": 1.6680794111086326, + "grad_norm": 0.1727527379989624, + "learning_rate": 8.354237360490212e-06, + "loss": 0.4602, + "step": 7478 + }, + { + "epoch": 1.6683024760205218, + "grad_norm": 0.1706780195236206, + "learning_rate": 8.351916067387421e-06, + "loss": 0.4431, + "step": 7479 + }, + { + "epoch": 1.6685255409324113, + "grad_norm": 0.19286386668682098, + "learning_rate": 8.349594865565882e-06, + "loss": 0.4496, + "step": 7480 + }, + { + "epoch": 1.6687486058443008, + "grad_norm": 0.2684648931026459, + "learning_rate": 8.347273755154164e-06, + "loss": 0.4839, + "step": 7481 + }, + { + "epoch": 1.6689716707561901, + "grad_norm": 0.1720086932182312, + "learning_rate": 8.344952736280819e-06, + "loss": 0.4253, + "step": 7482 + }, + { + "epoch": 1.6691947356680794, + "grad_norm": 0.17519108951091766, + "learning_rate": 8.342631809074403e-06, + "loss": 0.4363, + "step": 7483 + }, + { + "epoch": 1.6694178005799687, + "grad_norm": 0.18655794858932495, + "learning_rate": 8.340310973663461e-06, + "loss": 0.4811, + "step": 7484 + }, + { + "epoch": 1.669640865491858, + "grad_norm": 0.16531935334205627, + "learning_rate": 8.33799023017654e-06, + "loss": 0.431, + "step": 7485 + }, + { + "epoch": 1.6698639304037475, + "grad_norm": 0.2258535474538803, + "learning_rate": 8.335669578742172e-06, + "loss": 0.4407, + "step": 7486 + }, + { + "epoch": 1.670086995315637, + "grad_norm": 0.17526793479919434, + "learning_rate": 8.333349019488893e-06, + "loss": 0.4531, + "step": 7487 + }, + { + "epoch": 1.6703100602275263, + "grad_norm": 0.17840759456157684, + "learning_rate": 8.331028552545228e-06, + "loss": 0.4374, + "step": 7488 + }, + { + "epoch": 1.6705331251394155, + "grad_norm": 0.1826542466878891, + "learning_rate": 8.328708178039702e-06, + "loss": 0.4881, + "step": 7489 + }, + { + "epoch": 1.6707561900513048, + "grad_norm": 0.17568182945251465, + "learning_rate": 8.326387896100827e-06, + "loss": 0.4607, + "step": 7490 + }, + { + "epoch": 1.6709792549631943, + "grad_norm": 0.1761264204978943, + "learning_rate": 8.324067706857121e-06, + "loss": 0.4399, + "step": 7491 + }, + { + "epoch": 1.6712023198750836, + "grad_norm": 0.16690705716609955, + "learning_rate": 8.321747610437084e-06, + "loss": 0.4417, + "step": 7492 + }, + { + "epoch": 1.6714253847869731, + "grad_norm": 0.17830541729927063, + "learning_rate": 8.319427606969223e-06, + "loss": 0.4437, + "step": 7493 + }, + { + "epoch": 1.6716484496988624, + "grad_norm": 0.271932452917099, + "learning_rate": 8.317107696582031e-06, + "loss": 0.4486, + "step": 7494 + }, + { + "epoch": 1.6718715146107517, + "grad_norm": 0.19442439079284668, + "learning_rate": 8.314787879404002e-06, + "loss": 0.474, + "step": 7495 + }, + { + "epoch": 1.672094579522641, + "grad_norm": 0.1841454654932022, + "learning_rate": 8.312468155563623e-06, + "loss": 0.4689, + "step": 7496 + }, + { + "epoch": 1.6723176444345305, + "grad_norm": 0.18820422887802124, + "learning_rate": 8.310148525189367e-06, + "loss": 0.4349, + "step": 7497 + }, + { + "epoch": 1.67254070934642, + "grad_norm": 0.17644047737121582, + "learning_rate": 8.30782898840972e-06, + "loss": 0.4665, + "step": 7498 + }, + { + "epoch": 1.6727637742583092, + "grad_norm": 0.17768198251724243, + "learning_rate": 8.305509545353144e-06, + "loss": 0.4551, + "step": 7499 + }, + { + "epoch": 1.6729868391701985, + "grad_norm": 0.18405990302562714, + "learning_rate": 8.303190196148112e-06, + "loss": 0.4434, + "step": 7500 + }, + { + "epoch": 1.6732099040820878, + "grad_norm": 0.17325492203235626, + "learning_rate": 8.300870940923077e-06, + "loss": 0.4501, + "step": 7501 + }, + { + "epoch": 1.673432968993977, + "grad_norm": 0.18856076896190643, + "learning_rate": 8.2985517798065e-06, + "loss": 0.4431, + "step": 7502 + }, + { + "epoch": 1.6736560339058666, + "grad_norm": 0.1935485303401947, + "learning_rate": 8.296232712926826e-06, + "loss": 0.4648, + "step": 7503 + }, + { + "epoch": 1.673879098817756, + "grad_norm": 0.17944234609603882, + "learning_rate": 8.293913740412503e-06, + "loss": 0.4561, + "step": 7504 + }, + { + "epoch": 1.6741021637296454, + "grad_norm": 0.17433197796344757, + "learning_rate": 8.291594862391966e-06, + "loss": 0.4551, + "step": 7505 + }, + { + "epoch": 1.6743252286415347, + "grad_norm": 0.18297721445560455, + "learning_rate": 8.289276078993655e-06, + "loss": 0.4609, + "step": 7506 + }, + { + "epoch": 1.674548293553424, + "grad_norm": 0.17488986253738403, + "learning_rate": 8.286957390345994e-06, + "loss": 0.4294, + "step": 7507 + }, + { + "epoch": 1.6747713584653134, + "grad_norm": 0.1750224232673645, + "learning_rate": 8.28463879657741e-06, + "loss": 0.4609, + "step": 7508 + }, + { + "epoch": 1.6749944233772027, + "grad_norm": 0.18751104176044464, + "learning_rate": 8.282320297816315e-06, + "loss": 0.4519, + "step": 7509 + }, + { + "epoch": 1.6752174882890922, + "grad_norm": 0.17194952070713043, + "learning_rate": 8.280001894191132e-06, + "loss": 0.4437, + "step": 7510 + }, + { + "epoch": 1.6754405532009815, + "grad_norm": 0.17177167534828186, + "learning_rate": 8.277683585830259e-06, + "loss": 0.4455, + "step": 7511 + }, + { + "epoch": 1.6756636181128708, + "grad_norm": 0.17539672553539276, + "learning_rate": 8.275365372862106e-06, + "loss": 0.4598, + "step": 7512 + }, + { + "epoch": 1.67588668302476, + "grad_norm": 0.17903223633766174, + "learning_rate": 8.273047255415066e-06, + "loss": 0.4473, + "step": 7513 + }, + { + "epoch": 1.6761097479366496, + "grad_norm": 0.17606431245803833, + "learning_rate": 8.27072923361753e-06, + "loss": 0.4592, + "step": 7514 + }, + { + "epoch": 1.676332812848539, + "grad_norm": 0.1900903880596161, + "learning_rate": 8.26841130759789e-06, + "loss": 0.4735, + "step": 7515 + }, + { + "epoch": 1.6765558777604284, + "grad_norm": 0.18751583993434906, + "learning_rate": 8.26609347748452e-06, + "loss": 0.4795, + "step": 7516 + }, + { + "epoch": 1.6767789426723176, + "grad_norm": 0.18636786937713623, + "learning_rate": 8.263775743405804e-06, + "loss": 0.4474, + "step": 7517 + }, + { + "epoch": 1.677002007584207, + "grad_norm": 0.1759779453277588, + "learning_rate": 8.261458105490104e-06, + "loss": 0.4613, + "step": 7518 + }, + { + "epoch": 1.6772250724960962, + "grad_norm": 0.16955047845840454, + "learning_rate": 8.259140563865796e-06, + "loss": 0.4526, + "step": 7519 + }, + { + "epoch": 1.6774481374079857, + "grad_norm": 0.18776053190231323, + "learning_rate": 8.256823118661228e-06, + "loss": 0.4265, + "step": 7520 + }, + { + "epoch": 1.6776712023198752, + "grad_norm": 0.17667481303215027, + "learning_rate": 8.254505770004764e-06, + "loss": 0.4782, + "step": 7521 + }, + { + "epoch": 1.6778942672317645, + "grad_norm": 0.17100000381469727, + "learning_rate": 8.252188518024748e-06, + "loss": 0.464, + "step": 7522 + }, + { + "epoch": 1.6781173321436538, + "grad_norm": 0.16992557048797607, + "learning_rate": 8.24987136284953e-06, + "loss": 0.4451, + "step": 7523 + }, + { + "epoch": 1.678340397055543, + "grad_norm": 0.18691998720169067, + "learning_rate": 8.247554304607442e-06, + "loss": 0.4701, + "step": 7524 + }, + { + "epoch": 1.6785634619674326, + "grad_norm": 0.16606706380844116, + "learning_rate": 8.24523734342682e-06, + "loss": 0.4341, + "step": 7525 + }, + { + "epoch": 1.6787865268793218, + "grad_norm": 0.16912147402763367, + "learning_rate": 8.242920479435991e-06, + "loss": 0.4381, + "step": 7526 + }, + { + "epoch": 1.6790095917912113, + "grad_norm": 0.1704743504524231, + "learning_rate": 8.24060371276328e-06, + "loss": 0.4458, + "step": 7527 + }, + { + "epoch": 1.6792326567031006, + "grad_norm": 0.18589746952056885, + "learning_rate": 8.238287043537e-06, + "loss": 0.4349, + "step": 7528 + }, + { + "epoch": 1.67945572161499, + "grad_norm": 0.17729128897190094, + "learning_rate": 8.235970471885468e-06, + "loss": 0.446, + "step": 7529 + }, + { + "epoch": 1.6796787865268792, + "grad_norm": 0.17116542160511017, + "learning_rate": 8.233653997936985e-06, + "loss": 0.42, + "step": 7530 + }, + { + "epoch": 1.6799018514387687, + "grad_norm": 0.1848667562007904, + "learning_rate": 8.231337621819858e-06, + "loss": 0.4622, + "step": 7531 + }, + { + "epoch": 1.6801249163506582, + "grad_norm": 0.16781508922576904, + "learning_rate": 8.229021343662376e-06, + "loss": 0.4326, + "step": 7532 + }, + { + "epoch": 1.6803479812625475, + "grad_norm": 0.1825597584247589, + "learning_rate": 8.22670516359283e-06, + "loss": 0.4471, + "step": 7533 + }, + { + "epoch": 1.6805710461744368, + "grad_norm": 0.17909115552902222, + "learning_rate": 8.22438908173951e-06, + "loss": 0.464, + "step": 7534 + }, + { + "epoch": 1.680794111086326, + "grad_norm": 0.16868308186531067, + "learning_rate": 8.22207309823069e-06, + "loss": 0.4446, + "step": 7535 + }, + { + "epoch": 1.6810171759982153, + "grad_norm": 0.17775742709636688, + "learning_rate": 8.219757213194647e-06, + "loss": 0.4638, + "step": 7536 + }, + { + "epoch": 1.6812402409101048, + "grad_norm": 0.1721707284450531, + "learning_rate": 8.217441426759645e-06, + "loss": 0.4598, + "step": 7537 + }, + { + "epoch": 1.6814633058219943, + "grad_norm": 0.17508956789970398, + "learning_rate": 8.215125739053953e-06, + "loss": 0.4804, + "step": 7538 + }, + { + "epoch": 1.6816863707338836, + "grad_norm": 0.18540501594543457, + "learning_rate": 8.21281015020582e-06, + "loss": 0.457, + "step": 7539 + }, + { + "epoch": 1.6819094356457729, + "grad_norm": 0.19119501113891602, + "learning_rate": 8.210494660343508e-06, + "loss": 0.4486, + "step": 7540 + }, + { + "epoch": 1.6821325005576622, + "grad_norm": 0.18080313503742218, + "learning_rate": 8.208179269595255e-06, + "loss": 0.4727, + "step": 7541 + }, + { + "epoch": 1.6823555654695517, + "grad_norm": 0.192534402012825, + "learning_rate": 8.205863978089308e-06, + "loss": 0.4712, + "step": 7542 + }, + { + "epoch": 1.682578630381441, + "grad_norm": 0.16961976885795593, + "learning_rate": 8.203548785953896e-06, + "loss": 0.4169, + "step": 7543 + }, + { + "epoch": 1.6828016952933305, + "grad_norm": 0.17554304003715515, + "learning_rate": 8.201233693317254e-06, + "loss": 0.4448, + "step": 7544 + }, + { + "epoch": 1.6830247602052197, + "grad_norm": 0.1828324943780899, + "learning_rate": 8.198918700307604e-06, + "loss": 0.4758, + "step": 7545 + }, + { + "epoch": 1.683247825117109, + "grad_norm": 0.1798938661813736, + "learning_rate": 8.196603807053167e-06, + "loss": 0.4567, + "step": 7546 + }, + { + "epoch": 1.6834708900289983, + "grad_norm": 0.178616002202034, + "learning_rate": 8.194289013682154e-06, + "loss": 0.4547, + "step": 7547 + }, + { + "epoch": 1.6836939549408878, + "grad_norm": 0.20149093866348267, + "learning_rate": 8.191974320322776e-06, + "loss": 0.4427, + "step": 7548 + }, + { + "epoch": 1.6839170198527773, + "grad_norm": 0.17913025617599487, + "learning_rate": 8.189659727103233e-06, + "loss": 0.4371, + "step": 7549 + }, + { + "epoch": 1.6841400847646666, + "grad_norm": 0.18823112547397614, + "learning_rate": 8.18734523415172e-06, + "loss": 0.4548, + "step": 7550 + }, + { + "epoch": 1.6843631496765559, + "grad_norm": 0.17050491273403168, + "learning_rate": 8.185030841596431e-06, + "loss": 0.4481, + "step": 7551 + }, + { + "epoch": 1.6845862145884452, + "grad_norm": 0.1750423163175583, + "learning_rate": 8.182716549565548e-06, + "loss": 0.4527, + "step": 7552 + }, + { + "epoch": 1.6848092795003344, + "grad_norm": 0.17691361904144287, + "learning_rate": 8.180402358187256e-06, + "loss": 0.4564, + "step": 7553 + }, + { + "epoch": 1.685032344412224, + "grad_norm": 0.17147858440876007, + "learning_rate": 8.178088267589725e-06, + "loss": 0.4572, + "step": 7554 + }, + { + "epoch": 1.6852554093241134, + "grad_norm": 0.18109926581382751, + "learning_rate": 8.175774277901128e-06, + "loss": 0.4387, + "step": 7555 + }, + { + "epoch": 1.6854784742360027, + "grad_norm": 0.17862330377101898, + "learning_rate": 8.173460389249625e-06, + "loss": 0.4453, + "step": 7556 + }, + { + "epoch": 1.685701539147892, + "grad_norm": 0.18046163022518158, + "learning_rate": 8.171146601763374e-06, + "loss": 0.4471, + "step": 7557 + }, + { + "epoch": 1.6859246040597813, + "grad_norm": 0.172270730137825, + "learning_rate": 8.168832915570531e-06, + "loss": 0.4167, + "step": 7558 + }, + { + "epoch": 1.6861476689716708, + "grad_norm": 0.17891794443130493, + "learning_rate": 8.166519330799237e-06, + "loss": 0.4455, + "step": 7559 + }, + { + "epoch": 1.68637073388356, + "grad_norm": 0.1716044843196869, + "learning_rate": 8.16420584757764e-06, + "loss": 0.4451, + "step": 7560 + }, + { + "epoch": 1.6865937987954496, + "grad_norm": 0.17391957342624664, + "learning_rate": 8.161892466033865e-06, + "loss": 0.4543, + "step": 7561 + }, + { + "epoch": 1.6868168637073389, + "grad_norm": 0.17183732986450195, + "learning_rate": 8.159579186296052e-06, + "loss": 0.4653, + "step": 7562 + }, + { + "epoch": 1.6870399286192281, + "grad_norm": 0.19677892327308655, + "learning_rate": 8.157266008492318e-06, + "loss": 0.4543, + "step": 7563 + }, + { + "epoch": 1.6872629935311174, + "grad_norm": 0.18332447111606598, + "learning_rate": 8.154952932750784e-06, + "loss": 0.4407, + "step": 7564 + }, + { + "epoch": 1.687486058443007, + "grad_norm": 0.1903562992811203, + "learning_rate": 8.152639959199561e-06, + "loss": 0.4464, + "step": 7565 + }, + { + "epoch": 1.6877091233548964, + "grad_norm": 0.22077789902687073, + "learning_rate": 8.150327087966761e-06, + "loss": 0.4941, + "step": 7566 + }, + { + "epoch": 1.6879321882667857, + "grad_norm": 0.172964408993721, + "learning_rate": 8.148014319180479e-06, + "loss": 0.4584, + "step": 7567 + }, + { + "epoch": 1.688155253178675, + "grad_norm": 0.16977094113826752, + "learning_rate": 8.145701652968814e-06, + "loss": 0.4347, + "step": 7568 + }, + { + "epoch": 1.6883783180905643, + "grad_norm": 0.1772548258304596, + "learning_rate": 8.143389089459855e-06, + "loss": 0.4701, + "step": 7569 + }, + { + "epoch": 1.6886013830024535, + "grad_norm": 0.16908633708953857, + "learning_rate": 8.14107662878169e-06, + "loss": 0.4595, + "step": 7570 + }, + { + "epoch": 1.688824447914343, + "grad_norm": 0.18083757162094116, + "learning_rate": 8.138764271062389e-06, + "loss": 0.4919, + "step": 7571 + }, + { + "epoch": 1.6890475128262326, + "grad_norm": 0.18248751759529114, + "learning_rate": 8.136452016430035e-06, + "loss": 0.4767, + "step": 7572 + }, + { + "epoch": 1.6892705777381218, + "grad_norm": 0.1837497055530548, + "learning_rate": 8.134139865012688e-06, + "loss": 0.4462, + "step": 7573 + }, + { + "epoch": 1.6894936426500111, + "grad_norm": 0.1747952401638031, + "learning_rate": 8.131827816938412e-06, + "loss": 0.4586, + "step": 7574 + }, + { + "epoch": 1.6897167075619004, + "grad_norm": 0.16300569474697113, + "learning_rate": 8.129515872335263e-06, + "loss": 0.4319, + "step": 7575 + }, + { + "epoch": 1.68993977247379, + "grad_norm": 0.17720209062099457, + "learning_rate": 8.127204031331293e-06, + "loss": 0.4447, + "step": 7576 + }, + { + "epoch": 1.6901628373856792, + "grad_norm": 0.1850651055574417, + "learning_rate": 8.12489229405454e-06, + "loss": 0.4473, + "step": 7577 + }, + { + "epoch": 1.6903859022975687, + "grad_norm": 0.17101094126701355, + "learning_rate": 8.122580660633048e-06, + "loss": 0.4259, + "step": 7578 + }, + { + "epoch": 1.690608967209458, + "grad_norm": 0.17117485404014587, + "learning_rate": 8.12026913119485e-06, + "loss": 0.4467, + "step": 7579 + }, + { + "epoch": 1.6908320321213473, + "grad_norm": 0.18159452080726624, + "learning_rate": 8.117957705867971e-06, + "loss": 0.4606, + "step": 7580 + }, + { + "epoch": 1.6910550970332365, + "grad_norm": 0.17214803397655487, + "learning_rate": 8.115646384780434e-06, + "loss": 0.4454, + "step": 7581 + }, + { + "epoch": 1.691278161945126, + "grad_norm": 0.17703603208065033, + "learning_rate": 8.11333516806025e-06, + "loss": 0.4455, + "step": 7582 + }, + { + "epoch": 1.6915012268570155, + "grad_norm": 0.18035492300987244, + "learning_rate": 8.111024055835436e-06, + "loss": 0.4668, + "step": 7583 + }, + { + "epoch": 1.6917242917689048, + "grad_norm": 0.17585453391075134, + "learning_rate": 8.108713048233988e-06, + "loss": 0.4267, + "step": 7584 + }, + { + "epoch": 1.691947356680794, + "grad_norm": 0.184658944606781, + "learning_rate": 8.106402145383911e-06, + "loss": 0.4665, + "step": 7585 + }, + { + "epoch": 1.6921704215926834, + "grad_norm": 0.19270674884319305, + "learning_rate": 8.104091347413192e-06, + "loss": 0.4386, + "step": 7586 + }, + { + "epoch": 1.6923934865045729, + "grad_norm": 0.1800229251384735, + "learning_rate": 8.101780654449822e-06, + "loss": 0.4618, + "step": 7587 + }, + { + "epoch": 1.6926165514164622, + "grad_norm": 0.18103908002376556, + "learning_rate": 8.099470066621778e-06, + "loss": 0.4513, + "step": 7588 + }, + { + "epoch": 1.6928396163283517, + "grad_norm": 0.16606682538986206, + "learning_rate": 8.09715958405704e-06, + "loss": 0.4251, + "step": 7589 + }, + { + "epoch": 1.693062681240241, + "grad_norm": 0.17416086792945862, + "learning_rate": 8.094849206883569e-06, + "loss": 0.4473, + "step": 7590 + }, + { + "epoch": 1.6932857461521302, + "grad_norm": 0.21117763221263885, + "learning_rate": 8.092538935229336e-06, + "loss": 0.4578, + "step": 7591 + }, + { + "epoch": 1.6935088110640195, + "grad_norm": 0.17628377676010132, + "learning_rate": 8.090228769222292e-06, + "loss": 0.4716, + "step": 7592 + }, + { + "epoch": 1.693731875975909, + "grad_norm": 0.1700354665517807, + "learning_rate": 8.087918708990396e-06, + "loss": 0.4444, + "step": 7593 + }, + { + "epoch": 1.6939549408877983, + "grad_norm": 0.18636515736579895, + "learning_rate": 8.085608754661585e-06, + "loss": 0.4843, + "step": 7594 + }, + { + "epoch": 1.6941780057996878, + "grad_norm": 0.1767621785402298, + "learning_rate": 8.083298906363806e-06, + "loss": 0.4422, + "step": 7595 + }, + { + "epoch": 1.694401070711577, + "grad_norm": 0.1970623880624771, + "learning_rate": 8.080989164224988e-06, + "loss": 0.4432, + "step": 7596 + }, + { + "epoch": 1.6946241356234664, + "grad_norm": 0.1670975536108017, + "learning_rate": 8.078679528373063e-06, + "loss": 0.4353, + "step": 7597 + }, + { + "epoch": 1.6948472005353556, + "grad_norm": 0.18546359241008759, + "learning_rate": 8.076369998935951e-06, + "loss": 0.4501, + "step": 7598 + }, + { + "epoch": 1.6950702654472452, + "grad_norm": 0.17603303492069244, + "learning_rate": 8.074060576041566e-06, + "loss": 0.4451, + "step": 7599 + }, + { + "epoch": 1.6952933303591347, + "grad_norm": 0.17194673418998718, + "learning_rate": 8.071751259817825e-06, + "loss": 0.431, + "step": 7600 + }, + { + "epoch": 1.695516395271024, + "grad_norm": 0.1741739809513092, + "learning_rate": 8.069442050392625e-06, + "loss": 0.4715, + "step": 7601 + }, + { + "epoch": 1.6957394601829132, + "grad_norm": 0.24975387752056122, + "learning_rate": 8.067132947893872e-06, + "loss": 0.4369, + "step": 7602 + }, + { + "epoch": 1.6959625250948025, + "grad_norm": 0.18625226616859436, + "learning_rate": 8.064823952449449e-06, + "loss": 0.4464, + "step": 7603 + }, + { + "epoch": 1.696185590006692, + "grad_norm": 0.1777094006538391, + "learning_rate": 8.062515064187253e-06, + "loss": 0.4496, + "step": 7604 + }, + { + "epoch": 1.6964086549185813, + "grad_norm": 0.1841573417186737, + "learning_rate": 8.060206283235159e-06, + "loss": 0.4731, + "step": 7605 + }, + { + "epoch": 1.6966317198304708, + "grad_norm": 0.17774668335914612, + "learning_rate": 8.05789760972104e-06, + "loss": 0.4905, + "step": 7606 + }, + { + "epoch": 1.69685478474236, + "grad_norm": 0.16449688374996185, + "learning_rate": 8.055589043772772e-06, + "loss": 0.4542, + "step": 7607 + }, + { + "epoch": 1.6970778496542493, + "grad_norm": 0.1715865582227707, + "learning_rate": 8.053280585518211e-06, + "loss": 0.4448, + "step": 7608 + }, + { + "epoch": 1.6973009145661386, + "grad_norm": 0.18522848188877106, + "learning_rate": 8.050972235085217e-06, + "loss": 0.4821, + "step": 7609 + }, + { + "epoch": 1.6975239794780281, + "grad_norm": 0.17819975316524506, + "learning_rate": 8.048663992601641e-06, + "loss": 0.4306, + "step": 7610 + }, + { + "epoch": 1.6977470443899174, + "grad_norm": 0.1718638390302658, + "learning_rate": 8.046355858195325e-06, + "loss": 0.4349, + "step": 7611 + }, + { + "epoch": 1.697970109301807, + "grad_norm": 0.16903117299079895, + "learning_rate": 8.044047831994114e-06, + "loss": 0.4348, + "step": 7612 + }, + { + "epoch": 1.6981931742136962, + "grad_norm": 0.1886761337518692, + "learning_rate": 8.041739914125835e-06, + "loss": 0.4732, + "step": 7613 + }, + { + "epoch": 1.6984162391255855, + "grad_norm": 0.20157945156097412, + "learning_rate": 8.03943210471832e-06, + "loss": 0.4639, + "step": 7614 + }, + { + "epoch": 1.6986393040374748, + "grad_norm": 0.1816253513097763, + "learning_rate": 8.037124403899384e-06, + "loss": 0.4429, + "step": 7615 + }, + { + "epoch": 1.6988623689493643, + "grad_norm": 0.17475511133670807, + "learning_rate": 8.03481681179685e-06, + "loss": 0.4565, + "step": 7616 + }, + { + "epoch": 1.6990854338612538, + "grad_norm": 0.17711593210697174, + "learning_rate": 8.032509328538518e-06, + "loss": 0.4488, + "step": 7617 + }, + { + "epoch": 1.699308498773143, + "grad_norm": 0.18735013902187347, + "learning_rate": 8.030201954252198e-06, + "loss": 0.4554, + "step": 7618 + }, + { + "epoch": 1.6995315636850323, + "grad_norm": 0.5531375408172607, + "learning_rate": 8.027894689065684e-06, + "loss": 0.4486, + "step": 7619 + }, + { + "epoch": 1.6997546285969216, + "grad_norm": 0.16994529962539673, + "learning_rate": 8.025587533106765e-06, + "loss": 0.4126, + "step": 7620 + }, + { + "epoch": 1.6999776935088111, + "grad_norm": 0.18847712874412537, + "learning_rate": 8.02328048650323e-06, + "loss": 0.4653, + "step": 7621 + }, + { + "epoch": 1.7002007584207004, + "grad_norm": 0.17005674540996552, + "learning_rate": 8.020973549382855e-06, + "loss": 0.4099, + "step": 7622 + }, + { + "epoch": 1.70042382333259, + "grad_norm": 0.18046452105045319, + "learning_rate": 8.018666721873414e-06, + "loss": 0.423, + "step": 7623 + }, + { + "epoch": 1.7006468882444792, + "grad_norm": 0.17824363708496094, + "learning_rate": 8.016360004102672e-06, + "loss": 0.4659, + "step": 7624 + }, + { + "epoch": 1.7008699531563685, + "grad_norm": 0.16878539323806763, + "learning_rate": 8.014053396198392e-06, + "loss": 0.4464, + "step": 7625 + }, + { + "epoch": 1.7010930180682577, + "grad_norm": 0.1796082705259323, + "learning_rate": 8.011746898288326e-06, + "loss": 0.4493, + "step": 7626 + }, + { + "epoch": 1.7013160829801472, + "grad_norm": 0.1814991682767868, + "learning_rate": 8.009440510500224e-06, + "loss": 0.4588, + "step": 7627 + }, + { + "epoch": 1.7015391478920365, + "grad_norm": 0.1700638234615326, + "learning_rate": 8.007134232961828e-06, + "loss": 0.4207, + "step": 7628 + }, + { + "epoch": 1.701762212803926, + "grad_norm": 0.1950269490480423, + "learning_rate": 8.004828065800874e-06, + "loss": 0.4338, + "step": 7629 + }, + { + "epoch": 1.7019852777158153, + "grad_norm": 0.17464493215084076, + "learning_rate": 8.00252200914509e-06, + "loss": 0.4465, + "step": 7630 + }, + { + "epoch": 1.7022083426277046, + "grad_norm": 0.19278262555599213, + "learning_rate": 8.000216063122205e-06, + "loss": 0.4549, + "step": 7631 + }, + { + "epoch": 1.7024314075395939, + "grad_norm": 0.1833004504442215, + "learning_rate": 7.997910227859932e-06, + "loss": 0.4433, + "step": 7632 + }, + { + "epoch": 1.7026544724514834, + "grad_norm": 0.17740319669246674, + "learning_rate": 7.995604503485984e-06, + "loss": 0.4509, + "step": 7633 + }, + { + "epoch": 1.7028775373633729, + "grad_norm": 0.17956921458244324, + "learning_rate": 7.993298890128067e-06, + "loss": 0.4465, + "step": 7634 + }, + { + "epoch": 1.7031006022752622, + "grad_norm": 0.18017272651195526, + "learning_rate": 7.99099338791388e-06, + "loss": 0.4817, + "step": 7635 + }, + { + "epoch": 1.7033236671871514, + "grad_norm": 0.17207284271717072, + "learning_rate": 7.988687996971116e-06, + "loss": 0.4545, + "step": 7636 + }, + { + "epoch": 1.7035467320990407, + "grad_norm": 0.19489426910877228, + "learning_rate": 7.986382717427461e-06, + "loss": 0.4867, + "step": 7637 + }, + { + "epoch": 1.7037697970109302, + "grad_norm": 0.1766097992658615, + "learning_rate": 7.984077549410598e-06, + "loss": 0.4792, + "step": 7638 + }, + { + "epoch": 1.7039928619228195, + "grad_norm": 0.17337632179260254, + "learning_rate": 7.981772493048203e-06, + "loss": 0.4619, + "step": 7639 + }, + { + "epoch": 1.704215926834709, + "grad_norm": 0.1748560070991516, + "learning_rate": 7.97946754846794e-06, + "loss": 0.449, + "step": 7640 + }, + { + "epoch": 1.7044389917465983, + "grad_norm": 0.18279384076595306, + "learning_rate": 7.977162715797476e-06, + "loss": 0.4498, + "step": 7641 + }, + { + "epoch": 1.7046620566584876, + "grad_norm": 0.1735672801733017, + "learning_rate": 7.97485799516446e-06, + "loss": 0.4588, + "step": 7642 + }, + { + "epoch": 1.7048851215703769, + "grad_norm": 0.17070254683494568, + "learning_rate": 7.972553386696553e-06, + "loss": 0.466, + "step": 7643 + }, + { + "epoch": 1.7051081864822664, + "grad_norm": 0.19621379673480988, + "learning_rate": 7.970248890521389e-06, + "loss": 0.4496, + "step": 7644 + }, + { + "epoch": 1.7053312513941556, + "grad_norm": 0.18287599086761475, + "learning_rate": 7.967944506766611e-06, + "loss": 0.449, + "step": 7645 + }, + { + "epoch": 1.7055543163060451, + "grad_norm": 0.1747177392244339, + "learning_rate": 7.965640235559847e-06, + "loss": 0.4491, + "step": 7646 + }, + { + "epoch": 1.7057773812179344, + "grad_norm": 0.1790647655725479, + "learning_rate": 7.963336077028725e-06, + "loss": 0.4558, + "step": 7647 + }, + { + "epoch": 1.7060004461298237, + "grad_norm": 0.1670437902212143, + "learning_rate": 7.96103203130086e-06, + "loss": 0.4331, + "step": 7648 + }, + { + "epoch": 1.706223511041713, + "grad_norm": 0.18212758004665375, + "learning_rate": 7.95872809850387e-06, + "loss": 0.4953, + "step": 7649 + }, + { + "epoch": 1.7064465759536025, + "grad_norm": 0.17667363584041595, + "learning_rate": 7.956424278765354e-06, + "loss": 0.4552, + "step": 7650 + }, + { + "epoch": 1.706669640865492, + "grad_norm": 0.18635283410549164, + "learning_rate": 7.95412057221292e-06, + "loss": 0.4677, + "step": 7651 + }, + { + "epoch": 1.7068927057773813, + "grad_norm": 0.1755317598581314, + "learning_rate": 7.951816978974154e-06, + "loss": 0.4372, + "step": 7652 + }, + { + "epoch": 1.7071157706892706, + "grad_norm": 0.1793239861726761, + "learning_rate": 7.949513499176651e-06, + "loss": 0.4509, + "step": 7653 + }, + { + "epoch": 1.7073388356011598, + "grad_norm": 0.17480173707008362, + "learning_rate": 7.947210132947984e-06, + "loss": 0.4487, + "step": 7654 + }, + { + "epoch": 1.7075619005130493, + "grad_norm": 0.19050490856170654, + "learning_rate": 7.944906880415738e-06, + "loss": 0.4449, + "step": 7655 + }, + { + "epoch": 1.7077849654249386, + "grad_norm": 0.17552480101585388, + "learning_rate": 7.94260374170747e-06, + "loss": 0.4457, + "step": 7656 + }, + { + "epoch": 1.7080080303368281, + "grad_norm": 0.17692866921424866, + "learning_rate": 7.940300716950753e-06, + "loss": 0.4179, + "step": 7657 + }, + { + "epoch": 1.7082310952487174, + "grad_norm": 0.18498113751411438, + "learning_rate": 7.937997806273135e-06, + "loss": 0.4221, + "step": 7658 + }, + { + "epoch": 1.7084541601606067, + "grad_norm": 0.18404224514961243, + "learning_rate": 7.935695009802172e-06, + "loss": 0.4682, + "step": 7659 + }, + { + "epoch": 1.708677225072496, + "grad_norm": 0.18323245644569397, + "learning_rate": 7.933392327665403e-06, + "loss": 0.4541, + "step": 7660 + }, + { + "epoch": 1.7089002899843855, + "grad_norm": 0.17359468340873718, + "learning_rate": 7.931089759990367e-06, + "loss": 0.4504, + "step": 7661 + }, + { + "epoch": 1.709123354896275, + "grad_norm": 0.1712869256734848, + "learning_rate": 7.928787306904593e-06, + "loss": 0.45, + "step": 7662 + }, + { + "epoch": 1.7093464198081643, + "grad_norm": 0.17846348881721497, + "learning_rate": 7.926484968535604e-06, + "loss": 0.4644, + "step": 7663 + }, + { + "epoch": 1.7095694847200535, + "grad_norm": 0.18150021135807037, + "learning_rate": 7.924182745010926e-06, + "loss": 0.4206, + "step": 7664 + }, + { + "epoch": 1.7097925496319428, + "grad_norm": 0.17358383536338806, + "learning_rate": 7.921880636458061e-06, + "loss": 0.4492, + "step": 7665 + }, + { + "epoch": 1.710015614543832, + "grad_norm": 0.18115226924419403, + "learning_rate": 7.919578643004519e-06, + "loss": 0.4601, + "step": 7666 + }, + { + "epoch": 1.7102386794557216, + "grad_norm": 0.17906229197978973, + "learning_rate": 7.917276764777799e-06, + "loss": 0.4428, + "step": 7667 + }, + { + "epoch": 1.7104617443676111, + "grad_norm": 0.1732749044895172, + "learning_rate": 7.914975001905393e-06, + "loss": 0.4274, + "step": 7668 + }, + { + "epoch": 1.7106848092795004, + "grad_norm": 0.1772816926240921, + "learning_rate": 7.912673354514784e-06, + "loss": 0.4574, + "step": 7669 + }, + { + "epoch": 1.7109078741913897, + "grad_norm": 0.18031185865402222, + "learning_rate": 7.910371822733458e-06, + "loss": 0.4498, + "step": 7670 + }, + { + "epoch": 1.711130939103279, + "grad_norm": 0.18245866894721985, + "learning_rate": 7.908070406688881e-06, + "loss": 0.4271, + "step": 7671 + }, + { + "epoch": 1.7113540040151685, + "grad_norm": 0.17715051770210266, + "learning_rate": 7.905769106508527e-06, + "loss": 0.4714, + "step": 7672 + }, + { + "epoch": 1.7115770689270577, + "grad_norm": 0.18403951823711395, + "learning_rate": 7.90346792231985e-06, + "loss": 0.4349, + "step": 7673 + }, + { + "epoch": 1.7118001338389472, + "grad_norm": 0.1747589409351349, + "learning_rate": 7.90116685425031e-06, + "loss": 0.4318, + "step": 7674 + }, + { + "epoch": 1.7120231987508365, + "grad_norm": 0.18269647657871246, + "learning_rate": 7.898865902427351e-06, + "loss": 0.4687, + "step": 7675 + }, + { + "epoch": 1.7122462636627258, + "grad_norm": 0.18082202970981598, + "learning_rate": 7.896565066978416e-06, + "loss": 0.4601, + "step": 7676 + }, + { + "epoch": 1.712469328574615, + "grad_norm": 0.16931264102458954, + "learning_rate": 7.894264348030935e-06, + "loss": 0.4478, + "step": 7677 + }, + { + "epoch": 1.7126923934865046, + "grad_norm": 0.1926901787519455, + "learning_rate": 7.891963745712344e-06, + "loss": 0.4334, + "step": 7678 + }, + { + "epoch": 1.712915458398394, + "grad_norm": 0.18817317485809326, + "learning_rate": 7.88966326015006e-06, + "loss": 0.4705, + "step": 7679 + }, + { + "epoch": 1.7131385233102834, + "grad_norm": 0.18068066239356995, + "learning_rate": 7.887362891471499e-06, + "loss": 0.4796, + "step": 7680 + }, + { + "epoch": 1.7133615882221727, + "grad_norm": 0.17936988174915314, + "learning_rate": 7.885062639804068e-06, + "loss": 0.4743, + "step": 7681 + }, + { + "epoch": 1.713584653134062, + "grad_norm": 0.17455992102622986, + "learning_rate": 7.882762505275175e-06, + "loss": 0.4528, + "step": 7682 + }, + { + "epoch": 1.7138077180459512, + "grad_norm": 0.17338865995407104, + "learning_rate": 7.880462488012208e-06, + "loss": 0.4324, + "step": 7683 + }, + { + "epoch": 1.7140307829578407, + "grad_norm": 0.18230746686458588, + "learning_rate": 7.878162588142563e-06, + "loss": 0.4634, + "step": 7684 + }, + { + "epoch": 1.7142538478697302, + "grad_norm": 0.21422216296195984, + "learning_rate": 7.875862805793622e-06, + "loss": 0.4554, + "step": 7685 + }, + { + "epoch": 1.7144769127816195, + "grad_norm": 0.17937451601028442, + "learning_rate": 7.873563141092759e-06, + "loss": 0.4646, + "step": 7686 + }, + { + "epoch": 1.7146999776935088, + "grad_norm": 0.17571154236793518, + "learning_rate": 7.871263594167345e-06, + "loss": 0.4408, + "step": 7687 + }, + { + "epoch": 1.714923042605398, + "grad_norm": 0.18515697121620178, + "learning_rate": 7.868964165144743e-06, + "loss": 0.4778, + "step": 7688 + }, + { + "epoch": 1.7151461075172876, + "grad_norm": 0.17536167800426483, + "learning_rate": 7.866664854152312e-06, + "loss": 0.4284, + "step": 7689 + }, + { + "epoch": 1.7153691724291769, + "grad_norm": 0.17925989627838135, + "learning_rate": 7.864365661317399e-06, + "loss": 0.4425, + "step": 7690 + }, + { + "epoch": 1.7155922373410664, + "grad_norm": 0.17760370671749115, + "learning_rate": 7.86206658676735e-06, + "loss": 0.4521, + "step": 7691 + }, + { + "epoch": 1.7158153022529556, + "grad_norm": 0.1762177050113678, + "learning_rate": 7.8597676306295e-06, + "loss": 0.4415, + "step": 7692 + }, + { + "epoch": 1.716038367164845, + "grad_norm": 0.17917610704898834, + "learning_rate": 7.857468793031185e-06, + "loss": 0.4356, + "step": 7693 + }, + { + "epoch": 1.7162614320767342, + "grad_norm": 0.17529255151748657, + "learning_rate": 7.855170074099723e-06, + "loss": 0.4345, + "step": 7694 + }, + { + "epoch": 1.7164844969886237, + "grad_norm": 0.17721855640411377, + "learning_rate": 7.852871473962435e-06, + "loss": 0.4607, + "step": 7695 + }, + { + "epoch": 1.7167075619005132, + "grad_norm": 0.18101619184017181, + "learning_rate": 7.850572992746628e-06, + "loss": 0.4568, + "step": 7696 + }, + { + "epoch": 1.7169306268124025, + "grad_norm": 0.1923847496509552, + "learning_rate": 7.848274630579611e-06, + "loss": 0.4492, + "step": 7697 + }, + { + "epoch": 1.7171536917242918, + "grad_norm": 0.1815698742866516, + "learning_rate": 7.845976387588679e-06, + "loss": 0.4323, + "step": 7698 + }, + { + "epoch": 1.717376756636181, + "grad_norm": 0.1835760623216629, + "learning_rate": 7.843678263901125e-06, + "loss": 0.4363, + "step": 7699 + }, + { + "epoch": 1.7175998215480703, + "grad_norm": 0.1698540896177292, + "learning_rate": 7.841380259644231e-06, + "loss": 0.4399, + "step": 7700 + }, + { + "epoch": 1.7178228864599598, + "grad_norm": 0.1790151596069336, + "learning_rate": 7.83908237494528e-06, + "loss": 0.4504, + "step": 7701 + }, + { + "epoch": 1.7180459513718493, + "grad_norm": 0.18497934937477112, + "learning_rate": 7.836784609931536e-06, + "loss": 0.4655, + "step": 7702 + }, + { + "epoch": 1.7182690162837386, + "grad_norm": 0.19526800513267517, + "learning_rate": 7.83448696473027e-06, + "loss": 0.4419, + "step": 7703 + }, + { + "epoch": 1.718492081195628, + "grad_norm": 0.1786162108182907, + "learning_rate": 7.832189439468734e-06, + "loss": 0.4286, + "step": 7704 + }, + { + "epoch": 1.7187151461075172, + "grad_norm": 0.18116159737110138, + "learning_rate": 7.829892034274184e-06, + "loss": 0.4799, + "step": 7705 + }, + { + "epoch": 1.7189382110194067, + "grad_norm": 0.1766580492258072, + "learning_rate": 7.827594749273867e-06, + "loss": 0.4692, + "step": 7706 + }, + { + "epoch": 1.719161275931296, + "grad_norm": 0.1848170906305313, + "learning_rate": 7.825297584595014e-06, + "loss": 0.4547, + "step": 7707 + }, + { + "epoch": 1.7193843408431855, + "grad_norm": 0.17963701486587524, + "learning_rate": 7.823000540364865e-06, + "loss": 0.4674, + "step": 7708 + }, + { + "epoch": 1.7196074057550748, + "grad_norm": 0.183467298746109, + "learning_rate": 7.820703616710634e-06, + "loss": 0.4579, + "step": 7709 + }, + { + "epoch": 1.719830470666964, + "grad_norm": 0.17056439816951752, + "learning_rate": 7.81840681375955e-06, + "loss": 0.4638, + "step": 7710 + }, + { + "epoch": 1.7200535355788533, + "grad_norm": 0.18178224563598633, + "learning_rate": 7.816110131638815e-06, + "loss": 0.486, + "step": 7711 + }, + { + "epoch": 1.7202766004907428, + "grad_norm": 0.20546191930770874, + "learning_rate": 7.813813570475645e-06, + "loss": 0.4659, + "step": 7712 + }, + { + "epoch": 1.7204996654026323, + "grad_norm": 0.18711180984973907, + "learning_rate": 7.811517130397227e-06, + "loss": 0.4554, + "step": 7713 + }, + { + "epoch": 1.7207227303145216, + "grad_norm": 0.17499269545078278, + "learning_rate": 7.809220811530755e-06, + "loss": 0.434, + "step": 7714 + }, + { + "epoch": 1.720945795226411, + "grad_norm": 0.1984056979417801, + "learning_rate": 7.806924614003416e-06, + "loss": 0.4382, + "step": 7715 + }, + { + "epoch": 1.7211688601383002, + "grad_norm": 0.20512497425079346, + "learning_rate": 7.804628537942386e-06, + "loss": 0.4726, + "step": 7716 + }, + { + "epoch": 1.7213919250501895, + "grad_norm": 0.18379102647304535, + "learning_rate": 7.80233258347484e-06, + "loss": 0.4779, + "step": 7717 + }, + { + "epoch": 1.721614989962079, + "grad_norm": 0.17394806444644928, + "learning_rate": 7.800036750727935e-06, + "loss": 0.4678, + "step": 7718 + }, + { + "epoch": 1.7218380548739685, + "grad_norm": 0.1689169555902481, + "learning_rate": 7.797741039828836e-06, + "loss": 0.4272, + "step": 7719 + }, + { + "epoch": 1.7220611197858577, + "grad_norm": 0.17023077607154846, + "learning_rate": 7.79544545090469e-06, + "loss": 0.4591, + "step": 7720 + }, + { + "epoch": 1.722284184697747, + "grad_norm": 0.18222182989120483, + "learning_rate": 7.793149984082643e-06, + "loss": 0.4452, + "step": 7721 + }, + { + "epoch": 1.7225072496096363, + "grad_norm": 0.18358302116394043, + "learning_rate": 7.79085463948983e-06, + "loss": 0.4537, + "step": 7722 + }, + { + "epoch": 1.7227303145215258, + "grad_norm": 0.1760125756263733, + "learning_rate": 7.788559417253386e-06, + "loss": 0.4384, + "step": 7723 + }, + { + "epoch": 1.722953379433415, + "grad_norm": 0.1962590217590332, + "learning_rate": 7.786264317500429e-06, + "loss": 0.4778, + "step": 7724 + }, + { + "epoch": 1.7231764443453046, + "grad_norm": 0.18565121293067932, + "learning_rate": 7.783969340358081e-06, + "loss": 0.4379, + "step": 7725 + }, + { + "epoch": 1.7233995092571939, + "grad_norm": 0.19673305749893188, + "learning_rate": 7.781674485953448e-06, + "loss": 0.4361, + "step": 7726 + }, + { + "epoch": 1.7236225741690832, + "grad_norm": 0.17665378749370575, + "learning_rate": 7.779379754413636e-06, + "loss": 0.4323, + "step": 7727 + }, + { + "epoch": 1.7238456390809724, + "grad_norm": 0.16950669884681702, + "learning_rate": 7.777085145865744e-06, + "loss": 0.4348, + "step": 7728 + }, + { + "epoch": 1.724068703992862, + "grad_norm": 0.17674678564071655, + "learning_rate": 7.774790660436857e-06, + "loss": 0.4745, + "step": 7729 + }, + { + "epoch": 1.7242917689047514, + "grad_norm": 0.183528333902359, + "learning_rate": 7.772496298254063e-06, + "loss": 0.4611, + "step": 7730 + }, + { + "epoch": 1.7245148338166407, + "grad_norm": 0.18080073595046997, + "learning_rate": 7.770202059444433e-06, + "loss": 0.4605, + "step": 7731 + }, + { + "epoch": 1.72473789872853, + "grad_norm": 0.18171045184135437, + "learning_rate": 7.76790794413504e-06, + "loss": 0.4527, + "step": 7732 + }, + { + "epoch": 1.7249609636404193, + "grad_norm": 0.18763373792171478, + "learning_rate": 7.765613952452945e-06, + "loss": 0.4559, + "step": 7733 + }, + { + "epoch": 1.7251840285523086, + "grad_norm": 0.1811000555753708, + "learning_rate": 7.763320084525205e-06, + "loss": 0.4536, + "step": 7734 + }, + { + "epoch": 1.725407093464198, + "grad_norm": 0.1753537356853485, + "learning_rate": 7.761026340478866e-06, + "loss": 0.4383, + "step": 7735 + }, + { + "epoch": 1.7256301583760876, + "grad_norm": 0.1794290989637375, + "learning_rate": 7.758732720440976e-06, + "loss": 0.4593, + "step": 7736 + }, + { + "epoch": 1.7258532232879769, + "grad_norm": 0.18469299376010895, + "learning_rate": 7.75643922453856e-06, + "loss": 0.4239, + "step": 7737 + }, + { + "epoch": 1.7260762881998661, + "grad_norm": 0.17508934438228607, + "learning_rate": 7.754145852898658e-06, + "loss": 0.4473, + "step": 7738 + }, + { + "epoch": 1.7262993531117554, + "grad_norm": 0.18345895409584045, + "learning_rate": 7.751852605648283e-06, + "loss": 0.45, + "step": 7739 + }, + { + "epoch": 1.726522418023645, + "grad_norm": 0.18025921285152435, + "learning_rate": 7.749559482914453e-06, + "loss": 0.4645, + "step": 7740 + }, + { + "epoch": 1.7267454829355342, + "grad_norm": 0.1723143756389618, + "learning_rate": 7.747266484824174e-06, + "loss": 0.4087, + "step": 7741 + }, + { + "epoch": 1.7269685478474237, + "grad_norm": 0.17820636928081512, + "learning_rate": 7.744973611504448e-06, + "loss": 0.451, + "step": 7742 + }, + { + "epoch": 1.727191612759313, + "grad_norm": 0.17735761404037476, + "learning_rate": 7.742680863082267e-06, + "loss": 0.435, + "step": 7743 + }, + { + "epoch": 1.7274146776712023, + "grad_norm": 0.169492706656456, + "learning_rate": 7.74038823968462e-06, + "loss": 0.4232, + "step": 7744 + }, + { + "epoch": 1.7276377425830916, + "grad_norm": 0.17863723635673523, + "learning_rate": 7.738095741438485e-06, + "loss": 0.4407, + "step": 7745 + }, + { + "epoch": 1.727860807494981, + "grad_norm": 0.17826594412326813, + "learning_rate": 7.735803368470836e-06, + "loss": 0.4568, + "step": 7746 + }, + { + "epoch": 1.7280838724068706, + "grad_norm": 0.18301434814929962, + "learning_rate": 7.733511120908639e-06, + "loss": 0.471, + "step": 7747 + }, + { + "epoch": 1.7283069373187598, + "grad_norm": 0.17796590924263, + "learning_rate": 7.73121899887885e-06, + "loss": 0.4517, + "step": 7748 + }, + { + "epoch": 1.7285300022306491, + "grad_norm": 0.17701533436775208, + "learning_rate": 7.728927002508429e-06, + "loss": 0.4672, + "step": 7749 + }, + { + "epoch": 1.7287530671425384, + "grad_norm": 0.17282108962535858, + "learning_rate": 7.726635131924313e-06, + "loss": 0.4205, + "step": 7750 + }, + { + "epoch": 1.7289761320544277, + "grad_norm": 0.17632122337818146, + "learning_rate": 7.724343387253446e-06, + "loss": 0.4561, + "step": 7751 + }, + { + "epoch": 1.7291991969663172, + "grad_norm": 0.18215633928775787, + "learning_rate": 7.722051768622754e-06, + "loss": 0.4757, + "step": 7752 + }, + { + "epoch": 1.7294222618782067, + "grad_norm": 0.1942843198776245, + "learning_rate": 7.719760276159166e-06, + "loss": 0.4407, + "step": 7753 + }, + { + "epoch": 1.729645326790096, + "grad_norm": 0.18691153824329376, + "learning_rate": 7.717468909989595e-06, + "loss": 0.4524, + "step": 7754 + }, + { + "epoch": 1.7298683917019853, + "grad_norm": 0.16908486187458038, + "learning_rate": 7.715177670240956e-06, + "loss": 0.4627, + "step": 7755 + }, + { + "epoch": 1.7300914566138745, + "grad_norm": 0.1892176866531372, + "learning_rate": 7.712886557040147e-06, + "loss": 0.4367, + "step": 7756 + }, + { + "epoch": 1.730314521525764, + "grad_norm": 0.17933063209056854, + "learning_rate": 7.71059557051407e-06, + "loss": 0.4912, + "step": 7757 + }, + { + "epoch": 1.7305375864376533, + "grad_norm": 0.17910033464431763, + "learning_rate": 7.70830471078961e-06, + "loss": 0.4412, + "step": 7758 + }, + { + "epoch": 1.7307606513495428, + "grad_norm": 0.17961618304252625, + "learning_rate": 7.70601397799365e-06, + "loss": 0.441, + "step": 7759 + }, + { + "epoch": 1.730983716261432, + "grad_norm": 0.17368407547473907, + "learning_rate": 7.703723372253064e-06, + "loss": 0.4236, + "step": 7760 + }, + { + "epoch": 1.7312067811733214, + "grad_norm": 0.16743780672550201, + "learning_rate": 7.701432893694727e-06, + "loss": 0.4032, + "step": 7761 + }, + { + "epoch": 1.7314298460852107, + "grad_norm": 0.17270828783512115, + "learning_rate": 7.699142542445489e-06, + "loss": 0.4428, + "step": 7762 + }, + { + "epoch": 1.7316529109971002, + "grad_norm": 0.18314076960086823, + "learning_rate": 7.696852318632214e-06, + "loss": 0.4728, + "step": 7763 + }, + { + "epoch": 1.7318759759089897, + "grad_norm": 0.17854903638362885, + "learning_rate": 7.694562222381741e-06, + "loss": 0.4628, + "step": 7764 + }, + { + "epoch": 1.732099040820879, + "grad_norm": 0.17335215210914612, + "learning_rate": 7.692272253820918e-06, + "loss": 0.4492, + "step": 7765 + }, + { + "epoch": 1.7323221057327682, + "grad_norm": 0.19753512740135193, + "learning_rate": 7.68998241307657e-06, + "loss": 0.4499, + "step": 7766 + }, + { + "epoch": 1.7325451706446575, + "grad_norm": 0.17499345541000366, + "learning_rate": 7.687692700275532e-06, + "loss": 0.4394, + "step": 7767 + }, + { + "epoch": 1.7327682355565468, + "grad_norm": 0.17762510478496552, + "learning_rate": 7.685403115544612e-06, + "loss": 0.4656, + "step": 7768 + }, + { + "epoch": 1.7329913004684363, + "grad_norm": 0.1747395098209381, + "learning_rate": 7.683113659010628e-06, + "loss": 0.4431, + "step": 7769 + }, + { + "epoch": 1.7332143653803258, + "grad_norm": 0.1724051684141159, + "learning_rate": 7.680824330800384e-06, + "loss": 0.4266, + "step": 7770 + }, + { + "epoch": 1.733437430292215, + "grad_norm": 0.1789834052324295, + "learning_rate": 7.678535131040676e-06, + "loss": 0.4342, + "step": 7771 + }, + { + "epoch": 1.7336604952041044, + "grad_norm": 0.18412499129772186, + "learning_rate": 7.676246059858297e-06, + "loss": 0.4515, + "step": 7772 + }, + { + "epoch": 1.7338835601159936, + "grad_norm": 0.18302331864833832, + "learning_rate": 7.673957117380027e-06, + "loss": 0.4316, + "step": 7773 + }, + { + "epoch": 1.7341066250278832, + "grad_norm": 0.18772312998771667, + "learning_rate": 7.671668303732644e-06, + "loss": 0.4918, + "step": 7774 + }, + { + "epoch": 1.7343296899397724, + "grad_norm": 0.21190685033798218, + "learning_rate": 7.669379619042915e-06, + "loss": 0.4785, + "step": 7775 + }, + { + "epoch": 1.734552754851662, + "grad_norm": 0.18342965841293335, + "learning_rate": 7.667091063437604e-06, + "loss": 0.4516, + "step": 7776 + }, + { + "epoch": 1.7347758197635512, + "grad_norm": 0.1735561341047287, + "learning_rate": 7.664802637043463e-06, + "loss": 0.426, + "step": 7777 + }, + { + "epoch": 1.7349988846754405, + "grad_norm": 0.18314534425735474, + "learning_rate": 7.662514339987243e-06, + "loss": 0.4523, + "step": 7778 + }, + { + "epoch": 1.7352219495873298, + "grad_norm": 0.18044976890087128, + "learning_rate": 7.66022617239568e-06, + "loss": 0.4398, + "step": 7779 + }, + { + "epoch": 1.7354450144992193, + "grad_norm": 0.18999162316322327, + "learning_rate": 7.65793813439551e-06, + "loss": 0.4582, + "step": 7780 + }, + { + "epoch": 1.7356680794111088, + "grad_norm": 0.1812235713005066, + "learning_rate": 7.655650226113458e-06, + "loss": 0.4576, + "step": 7781 + }, + { + "epoch": 1.735891144322998, + "grad_norm": 0.1805853396654129, + "learning_rate": 7.653362447676245e-06, + "loss": 0.4331, + "step": 7782 + }, + { + "epoch": 1.7361142092348874, + "grad_norm": 0.19444364309310913, + "learning_rate": 7.651074799210578e-06, + "loss": 0.4718, + "step": 7783 + }, + { + "epoch": 1.7363372741467766, + "grad_norm": 0.17560544610023499, + "learning_rate": 7.648787280843167e-06, + "loss": 0.4434, + "step": 7784 + }, + { + "epoch": 1.736560339058666, + "grad_norm": 0.17922593653202057, + "learning_rate": 7.646499892700703e-06, + "loss": 0.4171, + "step": 7785 + }, + { + "epoch": 1.7367834039705554, + "grad_norm": 0.17291361093521118, + "learning_rate": 7.644212634909881e-06, + "loss": 0.4309, + "step": 7786 + }, + { + "epoch": 1.737006468882445, + "grad_norm": 0.2205720990896225, + "learning_rate": 7.641925507597381e-06, + "loss": 0.4471, + "step": 7787 + }, + { + "epoch": 1.7372295337943342, + "grad_norm": 0.2528192698955536, + "learning_rate": 7.63963851088988e-06, + "loss": 0.463, + "step": 7788 + }, + { + "epoch": 1.7374525987062235, + "grad_norm": 0.1895085573196411, + "learning_rate": 7.637351644914044e-06, + "loss": 0.4486, + "step": 7789 + }, + { + "epoch": 1.7376756636181128, + "grad_norm": 0.17160922288894653, + "learning_rate": 7.635064909796534e-06, + "loss": 0.4531, + "step": 7790 + }, + { + "epoch": 1.7378987285300023, + "grad_norm": 0.17404474318027496, + "learning_rate": 7.63277830566401e-06, + "loss": 0.4446, + "step": 7791 + }, + { + "epoch": 1.7381217934418915, + "grad_norm": 0.17817366123199463, + "learning_rate": 7.63049183264311e-06, + "loss": 0.4627, + "step": 7792 + }, + { + "epoch": 1.738344858353781, + "grad_norm": 0.1933346390724182, + "learning_rate": 7.628205490860482e-06, + "loss": 0.4744, + "step": 7793 + }, + { + "epoch": 1.7385679232656703, + "grad_norm": 0.1809963434934616, + "learning_rate": 7.625919280442751e-06, + "loss": 0.4791, + "step": 7794 + }, + { + "epoch": 1.7387909881775596, + "grad_norm": 0.31423863768577576, + "learning_rate": 7.623633201516542e-06, + "loss": 0.4465, + "step": 7795 + }, + { + "epoch": 1.739014053089449, + "grad_norm": 0.19363394379615784, + "learning_rate": 7.621347254208475e-06, + "loss": 0.4696, + "step": 7796 + }, + { + "epoch": 1.7392371180013384, + "grad_norm": 0.18826474249362946, + "learning_rate": 7.619061438645159e-06, + "loss": 0.4746, + "step": 7797 + }, + { + "epoch": 1.739460182913228, + "grad_norm": 0.16698677837848663, + "learning_rate": 7.616775754953199e-06, + "loss": 0.4349, + "step": 7798 + }, + { + "epoch": 1.7396832478251172, + "grad_norm": 0.17490437626838684, + "learning_rate": 7.614490203259186e-06, + "loss": 0.4237, + "step": 7799 + }, + { + "epoch": 1.7399063127370065, + "grad_norm": 0.17656764388084412, + "learning_rate": 7.6122047836897125e-06, + "loss": 0.4583, + "step": 7800 + }, + { + "epoch": 1.7401293776488957, + "grad_norm": 0.17693257331848145, + "learning_rate": 7.609919496371357e-06, + "loss": 0.454, + "step": 7801 + }, + { + "epoch": 1.740352442560785, + "grad_norm": 0.1721879243850708, + "learning_rate": 7.6076343414306965e-06, + "loss": 0.4373, + "step": 7802 + }, + { + "epoch": 1.7405755074726745, + "grad_norm": 0.17377860844135284, + "learning_rate": 7.605349318994291e-06, + "loss": 0.4456, + "step": 7803 + }, + { + "epoch": 1.740798572384564, + "grad_norm": 0.17918775975704193, + "learning_rate": 7.603064429188707e-06, + "loss": 0.4473, + "step": 7804 + }, + { + "epoch": 1.7410216372964533, + "grad_norm": 0.17350159585475922, + "learning_rate": 7.600779672140488e-06, + "loss": 0.4505, + "step": 7805 + }, + { + "epoch": 1.7412447022083426, + "grad_norm": 0.1735851764678955, + "learning_rate": 7.5984950479761865e-06, + "loss": 0.432, + "step": 7806 + }, + { + "epoch": 1.7414677671202319, + "grad_norm": 0.19026067852973938, + "learning_rate": 7.596210556822332e-06, + "loss": 0.4668, + "step": 7807 + }, + { + "epoch": 1.7416908320321214, + "grad_norm": 0.17454437911510468, + "learning_rate": 7.59392619880546e-06, + "loss": 0.4583, + "step": 7808 + }, + { + "epoch": 1.7419138969440107, + "grad_norm": 0.17719118297100067, + "learning_rate": 7.591641974052089e-06, + "loss": 0.4214, + "step": 7809 + }, + { + "epoch": 1.7421369618559002, + "grad_norm": 0.17559269070625305, + "learning_rate": 7.589357882688735e-06, + "loss": 0.4234, + "step": 7810 + }, + { + "epoch": 1.7423600267677894, + "grad_norm": 0.17429721355438232, + "learning_rate": 7.587073924841904e-06, + "loss": 0.4205, + "step": 7811 + }, + { + "epoch": 1.7425830916796787, + "grad_norm": 0.18059833347797394, + "learning_rate": 7.584790100638098e-06, + "loss": 0.4475, + "step": 7812 + }, + { + "epoch": 1.742806156591568, + "grad_norm": 0.1845857799053192, + "learning_rate": 7.582506410203806e-06, + "loss": 0.4408, + "step": 7813 + }, + { + "epoch": 1.7430292215034575, + "grad_norm": 0.18028376996517181, + "learning_rate": 7.580222853665517e-06, + "loss": 0.4213, + "step": 7814 + }, + { + "epoch": 1.743252286415347, + "grad_norm": 0.1828741878271103, + "learning_rate": 7.577939431149709e-06, + "loss": 0.4559, + "step": 7815 + }, + { + "epoch": 1.7434753513272363, + "grad_norm": 0.19069290161132812, + "learning_rate": 7.575656142782848e-06, + "loss": 0.4372, + "step": 7816 + }, + { + "epoch": 1.7436984162391256, + "grad_norm": 0.1698480099439621, + "learning_rate": 7.573372988691402e-06, + "loss": 0.431, + "step": 7817 + }, + { + "epoch": 1.7439214811510149, + "grad_norm": 0.17554740607738495, + "learning_rate": 7.5710899690018226e-06, + "loss": 0.4536, + "step": 7818 + }, + { + "epoch": 1.7441445460629041, + "grad_norm": 0.17417636513710022, + "learning_rate": 7.568807083840561e-06, + "loss": 0.4561, + "step": 7819 + }, + { + "epoch": 1.7443676109747936, + "grad_norm": 0.1864258497953415, + "learning_rate": 7.566524333334053e-06, + "loss": 0.4933, + "step": 7820 + }, + { + "epoch": 1.7445906758866832, + "grad_norm": 0.20237578451633453, + "learning_rate": 7.564241717608737e-06, + "loss": 0.4429, + "step": 7821 + }, + { + "epoch": 1.7448137407985724, + "grad_norm": 0.18540610373020172, + "learning_rate": 7.5619592367910345e-06, + "loss": 0.4572, + "step": 7822 + }, + { + "epoch": 1.7450368057104617, + "grad_norm": 0.17512571811676025, + "learning_rate": 7.5596768910073674e-06, + "loss": 0.4408, + "step": 7823 + }, + { + "epoch": 1.745259870622351, + "grad_norm": 0.19032084941864014, + "learning_rate": 7.557394680384142e-06, + "loss": 0.4466, + "step": 7824 + }, + { + "epoch": 1.7454829355342405, + "grad_norm": 0.18703311681747437, + "learning_rate": 7.555112605047766e-06, + "loss": 0.4681, + "step": 7825 + }, + { + "epoch": 1.7457060004461298, + "grad_norm": 0.17146390676498413, + "learning_rate": 7.55283066512463e-06, + "loss": 0.4518, + "step": 7826 + }, + { + "epoch": 1.7459290653580193, + "grad_norm": 0.17544762790203094, + "learning_rate": 7.550548860741128e-06, + "loss": 0.4459, + "step": 7827 + }, + { + "epoch": 1.7461521302699086, + "grad_norm": 0.16931942105293274, + "learning_rate": 7.548267192023636e-06, + "loss": 0.4317, + "step": 7828 + }, + { + "epoch": 1.7463751951817978, + "grad_norm": 0.17575553059577942, + "learning_rate": 7.545985659098531e-06, + "loss": 0.4239, + "step": 7829 + }, + { + "epoch": 1.7465982600936871, + "grad_norm": 0.1667536497116089, + "learning_rate": 7.543704262092174e-06, + "loss": 0.4351, + "step": 7830 + }, + { + "epoch": 1.7468213250055766, + "grad_norm": 0.17519286274909973, + "learning_rate": 7.541423001130928e-06, + "loss": 0.4405, + "step": 7831 + }, + { + "epoch": 1.7470443899174661, + "grad_norm": 0.7807138562202454, + "learning_rate": 7.5391418763411385e-06, + "loss": 0.4395, + "step": 7832 + }, + { + "epoch": 1.7472674548293554, + "grad_norm": 0.17985686659812927, + "learning_rate": 7.536860887849152e-06, + "loss": 0.4564, + "step": 7833 + }, + { + "epoch": 1.7474905197412447, + "grad_norm": 0.18918108940124512, + "learning_rate": 7.534580035781304e-06, + "loss": 0.4898, + "step": 7834 + }, + { + "epoch": 1.747713584653134, + "grad_norm": 0.19184869527816772, + "learning_rate": 7.532299320263922e-06, + "loss": 0.4714, + "step": 7835 + }, + { + "epoch": 1.7479366495650233, + "grad_norm": 0.18925781548023224, + "learning_rate": 7.530018741423328e-06, + "loss": 0.4548, + "step": 7836 + }, + { + "epoch": 1.7481597144769128, + "grad_norm": 0.188811257481575, + "learning_rate": 7.5277382993858295e-06, + "loss": 0.4444, + "step": 7837 + }, + { + "epoch": 1.7483827793888023, + "grad_norm": 0.18189990520477295, + "learning_rate": 7.525457994277737e-06, + "loss": 0.4754, + "step": 7838 + }, + { + "epoch": 1.7486058443006915, + "grad_norm": 0.179085835814476, + "learning_rate": 7.5231778262253456e-06, + "loss": 0.4511, + "step": 7839 + }, + { + "epoch": 1.7488289092125808, + "grad_norm": 0.17552562057971954, + "learning_rate": 7.520897795354947e-06, + "loss": 0.446, + "step": 7840 + }, + { + "epoch": 1.74905197412447, + "grad_norm": 0.1869988739490509, + "learning_rate": 7.518617901792819e-06, + "loss": 0.4591, + "step": 7841 + }, + { + "epoch": 1.7492750390363596, + "grad_norm": 0.17976555228233337, + "learning_rate": 7.516338145665245e-06, + "loss": 0.4489, + "step": 7842 + }, + { + "epoch": 1.749498103948249, + "grad_norm": 0.18475496768951416, + "learning_rate": 7.514058527098484e-06, + "loss": 0.4781, + "step": 7843 + }, + { + "epoch": 1.7497211688601384, + "grad_norm": 0.17996063828468323, + "learning_rate": 7.5117790462188016e-06, + "loss": 0.435, + "step": 7844 + }, + { + "epoch": 1.7499442337720277, + "grad_norm": 0.1795787513256073, + "learning_rate": 7.509499703152445e-06, + "loss": 0.4901, + "step": 7845 + }, + { + "epoch": 1.750167298683917, + "grad_norm": 0.16901874542236328, + "learning_rate": 7.507220498025662e-06, + "loss": 0.4694, + "step": 7846 + }, + { + "epoch": 1.7503903635958062, + "grad_norm": 0.16905230283737183, + "learning_rate": 7.504941430964687e-06, + "loss": 0.4333, + "step": 7847 + }, + { + "epoch": 1.7506134285076957, + "grad_norm": 0.17756043374538422, + "learning_rate": 7.502662502095752e-06, + "loss": 0.4535, + "step": 7848 + }, + { + "epoch": 1.7508364934195852, + "grad_norm": 0.17667444050312042, + "learning_rate": 7.500383711545074e-06, + "loss": 0.4636, + "step": 7849 + }, + { + "epoch": 1.7510595583314745, + "grad_norm": 0.18121804296970367, + "learning_rate": 7.4981050594388716e-06, + "loss": 0.4796, + "step": 7850 + }, + { + "epoch": 1.7512826232433638, + "grad_norm": 0.18036022782325745, + "learning_rate": 7.495826545903347e-06, + "loss": 0.4667, + "step": 7851 + }, + { + "epoch": 1.751505688155253, + "grad_norm": 0.24715840816497803, + "learning_rate": 7.4935481710647005e-06, + "loss": 0.4713, + "step": 7852 + }, + { + "epoch": 1.7517287530671424, + "grad_norm": 0.19006216526031494, + "learning_rate": 7.491269935049122e-06, + "loss": 0.4464, + "step": 7853 + }, + { + "epoch": 1.7519518179790319, + "grad_norm": 0.18384769558906555, + "learning_rate": 7.488991837982794e-06, + "loss": 0.4707, + "step": 7854 + }, + { + "epoch": 1.7521748828909214, + "grad_norm": 0.18586494028568268, + "learning_rate": 7.486713879991896e-06, + "loss": 0.4731, + "step": 7855 + }, + { + "epoch": 1.7523979478028107, + "grad_norm": 0.19443517923355103, + "learning_rate": 7.484436061202589e-06, + "loss": 0.4883, + "step": 7856 + }, + { + "epoch": 1.7526210127147, + "grad_norm": 0.17276805639266968, + "learning_rate": 7.48215838174104e-06, + "loss": 0.4449, + "step": 7857 + }, + { + "epoch": 1.7528440776265892, + "grad_norm": 0.1835545003414154, + "learning_rate": 7.4798808417333954e-06, + "loss": 0.4748, + "step": 7858 + }, + { + "epoch": 1.7530671425384787, + "grad_norm": 0.1789509505033493, + "learning_rate": 7.477603441305803e-06, + "loss": 0.4639, + "step": 7859 + }, + { + "epoch": 1.753290207450368, + "grad_norm": 0.193350687623024, + "learning_rate": 7.475326180584398e-06, + "loss": 0.4897, + "step": 7860 + }, + { + "epoch": 1.7535132723622575, + "grad_norm": 0.17808929085731506, + "learning_rate": 7.473049059695311e-06, + "loss": 0.4593, + "step": 7861 + }, + { + "epoch": 1.7537363372741468, + "grad_norm": 0.18136462569236755, + "learning_rate": 7.470772078764661e-06, + "loss": 0.4693, + "step": 7862 + }, + { + "epoch": 1.753959402186036, + "grad_norm": 0.1805723011493683, + "learning_rate": 7.4684952379185625e-06, + "loss": 0.4663, + "step": 7863 + }, + { + "epoch": 1.7541824670979254, + "grad_norm": 0.1832210123538971, + "learning_rate": 7.466218537283122e-06, + "loss": 0.4541, + "step": 7864 + }, + { + "epoch": 1.7544055320098149, + "grad_norm": 0.1848025768995285, + "learning_rate": 7.463941976984439e-06, + "loss": 0.4507, + "step": 7865 + }, + { + "epoch": 1.7546285969217044, + "grad_norm": 0.1897166222333908, + "learning_rate": 7.461665557148599e-06, + "loss": 0.4422, + "step": 7866 + }, + { + "epoch": 1.7548516618335936, + "grad_norm": 0.1763024479150772, + "learning_rate": 7.459389277901689e-06, + "loss": 0.4611, + "step": 7867 + }, + { + "epoch": 1.755074726745483, + "grad_norm": 0.1759800910949707, + "learning_rate": 7.4571131393697795e-06, + "loss": 0.4406, + "step": 7868 + }, + { + "epoch": 1.7552977916573722, + "grad_norm": 0.17623275518417358, + "learning_rate": 7.454837141678942e-06, + "loss": 0.4643, + "step": 7869 + }, + { + "epoch": 1.7555208565692615, + "grad_norm": 0.1801615208387375, + "learning_rate": 7.452561284955231e-06, + "loss": 0.4649, + "step": 7870 + }, + { + "epoch": 1.755743921481151, + "grad_norm": 0.1817091703414917, + "learning_rate": 7.450285569324703e-06, + "loss": 0.4531, + "step": 7871 + }, + { + "epoch": 1.7559669863930405, + "grad_norm": 0.18977338075637817, + "learning_rate": 7.448009994913396e-06, + "loss": 0.4476, + "step": 7872 + }, + { + "epoch": 1.7561900513049298, + "grad_norm": 0.17964302003383636, + "learning_rate": 7.445734561847351e-06, + "loss": 0.4365, + "step": 7873 + }, + { + "epoch": 1.756413116216819, + "grad_norm": 0.17655393481254578, + "learning_rate": 7.44345927025259e-06, + "loss": 0.4412, + "step": 7874 + }, + { + "epoch": 1.7566361811287083, + "grad_norm": 0.1832321286201477, + "learning_rate": 7.441184120255141e-06, + "loss": 0.4589, + "step": 7875 + }, + { + "epoch": 1.7568592460405978, + "grad_norm": 0.18122322857379913, + "learning_rate": 7.438909111981008e-06, + "loss": 0.4363, + "step": 7876 + }, + { + "epoch": 1.7570823109524871, + "grad_norm": 0.19032621383666992, + "learning_rate": 7.436634245556195e-06, + "loss": 0.454, + "step": 7877 + }, + { + "epoch": 1.7573053758643766, + "grad_norm": 0.1851622313261032, + "learning_rate": 7.4343595211067045e-06, + "loss": 0.4839, + "step": 7878 + }, + { + "epoch": 1.757528440776266, + "grad_norm": 0.2012101113796234, + "learning_rate": 7.432084938758521e-06, + "loss": 0.4918, + "step": 7879 + }, + { + "epoch": 1.7577515056881552, + "grad_norm": 0.18474964797496796, + "learning_rate": 7.429810498637627e-06, + "loss": 0.4552, + "step": 7880 + }, + { + "epoch": 1.7579745706000445, + "grad_norm": 0.17664113640785217, + "learning_rate": 7.427536200869996e-06, + "loss": 0.4456, + "step": 7881 + }, + { + "epoch": 1.758197635511934, + "grad_norm": 0.18062026798725128, + "learning_rate": 7.425262045581589e-06, + "loss": 0.456, + "step": 7882 + }, + { + "epoch": 1.7584207004238235, + "grad_norm": 0.18045106530189514, + "learning_rate": 7.42298803289837e-06, + "loss": 0.4886, + "step": 7883 + }, + { + "epoch": 1.7586437653357128, + "grad_norm": 0.17931503057479858, + "learning_rate": 7.420714162946278e-06, + "loss": 0.4453, + "step": 7884 + }, + { + "epoch": 1.758866830247602, + "grad_norm": 0.1748034656047821, + "learning_rate": 7.418440435851265e-06, + "loss": 0.4626, + "step": 7885 + }, + { + "epoch": 1.7590898951594913, + "grad_norm": 0.17426064610481262, + "learning_rate": 7.416166851739257e-06, + "loss": 0.4369, + "step": 7886 + }, + { + "epoch": 1.7593129600713808, + "grad_norm": 0.17878130078315735, + "learning_rate": 7.4138934107361825e-06, + "loss": 0.4624, + "step": 7887 + }, + { + "epoch": 1.75953602498327, + "grad_norm": 0.1841915249824524, + "learning_rate": 7.411620112967957e-06, + "loss": 0.4625, + "step": 7888 + }, + { + "epoch": 1.7597590898951596, + "grad_norm": 0.18047453463077545, + "learning_rate": 7.4093469585604925e-06, + "loss": 0.4383, + "step": 7889 + }, + { + "epoch": 1.759982154807049, + "grad_norm": 0.1770285815000534, + "learning_rate": 7.407073947639688e-06, + "loss": 0.4267, + "step": 7890 + }, + { + "epoch": 1.7602052197189382, + "grad_norm": 0.17349712550640106, + "learning_rate": 7.404801080331441e-06, + "loss": 0.4404, + "step": 7891 + }, + { + "epoch": 1.7604282846308275, + "grad_norm": 0.17680755257606506, + "learning_rate": 7.4025283567616315e-06, + "loss": 0.4272, + "step": 7892 + }, + { + "epoch": 1.760651349542717, + "grad_norm": 0.18388980627059937, + "learning_rate": 7.400255777056142e-06, + "loss": 0.4677, + "step": 7893 + }, + { + "epoch": 1.7608744144546062, + "grad_norm": 0.18694987893104553, + "learning_rate": 7.39798334134084e-06, + "loss": 0.4669, + "step": 7894 + }, + { + "epoch": 1.7610974793664957, + "grad_norm": 0.16978652775287628, + "learning_rate": 7.395711049741588e-06, + "loss": 0.4435, + "step": 7895 + }, + { + "epoch": 1.761320544278385, + "grad_norm": 0.17169198393821716, + "learning_rate": 7.3934389023842405e-06, + "loss": 0.4306, + "step": 7896 + }, + { + "epoch": 1.7615436091902743, + "grad_norm": 0.18931961059570312, + "learning_rate": 7.391166899394642e-06, + "loss": 0.4597, + "step": 7897 + }, + { + "epoch": 1.7617666741021636, + "grad_norm": 0.17674033343791962, + "learning_rate": 7.3888950408986294e-06, + "loss": 0.4626, + "step": 7898 + }, + { + "epoch": 1.761989739014053, + "grad_norm": 0.18383964896202087, + "learning_rate": 7.386623327022034e-06, + "loss": 0.4635, + "step": 7899 + }, + { + "epoch": 1.7622128039259426, + "grad_norm": 0.1803615838289261, + "learning_rate": 7.38435175789068e-06, + "loss": 0.4811, + "step": 7900 + }, + { + "epoch": 1.7624358688378319, + "grad_norm": 0.19607679545879364, + "learning_rate": 7.382080333630377e-06, + "loss": 0.4715, + "step": 7901 + }, + { + "epoch": 1.7626589337497212, + "grad_norm": 0.17395561933517456, + "learning_rate": 7.379809054366933e-06, + "loss": 0.4515, + "step": 7902 + }, + { + "epoch": 1.7628819986616104, + "grad_norm": 0.1800202876329422, + "learning_rate": 7.3775379202261455e-06, + "loss": 0.4933, + "step": 7903 + }, + { + "epoch": 1.7631050635735, + "grad_norm": 0.1730235368013382, + "learning_rate": 7.375266931333805e-06, + "loss": 0.4634, + "step": 7904 + }, + { + "epoch": 1.7633281284853892, + "grad_norm": 0.1844315528869629, + "learning_rate": 7.37299608781569e-06, + "loss": 0.456, + "step": 7905 + }, + { + "epoch": 1.7635511933972787, + "grad_norm": 0.17907127737998962, + "learning_rate": 7.370725389797577e-06, + "loss": 0.451, + "step": 7906 + }, + { + "epoch": 1.763774258309168, + "grad_norm": 0.18444164097309113, + "learning_rate": 7.368454837405229e-06, + "loss": 0.4647, + "step": 7907 + }, + { + "epoch": 1.7639973232210573, + "grad_norm": 0.1789025217294693, + "learning_rate": 7.366184430764407e-06, + "loss": 0.4542, + "step": 7908 + }, + { + "epoch": 1.7642203881329466, + "grad_norm": 0.17742502689361572, + "learning_rate": 7.363914170000858e-06, + "loss": 0.4419, + "step": 7909 + }, + { + "epoch": 1.764443453044836, + "grad_norm": 0.18832780420780182, + "learning_rate": 7.361644055240325e-06, + "loss": 0.4763, + "step": 7910 + }, + { + "epoch": 1.7646665179567254, + "grad_norm": 0.2054174244403839, + "learning_rate": 7.359374086608539e-06, + "loss": 0.4543, + "step": 7911 + }, + { + "epoch": 1.7648895828686149, + "grad_norm": 0.17511680722236633, + "learning_rate": 7.357104264231228e-06, + "loss": 0.4377, + "step": 7912 + }, + { + "epoch": 1.7651126477805041, + "grad_norm": 0.1764538437128067, + "learning_rate": 7.354834588234105e-06, + "loss": 0.4675, + "step": 7913 + }, + { + "epoch": 1.7653357126923934, + "grad_norm": 0.17360688745975494, + "learning_rate": 7.352565058742882e-06, + "loss": 0.4409, + "step": 7914 + }, + { + "epoch": 1.7655587776042827, + "grad_norm": 0.1800130158662796, + "learning_rate": 7.350295675883259e-06, + "loss": 0.466, + "step": 7915 + }, + { + "epoch": 1.7657818425161722, + "grad_norm": 0.1828777939081192, + "learning_rate": 7.34802643978093e-06, + "loss": 0.4603, + "step": 7916 + }, + { + "epoch": 1.7660049074280617, + "grad_norm": 0.180416077375412, + "learning_rate": 7.345757350561577e-06, + "loss": 0.4442, + "step": 7917 + }, + { + "epoch": 1.766227972339951, + "grad_norm": 0.18395313620567322, + "learning_rate": 7.343488408350879e-06, + "loss": 0.4801, + "step": 7918 + }, + { + "epoch": 1.7664510372518403, + "grad_norm": 0.1887856274843216, + "learning_rate": 7.3412196132745005e-06, + "loss": 0.4421, + "step": 7919 + }, + { + "epoch": 1.7666741021637296, + "grad_norm": 0.17162606120109558, + "learning_rate": 7.3389509654581045e-06, + "loss": 0.4293, + "step": 7920 + }, + { + "epoch": 1.766897167075619, + "grad_norm": 0.18713156878948212, + "learning_rate": 7.336682465027346e-06, + "loss": 0.4521, + "step": 7921 + }, + { + "epoch": 1.7671202319875083, + "grad_norm": 0.1808861941099167, + "learning_rate": 7.334414112107863e-06, + "loss": 0.4461, + "step": 7922 + }, + { + "epoch": 1.7673432968993978, + "grad_norm": 0.1840147227048874, + "learning_rate": 7.3321459068252955e-06, + "loss": 0.4606, + "step": 7923 + }, + { + "epoch": 1.7675663618112871, + "grad_norm": 0.22466281056404114, + "learning_rate": 7.329877849305267e-06, + "loss": 0.4342, + "step": 7924 + }, + { + "epoch": 1.7677894267231764, + "grad_norm": 0.18192514777183533, + "learning_rate": 7.327609939673401e-06, + "loss": 0.4638, + "step": 7925 + }, + { + "epoch": 1.7680124916350657, + "grad_norm": 0.1804400235414505, + "learning_rate": 7.325342178055304e-06, + "loss": 0.4558, + "step": 7926 + }, + { + "epoch": 1.7682355565469552, + "grad_norm": 0.1800002157688141, + "learning_rate": 7.3230745645765845e-06, + "loss": 0.4303, + "step": 7927 + }, + { + "epoch": 1.7684586214588445, + "grad_norm": 0.17886100709438324, + "learning_rate": 7.320807099362831e-06, + "loss": 0.4507, + "step": 7928 + }, + { + "epoch": 1.768681686370734, + "grad_norm": 0.1851029098033905, + "learning_rate": 7.318539782539635e-06, + "loss": 0.4693, + "step": 7929 + }, + { + "epoch": 1.7689047512826233, + "grad_norm": 0.19121414422988892, + "learning_rate": 7.316272614232572e-06, + "loss": 0.4465, + "step": 7930 + }, + { + "epoch": 1.7691278161945125, + "grad_norm": 0.18079820275306702, + "learning_rate": 7.314005594567215e-06, + "loss": 0.4475, + "step": 7931 + }, + { + "epoch": 1.7693508811064018, + "grad_norm": 0.1832534670829773, + "learning_rate": 7.311738723669122e-06, + "loss": 0.4585, + "step": 7932 + }, + { + "epoch": 1.7695739460182913, + "grad_norm": 0.18712462484836578, + "learning_rate": 7.309472001663849e-06, + "loss": 0.4578, + "step": 7933 + }, + { + "epoch": 1.7697970109301808, + "grad_norm": 0.17979365587234497, + "learning_rate": 7.307205428676939e-06, + "loss": 0.4711, + "step": 7934 + }, + { + "epoch": 1.77002007584207, + "grad_norm": 0.19163386523723602, + "learning_rate": 7.304939004833933e-06, + "loss": 0.432, + "step": 7935 + }, + { + "epoch": 1.7702431407539594, + "grad_norm": 0.20467031002044678, + "learning_rate": 7.302672730260356e-06, + "loss": 0.4606, + "step": 7936 + }, + { + "epoch": 1.7704662056658487, + "grad_norm": 0.1881827414035797, + "learning_rate": 7.300406605081732e-06, + "loss": 0.459, + "step": 7937 + }, + { + "epoch": 1.7706892705777382, + "grad_norm": 0.18381071090698242, + "learning_rate": 7.298140629423568e-06, + "loss": 0.4804, + "step": 7938 + }, + { + "epoch": 1.7709123354896275, + "grad_norm": 0.18901976943016052, + "learning_rate": 7.295874803411373e-06, + "loss": 0.4568, + "step": 7939 + }, + { + "epoch": 1.771135400401517, + "grad_norm": 0.17536242306232452, + "learning_rate": 7.293609127170642e-06, + "loss": 0.4217, + "step": 7940 + }, + { + "epoch": 1.7713584653134062, + "grad_norm": 0.17019738256931305, + "learning_rate": 7.291343600826859e-06, + "loss": 0.4405, + "step": 7941 + }, + { + "epoch": 1.7715815302252955, + "grad_norm": 0.1769004762172699, + "learning_rate": 7.2890782245055084e-06, + "loss": 0.461, + "step": 7942 + }, + { + "epoch": 1.7718045951371848, + "grad_norm": 0.18277987837791443, + "learning_rate": 7.286812998332056e-06, + "loss": 0.4691, + "step": 7943 + }, + { + "epoch": 1.7720276600490743, + "grad_norm": 0.18873664736747742, + "learning_rate": 7.2845479224319685e-06, + "loss": 0.4606, + "step": 7944 + }, + { + "epoch": 1.7722507249609636, + "grad_norm": 0.1791076958179474, + "learning_rate": 7.2822829969306955e-06, + "loss": 0.4369, + "step": 7945 + }, + { + "epoch": 1.772473789872853, + "grad_norm": 0.17617996037006378, + "learning_rate": 7.280018221953686e-06, + "loss": 0.4538, + "step": 7946 + }, + { + "epoch": 1.7726968547847424, + "grad_norm": 0.1820133924484253, + "learning_rate": 7.277753597626378e-06, + "loss": 0.4444, + "step": 7947 + }, + { + "epoch": 1.7729199196966317, + "grad_norm": 0.16976666450500488, + "learning_rate": 7.275489124074198e-06, + "loss": 0.4415, + "step": 7948 + }, + { + "epoch": 1.773142984608521, + "grad_norm": 0.19467420876026154, + "learning_rate": 7.273224801422568e-06, + "loss": 0.4527, + "step": 7949 + }, + { + "epoch": 1.7733660495204104, + "grad_norm": 0.18623554706573486, + "learning_rate": 7.270960629796903e-06, + "loss": 0.4552, + "step": 7950 + }, + { + "epoch": 1.7735891144323, + "grad_norm": 0.1663247048854828, + "learning_rate": 7.2686966093226e-06, + "loss": 0.4154, + "step": 7951 + }, + { + "epoch": 1.7738121793441892, + "grad_norm": 0.18195995688438416, + "learning_rate": 7.266432740125065e-06, + "loss": 0.4501, + "step": 7952 + }, + { + "epoch": 1.7740352442560785, + "grad_norm": 0.18873853981494904, + "learning_rate": 7.264169022329676e-06, + "loss": 0.4529, + "step": 7953 + }, + { + "epoch": 1.7742583091679678, + "grad_norm": 0.1879783272743225, + "learning_rate": 7.2619054560618175e-06, + "loss": 0.4949, + "step": 7954 + }, + { + "epoch": 1.7744813740798573, + "grad_norm": 0.17232050001621246, + "learning_rate": 7.259642041446856e-06, + "loss": 0.4307, + "step": 7955 + }, + { + "epoch": 1.7747044389917466, + "grad_norm": 0.18015244603157043, + "learning_rate": 7.25737877861016e-06, + "loss": 0.4438, + "step": 7956 + }, + { + "epoch": 1.774927503903636, + "grad_norm": 0.17779506742954254, + "learning_rate": 7.255115667677077e-06, + "loss": 0.4488, + "step": 7957 + }, + { + "epoch": 1.7751505688155254, + "grad_norm": 0.17906124889850616, + "learning_rate": 7.252852708772953e-06, + "loss": 0.4606, + "step": 7958 + }, + { + "epoch": 1.7753736337274146, + "grad_norm": 0.18236404657363892, + "learning_rate": 7.250589902023127e-06, + "loss": 0.452, + "step": 7959 + }, + { + "epoch": 1.775596698639304, + "grad_norm": 0.17182929813861847, + "learning_rate": 7.248327247552925e-06, + "loss": 0.4517, + "step": 7960 + }, + { + "epoch": 1.7758197635511934, + "grad_norm": 0.18052874505519867, + "learning_rate": 7.246064745487671e-06, + "loss": 0.4603, + "step": 7961 + }, + { + "epoch": 1.776042828463083, + "grad_norm": 0.18056614696979523, + "learning_rate": 7.243802395952673e-06, + "loss": 0.4641, + "step": 7962 + }, + { + "epoch": 1.7762658933749722, + "grad_norm": 0.17786100506782532, + "learning_rate": 7.241540199073235e-06, + "loss": 0.4782, + "step": 7963 + }, + { + "epoch": 1.7764889582868615, + "grad_norm": 0.17451272904872894, + "learning_rate": 7.239278154974654e-06, + "loss": 0.4126, + "step": 7964 + }, + { + "epoch": 1.7767120231987508, + "grad_norm": 0.1730719357728958, + "learning_rate": 7.237016263782213e-06, + "loss": 0.4368, + "step": 7965 + }, + { + "epoch": 1.77693508811064, + "grad_norm": 0.173664852976799, + "learning_rate": 7.234754525621192e-06, + "loss": 0.4113, + "step": 7966 + }, + { + "epoch": 1.7771581530225296, + "grad_norm": 0.17867377400398254, + "learning_rate": 7.232492940616858e-06, + "loss": 0.4555, + "step": 7967 + }, + { + "epoch": 1.777381217934419, + "grad_norm": 0.1810644119977951, + "learning_rate": 7.2302315088944755e-06, + "loss": 0.458, + "step": 7968 + }, + { + "epoch": 1.7776042828463083, + "grad_norm": 0.18765312433242798, + "learning_rate": 7.227970230579292e-06, + "loss": 0.4574, + "step": 7969 + }, + { + "epoch": 1.7778273477581976, + "grad_norm": 0.1742631047964096, + "learning_rate": 7.225709105796557e-06, + "loss": 0.4651, + "step": 7970 + }, + { + "epoch": 1.778050412670087, + "grad_norm": 0.22400738298892975, + "learning_rate": 7.223448134671499e-06, + "loss": 0.4561, + "step": 7971 + }, + { + "epoch": 1.7782734775819764, + "grad_norm": 0.18479710817337036, + "learning_rate": 7.221187317329353e-06, + "loss": 0.4447, + "step": 7972 + }, + { + "epoch": 1.7784965424938657, + "grad_norm": 0.17623472213745117, + "learning_rate": 7.218926653895329e-06, + "loss": 0.44, + "step": 7973 + }, + { + "epoch": 1.7787196074057552, + "grad_norm": 0.18266162276268005, + "learning_rate": 7.216666144494642e-06, + "loss": 0.4708, + "step": 7974 + }, + { + "epoch": 1.7789426723176445, + "grad_norm": 0.1808854639530182, + "learning_rate": 7.21440578925249e-06, + "loss": 0.4403, + "step": 7975 + }, + { + "epoch": 1.7791657372295338, + "grad_norm": 0.1751621812582016, + "learning_rate": 7.212145588294071e-06, + "loss": 0.4669, + "step": 7976 + }, + { + "epoch": 1.779388802141423, + "grad_norm": 0.18863114714622498, + "learning_rate": 7.2098855417445625e-06, + "loss": 0.4674, + "step": 7977 + }, + { + "epoch": 1.7796118670533125, + "grad_norm": 0.17852959036827087, + "learning_rate": 7.207625649729145e-06, + "loss": 0.4389, + "step": 7978 + }, + { + "epoch": 1.779834931965202, + "grad_norm": 0.18438269197940826, + "learning_rate": 7.205365912372982e-06, + "loss": 0.4467, + "step": 7979 + }, + { + "epoch": 1.7800579968770913, + "grad_norm": 0.179841086268425, + "learning_rate": 7.203106329801236e-06, + "loss": 0.4554, + "step": 7980 + }, + { + "epoch": 1.7802810617889806, + "grad_norm": 0.1838013380765915, + "learning_rate": 7.200846902139053e-06, + "loss": 0.4643, + "step": 7981 + }, + { + "epoch": 1.7805041267008699, + "grad_norm": 0.17783623933792114, + "learning_rate": 7.198587629511577e-06, + "loss": 0.4615, + "step": 7982 + }, + { + "epoch": 1.7807271916127592, + "grad_norm": 0.18178191781044006, + "learning_rate": 7.196328512043938e-06, + "loss": 0.4285, + "step": 7983 + }, + { + "epoch": 1.7809502565246487, + "grad_norm": 0.1845524162054062, + "learning_rate": 7.194069549861262e-06, + "loss": 0.4441, + "step": 7984 + }, + { + "epoch": 1.7811733214365382, + "grad_norm": 0.17717888951301575, + "learning_rate": 7.191810743088667e-06, + "loss": 0.465, + "step": 7985 + }, + { + "epoch": 1.7813963863484275, + "grad_norm": 0.1790652871131897, + "learning_rate": 7.189552091851254e-06, + "loss": 0.4652, + "step": 7986 + }, + { + "epoch": 1.7816194512603167, + "grad_norm": 0.17189590632915497, + "learning_rate": 7.187293596274128e-06, + "loss": 0.4241, + "step": 7987 + }, + { + "epoch": 1.781842516172206, + "grad_norm": 0.16961240768432617, + "learning_rate": 7.1850352564823715e-06, + "loss": 0.4425, + "step": 7988 + }, + { + "epoch": 1.7820655810840955, + "grad_norm": 0.17109279334545135, + "learning_rate": 7.182777072601072e-06, + "loss": 0.4479, + "step": 7989 + }, + { + "epoch": 1.7822886459959848, + "grad_norm": 0.17438296973705292, + "learning_rate": 7.180519044755298e-06, + "loss": 0.4439, + "step": 7990 + }, + { + "epoch": 1.7825117109078743, + "grad_norm": 0.17713730037212372, + "learning_rate": 7.178261173070116e-06, + "loss": 0.4643, + "step": 7991 + }, + { + "epoch": 1.7827347758197636, + "grad_norm": 0.18717728555202484, + "learning_rate": 7.176003457670577e-06, + "loss": 0.4673, + "step": 7992 + }, + { + "epoch": 1.7829578407316529, + "grad_norm": 0.18204233050346375, + "learning_rate": 7.173745898681732e-06, + "loss": 0.4605, + "step": 7993 + }, + { + "epoch": 1.7831809056435421, + "grad_norm": 0.18667717278003693, + "learning_rate": 7.171488496228617e-06, + "loss": 0.484, + "step": 7994 + }, + { + "epoch": 1.7834039705554317, + "grad_norm": 0.18115851283073425, + "learning_rate": 7.169231250436261e-06, + "loss": 0.4749, + "step": 7995 + }, + { + "epoch": 1.7836270354673212, + "grad_norm": 0.17957910895347595, + "learning_rate": 7.166974161429684e-06, + "loss": 0.4673, + "step": 7996 + }, + { + "epoch": 1.7838501003792104, + "grad_norm": 0.17228113114833832, + "learning_rate": 7.1647172293338995e-06, + "loss": 0.4658, + "step": 7997 + }, + { + "epoch": 1.7840731652910997, + "grad_norm": 0.17893186211585999, + "learning_rate": 7.162460454273909e-06, + "loss": 0.4639, + "step": 7998 + }, + { + "epoch": 1.784296230202989, + "grad_norm": 0.17566993832588196, + "learning_rate": 7.160203836374708e-06, + "loss": 0.4567, + "step": 7999 + }, + { + "epoch": 1.7845192951148783, + "grad_norm": 0.17004132270812988, + "learning_rate": 7.1579473757612795e-06, + "loss": 0.4419, + "step": 8000 + }, + { + "epoch": 1.7847423600267678, + "grad_norm": 0.17692558467388153, + "learning_rate": 7.155691072558604e-06, + "loss": 0.4546, + "step": 8001 + }, + { + "epoch": 1.7849654249386573, + "grad_norm": 0.1739339381456375, + "learning_rate": 7.153434926891648e-06, + "loss": 0.4636, + "step": 8002 + }, + { + "epoch": 1.7851884898505466, + "grad_norm": 0.17398999631404877, + "learning_rate": 7.151178938885373e-06, + "loss": 0.4291, + "step": 8003 + }, + { + "epoch": 1.7854115547624358, + "grad_norm": 0.1721528023481369, + "learning_rate": 7.148923108664725e-06, + "loss": 0.4325, + "step": 8004 + }, + { + "epoch": 1.7856346196743251, + "grad_norm": 0.18906241655349731, + "learning_rate": 7.146667436354651e-06, + "loss": 0.4211, + "step": 8005 + }, + { + "epoch": 1.7858576845862146, + "grad_norm": 0.18445979058742523, + "learning_rate": 7.144411922080082e-06, + "loss": 0.4536, + "step": 8006 + }, + { + "epoch": 1.786080749498104, + "grad_norm": 0.18223892152309418, + "learning_rate": 7.142156565965943e-06, + "loss": 0.4554, + "step": 8007 + }, + { + "epoch": 1.7863038144099934, + "grad_norm": 0.18098188936710358, + "learning_rate": 7.139901368137152e-06, + "loss": 0.4453, + "step": 8008 + }, + { + "epoch": 1.7865268793218827, + "grad_norm": 0.17738774418830872, + "learning_rate": 7.1376463287186105e-06, + "loss": 0.4302, + "step": 8009 + }, + { + "epoch": 1.786749944233772, + "grad_norm": 0.18761973083019257, + "learning_rate": 7.135391447835224e-06, + "loss": 0.4673, + "step": 8010 + }, + { + "epoch": 1.7869730091456613, + "grad_norm": 0.17969244718551636, + "learning_rate": 7.133136725611876e-06, + "loss": 0.4278, + "step": 8011 + }, + { + "epoch": 1.7871960740575508, + "grad_norm": 0.19217950105667114, + "learning_rate": 7.13088216217345e-06, + "loss": 0.4448, + "step": 8012 + }, + { + "epoch": 1.7874191389694403, + "grad_norm": 0.1748211681842804, + "learning_rate": 7.128627757644816e-06, + "loss": 0.4345, + "step": 8013 + }, + { + "epoch": 1.7876422038813296, + "grad_norm": 0.1840067058801651, + "learning_rate": 7.126373512150842e-06, + "loss": 0.4478, + "step": 8014 + }, + { + "epoch": 1.7878652687932188, + "grad_norm": 0.1767268180847168, + "learning_rate": 7.1241194258163735e-06, + "loss": 0.4493, + "step": 8015 + }, + { + "epoch": 1.7880883337051081, + "grad_norm": 0.17844226956367493, + "learning_rate": 7.121865498766265e-06, + "loss": 0.4131, + "step": 8016 + }, + { + "epoch": 1.7883113986169974, + "grad_norm": 0.22129550576210022, + "learning_rate": 7.119611731125349e-06, + "loss": 0.4441, + "step": 8017 + }, + { + "epoch": 1.788534463528887, + "grad_norm": 0.18443824350833893, + "learning_rate": 7.117358123018454e-06, + "loss": 0.4344, + "step": 8018 + }, + { + "epoch": 1.7887575284407764, + "grad_norm": 0.17921538650989532, + "learning_rate": 7.115104674570397e-06, + "loss": 0.4646, + "step": 8019 + }, + { + "epoch": 1.7889805933526657, + "grad_norm": 0.1854918748140335, + "learning_rate": 7.11285138590599e-06, + "loss": 0.4692, + "step": 8020 + }, + { + "epoch": 1.789203658264555, + "grad_norm": 0.1828143298625946, + "learning_rate": 7.110598257150034e-06, + "loss": 0.444, + "step": 8021 + }, + { + "epoch": 1.7894267231764442, + "grad_norm": 0.18005596101284027, + "learning_rate": 7.108345288427324e-06, + "loss": 0.4447, + "step": 8022 + }, + { + "epoch": 1.7896497880883337, + "grad_norm": 0.18779170513153076, + "learning_rate": 7.106092479862639e-06, + "loss": 0.4516, + "step": 8023 + }, + { + "epoch": 1.789872853000223, + "grad_norm": 0.17747437953948975, + "learning_rate": 7.103839831580754e-06, + "loss": 0.4436, + "step": 8024 + }, + { + "epoch": 1.7900959179121125, + "grad_norm": 0.1890641450881958, + "learning_rate": 7.101587343706441e-06, + "loss": 0.4624, + "step": 8025 + }, + { + "epoch": 1.7903189828240018, + "grad_norm": 0.18202100694179535, + "learning_rate": 7.099335016364449e-06, + "loss": 0.4821, + "step": 8026 + }, + { + "epoch": 1.790542047735891, + "grad_norm": 0.17150594294071198, + "learning_rate": 7.097082849679533e-06, + "loss": 0.444, + "step": 8027 + }, + { + "epoch": 1.7907651126477804, + "grad_norm": 0.18223629891872406, + "learning_rate": 7.0948308437764255e-06, + "loss": 0.4514, + "step": 8028 + }, + { + "epoch": 1.7909881775596699, + "grad_norm": 0.1873582899570465, + "learning_rate": 7.092578998779864e-06, + "loss": 0.4614, + "step": 8029 + }, + { + "epoch": 1.7912112424715594, + "grad_norm": 0.17026644945144653, + "learning_rate": 7.090327314814562e-06, + "loss": 0.4616, + "step": 8030 + }, + { + "epoch": 1.7914343073834487, + "grad_norm": 0.1856604516506195, + "learning_rate": 7.088075792005239e-06, + "loss": 0.4666, + "step": 8031 + }, + { + "epoch": 1.791657372295338, + "grad_norm": 0.18316306173801422, + "learning_rate": 7.085824430476593e-06, + "loss": 0.4768, + "step": 8032 + }, + { + "epoch": 1.7918804372072272, + "grad_norm": 0.16916830837726593, + "learning_rate": 7.083573230353324e-06, + "loss": 0.4437, + "step": 8033 + }, + { + "epoch": 1.7921035021191165, + "grad_norm": 0.17978118360042572, + "learning_rate": 7.0813221917601104e-06, + "loss": 0.4525, + "step": 8034 + }, + { + "epoch": 1.792326567031006, + "grad_norm": 0.19063124060630798, + "learning_rate": 7.079071314821637e-06, + "loss": 0.4366, + "step": 8035 + }, + { + "epoch": 1.7925496319428955, + "grad_norm": 0.1843474954366684, + "learning_rate": 7.076820599662564e-06, + "loss": 0.4635, + "step": 8036 + }, + { + "epoch": 1.7927726968547848, + "grad_norm": 0.17682790756225586, + "learning_rate": 7.074570046407556e-06, + "loss": 0.4645, + "step": 8037 + }, + { + "epoch": 1.792995761766674, + "grad_norm": 0.19309797883033752, + "learning_rate": 7.072319655181263e-06, + "loss": 0.4527, + "step": 8038 + }, + { + "epoch": 1.7932188266785634, + "grad_norm": 0.17107698321342468, + "learning_rate": 7.07006942610832e-06, + "loss": 0.4349, + "step": 8039 + }, + { + "epoch": 1.7934418915904529, + "grad_norm": 0.18476001918315887, + "learning_rate": 7.067819359313364e-06, + "loss": 0.4645, + "step": 8040 + }, + { + "epoch": 1.7936649565023421, + "grad_norm": 0.17997145652770996, + "learning_rate": 7.065569454921013e-06, + "loss": 0.4634, + "step": 8041 + }, + { + "epoch": 1.7938880214142316, + "grad_norm": 0.17692801356315613, + "learning_rate": 7.063319713055887e-06, + "loss": 0.4336, + "step": 8042 + }, + { + "epoch": 1.794111086326121, + "grad_norm": 0.17240415513515472, + "learning_rate": 7.061070133842584e-06, + "loss": 0.4452, + "step": 8043 + }, + { + "epoch": 1.7943341512380102, + "grad_norm": 0.16578581929206848, + "learning_rate": 7.058820717405707e-06, + "loss": 0.4297, + "step": 8044 + }, + { + "epoch": 1.7945572161498995, + "grad_norm": 0.1810625195503235, + "learning_rate": 7.056571463869838e-06, + "loss": 0.4628, + "step": 8045 + }, + { + "epoch": 1.794780281061789, + "grad_norm": 0.1759740114212036, + "learning_rate": 7.054322373359557e-06, + "loss": 0.4383, + "step": 8046 + }, + { + "epoch": 1.7950033459736785, + "grad_norm": 0.18540038168430328, + "learning_rate": 7.052073445999431e-06, + "loss": 0.4655, + "step": 8047 + }, + { + "epoch": 1.7952264108855678, + "grad_norm": 0.17017890512943268, + "learning_rate": 7.04982468191402e-06, + "loss": 0.4216, + "step": 8048 + }, + { + "epoch": 1.795449475797457, + "grad_norm": 0.17637605965137482, + "learning_rate": 7.047576081227878e-06, + "loss": 0.463, + "step": 8049 + }, + { + "epoch": 1.7956725407093463, + "grad_norm": 0.1885131597518921, + "learning_rate": 7.045327644065542e-06, + "loss": 0.4563, + "step": 8050 + }, + { + "epoch": 1.7958956056212356, + "grad_norm": 0.18475908041000366, + "learning_rate": 7.043079370551549e-06, + "loss": 0.4511, + "step": 8051 + }, + { + "epoch": 1.7961186705331251, + "grad_norm": 0.19659362733364105, + "learning_rate": 7.0408312608104164e-06, + "loss": 0.4794, + "step": 8052 + }, + { + "epoch": 1.7963417354450146, + "grad_norm": 0.1826995611190796, + "learning_rate": 7.038583314966666e-06, + "loss": 0.4628, + "step": 8053 + }, + { + "epoch": 1.796564800356904, + "grad_norm": 0.1809930056333542, + "learning_rate": 7.036335533144798e-06, + "loss": 0.4272, + "step": 8054 + }, + { + "epoch": 1.7967878652687932, + "grad_norm": 0.177769273519516, + "learning_rate": 7.034087915469312e-06, + "loss": 0.4509, + "step": 8055 + }, + { + "epoch": 1.7970109301806825, + "grad_norm": 0.17363698780536652, + "learning_rate": 7.031840462064691e-06, + "loss": 0.4564, + "step": 8056 + }, + { + "epoch": 1.797233995092572, + "grad_norm": 0.17677956819534302, + "learning_rate": 7.029593173055418e-06, + "loss": 0.4327, + "step": 8057 + }, + { + "epoch": 1.7974570600044613, + "grad_norm": 0.1727108657360077, + "learning_rate": 7.0273460485659576e-06, + "loss": 0.4229, + "step": 8058 + }, + { + "epoch": 1.7976801249163508, + "grad_norm": 0.1822299361228943, + "learning_rate": 7.025099088720773e-06, + "loss": 0.4397, + "step": 8059 + }, + { + "epoch": 1.79790318982824, + "grad_norm": 0.18957071006298065, + "learning_rate": 7.022852293644313e-06, + "loss": 0.4679, + "step": 8060 + }, + { + "epoch": 1.7981262547401293, + "grad_norm": 0.17512580752372742, + "learning_rate": 7.020605663461022e-06, + "loss": 0.4267, + "step": 8061 + }, + { + "epoch": 1.7983493196520186, + "grad_norm": 0.1905566304922104, + "learning_rate": 7.018359198295328e-06, + "loss": 0.4542, + "step": 8062 + }, + { + "epoch": 1.798572384563908, + "grad_norm": 0.1788644641637802, + "learning_rate": 7.016112898271658e-06, + "loss": 0.4609, + "step": 8063 + }, + { + "epoch": 1.7987954494757976, + "grad_norm": 0.1741524040699005, + "learning_rate": 7.013866763514426e-06, + "loss": 0.439, + "step": 8064 + }, + { + "epoch": 1.799018514387687, + "grad_norm": 0.169100821018219, + "learning_rate": 7.011620794148035e-06, + "loss": 0.4511, + "step": 8065 + }, + { + "epoch": 1.7992415792995762, + "grad_norm": 0.1780511438846588, + "learning_rate": 7.009374990296882e-06, + "loss": 0.4391, + "step": 8066 + }, + { + "epoch": 1.7994646442114655, + "grad_norm": 0.17674995958805084, + "learning_rate": 7.0071293520853555e-06, + "loss": 0.4662, + "step": 8067 + }, + { + "epoch": 1.7996877091233547, + "grad_norm": 0.17258092761039734, + "learning_rate": 7.004883879637829e-06, + "loss": 0.4235, + "step": 8068 + }, + { + "epoch": 1.7999107740352442, + "grad_norm": 0.17394836246967316, + "learning_rate": 7.002638573078673e-06, + "loss": 0.4728, + "step": 8069 + }, + { + "epoch": 1.8001338389471337, + "grad_norm": 0.17654062807559967, + "learning_rate": 7.000393432532249e-06, + "loss": 0.4386, + "step": 8070 + }, + { + "epoch": 1.800356903859023, + "grad_norm": 0.17876291275024414, + "learning_rate": 6.998148458122903e-06, + "loss": 0.4621, + "step": 8071 + }, + { + "epoch": 1.8005799687709123, + "grad_norm": 0.18045787513256073, + "learning_rate": 6.99590364997498e-06, + "loss": 0.4521, + "step": 8072 + }, + { + "epoch": 1.8008030336828016, + "grad_norm": 0.1908344030380249, + "learning_rate": 6.993659008212808e-06, + "loss": 0.4671, + "step": 8073 + }, + { + "epoch": 1.801026098594691, + "grad_norm": 0.18054337799549103, + "learning_rate": 6.991414532960712e-06, + "loss": 0.4476, + "step": 8074 + }, + { + "epoch": 1.8012491635065804, + "grad_norm": 0.18316704034805298, + "learning_rate": 6.989170224343003e-06, + "loss": 0.4653, + "step": 8075 + }, + { + "epoch": 1.8014722284184699, + "grad_norm": 0.17818012833595276, + "learning_rate": 6.986926082483986e-06, + "loss": 0.4487, + "step": 8076 + }, + { + "epoch": 1.8016952933303592, + "grad_norm": 0.17305941879749298, + "learning_rate": 6.9846821075079544e-06, + "loss": 0.4331, + "step": 8077 + }, + { + "epoch": 1.8019183582422484, + "grad_norm": 0.17837919294834137, + "learning_rate": 6.982438299539197e-06, + "loss": 0.433, + "step": 8078 + }, + { + "epoch": 1.8021414231541377, + "grad_norm": 0.198192298412323, + "learning_rate": 6.980194658701985e-06, + "loss": 0.4623, + "step": 8079 + }, + { + "epoch": 1.8023644880660272, + "grad_norm": 0.18458157777786255, + "learning_rate": 6.977951185120591e-06, + "loss": 0.4642, + "step": 8080 + }, + { + "epoch": 1.8025875529779167, + "grad_norm": 0.17324891686439514, + "learning_rate": 6.975707878919268e-06, + "loss": 0.4549, + "step": 8081 + }, + { + "epoch": 1.802810617889806, + "grad_norm": 0.1909075230360031, + "learning_rate": 6.973464740222268e-06, + "loss": 0.4647, + "step": 8082 + }, + { + "epoch": 1.8030336828016953, + "grad_norm": 0.17792390286922455, + "learning_rate": 6.971221769153826e-06, + "loss": 0.4563, + "step": 8083 + }, + { + "epoch": 1.8032567477135846, + "grad_norm": 0.17481090128421783, + "learning_rate": 6.968978965838176e-06, + "loss": 0.4537, + "step": 8084 + }, + { + "epoch": 1.8034798126254739, + "grad_norm": 0.19158580899238586, + "learning_rate": 6.966736330399535e-06, + "loss": 0.4493, + "step": 8085 + }, + { + "epoch": 1.8037028775373634, + "grad_norm": 0.18006612360477448, + "learning_rate": 6.964493862962118e-06, + "loss": 0.4838, + "step": 8086 + }, + { + "epoch": 1.8039259424492529, + "grad_norm": 0.18528711795806885, + "learning_rate": 6.962251563650124e-06, + "loss": 0.4397, + "step": 8087 + }, + { + "epoch": 1.8041490073611421, + "grad_norm": 0.17403841018676758, + "learning_rate": 6.960009432587747e-06, + "loss": 0.4219, + "step": 8088 + }, + { + "epoch": 1.8043720722730314, + "grad_norm": 0.17622390389442444, + "learning_rate": 6.957767469899168e-06, + "loss": 0.4645, + "step": 8089 + }, + { + "epoch": 1.8045951371849207, + "grad_norm": 0.17650552093982697, + "learning_rate": 6.955525675708564e-06, + "loss": 0.4427, + "step": 8090 + }, + { + "epoch": 1.8048182020968102, + "grad_norm": 0.1795479953289032, + "learning_rate": 6.953284050140101e-06, + "loss": 0.4268, + "step": 8091 + }, + { + "epoch": 1.8050412670086995, + "grad_norm": 0.18363459408283234, + "learning_rate": 6.951042593317929e-06, + "loss": 0.4403, + "step": 8092 + }, + { + "epoch": 1.805264331920589, + "grad_norm": 0.18579071760177612, + "learning_rate": 6.948801305366199e-06, + "loss": 0.4556, + "step": 8093 + }, + { + "epoch": 1.8054873968324783, + "grad_norm": 0.1805351972579956, + "learning_rate": 6.946560186409042e-06, + "loss": 0.4356, + "step": 8094 + }, + { + "epoch": 1.8057104617443676, + "grad_norm": 0.18544495105743408, + "learning_rate": 6.944319236570593e-06, + "loss": 0.46, + "step": 8095 + }, + { + "epoch": 1.8059335266562568, + "grad_norm": 0.1721157431602478, + "learning_rate": 6.942078455974963e-06, + "loss": 0.4562, + "step": 8096 + }, + { + "epoch": 1.8061565915681463, + "grad_norm": 0.1732361763715744, + "learning_rate": 6.9398378447462645e-06, + "loss": 0.4537, + "step": 8097 + }, + { + "epoch": 1.8063796564800358, + "grad_norm": 0.1871383637189865, + "learning_rate": 6.937597403008594e-06, + "loss": 0.4808, + "step": 8098 + }, + { + "epoch": 1.8066027213919251, + "grad_norm": 0.17318753898143768, + "learning_rate": 6.9353571308860446e-06, + "loss": 0.4451, + "step": 8099 + }, + { + "epoch": 1.8068257863038144, + "grad_norm": 0.17707256972789764, + "learning_rate": 6.933117028502693e-06, + "loss": 0.4374, + "step": 8100 + }, + { + "epoch": 1.8070488512157037, + "grad_norm": 0.18298032879829407, + "learning_rate": 6.930877095982613e-06, + "loss": 0.4588, + "step": 8101 + }, + { + "epoch": 1.807271916127593, + "grad_norm": 0.18046192824840546, + "learning_rate": 6.928637333449863e-06, + "loss": 0.4731, + "step": 8102 + }, + { + "epoch": 1.8074949810394825, + "grad_norm": 0.16831883788108826, + "learning_rate": 6.9263977410285014e-06, + "loss": 0.4308, + "step": 8103 + }, + { + "epoch": 1.807718045951372, + "grad_norm": 0.17589864134788513, + "learning_rate": 6.924158318842562e-06, + "loss": 0.4372, + "step": 8104 + }, + { + "epoch": 1.8079411108632613, + "grad_norm": 0.1834905594587326, + "learning_rate": 6.9219190670160856e-06, + "loss": 0.477, + "step": 8105 + }, + { + "epoch": 1.8081641757751505, + "grad_norm": 0.18303707242012024, + "learning_rate": 6.919679985673091e-06, + "loss": 0.4603, + "step": 8106 + }, + { + "epoch": 1.8083872406870398, + "grad_norm": 0.1804521679878235, + "learning_rate": 6.917441074937597e-06, + "loss": 0.4504, + "step": 8107 + }, + { + "epoch": 1.8086103055989293, + "grad_norm": 0.19304148852825165, + "learning_rate": 6.915202334933604e-06, + "loss": 0.4768, + "step": 8108 + }, + { + "epoch": 1.8088333705108186, + "grad_norm": 0.1749470978975296, + "learning_rate": 6.912963765785111e-06, + "loss": 0.449, + "step": 8109 + }, + { + "epoch": 1.809056435422708, + "grad_norm": 0.18221113085746765, + "learning_rate": 6.910725367616102e-06, + "loss": 0.454, + "step": 8110 + }, + { + "epoch": 1.8092795003345974, + "grad_norm": 0.1730881929397583, + "learning_rate": 6.908487140550555e-06, + "loss": 0.4506, + "step": 8111 + }, + { + "epoch": 1.8095025652464867, + "grad_norm": 0.17381657660007477, + "learning_rate": 6.9062490847124375e-06, + "loss": 0.4499, + "step": 8112 + }, + { + "epoch": 1.809725630158376, + "grad_norm": 0.17647477984428406, + "learning_rate": 6.904011200225704e-06, + "loss": 0.4394, + "step": 8113 + }, + { + "epoch": 1.8099486950702655, + "grad_norm": 0.17345494031906128, + "learning_rate": 6.901773487214308e-06, + "loss": 0.463, + "step": 8114 + }, + { + "epoch": 1.810171759982155, + "grad_norm": 0.178610160946846, + "learning_rate": 6.899535945802182e-06, + "loss": 0.4519, + "step": 8115 + }, + { + "epoch": 1.8103948248940442, + "grad_norm": 0.17938841879367828, + "learning_rate": 6.89729857611326e-06, + "loss": 0.4326, + "step": 8116 + }, + { + "epoch": 1.8106178898059335, + "grad_norm": 0.19250169396400452, + "learning_rate": 6.895061378271457e-06, + "loss": 0.4666, + "step": 8117 + }, + { + "epoch": 1.8108409547178228, + "grad_norm": 0.17775999009609222, + "learning_rate": 6.892824352400689e-06, + "loss": 0.4465, + "step": 8118 + }, + { + "epoch": 1.811064019629712, + "grad_norm": 0.1746588945388794, + "learning_rate": 6.890587498624854e-06, + "loss": 0.4471, + "step": 8119 + }, + { + "epoch": 1.8112870845416016, + "grad_norm": 0.2063416689634323, + "learning_rate": 6.88835081706784e-06, + "loss": 0.4341, + "step": 8120 + }, + { + "epoch": 1.811510149453491, + "grad_norm": 0.1776486337184906, + "learning_rate": 6.8861143078535334e-06, + "loss": 0.4288, + "step": 8121 + }, + { + "epoch": 1.8117332143653804, + "grad_norm": 0.1845918595790863, + "learning_rate": 6.883877971105799e-06, + "loss": 0.4622, + "step": 8122 + }, + { + "epoch": 1.8119562792772697, + "grad_norm": 0.174449160695076, + "learning_rate": 6.881641806948507e-06, + "loss": 0.4518, + "step": 8123 + }, + { + "epoch": 1.812179344189159, + "grad_norm": 0.16784648597240448, + "learning_rate": 6.879405815505504e-06, + "loss": 0.4349, + "step": 8124 + }, + { + "epoch": 1.8124024091010484, + "grad_norm": 0.19099272787570953, + "learning_rate": 6.877169996900639e-06, + "loss": 0.4694, + "step": 8125 + }, + { + "epoch": 1.8126254740129377, + "grad_norm": 0.17356911301612854, + "learning_rate": 6.874934351257742e-06, + "loss": 0.4486, + "step": 8126 + }, + { + "epoch": 1.8128485389248272, + "grad_norm": 0.1857564002275467, + "learning_rate": 6.872698878700639e-06, + "loss": 0.4687, + "step": 8127 + }, + { + "epoch": 1.8130716038367165, + "grad_norm": 0.18476644158363342, + "learning_rate": 6.870463579353141e-06, + "loss": 0.4673, + "step": 8128 + }, + { + "epoch": 1.8132946687486058, + "grad_norm": 0.17950499057769775, + "learning_rate": 6.868228453339058e-06, + "loss": 0.439, + "step": 8129 + }, + { + "epoch": 1.813517733660495, + "grad_norm": 0.17879453301429749, + "learning_rate": 6.865993500782182e-06, + "loss": 0.461, + "step": 8130 + }, + { + "epoch": 1.8137407985723846, + "grad_norm": 0.1743541806936264, + "learning_rate": 6.8637587218063e-06, + "loss": 0.4458, + "step": 8131 + }, + { + "epoch": 1.813963863484274, + "grad_norm": 0.18763816356658936, + "learning_rate": 6.861524116535187e-06, + "loss": 0.4482, + "step": 8132 + }, + { + "epoch": 1.8141869283961634, + "grad_norm": 0.17765846848487854, + "learning_rate": 6.85928968509261e-06, + "loss": 0.4387, + "step": 8133 + }, + { + "epoch": 1.8144099933080526, + "grad_norm": 0.1837596446275711, + "learning_rate": 6.857055427602327e-06, + "loss": 0.4789, + "step": 8134 + }, + { + "epoch": 1.814633058219942, + "grad_norm": 0.18249496817588806, + "learning_rate": 6.854821344188083e-06, + "loss": 0.4511, + "step": 8135 + }, + { + "epoch": 1.8148561231318312, + "grad_norm": 0.17417484521865845, + "learning_rate": 6.852587434973619e-06, + "loss": 0.4265, + "step": 8136 + }, + { + "epoch": 1.8150791880437207, + "grad_norm": 0.17669354379177094, + "learning_rate": 6.850353700082659e-06, + "loss": 0.4448, + "step": 8137 + }, + { + "epoch": 1.8153022529556102, + "grad_norm": 0.17180798947811127, + "learning_rate": 6.8481201396389254e-06, + "loss": 0.4516, + "step": 8138 + }, + { + "epoch": 1.8155253178674995, + "grad_norm": 0.17706137895584106, + "learning_rate": 6.845886753766122e-06, + "loss": 0.4411, + "step": 8139 + }, + { + "epoch": 1.8157483827793888, + "grad_norm": 0.18320493400096893, + "learning_rate": 6.843653542587952e-06, + "loss": 0.4633, + "step": 8140 + }, + { + "epoch": 1.815971447691278, + "grad_norm": 0.1703965961933136, + "learning_rate": 6.8414205062281025e-06, + "loss": 0.435, + "step": 8141 + }, + { + "epoch": 1.8161945126031676, + "grad_norm": 0.2011895328760147, + "learning_rate": 6.839187644810255e-06, + "loss": 0.4653, + "step": 8142 + }, + { + "epoch": 1.8164175775150568, + "grad_norm": 0.17818677425384521, + "learning_rate": 6.836954958458075e-06, + "loss": 0.4564, + "step": 8143 + }, + { + "epoch": 1.8166406424269463, + "grad_norm": 0.17748653888702393, + "learning_rate": 6.834722447295228e-06, + "loss": 0.4357, + "step": 8144 + }, + { + "epoch": 1.8168637073388356, + "grad_norm": 0.1691962629556656, + "learning_rate": 6.832490111445361e-06, + "loss": 0.4411, + "step": 8145 + }, + { + "epoch": 1.817086772250725, + "grad_norm": 0.1772109568119049, + "learning_rate": 6.830257951032118e-06, + "loss": 0.4365, + "step": 8146 + }, + { + "epoch": 1.8173098371626142, + "grad_norm": 0.17656628787517548, + "learning_rate": 6.828025966179126e-06, + "loss": 0.4387, + "step": 8147 + }, + { + "epoch": 1.8175329020745037, + "grad_norm": 0.19161885976791382, + "learning_rate": 6.825794157010011e-06, + "loss": 0.4518, + "step": 8148 + }, + { + "epoch": 1.8177559669863932, + "grad_norm": 0.18089298903942108, + "learning_rate": 6.823562523648379e-06, + "loss": 0.4414, + "step": 8149 + }, + { + "epoch": 1.8179790318982825, + "grad_norm": 0.1826842874288559, + "learning_rate": 6.8213310662178375e-06, + "loss": 0.4651, + "step": 8150 + }, + { + "epoch": 1.8182020968101718, + "grad_norm": 0.1827784776687622, + "learning_rate": 6.819099784841974e-06, + "loss": 0.4793, + "step": 8151 + }, + { + "epoch": 1.818425161722061, + "grad_norm": 0.193526491522789, + "learning_rate": 6.816868679644375e-06, + "loss": 0.4758, + "step": 8152 + }, + { + "epoch": 1.8186482266339503, + "grad_norm": 0.18154491484165192, + "learning_rate": 6.814637750748609e-06, + "loss": 0.4674, + "step": 8153 + }, + { + "epoch": 1.8188712915458398, + "grad_norm": 0.18000057339668274, + "learning_rate": 6.812406998278242e-06, + "loss": 0.4513, + "step": 8154 + }, + { + "epoch": 1.8190943564577293, + "grad_norm": 0.17618589103221893, + "learning_rate": 6.8101764223568286e-06, + "loss": 0.4684, + "step": 8155 + }, + { + "epoch": 1.8193174213696186, + "grad_norm": 0.1722210943698883, + "learning_rate": 6.807946023107906e-06, + "loss": 0.4327, + "step": 8156 + }, + { + "epoch": 1.8195404862815079, + "grad_norm": 0.18736113607883453, + "learning_rate": 6.805715800655015e-06, + "loss": 0.4647, + "step": 8157 + }, + { + "epoch": 1.8197635511933972, + "grad_norm": 0.17818038165569305, + "learning_rate": 6.803485755121672e-06, + "loss": 0.4543, + "step": 8158 + }, + { + "epoch": 1.8199866161052867, + "grad_norm": 0.18235214054584503, + "learning_rate": 6.801255886631399e-06, + "loss": 0.4679, + "step": 8159 + }, + { + "epoch": 1.820209681017176, + "grad_norm": 0.1716981828212738, + "learning_rate": 6.799026195307693e-06, + "loss": 0.4515, + "step": 8160 + }, + { + "epoch": 1.8204327459290655, + "grad_norm": 0.17725342512130737, + "learning_rate": 6.796796681274054e-06, + "loss": 0.4501, + "step": 8161 + }, + { + "epoch": 1.8206558108409547, + "grad_norm": 0.18533247709274292, + "learning_rate": 6.79456734465396e-06, + "loss": 0.4674, + "step": 8162 + }, + { + "epoch": 1.820878875752844, + "grad_norm": 0.17556942999362946, + "learning_rate": 6.792338185570893e-06, + "loss": 0.4552, + "step": 8163 + }, + { + "epoch": 1.8211019406647333, + "grad_norm": 0.1827983409166336, + "learning_rate": 6.790109204148311e-06, + "loss": 0.435, + "step": 8164 + }, + { + "epoch": 1.8213250055766228, + "grad_norm": 0.18367482721805573, + "learning_rate": 6.787880400509674e-06, + "loss": 0.4447, + "step": 8165 + }, + { + "epoch": 1.8215480704885123, + "grad_norm": 0.17707587778568268, + "learning_rate": 6.785651774778425e-06, + "loss": 0.4555, + "step": 8166 + }, + { + "epoch": 1.8217711354004016, + "grad_norm": 0.17791809141635895, + "learning_rate": 6.7834233270780005e-06, + "loss": 0.4766, + "step": 8167 + }, + { + "epoch": 1.8219942003122909, + "grad_norm": 0.1814098060131073, + "learning_rate": 6.781195057531824e-06, + "loss": 0.4634, + "step": 8168 + }, + { + "epoch": 1.8222172652241801, + "grad_norm": 0.1738249808549881, + "learning_rate": 6.778966966263314e-06, + "loss": 0.4515, + "step": 8169 + }, + { + "epoch": 1.8224403301360694, + "grad_norm": 0.17479370534420013, + "learning_rate": 6.7767390533958735e-06, + "loss": 0.4376, + "step": 8170 + }, + { + "epoch": 1.822663395047959, + "grad_norm": 0.17913582921028137, + "learning_rate": 6.774511319052899e-06, + "loss": 0.4571, + "step": 8171 + }, + { + "epoch": 1.8228864599598484, + "grad_norm": 0.1840064972639084, + "learning_rate": 6.7722837633577766e-06, + "loss": 0.4191, + "step": 8172 + }, + { + "epoch": 1.8231095248717377, + "grad_norm": 0.1903713196516037, + "learning_rate": 6.770056386433884e-06, + "loss": 0.4352, + "step": 8173 + }, + { + "epoch": 1.823332589783627, + "grad_norm": 0.18376924097537994, + "learning_rate": 6.767829188404585e-06, + "loss": 0.4649, + "step": 8174 + }, + { + "epoch": 1.8235556546955163, + "grad_norm": 0.17748649418354034, + "learning_rate": 6.765602169393235e-06, + "loss": 0.4321, + "step": 8175 + }, + { + "epoch": 1.8237787196074058, + "grad_norm": 0.19591280817985535, + "learning_rate": 6.763375329523185e-06, + "loss": 0.4656, + "step": 8176 + }, + { + "epoch": 1.824001784519295, + "grad_norm": 0.1707315593957901, + "learning_rate": 6.761148668917766e-06, + "loss": 0.4413, + "step": 8177 + }, + { + "epoch": 1.8242248494311846, + "grad_norm": 0.17590689659118652, + "learning_rate": 6.758922187700309e-06, + "loss": 0.4682, + "step": 8178 + }, + { + "epoch": 1.8244479143430739, + "grad_norm": 0.17584408819675446, + "learning_rate": 6.756695885994126e-06, + "loss": 0.4626, + "step": 8179 + }, + { + "epoch": 1.8246709792549631, + "grad_norm": 0.1750144511461258, + "learning_rate": 6.754469763922529e-06, + "loss": 0.4217, + "step": 8180 + }, + { + "epoch": 1.8248940441668524, + "grad_norm": 0.18190471827983856, + "learning_rate": 6.752243821608808e-06, + "loss": 0.448, + "step": 8181 + }, + { + "epoch": 1.825117109078742, + "grad_norm": 0.1814090758562088, + "learning_rate": 6.7500180591762575e-06, + "loss": 0.442, + "step": 8182 + }, + { + "epoch": 1.8253401739906314, + "grad_norm": 0.18113404512405396, + "learning_rate": 6.747792476748147e-06, + "loss": 0.4522, + "step": 8183 + }, + { + "epoch": 1.8255632389025207, + "grad_norm": 0.1728588193655014, + "learning_rate": 6.7455670744477484e-06, + "loss": 0.4394, + "step": 8184 + }, + { + "epoch": 1.82578630381441, + "grad_norm": 0.17083577811717987, + "learning_rate": 6.743341852398315e-06, + "loss": 0.4432, + "step": 8185 + }, + { + "epoch": 1.8260093687262993, + "grad_norm": 0.17789053916931152, + "learning_rate": 6.741116810723096e-06, + "loss": 0.4453, + "step": 8186 + }, + { + "epoch": 1.8262324336381888, + "grad_norm": 0.18853406608104706, + "learning_rate": 6.738891949545325e-06, + "loss": 0.4398, + "step": 8187 + }, + { + "epoch": 1.826455498550078, + "grad_norm": 0.1743546426296234, + "learning_rate": 6.736667268988235e-06, + "loss": 0.4292, + "step": 8188 + }, + { + "epoch": 1.8266785634619676, + "grad_norm": 0.18301378190517426, + "learning_rate": 6.734442769175036e-06, + "loss": 0.4604, + "step": 8189 + }, + { + "epoch": 1.8269016283738568, + "grad_norm": 0.18097054958343506, + "learning_rate": 6.732218450228938e-06, + "loss": 0.4585, + "step": 8190 + }, + { + "epoch": 1.8271246932857461, + "grad_norm": 0.17369534075260162, + "learning_rate": 6.729994312273137e-06, + "loss": 0.4206, + "step": 8191 + }, + { + "epoch": 1.8273477581976354, + "grad_norm": 0.19749490916728973, + "learning_rate": 6.727770355430822e-06, + "loss": 0.4724, + "step": 8192 + }, + { + "epoch": 1.827570823109525, + "grad_norm": 0.17227788269519806, + "learning_rate": 6.725546579825165e-06, + "loss": 0.4397, + "step": 8193 + }, + { + "epoch": 1.8277938880214142, + "grad_norm": 0.17980121076107025, + "learning_rate": 6.723322985579338e-06, + "loss": 0.4568, + "step": 8194 + }, + { + "epoch": 1.8280169529333037, + "grad_norm": 0.1813877820968628, + "learning_rate": 6.721099572816494e-06, + "loss": 0.4348, + "step": 8195 + }, + { + "epoch": 1.828240017845193, + "grad_norm": 0.1828600913286209, + "learning_rate": 6.718876341659779e-06, + "loss": 0.4685, + "step": 8196 + }, + { + "epoch": 1.8284630827570822, + "grad_norm": 0.175664484500885, + "learning_rate": 6.716653292232334e-06, + "loss": 0.4359, + "step": 8197 + }, + { + "epoch": 1.8286861476689715, + "grad_norm": 0.18827195465564728, + "learning_rate": 6.714430424657281e-06, + "loss": 0.4545, + "step": 8198 + }, + { + "epoch": 1.828909212580861, + "grad_norm": 0.18291041254997253, + "learning_rate": 6.7122077390577415e-06, + "loss": 0.455, + "step": 8199 + }, + { + "epoch": 1.8291322774927505, + "grad_norm": 0.17457644641399384, + "learning_rate": 6.709985235556819e-06, + "loss": 0.4355, + "step": 8200 + }, + { + "epoch": 1.8293553424046398, + "grad_norm": 0.19204942882061005, + "learning_rate": 6.707762914277604e-06, + "loss": 0.4779, + "step": 8201 + }, + { + "epoch": 1.829578407316529, + "grad_norm": 0.17864084243774414, + "learning_rate": 6.705540775343192e-06, + "loss": 0.4301, + "step": 8202 + }, + { + "epoch": 1.8298014722284184, + "grad_norm": 0.17506073415279388, + "learning_rate": 6.703318818876652e-06, + "loss": 0.4389, + "step": 8203 + }, + { + "epoch": 1.8300245371403079, + "grad_norm": 0.1823255568742752, + "learning_rate": 6.701097045001055e-06, + "loss": 0.4644, + "step": 8204 + }, + { + "epoch": 1.8302476020521972, + "grad_norm": 0.192154660820961, + "learning_rate": 6.698875453839453e-06, + "loss": 0.447, + "step": 8205 + }, + { + "epoch": 1.8304706669640867, + "grad_norm": 0.18675589561462402, + "learning_rate": 6.696654045514894e-06, + "loss": 0.4493, + "step": 8206 + }, + { + "epoch": 1.830693731875976, + "grad_norm": 0.193002849817276, + "learning_rate": 6.694432820150412e-06, + "loss": 0.4124, + "step": 8207 + }, + { + "epoch": 1.8309167967878652, + "grad_norm": 0.1812351495027542, + "learning_rate": 6.692211777869033e-06, + "loss": 0.4555, + "step": 8208 + }, + { + "epoch": 1.8311398616997545, + "grad_norm": 0.18915650248527527, + "learning_rate": 6.689990918793773e-06, + "loss": 0.4678, + "step": 8209 + }, + { + "epoch": 1.831362926611644, + "grad_norm": 0.18118512630462646, + "learning_rate": 6.6877702430476374e-06, + "loss": 0.4425, + "step": 8210 + }, + { + "epoch": 1.8315859915235333, + "grad_norm": 0.17849402129650116, + "learning_rate": 6.68554975075362e-06, + "loss": 0.453, + "step": 8211 + }, + { + "epoch": 1.8318090564354228, + "grad_norm": 0.17353329062461853, + "learning_rate": 6.683329442034707e-06, + "loss": 0.4514, + "step": 8212 + }, + { + "epoch": 1.832032121347312, + "grad_norm": 0.1685749590396881, + "learning_rate": 6.681109317013869e-06, + "loss": 0.4358, + "step": 8213 + }, + { + "epoch": 1.8322551862592014, + "grad_norm": 0.17803844809532166, + "learning_rate": 6.678889375814077e-06, + "loss": 0.4495, + "step": 8214 + }, + { + "epoch": 1.8324782511710906, + "grad_norm": 0.18683882057666779, + "learning_rate": 6.67666961855828e-06, + "loss": 0.4501, + "step": 8215 + }, + { + "epoch": 1.8327013160829801, + "grad_norm": 0.18386538326740265, + "learning_rate": 6.674450045369427e-06, + "loss": 0.4349, + "step": 8216 + }, + { + "epoch": 1.8329243809948697, + "grad_norm": 0.20062412321567535, + "learning_rate": 6.672230656370446e-06, + "loss": 0.4646, + "step": 8217 + }, + { + "epoch": 1.833147445906759, + "grad_norm": 0.17725883424282074, + "learning_rate": 6.670011451684266e-06, + "loss": 0.452, + "step": 8218 + }, + { + "epoch": 1.8333705108186482, + "grad_norm": 0.16746068000793457, + "learning_rate": 6.667792431433796e-06, + "loss": 0.4277, + "step": 8219 + }, + { + "epoch": 1.8335935757305375, + "grad_norm": 0.18574824929237366, + "learning_rate": 6.665573595741941e-06, + "loss": 0.4381, + "step": 8220 + }, + { + "epoch": 1.833816640642427, + "grad_norm": 0.18854008615016937, + "learning_rate": 6.663354944731598e-06, + "loss": 0.4505, + "step": 8221 + }, + { + "epoch": 1.8340397055543163, + "grad_norm": 0.18151527643203735, + "learning_rate": 6.661136478525644e-06, + "loss": 0.4744, + "step": 8222 + }, + { + "epoch": 1.8342627704662058, + "grad_norm": 0.17463651299476624, + "learning_rate": 6.6589181972469556e-06, + "loss": 0.4667, + "step": 8223 + }, + { + "epoch": 1.834485835378095, + "grad_norm": 0.17236928641796112, + "learning_rate": 6.656700101018392e-06, + "loss": 0.4546, + "step": 8224 + }, + { + "epoch": 1.8347089002899843, + "grad_norm": 0.1812690645456314, + "learning_rate": 6.65448218996281e-06, + "loss": 0.4311, + "step": 8225 + }, + { + "epoch": 1.8349319652018736, + "grad_norm": 0.2300224006175995, + "learning_rate": 6.6522644642030445e-06, + "loss": 0.4551, + "step": 8226 + }, + { + "epoch": 1.8351550301137631, + "grad_norm": 0.17556323111057281, + "learning_rate": 6.650046923861935e-06, + "loss": 0.4333, + "step": 8227 + }, + { + "epoch": 1.8353780950256524, + "grad_norm": 0.17821857333183289, + "learning_rate": 6.647829569062295e-06, + "loss": 0.4486, + "step": 8228 + }, + { + "epoch": 1.835601159937542, + "grad_norm": 0.23370222747325897, + "learning_rate": 6.645612399926941e-06, + "loss": 0.4446, + "step": 8229 + }, + { + "epoch": 1.8358242248494312, + "grad_norm": 0.1826113909482956, + "learning_rate": 6.643395416578673e-06, + "loss": 0.4362, + "step": 8230 + }, + { + "epoch": 1.8360472897613205, + "grad_norm": 0.1840929388999939, + "learning_rate": 6.641178619140282e-06, + "loss": 0.4662, + "step": 8231 + }, + { + "epoch": 1.8362703546732098, + "grad_norm": 0.19260503351688385, + "learning_rate": 6.638962007734544e-06, + "loss": 0.4709, + "step": 8232 + }, + { + "epoch": 1.8364934195850993, + "grad_norm": 0.17979590594768524, + "learning_rate": 6.636745582484234e-06, + "loss": 0.4509, + "step": 8233 + }, + { + "epoch": 1.8367164844969888, + "grad_norm": 0.17919979989528656, + "learning_rate": 6.634529343512107e-06, + "loss": 0.4582, + "step": 8234 + }, + { + "epoch": 1.836939549408878, + "grad_norm": 0.1846793293952942, + "learning_rate": 6.632313290940917e-06, + "loss": 0.4356, + "step": 8235 + }, + { + "epoch": 1.8371626143207673, + "grad_norm": 0.18610712885856628, + "learning_rate": 6.630097424893398e-06, + "loss": 0.4555, + "step": 8236 + }, + { + "epoch": 1.8373856792326566, + "grad_norm": 0.17965011298656464, + "learning_rate": 6.6278817454922835e-06, + "loss": 0.4494, + "step": 8237 + }, + { + "epoch": 1.8376087441445461, + "grad_norm": 0.1982518583536148, + "learning_rate": 6.625666252860288e-06, + "loss": 0.4644, + "step": 8238 + }, + { + "epoch": 1.8378318090564354, + "grad_norm": 0.18403403460979462, + "learning_rate": 6.6234509471201205e-06, + "loss": 0.4501, + "step": 8239 + }, + { + "epoch": 1.838054873968325, + "grad_norm": 0.17158350348472595, + "learning_rate": 6.621235828394481e-06, + "loss": 0.4386, + "step": 8240 + }, + { + "epoch": 1.8382779388802142, + "grad_norm": 0.1909009963274002, + "learning_rate": 6.619020896806052e-06, + "loss": 0.4464, + "step": 8241 + }, + { + "epoch": 1.8385010037921035, + "grad_norm": 0.18062900006771088, + "learning_rate": 6.616806152477515e-06, + "loss": 0.4657, + "step": 8242 + }, + { + "epoch": 1.8387240687039927, + "grad_norm": 0.1820714771747589, + "learning_rate": 6.614591595531533e-06, + "loss": 0.4723, + "step": 8243 + }, + { + "epoch": 1.8389471336158822, + "grad_norm": 0.18476656079292297, + "learning_rate": 6.612377226090764e-06, + "loss": 0.4457, + "step": 8244 + }, + { + "epoch": 1.8391701985277715, + "grad_norm": 0.17424450814723969, + "learning_rate": 6.610163044277853e-06, + "loss": 0.444, + "step": 8245 + }, + { + "epoch": 1.839393263439661, + "grad_norm": 0.19445692002773285, + "learning_rate": 6.607949050215438e-06, + "loss": 0.4446, + "step": 8246 + }, + { + "epoch": 1.8396163283515503, + "grad_norm": 0.18586407601833344, + "learning_rate": 6.605735244026138e-06, + "loss": 0.4495, + "step": 8247 + }, + { + "epoch": 1.8398393932634396, + "grad_norm": 0.17516399919986725, + "learning_rate": 6.6035216258325745e-06, + "loss": 0.4261, + "step": 8248 + }, + { + "epoch": 1.8400624581753289, + "grad_norm": 0.17480303347110748, + "learning_rate": 6.601308195757343e-06, + "loss": 0.4283, + "step": 8249 + }, + { + "epoch": 1.8402855230872184, + "grad_norm": 0.1745281219482422, + "learning_rate": 6.599094953923046e-06, + "loss": 0.4384, + "step": 8250 + }, + { + "epoch": 1.8405085879991079, + "grad_norm": 0.18595942854881287, + "learning_rate": 6.5968819004522625e-06, + "loss": 0.4519, + "step": 8251 + }, + { + "epoch": 1.8407316529109972, + "grad_norm": 0.17765408754348755, + "learning_rate": 6.594669035467567e-06, + "loss": 0.4463, + "step": 8252 + }, + { + "epoch": 1.8409547178228864, + "grad_norm": 0.17620229721069336, + "learning_rate": 6.592456359091517e-06, + "loss": 0.4545, + "step": 8253 + }, + { + "epoch": 1.8411777827347757, + "grad_norm": 0.18198491632938385, + "learning_rate": 6.5902438714466706e-06, + "loss": 0.4706, + "step": 8254 + }, + { + "epoch": 1.8414008476466652, + "grad_norm": 0.1813773214817047, + "learning_rate": 6.588031572655566e-06, + "loss": 0.4435, + "step": 8255 + }, + { + "epoch": 1.8416239125585545, + "grad_norm": 0.1799456626176834, + "learning_rate": 6.585819462840737e-06, + "loss": 0.4108, + "step": 8256 + }, + { + "epoch": 1.841846977470444, + "grad_norm": 0.18853728473186493, + "learning_rate": 6.583607542124699e-06, + "loss": 0.4527, + "step": 8257 + }, + { + "epoch": 1.8420700423823333, + "grad_norm": 0.1802626997232437, + "learning_rate": 6.581395810629969e-06, + "loss": 0.4413, + "step": 8258 + }, + { + "epoch": 1.8422931072942226, + "grad_norm": 0.18371166288852692, + "learning_rate": 6.57918426847904e-06, + "loss": 0.4408, + "step": 8259 + }, + { + "epoch": 1.8425161722061119, + "grad_norm": 0.17911924421787262, + "learning_rate": 6.576972915794404e-06, + "loss": 0.444, + "step": 8260 + }, + { + "epoch": 1.8427392371180014, + "grad_norm": 0.1887340396642685, + "learning_rate": 6.574761752698542e-06, + "loss": 0.4522, + "step": 8261 + }, + { + "epoch": 1.8429623020298909, + "grad_norm": 0.2037588208913803, + "learning_rate": 6.572550779313916e-06, + "loss": 0.4419, + "step": 8262 + }, + { + "epoch": 1.8431853669417801, + "grad_norm": 0.17989180982112885, + "learning_rate": 6.570339995762991e-06, + "loss": 0.4453, + "step": 8263 + }, + { + "epoch": 1.8434084318536694, + "grad_norm": 0.17799720168113708, + "learning_rate": 6.5681294021682084e-06, + "loss": 0.4333, + "step": 8264 + }, + { + "epoch": 1.8436314967655587, + "grad_norm": 0.17597989737987518, + "learning_rate": 6.5659189986520085e-06, + "loss": 0.4441, + "step": 8265 + }, + { + "epoch": 1.843854561677448, + "grad_norm": 0.1881999969482422, + "learning_rate": 6.563708785336813e-06, + "loss": 0.4674, + "step": 8266 + }, + { + "epoch": 1.8440776265893375, + "grad_norm": 0.18646754324436188, + "learning_rate": 6.561498762345044e-06, + "loss": 0.4533, + "step": 8267 + }, + { + "epoch": 1.844300691501227, + "grad_norm": 0.1898210197687149, + "learning_rate": 6.559288929799099e-06, + "loss": 0.4494, + "step": 8268 + }, + { + "epoch": 1.8445237564131163, + "grad_norm": 0.1805214136838913, + "learning_rate": 6.557079287821378e-06, + "loss": 0.4533, + "step": 8269 + }, + { + "epoch": 1.8447468213250056, + "grad_norm": 0.17660482227802277, + "learning_rate": 6.554869836534261e-06, + "loss": 0.4237, + "step": 8270 + }, + { + "epoch": 1.8449698862368948, + "grad_norm": 0.1824430525302887, + "learning_rate": 6.552660576060126e-06, + "loss": 0.4767, + "step": 8271 + }, + { + "epoch": 1.8451929511487843, + "grad_norm": 0.1871567666530609, + "learning_rate": 6.55045150652133e-06, + "loss": 0.4576, + "step": 8272 + }, + { + "epoch": 1.8454160160606736, + "grad_norm": 0.17943458259105682, + "learning_rate": 6.548242628040231e-06, + "loss": 0.4475, + "step": 8273 + }, + { + "epoch": 1.8456390809725631, + "grad_norm": 0.1877557635307312, + "learning_rate": 6.5460339407391645e-06, + "loss": 0.4419, + "step": 8274 + }, + { + "epoch": 1.8458621458844524, + "grad_norm": 0.17813464999198914, + "learning_rate": 6.543825444740468e-06, + "loss": 0.4647, + "step": 8275 + }, + { + "epoch": 1.8460852107963417, + "grad_norm": 0.18218432366847992, + "learning_rate": 6.5416171401664565e-06, + "loss": 0.4735, + "step": 8276 + }, + { + "epoch": 1.846308275708231, + "grad_norm": 0.1764039546251297, + "learning_rate": 6.539409027139443e-06, + "loss": 0.4525, + "step": 8277 + }, + { + "epoch": 1.8465313406201205, + "grad_norm": 0.1932554543018341, + "learning_rate": 6.537201105781723e-06, + "loss": 0.4911, + "step": 8278 + }, + { + "epoch": 1.84675440553201, + "grad_norm": 0.1814078986644745, + "learning_rate": 6.53499337621559e-06, + "loss": 0.4461, + "step": 8279 + }, + { + "epoch": 1.8469774704438993, + "grad_norm": 0.173579141497612, + "learning_rate": 6.5327858385633184e-06, + "loss": 0.4262, + "step": 8280 + }, + { + "epoch": 1.8472005353557885, + "grad_norm": 0.18026985228061676, + "learning_rate": 6.530578492947177e-06, + "loss": 0.4385, + "step": 8281 + }, + { + "epoch": 1.8474236002676778, + "grad_norm": 0.18310683965682983, + "learning_rate": 6.528371339489428e-06, + "loss": 0.4574, + "step": 8282 + }, + { + "epoch": 1.847646665179567, + "grad_norm": 0.18035286664962769, + "learning_rate": 6.526164378312305e-06, + "loss": 0.4579, + "step": 8283 + }, + { + "epoch": 1.8478697300914566, + "grad_norm": 0.20232632756233215, + "learning_rate": 6.523957609538049e-06, + "loss": 0.4492, + "step": 8284 + }, + { + "epoch": 1.8480927950033461, + "grad_norm": 0.180943563580513, + "learning_rate": 6.5217510332888904e-06, + "loss": 0.4339, + "step": 8285 + }, + { + "epoch": 1.8483158599152354, + "grad_norm": 0.17553845047950745, + "learning_rate": 6.519544649687034e-06, + "loss": 0.4313, + "step": 8286 + }, + { + "epoch": 1.8485389248271247, + "grad_norm": 0.17904485762119293, + "learning_rate": 6.517338458854692e-06, + "loss": 0.4484, + "step": 8287 + }, + { + "epoch": 1.848761989739014, + "grad_norm": 0.17891845107078552, + "learning_rate": 6.515132460914049e-06, + "loss": 0.4655, + "step": 8288 + }, + { + "epoch": 1.8489850546509035, + "grad_norm": 0.19273404777050018, + "learning_rate": 6.512926655987295e-06, + "loss": 0.4718, + "step": 8289 + }, + { + "epoch": 1.8492081195627927, + "grad_norm": 0.21862982213497162, + "learning_rate": 6.510721044196593e-06, + "loss": 0.4678, + "step": 8290 + }, + { + "epoch": 1.8494311844746822, + "grad_norm": 0.17714935541152954, + "learning_rate": 6.508515625664111e-06, + "loss": 0.4626, + "step": 8291 + }, + { + "epoch": 1.8496542493865715, + "grad_norm": 0.1834956407546997, + "learning_rate": 6.5063104005119945e-06, + "loss": 0.4463, + "step": 8292 + }, + { + "epoch": 1.8498773142984608, + "grad_norm": 0.17443230748176575, + "learning_rate": 6.504105368862386e-06, + "loss": 0.4563, + "step": 8293 + }, + { + "epoch": 1.85010037921035, + "grad_norm": 0.17853355407714844, + "learning_rate": 6.50190053083741e-06, + "loss": 0.4588, + "step": 8294 + }, + { + "epoch": 1.8503234441222396, + "grad_norm": 0.17415203154087067, + "learning_rate": 6.4996958865591895e-06, + "loss": 0.4303, + "step": 8295 + }, + { + "epoch": 1.850546509034129, + "grad_norm": 0.169075608253479, + "learning_rate": 6.497491436149826e-06, + "loss": 0.4292, + "step": 8296 + }, + { + "epoch": 1.8507695739460184, + "grad_norm": 0.1862117052078247, + "learning_rate": 6.49528717973142e-06, + "loss": 0.4308, + "step": 8297 + }, + { + "epoch": 1.8509926388579077, + "grad_norm": 0.1834726333618164, + "learning_rate": 6.4930831174260535e-06, + "loss": 0.4493, + "step": 8298 + }, + { + "epoch": 1.851215703769797, + "grad_norm": 0.18994340300559998, + "learning_rate": 6.490879249355806e-06, + "loss": 0.4688, + "step": 8299 + }, + { + "epoch": 1.8514387686816862, + "grad_norm": 0.18322892487049103, + "learning_rate": 6.4886755756427365e-06, + "loss": 0.4473, + "step": 8300 + }, + { + "epoch": 1.8516618335935757, + "grad_norm": 0.1817985475063324, + "learning_rate": 6.486472096408903e-06, + "loss": 0.4633, + "step": 8301 + }, + { + "epoch": 1.8518848985054652, + "grad_norm": 0.17432516813278198, + "learning_rate": 6.484268811776343e-06, + "loss": 0.4574, + "step": 8302 + }, + { + "epoch": 1.8521079634173545, + "grad_norm": 0.17498242855072021, + "learning_rate": 6.482065721867095e-06, + "loss": 0.4379, + "step": 8303 + }, + { + "epoch": 1.8523310283292438, + "grad_norm": 0.18785923719406128, + "learning_rate": 6.479862826803172e-06, + "loss": 0.4572, + "step": 8304 + }, + { + "epoch": 1.852554093241133, + "grad_norm": 0.1780933290719986, + "learning_rate": 6.477660126706588e-06, + "loss": 0.4161, + "step": 8305 + }, + { + "epoch": 1.8527771581530226, + "grad_norm": 0.17613643407821655, + "learning_rate": 6.4754576216993455e-06, + "loss": 0.4323, + "step": 8306 + }, + { + "epoch": 1.8530002230649119, + "grad_norm": 0.17086435854434967, + "learning_rate": 6.473255311903428e-06, + "loss": 0.4497, + "step": 8307 + }, + { + "epoch": 1.8532232879768014, + "grad_norm": 0.18054986000061035, + "learning_rate": 6.471053197440817e-06, + "loss": 0.4697, + "step": 8308 + }, + { + "epoch": 1.8534463528886906, + "grad_norm": 0.17696300148963928, + "learning_rate": 6.468851278433476e-06, + "loss": 0.4704, + "step": 8309 + }, + { + "epoch": 1.85366941780058, + "grad_norm": 0.18155822157859802, + "learning_rate": 6.466649555003366e-06, + "loss": 0.4576, + "step": 8310 + }, + { + "epoch": 1.8538924827124692, + "grad_norm": 0.18807131052017212, + "learning_rate": 6.464448027272427e-06, + "loss": 0.4651, + "step": 8311 + }, + { + "epoch": 1.8541155476243587, + "grad_norm": 0.1846548169851303, + "learning_rate": 6.462246695362597e-06, + "loss": 0.4697, + "step": 8312 + }, + { + "epoch": 1.8543386125362482, + "grad_norm": 0.18467946350574493, + "learning_rate": 6.460045559395797e-06, + "loss": 0.4474, + "step": 8313 + }, + { + "epoch": 1.8545616774481375, + "grad_norm": 0.18795856833457947, + "learning_rate": 6.457844619493943e-06, + "loss": 0.461, + "step": 8314 + }, + { + "epoch": 1.8547847423600268, + "grad_norm": 0.1844158172607422, + "learning_rate": 6.455643875778932e-06, + "loss": 0.4456, + "step": 8315 + }, + { + "epoch": 1.855007807271916, + "grad_norm": 0.19735929369926453, + "learning_rate": 6.453443328372662e-06, + "loss": 0.4785, + "step": 8316 + }, + { + "epoch": 1.8552308721838053, + "grad_norm": 0.187907874584198, + "learning_rate": 6.451242977397006e-06, + "loss": 0.4334, + "step": 8317 + }, + { + "epoch": 1.8554539370956948, + "grad_norm": 0.18026615679264069, + "learning_rate": 6.449042822973839e-06, + "loss": 0.4454, + "step": 8318 + }, + { + "epoch": 1.8556770020075843, + "grad_norm": 0.17745928466320038, + "learning_rate": 6.446842865225014e-06, + "loss": 0.4621, + "step": 8319 + }, + { + "epoch": 1.8559000669194736, + "grad_norm": 0.17665286362171173, + "learning_rate": 6.4446431042723835e-06, + "loss": 0.4459, + "step": 8320 + }, + { + "epoch": 1.856123131831363, + "grad_norm": 0.18266570568084717, + "learning_rate": 6.44244354023778e-06, + "loss": 0.466, + "step": 8321 + }, + { + "epoch": 1.8563461967432522, + "grad_norm": 0.27143698930740356, + "learning_rate": 6.440244173243033e-06, + "loss": 0.4728, + "step": 8322 + }, + { + "epoch": 1.8565692616551417, + "grad_norm": 0.18305523693561554, + "learning_rate": 6.438045003409954e-06, + "loss": 0.4272, + "step": 8323 + }, + { + "epoch": 1.856792326567031, + "grad_norm": 0.17935436964035034, + "learning_rate": 6.435846030860349e-06, + "loss": 0.4616, + "step": 8324 + }, + { + "epoch": 1.8570153914789205, + "grad_norm": 0.17304593324661255, + "learning_rate": 6.433647255716011e-06, + "loss": 0.4485, + "step": 8325 + }, + { + "epoch": 1.8572384563908098, + "grad_norm": 0.1776076853275299, + "learning_rate": 6.43144867809872e-06, + "loss": 0.4431, + "step": 8326 + }, + { + "epoch": 1.857461521302699, + "grad_norm": 0.18469847738742828, + "learning_rate": 6.4292502981302494e-06, + "loss": 0.4633, + "step": 8327 + }, + { + "epoch": 1.8576845862145883, + "grad_norm": 0.18084385991096497, + "learning_rate": 6.427052115932357e-06, + "loss": 0.4558, + "step": 8328 + }, + { + "epoch": 1.8579076511264778, + "grad_norm": 0.1821753978729248, + "learning_rate": 6.424854131626796e-06, + "loss": 0.4565, + "step": 8329 + }, + { + "epoch": 1.8581307160383673, + "grad_norm": 0.17873086035251617, + "learning_rate": 6.422656345335299e-06, + "loss": 0.4521, + "step": 8330 + }, + { + "epoch": 1.8583537809502566, + "grad_norm": 0.2390872985124588, + "learning_rate": 6.420458757179597e-06, + "loss": 0.4443, + "step": 8331 + }, + { + "epoch": 1.8585768458621459, + "grad_norm": 0.1892087161540985, + "learning_rate": 6.418261367281404e-06, + "loss": 0.4496, + "step": 8332 + }, + { + "epoch": 1.8587999107740352, + "grad_norm": 0.1815430074930191, + "learning_rate": 6.416064175762428e-06, + "loss": 0.4725, + "step": 8333 + }, + { + "epoch": 1.8590229756859245, + "grad_norm": 0.17400890588760376, + "learning_rate": 6.41386718274436e-06, + "loss": 0.4264, + "step": 8334 + }, + { + "epoch": 1.859246040597814, + "grad_norm": 0.17337702214717865, + "learning_rate": 6.4116703883488875e-06, + "loss": 0.4363, + "step": 8335 + }, + { + "epoch": 1.8594691055097035, + "grad_norm": 0.1814270168542862, + "learning_rate": 6.409473792697678e-06, + "loss": 0.4413, + "step": 8336 + }, + { + "epoch": 1.8596921704215927, + "grad_norm": 0.17769312858581543, + "learning_rate": 6.407277395912397e-06, + "loss": 0.4651, + "step": 8337 + }, + { + "epoch": 1.859915235333482, + "grad_norm": 0.18968099355697632, + "learning_rate": 6.405081198114692e-06, + "loss": 0.4984, + "step": 8338 + }, + { + "epoch": 1.8601383002453713, + "grad_norm": 0.17772215604782104, + "learning_rate": 6.4028851994262045e-06, + "loss": 0.4267, + "step": 8339 + }, + { + "epoch": 1.8603613651572608, + "grad_norm": 0.18239617347717285, + "learning_rate": 6.400689399968559e-06, + "loss": 0.47, + "step": 8340 + }, + { + "epoch": 1.86058443006915, + "grad_norm": 0.1779201775789261, + "learning_rate": 6.398493799863378e-06, + "loss": 0.4684, + "step": 8341 + }, + { + "epoch": 1.8608074949810396, + "grad_norm": 0.18368324637413025, + "learning_rate": 6.396298399232261e-06, + "loss": 0.4599, + "step": 8342 + }, + { + "epoch": 1.8610305598929289, + "grad_norm": 0.1744937300682068, + "learning_rate": 6.39410319819681e-06, + "loss": 0.4278, + "step": 8343 + }, + { + "epoch": 1.8612536248048182, + "grad_norm": 0.17699581384658813, + "learning_rate": 6.391908196878605e-06, + "loss": 0.4393, + "step": 8344 + }, + { + "epoch": 1.8614766897167074, + "grad_norm": 0.17797012627124786, + "learning_rate": 6.389713395399219e-06, + "loss": 0.4653, + "step": 8345 + }, + { + "epoch": 1.861699754628597, + "grad_norm": 0.17617610096931458, + "learning_rate": 6.3875187938802164e-06, + "loss": 0.4577, + "step": 8346 + }, + { + "epoch": 1.8619228195404864, + "grad_norm": 0.17390653491020203, + "learning_rate": 6.385324392443144e-06, + "loss": 0.4215, + "step": 8347 + }, + { + "epoch": 1.8621458844523757, + "grad_norm": 0.1813625991344452, + "learning_rate": 6.3831301912095465e-06, + "loss": 0.429, + "step": 8348 + }, + { + "epoch": 1.862368949364265, + "grad_norm": 0.17084379494190216, + "learning_rate": 6.380936190300948e-06, + "loss": 0.4402, + "step": 8349 + }, + { + "epoch": 1.8625920142761543, + "grad_norm": 0.18061068654060364, + "learning_rate": 6.37874238983887e-06, + "loss": 0.4657, + "step": 8350 + }, + { + "epoch": 1.8628150791880436, + "grad_norm": 0.18320946395397186, + "learning_rate": 6.376548789944817e-06, + "loss": 0.4526, + "step": 8351 + }, + { + "epoch": 1.863038144099933, + "grad_norm": 0.17092865705490112, + "learning_rate": 6.374355390740284e-06, + "loss": 0.4259, + "step": 8352 + }, + { + "epoch": 1.8632612090118226, + "grad_norm": 0.17330855131149292, + "learning_rate": 6.372162192346754e-06, + "loss": 0.4313, + "step": 8353 + }, + { + "epoch": 1.8634842739237119, + "grad_norm": 0.17183774709701538, + "learning_rate": 6.369969194885703e-06, + "loss": 0.4601, + "step": 8354 + }, + { + "epoch": 1.8637073388356011, + "grad_norm": 0.19768626987934113, + "learning_rate": 6.367776398478591e-06, + "loss": 0.491, + "step": 8355 + }, + { + "epoch": 1.8639304037474904, + "grad_norm": 0.1920434832572937, + "learning_rate": 6.365583803246872e-06, + "loss": 0.444, + "step": 8356 + }, + { + "epoch": 1.86415346865938, + "grad_norm": 0.18218182027339935, + "learning_rate": 6.3633914093119804e-06, + "loss": 0.5027, + "step": 8357 + }, + { + "epoch": 1.8643765335712692, + "grad_norm": 0.18595336377620697, + "learning_rate": 6.36119921679535e-06, + "loss": 0.4463, + "step": 8358 + }, + { + "epoch": 1.8645995984831587, + "grad_norm": 0.19364967942237854, + "learning_rate": 6.359007225818395e-06, + "loss": 0.4637, + "step": 8359 + }, + { + "epoch": 1.864822663395048, + "grad_norm": 0.18406571447849274, + "learning_rate": 6.356815436502524e-06, + "loss": 0.4745, + "step": 8360 + }, + { + "epoch": 1.8650457283069373, + "grad_norm": 0.18429051339626312, + "learning_rate": 6.354623848969128e-06, + "loss": 0.4379, + "step": 8361 + }, + { + "epoch": 1.8652687932188265, + "grad_norm": 0.18177488446235657, + "learning_rate": 6.352432463339596e-06, + "loss": 0.4565, + "step": 8362 + }, + { + "epoch": 1.865491858130716, + "grad_norm": 0.1876797080039978, + "learning_rate": 6.350241279735301e-06, + "loss": 0.4532, + "step": 8363 + }, + { + "epoch": 1.8657149230426056, + "grad_norm": 0.18419890105724335, + "learning_rate": 6.3480502982775974e-06, + "loss": 0.4485, + "step": 8364 + }, + { + "epoch": 1.8659379879544948, + "grad_norm": 0.1855155974626541, + "learning_rate": 6.3458595190878414e-06, + "loss": 0.455, + "step": 8365 + }, + { + "epoch": 1.8661610528663841, + "grad_norm": 0.18031683564186096, + "learning_rate": 6.343668942287369e-06, + "loss": 0.4502, + "step": 8366 + }, + { + "epoch": 1.8663841177782734, + "grad_norm": 0.18513405323028564, + "learning_rate": 6.341478567997512e-06, + "loss": 0.458, + "step": 8367 + }, + { + "epoch": 1.8666071826901627, + "grad_norm": 0.1804891675710678, + "learning_rate": 6.3392883963395826e-06, + "loss": 0.4391, + "step": 8368 + }, + { + "epoch": 1.8668302476020522, + "grad_norm": 0.17720703780651093, + "learning_rate": 6.337098427434889e-06, + "loss": 0.4451, + "step": 8369 + }, + { + "epoch": 1.8670533125139417, + "grad_norm": 0.17895428836345673, + "learning_rate": 6.334908661404726e-06, + "loss": 0.4603, + "step": 8370 + }, + { + "epoch": 1.867276377425831, + "grad_norm": 0.1729314774274826, + "learning_rate": 6.3327190983703745e-06, + "loss": 0.4544, + "step": 8371 + }, + { + "epoch": 1.8674994423377203, + "grad_norm": 0.17181377112865448, + "learning_rate": 6.330529738453109e-06, + "loss": 0.4406, + "step": 8372 + }, + { + "epoch": 1.8677225072496095, + "grad_norm": 0.17984220385551453, + "learning_rate": 6.328340581774187e-06, + "loss": 0.4796, + "step": 8373 + }, + { + "epoch": 1.867945572161499, + "grad_norm": 0.1723097413778305, + "learning_rate": 6.326151628454862e-06, + "loss": 0.4341, + "step": 8374 + }, + { + "epoch": 1.8681686370733883, + "grad_norm": 0.17812368273735046, + "learning_rate": 6.323962878616366e-06, + "loss": 0.4558, + "step": 8375 + }, + { + "epoch": 1.8683917019852778, + "grad_norm": 0.2598254680633545, + "learning_rate": 6.3217743323799305e-06, + "loss": 0.4445, + "step": 8376 + }, + { + "epoch": 1.868614766897167, + "grad_norm": 0.17721040546894073, + "learning_rate": 6.3195859898667675e-06, + "loss": 0.4308, + "step": 8377 + }, + { + "epoch": 1.8688378318090564, + "grad_norm": 0.1813073456287384, + "learning_rate": 6.317397851198087e-06, + "loss": 0.4672, + "step": 8378 + }, + { + "epoch": 1.8690608967209457, + "grad_norm": 0.17863677442073822, + "learning_rate": 6.315209916495076e-06, + "loss": 0.4606, + "step": 8379 + }, + { + "epoch": 1.8692839616328352, + "grad_norm": 0.1793542504310608, + "learning_rate": 6.313022185878918e-06, + "loss": 0.4879, + "step": 8380 + }, + { + "epoch": 1.8695070265447247, + "grad_norm": 0.18399956822395325, + "learning_rate": 6.310834659470781e-06, + "loss": 0.4591, + "step": 8381 + }, + { + "epoch": 1.869730091456614, + "grad_norm": 0.18076615035533905, + "learning_rate": 6.308647337391831e-06, + "loss": 0.4385, + "step": 8382 + }, + { + "epoch": 1.8699531563685032, + "grad_norm": 0.1910838931798935, + "learning_rate": 6.306460219763207e-06, + "loss": 0.4298, + "step": 8383 + }, + { + "epoch": 1.8701762212803925, + "grad_norm": 0.191563680768013, + "learning_rate": 6.304273306706052e-06, + "loss": 0.4685, + "step": 8384 + }, + { + "epoch": 1.8703992861922818, + "grad_norm": 0.1867865025997162, + "learning_rate": 6.302086598341486e-06, + "loss": 0.4638, + "step": 8385 + }, + { + "epoch": 1.8706223511041713, + "grad_norm": 0.18163664638996124, + "learning_rate": 6.2999000947906275e-06, + "loss": 0.4394, + "step": 8386 + }, + { + "epoch": 1.8708454160160608, + "grad_norm": 0.1798231601715088, + "learning_rate": 6.297713796174574e-06, + "loss": 0.4885, + "step": 8387 + }, + { + "epoch": 1.87106848092795, + "grad_norm": 0.19126291573047638, + "learning_rate": 6.295527702614422e-06, + "loss": 0.46, + "step": 8388 + }, + { + "epoch": 1.8712915458398394, + "grad_norm": 0.17811056971549988, + "learning_rate": 6.293341814231244e-06, + "loss": 0.4518, + "step": 8389 + }, + { + "epoch": 1.8715146107517286, + "grad_norm": 0.1750759333372116, + "learning_rate": 6.2911561311461126e-06, + "loss": 0.445, + "step": 8390 + }, + { + "epoch": 1.8717376756636182, + "grad_norm": 0.17805348336696625, + "learning_rate": 6.288970653480087e-06, + "loss": 0.4423, + "step": 8391 + }, + { + "epoch": 1.8719607405755074, + "grad_norm": 0.18071311712265015, + "learning_rate": 6.286785381354205e-06, + "loss": 0.4472, + "step": 8392 + }, + { + "epoch": 1.872183805487397, + "grad_norm": 0.1815258264541626, + "learning_rate": 6.284600314889511e-06, + "loss": 0.4749, + "step": 8393 + }, + { + "epoch": 1.8724068703992862, + "grad_norm": 0.1812569499015808, + "learning_rate": 6.28241545420702e-06, + "loss": 0.4406, + "step": 8394 + }, + { + "epoch": 1.8726299353111755, + "grad_norm": 0.1827443242073059, + "learning_rate": 6.280230799427747e-06, + "loss": 0.4698, + "step": 8395 + }, + { + "epoch": 1.8728530002230648, + "grad_norm": 0.1790318340063095, + "learning_rate": 6.278046350672689e-06, + "loss": 0.4462, + "step": 8396 + }, + { + "epoch": 1.8730760651349543, + "grad_norm": 0.18236975371837616, + "learning_rate": 6.275862108062837e-06, + "loss": 0.4655, + "step": 8397 + }, + { + "epoch": 1.8732991300468438, + "grad_norm": 0.2039221227169037, + "learning_rate": 6.273678071719166e-06, + "loss": 0.4689, + "step": 8398 + }, + { + "epoch": 1.873522194958733, + "grad_norm": 0.181179016828537, + "learning_rate": 6.2714942417626445e-06, + "loss": 0.436, + "step": 8399 + }, + { + "epoch": 1.8737452598706223, + "grad_norm": 0.1835191249847412, + "learning_rate": 6.269310618314223e-06, + "loss": 0.4576, + "step": 8400 + }, + { + "epoch": 1.8739683247825116, + "grad_norm": 0.19027259945869446, + "learning_rate": 6.267127201494849e-06, + "loss": 0.4814, + "step": 8401 + }, + { + "epoch": 1.874191389694401, + "grad_norm": 0.17249448597431183, + "learning_rate": 6.264943991425449e-06, + "loss": 0.4466, + "step": 8402 + }, + { + "epoch": 1.8744144546062904, + "grad_norm": 0.18657323718070984, + "learning_rate": 6.262760988226948e-06, + "loss": 0.4782, + "step": 8403 + }, + { + "epoch": 1.87463751951818, + "grad_norm": 0.1854049414396286, + "learning_rate": 6.260578192020249e-06, + "loss": 0.4693, + "step": 8404 + }, + { + "epoch": 1.8748605844300692, + "grad_norm": 0.1833823025226593, + "learning_rate": 6.2583956029262526e-06, + "loss": 0.4455, + "step": 8405 + }, + { + "epoch": 1.8750836493419585, + "grad_norm": 0.2055628001689911, + "learning_rate": 6.256213221065843e-06, + "loss": 0.4378, + "step": 8406 + }, + { + "epoch": 1.8753067142538478, + "grad_norm": 0.21103620529174805, + "learning_rate": 6.254031046559896e-06, + "loss": 0.4435, + "step": 8407 + }, + { + "epoch": 1.8755297791657373, + "grad_norm": 0.17623326182365417, + "learning_rate": 6.251849079529271e-06, + "loss": 0.4526, + "step": 8408 + }, + { + "epoch": 1.8757528440776265, + "grad_norm": 0.1766071766614914, + "learning_rate": 6.2496673200948225e-06, + "loss": 0.4459, + "step": 8409 + }, + { + "epoch": 1.875975908989516, + "grad_norm": 0.175866037607193, + "learning_rate": 6.247485768377386e-06, + "loss": 0.456, + "step": 8410 + }, + { + "epoch": 1.8761989739014053, + "grad_norm": 0.17706961929798126, + "learning_rate": 6.245304424497792e-06, + "loss": 0.4631, + "step": 8411 + }, + { + "epoch": 1.8764220388132946, + "grad_norm": 0.17484982311725616, + "learning_rate": 6.24312328857686e-06, + "loss": 0.4689, + "step": 8412 + }, + { + "epoch": 1.876645103725184, + "grad_norm": 0.18998976051807404, + "learning_rate": 6.240942360735388e-06, + "loss": 0.4627, + "step": 8413 + }, + { + "epoch": 1.8768681686370734, + "grad_norm": 0.1831061840057373, + "learning_rate": 6.238761641094178e-06, + "loss": 0.4503, + "step": 8414 + }, + { + "epoch": 1.877091233548963, + "grad_norm": 0.1826712042093277, + "learning_rate": 6.236581129774003e-06, + "loss": 0.4729, + "step": 8415 + }, + { + "epoch": 1.8773142984608522, + "grad_norm": 0.1810642033815384, + "learning_rate": 6.234400826895641e-06, + "loss": 0.4462, + "step": 8416 + }, + { + "epoch": 1.8775373633727415, + "grad_norm": 0.16975516080856323, + "learning_rate": 6.232220732579845e-06, + "loss": 0.4159, + "step": 8417 + }, + { + "epoch": 1.8777604282846307, + "grad_norm": 0.1825772076845169, + "learning_rate": 6.230040846947368e-06, + "loss": 0.47, + "step": 8418 + }, + { + "epoch": 1.87798349319652, + "grad_norm": 0.18529464304447174, + "learning_rate": 6.227861170118941e-06, + "loss": 0.4473, + "step": 8419 + }, + { + "epoch": 1.8782065581084095, + "grad_norm": 0.17210659384727478, + "learning_rate": 6.2256817022152915e-06, + "loss": 0.4412, + "step": 8420 + }, + { + "epoch": 1.878429623020299, + "grad_norm": 0.19293324649333954, + "learning_rate": 6.223502443357129e-06, + "loss": 0.4592, + "step": 8421 + }, + { + "epoch": 1.8786526879321883, + "grad_norm": 0.16933956742286682, + "learning_rate": 6.2213233936651586e-06, + "loss": 0.433, + "step": 8422 + }, + { + "epoch": 1.8788757528440776, + "grad_norm": 0.1740327775478363, + "learning_rate": 6.219144553260065e-06, + "loss": 0.4452, + "step": 8423 + }, + { + "epoch": 1.8790988177559669, + "grad_norm": 0.18426866829395294, + "learning_rate": 6.21696592226253e-06, + "loss": 0.4658, + "step": 8424 + }, + { + "epoch": 1.8793218826678564, + "grad_norm": 0.2005191296339035, + "learning_rate": 6.214787500793218e-06, + "loss": 0.4762, + "step": 8425 + }, + { + "epoch": 1.8795449475797457, + "grad_norm": 0.18926943838596344, + "learning_rate": 6.212609288972785e-06, + "loss": 0.4584, + "step": 8426 + }, + { + "epoch": 1.8797680124916352, + "grad_norm": 0.18263809382915497, + "learning_rate": 6.210431286921872e-06, + "loss": 0.4344, + "step": 8427 + }, + { + "epoch": 1.8799910774035244, + "grad_norm": 0.18587824702262878, + "learning_rate": 6.208253494761113e-06, + "loss": 0.457, + "step": 8428 + }, + { + "epoch": 1.8802141423154137, + "grad_norm": 0.1803131103515625, + "learning_rate": 6.2060759126111246e-06, + "loss": 0.4588, + "step": 8429 + }, + { + "epoch": 1.880437207227303, + "grad_norm": 0.17640437185764313, + "learning_rate": 6.203898540592517e-06, + "loss": 0.4593, + "step": 8430 + }, + { + "epoch": 1.8806602721391925, + "grad_norm": 0.17931349575519562, + "learning_rate": 6.201721378825889e-06, + "loss": 0.4371, + "step": 8431 + }, + { + "epoch": 1.880883337051082, + "grad_norm": 0.19341467320919037, + "learning_rate": 6.199544427431821e-06, + "loss": 0.4422, + "step": 8432 + }, + { + "epoch": 1.8811064019629713, + "grad_norm": 0.17641760408878326, + "learning_rate": 6.19736768653089e-06, + "loss": 0.4522, + "step": 8433 + }, + { + "epoch": 1.8813294668748606, + "grad_norm": 0.18412084877490997, + "learning_rate": 6.195191156243654e-06, + "loss": 0.4819, + "step": 8434 + }, + { + "epoch": 1.8815525317867499, + "grad_norm": 0.18774092197418213, + "learning_rate": 6.193014836690667e-06, + "loss": 0.4648, + "step": 8435 + }, + { + "epoch": 1.8817755966986391, + "grad_norm": 0.1818709373474121, + "learning_rate": 6.190838727992463e-06, + "loss": 0.4621, + "step": 8436 + }, + { + "epoch": 1.8819986616105286, + "grad_norm": 0.1864043027162552, + "learning_rate": 6.188662830269574e-06, + "loss": 0.4569, + "step": 8437 + }, + { + "epoch": 1.8822217265224181, + "grad_norm": 0.18635167181491852, + "learning_rate": 6.186487143642508e-06, + "loss": 0.4451, + "step": 8438 + }, + { + "epoch": 1.8824447914343074, + "grad_norm": 0.18478180468082428, + "learning_rate": 6.184311668231775e-06, + "loss": 0.4505, + "step": 8439 + }, + { + "epoch": 1.8826678563461967, + "grad_norm": 0.1775161474943161, + "learning_rate": 6.1821364041578615e-06, + "loss": 0.4432, + "step": 8440 + }, + { + "epoch": 1.882890921258086, + "grad_norm": 0.18662866950035095, + "learning_rate": 6.179961351541252e-06, + "loss": 0.4602, + "step": 8441 + }, + { + "epoch": 1.8831139861699755, + "grad_norm": 0.1838211715221405, + "learning_rate": 6.177786510502408e-06, + "loss": 0.4519, + "step": 8442 + }, + { + "epoch": 1.8833370510818648, + "grad_norm": 0.1826631873846054, + "learning_rate": 6.1756118811617935e-06, + "loss": 0.4505, + "step": 8443 + }, + { + "epoch": 1.8835601159937543, + "grad_norm": 0.18082208931446075, + "learning_rate": 6.173437463639846e-06, + "loss": 0.4717, + "step": 8444 + }, + { + "epoch": 1.8837831809056436, + "grad_norm": 0.18472546339035034, + "learning_rate": 6.171263258057008e-06, + "loss": 0.4626, + "step": 8445 + }, + { + "epoch": 1.8840062458175328, + "grad_norm": 0.1825518012046814, + "learning_rate": 6.16908926453369e-06, + "loss": 0.4531, + "step": 8446 + }, + { + "epoch": 1.8842293107294221, + "grad_norm": 0.23177875578403473, + "learning_rate": 6.166915483190304e-06, + "loss": 0.4379, + "step": 8447 + }, + { + "epoch": 1.8844523756413116, + "grad_norm": 0.17296954989433289, + "learning_rate": 6.164741914147252e-06, + "loss": 0.4359, + "step": 8448 + }, + { + "epoch": 1.8846754405532011, + "grad_norm": 0.18288472294807434, + "learning_rate": 6.162568557524918e-06, + "loss": 0.4547, + "step": 8449 + }, + { + "epoch": 1.8848985054650904, + "grad_norm": 0.17033147811889648, + "learning_rate": 6.160395413443674e-06, + "loss": 0.4164, + "step": 8450 + }, + { + "epoch": 1.8851215703769797, + "grad_norm": 0.1902470588684082, + "learning_rate": 6.1582224820238836e-06, + "loss": 0.4366, + "step": 8451 + }, + { + "epoch": 1.885344635288869, + "grad_norm": 0.17565543949604034, + "learning_rate": 6.156049763385901e-06, + "loss": 0.4693, + "step": 8452 + }, + { + "epoch": 1.8855677002007583, + "grad_norm": 0.1908363252878189, + "learning_rate": 6.1538772576500584e-06, + "loss": 0.4484, + "step": 8453 + }, + { + "epoch": 1.8857907651126478, + "grad_norm": 0.1861358880996704, + "learning_rate": 6.151704964936687e-06, + "loss": 0.4531, + "step": 8454 + }, + { + "epoch": 1.8860138300245373, + "grad_norm": 0.18762946128845215, + "learning_rate": 6.149532885366103e-06, + "loss": 0.4676, + "step": 8455 + }, + { + "epoch": 1.8862368949364265, + "grad_norm": 0.19913704693317413, + "learning_rate": 6.147361019058606e-06, + "loss": 0.4429, + "step": 8456 + }, + { + "epoch": 1.8864599598483158, + "grad_norm": 0.19710618257522583, + "learning_rate": 6.145189366134492e-06, + "loss": 0.4722, + "step": 8457 + }, + { + "epoch": 1.886683024760205, + "grad_norm": 0.1801493912935257, + "learning_rate": 6.143017926714036e-06, + "loss": 0.4598, + "step": 8458 + }, + { + "epoch": 1.8869060896720946, + "grad_norm": 0.2561694383621216, + "learning_rate": 6.140846700917513e-06, + "loss": 0.4218, + "step": 8459 + }, + { + "epoch": 1.887129154583984, + "grad_norm": 0.17985545098781586, + "learning_rate": 6.138675688865171e-06, + "loss": 0.4269, + "step": 8460 + }, + { + "epoch": 1.8873522194958734, + "grad_norm": 0.1875736266374588, + "learning_rate": 6.13650489067726e-06, + "loss": 0.4695, + "step": 8461 + }, + { + "epoch": 1.8875752844077627, + "grad_norm": 0.18572880327701569, + "learning_rate": 6.134334306474009e-06, + "loss": 0.4448, + "step": 8462 + }, + { + "epoch": 1.887798349319652, + "grad_norm": 0.17786027491092682, + "learning_rate": 6.132163936375641e-06, + "loss": 0.4505, + "step": 8463 + }, + { + "epoch": 1.8880214142315412, + "grad_norm": 0.18378950655460358, + "learning_rate": 6.129993780502364e-06, + "loss": 0.4583, + "step": 8464 + }, + { + "epoch": 1.8882444791434307, + "grad_norm": 0.1798183172941208, + "learning_rate": 6.127823838974375e-06, + "loss": 0.4491, + "step": 8465 + }, + { + "epoch": 1.8884675440553202, + "grad_norm": 0.18183192610740662, + "learning_rate": 6.125654111911857e-06, + "loss": 0.4707, + "step": 8466 + }, + { + "epoch": 1.8886906089672095, + "grad_norm": 0.17841511964797974, + "learning_rate": 6.1234845994349875e-06, + "loss": 0.4645, + "step": 8467 + }, + { + "epoch": 1.8889136738790988, + "grad_norm": 0.17050506174564362, + "learning_rate": 6.121315301663923e-06, + "loss": 0.4241, + "step": 8468 + }, + { + "epoch": 1.889136738790988, + "grad_norm": 0.18663549423217773, + "learning_rate": 6.119146218718817e-06, + "loss": 0.4736, + "step": 8469 + }, + { + "epoch": 1.8893598037028774, + "grad_norm": 0.19147509336471558, + "learning_rate": 6.116977350719801e-06, + "loss": 0.4557, + "step": 8470 + }, + { + "epoch": 1.8895828686147669, + "grad_norm": 0.18618586659431458, + "learning_rate": 6.114808697787009e-06, + "loss": 0.4469, + "step": 8471 + }, + { + "epoch": 1.8898059335266564, + "grad_norm": 0.18382149934768677, + "learning_rate": 6.112640260040548e-06, + "loss": 0.4903, + "step": 8472 + }, + { + "epoch": 1.8900289984385457, + "grad_norm": 0.20091551542282104, + "learning_rate": 6.110472037600523e-06, + "loss": 0.4578, + "step": 8473 + }, + { + "epoch": 1.890252063350435, + "grad_norm": 0.1820792704820633, + "learning_rate": 6.10830403058702e-06, + "loss": 0.4645, + "step": 8474 + }, + { + "epoch": 1.8904751282623242, + "grad_norm": 0.2219545990228653, + "learning_rate": 6.106136239120121e-06, + "loss": 0.4705, + "step": 8475 + }, + { + "epoch": 1.8906981931742137, + "grad_norm": 0.17899756133556366, + "learning_rate": 6.103968663319893e-06, + "loss": 0.4646, + "step": 8476 + }, + { + "epoch": 1.890921258086103, + "grad_norm": 0.17997188866138458, + "learning_rate": 6.101801303306383e-06, + "loss": 0.4517, + "step": 8477 + }, + { + "epoch": 1.8911443229979925, + "grad_norm": 0.16982750594615936, + "learning_rate": 6.099634159199641e-06, + "loss": 0.4286, + "step": 8478 + }, + { + "epoch": 1.8913673879098818, + "grad_norm": 0.17612092196941376, + "learning_rate": 6.0974672311196916e-06, + "loss": 0.4476, + "step": 8479 + }, + { + "epoch": 1.891590452821771, + "grad_norm": 0.18440189957618713, + "learning_rate": 6.095300519186555e-06, + "loss": 0.4456, + "step": 8480 + }, + { + "epoch": 1.8918135177336604, + "grad_norm": 0.16765691339969635, + "learning_rate": 6.093134023520237e-06, + "loss": 0.4215, + "step": 8481 + }, + { + "epoch": 1.8920365826455499, + "grad_norm": 0.1789095401763916, + "learning_rate": 6.090967744240734e-06, + "loss": 0.4711, + "step": 8482 + }, + { + "epoch": 1.8922596475574394, + "grad_norm": 0.18848133087158203, + "learning_rate": 6.088801681468024e-06, + "loss": 0.4513, + "step": 8483 + }, + { + "epoch": 1.8924827124693286, + "grad_norm": 0.17877109348773956, + "learning_rate": 6.086635835322081e-06, + "loss": 0.4506, + "step": 8484 + }, + { + "epoch": 1.892705777381218, + "grad_norm": 0.18696287274360657, + "learning_rate": 6.084470205922859e-06, + "loss": 0.4523, + "step": 8485 + }, + { + "epoch": 1.8929288422931072, + "grad_norm": 0.18571056425571442, + "learning_rate": 6.08230479339031e-06, + "loss": 0.4646, + "step": 8486 + }, + { + "epoch": 1.8931519072049967, + "grad_norm": 0.17447581887245178, + "learning_rate": 6.080139597844361e-06, + "loss": 0.4671, + "step": 8487 + }, + { + "epoch": 1.893374972116886, + "grad_norm": 0.18128472566604614, + "learning_rate": 6.0779746194049415e-06, + "loss": 0.4419, + "step": 8488 + }, + { + "epoch": 1.8935980370287755, + "grad_norm": 0.1792159378528595, + "learning_rate": 6.075809858191957e-06, + "loss": 0.4547, + "step": 8489 + }, + { + "epoch": 1.8938211019406648, + "grad_norm": 0.17733284831047058, + "learning_rate": 6.073645314325306e-06, + "loss": 0.4177, + "step": 8490 + }, + { + "epoch": 1.894044166852554, + "grad_norm": 0.1729053109884262, + "learning_rate": 6.071480987924875e-06, + "loss": 0.4555, + "step": 8491 + }, + { + "epoch": 1.8942672317644433, + "grad_norm": 0.18540537357330322, + "learning_rate": 6.069316879110539e-06, + "loss": 0.4352, + "step": 8492 + }, + { + "epoch": 1.8944902966763328, + "grad_norm": 0.17858169972896576, + "learning_rate": 6.0671529880021585e-06, + "loss": 0.4707, + "step": 8493 + }, + { + "epoch": 1.8947133615882221, + "grad_norm": 0.1803089827299118, + "learning_rate": 6.0649893147195845e-06, + "loss": 0.4438, + "step": 8494 + }, + { + "epoch": 1.8949364265001116, + "grad_norm": 0.18085269629955292, + "learning_rate": 6.062825859382655e-06, + "loss": 0.4419, + "step": 8495 + }, + { + "epoch": 1.895159491412001, + "grad_norm": 0.18501605093479156, + "learning_rate": 6.060662622111193e-06, + "loss": 0.4698, + "step": 8496 + }, + { + "epoch": 1.8953825563238902, + "grad_norm": 0.20104503631591797, + "learning_rate": 6.058499603025018e-06, + "loss": 0.459, + "step": 8497 + }, + { + "epoch": 1.8956056212357795, + "grad_norm": 0.17107710242271423, + "learning_rate": 6.056336802243925e-06, + "loss": 0.4357, + "step": 8498 + }, + { + "epoch": 1.895828686147669, + "grad_norm": 0.18555793166160583, + "learning_rate": 6.0541742198877085e-06, + "loss": 0.4562, + "step": 8499 + }, + { + "epoch": 1.8960517510595585, + "grad_norm": 0.17548789083957672, + "learning_rate": 6.052011856076142e-06, + "loss": 0.4281, + "step": 8500 + }, + { + "epoch": 1.8962748159714478, + "grad_norm": 0.19359424710273743, + "learning_rate": 6.049849710928995e-06, + "loss": 0.4761, + "step": 8501 + }, + { + "epoch": 1.896497880883337, + "grad_norm": 0.1868746131658554, + "learning_rate": 6.0476877845660165e-06, + "loss": 0.4797, + "step": 8502 + }, + { + "epoch": 1.8967209457952263, + "grad_norm": 0.18077945709228516, + "learning_rate": 6.045526077106951e-06, + "loss": 0.4245, + "step": 8503 + }, + { + "epoch": 1.8969440107071158, + "grad_norm": 0.1770927757024765, + "learning_rate": 6.0433645886715255e-06, + "loss": 0.4483, + "step": 8504 + }, + { + "epoch": 1.897167075619005, + "grad_norm": 0.18114930391311646, + "learning_rate": 6.041203319379457e-06, + "loss": 0.4253, + "step": 8505 + }, + { + "epoch": 1.8973901405308946, + "grad_norm": 0.17783895134925842, + "learning_rate": 6.039042269350451e-06, + "loss": 0.4396, + "step": 8506 + }, + { + "epoch": 1.897613205442784, + "grad_norm": 0.18055394291877747, + "learning_rate": 6.036881438704198e-06, + "loss": 0.4561, + "step": 8507 + }, + { + "epoch": 1.8978362703546732, + "grad_norm": 0.17930994927883148, + "learning_rate": 6.034720827560381e-06, + "loss": 0.4489, + "step": 8508 + }, + { + "epoch": 1.8980593352665625, + "grad_norm": 0.18059539794921875, + "learning_rate": 6.0325604360386684e-06, + "loss": 0.4519, + "step": 8509 + }, + { + "epoch": 1.898282400178452, + "grad_norm": 0.19161933660507202, + "learning_rate": 6.0304002642587135e-06, + "loss": 0.4771, + "step": 8510 + }, + { + "epoch": 1.8985054650903412, + "grad_norm": 0.1823475956916809, + "learning_rate": 6.028240312340163e-06, + "loss": 0.4446, + "step": 8511 + }, + { + "epoch": 1.8987285300022307, + "grad_norm": 0.1844417154788971, + "learning_rate": 6.026080580402645e-06, + "loss": 0.4526, + "step": 8512 + }, + { + "epoch": 1.89895159491412, + "grad_norm": 0.1865631490945816, + "learning_rate": 6.023921068565783e-06, + "loss": 0.4767, + "step": 8513 + }, + { + "epoch": 1.8991746598260093, + "grad_norm": 0.18571047484874725, + "learning_rate": 6.021761776949182e-06, + "loss": 0.4429, + "step": 8514 + }, + { + "epoch": 1.8993977247378986, + "grad_norm": 0.18585099279880524, + "learning_rate": 6.019602705672441e-06, + "loss": 0.4608, + "step": 8515 + }, + { + "epoch": 1.899620789649788, + "grad_norm": 0.17889703810214996, + "learning_rate": 6.017443854855136e-06, + "loss": 0.4121, + "step": 8516 + }, + { + "epoch": 1.8998438545616776, + "grad_norm": 0.1884198635816574, + "learning_rate": 6.015285224616843e-06, + "loss": 0.4274, + "step": 8517 + }, + { + "epoch": 1.9000669194735669, + "grad_norm": 0.18577080965042114, + "learning_rate": 6.0131268150771194e-06, + "loss": 0.4466, + "step": 8518 + }, + { + "epoch": 1.9002899843854562, + "grad_norm": 0.1771336793899536, + "learning_rate": 6.010968626355509e-06, + "loss": 0.4567, + "step": 8519 + }, + { + "epoch": 1.9005130492973454, + "grad_norm": 0.18902990221977234, + "learning_rate": 6.008810658571551e-06, + "loss": 0.4537, + "step": 8520 + }, + { + "epoch": 1.900736114209235, + "grad_norm": 0.20060394704341888, + "learning_rate": 6.006652911844763e-06, + "loss": 0.4792, + "step": 8521 + }, + { + "epoch": 1.9009591791211242, + "grad_norm": 0.1900641918182373, + "learning_rate": 6.004495386294657e-06, + "loss": 0.4775, + "step": 8522 + }, + { + "epoch": 1.9011822440330137, + "grad_norm": 0.18330518901348114, + "learning_rate": 6.002338082040727e-06, + "loss": 0.4522, + "step": 8523 + }, + { + "epoch": 1.901405308944903, + "grad_norm": 0.18432478606700897, + "learning_rate": 6.000180999202463e-06, + "loss": 0.4619, + "step": 8524 + }, + { + "epoch": 1.9016283738567923, + "grad_norm": 0.17643876373767853, + "learning_rate": 5.998024137899333e-06, + "loss": 0.4453, + "step": 8525 + }, + { + "epoch": 1.9018514387686816, + "grad_norm": 0.17010371387004852, + "learning_rate": 5.995867498250804e-06, + "loss": 0.4264, + "step": 8526 + }, + { + "epoch": 1.902074503680571, + "grad_norm": 0.17619560658931732, + "learning_rate": 5.993711080376317e-06, + "loss": 0.4522, + "step": 8527 + }, + { + "epoch": 1.9022975685924604, + "grad_norm": 0.18597012758255005, + "learning_rate": 5.99155488439531e-06, + "loss": 0.4814, + "step": 8528 + }, + { + "epoch": 1.9025206335043499, + "grad_norm": 0.20063801109790802, + "learning_rate": 5.989398910427209e-06, + "loss": 0.4966, + "step": 8529 + }, + { + "epoch": 1.9027436984162391, + "grad_norm": 0.17774170637130737, + "learning_rate": 5.987243158591422e-06, + "loss": 0.436, + "step": 8530 + }, + { + "epoch": 1.9029667633281284, + "grad_norm": 0.1837313175201416, + "learning_rate": 5.985087629007353e-06, + "loss": 0.4648, + "step": 8531 + }, + { + "epoch": 1.9031898282400177, + "grad_norm": 0.17951352894306183, + "learning_rate": 5.982932321794383e-06, + "loss": 0.4547, + "step": 8532 + }, + { + "epoch": 1.9034128931519072, + "grad_norm": 0.1875593215227127, + "learning_rate": 5.980777237071891e-06, + "loss": 0.4525, + "step": 8533 + }, + { + "epoch": 1.9036359580637967, + "grad_norm": 0.177789106965065, + "learning_rate": 5.978622374959235e-06, + "loss": 0.4369, + "step": 8534 + }, + { + "epoch": 1.903859022975686, + "grad_norm": 0.1993599534034729, + "learning_rate": 5.9764677355757705e-06, + "loss": 0.4327, + "step": 8535 + }, + { + "epoch": 1.9040820878875753, + "grad_norm": 0.18298867344856262, + "learning_rate": 5.97431331904083e-06, + "loss": 0.4487, + "step": 8536 + }, + { + "epoch": 1.9043051527994646, + "grad_norm": 0.19131208956241608, + "learning_rate": 5.972159125473742e-06, + "loss": 0.4426, + "step": 8537 + }, + { + "epoch": 1.904528217711354, + "grad_norm": 0.1916476935148239, + "learning_rate": 5.970005154993816e-06, + "loss": 0.4456, + "step": 8538 + }, + { + "epoch": 1.9047512826232433, + "grad_norm": 0.18648818135261536, + "learning_rate": 5.9678514077203554e-06, + "loss": 0.4564, + "step": 8539 + }, + { + "epoch": 1.9049743475351328, + "grad_norm": 0.18742208182811737, + "learning_rate": 5.965697883772648e-06, + "loss": 0.4665, + "step": 8540 + }, + { + "epoch": 1.9051974124470221, + "grad_norm": 0.20118087530136108, + "learning_rate": 5.963544583269965e-06, + "loss": 0.5012, + "step": 8541 + }, + { + "epoch": 1.9054204773589114, + "grad_norm": 0.181925430893898, + "learning_rate": 5.9613915063315775e-06, + "loss": 0.4185, + "step": 8542 + }, + { + "epoch": 1.9056435422708007, + "grad_norm": 0.18299917876720428, + "learning_rate": 5.9592386530767285e-06, + "loss": 0.4633, + "step": 8543 + }, + { + "epoch": 1.9058666071826902, + "grad_norm": 0.1903630495071411, + "learning_rate": 5.957086023624663e-06, + "loss": 0.452, + "step": 8544 + }, + { + "epoch": 1.9060896720945795, + "grad_norm": 0.17144551873207092, + "learning_rate": 5.954933618094603e-06, + "loss": 0.446, + "step": 8545 + }, + { + "epoch": 1.906312737006469, + "grad_norm": 0.1903725415468216, + "learning_rate": 5.952781436605766e-06, + "loss": 0.4643, + "step": 8546 + }, + { + "epoch": 1.9065358019183583, + "grad_norm": 0.17307879030704498, + "learning_rate": 5.9506294792773475e-06, + "loss": 0.4358, + "step": 8547 + }, + { + "epoch": 1.9067588668302475, + "grad_norm": 0.18062752485275269, + "learning_rate": 5.9484777462285425e-06, + "loss": 0.4517, + "step": 8548 + }, + { + "epoch": 1.9069819317421368, + "grad_norm": 0.19149060547351837, + "learning_rate": 5.946326237578524e-06, + "loss": 0.4513, + "step": 8549 + }, + { + "epoch": 1.9072049966540263, + "grad_norm": 0.18597029149532318, + "learning_rate": 5.944174953446457e-06, + "loss": 0.4324, + "step": 8550 + }, + { + "epoch": 1.9074280615659158, + "grad_norm": 0.17686405777931213, + "learning_rate": 5.942023893951494e-06, + "loss": 0.4175, + "step": 8551 + }, + { + "epoch": 1.907651126477805, + "grad_norm": 0.17996470630168915, + "learning_rate": 5.939873059212775e-06, + "loss": 0.4548, + "step": 8552 + }, + { + "epoch": 1.9078741913896944, + "grad_norm": 0.18042857944965363, + "learning_rate": 5.937722449349421e-06, + "loss": 0.438, + "step": 8553 + }, + { + "epoch": 1.9080972563015837, + "grad_norm": 0.17731131613254547, + "learning_rate": 5.935572064480555e-06, + "loss": 0.4301, + "step": 8554 + }, + { + "epoch": 1.9083203212134732, + "grad_norm": 0.18161912262439728, + "learning_rate": 5.933421904725271e-06, + "loss": 0.4805, + "step": 8555 + }, + { + "epoch": 1.9085433861253625, + "grad_norm": 0.17587031424045563, + "learning_rate": 5.931271970202664e-06, + "loss": 0.4361, + "step": 8556 + }, + { + "epoch": 1.908766451037252, + "grad_norm": 0.1959368884563446, + "learning_rate": 5.929122261031806e-06, + "loss": 0.4652, + "step": 8557 + }, + { + "epoch": 1.9089895159491412, + "grad_norm": 0.17692820727825165, + "learning_rate": 5.926972777331767e-06, + "loss": 0.4492, + "step": 8558 + }, + { + "epoch": 1.9092125808610305, + "grad_norm": 0.17724066972732544, + "learning_rate": 5.924823519221593e-06, + "loss": 0.4458, + "step": 8559 + }, + { + "epoch": 1.9094356457729198, + "grad_norm": 0.17765964567661285, + "learning_rate": 5.922674486820327e-06, + "loss": 0.4259, + "step": 8560 + }, + { + "epoch": 1.9096587106848093, + "grad_norm": 0.18779602646827698, + "learning_rate": 5.920525680246997e-06, + "loss": 0.4595, + "step": 8561 + }, + { + "epoch": 1.9098817755966988, + "grad_norm": 0.17274630069732666, + "learning_rate": 5.918377099620613e-06, + "loss": 0.4408, + "step": 8562 + }, + { + "epoch": 1.910104840508588, + "grad_norm": 0.1717062145471573, + "learning_rate": 5.916228745060184e-06, + "loss": 0.4308, + "step": 8563 + }, + { + "epoch": 1.9103279054204774, + "grad_norm": 0.18208001554012299, + "learning_rate": 5.914080616684691e-06, + "loss": 0.4582, + "step": 8564 + }, + { + "epoch": 1.9105509703323666, + "grad_norm": 0.18288151919841766, + "learning_rate": 5.911932714613118e-06, + "loss": 0.4404, + "step": 8565 + }, + { + "epoch": 1.910774035244256, + "grad_norm": 0.1811714917421341, + "learning_rate": 5.909785038964424e-06, + "loss": 0.4544, + "step": 8566 + }, + { + "epoch": 1.9109971001561454, + "grad_norm": 0.2010553777217865, + "learning_rate": 5.907637589857565e-06, + "loss": 0.4497, + "step": 8567 + }, + { + "epoch": 1.911220165068035, + "grad_norm": 0.1751619428396225, + "learning_rate": 5.905490367411475e-06, + "loss": 0.4761, + "step": 8568 + }, + { + "epoch": 1.9114432299799242, + "grad_norm": 0.1781492680311203, + "learning_rate": 5.903343371745087e-06, + "loss": 0.4393, + "step": 8569 + }, + { + "epoch": 1.9116662948918135, + "grad_norm": 0.1823146939277649, + "learning_rate": 5.9011966029773105e-06, + "loss": 0.4438, + "step": 8570 + }, + { + "epoch": 1.9118893598037028, + "grad_norm": 0.1732335388660431, + "learning_rate": 5.89905006122705e-06, + "loss": 0.4574, + "step": 8571 + }, + { + "epoch": 1.9121124247155923, + "grad_norm": 0.1782243847846985, + "learning_rate": 5.896903746613191e-06, + "loss": 0.448, + "step": 8572 + }, + { + "epoch": 1.9123354896274816, + "grad_norm": 0.16777978837490082, + "learning_rate": 5.894757659254614e-06, + "loss": 0.4136, + "step": 8573 + }, + { + "epoch": 1.912558554539371, + "grad_norm": 0.18306882679462433, + "learning_rate": 5.892611799270179e-06, + "loss": 0.472, + "step": 8574 + }, + { + "epoch": 1.9127816194512604, + "grad_norm": 0.19164350628852844, + "learning_rate": 5.890466166778741e-06, + "loss": 0.4304, + "step": 8575 + }, + { + "epoch": 1.9130046843631496, + "grad_norm": 0.18419405817985535, + "learning_rate": 5.888320761899136e-06, + "loss": 0.4458, + "step": 8576 + }, + { + "epoch": 1.913227749275039, + "grad_norm": 0.1842220425605774, + "learning_rate": 5.886175584750191e-06, + "loss": 0.4658, + "step": 8577 + }, + { + "epoch": 1.9134508141869284, + "grad_norm": 0.18262866139411926, + "learning_rate": 5.884030635450717e-06, + "loss": 0.4389, + "step": 8578 + }, + { + "epoch": 1.913673879098818, + "grad_norm": 0.17716680467128754, + "learning_rate": 5.88188591411952e-06, + "loss": 0.4465, + "step": 8579 + }, + { + "epoch": 1.9138969440107072, + "grad_norm": 0.18368251621723175, + "learning_rate": 5.879741420875382e-06, + "loss": 0.4687, + "step": 8580 + }, + { + "epoch": 1.9141200089225965, + "grad_norm": 0.18374212086200714, + "learning_rate": 5.877597155837082e-06, + "loss": 0.4255, + "step": 8581 + }, + { + "epoch": 1.9143430738344858, + "grad_norm": 0.2865939736366272, + "learning_rate": 5.8754531191233845e-06, + "loss": 0.4244, + "step": 8582 + }, + { + "epoch": 1.914566138746375, + "grad_norm": 0.16909492015838623, + "learning_rate": 5.873309310853037e-06, + "loss": 0.4432, + "step": 8583 + }, + { + "epoch": 1.9147892036582645, + "grad_norm": 0.18193134665489197, + "learning_rate": 5.8711657311447775e-06, + "loss": 0.4461, + "step": 8584 + }, + { + "epoch": 1.915012268570154, + "grad_norm": 0.1864008605480194, + "learning_rate": 5.8690223801173305e-06, + "loss": 0.4525, + "step": 8585 + }, + { + "epoch": 1.9152353334820433, + "grad_norm": 0.17558574676513672, + "learning_rate": 5.866879257889411e-06, + "loss": 0.4376, + "step": 8586 + }, + { + "epoch": 1.9154583983939326, + "grad_norm": 0.18040230870246887, + "learning_rate": 5.864736364579714e-06, + "loss": 0.4505, + "step": 8587 + }, + { + "epoch": 1.915681463305822, + "grad_norm": 0.1805705428123474, + "learning_rate": 5.862593700306931e-06, + "loss": 0.4497, + "step": 8588 + }, + { + "epoch": 1.9159045282177114, + "grad_norm": 0.17601102590560913, + "learning_rate": 5.860451265189733e-06, + "loss": 0.4287, + "step": 8589 + }, + { + "epoch": 1.9161275931296007, + "grad_norm": 0.19040507078170776, + "learning_rate": 5.858309059346784e-06, + "loss": 0.4736, + "step": 8590 + }, + { + "epoch": 1.9163506580414902, + "grad_norm": 0.18196603655815125, + "learning_rate": 5.85616708289673e-06, + "loss": 0.4527, + "step": 8591 + }, + { + "epoch": 1.9165737229533795, + "grad_norm": 0.19250567257404327, + "learning_rate": 5.85402533595821e-06, + "loss": 0.4137, + "step": 8592 + }, + { + "epoch": 1.9167967878652687, + "grad_norm": 0.1818055659532547, + "learning_rate": 5.851883818649845e-06, + "loss": 0.4603, + "step": 8593 + }, + { + "epoch": 1.917019852777158, + "grad_norm": 0.21804414689540863, + "learning_rate": 5.8497425310902474e-06, + "loss": 0.4716, + "step": 8594 + }, + { + "epoch": 1.9172429176890475, + "grad_norm": 0.19273175299167633, + "learning_rate": 5.847601473398014e-06, + "loss": 0.4345, + "step": 8595 + }, + { + "epoch": 1.917465982600937, + "grad_norm": 0.18502821028232574, + "learning_rate": 5.845460645691732e-06, + "loss": 0.4634, + "step": 8596 + }, + { + "epoch": 1.9176890475128263, + "grad_norm": 0.18303586542606354, + "learning_rate": 5.843320048089971e-06, + "loss": 0.4813, + "step": 8597 + }, + { + "epoch": 1.9179121124247156, + "grad_norm": 0.18128132820129395, + "learning_rate": 5.841179680711294e-06, + "loss": 0.4677, + "step": 8598 + }, + { + "epoch": 1.9181351773366049, + "grad_norm": 0.17710062861442566, + "learning_rate": 5.839039543674245e-06, + "loss": 0.4321, + "step": 8599 + }, + { + "epoch": 1.9183582422484942, + "grad_norm": 0.187991201877594, + "learning_rate": 5.836899637097358e-06, + "loss": 0.4803, + "step": 8600 + }, + { + "epoch": 1.9185813071603837, + "grad_norm": 0.17623665928840637, + "learning_rate": 5.834759961099157e-06, + "loss": 0.4064, + "step": 8601 + }, + { + "epoch": 1.9188043720722732, + "grad_norm": 0.17650528252124786, + "learning_rate": 5.832620515798154e-06, + "loss": 0.463, + "step": 8602 + }, + { + "epoch": 1.9190274369841624, + "grad_norm": 0.17875365912914276, + "learning_rate": 5.830481301312836e-06, + "loss": 0.4543, + "step": 8603 + }, + { + "epoch": 1.9192505018960517, + "grad_norm": 0.2137259989976883, + "learning_rate": 5.828342317761692e-06, + "loss": 0.4586, + "step": 8604 + }, + { + "epoch": 1.919473566807941, + "grad_norm": 0.18043895065784454, + "learning_rate": 5.8262035652631896e-06, + "loss": 0.4573, + "step": 8605 + }, + { + "epoch": 1.9196966317198305, + "grad_norm": 0.18645495176315308, + "learning_rate": 5.824065043935794e-06, + "loss": 0.4812, + "step": 8606 + }, + { + "epoch": 1.9199196966317198, + "grad_norm": 0.18367193639278412, + "learning_rate": 5.821926753897942e-06, + "loss": 0.4644, + "step": 8607 + }, + { + "epoch": 1.9201427615436093, + "grad_norm": 0.17733469605445862, + "learning_rate": 5.819788695268065e-06, + "loss": 0.4581, + "step": 8608 + }, + { + "epoch": 1.9203658264554986, + "grad_norm": 0.17940470576286316, + "learning_rate": 5.817650868164584e-06, + "loss": 0.4452, + "step": 8609 + }, + { + "epoch": 1.9205888913673879, + "grad_norm": 0.1958422213792801, + "learning_rate": 5.815513272705905e-06, + "loss": 0.47, + "step": 8610 + }, + { + "epoch": 1.9208119562792771, + "grad_norm": 0.21605324745178223, + "learning_rate": 5.813375909010427e-06, + "loss": 0.4444, + "step": 8611 + }, + { + "epoch": 1.9210350211911666, + "grad_norm": 0.18394535779953003, + "learning_rate": 5.811238777196522e-06, + "loss": 0.4527, + "step": 8612 + }, + { + "epoch": 1.9212580861030562, + "grad_norm": 0.17584146559238434, + "learning_rate": 5.809101877382562e-06, + "loss": 0.4181, + "step": 8613 + }, + { + "epoch": 1.9214811510149454, + "grad_norm": 0.1733761727809906, + "learning_rate": 5.806965209686903e-06, + "loss": 0.4431, + "step": 8614 + }, + { + "epoch": 1.9217042159268347, + "grad_norm": 0.17333033680915833, + "learning_rate": 5.8048287742278886e-06, + "loss": 0.4301, + "step": 8615 + }, + { + "epoch": 1.921927280838724, + "grad_norm": 0.17815978825092316, + "learning_rate": 5.802692571123843e-06, + "loss": 0.4417, + "step": 8616 + }, + { + "epoch": 1.9221503457506133, + "grad_norm": 0.17262916266918182, + "learning_rate": 5.8005566004930826e-06, + "loss": 0.424, + "step": 8617 + }, + { + "epoch": 1.9223734106625028, + "grad_norm": 0.1765177696943283, + "learning_rate": 5.798420862453914e-06, + "loss": 0.4553, + "step": 8618 + }, + { + "epoch": 1.9225964755743923, + "grad_norm": 0.18109013140201569, + "learning_rate": 5.796285357124632e-06, + "loss": 0.4362, + "step": 8619 + }, + { + "epoch": 1.9228195404862816, + "grad_norm": 0.2021632194519043, + "learning_rate": 5.7941500846235045e-06, + "loss": 0.4319, + "step": 8620 + }, + { + "epoch": 1.9230426053981708, + "grad_norm": 0.18348877131938934, + "learning_rate": 5.7920150450688e-06, + "loss": 0.4166, + "step": 8621 + }, + { + "epoch": 1.9232656703100601, + "grad_norm": 0.18288786709308624, + "learning_rate": 5.789880238578773e-06, + "loss": 0.4756, + "step": 8622 + }, + { + "epoch": 1.9234887352219496, + "grad_norm": 0.2258935421705246, + "learning_rate": 5.787745665271664e-06, + "loss": 0.4458, + "step": 8623 + }, + { + "epoch": 1.923711800133839, + "grad_norm": 0.19700488448143005, + "learning_rate": 5.785611325265694e-06, + "loss": 0.4566, + "step": 8624 + }, + { + "epoch": 1.9239348650457284, + "grad_norm": 0.177838996052742, + "learning_rate": 5.783477218679078e-06, + "loss": 0.4348, + "step": 8625 + }, + { + "epoch": 1.9241579299576177, + "grad_norm": 0.18436270952224731, + "learning_rate": 5.781343345630016e-06, + "loss": 0.4323, + "step": 8626 + }, + { + "epoch": 1.924380994869507, + "grad_norm": 0.18075759708881378, + "learning_rate": 5.779209706236696e-06, + "loss": 0.4355, + "step": 8627 + }, + { + "epoch": 1.9246040597813963, + "grad_norm": 0.18219725787639618, + "learning_rate": 5.7770763006172966e-06, + "loss": 0.4712, + "step": 8628 + }, + { + "epoch": 1.9248271246932858, + "grad_norm": 0.18282842636108398, + "learning_rate": 5.77494312888997e-06, + "loss": 0.4449, + "step": 8629 + }, + { + "epoch": 1.9250501896051753, + "grad_norm": 0.19060496985912323, + "learning_rate": 5.77281019117287e-06, + "loss": 0.4629, + "step": 8630 + }, + { + "epoch": 1.9252732545170645, + "grad_norm": 0.1835882067680359, + "learning_rate": 5.770677487584133e-06, + "loss": 0.4504, + "step": 8631 + }, + { + "epoch": 1.9254963194289538, + "grad_norm": 0.19525527954101562, + "learning_rate": 5.7685450182418825e-06, + "loss": 0.4748, + "step": 8632 + }, + { + "epoch": 1.925719384340843, + "grad_norm": 0.18883006274700165, + "learning_rate": 5.7664127832642225e-06, + "loss": 0.4342, + "step": 8633 + }, + { + "epoch": 1.9259424492527324, + "grad_norm": 0.18440766632556915, + "learning_rate": 5.764280782769254e-06, + "loss": 0.4448, + "step": 8634 + }, + { + "epoch": 1.926165514164622, + "grad_norm": 0.18678854405879974, + "learning_rate": 5.762149016875057e-06, + "loss": 0.4417, + "step": 8635 + }, + { + "epoch": 1.9263885790765114, + "grad_norm": 0.1747465282678604, + "learning_rate": 5.760017485699711e-06, + "loss": 0.442, + "step": 8636 + }, + { + "epoch": 1.9266116439884007, + "grad_norm": 0.17557400465011597, + "learning_rate": 5.757886189361261e-06, + "loss": 0.4598, + "step": 8637 + }, + { + "epoch": 1.92683470890029, + "grad_norm": 0.1763993501663208, + "learning_rate": 5.755755127977759e-06, + "loss": 0.4279, + "step": 8638 + }, + { + "epoch": 1.9270577738121792, + "grad_norm": 0.18249794840812683, + "learning_rate": 5.753624301667235e-06, + "loss": 0.4529, + "step": 8639 + }, + { + "epoch": 1.9272808387240687, + "grad_norm": 0.16952653229236603, + "learning_rate": 5.751493710547713e-06, + "loss": 0.4208, + "step": 8640 + }, + { + "epoch": 1.927503903635958, + "grad_norm": 0.1794244945049286, + "learning_rate": 5.749363354737188e-06, + "loss": 0.456, + "step": 8641 + }, + { + "epoch": 1.9277269685478475, + "grad_norm": 0.1846603900194168, + "learning_rate": 5.74723323435366e-06, + "loss": 0.4559, + "step": 8642 + }, + { + "epoch": 1.9279500334597368, + "grad_norm": 0.18191924691200256, + "learning_rate": 5.745103349515104e-06, + "loss": 0.4466, + "step": 8643 + }, + { + "epoch": 1.928173098371626, + "grad_norm": 0.1772155612707138, + "learning_rate": 5.742973700339494e-06, + "loss": 0.4005, + "step": 8644 + }, + { + "epoch": 1.9283961632835154, + "grad_norm": 0.17686043679714203, + "learning_rate": 5.740844286944775e-06, + "loss": 0.4279, + "step": 8645 + }, + { + "epoch": 1.9286192281954049, + "grad_norm": 0.19069218635559082, + "learning_rate": 5.738715109448889e-06, + "loss": 0.4725, + "step": 8646 + }, + { + "epoch": 1.9288422931072944, + "grad_norm": 0.17691883444786072, + "learning_rate": 5.736586167969767e-06, + "loss": 0.4383, + "step": 8647 + }, + { + "epoch": 1.9290653580191837, + "grad_norm": 0.1703954041004181, + "learning_rate": 5.734457462625318e-06, + "loss": 0.4225, + "step": 8648 + }, + { + "epoch": 1.929288422931073, + "grad_norm": 0.17984159290790558, + "learning_rate": 5.732328993533452e-06, + "loss": 0.4681, + "step": 8649 + }, + { + "epoch": 1.9295114878429622, + "grad_norm": 0.18141353130340576, + "learning_rate": 5.730200760812045e-06, + "loss": 0.4654, + "step": 8650 + }, + { + "epoch": 1.9297345527548515, + "grad_norm": 0.19043731689453125, + "learning_rate": 5.72807276457898e-06, + "loss": 0.4443, + "step": 8651 + }, + { + "epoch": 1.929957617666741, + "grad_norm": 0.6355077624320984, + "learning_rate": 5.725945004952118e-06, + "loss": 0.4525, + "step": 8652 + }, + { + "epoch": 1.9301806825786305, + "grad_norm": 0.17925289273262024, + "learning_rate": 5.723817482049307e-06, + "loss": 0.4438, + "step": 8653 + }, + { + "epoch": 1.9304037474905198, + "grad_norm": 0.1707497388124466, + "learning_rate": 5.721690195988381e-06, + "loss": 0.4078, + "step": 8654 + }, + { + "epoch": 1.930626812402409, + "grad_norm": 0.1779852956533432, + "learning_rate": 5.719563146887163e-06, + "loss": 0.4304, + "step": 8655 + }, + { + "epoch": 1.9308498773142984, + "grad_norm": 0.18374204635620117, + "learning_rate": 5.717436334863462e-06, + "loss": 0.4503, + "step": 8656 + }, + { + "epoch": 1.9310729422261879, + "grad_norm": 0.18949589133262634, + "learning_rate": 5.715309760035079e-06, + "loss": 0.4901, + "step": 8657 + }, + { + "epoch": 1.9312960071380771, + "grad_norm": 0.1836424320936203, + "learning_rate": 5.71318342251979e-06, + "loss": 0.462, + "step": 8658 + }, + { + "epoch": 1.9315190720499666, + "grad_norm": 0.1780422180891037, + "learning_rate": 5.711057322435369e-06, + "loss": 0.4386, + "step": 8659 + }, + { + "epoch": 1.931742136961856, + "grad_norm": 0.17757810652256012, + "learning_rate": 5.708931459899571e-06, + "loss": 0.4421, + "step": 8660 + }, + { + "epoch": 1.9319652018737452, + "grad_norm": 0.1806020736694336, + "learning_rate": 5.7068058350301445e-06, + "loss": 0.4349, + "step": 8661 + }, + { + "epoch": 1.9321882667856345, + "grad_norm": 0.1809801310300827, + "learning_rate": 5.704680447944812e-06, + "loss": 0.4557, + "step": 8662 + }, + { + "epoch": 1.932411331697524, + "grad_norm": 0.22528956830501556, + "learning_rate": 5.702555298761297e-06, + "loss": 0.4204, + "step": 8663 + }, + { + "epoch": 1.9326343966094135, + "grad_norm": 0.17288967967033386, + "learning_rate": 5.700430387597299e-06, + "loss": 0.4379, + "step": 8664 + }, + { + "epoch": 1.9328574615213028, + "grad_norm": 0.18082986772060394, + "learning_rate": 5.698305714570517e-06, + "loss": 0.4727, + "step": 8665 + }, + { + "epoch": 1.933080526433192, + "grad_norm": 0.17780068516731262, + "learning_rate": 5.696181279798618e-06, + "loss": 0.4321, + "step": 8666 + }, + { + "epoch": 1.9333035913450813, + "grad_norm": 0.17463639378547668, + "learning_rate": 5.694057083399272e-06, + "loss": 0.4647, + "step": 8667 + }, + { + "epoch": 1.9335266562569706, + "grad_norm": 0.18342427909374237, + "learning_rate": 5.691933125490131e-06, + "loss": 0.434, + "step": 8668 + }, + { + "epoch": 1.9337497211688601, + "grad_norm": 0.1869836300611496, + "learning_rate": 5.689809406188832e-06, + "loss": 0.4545, + "step": 8669 + }, + { + "epoch": 1.9339727860807496, + "grad_norm": 0.19317130744457245, + "learning_rate": 5.687685925613005e-06, + "loss": 0.4475, + "step": 8670 + }, + { + "epoch": 1.934195850992639, + "grad_norm": 0.18191368877887726, + "learning_rate": 5.685562683880252e-06, + "loss": 0.4646, + "step": 8671 + }, + { + "epoch": 1.9344189159045282, + "grad_norm": 0.1778983622789383, + "learning_rate": 5.683439681108176e-06, + "loss": 0.4122, + "step": 8672 + }, + { + "epoch": 1.9346419808164175, + "grad_norm": 0.18883691728115082, + "learning_rate": 5.681316917414363e-06, + "loss": 0.4493, + "step": 8673 + }, + { + "epoch": 1.934865045728307, + "grad_norm": 0.1791423112154007, + "learning_rate": 5.6791943929163895e-06, + "loss": 0.4312, + "step": 8674 + }, + { + "epoch": 1.9350881106401963, + "grad_norm": 0.1763908416032791, + "learning_rate": 5.677072107731805e-06, + "loss": 0.4833, + "step": 8675 + }, + { + "epoch": 1.9353111755520858, + "grad_norm": 0.17612230777740479, + "learning_rate": 5.6749500619781595e-06, + "loss": 0.4412, + "step": 8676 + }, + { + "epoch": 1.935534240463975, + "grad_norm": 0.1743626892566681, + "learning_rate": 5.672828255772986e-06, + "loss": 0.4195, + "step": 8677 + }, + { + "epoch": 1.9357573053758643, + "grad_norm": 0.18442870676517487, + "learning_rate": 5.6707066892338054e-06, + "loss": 0.4564, + "step": 8678 + }, + { + "epoch": 1.9359803702877536, + "grad_norm": 0.2595793306827545, + "learning_rate": 5.668585362478117e-06, + "loss": 0.4738, + "step": 8679 + }, + { + "epoch": 1.936203435199643, + "grad_norm": 0.19340401887893677, + "learning_rate": 5.666464275623418e-06, + "loss": 0.4409, + "step": 8680 + }, + { + "epoch": 1.9364265001115326, + "grad_norm": 0.17830665409564972, + "learning_rate": 5.664343428787185e-06, + "loss": 0.4476, + "step": 8681 + }, + { + "epoch": 1.936649565023422, + "grad_norm": 0.18472926318645477, + "learning_rate": 5.66222282208689e-06, + "loss": 0.4597, + "step": 8682 + }, + { + "epoch": 1.9368726299353112, + "grad_norm": 0.18445588648319244, + "learning_rate": 5.660102455639978e-06, + "loss": 0.4724, + "step": 8683 + }, + { + "epoch": 1.9370956948472005, + "grad_norm": 0.20936332643032074, + "learning_rate": 5.657982329563889e-06, + "loss": 0.4823, + "step": 8684 + }, + { + "epoch": 1.9373187597590897, + "grad_norm": 0.17280444502830505, + "learning_rate": 5.655862443976051e-06, + "loss": 0.4476, + "step": 8685 + }, + { + "epoch": 1.9375418246709792, + "grad_norm": 0.18051601946353912, + "learning_rate": 5.653742798993877e-06, + "loss": 0.4426, + "step": 8686 + }, + { + "epoch": 1.9377648895828687, + "grad_norm": 0.18293160200119019, + "learning_rate": 5.651623394734771e-06, + "loss": 0.4589, + "step": 8687 + }, + { + "epoch": 1.937987954494758, + "grad_norm": 0.18495197594165802, + "learning_rate": 5.649504231316109e-06, + "loss": 0.463, + "step": 8688 + }, + { + "epoch": 1.9382110194066473, + "grad_norm": 0.18280479311943054, + "learning_rate": 5.647385308855271e-06, + "loss": 0.457, + "step": 8689 + }, + { + "epoch": 1.9384340843185366, + "grad_norm": 0.1783073991537094, + "learning_rate": 5.645266627469612e-06, + "loss": 0.4672, + "step": 8690 + }, + { + "epoch": 1.938657149230426, + "grad_norm": 0.18390284478664398, + "learning_rate": 5.643148187276477e-06, + "loss": 0.47, + "step": 8691 + }, + { + "epoch": 1.9388802141423154, + "grad_norm": 0.176648810505867, + "learning_rate": 5.641029988393208e-06, + "loss": 0.4732, + "step": 8692 + }, + { + "epoch": 1.9391032790542049, + "grad_norm": 0.19437986612319946, + "learning_rate": 5.638912030937112e-06, + "loss": 0.4631, + "step": 8693 + }, + { + "epoch": 1.9393263439660942, + "grad_norm": 0.1740999072790146, + "learning_rate": 5.636794315025499e-06, + "loss": 0.4326, + "step": 8694 + }, + { + "epoch": 1.9395494088779834, + "grad_norm": 0.17980170249938965, + "learning_rate": 5.634676840775664e-06, + "loss": 0.4373, + "step": 8695 + }, + { + "epoch": 1.9397724737898727, + "grad_norm": 0.1797953099012375, + "learning_rate": 5.632559608304886e-06, + "loss": 0.4681, + "step": 8696 + }, + { + "epoch": 1.9399955387017622, + "grad_norm": 0.17781023681163788, + "learning_rate": 5.630442617730427e-06, + "loss": 0.447, + "step": 8697 + }, + { + "epoch": 1.9402186036136517, + "grad_norm": 0.19193176925182343, + "learning_rate": 5.628325869169542e-06, + "loss": 0.4399, + "step": 8698 + }, + { + "epoch": 1.940441668525541, + "grad_norm": 0.18457980453968048, + "learning_rate": 5.62620936273947e-06, + "loss": 0.4717, + "step": 8699 + }, + { + "epoch": 1.9406647334374303, + "grad_norm": 0.1887241154909134, + "learning_rate": 5.624093098557437e-06, + "loss": 0.4239, + "step": 8700 + }, + { + "epoch": 1.9408877983493196, + "grad_norm": 0.18782730400562286, + "learning_rate": 5.621977076740652e-06, + "loss": 0.4417, + "step": 8701 + }, + { + "epoch": 1.9411108632612089, + "grad_norm": 0.21468473970890045, + "learning_rate": 5.619861297406315e-06, + "loss": 0.4635, + "step": 8702 + }, + { + "epoch": 1.9413339281730984, + "grad_norm": 0.18426984548568726, + "learning_rate": 5.61774576067161e-06, + "loss": 0.435, + "step": 8703 + }, + { + "epoch": 1.9415569930849879, + "grad_norm": 0.17525528371334076, + "learning_rate": 5.6156304666537166e-06, + "loss": 0.4459, + "step": 8704 + }, + { + "epoch": 1.9417800579968771, + "grad_norm": 0.17843464016914368, + "learning_rate": 5.613515415469782e-06, + "loss": 0.4455, + "step": 8705 + }, + { + "epoch": 1.9420031229087664, + "grad_norm": 0.1760268360376358, + "learning_rate": 5.6114006072369544e-06, + "loss": 0.4394, + "step": 8706 + }, + { + "epoch": 1.9422261878206557, + "grad_norm": 0.18088506162166595, + "learning_rate": 5.609286042072366e-06, + "loss": 0.4819, + "step": 8707 + }, + { + "epoch": 1.9424492527325452, + "grad_norm": 0.17896127700805664, + "learning_rate": 5.607171720093141e-06, + "loss": 0.4474, + "step": 8708 + }, + { + "epoch": 1.9426723176444345, + "grad_norm": 0.1753784865140915, + "learning_rate": 5.605057641416374e-06, + "loss": 0.4743, + "step": 8709 + }, + { + "epoch": 1.942895382556324, + "grad_norm": 0.1875680536031723, + "learning_rate": 5.60294380615916e-06, + "loss": 0.4491, + "step": 8710 + }, + { + "epoch": 1.9431184474682133, + "grad_norm": 0.18972131609916687, + "learning_rate": 5.600830214438577e-06, + "loss": 0.4332, + "step": 8711 + }, + { + "epoch": 1.9433415123801026, + "grad_norm": 0.25385287404060364, + "learning_rate": 5.598716866371688e-06, + "loss": 0.4125, + "step": 8712 + }, + { + "epoch": 1.9435645772919918, + "grad_norm": 0.19474051892757416, + "learning_rate": 5.5966037620755475e-06, + "loss": 0.4405, + "step": 8713 + }, + { + "epoch": 1.9437876422038813, + "grad_norm": 0.18606369197368622, + "learning_rate": 5.594490901667187e-06, + "loss": 0.4364, + "step": 8714 + }, + { + "epoch": 1.9440107071157708, + "grad_norm": 0.17978952825069427, + "learning_rate": 5.592378285263631e-06, + "loss": 0.446, + "step": 8715 + }, + { + "epoch": 1.9442337720276601, + "grad_norm": 0.17850260436534882, + "learning_rate": 5.5902659129818896e-06, + "loss": 0.4761, + "step": 8716 + }, + { + "epoch": 1.9444568369395494, + "grad_norm": 0.1837419867515564, + "learning_rate": 5.588153784938966e-06, + "loss": 0.4414, + "step": 8717 + }, + { + "epoch": 1.9446799018514387, + "grad_norm": 0.18115438520908356, + "learning_rate": 5.586041901251832e-06, + "loss": 0.4248, + "step": 8718 + }, + { + "epoch": 1.944902966763328, + "grad_norm": 0.18175376951694489, + "learning_rate": 5.5839302620374625e-06, + "loss": 0.4587, + "step": 8719 + }, + { + "epoch": 1.9451260316752175, + "grad_norm": 0.18459230661392212, + "learning_rate": 5.581818867412814e-06, + "loss": 0.4576, + "step": 8720 + }, + { + "epoch": 1.945349096587107, + "grad_norm": 0.17397940158843994, + "learning_rate": 5.5797077174948325e-06, + "loss": 0.4568, + "step": 8721 + }, + { + "epoch": 1.9455721614989963, + "grad_norm": 0.18246999382972717, + "learning_rate": 5.577596812400438e-06, + "loss": 0.4431, + "step": 8722 + }, + { + "epoch": 1.9457952264108855, + "grad_norm": 0.18297289311885834, + "learning_rate": 5.57548615224655e-06, + "loss": 0.4615, + "step": 8723 + }, + { + "epoch": 1.9460182913227748, + "grad_norm": 0.17768624424934387, + "learning_rate": 5.57337573715007e-06, + "loss": 0.45, + "step": 8724 + }, + { + "epoch": 1.9462413562346643, + "grad_norm": 0.17627879977226257, + "learning_rate": 5.5712655672278905e-06, + "loss": 0.4253, + "step": 8725 + }, + { + "epoch": 1.9464644211465536, + "grad_norm": 0.1850820928812027, + "learning_rate": 5.569155642596878e-06, + "loss": 0.4427, + "step": 8726 + }, + { + "epoch": 1.946687486058443, + "grad_norm": 0.2031596601009369, + "learning_rate": 5.567045963373896e-06, + "loss": 0.5089, + "step": 8727 + }, + { + "epoch": 1.9469105509703324, + "grad_norm": 0.17893202602863312, + "learning_rate": 5.564936529675793e-06, + "loss": 0.4536, + "step": 8728 + }, + { + "epoch": 1.9471336158822217, + "grad_norm": 0.19759956002235413, + "learning_rate": 5.562827341619407e-06, + "loss": 0.458, + "step": 8729 + }, + { + "epoch": 1.947356680794111, + "grad_norm": 0.18113680183887482, + "learning_rate": 5.560718399321549e-06, + "loss": 0.4763, + "step": 8730 + }, + { + "epoch": 1.9475797457060005, + "grad_norm": 0.1818569451570511, + "learning_rate": 5.558609702899032e-06, + "loss": 0.4484, + "step": 8731 + }, + { + "epoch": 1.94780281061789, + "grad_norm": 0.1860044002532959, + "learning_rate": 5.556501252468647e-06, + "loss": 0.4312, + "step": 8732 + }, + { + "epoch": 1.9480258755297792, + "grad_norm": 0.18196377158164978, + "learning_rate": 5.554393048147172e-06, + "loss": 0.4708, + "step": 8733 + }, + { + "epoch": 1.9482489404416685, + "grad_norm": 0.18474991619586945, + "learning_rate": 5.552285090051379e-06, + "loss": 0.4675, + "step": 8734 + }, + { + "epoch": 1.9484720053535578, + "grad_norm": 0.2845540940761566, + "learning_rate": 5.55017737829801e-06, + "loss": 0.4609, + "step": 8735 + }, + { + "epoch": 1.948695070265447, + "grad_norm": 0.17991533875465393, + "learning_rate": 5.548069913003808e-06, + "loss": 0.4612, + "step": 8736 + }, + { + "epoch": 1.9489181351773366, + "grad_norm": 0.1794772446155548, + "learning_rate": 5.5459626942855e-06, + "loss": 0.429, + "step": 8737 + }, + { + "epoch": 1.949141200089226, + "grad_norm": 0.18330740928649902, + "learning_rate": 5.543855722259797e-06, + "loss": 0.4359, + "step": 8738 + }, + { + "epoch": 1.9493642650011154, + "grad_norm": 0.17791210114955902, + "learning_rate": 5.541748997043392e-06, + "loss": 0.4532, + "step": 8739 + }, + { + "epoch": 1.9495873299130047, + "grad_norm": 0.18846653401851654, + "learning_rate": 5.5396425187529705e-06, + "loss": 0.4784, + "step": 8740 + }, + { + "epoch": 1.949810394824894, + "grad_norm": 0.18275344371795654, + "learning_rate": 5.537536287505203e-06, + "loss": 0.4651, + "step": 8741 + }, + { + "epoch": 1.9500334597367834, + "grad_norm": 0.28151261806488037, + "learning_rate": 5.5354303034167486e-06, + "loss": 0.4302, + "step": 8742 + }, + { + "epoch": 1.9502565246486727, + "grad_norm": 0.1811457872390747, + "learning_rate": 5.533324566604246e-06, + "loss": 0.4572, + "step": 8743 + }, + { + "epoch": 1.9504795895605622, + "grad_norm": 0.18251028656959534, + "learning_rate": 5.531219077184322e-06, + "loss": 0.4651, + "step": 8744 + }, + { + "epoch": 1.9507026544724515, + "grad_norm": 0.18917511403560638, + "learning_rate": 5.529113835273599e-06, + "loss": 0.431, + "step": 8745 + }, + { + "epoch": 1.9509257193843408, + "grad_norm": 0.19956184923648834, + "learning_rate": 5.527008840988676e-06, + "loss": 0.4789, + "step": 8746 + }, + { + "epoch": 1.95114878429623, + "grad_norm": 0.17988136410713196, + "learning_rate": 5.5249040944461366e-06, + "loss": 0.4427, + "step": 8747 + }, + { + "epoch": 1.9513718492081196, + "grad_norm": 0.18015359342098236, + "learning_rate": 5.522799595762558e-06, + "loss": 0.4454, + "step": 8748 + }, + { + "epoch": 1.951594914120009, + "grad_norm": 0.17955633997917175, + "learning_rate": 5.5206953450545e-06, + "loss": 0.4477, + "step": 8749 + }, + { + "epoch": 1.9518179790318984, + "grad_norm": 0.18171581625938416, + "learning_rate": 5.518591342438513e-06, + "loss": 0.4489, + "step": 8750 + }, + { + "epoch": 1.9520410439437876, + "grad_norm": 0.1831309199333191, + "learning_rate": 5.516487588031124e-06, + "loss": 0.4686, + "step": 8751 + }, + { + "epoch": 1.952264108855677, + "grad_norm": 0.1874886304140091, + "learning_rate": 5.514384081948855e-06, + "loss": 0.4631, + "step": 8752 + }, + { + "epoch": 1.9524871737675662, + "grad_norm": 0.19060033559799194, + "learning_rate": 5.51228082430821e-06, + "loss": 0.4694, + "step": 8753 + }, + { + "epoch": 1.9527102386794557, + "grad_norm": 0.18917007744312286, + "learning_rate": 5.510177815225683e-06, + "loss": 0.4681, + "step": 8754 + }, + { + "epoch": 1.9529333035913452, + "grad_norm": 0.1917768120765686, + "learning_rate": 5.508075054817755e-06, + "loss": 0.4628, + "step": 8755 + }, + { + "epoch": 1.9531563685032345, + "grad_norm": 0.18906119465827942, + "learning_rate": 5.505972543200879e-06, + "loss": 0.4553, + "step": 8756 + }, + { + "epoch": 1.9533794334151238, + "grad_norm": 0.18696992099285126, + "learning_rate": 5.503870280491514e-06, + "loss": 0.446, + "step": 8757 + }, + { + "epoch": 1.953602498327013, + "grad_norm": 0.18778546154499054, + "learning_rate": 5.501768266806095e-06, + "loss": 0.4418, + "step": 8758 + }, + { + "epoch": 1.9538255632389026, + "grad_norm": 0.182692751288414, + "learning_rate": 5.4996665022610495e-06, + "loss": 0.4688, + "step": 8759 + }, + { + "epoch": 1.9540486281507918, + "grad_norm": 0.20348599553108215, + "learning_rate": 5.497564986972775e-06, + "loss": 0.4815, + "step": 8760 + }, + { + "epoch": 1.9542716930626813, + "grad_norm": 0.1796850711107254, + "learning_rate": 5.495463721057675e-06, + "loss": 0.4688, + "step": 8761 + }, + { + "epoch": 1.9544947579745706, + "grad_norm": 0.1841420829296112, + "learning_rate": 5.493362704632127e-06, + "loss": 0.4444, + "step": 8762 + }, + { + "epoch": 1.95471782288646, + "grad_norm": 0.20517012476921082, + "learning_rate": 5.491261937812504e-06, + "loss": 0.4471, + "step": 8763 + }, + { + "epoch": 1.9549408877983492, + "grad_norm": 0.1862691342830658, + "learning_rate": 5.489161420715154e-06, + "loss": 0.4669, + "step": 8764 + }, + { + "epoch": 1.9551639527102387, + "grad_norm": 0.1903495490550995, + "learning_rate": 5.487061153456418e-06, + "loss": 0.4476, + "step": 8765 + }, + { + "epoch": 1.9553870176221282, + "grad_norm": 0.17540110647678375, + "learning_rate": 5.484961136152622e-06, + "loss": 0.4416, + "step": 8766 + }, + { + "epoch": 1.9556100825340175, + "grad_norm": 0.18369555473327637, + "learning_rate": 5.482861368920084e-06, + "loss": 0.4645, + "step": 8767 + }, + { + "epoch": 1.9558331474459068, + "grad_norm": 0.17829713225364685, + "learning_rate": 5.4807618518750925e-06, + "loss": 0.4156, + "step": 8768 + }, + { + "epoch": 1.956056212357796, + "grad_norm": 0.17648525536060333, + "learning_rate": 5.478662585133938e-06, + "loss": 0.4493, + "step": 8769 + }, + { + "epoch": 1.9562792772696853, + "grad_norm": 0.18088924884796143, + "learning_rate": 5.476563568812893e-06, + "loss": 0.455, + "step": 8770 + }, + { + "epoch": 1.9565023421815748, + "grad_norm": 0.18878251314163208, + "learning_rate": 5.474464803028206e-06, + "loss": 0.4492, + "step": 8771 + }, + { + "epoch": 1.9567254070934643, + "grad_norm": 0.19534710049629211, + "learning_rate": 5.472366287896132e-06, + "loss": 0.4368, + "step": 8772 + }, + { + "epoch": 1.9569484720053536, + "grad_norm": 0.1891869604587555, + "learning_rate": 5.470268023532885e-06, + "loss": 0.4573, + "step": 8773 + }, + { + "epoch": 1.9571715369172429, + "grad_norm": 0.1803818792104721, + "learning_rate": 5.468170010054691e-06, + "loss": 0.4338, + "step": 8774 + }, + { + "epoch": 1.9573946018291322, + "grad_norm": 0.17830346524715424, + "learning_rate": 5.4660722475777475e-06, + "loss": 0.4367, + "step": 8775 + }, + { + "epoch": 1.9576176667410217, + "grad_norm": 0.1889098584651947, + "learning_rate": 5.463974736218243e-06, + "loss": 0.4494, + "step": 8776 + }, + { + "epoch": 1.957840731652911, + "grad_norm": 0.1826130598783493, + "learning_rate": 5.461877476092352e-06, + "loss": 0.4531, + "step": 8777 + }, + { + "epoch": 1.9580637965648005, + "grad_norm": 0.17219486832618713, + "learning_rate": 5.4597804673162315e-06, + "loss": 0.4272, + "step": 8778 + }, + { + "epoch": 1.9582868614766897, + "grad_norm": 0.18349269032478333, + "learning_rate": 5.457683710006026e-06, + "loss": 0.4443, + "step": 8779 + }, + { + "epoch": 1.958509926388579, + "grad_norm": 0.1871849000453949, + "learning_rate": 5.455587204277868e-06, + "loss": 0.4588, + "step": 8780 + }, + { + "epoch": 1.9587329913004683, + "grad_norm": 0.18738840520381927, + "learning_rate": 5.453490950247882e-06, + "loss": 0.478, + "step": 8781 + }, + { + "epoch": 1.9589560562123578, + "grad_norm": 0.17778688669204712, + "learning_rate": 5.45139494803216e-06, + "loss": 0.428, + "step": 8782 + }, + { + "epoch": 1.9591791211242473, + "grad_norm": 0.18447647988796234, + "learning_rate": 5.4492991977468e-06, + "loss": 0.4422, + "step": 8783 + }, + { + "epoch": 1.9594021860361366, + "grad_norm": 0.19066467881202698, + "learning_rate": 5.4472036995078734e-06, + "loss": 0.4328, + "step": 8784 + }, + { + "epoch": 1.9596252509480259, + "grad_norm": 0.171788290143013, + "learning_rate": 5.445108453431449e-06, + "loss": 0.428, + "step": 8785 + }, + { + "epoch": 1.9598483158599151, + "grad_norm": 0.1862727701663971, + "learning_rate": 5.443013459633566e-06, + "loss": 0.4497, + "step": 8786 + }, + { + "epoch": 1.9600713807718046, + "grad_norm": 0.19083817303180695, + "learning_rate": 5.440918718230263e-06, + "loss": 0.4435, + "step": 8787 + }, + { + "epoch": 1.960294445683694, + "grad_norm": 0.18043813109397888, + "learning_rate": 5.438824229337558e-06, + "loss": 0.4386, + "step": 8788 + }, + { + "epoch": 1.9605175105955834, + "grad_norm": 0.17163397371768951, + "learning_rate": 5.4367299930714615e-06, + "loss": 0.4406, + "step": 8789 + }, + { + "epoch": 1.9607405755074727, + "grad_norm": 0.17972704768180847, + "learning_rate": 5.43463600954796e-06, + "loss": 0.4606, + "step": 8790 + }, + { + "epoch": 1.960963640419362, + "grad_norm": 0.1928553283214569, + "learning_rate": 5.432542278883031e-06, + "loss": 0.4587, + "step": 8791 + }, + { + "epoch": 1.9611867053312513, + "grad_norm": 0.18378639221191406, + "learning_rate": 5.430448801192642e-06, + "loss": 0.4554, + "step": 8792 + }, + { + "epoch": 1.9614097702431408, + "grad_norm": 0.18218287825584412, + "learning_rate": 5.428355576592748e-06, + "loss": 0.4634, + "step": 8793 + }, + { + "epoch": 1.96163283515503, + "grad_norm": 0.18878161907196045, + "learning_rate": 5.426262605199273e-06, + "loss": 0.4567, + "step": 8794 + }, + { + "epoch": 1.9618559000669196, + "grad_norm": 0.17760403454303741, + "learning_rate": 5.424169887128145e-06, + "loss": 0.4414, + "step": 8795 + }, + { + "epoch": 1.9620789649788088, + "grad_norm": 0.18763794004917145, + "learning_rate": 5.4220774224952735e-06, + "loss": 0.4499, + "step": 8796 + }, + { + "epoch": 1.9623020298906981, + "grad_norm": 0.18167226016521454, + "learning_rate": 5.419985211416548e-06, + "loss": 0.4325, + "step": 8797 + }, + { + "epoch": 1.9625250948025874, + "grad_norm": 0.18740572035312653, + "learning_rate": 5.417893254007857e-06, + "loss": 0.453, + "step": 8798 + }, + { + "epoch": 1.962748159714477, + "grad_norm": 0.18634778261184692, + "learning_rate": 5.415801550385056e-06, + "loss": 0.4658, + "step": 8799 + }, + { + "epoch": 1.9629712246263664, + "grad_norm": 0.1778239905834198, + "learning_rate": 5.4137101006639994e-06, + "loss": 0.4438, + "step": 8800 + }, + { + "epoch": 1.9631942895382557, + "grad_norm": 0.17299148440361023, + "learning_rate": 5.411618904960527e-06, + "loss": 0.4038, + "step": 8801 + }, + { + "epoch": 1.963417354450145, + "grad_norm": 0.17823709547519684, + "learning_rate": 5.4095279633904665e-06, + "loss": 0.412, + "step": 8802 + }, + { + "epoch": 1.9636404193620343, + "grad_norm": 0.18503381311893463, + "learning_rate": 5.407437276069618e-06, + "loss": 0.4553, + "step": 8803 + }, + { + "epoch": 1.9638634842739238, + "grad_norm": 0.17986641824245453, + "learning_rate": 5.4053468431137814e-06, + "loss": 0.4714, + "step": 8804 + }, + { + "epoch": 1.964086549185813, + "grad_norm": 0.17366492748260498, + "learning_rate": 5.403256664638738e-06, + "loss": 0.4264, + "step": 8805 + }, + { + "epoch": 1.9643096140977025, + "grad_norm": 0.18994669616222382, + "learning_rate": 5.40116674076026e-06, + "loss": 0.447, + "step": 8806 + }, + { + "epoch": 1.9645326790095918, + "grad_norm": 0.17680832743644714, + "learning_rate": 5.3990770715940895e-06, + "loss": 0.4308, + "step": 8807 + }, + { + "epoch": 1.9647557439214811, + "grad_norm": 0.17399604618549347, + "learning_rate": 5.396987657255974e-06, + "loss": 0.4405, + "step": 8808 + }, + { + "epoch": 1.9649788088333704, + "grad_norm": 0.21814000606536865, + "learning_rate": 5.3948984978616355e-06, + "loss": 0.4643, + "step": 8809 + }, + { + "epoch": 1.96520187374526, + "grad_norm": 0.17908701300621033, + "learning_rate": 5.39280959352679e-06, + "loss": 0.4189, + "step": 8810 + }, + { + "epoch": 1.9654249386571492, + "grad_norm": 0.17751288414001465, + "learning_rate": 5.390720944367125e-06, + "loss": 0.4502, + "step": 8811 + }, + { + "epoch": 1.9656480035690387, + "grad_norm": 0.21767963469028473, + "learning_rate": 5.388632550498327e-06, + "loss": 0.4495, + "step": 8812 + }, + { + "epoch": 1.965871068480928, + "grad_norm": 0.1864757090806961, + "learning_rate": 5.386544412036066e-06, + "loss": 0.4775, + "step": 8813 + }, + { + "epoch": 1.9660941333928172, + "grad_norm": 0.17667576670646667, + "learning_rate": 5.384456529095999e-06, + "loss": 0.4515, + "step": 8814 + }, + { + "epoch": 1.9663171983047065, + "grad_norm": 0.18594609200954437, + "learning_rate": 5.382368901793759e-06, + "loss": 0.4855, + "step": 8815 + }, + { + "epoch": 1.966540263216596, + "grad_norm": 0.1851484477519989, + "learning_rate": 5.380281530244974e-06, + "loss": 0.417, + "step": 8816 + }, + { + "epoch": 1.9667633281284855, + "grad_norm": 0.18481191992759705, + "learning_rate": 5.378194414565258e-06, + "loss": 0.4504, + "step": 8817 + }, + { + "epoch": 1.9669863930403748, + "grad_norm": 0.17994412779808044, + "learning_rate": 5.376107554870207e-06, + "loss": 0.4565, + "step": 8818 + }, + { + "epoch": 1.967209457952264, + "grad_norm": 0.17830024659633636, + "learning_rate": 5.374020951275409e-06, + "loss": 0.4562, + "step": 8819 + }, + { + "epoch": 1.9674325228641534, + "grad_norm": 0.19198620319366455, + "learning_rate": 5.371934603896426e-06, + "loss": 0.4414, + "step": 8820 + }, + { + "epoch": 1.9676555877760429, + "grad_norm": 0.17319843173027039, + "learning_rate": 5.369848512848816e-06, + "loss": 0.4351, + "step": 8821 + }, + { + "epoch": 1.9678786526879322, + "grad_norm": 0.1835477352142334, + "learning_rate": 5.3677626782481205e-06, + "loss": 0.4447, + "step": 8822 + }, + { + "epoch": 1.9681017175998217, + "grad_norm": 0.17269068956375122, + "learning_rate": 5.365677100209868e-06, + "loss": 0.4313, + "step": 8823 + }, + { + "epoch": 1.968324782511711, + "grad_norm": 0.17792171239852905, + "learning_rate": 5.363591778849566e-06, + "loss": 0.4276, + "step": 8824 + }, + { + "epoch": 1.9685478474236002, + "grad_norm": 0.17253033816814423, + "learning_rate": 5.361506714282716e-06, + "loss": 0.4399, + "step": 8825 + }, + { + "epoch": 1.9687709123354895, + "grad_norm": 0.17550261318683624, + "learning_rate": 5.359421906624802e-06, + "loss": 0.4381, + "step": 8826 + }, + { + "epoch": 1.968993977247379, + "grad_norm": 0.18501095473766327, + "learning_rate": 5.357337355991296e-06, + "loss": 0.4414, + "step": 8827 + }, + { + "epoch": 1.9692170421592683, + "grad_norm": 0.17807962000370026, + "learning_rate": 5.3552530624976475e-06, + "loss": 0.4627, + "step": 8828 + }, + { + "epoch": 1.9694401070711578, + "grad_norm": 0.17999139428138733, + "learning_rate": 5.3531690262593e-06, + "loss": 0.446, + "step": 8829 + }, + { + "epoch": 1.969663171983047, + "grad_norm": 0.17978177964687347, + "learning_rate": 5.351085247391681e-06, + "loss": 0.4616, + "step": 8830 + }, + { + "epoch": 1.9698862368949364, + "grad_norm": 0.18131205439567566, + "learning_rate": 5.34900172601021e-06, + "loss": 0.4418, + "step": 8831 + }, + { + "epoch": 1.9701093018068256, + "grad_norm": 0.17810958623886108, + "learning_rate": 5.3469184622302725e-06, + "loss": 0.4492, + "step": 8832 + }, + { + "epoch": 1.9703323667187151, + "grad_norm": 0.1817857027053833, + "learning_rate": 5.344835456167262e-06, + "loss": 0.4706, + "step": 8833 + }, + { + "epoch": 1.9705554316306046, + "grad_norm": 0.18190069496631622, + "learning_rate": 5.342752707936546e-06, + "loss": 0.4614, + "step": 8834 + }, + { + "epoch": 1.970778496542494, + "grad_norm": 0.18769033253192902, + "learning_rate": 5.340670217653483e-06, + "loss": 0.4537, + "step": 8835 + }, + { + "epoch": 1.9710015614543832, + "grad_norm": 0.18518251180648804, + "learning_rate": 5.338587985433409e-06, + "loss": 0.4551, + "step": 8836 + }, + { + "epoch": 1.9712246263662725, + "grad_norm": 0.1764073669910431, + "learning_rate": 5.336506011391653e-06, + "loss": 0.4309, + "step": 8837 + }, + { + "epoch": 1.971447691278162, + "grad_norm": 0.3080836534500122, + "learning_rate": 5.334424295643529e-06, + "loss": 0.4195, + "step": 8838 + }, + { + "epoch": 1.9716707561900513, + "grad_norm": 0.18763823807239532, + "learning_rate": 5.332342838304335e-06, + "loss": 0.4559, + "step": 8839 + }, + { + "epoch": 1.9718938211019408, + "grad_norm": 0.19015417993068695, + "learning_rate": 5.330261639489359e-06, + "loss": 0.4487, + "step": 8840 + }, + { + "epoch": 1.97211688601383, + "grad_norm": 0.18165238201618195, + "learning_rate": 5.328180699313864e-06, + "loss": 0.4439, + "step": 8841 + }, + { + "epoch": 1.9723399509257193, + "grad_norm": 0.28610339760780334, + "learning_rate": 5.326100017893107e-06, + "loss": 0.4331, + "step": 8842 + }, + { + "epoch": 1.9725630158376086, + "grad_norm": 0.1734454482793808, + "learning_rate": 5.324019595342334e-06, + "loss": 0.4424, + "step": 8843 + }, + { + "epoch": 1.9727860807494981, + "grad_norm": 0.18375049531459808, + "learning_rate": 5.32193943177677e-06, + "loss": 0.4364, + "step": 8844 + }, + { + "epoch": 1.9730091456613874, + "grad_norm": 0.1872616410255432, + "learning_rate": 5.319859527311624e-06, + "loss": 0.4591, + "step": 8845 + }, + { + "epoch": 1.973232210573277, + "grad_norm": 0.18444837629795074, + "learning_rate": 5.3177798820620975e-06, + "loss": 0.4462, + "step": 8846 + }, + { + "epoch": 1.9734552754851662, + "grad_norm": 0.1809314340353012, + "learning_rate": 5.315700496143372e-06, + "loss": 0.4382, + "step": 8847 + }, + { + "epoch": 1.9736783403970555, + "grad_norm": 0.18612340092658997, + "learning_rate": 5.313621369670623e-06, + "loss": 0.4391, + "step": 8848 + }, + { + "epoch": 1.9739014053089448, + "grad_norm": 0.18118642270565033, + "learning_rate": 5.311542502758996e-06, + "loss": 0.4659, + "step": 8849 + }, + { + "epoch": 1.9741244702208343, + "grad_norm": 0.19646091759204865, + "learning_rate": 5.309463895523638e-06, + "loss": 0.4594, + "step": 8850 + }, + { + "epoch": 1.9743475351327238, + "grad_norm": 0.18082433938980103, + "learning_rate": 5.3073855480796735e-06, + "loss": 0.472, + "step": 8851 + }, + { + "epoch": 1.974570600044613, + "grad_norm": 0.17272797226905823, + "learning_rate": 5.305307460542219e-06, + "loss": 0.4375, + "step": 8852 + }, + { + "epoch": 1.9747936649565023, + "grad_norm": 0.19139288365840912, + "learning_rate": 5.303229633026368e-06, + "loss": 0.459, + "step": 8853 + }, + { + "epoch": 1.9750167298683916, + "grad_norm": 0.18878120183944702, + "learning_rate": 5.3011520656472e-06, + "loss": 0.4701, + "step": 8854 + }, + { + "epoch": 1.975239794780281, + "grad_norm": 0.1769939512014389, + "learning_rate": 5.299074758519788e-06, + "loss": 0.4322, + "step": 8855 + }, + { + "epoch": 1.9754628596921704, + "grad_norm": 0.1793774664402008, + "learning_rate": 5.296997711759186e-06, + "loss": 0.4352, + "step": 8856 + }, + { + "epoch": 1.97568592460406, + "grad_norm": 0.17794568836688995, + "learning_rate": 5.2949209254804366e-06, + "loss": 0.4493, + "step": 8857 + }, + { + "epoch": 1.9759089895159492, + "grad_norm": 0.185879185795784, + "learning_rate": 5.292844399798559e-06, + "loss": 0.4612, + "step": 8858 + }, + { + "epoch": 1.9761320544278385, + "grad_norm": 0.18604058027267456, + "learning_rate": 5.290768134828568e-06, + "loss": 0.4597, + "step": 8859 + }, + { + "epoch": 1.9763551193397277, + "grad_norm": 0.19229385256767273, + "learning_rate": 5.28869213068546e-06, + "loss": 0.4597, + "step": 8860 + }, + { + "epoch": 1.9765781842516172, + "grad_norm": 0.1805671751499176, + "learning_rate": 5.286616387484217e-06, + "loss": 0.4478, + "step": 8861 + }, + { + "epoch": 1.9768012491635067, + "grad_norm": 0.18257898092269897, + "learning_rate": 5.28454090533981e-06, + "loss": 0.4505, + "step": 8862 + }, + { + "epoch": 1.977024314075396, + "grad_norm": 0.20359763503074646, + "learning_rate": 5.2824656843671865e-06, + "loss": 0.4693, + "step": 8863 + }, + { + "epoch": 1.9772473789872853, + "grad_norm": 0.1799335926771164, + "learning_rate": 5.280390724681288e-06, + "loss": 0.414, + "step": 8864 + }, + { + "epoch": 1.9774704438991746, + "grad_norm": 0.17781312763690948, + "learning_rate": 5.278316026397037e-06, + "loss": 0.4495, + "step": 8865 + }, + { + "epoch": 1.9776935088110639, + "grad_norm": 0.18418540060520172, + "learning_rate": 5.276241589629349e-06, + "loss": 0.4328, + "step": 8866 + }, + { + "epoch": 1.9779165737229534, + "grad_norm": 0.1901324838399887, + "learning_rate": 5.274167414493113e-06, + "loss": 0.4648, + "step": 8867 + }, + { + "epoch": 1.9781396386348429, + "grad_norm": 0.17587338387966156, + "learning_rate": 5.272093501103213e-06, + "loss": 0.4451, + "step": 8868 + }, + { + "epoch": 1.9783627035467322, + "grad_norm": 0.17703662812709808, + "learning_rate": 5.270019849574513e-06, + "loss": 0.4693, + "step": 8869 + }, + { + "epoch": 1.9785857684586214, + "grad_norm": 0.2079380750656128, + "learning_rate": 5.26794646002187e-06, + "loss": 0.47, + "step": 8870 + }, + { + "epoch": 1.9788088333705107, + "grad_norm": 0.21435651183128357, + "learning_rate": 5.265873332560114e-06, + "loss": 0.4404, + "step": 8871 + }, + { + "epoch": 1.9790318982824002, + "grad_norm": 0.1856585443019867, + "learning_rate": 5.263800467304072e-06, + "loss": 0.4442, + "step": 8872 + }, + { + "epoch": 1.9792549631942895, + "grad_norm": 0.178078293800354, + "learning_rate": 5.261727864368554e-06, + "loss": 0.4459, + "step": 8873 + }, + { + "epoch": 1.979478028106179, + "grad_norm": 0.17547845840454102, + "learning_rate": 5.259655523868352e-06, + "loss": 0.4636, + "step": 8874 + }, + { + "epoch": 1.9797010930180683, + "grad_norm": 0.17804180085659027, + "learning_rate": 5.257583445918244e-06, + "loss": 0.4311, + "step": 8875 + }, + { + "epoch": 1.9799241579299576, + "grad_norm": 0.180350661277771, + "learning_rate": 5.2555116306329945e-06, + "loss": 0.4316, + "step": 8876 + }, + { + "epoch": 1.9801472228418469, + "grad_norm": 0.17848455905914307, + "learning_rate": 5.253440078127355e-06, + "loss": 0.4671, + "step": 8877 + }, + { + "epoch": 1.9803702877537364, + "grad_norm": 0.1925760954618454, + "learning_rate": 5.251368788516066e-06, + "loss": 0.462, + "step": 8878 + }, + { + "epoch": 1.9805933526656259, + "grad_norm": 0.19506682455539703, + "learning_rate": 5.249297761913839e-06, + "loss": 0.4718, + "step": 8879 + }, + { + "epoch": 1.9808164175775151, + "grad_norm": 0.18116964399814606, + "learning_rate": 5.2472269984353845e-06, + "loss": 0.4597, + "step": 8880 + }, + { + "epoch": 1.9810394824894044, + "grad_norm": 0.18594235181808472, + "learning_rate": 5.245156498195395e-06, + "loss": 0.453, + "step": 8881 + }, + { + "epoch": 1.9812625474012937, + "grad_norm": 0.1806681901216507, + "learning_rate": 5.243086261308548e-06, + "loss": 0.4442, + "step": 8882 + }, + { + "epoch": 1.981485612313183, + "grad_norm": 0.18260101974010468, + "learning_rate": 5.241016287889511e-06, + "loss": 0.4457, + "step": 8883 + }, + { + "epoch": 1.9817086772250725, + "grad_norm": 0.17734308540821075, + "learning_rate": 5.238946578052921e-06, + "loss": 0.449, + "step": 8884 + }, + { + "epoch": 1.981931742136962, + "grad_norm": 0.1897689253091812, + "learning_rate": 5.236877131913421e-06, + "loss": 0.4611, + "step": 8885 + }, + { + "epoch": 1.9821548070488513, + "grad_norm": 0.1725919246673584, + "learning_rate": 5.234807949585623e-06, + "loss": 0.4231, + "step": 8886 + }, + { + "epoch": 1.9823778719607406, + "grad_norm": 0.18596228957176208, + "learning_rate": 5.2327390311841404e-06, + "loss": 0.45, + "step": 8887 + }, + { + "epoch": 1.9826009368726298, + "grad_norm": 0.18885090947151184, + "learning_rate": 5.230670376823555e-06, + "loss": 0.4753, + "step": 8888 + }, + { + "epoch": 1.9828240017845193, + "grad_norm": 0.18967807292938232, + "learning_rate": 5.228601986618442e-06, + "loss": 0.449, + "step": 8889 + }, + { + "epoch": 1.9830470666964086, + "grad_norm": 0.18121550977230072, + "learning_rate": 5.226533860683366e-06, + "loss": 0.4322, + "step": 8890 + }, + { + "epoch": 1.9832701316082981, + "grad_norm": 0.181602343916893, + "learning_rate": 5.224465999132875e-06, + "loss": 0.4414, + "step": 8891 + }, + { + "epoch": 1.9834931965201874, + "grad_norm": 0.17742736637592316, + "learning_rate": 5.222398402081492e-06, + "loss": 0.4493, + "step": 8892 + }, + { + "epoch": 1.9837162614320767, + "grad_norm": 0.17834076285362244, + "learning_rate": 5.220331069643737e-06, + "loss": 0.4529, + "step": 8893 + }, + { + "epoch": 1.983939326343966, + "grad_norm": 0.17975963652133942, + "learning_rate": 5.218264001934114e-06, + "loss": 0.4164, + "step": 8894 + }, + { + "epoch": 1.9841623912558555, + "grad_norm": 0.18679198622703552, + "learning_rate": 5.216197199067112e-06, + "loss": 0.431, + "step": 8895 + }, + { + "epoch": 1.984385456167745, + "grad_norm": 0.1757335662841797, + "learning_rate": 5.2141306611571965e-06, + "loss": 0.4427, + "step": 8896 + }, + { + "epoch": 1.9846085210796343, + "grad_norm": 0.19077639281749725, + "learning_rate": 5.21206438831883e-06, + "loss": 0.4358, + "step": 8897 + }, + { + "epoch": 1.9848315859915235, + "grad_norm": 0.18716174364089966, + "learning_rate": 5.2099983806664546e-06, + "loss": 0.4621, + "step": 8898 + }, + { + "epoch": 1.9850546509034128, + "grad_norm": 0.1881352961063385, + "learning_rate": 5.207932638314502e-06, + "loss": 0.461, + "step": 8899 + }, + { + "epoch": 1.985277715815302, + "grad_norm": 0.18843919038772583, + "learning_rate": 5.2058671613773805e-06, + "loss": 0.4489, + "step": 8900 + }, + { + "epoch": 1.9855007807271916, + "grad_norm": 0.18741180002689362, + "learning_rate": 5.203801949969491e-06, + "loss": 0.4648, + "step": 8901 + }, + { + "epoch": 1.985723845639081, + "grad_norm": 0.17685861885547638, + "learning_rate": 5.2017370042052205e-06, + "loss": 0.4629, + "step": 8902 + }, + { + "epoch": 1.9859469105509704, + "grad_norm": 0.16930168867111206, + "learning_rate": 5.199672324198935e-06, + "loss": 0.4395, + "step": 8903 + }, + { + "epoch": 1.9861699754628597, + "grad_norm": 0.17272312939167023, + "learning_rate": 5.197607910064997e-06, + "loss": 0.4509, + "step": 8904 + }, + { + "epoch": 1.986393040374749, + "grad_norm": 0.18139375746250153, + "learning_rate": 5.195543761917736e-06, + "loss": 0.4504, + "step": 8905 + }, + { + "epoch": 1.9866161052866385, + "grad_norm": 0.18100585043430328, + "learning_rate": 5.193479879871483e-06, + "loss": 0.4445, + "step": 8906 + }, + { + "epoch": 1.9868391701985277, + "grad_norm": 0.2222553938627243, + "learning_rate": 5.19141626404055e-06, + "loss": 0.4387, + "step": 8907 + }, + { + "epoch": 1.9870622351104172, + "grad_norm": 0.17244549095630646, + "learning_rate": 5.189352914539233e-06, + "loss": 0.4727, + "step": 8908 + }, + { + "epoch": 1.9872853000223065, + "grad_norm": 0.1766839474439621, + "learning_rate": 5.18728983148181e-06, + "loss": 0.4521, + "step": 8909 + }, + { + "epoch": 1.9875083649341958, + "grad_norm": 0.1754491627216339, + "learning_rate": 5.185227014982548e-06, + "loss": 0.4548, + "step": 8910 + }, + { + "epoch": 1.987731429846085, + "grad_norm": 0.17780523002147675, + "learning_rate": 5.183164465155699e-06, + "loss": 0.4563, + "step": 8911 + }, + { + "epoch": 1.9879544947579746, + "grad_norm": 0.1709602177143097, + "learning_rate": 5.181102182115507e-06, + "loss": 0.4194, + "step": 8912 + }, + { + "epoch": 1.988177559669864, + "grad_norm": 0.1960027664899826, + "learning_rate": 5.179040165976183e-06, + "loss": 0.4324, + "step": 8913 + }, + { + "epoch": 1.9884006245817534, + "grad_norm": 0.18167687952518463, + "learning_rate": 5.176978416851941e-06, + "loss": 0.4349, + "step": 8914 + }, + { + "epoch": 1.9886236894936427, + "grad_norm": 0.1885111927986145, + "learning_rate": 5.17491693485697e-06, + "loss": 0.4979, + "step": 8915 + }, + { + "epoch": 1.988846754405532, + "grad_norm": 0.18724943697452545, + "learning_rate": 5.172855720105456e-06, + "loss": 0.4774, + "step": 8916 + }, + { + "epoch": 1.9890698193174212, + "grad_norm": 0.1897243857383728, + "learning_rate": 5.1707947727115515e-06, + "loss": 0.4415, + "step": 8917 + }, + { + "epoch": 1.9892928842293107, + "grad_norm": 0.18215550482273102, + "learning_rate": 5.16873409278941e-06, + "loss": 0.4539, + "step": 8918 + }, + { + "epoch": 1.9895159491412002, + "grad_norm": 0.17774201929569244, + "learning_rate": 5.1666736804531646e-06, + "loss": 0.4199, + "step": 8919 + }, + { + "epoch": 1.9897390140530895, + "grad_norm": 0.17743372917175293, + "learning_rate": 5.164613535816937e-06, + "loss": 0.4457, + "step": 8920 + }, + { + "epoch": 1.9899620789649788, + "grad_norm": 0.17990683019161224, + "learning_rate": 5.162553658994823e-06, + "loss": 0.4648, + "step": 8921 + }, + { + "epoch": 1.990185143876868, + "grad_norm": 0.18305742740631104, + "learning_rate": 5.160494050100917e-06, + "loss": 0.4792, + "step": 8922 + }, + { + "epoch": 1.9904082087887576, + "grad_norm": 0.18619424104690552, + "learning_rate": 5.158434709249291e-06, + "loss": 0.425, + "step": 8923 + }, + { + "epoch": 1.9906312737006469, + "grad_norm": 0.17404185235500336, + "learning_rate": 5.156375636554007e-06, + "loss": 0.4299, + "step": 8924 + }, + { + "epoch": 1.9908543386125364, + "grad_norm": 0.17407676577568054, + "learning_rate": 5.1543168321291115e-06, + "loss": 0.4386, + "step": 8925 + }, + { + "epoch": 1.9910774035244256, + "grad_norm": 0.17693208158016205, + "learning_rate": 5.152258296088626e-06, + "loss": 0.4306, + "step": 8926 + }, + { + "epoch": 1.991300468436315, + "grad_norm": 0.1785212606191635, + "learning_rate": 5.15020002854657e-06, + "loss": 0.4715, + "step": 8927 + }, + { + "epoch": 1.9915235333482042, + "grad_norm": 0.18704737722873688, + "learning_rate": 5.148142029616943e-06, + "loss": 0.4653, + "step": 8928 + }, + { + "epoch": 1.9917465982600937, + "grad_norm": 0.19586940109729767, + "learning_rate": 5.146084299413732e-06, + "loss": 0.4784, + "step": 8929 + }, + { + "epoch": 1.9919696631719832, + "grad_norm": 0.18988698720932007, + "learning_rate": 5.144026838050902e-06, + "loss": 0.4543, + "step": 8930 + }, + { + "epoch": 1.9921927280838725, + "grad_norm": 0.17881841957569122, + "learning_rate": 5.141969645642412e-06, + "loss": 0.4286, + "step": 8931 + }, + { + "epoch": 1.9924157929957618, + "grad_norm": 0.18214870989322662, + "learning_rate": 5.139912722302201e-06, + "loss": 0.435, + "step": 8932 + }, + { + "epoch": 1.992638857907651, + "grad_norm": 0.26209428906440735, + "learning_rate": 5.137856068144197e-06, + "loss": 0.4628, + "step": 8933 + }, + { + "epoch": 1.9928619228195403, + "grad_norm": 0.17895345389842987, + "learning_rate": 5.135799683282309e-06, + "loss": 0.4708, + "step": 8934 + }, + { + "epoch": 1.9930849877314298, + "grad_norm": 0.18702343106269836, + "learning_rate": 5.133743567830427e-06, + "loss": 0.4324, + "step": 8935 + }, + { + "epoch": 1.9933080526433193, + "grad_norm": 0.18158088624477386, + "learning_rate": 5.1316877219024375e-06, + "loss": 0.4554, + "step": 8936 + }, + { + "epoch": 1.9935311175552086, + "grad_norm": 0.1866513192653656, + "learning_rate": 5.129632145612204e-06, + "loss": 0.4397, + "step": 8937 + }, + { + "epoch": 1.993754182467098, + "grad_norm": 0.19149568676948547, + "learning_rate": 5.127576839073583e-06, + "loss": 0.432, + "step": 8938 + }, + { + "epoch": 1.9939772473789872, + "grad_norm": 0.18493977189064026, + "learning_rate": 5.1255218024004e-06, + "loss": 0.4445, + "step": 8939 + }, + { + "epoch": 1.9942003122908767, + "grad_norm": 0.18081733584403992, + "learning_rate": 5.123467035706482e-06, + "loss": 0.4412, + "step": 8940 + }, + { + "epoch": 1.994423377202766, + "grad_norm": 0.17544502019882202, + "learning_rate": 5.121412539105635e-06, + "loss": 0.4076, + "step": 8941 + }, + { + "epoch": 1.9946464421146555, + "grad_norm": 0.20097681879997253, + "learning_rate": 5.119358312711651e-06, + "loss": 0.4676, + "step": 8942 + }, + { + "epoch": 1.9948695070265448, + "grad_norm": 0.17874199151992798, + "learning_rate": 5.117304356638301e-06, + "loss": 0.4525, + "step": 8943 + }, + { + "epoch": 1.995092571938434, + "grad_norm": 0.18695956468582153, + "learning_rate": 5.11525067099935e-06, + "loss": 0.4498, + "step": 8944 + }, + { + "epoch": 1.9953156368503233, + "grad_norm": 0.18029382824897766, + "learning_rate": 5.113197255908543e-06, + "loss": 0.454, + "step": 8945 + }, + { + "epoch": 1.9955387017622128, + "grad_norm": 0.17514650523662567, + "learning_rate": 5.111144111479611e-06, + "loss": 0.4191, + "step": 8946 + }, + { + "epoch": 1.9957617666741023, + "grad_norm": 0.17547555267810822, + "learning_rate": 5.109091237826273e-06, + "loss": 0.4276, + "step": 8947 + }, + { + "epoch": 1.9959848315859916, + "grad_norm": 0.18239977955818176, + "learning_rate": 5.107038635062225e-06, + "loss": 0.4447, + "step": 8948 + }, + { + "epoch": 1.9962078964978809, + "grad_norm": 0.1756354421377182, + "learning_rate": 5.1049863033011535e-06, + "loss": 0.4251, + "step": 8949 + }, + { + "epoch": 1.9964309614097702, + "grad_norm": 0.18526077270507812, + "learning_rate": 5.1029342426567345e-06, + "loss": 0.4603, + "step": 8950 + }, + { + "epoch": 1.9966540263216594, + "grad_norm": 0.20123088359832764, + "learning_rate": 5.100882453242622e-06, + "loss": 0.4473, + "step": 8951 + }, + { + "epoch": 1.996877091233549, + "grad_norm": 0.17634332180023193, + "learning_rate": 5.098830935172453e-06, + "loss": 0.4318, + "step": 8952 + }, + { + "epoch": 1.9971001561454385, + "grad_norm": 0.19645899534225464, + "learning_rate": 5.096779688559857e-06, + "loss": 0.4572, + "step": 8953 + }, + { + "epoch": 1.9973232210573277, + "grad_norm": 0.17646844685077667, + "learning_rate": 5.094728713518442e-06, + "loss": 0.4428, + "step": 8954 + }, + { + "epoch": 1.997546285969217, + "grad_norm": 0.17658363282680511, + "learning_rate": 5.092678010161812e-06, + "loss": 0.4518, + "step": 8955 + }, + { + "epoch": 1.9977693508811063, + "grad_norm": 0.17553554475307465, + "learning_rate": 5.090627578603537e-06, + "loss": 0.4602, + "step": 8956 + }, + { + "epoch": 1.9979924157929958, + "grad_norm": 0.18948344886302948, + "learning_rate": 5.08857741895719e-06, + "loss": 0.4539, + "step": 8957 + }, + { + "epoch": 1.998215480704885, + "grad_norm": 0.17445118725299835, + "learning_rate": 5.086527531336318e-06, + "loss": 0.4532, + "step": 8958 + }, + { + "epoch": 1.9984385456167746, + "grad_norm": 0.1809672862291336, + "learning_rate": 5.084477915854462e-06, + "loss": 0.4203, + "step": 8959 + }, + { + "epoch": 1.9986616105286639, + "grad_norm": 0.17913390696048737, + "learning_rate": 5.082428572625136e-06, + "loss": 0.4322, + "step": 8960 + }, + { + "epoch": 1.9988846754405531, + "grad_norm": 0.18459449708461761, + "learning_rate": 5.080379501761848e-06, + "loss": 0.4865, + "step": 8961 + }, + { + "epoch": 1.9991077403524424, + "grad_norm": 0.18868421018123627, + "learning_rate": 5.0783307033780895e-06, + "loss": 0.4571, + "step": 8962 + }, + { + "epoch": 1.999330805264332, + "grad_norm": 0.17596866190433502, + "learning_rate": 5.076282177587339e-06, + "loss": 0.4382, + "step": 8963 + }, + { + "epoch": 1.9995538701762214, + "grad_norm": 0.174740731716156, + "learning_rate": 5.074233924503047e-06, + "loss": 0.4407, + "step": 8964 + }, + { + "epoch": 1.9997769350881107, + "grad_norm": 0.17470811307430267, + "learning_rate": 5.072185944238665e-06, + "loss": 0.4456, + "step": 8965 + }, + { + "epoch": 2.0, + "grad_norm": 0.18581973016262054, + "learning_rate": 5.070138236907625e-06, + "loss": 0.4513, + "step": 8966 + }, + { + "epoch": 2.0, + "eval_loss": 0.31910037994384766, + "eval_runtime": 667.801, + "eval_samples_per_second": 94.504, + "eval_steps_per_second": 1.478, + "step": 8966 + } + ], + "logging_steps": 1, + "max_steps": 13449, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.876226391488531e+20, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}