diff --git "a/checkpoint-4296/trainer_state.json" "b/checkpoint-4296/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-4296/trainer_state.json" @@ -0,0 +1,30105 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 4296, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000931098696461825, + "grad_norm": 4.652266025543213, + "learning_rate": 5.0000000000000004e-08, + "loss": 1.7999, + "step": 1 + }, + { + "epoch": 0.00186219739292365, + "grad_norm": 4.820754528045654, + "learning_rate": 1.0000000000000001e-07, + "loss": 1.7973, + "step": 2 + }, + { + "epoch": 0.002793296089385475, + "grad_norm": 4.462317943572998, + "learning_rate": 1.5000000000000002e-07, + "loss": 1.8345, + "step": 3 + }, + { + "epoch": 0.0037243947858473, + "grad_norm": 4.625609874725342, + "learning_rate": 2.0000000000000002e-07, + "loss": 1.8793, + "step": 4 + }, + { + "epoch": 0.004655493482309125, + "grad_norm": 5.072265625, + "learning_rate": 2.5000000000000004e-07, + "loss": 1.8234, + "step": 5 + }, + { + "epoch": 0.00558659217877095, + "grad_norm": 5.068264484405518, + "learning_rate": 3.0000000000000004e-07, + "loss": 1.7846, + "step": 6 + }, + { + "epoch": 0.006517690875232775, + "grad_norm": 4.836957931518555, + "learning_rate": 3.5000000000000004e-07, + "loss": 1.8265, + "step": 7 + }, + { + "epoch": 0.0074487895716946, + "grad_norm": 4.526134490966797, + "learning_rate": 4.0000000000000003e-07, + "loss": 1.777, + "step": 8 + }, + { + "epoch": 0.008379888268156424, + "grad_norm": 4.688526153564453, + "learning_rate": 4.5000000000000003e-07, + "loss": 1.7964, + "step": 9 + }, + { + "epoch": 0.00931098696461825, + "grad_norm": 4.841691017150879, + "learning_rate": 5.000000000000001e-07, + "loss": 1.8594, + "step": 10 + }, + { + "epoch": 0.010242085661080074, + "grad_norm": 4.664731979370117, + "learning_rate": 5.5e-07, + "loss": 1.8016, + "step": 11 + }, + { + "epoch": 0.0111731843575419, + "grad_norm": 4.7134294509887695, + "learning_rate": 6.000000000000001e-07, + "loss": 1.7898, + "step": 12 + }, + { + "epoch": 0.012104283054003724, + "grad_norm": 4.467348098754883, + "learning_rate": 6.5e-07, + "loss": 1.767, + "step": 13 + }, + { + "epoch": 0.01303538175046555, + "grad_norm": 4.61416482925415, + "learning_rate": 7.000000000000001e-07, + "loss": 1.8564, + "step": 14 + }, + { + "epoch": 0.013966480446927373, + "grad_norm": 4.473025798797607, + "learning_rate": 7.5e-07, + "loss": 1.7543, + "step": 15 + }, + { + "epoch": 0.0148975791433892, + "grad_norm": 4.157939910888672, + "learning_rate": 8.000000000000001e-07, + "loss": 1.6981, + "step": 16 + }, + { + "epoch": 0.015828677839851025, + "grad_norm": 4.375277519226074, + "learning_rate": 8.500000000000001e-07, + "loss": 1.8426, + "step": 17 + }, + { + "epoch": 0.01675977653631285, + "grad_norm": 3.9843714237213135, + "learning_rate": 9.000000000000001e-07, + "loss": 1.8064, + "step": 18 + }, + { + "epoch": 0.017690875232774673, + "grad_norm": 4.007570743560791, + "learning_rate": 9.500000000000001e-07, + "loss": 1.7724, + "step": 19 + }, + { + "epoch": 0.0186219739292365, + "grad_norm": 3.71211838722229, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.7475, + "step": 20 + }, + { + "epoch": 0.019553072625698324, + "grad_norm": 3.6564059257507324, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.7303, + "step": 21 + }, + { + "epoch": 0.020484171322160148, + "grad_norm": 3.2663612365722656, + "learning_rate": 1.1e-06, + "loss": 1.68, + "step": 22 + }, + { + "epoch": 0.021415270018621976, + "grad_norm": 3.5662412643432617, + "learning_rate": 1.1500000000000002e-06, + "loss": 1.7961, + "step": 23 + }, + { + "epoch": 0.0223463687150838, + "grad_norm": 3.1484689712524414, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.6784, + "step": 24 + }, + { + "epoch": 0.023277467411545624, + "grad_norm": 2.9891276359558105, + "learning_rate": 1.25e-06, + "loss": 1.6174, + "step": 25 + }, + { + "epoch": 0.024208566108007448, + "grad_norm": 2.7955873012542725, + "learning_rate": 1.3e-06, + "loss": 1.6626, + "step": 26 + }, + { + "epoch": 0.025139664804469275, + "grad_norm": 2.6824326515197754, + "learning_rate": 1.3500000000000002e-06, + "loss": 1.693, + "step": 27 + }, + { + "epoch": 0.0260707635009311, + "grad_norm": 2.4676928520202637, + "learning_rate": 1.4000000000000001e-06, + "loss": 1.6401, + "step": 28 + }, + { + "epoch": 0.027001862197392923, + "grad_norm": 2.366990327835083, + "learning_rate": 1.45e-06, + "loss": 1.595, + "step": 29 + }, + { + "epoch": 0.027932960893854747, + "grad_norm": 2.2949702739715576, + "learning_rate": 1.5e-06, + "loss": 1.595, + "step": 30 + }, + { + "epoch": 0.028864059590316574, + "grad_norm": 2.186319351196289, + "learning_rate": 1.5500000000000002e-06, + "loss": 1.6694, + "step": 31 + }, + { + "epoch": 0.0297951582867784, + "grad_norm": 2.113795280456543, + "learning_rate": 1.6000000000000001e-06, + "loss": 1.5779, + "step": 32 + }, + { + "epoch": 0.030726256983240222, + "grad_norm": 2.0262715816497803, + "learning_rate": 1.6500000000000003e-06, + "loss": 1.5292, + "step": 33 + }, + { + "epoch": 0.03165735567970205, + "grad_norm": 1.9731030464172363, + "learning_rate": 1.7000000000000002e-06, + "loss": 1.5463, + "step": 34 + }, + { + "epoch": 0.032588454376163874, + "grad_norm": 2.100799322128296, + "learning_rate": 1.75e-06, + "loss": 1.4808, + "step": 35 + }, + { + "epoch": 0.0335195530726257, + "grad_norm": 2.134826183319092, + "learning_rate": 1.8000000000000001e-06, + "loss": 1.56, + "step": 36 + }, + { + "epoch": 0.03445065176908752, + "grad_norm": 2.2226216793060303, + "learning_rate": 1.85e-06, + "loss": 1.5058, + "step": 37 + }, + { + "epoch": 0.035381750465549346, + "grad_norm": 2.211540937423706, + "learning_rate": 1.9000000000000002e-06, + "loss": 1.5198, + "step": 38 + }, + { + "epoch": 0.036312849162011177, + "grad_norm": 2.066465139389038, + "learning_rate": 1.9500000000000004e-06, + "loss": 1.5065, + "step": 39 + }, + { + "epoch": 0.037243947858473, + "grad_norm": 2.0119385719299316, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.461, + "step": 40 + }, + { + "epoch": 0.038175046554934824, + "grad_norm": 1.9685887098312378, + "learning_rate": 2.05e-06, + "loss": 1.485, + "step": 41 + }, + { + "epoch": 0.03910614525139665, + "grad_norm": 1.9120490550994873, + "learning_rate": 2.1000000000000002e-06, + "loss": 1.4428, + "step": 42 + }, + { + "epoch": 0.04003724394785847, + "grad_norm": 1.780713677406311, + "learning_rate": 2.15e-06, + "loss": 1.4762, + "step": 43 + }, + { + "epoch": 0.040968342644320296, + "grad_norm": 1.8144075870513916, + "learning_rate": 2.2e-06, + "loss": 1.4752, + "step": 44 + }, + { + "epoch": 0.04189944134078212, + "grad_norm": 1.7422245740890503, + "learning_rate": 2.25e-06, + "loss": 1.4352, + "step": 45 + }, + { + "epoch": 0.04283054003724395, + "grad_norm": 1.7721035480499268, + "learning_rate": 2.3000000000000004e-06, + "loss": 1.413, + "step": 46 + }, + { + "epoch": 0.043761638733705775, + "grad_norm": 1.5673249959945679, + "learning_rate": 2.35e-06, + "loss": 1.4243, + "step": 47 + }, + { + "epoch": 0.0446927374301676, + "grad_norm": 1.660043716430664, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.4073, + "step": 48 + }, + { + "epoch": 0.04562383612662942, + "grad_norm": 1.6705303192138672, + "learning_rate": 2.4500000000000003e-06, + "loss": 1.4532, + "step": 49 + }, + { + "epoch": 0.04655493482309125, + "grad_norm": 1.587410807609558, + "learning_rate": 2.5e-06, + "loss": 1.3665, + "step": 50 + }, + { + "epoch": 0.04748603351955307, + "grad_norm": 1.536221981048584, + "learning_rate": 2.55e-06, + "loss": 1.4453, + "step": 51 + }, + { + "epoch": 0.048417132216014895, + "grad_norm": 1.5890748500823975, + "learning_rate": 2.6e-06, + "loss": 1.4185, + "step": 52 + }, + { + "epoch": 0.049348230912476726, + "grad_norm": 1.5140084028244019, + "learning_rate": 2.6500000000000005e-06, + "loss": 1.3715, + "step": 53 + }, + { + "epoch": 0.05027932960893855, + "grad_norm": 1.5359089374542236, + "learning_rate": 2.7000000000000004e-06, + "loss": 1.4454, + "step": 54 + }, + { + "epoch": 0.051210428305400374, + "grad_norm": 1.6898878812789917, + "learning_rate": 2.7500000000000004e-06, + "loss": 1.42, + "step": 55 + }, + { + "epoch": 0.0521415270018622, + "grad_norm": 1.7680490016937256, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.3853, + "step": 56 + }, + { + "epoch": 0.05307262569832402, + "grad_norm": 1.5281091928482056, + "learning_rate": 2.85e-06, + "loss": 1.3481, + "step": 57 + }, + { + "epoch": 0.054003724394785846, + "grad_norm": 1.6096315383911133, + "learning_rate": 2.9e-06, + "loss": 1.3911, + "step": 58 + }, + { + "epoch": 0.05493482309124767, + "grad_norm": 1.6074330806732178, + "learning_rate": 2.95e-06, + "loss": 1.3572, + "step": 59 + }, + { + "epoch": 0.055865921787709494, + "grad_norm": 1.5256246328353882, + "learning_rate": 3e-06, + "loss": 1.3775, + "step": 60 + }, + { + "epoch": 0.056797020484171325, + "grad_norm": 1.5154070854187012, + "learning_rate": 3.05e-06, + "loss": 1.3726, + "step": 61 + }, + { + "epoch": 0.05772811918063315, + "grad_norm": 1.547666072845459, + "learning_rate": 3.1000000000000004e-06, + "loss": 1.422, + "step": 62 + }, + { + "epoch": 0.05865921787709497, + "grad_norm": 1.7028982639312744, + "learning_rate": 3.1500000000000003e-06, + "loss": 1.3521, + "step": 63 + }, + { + "epoch": 0.0595903165735568, + "grad_norm": 1.5640915632247925, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.3419, + "step": 64 + }, + { + "epoch": 0.06052141527001862, + "grad_norm": 1.460614800453186, + "learning_rate": 3.2500000000000002e-06, + "loss": 1.3397, + "step": 65 + }, + { + "epoch": 0.061452513966480445, + "grad_norm": 1.4689096212387085, + "learning_rate": 3.3000000000000006e-06, + "loss": 1.3239, + "step": 66 + }, + { + "epoch": 0.06238361266294227, + "grad_norm": 1.5040433406829834, + "learning_rate": 3.3500000000000005e-06, + "loss": 1.368, + "step": 67 + }, + { + "epoch": 0.0633147113594041, + "grad_norm": 1.4992578029632568, + "learning_rate": 3.4000000000000005e-06, + "loss": 1.3752, + "step": 68 + }, + { + "epoch": 0.06424581005586592, + "grad_norm": 1.4988967180252075, + "learning_rate": 3.45e-06, + "loss": 1.3404, + "step": 69 + }, + { + "epoch": 0.06517690875232775, + "grad_norm": 1.4750804901123047, + "learning_rate": 3.5e-06, + "loss": 1.3108, + "step": 70 + }, + { + "epoch": 0.06610800744878957, + "grad_norm": 1.5298198461532593, + "learning_rate": 3.5500000000000003e-06, + "loss": 1.3074, + "step": 71 + }, + { + "epoch": 0.0670391061452514, + "grad_norm": 1.4945071935653687, + "learning_rate": 3.6000000000000003e-06, + "loss": 1.3569, + "step": 72 + }, + { + "epoch": 0.06797020484171322, + "grad_norm": 1.4995956420898438, + "learning_rate": 3.65e-06, + "loss": 1.341, + "step": 73 + }, + { + "epoch": 0.06890130353817504, + "grad_norm": 1.4956507682800293, + "learning_rate": 3.7e-06, + "loss": 1.3281, + "step": 74 + }, + { + "epoch": 0.06983240223463687, + "grad_norm": 1.8083794116973877, + "learning_rate": 3.7500000000000005e-06, + "loss": 1.3557, + "step": 75 + }, + { + "epoch": 0.07076350093109869, + "grad_norm": 1.417014241218567, + "learning_rate": 3.8000000000000005e-06, + "loss": 1.32, + "step": 76 + }, + { + "epoch": 0.07169459962756052, + "grad_norm": 1.4146186113357544, + "learning_rate": 3.85e-06, + "loss": 1.3017, + "step": 77 + }, + { + "epoch": 0.07262569832402235, + "grad_norm": 1.394770622253418, + "learning_rate": 3.900000000000001e-06, + "loss": 1.289, + "step": 78 + }, + { + "epoch": 0.07355679702048418, + "grad_norm": 1.4548990726470947, + "learning_rate": 3.95e-06, + "loss": 1.2983, + "step": 79 + }, + { + "epoch": 0.074487895716946, + "grad_norm": 1.4482989311218262, + "learning_rate": 4.000000000000001e-06, + "loss": 1.3115, + "step": 80 + }, + { + "epoch": 0.07541899441340782, + "grad_norm": 1.4169769287109375, + "learning_rate": 4.05e-06, + "loss": 1.2504, + "step": 81 + }, + { + "epoch": 0.07635009310986965, + "grad_norm": 1.5154104232788086, + "learning_rate": 4.1e-06, + "loss": 1.2701, + "step": 82 + }, + { + "epoch": 0.07728119180633147, + "grad_norm": 1.543452262878418, + "learning_rate": 4.15e-06, + "loss": 1.3176, + "step": 83 + }, + { + "epoch": 0.0782122905027933, + "grad_norm": 1.4130207300186157, + "learning_rate": 4.2000000000000004e-06, + "loss": 1.2498, + "step": 84 + }, + { + "epoch": 0.07914338919925512, + "grad_norm": 1.4742012023925781, + "learning_rate": 4.25e-06, + "loss": 1.3082, + "step": 85 + }, + { + "epoch": 0.08007448789571694, + "grad_norm": 1.4841349124908447, + "learning_rate": 4.3e-06, + "loss": 1.3189, + "step": 86 + }, + { + "epoch": 0.08100558659217877, + "grad_norm": 1.4499276876449585, + "learning_rate": 4.350000000000001e-06, + "loss": 1.2709, + "step": 87 + }, + { + "epoch": 0.08193668528864059, + "grad_norm": 1.3884755373001099, + "learning_rate": 4.4e-06, + "loss": 1.2855, + "step": 88 + }, + { + "epoch": 0.08286778398510242, + "grad_norm": 1.4352260828018188, + "learning_rate": 4.450000000000001e-06, + "loss": 1.2763, + "step": 89 + }, + { + "epoch": 0.08379888268156424, + "grad_norm": 1.4299675226211548, + "learning_rate": 4.5e-06, + "loss": 1.2597, + "step": 90 + }, + { + "epoch": 0.08472998137802606, + "grad_norm": 1.4941706657409668, + "learning_rate": 4.5500000000000005e-06, + "loss": 1.2807, + "step": 91 + }, + { + "epoch": 0.0856610800744879, + "grad_norm": 1.5136346817016602, + "learning_rate": 4.600000000000001e-06, + "loss": 1.2434, + "step": 92 + }, + { + "epoch": 0.08659217877094973, + "grad_norm": 1.5864630937576294, + "learning_rate": 4.65e-06, + "loss": 1.2247, + "step": 93 + }, + { + "epoch": 0.08752327746741155, + "grad_norm": 1.6212531328201294, + "learning_rate": 4.7e-06, + "loss": 1.3218, + "step": 94 + }, + { + "epoch": 0.08845437616387337, + "grad_norm": 1.4686894416809082, + "learning_rate": 4.75e-06, + "loss": 1.2693, + "step": 95 + }, + { + "epoch": 0.0893854748603352, + "grad_norm": 1.456432819366455, + "learning_rate": 4.800000000000001e-06, + "loss": 1.2696, + "step": 96 + }, + { + "epoch": 0.09031657355679702, + "grad_norm": 1.449016809463501, + "learning_rate": 4.85e-06, + "loss": 1.2402, + "step": 97 + }, + { + "epoch": 0.09124767225325885, + "grad_norm": 1.454689860343933, + "learning_rate": 4.9000000000000005e-06, + "loss": 1.3209, + "step": 98 + }, + { + "epoch": 0.09217877094972067, + "grad_norm": 1.5578409433364868, + "learning_rate": 4.95e-06, + "loss": 1.2525, + "step": 99 + }, + { + "epoch": 0.0931098696461825, + "grad_norm": 1.3689794540405273, + "learning_rate": 5e-06, + "loss": 1.2663, + "step": 100 + }, + { + "epoch": 0.09404096834264432, + "grad_norm": 1.7641292810440063, + "learning_rate": 4.999999693462649e-06, + "loss": 1.2557, + "step": 101 + }, + { + "epoch": 0.09497206703910614, + "grad_norm": 1.497525930404663, + "learning_rate": 4.999998773850669e-06, + "loss": 1.2893, + "step": 102 + }, + { + "epoch": 0.09590316573556797, + "grad_norm": 1.5183961391448975, + "learning_rate": 4.999997241164287e-06, + "loss": 1.2804, + "step": 103 + }, + { + "epoch": 0.09683426443202979, + "grad_norm": 1.4652289152145386, + "learning_rate": 4.999995095403878e-06, + "loss": 1.2537, + "step": 104 + }, + { + "epoch": 0.09776536312849161, + "grad_norm": 1.6081790924072266, + "learning_rate": 4.999992336569969e-06, + "loss": 1.2438, + "step": 105 + }, + { + "epoch": 0.09869646182495345, + "grad_norm": 1.6800453662872314, + "learning_rate": 4.999988964663236e-06, + "loss": 1.2564, + "step": 106 + }, + { + "epoch": 0.09962756052141528, + "grad_norm": 1.4903538227081299, + "learning_rate": 4.999984979684505e-06, + "loss": 1.2658, + "step": 107 + }, + { + "epoch": 0.1005586592178771, + "grad_norm": 1.5448410511016846, + "learning_rate": 4.999980381634756e-06, + "loss": 1.2739, + "step": 108 + }, + { + "epoch": 0.10148975791433892, + "grad_norm": 1.549705982208252, + "learning_rate": 4.999975170515114e-06, + "loss": 1.2424, + "step": 109 + }, + { + "epoch": 0.10242085661080075, + "grad_norm": 1.7979212999343872, + "learning_rate": 4.999969346326857e-06, + "loss": 1.2482, + "step": 110 + }, + { + "epoch": 0.10335195530726257, + "grad_norm": 1.4740439653396606, + "learning_rate": 4.999962909071414e-06, + "loss": 1.257, + "step": 111 + }, + { + "epoch": 0.1042830540037244, + "grad_norm": 1.5189530849456787, + "learning_rate": 4.999955858750365e-06, + "loss": 1.1822, + "step": 112 + }, + { + "epoch": 0.10521415270018622, + "grad_norm": 1.541293740272522, + "learning_rate": 4.999948195365436e-06, + "loss": 1.2725, + "step": 113 + }, + { + "epoch": 0.10614525139664804, + "grad_norm": 1.4437657594680786, + "learning_rate": 4.9999399189185085e-06, + "loss": 1.2148, + "step": 114 + }, + { + "epoch": 0.10707635009310987, + "grad_norm": 1.7418147325515747, + "learning_rate": 4.999931029411611e-06, + "loss": 1.2545, + "step": 115 + }, + { + "epoch": 0.10800744878957169, + "grad_norm": 1.450005292892456, + "learning_rate": 4.999921526846925e-06, + "loss": 1.2965, + "step": 116 + }, + { + "epoch": 0.10893854748603352, + "grad_norm": 1.5809992551803589, + "learning_rate": 4.999911411226779e-06, + "loss": 1.2881, + "step": 117 + }, + { + "epoch": 0.10986964618249534, + "grad_norm": 1.6253516674041748, + "learning_rate": 4.9999006825536545e-06, + "loss": 1.2578, + "step": 118 + }, + { + "epoch": 0.11080074487895716, + "grad_norm": 1.5688159465789795, + "learning_rate": 4.999889340830183e-06, + "loss": 1.2826, + "step": 119 + }, + { + "epoch": 0.11173184357541899, + "grad_norm": 1.5473788976669312, + "learning_rate": 4.999877386059144e-06, + "loss": 1.1899, + "step": 120 + }, + { + "epoch": 0.11266294227188083, + "grad_norm": 1.547783374786377, + "learning_rate": 4.999864818243471e-06, + "loss": 1.2287, + "step": 121 + }, + { + "epoch": 0.11359404096834265, + "grad_norm": 1.5663732290267944, + "learning_rate": 4.999851637386246e-06, + "loss": 1.2096, + "step": 122 + }, + { + "epoch": 0.11452513966480447, + "grad_norm": 1.7208932638168335, + "learning_rate": 4.9998378434907e-06, + "loss": 1.2281, + "step": 123 + }, + { + "epoch": 0.1154562383612663, + "grad_norm": 1.6368746757507324, + "learning_rate": 4.9998234365602164e-06, + "loss": 1.2892, + "step": 124 + }, + { + "epoch": 0.11638733705772812, + "grad_norm": 1.7141762971878052, + "learning_rate": 4.999808416598329e-06, + "loss": 1.2207, + "step": 125 + }, + { + "epoch": 0.11731843575418995, + "grad_norm": 1.6172239780426025, + "learning_rate": 4.99979278360872e-06, + "loss": 1.2188, + "step": 126 + }, + { + "epoch": 0.11824953445065177, + "grad_norm": 1.4444636106491089, + "learning_rate": 4.999776537595223e-06, + "loss": 1.2105, + "step": 127 + }, + { + "epoch": 0.1191806331471136, + "grad_norm": 1.526252269744873, + "learning_rate": 4.999759678561822e-06, + "loss": 1.2613, + "step": 128 + }, + { + "epoch": 0.12011173184357542, + "grad_norm": 1.5656245946884155, + "learning_rate": 4.999742206512653e-06, + "loss": 1.2439, + "step": 129 + }, + { + "epoch": 0.12104283054003724, + "grad_norm": 1.6116547584533691, + "learning_rate": 4.999724121451998e-06, + "loss": 1.2393, + "step": 130 + }, + { + "epoch": 0.12197392923649907, + "grad_norm": 1.5212883949279785, + "learning_rate": 4.999705423384296e-06, + "loss": 1.2332, + "step": 131 + }, + { + "epoch": 0.12290502793296089, + "grad_norm": 1.5971696376800537, + "learning_rate": 4.9996861123141274e-06, + "loss": 1.2412, + "step": 132 + }, + { + "epoch": 0.12383612662942271, + "grad_norm": 1.5015281438827515, + "learning_rate": 4.999666188246231e-06, + "loss": 1.262, + "step": 133 + }, + { + "epoch": 0.12476722532588454, + "grad_norm": 1.527295470237732, + "learning_rate": 4.999645651185492e-06, + "loss": 1.2187, + "step": 134 + }, + { + "epoch": 0.12569832402234637, + "grad_norm": 1.572506070137024, + "learning_rate": 4.999624501136947e-06, + "loss": 1.2401, + "step": 135 + }, + { + "epoch": 0.1266294227188082, + "grad_norm": 1.5193438529968262, + "learning_rate": 4.9996027381057825e-06, + "loss": 1.2444, + "step": 136 + }, + { + "epoch": 0.12756052141527002, + "grad_norm": 1.600056767463684, + "learning_rate": 4.9995803620973335e-06, + "loss": 1.2193, + "step": 137 + }, + { + "epoch": 0.12849162011173185, + "grad_norm": 1.4413717985153198, + "learning_rate": 4.999557373117091e-06, + "loss": 1.2166, + "step": 138 + }, + { + "epoch": 0.12942271880819367, + "grad_norm": 1.5173741579055786, + "learning_rate": 4.99953377117069e-06, + "loss": 1.2545, + "step": 139 + }, + { + "epoch": 0.1303538175046555, + "grad_norm": 1.6268305778503418, + "learning_rate": 4.999509556263919e-06, + "loss": 1.2699, + "step": 140 + }, + { + "epoch": 0.13128491620111732, + "grad_norm": 1.5302830934524536, + "learning_rate": 4.999484728402716e-06, + "loss": 1.2474, + "step": 141 + }, + { + "epoch": 0.13221601489757914, + "grad_norm": 1.5415630340576172, + "learning_rate": 4.99945928759317e-06, + "loss": 1.2487, + "step": 142 + }, + { + "epoch": 0.13314711359404097, + "grad_norm": 2.400120258331299, + "learning_rate": 4.999433233841519e-06, + "loss": 1.2102, + "step": 143 + }, + { + "epoch": 0.1340782122905028, + "grad_norm": 1.6354401111602783, + "learning_rate": 4.999406567154155e-06, + "loss": 1.2152, + "step": 144 + }, + { + "epoch": 0.13500931098696461, + "grad_norm": 1.504473090171814, + "learning_rate": 4.999379287537613e-06, + "loss": 1.2464, + "step": 145 + }, + { + "epoch": 0.13594040968342644, + "grad_norm": 1.461122751235962, + "learning_rate": 4.999351394998586e-06, + "loss": 1.2347, + "step": 146 + }, + { + "epoch": 0.13687150837988826, + "grad_norm": 1.4422621726989746, + "learning_rate": 4.999322889543913e-06, + "loss": 1.2246, + "step": 147 + }, + { + "epoch": 0.1378026070763501, + "grad_norm": 1.5868957042694092, + "learning_rate": 4.999293771180584e-06, + "loss": 1.2221, + "step": 148 + }, + { + "epoch": 0.1387337057728119, + "grad_norm": 1.6620584726333618, + "learning_rate": 4.999264039915741e-06, + "loss": 1.2739, + "step": 149 + }, + { + "epoch": 0.13966480446927373, + "grad_norm": 1.5058534145355225, + "learning_rate": 4.9992336957566735e-06, + "loss": 1.2088, + "step": 150 + }, + { + "epoch": 0.14059590316573556, + "grad_norm": 1.5899105072021484, + "learning_rate": 4.999202738710824e-06, + "loss": 1.2266, + "step": 151 + }, + { + "epoch": 0.14152700186219738, + "grad_norm": 1.592184066772461, + "learning_rate": 4.999171168785783e-06, + "loss": 1.1896, + "step": 152 + }, + { + "epoch": 0.1424581005586592, + "grad_norm": 1.6183990240097046, + "learning_rate": 4.999138985989293e-06, + "loss": 1.267, + "step": 153 + }, + { + "epoch": 0.14338919925512103, + "grad_norm": 1.4739596843719482, + "learning_rate": 4.999106190329247e-06, + "loss": 1.1763, + "step": 154 + }, + { + "epoch": 0.14432029795158285, + "grad_norm": 1.6200612783432007, + "learning_rate": 4.9990727818136865e-06, + "loss": 1.2519, + "step": 155 + }, + { + "epoch": 0.1452513966480447, + "grad_norm": 1.555408000946045, + "learning_rate": 4.9990387604508035e-06, + "loss": 1.2342, + "step": 156 + }, + { + "epoch": 0.14618249534450653, + "grad_norm": 1.7190477848052979, + "learning_rate": 4.999004126248943e-06, + "loss": 1.2354, + "step": 157 + }, + { + "epoch": 0.14711359404096835, + "grad_norm": 1.5006803274154663, + "learning_rate": 4.998968879216597e-06, + "loss": 1.2088, + "step": 158 + }, + { + "epoch": 0.14804469273743018, + "grad_norm": 1.536983847618103, + "learning_rate": 4.998933019362408e-06, + "loss": 1.2691, + "step": 159 + }, + { + "epoch": 0.148975791433892, + "grad_norm": 1.5591208934783936, + "learning_rate": 4.998896546695173e-06, + "loss": 1.212, + "step": 160 + }, + { + "epoch": 0.14990689013035383, + "grad_norm": 1.4551342725753784, + "learning_rate": 4.998859461223834e-06, + "loss": 1.248, + "step": 161 + }, + { + "epoch": 0.15083798882681565, + "grad_norm": 1.506801724433899, + "learning_rate": 4.9988217629574855e-06, + "loss": 1.2388, + "step": 162 + }, + { + "epoch": 0.15176908752327747, + "grad_norm": 2.3571014404296875, + "learning_rate": 4.998783451905374e-06, + "loss": 1.2038, + "step": 163 + }, + { + "epoch": 0.1527001862197393, + "grad_norm": 1.5549551248550415, + "learning_rate": 4.998744528076892e-06, + "loss": 1.2224, + "step": 164 + }, + { + "epoch": 0.15363128491620112, + "grad_norm": 1.6795048713684082, + "learning_rate": 4.998704991481587e-06, + "loss": 1.2473, + "step": 165 + }, + { + "epoch": 0.15456238361266295, + "grad_norm": 1.58402419090271, + "learning_rate": 4.9986648421291525e-06, + "loss": 1.2011, + "step": 166 + }, + { + "epoch": 0.15549348230912477, + "grad_norm": 1.589712142944336, + "learning_rate": 4.998624080029436e-06, + "loss": 1.1923, + "step": 167 + }, + { + "epoch": 0.1564245810055866, + "grad_norm": 1.5033369064331055, + "learning_rate": 4.998582705192433e-06, + "loss": 1.2498, + "step": 168 + }, + { + "epoch": 0.15735567970204842, + "grad_norm": 1.5704784393310547, + "learning_rate": 4.99854071762829e-06, + "loss": 1.2255, + "step": 169 + }, + { + "epoch": 0.15828677839851024, + "grad_norm": 1.637010097503662, + "learning_rate": 4.9984981173473025e-06, + "loss": 1.1929, + "step": 170 + }, + { + "epoch": 0.15921787709497207, + "grad_norm": 1.563796043395996, + "learning_rate": 4.998454904359919e-06, + "loss": 1.2376, + "step": 171 + }, + { + "epoch": 0.1601489757914339, + "grad_norm": 1.512030839920044, + "learning_rate": 4.998411078676736e-06, + "loss": 1.1806, + "step": 172 + }, + { + "epoch": 0.1610800744878957, + "grad_norm": 1.468132734298706, + "learning_rate": 4.998366640308501e-06, + "loss": 1.2143, + "step": 173 + }, + { + "epoch": 0.16201117318435754, + "grad_norm": 1.5265820026397705, + "learning_rate": 4.998321589266111e-06, + "loss": 1.2083, + "step": 174 + }, + { + "epoch": 0.16294227188081936, + "grad_norm": 1.471686601638794, + "learning_rate": 4.998275925560614e-06, + "loss": 1.2075, + "step": 175 + }, + { + "epoch": 0.16387337057728119, + "grad_norm": 1.5236564874649048, + "learning_rate": 4.9982296492032084e-06, + "loss": 1.1755, + "step": 176 + }, + { + "epoch": 0.164804469273743, + "grad_norm": 1.5038493871688843, + "learning_rate": 4.998182760205243e-06, + "loss": 1.2131, + "step": 177 + }, + { + "epoch": 0.16573556797020483, + "grad_norm": 1.6880675554275513, + "learning_rate": 4.9981352585782154e-06, + "loss": 1.2333, + "step": 178 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 1.4586597681045532, + "learning_rate": 4.9980871443337755e-06, + "loss": 1.1889, + "step": 179 + }, + { + "epoch": 0.16759776536312848, + "grad_norm": 1.5220293998718262, + "learning_rate": 4.998038417483721e-06, + "loss": 1.2134, + "step": 180 + }, + { + "epoch": 0.1685288640595903, + "grad_norm": 1.6228876113891602, + "learning_rate": 4.997989078040003e-06, + "loss": 1.2357, + "step": 181 + }, + { + "epoch": 0.16945996275605213, + "grad_norm": 1.5319768190383911, + "learning_rate": 4.99793912601472e-06, + "loss": 1.1883, + "step": 182 + }, + { + "epoch": 0.17039106145251395, + "grad_norm": 1.460939884185791, + "learning_rate": 4.99788856142012e-06, + "loss": 1.1607, + "step": 183 + }, + { + "epoch": 0.1713221601489758, + "grad_norm": 1.5423537492752075, + "learning_rate": 4.997837384268606e-06, + "loss": 1.2033, + "step": 184 + }, + { + "epoch": 0.17225325884543763, + "grad_norm": 1.5639525651931763, + "learning_rate": 4.997785594572726e-06, + "loss": 1.231, + "step": 185 + }, + { + "epoch": 0.17318435754189945, + "grad_norm": 1.5919618606567383, + "learning_rate": 4.997733192345181e-06, + "loss": 1.211, + "step": 186 + }, + { + "epoch": 0.17411545623836128, + "grad_norm": 1.6297861337661743, + "learning_rate": 4.997680177598823e-06, + "loss": 1.1782, + "step": 187 + }, + { + "epoch": 0.1750465549348231, + "grad_norm": 1.542855978012085, + "learning_rate": 4.99762655034665e-06, + "loss": 1.2185, + "step": 188 + }, + { + "epoch": 0.17597765363128492, + "grad_norm": 1.547610878944397, + "learning_rate": 4.997572310601816e-06, + "loss": 1.209, + "step": 189 + }, + { + "epoch": 0.17690875232774675, + "grad_norm": 1.5525280237197876, + "learning_rate": 4.99751745837762e-06, + "loss": 1.2088, + "step": 190 + }, + { + "epoch": 0.17783985102420857, + "grad_norm": 1.5458629131317139, + "learning_rate": 4.997461993687514e-06, + "loss": 1.2057, + "step": 191 + }, + { + "epoch": 0.1787709497206704, + "grad_norm": 1.614859700202942, + "learning_rate": 4.997405916545101e-06, + "loss": 1.2307, + "step": 192 + }, + { + "epoch": 0.17970204841713222, + "grad_norm": 16.333786010742188, + "learning_rate": 4.99734922696413e-06, + "loss": 1.2222, + "step": 193 + }, + { + "epoch": 0.18063314711359404, + "grad_norm": 1.5468389987945557, + "learning_rate": 4.997291924958506e-06, + "loss": 1.2269, + "step": 194 + }, + { + "epoch": 0.18156424581005587, + "grad_norm": 1.5191587209701538, + "learning_rate": 4.997234010542279e-06, + "loss": 1.2483, + "step": 195 + }, + { + "epoch": 0.1824953445065177, + "grad_norm": 1.5326647758483887, + "learning_rate": 4.9971754837296514e-06, + "loss": 1.1967, + "step": 196 + }, + { + "epoch": 0.18342644320297952, + "grad_norm": 1.5105819702148438, + "learning_rate": 4.9971163445349775e-06, + "loss": 1.2059, + "step": 197 + }, + { + "epoch": 0.18435754189944134, + "grad_norm": 1.5659078359603882, + "learning_rate": 4.997056592972758e-06, + "loss": 1.226, + "step": 198 + }, + { + "epoch": 0.18528864059590316, + "grad_norm": 1.4845701456069946, + "learning_rate": 4.996996229057648e-06, + "loss": 1.1691, + "step": 199 + }, + { + "epoch": 0.186219739292365, + "grad_norm": 1.5166332721710205, + "learning_rate": 4.996935252804448e-06, + "loss": 1.2265, + "step": 200 + }, + { + "epoch": 0.1871508379888268, + "grad_norm": 1.6071895360946655, + "learning_rate": 4.9968736642281125e-06, + "loss": 1.2409, + "step": 201 + }, + { + "epoch": 0.18808193668528864, + "grad_norm": 1.712390661239624, + "learning_rate": 4.9968114633437445e-06, + "loss": 1.1973, + "step": 202 + }, + { + "epoch": 0.18901303538175046, + "grad_norm": 1.5515329837799072, + "learning_rate": 4.996748650166599e-06, + "loss": 1.1616, + "step": 203 + }, + { + "epoch": 0.18994413407821228, + "grad_norm": 1.6127967834472656, + "learning_rate": 4.996685224712077e-06, + "loss": 1.2296, + "step": 204 + }, + { + "epoch": 0.1908752327746741, + "grad_norm": 1.585608720779419, + "learning_rate": 4.996621186995734e-06, + "loss": 1.2399, + "step": 205 + }, + { + "epoch": 0.19180633147113593, + "grad_norm": 1.6636781692504883, + "learning_rate": 4.996556537033274e-06, + "loss": 1.2427, + "step": 206 + }, + { + "epoch": 0.19273743016759776, + "grad_norm": 1.6432000398635864, + "learning_rate": 4.9964912748405504e-06, + "loss": 1.2324, + "step": 207 + }, + { + "epoch": 0.19366852886405958, + "grad_norm": 1.5804307460784912, + "learning_rate": 4.996425400433569e-06, + "loss": 1.1804, + "step": 208 + }, + { + "epoch": 0.1945996275605214, + "grad_norm": 1.5581324100494385, + "learning_rate": 4.996358913828482e-06, + "loss": 1.1843, + "step": 209 + }, + { + "epoch": 0.19553072625698323, + "grad_norm": 1.606521725654602, + "learning_rate": 4.996291815041595e-06, + "loss": 1.2607, + "step": 210 + }, + { + "epoch": 0.19646182495344505, + "grad_norm": 1.5909523963928223, + "learning_rate": 4.996224104089363e-06, + "loss": 1.2228, + "step": 211 + }, + { + "epoch": 0.1973929236499069, + "grad_norm": 1.5124651193618774, + "learning_rate": 4.99615578098839e-06, + "loss": 1.1794, + "step": 212 + }, + { + "epoch": 0.19832402234636873, + "grad_norm": 1.5392098426818848, + "learning_rate": 4.9960868457554305e-06, + "loss": 1.2181, + "step": 213 + }, + { + "epoch": 0.19925512104283055, + "grad_norm": 1.5319880247116089, + "learning_rate": 4.996017298407391e-06, + "loss": 1.1936, + "step": 214 + }, + { + "epoch": 0.20018621973929238, + "grad_norm": 1.5836493968963623, + "learning_rate": 4.995947138961326e-06, + "loss": 1.2581, + "step": 215 + }, + { + "epoch": 0.2011173184357542, + "grad_norm": 1.6041829586029053, + "learning_rate": 4.995876367434439e-06, + "loss": 1.2059, + "step": 216 + }, + { + "epoch": 0.20204841713221602, + "grad_norm": 1.6239100694656372, + "learning_rate": 4.995804983844088e-06, + "loss": 1.1831, + "step": 217 + }, + { + "epoch": 0.20297951582867785, + "grad_norm": 1.5361995697021484, + "learning_rate": 4.995732988207777e-06, + "loss": 1.1793, + "step": 218 + }, + { + "epoch": 0.20391061452513967, + "grad_norm": 1.5640617609024048, + "learning_rate": 4.995660380543162e-06, + "loss": 1.2027, + "step": 219 + }, + { + "epoch": 0.2048417132216015, + "grad_norm": 1.5686819553375244, + "learning_rate": 4.995587160868047e-06, + "loss": 1.1653, + "step": 220 + }, + { + "epoch": 0.20577281191806332, + "grad_norm": 1.7007719278335571, + "learning_rate": 4.99551332920039e-06, + "loss": 1.1909, + "step": 221 + }, + { + "epoch": 0.20670391061452514, + "grad_norm": 1.6069990396499634, + "learning_rate": 4.995438885558294e-06, + "loss": 1.1549, + "step": 222 + }, + { + "epoch": 0.20763500931098697, + "grad_norm": 1.6227385997772217, + "learning_rate": 4.9953638299600174e-06, + "loss": 1.2101, + "step": 223 + }, + { + "epoch": 0.2085661080074488, + "grad_norm": 1.5943350791931152, + "learning_rate": 4.995288162423965e-06, + "loss": 1.2195, + "step": 224 + }, + { + "epoch": 0.20949720670391062, + "grad_norm": 1.5146862268447876, + "learning_rate": 4.995211882968692e-06, + "loss": 1.154, + "step": 225 + }, + { + "epoch": 0.21042830540037244, + "grad_norm": 1.5101516246795654, + "learning_rate": 4.995134991612906e-06, + "loss": 1.2102, + "step": 226 + }, + { + "epoch": 0.21135940409683426, + "grad_norm": 1.5723508596420288, + "learning_rate": 4.995057488375462e-06, + "loss": 1.2054, + "step": 227 + }, + { + "epoch": 0.2122905027932961, + "grad_norm": 1.5589677095413208, + "learning_rate": 4.994979373275366e-06, + "loss": 1.1447, + "step": 228 + }, + { + "epoch": 0.2132216014897579, + "grad_norm": 2.5887372493743896, + "learning_rate": 4.9949006463317754e-06, + "loss": 1.1973, + "step": 229 + }, + { + "epoch": 0.21415270018621974, + "grad_norm": 1.6149126291275024, + "learning_rate": 4.994821307563995e-06, + "loss": 1.2131, + "step": 230 + }, + { + "epoch": 0.21508379888268156, + "grad_norm": 1.491944432258606, + "learning_rate": 4.994741356991481e-06, + "loss": 1.2111, + "step": 231 + }, + { + "epoch": 0.21601489757914338, + "grad_norm": 1.6424049139022827, + "learning_rate": 4.99466079463384e-06, + "loss": 1.1892, + "step": 232 + }, + { + "epoch": 0.2169459962756052, + "grad_norm": 1.5669760704040527, + "learning_rate": 4.99457962051083e-06, + "loss": 1.2215, + "step": 233 + }, + { + "epoch": 0.21787709497206703, + "grad_norm": 1.597413420677185, + "learning_rate": 4.994497834642355e-06, + "loss": 1.2191, + "step": 234 + }, + { + "epoch": 0.21880819366852886, + "grad_norm": 1.5258022546768188, + "learning_rate": 4.994415437048471e-06, + "loss": 1.1769, + "step": 235 + }, + { + "epoch": 0.21973929236499068, + "grad_norm": 1.5535242557525635, + "learning_rate": 4.994332427749387e-06, + "loss": 1.213, + "step": 236 + }, + { + "epoch": 0.2206703910614525, + "grad_norm": 1.4210128784179688, + "learning_rate": 4.994248806765457e-06, + "loss": 1.1963, + "step": 237 + }, + { + "epoch": 0.22160148975791433, + "grad_norm": 1.450096607208252, + "learning_rate": 4.994164574117189e-06, + "loss": 1.1394, + "step": 238 + }, + { + "epoch": 0.22253258845437615, + "grad_norm": 1.6149117946624756, + "learning_rate": 4.994079729825238e-06, + "loss": 1.1759, + "step": 239 + }, + { + "epoch": 0.22346368715083798, + "grad_norm": 1.5469238758087158, + "learning_rate": 4.993994273910411e-06, + "loss": 1.2085, + "step": 240 + }, + { + "epoch": 0.22439478584729983, + "grad_norm": 1.533961534500122, + "learning_rate": 4.9939082063936646e-06, + "loss": 1.1617, + "step": 241 + }, + { + "epoch": 0.22532588454376165, + "grad_norm": 1.6694387197494507, + "learning_rate": 4.993821527296104e-06, + "loss": 1.173, + "step": 242 + }, + { + "epoch": 0.22625698324022347, + "grad_norm": 1.634507656097412, + "learning_rate": 4.9937342366389875e-06, + "loss": 1.1698, + "step": 243 + }, + { + "epoch": 0.2271880819366853, + "grad_norm": 1.6566752195358276, + "learning_rate": 4.9936463344437196e-06, + "loss": 1.1804, + "step": 244 + }, + { + "epoch": 0.22811918063314712, + "grad_norm": 1.530820369720459, + "learning_rate": 4.993557820731857e-06, + "loss": 1.1491, + "step": 245 + }, + { + "epoch": 0.22905027932960895, + "grad_norm": 1.553288221359253, + "learning_rate": 4.993468695525106e-06, + "loss": 1.1816, + "step": 246 + }, + { + "epoch": 0.22998137802607077, + "grad_norm": 1.5366573333740234, + "learning_rate": 4.993378958845323e-06, + "loss": 1.1069, + "step": 247 + }, + { + "epoch": 0.2309124767225326, + "grad_norm": 1.692987322807312, + "learning_rate": 4.993288610714515e-06, + "loss": 1.2053, + "step": 248 + }, + { + "epoch": 0.23184357541899442, + "grad_norm": 1.6329057216644287, + "learning_rate": 4.993197651154835e-06, + "loss": 1.1967, + "step": 249 + }, + { + "epoch": 0.23277467411545624, + "grad_norm": 1.5369672775268555, + "learning_rate": 4.9931060801885924e-06, + "loss": 1.2103, + "step": 250 + }, + { + "epoch": 0.23370577281191807, + "grad_norm": 1.6424072980880737, + "learning_rate": 4.993013897838242e-06, + "loss": 1.2123, + "step": 251 + }, + { + "epoch": 0.2346368715083799, + "grad_norm": 1.7182517051696777, + "learning_rate": 4.992921104126388e-06, + "loss": 1.1979, + "step": 252 + }, + { + "epoch": 0.23556797020484171, + "grad_norm": 1.5643051862716675, + "learning_rate": 4.992827699075789e-06, + "loss": 1.153, + "step": 253 + }, + { + "epoch": 0.23649906890130354, + "grad_norm": 1.7497291564941406, + "learning_rate": 4.992733682709347e-06, + "loss": 1.1993, + "step": 254 + }, + { + "epoch": 0.23743016759776536, + "grad_norm": 1.7165666818618774, + "learning_rate": 4.9926390550501225e-06, + "loss": 1.1385, + "step": 255 + }, + { + "epoch": 0.2383612662942272, + "grad_norm": 1.5467196702957153, + "learning_rate": 4.992543816121317e-06, + "loss": 1.2077, + "step": 256 + }, + { + "epoch": 0.239292364990689, + "grad_norm": 1.5102182626724243, + "learning_rate": 4.9924479659462875e-06, + "loss": 1.1998, + "step": 257 + }, + { + "epoch": 0.24022346368715083, + "grad_norm": 1.470045804977417, + "learning_rate": 4.9923515045485395e-06, + "loss": 1.216, + "step": 258 + }, + { + "epoch": 0.24115456238361266, + "grad_norm": 1.7479827404022217, + "learning_rate": 4.992254431951729e-06, + "loss": 1.1651, + "step": 259 + }, + { + "epoch": 0.24208566108007448, + "grad_norm": 1.5148561000823975, + "learning_rate": 4.992156748179659e-06, + "loss": 1.1773, + "step": 260 + }, + { + "epoch": 0.2430167597765363, + "grad_norm": 1.5441746711730957, + "learning_rate": 4.992058453256284e-06, + "loss": 1.1554, + "step": 261 + }, + { + "epoch": 0.24394785847299813, + "grad_norm": 1.5650780200958252, + "learning_rate": 4.991959547205713e-06, + "loss": 1.1348, + "step": 262 + }, + { + "epoch": 0.24487895716945995, + "grad_norm": 1.6538890600204468, + "learning_rate": 4.991860030052196e-06, + "loss": 1.1757, + "step": 263 + }, + { + "epoch": 0.24581005586592178, + "grad_norm": 1.5534696578979492, + "learning_rate": 4.991759901820141e-06, + "loss": 1.1876, + "step": 264 + }, + { + "epoch": 0.2467411545623836, + "grad_norm": 1.6564414501190186, + "learning_rate": 4.991659162534101e-06, + "loss": 1.2146, + "step": 265 + }, + { + "epoch": 0.24767225325884543, + "grad_norm": 1.4657020568847656, + "learning_rate": 4.991557812218779e-06, + "loss": 1.1072, + "step": 266 + }, + { + "epoch": 0.24860335195530725, + "grad_norm": 1.6279875040054321, + "learning_rate": 4.991455850899032e-06, + "loss": 1.1923, + "step": 267 + }, + { + "epoch": 0.24953445065176907, + "grad_norm": 1.6531950235366821, + "learning_rate": 4.991353278599862e-06, + "loss": 1.1993, + "step": 268 + }, + { + "epoch": 0.2504655493482309, + "grad_norm": 1.5914767980575562, + "learning_rate": 4.991250095346423e-06, + "loss": 1.1991, + "step": 269 + }, + { + "epoch": 0.25139664804469275, + "grad_norm": 1.557533860206604, + "learning_rate": 4.991146301164019e-06, + "loss": 1.1358, + "step": 270 + }, + { + "epoch": 0.25232774674115455, + "grad_norm": 1.6352646350860596, + "learning_rate": 4.991041896078104e-06, + "loss": 1.1973, + "step": 271 + }, + { + "epoch": 0.2532588454376164, + "grad_norm": 1.5098541975021362, + "learning_rate": 4.990936880114279e-06, + "loss": 1.1683, + "step": 272 + }, + { + "epoch": 0.2541899441340782, + "grad_norm": 1.6232593059539795, + "learning_rate": 4.990831253298299e-06, + "loss": 1.1356, + "step": 273 + }, + { + "epoch": 0.25512104283054005, + "grad_norm": 1.5661216974258423, + "learning_rate": 4.990725015656068e-06, + "loss": 1.2006, + "step": 274 + }, + { + "epoch": 0.25605214152700184, + "grad_norm": 1.507642149925232, + "learning_rate": 4.990618167213636e-06, + "loss": 1.1316, + "step": 275 + }, + { + "epoch": 0.2569832402234637, + "grad_norm": 1.5950425863265991, + "learning_rate": 4.990510707997207e-06, + "loss": 1.1983, + "step": 276 + }, + { + "epoch": 0.2579143389199255, + "grad_norm": 1.6054936647415161, + "learning_rate": 4.990402638033132e-06, + "loss": 1.217, + "step": 277 + }, + { + "epoch": 0.25884543761638734, + "grad_norm": 1.5690265893936157, + "learning_rate": 4.990293957347914e-06, + "loss": 1.1826, + "step": 278 + }, + { + "epoch": 0.25977653631284914, + "grad_norm": 1.579464316368103, + "learning_rate": 4.990184665968204e-06, + "loss": 1.2222, + "step": 279 + }, + { + "epoch": 0.260707635009311, + "grad_norm": 1.4905120134353638, + "learning_rate": 4.990074763920804e-06, + "loss": 1.162, + "step": 280 + }, + { + "epoch": 0.2616387337057728, + "grad_norm": 1.6381080150604248, + "learning_rate": 4.989964251232667e-06, + "loss": 1.1937, + "step": 281 + }, + { + "epoch": 0.26256983240223464, + "grad_norm": 1.4682097434997559, + "learning_rate": 4.98985312793089e-06, + "loss": 1.1548, + "step": 282 + }, + { + "epoch": 0.2635009310986965, + "grad_norm": 1.54086434841156, + "learning_rate": 4.989741394042728e-06, + "loss": 1.1879, + "step": 283 + }, + { + "epoch": 0.2644320297951583, + "grad_norm": 1.6787452697753906, + "learning_rate": 4.989629049595579e-06, + "loss": 1.1718, + "step": 284 + }, + { + "epoch": 0.26536312849162014, + "grad_norm": 1.6381014585494995, + "learning_rate": 4.989516094616993e-06, + "loss": 1.1987, + "step": 285 + }, + { + "epoch": 0.26629422718808193, + "grad_norm": 1.5789836645126343, + "learning_rate": 4.98940252913467e-06, + "loss": 1.1694, + "step": 286 + }, + { + "epoch": 0.2672253258845438, + "grad_norm": 1.561802864074707, + "learning_rate": 4.989288353176463e-06, + "loss": 1.1576, + "step": 287 + }, + { + "epoch": 0.2681564245810056, + "grad_norm": 1.568241834640503, + "learning_rate": 4.989173566770366e-06, + "loss": 1.1579, + "step": 288 + }, + { + "epoch": 0.26908752327746743, + "grad_norm": 1.5002732276916504, + "learning_rate": 4.989058169944532e-06, + "loss": 1.18, + "step": 289 + }, + { + "epoch": 0.27001862197392923, + "grad_norm": 1.5480579137802124, + "learning_rate": 4.9889421627272575e-06, + "loss": 1.139, + "step": 290 + }, + { + "epoch": 0.2709497206703911, + "grad_norm": 1.5767931938171387, + "learning_rate": 4.988825545146993e-06, + "loss": 1.186, + "step": 291 + }, + { + "epoch": 0.2718808193668529, + "grad_norm": 1.6289966106414795, + "learning_rate": 4.988708317232335e-06, + "loss": 1.1851, + "step": 292 + }, + { + "epoch": 0.27281191806331473, + "grad_norm": 1.5453596115112305, + "learning_rate": 4.988590479012032e-06, + "loss": 1.1442, + "step": 293 + }, + { + "epoch": 0.2737430167597765, + "grad_norm": 1.6295948028564453, + "learning_rate": 4.988472030514982e-06, + "loss": 1.2072, + "step": 294 + }, + { + "epoch": 0.2746741154562384, + "grad_norm": 1.5502432584762573, + "learning_rate": 4.988352971770229e-06, + "loss": 1.201, + "step": 295 + }, + { + "epoch": 0.2756052141527002, + "grad_norm": 1.5260909795761108, + "learning_rate": 4.988233302806974e-06, + "loss": 1.2135, + "step": 296 + }, + { + "epoch": 0.276536312849162, + "grad_norm": 1.5481847524642944, + "learning_rate": 4.988113023654562e-06, + "loss": 1.1454, + "step": 297 + }, + { + "epoch": 0.2774674115456238, + "grad_norm": 1.5329092741012573, + "learning_rate": 4.987992134342488e-06, + "loss": 1.1294, + "step": 298 + }, + { + "epoch": 0.2783985102420857, + "grad_norm": 1.5602904558181763, + "learning_rate": 4.987870634900398e-06, + "loss": 1.1469, + "step": 299 + }, + { + "epoch": 0.27932960893854747, + "grad_norm": 1.5570727586746216, + "learning_rate": 4.987748525358087e-06, + "loss": 1.2041, + "step": 300 + }, + { + "epoch": 0.2802607076350093, + "grad_norm": 1.5078455209732056, + "learning_rate": 4.9876258057455015e-06, + "loss": 1.1436, + "step": 301 + }, + { + "epoch": 0.2811918063314711, + "grad_norm": 1.5646103620529175, + "learning_rate": 4.987502476092734e-06, + "loss": 1.1338, + "step": 302 + }, + { + "epoch": 0.28212290502793297, + "grad_norm": 1.6741316318511963, + "learning_rate": 4.987378536430031e-06, + "loss": 1.1459, + "step": 303 + }, + { + "epoch": 0.28305400372439476, + "grad_norm": 1.6432318687438965, + "learning_rate": 4.987253986787783e-06, + "loss": 1.2054, + "step": 304 + }, + { + "epoch": 0.2839851024208566, + "grad_norm": 1.5824742317199707, + "learning_rate": 4.987128827196537e-06, + "loss": 1.1358, + "step": 305 + }, + { + "epoch": 0.2849162011173184, + "grad_norm": 1.5743874311447144, + "learning_rate": 4.987003057686983e-06, + "loss": 1.1483, + "step": 306 + }, + { + "epoch": 0.28584729981378026, + "grad_norm": 1.6464556455612183, + "learning_rate": 4.986876678289964e-06, + "loss": 1.1897, + "step": 307 + }, + { + "epoch": 0.28677839851024206, + "grad_norm": 1.6144617795944214, + "learning_rate": 4.9867496890364734e-06, + "loss": 1.2007, + "step": 308 + }, + { + "epoch": 0.2877094972067039, + "grad_norm": 1.5669097900390625, + "learning_rate": 4.986622089957651e-06, + "loss": 1.212, + "step": 309 + }, + { + "epoch": 0.2886405959031657, + "grad_norm": 1.61880362033844, + "learning_rate": 4.986493881084789e-06, + "loss": 1.1408, + "step": 310 + }, + { + "epoch": 0.28957169459962756, + "grad_norm": 1.652443766593933, + "learning_rate": 4.986365062449328e-06, + "loss": 1.1441, + "step": 311 + }, + { + "epoch": 0.2905027932960894, + "grad_norm": 1.6292386054992676, + "learning_rate": 4.986235634082857e-06, + "loss": 1.1859, + "step": 312 + }, + { + "epoch": 0.2914338919925512, + "grad_norm": 1.5104798078536987, + "learning_rate": 4.986105596017118e-06, + "loss": 1.1938, + "step": 313 + }, + { + "epoch": 0.29236499068901306, + "grad_norm": 1.5984587669372559, + "learning_rate": 4.985974948283997e-06, + "loss": 1.1797, + "step": 314 + }, + { + "epoch": 0.29329608938547486, + "grad_norm": 1.568224310874939, + "learning_rate": 4.985843690915536e-06, + "loss": 1.1488, + "step": 315 + }, + { + "epoch": 0.2942271880819367, + "grad_norm": 1.6096382141113281, + "learning_rate": 4.985711823943921e-06, + "loss": 1.1292, + "step": 316 + }, + { + "epoch": 0.2951582867783985, + "grad_norm": 1.5105839967727661, + "learning_rate": 4.985579347401491e-06, + "loss": 1.1253, + "step": 317 + }, + { + "epoch": 0.29608938547486036, + "grad_norm": 1.5043179988861084, + "learning_rate": 4.985446261320732e-06, + "loss": 1.1599, + "step": 318 + }, + { + "epoch": 0.29702048417132215, + "grad_norm": 1.5188047885894775, + "learning_rate": 4.985312565734283e-06, + "loss": 1.1521, + "step": 319 + }, + { + "epoch": 0.297951582867784, + "grad_norm": 1.5714865922927856, + "learning_rate": 4.985178260674927e-06, + "loss": 1.1467, + "step": 320 + }, + { + "epoch": 0.2988826815642458, + "grad_norm": 1.6004184484481812, + "learning_rate": 4.985043346175602e-06, + "loss": 1.1482, + "step": 321 + }, + { + "epoch": 0.29981378026070765, + "grad_norm": 1.5001561641693115, + "learning_rate": 4.984907822269391e-06, + "loss": 1.1448, + "step": 322 + }, + { + "epoch": 0.30074487895716945, + "grad_norm": 1.4722920656204224, + "learning_rate": 4.984771688989532e-06, + "loss": 1.1172, + "step": 323 + }, + { + "epoch": 0.3016759776536313, + "grad_norm": 1.5229582786560059, + "learning_rate": 4.984634946369405e-06, + "loss": 1.1548, + "step": 324 + }, + { + "epoch": 0.3026070763500931, + "grad_norm": 1.5634204149246216, + "learning_rate": 4.984497594442545e-06, + "loss": 1.1541, + "step": 325 + }, + { + "epoch": 0.30353817504655495, + "grad_norm": 1.673488974571228, + "learning_rate": 4.984359633242636e-06, + "loss": 1.1821, + "step": 326 + }, + { + "epoch": 0.30446927374301674, + "grad_norm": 1.5141621828079224, + "learning_rate": 4.984221062803508e-06, + "loss": 1.1469, + "step": 327 + }, + { + "epoch": 0.3054003724394786, + "grad_norm": 1.5276451110839844, + "learning_rate": 4.984081883159144e-06, + "loss": 1.1489, + "step": 328 + }, + { + "epoch": 0.3063314711359404, + "grad_norm": 1.567543387413025, + "learning_rate": 4.983942094343675e-06, + "loss": 1.1288, + "step": 329 + }, + { + "epoch": 0.30726256983240224, + "grad_norm": 1.5250859260559082, + "learning_rate": 4.983801696391381e-06, + "loss": 1.1456, + "step": 330 + }, + { + "epoch": 0.30819366852886404, + "grad_norm": 1.6484228372573853, + "learning_rate": 4.983660689336692e-06, + "loss": 1.1432, + "step": 331 + }, + { + "epoch": 0.3091247672253259, + "grad_norm": 1.469672679901123, + "learning_rate": 4.983519073214186e-06, + "loss": 1.1402, + "step": 332 + }, + { + "epoch": 0.3100558659217877, + "grad_norm": 1.6095240116119385, + "learning_rate": 4.983376848058593e-06, + "loss": 1.1993, + "step": 333 + }, + { + "epoch": 0.31098696461824954, + "grad_norm": 1.5325345993041992, + "learning_rate": 4.983234013904791e-06, + "loss": 1.1637, + "step": 334 + }, + { + "epoch": 0.31191806331471134, + "grad_norm": 1.6400229930877686, + "learning_rate": 4.983090570787806e-06, + "loss": 1.1667, + "step": 335 + }, + { + "epoch": 0.3128491620111732, + "grad_norm": 1.539421558380127, + "learning_rate": 4.982946518742815e-06, + "loss": 1.1727, + "step": 336 + }, + { + "epoch": 0.313780260707635, + "grad_norm": 1.7082252502441406, + "learning_rate": 4.982801857805144e-06, + "loss": 1.1511, + "step": 337 + }, + { + "epoch": 0.31471135940409684, + "grad_norm": 1.596779227256775, + "learning_rate": 4.982656588010269e-06, + "loss": 1.1945, + "step": 338 + }, + { + "epoch": 0.31564245810055863, + "grad_norm": 1.6482007503509521, + "learning_rate": 4.982510709393813e-06, + "loss": 1.1822, + "step": 339 + }, + { + "epoch": 0.3165735567970205, + "grad_norm": 1.7460479736328125, + "learning_rate": 4.98236422199155e-06, + "loss": 1.1871, + "step": 340 + }, + { + "epoch": 0.31750465549348234, + "grad_norm": 1.5625799894332886, + "learning_rate": 4.982217125839403e-06, + "loss": 1.1541, + "step": 341 + }, + { + "epoch": 0.31843575418994413, + "grad_norm": 1.5512086153030396, + "learning_rate": 4.982069420973446e-06, + "loss": 1.1585, + "step": 342 + }, + { + "epoch": 0.319366852886406, + "grad_norm": 1.670642614364624, + "learning_rate": 4.9819211074299e-06, + "loss": 1.1908, + "step": 343 + }, + { + "epoch": 0.3202979515828678, + "grad_norm": 1.5584107637405396, + "learning_rate": 4.981772185245135e-06, + "loss": 1.075, + "step": 344 + }, + { + "epoch": 0.32122905027932963, + "grad_norm": 1.5482008457183838, + "learning_rate": 4.9816226544556725e-06, + "loss": 1.1959, + "step": 345 + }, + { + "epoch": 0.3221601489757914, + "grad_norm": 1.5843377113342285, + "learning_rate": 4.98147251509818e-06, + "loss": 1.188, + "step": 346 + }, + { + "epoch": 0.3230912476722533, + "grad_norm": 1.5460178852081299, + "learning_rate": 4.981321767209477e-06, + "loss": 1.1275, + "step": 347 + }, + { + "epoch": 0.3240223463687151, + "grad_norm": 1.645349383354187, + "learning_rate": 4.981170410826532e-06, + "loss": 1.126, + "step": 348 + }, + { + "epoch": 0.3249534450651769, + "grad_norm": 1.5918056964874268, + "learning_rate": 4.9810184459864615e-06, + "loss": 1.1475, + "step": 349 + }, + { + "epoch": 0.3258845437616387, + "grad_norm": 1.502294659614563, + "learning_rate": 4.980865872726532e-06, + "loss": 1.1257, + "step": 350 + }, + { + "epoch": 0.3268156424581006, + "grad_norm": 1.5638993978500366, + "learning_rate": 4.9807126910841595e-06, + "loss": 1.1593, + "step": 351 + }, + { + "epoch": 0.32774674115456237, + "grad_norm": 1.7481025457382202, + "learning_rate": 4.980558901096908e-06, + "loss": 1.1387, + "step": 352 + }, + { + "epoch": 0.3286778398510242, + "grad_norm": 1.622081995010376, + "learning_rate": 4.980404502802492e-06, + "loss": 1.1767, + "step": 353 + }, + { + "epoch": 0.329608938547486, + "grad_norm": 1.5219521522521973, + "learning_rate": 4.980249496238774e-06, + "loss": 1.1513, + "step": 354 + }, + { + "epoch": 0.33054003724394787, + "grad_norm": 1.5893964767456055, + "learning_rate": 4.980093881443766e-06, + "loss": 1.1612, + "step": 355 + }, + { + "epoch": 0.33147113594040967, + "grad_norm": 1.650078535079956, + "learning_rate": 4.9799376584556304e-06, + "loss": 1.0907, + "step": 356 + }, + { + "epoch": 0.3324022346368715, + "grad_norm": 1.578399419784546, + "learning_rate": 4.9797808273126765e-06, + "loss": 1.1579, + "step": 357 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.5139063596725464, + "learning_rate": 4.979623388053366e-06, + "loss": 1.1327, + "step": 358 + }, + { + "epoch": 0.33426443202979517, + "grad_norm": 1.5960277318954468, + "learning_rate": 4.979465340716306e-06, + "loss": 1.1434, + "step": 359 + }, + { + "epoch": 0.33519553072625696, + "grad_norm": 1.5807005167007446, + "learning_rate": 4.9793066853402535e-06, + "loss": 1.1833, + "step": 360 + }, + { + "epoch": 0.3361266294227188, + "grad_norm": 1.5984652042388916, + "learning_rate": 4.979147421964119e-06, + "loss": 1.1528, + "step": 361 + }, + { + "epoch": 0.3370577281191806, + "grad_norm": 1.5312142372131348, + "learning_rate": 4.978987550626955e-06, + "loss": 1.1644, + "step": 362 + }, + { + "epoch": 0.33798882681564246, + "grad_norm": 1.5473692417144775, + "learning_rate": 4.978827071367969e-06, + "loss": 1.1445, + "step": 363 + }, + { + "epoch": 0.33891992551210426, + "grad_norm": 1.6537644863128662, + "learning_rate": 4.978665984226514e-06, + "loss": 1.1617, + "step": 364 + }, + { + "epoch": 0.3398510242085661, + "grad_norm": 1.6679675579071045, + "learning_rate": 4.978504289242094e-06, + "loss": 1.1474, + "step": 365 + }, + { + "epoch": 0.3407821229050279, + "grad_norm": 1.507922649383545, + "learning_rate": 4.978341986454363e-06, + "loss": 1.1524, + "step": 366 + }, + { + "epoch": 0.34171322160148976, + "grad_norm": 1.6119061708450317, + "learning_rate": 4.978179075903119e-06, + "loss": 1.1789, + "step": 367 + }, + { + "epoch": 0.3426443202979516, + "grad_norm": 1.5990979671478271, + "learning_rate": 4.978015557628315e-06, + "loss": 1.1562, + "step": 368 + }, + { + "epoch": 0.3435754189944134, + "grad_norm": 1.6053398847579956, + "learning_rate": 4.9778514316700496e-06, + "loss": 1.1645, + "step": 369 + }, + { + "epoch": 0.34450651769087526, + "grad_norm": 1.5216419696807861, + "learning_rate": 4.977686698068572e-06, + "loss": 1.1137, + "step": 370 + }, + { + "epoch": 0.34543761638733705, + "grad_norm": 1.5150953531265259, + "learning_rate": 4.9775213568642796e-06, + "loss": 1.1092, + "step": 371 + }, + { + "epoch": 0.3463687150837989, + "grad_norm": 1.616699457168579, + "learning_rate": 4.97735540809772e-06, + "loss": 1.1725, + "step": 372 + }, + { + "epoch": 0.3472998137802607, + "grad_norm": 1.5947240591049194, + "learning_rate": 4.9771888518095855e-06, + "loss": 1.1512, + "step": 373 + }, + { + "epoch": 0.34823091247672255, + "grad_norm": 1.6488538980484009, + "learning_rate": 4.977021688040724e-06, + "loss": 1.1565, + "step": 374 + }, + { + "epoch": 0.34916201117318435, + "grad_norm": 1.4983750581741333, + "learning_rate": 4.976853916832129e-06, + "loss": 1.112, + "step": 375 + }, + { + "epoch": 0.3500931098696462, + "grad_norm": 1.5410925149917603, + "learning_rate": 4.9766855382249414e-06, + "loss": 1.1333, + "step": 376 + }, + { + "epoch": 0.351024208566108, + "grad_norm": 1.7927250862121582, + "learning_rate": 4.976516552260453e-06, + "loss": 1.1381, + "step": 377 + }, + { + "epoch": 0.35195530726256985, + "grad_norm": 1.568892478942871, + "learning_rate": 4.976346958980105e-06, + "loss": 1.1423, + "step": 378 + }, + { + "epoch": 0.35288640595903165, + "grad_norm": 1.5994700193405151, + "learning_rate": 4.9761767584254855e-06, + "loss": 1.1756, + "step": 379 + }, + { + "epoch": 0.3538175046554935, + "grad_norm": 1.5248911380767822, + "learning_rate": 4.976005950638334e-06, + "loss": 1.1473, + "step": 380 + }, + { + "epoch": 0.3547486033519553, + "grad_norm": 1.5898964405059814, + "learning_rate": 4.975834535660538e-06, + "loss": 1.1558, + "step": 381 + }, + { + "epoch": 0.35567970204841715, + "grad_norm": 1.5030688047409058, + "learning_rate": 4.975662513534131e-06, + "loss": 1.0974, + "step": 382 + }, + { + "epoch": 0.35661080074487894, + "grad_norm": 1.5825740098953247, + "learning_rate": 4.975489884301301e-06, + "loss": 1.1819, + "step": 383 + }, + { + "epoch": 0.3575418994413408, + "grad_norm": 1.5107347965240479, + "learning_rate": 4.97531664800438e-06, + "loss": 1.1406, + "step": 384 + }, + { + "epoch": 0.3584729981378026, + "grad_norm": 1.5390645265579224, + "learning_rate": 4.975142804685851e-06, + "loss": 1.1333, + "step": 385 + }, + { + "epoch": 0.35940409683426444, + "grad_norm": 1.5294268131256104, + "learning_rate": 4.974968354388346e-06, + "loss": 1.177, + "step": 386 + }, + { + "epoch": 0.36033519553072624, + "grad_norm": 1.518444538116455, + "learning_rate": 4.974793297154645e-06, + "loss": 1.0785, + "step": 387 + }, + { + "epoch": 0.3612662942271881, + "grad_norm": 1.7795239686965942, + "learning_rate": 4.974617633027678e-06, + "loss": 1.1469, + "step": 388 + }, + { + "epoch": 0.3621973929236499, + "grad_norm": 1.5898357629776, + "learning_rate": 4.974441362050523e-06, + "loss": 1.105, + "step": 389 + }, + { + "epoch": 0.36312849162011174, + "grad_norm": 1.6565510034561157, + "learning_rate": 4.9742644842664066e-06, + "loss": 1.1728, + "step": 390 + }, + { + "epoch": 0.36405959031657353, + "grad_norm": 1.6090935468673706, + "learning_rate": 4.9740869997187035e-06, + "loss": 1.1556, + "step": 391 + }, + { + "epoch": 0.3649906890130354, + "grad_norm": 1.5549203157424927, + "learning_rate": 4.97390890845094e-06, + "loss": 1.1392, + "step": 392 + }, + { + "epoch": 0.3659217877094972, + "grad_norm": 1.576033353805542, + "learning_rate": 4.973730210506788e-06, + "loss": 1.1421, + "step": 393 + }, + { + "epoch": 0.36685288640595903, + "grad_norm": 1.5537548065185547, + "learning_rate": 4.97355090593007e-06, + "loss": 1.1855, + "step": 394 + }, + { + "epoch": 0.36778398510242083, + "grad_norm": 1.558398723602295, + "learning_rate": 4.973370994764758e-06, + "loss": 1.1753, + "step": 395 + }, + { + "epoch": 0.3687150837988827, + "grad_norm": 1.5602357387542725, + "learning_rate": 4.97319047705497e-06, + "loss": 1.1199, + "step": 396 + }, + { + "epoch": 0.36964618249534453, + "grad_norm": 1.4839043617248535, + "learning_rate": 4.973009352844974e-06, + "loss": 1.128, + "step": 397 + }, + { + "epoch": 0.37057728119180633, + "grad_norm": 1.548038125038147, + "learning_rate": 4.9728276221791895e-06, + "loss": 1.1709, + "step": 398 + }, + { + "epoch": 0.3715083798882682, + "grad_norm": 1.5458474159240723, + "learning_rate": 4.9726452851021804e-06, + "loss": 1.1752, + "step": 399 + }, + { + "epoch": 0.37243947858473, + "grad_norm": 1.4915302991867065, + "learning_rate": 4.972462341658661e-06, + "loss": 1.1563, + "step": 400 + }, + { + "epoch": 0.37337057728119183, + "grad_norm": 1.5692366361618042, + "learning_rate": 4.972278791893496e-06, + "loss": 1.1479, + "step": 401 + }, + { + "epoch": 0.3743016759776536, + "grad_norm": 1.5486400127410889, + "learning_rate": 4.972094635851695e-06, + "loss": 1.1555, + "step": 402 + }, + { + "epoch": 0.3752327746741155, + "grad_norm": 1.509786605834961, + "learning_rate": 4.971909873578421e-06, + "loss": 1.1315, + "step": 403 + }, + { + "epoch": 0.3761638733705773, + "grad_norm": 1.497616171836853, + "learning_rate": 4.971724505118982e-06, + "loss": 1.1305, + "step": 404 + }, + { + "epoch": 0.3770949720670391, + "grad_norm": 1.6088786125183105, + "learning_rate": 4.971538530518836e-06, + "loss": 1.1403, + "step": 405 + }, + { + "epoch": 0.3780260707635009, + "grad_norm": 1.5144068002700806, + "learning_rate": 4.971351949823589e-06, + "loss": 1.1195, + "step": 406 + }, + { + "epoch": 0.3789571694599628, + "grad_norm": 1.5215985774993896, + "learning_rate": 4.971164763078998e-06, + "loss": 1.1357, + "step": 407 + }, + { + "epoch": 0.37988826815642457, + "grad_norm": 1.4947015047073364, + "learning_rate": 4.970976970330964e-06, + "loss": 1.1286, + "step": 408 + }, + { + "epoch": 0.3808193668528864, + "grad_norm": 1.6067999601364136, + "learning_rate": 4.970788571625542e-06, + "loss": 1.1786, + "step": 409 + }, + { + "epoch": 0.3817504655493482, + "grad_norm": 1.657520055770874, + "learning_rate": 4.970599567008931e-06, + "loss": 1.143, + "step": 410 + }, + { + "epoch": 0.38268156424581007, + "grad_norm": 1.4607744216918945, + "learning_rate": 4.970409956527483e-06, + "loss": 1.1543, + "step": 411 + }, + { + "epoch": 0.38361266294227186, + "grad_norm": 1.51872718334198, + "learning_rate": 4.970219740227693e-06, + "loss": 1.1456, + "step": 412 + }, + { + "epoch": 0.3845437616387337, + "grad_norm": 1.6181915998458862, + "learning_rate": 4.97002891815621e-06, + "loss": 1.1318, + "step": 413 + }, + { + "epoch": 0.3854748603351955, + "grad_norm": 1.5810741186141968, + "learning_rate": 4.969837490359829e-06, + "loss": 1.1315, + "step": 414 + }, + { + "epoch": 0.38640595903165736, + "grad_norm": 1.5425630807876587, + "learning_rate": 4.969645456885493e-06, + "loss": 1.1152, + "step": 415 + }, + { + "epoch": 0.38733705772811916, + "grad_norm": 1.6084840297698975, + "learning_rate": 4.969452817780295e-06, + "loss": 1.1991, + "step": 416 + }, + { + "epoch": 0.388268156424581, + "grad_norm": 1.6024158000946045, + "learning_rate": 4.969259573091476e-06, + "loss": 1.1318, + "step": 417 + }, + { + "epoch": 0.3891992551210428, + "grad_norm": 1.6052086353302002, + "learning_rate": 4.9690657228664244e-06, + "loss": 1.1744, + "step": 418 + }, + { + "epoch": 0.39013035381750466, + "grad_norm": 1.4998224973678589, + "learning_rate": 4.9688712671526786e-06, + "loss": 1.0993, + "step": 419 + }, + { + "epoch": 0.39106145251396646, + "grad_norm": 1.5480080842971802, + "learning_rate": 4.9686762059979255e-06, + "loss": 1.1203, + "step": 420 + }, + { + "epoch": 0.3919925512104283, + "grad_norm": 1.593430519104004, + "learning_rate": 4.968480539449999e-06, + "loss": 1.1335, + "step": 421 + }, + { + "epoch": 0.3929236499068901, + "grad_norm": 1.5293151140213013, + "learning_rate": 4.968284267556883e-06, + "loss": 1.1365, + "step": 422 + }, + { + "epoch": 0.39385474860335196, + "grad_norm": 1.5110721588134766, + "learning_rate": 4.9680873903667095e-06, + "loss": 1.1576, + "step": 423 + }, + { + "epoch": 0.3947858472998138, + "grad_norm": 1.6544134616851807, + "learning_rate": 4.967889907927758e-06, + "loss": 1.1416, + "step": 424 + }, + { + "epoch": 0.3957169459962756, + "grad_norm": 1.583020567893982, + "learning_rate": 4.967691820288457e-06, + "loss": 1.1544, + "step": 425 + }, + { + "epoch": 0.39664804469273746, + "grad_norm": 1.6162288188934326, + "learning_rate": 4.967493127497385e-06, + "loss": 1.1582, + "step": 426 + }, + { + "epoch": 0.39757914338919925, + "grad_norm": 1.5714863538742065, + "learning_rate": 4.9672938296032656e-06, + "loss": 1.1535, + "step": 427 + }, + { + "epoch": 0.3985102420856611, + "grad_norm": 1.587739109992981, + "learning_rate": 4.967093926654973e-06, + "loss": 1.1497, + "step": 428 + }, + { + "epoch": 0.3994413407821229, + "grad_norm": 1.5518815517425537, + "learning_rate": 4.966893418701529e-06, + "loss": 1.1046, + "step": 429 + }, + { + "epoch": 0.40037243947858475, + "grad_norm": 1.57876718044281, + "learning_rate": 4.966692305792106e-06, + "loss": 1.1518, + "step": 430 + }, + { + "epoch": 0.40130353817504655, + "grad_norm": 1.5824034214019775, + "learning_rate": 4.966490587976021e-06, + "loss": 1.1365, + "step": 431 + }, + { + "epoch": 0.4022346368715084, + "grad_norm": 1.6374784708023071, + "learning_rate": 4.9662882653027434e-06, + "loss": 1.1732, + "step": 432 + }, + { + "epoch": 0.4031657355679702, + "grad_norm": 1.6593799591064453, + "learning_rate": 4.966085337821886e-06, + "loss": 1.1394, + "step": 433 + }, + { + "epoch": 0.40409683426443205, + "grad_norm": 1.595018982887268, + "learning_rate": 4.965881805583214e-06, + "loss": 1.0903, + "step": 434 + }, + { + "epoch": 0.40502793296089384, + "grad_norm": 1.5136500597000122, + "learning_rate": 4.965677668636639e-06, + "loss": 1.0883, + "step": 435 + }, + { + "epoch": 0.4059590316573557, + "grad_norm": 1.5129704475402832, + "learning_rate": 4.9654729270322236e-06, + "loss": 1.0843, + "step": 436 + }, + { + "epoch": 0.4068901303538175, + "grad_norm": 1.9252679347991943, + "learning_rate": 4.965267580820173e-06, + "loss": 1.1544, + "step": 437 + }, + { + "epoch": 0.40782122905027934, + "grad_norm": 1.6633261442184448, + "learning_rate": 4.965061630050848e-06, + "loss": 1.1798, + "step": 438 + }, + { + "epoch": 0.40875232774674114, + "grad_norm": 1.5632121562957764, + "learning_rate": 4.964855074774751e-06, + "loss": 1.1276, + "step": 439 + }, + { + "epoch": 0.409683426443203, + "grad_norm": 1.54603910446167, + "learning_rate": 4.964647915042537e-06, + "loss": 1.1236, + "step": 440 + }, + { + "epoch": 0.4106145251396648, + "grad_norm": 1.5920768976211548, + "learning_rate": 4.964440150905008e-06, + "loss": 1.1273, + "step": 441 + }, + { + "epoch": 0.41154562383612664, + "grad_norm": 1.5279273986816406, + "learning_rate": 4.9642317824131125e-06, + "loss": 1.1274, + "step": 442 + }, + { + "epoch": 0.41247672253258844, + "grad_norm": 1.5296270847320557, + "learning_rate": 4.96402280961795e-06, + "loss": 1.1374, + "step": 443 + }, + { + "epoch": 0.4134078212290503, + "grad_norm": 1.5581321716308594, + "learning_rate": 4.963813232570767e-06, + "loss": 1.1089, + "step": 444 + }, + { + "epoch": 0.4143389199255121, + "grad_norm": 1.5483899116516113, + "learning_rate": 4.963603051322956e-06, + "loss": 1.1198, + "step": 445 + }, + { + "epoch": 0.41527001862197394, + "grad_norm": 1.6096707582473755, + "learning_rate": 4.963392265926062e-06, + "loss": 1.1416, + "step": 446 + }, + { + "epoch": 0.41620111731843573, + "grad_norm": 1.5583285093307495, + "learning_rate": 4.963180876431775e-06, + "loss": 1.1425, + "step": 447 + }, + { + "epoch": 0.4171322160148976, + "grad_norm": 1.6165952682495117, + "learning_rate": 4.962968882891934e-06, + "loss": 1.1218, + "step": 448 + }, + { + "epoch": 0.4180633147113594, + "grad_norm": 1.6122556924819946, + "learning_rate": 4.962756285358527e-06, + "loss": 1.2283, + "step": 449 + }, + { + "epoch": 0.41899441340782123, + "grad_norm": 1.5779941082000732, + "learning_rate": 4.9625430838836875e-06, + "loss": 1.1163, + "step": 450 + }, + { + "epoch": 0.419925512104283, + "grad_norm": 1.5453237295150757, + "learning_rate": 4.9623292785197e-06, + "loss": 1.1109, + "step": 451 + }, + { + "epoch": 0.4208566108007449, + "grad_norm": 1.4900524616241455, + "learning_rate": 4.962114869318996e-06, + "loss": 1.1068, + "step": 452 + }, + { + "epoch": 0.42178770949720673, + "grad_norm": 1.582170009613037, + "learning_rate": 4.961899856334155e-06, + "loss": 1.1642, + "step": 453 + }, + { + "epoch": 0.4227188081936685, + "grad_norm": 1.568885326385498, + "learning_rate": 4.961684239617904e-06, + "loss": 1.1053, + "step": 454 + }, + { + "epoch": 0.4236499068901304, + "grad_norm": 1.4655330181121826, + "learning_rate": 4.96146801922312e-06, + "loss": 1.107, + "step": 455 + }, + { + "epoch": 0.4245810055865922, + "grad_norm": 1.5923532247543335, + "learning_rate": 4.961251195202825e-06, + "loss": 1.1428, + "step": 456 + }, + { + "epoch": 0.425512104283054, + "grad_norm": 1.5150216817855835, + "learning_rate": 4.961033767610193e-06, + "loss": 1.1066, + "step": 457 + }, + { + "epoch": 0.4264432029795158, + "grad_norm": 1.5623857975006104, + "learning_rate": 4.960815736498541e-06, + "loss": 1.1533, + "step": 458 + }, + { + "epoch": 0.4273743016759777, + "grad_norm": 1.6647369861602783, + "learning_rate": 4.960597101921338e-06, + "loss": 1.1528, + "step": 459 + }, + { + "epoch": 0.42830540037243947, + "grad_norm": 1.5871447324752808, + "learning_rate": 4.960377863932201e-06, + "loss": 1.1001, + "step": 460 + }, + { + "epoch": 0.4292364990689013, + "grad_norm": 1.4906527996063232, + "learning_rate": 4.9601580225848914e-06, + "loss": 1.0991, + "step": 461 + }, + { + "epoch": 0.4301675977653631, + "grad_norm": 1.5552783012390137, + "learning_rate": 4.959937577933323e-06, + "loss": 1.1788, + "step": 462 + }, + { + "epoch": 0.43109869646182497, + "grad_norm": 1.5226411819458008, + "learning_rate": 4.959716530031553e-06, + "loss": 1.1406, + "step": 463 + }, + { + "epoch": 0.43202979515828677, + "grad_norm": 1.5362269878387451, + "learning_rate": 4.959494878933792e-06, + "loss": 1.0898, + "step": 464 + }, + { + "epoch": 0.4329608938547486, + "grad_norm": 1.5201159715652466, + "learning_rate": 4.9592726246943924e-06, + "loss": 1.0861, + "step": 465 + }, + { + "epoch": 0.4338919925512104, + "grad_norm": 1.6300363540649414, + "learning_rate": 4.95904976736786e-06, + "loss": 1.1056, + "step": 466 + }, + { + "epoch": 0.43482309124767227, + "grad_norm": 1.5079962015151978, + "learning_rate": 4.9588263070088435e-06, + "loss": 1.0943, + "step": 467 + }, + { + "epoch": 0.43575418994413406, + "grad_norm": 1.5123474597930908, + "learning_rate": 4.958602243672145e-06, + "loss": 1.0997, + "step": 468 + }, + { + "epoch": 0.4366852886405959, + "grad_norm": 1.614089846611023, + "learning_rate": 4.95837757741271e-06, + "loss": 1.146, + "step": 469 + }, + { + "epoch": 0.4376163873370577, + "grad_norm": 1.557093620300293, + "learning_rate": 4.9581523082856335e-06, + "loss": 1.1745, + "step": 470 + }, + { + "epoch": 0.43854748603351956, + "grad_norm": 1.6429771184921265, + "learning_rate": 4.957926436346158e-06, + "loss": 1.084, + "step": 471 + }, + { + "epoch": 0.43947858472998136, + "grad_norm": 1.5478867292404175, + "learning_rate": 4.9576999616496744e-06, + "loss": 1.1426, + "step": 472 + }, + { + "epoch": 0.4404096834264432, + "grad_norm": 1.6172984838485718, + "learning_rate": 4.957472884251722e-06, + "loss": 1.1599, + "step": 473 + }, + { + "epoch": 0.441340782122905, + "grad_norm": 1.5386159420013428, + "learning_rate": 4.9572452042079845e-06, + "loss": 1.1179, + "step": 474 + }, + { + "epoch": 0.44227188081936686, + "grad_norm": 1.5015331506729126, + "learning_rate": 4.957016921574298e-06, + "loss": 1.1114, + "step": 475 + }, + { + "epoch": 0.44320297951582865, + "grad_norm": 1.5971821546554565, + "learning_rate": 4.9567880364066435e-06, + "loss": 1.0885, + "step": 476 + }, + { + "epoch": 0.4441340782122905, + "grad_norm": 1.5242514610290527, + "learning_rate": 4.95655854876115e-06, + "loss": 1.0912, + "step": 477 + }, + { + "epoch": 0.4450651769087523, + "grad_norm": 1.5206764936447144, + "learning_rate": 4.956328458694096e-06, + "loss": 1.143, + "step": 478 + }, + { + "epoch": 0.44599627560521415, + "grad_norm": 1.595788836479187, + "learning_rate": 4.956097766261905e-06, + "loss": 1.1071, + "step": 479 + }, + { + "epoch": 0.44692737430167595, + "grad_norm": 2.1942999362945557, + "learning_rate": 4.95586647152115e-06, + "loss": 1.1245, + "step": 480 + }, + { + "epoch": 0.4478584729981378, + "grad_norm": 1.5881682634353638, + "learning_rate": 4.955634574528553e-06, + "loss": 1.1411, + "step": 481 + }, + { + "epoch": 0.44878957169459965, + "grad_norm": 1.6564148664474487, + "learning_rate": 4.95540207534098e-06, + "loss": 1.1259, + "step": 482 + }, + { + "epoch": 0.44972067039106145, + "grad_norm": 1.5351526737213135, + "learning_rate": 4.955168974015447e-06, + "loss": 1.1233, + "step": 483 + }, + { + "epoch": 0.4506517690875233, + "grad_norm": 1.5172646045684814, + "learning_rate": 4.954935270609119e-06, + "loss": 1.0793, + "step": 484 + }, + { + "epoch": 0.4515828677839851, + "grad_norm": 1.5817103385925293, + "learning_rate": 4.954700965179306e-06, + "loss": 1.0754, + "step": 485 + }, + { + "epoch": 0.45251396648044695, + "grad_norm": 1.6204304695129395, + "learning_rate": 4.954466057783466e-06, + "loss": 1.1562, + "step": 486 + }, + { + "epoch": 0.45344506517690875, + "grad_norm": 1.5794012546539307, + "learning_rate": 4.9542305484792066e-06, + "loss": 1.122, + "step": 487 + }, + { + "epoch": 0.4543761638733706, + "grad_norm": 1.6076405048370361, + "learning_rate": 4.953994437324281e-06, + "loss": 1.1669, + "step": 488 + }, + { + "epoch": 0.4553072625698324, + "grad_norm": 1.5874429941177368, + "learning_rate": 4.953757724376591e-06, + "loss": 1.1477, + "step": 489 + }, + { + "epoch": 0.45623836126629425, + "grad_norm": 1.6093076467514038, + "learning_rate": 4.953520409694186e-06, + "loss": 1.1499, + "step": 490 + }, + { + "epoch": 0.45716945996275604, + "grad_norm": 1.5191084146499634, + "learning_rate": 4.953282493335261e-06, + "loss": 1.1399, + "step": 491 + }, + { + "epoch": 0.4581005586592179, + "grad_norm": 1.5259243249893188, + "learning_rate": 4.953043975358162e-06, + "loss": 1.1177, + "step": 492 + }, + { + "epoch": 0.4590316573556797, + "grad_norm": 1.5378997325897217, + "learning_rate": 4.95280485582138e-06, + "loss": 1.1194, + "step": 493 + }, + { + "epoch": 0.45996275605214154, + "grad_norm": 1.5503298044204712, + "learning_rate": 4.952565134783554e-06, + "loss": 1.1356, + "step": 494 + }, + { + "epoch": 0.46089385474860334, + "grad_norm": 1.6783236265182495, + "learning_rate": 4.952324812303473e-06, + "loss": 1.1194, + "step": 495 + }, + { + "epoch": 0.4618249534450652, + "grad_norm": 1.6769311428070068, + "learning_rate": 4.952083888440068e-06, + "loss": 1.0884, + "step": 496 + }, + { + "epoch": 0.462756052141527, + "grad_norm": 1.6050509214401245, + "learning_rate": 4.951842363252421e-06, + "loss": 1.1494, + "step": 497 + }, + { + "epoch": 0.46368715083798884, + "grad_norm": 1.5492044687271118, + "learning_rate": 4.951600236799765e-06, + "loss": 1.1018, + "step": 498 + }, + { + "epoch": 0.46461824953445063, + "grad_norm": 1.689003586769104, + "learning_rate": 4.951357509141472e-06, + "loss": 1.15, + "step": 499 + }, + { + "epoch": 0.4655493482309125, + "grad_norm": 1.5559686422348022, + "learning_rate": 4.951114180337068e-06, + "loss": 1.0971, + "step": 500 + }, + { + "epoch": 0.4664804469273743, + "grad_norm": 1.630271315574646, + "learning_rate": 4.950870250446226e-06, + "loss": 1.1491, + "step": 501 + }, + { + "epoch": 0.46741154562383613, + "grad_norm": 1.5715354681015015, + "learning_rate": 4.950625719528762e-06, + "loss": 1.1375, + "step": 502 + }, + { + "epoch": 0.46834264432029793, + "grad_norm": 1.5592912435531616, + "learning_rate": 4.950380587644645e-06, + "loss": 1.1869, + "step": 503 + }, + { + "epoch": 0.4692737430167598, + "grad_norm": 1.5133591890335083, + "learning_rate": 4.950134854853986e-06, + "loss": 1.1025, + "step": 504 + }, + { + "epoch": 0.4702048417132216, + "grad_norm": 1.5977835655212402, + "learning_rate": 4.949888521217049e-06, + "loss": 1.1079, + "step": 505 + }, + { + "epoch": 0.47113594040968343, + "grad_norm": 1.5206948518753052, + "learning_rate": 4.949641586794239e-06, + "loss": 1.1285, + "step": 506 + }, + { + "epoch": 0.4720670391061452, + "grad_norm": 1.591206669807434, + "learning_rate": 4.949394051646115e-06, + "loss": 1.1545, + "step": 507 + }, + { + "epoch": 0.4729981378026071, + "grad_norm": 1.546183466911316, + "learning_rate": 4.9491459158333775e-06, + "loss": 1.1069, + "step": 508 + }, + { + "epoch": 0.47392923649906893, + "grad_norm": 1.6251070499420166, + "learning_rate": 4.9488971794168785e-06, + "loss": 1.1175, + "step": 509 + }, + { + "epoch": 0.4748603351955307, + "grad_norm": 1.5359351634979248, + "learning_rate": 4.948647842457615e-06, + "loss": 1.0687, + "step": 510 + }, + { + "epoch": 0.4757914338919926, + "grad_norm": 1.519960880279541, + "learning_rate": 4.948397905016731e-06, + "loss": 1.115, + "step": 511 + }, + { + "epoch": 0.4767225325884544, + "grad_norm": 1.5621953010559082, + "learning_rate": 4.94814736715552e-06, + "loss": 1.1167, + "step": 512 + }, + { + "epoch": 0.4776536312849162, + "grad_norm": 1.575027346611023, + "learning_rate": 4.947896228935421e-06, + "loss": 1.1495, + "step": 513 + }, + { + "epoch": 0.478584729981378, + "grad_norm": 1.539473295211792, + "learning_rate": 4.94764449041802e-06, + "loss": 1.146, + "step": 514 + }, + { + "epoch": 0.4795158286778399, + "grad_norm": 1.5453004837036133, + "learning_rate": 4.94739215166505e-06, + "loss": 1.1036, + "step": 515 + }, + { + "epoch": 0.48044692737430167, + "grad_norm": 1.6224074363708496, + "learning_rate": 4.947139212738395e-06, + "loss": 1.1356, + "step": 516 + }, + { + "epoch": 0.4813780260707635, + "grad_norm": 1.5118556022644043, + "learning_rate": 4.946885673700081e-06, + "loss": 1.1427, + "step": 517 + }, + { + "epoch": 0.4823091247672253, + "grad_norm": 1.4834543466567993, + "learning_rate": 4.9466315346122825e-06, + "loss": 1.0838, + "step": 518 + }, + { + "epoch": 0.48324022346368717, + "grad_norm": 1.6373393535614014, + "learning_rate": 4.946376795537325e-06, + "loss": 1.1342, + "step": 519 + }, + { + "epoch": 0.48417132216014896, + "grad_norm": 1.6570336818695068, + "learning_rate": 4.946121456537676e-06, + "loss": 1.1947, + "step": 520 + }, + { + "epoch": 0.4851024208566108, + "grad_norm": 1.5891071557998657, + "learning_rate": 4.9458655176759515e-06, + "loss": 1.1161, + "step": 521 + }, + { + "epoch": 0.4860335195530726, + "grad_norm": 1.5352654457092285, + "learning_rate": 4.945608979014917e-06, + "loss": 1.1303, + "step": 522 + }, + { + "epoch": 0.48696461824953446, + "grad_norm": 1.565674901008606, + "learning_rate": 4.9453518406174835e-06, + "loss": 1.146, + "step": 523 + }, + { + "epoch": 0.48789571694599626, + "grad_norm": 1.6120684146881104, + "learning_rate": 4.945094102546708e-06, + "loss": 1.1477, + "step": 524 + }, + { + "epoch": 0.4888268156424581, + "grad_norm": 1.664642572402954, + "learning_rate": 4.944835764865796e-06, + "loss": 1.1291, + "step": 525 + }, + { + "epoch": 0.4897579143389199, + "grad_norm": 1.5092589855194092, + "learning_rate": 4.944576827638099e-06, + "loss": 1.0896, + "step": 526 + }, + { + "epoch": 0.49068901303538176, + "grad_norm": 1.5625349283218384, + "learning_rate": 4.9443172909271174e-06, + "loss": 1.1628, + "step": 527 + }, + { + "epoch": 0.49162011173184356, + "grad_norm": 1.5484577417373657, + "learning_rate": 4.9440571547964964e-06, + "loss": 1.0536, + "step": 528 + }, + { + "epoch": 0.4925512104283054, + "grad_norm": 1.5750197172164917, + "learning_rate": 4.94379641931003e-06, + "loss": 1.1176, + "step": 529 + }, + { + "epoch": 0.4934823091247672, + "grad_norm": 1.5709573030471802, + "learning_rate": 4.943535084531658e-06, + "loss": 1.0891, + "step": 530 + }, + { + "epoch": 0.49441340782122906, + "grad_norm": 1.5684819221496582, + "learning_rate": 4.943273150525467e-06, + "loss": 1.0996, + "step": 531 + }, + { + "epoch": 0.49534450651769085, + "grad_norm": 1.5334537029266357, + "learning_rate": 4.943010617355691e-06, + "loss": 1.1499, + "step": 532 + }, + { + "epoch": 0.4962756052141527, + "grad_norm": 1.5027036666870117, + "learning_rate": 4.942747485086712e-06, + "loss": 1.0837, + "step": 533 + }, + { + "epoch": 0.4972067039106145, + "grad_norm": 1.515656590461731, + "learning_rate": 4.942483753783056e-06, + "loss": 1.0909, + "step": 534 + }, + { + "epoch": 0.49813780260707635, + "grad_norm": 1.5232927799224854, + "learning_rate": 4.9422194235094e-06, + "loss": 1.1155, + "step": 535 + }, + { + "epoch": 0.49906890130353815, + "grad_norm": 1.5004891157150269, + "learning_rate": 4.9419544943305645e-06, + "loss": 1.0875, + "step": 536 + }, + { + "epoch": 0.5, + "grad_norm": 1.5596823692321777, + "learning_rate": 4.941688966311519e-06, + "loss": 1.1505, + "step": 537 + }, + { + "epoch": 0.5009310986964618, + "grad_norm": 1.6406277418136597, + "learning_rate": 4.941422839517377e-06, + "loss": 1.1892, + "step": 538 + }, + { + "epoch": 0.5018621973929237, + "grad_norm": 1.5456867218017578, + "learning_rate": 4.941156114013403e-06, + "loss": 1.1329, + "step": 539 + }, + { + "epoch": 0.5027932960893855, + "grad_norm": 1.4837782382965088, + "learning_rate": 4.940888789865004e-06, + "loss": 1.1124, + "step": 540 + }, + { + "epoch": 0.5037243947858473, + "grad_norm": 1.4902780055999756, + "learning_rate": 4.940620867137736e-06, + "loss": 1.1124, + "step": 541 + }, + { + "epoch": 0.5046554934823091, + "grad_norm": 1.4913779497146606, + "learning_rate": 4.940352345897304e-06, + "loss": 1.1245, + "step": 542 + }, + { + "epoch": 0.505586592178771, + "grad_norm": 1.4790959358215332, + "learning_rate": 4.940083226209555e-06, + "loss": 1.0903, + "step": 543 + }, + { + "epoch": 0.5065176908752328, + "grad_norm": 1.5012257099151611, + "learning_rate": 4.939813508140487e-06, + "loss": 1.1274, + "step": 544 + }, + { + "epoch": 0.5074487895716946, + "grad_norm": 1.7029085159301758, + "learning_rate": 4.9395431917562416e-06, + "loss": 1.1124, + "step": 545 + }, + { + "epoch": 0.5083798882681564, + "grad_norm": 1.5168663263320923, + "learning_rate": 4.939272277123109e-06, + "loss": 1.1341, + "step": 546 + }, + { + "epoch": 0.5093109869646183, + "grad_norm": 1.5790756940841675, + "learning_rate": 4.939000764307526e-06, + "loss": 1.0935, + "step": 547 + }, + { + "epoch": 0.5102420856610801, + "grad_norm": 1.5467796325683594, + "learning_rate": 4.9387286533760745e-06, + "loss": 1.0967, + "step": 548 + }, + { + "epoch": 0.5111731843575419, + "grad_norm": 1.5498733520507812, + "learning_rate": 4.938455944395485e-06, + "loss": 1.1503, + "step": 549 + }, + { + "epoch": 0.5121042830540037, + "grad_norm": 1.564219355583191, + "learning_rate": 4.938182637432634e-06, + "loss": 1.1287, + "step": 550 + }, + { + "epoch": 0.5130353817504656, + "grad_norm": 1.519954800605774, + "learning_rate": 4.937908732554544e-06, + "loss": 1.1007, + "step": 551 + }, + { + "epoch": 0.5139664804469274, + "grad_norm": 1.5214513540267944, + "learning_rate": 4.937634229828384e-06, + "loss": 1.0951, + "step": 552 + }, + { + "epoch": 0.5148975791433892, + "grad_norm": 1.4941843748092651, + "learning_rate": 4.9373591293214725e-06, + "loss": 1.1077, + "step": 553 + }, + { + "epoch": 0.515828677839851, + "grad_norm": 1.5349756479263306, + "learning_rate": 4.937083431101271e-06, + "loss": 1.1217, + "step": 554 + }, + { + "epoch": 0.5167597765363129, + "grad_norm": 1.5093350410461426, + "learning_rate": 4.936807135235389e-06, + "loss": 1.1127, + "step": 555 + }, + { + "epoch": 0.5176908752327747, + "grad_norm": 1.491680383682251, + "learning_rate": 4.936530241791582e-06, + "loss": 1.1055, + "step": 556 + }, + { + "epoch": 0.5186219739292365, + "grad_norm": 1.5149484872817993, + "learning_rate": 4.936252750837752e-06, + "loss": 1.0967, + "step": 557 + }, + { + "epoch": 0.5195530726256983, + "grad_norm": 1.5864510536193848, + "learning_rate": 4.935974662441952e-06, + "loss": 1.1255, + "step": 558 + }, + { + "epoch": 0.5204841713221602, + "grad_norm": 1.577508807182312, + "learning_rate": 4.935695976672372e-06, + "loss": 1.1149, + "step": 559 + }, + { + "epoch": 0.521415270018622, + "grad_norm": 1.517242193222046, + "learning_rate": 4.935416693597358e-06, + "loss": 1.0857, + "step": 560 + }, + { + "epoch": 0.5223463687150838, + "grad_norm": 1.5732022523880005, + "learning_rate": 4.935136813285398e-06, + "loss": 1.1494, + "step": 561 + }, + { + "epoch": 0.5232774674115456, + "grad_norm": 1.6665898561477661, + "learning_rate": 4.934856335805125e-06, + "loss": 1.1114, + "step": 562 + }, + { + "epoch": 0.5242085661080075, + "grad_norm": 1.5523788928985596, + "learning_rate": 4.934575261225322e-06, + "loss": 1.1198, + "step": 563 + }, + { + "epoch": 0.5251396648044693, + "grad_norm": 1.5870845317840576, + "learning_rate": 4.934293589614917e-06, + "loss": 1.1332, + "step": 564 + }, + { + "epoch": 0.5260707635009311, + "grad_norm": 1.5657655000686646, + "learning_rate": 4.934011321042984e-06, + "loss": 1.0994, + "step": 565 + }, + { + "epoch": 0.527001862197393, + "grad_norm": 1.4799336194992065, + "learning_rate": 4.933728455578745e-06, + "loss": 1.091, + "step": 566 + }, + { + "epoch": 0.5279329608938548, + "grad_norm": 1.5232096910476685, + "learning_rate": 4.933444993291564e-06, + "loss": 1.1084, + "step": 567 + }, + { + "epoch": 0.5288640595903166, + "grad_norm": 1.7085096836090088, + "learning_rate": 4.933160934250957e-06, + "loss": 1.0918, + "step": 568 + }, + { + "epoch": 0.5297951582867784, + "grad_norm": 1.5313915014266968, + "learning_rate": 4.932876278526583e-06, + "loss": 1.1068, + "step": 569 + }, + { + "epoch": 0.5307262569832403, + "grad_norm": 1.61378014087677, + "learning_rate": 4.932591026188247e-06, + "loss": 1.0907, + "step": 570 + }, + { + "epoch": 0.5316573556797021, + "grad_norm": 1.5584440231323242, + "learning_rate": 4.932305177305903e-06, + "loss": 1.1082, + "step": 571 + }, + { + "epoch": 0.5325884543761639, + "grad_norm": 1.5553605556488037, + "learning_rate": 4.932018731949649e-06, + "loss": 1.0443, + "step": 572 + }, + { + "epoch": 0.5335195530726257, + "grad_norm": 1.559438943862915, + "learning_rate": 4.931731690189731e-06, + "loss": 1.0706, + "step": 573 + }, + { + "epoch": 0.5344506517690876, + "grad_norm": 1.82330322265625, + "learning_rate": 4.931444052096539e-06, + "loss": 1.1079, + "step": 574 + }, + { + "epoch": 0.5353817504655494, + "grad_norm": 1.6149985790252686, + "learning_rate": 4.9311558177406105e-06, + "loss": 1.0747, + "step": 575 + }, + { + "epoch": 0.5363128491620112, + "grad_norm": 1.5697458982467651, + "learning_rate": 4.93086698719263e-06, + "loss": 1.1586, + "step": 576 + }, + { + "epoch": 0.537243947858473, + "grad_norm": 1.5794774293899536, + "learning_rate": 4.9305775605234255e-06, + "loss": 1.1616, + "step": 577 + }, + { + "epoch": 0.5381750465549349, + "grad_norm": 1.6230155229568481, + "learning_rate": 4.930287537803975e-06, + "loss": 1.0912, + "step": 578 + }, + { + "epoch": 0.5391061452513967, + "grad_norm": 1.6342494487762451, + "learning_rate": 4.9299969191054e-06, + "loss": 1.1393, + "step": 579 + }, + { + "epoch": 0.5400372439478585, + "grad_norm": 1.4576398134231567, + "learning_rate": 4.929705704498969e-06, + "loss": 1.1042, + "step": 580 + }, + { + "epoch": 0.5409683426443203, + "grad_norm": 1.6272011995315552, + "learning_rate": 4.929413894056098e-06, + "loss": 1.0928, + "step": 581 + }, + { + "epoch": 0.5418994413407822, + "grad_norm": 1.6674431562423706, + "learning_rate": 4.929121487848344e-06, + "loss": 1.1872, + "step": 582 + }, + { + "epoch": 0.542830540037244, + "grad_norm": 1.6330026388168335, + "learning_rate": 4.9288284859474165e-06, + "loss": 1.1248, + "step": 583 + }, + { + "epoch": 0.5437616387337058, + "grad_norm": 1.5673110485076904, + "learning_rate": 4.928534888425168e-06, + "loss": 1.1088, + "step": 584 + }, + { + "epoch": 0.5446927374301676, + "grad_norm": 1.533722162246704, + "learning_rate": 4.928240695353598e-06, + "loss": 1.0675, + "step": 585 + }, + { + "epoch": 0.5456238361266295, + "grad_norm": 1.5629221200942993, + "learning_rate": 4.92794590680485e-06, + "loss": 1.0901, + "step": 586 + }, + { + "epoch": 0.5465549348230913, + "grad_norm": 1.624135136604309, + "learning_rate": 4.927650522851215e-06, + "loss": 1.0508, + "step": 587 + }, + { + "epoch": 0.547486033519553, + "grad_norm": 1.578065037727356, + "learning_rate": 4.927354543565131e-06, + "loss": 1.1512, + "step": 588 + }, + { + "epoch": 0.5484171322160148, + "grad_norm": 1.513796329498291, + "learning_rate": 4.92705796901918e-06, + "loss": 1.0465, + "step": 589 + }, + { + "epoch": 0.5493482309124768, + "grad_norm": 1.6567186117172241, + "learning_rate": 4.926760799286091e-06, + "loss": 1.0986, + "step": 590 + }, + { + "epoch": 0.5502793296089385, + "grad_norm": 1.5196467638015747, + "learning_rate": 4.92646303443874e-06, + "loss": 1.0875, + "step": 591 + }, + { + "epoch": 0.5512104283054003, + "grad_norm": 1.599839687347412, + "learning_rate": 4.926164674550147e-06, + "loss": 1.0962, + "step": 592 + }, + { + "epoch": 0.5521415270018621, + "grad_norm": 1.503763198852539, + "learning_rate": 4.925865719693479e-06, + "loss": 1.1416, + "step": 593 + }, + { + "epoch": 0.553072625698324, + "grad_norm": 1.5292751789093018, + "learning_rate": 4.925566169942048e-06, + "loss": 1.0939, + "step": 594 + }, + { + "epoch": 0.5540037243947858, + "grad_norm": 1.5088088512420654, + "learning_rate": 4.925266025369314e-06, + "loss": 1.132, + "step": 595 + }, + { + "epoch": 0.5549348230912476, + "grad_norm": 1.5630978345870972, + "learning_rate": 4.92496528604888e-06, + "loss": 1.1056, + "step": 596 + }, + { + "epoch": 0.5558659217877095, + "grad_norm": 1.6168630123138428, + "learning_rate": 4.924663952054497e-06, + "loss": 1.1408, + "step": 597 + }, + { + "epoch": 0.5567970204841713, + "grad_norm": 1.5950313806533813, + "learning_rate": 4.924362023460061e-06, + "loss": 1.1514, + "step": 598 + }, + { + "epoch": 0.5577281191806331, + "grad_norm": 1.5354509353637695, + "learning_rate": 4.924059500339613e-06, + "loss": 1.1279, + "step": 599 + }, + { + "epoch": 0.5586592178770949, + "grad_norm": 1.5805270671844482, + "learning_rate": 4.923756382767342e-06, + "loss": 1.1022, + "step": 600 + }, + { + "epoch": 0.5595903165735568, + "grad_norm": 1.48416268825531, + "learning_rate": 4.923452670817581e-06, + "loss": 1.0623, + "step": 601 + }, + { + "epoch": 0.5605214152700186, + "grad_norm": 1.545617699623108, + "learning_rate": 4.923148364564809e-06, + "loss": 1.1428, + "step": 602 + }, + { + "epoch": 0.5614525139664804, + "grad_norm": 1.4851430654525757, + "learning_rate": 4.922843464083651e-06, + "loss": 1.1682, + "step": 603 + }, + { + "epoch": 0.5623836126629422, + "grad_norm": 1.5792040824890137, + "learning_rate": 4.922537969448879e-06, + "loss": 1.1727, + "step": 604 + }, + { + "epoch": 0.5633147113594041, + "grad_norm": 1.5662609338760376, + "learning_rate": 4.922231880735407e-06, + "loss": 1.1186, + "step": 605 + }, + { + "epoch": 0.5642458100558659, + "grad_norm": 1.6631107330322266, + "learning_rate": 4.9219251980183e-06, + "loss": 1.0989, + "step": 606 + }, + { + "epoch": 0.5651769087523277, + "grad_norm": 1.559081792831421, + "learning_rate": 4.921617921372764e-06, + "loss": 1.0703, + "step": 607 + }, + { + "epoch": 0.5661080074487895, + "grad_norm": 1.5378901958465576, + "learning_rate": 4.921310050874151e-06, + "loss": 1.1408, + "step": 608 + }, + { + "epoch": 0.5670391061452514, + "grad_norm": 1.4655958414077759, + "learning_rate": 4.921001586597963e-06, + "loss": 1.1095, + "step": 609 + }, + { + "epoch": 0.5679702048417132, + "grad_norm": 1.6246010065078735, + "learning_rate": 4.920692528619843e-06, + "loss": 1.1076, + "step": 610 + }, + { + "epoch": 0.568901303538175, + "grad_norm": 1.5420719385147095, + "learning_rate": 4.920382877015581e-06, + "loss": 1.0959, + "step": 611 + }, + { + "epoch": 0.5698324022346368, + "grad_norm": 1.6560289859771729, + "learning_rate": 4.920072631861115e-06, + "loss": 1.1583, + "step": 612 + }, + { + "epoch": 0.5707635009310987, + "grad_norm": 1.4817291498184204, + "learning_rate": 4.919761793232524e-06, + "loss": 1.1129, + "step": 613 + }, + { + "epoch": 0.5716945996275605, + "grad_norm": 1.7070050239562988, + "learning_rate": 4.919450361206035e-06, + "loss": 1.1142, + "step": 614 + }, + { + "epoch": 0.5726256983240223, + "grad_norm": 1.7083073854446411, + "learning_rate": 4.919138335858021e-06, + "loss": 1.1474, + "step": 615 + }, + { + "epoch": 0.5735567970204841, + "grad_norm": 1.6288410425186157, + "learning_rate": 4.918825717265001e-06, + "loss": 1.1543, + "step": 616 + }, + { + "epoch": 0.574487895716946, + "grad_norm": 1.6135038137435913, + "learning_rate": 4.918512505503638e-06, + "loss": 1.0989, + "step": 617 + }, + { + "epoch": 0.5754189944134078, + "grad_norm": 1.4804624319076538, + "learning_rate": 4.9181987006507396e-06, + "loss": 1.0831, + "step": 618 + }, + { + "epoch": 0.5763500931098696, + "grad_norm": 1.4810327291488647, + "learning_rate": 4.91788430278326e-06, + "loss": 1.1531, + "step": 619 + }, + { + "epoch": 0.5772811918063314, + "grad_norm": 1.5278757810592651, + "learning_rate": 4.917569311978301e-06, + "loss": 1.1033, + "step": 620 + }, + { + "epoch": 0.5782122905027933, + "grad_norm": 1.474737524986267, + "learning_rate": 4.917253728313107e-06, + "loss": 1.0526, + "step": 621 + }, + { + "epoch": 0.5791433891992551, + "grad_norm": 1.5314648151397705, + "learning_rate": 4.916937551865068e-06, + "loss": 1.0946, + "step": 622 + }, + { + "epoch": 0.5800744878957169, + "grad_norm": 1.654606580734253, + "learning_rate": 4.916620782711719e-06, + "loss": 1.1657, + "step": 623 + }, + { + "epoch": 0.5810055865921788, + "grad_norm": 1.4837652444839478, + "learning_rate": 4.9163034209307435e-06, + "loss": 1.1388, + "step": 624 + }, + { + "epoch": 0.5819366852886406, + "grad_norm": 1.6011992692947388, + "learning_rate": 4.915985466599967e-06, + "loss": 1.0946, + "step": 625 + }, + { + "epoch": 0.5828677839851024, + "grad_norm": 1.623832106590271, + "learning_rate": 4.91566691979736e-06, + "loss": 1.1649, + "step": 626 + }, + { + "epoch": 0.5837988826815642, + "grad_norm": 1.526106357574463, + "learning_rate": 4.915347780601042e-06, + "loss": 1.1469, + "step": 627 + }, + { + "epoch": 0.5847299813780261, + "grad_norm": 1.5569554567337036, + "learning_rate": 4.915028049089275e-06, + "loss": 1.1163, + "step": 628 + }, + { + "epoch": 0.5856610800744879, + "grad_norm": 1.5431631803512573, + "learning_rate": 4.914707725340465e-06, + "loss": 1.0944, + "step": 629 + }, + { + "epoch": 0.5865921787709497, + "grad_norm": 1.4687777757644653, + "learning_rate": 4.914386809433167e-06, + "loss": 1.0564, + "step": 630 + }, + { + "epoch": 0.5875232774674115, + "grad_norm": 1.5362383127212524, + "learning_rate": 4.914065301446078e-06, + "loss": 1.1252, + "step": 631 + }, + { + "epoch": 0.5884543761638734, + "grad_norm": 1.5959590673446655, + "learning_rate": 4.913743201458042e-06, + "loss": 1.133, + "step": 632 + }, + { + "epoch": 0.5893854748603352, + "grad_norm": 1.552869439125061, + "learning_rate": 4.913420509548047e-06, + "loss": 1.1218, + "step": 633 + }, + { + "epoch": 0.590316573556797, + "grad_norm": 1.591748833656311, + "learning_rate": 4.913097225795227e-06, + "loss": 1.1433, + "step": 634 + }, + { + "epoch": 0.5912476722532588, + "grad_norm": 1.5221431255340576, + "learning_rate": 4.912773350278861e-06, + "loss": 1.0595, + "step": 635 + }, + { + "epoch": 0.5921787709497207, + "grad_norm": 1.5379856824874878, + "learning_rate": 4.912448883078373e-06, + "loss": 1.1289, + "step": 636 + }, + { + "epoch": 0.5931098696461825, + "grad_norm": 1.562050223350525, + "learning_rate": 4.912123824273331e-06, + "loss": 1.1394, + "step": 637 + }, + { + "epoch": 0.5940409683426443, + "grad_norm": 1.6056617498397827, + "learning_rate": 4.91179817394345e-06, + "loss": 1.1104, + "step": 638 + }, + { + "epoch": 0.5949720670391061, + "grad_norm": 1.5202099084854126, + "learning_rate": 4.91147193216859e-06, + "loss": 1.0701, + "step": 639 + }, + { + "epoch": 0.595903165735568, + "grad_norm": 1.53488028049469, + "learning_rate": 4.911145099028753e-06, + "loss": 1.0748, + "step": 640 + }, + { + "epoch": 0.5968342644320298, + "grad_norm": 1.6276181936264038, + "learning_rate": 4.91081767460409e-06, + "loss": 1.129, + "step": 641 + }, + { + "epoch": 0.5977653631284916, + "grad_norm": 1.5922019481658936, + "learning_rate": 4.910489658974896e-06, + "loss": 1.1204, + "step": 642 + }, + { + "epoch": 0.5986964618249534, + "grad_norm": 1.5041857957839966, + "learning_rate": 4.910161052221608e-06, + "loss": 1.0824, + "step": 643 + }, + { + "epoch": 0.5996275605214153, + "grad_norm": 1.6397160291671753, + "learning_rate": 4.909831854424812e-06, + "loss": 1.1401, + "step": 644 + }, + { + "epoch": 0.6005586592178771, + "grad_norm": 1.5218939781188965, + "learning_rate": 4.909502065665236e-06, + "loss": 1.0875, + "step": 645 + }, + { + "epoch": 0.6014897579143389, + "grad_norm": 1.5679396390914917, + "learning_rate": 4.9091716860237545e-06, + "loss": 1.1467, + "step": 646 + }, + { + "epoch": 0.6024208566108007, + "grad_norm": 1.606338620185852, + "learning_rate": 4.908840715581386e-06, + "loss": 1.0847, + "step": 647 + }, + { + "epoch": 0.6033519553072626, + "grad_norm": 1.495506763458252, + "learning_rate": 4.908509154419296e-06, + "loss": 1.1315, + "step": 648 + }, + { + "epoch": 0.6042830540037244, + "grad_norm": 1.4830591678619385, + "learning_rate": 4.9081770026187915e-06, + "loss": 1.1011, + "step": 649 + }, + { + "epoch": 0.6052141527001862, + "grad_norm": 1.540897250175476, + "learning_rate": 4.9078442602613265e-06, + "loss": 1.1061, + "step": 650 + }, + { + "epoch": 0.6061452513966481, + "grad_norm": 1.5537681579589844, + "learning_rate": 4.907510927428499e-06, + "loss": 1.138, + "step": 651 + }, + { + "epoch": 0.6070763500931099, + "grad_norm": 1.626189947128296, + "learning_rate": 4.907177004202053e-06, + "loss": 1.1607, + "step": 652 + }, + { + "epoch": 0.6080074487895717, + "grad_norm": 1.6425658464431763, + "learning_rate": 4.9068424906638756e-06, + "loss": 1.1357, + "step": 653 + }, + { + "epoch": 0.6089385474860335, + "grad_norm": 1.5693364143371582, + "learning_rate": 4.906507386896e-06, + "loss": 1.1053, + "step": 654 + }, + { + "epoch": 0.6098696461824954, + "grad_norm": 1.557714581489563, + "learning_rate": 4.906171692980603e-06, + "loss": 1.1253, + "step": 655 + }, + { + "epoch": 0.6108007448789572, + "grad_norm": 1.5892163515090942, + "learning_rate": 4.905835409000009e-06, + "loss": 1.1183, + "step": 656 + }, + { + "epoch": 0.611731843575419, + "grad_norm": 1.5864640474319458, + "learning_rate": 4.905498535036683e-06, + "loss": 1.0769, + "step": 657 + }, + { + "epoch": 0.6126629422718808, + "grad_norm": 1.4962928295135498, + "learning_rate": 4.905161071173236e-06, + "loss": 1.0852, + "step": 658 + }, + { + "epoch": 0.6135940409683427, + "grad_norm": 1.4973156452178955, + "learning_rate": 4.904823017492425e-06, + "loss": 1.126, + "step": 659 + }, + { + "epoch": 0.6145251396648045, + "grad_norm": 1.5556844472885132, + "learning_rate": 4.904484374077151e-06, + "loss": 1.1195, + "step": 660 + }, + { + "epoch": 0.6154562383612663, + "grad_norm": 1.661524772644043, + "learning_rate": 4.9041451410104595e-06, + "loss": 1.1442, + "step": 661 + }, + { + "epoch": 0.6163873370577281, + "grad_norm": 1.5103766918182373, + "learning_rate": 4.903805318375541e-06, + "loss": 1.0951, + "step": 662 + }, + { + "epoch": 0.61731843575419, + "grad_norm": 1.5827866792678833, + "learning_rate": 4.9034649062557295e-06, + "loss": 1.1581, + "step": 663 + }, + { + "epoch": 0.6182495344506518, + "grad_norm": 1.5530108213424683, + "learning_rate": 4.903123904734504e-06, + "loss": 1.1187, + "step": 664 + }, + { + "epoch": 0.6191806331471136, + "grad_norm": 1.930545449256897, + "learning_rate": 4.902782313895489e-06, + "loss": 1.0908, + "step": 665 + }, + { + "epoch": 0.6201117318435754, + "grad_norm": 1.5824450254440308, + "learning_rate": 4.902440133822452e-06, + "loss": 1.0865, + "step": 666 + }, + { + "epoch": 0.6210428305400373, + "grad_norm": 1.5734223127365112, + "learning_rate": 4.902097364599306e-06, + "loss": 1.0816, + "step": 667 + }, + { + "epoch": 0.6219739292364991, + "grad_norm": 1.5792101621627808, + "learning_rate": 4.9017540063101085e-06, + "loss": 1.0988, + "step": 668 + }, + { + "epoch": 0.6229050279329609, + "grad_norm": 1.5171549320220947, + "learning_rate": 4.901410059039061e-06, + "loss": 1.0898, + "step": 669 + }, + { + "epoch": 0.6238361266294227, + "grad_norm": 1.5120694637298584, + "learning_rate": 4.901065522870511e-06, + "loss": 1.0758, + "step": 670 + }, + { + "epoch": 0.6247672253258846, + "grad_norm": 1.6109082698822021, + "learning_rate": 4.900720397888947e-06, + "loss": 1.1205, + "step": 671 + }, + { + "epoch": 0.6256983240223464, + "grad_norm": 1.5656803846359253, + "learning_rate": 4.900374684179005e-06, + "loss": 1.0803, + "step": 672 + }, + { + "epoch": 0.6266294227188082, + "grad_norm": 1.5185593366622925, + "learning_rate": 4.900028381825464e-06, + "loss": 1.1101, + "step": 673 + }, + { + "epoch": 0.62756052141527, + "grad_norm": 1.51339852809906, + "learning_rate": 4.8996814909132475e-06, + "loss": 1.0805, + "step": 674 + }, + { + "epoch": 0.6284916201117319, + "grad_norm": 1.5645750761032104, + "learning_rate": 4.899334011527424e-06, + "loss": 1.1125, + "step": 675 + }, + { + "epoch": 0.6294227188081937, + "grad_norm": 1.6254100799560547, + "learning_rate": 4.898985943753207e-06, + "loss": 1.1332, + "step": 676 + }, + { + "epoch": 0.6303538175046555, + "grad_norm": 1.5608704090118408, + "learning_rate": 4.89863728767595e-06, + "loss": 1.1132, + "step": 677 + }, + { + "epoch": 0.6312849162011173, + "grad_norm": 1.5128356218338013, + "learning_rate": 4.898288043381157e-06, + "loss": 1.0267, + "step": 678 + }, + { + "epoch": 0.6322160148975792, + "grad_norm": 1.5966126918792725, + "learning_rate": 4.897938210954472e-06, + "loss": 1.0597, + "step": 679 + }, + { + "epoch": 0.633147113594041, + "grad_norm": 1.5826008319854736, + "learning_rate": 4.8975877904816825e-06, + "loss": 1.0882, + "step": 680 + }, + { + "epoch": 0.6340782122905028, + "grad_norm": 1.5969572067260742, + "learning_rate": 4.897236782048726e-06, + "loss": 1.0752, + "step": 681 + }, + { + "epoch": 0.6350093109869647, + "grad_norm": 1.5381343364715576, + "learning_rate": 4.896885185741676e-06, + "loss": 1.1118, + "step": 682 + }, + { + "epoch": 0.6359404096834265, + "grad_norm": 1.5589604377746582, + "learning_rate": 4.896533001646757e-06, + "loss": 1.1077, + "step": 683 + }, + { + "epoch": 0.6368715083798883, + "grad_norm": 1.4659178256988525, + "learning_rate": 4.8961802298503355e-06, + "loss": 1.078, + "step": 684 + }, + { + "epoch": 0.6378026070763501, + "grad_norm": 1.5804171562194824, + "learning_rate": 4.89582687043892e-06, + "loss": 1.0963, + "step": 685 + }, + { + "epoch": 0.638733705772812, + "grad_norm": 1.4594801664352417, + "learning_rate": 4.895472923499165e-06, + "loss": 1.1026, + "step": 686 + }, + { + "epoch": 0.6396648044692738, + "grad_norm": 1.5364091396331787, + "learning_rate": 4.89511838911787e-06, + "loss": 1.1093, + "step": 687 + }, + { + "epoch": 0.6405959031657356, + "grad_norm": 1.5281437635421753, + "learning_rate": 4.894763267381977e-06, + "loss": 1.1183, + "step": 688 + }, + { + "epoch": 0.6415270018621974, + "grad_norm": 1.5509490966796875, + "learning_rate": 4.894407558378572e-06, + "loss": 1.1291, + "step": 689 + }, + { + "epoch": 0.6424581005586593, + "grad_norm": 1.616011381149292, + "learning_rate": 4.894051262194885e-06, + "loss": 1.0723, + "step": 690 + }, + { + "epoch": 0.6433891992551211, + "grad_norm": 1.6060771942138672, + "learning_rate": 4.893694378918292e-06, + "loss": 1.067, + "step": 691 + }, + { + "epoch": 0.6443202979515829, + "grad_norm": 1.5956634283065796, + "learning_rate": 4.8933369086363105e-06, + "loss": 1.1002, + "step": 692 + }, + { + "epoch": 0.6452513966480447, + "grad_norm": 1.5667318105697632, + "learning_rate": 4.892978851436603e-06, + "loss": 1.1144, + "step": 693 + }, + { + "epoch": 0.6461824953445066, + "grad_norm": 1.488541603088379, + "learning_rate": 4.892620207406975e-06, + "loss": 1.0363, + "step": 694 + }, + { + "epoch": 0.6471135940409684, + "grad_norm": 1.6104015111923218, + "learning_rate": 4.892260976635379e-06, + "loss": 1.0853, + "step": 695 + }, + { + "epoch": 0.6480446927374302, + "grad_norm": 1.565591812133789, + "learning_rate": 4.891901159209907e-06, + "loss": 1.0491, + "step": 696 + }, + { + "epoch": 0.648975791433892, + "grad_norm": 1.5780127048492432, + "learning_rate": 4.891540755218797e-06, + "loss": 1.0958, + "step": 697 + }, + { + "epoch": 0.6499068901303539, + "grad_norm": 1.596530795097351, + "learning_rate": 4.891179764750434e-06, + "loss": 1.1335, + "step": 698 + }, + { + "epoch": 0.6508379888268156, + "grad_norm": 1.5459315776824951, + "learning_rate": 4.890818187893338e-06, + "loss": 1.091, + "step": 699 + }, + { + "epoch": 0.6517690875232774, + "grad_norm": 1.5360651016235352, + "learning_rate": 4.890456024736183e-06, + "loss": 1.1095, + "step": 700 + }, + { + "epoch": 0.6527001862197392, + "grad_norm": 1.5870563983917236, + "learning_rate": 4.890093275367781e-06, + "loss": 1.0876, + "step": 701 + }, + { + "epoch": 0.6536312849162011, + "grad_norm": 1.5088231563568115, + "learning_rate": 4.889729939877089e-06, + "loss": 1.1153, + "step": 702 + }, + { + "epoch": 0.654562383612663, + "grad_norm": 1.4789011478424072, + "learning_rate": 4.889366018353207e-06, + "loss": 1.1195, + "step": 703 + }, + { + "epoch": 0.6554934823091247, + "grad_norm": 1.4999905824661255, + "learning_rate": 4.8890015108853805e-06, + "loss": 1.1156, + "step": 704 + }, + { + "epoch": 0.6564245810055865, + "grad_norm": 1.5061445236206055, + "learning_rate": 4.888636417562996e-06, + "loss": 1.0963, + "step": 705 + }, + { + "epoch": 0.6573556797020484, + "grad_norm": 1.5363857746124268, + "learning_rate": 4.888270738475588e-06, + "loss": 1.1112, + "step": 706 + }, + { + "epoch": 0.6582867783985102, + "grad_norm": 1.4777934551239014, + "learning_rate": 4.887904473712829e-06, + "loss": 1.1139, + "step": 707 + }, + { + "epoch": 0.659217877094972, + "grad_norm": 1.4956679344177246, + "learning_rate": 4.88753762336454e-06, + "loss": 1.0872, + "step": 708 + }, + { + "epoch": 0.660148975791434, + "grad_norm": 1.4835432767868042, + "learning_rate": 4.887170187520684e-06, + "loss": 1.1361, + "step": 709 + }, + { + "epoch": 0.6610800744878957, + "grad_norm": 1.5423203706741333, + "learning_rate": 4.886802166271365e-06, + "loss": 1.0813, + "step": 710 + }, + { + "epoch": 0.6620111731843575, + "grad_norm": 1.559105634689331, + "learning_rate": 4.8864335597068335e-06, + "loss": 1.1319, + "step": 711 + }, + { + "epoch": 0.6629422718808193, + "grad_norm": 1.5082135200500488, + "learning_rate": 4.886064367917485e-06, + "loss": 1.1182, + "step": 712 + }, + { + "epoch": 0.6638733705772812, + "grad_norm": 1.5104544162750244, + "learning_rate": 4.885694590993854e-06, + "loss": 1.0512, + "step": 713 + }, + { + "epoch": 0.664804469273743, + "grad_norm": 1.4970797300338745, + "learning_rate": 4.8853242290266216e-06, + "loss": 1.1005, + "step": 714 + }, + { + "epoch": 0.6657355679702048, + "grad_norm": 1.579653263092041, + "learning_rate": 4.884953282106612e-06, + "loss": 1.0788, + "step": 715 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.603585958480835, + "learning_rate": 4.884581750324792e-06, + "loss": 1.0905, + "step": 716 + }, + { + "epoch": 0.6675977653631285, + "grad_norm": 1.5491455793380737, + "learning_rate": 4.884209633772272e-06, + "loss": 1.0607, + "step": 717 + }, + { + "epoch": 0.6685288640595903, + "grad_norm": 1.4984642267227173, + "learning_rate": 4.883836932540308e-06, + "loss": 1.0939, + "step": 718 + }, + { + "epoch": 0.6694599627560521, + "grad_norm": 1.6042760610580444, + "learning_rate": 4.883463646720295e-06, + "loss": 1.071, + "step": 719 + }, + { + "epoch": 0.6703910614525139, + "grad_norm": 1.4980610609054565, + "learning_rate": 4.883089776403775e-06, + "loss": 1.0743, + "step": 720 + }, + { + "epoch": 0.6713221601489758, + "grad_norm": 1.5268374681472778, + "learning_rate": 4.882715321682432e-06, + "loss": 1.0753, + "step": 721 + }, + { + "epoch": 0.6722532588454376, + "grad_norm": 1.61496102809906, + "learning_rate": 4.882340282648094e-06, + "loss": 1.1517, + "step": 722 + }, + { + "epoch": 0.6731843575418994, + "grad_norm": 1.528674602508545, + "learning_rate": 4.881964659392731e-06, + "loss": 1.1136, + "step": 723 + }, + { + "epoch": 0.6741154562383612, + "grad_norm": 1.451182246208191, + "learning_rate": 4.881588452008457e-06, + "loss": 1.0732, + "step": 724 + }, + { + "epoch": 0.6750465549348231, + "grad_norm": 1.580771565437317, + "learning_rate": 4.8812116605875295e-06, + "loss": 1.1069, + "step": 725 + }, + { + "epoch": 0.6759776536312849, + "grad_norm": 1.5257370471954346, + "learning_rate": 4.88083428522235e-06, + "loss": 1.0937, + "step": 726 + }, + { + "epoch": 0.6769087523277467, + "grad_norm": 1.6596299409866333, + "learning_rate": 4.88045632600546e-06, + "loss": 1.0962, + "step": 727 + }, + { + "epoch": 0.6778398510242085, + "grad_norm": 1.533115029335022, + "learning_rate": 4.880077783029549e-06, + "loss": 1.0583, + "step": 728 + }, + { + "epoch": 0.6787709497206704, + "grad_norm": 1.6121939420700073, + "learning_rate": 4.879698656387446e-06, + "loss": 1.0924, + "step": 729 + }, + { + "epoch": 0.6797020484171322, + "grad_norm": 1.555610179901123, + "learning_rate": 4.879318946172124e-06, + "loss": 1.1304, + "step": 730 + }, + { + "epoch": 0.680633147113594, + "grad_norm": 1.5840232372283936, + "learning_rate": 4.878938652476698e-06, + "loss": 1.1388, + "step": 731 + }, + { + "epoch": 0.6815642458100558, + "grad_norm": 1.578559398651123, + "learning_rate": 4.878557775394429e-06, + "loss": 1.1108, + "step": 732 + }, + { + "epoch": 0.6824953445065177, + "grad_norm": 1.5036128759384155, + "learning_rate": 4.87817631501872e-06, + "loss": 1.0703, + "step": 733 + }, + { + "epoch": 0.6834264432029795, + "grad_norm": 1.5150079727172852, + "learning_rate": 4.877794271443116e-06, + "loss": 1.1165, + "step": 734 + }, + { + "epoch": 0.6843575418994413, + "grad_norm": 1.5486232042312622, + "learning_rate": 4.877411644761304e-06, + "loss": 1.0724, + "step": 735 + }, + { + "epoch": 0.6852886405959032, + "grad_norm": 1.5734553337097168, + "learning_rate": 4.877028435067117e-06, + "loss": 1.0929, + "step": 736 + }, + { + "epoch": 0.686219739292365, + "grad_norm": 1.570406198501587, + "learning_rate": 4.876644642454529e-06, + "loss": 1.0725, + "step": 737 + }, + { + "epoch": 0.6871508379888268, + "grad_norm": 1.5357011556625366, + "learning_rate": 4.8762602670176574e-06, + "loss": 1.0723, + "step": 738 + }, + { + "epoch": 0.6880819366852886, + "grad_norm": 1.5737360715866089, + "learning_rate": 4.875875308850762e-06, + "loss": 1.1451, + "step": 739 + }, + { + "epoch": 0.6890130353817505, + "grad_norm": 1.6345967054367065, + "learning_rate": 4.875489768048247e-06, + "loss": 1.157, + "step": 740 + }, + { + "epoch": 0.6899441340782123, + "grad_norm": 1.5567008256912231, + "learning_rate": 4.87510364470466e-06, + "loss": 1.0703, + "step": 741 + }, + { + "epoch": 0.6908752327746741, + "grad_norm": 1.5474246740341187, + "learning_rate": 4.874716938914686e-06, + "loss": 1.0862, + "step": 742 + }, + { + "epoch": 0.6918063314711359, + "grad_norm": 1.5408908128738403, + "learning_rate": 4.87432965077316e-06, + "loss": 1.0408, + "step": 743 + }, + { + "epoch": 0.6927374301675978, + "grad_norm": 1.5973511934280396, + "learning_rate": 4.873941780375055e-06, + "loss": 1.0974, + "step": 744 + }, + { + "epoch": 0.6936685288640596, + "grad_norm": 1.5032771825790405, + "learning_rate": 4.873553327815489e-06, + "loss": 1.0607, + "step": 745 + }, + { + "epoch": 0.6945996275605214, + "grad_norm": 1.5170742273330688, + "learning_rate": 4.873164293189723e-06, + "loss": 1.0646, + "step": 746 + }, + { + "epoch": 0.6955307262569832, + "grad_norm": 1.6156871318817139, + "learning_rate": 4.872774676593158e-06, + "loss": 1.1065, + "step": 747 + }, + { + "epoch": 0.6964618249534451, + "grad_norm": 1.5451059341430664, + "learning_rate": 4.872384478121342e-06, + "loss": 1.087, + "step": 748 + }, + { + "epoch": 0.6973929236499069, + "grad_norm": 1.5151008367538452, + "learning_rate": 4.871993697869961e-06, + "loss": 1.0941, + "step": 749 + }, + { + "epoch": 0.6983240223463687, + "grad_norm": 1.5981560945510864, + "learning_rate": 4.871602335934847e-06, + "loss": 1.0877, + "step": 750 + }, + { + "epoch": 0.6992551210428305, + "grad_norm": 1.584849238395691, + "learning_rate": 4.8712103924119744e-06, + "loss": 1.1211, + "step": 751 + }, + { + "epoch": 0.7001862197392924, + "grad_norm": 1.5590605735778809, + "learning_rate": 4.870817867397459e-06, + "loss": 1.1352, + "step": 752 + }, + { + "epoch": 0.7011173184357542, + "grad_norm": 1.5966578722000122, + "learning_rate": 4.870424760987559e-06, + "loss": 1.1345, + "step": 753 + }, + { + "epoch": 0.702048417132216, + "grad_norm": 1.4746921062469482, + "learning_rate": 4.870031073278676e-06, + "loss": 1.0907, + "step": 754 + }, + { + "epoch": 0.7029795158286778, + "grad_norm": 1.5347262620925903, + "learning_rate": 4.869636804367355e-06, + "loss": 1.0769, + "step": 755 + }, + { + "epoch": 0.7039106145251397, + "grad_norm": 1.5220005512237549, + "learning_rate": 4.869241954350281e-06, + "loss": 1.0585, + "step": 756 + }, + { + "epoch": 0.7048417132216015, + "grad_norm": 1.5988545417785645, + "learning_rate": 4.868846523324284e-06, + "loss": 1.1004, + "step": 757 + }, + { + "epoch": 0.7057728119180633, + "grad_norm": 1.574826717376709, + "learning_rate": 4.868450511386336e-06, + "loss": 1.1146, + "step": 758 + }, + { + "epoch": 0.7067039106145251, + "grad_norm": 1.5736562013626099, + "learning_rate": 4.868053918633549e-06, + "loss": 1.0703, + "step": 759 + }, + { + "epoch": 0.707635009310987, + "grad_norm": 1.464430809020996, + "learning_rate": 4.867656745163182e-06, + "loss": 1.0535, + "step": 760 + }, + { + "epoch": 0.7085661080074488, + "grad_norm": 1.561993956565857, + "learning_rate": 4.8672589910726305e-06, + "loss": 1.0615, + "step": 761 + }, + { + "epoch": 0.7094972067039106, + "grad_norm": 1.5395992994308472, + "learning_rate": 4.86686065645944e-06, + "loss": 1.0774, + "step": 762 + }, + { + "epoch": 0.7104283054003724, + "grad_norm": 1.5870630741119385, + "learning_rate": 4.86646174142129e-06, + "loss": 1.0823, + "step": 763 + }, + { + "epoch": 0.7113594040968343, + "grad_norm": 1.6562172174453735, + "learning_rate": 4.8660622460560096e-06, + "loss": 1.1273, + "step": 764 + }, + { + "epoch": 0.7122905027932961, + "grad_norm": 1.6088863611221313, + "learning_rate": 4.865662170461564e-06, + "loss": 1.1102, + "step": 765 + }, + { + "epoch": 0.7132216014897579, + "grad_norm": 1.506174921989441, + "learning_rate": 4.865261514736066e-06, + "loss": 1.0233, + "step": 766 + }, + { + "epoch": 0.7141527001862198, + "grad_norm": 1.5461585521697998, + "learning_rate": 4.864860278977767e-06, + "loss": 1.1069, + "step": 767 + }, + { + "epoch": 0.7150837988826816, + "grad_norm": 1.6047543287277222, + "learning_rate": 4.864458463285063e-06, + "loss": 1.1141, + "step": 768 + }, + { + "epoch": 0.7160148975791434, + "grad_norm": 1.550565481185913, + "learning_rate": 4.864056067756491e-06, + "loss": 1.0916, + "step": 769 + }, + { + "epoch": 0.7169459962756052, + "grad_norm": 1.6570130586624146, + "learning_rate": 4.86365309249073e-06, + "loss": 1.1312, + "step": 770 + }, + { + "epoch": 0.7178770949720671, + "grad_norm": 1.5110983848571777, + "learning_rate": 4.863249537586601e-06, + "loss": 1.08, + "step": 771 + }, + { + "epoch": 0.7188081936685289, + "grad_norm": 1.571300745010376, + "learning_rate": 4.86284540314307e-06, + "loss": 1.0873, + "step": 772 + }, + { + "epoch": 0.7197392923649907, + "grad_norm": 1.5201406478881836, + "learning_rate": 4.8624406892592394e-06, + "loss": 1.1234, + "step": 773 + }, + { + "epoch": 0.7206703910614525, + "grad_norm": 1.4831037521362305, + "learning_rate": 4.862035396034359e-06, + "loss": 1.0649, + "step": 774 + }, + { + "epoch": 0.7216014897579144, + "grad_norm": 1.5336039066314697, + "learning_rate": 4.86162952356782e-06, + "loss": 1.0121, + "step": 775 + }, + { + "epoch": 0.7225325884543762, + "grad_norm": 1.5172268152236938, + "learning_rate": 4.8612230719591535e-06, + "loss": 1.1539, + "step": 776 + }, + { + "epoch": 0.723463687150838, + "grad_norm": 1.5218273401260376, + "learning_rate": 4.860816041308033e-06, + "loss": 1.064, + "step": 777 + }, + { + "epoch": 0.7243947858472998, + "grad_norm": 1.5292189121246338, + "learning_rate": 4.860408431714275e-06, + "loss": 1.0914, + "step": 778 + }, + { + "epoch": 0.7253258845437617, + "grad_norm": 1.5450135469436646, + "learning_rate": 4.860000243277837e-06, + "loss": 1.082, + "step": 779 + }, + { + "epoch": 0.7262569832402235, + "grad_norm": 1.6913230419158936, + "learning_rate": 4.85959147609882e-06, + "loss": 1.0812, + "step": 780 + }, + { + "epoch": 0.7271880819366853, + "grad_norm": 1.5307987928390503, + "learning_rate": 4.859182130277465e-06, + "loss": 1.0964, + "step": 781 + }, + { + "epoch": 0.7281191806331471, + "grad_norm": 1.5680224895477295, + "learning_rate": 4.858772205914158e-06, + "loss": 1.1084, + "step": 782 + }, + { + "epoch": 0.729050279329609, + "grad_norm": 1.5493946075439453, + "learning_rate": 4.85836170310942e-06, + "loss": 1.066, + "step": 783 + }, + { + "epoch": 0.7299813780260708, + "grad_norm": 1.5324965715408325, + "learning_rate": 4.857950621963924e-06, + "loss": 1.0525, + "step": 784 + }, + { + "epoch": 0.7309124767225326, + "grad_norm": 1.5298123359680176, + "learning_rate": 4.857538962578475e-06, + "loss": 1.0998, + "step": 785 + }, + { + "epoch": 0.7318435754189944, + "grad_norm": 1.6049745082855225, + "learning_rate": 4.857126725054028e-06, + "loss": 1.1404, + "step": 786 + }, + { + "epoch": 0.7327746741154563, + "grad_norm": 1.6233011484146118, + "learning_rate": 4.856713909491673e-06, + "loss": 1.0963, + "step": 787 + }, + { + "epoch": 0.7337057728119181, + "grad_norm": 1.594020128250122, + "learning_rate": 4.856300515992645e-06, + "loss": 1.0649, + "step": 788 + }, + { + "epoch": 0.7346368715083799, + "grad_norm": 1.6714119911193848, + "learning_rate": 4.855886544658322e-06, + "loss": 1.0816, + "step": 789 + }, + { + "epoch": 0.7355679702048417, + "grad_norm": 1.5608378648757935, + "learning_rate": 4.8554719955902215e-06, + "loss": 1.0909, + "step": 790 + }, + { + "epoch": 0.7364990689013036, + "grad_norm": 1.4881634712219238, + "learning_rate": 4.855056868890004e-06, + "loss": 1.0887, + "step": 791 + }, + { + "epoch": 0.7374301675977654, + "grad_norm": 1.5242223739624023, + "learning_rate": 4.854641164659468e-06, + "loss": 1.102, + "step": 792 + }, + { + "epoch": 0.7383612662942272, + "grad_norm": 1.6237317323684692, + "learning_rate": 4.854224883000561e-06, + "loss": 1.0703, + "step": 793 + }, + { + "epoch": 0.7392923649906891, + "grad_norm": 1.5481069087982178, + "learning_rate": 4.853808024015364e-06, + "loss": 1.0754, + "step": 794 + }, + { + "epoch": 0.7402234636871509, + "grad_norm": 1.6555005311965942, + "learning_rate": 4.853390587806105e-06, + "loss": 1.0886, + "step": 795 + }, + { + "epoch": 0.7411545623836127, + "grad_norm": 1.604873776435852, + "learning_rate": 4.852972574475151e-06, + "loss": 1.1246, + "step": 796 + }, + { + "epoch": 0.7420856610800745, + "grad_norm": 1.5767509937286377, + "learning_rate": 4.852553984125013e-06, + "loss": 1.0772, + "step": 797 + }, + { + "epoch": 0.7430167597765364, + "grad_norm": 1.6008780002593994, + "learning_rate": 4.852134816858341e-06, + "loss": 1.1223, + "step": 798 + }, + { + "epoch": 0.7439478584729982, + "grad_norm": 1.5163817405700684, + "learning_rate": 4.851715072777926e-06, + "loss": 1.0917, + "step": 799 + }, + { + "epoch": 0.74487895716946, + "grad_norm": 1.5727468729019165, + "learning_rate": 4.8512947519867025e-06, + "loss": 1.1302, + "step": 800 + }, + { + "epoch": 0.7458100558659218, + "grad_norm": 1.613973617553711, + "learning_rate": 4.850873854587747e-06, + "loss": 1.0977, + "step": 801 + }, + { + "epoch": 0.7467411545623837, + "grad_norm": 1.7014856338500977, + "learning_rate": 4.850452380684275e-06, + "loss": 1.0715, + "step": 802 + }, + { + "epoch": 0.7476722532588455, + "grad_norm": 1.5576146841049194, + "learning_rate": 4.850030330379645e-06, + "loss": 1.0662, + "step": 803 + }, + { + "epoch": 0.7486033519553073, + "grad_norm": 1.6135509014129639, + "learning_rate": 4.849607703777356e-06, + "loss": 1.1317, + "step": 804 + }, + { + "epoch": 0.749534450651769, + "grad_norm": 1.5958330631256104, + "learning_rate": 4.849184500981048e-06, + "loss": 1.121, + "step": 805 + }, + { + "epoch": 0.750465549348231, + "grad_norm": 1.4927066564559937, + "learning_rate": 4.848760722094504e-06, + "loss": 1.0863, + "step": 806 + }, + { + "epoch": 0.7513966480446927, + "grad_norm": 1.5335276126861572, + "learning_rate": 4.8483363672216475e-06, + "loss": 1.0748, + "step": 807 + }, + { + "epoch": 0.7523277467411545, + "grad_norm": 1.4925564527511597, + "learning_rate": 4.8479114364665425e-06, + "loss": 1.0899, + "step": 808 + }, + { + "epoch": 0.7532588454376163, + "grad_norm": 1.5200743675231934, + "learning_rate": 4.847485929933395e-06, + "loss": 1.0907, + "step": 809 + }, + { + "epoch": 0.7541899441340782, + "grad_norm": 1.5277286767959595, + "learning_rate": 4.84705984772655e-06, + "loss": 1.0982, + "step": 810 + }, + { + "epoch": 0.75512104283054, + "grad_norm": 1.5960828065872192, + "learning_rate": 4.846633189950498e-06, + "loss": 1.0875, + "step": 811 + }, + { + "epoch": 0.7560521415270018, + "grad_norm": 1.5886591672897339, + "learning_rate": 4.846205956709868e-06, + "loss": 1.0638, + "step": 812 + }, + { + "epoch": 0.7569832402234636, + "grad_norm": 1.5943529605865479, + "learning_rate": 4.845778148109429e-06, + "loss": 1.1377, + "step": 813 + }, + { + "epoch": 0.7579143389199255, + "grad_norm": 1.5340068340301514, + "learning_rate": 4.8453497642540935e-06, + "loss": 1.0761, + "step": 814 + }, + { + "epoch": 0.7588454376163873, + "grad_norm": 1.4797906875610352, + "learning_rate": 4.844920805248914e-06, + "loss": 1.0684, + "step": 815 + }, + { + "epoch": 0.7597765363128491, + "grad_norm": 1.5927391052246094, + "learning_rate": 4.844491271199083e-06, + "loss": 1.0687, + "step": 816 + }, + { + "epoch": 0.7607076350093109, + "grad_norm": 1.531162977218628, + "learning_rate": 4.844061162209937e-06, + "loss": 1.1113, + "step": 817 + }, + { + "epoch": 0.7616387337057728, + "grad_norm": 1.5677597522735596, + "learning_rate": 4.84363047838695e-06, + "loss": 1.1091, + "step": 818 + }, + { + "epoch": 0.7625698324022346, + "grad_norm": 1.511192798614502, + "learning_rate": 4.843199219835739e-06, + "loss": 1.0765, + "step": 819 + }, + { + "epoch": 0.7635009310986964, + "grad_norm": 1.5178874731063843, + "learning_rate": 4.842767386662062e-06, + "loss": 1.0829, + "step": 820 + }, + { + "epoch": 0.7644320297951583, + "grad_norm": 1.5365880727767944, + "learning_rate": 4.842334978971815e-06, + "loss": 1.0914, + "step": 821 + }, + { + "epoch": 0.7653631284916201, + "grad_norm": 1.5762591361999512, + "learning_rate": 4.8419019968710415e-06, + "loss": 1.1028, + "step": 822 + }, + { + "epoch": 0.7662942271880819, + "grad_norm": 1.5376968383789062, + "learning_rate": 4.841468440465918e-06, + "loss": 1.1049, + "step": 823 + }, + { + "epoch": 0.7672253258845437, + "grad_norm": 1.476027488708496, + "learning_rate": 4.841034309862768e-06, + "loss": 1.0825, + "step": 824 + }, + { + "epoch": 0.7681564245810056, + "grad_norm": 1.4843603372573853, + "learning_rate": 4.8405996051680505e-06, + "loss": 1.0894, + "step": 825 + }, + { + "epoch": 0.7690875232774674, + "grad_norm": 1.5072650909423828, + "learning_rate": 4.84016432648837e-06, + "loss": 1.0703, + "step": 826 + }, + { + "epoch": 0.7700186219739292, + "grad_norm": 1.519260287284851, + "learning_rate": 4.839728473930471e-06, + "loss": 1.0286, + "step": 827 + }, + { + "epoch": 0.770949720670391, + "grad_norm": 1.636559247970581, + "learning_rate": 4.839292047601234e-06, + "loss": 1.1477, + "step": 828 + }, + { + "epoch": 0.7718808193668529, + "grad_norm": 1.5612605810165405, + "learning_rate": 4.838855047607688e-06, + "loss": 1.093, + "step": 829 + }, + { + "epoch": 0.7728119180633147, + "grad_norm": 1.7009735107421875, + "learning_rate": 4.838417474056994e-06, + "loss": 1.0631, + "step": 830 + }, + { + "epoch": 0.7737430167597765, + "grad_norm": 1.5374367237091064, + "learning_rate": 4.8379793270564625e-06, + "loss": 1.0809, + "step": 831 + }, + { + "epoch": 0.7746741154562383, + "grad_norm": 1.537718653678894, + "learning_rate": 4.837540606713538e-06, + "loss": 1.0802, + "step": 832 + }, + { + "epoch": 0.7756052141527002, + "grad_norm": 1.6281205415725708, + "learning_rate": 4.837101313135807e-06, + "loss": 1.0901, + "step": 833 + }, + { + "epoch": 0.776536312849162, + "grad_norm": 1.516488790512085, + "learning_rate": 4.836661446430999e-06, + "loss": 1.0512, + "step": 834 + }, + { + "epoch": 0.7774674115456238, + "grad_norm": 1.5857281684875488, + "learning_rate": 4.836221006706982e-06, + "loss": 1.1061, + "step": 835 + }, + { + "epoch": 0.7783985102420856, + "grad_norm": 1.579249620437622, + "learning_rate": 4.8357799940717644e-06, + "loss": 1.1102, + "step": 836 + }, + { + "epoch": 0.7793296089385475, + "grad_norm": 1.4527626037597656, + "learning_rate": 4.8353384086334965e-06, + "loss": 1.1271, + "step": 837 + }, + { + "epoch": 0.7802607076350093, + "grad_norm": 1.55983304977417, + "learning_rate": 4.834896250500467e-06, + "loss": 1.097, + "step": 838 + }, + { + "epoch": 0.7811918063314711, + "grad_norm": 1.5362640619277954, + "learning_rate": 4.834453519781108e-06, + "loss": 1.0798, + "step": 839 + }, + { + "epoch": 0.7821229050279329, + "grad_norm": 1.5832512378692627, + "learning_rate": 4.83401021658399e-06, + "loss": 1.1011, + "step": 840 + }, + { + "epoch": 0.7830540037243948, + "grad_norm": 1.5125010013580322, + "learning_rate": 4.833566341017823e-06, + "loss": 1.1151, + "step": 841 + }, + { + "epoch": 0.7839851024208566, + "grad_norm": 1.5389574766159058, + "learning_rate": 4.833121893191459e-06, + "loss": 1.0744, + "step": 842 + }, + { + "epoch": 0.7849162011173184, + "grad_norm": 1.5336229801177979, + "learning_rate": 4.832676873213891e-06, + "loss": 1.0641, + "step": 843 + }, + { + "epoch": 0.7858472998137802, + "grad_norm": 1.525045394897461, + "learning_rate": 4.83223128119425e-06, + "loss": 1.1052, + "step": 844 + }, + { + "epoch": 0.7867783985102421, + "grad_norm": 1.5727417469024658, + "learning_rate": 4.831785117241809e-06, + "loss": 1.1179, + "step": 845 + }, + { + "epoch": 0.7877094972067039, + "grad_norm": 1.6233503818511963, + "learning_rate": 4.831338381465979e-06, + "loss": 1.0909, + "step": 846 + }, + { + "epoch": 0.7886405959031657, + "grad_norm": 1.517408013343811, + "learning_rate": 4.830891073976316e-06, + "loss": 1.0837, + "step": 847 + }, + { + "epoch": 0.7895716945996276, + "grad_norm": 1.5409659147262573, + "learning_rate": 4.830443194882511e-06, + "loss": 1.1028, + "step": 848 + }, + { + "epoch": 0.7905027932960894, + "grad_norm": 1.5416734218597412, + "learning_rate": 4.829994744294398e-06, + "loss": 1.0996, + "step": 849 + }, + { + "epoch": 0.7914338919925512, + "grad_norm": 1.5739552974700928, + "learning_rate": 4.82954572232195e-06, + "loss": 1.138, + "step": 850 + }, + { + "epoch": 0.792364990689013, + "grad_norm": 1.5199893712997437, + "learning_rate": 4.8290961290752825e-06, + "loss": 1.0863, + "step": 851 + }, + { + "epoch": 0.7932960893854749, + "grad_norm": 1.5755997896194458, + "learning_rate": 4.8286459646646465e-06, + "loss": 1.1103, + "step": 852 + }, + { + "epoch": 0.7942271880819367, + "grad_norm": 1.5463471412658691, + "learning_rate": 4.828195229200438e-06, + "loss": 1.0744, + "step": 853 + }, + { + "epoch": 0.7951582867783985, + "grad_norm": 1.519218921661377, + "learning_rate": 4.827743922793189e-06, + "loss": 1.0795, + "step": 854 + }, + { + "epoch": 0.7960893854748603, + "grad_norm": 1.5830116271972656, + "learning_rate": 4.827292045553574e-06, + "loss": 1.1171, + "step": 855 + }, + { + "epoch": 0.7970204841713222, + "grad_norm": 1.5137747526168823, + "learning_rate": 4.826839597592408e-06, + "loss": 1.0656, + "step": 856 + }, + { + "epoch": 0.797951582867784, + "grad_norm": 1.5891633033752441, + "learning_rate": 4.826386579020643e-06, + "loss": 1.1629, + "step": 857 + }, + { + "epoch": 0.7988826815642458, + "grad_norm": 1.5737459659576416, + "learning_rate": 4.825932989949373e-06, + "loss": 1.0798, + "step": 858 + }, + { + "epoch": 0.7998137802607076, + "grad_norm": 1.4494706392288208, + "learning_rate": 4.8254788304898335e-06, + "loss": 1.0276, + "step": 859 + }, + { + "epoch": 0.8007448789571695, + "grad_norm": 1.5734295845031738, + "learning_rate": 4.825024100753395e-06, + "loss": 1.0917, + "step": 860 + }, + { + "epoch": 0.8016759776536313, + "grad_norm": 1.5180836915969849, + "learning_rate": 4.824568800851574e-06, + "loss": 1.0714, + "step": 861 + }, + { + "epoch": 0.8026070763500931, + "grad_norm": 1.5582964420318604, + "learning_rate": 4.82411293089602e-06, + "loss": 1.1214, + "step": 862 + }, + { + "epoch": 0.8035381750465549, + "grad_norm": 1.6028590202331543, + "learning_rate": 4.823656490998529e-06, + "loss": 1.1307, + "step": 863 + }, + { + "epoch": 0.8044692737430168, + "grad_norm": 1.7554177045822144, + "learning_rate": 4.823199481271031e-06, + "loss": 1.0846, + "step": 864 + }, + { + "epoch": 0.8054003724394786, + "grad_norm": 1.5387828350067139, + "learning_rate": 4.822741901825602e-06, + "loss": 1.0349, + "step": 865 + }, + { + "epoch": 0.8063314711359404, + "grad_norm": 1.5495141744613647, + "learning_rate": 4.8222837527744514e-06, + "loss": 1.1161, + "step": 866 + }, + { + "epoch": 0.8072625698324022, + "grad_norm": 1.5196735858917236, + "learning_rate": 4.8218250342299314e-06, + "loss": 1.0591, + "step": 867 + }, + { + "epoch": 0.8081936685288641, + "grad_norm": 1.6387678384780884, + "learning_rate": 4.821365746304535e-06, + "loss": 1.0852, + "step": 868 + }, + { + "epoch": 0.8091247672253259, + "grad_norm": 1.5407074689865112, + "learning_rate": 4.8209058891108905e-06, + "loss": 1.0724, + "step": 869 + }, + { + "epoch": 0.8100558659217877, + "grad_norm": 1.6038005352020264, + "learning_rate": 4.820445462761771e-06, + "loss": 1.0933, + "step": 870 + }, + { + "epoch": 0.8109869646182495, + "grad_norm": 1.5694278478622437, + "learning_rate": 4.819984467370087e-06, + "loss": 1.077, + "step": 871 + }, + { + "epoch": 0.8119180633147114, + "grad_norm": 1.564446210861206, + "learning_rate": 4.819522903048887e-06, + "loss": 1.046, + "step": 872 + }, + { + "epoch": 0.8128491620111732, + "grad_norm": 1.5972493886947632, + "learning_rate": 4.819060769911361e-06, + "loss": 1.1444, + "step": 873 + }, + { + "epoch": 0.813780260707635, + "grad_norm": 1.5357850790023804, + "learning_rate": 4.8185980680708375e-06, + "loss": 1.1271, + "step": 874 + }, + { + "epoch": 0.8147113594040968, + "grad_norm": 1.5091062784194946, + "learning_rate": 4.818134797640785e-06, + "loss": 1.0779, + "step": 875 + }, + { + "epoch": 0.8156424581005587, + "grad_norm": 1.523606538772583, + "learning_rate": 4.817670958734812e-06, + "loss": 1.0626, + "step": 876 + }, + { + "epoch": 0.8165735567970205, + "grad_norm": 1.566107988357544, + "learning_rate": 4.8172065514666634e-06, + "loss": 1.1188, + "step": 877 + }, + { + "epoch": 0.8175046554934823, + "grad_norm": 1.4585009813308716, + "learning_rate": 4.8167415759502275e-06, + "loss": 1.0964, + "step": 878 + }, + { + "epoch": 0.8184357541899442, + "grad_norm": 1.5572326183319092, + "learning_rate": 4.8162760322995314e-06, + "loss": 1.1198, + "step": 879 + }, + { + "epoch": 0.819366852886406, + "grad_norm": 1.5219098329544067, + "learning_rate": 4.815809920628738e-06, + "loss": 1.0747, + "step": 880 + }, + { + "epoch": 0.8202979515828678, + "grad_norm": 1.5456024408340454, + "learning_rate": 4.815343241052153e-06, + "loss": 1.0782, + "step": 881 + }, + { + "epoch": 0.8212290502793296, + "grad_norm": 1.5535281896591187, + "learning_rate": 4.8148759936842196e-06, + "loss": 1.0902, + "step": 882 + }, + { + "epoch": 0.8221601489757915, + "grad_norm": 1.5116961002349854, + "learning_rate": 4.81440817863952e-06, + "loss": 1.0608, + "step": 883 + }, + { + "epoch": 0.8230912476722533, + "grad_norm": 1.5352904796600342, + "learning_rate": 4.813939796032779e-06, + "loss": 1.128, + "step": 884 + }, + { + "epoch": 0.8240223463687151, + "grad_norm": 1.5234910249710083, + "learning_rate": 4.813470845978856e-06, + "loss": 1.1008, + "step": 885 + }, + { + "epoch": 0.8249534450651769, + "grad_norm": 1.5409780740737915, + "learning_rate": 4.813001328592752e-06, + "loss": 1.0754, + "step": 886 + }, + { + "epoch": 0.8258845437616388, + "grad_norm": 1.53434419631958, + "learning_rate": 4.812531243989608e-06, + "loss": 1.1088, + "step": 887 + }, + { + "epoch": 0.8268156424581006, + "grad_norm": 1.6033190488815308, + "learning_rate": 4.8120605922847e-06, + "loss": 1.1187, + "step": 888 + }, + { + "epoch": 0.8277467411545624, + "grad_norm": 1.5325690507888794, + "learning_rate": 4.811589373593448e-06, + "loss": 1.0988, + "step": 889 + }, + { + "epoch": 0.8286778398510242, + "grad_norm": 1.487269639968872, + "learning_rate": 4.811117588031409e-06, + "loss": 1.0486, + "step": 890 + }, + { + "epoch": 0.8296089385474861, + "grad_norm": 1.545894742012024, + "learning_rate": 4.810645235714277e-06, + "loss": 1.0542, + "step": 891 + }, + { + "epoch": 0.8305400372439479, + "grad_norm": 1.5531600713729858, + "learning_rate": 4.810172316757889e-06, + "loss": 1.0589, + "step": 892 + }, + { + "epoch": 0.8314711359404097, + "grad_norm": 1.579483151435852, + "learning_rate": 4.809698831278217e-06, + "loss": 1.0866, + "step": 893 + }, + { + "epoch": 0.8324022346368715, + "grad_norm": 1.5545785427093506, + "learning_rate": 4.809224779391376e-06, + "loss": 1.1418, + "step": 894 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 1.6231085062026978, + "learning_rate": 4.808750161213615e-06, + "loss": 1.0997, + "step": 895 + }, + { + "epoch": 0.8342644320297952, + "grad_norm": 1.6763361692428589, + "learning_rate": 4.8082749768613275e-06, + "loss": 1.1184, + "step": 896 + }, + { + "epoch": 0.835195530726257, + "grad_norm": 1.5546660423278809, + "learning_rate": 4.807799226451041e-06, + "loss": 1.1144, + "step": 897 + }, + { + "epoch": 0.8361266294227188, + "grad_norm": 1.5605677366256714, + "learning_rate": 4.807322910099425e-06, + "loss": 1.081, + "step": 898 + }, + { + "epoch": 0.8370577281191807, + "grad_norm": 1.603249430656433, + "learning_rate": 4.806846027923284e-06, + "loss": 1.0923, + "step": 899 + }, + { + "epoch": 0.8379888268156425, + "grad_norm": 1.5507538318634033, + "learning_rate": 4.806368580039567e-06, + "loss": 1.0814, + "step": 900 + }, + { + "epoch": 0.8389199255121043, + "grad_norm": 1.527549386024475, + "learning_rate": 4.805890566565356e-06, + "loss": 1.0446, + "step": 901 + }, + { + "epoch": 0.839851024208566, + "grad_norm": 1.5054140090942383, + "learning_rate": 4.805411987617875e-06, + "loss": 1.092, + "step": 902 + }, + { + "epoch": 0.840782122905028, + "grad_norm": 1.5621106624603271, + "learning_rate": 4.804932843314487e-06, + "loss": 1.0731, + "step": 903 + }, + { + "epoch": 0.8417132216014898, + "grad_norm": 1.5620564222335815, + "learning_rate": 4.804453133772691e-06, + "loss": 1.1233, + "step": 904 + }, + { + "epoch": 0.8426443202979516, + "grad_norm": 1.5361804962158203, + "learning_rate": 4.803972859110126e-06, + "loss": 1.1221, + "step": 905 + }, + { + "epoch": 0.8435754189944135, + "grad_norm": 1.6729267835617065, + "learning_rate": 4.803492019444571e-06, + "loss": 1.1071, + "step": 906 + }, + { + "epoch": 0.8445065176908753, + "grad_norm": 1.5836952924728394, + "learning_rate": 4.80301061489394e-06, + "loss": 1.1474, + "step": 907 + }, + { + "epoch": 0.845437616387337, + "grad_norm": 1.5220308303833008, + "learning_rate": 4.8025286455762905e-06, + "loss": 1.056, + "step": 908 + }, + { + "epoch": 0.8463687150837989, + "grad_norm": 1.6178399324417114, + "learning_rate": 4.802046111609815e-06, + "loss": 1.079, + "step": 909 + }, + { + "epoch": 0.8472998137802608, + "grad_norm": 1.501449465751648, + "learning_rate": 4.8015630131128446e-06, + "loss": 1.0707, + "step": 910 + }, + { + "epoch": 0.8482309124767226, + "grad_norm": 1.543421745300293, + "learning_rate": 4.801079350203849e-06, + "loss": 1.1117, + "step": 911 + }, + { + "epoch": 0.8491620111731844, + "grad_norm": 1.4939581155776978, + "learning_rate": 4.800595123001439e-06, + "loss": 1.0927, + "step": 912 + }, + { + "epoch": 0.8500931098696461, + "grad_norm": 1.5124197006225586, + "learning_rate": 4.8001103316243585e-06, + "loss": 1.1018, + "step": 913 + }, + { + "epoch": 0.851024208566108, + "grad_norm": 1.535314917564392, + "learning_rate": 4.799624976191495e-06, + "loss": 1.1157, + "step": 914 + }, + { + "epoch": 0.8519553072625698, + "grad_norm": 1.6102347373962402, + "learning_rate": 4.799139056821872e-06, + "loss": 1.0903, + "step": 915 + }, + { + "epoch": 0.8528864059590316, + "grad_norm": 1.5451605319976807, + "learning_rate": 4.798652573634651e-06, + "loss": 1.1138, + "step": 916 + }, + { + "epoch": 0.8538175046554934, + "grad_norm": 1.6535327434539795, + "learning_rate": 4.798165526749132e-06, + "loss": 1.1009, + "step": 917 + }, + { + "epoch": 0.8547486033519553, + "grad_norm": 1.5381852388381958, + "learning_rate": 4.797677916284753e-06, + "loss": 1.0654, + "step": 918 + }, + { + "epoch": 0.8556797020484171, + "grad_norm": 1.5182949304580688, + "learning_rate": 4.7971897423610925e-06, + "loss": 1.1074, + "step": 919 + }, + { + "epoch": 0.8566108007448789, + "grad_norm": 1.5372347831726074, + "learning_rate": 4.796701005097863e-06, + "loss": 1.0757, + "step": 920 + }, + { + "epoch": 0.8575418994413407, + "grad_norm": 1.5097237825393677, + "learning_rate": 4.7962117046149205e-06, + "loss": 1.0554, + "step": 921 + }, + { + "epoch": 0.8584729981378026, + "grad_norm": 1.5212228298187256, + "learning_rate": 4.795721841032253e-06, + "loss": 1.0421, + "step": 922 + }, + { + "epoch": 0.8594040968342644, + "grad_norm": 1.6038963794708252, + "learning_rate": 4.795231414469991e-06, + "loss": 1.1119, + "step": 923 + }, + { + "epoch": 0.8603351955307262, + "grad_norm": 1.5137346982955933, + "learning_rate": 4.794740425048402e-06, + "loss": 1.0676, + "step": 924 + }, + { + "epoch": 0.861266294227188, + "grad_norm": 1.5880333185195923, + "learning_rate": 4.794248872887891e-06, + "loss": 1.1038, + "step": 925 + }, + { + "epoch": 0.8621973929236499, + "grad_norm": 1.553910732269287, + "learning_rate": 4.793756758109e-06, + "loss": 1.1233, + "step": 926 + }, + { + "epoch": 0.8631284916201117, + "grad_norm": 1.525766372680664, + "learning_rate": 4.793264080832414e-06, + "loss": 1.0828, + "step": 927 + }, + { + "epoch": 0.8640595903165735, + "grad_norm": 1.5134408473968506, + "learning_rate": 4.792770841178947e-06, + "loss": 1.1278, + "step": 928 + }, + { + "epoch": 0.8649906890130353, + "grad_norm": 1.5016069412231445, + "learning_rate": 4.792277039269561e-06, + "loss": 1.0933, + "step": 929 + }, + { + "epoch": 0.8659217877094972, + "grad_norm": 1.592409610748291, + "learning_rate": 4.791782675225348e-06, + "loss": 1.106, + "step": 930 + }, + { + "epoch": 0.866852886405959, + "grad_norm": 1.62725031375885, + "learning_rate": 4.791287749167541e-06, + "loss": 1.1183, + "step": 931 + }, + { + "epoch": 0.8677839851024208, + "grad_norm": 1.5325590372085571, + "learning_rate": 4.790792261217513e-06, + "loss": 1.0763, + "step": 932 + }, + { + "epoch": 0.8687150837988827, + "grad_norm": 1.4817781448364258, + "learning_rate": 4.790296211496769e-06, + "loss": 1.0604, + "step": 933 + }, + { + "epoch": 0.8696461824953445, + "grad_norm": 1.5212786197662354, + "learning_rate": 4.789799600126957e-06, + "loss": 1.0935, + "step": 934 + }, + { + "epoch": 0.8705772811918063, + "grad_norm": 1.580877661705017, + "learning_rate": 4.7893024272298615e-06, + "loss": 1.0778, + "step": 935 + }, + { + "epoch": 0.8715083798882681, + "grad_norm": 1.5551815032958984, + "learning_rate": 4.788804692927403e-06, + "loss": 1.0734, + "step": 936 + }, + { + "epoch": 0.87243947858473, + "grad_norm": 1.5052282810211182, + "learning_rate": 4.788306397341643e-06, + "loss": 1.0978, + "step": 937 + }, + { + "epoch": 0.8733705772811918, + "grad_norm": 1.5590263605117798, + "learning_rate": 4.7878075405947755e-06, + "loss": 1.0542, + "step": 938 + }, + { + "epoch": 0.8743016759776536, + "grad_norm": 1.5098881721496582, + "learning_rate": 4.787308122809137e-06, + "loss": 1.0664, + "step": 939 + }, + { + "epoch": 0.8752327746741154, + "grad_norm": 1.5522726774215698, + "learning_rate": 4.7868081441071975e-06, + "loss": 1.1083, + "step": 940 + }, + { + "epoch": 0.8761638733705773, + "grad_norm": 1.5374565124511719, + "learning_rate": 4.78630760461157e-06, + "loss": 1.0984, + "step": 941 + }, + { + "epoch": 0.8770949720670391, + "grad_norm": 1.546431303024292, + "learning_rate": 4.785806504445e-06, + "loss": 1.0553, + "step": 942 + }, + { + "epoch": 0.8780260707635009, + "grad_norm": 1.510360836982727, + "learning_rate": 4.7853048437303716e-06, + "loss": 1.0387, + "step": 943 + }, + { + "epoch": 0.8789571694599627, + "grad_norm": 1.5364389419555664, + "learning_rate": 4.784802622590707e-06, + "loss": 1.0961, + "step": 944 + }, + { + "epoch": 0.8798882681564246, + "grad_norm": 1.6120407581329346, + "learning_rate": 4.784299841149168e-06, + "loss": 1.0731, + "step": 945 + }, + { + "epoch": 0.8808193668528864, + "grad_norm": 1.5982757806777954, + "learning_rate": 4.78379649952905e-06, + "loss": 1.0507, + "step": 946 + }, + { + "epoch": 0.8817504655493482, + "grad_norm": 1.5438737869262695, + "learning_rate": 4.783292597853787e-06, + "loss": 1.0688, + "step": 947 + }, + { + "epoch": 0.88268156424581, + "grad_norm": 1.5605790615081787, + "learning_rate": 4.782788136246951e-06, + "loss": 1.137, + "step": 948 + }, + { + "epoch": 0.8836126629422719, + "grad_norm": 1.6020355224609375, + "learning_rate": 4.782283114832252e-06, + "loss": 1.0574, + "step": 949 + }, + { + "epoch": 0.8845437616387337, + "grad_norm": 1.6144306659698486, + "learning_rate": 4.781777533733535e-06, + "loss": 1.0832, + "step": 950 + }, + { + "epoch": 0.8854748603351955, + "grad_norm": 1.5760427713394165, + "learning_rate": 4.781271393074785e-06, + "loss": 1.1107, + "step": 951 + }, + { + "epoch": 0.8864059590316573, + "grad_norm": 1.572853684425354, + "learning_rate": 4.780764692980122e-06, + "loss": 1.1229, + "step": 952 + }, + { + "epoch": 0.8873370577281192, + "grad_norm": 1.629422903060913, + "learning_rate": 4.780257433573804e-06, + "loss": 1.0583, + "step": 953 + }, + { + "epoch": 0.888268156424581, + "grad_norm": 1.4656858444213867, + "learning_rate": 4.779749614980225e-06, + "loss": 1.052, + "step": 954 + }, + { + "epoch": 0.8891992551210428, + "grad_norm": 1.571095585823059, + "learning_rate": 4.77924123732392e-06, + "loss": 1.0923, + "step": 955 + }, + { + "epoch": 0.8901303538175046, + "grad_norm": 1.621502161026001, + "learning_rate": 4.7787323007295575e-06, + "loss": 1.086, + "step": 956 + }, + { + "epoch": 0.8910614525139665, + "grad_norm": 1.5676844120025635, + "learning_rate": 4.778222805321942e-06, + "loss": 1.0927, + "step": 957 + }, + { + "epoch": 0.8919925512104283, + "grad_norm": 1.539885401725769, + "learning_rate": 4.777712751226019e-06, + "loss": 1.0733, + "step": 958 + }, + { + "epoch": 0.8929236499068901, + "grad_norm": 1.4780341386795044, + "learning_rate": 4.777202138566869e-06, + "loss": 1.0664, + "step": 959 + }, + { + "epoch": 0.8938547486033519, + "grad_norm": 1.502509593963623, + "learning_rate": 4.776690967469708e-06, + "loss": 1.0859, + "step": 960 + }, + { + "epoch": 0.8947858472998138, + "grad_norm": 1.5148625373840332, + "learning_rate": 4.7761792380598916e-06, + "loss": 1.0833, + "step": 961 + }, + { + "epoch": 0.8957169459962756, + "grad_norm": 1.5759106874465942, + "learning_rate": 4.775666950462911e-06, + "loss": 1.1062, + "step": 962 + }, + { + "epoch": 0.8966480446927374, + "grad_norm": 1.485351324081421, + "learning_rate": 4.775154104804393e-06, + "loss": 1.0503, + "step": 963 + }, + { + "epoch": 0.8975791433891993, + "grad_norm": 1.509264349937439, + "learning_rate": 4.774640701210106e-06, + "loss": 1.104, + "step": 964 + }, + { + "epoch": 0.8985102420856611, + "grad_norm": 1.5137369632720947, + "learning_rate": 4.77412673980595e-06, + "loss": 1.0921, + "step": 965 + }, + { + "epoch": 0.8994413407821229, + "grad_norm": 1.502916932106018, + "learning_rate": 4.773612220717962e-06, + "loss": 1.1148, + "step": 966 + }, + { + "epoch": 0.9003724394785847, + "grad_norm": 1.5210751295089722, + "learning_rate": 4.7730971440723196e-06, + "loss": 1.0766, + "step": 967 + }, + { + "epoch": 0.9013035381750466, + "grad_norm": 1.5958471298217773, + "learning_rate": 4.7725815099953344e-06, + "loss": 1.11, + "step": 968 + }, + { + "epoch": 0.9022346368715084, + "grad_norm": 1.5454305410385132, + "learning_rate": 4.772065318613456e-06, + "loss": 1.0933, + "step": 969 + }, + { + "epoch": 0.9031657355679702, + "grad_norm": 1.5663084983825684, + "learning_rate": 4.771548570053268e-06, + "loss": 1.0882, + "step": 970 + }, + { + "epoch": 0.904096834264432, + "grad_norm": 1.4800734519958496, + "learning_rate": 4.771031264441494e-06, + "loss": 1.06, + "step": 971 + }, + { + "epoch": 0.9050279329608939, + "grad_norm": 1.574017882347107, + "learning_rate": 4.770513401904994e-06, + "loss": 1.0682, + "step": 972 + }, + { + "epoch": 0.9059590316573557, + "grad_norm": 1.4770042896270752, + "learning_rate": 4.76999498257076e-06, + "loss": 1.0528, + "step": 973 + }, + { + "epoch": 0.9068901303538175, + "grad_norm": 1.5239113569259644, + "learning_rate": 4.7694760065659275e-06, + "loss": 1.1145, + "step": 974 + }, + { + "epoch": 0.9078212290502793, + "grad_norm": 1.4944844245910645, + "learning_rate": 4.768956474017763e-06, + "loss": 1.0629, + "step": 975 + }, + { + "epoch": 0.9087523277467412, + "grad_norm": 1.534879207611084, + "learning_rate": 4.7684363850536715e-06, + "loss": 1.0944, + "step": 976 + }, + { + "epoch": 0.909683426443203, + "grad_norm": 1.5772992372512817, + "learning_rate": 4.767915739801194e-06, + "loss": 1.1354, + "step": 977 + }, + { + "epoch": 0.9106145251396648, + "grad_norm": 1.5922162532806396, + "learning_rate": 4.76739453838801e-06, + "loss": 1.0478, + "step": 978 + }, + { + "epoch": 0.9115456238361266, + "grad_norm": 1.4816398620605469, + "learning_rate": 4.766872780941933e-06, + "loss": 1.0951, + "step": 979 + }, + { + "epoch": 0.9124767225325885, + "grad_norm": 1.4929137229919434, + "learning_rate": 4.766350467590911e-06, + "loss": 1.0562, + "step": 980 + }, + { + "epoch": 0.9134078212290503, + "grad_norm": 1.577746868133545, + "learning_rate": 4.7658275984630345e-06, + "loss": 1.0923, + "step": 981 + }, + { + "epoch": 0.9143389199255121, + "grad_norm": 1.5916051864624023, + "learning_rate": 4.765304173686525e-06, + "loss": 1.0723, + "step": 982 + }, + { + "epoch": 0.9152700186219739, + "grad_norm": 1.5880569219589233, + "learning_rate": 4.764780193389741e-06, + "loss": 1.0696, + "step": 983 + }, + { + "epoch": 0.9162011173184358, + "grad_norm": 1.5580594539642334, + "learning_rate": 4.764255657701179e-06, + "loss": 1.0591, + "step": 984 + }, + { + "epoch": 0.9171322160148976, + "grad_norm": 1.6069998741149902, + "learning_rate": 4.763730566749472e-06, + "loss": 1.0753, + "step": 985 + }, + { + "epoch": 0.9180633147113594, + "grad_norm": 1.6108243465423584, + "learning_rate": 4.763204920663386e-06, + "loss": 1.0907, + "step": 986 + }, + { + "epoch": 0.9189944134078212, + "grad_norm": 1.5411450862884521, + "learning_rate": 4.762678719571826e-06, + "loss": 1.0865, + "step": 987 + }, + { + "epoch": 0.9199255121042831, + "grad_norm": 1.5383867025375366, + "learning_rate": 4.762151963603832e-06, + "loss": 1.0598, + "step": 988 + }, + { + "epoch": 0.9208566108007449, + "grad_norm": 1.537736177444458, + "learning_rate": 4.761624652888581e-06, + "loss": 1.0907, + "step": 989 + }, + { + "epoch": 0.9217877094972067, + "grad_norm": 1.5310419797897339, + "learning_rate": 4.761096787555385e-06, + "loss": 1.1208, + "step": 990 + }, + { + "epoch": 0.9227188081936686, + "grad_norm": 1.5367666482925415, + "learning_rate": 4.760568367733691e-06, + "loss": 1.1241, + "step": 991 + }, + { + "epoch": 0.9236499068901304, + "grad_norm": 1.493673324584961, + "learning_rate": 4.7600393935530865e-06, + "loss": 1.0776, + "step": 992 + }, + { + "epoch": 0.9245810055865922, + "grad_norm": 1.5405055284500122, + "learning_rate": 4.759509865143289e-06, + "loss": 1.1081, + "step": 993 + }, + { + "epoch": 0.925512104283054, + "grad_norm": 1.518207311630249, + "learning_rate": 4.758979782634155e-06, + "loss": 1.0798, + "step": 994 + }, + { + "epoch": 0.9264432029795159, + "grad_norm": 1.4959720373153687, + "learning_rate": 4.758449146155677e-06, + "loss": 1.0789, + "step": 995 + }, + { + "epoch": 0.9273743016759777, + "grad_norm": 1.509627103805542, + "learning_rate": 4.757917955837984e-06, + "loss": 1.1218, + "step": 996 + }, + { + "epoch": 0.9283054003724395, + "grad_norm": 1.5125606060028076, + "learning_rate": 4.757386211811338e-06, + "loss": 1.0951, + "step": 997 + }, + { + "epoch": 0.9292364990689013, + "grad_norm": 1.458593726158142, + "learning_rate": 4.7568539142061395e-06, + "loss": 1.0851, + "step": 998 + }, + { + "epoch": 0.9301675977653632, + "grad_norm": 1.560383677482605, + "learning_rate": 4.756321063152924e-06, + "loss": 1.0952, + "step": 999 + }, + { + "epoch": 0.931098696461825, + "grad_norm": 1.5245909690856934, + "learning_rate": 4.755787658782361e-06, + "loss": 1.0847, + "step": 1000 + }, + { + "epoch": 0.9320297951582868, + "grad_norm": 1.5548995733261108, + "learning_rate": 4.755253701225259e-06, + "loss": 1.0239, + "step": 1001 + }, + { + "epoch": 0.9329608938547486, + "grad_norm": 1.5523535013198853, + "learning_rate": 4.754719190612559e-06, + "loss": 1.1271, + "step": 1002 + }, + { + "epoch": 0.9338919925512105, + "grad_norm": 1.4968879222869873, + "learning_rate": 4.75418412707534e-06, + "loss": 1.0334, + "step": 1003 + }, + { + "epoch": 0.9348230912476723, + "grad_norm": 1.5370330810546875, + "learning_rate": 4.753648510744815e-06, + "loss": 1.0667, + "step": 1004 + }, + { + "epoch": 0.9357541899441341, + "grad_norm": 1.5096780061721802, + "learning_rate": 4.753112341752333e-06, + "loss": 1.0667, + "step": 1005 + }, + { + "epoch": 0.9366852886405959, + "grad_norm": 1.585119366645813, + "learning_rate": 4.752575620229379e-06, + "loss": 1.0813, + "step": 1006 + }, + { + "epoch": 0.9376163873370578, + "grad_norm": 1.5708082914352417, + "learning_rate": 4.752038346307573e-06, + "loss": 1.0632, + "step": 1007 + }, + { + "epoch": 0.9385474860335196, + "grad_norm": 1.6266708374023438, + "learning_rate": 4.751500520118671e-06, + "loss": 1.1227, + "step": 1008 + }, + { + "epoch": 0.9394785847299814, + "grad_norm": 1.5830202102661133, + "learning_rate": 4.750962141794565e-06, + "loss": 1.0873, + "step": 1009 + }, + { + "epoch": 0.9404096834264432, + "grad_norm": 1.624855399131775, + "learning_rate": 4.750423211467278e-06, + "loss": 1.0675, + "step": 1010 + }, + { + "epoch": 0.9413407821229051, + "grad_norm": 1.5412731170654297, + "learning_rate": 4.749883729268975e-06, + "loss": 1.0973, + "step": 1011 + }, + { + "epoch": 0.9422718808193669, + "grad_norm": 1.4659055471420288, + "learning_rate": 4.749343695331952e-06, + "loss": 1.0657, + "step": 1012 + }, + { + "epoch": 0.9432029795158287, + "grad_norm": 1.5228955745697021, + "learning_rate": 4.748803109788642e-06, + "loss": 1.042, + "step": 1013 + }, + { + "epoch": 0.9441340782122905, + "grad_norm": 1.5122032165527344, + "learning_rate": 4.748261972771612e-06, + "loss": 1.0389, + "step": 1014 + }, + { + "epoch": 0.9450651769087524, + "grad_norm": 1.5943773984909058, + "learning_rate": 4.747720284413565e-06, + "loss": 1.1077, + "step": 1015 + }, + { + "epoch": 0.9459962756052142, + "grad_norm": 1.5315780639648438, + "learning_rate": 4.74717804484734e-06, + "loss": 1.0354, + "step": 1016 + }, + { + "epoch": 0.946927374301676, + "grad_norm": 1.5775771141052246, + "learning_rate": 4.74663525420591e-06, + "loss": 1.0711, + "step": 1017 + }, + { + "epoch": 0.9478584729981379, + "grad_norm": 1.4658482074737549, + "learning_rate": 4.7460919126223825e-06, + "loss": 1.0556, + "step": 1018 + }, + { + "epoch": 0.9487895716945997, + "grad_norm": 1.5183653831481934, + "learning_rate": 4.745548020230003e-06, + "loss": 1.1033, + "step": 1019 + }, + { + "epoch": 0.9497206703910615, + "grad_norm": 1.5640509128570557, + "learning_rate": 4.745003577162148e-06, + "loss": 1.0281, + "step": 1020 + }, + { + "epoch": 0.9506517690875232, + "grad_norm": 1.539595603942871, + "learning_rate": 4.7444585835523335e-06, + "loss": 1.0728, + "step": 1021 + }, + { + "epoch": 0.9515828677839852, + "grad_norm": 1.4784270524978638, + "learning_rate": 4.743913039534206e-06, + "loss": 1.1092, + "step": 1022 + }, + { + "epoch": 0.952513966480447, + "grad_norm": 1.55125892162323, + "learning_rate": 4.74336694524155e-06, + "loss": 1.0749, + "step": 1023 + }, + { + "epoch": 0.9534450651769087, + "grad_norm": 1.5337117910385132, + "learning_rate": 4.7428203008082855e-06, + "loss": 1.0731, + "step": 1024 + }, + { + "epoch": 0.9543761638733705, + "grad_norm": 1.5211478471755981, + "learning_rate": 4.742273106368464e-06, + "loss": 1.0597, + "step": 1025 + }, + { + "epoch": 0.9553072625698324, + "grad_norm": 1.4872453212738037, + "learning_rate": 4.741725362056275e-06, + "loss": 1.0439, + "step": 1026 + }, + { + "epoch": 0.9562383612662942, + "grad_norm": 1.5244958400726318, + "learning_rate": 4.741177068006042e-06, + "loss": 1.0959, + "step": 1027 + }, + { + "epoch": 0.957169459962756, + "grad_norm": 1.8209069967269897, + "learning_rate": 4.7406282243522224e-06, + "loss": 1.1022, + "step": 1028 + }, + { + "epoch": 0.9581005586592178, + "grad_norm": 1.5457854270935059, + "learning_rate": 4.740078831229408e-06, + "loss": 1.0691, + "step": 1029 + }, + { + "epoch": 0.9590316573556797, + "grad_norm": 1.4670771360397339, + "learning_rate": 4.7395288887723304e-06, + "loss": 1.0195, + "step": 1030 + }, + { + "epoch": 0.9599627560521415, + "grad_norm": 1.5621514320373535, + "learning_rate": 4.738978397115848e-06, + "loss": 1.0913, + "step": 1031 + }, + { + "epoch": 0.9608938547486033, + "grad_norm": 1.5232383012771606, + "learning_rate": 4.738427356394959e-06, + "loss": 1.0054, + "step": 1032 + }, + { + "epoch": 0.9618249534450651, + "grad_norm": 1.5279157161712646, + "learning_rate": 4.737875766744795e-06, + "loss": 1.0832, + "step": 1033 + }, + { + "epoch": 0.962756052141527, + "grad_norm": 1.6728878021240234, + "learning_rate": 4.737323628300622e-06, + "loss": 1.121, + "step": 1034 + }, + { + "epoch": 0.9636871508379888, + "grad_norm": 1.5027371644973755, + "learning_rate": 4.736770941197842e-06, + "loss": 1.0849, + "step": 1035 + }, + { + "epoch": 0.9646182495344506, + "grad_norm": 1.6248314380645752, + "learning_rate": 4.736217705571989e-06, + "loss": 1.0889, + "step": 1036 + }, + { + "epoch": 0.9655493482309124, + "grad_norm": 1.536523699760437, + "learning_rate": 4.735663921558734e-06, + "loss": 1.0619, + "step": 1037 + }, + { + "epoch": 0.9664804469273743, + "grad_norm": 1.6086184978485107, + "learning_rate": 4.735109589293881e-06, + "loss": 1.1171, + "step": 1038 + }, + { + "epoch": 0.9674115456238361, + "grad_norm": 1.5546424388885498, + "learning_rate": 4.734554708913368e-06, + "loss": 1.0684, + "step": 1039 + }, + { + "epoch": 0.9683426443202979, + "grad_norm": 1.589662790298462, + "learning_rate": 4.73399928055327e-06, + "loss": 1.1086, + "step": 1040 + }, + { + "epoch": 0.9692737430167597, + "grad_norm": 1.4940744638442993, + "learning_rate": 4.733443304349793e-06, + "loss": 1.0359, + "step": 1041 + }, + { + "epoch": 0.9702048417132216, + "grad_norm": 1.6239373683929443, + "learning_rate": 4.7328867804392805e-06, + "loss": 1.0987, + "step": 1042 + }, + { + "epoch": 0.9711359404096834, + "grad_norm": 1.4926732778549194, + "learning_rate": 4.732329708958208e-06, + "loss": 1.0538, + "step": 1043 + }, + { + "epoch": 0.9720670391061452, + "grad_norm": 1.48908269405365, + "learning_rate": 4.731772090043184e-06, + "loss": 1.0165, + "step": 1044 + }, + { + "epoch": 0.972998137802607, + "grad_norm": 1.4550187587738037, + "learning_rate": 4.7312139238309574e-06, + "loss": 1.0609, + "step": 1045 + }, + { + "epoch": 0.9739292364990689, + "grad_norm": 1.53258216381073, + "learning_rate": 4.730655210458404e-06, + "loss": 1.0263, + "step": 1046 + }, + { + "epoch": 0.9748603351955307, + "grad_norm": 1.5673062801361084, + "learning_rate": 4.730095950062539e-06, + "loss": 1.0963, + "step": 1047 + }, + { + "epoch": 0.9757914338919925, + "grad_norm": 1.6262962818145752, + "learning_rate": 4.729536142780507e-06, + "loss": 1.0938, + "step": 1048 + }, + { + "epoch": 0.9767225325884544, + "grad_norm": 1.5321420431137085, + "learning_rate": 4.7289757887495935e-06, + "loss": 1.0464, + "step": 1049 + }, + { + "epoch": 0.9776536312849162, + "grad_norm": 1.6310549974441528, + "learning_rate": 4.728414888107211e-06, + "loss": 1.0772, + "step": 1050 + }, + { + "epoch": 0.978584729981378, + "grad_norm": 1.585007667541504, + "learning_rate": 4.7278534409909106e-06, + "loss": 1.0536, + "step": 1051 + }, + { + "epoch": 0.9795158286778398, + "grad_norm": 1.567671298980713, + "learning_rate": 4.727291447538375e-06, + "loss": 1.0715, + "step": 1052 + }, + { + "epoch": 0.9804469273743017, + "grad_norm": 1.5334957838058472, + "learning_rate": 4.726728907887422e-06, + "loss": 1.0591, + "step": 1053 + }, + { + "epoch": 0.9813780260707635, + "grad_norm": 1.603265643119812, + "learning_rate": 4.726165822176003e-06, + "loss": 1.1033, + "step": 1054 + }, + { + "epoch": 0.9823091247672253, + "grad_norm": 1.4996964931488037, + "learning_rate": 4.725602190542204e-06, + "loss": 1.0485, + "step": 1055 + }, + { + "epoch": 0.9832402234636871, + "grad_norm": 1.5344486236572266, + "learning_rate": 4.725038013124245e-06, + "loss": 1.0936, + "step": 1056 + }, + { + "epoch": 0.984171322160149, + "grad_norm": 1.5993379354476929, + "learning_rate": 4.724473290060477e-06, + "loss": 1.0931, + "step": 1057 + }, + { + "epoch": 0.9851024208566108, + "grad_norm": 1.5253984928131104, + "learning_rate": 4.7239080214893885e-06, + "loss": 1.1228, + "step": 1058 + }, + { + "epoch": 0.9860335195530726, + "grad_norm": 1.5465527772903442, + "learning_rate": 4.7233422075496e-06, + "loss": 1.0769, + "step": 1059 + }, + { + "epoch": 0.9869646182495344, + "grad_norm": 1.5549031496047974, + "learning_rate": 4.722775848379866e-06, + "loss": 1.0596, + "step": 1060 + }, + { + "epoch": 0.9878957169459963, + "grad_norm": 1.5188418626785278, + "learning_rate": 4.722208944119075e-06, + "loss": 1.082, + "step": 1061 + }, + { + "epoch": 0.9888268156424581, + "grad_norm": 1.4955778121948242, + "learning_rate": 4.721641494906247e-06, + "loss": 1.0732, + "step": 1062 + }, + { + "epoch": 0.9897579143389199, + "grad_norm": 1.6066385507583618, + "learning_rate": 4.7210735008805395e-06, + "loss": 1.098, + "step": 1063 + }, + { + "epoch": 0.9906890130353817, + "grad_norm": 1.5475724935531616, + "learning_rate": 4.720504962181241e-06, + "loss": 1.0925, + "step": 1064 + }, + { + "epoch": 0.9916201117318436, + "grad_norm": 1.5275098085403442, + "learning_rate": 4.719935878947775e-06, + "loss": 1.1194, + "step": 1065 + }, + { + "epoch": 0.9925512104283054, + "grad_norm": 1.5309367179870605, + "learning_rate": 4.719366251319696e-06, + "loss": 1.0911, + "step": 1066 + }, + { + "epoch": 0.9934823091247672, + "grad_norm": 1.573983907699585, + "learning_rate": 4.718796079436696e-06, + "loss": 1.0641, + "step": 1067 + }, + { + "epoch": 0.994413407821229, + "grad_norm": 1.5268535614013672, + "learning_rate": 4.718225363438595e-06, + "loss": 1.079, + "step": 1068 + }, + { + "epoch": 0.9953445065176909, + "grad_norm": 1.4826828241348267, + "learning_rate": 4.717654103465354e-06, + "loss": 1.0295, + "step": 1069 + }, + { + "epoch": 0.9962756052141527, + "grad_norm": 1.4942678213119507, + "learning_rate": 4.717082299657058e-06, + "loss": 1.0894, + "step": 1070 + }, + { + "epoch": 0.9972067039106145, + "grad_norm": 1.5466221570968628, + "learning_rate": 4.716509952153934e-06, + "loss": 1.0756, + "step": 1071 + }, + { + "epoch": 0.9981378026070763, + "grad_norm": 1.5310014486312866, + "learning_rate": 4.715937061096337e-06, + "loss": 1.099, + "step": 1072 + }, + { + "epoch": 0.9990689013035382, + "grad_norm": 1.5186617374420166, + "learning_rate": 4.7153636266247586e-06, + "loss": 1.1122, + "step": 1073 + }, + { + "epoch": 1.0, + "grad_norm": 1.5495269298553467, + "learning_rate": 4.71478964887982e-06, + "loss": 1.1152, + "step": 1074 + }, + { + "epoch": 1.000931098696462, + "grad_norm": 1.487399935722351, + "learning_rate": 4.714215128002279e-06, + "loss": 1.049, + "step": 1075 + }, + { + "epoch": 1.0018621973929236, + "grad_norm": 1.4866782426834106, + "learning_rate": 4.7136400641330245e-06, + "loss": 1.0436, + "step": 1076 + }, + { + "epoch": 1.0027932960893855, + "grad_norm": 1.4660440683364868, + "learning_rate": 4.713064457413081e-06, + "loss": 1.0009, + "step": 1077 + }, + { + "epoch": 1.0037243947858474, + "grad_norm": 1.575087070465088, + "learning_rate": 4.712488307983603e-06, + "loss": 1.0632, + "step": 1078 + }, + { + "epoch": 1.004655493482309, + "grad_norm": 1.6018379926681519, + "learning_rate": 4.7119116159858795e-06, + "loss": 1.0791, + "step": 1079 + }, + { + "epoch": 1.005586592178771, + "grad_norm": 1.611703634262085, + "learning_rate": 4.711334381561333e-06, + "loss": 1.0551, + "step": 1080 + }, + { + "epoch": 1.0065176908752327, + "grad_norm": 1.5527249574661255, + "learning_rate": 4.710756604851519e-06, + "loss": 1.0131, + "step": 1081 + }, + { + "epoch": 1.0074487895716946, + "grad_norm": 1.5393624305725098, + "learning_rate": 4.710178285998125e-06, + "loss": 1.0383, + "step": 1082 + }, + { + "epoch": 1.0083798882681565, + "grad_norm": 1.5630712509155273, + "learning_rate": 4.709599425142973e-06, + "loss": 1.0142, + "step": 1083 + }, + { + "epoch": 1.0093109869646182, + "grad_norm": 1.448293924331665, + "learning_rate": 4.709020022428016e-06, + "loss": 0.9501, + "step": 1084 + }, + { + "epoch": 1.01024208566108, + "grad_norm": 1.540922999382019, + "learning_rate": 4.70844007799534e-06, + "loss": 1.0014, + "step": 1085 + }, + { + "epoch": 1.011173184357542, + "grad_norm": 1.4855278730392456, + "learning_rate": 4.707859591987167e-06, + "loss": 1.0071, + "step": 1086 + }, + { + "epoch": 1.0121042830540037, + "grad_norm": 1.5268491506576538, + "learning_rate": 4.707278564545849e-06, + "loss": 1.0471, + "step": 1087 + }, + { + "epoch": 1.0130353817504656, + "grad_norm": 1.5671162605285645, + "learning_rate": 4.706696995813869e-06, + "loss": 1.0394, + "step": 1088 + }, + { + "epoch": 1.0139664804469273, + "grad_norm": 1.5690945386886597, + "learning_rate": 4.706114885933847e-06, + "loss": 1.0595, + "step": 1089 + }, + { + "epoch": 1.0148975791433892, + "grad_norm": 1.4803555011749268, + "learning_rate": 4.705532235048534e-06, + "loss": 1.0331, + "step": 1090 + }, + { + "epoch": 1.015828677839851, + "grad_norm": 1.6176484823226929, + "learning_rate": 4.7049490433008125e-06, + "loss": 0.9892, + "step": 1091 + }, + { + "epoch": 1.0167597765363128, + "grad_norm": 1.5290471315383911, + "learning_rate": 4.7043653108337e-06, + "loss": 0.9948, + "step": 1092 + }, + { + "epoch": 1.0176908752327747, + "grad_norm": 1.6027026176452637, + "learning_rate": 4.703781037790342e-06, + "loss": 1.0675, + "step": 1093 + }, + { + "epoch": 1.0186219739292366, + "grad_norm": 1.58138108253479, + "learning_rate": 4.703196224314023e-06, + "loss": 1.0456, + "step": 1094 + }, + { + "epoch": 1.0195530726256983, + "grad_norm": 1.620977520942688, + "learning_rate": 4.702610870548155e-06, + "loss": 1.0387, + "step": 1095 + }, + { + "epoch": 1.0204841713221602, + "grad_norm": 1.5576655864715576, + "learning_rate": 4.702024976636286e-06, + "loss": 1.0722, + "step": 1096 + }, + { + "epoch": 1.0214152700186219, + "grad_norm": 1.4967273473739624, + "learning_rate": 4.701438542722092e-06, + "loss": 1.0387, + "step": 1097 + }, + { + "epoch": 1.0223463687150838, + "grad_norm": 1.5341986417770386, + "learning_rate": 4.700851568949386e-06, + "loss": 1.0523, + "step": 1098 + }, + { + "epoch": 1.0232774674115457, + "grad_norm": 1.54250168800354, + "learning_rate": 4.70026405546211e-06, + "loss": 0.9612, + "step": 1099 + }, + { + "epoch": 1.0242085661080074, + "grad_norm": 1.5784211158752441, + "learning_rate": 4.699676002404342e-06, + "loss": 1.0184, + "step": 1100 + }, + { + "epoch": 1.0251396648044693, + "grad_norm": 1.5577175617218018, + "learning_rate": 4.699087409920289e-06, + "loss": 1.0436, + "step": 1101 + }, + { + "epoch": 1.0260707635009312, + "grad_norm": 1.5604437589645386, + "learning_rate": 4.698498278154291e-06, + "loss": 1.02, + "step": 1102 + }, + { + "epoch": 1.0270018621973929, + "grad_norm": 1.577594518661499, + "learning_rate": 4.697908607250822e-06, + "loss": 1.0243, + "step": 1103 + }, + { + "epoch": 1.0279329608938548, + "grad_norm": 1.5273542404174805, + "learning_rate": 4.6973183973544854e-06, + "loss": 1.0192, + "step": 1104 + }, + { + "epoch": 1.0288640595903167, + "grad_norm": 1.6009721755981445, + "learning_rate": 4.69672764861002e-06, + "loss": 1.0125, + "step": 1105 + }, + { + "epoch": 1.0297951582867784, + "grad_norm": 1.5656324625015259, + "learning_rate": 4.696136361162293e-06, + "loss": 1.0289, + "step": 1106 + }, + { + "epoch": 1.0307262569832403, + "grad_norm": 1.5725963115692139, + "learning_rate": 4.695544535156308e-06, + "loss": 1.0008, + "step": 1107 + }, + { + "epoch": 1.031657355679702, + "grad_norm": 1.5703850984573364, + "learning_rate": 4.694952170737197e-06, + "loss": 1.0054, + "step": 1108 + }, + { + "epoch": 1.0325884543761639, + "grad_norm": 1.5812164545059204, + "learning_rate": 4.694359268050225e-06, + "loss": 1.0616, + "step": 1109 + }, + { + "epoch": 1.0335195530726258, + "grad_norm": 1.4778999090194702, + "learning_rate": 4.693765827240791e-06, + "loss": 0.9779, + "step": 1110 + }, + { + "epoch": 1.0344506517690875, + "grad_norm": 1.5594961643218994, + "learning_rate": 4.693171848454423e-06, + "loss": 1.0284, + "step": 1111 + }, + { + "epoch": 1.0353817504655494, + "grad_norm": 1.4904820919036865, + "learning_rate": 4.692577331836784e-06, + "loss": 0.996, + "step": 1112 + }, + { + "epoch": 1.0363128491620113, + "grad_norm": 1.5186986923217773, + "learning_rate": 4.691982277533665e-06, + "loss": 1.0641, + "step": 1113 + }, + { + "epoch": 1.037243947858473, + "grad_norm": 1.5642173290252686, + "learning_rate": 4.691386685690993e-06, + "loss": 1.0516, + "step": 1114 + }, + { + "epoch": 1.0381750465549349, + "grad_norm": 1.5472195148468018, + "learning_rate": 4.690790556454824e-06, + "loss": 1.0251, + "step": 1115 + }, + { + "epoch": 1.0391061452513966, + "grad_norm": 1.57706880569458, + "learning_rate": 4.690193889971346e-06, + "loss": 1.0486, + "step": 1116 + }, + { + "epoch": 1.0400372439478585, + "grad_norm": 1.5180350542068481, + "learning_rate": 4.689596686386882e-06, + "loss": 1.0376, + "step": 1117 + }, + { + "epoch": 1.0409683426443204, + "grad_norm": 1.584374189376831, + "learning_rate": 4.688998945847881e-06, + "loss": 0.9991, + "step": 1118 + }, + { + "epoch": 1.041899441340782, + "grad_norm": 1.5857142210006714, + "learning_rate": 4.6884006685009295e-06, + "loss": 0.995, + "step": 1119 + }, + { + "epoch": 1.042830540037244, + "grad_norm": 1.5227071046829224, + "learning_rate": 4.6878018544927415e-06, + "loss": 1.0097, + "step": 1120 + }, + { + "epoch": 1.0437616387337059, + "grad_norm": 1.551161289215088, + "learning_rate": 4.687202503970165e-06, + "loss": 1.0228, + "step": 1121 + }, + { + "epoch": 1.0446927374301676, + "grad_norm": 1.5544378757476807, + "learning_rate": 4.686602617080177e-06, + "loss": 1.0527, + "step": 1122 + }, + { + "epoch": 1.0456238361266295, + "grad_norm": 1.5288114547729492, + "learning_rate": 4.68600219396989e-06, + "loss": 1.006, + "step": 1123 + }, + { + "epoch": 1.0465549348230911, + "grad_norm": 1.5617018938064575, + "learning_rate": 4.685401234786544e-06, + "loss": 1.0623, + "step": 1124 + }, + { + "epoch": 1.047486033519553, + "grad_norm": 1.4837076663970947, + "learning_rate": 4.6847997396775125e-06, + "loss": 0.9941, + "step": 1125 + }, + { + "epoch": 1.048417132216015, + "grad_norm": 1.5589977502822876, + "learning_rate": 4.6841977087903e-06, + "loss": 1.0266, + "step": 1126 + }, + { + "epoch": 1.0493482309124766, + "grad_norm": 1.542898178100586, + "learning_rate": 4.683595142272544e-06, + "loss": 1.0045, + "step": 1127 + }, + { + "epoch": 1.0502793296089385, + "grad_norm": 1.5327948331832886, + "learning_rate": 4.682992040272008e-06, + "loss": 0.9836, + "step": 1128 + }, + { + "epoch": 1.0512104283054005, + "grad_norm": 1.5851460695266724, + "learning_rate": 4.682388402936595e-06, + "loss": 1.0505, + "step": 1129 + }, + { + "epoch": 1.0521415270018621, + "grad_norm": 1.578179121017456, + "learning_rate": 4.6817842304143325e-06, + "loss": 1.0409, + "step": 1130 + }, + { + "epoch": 1.053072625698324, + "grad_norm": 1.5836224555969238, + "learning_rate": 4.681179522853383e-06, + "loss": 1.0505, + "step": 1131 + }, + { + "epoch": 1.0540037243947857, + "grad_norm": 1.5516797304153442, + "learning_rate": 4.680574280402037e-06, + "loss": 0.9993, + "step": 1132 + }, + { + "epoch": 1.0549348230912476, + "grad_norm": 1.5390299558639526, + "learning_rate": 4.67996850320872e-06, + "loss": 0.9907, + "step": 1133 + }, + { + "epoch": 1.0558659217877095, + "grad_norm": 1.6459771394729614, + "learning_rate": 4.679362191421984e-06, + "loss": 1.0276, + "step": 1134 + }, + { + "epoch": 1.0567970204841712, + "grad_norm": 1.6367114782333374, + "learning_rate": 4.678755345190517e-06, + "loss": 1.0565, + "step": 1135 + }, + { + "epoch": 1.0577281191806331, + "grad_norm": 1.5795568227767944, + "learning_rate": 4.678147964663137e-06, + "loss": 0.9907, + "step": 1136 + }, + { + "epoch": 1.058659217877095, + "grad_norm": 1.6172724962234497, + "learning_rate": 4.6775400499887894e-06, + "loss": 1.0417, + "step": 1137 + }, + { + "epoch": 1.0595903165735567, + "grad_norm": 1.5257259607315063, + "learning_rate": 4.676931601316553e-06, + "loss": 1.0324, + "step": 1138 + }, + { + "epoch": 1.0605214152700186, + "grad_norm": 1.5546989440917969, + "learning_rate": 4.67632261879564e-06, + "loss": 1.0049, + "step": 1139 + }, + { + "epoch": 1.0614525139664805, + "grad_norm": 1.5146756172180176, + "learning_rate": 4.675713102575389e-06, + "loss": 1.0025, + "step": 1140 + }, + { + "epoch": 1.0623836126629422, + "grad_norm": 1.5709218978881836, + "learning_rate": 4.675103052805271e-06, + "loss": 1.027, + "step": 1141 + }, + { + "epoch": 1.0633147113594041, + "grad_norm": 1.533260703086853, + "learning_rate": 4.6744924696348906e-06, + "loss": 1.0348, + "step": 1142 + }, + { + "epoch": 1.0642458100558658, + "grad_norm": 1.5131194591522217, + "learning_rate": 4.67388135321398e-06, + "loss": 1.0271, + "step": 1143 + }, + { + "epoch": 1.0651769087523277, + "grad_norm": 1.5011087656021118, + "learning_rate": 4.673269703692403e-06, + "loss": 1.0093, + "step": 1144 + }, + { + "epoch": 1.0661080074487896, + "grad_norm": 1.5335296392440796, + "learning_rate": 4.672657521220155e-06, + "loss": 1.0285, + "step": 1145 + }, + { + "epoch": 1.0670391061452513, + "grad_norm": 1.5683685541152954, + "learning_rate": 4.67204480594736e-06, + "loss": 1.0436, + "step": 1146 + }, + { + "epoch": 1.0679702048417132, + "grad_norm": 1.5955976247787476, + "learning_rate": 4.671431558024276e-06, + "loss": 1.0619, + "step": 1147 + }, + { + "epoch": 1.0689013035381751, + "grad_norm": 1.5921692848205566, + "learning_rate": 4.670817777601289e-06, + "loss": 1.0398, + "step": 1148 + }, + { + "epoch": 1.0698324022346368, + "grad_norm": 1.5250576734542847, + "learning_rate": 4.670203464828915e-06, + "loss": 1.0437, + "step": 1149 + }, + { + "epoch": 1.0707635009310987, + "grad_norm": 1.5908156633377075, + "learning_rate": 4.669588619857804e-06, + "loss": 1.0793, + "step": 1150 + }, + { + "epoch": 1.0716945996275604, + "grad_norm": 1.577242136001587, + "learning_rate": 4.668973242838733e-06, + "loss": 1.0361, + "step": 1151 + }, + { + "epoch": 1.0726256983240223, + "grad_norm": 1.5931251049041748, + "learning_rate": 4.6683573339226105e-06, + "loss": 1.0279, + "step": 1152 + }, + { + "epoch": 1.0735567970204842, + "grad_norm": 1.6169497966766357, + "learning_rate": 4.667740893260477e-06, + "loss": 1.0306, + "step": 1153 + }, + { + "epoch": 1.074487895716946, + "grad_norm": 1.531818151473999, + "learning_rate": 4.667123921003502e-06, + "loss": 0.9994, + "step": 1154 + }, + { + "epoch": 1.0754189944134078, + "grad_norm": 1.6143760681152344, + "learning_rate": 4.6665064173029845e-06, + "loss": 1.0164, + "step": 1155 + }, + { + "epoch": 1.0763500931098697, + "grad_norm": 1.6116831302642822, + "learning_rate": 4.6658883823103555e-06, + "loss": 1.0356, + "step": 1156 + }, + { + "epoch": 1.0772811918063314, + "grad_norm": 1.5921729803085327, + "learning_rate": 4.665269816177176e-06, + "loss": 1.0267, + "step": 1157 + }, + { + "epoch": 1.0782122905027933, + "grad_norm": 1.5189334154129028, + "learning_rate": 4.664650719055136e-06, + "loss": 1.0014, + "step": 1158 + }, + { + "epoch": 1.0791433891992552, + "grad_norm": 1.5935183763504028, + "learning_rate": 4.664031091096058e-06, + "loss": 1.0025, + "step": 1159 + }, + { + "epoch": 1.080074487895717, + "grad_norm": 1.50205659866333, + "learning_rate": 4.663410932451892e-06, + "loss": 0.9977, + "step": 1160 + }, + { + "epoch": 1.0810055865921788, + "grad_norm": 1.5565632581710815, + "learning_rate": 4.66279024327472e-06, + "loss": 1.0247, + "step": 1161 + }, + { + "epoch": 1.0819366852886405, + "grad_norm": 1.5481642484664917, + "learning_rate": 4.6621690237167525e-06, + "loss": 1.023, + "step": 1162 + }, + { + "epoch": 1.0828677839851024, + "grad_norm": 1.5725595951080322, + "learning_rate": 4.661547273930333e-06, + "loss": 1.0354, + "step": 1163 + }, + { + "epoch": 1.0837988826815643, + "grad_norm": 1.5911078453063965, + "learning_rate": 4.6609249940679316e-06, + "loss": 0.9993, + "step": 1164 + }, + { + "epoch": 1.084729981378026, + "grad_norm": 1.5475032329559326, + "learning_rate": 4.6603021842821504e-06, + "loss": 1.012, + "step": 1165 + }, + { + "epoch": 1.085661080074488, + "grad_norm": 1.6458382606506348, + "learning_rate": 4.659678844725722e-06, + "loss": 1.0992, + "step": 1166 + }, + { + "epoch": 1.0865921787709498, + "grad_norm": 1.4876608848571777, + "learning_rate": 4.6590549755515055e-06, + "loss": 1.0253, + "step": 1167 + }, + { + "epoch": 1.0875232774674115, + "grad_norm": 1.5520896911621094, + "learning_rate": 4.658430576912495e-06, + "loss": 1.0218, + "step": 1168 + }, + { + "epoch": 1.0884543761638734, + "grad_norm": 1.4791535139083862, + "learning_rate": 4.657805648961809e-06, + "loss": 1.0252, + "step": 1169 + }, + { + "epoch": 1.089385474860335, + "grad_norm": 1.5296577215194702, + "learning_rate": 4.657180191852701e-06, + "loss": 1.0488, + "step": 1170 + }, + { + "epoch": 1.090316573556797, + "grad_norm": 1.541603922843933, + "learning_rate": 4.65655420573855e-06, + "loss": 1.0759, + "step": 1171 + }, + { + "epoch": 1.091247672253259, + "grad_norm": 1.632855772972107, + "learning_rate": 4.655927690772868e-06, + "loss": 1.0103, + "step": 1172 + }, + { + "epoch": 1.0921787709497206, + "grad_norm": 1.6715481281280518, + "learning_rate": 4.655300647109293e-06, + "loss": 1.041, + "step": 1173 + }, + { + "epoch": 1.0931098696461825, + "grad_norm": 1.5514028072357178, + "learning_rate": 4.654673074901596e-06, + "loss": 1.0283, + "step": 1174 + }, + { + "epoch": 1.0940409683426444, + "grad_norm": 1.5209184885025024, + "learning_rate": 4.654044974303679e-06, + "loss": 1.0072, + "step": 1175 + }, + { + "epoch": 1.094972067039106, + "grad_norm": 1.562577247619629, + "learning_rate": 4.653416345469567e-06, + "loss": 1.0396, + "step": 1176 + }, + { + "epoch": 1.095903165735568, + "grad_norm": 1.6706187725067139, + "learning_rate": 4.65278718855342e-06, + "loss": 1.046, + "step": 1177 + }, + { + "epoch": 1.0968342644320297, + "grad_norm": 1.4715631008148193, + "learning_rate": 4.652157503709527e-06, + "loss": 1.007, + "step": 1178 + }, + { + "epoch": 1.0977653631284916, + "grad_norm": 1.5649617910385132, + "learning_rate": 4.651527291092305e-06, + "loss": 1.0125, + "step": 1179 + }, + { + "epoch": 1.0986964618249535, + "grad_norm": 1.675752878189087, + "learning_rate": 4.6508965508563e-06, + "loss": 1.0586, + "step": 1180 + }, + { + "epoch": 1.0996275605214152, + "grad_norm": 1.6852096319198608, + "learning_rate": 4.650265283156189e-06, + "loss": 1.0755, + "step": 1181 + }, + { + "epoch": 1.100558659217877, + "grad_norm": 1.493811845779419, + "learning_rate": 4.649633488146779e-06, + "loss": 1.0031, + "step": 1182 + }, + { + "epoch": 1.101489757914339, + "grad_norm": 1.600391149520874, + "learning_rate": 4.6490011659830035e-06, + "loss": 1.0142, + "step": 1183 + }, + { + "epoch": 1.1024208566108007, + "grad_norm": 1.6148325204849243, + "learning_rate": 4.648368316819927e-06, + "loss": 1.0453, + "step": 1184 + }, + { + "epoch": 1.1033519553072626, + "grad_norm": 1.5433683395385742, + "learning_rate": 4.647734940812743e-06, + "loss": 0.9821, + "step": 1185 + }, + { + "epoch": 1.1042830540037243, + "grad_norm": 1.5970865488052368, + "learning_rate": 4.647101038116775e-06, + "loss": 1.0295, + "step": 1186 + }, + { + "epoch": 1.1052141527001862, + "grad_norm": 1.570863127708435, + "learning_rate": 4.646466608887474e-06, + "loss": 1.0342, + "step": 1187 + }, + { + "epoch": 1.106145251396648, + "grad_norm": 1.64969801902771, + "learning_rate": 4.645831653280421e-06, + "loss": 1.0779, + "step": 1188 + }, + { + "epoch": 1.1070763500931098, + "grad_norm": 1.5375019311904907, + "learning_rate": 4.645196171451327e-06, + "loss": 1.0346, + "step": 1189 + }, + { + "epoch": 1.1080074487895717, + "grad_norm": 1.5763280391693115, + "learning_rate": 4.6445601635560305e-06, + "loss": 1.0118, + "step": 1190 + }, + { + "epoch": 1.1089385474860336, + "grad_norm": 1.623668909072876, + "learning_rate": 4.6439236297505e-06, + "loss": 1.0476, + "step": 1191 + }, + { + "epoch": 1.1098696461824953, + "grad_norm": 1.6408751010894775, + "learning_rate": 4.643286570190832e-06, + "loss": 1.0261, + "step": 1192 + }, + { + "epoch": 1.1108007448789572, + "grad_norm": 1.563445806503296, + "learning_rate": 4.6426489850332515e-06, + "loss": 1.024, + "step": 1193 + }, + { + "epoch": 1.111731843575419, + "grad_norm": 1.5648657083511353, + "learning_rate": 4.642010874434116e-06, + "loss": 1.0513, + "step": 1194 + }, + { + "epoch": 1.1126629422718808, + "grad_norm": 1.5322439670562744, + "learning_rate": 4.641372238549909e-06, + "loss": 0.9958, + "step": 1195 + }, + { + "epoch": 1.1135940409683427, + "grad_norm": 1.521952748298645, + "learning_rate": 4.640733077537241e-06, + "loss": 1.0329, + "step": 1196 + }, + { + "epoch": 1.1145251396648044, + "grad_norm": 1.5218099355697632, + "learning_rate": 4.640093391552854e-06, + "loss": 0.9921, + "step": 1197 + }, + { + "epoch": 1.1154562383612663, + "grad_norm": 1.6294655799865723, + "learning_rate": 4.639453180753619e-06, + "loss": 1.0276, + "step": 1198 + }, + { + "epoch": 1.1163873370577282, + "grad_norm": 1.5066817998886108, + "learning_rate": 4.638812445296535e-06, + "loss": 1.0537, + "step": 1199 + }, + { + "epoch": 1.1173184357541899, + "grad_norm": 1.5842376947402954, + "learning_rate": 4.638171185338729e-06, + "loss": 1.0, + "step": 1200 + }, + { + "epoch": 1.1182495344506518, + "grad_norm": 1.5416522026062012, + "learning_rate": 4.637529401037456e-06, + "loss": 1.0459, + "step": 1201 + }, + { + "epoch": 1.1191806331471137, + "grad_norm": 1.5635859966278076, + "learning_rate": 4.636887092550103e-06, + "loss": 1.0746, + "step": 1202 + }, + { + "epoch": 1.1201117318435754, + "grad_norm": 1.5437886714935303, + "learning_rate": 4.636244260034182e-06, + "loss": 1.0373, + "step": 1203 + }, + { + "epoch": 1.1210428305400373, + "grad_norm": 1.4998105764389038, + "learning_rate": 4.635600903647334e-06, + "loss": 1.0077, + "step": 1204 + }, + { + "epoch": 1.121973929236499, + "grad_norm": 1.4908320903778076, + "learning_rate": 4.63495702354733e-06, + "loss": 1.0359, + "step": 1205 + }, + { + "epoch": 1.1229050279329609, + "grad_norm": 1.551636815071106, + "learning_rate": 4.634312619892069e-06, + "loss": 1.0086, + "step": 1206 + }, + { + "epoch": 1.1238361266294228, + "grad_norm": 1.5259814262390137, + "learning_rate": 4.633667692839577e-06, + "loss": 1.0517, + "step": 1207 + }, + { + "epoch": 1.1247672253258845, + "grad_norm": 1.5764120817184448, + "learning_rate": 4.6330222425480095e-06, + "loss": 1.0802, + "step": 1208 + }, + { + "epoch": 1.1256983240223464, + "grad_norm": 1.5756115913391113, + "learning_rate": 4.632376269175653e-06, + "loss": 1.0305, + "step": 1209 + }, + { + "epoch": 1.1266294227188083, + "grad_norm": 1.6039260625839233, + "learning_rate": 4.631729772880914e-06, + "loss": 1.0446, + "step": 1210 + }, + { + "epoch": 1.12756052141527, + "grad_norm": 1.570855975151062, + "learning_rate": 4.631082753822338e-06, + "loss": 1.048, + "step": 1211 + }, + { + "epoch": 1.1284916201117319, + "grad_norm": 1.5383471250534058, + "learning_rate": 4.630435212158591e-06, + "loss": 1.0401, + "step": 1212 + }, + { + "epoch": 1.1294227188081938, + "grad_norm": 1.5055118799209595, + "learning_rate": 4.6297871480484694e-06, + "loss": 0.9918, + "step": 1213 + }, + { + "epoch": 1.1303538175046555, + "grad_norm": 1.5805336236953735, + "learning_rate": 4.629138561650899e-06, + "loss": 1.0582, + "step": 1214 + }, + { + "epoch": 1.1312849162011174, + "grad_norm": 1.4991474151611328, + "learning_rate": 4.628489453124931e-06, + "loss": 0.9945, + "step": 1215 + }, + { + "epoch": 1.132216014897579, + "grad_norm": 1.5370382070541382, + "learning_rate": 4.627839822629748e-06, + "loss": 0.9885, + "step": 1216 + }, + { + "epoch": 1.133147113594041, + "grad_norm": 1.5403425693511963, + "learning_rate": 4.627189670324657e-06, + "loss": 1.0336, + "step": 1217 + }, + { + "epoch": 1.1340782122905029, + "grad_norm": 1.6169673204421997, + "learning_rate": 4.626538996369096e-06, + "loss": 1.0285, + "step": 1218 + }, + { + "epoch": 1.1350093109869646, + "grad_norm": 1.4898627996444702, + "learning_rate": 4.62588780092263e-06, + "loss": 0.9995, + "step": 1219 + }, + { + "epoch": 1.1359404096834265, + "grad_norm": 1.5606540441513062, + "learning_rate": 4.6252360841449504e-06, + "loss": 0.9895, + "step": 1220 + }, + { + "epoch": 1.1368715083798882, + "grad_norm": 1.5657724142074585, + "learning_rate": 4.624583846195878e-06, + "loss": 1.051, + "step": 1221 + }, + { + "epoch": 1.13780260707635, + "grad_norm": 1.6210291385650635, + "learning_rate": 4.623931087235361e-06, + "loss": 1.0703, + "step": 1222 + }, + { + "epoch": 1.138733705772812, + "grad_norm": 1.4690114259719849, + "learning_rate": 4.623277807423477e-06, + "loss": 0.9946, + "step": 1223 + }, + { + "epoch": 1.1396648044692737, + "grad_norm": 1.558742642402649, + "learning_rate": 4.622624006920426e-06, + "loss": 1.0388, + "step": 1224 + }, + { + "epoch": 1.1405959031657356, + "grad_norm": 1.565818428993225, + "learning_rate": 4.621969685886544e-06, + "loss": 1.0368, + "step": 1225 + }, + { + "epoch": 1.1415270018621975, + "grad_norm": 1.5682542324066162, + "learning_rate": 4.621314844482287e-06, + "loss": 1.0392, + "step": 1226 + }, + { + "epoch": 1.1424581005586592, + "grad_norm": 1.4746406078338623, + "learning_rate": 4.6206594828682425e-06, + "loss": 0.9779, + "step": 1227 + }, + { + "epoch": 1.143389199255121, + "grad_norm": 1.5437195301055908, + "learning_rate": 4.620003601205125e-06, + "loss": 1.0153, + "step": 1228 + }, + { + "epoch": 1.144320297951583, + "grad_norm": 1.542839527130127, + "learning_rate": 4.619347199653775e-06, + "loss": 1.0044, + "step": 1229 + }, + { + "epoch": 1.1452513966480447, + "grad_norm": 1.4853402376174927, + "learning_rate": 4.6186902783751645e-06, + "loss": 1.0036, + "step": 1230 + }, + { + "epoch": 1.1461824953445066, + "grad_norm": 1.5345730781555176, + "learning_rate": 4.6180328375303876e-06, + "loss": 1.0376, + "step": 1231 + }, + { + "epoch": 1.1471135940409685, + "grad_norm": 1.5952215194702148, + "learning_rate": 4.617374877280669e-06, + "loss": 1.039, + "step": 1232 + }, + { + "epoch": 1.1480446927374302, + "grad_norm": 1.5528311729431152, + "learning_rate": 4.616716397787362e-06, + "loss": 1.0234, + "step": 1233 + }, + { + "epoch": 1.148975791433892, + "grad_norm": 1.5732342004776, + "learning_rate": 4.616057399211943e-06, + "loss": 1.0117, + "step": 1234 + }, + { + "epoch": 1.1499068901303537, + "grad_norm": 1.5569034814834595, + "learning_rate": 4.615397881716019e-06, + "loss": 1.0582, + "step": 1235 + }, + { + "epoch": 1.1508379888268156, + "grad_norm": 1.5574308633804321, + "learning_rate": 4.614737845461325e-06, + "loss": 1.0399, + "step": 1236 + }, + { + "epoch": 1.1517690875232776, + "grad_norm": 1.820462703704834, + "learning_rate": 4.614077290609719e-06, + "loss": 1.0413, + "step": 1237 + }, + { + "epoch": 1.1527001862197392, + "grad_norm": 1.6015551090240479, + "learning_rate": 4.61341621732319e-06, + "loss": 1.0486, + "step": 1238 + }, + { + "epoch": 1.1536312849162011, + "grad_norm": 1.5389423370361328, + "learning_rate": 4.612754625763854e-06, + "loss": 1.0387, + "step": 1239 + }, + { + "epoch": 1.1545623836126628, + "grad_norm": 1.5318962335586548, + "learning_rate": 4.61209251609395e-06, + "loss": 1.0429, + "step": 1240 + }, + { + "epoch": 1.1554934823091247, + "grad_norm": 1.5230445861816406, + "learning_rate": 4.61142988847585e-06, + "loss": 1.0348, + "step": 1241 + }, + { + "epoch": 1.1564245810055866, + "grad_norm": 1.472663402557373, + "learning_rate": 4.61076674307205e-06, + "loss": 0.9974, + "step": 1242 + }, + { + "epoch": 1.1573556797020483, + "grad_norm": 1.6207207441329956, + "learning_rate": 4.610103080045171e-06, + "loss": 1.0286, + "step": 1243 + }, + { + "epoch": 1.1582867783985102, + "grad_norm": 1.5338255167007446, + "learning_rate": 4.609438899557964e-06, + "loss": 1.0252, + "step": 1244 + }, + { + "epoch": 1.1592178770949721, + "grad_norm": 1.560064673423767, + "learning_rate": 4.608774201773307e-06, + "loss": 1.0351, + "step": 1245 + }, + { + "epoch": 1.1601489757914338, + "grad_norm": 1.538946270942688, + "learning_rate": 4.608108986854202e-06, + "loss": 1.0156, + "step": 1246 + }, + { + "epoch": 1.1610800744878957, + "grad_norm": 1.6004894971847534, + "learning_rate": 4.607443254963782e-06, + "loss": 1.0239, + "step": 1247 + }, + { + "epoch": 1.1620111731843576, + "grad_norm": 1.6527992486953735, + "learning_rate": 4.606777006265302e-06, + "loss": 0.9598, + "step": 1248 + }, + { + "epoch": 1.1629422718808193, + "grad_norm": 1.5303829908370972, + "learning_rate": 4.606110240922146e-06, + "loss": 1.0364, + "step": 1249 + }, + { + "epoch": 1.1638733705772812, + "grad_norm": 1.5249247550964355, + "learning_rate": 4.605442959097826e-06, + "loss": 1.0127, + "step": 1250 + }, + { + "epoch": 1.164804469273743, + "grad_norm": 1.5821163654327393, + "learning_rate": 4.604775160955979e-06, + "loss": 1.0117, + "step": 1251 + }, + { + "epoch": 1.1657355679702048, + "grad_norm": 1.56100594997406, + "learning_rate": 4.60410684666037e-06, + "loss": 1.044, + "step": 1252 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 1.534448266029358, + "learning_rate": 4.603438016374888e-06, + "loss": 1.0152, + "step": 1253 + }, + { + "epoch": 1.1675977653631284, + "grad_norm": 1.5872039794921875, + "learning_rate": 4.602768670263551e-06, + "loss": 1.0006, + "step": 1254 + }, + { + "epoch": 1.1685288640595903, + "grad_norm": 1.58054518699646, + "learning_rate": 4.602098808490503e-06, + "loss": 1.0303, + "step": 1255 + }, + { + "epoch": 1.169459962756052, + "grad_norm": 1.5923792123794556, + "learning_rate": 4.6014284312200134e-06, + "loss": 0.9982, + "step": 1256 + }, + { + "epoch": 1.170391061452514, + "grad_norm": 1.5675373077392578, + "learning_rate": 4.600757538616479e-06, + "loss": 1.0388, + "step": 1257 + }, + { + "epoch": 1.1713221601489758, + "grad_norm": 1.5689672231674194, + "learning_rate": 4.600086130844424e-06, + "loss": 1.0358, + "step": 1258 + }, + { + "epoch": 1.1722532588454375, + "grad_norm": 1.5517081022262573, + "learning_rate": 4.5994142080684956e-06, + "loss": 1.0056, + "step": 1259 + }, + { + "epoch": 1.1731843575418994, + "grad_norm": 1.5735931396484375, + "learning_rate": 4.5987417704534695e-06, + "loss": 1.0255, + "step": 1260 + }, + { + "epoch": 1.1741154562383613, + "grad_norm": 1.5739511251449585, + "learning_rate": 4.598068818164249e-06, + "loss": 0.9954, + "step": 1261 + }, + { + "epoch": 1.175046554934823, + "grad_norm": 1.588541865348816, + "learning_rate": 4.597395351365861e-06, + "loss": 1.0068, + "step": 1262 + }, + { + "epoch": 1.175977653631285, + "grad_norm": 1.6256474256515503, + "learning_rate": 4.596721370223461e-06, + "loss": 1.05, + "step": 1263 + }, + { + "epoch": 1.1769087523277468, + "grad_norm": 1.5730301141738892, + "learning_rate": 4.5960468749023265e-06, + "loss": 1.0262, + "step": 1264 + }, + { + "epoch": 1.1778398510242085, + "grad_norm": 1.6214555501937866, + "learning_rate": 4.595371865567866e-06, + "loss": 1.051, + "step": 1265 + }, + { + "epoch": 1.1787709497206704, + "grad_norm": 1.581821322441101, + "learning_rate": 4.5946963423856125e-06, + "loss": 1.0208, + "step": 1266 + }, + { + "epoch": 1.1797020484171323, + "grad_norm": 1.6067352294921875, + "learning_rate": 4.594020305521223e-06, + "loss": 1.014, + "step": 1267 + }, + { + "epoch": 1.180633147113594, + "grad_norm": 1.6256908178329468, + "learning_rate": 4.593343755140484e-06, + "loss": 1.0744, + "step": 1268 + }, + { + "epoch": 1.181564245810056, + "grad_norm": 1.5307555198669434, + "learning_rate": 4.592666691409303e-06, + "loss": 0.9939, + "step": 1269 + }, + { + "epoch": 1.1824953445065176, + "grad_norm": 1.5120658874511719, + "learning_rate": 4.591989114493718e-06, + "loss": 1.0313, + "step": 1270 + }, + { + "epoch": 1.1834264432029795, + "grad_norm": 1.5359355211257935, + "learning_rate": 4.591311024559891e-06, + "loss": 1.0259, + "step": 1271 + }, + { + "epoch": 1.1843575418994414, + "grad_norm": 1.6252797842025757, + "learning_rate": 4.59063242177411e-06, + "loss": 1.0344, + "step": 1272 + }, + { + "epoch": 1.185288640595903, + "grad_norm": 1.6357321739196777, + "learning_rate": 4.589953306302787e-06, + "loss": 1.0615, + "step": 1273 + }, + { + "epoch": 1.186219739292365, + "grad_norm": 1.6225498914718628, + "learning_rate": 4.5892736783124635e-06, + "loss": 1.0005, + "step": 1274 + }, + { + "epoch": 1.1871508379888267, + "grad_norm": 1.6100691556930542, + "learning_rate": 4.588593537969805e-06, + "loss": 1.0212, + "step": 1275 + }, + { + "epoch": 1.1880819366852886, + "grad_norm": 1.5478780269622803, + "learning_rate": 4.5879128854415996e-06, + "loss": 1.0574, + "step": 1276 + }, + { + "epoch": 1.1890130353817505, + "grad_norm": 1.5705329179763794, + "learning_rate": 4.5872317208947656e-06, + "loss": 1.0777, + "step": 1277 + }, + { + "epoch": 1.1899441340782122, + "grad_norm": 1.463334321975708, + "learning_rate": 4.586550044496345e-06, + "loss": 1.0335, + "step": 1278 + }, + { + "epoch": 1.190875232774674, + "grad_norm": 1.537367820739746, + "learning_rate": 4.585867856413505e-06, + "loss": 1.0388, + "step": 1279 + }, + { + "epoch": 1.191806331471136, + "grad_norm": 1.6201003789901733, + "learning_rate": 4.5851851568135376e-06, + "loss": 1.0516, + "step": 1280 + }, + { + "epoch": 1.1927374301675977, + "grad_norm": 1.52261483669281, + "learning_rate": 4.5845019458638614e-06, + "loss": 0.9937, + "step": 1281 + }, + { + "epoch": 1.1936685288640596, + "grad_norm": 1.5462313890457153, + "learning_rate": 4.583818223732021e-06, + "loss": 1.0214, + "step": 1282 + }, + { + "epoch": 1.1945996275605215, + "grad_norm": 1.5227042436599731, + "learning_rate": 4.583133990585684e-06, + "loss": 1.0141, + "step": 1283 + }, + { + "epoch": 1.1955307262569832, + "grad_norm": 1.5403075218200684, + "learning_rate": 4.5824492465926474e-06, + "loss": 1.0451, + "step": 1284 + }, + { + "epoch": 1.196461824953445, + "grad_norm": 1.556834101676941, + "learning_rate": 4.581763991920829e-06, + "loss": 1.0162, + "step": 1285 + }, + { + "epoch": 1.197392923649907, + "grad_norm": 1.5314834117889404, + "learning_rate": 4.5810782267382736e-06, + "loss": 1.0482, + "step": 1286 + }, + { + "epoch": 1.1983240223463687, + "grad_norm": 1.6128249168395996, + "learning_rate": 4.580391951213151e-06, + "loss": 1.0176, + "step": 1287 + }, + { + "epoch": 1.1992551210428306, + "grad_norm": 1.5465834140777588, + "learning_rate": 4.579705165513758e-06, + "loss": 1.029, + "step": 1288 + }, + { + "epoch": 1.2001862197392923, + "grad_norm": 1.5844080448150635, + "learning_rate": 4.579017869808514e-06, + "loss": 1.0404, + "step": 1289 + }, + { + "epoch": 1.2011173184357542, + "grad_norm": 1.6418040990829468, + "learning_rate": 4.578330064265965e-06, + "loss": 1.0137, + "step": 1290 + }, + { + "epoch": 1.202048417132216, + "grad_norm": 1.597098708152771, + "learning_rate": 4.57764174905478e-06, + "loss": 0.9727, + "step": 1291 + }, + { + "epoch": 1.2029795158286778, + "grad_norm": 1.6501483917236328, + "learning_rate": 4.576952924343756e-06, + "loss": 1.0167, + "step": 1292 + }, + { + "epoch": 1.2039106145251397, + "grad_norm": 1.6502028703689575, + "learning_rate": 4.576263590301814e-06, + "loss": 1.0172, + "step": 1293 + }, + { + "epoch": 1.2048417132216014, + "grad_norm": 1.546657681465149, + "learning_rate": 4.575573747097996e-06, + "loss": 1.034, + "step": 1294 + }, + { + "epoch": 1.2057728119180633, + "grad_norm": 1.6021524667739868, + "learning_rate": 4.5748833949014766e-06, + "loss": 1.0109, + "step": 1295 + }, + { + "epoch": 1.2067039106145252, + "grad_norm": 1.5246201753616333, + "learning_rate": 4.574192533881547e-06, + "loss": 0.9966, + "step": 1296 + }, + { + "epoch": 1.2076350093109869, + "grad_norm": 1.6043974161148071, + "learning_rate": 4.57350116420763e-06, + "loss": 1.0458, + "step": 1297 + }, + { + "epoch": 1.2085661080074488, + "grad_norm": 1.6759452819824219, + "learning_rate": 4.572809286049268e-06, + "loss": 1.0335, + "step": 1298 + }, + { + "epoch": 1.2094972067039107, + "grad_norm": 1.5275174379348755, + "learning_rate": 4.572116899576131e-06, + "loss": 1.0203, + "step": 1299 + }, + { + "epoch": 1.2104283054003724, + "grad_norm": 1.6002684831619263, + "learning_rate": 4.571424004958012e-06, + "loss": 1.0415, + "step": 1300 + }, + { + "epoch": 1.2113594040968343, + "grad_norm": 1.5781075954437256, + "learning_rate": 4.570730602364831e-06, + "loss": 1.0681, + "step": 1301 + }, + { + "epoch": 1.2122905027932962, + "grad_norm": 1.5570040941238403, + "learning_rate": 4.5700366919666294e-06, + "loss": 1.0484, + "step": 1302 + }, + { + "epoch": 1.2132216014897579, + "grad_norm": 1.5383262634277344, + "learning_rate": 4.569342273933576e-06, + "loss": 1.0193, + "step": 1303 + }, + { + "epoch": 1.2141527001862198, + "grad_norm": 1.5316905975341797, + "learning_rate": 4.568647348435963e-06, + "loss": 1.0248, + "step": 1304 + }, + { + "epoch": 1.2150837988826815, + "grad_norm": 1.4854018688201904, + "learning_rate": 4.567951915644205e-06, + "loss": 1.0422, + "step": 1305 + }, + { + "epoch": 1.2160148975791434, + "grad_norm": 1.5791949033737183, + "learning_rate": 4.567255975728846e-06, + "loss": 1.0302, + "step": 1306 + }, + { + "epoch": 1.2169459962756053, + "grad_norm": 1.6900380849838257, + "learning_rate": 4.566559528860548e-06, + "loss": 1.0799, + "step": 1307 + }, + { + "epoch": 1.217877094972067, + "grad_norm": 1.6398446559906006, + "learning_rate": 4.565862575210102e-06, + "loss": 1.0161, + "step": 1308 + }, + { + "epoch": 1.2188081936685289, + "grad_norm": 1.5573872327804565, + "learning_rate": 4.565165114948423e-06, + "loss": 0.9874, + "step": 1309 + }, + { + "epoch": 1.2197392923649906, + "grad_norm": 1.5711264610290527, + "learning_rate": 4.564467148246548e-06, + "loss": 1.0225, + "step": 1310 + }, + { + "epoch": 1.2206703910614525, + "grad_norm": 1.58504319190979, + "learning_rate": 4.563768675275639e-06, + "loss": 1.0156, + "step": 1311 + }, + { + "epoch": 1.2216014897579144, + "grad_norm": 1.845436930656433, + "learning_rate": 4.563069696206982e-06, + "loss": 1.0516, + "step": 1312 + }, + { + "epoch": 1.222532588454376, + "grad_norm": 1.6350762844085693, + "learning_rate": 4.56237021121199e-06, + "loss": 1.0155, + "step": 1313 + }, + { + "epoch": 1.223463687150838, + "grad_norm": 1.5159375667572021, + "learning_rate": 4.561670220462194e-06, + "loss": 1.019, + "step": 1314 + }, + { + "epoch": 1.2243947858472999, + "grad_norm": 1.6053423881530762, + "learning_rate": 4.560969724129256e-06, + "loss": 1.0054, + "step": 1315 + }, + { + "epoch": 1.2253258845437616, + "grad_norm": 1.5837332010269165, + "learning_rate": 4.560268722384956e-06, + "loss": 0.9912, + "step": 1316 + }, + { + "epoch": 1.2262569832402235, + "grad_norm": 1.707977294921875, + "learning_rate": 4.559567215401203e-06, + "loss": 1.0047, + "step": 1317 + }, + { + "epoch": 1.2271880819366854, + "grad_norm": 1.5581670999526978, + "learning_rate": 4.558865203350026e-06, + "loss": 1.0079, + "step": 1318 + }, + { + "epoch": 1.228119180633147, + "grad_norm": 1.537711501121521, + "learning_rate": 4.558162686403579e-06, + "loss": 1.0281, + "step": 1319 + }, + { + "epoch": 1.229050279329609, + "grad_norm": 1.6358866691589355, + "learning_rate": 4.5574596647341414e-06, + "loss": 1.0651, + "step": 1320 + }, + { + "epoch": 1.2299813780260709, + "grad_norm": 1.6060906648635864, + "learning_rate": 4.556756138514114e-06, + "loss": 1.0007, + "step": 1321 + }, + { + "epoch": 1.2309124767225326, + "grad_norm": 1.5629162788391113, + "learning_rate": 4.556052107916023e-06, + "loss": 0.9999, + "step": 1322 + }, + { + "epoch": 1.2318435754189945, + "grad_norm": 1.5924534797668457, + "learning_rate": 4.555347573112519e-06, + "loss": 1.0452, + "step": 1323 + }, + { + "epoch": 1.2327746741154562, + "grad_norm": 1.632931113243103, + "learning_rate": 4.5546425342763715e-06, + "loss": 1.0224, + "step": 1324 + }, + { + "epoch": 1.233705772811918, + "grad_norm": 1.5387897491455078, + "learning_rate": 4.55393699158048e-06, + "loss": 1.0182, + "step": 1325 + }, + { + "epoch": 1.23463687150838, + "grad_norm": 1.515651822090149, + "learning_rate": 4.553230945197864e-06, + "loss": 0.9789, + "step": 1326 + }, + { + "epoch": 1.2355679702048417, + "grad_norm": 1.5131114721298218, + "learning_rate": 4.552524395301667e-06, + "loss": 1.0067, + "step": 1327 + }, + { + "epoch": 1.2364990689013036, + "grad_norm": 1.6065956354141235, + "learning_rate": 4.551817342065157e-06, + "loss": 1.0358, + "step": 1328 + }, + { + "epoch": 1.2374301675977653, + "grad_norm": 1.5832058191299438, + "learning_rate": 4.551109785661722e-06, + "loss": 1.0342, + "step": 1329 + }, + { + "epoch": 1.2383612662942272, + "grad_norm": 1.5282360315322876, + "learning_rate": 4.55040172626488e-06, + "loss": 1.0214, + "step": 1330 + }, + { + "epoch": 1.239292364990689, + "grad_norm": 1.595635175704956, + "learning_rate": 4.549693164048265e-06, + "loss": 1.0128, + "step": 1331 + }, + { + "epoch": 1.2402234636871508, + "grad_norm": 1.6100645065307617, + "learning_rate": 4.548984099185638e-06, + "loss": 1.0168, + "step": 1332 + }, + { + "epoch": 1.2411545623836127, + "grad_norm": 1.5903205871582031, + "learning_rate": 4.548274531850885e-06, + "loss": 1.0006, + "step": 1333 + }, + { + "epoch": 1.2420856610800746, + "grad_norm": 1.5367014408111572, + "learning_rate": 4.5475644622180105e-06, + "loss": 0.9752, + "step": 1334 + }, + { + "epoch": 1.2430167597765363, + "grad_norm": 1.553397536277771, + "learning_rate": 4.546853890461147e-06, + "loss": 1.0178, + "step": 1335 + }, + { + "epoch": 1.2439478584729982, + "grad_norm": 1.5958094596862793, + "learning_rate": 4.546142816754546e-06, + "loss": 1.0039, + "step": 1336 + }, + { + "epoch": 1.24487895716946, + "grad_norm": 1.5748871564865112, + "learning_rate": 4.545431241272585e-06, + "loss": 0.9865, + "step": 1337 + }, + { + "epoch": 1.2458100558659218, + "grad_norm": 1.6458196640014648, + "learning_rate": 4.5447191641897645e-06, + "loss": 1.0063, + "step": 1338 + }, + { + "epoch": 1.2467411545623837, + "grad_norm": 1.6087242364883423, + "learning_rate": 4.544006585680706e-06, + "loss": 1.0252, + "step": 1339 + }, + { + "epoch": 1.2476722532588453, + "grad_norm": 1.6460249423980713, + "learning_rate": 4.543293505920155e-06, + "loss": 1.0613, + "step": 1340 + }, + { + "epoch": 1.2486033519553073, + "grad_norm": 1.590306043624878, + "learning_rate": 4.542579925082979e-06, + "loss": 1.0291, + "step": 1341 + }, + { + "epoch": 1.2495344506517692, + "grad_norm": 1.529325008392334, + "learning_rate": 4.541865843344171e-06, + "loss": 0.9829, + "step": 1342 + }, + { + "epoch": 1.2504655493482308, + "grad_norm": 1.5599408149719238, + "learning_rate": 4.5411512608788454e-06, + "loss": 0.9913, + "step": 1343 + }, + { + "epoch": 1.2513966480446927, + "grad_norm": 1.6064761877059937, + "learning_rate": 4.540436177862237e-06, + "loss": 1.0504, + "step": 1344 + }, + { + "epoch": 1.2523277467411544, + "grad_norm": 1.5412648916244507, + "learning_rate": 4.5397205944697084e-06, + "loss": 1.0459, + "step": 1345 + }, + { + "epoch": 1.2532588454376163, + "grad_norm": 1.5696524381637573, + "learning_rate": 4.53900451087674e-06, + "loss": 1.0483, + "step": 1346 + }, + { + "epoch": 1.2541899441340782, + "grad_norm": 1.5144925117492676, + "learning_rate": 4.538287927258937e-06, + "loss": 0.9966, + "step": 1347 + }, + { + "epoch": 1.25512104283054, + "grad_norm": 1.576715111732483, + "learning_rate": 4.537570843792028e-06, + "loss": 1.0069, + "step": 1348 + }, + { + "epoch": 1.2560521415270018, + "grad_norm": 1.5711338520050049, + "learning_rate": 4.536853260651863e-06, + "loss": 1.0318, + "step": 1349 + }, + { + "epoch": 1.2569832402234637, + "grad_norm": 1.5961750745773315, + "learning_rate": 4.536135178014416e-06, + "loss": 1.0429, + "step": 1350 + }, + { + "epoch": 1.2579143389199254, + "grad_norm": 1.564344048500061, + "learning_rate": 4.535416596055779e-06, + "loss": 1.0069, + "step": 1351 + }, + { + "epoch": 1.2588454376163873, + "grad_norm": 1.5682843923568726, + "learning_rate": 4.534697514952172e-06, + "loss": 1.0283, + "step": 1352 + }, + { + "epoch": 1.2597765363128492, + "grad_norm": 1.581691026687622, + "learning_rate": 4.533977934879936e-06, + "loss": 1.0382, + "step": 1353 + }, + { + "epoch": 1.260707635009311, + "grad_norm": 1.5946353673934937, + "learning_rate": 4.533257856015532e-06, + "loss": 1.0676, + "step": 1354 + }, + { + "epoch": 1.2616387337057728, + "grad_norm": 1.5726041793823242, + "learning_rate": 4.532537278535545e-06, + "loss": 0.974, + "step": 1355 + }, + { + "epoch": 1.2625698324022347, + "grad_norm": 1.545648217201233, + "learning_rate": 4.531816202616682e-06, + "loss": 1.04, + "step": 1356 + }, + { + "epoch": 1.2635009310986964, + "grad_norm": 1.5651519298553467, + "learning_rate": 4.531094628435774e-06, + "loss": 1.0368, + "step": 1357 + }, + { + "epoch": 1.2644320297951583, + "grad_norm": 1.5435962677001953, + "learning_rate": 4.530372556169771e-06, + "loss": 1.0279, + "step": 1358 + }, + { + "epoch": 1.2653631284916202, + "grad_norm": 1.5451003313064575, + "learning_rate": 4.5296499859957475e-06, + "loss": 1.0387, + "step": 1359 + }, + { + "epoch": 1.266294227188082, + "grad_norm": 1.5139414072036743, + "learning_rate": 4.528926918090898e-06, + "loss": 1.0284, + "step": 1360 + }, + { + "epoch": 1.2672253258845438, + "grad_norm": 1.6322270631790161, + "learning_rate": 4.528203352632542e-06, + "loss": 1.0512, + "step": 1361 + }, + { + "epoch": 1.2681564245810055, + "grad_norm": 1.5866525173187256, + "learning_rate": 4.527479289798118e-06, + "loss": 1.0128, + "step": 1362 + }, + { + "epoch": 1.2690875232774674, + "grad_norm": 1.8181447982788086, + "learning_rate": 4.526754729765188e-06, + "loss": 1.0794, + "step": 1363 + }, + { + "epoch": 1.2700186219739291, + "grad_norm": 1.521165132522583, + "learning_rate": 4.526029672711437e-06, + "loss": 1.0179, + "step": 1364 + }, + { + "epoch": 1.270949720670391, + "grad_norm": 1.547357201576233, + "learning_rate": 4.525304118814671e-06, + "loss": 1.0195, + "step": 1365 + }, + { + "epoch": 1.271880819366853, + "grad_norm": 1.5587042570114136, + "learning_rate": 4.524578068252815e-06, + "loss": 1.0101, + "step": 1366 + }, + { + "epoch": 1.2728119180633146, + "grad_norm": 1.5763295888900757, + "learning_rate": 4.52385152120392e-06, + "loss": 1.056, + "step": 1367 + }, + { + "epoch": 1.2737430167597765, + "grad_norm": 1.5521553754806519, + "learning_rate": 4.523124477846156e-06, + "loss": 1.0119, + "step": 1368 + }, + { + "epoch": 1.2746741154562384, + "grad_norm": 1.5642468929290771, + "learning_rate": 4.522396938357817e-06, + "loss": 1.0362, + "step": 1369 + }, + { + "epoch": 1.2756052141527001, + "grad_norm": 1.5667338371276855, + "learning_rate": 4.5216689029173175e-06, + "loss": 1.0295, + "step": 1370 + }, + { + "epoch": 1.276536312849162, + "grad_norm": 1.5273995399475098, + "learning_rate": 4.520940371703192e-06, + "loss": 1.0064, + "step": 1371 + }, + { + "epoch": 1.277467411545624, + "grad_norm": 1.5165363550186157, + "learning_rate": 4.5202113448941e-06, + "loss": 0.984, + "step": 1372 + }, + { + "epoch": 1.2783985102420856, + "grad_norm": 1.538142204284668, + "learning_rate": 4.519481822668819e-06, + "loss": 1.0077, + "step": 1373 + }, + { + "epoch": 1.2793296089385475, + "grad_norm": 1.5915653705596924, + "learning_rate": 4.518751805206251e-06, + "loss": 1.0175, + "step": 1374 + }, + { + "epoch": 1.2802607076350094, + "grad_norm": 1.5434679985046387, + "learning_rate": 4.518021292685417e-06, + "loss": 1.0021, + "step": 1375 + }, + { + "epoch": 1.2811918063314711, + "grad_norm": 1.5857088565826416, + "learning_rate": 4.5172902852854604e-06, + "loss": 1.0481, + "step": 1376 + }, + { + "epoch": 1.282122905027933, + "grad_norm": 1.5424895286560059, + "learning_rate": 4.516558783185647e-06, + "loss": 1.0049, + "step": 1377 + }, + { + "epoch": 1.2830540037243947, + "grad_norm": 1.5552512407302856, + "learning_rate": 4.5158267865653636e-06, + "loss": 1.0189, + "step": 1378 + }, + { + "epoch": 1.2839851024208566, + "grad_norm": 1.6167283058166504, + "learning_rate": 4.515094295604115e-06, + "loss": 1.0196, + "step": 1379 + }, + { + "epoch": 1.2849162011173183, + "grad_norm": 1.6639224290847778, + "learning_rate": 4.514361310481533e-06, + "loss": 1.0404, + "step": 1380 + }, + { + "epoch": 1.2858472998137802, + "grad_norm": 1.5814796686172485, + "learning_rate": 4.513627831377365e-06, + "loss": 1.0158, + "step": 1381 + }, + { + "epoch": 1.2867783985102421, + "grad_norm": 1.6043853759765625, + "learning_rate": 4.512893858471483e-06, + "loss": 1.0288, + "step": 1382 + }, + { + "epoch": 1.2877094972067038, + "grad_norm": 1.6249885559082031, + "learning_rate": 4.51215939194388e-06, + "loss": 1.0724, + "step": 1383 + }, + { + "epoch": 1.2886405959031657, + "grad_norm": 1.6318695545196533, + "learning_rate": 4.511424431974667e-06, + "loss": 0.9819, + "step": 1384 + }, + { + "epoch": 1.2895716945996276, + "grad_norm": 1.5797929763793945, + "learning_rate": 4.51068897874408e-06, + "loss": 1.0519, + "step": 1385 + }, + { + "epoch": 1.2905027932960893, + "grad_norm": 1.6023569107055664, + "learning_rate": 4.509953032432474e-06, + "loss": 1.0146, + "step": 1386 + }, + { + "epoch": 1.2914338919925512, + "grad_norm": 1.6638710498809814, + "learning_rate": 4.509216593220324e-06, + "loss": 1.0712, + "step": 1387 + }, + { + "epoch": 1.2923649906890131, + "grad_norm": 1.6138070821762085, + "learning_rate": 4.508479661288227e-06, + "loss": 1.0388, + "step": 1388 + }, + { + "epoch": 1.2932960893854748, + "grad_norm": 1.5506830215454102, + "learning_rate": 4.507742236816901e-06, + "loss": 0.9748, + "step": 1389 + }, + { + "epoch": 1.2942271880819367, + "grad_norm": 1.5509533882141113, + "learning_rate": 4.507004319987185e-06, + "loss": 1.0001, + "step": 1390 + }, + { + "epoch": 1.2951582867783986, + "grad_norm": 1.5898278951644897, + "learning_rate": 4.506265910980038e-06, + "loss": 1.0029, + "step": 1391 + }, + { + "epoch": 1.2960893854748603, + "grad_norm": 1.6037395000457764, + "learning_rate": 4.5055270099765396e-06, + "loss": 1.0227, + "step": 1392 + }, + { + "epoch": 1.2970204841713222, + "grad_norm": 1.5255653858184814, + "learning_rate": 4.50478761715789e-06, + "loss": 1.0208, + "step": 1393 + }, + { + "epoch": 1.2979515828677841, + "grad_norm": 1.5290815830230713, + "learning_rate": 4.504047732705412e-06, + "loss": 0.996, + "step": 1394 + }, + { + "epoch": 1.2988826815642458, + "grad_norm": 1.578757405281067, + "learning_rate": 4.503307356800546e-06, + "loss": 0.9945, + "step": 1395 + }, + { + "epoch": 1.2998137802607077, + "grad_norm": 1.5922045707702637, + "learning_rate": 4.502566489624855e-06, + "loss": 1.0281, + "step": 1396 + }, + { + "epoch": 1.3007448789571694, + "grad_norm": 1.6003899574279785, + "learning_rate": 4.501825131360022e-06, + "loss": 1.0071, + "step": 1397 + }, + { + "epoch": 1.3016759776536313, + "grad_norm": 1.5832836627960205, + "learning_rate": 4.501083282187848e-06, + "loss": 1.0216, + "step": 1398 + }, + { + "epoch": 1.302607076350093, + "grad_norm": 1.5400363206863403, + "learning_rate": 4.500340942290259e-06, + "loss": 1.0267, + "step": 1399 + }, + { + "epoch": 1.303538175046555, + "grad_norm": 1.6731758117675781, + "learning_rate": 4.499598111849299e-06, + "loss": 1.0123, + "step": 1400 + }, + { + "epoch": 1.3044692737430168, + "grad_norm": 1.6038299798965454, + "learning_rate": 4.498854791047131e-06, + "loss": 1.0038, + "step": 1401 + }, + { + "epoch": 1.3054003724394785, + "grad_norm": 1.5957401990890503, + "learning_rate": 4.4981109800660395e-06, + "loss": 0.9816, + "step": 1402 + }, + { + "epoch": 1.3063314711359404, + "grad_norm": 1.628294587135315, + "learning_rate": 4.49736667908843e-06, + "loss": 0.9827, + "step": 1403 + }, + { + "epoch": 1.3072625698324023, + "grad_norm": 1.552747130393982, + "learning_rate": 4.496621888296827e-06, + "loss": 1.0261, + "step": 1404 + }, + { + "epoch": 1.308193668528864, + "grad_norm": 1.5063432455062866, + "learning_rate": 4.4958766078738745e-06, + "loss": 1.0045, + "step": 1405 + }, + { + "epoch": 1.309124767225326, + "grad_norm": 1.618623971939087, + "learning_rate": 4.495130838002339e-06, + "loss": 1.0185, + "step": 1406 + }, + { + "epoch": 1.3100558659217878, + "grad_norm": 1.5620105266571045, + "learning_rate": 4.4943845788651055e-06, + "loss": 1.0118, + "step": 1407 + }, + { + "epoch": 1.3109869646182495, + "grad_norm": 1.5479755401611328, + "learning_rate": 4.493637830645178e-06, + "loss": 1.0042, + "step": 1408 + }, + { + "epoch": 1.3119180633147114, + "grad_norm": 1.5699485540390015, + "learning_rate": 4.492890593525682e-06, + "loss": 1.012, + "step": 1409 + }, + { + "epoch": 1.3128491620111733, + "grad_norm": 1.6348872184753418, + "learning_rate": 4.492142867689861e-06, + "loss": 0.9904, + "step": 1410 + }, + { + "epoch": 1.313780260707635, + "grad_norm": 1.5649633407592773, + "learning_rate": 4.491394653321083e-06, + "loss": 1.0283, + "step": 1411 + }, + { + "epoch": 1.314711359404097, + "grad_norm": 1.5797923803329468, + "learning_rate": 4.49064595060283e-06, + "loss": 1.0507, + "step": 1412 + }, + { + "epoch": 1.3156424581005586, + "grad_norm": 1.5442322492599487, + "learning_rate": 4.489896759718706e-06, + "loss": 1.0131, + "step": 1413 + }, + { + "epoch": 1.3165735567970205, + "grad_norm": 1.5516129732131958, + "learning_rate": 4.489147080852437e-06, + "loss": 1.0079, + "step": 1414 + }, + { + "epoch": 1.3175046554934824, + "grad_norm": 1.6696643829345703, + "learning_rate": 4.488396914187865e-06, + "loss": 1.0633, + "step": 1415 + }, + { + "epoch": 1.318435754189944, + "grad_norm": 1.5874762535095215, + "learning_rate": 4.487646259908955e-06, + "loss": 1.0289, + "step": 1416 + }, + { + "epoch": 1.319366852886406, + "grad_norm": 1.5189284086227417, + "learning_rate": 4.486895118199787e-06, + "loss": 1.017, + "step": 1417 + }, + { + "epoch": 1.3202979515828677, + "grad_norm": 1.4921741485595703, + "learning_rate": 4.4861434892445645e-06, + "loss": 1.0013, + "step": 1418 + }, + { + "epoch": 1.3212290502793296, + "grad_norm": 1.5352593660354614, + "learning_rate": 4.485391373227611e-06, + "loss": 1.0171, + "step": 1419 + }, + { + "epoch": 1.3221601489757915, + "grad_norm": 1.5205575227737427, + "learning_rate": 4.484638770333367e-06, + "loss": 0.9941, + "step": 1420 + }, + { + "epoch": 1.3230912476722532, + "grad_norm": 1.538454294204712, + "learning_rate": 4.483885680746393e-06, + "loss": 0.9701, + "step": 1421 + }, + { + "epoch": 1.324022346368715, + "grad_norm": 1.5228955745697021, + "learning_rate": 4.483132104651369e-06, + "loss": 0.9875, + "step": 1422 + }, + { + "epoch": 1.324953445065177, + "grad_norm": 1.6079059839248657, + "learning_rate": 4.4823780422330935e-06, + "loss": 1.0667, + "step": 1423 + }, + { + "epoch": 1.3258845437616387, + "grad_norm": 1.5907213687896729, + "learning_rate": 4.481623493676487e-06, + "loss": 1.0127, + "step": 1424 + }, + { + "epoch": 1.3268156424581006, + "grad_norm": 1.5828615427017212, + "learning_rate": 4.480868459166586e-06, + "loss": 1.0278, + "step": 1425 + }, + { + "epoch": 1.3277467411545625, + "grad_norm": 1.6869210004806519, + "learning_rate": 4.4801129388885475e-06, + "loss": 1.0464, + "step": 1426 + }, + { + "epoch": 1.3286778398510242, + "grad_norm": 1.5865498781204224, + "learning_rate": 4.479356933027649e-06, + "loss": 1.0158, + "step": 1427 + }, + { + "epoch": 1.329608938547486, + "grad_norm": 1.5338958501815796, + "learning_rate": 4.478600441769284e-06, + "loss": 1.0471, + "step": 1428 + }, + { + "epoch": 1.330540037243948, + "grad_norm": 1.5718988180160522, + "learning_rate": 4.477843465298968e-06, + "loss": 0.9899, + "step": 1429 + }, + { + "epoch": 1.3314711359404097, + "grad_norm": 1.5694533586502075, + "learning_rate": 4.477086003802333e-06, + "loss": 1.0218, + "step": 1430 + }, + { + "epoch": 1.3324022346368716, + "grad_norm": 1.5479017496109009, + "learning_rate": 4.476328057465133e-06, + "loss": 0.9838, + "step": 1431 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 1.5717720985412598, + "learning_rate": 4.475569626473238e-06, + "loss": 1.0287, + "step": 1432 + }, + { + "epoch": 1.3342644320297952, + "grad_norm": 1.7197667360305786, + "learning_rate": 4.474810711012637e-06, + "loss": 1.0404, + "step": 1433 + }, + { + "epoch": 1.3351955307262569, + "grad_norm": 1.569809913635254, + "learning_rate": 4.474051311269441e-06, + "loss": 1.0421, + "step": 1434 + }, + { + "epoch": 1.3361266294227188, + "grad_norm": 1.6082358360290527, + "learning_rate": 4.473291427429876e-06, + "loss": 1.0136, + "step": 1435 + }, + { + "epoch": 1.3370577281191807, + "grad_norm": 1.5059759616851807, + "learning_rate": 4.472531059680289e-06, + "loss": 0.9791, + "step": 1436 + }, + { + "epoch": 1.3379888268156424, + "grad_norm": 1.6574952602386475, + "learning_rate": 4.471770208207143e-06, + "loss": 1.0398, + "step": 1437 + }, + { + "epoch": 1.3389199255121043, + "grad_norm": 1.6103168725967407, + "learning_rate": 4.4710088731970245e-06, + "loss": 1.0475, + "step": 1438 + }, + { + "epoch": 1.3398510242085662, + "grad_norm": 1.6753737926483154, + "learning_rate": 4.470247054836633e-06, + "loss": 1.0361, + "step": 1439 + }, + { + "epoch": 1.3407821229050279, + "grad_norm": 1.6116617918014526, + "learning_rate": 4.469484753312791e-06, + "loss": 1.0348, + "step": 1440 + }, + { + "epoch": 1.3417132216014898, + "grad_norm": 1.5506784915924072, + "learning_rate": 4.468721968812435e-06, + "loss": 1.0374, + "step": 1441 + }, + { + "epoch": 1.3426443202979517, + "grad_norm": 1.6124831438064575, + "learning_rate": 4.4679587015226255e-06, + "loss": 0.995, + "step": 1442 + }, + { + "epoch": 1.3435754189944134, + "grad_norm": 1.5187324285507202, + "learning_rate": 4.467194951630538e-06, + "loss": 0.9978, + "step": 1443 + }, + { + "epoch": 1.3445065176908753, + "grad_norm": 1.521330714225769, + "learning_rate": 4.466430719323464e-06, + "loss": 1.0057, + "step": 1444 + }, + { + "epoch": 1.3454376163873372, + "grad_norm": 1.5347623825073242, + "learning_rate": 4.46566600478882e-06, + "loss": 1.0561, + "step": 1445 + }, + { + "epoch": 1.3463687150837989, + "grad_norm": 1.619405746459961, + "learning_rate": 4.464900808214134e-06, + "loss": 1.0268, + "step": 1446 + }, + { + "epoch": 1.3472998137802608, + "grad_norm": 1.498252272605896, + "learning_rate": 4.464135129787057e-06, + "loss": 0.9973, + "step": 1447 + }, + { + "epoch": 1.3482309124767227, + "grad_norm": 1.6767488718032837, + "learning_rate": 4.463368969695355e-06, + "loss": 1.039, + "step": 1448 + }, + { + "epoch": 1.3491620111731844, + "grad_norm": 1.6489245891571045, + "learning_rate": 4.462602328126913e-06, + "loss": 1.015, + "step": 1449 + }, + { + "epoch": 1.3500931098696463, + "grad_norm": 1.6135965585708618, + "learning_rate": 4.461835205269736e-06, + "loss": 1.0581, + "step": 1450 + }, + { + "epoch": 1.351024208566108, + "grad_norm": 1.527063250541687, + "learning_rate": 4.461067601311944e-06, + "loss": 0.996, + "step": 1451 + }, + { + "epoch": 1.3519553072625698, + "grad_norm": 1.5755585432052612, + "learning_rate": 4.460299516441777e-06, + "loss": 1.0149, + "step": 1452 + }, + { + "epoch": 1.3528864059590315, + "grad_norm": 1.5916379690170288, + "learning_rate": 4.459530950847591e-06, + "loss": 1.0469, + "step": 1453 + }, + { + "epoch": 1.3538175046554934, + "grad_norm": 1.6131742000579834, + "learning_rate": 4.458761904717864e-06, + "loss": 1.0034, + "step": 1454 + }, + { + "epoch": 1.3547486033519553, + "grad_norm": 1.567866325378418, + "learning_rate": 4.457992378241188e-06, + "loss": 1.0275, + "step": 1455 + }, + { + "epoch": 1.355679702048417, + "grad_norm": 1.5476313829421997, + "learning_rate": 4.4572223716062725e-06, + "loss": 0.9842, + "step": 1456 + }, + { + "epoch": 1.356610800744879, + "grad_norm": 1.5024303197860718, + "learning_rate": 4.456451885001948e-06, + "loss": 0.9864, + "step": 1457 + }, + { + "epoch": 1.3575418994413408, + "grad_norm": 1.5810939073562622, + "learning_rate": 4.455680918617159e-06, + "loss": 1.0143, + "step": 1458 + }, + { + "epoch": 1.3584729981378025, + "grad_norm": 1.6658401489257812, + "learning_rate": 4.454909472640972e-06, + "loss": 1.0468, + "step": 1459 + }, + { + "epoch": 1.3594040968342644, + "grad_norm": 1.58247971534729, + "learning_rate": 4.454137547262566e-06, + "loss": 1.0124, + "step": 1460 + }, + { + "epoch": 1.3603351955307263, + "grad_norm": 1.5933088064193726, + "learning_rate": 4.453365142671241e-06, + "loss": 1.0646, + "step": 1461 + }, + { + "epoch": 1.361266294227188, + "grad_norm": 1.5584853887557983, + "learning_rate": 4.4525922590564144e-06, + "loss": 1.0202, + "step": 1462 + }, + { + "epoch": 1.36219739292365, + "grad_norm": 1.535901665687561, + "learning_rate": 4.45181889660762e-06, + "loss": 1.0196, + "step": 1463 + }, + { + "epoch": 1.3631284916201118, + "grad_norm": 1.5713459253311157, + "learning_rate": 4.45104505551451e-06, + "loss": 1.0111, + "step": 1464 + }, + { + "epoch": 1.3640595903165735, + "grad_norm": 1.5734684467315674, + "learning_rate": 4.4502707359668515e-06, + "loss": 1.0329, + "step": 1465 + }, + { + "epoch": 1.3649906890130354, + "grad_norm": 1.4879016876220703, + "learning_rate": 4.4494959381545325e-06, + "loss": 1.0261, + "step": 1466 + }, + { + "epoch": 1.3659217877094971, + "grad_norm": 1.565568447113037, + "learning_rate": 4.448720662267556e-06, + "loss": 1.0189, + "step": 1467 + }, + { + "epoch": 1.366852886405959, + "grad_norm": 1.5577555894851685, + "learning_rate": 4.447944908496042e-06, + "loss": 1.0312, + "step": 1468 + }, + { + "epoch": 1.3677839851024207, + "grad_norm": 1.570672869682312, + "learning_rate": 4.44716867703023e-06, + "loss": 1.0695, + "step": 1469 + }, + { + "epoch": 1.3687150837988826, + "grad_norm": 1.5811032056808472, + "learning_rate": 4.446391968060475e-06, + "loss": 1.0106, + "step": 1470 + }, + { + "epoch": 1.3696461824953445, + "grad_norm": 1.6152100563049316, + "learning_rate": 4.445614781777248e-06, + "loss": 1.0293, + "step": 1471 + }, + { + "epoch": 1.3705772811918062, + "grad_norm": 1.5285683870315552, + "learning_rate": 4.444837118371139e-06, + "loss": 0.992, + "step": 1472 + }, + { + "epoch": 1.3715083798882681, + "grad_norm": 1.5448665618896484, + "learning_rate": 4.444058978032855e-06, + "loss": 1.0511, + "step": 1473 + }, + { + "epoch": 1.37243947858473, + "grad_norm": 1.6055021286010742, + "learning_rate": 4.443280360953218e-06, + "loss": 1.034, + "step": 1474 + }, + { + "epoch": 1.3733705772811917, + "grad_norm": 1.5773824453353882, + "learning_rate": 4.442501267323169e-06, + "loss": 1.0308, + "step": 1475 + }, + { + "epoch": 1.3743016759776536, + "grad_norm": 1.5723717212677002, + "learning_rate": 4.441721697333765e-06, + "loss": 1.012, + "step": 1476 + }, + { + "epoch": 1.3752327746741155, + "grad_norm": 1.5339237451553345, + "learning_rate": 4.440941651176181e-06, + "loss": 0.9972, + "step": 1477 + }, + { + "epoch": 1.3761638733705772, + "grad_norm": 1.5498487949371338, + "learning_rate": 4.440161129041704e-06, + "loss": 1.0377, + "step": 1478 + }, + { + "epoch": 1.3770949720670391, + "grad_norm": 1.6140390634536743, + "learning_rate": 4.439380131121744e-06, + "loss": 1.0794, + "step": 1479 + }, + { + "epoch": 1.378026070763501, + "grad_norm": 1.5776286125183105, + "learning_rate": 4.438598657607826e-06, + "loss": 1.0189, + "step": 1480 + }, + { + "epoch": 1.3789571694599627, + "grad_norm": 1.6249407529830933, + "learning_rate": 4.437816708691588e-06, + "loss": 1.023, + "step": 1481 + }, + { + "epoch": 1.3798882681564246, + "grad_norm": 1.561781406402588, + "learning_rate": 4.437034284564789e-06, + "loss": 1.0348, + "step": 1482 + }, + { + "epoch": 1.3808193668528865, + "grad_norm": 1.6608562469482422, + "learning_rate": 4.436251385419302e-06, + "loss": 1.0405, + "step": 1483 + }, + { + "epoch": 1.3817504655493482, + "grad_norm": 1.5895798206329346, + "learning_rate": 4.4354680114471184e-06, + "loss": 1.0264, + "step": 1484 + }, + { + "epoch": 1.3826815642458101, + "grad_norm": 1.554221749305725, + "learning_rate": 4.434684162840344e-06, + "loss": 0.994, + "step": 1485 + }, + { + "epoch": 1.3836126629422718, + "grad_norm": 1.5708531141281128, + "learning_rate": 4.433899839791202e-06, + "loss": 1.0285, + "step": 1486 + }, + { + "epoch": 1.3845437616387337, + "grad_norm": 1.6230992078781128, + "learning_rate": 4.433115042492031e-06, + "loss": 1.0506, + "step": 1487 + }, + { + "epoch": 1.3854748603351954, + "grad_norm": 1.6104018688201904, + "learning_rate": 4.4323297711352885e-06, + "loss": 1.052, + "step": 1488 + }, + { + "epoch": 1.3864059590316573, + "grad_norm": 1.5672563314437866, + "learning_rate": 4.431544025913546e-06, + "loss": 1.0258, + "step": 1489 + }, + { + "epoch": 1.3873370577281192, + "grad_norm": 1.5545810461044312, + "learning_rate": 4.430757807019491e-06, + "loss": 0.9816, + "step": 1490 + }, + { + "epoch": 1.388268156424581, + "grad_norm": 1.576515555381775, + "learning_rate": 4.429971114645928e-06, + "loss": 1.0404, + "step": 1491 + }, + { + "epoch": 1.3891992551210428, + "grad_norm": 1.5094834566116333, + "learning_rate": 4.4291839489857775e-06, + "loss": 0.9768, + "step": 1492 + }, + { + "epoch": 1.3901303538175047, + "grad_norm": 1.6064214706420898, + "learning_rate": 4.428396310232076e-06, + "loss": 1.0372, + "step": 1493 + }, + { + "epoch": 1.3910614525139664, + "grad_norm": 1.6829825639724731, + "learning_rate": 4.427608198577976e-06, + "loss": 1.0051, + "step": 1494 + }, + { + "epoch": 1.3919925512104283, + "grad_norm": 1.6020152568817139, + "learning_rate": 4.426819614216747e-06, + "loss": 1.0152, + "step": 1495 + }, + { + "epoch": 1.3929236499068902, + "grad_norm": 1.6294755935668945, + "learning_rate": 4.426030557341774e-06, + "loss": 1.0368, + "step": 1496 + }, + { + "epoch": 1.393854748603352, + "grad_norm": 1.5974087715148926, + "learning_rate": 4.425241028146554e-06, + "loss": 1.026, + "step": 1497 + }, + { + "epoch": 1.3947858472998138, + "grad_norm": 1.516837477684021, + "learning_rate": 4.424451026824707e-06, + "loss": 1.042, + "step": 1498 + }, + { + "epoch": 1.3957169459962757, + "grad_norm": 1.560184359550476, + "learning_rate": 4.423660553569961e-06, + "loss": 1.0287, + "step": 1499 + }, + { + "epoch": 1.3966480446927374, + "grad_norm": 1.6382440328598022, + "learning_rate": 4.4228696085761665e-06, + "loss": 1.0682, + "step": 1500 + }, + { + "epoch": 1.3975791433891993, + "grad_norm": 1.6879359483718872, + "learning_rate": 4.422078192037287e-06, + "loss": 1.0623, + "step": 1501 + }, + { + "epoch": 1.3985102420856612, + "grad_norm": 1.498881459236145, + "learning_rate": 4.4212863041474005e-06, + "loss": 0.9874, + "step": 1502 + }, + { + "epoch": 1.399441340782123, + "grad_norm": 1.5965639352798462, + "learning_rate": 4.420493945100702e-06, + "loss": 1.08, + "step": 1503 + }, + { + "epoch": 1.4003724394785848, + "grad_norm": 1.5797770023345947, + "learning_rate": 4.4197011150915e-06, + "loss": 1.025, + "step": 1504 + }, + { + "epoch": 1.4013035381750465, + "grad_norm": 1.603064775466919, + "learning_rate": 4.418907814314223e-06, + "loss": 1.0367, + "step": 1505 + }, + { + "epoch": 1.4022346368715084, + "grad_norm": 1.594980239868164, + "learning_rate": 4.418114042963409e-06, + "loss": 0.9674, + "step": 1506 + }, + { + "epoch": 1.40316573556797, + "grad_norm": 1.5968581438064575, + "learning_rate": 4.417319801233717e-06, + "loss": 0.9967, + "step": 1507 + }, + { + "epoch": 1.404096834264432, + "grad_norm": 1.6094814538955688, + "learning_rate": 4.416525089319917e-06, + "loss": 1.0468, + "step": 1508 + }, + { + "epoch": 1.405027932960894, + "grad_norm": 1.589282751083374, + "learning_rate": 4.415729907416898e-06, + "loss": 1.029, + "step": 1509 + }, + { + "epoch": 1.4059590316573556, + "grad_norm": 1.6730644702911377, + "learning_rate": 4.4149342557196605e-06, + "loss": 0.9902, + "step": 1510 + }, + { + "epoch": 1.4068901303538175, + "grad_norm": 1.6359583139419556, + "learning_rate": 4.414138134423323e-06, + "loss": 1.0121, + "step": 1511 + }, + { + "epoch": 1.4078212290502794, + "grad_norm": 1.587175726890564, + "learning_rate": 4.413341543723118e-06, + "loss": 1.0261, + "step": 1512 + }, + { + "epoch": 1.408752327746741, + "grad_norm": 1.5310614109039307, + "learning_rate": 4.412544483814394e-06, + "loss": 1.0227, + "step": 1513 + }, + { + "epoch": 1.409683426443203, + "grad_norm": 1.5506380796432495, + "learning_rate": 4.411746954892612e-06, + "loss": 1.0063, + "step": 1514 + }, + { + "epoch": 1.410614525139665, + "grad_norm": 1.5596989393234253, + "learning_rate": 4.410948957153351e-06, + "loss": 1.0346, + "step": 1515 + }, + { + "epoch": 1.4115456238361266, + "grad_norm": 1.5759638547897339, + "learning_rate": 4.410150490792306e-06, + "loss": 0.9939, + "step": 1516 + }, + { + "epoch": 1.4124767225325885, + "grad_norm": 1.5548573732376099, + "learning_rate": 4.409351556005281e-06, + "loss": 1.0091, + "step": 1517 + }, + { + "epoch": 1.4134078212290504, + "grad_norm": 1.5779527425765991, + "learning_rate": 4.408552152988202e-06, + "loss": 1.0538, + "step": 1518 + }, + { + "epoch": 1.414338919925512, + "grad_norm": 1.5253233909606934, + "learning_rate": 4.407752281937104e-06, + "loss": 1.0112, + "step": 1519 + }, + { + "epoch": 1.415270018621974, + "grad_norm": 1.5554537773132324, + "learning_rate": 4.406951943048141e-06, + "loss": 1.0002, + "step": 1520 + }, + { + "epoch": 1.4162011173184357, + "grad_norm": 1.5797187089920044, + "learning_rate": 4.406151136517579e-06, + "loss": 0.996, + "step": 1521 + }, + { + "epoch": 1.4171322160148976, + "grad_norm": 1.5718674659729004, + "learning_rate": 4.405349862541801e-06, + "loss": 1.0041, + "step": 1522 + }, + { + "epoch": 1.4180633147113593, + "grad_norm": 1.608933448791504, + "learning_rate": 4.404548121317301e-06, + "loss": 1.0468, + "step": 1523 + }, + { + "epoch": 1.4189944134078212, + "grad_norm": 1.651956558227539, + "learning_rate": 4.403745913040692e-06, + "loss": 1.0345, + "step": 1524 + }, + { + "epoch": 1.419925512104283, + "grad_norm": 1.5862330198287964, + "learning_rate": 4.402943237908699e-06, + "loss": 1.0425, + "step": 1525 + }, + { + "epoch": 1.4208566108007448, + "grad_norm": 1.631689429283142, + "learning_rate": 4.4021400961181614e-06, + "loss": 1.0175, + "step": 1526 + }, + { + "epoch": 1.4217877094972067, + "grad_norm": 1.5958492755889893, + "learning_rate": 4.401336487866035e-06, + "loss": 1.037, + "step": 1527 + }, + { + "epoch": 1.4227188081936686, + "grad_norm": 1.5291881561279297, + "learning_rate": 4.400532413349385e-06, + "loss": 0.995, + "step": 1528 + }, + { + "epoch": 1.4236499068901303, + "grad_norm": 1.5820049047470093, + "learning_rate": 4.399727872765399e-06, + "loss": 1.0117, + "step": 1529 + }, + { + "epoch": 1.4245810055865922, + "grad_norm": 1.4826836585998535, + "learning_rate": 4.398922866311371e-06, + "loss": 1.0209, + "step": 1530 + }, + { + "epoch": 1.425512104283054, + "grad_norm": 1.5255011320114136, + "learning_rate": 4.398117394184715e-06, + "loss": 0.99, + "step": 1531 + }, + { + "epoch": 1.4264432029795158, + "grad_norm": 1.5269635915756226, + "learning_rate": 4.397311456582955e-06, + "loss": 1.0077, + "step": 1532 + }, + { + "epoch": 1.4273743016759777, + "grad_norm": 1.5802326202392578, + "learning_rate": 4.3965050537037325e-06, + "loss": 0.9775, + "step": 1533 + }, + { + "epoch": 1.4283054003724396, + "grad_norm": 1.6086894273757935, + "learning_rate": 4.3956981857448e-06, + "loss": 0.997, + "step": 1534 + }, + { + "epoch": 1.4292364990689013, + "grad_norm": 1.5799283981323242, + "learning_rate": 4.394890852904027e-06, + "loss": 0.9973, + "step": 1535 + }, + { + "epoch": 1.4301675977653632, + "grad_norm": 1.5577309131622314, + "learning_rate": 4.3940830553793946e-06, + "loss": 0.963, + "step": 1536 + }, + { + "epoch": 1.431098696461825, + "grad_norm": 1.6366912126541138, + "learning_rate": 4.393274793368999e-06, + "loss": 1.0124, + "step": 1537 + }, + { + "epoch": 1.4320297951582868, + "grad_norm": 1.6136518716812134, + "learning_rate": 4.392466067071051e-06, + "loss": 1.0405, + "step": 1538 + }, + { + "epoch": 1.4329608938547487, + "grad_norm": 1.6023740768432617, + "learning_rate": 4.391656876683875e-06, + "loss": 0.9989, + "step": 1539 + }, + { + "epoch": 1.4338919925512104, + "grad_norm": 1.6204147338867188, + "learning_rate": 4.390847222405907e-06, + "loss": 1.0159, + "step": 1540 + }, + { + "epoch": 1.4348230912476723, + "grad_norm": 1.537744164466858, + "learning_rate": 4.3900371044357e-06, + "loss": 0.9919, + "step": 1541 + }, + { + "epoch": 1.435754189944134, + "grad_norm": 1.5629849433898926, + "learning_rate": 4.389226522971917e-06, + "loss": 1.0466, + "step": 1542 + }, + { + "epoch": 1.4366852886405959, + "grad_norm": 1.5706342458724976, + "learning_rate": 4.388415478213337e-06, + "loss": 1.0111, + "step": 1543 + }, + { + "epoch": 1.4376163873370578, + "grad_norm": 1.628672480583191, + "learning_rate": 4.387603970358856e-06, + "loss": 1.0371, + "step": 1544 + }, + { + "epoch": 1.4385474860335195, + "grad_norm": 1.5431934595108032, + "learning_rate": 4.386791999607476e-06, + "loss": 1.0496, + "step": 1545 + }, + { + "epoch": 1.4394785847299814, + "grad_norm": 1.5898290872573853, + "learning_rate": 4.385979566158318e-06, + "loss": 1.0132, + "step": 1546 + }, + { + "epoch": 1.4404096834264433, + "grad_norm": 1.5480161905288696, + "learning_rate": 4.385166670210615e-06, + "loss": 1.0285, + "step": 1547 + }, + { + "epoch": 1.441340782122905, + "grad_norm": 1.5950345993041992, + "learning_rate": 4.384353311963713e-06, + "loss": 1.0059, + "step": 1548 + }, + { + "epoch": 1.4422718808193669, + "grad_norm": 1.5423212051391602, + "learning_rate": 4.383539491617073e-06, + "loss": 1.0412, + "step": 1549 + }, + { + "epoch": 1.4432029795158288, + "grad_norm": 1.5239518880844116, + "learning_rate": 4.382725209370266e-06, + "loss": 0.984, + "step": 1550 + }, + { + "epoch": 1.4441340782122905, + "grad_norm": 1.771134853363037, + "learning_rate": 4.38191046542298e-06, + "loss": 1.0527, + "step": 1551 + }, + { + "epoch": 1.4450651769087524, + "grad_norm": 1.5928163528442383, + "learning_rate": 4.381095259975013e-06, + "loss": 1.0041, + "step": 1552 + }, + { + "epoch": 1.4459962756052143, + "grad_norm": 1.5586247444152832, + "learning_rate": 4.38027959322628e-06, + "loss": 1.0317, + "step": 1553 + }, + { + "epoch": 1.446927374301676, + "grad_norm": 1.601347804069519, + "learning_rate": 4.379463465376805e-06, + "loss": 1.0223, + "step": 1554 + }, + { + "epoch": 1.4478584729981379, + "grad_norm": 1.5670347213745117, + "learning_rate": 4.378646876626728e-06, + "loss": 0.986, + "step": 1555 + }, + { + "epoch": 1.4487895716945998, + "grad_norm": 1.5701487064361572, + "learning_rate": 4.3778298271762995e-06, + "loss": 1.0203, + "step": 1556 + }, + { + "epoch": 1.4497206703910615, + "grad_norm": 1.572339653968811, + "learning_rate": 4.377012317225886e-06, + "loss": 1.0236, + "step": 1557 + }, + { + "epoch": 1.4506517690875234, + "grad_norm": 1.579890251159668, + "learning_rate": 4.376194346975965e-06, + "loss": 1.0269, + "step": 1558 + }, + { + "epoch": 1.451582867783985, + "grad_norm": 1.661680817604065, + "learning_rate": 4.375375916627127e-06, + "loss": 0.9753, + "step": 1559 + }, + { + "epoch": 1.452513966480447, + "grad_norm": 1.7139066457748413, + "learning_rate": 4.374557026380075e-06, + "loss": 1.0403, + "step": 1560 + }, + { + "epoch": 1.4534450651769086, + "grad_norm": 1.5265493392944336, + "learning_rate": 4.373737676435627e-06, + "loss": 1.0078, + "step": 1561 + }, + { + "epoch": 1.4543761638733705, + "grad_norm": 1.4877139329910278, + "learning_rate": 4.37291786699471e-06, + "loss": 0.99, + "step": 1562 + }, + { + "epoch": 1.4553072625698324, + "grad_norm": 1.6461578607559204, + "learning_rate": 4.372097598258368e-06, + "loss": 1.0565, + "step": 1563 + }, + { + "epoch": 1.4562383612662941, + "grad_norm": 1.5626616477966309, + "learning_rate": 4.3712768704277535e-06, + "loss": 1.0245, + "step": 1564 + }, + { + "epoch": 1.457169459962756, + "grad_norm": 1.5262725353240967, + "learning_rate": 4.370455683704134e-06, + "loss": 0.9997, + "step": 1565 + }, + { + "epoch": 1.458100558659218, + "grad_norm": 1.5577821731567383, + "learning_rate": 4.369634038288889e-06, + "loss": 1.0397, + "step": 1566 + }, + { + "epoch": 1.4590316573556796, + "grad_norm": 1.558105230331421, + "learning_rate": 4.368811934383511e-06, + "loss": 1.0504, + "step": 1567 + }, + { + "epoch": 1.4599627560521415, + "grad_norm": 1.6111888885498047, + "learning_rate": 4.367989372189605e-06, + "loss": 1.0177, + "step": 1568 + }, + { + "epoch": 1.4608938547486034, + "grad_norm": 1.5383460521697998, + "learning_rate": 4.367166351908886e-06, + "loss": 0.9705, + "step": 1569 + }, + { + "epoch": 1.4618249534450651, + "grad_norm": 1.5505338907241821, + "learning_rate": 4.366342873743184e-06, + "loss": 1.0018, + "step": 1570 + }, + { + "epoch": 1.462756052141527, + "grad_norm": 1.5472747087478638, + "learning_rate": 4.365518937894442e-06, + "loss": 1.0328, + "step": 1571 + }, + { + "epoch": 1.463687150837989, + "grad_norm": 1.6104297637939453, + "learning_rate": 4.364694544564711e-06, + "loss": 1.0077, + "step": 1572 + }, + { + "epoch": 1.4646182495344506, + "grad_norm": 1.5873563289642334, + "learning_rate": 4.3638696939561595e-06, + "loss": 1.0187, + "step": 1573 + }, + { + "epoch": 1.4655493482309125, + "grad_norm": 1.6917014122009277, + "learning_rate": 4.363044386271063e-06, + "loss": 1.0458, + "step": 1574 + }, + { + "epoch": 1.4664804469273742, + "grad_norm": 1.612997055053711, + "learning_rate": 4.3622186217118135e-06, + "loss": 1.0292, + "step": 1575 + }, + { + "epoch": 1.4674115456238361, + "grad_norm": 1.602902889251709, + "learning_rate": 4.361392400480912e-06, + "loss": 1.019, + "step": 1576 + }, + { + "epoch": 1.4683426443202978, + "grad_norm": 1.5190726518630981, + "learning_rate": 4.360565722780974e-06, + "loss": 0.9725, + "step": 1577 + }, + { + "epoch": 1.4692737430167597, + "grad_norm": 1.535841464996338, + "learning_rate": 4.3597385888147235e-06, + "loss": 0.988, + "step": 1578 + }, + { + "epoch": 1.4702048417132216, + "grad_norm": 1.4984718561172485, + "learning_rate": 4.358910998785001e-06, + "loss": 0.9924, + "step": 1579 + }, + { + "epoch": 1.4711359404096833, + "grad_norm": 1.6019173860549927, + "learning_rate": 4.358082952894753e-06, + "loss": 1.053, + "step": 1580 + }, + { + "epoch": 1.4720670391061452, + "grad_norm": 1.569354772567749, + "learning_rate": 4.357254451347045e-06, + "loss": 1.0012, + "step": 1581 + }, + { + "epoch": 1.4729981378026071, + "grad_norm": 1.5266960859298706, + "learning_rate": 4.356425494345047e-06, + "loss": 0.9885, + "step": 1582 + }, + { + "epoch": 1.4739292364990688, + "grad_norm": 1.5382027626037598, + "learning_rate": 4.3555960820920465e-06, + "loss": 1.0262, + "step": 1583 + }, + { + "epoch": 1.4748603351955307, + "grad_norm": 1.5143629312515259, + "learning_rate": 4.354766214791439e-06, + "loss": 0.9881, + "step": 1584 + }, + { + "epoch": 1.4757914338919926, + "grad_norm": 1.5462934970855713, + "learning_rate": 4.353935892646732e-06, + "loss": 1.029, + "step": 1585 + }, + { + "epoch": 1.4767225325884543, + "grad_norm": 1.58848237991333, + "learning_rate": 4.353105115861546e-06, + "loss": 1.0053, + "step": 1586 + }, + { + "epoch": 1.4776536312849162, + "grad_norm": 1.5712151527404785, + "learning_rate": 4.352273884639613e-06, + "loss": 0.9747, + "step": 1587 + }, + { + "epoch": 1.4785847299813781, + "grad_norm": 1.526538372039795, + "learning_rate": 4.351442199184776e-06, + "loss": 1.0062, + "step": 1588 + }, + { + "epoch": 1.4795158286778398, + "grad_norm": 1.5572766065597534, + "learning_rate": 4.350610059700986e-06, + "loss": 1.0126, + "step": 1589 + }, + { + "epoch": 1.4804469273743017, + "grad_norm": 1.522646427154541, + "learning_rate": 4.349777466392313e-06, + "loss": 0.9894, + "step": 1590 + }, + { + "epoch": 1.4813780260707636, + "grad_norm": 1.5419663190841675, + "learning_rate": 4.34894441946293e-06, + "loss": 0.9672, + "step": 1591 + }, + { + "epoch": 1.4823091247672253, + "grad_norm": 1.551070213317871, + "learning_rate": 4.348110919117128e-06, + "loss": 1.0119, + "step": 1592 + }, + { + "epoch": 1.4832402234636872, + "grad_norm": 1.5017564296722412, + "learning_rate": 4.3472769655593035e-06, + "loss": 1.0066, + "step": 1593 + }, + { + "epoch": 1.484171322160149, + "grad_norm": 1.5424667596817017, + "learning_rate": 4.346442558993969e-06, + "loss": 1.025, + "step": 1594 + }, + { + "epoch": 1.4851024208566108, + "grad_norm": 1.5252048969268799, + "learning_rate": 4.345607699625744e-06, + "loss": 1.0331, + "step": 1595 + }, + { + "epoch": 1.4860335195530725, + "grad_norm": 1.5838569402694702, + "learning_rate": 4.344772387659362e-06, + "loss": 1.0777, + "step": 1596 + }, + { + "epoch": 1.4869646182495344, + "grad_norm": 1.491519570350647, + "learning_rate": 4.343936623299667e-06, + "loss": 0.9835, + "step": 1597 + }, + { + "epoch": 1.4878957169459963, + "grad_norm": 1.5592520236968994, + "learning_rate": 4.343100406751612e-06, + "loss": 0.9963, + "step": 1598 + }, + { + "epoch": 1.488826815642458, + "grad_norm": 1.525503158569336, + "learning_rate": 4.342263738220264e-06, + "loss": 0.9823, + "step": 1599 + }, + { + "epoch": 1.48975791433892, + "grad_norm": 1.5770426988601685, + "learning_rate": 4.3414266179107975e-06, + "loss": 1.0168, + "step": 1600 + }, + { + "epoch": 1.4906890130353818, + "grad_norm": 1.5425713062286377, + "learning_rate": 4.340589046028501e-06, + "loss": 0.9938, + "step": 1601 + }, + { + "epoch": 1.4916201117318435, + "grad_norm": 1.5985755920410156, + "learning_rate": 4.33975102277877e-06, + "loss": 0.9895, + "step": 1602 + }, + { + "epoch": 1.4925512104283054, + "grad_norm": 1.6213315725326538, + "learning_rate": 4.3389125483671145e-06, + "loss": 1.0204, + "step": 1603 + }, + { + "epoch": 1.4934823091247673, + "grad_norm": 1.5698057413101196, + "learning_rate": 4.3380736229991535e-06, + "loss": 1.0563, + "step": 1604 + }, + { + "epoch": 1.494413407821229, + "grad_norm": 1.6291435956954956, + "learning_rate": 4.337234246880616e-06, + "loss": 1.0361, + "step": 1605 + }, + { + "epoch": 1.495344506517691, + "grad_norm": 1.4916856288909912, + "learning_rate": 4.336394420217342e-06, + "loss": 1.0341, + "step": 1606 + }, + { + "epoch": 1.4962756052141528, + "grad_norm": 1.5709009170532227, + "learning_rate": 4.3355541432152826e-06, + "loss": 1.0507, + "step": 1607 + }, + { + "epoch": 1.4972067039106145, + "grad_norm": 1.554448127746582, + "learning_rate": 4.334713416080498e-06, + "loss": 1.0495, + "step": 1608 + }, + { + "epoch": 1.4981378026070764, + "grad_norm": 1.5857200622558594, + "learning_rate": 4.3338722390191615e-06, + "loss": 1.065, + "step": 1609 + }, + { + "epoch": 1.499068901303538, + "grad_norm": 1.514541745185852, + "learning_rate": 4.3330306122375524e-06, + "loss": 1.0344, + "step": 1610 + }, + { + "epoch": 1.5, + "grad_norm": 1.496036410331726, + "learning_rate": 4.3321885359420635e-06, + "loss": 1.0013, + "step": 1611 + }, + { + "epoch": 1.5009310986964617, + "grad_norm": 1.5402806997299194, + "learning_rate": 4.331346010339198e-06, + "loss": 1.049, + "step": 1612 + }, + { + "epoch": 1.5018621973929238, + "grad_norm": 1.5287505388259888, + "learning_rate": 4.330503035635568e-06, + "loss": 1.0085, + "step": 1613 + }, + { + "epoch": 1.5027932960893855, + "grad_norm": 1.5743392705917358, + "learning_rate": 4.329659612037895e-06, + "loss": 0.996, + "step": 1614 + }, + { + "epoch": 1.5037243947858472, + "grad_norm": 1.5880268812179565, + "learning_rate": 4.3288157397530135e-06, + "loss": 1.0273, + "step": 1615 + }, + { + "epoch": 1.504655493482309, + "grad_norm": 1.5983085632324219, + "learning_rate": 4.327971418987866e-06, + "loss": 0.9985, + "step": 1616 + }, + { + "epoch": 1.505586592178771, + "grad_norm": 1.6333311796188354, + "learning_rate": 4.327126649949504e-06, + "loss": 1.0232, + "step": 1617 + }, + { + "epoch": 1.5065176908752327, + "grad_norm": 1.74416184425354, + "learning_rate": 4.326281432845089e-06, + "loss": 1.0844, + "step": 1618 + }, + { + "epoch": 1.5074487895716946, + "grad_norm": 1.5786820650100708, + "learning_rate": 4.325435767881896e-06, + "loss": 1.0042, + "step": 1619 + }, + { + "epoch": 1.5083798882681565, + "grad_norm": 1.5413870811462402, + "learning_rate": 4.324589655267306e-06, + "loss": 1.0145, + "step": 1620 + }, + { + "epoch": 1.5093109869646182, + "grad_norm": 1.6098166704177856, + "learning_rate": 4.323743095208812e-06, + "loss": 1.0154, + "step": 1621 + }, + { + "epoch": 1.51024208566108, + "grad_norm": 1.5322644710540771, + "learning_rate": 4.322896087914016e-06, + "loss": 1.0256, + "step": 1622 + }, + { + "epoch": 1.511173184357542, + "grad_norm": 1.534049153327942, + "learning_rate": 4.322048633590628e-06, + "loss": 1.0344, + "step": 1623 + }, + { + "epoch": 1.5121042830540037, + "grad_norm": 1.6334002017974854, + "learning_rate": 4.3212007324464684e-06, + "loss": 1.0057, + "step": 1624 + }, + { + "epoch": 1.5130353817504656, + "grad_norm": 1.5892307758331299, + "learning_rate": 4.3203523846894715e-06, + "loss": 0.985, + "step": 1625 + }, + { + "epoch": 1.5139664804469275, + "grad_norm": 1.5497863292694092, + "learning_rate": 4.319503590527675e-06, + "loss": 1.0127, + "step": 1626 + }, + { + "epoch": 1.5148975791433892, + "grad_norm": 1.5987825393676758, + "learning_rate": 4.318654350169228e-06, + "loss": 0.9887, + "step": 1627 + }, + { + "epoch": 1.5158286778398509, + "grad_norm": 1.5655267238616943, + "learning_rate": 4.317804663822391e-06, + "loss": 0.9991, + "step": 1628 + }, + { + "epoch": 1.516759776536313, + "grad_norm": 1.5631802082061768, + "learning_rate": 4.316954531695533e-06, + "loss": 0.988, + "step": 1629 + }, + { + "epoch": 1.5176908752327747, + "grad_norm": 1.5858218669891357, + "learning_rate": 4.31610395399713e-06, + "loss": 1.0262, + "step": 1630 + }, + { + "epoch": 1.5186219739292364, + "grad_norm": 1.6221550703048706, + "learning_rate": 4.315252930935771e-06, + "loss": 1.0221, + "step": 1631 + }, + { + "epoch": 1.5195530726256983, + "grad_norm": 1.6089915037155151, + "learning_rate": 4.31440146272015e-06, + "loss": 1.0106, + "step": 1632 + }, + { + "epoch": 1.5204841713221602, + "grad_norm": 1.5913159847259521, + "learning_rate": 4.313549549559074e-06, + "loss": 1.0143, + "step": 1633 + }, + { + "epoch": 1.5214152700186219, + "grad_norm": 1.49701726436615, + "learning_rate": 4.312697191661457e-06, + "loss": 0.9691, + "step": 1634 + }, + { + "epoch": 1.5223463687150838, + "grad_norm": 1.5818606615066528, + "learning_rate": 4.311844389236324e-06, + "loss": 1.0178, + "step": 1635 + }, + { + "epoch": 1.5232774674115457, + "grad_norm": 1.660035490989685, + "learning_rate": 4.310991142492806e-06, + "loss": 0.98, + "step": 1636 + }, + { + "epoch": 1.5242085661080074, + "grad_norm": 1.5978046655654907, + "learning_rate": 4.310137451640144e-06, + "loss": 1.0154, + "step": 1637 + }, + { + "epoch": 1.5251396648044693, + "grad_norm": 1.5881710052490234, + "learning_rate": 4.309283316887691e-06, + "loss": 1.0269, + "step": 1638 + }, + { + "epoch": 1.5260707635009312, + "grad_norm": 1.5614145994186401, + "learning_rate": 4.308428738444904e-06, + "loss": 1.0075, + "step": 1639 + }, + { + "epoch": 1.5270018621973929, + "grad_norm": 1.546956181526184, + "learning_rate": 4.307573716521353e-06, + "loss": 1.0178, + "step": 1640 + }, + { + "epoch": 1.5279329608938548, + "grad_norm": 1.6735785007476807, + "learning_rate": 4.306718251326714e-06, + "loss": 1.0419, + "step": 1641 + }, + { + "epoch": 1.5288640595903167, + "grad_norm": 1.528800129890442, + "learning_rate": 4.305862343070772e-06, + "loss": 0.9834, + "step": 1642 + }, + { + "epoch": 1.5297951582867784, + "grad_norm": 1.5266664028167725, + "learning_rate": 4.305005991963423e-06, + "loss": 1.0035, + "step": 1643 + }, + { + "epoch": 1.5307262569832403, + "grad_norm": 1.5515495538711548, + "learning_rate": 4.304149198214669e-06, + "loss": 1.0017, + "step": 1644 + }, + { + "epoch": 1.5316573556797022, + "grad_norm": 1.508175253868103, + "learning_rate": 4.30329196203462e-06, + "loss": 0.95, + "step": 1645 + }, + { + "epoch": 1.5325884543761639, + "grad_norm": 1.6205800771713257, + "learning_rate": 4.302434283633499e-06, + "loss": 1.0154, + "step": 1646 + }, + { + "epoch": 1.5335195530726256, + "grad_norm": 1.5815434455871582, + "learning_rate": 4.301576163221631e-06, + "loss": 1.0083, + "step": 1647 + }, + { + "epoch": 1.5344506517690877, + "grad_norm": 1.5696470737457275, + "learning_rate": 4.3007176010094545e-06, + "loss": 1.0549, + "step": 1648 + }, + { + "epoch": 1.5353817504655494, + "grad_norm": 1.6360572576522827, + "learning_rate": 4.299858597207514e-06, + "loss": 1.0432, + "step": 1649 + }, + { + "epoch": 1.536312849162011, + "grad_norm": 1.6072953939437866, + "learning_rate": 4.298999152026465e-06, + "loss": 1.0322, + "step": 1650 + }, + { + "epoch": 1.537243947858473, + "grad_norm": 1.5885045528411865, + "learning_rate": 4.298139265677067e-06, + "loss": 0.9939, + "step": 1651 + }, + { + "epoch": 1.5381750465549349, + "grad_norm": 1.584114909172058, + "learning_rate": 4.29727893837019e-06, + "loss": 0.9872, + "step": 1652 + }, + { + "epoch": 1.5391061452513966, + "grad_norm": 1.5574947595596313, + "learning_rate": 4.296418170316813e-06, + "loss": 1.04, + "step": 1653 + }, + { + "epoch": 1.5400372439478585, + "grad_norm": 1.6010804176330566, + "learning_rate": 4.29555696172802e-06, + "loss": 1.0243, + "step": 1654 + }, + { + "epoch": 1.5409683426443204, + "grad_norm": 1.6047048568725586, + "learning_rate": 4.294695312815008e-06, + "loss": 1.043, + "step": 1655 + }, + { + "epoch": 1.541899441340782, + "grad_norm": 1.6073797941207886, + "learning_rate": 4.293833223789076e-06, + "loss": 1.019, + "step": 1656 + }, + { + "epoch": 1.542830540037244, + "grad_norm": 1.5430538654327393, + "learning_rate": 4.292970694861636e-06, + "loss": 0.9849, + "step": 1657 + }, + { + "epoch": 1.5437616387337059, + "grad_norm": 1.6261460781097412, + "learning_rate": 4.292107726244206e-06, + "loss": 1.0406, + "step": 1658 + }, + { + "epoch": 1.5446927374301676, + "grad_norm": 1.582450270652771, + "learning_rate": 4.291244318148411e-06, + "loss": 1.021, + "step": 1659 + }, + { + "epoch": 1.5456238361266295, + "grad_norm": 1.5819180011749268, + "learning_rate": 4.290380470785984e-06, + "loss": 1.0121, + "step": 1660 + }, + { + "epoch": 1.5465549348230914, + "grad_norm": 1.5410597324371338, + "learning_rate": 4.289516184368766e-06, + "loss": 1.012, + "step": 1661 + }, + { + "epoch": 1.547486033519553, + "grad_norm": 1.5834349393844604, + "learning_rate": 4.288651459108708e-06, + "loss": 0.9964, + "step": 1662 + }, + { + "epoch": 1.5484171322160147, + "grad_norm": 1.5701252222061157, + "learning_rate": 4.287786295217864e-06, + "loss": 1.0471, + "step": 1663 + }, + { + "epoch": 1.5493482309124769, + "grad_norm": 1.610573172569275, + "learning_rate": 4.286920692908399e-06, + "loss": 1.0419, + "step": 1664 + }, + { + "epoch": 1.5502793296089385, + "grad_norm": 1.5417912006378174, + "learning_rate": 4.286054652392586e-06, + "loss": 0.9699, + "step": 1665 + }, + { + "epoch": 1.5512104283054002, + "grad_norm": 1.555499792098999, + "learning_rate": 4.285188173882802e-06, + "loss": 1.0109, + "step": 1666 + }, + { + "epoch": 1.5521415270018621, + "grad_norm": 1.5716497898101807, + "learning_rate": 4.284321257591533e-06, + "loss": 1.0581, + "step": 1667 + }, + { + "epoch": 1.553072625698324, + "grad_norm": 1.6050313711166382, + "learning_rate": 4.283453903731375e-06, + "loss": 0.9937, + "step": 1668 + }, + { + "epoch": 1.5540037243947857, + "grad_norm": 1.5794506072998047, + "learning_rate": 4.282586112515027e-06, + "loss": 1.0522, + "step": 1669 + }, + { + "epoch": 1.5549348230912476, + "grad_norm": 1.5321145057678223, + "learning_rate": 4.2817178841552985e-06, + "loss": 1.0093, + "step": 1670 + }, + { + "epoch": 1.5558659217877095, + "grad_norm": 1.5704569816589355, + "learning_rate": 4.2808492188651054e-06, + "loss": 1.0276, + "step": 1671 + }, + { + "epoch": 1.5567970204841712, + "grad_norm": 1.5735173225402832, + "learning_rate": 4.279980116857469e-06, + "loss": 1.0586, + "step": 1672 + }, + { + "epoch": 1.5577281191806331, + "grad_norm": 1.586900234222412, + "learning_rate": 4.27911057834552e-06, + "loss": 1.0028, + "step": 1673 + }, + { + "epoch": 1.558659217877095, + "grad_norm": 1.5465446710586548, + "learning_rate": 4.278240603542496e-06, + "loss": 0.9711, + "step": 1674 + }, + { + "epoch": 1.5595903165735567, + "grad_norm": 1.613728404045105, + "learning_rate": 4.27737019266174e-06, + "loss": 1.0134, + "step": 1675 + }, + { + "epoch": 1.5605214152700186, + "grad_norm": 1.5634862184524536, + "learning_rate": 4.276499345916701e-06, + "loss": 0.9971, + "step": 1676 + }, + { + "epoch": 1.5614525139664805, + "grad_norm": 1.5275167226791382, + "learning_rate": 4.275628063520939e-06, + "loss": 1.0019, + "step": 1677 + }, + { + "epoch": 1.5623836126629422, + "grad_norm": 1.6941787004470825, + "learning_rate": 4.274756345688118e-06, + "loss": 1.016, + "step": 1678 + }, + { + "epoch": 1.5633147113594041, + "grad_norm": 1.5935890674591064, + "learning_rate": 4.2738841926320095e-06, + "loss": 1.0739, + "step": 1679 + }, + { + "epoch": 1.564245810055866, + "grad_norm": 1.5766217708587646, + "learning_rate": 4.27301160456649e-06, + "loss": 0.99, + "step": 1680 + }, + { + "epoch": 1.5651769087523277, + "grad_norm": 1.532702922821045, + "learning_rate": 4.2721385817055465e-06, + "loss": 0.9736, + "step": 1681 + }, + { + "epoch": 1.5661080074487894, + "grad_norm": 1.4808470010757446, + "learning_rate": 4.271265124263267e-06, + "loss": 0.9814, + "step": 1682 + }, + { + "epoch": 1.5670391061452515, + "grad_norm": 1.5192599296569824, + "learning_rate": 4.270391232453853e-06, + "loss": 1.0212, + "step": 1683 + }, + { + "epoch": 1.5679702048417132, + "grad_norm": 1.5223890542984009, + "learning_rate": 4.269516906491607e-06, + "loss": 1.0154, + "step": 1684 + }, + { + "epoch": 1.568901303538175, + "grad_norm": 1.5646499395370483, + "learning_rate": 4.26864214659094e-06, + "loss": 0.9994, + "step": 1685 + }, + { + "epoch": 1.5698324022346368, + "grad_norm": 1.5947473049163818, + "learning_rate": 4.267766952966369e-06, + "loss": 1.0549, + "step": 1686 + }, + { + "epoch": 1.5707635009310987, + "grad_norm": 1.5698765516281128, + "learning_rate": 4.2668913258325186e-06, + "loss": 0.9759, + "step": 1687 + }, + { + "epoch": 1.5716945996275604, + "grad_norm": 1.563552975654602, + "learning_rate": 4.266015265404118e-06, + "loss": 1.0288, + "step": 1688 + }, + { + "epoch": 1.5726256983240223, + "grad_norm": 1.5093134641647339, + "learning_rate": 4.265138771896003e-06, + "loss": 0.9821, + "step": 1689 + }, + { + "epoch": 1.5735567970204842, + "grad_norm": 1.45540189743042, + "learning_rate": 4.264261845523117e-06, + "loss": 0.9767, + "step": 1690 + }, + { + "epoch": 1.574487895716946, + "grad_norm": 1.6118829250335693, + "learning_rate": 4.263384486500508e-06, + "loss": 1.0468, + "step": 1691 + }, + { + "epoch": 1.5754189944134078, + "grad_norm": 1.5699982643127441, + "learning_rate": 4.2625066950433305e-06, + "loss": 1.0319, + "step": 1692 + }, + { + "epoch": 1.5763500931098697, + "grad_norm": 1.5150530338287354, + "learning_rate": 4.261628471366845e-06, + "loss": 0.9948, + "step": 1693 + }, + { + "epoch": 1.5772811918063314, + "grad_norm": 1.581607460975647, + "learning_rate": 4.260749815686419e-06, + "loss": 1.0573, + "step": 1694 + }, + { + "epoch": 1.5782122905027933, + "grad_norm": 1.5703697204589844, + "learning_rate": 4.259870728217525e-06, + "loss": 1.0074, + "step": 1695 + }, + { + "epoch": 1.5791433891992552, + "grad_norm": 1.597098469734192, + "learning_rate": 4.2589912091757415e-06, + "loss": 1.0501, + "step": 1696 + }, + { + "epoch": 1.580074487895717, + "grad_norm": 1.5741326808929443, + "learning_rate": 4.258111258776751e-06, + "loss": 1.0303, + "step": 1697 + }, + { + "epoch": 1.5810055865921788, + "grad_norm": 1.583868145942688, + "learning_rate": 4.257230877236347e-06, + "loss": 1.0344, + "step": 1698 + }, + { + "epoch": 1.5819366852886407, + "grad_norm": 1.5371819734573364, + "learning_rate": 4.256350064770424e-06, + "loss": 1.0098, + "step": 1699 + }, + { + "epoch": 1.5828677839851024, + "grad_norm": 1.5712038278579712, + "learning_rate": 4.255468821594981e-06, + "loss": 1.0294, + "step": 1700 + }, + { + "epoch": 1.583798882681564, + "grad_norm": 1.5606169700622559, + "learning_rate": 4.254587147926129e-06, + "loss": 1.0818, + "step": 1701 + }, + { + "epoch": 1.5847299813780262, + "grad_norm": 1.6024121046066284, + "learning_rate": 4.2537050439800775e-06, + "loss": 1.0258, + "step": 1702 + }, + { + "epoch": 1.585661080074488, + "grad_norm": 1.604117512702942, + "learning_rate": 4.252822509973148e-06, + "loss": 1.0501, + "step": 1703 + }, + { + "epoch": 1.5865921787709496, + "grad_norm": 1.5062448978424072, + "learning_rate": 4.251939546121761e-06, + "loss": 1.0455, + "step": 1704 + }, + { + "epoch": 1.5875232774674115, + "grad_norm": 1.592667579650879, + "learning_rate": 4.251056152642448e-06, + "loss": 0.9994, + "step": 1705 + }, + { + "epoch": 1.5884543761638734, + "grad_norm": 1.587271809577942, + "learning_rate": 4.250172329751843e-06, + "loss": 0.9718, + "step": 1706 + }, + { + "epoch": 1.589385474860335, + "grad_norm": 1.6200915575027466, + "learning_rate": 4.249288077666684e-06, + "loss": 1.0291, + "step": 1707 + }, + { + "epoch": 1.590316573556797, + "grad_norm": 1.5204311609268188, + "learning_rate": 4.248403396603818e-06, + "loss": 0.9555, + "step": 1708 + }, + { + "epoch": 1.591247672253259, + "grad_norm": 1.5054893493652344, + "learning_rate": 4.2475182867801945e-06, + "loss": 0.985, + "step": 1709 + }, + { + "epoch": 1.5921787709497206, + "grad_norm": 1.698233723640442, + "learning_rate": 4.246632748412869e-06, + "loss": 1.0638, + "step": 1710 + }, + { + "epoch": 1.5931098696461825, + "grad_norm": 1.586362361907959, + "learning_rate": 4.245746781719002e-06, + "loss": 0.9907, + "step": 1711 + }, + { + "epoch": 1.5940409683426444, + "grad_norm": 1.6033439636230469, + "learning_rate": 4.2448603869158585e-06, + "loss": 1.0306, + "step": 1712 + }, + { + "epoch": 1.594972067039106, + "grad_norm": 1.551690697669983, + "learning_rate": 4.243973564220811e-06, + "loss": 0.9998, + "step": 1713 + }, + { + "epoch": 1.595903165735568, + "grad_norm": 1.5957772731781006, + "learning_rate": 4.243086313851332e-06, + "loss": 1.0524, + "step": 1714 + }, + { + "epoch": 1.59683426443203, + "grad_norm": 1.5673012733459473, + "learning_rate": 4.242198636025004e-06, + "loss": 1.0687, + "step": 1715 + }, + { + "epoch": 1.5977653631284916, + "grad_norm": 1.5716279745101929, + "learning_rate": 4.241310530959511e-06, + "loss": 1.0395, + "step": 1716 + }, + { + "epoch": 1.5986964618249533, + "grad_norm": 1.5629993677139282, + "learning_rate": 4.240421998872643e-06, + "loss": 1.0239, + "step": 1717 + }, + { + "epoch": 1.5996275605214154, + "grad_norm": 1.5452954769134521, + "learning_rate": 4.239533039982295e-06, + "loss": 0.9931, + "step": 1718 + }, + { + "epoch": 1.600558659217877, + "grad_norm": 1.561960220336914, + "learning_rate": 4.238643654506466e-06, + "loss": 1.0137, + "step": 1719 + }, + { + "epoch": 1.6014897579143388, + "grad_norm": 1.5363296270370483, + "learning_rate": 4.237753842663259e-06, + "loss": 1.0249, + "step": 1720 + }, + { + "epoch": 1.6024208566108007, + "grad_norm": 1.5614672899246216, + "learning_rate": 4.236863604670885e-06, + "loss": 1.0258, + "step": 1721 + }, + { + "epoch": 1.6033519553072626, + "grad_norm": 1.493328332901001, + "learning_rate": 4.235972940747655e-06, + "loss": 1.0081, + "step": 1722 + }, + { + "epoch": 1.6042830540037243, + "grad_norm": 1.5374797582626343, + "learning_rate": 4.235081851111987e-06, + "loss": 1.0028, + "step": 1723 + }, + { + "epoch": 1.6052141527001862, + "grad_norm": 1.5515986680984497, + "learning_rate": 4.234190335982402e-06, + "loss": 1.0305, + "step": 1724 + }, + { + "epoch": 1.606145251396648, + "grad_norm": 1.576271891593933, + "learning_rate": 4.233298395577527e-06, + "loss": 1.0491, + "step": 1725 + }, + { + "epoch": 1.6070763500931098, + "grad_norm": 1.4784661531448364, + "learning_rate": 4.232406030116093e-06, + "loss": 0.9514, + "step": 1726 + }, + { + "epoch": 1.6080074487895717, + "grad_norm": 1.55087149143219, + "learning_rate": 4.231513239816933e-06, + "loss": 1.035, + "step": 1727 + }, + { + "epoch": 1.6089385474860336, + "grad_norm": 1.5262744426727295, + "learning_rate": 4.230620024898987e-06, + "loss": 0.9637, + "step": 1728 + }, + { + "epoch": 1.6098696461824953, + "grad_norm": 1.5404300689697266, + "learning_rate": 4.229726385581298e-06, + "loss": 0.9982, + "step": 1729 + }, + { + "epoch": 1.6108007448789572, + "grad_norm": 1.5411280393600464, + "learning_rate": 4.228832322083013e-06, + "loss": 0.9682, + "step": 1730 + }, + { + "epoch": 1.611731843575419, + "grad_norm": 1.5592252016067505, + "learning_rate": 4.227937834623382e-06, + "loss": 0.995, + "step": 1731 + }, + { + "epoch": 1.6126629422718808, + "grad_norm": 1.5622261762619019, + "learning_rate": 4.227042923421762e-06, + "loss": 1.0032, + "step": 1732 + }, + { + "epoch": 1.6135940409683427, + "grad_norm": 1.6128736734390259, + "learning_rate": 4.22614758869761e-06, + "loss": 1.0497, + "step": 1733 + }, + { + "epoch": 1.6145251396648046, + "grad_norm": 1.5503853559494019, + "learning_rate": 4.22525183067049e-06, + "loss": 1.0408, + "step": 1734 + }, + { + "epoch": 1.6154562383612663, + "grad_norm": 1.5317991971969604, + "learning_rate": 4.224355649560069e-06, + "loss": 0.975, + "step": 1735 + }, + { + "epoch": 1.616387337057728, + "grad_norm": 1.544202446937561, + "learning_rate": 4.223459045586115e-06, + "loss": 0.9766, + "step": 1736 + }, + { + "epoch": 1.61731843575419, + "grad_norm": 1.6110060214996338, + "learning_rate": 4.222562018968506e-06, + "loss": 1.0375, + "step": 1737 + }, + { + "epoch": 1.6182495344506518, + "grad_norm": 1.545624017715454, + "learning_rate": 4.221664569927217e-06, + "loss": 1.0226, + "step": 1738 + }, + { + "epoch": 1.6191806331471135, + "grad_norm": 1.5491719245910645, + "learning_rate": 4.2207666986823295e-06, + "loss": 0.9853, + "step": 1739 + }, + { + "epoch": 1.6201117318435754, + "grad_norm": 1.5949021577835083, + "learning_rate": 4.219868405454029e-06, + "loss": 1.0244, + "step": 1740 + }, + { + "epoch": 1.6210428305400373, + "grad_norm": 1.5667827129364014, + "learning_rate": 4.218969690462603e-06, + "loss": 1.0295, + "step": 1741 + }, + { + "epoch": 1.621973929236499, + "grad_norm": 1.5576833486557007, + "learning_rate": 4.218070553928444e-06, + "loss": 1.0323, + "step": 1742 + }, + { + "epoch": 1.6229050279329609, + "grad_norm": 1.5543231964111328, + "learning_rate": 4.217170996072048e-06, + "loss": 1.0304, + "step": 1743 + }, + { + "epoch": 1.6238361266294228, + "grad_norm": 1.523926854133606, + "learning_rate": 4.216271017114012e-06, + "loss": 1.0071, + "step": 1744 + }, + { + "epoch": 1.6247672253258845, + "grad_norm": 1.5696873664855957, + "learning_rate": 4.2153706172750375e-06, + "loss": 0.9715, + "step": 1745 + }, + { + "epoch": 1.6256983240223464, + "grad_norm": 1.656225562095642, + "learning_rate": 4.2144697967759315e-06, + "loss": 1.0515, + "step": 1746 + }, + { + "epoch": 1.6266294227188083, + "grad_norm": 1.5823173522949219, + "learning_rate": 4.2135685558376e-06, + "loss": 1.0513, + "step": 1747 + }, + { + "epoch": 1.62756052141527, + "grad_norm": 1.5069504976272583, + "learning_rate": 4.212666894681054e-06, + "loss": 0.9981, + "step": 1748 + }, + { + "epoch": 1.6284916201117319, + "grad_norm": 1.5372499227523804, + "learning_rate": 4.211764813527411e-06, + "loss": 0.9588, + "step": 1749 + }, + { + "epoch": 1.6294227188081938, + "grad_norm": 1.6231273412704468, + "learning_rate": 4.210862312597884e-06, + "loss": 1.032, + "step": 1750 + }, + { + "epoch": 1.6303538175046555, + "grad_norm": 1.5773149728775024, + "learning_rate": 4.209959392113796e-06, + "loss": 1.0025, + "step": 1751 + }, + { + "epoch": 1.6312849162011172, + "grad_norm": 1.5779573917388916, + "learning_rate": 4.209056052296569e-06, + "loss": 1.007, + "step": 1752 + }, + { + "epoch": 1.6322160148975793, + "grad_norm": 1.6042360067367554, + "learning_rate": 4.20815229336773e-06, + "loss": 1.0212, + "step": 1753 + }, + { + "epoch": 1.633147113594041, + "grad_norm": 1.5667567253112793, + "learning_rate": 4.207248115548906e-06, + "loss": 0.9833, + "step": 1754 + }, + { + "epoch": 1.6340782122905027, + "grad_norm": 1.601111650466919, + "learning_rate": 4.20634351906183e-06, + "loss": 1.0281, + "step": 1755 + }, + { + "epoch": 1.6350093109869648, + "grad_norm": 1.6138057708740234, + "learning_rate": 4.205438504128335e-06, + "loss": 1.02, + "step": 1756 + }, + { + "epoch": 1.6359404096834265, + "grad_norm": 1.5983747243881226, + "learning_rate": 4.204533070970358e-06, + "loss": 0.9883, + "step": 1757 + }, + { + "epoch": 1.6368715083798882, + "grad_norm": 1.6062757968902588, + "learning_rate": 4.2036272198099384e-06, + "loss": 1.0299, + "step": 1758 + }, + { + "epoch": 1.63780260707635, + "grad_norm": 1.5497345924377441, + "learning_rate": 4.202720950869218e-06, + "loss": 1.0102, + "step": 1759 + }, + { + "epoch": 1.638733705772812, + "grad_norm": 1.6185259819030762, + "learning_rate": 4.201814264370441e-06, + "loss": 1.0081, + "step": 1760 + }, + { + "epoch": 1.6396648044692737, + "grad_norm": 1.5611456632614136, + "learning_rate": 4.200907160535954e-06, + "loss": 1.0048, + "step": 1761 + }, + { + "epoch": 1.6405959031657356, + "grad_norm": 1.5442298650741577, + "learning_rate": 4.199999639588206e-06, + "loss": 0.9999, + "step": 1762 + }, + { + "epoch": 1.6415270018621975, + "grad_norm": 1.5870798826217651, + "learning_rate": 4.199091701749748e-06, + "loss": 0.9889, + "step": 1763 + }, + { + "epoch": 1.6424581005586592, + "grad_norm": 1.5467183589935303, + "learning_rate": 4.198183347243233e-06, + "loss": 1.0103, + "step": 1764 + }, + { + "epoch": 1.643389199255121, + "grad_norm": 1.5057183504104614, + "learning_rate": 4.197274576291418e-06, + "loss": 0.9773, + "step": 1765 + }, + { + "epoch": 1.644320297951583, + "grad_norm": 1.5750811100006104, + "learning_rate": 4.196365389117161e-06, + "loss": 0.9911, + "step": 1766 + }, + { + "epoch": 1.6452513966480447, + "grad_norm": 1.6021958589553833, + "learning_rate": 4.19545578594342e-06, + "loss": 0.9964, + "step": 1767 + }, + { + "epoch": 1.6461824953445066, + "grad_norm": 1.5625004768371582, + "learning_rate": 4.1945457669932575e-06, + "loss": 1.0119, + "step": 1768 + }, + { + "epoch": 1.6471135940409685, + "grad_norm": 1.5571106672286987, + "learning_rate": 4.193635332489839e-06, + "loss": 1.0412, + "step": 1769 + }, + { + "epoch": 1.6480446927374302, + "grad_norm": 1.5201088190078735, + "learning_rate": 4.192724482656429e-06, + "loss": 1.011, + "step": 1770 + }, + { + "epoch": 1.6489757914338918, + "grad_norm": 1.5553494691848755, + "learning_rate": 4.191813217716394e-06, + "loss": 1.0398, + "step": 1771 + }, + { + "epoch": 1.649906890130354, + "grad_norm": 1.5127707719802856, + "learning_rate": 4.190901537893205e-06, + "loss": 0.9685, + "step": 1772 + }, + { + "epoch": 1.6508379888268156, + "grad_norm": 1.5730139017105103, + "learning_rate": 4.189989443410432e-06, + "loss": 1.0446, + "step": 1773 + }, + { + "epoch": 1.6517690875232773, + "grad_norm": 1.5341529846191406, + "learning_rate": 4.189076934491749e-06, + "loss": 1.0077, + "step": 1774 + }, + { + "epoch": 1.6527001862197392, + "grad_norm": 1.6464959383010864, + "learning_rate": 4.18816401136093e-06, + "loss": 1.0555, + "step": 1775 + }, + { + "epoch": 1.6536312849162011, + "grad_norm": 1.5322630405426025, + "learning_rate": 4.187250674241851e-06, + "loss": 0.9992, + "step": 1776 + }, + { + "epoch": 1.6545623836126628, + "grad_norm": 1.5465375185012817, + "learning_rate": 4.186336923358488e-06, + "loss": 0.9907, + "step": 1777 + }, + { + "epoch": 1.6554934823091247, + "grad_norm": 1.518801212310791, + "learning_rate": 4.185422758934923e-06, + "loss": 1.0328, + "step": 1778 + }, + { + "epoch": 1.6564245810055866, + "grad_norm": 1.5929596424102783, + "learning_rate": 4.1845081811953345e-06, + "loss": 1.0329, + "step": 1779 + }, + { + "epoch": 1.6573556797020483, + "grad_norm": 1.5499824285507202, + "learning_rate": 4.183593190364005e-06, + "loss": 0.9968, + "step": 1780 + }, + { + "epoch": 1.6582867783985102, + "grad_norm": 1.517143726348877, + "learning_rate": 4.182677786665317e-06, + "loss": 0.9946, + "step": 1781 + }, + { + "epoch": 1.6592178770949721, + "grad_norm": 1.5566442012786865, + "learning_rate": 4.181761970323756e-06, + "loss": 0.9972, + "step": 1782 + }, + { + "epoch": 1.6601489757914338, + "grad_norm": 1.5849595069885254, + "learning_rate": 4.180845741563905e-06, + "loss": 1.0058, + "step": 1783 + }, + { + "epoch": 1.6610800744878957, + "grad_norm": 1.5653990507125854, + "learning_rate": 4.179929100610454e-06, + "loss": 1.0369, + "step": 1784 + }, + { + "epoch": 1.6620111731843576, + "grad_norm": 1.56673264503479, + "learning_rate": 4.1790120476881875e-06, + "loss": 1.0497, + "step": 1785 + }, + { + "epoch": 1.6629422718808193, + "grad_norm": 1.6064715385437012, + "learning_rate": 4.178094583021997e-06, + "loss": 1.0392, + "step": 1786 + }, + { + "epoch": 1.6638733705772812, + "grad_norm": 1.5575141906738281, + "learning_rate": 4.177176706836871e-06, + "loss": 1.0179, + "step": 1787 + }, + { + "epoch": 1.6648044692737431, + "grad_norm": 1.5933163166046143, + "learning_rate": 4.1762584193578996e-06, + "loss": 1.0264, + "step": 1788 + }, + { + "epoch": 1.6657355679702048, + "grad_norm": 1.533276081085205, + "learning_rate": 4.175339720810276e-06, + "loss": 1.0351, + "step": 1789 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 1.5583761930465698, + "learning_rate": 4.1744206114192895e-06, + "loss": 1.0436, + "step": 1790 + }, + { + "epoch": 1.6675977653631286, + "grad_norm": 1.5703086853027344, + "learning_rate": 4.173501091410338e-06, + "loss": 0.9978, + "step": 1791 + }, + { + "epoch": 1.6685288640595903, + "grad_norm": 1.5729907751083374, + "learning_rate": 4.172581161008911e-06, + "loss": 1.0299, + "step": 1792 + }, + { + "epoch": 1.669459962756052, + "grad_norm": 1.5342987775802612, + "learning_rate": 4.171660820440605e-06, + "loss": 1.0342, + "step": 1793 + }, + { + "epoch": 1.670391061452514, + "grad_norm": 1.5894691944122314, + "learning_rate": 4.170740069931114e-06, + "loss": 1.0314, + "step": 1794 + }, + { + "epoch": 1.6713221601489758, + "grad_norm": 1.6253712177276611, + "learning_rate": 4.169818909706234e-06, + "loss": 1.032, + "step": 1795 + }, + { + "epoch": 1.6722532588454375, + "grad_norm": 1.6311748027801514, + "learning_rate": 4.168897339991862e-06, + "loss": 1.008, + "step": 1796 + }, + { + "epoch": 1.6731843575418994, + "grad_norm": 1.6183005571365356, + "learning_rate": 4.167975361013992e-06, + "loss": 1.0035, + "step": 1797 + }, + { + "epoch": 1.6741154562383613, + "grad_norm": 1.533651351928711, + "learning_rate": 4.167052972998723e-06, + "loss": 1.0347, + "step": 1798 + }, + { + "epoch": 1.675046554934823, + "grad_norm": 1.4968410730361938, + "learning_rate": 4.166130176172251e-06, + "loss": 1.0056, + "step": 1799 + }, + { + "epoch": 1.675977653631285, + "grad_norm": 1.605269193649292, + "learning_rate": 4.165206970760874e-06, + "loss": 1.0483, + "step": 1800 + }, + { + "epoch": 1.6769087523277468, + "grad_norm": 1.6567280292510986, + "learning_rate": 4.164283356990989e-06, + "loss": 1.0096, + "step": 1801 + }, + { + "epoch": 1.6778398510242085, + "grad_norm": 1.5661386251449585, + "learning_rate": 4.1633593350890945e-06, + "loss": 1.0253, + "step": 1802 + }, + { + "epoch": 1.6787709497206704, + "grad_norm": 1.632533073425293, + "learning_rate": 4.162434905281787e-06, + "loss": 0.9913, + "step": 1803 + }, + { + "epoch": 1.6797020484171323, + "grad_norm": 1.564606785774231, + "learning_rate": 4.1615100677957655e-06, + "loss": 0.9877, + "step": 1804 + }, + { + "epoch": 1.680633147113594, + "grad_norm": 1.518401861190796, + "learning_rate": 4.160584822857827e-06, + "loss": 0.9998, + "step": 1805 + }, + { + "epoch": 1.6815642458100557, + "grad_norm": 1.6043238639831543, + "learning_rate": 4.1596591706948695e-06, + "loss": 1.031, + "step": 1806 + }, + { + "epoch": 1.6824953445065178, + "grad_norm": 1.655834674835205, + "learning_rate": 4.158733111533892e-06, + "loss": 1.0401, + "step": 1807 + }, + { + "epoch": 1.6834264432029795, + "grad_norm": 1.5445494651794434, + "learning_rate": 4.1578066456019885e-06, + "loss": 1.0024, + "step": 1808 + }, + { + "epoch": 1.6843575418994412, + "grad_norm": 1.5908763408660889, + "learning_rate": 4.156879773126359e-06, + "loss": 1.016, + "step": 1809 + }, + { + "epoch": 1.6852886405959033, + "grad_norm": 1.5460089445114136, + "learning_rate": 4.155952494334299e-06, + "loss": 1.0061, + "step": 1810 + }, + { + "epoch": 1.686219739292365, + "grad_norm": 1.6270462274551392, + "learning_rate": 4.1550248094532055e-06, + "loss": 1.0111, + "step": 1811 + }, + { + "epoch": 1.6871508379888267, + "grad_norm": 1.6202434301376343, + "learning_rate": 4.154096718710575e-06, + "loss": 1.0104, + "step": 1812 + }, + { + "epoch": 1.6880819366852886, + "grad_norm": 1.6059703826904297, + "learning_rate": 4.153168222334002e-06, + "loss": 1.0274, + "step": 1813 + }, + { + "epoch": 1.6890130353817505, + "grad_norm": 1.5943083763122559, + "learning_rate": 4.152239320551182e-06, + "loss": 1.0276, + "step": 1814 + }, + { + "epoch": 1.6899441340782122, + "grad_norm": 1.5937108993530273, + "learning_rate": 4.151310013589911e-06, + "loss": 0.9987, + "step": 1815 + }, + { + "epoch": 1.690875232774674, + "grad_norm": 1.5884298086166382, + "learning_rate": 4.15038030167808e-06, + "loss": 1.0126, + "step": 1816 + }, + { + "epoch": 1.691806331471136, + "grad_norm": 1.5987244844436646, + "learning_rate": 4.149450185043684e-06, + "loss": 1.0124, + "step": 1817 + }, + { + "epoch": 1.6927374301675977, + "grad_norm": 1.5544172525405884, + "learning_rate": 4.148519663914814e-06, + "loss": 1.0055, + "step": 1818 + }, + { + "epoch": 1.6936685288640596, + "grad_norm": 1.6529874801635742, + "learning_rate": 4.1475887385196635e-06, + "loss": 0.9751, + "step": 1819 + }, + { + "epoch": 1.6945996275605215, + "grad_norm": 1.5711724758148193, + "learning_rate": 4.146657409086522e-06, + "loss": 1.0334, + "step": 1820 + }, + { + "epoch": 1.6955307262569832, + "grad_norm": 1.590208888053894, + "learning_rate": 4.1457256758437795e-06, + "loss": 0.9884, + "step": 1821 + }, + { + "epoch": 1.696461824953445, + "grad_norm": 1.5541894435882568, + "learning_rate": 4.144793539019926e-06, + "loss": 1.0326, + "step": 1822 + }, + { + "epoch": 1.697392923649907, + "grad_norm": 1.5732228755950928, + "learning_rate": 4.143860998843546e-06, + "loss": 0.9827, + "step": 1823 + }, + { + "epoch": 1.6983240223463687, + "grad_norm": 1.5404633283615112, + "learning_rate": 4.1429280555433305e-06, + "loss": 0.992, + "step": 1824 + }, + { + "epoch": 1.6992551210428304, + "grad_norm": 1.616262674331665, + "learning_rate": 4.141994709348062e-06, + "loss": 1.0318, + "step": 1825 + }, + { + "epoch": 1.7001862197392925, + "grad_norm": 1.5422700643539429, + "learning_rate": 4.141060960486626e-06, + "loss": 1.0038, + "step": 1826 + }, + { + "epoch": 1.7011173184357542, + "grad_norm": 1.6094343662261963, + "learning_rate": 4.1401268091880054e-06, + "loss": 1.0101, + "step": 1827 + }, + { + "epoch": 1.7020484171322159, + "grad_norm": 1.5428720712661743, + "learning_rate": 4.139192255681281e-06, + "loss": 1.0174, + "step": 1828 + }, + { + "epoch": 1.7029795158286778, + "grad_norm": 1.570723056793213, + "learning_rate": 4.138257300195636e-06, + "loss": 1.0162, + "step": 1829 + }, + { + "epoch": 1.7039106145251397, + "grad_norm": 1.4893836975097656, + "learning_rate": 4.137321942960348e-06, + "loss": 1.0002, + "step": 1830 + }, + { + "epoch": 1.7048417132216014, + "grad_norm": 1.5964380502700806, + "learning_rate": 4.136386184204793e-06, + "loss": 1.0456, + "step": 1831 + }, + { + "epoch": 1.7057728119180633, + "grad_norm": 1.5750490427017212, + "learning_rate": 4.135450024158448e-06, + "loss": 1.0276, + "step": 1832 + }, + { + "epoch": 1.7067039106145252, + "grad_norm": 1.5630074739456177, + "learning_rate": 4.134513463050889e-06, + "loss": 0.9788, + "step": 1833 + }, + { + "epoch": 1.7076350093109869, + "grad_norm": 1.5705476999282837, + "learning_rate": 4.133576501111787e-06, + "loss": 1.0317, + "step": 1834 + }, + { + "epoch": 1.7085661080074488, + "grad_norm": 1.5644968748092651, + "learning_rate": 4.132639138570913e-06, + "loss": 1.0207, + "step": 1835 + }, + { + "epoch": 1.7094972067039107, + "grad_norm": 1.5199353694915771, + "learning_rate": 4.131701375658138e-06, + "loss": 0.9853, + "step": 1836 + }, + { + "epoch": 1.7104283054003724, + "grad_norm": 1.5593675374984741, + "learning_rate": 4.130763212603428e-06, + "loss": 0.9939, + "step": 1837 + }, + { + "epoch": 1.7113594040968343, + "grad_norm": 1.580325961112976, + "learning_rate": 4.129824649636849e-06, + "loss": 1.0683, + "step": 1838 + }, + { + "epoch": 1.7122905027932962, + "grad_norm": 1.5195567607879639, + "learning_rate": 4.128885686988564e-06, + "loss": 1.0409, + "step": 1839 + }, + { + "epoch": 1.7132216014897579, + "grad_norm": 1.5187969207763672, + "learning_rate": 4.127946324888836e-06, + "loss": 1.0059, + "step": 1840 + }, + { + "epoch": 1.7141527001862198, + "grad_norm": 1.5323264598846436, + "learning_rate": 4.127006563568024e-06, + "loss": 0.993, + "step": 1841 + }, + { + "epoch": 1.7150837988826817, + "grad_norm": 1.602422833442688, + "learning_rate": 4.126066403256585e-06, + "loss": 1.0041, + "step": 1842 + }, + { + "epoch": 1.7160148975791434, + "grad_norm": 1.577170968055725, + "learning_rate": 4.125125844185076e-06, + "loss": 1.0102, + "step": 1843 + }, + { + "epoch": 1.716945996275605, + "grad_norm": 1.5555764436721802, + "learning_rate": 4.1241848865841485e-06, + "loss": 1.0169, + "step": 1844 + }, + { + "epoch": 1.7178770949720672, + "grad_norm": 1.5609252452850342, + "learning_rate": 4.123243530684554e-06, + "loss": 0.9838, + "step": 1845 + }, + { + "epoch": 1.7188081936685289, + "grad_norm": 1.5773577690124512, + "learning_rate": 4.122301776717141e-06, + "loss": 1.0646, + "step": 1846 + }, + { + "epoch": 1.7197392923649906, + "grad_norm": 1.5558104515075684, + "learning_rate": 4.121359624912856e-06, + "loss": 1.0225, + "step": 1847 + }, + { + "epoch": 1.7206703910614525, + "grad_norm": 1.6140307188034058, + "learning_rate": 4.120417075502743e-06, + "loss": 1.0302, + "step": 1848 + }, + { + "epoch": 1.7216014897579144, + "grad_norm": 1.6136341094970703, + "learning_rate": 4.119474128717943e-06, + "loss": 1.0444, + "step": 1849 + }, + { + "epoch": 1.722532588454376, + "grad_norm": 1.5411615371704102, + "learning_rate": 4.118530784789694e-06, + "loss": 0.9695, + "step": 1850 + }, + { + "epoch": 1.723463687150838, + "grad_norm": 1.5063066482543945, + "learning_rate": 4.117587043949334e-06, + "loss": 1.0043, + "step": 1851 + }, + { + "epoch": 1.7243947858472999, + "grad_norm": 1.5670325756072998, + "learning_rate": 4.116642906428294e-06, + "loss": 1.0175, + "step": 1852 + }, + { + "epoch": 1.7253258845437616, + "grad_norm": 1.5807342529296875, + "learning_rate": 4.115698372458107e-06, + "loss": 1.0311, + "step": 1853 + }, + { + "epoch": 1.7262569832402235, + "grad_norm": 1.539474606513977, + "learning_rate": 4.114753442270399e-06, + "loss": 0.9885, + "step": 1854 + }, + { + "epoch": 1.7271880819366854, + "grad_norm": 1.572718620300293, + "learning_rate": 4.113808116096897e-06, + "loss": 1.0285, + "step": 1855 + }, + { + "epoch": 1.728119180633147, + "grad_norm": 1.5126920938491821, + "learning_rate": 4.112862394169422e-06, + "loss": 1.0006, + "step": 1856 + }, + { + "epoch": 1.729050279329609, + "grad_norm": 1.5438448190689087, + "learning_rate": 4.111916276719892e-06, + "loss": 1.0345, + "step": 1857 + }, + { + "epoch": 1.7299813780260709, + "grad_norm": 1.55963933467865, + "learning_rate": 4.110969763980326e-06, + "loss": 1.0144, + "step": 1858 + }, + { + "epoch": 1.7309124767225326, + "grad_norm": 1.5334101915359497, + "learning_rate": 4.110022856182836e-06, + "loss": 1.0181, + "step": 1859 + }, + { + "epoch": 1.7318435754189943, + "grad_norm": 1.5000497102737427, + "learning_rate": 4.109075553559633e-06, + "loss": 0.9822, + "step": 1860 + }, + { + "epoch": 1.7327746741154564, + "grad_norm": 1.6354038715362549, + "learning_rate": 4.108127856343022e-06, + "loss": 1.0128, + "step": 1861 + }, + { + "epoch": 1.733705772811918, + "grad_norm": 1.625012993812561, + "learning_rate": 4.107179764765408e-06, + "loss": 0.9666, + "step": 1862 + }, + { + "epoch": 1.7346368715083798, + "grad_norm": 1.60771906375885, + "learning_rate": 4.106231279059291e-06, + "loss": 1.0731, + "step": 1863 + }, + { + "epoch": 1.7355679702048417, + "grad_norm": 1.5370794534683228, + "learning_rate": 4.105282399457268e-06, + "loss": 0.9957, + "step": 1864 + }, + { + "epoch": 1.7364990689013036, + "grad_norm": 1.6249788999557495, + "learning_rate": 4.1043331261920325e-06, + "loss": 1.0013, + "step": 1865 + }, + { + "epoch": 1.7374301675977653, + "grad_norm": 1.5934343338012695, + "learning_rate": 4.103383459496376e-06, + "loss": 0.9848, + "step": 1866 + }, + { + "epoch": 1.7383612662942272, + "grad_norm": 1.6212162971496582, + "learning_rate": 4.102433399603183e-06, + "loss": 0.9908, + "step": 1867 + }, + { + "epoch": 1.739292364990689, + "grad_norm": 1.5215398073196411, + "learning_rate": 4.101482946745438e-06, + "loss": 0.9839, + "step": 1868 + }, + { + "epoch": 1.7402234636871508, + "grad_norm": 1.6186209917068481, + "learning_rate": 4.10053210115622e-06, + "loss": 1.0727, + "step": 1869 + }, + { + "epoch": 1.7411545623836127, + "grad_norm": 1.563692569732666, + "learning_rate": 4.099580863068706e-06, + "loss": 0.9986, + "step": 1870 + }, + { + "epoch": 1.7420856610800746, + "grad_norm": 1.573208212852478, + "learning_rate": 4.098629232716166e-06, + "loss": 1.0244, + "step": 1871 + }, + { + "epoch": 1.7430167597765363, + "grad_norm": 1.5983200073242188, + "learning_rate": 4.097677210331968e-06, + "loss": 1.0464, + "step": 1872 + }, + { + "epoch": 1.7439478584729982, + "grad_norm": 1.5691585540771484, + "learning_rate": 4.096724796149578e-06, + "loss": 0.9984, + "step": 1873 + }, + { + "epoch": 1.74487895716946, + "grad_norm": 1.5949965715408325, + "learning_rate": 4.095771990402556e-06, + "loss": 1.0507, + "step": 1874 + }, + { + "epoch": 1.7458100558659218, + "grad_norm": 1.5889729261398315, + "learning_rate": 4.0948187933245585e-06, + "loss": 1.0005, + "step": 1875 + }, + { + "epoch": 1.7467411545623837, + "grad_norm": 1.5813437700271606, + "learning_rate": 4.093865205149338e-06, + "loss": 1.0553, + "step": 1876 + }, + { + "epoch": 1.7476722532588456, + "grad_norm": 1.5170701742172241, + "learning_rate": 4.092911226110742e-06, + "loss": 1.0448, + "step": 1877 + }, + { + "epoch": 1.7486033519553073, + "grad_norm": 1.5029665231704712, + "learning_rate": 4.091956856442715e-06, + "loss": 0.9802, + "step": 1878 + }, + { + "epoch": 1.749534450651769, + "grad_norm": 1.572916030883789, + "learning_rate": 4.0910020963792966e-06, + "loss": 1.0368, + "step": 1879 + }, + { + "epoch": 1.750465549348231, + "grad_norm": 1.5514036417007446, + "learning_rate": 4.090046946154624e-06, + "loss": 1.037, + "step": 1880 + }, + { + "epoch": 1.7513966480446927, + "grad_norm": 1.5397133827209473, + "learning_rate": 4.089091406002926e-06, + "loss": 1.0147, + "step": 1881 + }, + { + "epoch": 1.7523277467411544, + "grad_norm": 1.5901501178741455, + "learning_rate": 4.088135476158533e-06, + "loss": 1.015, + "step": 1882 + }, + { + "epoch": 1.7532588454376163, + "grad_norm": 1.468786597251892, + "learning_rate": 4.087179156855865e-06, + "loss": 0.9723, + "step": 1883 + }, + { + "epoch": 1.7541899441340782, + "grad_norm": 1.7198137044906616, + "learning_rate": 4.086222448329441e-06, + "loss": 0.9827, + "step": 1884 + }, + { + "epoch": 1.75512104283054, + "grad_norm": 1.5916560888290405, + "learning_rate": 4.085265350813873e-06, + "loss": 1.0184, + "step": 1885 + }, + { + "epoch": 1.7560521415270018, + "grad_norm": 1.5931979417800903, + "learning_rate": 4.084307864543873e-06, + "loss": 0.9839, + "step": 1886 + }, + { + "epoch": 1.7569832402234637, + "grad_norm": 1.5414892435073853, + "learning_rate": 4.0833499897542425e-06, + "loss": 1.002, + "step": 1887 + }, + { + "epoch": 1.7579143389199254, + "grad_norm": 1.5468543767929077, + "learning_rate": 4.082391726679882e-06, + "loss": 0.9616, + "step": 1888 + }, + { + "epoch": 1.7588454376163873, + "grad_norm": 1.537461757659912, + "learning_rate": 4.081433075555786e-06, + "loss": 1.0143, + "step": 1889 + }, + { + "epoch": 1.7597765363128492, + "grad_norm": 1.552990436553955, + "learning_rate": 4.080474036617045e-06, + "loss": 1.0131, + "step": 1890 + }, + { + "epoch": 1.760707635009311, + "grad_norm": 1.620948314666748, + "learning_rate": 4.079514610098844e-06, + "loss": 1.013, + "step": 1891 + }, + { + "epoch": 1.7616387337057728, + "grad_norm": 1.562349557876587, + "learning_rate": 4.078554796236462e-06, + "loss": 0.9969, + "step": 1892 + }, + { + "epoch": 1.7625698324022347, + "grad_norm": 1.5185693502426147, + "learning_rate": 4.077594595265275e-06, + "loss": 0.9843, + "step": 1893 + }, + { + "epoch": 1.7635009310986964, + "grad_norm": 1.6121083498001099, + "learning_rate": 4.076634007420754e-06, + "loss": 1.0312, + "step": 1894 + }, + { + "epoch": 1.7644320297951583, + "grad_norm": 1.5583982467651367, + "learning_rate": 4.07567303293846e-06, + "loss": 1.0156, + "step": 1895 + }, + { + "epoch": 1.7653631284916202, + "grad_norm": 1.58591890335083, + "learning_rate": 4.074711672054057e-06, + "loss": 0.9978, + "step": 1896 + }, + { + "epoch": 1.766294227188082, + "grad_norm": 1.5953869819641113, + "learning_rate": 4.073749925003297e-06, + "loss": 1.0297, + "step": 1897 + }, + { + "epoch": 1.7672253258845436, + "grad_norm": 1.6581158638000488, + "learning_rate": 4.07278779202203e-06, + "loss": 0.9896, + "step": 1898 + }, + { + "epoch": 1.7681564245810057, + "grad_norm": 1.5670716762542725, + "learning_rate": 4.0718252733461995e-06, + "loss": 1.0352, + "step": 1899 + }, + { + "epoch": 1.7690875232774674, + "grad_norm": 1.5343213081359863, + "learning_rate": 4.070862369211843e-06, + "loss": 0.9853, + "step": 1900 + }, + { + "epoch": 1.7700186219739291, + "grad_norm": 1.519212007522583, + "learning_rate": 4.069899079855095e-06, + "loss": 1.012, + "step": 1901 + }, + { + "epoch": 1.770949720670391, + "grad_norm": 1.54884934425354, + "learning_rate": 4.068935405512182e-06, + "loss": 1.006, + "step": 1902 + }, + { + "epoch": 1.771880819366853, + "grad_norm": 1.5804592370986938, + "learning_rate": 4.067971346419425e-06, + "loss": 1.0343, + "step": 1903 + }, + { + "epoch": 1.7728119180633146, + "grad_norm": 1.6775153875350952, + "learning_rate": 4.0670069028132414e-06, + "loss": 1.0048, + "step": 1904 + }, + { + "epoch": 1.7737430167597765, + "grad_norm": 1.5643740892410278, + "learning_rate": 4.066042074930141e-06, + "loss": 0.9855, + "step": 1905 + }, + { + "epoch": 1.7746741154562384, + "grad_norm": 1.5684539079666138, + "learning_rate": 4.065076863006729e-06, + "loss": 1.0003, + "step": 1906 + }, + { + "epoch": 1.7756052141527001, + "grad_norm": 1.6218197345733643, + "learning_rate": 4.064111267279703e-06, + "loss": 1.0386, + "step": 1907 + }, + { + "epoch": 1.776536312849162, + "grad_norm": 1.6494494676589966, + "learning_rate": 4.063145287985857e-06, + "loss": 1.0209, + "step": 1908 + }, + { + "epoch": 1.777467411545624, + "grad_norm": 1.5487157106399536, + "learning_rate": 4.062178925362077e-06, + "loss": 1.011, + "step": 1909 + }, + { + "epoch": 1.7783985102420856, + "grad_norm": 1.53665030002594, + "learning_rate": 4.0612121796453455e-06, + "loss": 1.0126, + "step": 1910 + }, + { + "epoch": 1.7793296089385475, + "grad_norm": 1.5955960750579834, + "learning_rate": 4.060245051072736e-06, + "loss": 1.0329, + "step": 1911 + }, + { + "epoch": 1.7802607076350094, + "grad_norm": 1.5115478038787842, + "learning_rate": 4.059277539881418e-06, + "loss": 0.9835, + "step": 1912 + }, + { + "epoch": 1.7811918063314711, + "grad_norm": 1.5240764617919922, + "learning_rate": 4.058309646308654e-06, + "loss": 1.0095, + "step": 1913 + }, + { + "epoch": 1.7821229050279328, + "grad_norm": 1.6130045652389526, + "learning_rate": 4.0573413705918e-06, + "loss": 0.9779, + "step": 1914 + }, + { + "epoch": 1.783054003724395, + "grad_norm": 1.540324330329895, + "learning_rate": 4.056372712968308e-06, + "loss": 1.0292, + "step": 1915 + }, + { + "epoch": 1.7839851024208566, + "grad_norm": 1.581092119216919, + "learning_rate": 4.055403673675718e-06, + "loss": 0.998, + "step": 1916 + }, + { + "epoch": 1.7849162011173183, + "grad_norm": 1.6607609987258911, + "learning_rate": 4.054434252951671e-06, + "loss": 1.0253, + "step": 1917 + }, + { + "epoch": 1.7858472998137802, + "grad_norm": 1.5878102779388428, + "learning_rate": 4.0534644510338976e-06, + "loss": 1.0132, + "step": 1918 + }, + { + "epoch": 1.7867783985102421, + "grad_norm": 1.5497157573699951, + "learning_rate": 4.052494268160219e-06, + "loss": 0.9781, + "step": 1919 + }, + { + "epoch": 1.7877094972067038, + "grad_norm": 1.6174079179763794, + "learning_rate": 4.051523704568557e-06, + "loss": 0.9921, + "step": 1920 + }, + { + "epoch": 1.7886405959031657, + "grad_norm": 1.569042682647705, + "learning_rate": 4.050552760496921e-06, + "loss": 1.0481, + "step": 1921 + }, + { + "epoch": 1.7895716945996276, + "grad_norm": 1.5762373208999634, + "learning_rate": 4.049581436183416e-06, + "loss": 0.9592, + "step": 1922 + }, + { + "epoch": 1.7905027932960893, + "grad_norm": 1.5199081897735596, + "learning_rate": 4.048609731866239e-06, + "loss": 1.0148, + "step": 1923 + }, + { + "epoch": 1.7914338919925512, + "grad_norm": 1.5941567420959473, + "learning_rate": 4.047637647783681e-06, + "loss": 1.0775, + "step": 1924 + }, + { + "epoch": 1.7923649906890131, + "grad_norm": 1.5598386526107788, + "learning_rate": 4.046665184174126e-06, + "loss": 1.0571, + "step": 1925 + }, + { + "epoch": 1.7932960893854748, + "grad_norm": 1.5563305616378784, + "learning_rate": 4.0456923412760516e-06, + "loss": 1.0311, + "step": 1926 + }, + { + "epoch": 1.7942271880819367, + "grad_norm": 1.5213959217071533, + "learning_rate": 4.044719119328029e-06, + "loss": 0.9834, + "step": 1927 + }, + { + "epoch": 1.7951582867783986, + "grad_norm": 1.5801384449005127, + "learning_rate": 4.043745518568719e-06, + "loss": 1.0473, + "step": 1928 + }, + { + "epoch": 1.7960893854748603, + "grad_norm": 1.5230423212051392, + "learning_rate": 4.042771539236879e-06, + "loss": 1.0417, + "step": 1929 + }, + { + "epoch": 1.7970204841713222, + "grad_norm": 1.491419792175293, + "learning_rate": 4.041797181571358e-06, + "loss": 1.0325, + "step": 1930 + }, + { + "epoch": 1.7979515828677841, + "grad_norm": 1.5568187236785889, + "learning_rate": 4.040822445811097e-06, + "loss": 1.016, + "step": 1931 + }, + { + "epoch": 1.7988826815642458, + "grad_norm": 1.5121091604232788, + "learning_rate": 4.03984733219513e-06, + "loss": 0.9922, + "step": 1932 + }, + { + "epoch": 1.7998137802607075, + "grad_norm": 1.5466973781585693, + "learning_rate": 4.038871840962585e-06, + "loss": 1.0187, + "step": 1933 + }, + { + "epoch": 1.8007448789571696, + "grad_norm": 1.5360347032546997, + "learning_rate": 4.037895972352681e-06, + "loss": 1.0495, + "step": 1934 + }, + { + "epoch": 1.8016759776536313, + "grad_norm": 1.5188478231430054, + "learning_rate": 4.036919726604731e-06, + "loss": 0.9754, + "step": 1935 + }, + { + "epoch": 1.802607076350093, + "grad_norm": 1.5918091535568237, + "learning_rate": 4.035943103958138e-06, + "loss": 1.0008, + "step": 1936 + }, + { + "epoch": 1.803538175046555, + "grad_norm": 1.551121473312378, + "learning_rate": 4.0349661046524e-06, + "loss": 1.0466, + "step": 1937 + }, + { + "epoch": 1.8044692737430168, + "grad_norm": 1.613853096961975, + "learning_rate": 4.033988728927108e-06, + "loss": 1.0346, + "step": 1938 + }, + { + "epoch": 1.8054003724394785, + "grad_norm": 1.580567479133606, + "learning_rate": 4.03301097702194e-06, + "loss": 1.0191, + "step": 1939 + }, + { + "epoch": 1.8063314711359404, + "grad_norm": 1.4997655153274536, + "learning_rate": 4.032032849176672e-06, + "loss": 1.013, + "step": 1940 + }, + { + "epoch": 1.8072625698324023, + "grad_norm": 1.550525426864624, + "learning_rate": 4.031054345631172e-06, + "loss": 1.0317, + "step": 1941 + }, + { + "epoch": 1.808193668528864, + "grad_norm": 1.586842656135559, + "learning_rate": 4.030075466625395e-06, + "loss": 1.0114, + "step": 1942 + }, + { + "epoch": 1.809124767225326, + "grad_norm": 1.5969867706298828, + "learning_rate": 4.029096212399394e-06, + "loss": 1.0094, + "step": 1943 + }, + { + "epoch": 1.8100558659217878, + "grad_norm": 1.5217729806900024, + "learning_rate": 4.02811658319331e-06, + "loss": 0.9947, + "step": 1944 + }, + { + "epoch": 1.8109869646182495, + "grad_norm": 1.5831209421157837, + "learning_rate": 4.0271365792473774e-06, + "loss": 0.9888, + "step": 1945 + }, + { + "epoch": 1.8119180633147114, + "grad_norm": 1.5278444290161133, + "learning_rate": 4.026156200801924e-06, + "loss": 1.0594, + "step": 1946 + }, + { + "epoch": 1.8128491620111733, + "grad_norm": 1.5892688035964966, + "learning_rate": 4.025175448097365e-06, + "loss": 1.0214, + "step": 1947 + }, + { + "epoch": 1.813780260707635, + "grad_norm": 1.658461332321167, + "learning_rate": 4.024194321374213e-06, + "loss": 1.0635, + "step": 1948 + }, + { + "epoch": 1.8147113594040967, + "grad_norm": 1.5656095743179321, + "learning_rate": 4.023212820873068e-06, + "loss": 1.0001, + "step": 1949 + }, + { + "epoch": 1.8156424581005588, + "grad_norm": 1.538415789604187, + "learning_rate": 4.022230946834624e-06, + "loss": 0.973, + "step": 1950 + }, + { + "epoch": 1.8165735567970205, + "grad_norm": 1.6055189371109009, + "learning_rate": 4.021248699499666e-06, + "loss": 0.9945, + "step": 1951 + }, + { + "epoch": 1.8175046554934822, + "grad_norm": 1.5532666444778442, + "learning_rate": 4.02026607910907e-06, + "loss": 1.0098, + "step": 1952 + }, + { + "epoch": 1.8184357541899443, + "grad_norm": 1.6120880842208862, + "learning_rate": 4.019283085903803e-06, + "loss": 1.0449, + "step": 1953 + }, + { + "epoch": 1.819366852886406, + "grad_norm": 1.5207961797714233, + "learning_rate": 4.0182997201249255e-06, + "loss": 0.9536, + "step": 1954 + }, + { + "epoch": 1.8202979515828677, + "grad_norm": 1.5699912309646606, + "learning_rate": 4.017315982013588e-06, + "loss": 1.0186, + "step": 1955 + }, + { + "epoch": 1.8212290502793296, + "grad_norm": 1.5570015907287598, + "learning_rate": 4.0163318718110324e-06, + "loss": 1.0087, + "step": 1956 + }, + { + "epoch": 1.8221601489757915, + "grad_norm": 1.597419261932373, + "learning_rate": 4.015347389758592e-06, + "loss": 0.9972, + "step": 1957 + }, + { + "epoch": 1.8230912476722532, + "grad_norm": 1.5418192148208618, + "learning_rate": 4.014362536097691e-06, + "loss": 1.0293, + "step": 1958 + }, + { + "epoch": 1.824022346368715, + "grad_norm": 1.5408064126968384, + "learning_rate": 4.0133773110698454e-06, + "loss": 0.9764, + "step": 1959 + }, + { + "epoch": 1.824953445065177, + "grad_norm": 1.5946096181869507, + "learning_rate": 4.012391714916662e-06, + "loss": 1.009, + "step": 1960 + }, + { + "epoch": 1.8258845437616387, + "grad_norm": 1.6126819849014282, + "learning_rate": 4.011405747879836e-06, + "loss": 1.0125, + "step": 1961 + }, + { + "epoch": 1.8268156424581006, + "grad_norm": 1.5833483934402466, + "learning_rate": 4.010419410201159e-06, + "loss": 1.0153, + "step": 1962 + }, + { + "epoch": 1.8277467411545625, + "grad_norm": 1.5596377849578857, + "learning_rate": 4.00943270212251e-06, + "loss": 1.0023, + "step": 1963 + }, + { + "epoch": 1.8286778398510242, + "grad_norm": 1.5329121351242065, + "learning_rate": 4.008445623885857e-06, + "loss": 0.9717, + "step": 1964 + }, + { + "epoch": 1.829608938547486, + "grad_norm": 1.5359712839126587, + "learning_rate": 4.007458175733264e-06, + "loss": 1.0082, + "step": 1965 + }, + { + "epoch": 1.830540037243948, + "grad_norm": 1.5251449346542358, + "learning_rate": 4.0064703579068805e-06, + "loss": 0.9666, + "step": 1966 + }, + { + "epoch": 1.8314711359404097, + "grad_norm": 1.6319305896759033, + "learning_rate": 4.005482170648951e-06, + "loss": 0.9903, + "step": 1967 + }, + { + "epoch": 1.8324022346368714, + "grad_norm": 1.5564286708831787, + "learning_rate": 4.004493614201808e-06, + "loss": 0.9939, + "step": 1968 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 1.6011050939559937, + "learning_rate": 4.003504688807873e-06, + "loss": 1.0329, + "step": 1969 + }, + { + "epoch": 1.8342644320297952, + "grad_norm": 1.5545549392700195, + "learning_rate": 4.002515394709663e-06, + "loss": 0.9753, + "step": 1970 + }, + { + "epoch": 1.8351955307262569, + "grad_norm": 1.5623029470443726, + "learning_rate": 4.00152573214978e-06, + "loss": 1.0234, + "step": 1971 + }, + { + "epoch": 1.8361266294227188, + "grad_norm": 1.4882252216339111, + "learning_rate": 4.0005357013709215e-06, + "loss": 0.948, + "step": 1972 + }, + { + "epoch": 1.8370577281191807, + "grad_norm": 1.515845775604248, + "learning_rate": 3.999545302615869e-06, + "loss": 0.9678, + "step": 1973 + }, + { + "epoch": 1.8379888268156424, + "grad_norm": 1.6062061786651611, + "learning_rate": 3.998554536127502e-06, + "loss": 1.02, + "step": 1974 + }, + { + "epoch": 1.8389199255121043, + "grad_norm": 1.6199564933776855, + "learning_rate": 3.997563402148783e-06, + "loss": 1.0337, + "step": 1975 + }, + { + "epoch": 1.8398510242085662, + "grad_norm": 1.4950660467147827, + "learning_rate": 3.996571900922769e-06, + "loss": 0.9718, + "step": 1976 + }, + { + "epoch": 1.8407821229050279, + "grad_norm": 1.546147346496582, + "learning_rate": 3.995580032692604e-06, + "loss": 1.0317, + "step": 1977 + }, + { + "epoch": 1.8417132216014898, + "grad_norm": 1.5217554569244385, + "learning_rate": 3.994587797701527e-06, + "loss": 0.9711, + "step": 1978 + }, + { + "epoch": 1.8426443202979517, + "grad_norm": 1.492216944694519, + "learning_rate": 3.993595196192861e-06, + "loss": 1.011, + "step": 1979 + }, + { + "epoch": 1.8435754189944134, + "grad_norm": 1.574309229850769, + "learning_rate": 3.992602228410023e-06, + "loss": 0.9773, + "step": 1980 + }, + { + "epoch": 1.8445065176908753, + "grad_norm": 1.548874020576477, + "learning_rate": 3.9916088945965165e-06, + "loss": 1.0043, + "step": 1981 + }, + { + "epoch": 1.8454376163873372, + "grad_norm": 1.4939056634902954, + "learning_rate": 3.990615194995939e-06, + "loss": 0.978, + "step": 1982 + }, + { + "epoch": 1.8463687150837989, + "grad_norm": 1.5849956274032593, + "learning_rate": 3.9896211298519735e-06, + "loss": 1.0117, + "step": 1983 + }, + { + "epoch": 1.8472998137802608, + "grad_norm": 1.5433441400527954, + "learning_rate": 3.988626699408396e-06, + "loss": 1.0079, + "step": 1984 + }, + { + "epoch": 1.8482309124767227, + "grad_norm": 1.5646699666976929, + "learning_rate": 3.987631903909068e-06, + "loss": 1.0014, + "step": 1985 + }, + { + "epoch": 1.8491620111731844, + "grad_norm": 1.5790705680847168, + "learning_rate": 3.986636743597946e-06, + "loss": 1.0123, + "step": 1986 + }, + { + "epoch": 1.850093109869646, + "grad_norm": 1.536658525466919, + "learning_rate": 3.9856412187190715e-06, + "loss": 0.9933, + "step": 1987 + }, + { + "epoch": 1.8510242085661082, + "grad_norm": 1.5895529985427856, + "learning_rate": 3.984645329516578e-06, + "loss": 0.9964, + "step": 1988 + }, + { + "epoch": 1.8519553072625698, + "grad_norm": 1.5518970489501953, + "learning_rate": 3.9836490762346866e-06, + "loss": 1.0266, + "step": 1989 + }, + { + "epoch": 1.8528864059590315, + "grad_norm": 1.576671838760376, + "learning_rate": 3.9826524591177075e-06, + "loss": 1.0368, + "step": 1990 + }, + { + "epoch": 1.8538175046554934, + "grad_norm": 1.5843610763549805, + "learning_rate": 3.981655478410043e-06, + "loss": 1.0336, + "step": 1991 + }, + { + "epoch": 1.8547486033519553, + "grad_norm": 1.5642727613449097, + "learning_rate": 3.98065813435618e-06, + "loss": 0.9767, + "step": 1992 + }, + { + "epoch": 1.855679702048417, + "grad_norm": 1.5783385038375854, + "learning_rate": 3.979660427200699e-06, + "loss": 1.0132, + "step": 1993 + }, + { + "epoch": 1.856610800744879, + "grad_norm": 1.6256749629974365, + "learning_rate": 3.978662357188268e-06, + "loss": 1.0206, + "step": 1994 + }, + { + "epoch": 1.8575418994413408, + "grad_norm": 1.5093574523925781, + "learning_rate": 3.977663924563642e-06, + "loss": 0.9927, + "step": 1995 + }, + { + "epoch": 1.8584729981378025, + "grad_norm": 1.561970591545105, + "learning_rate": 3.976665129571667e-06, + "loss": 0.9956, + "step": 1996 + }, + { + "epoch": 1.8594040968342644, + "grad_norm": 1.5527724027633667, + "learning_rate": 3.975665972457278e-06, + "loss": 1.0261, + "step": 1997 + }, + { + "epoch": 1.8603351955307263, + "grad_norm": 1.6018401384353638, + "learning_rate": 3.9746664534654975e-06, + "loss": 1.0463, + "step": 1998 + }, + { + "epoch": 1.861266294227188, + "grad_norm": 1.5350617170333862, + "learning_rate": 3.973666572841438e-06, + "loss": 1.0295, + "step": 1999 + }, + { + "epoch": 1.86219739292365, + "grad_norm": 1.4940292835235596, + "learning_rate": 3.9726663308302995e-06, + "loss": 0.9825, + "step": 2000 + }, + { + "epoch": 1.8631284916201118, + "grad_norm": 1.593605875968933, + "learning_rate": 3.971665727677371e-06, + "loss": 1.0417, + "step": 2001 + }, + { + "epoch": 1.8640595903165735, + "grad_norm": 1.5552031993865967, + "learning_rate": 3.970664763628032e-06, + "loss": 1.0131, + "step": 2002 + }, + { + "epoch": 1.8649906890130352, + "grad_norm": 1.5304882526397705, + "learning_rate": 3.969663438927747e-06, + "loss": 1.0129, + "step": 2003 + }, + { + "epoch": 1.8659217877094973, + "grad_norm": 1.5545238256454468, + "learning_rate": 3.96866175382207e-06, + "loss": 1.0015, + "step": 2004 + }, + { + "epoch": 1.866852886405959, + "grad_norm": 1.5410882234573364, + "learning_rate": 3.967659708556647e-06, + "loss": 1.0546, + "step": 2005 + }, + { + "epoch": 1.8677839851024207, + "grad_norm": 1.518667221069336, + "learning_rate": 3.966657303377209e-06, + "loss": 0.9997, + "step": 2006 + }, + { + "epoch": 1.8687150837988828, + "grad_norm": 1.5257500410079956, + "learning_rate": 3.965654538529572e-06, + "loss": 0.9914, + "step": 2007 + }, + { + "epoch": 1.8696461824953445, + "grad_norm": 1.586334228515625, + "learning_rate": 3.964651414259648e-06, + "loss": 0.9874, + "step": 2008 + }, + { + "epoch": 1.8705772811918062, + "grad_norm": 1.5263134241104126, + "learning_rate": 3.963647930813432e-06, + "loss": 1.0115, + "step": 2009 + }, + { + "epoch": 1.8715083798882681, + "grad_norm": 1.5681990385055542, + "learning_rate": 3.962644088437006e-06, + "loss": 1.0251, + "step": 2010 + }, + { + "epoch": 1.87243947858473, + "grad_norm": 1.5999913215637207, + "learning_rate": 3.961639887376546e-06, + "loss": 1.0047, + "step": 2011 + }, + { + "epoch": 1.8733705772811917, + "grad_norm": 1.5397801399230957, + "learning_rate": 3.9606353278783085e-06, + "loss": 0.9562, + "step": 2012 + }, + { + "epoch": 1.8743016759776536, + "grad_norm": 1.6281425952911377, + "learning_rate": 3.959630410188643e-06, + "loss": 0.9745, + "step": 2013 + }, + { + "epoch": 1.8752327746741155, + "grad_norm": 1.5694729089736938, + "learning_rate": 3.958625134553985e-06, + "loss": 0.9789, + "step": 2014 + }, + { + "epoch": 1.8761638733705772, + "grad_norm": 1.5906692743301392, + "learning_rate": 3.95761950122086e-06, + "loss": 1.0406, + "step": 2015 + }, + { + "epoch": 1.8770949720670391, + "grad_norm": 1.592438817024231, + "learning_rate": 3.956613510435876e-06, + "loss": 0.986, + "step": 2016 + }, + { + "epoch": 1.878026070763501, + "grad_norm": 1.57843816280365, + "learning_rate": 3.955607162445735e-06, + "loss": 1.0199, + "step": 2017 + }, + { + "epoch": 1.8789571694599627, + "grad_norm": 1.5538474321365356, + "learning_rate": 3.9546004574972215e-06, + "loss": 1.0128, + "step": 2018 + }, + { + "epoch": 1.8798882681564246, + "grad_norm": 1.5547465085983276, + "learning_rate": 3.953593395837211e-06, + "loss": 1.0451, + "step": 2019 + }, + { + "epoch": 1.8808193668528865, + "grad_norm": 1.537627935409546, + "learning_rate": 3.952585977712664e-06, + "loss": 1.0052, + "step": 2020 + }, + { + "epoch": 1.8817504655493482, + "grad_norm": 1.548630714416504, + "learning_rate": 3.9515782033706305e-06, + "loss": 1.0159, + "step": 2021 + }, + { + "epoch": 1.88268156424581, + "grad_norm": 1.5562455654144287, + "learning_rate": 3.950570073058247e-06, + "loss": 0.9971, + "step": 2022 + }, + { + "epoch": 1.883612662942272, + "grad_norm": 1.559014081954956, + "learning_rate": 3.949561587022736e-06, + "loss": 1.0016, + "step": 2023 + }, + { + "epoch": 1.8845437616387337, + "grad_norm": 1.589950680732727, + "learning_rate": 3.9485527455114095e-06, + "loss": 1.0237, + "step": 2024 + }, + { + "epoch": 1.8854748603351954, + "grad_norm": 1.5429989099502563, + "learning_rate": 3.9475435487716655e-06, + "loss": 1.01, + "step": 2025 + }, + { + "epoch": 1.8864059590316573, + "grad_norm": 1.6332148313522339, + "learning_rate": 3.946533997050988e-06, + "loss": 1.018, + "step": 2026 + }, + { + "epoch": 1.8873370577281192, + "grad_norm": 1.5471529960632324, + "learning_rate": 3.94552409059695e-06, + "loss": 1.0015, + "step": 2027 + }, + { + "epoch": 1.888268156424581, + "grad_norm": 1.5839293003082275, + "learning_rate": 3.944513829657211e-06, + "loss": 1.0256, + "step": 2028 + }, + { + "epoch": 1.8891992551210428, + "grad_norm": 1.5847171545028687, + "learning_rate": 3.9435032144795185e-06, + "loss": 1.0063, + "step": 2029 + }, + { + "epoch": 1.8901303538175047, + "grad_norm": 1.5821939706802368, + "learning_rate": 3.9424922453117036e-06, + "loss": 1.0064, + "step": 2030 + }, + { + "epoch": 1.8910614525139664, + "grad_norm": 1.626847267150879, + "learning_rate": 3.941480922401685e-06, + "loss": 0.9998, + "step": 2031 + }, + { + "epoch": 1.8919925512104283, + "grad_norm": 1.6688166856765747, + "learning_rate": 3.940469245997473e-06, + "loss": 1.0122, + "step": 2032 + }, + { + "epoch": 1.8929236499068902, + "grad_norm": 1.5906034708023071, + "learning_rate": 3.939457216347157e-06, + "loss": 1.0132, + "step": 2033 + }, + { + "epoch": 1.893854748603352, + "grad_norm": 1.5552074909210205, + "learning_rate": 3.93844483369892e-06, + "loss": 0.9796, + "step": 2034 + }, + { + "epoch": 1.8947858472998138, + "grad_norm": 1.6331313848495483, + "learning_rate": 3.937432098301026e-06, + "loss": 1.0066, + "step": 2035 + }, + { + "epoch": 1.8957169459962757, + "grad_norm": 1.6071209907531738, + "learning_rate": 3.936419010401831e-06, + "loss": 1.0172, + "step": 2036 + }, + { + "epoch": 1.8966480446927374, + "grad_norm": 1.590070366859436, + "learning_rate": 3.9354055702497715e-06, + "loss": 0.9958, + "step": 2037 + }, + { + "epoch": 1.8975791433891993, + "grad_norm": 1.5456080436706543, + "learning_rate": 3.934391778093374e-06, + "loss": 0.9578, + "step": 2038 + }, + { + "epoch": 1.8985102420856612, + "grad_norm": 1.5467373132705688, + "learning_rate": 3.933377634181251e-06, + "loss": 1.0239, + "step": 2039 + }, + { + "epoch": 1.899441340782123, + "grad_norm": 1.611258625984192, + "learning_rate": 3.932363138762102e-06, + "loss": 1.0315, + "step": 2040 + }, + { + "epoch": 1.9003724394785846, + "grad_norm": 1.6488022804260254, + "learning_rate": 3.93134829208471e-06, + "loss": 1.0337, + "step": 2041 + }, + { + "epoch": 1.9013035381750467, + "grad_norm": 1.5456761121749878, + "learning_rate": 3.9303330943979465e-06, + "loss": 1.0238, + "step": 2042 + }, + { + "epoch": 1.9022346368715084, + "grad_norm": 1.5027612447738647, + "learning_rate": 3.929317545950767e-06, + "loss": 1.0046, + "step": 2043 + }, + { + "epoch": 1.90316573556797, + "grad_norm": 1.5116413831710815, + "learning_rate": 3.9283016469922165e-06, + "loss": 0.9841, + "step": 2044 + }, + { + "epoch": 1.904096834264432, + "grad_norm": 1.6401547193527222, + "learning_rate": 3.927285397771422e-06, + "loss": 1.0078, + "step": 2045 + }, + { + "epoch": 1.905027932960894, + "grad_norm": 1.5742777585983276, + "learning_rate": 3.9262687985376e-06, + "loss": 0.9828, + "step": 2046 + }, + { + "epoch": 1.9059590316573556, + "grad_norm": 1.5467455387115479, + "learning_rate": 3.925251849540048e-06, + "loss": 0.9998, + "step": 2047 + }, + { + "epoch": 1.9068901303538175, + "grad_norm": 1.534225344657898, + "learning_rate": 3.9242345510281555e-06, + "loss": 0.994, + "step": 2048 + }, + { + "epoch": 1.9078212290502794, + "grad_norm": 1.4439020156860352, + "learning_rate": 3.9232169032513934e-06, + "loss": 0.9708, + "step": 2049 + }, + { + "epoch": 1.908752327746741, + "grad_norm": 1.616662621498108, + "learning_rate": 3.922198906459318e-06, + "loss": 0.9909, + "step": 2050 + }, + { + "epoch": 1.909683426443203, + "grad_norm": 1.6388851404190063, + "learning_rate": 3.921180560901574e-06, + "loss": 1.0037, + "step": 2051 + }, + { + "epoch": 1.910614525139665, + "grad_norm": 1.625038504600525, + "learning_rate": 3.92016186682789e-06, + "loss": 1.028, + "step": 2052 + }, + { + "epoch": 1.9115456238361266, + "grad_norm": 1.6417691707611084, + "learning_rate": 3.91914282448808e-06, + "loss": 1.0026, + "step": 2053 + }, + { + "epoch": 1.9124767225325885, + "grad_norm": 1.5444130897521973, + "learning_rate": 3.918123434132043e-06, + "loss": 0.9872, + "step": 2054 + }, + { + "epoch": 1.9134078212290504, + "grad_norm": 1.6016485691070557, + "learning_rate": 3.9171036960097655e-06, + "loss": 1.0215, + "step": 2055 + }, + { + "epoch": 1.914338919925512, + "grad_norm": 1.5767979621887207, + "learning_rate": 3.9160836103713165e-06, + "loss": 1.025, + "step": 2056 + }, + { + "epoch": 1.9152700186219738, + "grad_norm": 1.6227903366088867, + "learning_rate": 3.915063177466851e-06, + "loss": 0.9902, + "step": 2057 + }, + { + "epoch": 1.916201117318436, + "grad_norm": 1.5350115299224854, + "learning_rate": 3.914042397546611e-06, + "loss": 0.9852, + "step": 2058 + }, + { + "epoch": 1.9171322160148976, + "grad_norm": 1.5428268909454346, + "learning_rate": 3.9130212708609225e-06, + "loss": 0.9821, + "step": 2059 + }, + { + "epoch": 1.9180633147113593, + "grad_norm": 1.5585871934890747, + "learning_rate": 3.911999797660195e-06, + "loss": 1.0135, + "step": 2060 + }, + { + "epoch": 1.9189944134078212, + "grad_norm": 1.589049220085144, + "learning_rate": 3.910977978194925e-06, + "loss": 0.9644, + "step": 2061 + }, + { + "epoch": 1.919925512104283, + "grad_norm": 1.6130081415176392, + "learning_rate": 3.909955812715692e-06, + "loss": 1.0156, + "step": 2062 + }, + { + "epoch": 1.9208566108007448, + "grad_norm": 1.604012131690979, + "learning_rate": 3.908933301473163e-06, + "loss": 1.0444, + "step": 2063 + }, + { + "epoch": 1.9217877094972067, + "grad_norm": 1.5679888725280762, + "learning_rate": 3.907910444718088e-06, + "loss": 1.0237, + "step": 2064 + }, + { + "epoch": 1.9227188081936686, + "grad_norm": 1.6054996252059937, + "learning_rate": 3.906887242701302e-06, + "loss": 1.0414, + "step": 2065 + }, + { + "epoch": 1.9236499068901303, + "grad_norm": 1.5020970106124878, + "learning_rate": 3.9058636956737235e-06, + "loss": 0.9605, + "step": 2066 + }, + { + "epoch": 1.9245810055865922, + "grad_norm": 1.5365707874298096, + "learning_rate": 3.904839803886359e-06, + "loss": 1.0201, + "step": 2067 + }, + { + "epoch": 1.925512104283054, + "grad_norm": 1.5040481090545654, + "learning_rate": 3.903815567590296e-06, + "loss": 1.0102, + "step": 2068 + }, + { + "epoch": 1.9264432029795158, + "grad_norm": 1.502992868423462, + "learning_rate": 3.902790987036707e-06, + "loss": 1.0013, + "step": 2069 + }, + { + "epoch": 1.9273743016759777, + "grad_norm": 1.5323235988616943, + "learning_rate": 3.901766062476852e-06, + "loss": 0.9882, + "step": 2070 + }, + { + "epoch": 1.9283054003724396, + "grad_norm": 1.5687792301177979, + "learning_rate": 3.90074079416207e-06, + "loss": 0.9641, + "step": 2071 + }, + { + "epoch": 1.9292364990689013, + "grad_norm": 1.5298455953598022, + "learning_rate": 3.8997151823437915e-06, + "loss": 0.9996, + "step": 2072 + }, + { + "epoch": 1.9301675977653632, + "grad_norm": 1.5928900241851807, + "learning_rate": 3.8986892272735235e-06, + "loss": 0.9994, + "step": 2073 + }, + { + "epoch": 1.931098696461825, + "grad_norm": 1.631481647491455, + "learning_rate": 3.897662929202863e-06, + "loss": 1.0374, + "step": 2074 + }, + { + "epoch": 1.9320297951582868, + "grad_norm": 1.5729954242706299, + "learning_rate": 3.896636288383489e-06, + "loss": 0.942, + "step": 2075 + }, + { + "epoch": 1.9329608938547485, + "grad_norm": 1.5741610527038574, + "learning_rate": 3.895609305067162e-06, + "loss": 1.0351, + "step": 2076 + }, + { + "epoch": 1.9338919925512106, + "grad_norm": 1.606162667274475, + "learning_rate": 3.894581979505732e-06, + "loss": 1.0079, + "step": 2077 + }, + { + "epoch": 1.9348230912476723, + "grad_norm": 1.5142172574996948, + "learning_rate": 3.8935543119511285e-06, + "loss": 0.9669, + "step": 2078 + }, + { + "epoch": 1.935754189944134, + "grad_norm": 1.596543550491333, + "learning_rate": 3.892526302655367e-06, + "loss": 1.0376, + "step": 2079 + }, + { + "epoch": 1.9366852886405959, + "grad_norm": 1.5575776100158691, + "learning_rate": 3.8914979518705455e-06, + "loss": 1.0272, + "step": 2080 + }, + { + "epoch": 1.9376163873370578, + "grad_norm": 1.5682884454727173, + "learning_rate": 3.8904692598488454e-06, + "loss": 0.9919, + "step": 2081 + }, + { + "epoch": 1.9385474860335195, + "grad_norm": 1.6136903762817383, + "learning_rate": 3.889440226842535e-06, + "loss": 1.0278, + "step": 2082 + }, + { + "epoch": 1.9394785847299814, + "grad_norm": 1.556594729423523, + "learning_rate": 3.8884108531039625e-06, + "loss": 1.037, + "step": 2083 + }, + { + "epoch": 1.9404096834264433, + "grad_norm": 1.5085079669952393, + "learning_rate": 3.887381138885561e-06, + "loss": 1.0262, + "step": 2084 + }, + { + "epoch": 1.941340782122905, + "grad_norm": 1.5283361673355103, + "learning_rate": 3.886351084439847e-06, + "loss": 0.9872, + "step": 2085 + }, + { + "epoch": 1.9422718808193669, + "grad_norm": 1.53825044631958, + "learning_rate": 3.885320690019422e-06, + "loss": 0.9761, + "step": 2086 + }, + { + "epoch": 1.9432029795158288, + "grad_norm": 1.5831376314163208, + "learning_rate": 3.884289955876968e-06, + "loss": 1.0072, + "step": 2087 + }, + { + "epoch": 1.9441340782122905, + "grad_norm": 1.556359052658081, + "learning_rate": 3.883258882265253e-06, + "loss": 1.0276, + "step": 2088 + }, + { + "epoch": 1.9450651769087524, + "grad_norm": 1.6506770849227905, + "learning_rate": 3.882227469437126e-06, + "loss": 1.0758, + "step": 2089 + }, + { + "epoch": 1.9459962756052143, + "grad_norm": 1.5666788816452026, + "learning_rate": 3.881195717645522e-06, + "loss": 1.0241, + "step": 2090 + }, + { + "epoch": 1.946927374301676, + "grad_norm": 1.5779006481170654, + "learning_rate": 3.880163627143454e-06, + "loss": 1.008, + "step": 2091 + }, + { + "epoch": 1.9478584729981379, + "grad_norm": 1.5439555644989014, + "learning_rate": 3.879131198184026e-06, + "loss": 0.9954, + "step": 2092 + }, + { + "epoch": 1.9487895716945998, + "grad_norm": 1.50613534450531, + "learning_rate": 3.878098431020416e-06, + "loss": 1.0075, + "step": 2093 + }, + { + "epoch": 1.9497206703910615, + "grad_norm": 1.5468829870224, + "learning_rate": 3.8770653259058924e-06, + "loss": 1.0132, + "step": 2094 + }, + { + "epoch": 1.9506517690875231, + "grad_norm": 1.4966695308685303, + "learning_rate": 3.876031883093802e-06, + "loss": 0.98, + "step": 2095 + }, + { + "epoch": 1.9515828677839853, + "grad_norm": 1.5314258337020874, + "learning_rate": 3.874998102837577e-06, + "loss": 0.9633, + "step": 2096 + }, + { + "epoch": 1.952513966480447, + "grad_norm": 1.6018353700637817, + "learning_rate": 3.873963985390729e-06, + "loss": 1.0623, + "step": 2097 + }, + { + "epoch": 1.9534450651769086, + "grad_norm": 1.5945348739624023, + "learning_rate": 3.872929531006858e-06, + "loss": 1.0109, + "step": 2098 + }, + { + "epoch": 1.9543761638733705, + "grad_norm": 1.5638896226882935, + "learning_rate": 3.87189473993964e-06, + "loss": 0.9776, + "step": 2099 + }, + { + "epoch": 1.9553072625698324, + "grad_norm": 1.555023193359375, + "learning_rate": 3.870859612442837e-06, + "loss": 1.0104, + "step": 2100 + }, + { + "epoch": 1.9562383612662941, + "grad_norm": 1.6065038442611694, + "learning_rate": 3.869824148770295e-06, + "loss": 1.0362, + "step": 2101 + }, + { + "epoch": 1.957169459962756, + "grad_norm": 1.5318093299865723, + "learning_rate": 3.868788349175939e-06, + "loss": 0.9929, + "step": 2102 + }, + { + "epoch": 1.958100558659218, + "grad_norm": 1.661205768585205, + "learning_rate": 3.867752213913779e-06, + "loss": 1.0048, + "step": 2103 + }, + { + "epoch": 1.9590316573556796, + "grad_norm": 1.540018916130066, + "learning_rate": 3.866715743237906e-06, + "loss": 1.016, + "step": 2104 + }, + { + "epoch": 1.9599627560521415, + "grad_norm": 1.7148969173431396, + "learning_rate": 3.865678937402494e-06, + "loss": 0.988, + "step": 2105 + }, + { + "epoch": 1.9608938547486034, + "grad_norm": 1.6059976816177368, + "learning_rate": 3.864641796661798e-06, + "loss": 1.0014, + "step": 2106 + }, + { + "epoch": 1.9618249534450651, + "grad_norm": 1.5444613695144653, + "learning_rate": 3.863604321270156e-06, + "loss": 0.9694, + "step": 2107 + }, + { + "epoch": 1.962756052141527, + "grad_norm": 1.566719889640808, + "learning_rate": 3.862566511481987e-06, + "loss": 1.009, + "step": 2108 + }, + { + "epoch": 1.963687150837989, + "grad_norm": 1.6250853538513184, + "learning_rate": 3.8615283675517965e-06, + "loss": 1.0017, + "step": 2109 + }, + { + "epoch": 1.9646182495344506, + "grad_norm": 1.5748748779296875, + "learning_rate": 3.860489889734165e-06, + "loss": 0.9933, + "step": 2110 + }, + { + "epoch": 1.9655493482309123, + "grad_norm": 1.5659135580062866, + "learning_rate": 3.859451078283759e-06, + "loss": 0.9979, + "step": 2111 + }, + { + "epoch": 1.9664804469273744, + "grad_norm": 1.5945208072662354, + "learning_rate": 3.858411933455326e-06, + "loss": 1.0218, + "step": 2112 + }, + { + "epoch": 1.9674115456238361, + "grad_norm": 1.6475328207015991, + "learning_rate": 3.857372455503698e-06, + "loss": 1.0261, + "step": 2113 + }, + { + "epoch": 1.9683426443202978, + "grad_norm": 1.5161283016204834, + "learning_rate": 3.856332644683781e-06, + "loss": 0.9859, + "step": 2114 + }, + { + "epoch": 1.9692737430167597, + "grad_norm": 1.5574990510940552, + "learning_rate": 3.855292501250573e-06, + "loss": 1.0285, + "step": 2115 + }, + { + "epoch": 1.9702048417132216, + "grad_norm": 1.5281257629394531, + "learning_rate": 3.854252025459144e-06, + "loss": 1.0304, + "step": 2116 + }, + { + "epoch": 1.9711359404096833, + "grad_norm": 1.5524795055389404, + "learning_rate": 3.853211217564653e-06, + "loss": 0.9691, + "step": 2117 + }, + { + "epoch": 1.9720670391061452, + "grad_norm": 1.481471300125122, + "learning_rate": 3.852170077822335e-06, + "loss": 0.9963, + "step": 2118 + }, + { + "epoch": 1.9729981378026071, + "grad_norm": 1.5913469791412354, + "learning_rate": 3.851128606487509e-06, + "loss": 0.9942, + "step": 2119 + }, + { + "epoch": 1.9739292364990688, + "grad_norm": 1.5506529808044434, + "learning_rate": 3.850086803815576e-06, + "loss": 1.0041, + "step": 2120 + }, + { + "epoch": 1.9748603351955307, + "grad_norm": 1.5746383666992188, + "learning_rate": 3.849044670062016e-06, + "loss": 1.0189, + "step": 2121 + }, + { + "epoch": 1.9757914338919926, + "grad_norm": 1.5480257272720337, + "learning_rate": 3.848002205482392e-06, + "loss": 1.0129, + "step": 2122 + }, + { + "epoch": 1.9767225325884543, + "grad_norm": 1.5308213233947754, + "learning_rate": 3.8469594103323475e-06, + "loss": 1.0026, + "step": 2123 + }, + { + "epoch": 1.9776536312849162, + "grad_norm": 1.5448893308639526, + "learning_rate": 3.845916284867606e-06, + "loss": 0.9704, + "step": 2124 + }, + { + "epoch": 1.9785847299813781, + "grad_norm": 1.517349362373352, + "learning_rate": 3.844872829343973e-06, + "loss": 1.0055, + "step": 2125 + }, + { + "epoch": 1.9795158286778398, + "grad_norm": 1.5437157154083252, + "learning_rate": 3.843829044017337e-06, + "loss": 1.0325, + "step": 2126 + }, + { + "epoch": 1.9804469273743017, + "grad_norm": 1.5579521656036377, + "learning_rate": 3.842784929143663e-06, + "loss": 1.0458, + "step": 2127 + }, + { + "epoch": 1.9813780260707636, + "grad_norm": 1.5024526119232178, + "learning_rate": 3.841740484979002e-06, + "loss": 0.9796, + "step": 2128 + }, + { + "epoch": 1.9823091247672253, + "grad_norm": 1.5668773651123047, + "learning_rate": 3.840695711779479e-06, + "loss": 0.9917, + "step": 2129 + }, + { + "epoch": 1.983240223463687, + "grad_norm": 1.5908945798873901, + "learning_rate": 3.839650609801307e-06, + "loss": 0.9805, + "step": 2130 + }, + { + "epoch": 1.9841713221601491, + "grad_norm": 1.5236387252807617, + "learning_rate": 3.838605179300775e-06, + "loss": 0.9831, + "step": 2131 + }, + { + "epoch": 1.9851024208566108, + "grad_norm": 1.5576443672180176, + "learning_rate": 3.837559420534253e-06, + "loss": 1.0294, + "step": 2132 + }, + { + "epoch": 1.9860335195530725, + "grad_norm": 1.5494862794876099, + "learning_rate": 3.836513333758195e-06, + "loss": 1.0007, + "step": 2133 + }, + { + "epoch": 1.9869646182495344, + "grad_norm": 1.5600168704986572, + "learning_rate": 3.835466919229129e-06, + "loss": 0.9943, + "step": 2134 + }, + { + "epoch": 1.9878957169459963, + "grad_norm": 1.4927576780319214, + "learning_rate": 3.83442017720367e-06, + "loss": 0.9551, + "step": 2135 + }, + { + "epoch": 1.988826815642458, + "grad_norm": 1.5802292823791504, + "learning_rate": 3.833373107938509e-06, + "loss": 1.0159, + "step": 2136 + }, + { + "epoch": 1.98975791433892, + "grad_norm": 1.5049283504486084, + "learning_rate": 3.832325711690419e-06, + "loss": 1.0163, + "step": 2137 + }, + { + "epoch": 1.9906890130353818, + "grad_norm": 1.6011770963668823, + "learning_rate": 3.831277988716252e-06, + "loss": 1.0292, + "step": 2138 + }, + { + "epoch": 1.9916201117318435, + "grad_norm": 1.5500998497009277, + "learning_rate": 3.830229939272943e-06, + "loss": 1.0216, + "step": 2139 + }, + { + "epoch": 1.9925512104283054, + "grad_norm": 1.6106137037277222, + "learning_rate": 3.829181563617504e-06, + "loss": 1.0065, + "step": 2140 + }, + { + "epoch": 1.9934823091247673, + "grad_norm": 1.5394093990325928, + "learning_rate": 3.828132862007027e-06, + "loss": 0.9953, + "step": 2141 + }, + { + "epoch": 1.994413407821229, + "grad_norm": 1.4998019933700562, + "learning_rate": 3.827083834698687e-06, + "loss": 0.9858, + "step": 2142 + }, + { + "epoch": 1.995344506517691, + "grad_norm": 1.6170512437820435, + "learning_rate": 3.826034481949734e-06, + "loss": 1.0158, + "step": 2143 + }, + { + "epoch": 1.9962756052141528, + "grad_norm": 1.5653436183929443, + "learning_rate": 3.824984804017505e-06, + "loss": 0.981, + "step": 2144 + }, + { + "epoch": 1.9972067039106145, + "grad_norm": 1.4922889471054077, + "learning_rate": 3.823934801159408e-06, + "loss": 0.9946, + "step": 2145 + }, + { + "epoch": 1.9981378026070762, + "grad_norm": 1.5930675268173218, + "learning_rate": 3.822884473632937e-06, + "loss": 1.0029, + "step": 2146 + }, + { + "epoch": 1.9990689013035383, + "grad_norm": 1.6091874837875366, + "learning_rate": 3.821833821695664e-06, + "loss": 1.0429, + "step": 2147 + }, + { + "epoch": 2.0, + "grad_norm": 1.80600905418396, + "learning_rate": 3.82078284560524e-06, + "loss": 1.0268, + "step": 2148 + }, + { + "epoch": 2.0009310986964617, + "grad_norm": 1.5678883790969849, + "learning_rate": 3.819731545619395e-06, + "loss": 0.9571, + "step": 2149 + }, + { + "epoch": 2.001862197392924, + "grad_norm": 1.5916420221328735, + "learning_rate": 3.81867992199594e-06, + "loss": 0.9762, + "step": 2150 + }, + { + "epoch": 2.0027932960893855, + "grad_norm": 1.561748743057251, + "learning_rate": 3.817627974992765e-06, + "loss": 0.9838, + "step": 2151 + }, + { + "epoch": 2.003724394785847, + "grad_norm": 1.5832701921463013, + "learning_rate": 3.816575704867836e-06, + "loss": 0.9845, + "step": 2152 + }, + { + "epoch": 2.0046554934823093, + "grad_norm": 1.5434550046920776, + "learning_rate": 3.815523111879206e-06, + "loss": 0.9923, + "step": 2153 + }, + { + "epoch": 2.005586592178771, + "grad_norm": 1.609713077545166, + "learning_rate": 3.8144701962849973e-06, + "loss": 0.9661, + "step": 2154 + }, + { + "epoch": 2.0065176908752327, + "grad_norm": 1.5397262573242188, + "learning_rate": 3.81341695834342e-06, + "loss": 0.928, + "step": 2155 + }, + { + "epoch": 2.007448789571695, + "grad_norm": 1.503147840499878, + "learning_rate": 3.812363398312757e-06, + "loss": 0.9651, + "step": 2156 + }, + { + "epoch": 2.0083798882681565, + "grad_norm": 1.5558241605758667, + "learning_rate": 3.8113095164513737e-06, + "loss": 0.9624, + "step": 2157 + }, + { + "epoch": 2.009310986964618, + "grad_norm": 1.6071900129318237, + "learning_rate": 3.8102553130177133e-06, + "loss": 0.9785, + "step": 2158 + }, + { + "epoch": 2.01024208566108, + "grad_norm": 1.5680207014083862, + "learning_rate": 3.8092007882702973e-06, + "loss": 0.9524, + "step": 2159 + }, + { + "epoch": 2.011173184357542, + "grad_norm": 1.6078006029129028, + "learning_rate": 3.808145942467729e-06, + "loss": 0.9604, + "step": 2160 + }, + { + "epoch": 2.0121042830540037, + "grad_norm": 1.5493887662887573, + "learning_rate": 3.807090775868686e-06, + "loss": 0.9508, + "step": 2161 + }, + { + "epoch": 2.0130353817504654, + "grad_norm": 1.6327743530273438, + "learning_rate": 3.8060352887319264e-06, + "loss": 1.0041, + "step": 2162 + }, + { + "epoch": 2.0139664804469275, + "grad_norm": 1.5337445735931396, + "learning_rate": 3.804979481316289e-06, + "loss": 0.9469, + "step": 2163 + }, + { + "epoch": 2.014897579143389, + "grad_norm": 1.581568717956543, + "learning_rate": 3.8039233538806873e-06, + "loss": 0.9633, + "step": 2164 + }, + { + "epoch": 2.015828677839851, + "grad_norm": 1.6584010124206543, + "learning_rate": 3.8028669066841172e-06, + "loss": 1.0006, + "step": 2165 + }, + { + "epoch": 2.016759776536313, + "grad_norm": 1.5558737516403198, + "learning_rate": 3.80181013998565e-06, + "loss": 0.971, + "step": 2166 + }, + { + "epoch": 2.0176908752327747, + "grad_norm": 1.5990629196166992, + "learning_rate": 3.800753054044437e-06, + "loss": 0.9652, + "step": 2167 + }, + { + "epoch": 2.0186219739292364, + "grad_norm": 1.5948203802108765, + "learning_rate": 3.799695649119706e-06, + "loss": 0.9601, + "step": 2168 + }, + { + "epoch": 2.0195530726256985, + "grad_norm": 1.646085500717163, + "learning_rate": 3.7986379254707663e-06, + "loss": 0.9891, + "step": 2169 + }, + { + "epoch": 2.02048417132216, + "grad_norm": 1.5336343050003052, + "learning_rate": 3.797579883357002e-06, + "loss": 0.9692, + "step": 2170 + }, + { + "epoch": 2.021415270018622, + "grad_norm": 1.5603774785995483, + "learning_rate": 3.7965215230378766e-06, + "loss": 0.9762, + "step": 2171 + }, + { + "epoch": 2.022346368715084, + "grad_norm": 1.5954972505569458, + "learning_rate": 3.7954628447729326e-06, + "loss": 0.9657, + "step": 2172 + }, + { + "epoch": 2.0232774674115457, + "grad_norm": 1.5735665559768677, + "learning_rate": 3.7944038488217884e-06, + "loss": 0.9386, + "step": 2173 + }, + { + "epoch": 2.0242085661080074, + "grad_norm": 1.5985512733459473, + "learning_rate": 3.793344535444142e-06, + "loss": 0.9596, + "step": 2174 + }, + { + "epoch": 2.0251396648044695, + "grad_norm": 1.658065915107727, + "learning_rate": 3.7922849048997688e-06, + "loss": 0.9683, + "step": 2175 + }, + { + "epoch": 2.026070763500931, + "grad_norm": 1.6113154888153076, + "learning_rate": 3.7912249574485226e-06, + "loss": 0.963, + "step": 2176 + }, + { + "epoch": 2.027001862197393, + "grad_norm": 1.6236968040466309, + "learning_rate": 3.7901646933503323e-06, + "loss": 0.9318, + "step": 2177 + }, + { + "epoch": 2.0279329608938546, + "grad_norm": 1.576192021369934, + "learning_rate": 3.7891041128652085e-06, + "loss": 0.9485, + "step": 2178 + }, + { + "epoch": 2.0288640595903167, + "grad_norm": 1.5539826154708862, + "learning_rate": 3.7880432162532354e-06, + "loss": 0.9295, + "step": 2179 + }, + { + "epoch": 2.0297951582867784, + "grad_norm": 1.624599575996399, + "learning_rate": 3.7869820037745773e-06, + "loss": 0.9312, + "step": 2180 + }, + { + "epoch": 2.03072625698324, + "grad_norm": 1.5872856378555298, + "learning_rate": 3.7859204756894754e-06, + "loss": 0.9381, + "step": 2181 + }, + { + "epoch": 2.031657355679702, + "grad_norm": 1.6143959760665894, + "learning_rate": 3.7848586322582475e-06, + "loss": 0.9437, + "step": 2182 + }, + { + "epoch": 2.032588454376164, + "grad_norm": 1.548168420791626, + "learning_rate": 3.78379647374129e-06, + "loss": 0.9271, + "step": 2183 + }, + { + "epoch": 2.0335195530726256, + "grad_norm": 1.6314013004302979, + "learning_rate": 3.7827340003990752e-06, + "loss": 1.0021, + "step": 2184 + }, + { + "epoch": 2.0344506517690877, + "grad_norm": 1.6110419034957886, + "learning_rate": 3.7816712124921553e-06, + "loss": 0.9576, + "step": 2185 + }, + { + "epoch": 2.0353817504655494, + "grad_norm": 1.6009128093719482, + "learning_rate": 3.7806081102811542e-06, + "loss": 0.965, + "step": 2186 + }, + { + "epoch": 2.036312849162011, + "grad_norm": 1.5964843034744263, + "learning_rate": 3.77954469402678e-06, + "loss": 0.9919, + "step": 2187 + }, + { + "epoch": 2.037243947858473, + "grad_norm": 1.5717049837112427, + "learning_rate": 3.7784809639898114e-06, + "loss": 0.9531, + "step": 2188 + }, + { + "epoch": 2.038175046554935, + "grad_norm": 1.5899442434310913, + "learning_rate": 3.777416920431108e-06, + "loss": 0.9588, + "step": 2189 + }, + { + "epoch": 2.0391061452513966, + "grad_norm": 1.6514818668365479, + "learning_rate": 3.776352563611604e-06, + "loss": 0.9617, + "step": 2190 + }, + { + "epoch": 2.0400372439478587, + "grad_norm": 1.5895659923553467, + "learning_rate": 3.775287893792314e-06, + "loss": 0.9674, + "step": 2191 + }, + { + "epoch": 2.0409683426443204, + "grad_norm": 1.5712453126907349, + "learning_rate": 3.7742229112343245e-06, + "loss": 0.9766, + "step": 2192 + }, + { + "epoch": 2.041899441340782, + "grad_norm": 1.5232397317886353, + "learning_rate": 3.7731576161988005e-06, + "loss": 0.9601, + "step": 2193 + }, + { + "epoch": 2.0428305400372437, + "grad_norm": 1.5671443939208984, + "learning_rate": 3.772092008946987e-06, + "loss": 0.9401, + "step": 2194 + }, + { + "epoch": 2.043761638733706, + "grad_norm": 1.6170986890792847, + "learning_rate": 3.7710260897402e-06, + "loss": 0.9331, + "step": 2195 + }, + { + "epoch": 2.0446927374301676, + "grad_norm": 1.6012412309646606, + "learning_rate": 3.7699598588398367e-06, + "loss": 0.9889, + "step": 2196 + }, + { + "epoch": 2.0456238361266292, + "grad_norm": 1.5996229648590088, + "learning_rate": 3.7688933165073676e-06, + "loss": 0.9558, + "step": 2197 + }, + { + "epoch": 2.0465549348230914, + "grad_norm": 1.5653645992279053, + "learning_rate": 3.7678264630043416e-06, + "loss": 0.9898, + "step": 2198 + }, + { + "epoch": 2.047486033519553, + "grad_norm": 1.6206297874450684, + "learning_rate": 3.7667592985923827e-06, + "loss": 0.9878, + "step": 2199 + }, + { + "epoch": 2.0484171322160147, + "grad_norm": 1.6035964488983154, + "learning_rate": 3.7656918235331906e-06, + "loss": 0.9504, + "step": 2200 + }, + { + "epoch": 2.049348230912477, + "grad_norm": 1.5887678861618042, + "learning_rate": 3.7646240380885434e-06, + "loss": 0.9061, + "step": 2201 + }, + { + "epoch": 2.0502793296089385, + "grad_norm": 1.528428554534912, + "learning_rate": 3.763555942520293e-06, + "loss": 0.9024, + "step": 2202 + }, + { + "epoch": 2.0512104283054002, + "grad_norm": 1.601758599281311, + "learning_rate": 3.7624875370903695e-06, + "loss": 0.9566, + "step": 2203 + }, + { + "epoch": 2.0521415270018624, + "grad_norm": 1.5678621530532837, + "learning_rate": 3.7614188220607767e-06, + "loss": 0.9635, + "step": 2204 + }, + { + "epoch": 2.053072625698324, + "grad_norm": 1.6154072284698486, + "learning_rate": 3.7603497976935967e-06, + "loss": 0.9341, + "step": 2205 + }, + { + "epoch": 2.0540037243947857, + "grad_norm": 1.5354888439178467, + "learning_rate": 3.7592804642509844e-06, + "loss": 0.9631, + "step": 2206 + }, + { + "epoch": 2.054934823091248, + "grad_norm": 1.5997956991195679, + "learning_rate": 3.758210821995174e-06, + "loss": 0.9818, + "step": 2207 + }, + { + "epoch": 2.0558659217877095, + "grad_norm": 1.6126775741577148, + "learning_rate": 3.7571408711884726e-06, + "loss": 1.0008, + "step": 2208 + }, + { + "epoch": 2.0567970204841712, + "grad_norm": 1.6038191318511963, + "learning_rate": 3.756070612093265e-06, + "loss": 0.9348, + "step": 2209 + }, + { + "epoch": 2.0577281191806334, + "grad_norm": 1.617152452468872, + "learning_rate": 3.7550000449720103e-06, + "loss": 0.9586, + "step": 2210 + }, + { + "epoch": 2.058659217877095, + "grad_norm": 1.6337593793869019, + "learning_rate": 3.7539291700872426e-06, + "loss": 0.9399, + "step": 2211 + }, + { + "epoch": 2.0595903165735567, + "grad_norm": 1.5585325956344604, + "learning_rate": 3.752857987701575e-06, + "loss": 0.9364, + "step": 2212 + }, + { + "epoch": 2.0605214152700184, + "grad_norm": 1.645774483680725, + "learning_rate": 3.751786498077691e-06, + "loss": 0.983, + "step": 2213 + }, + { + "epoch": 2.0614525139664805, + "grad_norm": 1.5499606132507324, + "learning_rate": 3.7507147014783523e-06, + "loss": 0.9152, + "step": 2214 + }, + { + "epoch": 2.0623836126629422, + "grad_norm": 1.648878574371338, + "learning_rate": 3.7496425981663965e-06, + "loss": 0.9616, + "step": 2215 + }, + { + "epoch": 2.063314711359404, + "grad_norm": 1.6605618000030518, + "learning_rate": 3.748570188404734e-06, + "loss": 0.9531, + "step": 2216 + }, + { + "epoch": 2.064245810055866, + "grad_norm": 1.619666576385498, + "learning_rate": 3.747497472456353e-06, + "loss": 0.9419, + "step": 2217 + }, + { + "epoch": 2.0651769087523277, + "grad_norm": 1.5980136394500732, + "learning_rate": 3.746424450584315e-06, + "loss": 0.9534, + "step": 2218 + }, + { + "epoch": 2.0661080074487894, + "grad_norm": 1.578047513961792, + "learning_rate": 3.7453511230517563e-06, + "loss": 0.9449, + "step": 2219 + }, + { + "epoch": 2.0670391061452515, + "grad_norm": 1.6305173635482788, + "learning_rate": 3.7442774901218903e-06, + "loss": 0.955, + "step": 2220 + }, + { + "epoch": 2.0679702048417132, + "grad_norm": 1.612795352935791, + "learning_rate": 3.7432035520580025e-06, + "loss": 0.9186, + "step": 2221 + }, + { + "epoch": 2.068901303538175, + "grad_norm": 1.5992122888565063, + "learning_rate": 3.7421293091234555e-06, + "loss": 0.945, + "step": 2222 + }, + { + "epoch": 2.069832402234637, + "grad_norm": 1.6819311380386353, + "learning_rate": 3.741054761581686e-06, + "loss": 0.9983, + "step": 2223 + }, + { + "epoch": 2.0707635009310987, + "grad_norm": 1.6414536237716675, + "learning_rate": 3.7399799096962035e-06, + "loss": 0.953, + "step": 2224 + }, + { + "epoch": 2.0716945996275604, + "grad_norm": 1.5660078525543213, + "learning_rate": 3.738904753730596e-06, + "loss": 0.9565, + "step": 2225 + }, + { + "epoch": 2.0726256983240225, + "grad_norm": 1.6420103311538696, + "learning_rate": 3.7378292939485218e-06, + "loss": 0.9476, + "step": 2226 + }, + { + "epoch": 2.0735567970204842, + "grad_norm": 1.6457041501998901, + "learning_rate": 3.7367535306137175e-06, + "loss": 0.9656, + "step": 2227 + }, + { + "epoch": 2.074487895716946, + "grad_norm": 1.6613293886184692, + "learning_rate": 3.735677463989992e-06, + "loss": 0.9705, + "step": 2228 + }, + { + "epoch": 2.0754189944134076, + "grad_norm": 1.5280052423477173, + "learning_rate": 3.7346010943412282e-06, + "loss": 0.94, + "step": 2229 + }, + { + "epoch": 2.0763500931098697, + "grad_norm": 1.5736757516860962, + "learning_rate": 3.733524421931385e-06, + "loss": 0.9651, + "step": 2230 + }, + { + "epoch": 2.0772811918063314, + "grad_norm": 1.640153169631958, + "learning_rate": 3.732447447024493e-06, + "loss": 0.986, + "step": 2231 + }, + { + "epoch": 2.078212290502793, + "grad_norm": 1.59702467918396, + "learning_rate": 3.7313701698846616e-06, + "loss": 0.9461, + "step": 2232 + }, + { + "epoch": 2.0791433891992552, + "grad_norm": 1.6325974464416504, + "learning_rate": 3.7302925907760682e-06, + "loss": 0.9514, + "step": 2233 + }, + { + "epoch": 2.080074487895717, + "grad_norm": 1.5375553369522095, + "learning_rate": 3.7292147099629707e-06, + "loss": 0.9078, + "step": 2234 + }, + { + "epoch": 2.0810055865921786, + "grad_norm": 1.582912802696228, + "learning_rate": 3.7281365277096937e-06, + "loss": 0.9446, + "step": 2235 + }, + { + "epoch": 2.0819366852886407, + "grad_norm": 1.616733193397522, + "learning_rate": 3.7270580442806425e-06, + "loss": 0.9491, + "step": 2236 + }, + { + "epoch": 2.0828677839851024, + "grad_norm": 1.6279919147491455, + "learning_rate": 3.725979259940293e-06, + "loss": 0.9445, + "step": 2237 + }, + { + "epoch": 2.083798882681564, + "grad_norm": 1.6524853706359863, + "learning_rate": 3.7249001749531955e-06, + "loss": 0.9977, + "step": 2238 + }, + { + "epoch": 2.0847299813780262, + "grad_norm": 1.6813267469406128, + "learning_rate": 3.723820789583973e-06, + "loss": 0.9726, + "step": 2239 + }, + { + "epoch": 2.085661080074488, + "grad_norm": 1.6247060298919678, + "learning_rate": 3.7227411040973232e-06, + "loss": 0.9426, + "step": 2240 + }, + { + "epoch": 2.0865921787709496, + "grad_norm": 1.6058865785598755, + "learning_rate": 3.7216611187580188e-06, + "loss": 0.9519, + "step": 2241 + }, + { + "epoch": 2.0875232774674117, + "grad_norm": 1.6968483924865723, + "learning_rate": 3.7205808338309023e-06, + "loss": 0.987, + "step": 2242 + }, + { + "epoch": 2.0884543761638734, + "grad_norm": 1.563751220703125, + "learning_rate": 3.719500249580893e-06, + "loss": 0.915, + "step": 2243 + }, + { + "epoch": 2.089385474860335, + "grad_norm": 1.5815222263336182, + "learning_rate": 3.718419366272982e-06, + "loss": 0.9558, + "step": 2244 + }, + { + "epoch": 2.0903165735567972, + "grad_norm": 1.536697268486023, + "learning_rate": 3.7173381841722344e-06, + "loss": 0.9312, + "step": 2245 + }, + { + "epoch": 2.091247672253259, + "grad_norm": 1.6369175910949707, + "learning_rate": 3.7162567035437897e-06, + "loss": 0.9809, + "step": 2246 + }, + { + "epoch": 2.0921787709497206, + "grad_norm": 1.7439841032028198, + "learning_rate": 3.7151749246528567e-06, + "loss": 0.982, + "step": 2247 + }, + { + "epoch": 2.0931098696461823, + "grad_norm": 2.0753540992736816, + "learning_rate": 3.714092847764722e-06, + "loss": 0.9526, + "step": 2248 + }, + { + "epoch": 2.0940409683426444, + "grad_norm": 1.5672013759613037, + "learning_rate": 3.7130104731447415e-06, + "loss": 0.9388, + "step": 2249 + }, + { + "epoch": 2.094972067039106, + "grad_norm": 1.6285289525985718, + "learning_rate": 3.711927801058347e-06, + "loss": 0.9833, + "step": 2250 + }, + { + "epoch": 2.095903165735568, + "grad_norm": 1.6870088577270508, + "learning_rate": 3.710844831771042e-06, + "loss": 0.9354, + "step": 2251 + }, + { + "epoch": 2.09683426443203, + "grad_norm": 1.6003613471984863, + "learning_rate": 3.7097615655484024e-06, + "loss": 0.9592, + "step": 2252 + }, + { + "epoch": 2.0977653631284916, + "grad_norm": 1.6126164197921753, + "learning_rate": 3.708678002656078e-06, + "loss": 0.924, + "step": 2253 + }, + { + "epoch": 2.0986964618249533, + "grad_norm": 1.697689414024353, + "learning_rate": 3.70759414335979e-06, + "loss": 0.9904, + "step": 2254 + }, + { + "epoch": 2.0996275605214154, + "grad_norm": 1.6612224578857422, + "learning_rate": 3.7065099879253343e-06, + "loss": 0.9832, + "step": 2255 + }, + { + "epoch": 2.100558659217877, + "grad_norm": 1.532278060913086, + "learning_rate": 3.7054255366185763e-06, + "loss": 0.9333, + "step": 2256 + }, + { + "epoch": 2.101489757914339, + "grad_norm": 1.6134192943572998, + "learning_rate": 3.7043407897054585e-06, + "loss": 0.9766, + "step": 2257 + }, + { + "epoch": 2.102420856610801, + "grad_norm": 1.6757482290267944, + "learning_rate": 3.703255747451991e-06, + "loss": 0.9844, + "step": 2258 + }, + { + "epoch": 2.1033519553072626, + "grad_norm": 1.5740753412246704, + "learning_rate": 3.7021704101242596e-06, + "loss": 0.9228, + "step": 2259 + }, + { + "epoch": 2.1042830540037243, + "grad_norm": 1.6292288303375244, + "learning_rate": 3.7010847779884207e-06, + "loss": 0.9713, + "step": 2260 + }, + { + "epoch": 2.1052141527001864, + "grad_norm": 1.6327106952667236, + "learning_rate": 3.6999988513107047e-06, + "loss": 0.9618, + "step": 2261 + }, + { + "epoch": 2.106145251396648, + "grad_norm": 1.5581427812576294, + "learning_rate": 3.698912630357413e-06, + "loss": 0.9708, + "step": 2262 + }, + { + "epoch": 2.10707635009311, + "grad_norm": 1.6215169429779053, + "learning_rate": 3.6978261153949197e-06, + "loss": 0.9514, + "step": 2263 + }, + { + "epoch": 2.1080074487895715, + "grad_norm": 1.6247423887252808, + "learning_rate": 3.6967393066896697e-06, + "loss": 0.9442, + "step": 2264 + }, + { + "epoch": 2.1089385474860336, + "grad_norm": 1.639685869216919, + "learning_rate": 3.6956522045081815e-06, + "loss": 0.969, + "step": 2265 + }, + { + "epoch": 2.1098696461824953, + "grad_norm": 1.6477811336517334, + "learning_rate": 3.6945648091170454e-06, + "loss": 0.9996, + "step": 2266 + }, + { + "epoch": 2.110800744878957, + "grad_norm": 1.6385444402694702, + "learning_rate": 3.693477120782923e-06, + "loss": 0.9448, + "step": 2267 + }, + { + "epoch": 2.111731843575419, + "grad_norm": 1.6664146184921265, + "learning_rate": 3.692389139772548e-06, + "loss": 0.9739, + "step": 2268 + }, + { + "epoch": 2.112662942271881, + "grad_norm": 1.606868028640747, + "learning_rate": 3.6913008663527254e-06, + "loss": 0.9649, + "step": 2269 + }, + { + "epoch": 2.1135940409683425, + "grad_norm": 1.6053881645202637, + "learning_rate": 3.690212300790333e-06, + "loss": 0.9266, + "step": 2270 + }, + { + "epoch": 2.1145251396648046, + "grad_norm": 1.6166749000549316, + "learning_rate": 3.689123443352319e-06, + "loss": 0.9652, + "step": 2271 + }, + { + "epoch": 2.1154562383612663, + "grad_norm": 1.7247328758239746, + "learning_rate": 3.688034294305705e-06, + "loss": 1.0175, + "step": 2272 + }, + { + "epoch": 2.116387337057728, + "grad_norm": 1.593933343887329, + "learning_rate": 3.686944853917582e-06, + "loss": 0.9756, + "step": 2273 + }, + { + "epoch": 2.11731843575419, + "grad_norm": 1.5824462175369263, + "learning_rate": 3.6858551224551127e-06, + "loss": 0.9258, + "step": 2274 + }, + { + "epoch": 2.118249534450652, + "grad_norm": 1.7062172889709473, + "learning_rate": 3.6847651001855336e-06, + "loss": 0.9449, + "step": 2275 + }, + { + "epoch": 2.1191806331471135, + "grad_norm": 1.5969610214233398, + "learning_rate": 3.683674787376148e-06, + "loss": 0.9855, + "step": 2276 + }, + { + "epoch": 2.1201117318435756, + "grad_norm": 1.61667799949646, + "learning_rate": 3.6825841842943362e-06, + "loss": 0.9656, + "step": 2277 + }, + { + "epoch": 2.1210428305400373, + "grad_norm": 1.6104581356048584, + "learning_rate": 3.681493291207544e-06, + "loss": 0.9553, + "step": 2278 + }, + { + "epoch": 2.121973929236499, + "grad_norm": 1.6627345085144043, + "learning_rate": 3.680402108383293e-06, + "loss": 0.9833, + "step": 2279 + }, + { + "epoch": 2.122905027932961, + "grad_norm": 1.6288490295410156, + "learning_rate": 3.679310636089174e-06, + "loss": 0.9731, + "step": 2280 + }, + { + "epoch": 2.123836126629423, + "grad_norm": 1.6077247858047485, + "learning_rate": 3.678218874592846e-06, + "loss": 0.9668, + "step": 2281 + }, + { + "epoch": 2.1247672253258845, + "grad_norm": 1.5970665216445923, + "learning_rate": 3.6771268241620444e-06, + "loss": 0.9408, + "step": 2282 + }, + { + "epoch": 2.1256983240223466, + "grad_norm": 1.5959385633468628, + "learning_rate": 3.6760344850645707e-06, + "loss": 0.9184, + "step": 2283 + }, + { + "epoch": 2.1266294227188083, + "grad_norm": 1.637071132659912, + "learning_rate": 3.6749418575683005e-06, + "loss": 0.9414, + "step": 2284 + }, + { + "epoch": 2.12756052141527, + "grad_norm": 1.5946859121322632, + "learning_rate": 3.6738489419411775e-06, + "loss": 0.9392, + "step": 2285 + }, + { + "epoch": 2.1284916201117317, + "grad_norm": 1.6978176832199097, + "learning_rate": 3.6727557384512187e-06, + "loss": 0.9659, + "step": 2286 + }, + { + "epoch": 2.129422718808194, + "grad_norm": 1.620904803276062, + "learning_rate": 3.6716622473665085e-06, + "loss": 0.954, + "step": 2287 + }, + { + "epoch": 2.1303538175046555, + "grad_norm": 1.6063650846481323, + "learning_rate": 3.670568468955205e-06, + "loss": 0.9505, + "step": 2288 + }, + { + "epoch": 2.131284916201117, + "grad_norm": 1.5732965469360352, + "learning_rate": 3.6694744034855347e-06, + "loss": 0.9194, + "step": 2289 + }, + { + "epoch": 2.1322160148975793, + "grad_norm": 1.5468127727508545, + "learning_rate": 3.6683800512257945e-06, + "loss": 0.9728, + "step": 2290 + }, + { + "epoch": 2.133147113594041, + "grad_norm": 1.596658706665039, + "learning_rate": 3.667285412444354e-06, + "loss": 0.9436, + "step": 2291 + }, + { + "epoch": 2.1340782122905027, + "grad_norm": 1.5953705310821533, + "learning_rate": 3.6661904874096506e-06, + "loss": 0.9453, + "step": 2292 + }, + { + "epoch": 2.135009310986965, + "grad_norm": 1.610055923461914, + "learning_rate": 3.6650952763901913e-06, + "loss": 0.9292, + "step": 2293 + }, + { + "epoch": 2.1359404096834265, + "grad_norm": 1.6304115056991577, + "learning_rate": 3.6639997796545567e-06, + "loss": 0.9441, + "step": 2294 + }, + { + "epoch": 2.136871508379888, + "grad_norm": 1.6363351345062256, + "learning_rate": 3.662903997471394e-06, + "loss": 0.9426, + "step": 2295 + }, + { + "epoch": 2.1378026070763503, + "grad_norm": 1.6297560930252075, + "learning_rate": 3.661807930109422e-06, + "loss": 0.9981, + "step": 2296 + }, + { + "epoch": 2.138733705772812, + "grad_norm": 1.6084342002868652, + "learning_rate": 3.660711577837429e-06, + "loss": 0.986, + "step": 2297 + }, + { + "epoch": 2.1396648044692737, + "grad_norm": 1.5748109817504883, + "learning_rate": 3.6596149409242735e-06, + "loss": 0.9023, + "step": 2298 + }, + { + "epoch": 2.1405959031657353, + "grad_norm": 1.623326301574707, + "learning_rate": 3.6585180196388844e-06, + "loss": 0.9698, + "step": 2299 + }, + { + "epoch": 2.1415270018621975, + "grad_norm": 1.6260778903961182, + "learning_rate": 3.6574208142502582e-06, + "loss": 0.9415, + "step": 2300 + }, + { + "epoch": 2.142458100558659, + "grad_norm": 1.6899642944335938, + "learning_rate": 3.656323325027463e-06, + "loss": 0.9999, + "step": 2301 + }, + { + "epoch": 2.143389199255121, + "grad_norm": 1.654906988143921, + "learning_rate": 3.6552255522396367e-06, + "loss": 0.9616, + "step": 2302 + }, + { + "epoch": 2.144320297951583, + "grad_norm": 1.6278623342514038, + "learning_rate": 3.6541274961559854e-06, + "loss": 0.9397, + "step": 2303 + }, + { + "epoch": 2.1452513966480447, + "grad_norm": 1.6147063970565796, + "learning_rate": 3.653029157045785e-06, + "loss": 0.9506, + "step": 2304 + }, + { + "epoch": 2.1461824953445063, + "grad_norm": 1.640665054321289, + "learning_rate": 3.6519305351783814e-06, + "loss": 0.954, + "step": 2305 + }, + { + "epoch": 2.1471135940409685, + "grad_norm": 1.5741463899612427, + "learning_rate": 3.650831630823189e-06, + "loss": 0.9804, + "step": 2306 + }, + { + "epoch": 2.14804469273743, + "grad_norm": 1.5746980905532837, + "learning_rate": 3.649732444249693e-06, + "loss": 0.9251, + "step": 2307 + }, + { + "epoch": 2.148975791433892, + "grad_norm": 1.572954773902893, + "learning_rate": 3.6486329757274454e-06, + "loss": 0.9174, + "step": 2308 + }, + { + "epoch": 2.149906890130354, + "grad_norm": 1.587693214416504, + "learning_rate": 3.6475332255260697e-06, + "loss": 0.9928, + "step": 2309 + }, + { + "epoch": 2.1508379888268156, + "grad_norm": 1.6290266513824463, + "learning_rate": 3.6464331939152576e-06, + "loss": 0.9989, + "step": 2310 + }, + { + "epoch": 2.1517690875232773, + "grad_norm": 1.6513193845748901, + "learning_rate": 3.64533288116477e-06, + "loss": 0.9571, + "step": 2311 + }, + { + "epoch": 2.1527001862197395, + "grad_norm": 1.6338671445846558, + "learning_rate": 3.644232287544435e-06, + "loss": 0.9754, + "step": 2312 + }, + { + "epoch": 2.153631284916201, + "grad_norm": 1.615729808807373, + "learning_rate": 3.6431314133241526e-06, + "loss": 0.957, + "step": 2313 + }, + { + "epoch": 2.154562383612663, + "grad_norm": 1.576277256011963, + "learning_rate": 3.6420302587738886e-06, + "loss": 0.9765, + "step": 2314 + }, + { + "epoch": 2.155493482309125, + "grad_norm": 1.6133235692977905, + "learning_rate": 3.6409288241636808e-06, + "loss": 0.9698, + "step": 2315 + }, + { + "epoch": 2.1564245810055866, + "grad_norm": 1.6441055536270142, + "learning_rate": 3.6398271097636322e-06, + "loss": 0.9425, + "step": 2316 + }, + { + "epoch": 2.1573556797020483, + "grad_norm": 1.6123785972595215, + "learning_rate": 3.6387251158439173e-06, + "loss": 0.9623, + "step": 2317 + }, + { + "epoch": 2.1582867783985105, + "grad_norm": 1.6008034944534302, + "learning_rate": 3.637622842674777e-06, + "loss": 0.9389, + "step": 2318 + }, + { + "epoch": 2.159217877094972, + "grad_norm": 1.6267389059066772, + "learning_rate": 3.6365202905265224e-06, + "loss": 0.9516, + "step": 2319 + }, + { + "epoch": 2.160148975791434, + "grad_norm": 1.635219693183899, + "learning_rate": 3.6354174596695324e-06, + "loss": 0.9528, + "step": 2320 + }, + { + "epoch": 2.1610800744878955, + "grad_norm": 1.6462581157684326, + "learning_rate": 3.6343143503742524e-06, + "loss": 0.9308, + "step": 2321 + }, + { + "epoch": 2.1620111731843576, + "grad_norm": 1.5747215747833252, + "learning_rate": 3.633210962911199e-06, + "loss": 0.9676, + "step": 2322 + }, + { + "epoch": 2.1629422718808193, + "grad_norm": 1.6596400737762451, + "learning_rate": 3.6321072975509564e-06, + "loss": 0.9738, + "step": 2323 + }, + { + "epoch": 2.163873370577281, + "grad_norm": 1.5887181758880615, + "learning_rate": 3.6310033545641753e-06, + "loss": 0.9101, + "step": 2324 + }, + { + "epoch": 2.164804469273743, + "grad_norm": 1.6391619443893433, + "learning_rate": 3.629899134221576e-06, + "loss": 0.9517, + "step": 2325 + }, + { + "epoch": 2.165735567970205, + "grad_norm": 1.5836042165756226, + "learning_rate": 3.6287946367939455e-06, + "loss": 0.9443, + "step": 2326 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 1.7963414192199707, + "learning_rate": 3.6276898625521413e-06, + "loss": 0.9787, + "step": 2327 + }, + { + "epoch": 2.1675977653631286, + "grad_norm": 1.5905530452728271, + "learning_rate": 3.6265848117670847e-06, + "loss": 0.928, + "step": 2328 + }, + { + "epoch": 2.1685288640595903, + "grad_norm": 1.6398658752441406, + "learning_rate": 3.6254794847097695e-06, + "loss": 0.9697, + "step": 2329 + }, + { + "epoch": 2.169459962756052, + "grad_norm": 1.6280919313430786, + "learning_rate": 3.624373881651254e-06, + "loss": 0.9456, + "step": 2330 + }, + { + "epoch": 2.170391061452514, + "grad_norm": 1.691656231880188, + "learning_rate": 3.6232680028626644e-06, + "loss": 0.9969, + "step": 2331 + }, + { + "epoch": 2.171322160148976, + "grad_norm": 1.6128681898117065, + "learning_rate": 3.6221618486151953e-06, + "loss": 0.9372, + "step": 2332 + }, + { + "epoch": 2.1722532588454375, + "grad_norm": 1.6305981874465942, + "learning_rate": 3.6210554191801102e-06, + "loss": 0.9642, + "step": 2333 + }, + { + "epoch": 2.1731843575418996, + "grad_norm": 1.6264302730560303, + "learning_rate": 3.6199487148287376e-06, + "loss": 0.9315, + "step": 2334 + }, + { + "epoch": 2.1741154562383613, + "grad_norm": 1.5924581289291382, + "learning_rate": 3.618841735832474e-06, + "loss": 0.962, + "step": 2335 + }, + { + "epoch": 2.175046554934823, + "grad_norm": 1.5694663524627686, + "learning_rate": 3.6177344824627854e-06, + "loss": 0.9778, + "step": 2336 + }, + { + "epoch": 2.1759776536312847, + "grad_norm": 1.6225179433822632, + "learning_rate": 3.6166269549912013e-06, + "loss": 0.9664, + "step": 2337 + }, + { + "epoch": 2.176908752327747, + "grad_norm": 1.6614285707473755, + "learning_rate": 3.6155191536893225e-06, + "loss": 0.9783, + "step": 2338 + }, + { + "epoch": 2.1778398510242085, + "grad_norm": 1.5901024341583252, + "learning_rate": 3.6144110788288135e-06, + "loss": 0.9345, + "step": 2339 + }, + { + "epoch": 2.17877094972067, + "grad_norm": 1.5973265171051025, + "learning_rate": 3.6133027306814085e-06, + "loss": 0.9681, + "step": 2340 + }, + { + "epoch": 2.1797020484171323, + "grad_norm": 1.6260349750518799, + "learning_rate": 3.612194109518906e-06, + "loss": 0.9398, + "step": 2341 + }, + { + "epoch": 2.180633147113594, + "grad_norm": 1.604882001876831, + "learning_rate": 3.6110852156131746e-06, + "loss": 0.9414, + "step": 2342 + }, + { + "epoch": 2.1815642458100557, + "grad_norm": 1.6518100500106812, + "learning_rate": 3.609976049236148e-06, + "loss": 0.968, + "step": 2343 + }, + { + "epoch": 2.182495344506518, + "grad_norm": 1.56865656375885, + "learning_rate": 3.6088666106598265e-06, + "loss": 0.9395, + "step": 2344 + }, + { + "epoch": 2.1834264432029795, + "grad_norm": 1.7989506721496582, + "learning_rate": 3.6077569001562775e-06, + "loss": 1.0015, + "step": 2345 + }, + { + "epoch": 2.184357541899441, + "grad_norm": 1.6323250532150269, + "learning_rate": 3.6066469179976347e-06, + "loss": 0.9175, + "step": 2346 + }, + { + "epoch": 2.1852886405959033, + "grad_norm": 1.6387637853622437, + "learning_rate": 3.6055366644561006e-06, + "loss": 0.9317, + "step": 2347 + }, + { + "epoch": 2.186219739292365, + "grad_norm": 1.6584813594818115, + "learning_rate": 3.6044261398039416e-06, + "loss": 0.9766, + "step": 2348 + }, + { + "epoch": 2.1871508379888267, + "grad_norm": 1.5685219764709473, + "learning_rate": 3.6033153443134903e-06, + "loss": 0.9108, + "step": 2349 + }, + { + "epoch": 2.188081936685289, + "grad_norm": 1.6261427402496338, + "learning_rate": 3.602204278257149e-06, + "loss": 0.9839, + "step": 2350 + }, + { + "epoch": 2.1890130353817505, + "grad_norm": 1.5805799961090088, + "learning_rate": 3.601092941907384e-06, + "loss": 0.9655, + "step": 2351 + }, + { + "epoch": 2.189944134078212, + "grad_norm": 1.6410917043685913, + "learning_rate": 3.5999813355367262e-06, + "loss": 0.9629, + "step": 2352 + }, + { + "epoch": 2.1908752327746743, + "grad_norm": 1.6219847202301025, + "learning_rate": 3.598869459417777e-06, + "loss": 0.9617, + "step": 2353 + }, + { + "epoch": 2.191806331471136, + "grad_norm": 1.6232783794403076, + "learning_rate": 3.5977573138232e-06, + "loss": 0.9682, + "step": 2354 + }, + { + "epoch": 2.1927374301675977, + "grad_norm": 1.5942046642303467, + "learning_rate": 3.596644899025728e-06, + "loss": 0.9381, + "step": 2355 + }, + { + "epoch": 2.1936685288640594, + "grad_norm": 1.6250545978546143, + "learning_rate": 3.5955322152981575e-06, + "loss": 0.9658, + "step": 2356 + }, + { + "epoch": 2.1945996275605215, + "grad_norm": 1.648079752922058, + "learning_rate": 3.594419262913351e-06, + "loss": 0.9646, + "step": 2357 + }, + { + "epoch": 2.195530726256983, + "grad_norm": 1.5967533588409424, + "learning_rate": 3.59330604214424e-06, + "loss": 0.9281, + "step": 2358 + }, + { + "epoch": 2.196461824953445, + "grad_norm": 1.6187447309494019, + "learning_rate": 3.592192553263817e-06, + "loss": 0.981, + "step": 2359 + }, + { + "epoch": 2.197392923649907, + "grad_norm": 1.5705369710922241, + "learning_rate": 3.5910787965451444e-06, + "loss": 0.9382, + "step": 2360 + }, + { + "epoch": 2.1983240223463687, + "grad_norm": 1.5949432849884033, + "learning_rate": 3.5899647722613482e-06, + "loss": 0.963, + "step": 2361 + }, + { + "epoch": 2.1992551210428304, + "grad_norm": 1.608249306678772, + "learning_rate": 3.5888504806856194e-06, + "loss": 0.964, + "step": 2362 + }, + { + "epoch": 2.2001862197392925, + "grad_norm": 1.6186444759368896, + "learning_rate": 3.5877359220912174e-06, + "loss": 0.9722, + "step": 2363 + }, + { + "epoch": 2.201117318435754, + "grad_norm": 1.613977074623108, + "learning_rate": 3.5866210967514635e-06, + "loss": 0.9208, + "step": 2364 + }, + { + "epoch": 2.202048417132216, + "grad_norm": 1.5724316835403442, + "learning_rate": 3.585506004939748e-06, + "loss": 0.9136, + "step": 2365 + }, + { + "epoch": 2.202979515828678, + "grad_norm": 1.63250732421875, + "learning_rate": 3.5843906469295226e-06, + "loss": 1.0011, + "step": 2366 + }, + { + "epoch": 2.2039106145251397, + "grad_norm": 1.6363906860351562, + "learning_rate": 3.583275022994308e-06, + "loss": 0.9643, + "step": 2367 + }, + { + "epoch": 2.2048417132216014, + "grad_norm": 1.6174997091293335, + "learning_rate": 3.5821591334076893e-06, + "loss": 0.9573, + "step": 2368 + }, + { + "epoch": 2.2057728119180635, + "grad_norm": 1.6489567756652832, + "learning_rate": 3.5810429784433133e-06, + "loss": 0.9676, + "step": 2369 + }, + { + "epoch": 2.206703910614525, + "grad_norm": 1.6908695697784424, + "learning_rate": 3.579926558374897e-06, + "loss": 0.9682, + "step": 2370 + }, + { + "epoch": 2.207635009310987, + "grad_norm": 1.647871494293213, + "learning_rate": 3.5788098734762177e-06, + "loss": 0.9665, + "step": 2371 + }, + { + "epoch": 2.2085661080074486, + "grad_norm": 1.6591806411743164, + "learning_rate": 3.5776929240211227e-06, + "loss": 0.9527, + "step": 2372 + }, + { + "epoch": 2.2094972067039107, + "grad_norm": 1.6059842109680176, + "learning_rate": 3.5765757102835197e-06, + "loss": 0.9841, + "step": 2373 + }, + { + "epoch": 2.2104283054003724, + "grad_norm": 1.6544197797775269, + "learning_rate": 3.5754582325373823e-06, + "loss": 0.9641, + "step": 2374 + }, + { + "epoch": 2.211359404096834, + "grad_norm": 1.6366745233535767, + "learning_rate": 3.574340491056751e-06, + "loss": 0.9386, + "step": 2375 + }, + { + "epoch": 2.212290502793296, + "grad_norm": 1.6419512033462524, + "learning_rate": 3.573222486115727e-06, + "loss": 1.0125, + "step": 2376 + }, + { + "epoch": 2.213221601489758, + "grad_norm": 1.604271650314331, + "learning_rate": 3.5721042179884824e-06, + "loss": 0.9483, + "step": 2377 + }, + { + "epoch": 2.2141527001862196, + "grad_norm": 1.6592247486114502, + "learning_rate": 3.570985686949246e-06, + "loss": 0.9794, + "step": 2378 + }, + { + "epoch": 2.2150837988826817, + "grad_norm": 1.619288682937622, + "learning_rate": 3.569866893272318e-06, + "loss": 0.9412, + "step": 2379 + }, + { + "epoch": 2.2160148975791434, + "grad_norm": 1.6512722969055176, + "learning_rate": 3.568747837232058e-06, + "loss": 0.9479, + "step": 2380 + }, + { + "epoch": 2.216945996275605, + "grad_norm": 1.602245807647705, + "learning_rate": 3.5676285191028926e-06, + "loss": 0.9529, + "step": 2381 + }, + { + "epoch": 2.217877094972067, + "grad_norm": 1.6473281383514404, + "learning_rate": 3.566508939159312e-06, + "loss": 0.961, + "step": 2382 + }, + { + "epoch": 2.218808193668529, + "grad_norm": 1.6244070529937744, + "learning_rate": 3.565389097675872e-06, + "loss": 0.9704, + "step": 2383 + }, + { + "epoch": 2.2197392923649906, + "grad_norm": 1.6561599969863892, + "learning_rate": 3.5642689949271892e-06, + "loss": 0.9874, + "step": 2384 + }, + { + "epoch": 2.2206703910614527, + "grad_norm": 1.6253948211669922, + "learning_rate": 3.563148631187947e-06, + "loss": 0.9708, + "step": 2385 + }, + { + "epoch": 2.2216014897579144, + "grad_norm": 1.6938610076904297, + "learning_rate": 3.562028006732893e-06, + "loss": 0.9425, + "step": 2386 + }, + { + "epoch": 2.222532588454376, + "grad_norm": 1.5649609565734863, + "learning_rate": 3.5609071218368363e-06, + "loss": 0.9248, + "step": 2387 + }, + { + "epoch": 2.223463687150838, + "grad_norm": 1.6441490650177002, + "learning_rate": 3.5597859767746524e-06, + "loss": 0.9543, + "step": 2388 + }, + { + "epoch": 2.2243947858473, + "grad_norm": 1.5588210821151733, + "learning_rate": 3.5586645718212787e-06, + "loss": 0.9334, + "step": 2389 + }, + { + "epoch": 2.2253258845437616, + "grad_norm": 1.6426608562469482, + "learning_rate": 3.557542907251718e-06, + "loss": 0.9572, + "step": 2390 + }, + { + "epoch": 2.2262569832402237, + "grad_norm": 1.6280072927474976, + "learning_rate": 3.5564209833410357e-06, + "loss": 0.9539, + "step": 2391 + }, + { + "epoch": 2.2271880819366854, + "grad_norm": 1.5709218978881836, + "learning_rate": 3.5552988003643613e-06, + "loss": 0.901, + "step": 2392 + }, + { + "epoch": 2.228119180633147, + "grad_norm": 1.657243251800537, + "learning_rate": 3.5541763585968874e-06, + "loss": 0.9599, + "step": 2393 + }, + { + "epoch": 2.2290502793296088, + "grad_norm": 1.68219792842865, + "learning_rate": 3.55305365831387e-06, + "loss": 0.9478, + "step": 2394 + }, + { + "epoch": 2.229981378026071, + "grad_norm": 1.6564964056015015, + "learning_rate": 3.551930699790629e-06, + "loss": 0.9289, + "step": 2395 + }, + { + "epoch": 2.2309124767225326, + "grad_norm": 1.6154779195785522, + "learning_rate": 3.5508074833025476e-06, + "loss": 0.9397, + "step": 2396 + }, + { + "epoch": 2.2318435754189943, + "grad_norm": 1.6187154054641724, + "learning_rate": 3.5496840091250716e-06, + "loss": 0.9596, + "step": 2397 + }, + { + "epoch": 2.2327746741154564, + "grad_norm": 1.6263182163238525, + "learning_rate": 3.548560277533711e-06, + "loss": 0.9812, + "step": 2398 + }, + { + "epoch": 2.233705772811918, + "grad_norm": 1.5656661987304688, + "learning_rate": 3.547436288804038e-06, + "loss": 0.9455, + "step": 2399 + }, + { + "epoch": 2.2346368715083798, + "grad_norm": 1.5954402685165405, + "learning_rate": 3.546312043211687e-06, + "loss": 0.92, + "step": 2400 + }, + { + "epoch": 2.235567970204842, + "grad_norm": 1.6701622009277344, + "learning_rate": 3.5451875410323587e-06, + "loss": 0.9477, + "step": 2401 + }, + { + "epoch": 2.2364990689013036, + "grad_norm": 1.6704421043395996, + "learning_rate": 3.5440627825418143e-06, + "loss": 0.9595, + "step": 2402 + }, + { + "epoch": 2.2374301675977653, + "grad_norm": 1.6545499563217163, + "learning_rate": 3.542937768015877e-06, + "loss": 0.9771, + "step": 2403 + }, + { + "epoch": 2.2383612662942274, + "grad_norm": 1.6518737077713013, + "learning_rate": 3.541812497730435e-06, + "loss": 0.9653, + "step": 2404 + }, + { + "epoch": 2.239292364990689, + "grad_norm": 1.6180881261825562, + "learning_rate": 3.540686971961438e-06, + "loss": 0.9728, + "step": 2405 + }, + { + "epoch": 2.2402234636871508, + "grad_norm": 1.6360397338867188, + "learning_rate": 3.5395611909848986e-06, + "loss": 0.9367, + "step": 2406 + }, + { + "epoch": 2.2411545623836124, + "grad_norm": 1.682340383529663, + "learning_rate": 3.5384351550768916e-06, + "loss": 0.9692, + "step": 2407 + }, + { + "epoch": 2.2420856610800746, + "grad_norm": 1.6571754217147827, + "learning_rate": 3.5373088645135543e-06, + "loss": 1.0257, + "step": 2408 + }, + { + "epoch": 2.2430167597765363, + "grad_norm": 1.5855857133865356, + "learning_rate": 3.5361823195710874e-06, + "loss": 0.948, + "step": 2409 + }, + { + "epoch": 2.243947858472998, + "grad_norm": 1.8361417055130005, + "learning_rate": 3.535055520525753e-06, + "loss": 0.9757, + "step": 2410 + }, + { + "epoch": 2.24487895716946, + "grad_norm": 1.6739006042480469, + "learning_rate": 3.5339284676538774e-06, + "loss": 1.0081, + "step": 2411 + }, + { + "epoch": 2.2458100558659218, + "grad_norm": 1.6460050344467163, + "learning_rate": 3.5328011612318454e-06, + "loss": 0.9522, + "step": 2412 + }, + { + "epoch": 2.2467411545623834, + "grad_norm": 1.601426124572754, + "learning_rate": 3.531673601536108e-06, + "loss": 0.9687, + "step": 2413 + }, + { + "epoch": 2.2476722532588456, + "grad_norm": 1.6364185810089111, + "learning_rate": 3.5305457888431747e-06, + "loss": 1.0097, + "step": 2414 + }, + { + "epoch": 2.2486033519553073, + "grad_norm": 1.6102561950683594, + "learning_rate": 3.529417723429621e-06, + "loss": 0.9442, + "step": 2415 + }, + { + "epoch": 2.249534450651769, + "grad_norm": 1.646165370941162, + "learning_rate": 3.5282894055720803e-06, + "loss": 0.9272, + "step": 2416 + }, + { + "epoch": 2.250465549348231, + "grad_norm": 1.6000772714614868, + "learning_rate": 3.5271608355472513e-06, + "loss": 0.941, + "step": 2417 + }, + { + "epoch": 2.2513966480446927, + "grad_norm": 1.7036372423171997, + "learning_rate": 3.5260320136318927e-06, + "loss": 0.9855, + "step": 2418 + }, + { + "epoch": 2.2523277467411544, + "grad_norm": 1.646814227104187, + "learning_rate": 3.5249029401028247e-06, + "loss": 0.9612, + "step": 2419 + }, + { + "epoch": 2.2532588454376166, + "grad_norm": 1.5991038084030151, + "learning_rate": 3.5237736152369305e-06, + "loss": 0.9251, + "step": 2420 + }, + { + "epoch": 2.2541899441340782, + "grad_norm": 1.5567059516906738, + "learning_rate": 3.522644039311154e-06, + "loss": 0.9331, + "step": 2421 + }, + { + "epoch": 2.25512104283054, + "grad_norm": 1.5913971662521362, + "learning_rate": 3.5215142126025013e-06, + "loss": 0.919, + "step": 2422 + }, + { + "epoch": 2.256052141527002, + "grad_norm": 1.616965651512146, + "learning_rate": 3.52038413538804e-06, + "loss": 0.964, + "step": 2423 + }, + { + "epoch": 2.2569832402234637, + "grad_norm": 1.612487554550171, + "learning_rate": 3.519253807944898e-06, + "loss": 0.972, + "step": 2424 + }, + { + "epoch": 2.2579143389199254, + "grad_norm": 1.5858962535858154, + "learning_rate": 3.5181232305502657e-06, + "loss": 0.9513, + "step": 2425 + }, + { + "epoch": 2.2588454376163876, + "grad_norm": 1.7430367469787598, + "learning_rate": 3.516992403481394e-06, + "loss": 0.9933, + "step": 2426 + }, + { + "epoch": 2.2597765363128492, + "grad_norm": 1.5940974950790405, + "learning_rate": 3.515861327015596e-06, + "loss": 0.964, + "step": 2427 + }, + { + "epoch": 2.260707635009311, + "grad_norm": 1.5860908031463623, + "learning_rate": 3.514730001430246e-06, + "loss": 0.9204, + "step": 2428 + }, + { + "epoch": 2.2616387337057726, + "grad_norm": 1.585479497909546, + "learning_rate": 3.513598427002778e-06, + "loss": 0.9387, + "step": 2429 + }, + { + "epoch": 2.2625698324022347, + "grad_norm": 1.726728916168213, + "learning_rate": 3.512466604010688e-06, + "loss": 0.9552, + "step": 2430 + }, + { + "epoch": 2.2635009310986964, + "grad_norm": 1.5958534479141235, + "learning_rate": 3.511334532731533e-06, + "loss": 0.9718, + "step": 2431 + }, + { + "epoch": 2.264432029795158, + "grad_norm": 1.6186741590499878, + "learning_rate": 3.51020221344293e-06, + "loss": 0.9475, + "step": 2432 + }, + { + "epoch": 2.2653631284916202, + "grad_norm": 1.6689815521240234, + "learning_rate": 3.5090696464225587e-06, + "loss": 0.9716, + "step": 2433 + }, + { + "epoch": 2.266294227188082, + "grad_norm": 1.5765066146850586, + "learning_rate": 3.507936831948158e-06, + "loss": 0.8982, + "step": 2434 + }, + { + "epoch": 2.2672253258845436, + "grad_norm": 1.6396182775497437, + "learning_rate": 3.5068037702975266e-06, + "loss": 0.9696, + "step": 2435 + }, + { + "epoch": 2.2681564245810057, + "grad_norm": 1.5978269577026367, + "learning_rate": 3.5056704617485273e-06, + "loss": 0.9596, + "step": 2436 + }, + { + "epoch": 2.2690875232774674, + "grad_norm": 1.629413366317749, + "learning_rate": 3.504536906579079e-06, + "loss": 0.968, + "step": 2437 + }, + { + "epoch": 2.270018621973929, + "grad_norm": 1.5942696332931519, + "learning_rate": 3.503403105067165e-06, + "loss": 0.9502, + "step": 2438 + }, + { + "epoch": 2.2709497206703912, + "grad_norm": 1.6827514171600342, + "learning_rate": 3.502269057490827e-06, + "loss": 0.9928, + "step": 2439 + }, + { + "epoch": 2.271880819366853, + "grad_norm": 1.6567050218582153, + "learning_rate": 3.501134764128167e-06, + "loss": 0.9822, + "step": 2440 + }, + { + "epoch": 2.2728119180633146, + "grad_norm": 1.6781362295150757, + "learning_rate": 3.5000002252573466e-06, + "loss": 0.9575, + "step": 2441 + }, + { + "epoch": 2.2737430167597763, + "grad_norm": 1.5720270872116089, + "learning_rate": 3.498865441156591e-06, + "loss": 0.9085, + "step": 2442 + }, + { + "epoch": 2.2746741154562384, + "grad_norm": 1.6472513675689697, + "learning_rate": 3.497730412104181e-06, + "loss": 0.969, + "step": 2443 + }, + { + "epoch": 2.2756052141527, + "grad_norm": 1.5873229503631592, + "learning_rate": 3.4965951383784603e-06, + "loss": 0.9627, + "step": 2444 + }, + { + "epoch": 2.276536312849162, + "grad_norm": 1.5857559442520142, + "learning_rate": 3.495459620257833e-06, + "loss": 0.9274, + "step": 2445 + }, + { + "epoch": 2.277467411545624, + "grad_norm": 1.5823582410812378, + "learning_rate": 3.4943238580207604e-06, + "loss": 0.9593, + "step": 2446 + }, + { + "epoch": 2.2783985102420856, + "grad_norm": 1.5935879945755005, + "learning_rate": 3.4931878519457664e-06, + "loss": 0.99, + "step": 2447 + }, + { + "epoch": 2.2793296089385473, + "grad_norm": 1.6486268043518066, + "learning_rate": 3.4920516023114337e-06, + "loss": 0.9198, + "step": 2448 + }, + { + "epoch": 2.2802607076350094, + "grad_norm": 1.6650099754333496, + "learning_rate": 3.4909151093964046e-06, + "loss": 0.9869, + "step": 2449 + }, + { + "epoch": 2.281191806331471, + "grad_norm": 1.77230703830719, + "learning_rate": 3.4897783734793794e-06, + "loss": 1.0144, + "step": 2450 + }, + { + "epoch": 2.282122905027933, + "grad_norm": 1.5813177824020386, + "learning_rate": 3.488641394839123e-06, + "loss": 0.9611, + "step": 2451 + }, + { + "epoch": 2.283054003724395, + "grad_norm": 1.6739068031311035, + "learning_rate": 3.487504173754453e-06, + "loss": 0.9774, + "step": 2452 + }, + { + "epoch": 2.2839851024208566, + "grad_norm": 1.6267461776733398, + "learning_rate": 3.4863667105042526e-06, + "loss": 0.9745, + "step": 2453 + }, + { + "epoch": 2.2849162011173183, + "grad_norm": 1.5787975788116455, + "learning_rate": 3.485229005367461e-06, + "loss": 0.8997, + "step": 2454 + }, + { + "epoch": 2.2858472998137804, + "grad_norm": 1.6535475254058838, + "learning_rate": 3.4840910586230768e-06, + "loss": 0.9749, + "step": 2455 + }, + { + "epoch": 2.286778398510242, + "grad_norm": 1.5885213613510132, + "learning_rate": 3.4829528705501605e-06, + "loss": 0.9455, + "step": 2456 + }, + { + "epoch": 2.287709497206704, + "grad_norm": 1.5992566347122192, + "learning_rate": 3.4818144414278266e-06, + "loss": 0.9605, + "step": 2457 + }, + { + "epoch": 2.288640595903166, + "grad_norm": 1.6213765144348145, + "learning_rate": 3.4806757715352552e-06, + "loss": 0.9389, + "step": 2458 + }, + { + "epoch": 2.2895716945996276, + "grad_norm": 1.6163054704666138, + "learning_rate": 3.4795368611516795e-06, + "loss": 0.9622, + "step": 2459 + }, + { + "epoch": 2.2905027932960893, + "grad_norm": 1.6298826932907104, + "learning_rate": 3.4783977105563973e-06, + "loss": 1.002, + "step": 2460 + }, + { + "epoch": 2.2914338919925514, + "grad_norm": 1.593178629875183, + "learning_rate": 3.477258320028759e-06, + "loss": 0.9383, + "step": 2461 + }, + { + "epoch": 2.292364990689013, + "grad_norm": 1.6418095827102661, + "learning_rate": 3.47611868984818e-06, + "loss": 0.9743, + "step": 2462 + }, + { + "epoch": 2.293296089385475, + "grad_norm": 1.6952311992645264, + "learning_rate": 3.4749788202941297e-06, + "loss": 0.9733, + "step": 2463 + }, + { + "epoch": 2.294227188081937, + "grad_norm": 1.6906559467315674, + "learning_rate": 3.473838711646139e-06, + "loss": 0.956, + "step": 2464 + }, + { + "epoch": 2.2951582867783986, + "grad_norm": 1.6767266988754272, + "learning_rate": 3.472698364183798e-06, + "loss": 0.9933, + "step": 2465 + }, + { + "epoch": 2.2960893854748603, + "grad_norm": 1.6053396463394165, + "learning_rate": 3.4715577781867516e-06, + "loss": 0.926, + "step": 2466 + }, + { + "epoch": 2.297020484171322, + "grad_norm": 1.6009371280670166, + "learning_rate": 3.4704169539347066e-06, + "loss": 0.9401, + "step": 2467 + }, + { + "epoch": 2.297951582867784, + "grad_norm": 1.6782152652740479, + "learning_rate": 3.469275891707428e-06, + "loss": 0.9905, + "step": 2468 + }, + { + "epoch": 2.298882681564246, + "grad_norm": 1.7365431785583496, + "learning_rate": 3.4681345917847363e-06, + "loss": 0.9765, + "step": 2469 + }, + { + "epoch": 2.2998137802607075, + "grad_norm": 1.6472665071487427, + "learning_rate": 3.4669930544465147e-06, + "loss": 0.9605, + "step": 2470 + }, + { + "epoch": 2.3007448789571696, + "grad_norm": 1.6339211463928223, + "learning_rate": 3.4658512799727006e-06, + "loss": 0.9462, + "step": 2471 + }, + { + "epoch": 2.3016759776536313, + "grad_norm": 1.6361422538757324, + "learning_rate": 3.4647092686432917e-06, + "loss": 0.96, + "step": 2472 + }, + { + "epoch": 2.302607076350093, + "grad_norm": 1.7519464492797852, + "learning_rate": 3.4635670207383438e-06, + "loss": 0.9925, + "step": 2473 + }, + { + "epoch": 2.303538175046555, + "grad_norm": 1.6213538646697998, + "learning_rate": 3.4624245365379694e-06, + "loss": 0.9396, + "step": 2474 + }, + { + "epoch": 2.304469273743017, + "grad_norm": 1.6561614274978638, + "learning_rate": 3.46128181632234e-06, + "loss": 1.0284, + "step": 2475 + }, + { + "epoch": 2.3054003724394785, + "grad_norm": 1.6527241468429565, + "learning_rate": 3.4601388603716853e-06, + "loss": 0.9487, + "step": 2476 + }, + { + "epoch": 2.30633147113594, + "grad_norm": 1.5925219058990479, + "learning_rate": 3.458995668966292e-06, + "loss": 0.9012, + "step": 2477 + }, + { + "epoch": 2.3072625698324023, + "grad_norm": 1.712669849395752, + "learning_rate": 3.4578522423865042e-06, + "loss": 1.0153, + "step": 2478 + }, + { + "epoch": 2.308193668528864, + "grad_norm": 1.626772403717041, + "learning_rate": 3.4567085809127247e-06, + "loss": 0.9233, + "step": 2479 + }, + { + "epoch": 2.3091247672253257, + "grad_norm": 1.6258918046951294, + "learning_rate": 3.455564684825414e-06, + "loss": 0.961, + "step": 2480 + }, + { + "epoch": 2.310055865921788, + "grad_norm": 1.6405956745147705, + "learning_rate": 3.4544205544050886e-06, + "loss": 0.977, + "step": 2481 + }, + { + "epoch": 2.3109869646182495, + "grad_norm": 1.6471235752105713, + "learning_rate": 3.453276189932324e-06, + "loss": 0.9703, + "step": 2482 + }, + { + "epoch": 2.311918063314711, + "grad_norm": 1.6240640878677368, + "learning_rate": 3.452131591687753e-06, + "loss": 0.9616, + "step": 2483 + }, + { + "epoch": 2.3128491620111733, + "grad_norm": 1.6593929529190063, + "learning_rate": 3.450986759952064e-06, + "loss": 1.0255, + "step": 2484 + }, + { + "epoch": 2.313780260707635, + "grad_norm": 1.6254647970199585, + "learning_rate": 3.4498416950060056e-06, + "loss": 0.9551, + "step": 2485 + }, + { + "epoch": 2.3147113594040967, + "grad_norm": 1.606094479560852, + "learning_rate": 3.4486963971303805e-06, + "loss": 0.9408, + "step": 2486 + }, + { + "epoch": 2.315642458100559, + "grad_norm": 1.569433569908142, + "learning_rate": 3.447550866606051e-06, + "loss": 0.9511, + "step": 2487 + }, + { + "epoch": 2.3165735567970205, + "grad_norm": 1.6102689504623413, + "learning_rate": 3.4464051037139346e-06, + "loss": 0.9534, + "step": 2488 + }, + { + "epoch": 2.317504655493482, + "grad_norm": 1.6579538583755493, + "learning_rate": 3.4452591087350067e-06, + "loss": 0.9577, + "step": 2489 + }, + { + "epoch": 2.3184357541899443, + "grad_norm": 1.5535004138946533, + "learning_rate": 3.4441128819503e-06, + "loss": 0.9063, + "step": 2490 + }, + { + "epoch": 2.319366852886406, + "grad_norm": 1.6399790048599243, + "learning_rate": 3.4429664236409034e-06, + "loss": 0.9651, + "step": 2491 + }, + { + "epoch": 2.3202979515828677, + "grad_norm": 1.6075857877731323, + "learning_rate": 3.441819734087963e-06, + "loss": 0.9414, + "step": 2492 + }, + { + "epoch": 2.32122905027933, + "grad_norm": 1.6000339984893799, + "learning_rate": 3.440672813572681e-06, + "loss": 0.9064, + "step": 2493 + }, + { + "epoch": 2.3221601489757915, + "grad_norm": 1.6694415807724, + "learning_rate": 3.439525662376317e-06, + "loss": 0.9531, + "step": 2494 + }, + { + "epoch": 2.323091247672253, + "grad_norm": 1.6475422382354736, + "learning_rate": 3.4383782807801846e-06, + "loss": 0.9548, + "step": 2495 + }, + { + "epoch": 2.3240223463687153, + "grad_norm": 1.7334296703338623, + "learning_rate": 3.437230669065659e-06, + "loss": 0.9952, + "step": 2496 + }, + { + "epoch": 2.324953445065177, + "grad_norm": 1.6021231412887573, + "learning_rate": 3.4360828275141677e-06, + "loss": 0.9259, + "step": 2497 + }, + { + "epoch": 2.3258845437616387, + "grad_norm": 1.5765581130981445, + "learning_rate": 3.4349347564071956e-06, + "loss": 0.9581, + "step": 2498 + }, + { + "epoch": 2.326815642458101, + "grad_norm": 1.6334996223449707, + "learning_rate": 3.433786456026285e-06, + "loss": 0.8949, + "step": 2499 + }, + { + "epoch": 2.3277467411545625, + "grad_norm": 1.5742268562316895, + "learning_rate": 3.432637926653031e-06, + "loss": 0.9344, + "step": 2500 + }, + { + "epoch": 2.328677839851024, + "grad_norm": 1.620252251625061, + "learning_rate": 3.431489168569091e-06, + "loss": 0.9185, + "step": 2501 + }, + { + "epoch": 2.329608938547486, + "grad_norm": 1.6275135278701782, + "learning_rate": 3.430340182056171e-06, + "loss": 0.9599, + "step": 2502 + }, + { + "epoch": 2.330540037243948, + "grad_norm": 1.6273826360702515, + "learning_rate": 3.4291909673960392e-06, + "loss": 0.946, + "step": 2503 + }, + { + "epoch": 2.3314711359404097, + "grad_norm": 1.6474202871322632, + "learning_rate": 3.4280415248705173e-06, + "loss": 0.9829, + "step": 2504 + }, + { + "epoch": 2.3324022346368714, + "grad_norm": 1.5774883031845093, + "learning_rate": 3.4268918547614814e-06, + "loss": 0.9115, + "step": 2505 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 1.6210507154464722, + "learning_rate": 3.425741957350867e-06, + "loss": 0.943, + "step": 2506 + }, + { + "epoch": 2.334264432029795, + "grad_norm": 1.6329035758972168, + "learning_rate": 3.424591832920661e-06, + "loss": 0.9272, + "step": 2507 + }, + { + "epoch": 2.335195530726257, + "grad_norm": 1.5920088291168213, + "learning_rate": 3.423441481752911e-06, + "loss": 0.9525, + "step": 2508 + }, + { + "epoch": 2.336126629422719, + "grad_norm": 1.6059393882751465, + "learning_rate": 3.422290904129715e-06, + "loss": 0.951, + "step": 2509 + }, + { + "epoch": 2.3370577281191807, + "grad_norm": 1.6441189050674438, + "learning_rate": 3.421140100333231e-06, + "loss": 0.9659, + "step": 2510 + }, + { + "epoch": 2.3379888268156424, + "grad_norm": 1.5974291563034058, + "learning_rate": 3.4199890706456697e-06, + "loss": 0.9425, + "step": 2511 + }, + { + "epoch": 2.338919925512104, + "grad_norm": 1.66739821434021, + "learning_rate": 3.4188378153492967e-06, + "loss": 1.025, + "step": 2512 + }, + { + "epoch": 2.339851024208566, + "grad_norm": 1.6239019632339478, + "learning_rate": 3.4176863347264355e-06, + "loss": 0.989, + "step": 2513 + }, + { + "epoch": 2.340782122905028, + "grad_norm": 1.6041226387023926, + "learning_rate": 3.4165346290594642e-06, + "loss": 0.9669, + "step": 2514 + }, + { + "epoch": 2.3417132216014895, + "grad_norm": 1.586902379989624, + "learning_rate": 3.4153826986308143e-06, + "loss": 0.9116, + "step": 2515 + }, + { + "epoch": 2.3426443202979517, + "grad_norm": 1.6596691608428955, + "learning_rate": 3.414230543722973e-06, + "loss": 0.9187, + "step": 2516 + }, + { + "epoch": 2.3435754189944134, + "grad_norm": 1.6154959201812744, + "learning_rate": 3.4130781646184852e-06, + "loss": 0.9626, + "step": 2517 + }, + { + "epoch": 2.344506517690875, + "grad_norm": 1.6038247346878052, + "learning_rate": 3.411925561599947e-06, + "loss": 0.9795, + "step": 2518 + }, + { + "epoch": 2.345437616387337, + "grad_norm": 1.6716678142547607, + "learning_rate": 3.4107727349500114e-06, + "loss": 1.0093, + "step": 2519 + }, + { + "epoch": 2.346368715083799, + "grad_norm": 1.6727886199951172, + "learning_rate": 3.4096196849513863e-06, + "loss": 0.9405, + "step": 2520 + }, + { + "epoch": 2.3472998137802605, + "grad_norm": 1.592605471611023, + "learning_rate": 3.4084664118868336e-06, + "loss": 0.9467, + "step": 2521 + }, + { + "epoch": 2.3482309124767227, + "grad_norm": 1.5989329814910889, + "learning_rate": 3.407312916039171e-06, + "loss": 0.9397, + "step": 2522 + }, + { + "epoch": 2.3491620111731844, + "grad_norm": 1.6715210676193237, + "learning_rate": 3.40615919769127e-06, + "loss": 0.9699, + "step": 2523 + }, + { + "epoch": 2.350093109869646, + "grad_norm": 1.577141284942627, + "learning_rate": 3.405005257126056e-06, + "loss": 0.9236, + "step": 2524 + }, + { + "epoch": 2.351024208566108, + "grad_norm": 1.5585591793060303, + "learning_rate": 3.40385109462651e-06, + "loss": 0.9187, + "step": 2525 + }, + { + "epoch": 2.35195530726257, + "grad_norm": 1.6048600673675537, + "learning_rate": 3.402696710475668e-06, + "loss": 0.9369, + "step": 2526 + }, + { + "epoch": 2.3528864059590315, + "grad_norm": 1.7114615440368652, + "learning_rate": 3.4015421049566185e-06, + "loss": 0.9871, + "step": 2527 + }, + { + "epoch": 2.3538175046554937, + "grad_norm": 1.6561464071273804, + "learning_rate": 3.4003872783525054e-06, + "loss": 0.9469, + "step": 2528 + }, + { + "epoch": 2.3547486033519553, + "grad_norm": 1.5900381803512573, + "learning_rate": 3.3992322309465274e-06, + "loss": 0.912, + "step": 2529 + }, + { + "epoch": 2.355679702048417, + "grad_norm": 1.7404963970184326, + "learning_rate": 3.3980769630219357e-06, + "loss": 0.9895, + "step": 2530 + }, + { + "epoch": 2.356610800744879, + "grad_norm": 1.5682674646377563, + "learning_rate": 3.396921474862037e-06, + "loss": 0.9672, + "step": 2531 + }, + { + "epoch": 2.357541899441341, + "grad_norm": 1.5718039274215698, + "learning_rate": 3.395765766750192e-06, + "loss": 0.9715, + "step": 2532 + }, + { + "epoch": 2.3584729981378025, + "grad_norm": 1.6884418725967407, + "learning_rate": 3.394609838969814e-06, + "loss": 0.939, + "step": 2533 + }, + { + "epoch": 2.3594040968342647, + "grad_norm": 1.6072217226028442, + "learning_rate": 3.3934536918043713e-06, + "loss": 0.963, + "step": 2534 + }, + { + "epoch": 2.3603351955307263, + "grad_norm": 1.6443567276000977, + "learning_rate": 3.3922973255373857e-06, + "loss": 1.0078, + "step": 2535 + }, + { + "epoch": 2.361266294227188, + "grad_norm": 1.6523720026016235, + "learning_rate": 3.3911407404524333e-06, + "loss": 0.9699, + "step": 2536 + }, + { + "epoch": 2.3621973929236497, + "grad_norm": 1.635683536529541, + "learning_rate": 3.389983936833143e-06, + "loss": 0.9842, + "step": 2537 + }, + { + "epoch": 2.363128491620112, + "grad_norm": 1.565341830253601, + "learning_rate": 3.3888269149631977e-06, + "loss": 0.9148, + "step": 2538 + }, + { + "epoch": 2.3640595903165735, + "grad_norm": 1.6393283605575562, + "learning_rate": 3.3876696751263333e-06, + "loss": 0.9789, + "step": 2539 + }, + { + "epoch": 2.364990689013035, + "grad_norm": 1.6058269739151, + "learning_rate": 3.386512217606339e-06, + "loss": 0.9404, + "step": 2540 + }, + { + "epoch": 2.3659217877094973, + "grad_norm": 1.5703598260879517, + "learning_rate": 3.38535454268706e-06, + "loss": 0.9168, + "step": 2541 + }, + { + "epoch": 2.366852886405959, + "grad_norm": 1.671377182006836, + "learning_rate": 3.3841966506523916e-06, + "loss": 0.961, + "step": 2542 + }, + { + "epoch": 2.3677839851024207, + "grad_norm": 1.6794096231460571, + "learning_rate": 3.383038541786282e-06, + "loss": 0.9689, + "step": 2543 + }, + { + "epoch": 2.368715083798883, + "grad_norm": 1.619209885597229, + "learning_rate": 3.3818802163727377e-06, + "loss": 0.9762, + "step": 2544 + }, + { + "epoch": 2.3696461824953445, + "grad_norm": 1.6613311767578125, + "learning_rate": 3.380721674695811e-06, + "loss": 0.9633, + "step": 2545 + }, + { + "epoch": 2.370577281191806, + "grad_norm": 1.6107838153839111, + "learning_rate": 3.379562917039614e-06, + "loss": 0.9326, + "step": 2546 + }, + { + "epoch": 2.3715083798882683, + "grad_norm": 1.6407833099365234, + "learning_rate": 3.3784039436883055e-06, + "loss": 0.9685, + "step": 2547 + }, + { + "epoch": 2.37243947858473, + "grad_norm": 1.6014596223831177, + "learning_rate": 3.377244754926104e-06, + "loss": 0.9425, + "step": 2548 + }, + { + "epoch": 2.3733705772811917, + "grad_norm": 1.6465659141540527, + "learning_rate": 3.376085351037274e-06, + "loss": 0.9691, + "step": 2549 + }, + { + "epoch": 2.3743016759776534, + "grad_norm": 1.6096807718276978, + "learning_rate": 3.374925732306138e-06, + "loss": 0.9214, + "step": 2550 + }, + { + "epoch": 2.3752327746741155, + "grad_norm": 1.6267849206924438, + "learning_rate": 3.3737658990170684e-06, + "loss": 0.9871, + "step": 2551 + }, + { + "epoch": 2.376163873370577, + "grad_norm": 1.6364647150039673, + "learning_rate": 3.3726058514544915e-06, + "loss": 0.9673, + "step": 2552 + }, + { + "epoch": 2.377094972067039, + "grad_norm": 1.588962197303772, + "learning_rate": 3.3714455899028847e-06, + "loss": 0.9555, + "step": 2553 + }, + { + "epoch": 2.378026070763501, + "grad_norm": 1.5935996770858765, + "learning_rate": 3.3702851146467797e-06, + "loss": 0.9042, + "step": 2554 + }, + { + "epoch": 2.3789571694599627, + "grad_norm": 1.5623427629470825, + "learning_rate": 3.369124425970759e-06, + "loss": 0.9053, + "step": 2555 + }, + { + "epoch": 2.3798882681564244, + "grad_norm": 1.6203159093856812, + "learning_rate": 3.3679635241594586e-06, + "loss": 0.9456, + "step": 2556 + }, + { + "epoch": 2.3808193668528865, + "grad_norm": 1.614363670349121, + "learning_rate": 3.3668024094975665e-06, + "loss": 0.948, + "step": 2557 + }, + { + "epoch": 2.381750465549348, + "grad_norm": 1.640191674232483, + "learning_rate": 3.365641082269822e-06, + "loss": 1.0115, + "step": 2558 + }, + { + "epoch": 2.38268156424581, + "grad_norm": 1.643373966217041, + "learning_rate": 3.364479542761018e-06, + "loss": 0.95, + "step": 2559 + }, + { + "epoch": 2.383612662942272, + "grad_norm": 1.6399234533309937, + "learning_rate": 3.3633177912559982e-06, + "loss": 0.9584, + "step": 2560 + }, + { + "epoch": 2.3845437616387337, + "grad_norm": 1.6302703619003296, + "learning_rate": 3.3621558280396594e-06, + "loss": 0.9707, + "step": 2561 + }, + { + "epoch": 2.3854748603351954, + "grad_norm": 1.6424598693847656, + "learning_rate": 3.360993653396949e-06, + "loss": 0.9836, + "step": 2562 + }, + { + "epoch": 2.3864059590316575, + "grad_norm": 1.6595407724380493, + "learning_rate": 3.3598312676128665e-06, + "loss": 0.958, + "step": 2563 + }, + { + "epoch": 2.387337057728119, + "grad_norm": 1.6464163064956665, + "learning_rate": 3.3586686709724647e-06, + "loss": 0.9656, + "step": 2564 + }, + { + "epoch": 2.388268156424581, + "grad_norm": 1.5871891975402832, + "learning_rate": 3.357505863760847e-06, + "loss": 0.926, + "step": 2565 + }, + { + "epoch": 2.389199255121043, + "grad_norm": 1.5294058322906494, + "learning_rate": 3.356342846263168e-06, + "loss": 0.9535, + "step": 2566 + }, + { + "epoch": 2.3901303538175047, + "grad_norm": 1.6083580255508423, + "learning_rate": 3.3551796187646345e-06, + "loss": 0.9587, + "step": 2567 + }, + { + "epoch": 2.3910614525139664, + "grad_norm": 1.5972957611083984, + "learning_rate": 3.3540161815505046e-06, + "loss": 0.9117, + "step": 2568 + }, + { + "epoch": 2.3919925512104285, + "grad_norm": 1.5899473428726196, + "learning_rate": 3.3528525349060873e-06, + "loss": 0.9214, + "step": 2569 + }, + { + "epoch": 2.39292364990689, + "grad_norm": 1.6294020414352417, + "learning_rate": 3.3516886791167446e-06, + "loss": 0.9497, + "step": 2570 + }, + { + "epoch": 2.393854748603352, + "grad_norm": 1.8026212453842163, + "learning_rate": 3.3505246144678884e-06, + "loss": 0.9189, + "step": 2571 + }, + { + "epoch": 2.394785847299814, + "grad_norm": 1.6396327018737793, + "learning_rate": 3.3493603412449815e-06, + "loss": 0.9805, + "step": 2572 + }, + { + "epoch": 2.3957169459962757, + "grad_norm": 1.7179055213928223, + "learning_rate": 3.34819585973354e-06, + "loss": 0.9952, + "step": 2573 + }, + { + "epoch": 2.3966480446927374, + "grad_norm": 1.6350146532058716, + "learning_rate": 3.347031170219127e-06, + "loss": 0.9306, + "step": 2574 + }, + { + "epoch": 2.397579143389199, + "grad_norm": 1.6674777269363403, + "learning_rate": 3.3458662729873614e-06, + "loss": 0.96, + "step": 2575 + }, + { + "epoch": 2.398510242085661, + "grad_norm": 1.5673600435256958, + "learning_rate": 3.3447011683239104e-06, + "loss": 0.9062, + "step": 2576 + }, + { + "epoch": 2.399441340782123, + "grad_norm": 1.6294329166412354, + "learning_rate": 3.343535856514492e-06, + "loss": 0.9479, + "step": 2577 + }, + { + "epoch": 2.4003724394785846, + "grad_norm": 1.7142693996429443, + "learning_rate": 3.342370337844876e-06, + "loss": 0.9603, + "step": 2578 + }, + { + "epoch": 2.4013035381750467, + "grad_norm": 1.614809513092041, + "learning_rate": 3.3412046126008814e-06, + "loss": 0.9635, + "step": 2579 + }, + { + "epoch": 2.4022346368715084, + "grad_norm": 1.6972476243972778, + "learning_rate": 3.34003868106838e-06, + "loss": 0.9607, + "step": 2580 + }, + { + "epoch": 2.40316573556797, + "grad_norm": 1.7378531694412231, + "learning_rate": 3.3388725435332915e-06, + "loss": 0.9986, + "step": 2581 + }, + { + "epoch": 2.404096834264432, + "grad_norm": 1.6341021060943604, + "learning_rate": 3.33770620028159e-06, + "loss": 0.978, + "step": 2582 + }, + { + "epoch": 2.405027932960894, + "grad_norm": 1.6068378686904907, + "learning_rate": 3.3365396515992954e-06, + "loss": 0.9471, + "step": 2583 + }, + { + "epoch": 2.4059590316573556, + "grad_norm": 1.668980360031128, + "learning_rate": 3.335372897772482e-06, + "loss": 1.0031, + "step": 2584 + }, + { + "epoch": 2.4068901303538173, + "grad_norm": 1.6691441535949707, + "learning_rate": 3.334205939087272e-06, + "loss": 0.95, + "step": 2585 + }, + { + "epoch": 2.4078212290502794, + "grad_norm": 1.6827136278152466, + "learning_rate": 3.333038775829839e-06, + "loss": 0.9729, + "step": 2586 + }, + { + "epoch": 2.408752327746741, + "grad_norm": 1.6189643144607544, + "learning_rate": 3.331871408286406e-06, + "loss": 1.009, + "step": 2587 + }, + { + "epoch": 2.4096834264432028, + "grad_norm": 1.6583327054977417, + "learning_rate": 3.330703836743245e-06, + "loss": 1.003, + "step": 2588 + }, + { + "epoch": 2.410614525139665, + "grad_norm": 1.6170976161956787, + "learning_rate": 3.329536061486682e-06, + "loss": 0.9728, + "step": 2589 + }, + { + "epoch": 2.4115456238361266, + "grad_norm": 1.6288248300552368, + "learning_rate": 3.328368082803088e-06, + "loss": 0.9571, + "step": 2590 + }, + { + "epoch": 2.4124767225325883, + "grad_norm": 1.6497873067855835, + "learning_rate": 3.3271999009788886e-06, + "loss": 0.941, + "step": 2591 + }, + { + "epoch": 2.4134078212290504, + "grad_norm": 1.7231147289276123, + "learning_rate": 3.3260315163005552e-06, + "loss": 0.9537, + "step": 2592 + }, + { + "epoch": 2.414338919925512, + "grad_norm": 1.6208795309066772, + "learning_rate": 3.32486292905461e-06, + "loss": 0.9503, + "step": 2593 + }, + { + "epoch": 2.4152700186219738, + "grad_norm": 1.665986180305481, + "learning_rate": 3.3236941395276283e-06, + "loss": 0.9516, + "step": 2594 + }, + { + "epoch": 2.416201117318436, + "grad_norm": 1.6351374387741089, + "learning_rate": 3.3225251480062296e-06, + "loss": 0.9594, + "step": 2595 + }, + { + "epoch": 2.4171322160148976, + "grad_norm": 1.627670407295227, + "learning_rate": 3.3213559547770873e-06, + "loss": 0.9799, + "step": 2596 + }, + { + "epoch": 2.4180633147113593, + "grad_norm": 1.6770120859146118, + "learning_rate": 3.3201865601269206e-06, + "loss": 0.9445, + "step": 2597 + }, + { + "epoch": 2.4189944134078214, + "grad_norm": 1.7052509784698486, + "learning_rate": 3.3190169643425025e-06, + "loss": 1.0043, + "step": 2598 + }, + { + "epoch": 2.419925512104283, + "grad_norm": 1.6855852603912354, + "learning_rate": 3.3178471677106504e-06, + "loss": 0.9495, + "step": 2599 + }, + { + "epoch": 2.4208566108007448, + "grad_norm": 1.743435025215149, + "learning_rate": 3.316677170518235e-06, + "loss": 0.9987, + "step": 2600 + }, + { + "epoch": 2.421787709497207, + "grad_norm": 1.6144195795059204, + "learning_rate": 3.315506973052174e-06, + "loss": 0.9481, + "step": 2601 + }, + { + "epoch": 2.4227188081936686, + "grad_norm": 1.718222975730896, + "learning_rate": 3.3143365755994346e-06, + "loss": 0.9586, + "step": 2602 + }, + { + "epoch": 2.4236499068901303, + "grad_norm": 1.644515037536621, + "learning_rate": 3.3131659784470334e-06, + "loss": 0.9423, + "step": 2603 + }, + { + "epoch": 2.4245810055865924, + "grad_norm": 1.6273270845413208, + "learning_rate": 3.3119951818820357e-06, + "loss": 0.9598, + "step": 2604 + }, + { + "epoch": 2.425512104283054, + "grad_norm": 1.6700594425201416, + "learning_rate": 3.3108241861915565e-06, + "loss": 0.9155, + "step": 2605 + }, + { + "epoch": 2.4264432029795158, + "grad_norm": 1.6600873470306396, + "learning_rate": 3.309652991662758e-06, + "loss": 0.9351, + "step": 2606 + }, + { + "epoch": 2.427374301675978, + "grad_norm": 1.6999610662460327, + "learning_rate": 3.3084815985828524e-06, + "loss": 0.9712, + "step": 2607 + }, + { + "epoch": 2.4283054003724396, + "grad_norm": 1.643938660621643, + "learning_rate": 3.3073100072391e-06, + "loss": 0.9507, + "step": 2608 + }, + { + "epoch": 2.4292364990689013, + "grad_norm": 1.6479452848434448, + "learning_rate": 3.306138217918811e-06, + "loss": 0.9557, + "step": 2609 + }, + { + "epoch": 2.430167597765363, + "grad_norm": 1.6434718370437622, + "learning_rate": 3.304966230909342e-06, + "loss": 0.9475, + "step": 2610 + }, + { + "epoch": 2.431098696461825, + "grad_norm": 1.676967740058899, + "learning_rate": 3.3037940464981005e-06, + "loss": 0.9573, + "step": 2611 + }, + { + "epoch": 2.4320297951582868, + "grad_norm": 1.6404533386230469, + "learning_rate": 3.30262166497254e-06, + "loss": 0.932, + "step": 2612 + }, + { + "epoch": 2.4329608938547485, + "grad_norm": 1.6470792293548584, + "learning_rate": 3.301449086620164e-06, + "loss": 0.9507, + "step": 2613 + }, + { + "epoch": 2.4338919925512106, + "grad_norm": 1.6286709308624268, + "learning_rate": 3.300276311728523e-06, + "loss": 0.986, + "step": 2614 + }, + { + "epoch": 2.4348230912476723, + "grad_norm": 1.6010756492614746, + "learning_rate": 3.299103340585218e-06, + "loss": 0.9355, + "step": 2615 + }, + { + "epoch": 2.435754189944134, + "grad_norm": 1.6273709535598755, + "learning_rate": 3.297930173477895e-06, + "loss": 0.9762, + "step": 2616 + }, + { + "epoch": 2.436685288640596, + "grad_norm": 1.6747825145721436, + "learning_rate": 3.2967568106942504e-06, + "loss": 0.9455, + "step": 2617 + }, + { + "epoch": 2.4376163873370578, + "grad_norm": 1.5681103467941284, + "learning_rate": 3.295583252522028e-06, + "loss": 0.9678, + "step": 2618 + }, + { + "epoch": 2.4385474860335195, + "grad_norm": 1.6230740547180176, + "learning_rate": 3.294409499249019e-06, + "loss": 0.9732, + "step": 2619 + }, + { + "epoch": 2.439478584729981, + "grad_norm": 1.7424641847610474, + "learning_rate": 3.2932355511630627e-06, + "loss": 0.9887, + "step": 2620 + }, + { + "epoch": 2.4404096834264433, + "grad_norm": 1.6812171936035156, + "learning_rate": 3.2920614085520465e-06, + "loss": 0.9808, + "step": 2621 + }, + { + "epoch": 2.441340782122905, + "grad_norm": 1.6341584920883179, + "learning_rate": 3.290887071703905e-06, + "loss": 0.9699, + "step": 2622 + }, + { + "epoch": 2.4422718808193666, + "grad_norm": 1.6232788562774658, + "learning_rate": 3.289712540906621e-06, + "loss": 0.9573, + "step": 2623 + }, + { + "epoch": 2.4432029795158288, + "grad_norm": 1.6105865240097046, + "learning_rate": 3.2885378164482235e-06, + "loss": 0.9222, + "step": 2624 + }, + { + "epoch": 2.4441340782122905, + "grad_norm": 1.6497597694396973, + "learning_rate": 3.287362898616792e-06, + "loss": 0.9804, + "step": 2625 + }, + { + "epoch": 2.445065176908752, + "grad_norm": 1.6729713678359985, + "learning_rate": 3.2861877877004495e-06, + "loss": 0.9674, + "step": 2626 + }, + { + "epoch": 2.4459962756052143, + "grad_norm": 1.700015902519226, + "learning_rate": 3.2850124839873693e-06, + "loss": 0.9498, + "step": 2627 + }, + { + "epoch": 2.446927374301676, + "grad_norm": 1.598156452178955, + "learning_rate": 3.283836987765771e-06, + "loss": 0.959, + "step": 2628 + }, + { + "epoch": 2.4478584729981376, + "grad_norm": 1.6494011878967285, + "learning_rate": 3.2826612993239213e-06, + "loss": 0.986, + "step": 2629 + }, + { + "epoch": 2.4487895716945998, + "grad_norm": 1.6884851455688477, + "learning_rate": 3.2814854189501343e-06, + "loss": 0.9648, + "step": 2630 + }, + { + "epoch": 2.4497206703910615, + "grad_norm": 1.6454182863235474, + "learning_rate": 3.28030934693277e-06, + "loss": 0.9667, + "step": 2631 + }, + { + "epoch": 2.450651769087523, + "grad_norm": 1.6368962526321411, + "learning_rate": 3.2791330835602385e-06, + "loss": 0.9348, + "step": 2632 + }, + { + "epoch": 2.4515828677839853, + "grad_norm": 1.6652709245681763, + "learning_rate": 3.2779566291209918e-06, + "loss": 0.9571, + "step": 2633 + }, + { + "epoch": 2.452513966480447, + "grad_norm": 1.6055703163146973, + "learning_rate": 3.2767799839035347e-06, + "loss": 0.9356, + "step": 2634 + }, + { + "epoch": 2.4534450651769086, + "grad_norm": 1.6307944059371948, + "learning_rate": 3.2756031481964134e-06, + "loss": 0.9324, + "step": 2635 + }, + { + "epoch": 2.4543761638733708, + "grad_norm": 1.5875563621520996, + "learning_rate": 3.274426122288225e-06, + "loss": 0.9381, + "step": 2636 + }, + { + "epoch": 2.4553072625698324, + "grad_norm": 1.6253734827041626, + "learning_rate": 3.2732489064676096e-06, + "loss": 0.9514, + "step": 2637 + }, + { + "epoch": 2.456238361266294, + "grad_norm": 1.6338211297988892, + "learning_rate": 3.2720715010232572e-06, + "loss": 0.9504, + "step": 2638 + }, + { + "epoch": 2.4571694599627563, + "grad_norm": 1.6506370306015015, + "learning_rate": 3.2708939062439027e-06, + "loss": 0.9667, + "step": 2639 + }, + { + "epoch": 2.458100558659218, + "grad_norm": 1.6240171194076538, + "learning_rate": 3.269716122418326e-06, + "loss": 0.9284, + "step": 2640 + }, + { + "epoch": 2.4590316573556796, + "grad_norm": 1.6600265502929688, + "learning_rate": 3.2685381498353574e-06, + "loss": 0.9666, + "step": 2641 + }, + { + "epoch": 2.4599627560521418, + "grad_norm": 1.6203696727752686, + "learning_rate": 3.267359988783869e-06, + "loss": 0.9705, + "step": 2642 + }, + { + "epoch": 2.4608938547486034, + "grad_norm": 1.5917997360229492, + "learning_rate": 3.266181639552781e-06, + "loss": 0.9284, + "step": 2643 + }, + { + "epoch": 2.461824953445065, + "grad_norm": 1.6019238233566284, + "learning_rate": 3.2650031024310607e-06, + "loss": 0.9538, + "step": 2644 + }, + { + "epoch": 2.462756052141527, + "grad_norm": 1.6653268337249756, + "learning_rate": 3.2638243777077204e-06, + "loss": 0.9551, + "step": 2645 + }, + { + "epoch": 2.463687150837989, + "grad_norm": 1.6435784101486206, + "learning_rate": 3.262645465671819e-06, + "loss": 0.9281, + "step": 2646 + }, + { + "epoch": 2.4646182495344506, + "grad_norm": 1.611222743988037, + "learning_rate": 3.26146636661246e-06, + "loss": 0.9646, + "step": 2647 + }, + { + "epoch": 2.4655493482309123, + "grad_norm": 1.6042946577072144, + "learning_rate": 3.2602870808187955e-06, + "loss": 0.9553, + "step": 2648 + }, + { + "epoch": 2.4664804469273744, + "grad_norm": 1.6262257099151611, + "learning_rate": 3.2591076085800193e-06, + "loss": 0.9423, + "step": 2649 + }, + { + "epoch": 2.467411545623836, + "grad_norm": 1.7350205183029175, + "learning_rate": 3.2579279501853746e-06, + "loss": 0.9583, + "step": 2650 + }, + { + "epoch": 2.468342644320298, + "grad_norm": 1.658310055732727, + "learning_rate": 3.256748105924149e-06, + "loss": 0.9655, + "step": 2651 + }, + { + "epoch": 2.46927374301676, + "grad_norm": 1.5956592559814453, + "learning_rate": 3.255568076085675e-06, + "loss": 0.9628, + "step": 2652 + }, + { + "epoch": 2.4702048417132216, + "grad_norm": 1.6241995096206665, + "learning_rate": 3.2543878609593314e-06, + "loss": 0.941, + "step": 2653 + }, + { + "epoch": 2.4711359404096833, + "grad_norm": 1.5923924446105957, + "learning_rate": 3.253207460834542e-06, + "loss": 0.9342, + "step": 2654 + }, + { + "epoch": 2.472067039106145, + "grad_norm": 1.6418542861938477, + "learning_rate": 3.2520268760007768e-06, + "loss": 0.9534, + "step": 2655 + }, + { + "epoch": 2.472998137802607, + "grad_norm": 1.634240746498108, + "learning_rate": 3.25084610674755e-06, + "loss": 0.9608, + "step": 2656 + }, + { + "epoch": 2.473929236499069, + "grad_norm": 1.5932693481445312, + "learning_rate": 3.249665153364421e-06, + "loss": 0.963, + "step": 2657 + }, + { + "epoch": 2.4748603351955305, + "grad_norm": 1.617982029914856, + "learning_rate": 3.248484016140996e-06, + "loss": 0.9391, + "step": 2658 + }, + { + "epoch": 2.4757914338919926, + "grad_norm": 1.6907939910888672, + "learning_rate": 3.2473026953669245e-06, + "loss": 0.9438, + "step": 2659 + }, + { + "epoch": 2.4767225325884543, + "grad_norm": 1.5538761615753174, + "learning_rate": 3.246121191331902e-06, + "loss": 0.941, + "step": 2660 + }, + { + "epoch": 2.477653631284916, + "grad_norm": 1.5928725004196167, + "learning_rate": 3.2449395043256683e-06, + "loss": 0.9695, + "step": 2661 + }, + { + "epoch": 2.478584729981378, + "grad_norm": 1.6139932870864868, + "learning_rate": 3.2437576346380077e-06, + "loss": 0.9563, + "step": 2662 + }, + { + "epoch": 2.47951582867784, + "grad_norm": 1.7027126550674438, + "learning_rate": 3.2425755825587515e-06, + "loss": 0.9372, + "step": 2663 + }, + { + "epoch": 2.4804469273743015, + "grad_norm": 1.6509934663772583, + "learning_rate": 3.2413933483777725e-06, + "loss": 0.9442, + "step": 2664 + }, + { + "epoch": 2.4813780260707636, + "grad_norm": 1.5893349647521973, + "learning_rate": 3.240210932384991e-06, + "loss": 0.9307, + "step": 2665 + }, + { + "epoch": 2.4823091247672253, + "grad_norm": 1.6430909633636475, + "learning_rate": 3.239028334870371e-06, + "loss": 0.9626, + "step": 2666 + }, + { + "epoch": 2.483240223463687, + "grad_norm": 1.6869335174560547, + "learning_rate": 3.23784555612392e-06, + "loss": 0.9607, + "step": 2667 + }, + { + "epoch": 2.484171322160149, + "grad_norm": 1.588243007659912, + "learning_rate": 3.2366625964356906e-06, + "loss": 0.9608, + "step": 2668 + }, + { + "epoch": 2.485102420856611, + "grad_norm": 1.6660819053649902, + "learning_rate": 3.2354794560957793e-06, + "loss": 0.9824, + "step": 2669 + }, + { + "epoch": 2.4860335195530725, + "grad_norm": 1.652782678604126, + "learning_rate": 3.234296135394329e-06, + "loss": 0.9175, + "step": 2670 + }, + { + "epoch": 2.4869646182495346, + "grad_norm": 1.6150346994400024, + "learning_rate": 3.2331126346215247e-06, + "loss": 0.9775, + "step": 2671 + }, + { + "epoch": 2.4878957169459963, + "grad_norm": 1.6447911262512207, + "learning_rate": 3.2319289540675963e-06, + "loss": 0.9344, + "step": 2672 + }, + { + "epoch": 2.488826815642458, + "grad_norm": 1.6614223718643188, + "learning_rate": 3.230745094022818e-06, + "loss": 0.9467, + "step": 2673 + }, + { + "epoch": 2.48975791433892, + "grad_norm": 1.6187084913253784, + "learning_rate": 3.2295610547775054e-06, + "loss": 0.9545, + "step": 2674 + }, + { + "epoch": 2.490689013035382, + "grad_norm": 1.6777342557907104, + "learning_rate": 3.228376836622023e-06, + "loss": 0.985, + "step": 2675 + }, + { + "epoch": 2.4916201117318435, + "grad_norm": 1.6157910823822021, + "learning_rate": 3.2271924398467746e-06, + "loss": 0.9573, + "step": 2676 + }, + { + "epoch": 2.4925512104283056, + "grad_norm": 1.5595381259918213, + "learning_rate": 3.2260078647422116e-06, + "loss": 0.9273, + "step": 2677 + }, + { + "epoch": 2.4934823091247673, + "grad_norm": 1.653050422668457, + "learning_rate": 3.2248231115988253e-06, + "loss": 0.9611, + "step": 2678 + }, + { + "epoch": 2.494413407821229, + "grad_norm": 1.6171876192092896, + "learning_rate": 3.2236381807071543e-06, + "loss": 0.9452, + "step": 2679 + }, + { + "epoch": 2.4953445065176907, + "grad_norm": 1.6414589881896973, + "learning_rate": 3.2224530723577775e-06, + "loss": 0.9668, + "step": 2680 + }, + { + "epoch": 2.496275605214153, + "grad_norm": 1.6396350860595703, + "learning_rate": 3.221267786841319e-06, + "loss": 0.9429, + "step": 2681 + }, + { + "epoch": 2.4972067039106145, + "grad_norm": 1.6193255186080933, + "learning_rate": 3.220082324448448e-06, + "loss": 0.983, + "step": 2682 + }, + { + "epoch": 2.498137802607076, + "grad_norm": 1.6369428634643555, + "learning_rate": 3.2188966854698724e-06, + "loss": 0.965, + "step": 2683 + }, + { + "epoch": 2.4990689013035383, + "grad_norm": 1.6261485815048218, + "learning_rate": 3.2177108701963494e-06, + "loss": 0.9932, + "step": 2684 + }, + { + "epoch": 2.5, + "grad_norm": 1.6400699615478516, + "learning_rate": 3.2165248789186744e-06, + "loss": 0.9876, + "step": 2685 + }, + { + "epoch": 2.5009310986964617, + "grad_norm": 1.5728873014450073, + "learning_rate": 3.2153387119276886e-06, + "loss": 0.9208, + "step": 2686 + }, + { + "epoch": 2.501862197392924, + "grad_norm": 1.5843089818954468, + "learning_rate": 3.214152369514275e-06, + "loss": 0.9705, + "step": 2687 + }, + { + "epoch": 2.5027932960893855, + "grad_norm": 1.6322309970855713, + "learning_rate": 3.2129658519693613e-06, + "loss": 0.9442, + "step": 2688 + }, + { + "epoch": 2.503724394785847, + "grad_norm": 1.5921897888183594, + "learning_rate": 3.211779159583916e-06, + "loss": 0.9573, + "step": 2689 + }, + { + "epoch": 2.504655493482309, + "grad_norm": 1.6905947923660278, + "learning_rate": 3.2105922926489507e-06, + "loss": 0.985, + "step": 2690 + }, + { + "epoch": 2.505586592178771, + "grad_norm": 1.6740881204605103, + "learning_rate": 3.209405251455524e-06, + "loss": 0.9897, + "step": 2691 + }, + { + "epoch": 2.5065176908752327, + "grad_norm": 1.6153020858764648, + "learning_rate": 3.20821803629473e-06, + "loss": 0.9173, + "step": 2692 + }, + { + "epoch": 2.5074487895716944, + "grad_norm": 1.6218596696853638, + "learning_rate": 3.2070306474577123e-06, + "loss": 0.9639, + "step": 2693 + }, + { + "epoch": 2.5083798882681565, + "grad_norm": 1.6769354343414307, + "learning_rate": 3.205843085235652e-06, + "loss": 0.9687, + "step": 2694 + }, + { + "epoch": 2.509310986964618, + "grad_norm": 1.6009342670440674, + "learning_rate": 3.204655349919776e-06, + "loss": 0.9274, + "step": 2695 + }, + { + "epoch": 2.51024208566108, + "grad_norm": 1.6258291006088257, + "learning_rate": 3.2034674418013523e-06, + "loss": 0.9475, + "step": 2696 + }, + { + "epoch": 2.511173184357542, + "grad_norm": 1.649918794631958, + "learning_rate": 3.202279361171691e-06, + "loss": 0.9828, + "step": 2697 + }, + { + "epoch": 2.5121042830540037, + "grad_norm": 1.6119354963302612, + "learning_rate": 3.2010911083221453e-06, + "loss": 0.953, + "step": 2698 + }, + { + "epoch": 2.5130353817504654, + "grad_norm": 1.6593668460845947, + "learning_rate": 3.1999026835441104e-06, + "loss": 0.9554, + "step": 2699 + }, + { + "epoch": 2.5139664804469275, + "grad_norm": 1.7150312662124634, + "learning_rate": 3.198714087129024e-06, + "loss": 1.0175, + "step": 2700 + }, + { + "epoch": 2.514897579143389, + "grad_norm": 1.5864650011062622, + "learning_rate": 3.197525319368365e-06, + "loss": 0.9561, + "step": 2701 + }, + { + "epoch": 2.515828677839851, + "grad_norm": 1.6027896404266357, + "learning_rate": 3.1963363805536542e-06, + "loss": 0.9817, + "step": 2702 + }, + { + "epoch": 2.516759776536313, + "grad_norm": 1.668905258178711, + "learning_rate": 3.195147270976455e-06, + "loss": 0.9692, + "step": 2703 + }, + { + "epoch": 2.5176908752327747, + "grad_norm": 1.641294002532959, + "learning_rate": 3.193957990928374e-06, + "loss": 0.9391, + "step": 2704 + }, + { + "epoch": 2.5186219739292364, + "grad_norm": 1.6915578842163086, + "learning_rate": 3.1927685407010574e-06, + "loss": 0.9842, + "step": 2705 + }, + { + "epoch": 2.5195530726256985, + "grad_norm": 1.6631584167480469, + "learning_rate": 3.191578920586193e-06, + "loss": 1.0038, + "step": 2706 + }, + { + "epoch": 2.52048417132216, + "grad_norm": 1.6747456789016724, + "learning_rate": 3.1903891308755125e-06, + "loss": 0.8991, + "step": 2707 + }, + { + "epoch": 2.521415270018622, + "grad_norm": 1.651414155960083, + "learning_rate": 3.1891991718607874e-06, + "loss": 0.9691, + "step": 2708 + }, + { + "epoch": 2.522346368715084, + "grad_norm": 1.6247169971466064, + "learning_rate": 3.1880090438338308e-06, + "loss": 0.9746, + "step": 2709 + }, + { + "epoch": 2.5232774674115457, + "grad_norm": 1.7061047554016113, + "learning_rate": 3.1868187470864986e-06, + "loss": 0.9578, + "step": 2710 + }, + { + "epoch": 2.5242085661080074, + "grad_norm": 1.6343475580215454, + "learning_rate": 3.1856282819106867e-06, + "loss": 0.9196, + "step": 2711 + }, + { + "epoch": 2.5251396648044695, + "grad_norm": 1.637317419052124, + "learning_rate": 3.184437648598332e-06, + "loss": 0.9537, + "step": 2712 + }, + { + "epoch": 2.526070763500931, + "grad_norm": 1.6752567291259766, + "learning_rate": 3.1832468474414148e-06, + "loss": 0.9653, + "step": 2713 + }, + { + "epoch": 2.527001862197393, + "grad_norm": 1.614477276802063, + "learning_rate": 3.1820558787319528e-06, + "loss": 0.9844, + "step": 2714 + }, + { + "epoch": 2.527932960893855, + "grad_norm": 1.6819109916687012, + "learning_rate": 3.18086474276201e-06, + "loss": 1.0123, + "step": 2715 + }, + { + "epoch": 2.5288640595903167, + "grad_norm": 1.6218712329864502, + "learning_rate": 3.1796734398236863e-06, + "loss": 0.9654, + "step": 2716 + }, + { + "epoch": 2.5297951582867784, + "grad_norm": 1.6440956592559814, + "learning_rate": 3.1784819702091263e-06, + "loss": 0.9451, + "step": 2717 + }, + { + "epoch": 2.5307262569832405, + "grad_norm": 1.6846622228622437, + "learning_rate": 3.1772903342105135e-06, + "loss": 0.9828, + "step": 2718 + }, + { + "epoch": 2.531657355679702, + "grad_norm": 1.7056374549865723, + "learning_rate": 3.176098532120071e-06, + "loss": 0.963, + "step": 2719 + }, + { + "epoch": 2.532588454376164, + "grad_norm": 1.6760709285736084, + "learning_rate": 3.1749065642300677e-06, + "loss": 0.954, + "step": 2720 + }, + { + "epoch": 2.5335195530726256, + "grad_norm": 1.6642045974731445, + "learning_rate": 3.173714430832806e-06, + "loss": 0.9972, + "step": 2721 + }, + { + "epoch": 2.5344506517690877, + "grad_norm": 1.6921284198760986, + "learning_rate": 3.1725221322206355e-06, + "loss": 0.9384, + "step": 2722 + }, + { + "epoch": 2.5353817504655494, + "grad_norm": 1.6488786935806274, + "learning_rate": 3.171329668685942e-06, + "loss": 0.9436, + "step": 2723 + }, + { + "epoch": 2.536312849162011, + "grad_norm": 1.70298433303833, + "learning_rate": 3.1701370405211536e-06, + "loss": 0.9577, + "step": 2724 + }, + { + "epoch": 2.5372439478584727, + "grad_norm": 1.6803982257843018, + "learning_rate": 3.1689442480187388e-06, + "loss": 0.9582, + "step": 2725 + }, + { + "epoch": 2.538175046554935, + "grad_norm": 1.6304399967193604, + "learning_rate": 3.1677512914712044e-06, + "loss": 0.948, + "step": 2726 + }, + { + "epoch": 2.5391061452513966, + "grad_norm": 1.6413767337799072, + "learning_rate": 3.1665581711711014e-06, + "loss": 0.9774, + "step": 2727 + }, + { + "epoch": 2.5400372439478582, + "grad_norm": 1.6617759466171265, + "learning_rate": 3.165364887411016e-06, + "loss": 0.926, + "step": 2728 + }, + { + "epoch": 2.5409683426443204, + "grad_norm": 1.6656525135040283, + "learning_rate": 3.164171440483579e-06, + "loss": 0.9859, + "step": 2729 + }, + { + "epoch": 2.541899441340782, + "grad_norm": 1.6810792684555054, + "learning_rate": 3.1629778306814586e-06, + "loss": 0.9433, + "step": 2730 + }, + { + "epoch": 2.5428305400372437, + "grad_norm": 1.6987841129302979, + "learning_rate": 3.161784058297363e-06, + "loss": 0.9508, + "step": 2731 + }, + { + "epoch": 2.543761638733706, + "grad_norm": 1.5758877992630005, + "learning_rate": 3.160590123624041e-06, + "loss": 0.9221, + "step": 2732 + }, + { + "epoch": 2.5446927374301676, + "grad_norm": 1.6114469766616821, + "learning_rate": 3.1593960269542817e-06, + "loss": 0.9251, + "step": 2733 + }, + { + "epoch": 2.5456238361266292, + "grad_norm": 1.7200394868850708, + "learning_rate": 3.1582017685809136e-06, + "loss": 0.9773, + "step": 2734 + }, + { + "epoch": 2.5465549348230914, + "grad_norm": 1.576918601989746, + "learning_rate": 3.1570073487968035e-06, + "loss": 0.9137, + "step": 2735 + }, + { + "epoch": 2.547486033519553, + "grad_norm": 1.5539870262145996, + "learning_rate": 3.155812767894859e-06, + "loss": 0.9061, + "step": 2736 + }, + { + "epoch": 2.5484171322160147, + "grad_norm": 1.6504672765731812, + "learning_rate": 3.1546180261680283e-06, + "loss": 0.941, + "step": 2737 + }, + { + "epoch": 2.549348230912477, + "grad_norm": 1.614476203918457, + "learning_rate": 3.1534231239092957e-06, + "loss": 0.896, + "step": 2738 + }, + { + "epoch": 2.5502793296089385, + "grad_norm": 1.616269826889038, + "learning_rate": 3.1522280614116886e-06, + "loss": 0.945, + "step": 2739 + }, + { + "epoch": 2.5512104283054002, + "grad_norm": 1.6424388885498047, + "learning_rate": 3.1510328389682708e-06, + "loss": 0.95, + "step": 2740 + }, + { + "epoch": 2.5521415270018624, + "grad_norm": 1.5913423299789429, + "learning_rate": 3.1498374568721473e-06, + "loss": 0.9496, + "step": 2741 + }, + { + "epoch": 2.553072625698324, + "grad_norm": 1.5946433544158936, + "learning_rate": 3.1486419154164615e-06, + "loss": 0.9832, + "step": 2742 + }, + { + "epoch": 2.5540037243947857, + "grad_norm": 1.606937289237976, + "learning_rate": 3.1474462148943963e-06, + "loss": 0.981, + "step": 2743 + }, + { + "epoch": 2.554934823091248, + "grad_norm": 1.6524548530578613, + "learning_rate": 3.146250355599172e-06, + "loss": 1.0038, + "step": 2744 + }, + { + "epoch": 2.5558659217877095, + "grad_norm": 1.6219244003295898, + "learning_rate": 3.14505433782405e-06, + "loss": 0.9715, + "step": 2745 + }, + { + "epoch": 2.5567970204841712, + "grad_norm": 1.663824200630188, + "learning_rate": 3.1438581618623293e-06, + "loss": 0.9773, + "step": 2746 + }, + { + "epoch": 2.5577281191806334, + "grad_norm": 1.6206378936767578, + "learning_rate": 3.1426618280073485e-06, + "loss": 0.951, + "step": 2747 + }, + { + "epoch": 2.558659217877095, + "grad_norm": 1.6190366744995117, + "learning_rate": 3.1414653365524827e-06, + "loss": 0.9686, + "step": 2748 + }, + { + "epoch": 2.5595903165735567, + "grad_norm": 1.6203886270523071, + "learning_rate": 3.1402686877911494e-06, + "loss": 0.9125, + "step": 2749 + }, + { + "epoch": 2.560521415270019, + "grad_norm": 1.583175539970398, + "learning_rate": 3.139071882016802e-06, + "loss": 0.9413, + "step": 2750 + }, + { + "epoch": 2.5614525139664805, + "grad_norm": 1.6100095510482788, + "learning_rate": 3.1378749195229325e-06, + "loss": 0.9386, + "step": 2751 + }, + { + "epoch": 2.5623836126629422, + "grad_norm": 1.6055479049682617, + "learning_rate": 3.1366778006030717e-06, + "loss": 0.9745, + "step": 2752 + }, + { + "epoch": 2.5633147113594044, + "grad_norm": 1.6939977407455444, + "learning_rate": 3.1354805255507902e-06, + "loss": 0.9899, + "step": 2753 + }, + { + "epoch": 2.564245810055866, + "grad_norm": 1.599851131439209, + "learning_rate": 3.134283094659695e-06, + "loss": 0.9089, + "step": 2754 + }, + { + "epoch": 2.5651769087523277, + "grad_norm": 1.6958569288253784, + "learning_rate": 3.1330855082234313e-06, + "loss": 0.9982, + "step": 2755 + }, + { + "epoch": 2.5661080074487894, + "grad_norm": 1.58479642868042, + "learning_rate": 3.131887766535684e-06, + "loss": 0.9103, + "step": 2756 + }, + { + "epoch": 2.5670391061452515, + "grad_norm": 1.6718580722808838, + "learning_rate": 3.1306898698901744e-06, + "loss": 0.9578, + "step": 2757 + }, + { + "epoch": 2.5679702048417132, + "grad_norm": 1.6546039581298828, + "learning_rate": 3.1294918185806627e-06, + "loss": 0.9321, + "step": 2758 + }, + { + "epoch": 2.568901303538175, + "grad_norm": 1.6734904050827026, + "learning_rate": 3.1282936129009473e-06, + "loss": 0.9425, + "step": 2759 + }, + { + "epoch": 2.5698324022346366, + "grad_norm": 1.6155725717544556, + "learning_rate": 3.127095253144864e-06, + "loss": 0.9266, + "step": 2760 + }, + { + "epoch": 2.5707635009310987, + "grad_norm": 1.6590644121170044, + "learning_rate": 3.125896739606286e-06, + "loss": 0.942, + "step": 2761 + }, + { + "epoch": 2.5716945996275604, + "grad_norm": 1.6101138591766357, + "learning_rate": 3.124698072579125e-06, + "loss": 0.9837, + "step": 2762 + }, + { + "epoch": 2.572625698324022, + "grad_norm": 1.6097989082336426, + "learning_rate": 3.12349925235733e-06, + "loss": 0.9496, + "step": 2763 + }, + { + "epoch": 2.5735567970204842, + "grad_norm": 1.6388952732086182, + "learning_rate": 3.122300279234886e-06, + "loss": 0.9519, + "step": 2764 + }, + { + "epoch": 2.574487895716946, + "grad_norm": 1.5914781093597412, + "learning_rate": 3.12110115350582e-06, + "loss": 0.93, + "step": 2765 + }, + { + "epoch": 2.5754189944134076, + "grad_norm": 1.5782808065414429, + "learning_rate": 3.1199018754641907e-06, + "loss": 0.9343, + "step": 2766 + }, + { + "epoch": 2.5763500931098697, + "grad_norm": 1.5984432697296143, + "learning_rate": 3.1187024454040993e-06, + "loss": 0.9514, + "step": 2767 + }, + { + "epoch": 2.5772811918063314, + "grad_norm": 1.5987129211425781, + "learning_rate": 3.11750286361968e-06, + "loss": 0.921, + "step": 2768 + }, + { + "epoch": 2.578212290502793, + "grad_norm": 1.5802693367004395, + "learning_rate": 3.1163031304051065e-06, + "loss": 0.9252, + "step": 2769 + }, + { + "epoch": 2.5791433891992552, + "grad_norm": 1.6504684686660767, + "learning_rate": 3.1151032460545906e-06, + "loss": 0.9065, + "step": 2770 + }, + { + "epoch": 2.580074487895717, + "grad_norm": 1.6109102964401245, + "learning_rate": 3.1139032108623773e-06, + "loss": 0.9553, + "step": 2771 + }, + { + "epoch": 2.5810055865921786, + "grad_norm": 1.6237130165100098, + "learning_rate": 3.1127030251227534e-06, + "loss": 0.9649, + "step": 2772 + }, + { + "epoch": 2.5819366852886407, + "grad_norm": 1.5690436363220215, + "learning_rate": 3.111502689130039e-06, + "loss": 0.9843, + "step": 2773 + }, + { + "epoch": 2.5828677839851024, + "grad_norm": 1.632887840270996, + "learning_rate": 3.110302203178593e-06, + "loss": 0.9335, + "step": 2774 + }, + { + "epoch": 2.583798882681564, + "grad_norm": 1.753038763999939, + "learning_rate": 3.10910156756281e-06, + "loss": 0.98, + "step": 2775 + }, + { + "epoch": 2.5847299813780262, + "grad_norm": 1.584394931793213, + "learning_rate": 3.1079007825771217e-06, + "loss": 0.9226, + "step": 2776 + }, + { + "epoch": 2.585661080074488, + "grad_norm": 1.5974069833755493, + "learning_rate": 3.1066998485159965e-06, + "loss": 0.9383, + "step": 2777 + }, + { + "epoch": 2.5865921787709496, + "grad_norm": 1.6693968772888184, + "learning_rate": 3.1054987656739395e-06, + "loss": 0.9654, + "step": 2778 + }, + { + "epoch": 2.5875232774674117, + "grad_norm": 1.673758864402771, + "learning_rate": 3.1042975343454927e-06, + "loss": 0.9673, + "step": 2779 + }, + { + "epoch": 2.5884543761638734, + "grad_norm": 1.6719874143600464, + "learning_rate": 3.103096154825233e-06, + "loss": 0.9402, + "step": 2780 + }, + { + "epoch": 2.589385474860335, + "grad_norm": 1.63694167137146, + "learning_rate": 3.1018946274077748e-06, + "loss": 0.9231, + "step": 2781 + }, + { + "epoch": 2.5903165735567972, + "grad_norm": 1.603509783744812, + "learning_rate": 3.100692952387769e-06, + "loss": 0.9154, + "step": 2782 + }, + { + "epoch": 2.591247672253259, + "grad_norm": 1.6750872135162354, + "learning_rate": 3.0994911300599013e-06, + "loss": 0.9394, + "step": 2783 + }, + { + "epoch": 2.5921787709497206, + "grad_norm": 1.6520224809646606, + "learning_rate": 3.0982891607188948e-06, + "loss": 0.9231, + "step": 2784 + }, + { + "epoch": 2.5931098696461827, + "grad_norm": 1.6294194459915161, + "learning_rate": 3.0970870446595087e-06, + "loss": 0.9222, + "step": 2785 + }, + { + "epoch": 2.5940409683426444, + "grad_norm": 1.70402193069458, + "learning_rate": 3.0958847821765377e-06, + "loss": 0.9699, + "step": 2786 + }, + { + "epoch": 2.594972067039106, + "grad_norm": 1.700933575630188, + "learning_rate": 3.094682373564812e-06, + "loss": 0.9437, + "step": 2787 + }, + { + "epoch": 2.5959031657355682, + "grad_norm": 1.5616328716278076, + "learning_rate": 3.0934798191191986e-06, + "loss": 0.9535, + "step": 2788 + }, + { + "epoch": 2.59683426443203, + "grad_norm": 1.6358249187469482, + "learning_rate": 3.092277119134599e-06, + "loss": 0.9425, + "step": 2789 + }, + { + "epoch": 2.5977653631284916, + "grad_norm": 1.6383639574050903, + "learning_rate": 3.091074273905953e-06, + "loss": 0.982, + "step": 2790 + }, + { + "epoch": 2.5986964618249533, + "grad_norm": 1.5911756753921509, + "learning_rate": 3.089871283728232e-06, + "loss": 0.9508, + "step": 2791 + }, + { + "epoch": 2.5996275605214154, + "grad_norm": 1.633705496788025, + "learning_rate": 3.0886681488964466e-06, + "loss": 0.9524, + "step": 2792 + }, + { + "epoch": 2.600558659217877, + "grad_norm": 1.6034173965454102, + "learning_rate": 3.0874648697056403e-06, + "loss": 0.943, + "step": 2793 + }, + { + "epoch": 2.601489757914339, + "grad_norm": 1.6653735637664795, + "learning_rate": 3.0862614464508944e-06, + "loss": 0.9616, + "step": 2794 + }, + { + "epoch": 2.6024208566108005, + "grad_norm": 1.5828442573547363, + "learning_rate": 3.0850578794273236e-06, + "loss": 0.9045, + "step": 2795 + }, + { + "epoch": 2.6033519553072626, + "grad_norm": 1.6605607271194458, + "learning_rate": 3.083854168930078e-06, + "loss": 0.9554, + "step": 2796 + }, + { + "epoch": 2.6042830540037243, + "grad_norm": 1.7061210870742798, + "learning_rate": 3.082650315254344e-06, + "loss": 0.9633, + "step": 2797 + }, + { + "epoch": 2.605214152700186, + "grad_norm": 1.6655867099761963, + "learning_rate": 3.0814463186953424e-06, + "loss": 0.9708, + "step": 2798 + }, + { + "epoch": 2.606145251396648, + "grad_norm": 1.5771557092666626, + "learning_rate": 3.08024217954833e-06, + "loss": 0.9222, + "step": 2799 + }, + { + "epoch": 2.60707635009311, + "grad_norm": 1.6326788663864136, + "learning_rate": 3.0790378981085957e-06, + "loss": 0.9345, + "step": 2800 + }, + { + "epoch": 2.6080074487895715, + "grad_norm": 1.6208207607269287, + "learning_rate": 3.077833474671467e-06, + "loss": 0.9574, + "step": 2801 + }, + { + "epoch": 2.6089385474860336, + "grad_norm": 1.6612160205841064, + "learning_rate": 3.076628909532303e-06, + "loss": 0.9509, + "step": 2802 + }, + { + "epoch": 2.6098696461824953, + "grad_norm": 1.69960618019104, + "learning_rate": 3.0754242029865005e-06, + "loss": 0.9156, + "step": 2803 + }, + { + "epoch": 2.610800744878957, + "grad_norm": 1.7071855068206787, + "learning_rate": 3.0742193553294896e-06, + "loss": 0.9748, + "step": 2804 + }, + { + "epoch": 2.611731843575419, + "grad_norm": 1.6477720737457275, + "learning_rate": 3.073014366856733e-06, + "loss": 0.9873, + "step": 2805 + }, + { + "epoch": 2.612662942271881, + "grad_norm": 1.680709719657898, + "learning_rate": 3.0718092378637325e-06, + "loss": 0.9412, + "step": 2806 + }, + { + "epoch": 2.6135940409683425, + "grad_norm": 1.6208752393722534, + "learning_rate": 3.07060396864602e-06, + "loss": 0.9422, + "step": 2807 + }, + { + "epoch": 2.6145251396648046, + "grad_norm": 1.6615691184997559, + "learning_rate": 3.0693985594991643e-06, + "loss": 0.9973, + "step": 2808 + }, + { + "epoch": 2.6154562383612663, + "grad_norm": 1.6101716756820679, + "learning_rate": 3.0681930107187667e-06, + "loss": 0.9604, + "step": 2809 + }, + { + "epoch": 2.616387337057728, + "grad_norm": 1.5833975076675415, + "learning_rate": 3.0669873226004655e-06, + "loss": 0.8983, + "step": 2810 + }, + { + "epoch": 2.61731843575419, + "grad_norm": 1.6398487091064453, + "learning_rate": 3.06578149543993e-06, + "loss": 0.9398, + "step": 2811 + }, + { + "epoch": 2.618249534450652, + "grad_norm": 1.7333900928497314, + "learning_rate": 3.064575529532865e-06, + "loss": 0.9354, + "step": 2812 + }, + { + "epoch": 2.6191806331471135, + "grad_norm": 1.6353338956832886, + "learning_rate": 3.063369425175011e-06, + "loss": 0.9272, + "step": 2813 + }, + { + "epoch": 2.6201117318435756, + "grad_norm": 1.5639485120773315, + "learning_rate": 3.062163182662139e-06, + "loss": 0.9171, + "step": 2814 + }, + { + "epoch": 2.6210428305400373, + "grad_norm": 1.578440546989441, + "learning_rate": 3.060956802290057e-06, + "loss": 0.9036, + "step": 2815 + }, + { + "epoch": 2.621973929236499, + "grad_norm": 1.6571640968322754, + "learning_rate": 3.0597502843546044e-06, + "loss": 0.9671, + "step": 2816 + }, + { + "epoch": 2.622905027932961, + "grad_norm": 1.6397638320922852, + "learning_rate": 3.058543629151657e-06, + "loss": 0.982, + "step": 2817 + }, + { + "epoch": 2.623836126629423, + "grad_norm": 1.668396234512329, + "learning_rate": 3.0573368369771204e-06, + "loss": 0.9567, + "step": 2818 + }, + { + "epoch": 2.6247672253258845, + "grad_norm": 1.600182294845581, + "learning_rate": 3.056129908126938e-06, + "loss": 0.9538, + "step": 2819 + }, + { + "epoch": 2.6256983240223466, + "grad_norm": 1.6866743564605713, + "learning_rate": 3.0549228428970844e-06, + "loss": 0.9791, + "step": 2820 + }, + { + "epoch": 2.6266294227188083, + "grad_norm": 1.5799665451049805, + "learning_rate": 3.053715641583567e-06, + "loss": 0.9613, + "step": 2821 + }, + { + "epoch": 2.62756052141527, + "grad_norm": 1.5834141969680786, + "learning_rate": 3.0525083044824306e-06, + "loss": 0.9783, + "step": 2822 + }, + { + "epoch": 2.628491620111732, + "grad_norm": 1.6275930404663086, + "learning_rate": 3.0513008318897468e-06, + "loss": 1.0146, + "step": 2823 + }, + { + "epoch": 2.629422718808194, + "grad_norm": 1.6697674989700317, + "learning_rate": 3.0500932241016255e-06, + "loss": 0.9296, + "step": 2824 + }, + { + "epoch": 2.6303538175046555, + "grad_norm": 1.5969644784927368, + "learning_rate": 3.0488854814142083e-06, + "loss": 0.9779, + "step": 2825 + }, + { + "epoch": 2.631284916201117, + "grad_norm": 1.6485135555267334, + "learning_rate": 3.04767760412367e-06, + "loss": 0.9463, + "step": 2826 + }, + { + "epoch": 2.6322160148975793, + "grad_norm": 1.6014982461929321, + "learning_rate": 3.0464695925262173e-06, + "loss": 0.9296, + "step": 2827 + }, + { + "epoch": 2.633147113594041, + "grad_norm": 1.6272295713424683, + "learning_rate": 3.045261446918092e-06, + "loss": 1.0032, + "step": 2828 + }, + { + "epoch": 2.6340782122905027, + "grad_norm": 1.607211947441101, + "learning_rate": 3.044053167595566e-06, + "loss": 0.9617, + "step": 2829 + }, + { + "epoch": 2.635009310986965, + "grad_norm": 1.6082310676574707, + "learning_rate": 3.0428447548549466e-06, + "loss": 0.9552, + "step": 2830 + }, + { + "epoch": 2.6359404096834265, + "grad_norm": 1.5937196016311646, + "learning_rate": 3.041636208992572e-06, + "loss": 0.9673, + "step": 2831 + }, + { + "epoch": 2.636871508379888, + "grad_norm": 1.6141903400421143, + "learning_rate": 3.0404275303048152e-06, + "loss": 0.9033, + "step": 2832 + }, + { + "epoch": 2.63780260707635, + "grad_norm": 1.6257685422897339, + "learning_rate": 3.0392187190880786e-06, + "loss": 0.9767, + "step": 2833 + }, + { + "epoch": 2.638733705772812, + "grad_norm": 1.5968188047409058, + "learning_rate": 3.0380097756387996e-06, + "loss": 0.9525, + "step": 2834 + }, + { + "epoch": 2.6396648044692737, + "grad_norm": 1.6123281717300415, + "learning_rate": 3.0368007002534474e-06, + "loss": 0.9457, + "step": 2835 + }, + { + "epoch": 2.6405959031657353, + "grad_norm": 1.6483839750289917, + "learning_rate": 3.035591493228523e-06, + "loss": 0.95, + "step": 2836 + }, + { + "epoch": 2.6415270018621975, + "grad_norm": 1.6793286800384521, + "learning_rate": 3.03438215486056e-06, + "loss": 0.965, + "step": 2837 + }, + { + "epoch": 2.642458100558659, + "grad_norm": 1.690168023109436, + "learning_rate": 3.033172685446125e-06, + "loss": 0.9667, + "step": 2838 + }, + { + "epoch": 2.643389199255121, + "grad_norm": 1.640297770500183, + "learning_rate": 3.031963085281816e-06, + "loss": 0.9258, + "step": 2839 + }, + { + "epoch": 2.644320297951583, + "grad_norm": 1.7079721689224243, + "learning_rate": 3.030753354664262e-06, + "loss": 0.9913, + "step": 2840 + }, + { + "epoch": 2.6452513966480447, + "grad_norm": 1.5586864948272705, + "learning_rate": 3.0295434938901263e-06, + "loss": 0.9342, + "step": 2841 + }, + { + "epoch": 2.6461824953445063, + "grad_norm": 1.6786353588104248, + "learning_rate": 3.028333503256103e-06, + "loss": 0.9496, + "step": 2842 + }, + { + "epoch": 2.6471135940409685, + "grad_norm": 1.613917350769043, + "learning_rate": 3.0271233830589162e-06, + "loss": 0.9445, + "step": 2843 + }, + { + "epoch": 2.64804469273743, + "grad_norm": 1.6379549503326416, + "learning_rate": 3.025913133595325e-06, + "loss": 0.9509, + "step": 2844 + }, + { + "epoch": 2.648975791433892, + "grad_norm": 1.6311167478561401, + "learning_rate": 3.0247027551621187e-06, + "loss": 0.9568, + "step": 2845 + }, + { + "epoch": 2.649906890130354, + "grad_norm": 1.5794034004211426, + "learning_rate": 3.0234922480561187e-06, + "loss": 0.9651, + "step": 2846 + }, + { + "epoch": 2.6508379888268156, + "grad_norm": 1.5947290658950806, + "learning_rate": 3.022281612574176e-06, + "loss": 0.9511, + "step": 2847 + }, + { + "epoch": 2.6517690875232773, + "grad_norm": 1.6354261636734009, + "learning_rate": 3.021070849013176e-06, + "loss": 0.9405, + "step": 2848 + }, + { + "epoch": 2.6527001862197395, + "grad_norm": 1.6305598020553589, + "learning_rate": 3.019859957670034e-06, + "loss": 0.9263, + "step": 2849 + }, + { + "epoch": 2.653631284916201, + "grad_norm": 1.6536580324172974, + "learning_rate": 3.018648938841695e-06, + "loss": 0.9455, + "step": 2850 + }, + { + "epoch": 2.654562383612663, + "grad_norm": 1.6688307523727417, + "learning_rate": 3.0174377928251392e-06, + "loss": 1.0027, + "step": 2851 + }, + { + "epoch": 2.655493482309125, + "grad_norm": 1.6771334409713745, + "learning_rate": 3.016226519917374e-06, + "loss": 0.9978, + "step": 2852 + }, + { + "epoch": 2.6564245810055866, + "grad_norm": 1.6501225233078003, + "learning_rate": 3.0150151204154423e-06, + "loss": 0.9535, + "step": 2853 + }, + { + "epoch": 2.6573556797020483, + "grad_norm": 1.6267509460449219, + "learning_rate": 3.0138035946164125e-06, + "loss": 1.0055, + "step": 2854 + }, + { + "epoch": 2.6582867783985105, + "grad_norm": 1.6203352212905884, + "learning_rate": 3.0125919428173876e-06, + "loss": 0.9498, + "step": 2855 + }, + { + "epoch": 2.659217877094972, + "grad_norm": 1.6645296812057495, + "learning_rate": 3.011380165315503e-06, + "loss": 0.9502, + "step": 2856 + }, + { + "epoch": 2.660148975791434, + "grad_norm": 1.5809940099716187, + "learning_rate": 3.010168262407919e-06, + "loss": 0.9448, + "step": 2857 + }, + { + "epoch": 2.661080074487896, + "grad_norm": 1.5849182605743408, + "learning_rate": 3.008956234391835e-06, + "loss": 0.9243, + "step": 2858 + }, + { + "epoch": 2.6620111731843576, + "grad_norm": 1.621230125427246, + "learning_rate": 3.0077440815644722e-06, + "loss": 0.957, + "step": 2859 + }, + { + "epoch": 2.6629422718808193, + "grad_norm": 1.6482765674591064, + "learning_rate": 3.00653180422309e-06, + "loss": 0.9411, + "step": 2860 + }, + { + "epoch": 2.6638733705772815, + "grad_norm": 1.5941625833511353, + "learning_rate": 3.005319402664973e-06, + "loss": 0.9388, + "step": 2861 + }, + { + "epoch": 2.664804469273743, + "grad_norm": 1.6318464279174805, + "learning_rate": 3.0041068771874387e-06, + "loss": 0.958, + "step": 2862 + }, + { + "epoch": 2.665735567970205, + "grad_norm": 1.6854920387268066, + "learning_rate": 3.0028942280878347e-06, + "loss": 0.9576, + "step": 2863 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 1.6749870777130127, + "learning_rate": 3.0016814556635387e-06, + "loss": 0.9924, + "step": 2864 + }, + { + "epoch": 2.6675977653631286, + "grad_norm": 1.5535457134246826, + "learning_rate": 3.00046856021196e-06, + "loss": 0.95, + "step": 2865 + }, + { + "epoch": 2.6685288640595903, + "grad_norm": 1.5978206396102905, + "learning_rate": 2.999255542030534e-06, + "loss": 0.9508, + "step": 2866 + }, + { + "epoch": 2.669459962756052, + "grad_norm": 1.6399273872375488, + "learning_rate": 2.998042401416732e-06, + "loss": 0.9616, + "step": 2867 + }, + { + "epoch": 2.6703910614525137, + "grad_norm": 1.5954216718673706, + "learning_rate": 2.9968291386680505e-06, + "loss": 0.9421, + "step": 2868 + }, + { + "epoch": 2.671322160148976, + "grad_norm": 1.7154549360275269, + "learning_rate": 2.9956157540820186e-06, + "loss": 1.0109, + "step": 2869 + }, + { + "epoch": 2.6722532588454375, + "grad_norm": 1.7107621431350708, + "learning_rate": 2.994402247956194e-06, + "loss": 1.0035, + "step": 2870 + }, + { + "epoch": 2.673184357541899, + "grad_norm": 1.6286007165908813, + "learning_rate": 2.9931886205881642e-06, + "loss": 0.9785, + "step": 2871 + }, + { + "epoch": 2.6741154562383613, + "grad_norm": 1.6210182905197144, + "learning_rate": 2.9919748722755485e-06, + "loss": 0.9403, + "step": 2872 + }, + { + "epoch": 2.675046554934823, + "grad_norm": 1.6527222394943237, + "learning_rate": 2.9907610033159927e-06, + "loss": 0.9398, + "step": 2873 + }, + { + "epoch": 2.6759776536312847, + "grad_norm": 1.6232794523239136, + "learning_rate": 2.989547014007175e-06, + "loss": 0.9371, + "step": 2874 + }, + { + "epoch": 2.676908752327747, + "grad_norm": 1.618330478668213, + "learning_rate": 2.9883329046468e-06, + "loss": 0.951, + "step": 2875 + }, + { + "epoch": 2.6778398510242085, + "grad_norm": 1.6865403652191162, + "learning_rate": 2.987118675532606e-06, + "loss": 0.975, + "step": 2876 + }, + { + "epoch": 2.67877094972067, + "grad_norm": 1.668399453163147, + "learning_rate": 2.985904326962357e-06, + "loss": 0.9715, + "step": 2877 + }, + { + "epoch": 2.6797020484171323, + "grad_norm": 1.7188880443572998, + "learning_rate": 2.9846898592338465e-06, + "loss": 0.9705, + "step": 2878 + }, + { + "epoch": 2.680633147113594, + "grad_norm": 1.6627426147460938, + "learning_rate": 2.9834752726449e-06, + "loss": 0.9891, + "step": 2879 + }, + { + "epoch": 2.6815642458100557, + "grad_norm": 1.6442075967788696, + "learning_rate": 2.9822605674933696e-06, + "loss": 0.9561, + "step": 2880 + }, + { + "epoch": 2.682495344506518, + "grad_norm": 1.6207232475280762, + "learning_rate": 2.9810457440771374e-06, + "loss": 0.9354, + "step": 2881 + }, + { + "epoch": 2.6834264432029795, + "grad_norm": 1.6638420820236206, + "learning_rate": 2.9798308026941147e-06, + "loss": 0.9404, + "step": 2882 + }, + { + "epoch": 2.684357541899441, + "grad_norm": 1.5670677423477173, + "learning_rate": 2.9786157436422413e-06, + "loss": 0.9166, + "step": 2883 + }, + { + "epoch": 2.6852886405959033, + "grad_norm": 1.6316684484481812, + "learning_rate": 2.9774005672194854e-06, + "loss": 0.9695, + "step": 2884 + }, + { + "epoch": 2.686219739292365, + "grad_norm": 1.5999932289123535, + "learning_rate": 2.9761852737238462e-06, + "loss": 0.9539, + "step": 2885 + }, + { + "epoch": 2.6871508379888267, + "grad_norm": 1.797905445098877, + "learning_rate": 2.9749698634533476e-06, + "loss": 0.9529, + "step": 2886 + }, + { + "epoch": 2.688081936685289, + "grad_norm": 1.74045729637146, + "learning_rate": 2.9737543367060475e-06, + "loss": 0.9387, + "step": 2887 + }, + { + "epoch": 2.6890130353817505, + "grad_norm": 1.6801518201828003, + "learning_rate": 2.9725386937800254e-06, + "loss": 0.9607, + "step": 2888 + }, + { + "epoch": 2.689944134078212, + "grad_norm": 1.661042332649231, + "learning_rate": 2.9713229349733965e-06, + "loss": 0.9621, + "step": 2889 + }, + { + "epoch": 2.6908752327746743, + "grad_norm": 1.6822454929351807, + "learning_rate": 2.9701070605843e-06, + "loss": 0.9812, + "step": 2890 + }, + { + "epoch": 2.691806331471136, + "grad_norm": 1.7273681163787842, + "learning_rate": 2.9688910709109052e-06, + "loss": 0.9304, + "step": 2891 + }, + { + "epoch": 2.6927374301675977, + "grad_norm": 1.7504971027374268, + "learning_rate": 2.967674966251409e-06, + "loss": 0.9818, + "step": 2892 + }, + { + "epoch": 2.69366852886406, + "grad_norm": 1.7233527898788452, + "learning_rate": 2.966458746904036e-06, + "loss": 0.9391, + "step": 2893 + }, + { + "epoch": 2.6945996275605215, + "grad_norm": 1.5805985927581787, + "learning_rate": 2.9652424131670404e-06, + "loss": 0.923, + "step": 2894 + }, + { + "epoch": 2.695530726256983, + "grad_norm": 1.5961047410964966, + "learning_rate": 2.964025965338702e-06, + "loss": 0.947, + "step": 2895 + }, + { + "epoch": 2.6964618249534453, + "grad_norm": 1.6449295282363892, + "learning_rate": 2.962809403717332e-06, + "loss": 0.9858, + "step": 2896 + }, + { + "epoch": 2.697392923649907, + "grad_norm": 1.668685793876648, + "learning_rate": 2.9615927286012664e-06, + "loss": 0.9773, + "step": 2897 + }, + { + "epoch": 2.6983240223463687, + "grad_norm": 1.6844929456710815, + "learning_rate": 2.960375940288871e-06, + "loss": 0.9953, + "step": 2898 + }, + { + "epoch": 2.6992551210428304, + "grad_norm": 1.6631215810775757, + "learning_rate": 2.959159039078539e-06, + "loss": 0.9509, + "step": 2899 + }, + { + "epoch": 2.7001862197392925, + "grad_norm": 1.6469553709030151, + "learning_rate": 2.957942025268689e-06, + "loss": 0.9356, + "step": 2900 + }, + { + "epoch": 2.701117318435754, + "grad_norm": 1.6841636896133423, + "learning_rate": 2.956724899157772e-06, + "loss": 0.9985, + "step": 2901 + }, + { + "epoch": 2.702048417132216, + "grad_norm": 1.6484923362731934, + "learning_rate": 2.9555076610442605e-06, + "loss": 0.9745, + "step": 2902 + }, + { + "epoch": 2.7029795158286776, + "grad_norm": 1.6595820188522339, + "learning_rate": 2.9542903112266613e-06, + "loss": 0.9586, + "step": 2903 + }, + { + "epoch": 2.7039106145251397, + "grad_norm": 1.7325596809387207, + "learning_rate": 2.953072850003502e-06, + "loss": 0.9571, + "step": 2904 + }, + { + "epoch": 2.7048417132216014, + "grad_norm": 1.6623222827911377, + "learning_rate": 2.9518552776733416e-06, + "loss": 1.0077, + "step": 2905 + }, + { + "epoch": 2.705772811918063, + "grad_norm": 1.6257872581481934, + "learning_rate": 2.950637594534765e-06, + "loss": 0.9116, + "step": 2906 + }, + { + "epoch": 2.706703910614525, + "grad_norm": 1.6201913356781006, + "learning_rate": 2.9494198008863843e-06, + "loss": 0.9217, + "step": 2907 + }, + { + "epoch": 2.707635009310987, + "grad_norm": 1.6633226871490479, + "learning_rate": 2.9482018970268395e-06, + "loss": 0.9638, + "step": 2908 + }, + { + "epoch": 2.7085661080074486, + "grad_norm": 1.828963041305542, + "learning_rate": 2.9469838832547964e-06, + "loss": 0.9927, + "step": 2909 + }, + { + "epoch": 2.7094972067039107, + "grad_norm": 1.6324316263198853, + "learning_rate": 2.9457657598689493e-06, + "loss": 0.9536, + "step": 2910 + }, + { + "epoch": 2.7104283054003724, + "grad_norm": 1.6936625242233276, + "learning_rate": 2.9445475271680175e-06, + "loss": 0.9468, + "step": 2911 + }, + { + "epoch": 2.711359404096834, + "grad_norm": 1.6941533088684082, + "learning_rate": 2.9433291854507483e-06, + "loss": 0.9363, + "step": 2912 + }, + { + "epoch": 2.712290502793296, + "grad_norm": 1.643308401107788, + "learning_rate": 2.9421107350159156e-06, + "loss": 0.9686, + "step": 2913 + }, + { + "epoch": 2.713221601489758, + "grad_norm": 1.685700535774231, + "learning_rate": 2.94089217616232e-06, + "loss": 0.9778, + "step": 2914 + }, + { + "epoch": 2.7141527001862196, + "grad_norm": 1.6530426740646362, + "learning_rate": 2.9396735091887883e-06, + "loss": 0.9866, + "step": 2915 + }, + { + "epoch": 2.7150837988826817, + "grad_norm": 1.6582716703414917, + "learning_rate": 2.9384547343941742e-06, + "loss": 0.9543, + "step": 2916 + }, + { + "epoch": 2.7160148975791434, + "grad_norm": 1.6577575206756592, + "learning_rate": 2.9372358520773575e-06, + "loss": 0.9201, + "step": 2917 + }, + { + "epoch": 2.716945996275605, + "grad_norm": 1.6079766750335693, + "learning_rate": 2.936016862537245e-06, + "loss": 0.9468, + "step": 2918 + }, + { + "epoch": 2.717877094972067, + "grad_norm": 1.615461826324463, + "learning_rate": 2.934797766072769e-06, + "loss": 0.9059, + "step": 2919 + }, + { + "epoch": 2.718808193668529, + "grad_norm": 1.6259170770645142, + "learning_rate": 2.933578562982888e-06, + "loss": 0.9722, + "step": 2920 + }, + { + "epoch": 2.7197392923649906, + "grad_norm": 1.6534541845321655, + "learning_rate": 2.932359253566588e-06, + "loss": 0.9857, + "step": 2921 + }, + { + "epoch": 2.7206703910614527, + "grad_norm": 1.5887418985366821, + "learning_rate": 2.931139838122879e-06, + "loss": 0.8908, + "step": 2922 + }, + { + "epoch": 2.7216014897579144, + "grad_norm": 1.7287448644638062, + "learning_rate": 2.929920316950799e-06, + "loss": 0.9871, + "step": 2923 + }, + { + "epoch": 2.722532588454376, + "grad_norm": 1.6595737934112549, + "learning_rate": 2.928700690349411e-06, + "loss": 0.968, + "step": 2924 + }, + { + "epoch": 2.723463687150838, + "grad_norm": 1.8576184511184692, + "learning_rate": 2.9274809586178026e-06, + "loss": 0.9522, + "step": 2925 + }, + { + "epoch": 2.7243947858473, + "grad_norm": 1.6479943990707397, + "learning_rate": 2.9262611220550906e-06, + "loss": 0.9813, + "step": 2926 + }, + { + "epoch": 2.7253258845437616, + "grad_norm": 1.6889318227767944, + "learning_rate": 2.9250411809604136e-06, + "loss": 0.979, + "step": 2927 + }, + { + "epoch": 2.7262569832402237, + "grad_norm": 1.651171326637268, + "learning_rate": 2.923821135632938e-06, + "loss": 0.9693, + "step": 2928 + }, + { + "epoch": 2.7271880819366854, + "grad_norm": 1.6054799556732178, + "learning_rate": 2.922600986371856e-06, + "loss": 0.9596, + "step": 2929 + }, + { + "epoch": 2.728119180633147, + "grad_norm": 1.677786111831665, + "learning_rate": 2.9213807334763857e-06, + "loss": 0.949, + "step": 2930 + }, + { + "epoch": 2.729050279329609, + "grad_norm": 1.6429251432418823, + "learning_rate": 2.920160377245766e-06, + "loss": 0.9775, + "step": 2931 + }, + { + "epoch": 2.729981378026071, + "grad_norm": 1.6588455438613892, + "learning_rate": 2.9189399179792675e-06, + "loss": 0.9177, + "step": 2932 + }, + { + "epoch": 2.7309124767225326, + "grad_norm": 1.6935231685638428, + "learning_rate": 2.917719355976183e-06, + "loss": 0.9392, + "step": 2933 + }, + { + "epoch": 2.7318435754189943, + "grad_norm": 1.578336477279663, + "learning_rate": 2.91649869153583e-06, + "loss": 0.9171, + "step": 2934 + }, + { + "epoch": 2.7327746741154564, + "grad_norm": 1.6603213548660278, + "learning_rate": 2.9152779249575536e-06, + "loss": 0.9416, + "step": 2935 + }, + { + "epoch": 2.733705772811918, + "grad_norm": 1.6804225444793701, + "learning_rate": 2.9140570565407194e-06, + "loss": 0.9519, + "step": 2936 + }, + { + "epoch": 2.7346368715083798, + "grad_norm": 1.6281383037567139, + "learning_rate": 2.9128360865847235e-06, + "loss": 0.9511, + "step": 2937 + }, + { + "epoch": 2.7355679702048414, + "grad_norm": 1.6681561470031738, + "learning_rate": 2.911615015388982e-06, + "loss": 0.9447, + "step": 2938 + }, + { + "epoch": 2.7364990689013036, + "grad_norm": 1.6646918058395386, + "learning_rate": 2.91039384325294e-06, + "loss": 0.9626, + "step": 2939 + }, + { + "epoch": 2.7374301675977653, + "grad_norm": 1.6071377992630005, + "learning_rate": 2.9091725704760637e-06, + "loss": 0.9286, + "step": 2940 + }, + { + "epoch": 2.738361266294227, + "grad_norm": 1.5857560634613037, + "learning_rate": 2.9079511973578467e-06, + "loss": 0.9536, + "step": 2941 + }, + { + "epoch": 2.739292364990689, + "grad_norm": 1.610556721687317, + "learning_rate": 2.906729724197807e-06, + "loss": 0.9714, + "step": 2942 + }, + { + "epoch": 2.7402234636871508, + "grad_norm": 1.5707459449768066, + "learning_rate": 2.905508151295484e-06, + "loss": 0.9564, + "step": 2943 + }, + { + "epoch": 2.7411545623836124, + "grad_norm": 1.5549044609069824, + "learning_rate": 2.9042864789504465e-06, + "loss": 0.9252, + "step": 2944 + }, + { + "epoch": 2.7420856610800746, + "grad_norm": 1.581815242767334, + "learning_rate": 2.9030647074622824e-06, + "loss": 0.9302, + "step": 2945 + }, + { + "epoch": 2.7430167597765363, + "grad_norm": 1.6523064374923706, + "learning_rate": 2.9018428371306097e-06, + "loss": 0.9348, + "step": 2946 + }, + { + "epoch": 2.743947858472998, + "grad_norm": 1.6628508567810059, + "learning_rate": 2.900620868255064e-06, + "loss": 0.9592, + "step": 2947 + }, + { + "epoch": 2.74487895716946, + "grad_norm": 1.6209479570388794, + "learning_rate": 2.8993988011353113e-06, + "loss": 0.9587, + "step": 2948 + }, + { + "epoch": 2.7458100558659218, + "grad_norm": 1.6480685472488403, + "learning_rate": 2.8981766360710377e-06, + "loss": 0.9909, + "step": 2949 + }, + { + "epoch": 2.7467411545623834, + "grad_norm": 1.6479328870773315, + "learning_rate": 2.8969543733619553e-06, + "loss": 0.9748, + "step": 2950 + }, + { + "epoch": 2.7476722532588456, + "grad_norm": 1.6769737005233765, + "learning_rate": 2.8957320133077987e-06, + "loss": 0.9729, + "step": 2951 + }, + { + "epoch": 2.7486033519553073, + "grad_norm": 1.6814030408859253, + "learning_rate": 2.894509556208327e-06, + "loss": 0.9305, + "step": 2952 + }, + { + "epoch": 2.749534450651769, + "grad_norm": 1.6598000526428223, + "learning_rate": 2.893287002363324e-06, + "loss": 0.9597, + "step": 2953 + }, + { + "epoch": 2.750465549348231, + "grad_norm": 1.5595886707305908, + "learning_rate": 2.8920643520725967e-06, + "loss": 0.9185, + "step": 2954 + }, + { + "epoch": 2.7513966480446927, + "grad_norm": 1.6225324869155884, + "learning_rate": 2.8908416056359743e-06, + "loss": 0.9801, + "step": 2955 + }, + { + "epoch": 2.7523277467411544, + "grad_norm": 1.6058166027069092, + "learning_rate": 2.8896187633533112e-06, + "loss": 0.9456, + "step": 2956 + }, + { + "epoch": 2.7532588454376166, + "grad_norm": 1.6955596208572388, + "learning_rate": 2.8883958255244855e-06, + "loss": 0.9439, + "step": 2957 + }, + { + "epoch": 2.7541899441340782, + "grad_norm": 1.7024625539779663, + "learning_rate": 2.8871727924493976e-06, + "loss": 0.95, + "step": 2958 + }, + { + "epoch": 2.75512104283054, + "grad_norm": 1.6556583642959595, + "learning_rate": 2.885949664427972e-06, + "loss": 0.9536, + "step": 2959 + }, + { + "epoch": 2.756052141527002, + "grad_norm": 1.6698493957519531, + "learning_rate": 2.884726441760155e-06, + "loss": 0.9951, + "step": 2960 + }, + { + "epoch": 2.7569832402234637, + "grad_norm": 1.6347700357437134, + "learning_rate": 2.8835031247459187e-06, + "loss": 0.9715, + "step": 2961 + }, + { + "epoch": 2.7579143389199254, + "grad_norm": 1.6315240859985352, + "learning_rate": 2.882279713685257e-06, + "loss": 0.9595, + "step": 2962 + }, + { + "epoch": 2.7588454376163876, + "grad_norm": 1.6313539743423462, + "learning_rate": 2.881056208878186e-06, + "loss": 0.9395, + "step": 2963 + }, + { + "epoch": 2.7597765363128492, + "grad_norm": 1.6100709438323975, + "learning_rate": 2.8798326106247472e-06, + "loss": 0.9141, + "step": 2964 + }, + { + "epoch": 2.760707635009311, + "grad_norm": 1.6138430833816528, + "learning_rate": 2.878608919225001e-06, + "loss": 0.9412, + "step": 2965 + }, + { + "epoch": 2.761638733705773, + "grad_norm": 1.6096676588058472, + "learning_rate": 2.8773851349790357e-06, + "loss": 0.9372, + "step": 2966 + }, + { + "epoch": 2.7625698324022347, + "grad_norm": 1.608506441116333, + "learning_rate": 2.876161258186958e-06, + "loss": 0.9344, + "step": 2967 + }, + { + "epoch": 2.7635009310986964, + "grad_norm": 1.6560670137405396, + "learning_rate": 2.8749372891488998e-06, + "loss": 0.9043, + "step": 2968 + }, + { + "epoch": 2.7644320297951586, + "grad_norm": 1.609519362449646, + "learning_rate": 2.873713228165014e-06, + "loss": 0.9625, + "step": 2969 + }, + { + "epoch": 2.7653631284916202, + "grad_norm": 1.5880539417266846, + "learning_rate": 2.8724890755354784e-06, + "loss": 0.9201, + "step": 2970 + }, + { + "epoch": 2.766294227188082, + "grad_norm": 1.6771913766860962, + "learning_rate": 2.8712648315604905e-06, + "loss": 0.9694, + "step": 2971 + }, + { + "epoch": 2.7672253258845436, + "grad_norm": 1.6371620893478394, + "learning_rate": 2.8700404965402728e-06, + "loss": 0.9564, + "step": 2972 + }, + { + "epoch": 2.7681564245810057, + "grad_norm": 1.6186131238937378, + "learning_rate": 2.8688160707750678e-06, + "loss": 0.9439, + "step": 2973 + }, + { + "epoch": 2.7690875232774674, + "grad_norm": 1.6411898136138916, + "learning_rate": 2.867591554565141e-06, + "loss": 0.9422, + "step": 2974 + }, + { + "epoch": 2.770018621973929, + "grad_norm": 1.6392754316329956, + "learning_rate": 2.866366948210781e-06, + "loss": 0.9719, + "step": 2975 + }, + { + "epoch": 2.770949720670391, + "grad_norm": 1.6504026651382446, + "learning_rate": 2.865142252012298e-06, + "loss": 0.942, + "step": 2976 + }, + { + "epoch": 2.771880819366853, + "grad_norm": 1.5859071016311646, + "learning_rate": 2.863917466270024e-06, + "loss": 0.9234, + "step": 2977 + }, + { + "epoch": 2.7728119180633146, + "grad_norm": 1.6110219955444336, + "learning_rate": 2.862692591284313e-06, + "loss": 0.9222, + "step": 2978 + }, + { + "epoch": 2.7737430167597763, + "grad_norm": 1.6395645141601562, + "learning_rate": 2.861467627355541e-06, + "loss": 0.9906, + "step": 2979 + }, + { + "epoch": 2.7746741154562384, + "grad_norm": 1.720544695854187, + "learning_rate": 2.8602425747841054e-06, + "loss": 0.9639, + "step": 2980 + }, + { + "epoch": 2.7756052141527, + "grad_norm": 1.661580204963684, + "learning_rate": 2.8590174338704256e-06, + "loss": 0.9635, + "step": 2981 + }, + { + "epoch": 2.776536312849162, + "grad_norm": 1.5911599397659302, + "learning_rate": 2.8577922049149443e-06, + "loss": 0.9593, + "step": 2982 + }, + { + "epoch": 2.777467411545624, + "grad_norm": 1.6180620193481445, + "learning_rate": 2.856566888218121e-06, + "loss": 0.9348, + "step": 2983 + }, + { + "epoch": 2.7783985102420856, + "grad_norm": 1.6100971698760986, + "learning_rate": 2.8553414840804446e-06, + "loss": 0.9376, + "step": 2984 + }, + { + "epoch": 2.7793296089385473, + "grad_norm": 1.6387910842895508, + "learning_rate": 2.8541159928024167e-06, + "loss": 0.9479, + "step": 2985 + }, + { + "epoch": 2.7802607076350094, + "grad_norm": 1.654554009437561, + "learning_rate": 2.8528904146845652e-06, + "loss": 0.9996, + "step": 2986 + }, + { + "epoch": 2.781191806331471, + "grad_norm": 1.6798985004425049, + "learning_rate": 2.851664750027441e-06, + "loss": 0.966, + "step": 2987 + }, + { + "epoch": 2.782122905027933, + "grad_norm": 1.6659077405929565, + "learning_rate": 2.8504389991316107e-06, + "loss": 0.9521, + "step": 2988 + }, + { + "epoch": 2.783054003724395, + "grad_norm": 1.6424578428268433, + "learning_rate": 2.849213162297667e-06, + "loss": 0.9747, + "step": 2989 + }, + { + "epoch": 2.7839851024208566, + "grad_norm": 1.6100014448165894, + "learning_rate": 2.8479872398262197e-06, + "loss": 0.941, + "step": 2990 + }, + { + "epoch": 2.7849162011173183, + "grad_norm": 1.6316558122634888, + "learning_rate": 2.8467612320179046e-06, + "loss": 0.9519, + "step": 2991 + }, + { + "epoch": 2.7858472998137804, + "grad_norm": 1.676816701889038, + "learning_rate": 2.8455351391733726e-06, + "loss": 1.0057, + "step": 2992 + }, + { + "epoch": 2.786778398510242, + "grad_norm": 1.6258941888809204, + "learning_rate": 2.8443089615933002e-06, + "loss": 0.9546, + "step": 2993 + }, + { + "epoch": 2.787709497206704, + "grad_norm": 1.622909426689148, + "learning_rate": 2.843082699578381e-06, + "loss": 0.9706, + "step": 2994 + }, + { + "epoch": 2.788640595903166, + "grad_norm": 1.6182568073272705, + "learning_rate": 2.841856353429332e-06, + "loss": 0.9454, + "step": 2995 + }, + { + "epoch": 2.7895716945996276, + "grad_norm": 1.6602880954742432, + "learning_rate": 2.8406299234468914e-06, + "loss": 0.9623, + "step": 2996 + }, + { + "epoch": 2.7905027932960893, + "grad_norm": 1.587006688117981, + "learning_rate": 2.839403409931814e-06, + "loss": 0.9345, + "step": 2997 + }, + { + "epoch": 2.7914338919925514, + "grad_norm": 1.6170670986175537, + "learning_rate": 2.8381768131848796e-06, + "loss": 0.9874, + "step": 2998 + }, + { + "epoch": 2.792364990689013, + "grad_norm": 1.6449601650238037, + "learning_rate": 2.836950133506885e-06, + "loss": 0.9658, + "step": 2999 + }, + { + "epoch": 2.793296089385475, + "grad_norm": 1.6289868354797363, + "learning_rate": 2.8357233711986487e-06, + "loss": 0.9559, + "step": 3000 + }, + { + "epoch": 2.794227188081937, + "grad_norm": 1.611748218536377, + "learning_rate": 2.8344965265610107e-06, + "loss": 0.9572, + "step": 3001 + }, + { + "epoch": 2.7951582867783986, + "grad_norm": 1.5705057382583618, + "learning_rate": 2.833269599894829e-06, + "loss": 0.9514, + "step": 3002 + }, + { + "epoch": 2.7960893854748603, + "grad_norm": 1.5393644571304321, + "learning_rate": 2.8320425915009825e-06, + "loss": 0.9308, + "step": 3003 + }, + { + "epoch": 2.7970204841713224, + "grad_norm": 1.6054822206497192, + "learning_rate": 2.8308155016803706e-06, + "loss": 0.9441, + "step": 3004 + }, + { + "epoch": 2.797951582867784, + "grad_norm": 1.6108616590499878, + "learning_rate": 2.829588330733913e-06, + "loss": 0.9299, + "step": 3005 + }, + { + "epoch": 2.798882681564246, + "grad_norm": 1.6474404335021973, + "learning_rate": 2.8283610789625483e-06, + "loss": 0.9654, + "step": 3006 + }, + { + "epoch": 2.7998137802607075, + "grad_norm": 1.5784049034118652, + "learning_rate": 2.8271337466672343e-06, + "loss": 0.9391, + "step": 3007 + }, + { + "epoch": 2.8007448789571696, + "grad_norm": 1.6093735694885254, + "learning_rate": 2.8259063341489514e-06, + "loss": 0.9102, + "step": 3008 + }, + { + "epoch": 2.8016759776536313, + "grad_norm": 1.6307052373886108, + "learning_rate": 2.8246788417086964e-06, + "loss": 0.9106, + "step": 3009 + }, + { + "epoch": 2.802607076350093, + "grad_norm": 1.6352678537368774, + "learning_rate": 2.8234512696474875e-06, + "loss": 0.9917, + "step": 3010 + }, + { + "epoch": 2.8035381750465547, + "grad_norm": 1.594149112701416, + "learning_rate": 2.8222236182663624e-06, + "loss": 0.9294, + "step": 3011 + }, + { + "epoch": 2.804469273743017, + "grad_norm": 1.5843313932418823, + "learning_rate": 2.820995887866378e-06, + "loss": 0.9624, + "step": 3012 + }, + { + "epoch": 2.8054003724394785, + "grad_norm": 1.559574007987976, + "learning_rate": 2.819768078748609e-06, + "loss": 0.9146, + "step": 3013 + }, + { + "epoch": 2.80633147113594, + "grad_norm": 1.6503013372421265, + "learning_rate": 2.8185401912141532e-06, + "loss": 0.9558, + "step": 3014 + }, + { + "epoch": 2.8072625698324023, + "grad_norm": 1.6319999694824219, + "learning_rate": 2.8173122255641234e-06, + "loss": 0.9347, + "step": 3015 + }, + { + "epoch": 2.808193668528864, + "grad_norm": 1.638600468635559, + "learning_rate": 2.8160841820996547e-06, + "loss": 0.9692, + "step": 3016 + }, + { + "epoch": 2.8091247672253257, + "grad_norm": 1.6471275091171265, + "learning_rate": 2.8148560611218987e-06, + "loss": 0.9525, + "step": 3017 + }, + { + "epoch": 2.810055865921788, + "grad_norm": 1.6110758781433105, + "learning_rate": 2.8136278629320294e-06, + "loss": 0.9576, + "step": 3018 + }, + { + "epoch": 2.8109869646182495, + "grad_norm": 1.6616575717926025, + "learning_rate": 2.8123995878312356e-06, + "loss": 0.9588, + "step": 3019 + }, + { + "epoch": 2.811918063314711, + "grad_norm": 1.6166157722473145, + "learning_rate": 2.811171236120728e-06, + "loss": 0.944, + "step": 3020 + }, + { + "epoch": 2.8128491620111733, + "grad_norm": 1.6271874904632568, + "learning_rate": 2.8099428081017353e-06, + "loss": 0.9796, + "step": 3021 + }, + { + "epoch": 2.813780260707635, + "grad_norm": 1.61025869846344, + "learning_rate": 2.808714304075505e-06, + "loss": 0.9339, + "step": 3022 + }, + { + "epoch": 2.8147113594040967, + "grad_norm": 1.6462604999542236, + "learning_rate": 2.807485724343303e-06, + "loss": 0.9791, + "step": 3023 + }, + { + "epoch": 2.815642458100559, + "grad_norm": 1.6522613763809204, + "learning_rate": 2.806257069206412e-06, + "loss": 0.963, + "step": 3024 + }, + { + "epoch": 2.8165735567970205, + "grad_norm": 1.6467231512069702, + "learning_rate": 2.805028338966137e-06, + "loss": 1.001, + "step": 3025 + }, + { + "epoch": 2.817504655493482, + "grad_norm": 1.604615569114685, + "learning_rate": 2.803799533923798e-06, + "loss": 0.9258, + "step": 3026 + }, + { + "epoch": 2.8184357541899443, + "grad_norm": 1.6463779211044312, + "learning_rate": 2.8025706543807364e-06, + "loss": 0.9679, + "step": 3027 + }, + { + "epoch": 2.819366852886406, + "grad_norm": 1.6304740905761719, + "learning_rate": 2.8013417006383078e-06, + "loss": 0.9912, + "step": 3028 + }, + { + "epoch": 2.8202979515828677, + "grad_norm": 1.6669981479644775, + "learning_rate": 2.8001126729978907e-06, + "loss": 0.9279, + "step": 3029 + }, + { + "epoch": 2.82122905027933, + "grad_norm": 1.5770384073257446, + "learning_rate": 2.7988835717608785e-06, + "loss": 0.9259, + "step": 3030 + }, + { + "epoch": 2.8221601489757915, + "grad_norm": 1.6221460103988647, + "learning_rate": 2.7976543972286824e-06, + "loss": 0.9441, + "step": 3031 + }, + { + "epoch": 2.823091247672253, + "grad_norm": 1.6051779985427856, + "learning_rate": 2.796425149702735e-06, + "loss": 0.9333, + "step": 3032 + }, + { + "epoch": 2.8240223463687153, + "grad_norm": 1.6150850057601929, + "learning_rate": 2.795195829484483e-06, + "loss": 0.9316, + "step": 3033 + }, + { + "epoch": 2.824953445065177, + "grad_norm": 1.7130149602890015, + "learning_rate": 2.7939664368753925e-06, + "loss": 0.9654, + "step": 3034 + }, + { + "epoch": 2.8258845437616387, + "grad_norm": 1.7591301202774048, + "learning_rate": 2.792736972176948e-06, + "loss": 0.9578, + "step": 3035 + }, + { + "epoch": 2.826815642458101, + "grad_norm": 1.6858676671981812, + "learning_rate": 2.79150743569065e-06, + "loss": 0.923, + "step": 3036 + }, + { + "epoch": 2.8277467411545625, + "grad_norm": 1.6735764741897583, + "learning_rate": 2.7902778277180187e-06, + "loss": 0.9377, + "step": 3037 + }, + { + "epoch": 2.828677839851024, + "grad_norm": 1.6521159410476685, + "learning_rate": 2.7890481485605898e-06, + "loss": 0.948, + "step": 3038 + }, + { + "epoch": 2.8296089385474863, + "grad_norm": 1.6810277700424194, + "learning_rate": 2.787818398519918e-06, + "loss": 0.9623, + "step": 3039 + }, + { + "epoch": 2.830540037243948, + "grad_norm": 1.734470009803772, + "learning_rate": 2.7865885778975742e-06, + "loss": 0.9737, + "step": 3040 + }, + { + "epoch": 2.8314711359404097, + "grad_norm": 1.5533194541931152, + "learning_rate": 2.7853586869951484e-06, + "loss": 0.9278, + "step": 3041 + }, + { + "epoch": 2.8324022346368714, + "grad_norm": 1.583024263381958, + "learning_rate": 2.784128726114245e-06, + "loss": 0.9148, + "step": 3042 + }, + { + "epoch": 2.8333333333333335, + "grad_norm": 1.6349562406539917, + "learning_rate": 2.782898695556487e-06, + "loss": 0.9693, + "step": 3043 + }, + { + "epoch": 2.834264432029795, + "grad_norm": 1.613322377204895, + "learning_rate": 2.7816685956235167e-06, + "loss": 0.9425, + "step": 3044 + }, + { + "epoch": 2.835195530726257, + "grad_norm": 1.5986024141311646, + "learning_rate": 2.7804384266169897e-06, + "loss": 0.9586, + "step": 3045 + }, + { + "epoch": 2.8361266294227185, + "grad_norm": 1.6297509670257568, + "learning_rate": 2.77920818883858e-06, + "loss": 0.9195, + "step": 3046 + }, + { + "epoch": 2.8370577281191807, + "grad_norm": 1.7266979217529297, + "learning_rate": 2.7779778825899804e-06, + "loss": 0.9818, + "step": 3047 + }, + { + "epoch": 2.8379888268156424, + "grad_norm": 1.6487879753112793, + "learning_rate": 2.776747508172897e-06, + "loss": 0.9603, + "step": 3048 + }, + { + "epoch": 2.838919925512104, + "grad_norm": 1.64192795753479, + "learning_rate": 2.775517065889055e-06, + "loss": 0.9191, + "step": 3049 + }, + { + "epoch": 2.839851024208566, + "grad_norm": 1.6581125259399414, + "learning_rate": 2.774286556040196e-06, + "loss": 0.9706, + "step": 3050 + }, + { + "epoch": 2.840782122905028, + "grad_norm": 1.5849334001541138, + "learning_rate": 2.7730559789280774e-06, + "loss": 0.9312, + "step": 3051 + }, + { + "epoch": 2.8417132216014895, + "grad_norm": 1.6276161670684814, + "learning_rate": 2.7718253348544734e-06, + "loss": 0.9773, + "step": 3052 + }, + { + "epoch": 2.8426443202979517, + "grad_norm": 1.6801183223724365, + "learning_rate": 2.7705946241211746e-06, + "loss": 0.958, + "step": 3053 + }, + { + "epoch": 2.8435754189944134, + "grad_norm": 1.6204458475112915, + "learning_rate": 2.7693638470299883e-06, + "loss": 0.949, + "step": 3054 + }, + { + "epoch": 2.844506517690875, + "grad_norm": 1.6157158613204956, + "learning_rate": 2.768133003882738e-06, + "loss": 0.9332, + "step": 3055 + }, + { + "epoch": 2.845437616387337, + "grad_norm": 1.6545689105987549, + "learning_rate": 2.7669020949812626e-06, + "loss": 0.9309, + "step": 3056 + }, + { + "epoch": 2.846368715083799, + "grad_norm": 1.6190471649169922, + "learning_rate": 2.7656711206274184e-06, + "loss": 0.9426, + "step": 3057 + }, + { + "epoch": 2.8472998137802605, + "grad_norm": 1.6490509510040283, + "learning_rate": 2.764440081123077e-06, + "loss": 0.9429, + "step": 3058 + }, + { + "epoch": 2.8482309124767227, + "grad_norm": 1.626109004020691, + "learning_rate": 2.7632089767701258e-06, + "loss": 0.9661, + "step": 3059 + }, + { + "epoch": 2.8491620111731844, + "grad_norm": 1.557902455329895, + "learning_rate": 2.7619778078704683e-06, + "loss": 0.926, + "step": 3060 + }, + { + "epoch": 2.850093109869646, + "grad_norm": 1.6351616382598877, + "learning_rate": 2.760746574726025e-06, + "loss": 0.9849, + "step": 3061 + }, + { + "epoch": 2.851024208566108, + "grad_norm": 1.630013346672058, + "learning_rate": 2.7595152776387283e-06, + "loss": 0.9583, + "step": 3062 + }, + { + "epoch": 2.85195530726257, + "grad_norm": 1.649207592010498, + "learning_rate": 2.758283916910532e-06, + "loss": 0.9624, + "step": 3063 + }, + { + "epoch": 2.8528864059590315, + "grad_norm": 1.732132077217102, + "learning_rate": 2.757052492843401e-06, + "loss": 0.9508, + "step": 3064 + }, + { + "epoch": 2.8538175046554937, + "grad_norm": 1.614696979522705, + "learning_rate": 2.755821005739318e-06, + "loss": 0.9374, + "step": 3065 + }, + { + "epoch": 2.8547486033519553, + "grad_norm": 1.610378623008728, + "learning_rate": 2.7545894559002806e-06, + "loss": 0.9471, + "step": 3066 + }, + { + "epoch": 2.855679702048417, + "grad_norm": 1.6785856485366821, + "learning_rate": 2.7533578436283005e-06, + "loss": 0.9765, + "step": 3067 + }, + { + "epoch": 2.856610800744879, + "grad_norm": 1.6911710500717163, + "learning_rate": 2.752126169225407e-06, + "loss": 0.9738, + "step": 3068 + }, + { + "epoch": 2.857541899441341, + "grad_norm": 1.6244384050369263, + "learning_rate": 2.750894432993642e-06, + "loss": 0.9087, + "step": 3069 + }, + { + "epoch": 2.8584729981378025, + "grad_norm": 1.6339046955108643, + "learning_rate": 2.7496626352350662e-06, + "loss": 0.9439, + "step": 3070 + }, + { + "epoch": 2.8594040968342647, + "grad_norm": 1.6325255632400513, + "learning_rate": 2.748430776251751e-06, + "loss": 0.9313, + "step": 3071 + }, + { + "epoch": 2.8603351955307263, + "grad_norm": 1.6449286937713623, + "learning_rate": 2.747198856345786e-06, + "loss": 0.9531, + "step": 3072 + }, + { + "epoch": 2.861266294227188, + "grad_norm": 1.6812504529953003, + "learning_rate": 2.745966875819276e-06, + "loss": 0.9332, + "step": 3073 + }, + { + "epoch": 2.86219739292365, + "grad_norm": 1.6733043193817139, + "learning_rate": 2.744734834974337e-06, + "loss": 0.9592, + "step": 3074 + }, + { + "epoch": 2.863128491620112, + "grad_norm": 1.6880135536193848, + "learning_rate": 2.7435027341131043e-06, + "loss": 0.9909, + "step": 3075 + }, + { + "epoch": 2.8640595903165735, + "grad_norm": 1.7200161218643188, + "learning_rate": 2.7422705735377243e-06, + "loss": 0.9699, + "step": 3076 + }, + { + "epoch": 2.864990689013035, + "grad_norm": 1.6862751245498657, + "learning_rate": 2.7410383535503616e-06, + "loss": 0.9148, + "step": 3077 + }, + { + "epoch": 2.8659217877094973, + "grad_norm": 1.5981451272964478, + "learning_rate": 2.73980607445319e-06, + "loss": 0.9852, + "step": 3078 + }, + { + "epoch": 2.866852886405959, + "grad_norm": 1.6795841455459595, + "learning_rate": 2.738573736548405e-06, + "loss": 0.9464, + "step": 3079 + }, + { + "epoch": 2.8677839851024207, + "grad_norm": 1.5939457416534424, + "learning_rate": 2.7373413401382104e-06, + "loss": 0.9513, + "step": 3080 + }, + { + "epoch": 2.868715083798883, + "grad_norm": 1.6223682165145874, + "learning_rate": 2.736108885524827e-06, + "loss": 0.914, + "step": 3081 + }, + { + "epoch": 2.8696461824953445, + "grad_norm": 1.6074382066726685, + "learning_rate": 2.73487637301049e-06, + "loss": 0.9667, + "step": 3082 + }, + { + "epoch": 2.870577281191806, + "grad_norm": 1.61403489112854, + "learning_rate": 2.7336438028974465e-06, + "loss": 0.9254, + "step": 3083 + }, + { + "epoch": 2.871508379888268, + "grad_norm": 1.6451538801193237, + "learning_rate": 2.732411175487963e-06, + "loss": 0.9691, + "step": 3084 + }, + { + "epoch": 2.87243947858473, + "grad_norm": 1.6541868448257446, + "learning_rate": 2.7311784910843135e-06, + "loss": 0.9379, + "step": 3085 + }, + { + "epoch": 2.8733705772811917, + "grad_norm": 1.7156344652175903, + "learning_rate": 2.72994574998879e-06, + "loss": 0.9369, + "step": 3086 + }, + { + "epoch": 2.8743016759776534, + "grad_norm": 1.7009023427963257, + "learning_rate": 2.7287129525036977e-06, + "loss": 0.9635, + "step": 3087 + }, + { + "epoch": 2.8752327746741155, + "grad_norm": 1.588074803352356, + "learning_rate": 2.7274800989313557e-06, + "loss": 0.8856, + "step": 3088 + }, + { + "epoch": 2.876163873370577, + "grad_norm": 1.5667760372161865, + "learning_rate": 2.726247189574095e-06, + "loss": 0.9399, + "step": 3089 + }, + { + "epoch": 2.877094972067039, + "grad_norm": 1.6495251655578613, + "learning_rate": 2.7250142247342637e-06, + "loss": 0.9757, + "step": 3090 + }, + { + "epoch": 2.878026070763501, + "grad_norm": 1.6756880283355713, + "learning_rate": 2.7237812047142204e-06, + "loss": 0.9813, + "step": 3091 + }, + { + "epoch": 2.8789571694599627, + "grad_norm": 1.696001648902893, + "learning_rate": 2.7225481298163388e-06, + "loss": 0.9784, + "step": 3092 + }, + { + "epoch": 2.8798882681564244, + "grad_norm": 1.6369796991348267, + "learning_rate": 2.7213150003430054e-06, + "loss": 0.9286, + "step": 3093 + }, + { + "epoch": 2.8808193668528865, + "grad_norm": 1.6162023544311523, + "learning_rate": 2.7200818165966213e-06, + "loss": 0.9484, + "step": 3094 + }, + { + "epoch": 2.881750465549348, + "grad_norm": 1.6477746963500977, + "learning_rate": 2.718848578879599e-06, + "loss": 0.9389, + "step": 3095 + }, + { + "epoch": 2.88268156424581, + "grad_norm": 1.6024819612503052, + "learning_rate": 2.7176152874943667e-06, + "loss": 0.9621, + "step": 3096 + }, + { + "epoch": 2.883612662942272, + "grad_norm": 1.5970337390899658, + "learning_rate": 2.7163819427433623e-06, + "loss": 0.9418, + "step": 3097 + }, + { + "epoch": 2.8845437616387337, + "grad_norm": 1.6287981271743774, + "learning_rate": 2.7151485449290405e-06, + "loss": 0.973, + "step": 3098 + }, + { + "epoch": 2.8854748603351954, + "grad_norm": 1.624208927154541, + "learning_rate": 2.7139150943538657e-06, + "loss": 0.9561, + "step": 3099 + }, + { + "epoch": 2.8864059590316575, + "grad_norm": 1.6084685325622559, + "learning_rate": 2.712681591320318e-06, + "loss": 0.9476, + "step": 3100 + }, + { + "epoch": 2.887337057728119, + "grad_norm": 1.651850700378418, + "learning_rate": 2.7114480361308892e-06, + "loss": 0.9776, + "step": 3101 + }, + { + "epoch": 2.888268156424581, + "grad_norm": 1.6913796663284302, + "learning_rate": 2.7102144290880834e-06, + "loss": 0.9782, + "step": 3102 + }, + { + "epoch": 2.889199255121043, + "grad_norm": 1.6258907318115234, + "learning_rate": 2.7089807704944184e-06, + "loss": 0.9245, + "step": 3103 + }, + { + "epoch": 2.8901303538175047, + "grad_norm": 1.6570173501968384, + "learning_rate": 2.707747060652424e-06, + "loss": 0.9823, + "step": 3104 + }, + { + "epoch": 2.8910614525139664, + "grad_norm": 1.649380087852478, + "learning_rate": 2.7065132998646414e-06, + "loss": 0.9363, + "step": 3105 + }, + { + "epoch": 2.8919925512104285, + "grad_norm": 1.6097255945205688, + "learning_rate": 2.7052794884336282e-06, + "loss": 0.9232, + "step": 3106 + }, + { + "epoch": 2.89292364990689, + "grad_norm": 1.6660614013671875, + "learning_rate": 2.70404562666195e-06, + "loss": 0.9478, + "step": 3107 + }, + { + "epoch": 2.893854748603352, + "grad_norm": 1.636584758758545, + "learning_rate": 2.7028117148521864e-06, + "loss": 0.9156, + "step": 3108 + }, + { + "epoch": 2.894785847299814, + "grad_norm": 1.614495873451233, + "learning_rate": 2.70157775330693e-06, + "loss": 0.9213, + "step": 3109 + }, + { + "epoch": 2.8957169459962757, + "grad_norm": 1.6499046087265015, + "learning_rate": 2.700343742328786e-06, + "loss": 0.9381, + "step": 3110 + }, + { + "epoch": 2.8966480446927374, + "grad_norm": 1.6998428106307983, + "learning_rate": 2.69910968222037e-06, + "loss": 0.9828, + "step": 3111 + }, + { + "epoch": 2.8975791433891995, + "grad_norm": 1.638384222984314, + "learning_rate": 2.6978755732843086e-06, + "loss": 0.9534, + "step": 3112 + }, + { + "epoch": 2.898510242085661, + "grad_norm": 1.6330877542495728, + "learning_rate": 2.696641415823246e-06, + "loss": 0.9542, + "step": 3113 + }, + { + "epoch": 2.899441340782123, + "grad_norm": 1.6594738960266113, + "learning_rate": 2.6954072101398305e-06, + "loss": 0.9036, + "step": 3114 + }, + { + "epoch": 2.9003724394785846, + "grad_norm": 1.6305596828460693, + "learning_rate": 2.6941729565367285e-06, + "loss": 0.9041, + "step": 3115 + }, + { + "epoch": 2.9013035381750467, + "grad_norm": 1.641000509262085, + "learning_rate": 2.6929386553166165e-06, + "loss": 0.9449, + "step": 3116 + }, + { + "epoch": 2.9022346368715084, + "grad_norm": 1.6552766561508179, + "learning_rate": 2.6917043067821796e-06, + "loss": 0.9586, + "step": 3117 + }, + { + "epoch": 2.90316573556797, + "grad_norm": 1.6130967140197754, + "learning_rate": 2.6904699112361195e-06, + "loss": 0.9568, + "step": 3118 + }, + { + "epoch": 2.9040968342644318, + "grad_norm": 1.5935466289520264, + "learning_rate": 2.6892354689811445e-06, + "loss": 0.9716, + "step": 3119 + }, + { + "epoch": 2.905027932960894, + "grad_norm": 1.7267063856124878, + "learning_rate": 2.688000980319979e-06, + "loss": 1.0024, + "step": 3120 + }, + { + "epoch": 2.9059590316573556, + "grad_norm": 1.7677205801010132, + "learning_rate": 2.686766445555354e-06, + "loss": 0.9305, + "step": 3121 + }, + { + "epoch": 2.9068901303538173, + "grad_norm": 1.6599723100662231, + "learning_rate": 2.6855318649900175e-06, + "loss": 0.9838, + "step": 3122 + }, + { + "epoch": 2.9078212290502794, + "grad_norm": 1.576768159866333, + "learning_rate": 2.684297238926723e-06, + "loss": 0.9306, + "step": 3123 + }, + { + "epoch": 2.908752327746741, + "grad_norm": 1.6890748739242554, + "learning_rate": 2.6830625676682383e-06, + "loss": 0.9659, + "step": 3124 + }, + { + "epoch": 2.9096834264432028, + "grad_norm": 1.6318072080612183, + "learning_rate": 2.6818278515173417e-06, + "loss": 0.952, + "step": 3125 + }, + { + "epoch": 2.910614525139665, + "grad_norm": 1.61650812625885, + "learning_rate": 2.6805930907768227e-06, + "loss": 0.9411, + "step": 3126 + }, + { + "epoch": 2.9115456238361266, + "grad_norm": 1.7172917127609253, + "learning_rate": 2.679358285749482e-06, + "loss": 1.0007, + "step": 3127 + }, + { + "epoch": 2.9124767225325883, + "grad_norm": 1.6232937574386597, + "learning_rate": 2.678123436738129e-06, + "loss": 0.9547, + "step": 3128 + }, + { + "epoch": 2.9134078212290504, + "grad_norm": 1.6304243803024292, + "learning_rate": 2.6768885440455887e-06, + "loss": 0.9458, + "step": 3129 + }, + { + "epoch": 2.914338919925512, + "grad_norm": 1.6008975505828857, + "learning_rate": 2.675653607974691e-06, + "loss": 0.9198, + "step": 3130 + }, + { + "epoch": 2.9152700186219738, + "grad_norm": 1.6223965883255005, + "learning_rate": 2.674418628828279e-06, + "loss": 0.9755, + "step": 3131 + }, + { + "epoch": 2.916201117318436, + "grad_norm": 1.7025574445724487, + "learning_rate": 2.6731836069092083e-06, + "loss": 0.9995, + "step": 3132 + }, + { + "epoch": 2.9171322160148976, + "grad_norm": 1.6130601167678833, + "learning_rate": 2.6719485425203415e-06, + "loss": 0.9468, + "step": 3133 + }, + { + "epoch": 2.9180633147113593, + "grad_norm": 1.6327604055404663, + "learning_rate": 2.6707134359645544e-06, + "loss": 0.9648, + "step": 3134 + }, + { + "epoch": 2.9189944134078214, + "grad_norm": 1.690895915031433, + "learning_rate": 2.6694782875447317e-06, + "loss": 0.931, + "step": 3135 + }, + { + "epoch": 2.919925512104283, + "grad_norm": 1.5851452350616455, + "learning_rate": 2.6682430975637687e-06, + "loss": 0.9232, + "step": 3136 + }, + { + "epoch": 2.9208566108007448, + "grad_norm": 1.7316862344741821, + "learning_rate": 2.6670078663245707e-06, + "loss": 0.9454, + "step": 3137 + }, + { + "epoch": 2.921787709497207, + "grad_norm": 1.651943325996399, + "learning_rate": 2.6657725941300533e-06, + "loss": 0.9533, + "step": 3138 + }, + { + "epoch": 2.9227188081936686, + "grad_norm": 1.6796914339065552, + "learning_rate": 2.664537281283143e-06, + "loss": 0.9699, + "step": 3139 + }, + { + "epoch": 2.9236499068901303, + "grad_norm": 1.6089403629302979, + "learning_rate": 2.663301928086774e-06, + "loss": 0.8837, + "step": 3140 + }, + { + "epoch": 2.9245810055865924, + "grad_norm": 1.6756631135940552, + "learning_rate": 2.662066534843893e-06, + "loss": 0.988, + "step": 3141 + }, + { + "epoch": 2.925512104283054, + "grad_norm": 1.572462558746338, + "learning_rate": 2.6608311018574545e-06, + "loss": 0.9402, + "step": 3142 + }, + { + "epoch": 2.9264432029795158, + "grad_norm": 1.6604962348937988, + "learning_rate": 2.659595629430424e-06, + "loss": 0.9766, + "step": 3143 + }, + { + "epoch": 2.927374301675978, + "grad_norm": 1.6363316774368286, + "learning_rate": 2.658360117865777e-06, + "loss": 0.9537, + "step": 3144 + }, + { + "epoch": 2.9283054003724396, + "grad_norm": 1.5668021440505981, + "learning_rate": 2.6571245674664964e-06, + "loss": 0.95, + "step": 3145 + }, + { + "epoch": 2.9292364990689013, + "grad_norm": 1.707266092300415, + "learning_rate": 2.6558889785355767e-06, + "loss": 0.9538, + "step": 3146 + }, + { + "epoch": 2.9301675977653634, + "grad_norm": 1.636340856552124, + "learning_rate": 2.6546533513760213e-06, + "loss": 0.9557, + "step": 3147 + }, + { + "epoch": 2.931098696461825, + "grad_norm": 1.629612684249878, + "learning_rate": 2.6534176862908434e-06, + "loss": 0.9362, + "step": 3148 + }, + { + "epoch": 2.9320297951582868, + "grad_norm": 1.6107096672058105, + "learning_rate": 2.652181983583064e-06, + "loss": 0.9299, + "step": 3149 + }, + { + "epoch": 2.9329608938547485, + "grad_norm": 1.625532865524292, + "learning_rate": 2.6509462435557155e-06, + "loss": 0.9558, + "step": 3150 + }, + { + "epoch": 2.9338919925512106, + "grad_norm": 1.6218911409378052, + "learning_rate": 2.6497104665118373e-06, + "loss": 0.9378, + "step": 3151 + }, + { + "epoch": 2.9348230912476723, + "grad_norm": 1.6273612976074219, + "learning_rate": 2.6484746527544786e-06, + "loss": 0.9315, + "step": 3152 + }, + { + "epoch": 2.935754189944134, + "grad_norm": 1.5820204019546509, + "learning_rate": 2.6472388025866993e-06, + "loss": 0.9616, + "step": 3153 + }, + { + "epoch": 2.9366852886405956, + "grad_norm": 1.6987463235855103, + "learning_rate": 2.646002916311566e-06, + "loss": 0.9663, + "step": 3154 + }, + { + "epoch": 2.9376163873370578, + "grad_norm": 1.595864176750183, + "learning_rate": 2.6447669942321535e-06, + "loss": 0.9353, + "step": 3155 + }, + { + "epoch": 2.9385474860335195, + "grad_norm": 1.6008108854293823, + "learning_rate": 2.6435310366515497e-06, + "loss": 0.9526, + "step": 3156 + }, + { + "epoch": 2.939478584729981, + "grad_norm": 1.6466643810272217, + "learning_rate": 2.6422950438728454e-06, + "loss": 0.9296, + "step": 3157 + }, + { + "epoch": 2.9404096834264433, + "grad_norm": 1.636741280555725, + "learning_rate": 2.6410590161991463e-06, + "loss": 0.9622, + "step": 3158 + }, + { + "epoch": 2.941340782122905, + "grad_norm": 1.6602981090545654, + "learning_rate": 2.6398229539335597e-06, + "loss": 0.979, + "step": 3159 + }, + { + "epoch": 2.9422718808193666, + "grad_norm": 1.658056378364563, + "learning_rate": 2.6385868573792074e-06, + "loss": 0.9597, + "step": 3160 + }, + { + "epoch": 2.9432029795158288, + "grad_norm": 1.7455765008926392, + "learning_rate": 2.6373507268392167e-06, + "loss": 0.9359, + "step": 3161 + }, + { + "epoch": 2.9441340782122905, + "grad_norm": 1.624936819076538, + "learning_rate": 2.6361145626167227e-06, + "loss": 0.9655, + "step": 3162 + }, + { + "epoch": 2.945065176908752, + "grad_norm": 1.6546506881713867, + "learning_rate": 2.634878365014872e-06, + "loss": 0.95, + "step": 3163 + }, + { + "epoch": 2.9459962756052143, + "grad_norm": 1.6638516187667847, + "learning_rate": 2.633642134336814e-06, + "loss": 0.947, + "step": 3164 + }, + { + "epoch": 2.946927374301676, + "grad_norm": 1.69620943069458, + "learning_rate": 2.632405870885713e-06, + "loss": 1.0029, + "step": 3165 + }, + { + "epoch": 2.9478584729981376, + "grad_norm": 1.6695328950881958, + "learning_rate": 2.6311695749647352e-06, + "loss": 0.9448, + "step": 3166 + }, + { + "epoch": 2.9487895716945998, + "grad_norm": 1.6961241960525513, + "learning_rate": 2.6299332468770583e-06, + "loss": 0.9598, + "step": 3167 + }, + { + "epoch": 2.9497206703910615, + "grad_norm": 1.6267427206039429, + "learning_rate": 2.6286968869258666e-06, + "loss": 0.9482, + "step": 3168 + }, + { + "epoch": 2.950651769087523, + "grad_norm": 1.6254559755325317, + "learning_rate": 2.627460495414352e-06, + "loss": 0.9316, + "step": 3169 + }, + { + "epoch": 2.9515828677839853, + "grad_norm": 1.6306689977645874, + "learning_rate": 2.6262240726457165e-06, + "loss": 0.9421, + "step": 3170 + }, + { + "epoch": 2.952513966480447, + "grad_norm": 1.6224042177200317, + "learning_rate": 2.624987618923166e-06, + "loss": 0.9272, + "step": 3171 + }, + { + "epoch": 2.9534450651769086, + "grad_norm": 1.599687099456787, + "learning_rate": 2.623751134549917e-06, + "loss": 0.9634, + "step": 3172 + }, + { + "epoch": 2.9543761638733708, + "grad_norm": 1.6378819942474365, + "learning_rate": 2.6225146198291915e-06, + "loss": 0.95, + "step": 3173 + }, + { + "epoch": 2.9553072625698324, + "grad_norm": 1.5943334102630615, + "learning_rate": 2.6212780750642203e-06, + "loss": 0.9431, + "step": 3174 + }, + { + "epoch": 2.956238361266294, + "grad_norm": 1.6969786882400513, + "learning_rate": 2.6200415005582414e-06, + "loss": 0.9583, + "step": 3175 + }, + { + "epoch": 2.9571694599627563, + "grad_norm": 1.6389068365097046, + "learning_rate": 2.618804896614499e-06, + "loss": 0.9671, + "step": 3176 + }, + { + "epoch": 2.958100558659218, + "grad_norm": 1.6660076379776, + "learning_rate": 2.6175682635362463e-06, + "loss": 0.9643, + "step": 3177 + }, + { + "epoch": 2.9590316573556796, + "grad_norm": 1.6488064527511597, + "learning_rate": 2.616331601626742e-06, + "loss": 0.9624, + "step": 3178 + }, + { + "epoch": 2.9599627560521418, + "grad_norm": 1.6433595418930054, + "learning_rate": 2.615094911189254e-06, + "loss": 0.9304, + "step": 3179 + }, + { + "epoch": 2.9608938547486034, + "grad_norm": 1.6492249965667725, + "learning_rate": 2.6138581925270536e-06, + "loss": 0.9602, + "step": 3180 + }, + { + "epoch": 2.961824953445065, + "grad_norm": 1.6754956245422363, + "learning_rate": 2.6126214459434223e-06, + "loss": 0.9699, + "step": 3181 + }, + { + "epoch": 2.9627560521415273, + "grad_norm": 1.6222394704818726, + "learning_rate": 2.611384671741647e-06, + "loss": 0.9408, + "step": 3182 + }, + { + "epoch": 2.963687150837989, + "grad_norm": 1.6233813762664795, + "learning_rate": 2.610147870225022e-06, + "loss": 0.9871, + "step": 3183 + }, + { + "epoch": 2.9646182495344506, + "grad_norm": 1.718134880065918, + "learning_rate": 2.608911041696848e-06, + "loss": 0.9432, + "step": 3184 + }, + { + "epoch": 2.9655493482309123, + "grad_norm": 1.6321865320205688, + "learning_rate": 2.607674186460432e-06, + "loss": 0.9787, + "step": 3185 + }, + { + "epoch": 2.9664804469273744, + "grad_norm": 1.7013866901397705, + "learning_rate": 2.6064373048190884e-06, + "loss": 0.9676, + "step": 3186 + }, + { + "epoch": 2.967411545623836, + "grad_norm": 1.6143778562545776, + "learning_rate": 2.605200397076137e-06, + "loss": 0.9424, + "step": 3187 + }, + { + "epoch": 2.968342644320298, + "grad_norm": 1.6556555032730103, + "learning_rate": 2.6039634635349044e-06, + "loss": 0.9698, + "step": 3188 + }, + { + "epoch": 2.9692737430167595, + "grad_norm": 1.6135071516036987, + "learning_rate": 2.602726504498724e-06, + "loss": 0.9193, + "step": 3189 + }, + { + "epoch": 2.9702048417132216, + "grad_norm": 1.6962298154830933, + "learning_rate": 2.6014895202709354e-06, + "loss": 0.9831, + "step": 3190 + }, + { + "epoch": 2.9711359404096833, + "grad_norm": 1.635617733001709, + "learning_rate": 2.600252511154884e-06, + "loss": 0.9779, + "step": 3191 + }, + { + "epoch": 2.972067039106145, + "grad_norm": 1.6644952297210693, + "learning_rate": 2.5990154774539213e-06, + "loss": 0.951, + "step": 3192 + }, + { + "epoch": 2.972998137802607, + "grad_norm": 1.6300113201141357, + "learning_rate": 2.5977784194714036e-06, + "loss": 0.9815, + "step": 3193 + }, + { + "epoch": 2.973929236499069, + "grad_norm": 1.6248083114624023, + "learning_rate": 2.5965413375106965e-06, + "loss": 0.9385, + "step": 3194 + }, + { + "epoch": 2.9748603351955305, + "grad_norm": 1.6800507307052612, + "learning_rate": 2.5953042318751686e-06, + "loss": 0.9469, + "step": 3195 + }, + { + "epoch": 2.9757914338919926, + "grad_norm": 1.644413709640503, + "learning_rate": 2.5940671028681954e-06, + "loss": 0.9257, + "step": 3196 + }, + { + "epoch": 2.9767225325884543, + "grad_norm": 1.654010534286499, + "learning_rate": 2.5928299507931574e-06, + "loss": 0.9792, + "step": 3197 + }, + { + "epoch": 2.977653631284916, + "grad_norm": 1.6688764095306396, + "learning_rate": 2.591592775953442e-06, + "loss": 0.9659, + "step": 3198 + }, + { + "epoch": 2.978584729981378, + "grad_norm": 1.67815363407135, + "learning_rate": 2.5903555786524413e-06, + "loss": 0.9418, + "step": 3199 + }, + { + "epoch": 2.97951582867784, + "grad_norm": 1.6087619066238403, + "learning_rate": 2.5891183591935514e-06, + "loss": 0.9498, + "step": 3200 + }, + { + "epoch": 2.9804469273743015, + "grad_norm": 1.616249442100525, + "learning_rate": 2.587881117880179e-06, + "loss": 0.9735, + "step": 3201 + }, + { + "epoch": 2.9813780260707636, + "grad_norm": 1.6133146286010742, + "learning_rate": 2.5866438550157284e-06, + "loss": 0.9327, + "step": 3202 + }, + { + "epoch": 2.9823091247672253, + "grad_norm": 1.6440492868423462, + "learning_rate": 2.585406570903616e-06, + "loss": 0.9456, + "step": 3203 + }, + { + "epoch": 2.983240223463687, + "grad_norm": 1.6011227369308472, + "learning_rate": 2.5841692658472616e-06, + "loss": 0.9421, + "step": 3204 + }, + { + "epoch": 2.984171322160149, + "grad_norm": 1.6756632328033447, + "learning_rate": 2.5829319401500867e-06, + "loss": 1.0086, + "step": 3205 + }, + { + "epoch": 2.985102420856611, + "grad_norm": 1.5915745496749878, + "learning_rate": 2.581694594115523e-06, + "loss": 0.9242, + "step": 3206 + }, + { + "epoch": 2.9860335195530725, + "grad_norm": 1.6238288879394531, + "learning_rate": 2.5804572280470027e-06, + "loss": 0.973, + "step": 3207 + }, + { + "epoch": 2.9869646182495346, + "grad_norm": 1.5941739082336426, + "learning_rate": 2.5792198422479668e-06, + "loss": 0.9684, + "step": 3208 + }, + { + "epoch": 2.9878957169459963, + "grad_norm": 1.6146596670150757, + "learning_rate": 2.5779824370218575e-06, + "loss": 0.9749, + "step": 3209 + }, + { + "epoch": 2.988826815642458, + "grad_norm": 1.5792685747146606, + "learning_rate": 2.5767450126721254e-06, + "loss": 0.9422, + "step": 3210 + }, + { + "epoch": 2.98975791433892, + "grad_norm": 1.5883413553237915, + "learning_rate": 2.5755075695022223e-06, + "loss": 0.9395, + "step": 3211 + }, + { + "epoch": 2.990689013035382, + "grad_norm": 1.6765646934509277, + "learning_rate": 2.574270107815607e-06, + "loss": 0.9924, + "step": 3212 + }, + { + "epoch": 2.9916201117318435, + "grad_norm": 1.6381527185440063, + "learning_rate": 2.5730326279157426e-06, + "loss": 0.9296, + "step": 3213 + }, + { + "epoch": 2.9925512104283056, + "grad_norm": 1.6947510242462158, + "learning_rate": 2.5717951301060947e-06, + "loss": 0.9788, + "step": 3214 + }, + { + "epoch": 2.9934823091247673, + "grad_norm": 1.7428909540176392, + "learning_rate": 2.5705576146901364e-06, + "loss": 0.9851, + "step": 3215 + }, + { + "epoch": 2.994413407821229, + "grad_norm": 1.6258810758590698, + "learning_rate": 2.5693200819713414e-06, + "loss": 0.9569, + "step": 3216 + }, + { + "epoch": 2.995344506517691, + "grad_norm": 1.6402835845947266, + "learning_rate": 2.5680825322531923e-06, + "loss": 0.9147, + "step": 3217 + }, + { + "epoch": 2.996275605214153, + "grad_norm": 1.5955889225006104, + "learning_rate": 2.566844965839171e-06, + "loss": 0.9239, + "step": 3218 + }, + { + "epoch": 2.9972067039106145, + "grad_norm": 1.5961555242538452, + "learning_rate": 2.5656073830327665e-06, + "loss": 0.9238, + "step": 3219 + }, + { + "epoch": 2.998137802607076, + "grad_norm": 1.5453569889068604, + "learning_rate": 2.5643697841374722e-06, + "loss": 0.928, + "step": 3220 + }, + { + "epoch": 2.9990689013035383, + "grad_norm": 1.69312584400177, + "learning_rate": 2.563132169456782e-06, + "loss": 0.9641, + "step": 3221 + }, + { + "epoch": 3.0, + "grad_norm": 1.7103936672210693, + "learning_rate": 2.5618945392941984e-06, + "loss": 0.9236, + "step": 3222 + }, + { + "epoch": 3.0009310986964617, + "grad_norm": 1.6348861455917358, + "learning_rate": 2.5606568939532243e-06, + "loss": 0.8808, + "step": 3223 + }, + { + "epoch": 3.001862197392924, + "grad_norm": 1.5315836668014526, + "learning_rate": 2.559419233737367e-06, + "loss": 0.8667, + "step": 3224 + }, + { + "epoch": 3.0027932960893855, + "grad_norm": 1.564589500427246, + "learning_rate": 2.558181558950138e-06, + "loss": 0.8484, + "step": 3225 + }, + { + "epoch": 3.003724394785847, + "grad_norm": 1.6595914363861084, + "learning_rate": 2.5569438698950523e-06, + "loss": 0.9022, + "step": 3226 + }, + { + "epoch": 3.0046554934823093, + "grad_norm": 1.5895179510116577, + "learning_rate": 2.5557061668756284e-06, + "loss": 0.9148, + "step": 3227 + }, + { + "epoch": 3.005586592178771, + "grad_norm": 1.7316328287124634, + "learning_rate": 2.5544684501953876e-06, + "loss": 0.9026, + "step": 3228 + }, + { + "epoch": 3.0065176908752327, + "grad_norm": 1.5359153747558594, + "learning_rate": 2.5532307201578548e-06, + "loss": 0.8577, + "step": 3229 + }, + { + "epoch": 3.007448789571695, + "grad_norm": 1.5917328596115112, + "learning_rate": 2.5519929770665596e-06, + "loss": 0.8823, + "step": 3230 + }, + { + "epoch": 3.0083798882681565, + "grad_norm": 1.6393204927444458, + "learning_rate": 2.5507552212250324e-06, + "loss": 0.9357, + "step": 3231 + }, + { + "epoch": 3.009310986964618, + "grad_norm": 1.6072674989700317, + "learning_rate": 2.5495174529368084e-06, + "loss": 0.9027, + "step": 3232 + }, + { + "epoch": 3.01024208566108, + "grad_norm": 1.6603189706802368, + "learning_rate": 2.5482796725054247e-06, + "loss": 0.91, + "step": 3233 + }, + { + "epoch": 3.011173184357542, + "grad_norm": 1.6961463689804077, + "learning_rate": 2.547041880234424e-06, + "loss": 0.9258, + "step": 3234 + }, + { + "epoch": 3.0121042830540037, + "grad_norm": 1.6351066827774048, + "learning_rate": 2.545804076427348e-06, + "loss": 0.8773, + "step": 3235 + }, + { + "epoch": 3.0130353817504654, + "grad_norm": 1.6111769676208496, + "learning_rate": 2.544566261387743e-06, + "loss": 0.8677, + "step": 3236 + }, + { + "epoch": 3.0139664804469275, + "grad_norm": 1.627763032913208, + "learning_rate": 2.5433284354191595e-06, + "loss": 0.9188, + "step": 3237 + }, + { + "epoch": 3.014897579143389, + "grad_norm": 1.680160641670227, + "learning_rate": 2.5420905988251488e-06, + "loss": 0.913, + "step": 3238 + }, + { + "epoch": 3.015828677839851, + "grad_norm": 1.7062358856201172, + "learning_rate": 2.5408527519092656e-06, + "loss": 0.9194, + "step": 3239 + }, + { + "epoch": 3.016759776536313, + "grad_norm": 1.6547439098358154, + "learning_rate": 2.539614894975067e-06, + "loss": 0.8822, + "step": 3240 + }, + { + "epoch": 3.0176908752327747, + "grad_norm": 1.7282648086547852, + "learning_rate": 2.5383770283261126e-06, + "loss": 0.9289, + "step": 3241 + }, + { + "epoch": 3.0186219739292364, + "grad_norm": 1.6317826509475708, + "learning_rate": 2.5371391522659645e-06, + "loss": 0.921, + "step": 3242 + }, + { + "epoch": 3.0195530726256985, + "grad_norm": 1.6277379989624023, + "learning_rate": 2.5359012670981853e-06, + "loss": 0.9567, + "step": 3243 + }, + { + "epoch": 3.02048417132216, + "grad_norm": 1.6117379665374756, + "learning_rate": 2.5346633731263444e-06, + "loss": 0.9047, + "step": 3244 + }, + { + "epoch": 3.021415270018622, + "grad_norm": 1.715893268585205, + "learning_rate": 2.533425470654007e-06, + "loss": 0.9291, + "step": 3245 + }, + { + "epoch": 3.022346368715084, + "grad_norm": 1.680368185043335, + "learning_rate": 2.5321875599847456e-06, + "loss": 0.9226, + "step": 3246 + }, + { + "epoch": 3.0232774674115457, + "grad_norm": 1.5958926677703857, + "learning_rate": 2.530949641422133e-06, + "loss": 0.9247, + "step": 3247 + }, + { + "epoch": 3.0242085661080074, + "grad_norm": 1.6338911056518555, + "learning_rate": 2.529711715269743e-06, + "loss": 0.9021, + "step": 3248 + }, + { + "epoch": 3.0251396648044695, + "grad_norm": 1.6195487976074219, + "learning_rate": 2.5284737818311537e-06, + "loss": 0.8914, + "step": 3249 + }, + { + "epoch": 3.026070763500931, + "grad_norm": 1.6505372524261475, + "learning_rate": 2.527235841409941e-06, + "loss": 0.8979, + "step": 3250 + }, + { + "epoch": 3.027001862197393, + "grad_norm": 1.685255527496338, + "learning_rate": 2.525997894309688e-06, + "loss": 0.9199, + "step": 3251 + }, + { + "epoch": 3.0279329608938546, + "grad_norm": 1.575434923171997, + "learning_rate": 2.5247599408339724e-06, + "loss": 0.867, + "step": 3252 + }, + { + "epoch": 3.0288640595903167, + "grad_norm": 1.7103954553604126, + "learning_rate": 2.523521981286381e-06, + "loss": 0.9454, + "step": 3253 + }, + { + "epoch": 3.0297951582867784, + "grad_norm": 1.6721266508102417, + "learning_rate": 2.5222840159704957e-06, + "loss": 0.9143, + "step": 3254 + }, + { + "epoch": 3.03072625698324, + "grad_norm": 1.69712495803833, + "learning_rate": 2.521046045189905e-06, + "loss": 0.9241, + "step": 3255 + }, + { + "epoch": 3.031657355679702, + "grad_norm": 1.6615443229675293, + "learning_rate": 2.519808069248194e-06, + "loss": 0.8948, + "step": 3256 + }, + { + "epoch": 3.032588454376164, + "grad_norm": 1.701850414276123, + "learning_rate": 2.5185700884489527e-06, + "loss": 0.8984, + "step": 3257 + }, + { + "epoch": 3.0335195530726256, + "grad_norm": 1.635740041732788, + "learning_rate": 2.5173321030957716e-06, + "loss": 0.9336, + "step": 3258 + }, + { + "epoch": 3.0344506517690877, + "grad_norm": 1.6742604970932007, + "learning_rate": 2.51609411349224e-06, + "loss": 0.9297, + "step": 3259 + }, + { + "epoch": 3.0353817504655494, + "grad_norm": 1.7093249559402466, + "learning_rate": 2.514856119941952e-06, + "loss": 0.9176, + "step": 3260 + }, + { + "epoch": 3.036312849162011, + "grad_norm": 1.6714253425598145, + "learning_rate": 2.5136181227484983e-06, + "loss": 0.9162, + "step": 3261 + }, + { + "epoch": 3.037243947858473, + "grad_norm": 1.629442572593689, + "learning_rate": 2.512380122215475e-06, + "loss": 0.9021, + "step": 3262 + }, + { + "epoch": 3.038175046554935, + "grad_norm": 1.6980584859848022, + "learning_rate": 2.5111421186464747e-06, + "loss": 0.9064, + "step": 3263 + }, + { + "epoch": 3.0391061452513966, + "grad_norm": 1.7323260307312012, + "learning_rate": 2.5099041123450948e-06, + "loss": 0.937, + "step": 3264 + }, + { + "epoch": 3.0400372439478587, + "grad_norm": 1.637475848197937, + "learning_rate": 2.50866610361493e-06, + "loss": 0.8936, + "step": 3265 + }, + { + "epoch": 3.0409683426443204, + "grad_norm": 1.6962095499038696, + "learning_rate": 2.507428092759578e-06, + "loss": 0.9293, + "step": 3266 + }, + { + "epoch": 3.041899441340782, + "grad_norm": 1.7129355669021606, + "learning_rate": 2.5061900800826355e-06, + "loss": 0.9075, + "step": 3267 + }, + { + "epoch": 3.0428305400372437, + "grad_norm": 1.664898157119751, + "learning_rate": 2.504952065887701e-06, + "loss": 0.91, + "step": 3268 + }, + { + "epoch": 3.043761638733706, + "grad_norm": 1.734605073928833, + "learning_rate": 2.5037140504783714e-06, + "loss": 0.8936, + "step": 3269 + }, + { + "epoch": 3.0446927374301676, + "grad_norm": 1.7139325141906738, + "learning_rate": 2.5024760341582455e-06, + "loss": 0.9331, + "step": 3270 + }, + { + "epoch": 3.0456238361266292, + "grad_norm": 1.717179298400879, + "learning_rate": 2.5012380172309224e-06, + "loss": 0.9096, + "step": 3271 + }, + { + "epoch": 3.0465549348230914, + "grad_norm": 1.6623668670654297, + "learning_rate": 2.5e-06, + "loss": 0.9106, + "step": 3272 + }, + { + "epoch": 3.047486033519553, + "grad_norm": 1.6601907014846802, + "learning_rate": 2.4987619827690784e-06, + "loss": 0.8966, + "step": 3273 + }, + { + "epoch": 3.0484171322160147, + "grad_norm": 1.6603847742080688, + "learning_rate": 2.4975239658417557e-06, + "loss": 0.9007, + "step": 3274 + }, + { + "epoch": 3.049348230912477, + "grad_norm": 1.7118208408355713, + "learning_rate": 2.4962859495216295e-06, + "loss": 0.9512, + "step": 3275 + }, + { + "epoch": 3.0502793296089385, + "grad_norm": 1.6912704706192017, + "learning_rate": 2.4950479341123e-06, + "loss": 0.9446, + "step": 3276 + }, + { + "epoch": 3.0512104283054002, + "grad_norm": 1.6684913635253906, + "learning_rate": 2.4938099199173645e-06, + "loss": 0.9156, + "step": 3277 + }, + { + "epoch": 3.0521415270018624, + "grad_norm": 1.657727599143982, + "learning_rate": 2.492571907240423e-06, + "loss": 0.9253, + "step": 3278 + }, + { + "epoch": 3.053072625698324, + "grad_norm": 1.6968016624450684, + "learning_rate": 2.4913338963850704e-06, + "loss": 0.8988, + "step": 3279 + }, + { + "epoch": 3.0540037243947857, + "grad_norm": 1.7171710729599, + "learning_rate": 2.490095887654906e-06, + "loss": 0.9049, + "step": 3280 + }, + { + "epoch": 3.054934823091248, + "grad_norm": 1.7257767915725708, + "learning_rate": 2.488857881353526e-06, + "loss": 0.9327, + "step": 3281 + }, + { + "epoch": 3.0558659217877095, + "grad_norm": 1.8294111490249634, + "learning_rate": 2.4876198777845263e-06, + "loss": 0.9482, + "step": 3282 + }, + { + "epoch": 3.0567970204841712, + "grad_norm": 1.726961612701416, + "learning_rate": 2.486381877251502e-06, + "loss": 0.9213, + "step": 3283 + }, + { + "epoch": 3.0577281191806334, + "grad_norm": 1.6841015815734863, + "learning_rate": 2.485143880058049e-06, + "loss": 0.9021, + "step": 3284 + }, + { + "epoch": 3.058659217877095, + "grad_norm": 1.6948037147521973, + "learning_rate": 2.4839058865077607e-06, + "loss": 0.9414, + "step": 3285 + }, + { + "epoch": 3.0595903165735567, + "grad_norm": 1.6763288974761963, + "learning_rate": 2.4826678969042292e-06, + "loss": 0.9183, + "step": 3286 + }, + { + "epoch": 3.0605214152700184, + "grad_norm": 1.658308982849121, + "learning_rate": 2.4814299115510477e-06, + "loss": 0.8916, + "step": 3287 + }, + { + "epoch": 3.0614525139664805, + "grad_norm": 1.7754426002502441, + "learning_rate": 2.480191930751807e-06, + "loss": 0.8561, + "step": 3288 + }, + { + "epoch": 3.0623836126629422, + "grad_norm": 1.7293765544891357, + "learning_rate": 2.478953954810096e-06, + "loss": 0.9501, + "step": 3289 + }, + { + "epoch": 3.063314711359404, + "grad_norm": 1.6309765577316284, + "learning_rate": 2.4777159840295047e-06, + "loss": 0.9146, + "step": 3290 + }, + { + "epoch": 3.064245810055866, + "grad_norm": 1.7002781629562378, + "learning_rate": 2.47647801871362e-06, + "loss": 0.9249, + "step": 3291 + }, + { + "epoch": 3.0651769087523277, + "grad_norm": 1.694254755973816, + "learning_rate": 2.4752400591660284e-06, + "loss": 0.9045, + "step": 3292 + }, + { + "epoch": 3.0661080074487894, + "grad_norm": 1.6246881484985352, + "learning_rate": 2.474002105690313e-06, + "loss": 0.9315, + "step": 3293 + }, + { + "epoch": 3.0670391061452515, + "grad_norm": 1.660295009613037, + "learning_rate": 2.472764158590059e-06, + "loss": 0.8825, + "step": 3294 + }, + { + "epoch": 3.0679702048417132, + "grad_norm": 1.688385248184204, + "learning_rate": 2.4715262181688475e-06, + "loss": 0.9207, + "step": 3295 + }, + { + "epoch": 3.068901303538175, + "grad_norm": 1.634578824043274, + "learning_rate": 2.4702882847302573e-06, + "loss": 0.9101, + "step": 3296 + }, + { + "epoch": 3.069832402234637, + "grad_norm": 1.6727747917175293, + "learning_rate": 2.469050358577867e-06, + "loss": 0.9342, + "step": 3297 + }, + { + "epoch": 3.0707635009310987, + "grad_norm": 1.6537649631500244, + "learning_rate": 2.467812440015255e-06, + "loss": 0.8723, + "step": 3298 + }, + { + "epoch": 3.0716945996275604, + "grad_norm": 1.6903131008148193, + "learning_rate": 2.466574529345994e-06, + "loss": 0.9111, + "step": 3299 + }, + { + "epoch": 3.0726256983240225, + "grad_norm": 1.656028389930725, + "learning_rate": 2.465336626873657e-06, + "loss": 0.904, + "step": 3300 + }, + { + "epoch": 3.0735567970204842, + "grad_norm": 1.6640174388885498, + "learning_rate": 2.4640987329018147e-06, + "loss": 0.9014, + "step": 3301 + }, + { + "epoch": 3.074487895716946, + "grad_norm": 1.6518352031707764, + "learning_rate": 2.4628608477340368e-06, + "loss": 0.8969, + "step": 3302 + }, + { + "epoch": 3.0754189944134076, + "grad_norm": 1.6160846948623657, + "learning_rate": 2.461622971673888e-06, + "loss": 0.9276, + "step": 3303 + }, + { + "epoch": 3.0763500931098697, + "grad_norm": 1.7019175291061401, + "learning_rate": 2.4603851050249327e-06, + "loss": 0.9182, + "step": 3304 + }, + { + "epoch": 3.0772811918063314, + "grad_norm": 1.6947954893112183, + "learning_rate": 2.4591472480907348e-06, + "loss": 0.9317, + "step": 3305 + }, + { + "epoch": 3.078212290502793, + "grad_norm": 1.66879403591156, + "learning_rate": 2.4579094011748517e-06, + "loss": 0.8858, + "step": 3306 + }, + { + "epoch": 3.0791433891992552, + "grad_norm": 1.6757324934005737, + "learning_rate": 2.4566715645808413e-06, + "loss": 0.8714, + "step": 3307 + }, + { + "epoch": 3.080074487895717, + "grad_norm": 1.6933484077453613, + "learning_rate": 2.4554337386122575e-06, + "loss": 0.9289, + "step": 3308 + }, + { + "epoch": 3.0810055865921786, + "grad_norm": 1.6270999908447266, + "learning_rate": 2.4541959235726534e-06, + "loss": 0.866, + "step": 3309 + }, + { + "epoch": 3.0819366852886407, + "grad_norm": 1.6904125213623047, + "learning_rate": 2.452958119765577e-06, + "loss": 0.938, + "step": 3310 + }, + { + "epoch": 3.0828677839851024, + "grad_norm": 1.825324296951294, + "learning_rate": 2.451720327494575e-06, + "loss": 0.9581, + "step": 3311 + }, + { + "epoch": 3.083798882681564, + "grad_norm": 1.7476593255996704, + "learning_rate": 2.450482547063193e-06, + "loss": 0.8942, + "step": 3312 + }, + { + "epoch": 3.0847299813780262, + "grad_norm": 1.7949507236480713, + "learning_rate": 2.4492447787749684e-06, + "loss": 0.9288, + "step": 3313 + }, + { + "epoch": 3.085661080074488, + "grad_norm": 1.719113826751709, + "learning_rate": 2.4480070229334413e-06, + "loss": 0.8886, + "step": 3314 + }, + { + "epoch": 3.0865921787709496, + "grad_norm": 1.7233141660690308, + "learning_rate": 2.446769279842145e-06, + "loss": 0.9106, + "step": 3315 + }, + { + "epoch": 3.0875232774674117, + "grad_norm": 1.7058930397033691, + "learning_rate": 2.4455315498046132e-06, + "loss": 0.8962, + "step": 3316 + }, + { + "epoch": 3.0884543761638734, + "grad_norm": 1.724055290222168, + "learning_rate": 2.4442938331243724e-06, + "loss": 0.8723, + "step": 3317 + }, + { + "epoch": 3.089385474860335, + "grad_norm": 1.727042555809021, + "learning_rate": 2.443056130104948e-06, + "loss": 0.9233, + "step": 3318 + }, + { + "epoch": 3.0903165735567972, + "grad_norm": 1.670355200767517, + "learning_rate": 2.441818441049863e-06, + "loss": 0.911, + "step": 3319 + }, + { + "epoch": 3.091247672253259, + "grad_norm": 1.7252628803253174, + "learning_rate": 2.440580766262634e-06, + "loss": 0.9492, + "step": 3320 + }, + { + "epoch": 3.0921787709497206, + "grad_norm": 1.6504132747650146, + "learning_rate": 2.4393431060467765e-06, + "loss": 0.919, + "step": 3321 + }, + { + "epoch": 3.0931098696461823, + "grad_norm": 1.7550840377807617, + "learning_rate": 2.438105460705803e-06, + "loss": 0.951, + "step": 3322 + }, + { + "epoch": 3.0940409683426444, + "grad_norm": 1.6968220472335815, + "learning_rate": 2.4368678305432182e-06, + "loss": 0.9516, + "step": 3323 + }, + { + "epoch": 3.094972067039106, + "grad_norm": 1.6675012111663818, + "learning_rate": 2.435630215862529e-06, + "loss": 0.8952, + "step": 3324 + }, + { + "epoch": 3.095903165735568, + "grad_norm": 1.6590911149978638, + "learning_rate": 2.434392616967234e-06, + "loss": 0.9139, + "step": 3325 + }, + { + "epoch": 3.09683426443203, + "grad_norm": 1.7370678186416626, + "learning_rate": 2.4331550341608304e-06, + "loss": 0.9367, + "step": 3326 + }, + { + "epoch": 3.0977653631284916, + "grad_norm": 1.6874971389770508, + "learning_rate": 2.431917467746809e-06, + "loss": 0.9108, + "step": 3327 + }, + { + "epoch": 3.0986964618249533, + "grad_norm": 1.65764582157135, + "learning_rate": 2.430679918028659e-06, + "loss": 0.8995, + "step": 3328 + }, + { + "epoch": 3.0996275605214154, + "grad_norm": 1.7074867486953735, + "learning_rate": 2.4294423853098653e-06, + "loss": 0.9226, + "step": 3329 + }, + { + "epoch": 3.100558659217877, + "grad_norm": 1.715795874595642, + "learning_rate": 2.4282048698939066e-06, + "loss": 0.9263, + "step": 3330 + }, + { + "epoch": 3.101489757914339, + "grad_norm": 1.6853233575820923, + "learning_rate": 2.426967372084258e-06, + "loss": 0.9209, + "step": 3331 + }, + { + "epoch": 3.102420856610801, + "grad_norm": 1.7638869285583496, + "learning_rate": 2.4257298921843935e-06, + "loss": 0.9018, + "step": 3332 + }, + { + "epoch": 3.1033519553072626, + "grad_norm": 1.7023661136627197, + "learning_rate": 2.4244924304977785e-06, + "loss": 0.8811, + "step": 3333 + }, + { + "epoch": 3.1042830540037243, + "grad_norm": 1.6923280954360962, + "learning_rate": 2.423254987327875e-06, + "loss": 0.9129, + "step": 3334 + }, + { + "epoch": 3.1052141527001864, + "grad_norm": 1.7066457271575928, + "learning_rate": 2.4220175629781425e-06, + "loss": 0.9022, + "step": 3335 + }, + { + "epoch": 3.106145251396648, + "grad_norm": 1.6826326847076416, + "learning_rate": 2.4207801577520345e-06, + "loss": 0.9095, + "step": 3336 + }, + { + "epoch": 3.10707635009311, + "grad_norm": 1.756100058555603, + "learning_rate": 2.4195427719529977e-06, + "loss": 0.9269, + "step": 3337 + }, + { + "epoch": 3.1080074487895715, + "grad_norm": 1.6395617723464966, + "learning_rate": 2.4183054058844775e-06, + "loss": 0.8798, + "step": 3338 + }, + { + "epoch": 3.1089385474860336, + "grad_norm": 1.6417088508605957, + "learning_rate": 2.4170680598499137e-06, + "loss": 0.9377, + "step": 3339 + }, + { + "epoch": 3.1098696461824953, + "grad_norm": 1.7448179721832275, + "learning_rate": 2.4158307341527396e-06, + "loss": 0.9021, + "step": 3340 + }, + { + "epoch": 3.110800744878957, + "grad_norm": 1.7119693756103516, + "learning_rate": 2.4145934290963842e-06, + "loss": 0.9027, + "step": 3341 + }, + { + "epoch": 3.111731843575419, + "grad_norm": 1.646247148513794, + "learning_rate": 2.413356144984272e-06, + "loss": 0.8591, + "step": 3342 + }, + { + "epoch": 3.112662942271881, + "grad_norm": 1.7105697393417358, + "learning_rate": 2.412118882119823e-06, + "loss": 0.91, + "step": 3343 + }, + { + "epoch": 3.1135940409683425, + "grad_norm": 1.656224250793457, + "learning_rate": 2.410881640806449e-06, + "loss": 0.9112, + "step": 3344 + }, + { + "epoch": 3.1145251396648046, + "grad_norm": 1.6878925561904907, + "learning_rate": 2.4096444213475595e-06, + "loss": 0.9162, + "step": 3345 + }, + { + "epoch": 3.1154562383612663, + "grad_norm": 1.707326889038086, + "learning_rate": 2.4084072240465585e-06, + "loss": 0.9485, + "step": 3346 + }, + { + "epoch": 3.116387337057728, + "grad_norm": 1.7247391939163208, + "learning_rate": 2.407170049206843e-06, + "loss": 0.8914, + "step": 3347 + }, + { + "epoch": 3.11731843575419, + "grad_norm": 1.6792007684707642, + "learning_rate": 2.4059328971318054e-06, + "loss": 0.926, + "step": 3348 + }, + { + "epoch": 3.118249534450652, + "grad_norm": 1.7814252376556396, + "learning_rate": 2.4046957681248314e-06, + "loss": 0.8685, + "step": 3349 + }, + { + "epoch": 3.1191806331471135, + "grad_norm": 1.6012327671051025, + "learning_rate": 2.403458662489304e-06, + "loss": 0.8687, + "step": 3350 + }, + { + "epoch": 3.1201117318435756, + "grad_norm": 1.6993153095245361, + "learning_rate": 2.4022215805285973e-06, + "loss": 0.9373, + "step": 3351 + }, + { + "epoch": 3.1210428305400373, + "grad_norm": 1.707701325416565, + "learning_rate": 2.4009845225460795e-06, + "loss": 0.923, + "step": 3352 + }, + { + "epoch": 3.121973929236499, + "grad_norm": 1.6501392126083374, + "learning_rate": 2.3997474888451165e-06, + "loss": 0.8701, + "step": 3353 + }, + { + "epoch": 3.122905027932961, + "grad_norm": 1.6571564674377441, + "learning_rate": 2.3985104797290654e-06, + "loss": 0.8881, + "step": 3354 + }, + { + "epoch": 3.123836126629423, + "grad_norm": 1.6818349361419678, + "learning_rate": 2.3972734955012766e-06, + "loss": 0.8899, + "step": 3355 + }, + { + "epoch": 3.1247672253258845, + "grad_norm": 1.6249483823776245, + "learning_rate": 2.396036536465096e-06, + "loss": 0.8891, + "step": 3356 + }, + { + "epoch": 3.1256983240223466, + "grad_norm": 1.73823881149292, + "learning_rate": 2.394799602923864e-06, + "loss": 0.9463, + "step": 3357 + }, + { + "epoch": 3.1266294227188083, + "grad_norm": 1.72157621383667, + "learning_rate": 2.393562695180913e-06, + "loss": 0.8692, + "step": 3358 + }, + { + "epoch": 3.12756052141527, + "grad_norm": 1.64155912399292, + "learning_rate": 2.3923258135395688e-06, + "loss": 0.8962, + "step": 3359 + }, + { + "epoch": 3.1284916201117317, + "grad_norm": 1.783261775970459, + "learning_rate": 2.3910889583031533e-06, + "loss": 0.9376, + "step": 3360 + }, + { + "epoch": 3.129422718808194, + "grad_norm": 1.684281349182129, + "learning_rate": 2.3898521297749785e-06, + "loss": 0.8981, + "step": 3361 + }, + { + "epoch": 3.1303538175046555, + "grad_norm": 1.691046118736267, + "learning_rate": 2.388615328258354e-06, + "loss": 0.9489, + "step": 3362 + }, + { + "epoch": 3.131284916201117, + "grad_norm": 1.683213710784912, + "learning_rate": 2.387378554056578e-06, + "loss": 0.9331, + "step": 3363 + }, + { + "epoch": 3.1322160148975793, + "grad_norm": 1.6740621328353882, + "learning_rate": 2.3861418074729477e-06, + "loss": 0.9037, + "step": 3364 + }, + { + "epoch": 3.133147113594041, + "grad_norm": 1.697553277015686, + "learning_rate": 2.384905088810747e-06, + "loss": 0.9186, + "step": 3365 + }, + { + "epoch": 3.1340782122905027, + "grad_norm": 1.6715878248214722, + "learning_rate": 2.3836683983732583e-06, + "loss": 0.9397, + "step": 3366 + }, + { + "epoch": 3.135009310986965, + "grad_norm": 1.7044919729232788, + "learning_rate": 2.382431736463755e-06, + "loss": 0.9084, + "step": 3367 + }, + { + "epoch": 3.1359404096834265, + "grad_norm": 1.687350869178772, + "learning_rate": 2.3811951033855015e-06, + "loss": 0.9025, + "step": 3368 + }, + { + "epoch": 3.136871508379888, + "grad_norm": 1.6968352794647217, + "learning_rate": 2.3799584994417594e-06, + "loss": 0.8827, + "step": 3369 + }, + { + "epoch": 3.1378026070763503, + "grad_norm": 1.6833912134170532, + "learning_rate": 2.3787219249357805e-06, + "loss": 0.945, + "step": 3370 + }, + { + "epoch": 3.138733705772812, + "grad_norm": 1.6953020095825195, + "learning_rate": 2.3774853801708097e-06, + "loss": 0.949, + "step": 3371 + }, + { + "epoch": 3.1396648044692737, + "grad_norm": 1.7083728313446045, + "learning_rate": 2.3762488654500836e-06, + "loss": 0.8975, + "step": 3372 + }, + { + "epoch": 3.1405959031657353, + "grad_norm": 1.6619200706481934, + "learning_rate": 2.3750123810768344e-06, + "loss": 0.8989, + "step": 3373 + }, + { + "epoch": 3.1415270018621975, + "grad_norm": 1.684128761291504, + "learning_rate": 2.3737759273542843e-06, + "loss": 0.9292, + "step": 3374 + }, + { + "epoch": 3.142458100558659, + "grad_norm": 1.6813548803329468, + "learning_rate": 2.372539504585648e-06, + "loss": 0.9007, + "step": 3375 + }, + { + "epoch": 3.143389199255121, + "grad_norm": 1.667564034461975, + "learning_rate": 2.371303113074134e-06, + "loss": 0.8933, + "step": 3376 + }, + { + "epoch": 3.144320297951583, + "grad_norm": 1.7173337936401367, + "learning_rate": 2.370066753122942e-06, + "loss": 0.904, + "step": 3377 + }, + { + "epoch": 3.1452513966480447, + "grad_norm": 1.646213173866272, + "learning_rate": 2.368830425035266e-06, + "loss": 0.8946, + "step": 3378 + }, + { + "epoch": 3.1461824953445063, + "grad_norm": 1.709409236907959, + "learning_rate": 2.367594129114288e-06, + "loss": 0.9175, + "step": 3379 + }, + { + "epoch": 3.1471135940409685, + "grad_norm": 1.6985713243484497, + "learning_rate": 2.366357865663186e-06, + "loss": 0.9487, + "step": 3380 + }, + { + "epoch": 3.14804469273743, + "grad_norm": 1.693822979927063, + "learning_rate": 2.3651216349851297e-06, + "loss": 0.9266, + "step": 3381 + }, + { + "epoch": 3.148975791433892, + "grad_norm": 1.6887474060058594, + "learning_rate": 2.363885437383278e-06, + "loss": 0.921, + "step": 3382 + }, + { + "epoch": 3.149906890130354, + "grad_norm": 1.6518738269805908, + "learning_rate": 2.362649273160784e-06, + "loss": 0.8776, + "step": 3383 + }, + { + "epoch": 3.1508379888268156, + "grad_norm": 1.7738643884658813, + "learning_rate": 2.361413142620793e-06, + "loss": 0.9344, + "step": 3384 + }, + { + "epoch": 3.1517690875232773, + "grad_norm": 1.6904795169830322, + "learning_rate": 2.3601770460664415e-06, + "loss": 0.9688, + "step": 3385 + }, + { + "epoch": 3.1527001862197395, + "grad_norm": 1.626943588256836, + "learning_rate": 2.358940983800855e-06, + "loss": 0.8789, + "step": 3386 + }, + { + "epoch": 3.153631284916201, + "grad_norm": 1.6936602592468262, + "learning_rate": 2.3577049561271545e-06, + "loss": 0.9073, + "step": 3387 + }, + { + "epoch": 3.154562383612663, + "grad_norm": 1.6696503162384033, + "learning_rate": 2.3564689633484515e-06, + "loss": 0.8763, + "step": 3388 + }, + { + "epoch": 3.155493482309125, + "grad_norm": 1.7532196044921875, + "learning_rate": 2.3552330057678473e-06, + "loss": 0.9078, + "step": 3389 + }, + { + "epoch": 3.1564245810055866, + "grad_norm": 1.7158058881759644, + "learning_rate": 2.353997083688435e-06, + "loss": 0.8885, + "step": 3390 + }, + { + "epoch": 3.1573556797020483, + "grad_norm": 1.6331466436386108, + "learning_rate": 2.3527611974133016e-06, + "loss": 0.9263, + "step": 3391 + }, + { + "epoch": 3.1582867783985105, + "grad_norm": 1.708320140838623, + "learning_rate": 2.351525347245522e-06, + "loss": 0.9185, + "step": 3392 + }, + { + "epoch": 3.159217877094972, + "grad_norm": 1.6720097064971924, + "learning_rate": 2.3502895334881635e-06, + "loss": 0.9187, + "step": 3393 + }, + { + "epoch": 3.160148975791434, + "grad_norm": 1.7563456296920776, + "learning_rate": 2.349053756444285e-06, + "loss": 0.9286, + "step": 3394 + }, + { + "epoch": 3.1610800744878955, + "grad_norm": 1.7602083683013916, + "learning_rate": 2.3478180164169366e-06, + "loss": 0.9178, + "step": 3395 + }, + { + "epoch": 3.1620111731843576, + "grad_norm": 1.681713342666626, + "learning_rate": 2.3465823137091574e-06, + "loss": 0.9244, + "step": 3396 + }, + { + "epoch": 3.1629422718808193, + "grad_norm": 1.6967233419418335, + "learning_rate": 2.3453466486239783e-06, + "loss": 0.9221, + "step": 3397 + }, + { + "epoch": 3.163873370577281, + "grad_norm": 1.7396472692489624, + "learning_rate": 2.3441110214644246e-06, + "loss": 0.917, + "step": 3398 + }, + { + "epoch": 3.164804469273743, + "grad_norm": 1.7534763813018799, + "learning_rate": 2.3428754325335044e-06, + "loss": 0.9176, + "step": 3399 + }, + { + "epoch": 3.165735567970205, + "grad_norm": 1.7043342590332031, + "learning_rate": 2.341639882134224e-06, + "loss": 0.9037, + "step": 3400 + }, + { + "epoch": 3.1666666666666665, + "grad_norm": 1.685696005821228, + "learning_rate": 2.340404370569576e-06, + "loss": 0.904, + "step": 3401 + }, + { + "epoch": 3.1675977653631286, + "grad_norm": 1.7363678216934204, + "learning_rate": 2.3391688981425464e-06, + "loss": 0.9145, + "step": 3402 + }, + { + "epoch": 3.1685288640595903, + "grad_norm": 1.6908007860183716, + "learning_rate": 2.337933465156108e-06, + "loss": 0.8958, + "step": 3403 + }, + { + "epoch": 3.169459962756052, + "grad_norm": 1.7950772047042847, + "learning_rate": 2.3366980719132268e-06, + "loss": 0.915, + "step": 3404 + }, + { + "epoch": 3.170391061452514, + "grad_norm": 1.7826727628707886, + "learning_rate": 2.3354627187168584e-06, + "loss": 0.8764, + "step": 3405 + }, + { + "epoch": 3.171322160148976, + "grad_norm": 1.6985899209976196, + "learning_rate": 2.3342274058699475e-06, + "loss": 0.9165, + "step": 3406 + }, + { + "epoch": 3.1722532588454375, + "grad_norm": 1.7099257707595825, + "learning_rate": 2.33299213367543e-06, + "loss": 0.877, + "step": 3407 + }, + { + "epoch": 3.1731843575418996, + "grad_norm": 1.7201907634735107, + "learning_rate": 2.3317569024362317e-06, + "loss": 0.9108, + "step": 3408 + }, + { + "epoch": 3.1741154562383613, + "grad_norm": 1.7022032737731934, + "learning_rate": 2.3305217124552696e-06, + "loss": 0.89, + "step": 3409 + }, + { + "epoch": 3.175046554934823, + "grad_norm": 1.7690213918685913, + "learning_rate": 2.329286564035446e-06, + "loss": 0.9562, + "step": 3410 + }, + { + "epoch": 3.1759776536312847, + "grad_norm": 1.7318713665008545, + "learning_rate": 2.3280514574796593e-06, + "loss": 0.9034, + "step": 3411 + }, + { + "epoch": 3.176908752327747, + "grad_norm": 1.6821590662002563, + "learning_rate": 2.3268163930907934e-06, + "loss": 0.9395, + "step": 3412 + }, + { + "epoch": 3.1778398510242085, + "grad_norm": 1.6919987201690674, + "learning_rate": 2.3255813711717216e-06, + "loss": 0.8768, + "step": 3413 + }, + { + "epoch": 3.17877094972067, + "grad_norm": 1.6836568117141724, + "learning_rate": 2.3243463920253103e-06, + "loss": 0.9124, + "step": 3414 + }, + { + "epoch": 3.1797020484171323, + "grad_norm": 1.7388001680374146, + "learning_rate": 2.3231114559544117e-06, + "loss": 0.9167, + "step": 3415 + }, + { + "epoch": 3.180633147113594, + "grad_norm": 1.7960084676742554, + "learning_rate": 2.321876563261871e-06, + "loss": 0.9004, + "step": 3416 + }, + { + "epoch": 3.1815642458100557, + "grad_norm": 1.6765440702438354, + "learning_rate": 2.3206417142505187e-06, + "loss": 0.9384, + "step": 3417 + }, + { + "epoch": 3.182495344506518, + "grad_norm": 1.6785004138946533, + "learning_rate": 2.3194069092231777e-06, + "loss": 0.9173, + "step": 3418 + }, + { + "epoch": 3.1834264432029795, + "grad_norm": 1.6729774475097656, + "learning_rate": 2.318172148482659e-06, + "loss": 0.892, + "step": 3419 + }, + { + "epoch": 3.184357541899441, + "grad_norm": 1.678394079208374, + "learning_rate": 2.316937432331762e-06, + "loss": 0.9129, + "step": 3420 + }, + { + "epoch": 3.1852886405959033, + "grad_norm": 1.6302787065505981, + "learning_rate": 2.3157027610732775e-06, + "loss": 0.8923, + "step": 3421 + }, + { + "epoch": 3.186219739292365, + "grad_norm": 1.7189890146255493, + "learning_rate": 2.3144681350099837e-06, + "loss": 0.9103, + "step": 3422 + }, + { + "epoch": 3.1871508379888267, + "grad_norm": 1.6748616695404053, + "learning_rate": 2.3132335544446462e-06, + "loss": 0.8764, + "step": 3423 + }, + { + "epoch": 3.188081936685289, + "grad_norm": 1.7060880661010742, + "learning_rate": 2.3119990196800218e-06, + "loss": 0.9425, + "step": 3424 + }, + { + "epoch": 3.1890130353817505, + "grad_norm": 1.6806586980819702, + "learning_rate": 2.3107645310188555e-06, + "loss": 0.9121, + "step": 3425 + }, + { + "epoch": 3.189944134078212, + "grad_norm": 1.674682378768921, + "learning_rate": 2.309530088763882e-06, + "loss": 0.8912, + "step": 3426 + }, + { + "epoch": 3.1908752327746743, + "grad_norm": 1.7024798393249512, + "learning_rate": 2.3082956932178212e-06, + "loss": 0.9466, + "step": 3427 + }, + { + "epoch": 3.191806331471136, + "grad_norm": 1.6897104978561401, + "learning_rate": 2.3070613446833843e-06, + "loss": 0.8994, + "step": 3428 + }, + { + "epoch": 3.1927374301675977, + "grad_norm": 1.6608905792236328, + "learning_rate": 2.305827043463272e-06, + "loss": 0.8684, + "step": 3429 + }, + { + "epoch": 3.1936685288640594, + "grad_norm": 1.7487759590148926, + "learning_rate": 2.3045927898601703e-06, + "loss": 0.937, + "step": 3430 + }, + { + "epoch": 3.1945996275605215, + "grad_norm": 1.7491610050201416, + "learning_rate": 2.303358584176755e-06, + "loss": 0.8712, + "step": 3431 + }, + { + "epoch": 3.195530726256983, + "grad_norm": 1.7019641399383545, + "learning_rate": 2.302124426715691e-06, + "loss": 0.8719, + "step": 3432 + }, + { + "epoch": 3.196461824953445, + "grad_norm": 1.7703745365142822, + "learning_rate": 2.3008903177796318e-06, + "loss": 0.8891, + "step": 3433 + }, + { + "epoch": 3.197392923649907, + "grad_norm": 1.6748254299163818, + "learning_rate": 2.2996562576712145e-06, + "loss": 0.8984, + "step": 3434 + }, + { + "epoch": 3.1983240223463687, + "grad_norm": 1.711982250213623, + "learning_rate": 2.2984222466930698e-06, + "loss": 0.8878, + "step": 3435 + }, + { + "epoch": 3.1992551210428304, + "grad_norm": 1.6686856746673584, + "learning_rate": 2.2971882851478144e-06, + "loss": 0.886, + "step": 3436 + }, + { + "epoch": 3.2001862197392925, + "grad_norm": 1.7037968635559082, + "learning_rate": 2.2959543733380514e-06, + "loss": 0.8974, + "step": 3437 + }, + { + "epoch": 3.201117318435754, + "grad_norm": 1.713139295578003, + "learning_rate": 2.294720511566373e-06, + "loss": 0.9088, + "step": 3438 + }, + { + "epoch": 3.202048417132216, + "grad_norm": 1.6898949146270752, + "learning_rate": 2.293486700135358e-06, + "loss": 0.8976, + "step": 3439 + }, + { + "epoch": 3.202979515828678, + "grad_norm": 1.6984347105026245, + "learning_rate": 2.292252939347577e-06, + "loss": 0.8946, + "step": 3440 + }, + { + "epoch": 3.2039106145251397, + "grad_norm": 1.6554429531097412, + "learning_rate": 2.2910192295055825e-06, + "loss": 0.8954, + "step": 3441 + }, + { + "epoch": 3.2048417132216014, + "grad_norm": 1.7220181226730347, + "learning_rate": 2.2897855709119166e-06, + "loss": 0.9305, + "step": 3442 + }, + { + "epoch": 3.2057728119180635, + "grad_norm": 1.739747405052185, + "learning_rate": 2.288551963869112e-06, + "loss": 0.9027, + "step": 3443 + }, + { + "epoch": 3.206703910614525, + "grad_norm": 1.7780839204788208, + "learning_rate": 2.2873184086796825e-06, + "loss": 0.9107, + "step": 3444 + }, + { + "epoch": 3.207635009310987, + "grad_norm": 1.7119289636611938, + "learning_rate": 2.2860849056461347e-06, + "loss": 0.8865, + "step": 3445 + }, + { + "epoch": 3.2085661080074486, + "grad_norm": 1.7334108352661133, + "learning_rate": 2.28485145507096e-06, + "loss": 0.9614, + "step": 3446 + }, + { + "epoch": 3.2094972067039107, + "grad_norm": 1.858974575996399, + "learning_rate": 2.283618057256638e-06, + "loss": 0.8966, + "step": 3447 + }, + { + "epoch": 3.2104283054003724, + "grad_norm": 1.789520025253296, + "learning_rate": 2.282384712505634e-06, + "loss": 0.9359, + "step": 3448 + }, + { + "epoch": 3.211359404096834, + "grad_norm": 1.6848711967468262, + "learning_rate": 2.2811514211204004e-06, + "loss": 0.8967, + "step": 3449 + }, + { + "epoch": 3.212290502793296, + "grad_norm": 1.7142305374145508, + "learning_rate": 2.27991818340338e-06, + "loss": 0.9115, + "step": 3450 + }, + { + "epoch": 3.213221601489758, + "grad_norm": 1.6918210983276367, + "learning_rate": 2.278684999656995e-06, + "loss": 0.8723, + "step": 3451 + }, + { + "epoch": 3.2141527001862196, + "grad_norm": 1.7474030256271362, + "learning_rate": 2.277451870183662e-06, + "loss": 0.9524, + "step": 3452 + }, + { + "epoch": 3.2150837988826817, + "grad_norm": 1.7189279794692993, + "learning_rate": 2.27621879528578e-06, + "loss": 0.9311, + "step": 3453 + }, + { + "epoch": 3.2160148975791434, + "grad_norm": 1.6171016693115234, + "learning_rate": 2.274985775265737e-06, + "loss": 0.9057, + "step": 3454 + }, + { + "epoch": 3.216945996275605, + "grad_norm": 1.7530337572097778, + "learning_rate": 2.273752810425906e-06, + "loss": 0.9257, + "step": 3455 + }, + { + "epoch": 3.217877094972067, + "grad_norm": 1.7092812061309814, + "learning_rate": 2.2725199010686456e-06, + "loss": 0.8976, + "step": 3456 + }, + { + "epoch": 3.218808193668529, + "grad_norm": 1.6773228645324707, + "learning_rate": 2.2712870474963036e-06, + "loss": 0.9133, + "step": 3457 + }, + { + "epoch": 3.2197392923649906, + "grad_norm": 1.6792818307876587, + "learning_rate": 2.270054250011211e-06, + "loss": 0.9434, + "step": 3458 + }, + { + "epoch": 3.2206703910614527, + "grad_norm": 1.7077932357788086, + "learning_rate": 2.2688215089156874e-06, + "loss": 0.9152, + "step": 3459 + }, + { + "epoch": 3.2216014897579144, + "grad_norm": 1.6712348461151123, + "learning_rate": 2.2675888245120384e-06, + "loss": 0.9235, + "step": 3460 + }, + { + "epoch": 3.222532588454376, + "grad_norm": 1.6330510377883911, + "learning_rate": 2.266356197102554e-06, + "loss": 0.9006, + "step": 3461 + }, + { + "epoch": 3.223463687150838, + "grad_norm": 1.6948401927947998, + "learning_rate": 2.265123626989511e-06, + "loss": 0.9428, + "step": 3462 + }, + { + "epoch": 3.2243947858473, + "grad_norm": 1.68091881275177, + "learning_rate": 2.2638911144751734e-06, + "loss": 0.9243, + "step": 3463 + }, + { + "epoch": 3.2253258845437616, + "grad_norm": 1.7108176946640015, + "learning_rate": 2.262658659861791e-06, + "loss": 0.938, + "step": 3464 + }, + { + "epoch": 3.2262569832402237, + "grad_norm": 1.680197834968567, + "learning_rate": 2.2614262634515953e-06, + "loss": 0.9002, + "step": 3465 + }, + { + "epoch": 3.2271880819366854, + "grad_norm": 1.7151423692703247, + "learning_rate": 2.26019392554681e-06, + "loss": 0.9125, + "step": 3466 + }, + { + "epoch": 3.228119180633147, + "grad_norm": 1.6826694011688232, + "learning_rate": 2.25896164644964e-06, + "loss": 0.9196, + "step": 3467 + }, + { + "epoch": 3.2290502793296088, + "grad_norm": 1.7032185792922974, + "learning_rate": 2.2577294264622765e-06, + "loss": 0.9164, + "step": 3468 + }, + { + "epoch": 3.229981378026071, + "grad_norm": 1.716027021408081, + "learning_rate": 2.256497265886896e-06, + "loss": 0.9518, + "step": 3469 + }, + { + "epoch": 3.2309124767225326, + "grad_norm": 1.7424236536026, + "learning_rate": 2.2552651650256634e-06, + "loss": 0.959, + "step": 3470 + }, + { + "epoch": 3.2318435754189943, + "grad_norm": 1.6295639276504517, + "learning_rate": 2.254033124180725e-06, + "loss": 0.8792, + "step": 3471 + }, + { + "epoch": 3.2327746741154564, + "grad_norm": 1.6773732900619507, + "learning_rate": 2.2528011436542142e-06, + "loss": 0.8981, + "step": 3472 + }, + { + "epoch": 3.233705772811918, + "grad_norm": 1.7503982782363892, + "learning_rate": 2.251569223748249e-06, + "loss": 0.9512, + "step": 3473 + }, + { + "epoch": 3.2346368715083798, + "grad_norm": 1.7218401432037354, + "learning_rate": 2.250337364764935e-06, + "loss": 0.9261, + "step": 3474 + }, + { + "epoch": 3.235567970204842, + "grad_norm": 1.6968414783477783, + "learning_rate": 2.2491055670063584e-06, + "loss": 0.8908, + "step": 3475 + }, + { + "epoch": 3.2364990689013036, + "grad_norm": 1.7379018068313599, + "learning_rate": 2.2478738307745937e-06, + "loss": 0.9323, + "step": 3476 + }, + { + "epoch": 3.2374301675977653, + "grad_norm": 1.6510965824127197, + "learning_rate": 2.2466421563717e-06, + "loss": 0.8853, + "step": 3477 + }, + { + "epoch": 3.2383612662942274, + "grad_norm": 1.7129486799240112, + "learning_rate": 2.24541054409972e-06, + "loss": 0.9482, + "step": 3478 + }, + { + "epoch": 3.239292364990689, + "grad_norm": 1.749428629875183, + "learning_rate": 2.2441789942606827e-06, + "loss": 0.9357, + "step": 3479 + }, + { + "epoch": 3.2402234636871508, + "grad_norm": 1.6841530799865723, + "learning_rate": 2.242947507156599e-06, + "loss": 0.9128, + "step": 3480 + }, + { + "epoch": 3.2411545623836124, + "grad_norm": 1.700286626815796, + "learning_rate": 2.2417160830894688e-06, + "loss": 0.8929, + "step": 3481 + }, + { + "epoch": 3.2420856610800746, + "grad_norm": 1.7240166664123535, + "learning_rate": 2.2404847223612725e-06, + "loss": 0.9307, + "step": 3482 + }, + { + "epoch": 3.2430167597765363, + "grad_norm": 1.7347228527069092, + "learning_rate": 2.239253425273976e-06, + "loss": 0.8893, + "step": 3483 + }, + { + "epoch": 3.243947858472998, + "grad_norm": 1.7174723148345947, + "learning_rate": 2.238022192129532e-06, + "loss": 0.8987, + "step": 3484 + }, + { + "epoch": 3.24487895716946, + "grad_norm": 1.6831586360931396, + "learning_rate": 2.236791023229875e-06, + "loss": 0.902, + "step": 3485 + }, + { + "epoch": 3.2458100558659218, + "grad_norm": 1.7524018287658691, + "learning_rate": 2.235559918876924e-06, + "loss": 0.957, + "step": 3486 + }, + { + "epoch": 3.2467411545623834, + "grad_norm": 1.7050849199295044, + "learning_rate": 2.2343288793725816e-06, + "loss": 0.9503, + "step": 3487 + }, + { + "epoch": 3.2476722532588456, + "grad_norm": 1.6524884700775146, + "learning_rate": 2.233097905018738e-06, + "loss": 0.9114, + "step": 3488 + }, + { + "epoch": 3.2486033519553073, + "grad_norm": 1.7306863069534302, + "learning_rate": 2.2318669961172627e-06, + "loss": 0.9093, + "step": 3489 + }, + { + "epoch": 3.249534450651769, + "grad_norm": 1.7213823795318604, + "learning_rate": 2.2306361529700125e-06, + "loss": 0.9007, + "step": 3490 + }, + { + "epoch": 3.250465549348231, + "grad_norm": 1.7477878332138062, + "learning_rate": 2.2294053758788267e-06, + "loss": 0.9367, + "step": 3491 + }, + { + "epoch": 3.2513966480446927, + "grad_norm": 1.6408740282058716, + "learning_rate": 2.2281746651455275e-06, + "loss": 0.8728, + "step": 3492 + }, + { + "epoch": 3.2523277467411544, + "grad_norm": 1.7176867723464966, + "learning_rate": 2.2269440210719234e-06, + "loss": 0.9092, + "step": 3493 + }, + { + "epoch": 3.2532588454376166, + "grad_norm": 1.7315559387207031, + "learning_rate": 2.2257134439598043e-06, + "loss": 0.9237, + "step": 3494 + }, + { + "epoch": 3.2541899441340782, + "grad_norm": 1.6877254247665405, + "learning_rate": 2.2244829341109463e-06, + "loss": 0.8952, + "step": 3495 + }, + { + "epoch": 3.25512104283054, + "grad_norm": 1.6631169319152832, + "learning_rate": 2.2232524918271036e-06, + "loss": 0.9074, + "step": 3496 + }, + { + "epoch": 3.256052141527002, + "grad_norm": 1.7043720483779907, + "learning_rate": 2.2220221174100204e-06, + "loss": 0.9091, + "step": 3497 + }, + { + "epoch": 3.2569832402234637, + "grad_norm": 1.7908573150634766, + "learning_rate": 2.220791811161421e-06, + "loss": 0.9532, + "step": 3498 + }, + { + "epoch": 3.2579143389199254, + "grad_norm": 1.732521891593933, + "learning_rate": 2.219561573383011e-06, + "loss": 0.9163, + "step": 3499 + }, + { + "epoch": 3.2588454376163876, + "grad_norm": 1.6356511116027832, + "learning_rate": 2.218331404376484e-06, + "loss": 0.8871, + "step": 3500 + }, + { + "epoch": 3.2597765363128492, + "grad_norm": 1.6799367666244507, + "learning_rate": 2.2171013044435132e-06, + "loss": 0.8903, + "step": 3501 + }, + { + "epoch": 3.260707635009311, + "grad_norm": 1.6938735246658325, + "learning_rate": 2.2158712738857564e-06, + "loss": 0.9461, + "step": 3502 + }, + { + "epoch": 3.2616387337057726, + "grad_norm": 1.7129855155944824, + "learning_rate": 2.2146413130048524e-06, + "loss": 0.9481, + "step": 3503 + }, + { + "epoch": 3.2625698324022347, + "grad_norm": 1.6919702291488647, + "learning_rate": 2.213411422102426e-06, + "loss": 0.9585, + "step": 3504 + }, + { + "epoch": 3.2635009310986964, + "grad_norm": 1.6424874067306519, + "learning_rate": 2.212181601480083e-06, + "loss": 0.9107, + "step": 3505 + }, + { + "epoch": 3.264432029795158, + "grad_norm": 1.6698466539382935, + "learning_rate": 2.210951851439411e-06, + "loss": 0.9099, + "step": 3506 + }, + { + "epoch": 3.2653631284916202, + "grad_norm": 1.6773866415023804, + "learning_rate": 2.2097221722819817e-06, + "loss": 0.9162, + "step": 3507 + }, + { + "epoch": 3.266294227188082, + "grad_norm": 1.744491457939148, + "learning_rate": 2.2084925643093502e-06, + "loss": 0.9262, + "step": 3508 + }, + { + "epoch": 3.2672253258845436, + "grad_norm": 1.7049018144607544, + "learning_rate": 2.207263027823053e-06, + "loss": 0.9251, + "step": 3509 + }, + { + "epoch": 3.2681564245810057, + "grad_norm": 1.6969716548919678, + "learning_rate": 2.206033563124608e-06, + "loss": 0.9365, + "step": 3510 + }, + { + "epoch": 3.2690875232774674, + "grad_norm": 1.697211742401123, + "learning_rate": 2.2048041705155175e-06, + "loss": 0.8812, + "step": 3511 + }, + { + "epoch": 3.270018621973929, + "grad_norm": 1.7172445058822632, + "learning_rate": 2.2035748502972658e-06, + "loss": 0.8945, + "step": 3512 + }, + { + "epoch": 3.2709497206703912, + "grad_norm": 1.645365834236145, + "learning_rate": 2.202345602771318e-06, + "loss": 0.8789, + "step": 3513 + }, + { + "epoch": 3.271880819366853, + "grad_norm": 1.6403355598449707, + "learning_rate": 2.2011164282391223e-06, + "loss": 0.8674, + "step": 3514 + }, + { + "epoch": 3.2728119180633146, + "grad_norm": 1.6744697093963623, + "learning_rate": 2.1998873270021097e-06, + "loss": 0.9114, + "step": 3515 + }, + { + "epoch": 3.2737430167597763, + "grad_norm": 1.7862238883972168, + "learning_rate": 2.1986582993616926e-06, + "loss": 0.9383, + "step": 3516 + }, + { + "epoch": 3.2746741154562384, + "grad_norm": 1.7182978391647339, + "learning_rate": 2.197429345619265e-06, + "loss": 0.8933, + "step": 3517 + }, + { + "epoch": 3.2756052141527, + "grad_norm": 1.748374104499817, + "learning_rate": 2.1962004660762025e-06, + "loss": 0.9279, + "step": 3518 + }, + { + "epoch": 3.276536312849162, + "grad_norm": 1.7625303268432617, + "learning_rate": 2.194971661033864e-06, + "loss": 0.9081, + "step": 3519 + }, + { + "epoch": 3.277467411545624, + "grad_norm": 1.655259370803833, + "learning_rate": 2.193742930793589e-06, + "loss": 0.8837, + "step": 3520 + }, + { + "epoch": 3.2783985102420856, + "grad_norm": 1.6879080533981323, + "learning_rate": 2.192514275656698e-06, + "loss": 0.896, + "step": 3521 + }, + { + "epoch": 3.2793296089385473, + "grad_norm": 1.6889163255691528, + "learning_rate": 2.1912856959244958e-06, + "loss": 0.8757, + "step": 3522 + }, + { + "epoch": 3.2802607076350094, + "grad_norm": 1.6968783140182495, + "learning_rate": 2.190057191898265e-06, + "loss": 0.938, + "step": 3523 + }, + { + "epoch": 3.281191806331471, + "grad_norm": 1.696675181388855, + "learning_rate": 2.1888287638792722e-06, + "loss": 0.9038, + "step": 3524 + }, + { + "epoch": 3.282122905027933, + "grad_norm": 1.6698479652404785, + "learning_rate": 2.1876004121687644e-06, + "loss": 0.8684, + "step": 3525 + }, + { + "epoch": 3.283054003724395, + "grad_norm": 1.6960601806640625, + "learning_rate": 2.186372137067972e-06, + "loss": 0.92, + "step": 3526 + }, + { + "epoch": 3.2839851024208566, + "grad_norm": 1.6478750705718994, + "learning_rate": 2.1851439388781017e-06, + "loss": 0.9147, + "step": 3527 + }, + { + "epoch": 3.2849162011173183, + "grad_norm": 1.7124546766281128, + "learning_rate": 2.1839158179003457e-06, + "loss": 0.8868, + "step": 3528 + }, + { + "epoch": 3.2858472998137804, + "grad_norm": 1.6764276027679443, + "learning_rate": 2.182687774435878e-06, + "loss": 0.8683, + "step": 3529 + }, + { + "epoch": 3.286778398510242, + "grad_norm": 1.8640626668930054, + "learning_rate": 2.1814598087858476e-06, + "loss": 0.9044, + "step": 3530 + }, + { + "epoch": 3.287709497206704, + "grad_norm": 1.934227466583252, + "learning_rate": 2.1802319212513913e-06, + "loss": 0.966, + "step": 3531 + }, + { + "epoch": 3.288640595903166, + "grad_norm": 1.7664982080459595, + "learning_rate": 2.1790041121336223e-06, + "loss": 0.9185, + "step": 3532 + }, + { + "epoch": 3.2895716945996276, + "grad_norm": 1.7142821550369263, + "learning_rate": 2.1777763817336384e-06, + "loss": 0.9193, + "step": 3533 + }, + { + "epoch": 3.2905027932960893, + "grad_norm": 1.735909342765808, + "learning_rate": 2.176548730352513e-06, + "loss": 0.9219, + "step": 3534 + }, + { + "epoch": 3.2914338919925514, + "grad_norm": 1.7370494604110718, + "learning_rate": 2.175321158291304e-06, + "loss": 0.9395, + "step": 3535 + }, + { + "epoch": 3.292364990689013, + "grad_norm": 1.7619099617004395, + "learning_rate": 2.17409366585105e-06, + "loss": 0.8983, + "step": 3536 + }, + { + "epoch": 3.293296089385475, + "grad_norm": 1.7213809490203857, + "learning_rate": 2.172866253332766e-06, + "loss": 0.8812, + "step": 3537 + }, + { + "epoch": 3.294227188081937, + "grad_norm": 1.7410892248153687, + "learning_rate": 2.171638921037453e-06, + "loss": 0.9048, + "step": 3538 + }, + { + "epoch": 3.2951582867783986, + "grad_norm": 1.7433421611785889, + "learning_rate": 2.1704116692660872e-06, + "loss": 0.9084, + "step": 3539 + }, + { + "epoch": 3.2960893854748603, + "grad_norm": 1.793975830078125, + "learning_rate": 2.1691844983196302e-06, + "loss": 0.8724, + "step": 3540 + }, + { + "epoch": 3.297020484171322, + "grad_norm": 1.6627880334854126, + "learning_rate": 2.1679574084990184e-06, + "loss": 0.8882, + "step": 3541 + }, + { + "epoch": 3.297951582867784, + "grad_norm": 1.8283196687698364, + "learning_rate": 2.166730400105172e-06, + "loss": 0.9405, + "step": 3542 + }, + { + "epoch": 3.298882681564246, + "grad_norm": 1.6557303667068481, + "learning_rate": 2.1655034734389906e-06, + "loss": 0.8783, + "step": 3543 + }, + { + "epoch": 3.2998137802607075, + "grad_norm": 1.696735143661499, + "learning_rate": 2.1642766288013517e-06, + "loss": 0.8668, + "step": 3544 + }, + { + "epoch": 3.3007448789571696, + "grad_norm": 1.704197883605957, + "learning_rate": 2.1630498664931156e-06, + "loss": 0.9217, + "step": 3545 + }, + { + "epoch": 3.3016759776536313, + "grad_norm": 1.6723155975341797, + "learning_rate": 2.161823186815121e-06, + "loss": 0.906, + "step": 3546 + }, + { + "epoch": 3.302607076350093, + "grad_norm": 1.6761142015457153, + "learning_rate": 2.160596590068187e-06, + "loss": 0.9172, + "step": 3547 + }, + { + "epoch": 3.303538175046555, + "grad_norm": 1.7704648971557617, + "learning_rate": 2.159370076553109e-06, + "loss": 0.8914, + "step": 3548 + }, + { + "epoch": 3.304469273743017, + "grad_norm": 1.6825358867645264, + "learning_rate": 2.158143646570668e-06, + "loss": 0.9232, + "step": 3549 + }, + { + "epoch": 3.3054003724394785, + "grad_norm": 1.7193504571914673, + "learning_rate": 2.15691730042162e-06, + "loss": 0.8553, + "step": 3550 + }, + { + "epoch": 3.30633147113594, + "grad_norm": 1.7455410957336426, + "learning_rate": 2.155691038406701e-06, + "loss": 0.934, + "step": 3551 + }, + { + "epoch": 3.3072625698324023, + "grad_norm": 1.6663957834243774, + "learning_rate": 2.1544648608266282e-06, + "loss": 0.8788, + "step": 3552 + }, + { + "epoch": 3.308193668528864, + "grad_norm": 1.7086024284362793, + "learning_rate": 2.1532387679820967e-06, + "loss": 0.9155, + "step": 3553 + }, + { + "epoch": 3.3091247672253257, + "grad_norm": 1.7731077671051025, + "learning_rate": 2.1520127601737807e-06, + "loss": 0.9755, + "step": 3554 + }, + { + "epoch": 3.310055865921788, + "grad_norm": 1.768649935722351, + "learning_rate": 2.1507868377023337e-06, + "loss": 0.9071, + "step": 3555 + }, + { + "epoch": 3.3109869646182495, + "grad_norm": 1.7404378652572632, + "learning_rate": 2.1495610008683897e-06, + "loss": 0.9018, + "step": 3556 + }, + { + "epoch": 3.311918063314711, + "grad_norm": 1.6915026903152466, + "learning_rate": 2.1483352499725604e-06, + "loss": 0.946, + "step": 3557 + }, + { + "epoch": 3.3128491620111733, + "grad_norm": 1.746828556060791, + "learning_rate": 2.147109585315435e-06, + "loss": 0.9038, + "step": 3558 + }, + { + "epoch": 3.313780260707635, + "grad_norm": 1.7067960500717163, + "learning_rate": 2.145884007197584e-06, + "loss": 0.9339, + "step": 3559 + }, + { + "epoch": 3.3147113594040967, + "grad_norm": 1.7131682634353638, + "learning_rate": 2.144658515919557e-06, + "loss": 0.8702, + "step": 3560 + }, + { + "epoch": 3.315642458100559, + "grad_norm": 1.6957186460494995, + "learning_rate": 2.143433111781879e-06, + "loss": 0.9072, + "step": 3561 + }, + { + "epoch": 3.3165735567970205, + "grad_norm": 1.6971725225448608, + "learning_rate": 2.1422077950850565e-06, + "loss": 0.9189, + "step": 3562 + }, + { + "epoch": 3.317504655493482, + "grad_norm": 1.7515807151794434, + "learning_rate": 2.140982566129575e-06, + "loss": 0.9088, + "step": 3563 + }, + { + "epoch": 3.3184357541899443, + "grad_norm": 1.684130311012268, + "learning_rate": 2.1397574252158954e-06, + "loss": 0.9014, + "step": 3564 + }, + { + "epoch": 3.319366852886406, + "grad_norm": 1.7420741319656372, + "learning_rate": 2.13853237264446e-06, + "loss": 0.924, + "step": 3565 + }, + { + "epoch": 3.3202979515828677, + "grad_norm": 1.6821070909500122, + "learning_rate": 2.1373074087156874e-06, + "loss": 0.9173, + "step": 3566 + }, + { + "epoch": 3.32122905027933, + "grad_norm": 1.7626503705978394, + "learning_rate": 2.1360825337299766e-06, + "loss": 0.9113, + "step": 3567 + }, + { + "epoch": 3.3221601489757915, + "grad_norm": 1.6721113920211792, + "learning_rate": 2.1348577479877025e-06, + "loss": 0.8961, + "step": 3568 + }, + { + "epoch": 3.323091247672253, + "grad_norm": 1.711659550666809, + "learning_rate": 2.1336330517892195e-06, + "loss": 0.9276, + "step": 3569 + }, + { + "epoch": 3.3240223463687153, + "grad_norm": 1.7217745780944824, + "learning_rate": 2.1324084454348592e-06, + "loss": 0.9278, + "step": 3570 + }, + { + "epoch": 3.324953445065177, + "grad_norm": 1.7459384202957153, + "learning_rate": 2.1311839292249335e-06, + "loss": 0.8922, + "step": 3571 + }, + { + "epoch": 3.3258845437616387, + "grad_norm": 1.7842185497283936, + "learning_rate": 2.129959503459728e-06, + "loss": 0.8986, + "step": 3572 + }, + { + "epoch": 3.326815642458101, + "grad_norm": 1.7478796243667603, + "learning_rate": 2.1287351684395094e-06, + "loss": 0.932, + "step": 3573 + }, + { + "epoch": 3.3277467411545625, + "grad_norm": 1.7591707706451416, + "learning_rate": 2.1275109244645224e-06, + "loss": 0.9474, + "step": 3574 + }, + { + "epoch": 3.328677839851024, + "grad_norm": 1.7879152297973633, + "learning_rate": 2.1262867718349867e-06, + "loss": 0.9286, + "step": 3575 + }, + { + "epoch": 3.329608938547486, + "grad_norm": 1.6960433721542358, + "learning_rate": 2.125062710851101e-06, + "loss": 0.8789, + "step": 3576 + }, + { + "epoch": 3.330540037243948, + "grad_norm": 1.6222604513168335, + "learning_rate": 2.1238387418130425e-06, + "loss": 0.8959, + "step": 3577 + }, + { + "epoch": 3.3314711359404097, + "grad_norm": 1.7148146629333496, + "learning_rate": 2.122614865020965e-06, + "loss": 0.9323, + "step": 3578 + }, + { + "epoch": 3.3324022346368714, + "grad_norm": 1.718290090560913, + "learning_rate": 2.1213910807749995e-06, + "loss": 0.9226, + "step": 3579 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 1.7199945449829102, + "learning_rate": 2.120167389375253e-06, + "loss": 0.9269, + "step": 3580 + }, + { + "epoch": 3.334264432029795, + "grad_norm": 1.7048648595809937, + "learning_rate": 2.118943791121815e-06, + "loss": 0.9035, + "step": 3581 + }, + { + "epoch": 3.335195530726257, + "grad_norm": 1.664337158203125, + "learning_rate": 2.1177202863147436e-06, + "loss": 0.9267, + "step": 3582 + }, + { + "epoch": 3.336126629422719, + "grad_norm": 1.72476065158844, + "learning_rate": 2.1164968752540817e-06, + "loss": 0.9389, + "step": 3583 + }, + { + "epoch": 3.3370577281191807, + "grad_norm": 1.7327197790145874, + "learning_rate": 2.1152735582398453e-06, + "loss": 0.9282, + "step": 3584 + }, + { + "epoch": 3.3379888268156424, + "grad_norm": 1.755927324295044, + "learning_rate": 2.1140503355720295e-06, + "loss": 0.9149, + "step": 3585 + }, + { + "epoch": 3.338919925512104, + "grad_norm": 1.6904610395431519, + "learning_rate": 2.1128272075506036e-06, + "loss": 0.873, + "step": 3586 + }, + { + "epoch": 3.339851024208566, + "grad_norm": 1.7619491815567017, + "learning_rate": 2.1116041744755153e-06, + "loss": 0.9276, + "step": 3587 + }, + { + "epoch": 3.340782122905028, + "grad_norm": 1.7055435180664062, + "learning_rate": 2.1103812366466896e-06, + "loss": 0.9215, + "step": 3588 + }, + { + "epoch": 3.3417132216014895, + "grad_norm": 1.774766445159912, + "learning_rate": 2.1091583943640265e-06, + "loss": 0.9031, + "step": 3589 + }, + { + "epoch": 3.3426443202979517, + "grad_norm": 1.7082542181015015, + "learning_rate": 2.107935647927404e-06, + "loss": 0.8945, + "step": 3590 + }, + { + "epoch": 3.3435754189944134, + "grad_norm": 1.674393892288208, + "learning_rate": 2.1067129976366767e-06, + "loss": 0.8965, + "step": 3591 + }, + { + "epoch": 3.344506517690875, + "grad_norm": 1.6963931322097778, + "learning_rate": 2.105490443791674e-06, + "loss": 0.8976, + "step": 3592 + }, + { + "epoch": 3.345437616387337, + "grad_norm": 1.6533441543579102, + "learning_rate": 2.104267986692202e-06, + "loss": 0.9075, + "step": 3593 + }, + { + "epoch": 3.346368715083799, + "grad_norm": 1.6994211673736572, + "learning_rate": 2.1030456266380455e-06, + "loss": 0.9111, + "step": 3594 + }, + { + "epoch": 3.3472998137802605, + "grad_norm": 1.7067980766296387, + "learning_rate": 2.1018233639289636e-06, + "loss": 0.9104, + "step": 3595 + }, + { + "epoch": 3.3482309124767227, + "grad_norm": 1.719502329826355, + "learning_rate": 2.1006011988646895e-06, + "loss": 0.9107, + "step": 3596 + }, + { + "epoch": 3.3491620111731844, + "grad_norm": 1.7006198167800903, + "learning_rate": 2.0993791317449362e-06, + "loss": 0.9026, + "step": 3597 + }, + { + "epoch": 3.350093109869646, + "grad_norm": 1.6677294969558716, + "learning_rate": 2.098157162869392e-06, + "loss": 0.9156, + "step": 3598 + }, + { + "epoch": 3.351024208566108, + "grad_norm": 1.785241961479187, + "learning_rate": 2.096935292537718e-06, + "loss": 0.9246, + "step": 3599 + }, + { + "epoch": 3.35195530726257, + "grad_norm": 1.6744484901428223, + "learning_rate": 2.0957135210495543e-06, + "loss": 0.9146, + "step": 3600 + }, + { + "epoch": 3.3528864059590315, + "grad_norm": 1.6849631071090698, + "learning_rate": 2.094491848704516e-06, + "loss": 0.8868, + "step": 3601 + }, + { + "epoch": 3.3538175046554937, + "grad_norm": 1.699573040008545, + "learning_rate": 2.093270275802194e-06, + "loss": 0.8887, + "step": 3602 + }, + { + "epoch": 3.3547486033519553, + "grad_norm": 1.684644103050232, + "learning_rate": 2.0920488026421537e-06, + "loss": 0.8873, + "step": 3603 + }, + { + "epoch": 3.355679702048417, + "grad_norm": 1.6562538146972656, + "learning_rate": 2.0908274295239367e-06, + "loss": 0.8809, + "step": 3604 + }, + { + "epoch": 3.356610800744879, + "grad_norm": 1.7715096473693848, + "learning_rate": 2.089606156747061e-06, + "loss": 0.9317, + "step": 3605 + }, + { + "epoch": 3.357541899441341, + "grad_norm": 1.8907216787338257, + "learning_rate": 2.0883849846110186e-06, + "loss": 0.9325, + "step": 3606 + }, + { + "epoch": 3.3584729981378025, + "grad_norm": 1.7321754693984985, + "learning_rate": 2.0871639134152773e-06, + "loss": 0.9316, + "step": 3607 + }, + { + "epoch": 3.3594040968342647, + "grad_norm": 1.7374184131622314, + "learning_rate": 2.085942943459281e-06, + "loss": 0.8875, + "step": 3608 + }, + { + "epoch": 3.3603351955307263, + "grad_norm": 1.7196762561798096, + "learning_rate": 2.084722075042448e-06, + "loss": 0.9262, + "step": 3609 + }, + { + "epoch": 3.361266294227188, + "grad_norm": 1.8062974214553833, + "learning_rate": 2.0835013084641704e-06, + "loss": 0.8704, + "step": 3610 + }, + { + "epoch": 3.3621973929236497, + "grad_norm": 1.6995432376861572, + "learning_rate": 2.0822806440238173e-06, + "loss": 0.8831, + "step": 3611 + }, + { + "epoch": 3.363128491620112, + "grad_norm": 1.709960699081421, + "learning_rate": 2.081060082020733e-06, + "loss": 0.883, + "step": 3612 + }, + { + "epoch": 3.3640595903165735, + "grad_norm": 1.767360806465149, + "learning_rate": 2.079839622754235e-06, + "loss": 0.9391, + "step": 3613 + }, + { + "epoch": 3.364990689013035, + "grad_norm": 1.7222709655761719, + "learning_rate": 2.078619266523615e-06, + "loss": 0.8884, + "step": 3614 + }, + { + "epoch": 3.3659217877094973, + "grad_norm": 1.757759928703308, + "learning_rate": 2.0773990136281435e-06, + "loss": 0.9416, + "step": 3615 + }, + { + "epoch": 3.366852886405959, + "grad_norm": 1.7415378093719482, + "learning_rate": 2.0761788643670623e-06, + "loss": 0.9303, + "step": 3616 + }, + { + "epoch": 3.3677839851024207, + "grad_norm": 1.687500238418579, + "learning_rate": 2.0749588190395868e-06, + "loss": 0.8661, + "step": 3617 + }, + { + "epoch": 3.368715083798883, + "grad_norm": 1.7491952180862427, + "learning_rate": 2.0737388779449098e-06, + "loss": 0.9242, + "step": 3618 + }, + { + "epoch": 3.3696461824953445, + "grad_norm": 1.8421630859375, + "learning_rate": 2.0725190413821978e-06, + "loss": 0.9117, + "step": 3619 + }, + { + "epoch": 3.370577281191806, + "grad_norm": 1.7336701154708862, + "learning_rate": 2.0712993096505902e-06, + "loss": 0.9022, + "step": 3620 + }, + { + "epoch": 3.3715083798882683, + "grad_norm": 1.7102673053741455, + "learning_rate": 2.0700796830492016e-06, + "loss": 0.8834, + "step": 3621 + }, + { + "epoch": 3.37243947858473, + "grad_norm": 1.728464961051941, + "learning_rate": 2.068860161877122e-06, + "loss": 0.9096, + "step": 3622 + }, + { + "epoch": 3.3733705772811917, + "grad_norm": 1.7444391250610352, + "learning_rate": 2.067640746433413e-06, + "loss": 0.9049, + "step": 3623 + }, + { + "epoch": 3.3743016759776534, + "grad_norm": 1.709729552268982, + "learning_rate": 2.066421437017113e-06, + "loss": 0.9237, + "step": 3624 + }, + { + "epoch": 3.3752327746741155, + "grad_norm": 1.6979856491088867, + "learning_rate": 2.0652022339272314e-06, + "loss": 0.9058, + "step": 3625 + }, + { + "epoch": 3.376163873370577, + "grad_norm": 1.6928445100784302, + "learning_rate": 2.0639831374627563e-06, + "loss": 0.9381, + "step": 3626 + }, + { + "epoch": 3.377094972067039, + "grad_norm": 1.7880457639694214, + "learning_rate": 2.0627641479226434e-06, + "loss": 0.9574, + "step": 3627 + }, + { + "epoch": 3.378026070763501, + "grad_norm": 1.672634243965149, + "learning_rate": 2.0615452656058266e-06, + "loss": 0.91, + "step": 3628 + }, + { + "epoch": 3.3789571694599627, + "grad_norm": 1.7423460483551025, + "learning_rate": 2.060326490811213e-06, + "loss": 0.9427, + "step": 3629 + }, + { + "epoch": 3.3798882681564244, + "grad_norm": 1.7408066987991333, + "learning_rate": 2.0591078238376804e-06, + "loss": 0.9179, + "step": 3630 + }, + { + "epoch": 3.3808193668528865, + "grad_norm": 1.7179259061813354, + "learning_rate": 2.057889264984085e-06, + "loss": 0.8984, + "step": 3631 + }, + { + "epoch": 3.381750465549348, + "grad_norm": 1.8055065870285034, + "learning_rate": 2.056670814549252e-06, + "loss": 0.9687, + "step": 3632 + }, + { + "epoch": 3.38268156424581, + "grad_norm": 1.6409751176834106, + "learning_rate": 2.0554524728319837e-06, + "loss": 0.9099, + "step": 3633 + }, + { + "epoch": 3.383612662942272, + "grad_norm": 1.7668641805648804, + "learning_rate": 2.0542342401310515e-06, + "loss": 0.9434, + "step": 3634 + }, + { + "epoch": 3.3845437616387337, + "grad_norm": 1.6830002069473267, + "learning_rate": 2.053016116745204e-06, + "loss": 0.8506, + "step": 3635 + }, + { + "epoch": 3.3854748603351954, + "grad_norm": 1.707519769668579, + "learning_rate": 2.0517981029731613e-06, + "loss": 0.887, + "step": 3636 + }, + { + "epoch": 3.3864059590316575, + "grad_norm": 1.7540509700775146, + "learning_rate": 2.050580199113616e-06, + "loss": 0.9065, + "step": 3637 + }, + { + "epoch": 3.387337057728119, + "grad_norm": 1.7131767272949219, + "learning_rate": 2.049362405465236e-06, + "loss": 0.9048, + "step": 3638 + }, + { + "epoch": 3.388268156424581, + "grad_norm": 1.7063047885894775, + "learning_rate": 2.0481447223266593e-06, + "loss": 0.9117, + "step": 3639 + }, + { + "epoch": 3.389199255121043, + "grad_norm": 1.744381070137024, + "learning_rate": 2.0469271499964995e-06, + "loss": 0.9218, + "step": 3640 + }, + { + "epoch": 3.3901303538175047, + "grad_norm": 1.7229853868484497, + "learning_rate": 2.0457096887733395e-06, + "loss": 0.8999, + "step": 3641 + }, + { + "epoch": 3.3910614525139664, + "grad_norm": 1.6572545766830444, + "learning_rate": 2.044492338955739e-06, + "loss": 0.8681, + "step": 3642 + }, + { + "epoch": 3.3919925512104285, + "grad_norm": 1.7188464403152466, + "learning_rate": 2.0432751008422293e-06, + "loss": 0.9177, + "step": 3643 + }, + { + "epoch": 3.39292364990689, + "grad_norm": 1.7729672193527222, + "learning_rate": 2.0420579747313114e-06, + "loss": 0.9408, + "step": 3644 + }, + { + "epoch": 3.393854748603352, + "grad_norm": 1.7475430965423584, + "learning_rate": 2.040840960921462e-06, + "loss": 0.9109, + "step": 3645 + }, + { + "epoch": 3.394785847299814, + "grad_norm": 1.7914328575134277, + "learning_rate": 2.039624059711129e-06, + "loss": 0.933, + "step": 3646 + }, + { + "epoch": 3.3957169459962757, + "grad_norm": 1.7425769567489624, + "learning_rate": 2.0384072713987345e-06, + "loss": 0.9491, + "step": 3647 + }, + { + "epoch": 3.3966480446927374, + "grad_norm": 1.7524155378341675, + "learning_rate": 2.0371905962826684e-06, + "loss": 0.9544, + "step": 3648 + }, + { + "epoch": 3.397579143389199, + "grad_norm": 1.738939642906189, + "learning_rate": 2.0359740346612982e-06, + "loss": 0.9082, + "step": 3649 + }, + { + "epoch": 3.398510242085661, + "grad_norm": 1.6907044649124146, + "learning_rate": 2.034757586832961e-06, + "loss": 0.9215, + "step": 3650 + }, + { + "epoch": 3.399441340782123, + "grad_norm": 1.6801565885543823, + "learning_rate": 2.0335412530959647e-06, + "loss": 0.9007, + "step": 3651 + }, + { + "epoch": 3.4003724394785846, + "grad_norm": 1.7847118377685547, + "learning_rate": 2.0323250337485913e-06, + "loss": 0.9443, + "step": 3652 + }, + { + "epoch": 3.4013035381750467, + "grad_norm": 1.7194799184799194, + "learning_rate": 2.031108929089095e-06, + "loss": 0.9563, + "step": 3653 + }, + { + "epoch": 3.4022346368715084, + "grad_norm": 1.7634360790252686, + "learning_rate": 2.0298929394157e-06, + "loss": 0.8962, + "step": 3654 + }, + { + "epoch": 3.40316573556797, + "grad_norm": 1.6883530616760254, + "learning_rate": 2.028677065026604e-06, + "loss": 0.8953, + "step": 3655 + }, + { + "epoch": 3.404096834264432, + "grad_norm": 1.6946510076522827, + "learning_rate": 2.0274613062199746e-06, + "loss": 0.9169, + "step": 3656 + }, + { + "epoch": 3.405027932960894, + "grad_norm": 1.7746978998184204, + "learning_rate": 2.0262456632939542e-06, + "loss": 0.9428, + "step": 3657 + }, + { + "epoch": 3.4059590316573556, + "grad_norm": 1.7683535814285278, + "learning_rate": 2.0250301365466528e-06, + "loss": 0.937, + "step": 3658 + }, + { + "epoch": 3.4068901303538173, + "grad_norm": 1.7677348852157593, + "learning_rate": 2.023814726276154e-06, + "loss": 0.9354, + "step": 3659 + }, + { + "epoch": 3.4078212290502794, + "grad_norm": 1.674815058708191, + "learning_rate": 2.0225994327805154e-06, + "loss": 0.8908, + "step": 3660 + }, + { + "epoch": 3.408752327746741, + "grad_norm": 1.854444980621338, + "learning_rate": 2.0213842563577595e-06, + "loss": 0.9402, + "step": 3661 + }, + { + "epoch": 3.4096834264432028, + "grad_norm": 1.6401361227035522, + "learning_rate": 2.020169197305886e-06, + "loss": 0.8829, + "step": 3662 + }, + { + "epoch": 3.410614525139665, + "grad_norm": 1.7089415788650513, + "learning_rate": 2.0189542559228626e-06, + "loss": 0.8877, + "step": 3663 + }, + { + "epoch": 3.4115456238361266, + "grad_norm": 1.7052597999572754, + "learning_rate": 2.0177394325066312e-06, + "loss": 0.9098, + "step": 3664 + }, + { + "epoch": 3.4124767225325883, + "grad_norm": 1.6806219816207886, + "learning_rate": 2.016524727355101e-06, + "loss": 0.9219, + "step": 3665 + }, + { + "epoch": 3.4134078212290504, + "grad_norm": 1.6951138973236084, + "learning_rate": 2.0153101407661544e-06, + "loss": 0.8858, + "step": 3666 + }, + { + "epoch": 3.414338919925512, + "grad_norm": 1.7071776390075684, + "learning_rate": 2.014095673037645e-06, + "loss": 0.9188, + "step": 3667 + }, + { + "epoch": 3.4152700186219738, + "grad_norm": 1.6961917877197266, + "learning_rate": 2.0128813244673947e-06, + "loss": 0.9074, + "step": 3668 + }, + { + "epoch": 3.416201117318436, + "grad_norm": 1.6834344863891602, + "learning_rate": 2.0116670953532004e-06, + "loss": 0.883, + "step": 3669 + }, + { + "epoch": 3.4171322160148976, + "grad_norm": 1.7505922317504883, + "learning_rate": 2.010452985992825e-06, + "loss": 0.9253, + "step": 3670 + }, + { + "epoch": 3.4180633147113593, + "grad_norm": 1.6882200241088867, + "learning_rate": 2.0092389966840077e-06, + "loss": 0.8687, + "step": 3671 + }, + { + "epoch": 3.4189944134078214, + "grad_norm": 1.7411258220672607, + "learning_rate": 2.0080251277244523e-06, + "loss": 0.9343, + "step": 3672 + }, + { + "epoch": 3.419925512104283, + "grad_norm": 1.731019377708435, + "learning_rate": 2.006811379411836e-06, + "loss": 0.939, + "step": 3673 + }, + { + "epoch": 3.4208566108007448, + "grad_norm": 1.7165260314941406, + "learning_rate": 2.0055977520438075e-06, + "loss": 0.931, + "step": 3674 + }, + { + "epoch": 3.421787709497207, + "grad_norm": 1.7198034524917603, + "learning_rate": 2.0043842459179823e-06, + "loss": 0.8711, + "step": 3675 + }, + { + "epoch": 3.4227188081936686, + "grad_norm": 1.680818796157837, + "learning_rate": 2.00317086133195e-06, + "loss": 0.9176, + "step": 3676 + }, + { + "epoch": 3.4236499068901303, + "grad_norm": 1.6482501029968262, + "learning_rate": 2.0019575985832684e-06, + "loss": 0.9031, + "step": 3677 + }, + { + "epoch": 3.4245810055865924, + "grad_norm": 1.7547640800476074, + "learning_rate": 2.000744457969467e-06, + "loss": 0.9087, + "step": 3678 + }, + { + "epoch": 3.425512104283054, + "grad_norm": 1.8160649538040161, + "learning_rate": 1.9995314397880412e-06, + "loss": 0.9311, + "step": 3679 + }, + { + "epoch": 3.4264432029795158, + "grad_norm": 1.7029207944869995, + "learning_rate": 1.9983185443364617e-06, + "loss": 0.899, + "step": 3680 + }, + { + "epoch": 3.427374301675978, + "grad_norm": 1.7521311044692993, + "learning_rate": 1.9971057719121666e-06, + "loss": 0.9135, + "step": 3681 + }, + { + "epoch": 3.4283054003724396, + "grad_norm": 1.7538201808929443, + "learning_rate": 1.9958931228125617e-06, + "loss": 0.914, + "step": 3682 + }, + { + "epoch": 3.4292364990689013, + "grad_norm": 1.6631975173950195, + "learning_rate": 1.9946805973350277e-06, + "loss": 0.8881, + "step": 3683 + }, + { + "epoch": 3.430167597765363, + "grad_norm": 1.8578765392303467, + "learning_rate": 1.993468195776911e-06, + "loss": 0.9325, + "step": 3684 + }, + { + "epoch": 3.431098696461825, + "grad_norm": 1.7013373374938965, + "learning_rate": 1.992255918435528e-06, + "loss": 0.8848, + "step": 3685 + }, + { + "epoch": 3.4320297951582868, + "grad_norm": 1.7043328285217285, + "learning_rate": 1.9910437656081658e-06, + "loss": 0.9025, + "step": 3686 + }, + { + "epoch": 3.4329608938547485, + "grad_norm": 1.6889464855194092, + "learning_rate": 1.9898317375920805e-06, + "loss": 0.9057, + "step": 3687 + }, + { + "epoch": 3.4338919925512106, + "grad_norm": 1.7357399463653564, + "learning_rate": 1.988619834684499e-06, + "loss": 0.9387, + "step": 3688 + }, + { + "epoch": 3.4348230912476723, + "grad_norm": 1.7146764993667603, + "learning_rate": 1.9874080571826132e-06, + "loss": 0.9334, + "step": 3689 + }, + { + "epoch": 3.435754189944134, + "grad_norm": 1.702122449874878, + "learning_rate": 1.9861964053835887e-06, + "loss": 0.8971, + "step": 3690 + }, + { + "epoch": 3.436685288640596, + "grad_norm": 1.6351912021636963, + "learning_rate": 1.9849848795845594e-06, + "loss": 0.9124, + "step": 3691 + }, + { + "epoch": 3.4376163873370578, + "grad_norm": 1.7232487201690674, + "learning_rate": 1.9837734800826267e-06, + "loss": 0.9056, + "step": 3692 + }, + { + "epoch": 3.4385474860335195, + "grad_norm": 1.7359774112701416, + "learning_rate": 1.9825622071748616e-06, + "loss": 0.9461, + "step": 3693 + }, + { + "epoch": 3.439478584729981, + "grad_norm": 1.67306649684906, + "learning_rate": 1.9813510611583054e-06, + "loss": 0.9207, + "step": 3694 + }, + { + "epoch": 3.4404096834264433, + "grad_norm": 1.7530717849731445, + "learning_rate": 1.9801400423299673e-06, + "loss": 0.9013, + "step": 3695 + }, + { + "epoch": 3.441340782122905, + "grad_norm": 1.7043278217315674, + "learning_rate": 1.9789291509868246e-06, + "loss": 0.9168, + "step": 3696 + }, + { + "epoch": 3.4422718808193666, + "grad_norm": 1.6729575395584106, + "learning_rate": 1.9777183874258242e-06, + "loss": 0.9174, + "step": 3697 + }, + { + "epoch": 3.4432029795158288, + "grad_norm": 1.8327440023422241, + "learning_rate": 1.976507751943882e-06, + "loss": 0.966, + "step": 3698 + }, + { + "epoch": 3.4441340782122905, + "grad_norm": 1.6883249282836914, + "learning_rate": 1.9752972448378817e-06, + "loss": 0.9182, + "step": 3699 + }, + { + "epoch": 3.445065176908752, + "grad_norm": 1.7073884010314941, + "learning_rate": 1.9740868664046754e-06, + "loss": 0.9128, + "step": 3700 + }, + { + "epoch": 3.4459962756052143, + "grad_norm": 1.7522189617156982, + "learning_rate": 1.972876616941084e-06, + "loss": 0.9069, + "step": 3701 + }, + { + "epoch": 3.446927374301676, + "grad_norm": 1.672468662261963, + "learning_rate": 1.9716664967438983e-06, + "loss": 0.8487, + "step": 3702 + }, + { + "epoch": 3.4478584729981376, + "grad_norm": 1.718237280845642, + "learning_rate": 1.970456506109874e-06, + "loss": 0.9069, + "step": 3703 + }, + { + "epoch": 3.4487895716945998, + "grad_norm": 1.736087441444397, + "learning_rate": 1.969246645335738e-06, + "loss": 0.9169, + "step": 3704 + }, + { + "epoch": 3.4497206703910615, + "grad_norm": 1.6877533197402954, + "learning_rate": 1.9680369147181847e-06, + "loss": 0.9257, + "step": 3705 + }, + { + "epoch": 3.450651769087523, + "grad_norm": 1.737767219543457, + "learning_rate": 1.9668273145538754e-06, + "loss": 0.9517, + "step": 3706 + }, + { + "epoch": 3.4515828677839853, + "grad_norm": 1.7471284866333008, + "learning_rate": 1.9656178451394404e-06, + "loss": 0.9254, + "step": 3707 + }, + { + "epoch": 3.452513966480447, + "grad_norm": 1.9232438802719116, + "learning_rate": 1.964408506771477e-06, + "loss": 0.9296, + "step": 3708 + }, + { + "epoch": 3.4534450651769086, + "grad_norm": 1.7236459255218506, + "learning_rate": 1.9631992997465535e-06, + "loss": 0.9221, + "step": 3709 + }, + { + "epoch": 3.4543761638733708, + "grad_norm": 1.7353768348693848, + "learning_rate": 1.961990224361201e-06, + "loss": 0.9305, + "step": 3710 + }, + { + "epoch": 3.4553072625698324, + "grad_norm": 1.7400027513504028, + "learning_rate": 1.9607812809119214e-06, + "loss": 0.9193, + "step": 3711 + }, + { + "epoch": 3.456238361266294, + "grad_norm": 1.8154343366622925, + "learning_rate": 1.959572469695186e-06, + "loss": 0.9334, + "step": 3712 + }, + { + "epoch": 3.4571694599627563, + "grad_norm": 1.7508608102798462, + "learning_rate": 1.9583637910074283e-06, + "loss": 0.8952, + "step": 3713 + }, + { + "epoch": 3.458100558659218, + "grad_norm": 1.7181293964385986, + "learning_rate": 1.9571552451450542e-06, + "loss": 0.9095, + "step": 3714 + }, + { + "epoch": 3.4590316573556796, + "grad_norm": 1.6842950582504272, + "learning_rate": 1.9559468324044343e-06, + "loss": 0.91, + "step": 3715 + }, + { + "epoch": 3.4599627560521418, + "grad_norm": 1.7393920421600342, + "learning_rate": 1.954738553081909e-06, + "loss": 0.9008, + "step": 3716 + }, + { + "epoch": 3.4608938547486034, + "grad_norm": 1.7108683586120605, + "learning_rate": 1.953530407473783e-06, + "loss": 0.9252, + "step": 3717 + }, + { + "epoch": 3.461824953445065, + "grad_norm": 1.7097468376159668, + "learning_rate": 1.952322395876331e-06, + "loss": 0.9425, + "step": 3718 + }, + { + "epoch": 3.462756052141527, + "grad_norm": 1.7436959743499756, + "learning_rate": 1.9511145185857925e-06, + "loss": 0.9465, + "step": 3719 + }, + { + "epoch": 3.463687150837989, + "grad_norm": 1.6939001083374023, + "learning_rate": 1.9499067758983753e-06, + "loss": 0.9184, + "step": 3720 + }, + { + "epoch": 3.4646182495344506, + "grad_norm": 1.771569848060608, + "learning_rate": 1.948699168110254e-06, + "loss": 0.933, + "step": 3721 + }, + { + "epoch": 3.4655493482309123, + "grad_norm": 1.7227600812911987, + "learning_rate": 1.947491695517571e-06, + "loss": 0.9264, + "step": 3722 + }, + { + "epoch": 3.4664804469273744, + "grad_norm": 1.6653895378112793, + "learning_rate": 1.9462843584164333e-06, + "loss": 0.8951, + "step": 3723 + }, + { + "epoch": 3.467411545623836, + "grad_norm": 1.6405330896377563, + "learning_rate": 1.945077157102916e-06, + "loss": 0.8724, + "step": 3724 + }, + { + "epoch": 3.468342644320298, + "grad_norm": 1.77916419506073, + "learning_rate": 1.9438700918730624e-06, + "loss": 0.9016, + "step": 3725 + }, + { + "epoch": 3.46927374301676, + "grad_norm": 1.7160097360610962, + "learning_rate": 1.942663163022881e-06, + "loss": 0.8992, + "step": 3726 + }, + { + "epoch": 3.4702048417132216, + "grad_norm": 1.7439091205596924, + "learning_rate": 1.941456370848344e-06, + "loss": 0.9197, + "step": 3727 + }, + { + "epoch": 3.4711359404096833, + "grad_norm": 1.7196481227874756, + "learning_rate": 1.940249715645396e-06, + "loss": 0.8867, + "step": 3728 + }, + { + "epoch": 3.472067039106145, + "grad_norm": 1.7669312953948975, + "learning_rate": 1.9390431977099444e-06, + "loss": 0.9277, + "step": 3729 + }, + { + "epoch": 3.472998137802607, + "grad_norm": 1.6947247982025146, + "learning_rate": 1.937836817337862e-06, + "loss": 0.8697, + "step": 3730 + }, + { + "epoch": 3.473929236499069, + "grad_norm": 1.7258371114730835, + "learning_rate": 1.9366305748249893e-06, + "loss": 0.9306, + "step": 3731 + }, + { + "epoch": 3.4748603351955305, + "grad_norm": 1.6535981893539429, + "learning_rate": 1.935424470467135e-06, + "loss": 0.8931, + "step": 3732 + }, + { + "epoch": 3.4757914338919926, + "grad_norm": 1.7044140100479126, + "learning_rate": 1.934218504560071e-06, + "loss": 0.9179, + "step": 3733 + }, + { + "epoch": 3.4767225325884543, + "grad_norm": 1.7087328433990479, + "learning_rate": 1.933012677399535e-06, + "loss": 0.9154, + "step": 3734 + }, + { + "epoch": 3.477653631284916, + "grad_norm": 1.8075371980667114, + "learning_rate": 1.9318069892812333e-06, + "loss": 0.947, + "step": 3735 + }, + { + "epoch": 3.478584729981378, + "grad_norm": 1.7017734050750732, + "learning_rate": 1.9306014405008365e-06, + "loss": 0.9186, + "step": 3736 + }, + { + "epoch": 3.47951582867784, + "grad_norm": 1.7372357845306396, + "learning_rate": 1.929396031353981e-06, + "loss": 0.9109, + "step": 3737 + }, + { + "epoch": 3.4804469273743015, + "grad_norm": 1.7136497497558594, + "learning_rate": 1.928190762136268e-06, + "loss": 0.8936, + "step": 3738 + }, + { + "epoch": 3.4813780260707636, + "grad_norm": 1.7523882389068604, + "learning_rate": 1.926985633143267e-06, + "loss": 0.912, + "step": 3739 + }, + { + "epoch": 3.4823091247672253, + "grad_norm": 1.7151979207992554, + "learning_rate": 1.9257806446705116e-06, + "loss": 0.9324, + "step": 3740 + }, + { + "epoch": 3.483240223463687, + "grad_norm": 1.8397040367126465, + "learning_rate": 1.9245757970135e-06, + "loss": 0.9392, + "step": 3741 + }, + { + "epoch": 3.484171322160149, + "grad_norm": 1.7208216190338135, + "learning_rate": 1.9233710904676973e-06, + "loss": 0.9126, + "step": 3742 + }, + { + "epoch": 3.485102420856611, + "grad_norm": 1.6682169437408447, + "learning_rate": 1.9221665253285344e-06, + "loss": 0.8867, + "step": 3743 + }, + { + "epoch": 3.4860335195530725, + "grad_norm": 1.6920920610427856, + "learning_rate": 1.9209621018914056e-06, + "loss": 0.8853, + "step": 3744 + }, + { + "epoch": 3.4869646182495346, + "grad_norm": 1.6885992288589478, + "learning_rate": 1.9197578204516707e-06, + "loss": 0.8793, + "step": 3745 + }, + { + "epoch": 3.4878957169459963, + "grad_norm": 1.6868942975997925, + "learning_rate": 1.918553681304657e-06, + "loss": 0.8969, + "step": 3746 + }, + { + "epoch": 3.488826815642458, + "grad_norm": 1.6382404565811157, + "learning_rate": 1.9173496847456567e-06, + "loss": 0.8694, + "step": 3747 + }, + { + "epoch": 3.48975791433892, + "grad_norm": 1.7104731798171997, + "learning_rate": 1.9161458310699227e-06, + "loss": 0.9172, + "step": 3748 + }, + { + "epoch": 3.490689013035382, + "grad_norm": 1.7463269233703613, + "learning_rate": 1.914942120572677e-06, + "loss": 0.9552, + "step": 3749 + }, + { + "epoch": 3.4916201117318435, + "grad_norm": 1.7733818292617798, + "learning_rate": 1.9137385535491064e-06, + "loss": 0.921, + "step": 3750 + }, + { + "epoch": 3.4925512104283056, + "grad_norm": 1.7402420043945312, + "learning_rate": 1.91253513029436e-06, + "loss": 0.9041, + "step": 3751 + }, + { + "epoch": 3.4934823091247673, + "grad_norm": 1.78652024269104, + "learning_rate": 1.9113318511035543e-06, + "loss": 0.9058, + "step": 3752 + }, + { + "epoch": 3.494413407821229, + "grad_norm": 1.7097171545028687, + "learning_rate": 1.9101287162717694e-06, + "loss": 0.9026, + "step": 3753 + }, + { + "epoch": 3.4953445065176907, + "grad_norm": 1.7454807758331299, + "learning_rate": 1.908925726094048e-06, + "loss": 0.9257, + "step": 3754 + }, + { + "epoch": 3.496275605214153, + "grad_norm": 1.700960397720337, + "learning_rate": 1.9077228808654012e-06, + "loss": 0.9014, + "step": 3755 + }, + { + "epoch": 3.4972067039106145, + "grad_norm": 1.694154977798462, + "learning_rate": 1.9065201808808018e-06, + "loss": 0.9043, + "step": 3756 + }, + { + "epoch": 3.498137802607076, + "grad_norm": 1.7099239826202393, + "learning_rate": 1.9053176264351894e-06, + "loss": 0.8785, + "step": 3757 + }, + { + "epoch": 3.4990689013035383, + "grad_norm": 1.8350014686584473, + "learning_rate": 1.9041152178234631e-06, + "loss": 0.9105, + "step": 3758 + }, + { + "epoch": 3.5, + "grad_norm": 1.7365843057632446, + "learning_rate": 1.9029129553404921e-06, + "loss": 0.8887, + "step": 3759 + }, + { + "epoch": 3.5009310986964617, + "grad_norm": 1.7611134052276611, + "learning_rate": 1.9017108392811065e-06, + "loss": 0.9398, + "step": 3760 + }, + { + "epoch": 3.501862197392924, + "grad_norm": 1.701502799987793, + "learning_rate": 1.9005088699400998e-06, + "loss": 0.9134, + "step": 3761 + }, + { + "epoch": 3.5027932960893855, + "grad_norm": 1.7850090265274048, + "learning_rate": 1.8993070476122318e-06, + "loss": 0.9112, + "step": 3762 + }, + { + "epoch": 3.503724394785847, + "grad_norm": 1.804706335067749, + "learning_rate": 1.8981053725922258e-06, + "loss": 0.9187, + "step": 3763 + }, + { + "epoch": 3.504655493482309, + "grad_norm": 1.6850451231002808, + "learning_rate": 1.8969038451747682e-06, + "loss": 0.8729, + "step": 3764 + }, + { + "epoch": 3.505586592178771, + "grad_norm": 1.7206321954727173, + "learning_rate": 1.895702465654508e-06, + "loss": 0.8743, + "step": 3765 + }, + { + "epoch": 3.5065176908752327, + "grad_norm": 1.7220323085784912, + "learning_rate": 1.8945012343260605e-06, + "loss": 0.9265, + "step": 3766 + }, + { + "epoch": 3.5074487895716944, + "grad_norm": 1.797593355178833, + "learning_rate": 1.8933001514840043e-06, + "loss": 0.9193, + "step": 3767 + }, + { + "epoch": 3.5083798882681565, + "grad_norm": 1.7038910388946533, + "learning_rate": 1.8920992174228792e-06, + "loss": 0.877, + "step": 3768 + }, + { + "epoch": 3.509310986964618, + "grad_norm": 1.7656110525131226, + "learning_rate": 1.890898432437191e-06, + "loss": 0.9062, + "step": 3769 + }, + { + "epoch": 3.51024208566108, + "grad_norm": 1.7884465456008911, + "learning_rate": 1.8896977968214078e-06, + "loss": 0.8961, + "step": 3770 + }, + { + "epoch": 3.511173184357542, + "grad_norm": 1.7180119752883911, + "learning_rate": 1.8884973108699623e-06, + "loss": 0.8757, + "step": 3771 + }, + { + "epoch": 3.5121042830540037, + "grad_norm": 1.7336595058441162, + "learning_rate": 1.8872969748772474e-06, + "loss": 0.908, + "step": 3772 + }, + { + "epoch": 3.5130353817504654, + "grad_norm": 1.7699941396713257, + "learning_rate": 1.886096789137623e-06, + "loss": 0.8661, + "step": 3773 + }, + { + "epoch": 3.5139664804469275, + "grad_norm": 1.726975917816162, + "learning_rate": 1.8848967539454109e-06, + "loss": 0.8995, + "step": 3774 + }, + { + "epoch": 3.514897579143389, + "grad_norm": 1.6452610492706299, + "learning_rate": 1.8836968695948944e-06, + "loss": 0.8486, + "step": 3775 + }, + { + "epoch": 3.515828677839851, + "grad_norm": 1.7326273918151855, + "learning_rate": 1.8824971363803205e-06, + "loss": 0.8939, + "step": 3776 + }, + { + "epoch": 3.516759776536313, + "grad_norm": 1.7196978330612183, + "learning_rate": 1.8812975545959011e-06, + "loss": 0.8934, + "step": 3777 + }, + { + "epoch": 3.5176908752327747, + "grad_norm": 1.7697244882583618, + "learning_rate": 1.8800981245358097e-06, + "loss": 0.9103, + "step": 3778 + }, + { + "epoch": 3.5186219739292364, + "grad_norm": 1.7580496072769165, + "learning_rate": 1.8788988464941804e-06, + "loss": 0.943, + "step": 3779 + }, + { + "epoch": 3.5195530726256985, + "grad_norm": 1.7845139503479004, + "learning_rate": 1.8776997207651137e-06, + "loss": 0.9133, + "step": 3780 + }, + { + "epoch": 3.52048417132216, + "grad_norm": 1.724279522895813, + "learning_rate": 1.8765007476426714e-06, + "loss": 0.9305, + "step": 3781 + }, + { + "epoch": 3.521415270018622, + "grad_norm": 1.751499891281128, + "learning_rate": 1.8753019274208762e-06, + "loss": 0.9397, + "step": 3782 + }, + { + "epoch": 3.522346368715084, + "grad_norm": 1.72331702709198, + "learning_rate": 1.8741032603937142e-06, + "loss": 0.924, + "step": 3783 + }, + { + "epoch": 3.5232774674115457, + "grad_norm": 1.779765248298645, + "learning_rate": 1.8729047468551365e-06, + "loss": 0.9207, + "step": 3784 + }, + { + "epoch": 3.5242085661080074, + "grad_norm": 1.6782563924789429, + "learning_rate": 1.8717063870990535e-06, + "loss": 0.871, + "step": 3785 + }, + { + "epoch": 3.5251396648044695, + "grad_norm": 1.6838434934616089, + "learning_rate": 1.8705081814193381e-06, + "loss": 0.9243, + "step": 3786 + }, + { + "epoch": 3.526070763500931, + "grad_norm": 1.7279614210128784, + "learning_rate": 1.869310130109826e-06, + "loss": 0.9378, + "step": 3787 + }, + { + "epoch": 3.527001862197393, + "grad_norm": 1.7303756475448608, + "learning_rate": 1.868112233464317e-06, + "loss": 0.9089, + "step": 3788 + }, + { + "epoch": 3.527932960893855, + "grad_norm": 1.6787052154541016, + "learning_rate": 1.8669144917765694e-06, + "loss": 0.8852, + "step": 3789 + }, + { + "epoch": 3.5288640595903167, + "grad_norm": 1.7224739789962769, + "learning_rate": 1.8657169053403052e-06, + "loss": 0.9187, + "step": 3790 + }, + { + "epoch": 3.5297951582867784, + "grad_norm": 1.7588502168655396, + "learning_rate": 1.8645194744492106e-06, + "loss": 0.931, + "step": 3791 + }, + { + "epoch": 3.5307262569832405, + "grad_norm": 1.7237749099731445, + "learning_rate": 1.8633221993969285e-06, + "loss": 0.9052, + "step": 3792 + }, + { + "epoch": 3.531657355679702, + "grad_norm": 1.775160789489746, + "learning_rate": 1.8621250804770683e-06, + "loss": 0.9254, + "step": 3793 + }, + { + "epoch": 3.532588454376164, + "grad_norm": 1.705663800239563, + "learning_rate": 1.8609281179831984e-06, + "loss": 0.9256, + "step": 3794 + }, + { + "epoch": 3.5335195530726256, + "grad_norm": 1.7833622694015503, + "learning_rate": 1.8597313122088513e-06, + "loss": 0.8922, + "step": 3795 + }, + { + "epoch": 3.5344506517690877, + "grad_norm": 1.7586307525634766, + "learning_rate": 1.8585346634475177e-06, + "loss": 0.8932, + "step": 3796 + }, + { + "epoch": 3.5353817504655494, + "grad_norm": 1.7682034969329834, + "learning_rate": 1.857338171992652e-06, + "loss": 0.9043, + "step": 3797 + }, + { + "epoch": 3.536312849162011, + "grad_norm": 1.7299585342407227, + "learning_rate": 1.8561418381376717e-06, + "loss": 0.8587, + "step": 3798 + }, + { + "epoch": 3.5372439478584727, + "grad_norm": 1.7286533117294312, + "learning_rate": 1.8549456621759506e-06, + "loss": 0.9318, + "step": 3799 + }, + { + "epoch": 3.538175046554935, + "grad_norm": 1.761705994606018, + "learning_rate": 1.8537496444008285e-06, + "loss": 0.9185, + "step": 3800 + }, + { + "epoch": 3.5391061452513966, + "grad_norm": 1.7503806352615356, + "learning_rate": 1.852553785105604e-06, + "loss": 0.9316, + "step": 3801 + }, + { + "epoch": 3.5400372439478582, + "grad_norm": 1.7258548736572266, + "learning_rate": 1.8513580845835387e-06, + "loss": 0.918, + "step": 3802 + }, + { + "epoch": 3.5409683426443204, + "grad_norm": 1.762282371520996, + "learning_rate": 1.8501625431278533e-06, + "loss": 0.9176, + "step": 3803 + }, + { + "epoch": 3.541899441340782, + "grad_norm": 1.7405894994735718, + "learning_rate": 1.84896716103173e-06, + "loss": 0.9356, + "step": 3804 + }, + { + "epoch": 3.5428305400372437, + "grad_norm": 1.7192190885543823, + "learning_rate": 1.847771938588313e-06, + "loss": 0.9344, + "step": 3805 + }, + { + "epoch": 3.543761638733706, + "grad_norm": 1.7265293598175049, + "learning_rate": 1.846576876090705e-06, + "loss": 0.8953, + "step": 3806 + }, + { + "epoch": 3.5446927374301676, + "grad_norm": 1.725490689277649, + "learning_rate": 1.8453819738319728e-06, + "loss": 0.9133, + "step": 3807 + }, + { + "epoch": 3.5456238361266292, + "grad_norm": 1.6897168159484863, + "learning_rate": 1.8441872321051406e-06, + "loss": 0.9312, + "step": 3808 + }, + { + "epoch": 3.5465549348230914, + "grad_norm": 1.7965730428695679, + "learning_rate": 1.8429926512031976e-06, + "loss": 0.9131, + "step": 3809 + }, + { + "epoch": 3.547486033519553, + "grad_norm": 1.733558177947998, + "learning_rate": 1.8417982314190868e-06, + "loss": 0.9284, + "step": 3810 + }, + { + "epoch": 3.5484171322160147, + "grad_norm": 1.709241509437561, + "learning_rate": 1.8406039730457185e-06, + "loss": 0.8946, + "step": 3811 + }, + { + "epoch": 3.549348230912477, + "grad_norm": 1.7721717357635498, + "learning_rate": 1.83940987637596e-06, + "loss": 0.9202, + "step": 3812 + }, + { + "epoch": 3.5502793296089385, + "grad_norm": 1.756757140159607, + "learning_rate": 1.838215941702638e-06, + "loss": 0.8986, + "step": 3813 + }, + { + "epoch": 3.5512104283054002, + "grad_norm": 1.7835568189620972, + "learning_rate": 1.8370221693185424e-06, + "loss": 0.9355, + "step": 3814 + }, + { + "epoch": 3.5521415270018624, + "grad_norm": 1.786633849143982, + "learning_rate": 1.8358285595164216e-06, + "loss": 0.9102, + "step": 3815 + }, + { + "epoch": 3.553072625698324, + "grad_norm": 1.702560305595398, + "learning_rate": 1.8346351125889849e-06, + "loss": 0.9205, + "step": 3816 + }, + { + "epoch": 3.5540037243947857, + "grad_norm": 1.6461273431777954, + "learning_rate": 1.8334418288288995e-06, + "loss": 0.8601, + "step": 3817 + }, + { + "epoch": 3.554934823091248, + "grad_norm": 1.731619954109192, + "learning_rate": 1.8322487085287953e-06, + "loss": 0.9504, + "step": 3818 + }, + { + "epoch": 3.5558659217877095, + "grad_norm": 1.7459062337875366, + "learning_rate": 1.831055751981262e-06, + "loss": 0.9443, + "step": 3819 + }, + { + "epoch": 3.5567970204841712, + "grad_norm": 1.8265029191970825, + "learning_rate": 1.829862959478847e-06, + "loss": 0.9443, + "step": 3820 + }, + { + "epoch": 3.5577281191806334, + "grad_norm": 1.7825050354003906, + "learning_rate": 1.828670331314058e-06, + "loss": 0.8552, + "step": 3821 + }, + { + "epoch": 3.558659217877095, + "grad_norm": 1.7627006769180298, + "learning_rate": 1.8274778677793653e-06, + "loss": 0.9402, + "step": 3822 + }, + { + "epoch": 3.5595903165735567, + "grad_norm": 1.7195544242858887, + "learning_rate": 1.8262855691671944e-06, + "loss": 0.9229, + "step": 3823 + }, + { + "epoch": 3.560521415270019, + "grad_norm": 1.7175885438919067, + "learning_rate": 1.825093435769933e-06, + "loss": 0.916, + "step": 3824 + }, + { + "epoch": 3.5614525139664805, + "grad_norm": 1.7534878253936768, + "learning_rate": 1.823901467879929e-06, + "loss": 0.9417, + "step": 3825 + }, + { + "epoch": 3.5623836126629422, + "grad_norm": 1.7108118534088135, + "learning_rate": 1.8227096657894878e-06, + "loss": 0.8908, + "step": 3826 + }, + { + "epoch": 3.5633147113594044, + "grad_norm": 1.6790822744369507, + "learning_rate": 1.8215180297908746e-06, + "loss": 0.8999, + "step": 3827 + }, + { + "epoch": 3.564245810055866, + "grad_norm": 1.7114630937576294, + "learning_rate": 1.8203265601763137e-06, + "loss": 0.8944, + "step": 3828 + }, + { + "epoch": 3.5651769087523277, + "grad_norm": 1.6899574995040894, + "learning_rate": 1.819135257237991e-06, + "loss": 0.91, + "step": 3829 + }, + { + "epoch": 3.5661080074487894, + "grad_norm": 1.6954869031906128, + "learning_rate": 1.8179441212680479e-06, + "loss": 0.874, + "step": 3830 + }, + { + "epoch": 3.5670391061452515, + "grad_norm": 1.6966147422790527, + "learning_rate": 1.8167531525585863e-06, + "loss": 0.9008, + "step": 3831 + }, + { + "epoch": 3.5679702048417132, + "grad_norm": 1.714116096496582, + "learning_rate": 1.8155623514016685e-06, + "loss": 0.9516, + "step": 3832 + }, + { + "epoch": 3.568901303538175, + "grad_norm": 1.796891212463379, + "learning_rate": 1.8143717180893144e-06, + "loss": 0.926, + "step": 3833 + }, + { + "epoch": 3.5698324022346366, + "grad_norm": 1.7023240327835083, + "learning_rate": 1.8131812529135024e-06, + "loss": 0.9359, + "step": 3834 + }, + { + "epoch": 3.5707635009310987, + "grad_norm": 1.7283834218978882, + "learning_rate": 1.8119909561661692e-06, + "loss": 0.921, + "step": 3835 + }, + { + "epoch": 3.5716945996275604, + "grad_norm": 1.807257890701294, + "learning_rate": 1.8108008281392136e-06, + "loss": 0.9173, + "step": 3836 + }, + { + "epoch": 3.572625698324022, + "grad_norm": 1.7010875940322876, + "learning_rate": 1.8096108691244884e-06, + "loss": 0.8992, + "step": 3837 + }, + { + "epoch": 3.5735567970204842, + "grad_norm": 1.81072998046875, + "learning_rate": 1.8084210794138076e-06, + "loss": 0.9329, + "step": 3838 + }, + { + "epoch": 3.574487895716946, + "grad_norm": 1.6628484725952148, + "learning_rate": 1.8072314592989432e-06, + "loss": 0.8697, + "step": 3839 + }, + { + "epoch": 3.5754189944134076, + "grad_norm": 1.7389262914657593, + "learning_rate": 1.8060420090716266e-06, + "loss": 0.9095, + "step": 3840 + }, + { + "epoch": 3.5763500931098697, + "grad_norm": 1.686619758605957, + "learning_rate": 1.8048527290235452e-06, + "loss": 0.884, + "step": 3841 + }, + { + "epoch": 3.5772811918063314, + "grad_norm": 1.6599712371826172, + "learning_rate": 1.8036636194463462e-06, + "loss": 0.8856, + "step": 3842 + }, + { + "epoch": 3.578212290502793, + "grad_norm": 1.774749517440796, + "learning_rate": 1.8024746806316369e-06, + "loss": 0.8936, + "step": 3843 + }, + { + "epoch": 3.5791433891992552, + "grad_norm": 1.7699576616287231, + "learning_rate": 1.8012859128709766e-06, + "loss": 0.9504, + "step": 3844 + }, + { + "epoch": 3.580074487895717, + "grad_norm": 1.7629735469818115, + "learning_rate": 1.80009731645589e-06, + "loss": 0.8941, + "step": 3845 + }, + { + "epoch": 3.5810055865921786, + "grad_norm": 1.768296241760254, + "learning_rate": 1.7989088916778546e-06, + "loss": 0.9206, + "step": 3846 + }, + { + "epoch": 3.5819366852886407, + "grad_norm": 1.7568581104278564, + "learning_rate": 1.7977206388283098e-06, + "loss": 0.8989, + "step": 3847 + }, + { + "epoch": 3.5828677839851024, + "grad_norm": 1.7615503072738647, + "learning_rate": 1.7965325581986487e-06, + "loss": 0.9256, + "step": 3848 + }, + { + "epoch": 3.583798882681564, + "grad_norm": 1.723462700843811, + "learning_rate": 1.7953446500802246e-06, + "loss": 0.8967, + "step": 3849 + }, + { + "epoch": 3.5847299813780262, + "grad_norm": 1.7438347339630127, + "learning_rate": 1.7941569147643493e-06, + "loss": 0.9299, + "step": 3850 + }, + { + "epoch": 3.585661080074488, + "grad_norm": 1.6896212100982666, + "learning_rate": 1.7929693525422887e-06, + "loss": 0.8743, + "step": 3851 + }, + { + "epoch": 3.5865921787709496, + "grad_norm": 1.7458972930908203, + "learning_rate": 1.7917819637052702e-06, + "loss": 0.8805, + "step": 3852 + }, + { + "epoch": 3.5875232774674117, + "grad_norm": 1.7452510595321655, + "learning_rate": 1.7905947485444775e-06, + "loss": 0.9088, + "step": 3853 + }, + { + "epoch": 3.5884543761638734, + "grad_norm": 1.7439093589782715, + "learning_rate": 1.7894077073510497e-06, + "loss": 0.8998, + "step": 3854 + }, + { + "epoch": 3.589385474860335, + "grad_norm": 1.7456589937210083, + "learning_rate": 1.788220840416085e-06, + "loss": 0.9257, + "step": 3855 + }, + { + "epoch": 3.5903165735567972, + "grad_norm": 1.722548246383667, + "learning_rate": 1.7870341480306397e-06, + "loss": 0.8935, + "step": 3856 + }, + { + "epoch": 3.591247672253259, + "grad_norm": 1.8748631477355957, + "learning_rate": 1.7858476304857259e-06, + "loss": 0.9039, + "step": 3857 + }, + { + "epoch": 3.5921787709497206, + "grad_norm": 1.6816240549087524, + "learning_rate": 1.7846612880723118e-06, + "loss": 0.896, + "step": 3858 + }, + { + "epoch": 3.5931098696461827, + "grad_norm": 1.692328691482544, + "learning_rate": 1.7834751210813262e-06, + "loss": 0.8662, + "step": 3859 + }, + { + "epoch": 3.5940409683426444, + "grad_norm": 1.7272828817367554, + "learning_rate": 1.7822891298036514e-06, + "loss": 0.9349, + "step": 3860 + }, + { + "epoch": 3.594972067039106, + "grad_norm": 1.672918677330017, + "learning_rate": 1.7811033145301282e-06, + "loss": 0.925, + "step": 3861 + }, + { + "epoch": 3.5959031657355682, + "grad_norm": 1.824395775794983, + "learning_rate": 1.7799176755515529e-06, + "loss": 0.9268, + "step": 3862 + }, + { + "epoch": 3.59683426443203, + "grad_norm": 1.709234595298767, + "learning_rate": 1.7787322131586815e-06, + "loss": 0.8982, + "step": 3863 + }, + { + "epoch": 3.5977653631284916, + "grad_norm": 1.7000312805175781, + "learning_rate": 1.7775469276422238e-06, + "loss": 0.9213, + "step": 3864 + }, + { + "epoch": 3.5986964618249533, + "grad_norm": 1.7053449153900146, + "learning_rate": 1.7763618192928468e-06, + "loss": 0.8863, + "step": 3865 + }, + { + "epoch": 3.5996275605214154, + "grad_norm": 1.7321999073028564, + "learning_rate": 1.775176888401175e-06, + "loss": 0.9507, + "step": 3866 + }, + { + "epoch": 3.600558659217877, + "grad_norm": 1.754015564918518, + "learning_rate": 1.7739921352577894e-06, + "loss": 0.8876, + "step": 3867 + }, + { + "epoch": 3.601489757914339, + "grad_norm": 1.6962555646896362, + "learning_rate": 1.7728075601532258e-06, + "loss": 0.9022, + "step": 3868 + }, + { + "epoch": 3.6024208566108005, + "grad_norm": 1.7329515218734741, + "learning_rate": 1.7716231633779774e-06, + "loss": 0.8734, + "step": 3869 + }, + { + "epoch": 3.6033519553072626, + "grad_norm": 1.7151148319244385, + "learning_rate": 1.7704389452224945e-06, + "loss": 0.9207, + "step": 3870 + }, + { + "epoch": 3.6042830540037243, + "grad_norm": 1.7513630390167236, + "learning_rate": 1.7692549059771835e-06, + "loss": 0.9082, + "step": 3871 + }, + { + "epoch": 3.605214152700186, + "grad_norm": 1.7094073295593262, + "learning_rate": 1.7680710459324043e-06, + "loss": 0.8941, + "step": 3872 + }, + { + "epoch": 3.606145251396648, + "grad_norm": 1.750381350517273, + "learning_rate": 1.766887365378475e-06, + "loss": 0.9715, + "step": 3873 + }, + { + "epoch": 3.60707635009311, + "grad_norm": 1.787935733795166, + "learning_rate": 1.7657038646056713e-06, + "loss": 0.8934, + "step": 3874 + }, + { + "epoch": 3.6080074487895715, + "grad_norm": 1.6999529600143433, + "learning_rate": 1.7645205439042213e-06, + "loss": 0.8942, + "step": 3875 + }, + { + "epoch": 3.6089385474860336, + "grad_norm": 1.8665771484375, + "learning_rate": 1.7633374035643103e-06, + "loss": 0.925, + "step": 3876 + }, + { + "epoch": 3.6098696461824953, + "grad_norm": 1.785208821296692, + "learning_rate": 1.7621544438760807e-06, + "loss": 0.9325, + "step": 3877 + }, + { + "epoch": 3.610800744878957, + "grad_norm": 1.6994990110397339, + "learning_rate": 1.76097166512963e-06, + "loss": 0.8469, + "step": 3878 + }, + { + "epoch": 3.611731843575419, + "grad_norm": 1.7353748083114624, + "learning_rate": 1.7597890676150096e-06, + "loss": 0.9276, + "step": 3879 + }, + { + "epoch": 3.612662942271881, + "grad_norm": 1.7782478332519531, + "learning_rate": 1.7586066516222277e-06, + "loss": 0.9373, + "step": 3880 + }, + { + "epoch": 3.6135940409683425, + "grad_norm": 2.06430983543396, + "learning_rate": 1.7574244174412498e-06, + "loss": 0.9152, + "step": 3881 + }, + { + "epoch": 3.6145251396648046, + "grad_norm": 1.729926586151123, + "learning_rate": 1.7562423653619931e-06, + "loss": 0.9395, + "step": 3882 + }, + { + "epoch": 3.6154562383612663, + "grad_norm": 1.798756718635559, + "learning_rate": 1.7550604956743328e-06, + "loss": 0.9276, + "step": 3883 + }, + { + "epoch": 3.616387337057728, + "grad_norm": 1.7765427827835083, + "learning_rate": 1.7538788086680994e-06, + "loss": 0.892, + "step": 3884 + }, + { + "epoch": 3.61731843575419, + "grad_norm": 1.7441920042037964, + "learning_rate": 1.752697304633076e-06, + "loss": 0.8823, + "step": 3885 + }, + { + "epoch": 3.618249534450652, + "grad_norm": 1.6643478870391846, + "learning_rate": 1.7515159838590046e-06, + "loss": 0.8948, + "step": 3886 + }, + { + "epoch": 3.6191806331471135, + "grad_norm": 1.7136600017547607, + "learning_rate": 1.750334846635579e-06, + "loss": 0.9205, + "step": 3887 + }, + { + "epoch": 3.6201117318435756, + "grad_norm": 1.6596384048461914, + "learning_rate": 1.7491538932524514e-06, + "loss": 0.872, + "step": 3888 + }, + { + "epoch": 3.6210428305400373, + "grad_norm": 1.7417312860488892, + "learning_rate": 1.747973123999224e-06, + "loss": 0.929, + "step": 3889 + }, + { + "epoch": 3.621973929236499, + "grad_norm": 1.6772305965423584, + "learning_rate": 1.7467925391654587e-06, + "loss": 0.9075, + "step": 3890 + }, + { + "epoch": 3.622905027932961, + "grad_norm": 1.7425562143325806, + "learning_rate": 1.74561213904067e-06, + "loss": 0.8965, + "step": 3891 + }, + { + "epoch": 3.623836126629423, + "grad_norm": 1.7270545959472656, + "learning_rate": 1.744431923914326e-06, + "loss": 0.8976, + "step": 3892 + }, + { + "epoch": 3.6247672253258845, + "grad_norm": 1.7174123525619507, + "learning_rate": 1.7432518940758519e-06, + "loss": 0.9245, + "step": 3893 + }, + { + "epoch": 3.6256983240223466, + "grad_norm": 1.7063301801681519, + "learning_rate": 1.7420720498146254e-06, + "loss": 0.9172, + "step": 3894 + }, + { + "epoch": 3.6266294227188083, + "grad_norm": 1.698531150817871, + "learning_rate": 1.7408923914199818e-06, + "loss": 0.8942, + "step": 3895 + }, + { + "epoch": 3.62756052141527, + "grad_norm": 1.7554367780685425, + "learning_rate": 1.7397129191812058e-06, + "loss": 0.8929, + "step": 3896 + }, + { + "epoch": 3.628491620111732, + "grad_norm": 1.718575119972229, + "learning_rate": 1.7385336333875403e-06, + "loss": 0.9067, + "step": 3897 + }, + { + "epoch": 3.629422718808194, + "grad_norm": 1.8120148181915283, + "learning_rate": 1.7373545343281822e-06, + "loss": 0.9292, + "step": 3898 + }, + { + "epoch": 3.6303538175046555, + "grad_norm": 1.76980721950531, + "learning_rate": 1.7361756222922798e-06, + "loss": 0.9431, + "step": 3899 + }, + { + "epoch": 3.631284916201117, + "grad_norm": 1.8019706010818481, + "learning_rate": 1.73499689756894e-06, + "loss": 0.9483, + "step": 3900 + }, + { + "epoch": 3.6322160148975793, + "grad_norm": 1.709527850151062, + "learning_rate": 1.7338183604472198e-06, + "loss": 0.9054, + "step": 3901 + }, + { + "epoch": 3.633147113594041, + "grad_norm": 1.767380952835083, + "learning_rate": 1.7326400112161329e-06, + "loss": 0.9074, + "step": 3902 + }, + { + "epoch": 3.6340782122905027, + "grad_norm": 1.7089146375656128, + "learning_rate": 1.7314618501646435e-06, + "loss": 0.9118, + "step": 3903 + }, + { + "epoch": 3.635009310986965, + "grad_norm": 1.8244383335113525, + "learning_rate": 1.730283877581674e-06, + "loss": 0.9024, + "step": 3904 + }, + { + "epoch": 3.6359404096834265, + "grad_norm": 1.777852177619934, + "learning_rate": 1.7291060937560985e-06, + "loss": 0.9283, + "step": 3905 + }, + { + "epoch": 3.636871508379888, + "grad_norm": 1.771607756614685, + "learning_rate": 1.7279284989767436e-06, + "loss": 0.9338, + "step": 3906 + }, + { + "epoch": 3.63780260707635, + "grad_norm": 1.7060426473617554, + "learning_rate": 1.7267510935323906e-06, + "loss": 0.9047, + "step": 3907 + }, + { + "epoch": 3.638733705772812, + "grad_norm": 1.7086195945739746, + "learning_rate": 1.7255738777117758e-06, + "loss": 0.9195, + "step": 3908 + }, + { + "epoch": 3.6396648044692737, + "grad_norm": 1.6890431642532349, + "learning_rate": 1.7243968518035874e-06, + "loss": 0.9024, + "step": 3909 + }, + { + "epoch": 3.6405959031657353, + "grad_norm": 1.7672970294952393, + "learning_rate": 1.7232200160964657e-06, + "loss": 0.967, + "step": 3910 + }, + { + "epoch": 3.6415270018621975, + "grad_norm": 1.6634522676467896, + "learning_rate": 1.7220433708790082e-06, + "loss": 0.8917, + "step": 3911 + }, + { + "epoch": 3.642458100558659, + "grad_norm": 1.7099052667617798, + "learning_rate": 1.720866916439763e-06, + "loss": 0.8967, + "step": 3912 + }, + { + "epoch": 3.643389199255121, + "grad_norm": 1.7886919975280762, + "learning_rate": 1.7196906530672306e-06, + "loss": 0.8804, + "step": 3913 + }, + { + "epoch": 3.644320297951583, + "grad_norm": 2.3211326599121094, + "learning_rate": 1.7185145810498663e-06, + "loss": 0.8589, + "step": 3914 + }, + { + "epoch": 3.6452513966480447, + "grad_norm": 1.695995807647705, + "learning_rate": 1.7173387006760789e-06, + "loss": 0.9519, + "step": 3915 + }, + { + "epoch": 3.6461824953445063, + "grad_norm": 1.670315146446228, + "learning_rate": 1.7161630122342299e-06, + "loss": 0.8887, + "step": 3916 + }, + { + "epoch": 3.6471135940409685, + "grad_norm": 1.7106984853744507, + "learning_rate": 1.7149875160126315e-06, + "loss": 0.8978, + "step": 3917 + }, + { + "epoch": 3.64804469273743, + "grad_norm": 1.7007761001586914, + "learning_rate": 1.713812212299551e-06, + "loss": 0.8804, + "step": 3918 + }, + { + "epoch": 3.648975791433892, + "grad_norm": 1.6932612657546997, + "learning_rate": 1.7126371013832093e-06, + "loss": 0.8936, + "step": 3919 + }, + { + "epoch": 3.649906890130354, + "grad_norm": 1.7997294664382935, + "learning_rate": 1.7114621835517771e-06, + "loss": 0.912, + "step": 3920 + }, + { + "epoch": 3.6508379888268156, + "grad_norm": 1.7106883525848389, + "learning_rate": 1.71028745909338e-06, + "loss": 0.8954, + "step": 3921 + }, + { + "epoch": 3.6517690875232773, + "grad_norm": 1.765065312385559, + "learning_rate": 1.7091129282960966e-06, + "loss": 0.9305, + "step": 3922 + }, + { + "epoch": 3.6527001862197395, + "grad_norm": 1.7304763793945312, + "learning_rate": 1.7079385914479545e-06, + "loss": 0.9079, + "step": 3923 + }, + { + "epoch": 3.653631284916201, + "grad_norm": 1.788089394569397, + "learning_rate": 1.7067644488369381e-06, + "loss": 0.9261, + "step": 3924 + }, + { + "epoch": 3.654562383612663, + "grad_norm": 1.7470579147338867, + "learning_rate": 1.705590500750981e-06, + "loss": 0.9251, + "step": 3925 + }, + { + "epoch": 3.655493482309125, + "grad_norm": 1.7565187215805054, + "learning_rate": 1.7044167474779727e-06, + "loss": 0.9067, + "step": 3926 + }, + { + "epoch": 3.6564245810055866, + "grad_norm": 1.7824771404266357, + "learning_rate": 1.70324318930575e-06, + "loss": 0.9555, + "step": 3927 + }, + { + "epoch": 3.6573556797020483, + "grad_norm": 1.664537787437439, + "learning_rate": 1.702069826522105e-06, + "loss": 0.8739, + "step": 3928 + }, + { + "epoch": 3.6582867783985105, + "grad_norm": 1.7708044052124023, + "learning_rate": 1.7008966594147833e-06, + "loss": 0.9262, + "step": 3929 + }, + { + "epoch": 3.659217877094972, + "grad_norm": 1.7169562578201294, + "learning_rate": 1.6997236882714774e-06, + "loss": 0.9157, + "step": 3930 + }, + { + "epoch": 3.660148975791434, + "grad_norm": 1.7896921634674072, + "learning_rate": 1.6985509133798367e-06, + "loss": 0.9081, + "step": 3931 + }, + { + "epoch": 3.661080074487896, + "grad_norm": 1.713206171989441, + "learning_rate": 1.6973783350274603e-06, + "loss": 0.8746, + "step": 3932 + }, + { + "epoch": 3.6620111731843576, + "grad_norm": 1.8007779121398926, + "learning_rate": 1.6962059535019001e-06, + "loss": 0.9185, + "step": 3933 + }, + { + "epoch": 3.6629422718808193, + "grad_norm": 1.7197685241699219, + "learning_rate": 1.6950337690906582e-06, + "loss": 0.917, + "step": 3934 + }, + { + "epoch": 3.6638733705772815, + "grad_norm": 1.7353636026382446, + "learning_rate": 1.6938617820811899e-06, + "loss": 0.8867, + "step": 3935 + }, + { + "epoch": 3.664804469273743, + "grad_norm": 1.6639593839645386, + "learning_rate": 1.692689992760901e-06, + "loss": 0.8644, + "step": 3936 + }, + { + "epoch": 3.665735567970205, + "grad_norm": 1.7221115827560425, + "learning_rate": 1.6915184014171484e-06, + "loss": 0.8906, + "step": 3937 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 1.6684808731079102, + "learning_rate": 1.690347008337243e-06, + "loss": 0.8812, + "step": 3938 + }, + { + "epoch": 3.6675977653631286, + "grad_norm": 1.7499772310256958, + "learning_rate": 1.6891758138084441e-06, + "loss": 0.9105, + "step": 3939 + }, + { + "epoch": 3.6685288640595903, + "grad_norm": 1.7787092924118042, + "learning_rate": 1.6880048181179653e-06, + "loss": 0.9388, + "step": 3940 + }, + { + "epoch": 3.669459962756052, + "grad_norm": 1.7139168977737427, + "learning_rate": 1.6868340215529674e-06, + "loss": 0.9015, + "step": 3941 + }, + { + "epoch": 3.6703910614525137, + "grad_norm": 1.7353609800338745, + "learning_rate": 1.6856634244005662e-06, + "loss": 0.9073, + "step": 3942 + }, + { + "epoch": 3.671322160148976, + "grad_norm": 1.727271556854248, + "learning_rate": 1.6844930269478274e-06, + "loss": 0.9021, + "step": 3943 + }, + { + "epoch": 3.6722532588454375, + "grad_norm": 1.8015776872634888, + "learning_rate": 1.6833228294817656e-06, + "loss": 0.9229, + "step": 3944 + }, + { + "epoch": 3.673184357541899, + "grad_norm": 1.7594966888427734, + "learning_rate": 1.6821528322893498e-06, + "loss": 0.9205, + "step": 3945 + }, + { + "epoch": 3.6741154562383613, + "grad_norm": 1.6897203922271729, + "learning_rate": 1.6809830356574982e-06, + "loss": 0.9033, + "step": 3946 + }, + { + "epoch": 3.675046554934823, + "grad_norm": 1.7579700946807861, + "learning_rate": 1.6798134398730798e-06, + "loss": 0.9091, + "step": 3947 + }, + { + "epoch": 3.6759776536312847, + "grad_norm": 1.8155449628829956, + "learning_rate": 1.6786440452229134e-06, + "loss": 0.8815, + "step": 3948 + }, + { + "epoch": 3.676908752327747, + "grad_norm": 1.7368659973144531, + "learning_rate": 1.6774748519937706e-06, + "loss": 0.9471, + "step": 3949 + }, + { + "epoch": 3.6778398510242085, + "grad_norm": 1.7323052883148193, + "learning_rate": 1.6763058604723725e-06, + "loss": 0.9392, + "step": 3950 + }, + { + "epoch": 3.67877094972067, + "grad_norm": 1.7457796335220337, + "learning_rate": 1.67513707094539e-06, + "loss": 0.9205, + "step": 3951 + }, + { + "epoch": 3.6797020484171323, + "grad_norm": 1.7468905448913574, + "learning_rate": 1.6739684836994458e-06, + "loss": 0.9137, + "step": 3952 + }, + { + "epoch": 3.680633147113594, + "grad_norm": 1.6871715784072876, + "learning_rate": 1.6728000990211124e-06, + "loss": 0.8508, + "step": 3953 + }, + { + "epoch": 3.6815642458100557, + "grad_norm": 1.7542946338653564, + "learning_rate": 1.6716319171969126e-06, + "loss": 0.9126, + "step": 3954 + }, + { + "epoch": 3.682495344506518, + "grad_norm": 1.7160470485687256, + "learning_rate": 1.6704639385133187e-06, + "loss": 0.8812, + "step": 3955 + }, + { + "epoch": 3.6834264432029795, + "grad_norm": 1.789633870124817, + "learning_rate": 1.669296163256755e-06, + "loss": 0.9538, + "step": 3956 + }, + { + "epoch": 3.684357541899441, + "grad_norm": 1.7571874856948853, + "learning_rate": 1.6681285917135952e-06, + "loss": 0.9509, + "step": 3957 + }, + { + "epoch": 3.6852886405959033, + "grad_norm": 1.67544424533844, + "learning_rate": 1.6669612241701622e-06, + "loss": 0.908, + "step": 3958 + }, + { + "epoch": 3.686219739292365, + "grad_norm": 1.7213146686553955, + "learning_rate": 1.665794060912728e-06, + "loss": 0.9236, + "step": 3959 + }, + { + "epoch": 3.6871508379888267, + "grad_norm": 1.732373595237732, + "learning_rate": 1.6646271022275185e-06, + "loss": 0.9216, + "step": 3960 + }, + { + "epoch": 3.688081936685289, + "grad_norm": 1.6794570684432983, + "learning_rate": 1.663460348400705e-06, + "loss": 0.9523, + "step": 3961 + }, + { + "epoch": 3.6890130353817505, + "grad_norm": 1.7191836833953857, + "learning_rate": 1.6622937997184106e-06, + "loss": 0.9, + "step": 3962 + }, + { + "epoch": 3.689944134078212, + "grad_norm": 1.7113410234451294, + "learning_rate": 1.6611274564667085e-06, + "loss": 0.9006, + "step": 3963 + }, + { + "epoch": 3.6908752327746743, + "grad_norm": 1.7398282289505005, + "learning_rate": 1.6599613189316213e-06, + "loss": 0.8882, + "step": 3964 + }, + { + "epoch": 3.691806331471136, + "grad_norm": 1.9544732570648193, + "learning_rate": 1.6587953873991198e-06, + "loss": 0.9377, + "step": 3965 + }, + { + "epoch": 3.6927374301675977, + "grad_norm": 1.8131192922592163, + "learning_rate": 1.6576296621551246e-06, + "loss": 0.9431, + "step": 3966 + }, + { + "epoch": 3.69366852886406, + "grad_norm": 1.8333956003189087, + "learning_rate": 1.656464143485509e-06, + "loss": 0.9295, + "step": 3967 + }, + { + "epoch": 3.6945996275605215, + "grad_norm": 1.7353899478912354, + "learning_rate": 1.6552988316760904e-06, + "loss": 0.8784, + "step": 3968 + }, + { + "epoch": 3.695530726256983, + "grad_norm": 1.7982414960861206, + "learning_rate": 1.654133727012639e-06, + "loss": 0.9251, + "step": 3969 + }, + { + "epoch": 3.6964618249534453, + "grad_norm": 1.6911696195602417, + "learning_rate": 1.6529688297808727e-06, + "loss": 0.9399, + "step": 3970 + }, + { + "epoch": 3.697392923649907, + "grad_norm": 1.6974493265151978, + "learning_rate": 1.651804140266461e-06, + "loss": 0.9237, + "step": 3971 + }, + { + "epoch": 3.6983240223463687, + "grad_norm": 1.7594558000564575, + "learning_rate": 1.650639658755019e-06, + "loss": 0.9622, + "step": 3972 + }, + { + "epoch": 3.6992551210428304, + "grad_norm": 1.779521107673645, + "learning_rate": 1.6494753855321116e-06, + "loss": 0.9512, + "step": 3973 + }, + { + "epoch": 3.7001862197392925, + "grad_norm": 1.7489949464797974, + "learning_rate": 1.6483113208832562e-06, + "loss": 0.9262, + "step": 3974 + }, + { + "epoch": 3.701117318435754, + "grad_norm": 1.6858429908752441, + "learning_rate": 1.647147465093913e-06, + "loss": 0.8642, + "step": 3975 + }, + { + "epoch": 3.702048417132216, + "grad_norm": 1.761678695678711, + "learning_rate": 1.6459838184494964e-06, + "loss": 0.8623, + "step": 3976 + }, + { + "epoch": 3.7029795158286776, + "grad_norm": 1.6939162015914917, + "learning_rate": 1.6448203812353657e-06, + "loss": 0.9389, + "step": 3977 + }, + { + "epoch": 3.7039106145251397, + "grad_norm": 1.7257338762283325, + "learning_rate": 1.6436571537368328e-06, + "loss": 0.9168, + "step": 3978 + }, + { + "epoch": 3.7048417132216014, + "grad_norm": 1.6770213842391968, + "learning_rate": 1.6424941362391539e-06, + "loss": 0.9101, + "step": 3979 + }, + { + "epoch": 3.705772811918063, + "grad_norm": 1.7241153717041016, + "learning_rate": 1.6413313290275357e-06, + "loss": 0.8996, + "step": 3980 + }, + { + "epoch": 3.706703910614525, + "grad_norm": 1.7888871431350708, + "learning_rate": 1.6401687323871346e-06, + "loss": 0.9858, + "step": 3981 + }, + { + "epoch": 3.707635009310987, + "grad_norm": 1.7111773490905762, + "learning_rate": 1.639006346603052e-06, + "loss": 0.9281, + "step": 3982 + }, + { + "epoch": 3.7085661080074486, + "grad_norm": 1.6771979331970215, + "learning_rate": 1.6378441719603417e-06, + "loss": 0.8948, + "step": 3983 + }, + { + "epoch": 3.7094972067039107, + "grad_norm": 1.7977564334869385, + "learning_rate": 1.6366822087440026e-06, + "loss": 0.9514, + "step": 3984 + }, + { + "epoch": 3.7104283054003724, + "grad_norm": 1.7720965147018433, + "learning_rate": 1.6355204572389832e-06, + "loss": 0.9168, + "step": 3985 + }, + { + "epoch": 3.711359404096834, + "grad_norm": 1.7549103498458862, + "learning_rate": 1.6343589177301783e-06, + "loss": 0.9071, + "step": 3986 + }, + { + "epoch": 3.712290502793296, + "grad_norm": 1.7695749998092651, + "learning_rate": 1.6331975905024341e-06, + "loss": 0.9641, + "step": 3987 + }, + { + "epoch": 3.713221601489758, + "grad_norm": 1.7933679819107056, + "learning_rate": 1.6320364758405422e-06, + "loss": 0.9461, + "step": 3988 + }, + { + "epoch": 3.7141527001862196, + "grad_norm": 1.7250888347625732, + "learning_rate": 1.6308755740292415e-06, + "loss": 0.9268, + "step": 3989 + }, + { + "epoch": 3.7150837988826817, + "grad_norm": 1.6932573318481445, + "learning_rate": 1.629714885353221e-06, + "loss": 0.8801, + "step": 3990 + }, + { + "epoch": 3.7160148975791434, + "grad_norm": 1.744909644126892, + "learning_rate": 1.6285544100971163e-06, + "loss": 0.9293, + "step": 3991 + }, + { + "epoch": 3.716945996275605, + "grad_norm": 1.7552919387817383, + "learning_rate": 1.6273941485455098e-06, + "loss": 0.9323, + "step": 3992 + }, + { + "epoch": 3.717877094972067, + "grad_norm": 1.6713666915893555, + "learning_rate": 1.6262341009829318e-06, + "loss": 0.9094, + "step": 3993 + }, + { + "epoch": 3.718808193668529, + "grad_norm": 1.679654836654663, + "learning_rate": 1.6250742676938625e-06, + "loss": 0.8952, + "step": 3994 + }, + { + "epoch": 3.7197392923649906, + "grad_norm": 1.6698956489562988, + "learning_rate": 1.6239146489627266e-06, + "loss": 0.8893, + "step": 3995 + }, + { + "epoch": 3.7206703910614527, + "grad_norm": 1.6208436489105225, + "learning_rate": 1.622755245073897e-06, + "loss": 0.8775, + "step": 3996 + }, + { + "epoch": 3.7216014897579144, + "grad_norm": 1.7489488124847412, + "learning_rate": 1.6215960563116945e-06, + "loss": 0.9083, + "step": 3997 + }, + { + "epoch": 3.722532588454376, + "grad_norm": 1.782345175743103, + "learning_rate": 1.6204370829603874e-06, + "loss": 0.8902, + "step": 3998 + }, + { + "epoch": 3.723463687150838, + "grad_norm": 1.7041456699371338, + "learning_rate": 1.6192783253041896e-06, + "loss": 0.8999, + "step": 3999 + }, + { + "epoch": 3.7243947858473, + "grad_norm": 1.7831697463989258, + "learning_rate": 1.618119783627263e-06, + "loss": 0.9237, + "step": 4000 + }, + { + "epoch": 3.7253258845437616, + "grad_norm": 1.8157907724380493, + "learning_rate": 1.6169614582137177e-06, + "loss": 0.9669, + "step": 4001 + }, + { + "epoch": 3.7262569832402237, + "grad_norm": 1.680737018585205, + "learning_rate": 1.6158033493476099e-06, + "loss": 0.9043, + "step": 4002 + }, + { + "epoch": 3.7271880819366854, + "grad_norm": 1.7769601345062256, + "learning_rate": 1.614645457312941e-06, + "loss": 0.9462, + "step": 4003 + }, + { + "epoch": 3.728119180633147, + "grad_norm": 1.732236385345459, + "learning_rate": 1.613487782393661e-06, + "loss": 0.9309, + "step": 4004 + }, + { + "epoch": 3.729050279329609, + "grad_norm": 1.6602421998977661, + "learning_rate": 1.6123303248736678e-06, + "loss": 0.8757, + "step": 4005 + }, + { + "epoch": 3.729981378026071, + "grad_norm": 1.7312138080596924, + "learning_rate": 1.6111730850368034e-06, + "loss": 0.9091, + "step": 4006 + }, + { + "epoch": 3.7309124767225326, + "grad_norm": 1.7424546480178833, + "learning_rate": 1.6100160631668572e-06, + "loss": 0.9067, + "step": 4007 + }, + { + "epoch": 3.7318435754189943, + "grad_norm": 1.6888412237167358, + "learning_rate": 1.6088592595475667e-06, + "loss": 0.8795, + "step": 4008 + }, + { + "epoch": 3.7327746741154564, + "grad_norm": 1.7270209789276123, + "learning_rate": 1.6077026744626145e-06, + "loss": 0.8942, + "step": 4009 + }, + { + "epoch": 3.733705772811918, + "grad_norm": 1.7854082584381104, + "learning_rate": 1.6065463081956293e-06, + "loss": 0.9028, + "step": 4010 + }, + { + "epoch": 3.7346368715083798, + "grad_norm": 1.7233459949493408, + "learning_rate": 1.605390161030186e-06, + "loss": 0.9115, + "step": 4011 + }, + { + "epoch": 3.7355679702048414, + "grad_norm": 1.7517027854919434, + "learning_rate": 1.6042342332498089e-06, + "loss": 0.898, + "step": 4012 + }, + { + "epoch": 3.7364990689013036, + "grad_norm": 1.7661665678024292, + "learning_rate": 1.6030785251379635e-06, + "loss": 0.9272, + "step": 4013 + }, + { + "epoch": 3.7374301675977653, + "grad_norm": 1.7863315343856812, + "learning_rate": 1.601923036978065e-06, + "loss": 0.9195, + "step": 4014 + }, + { + "epoch": 3.738361266294227, + "grad_norm": 1.6750671863555908, + "learning_rate": 1.6007677690534728e-06, + "loss": 0.9089, + "step": 4015 + }, + { + "epoch": 3.739292364990689, + "grad_norm": 1.7570219039916992, + "learning_rate": 1.5996127216474953e-06, + "loss": 0.9184, + "step": 4016 + }, + { + "epoch": 3.7402234636871508, + "grad_norm": 1.7448984384536743, + "learning_rate": 1.5984578950433823e-06, + "loss": 0.9409, + "step": 4017 + }, + { + "epoch": 3.7411545623836124, + "grad_norm": 1.6719850301742554, + "learning_rate": 1.5973032895243324e-06, + "loss": 0.9105, + "step": 4018 + }, + { + "epoch": 3.7420856610800746, + "grad_norm": 1.7397119998931885, + "learning_rate": 1.5961489053734908e-06, + "loss": 0.9321, + "step": 4019 + }, + { + "epoch": 3.7430167597765363, + "grad_norm": 1.7455973625183105, + "learning_rate": 1.5949947428739448e-06, + "loss": 0.8734, + "step": 4020 + }, + { + "epoch": 3.743947858472998, + "grad_norm": 1.7380287647247314, + "learning_rate": 1.5938408023087309e-06, + "loss": 0.9054, + "step": 4021 + }, + { + "epoch": 3.74487895716946, + "grad_norm": 1.7269648313522339, + "learning_rate": 1.59268708396083e-06, + "loss": 0.939, + "step": 4022 + }, + { + "epoch": 3.7458100558659218, + "grad_norm": 1.7250744104385376, + "learning_rate": 1.5915335881131666e-06, + "loss": 0.9365, + "step": 4023 + }, + { + "epoch": 3.7467411545623834, + "grad_norm": 1.7136321067810059, + "learning_rate": 1.590380315048614e-06, + "loss": 0.9138, + "step": 4024 + }, + { + "epoch": 3.7476722532588456, + "grad_norm": 1.7721107006072998, + "learning_rate": 1.5892272650499886e-06, + "loss": 0.9199, + "step": 4025 + }, + { + "epoch": 3.7486033519553073, + "grad_norm": 1.65714430809021, + "learning_rate": 1.5880744384000544e-06, + "loss": 0.9008, + "step": 4026 + }, + { + "epoch": 3.749534450651769, + "grad_norm": 1.722625732421875, + "learning_rate": 1.5869218353815158e-06, + "loss": 0.9126, + "step": 4027 + }, + { + "epoch": 3.750465549348231, + "grad_norm": 1.7553139925003052, + "learning_rate": 1.5857694562770273e-06, + "loss": 0.9111, + "step": 4028 + }, + { + "epoch": 3.7513966480446927, + "grad_norm": 1.7488781213760376, + "learning_rate": 1.5846173013691874e-06, + "loss": 0.921, + "step": 4029 + }, + { + "epoch": 3.7523277467411544, + "grad_norm": 1.72938871383667, + "learning_rate": 1.5834653709405368e-06, + "loss": 0.9112, + "step": 4030 + }, + { + "epoch": 3.7532588454376166, + "grad_norm": 1.7150977849960327, + "learning_rate": 1.582313665273565e-06, + "loss": 0.9133, + "step": 4031 + }, + { + "epoch": 3.7541899441340782, + "grad_norm": 1.7561137676239014, + "learning_rate": 1.581162184650704e-06, + "loss": 0.8987, + "step": 4032 + }, + { + "epoch": 3.75512104283054, + "grad_norm": 1.712423324584961, + "learning_rate": 1.580010929354332e-06, + "loss": 0.9485, + "step": 4033 + }, + { + "epoch": 3.756052141527002, + "grad_norm": 1.7377071380615234, + "learning_rate": 1.5788598996667695e-06, + "loss": 0.9128, + "step": 4034 + }, + { + "epoch": 3.7569832402234637, + "grad_norm": 1.7843185663223267, + "learning_rate": 1.577709095870285e-06, + "loss": 0.9394, + "step": 4035 + }, + { + "epoch": 3.7579143389199254, + "grad_norm": 1.732096791267395, + "learning_rate": 1.57655851824709e-06, + "loss": 0.9155, + "step": 4036 + }, + { + "epoch": 3.7588454376163876, + "grad_norm": 1.7659202814102173, + "learning_rate": 1.5754081670793395e-06, + "loss": 0.9083, + "step": 4037 + }, + { + "epoch": 3.7597765363128492, + "grad_norm": 1.743561029434204, + "learning_rate": 1.5742580426491338e-06, + "loss": 0.8927, + "step": 4038 + }, + { + "epoch": 3.760707635009311, + "grad_norm": 1.7321566343307495, + "learning_rate": 1.5731081452385188e-06, + "loss": 0.9204, + "step": 4039 + }, + { + "epoch": 3.761638733705773, + "grad_norm": 1.7810157537460327, + "learning_rate": 1.5719584751294842e-06, + "loss": 0.9043, + "step": 4040 + }, + { + "epoch": 3.7625698324022347, + "grad_norm": 1.823854684829712, + "learning_rate": 1.570809032603961e-06, + "loss": 0.954, + "step": 4041 + }, + { + "epoch": 3.7635009310986964, + "grad_norm": 1.723378300666809, + "learning_rate": 1.5696598179438293e-06, + "loss": 0.8818, + "step": 4042 + }, + { + "epoch": 3.7644320297951586, + "grad_norm": 1.7377586364746094, + "learning_rate": 1.5685108314309105e-06, + "loss": 0.902, + "step": 4043 + }, + { + "epoch": 3.7653631284916202, + "grad_norm": 1.7482560873031616, + "learning_rate": 1.5673620733469694e-06, + "loss": 0.9164, + "step": 4044 + }, + { + "epoch": 3.766294227188082, + "grad_norm": 1.7988420724868774, + "learning_rate": 1.5662135439737159e-06, + "loss": 0.9426, + "step": 4045 + }, + { + "epoch": 3.7672253258845436, + "grad_norm": 1.6999481916427612, + "learning_rate": 1.5650652435928043e-06, + "loss": 0.9025, + "step": 4046 + }, + { + "epoch": 3.7681564245810057, + "grad_norm": 1.8339227437973022, + "learning_rate": 1.5639171724858327e-06, + "loss": 0.9609, + "step": 4047 + }, + { + "epoch": 3.7690875232774674, + "grad_norm": 1.746368169784546, + "learning_rate": 1.5627693309343413e-06, + "loss": 0.9176, + "step": 4048 + }, + { + "epoch": 3.770018621973929, + "grad_norm": 1.7249951362609863, + "learning_rate": 1.5616217192198151e-06, + "loss": 0.9196, + "step": 4049 + }, + { + "epoch": 3.770949720670391, + "grad_norm": 1.6959333419799805, + "learning_rate": 1.5604743376236847e-06, + "loss": 0.9396, + "step": 4050 + }, + { + "epoch": 3.771880819366853, + "grad_norm": 1.8023675680160522, + "learning_rate": 1.5593271864273198e-06, + "loss": 0.9029, + "step": 4051 + }, + { + "epoch": 3.7728119180633146, + "grad_norm": 1.7501859664916992, + "learning_rate": 1.558180265912037e-06, + "loss": 0.9616, + "step": 4052 + }, + { + "epoch": 3.7737430167597763, + "grad_norm": 1.8011305332183838, + "learning_rate": 1.5570335763590972e-06, + "loss": 0.9092, + "step": 4053 + }, + { + "epoch": 3.7746741154562384, + "grad_norm": 1.827329397201538, + "learning_rate": 1.5558871180497004e-06, + "loss": 0.919, + "step": 4054 + }, + { + "epoch": 3.7756052141527, + "grad_norm": 1.671895980834961, + "learning_rate": 1.5547408912649942e-06, + "loss": 0.9416, + "step": 4055 + }, + { + "epoch": 3.776536312849162, + "grad_norm": 1.7218687534332275, + "learning_rate": 1.5535948962860658e-06, + "loss": 0.9169, + "step": 4056 + }, + { + "epoch": 3.777467411545624, + "grad_norm": 1.7514137029647827, + "learning_rate": 1.5524491333939501e-06, + "loss": 0.9409, + "step": 4057 + }, + { + "epoch": 3.7783985102420856, + "grad_norm": 1.6476129293441772, + "learning_rate": 1.5513036028696204e-06, + "loss": 0.8372, + "step": 4058 + }, + { + "epoch": 3.7793296089385473, + "grad_norm": 1.714404582977295, + "learning_rate": 1.550158304993995e-06, + "loss": 0.9221, + "step": 4059 + }, + { + "epoch": 3.7802607076350094, + "grad_norm": 1.6984009742736816, + "learning_rate": 1.549013240047937e-06, + "loss": 0.9143, + "step": 4060 + }, + { + "epoch": 3.781191806331471, + "grad_norm": 1.6875550746917725, + "learning_rate": 1.5478684083122481e-06, + "loss": 0.887, + "step": 4061 + }, + { + "epoch": 3.782122905027933, + "grad_norm": 1.7351542711257935, + "learning_rate": 1.5467238100676768e-06, + "loss": 0.9187, + "step": 4062 + }, + { + "epoch": 3.783054003724395, + "grad_norm": 1.7538175582885742, + "learning_rate": 1.5455794455949116e-06, + "loss": 0.8947, + "step": 4063 + }, + { + "epoch": 3.7839851024208566, + "grad_norm": 1.7356371879577637, + "learning_rate": 1.544435315174587e-06, + "loss": 0.9072, + "step": 4064 + }, + { + "epoch": 3.7849162011173183, + "grad_norm": 1.7276921272277832, + "learning_rate": 1.5432914190872757e-06, + "loss": 0.8663, + "step": 4065 + }, + { + "epoch": 3.7858472998137804, + "grad_norm": 1.7000435590744019, + "learning_rate": 1.5421477576134966e-06, + "loss": 0.9123, + "step": 4066 + }, + { + "epoch": 3.786778398510242, + "grad_norm": 1.789954662322998, + "learning_rate": 1.5410043310337095e-06, + "loss": 0.9101, + "step": 4067 + }, + { + "epoch": 3.787709497206704, + "grad_norm": 1.7146666049957275, + "learning_rate": 1.5398611396283153e-06, + "loss": 0.9223, + "step": 4068 + }, + { + "epoch": 3.788640595903166, + "grad_norm": 1.7471160888671875, + "learning_rate": 1.5387181836776604e-06, + "loss": 0.9261, + "step": 4069 + }, + { + "epoch": 3.7895716945996276, + "grad_norm": 1.7684365510940552, + "learning_rate": 1.537575463462031e-06, + "loss": 0.9519, + "step": 4070 + }, + { + "epoch": 3.7905027932960893, + "grad_norm": 1.7900761365890503, + "learning_rate": 1.5364329792616577e-06, + "loss": 0.9018, + "step": 4071 + }, + { + "epoch": 3.7914338919925514, + "grad_norm": 1.843151330947876, + "learning_rate": 1.535290731356709e-06, + "loss": 0.9397, + "step": 4072 + }, + { + "epoch": 3.792364990689013, + "grad_norm": 1.6838605403900146, + "learning_rate": 1.5341487200273003e-06, + "loss": 0.881, + "step": 4073 + }, + { + "epoch": 3.793296089385475, + "grad_norm": 1.744537353515625, + "learning_rate": 1.5330069455534868e-06, + "loss": 0.9048, + "step": 4074 + }, + { + "epoch": 3.794227188081937, + "grad_norm": 1.799020528793335, + "learning_rate": 1.5318654082152639e-06, + "loss": 0.9034, + "step": 4075 + }, + { + "epoch": 3.7951582867783986, + "grad_norm": 1.6442123651504517, + "learning_rate": 1.530724108292573e-06, + "loss": 0.8921, + "step": 4076 + }, + { + "epoch": 3.7960893854748603, + "grad_norm": 1.6862736940383911, + "learning_rate": 1.5295830460652938e-06, + "loss": 0.9082, + "step": 4077 + }, + { + "epoch": 3.7970204841713224, + "grad_norm": 1.7750946283340454, + "learning_rate": 1.5284422218132495e-06, + "loss": 0.8917, + "step": 4078 + }, + { + "epoch": 3.797951582867784, + "grad_norm": 1.7139170169830322, + "learning_rate": 1.527301635816203e-06, + "loss": 0.9092, + "step": 4079 + }, + { + "epoch": 3.798882681564246, + "grad_norm": 1.7798914909362793, + "learning_rate": 1.526161288353861e-06, + "loss": 0.916, + "step": 4080 + }, + { + "epoch": 3.7998137802607075, + "grad_norm": 1.8249850273132324, + "learning_rate": 1.5250211797058712e-06, + "loss": 0.9659, + "step": 4081 + }, + { + "epoch": 3.8007448789571696, + "grad_norm": 1.728442668914795, + "learning_rate": 1.5238813101518208e-06, + "loss": 0.8815, + "step": 4082 + }, + { + "epoch": 3.8016759776536313, + "grad_norm": 1.6928693056106567, + "learning_rate": 1.5227416799712414e-06, + "loss": 0.8964, + "step": 4083 + }, + { + "epoch": 3.802607076350093, + "grad_norm": 1.705544114112854, + "learning_rate": 1.5216022894436044e-06, + "loss": 0.9163, + "step": 4084 + }, + { + "epoch": 3.8035381750465547, + "grad_norm": 1.8003267049789429, + "learning_rate": 1.5204631388483213e-06, + "loss": 0.9213, + "step": 4085 + }, + { + "epoch": 3.804469273743017, + "grad_norm": 1.760697364807129, + "learning_rate": 1.5193242284647458e-06, + "loss": 0.8991, + "step": 4086 + }, + { + "epoch": 3.8054003724394785, + "grad_norm": 1.7088054418563843, + "learning_rate": 1.5181855585721738e-06, + "loss": 0.8996, + "step": 4087 + }, + { + "epoch": 3.80633147113594, + "grad_norm": 1.6870173215866089, + "learning_rate": 1.5170471294498412e-06, + "loss": 0.9147, + "step": 4088 + }, + { + "epoch": 3.8072625698324023, + "grad_norm": 1.7374681234359741, + "learning_rate": 1.515908941376924e-06, + "loss": 0.9506, + "step": 4089 + }, + { + "epoch": 3.808193668528864, + "grad_norm": 1.6433149576187134, + "learning_rate": 1.5147709946325395e-06, + "loss": 0.8907, + "step": 4090 + }, + { + "epoch": 3.8091247672253257, + "grad_norm": 1.762100338935852, + "learning_rate": 1.5136332894957484e-06, + "loss": 0.8929, + "step": 4091 + }, + { + "epoch": 3.810055865921788, + "grad_norm": 1.7047054767608643, + "learning_rate": 1.5124958262455477e-06, + "loss": 0.8679, + "step": 4092 + }, + { + "epoch": 3.8109869646182495, + "grad_norm": 1.7583643198013306, + "learning_rate": 1.5113586051608782e-06, + "loss": 0.9434, + "step": 4093 + }, + { + "epoch": 3.811918063314711, + "grad_norm": 1.73088538646698, + "learning_rate": 1.5102216265206208e-06, + "loss": 0.9231, + "step": 4094 + }, + { + "epoch": 3.8128491620111733, + "grad_norm": 1.7437982559204102, + "learning_rate": 1.509084890603597e-06, + "loss": 0.9224, + "step": 4095 + }, + { + "epoch": 3.813780260707635, + "grad_norm": 1.7670544385910034, + "learning_rate": 1.5079483976885672e-06, + "loss": 0.9434, + "step": 4096 + }, + { + "epoch": 3.8147113594040967, + "grad_norm": 1.770229458808899, + "learning_rate": 1.5068121480542335e-06, + "loss": 0.8782, + "step": 4097 + }, + { + "epoch": 3.815642458100559, + "grad_norm": 1.7599060535430908, + "learning_rate": 1.5056761419792404e-06, + "loss": 0.9072, + "step": 4098 + }, + { + "epoch": 3.8165735567970205, + "grad_norm": 1.739033818244934, + "learning_rate": 1.5045403797421681e-06, + "loss": 0.9472, + "step": 4099 + }, + { + "epoch": 3.817504655493482, + "grad_norm": 1.755418062210083, + "learning_rate": 1.50340486162154e-06, + "loss": 0.8881, + "step": 4100 + }, + { + "epoch": 3.8184357541899443, + "grad_norm": 2.0345919132232666, + "learning_rate": 1.5022695878958194e-06, + "loss": 0.9399, + "step": 4101 + }, + { + "epoch": 3.819366852886406, + "grad_norm": 1.6922738552093506, + "learning_rate": 1.50113455884341e-06, + "loss": 0.8856, + "step": 4102 + }, + { + "epoch": 3.8202979515828677, + "grad_norm": 1.6994667053222656, + "learning_rate": 1.4999997747426538e-06, + "loss": 0.9008, + "step": 4103 + }, + { + "epoch": 3.82122905027933, + "grad_norm": 1.7431222200393677, + "learning_rate": 1.4988652358718336e-06, + "loss": 0.8982, + "step": 4104 + }, + { + "epoch": 3.8221601489757915, + "grad_norm": 1.7737343311309814, + "learning_rate": 1.4977309425091742e-06, + "loss": 0.9293, + "step": 4105 + }, + { + "epoch": 3.823091247672253, + "grad_norm": 1.700769305229187, + "learning_rate": 1.4965968949328352e-06, + "loss": 0.9152, + "step": 4106 + }, + { + "epoch": 3.8240223463687153, + "grad_norm": 1.7748693227767944, + "learning_rate": 1.4954630934209213e-06, + "loss": 0.9649, + "step": 4107 + }, + { + "epoch": 3.824953445065177, + "grad_norm": 1.7902367115020752, + "learning_rate": 1.494329538251473e-06, + "loss": 0.932, + "step": 4108 + }, + { + "epoch": 3.8258845437616387, + "grad_norm": 1.692631721496582, + "learning_rate": 1.4931962297024738e-06, + "loss": 0.8596, + "step": 4109 + }, + { + "epoch": 3.826815642458101, + "grad_norm": 1.7029507160186768, + "learning_rate": 1.4920631680518432e-06, + "loss": 0.9208, + "step": 4110 + }, + { + "epoch": 3.8277467411545625, + "grad_norm": 1.6942970752716064, + "learning_rate": 1.4909303535774421e-06, + "loss": 0.9032, + "step": 4111 + }, + { + "epoch": 3.828677839851024, + "grad_norm": 1.6594653129577637, + "learning_rate": 1.4897977865570713e-06, + "loss": 0.8836, + "step": 4112 + }, + { + "epoch": 3.8296089385474863, + "grad_norm": 1.7085827589035034, + "learning_rate": 1.488665467268468e-06, + "loss": 0.9315, + "step": 4113 + }, + { + "epoch": 3.830540037243948, + "grad_norm": 1.746665596961975, + "learning_rate": 1.487533395989313e-06, + "loss": 0.9154, + "step": 4114 + }, + { + "epoch": 3.8314711359404097, + "grad_norm": 1.7758121490478516, + "learning_rate": 1.4864015729972232e-06, + "loss": 0.9339, + "step": 4115 + }, + { + "epoch": 3.8324022346368714, + "grad_norm": 1.7491447925567627, + "learning_rate": 1.4852699985697546e-06, + "loss": 0.9125, + "step": 4116 + }, + { + "epoch": 3.8333333333333335, + "grad_norm": 1.7201588153839111, + "learning_rate": 1.4841386729844043e-06, + "loss": 0.8898, + "step": 4117 + }, + { + "epoch": 3.834264432029795, + "grad_norm": 1.71219003200531, + "learning_rate": 1.4830075965186064e-06, + "loss": 0.8557, + "step": 4118 + }, + { + "epoch": 3.835195530726257, + "grad_norm": 1.758597493171692, + "learning_rate": 1.4818767694497354e-06, + "loss": 0.925, + "step": 4119 + }, + { + "epoch": 3.8361266294227185, + "grad_norm": 1.7303849458694458, + "learning_rate": 1.4807461920551028e-06, + "loss": 0.9076, + "step": 4120 + }, + { + "epoch": 3.8370577281191807, + "grad_norm": 1.7226804494857788, + "learning_rate": 1.4796158646119607e-06, + "loss": 0.9126, + "step": 4121 + }, + { + "epoch": 3.8379888268156424, + "grad_norm": 1.7260617017745972, + "learning_rate": 1.4784857873974996e-06, + "loss": 0.8967, + "step": 4122 + }, + { + "epoch": 3.838919925512104, + "grad_norm": 1.6722670793533325, + "learning_rate": 1.477355960688847e-06, + "loss": 0.868, + "step": 4123 + }, + { + "epoch": 3.839851024208566, + "grad_norm": 1.7288227081298828, + "learning_rate": 1.4762263847630701e-06, + "loss": 0.9071, + "step": 4124 + }, + { + "epoch": 3.840782122905028, + "grad_norm": 1.7335927486419678, + "learning_rate": 1.475097059897176e-06, + "loss": 0.9051, + "step": 4125 + }, + { + "epoch": 3.8417132216014895, + "grad_norm": 1.8620281219482422, + "learning_rate": 1.4739679863681086e-06, + "loss": 0.9595, + "step": 4126 + }, + { + "epoch": 3.8426443202979517, + "grad_norm": 1.7311451435089111, + "learning_rate": 1.4728391644527494e-06, + "loss": 0.9315, + "step": 4127 + }, + { + "epoch": 3.8435754189944134, + "grad_norm": 1.7223219871520996, + "learning_rate": 1.4717105944279201e-06, + "loss": 0.9291, + "step": 4128 + }, + { + "epoch": 3.844506517690875, + "grad_norm": 1.8028019666671753, + "learning_rate": 1.4705822765703804e-06, + "loss": 0.9195, + "step": 4129 + }, + { + "epoch": 3.845437616387337, + "grad_norm": 1.7164208889007568, + "learning_rate": 1.4694542111568261e-06, + "loss": 0.9082, + "step": 4130 + }, + { + "epoch": 3.846368715083799, + "grad_norm": 1.698807716369629, + "learning_rate": 1.4683263984638929e-06, + "loss": 0.915, + "step": 4131 + }, + { + "epoch": 3.8472998137802605, + "grad_norm": 1.7113651037216187, + "learning_rate": 1.4671988387681548e-06, + "loss": 0.9033, + "step": 4132 + }, + { + "epoch": 3.8482309124767227, + "grad_norm": 1.8233782052993774, + "learning_rate": 1.4660715323461238e-06, + "loss": 0.9269, + "step": 4133 + }, + { + "epoch": 3.8491620111731844, + "grad_norm": 1.7255011796951294, + "learning_rate": 1.4649444794742474e-06, + "loss": 0.9298, + "step": 4134 + }, + { + "epoch": 3.850093109869646, + "grad_norm": 1.7078388929367065, + "learning_rate": 1.4638176804289128e-06, + "loss": 0.9132, + "step": 4135 + }, + { + "epoch": 3.851024208566108, + "grad_norm": 1.7611697912216187, + "learning_rate": 1.4626911354864465e-06, + "loss": 0.9237, + "step": 4136 + }, + { + "epoch": 3.85195530726257, + "grad_norm": 1.7190121412277222, + "learning_rate": 1.4615648449231095e-06, + "loss": 0.9199, + "step": 4137 + }, + { + "epoch": 3.8528864059590315, + "grad_norm": 1.7358964681625366, + "learning_rate": 1.4604388090151016e-06, + "loss": 0.9289, + "step": 4138 + }, + { + "epoch": 3.8538175046554937, + "grad_norm": 1.7040410041809082, + "learning_rate": 1.4593130280385614e-06, + "loss": 0.9017, + "step": 4139 + }, + { + "epoch": 3.8547486033519553, + "grad_norm": 1.7929447889328003, + "learning_rate": 1.4581875022695655e-06, + "loss": 0.9467, + "step": 4140 + }, + { + "epoch": 3.855679702048417, + "grad_norm": 1.7423875331878662, + "learning_rate": 1.4570622319841232e-06, + "loss": 0.9309, + "step": 4141 + }, + { + "epoch": 3.856610800744879, + "grad_norm": 1.756166696548462, + "learning_rate": 1.4559372174581865e-06, + "loss": 0.9572, + "step": 4142 + }, + { + "epoch": 3.857541899441341, + "grad_norm": 1.727579116821289, + "learning_rate": 1.4548124589676417e-06, + "loss": 0.9594, + "step": 4143 + }, + { + "epoch": 3.8584729981378025, + "grad_norm": 1.7996478080749512, + "learning_rate": 1.4536879567883133e-06, + "loss": 0.9497, + "step": 4144 + }, + { + "epoch": 3.8594040968342647, + "grad_norm": 1.7171953916549683, + "learning_rate": 1.4525637111959634e-06, + "loss": 0.8674, + "step": 4145 + }, + { + "epoch": 3.8603351955307263, + "grad_norm": 1.745754599571228, + "learning_rate": 1.4514397224662902e-06, + "loss": 0.9116, + "step": 4146 + }, + { + "epoch": 3.861266294227188, + "grad_norm": 1.7001436948776245, + "learning_rate": 1.4503159908749292e-06, + "loss": 0.8944, + "step": 4147 + }, + { + "epoch": 3.86219739292365, + "grad_norm": 1.8127472400665283, + "learning_rate": 1.4491925166974533e-06, + "loss": 0.9166, + "step": 4148 + }, + { + "epoch": 3.863128491620112, + "grad_norm": 1.7718563079833984, + "learning_rate": 1.4480693002093715e-06, + "loss": 0.916, + "step": 4149 + }, + { + "epoch": 3.8640595903165735, + "grad_norm": 1.7531687021255493, + "learning_rate": 1.4469463416861307e-06, + "loss": 0.9367, + "step": 4150 + }, + { + "epoch": 3.864990689013035, + "grad_norm": 1.7965582609176636, + "learning_rate": 1.4458236414031134e-06, + "loss": 0.9461, + "step": 4151 + }, + { + "epoch": 3.8659217877094973, + "grad_norm": 1.7462981939315796, + "learning_rate": 1.444701199635639e-06, + "loss": 0.8741, + "step": 4152 + }, + { + "epoch": 3.866852886405959, + "grad_norm": 1.7626858949661255, + "learning_rate": 1.4435790166589647e-06, + "loss": 0.8978, + "step": 4153 + }, + { + "epoch": 3.8677839851024207, + "grad_norm": 1.7848018407821655, + "learning_rate": 1.4424570927482826e-06, + "loss": 0.9353, + "step": 4154 + }, + { + "epoch": 3.868715083798883, + "grad_norm": 1.6567825078964233, + "learning_rate": 1.4413354281787217e-06, + "loss": 0.8812, + "step": 4155 + }, + { + "epoch": 3.8696461824953445, + "grad_norm": 1.779465675354004, + "learning_rate": 1.4402140232253486e-06, + "loss": 0.9216, + "step": 4156 + }, + { + "epoch": 3.870577281191806, + "grad_norm": 1.738989233970642, + "learning_rate": 1.4390928781631647e-06, + "loss": 0.9017, + "step": 4157 + }, + { + "epoch": 3.871508379888268, + "grad_norm": 1.7417113780975342, + "learning_rate": 1.437971993267108e-06, + "loss": 0.8857, + "step": 4158 + }, + { + "epoch": 3.87243947858473, + "grad_norm": 1.738136649131775, + "learning_rate": 1.4368513688120534e-06, + "loss": 0.9133, + "step": 4159 + }, + { + "epoch": 3.8733705772811917, + "grad_norm": 1.8069320917129517, + "learning_rate": 1.4357310050728116e-06, + "loss": 0.9184, + "step": 4160 + }, + { + "epoch": 3.8743016759776534, + "grad_norm": 1.7768930196762085, + "learning_rate": 1.434610902324129e-06, + "loss": 0.9325, + "step": 4161 + }, + { + "epoch": 3.8752327746741155, + "grad_norm": 1.7043150663375854, + "learning_rate": 1.4334910608406881e-06, + "loss": 0.917, + "step": 4162 + }, + { + "epoch": 3.876163873370577, + "grad_norm": 1.7589137554168701, + "learning_rate": 1.4323714808971078e-06, + "loss": 0.9163, + "step": 4163 + }, + { + "epoch": 3.877094972067039, + "grad_norm": 1.8015676736831665, + "learning_rate": 1.431252162767943e-06, + "loss": 0.8728, + "step": 4164 + }, + { + "epoch": 3.878026070763501, + "grad_norm": 1.7375624179840088, + "learning_rate": 1.430133106727683e-06, + "loss": 0.8913, + "step": 4165 + }, + { + "epoch": 3.8789571694599627, + "grad_norm": 1.756018042564392, + "learning_rate": 1.4290143130507544e-06, + "loss": 0.9051, + "step": 4166 + }, + { + "epoch": 3.8798882681564244, + "grad_norm": 1.6689399480819702, + "learning_rate": 1.4278957820115187e-06, + "loss": 0.8907, + "step": 4167 + }, + { + "epoch": 3.8808193668528865, + "grad_norm": 1.6374313831329346, + "learning_rate": 1.4267775138842726e-06, + "loss": 0.8721, + "step": 4168 + }, + { + "epoch": 3.881750465549348, + "grad_norm": 1.7690523862838745, + "learning_rate": 1.4256595089432502e-06, + "loss": 0.9368, + "step": 4169 + }, + { + "epoch": 3.88268156424581, + "grad_norm": 1.7883734703063965, + "learning_rate": 1.4245417674626183e-06, + "loss": 0.9404, + "step": 4170 + }, + { + "epoch": 3.883612662942272, + "grad_norm": 1.8173571825027466, + "learning_rate": 1.4234242897164814e-06, + "loss": 0.9077, + "step": 4171 + }, + { + "epoch": 3.8845437616387337, + "grad_norm": 1.8200242519378662, + "learning_rate": 1.4223070759788777e-06, + "loss": 0.9186, + "step": 4172 + }, + { + "epoch": 3.8854748603351954, + "grad_norm": 1.7700036764144897, + "learning_rate": 1.4211901265237821e-06, + "loss": 0.904, + "step": 4173 + }, + { + "epoch": 3.8864059590316575, + "grad_norm": 1.8249975442886353, + "learning_rate": 1.4200734416251047e-06, + "loss": 0.8984, + "step": 4174 + }, + { + "epoch": 3.887337057728119, + "grad_norm": 1.7791286706924438, + "learning_rate": 1.418957021556687e-06, + "loss": 0.9425, + "step": 4175 + }, + { + "epoch": 3.888268156424581, + "grad_norm": 1.8396724462509155, + "learning_rate": 1.4178408665923115e-06, + "loss": 0.9386, + "step": 4176 + }, + { + "epoch": 3.889199255121043, + "grad_norm": 1.7584431171417236, + "learning_rate": 1.4167249770056918e-06, + "loss": 0.8921, + "step": 4177 + }, + { + "epoch": 3.8901303538175047, + "grad_norm": 1.7347151041030884, + "learning_rate": 1.4156093530704774e-06, + "loss": 0.8947, + "step": 4178 + }, + { + "epoch": 3.8910614525139664, + "grad_norm": 1.752658724784851, + "learning_rate": 1.4144939950602527e-06, + "loss": 0.9384, + "step": 4179 + }, + { + "epoch": 3.8919925512104285, + "grad_norm": 1.7194240093231201, + "learning_rate": 1.4133789032485367e-06, + "loss": 0.8973, + "step": 4180 + }, + { + "epoch": 3.89292364990689, + "grad_norm": 1.7423033714294434, + "learning_rate": 1.4122640779087842e-06, + "loss": 0.9023, + "step": 4181 + }, + { + "epoch": 3.893854748603352, + "grad_norm": 1.7129592895507812, + "learning_rate": 1.411149519314381e-06, + "loss": 0.899, + "step": 4182 + }, + { + "epoch": 3.894785847299814, + "grad_norm": 1.8546561002731323, + "learning_rate": 1.4100352277386526e-06, + "loss": 0.9093, + "step": 4183 + }, + { + "epoch": 3.8957169459962757, + "grad_norm": 1.7067517042160034, + "learning_rate": 1.4089212034548572e-06, + "loss": 0.8734, + "step": 4184 + }, + { + "epoch": 3.8966480446927374, + "grad_norm": 1.7312023639678955, + "learning_rate": 1.407807446736184e-06, + "loss": 0.8872, + "step": 4185 + }, + { + "epoch": 3.8975791433891995, + "grad_norm": 1.7390625476837158, + "learning_rate": 1.4066939578557604e-06, + "loss": 0.9156, + "step": 4186 + }, + { + "epoch": 3.898510242085661, + "grad_norm": 1.6948963403701782, + "learning_rate": 1.4055807370866488e-06, + "loss": 0.9313, + "step": 4187 + }, + { + "epoch": 3.899441340782123, + "grad_norm": 1.7991451025009155, + "learning_rate": 1.404467784701844e-06, + "loss": 0.9527, + "step": 4188 + }, + { + "epoch": 3.9003724394785846, + "grad_norm": 1.7324836254119873, + "learning_rate": 1.403355100974272e-06, + "loss": 0.8844, + "step": 4189 + }, + { + "epoch": 3.9013035381750467, + "grad_norm": 1.7347400188446045, + "learning_rate": 1.4022426861767999e-06, + "loss": 0.8792, + "step": 4190 + }, + { + "epoch": 3.9022346368715084, + "grad_norm": 1.750918984413147, + "learning_rate": 1.4011305405822242e-06, + "loss": 0.9356, + "step": 4191 + }, + { + "epoch": 3.90316573556797, + "grad_norm": 1.7328245639801025, + "learning_rate": 1.4000186644632746e-06, + "loss": 0.8797, + "step": 4192 + }, + { + "epoch": 3.9040968342644318, + "grad_norm": 1.769477367401123, + "learning_rate": 1.3989070580926167e-06, + "loss": 0.9287, + "step": 4193 + }, + { + "epoch": 3.905027932960894, + "grad_norm": 1.7599049806594849, + "learning_rate": 1.3977957217428507e-06, + "loss": 0.9192, + "step": 4194 + }, + { + "epoch": 3.9059590316573556, + "grad_norm": 1.7609810829162598, + "learning_rate": 1.3966846556865105e-06, + "loss": 0.9351, + "step": 4195 + }, + { + "epoch": 3.9068901303538173, + "grad_norm": 1.7171039581298828, + "learning_rate": 1.395573860196059e-06, + "loss": 0.8922, + "step": 4196 + }, + { + "epoch": 3.9078212290502794, + "grad_norm": 1.691245675086975, + "learning_rate": 1.3944633355438994e-06, + "loss": 0.9075, + "step": 4197 + }, + { + "epoch": 3.908752327746741, + "grad_norm": 1.7755759954452515, + "learning_rate": 1.3933530820023661e-06, + "loss": 0.8995, + "step": 4198 + }, + { + "epoch": 3.9096834264432028, + "grad_norm": 1.766641616821289, + "learning_rate": 1.392243099843724e-06, + "loss": 0.9629, + "step": 4199 + }, + { + "epoch": 3.910614525139665, + "grad_norm": 1.7448575496673584, + "learning_rate": 1.3911333893401742e-06, + "loss": 0.9003, + "step": 4200 + }, + { + "epoch": 3.9115456238361266, + "grad_norm": 1.7452454566955566, + "learning_rate": 1.3900239507638525e-06, + "loss": 0.9341, + "step": 4201 + }, + { + "epoch": 3.9124767225325883, + "grad_norm": 1.7095787525177002, + "learning_rate": 1.3889147843868264e-06, + "loss": 0.8974, + "step": 4202 + }, + { + "epoch": 3.9134078212290504, + "grad_norm": 1.7270052433013916, + "learning_rate": 1.387805890481095e-06, + "loss": 0.9203, + "step": 4203 + }, + { + "epoch": 3.914338919925512, + "grad_norm": 1.765599250793457, + "learning_rate": 1.3866972693185921e-06, + "loss": 0.8957, + "step": 4204 + }, + { + "epoch": 3.9152700186219738, + "grad_norm": 1.7474619150161743, + "learning_rate": 1.3855889211711875e-06, + "loss": 0.9101, + "step": 4205 + }, + { + "epoch": 3.916201117318436, + "grad_norm": 1.8109654188156128, + "learning_rate": 1.3844808463106788e-06, + "loss": 0.9356, + "step": 4206 + }, + { + "epoch": 3.9171322160148976, + "grad_norm": 1.6738662719726562, + "learning_rate": 1.3833730450087985e-06, + "loss": 0.8954, + "step": 4207 + }, + { + "epoch": 3.9180633147113593, + "grad_norm": 1.7239640951156616, + "learning_rate": 1.3822655175372148e-06, + "loss": 0.8762, + "step": 4208 + }, + { + "epoch": 3.9189944134078214, + "grad_norm": 1.6647493839263916, + "learning_rate": 1.3811582641675266e-06, + "loss": 0.8794, + "step": 4209 + }, + { + "epoch": 3.919925512104283, + "grad_norm": 1.7524518966674805, + "learning_rate": 1.3800512851712636e-06, + "loss": 0.8951, + "step": 4210 + }, + { + "epoch": 3.9208566108007448, + "grad_norm": 1.7558668851852417, + "learning_rate": 1.3789445808198898e-06, + "loss": 0.9081, + "step": 4211 + }, + { + "epoch": 3.921787709497207, + "grad_norm": 1.8038939237594604, + "learning_rate": 1.3778381513848056e-06, + "loss": 0.8974, + "step": 4212 + }, + { + "epoch": 3.9227188081936686, + "grad_norm": 1.7405784130096436, + "learning_rate": 1.3767319971373369e-06, + "loss": 0.9448, + "step": 4213 + }, + { + "epoch": 3.9236499068901303, + "grad_norm": 1.757226824760437, + "learning_rate": 1.3756261183487473e-06, + "loss": 0.9157, + "step": 4214 + }, + { + "epoch": 3.9245810055865924, + "grad_norm": 1.6415494680404663, + "learning_rate": 1.3745205152902313e-06, + "loss": 0.8923, + "step": 4215 + }, + { + "epoch": 3.925512104283054, + "grad_norm": 1.7430733442306519, + "learning_rate": 1.3734151882329157e-06, + "loss": 0.8992, + "step": 4216 + }, + { + "epoch": 3.9264432029795158, + "grad_norm": 1.680516004562378, + "learning_rate": 1.3723101374478598e-06, + "loss": 0.9135, + "step": 4217 + }, + { + "epoch": 3.927374301675978, + "grad_norm": 1.7036300897598267, + "learning_rate": 1.371205363206054e-06, + "loss": 0.8859, + "step": 4218 + }, + { + "epoch": 3.9283054003724396, + "grad_norm": 1.7933826446533203, + "learning_rate": 1.370100865778425e-06, + "loss": 0.8925, + "step": 4219 + }, + { + "epoch": 3.9292364990689013, + "grad_norm": 1.6897965669631958, + "learning_rate": 1.3689966454358255e-06, + "loss": 0.8869, + "step": 4220 + }, + { + "epoch": 3.9301675977653634, + "grad_norm": 1.769063949584961, + "learning_rate": 1.3678927024490446e-06, + "loss": 0.9002, + "step": 4221 + }, + { + "epoch": 3.931098696461825, + "grad_norm": 1.729137659072876, + "learning_rate": 1.3667890370888016e-06, + "loss": 0.8762, + "step": 4222 + }, + { + "epoch": 3.9320297951582868, + "grad_norm": 1.7462197542190552, + "learning_rate": 1.3656856496257486e-06, + "loss": 0.909, + "step": 4223 + }, + { + "epoch": 3.9329608938547485, + "grad_norm": 1.7002540826797485, + "learning_rate": 1.364582540330469e-06, + "loss": 0.8908, + "step": 4224 + }, + { + "epoch": 3.9338919925512106, + "grad_norm": 1.711059331893921, + "learning_rate": 1.3634797094734776e-06, + "loss": 0.8971, + "step": 4225 + }, + { + "epoch": 3.9348230912476723, + "grad_norm": 1.7691084146499634, + "learning_rate": 1.3623771573252237e-06, + "loss": 0.942, + "step": 4226 + }, + { + "epoch": 3.935754189944134, + "grad_norm": 1.795163631439209, + "learning_rate": 1.3612748841560835e-06, + "loss": 0.9528, + "step": 4227 + }, + { + "epoch": 3.9366852886405956, + "grad_norm": 1.7376372814178467, + "learning_rate": 1.3601728902363682e-06, + "loss": 0.9344, + "step": 4228 + }, + { + "epoch": 3.9376163873370578, + "grad_norm": 1.747847080230713, + "learning_rate": 1.35907117583632e-06, + "loss": 0.91, + "step": 4229 + }, + { + "epoch": 3.9385474860335195, + "grad_norm": 1.7352440357208252, + "learning_rate": 1.3579697412261116e-06, + "loss": 0.9061, + "step": 4230 + }, + { + "epoch": 3.939478584729981, + "grad_norm": 1.7595494985580444, + "learning_rate": 1.3568685866758483e-06, + "loss": 0.9509, + "step": 4231 + }, + { + "epoch": 3.9404096834264433, + "grad_norm": 1.70807945728302, + "learning_rate": 1.3557677124555656e-06, + "loss": 0.898, + "step": 4232 + }, + { + "epoch": 3.941340782122905, + "grad_norm": 1.8331208229064941, + "learning_rate": 1.354667118835231e-06, + "loss": 0.9125, + "step": 4233 + }, + { + "epoch": 3.9422718808193666, + "grad_norm": 1.8426227569580078, + "learning_rate": 1.3535668060847428e-06, + "loss": 0.9327, + "step": 4234 + }, + { + "epoch": 3.9432029795158288, + "grad_norm": 1.745550274848938, + "learning_rate": 1.3524667744739305e-06, + "loss": 0.9143, + "step": 4235 + }, + { + "epoch": 3.9441340782122905, + "grad_norm": 1.727669596672058, + "learning_rate": 1.3513670242725552e-06, + "loss": 0.8857, + "step": 4236 + }, + { + "epoch": 3.945065176908752, + "grad_norm": 1.639237642288208, + "learning_rate": 1.350267555750308e-06, + "loss": 0.8758, + "step": 4237 + }, + { + "epoch": 3.9459962756052143, + "grad_norm": 1.7214468717575073, + "learning_rate": 1.3491683691768118e-06, + "loss": 0.9018, + "step": 4238 + }, + { + "epoch": 3.946927374301676, + "grad_norm": 1.7821786403656006, + "learning_rate": 1.3480694648216197e-06, + "loss": 0.8893, + "step": 4239 + }, + { + "epoch": 3.9478584729981376, + "grad_norm": 1.7453091144561768, + "learning_rate": 1.3469708429542157e-06, + "loss": 0.878, + "step": 4240 + }, + { + "epoch": 3.9487895716945998, + "grad_norm": 1.7219913005828857, + "learning_rate": 1.3458725038440154e-06, + "loss": 0.8881, + "step": 4241 + }, + { + "epoch": 3.9497206703910615, + "grad_norm": 1.684158444404602, + "learning_rate": 1.3447744477603639e-06, + "loss": 0.8995, + "step": 4242 + }, + { + "epoch": 3.950651769087523, + "grad_norm": 1.7291629314422607, + "learning_rate": 1.3436766749725372e-06, + "loss": 0.8966, + "step": 4243 + }, + { + "epoch": 3.9515828677839853, + "grad_norm": 1.786635160446167, + "learning_rate": 1.3425791857497422e-06, + "loss": 0.9089, + "step": 4244 + }, + { + "epoch": 3.952513966480447, + "grad_norm": 1.7477847337722778, + "learning_rate": 1.3414819803611165e-06, + "loss": 0.9289, + "step": 4245 + }, + { + "epoch": 3.9534450651769086, + "grad_norm": 1.695330023765564, + "learning_rate": 1.3403850590757267e-06, + "loss": 0.9412, + "step": 4246 + }, + { + "epoch": 3.9543761638733708, + "grad_norm": 1.7521381378173828, + "learning_rate": 1.3392884221625718e-06, + "loss": 0.9098, + "step": 4247 + }, + { + "epoch": 3.9553072625698324, + "grad_norm": 1.6534643173217773, + "learning_rate": 1.3381920698905788e-06, + "loss": 0.8957, + "step": 4248 + }, + { + "epoch": 3.956238361266294, + "grad_norm": 1.7063037157058716, + "learning_rate": 1.3370960025286068e-06, + "loss": 0.926, + "step": 4249 + }, + { + "epoch": 3.9571694599627563, + "grad_norm": 1.7220642566680908, + "learning_rate": 1.3360002203454441e-06, + "loss": 0.9278, + "step": 4250 + }, + { + "epoch": 3.958100558659218, + "grad_norm": 1.7003141641616821, + "learning_rate": 1.3349047236098089e-06, + "loss": 0.8983, + "step": 4251 + }, + { + "epoch": 3.9590316573556796, + "grad_norm": 1.6883808374404907, + "learning_rate": 1.3338095125903504e-06, + "loss": 0.9157, + "step": 4252 + }, + { + "epoch": 3.9599627560521418, + "grad_norm": 1.7049285173416138, + "learning_rate": 1.3327145875556475e-06, + "loss": 0.8966, + "step": 4253 + }, + { + "epoch": 3.9608938547486034, + "grad_norm": 1.7231625318527222, + "learning_rate": 1.3316199487742057e-06, + "loss": 0.927, + "step": 4254 + }, + { + "epoch": 3.961824953445065, + "grad_norm": 1.7194812297821045, + "learning_rate": 1.330525596514466e-06, + "loss": 0.894, + "step": 4255 + }, + { + "epoch": 3.9627560521415273, + "grad_norm": 1.7682898044586182, + "learning_rate": 1.3294315310447958e-06, + "loss": 0.9238, + "step": 4256 + }, + { + "epoch": 3.963687150837989, + "grad_norm": 1.6532021760940552, + "learning_rate": 1.3283377526334921e-06, + "loss": 0.8631, + "step": 4257 + }, + { + "epoch": 3.9646182495344506, + "grad_norm": 1.7085129022598267, + "learning_rate": 1.3272442615487822e-06, + "loss": 0.9009, + "step": 4258 + }, + { + "epoch": 3.9655493482309123, + "grad_norm": 1.7306479215621948, + "learning_rate": 1.3261510580588227e-06, + "loss": 0.9284, + "step": 4259 + }, + { + "epoch": 3.9664804469273744, + "grad_norm": 1.719754934310913, + "learning_rate": 1.3250581424317012e-06, + "loss": 0.902, + "step": 4260 + }, + { + "epoch": 3.967411545623836, + "grad_norm": 1.7374979257583618, + "learning_rate": 1.3239655149354297e-06, + "loss": 0.9353, + "step": 4261 + }, + { + "epoch": 3.968342644320298, + "grad_norm": 1.6877343654632568, + "learning_rate": 1.3228731758379562e-06, + "loss": 0.908, + "step": 4262 + }, + { + "epoch": 3.9692737430167595, + "grad_norm": 1.689611554145813, + "learning_rate": 1.3217811254071544e-06, + "loss": 0.886, + "step": 4263 + }, + { + "epoch": 3.9702048417132216, + "grad_norm": 1.7651277780532837, + "learning_rate": 1.320689363910827e-06, + "loss": 0.8875, + "step": 4264 + }, + { + "epoch": 3.9711359404096833, + "grad_norm": 1.7377010583877563, + "learning_rate": 1.319597891616707e-06, + "loss": 0.8883, + "step": 4265 + }, + { + "epoch": 3.972067039106145, + "grad_norm": 1.7432407140731812, + "learning_rate": 1.318506708792456e-06, + "loss": 0.8782, + "step": 4266 + }, + { + "epoch": 3.972998137802607, + "grad_norm": 1.7551065683364868, + "learning_rate": 1.3174158157056654e-06, + "loss": 0.898, + "step": 4267 + }, + { + "epoch": 3.973929236499069, + "grad_norm": 1.7748680114746094, + "learning_rate": 1.3163252126238524e-06, + "loss": 0.9389, + "step": 4268 + }, + { + "epoch": 3.9748603351955305, + "grad_norm": 1.691055178642273, + "learning_rate": 1.3152348998144677e-06, + "loss": 0.8876, + "step": 4269 + }, + { + "epoch": 3.9757914338919926, + "grad_norm": 1.7495986223220825, + "learning_rate": 1.3141448775448875e-06, + "loss": 0.885, + "step": 4270 + }, + { + "epoch": 3.9767225325884543, + "grad_norm": 1.6924501657485962, + "learning_rate": 1.3130551460824196e-06, + "loss": 0.9332, + "step": 4271 + }, + { + "epoch": 3.977653631284916, + "grad_norm": 1.6970319747924805, + "learning_rate": 1.3119657056942952e-06, + "loss": 0.8884, + "step": 4272 + }, + { + "epoch": 3.978584729981378, + "grad_norm": 1.7221189737319946, + "learning_rate": 1.3108765566476805e-06, + "loss": 0.8864, + "step": 4273 + }, + { + "epoch": 3.97951582867784, + "grad_norm": 1.7274166345596313, + "learning_rate": 1.309787699209668e-06, + "loss": 0.9295, + "step": 4274 + }, + { + "epoch": 3.9804469273743015, + "grad_norm": 1.6894820928573608, + "learning_rate": 1.3086991336472748e-06, + "loss": 0.8784, + "step": 4275 + }, + { + "epoch": 3.9813780260707636, + "grad_norm": 1.782052755355835, + "learning_rate": 1.3076108602274523e-06, + "loss": 0.8826, + "step": 4276 + }, + { + "epoch": 3.9823091247672253, + "grad_norm": 1.732740879058838, + "learning_rate": 1.3065228792170772e-06, + "loss": 0.9184, + "step": 4277 + }, + { + "epoch": 3.983240223463687, + "grad_norm": 1.6925077438354492, + "learning_rate": 1.3054351908829558e-06, + "loss": 0.8922, + "step": 4278 + }, + { + "epoch": 3.984171322160149, + "grad_norm": 1.7218379974365234, + "learning_rate": 1.3043477954918189e-06, + "loss": 0.936, + "step": 4279 + }, + { + "epoch": 3.985102420856611, + "grad_norm": 1.7616764307022095, + "learning_rate": 1.3032606933103305e-06, + "loss": 0.8905, + "step": 4280 + }, + { + "epoch": 3.9860335195530725, + "grad_norm": 1.7263654470443726, + "learning_rate": 1.302173884605082e-06, + "loss": 0.9289, + "step": 4281 + }, + { + "epoch": 3.9869646182495346, + "grad_norm": 1.785617470741272, + "learning_rate": 1.301087369642588e-06, + "loss": 0.9303, + "step": 4282 + }, + { + "epoch": 3.9878957169459963, + "grad_norm": 1.753913164138794, + "learning_rate": 1.3000011486892948e-06, + "loss": 0.9088, + "step": 4283 + }, + { + "epoch": 3.988826815642458, + "grad_norm": 1.7531358003616333, + "learning_rate": 1.2989152220115803e-06, + "loss": 0.9382, + "step": 4284 + }, + { + "epoch": 3.98975791433892, + "grad_norm": 1.7975728511810303, + "learning_rate": 1.2978295898757414e-06, + "loss": 0.9357, + "step": 4285 + }, + { + "epoch": 3.990689013035382, + "grad_norm": 1.7662535905838013, + "learning_rate": 1.2967442525480092e-06, + "loss": 0.9669, + "step": 4286 + }, + { + "epoch": 3.9916201117318435, + "grad_norm": 1.771039366722107, + "learning_rate": 1.295659210294542e-06, + "loss": 0.8885, + "step": 4287 + }, + { + "epoch": 3.9925512104283056, + "grad_norm": 1.7708089351654053, + "learning_rate": 1.2945744633814245e-06, + "loss": 0.933, + "step": 4288 + }, + { + "epoch": 3.9934823091247673, + "grad_norm": 1.7533818483352661, + "learning_rate": 1.2934900120746672e-06, + "loss": 0.9461, + "step": 4289 + }, + { + "epoch": 3.994413407821229, + "grad_norm": 1.743216872215271, + "learning_rate": 1.2924058566402097e-06, + "loss": 0.8616, + "step": 4290 + }, + { + "epoch": 3.995344506517691, + "grad_norm": 1.7505429983139038, + "learning_rate": 1.2913219973439234e-06, + "loss": 0.9026, + "step": 4291 + }, + { + "epoch": 3.996275605214153, + "grad_norm": 1.7968896627426147, + "learning_rate": 1.2902384344515984e-06, + "loss": 0.9293, + "step": 4292 + }, + { + "epoch": 3.9972067039106145, + "grad_norm": 1.7324241399765015, + "learning_rate": 1.2891551682289582e-06, + "loss": 0.8965, + "step": 4293 + }, + { + "epoch": 3.998137802607076, + "grad_norm": 1.7534737586975098, + "learning_rate": 1.2880721989416528e-06, + "loss": 0.8998, + "step": 4294 + }, + { + "epoch": 3.9990689013035383, + "grad_norm": 1.8052053451538086, + "learning_rate": 1.2869895268552596e-06, + "loss": 0.9414, + "step": 4295 + }, + { + "epoch": 4.0, + "grad_norm": 1.813524603843689, + "learning_rate": 1.2859071522352794e-06, + "loss": 0.931, + "step": 4296 + } + ], + "logging_steps": 1, + "max_steps": 6444, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 1074, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0851222140108145e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}