diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8254 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1173, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0025575447570332483, + "grad_norm": 60.07620508040347, + "learning_rate": 0.0, + "loss": 10.9714, + "step": 1 + }, + { + "epoch": 0.005115089514066497, + "grad_norm": 60.511635982681035, + "learning_rate": 4.2372881355932204e-07, + "loss": 11.044, + "step": 2 + }, + { + "epoch": 0.0076726342710997444, + "grad_norm": 61.57012701086648, + "learning_rate": 8.474576271186441e-07, + "loss": 10.9687, + "step": 3 + }, + { + "epoch": 0.010230179028132993, + "grad_norm": 62.423863746635334, + "learning_rate": 1.2711864406779662e-06, + "loss": 10.9132, + "step": 4 + }, + { + "epoch": 0.01278772378516624, + "grad_norm": 60.51018546257131, + "learning_rate": 1.6949152542372882e-06, + "loss": 11.0108, + "step": 5 + }, + { + "epoch": 0.015345268542199489, + "grad_norm": 66.2795306712718, + "learning_rate": 2.11864406779661e-06, + "loss": 10.7022, + "step": 6 + }, + { + "epoch": 0.017902813299232736, + "grad_norm": 68.66164801562074, + "learning_rate": 2.5423728813559323e-06, + "loss": 10.6058, + "step": 7 + }, + { + "epoch": 0.020460358056265986, + "grad_norm": 107.4660149943893, + "learning_rate": 2.9661016949152545e-06, + "loss": 9.0593, + "step": 8 + }, + { + "epoch": 0.023017902813299233, + "grad_norm": 122.48386436910788, + "learning_rate": 3.3898305084745763e-06, + "loss": 8.4522, + "step": 9 + }, + { + "epoch": 0.02557544757033248, + "grad_norm": 125.82848908671042, + "learning_rate": 3.813559322033899e-06, + "loss": 5.6693, + "step": 10 + }, + { + "epoch": 0.028132992327365727, + "grad_norm": 52.58888004444451, + "learning_rate": 4.23728813559322e-06, + "loss": 3.0629, + "step": 11 + }, + { + "epoch": 0.030690537084398978, + "grad_norm": 37.39340585668415, + "learning_rate": 4.6610169491525425e-06, + "loss": 2.376, + "step": 12 + }, + { + "epoch": 0.03324808184143223, + "grad_norm": 28.735337125133064, + "learning_rate": 5.084745762711865e-06, + "loss": 2.1006, + "step": 13 + }, + { + "epoch": 0.03580562659846547, + "grad_norm": 6.3291764630351315, + "learning_rate": 5.508474576271187e-06, + "loss": 1.2756, + "step": 14 + }, + { + "epoch": 0.03836317135549872, + "grad_norm": 4.690308248334096, + "learning_rate": 5.932203389830509e-06, + "loss": 1.2509, + "step": 15 + }, + { + "epoch": 0.04092071611253197, + "grad_norm": 3.5468348254384843, + "learning_rate": 6.3559322033898304e-06, + "loss": 1.1712, + "step": 16 + }, + { + "epoch": 0.043478260869565216, + "grad_norm": 2.676492989643342, + "learning_rate": 6.779661016949153e-06, + "loss": 1.055, + "step": 17 + }, + { + "epoch": 0.04603580562659847, + "grad_norm": 2.1888510444313205, + "learning_rate": 7.203389830508475e-06, + "loss": 1.0324, + "step": 18 + }, + { + "epoch": 0.04859335038363171, + "grad_norm": 55.49598040447309, + "learning_rate": 7.627118644067798e-06, + "loss": 0.9577, + "step": 19 + }, + { + "epoch": 0.05115089514066496, + "grad_norm": 18.10939464017419, + "learning_rate": 8.050847457627118e-06, + "loss": 0.8841, + "step": 20 + }, + { + "epoch": 0.05370843989769821, + "grad_norm": 1.783845830738153, + "learning_rate": 8.47457627118644e-06, + "loss": 0.8704, + "step": 21 + }, + { + "epoch": 0.056265984654731455, + "grad_norm": 1.2295478253717957, + "learning_rate": 8.898305084745763e-06, + "loss": 0.829, + "step": 22 + }, + { + "epoch": 0.058823529411764705, + "grad_norm": 1.0279978849315632, + "learning_rate": 9.322033898305085e-06, + "loss": 0.8196, + "step": 23 + }, + { + "epoch": 0.061381074168797956, + "grad_norm": 0.8982739673565904, + "learning_rate": 9.745762711864407e-06, + "loss": 0.7903, + "step": 24 + }, + { + "epoch": 0.0639386189258312, + "grad_norm": 0.7588801023963194, + "learning_rate": 1.016949152542373e-05, + "loss": 0.7177, + "step": 25 + }, + { + "epoch": 0.06649616368286446, + "grad_norm": 1.0123370131062162, + "learning_rate": 1.0593220338983052e-05, + "loss": 0.7536, + "step": 26 + }, + { + "epoch": 0.06905370843989769, + "grad_norm": 0.7910316066634632, + "learning_rate": 1.1016949152542374e-05, + "loss": 0.6874, + "step": 27 + }, + { + "epoch": 0.07161125319693094, + "grad_norm": 0.7192937721653079, + "learning_rate": 1.1440677966101696e-05, + "loss": 0.6942, + "step": 28 + }, + { + "epoch": 0.0741687979539642, + "grad_norm": 0.6367048650637959, + "learning_rate": 1.1864406779661018e-05, + "loss": 0.652, + "step": 29 + }, + { + "epoch": 0.07672634271099744, + "grad_norm": 0.6890008346231932, + "learning_rate": 1.228813559322034e-05, + "loss": 0.6527, + "step": 30 + }, + { + "epoch": 0.0792838874680307, + "grad_norm": 0.7018774861427414, + "learning_rate": 1.2711864406779661e-05, + "loss": 0.6389, + "step": 31 + }, + { + "epoch": 0.08184143222506395, + "grad_norm": 0.6934531307251741, + "learning_rate": 1.3135593220338985e-05, + "loss": 0.6612, + "step": 32 + }, + { + "epoch": 0.08439897698209718, + "grad_norm": 0.4547490187162451, + "learning_rate": 1.3559322033898305e-05, + "loss": 0.6272, + "step": 33 + }, + { + "epoch": 0.08695652173913043, + "grad_norm": 0.5411681099025528, + "learning_rate": 1.3983050847457627e-05, + "loss": 0.6326, + "step": 34 + }, + { + "epoch": 0.08951406649616368, + "grad_norm": 0.5591394716745298, + "learning_rate": 1.440677966101695e-05, + "loss": 0.6139, + "step": 35 + }, + { + "epoch": 0.09207161125319693, + "grad_norm": 0.4569913550931653, + "learning_rate": 1.4830508474576272e-05, + "loss": 0.6073, + "step": 36 + }, + { + "epoch": 0.09462915601023018, + "grad_norm": 0.4147309558729621, + "learning_rate": 1.5254237288135596e-05, + "loss": 0.6017, + "step": 37 + }, + { + "epoch": 0.09718670076726342, + "grad_norm": 0.44578293274404973, + "learning_rate": 1.5677966101694916e-05, + "loss": 0.578, + "step": 38 + }, + { + "epoch": 0.09974424552429667, + "grad_norm": 0.44759576906101894, + "learning_rate": 1.6101694915254237e-05, + "loss": 0.5725, + "step": 39 + }, + { + "epoch": 0.10230179028132992, + "grad_norm": 0.521441753506374, + "learning_rate": 1.652542372881356e-05, + "loss": 0.6091, + "step": 40 + }, + { + "epoch": 0.10485933503836317, + "grad_norm": 0.3633683810476169, + "learning_rate": 1.694915254237288e-05, + "loss": 0.591, + "step": 41 + }, + { + "epoch": 0.10741687979539642, + "grad_norm": 0.38875293035716313, + "learning_rate": 1.7372881355932205e-05, + "loss": 0.5684, + "step": 42 + }, + { + "epoch": 0.10997442455242967, + "grad_norm": 0.4050488399781334, + "learning_rate": 1.7796610169491526e-05, + "loss": 0.5604, + "step": 43 + }, + { + "epoch": 0.11253196930946291, + "grad_norm": 0.35484531528744356, + "learning_rate": 1.8220338983050846e-05, + "loss": 0.5588, + "step": 44 + }, + { + "epoch": 0.11508951406649616, + "grad_norm": 0.3558009349640067, + "learning_rate": 1.864406779661017e-05, + "loss": 0.5772, + "step": 45 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 0.3631599278698065, + "learning_rate": 1.906779661016949e-05, + "loss": 0.5567, + "step": 46 + }, + { + "epoch": 0.12020460358056266, + "grad_norm": 0.29178893481388374, + "learning_rate": 1.9491525423728814e-05, + "loss": 0.5575, + "step": 47 + }, + { + "epoch": 0.12276214833759591, + "grad_norm": 0.28512370332661957, + "learning_rate": 1.9915254237288135e-05, + "loss": 0.545, + "step": 48 + }, + { + "epoch": 0.12531969309462915, + "grad_norm": 0.33383686916439004, + "learning_rate": 2.033898305084746e-05, + "loss": 0.5395, + "step": 49 + }, + { + "epoch": 0.1278772378516624, + "grad_norm": 0.3302302589173117, + "learning_rate": 2.076271186440678e-05, + "loss": 0.5654, + "step": 50 + }, + { + "epoch": 0.13043478260869565, + "grad_norm": 0.25804408924344063, + "learning_rate": 2.1186440677966103e-05, + "loss": 0.545, + "step": 51 + }, + { + "epoch": 0.1329923273657289, + "grad_norm": 0.27338682999676506, + "learning_rate": 2.1610169491525427e-05, + "loss": 0.5417, + "step": 52 + }, + { + "epoch": 0.13554987212276215, + "grad_norm": 0.25924856640229854, + "learning_rate": 2.2033898305084748e-05, + "loss": 0.5435, + "step": 53 + }, + { + "epoch": 0.13810741687979539, + "grad_norm": 0.25667969517909306, + "learning_rate": 2.245762711864407e-05, + "loss": 0.5027, + "step": 54 + }, + { + "epoch": 0.14066496163682865, + "grad_norm": 0.2651721483714715, + "learning_rate": 2.2881355932203392e-05, + "loss": 0.5148, + "step": 55 + }, + { + "epoch": 0.1432225063938619, + "grad_norm": 0.2695589091283933, + "learning_rate": 2.3305084745762712e-05, + "loss": 0.5421, + "step": 56 + }, + { + "epoch": 0.14578005115089515, + "grad_norm": 0.2660946246807775, + "learning_rate": 2.3728813559322036e-05, + "loss": 0.5302, + "step": 57 + }, + { + "epoch": 0.1483375959079284, + "grad_norm": 0.2572598707834026, + "learning_rate": 2.4152542372881357e-05, + "loss": 0.5494, + "step": 58 + }, + { + "epoch": 0.15089514066496162, + "grad_norm": 0.25796653370038297, + "learning_rate": 2.457627118644068e-05, + "loss": 0.5173, + "step": 59 + }, + { + "epoch": 0.1534526854219949, + "grad_norm": 0.26719666930574326, + "learning_rate": 2.5e-05, + "loss": 0.5318, + "step": 60 + }, + { + "epoch": 0.15601023017902813, + "grad_norm": 0.2415395019191131, + "learning_rate": 2.5423728813559322e-05, + "loss": 0.533, + "step": 61 + }, + { + "epoch": 0.1585677749360614, + "grad_norm": 0.2731503593131359, + "learning_rate": 2.5847457627118642e-05, + "loss": 0.5138, + "step": 62 + }, + { + "epoch": 0.16112531969309463, + "grad_norm": 0.23021339667231472, + "learning_rate": 2.627118644067797e-05, + "loss": 0.506, + "step": 63 + }, + { + "epoch": 0.1636828644501279, + "grad_norm": 0.2438183399920384, + "learning_rate": 2.669491525423729e-05, + "loss": 0.4933, + "step": 64 + }, + { + "epoch": 0.16624040920716113, + "grad_norm": 0.25625774549395297, + "learning_rate": 2.711864406779661e-05, + "loss": 0.5275, + "step": 65 + }, + { + "epoch": 0.16879795396419436, + "grad_norm": 0.2523483723490555, + "learning_rate": 2.754237288135593e-05, + "loss": 0.517, + "step": 66 + }, + { + "epoch": 0.17135549872122763, + "grad_norm": 0.24599282565528238, + "learning_rate": 2.7966101694915255e-05, + "loss": 0.5105, + "step": 67 + }, + { + "epoch": 0.17391304347826086, + "grad_norm": 0.25271072320627247, + "learning_rate": 2.838983050847458e-05, + "loss": 0.4947, + "step": 68 + }, + { + "epoch": 0.17647058823529413, + "grad_norm": 0.26800675870234536, + "learning_rate": 2.88135593220339e-05, + "loss": 0.5018, + "step": 69 + }, + { + "epoch": 0.17902813299232737, + "grad_norm": 0.22967309445842915, + "learning_rate": 2.9237288135593223e-05, + "loss": 0.5127, + "step": 70 + }, + { + "epoch": 0.1815856777493606, + "grad_norm": 0.2936501608494599, + "learning_rate": 2.9661016949152544e-05, + "loss": 0.5067, + "step": 71 + }, + { + "epoch": 0.18414322250639387, + "grad_norm": 0.3944135766030376, + "learning_rate": 3.0084745762711864e-05, + "loss": 0.5189, + "step": 72 + }, + { + "epoch": 0.1867007672634271, + "grad_norm": 0.266923293934136, + "learning_rate": 3.050847457627119e-05, + "loss": 0.5099, + "step": 73 + }, + { + "epoch": 0.18925831202046037, + "grad_norm": 0.25718984553900326, + "learning_rate": 3.093220338983051e-05, + "loss": 0.5024, + "step": 74 + }, + { + "epoch": 0.1918158567774936, + "grad_norm": 0.23516139958516855, + "learning_rate": 3.135593220338983e-05, + "loss": 0.4961, + "step": 75 + }, + { + "epoch": 0.19437340153452684, + "grad_norm": 0.2629972539950733, + "learning_rate": 3.177966101694915e-05, + "loss": 0.4858, + "step": 76 + }, + { + "epoch": 0.1969309462915601, + "grad_norm": 0.2397591843698089, + "learning_rate": 3.2203389830508473e-05, + "loss": 0.5022, + "step": 77 + }, + { + "epoch": 0.19948849104859334, + "grad_norm": 0.2488143296082389, + "learning_rate": 3.26271186440678e-05, + "loss": 0.5008, + "step": 78 + }, + { + "epoch": 0.2020460358056266, + "grad_norm": 0.284022517588893, + "learning_rate": 3.305084745762712e-05, + "loss": 0.4944, + "step": 79 + }, + { + "epoch": 0.20460358056265984, + "grad_norm": 0.2585535341280856, + "learning_rate": 3.347457627118644e-05, + "loss": 0.4681, + "step": 80 + }, + { + "epoch": 0.2071611253196931, + "grad_norm": 0.27227808307258267, + "learning_rate": 3.389830508474576e-05, + "loss": 0.4798, + "step": 81 + }, + { + "epoch": 0.20971867007672634, + "grad_norm": 0.27943220348506814, + "learning_rate": 3.432203389830508e-05, + "loss": 0.4869, + "step": 82 + }, + { + "epoch": 0.21227621483375958, + "grad_norm": 0.2591147558052403, + "learning_rate": 3.474576271186441e-05, + "loss": 0.5002, + "step": 83 + }, + { + "epoch": 0.21483375959079284, + "grad_norm": 0.26199419848962174, + "learning_rate": 3.516949152542373e-05, + "loss": 0.4848, + "step": 84 + }, + { + "epoch": 0.21739130434782608, + "grad_norm": 0.2560452706817345, + "learning_rate": 3.559322033898305e-05, + "loss": 0.4796, + "step": 85 + }, + { + "epoch": 0.21994884910485935, + "grad_norm": 0.3104926180958261, + "learning_rate": 3.601694915254237e-05, + "loss": 0.4857, + "step": 86 + }, + { + "epoch": 0.22250639386189258, + "grad_norm": 0.2595037856684306, + "learning_rate": 3.644067796610169e-05, + "loss": 0.4786, + "step": 87 + }, + { + "epoch": 0.22506393861892582, + "grad_norm": 0.28985166506581866, + "learning_rate": 3.686440677966102e-05, + "loss": 0.4733, + "step": 88 + }, + { + "epoch": 0.22762148337595908, + "grad_norm": 0.2900856188045173, + "learning_rate": 3.728813559322034e-05, + "loss": 0.4893, + "step": 89 + }, + { + "epoch": 0.23017902813299232, + "grad_norm": 0.3181961782523891, + "learning_rate": 3.771186440677966e-05, + "loss": 0.4836, + "step": 90 + }, + { + "epoch": 0.23273657289002558, + "grad_norm": 0.3524322519656808, + "learning_rate": 3.813559322033898e-05, + "loss": 0.4858, + "step": 91 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 0.277143774625197, + "learning_rate": 3.855932203389831e-05, + "loss": 0.4602, + "step": 92 + }, + { + "epoch": 0.23785166240409208, + "grad_norm": 0.3152846596099472, + "learning_rate": 3.898305084745763e-05, + "loss": 0.4861, + "step": 93 + }, + { + "epoch": 0.24040920716112532, + "grad_norm": 0.3108040600900486, + "learning_rate": 3.940677966101695e-05, + "loss": 0.4735, + "step": 94 + }, + { + "epoch": 0.24296675191815856, + "grad_norm": 0.3636456936928106, + "learning_rate": 3.983050847457627e-05, + "loss": 0.4927, + "step": 95 + }, + { + "epoch": 0.24552429667519182, + "grad_norm": 0.281719824056263, + "learning_rate": 4.025423728813559e-05, + "loss": 0.478, + "step": 96 + }, + { + "epoch": 0.24808184143222506, + "grad_norm": 0.31572505604740536, + "learning_rate": 4.067796610169492e-05, + "loss": 0.4782, + "step": 97 + }, + { + "epoch": 0.2506393861892583, + "grad_norm": 0.3265923715391404, + "learning_rate": 4.110169491525424e-05, + "loss": 0.4769, + "step": 98 + }, + { + "epoch": 0.2531969309462916, + "grad_norm": 0.28803267079398887, + "learning_rate": 4.152542372881356e-05, + "loss": 0.4729, + "step": 99 + }, + { + "epoch": 0.2557544757033248, + "grad_norm": 0.3650171432061163, + "learning_rate": 4.1949152542372886e-05, + "loss": 0.4686, + "step": 100 + }, + { + "epoch": 0.25831202046035806, + "grad_norm": 0.3208885876586653, + "learning_rate": 4.2372881355932206e-05, + "loss": 0.4756, + "step": 101 + }, + { + "epoch": 0.2608695652173913, + "grad_norm": 0.3018386311182313, + "learning_rate": 4.279661016949153e-05, + "loss": 0.4898, + "step": 102 + }, + { + "epoch": 0.26342710997442453, + "grad_norm": 0.35043017471200005, + "learning_rate": 4.3220338983050854e-05, + "loss": 0.4791, + "step": 103 + }, + { + "epoch": 0.2659846547314578, + "grad_norm": 0.34067263771788764, + "learning_rate": 4.3644067796610175e-05, + "loss": 0.4605, + "step": 104 + }, + { + "epoch": 0.26854219948849106, + "grad_norm": 0.30101429979539257, + "learning_rate": 4.4067796610169495e-05, + "loss": 0.4736, + "step": 105 + }, + { + "epoch": 0.2710997442455243, + "grad_norm": 0.30707206512082585, + "learning_rate": 4.4491525423728816e-05, + "loss": 0.4822, + "step": 106 + }, + { + "epoch": 0.27365728900255754, + "grad_norm": 0.39306930698809855, + "learning_rate": 4.491525423728814e-05, + "loss": 0.4586, + "step": 107 + }, + { + "epoch": 0.27621483375959077, + "grad_norm": 0.2793625552949932, + "learning_rate": 4.533898305084746e-05, + "loss": 0.4824, + "step": 108 + }, + { + "epoch": 0.27877237851662406, + "grad_norm": 0.39226221347711837, + "learning_rate": 4.5762711864406784e-05, + "loss": 0.4576, + "step": 109 + }, + { + "epoch": 0.2813299232736573, + "grad_norm": 0.3030667831941101, + "learning_rate": 4.6186440677966104e-05, + "loss": 0.4624, + "step": 110 + }, + { + "epoch": 0.28388746803069054, + "grad_norm": 0.3273613222535301, + "learning_rate": 4.6610169491525425e-05, + "loss": 0.4799, + "step": 111 + }, + { + "epoch": 0.2864450127877238, + "grad_norm": 0.2863063658186757, + "learning_rate": 4.703389830508475e-05, + "loss": 0.4669, + "step": 112 + }, + { + "epoch": 0.289002557544757, + "grad_norm": 0.33232608459400076, + "learning_rate": 4.745762711864407e-05, + "loss": 0.4825, + "step": 113 + }, + { + "epoch": 0.2915601023017903, + "grad_norm": 0.3600411712420216, + "learning_rate": 4.788135593220339e-05, + "loss": 0.4683, + "step": 114 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 0.27761199193640784, + "learning_rate": 4.8305084745762714e-05, + "loss": 0.4755, + "step": 115 + }, + { + "epoch": 0.2966751918158568, + "grad_norm": 0.3144400589669658, + "learning_rate": 4.8728813559322034e-05, + "loss": 0.4566, + "step": 116 + }, + { + "epoch": 0.29923273657289, + "grad_norm": 0.35513580765557595, + "learning_rate": 4.915254237288136e-05, + "loss": 0.459, + "step": 117 + }, + { + "epoch": 0.30179028132992325, + "grad_norm": 0.2738899153960894, + "learning_rate": 4.957627118644068e-05, + "loss": 0.4657, + "step": 118 + }, + { + "epoch": 0.30434782608695654, + "grad_norm": 0.3433568900683564, + "learning_rate": 5e-05, + "loss": 0.4594, + "step": 119 + }, + { + "epoch": 0.3069053708439898, + "grad_norm": 0.2765901256428289, + "learning_rate": 4.9952606635071094e-05, + "loss": 0.4754, + "step": 120 + }, + { + "epoch": 0.309462915601023, + "grad_norm": 0.3551164959173657, + "learning_rate": 4.990521327014218e-05, + "loss": 0.4616, + "step": 121 + }, + { + "epoch": 0.31202046035805625, + "grad_norm": 0.29005640287933204, + "learning_rate": 4.985781990521327e-05, + "loss": 0.4671, + "step": 122 + }, + { + "epoch": 0.3145780051150895, + "grad_norm": 0.3733034604083977, + "learning_rate": 4.981042654028436e-05, + "loss": 0.4891, + "step": 123 + }, + { + "epoch": 0.3171355498721228, + "grad_norm": 0.28943964192066307, + "learning_rate": 4.976303317535545e-05, + "loss": 0.4658, + "step": 124 + }, + { + "epoch": 0.319693094629156, + "grad_norm": 0.34596305401543703, + "learning_rate": 4.9715639810426544e-05, + "loss": 0.4601, + "step": 125 + }, + { + "epoch": 0.32225063938618925, + "grad_norm": 0.30408705183314516, + "learning_rate": 4.9668246445497635e-05, + "loss": 0.4392, + "step": 126 + }, + { + "epoch": 0.3248081841432225, + "grad_norm": 0.2934769724426176, + "learning_rate": 4.9620853080568726e-05, + "loss": 0.4755, + "step": 127 + }, + { + "epoch": 0.3273657289002558, + "grad_norm": 0.3194601339022468, + "learning_rate": 4.957345971563981e-05, + "loss": 0.455, + "step": 128 + }, + { + "epoch": 0.329923273657289, + "grad_norm": 0.2765722150148559, + "learning_rate": 4.95260663507109e-05, + "loss": 0.4371, + "step": 129 + }, + { + "epoch": 0.33248081841432225, + "grad_norm": 0.3098968524738735, + "learning_rate": 4.9478672985781994e-05, + "loss": 0.4479, + "step": 130 + }, + { + "epoch": 0.3350383631713555, + "grad_norm": 0.29058110177351354, + "learning_rate": 4.9431279620853085e-05, + "loss": 0.4638, + "step": 131 + }, + { + "epoch": 0.3375959079283887, + "grad_norm": 0.34878186474460904, + "learning_rate": 4.938388625592417e-05, + "loss": 0.4589, + "step": 132 + }, + { + "epoch": 0.340153452685422, + "grad_norm": 0.34103367199010814, + "learning_rate": 4.933649289099526e-05, + "loss": 0.4494, + "step": 133 + }, + { + "epoch": 0.34271099744245526, + "grad_norm": 0.3024000321891373, + "learning_rate": 4.928909952606635e-05, + "loss": 0.4642, + "step": 134 + }, + { + "epoch": 0.3452685421994885, + "grad_norm": 0.3120266717266376, + "learning_rate": 4.9241706161137443e-05, + "loss": 0.4494, + "step": 135 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 0.3437694967932959, + "learning_rate": 4.919431279620853e-05, + "loss": 0.4385, + "step": 136 + }, + { + "epoch": 0.35038363171355497, + "grad_norm": 0.3561886653860422, + "learning_rate": 4.9146919431279626e-05, + "loss": 0.4503, + "step": 137 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 0.3265621404114159, + "learning_rate": 4.909952606635072e-05, + "loss": 0.4528, + "step": 138 + }, + { + "epoch": 0.3554987212276215, + "grad_norm": 0.39021732468276327, + "learning_rate": 4.90521327014218e-05, + "loss": 0.47, + "step": 139 + }, + { + "epoch": 0.35805626598465473, + "grad_norm": 0.2892525783443359, + "learning_rate": 4.900473933649289e-05, + "loss": 0.4425, + "step": 140 + }, + { + "epoch": 0.36061381074168797, + "grad_norm": 0.35290539690783224, + "learning_rate": 4.8957345971563985e-05, + "loss": 0.4575, + "step": 141 + }, + { + "epoch": 0.3631713554987212, + "grad_norm": 0.4428368742434025, + "learning_rate": 4.8909952606635076e-05, + "loss": 0.4722, + "step": 142 + }, + { + "epoch": 0.3657289002557545, + "grad_norm": 0.2735345303292492, + "learning_rate": 4.886255924170616e-05, + "loss": 0.4553, + "step": 143 + }, + { + "epoch": 0.36828644501278773, + "grad_norm": 0.4438915131124858, + "learning_rate": 4.881516587677725e-05, + "loss": 0.4721, + "step": 144 + }, + { + "epoch": 0.37084398976982097, + "grad_norm": 0.3281658095262752, + "learning_rate": 4.876777251184834e-05, + "loss": 0.4378, + "step": 145 + }, + { + "epoch": 0.3734015345268542, + "grad_norm": 0.3710338165695333, + "learning_rate": 4.8720379146919435e-05, + "loss": 0.4764, + "step": 146 + }, + { + "epoch": 0.37595907928388744, + "grad_norm": 0.35926803120990913, + "learning_rate": 4.867298578199052e-05, + "loss": 0.4552, + "step": 147 + }, + { + "epoch": 0.37851662404092073, + "grad_norm": 0.36794824845872526, + "learning_rate": 4.862559241706162e-05, + "loss": 0.4578, + "step": 148 + }, + { + "epoch": 0.38107416879795397, + "grad_norm": 0.31318291286449124, + "learning_rate": 4.857819905213271e-05, + "loss": 0.4351, + "step": 149 + }, + { + "epoch": 0.3836317135549872, + "grad_norm": 0.33033923224683864, + "learning_rate": 4.853080568720379e-05, + "loss": 0.4565, + "step": 150 + }, + { + "epoch": 0.38618925831202044, + "grad_norm": 0.30424131577956276, + "learning_rate": 4.8483412322274884e-05, + "loss": 0.4403, + "step": 151 + }, + { + "epoch": 0.3887468030690537, + "grad_norm": 0.28074085140395005, + "learning_rate": 4.8436018957345976e-05, + "loss": 0.4486, + "step": 152 + }, + { + "epoch": 0.391304347826087, + "grad_norm": 0.3579125827021185, + "learning_rate": 4.838862559241707e-05, + "loss": 0.4625, + "step": 153 + }, + { + "epoch": 0.3938618925831202, + "grad_norm": 0.3057863908214165, + "learning_rate": 4.834123222748815e-05, + "loss": 0.4361, + "step": 154 + }, + { + "epoch": 0.39641943734015345, + "grad_norm": 0.28441580945568773, + "learning_rate": 4.829383886255924e-05, + "loss": 0.4341, + "step": 155 + }, + { + "epoch": 0.3989769820971867, + "grad_norm": 0.28566109258674055, + "learning_rate": 4.8246445497630334e-05, + "loss": 0.441, + "step": 156 + }, + { + "epoch": 0.40153452685422, + "grad_norm": 0.3002441202365732, + "learning_rate": 4.819905213270142e-05, + "loss": 0.4329, + "step": 157 + }, + { + "epoch": 0.4040920716112532, + "grad_norm": 0.3199646866784537, + "learning_rate": 4.815165876777251e-05, + "loss": 0.4446, + "step": 158 + }, + { + "epoch": 0.40664961636828645, + "grad_norm": 0.2928518681501388, + "learning_rate": 4.810426540284361e-05, + "loss": 0.428, + "step": 159 + }, + { + "epoch": 0.4092071611253197, + "grad_norm": 0.3946927235529595, + "learning_rate": 4.80568720379147e-05, + "loss": 0.4421, + "step": 160 + }, + { + "epoch": 0.4117647058823529, + "grad_norm": 0.30774149759921665, + "learning_rate": 4.8009478672985784e-05, + "loss": 0.4679, + "step": 161 + }, + { + "epoch": 0.4143222506393862, + "grad_norm": 0.3644907390867006, + "learning_rate": 4.7962085308056876e-05, + "loss": 0.4516, + "step": 162 + }, + { + "epoch": 0.41687979539641945, + "grad_norm": 0.31614856501701377, + "learning_rate": 4.791469194312797e-05, + "loss": 0.4539, + "step": 163 + }, + { + "epoch": 0.4194373401534527, + "grad_norm": 0.30645090377175094, + "learning_rate": 4.786729857819905e-05, + "loss": 0.4346, + "step": 164 + }, + { + "epoch": 0.4219948849104859, + "grad_norm": 0.34220416537004444, + "learning_rate": 4.781990521327014e-05, + "loss": 0.4437, + "step": 165 + }, + { + "epoch": 0.42455242966751916, + "grad_norm": 0.29009367411374415, + "learning_rate": 4.7772511848341234e-05, + "loss": 0.4478, + "step": 166 + }, + { + "epoch": 0.42710997442455245, + "grad_norm": 0.3080387840957786, + "learning_rate": 4.7725118483412326e-05, + "loss": 0.4365, + "step": 167 + }, + { + "epoch": 0.4296675191815857, + "grad_norm": 0.30741939240017874, + "learning_rate": 4.767772511848341e-05, + "loss": 0.4588, + "step": 168 + }, + { + "epoch": 0.4322250639386189, + "grad_norm": 0.3198498782578863, + "learning_rate": 4.76303317535545e-05, + "loss": 0.438, + "step": 169 + }, + { + "epoch": 0.43478260869565216, + "grad_norm": 0.34750707859647, + "learning_rate": 4.758293838862559e-05, + "loss": 0.4543, + "step": 170 + }, + { + "epoch": 0.4373401534526854, + "grad_norm": 0.3106322104274765, + "learning_rate": 4.7535545023696684e-05, + "loss": 0.4567, + "step": 171 + }, + { + "epoch": 0.4398976982097187, + "grad_norm": 0.30192961843031885, + "learning_rate": 4.7488151658767775e-05, + "loss": 0.4342, + "step": 172 + }, + { + "epoch": 0.4424552429667519, + "grad_norm": 0.28068686110702473, + "learning_rate": 4.744075829383887e-05, + "loss": 0.4246, + "step": 173 + }, + { + "epoch": 0.44501278772378516, + "grad_norm": 0.343504552181982, + "learning_rate": 4.739336492890996e-05, + "loss": 0.4515, + "step": 174 + }, + { + "epoch": 0.4475703324808184, + "grad_norm": 0.27995937978607677, + "learning_rate": 4.734597156398104e-05, + "loss": 0.4423, + "step": 175 + }, + { + "epoch": 0.45012787723785164, + "grad_norm": 0.3040416539136848, + "learning_rate": 4.7298578199052134e-05, + "loss": 0.45, + "step": 176 + }, + { + "epoch": 0.45268542199488493, + "grad_norm": 0.31835031373188166, + "learning_rate": 4.7251184834123225e-05, + "loss": 0.4532, + "step": 177 + }, + { + "epoch": 0.45524296675191817, + "grad_norm": 0.3414414505648522, + "learning_rate": 4.720379146919432e-05, + "loss": 0.4498, + "step": 178 + }, + { + "epoch": 0.4578005115089514, + "grad_norm": 0.3673972403213916, + "learning_rate": 4.71563981042654e-05, + "loss": 0.444, + "step": 179 + }, + { + "epoch": 0.46035805626598464, + "grad_norm": 0.2994655863634162, + "learning_rate": 4.710900473933649e-05, + "loss": 0.4436, + "step": 180 + }, + { + "epoch": 0.4629156010230179, + "grad_norm": 0.2979340261572654, + "learning_rate": 4.7061611374407584e-05, + "loss": 0.4338, + "step": 181 + }, + { + "epoch": 0.46547314578005117, + "grad_norm": 0.3259496116943633, + "learning_rate": 4.7014218009478675e-05, + "loss": 0.4495, + "step": 182 + }, + { + "epoch": 0.4680306905370844, + "grad_norm": 0.23888073915231028, + "learning_rate": 4.6966824644549767e-05, + "loss": 0.4284, + "step": 183 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 0.32689979789920254, + "learning_rate": 4.691943127962086e-05, + "loss": 0.4307, + "step": 184 + }, + { + "epoch": 0.4731457800511509, + "grad_norm": 0.25079547977758637, + "learning_rate": 4.687203791469195e-05, + "loss": 0.4421, + "step": 185 + }, + { + "epoch": 0.47570332480818417, + "grad_norm": 0.2867599117175957, + "learning_rate": 4.6824644549763034e-05, + "loss": 0.4208, + "step": 186 + }, + { + "epoch": 0.4782608695652174, + "grad_norm": 0.30676226767943815, + "learning_rate": 4.6777251184834125e-05, + "loss": 0.4473, + "step": 187 + }, + { + "epoch": 0.48081841432225064, + "grad_norm": 0.2535226915718885, + "learning_rate": 4.6729857819905216e-05, + "loss": 0.4341, + "step": 188 + }, + { + "epoch": 0.4833759590792839, + "grad_norm": 0.2953685479977593, + "learning_rate": 4.668246445497631e-05, + "loss": 0.4296, + "step": 189 + }, + { + "epoch": 0.4859335038363171, + "grad_norm": 0.24557281792948057, + "learning_rate": 4.663507109004739e-05, + "loss": 0.4507, + "step": 190 + }, + { + "epoch": 0.4884910485933504, + "grad_norm": 0.2738208517596116, + "learning_rate": 4.6587677725118484e-05, + "loss": 0.4285, + "step": 191 + }, + { + "epoch": 0.49104859335038364, + "grad_norm": 0.28109008439258515, + "learning_rate": 4.6540284360189575e-05, + "loss": 0.4396, + "step": 192 + }, + { + "epoch": 0.4936061381074169, + "grad_norm": 0.2793263219783419, + "learning_rate": 4.6492890995260666e-05, + "loss": 0.4334, + "step": 193 + }, + { + "epoch": 0.4961636828644501, + "grad_norm": 0.2679578064695335, + "learning_rate": 4.644549763033176e-05, + "loss": 0.4425, + "step": 194 + }, + { + "epoch": 0.49872122762148335, + "grad_norm": 0.22379280473483837, + "learning_rate": 4.639810426540285e-05, + "loss": 0.4366, + "step": 195 + }, + { + "epoch": 0.5012787723785166, + "grad_norm": 0.24785033078885174, + "learning_rate": 4.635071090047394e-05, + "loss": 0.4309, + "step": 196 + }, + { + "epoch": 0.5038363171355499, + "grad_norm": 0.24670000195823377, + "learning_rate": 4.6303317535545025e-05, + "loss": 0.4417, + "step": 197 + }, + { + "epoch": 0.5063938618925832, + "grad_norm": 0.2930253060170641, + "learning_rate": 4.6255924170616116e-05, + "loss": 0.4375, + "step": 198 + }, + { + "epoch": 0.5089514066496164, + "grad_norm": 0.25825281391527216, + "learning_rate": 4.620853080568721e-05, + "loss": 0.4069, + "step": 199 + }, + { + "epoch": 0.5115089514066496, + "grad_norm": 0.26224408452770004, + "learning_rate": 4.616113744075829e-05, + "loss": 0.4324, + "step": 200 + }, + { + "epoch": 0.5140664961636828, + "grad_norm": 0.25990930281801855, + "learning_rate": 4.6113744075829384e-05, + "loss": 0.4345, + "step": 201 + }, + { + "epoch": 0.5166240409207161, + "grad_norm": 0.268851283978036, + "learning_rate": 4.6066350710900475e-05, + "loss": 0.4459, + "step": 202 + }, + { + "epoch": 0.5191815856777494, + "grad_norm": 0.24959358046946803, + "learning_rate": 4.6018957345971566e-05, + "loss": 0.429, + "step": 203 + }, + { + "epoch": 0.5217391304347826, + "grad_norm": 0.25540467279864626, + "learning_rate": 4.597156398104265e-05, + "loss": 0.4348, + "step": 204 + }, + { + "epoch": 0.5242966751918159, + "grad_norm": 0.3130713054404299, + "learning_rate": 4.592417061611375e-05, + "loss": 0.4271, + "step": 205 + }, + { + "epoch": 0.5268542199488491, + "grad_norm": 0.2688748449663916, + "learning_rate": 4.587677725118484e-05, + "loss": 0.442, + "step": 206 + }, + { + "epoch": 0.5294117647058824, + "grad_norm": 0.28683589425397626, + "learning_rate": 4.5829383886255925e-05, + "loss": 0.4333, + "step": 207 + }, + { + "epoch": 0.5319693094629157, + "grad_norm": 0.271763985489389, + "learning_rate": 4.5781990521327016e-05, + "loss": 0.4438, + "step": 208 + }, + { + "epoch": 0.5345268542199488, + "grad_norm": 0.2885843579908882, + "learning_rate": 4.573459715639811e-05, + "loss": 0.4419, + "step": 209 + }, + { + "epoch": 0.5370843989769821, + "grad_norm": 0.28754217783051483, + "learning_rate": 4.56872037914692e-05, + "loss": 0.4355, + "step": 210 + }, + { + "epoch": 0.5396419437340153, + "grad_norm": 0.2737511286441873, + "learning_rate": 4.563981042654028e-05, + "loss": 0.4387, + "step": 211 + }, + { + "epoch": 0.5421994884910486, + "grad_norm": 0.27934016374689097, + "learning_rate": 4.5592417061611375e-05, + "loss": 0.4349, + "step": 212 + }, + { + "epoch": 0.5447570332480819, + "grad_norm": 0.26735219691819356, + "learning_rate": 4.5545023696682466e-05, + "loss": 0.4134, + "step": 213 + }, + { + "epoch": 0.5473145780051151, + "grad_norm": 0.23887968506376323, + "learning_rate": 4.549763033175356e-05, + "loss": 0.4229, + "step": 214 + }, + { + "epoch": 0.5498721227621484, + "grad_norm": 0.3011075198266259, + "learning_rate": 4.545023696682464e-05, + "loss": 0.428, + "step": 215 + }, + { + "epoch": 0.5524296675191815, + "grad_norm": 0.2637321441272665, + "learning_rate": 4.540284360189574e-05, + "loss": 0.4363, + "step": 216 + }, + { + "epoch": 0.5549872122762148, + "grad_norm": 0.29296145440427007, + "learning_rate": 4.535545023696683e-05, + "loss": 0.4304, + "step": 217 + }, + { + "epoch": 0.5575447570332481, + "grad_norm": 0.30674762583017257, + "learning_rate": 4.5308056872037916e-05, + "loss": 0.4298, + "step": 218 + }, + { + "epoch": 0.5601023017902813, + "grad_norm": 0.3143323294898988, + "learning_rate": 4.526066350710901e-05, + "loss": 0.4174, + "step": 219 + }, + { + "epoch": 0.5626598465473146, + "grad_norm": 0.32231260882420976, + "learning_rate": 4.52132701421801e-05, + "loss": 0.4167, + "step": 220 + }, + { + "epoch": 0.5652173913043478, + "grad_norm": 0.3083722811455428, + "learning_rate": 4.516587677725119e-05, + "loss": 0.4146, + "step": 221 + }, + { + "epoch": 0.5677749360613811, + "grad_norm": 0.29120067898722657, + "learning_rate": 4.5118483412322274e-05, + "loss": 0.4341, + "step": 222 + }, + { + "epoch": 0.5703324808184144, + "grad_norm": 0.31085600501836047, + "learning_rate": 4.5071090047393366e-05, + "loss": 0.4368, + "step": 223 + }, + { + "epoch": 0.5728900255754475, + "grad_norm": 0.2562962629149674, + "learning_rate": 4.502369668246446e-05, + "loss": 0.4505, + "step": 224 + }, + { + "epoch": 0.5754475703324808, + "grad_norm": 0.3229335775809623, + "learning_rate": 4.497630331753555e-05, + "loss": 0.4281, + "step": 225 + }, + { + "epoch": 0.578005115089514, + "grad_norm": 0.2540883724081723, + "learning_rate": 4.492890995260663e-05, + "loss": 0.4345, + "step": 226 + }, + { + "epoch": 0.5805626598465473, + "grad_norm": 0.2886423143864352, + "learning_rate": 4.488151658767773e-05, + "loss": 0.4252, + "step": 227 + }, + { + "epoch": 0.5831202046035806, + "grad_norm": 0.25233412822407364, + "learning_rate": 4.483412322274882e-05, + "loss": 0.4366, + "step": 228 + }, + { + "epoch": 0.5856777493606138, + "grad_norm": 0.3098472836225145, + "learning_rate": 4.478672985781991e-05, + "loss": 0.4363, + "step": 229 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 0.27067664311480977, + "learning_rate": 4.4739336492891e-05, + "loss": 0.4354, + "step": 230 + }, + { + "epoch": 0.5907928388746803, + "grad_norm": 0.28985639348209424, + "learning_rate": 4.469194312796209e-05, + "loss": 0.4441, + "step": 231 + }, + { + "epoch": 0.5933503836317136, + "grad_norm": 0.24685436630203944, + "learning_rate": 4.464454976303318e-05, + "loss": 0.4249, + "step": 232 + }, + { + "epoch": 0.5959079283887468, + "grad_norm": 0.2415267361110554, + "learning_rate": 4.4597156398104266e-05, + "loss": 0.4218, + "step": 233 + }, + { + "epoch": 0.59846547314578, + "grad_norm": 0.2690111434121743, + "learning_rate": 4.454976303317536e-05, + "loss": 0.4597, + "step": 234 + }, + { + "epoch": 0.6010230179028133, + "grad_norm": 0.24515241676488578, + "learning_rate": 4.450236966824645e-05, + "loss": 0.4484, + "step": 235 + }, + { + "epoch": 0.6035805626598465, + "grad_norm": 0.27035232285201444, + "learning_rate": 4.445497630331753e-05, + "loss": 0.4241, + "step": 236 + }, + { + "epoch": 0.6061381074168798, + "grad_norm": 0.24712864164146403, + "learning_rate": 4.4407582938388624e-05, + "loss": 0.4351, + "step": 237 + }, + { + "epoch": 0.6086956521739131, + "grad_norm": 0.2756970755602701, + "learning_rate": 4.4360189573459716e-05, + "loss": 0.424, + "step": 238 + }, + { + "epoch": 0.6112531969309463, + "grad_norm": 0.219814601291788, + "learning_rate": 4.431279620853081e-05, + "loss": 0.4355, + "step": 239 + }, + { + "epoch": 0.6138107416879796, + "grad_norm": 0.3022287967822769, + "learning_rate": 4.42654028436019e-05, + "loss": 0.421, + "step": 240 + }, + { + "epoch": 0.6163682864450127, + "grad_norm": 0.25187957786419013, + "learning_rate": 4.421800947867299e-05, + "loss": 0.3987, + "step": 241 + }, + { + "epoch": 0.618925831202046, + "grad_norm": 0.2906641083550279, + "learning_rate": 4.417061611374408e-05, + "loss": 0.4094, + "step": 242 + }, + { + "epoch": 0.6214833759590793, + "grad_norm": 0.276150078017692, + "learning_rate": 4.4123222748815165e-05, + "loss": 0.4336, + "step": 243 + }, + { + "epoch": 0.6240409207161125, + "grad_norm": 0.31066268816197273, + "learning_rate": 4.407582938388626e-05, + "loss": 0.4327, + "step": 244 + }, + { + "epoch": 0.6265984654731458, + "grad_norm": 0.2741673358883194, + "learning_rate": 4.402843601895735e-05, + "loss": 0.4389, + "step": 245 + }, + { + "epoch": 0.629156010230179, + "grad_norm": 0.2836157982013865, + "learning_rate": 4.398104265402844e-05, + "loss": 0.4197, + "step": 246 + }, + { + "epoch": 0.6317135549872123, + "grad_norm": 0.2785562060260622, + "learning_rate": 4.3933649289099524e-05, + "loss": 0.4118, + "step": 247 + }, + { + "epoch": 0.6342710997442456, + "grad_norm": 0.2562708631634233, + "learning_rate": 4.3886255924170615e-05, + "loss": 0.4313, + "step": 248 + }, + { + "epoch": 0.6368286445012787, + "grad_norm": 0.3006474659338952, + "learning_rate": 4.383886255924171e-05, + "loss": 0.4369, + "step": 249 + }, + { + "epoch": 0.639386189258312, + "grad_norm": 0.2457393144167786, + "learning_rate": 4.37914691943128e-05, + "loss": 0.4262, + "step": 250 + }, + { + "epoch": 0.6419437340153452, + "grad_norm": 0.2613054151983516, + "learning_rate": 4.374407582938389e-05, + "loss": 0.4156, + "step": 251 + }, + { + "epoch": 0.6445012787723785, + "grad_norm": 0.2560975612132112, + "learning_rate": 4.369668246445498e-05, + "loss": 0.4327, + "step": 252 + }, + { + "epoch": 0.6470588235294118, + "grad_norm": 0.2615125383682568, + "learning_rate": 4.364928909952607e-05, + "loss": 0.4239, + "step": 253 + }, + { + "epoch": 0.649616368286445, + "grad_norm": 0.2703032829220517, + "learning_rate": 4.3601895734597157e-05, + "loss": 0.4326, + "step": 254 + }, + { + "epoch": 0.6521739130434783, + "grad_norm": 0.27271514949565595, + "learning_rate": 4.355450236966825e-05, + "loss": 0.4265, + "step": 255 + }, + { + "epoch": 0.6547314578005116, + "grad_norm": 0.28726916782564577, + "learning_rate": 4.350710900473934e-05, + "loss": 0.4357, + "step": 256 + }, + { + "epoch": 0.6572890025575447, + "grad_norm": 0.24344125190622717, + "learning_rate": 4.345971563981043e-05, + "loss": 0.4209, + "step": 257 + }, + { + "epoch": 0.659846547314578, + "grad_norm": 0.2779762711774089, + "learning_rate": 4.3412322274881515e-05, + "loss": 0.4402, + "step": 258 + }, + { + "epoch": 0.6624040920716112, + "grad_norm": 0.2833066194766303, + "learning_rate": 4.3364928909952606e-05, + "loss": 0.4309, + "step": 259 + }, + { + "epoch": 0.6649616368286445, + "grad_norm": 0.264439200242611, + "learning_rate": 4.33175355450237e-05, + "loss": 0.4234, + "step": 260 + }, + { + "epoch": 0.6675191815856778, + "grad_norm": 0.24820943480335378, + "learning_rate": 4.327014218009479e-05, + "loss": 0.3998, + "step": 261 + }, + { + "epoch": 0.670076726342711, + "grad_norm": 0.25992990540498473, + "learning_rate": 4.322274881516588e-05, + "loss": 0.4168, + "step": 262 + }, + { + "epoch": 0.6726342710997443, + "grad_norm": 0.261861520036362, + "learning_rate": 4.317535545023697e-05, + "loss": 0.4148, + "step": 263 + }, + { + "epoch": 0.6751918158567775, + "grad_norm": 0.26644356287497634, + "learning_rate": 4.312796208530806e-05, + "loss": 0.4345, + "step": 264 + }, + { + "epoch": 0.6777493606138107, + "grad_norm": 0.2666945078617733, + "learning_rate": 4.308056872037915e-05, + "loss": 0.429, + "step": 265 + }, + { + "epoch": 0.680306905370844, + "grad_norm": 0.23998424454638398, + "learning_rate": 4.303317535545024e-05, + "loss": 0.4226, + "step": 266 + }, + { + "epoch": 0.6828644501278772, + "grad_norm": 0.2530577923125785, + "learning_rate": 4.298578199052133e-05, + "loss": 0.4078, + "step": 267 + }, + { + "epoch": 0.6854219948849105, + "grad_norm": 0.2532304497913718, + "learning_rate": 4.293838862559242e-05, + "loss": 0.4157, + "step": 268 + }, + { + "epoch": 0.6879795396419437, + "grad_norm": 0.25183699854529734, + "learning_rate": 4.2890995260663506e-05, + "loss": 0.4213, + "step": 269 + }, + { + "epoch": 0.690537084398977, + "grad_norm": 0.26547907225114403, + "learning_rate": 4.28436018957346e-05, + "loss": 0.4213, + "step": 270 + }, + { + "epoch": 0.6930946291560103, + "grad_norm": 0.247119162528651, + "learning_rate": 4.279620853080569e-05, + "loss": 0.4441, + "step": 271 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 0.2802387698001237, + "learning_rate": 4.2748815165876774e-05, + "loss": 0.3961, + "step": 272 + }, + { + "epoch": 0.6982097186700768, + "grad_norm": 0.26948037359199567, + "learning_rate": 4.270142180094787e-05, + "loss": 0.4245, + "step": 273 + }, + { + "epoch": 0.7007672634271099, + "grad_norm": 0.26130649009882045, + "learning_rate": 4.265402843601896e-05, + "loss": 0.4322, + "step": 274 + }, + { + "epoch": 0.7033248081841432, + "grad_norm": 0.27770444806162603, + "learning_rate": 4.260663507109005e-05, + "loss": 0.4202, + "step": 275 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 0.2725938450444014, + "learning_rate": 4.255924170616114e-05, + "loss": 0.4287, + "step": 276 + }, + { + "epoch": 0.7084398976982097, + "grad_norm": 0.27389105466937425, + "learning_rate": 4.251184834123223e-05, + "loss": 0.4409, + "step": 277 + }, + { + "epoch": 0.710997442455243, + "grad_norm": 0.2559589781819663, + "learning_rate": 4.246445497630332e-05, + "loss": 0.4239, + "step": 278 + }, + { + "epoch": 0.7135549872122762, + "grad_norm": 0.27905262687916627, + "learning_rate": 4.2417061611374406e-05, + "loss": 0.4012, + "step": 279 + }, + { + "epoch": 0.7161125319693095, + "grad_norm": 0.23334151341578974, + "learning_rate": 4.23696682464455e-05, + "loss": 0.4046, + "step": 280 + }, + { + "epoch": 0.7186700767263428, + "grad_norm": 0.268496141900923, + "learning_rate": 4.232227488151659e-05, + "loss": 0.4333, + "step": 281 + }, + { + "epoch": 0.7212276214833759, + "grad_norm": 0.23917839522777942, + "learning_rate": 4.227488151658768e-05, + "loss": 0.4199, + "step": 282 + }, + { + "epoch": 0.7237851662404092, + "grad_norm": 0.2550111302110382, + "learning_rate": 4.2227488151658765e-05, + "loss": 0.4323, + "step": 283 + }, + { + "epoch": 0.7263427109974424, + "grad_norm": 0.23586654241099228, + "learning_rate": 4.218009478672986e-05, + "loss": 0.4332, + "step": 284 + }, + { + "epoch": 0.7289002557544757, + "grad_norm": 0.23396222749154336, + "learning_rate": 4.2132701421800954e-05, + "loss": 0.4269, + "step": 285 + }, + { + "epoch": 0.731457800511509, + "grad_norm": 0.24604087944261607, + "learning_rate": 4.208530805687204e-05, + "loss": 0.4255, + "step": 286 + }, + { + "epoch": 0.7340153452685422, + "grad_norm": 0.2357151023733209, + "learning_rate": 4.203791469194313e-05, + "loss": 0.4153, + "step": 287 + }, + { + "epoch": 0.7365728900255755, + "grad_norm": 0.2737358478652627, + "learning_rate": 4.199052132701422e-05, + "loss": 0.4122, + "step": 288 + }, + { + "epoch": 0.7391304347826086, + "grad_norm": 0.24451563415626945, + "learning_rate": 4.194312796208531e-05, + "loss": 0.4185, + "step": 289 + }, + { + "epoch": 0.7416879795396419, + "grad_norm": 0.2541923450282548, + "learning_rate": 4.18957345971564e-05, + "loss": 0.433, + "step": 290 + }, + { + "epoch": 0.7442455242966752, + "grad_norm": 0.26598804764295264, + "learning_rate": 4.184834123222749e-05, + "loss": 0.4331, + "step": 291 + }, + { + "epoch": 0.7468030690537084, + "grad_norm": 0.2652230008961156, + "learning_rate": 4.180094786729858e-05, + "loss": 0.4055, + "step": 292 + }, + { + "epoch": 0.7493606138107417, + "grad_norm": 0.2795715968348066, + "learning_rate": 4.175355450236967e-05, + "loss": 0.445, + "step": 293 + }, + { + "epoch": 0.7519181585677749, + "grad_norm": 0.27501165211060447, + "learning_rate": 4.1706161137440756e-05, + "loss": 0.4159, + "step": 294 + }, + { + "epoch": 0.7544757033248082, + "grad_norm": 0.2739979913068721, + "learning_rate": 4.1658767772511854e-05, + "loss": 0.4104, + "step": 295 + }, + { + "epoch": 0.7570332480818415, + "grad_norm": 0.28465485095011533, + "learning_rate": 4.1611374407582945e-05, + "loss": 0.4244, + "step": 296 + }, + { + "epoch": 0.7595907928388747, + "grad_norm": 0.23976720007281227, + "learning_rate": 4.156398104265403e-05, + "loss": 0.4295, + "step": 297 + }, + { + "epoch": 0.7621483375959079, + "grad_norm": 0.3088498871060993, + "learning_rate": 4.151658767772512e-05, + "loss": 0.4092, + "step": 298 + }, + { + "epoch": 0.7647058823529411, + "grad_norm": 0.22770658779763117, + "learning_rate": 4.146919431279621e-05, + "loss": 0.4269, + "step": 299 + }, + { + "epoch": 0.7672634271099744, + "grad_norm": 0.27958609884503893, + "learning_rate": 4.1421800947867304e-05, + "loss": 0.4044, + "step": 300 + }, + { + "epoch": 0.7698209718670077, + "grad_norm": 0.21729140703255853, + "learning_rate": 4.137440758293839e-05, + "loss": 0.4131, + "step": 301 + }, + { + "epoch": 0.7723785166240409, + "grad_norm": 0.2685972778786351, + "learning_rate": 4.132701421800948e-05, + "loss": 0.4207, + "step": 302 + }, + { + "epoch": 0.7749360613810742, + "grad_norm": 0.22146302445276972, + "learning_rate": 4.127962085308057e-05, + "loss": 0.4242, + "step": 303 + }, + { + "epoch": 0.7774936061381074, + "grad_norm": 0.246088542123556, + "learning_rate": 4.123222748815166e-05, + "loss": 0.4141, + "step": 304 + }, + { + "epoch": 0.7800511508951407, + "grad_norm": 0.2601313122186582, + "learning_rate": 4.118483412322275e-05, + "loss": 0.4139, + "step": 305 + }, + { + "epoch": 0.782608695652174, + "grad_norm": 0.24464325806612688, + "learning_rate": 4.113744075829384e-05, + "loss": 0.4163, + "step": 306 + }, + { + "epoch": 0.7851662404092071, + "grad_norm": 0.2651808280050511, + "learning_rate": 4.1090047393364936e-05, + "loss": 0.4306, + "step": 307 + }, + { + "epoch": 0.7877237851662404, + "grad_norm": 0.30621497925153024, + "learning_rate": 4.104265402843602e-05, + "loss": 0.4142, + "step": 308 + }, + { + "epoch": 0.7902813299232737, + "grad_norm": 0.27574828742072455, + "learning_rate": 4.099526066350711e-05, + "loss": 0.4194, + "step": 309 + }, + { + "epoch": 0.7928388746803069, + "grad_norm": 0.2646206797692572, + "learning_rate": 4.0947867298578204e-05, + "loss": 0.4338, + "step": 310 + }, + { + "epoch": 0.7953964194373402, + "grad_norm": 0.2953561111239538, + "learning_rate": 4.090047393364929e-05, + "loss": 0.4354, + "step": 311 + }, + { + "epoch": 0.7979539641943734, + "grad_norm": 0.2679304891781562, + "learning_rate": 4.085308056872038e-05, + "loss": 0.3996, + "step": 312 + }, + { + "epoch": 0.8005115089514067, + "grad_norm": 0.2614240488716786, + "learning_rate": 4.080568720379147e-05, + "loss": 0.4177, + "step": 313 + }, + { + "epoch": 0.80306905370844, + "grad_norm": 0.265506214792124, + "learning_rate": 4.075829383886256e-05, + "loss": 0.4229, + "step": 314 + }, + { + "epoch": 0.8056265984654731, + "grad_norm": 0.27403664060217564, + "learning_rate": 4.071090047393365e-05, + "loss": 0.4111, + "step": 315 + }, + { + "epoch": 0.8081841432225064, + "grad_norm": 0.27566927450054673, + "learning_rate": 4.066350710900474e-05, + "loss": 0.4186, + "step": 316 + }, + { + "epoch": 0.8107416879795396, + "grad_norm": 0.2432325969682962, + "learning_rate": 4.061611374407583e-05, + "loss": 0.4317, + "step": 317 + }, + { + "epoch": 0.8132992327365729, + "grad_norm": 0.3100835908713653, + "learning_rate": 4.056872037914692e-05, + "loss": 0.4263, + "step": 318 + }, + { + "epoch": 0.8158567774936062, + "grad_norm": 0.22437352803704477, + "learning_rate": 4.052132701421801e-05, + "loss": 0.4255, + "step": 319 + }, + { + "epoch": 0.8184143222506394, + "grad_norm": 0.2922741159014344, + "learning_rate": 4.0473933649289103e-05, + "loss": 0.4211, + "step": 320 + }, + { + "epoch": 0.8209718670076727, + "grad_norm": 0.24988657961825148, + "learning_rate": 4.0426540284360195e-05, + "loss": 0.424, + "step": 321 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 0.26255657346142036, + "learning_rate": 4.037914691943128e-05, + "loss": 0.4227, + "step": 322 + }, + { + "epoch": 0.8260869565217391, + "grad_norm": 0.30330186929682673, + "learning_rate": 4.033175355450237e-05, + "loss": 0.4074, + "step": 323 + }, + { + "epoch": 0.8286445012787724, + "grad_norm": 0.2545401531861922, + "learning_rate": 4.028436018957346e-05, + "loss": 0.4229, + "step": 324 + }, + { + "epoch": 0.8312020460358056, + "grad_norm": 0.33076162934856534, + "learning_rate": 4.023696682464455e-05, + "loss": 0.4236, + "step": 325 + }, + { + "epoch": 0.8337595907928389, + "grad_norm": 0.2595060997444347, + "learning_rate": 4.018957345971564e-05, + "loss": 0.3957, + "step": 326 + }, + { + "epoch": 0.8363171355498721, + "grad_norm": 0.27331289296315875, + "learning_rate": 4.014218009478673e-05, + "loss": 0.3975, + "step": 327 + }, + { + "epoch": 0.8388746803069054, + "grad_norm": 0.3262160188917236, + "learning_rate": 4.009478672985782e-05, + "loss": 0.4149, + "step": 328 + }, + { + "epoch": 0.8414322250639387, + "grad_norm": 0.28084918191171054, + "learning_rate": 4.004739336492891e-05, + "loss": 0.3999, + "step": 329 + }, + { + "epoch": 0.8439897698209718, + "grad_norm": 0.28489832877056614, + "learning_rate": 4e-05, + "loss": 0.4134, + "step": 330 + }, + { + "epoch": 0.8465473145780051, + "grad_norm": 0.28245197788542203, + "learning_rate": 3.9952606635071095e-05, + "loss": 0.4169, + "step": 331 + }, + { + "epoch": 0.8491048593350383, + "grad_norm": 0.2637012101965425, + "learning_rate": 3.9905213270142186e-05, + "loss": 0.4218, + "step": 332 + }, + { + "epoch": 0.8516624040920716, + "grad_norm": 0.25239050580322964, + "learning_rate": 3.985781990521327e-05, + "loss": 0.403, + "step": 333 + }, + { + "epoch": 0.8542199488491049, + "grad_norm": 0.3242230481439879, + "learning_rate": 3.981042654028436e-05, + "loss": 0.4413, + "step": 334 + }, + { + "epoch": 0.8567774936061381, + "grad_norm": 0.284310422864808, + "learning_rate": 3.976303317535545e-05, + "loss": 0.3992, + "step": 335 + }, + { + "epoch": 0.8593350383631714, + "grad_norm": 0.44681533776592774, + "learning_rate": 3.9715639810426545e-05, + "loss": 0.427, + "step": 336 + }, + { + "epoch": 0.8618925831202046, + "grad_norm": 0.276866045564762, + "learning_rate": 3.966824644549763e-05, + "loss": 0.4166, + "step": 337 + }, + { + "epoch": 0.8644501278772379, + "grad_norm": 0.2645666241102728, + "learning_rate": 3.962085308056872e-05, + "loss": 0.4018, + "step": 338 + }, + { + "epoch": 0.8670076726342711, + "grad_norm": 0.24575688741880164, + "learning_rate": 3.957345971563981e-05, + "loss": 0.4103, + "step": 339 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 0.27234369778819617, + "learning_rate": 3.95260663507109e-05, + "loss": 0.4178, + "step": 340 + }, + { + "epoch": 0.8721227621483376, + "grad_norm": 0.2510614688018398, + "learning_rate": 3.9478672985781994e-05, + "loss": 0.4124, + "step": 341 + }, + { + "epoch": 0.8746803069053708, + "grad_norm": 0.26748644233845587, + "learning_rate": 3.9431279620853086e-05, + "loss": 0.414, + "step": 342 + }, + { + "epoch": 0.8772378516624041, + "grad_norm": 0.2839803657663104, + "learning_rate": 3.938388625592418e-05, + "loss": 0.4089, + "step": 343 + }, + { + "epoch": 0.8797953964194374, + "grad_norm": 0.2797111880377059, + "learning_rate": 3.933649289099526e-05, + "loss": 0.4143, + "step": 344 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 0.2889574353504485, + "learning_rate": 3.928909952606635e-05, + "loss": 0.4224, + "step": 345 + }, + { + "epoch": 0.8849104859335039, + "grad_norm": 0.2661360330372934, + "learning_rate": 3.9241706161137444e-05, + "loss": 0.4002, + "step": 346 + }, + { + "epoch": 0.887468030690537, + "grad_norm": 0.2771219438556858, + "learning_rate": 3.919431279620853e-05, + "loss": 0.4113, + "step": 347 + }, + { + "epoch": 0.8900255754475703, + "grad_norm": 0.27519975509219513, + "learning_rate": 3.914691943127962e-05, + "loss": 0.4191, + "step": 348 + }, + { + "epoch": 0.8925831202046036, + "grad_norm": 0.2928986459591194, + "learning_rate": 3.909952606635071e-05, + "loss": 0.4233, + "step": 349 + }, + { + "epoch": 0.8951406649616368, + "grad_norm": 0.2516706010333049, + "learning_rate": 3.90521327014218e-05, + "loss": 0.4012, + "step": 350 + }, + { + "epoch": 0.8976982097186701, + "grad_norm": 0.2408367305868911, + "learning_rate": 3.900473933649289e-05, + "loss": 0.409, + "step": 351 + }, + { + "epoch": 0.9002557544757033, + "grad_norm": 0.27279698596719115, + "learning_rate": 3.8957345971563986e-05, + "loss": 0.4068, + "step": 352 + }, + { + "epoch": 0.9028132992327366, + "grad_norm": 0.25724144072901317, + "learning_rate": 3.890995260663508e-05, + "loss": 0.4048, + "step": 353 + }, + { + "epoch": 0.9053708439897699, + "grad_norm": 0.22699353109114737, + "learning_rate": 3.886255924170616e-05, + "loss": 0.4005, + "step": 354 + }, + { + "epoch": 0.907928388746803, + "grad_norm": 0.248751842398148, + "learning_rate": 3.881516587677725e-05, + "loss": 0.403, + "step": 355 + }, + { + "epoch": 0.9104859335038363, + "grad_norm": 0.29922749419927136, + "learning_rate": 3.8767772511848344e-05, + "loss": 0.4255, + "step": 356 + }, + { + "epoch": 0.9130434782608695, + "grad_norm": 0.24165253803081185, + "learning_rate": 3.8720379146919435e-05, + "loss": 0.3939, + "step": 357 + }, + { + "epoch": 0.9156010230179028, + "grad_norm": 0.26769384614675706, + "learning_rate": 3.867298578199052e-05, + "loss": 0.3881, + "step": 358 + }, + { + "epoch": 0.9181585677749361, + "grad_norm": 0.24501952061738294, + "learning_rate": 3.862559241706161e-05, + "loss": 0.412, + "step": 359 + }, + { + "epoch": 0.9207161125319693, + "grad_norm": 0.27781395797316877, + "learning_rate": 3.85781990521327e-05, + "loss": 0.4293, + "step": 360 + }, + { + "epoch": 0.9232736572890026, + "grad_norm": 0.22892488592677732, + "learning_rate": 3.8530805687203794e-05, + "loss": 0.4069, + "step": 361 + }, + { + "epoch": 0.9258312020460358, + "grad_norm": 0.258222796507594, + "learning_rate": 3.848341232227488e-05, + "loss": 0.4177, + "step": 362 + }, + { + "epoch": 0.928388746803069, + "grad_norm": 0.22668062864053168, + "learning_rate": 3.843601895734598e-05, + "loss": 0.4012, + "step": 363 + }, + { + "epoch": 0.9309462915601023, + "grad_norm": 0.29919710610032196, + "learning_rate": 3.838862559241707e-05, + "loss": 0.3971, + "step": 364 + }, + { + "epoch": 0.9335038363171355, + "grad_norm": 0.25611276582674614, + "learning_rate": 3.834123222748815e-05, + "loss": 0.4071, + "step": 365 + }, + { + "epoch": 0.9360613810741688, + "grad_norm": 0.24646222411688562, + "learning_rate": 3.8293838862559244e-05, + "loss": 0.3993, + "step": 366 + }, + { + "epoch": 0.9386189258312021, + "grad_norm": 0.26353127236434676, + "learning_rate": 3.8246445497630335e-05, + "loss": 0.4042, + "step": 367 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 0.23542690864108376, + "learning_rate": 3.8199052132701427e-05, + "loss": 0.3971, + "step": 368 + }, + { + "epoch": 0.9437340153452686, + "grad_norm": 0.28245650020992513, + "learning_rate": 3.815165876777251e-05, + "loss": 0.4044, + "step": 369 + }, + { + "epoch": 0.9462915601023018, + "grad_norm": 0.241903542321646, + "learning_rate": 3.81042654028436e-05, + "loss": 0.4054, + "step": 370 + }, + { + "epoch": 0.948849104859335, + "grad_norm": 0.2378788913607164, + "learning_rate": 3.8056872037914694e-05, + "loss": 0.4125, + "step": 371 + }, + { + "epoch": 0.9514066496163683, + "grad_norm": 0.2804121730578267, + "learning_rate": 3.8009478672985785e-05, + "loss": 0.421, + "step": 372 + }, + { + "epoch": 0.9539641943734015, + "grad_norm": 0.23484030136789275, + "learning_rate": 3.796208530805687e-05, + "loss": 0.4112, + "step": 373 + }, + { + "epoch": 0.9565217391304348, + "grad_norm": 0.27396114311616715, + "learning_rate": 3.791469194312796e-05, + "loss": 0.4204, + "step": 374 + }, + { + "epoch": 0.959079283887468, + "grad_norm": 0.2393059668656863, + "learning_rate": 3.786729857819906e-05, + "loss": 0.4221, + "step": 375 + }, + { + "epoch": 0.9616368286445013, + "grad_norm": 0.2482404610854873, + "learning_rate": 3.7819905213270144e-05, + "loss": 0.4078, + "step": 376 + }, + { + "epoch": 0.9641943734015346, + "grad_norm": 0.2187225932256056, + "learning_rate": 3.7772511848341235e-05, + "loss": 0.4012, + "step": 377 + }, + { + "epoch": 0.9667519181585678, + "grad_norm": 0.2876923866076237, + "learning_rate": 3.7725118483412326e-05, + "loss": 0.4252, + "step": 378 + }, + { + "epoch": 0.969309462915601, + "grad_norm": 0.21352306199272536, + "learning_rate": 3.767772511848342e-05, + "loss": 0.3966, + "step": 379 + }, + { + "epoch": 0.9718670076726342, + "grad_norm": 0.24918249981369345, + "learning_rate": 3.76303317535545e-05, + "loss": 0.415, + "step": 380 + }, + { + "epoch": 0.9744245524296675, + "grad_norm": 0.23387991185870513, + "learning_rate": 3.7582938388625594e-05, + "loss": 0.4127, + "step": 381 + }, + { + "epoch": 0.9769820971867008, + "grad_norm": 0.29074570845452075, + "learning_rate": 3.7535545023696685e-05, + "loss": 0.4275, + "step": 382 + }, + { + "epoch": 0.979539641943734, + "grad_norm": 0.26317053639627985, + "learning_rate": 3.748815165876777e-05, + "loss": 0.4294, + "step": 383 + }, + { + "epoch": 0.9820971867007673, + "grad_norm": 0.2686101535519852, + "learning_rate": 3.744075829383886e-05, + "loss": 0.4025, + "step": 384 + }, + { + "epoch": 0.9846547314578005, + "grad_norm": 0.23512146035375164, + "learning_rate": 3.739336492890995e-05, + "loss": 0.4156, + "step": 385 + }, + { + "epoch": 0.9872122762148338, + "grad_norm": 0.2445528540082093, + "learning_rate": 3.734597156398105e-05, + "loss": 0.411, + "step": 386 + }, + { + "epoch": 0.989769820971867, + "grad_norm": 0.25629958731478186, + "learning_rate": 3.7298578199052135e-05, + "loss": 0.4146, + "step": 387 + }, + { + "epoch": 0.9923273657289002, + "grad_norm": 0.22796776248894252, + "learning_rate": 3.7251184834123226e-05, + "loss": 0.4087, + "step": 388 + }, + { + "epoch": 0.9948849104859335, + "grad_norm": 0.2958838240159185, + "learning_rate": 3.720379146919432e-05, + "loss": 0.4099, + "step": 389 + }, + { + "epoch": 0.9974424552429667, + "grad_norm": 0.29381146676513115, + "learning_rate": 3.71563981042654e-05, + "loss": 0.414, + "step": 390 + }, + { + "epoch": 1.0, + "grad_norm": 0.24652387465895462, + "learning_rate": 3.7109004739336493e-05, + "loss": 0.406, + "step": 391 + }, + { + "epoch": 1.0025575447570332, + "grad_norm": 0.34877694988477503, + "learning_rate": 3.7061611374407585e-05, + "loss": 0.3503, + "step": 392 + }, + { + "epoch": 1.0051150895140666, + "grad_norm": 0.253308743207643, + "learning_rate": 3.7014218009478676e-05, + "loss": 0.3396, + "step": 393 + }, + { + "epoch": 1.0076726342710998, + "grad_norm": 0.25868870232786395, + "learning_rate": 3.696682464454976e-05, + "loss": 0.3479, + "step": 394 + }, + { + "epoch": 1.010230179028133, + "grad_norm": 0.2971613524674976, + "learning_rate": 3.691943127962085e-05, + "loss": 0.3422, + "step": 395 + }, + { + "epoch": 1.0127877237851663, + "grad_norm": 0.2906921720555448, + "learning_rate": 3.687203791469194e-05, + "loss": 0.3554, + "step": 396 + }, + { + "epoch": 1.0153452685421995, + "grad_norm": 0.2875908700888308, + "learning_rate": 3.6824644549763035e-05, + "loss": 0.3291, + "step": 397 + }, + { + "epoch": 1.0179028132992327, + "grad_norm": 0.26243291597976126, + "learning_rate": 3.6777251184834126e-05, + "loss": 0.3563, + "step": 398 + }, + { + "epoch": 1.020460358056266, + "grad_norm": 0.2730126516412927, + "learning_rate": 3.672985781990522e-05, + "loss": 0.3292, + "step": 399 + }, + { + "epoch": 1.0230179028132993, + "grad_norm": 0.29682604006588903, + "learning_rate": 3.668246445497631e-05, + "loss": 0.3461, + "step": 400 + }, + { + "epoch": 1.0255754475703325, + "grad_norm": 0.2494027953748241, + "learning_rate": 3.663507109004739e-05, + "loss": 0.3491, + "step": 401 + }, + { + "epoch": 1.0281329923273657, + "grad_norm": 0.2538094727914758, + "learning_rate": 3.6587677725118485e-05, + "loss": 0.3406, + "step": 402 + }, + { + "epoch": 1.030690537084399, + "grad_norm": 0.28915662612861087, + "learning_rate": 3.6540284360189576e-05, + "loss": 0.3408, + "step": 403 + }, + { + "epoch": 1.0332480818414322, + "grad_norm": 0.24591203347051302, + "learning_rate": 3.649289099526067e-05, + "loss": 0.337, + "step": 404 + }, + { + "epoch": 1.0358056265984654, + "grad_norm": 0.2871114071516867, + "learning_rate": 3.644549763033175e-05, + "loss": 0.3347, + "step": 405 + }, + { + "epoch": 1.0383631713554988, + "grad_norm": 0.2524744240235806, + "learning_rate": 3.639810426540284e-05, + "loss": 0.3441, + "step": 406 + }, + { + "epoch": 1.040920716112532, + "grad_norm": 0.2630583826634349, + "learning_rate": 3.6350710900473935e-05, + "loss": 0.3099, + "step": 407 + }, + { + "epoch": 1.0434782608695652, + "grad_norm": 0.2570358498212408, + "learning_rate": 3.6303317535545026e-05, + "loss": 0.3211, + "step": 408 + }, + { + "epoch": 1.0460358056265984, + "grad_norm": 0.26431307397410003, + "learning_rate": 3.625592417061612e-05, + "loss": 0.35, + "step": 409 + }, + { + "epoch": 1.0485933503836318, + "grad_norm": 0.27463747349361467, + "learning_rate": 3.620853080568721e-05, + "loss": 0.3494, + "step": 410 + }, + { + "epoch": 1.051150895140665, + "grad_norm": 0.24666921280022072, + "learning_rate": 3.61611374407583e-05, + "loss": 0.341, + "step": 411 + }, + { + "epoch": 1.0537084398976981, + "grad_norm": 0.2505495844763562, + "learning_rate": 3.6113744075829384e-05, + "loss": 0.3412, + "step": 412 + }, + { + "epoch": 1.0562659846547315, + "grad_norm": 0.2506374206193608, + "learning_rate": 3.6066350710900476e-05, + "loss": 0.3229, + "step": 413 + }, + { + "epoch": 1.0588235294117647, + "grad_norm": 0.24249666251287566, + "learning_rate": 3.601895734597157e-05, + "loss": 0.3329, + "step": 414 + }, + { + "epoch": 1.061381074168798, + "grad_norm": 0.2618704099040068, + "learning_rate": 3.597156398104266e-05, + "loss": 0.3462, + "step": 415 + }, + { + "epoch": 1.0639386189258313, + "grad_norm": 0.25454325976096887, + "learning_rate": 3.592417061611374e-05, + "loss": 0.319, + "step": 416 + }, + { + "epoch": 1.0664961636828645, + "grad_norm": 0.3012500683553219, + "learning_rate": 3.5876777251184834e-05, + "loss": 0.3452, + "step": 417 + }, + { + "epoch": 1.0690537084398977, + "grad_norm": 0.2310352458746118, + "learning_rate": 3.5829383886255926e-05, + "loss": 0.3203, + "step": 418 + }, + { + "epoch": 1.0716112531969308, + "grad_norm": 0.2867380051579317, + "learning_rate": 3.578199052132701e-05, + "loss": 0.3408, + "step": 419 + }, + { + "epoch": 1.0741687979539642, + "grad_norm": 0.24642924252308632, + "learning_rate": 3.573459715639811e-05, + "loss": 0.3247, + "step": 420 + }, + { + "epoch": 1.0767263427109974, + "grad_norm": 0.22539243089747027, + "learning_rate": 3.56872037914692e-05, + "loss": 0.3282, + "step": 421 + }, + { + "epoch": 1.0792838874680306, + "grad_norm": 0.2508510372019925, + "learning_rate": 3.563981042654029e-05, + "loss": 0.3444, + "step": 422 + }, + { + "epoch": 1.081841432225064, + "grad_norm": 0.25272955853952195, + "learning_rate": 3.5592417061611376e-05, + "loss": 0.3366, + "step": 423 + }, + { + "epoch": 1.0843989769820972, + "grad_norm": 0.2272026636889727, + "learning_rate": 3.554502369668247e-05, + "loss": 0.3516, + "step": 424 + }, + { + "epoch": 1.0869565217391304, + "grad_norm": 0.27462834447503987, + "learning_rate": 3.549763033175356e-05, + "loss": 0.3374, + "step": 425 + }, + { + "epoch": 1.0895140664961638, + "grad_norm": 0.2128026835115876, + "learning_rate": 3.545023696682464e-05, + "loss": 0.3336, + "step": 426 + }, + { + "epoch": 1.092071611253197, + "grad_norm": 0.2354379611105053, + "learning_rate": 3.5402843601895734e-05, + "loss": 0.3379, + "step": 427 + }, + { + "epoch": 1.0946291560102301, + "grad_norm": 0.224481929706568, + "learning_rate": 3.5355450236966825e-05, + "loss": 0.3564, + "step": 428 + }, + { + "epoch": 1.0971867007672633, + "grad_norm": 0.21368714222847607, + "learning_rate": 3.530805687203792e-05, + "loss": 0.3141, + "step": 429 + }, + { + "epoch": 1.0997442455242967, + "grad_norm": 0.23974097995132895, + "learning_rate": 3.5260663507109e-05, + "loss": 0.3313, + "step": 430 + }, + { + "epoch": 1.10230179028133, + "grad_norm": 1.5113616428603722, + "learning_rate": 3.52132701421801e-05, + "loss": 0.3288, + "step": 431 + }, + { + "epoch": 1.104859335038363, + "grad_norm": 0.2584287203231182, + "learning_rate": 3.516587677725119e-05, + "loss": 0.3347, + "step": 432 + }, + { + "epoch": 1.1074168797953965, + "grad_norm": 0.19574817933562375, + "learning_rate": 3.5118483412322275e-05, + "loss": 0.3204, + "step": 433 + }, + { + "epoch": 1.1099744245524297, + "grad_norm": 0.24894576509043242, + "learning_rate": 3.507109004739337e-05, + "loss": 0.3406, + "step": 434 + }, + { + "epoch": 1.1125319693094629, + "grad_norm": 0.22605511670011208, + "learning_rate": 3.502369668246446e-05, + "loss": 0.3372, + "step": 435 + }, + { + "epoch": 1.1150895140664963, + "grad_norm": 0.2426838758794149, + "learning_rate": 3.497630331753555e-05, + "loss": 0.3354, + "step": 436 + }, + { + "epoch": 1.1176470588235294, + "grad_norm": 0.22312946234793202, + "learning_rate": 3.4928909952606634e-05, + "loss": 0.3283, + "step": 437 + }, + { + "epoch": 1.1202046035805626, + "grad_norm": 0.24548486238399964, + "learning_rate": 3.4881516587677725e-05, + "loss": 0.3428, + "step": 438 + }, + { + "epoch": 1.1227621483375958, + "grad_norm": 0.22862518373154317, + "learning_rate": 3.4834123222748817e-05, + "loss": 0.3296, + "step": 439 + }, + { + "epoch": 1.1253196930946292, + "grad_norm": 0.2415393319131855, + "learning_rate": 3.478672985781991e-05, + "loss": 0.3366, + "step": 440 + }, + { + "epoch": 1.1278772378516624, + "grad_norm": 0.24350557581856444, + "learning_rate": 3.473933649289099e-05, + "loss": 0.3331, + "step": 441 + }, + { + "epoch": 1.1304347826086956, + "grad_norm": 0.22906222897576142, + "learning_rate": 3.4691943127962084e-05, + "loss": 0.3383, + "step": 442 + }, + { + "epoch": 1.132992327365729, + "grad_norm": 0.9570276654007269, + "learning_rate": 3.464454976303318e-05, + "loss": 0.3383, + "step": 443 + }, + { + "epoch": 1.1355498721227621, + "grad_norm": 0.4221562491079337, + "learning_rate": 3.4597156398104267e-05, + "loss": 0.3493, + "step": 444 + }, + { + "epoch": 1.1381074168797953, + "grad_norm": 0.7247823264413438, + "learning_rate": 3.454976303317536e-05, + "loss": 0.3503, + "step": 445 + }, + { + "epoch": 1.1406649616368287, + "grad_norm": 0.27292208588198535, + "learning_rate": 3.450236966824645e-05, + "loss": 0.3491, + "step": 446 + }, + { + "epoch": 1.143222506393862, + "grad_norm": 0.24846288711065778, + "learning_rate": 3.445497630331754e-05, + "loss": 0.34, + "step": 447 + }, + { + "epoch": 1.145780051150895, + "grad_norm": 0.28289846837651944, + "learning_rate": 3.4407582938388625e-05, + "loss": 0.3395, + "step": 448 + }, + { + "epoch": 1.1483375959079285, + "grad_norm": 0.20360202393964177, + "learning_rate": 3.4360189573459716e-05, + "loss": 0.336, + "step": 449 + }, + { + "epoch": 1.1508951406649617, + "grad_norm": 0.26795912135731376, + "learning_rate": 3.431279620853081e-05, + "loss": 0.3363, + "step": 450 + }, + { + "epoch": 1.1534526854219949, + "grad_norm": 0.24482207535162454, + "learning_rate": 3.42654028436019e-05, + "loss": 0.3263, + "step": 451 + }, + { + "epoch": 1.156010230179028, + "grad_norm": 1.091037041185637, + "learning_rate": 3.4218009478672984e-05, + "loss": 0.3319, + "step": 452 + }, + { + "epoch": 1.1585677749360614, + "grad_norm": 0.25708621832570655, + "learning_rate": 3.4170616113744075e-05, + "loss": 0.3456, + "step": 453 + }, + { + "epoch": 1.1611253196930946, + "grad_norm": 0.22978489335863728, + "learning_rate": 3.412322274881517e-05, + "loss": 0.3453, + "step": 454 + }, + { + "epoch": 1.1636828644501278, + "grad_norm": 0.23661531026909, + "learning_rate": 3.407582938388626e-05, + "loss": 0.3311, + "step": 455 + }, + { + "epoch": 1.1662404092071612, + "grad_norm": 0.26058801823717986, + "learning_rate": 3.402843601895735e-05, + "loss": 0.3293, + "step": 456 + }, + { + "epoch": 1.1687979539641944, + "grad_norm": 0.2213831357242978, + "learning_rate": 3.398104265402844e-05, + "loss": 0.3448, + "step": 457 + }, + { + "epoch": 1.1713554987212276, + "grad_norm": 0.22314933582706364, + "learning_rate": 3.393364928909953e-05, + "loss": 0.3319, + "step": 458 + }, + { + "epoch": 1.1739130434782608, + "grad_norm": 0.2368130291318867, + "learning_rate": 3.3886255924170616e-05, + "loss": 0.3417, + "step": 459 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 0.23299474747082943, + "learning_rate": 3.383886255924171e-05, + "loss": 0.3434, + "step": 460 + }, + { + "epoch": 1.1790281329923273, + "grad_norm": 0.23122859916384542, + "learning_rate": 3.37914691943128e-05, + "loss": 0.353, + "step": 461 + }, + { + "epoch": 1.1815856777493605, + "grad_norm": 0.23133544704817127, + "learning_rate": 3.3744075829383883e-05, + "loss": 0.3268, + "step": 462 + }, + { + "epoch": 1.184143222506394, + "grad_norm": 0.2279777026926834, + "learning_rate": 3.3696682464454975e-05, + "loss": 0.3427, + "step": 463 + }, + { + "epoch": 1.186700767263427, + "grad_norm": 0.24354305412664243, + "learning_rate": 3.3649289099526066e-05, + "loss": 0.3416, + "step": 464 + }, + { + "epoch": 1.1892583120204603, + "grad_norm": 0.23175483156511392, + "learning_rate": 3.360189573459716e-05, + "loss": 0.3253, + "step": 465 + }, + { + "epoch": 1.1918158567774937, + "grad_norm": 0.2505719500530794, + "learning_rate": 3.355450236966825e-05, + "loss": 0.357, + "step": 466 + }, + { + "epoch": 1.1943734015345269, + "grad_norm": 0.23794330588394164, + "learning_rate": 3.350710900473934e-05, + "loss": 0.3368, + "step": 467 + }, + { + "epoch": 1.19693094629156, + "grad_norm": 0.24430596176344385, + "learning_rate": 3.345971563981043e-05, + "loss": 0.3387, + "step": 468 + }, + { + "epoch": 1.1994884910485935, + "grad_norm": 0.22981260995180924, + "learning_rate": 3.3412322274881516e-05, + "loss": 0.3394, + "step": 469 + }, + { + "epoch": 1.2020460358056266, + "grad_norm": 0.26211223679278534, + "learning_rate": 3.336492890995261e-05, + "loss": 0.3319, + "step": 470 + }, + { + "epoch": 1.2046035805626598, + "grad_norm": 0.20949166867985375, + "learning_rate": 3.33175355450237e-05, + "loss": 0.3247, + "step": 471 + }, + { + "epoch": 1.207161125319693, + "grad_norm": 0.26920054172152863, + "learning_rate": 3.327014218009479e-05, + "loss": 0.3266, + "step": 472 + }, + { + "epoch": 1.2097186700767264, + "grad_norm": 0.23259269182122375, + "learning_rate": 3.3222748815165875e-05, + "loss": 0.336, + "step": 473 + }, + { + "epoch": 1.2122762148337596, + "grad_norm": 0.2544872114348285, + "learning_rate": 3.3175355450236966e-05, + "loss": 0.3288, + "step": 474 + }, + { + "epoch": 1.2148337595907928, + "grad_norm": 0.23096314256849135, + "learning_rate": 3.312796208530806e-05, + "loss": 0.338, + "step": 475 + }, + { + "epoch": 1.2173913043478262, + "grad_norm": 0.2714305850528602, + "learning_rate": 3.308056872037915e-05, + "loss": 0.3447, + "step": 476 + }, + { + "epoch": 1.2199488491048593, + "grad_norm": 0.27398730927997655, + "learning_rate": 3.303317535545024e-05, + "loss": 0.328, + "step": 477 + }, + { + "epoch": 1.2225063938618925, + "grad_norm": 0.21842573163699253, + "learning_rate": 3.298578199052133e-05, + "loss": 0.3434, + "step": 478 + }, + { + "epoch": 1.2250639386189257, + "grad_norm": 0.24231426743387735, + "learning_rate": 3.293838862559242e-05, + "loss": 0.3355, + "step": 479 + }, + { + "epoch": 1.227621483375959, + "grad_norm": 0.23387954201665254, + "learning_rate": 3.289099526066351e-05, + "loss": 0.3456, + "step": 480 + }, + { + "epoch": 1.2301790281329923, + "grad_norm": 0.2240321236806126, + "learning_rate": 3.28436018957346e-05, + "loss": 0.3376, + "step": 481 + }, + { + "epoch": 1.2327365728900257, + "grad_norm": 0.2261690321581938, + "learning_rate": 3.279620853080569e-05, + "loss": 0.3313, + "step": 482 + }, + { + "epoch": 1.2352941176470589, + "grad_norm": 0.27592615919196145, + "learning_rate": 3.274881516587678e-05, + "loss": 0.3385, + "step": 483 + }, + { + "epoch": 1.237851662404092, + "grad_norm": 0.1983777165414548, + "learning_rate": 3.2701421800947866e-05, + "loss": 0.3344, + "step": 484 + }, + { + "epoch": 1.2404092071611252, + "grad_norm": 0.25775855180422713, + "learning_rate": 3.265402843601896e-05, + "loss": 0.349, + "step": 485 + }, + { + "epoch": 1.2429667519181586, + "grad_norm": 0.21057070064196648, + "learning_rate": 3.260663507109005e-05, + "loss": 0.3226, + "step": 486 + }, + { + "epoch": 1.2455242966751918, + "grad_norm": 0.25264888053163403, + "learning_rate": 3.255924170616114e-05, + "loss": 0.3405, + "step": 487 + }, + { + "epoch": 1.248081841432225, + "grad_norm": 0.20358857621290893, + "learning_rate": 3.251184834123223e-05, + "loss": 0.3384, + "step": 488 + }, + { + "epoch": 1.2506393861892584, + "grad_norm": 0.2221188350040554, + "learning_rate": 3.246445497630332e-05, + "loss": 0.3592, + "step": 489 + }, + { + "epoch": 1.2531969309462916, + "grad_norm": 0.22907812456671411, + "learning_rate": 3.2417061611374414e-05, + "loss": 0.3509, + "step": 490 + }, + { + "epoch": 1.2557544757033248, + "grad_norm": 0.2203229089376764, + "learning_rate": 3.23696682464455e-05, + "loss": 0.3157, + "step": 491 + }, + { + "epoch": 1.258312020460358, + "grad_norm": 0.22923138047926875, + "learning_rate": 3.232227488151659e-05, + "loss": 0.3321, + "step": 492 + }, + { + "epoch": 1.2608695652173914, + "grad_norm": 0.20989998077940591, + "learning_rate": 3.227488151658768e-05, + "loss": 0.3293, + "step": 493 + }, + { + "epoch": 1.2634271099744245, + "grad_norm": 0.21568832380392375, + "learning_rate": 3.222748815165877e-05, + "loss": 0.3466, + "step": 494 + }, + { + "epoch": 1.265984654731458, + "grad_norm": 0.22068990151180867, + "learning_rate": 3.218009478672986e-05, + "loss": 0.3346, + "step": 495 + }, + { + "epoch": 1.2685421994884911, + "grad_norm": 0.22072570403379316, + "learning_rate": 3.213270142180095e-05, + "loss": 0.3415, + "step": 496 + }, + { + "epoch": 1.2710997442455243, + "grad_norm": 0.22130125503638862, + "learning_rate": 3.208530805687204e-05, + "loss": 0.3237, + "step": 497 + }, + { + "epoch": 1.2736572890025575, + "grad_norm": 0.21888368739994798, + "learning_rate": 3.2037914691943124e-05, + "loss": 0.3232, + "step": 498 + }, + { + "epoch": 1.2762148337595907, + "grad_norm": 0.23237851118888075, + "learning_rate": 3.1990521327014215e-05, + "loss": 0.3362, + "step": 499 + }, + { + "epoch": 1.278772378516624, + "grad_norm": 0.20408840058481117, + "learning_rate": 3.1943127962085314e-05, + "loss": 0.331, + "step": 500 + }, + { + "epoch": 1.2813299232736572, + "grad_norm": 0.25671575639210675, + "learning_rate": 3.18957345971564e-05, + "loss": 0.3483, + "step": 501 + }, + { + "epoch": 1.2838874680306906, + "grad_norm": 0.21146534380332607, + "learning_rate": 3.184834123222749e-05, + "loss": 0.3179, + "step": 502 + }, + { + "epoch": 1.2864450127877238, + "grad_norm": 0.22647977672526137, + "learning_rate": 3.180094786729858e-05, + "loss": 0.3371, + "step": 503 + }, + { + "epoch": 1.289002557544757, + "grad_norm": 0.2024444911259877, + "learning_rate": 3.175355450236967e-05, + "loss": 0.3208, + "step": 504 + }, + { + "epoch": 1.2915601023017902, + "grad_norm": 0.23292871699588752, + "learning_rate": 3.170616113744076e-05, + "loss": 0.3325, + "step": 505 + }, + { + "epoch": 1.2941176470588236, + "grad_norm": 0.24617143877927294, + "learning_rate": 3.165876777251185e-05, + "loss": 0.3456, + "step": 506 + }, + { + "epoch": 1.2966751918158568, + "grad_norm": 0.21521749139279983, + "learning_rate": 3.161137440758294e-05, + "loss": 0.3632, + "step": 507 + }, + { + "epoch": 1.29923273657289, + "grad_norm": 0.23710165276474784, + "learning_rate": 3.156398104265403e-05, + "loss": 0.3258, + "step": 508 + }, + { + "epoch": 1.3017902813299234, + "grad_norm": 0.2182566275526034, + "learning_rate": 3.1516587677725115e-05, + "loss": 0.3254, + "step": 509 + }, + { + "epoch": 1.3043478260869565, + "grad_norm": 0.2567309562573808, + "learning_rate": 3.1469194312796207e-05, + "loss": 0.3393, + "step": 510 + }, + { + "epoch": 1.3069053708439897, + "grad_norm": 0.21736576266820642, + "learning_rate": 3.1421800947867305e-05, + "loss": 0.3237, + "step": 511 + }, + { + "epoch": 1.309462915601023, + "grad_norm": 0.2476914139591077, + "learning_rate": 3.137440758293839e-05, + "loss": 0.3387, + "step": 512 + }, + { + "epoch": 1.3120204603580563, + "grad_norm": 0.21116880886723996, + "learning_rate": 3.132701421800948e-05, + "loss": 0.3342, + "step": 513 + }, + { + "epoch": 1.3145780051150895, + "grad_norm": 0.2282427820832504, + "learning_rate": 3.127962085308057e-05, + "loss": 0.3406, + "step": 514 + }, + { + "epoch": 1.317135549872123, + "grad_norm": 0.220656045586937, + "learning_rate": 3.123222748815166e-05, + "loss": 0.3481, + "step": 515 + }, + { + "epoch": 1.319693094629156, + "grad_norm": 0.21477244949218188, + "learning_rate": 3.118483412322275e-05, + "loss": 0.3417, + "step": 516 + }, + { + "epoch": 1.3222506393861893, + "grad_norm": 0.2123179538890313, + "learning_rate": 3.113744075829384e-05, + "loss": 0.3374, + "step": 517 + }, + { + "epoch": 1.3248081841432224, + "grad_norm": 0.20966406562861323, + "learning_rate": 3.109004739336493e-05, + "loss": 0.332, + "step": 518 + }, + { + "epoch": 1.3273657289002558, + "grad_norm": 0.1967874776267105, + "learning_rate": 3.104265402843602e-05, + "loss": 0.327, + "step": 519 + }, + { + "epoch": 1.329923273657289, + "grad_norm": 0.21447737012880227, + "learning_rate": 3.0995260663507106e-05, + "loss": 0.3342, + "step": 520 + }, + { + "epoch": 1.3324808184143222, + "grad_norm": 0.22702076072063435, + "learning_rate": 3.09478672985782e-05, + "loss": 0.357, + "step": 521 + }, + { + "epoch": 1.3350383631713556, + "grad_norm": 0.24746439681290008, + "learning_rate": 3.0900473933649296e-05, + "loss": 0.3489, + "step": 522 + }, + { + "epoch": 1.3375959079283888, + "grad_norm": 0.21236354577498476, + "learning_rate": 3.085308056872038e-05, + "loss": 0.309, + "step": 523 + }, + { + "epoch": 1.340153452685422, + "grad_norm": 0.21060912049882632, + "learning_rate": 3.080568720379147e-05, + "loss": 0.3191, + "step": 524 + }, + { + "epoch": 1.3427109974424551, + "grad_norm": 0.20505413275714032, + "learning_rate": 3.075829383886256e-05, + "loss": 0.3451, + "step": 525 + }, + { + "epoch": 1.3452685421994885, + "grad_norm": 0.24615234141005185, + "learning_rate": 3.0710900473933654e-05, + "loss": 0.3528, + "step": 526 + }, + { + "epoch": 1.3478260869565217, + "grad_norm": 0.22369032901485378, + "learning_rate": 3.066350710900474e-05, + "loss": 0.3284, + "step": 527 + }, + { + "epoch": 1.350383631713555, + "grad_norm": 0.22838183924629246, + "learning_rate": 3.061611374407583e-05, + "loss": 0.3374, + "step": 528 + }, + { + "epoch": 1.3529411764705883, + "grad_norm": 0.27105416556647893, + "learning_rate": 3.056872037914692e-05, + "loss": 0.3464, + "step": 529 + }, + { + "epoch": 1.3554987212276215, + "grad_norm": 0.20957984851168898, + "learning_rate": 3.052132701421801e-05, + "loss": 0.3284, + "step": 530 + }, + { + "epoch": 1.3580562659846547, + "grad_norm": 0.2320992461077895, + "learning_rate": 3.0473933649289098e-05, + "loss": 0.3486, + "step": 531 + }, + { + "epoch": 1.3606138107416879, + "grad_norm": 0.21826085112206514, + "learning_rate": 3.042654028436019e-05, + "loss": 0.3385, + "step": 532 + }, + { + "epoch": 1.3631713554987213, + "grad_norm": 0.2098867818685027, + "learning_rate": 3.0379146919431277e-05, + "loss": 0.3281, + "step": 533 + }, + { + "epoch": 1.3657289002557544, + "grad_norm": 0.20672602706389986, + "learning_rate": 3.0331753554502375e-05, + "loss": 0.3347, + "step": 534 + }, + { + "epoch": 1.3682864450127878, + "grad_norm": 0.19992143809636548, + "learning_rate": 3.0284360189573463e-05, + "loss": 0.3359, + "step": 535 + }, + { + "epoch": 1.370843989769821, + "grad_norm": 0.20352905078357075, + "learning_rate": 3.023696682464455e-05, + "loss": 0.3378, + "step": 536 + }, + { + "epoch": 1.3734015345268542, + "grad_norm": 0.20917931960339106, + "learning_rate": 3.0189573459715642e-05, + "loss": 0.3477, + "step": 537 + }, + { + "epoch": 1.3759590792838874, + "grad_norm": 0.19805266052365922, + "learning_rate": 3.014218009478673e-05, + "loss": 0.3268, + "step": 538 + }, + { + "epoch": 1.3785166240409208, + "grad_norm": 0.19502494739762932, + "learning_rate": 3.009478672985782e-05, + "loss": 0.3493, + "step": 539 + }, + { + "epoch": 1.381074168797954, + "grad_norm": 0.19983564710769386, + "learning_rate": 3.004739336492891e-05, + "loss": 0.3428, + "step": 540 + }, + { + "epoch": 1.3836317135549872, + "grad_norm": 0.22658682357477436, + "learning_rate": 3e-05, + "loss": 0.3378, + "step": 541 + }, + { + "epoch": 1.3861892583120206, + "grad_norm": 0.23466719921978055, + "learning_rate": 2.995260663507109e-05, + "loss": 0.3353, + "step": 542 + }, + { + "epoch": 1.3887468030690537, + "grad_norm": 0.21015550969200184, + "learning_rate": 2.990521327014218e-05, + "loss": 0.35, + "step": 543 + }, + { + "epoch": 1.391304347826087, + "grad_norm": 0.2614967884472048, + "learning_rate": 2.9857819905213268e-05, + "loss": 0.3458, + "step": 544 + }, + { + "epoch": 1.39386189258312, + "grad_norm": 0.20442313390946987, + "learning_rate": 2.9810426540284363e-05, + "loss": 0.3197, + "step": 545 + }, + { + "epoch": 1.3964194373401535, + "grad_norm": 0.22800479961193035, + "learning_rate": 2.9763033175355454e-05, + "loss": 0.3216, + "step": 546 + }, + { + "epoch": 1.3989769820971867, + "grad_norm": 0.23631248009519853, + "learning_rate": 2.9715639810426542e-05, + "loss": 0.3475, + "step": 547 + }, + { + "epoch": 1.40153452685422, + "grad_norm": 0.2148560333286399, + "learning_rate": 2.9668246445497633e-05, + "loss": 0.3321, + "step": 548 + }, + { + "epoch": 1.4040920716112533, + "grad_norm": 0.22336842918171954, + "learning_rate": 2.962085308056872e-05, + "loss": 0.3488, + "step": 549 + }, + { + "epoch": 1.4066496163682864, + "grad_norm": 0.21805777627104153, + "learning_rate": 2.9573459715639813e-05, + "loss": 0.3377, + "step": 550 + }, + { + "epoch": 1.4092071611253196, + "grad_norm": 0.197363455632153, + "learning_rate": 2.95260663507109e-05, + "loss": 0.3395, + "step": 551 + }, + { + "epoch": 1.4117647058823528, + "grad_norm": 0.273730892459262, + "learning_rate": 2.9478672985781992e-05, + "loss": 0.3637, + "step": 552 + }, + { + "epoch": 1.4143222506393862, + "grad_norm": 0.21968147378847208, + "learning_rate": 2.943127962085308e-05, + "loss": 0.3458, + "step": 553 + }, + { + "epoch": 1.4168797953964194, + "grad_norm": 0.22407752561098943, + "learning_rate": 2.938388625592417e-05, + "loss": 0.3341, + "step": 554 + }, + { + "epoch": 1.4194373401534528, + "grad_norm": 0.24706387013454778, + "learning_rate": 2.933649289099526e-05, + "loss": 0.3393, + "step": 555 + }, + { + "epoch": 1.421994884910486, + "grad_norm": 0.23943822236699114, + "learning_rate": 2.9289099526066354e-05, + "loss": 0.3476, + "step": 556 + }, + { + "epoch": 1.4245524296675192, + "grad_norm": 0.21133262016275636, + "learning_rate": 2.9241706161137445e-05, + "loss": 0.3317, + "step": 557 + }, + { + "epoch": 1.4271099744245523, + "grad_norm": 0.2187205757977556, + "learning_rate": 2.9194312796208533e-05, + "loss": 0.3344, + "step": 558 + }, + { + "epoch": 1.4296675191815857, + "grad_norm": 0.2278521035319285, + "learning_rate": 2.9146919431279624e-05, + "loss": 0.3391, + "step": 559 + }, + { + "epoch": 1.432225063938619, + "grad_norm": 0.22984861625880482, + "learning_rate": 2.9099526066350712e-05, + "loss": 0.3289, + "step": 560 + }, + { + "epoch": 1.434782608695652, + "grad_norm": 0.2126484088527432, + "learning_rate": 2.9052132701421804e-05, + "loss": 0.3216, + "step": 561 + }, + { + "epoch": 1.4373401534526855, + "grad_norm": 0.21963164389365947, + "learning_rate": 2.9004739336492892e-05, + "loss": 0.3366, + "step": 562 + }, + { + "epoch": 1.4398976982097187, + "grad_norm": 0.2271030491918638, + "learning_rate": 2.8957345971563983e-05, + "loss": 0.3287, + "step": 563 + }, + { + "epoch": 1.4424552429667519, + "grad_norm": 0.22596502012606307, + "learning_rate": 2.890995260663507e-05, + "loss": 0.3445, + "step": 564 + }, + { + "epoch": 1.445012787723785, + "grad_norm": 0.2092819883191256, + "learning_rate": 2.8862559241706162e-05, + "loss": 0.3188, + "step": 565 + }, + { + "epoch": 1.4475703324808185, + "grad_norm": 0.2085679869485133, + "learning_rate": 2.881516587677725e-05, + "loss": 0.3297, + "step": 566 + }, + { + "epoch": 1.4501278772378516, + "grad_norm": 0.20731318095051873, + "learning_rate": 2.8767772511848338e-05, + "loss": 0.3254, + "step": 567 + }, + { + "epoch": 1.452685421994885, + "grad_norm": 0.22614524862117436, + "learning_rate": 2.8720379146919436e-05, + "loss": 0.3389, + "step": 568 + }, + { + "epoch": 1.4552429667519182, + "grad_norm": 0.22116772067000307, + "learning_rate": 2.8672985781990524e-05, + "loss": 0.3372, + "step": 569 + }, + { + "epoch": 1.4578005115089514, + "grad_norm": 0.2081439129486148, + "learning_rate": 2.8625592417061616e-05, + "loss": 0.3185, + "step": 570 + }, + { + "epoch": 1.4603580562659846, + "grad_norm": 0.2101126677570276, + "learning_rate": 2.8578199052132704e-05, + "loss": 0.3386, + "step": 571 + }, + { + "epoch": 1.4629156010230178, + "grad_norm": 0.20857004085030162, + "learning_rate": 2.853080568720379e-05, + "loss": 0.3317, + "step": 572 + }, + { + "epoch": 1.4654731457800512, + "grad_norm": 0.21972466939910235, + "learning_rate": 2.8483412322274883e-05, + "loss": 0.3421, + "step": 573 + }, + { + "epoch": 1.4680306905370843, + "grad_norm": 0.22670934909893178, + "learning_rate": 2.843601895734597e-05, + "loss": 0.3373, + "step": 574 + }, + { + "epoch": 1.4705882352941178, + "grad_norm": 0.20335752165987916, + "learning_rate": 2.8388625592417062e-05, + "loss": 0.3489, + "step": 575 + }, + { + "epoch": 1.473145780051151, + "grad_norm": 0.21670951576300224, + "learning_rate": 2.834123222748815e-05, + "loss": 0.3436, + "step": 576 + }, + { + "epoch": 1.4757033248081841, + "grad_norm": 0.24188198119161047, + "learning_rate": 2.829383886255924e-05, + "loss": 0.346, + "step": 577 + }, + { + "epoch": 1.4782608695652173, + "grad_norm": 0.19284248575531912, + "learning_rate": 2.824644549763033e-05, + "loss": 0.3412, + "step": 578 + }, + { + "epoch": 1.4808184143222507, + "grad_norm": 0.21408001651811503, + "learning_rate": 2.8199052132701424e-05, + "loss": 0.3359, + "step": 579 + }, + { + "epoch": 1.4833759590792839, + "grad_norm": 0.23043383318843624, + "learning_rate": 2.8151658767772515e-05, + "loss": 0.3352, + "step": 580 + }, + { + "epoch": 1.485933503836317, + "grad_norm": 0.19637762705882086, + "learning_rate": 2.8104265402843603e-05, + "loss": 0.3335, + "step": 581 + }, + { + "epoch": 1.4884910485933505, + "grad_norm": 0.21814477890754627, + "learning_rate": 2.8056872037914695e-05, + "loss": 0.3219, + "step": 582 + }, + { + "epoch": 1.4910485933503836, + "grad_norm": 0.24377115034783173, + "learning_rate": 2.8009478672985783e-05, + "loss": 0.349, + "step": 583 + }, + { + "epoch": 1.4936061381074168, + "grad_norm": 0.20020466798938577, + "learning_rate": 2.7962085308056874e-05, + "loss": 0.3364, + "step": 584 + }, + { + "epoch": 1.49616368286445, + "grad_norm": 0.2363047057326481, + "learning_rate": 2.7914691943127962e-05, + "loss": 0.3565, + "step": 585 + }, + { + "epoch": 1.4987212276214834, + "grad_norm": 0.21956341661227455, + "learning_rate": 2.7867298578199053e-05, + "loss": 0.3377, + "step": 586 + }, + { + "epoch": 1.5012787723785166, + "grad_norm": 0.2267586568563103, + "learning_rate": 2.781990521327014e-05, + "loss": 0.328, + "step": 587 + }, + { + "epoch": 1.50383631713555, + "grad_norm": 0.2047581074184725, + "learning_rate": 2.7772511848341233e-05, + "loss": 0.3468, + "step": 588 + }, + { + "epoch": 1.5063938618925832, + "grad_norm": 0.24688978065050932, + "learning_rate": 2.772511848341232e-05, + "loss": 0.3279, + "step": 589 + }, + { + "epoch": 1.5089514066496164, + "grad_norm": 0.21201942656506023, + "learning_rate": 2.7677725118483415e-05, + "loss": 0.3435, + "step": 590 + }, + { + "epoch": 1.5115089514066495, + "grad_norm": 0.21407415709345273, + "learning_rate": 2.7630331753554507e-05, + "loss": 0.3338, + "step": 591 + }, + { + "epoch": 1.5140664961636827, + "grad_norm": 0.24244329625085864, + "learning_rate": 2.7582938388625595e-05, + "loss": 0.3353, + "step": 592 + }, + { + "epoch": 1.5166240409207161, + "grad_norm": 0.21106578023597755, + "learning_rate": 2.7535545023696686e-05, + "loss": 0.3185, + "step": 593 + }, + { + "epoch": 1.5191815856777495, + "grad_norm": 0.22046969326673913, + "learning_rate": 2.7488151658767774e-05, + "loss": 0.3405, + "step": 594 + }, + { + "epoch": 1.5217391304347827, + "grad_norm": 0.22082584514229614, + "learning_rate": 2.7440758293838865e-05, + "loss": 0.3374, + "step": 595 + }, + { + "epoch": 1.5242966751918159, + "grad_norm": 0.2214039800077301, + "learning_rate": 2.7393364928909953e-05, + "loss": 0.3408, + "step": 596 + }, + { + "epoch": 1.526854219948849, + "grad_norm": 0.21162564133453074, + "learning_rate": 2.7345971563981044e-05, + "loss": 0.3223, + "step": 597 + }, + { + "epoch": 1.5294117647058822, + "grad_norm": 0.21038119410478973, + "learning_rate": 2.7298578199052132e-05, + "loss": 0.3232, + "step": 598 + }, + { + "epoch": 1.5319693094629157, + "grad_norm": 0.2232877311097297, + "learning_rate": 2.7251184834123224e-05, + "loss": 0.3569, + "step": 599 + }, + { + "epoch": 1.5345268542199488, + "grad_norm": 0.21018531562144588, + "learning_rate": 2.720379146919431e-05, + "loss": 0.3572, + "step": 600 + }, + { + "epoch": 1.5370843989769822, + "grad_norm": 0.18432057304329444, + "learning_rate": 2.7156398104265403e-05, + "loss": 0.3239, + "step": 601 + }, + { + "epoch": 1.5396419437340154, + "grad_norm": 0.23256686957170164, + "learning_rate": 2.7109004739336498e-05, + "loss": 0.327, + "step": 602 + }, + { + "epoch": 1.5421994884910486, + "grad_norm": 0.2168659241371808, + "learning_rate": 2.7061611374407586e-05, + "loss": 0.3345, + "step": 603 + }, + { + "epoch": 1.5447570332480818, + "grad_norm": 0.2066176620461704, + "learning_rate": 2.7014218009478677e-05, + "loss": 0.3263, + "step": 604 + }, + { + "epoch": 1.547314578005115, + "grad_norm": 0.2510122104603682, + "learning_rate": 2.6966824644549765e-05, + "loss": 0.3383, + "step": 605 + }, + { + "epoch": 1.5498721227621484, + "grad_norm": 0.21620946293671467, + "learning_rate": 2.6919431279620856e-05, + "loss": 0.3421, + "step": 606 + }, + { + "epoch": 1.5524296675191815, + "grad_norm": 0.2374053609246905, + "learning_rate": 2.6872037914691944e-05, + "loss": 0.3395, + "step": 607 + }, + { + "epoch": 1.554987212276215, + "grad_norm": 0.23310585207272871, + "learning_rate": 2.6824644549763032e-05, + "loss": 0.3328, + "step": 608 + }, + { + "epoch": 1.5575447570332481, + "grad_norm": 0.21706136950371865, + "learning_rate": 2.6777251184834124e-05, + "loss": 0.3291, + "step": 609 + }, + { + "epoch": 1.5601023017902813, + "grad_norm": 0.2557349212164624, + "learning_rate": 2.672985781990521e-05, + "loss": 0.3329, + "step": 610 + }, + { + "epoch": 1.5626598465473145, + "grad_norm": 0.21369332563945545, + "learning_rate": 2.6682464454976303e-05, + "loss": 0.3403, + "step": 611 + }, + { + "epoch": 1.5652173913043477, + "grad_norm": 0.22249413300101917, + "learning_rate": 2.663507109004739e-05, + "loss": 0.3295, + "step": 612 + }, + { + "epoch": 1.567774936061381, + "grad_norm": 0.2352350138406436, + "learning_rate": 2.658767772511849e-05, + "loss": 0.3491, + "step": 613 + }, + { + "epoch": 1.5703324808184145, + "grad_norm": 0.21444744975093857, + "learning_rate": 2.6540284360189577e-05, + "loss": 0.3515, + "step": 614 + }, + { + "epoch": 1.5728900255754477, + "grad_norm": 0.20106535936796008, + "learning_rate": 2.6492890995260665e-05, + "loss": 0.3415, + "step": 615 + }, + { + "epoch": 1.5754475703324808, + "grad_norm": 0.24355009019358553, + "learning_rate": 2.6445497630331756e-05, + "loss": 0.3382, + "step": 616 + }, + { + "epoch": 1.578005115089514, + "grad_norm": 0.19632069646990316, + "learning_rate": 2.6398104265402844e-05, + "loss": 0.3344, + "step": 617 + }, + { + "epoch": 1.5805626598465472, + "grad_norm": 0.2041069390497847, + "learning_rate": 2.6350710900473935e-05, + "loss": 0.3314, + "step": 618 + }, + { + "epoch": 1.5831202046035806, + "grad_norm": 0.23307172936850007, + "learning_rate": 2.6303317535545023e-05, + "loss": 0.3279, + "step": 619 + }, + { + "epoch": 1.5856777493606138, + "grad_norm": 0.22016471550055694, + "learning_rate": 2.6255924170616115e-05, + "loss": 0.3238, + "step": 620 + }, + { + "epoch": 1.5882352941176472, + "grad_norm": 0.21161098152167546, + "learning_rate": 2.6208530805687203e-05, + "loss": 0.3382, + "step": 621 + }, + { + "epoch": 1.5907928388746804, + "grad_norm": 0.23095386869319134, + "learning_rate": 2.6161137440758294e-05, + "loss": 0.3316, + "step": 622 + }, + { + "epoch": 1.5933503836317136, + "grad_norm": 0.2258130781665819, + "learning_rate": 2.6113744075829382e-05, + "loss": 0.3184, + "step": 623 + }, + { + "epoch": 1.5959079283887467, + "grad_norm": 0.2101743033652242, + "learning_rate": 2.6066350710900477e-05, + "loss": 0.3525, + "step": 624 + }, + { + "epoch": 1.59846547314578, + "grad_norm": 0.23556388836544728, + "learning_rate": 2.6018957345971568e-05, + "loss": 0.3596, + "step": 625 + }, + { + "epoch": 1.6010230179028133, + "grad_norm": 0.2173806495933601, + "learning_rate": 2.5971563981042656e-05, + "loss": 0.3367, + "step": 626 + }, + { + "epoch": 1.6035805626598465, + "grad_norm": 0.21332385463283657, + "learning_rate": 2.5924170616113747e-05, + "loss": 0.3371, + "step": 627 + }, + { + "epoch": 1.60613810741688, + "grad_norm": 0.20121738409593162, + "learning_rate": 2.5876777251184835e-05, + "loss": 0.3308, + "step": 628 + }, + { + "epoch": 1.608695652173913, + "grad_norm": 0.22736961793911684, + "learning_rate": 2.5829383886255927e-05, + "loss": 0.3453, + "step": 629 + }, + { + "epoch": 1.6112531969309463, + "grad_norm": 0.19872074468079123, + "learning_rate": 2.5781990521327014e-05, + "loss": 0.3278, + "step": 630 + }, + { + "epoch": 1.6138107416879794, + "grad_norm": 0.2314685609756946, + "learning_rate": 2.5734597156398106e-05, + "loss": 0.3466, + "step": 631 + }, + { + "epoch": 1.6163682864450126, + "grad_norm": 0.21359598281755646, + "learning_rate": 2.5687203791469194e-05, + "loss": 0.3568, + "step": 632 + }, + { + "epoch": 1.618925831202046, + "grad_norm": 0.2410455323018816, + "learning_rate": 2.5639810426540285e-05, + "loss": 0.3212, + "step": 633 + }, + { + "epoch": 1.6214833759590794, + "grad_norm": 0.253509763295898, + "learning_rate": 2.5592417061611373e-05, + "loss": 0.3589, + "step": 634 + }, + { + "epoch": 1.6240409207161126, + "grad_norm": 0.22712797799055953, + "learning_rate": 2.5545023696682464e-05, + "loss": 0.3349, + "step": 635 + }, + { + "epoch": 1.6265984654731458, + "grad_norm": 0.22386259809972237, + "learning_rate": 2.549763033175356e-05, + "loss": 0.3261, + "step": 636 + }, + { + "epoch": 1.629156010230179, + "grad_norm": 0.2605466792154154, + "learning_rate": 2.5450236966824647e-05, + "loss": 0.3435, + "step": 637 + }, + { + "epoch": 1.6317135549872122, + "grad_norm": 0.20761172721493334, + "learning_rate": 2.540284360189574e-05, + "loss": 0.3251, + "step": 638 + }, + { + "epoch": 1.6342710997442456, + "grad_norm": 0.24685722210051553, + "learning_rate": 2.5355450236966826e-05, + "loss": 0.3235, + "step": 639 + }, + { + "epoch": 1.6368286445012787, + "grad_norm": 0.21434302571838307, + "learning_rate": 2.5308056872037918e-05, + "loss": 0.3207, + "step": 640 + }, + { + "epoch": 1.6393861892583121, + "grad_norm": 0.21184549514913412, + "learning_rate": 2.5260663507109006e-05, + "loss": 0.3238, + "step": 641 + }, + { + "epoch": 1.6419437340153453, + "grad_norm": 0.21252363567202226, + "learning_rate": 2.5213270142180097e-05, + "loss": 0.323, + "step": 642 + }, + { + "epoch": 1.6445012787723785, + "grad_norm": 0.21829115176694264, + "learning_rate": 2.5165876777251185e-05, + "loss": 0.3321, + "step": 643 + }, + { + "epoch": 1.6470588235294117, + "grad_norm": 0.20922978833957703, + "learning_rate": 2.5118483412322273e-05, + "loss": 0.326, + "step": 644 + }, + { + "epoch": 1.6496163682864449, + "grad_norm": 0.2091095674028597, + "learning_rate": 2.5071090047393364e-05, + "loss": 0.3076, + "step": 645 + }, + { + "epoch": 1.6521739130434783, + "grad_norm": 0.20714939228335966, + "learning_rate": 2.5023696682464452e-05, + "loss": 0.3304, + "step": 646 + }, + { + "epoch": 1.6547314578005117, + "grad_norm": 0.19116018976328764, + "learning_rate": 2.4976303317535547e-05, + "loss": 0.3274, + "step": 647 + }, + { + "epoch": 1.6572890025575449, + "grad_norm": 0.19243515464801864, + "learning_rate": 2.4928909952606635e-05, + "loss": 0.3397, + "step": 648 + }, + { + "epoch": 1.659846547314578, + "grad_norm": 0.22171437826424312, + "learning_rate": 2.4881516587677726e-05, + "loss": 0.3404, + "step": 649 + }, + { + "epoch": 1.6624040920716112, + "grad_norm": 0.18811862144302852, + "learning_rate": 2.4834123222748817e-05, + "loss": 0.3294, + "step": 650 + }, + { + "epoch": 1.6649616368286444, + "grad_norm": 0.20726034225043538, + "learning_rate": 2.4786729857819905e-05, + "loss": 0.3376, + "step": 651 + }, + { + "epoch": 1.6675191815856778, + "grad_norm": 0.22641125026229106, + "learning_rate": 2.4739336492890997e-05, + "loss": 0.3315, + "step": 652 + }, + { + "epoch": 1.670076726342711, + "grad_norm": 0.19760668759690572, + "learning_rate": 2.4691943127962085e-05, + "loss": 0.3484, + "step": 653 + }, + { + "epoch": 1.6726342710997444, + "grad_norm": 0.2036460572936716, + "learning_rate": 2.4644549763033176e-05, + "loss": 0.3405, + "step": 654 + }, + { + "epoch": 1.6751918158567776, + "grad_norm": 0.19580889345429936, + "learning_rate": 2.4597156398104264e-05, + "loss": 0.3311, + "step": 655 + }, + { + "epoch": 1.6777493606138107, + "grad_norm": 0.20331485010582212, + "learning_rate": 2.454976303317536e-05, + "loss": 0.3319, + "step": 656 + }, + { + "epoch": 1.680306905370844, + "grad_norm": 0.2003381154185122, + "learning_rate": 2.4502369668246447e-05, + "loss": 0.3338, + "step": 657 + }, + { + "epoch": 1.682864450127877, + "grad_norm": 0.22901909585607055, + "learning_rate": 2.4454976303317538e-05, + "loss": 0.3439, + "step": 658 + }, + { + "epoch": 1.6854219948849105, + "grad_norm": 0.2072701167914152, + "learning_rate": 2.4407582938388626e-05, + "loss": 0.3299, + "step": 659 + }, + { + "epoch": 1.6879795396419437, + "grad_norm": 0.2156044161532469, + "learning_rate": 2.4360189573459717e-05, + "loss": 0.3356, + "step": 660 + }, + { + "epoch": 1.690537084398977, + "grad_norm": 0.22960331769603365, + "learning_rate": 2.431279620853081e-05, + "loss": 0.3211, + "step": 661 + }, + { + "epoch": 1.6930946291560103, + "grad_norm": 0.184836419291593, + "learning_rate": 2.4265402843601897e-05, + "loss": 0.3134, + "step": 662 + }, + { + "epoch": 1.6956521739130435, + "grad_norm": 0.22152975307273395, + "learning_rate": 2.4218009478672988e-05, + "loss": 0.3556, + "step": 663 + }, + { + "epoch": 1.6982097186700766, + "grad_norm": 0.27533636995577504, + "learning_rate": 2.4170616113744076e-05, + "loss": 0.333, + "step": 664 + }, + { + "epoch": 1.7007672634271098, + "grad_norm": 0.20239642573133182, + "learning_rate": 2.4123222748815167e-05, + "loss": 0.3244, + "step": 665 + }, + { + "epoch": 1.7033248081841432, + "grad_norm": 0.19215048920041694, + "learning_rate": 2.4075829383886255e-05, + "loss": 0.3261, + "step": 666 + }, + { + "epoch": 1.7058823529411766, + "grad_norm": 0.21226322101300024, + "learning_rate": 2.402843601895735e-05, + "loss": 0.3357, + "step": 667 + }, + { + "epoch": 1.7084398976982098, + "grad_norm": 0.22539028864743468, + "learning_rate": 2.3981042654028438e-05, + "loss": 0.3472, + "step": 668 + }, + { + "epoch": 1.710997442455243, + "grad_norm": 0.23393010371109055, + "learning_rate": 2.3933649289099526e-05, + "loss": 0.3325, + "step": 669 + }, + { + "epoch": 1.7135549872122762, + "grad_norm": 0.1735369909355323, + "learning_rate": 2.3886255924170617e-05, + "loss": 0.3158, + "step": 670 + }, + { + "epoch": 1.7161125319693094, + "grad_norm": 0.21921508136082404, + "learning_rate": 2.3838862559241705e-05, + "loss": 0.35, + "step": 671 + }, + { + "epoch": 1.7186700767263428, + "grad_norm": 0.21982061308675563, + "learning_rate": 2.3791469194312796e-05, + "loss": 0.3479, + "step": 672 + }, + { + "epoch": 1.721227621483376, + "grad_norm": 0.2169093947973993, + "learning_rate": 2.3744075829383888e-05, + "loss": 0.3318, + "step": 673 + }, + { + "epoch": 1.7237851662404093, + "grad_norm": 0.20360889372476712, + "learning_rate": 2.369668246445498e-05, + "loss": 0.334, + "step": 674 + }, + { + "epoch": 1.7263427109974425, + "grad_norm": 0.2174062686096523, + "learning_rate": 2.3649289099526067e-05, + "loss": 0.3385, + "step": 675 + }, + { + "epoch": 1.7289002557544757, + "grad_norm": 0.20684367968994177, + "learning_rate": 2.360189573459716e-05, + "loss": 0.3375, + "step": 676 + }, + { + "epoch": 1.7314578005115089, + "grad_norm": 0.19965154462316637, + "learning_rate": 2.3554502369668246e-05, + "loss": 0.3253, + "step": 677 + }, + { + "epoch": 1.734015345268542, + "grad_norm": 0.21474011017766587, + "learning_rate": 2.3507109004739338e-05, + "loss": 0.3382, + "step": 678 + }, + { + "epoch": 1.7365728900255755, + "grad_norm": 0.22746922194428235, + "learning_rate": 2.345971563981043e-05, + "loss": 0.324, + "step": 679 + }, + { + "epoch": 1.7391304347826086, + "grad_norm": 0.2104457674935215, + "learning_rate": 2.3412322274881517e-05, + "loss": 0.3392, + "step": 680 + }, + { + "epoch": 1.741687979539642, + "grad_norm": 0.20283023890137888, + "learning_rate": 2.3364928909952608e-05, + "loss": 0.3291, + "step": 681 + }, + { + "epoch": 1.7442455242966752, + "grad_norm": 0.21461127150907236, + "learning_rate": 2.3317535545023696e-05, + "loss": 0.3386, + "step": 682 + }, + { + "epoch": 1.7468030690537084, + "grad_norm": 0.1909336493281626, + "learning_rate": 2.3270142180094788e-05, + "loss": 0.3175, + "step": 683 + }, + { + "epoch": 1.7493606138107416, + "grad_norm": 0.21259577247012493, + "learning_rate": 2.322274881516588e-05, + "loss": 0.3403, + "step": 684 + }, + { + "epoch": 1.7519181585677748, + "grad_norm": 0.20066175215287518, + "learning_rate": 2.317535545023697e-05, + "loss": 0.3285, + "step": 685 + }, + { + "epoch": 1.7544757033248082, + "grad_norm": 0.7581076529268704, + "learning_rate": 2.3127962085308058e-05, + "loss": 0.363, + "step": 686 + }, + { + "epoch": 1.7570332480818416, + "grad_norm": 0.22657067765448413, + "learning_rate": 2.3080568720379146e-05, + "loss": 0.3435, + "step": 687 + }, + { + "epoch": 1.7595907928388748, + "grad_norm": 0.2217418832124222, + "learning_rate": 2.3033175355450237e-05, + "loss": 0.3197, + "step": 688 + }, + { + "epoch": 1.762148337595908, + "grad_norm": 0.21459082938179172, + "learning_rate": 2.2985781990521325e-05, + "loss": 0.3505, + "step": 689 + }, + { + "epoch": 1.7647058823529411, + "grad_norm": 0.20738951497933816, + "learning_rate": 2.293838862559242e-05, + "loss": 0.321, + "step": 690 + }, + { + "epoch": 1.7672634271099743, + "grad_norm": 0.22785561899819126, + "learning_rate": 2.2890995260663508e-05, + "loss": 0.3424, + "step": 691 + }, + { + "epoch": 1.7698209718670077, + "grad_norm": 0.22927074980811404, + "learning_rate": 2.28436018957346e-05, + "loss": 0.3351, + "step": 692 + }, + { + "epoch": 1.772378516624041, + "grad_norm": 0.23347434762189972, + "learning_rate": 2.2796208530805687e-05, + "loss": 0.347, + "step": 693 + }, + { + "epoch": 1.7749360613810743, + "grad_norm": 0.2330189859527237, + "learning_rate": 2.274881516587678e-05, + "loss": 0.3341, + "step": 694 + }, + { + "epoch": 1.7774936061381075, + "grad_norm": 0.25074043381573513, + "learning_rate": 2.270142180094787e-05, + "loss": 0.3172, + "step": 695 + }, + { + "epoch": 1.7800511508951407, + "grad_norm": 0.21374906832842885, + "learning_rate": 2.2654028436018958e-05, + "loss": 0.339, + "step": 696 + }, + { + "epoch": 1.7826086956521738, + "grad_norm": 0.25168218613406507, + "learning_rate": 2.260663507109005e-05, + "loss": 0.3325, + "step": 697 + }, + { + "epoch": 1.785166240409207, + "grad_norm": 0.2403091187285791, + "learning_rate": 2.2559241706161137e-05, + "loss": 0.3478, + "step": 698 + }, + { + "epoch": 1.7877237851662404, + "grad_norm": 0.22187397630061947, + "learning_rate": 2.251184834123223e-05, + "loss": 0.3452, + "step": 699 + }, + { + "epoch": 1.7902813299232738, + "grad_norm": 0.24752310282755516, + "learning_rate": 2.2464454976303317e-05, + "loss": 0.3356, + "step": 700 + }, + { + "epoch": 1.792838874680307, + "grad_norm": 0.2084033534950943, + "learning_rate": 2.241706161137441e-05, + "loss": 0.3259, + "step": 701 + }, + { + "epoch": 1.7953964194373402, + "grad_norm": 0.2355859217064896, + "learning_rate": 2.23696682464455e-05, + "loss": 0.3425, + "step": 702 + }, + { + "epoch": 1.7979539641943734, + "grad_norm": 0.21447292569876703, + "learning_rate": 2.232227488151659e-05, + "loss": 0.3153, + "step": 703 + }, + { + "epoch": 1.8005115089514065, + "grad_norm": 0.20854337096420522, + "learning_rate": 2.227488151658768e-05, + "loss": 0.3352, + "step": 704 + }, + { + "epoch": 1.80306905370844, + "grad_norm": 0.22064595096377312, + "learning_rate": 2.2227488151658766e-05, + "loss": 0.3228, + "step": 705 + }, + { + "epoch": 1.8056265984654731, + "grad_norm": 0.23748592354665862, + "learning_rate": 2.2180094786729858e-05, + "loss": 0.3407, + "step": 706 + }, + { + "epoch": 1.8081841432225065, + "grad_norm": 0.25098201166842826, + "learning_rate": 2.213270142180095e-05, + "loss": 0.3533, + "step": 707 + }, + { + "epoch": 1.8107416879795397, + "grad_norm": 0.2789258681226503, + "learning_rate": 2.208530805687204e-05, + "loss": 0.3405, + "step": 708 + }, + { + "epoch": 1.813299232736573, + "grad_norm": 0.21924763977982134, + "learning_rate": 2.203791469194313e-05, + "loss": 0.3209, + "step": 709 + }, + { + "epoch": 1.815856777493606, + "grad_norm": 0.24534901252195856, + "learning_rate": 2.199052132701422e-05, + "loss": 0.3228, + "step": 710 + }, + { + "epoch": 1.8184143222506393, + "grad_norm": 0.23769380073414784, + "learning_rate": 2.1943127962085308e-05, + "loss": 0.3319, + "step": 711 + }, + { + "epoch": 1.8209718670076727, + "grad_norm": 0.20966116422671724, + "learning_rate": 2.18957345971564e-05, + "loss": 0.3255, + "step": 712 + }, + { + "epoch": 1.8235294117647058, + "grad_norm": 0.2278495662047266, + "learning_rate": 2.184834123222749e-05, + "loss": 0.3234, + "step": 713 + }, + { + "epoch": 1.8260869565217392, + "grad_norm": 0.22895416972072405, + "learning_rate": 2.1800947867298578e-05, + "loss": 0.3189, + "step": 714 + }, + { + "epoch": 1.8286445012787724, + "grad_norm": 0.2086902846375283, + "learning_rate": 2.175355450236967e-05, + "loss": 0.3472, + "step": 715 + }, + { + "epoch": 1.8312020460358056, + "grad_norm": 0.19855684843219606, + "learning_rate": 2.1706161137440758e-05, + "loss": 0.3458, + "step": 716 + }, + { + "epoch": 1.8337595907928388, + "grad_norm": 0.23552439546401155, + "learning_rate": 2.165876777251185e-05, + "loss": 0.336, + "step": 717 + }, + { + "epoch": 1.836317135549872, + "grad_norm": 0.20685861123790114, + "learning_rate": 2.161137440758294e-05, + "loss": 0.336, + "step": 718 + }, + { + "epoch": 1.8388746803069054, + "grad_norm": 0.19887491717386577, + "learning_rate": 2.156398104265403e-05, + "loss": 0.3396, + "step": 719 + }, + { + "epoch": 1.8414322250639388, + "grad_norm": 0.2509814669536259, + "learning_rate": 2.151658767772512e-05, + "loss": 0.3441, + "step": 720 + }, + { + "epoch": 1.843989769820972, + "grad_norm": 0.19522319866892376, + "learning_rate": 2.146919431279621e-05, + "loss": 0.3106, + "step": 721 + }, + { + "epoch": 1.8465473145780051, + "grad_norm": 0.18974799063588516, + "learning_rate": 2.14218009478673e-05, + "loss": 0.3262, + "step": 722 + }, + { + "epoch": 1.8491048593350383, + "grad_norm": 0.20477382204756456, + "learning_rate": 2.1374407582938387e-05, + "loss": 0.3429, + "step": 723 + }, + { + "epoch": 1.8516624040920715, + "grad_norm": 0.2142522572136313, + "learning_rate": 2.132701421800948e-05, + "loss": 0.3308, + "step": 724 + }, + { + "epoch": 1.854219948849105, + "grad_norm": 0.1979056292881532, + "learning_rate": 2.127962085308057e-05, + "loss": 0.3346, + "step": 725 + }, + { + "epoch": 1.856777493606138, + "grad_norm": 0.20554303110251226, + "learning_rate": 2.123222748815166e-05, + "loss": 0.3583, + "step": 726 + }, + { + "epoch": 1.8593350383631715, + "grad_norm": 0.1984154565321334, + "learning_rate": 2.118483412322275e-05, + "loss": 0.3196, + "step": 727 + }, + { + "epoch": 1.8618925831202047, + "grad_norm": 0.2008595114867567, + "learning_rate": 2.113744075829384e-05, + "loss": 0.3427, + "step": 728 + }, + { + "epoch": 1.8644501278772379, + "grad_norm": 0.21531627949148452, + "learning_rate": 2.109004739336493e-05, + "loss": 0.3415, + "step": 729 + }, + { + "epoch": 1.867007672634271, + "grad_norm": 0.20193118670494573, + "learning_rate": 2.104265402843602e-05, + "loss": 0.3371, + "step": 730 + }, + { + "epoch": 1.8695652173913042, + "grad_norm": 0.21944975819432885, + "learning_rate": 2.099526066350711e-05, + "loss": 0.3217, + "step": 731 + }, + { + "epoch": 1.8721227621483376, + "grad_norm": 0.23381023417915053, + "learning_rate": 2.09478672985782e-05, + "loss": 0.3406, + "step": 732 + }, + { + "epoch": 1.8746803069053708, + "grad_norm": 0.19300009053421657, + "learning_rate": 2.090047393364929e-05, + "loss": 0.3199, + "step": 733 + }, + { + "epoch": 1.8772378516624042, + "grad_norm": 0.19576466530600098, + "learning_rate": 2.0853080568720378e-05, + "loss": 0.3215, + "step": 734 + }, + { + "epoch": 1.8797953964194374, + "grad_norm": 0.21787819537132525, + "learning_rate": 2.0805687203791473e-05, + "loss": 0.3359, + "step": 735 + }, + { + "epoch": 1.8823529411764706, + "grad_norm": 0.20623605117402122, + "learning_rate": 2.075829383886256e-05, + "loss": 0.3396, + "step": 736 + }, + { + "epoch": 1.8849104859335037, + "grad_norm": 0.19807063269430017, + "learning_rate": 2.0710900473933652e-05, + "loss": 0.3321, + "step": 737 + }, + { + "epoch": 1.887468030690537, + "grad_norm": 0.21340084606280826, + "learning_rate": 2.066350710900474e-05, + "loss": 0.3376, + "step": 738 + }, + { + "epoch": 1.8900255754475703, + "grad_norm": 0.2117778713699439, + "learning_rate": 2.061611374407583e-05, + "loss": 0.318, + "step": 739 + }, + { + "epoch": 1.8925831202046037, + "grad_norm": 0.19496876658086634, + "learning_rate": 2.056872037914692e-05, + "loss": 0.3275, + "step": 740 + }, + { + "epoch": 1.895140664961637, + "grad_norm": 0.22772399554231024, + "learning_rate": 2.052132701421801e-05, + "loss": 0.3456, + "step": 741 + }, + { + "epoch": 1.89769820971867, + "grad_norm": 0.19861753270620258, + "learning_rate": 2.0473933649289102e-05, + "loss": 0.3364, + "step": 742 + }, + { + "epoch": 1.9002557544757033, + "grad_norm": 0.2101418258514019, + "learning_rate": 2.042654028436019e-05, + "loss": 0.3324, + "step": 743 + }, + { + "epoch": 1.9028132992327365, + "grad_norm": 0.19738568484825283, + "learning_rate": 2.037914691943128e-05, + "loss": 0.3458, + "step": 744 + }, + { + "epoch": 1.9053708439897699, + "grad_norm": 0.22341732627665845, + "learning_rate": 2.033175355450237e-05, + "loss": 0.3472, + "step": 745 + }, + { + "epoch": 1.907928388746803, + "grad_norm": 0.20794044931146008, + "learning_rate": 2.028436018957346e-05, + "loss": 0.3367, + "step": 746 + }, + { + "epoch": 1.9104859335038364, + "grad_norm": 0.20491964629174395, + "learning_rate": 2.0236966824644552e-05, + "loss": 0.3223, + "step": 747 + }, + { + "epoch": 1.9130434782608696, + "grad_norm": 0.20189894495265795, + "learning_rate": 2.018957345971564e-05, + "loss": 0.3362, + "step": 748 + }, + { + "epoch": 1.9156010230179028, + "grad_norm": 0.2048150503497566, + "learning_rate": 2.014218009478673e-05, + "loss": 0.3426, + "step": 749 + }, + { + "epoch": 1.918158567774936, + "grad_norm": 0.21954225182865705, + "learning_rate": 2.009478672985782e-05, + "loss": 0.3528, + "step": 750 + }, + { + "epoch": 1.9207161125319692, + "grad_norm": 0.22214844960655306, + "learning_rate": 2.004739336492891e-05, + "loss": 0.3491, + "step": 751 + }, + { + "epoch": 1.9232736572890026, + "grad_norm": 0.2002610790388588, + "learning_rate": 2e-05, + "loss": 0.3324, + "step": 752 + }, + { + "epoch": 1.9258312020460358, + "grad_norm": 0.23222016864966347, + "learning_rate": 1.9952606635071093e-05, + "loss": 0.3292, + "step": 753 + }, + { + "epoch": 1.9283887468030692, + "grad_norm": 0.2207542823663722, + "learning_rate": 1.990521327014218e-05, + "loss": 0.3385, + "step": 754 + }, + { + "epoch": 1.9309462915601023, + "grad_norm": 0.22749264244194325, + "learning_rate": 1.9857819905213272e-05, + "loss": 0.316, + "step": 755 + }, + { + "epoch": 1.9335038363171355, + "grad_norm": 0.1977916254111309, + "learning_rate": 1.981042654028436e-05, + "loss": 0.3395, + "step": 756 + }, + { + "epoch": 1.9360613810741687, + "grad_norm": 0.19556691281474403, + "learning_rate": 1.976303317535545e-05, + "loss": 0.3355, + "step": 757 + }, + { + "epoch": 1.938618925831202, + "grad_norm": 0.1962514353937156, + "learning_rate": 1.9715639810426543e-05, + "loss": 0.3156, + "step": 758 + }, + { + "epoch": 1.9411764705882353, + "grad_norm": 0.23567156259437158, + "learning_rate": 1.966824644549763e-05, + "loss": 0.3413, + "step": 759 + }, + { + "epoch": 1.9437340153452687, + "grad_norm": 0.1980383962745943, + "learning_rate": 1.9620853080568722e-05, + "loss": 0.323, + "step": 760 + }, + { + "epoch": 1.9462915601023019, + "grad_norm": 0.19505875547934262, + "learning_rate": 1.957345971563981e-05, + "loss": 0.3342, + "step": 761 + }, + { + "epoch": 1.948849104859335, + "grad_norm": 0.22978204914718386, + "learning_rate": 1.95260663507109e-05, + "loss": 0.3438, + "step": 762 + }, + { + "epoch": 1.9514066496163682, + "grad_norm": 0.19344201193147603, + "learning_rate": 1.9478672985781993e-05, + "loss": 0.3118, + "step": 763 + }, + { + "epoch": 1.9539641943734014, + "grad_norm": 0.18582466291193162, + "learning_rate": 1.943127962085308e-05, + "loss": 0.3375, + "step": 764 + }, + { + "epoch": 1.9565217391304348, + "grad_norm": 0.21401678800134463, + "learning_rate": 1.9383886255924172e-05, + "loss": 0.3384, + "step": 765 + }, + { + "epoch": 1.959079283887468, + "grad_norm": 0.19342159241258478, + "learning_rate": 1.933649289099526e-05, + "loss": 0.3206, + "step": 766 + }, + { + "epoch": 1.9616368286445014, + "grad_norm": 0.19399605381147378, + "learning_rate": 1.928909952606635e-05, + "loss": 0.3384, + "step": 767 + }, + { + "epoch": 1.9641943734015346, + "grad_norm": 0.20148408812790133, + "learning_rate": 1.924170616113744e-05, + "loss": 0.3225, + "step": 768 + }, + { + "epoch": 1.9667519181585678, + "grad_norm": 0.18715476457309554, + "learning_rate": 1.9194312796208534e-05, + "loss": 0.3288, + "step": 769 + }, + { + "epoch": 1.969309462915601, + "grad_norm": 0.18815259504289839, + "learning_rate": 1.9146919431279622e-05, + "loss": 0.3142, + "step": 770 + }, + { + "epoch": 1.9718670076726341, + "grad_norm": 0.20640473592890973, + "learning_rate": 1.9099526066350713e-05, + "loss": 0.3191, + "step": 771 + }, + { + "epoch": 1.9744245524296675, + "grad_norm": 0.19662779268863564, + "learning_rate": 1.90521327014218e-05, + "loss": 0.3207, + "step": 772 + }, + { + "epoch": 1.976982097186701, + "grad_norm": 0.2066568679986916, + "learning_rate": 1.9004739336492893e-05, + "loss": 0.3454, + "step": 773 + }, + { + "epoch": 1.979539641943734, + "grad_norm": 0.20322475668496154, + "learning_rate": 1.895734597156398e-05, + "loss": 0.3234, + "step": 774 + }, + { + "epoch": 1.9820971867007673, + "grad_norm": 0.19059283154521114, + "learning_rate": 1.8909952606635072e-05, + "loss": 0.3313, + "step": 775 + }, + { + "epoch": 1.9846547314578005, + "grad_norm": 0.21406140221632183, + "learning_rate": 1.8862559241706163e-05, + "loss": 0.3465, + "step": 776 + }, + { + "epoch": 1.9872122762148337, + "grad_norm": 0.21139212441518793, + "learning_rate": 1.881516587677725e-05, + "loss": 0.3261, + "step": 777 + }, + { + "epoch": 1.989769820971867, + "grad_norm": 0.19320779992691875, + "learning_rate": 1.8767772511848342e-05, + "loss": 0.3199, + "step": 778 + }, + { + "epoch": 1.9923273657289002, + "grad_norm": 0.1948553869588904, + "learning_rate": 1.872037914691943e-05, + "loss": 0.3295, + "step": 779 + }, + { + "epoch": 1.9948849104859336, + "grad_norm": 0.19898631153447896, + "learning_rate": 1.8672985781990525e-05, + "loss": 0.3185, + "step": 780 + }, + { + "epoch": 1.9974424552429668, + "grad_norm": 0.20500622742531077, + "learning_rate": 1.8625592417061613e-05, + "loss": 0.3367, + "step": 781 + }, + { + "epoch": 2.0, + "grad_norm": 0.18745671376713552, + "learning_rate": 1.85781990521327e-05, + "loss": 0.3035, + "step": 782 + }, + { + "epoch": 2.002557544757033, + "grad_norm": 0.2785878708893465, + "learning_rate": 1.8530805687203792e-05, + "loss": 0.2615, + "step": 783 + }, + { + "epoch": 2.0051150895140664, + "grad_norm": 0.20397480769360993, + "learning_rate": 1.848341232227488e-05, + "loss": 0.2434, + "step": 784 + }, + { + "epoch": 2.0076726342710995, + "grad_norm": 0.2923962743620856, + "learning_rate": 1.843601895734597e-05, + "loss": 0.2478, + "step": 785 + }, + { + "epoch": 2.010230179028133, + "grad_norm": 0.25689369914334176, + "learning_rate": 1.8388625592417063e-05, + "loss": 0.2448, + "step": 786 + }, + { + "epoch": 2.0127877237851663, + "grad_norm": 0.23710484976355836, + "learning_rate": 1.8341232227488154e-05, + "loss": 0.257, + "step": 787 + }, + { + "epoch": 2.0153452685421995, + "grad_norm": 0.29563461441097083, + "learning_rate": 1.8293838862559242e-05, + "loss": 0.2521, + "step": 788 + }, + { + "epoch": 2.0179028132992327, + "grad_norm": 0.2381040612370418, + "learning_rate": 1.8246445497630334e-05, + "loss": 0.2499, + "step": 789 + }, + { + "epoch": 2.020460358056266, + "grad_norm": 0.2291439129489046, + "learning_rate": 1.819905213270142e-05, + "loss": 0.2438, + "step": 790 + }, + { + "epoch": 2.023017902813299, + "grad_norm": 0.28685620757378183, + "learning_rate": 1.8151658767772513e-05, + "loss": 0.2553, + "step": 791 + }, + { + "epoch": 2.0255754475703327, + "grad_norm": 0.21147497245529764, + "learning_rate": 1.8104265402843604e-05, + "loss": 0.252, + "step": 792 + }, + { + "epoch": 2.028132992327366, + "grad_norm": 0.22446603408981508, + "learning_rate": 1.8056872037914692e-05, + "loss": 0.2536, + "step": 793 + }, + { + "epoch": 2.030690537084399, + "grad_norm": 0.24541367333886985, + "learning_rate": 1.8009478672985784e-05, + "loss": 0.2504, + "step": 794 + }, + { + "epoch": 2.0332480818414322, + "grad_norm": 0.22514879404996416, + "learning_rate": 1.796208530805687e-05, + "loss": 0.2605, + "step": 795 + }, + { + "epoch": 2.0358056265984654, + "grad_norm": 0.20624678594072715, + "learning_rate": 1.7914691943127963e-05, + "loss": 0.2612, + "step": 796 + }, + { + "epoch": 2.0383631713554986, + "grad_norm": 0.21342231575903908, + "learning_rate": 1.7867298578199054e-05, + "loss": 0.2499, + "step": 797 + }, + { + "epoch": 2.040920716112532, + "grad_norm": 0.22708020169166784, + "learning_rate": 1.7819905213270146e-05, + "loss": 0.2573, + "step": 798 + }, + { + "epoch": 2.0434782608695654, + "grad_norm": 0.20671082360929366, + "learning_rate": 1.7772511848341233e-05, + "loss": 0.2517, + "step": 799 + }, + { + "epoch": 2.0460358056265986, + "grad_norm": 0.20470461882320312, + "learning_rate": 1.772511848341232e-05, + "loss": 0.2441, + "step": 800 + }, + { + "epoch": 2.0485933503836318, + "grad_norm": 0.20251032207130173, + "learning_rate": 1.7677725118483413e-05, + "loss": 0.2547, + "step": 801 + }, + { + "epoch": 2.051150895140665, + "grad_norm": 0.20368951509303784, + "learning_rate": 1.76303317535545e-05, + "loss": 0.2439, + "step": 802 + }, + { + "epoch": 2.053708439897698, + "grad_norm": 0.19922183550926562, + "learning_rate": 1.7582938388625595e-05, + "loss": 0.2401, + "step": 803 + }, + { + "epoch": 2.0562659846547313, + "grad_norm": 0.21378847417361496, + "learning_rate": 1.7535545023696683e-05, + "loss": 0.2598, + "step": 804 + }, + { + "epoch": 2.0588235294117645, + "grad_norm": 0.2093916676403955, + "learning_rate": 1.7488151658767775e-05, + "loss": 0.2562, + "step": 805 + }, + { + "epoch": 2.061381074168798, + "grad_norm": 0.2148148853889112, + "learning_rate": 1.7440758293838863e-05, + "loss": 0.2543, + "step": 806 + }, + { + "epoch": 2.0639386189258313, + "grad_norm": 0.20365914748452466, + "learning_rate": 1.7393364928909954e-05, + "loss": 0.248, + "step": 807 + }, + { + "epoch": 2.0664961636828645, + "grad_norm": 0.21066398897720096, + "learning_rate": 1.7345971563981042e-05, + "loss": 0.2638, + "step": 808 + }, + { + "epoch": 2.0690537084398977, + "grad_norm": 0.20804166422941303, + "learning_rate": 1.7298578199052133e-05, + "loss": 0.2542, + "step": 809 + }, + { + "epoch": 2.071611253196931, + "grad_norm": 0.18674967472128892, + "learning_rate": 1.7251184834123225e-05, + "loss": 0.2405, + "step": 810 + }, + { + "epoch": 2.074168797953964, + "grad_norm": 0.1906175209072609, + "learning_rate": 1.7203791469194313e-05, + "loss": 0.2342, + "step": 811 + }, + { + "epoch": 2.0767263427109977, + "grad_norm": 0.2100046063283888, + "learning_rate": 1.7156398104265404e-05, + "loss": 0.2432, + "step": 812 + }, + { + "epoch": 2.079283887468031, + "grad_norm": 0.1967925906674926, + "learning_rate": 1.7109004739336492e-05, + "loss": 0.2413, + "step": 813 + }, + { + "epoch": 2.081841432225064, + "grad_norm": 0.1985022110628129, + "learning_rate": 1.7061611374407587e-05, + "loss": 0.2412, + "step": 814 + }, + { + "epoch": 2.084398976982097, + "grad_norm": 0.2004462205861864, + "learning_rate": 1.7014218009478674e-05, + "loss": 0.2608, + "step": 815 + }, + { + "epoch": 2.0869565217391304, + "grad_norm": 0.2126154513787664, + "learning_rate": 1.6966824644549766e-05, + "loss": 0.2419, + "step": 816 + }, + { + "epoch": 2.0895140664961636, + "grad_norm": 0.21724682158013556, + "learning_rate": 1.6919431279620854e-05, + "loss": 0.2555, + "step": 817 + }, + { + "epoch": 2.0920716112531967, + "grad_norm": 0.20957633230824144, + "learning_rate": 1.6872037914691942e-05, + "loss": 0.2434, + "step": 818 + }, + { + "epoch": 2.0946291560102304, + "grad_norm": 0.1852153835527483, + "learning_rate": 1.6824644549763033e-05, + "loss": 0.2408, + "step": 819 + }, + { + "epoch": 2.0971867007672635, + "grad_norm": 0.22086697836670513, + "learning_rate": 1.6777251184834124e-05, + "loss": 0.2582, + "step": 820 + }, + { + "epoch": 2.0997442455242967, + "grad_norm": 0.24261708505812196, + "learning_rate": 1.6729857819905216e-05, + "loss": 0.2632, + "step": 821 + }, + { + "epoch": 2.10230179028133, + "grad_norm": 0.18366952389698496, + "learning_rate": 1.6682464454976304e-05, + "loss": 0.2433, + "step": 822 + }, + { + "epoch": 2.104859335038363, + "grad_norm": 0.2038172463973163, + "learning_rate": 1.6635071090047395e-05, + "loss": 0.2525, + "step": 823 + }, + { + "epoch": 2.1074168797953963, + "grad_norm": 0.2012679343362801, + "learning_rate": 1.6587677725118483e-05, + "loss": 0.249, + "step": 824 + }, + { + "epoch": 2.10997442455243, + "grad_norm": 0.19324190678914918, + "learning_rate": 1.6540284360189574e-05, + "loss": 0.2476, + "step": 825 + }, + { + "epoch": 2.112531969309463, + "grad_norm": 0.19308515698590148, + "learning_rate": 1.6492890995260666e-05, + "loss": 0.2545, + "step": 826 + }, + { + "epoch": 2.1150895140664963, + "grad_norm": 0.20072878909780828, + "learning_rate": 1.6445497630331754e-05, + "loss": 0.2493, + "step": 827 + }, + { + "epoch": 2.1176470588235294, + "grad_norm": 0.21529840999791708, + "learning_rate": 1.6398104265402845e-05, + "loss": 0.2505, + "step": 828 + }, + { + "epoch": 2.1202046035805626, + "grad_norm": 0.190291814924438, + "learning_rate": 1.6350710900473933e-05, + "loss": 0.2568, + "step": 829 + }, + { + "epoch": 2.122762148337596, + "grad_norm": 0.1843567434491544, + "learning_rate": 1.6303317535545024e-05, + "loss": 0.235, + "step": 830 + }, + { + "epoch": 2.125319693094629, + "grad_norm": 0.20192839632170334, + "learning_rate": 1.6255924170616116e-05, + "loss": 0.2518, + "step": 831 + }, + { + "epoch": 2.1278772378516626, + "grad_norm": 0.19505086113061484, + "learning_rate": 1.6208530805687207e-05, + "loss": 0.2422, + "step": 832 + }, + { + "epoch": 2.130434782608696, + "grad_norm": 0.18413293323513488, + "learning_rate": 1.6161137440758295e-05, + "loss": 0.2481, + "step": 833 + }, + { + "epoch": 2.132992327365729, + "grad_norm": 0.19660864149905055, + "learning_rate": 1.6113744075829386e-05, + "loss": 0.2482, + "step": 834 + }, + { + "epoch": 2.135549872122762, + "grad_norm": 0.19108123965299506, + "learning_rate": 1.6066350710900474e-05, + "loss": 0.2488, + "step": 835 + }, + { + "epoch": 2.1381074168797953, + "grad_norm": 0.2054493861311576, + "learning_rate": 1.6018957345971562e-05, + "loss": 0.2508, + "step": 836 + }, + { + "epoch": 2.1406649616368285, + "grad_norm": 0.19933140761961352, + "learning_rate": 1.5971563981042657e-05, + "loss": 0.2526, + "step": 837 + }, + { + "epoch": 2.1432225063938617, + "grad_norm": 0.18520997915505424, + "learning_rate": 1.5924170616113745e-05, + "loss": 0.2553, + "step": 838 + }, + { + "epoch": 2.1457800511508953, + "grad_norm": 0.18142714347687713, + "learning_rate": 1.5876777251184836e-05, + "loss": 0.2404, + "step": 839 + }, + { + "epoch": 2.1483375959079285, + "grad_norm": 0.19332393510145196, + "learning_rate": 1.5829383886255924e-05, + "loss": 0.2608, + "step": 840 + }, + { + "epoch": 2.1508951406649617, + "grad_norm": 0.18239849204776917, + "learning_rate": 1.5781990521327015e-05, + "loss": 0.2472, + "step": 841 + }, + { + "epoch": 2.153452685421995, + "grad_norm": 0.19432247568701047, + "learning_rate": 1.5734597156398103e-05, + "loss": 0.2509, + "step": 842 + }, + { + "epoch": 2.156010230179028, + "grad_norm": 0.1891425736304601, + "learning_rate": 1.5687203791469195e-05, + "loss": 0.2544, + "step": 843 + }, + { + "epoch": 2.1585677749360612, + "grad_norm": 0.1776945543749591, + "learning_rate": 1.5639810426540286e-05, + "loss": 0.2418, + "step": 844 + }, + { + "epoch": 2.1611253196930944, + "grad_norm": 0.19454352996860633, + "learning_rate": 1.5592417061611374e-05, + "loss": 0.2578, + "step": 845 + }, + { + "epoch": 2.163682864450128, + "grad_norm": 0.19387855469120038, + "learning_rate": 1.5545023696682465e-05, + "loss": 0.2562, + "step": 846 + }, + { + "epoch": 2.166240409207161, + "grad_norm": 0.1884476249381793, + "learning_rate": 1.5497630331753553e-05, + "loss": 0.2435, + "step": 847 + }, + { + "epoch": 2.1687979539641944, + "grad_norm": 0.19682354969261456, + "learning_rate": 1.5450236966824648e-05, + "loss": 0.245, + "step": 848 + }, + { + "epoch": 2.1713554987212276, + "grad_norm": 0.19646857607869206, + "learning_rate": 1.5402843601895736e-05, + "loss": 0.2421, + "step": 849 + }, + { + "epoch": 2.1739130434782608, + "grad_norm": 0.1878274831496743, + "learning_rate": 1.5355450236966827e-05, + "loss": 0.2602, + "step": 850 + }, + { + "epoch": 2.176470588235294, + "grad_norm": 0.2203759013180319, + "learning_rate": 1.5308056872037915e-05, + "loss": 0.26, + "step": 851 + }, + { + "epoch": 2.1790281329923276, + "grad_norm": 0.20353045344689538, + "learning_rate": 1.5260663507109007e-05, + "loss": 0.2548, + "step": 852 + }, + { + "epoch": 2.1815856777493607, + "grad_norm": 0.17917216373663247, + "learning_rate": 1.5213270142180094e-05, + "loss": 0.2481, + "step": 853 + }, + { + "epoch": 2.184143222506394, + "grad_norm": 0.19965741458148648, + "learning_rate": 1.5165876777251187e-05, + "loss": 0.2369, + "step": 854 + }, + { + "epoch": 2.186700767263427, + "grad_norm": 0.1983107544529902, + "learning_rate": 1.5118483412322275e-05, + "loss": 0.2649, + "step": 855 + }, + { + "epoch": 2.1892583120204603, + "grad_norm": 0.18889444027577523, + "learning_rate": 1.5071090047393365e-05, + "loss": 0.2537, + "step": 856 + }, + { + "epoch": 2.1918158567774935, + "grad_norm": 0.1778943432272497, + "learning_rate": 1.5023696682464455e-05, + "loss": 0.2416, + "step": 857 + }, + { + "epoch": 2.1943734015345266, + "grad_norm": 0.1864614456662679, + "learning_rate": 1.4976303317535544e-05, + "loss": 0.2552, + "step": 858 + }, + { + "epoch": 2.1969309462915603, + "grad_norm": 0.20406945944259086, + "learning_rate": 1.4928909952606634e-05, + "loss": 0.2445, + "step": 859 + }, + { + "epoch": 2.1994884910485935, + "grad_norm": 0.19912968488704036, + "learning_rate": 1.4881516587677727e-05, + "loss": 0.2483, + "step": 860 + }, + { + "epoch": 2.2020460358056266, + "grad_norm": 0.1971404080483016, + "learning_rate": 1.4834123222748817e-05, + "loss": 0.2485, + "step": 861 + }, + { + "epoch": 2.20460358056266, + "grad_norm": 0.1906866495437284, + "learning_rate": 1.4786729857819906e-05, + "loss": 0.2422, + "step": 862 + }, + { + "epoch": 2.207161125319693, + "grad_norm": 0.2236746863882317, + "learning_rate": 1.4739336492890996e-05, + "loss": 0.2526, + "step": 863 + }, + { + "epoch": 2.209718670076726, + "grad_norm": 0.20479615550169253, + "learning_rate": 1.4691943127962086e-05, + "loss": 0.2406, + "step": 864 + }, + { + "epoch": 2.21227621483376, + "grad_norm": 0.18952772024384357, + "learning_rate": 1.4644549763033177e-05, + "loss": 0.2473, + "step": 865 + }, + { + "epoch": 2.214833759590793, + "grad_norm": 0.21288572261909536, + "learning_rate": 1.4597156398104267e-05, + "loss": 0.2421, + "step": 866 + }, + { + "epoch": 2.217391304347826, + "grad_norm": 0.22140557077938572, + "learning_rate": 1.4549763033175356e-05, + "loss": 0.2475, + "step": 867 + }, + { + "epoch": 2.2199488491048593, + "grad_norm": 0.20708774144192757, + "learning_rate": 1.4502369668246446e-05, + "loss": 0.2673, + "step": 868 + }, + { + "epoch": 2.2225063938618925, + "grad_norm": 0.18720660014130933, + "learning_rate": 1.4454976303317535e-05, + "loss": 0.247, + "step": 869 + }, + { + "epoch": 2.2250639386189257, + "grad_norm": 0.22218057616305048, + "learning_rate": 1.4407582938388625e-05, + "loss": 0.2563, + "step": 870 + }, + { + "epoch": 2.227621483375959, + "grad_norm": 0.19461791848551566, + "learning_rate": 1.4360189573459718e-05, + "loss": 0.2411, + "step": 871 + }, + { + "epoch": 2.2301790281329925, + "grad_norm": 0.18465999777437872, + "learning_rate": 1.4312796208530808e-05, + "loss": 0.2436, + "step": 872 + }, + { + "epoch": 2.2327365728900257, + "grad_norm": 0.1869706742832914, + "learning_rate": 1.4265402843601896e-05, + "loss": 0.2468, + "step": 873 + }, + { + "epoch": 2.235294117647059, + "grad_norm": 0.19859678673304443, + "learning_rate": 1.4218009478672985e-05, + "loss": 0.2629, + "step": 874 + }, + { + "epoch": 2.237851662404092, + "grad_norm": 0.18894735140342547, + "learning_rate": 1.4170616113744075e-05, + "loss": 0.2463, + "step": 875 + }, + { + "epoch": 2.2404092071611252, + "grad_norm": 0.1841073857339832, + "learning_rate": 1.4123222748815165e-05, + "loss": 0.2443, + "step": 876 + }, + { + "epoch": 2.2429667519181584, + "grad_norm": 0.19766216004613152, + "learning_rate": 1.4075829383886258e-05, + "loss": 0.2445, + "step": 877 + }, + { + "epoch": 2.2455242966751916, + "grad_norm": 0.20409991732668273, + "learning_rate": 1.4028436018957347e-05, + "loss": 0.2708, + "step": 878 + }, + { + "epoch": 2.2480818414322252, + "grad_norm": 0.19977719707950095, + "learning_rate": 1.3981042654028437e-05, + "loss": 0.2518, + "step": 879 + }, + { + "epoch": 2.2506393861892584, + "grad_norm": 0.2053796828512668, + "learning_rate": 1.3933649289099527e-05, + "loss": 0.2495, + "step": 880 + }, + { + "epoch": 2.2531969309462916, + "grad_norm": 0.17832792645098117, + "learning_rate": 1.3886255924170616e-05, + "loss": 0.2556, + "step": 881 + }, + { + "epoch": 2.2557544757033248, + "grad_norm": 0.18840256764724986, + "learning_rate": 1.3838862559241708e-05, + "loss": 0.2451, + "step": 882 + }, + { + "epoch": 2.258312020460358, + "grad_norm": 0.19398836581670234, + "learning_rate": 1.3791469194312797e-05, + "loss": 0.2473, + "step": 883 + }, + { + "epoch": 2.260869565217391, + "grad_norm": 0.20303902146790734, + "learning_rate": 1.3744075829383887e-05, + "loss": 0.2597, + "step": 884 + }, + { + "epoch": 2.2634271099744243, + "grad_norm": 0.18720894136927904, + "learning_rate": 1.3696682464454977e-05, + "loss": 0.2434, + "step": 885 + }, + { + "epoch": 2.265984654731458, + "grad_norm": 0.18987210304857877, + "learning_rate": 1.3649289099526066e-05, + "loss": 0.2525, + "step": 886 + }, + { + "epoch": 2.268542199488491, + "grad_norm": 0.2048273825139193, + "learning_rate": 1.3601895734597156e-05, + "loss": 0.2455, + "step": 887 + }, + { + "epoch": 2.2710997442455243, + "grad_norm": 0.19576486403594287, + "learning_rate": 1.3554502369668249e-05, + "loss": 0.2613, + "step": 888 + }, + { + "epoch": 2.2736572890025575, + "grad_norm": 0.20157435141172714, + "learning_rate": 1.3507109004739339e-05, + "loss": 0.2537, + "step": 889 + }, + { + "epoch": 2.2762148337595907, + "grad_norm": 0.18228152643827863, + "learning_rate": 1.3459715639810428e-05, + "loss": 0.2513, + "step": 890 + }, + { + "epoch": 2.2787723785166243, + "grad_norm": 0.19555632064091633, + "learning_rate": 1.3412322274881516e-05, + "loss": 0.2586, + "step": 891 + }, + { + "epoch": 2.2813299232736575, + "grad_norm": 0.203816174533527, + "learning_rate": 1.3364928909952606e-05, + "loss": 0.2442, + "step": 892 + }, + { + "epoch": 2.2838874680306906, + "grad_norm": 0.2029139098001244, + "learning_rate": 1.3317535545023695e-05, + "loss": 0.2561, + "step": 893 + }, + { + "epoch": 2.286445012787724, + "grad_norm": 0.19048244262223243, + "learning_rate": 1.3270142180094788e-05, + "loss": 0.2548, + "step": 894 + }, + { + "epoch": 2.289002557544757, + "grad_norm": 0.19391847669904372, + "learning_rate": 1.3222748815165878e-05, + "loss": 0.2421, + "step": 895 + }, + { + "epoch": 2.29156010230179, + "grad_norm": 0.17981132307135597, + "learning_rate": 1.3175355450236968e-05, + "loss": 0.2532, + "step": 896 + }, + { + "epoch": 2.2941176470588234, + "grad_norm": 0.17858235830519362, + "learning_rate": 1.3127962085308057e-05, + "loss": 0.2404, + "step": 897 + }, + { + "epoch": 2.296675191815857, + "grad_norm": 0.19117496497677566, + "learning_rate": 1.3080568720379147e-05, + "loss": 0.2593, + "step": 898 + }, + { + "epoch": 2.29923273657289, + "grad_norm": 0.2016899881448073, + "learning_rate": 1.3033175355450238e-05, + "loss": 0.2528, + "step": 899 + }, + { + "epoch": 2.3017902813299234, + "grad_norm": 0.1810650437144312, + "learning_rate": 1.2985781990521328e-05, + "loss": 0.2402, + "step": 900 + }, + { + "epoch": 2.3043478260869565, + "grad_norm": 0.19291047749671594, + "learning_rate": 1.2938388625592418e-05, + "loss": 0.2461, + "step": 901 + }, + { + "epoch": 2.3069053708439897, + "grad_norm": 0.1939959846671169, + "learning_rate": 1.2890995260663507e-05, + "loss": 0.2473, + "step": 902 + }, + { + "epoch": 2.309462915601023, + "grad_norm": 0.1863110176956983, + "learning_rate": 1.2843601895734597e-05, + "loss": 0.2364, + "step": 903 + }, + { + "epoch": 2.312020460358056, + "grad_norm": 0.17566806664980533, + "learning_rate": 1.2796208530805687e-05, + "loss": 0.2482, + "step": 904 + }, + { + "epoch": 2.3145780051150897, + "grad_norm": 0.19920352232738897, + "learning_rate": 1.274881516587678e-05, + "loss": 0.2559, + "step": 905 + }, + { + "epoch": 2.317135549872123, + "grad_norm": 0.1953502116868402, + "learning_rate": 1.270142180094787e-05, + "loss": 0.2408, + "step": 906 + }, + { + "epoch": 2.319693094629156, + "grad_norm": 0.18651854725906564, + "learning_rate": 1.2654028436018959e-05, + "loss": 0.2523, + "step": 907 + }, + { + "epoch": 2.3222506393861893, + "grad_norm": 0.1894806065906189, + "learning_rate": 1.2606635071090048e-05, + "loss": 0.2651, + "step": 908 + }, + { + "epoch": 2.3248081841432224, + "grad_norm": 0.18839186702018404, + "learning_rate": 1.2559241706161136e-05, + "loss": 0.2474, + "step": 909 + }, + { + "epoch": 2.3273657289002556, + "grad_norm": 0.19140520747243725, + "learning_rate": 1.2511848341232226e-05, + "loss": 0.2393, + "step": 910 + }, + { + "epoch": 2.329923273657289, + "grad_norm": 0.18330215327131463, + "learning_rate": 1.2464454976303317e-05, + "loss": 0.2528, + "step": 911 + }, + { + "epoch": 2.3324808184143224, + "grad_norm": 0.1932126436646379, + "learning_rate": 1.2417061611374409e-05, + "loss": 0.2565, + "step": 912 + }, + { + "epoch": 2.3350383631713556, + "grad_norm": 0.1950356336934161, + "learning_rate": 1.2369668246445498e-05, + "loss": 0.2457, + "step": 913 + }, + { + "epoch": 2.337595907928389, + "grad_norm": 0.17865872468905974, + "learning_rate": 1.2322274881516588e-05, + "loss": 0.2425, + "step": 914 + }, + { + "epoch": 2.340153452685422, + "grad_norm": 0.18504654975711932, + "learning_rate": 1.227488151658768e-05, + "loss": 0.2577, + "step": 915 + }, + { + "epoch": 2.342710997442455, + "grad_norm": 0.20222565063208944, + "learning_rate": 1.2227488151658769e-05, + "loss": 0.2581, + "step": 916 + }, + { + "epoch": 2.3452685421994883, + "grad_norm": 0.1838472381815511, + "learning_rate": 1.2180094786729859e-05, + "loss": 0.2542, + "step": 917 + }, + { + "epoch": 2.3478260869565215, + "grad_norm": 0.18346333495631081, + "learning_rate": 1.2132701421800948e-05, + "loss": 0.2366, + "step": 918 + }, + { + "epoch": 2.350383631713555, + "grad_norm": 0.18792845931699567, + "learning_rate": 1.2085308056872038e-05, + "loss": 0.2397, + "step": 919 + }, + { + "epoch": 2.3529411764705883, + "grad_norm": 0.18552873313226068, + "learning_rate": 1.2037914691943128e-05, + "loss": 0.2476, + "step": 920 + }, + { + "epoch": 2.3554987212276215, + "grad_norm": 0.18568764961833162, + "learning_rate": 1.1990521327014219e-05, + "loss": 0.2464, + "step": 921 + }, + { + "epoch": 2.3580562659846547, + "grad_norm": 0.19910274628884306, + "learning_rate": 1.1943127962085309e-05, + "loss": 0.2616, + "step": 922 + }, + { + "epoch": 2.360613810741688, + "grad_norm": 0.19023555512921334, + "learning_rate": 1.1895734597156398e-05, + "loss": 0.2468, + "step": 923 + }, + { + "epoch": 2.363171355498721, + "grad_norm": 0.18498994847552913, + "learning_rate": 1.184834123222749e-05, + "loss": 0.238, + "step": 924 + }, + { + "epoch": 2.3657289002557547, + "grad_norm": 0.19230699836861417, + "learning_rate": 1.180094786729858e-05, + "loss": 0.2499, + "step": 925 + }, + { + "epoch": 2.368286445012788, + "grad_norm": 0.18992356411703937, + "learning_rate": 1.1753554502369669e-05, + "loss": 0.2486, + "step": 926 + }, + { + "epoch": 2.370843989769821, + "grad_norm": 0.194179037810583, + "learning_rate": 1.1706161137440758e-05, + "loss": 0.2419, + "step": 927 + }, + { + "epoch": 2.373401534526854, + "grad_norm": 0.19309339760618768, + "learning_rate": 1.1658767772511848e-05, + "loss": 0.2498, + "step": 928 + }, + { + "epoch": 2.3759590792838874, + "grad_norm": 0.18311942664171635, + "learning_rate": 1.161137440758294e-05, + "loss": 0.2508, + "step": 929 + }, + { + "epoch": 2.3785166240409206, + "grad_norm": 0.1851345033868292, + "learning_rate": 1.1563981042654029e-05, + "loss": 0.2458, + "step": 930 + }, + { + "epoch": 2.381074168797954, + "grad_norm": 0.20539574184587675, + "learning_rate": 1.1516587677725119e-05, + "loss": 0.2535, + "step": 931 + }, + { + "epoch": 2.3836317135549874, + "grad_norm": 0.1892100521466075, + "learning_rate": 1.146919431279621e-05, + "loss": 0.2429, + "step": 932 + }, + { + "epoch": 2.3861892583120206, + "grad_norm": 0.18603312629992957, + "learning_rate": 1.14218009478673e-05, + "loss": 0.2509, + "step": 933 + }, + { + "epoch": 2.3887468030690537, + "grad_norm": 0.19500218393174892, + "learning_rate": 1.137440758293839e-05, + "loss": 0.2566, + "step": 934 + }, + { + "epoch": 2.391304347826087, + "grad_norm": 0.1826094688821734, + "learning_rate": 1.1327014218009479e-05, + "loss": 0.2463, + "step": 935 + }, + { + "epoch": 2.39386189258312, + "grad_norm": 0.2035505913630252, + "learning_rate": 1.1279620853080569e-05, + "loss": 0.2453, + "step": 936 + }, + { + "epoch": 2.3964194373401533, + "grad_norm": 0.1986182294740978, + "learning_rate": 1.1232227488151658e-05, + "loss": 0.2558, + "step": 937 + }, + { + "epoch": 2.398976982097187, + "grad_norm": 0.19707252280447218, + "learning_rate": 1.118483412322275e-05, + "loss": 0.2434, + "step": 938 + }, + { + "epoch": 2.40153452685422, + "grad_norm": 0.18477355859208972, + "learning_rate": 1.113744075829384e-05, + "loss": 0.2593, + "step": 939 + }, + { + "epoch": 2.4040920716112533, + "grad_norm": 0.18942958453392783, + "learning_rate": 1.1090047393364929e-05, + "loss": 0.2356, + "step": 940 + }, + { + "epoch": 2.4066496163682864, + "grad_norm": 0.19115020121012594, + "learning_rate": 1.104265402843602e-05, + "loss": 0.2586, + "step": 941 + }, + { + "epoch": 2.4092071611253196, + "grad_norm": 0.1907623500410302, + "learning_rate": 1.099526066350711e-05, + "loss": 0.2466, + "step": 942 + }, + { + "epoch": 2.411764705882353, + "grad_norm": 0.20551739715864323, + "learning_rate": 1.09478672985782e-05, + "loss": 0.2641, + "step": 943 + }, + { + "epoch": 2.414322250639386, + "grad_norm": 0.2037628239144751, + "learning_rate": 1.0900473933649289e-05, + "loss": 0.2581, + "step": 944 + }, + { + "epoch": 2.4168797953964196, + "grad_norm": 0.20606502494179982, + "learning_rate": 1.0853080568720379e-05, + "loss": 0.2455, + "step": 945 + }, + { + "epoch": 2.419437340153453, + "grad_norm": 0.19957207626288198, + "learning_rate": 1.080568720379147e-05, + "loss": 0.2417, + "step": 946 + }, + { + "epoch": 2.421994884910486, + "grad_norm": 0.19178247803581283, + "learning_rate": 1.075829383886256e-05, + "loss": 0.2491, + "step": 947 + }, + { + "epoch": 2.424552429667519, + "grad_norm": 0.1829585466891497, + "learning_rate": 1.071090047393365e-05, + "loss": 0.2622, + "step": 948 + }, + { + "epoch": 2.4271099744245523, + "grad_norm": 0.19009770566404205, + "learning_rate": 1.066350710900474e-05, + "loss": 0.2472, + "step": 949 + }, + { + "epoch": 2.4296675191815855, + "grad_norm": 0.18826570727560837, + "learning_rate": 1.061611374407583e-05, + "loss": 0.2435, + "step": 950 + }, + { + "epoch": 2.4322250639386187, + "grad_norm": 0.18359777168223823, + "learning_rate": 1.056872037914692e-05, + "loss": 0.2563, + "step": 951 + }, + { + "epoch": 2.4347826086956523, + "grad_norm": 0.18743774264051472, + "learning_rate": 1.052132701421801e-05, + "loss": 0.2501, + "step": 952 + }, + { + "epoch": 2.4373401534526855, + "grad_norm": 0.18190848955579414, + "learning_rate": 1.04739336492891e-05, + "loss": 0.2419, + "step": 953 + }, + { + "epoch": 2.4398976982097187, + "grad_norm": 0.1869056453658558, + "learning_rate": 1.0426540284360189e-05, + "loss": 0.2468, + "step": 954 + }, + { + "epoch": 2.442455242966752, + "grad_norm": 0.19325837584457362, + "learning_rate": 1.037914691943128e-05, + "loss": 0.2398, + "step": 955 + }, + { + "epoch": 2.445012787723785, + "grad_norm": 0.18958397138054392, + "learning_rate": 1.033175355450237e-05, + "loss": 0.2587, + "step": 956 + }, + { + "epoch": 2.4475703324808182, + "grad_norm": 0.1802569001638857, + "learning_rate": 1.028436018957346e-05, + "loss": 0.249, + "step": 957 + }, + { + "epoch": 2.4501278772378514, + "grad_norm": 0.19473776964299236, + "learning_rate": 1.0236966824644551e-05, + "loss": 0.2504, + "step": 958 + }, + { + "epoch": 2.452685421994885, + "grad_norm": 0.19318565328468898, + "learning_rate": 1.018957345971564e-05, + "loss": 0.2504, + "step": 959 + }, + { + "epoch": 2.455242966751918, + "grad_norm": 0.1935892471549821, + "learning_rate": 1.014218009478673e-05, + "loss": 0.2608, + "step": 960 + }, + { + "epoch": 2.4578005115089514, + "grad_norm": 0.2128184199845009, + "learning_rate": 1.009478672985782e-05, + "loss": 0.2547, + "step": 961 + }, + { + "epoch": 2.4603580562659846, + "grad_norm": 0.1894940142447489, + "learning_rate": 1.004739336492891e-05, + "loss": 0.2654, + "step": 962 + }, + { + "epoch": 2.4629156010230178, + "grad_norm": 0.18093993857309348, + "learning_rate": 1e-05, + "loss": 0.2214, + "step": 963 + }, + { + "epoch": 2.4654731457800514, + "grad_norm": 0.19178096580173365, + "learning_rate": 9.95260663507109e-06, + "loss": 0.2428, + "step": 964 + }, + { + "epoch": 2.4680306905370846, + "grad_norm": 0.17992616710306839, + "learning_rate": 9.90521327014218e-06, + "loss": 0.2316, + "step": 965 + }, + { + "epoch": 2.4705882352941178, + "grad_norm": 0.19203487435465688, + "learning_rate": 9.857819905213271e-06, + "loss": 0.2461, + "step": 966 + }, + { + "epoch": 2.473145780051151, + "grad_norm": 0.18682427737935345, + "learning_rate": 9.810426540284361e-06, + "loss": 0.2546, + "step": 967 + }, + { + "epoch": 2.475703324808184, + "grad_norm": 0.18859469892005637, + "learning_rate": 9.76303317535545e-06, + "loss": 0.2622, + "step": 968 + }, + { + "epoch": 2.4782608695652173, + "grad_norm": 0.18438885948965986, + "learning_rate": 9.71563981042654e-06, + "loss": 0.2545, + "step": 969 + }, + { + "epoch": 2.4808184143222505, + "grad_norm": 0.19779141574116138, + "learning_rate": 9.66824644549763e-06, + "loss": 0.244, + "step": 970 + }, + { + "epoch": 2.483375959079284, + "grad_norm": 0.19565635148681754, + "learning_rate": 9.62085308056872e-06, + "loss": 0.2659, + "step": 971 + }, + { + "epoch": 2.4859335038363173, + "grad_norm": 0.17628127887779274, + "learning_rate": 9.573459715639811e-06, + "loss": 0.2445, + "step": 972 + }, + { + "epoch": 2.4884910485933505, + "grad_norm": 0.1800315251692981, + "learning_rate": 9.5260663507109e-06, + "loss": 0.2524, + "step": 973 + }, + { + "epoch": 2.4910485933503836, + "grad_norm": 0.1863667327196091, + "learning_rate": 9.47867298578199e-06, + "loss": 0.2555, + "step": 974 + }, + { + "epoch": 2.493606138107417, + "grad_norm": 0.19474788386072336, + "learning_rate": 9.431279620853082e-06, + "loss": 0.2607, + "step": 975 + }, + { + "epoch": 2.49616368286445, + "grad_norm": 0.18695877540681222, + "learning_rate": 9.383886255924171e-06, + "loss": 0.2594, + "step": 976 + }, + { + "epoch": 2.498721227621483, + "grad_norm": 0.18123819527715856, + "learning_rate": 9.336492890995263e-06, + "loss": 0.2388, + "step": 977 + }, + { + "epoch": 2.501278772378517, + "grad_norm": 0.1805022447990822, + "learning_rate": 9.28909952606635e-06, + "loss": 0.2611, + "step": 978 + }, + { + "epoch": 2.50383631713555, + "grad_norm": 0.20725044894441963, + "learning_rate": 9.24170616113744e-06, + "loss": 0.2597, + "step": 979 + }, + { + "epoch": 2.506393861892583, + "grad_norm": 0.17978028465292306, + "learning_rate": 9.194312796208532e-06, + "loss": 0.2546, + "step": 980 + }, + { + "epoch": 2.5089514066496164, + "grad_norm": 0.19895373521772294, + "learning_rate": 9.146919431279621e-06, + "loss": 0.2592, + "step": 981 + }, + { + "epoch": 2.5115089514066495, + "grad_norm": 0.1908106662263474, + "learning_rate": 9.09952606635071e-06, + "loss": 0.2617, + "step": 982 + }, + { + "epoch": 2.5140664961636827, + "grad_norm": 0.17851227882118903, + "learning_rate": 9.052132701421802e-06, + "loss": 0.2438, + "step": 983 + }, + { + "epoch": 2.516624040920716, + "grad_norm": 0.18752114855298738, + "learning_rate": 9.004739336492892e-06, + "loss": 0.2443, + "step": 984 + }, + { + "epoch": 2.5191815856777495, + "grad_norm": 0.2066530632997492, + "learning_rate": 8.957345971563981e-06, + "loss": 0.2518, + "step": 985 + }, + { + "epoch": 2.5217391304347827, + "grad_norm": 0.17903017399321067, + "learning_rate": 8.909952606635073e-06, + "loss": 0.2551, + "step": 986 + }, + { + "epoch": 2.524296675191816, + "grad_norm": 0.1813455062016161, + "learning_rate": 8.86255924170616e-06, + "loss": 0.2477, + "step": 987 + }, + { + "epoch": 2.526854219948849, + "grad_norm": 0.19991953255257042, + "learning_rate": 8.81516587677725e-06, + "loss": 0.2518, + "step": 988 + }, + { + "epoch": 2.5294117647058822, + "grad_norm": 0.19321012124999268, + "learning_rate": 8.767772511848342e-06, + "loss": 0.2586, + "step": 989 + }, + { + "epoch": 2.531969309462916, + "grad_norm": 0.17877646902912023, + "learning_rate": 8.720379146919431e-06, + "loss": 0.2439, + "step": 990 + }, + { + "epoch": 2.5345268542199486, + "grad_norm": 0.18016347358333068, + "learning_rate": 8.672985781990521e-06, + "loss": 0.2504, + "step": 991 + }, + { + "epoch": 2.5370843989769822, + "grad_norm": 0.19598330654437415, + "learning_rate": 8.625592417061612e-06, + "loss": 0.2402, + "step": 992 + }, + { + "epoch": 2.5396419437340154, + "grad_norm": 0.19085013146167623, + "learning_rate": 8.578199052132702e-06, + "loss": 0.2518, + "step": 993 + }, + { + "epoch": 2.5421994884910486, + "grad_norm": 0.1835210269356536, + "learning_rate": 8.530805687203793e-06, + "loss": 0.2457, + "step": 994 + }, + { + "epoch": 2.544757033248082, + "grad_norm": 0.1808657315125499, + "learning_rate": 8.483412322274883e-06, + "loss": 0.2581, + "step": 995 + }, + { + "epoch": 2.547314578005115, + "grad_norm": 0.17348100084546994, + "learning_rate": 8.436018957345971e-06, + "loss": 0.2361, + "step": 996 + }, + { + "epoch": 2.5498721227621486, + "grad_norm": 0.18526137266320347, + "learning_rate": 8.388625592417062e-06, + "loss": 0.2518, + "step": 997 + }, + { + "epoch": 2.5524296675191813, + "grad_norm": 0.1866598354500443, + "learning_rate": 8.341232227488152e-06, + "loss": 0.2489, + "step": 998 + }, + { + "epoch": 2.554987212276215, + "grad_norm": 0.1866669094430096, + "learning_rate": 8.293838862559241e-06, + "loss": 0.2612, + "step": 999 + }, + { + "epoch": 2.557544757033248, + "grad_norm": 0.20188335606638064, + "learning_rate": 8.246445497630333e-06, + "loss": 0.2394, + "step": 1000 + }, + { + "epoch": 2.5601023017902813, + "grad_norm": 0.17491742752344783, + "learning_rate": 8.199052132701422e-06, + "loss": 0.2377, + "step": 1001 + }, + { + "epoch": 2.5626598465473145, + "grad_norm": 0.17572988640896128, + "learning_rate": 8.151658767772512e-06, + "loss": 0.2523, + "step": 1002 + }, + { + "epoch": 2.5652173913043477, + "grad_norm": 0.1850486906047413, + "learning_rate": 8.104265402843603e-06, + "loss": 0.2604, + "step": 1003 + }, + { + "epoch": 2.5677749360613813, + "grad_norm": 0.18456694735716317, + "learning_rate": 8.056872037914693e-06, + "loss": 0.2579, + "step": 1004 + }, + { + "epoch": 2.5703324808184145, + "grad_norm": 0.1989825021126641, + "learning_rate": 8.009478672985781e-06, + "loss": 0.2456, + "step": 1005 + }, + { + "epoch": 2.5728900255754477, + "grad_norm": 0.19392351458658866, + "learning_rate": 7.962085308056872e-06, + "loss": 0.2542, + "step": 1006 + }, + { + "epoch": 2.575447570332481, + "grad_norm": 0.1803390415874974, + "learning_rate": 7.914691943127962e-06, + "loss": 0.243, + "step": 1007 + }, + { + "epoch": 2.578005115089514, + "grad_norm": 0.18345024591378195, + "learning_rate": 7.867298578199052e-06, + "loss": 0.2451, + "step": 1008 + }, + { + "epoch": 2.580562659846547, + "grad_norm": 0.1941629539774514, + "learning_rate": 7.819905213270143e-06, + "loss": 0.2484, + "step": 1009 + }, + { + "epoch": 2.5831202046035804, + "grad_norm": 0.20207081751890732, + "learning_rate": 7.772511848341233e-06, + "loss": 0.2562, + "step": 1010 + }, + { + "epoch": 2.585677749360614, + "grad_norm": 0.18062688142042024, + "learning_rate": 7.725118483412324e-06, + "loss": 0.2454, + "step": 1011 + }, + { + "epoch": 2.588235294117647, + "grad_norm": 0.18172987412926497, + "learning_rate": 7.677725118483414e-06, + "loss": 0.258, + "step": 1012 + }, + { + "epoch": 2.5907928388746804, + "grad_norm": 0.1910447725518475, + "learning_rate": 7.630331753554503e-06, + "loss": 0.2547, + "step": 1013 + }, + { + "epoch": 2.5933503836317136, + "grad_norm": 0.18183009657939525, + "learning_rate": 7.582938388625594e-06, + "loss": 0.2477, + "step": 1014 + }, + { + "epoch": 2.5959079283887467, + "grad_norm": 0.19084392574095072, + "learning_rate": 7.5355450236966825e-06, + "loss": 0.2535, + "step": 1015 + }, + { + "epoch": 2.59846547314578, + "grad_norm": 0.19660855958741716, + "learning_rate": 7.488151658767772e-06, + "loss": 0.2542, + "step": 1016 + }, + { + "epoch": 2.601023017902813, + "grad_norm": 0.19119145619102845, + "learning_rate": 7.4407582938388635e-06, + "loss": 0.2652, + "step": 1017 + }, + { + "epoch": 2.6035805626598467, + "grad_norm": 0.18403920655569364, + "learning_rate": 7.393364928909953e-06, + "loss": 0.2507, + "step": 1018 + }, + { + "epoch": 2.60613810741688, + "grad_norm": 0.18051231326303438, + "learning_rate": 7.345971563981043e-06, + "loss": 0.2457, + "step": 1019 + }, + { + "epoch": 2.608695652173913, + "grad_norm": 0.18075516190798283, + "learning_rate": 7.298578199052133e-06, + "loss": 0.252, + "step": 1020 + }, + { + "epoch": 2.6112531969309463, + "grad_norm": 0.1788096745482508, + "learning_rate": 7.251184834123223e-06, + "loss": 0.2436, + "step": 1021 + }, + { + "epoch": 2.6138107416879794, + "grad_norm": 0.1824092891963844, + "learning_rate": 7.2037914691943126e-06, + "loss": 0.2432, + "step": 1022 + }, + { + "epoch": 2.6163682864450126, + "grad_norm": 0.16862388714463997, + "learning_rate": 7.156398104265404e-06, + "loss": 0.2432, + "step": 1023 + }, + { + "epoch": 2.618925831202046, + "grad_norm": 0.17677820353677443, + "learning_rate": 7.109004739336493e-06, + "loss": 0.2454, + "step": 1024 + }, + { + "epoch": 2.6214833759590794, + "grad_norm": 0.1749578021536912, + "learning_rate": 7.061611374407582e-06, + "loss": 0.2516, + "step": 1025 + }, + { + "epoch": 2.6240409207161126, + "grad_norm": 0.1746709607811344, + "learning_rate": 7.014218009478674e-06, + "loss": 0.2393, + "step": 1026 + }, + { + "epoch": 2.626598465473146, + "grad_norm": 0.1774898930232003, + "learning_rate": 6.966824644549763e-06, + "loss": 0.2488, + "step": 1027 + }, + { + "epoch": 2.629156010230179, + "grad_norm": 0.17292145541011766, + "learning_rate": 6.919431279620854e-06, + "loss": 0.2439, + "step": 1028 + }, + { + "epoch": 2.631713554987212, + "grad_norm": 0.25017047237469586, + "learning_rate": 6.8720379146919435e-06, + "loss": 0.2666, + "step": 1029 + }, + { + "epoch": 2.634271099744246, + "grad_norm": 0.1802705434620767, + "learning_rate": 6.824644549763033e-06, + "loss": 0.2611, + "step": 1030 + }, + { + "epoch": 2.6368286445012785, + "grad_norm": 0.18448765710220488, + "learning_rate": 6.7772511848341244e-06, + "loss": 0.2407, + "step": 1031 + }, + { + "epoch": 2.639386189258312, + "grad_norm": 0.1740367294783914, + "learning_rate": 6.729857819905214e-06, + "loss": 0.2459, + "step": 1032 + }, + { + "epoch": 2.6419437340153453, + "grad_norm": 0.17677782819853, + "learning_rate": 6.682464454976303e-06, + "loss": 0.2531, + "step": 1033 + }, + { + "epoch": 2.6445012787723785, + "grad_norm": 0.18226239340272907, + "learning_rate": 6.635071090047394e-06, + "loss": 0.2488, + "step": 1034 + }, + { + "epoch": 2.6470588235294117, + "grad_norm": 0.17405875174384855, + "learning_rate": 6.587677725118484e-06, + "loss": 0.2445, + "step": 1035 + }, + { + "epoch": 2.649616368286445, + "grad_norm": 0.1776309384953249, + "learning_rate": 6.5402843601895735e-06, + "loss": 0.2401, + "step": 1036 + }, + { + "epoch": 2.6521739130434785, + "grad_norm": 0.18334105397117928, + "learning_rate": 6.492890995260664e-06, + "loss": 0.2503, + "step": 1037 + }, + { + "epoch": 2.6547314578005117, + "grad_norm": 0.17986542354916346, + "learning_rate": 6.445497630331754e-06, + "loss": 0.2489, + "step": 1038 + }, + { + "epoch": 2.657289002557545, + "grad_norm": 0.17690617151115767, + "learning_rate": 6.398104265402843e-06, + "loss": 0.2403, + "step": 1039 + }, + { + "epoch": 2.659846547314578, + "grad_norm": 0.18280012481225613, + "learning_rate": 6.350710900473935e-06, + "loss": 0.2498, + "step": 1040 + }, + { + "epoch": 2.662404092071611, + "grad_norm": 0.17506102024381995, + "learning_rate": 6.303317535545024e-06, + "loss": 0.2308, + "step": 1041 + }, + { + "epoch": 2.6649616368286444, + "grad_norm": 0.18790705934428378, + "learning_rate": 6.255924170616113e-06, + "loss": 0.2529, + "step": 1042 + }, + { + "epoch": 2.6675191815856776, + "grad_norm": 0.1892712596775323, + "learning_rate": 6.208530805687204e-06, + "loss": 0.2432, + "step": 1043 + }, + { + "epoch": 2.670076726342711, + "grad_norm": 0.19005999786944971, + "learning_rate": 6.161137440758294e-06, + "loss": 0.2423, + "step": 1044 + }, + { + "epoch": 2.6726342710997444, + "grad_norm": 0.1845872169401998, + "learning_rate": 6.1137440758293845e-06, + "loss": 0.2593, + "step": 1045 + }, + { + "epoch": 2.6751918158567776, + "grad_norm": 0.18704678458411442, + "learning_rate": 6.066350710900474e-06, + "loss": 0.2391, + "step": 1046 + }, + { + "epoch": 2.6777493606138107, + "grad_norm": 0.17913018163417851, + "learning_rate": 6.018957345971564e-06, + "loss": 0.2405, + "step": 1047 + }, + { + "epoch": 2.680306905370844, + "grad_norm": 0.19472560672322844, + "learning_rate": 5.971563981042654e-06, + "loss": 0.2505, + "step": 1048 + }, + { + "epoch": 2.682864450127877, + "grad_norm": 0.18019457992632396, + "learning_rate": 5.924170616113745e-06, + "loss": 0.24, + "step": 1049 + }, + { + "epoch": 2.6854219948849103, + "grad_norm": 0.17330920592908153, + "learning_rate": 5.876777251184834e-06, + "loss": 0.2544, + "step": 1050 + }, + { + "epoch": 2.687979539641944, + "grad_norm": 0.1815334540303024, + "learning_rate": 5.829383886255924e-06, + "loss": 0.2466, + "step": 1051 + }, + { + "epoch": 2.690537084398977, + "grad_norm": 0.19496083605348435, + "learning_rate": 5.7819905213270145e-06, + "loss": 0.252, + "step": 1052 + }, + { + "epoch": 2.6930946291560103, + "grad_norm": 0.19241164389164786, + "learning_rate": 5.734597156398105e-06, + "loss": 0.2504, + "step": 1053 + }, + { + "epoch": 2.6956521739130435, + "grad_norm": 0.45538670179365104, + "learning_rate": 5.687203791469195e-06, + "loss": 0.26, + "step": 1054 + }, + { + "epoch": 2.6982097186700766, + "grad_norm": 0.18008218699297715, + "learning_rate": 5.639810426540284e-06, + "loss": 0.2493, + "step": 1055 + }, + { + "epoch": 2.70076726342711, + "grad_norm": 0.18340270921249122, + "learning_rate": 5.592417061611375e-06, + "loss": 0.237, + "step": 1056 + }, + { + "epoch": 2.703324808184143, + "grad_norm": 0.22199762048479815, + "learning_rate": 5.5450236966824644e-06, + "loss": 0.2509, + "step": 1057 + }, + { + "epoch": 2.7058823529411766, + "grad_norm": 0.1761889836482758, + "learning_rate": 5.497630331753555e-06, + "loss": 0.2502, + "step": 1058 + }, + { + "epoch": 2.70843989769821, + "grad_norm": 0.1788757958818801, + "learning_rate": 5.4502369668246446e-06, + "loss": 0.2548, + "step": 1059 + }, + { + "epoch": 2.710997442455243, + "grad_norm": 0.17679336863109477, + "learning_rate": 5.402843601895735e-06, + "loss": 0.2511, + "step": 1060 + }, + { + "epoch": 2.713554987212276, + "grad_norm": 0.18320215458542266, + "learning_rate": 5.355450236966825e-06, + "loss": 0.2394, + "step": 1061 + }, + { + "epoch": 2.7161125319693094, + "grad_norm": 0.1954816098192057, + "learning_rate": 5.308056872037915e-06, + "loss": 0.2686, + "step": 1062 + }, + { + "epoch": 2.718670076726343, + "grad_norm": 0.17601162272580267, + "learning_rate": 5.260663507109005e-06, + "loss": 0.2428, + "step": 1063 + }, + { + "epoch": 2.7212276214833757, + "grad_norm": 0.17667083168107164, + "learning_rate": 5.2132701421800945e-06, + "loss": 0.2538, + "step": 1064 + }, + { + "epoch": 2.7237851662404093, + "grad_norm": 0.17964593927923134, + "learning_rate": 5.165876777251185e-06, + "loss": 0.2515, + "step": 1065 + }, + { + "epoch": 2.7263427109974425, + "grad_norm": 0.18415215339401061, + "learning_rate": 5.1184834123222755e-06, + "loss": 0.2467, + "step": 1066 + }, + { + "epoch": 2.7289002557544757, + "grad_norm": 0.18206983493291928, + "learning_rate": 5.071090047393365e-06, + "loss": 0.2505, + "step": 1067 + }, + { + "epoch": 2.731457800511509, + "grad_norm": 0.17702957807117964, + "learning_rate": 5.023696682464455e-06, + "loss": 0.2481, + "step": 1068 + }, + { + "epoch": 2.734015345268542, + "grad_norm": 0.18941559797605062, + "learning_rate": 4.976303317535545e-06, + "loss": 0.2515, + "step": 1069 + }, + { + "epoch": 2.7365728900255757, + "grad_norm": 0.17866021653822645, + "learning_rate": 4.928909952606636e-06, + "loss": 0.2573, + "step": 1070 + }, + { + "epoch": 2.7391304347826084, + "grad_norm": 0.1771695946103805, + "learning_rate": 4.881516587677725e-06, + "loss": 0.2468, + "step": 1071 + }, + { + "epoch": 2.741687979539642, + "grad_norm": 0.18877889489990468, + "learning_rate": 4.834123222748815e-06, + "loss": 0.2513, + "step": 1072 + }, + { + "epoch": 2.7442455242966752, + "grad_norm": 0.18206987667415467, + "learning_rate": 4.7867298578199055e-06, + "loss": 0.261, + "step": 1073 + }, + { + "epoch": 2.7468030690537084, + "grad_norm": 0.17701923743513961, + "learning_rate": 4.739336492890995e-06, + "loss": 0.2382, + "step": 1074 + }, + { + "epoch": 2.7493606138107416, + "grad_norm": 0.17540739260356206, + "learning_rate": 4.691943127962086e-06, + "loss": 0.2252, + "step": 1075 + }, + { + "epoch": 2.7519181585677748, + "grad_norm": 0.18630964353133092, + "learning_rate": 4.644549763033175e-06, + "loss": 0.2641, + "step": 1076 + }, + { + "epoch": 2.7544757033248084, + "grad_norm": 0.18014938799060165, + "learning_rate": 4.597156398104266e-06, + "loss": 0.2474, + "step": 1077 + }, + { + "epoch": 2.7570332480818416, + "grad_norm": 0.17307796418005664, + "learning_rate": 4.549763033175355e-06, + "loss": 0.2494, + "step": 1078 + }, + { + "epoch": 2.7595907928388748, + "grad_norm": 0.17931809836460505, + "learning_rate": 4.502369668246446e-06, + "loss": 0.2489, + "step": 1079 + }, + { + "epoch": 2.762148337595908, + "grad_norm": 0.18594644827320997, + "learning_rate": 4.454976303317536e-06, + "loss": 0.2592, + "step": 1080 + }, + { + "epoch": 2.764705882352941, + "grad_norm": 0.1761943169656133, + "learning_rate": 4.407582938388625e-06, + "loss": 0.2478, + "step": 1081 + }, + { + "epoch": 2.7672634271099743, + "grad_norm": 0.173916317774858, + "learning_rate": 4.360189573459716e-06, + "loss": 0.247, + "step": 1082 + }, + { + "epoch": 2.7698209718670075, + "grad_norm": 0.17873537641991927, + "learning_rate": 4.312796208530806e-06, + "loss": 0.2506, + "step": 1083 + }, + { + "epoch": 2.772378516624041, + "grad_norm": 0.16911433381467955, + "learning_rate": 4.265402843601897e-06, + "loss": 0.2508, + "step": 1084 + }, + { + "epoch": 2.7749360613810743, + "grad_norm": 0.1791375462513097, + "learning_rate": 4.2180094786729854e-06, + "loss": 0.2388, + "step": 1085 + }, + { + "epoch": 2.7774936061381075, + "grad_norm": 0.1784797467796097, + "learning_rate": 4.170616113744076e-06, + "loss": 0.2572, + "step": 1086 + }, + { + "epoch": 2.7800511508951407, + "grad_norm": 0.18492104457145292, + "learning_rate": 4.123222748815166e-06, + "loss": 0.2384, + "step": 1087 + }, + { + "epoch": 2.782608695652174, + "grad_norm": 0.18012702780394255, + "learning_rate": 4.075829383886256e-06, + "loss": 0.2307, + "step": 1088 + }, + { + "epoch": 2.785166240409207, + "grad_norm": 0.17320341741197393, + "learning_rate": 4.0284360189573465e-06, + "loss": 0.2383, + "step": 1089 + }, + { + "epoch": 2.78772378516624, + "grad_norm": 0.17354330549732871, + "learning_rate": 3.981042654028436e-06, + "loss": 0.2284, + "step": 1090 + }, + { + "epoch": 2.790281329923274, + "grad_norm": 0.1817183194818464, + "learning_rate": 3.933649289099526e-06, + "loss": 0.2325, + "step": 1091 + }, + { + "epoch": 2.792838874680307, + "grad_norm": 0.1740902128176595, + "learning_rate": 3.886255924170616e-06, + "loss": 0.2464, + "step": 1092 + }, + { + "epoch": 2.79539641943734, + "grad_norm": 0.17979740364634914, + "learning_rate": 3.838862559241707e-06, + "loss": 0.2448, + "step": 1093 + }, + { + "epoch": 2.7979539641943734, + "grad_norm": 0.18910478213308557, + "learning_rate": 3.791469194312797e-06, + "loss": 0.2529, + "step": 1094 + }, + { + "epoch": 2.8005115089514065, + "grad_norm": 0.17562387593048473, + "learning_rate": 3.744075829383886e-06, + "loss": 0.2516, + "step": 1095 + }, + { + "epoch": 2.80306905370844, + "grad_norm": 0.17037649183598133, + "learning_rate": 3.6966824644549766e-06, + "loss": 0.2304, + "step": 1096 + }, + { + "epoch": 2.805626598465473, + "grad_norm": 0.1865715453669857, + "learning_rate": 3.6492890995260666e-06, + "loss": 0.2484, + "step": 1097 + }, + { + "epoch": 2.8081841432225065, + "grad_norm": 0.17956564469501413, + "learning_rate": 3.6018957345971563e-06, + "loss": 0.2429, + "step": 1098 + }, + { + "epoch": 2.8107416879795397, + "grad_norm": 0.17380201982016105, + "learning_rate": 3.5545023696682464e-06, + "loss": 0.2475, + "step": 1099 + }, + { + "epoch": 2.813299232736573, + "grad_norm": 0.18949661964378972, + "learning_rate": 3.507109004739337e-06, + "loss": 0.254, + "step": 1100 + }, + { + "epoch": 2.815856777493606, + "grad_norm": 0.18281900420620492, + "learning_rate": 3.459715639810427e-06, + "loss": 0.246, + "step": 1101 + }, + { + "epoch": 2.8184143222506393, + "grad_norm": 0.19046092151157248, + "learning_rate": 3.4123222748815165e-06, + "loss": 0.252, + "step": 1102 + }, + { + "epoch": 2.820971867007673, + "grad_norm": 0.17912805262085352, + "learning_rate": 3.364928909952607e-06, + "loss": 0.2528, + "step": 1103 + }, + { + "epoch": 2.8235294117647056, + "grad_norm": 0.16539721286530049, + "learning_rate": 3.317535545023697e-06, + "loss": 0.2452, + "step": 1104 + }, + { + "epoch": 2.8260869565217392, + "grad_norm": 0.18089995003561432, + "learning_rate": 3.2701421800947867e-06, + "loss": 0.2442, + "step": 1105 + }, + { + "epoch": 2.8286445012787724, + "grad_norm": 0.17961058615086165, + "learning_rate": 3.222748815165877e-06, + "loss": 0.2532, + "step": 1106 + }, + { + "epoch": 2.8312020460358056, + "grad_norm": 0.17670809278729904, + "learning_rate": 3.1753554502369673e-06, + "loss": 0.2469, + "step": 1107 + }, + { + "epoch": 2.833759590792839, + "grad_norm": 0.17184672240491808, + "learning_rate": 3.1279620853080565e-06, + "loss": 0.253, + "step": 1108 + }, + { + "epoch": 2.836317135549872, + "grad_norm": 0.18342580492222546, + "learning_rate": 3.080568720379147e-06, + "loss": 0.2447, + "step": 1109 + }, + { + "epoch": 2.8388746803069056, + "grad_norm": 0.17195526482252144, + "learning_rate": 3.033175355450237e-06, + "loss": 0.2463, + "step": 1110 + }, + { + "epoch": 2.8414322250639388, + "grad_norm": 0.1792058703015505, + "learning_rate": 2.985781990521327e-06, + "loss": 0.2498, + "step": 1111 + }, + { + "epoch": 2.843989769820972, + "grad_norm": 0.17565132753951782, + "learning_rate": 2.938388625592417e-06, + "loss": 0.2553, + "step": 1112 + }, + { + "epoch": 2.846547314578005, + "grad_norm": 0.18056116078607748, + "learning_rate": 2.8909952606635073e-06, + "loss": 0.254, + "step": 1113 + }, + { + "epoch": 2.8491048593350383, + "grad_norm": 0.17874160432603925, + "learning_rate": 2.8436018957345973e-06, + "loss": 0.246, + "step": 1114 + }, + { + "epoch": 2.8516624040920715, + "grad_norm": 0.17126107844733118, + "learning_rate": 2.7962085308056874e-06, + "loss": 0.2437, + "step": 1115 + }, + { + "epoch": 2.8542199488491047, + "grad_norm": 0.16804735225501954, + "learning_rate": 2.7488151658767775e-06, + "loss": 0.2338, + "step": 1116 + }, + { + "epoch": 2.8567774936061383, + "grad_norm": 0.17871874445538027, + "learning_rate": 2.7014218009478675e-06, + "loss": 0.2504, + "step": 1117 + }, + { + "epoch": 2.8593350383631715, + "grad_norm": 0.16605891064626507, + "learning_rate": 2.6540284360189576e-06, + "loss": 0.2431, + "step": 1118 + }, + { + "epoch": 2.8618925831202047, + "grad_norm": 0.1803054733026333, + "learning_rate": 2.6066350710900472e-06, + "loss": 0.2508, + "step": 1119 + }, + { + "epoch": 2.864450127877238, + "grad_norm": 0.17422585403639088, + "learning_rate": 2.5592417061611377e-06, + "loss": 0.2462, + "step": 1120 + }, + { + "epoch": 2.867007672634271, + "grad_norm": 0.17364151884560752, + "learning_rate": 2.5118483412322274e-06, + "loss": 0.2583, + "step": 1121 + }, + { + "epoch": 2.869565217391304, + "grad_norm": 0.1746615119917103, + "learning_rate": 2.464454976303318e-06, + "loss": 0.2421, + "step": 1122 + }, + { + "epoch": 2.8721227621483374, + "grad_norm": 0.17498173832710498, + "learning_rate": 2.4170616113744075e-06, + "loss": 0.2437, + "step": 1123 + }, + { + "epoch": 2.874680306905371, + "grad_norm": 0.16581915880183135, + "learning_rate": 2.3696682464454976e-06, + "loss": 0.2398, + "step": 1124 + }, + { + "epoch": 2.877237851662404, + "grad_norm": 0.16997023135551514, + "learning_rate": 2.3222748815165876e-06, + "loss": 0.2467, + "step": 1125 + }, + { + "epoch": 2.8797953964194374, + "grad_norm": 0.1691406192934443, + "learning_rate": 2.2748815165876777e-06, + "loss": 0.238, + "step": 1126 + }, + { + "epoch": 2.8823529411764706, + "grad_norm": 0.178904797391566, + "learning_rate": 2.227488151658768e-06, + "loss": 0.2664, + "step": 1127 + }, + { + "epoch": 2.8849104859335037, + "grad_norm": 0.17160493563940152, + "learning_rate": 2.180094786729858e-06, + "loss": 0.2405, + "step": 1128 + }, + { + "epoch": 2.887468030690537, + "grad_norm": 0.17542566078202718, + "learning_rate": 2.1327014218009483e-06, + "loss": 0.2514, + "step": 1129 + }, + { + "epoch": 2.89002557544757, + "grad_norm": 0.18375078276401854, + "learning_rate": 2.085308056872038e-06, + "loss": 0.2558, + "step": 1130 + }, + { + "epoch": 2.8925831202046037, + "grad_norm": 0.1746613382211159, + "learning_rate": 2.037914691943128e-06, + "loss": 0.2498, + "step": 1131 + }, + { + "epoch": 2.895140664961637, + "grad_norm": 0.17688085965152694, + "learning_rate": 1.990521327014218e-06, + "loss": 0.2439, + "step": 1132 + }, + { + "epoch": 2.89769820971867, + "grad_norm": 0.17262679123443198, + "learning_rate": 1.943127962085308e-06, + "loss": 0.232, + "step": 1133 + }, + { + "epoch": 2.9002557544757033, + "grad_norm": 0.16308493274857086, + "learning_rate": 1.8957345971563984e-06, + "loss": 0.2372, + "step": 1134 + }, + { + "epoch": 2.9028132992327365, + "grad_norm": 0.17307065518752035, + "learning_rate": 1.8483412322274883e-06, + "loss": 0.2496, + "step": 1135 + }, + { + "epoch": 2.90537084398977, + "grad_norm": 0.17570971869354174, + "learning_rate": 1.8009478672985781e-06, + "loss": 0.255, + "step": 1136 + }, + { + "epoch": 2.907928388746803, + "grad_norm": 0.17365101323268625, + "learning_rate": 1.7535545023696684e-06, + "loss": 0.2463, + "step": 1137 + }, + { + "epoch": 2.9104859335038364, + "grad_norm": 0.17347423801664208, + "learning_rate": 1.7061611374407583e-06, + "loss": 0.2596, + "step": 1138 + }, + { + "epoch": 2.9130434782608696, + "grad_norm": 0.16784563506361547, + "learning_rate": 1.6587677725118486e-06, + "loss": 0.2389, + "step": 1139 + }, + { + "epoch": 2.915601023017903, + "grad_norm": 0.1815931456665935, + "learning_rate": 1.6113744075829384e-06, + "loss": 0.2601, + "step": 1140 + }, + { + "epoch": 2.918158567774936, + "grad_norm": 0.179248730479389, + "learning_rate": 1.5639810426540283e-06, + "loss": 0.2613, + "step": 1141 + }, + { + "epoch": 2.920716112531969, + "grad_norm": 0.16773919256420614, + "learning_rate": 1.5165876777251185e-06, + "loss": 0.2439, + "step": 1142 + }, + { + "epoch": 2.923273657289003, + "grad_norm": 0.17107783453591816, + "learning_rate": 1.4691943127962086e-06, + "loss": 0.2457, + "step": 1143 + }, + { + "epoch": 2.9258312020460355, + "grad_norm": 0.17079854678851109, + "learning_rate": 1.4218009478672987e-06, + "loss": 0.2435, + "step": 1144 + }, + { + "epoch": 2.928388746803069, + "grad_norm": 0.16349978016152963, + "learning_rate": 1.3744075829383887e-06, + "loss": 0.2304, + "step": 1145 + }, + { + "epoch": 2.9309462915601023, + "grad_norm": 0.1673539709565629, + "learning_rate": 1.3270142180094788e-06, + "loss": 0.2436, + "step": 1146 + }, + { + "epoch": 2.9335038363171355, + "grad_norm": 0.1708287520831406, + "learning_rate": 1.2796208530805689e-06, + "loss": 0.2622, + "step": 1147 + }, + { + "epoch": 2.9360613810741687, + "grad_norm": 0.17066789815162614, + "learning_rate": 1.232227488151659e-06, + "loss": 0.2493, + "step": 1148 + }, + { + "epoch": 2.938618925831202, + "grad_norm": 0.1716388233712423, + "learning_rate": 1.1848341232227488e-06, + "loss": 0.2501, + "step": 1149 + }, + { + "epoch": 2.9411764705882355, + "grad_norm": 0.18539699654347663, + "learning_rate": 1.1374407582938388e-06, + "loss": 0.2544, + "step": 1150 + }, + { + "epoch": 2.9437340153452687, + "grad_norm": 0.1685342419724911, + "learning_rate": 1.090047393364929e-06, + "loss": 0.2563, + "step": 1151 + }, + { + "epoch": 2.946291560102302, + "grad_norm": 0.17685764883456795, + "learning_rate": 1.042654028436019e-06, + "loss": 0.2484, + "step": 1152 + }, + { + "epoch": 2.948849104859335, + "grad_norm": 0.17233103902620187, + "learning_rate": 9.95260663507109e-07, + "loss": 0.2471, + "step": 1153 + }, + { + "epoch": 2.9514066496163682, + "grad_norm": 0.17927170332506637, + "learning_rate": 9.478672985781992e-07, + "loss": 0.2463, + "step": 1154 + }, + { + "epoch": 2.9539641943734014, + "grad_norm": 0.16939187906182812, + "learning_rate": 9.004739336492891e-07, + "loss": 0.2449, + "step": 1155 + }, + { + "epoch": 2.9565217391304346, + "grad_norm": 0.16994779580235087, + "learning_rate": 8.530805687203791e-07, + "loss": 0.2449, + "step": 1156 + }, + { + "epoch": 2.959079283887468, + "grad_norm": 0.1699706601774477, + "learning_rate": 8.056872037914692e-07, + "loss": 0.2481, + "step": 1157 + }, + { + "epoch": 2.9616368286445014, + "grad_norm": 0.16870847051946605, + "learning_rate": 7.582938388625593e-07, + "loss": 0.246, + "step": 1158 + }, + { + "epoch": 2.9641943734015346, + "grad_norm": 0.1698101486377668, + "learning_rate": 7.109004739336493e-07, + "loss": 0.2554, + "step": 1159 + }, + { + "epoch": 2.9667519181585678, + "grad_norm": 0.16651213070393764, + "learning_rate": 6.635071090047394e-07, + "loss": 0.2358, + "step": 1160 + }, + { + "epoch": 2.969309462915601, + "grad_norm": 0.17237546004566973, + "learning_rate": 6.161137440758295e-07, + "loss": 0.2593, + "step": 1161 + }, + { + "epoch": 2.971867007672634, + "grad_norm": 0.1669612454811503, + "learning_rate": 5.687203791469194e-07, + "loss": 0.2422, + "step": 1162 + }, + { + "epoch": 2.9744245524296673, + "grad_norm": 0.16627677687780293, + "learning_rate": 5.213270142180095e-07, + "loss": 0.2524, + "step": 1163 + }, + { + "epoch": 2.976982097186701, + "grad_norm": 0.17381593936793757, + "learning_rate": 4.739336492890996e-07, + "loss": 0.2605, + "step": 1164 + }, + { + "epoch": 2.979539641943734, + "grad_norm": 0.1685052599832634, + "learning_rate": 4.2654028436018957e-07, + "loss": 0.2436, + "step": 1165 + }, + { + "epoch": 2.9820971867007673, + "grad_norm": 0.16629494329700928, + "learning_rate": 3.7914691943127963e-07, + "loss": 0.2509, + "step": 1166 + }, + { + "epoch": 2.9846547314578005, + "grad_norm": 0.17193426032210676, + "learning_rate": 3.317535545023697e-07, + "loss": 0.2525, + "step": 1167 + }, + { + "epoch": 2.9872122762148337, + "grad_norm": 0.1691249872952514, + "learning_rate": 2.843601895734597e-07, + "loss": 0.2471, + "step": 1168 + }, + { + "epoch": 2.9897698209718673, + "grad_norm": 0.16940746272899151, + "learning_rate": 2.369668246445498e-07, + "loss": 0.2421, + "step": 1169 + }, + { + "epoch": 2.9923273657289, + "grad_norm": 0.16950720483754556, + "learning_rate": 1.8957345971563982e-07, + "loss": 0.252, + "step": 1170 + }, + { + "epoch": 2.9948849104859336, + "grad_norm": 0.16465075098818885, + "learning_rate": 1.4218009478672986e-07, + "loss": 0.246, + "step": 1171 + }, + { + "epoch": 2.997442455242967, + "grad_norm": 0.1658083222308387, + "learning_rate": 9.478672985781991e-08, + "loss": 0.2591, + "step": 1172 + }, + { + "epoch": 3.0, + "grad_norm": 0.17583311315224384, + "learning_rate": 4.7393364928909954e-08, + "loss": 0.2248, + "step": 1173 + }, + { + "epoch": 3.0, + "step": 1173, + "total_flos": 1.3044690334083187e+19, + "train_loss": 0.4372467596944539, + "train_runtime": 36845.5005, + "train_samples_per_second": 0.509, + "train_steps_per_second": 0.032 + } + ], + "logging_steps": 1, + "max_steps": 1173, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.3044690334083187e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}