diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,20419 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 2500, + "global_step": 2911, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003435983335480823, + "grad_norm": 0.2608718276023865, + "learning_rate": 0.0, + "loss": 11.9378, + "step": 1 + }, + { + "epoch": 0.0006871966670961646, + "grad_norm": 0.27573105692863464, + "learning_rate": 1.7123287671232875e-06, + "loss": 11.9422, + "step": 2 + }, + { + "epoch": 0.001030795000644247, + "grad_norm": 0.22586911916732788, + "learning_rate": 3.424657534246575e-06, + "loss": 11.9418, + "step": 3 + }, + { + "epoch": 0.0013743933341923292, + "grad_norm": 0.23945586383342743, + "learning_rate": 5.136986301369863e-06, + "loss": 11.9397, + "step": 4 + }, + { + "epoch": 0.0017179916677404115, + "grad_norm": 0.2580551207065582, + "learning_rate": 6.84931506849315e-06, + "loss": 11.9401, + "step": 5 + }, + { + "epoch": 0.002061590001288494, + "grad_norm": 0.24455508589744568, + "learning_rate": 8.561643835616438e-06, + "loss": 11.9391, + "step": 6 + }, + { + "epoch": 0.002405188334836576, + "grad_norm": 0.26748892664909363, + "learning_rate": 1.0273972602739726e-05, + "loss": 11.9398, + "step": 7 + }, + { + "epoch": 0.0027487866683846584, + "grad_norm": 0.26134631037712097, + "learning_rate": 1.1986301369863013e-05, + "loss": 11.9397, + "step": 8 + }, + { + "epoch": 0.0030923850019327404, + "grad_norm": 0.23617519438266754, + "learning_rate": 1.36986301369863e-05, + "loss": 11.9416, + "step": 9 + }, + { + "epoch": 0.003435983335480823, + "grad_norm": 0.24465899169445038, + "learning_rate": 1.541095890410959e-05, + "loss": 11.938, + "step": 10 + }, + { + "epoch": 0.0037795816690289054, + "grad_norm": 0.23813503980636597, + "learning_rate": 1.7123287671232875e-05, + "loss": 11.9394, + "step": 11 + }, + { + "epoch": 0.004123180002576988, + "grad_norm": 0.25183629989624023, + "learning_rate": 1.8835616438356162e-05, + "loss": 11.9392, + "step": 12 + }, + { + "epoch": 0.00446677833612507, + "grad_norm": 0.22182029485702515, + "learning_rate": 2.0547945205479453e-05, + "loss": 11.9393, + "step": 13 + }, + { + "epoch": 0.004810376669673152, + "grad_norm": 0.22089728713035583, + "learning_rate": 2.226027397260274e-05, + "loss": 11.9374, + "step": 14 + }, + { + "epoch": 0.005153975003221235, + "grad_norm": 0.22943605482578278, + "learning_rate": 2.3972602739726026e-05, + "loss": 11.9379, + "step": 15 + }, + { + "epoch": 0.005497573336769317, + "grad_norm": 0.25133973360061646, + "learning_rate": 2.5684931506849313e-05, + "loss": 11.936, + "step": 16 + }, + { + "epoch": 0.005841171670317399, + "grad_norm": 0.2602233290672302, + "learning_rate": 2.73972602739726e-05, + "loss": 11.9316, + "step": 17 + }, + { + "epoch": 0.006184770003865481, + "grad_norm": 0.23244044184684753, + "learning_rate": 2.910958904109589e-05, + "loss": 11.9329, + "step": 18 + }, + { + "epoch": 0.006528368337413564, + "grad_norm": 0.2674640417098999, + "learning_rate": 3.082191780821918e-05, + "loss": 11.9311, + "step": 19 + }, + { + "epoch": 0.006871966670961646, + "grad_norm": 0.24958398938179016, + "learning_rate": 3.2534246575342464e-05, + "loss": 11.9319, + "step": 20 + }, + { + "epoch": 0.007215565004509728, + "grad_norm": 0.28795570135116577, + "learning_rate": 3.424657534246575e-05, + "loss": 11.9309, + "step": 21 + }, + { + "epoch": 0.007559163338057811, + "grad_norm": 0.27296486496925354, + "learning_rate": 3.595890410958904e-05, + "loss": 11.9292, + "step": 22 + }, + { + "epoch": 0.007902761671605892, + "grad_norm": 0.29060548543930054, + "learning_rate": 3.7671232876712325e-05, + "loss": 11.93, + "step": 23 + }, + { + "epoch": 0.008246360005153976, + "grad_norm": 0.2740913927555084, + "learning_rate": 3.938356164383562e-05, + "loss": 11.9286, + "step": 24 + }, + { + "epoch": 0.008589958338702058, + "grad_norm": 0.29341545701026917, + "learning_rate": 4.1095890410958905e-05, + "loss": 11.9271, + "step": 25 + }, + { + "epoch": 0.00893355667225014, + "grad_norm": 0.323218435049057, + "learning_rate": 4.280821917808219e-05, + "loss": 11.9232, + "step": 26 + }, + { + "epoch": 0.009277155005798222, + "grad_norm": 0.3378582000732422, + "learning_rate": 4.452054794520548e-05, + "loss": 11.9198, + "step": 27 + }, + { + "epoch": 0.009620753339346304, + "grad_norm": 0.36083006858825684, + "learning_rate": 4.6232876712328766e-05, + "loss": 11.9189, + "step": 28 + }, + { + "epoch": 0.009964351672894386, + "grad_norm": 0.3552938401699066, + "learning_rate": 4.794520547945205e-05, + "loss": 11.9171, + "step": 29 + }, + { + "epoch": 0.01030795000644247, + "grad_norm": 0.3807588219642639, + "learning_rate": 4.965753424657534e-05, + "loss": 11.9156, + "step": 30 + }, + { + "epoch": 0.010651548339990552, + "grad_norm": 0.3817954957485199, + "learning_rate": 5.1369863013698626e-05, + "loss": 11.9133, + "step": 31 + }, + { + "epoch": 0.010995146673538634, + "grad_norm": 0.42127540707588196, + "learning_rate": 5.308219178082191e-05, + "loss": 11.9084, + "step": 32 + }, + { + "epoch": 0.011338745007086716, + "grad_norm": 0.42336493730545044, + "learning_rate": 5.47945205479452e-05, + "loss": 11.9081, + "step": 33 + }, + { + "epoch": 0.011682343340634798, + "grad_norm": 0.4692400097846985, + "learning_rate": 5.6506849315068494e-05, + "loss": 11.8979, + "step": 34 + }, + { + "epoch": 0.01202594167418288, + "grad_norm": 0.45885521173477173, + "learning_rate": 5.821917808219178e-05, + "loss": 11.898, + "step": 35 + }, + { + "epoch": 0.012369540007730962, + "grad_norm": 0.48935025930404663, + "learning_rate": 5.993150684931507e-05, + "loss": 11.8932, + "step": 36 + }, + { + "epoch": 0.012713138341279046, + "grad_norm": 0.5434125065803528, + "learning_rate": 6.164383561643835e-05, + "loss": 11.8905, + "step": 37 + }, + { + "epoch": 0.013056736674827128, + "grad_norm": 0.5894971489906311, + "learning_rate": 6.335616438356165e-05, + "loss": 11.8791, + "step": 38 + }, + { + "epoch": 0.01340033500837521, + "grad_norm": 0.5886083841323853, + "learning_rate": 6.506849315068493e-05, + "loss": 11.879, + "step": 39 + }, + { + "epoch": 0.013743933341923292, + "grad_norm": 0.5979152321815491, + "learning_rate": 6.678082191780822e-05, + "loss": 11.8734, + "step": 40 + }, + { + "epoch": 0.014087531675471374, + "grad_norm": 0.6167778372764587, + "learning_rate": 6.84931506849315e-05, + "loss": 11.8688, + "step": 41 + }, + { + "epoch": 0.014431130009019456, + "grad_norm": 0.6409335732460022, + "learning_rate": 7.02054794520548e-05, + "loss": 11.858, + "step": 42 + }, + { + "epoch": 0.014774728342567538, + "grad_norm": 0.6955194473266602, + "learning_rate": 7.191780821917808e-05, + "loss": 11.8506, + "step": 43 + }, + { + "epoch": 0.015118326676115621, + "grad_norm": 0.7092486619949341, + "learning_rate": 7.363013698630137e-05, + "loss": 11.8318, + "step": 44 + }, + { + "epoch": 0.015461925009663703, + "grad_norm": 0.6970604658126831, + "learning_rate": 7.534246575342465e-05, + "loss": 11.833, + "step": 45 + }, + { + "epoch": 0.015805523343211784, + "grad_norm": 0.7087641358375549, + "learning_rate": 7.705479452054794e-05, + "loss": 11.8274, + "step": 46 + }, + { + "epoch": 0.01614912167675987, + "grad_norm": 0.7028504014015198, + "learning_rate": 7.876712328767124e-05, + "loss": 11.8209, + "step": 47 + }, + { + "epoch": 0.01649272001030795, + "grad_norm": 0.6933362483978271, + "learning_rate": 8.047945205479453e-05, + "loss": 11.8089, + "step": 48 + }, + { + "epoch": 0.016836318343856033, + "grad_norm": 0.7153650522232056, + "learning_rate": 8.219178082191781e-05, + "loss": 11.7998, + "step": 49 + }, + { + "epoch": 0.017179916677404115, + "grad_norm": 0.7396632432937622, + "learning_rate": 8.39041095890411e-05, + "loss": 11.7903, + "step": 50 + }, + { + "epoch": 0.017523515010952197, + "grad_norm": 0.7029207348823547, + "learning_rate": 8.561643835616438e-05, + "loss": 11.7541, + "step": 51 + }, + { + "epoch": 0.01786711334450028, + "grad_norm": 0.6831297278404236, + "learning_rate": 8.732876712328768e-05, + "loss": 11.7451, + "step": 52 + }, + { + "epoch": 0.01821071167804836, + "grad_norm": 0.6773477792739868, + "learning_rate": 8.904109589041096e-05, + "loss": 11.7351, + "step": 53 + }, + { + "epoch": 0.018554310011596443, + "grad_norm": 0.6540944576263428, + "learning_rate": 9.075342465753425e-05, + "loss": 11.7275, + "step": 54 + }, + { + "epoch": 0.018897908345144526, + "grad_norm": 0.6842557191848755, + "learning_rate": 9.246575342465753e-05, + "loss": 11.7084, + "step": 55 + }, + { + "epoch": 0.019241506678692608, + "grad_norm": 0.6622518301010132, + "learning_rate": 9.417808219178083e-05, + "loss": 11.7025, + "step": 56 + }, + { + "epoch": 0.01958510501224069, + "grad_norm": 0.6573502421379089, + "learning_rate": 9.58904109589041e-05, + "loss": 11.6883, + "step": 57 + }, + { + "epoch": 0.01992870334578877, + "grad_norm": 0.6725475788116455, + "learning_rate": 9.76027397260274e-05, + "loss": 11.6727, + "step": 58 + }, + { + "epoch": 0.020272301679336854, + "grad_norm": 0.6889570355415344, + "learning_rate": 9.931506849315068e-05, + "loss": 11.6625, + "step": 59 + }, + { + "epoch": 0.02061590001288494, + "grad_norm": 0.672089695930481, + "learning_rate": 0.00010102739726027397, + "loss": 11.6495, + "step": 60 + }, + { + "epoch": 0.02095949834643302, + "grad_norm": 0.6492573618888855, + "learning_rate": 0.00010273972602739725, + "loss": 11.6344, + "step": 61 + }, + { + "epoch": 0.021303096679981103, + "grad_norm": 0.6624525189399719, + "learning_rate": 0.00010445205479452055, + "loss": 11.6246, + "step": 62 + }, + { + "epoch": 0.021646695013529185, + "grad_norm": 0.640524685382843, + "learning_rate": 0.00010616438356164383, + "loss": 11.6162, + "step": 63 + }, + { + "epoch": 0.021990293347077267, + "grad_norm": 0.6240212917327881, + "learning_rate": 0.00010787671232876712, + "loss": 11.601, + "step": 64 + }, + { + "epoch": 0.02233389168062535, + "grad_norm": 0.6419646739959717, + "learning_rate": 0.0001095890410958904, + "loss": 11.5904, + "step": 65 + }, + { + "epoch": 0.02267749001417343, + "grad_norm": 0.6217727065086365, + "learning_rate": 0.00011130136986301371, + "loss": 11.5754, + "step": 66 + }, + { + "epoch": 0.023021088347721513, + "grad_norm": 0.6366190314292908, + "learning_rate": 0.00011301369863013699, + "loss": 11.5616, + "step": 67 + }, + { + "epoch": 0.023364686681269595, + "grad_norm": 0.6308091282844543, + "learning_rate": 0.00011472602739726028, + "loss": 11.5476, + "step": 68 + }, + { + "epoch": 0.023708285014817677, + "grad_norm": 0.6248790621757507, + "learning_rate": 0.00011643835616438356, + "loss": 11.5356, + "step": 69 + }, + { + "epoch": 0.02405188334836576, + "grad_norm": 0.6040776968002319, + "learning_rate": 0.00011815068493150686, + "loss": 11.5316, + "step": 70 + }, + { + "epoch": 0.02439548168191384, + "grad_norm": 0.6206280589103699, + "learning_rate": 0.00011986301369863014, + "loss": 11.5134, + "step": 71 + }, + { + "epoch": 0.024739080015461924, + "grad_norm": 0.600658118724823, + "learning_rate": 0.00012157534246575343, + "loss": 11.5024, + "step": 72 + }, + { + "epoch": 0.02508267834901001, + "grad_norm": 0.6167108416557312, + "learning_rate": 0.0001232876712328767, + "loss": 11.4839, + "step": 73 + }, + { + "epoch": 0.02542627668255809, + "grad_norm": 0.6266405582427979, + "learning_rate": 0.000125, + "loss": 11.4643, + "step": 74 + }, + { + "epoch": 0.025769875016106173, + "grad_norm": 0.6280407309532166, + "learning_rate": 0.0001267123287671233, + "loss": 11.4495, + "step": 75 + }, + { + "epoch": 0.026113473349654255, + "grad_norm": 0.6044653654098511, + "learning_rate": 0.0001284246575342466, + "loss": 11.4406, + "step": 76 + }, + { + "epoch": 0.026457071683202337, + "grad_norm": 0.624096691608429, + "learning_rate": 0.00013013698630136986, + "loss": 11.4213, + "step": 77 + }, + { + "epoch": 0.02680067001675042, + "grad_norm": 0.6047064661979675, + "learning_rate": 0.00013184931506849315, + "loss": 11.4102, + "step": 78 + }, + { + "epoch": 0.0271442683502985, + "grad_norm": 0.6190263628959656, + "learning_rate": 0.00013356164383561644, + "loss": 11.3944, + "step": 79 + }, + { + "epoch": 0.027487866683846583, + "grad_norm": 0.6345274448394775, + "learning_rate": 0.00013527397260273974, + "loss": 11.3798, + "step": 80 + }, + { + "epoch": 0.027831465017394665, + "grad_norm": 0.6231955885887146, + "learning_rate": 0.000136986301369863, + "loss": 11.3585, + "step": 81 + }, + { + "epoch": 0.028175063350942747, + "grad_norm": 0.6305320262908936, + "learning_rate": 0.0001386986301369863, + "loss": 11.3541, + "step": 82 + }, + { + "epoch": 0.02851866168449083, + "grad_norm": 0.6369199156761169, + "learning_rate": 0.0001404109589041096, + "loss": 11.3318, + "step": 83 + }, + { + "epoch": 0.02886226001803891, + "grad_norm": 0.6159324049949646, + "learning_rate": 0.00014212328767123288, + "loss": 11.3296, + "step": 84 + }, + { + "epoch": 0.029205858351586993, + "grad_norm": 0.5997834205627441, + "learning_rate": 0.00014383561643835615, + "loss": 11.3051, + "step": 85 + }, + { + "epoch": 0.029549456685135075, + "grad_norm": 0.6137603521347046, + "learning_rate": 0.00014554794520547945, + "loss": 11.3053, + "step": 86 + }, + { + "epoch": 0.02989305501868316, + "grad_norm": 0.5946083068847656, + "learning_rate": 0.00014726027397260274, + "loss": 11.2886, + "step": 87 + }, + { + "epoch": 0.030236653352231243, + "grad_norm": 0.6125856041908264, + "learning_rate": 0.00014897260273972603, + "loss": 11.2621, + "step": 88 + }, + { + "epoch": 0.030580251685779325, + "grad_norm": 0.6170690059661865, + "learning_rate": 0.0001506849315068493, + "loss": 11.2379, + "step": 89 + }, + { + "epoch": 0.030923850019327407, + "grad_norm": 0.613112211227417, + "learning_rate": 0.0001523972602739726, + "loss": 11.2319, + "step": 90 + }, + { + "epoch": 0.03126744835287549, + "grad_norm": 0.6426445841789246, + "learning_rate": 0.00015410958904109589, + "loss": 11.217, + "step": 91 + }, + { + "epoch": 0.03161104668642357, + "grad_norm": 0.6466779708862305, + "learning_rate": 0.0001558219178082192, + "loss": 11.1905, + "step": 92 + }, + { + "epoch": 0.03195464501997165, + "grad_norm": 0.6395965814590454, + "learning_rate": 0.00015753424657534247, + "loss": 11.182, + "step": 93 + }, + { + "epoch": 0.03229824335351974, + "grad_norm": 0.6484614014625549, + "learning_rate": 0.00015924657534246577, + "loss": 11.166, + "step": 94 + }, + { + "epoch": 0.03264184168706782, + "grad_norm": 0.6011567115783691, + "learning_rate": 0.00016095890410958906, + "loss": 11.1601, + "step": 95 + }, + { + "epoch": 0.0329854400206159, + "grad_norm": 0.6242689490318298, + "learning_rate": 0.00016267123287671235, + "loss": 11.1447, + "step": 96 + }, + { + "epoch": 0.03332903835416398, + "grad_norm": 0.6144719123840332, + "learning_rate": 0.00016438356164383562, + "loss": 11.1382, + "step": 97 + }, + { + "epoch": 0.03367263668771207, + "grad_norm": 0.6248654127120972, + "learning_rate": 0.00016609589041095891, + "loss": 11.1246, + "step": 98 + }, + { + "epoch": 0.034016235021260145, + "grad_norm": 0.6292065978050232, + "learning_rate": 0.0001678082191780822, + "loss": 11.0922, + "step": 99 + }, + { + "epoch": 0.03435983335480823, + "grad_norm": 0.6909387707710266, + "learning_rate": 0.0001695205479452055, + "loss": 11.0901, + "step": 100 + }, + { + "epoch": 0.03470343168835631, + "grad_norm": 0.6519057750701904, + "learning_rate": 0.00017123287671232877, + "loss": 11.0416, + "step": 101 + }, + { + "epoch": 0.035047030021904395, + "grad_norm": 0.6702002882957458, + "learning_rate": 0.00017294520547945206, + "loss": 11.0272, + "step": 102 + }, + { + "epoch": 0.03539062835545247, + "grad_norm": 0.6443936228752136, + "learning_rate": 0.00017465753424657536, + "loss": 11.015, + "step": 103 + }, + { + "epoch": 0.03573422668900056, + "grad_norm": 0.6505857110023499, + "learning_rate": 0.00017636986301369865, + "loss": 10.999, + "step": 104 + }, + { + "epoch": 0.03607782502254864, + "grad_norm": 0.6232591867446899, + "learning_rate": 0.00017808219178082192, + "loss": 10.9871, + "step": 105 + }, + { + "epoch": 0.03642142335609672, + "grad_norm": 0.6446307897567749, + "learning_rate": 0.0001797945205479452, + "loss": 10.9577, + "step": 106 + }, + { + "epoch": 0.03676502168964481, + "grad_norm": 0.662112295627594, + "learning_rate": 0.0001815068493150685, + "loss": 10.9258, + "step": 107 + }, + { + "epoch": 0.03710862002319289, + "grad_norm": 0.6445570588111877, + "learning_rate": 0.0001832191780821918, + "loss": 10.9299, + "step": 108 + }, + { + "epoch": 0.03745221835674097, + "grad_norm": 0.6506933569908142, + "learning_rate": 0.00018493150684931506, + "loss": 10.9051, + "step": 109 + }, + { + "epoch": 0.03779581669028905, + "grad_norm": 0.6474794149398804, + "learning_rate": 0.00018664383561643836, + "loss": 10.8871, + "step": 110 + }, + { + "epoch": 0.03813941502383714, + "grad_norm": 0.6381330490112305, + "learning_rate": 0.00018835616438356165, + "loss": 10.8668, + "step": 111 + }, + { + "epoch": 0.038483013357385215, + "grad_norm": 0.625579833984375, + "learning_rate": 0.00019006849315068494, + "loss": 10.8491, + "step": 112 + }, + { + "epoch": 0.0388266116909333, + "grad_norm": 0.6460126042366028, + "learning_rate": 0.0001917808219178082, + "loss": 10.8417, + "step": 113 + }, + { + "epoch": 0.03917021002448138, + "grad_norm": 0.6264495253562927, + "learning_rate": 0.0001934931506849315, + "loss": 10.8322, + "step": 114 + }, + { + "epoch": 0.039513808358029465, + "grad_norm": 0.6536591053009033, + "learning_rate": 0.0001952054794520548, + "loss": 10.7748, + "step": 115 + }, + { + "epoch": 0.03985740669157754, + "grad_norm": 0.6263400912284851, + "learning_rate": 0.0001969178082191781, + "loss": 10.7766, + "step": 116 + }, + { + "epoch": 0.04020100502512563, + "grad_norm": 0.6201140284538269, + "learning_rate": 0.00019863013698630136, + "loss": 10.7611, + "step": 117 + }, + { + "epoch": 0.04054460335867371, + "grad_norm": 0.615506649017334, + "learning_rate": 0.00020034246575342465, + "loss": 10.7421, + "step": 118 + }, + { + "epoch": 0.04088820169222179, + "grad_norm": 0.6192501783370972, + "learning_rate": 0.00020205479452054795, + "loss": 10.72, + "step": 119 + }, + { + "epoch": 0.04123180002576988, + "grad_norm": 0.6386846899986267, + "learning_rate": 0.00020376712328767124, + "loss": 10.6968, + "step": 120 + }, + { + "epoch": 0.04157539835931796, + "grad_norm": 0.6488209962844849, + "learning_rate": 0.0002054794520547945, + "loss": 10.6839, + "step": 121 + }, + { + "epoch": 0.04191899669286604, + "grad_norm": 0.6438455581665039, + "learning_rate": 0.0002071917808219178, + "loss": 10.6481, + "step": 122 + }, + { + "epoch": 0.04226259502641412, + "grad_norm": 0.6512130498886108, + "learning_rate": 0.0002089041095890411, + "loss": 10.6226, + "step": 123 + }, + { + "epoch": 0.042606193359962206, + "grad_norm": 0.6471563577651978, + "learning_rate": 0.0002106164383561644, + "loss": 10.6165, + "step": 124 + }, + { + "epoch": 0.042949791693510285, + "grad_norm": 0.615242063999176, + "learning_rate": 0.00021232876712328765, + "loss": 10.6079, + "step": 125 + }, + { + "epoch": 0.04329339002705837, + "grad_norm": 0.6485576629638672, + "learning_rate": 0.00021404109589041095, + "loss": 10.577, + "step": 126 + }, + { + "epoch": 0.04363698836060645, + "grad_norm": 0.6325823664665222, + "learning_rate": 0.00021575342465753424, + "loss": 10.5585, + "step": 127 + }, + { + "epoch": 0.043980586694154535, + "grad_norm": 0.6436929702758789, + "learning_rate": 0.00021746575342465753, + "loss": 10.5302, + "step": 128 + }, + { + "epoch": 0.04432418502770261, + "grad_norm": 0.6149619817733765, + "learning_rate": 0.0002191780821917808, + "loss": 10.5413, + "step": 129 + }, + { + "epoch": 0.0446677833612507, + "grad_norm": 0.6164280772209167, + "learning_rate": 0.00022089041095890412, + "loss": 10.5123, + "step": 130 + }, + { + "epoch": 0.04501138169479878, + "grad_norm": 0.6501047015190125, + "learning_rate": 0.00022260273972602742, + "loss": 10.4795, + "step": 131 + }, + { + "epoch": 0.04535498002834686, + "grad_norm": 0.650057852268219, + "learning_rate": 0.0002243150684931507, + "loss": 10.4438, + "step": 132 + }, + { + "epoch": 0.04569857836189495, + "grad_norm": 0.6304041147232056, + "learning_rate": 0.00022602739726027398, + "loss": 10.4388, + "step": 133 + }, + { + "epoch": 0.04604217669544303, + "grad_norm": 0.6359322667121887, + "learning_rate": 0.00022773972602739727, + "loss": 10.4147, + "step": 134 + }, + { + "epoch": 0.04638577502899111, + "grad_norm": 0.6494120359420776, + "learning_rate": 0.00022945205479452056, + "loss": 10.3909, + "step": 135 + }, + { + "epoch": 0.04672937336253919, + "grad_norm": 0.6699193120002747, + "learning_rate": 0.00023116438356164386, + "loss": 10.3737, + "step": 136 + }, + { + "epoch": 0.047072971696087276, + "grad_norm": 0.6436437964439392, + "learning_rate": 0.00023287671232876712, + "loss": 10.3688, + "step": 137 + }, + { + "epoch": 0.047416570029635355, + "grad_norm": 0.6163201928138733, + "learning_rate": 0.00023458904109589042, + "loss": 10.3468, + "step": 138 + }, + { + "epoch": 0.04776016836318344, + "grad_norm": 0.6744312644004822, + "learning_rate": 0.0002363013698630137, + "loss": 10.3168, + "step": 139 + }, + { + "epoch": 0.04810376669673152, + "grad_norm": 0.6373304724693298, + "learning_rate": 0.000238013698630137, + "loss": 10.3134, + "step": 140 + }, + { + "epoch": 0.048447365030279604, + "grad_norm": 0.6340605616569519, + "learning_rate": 0.00023972602739726027, + "loss": 10.2961, + "step": 141 + }, + { + "epoch": 0.04879096336382768, + "grad_norm": 0.6478452682495117, + "learning_rate": 0.00024143835616438356, + "loss": 10.2708, + "step": 142 + }, + { + "epoch": 0.04913456169737577, + "grad_norm": 0.6576781272888184, + "learning_rate": 0.00024315068493150686, + "loss": 10.2501, + "step": 143 + }, + { + "epoch": 0.04947816003092385, + "grad_norm": 0.6450794339179993, + "learning_rate": 0.0002448630136986301, + "loss": 10.2429, + "step": 144 + }, + { + "epoch": 0.04982175836447193, + "grad_norm": 0.6385270953178406, + "learning_rate": 0.0002465753424657534, + "loss": 10.2387, + "step": 145 + }, + { + "epoch": 0.05016535669802002, + "grad_norm": 0.6406118869781494, + "learning_rate": 0.0002482876712328767, + "loss": 10.1705, + "step": 146 + }, + { + "epoch": 0.0505089550315681, + "grad_norm": 0.610375702381134, + "learning_rate": 0.00025, + "loss": 10.2039, + "step": 147 + }, + { + "epoch": 0.05085255336511618, + "grad_norm": 0.6268151998519897, + "learning_rate": 0.0002517123287671233, + "loss": 10.1923, + "step": 148 + }, + { + "epoch": 0.05119615169866426, + "grad_norm": 0.6328505277633667, + "learning_rate": 0.0002534246575342466, + "loss": 10.1562, + "step": 149 + }, + { + "epoch": 0.051539750032212346, + "grad_norm": 0.6489254236221313, + "learning_rate": 0.0002551369863013699, + "loss": 10.1535, + "step": 150 + }, + { + "epoch": 0.051883348365760425, + "grad_norm": 0.6568716764450073, + "learning_rate": 0.0002568493150684932, + "loss": 10.083, + "step": 151 + }, + { + "epoch": 0.05222694669930851, + "grad_norm": 0.6831304430961609, + "learning_rate": 0.0002585616438356164, + "loss": 10.0611, + "step": 152 + }, + { + "epoch": 0.05257054503285659, + "grad_norm": 0.6912304759025574, + "learning_rate": 0.0002602739726027397, + "loss": 10.0351, + "step": 153 + }, + { + "epoch": 0.052914143366404674, + "grad_norm": 0.6460905075073242, + "learning_rate": 0.000261986301369863, + "loss": 10.0702, + "step": 154 + }, + { + "epoch": 0.05325774169995275, + "grad_norm": 0.6496686339378357, + "learning_rate": 0.0002636986301369863, + "loss": 10.0229, + "step": 155 + }, + { + "epoch": 0.05360134003350084, + "grad_norm": 0.684745728969574, + "learning_rate": 0.0002654109589041096, + "loss": 9.986, + "step": 156 + }, + { + "epoch": 0.05394493836704892, + "grad_norm": 0.6830720901489258, + "learning_rate": 0.0002671232876712329, + "loss": 9.959, + "step": 157 + }, + { + "epoch": 0.054288536700597, + "grad_norm": 0.6738273501396179, + "learning_rate": 0.0002688356164383562, + "loss": 9.9602, + "step": 158 + }, + { + "epoch": 0.05463213503414508, + "grad_norm": 0.6997708082199097, + "learning_rate": 0.0002705479452054795, + "loss": 9.9037, + "step": 159 + }, + { + "epoch": 0.054975733367693166, + "grad_norm": 0.6702543497085571, + "learning_rate": 0.0002722602739726027, + "loss": 9.8971, + "step": 160 + }, + { + "epoch": 0.05531933170124125, + "grad_norm": 0.6914408206939697, + "learning_rate": 0.000273972602739726, + "loss": 9.8413, + "step": 161 + }, + { + "epoch": 0.05566293003478933, + "grad_norm": 0.670492947101593, + "learning_rate": 0.0002756849315068493, + "loss": 9.8841, + "step": 162 + }, + { + "epoch": 0.056006528368337416, + "grad_norm": 0.6712900996208191, + "learning_rate": 0.0002773972602739726, + "loss": 9.8381, + "step": 163 + }, + { + "epoch": 0.056350126701885495, + "grad_norm": 0.6814056038856506, + "learning_rate": 0.0002791095890410959, + "loss": 9.814, + "step": 164 + }, + { + "epoch": 0.05669372503543358, + "grad_norm": 0.6779478192329407, + "learning_rate": 0.0002808219178082192, + "loss": 9.7816, + "step": 165 + }, + { + "epoch": 0.05703732336898166, + "grad_norm": 0.6546109914779663, + "learning_rate": 0.0002825342465753425, + "loss": 9.7916, + "step": 166 + }, + { + "epoch": 0.057380921702529744, + "grad_norm": 0.6739956736564636, + "learning_rate": 0.00028424657534246577, + "loss": 9.7473, + "step": 167 + }, + { + "epoch": 0.05772452003607782, + "grad_norm": 0.6486943960189819, + "learning_rate": 0.000285958904109589, + "loss": 9.7394, + "step": 168 + }, + { + "epoch": 0.05806811836962591, + "grad_norm": 0.6602917313575745, + "learning_rate": 0.0002876712328767123, + "loss": 9.6986, + "step": 169 + }, + { + "epoch": 0.05841171670317399, + "grad_norm": 0.6496753096580505, + "learning_rate": 0.0002893835616438356, + "loss": 9.7017, + "step": 170 + }, + { + "epoch": 0.05875531503672207, + "grad_norm": 0.6671965718269348, + "learning_rate": 0.0002910958904109589, + "loss": 9.6521, + "step": 171 + }, + { + "epoch": 0.05909891337027015, + "grad_norm": 0.6669342517852783, + "learning_rate": 0.0002928082191780822, + "loss": 9.6483, + "step": 172 + }, + { + "epoch": 0.059442511703818236, + "grad_norm": 0.673161506652832, + "learning_rate": 0.0002945205479452055, + "loss": 9.6201, + "step": 173 + }, + { + "epoch": 0.05978611003736632, + "grad_norm": 0.6617433428764343, + "learning_rate": 0.00029623287671232877, + "loss": 9.6162, + "step": 174 + }, + { + "epoch": 0.0601297083709144, + "grad_norm": 0.6584163904190063, + "learning_rate": 0.00029794520547945206, + "loss": 9.5791, + "step": 175 + }, + { + "epoch": 0.060473306704462486, + "grad_norm": 0.6521568894386292, + "learning_rate": 0.0002996575342465753, + "loss": 9.6007, + "step": 176 + }, + { + "epoch": 0.060816905038010564, + "grad_norm": 0.6696915626525879, + "learning_rate": 0.0003013698630136986, + "loss": 9.5377, + "step": 177 + }, + { + "epoch": 0.06116050337155865, + "grad_norm": 0.6637017726898193, + "learning_rate": 0.0003030821917808219, + "loss": 9.5216, + "step": 178 + }, + { + "epoch": 0.06150410170510673, + "grad_norm": 0.6698155999183655, + "learning_rate": 0.0003047945205479452, + "loss": 9.4881, + "step": 179 + }, + { + "epoch": 0.061847700038654814, + "grad_norm": 0.6680699586868286, + "learning_rate": 0.0003065068493150685, + "loss": 9.4521, + "step": 180 + }, + { + "epoch": 0.06219129837220289, + "grad_norm": 0.6579115986824036, + "learning_rate": 0.00030821917808219177, + "loss": 9.4734, + "step": 181 + }, + { + "epoch": 0.06253489670575098, + "grad_norm": 0.6589128375053406, + "learning_rate": 0.00030993150684931507, + "loss": 9.4434, + "step": 182 + }, + { + "epoch": 0.06287849503929906, + "grad_norm": 0.6762176752090454, + "learning_rate": 0.0003116438356164384, + "loss": 9.3834, + "step": 183 + }, + { + "epoch": 0.06322209337284714, + "grad_norm": 0.6807367205619812, + "learning_rate": 0.0003133561643835616, + "loss": 9.3898, + "step": 184 + }, + { + "epoch": 0.06356569170639523, + "grad_norm": 0.6483362317085266, + "learning_rate": 0.00031506849315068495, + "loss": 9.3783, + "step": 185 + }, + { + "epoch": 0.0639092900399433, + "grad_norm": 0.6637298464775085, + "learning_rate": 0.00031678082191780824, + "loss": 9.3476, + "step": 186 + }, + { + "epoch": 0.06425288837349138, + "grad_norm": 0.6700524091720581, + "learning_rate": 0.00031849315068493153, + "loss": 9.3696, + "step": 187 + }, + { + "epoch": 0.06459648670703948, + "grad_norm": 0.6919601559638977, + "learning_rate": 0.00032020547945205483, + "loss": 9.2993, + "step": 188 + }, + { + "epoch": 0.06494008504058756, + "grad_norm": 0.6557508111000061, + "learning_rate": 0.0003219178082191781, + "loss": 9.316, + "step": 189 + }, + { + "epoch": 0.06528368337413563, + "grad_norm": 0.6749283671379089, + "learning_rate": 0.0003236301369863014, + "loss": 9.2672, + "step": 190 + }, + { + "epoch": 0.06562728170768371, + "grad_norm": 0.6584534049034119, + "learning_rate": 0.0003253424657534247, + "loss": 9.3218, + "step": 191 + }, + { + "epoch": 0.0659708800412318, + "grad_norm": 0.6565567851066589, + "learning_rate": 0.00032705479452054795, + "loss": 9.2853, + "step": 192 + }, + { + "epoch": 0.06631447837477988, + "grad_norm": 0.6454471349716187, + "learning_rate": 0.00032876712328767124, + "loss": 9.3203, + "step": 193 + }, + { + "epoch": 0.06665807670832796, + "grad_norm": 0.6428927183151245, + "learning_rate": 0.00033047945205479454, + "loss": 9.235, + "step": 194 + }, + { + "epoch": 0.06700167504187604, + "grad_norm": 0.6591072082519531, + "learning_rate": 0.00033219178082191783, + "loss": 9.2496, + "step": 195 + }, + { + "epoch": 0.06734527337542413, + "grad_norm": 0.666397750377655, + "learning_rate": 0.0003339041095890411, + "loss": 9.2136, + "step": 196 + }, + { + "epoch": 0.06768887170897221, + "grad_norm": 0.6873658895492554, + "learning_rate": 0.0003356164383561644, + "loss": 9.1737, + "step": 197 + }, + { + "epoch": 0.06803247004252029, + "grad_norm": 0.6527069211006165, + "learning_rate": 0.0003373287671232877, + "loss": 9.221, + "step": 198 + }, + { + "epoch": 0.06837606837606838, + "grad_norm": 0.6505078077316284, + "learning_rate": 0.000339041095890411, + "loss": 9.199, + "step": 199 + }, + { + "epoch": 0.06871966670961646, + "grad_norm": 0.6579967141151428, + "learning_rate": 0.00034075342465753424, + "loss": 9.2194, + "step": 200 + }, + { + "epoch": 0.06906326504316454, + "grad_norm": 0.7028538584709167, + "learning_rate": 0.00034246575342465754, + "loss": 9.0879, + "step": 201 + }, + { + "epoch": 0.06940686337671262, + "grad_norm": 0.7203247547149658, + "learning_rate": 0.00034417808219178083, + "loss": 9.0403, + "step": 202 + }, + { + "epoch": 0.06975046171026071, + "grad_norm": 0.6787813901901245, + "learning_rate": 0.0003458904109589041, + "loss": 9.0896, + "step": 203 + }, + { + "epoch": 0.07009406004380879, + "grad_norm": 0.6986070275306702, + "learning_rate": 0.0003476027397260274, + "loss": 9.025, + "step": 204 + }, + { + "epoch": 0.07043765837735687, + "grad_norm": 0.7031541466712952, + "learning_rate": 0.0003493150684931507, + "loss": 8.9863, + "step": 205 + }, + { + "epoch": 0.07078125671090495, + "grad_norm": 0.6880903244018555, + "learning_rate": 0.000351027397260274, + "loss": 9.0159, + "step": 206 + }, + { + "epoch": 0.07112485504445304, + "grad_norm": 0.6828157901763916, + "learning_rate": 0.0003527397260273973, + "loss": 8.9844, + "step": 207 + }, + { + "epoch": 0.07146845337800112, + "grad_norm": 0.6969785690307617, + "learning_rate": 0.00035445205479452054, + "loss": 8.9524, + "step": 208 + }, + { + "epoch": 0.0718120517115492, + "grad_norm": 0.6816834211349487, + "learning_rate": 0.00035616438356164383, + "loss": 8.9422, + "step": 209 + }, + { + "epoch": 0.07215565004509727, + "grad_norm": 0.7117534875869751, + "learning_rate": 0.0003578767123287671, + "loss": 8.8944, + "step": 210 + }, + { + "epoch": 0.07249924837864537, + "grad_norm": 0.685808002948761, + "learning_rate": 0.0003595890410958904, + "loss": 8.9264, + "step": 211 + }, + { + "epoch": 0.07284284671219345, + "grad_norm": 0.6689701080322266, + "learning_rate": 0.0003613013698630137, + "loss": 8.8904, + "step": 212 + }, + { + "epoch": 0.07318644504574152, + "grad_norm": 0.6558648347854614, + "learning_rate": 0.000363013698630137, + "loss": 8.8794, + "step": 213 + }, + { + "epoch": 0.07353004337928962, + "grad_norm": 0.6610527038574219, + "learning_rate": 0.0003647260273972603, + "loss": 8.8526, + "step": 214 + }, + { + "epoch": 0.0738736417128377, + "grad_norm": 0.6478385329246521, + "learning_rate": 0.0003664383561643836, + "loss": 8.8549, + "step": 215 + }, + { + "epoch": 0.07421724004638577, + "grad_norm": 0.6762006282806396, + "learning_rate": 0.00036815068493150683, + "loss": 8.8003, + "step": 216 + }, + { + "epoch": 0.07456083837993385, + "grad_norm": 0.6985549330711365, + "learning_rate": 0.0003698630136986301, + "loss": 8.7256, + "step": 217 + }, + { + "epoch": 0.07490443671348194, + "grad_norm": 0.647552490234375, + "learning_rate": 0.0003715753424657534, + "loss": 8.822, + "step": 218 + }, + { + "epoch": 0.07524803504703002, + "grad_norm": 0.6919633746147156, + "learning_rate": 0.0003732876712328767, + "loss": 8.6791, + "step": 219 + }, + { + "epoch": 0.0755916333805781, + "grad_norm": 0.6744142770767212, + "learning_rate": 0.000375, + "loss": 8.6984, + "step": 220 + }, + { + "epoch": 0.07593523171412618, + "grad_norm": 0.6745022535324097, + "learning_rate": 0.0003767123287671233, + "loss": 8.6842, + "step": 221 + }, + { + "epoch": 0.07627883004767427, + "grad_norm": 0.6558954119682312, + "learning_rate": 0.0003784246575342466, + "loss": 8.6935, + "step": 222 + }, + { + "epoch": 0.07662242838122235, + "grad_norm": 0.6784209609031677, + "learning_rate": 0.0003801369863013699, + "loss": 8.6508, + "step": 223 + }, + { + "epoch": 0.07696602671477043, + "grad_norm": 0.6614058613777161, + "learning_rate": 0.0003818493150684932, + "loss": 8.6713, + "step": 224 + }, + { + "epoch": 0.07730962504831851, + "grad_norm": 0.6582658886909485, + "learning_rate": 0.0003835616438356164, + "loss": 8.6104, + "step": 225 + }, + { + "epoch": 0.0776532233818666, + "grad_norm": 0.6616186499595642, + "learning_rate": 0.0003852739726027397, + "loss": 8.6167, + "step": 226 + }, + { + "epoch": 0.07799682171541468, + "grad_norm": 0.6750086545944214, + "learning_rate": 0.000386986301369863, + "loss": 8.6126, + "step": 227 + }, + { + "epoch": 0.07834042004896276, + "grad_norm": 0.6450734734535217, + "learning_rate": 0.0003886986301369863, + "loss": 8.6054, + "step": 228 + }, + { + "epoch": 0.07868401838251085, + "grad_norm": 0.6688063144683838, + "learning_rate": 0.0003904109589041096, + "loss": 8.5684, + "step": 229 + }, + { + "epoch": 0.07902761671605893, + "grad_norm": 0.6341917514801025, + "learning_rate": 0.0003921232876712329, + "loss": 8.5838, + "step": 230 + }, + { + "epoch": 0.07937121504960701, + "grad_norm": 0.6572481393814087, + "learning_rate": 0.0003938356164383562, + "loss": 8.5452, + "step": 231 + }, + { + "epoch": 0.07971481338315509, + "grad_norm": 0.6542115807533264, + "learning_rate": 0.0003955479452054795, + "loss": 8.5227, + "step": 232 + }, + { + "epoch": 0.08005841171670318, + "grad_norm": 0.6360100507736206, + "learning_rate": 0.0003972602739726027, + "loss": 8.5747, + "step": 233 + }, + { + "epoch": 0.08040201005025126, + "grad_norm": 0.6474318504333496, + "learning_rate": 0.000398972602739726, + "loss": 8.53, + "step": 234 + }, + { + "epoch": 0.08074560838379934, + "grad_norm": 0.639065682888031, + "learning_rate": 0.0004006849315068493, + "loss": 8.4835, + "step": 235 + }, + { + "epoch": 0.08108920671734741, + "grad_norm": 0.6288236379623413, + "learning_rate": 0.0004023972602739726, + "loss": 8.5055, + "step": 236 + }, + { + "epoch": 0.08143280505089551, + "grad_norm": 0.600935697555542, + "learning_rate": 0.0004041095890410959, + "loss": 8.5239, + "step": 237 + }, + { + "epoch": 0.08177640338444359, + "grad_norm": 0.6632112264633179, + "learning_rate": 0.0004058219178082192, + "loss": 8.4146, + "step": 238 + }, + { + "epoch": 0.08212000171799166, + "grad_norm": 0.6458113193511963, + "learning_rate": 0.0004075342465753425, + "loss": 8.4419, + "step": 239 + }, + { + "epoch": 0.08246360005153976, + "grad_norm": 0.6225025057792664, + "learning_rate": 0.00040924657534246577, + "loss": 8.4463, + "step": 240 + }, + { + "epoch": 0.08280719838508784, + "grad_norm": 0.5956367254257202, + "learning_rate": 0.000410958904109589, + "loss": 8.4468, + "step": 241 + }, + { + "epoch": 0.08315079671863591, + "grad_norm": 0.6125132441520691, + "learning_rate": 0.0004126712328767123, + "loss": 8.4347, + "step": 242 + }, + { + "epoch": 0.08349439505218399, + "grad_norm": 0.5911486744880676, + "learning_rate": 0.0004143835616438356, + "loss": 8.5122, + "step": 243 + }, + { + "epoch": 0.08383799338573208, + "grad_norm": 0.5973955392837524, + "learning_rate": 0.0004160958904109589, + "loss": 8.5174, + "step": 244 + }, + { + "epoch": 0.08418159171928016, + "grad_norm": 0.6203837394714355, + "learning_rate": 0.0004178082191780822, + "loss": 8.3962, + "step": 245 + }, + { + "epoch": 0.08452519005282824, + "grad_norm": 0.624750018119812, + "learning_rate": 0.0004195205479452055, + "loss": 8.3446, + "step": 246 + }, + { + "epoch": 0.08486878838637632, + "grad_norm": 0.613397479057312, + "learning_rate": 0.0004212328767123288, + "loss": 8.377, + "step": 247 + }, + { + "epoch": 0.08521238671992441, + "grad_norm": 0.6277963519096375, + "learning_rate": 0.00042294520547945207, + "loss": 8.3163, + "step": 248 + }, + { + "epoch": 0.08555598505347249, + "grad_norm": 0.6092117428779602, + "learning_rate": 0.0004246575342465753, + "loss": 8.4043, + "step": 249 + }, + { + "epoch": 0.08589958338702057, + "grad_norm": 0.6500337719917297, + "learning_rate": 0.0004263698630136986, + "loss": 8.3297, + "step": 250 + }, + { + "epoch": 0.08624318172056865, + "grad_norm": 0.6311240792274475, + "learning_rate": 0.0004280821917808219, + "loss": 8.3059, + "step": 251 + }, + { + "epoch": 0.08658678005411674, + "grad_norm": 0.6225205063819885, + "learning_rate": 0.0004297945205479452, + "loss": 8.2266, + "step": 252 + }, + { + "epoch": 0.08693037838766482, + "grad_norm": 0.6449727416038513, + "learning_rate": 0.0004315068493150685, + "loss": 8.2155, + "step": 253 + }, + { + "epoch": 0.0872739767212129, + "grad_norm": 0.6553236842155457, + "learning_rate": 0.0004332191780821918, + "loss": 8.1811, + "step": 254 + }, + { + "epoch": 0.08761757505476099, + "grad_norm": 0.6186061501502991, + "learning_rate": 0.00043493150684931507, + "loss": 8.21, + "step": 255 + }, + { + "epoch": 0.08796117338830907, + "grad_norm": 0.6524087190628052, + "learning_rate": 0.0004366438356164384, + "loss": 8.1486, + "step": 256 + }, + { + "epoch": 0.08830477172185715, + "grad_norm": 0.6334664225578308, + "learning_rate": 0.0004383561643835616, + "loss": 8.207, + "step": 257 + }, + { + "epoch": 0.08864837005540523, + "grad_norm": 0.597114086151123, + "learning_rate": 0.00044006849315068495, + "loss": 8.2354, + "step": 258 + }, + { + "epoch": 0.08899196838895332, + "grad_norm": 0.599537193775177, + "learning_rate": 0.00044178082191780824, + "loss": 8.1919, + "step": 259 + }, + { + "epoch": 0.0893355667225014, + "grad_norm": 0.6212196946144104, + "learning_rate": 0.00044349315068493154, + "loss": 8.1179, + "step": 260 + }, + { + "epoch": 0.08967916505604948, + "grad_norm": 0.5893779397010803, + "learning_rate": 0.00044520547945205483, + "loss": 8.2266, + "step": 261 + }, + { + "epoch": 0.09002276338959755, + "grad_norm": 0.5951864719390869, + "learning_rate": 0.0004469178082191781, + "loss": 8.1375, + "step": 262 + }, + { + "epoch": 0.09036636172314565, + "grad_norm": 0.6006582975387573, + "learning_rate": 0.0004486301369863014, + "loss": 8.1599, + "step": 263 + }, + { + "epoch": 0.09070996005669373, + "grad_norm": 0.5896690487861633, + "learning_rate": 0.0004503424657534247, + "loss": 8.08, + "step": 264 + }, + { + "epoch": 0.0910535583902418, + "grad_norm": 0.5560733675956726, + "learning_rate": 0.00045205479452054795, + "loss": 8.1453, + "step": 265 + }, + { + "epoch": 0.0913971567237899, + "grad_norm": 0.5687079429626465, + "learning_rate": 0.00045376712328767124, + "loss": 8.0543, + "step": 266 + }, + { + "epoch": 0.09174075505733797, + "grad_norm": 0.5450804829597473, + "learning_rate": 0.00045547945205479454, + "loss": 8.1676, + "step": 267 + }, + { + "epoch": 0.09208435339088605, + "grad_norm": 0.536425769329071, + "learning_rate": 0.00045719178082191783, + "loss": 8.0958, + "step": 268 + }, + { + "epoch": 0.09242795172443413, + "grad_norm": 0.5232003331184387, + "learning_rate": 0.0004589041095890411, + "loss": 8.1276, + "step": 269 + }, + { + "epoch": 0.09277155005798222, + "grad_norm": 0.556194543838501, + "learning_rate": 0.0004606164383561644, + "loss": 8.0365, + "step": 270 + }, + { + "epoch": 0.0931151483915303, + "grad_norm": 0.5368773341178894, + "learning_rate": 0.0004623287671232877, + "loss": 8.0283, + "step": 271 + }, + { + "epoch": 0.09345874672507838, + "grad_norm": 0.5228593945503235, + "learning_rate": 0.000464041095890411, + "loss": 8.0311, + "step": 272 + }, + { + "epoch": 0.09380234505862646, + "grad_norm": 0.5249463319778442, + "learning_rate": 0.00046575342465753425, + "loss": 7.9928, + "step": 273 + }, + { + "epoch": 0.09414594339217455, + "grad_norm": 0.511807918548584, + "learning_rate": 0.00046746575342465754, + "loss": 8.0101, + "step": 274 + }, + { + "epoch": 0.09448954172572263, + "grad_norm": 0.5214989185333252, + "learning_rate": 0.00046917808219178083, + "loss": 8.0251, + "step": 275 + }, + { + "epoch": 0.09483314005927071, + "grad_norm": 0.48177778720855713, + "learning_rate": 0.0004708904109589041, + "loss": 8.0669, + "step": 276 + }, + { + "epoch": 0.09517673839281879, + "grad_norm": 0.4860067069530487, + "learning_rate": 0.0004726027397260274, + "loss": 8.0312, + "step": 277 + }, + { + "epoch": 0.09552033672636688, + "grad_norm": 0.4770212769508362, + "learning_rate": 0.0004743150684931507, + "loss": 7.981, + "step": 278 + }, + { + "epoch": 0.09586393505991496, + "grad_norm": 0.4737316966056824, + "learning_rate": 0.000476027397260274, + "loss": 7.9322, + "step": 279 + }, + { + "epoch": 0.09620753339346304, + "grad_norm": 0.48152047395706177, + "learning_rate": 0.0004777397260273973, + "loss": 7.9734, + "step": 280 + }, + { + "epoch": 0.09655113172701113, + "grad_norm": 0.455615371465683, + "learning_rate": 0.00047945205479452054, + "loss": 7.9721, + "step": 281 + }, + { + "epoch": 0.09689473006055921, + "grad_norm": 0.45772621035575867, + "learning_rate": 0.00048116438356164383, + "loss": 8.0154, + "step": 282 + }, + { + "epoch": 0.09723832839410729, + "grad_norm": 0.4532890021800995, + "learning_rate": 0.00048287671232876713, + "loss": 7.9678, + "step": 283 + }, + { + "epoch": 0.09758192672765537, + "grad_norm": 0.46691393852233887, + "learning_rate": 0.0004845890410958904, + "loss": 7.8717, + "step": 284 + }, + { + "epoch": 0.09792552506120346, + "grad_norm": 0.4515172243118286, + "learning_rate": 0.0004863013698630137, + "loss": 7.9627, + "step": 285 + }, + { + "epoch": 0.09826912339475154, + "grad_norm": 0.4819401800632477, + "learning_rate": 0.000488013698630137, + "loss": 7.9424, + "step": 286 + }, + { + "epoch": 0.09861272172829962, + "grad_norm": 0.4063154458999634, + "learning_rate": 0.0004897260273972602, + "loss": 7.9792, + "step": 287 + }, + { + "epoch": 0.0989563200618477, + "grad_norm": 0.4167320132255554, + "learning_rate": 0.0004914383561643835, + "loss": 7.9856, + "step": 288 + }, + { + "epoch": 0.09929991839539579, + "grad_norm": 0.4385222792625427, + "learning_rate": 0.0004931506849315068, + "loss": 7.8981, + "step": 289 + }, + { + "epoch": 0.09964351672894387, + "grad_norm": 0.38443422317504883, + "learning_rate": 0.0004948630136986301, + "loss": 8.0099, + "step": 290 + }, + { + "epoch": 0.09998711506249194, + "grad_norm": 0.4247475564479828, + "learning_rate": 0.0004965753424657534, + "loss": 7.912, + "step": 291 + }, + { + "epoch": 0.10033071339604004, + "grad_norm": 0.3842032551765442, + "learning_rate": 0.0004982876712328767, + "loss": 7.9831, + "step": 292 + }, + { + "epoch": 0.10067431172958811, + "grad_norm": 0.40302786231040955, + "learning_rate": 0.0005, + "loss": 7.9525, + "step": 293 + }, + { + "epoch": 0.1010179100631362, + "grad_norm": 0.36673638224601746, + "learning_rate": 0.0004999998201382936, + "loss": 7.9676, + "step": 294 + }, + { + "epoch": 0.10136150839668427, + "grad_norm": 0.4072817862033844, + "learning_rate": 0.000499999280553433, + "loss": 7.8912, + "step": 295 + }, + { + "epoch": 0.10170510673023236, + "grad_norm": 0.3764428496360779, + "learning_rate": 0.0004999983812461949, + "loss": 8.003, + "step": 296 + }, + { + "epoch": 0.10204870506378044, + "grad_norm": 0.38769039511680603, + "learning_rate": 0.0004999971222178729, + "loss": 7.9691, + "step": 297 + }, + { + "epoch": 0.10239230339732852, + "grad_norm": 0.35378220677375793, + "learning_rate": 0.0004999955034702791, + "loss": 7.9757, + "step": 298 + }, + { + "epoch": 0.1027359017308766, + "grad_norm": 0.37706032395362854, + "learning_rate": 0.0004999935250057423, + "loss": 8.0697, + "step": 299 + }, + { + "epoch": 0.10307950006442469, + "grad_norm": 0.42491599917411804, + "learning_rate": 0.0004999911868271095, + "loss": 8.0201, + "step": 300 + }, + { + "epoch": 0.10342309839797277, + "grad_norm": 0.3899023234844208, + "learning_rate": 0.000499988488937745, + "loss": 7.7999, + "step": 301 + }, + { + "epoch": 0.10376669673152085, + "grad_norm": 0.4056253135204315, + "learning_rate": 0.0004999854313415308, + "loss": 7.7656, + "step": 302 + }, + { + "epoch": 0.10411029506506893, + "grad_norm": 0.34815019369125366, + "learning_rate": 0.0004999820140428665, + "loss": 7.8854, + "step": 303 + }, + { + "epoch": 0.10445389339861702, + "grad_norm": 0.32494989037513733, + "learning_rate": 0.0004999782370466693, + "loss": 7.9957, + "step": 304 + }, + { + "epoch": 0.1047974917321651, + "grad_norm": 0.3544636368751526, + "learning_rate": 0.0004999741003583737, + "loss": 7.8446, + "step": 305 + }, + { + "epoch": 0.10514109006571318, + "grad_norm": 0.34266841411590576, + "learning_rate": 0.000499969603983932, + "loss": 7.781, + "step": 306 + }, + { + "epoch": 0.10548468839926127, + "grad_norm": 0.3225366175174713, + "learning_rate": 0.0004999647479298142, + "loss": 7.8695, + "step": 307 + }, + { + "epoch": 0.10582828673280935, + "grad_norm": 0.3114038407802582, + "learning_rate": 0.0004999595322030074, + "loss": 7.8691, + "step": 308 + }, + { + "epoch": 0.10617188506635743, + "grad_norm": 0.30096235871315, + "learning_rate": 0.0004999539568110165, + "loss": 7.9136, + "step": 309 + }, + { + "epoch": 0.1065154833999055, + "grad_norm": 0.325207382440567, + "learning_rate": 0.0004999480217618641, + "loss": 7.8082, + "step": 310 + }, + { + "epoch": 0.1068590817334536, + "grad_norm": 0.29082146286964417, + "learning_rate": 0.0004999417270640899, + "loss": 7.8148, + "step": 311 + }, + { + "epoch": 0.10720268006700168, + "grad_norm": 0.29000920057296753, + "learning_rate": 0.0004999350727267515, + "loss": 7.8117, + "step": 312 + }, + { + "epoch": 0.10754627840054976, + "grad_norm": 0.290337473154068, + "learning_rate": 0.0004999280587594235, + "loss": 7.808, + "step": 313 + }, + { + "epoch": 0.10788987673409783, + "grad_norm": 0.2645377516746521, + "learning_rate": 0.0004999206851721985, + "loss": 7.8217, + "step": 314 + }, + { + "epoch": 0.10823347506764593, + "grad_norm": 0.23495185375213623, + "learning_rate": 0.0004999129519756862, + "loss": 7.8687, + "step": 315 + }, + { + "epoch": 0.108577073401194, + "grad_norm": 0.2429482489824295, + "learning_rate": 0.0004999048591810139, + "loss": 7.823, + "step": 316 + }, + { + "epoch": 0.10892067173474208, + "grad_norm": 0.22635473310947418, + "learning_rate": 0.0004998964067998262, + "loss": 7.8572, + "step": 317 + }, + { + "epoch": 0.10926427006829016, + "grad_norm": 0.2399221658706665, + "learning_rate": 0.0004998875948442852, + "loss": 7.7956, + "step": 318 + }, + { + "epoch": 0.10960786840183825, + "grad_norm": 0.24210013449192047, + "learning_rate": 0.0004998784233270705, + "loss": 7.8072, + "step": 319 + }, + { + "epoch": 0.10995146673538633, + "grad_norm": 0.224739670753479, + "learning_rate": 0.0004998688922613788, + "loss": 7.8351, + "step": 320 + }, + { + "epoch": 0.11029506506893441, + "grad_norm": 0.22617986798286438, + "learning_rate": 0.0004998590016609242, + "loss": 7.8284, + "step": 321 + }, + { + "epoch": 0.1106386634024825, + "grad_norm": 0.2693329453468323, + "learning_rate": 0.0004998487515399384, + "loss": 7.8181, + "step": 322 + }, + { + "epoch": 0.11098226173603058, + "grad_norm": 0.3233254551887512, + "learning_rate": 0.0004998381419131701, + "loss": 7.8468, + "step": 323 + }, + { + "epoch": 0.11132586006957866, + "grad_norm": 0.2308340072631836, + "learning_rate": 0.0004998271727958857, + "loss": 7.8013, + "step": 324 + }, + { + "epoch": 0.11166945840312674, + "grad_norm": 0.2707921862602234, + "learning_rate": 0.0004998158442038682, + "loss": 7.7526, + "step": 325 + }, + { + "epoch": 0.11201305673667483, + "grad_norm": 0.5553349852561951, + "learning_rate": 0.0004998041561534185, + "loss": 7.8472, + "step": 326 + }, + { + "epoch": 0.11235665507022291, + "grad_norm": 0.5412582159042358, + "learning_rate": 0.0004997921086613543, + "loss": 7.8249, + "step": 327 + }, + { + "epoch": 0.11270025340377099, + "grad_norm": 0.3885006606578827, + "learning_rate": 0.0004997797017450108, + "loss": 7.8228, + "step": 328 + }, + { + "epoch": 0.11304385173731907, + "grad_norm": 1.165300726890564, + "learning_rate": 0.0004997669354222401, + "loss": 7.8178, + "step": 329 + }, + { + "epoch": 0.11338745007086716, + "grad_norm": 0.4480549991130829, + "learning_rate": 0.0004997538097114118, + "loss": 7.8745, + "step": 330 + }, + { + "epoch": 0.11373104840441524, + "grad_norm": 0.5066674947738647, + "learning_rate": 0.0004997403246314123, + "loss": 7.8827, + "step": 331 + }, + { + "epoch": 0.11407464673796332, + "grad_norm": 0.2583351135253906, + "learning_rate": 0.000499726480201645, + "loss": 7.8056, + "step": 332 + }, + { + "epoch": 0.11441824507151141, + "grad_norm": 0.49098068475723267, + "learning_rate": 0.0004997122764420309, + "loss": 7.8203, + "step": 333 + }, + { + "epoch": 0.11476184340505949, + "grad_norm": 0.3366967439651489, + "learning_rate": 0.0004996977133730074, + "loss": 7.8302, + "step": 334 + }, + { + "epoch": 0.11510544173860757, + "grad_norm": 0.4575011134147644, + "learning_rate": 0.0004996827910155292, + "loss": 7.8429, + "step": 335 + }, + { + "epoch": 0.11544904007215565, + "grad_norm": 1.2171545028686523, + "learning_rate": 0.0004996675093910684, + "loss": 7.7702, + "step": 336 + }, + { + "epoch": 0.11579263840570374, + "grad_norm": 1.0319626331329346, + "learning_rate": 0.0004996518685216132, + "loss": 7.7597, + "step": 337 + }, + { + "epoch": 0.11613623673925182, + "grad_norm": 0.9371957182884216, + "learning_rate": 0.0004996358684296692, + "loss": 7.8775, + "step": 338 + }, + { + "epoch": 0.1164798350727999, + "grad_norm": 1.9794833660125732, + "learning_rate": 0.0004996195091382591, + "loss": 7.8503, + "step": 339 + }, + { + "epoch": 0.11682343340634797, + "grad_norm": 0.7713110446929932, + "learning_rate": 0.0004996027906709219, + "loss": 7.8266, + "step": 340 + }, + { + "epoch": 0.11716703173989607, + "grad_norm": 0.39992398023605347, + "learning_rate": 0.0004995857130517139, + "loss": 7.9142, + "step": 341 + }, + { + "epoch": 0.11751063007344414, + "grad_norm": 0.6665037274360657, + "learning_rate": 0.0004995682763052077, + "loss": 7.9238, + "step": 342 + }, + { + "epoch": 0.11785422840699222, + "grad_norm": 0.43217286467552185, + "learning_rate": 0.0004995504804564932, + "loss": 7.9302, + "step": 343 + }, + { + "epoch": 0.1181978267405403, + "grad_norm": 0.4050378203392029, + "learning_rate": 0.0004995323255311767, + "loss": 7.9248, + "step": 344 + }, + { + "epoch": 0.1185414250740884, + "grad_norm": 1.3695038557052612, + "learning_rate": 0.0004995138115553811, + "loss": 7.8124, + "step": 345 + }, + { + "epoch": 0.11888502340763647, + "grad_norm": 0.388114869594574, + "learning_rate": 0.0004994949385557461, + "loss": 7.8363, + "step": 346 + }, + { + "epoch": 0.11922862174118455, + "grad_norm": 1.1071730852127075, + "learning_rate": 0.0004994757065594279, + "loss": 7.8569, + "step": 347 + }, + { + "epoch": 0.11957222007473264, + "grad_norm": 0.5634297132492065, + "learning_rate": 0.0004994561155940994, + "loss": 7.9719, + "step": 348 + }, + { + "epoch": 0.11991581840828072, + "grad_norm": 0.4597019553184509, + "learning_rate": 0.0004994361656879497, + "loss": 7.8573, + "step": 349 + }, + { + "epoch": 0.1202594167418288, + "grad_norm": 1.1164665222167969, + "learning_rate": 0.0004994158568696848, + "loss": 7.7194, + "step": 350 + }, + { + "epoch": 0.12060301507537688, + "grad_norm": 0.6222408413887024, + "learning_rate": 0.0004993951891685269, + "loss": 7.8213, + "step": 351 + }, + { + "epoch": 0.12094661340892497, + "grad_norm": 0.3941010534763336, + "learning_rate": 0.0004993741626142145, + "loss": 7.8279, + "step": 352 + }, + { + "epoch": 0.12129021174247305, + "grad_norm": 0.46420833468437195, + "learning_rate": 0.0004993527772370028, + "loss": 7.6712, + "step": 353 + }, + { + "epoch": 0.12163381007602113, + "grad_norm": 0.28539156913757324, + "learning_rate": 0.0004993310330676629, + "loss": 7.8035, + "step": 354 + }, + { + "epoch": 0.12197740840956921, + "grad_norm": 0.32697808742523193, + "learning_rate": 0.0004993089301374823, + "loss": 7.6737, + "step": 355 + }, + { + "epoch": 0.1223210067431173, + "grad_norm": 0.33268988132476807, + "learning_rate": 0.0004992864684782649, + "loss": 7.8325, + "step": 356 + }, + { + "epoch": 0.12266460507666538, + "grad_norm": 0.5018799901008606, + "learning_rate": 0.0004992636481223306, + "loss": 7.6845, + "step": 357 + }, + { + "epoch": 0.12300820341021346, + "grad_norm": 0.31378456950187683, + "learning_rate": 0.0004992404691025155, + "loss": 7.7502, + "step": 358 + }, + { + "epoch": 0.12335180174376155, + "grad_norm": 0.512126088142395, + "learning_rate": 0.0004992169314521717, + "loss": 7.8296, + "step": 359 + }, + { + "epoch": 0.12369540007730963, + "grad_norm": 0.3105211555957794, + "learning_rate": 0.0004991930352051673, + "loss": 7.9009, + "step": 360 + }, + { + "epoch": 0.1240389984108577, + "grad_norm": 0.2352656126022339, + "learning_rate": 0.0004991687803958866, + "loss": 7.7176, + "step": 361 + }, + { + "epoch": 0.12438259674440579, + "grad_norm": 0.32939714193344116, + "learning_rate": 0.0004991441670592297, + "loss": 7.676, + "step": 362 + }, + { + "epoch": 0.12472619507795388, + "grad_norm": 0.3954201638698578, + "learning_rate": 0.0004991191952306124, + "loss": 7.6914, + "step": 363 + }, + { + "epoch": 0.12506979341150196, + "grad_norm": 0.3367353677749634, + "learning_rate": 0.0004990938649459667, + "loss": 7.6831, + "step": 364 + }, + { + "epoch": 0.12541339174505003, + "grad_norm": 0.2749853730201721, + "learning_rate": 0.00049906817624174, + "loss": 7.6483, + "step": 365 + }, + { + "epoch": 0.1257569900785981, + "grad_norm": 0.2436531037092209, + "learning_rate": 0.0004990421291548958, + "loss": 7.7626, + "step": 366 + }, + { + "epoch": 0.1261005884121462, + "grad_norm": 0.23887164890766144, + "learning_rate": 0.0004990157237229129, + "loss": 7.8766, + "step": 367 + }, + { + "epoch": 0.12644418674569427, + "grad_norm": 0.22134557366371155, + "learning_rate": 0.0004989889599837861, + "loss": 7.7312, + "step": 368 + }, + { + "epoch": 0.12678778507924238, + "grad_norm": 0.2769152820110321, + "learning_rate": 0.0004989618379760254, + "loss": 7.7004, + "step": 369 + }, + { + "epoch": 0.12713138341279046, + "grad_norm": 0.26756346225738525, + "learning_rate": 0.0004989343577386565, + "loss": 7.7231, + "step": 370 + }, + { + "epoch": 0.12747498174633853, + "grad_norm": 0.2157527357339859, + "learning_rate": 0.0004989065193112208, + "loss": 7.7513, + "step": 371 + }, + { + "epoch": 0.1278185800798866, + "grad_norm": 0.4620709717273712, + "learning_rate": 0.0004988783227337746, + "loss": 7.739, + "step": 372 + }, + { + "epoch": 0.1281621784134347, + "grad_norm": 0.24664214253425598, + "learning_rate": 0.0004988497680468898, + "loss": 7.7792, + "step": 373 + }, + { + "epoch": 0.12850577674698277, + "grad_norm": 0.45489999651908875, + "learning_rate": 0.0004988208552916534, + "loss": 7.7012, + "step": 374 + }, + { + "epoch": 0.12884937508053085, + "grad_norm": 0.2705993950366974, + "learning_rate": 0.0004987915845096683, + "loss": 7.7137, + "step": 375 + }, + { + "epoch": 0.12919297341407895, + "grad_norm": 0.26571300625801086, + "learning_rate": 0.0004987619557430513, + "loss": 7.6652, + "step": 376 + }, + { + "epoch": 0.12953657174762703, + "grad_norm": 0.2846947908401489, + "learning_rate": 0.0004987319690344358, + "loss": 7.6621, + "step": 377 + }, + { + "epoch": 0.1298801700811751, + "grad_norm": 0.253897100687027, + "learning_rate": 0.000498701624426969, + "loss": 7.6912, + "step": 378 + }, + { + "epoch": 0.1302237684147232, + "grad_norm": 0.21716997027397156, + "learning_rate": 0.0004986709219643136, + "loss": 7.8242, + "step": 379 + }, + { + "epoch": 0.13056736674827127, + "grad_norm": 0.2383873611688614, + "learning_rate": 0.0004986398616906474, + "loss": 7.6825, + "step": 380 + }, + { + "epoch": 0.13091096508181935, + "grad_norm": 0.26393526792526245, + "learning_rate": 0.0004986084436506625, + "loss": 7.6705, + "step": 381 + }, + { + "epoch": 0.13125456341536743, + "grad_norm": 0.22068330645561218, + "learning_rate": 0.0004985766678895665, + "loss": 7.7126, + "step": 382 + }, + { + "epoch": 0.1315981617489155, + "grad_norm": 0.269553542137146, + "learning_rate": 0.000498544534453081, + "loss": 7.8269, + "step": 383 + }, + { + "epoch": 0.1319417600824636, + "grad_norm": 0.38264012336730957, + "learning_rate": 0.0004985120433874429, + "loss": 7.6628, + "step": 384 + }, + { + "epoch": 0.1322853584160117, + "grad_norm": 0.3014664053916931, + "learning_rate": 0.0004984791947394032, + "loss": 7.6818, + "step": 385 + }, + { + "epoch": 0.13262895674955977, + "grad_norm": 0.24440672993659973, + "learning_rate": 0.0004984459885562277, + "loss": 7.6293, + "step": 386 + }, + { + "epoch": 0.13297255508310785, + "grad_norm": 0.2836483418941498, + "learning_rate": 0.0004984124248856964, + "loss": 7.7407, + "step": 387 + }, + { + "epoch": 0.13331615341665592, + "grad_norm": 0.26046499609947205, + "learning_rate": 0.0004983785037761041, + "loss": 7.7502, + "step": 388 + }, + { + "epoch": 0.133659751750204, + "grad_norm": 0.4465503692626953, + "learning_rate": 0.0004983442252762595, + "loss": 7.7441, + "step": 389 + }, + { + "epoch": 0.13400335008375208, + "grad_norm": 0.4302878975868225, + "learning_rate": 0.0004983095894354857, + "loss": 7.7906, + "step": 390 + }, + { + "epoch": 0.1343469484173002, + "grad_norm": 0.5368163585662842, + "learning_rate": 0.0004982745963036201, + "loss": 7.7609, + "step": 391 + }, + { + "epoch": 0.13469054675084827, + "grad_norm": 0.3662016689777374, + "learning_rate": 0.0004982392459310141, + "loss": 7.6962, + "step": 392 + }, + { + "epoch": 0.13503414508439635, + "grad_norm": 0.27907463908195496, + "learning_rate": 0.000498203538368533, + "loss": 7.7156, + "step": 393 + }, + { + "epoch": 0.13537774341794442, + "grad_norm": 0.6360219120979309, + "learning_rate": 0.0004981674736675563, + "loss": 7.8432, + "step": 394 + }, + { + "epoch": 0.1357213417514925, + "grad_norm": 0.38870713114738464, + "learning_rate": 0.0004981310518799772, + "loss": 7.6269, + "step": 395 + }, + { + "epoch": 0.13606494008504058, + "grad_norm": 0.3981183171272278, + "learning_rate": 0.0004980942730582028, + "loss": 7.7014, + "step": 396 + }, + { + "epoch": 0.13640853841858866, + "grad_norm": 0.48505455255508423, + "learning_rate": 0.0004980571372551538, + "loss": 7.8247, + "step": 397 + }, + { + "epoch": 0.13675213675213677, + "grad_norm": 0.3913789391517639, + "learning_rate": 0.0004980196445242651, + "loss": 7.8572, + "step": 398 + }, + { + "epoch": 0.13709573508568484, + "grad_norm": 0.6935904026031494, + "learning_rate": 0.0004979817949194842, + "loss": 7.8824, + "step": 399 + }, + { + "epoch": 0.13743933341923292, + "grad_norm": 0.7566063404083252, + "learning_rate": 0.000497943588495273, + "loss": 7.7401, + "step": 400 + }, + { + "epoch": 0.137782931752781, + "grad_norm": 0.6485404968261719, + "learning_rate": 0.0004979050253066063, + "loss": 7.7275, + "step": 401 + }, + { + "epoch": 0.13812653008632908, + "grad_norm": 1.2425336837768555, + "learning_rate": 0.0004978661054089726, + "loss": 7.6111, + "step": 402 + }, + { + "epoch": 0.13847012841987716, + "grad_norm": 0.5411668419837952, + "learning_rate": 0.0004978268288583733, + "loss": 7.6819, + "step": 403 + }, + { + "epoch": 0.13881372675342524, + "grad_norm": 0.7507022619247437, + "learning_rate": 0.0004977871957113233, + "loss": 7.5208, + "step": 404 + }, + { + "epoch": 0.13915732508697332, + "grad_norm": 0.7834251523017883, + "learning_rate": 0.0004977472060248505, + "loss": 7.6503, + "step": 405 + }, + { + "epoch": 0.13950092342052142, + "grad_norm": 0.37674951553344727, + "learning_rate": 0.0004977068598564957, + "loss": 7.6368, + "step": 406 + }, + { + "epoch": 0.1398445217540695, + "grad_norm": 0.4405427575111389, + "learning_rate": 0.0004976661572643128, + "loss": 7.6453, + "step": 407 + }, + { + "epoch": 0.14018812008761758, + "grad_norm": 0.40070560574531555, + "learning_rate": 0.0004976250983068687, + "loss": 7.6718, + "step": 408 + }, + { + "epoch": 0.14053171842116566, + "grad_norm": 0.49566248059272766, + "learning_rate": 0.0004975836830432425, + "loss": 7.6901, + "step": 409 + }, + { + "epoch": 0.14087531675471374, + "grad_norm": 0.5421594381332397, + "learning_rate": 0.0004975419115330267, + "loss": 7.6493, + "step": 410 + }, + { + "epoch": 0.14121891508826181, + "grad_norm": 0.33900225162506104, + "learning_rate": 0.0004974997838363258, + "loss": 7.6087, + "step": 411 + }, + { + "epoch": 0.1415625134218099, + "grad_norm": 0.32626527547836304, + "learning_rate": 0.0004974573000137572, + "loss": 7.6064, + "step": 412 + }, + { + "epoch": 0.141906111755358, + "grad_norm": 0.25622686743736267, + "learning_rate": 0.0004974144601264507, + "loss": 7.6272, + "step": 413 + }, + { + "epoch": 0.14224971008890608, + "grad_norm": 0.26424112915992737, + "learning_rate": 0.0004973712642360481, + "loss": 7.6451, + "step": 414 + }, + { + "epoch": 0.14259330842245416, + "grad_norm": 0.3264583647251129, + "learning_rate": 0.0004973277124047039, + "loss": 7.6348, + "step": 415 + }, + { + "epoch": 0.14293690675600224, + "grad_norm": 0.5043911933898926, + "learning_rate": 0.0004972838046950844, + "loss": 7.6353, + "step": 416 + }, + { + "epoch": 0.14328050508955031, + "grad_norm": 0.4086906313896179, + "learning_rate": 0.0004972395411703682, + "loss": 7.5676, + "step": 417 + }, + { + "epoch": 0.1436241034230984, + "grad_norm": 0.550713300704956, + "learning_rate": 0.0004971949218942459, + "loss": 7.6814, + "step": 418 + }, + { + "epoch": 0.14396770175664647, + "grad_norm": 0.44711998105049133, + "learning_rate": 0.0004971499469309196, + "loss": 7.5975, + "step": 419 + }, + { + "epoch": 0.14431130009019455, + "grad_norm": 0.5666840076446533, + "learning_rate": 0.0004971046163451039, + "loss": 7.5721, + "step": 420 + }, + { + "epoch": 0.14465489842374266, + "grad_norm": 0.8350973725318909, + "learning_rate": 0.0004970589302020244, + "loss": 7.5351, + "step": 421 + }, + { + "epoch": 0.14499849675729073, + "grad_norm": 0.22808456420898438, + "learning_rate": 0.0004970128885674188, + "loss": 7.6153, + "step": 422 + }, + { + "epoch": 0.1453420950908388, + "grad_norm": 0.7204859852790833, + "learning_rate": 0.0004969664915075358, + "loss": 7.5626, + "step": 423 + }, + { + "epoch": 0.1456856934243869, + "grad_norm": 0.34824058413505554, + "learning_rate": 0.0004969197390891361, + "loss": 7.6858, + "step": 424 + }, + { + "epoch": 0.14602929175793497, + "grad_norm": 0.4266470968723297, + "learning_rate": 0.0004968726313794914, + "loss": 7.6733, + "step": 425 + }, + { + "epoch": 0.14637289009148305, + "grad_norm": 0.42429882287979126, + "learning_rate": 0.0004968251684463847, + "loss": 7.6719, + "step": 426 + }, + { + "epoch": 0.14671648842503113, + "grad_norm": 0.3132624626159668, + "learning_rate": 0.0004967773503581101, + "loss": 7.7054, + "step": 427 + }, + { + "epoch": 0.14706008675857923, + "grad_norm": 0.5525080561637878, + "learning_rate": 0.0004967291771834727, + "loss": 7.5684, + "step": 428 + }, + { + "epoch": 0.1474036850921273, + "grad_norm": 0.3956605792045593, + "learning_rate": 0.0004966806489917886, + "loss": 7.5842, + "step": 429 + }, + { + "epoch": 0.1477472834256754, + "grad_norm": 0.5083109140396118, + "learning_rate": 0.0004966317658528847, + "loss": 7.5118, + "step": 430 + }, + { + "epoch": 0.14809088175922347, + "grad_norm": 0.4539259076118469, + "learning_rate": 0.0004965825278370987, + "loss": 7.5676, + "step": 431 + }, + { + "epoch": 0.14843448009277155, + "grad_norm": 0.6377333998680115, + "learning_rate": 0.0004965329350152788, + "loss": 7.6241, + "step": 432 + }, + { + "epoch": 0.14877807842631963, + "grad_norm": 0.32773837447166443, + "learning_rate": 0.0004964829874587838, + "loss": 7.579, + "step": 433 + }, + { + "epoch": 0.1491216767598677, + "grad_norm": 0.4683372378349304, + "learning_rate": 0.0004964326852394829, + "loss": 7.567, + "step": 434 + }, + { + "epoch": 0.14946527509341578, + "grad_norm": 0.5232347249984741, + "learning_rate": 0.0004963820284297558, + "loss": 7.554, + "step": 435 + }, + { + "epoch": 0.1498088734269639, + "grad_norm": 0.6766787171363831, + "learning_rate": 0.0004963310171024921, + "loss": 7.6316, + "step": 436 + }, + { + "epoch": 0.15015247176051197, + "grad_norm": 0.33900943398475647, + "learning_rate": 0.0004962796513310917, + "loss": 7.6034, + "step": 437 + }, + { + "epoch": 0.15049607009406005, + "grad_norm": 0.4721566140651703, + "learning_rate": 0.0004962279311894644, + "loss": 7.6699, + "step": 438 + }, + { + "epoch": 0.15083966842760813, + "grad_norm": 0.399872750043869, + "learning_rate": 0.0004961758567520302, + "loss": 7.6539, + "step": 439 + }, + { + "epoch": 0.1511832667611562, + "grad_norm": 0.4169398248195648, + "learning_rate": 0.0004961234280937185, + "loss": 7.6788, + "step": 440 + }, + { + "epoch": 0.15152686509470428, + "grad_norm": 0.4548245668411255, + "learning_rate": 0.0004960706452899687, + "loss": 7.6121, + "step": 441 + }, + { + "epoch": 0.15187046342825236, + "grad_norm": 0.7090252041816711, + "learning_rate": 0.0004960175084167296, + "loss": 7.5421, + "step": 442 + }, + { + "epoch": 0.15221406176180047, + "grad_norm": 0.431866854429245, + "learning_rate": 0.0004959640175504593, + "loss": 7.5093, + "step": 443 + }, + { + "epoch": 0.15255766009534855, + "grad_norm": 0.40838518738746643, + "learning_rate": 0.0004959101727681258, + "loss": 7.6853, + "step": 444 + }, + { + "epoch": 0.15290125842889662, + "grad_norm": 0.9189926981925964, + "learning_rate": 0.0004958559741472058, + "loss": 7.6081, + "step": 445 + }, + { + "epoch": 0.1532448567624447, + "grad_norm": 0.522569477558136, + "learning_rate": 0.0004958014217656855, + "loss": 7.5748, + "step": 446 + }, + { + "epoch": 0.15358845509599278, + "grad_norm": 0.9465051889419556, + "learning_rate": 0.0004957465157020598, + "loss": 7.6038, + "step": 447 + }, + { + "epoch": 0.15393205342954086, + "grad_norm": 0.7611620426177979, + "learning_rate": 0.0004956912560353327, + "loss": 7.6636, + "step": 448 + }, + { + "epoch": 0.15427565176308894, + "grad_norm": 0.6408162117004395, + "learning_rate": 0.0004956356428450171, + "loss": 7.6456, + "step": 449 + }, + { + "epoch": 0.15461925009663702, + "grad_norm": 0.8566272854804993, + "learning_rate": 0.0004955796762111345, + "loss": 7.6224, + "step": 450 + }, + { + "epoch": 0.15496284843018512, + "grad_norm": 0.9389132857322693, + "learning_rate": 0.0004955233562142148, + "loss": 7.5777, + "step": 451 + }, + { + "epoch": 0.1553064467637332, + "grad_norm": 1.3984068632125854, + "learning_rate": 0.0004954666829352966, + "loss": 7.4553, + "step": 452 + }, + { + "epoch": 0.15565004509728128, + "grad_norm": 0.6359997987747192, + "learning_rate": 0.0004954096564559267, + "loss": 7.5531, + "step": 453 + }, + { + "epoch": 0.15599364343082936, + "grad_norm": 1.2158923149108887, + "learning_rate": 0.00049535227685816, + "loss": 7.519, + "step": 454 + }, + { + "epoch": 0.15633724176437744, + "grad_norm": 0.7679694890975952, + "learning_rate": 0.0004952945442245598, + "loss": 7.5488, + "step": 455 + }, + { + "epoch": 0.15668084009792552, + "grad_norm": 1.019667387008667, + "learning_rate": 0.0004952364586381971, + "loss": 7.5456, + "step": 456 + }, + { + "epoch": 0.1570244384314736, + "grad_norm": 0.9500595927238464, + "learning_rate": 0.000495178020182651, + "loss": 7.4402, + "step": 457 + }, + { + "epoch": 0.1573680367650217, + "grad_norm": 0.764927089214325, + "learning_rate": 0.0004951192289420082, + "loss": 7.5489, + "step": 458 + }, + { + "epoch": 0.15771163509856978, + "grad_norm": 0.689294159412384, + "learning_rate": 0.0004950600850008629, + "loss": 7.3963, + "step": 459 + }, + { + "epoch": 0.15805523343211786, + "grad_norm": 0.9121485948562622, + "learning_rate": 0.0004950005884443171, + "loss": 7.3951, + "step": 460 + }, + { + "epoch": 0.15839883176566594, + "grad_norm": 0.3724419176578522, + "learning_rate": 0.00049494073935798, + "loss": 7.52, + "step": 461 + }, + { + "epoch": 0.15874243009921402, + "grad_norm": 1.2896360158920288, + "learning_rate": 0.0004948805378279681, + "loss": 7.5127, + "step": 462 + }, + { + "epoch": 0.1590860284327621, + "grad_norm": 0.5851932168006897, + "learning_rate": 0.0004948199839409047, + "loss": 7.4566, + "step": 463 + }, + { + "epoch": 0.15942962676631017, + "grad_norm": 0.8496333360671997, + "learning_rate": 0.0004947590777839209, + "loss": 7.4376, + "step": 464 + }, + { + "epoch": 0.15977322509985828, + "grad_norm": 0.40505436062812805, + "learning_rate": 0.0004946978194446538, + "loss": 7.5951, + "step": 465 + }, + { + "epoch": 0.16011682343340636, + "grad_norm": 0.7311084866523743, + "learning_rate": 0.0004946362090112479, + "loss": 7.5839, + "step": 466 + }, + { + "epoch": 0.16046042176695444, + "grad_norm": 0.3588482737541199, + "learning_rate": 0.0004945742465723537, + "loss": 7.4579, + "step": 467 + }, + { + "epoch": 0.16080402010050251, + "grad_norm": 0.5303255319595337, + "learning_rate": 0.000494511932217129, + "loss": 7.4116, + "step": 468 + }, + { + "epoch": 0.1611476184340506, + "grad_norm": 0.5300981998443604, + "learning_rate": 0.0004944492660352371, + "loss": 7.3889, + "step": 469 + }, + { + "epoch": 0.16149121676759867, + "grad_norm": 0.7826660871505737, + "learning_rate": 0.0004943862481168483, + "loss": 7.4045, + "step": 470 + }, + { + "epoch": 0.16183481510114675, + "grad_norm": 0.5411097407341003, + "learning_rate": 0.0004943228785526386, + "loss": 7.4755, + "step": 471 + }, + { + "epoch": 0.16217841343469483, + "grad_norm": 0.4609074294567108, + "learning_rate": 0.00049425915743379, + "loss": 7.3915, + "step": 472 + }, + { + "epoch": 0.16252201176824294, + "grad_norm": 0.6618660688400269, + "learning_rate": 0.0004941950848519904, + "loss": 7.494, + "step": 473 + }, + { + "epoch": 0.16286561010179101, + "grad_norm": 0.4540770351886749, + "learning_rate": 0.0004941306608994336, + "loss": 7.4365, + "step": 474 + }, + { + "epoch": 0.1632092084353391, + "grad_norm": 0.29459720849990845, + "learning_rate": 0.0004940658856688185, + "loss": 7.4786, + "step": 475 + }, + { + "epoch": 0.16355280676888717, + "grad_norm": 0.38014736771583557, + "learning_rate": 0.00049400075925335, + "loss": 7.5007, + "step": 476 + }, + { + "epoch": 0.16389640510243525, + "grad_norm": 0.3502187728881836, + "learning_rate": 0.0004939352817467382, + "loss": 7.3697, + "step": 477 + }, + { + "epoch": 0.16424000343598333, + "grad_norm": 0.539600133895874, + "learning_rate": 0.0004938694532431979, + "loss": 7.4328, + "step": 478 + }, + { + "epoch": 0.1645836017695314, + "grad_norm": 0.25308141112327576, + "learning_rate": 0.0004938032738374497, + "loss": 7.524, + "step": 479 + }, + { + "epoch": 0.1649272001030795, + "grad_norm": 0.5650960206985474, + "learning_rate": 0.0004937367436247186, + "loss": 7.5126, + "step": 480 + }, + { + "epoch": 0.1652707984366276, + "grad_norm": 0.27598950266838074, + "learning_rate": 0.0004936698627007343, + "loss": 7.4391, + "step": 481 + }, + { + "epoch": 0.16561439677017567, + "grad_norm": 0.4361298382282257, + "learning_rate": 0.0004936026311617315, + "loss": 7.5193, + "step": 482 + }, + { + "epoch": 0.16595799510372375, + "grad_norm": 0.3199003040790558, + "learning_rate": 0.0004935350491044493, + "loss": 7.4871, + "step": 483 + }, + { + "epoch": 0.16630159343727183, + "grad_norm": 0.4775875210762024, + "learning_rate": 0.000493467116626131, + "loss": 7.4474, + "step": 484 + }, + { + "epoch": 0.1666451917708199, + "grad_norm": 0.6670292019844055, + "learning_rate": 0.0004933988338245242, + "loss": 7.4594, + "step": 485 + }, + { + "epoch": 0.16698879010436798, + "grad_norm": 0.3038119375705719, + "learning_rate": 0.0004933302007978807, + "loss": 7.4544, + "step": 486 + }, + { + "epoch": 0.16733238843791606, + "grad_norm": 0.6222366690635681, + "learning_rate": 0.0004932612176449559, + "loss": 7.4342, + "step": 487 + }, + { + "epoch": 0.16767598677146417, + "grad_norm": 0.3004700541496277, + "learning_rate": 0.0004931918844650095, + "loss": 7.4134, + "step": 488 + }, + { + "epoch": 0.16801958510501225, + "grad_norm": 0.44555893540382385, + "learning_rate": 0.0004931222013578045, + "loss": 7.4252, + "step": 489 + }, + { + "epoch": 0.16836318343856033, + "grad_norm": 0.3072405159473419, + "learning_rate": 0.0004930521684236073, + "loss": 7.4277, + "step": 490 + }, + { + "epoch": 0.1687067817721084, + "grad_norm": 0.8164499998092651, + "learning_rate": 0.000492981785763188, + "loss": 7.4174, + "step": 491 + }, + { + "epoch": 0.16905038010565648, + "grad_norm": 0.6826271414756775, + "learning_rate": 0.0004929110534778197, + "loss": 7.4227, + "step": 492 + }, + { + "epoch": 0.16939397843920456, + "grad_norm": 0.6980798244476318, + "learning_rate": 0.0004928399716692787, + "loss": 7.3474, + "step": 493 + }, + { + "epoch": 0.16973757677275264, + "grad_norm": 0.6725314855575562, + "learning_rate": 0.0004927685404398441, + "loss": 7.5103, + "step": 494 + }, + { + "epoch": 0.17008117510630075, + "grad_norm": 0.38431742787361145, + "learning_rate": 0.000492696759892298, + "loss": 7.4468, + "step": 495 + }, + { + "epoch": 0.17042477343984883, + "grad_norm": 0.41020020842552185, + "learning_rate": 0.0004926246301299247, + "loss": 7.48, + "step": 496 + }, + { + "epoch": 0.1707683717733969, + "grad_norm": 0.5806956887245178, + "learning_rate": 0.0004925521512565114, + "loss": 7.4053, + "step": 497 + }, + { + "epoch": 0.17111197010694498, + "grad_norm": 0.7412662506103516, + "learning_rate": 0.0004924793233763476, + "loss": 7.329, + "step": 498 + }, + { + "epoch": 0.17145556844049306, + "grad_norm": 0.5328527092933655, + "learning_rate": 0.0004924061465942247, + "loss": 7.4516, + "step": 499 + }, + { + "epoch": 0.17179916677404114, + "grad_norm": 1.086364507675171, + "learning_rate": 0.0004923326210154364, + "loss": 7.6593, + "step": 500 + }, + { + "epoch": 0.17214276510758922, + "grad_norm": 0.6880685091018677, + "learning_rate": 0.0004922587467457781, + "loss": 7.3688, + "step": 501 + }, + { + "epoch": 0.1724863634411373, + "grad_norm": 1.1379863023757935, + "learning_rate": 0.0004921845238915472, + "loss": 7.3837, + "step": 502 + }, + { + "epoch": 0.1728299617746854, + "grad_norm": 0.8507835268974304, + "learning_rate": 0.0004921099525595423, + "loss": 7.339, + "step": 503 + }, + { + "epoch": 0.17317356010823348, + "grad_norm": 1.0241001844406128, + "learning_rate": 0.0004920350328570638, + "loss": 7.2608, + "step": 504 + }, + { + "epoch": 0.17351715844178156, + "grad_norm": 0.6690787672996521, + "learning_rate": 0.000491959764891913, + "loss": 7.3628, + "step": 505 + }, + { + "epoch": 0.17386075677532964, + "grad_norm": 0.8972269892692566, + "learning_rate": 0.0004918841487723926, + "loss": 7.2917, + "step": 506 + }, + { + "epoch": 0.17420435510887772, + "grad_norm": 0.33962926268577576, + "learning_rate": 0.0004918081846073059, + "loss": 7.4265, + "step": 507 + }, + { + "epoch": 0.1745479534424258, + "grad_norm": 0.9010359048843384, + "learning_rate": 0.0004917318725059577, + "loss": 7.3324, + "step": 508 + }, + { + "epoch": 0.17489155177597387, + "grad_norm": 0.42200517654418945, + "learning_rate": 0.0004916552125781528, + "loss": 7.3756, + "step": 509 + }, + { + "epoch": 0.17523515010952198, + "grad_norm": 0.8156759738922119, + "learning_rate": 0.0004915782049341967, + "loss": 7.2236, + "step": 510 + }, + { + "epoch": 0.17557874844307006, + "grad_norm": 0.3509763181209564, + "learning_rate": 0.0004915008496848951, + "loss": 7.2068, + "step": 511 + }, + { + "epoch": 0.17592234677661814, + "grad_norm": 0.7369383573532104, + "learning_rate": 0.000491423146941554, + "loss": 7.2954, + "step": 512 + }, + { + "epoch": 0.17626594511016622, + "grad_norm": 0.5369434356689453, + "learning_rate": 0.0004913450968159794, + "loss": 7.1138, + "step": 513 + }, + { + "epoch": 0.1766095434437143, + "grad_norm": 0.64871746301651, + "learning_rate": 0.0004912666994204773, + "loss": 7.2767, + "step": 514 + }, + { + "epoch": 0.17695314177726237, + "grad_norm": 0.5261757969856262, + "learning_rate": 0.0004911879548678531, + "loss": 7.2552, + "step": 515 + }, + { + "epoch": 0.17729674011081045, + "grad_norm": 0.8654578328132629, + "learning_rate": 0.0004911088632714117, + "loss": 7.4006, + "step": 516 + }, + { + "epoch": 0.17764033844435856, + "grad_norm": 0.7231168746948242, + "learning_rate": 0.0004910294247449576, + "loss": 7.2783, + "step": 517 + }, + { + "epoch": 0.17798393677790664, + "grad_norm": 0.6039850115776062, + "learning_rate": 0.0004909496394027945, + "loss": 7.3449, + "step": 518 + }, + { + "epoch": 0.17832753511145472, + "grad_norm": 0.5625784993171692, + "learning_rate": 0.0004908695073597249, + "loss": 7.1675, + "step": 519 + }, + { + "epoch": 0.1786711334450028, + "grad_norm": 0.39175331592559814, + "learning_rate": 0.0004907890287310504, + "loss": 7.3169, + "step": 520 + }, + { + "epoch": 0.17901473177855087, + "grad_norm": 0.6726620197296143, + "learning_rate": 0.000490708203632571, + "loss": 7.3171, + "step": 521 + }, + { + "epoch": 0.17935833011209895, + "grad_norm": 0.36819761991500854, + "learning_rate": 0.0004906270321805854, + "loss": 7.2229, + "step": 522 + }, + { + "epoch": 0.17970192844564703, + "grad_norm": 0.8770174384117126, + "learning_rate": 0.000490545514491891, + "loss": 7.2737, + "step": 523 + }, + { + "epoch": 0.1800455267791951, + "grad_norm": 0.3748573660850525, + "learning_rate": 0.0004904636506837828, + "loss": 7.2918, + "step": 524 + }, + { + "epoch": 0.18038912511274321, + "grad_norm": 1.1249299049377441, + "learning_rate": 0.0004903814408740543, + "loss": 7.2435, + "step": 525 + }, + { + "epoch": 0.1807327234462913, + "grad_norm": 0.3472743034362793, + "learning_rate": 0.0004902988851809965, + "loss": 7.3431, + "step": 526 + }, + { + "epoch": 0.18107632177983937, + "grad_norm": 0.8709349632263184, + "learning_rate": 0.0004902159837233984, + "loss": 7.3518, + "step": 527 + }, + { + "epoch": 0.18141992011338745, + "grad_norm": 0.818452775478363, + "learning_rate": 0.0004901327366205464, + "loss": 7.353, + "step": 528 + }, + { + "epoch": 0.18176351844693553, + "grad_norm": 0.6798619031906128, + "learning_rate": 0.000490049143992224, + "loss": 7.2756, + "step": 529 + }, + { + "epoch": 0.1821071167804836, + "grad_norm": 0.7426633238792419, + "learning_rate": 0.0004899652059587123, + "loss": 7.2484, + "step": 530 + }, + { + "epoch": 0.1824507151140317, + "grad_norm": 0.5532476305961609, + "learning_rate": 0.0004898809226407892, + "loss": 7.3746, + "step": 531 + }, + { + "epoch": 0.1827943134475798, + "grad_norm": 0.7635505795478821, + "learning_rate": 0.0004897962941597294, + "loss": 7.2973, + "step": 532 + }, + { + "epoch": 0.18313791178112787, + "grad_norm": 0.6389451622962952, + "learning_rate": 0.0004897113206373042, + "loss": 7.2043, + "step": 533 + }, + { + "epoch": 0.18348151011467595, + "grad_norm": 0.7844037413597107, + "learning_rate": 0.0004896260021957816, + "loss": 7.2125, + "step": 534 + }, + { + "epoch": 0.18382510844822403, + "grad_norm": 0.4252033829689026, + "learning_rate": 0.0004895403389579258, + "loss": 7.3679, + "step": 535 + }, + { + "epoch": 0.1841687067817721, + "grad_norm": 0.8579635620117188, + "learning_rate": 0.0004894543310469967, + "loss": 7.2665, + "step": 536 + }, + { + "epoch": 0.18451230511532019, + "grad_norm": 0.4492959678173065, + "learning_rate": 0.0004893679785867511, + "loss": 7.262, + "step": 537 + }, + { + "epoch": 0.18485590344886826, + "grad_norm": 0.6443487405776978, + "learning_rate": 0.0004892812817014407, + "loss": 7.3039, + "step": 538 + }, + { + "epoch": 0.18519950178241634, + "grad_norm": 0.35954129695892334, + "learning_rate": 0.000489194240515813, + "loss": 7.28, + "step": 539 + }, + { + "epoch": 0.18554310011596445, + "grad_norm": 0.5164822340011597, + "learning_rate": 0.0004891068551551112, + "loss": 7.4009, + "step": 540 + }, + { + "epoch": 0.18588669844951253, + "grad_norm": 0.38150545954704285, + "learning_rate": 0.0004890191257450736, + "loss": 7.3082, + "step": 541 + }, + { + "epoch": 0.1862302967830606, + "grad_norm": 0.7000799179077148, + "learning_rate": 0.0004889310524119331, + "loss": 7.3069, + "step": 542 + }, + { + "epoch": 0.18657389511660868, + "grad_norm": 0.45130738615989685, + "learning_rate": 0.0004888426352824184, + "loss": 7.3513, + "step": 543 + }, + { + "epoch": 0.18691749345015676, + "grad_norm": 0.7859145402908325, + "learning_rate": 0.000488753874483752, + "loss": 7.3586, + "step": 544 + }, + { + "epoch": 0.18726109178370484, + "grad_norm": 0.426577091217041, + "learning_rate": 0.0004886647701436513, + "loss": 7.3083, + "step": 545 + }, + { + "epoch": 0.18760469011725292, + "grad_norm": 0.41109466552734375, + "learning_rate": 0.0004885753223903281, + "loss": 7.3997, + "step": 546 + }, + { + "epoch": 0.18794828845080103, + "grad_norm": 0.552748441696167, + "learning_rate": 0.0004884855313524879, + "loss": 7.3883, + "step": 547 + }, + { + "epoch": 0.1882918867843491, + "grad_norm": 0.7718663811683655, + "learning_rate": 0.0004883953971593308, + "loss": 7.3032, + "step": 548 + }, + { + "epoch": 0.18863548511789718, + "grad_norm": 0.4850485622882843, + "learning_rate": 0.0004883049199405501, + "loss": 7.3862, + "step": 549 + }, + { + "epoch": 0.18897908345144526, + "grad_norm": 0.8388044238090515, + "learning_rate": 0.0004882140998263331, + "loss": 7.4342, + "step": 550 + }, + { + "epoch": 0.18932268178499334, + "grad_norm": 1.0273692607879639, + "learning_rate": 0.0004881229369473601, + "loss": 7.1913, + "step": 551 + }, + { + "epoch": 0.18966628011854142, + "grad_norm": 0.8978946208953857, + "learning_rate": 0.0004880314314348048, + "loss": 7.2436, + "step": 552 + }, + { + "epoch": 0.1900098784520895, + "grad_norm": 0.9823456406593323, + "learning_rate": 0.000487939583420334, + "loss": 7.2297, + "step": 553 + }, + { + "epoch": 0.19035347678563758, + "grad_norm": 1.137190580368042, + "learning_rate": 0.00048784739303610715, + "loss": 7.2173, + "step": 554 + }, + { + "epoch": 0.19069707511918568, + "grad_norm": 0.842155396938324, + "learning_rate": 0.00048775486041477645, + "loss": 7.1943, + "step": 555 + }, + { + "epoch": 0.19104067345273376, + "grad_norm": 0.9378860592842102, + "learning_rate": 0.0004876619856894864, + "loss": 7.1498, + "step": 556 + }, + { + "epoch": 0.19138427178628184, + "grad_norm": 0.8283404111862183, + "learning_rate": 0.000487568768993874, + "loss": 7.1507, + "step": 557 + }, + { + "epoch": 0.19172787011982992, + "grad_norm": 0.8430905342102051, + "learning_rate": 0.0004874752104620681, + "loss": 7.2152, + "step": 558 + }, + { + "epoch": 0.192071468453378, + "grad_norm": 0.5364805459976196, + "learning_rate": 0.00048738131022868947, + "loss": 7.2251, + "step": 559 + }, + { + "epoch": 0.19241506678692608, + "grad_norm": 0.550553023815155, + "learning_rate": 0.0004872870684288505, + "loss": 7.2291, + "step": 560 + }, + { + "epoch": 0.19275866512047415, + "grad_norm": 0.5642486810684204, + "learning_rate": 0.0004871924851981553, + "loss": 7.2261, + "step": 561 + }, + { + "epoch": 0.19310226345402226, + "grad_norm": 0.4002983868122101, + "learning_rate": 0.00048709756067269884, + "loss": 7.0912, + "step": 562 + }, + { + "epoch": 0.19344586178757034, + "grad_norm": 0.5690410137176514, + "learning_rate": 0.0004870022949890676, + "loss": 7.1634, + "step": 563 + }, + { + "epoch": 0.19378946012111842, + "grad_norm": 0.48423007130622864, + "learning_rate": 0.0004869066882843387, + "loss": 7.1851, + "step": 564 + }, + { + "epoch": 0.1941330584546665, + "grad_norm": 0.7870746850967407, + "learning_rate": 0.00048681074069608006, + "loss": 7.0857, + "step": 565 + }, + { + "epoch": 0.19447665678821457, + "grad_norm": 0.3157571852207184, + "learning_rate": 0.00048671445236234996, + "loss": 7.2261, + "step": 566 + }, + { + "epoch": 0.19482025512176265, + "grad_norm": 0.7632848024368286, + "learning_rate": 0.00048661782342169715, + "loss": 7.2091, + "step": 567 + }, + { + "epoch": 0.19516385345531073, + "grad_norm": 0.4711115062236786, + "learning_rate": 0.00048652085401316037, + "loss": 7.2743, + "step": 568 + }, + { + "epoch": 0.1955074517888588, + "grad_norm": 0.5823638439178467, + "learning_rate": 0.00048642354427626836, + "loss": 7.204, + "step": 569 + }, + { + "epoch": 0.19585105012240692, + "grad_norm": 0.6164575815200806, + "learning_rate": 0.00048632589435103937, + "loss": 7.1551, + "step": 570 + }, + { + "epoch": 0.196194648455955, + "grad_norm": 0.4023503363132477, + "learning_rate": 0.0004862279043779813, + "loss": 7.1331, + "step": 571 + }, + { + "epoch": 0.19653824678950307, + "grad_norm": 0.6359233856201172, + "learning_rate": 0.00048612957449809137, + "loss": 7.0852, + "step": 572 + }, + { + "epoch": 0.19688184512305115, + "grad_norm": 0.5544841289520264, + "learning_rate": 0.00048603090485285565, + "loss": 7.171, + "step": 573 + }, + { + "epoch": 0.19722544345659923, + "grad_norm": 0.6806702613830566, + "learning_rate": 0.00048593189558424944, + "loss": 7.154, + "step": 574 + }, + { + "epoch": 0.1975690417901473, + "grad_norm": 0.5554197430610657, + "learning_rate": 0.00048583254683473657, + "loss": 7.1088, + "step": 575 + }, + { + "epoch": 0.1979126401236954, + "grad_norm": 0.5139564871788025, + "learning_rate": 0.0004857328587472691, + "loss": 7.1426, + "step": 576 + }, + { + "epoch": 0.1982562384572435, + "grad_norm": 0.7411857843399048, + "learning_rate": 0.00048563283146528774, + "loss": 7.227, + "step": 577 + }, + { + "epoch": 0.19859983679079157, + "grad_norm": 0.533267617225647, + "learning_rate": 0.00048553246513272113, + "loss": 7.1424, + "step": 578 + }, + { + "epoch": 0.19894343512433965, + "grad_norm": 0.5299692749977112, + "learning_rate": 0.0004854317598939857, + "loss": 7.0561, + "step": 579 + }, + { + "epoch": 0.19928703345788773, + "grad_norm": 0.6946874856948853, + "learning_rate": 0.00048533071589398566, + "loss": 7.0598, + "step": 580 + }, + { + "epoch": 0.1996306317914358, + "grad_norm": 0.42671942710876465, + "learning_rate": 0.00048522933327811246, + "loss": 7.1416, + "step": 581 + }, + { + "epoch": 0.1999742301249839, + "grad_norm": 0.4139076769351959, + "learning_rate": 0.00048512761219224494, + "loss": 7.1578, + "step": 582 + }, + { + "epoch": 0.20031782845853197, + "grad_norm": 0.40493038296699524, + "learning_rate": 0.000485025552782749, + "loss": 7.1914, + "step": 583 + }, + { + "epoch": 0.20066142679208007, + "grad_norm": 0.4322982430458069, + "learning_rate": 0.0004849231551964771, + "loss": 7.1429, + "step": 584 + }, + { + "epoch": 0.20100502512562815, + "grad_norm": 0.5229026079177856, + "learning_rate": 0.0004848204195807687, + "loss": 7.1781, + "step": 585 + }, + { + "epoch": 0.20134862345917623, + "grad_norm": 0.3657551407814026, + "learning_rate": 0.00048471734608344924, + "loss": 7.1775, + "step": 586 + }, + { + "epoch": 0.2016922217927243, + "grad_norm": 0.6244291067123413, + "learning_rate": 0.0004846139348528307, + "loss": 7.234, + "step": 587 + }, + { + "epoch": 0.2020358201262724, + "grad_norm": 0.4564877450466156, + "learning_rate": 0.00048451018603771064, + "loss": 7.092, + "step": 588 + }, + { + "epoch": 0.20237941845982046, + "grad_norm": 0.3984292447566986, + "learning_rate": 0.0004844060997873727, + "loss": 7.2266, + "step": 589 + }, + { + "epoch": 0.20272301679336854, + "grad_norm": 0.4138945937156677, + "learning_rate": 0.00048430167625158595, + "loss": 7.1706, + "step": 590 + }, + { + "epoch": 0.20306661512691662, + "grad_norm": 0.5082581043243408, + "learning_rate": 0.00048419691558060466, + "loss": 7.0813, + "step": 591 + }, + { + "epoch": 0.20341021346046473, + "grad_norm": 0.48261651396751404, + "learning_rate": 0.0004840918179251683, + "loss": 7.1004, + "step": 592 + }, + { + "epoch": 0.2037538117940128, + "grad_norm": 0.4194554388523102, + "learning_rate": 0.0004839863834365013, + "loss": 7.0886, + "step": 593 + }, + { + "epoch": 0.20409741012756089, + "grad_norm": 0.5701256394386292, + "learning_rate": 0.00048388061226631264, + "loss": 7.188, + "step": 594 + }, + { + "epoch": 0.20444100846110896, + "grad_norm": 0.37624257802963257, + "learning_rate": 0.0004837745045667957, + "loss": 7.1977, + "step": 595 + }, + { + "epoch": 0.20478460679465704, + "grad_norm": 0.4813244640827179, + "learning_rate": 0.0004836680604906284, + "loss": 7.1103, + "step": 596 + }, + { + "epoch": 0.20512820512820512, + "grad_norm": 0.7968626022338867, + "learning_rate": 0.00048356128019097223, + "loss": 7.1366, + "step": 597 + }, + { + "epoch": 0.2054718034617532, + "grad_norm": 0.6953873038291931, + "learning_rate": 0.0004834541638214727, + "loss": 7.3475, + "step": 598 + }, + { + "epoch": 0.2058154017953013, + "grad_norm": 0.7357029318809509, + "learning_rate": 0.00048334671153625895, + "loss": 7.1789, + "step": 599 + }, + { + "epoch": 0.20615900012884938, + "grad_norm": 0.9581784605979919, + "learning_rate": 0.00048323892348994335, + "loss": 7.2079, + "step": 600 + }, + { + "epoch": 0.20650259846239746, + "grad_norm": 0.8259401917457581, + "learning_rate": 0.00048313079983762155, + "loss": 7.1046, + "step": 601 + }, + { + "epoch": 0.20684619679594554, + "grad_norm": 0.6607183814048767, + "learning_rate": 0.00048302234073487185, + "loss": 7.0981, + "step": 602 + }, + { + "epoch": 0.20718979512949362, + "grad_norm": 0.5796660780906677, + "learning_rate": 0.0004829135463377553, + "loss": 7.1793, + "step": 603 + }, + { + "epoch": 0.2075333934630417, + "grad_norm": 0.7962479591369629, + "learning_rate": 0.00048280441680281566, + "loss": 7.0865, + "step": 604 + }, + { + "epoch": 0.20787699179658978, + "grad_norm": 0.48541417717933655, + "learning_rate": 0.0004826949522870786, + "loss": 7.0426, + "step": 605 + }, + { + "epoch": 0.20822059013013786, + "grad_norm": 0.4835425913333893, + "learning_rate": 0.00048258515294805207, + "loss": 7.0629, + "step": 606 + }, + { + "epoch": 0.20856418846368596, + "grad_norm": 0.8537253737449646, + "learning_rate": 0.00048247501894372534, + "loss": 7.1095, + "step": 607 + }, + { + "epoch": 0.20890778679723404, + "grad_norm": 0.43013009428977966, + "learning_rate": 0.0004823645504325699, + "loss": 7.0569, + "step": 608 + }, + { + "epoch": 0.20925138513078212, + "grad_norm": 0.4540230929851532, + "learning_rate": 0.0004822537475735379, + "loss": 7.1163, + "step": 609 + }, + { + "epoch": 0.2095949834643302, + "grad_norm": 0.5654716491699219, + "learning_rate": 0.00048214261052606294, + "loss": 7.0631, + "step": 610 + }, + { + "epoch": 0.20993858179787828, + "grad_norm": 0.6178705096244812, + "learning_rate": 0.00048203113945005947, + "loss": 7.0843, + "step": 611 + }, + { + "epoch": 0.21028218013142636, + "grad_norm": 0.43849003314971924, + "learning_rate": 0.00048191933450592256, + "loss": 7.0351, + "step": 612 + }, + { + "epoch": 0.21062577846497443, + "grad_norm": 0.48836883902549744, + "learning_rate": 0.00048180719585452753, + "loss": 7.0211, + "step": 613 + }, + { + "epoch": 0.21096937679852254, + "grad_norm": 0.5587930679321289, + "learning_rate": 0.00048169472365723, + "loss": 7.0268, + "step": 614 + }, + { + "epoch": 0.21131297513207062, + "grad_norm": 0.41739344596862793, + "learning_rate": 0.00048158191807586546, + "loss": 6.9506, + "step": 615 + }, + { + "epoch": 0.2116565734656187, + "grad_norm": 0.4349384009838104, + "learning_rate": 0.0004814687792727493, + "loss": 7.0376, + "step": 616 + }, + { + "epoch": 0.21200017179916678, + "grad_norm": 0.6397560834884644, + "learning_rate": 0.00048135530741067606, + "loss": 7.0313, + "step": 617 + }, + { + "epoch": 0.21234377013271485, + "grad_norm": 0.36318764090538025, + "learning_rate": 0.00048124150265291976, + "loss": 7.0587, + "step": 618 + }, + { + "epoch": 0.21268736846626293, + "grad_norm": 0.7137599587440491, + "learning_rate": 0.0004811273651632333, + "loss": 6.9631, + "step": 619 + }, + { + "epoch": 0.213030966799811, + "grad_norm": 0.59110426902771, + "learning_rate": 0.00048101289510584845, + "loss": 7.0164, + "step": 620 + }, + { + "epoch": 0.2133745651333591, + "grad_norm": 0.4364396333694458, + "learning_rate": 0.00048089809264547533, + "loss": 7.063, + "step": 621 + }, + { + "epoch": 0.2137181634669072, + "grad_norm": 0.6641795039176941, + "learning_rate": 0.00048078295794730266, + "loss": 7.0567, + "step": 622 + }, + { + "epoch": 0.21406176180045527, + "grad_norm": 0.37408143281936646, + "learning_rate": 0.0004806674911769968, + "loss": 7.0209, + "step": 623 + }, + { + "epoch": 0.21440536013400335, + "grad_norm": 0.4407050907611847, + "learning_rate": 0.0004805516925007024, + "loss": 7.0024, + "step": 624 + }, + { + "epoch": 0.21474895846755143, + "grad_norm": 0.5581675171852112, + "learning_rate": 0.00048043556208504125, + "loss": 7.0305, + "step": 625 + }, + { + "epoch": 0.2150925568010995, + "grad_norm": 0.5932101011276245, + "learning_rate": 0.0004803191000971128, + "loss": 7.0775, + "step": 626 + }, + { + "epoch": 0.2154361551346476, + "grad_norm": 0.630458414554596, + "learning_rate": 0.0004802023067044933, + "loss": 7.0122, + "step": 627 + }, + { + "epoch": 0.21577975346819567, + "grad_norm": 0.6922136545181274, + "learning_rate": 0.0004800851820752361, + "loss": 7.0494, + "step": 628 + }, + { + "epoch": 0.21612335180174377, + "grad_norm": 0.5312909483909607, + "learning_rate": 0.00047996772637787124, + "loss": 6.9883, + "step": 629 + }, + { + "epoch": 0.21646695013529185, + "grad_norm": 0.635317862033844, + "learning_rate": 0.00047984993978140473, + "loss": 7.0347, + "step": 630 + }, + { + "epoch": 0.21681054846883993, + "grad_norm": 0.587139368057251, + "learning_rate": 0.0004797318224553191, + "loss": 7.0547, + "step": 631 + }, + { + "epoch": 0.217154146802388, + "grad_norm": 0.46918633580207825, + "learning_rate": 0.00047961337456957256, + "loss": 7.119, + "step": 632 + }, + { + "epoch": 0.2174977451359361, + "grad_norm": 0.9854417443275452, + "learning_rate": 0.0004794945962945991, + "loss": 6.9985, + "step": 633 + }, + { + "epoch": 0.21784134346948417, + "grad_norm": 0.5167178511619568, + "learning_rate": 0.0004793754878013079, + "loss": 7.1618, + "step": 634 + }, + { + "epoch": 0.21818494180303225, + "grad_norm": 0.8147865533828735, + "learning_rate": 0.00047925604926108355, + "loss": 7.1051, + "step": 635 + }, + { + "epoch": 0.21852854013658032, + "grad_norm": 0.4546651840209961, + "learning_rate": 0.0004791362808457854, + "loss": 7.0173, + "step": 636 + }, + { + "epoch": 0.21887213847012843, + "grad_norm": 0.5698447823524475, + "learning_rate": 0.0004790161827277473, + "loss": 7.0944, + "step": 637 + }, + { + "epoch": 0.2192157368036765, + "grad_norm": 0.9129824042320251, + "learning_rate": 0.0004788957550797778, + "loss": 7.0367, + "step": 638 + }, + { + "epoch": 0.2195593351372246, + "grad_norm": 0.46476656198501587, + "learning_rate": 0.0004787749980751595, + "loss": 6.9945, + "step": 639 + }, + { + "epoch": 0.21990293347077267, + "grad_norm": 0.8614557385444641, + "learning_rate": 0.00047865391188764883, + "loss": 7.0681, + "step": 640 + }, + { + "epoch": 0.22024653180432074, + "grad_norm": 0.4384600520133972, + "learning_rate": 0.0004785324966914759, + "loss": 7.0053, + "step": 641 + }, + { + "epoch": 0.22059013013786882, + "grad_norm": 0.4937174320220947, + "learning_rate": 0.00047841075266134435, + "loss": 7.1369, + "step": 642 + }, + { + "epoch": 0.2209337284714169, + "grad_norm": 0.43719929456710815, + "learning_rate": 0.00047828867997243085, + "loss": 7.0655, + "step": 643 + }, + { + "epoch": 0.221277326804965, + "grad_norm": 0.5568514466285706, + "learning_rate": 0.00047816627880038504, + "loss": 7.0752, + "step": 644 + }, + { + "epoch": 0.2216209251385131, + "grad_norm": 0.6036604046821594, + "learning_rate": 0.0004780435493213292, + "loss": 7.1172, + "step": 645 + }, + { + "epoch": 0.22196452347206116, + "grad_norm": 0.7314201593399048, + "learning_rate": 0.0004779204917118579, + "loss": 7.025, + "step": 646 + }, + { + "epoch": 0.22230812180560924, + "grad_norm": 0.8482968807220459, + "learning_rate": 0.00047779710614903804, + "loss": 7.1686, + "step": 647 + }, + { + "epoch": 0.22265172013915732, + "grad_norm": 0.3950001895427704, + "learning_rate": 0.00047767339281040835, + "loss": 7.0538, + "step": 648 + }, + { + "epoch": 0.2229953184727054, + "grad_norm": 0.9423245787620544, + "learning_rate": 0.00047754935187397914, + "loss": 6.9874, + "step": 649 + }, + { + "epoch": 0.22333891680625348, + "grad_norm": 0.7840802073478699, + "learning_rate": 0.000477424983518232, + "loss": 7.1969, + "step": 650 + }, + { + "epoch": 0.22368251513980159, + "grad_norm": 0.7974851131439209, + "learning_rate": 0.00047730028792212, + "loss": 6.9681, + "step": 651 + }, + { + "epoch": 0.22402611347334966, + "grad_norm": 1.3284245729446411, + "learning_rate": 0.00047717526526506673, + "loss": 6.8882, + "step": 652 + }, + { + "epoch": 0.22436971180689774, + "grad_norm": 1.194890022277832, + "learning_rate": 0.0004770499157269664, + "loss": 7.0421, + "step": 653 + }, + { + "epoch": 0.22471331014044582, + "grad_norm": 0.7155433893203735, + "learning_rate": 0.0004769242394881838, + "loss": 6.875, + "step": 654 + }, + { + "epoch": 0.2250569084739939, + "grad_norm": 1.2595062255859375, + "learning_rate": 0.00047679823672955356, + "loss": 6.9402, + "step": 655 + }, + { + "epoch": 0.22540050680754198, + "grad_norm": 0.8444681763648987, + "learning_rate": 0.0004766719076323804, + "loss": 6.8736, + "step": 656 + }, + { + "epoch": 0.22574410514109006, + "grad_norm": 0.7984046339988708, + "learning_rate": 0.00047654525237843834, + "loss": 6.9932, + "step": 657 + }, + { + "epoch": 0.22608770347463814, + "grad_norm": 0.8302541971206665, + "learning_rate": 0.00047641827114997085, + "loss": 7.0802, + "step": 658 + }, + { + "epoch": 0.22643130180818624, + "grad_norm": 0.8889338374137878, + "learning_rate": 0.0004762909641296904, + "loss": 7.016, + "step": 659 + }, + { + "epoch": 0.22677490014173432, + "grad_norm": 0.5691213011741638, + "learning_rate": 0.00047616333150077826, + "loss": 6.9966, + "step": 660 + }, + { + "epoch": 0.2271184984752824, + "grad_norm": 0.5026600360870361, + "learning_rate": 0.00047603537344688423, + "loss": 6.9897, + "step": 661 + }, + { + "epoch": 0.22746209680883048, + "grad_norm": 0.4827994704246521, + "learning_rate": 0.00047590709015212635, + "loss": 6.8955, + "step": 662 + }, + { + "epoch": 0.22780569514237856, + "grad_norm": 0.6659157872200012, + "learning_rate": 0.0004757784818010906, + "loss": 6.952, + "step": 663 + }, + { + "epoch": 0.22814929347592663, + "grad_norm": 0.5329311490058899, + "learning_rate": 0.00047564954857883077, + "loss": 6.9433, + "step": 664 + }, + { + "epoch": 0.2284928918094747, + "grad_norm": 0.40088188648223877, + "learning_rate": 0.000475520290670868, + "loss": 6.8927, + "step": 665 + }, + { + "epoch": 0.22883649014302282, + "grad_norm": 0.6760165691375732, + "learning_rate": 0.0004753907082631906, + "loss": 6.9534, + "step": 666 + }, + { + "epoch": 0.2291800884765709, + "grad_norm": 0.4778375029563904, + "learning_rate": 0.0004752608015422541, + "loss": 7.0038, + "step": 667 + }, + { + "epoch": 0.22952368681011898, + "grad_norm": 0.3798183500766754, + "learning_rate": 0.0004751305706949803, + "loss": 7.0103, + "step": 668 + }, + { + "epoch": 0.22986728514366706, + "grad_norm": 0.6319564580917358, + "learning_rate": 0.00047500001590875755, + "loss": 7.0189, + "step": 669 + }, + { + "epoch": 0.23021088347721513, + "grad_norm": 0.4286358952522278, + "learning_rate": 0.0004748691373714403, + "loss": 6.9399, + "step": 670 + }, + { + "epoch": 0.2305544818107632, + "grad_norm": 0.5430161356925964, + "learning_rate": 0.00047473793527134884, + "loss": 6.9448, + "step": 671 + }, + { + "epoch": 0.2308980801443113, + "grad_norm": 0.45105573534965515, + "learning_rate": 0.0004746064097972691, + "loss": 6.9236, + "step": 672 + }, + { + "epoch": 0.23124167847785937, + "grad_norm": 0.587155818939209, + "learning_rate": 0.00047447456113845223, + "loss": 6.9462, + "step": 673 + }, + { + "epoch": 0.23158527681140748, + "grad_norm": 0.33924758434295654, + "learning_rate": 0.00047434238948461437, + "loss": 6.8088, + "step": 674 + }, + { + "epoch": 0.23192887514495555, + "grad_norm": 0.45958212018013, + "learning_rate": 0.0004742098950259365, + "loss": 6.9157, + "step": 675 + }, + { + "epoch": 0.23227247347850363, + "grad_norm": 0.44224873185157776, + "learning_rate": 0.0004740770779530641, + "loss": 7.0208, + "step": 676 + }, + { + "epoch": 0.2326160718120517, + "grad_norm": 0.48070213198661804, + "learning_rate": 0.00047394393845710684, + "loss": 6.9281, + "step": 677 + }, + { + "epoch": 0.2329596701455998, + "grad_norm": 0.3841937482357025, + "learning_rate": 0.00047381047672963815, + "loss": 6.9006, + "step": 678 + }, + { + "epoch": 0.23330326847914787, + "grad_norm": 0.3501332700252533, + "learning_rate": 0.0004736766929626954, + "loss": 6.8771, + "step": 679 + }, + { + "epoch": 0.23364686681269595, + "grad_norm": 0.4568793773651123, + "learning_rate": 0.00047354258734877907, + "loss": 6.828, + "step": 680 + }, + { + "epoch": 0.23399046514624405, + "grad_norm": 0.4417133033275604, + "learning_rate": 0.00047340816008085306, + "loss": 6.8497, + "step": 681 + }, + { + "epoch": 0.23433406347979213, + "grad_norm": 0.32319939136505127, + "learning_rate": 0.0004732734113523438, + "loss": 6.9765, + "step": 682 + }, + { + "epoch": 0.2346776618133402, + "grad_norm": 0.41723182797431946, + "learning_rate": 0.0004731383413571404, + "loss": 6.9349, + "step": 683 + }, + { + "epoch": 0.2350212601468883, + "grad_norm": 0.5126868486404419, + "learning_rate": 0.0004730029502895942, + "loss": 6.8654, + "step": 684 + }, + { + "epoch": 0.23536485848043637, + "grad_norm": 0.4400496184825897, + "learning_rate": 0.0004728672383445185, + "loss": 6.9556, + "step": 685 + }, + { + "epoch": 0.23570845681398445, + "grad_norm": 0.513038158416748, + "learning_rate": 0.0004727312057171884, + "loss": 6.9809, + "step": 686 + }, + { + "epoch": 0.23605205514753252, + "grad_norm": 0.370490163564682, + "learning_rate": 0.0004725948526033405, + "loss": 6.9601, + "step": 687 + }, + { + "epoch": 0.2363956534810806, + "grad_norm": 0.42959868907928467, + "learning_rate": 0.00047245817919917225, + "loss": 6.8961, + "step": 688 + }, + { + "epoch": 0.2367392518146287, + "grad_norm": 0.4754343628883362, + "learning_rate": 0.0004723211857013423, + "loss": 6.9749, + "step": 689 + }, + { + "epoch": 0.2370828501481768, + "grad_norm": 0.5382852554321289, + "learning_rate": 0.0004721838723069696, + "loss": 6.9439, + "step": 690 + }, + { + "epoch": 0.23742644848172487, + "grad_norm": 0.5841006636619568, + "learning_rate": 0.00047204623921363355, + "loss": 6.8801, + "step": 691 + }, + { + "epoch": 0.23777004681527295, + "grad_norm": 0.3350447416305542, + "learning_rate": 0.0004719082866193736, + "loss": 6.9135, + "step": 692 + }, + { + "epoch": 0.23811364514882102, + "grad_norm": 0.7295602560043335, + "learning_rate": 0.0004717700147226887, + "loss": 7.0857, + "step": 693 + }, + { + "epoch": 0.2384572434823691, + "grad_norm": 0.656107485294342, + "learning_rate": 0.00047163142372253766, + "loss": 7.0187, + "step": 694 + }, + { + "epoch": 0.23880084181591718, + "grad_norm": 0.48529568314552307, + "learning_rate": 0.0004714925138183379, + "loss": 7.0068, + "step": 695 + }, + { + "epoch": 0.2391444401494653, + "grad_norm": 0.6135896444320679, + "learning_rate": 0.0004713532852099663, + "loss": 7.0369, + "step": 696 + }, + { + "epoch": 0.23948803848301337, + "grad_norm": 0.8559364080429077, + "learning_rate": 0.00047121373809775783, + "loss": 6.8712, + "step": 697 + }, + { + "epoch": 0.23983163681656144, + "grad_norm": 0.509367823600769, + "learning_rate": 0.0004710738726825059, + "loss": 7.0357, + "step": 698 + }, + { + "epoch": 0.24017523515010952, + "grad_norm": 0.878773033618927, + "learning_rate": 0.0004709336891654621, + "loss": 7.0097, + "step": 699 + }, + { + "epoch": 0.2405188334836576, + "grad_norm": 0.6598408818244934, + "learning_rate": 0.00047079318774833555, + "loss": 6.9585, + "step": 700 + }, + { + "epoch": 0.24086243181720568, + "grad_norm": 1.2100400924682617, + "learning_rate": 0.00047065236863329284, + "loss": 6.876, + "step": 701 + }, + { + "epoch": 0.24120603015075376, + "grad_norm": 0.577090322971344, + "learning_rate": 0.00047051123202295777, + "loss": 6.9374, + "step": 702 + }, + { + "epoch": 0.24154962848430186, + "grad_norm": 0.682905912399292, + "learning_rate": 0.0004703697781204108, + "loss": 6.6889, + "step": 703 + }, + { + "epoch": 0.24189322681784994, + "grad_norm": 0.8273911476135254, + "learning_rate": 0.0004702280071291891, + "loss": 6.7998, + "step": 704 + }, + { + "epoch": 0.24223682515139802, + "grad_norm": 0.39249464869499207, + "learning_rate": 0.00047008591925328614, + "loss": 6.9258, + "step": 705 + }, + { + "epoch": 0.2425804234849461, + "grad_norm": 0.8825654983520508, + "learning_rate": 0.00046994351469715107, + "loss": 6.8986, + "step": 706 + }, + { + "epoch": 0.24292402181849418, + "grad_norm": 0.4094250202178955, + "learning_rate": 0.0004698007936656891, + "loss": 6.8182, + "step": 707 + }, + { + "epoch": 0.24326762015204226, + "grad_norm": 0.6193078756332397, + "learning_rate": 0.00046965775636426046, + "loss": 6.952, + "step": 708 + }, + { + "epoch": 0.24361121848559034, + "grad_norm": 0.8103482723236084, + "learning_rate": 0.0004695144029986807, + "loss": 6.8076, + "step": 709 + }, + { + "epoch": 0.24395481681913841, + "grad_norm": 0.655996561050415, + "learning_rate": 0.00046937073377522004, + "loss": 6.7956, + "step": 710 + }, + { + "epoch": 0.24429841515268652, + "grad_norm": 0.6959365606307983, + "learning_rate": 0.00046922674890060326, + "loss": 6.8264, + "step": 711 + }, + { + "epoch": 0.2446420134862346, + "grad_norm": 0.5749850273132324, + "learning_rate": 0.0004690824485820092, + "loss": 6.7739, + "step": 712 + }, + { + "epoch": 0.24498561181978268, + "grad_norm": 0.840958833694458, + "learning_rate": 0.0004689378330270707, + "loss": 6.8343, + "step": 713 + }, + { + "epoch": 0.24532921015333076, + "grad_norm": 0.5014116168022156, + "learning_rate": 0.0004687929024438742, + "loss": 6.8268, + "step": 714 + }, + { + "epoch": 0.24567280848687884, + "grad_norm": 0.7229539155960083, + "learning_rate": 0.0004686476570409594, + "loss": 6.8121, + "step": 715 + }, + { + "epoch": 0.24601640682042691, + "grad_norm": 0.4928436279296875, + "learning_rate": 0.00046850209702731894, + "loss": 6.8857, + "step": 716 + }, + { + "epoch": 0.246360005153975, + "grad_norm": 0.5194845199584961, + "learning_rate": 0.00046835622261239825, + "loss": 6.7582, + "step": 717 + }, + { + "epoch": 0.2467036034875231, + "grad_norm": 0.4320314824581146, + "learning_rate": 0.0004682100340060951, + "loss": 6.9288, + "step": 718 + }, + { + "epoch": 0.24704720182107118, + "grad_norm": 0.5804980397224426, + "learning_rate": 0.0004680635314187592, + "loss": 6.8817, + "step": 719 + }, + { + "epoch": 0.24739080015461926, + "grad_norm": 0.458049476146698, + "learning_rate": 0.0004679167150611924, + "loss": 6.8723, + "step": 720 + }, + { + "epoch": 0.24773439848816733, + "grad_norm": 0.4215126931667328, + "learning_rate": 0.00046776958514464773, + "loss": 6.8758, + "step": 721 + }, + { + "epoch": 0.2480779968217154, + "grad_norm": 0.40314817428588867, + "learning_rate": 0.0004676221418808295, + "loss": 6.7961, + "step": 722 + }, + { + "epoch": 0.2484215951552635, + "grad_norm": 0.4778897166252136, + "learning_rate": 0.00046747438548189294, + "loss": 6.8339, + "step": 723 + }, + { + "epoch": 0.24876519348881157, + "grad_norm": 0.341545969247818, + "learning_rate": 0.00046732631616044364, + "loss": 6.8159, + "step": 724 + }, + { + "epoch": 0.24910879182235965, + "grad_norm": 0.3407266438007355, + "learning_rate": 0.0004671779341295378, + "loss": 6.8255, + "step": 725 + }, + { + "epoch": 0.24945239015590775, + "grad_norm": 0.407026469707489, + "learning_rate": 0.0004670292396026812, + "loss": 6.8622, + "step": 726 + }, + { + "epoch": 0.24979598848945583, + "grad_norm": 0.4541730582714081, + "learning_rate": 0.00046688023279382965, + "loss": 6.8587, + "step": 727 + }, + { + "epoch": 0.2501395868230039, + "grad_norm": 0.37031111121177673, + "learning_rate": 0.0004667309139173879, + "loss": 6.7835, + "step": 728 + }, + { + "epoch": 0.250483185156552, + "grad_norm": 0.47327926754951477, + "learning_rate": 0.00046658128318821, + "loss": 6.8738, + "step": 729 + }, + { + "epoch": 0.25082678349010007, + "grad_norm": 0.47790077328681946, + "learning_rate": 0.00046643134082159876, + "loss": 6.8492, + "step": 730 + }, + { + "epoch": 0.25117038182364815, + "grad_norm": 0.6300373673439026, + "learning_rate": 0.0004662810870333053, + "loss": 6.8715, + "step": 731 + }, + { + "epoch": 0.2515139801571962, + "grad_norm": 0.32731807231903076, + "learning_rate": 0.0004661305220395286, + "loss": 6.8528, + "step": 732 + }, + { + "epoch": 0.2518575784907443, + "grad_norm": 0.5586317181587219, + "learning_rate": 0.0004659796460569159, + "loss": 6.8647, + "step": 733 + }, + { + "epoch": 0.2522011768242924, + "grad_norm": 0.5391055941581726, + "learning_rate": 0.0004658284593025617, + "loss": 6.8655, + "step": 734 + }, + { + "epoch": 0.25254477515784046, + "grad_norm": 0.44915974140167236, + "learning_rate": 0.0004656769619940075, + "loss": 6.8233, + "step": 735 + }, + { + "epoch": 0.25288837349138854, + "grad_norm": 0.47281840443611145, + "learning_rate": 0.00046552515434924194, + "loss": 6.7832, + "step": 736 + }, + { + "epoch": 0.2532319718249367, + "grad_norm": 0.37644147872924805, + "learning_rate": 0.0004653730365867, + "loss": 6.8604, + "step": 737 + }, + { + "epoch": 0.25357557015848475, + "grad_norm": 0.35747960209846497, + "learning_rate": 0.0004652206089252631, + "loss": 6.7951, + "step": 738 + }, + { + "epoch": 0.25391916849203283, + "grad_norm": 0.4939366579055786, + "learning_rate": 0.00046506787158425827, + "loss": 7.0186, + "step": 739 + }, + { + "epoch": 0.2542627668255809, + "grad_norm": 0.4213995933532715, + "learning_rate": 0.00046491482478345836, + "loss": 6.944, + "step": 740 + }, + { + "epoch": 0.254606365159129, + "grad_norm": 0.41876089572906494, + "learning_rate": 0.00046476146874308157, + "loss": 6.8617, + "step": 741 + }, + { + "epoch": 0.25494996349267707, + "grad_norm": 0.5702718496322632, + "learning_rate": 0.00046460780368379076, + "loss": 6.9404, + "step": 742 + }, + { + "epoch": 0.25529356182622515, + "grad_norm": 0.6110662817955017, + "learning_rate": 0.0004644538298266936, + "loss": 6.7365, + "step": 743 + }, + { + "epoch": 0.2556371601597732, + "grad_norm": 0.5077341794967651, + "learning_rate": 0.0004642995473933422, + "loss": 6.9497, + "step": 744 + }, + { + "epoch": 0.2559807584933213, + "grad_norm": 0.612311601638794, + "learning_rate": 0.0004641449566057325, + "loss": 7.0145, + "step": 745 + }, + { + "epoch": 0.2563243568268694, + "grad_norm": 0.6061681509017944, + "learning_rate": 0.00046399005768630425, + "loss": 6.965, + "step": 746 + }, + { + "epoch": 0.25666795516041746, + "grad_norm": 0.6442392468452454, + "learning_rate": 0.0004638348508579405, + "loss": 6.9068, + "step": 747 + }, + { + "epoch": 0.25701155349396554, + "grad_norm": 0.4608883857727051, + "learning_rate": 0.0004636793363439674, + "loss": 6.9511, + "step": 748 + }, + { + "epoch": 0.2573551518275136, + "grad_norm": 0.7235476970672607, + "learning_rate": 0.0004635235143681538, + "loss": 6.9665, + "step": 749 + }, + { + "epoch": 0.2576987501610617, + "grad_norm": 0.9052413702011108, + "learning_rate": 0.00046336738515471087, + "loss": 6.7848, + "step": 750 + }, + { + "epoch": 0.2580423484946098, + "grad_norm": 1.7460522651672363, + "learning_rate": 0.00046321094892829204, + "loss": 6.8786, + "step": 751 + }, + { + "epoch": 0.2583859468281579, + "grad_norm": 0.9062784314155579, + "learning_rate": 0.0004630542059139923, + "loss": 6.7366, + "step": 752 + }, + { + "epoch": 0.258729545161706, + "grad_norm": 0.9159809947013855, + "learning_rate": 0.0004628971563373483, + "loss": 6.7801, + "step": 753 + }, + { + "epoch": 0.25907314349525407, + "grad_norm": 1.137810230255127, + "learning_rate": 0.0004627398004243376, + "loss": 6.8466, + "step": 754 + }, + { + "epoch": 0.25941674182880214, + "grad_norm": 0.7727288007736206, + "learning_rate": 0.00046258213840137864, + "loss": 6.7646, + "step": 755 + }, + { + "epoch": 0.2597603401623502, + "grad_norm": 1.0215615034103394, + "learning_rate": 0.0004624241704953304, + "loss": 6.7423, + "step": 756 + }, + { + "epoch": 0.2601039384958983, + "grad_norm": 0.9777013063430786, + "learning_rate": 0.0004622658969334916, + "loss": 6.7828, + "step": 757 + }, + { + "epoch": 0.2604475368294464, + "grad_norm": 0.5539371371269226, + "learning_rate": 0.0004621073179436015, + "loss": 6.7059, + "step": 758 + }, + { + "epoch": 0.26079113516299446, + "grad_norm": 1.1476919651031494, + "learning_rate": 0.00046194843375383797, + "loss": 6.8811, + "step": 759 + }, + { + "epoch": 0.26113473349654254, + "grad_norm": 0.58660888671875, + "learning_rate": 0.0004617892445928188, + "loss": 6.7705, + "step": 760 + }, + { + "epoch": 0.2614783318300906, + "grad_norm": 0.6147243976593018, + "learning_rate": 0.00046162975068960013, + "loss": 6.7306, + "step": 761 + }, + { + "epoch": 0.2618219301636387, + "grad_norm": 0.8718441128730774, + "learning_rate": 0.00046146995227367663, + "loss": 6.7174, + "step": 762 + }, + { + "epoch": 0.2621655284971868, + "grad_norm": 0.6512937545776367, + "learning_rate": 0.00046130984957498135, + "loss": 6.8326, + "step": 763 + }, + { + "epoch": 0.26250912683073485, + "grad_norm": 0.7064685821533203, + "learning_rate": 0.00046114944282388504, + "loss": 6.83, + "step": 764 + }, + { + "epoch": 0.26285272516428293, + "grad_norm": 0.6572036743164062, + "learning_rate": 0.0004609887322511959, + "loss": 6.8905, + "step": 765 + }, + { + "epoch": 0.263196323497831, + "grad_norm": 0.44737884402275085, + "learning_rate": 0.0004608277180881594, + "loss": 6.8373, + "step": 766 + }, + { + "epoch": 0.26353992183137914, + "grad_norm": 0.8529016971588135, + "learning_rate": 0.00046066640056645775, + "loss": 6.8329, + "step": 767 + }, + { + "epoch": 0.2638835201649272, + "grad_norm": 0.4838736355304718, + "learning_rate": 0.0004605047799182097, + "loss": 6.8027, + "step": 768 + }, + { + "epoch": 0.2642271184984753, + "grad_norm": 0.6361299753189087, + "learning_rate": 0.0004603428563759703, + "loss": 6.8504, + "step": 769 + }, + { + "epoch": 0.2645707168320234, + "grad_norm": 0.5523536205291748, + "learning_rate": 0.0004601806301727302, + "loss": 6.7116, + "step": 770 + }, + { + "epoch": 0.26491431516557146, + "grad_norm": 0.683148980140686, + "learning_rate": 0.00046001810154191564, + "loss": 6.8294, + "step": 771 + }, + { + "epoch": 0.26525791349911954, + "grad_norm": 0.5535314083099365, + "learning_rate": 0.0004598552707173881, + "loss": 6.86, + "step": 772 + }, + { + "epoch": 0.2656015118326676, + "grad_norm": 0.41244781017303467, + "learning_rate": 0.0004596921379334438, + "loss": 6.7168, + "step": 773 + }, + { + "epoch": 0.2659451101662157, + "grad_norm": 0.6043175458908081, + "learning_rate": 0.0004595287034248134, + "loss": 6.7497, + "step": 774 + }, + { + "epoch": 0.26628870849976377, + "grad_norm": 0.6349027752876282, + "learning_rate": 0.0004593649674266619, + "loss": 6.8369, + "step": 775 + }, + { + "epoch": 0.26663230683331185, + "grad_norm": 0.4858072102069855, + "learning_rate": 0.00045920093017458785, + "loss": 6.7509, + "step": 776 + }, + { + "epoch": 0.26697590516685993, + "grad_norm": 0.6944791674613953, + "learning_rate": 0.0004590365919046235, + "loss": 6.8669, + "step": 777 + }, + { + "epoch": 0.267319503500408, + "grad_norm": 0.41476762294769287, + "learning_rate": 0.0004588719528532341, + "loss": 6.8717, + "step": 778 + }, + { + "epoch": 0.2676631018339561, + "grad_norm": 0.4901805520057678, + "learning_rate": 0.0004587070132573178, + "loss": 6.7902, + "step": 779 + }, + { + "epoch": 0.26800670016750416, + "grad_norm": 0.7249867916107178, + "learning_rate": 0.0004585417733542051, + "loss": 6.8388, + "step": 780 + }, + { + "epoch": 0.26835029850105224, + "grad_norm": 0.4491178095340729, + "learning_rate": 0.00045837623338165865, + "loss": 6.7337, + "step": 781 + }, + { + "epoch": 0.2686938968346004, + "grad_norm": 0.6511254906654358, + "learning_rate": 0.0004582103935778728, + "loss": 6.8176, + "step": 782 + }, + { + "epoch": 0.26903749516814845, + "grad_norm": 0.7600392699241638, + "learning_rate": 0.0004580442541814735, + "loss": 6.7143, + "step": 783 + }, + { + "epoch": 0.26938109350169653, + "grad_norm": 0.5467169284820557, + "learning_rate": 0.00045787781543151765, + "loss": 6.6721, + "step": 784 + }, + { + "epoch": 0.2697246918352446, + "grad_norm": 0.655737578868866, + "learning_rate": 0.0004577110775674928, + "loss": 6.8681, + "step": 785 + }, + { + "epoch": 0.2700682901687927, + "grad_norm": 0.7441399693489075, + "learning_rate": 0.00045754404082931714, + "loss": 6.7865, + "step": 786 + }, + { + "epoch": 0.27041188850234077, + "grad_norm": 0.42660608887672424, + "learning_rate": 0.00045737670545733866, + "loss": 6.7737, + "step": 787 + }, + { + "epoch": 0.27075548683588885, + "grad_norm": 0.6970313191413879, + "learning_rate": 0.0004572090716923353, + "loss": 6.7858, + "step": 788 + }, + { + "epoch": 0.2710990851694369, + "grad_norm": 0.4984079897403717, + "learning_rate": 0.0004570411397755141, + "loss": 6.8268, + "step": 789 + }, + { + "epoch": 0.271442683502985, + "grad_norm": 0.35249513387680054, + "learning_rate": 0.0004568729099485114, + "loss": 6.8356, + "step": 790 + }, + { + "epoch": 0.2717862818365331, + "grad_norm": 0.553500771522522, + "learning_rate": 0.00045670438245339176, + "loss": 6.759, + "step": 791 + }, + { + "epoch": 0.27212988017008116, + "grad_norm": 0.5268304944038391, + "learning_rate": 0.0004565355575326485, + "loss": 6.9307, + "step": 792 + }, + { + "epoch": 0.27247347850362924, + "grad_norm": 0.5753699541091919, + "learning_rate": 0.0004563664354292027, + "loss": 6.8501, + "step": 793 + }, + { + "epoch": 0.2728170768371773, + "grad_norm": 0.5838276147842407, + "learning_rate": 0.0004561970163864031, + "loss": 6.7411, + "step": 794 + }, + { + "epoch": 0.2731606751707254, + "grad_norm": 0.6358978152275085, + "learning_rate": 0.0004560273006480256, + "loss": 6.8322, + "step": 795 + }, + { + "epoch": 0.27350427350427353, + "grad_norm": 0.75990229845047, + "learning_rate": 0.0004558572884582732, + "loss": 6.7258, + "step": 796 + }, + { + "epoch": 0.2738478718378216, + "grad_norm": 0.6735451817512512, + "learning_rate": 0.00045568698006177535, + "loss": 6.7351, + "step": 797 + }, + { + "epoch": 0.2741914701713697, + "grad_norm": 0.5910247564315796, + "learning_rate": 0.0004555163757035876, + "loss": 6.8092, + "step": 798 + }, + { + "epoch": 0.27453506850491777, + "grad_norm": 0.6522710919380188, + "learning_rate": 0.0004553454756291916, + "loss": 6.866, + "step": 799 + }, + { + "epoch": 0.27487866683846585, + "grad_norm": 0.7660143375396729, + "learning_rate": 0.00045517428008449436, + "loss": 6.9458, + "step": 800 + }, + { + "epoch": 0.2752222651720139, + "grad_norm": 1.4569878578186035, + "learning_rate": 0.00045500278931582806, + "loss": 6.7737, + "step": 801 + }, + { + "epoch": 0.275565863505562, + "grad_norm": 0.972352147102356, + "learning_rate": 0.00045483100356994967, + "loss": 6.8857, + "step": 802 + }, + { + "epoch": 0.2759094618391101, + "grad_norm": 1.2153319120407104, + "learning_rate": 0.00045465892309404064, + "loss": 6.7493, + "step": 803 + }, + { + "epoch": 0.27625306017265816, + "grad_norm": 1.4174124002456665, + "learning_rate": 0.0004544865481357064, + "loss": 6.7461, + "step": 804 + }, + { + "epoch": 0.27659665850620624, + "grad_norm": 0.7363278269767761, + "learning_rate": 0.00045431387894297626, + "loss": 6.7705, + "step": 805 + }, + { + "epoch": 0.2769402568397543, + "grad_norm": 1.0926170349121094, + "learning_rate": 0.0004541409157643027, + "loss": 6.581, + "step": 806 + }, + { + "epoch": 0.2772838551733024, + "grad_norm": 1.3416601419448853, + "learning_rate": 0.00045396765884856154, + "loss": 6.7194, + "step": 807 + }, + { + "epoch": 0.2776274535068505, + "grad_norm": 0.5606733560562134, + "learning_rate": 0.0004537941084450509, + "loss": 6.8035, + "step": 808 + }, + { + "epoch": 0.27797105184039855, + "grad_norm": 0.7519741058349609, + "learning_rate": 0.0004536202648034914, + "loss": 6.8173, + "step": 809 + }, + { + "epoch": 0.27831465017394663, + "grad_norm": 0.9418850541114807, + "learning_rate": 0.0004534461281740255, + "loss": 6.7941, + "step": 810 + }, + { + "epoch": 0.27865824850749477, + "grad_norm": 0.6259487867355347, + "learning_rate": 0.0004532716988072175, + "loss": 6.6247, + "step": 811 + }, + { + "epoch": 0.27900184684104284, + "grad_norm": 0.8542924523353577, + "learning_rate": 0.00045309697695405243, + "loss": 6.7618, + "step": 812 + }, + { + "epoch": 0.2793454451745909, + "grad_norm": 0.8797979354858398, + "learning_rate": 0.0004529219628659366, + "loss": 6.7305, + "step": 813 + }, + { + "epoch": 0.279689043508139, + "grad_norm": 0.484947144985199, + "learning_rate": 0.00045274665679469666, + "loss": 6.7074, + "step": 814 + }, + { + "epoch": 0.2800326418416871, + "grad_norm": 0.940768301486969, + "learning_rate": 0.0004525710589925794, + "loss": 6.8354, + "step": 815 + }, + { + "epoch": 0.28037624017523516, + "grad_norm": 0.640664279460907, + "learning_rate": 0.0004523951697122514, + "loss": 6.7316, + "step": 816 + }, + { + "epoch": 0.28071983850878324, + "grad_norm": 0.3233809173107147, + "learning_rate": 0.0004522189892067985, + "loss": 6.8219, + "step": 817 + }, + { + "epoch": 0.2810634368423313, + "grad_norm": 0.71434485912323, + "learning_rate": 0.00045204251772972595, + "loss": 6.6404, + "step": 818 + }, + { + "epoch": 0.2814070351758794, + "grad_norm": 0.6310349702835083, + "learning_rate": 0.00045186575553495716, + "loss": 6.6152, + "step": 819 + }, + { + "epoch": 0.2817506335094275, + "grad_norm": 0.40379568934440613, + "learning_rate": 0.0004516887028768344, + "loss": 6.6638, + "step": 820 + }, + { + "epoch": 0.28209423184297555, + "grad_norm": 0.46431466937065125, + "learning_rate": 0.0004515113600101174, + "loss": 6.8226, + "step": 821 + }, + { + "epoch": 0.28243783017652363, + "grad_norm": 0.6690576672554016, + "learning_rate": 0.0004513337271899838, + "loss": 6.7364, + "step": 822 + }, + { + "epoch": 0.2827814285100717, + "grad_norm": 0.3921510875225067, + "learning_rate": 0.00045115580467202835, + "loss": 6.6688, + "step": 823 + }, + { + "epoch": 0.2831250268436198, + "grad_norm": 0.4708666205406189, + "learning_rate": 0.0004509775927122625, + "loss": 6.688, + "step": 824 + }, + { + "epoch": 0.28346862517716787, + "grad_norm": 0.6789883971214294, + "learning_rate": 0.00045079909156711445, + "loss": 6.5988, + "step": 825 + }, + { + "epoch": 0.283812223510716, + "grad_norm": 0.5630450248718262, + "learning_rate": 0.0004506203014934283, + "loss": 6.7569, + "step": 826 + }, + { + "epoch": 0.2841558218442641, + "grad_norm": 0.4851526916027069, + "learning_rate": 0.000450441222748464, + "loss": 6.7702, + "step": 827 + }, + { + "epoch": 0.28449942017781216, + "grad_norm": 0.6094221472740173, + "learning_rate": 0.00045026185558989676, + "loss": 6.864, + "step": 828 + }, + { + "epoch": 0.28484301851136024, + "grad_norm": 0.5844842195510864, + "learning_rate": 0.0004500822002758169, + "loss": 6.6631, + "step": 829 + }, + { + "epoch": 0.2851866168449083, + "grad_norm": 0.5621864199638367, + "learning_rate": 0.0004499022570647292, + "loss": 6.6732, + "step": 830 + }, + { + "epoch": 0.2855302151784564, + "grad_norm": 0.5564741492271423, + "learning_rate": 0.00044972202621555295, + "loss": 6.8367, + "step": 831 + }, + { + "epoch": 0.28587381351200447, + "grad_norm": 0.6472800970077515, + "learning_rate": 0.0004495415079876211, + "loss": 6.8898, + "step": 832 + }, + { + "epoch": 0.28621741184555255, + "grad_norm": 0.6958934664726257, + "learning_rate": 0.00044936070264068017, + "loss": 6.7203, + "step": 833 + }, + { + "epoch": 0.28656101017910063, + "grad_norm": 0.5514652132987976, + "learning_rate": 0.00044917961043488994, + "loss": 6.8128, + "step": 834 + }, + { + "epoch": 0.2869046085126487, + "grad_norm": 0.7259712815284729, + "learning_rate": 0.00044899823163082264, + "loss": 6.7471, + "step": 835 + }, + { + "epoch": 0.2872482068461968, + "grad_norm": 0.4060599207878113, + "learning_rate": 0.00044881656648946324, + "loss": 6.7305, + "step": 836 + }, + { + "epoch": 0.28759180517974486, + "grad_norm": 0.697080671787262, + "learning_rate": 0.0004486346152722085, + "loss": 6.6904, + "step": 837 + }, + { + "epoch": 0.28793540351329294, + "grad_norm": 0.46781861782073975, + "learning_rate": 0.0004484523782408668, + "loss": 6.6904, + "step": 838 + }, + { + "epoch": 0.288279001846841, + "grad_norm": 0.5770555734634399, + "learning_rate": 0.000448269855657658, + "loss": 6.7446, + "step": 839 + }, + { + "epoch": 0.2886226001803891, + "grad_norm": 0.458122193813324, + "learning_rate": 0.0004480870477852126, + "loss": 6.7125, + "step": 840 + }, + { + "epoch": 0.28896619851393723, + "grad_norm": 0.8331108093261719, + "learning_rate": 0.00044790395488657165, + "loss": 6.7704, + "step": 841 + }, + { + "epoch": 0.2893097968474853, + "grad_norm": 0.47524768114089966, + "learning_rate": 0.00044772057722518646, + "loss": 6.7999, + "step": 842 + }, + { + "epoch": 0.2896533951810334, + "grad_norm": 0.6736469864845276, + "learning_rate": 0.00044753691506491783, + "loss": 6.8244, + "step": 843 + }, + { + "epoch": 0.28999699351458147, + "grad_norm": 0.4914340078830719, + "learning_rate": 0.00044735296867003625, + "loss": 6.7941, + "step": 844 + }, + { + "epoch": 0.29034059184812955, + "grad_norm": 0.6356337070465088, + "learning_rate": 0.0004471687383052209, + "loss": 6.7074, + "step": 845 + }, + { + "epoch": 0.2906841901816776, + "grad_norm": 0.4956357777118683, + "learning_rate": 0.0004469842242355598, + "loss": 6.6277, + "step": 846 + }, + { + "epoch": 0.2910277885152257, + "grad_norm": 0.7174055576324463, + "learning_rate": 0.00044679942672654896, + "loss": 6.7337, + "step": 847 + }, + { + "epoch": 0.2913713868487738, + "grad_norm": 0.5987393856048584, + "learning_rate": 0.0004466143460440923, + "loss": 6.7455, + "step": 848 + }, + { + "epoch": 0.29171498518232186, + "grad_norm": 0.7290389537811279, + "learning_rate": 0.00044642898245450134, + "loss": 6.9492, + "step": 849 + }, + { + "epoch": 0.29205858351586994, + "grad_norm": 0.9655451774597168, + "learning_rate": 0.0004462433362244946, + "loss": 6.8043, + "step": 850 + }, + { + "epoch": 0.292402181849418, + "grad_norm": 0.908883810043335, + "learning_rate": 0.0004460574076211973, + "loss": 6.5924, + "step": 851 + }, + { + "epoch": 0.2927457801829661, + "grad_norm": 0.8863041996955872, + "learning_rate": 0.00044587119691214075, + "loss": 6.729, + "step": 852 + }, + { + "epoch": 0.2930893785165142, + "grad_norm": 0.6901432275772095, + "learning_rate": 0.0004456847043652624, + "loss": 6.7657, + "step": 853 + }, + { + "epoch": 0.29343297685006225, + "grad_norm": 1.0988612174987793, + "learning_rate": 0.00044549793024890535, + "loss": 6.5863, + "step": 854 + }, + { + "epoch": 0.29377657518361033, + "grad_norm": 0.700564980506897, + "learning_rate": 0.00044531087483181753, + "loss": 6.6539, + "step": 855 + }, + { + "epoch": 0.29412017351715847, + "grad_norm": 0.5881510376930237, + "learning_rate": 0.00044512353838315177, + "loss": 6.6578, + "step": 856 + }, + { + "epoch": 0.29446377185070655, + "grad_norm": 0.8495005369186401, + "learning_rate": 0.00044493592117246544, + "loss": 6.753, + "step": 857 + }, + { + "epoch": 0.2948073701842546, + "grad_norm": 0.5816578269004822, + "learning_rate": 0.00044474802346971973, + "loss": 6.6681, + "step": 858 + }, + { + "epoch": 0.2951509685178027, + "grad_norm": 0.6731425523757935, + "learning_rate": 0.00044455984554527927, + "loss": 6.7042, + "step": 859 + }, + { + "epoch": 0.2954945668513508, + "grad_norm": 0.5258037447929382, + "learning_rate": 0.0004443713876699124, + "loss": 6.6961, + "step": 860 + }, + { + "epoch": 0.29583816518489886, + "grad_norm": 0.6853783130645752, + "learning_rate": 0.00044418265011478964, + "loss": 6.7168, + "step": 861 + }, + { + "epoch": 0.29618176351844694, + "grad_norm": 0.5510942935943604, + "learning_rate": 0.0004439936331514844, + "loss": 6.5608, + "step": 862 + }, + { + "epoch": 0.296525361851995, + "grad_norm": 0.7021049857139587, + "learning_rate": 0.000443804337051972, + "loss": 6.6402, + "step": 863 + }, + { + "epoch": 0.2968689601855431, + "grad_norm": 0.522904634475708, + "learning_rate": 0.0004436147620886294, + "loss": 6.7086, + "step": 864 + }, + { + "epoch": 0.2972125585190912, + "grad_norm": 0.6038668155670166, + "learning_rate": 0.00044342490853423476, + "loss": 6.7431, + "step": 865 + }, + { + "epoch": 0.29755615685263925, + "grad_norm": 0.5508298277854919, + "learning_rate": 0.0004432347766619672, + "loss": 6.6429, + "step": 866 + }, + { + "epoch": 0.29789975518618733, + "grad_norm": 0.4411887526512146, + "learning_rate": 0.00044304436674540626, + "loss": 6.6424, + "step": 867 + }, + { + "epoch": 0.2982433535197354, + "grad_norm": 0.692892849445343, + "learning_rate": 0.0004428536790585315, + "loss": 6.601, + "step": 868 + }, + { + "epoch": 0.2985869518532835, + "grad_norm": 0.38894644379615784, + "learning_rate": 0.00044266271387572234, + "loss": 6.7866, + "step": 869 + }, + { + "epoch": 0.29893055018683157, + "grad_norm": 0.5061523914337158, + "learning_rate": 0.00044247147147175725, + "loss": 6.7596, + "step": 870 + }, + { + "epoch": 0.2992741485203797, + "grad_norm": 0.5069543123245239, + "learning_rate": 0.00044227995212181375, + "loss": 6.7587, + "step": 871 + }, + { + "epoch": 0.2996177468539278, + "grad_norm": 0.48187875747680664, + "learning_rate": 0.0004420881561014679, + "loss": 6.5468, + "step": 872 + }, + { + "epoch": 0.29996134518747586, + "grad_norm": 0.5325146317481995, + "learning_rate": 0.00044189608368669364, + "loss": 6.7372, + "step": 873 + }, + { + "epoch": 0.30030494352102394, + "grad_norm": 0.5697482228279114, + "learning_rate": 0.0004417037351538628, + "loss": 6.6634, + "step": 874 + }, + { + "epoch": 0.300648541854572, + "grad_norm": 0.4286175072193146, + "learning_rate": 0.0004415111107797445, + "loss": 6.6235, + "step": 875 + }, + { + "epoch": 0.3009921401881201, + "grad_norm": 0.6160731911659241, + "learning_rate": 0.0004413182108415047, + "loss": 6.5816, + "step": 876 + }, + { + "epoch": 0.3013357385216682, + "grad_norm": 0.4158060550689697, + "learning_rate": 0.00044112503561670593, + "loss": 6.6958, + "step": 877 + }, + { + "epoch": 0.30167933685521625, + "grad_norm": 0.43943968415260315, + "learning_rate": 0.00044093158538330675, + "loss": 6.7403, + "step": 878 + }, + { + "epoch": 0.30202293518876433, + "grad_norm": 0.45871615409851074, + "learning_rate": 0.0004407378604196615, + "loss": 6.6959, + "step": 879 + }, + { + "epoch": 0.3023665335223124, + "grad_norm": 0.35182371735572815, + "learning_rate": 0.00044054386100451974, + "loss": 6.7437, + "step": 880 + }, + { + "epoch": 0.3027101318558605, + "grad_norm": 0.5132912397384644, + "learning_rate": 0.0004403495874170261, + "loss": 6.6109, + "step": 881 + }, + { + "epoch": 0.30305373018940857, + "grad_norm": 0.4913354814052582, + "learning_rate": 0.00044015503993671953, + "loss": 6.6739, + "step": 882 + }, + { + "epoch": 0.30339732852295664, + "grad_norm": 0.37906894087791443, + "learning_rate": 0.0004399602188435332, + "loss": 6.5922, + "step": 883 + }, + { + "epoch": 0.3037409268565047, + "grad_norm": 0.4221290946006775, + "learning_rate": 0.0004397651244177939, + "loss": 6.6821, + "step": 884 + }, + { + "epoch": 0.3040845251900528, + "grad_norm": 0.40234696865081787, + "learning_rate": 0.0004395697569402218, + "loss": 6.5751, + "step": 885 + }, + { + "epoch": 0.30442812352360094, + "grad_norm": 0.6360386610031128, + "learning_rate": 0.00043937411669192996, + "loss": 6.6542, + "step": 886 + }, + { + "epoch": 0.304771721857149, + "grad_norm": 0.4311862885951996, + "learning_rate": 0.0004391782039544238, + "loss": 6.6606, + "step": 887 + }, + { + "epoch": 0.3051153201906971, + "grad_norm": 0.5377070903778076, + "learning_rate": 0.000438982019009601, + "loss": 6.7238, + "step": 888 + }, + { + "epoch": 0.30545891852424517, + "grad_norm": 0.441566526889801, + "learning_rate": 0.0004387855621397508, + "loss": 6.5752, + "step": 889 + }, + { + "epoch": 0.30580251685779325, + "grad_norm": 0.42087745666503906, + "learning_rate": 0.00043858883362755377, + "loss": 6.6638, + "step": 890 + }, + { + "epoch": 0.30614611519134133, + "grad_norm": 0.39083683490753174, + "learning_rate": 0.00043839183375608115, + "loss": 6.6812, + "step": 891 + }, + { + "epoch": 0.3064897135248894, + "grad_norm": 0.5337326526641846, + "learning_rate": 0.0004381945628087951, + "loss": 6.6301, + "step": 892 + }, + { + "epoch": 0.3068333118584375, + "grad_norm": 0.43058067560195923, + "learning_rate": 0.0004379970210695473, + "loss": 6.6669, + "step": 893 + }, + { + "epoch": 0.30717691019198556, + "grad_norm": 0.5187793374061584, + "learning_rate": 0.0004377992088225794, + "loss": 6.6735, + "step": 894 + }, + { + "epoch": 0.30752050852553364, + "grad_norm": 0.4057919681072235, + "learning_rate": 0.0004376011263525221, + "loss": 6.7691, + "step": 895 + }, + { + "epoch": 0.3078641068590817, + "grad_norm": 0.5876466631889343, + "learning_rate": 0.0004374027739443952, + "loss": 6.8599, + "step": 896 + }, + { + "epoch": 0.3082077051926298, + "grad_norm": 0.8425361514091492, + "learning_rate": 0.00043720415188360645, + "loss": 6.8207, + "step": 897 + }, + { + "epoch": 0.3085513035261779, + "grad_norm": 0.6548316478729248, + "learning_rate": 0.00043700526045595213, + "loss": 6.7563, + "step": 898 + }, + { + "epoch": 0.30889490185972596, + "grad_norm": 0.8088213801383972, + "learning_rate": 0.00043680609994761565, + "loss": 6.7839, + "step": 899 + }, + { + "epoch": 0.30923850019327404, + "grad_norm": 0.962527334690094, + "learning_rate": 0.00043660667064516795, + "loss": 6.8026, + "step": 900 + }, + { + "epoch": 0.30958209852682217, + "grad_norm": 1.5333473682403564, + "learning_rate": 0.0004364069728355665, + "loss": 6.5689, + "step": 901 + }, + { + "epoch": 0.30992569686037025, + "grad_norm": 0.7353837490081787, + "learning_rate": 0.0004362070068061553, + "loss": 6.7109, + "step": 902 + }, + { + "epoch": 0.3102692951939183, + "grad_norm": 1.1564072370529175, + "learning_rate": 0.00043600677284466404, + "loss": 6.7302, + "step": 903 + }, + { + "epoch": 0.3106128935274664, + "grad_norm": 0.8054890632629395, + "learning_rate": 0.00043580627123920824, + "loss": 6.5836, + "step": 904 + }, + { + "epoch": 0.3109564918610145, + "grad_norm": 0.8645017743110657, + "learning_rate": 0.0004356055022782884, + "loss": 6.5373, + "step": 905 + }, + { + "epoch": 0.31130009019456256, + "grad_norm": 1.007243037223816, + "learning_rate": 0.00043540446625078957, + "loss": 6.6522, + "step": 906 + }, + { + "epoch": 0.31164368852811064, + "grad_norm": 0.5772066116333008, + "learning_rate": 0.0004352031634459813, + "loss": 6.5941, + "step": 907 + }, + { + "epoch": 0.3119872868616587, + "grad_norm": 1.151341199874878, + "learning_rate": 0.00043500159415351693, + "loss": 6.71, + "step": 908 + }, + { + "epoch": 0.3123308851952068, + "grad_norm": 0.7844879031181335, + "learning_rate": 0.00043479975866343316, + "loss": 6.564, + "step": 909 + }, + { + "epoch": 0.3126744835287549, + "grad_norm": 0.6750956177711487, + "learning_rate": 0.0004345976572661499, + "loss": 6.52, + "step": 910 + }, + { + "epoch": 0.31301808186230295, + "grad_norm": 0.9015241861343384, + "learning_rate": 0.0004343952902524695, + "loss": 6.641, + "step": 911 + }, + { + "epoch": 0.31336168019585103, + "grad_norm": 0.6968957185745239, + "learning_rate": 0.00043419265791357656, + "loss": 6.5191, + "step": 912 + }, + { + "epoch": 0.3137052785293991, + "grad_norm": 0.6399244070053101, + "learning_rate": 0.00043398976054103756, + "loss": 6.4583, + "step": 913 + }, + { + "epoch": 0.3140488768629472, + "grad_norm": 0.7823150157928467, + "learning_rate": 0.0004337865984268001, + "loss": 6.6004, + "step": 914 + }, + { + "epoch": 0.3143924751964953, + "grad_norm": 0.6562049984931946, + "learning_rate": 0.000433583171863193, + "loss": 6.6225, + "step": 915 + }, + { + "epoch": 0.3147360735300434, + "grad_norm": 0.5780950784683228, + "learning_rate": 0.0004333794811429253, + "loss": 6.682, + "step": 916 + }, + { + "epoch": 0.3150796718635915, + "grad_norm": 0.7009182572364807, + "learning_rate": 0.0004331755265590864, + "loss": 6.5454, + "step": 917 + }, + { + "epoch": 0.31542327019713956, + "grad_norm": 0.4939836263656616, + "learning_rate": 0.0004329713084051452, + "loss": 6.6302, + "step": 918 + }, + { + "epoch": 0.31576686853068764, + "grad_norm": 0.7301641702651978, + "learning_rate": 0.00043276682697494995, + "loss": 6.5314, + "step": 919 + }, + { + "epoch": 0.3161104668642357, + "grad_norm": 0.4727785289287567, + "learning_rate": 0.00043256208256272765, + "loss": 6.6969, + "step": 920 + }, + { + "epoch": 0.3164540651977838, + "grad_norm": 0.4740373194217682, + "learning_rate": 0.0004323570754630838, + "loss": 6.6621, + "step": 921 + }, + { + "epoch": 0.3167976635313319, + "grad_norm": 0.5652904510498047, + "learning_rate": 0.00043215180597100167, + "loss": 6.5914, + "step": 922 + }, + { + "epoch": 0.31714126186487995, + "grad_norm": 0.678132176399231, + "learning_rate": 0.0004319462743818424, + "loss": 6.6089, + "step": 923 + }, + { + "epoch": 0.31748486019842803, + "grad_norm": 0.3767012357711792, + "learning_rate": 0.0004317404809913439, + "loss": 6.5765, + "step": 924 + }, + { + "epoch": 0.3178284585319761, + "grad_norm": 0.6184304356575012, + "learning_rate": 0.00043153442609562115, + "loss": 6.6968, + "step": 925 + }, + { + "epoch": 0.3181720568655242, + "grad_norm": 0.529187798500061, + "learning_rate": 0.00043132810999116513, + "loss": 6.6796, + "step": 926 + }, + { + "epoch": 0.31851565519907227, + "grad_norm": 0.486133873462677, + "learning_rate": 0.0004311215329748428, + "loss": 6.6639, + "step": 927 + }, + { + "epoch": 0.31885925353262035, + "grad_norm": 0.5016087293624878, + "learning_rate": 0.0004309146953438966, + "loss": 6.5808, + "step": 928 + }, + { + "epoch": 0.3192028518661684, + "grad_norm": 0.5674017071723938, + "learning_rate": 0.00043070759739594365, + "loss": 6.5018, + "step": 929 + }, + { + "epoch": 0.31954645019971656, + "grad_norm": 0.4067033529281616, + "learning_rate": 0.0004305002394289762, + "loss": 6.6528, + "step": 930 + }, + { + "epoch": 0.31989004853326464, + "grad_norm": 0.5798845887184143, + "learning_rate": 0.00043029262174136, + "loss": 6.5737, + "step": 931 + }, + { + "epoch": 0.3202336468668127, + "grad_norm": 0.6043611168861389, + "learning_rate": 0.000430084744631835, + "loss": 6.6468, + "step": 932 + }, + { + "epoch": 0.3205772452003608, + "grad_norm": 0.4977371394634247, + "learning_rate": 0.00042987660839951424, + "loss": 6.6475, + "step": 933 + }, + { + "epoch": 0.3209208435339089, + "grad_norm": 0.5232419371604919, + "learning_rate": 0.0004296682133438836, + "loss": 6.5821, + "step": 934 + }, + { + "epoch": 0.32126444186745695, + "grad_norm": 0.5253320932388306, + "learning_rate": 0.0004294595597648014, + "loss": 6.6007, + "step": 935 + }, + { + "epoch": 0.32160804020100503, + "grad_norm": 0.48890867829322815, + "learning_rate": 0.0004292506479624979, + "loss": 6.5654, + "step": 936 + }, + { + "epoch": 0.3219516385345531, + "grad_norm": 0.5480407476425171, + "learning_rate": 0.00042904147823757504, + "loss": 6.5487, + "step": 937 + }, + { + "epoch": 0.3222952368681012, + "grad_norm": 0.44250285625457764, + "learning_rate": 0.00042883205089100574, + "loss": 6.6104, + "step": 938 + }, + { + "epoch": 0.32263883520164927, + "grad_norm": 0.5121859908103943, + "learning_rate": 0.00042862236622413384, + "loss": 6.5706, + "step": 939 + }, + { + "epoch": 0.32298243353519734, + "grad_norm": 0.4506629407405853, + "learning_rate": 0.00042841242453867313, + "loss": 6.5679, + "step": 940 + }, + { + "epoch": 0.3233260318687454, + "grad_norm": 0.5592184662818909, + "learning_rate": 0.0004282022261367073, + "loss": 6.6794, + "step": 941 + }, + { + "epoch": 0.3236696302022935, + "grad_norm": 0.48790469765663147, + "learning_rate": 0.0004279917713206897, + "loss": 6.7813, + "step": 942 + }, + { + "epoch": 0.3240132285358416, + "grad_norm": 0.7107426524162292, + "learning_rate": 0.00042778106039344227, + "loss": 6.7156, + "step": 943 + }, + { + "epoch": 0.32435682686938966, + "grad_norm": 0.42893269658088684, + "learning_rate": 0.00042757009365815567, + "loss": 6.6114, + "step": 944 + }, + { + "epoch": 0.3247004252029378, + "grad_norm": 0.5906190276145935, + "learning_rate": 0.0004273588714183887, + "loss": 6.6893, + "step": 945 + }, + { + "epoch": 0.32504402353648587, + "grad_norm": 0.8306535482406616, + "learning_rate": 0.00042714739397806746, + "loss": 6.7016, + "step": 946 + }, + { + "epoch": 0.32538762187003395, + "grad_norm": 0.5476863384246826, + "learning_rate": 0.00042693566164148577, + "loss": 6.8143, + "step": 947 + }, + { + "epoch": 0.32573122020358203, + "grad_norm": 1.0128917694091797, + "learning_rate": 0.00042672367471330373, + "loss": 6.6837, + "step": 948 + }, + { + "epoch": 0.3260748185371301, + "grad_norm": 0.6552304625511169, + "learning_rate": 0.00042651143349854817, + "loss": 6.797, + "step": 949 + }, + { + "epoch": 0.3264184168706782, + "grad_norm": 0.8041029572486877, + "learning_rate": 0.0004262989383026115, + "loss": 6.8178, + "step": 950 + }, + { + "epoch": 0.32676201520422626, + "grad_norm": 1.0014656782150269, + "learning_rate": 0.00042608618943125166, + "loss": 6.5204, + "step": 951 + }, + { + "epoch": 0.32710561353777434, + "grad_norm": 0.7724018096923828, + "learning_rate": 0.00042587318719059176, + "loss": 6.3653, + "step": 952 + }, + { + "epoch": 0.3274492118713224, + "grad_norm": 0.7663524746894836, + "learning_rate": 0.00042565993188711934, + "loss": 6.5467, + "step": 953 + }, + { + "epoch": 0.3277928102048705, + "grad_norm": 0.9170653820037842, + "learning_rate": 0.00042544642382768606, + "loss": 6.4953, + "step": 954 + }, + { + "epoch": 0.3281364085384186, + "grad_norm": 0.462952584028244, + "learning_rate": 0.00042523266331950745, + "loss": 6.6712, + "step": 955 + }, + { + "epoch": 0.32848000687196666, + "grad_norm": 0.7020468711853027, + "learning_rate": 0.000425018650670162, + "loss": 6.5491, + "step": 956 + }, + { + "epoch": 0.32882360520551474, + "grad_norm": 0.5389872193336487, + "learning_rate": 0.0004248043861875912, + "loss": 6.499, + "step": 957 + }, + { + "epoch": 0.3291672035390628, + "grad_norm": 0.41495102643966675, + "learning_rate": 0.0004245898701800989, + "loss": 6.5433, + "step": 958 + }, + { + "epoch": 0.3295108018726109, + "grad_norm": 0.5482673048973083, + "learning_rate": 0.00042437510295635075, + "loss": 6.5594, + "step": 959 + }, + { + "epoch": 0.329854400206159, + "grad_norm": 0.5409672260284424, + "learning_rate": 0.0004241600848253739, + "loss": 6.5728, + "step": 960 + }, + { + "epoch": 0.3301979985397071, + "grad_norm": 0.3423202633857727, + "learning_rate": 0.0004239448160965567, + "loss": 6.6138, + "step": 961 + }, + { + "epoch": 0.3305415968732552, + "grad_norm": 0.7785842418670654, + "learning_rate": 0.00042372929707964796, + "loss": 6.4957, + "step": 962 + }, + { + "epoch": 0.33088519520680326, + "grad_norm": 0.49811336398124695, + "learning_rate": 0.0004235135280847565, + "loss": 6.5921, + "step": 963 + }, + { + "epoch": 0.33122879354035134, + "grad_norm": 0.4862115979194641, + "learning_rate": 0.0004232975094223511, + "loss": 6.5412, + "step": 964 + }, + { + "epoch": 0.3315723918738994, + "grad_norm": 0.5971689224243164, + "learning_rate": 0.0004230812414032595, + "loss": 6.4801, + "step": 965 + }, + { + "epoch": 0.3319159902074475, + "grad_norm": 0.48547112941741943, + "learning_rate": 0.0004228647243386685, + "loss": 6.5879, + "step": 966 + }, + { + "epoch": 0.3322595885409956, + "grad_norm": 0.36358118057250977, + "learning_rate": 0.000422647958540123, + "loss": 6.6918, + "step": 967 + }, + { + "epoch": 0.33260318687454365, + "grad_norm": 0.5594439506530762, + "learning_rate": 0.0004224309443195261, + "loss": 6.618, + "step": 968 + }, + { + "epoch": 0.33294678520809173, + "grad_norm": 0.4119882583618164, + "learning_rate": 0.000422213681989138, + "loss": 6.5929, + "step": 969 + }, + { + "epoch": 0.3332903835416398, + "grad_norm": 0.3595133423805237, + "learning_rate": 0.00042199617186157624, + "loss": 6.4484, + "step": 970 + }, + { + "epoch": 0.3336339818751879, + "grad_norm": 0.4845230281352997, + "learning_rate": 0.00042177841424981467, + "loss": 6.5066, + "step": 971 + }, + { + "epoch": 0.33397758020873597, + "grad_norm": 0.467210054397583, + "learning_rate": 0.00042156040946718344, + "loss": 6.5665, + "step": 972 + }, + { + "epoch": 0.33432117854228405, + "grad_norm": 0.537044107913971, + "learning_rate": 0.00042134215782736804, + "loss": 6.5144, + "step": 973 + }, + { + "epoch": 0.3346647768758321, + "grad_norm": 0.5238211750984192, + "learning_rate": 0.00042112365964440965, + "loss": 6.4788, + "step": 974 + }, + { + "epoch": 0.33500837520938026, + "grad_norm": 0.4372142255306244, + "learning_rate": 0.0004209049152327037, + "loss": 6.5712, + "step": 975 + }, + { + "epoch": 0.33535197354292834, + "grad_norm": 0.38211849331855774, + "learning_rate": 0.0004206859249070002, + "loss": 6.6268, + "step": 976 + }, + { + "epoch": 0.3356955718764764, + "grad_norm": 0.3685896098613739, + "learning_rate": 0.00042046668898240296, + "loss": 6.5762, + "step": 977 + }, + { + "epoch": 0.3360391702100245, + "grad_norm": 0.4509871006011963, + "learning_rate": 0.0004202472077743692, + "loss": 6.5991, + "step": 978 + }, + { + "epoch": 0.3363827685435726, + "grad_norm": 0.3491871953010559, + "learning_rate": 0.00042002748159870895, + "loss": 6.5537, + "step": 979 + }, + { + "epoch": 0.33672636687712065, + "grad_norm": 0.38078513741493225, + "learning_rate": 0.00041980751077158487, + "loss": 6.5054, + "step": 980 + }, + { + "epoch": 0.33706996521066873, + "grad_norm": 0.4195086658000946, + "learning_rate": 0.0004195872956095115, + "loss": 6.5264, + "step": 981 + }, + { + "epoch": 0.3374135635442168, + "grad_norm": 0.3654896318912506, + "learning_rate": 0.00041936683642935515, + "loss": 6.4927, + "step": 982 + }, + { + "epoch": 0.3377571618777649, + "grad_norm": 0.4824683368206024, + "learning_rate": 0.000419146133548333, + "loss": 6.5299, + "step": 983 + }, + { + "epoch": 0.33810076021131297, + "grad_norm": 0.4073243737220764, + "learning_rate": 0.00041892518728401317, + "loss": 6.5118, + "step": 984 + }, + { + "epoch": 0.33844435854486105, + "grad_norm": 0.37707698345184326, + "learning_rate": 0.0004187039979543138, + "loss": 6.5245, + "step": 985 + }, + { + "epoch": 0.3387879568784091, + "grad_norm": 0.40587034821510315, + "learning_rate": 0.0004184825658775027, + "loss": 6.6768, + "step": 986 + }, + { + "epoch": 0.3391315552119572, + "grad_norm": 0.4814893901348114, + "learning_rate": 0.00041826089137219724, + "loss": 6.5214, + "step": 987 + }, + { + "epoch": 0.3394751535455053, + "grad_norm": 0.38695332407951355, + "learning_rate": 0.0004180389747573634, + "loss": 6.6829, + "step": 988 + }, + { + "epoch": 0.33981875187905336, + "grad_norm": 0.4304019808769226, + "learning_rate": 0.00041781681635231555, + "loss": 6.6049, + "step": 989 + }, + { + "epoch": 0.3401623502126015, + "grad_norm": 0.41044285893440247, + "learning_rate": 0.00041759441647671604, + "loss": 6.56, + "step": 990 + }, + { + "epoch": 0.3405059485461496, + "grad_norm": 0.5040400624275208, + "learning_rate": 0.00041737177545057456, + "loss": 6.6315, + "step": 991 + }, + { + "epoch": 0.34084954687969765, + "grad_norm": 0.5537315607070923, + "learning_rate": 0.000417148893594248, + "loss": 6.5066, + "step": 992 + }, + { + "epoch": 0.34119314521324573, + "grad_norm": 0.3936966061592102, + "learning_rate": 0.00041692577122843963, + "loss": 6.572, + "step": 993 + }, + { + "epoch": 0.3415367435467938, + "grad_norm": 0.5949533581733704, + "learning_rate": 0.0004167024086741987, + "loss": 6.5359, + "step": 994 + }, + { + "epoch": 0.3418803418803419, + "grad_norm": 0.6033600568771362, + "learning_rate": 0.0004164788062529203, + "loss": 6.5554, + "step": 995 + }, + { + "epoch": 0.34222394021388997, + "grad_norm": 0.5860453248023987, + "learning_rate": 0.0004162549642863445, + "loss": 6.6641, + "step": 996 + }, + { + "epoch": 0.34256753854743804, + "grad_norm": 0.5877623558044434, + "learning_rate": 0.0004160308830965559, + "loss": 6.7018, + "step": 997 + }, + { + "epoch": 0.3429111368809861, + "grad_norm": 0.73051917552948, + "learning_rate": 0.00041580656300598375, + "loss": 6.5305, + "step": 998 + }, + { + "epoch": 0.3432547352145342, + "grad_norm": 0.6099729537963867, + "learning_rate": 0.00041558200433740067, + "loss": 6.6973, + "step": 999 + }, + { + "epoch": 0.3435983335480823, + "grad_norm": 0.9224446415901184, + "learning_rate": 0.0004153572074139228, + "loss": 6.6335, + "step": 1000 + }, + { + "epoch": 0.34394193188163036, + "grad_norm": 1.092505693435669, + "learning_rate": 0.00041513217255900893, + "loss": 6.5569, + "step": 1001 + }, + { + "epoch": 0.34428553021517844, + "grad_norm": 0.5701151490211487, + "learning_rate": 0.00041490690009646024, + "loss": 6.4043, + "step": 1002 + }, + { + "epoch": 0.3446291285487265, + "grad_norm": 0.838062047958374, + "learning_rate": 0.00041468139035042003, + "loss": 6.5534, + "step": 1003 + }, + { + "epoch": 0.3449727268822746, + "grad_norm": 0.7914153337478638, + "learning_rate": 0.0004144556436453727, + "loss": 6.526, + "step": 1004 + }, + { + "epoch": 0.34531632521582273, + "grad_norm": 0.47454750537872314, + "learning_rate": 0.00041422966030614375, + "loss": 6.5954, + "step": 1005 + }, + { + "epoch": 0.3456599235493708, + "grad_norm": 0.8763797283172607, + "learning_rate": 0.0004140034406578991, + "loss": 6.5447, + "step": 1006 + }, + { + "epoch": 0.3460035218829189, + "grad_norm": 0.7527329325675964, + "learning_rate": 0.000413776985026145, + "loss": 6.423, + "step": 1007 + }, + { + "epoch": 0.34634712021646696, + "grad_norm": 0.5256273150444031, + "learning_rate": 0.0004135502937367268, + "loss": 6.5776, + "step": 1008 + }, + { + "epoch": 0.34669071855001504, + "grad_norm": 0.8483496308326721, + "learning_rate": 0.00041332336711582916, + "loss": 6.4731, + "step": 1009 + }, + { + "epoch": 0.3470343168835631, + "grad_norm": 0.5697401165962219, + "learning_rate": 0.00041309620548997557, + "loss": 6.5361, + "step": 1010 + }, + { + "epoch": 0.3473779152171112, + "grad_norm": 0.4558808505535126, + "learning_rate": 0.0004128688091860273, + "loss": 6.4667, + "step": 1011 + }, + { + "epoch": 0.3477215135506593, + "grad_norm": 0.7396731376647949, + "learning_rate": 0.00041264117853118343, + "loss": 6.5441, + "step": 1012 + }, + { + "epoch": 0.34806511188420736, + "grad_norm": 0.5605579614639282, + "learning_rate": 0.0004124133138529803, + "loss": 6.561, + "step": 1013 + }, + { + "epoch": 0.34840871021775544, + "grad_norm": 0.5781577229499817, + "learning_rate": 0.00041218521547929096, + "loss": 6.5182, + "step": 1014 + }, + { + "epoch": 0.3487523085513035, + "grad_norm": 0.5698264837265015, + "learning_rate": 0.00041195688373832465, + "loss": 6.4753, + "step": 1015 + }, + { + "epoch": 0.3490959068848516, + "grad_norm": 0.6196632385253906, + "learning_rate": 0.0004117283189586265, + "loss": 6.6474, + "step": 1016 + }, + { + "epoch": 0.34943950521839967, + "grad_norm": 0.631287693977356, + "learning_rate": 0.00041149952146907697, + "loss": 6.508, + "step": 1017 + }, + { + "epoch": 0.34978310355194775, + "grad_norm": 0.6482592821121216, + "learning_rate": 0.0004112704915988913, + "loss": 6.4538, + "step": 1018 + }, + { + "epoch": 0.35012670188549583, + "grad_norm": 0.45326462388038635, + "learning_rate": 0.00041104122967761906, + "loss": 6.4599, + "step": 1019 + }, + { + "epoch": 0.35047030021904396, + "grad_norm": 0.5699672698974609, + "learning_rate": 0.0004108117360351438, + "loss": 6.5104, + "step": 1020 + }, + { + "epoch": 0.35081389855259204, + "grad_norm": 0.5698271989822388, + "learning_rate": 0.0004105820110016825, + "loss": 6.4758, + "step": 1021 + }, + { + "epoch": 0.3511574968861401, + "grad_norm": 0.48133784532546997, + "learning_rate": 0.000410352054907785, + "loss": 6.4689, + "step": 1022 + }, + { + "epoch": 0.3515010952196882, + "grad_norm": 0.5399252772331238, + "learning_rate": 0.00041012186808433364, + "loss": 6.5425, + "step": 1023 + }, + { + "epoch": 0.3518446935532363, + "grad_norm": 0.4314304292201996, + "learning_rate": 0.00040989145086254295, + "loss": 6.5328, + "step": 1024 + }, + { + "epoch": 0.35218829188678435, + "grad_norm": 0.5915732979774475, + "learning_rate": 0.0004096608035739585, + "loss": 6.504, + "step": 1025 + }, + { + "epoch": 0.35253189022033243, + "grad_norm": 0.5009291768074036, + "learning_rate": 0.0004094299265504575, + "loss": 6.4955, + "step": 1026 + }, + { + "epoch": 0.3528754885538805, + "grad_norm": 0.3504527807235718, + "learning_rate": 0.00040919882012424737, + "loss": 6.433, + "step": 1027 + }, + { + "epoch": 0.3532190868874286, + "grad_norm": 0.4587783217430115, + "learning_rate": 0.0004089674846278656, + "loss": 6.4936, + "step": 1028 + }, + { + "epoch": 0.35356268522097667, + "grad_norm": 0.5252232551574707, + "learning_rate": 0.00040873592039417935, + "loss": 6.5655, + "step": 1029 + }, + { + "epoch": 0.35390628355452475, + "grad_norm": 0.491263747215271, + "learning_rate": 0.000408504127756385, + "loss": 6.5125, + "step": 1030 + }, + { + "epoch": 0.3542498818880728, + "grad_norm": 0.4879305362701416, + "learning_rate": 0.0004082721070480075, + "loss": 6.623, + "step": 1031 + }, + { + "epoch": 0.3545934802216209, + "grad_norm": 0.6748274564743042, + "learning_rate": 0.00040803985860289995, + "loss": 6.5595, + "step": 1032 + }, + { + "epoch": 0.354937078555169, + "grad_norm": 0.356755793094635, + "learning_rate": 0.0004078073827552432, + "loss": 6.5241, + "step": 1033 + }, + { + "epoch": 0.3552806768887171, + "grad_norm": 0.6783245205879211, + "learning_rate": 0.0004075746798395452, + "loss": 6.5518, + "step": 1034 + }, + { + "epoch": 0.3556242752222652, + "grad_norm": 0.5618282556533813, + "learning_rate": 0.0004073417501906407, + "loss": 6.526, + "step": 1035 + }, + { + "epoch": 0.3559678735558133, + "grad_norm": 0.4738239347934723, + "learning_rate": 0.0004071085941436908, + "loss": 6.6759, + "step": 1036 + }, + { + "epoch": 0.35631147188936135, + "grad_norm": 0.7224136590957642, + "learning_rate": 0.00040687521203418216, + "loss": 6.519, + "step": 1037 + }, + { + "epoch": 0.35665507022290943, + "grad_norm": 0.638771116733551, + "learning_rate": 0.00040664160419792684, + "loss": 6.466, + "step": 1038 + }, + { + "epoch": 0.3569986685564575, + "grad_norm": 0.562751293182373, + "learning_rate": 0.00040640777097106164, + "loss": 6.6204, + "step": 1039 + }, + { + "epoch": 0.3573422668900056, + "grad_norm": 0.7001228928565979, + "learning_rate": 0.00040617371269004783, + "loss": 6.6701, + "step": 1040 + }, + { + "epoch": 0.35768586522355367, + "grad_norm": 0.7808020114898682, + "learning_rate": 0.0004059394296916702, + "loss": 6.4939, + "step": 1041 + }, + { + "epoch": 0.35802946355710175, + "grad_norm": 0.8399685025215149, + "learning_rate": 0.00040570492231303725, + "loss": 6.5546, + "step": 1042 + }, + { + "epoch": 0.3583730618906498, + "grad_norm": 0.659149169921875, + "learning_rate": 0.00040547019089158006, + "loss": 6.5908, + "step": 1043 + }, + { + "epoch": 0.3587166602241979, + "grad_norm": 0.7955062389373779, + "learning_rate": 0.00040523523576505217, + "loss": 6.5679, + "step": 1044 + }, + { + "epoch": 0.359060258557746, + "grad_norm": 0.7686776518821716, + "learning_rate": 0.000405000057271529, + "loss": 6.4661, + "step": 1045 + }, + { + "epoch": 0.35940385689129406, + "grad_norm": 0.5270580053329468, + "learning_rate": 0.0004047646557494076, + "loss": 6.4503, + "step": 1046 + }, + { + "epoch": 0.35974745522484214, + "grad_norm": 0.7081395983695984, + "learning_rate": 0.0004045290315374054, + "loss": 6.5489, + "step": 1047 + }, + { + "epoch": 0.3600910535583902, + "grad_norm": 0.8378854393959045, + "learning_rate": 0.00040429318497456075, + "loss": 6.7946, + "step": 1048 + }, + { + "epoch": 0.36043465189193835, + "grad_norm": 0.59283846616745, + "learning_rate": 0.00040405711640023183, + "loss": 6.6438, + "step": 1049 + }, + { + "epoch": 0.36077825022548643, + "grad_norm": 0.998915433883667, + "learning_rate": 0.0004038208261540961, + "loss": 6.4583, + "step": 1050 + }, + { + "epoch": 0.3611218485590345, + "grad_norm": 1.1409025192260742, + "learning_rate": 0.0004035843145761502, + "loss": 6.3864, + "step": 1051 + }, + { + "epoch": 0.3614654468925826, + "grad_norm": 0.7622621059417725, + "learning_rate": 0.0004033475820067091, + "loss": 6.552, + "step": 1052 + }, + { + "epoch": 0.36180904522613067, + "grad_norm": 0.8285176753997803, + "learning_rate": 0.0004031106287864057, + "loss": 6.4924, + "step": 1053 + }, + { + "epoch": 0.36215264355967874, + "grad_norm": 0.8217036724090576, + "learning_rate": 0.0004028734552561906, + "loss": 6.5873, + "step": 1054 + }, + { + "epoch": 0.3624962418932268, + "grad_norm": 0.797195553779602, + "learning_rate": 0.00040263606175733124, + "loss": 6.4403, + "step": 1055 + }, + { + "epoch": 0.3628398402267749, + "grad_norm": 0.6831180453300476, + "learning_rate": 0.00040239844863141163, + "loss": 6.5032, + "step": 1056 + }, + { + "epoch": 0.363183438560323, + "grad_norm": 0.6810872554779053, + "learning_rate": 0.0004021606162203318, + "loss": 6.4218, + "step": 1057 + }, + { + "epoch": 0.36352703689387106, + "grad_norm": 0.6327046155929565, + "learning_rate": 0.00040192256486630724, + "loss": 6.4264, + "step": 1058 + }, + { + "epoch": 0.36387063522741914, + "grad_norm": 0.5407282114028931, + "learning_rate": 0.0004016842949118686, + "loss": 6.4026, + "step": 1059 + }, + { + "epoch": 0.3642142335609672, + "grad_norm": 0.7327606081962585, + "learning_rate": 0.000401445806699861, + "loss": 6.5312, + "step": 1060 + }, + { + "epoch": 0.3645578318945153, + "grad_norm": 0.7049651145935059, + "learning_rate": 0.0004012071005734435, + "loss": 6.3978, + "step": 1061 + }, + { + "epoch": 0.3649014302280634, + "grad_norm": 0.6945012211799622, + "learning_rate": 0.000400968176876089, + "loss": 6.4396, + "step": 1062 + }, + { + "epoch": 0.36524502856161145, + "grad_norm": 0.7001397609710693, + "learning_rate": 0.0004007290359515832, + "loss": 6.5293, + "step": 1063 + }, + { + "epoch": 0.3655886268951596, + "grad_norm": 0.5994911193847656, + "learning_rate": 0.0004004896781440244, + "loss": 6.5305, + "step": 1064 + }, + { + "epoch": 0.36593222522870766, + "grad_norm": 0.4745972752571106, + "learning_rate": 0.0004002501037978232, + "loss": 6.5981, + "step": 1065 + }, + { + "epoch": 0.36627582356225574, + "grad_norm": 0.5303031802177429, + "learning_rate": 0.0004000103132577014, + "loss": 6.4155, + "step": 1066 + }, + { + "epoch": 0.3666194218958038, + "grad_norm": 0.4650964140892029, + "learning_rate": 0.0003997703068686923, + "loss": 6.4578, + "step": 1067 + }, + { + "epoch": 0.3669630202293519, + "grad_norm": 0.5369909405708313, + "learning_rate": 0.0003995300849761394, + "loss": 6.4916, + "step": 1068 + }, + { + "epoch": 0.3673066185629, + "grad_norm": 0.4537830650806427, + "learning_rate": 0.00039928964792569654, + "loss": 6.4896, + "step": 1069 + }, + { + "epoch": 0.36765021689644806, + "grad_norm": 0.467717707157135, + "learning_rate": 0.0003990489960633271, + "loss": 6.4669, + "step": 1070 + }, + { + "epoch": 0.36799381522999614, + "grad_norm": 0.5717526078224182, + "learning_rate": 0.00039880812973530335, + "loss": 6.39, + "step": 1071 + }, + { + "epoch": 0.3683374135635442, + "grad_norm": 0.3943920135498047, + "learning_rate": 0.0003985670492882065, + "loss": 6.4894, + "step": 1072 + }, + { + "epoch": 0.3686810118970923, + "grad_norm": 0.4640418291091919, + "learning_rate": 0.00039832575506892556, + "loss": 6.5758, + "step": 1073 + }, + { + "epoch": 0.36902461023064037, + "grad_norm": 0.527631938457489, + "learning_rate": 0.0003980842474246573, + "loss": 6.5324, + "step": 1074 + }, + { + "epoch": 0.36936820856418845, + "grad_norm": 0.5745725035667419, + "learning_rate": 0.00039784252670290555, + "loss": 6.4548, + "step": 1075 + }, + { + "epoch": 0.36971180689773653, + "grad_norm": 0.3347758948802948, + "learning_rate": 0.00039760059325148067, + "loss": 6.406, + "step": 1076 + }, + { + "epoch": 0.3700554052312846, + "grad_norm": 0.42385727167129517, + "learning_rate": 0.0003973584474184992, + "loss": 6.5037, + "step": 1077 + }, + { + "epoch": 0.3703990035648327, + "grad_norm": 0.3935813009738922, + "learning_rate": 0.00039711608955238334, + "loss": 6.5546, + "step": 1078 + }, + { + "epoch": 0.3707426018983808, + "grad_norm": 0.6405526995658875, + "learning_rate": 0.00039687352000186005, + "loss": 6.4536, + "step": 1079 + }, + { + "epoch": 0.3710862002319289, + "grad_norm": 0.4569138288497925, + "learning_rate": 0.00039663073911596134, + "loss": 6.5994, + "step": 1080 + }, + { + "epoch": 0.371429798565477, + "grad_norm": 0.5814416408538818, + "learning_rate": 0.00039638774724402295, + "loss": 6.5257, + "step": 1081 + }, + { + "epoch": 0.37177339689902505, + "grad_norm": 0.604465126991272, + "learning_rate": 0.0003961445447356844, + "loss": 6.4128, + "step": 1082 + }, + { + "epoch": 0.37211699523257313, + "grad_norm": 0.45002633333206177, + "learning_rate": 0.00039590113194088827, + "loss": 6.4803, + "step": 1083 + }, + { + "epoch": 0.3724605935661212, + "grad_norm": 0.6199561953544617, + "learning_rate": 0.00039565750920987966, + "loss": 6.5573, + "step": 1084 + }, + { + "epoch": 0.3728041918996693, + "grad_norm": 0.45244845747947693, + "learning_rate": 0.00039541367689320566, + "loss": 6.5621, + "step": 1085 + }, + { + "epoch": 0.37314779023321737, + "grad_norm": 0.6887307167053223, + "learning_rate": 0.0003951696353417152, + "loss": 6.5448, + "step": 1086 + }, + { + "epoch": 0.37349138856676545, + "grad_norm": 0.5372146368026733, + "learning_rate": 0.000394925384906558, + "loss": 6.5137, + "step": 1087 + }, + { + "epoch": 0.3738349869003135, + "grad_norm": 0.530670702457428, + "learning_rate": 0.0003946809259391846, + "loss": 6.5199, + "step": 1088 + }, + { + "epoch": 0.3741785852338616, + "grad_norm": 0.7162120342254639, + "learning_rate": 0.00039443625879134525, + "loss": 6.473, + "step": 1089 + }, + { + "epoch": 0.3745221835674097, + "grad_norm": 0.5712336897850037, + "learning_rate": 0.0003941913838150902, + "loss": 6.6048, + "step": 1090 + }, + { + "epoch": 0.37486578190095776, + "grad_norm": 0.6182492971420288, + "learning_rate": 0.0003939463013627683, + "loss": 6.5619, + "step": 1091 + }, + { + "epoch": 0.37520938023450584, + "grad_norm": 0.6004253625869751, + "learning_rate": 0.00039370101178702724, + "loss": 6.5003, + "step": 1092 + }, + { + "epoch": 0.3755529785680539, + "grad_norm": 0.7150577306747437, + "learning_rate": 0.00039345551544081256, + "loss": 6.5916, + "step": 1093 + }, + { + "epoch": 0.37589657690160205, + "grad_norm": 0.8572820425033569, + "learning_rate": 0.00039320981267736745, + "loss": 6.5192, + "step": 1094 + }, + { + "epoch": 0.37624017523515013, + "grad_norm": 0.6833814382553101, + "learning_rate": 0.00039296390385023204, + "loss": 6.5086, + "step": 1095 + }, + { + "epoch": 0.3765837735686982, + "grad_norm": 0.7316805720329285, + "learning_rate": 0.00039271778931324297, + "loss": 6.5474, + "step": 1096 + }, + { + "epoch": 0.3769273719022463, + "grad_norm": 0.6537583470344543, + "learning_rate": 0.00039247146942053297, + "loss": 6.4364, + "step": 1097 + }, + { + "epoch": 0.37727097023579437, + "grad_norm": 0.8147690892219543, + "learning_rate": 0.00039222494452653006, + "loss": 6.6332, + "step": 1098 + }, + { + "epoch": 0.37761456856934245, + "grad_norm": 0.6788108348846436, + "learning_rate": 0.00039197821498595744, + "loss": 6.6515, + "step": 1099 + }, + { + "epoch": 0.3779581669028905, + "grad_norm": 0.886707603931427, + "learning_rate": 0.0003917312811538325, + "loss": 6.5627, + "step": 1100 + }, + { + "epoch": 0.3783017652364386, + "grad_norm": 0.9681865572929382, + "learning_rate": 0.000391484143385467, + "loss": 6.4266, + "step": 1101 + }, + { + "epoch": 0.3786453635699867, + "grad_norm": 0.6177529096603394, + "learning_rate": 0.0003912368020364657, + "loss": 6.3702, + "step": 1102 + }, + { + "epoch": 0.37898896190353476, + "grad_norm": 1.0723011493682861, + "learning_rate": 0.0003909892574627266, + "loss": 6.4111, + "step": 1103 + }, + { + "epoch": 0.37933256023708284, + "grad_norm": 0.870075523853302, + "learning_rate": 0.0003907415100204401, + "loss": 6.5443, + "step": 1104 + }, + { + "epoch": 0.3796761585706309, + "grad_norm": 0.5722936987876892, + "learning_rate": 0.0003904935600660883, + "loss": 6.452, + "step": 1105 + }, + { + "epoch": 0.380019756904179, + "grad_norm": 0.8501458168029785, + "learning_rate": 0.0003902454079564447, + "loss": 6.3924, + "step": 1106 + }, + { + "epoch": 0.3803633552377271, + "grad_norm": 0.7085288763046265, + "learning_rate": 0.0003899970540485741, + "loss": 6.3751, + "step": 1107 + }, + { + "epoch": 0.38070695357127515, + "grad_norm": 0.509347140789032, + "learning_rate": 0.00038974849869983114, + "loss": 6.4594, + "step": 1108 + }, + { + "epoch": 0.3810505519048233, + "grad_norm": 0.6944195032119751, + "learning_rate": 0.00038949974226786053, + "loss": 6.2957, + "step": 1109 + }, + { + "epoch": 0.38139415023837137, + "grad_norm": 0.5596153736114502, + "learning_rate": 0.0003892507851105965, + "loss": 6.3734, + "step": 1110 + }, + { + "epoch": 0.38173774857191944, + "grad_norm": 0.5919499397277832, + "learning_rate": 0.0003890016275862618, + "loss": 6.4215, + "step": 1111 + }, + { + "epoch": 0.3820813469054675, + "grad_norm": 0.4953160583972931, + "learning_rate": 0.0003887522700533675, + "loss": 6.3711, + "step": 1112 + }, + { + "epoch": 0.3824249452390156, + "grad_norm": 0.580381453037262, + "learning_rate": 0.0003885027128707127, + "loss": 6.5075, + "step": 1113 + }, + { + "epoch": 0.3827685435725637, + "grad_norm": 0.4716396629810333, + "learning_rate": 0.0003882529563973837, + "loss": 6.4973, + "step": 1114 + }, + { + "epoch": 0.38311214190611176, + "grad_norm": 0.5329322814941406, + "learning_rate": 0.00038800300099275345, + "loss": 6.4449, + "step": 1115 + }, + { + "epoch": 0.38345574023965984, + "grad_norm": 0.5385730862617493, + "learning_rate": 0.00038775284701648115, + "loss": 6.3797, + "step": 1116 + }, + { + "epoch": 0.3837993385732079, + "grad_norm": 0.592823326587677, + "learning_rate": 0.00038750249482851184, + "loss": 6.579, + "step": 1117 + }, + { + "epoch": 0.384142936906756, + "grad_norm": 0.5093126893043518, + "learning_rate": 0.00038725194478907556, + "loss": 6.4524, + "step": 1118 + }, + { + "epoch": 0.3844865352403041, + "grad_norm": 0.42565739154815674, + "learning_rate": 0.00038700119725868735, + "loss": 6.5539, + "step": 1119 + }, + { + "epoch": 0.38483013357385215, + "grad_norm": 0.4704289734363556, + "learning_rate": 0.00038675025259814606, + "loss": 6.3764, + "step": 1120 + }, + { + "epoch": 0.38517373190740023, + "grad_norm": 0.46135643124580383, + "learning_rate": 0.00038649911116853456, + "loss": 6.4639, + "step": 1121 + }, + { + "epoch": 0.3855173302409483, + "grad_norm": 0.5629793405532837, + "learning_rate": 0.0003862477733312185, + "loss": 6.3743, + "step": 1122 + }, + { + "epoch": 0.3858609285744964, + "grad_norm": 0.41762539744377136, + "learning_rate": 0.0003859962394478464, + "loss": 6.443, + "step": 1123 + }, + { + "epoch": 0.3862045269080445, + "grad_norm": 0.5560463666915894, + "learning_rate": 0.0003857445098803487, + "loss": 6.5066, + "step": 1124 + }, + { + "epoch": 0.3865481252415926, + "grad_norm": 0.5698534250259399, + "learning_rate": 0.00038549258499093756, + "loss": 6.5049, + "step": 1125 + }, + { + "epoch": 0.3868917235751407, + "grad_norm": 0.47976475954055786, + "learning_rate": 0.000385240465142106, + "loss": 6.4264, + "step": 1126 + }, + { + "epoch": 0.38723532190868876, + "grad_norm": 0.5374791026115417, + "learning_rate": 0.00038498815069662766, + "loss": 6.4272, + "step": 1127 + }, + { + "epoch": 0.38757892024223684, + "grad_norm": 0.4447578191757202, + "learning_rate": 0.0003847356420175564, + "loss": 6.3393, + "step": 1128 + }, + { + "epoch": 0.3879225185757849, + "grad_norm": 0.5105568766593933, + "learning_rate": 0.0003844829394682251, + "loss": 6.4803, + "step": 1129 + }, + { + "epoch": 0.388266116909333, + "grad_norm": 0.5288437604904175, + "learning_rate": 0.00038423004341224597, + "loss": 6.4587, + "step": 1130 + }, + { + "epoch": 0.38860971524288107, + "grad_norm": 0.4225218594074249, + "learning_rate": 0.00038397695421350954, + "loss": 6.4213, + "step": 1131 + }, + { + "epoch": 0.38895331357642915, + "grad_norm": 0.5961380004882812, + "learning_rate": 0.0003837236722361842, + "loss": 6.5326, + "step": 1132 + }, + { + "epoch": 0.38929691190997723, + "grad_norm": 0.527696967124939, + "learning_rate": 0.00038347019784471594, + "loss": 6.3996, + "step": 1133 + }, + { + "epoch": 0.3896405102435253, + "grad_norm": 0.5186758041381836, + "learning_rate": 0.00038321653140382727, + "loss": 6.3858, + "step": 1134 + }, + { + "epoch": 0.3899841085770734, + "grad_norm": 0.45041173696517944, + "learning_rate": 0.00038296267327851723, + "loss": 6.561, + "step": 1135 + }, + { + "epoch": 0.39032770691062146, + "grad_norm": 0.6820952892303467, + "learning_rate": 0.0003827086238340608, + "loss": 6.4386, + "step": 1136 + }, + { + "epoch": 0.39067130524416954, + "grad_norm": 0.6062747240066528, + "learning_rate": 0.00038245438343600805, + "loss": 6.3137, + "step": 1137 + }, + { + "epoch": 0.3910149035777176, + "grad_norm": 0.4744030237197876, + "learning_rate": 0.0003821999524501837, + "loss": 6.3566, + "step": 1138 + }, + { + "epoch": 0.39135850191126575, + "grad_norm": 0.4075278639793396, + "learning_rate": 0.00038194533124268716, + "loss": 6.4366, + "step": 1139 + }, + { + "epoch": 0.39170210024481383, + "grad_norm": 0.5966638922691345, + "learning_rate": 0.0003816905201798912, + "loss": 6.4666, + "step": 1140 + }, + { + "epoch": 0.3920456985783619, + "grad_norm": 0.6207997798919678, + "learning_rate": 0.0003814355196284417, + "loss": 6.5711, + "step": 1141 + }, + { + "epoch": 0.39238929691191, + "grad_norm": 0.5189718008041382, + "learning_rate": 0.00038118032995525746, + "loss": 6.4682, + "step": 1142 + }, + { + "epoch": 0.39273289524545807, + "grad_norm": 0.5805226564407349, + "learning_rate": 0.0003809249515275293, + "loss": 6.4373, + "step": 1143 + }, + { + "epoch": 0.39307649357900615, + "grad_norm": 0.5662277340888977, + "learning_rate": 0.00038066938471271946, + "loss": 6.5011, + "step": 1144 + }, + { + "epoch": 0.3934200919125542, + "grad_norm": 0.6074782609939575, + "learning_rate": 0.00038041362987856155, + "loss": 6.5347, + "step": 1145 + }, + { + "epoch": 0.3937636902461023, + "grad_norm": 0.6265222430229187, + "learning_rate": 0.00038015768739305946, + "loss": 6.5338, + "step": 1146 + }, + { + "epoch": 0.3941072885796504, + "grad_norm": 0.6739917993545532, + "learning_rate": 0.0003799015576244874, + "loss": 6.4666, + "step": 1147 + }, + { + "epoch": 0.39445088691319846, + "grad_norm": 0.6248368620872498, + "learning_rate": 0.0003796452409413887, + "loss": 6.5219, + "step": 1148 + }, + { + "epoch": 0.39479448524674654, + "grad_norm": 0.7328528761863708, + "learning_rate": 0.00037938873771257585, + "loss": 6.4904, + "step": 1149 + }, + { + "epoch": 0.3951380835802946, + "grad_norm": 0.9122981429100037, + "learning_rate": 0.0003791320483071298, + "loss": 6.5959, + "step": 1150 + }, + { + "epoch": 0.3954816819138427, + "grad_norm": 0.8380439877510071, + "learning_rate": 0.0003788751730943991, + "loss": 6.5559, + "step": 1151 + }, + { + "epoch": 0.3958252802473908, + "grad_norm": 0.6608699560165405, + "learning_rate": 0.0003786181124440001, + "loss": 6.6044, + "step": 1152 + }, + { + "epoch": 0.3961688785809389, + "grad_norm": 0.7114856839179993, + "learning_rate": 0.0003783608667258156, + "loss": 6.5022, + "step": 1153 + }, + { + "epoch": 0.396512476914487, + "grad_norm": 0.5558148622512817, + "learning_rate": 0.0003781034363099949, + "loss": 6.5019, + "step": 1154 + }, + { + "epoch": 0.39685607524803507, + "grad_norm": 0.612568736076355, + "learning_rate": 0.00037784582156695284, + "loss": 6.4562, + "step": 1155 + }, + { + "epoch": 0.39719967358158315, + "grad_norm": 0.6382710337638855, + "learning_rate": 0.0003775880228673699, + "loss": 6.3982, + "step": 1156 + }, + { + "epoch": 0.3975432719151312, + "grad_norm": 0.6008266806602478, + "learning_rate": 0.00037733004058219076, + "loss": 6.4837, + "step": 1157 + }, + { + "epoch": 0.3978868702486793, + "grad_norm": 0.5972487926483154, + "learning_rate": 0.0003770718750826246, + "loss": 6.4001, + "step": 1158 + }, + { + "epoch": 0.3982304685822274, + "grad_norm": 0.6197747588157654, + "learning_rate": 0.0003768135267401441, + "loss": 6.3853, + "step": 1159 + }, + { + "epoch": 0.39857406691577546, + "grad_norm": 0.7038761377334595, + "learning_rate": 0.00037655499592648513, + "loss": 6.5378, + "step": 1160 + }, + { + "epoch": 0.39891766524932354, + "grad_norm": 0.5936171412467957, + "learning_rate": 0.0003762962830136458, + "loss": 6.2935, + "step": 1161 + }, + { + "epoch": 0.3992612635828716, + "grad_norm": 0.7182568907737732, + "learning_rate": 0.00037603738837388667, + "loss": 6.2826, + "step": 1162 + }, + { + "epoch": 0.3996048619164197, + "grad_norm": 0.655178427696228, + "learning_rate": 0.0003757783123797297, + "loss": 6.3054, + "step": 1163 + }, + { + "epoch": 0.3999484602499678, + "grad_norm": 0.40302804112434387, + "learning_rate": 0.00037551905540395735, + "loss": 6.3942, + "step": 1164 + }, + { + "epoch": 0.40029205858351585, + "grad_norm": 0.7752393484115601, + "learning_rate": 0.0003752596178196131, + "loss": 6.252, + "step": 1165 + }, + { + "epoch": 0.40063565691706393, + "grad_norm": 0.5546431541442871, + "learning_rate": 0.000375, + "loss": 6.3494, + "step": 1166 + }, + { + "epoch": 0.400979255250612, + "grad_norm": 0.6124146580696106, + "learning_rate": 0.00037474020231868045, + "loss": 6.4457, + "step": 1167 + }, + { + "epoch": 0.40132285358416014, + "grad_norm": 0.5023613572120667, + "learning_rate": 0.00037448022514947573, + "loss": 6.3672, + "step": 1168 + }, + { + "epoch": 0.4016664519177082, + "grad_norm": 0.6657715439796448, + "learning_rate": 0.0003742200688664653, + "loss": 6.3568, + "step": 1169 + }, + { + "epoch": 0.4020100502512563, + "grad_norm": 0.44071656465530396, + "learning_rate": 0.00037395973384398627, + "loss": 6.5611, + "step": 1170 + }, + { + "epoch": 0.4023536485848044, + "grad_norm": 0.5543152689933777, + "learning_rate": 0.00037369922045663327, + "loss": 6.3697, + "step": 1171 + }, + { + "epoch": 0.40269724691835246, + "grad_norm": 0.473361074924469, + "learning_rate": 0.0003734385290792573, + "loss": 6.4666, + "step": 1172 + }, + { + "epoch": 0.40304084525190054, + "grad_norm": 0.4800427556037903, + "learning_rate": 0.00037317766008696543, + "loss": 6.5445, + "step": 1173 + }, + { + "epoch": 0.4033844435854486, + "grad_norm": 0.6011155247688293, + "learning_rate": 0.0003729166138551204, + "loss": 6.4005, + "step": 1174 + }, + { + "epoch": 0.4037280419189967, + "grad_norm": 0.47439199686050415, + "learning_rate": 0.00037265539075934014, + "loss": 6.4418, + "step": 1175 + }, + { + "epoch": 0.4040716402525448, + "grad_norm": 0.7657064199447632, + "learning_rate": 0.00037239399117549676, + "loss": 6.4249, + "step": 1176 + }, + { + "epoch": 0.40441523858609285, + "grad_norm": 0.49792230129241943, + "learning_rate": 0.00037213241547971645, + "loss": 6.3721, + "step": 1177 + }, + { + "epoch": 0.40475883691964093, + "grad_norm": 0.7585155963897705, + "learning_rate": 0.0003718706640483789, + "loss": 6.4348, + "step": 1178 + }, + { + "epoch": 0.405102435253189, + "grad_norm": 0.4420984089374542, + "learning_rate": 0.0003716087372581165, + "loss": 6.3232, + "step": 1179 + }, + { + "epoch": 0.4054460335867371, + "grad_norm": 0.8976798057556152, + "learning_rate": 0.0003713466354858141, + "loss": 6.2834, + "step": 1180 + }, + { + "epoch": 0.40578963192028517, + "grad_norm": 0.5315551161766052, + "learning_rate": 0.0003710843591086083, + "loss": 6.3475, + "step": 1181 + }, + { + "epoch": 0.40613323025383324, + "grad_norm": 0.8557246923446655, + "learning_rate": 0.0003708219085038869, + "loss": 6.2522, + "step": 1182 + }, + { + "epoch": 0.4064768285873814, + "grad_norm": 0.7426571846008301, + "learning_rate": 0.0003705592840492883, + "loss": 6.416, + "step": 1183 + }, + { + "epoch": 0.40682042692092946, + "grad_norm": 0.5289968848228455, + "learning_rate": 0.0003702964861227013, + "loss": 6.3237, + "step": 1184 + }, + { + "epoch": 0.40716402525447754, + "grad_norm": 0.7140244841575623, + "learning_rate": 0.00037003351510226415, + "loss": 6.4297, + "step": 1185 + }, + { + "epoch": 0.4075076235880256, + "grad_norm": 0.5616683959960938, + "learning_rate": 0.00036977037136636404, + "loss": 6.3243, + "step": 1186 + }, + { + "epoch": 0.4078512219215737, + "grad_norm": 0.6869580745697021, + "learning_rate": 0.000369507055293637, + "loss": 6.3222, + "step": 1187 + }, + { + "epoch": 0.40819482025512177, + "grad_norm": 0.8005693554878235, + "learning_rate": 0.00036924356726296674, + "loss": 6.4359, + "step": 1188 + }, + { + "epoch": 0.40853841858866985, + "grad_norm": 0.6318978667259216, + "learning_rate": 0.00036897990765348467, + "loss": 6.4053, + "step": 1189 + }, + { + "epoch": 0.40888201692221793, + "grad_norm": 0.6056877374649048, + "learning_rate": 0.0003687160768445688, + "loss": 6.4243, + "step": 1190 + }, + { + "epoch": 0.409225615255766, + "grad_norm": 0.7091792225837708, + "learning_rate": 0.00036845207521584355, + "loss": 6.3087, + "step": 1191 + }, + { + "epoch": 0.4095692135893141, + "grad_norm": 0.7672974467277527, + "learning_rate": 0.00036818790314717935, + "loss": 6.409, + "step": 1192 + }, + { + "epoch": 0.40991281192286216, + "grad_norm": 0.9131182432174683, + "learning_rate": 0.00036792356101869155, + "loss": 6.4735, + "step": 1193 + }, + { + "epoch": 0.41025641025641024, + "grad_norm": 0.5129940509796143, + "learning_rate": 0.00036765904921074046, + "loss": 6.5078, + "step": 1194 + }, + { + "epoch": 0.4106000085899583, + "grad_norm": 1.1640437841415405, + "learning_rate": 0.0003673943681039305, + "loss": 6.3134, + "step": 1195 + }, + { + "epoch": 0.4109436069235064, + "grad_norm": 0.6941258907318115, + "learning_rate": 0.0003671295180791094, + "loss": 6.4081, + "step": 1196 + }, + { + "epoch": 0.4112872052570545, + "grad_norm": 0.8544447422027588, + "learning_rate": 0.0003668644995173684, + "loss": 6.3699, + "step": 1197 + }, + { + "epoch": 0.4116308035906026, + "grad_norm": 0.8141912221908569, + "learning_rate": 0.000366599312800041, + "loss": 6.537, + "step": 1198 + }, + { + "epoch": 0.4119744019241507, + "grad_norm": 0.9887378811836243, + "learning_rate": 0.0003663339583087025, + "loss": 6.6481, + "step": 1199 + }, + { + "epoch": 0.41231800025769877, + "grad_norm": 1.1278897523880005, + "learning_rate": 0.0003660684364251701, + "loss": 6.3531, + "step": 1200 + }, + { + "epoch": 0.41266159859124685, + "grad_norm": 0.8282272219657898, + "learning_rate": 0.00036580274753150125, + "loss": 6.3874, + "step": 1201 + }, + { + "epoch": 0.4130051969247949, + "grad_norm": 0.870682954788208, + "learning_rate": 0.00036553689200999426, + "loss": 6.3558, + "step": 1202 + }, + { + "epoch": 0.413348795258343, + "grad_norm": 0.9290775656700134, + "learning_rate": 0.00036527087024318676, + "loss": 6.3082, + "step": 1203 + }, + { + "epoch": 0.4136923935918911, + "grad_norm": 0.6926900148391724, + "learning_rate": 0.0003650046826138559, + "loss": 6.4447, + "step": 1204 + }, + { + "epoch": 0.41403599192543916, + "grad_norm": 0.69936203956604, + "learning_rate": 0.0003647383295050173, + "loss": 6.3996, + "step": 1205 + }, + { + "epoch": 0.41437959025898724, + "grad_norm": 0.7394657731056213, + "learning_rate": 0.0003644718112999249, + "loss": 6.4317, + "step": 1206 + }, + { + "epoch": 0.4147231885925353, + "grad_norm": 0.708120584487915, + "learning_rate": 0.0003642051283820699, + "loss": 6.3382, + "step": 1207 + }, + { + "epoch": 0.4150667869260834, + "grad_norm": 0.6249948740005493, + "learning_rate": 0.00036393828113518063, + "loss": 6.4358, + "step": 1208 + }, + { + "epoch": 0.4154103852596315, + "grad_norm": 0.6756424903869629, + "learning_rate": 0.00036367126994322195, + "loss": 6.3744, + "step": 1209 + }, + { + "epoch": 0.41575398359317955, + "grad_norm": 0.6610203385353088, + "learning_rate": 0.00036340409519039463, + "loss": 6.1553, + "step": 1210 + }, + { + "epoch": 0.41609758192672763, + "grad_norm": 0.7560092210769653, + "learning_rate": 0.0003631367572611348, + "loss": 6.2694, + "step": 1211 + }, + { + "epoch": 0.4164411802602757, + "grad_norm": 0.5730945467948914, + "learning_rate": 0.00036286925654011303, + "loss": 6.397, + "step": 1212 + }, + { + "epoch": 0.41678477859382385, + "grad_norm": 0.6729722619056702, + "learning_rate": 0.0003626015934122346, + "loss": 6.5225, + "step": 1213 + }, + { + "epoch": 0.4171283769273719, + "grad_norm": 0.5002995729446411, + "learning_rate": 0.0003623337682626383, + "loss": 6.4229, + "step": 1214 + }, + { + "epoch": 0.41747197526092, + "grad_norm": 0.5038610100746155, + "learning_rate": 0.000362065781476696, + "loss": 6.3297, + "step": 1215 + }, + { + "epoch": 0.4178155735944681, + "grad_norm": 0.5231453776359558, + "learning_rate": 0.00036179763344001216, + "loss": 6.3065, + "step": 1216 + }, + { + "epoch": 0.41815917192801616, + "grad_norm": 0.541223406791687, + "learning_rate": 0.00036152932453842324, + "loss": 6.2675, + "step": 1217 + }, + { + "epoch": 0.41850277026156424, + "grad_norm": 0.48831766843795776, + "learning_rate": 0.00036126085515799744, + "loss": 6.4614, + "step": 1218 + }, + { + "epoch": 0.4188463685951123, + "grad_norm": 0.480375736951828, + "learning_rate": 0.00036099222568503335, + "loss": 6.3863, + "step": 1219 + }, + { + "epoch": 0.4191899669286604, + "grad_norm": 0.4671798050403595, + "learning_rate": 0.00036072343650606043, + "loss": 6.3354, + "step": 1220 + }, + { + "epoch": 0.4195335652622085, + "grad_norm": 0.7875792384147644, + "learning_rate": 0.00036045448800783766, + "loss": 6.351, + "step": 1221 + }, + { + "epoch": 0.41987716359575655, + "grad_norm": 0.5600533485412598, + "learning_rate": 0.0003601853805773533, + "loss": 6.3025, + "step": 1222 + }, + { + "epoch": 0.42022076192930463, + "grad_norm": 0.7269924879074097, + "learning_rate": 0.0003599161146018243, + "loss": 6.3016, + "step": 1223 + }, + { + "epoch": 0.4205643602628527, + "grad_norm": 0.4470710754394531, + "learning_rate": 0.00035964669046869587, + "loss": 6.4315, + "step": 1224 + }, + { + "epoch": 0.4209079585964008, + "grad_norm": 0.6437981724739075, + "learning_rate": 0.00035937710856564055, + "loss": 6.4414, + "step": 1225 + }, + { + "epoch": 0.42125155692994887, + "grad_norm": 0.5562970042228699, + "learning_rate": 0.0003591073692805581, + "loss": 6.3088, + "step": 1226 + }, + { + "epoch": 0.42159515526349695, + "grad_norm": 0.37384918332099915, + "learning_rate": 0.00035883747300157463, + "loss": 6.4137, + "step": 1227 + }, + { + "epoch": 0.4219387535970451, + "grad_norm": 0.6116818785667419, + "learning_rate": 0.00035856742011704224, + "loss": 6.4572, + "step": 1228 + }, + { + "epoch": 0.42228235193059316, + "grad_norm": 0.5554793477058411, + "learning_rate": 0.0003582972110155383, + "loss": 6.414, + "step": 1229 + }, + { + "epoch": 0.42262595026414124, + "grad_norm": 0.4585476815700531, + "learning_rate": 0.0003580268460858649, + "loss": 6.3572, + "step": 1230 + }, + { + "epoch": 0.4229695485976893, + "grad_norm": 0.4323978126049042, + "learning_rate": 0.00035775632571704853, + "loss": 6.3846, + "step": 1231 + }, + { + "epoch": 0.4233131469312374, + "grad_norm": 0.725913405418396, + "learning_rate": 0.0003574856502983392, + "loss": 6.4419, + "step": 1232 + }, + { + "epoch": 0.4236567452647855, + "grad_norm": 0.5197131037712097, + "learning_rate": 0.00035721482021920995, + "loss": 6.3584, + "step": 1233 + }, + { + "epoch": 0.42400034359833355, + "grad_norm": 0.5760481357574463, + "learning_rate": 0.00035694383586935656, + "loss": 6.33, + "step": 1234 + }, + { + "epoch": 0.42434394193188163, + "grad_norm": 0.6064772009849548, + "learning_rate": 0.0003566726976386967, + "loss": 6.4748, + "step": 1235 + }, + { + "epoch": 0.4246875402654297, + "grad_norm": 0.5518282055854797, + "learning_rate": 0.0003564014059173694, + "loss": 6.4234, + "step": 1236 + }, + { + "epoch": 0.4250311385989778, + "grad_norm": 0.637048065662384, + "learning_rate": 0.0003561299610957346, + "loss": 6.3062, + "step": 1237 + }, + { + "epoch": 0.42537473693252587, + "grad_norm": 0.6192359328269958, + "learning_rate": 0.00035585836356437264, + "loss": 6.4022, + "step": 1238 + }, + { + "epoch": 0.42571833526607394, + "grad_norm": 0.46911513805389404, + "learning_rate": 0.00035558661371408326, + "loss": 6.4925, + "step": 1239 + }, + { + "epoch": 0.426061933599622, + "grad_norm": 0.5996941328048706, + "learning_rate": 0.00035531471193588575, + "loss": 6.4118, + "step": 1240 + }, + { + "epoch": 0.4264055319331701, + "grad_norm": 0.5280793905258179, + "learning_rate": 0.0003550426586210178, + "loss": 6.3713, + "step": 1241 + }, + { + "epoch": 0.4267491302667182, + "grad_norm": 0.5926524996757507, + "learning_rate": 0.0003547704541609353, + "loss": 6.2895, + "step": 1242 + }, + { + "epoch": 0.4270927286002663, + "grad_norm": 0.7197567224502563, + "learning_rate": 0.00035449809894731136, + "loss": 6.6008, + "step": 1243 + }, + { + "epoch": 0.4274363269338144, + "grad_norm": 0.6966863870620728, + "learning_rate": 0.0003542255933720363, + "loss": 6.3742, + "step": 1244 + }, + { + "epoch": 0.42777992526736247, + "grad_norm": 0.8108692765235901, + "learning_rate": 0.0003539529378272166, + "loss": 6.4517, + "step": 1245 + }, + { + "epoch": 0.42812352360091055, + "grad_norm": 0.7402332425117493, + "learning_rate": 0.0003536801327051746, + "loss": 6.3478, + "step": 1246 + }, + { + "epoch": 0.42846712193445863, + "grad_norm": 0.7129719853401184, + "learning_rate": 0.0003534071783984479, + "loss": 6.3975, + "step": 1247 + }, + { + "epoch": 0.4288107202680067, + "grad_norm": 0.7612306475639343, + "learning_rate": 0.0003531340752997886, + "loss": 6.3858, + "step": 1248 + }, + { + "epoch": 0.4291543186015548, + "grad_norm": 0.8239126801490784, + "learning_rate": 0.00035286082380216313, + "loss": 6.4085, + "step": 1249 + }, + { + "epoch": 0.42949791693510286, + "grad_norm": 1.0187656879425049, + "learning_rate": 0.00035258742429875137, + "loss": 6.4928, + "step": 1250 + }, + { + "epoch": 0.42984151526865094, + "grad_norm": 0.8988327980041504, + "learning_rate": 0.00035231387718294595, + "loss": 6.2908, + "step": 1251 + }, + { + "epoch": 0.430185113602199, + "grad_norm": 0.941773533821106, + "learning_rate": 0.00035204018284835226, + "loss": 6.3999, + "step": 1252 + }, + { + "epoch": 0.4305287119357471, + "grad_norm": 0.879536509513855, + "learning_rate": 0.00035176634168878723, + "loss": 6.3897, + "step": 1253 + }, + { + "epoch": 0.4308723102692952, + "grad_norm": 0.658165454864502, + "learning_rate": 0.0003514923540982793, + "loss": 6.2647, + "step": 1254 + }, + { + "epoch": 0.43121590860284326, + "grad_norm": 0.9977064728736877, + "learning_rate": 0.0003512182204710673, + "loss": 6.2759, + "step": 1255 + }, + { + "epoch": 0.43155950693639133, + "grad_norm": 0.8180080652236938, + "learning_rate": 0.0003509439412016004, + "loss": 6.3759, + "step": 1256 + }, + { + "epoch": 0.4319031052699394, + "grad_norm": 0.8202301859855652, + "learning_rate": 0.00035066951668453745, + "loss": 6.4117, + "step": 1257 + }, + { + "epoch": 0.43224670360348755, + "grad_norm": 0.6717396378517151, + "learning_rate": 0.000350394947314746, + "loss": 6.2032, + "step": 1258 + }, + { + "epoch": 0.4325903019370356, + "grad_norm": 0.7441490888595581, + "learning_rate": 0.0003501202334873021, + "loss": 6.3116, + "step": 1259 + }, + { + "epoch": 0.4329339002705837, + "grad_norm": 0.7404760122299194, + "learning_rate": 0.00034984537559749, + "loss": 6.3653, + "step": 1260 + }, + { + "epoch": 0.4332774986041318, + "grad_norm": 0.5555077791213989, + "learning_rate": 0.0003495703740408008, + "loss": 6.3558, + "step": 1261 + }, + { + "epoch": 0.43362109693767986, + "grad_norm": 0.8754700422286987, + "learning_rate": 0.00034929522921293244, + "loss": 6.3524, + "step": 1262 + }, + { + "epoch": 0.43396469527122794, + "grad_norm": 0.5776944756507874, + "learning_rate": 0.00034901994150978924, + "loss": 6.2332, + "step": 1263 + }, + { + "epoch": 0.434308293604776, + "grad_norm": 0.5266901850700378, + "learning_rate": 0.00034874451132748074, + "loss": 6.3565, + "step": 1264 + }, + { + "epoch": 0.4346518919383241, + "grad_norm": 0.5814431309700012, + "learning_rate": 0.0003484689390623218, + "loss": 6.4379, + "step": 1265 + }, + { + "epoch": 0.4349954902718722, + "grad_norm": 0.6413130164146423, + "learning_rate": 0.0003481932251108316, + "loss": 6.3527, + "step": 1266 + }, + { + "epoch": 0.43533908860542025, + "grad_norm": 0.5977810025215149, + "learning_rate": 0.0003479173698697331, + "loss": 6.2953, + "step": 1267 + }, + { + "epoch": 0.43568268693896833, + "grad_norm": 0.5503946542739868, + "learning_rate": 0.0003476413737359527, + "loss": 6.362, + "step": 1268 + }, + { + "epoch": 0.4360262852725164, + "grad_norm": 0.6731773614883423, + "learning_rate": 0.00034736523710661964, + "loss": 6.2602, + "step": 1269 + }, + { + "epoch": 0.4363698836060645, + "grad_norm": 0.5576619505882263, + "learning_rate": 0.000347088960379065, + "loss": 6.3608, + "step": 1270 + }, + { + "epoch": 0.43671348193961257, + "grad_norm": 0.580730676651001, + "learning_rate": 0.00034681254395082156, + "loss": 6.3143, + "step": 1271 + }, + { + "epoch": 0.43705708027316065, + "grad_norm": 0.5770998001098633, + "learning_rate": 0.0003465359882196233, + "loss": 6.364, + "step": 1272 + }, + { + "epoch": 0.4374006786067088, + "grad_norm": 0.6039701700210571, + "learning_rate": 0.0003462592935834044, + "loss": 6.2892, + "step": 1273 + }, + { + "epoch": 0.43774427694025686, + "grad_norm": 0.6770427823066711, + "learning_rate": 0.00034598246044029906, + "loss": 6.2845, + "step": 1274 + }, + { + "epoch": 0.43808787527380494, + "grad_norm": 0.4633253216743469, + "learning_rate": 0.00034570548918864074, + "loss": 6.3737, + "step": 1275 + }, + { + "epoch": 0.438431473607353, + "grad_norm": 0.6166013479232788, + "learning_rate": 0.0003454283802269617, + "loss": 6.3048, + "step": 1276 + }, + { + "epoch": 0.4387750719409011, + "grad_norm": 0.6878900527954102, + "learning_rate": 0.0003451511339539921, + "loss": 6.1519, + "step": 1277 + }, + { + "epoch": 0.4391186702744492, + "grad_norm": 0.5486677289009094, + "learning_rate": 0.0003448737507686599, + "loss": 6.2707, + "step": 1278 + }, + { + "epoch": 0.43946226860799725, + "grad_norm": 0.6651487946510315, + "learning_rate": 0.00034459623107009006, + "loss": 6.443, + "step": 1279 + }, + { + "epoch": 0.43980586694154533, + "grad_norm": 0.6062619090080261, + "learning_rate": 0.00034431857525760385, + "loss": 6.2852, + "step": 1280 + }, + { + "epoch": 0.4401494652750934, + "grad_norm": 0.5092039704322815, + "learning_rate": 0.00034404078373071845, + "loss": 6.4106, + "step": 1281 + }, + { + "epoch": 0.4404930636086415, + "grad_norm": 0.4990077316761017, + "learning_rate": 0.00034376285688914645, + "loss": 6.4131, + "step": 1282 + }, + { + "epoch": 0.44083666194218957, + "grad_norm": 0.6572175621986389, + "learning_rate": 0.00034348479513279486, + "loss": 6.2638, + "step": 1283 + }, + { + "epoch": 0.44118026027573765, + "grad_norm": 0.5944482088088989, + "learning_rate": 0.000343206598861765, + "loss": 6.4179, + "step": 1284 + }, + { + "epoch": 0.4415238586092857, + "grad_norm": 0.6007838249206543, + "learning_rate": 0.0003429282684763519, + "loss": 6.268, + "step": 1285 + }, + { + "epoch": 0.4418674569428338, + "grad_norm": 0.528721034526825, + "learning_rate": 0.0003426498043770432, + "loss": 6.4174, + "step": 1286 + }, + { + "epoch": 0.44221105527638194, + "grad_norm": 0.515328586101532, + "learning_rate": 0.00034237120696451904, + "loss": 6.3786, + "step": 1287 + }, + { + "epoch": 0.44255465360993, + "grad_norm": 0.7291128635406494, + "learning_rate": 0.0003420924766396517, + "loss": 6.3848, + "step": 1288 + }, + { + "epoch": 0.4428982519434781, + "grad_norm": 0.5647246837615967, + "learning_rate": 0.0003418136138035044, + "loss": 6.3112, + "step": 1289 + }, + { + "epoch": 0.4432418502770262, + "grad_norm": 0.6985518336296082, + "learning_rate": 0.000341534618857331, + "loss": 6.3603, + "step": 1290 + }, + { + "epoch": 0.44358544861057425, + "grad_norm": 0.6250963807106018, + "learning_rate": 0.0003412554922025756, + "loss": 6.4102, + "step": 1291 + }, + { + "epoch": 0.44392904694412233, + "grad_norm": 0.8003494739532471, + "learning_rate": 0.0003409762342408719, + "loss": 6.279, + "step": 1292 + }, + { + "epoch": 0.4442726452776704, + "grad_norm": 0.6796652674674988, + "learning_rate": 0.0003406968453740423, + "loss": 6.4095, + "step": 1293 + }, + { + "epoch": 0.4446162436112185, + "grad_norm": 0.7780187726020813, + "learning_rate": 0.0003404173260040976, + "loss": 6.4159, + "step": 1294 + }, + { + "epoch": 0.44495984194476657, + "grad_norm": 0.9151715040206909, + "learning_rate": 0.0003401376765332366, + "loss": 6.3968, + "step": 1295 + }, + { + "epoch": 0.44530344027831464, + "grad_norm": 0.8557326197624207, + "learning_rate": 0.000339857897363845, + "loss": 6.3813, + "step": 1296 + }, + { + "epoch": 0.4456470386118627, + "grad_norm": 0.8565590977668762, + "learning_rate": 0.0003395779888984954, + "loss": 6.3745, + "step": 1297 + }, + { + "epoch": 0.4459906369454108, + "grad_norm": 0.7628725171089172, + "learning_rate": 0.00033929795153994624, + "loss": 6.4808, + "step": 1298 + }, + { + "epoch": 0.4463342352789589, + "grad_norm": 0.8995116353034973, + "learning_rate": 0.00033901778569114154, + "loss": 6.4562, + "step": 1299 + }, + { + "epoch": 0.44667783361250696, + "grad_norm": 1.1376203298568726, + "learning_rate": 0.0003387374917552101, + "loss": 6.4389, + "step": 1300 + }, + { + "epoch": 0.44702143194605504, + "grad_norm": 0.8454490303993225, + "learning_rate": 0.0003384570701354652, + "loss": 6.2257, + "step": 1301 + }, + { + "epoch": 0.44736503027960317, + "grad_norm": 0.9702429175376892, + "learning_rate": 0.0003381765212354036, + "loss": 6.2489, + "step": 1302 + }, + { + "epoch": 0.44770862861315125, + "grad_norm": 0.8661314845085144, + "learning_rate": 0.0003378958454587054, + "loss": 6.2794, + "step": 1303 + }, + { + "epoch": 0.44805222694669933, + "grad_norm": 0.6850778460502625, + "learning_rate": 0.00033761504320923316, + "loss": 6.2974, + "step": 1304 + }, + { + "epoch": 0.4483958252802474, + "grad_norm": 0.8128910660743713, + "learning_rate": 0.0003373341148910315, + "loss": 6.3021, + "step": 1305 + }, + { + "epoch": 0.4487394236137955, + "grad_norm": 0.6577335596084595, + "learning_rate": 0.00033705306090832626, + "loss": 6.3377, + "step": 1306 + }, + { + "epoch": 0.44908302194734356, + "grad_norm": 0.7702295184135437, + "learning_rate": 0.0003367718816655244, + "loss": 6.2775, + "step": 1307 + }, + { + "epoch": 0.44942662028089164, + "grad_norm": 0.7849611043930054, + "learning_rate": 0.0003364905775672129, + "loss": 6.1687, + "step": 1308 + }, + { + "epoch": 0.4497702186144397, + "grad_norm": 0.5363289713859558, + "learning_rate": 0.00033620914901815835, + "loss": 6.3946, + "step": 1309 + }, + { + "epoch": 0.4501138169479878, + "grad_norm": 0.8963726162910461, + "learning_rate": 0.0003359275964233066, + "loss": 6.3973, + "step": 1310 + }, + { + "epoch": 0.4504574152815359, + "grad_norm": 0.6057178974151611, + "learning_rate": 0.0003356459201877819, + "loss": 6.1898, + "step": 1311 + }, + { + "epoch": 0.45080101361508396, + "grad_norm": 0.7307539582252502, + "learning_rate": 0.00033536412071688635, + "loss": 6.2201, + "step": 1312 + }, + { + "epoch": 0.45114461194863203, + "grad_norm": 0.7102933526039124, + "learning_rate": 0.0003350821984160994, + "loss": 6.3939, + "step": 1313 + }, + { + "epoch": 0.4514882102821801, + "grad_norm": 0.41668689250946045, + "learning_rate": 0.00033480015369107734, + "loss": 6.3618, + "step": 1314 + }, + { + "epoch": 0.4518318086157282, + "grad_norm": 0.5837572813034058, + "learning_rate": 0.00033451798694765256, + "loss": 6.2348, + "step": 1315 + }, + { + "epoch": 0.45217540694927627, + "grad_norm": 0.5444297790527344, + "learning_rate": 0.00033423569859183277, + "loss": 6.2863, + "step": 1316 + }, + { + "epoch": 0.4525190052828244, + "grad_norm": 0.5512393116950989, + "learning_rate": 0.00033395328902980113, + "loss": 6.227, + "step": 1317 + }, + { + "epoch": 0.4528626036163725, + "grad_norm": 0.6816386580467224, + "learning_rate": 0.00033367075866791484, + "loss": 6.2635, + "step": 1318 + }, + { + "epoch": 0.45320620194992056, + "grad_norm": 0.5417824387550354, + "learning_rate": 0.0003333881079127052, + "loss": 6.2983, + "step": 1319 + }, + { + "epoch": 0.45354980028346864, + "grad_norm": 0.5983556509017944, + "learning_rate": 0.00033310533717087633, + "loss": 6.2582, + "step": 1320 + }, + { + "epoch": 0.4538933986170167, + "grad_norm": 0.5117073059082031, + "learning_rate": 0.00033282244684930553, + "loss": 6.356, + "step": 1321 + }, + { + "epoch": 0.4542369969505648, + "grad_norm": 0.5673801302909851, + "learning_rate": 0.0003325394373550416, + "loss": 6.2926, + "step": 1322 + }, + { + "epoch": 0.4545805952841129, + "grad_norm": 0.6815798282623291, + "learning_rate": 0.00033225630909530535, + "loss": 6.3458, + "step": 1323 + }, + { + "epoch": 0.45492419361766095, + "grad_norm": 0.6034033894538879, + "learning_rate": 0.0003319730624774881, + "loss": 6.3044, + "step": 1324 + }, + { + "epoch": 0.45526779195120903, + "grad_norm": 0.6391438245773315, + "learning_rate": 0.0003316896979091517, + "loss": 6.312, + "step": 1325 + }, + { + "epoch": 0.4556113902847571, + "grad_norm": 0.5472264289855957, + "learning_rate": 0.0003314062157980275, + "loss": 6.279, + "step": 1326 + }, + { + "epoch": 0.4559549886183052, + "grad_norm": 0.5441725850105286, + "learning_rate": 0.0003311226165520163, + "loss": 6.2368, + "step": 1327 + }, + { + "epoch": 0.45629858695185327, + "grad_norm": 0.5921383500099182, + "learning_rate": 0.00033083890057918714, + "loss": 6.2127, + "step": 1328 + }, + { + "epoch": 0.45664218528540135, + "grad_norm": 0.5410550832748413, + "learning_rate": 0.0003305550682877771, + "loss": 6.3174, + "step": 1329 + }, + { + "epoch": 0.4569857836189494, + "grad_norm": 0.7097800374031067, + "learning_rate": 0.0003302711200861907, + "loss": 6.2349, + "step": 1330 + }, + { + "epoch": 0.4573293819524975, + "grad_norm": 0.6367291808128357, + "learning_rate": 0.00032998705638299925, + "loss": 6.2174, + "step": 1331 + }, + { + "epoch": 0.45767298028604564, + "grad_norm": 0.5397276878356934, + "learning_rate": 0.0003297028775869401, + "loss": 6.412, + "step": 1332 + }, + { + "epoch": 0.4580165786195937, + "grad_norm": 0.7171114683151245, + "learning_rate": 0.0003294185841069165, + "loss": 6.3053, + "step": 1333 + }, + { + "epoch": 0.4583601769531418, + "grad_norm": 0.5611921548843384, + "learning_rate": 0.0003291341763519963, + "loss": 6.2494, + "step": 1334 + }, + { + "epoch": 0.4587037752866899, + "grad_norm": 0.5535762906074524, + "learning_rate": 0.0003288496547314122, + "loss": 6.3483, + "step": 1335 + }, + { + "epoch": 0.45904737362023795, + "grad_norm": 0.5840194225311279, + "learning_rate": 0.00032856501965456043, + "loss": 6.4088, + "step": 1336 + }, + { + "epoch": 0.45939097195378603, + "grad_norm": 0.5516507029533386, + "learning_rate": 0.00032828027153100067, + "loss": 6.362, + "step": 1337 + }, + { + "epoch": 0.4597345702873341, + "grad_norm": 0.5980287194252014, + "learning_rate": 0.0003279954107704551, + "loss": 6.2421, + "step": 1338 + }, + { + "epoch": 0.4600781686208822, + "grad_norm": 0.6924148201942444, + "learning_rate": 0.00032771043778280826, + "loss": 6.1814, + "step": 1339 + }, + { + "epoch": 0.46042176695443027, + "grad_norm": 0.5325207114219666, + "learning_rate": 0.00032742535297810573, + "loss": 6.3799, + "step": 1340 + }, + { + "epoch": 0.46076536528797835, + "grad_norm": 0.7110170722007751, + "learning_rate": 0.0003271401567665544, + "loss": 6.4423, + "step": 1341 + }, + { + "epoch": 0.4611089636215264, + "grad_norm": 0.7545775771141052, + "learning_rate": 0.0003268548495585212, + "loss": 6.3796, + "step": 1342 + }, + { + "epoch": 0.4614525619550745, + "grad_norm": 0.6151261329650879, + "learning_rate": 0.0003265694317645328, + "loss": 6.3873, + "step": 1343 + }, + { + "epoch": 0.4617961602886226, + "grad_norm": 0.5837677121162415, + "learning_rate": 0.00032628390379527524, + "loss": 6.5048, + "step": 1344 + }, + { + "epoch": 0.46213975862217066, + "grad_norm": 0.6211174130439758, + "learning_rate": 0.0003259982660615927, + "loss": 6.6132, + "step": 1345 + }, + { + "epoch": 0.46248335695571874, + "grad_norm": 0.6927919387817383, + "learning_rate": 0.00032571251897448765, + "loss": 6.3105, + "step": 1346 + }, + { + "epoch": 0.4628269552892669, + "grad_norm": 0.796061098575592, + "learning_rate": 0.0003254266629451198, + "loss": 6.4112, + "step": 1347 + }, + { + "epoch": 0.46317055362281495, + "grad_norm": 0.8083237409591675, + "learning_rate": 0.00032514069838480536, + "loss": 6.3347, + "step": 1348 + }, + { + "epoch": 0.46351415195636303, + "grad_norm": 0.7419441938400269, + "learning_rate": 0.0003248546257050171, + "loss": 6.3444, + "step": 1349 + }, + { + "epoch": 0.4638577502899111, + "grad_norm": 0.9289708733558655, + "learning_rate": 0.00032456844531738313, + "loss": 6.4263, + "step": 1350 + }, + { + "epoch": 0.4642013486234592, + "grad_norm": 0.8992305397987366, + "learning_rate": 0.00032428215763368655, + "loss": 6.366, + "step": 1351 + }, + { + "epoch": 0.46454494695700727, + "grad_norm": 0.6328170895576477, + "learning_rate": 0.00032399576306586493, + "loss": 6.2029, + "step": 1352 + }, + { + "epoch": 0.46488854529055534, + "grad_norm": 0.48920828104019165, + "learning_rate": 0.0003237092620260096, + "loss": 6.2901, + "step": 1353 + }, + { + "epoch": 0.4652321436241034, + "grad_norm": 0.6819382309913635, + "learning_rate": 0.0003234226549263651, + "loss": 6.1887, + "step": 1354 + }, + { + "epoch": 0.4655757419576515, + "grad_norm": 0.6389537453651428, + "learning_rate": 0.0003231359421793286, + "loss": 6.2041, + "step": 1355 + }, + { + "epoch": 0.4659193402911996, + "grad_norm": 0.6682074666023254, + "learning_rate": 0.00032284912419744904, + "loss": 6.1585, + "step": 1356 + }, + { + "epoch": 0.46626293862474766, + "grad_norm": 0.5480344891548157, + "learning_rate": 0.0003225622013934273, + "loss": 6.2923, + "step": 1357 + }, + { + "epoch": 0.46660653695829574, + "grad_norm": 0.6636848449707031, + "learning_rate": 0.00032227517418011457, + "loss": 6.1032, + "step": 1358 + }, + { + "epoch": 0.4669501352918438, + "grad_norm": 0.7700697779655457, + "learning_rate": 0.00032198804297051256, + "loss": 6.2676, + "step": 1359 + }, + { + "epoch": 0.4672937336253919, + "grad_norm": 0.776321291923523, + "learning_rate": 0.0003217008081777726, + "loss": 6.1532, + "step": 1360 + }, + { + "epoch": 0.46763733195894, + "grad_norm": 0.5545940399169922, + "learning_rate": 0.00032141347021519485, + "loss": 6.2717, + "step": 1361 + }, + { + "epoch": 0.4679809302924881, + "grad_norm": 0.5399784445762634, + "learning_rate": 0.0003211260294962282, + "loss": 6.3534, + "step": 1362 + }, + { + "epoch": 0.4683245286260362, + "grad_norm": 0.6080695390701294, + "learning_rate": 0.00032083848643446936, + "loss": 6.338, + "step": 1363 + }, + { + "epoch": 0.46866812695958426, + "grad_norm": 0.6340306997299194, + "learning_rate": 0.00032055084144366194, + "loss": 6.4141, + "step": 1364 + }, + { + "epoch": 0.46901172529313234, + "grad_norm": 0.5792441368103027, + "learning_rate": 0.0003202630949376968, + "loss": 6.2975, + "step": 1365 + }, + { + "epoch": 0.4693553236266804, + "grad_norm": 0.7110378742218018, + "learning_rate": 0.00031997524733061027, + "loss": 6.3343, + "step": 1366 + }, + { + "epoch": 0.4696989219602285, + "grad_norm": 0.5993596911430359, + "learning_rate": 0.0003196872990365847, + "loss": 6.2514, + "step": 1367 + }, + { + "epoch": 0.4700425202937766, + "grad_norm": 0.5569903254508972, + "learning_rate": 0.00031939925046994686, + "loss": 6.3676, + "step": 1368 + }, + { + "epoch": 0.47038611862732466, + "grad_norm": 0.5726590156555176, + "learning_rate": 0.0003191111020451682, + "loss": 6.2824, + "step": 1369 + }, + { + "epoch": 0.47072971696087273, + "grad_norm": 0.5852641463279724, + "learning_rate": 0.00031882285417686354, + "loss": 6.1269, + "step": 1370 + }, + { + "epoch": 0.4710733152944208, + "grad_norm": 0.5540098547935486, + "learning_rate": 0.0003185345072797909, + "loss": 6.2548, + "step": 1371 + }, + { + "epoch": 0.4714169136279689, + "grad_norm": 0.6843763589859009, + "learning_rate": 0.0003182460617688508, + "loss": 6.2628, + "step": 1372 + }, + { + "epoch": 0.47176051196151697, + "grad_norm": 0.6013829708099365, + "learning_rate": 0.00031795751805908576, + "loss": 6.1731, + "step": 1373 + }, + { + "epoch": 0.47210411029506505, + "grad_norm": 0.667677104473114, + "learning_rate": 0.0003176688765656793, + "loss": 6.2667, + "step": 1374 + }, + { + "epoch": 0.47244770862861313, + "grad_norm": 0.4927704334259033, + "learning_rate": 0.000317380137703956, + "loss": 6.1375, + "step": 1375 + }, + { + "epoch": 0.4727913069621612, + "grad_norm": 0.4976654052734375, + "learning_rate": 0.0003170913018893804, + "loss": 6.3665, + "step": 1376 + }, + { + "epoch": 0.47313490529570934, + "grad_norm": 0.5618501901626587, + "learning_rate": 0.0003168023695375563, + "loss": 6.2159, + "step": 1377 + }, + { + "epoch": 0.4734785036292574, + "grad_norm": 0.48412713408470154, + "learning_rate": 0.0003165133410642268, + "loss": 6.3496, + "step": 1378 + }, + { + "epoch": 0.4738221019628055, + "grad_norm": 0.44460970163345337, + "learning_rate": 0.0003162242168852732, + "loss": 6.3743, + "step": 1379 + }, + { + "epoch": 0.4741657002963536, + "grad_norm": 0.4922606647014618, + "learning_rate": 0.0003159349974167143, + "loss": 6.3456, + "step": 1380 + }, + { + "epoch": 0.47450929862990165, + "grad_norm": 0.5359041690826416, + "learning_rate": 0.00031564568307470615, + "loss": 6.2156, + "step": 1381 + }, + { + "epoch": 0.47485289696344973, + "grad_norm": 0.5539677739143372, + "learning_rate": 0.00031535627427554144, + "loss": 6.1331, + "step": 1382 + }, + { + "epoch": 0.4751964952969978, + "grad_norm": 0.5184550881385803, + "learning_rate": 0.00031506677143564856, + "loss": 6.2359, + "step": 1383 + }, + { + "epoch": 0.4755400936305459, + "grad_norm": 0.5449308156967163, + "learning_rate": 0.00031477717497159133, + "loss": 6.2612, + "step": 1384 + }, + { + "epoch": 0.47588369196409397, + "grad_norm": 0.6895928978919983, + "learning_rate": 0.0003144874853000682, + "loss": 6.2665, + "step": 1385 + }, + { + "epoch": 0.47622729029764205, + "grad_norm": 0.5198205709457397, + "learning_rate": 0.000314197702837912, + "loss": 6.2982, + "step": 1386 + }, + { + "epoch": 0.4765708886311901, + "grad_norm": 0.622516393661499, + "learning_rate": 0.00031390782800208865, + "loss": 6.3573, + "step": 1387 + }, + { + "epoch": 0.4769144869647382, + "grad_norm": 0.7482896447181702, + "learning_rate": 0.00031361786120969734, + "loss": 6.3066, + "step": 1388 + }, + { + "epoch": 0.4772580852982863, + "grad_norm": 0.6616333723068237, + "learning_rate": 0.0003133278028779695, + "loss": 6.2663, + "step": 1389 + }, + { + "epoch": 0.47760168363183436, + "grad_norm": 0.6296412944793701, + "learning_rate": 0.000313037653424268, + "loss": 6.2826, + "step": 1390 + }, + { + "epoch": 0.47794528196538244, + "grad_norm": 0.7290968298912048, + "learning_rate": 0.0003127474132660872, + "loss": 6.4081, + "step": 1391 + }, + { + "epoch": 0.4782888802989306, + "grad_norm": 0.6888962388038635, + "learning_rate": 0.0003124570828210518, + "loss": 6.3042, + "step": 1392 + }, + { + "epoch": 0.47863247863247865, + "grad_norm": 0.5802988409996033, + "learning_rate": 0.0003121666625069165, + "loss": 6.4235, + "step": 1393 + }, + { + "epoch": 0.47897607696602673, + "grad_norm": 0.7673103213310242, + "learning_rate": 0.0003118761527415651, + "loss": 6.3122, + "step": 1394 + }, + { + "epoch": 0.4793196752995748, + "grad_norm": 0.9014639854431152, + "learning_rate": 0.0003115855539430104, + "loss": 6.1529, + "step": 1395 + }, + { + "epoch": 0.4796632736331229, + "grad_norm": 0.7389999032020569, + "learning_rate": 0.0003112948665293931, + "loss": 6.2241, + "step": 1396 + }, + { + "epoch": 0.48000687196667097, + "grad_norm": 0.8546797633171082, + "learning_rate": 0.0003110040909189815, + "loss": 6.457, + "step": 1397 + }, + { + "epoch": 0.48035047030021905, + "grad_norm": 0.8488891124725342, + "learning_rate": 0.0003107132275301707, + "loss": 6.4161, + "step": 1398 + }, + { + "epoch": 0.4806940686337671, + "grad_norm": 0.7810699939727783, + "learning_rate": 0.0003104222767814823, + "loss": 6.434, + "step": 1399 + }, + { + "epoch": 0.4810376669673152, + "grad_norm": 1.0687366724014282, + "learning_rate": 0.00031013123909156344, + "loss": 6.5133, + "step": 1400 + }, + { + "epoch": 0.4813812653008633, + "grad_norm": 0.9212234020233154, + "learning_rate": 0.0003098401148791863, + "loss": 6.223, + "step": 1401 + }, + { + "epoch": 0.48172486363441136, + "grad_norm": 0.7973336577415466, + "learning_rate": 0.0003095489045632479, + "loss": 6.1975, + "step": 1402 + }, + { + "epoch": 0.48206846196795944, + "grad_norm": 0.6696678400039673, + "learning_rate": 0.00030925760856276866, + "loss": 6.3295, + "step": 1403 + }, + { + "epoch": 0.4824120603015075, + "grad_norm": 0.5218870043754578, + "learning_rate": 0.00030896622729689266, + "loss": 6.2556, + "step": 1404 + }, + { + "epoch": 0.4827556586350556, + "grad_norm": 0.663040816783905, + "learning_rate": 0.00030867476118488654, + "loss": 6.1849, + "step": 1405 + }, + { + "epoch": 0.48309925696860373, + "grad_norm": 0.7346043586730957, + "learning_rate": 0.0003083832106461391, + "loss": 6.3262, + "step": 1406 + }, + { + "epoch": 0.4834428553021518, + "grad_norm": 0.6023099422454834, + "learning_rate": 0.0003080915761001605, + "loss": 6.2441, + "step": 1407 + }, + { + "epoch": 0.4837864536356999, + "grad_norm": 0.6472316980361938, + "learning_rate": 0.00030779985796658177, + "loss": 6.3055, + "step": 1408 + }, + { + "epoch": 0.48413005196924797, + "grad_norm": 0.6640135049819946, + "learning_rate": 0.0003075080566651544, + "loss": 6.245, + "step": 1409 + }, + { + "epoch": 0.48447365030279604, + "grad_norm": 0.6156014204025269, + "learning_rate": 0.0003072161726157494, + "loss": 6.1676, + "step": 1410 + }, + { + "epoch": 0.4848172486363441, + "grad_norm": 0.798943042755127, + "learning_rate": 0.0003069242062383569, + "loss": 6.3021, + "step": 1411 + }, + { + "epoch": 0.4851608469698922, + "grad_norm": 0.5238396525382996, + "learning_rate": 0.00030663215795308533, + "loss": 6.2686, + "step": 1412 + }, + { + "epoch": 0.4855044453034403, + "grad_norm": 0.5713295340538025, + "learning_rate": 0.0003063400281801613, + "loss": 6.0774, + "step": 1413 + }, + { + "epoch": 0.48584804363698836, + "grad_norm": 0.6762855052947998, + "learning_rate": 0.0003060478173399283, + "loss": 6.2609, + "step": 1414 + }, + { + "epoch": 0.48619164197053644, + "grad_norm": 0.589787483215332, + "learning_rate": 0.00030575552585284684, + "loss": 6.2896, + "step": 1415 + }, + { + "epoch": 0.4865352403040845, + "grad_norm": 0.6290336847305298, + "learning_rate": 0.0003054631541394932, + "loss": 6.1697, + "step": 1416 + }, + { + "epoch": 0.4868788386376326, + "grad_norm": 0.5550079345703125, + "learning_rate": 0.00030517070262055907, + "loss": 6.1668, + "step": 1417 + }, + { + "epoch": 0.4872224369711807, + "grad_norm": 0.5955278873443604, + "learning_rate": 0.00030487817171685126, + "loss": 6.2552, + "step": 1418 + }, + { + "epoch": 0.48756603530472875, + "grad_norm": 0.5142794251441956, + "learning_rate": 0.0003045855618492905, + "loss": 6.3203, + "step": 1419 + }, + { + "epoch": 0.48790963363827683, + "grad_norm": 0.6616760492324829, + "learning_rate": 0.0003042928734389114, + "loss": 6.3663, + "step": 1420 + }, + { + "epoch": 0.48825323197182496, + "grad_norm": 0.5484029650688171, + "learning_rate": 0.0003040001069068613, + "loss": 6.1665, + "step": 1421 + }, + { + "epoch": 0.48859683030537304, + "grad_norm": 0.5682584047317505, + "learning_rate": 0.0003037072626744003, + "loss": 6.3342, + "step": 1422 + }, + { + "epoch": 0.4889404286389211, + "grad_norm": 0.5823968648910522, + "learning_rate": 0.00030341434116289997, + "loss": 6.1182, + "step": 1423 + }, + { + "epoch": 0.4892840269724692, + "grad_norm": 0.5125210285186768, + "learning_rate": 0.00030312134279384317, + "loss": 6.3474, + "step": 1424 + }, + { + "epoch": 0.4896276253060173, + "grad_norm": 0.5110132098197937, + "learning_rate": 0.00030282826798882356, + "loss": 6.2739, + "step": 1425 + }, + { + "epoch": 0.48997122363956536, + "grad_norm": 0.56996750831604, + "learning_rate": 0.0003025351171695444, + "loss": 6.1881, + "step": 1426 + }, + { + "epoch": 0.49031482197311343, + "grad_norm": 0.5609503984451294, + "learning_rate": 0.0003022418907578188, + "loss": 6.3131, + "step": 1427 + }, + { + "epoch": 0.4906584203066615, + "grad_norm": 0.5459311604499817, + "learning_rate": 0.00030194858917556816, + "loss": 6.1978, + "step": 1428 + }, + { + "epoch": 0.4910020186402096, + "grad_norm": 0.5984681248664856, + "learning_rate": 0.0003016552128448224, + "loss": 6.321, + "step": 1429 + }, + { + "epoch": 0.49134561697375767, + "grad_norm": 0.5124654769897461, + "learning_rate": 0.00030136176218771875, + "loss": 6.2282, + "step": 1430 + }, + { + "epoch": 0.49168921530730575, + "grad_norm": 0.5804280638694763, + "learning_rate": 0.00030106823762650163, + "loss": 6.2156, + "step": 1431 + }, + { + "epoch": 0.49203281364085383, + "grad_norm": 0.7151570916175842, + "learning_rate": 0.0003007746395835215, + "loss": 6.2783, + "step": 1432 + }, + { + "epoch": 0.4923764119744019, + "grad_norm": 0.6895557641983032, + "learning_rate": 0.00030048096848123493, + "loss": 6.1935, + "step": 1433 + }, + { + "epoch": 0.49272001030795, + "grad_norm": 0.7361437082290649, + "learning_rate": 0.0003001872247422032, + "loss": 6.2602, + "step": 1434 + }, + { + "epoch": 0.49306360864149806, + "grad_norm": 0.7202491760253906, + "learning_rate": 0.00029989340878909244, + "loss": 6.2125, + "step": 1435 + }, + { + "epoch": 0.4934072069750462, + "grad_norm": 0.6291714906692505, + "learning_rate": 0.00029959952104467247, + "loss": 6.2358, + "step": 1436 + }, + { + "epoch": 0.4937508053085943, + "grad_norm": 0.8956025242805481, + "learning_rate": 0.0002993055619318166, + "loss": 6.2331, + "step": 1437 + }, + { + "epoch": 0.49409440364214235, + "grad_norm": 0.7390561103820801, + "learning_rate": 0.0002990115318735007, + "loss": 6.3176, + "step": 1438 + }, + { + "epoch": 0.49443800197569043, + "grad_norm": 0.6197091937065125, + "learning_rate": 0.00029871743129280273, + "loss": 6.1561, + "step": 1439 + }, + { + "epoch": 0.4947816003092385, + "grad_norm": 0.7448058128356934, + "learning_rate": 0.00029842326061290205, + "loss": 6.344, + "step": 1440 + }, + { + "epoch": 0.4951251986427866, + "grad_norm": 0.7358593940734863, + "learning_rate": 0.0002981290202570792, + "loss": 6.2683, + "step": 1441 + }, + { + "epoch": 0.49546879697633467, + "grad_norm": 0.8008904457092285, + "learning_rate": 0.0002978347106487146, + "loss": 6.2716, + "step": 1442 + }, + { + "epoch": 0.49581239530988275, + "grad_norm": 0.7615576386451721, + "learning_rate": 0.00029754033221128864, + "loss": 6.3032, + "step": 1443 + }, + { + "epoch": 0.4961559936434308, + "grad_norm": 0.7408410906791687, + "learning_rate": 0.0002972458853683803, + "loss": 6.352, + "step": 1444 + }, + { + "epoch": 0.4964995919769789, + "grad_norm": 0.6674975752830505, + "learning_rate": 0.0002969513705436676, + "loss": 6.3652, + "step": 1445 + }, + { + "epoch": 0.496843190310527, + "grad_norm": 0.9053188562393188, + "learning_rate": 0.0002966567881609258, + "loss": 6.1545, + "step": 1446 + }, + { + "epoch": 0.49718678864407506, + "grad_norm": 0.7032760381698608, + "learning_rate": 0.0002963621386440277, + "loss": 6.3121, + "step": 1447 + }, + { + "epoch": 0.49753038697762314, + "grad_norm": 0.9544557332992554, + "learning_rate": 0.0002960674224169427, + "loss": 6.2407, + "step": 1448 + }, + { + "epoch": 0.4978739853111712, + "grad_norm": 0.8960500955581665, + "learning_rate": 0.00029577263990373593, + "loss": 6.38, + "step": 1449 + }, + { + "epoch": 0.4982175836447193, + "grad_norm": 1.0645229816436768, + "learning_rate": 0.00029547779152856827, + "loss": 6.2797, + "step": 1450 + }, + { + "epoch": 0.49856118197826743, + "grad_norm": 1.2595845460891724, + "learning_rate": 0.0002951828777156951, + "loss": 6.2422, + "step": 1451 + }, + { + "epoch": 0.4989047803118155, + "grad_norm": 1.0342768430709839, + "learning_rate": 0.000294887898889466, + "loss": 6.2663, + "step": 1452 + }, + { + "epoch": 0.4992483786453636, + "grad_norm": 0.9279031753540039, + "learning_rate": 0.0002945928554743241, + "loss": 6.2004, + "step": 1453 + }, + { + "epoch": 0.49959197697891167, + "grad_norm": 0.8587821125984192, + "learning_rate": 0.0002942977478948057, + "loss": 6.1789, + "step": 1454 + }, + { + "epoch": 0.49993557531245975, + "grad_norm": 0.880979597568512, + "learning_rate": 0.00029400257657553896, + "loss": 6.1842, + "step": 1455 + }, + { + "epoch": 0.5002791736460078, + "grad_norm": 0.9389069676399231, + "learning_rate": 0.0002937073419412442, + "loss": 6.18, + "step": 1456 + }, + { + "epoch": 0.5006227719795558, + "grad_norm": 0.5919857025146484, + "learning_rate": 0.00029341204441673266, + "loss": 6.1496, + "step": 1457 + }, + { + "epoch": 0.500966370313104, + "grad_norm": 0.8019484281539917, + "learning_rate": 0.0002931166844269059, + "loss": 6.0585, + "step": 1458 + }, + { + "epoch": 0.5013099686466521, + "grad_norm": 0.6207021474838257, + "learning_rate": 0.0002928212623967556, + "loss": 6.1253, + "step": 1459 + }, + { + "epoch": 0.5016535669802001, + "grad_norm": 0.7272038459777832, + "learning_rate": 0.0002925257787513628, + "loss": 6.1487, + "step": 1460 + }, + { + "epoch": 0.5019971653137483, + "grad_norm": 0.7108120918273926, + "learning_rate": 0.00029223023391589695, + "loss": 6.1344, + "step": 1461 + }, + { + "epoch": 0.5023407636472963, + "grad_norm": 0.51841801404953, + "learning_rate": 0.0002919346283156155, + "loss": 6.3044, + "step": 1462 + }, + { + "epoch": 0.5026843619808444, + "grad_norm": 0.7992899417877197, + "learning_rate": 0.0002916389623758636, + "loss": 6.1096, + "step": 1463 + }, + { + "epoch": 0.5030279603143925, + "grad_norm": 0.8045459985733032, + "learning_rate": 0.0002913432365220732, + "loss": 6.1531, + "step": 1464 + }, + { + "epoch": 0.5033715586479406, + "grad_norm": 0.4957735240459442, + "learning_rate": 0.0002910474511797621, + "loss": 6.2309, + "step": 1465 + }, + { + "epoch": 0.5037151569814886, + "grad_norm": 0.5887153148651123, + "learning_rate": 0.00029075160677453416, + "loss": 6.2778, + "step": 1466 + }, + { + "epoch": 0.5040587553150367, + "grad_norm": 0.7119141221046448, + "learning_rate": 0.00029045570373207794, + "loss": 6.1357, + "step": 1467 + }, + { + "epoch": 0.5044023536485848, + "grad_norm": 0.5873328447341919, + "learning_rate": 0.0002901597424781664, + "loss": 6.1697, + "step": 1468 + }, + { + "epoch": 0.5047459519821329, + "grad_norm": 0.5278820395469666, + "learning_rate": 0.00028986372343865643, + "loss": 6.1599, + "step": 1469 + }, + { + "epoch": 0.5050895503156809, + "grad_norm": 0.5881029963493347, + "learning_rate": 0.00028956764703948787, + "loss": 6.2839, + "step": 1470 + }, + { + "epoch": 0.5054331486492291, + "grad_norm": 0.6216078996658325, + "learning_rate": 0.0002892715137066831, + "loss": 6.1502, + "step": 1471 + }, + { + "epoch": 0.5057767469827771, + "grad_norm": 0.548704981803894, + "learning_rate": 0.00028897532386634663, + "loss": 6.2749, + "step": 1472 + }, + { + "epoch": 0.5061203453163252, + "grad_norm": 0.6772187352180481, + "learning_rate": 0.00028867907794466403, + "loss": 6.1506, + "step": 1473 + }, + { + "epoch": 0.5064639436498733, + "grad_norm": 0.6250154972076416, + "learning_rate": 0.00028838277636790183, + "loss": 6.3769, + "step": 1474 + }, + { + "epoch": 0.5068075419834214, + "grad_norm": 0.6144253015518188, + "learning_rate": 0.0002880864195624063, + "loss": 6.2031, + "step": 1475 + }, + { + "epoch": 0.5071511403169695, + "grad_norm": 0.5683775544166565, + "learning_rate": 0.0002877900079546035, + "loss": 6.2813, + "step": 1476 + }, + { + "epoch": 0.5074947386505175, + "grad_norm": 0.6378828883171082, + "learning_rate": 0.0002874935419709982, + "loss": 6.2685, + "step": 1477 + }, + { + "epoch": 0.5078383369840657, + "grad_norm": 0.5964968800544739, + "learning_rate": 0.0002871970220381733, + "loss": 6.2304, + "step": 1478 + }, + { + "epoch": 0.5081819353176137, + "grad_norm": 0.6776195764541626, + "learning_rate": 0.0002869004485827896, + "loss": 6.3116, + "step": 1479 + }, + { + "epoch": 0.5085255336511618, + "grad_norm": 0.6209731101989746, + "learning_rate": 0.0002866038220315847, + "loss": 6.215, + "step": 1480 + }, + { + "epoch": 0.5088691319847098, + "grad_norm": 0.6367368698120117, + "learning_rate": 0.0002863071428113726, + "loss": 6.2344, + "step": 1481 + }, + { + "epoch": 0.509212730318258, + "grad_norm": 0.6730502247810364, + "learning_rate": 0.0002860104113490432, + "loss": 6.2209, + "step": 1482 + }, + { + "epoch": 0.509556328651806, + "grad_norm": 0.529813289642334, + "learning_rate": 0.0002857136280715616, + "loss": 6.1866, + "step": 1483 + }, + { + "epoch": 0.5098999269853541, + "grad_norm": 0.7928996682167053, + "learning_rate": 0.0002854167934059672, + "loss": 6.2018, + "step": 1484 + }, + { + "epoch": 0.5102435253189022, + "grad_norm": 0.5787724852561951, + "learning_rate": 0.0002851199077793736, + "loss": 6.276, + "step": 1485 + }, + { + "epoch": 0.5105871236524503, + "grad_norm": 0.6694987416267395, + "learning_rate": 0.0002848229716189678, + "loss": 6.4032, + "step": 1486 + }, + { + "epoch": 0.5109307219859983, + "grad_norm": 0.604768693447113, + "learning_rate": 0.0002845259853520091, + "loss": 6.2687, + "step": 1487 + }, + { + "epoch": 0.5112743203195464, + "grad_norm": 0.6459552049636841, + "learning_rate": 0.00028422894940582927, + "loss": 6.2607, + "step": 1488 + }, + { + "epoch": 0.5116179186530946, + "grad_norm": 0.992656409740448, + "learning_rate": 0.00028393186420783145, + "loss": 6.3237, + "step": 1489 + }, + { + "epoch": 0.5119615169866426, + "grad_norm": 0.823799192905426, + "learning_rate": 0.0002836347301854897, + "loss": 6.092, + "step": 1490 + }, + { + "epoch": 0.5123051153201907, + "grad_norm": 0.8477516174316406, + "learning_rate": 0.0002833375477663481, + "loss": 6.223, + "step": 1491 + }, + { + "epoch": 0.5126487136537388, + "grad_norm": 0.7959129214286804, + "learning_rate": 0.00028304031737802076, + "loss": 6.2709, + "step": 1492 + }, + { + "epoch": 0.5129923119872869, + "grad_norm": 0.8560094237327576, + "learning_rate": 0.00028274303944819044, + "loss": 6.3429, + "step": 1493 + }, + { + "epoch": 0.5133359103208349, + "grad_norm": 0.8726269602775574, + "learning_rate": 0.0002824457144046086, + "loss": 6.244, + "step": 1494 + }, + { + "epoch": 0.513679508654383, + "grad_norm": 0.8484057188034058, + "learning_rate": 0.0002821483426750942, + "loss": 6.2978, + "step": 1495 + }, + { + "epoch": 0.5140231069879311, + "grad_norm": 0.7470120787620544, + "learning_rate": 0.00028185092468753373, + "loss": 6.3663, + "step": 1496 + }, + { + "epoch": 0.5143667053214792, + "grad_norm": 0.9047187566757202, + "learning_rate": 0.0002815534608698798, + "loss": 6.5125, + "step": 1497 + }, + { + "epoch": 0.5147103036550272, + "grad_norm": 0.9526438117027283, + "learning_rate": 0.00028125595165015137, + "loss": 6.2855, + "step": 1498 + }, + { + "epoch": 0.5150539019885754, + "grad_norm": 1.011681318283081, + "learning_rate": 0.0002809583974564326, + "loss": 6.3073, + "step": 1499 + }, + { + "epoch": 0.5153975003221234, + "grad_norm": 0.8152887225151062, + "learning_rate": 0.0002806607987168722, + "loss": 6.4397, + "step": 1500 + }, + { + "epoch": 0.5157410986556715, + "grad_norm": 1.0889192819595337, + "learning_rate": 0.0002803631558596832, + "loss": 6.2249, + "step": 1501 + }, + { + "epoch": 0.5160846969892195, + "grad_norm": 1.135572075843811, + "learning_rate": 0.000280065469313142, + "loss": 6.0517, + "step": 1502 + }, + { + "epoch": 0.5164282953227677, + "grad_norm": 0.6644673347473145, + "learning_rate": 0.0002797677395055879, + "loss": 6.1306, + "step": 1503 + }, + { + "epoch": 0.5167718936563158, + "grad_norm": 0.7205954194068909, + "learning_rate": 0.0002794699668654223, + "loss": 6.1675, + "step": 1504 + }, + { + "epoch": 0.5171154919898638, + "grad_norm": 0.6675506830215454, + "learning_rate": 0.00027917215182110853, + "loss": 6.1734, + "step": 1505 + }, + { + "epoch": 0.517459090323412, + "grad_norm": 0.7752920389175415, + "learning_rate": 0.00027887429480117075, + "loss": 6.2684, + "step": 1506 + }, + { + "epoch": 0.51780268865696, + "grad_norm": 0.7607707381248474, + "learning_rate": 0.00027857639623419346, + "loss": 6.1148, + "step": 1507 + }, + { + "epoch": 0.5181462869905081, + "grad_norm": 0.49518853425979614, + "learning_rate": 0.0002782784565488211, + "loss": 6.1403, + "step": 1508 + }, + { + "epoch": 0.5184898853240562, + "grad_norm": 0.7124084830284119, + "learning_rate": 0.0002779804761737571, + "loss": 6.1645, + "step": 1509 + }, + { + "epoch": 0.5188334836576043, + "grad_norm": 0.7854694724082947, + "learning_rate": 0.00027768245553776356, + "loss": 6.2317, + "step": 1510 + }, + { + "epoch": 0.5191770819911523, + "grad_norm": 0.6847150325775146, + "learning_rate": 0.00027738439506966046, + "loss": 6.005, + "step": 1511 + }, + { + "epoch": 0.5195206803247004, + "grad_norm": 0.6421884894371033, + "learning_rate": 0.00027708629519832516, + "loss": 6.1717, + "step": 1512 + }, + { + "epoch": 0.5198642786582485, + "grad_norm": 0.7636914253234863, + "learning_rate": 0.0002767881563526917, + "loss": 6.2715, + "step": 1513 + }, + { + "epoch": 0.5202078769917966, + "grad_norm": 0.5291933417320251, + "learning_rate": 0.00027648997896175003, + "loss": 6.3303, + "step": 1514 + }, + { + "epoch": 0.5205514753253446, + "grad_norm": 0.5756345987319946, + "learning_rate": 0.00027619176345454585, + "loss": 6.1258, + "step": 1515 + }, + { + "epoch": 0.5208950736588928, + "grad_norm": 0.7027370929718018, + "learning_rate": 0.0002758935102601796, + "loss": 6.214, + "step": 1516 + }, + { + "epoch": 0.5212386719924408, + "grad_norm": 0.7267154455184937, + "learning_rate": 0.00027559521980780564, + "loss": 6.2464, + "step": 1517 + }, + { + "epoch": 0.5215822703259889, + "grad_norm": 0.5030806064605713, + "learning_rate": 0.0002752968925266325, + "loss": 6.2359, + "step": 1518 + }, + { + "epoch": 0.521925868659537, + "grad_norm": 0.6831111311912537, + "learning_rate": 0.0002749985288459213, + "loss": 6.0984, + "step": 1519 + }, + { + "epoch": 0.5222694669930851, + "grad_norm": 0.6685227751731873, + "learning_rate": 0.00027470012919498567, + "loss": 6.0867, + "step": 1520 + }, + { + "epoch": 0.5226130653266332, + "grad_norm": 0.5604387521743774, + "learning_rate": 0.00027440169400319087, + "loss": 6.3091, + "step": 1521 + }, + { + "epoch": 0.5229566636601812, + "grad_norm": 0.6780193448066711, + "learning_rate": 0.00027410322369995357, + "loss": 6.2407, + "step": 1522 + }, + { + "epoch": 0.5233002619937294, + "grad_norm": 0.7392048835754395, + "learning_rate": 0.0002738047187147406, + "loss": 6.2447, + "step": 1523 + }, + { + "epoch": 0.5236438603272774, + "grad_norm": 0.6794151067733765, + "learning_rate": 0.00027350617947706913, + "loss": 6.3431, + "step": 1524 + }, + { + "epoch": 0.5239874586608255, + "grad_norm": 0.6256364583969116, + "learning_rate": 0.0002732076064165052, + "loss": 6.2444, + "step": 1525 + }, + { + "epoch": 0.5243310569943735, + "grad_norm": 0.6079226136207581, + "learning_rate": 0.0002729089999626637, + "loss": 6.2164, + "step": 1526 + }, + { + "epoch": 0.5246746553279217, + "grad_norm": 0.6453949809074402, + "learning_rate": 0.0002726103605452075, + "loss": 6.1986, + "step": 1527 + }, + { + "epoch": 0.5250182536614697, + "grad_norm": 0.7797389626502991, + "learning_rate": 0.0002723116885938472, + "loss": 6.1708, + "step": 1528 + }, + { + "epoch": 0.5253618519950178, + "grad_norm": 0.5642460584640503, + "learning_rate": 0.00027201298453833977, + "loss": 6.0971, + "step": 1529 + }, + { + "epoch": 0.5257054503285659, + "grad_norm": 0.5858938694000244, + "learning_rate": 0.00027171424880848867, + "loss": 6.3316, + "step": 1530 + }, + { + "epoch": 0.526049048662114, + "grad_norm": 0.8247479796409607, + "learning_rate": 0.00027141548183414274, + "loss": 6.1299, + "step": 1531 + }, + { + "epoch": 0.526392646995662, + "grad_norm": 0.6232000589370728, + "learning_rate": 0.00027111668404519604, + "loss": 6.2575, + "step": 1532 + }, + { + "epoch": 0.5267362453292102, + "grad_norm": 0.5557352900505066, + "learning_rate": 0.0002708178558715866, + "loss": 6.2561, + "step": 1533 + }, + { + "epoch": 0.5270798436627583, + "grad_norm": 0.7393467426300049, + "learning_rate": 0.00027051899774329665, + "loss": 6.1768, + "step": 1534 + }, + { + "epoch": 0.5274234419963063, + "grad_norm": 0.6794891357421875, + "learning_rate": 0.00027022011009035106, + "loss": 6.2293, + "step": 1535 + }, + { + "epoch": 0.5277670403298544, + "grad_norm": 0.7217805981636047, + "learning_rate": 0.0002699211933428174, + "loss": 6.1596, + "step": 1536 + }, + { + "epoch": 0.5281106386634025, + "grad_norm": 0.7262641787528992, + "learning_rate": 0.00026962224793080513, + "loss": 6.1514, + "step": 1537 + }, + { + "epoch": 0.5284542369969506, + "grad_norm": 0.7474170327186584, + "learning_rate": 0.0002693232742844649, + "loss": 6.2244, + "step": 1538 + }, + { + "epoch": 0.5287978353304986, + "grad_norm": 0.7755861878395081, + "learning_rate": 0.00026902427283398796, + "loss": 6.1124, + "step": 1539 + }, + { + "epoch": 0.5291414336640468, + "grad_norm": 0.7614656090736389, + "learning_rate": 0.00026872524400960564, + "loss": 6.3099, + "step": 1540 + }, + { + "epoch": 0.5294850319975948, + "grad_norm": 0.6895480751991272, + "learning_rate": 0.0002684261882415886, + "loss": 6.2277, + "step": 1541 + }, + { + "epoch": 0.5298286303311429, + "grad_norm": 0.7481386065483093, + "learning_rate": 0.0002681271059602462, + "loss": 6.4332, + "step": 1542 + }, + { + "epoch": 0.5301722286646909, + "grad_norm": 0.6497515439987183, + "learning_rate": 0.0002678279975959261, + "loss": 6.2423, + "step": 1543 + }, + { + "epoch": 0.5305158269982391, + "grad_norm": 0.7895376086235046, + "learning_rate": 0.00026752886357901353, + "loss": 6.4106, + "step": 1544 + }, + { + "epoch": 0.5308594253317871, + "grad_norm": 0.7385738492012024, + "learning_rate": 0.0002672297043399304, + "loss": 6.33, + "step": 1545 + }, + { + "epoch": 0.5312030236653352, + "grad_norm": 0.6905698776245117, + "learning_rate": 0.0002669305203091351, + "loss": 6.1804, + "step": 1546 + }, + { + "epoch": 0.5315466219988833, + "grad_norm": 0.9432231783866882, + "learning_rate": 0.0002666313119171216, + "loss": 6.2444, + "step": 1547 + }, + { + "epoch": 0.5318902203324314, + "grad_norm": 1.0359256267547607, + "learning_rate": 0.000266332079594419, + "loss": 6.2369, + "step": 1548 + }, + { + "epoch": 0.5322338186659795, + "grad_norm": 0.7891038656234741, + "learning_rate": 0.0002660328237715907, + "loss": 6.3167, + "step": 1549 + }, + { + "epoch": 0.5325774169995275, + "grad_norm": 1.0773836374282837, + "learning_rate": 0.000265733544879234, + "loss": 6.3635, + "step": 1550 + }, + { + "epoch": 0.5329210153330757, + "grad_norm": 1.258744478225708, + "learning_rate": 0.00026543424334797956, + "loss": 6.1315, + "step": 1551 + }, + { + "epoch": 0.5332646136666237, + "grad_norm": 1.075838565826416, + "learning_rate": 0.0002651349196084903, + "loss": 6.1109, + "step": 1552 + }, + { + "epoch": 0.5336082120001718, + "grad_norm": 0.7799094915390015, + "learning_rate": 0.0002648355740914613, + "loss": 6.1875, + "step": 1553 + }, + { + "epoch": 0.5339518103337199, + "grad_norm": 0.872112512588501, + "learning_rate": 0.00026453620722761897, + "loss": 6.1078, + "step": 1554 + }, + { + "epoch": 0.534295408667268, + "grad_norm": 0.8411591649055481, + "learning_rate": 0.00026423681944772034, + "loss": 6.0248, + "step": 1555 + }, + { + "epoch": 0.534639007000816, + "grad_norm": 0.8713768720626831, + "learning_rate": 0.00026393741118255253, + "loss": 6.1603, + "step": 1556 + }, + { + "epoch": 0.5349826053343641, + "grad_norm": 1.0349767208099365, + "learning_rate": 0.00026363798286293226, + "loss": 6.1652, + "step": 1557 + }, + { + "epoch": 0.5353262036679122, + "grad_norm": 0.7198126912117004, + "learning_rate": 0.0002633385349197051, + "loss": 6.125, + "step": 1558 + }, + { + "epoch": 0.5356698020014603, + "grad_norm": 0.8494895696640015, + "learning_rate": 0.0002630390677837447, + "loss": 6.2175, + "step": 1559 + }, + { + "epoch": 0.5360134003350083, + "grad_norm": 0.8133593201637268, + "learning_rate": 0.00026273958188595235, + "loss": 6.3068, + "step": 1560 + }, + { + "epoch": 0.5363569986685565, + "grad_norm": 0.7584961652755737, + "learning_rate": 0.0002624400776572566, + "loss": 6.1282, + "step": 1561 + }, + { + "epoch": 0.5367005970021045, + "grad_norm": 0.7136414051055908, + "learning_rate": 0.0002621405555286121, + "loss": 6.22, + "step": 1562 + }, + { + "epoch": 0.5370441953356526, + "grad_norm": 0.8710498809814453, + "learning_rate": 0.0002618410159309992, + "loss": 6.16, + "step": 1563 + }, + { + "epoch": 0.5373877936692008, + "grad_norm": 0.8503127098083496, + "learning_rate": 0.00026154145929542386, + "loss": 6.2239, + "step": 1564 + }, + { + "epoch": 0.5377313920027488, + "grad_norm": 0.8023079037666321, + "learning_rate": 0.0002612418860529158, + "loss": 6.1692, + "step": 1565 + }, + { + "epoch": 0.5380749903362969, + "grad_norm": 0.9042754769325256, + "learning_rate": 0.00026094229663452934, + "loss": 6.1975, + "step": 1566 + }, + { + "epoch": 0.5384185886698449, + "grad_norm": 0.7970142960548401, + "learning_rate": 0.0002606426914713418, + "loss": 6.2393, + "step": 1567 + }, + { + "epoch": 0.5387621870033931, + "grad_norm": 0.607698917388916, + "learning_rate": 0.00026034307099445295, + "loss": 6.19, + "step": 1568 + }, + { + "epoch": 0.5391057853369411, + "grad_norm": 0.8219443559646606, + "learning_rate": 0.0002600434356349849, + "loss": 6.3123, + "step": 1569 + }, + { + "epoch": 0.5394493836704892, + "grad_norm": 0.6942794919013977, + "learning_rate": 0.0002597437858240812, + "loss": 6.1638, + "step": 1570 + }, + { + "epoch": 0.5397929820040372, + "grad_norm": 0.7742037773132324, + "learning_rate": 0.00025944412199290585, + "loss": 6.1596, + "step": 1571 + }, + { + "epoch": 0.5401365803375854, + "grad_norm": 0.8137544989585876, + "learning_rate": 0.00025914444457264334, + "loss": 6.2533, + "step": 1572 + }, + { + "epoch": 0.5404801786711334, + "grad_norm": 0.6724058985710144, + "learning_rate": 0.0002588447539944976, + "loss": 6.2323, + "step": 1573 + }, + { + "epoch": 0.5408237770046815, + "grad_norm": 0.7243574857711792, + "learning_rate": 0.0002585450506896915, + "loss": 6.2136, + "step": 1574 + }, + { + "epoch": 0.5411673753382296, + "grad_norm": 0.734247624874115, + "learning_rate": 0.00025824533508946615, + "loss": 6.1987, + "step": 1575 + }, + { + "epoch": 0.5415109736717777, + "grad_norm": 0.6768348813056946, + "learning_rate": 0.00025794560762508044, + "loss": 6.2893, + "step": 1576 + }, + { + "epoch": 0.5418545720053258, + "grad_norm": 0.5567851662635803, + "learning_rate": 0.00025764586872781053, + "loss": 6.2841, + "step": 1577 + }, + { + "epoch": 0.5421981703388739, + "grad_norm": 0.6665927767753601, + "learning_rate": 0.00025734611882894857, + "loss": 6.3247, + "step": 1578 + }, + { + "epoch": 0.542541768672422, + "grad_norm": 0.6924320459365845, + "learning_rate": 0.0002570463583598028, + "loss": 6.2716, + "step": 1579 + }, + { + "epoch": 0.54288536700597, + "grad_norm": 0.6488571166992188, + "learning_rate": 0.0002567465877516968, + "loss": 6.191, + "step": 1580 + }, + { + "epoch": 0.5432289653395181, + "grad_norm": 0.5639288425445557, + "learning_rate": 0.0002564468074359684, + "loss": 6.1643, + "step": 1581 + }, + { + "epoch": 0.5435725636730662, + "grad_norm": 0.6971966624259949, + "learning_rate": 0.0002561470178439698, + "loss": 6.1649, + "step": 1582 + }, + { + "epoch": 0.5439161620066143, + "grad_norm": 0.7037007808685303, + "learning_rate": 0.0002558472194070662, + "loss": 6.1952, + "step": 1583 + }, + { + "epoch": 0.5442597603401623, + "grad_norm": 0.5069451332092285, + "learning_rate": 0.00025554741255663584, + "loss": 6.0759, + "step": 1584 + }, + { + "epoch": 0.5446033586737105, + "grad_norm": 0.6529680490493774, + "learning_rate": 0.00025524759772406865, + "loss": 6.2216, + "step": 1585 + }, + { + "epoch": 0.5449469570072585, + "grad_norm": 0.5853424072265625, + "learning_rate": 0.00025494777534076647, + "loss": 6.1679, + "step": 1586 + }, + { + "epoch": 0.5452905553408066, + "grad_norm": 0.4783673882484436, + "learning_rate": 0.00025464794583814174, + "loss": 6.2046, + "step": 1587 + }, + { + "epoch": 0.5456341536743546, + "grad_norm": 0.5306436419487, + "learning_rate": 0.00025434810964761726, + "loss": 6.1886, + "step": 1588 + }, + { + "epoch": 0.5459777520079028, + "grad_norm": 0.6045244932174683, + "learning_rate": 0.0002540482672006254, + "loss": 6.2545, + "step": 1589 + }, + { + "epoch": 0.5463213503414508, + "grad_norm": 0.6048717498779297, + "learning_rate": 0.0002537484189286076, + "loss": 6.3085, + "step": 1590 + }, + { + "epoch": 0.5466649486749989, + "grad_norm": 0.5896782279014587, + "learning_rate": 0.0002534485652630135, + "loss": 6.2663, + "step": 1591 + }, + { + "epoch": 0.5470085470085471, + "grad_norm": 0.5969629883766174, + "learning_rate": 0.0002531487066353008, + "loss": 6.1941, + "step": 1592 + }, + { + "epoch": 0.5473521453420951, + "grad_norm": 0.6015769243240356, + "learning_rate": 0.00025284884347693415, + "loss": 6.0663, + "step": 1593 + }, + { + "epoch": 0.5476957436756432, + "grad_norm": 0.6984557509422302, + "learning_rate": 0.0002525489762193847, + "loss": 6.2395, + "step": 1594 + }, + { + "epoch": 0.5480393420091912, + "grad_norm": 0.7120092511177063, + "learning_rate": 0.0002522491052941295, + "loss": 6.2486, + "step": 1595 + }, + { + "epoch": 0.5483829403427394, + "grad_norm": 0.6783263683319092, + "learning_rate": 0.00025194923113265095, + "loss": 6.2436, + "step": 1596 + }, + { + "epoch": 0.5487265386762874, + "grad_norm": 0.7355448007583618, + "learning_rate": 0.0002516493541664362, + "loss": 6.1035, + "step": 1597 + }, + { + "epoch": 0.5490701370098355, + "grad_norm": 0.6941987872123718, + "learning_rate": 0.00025134947482697613, + "loss": 6.3763, + "step": 1598 + }, + { + "epoch": 0.5494137353433836, + "grad_norm": 0.873782753944397, + "learning_rate": 0.0002510495935457653, + "loss": 6.3246, + "step": 1599 + }, + { + "epoch": 0.5497573336769317, + "grad_norm": 1.0551037788391113, + "learning_rate": 0.00025074971075430104, + "loss": 6.2419, + "step": 1600 + }, + { + "epoch": 0.5501009320104797, + "grad_norm": 0.7868710160255432, + "learning_rate": 0.0002504498268840826, + "loss": 6.1377, + "step": 1601 + }, + { + "epoch": 0.5504445303440278, + "grad_norm": 0.6369834542274475, + "learning_rate": 0.00025014994236661125, + "loss": 6.1627, + "step": 1602 + }, + { + "epoch": 0.5507881286775759, + "grad_norm": 0.6547728180885315, + "learning_rate": 0.00024985005763338876, + "loss": 6.0642, + "step": 1603 + }, + { + "epoch": 0.551131727011124, + "grad_norm": 0.5977075099945068, + "learning_rate": 0.0002495501731159174, + "loss": 6.0824, + "step": 1604 + }, + { + "epoch": 0.551475325344672, + "grad_norm": 0.8198515772819519, + "learning_rate": 0.0002492502892456991, + "loss": 6.0344, + "step": 1605 + }, + { + "epoch": 0.5518189236782202, + "grad_norm": 0.7595130801200867, + "learning_rate": 0.0002489504064542347, + "loss": 6.039, + "step": 1606 + }, + { + "epoch": 0.5521625220117683, + "grad_norm": 0.5898275375366211, + "learning_rate": 0.00024865052517302394, + "loss": 6.1183, + "step": 1607 + }, + { + "epoch": 0.5525061203453163, + "grad_norm": 0.6673107743263245, + "learning_rate": 0.0002483506458335639, + "loss": 6.0777, + "step": 1608 + }, + { + "epoch": 0.5528497186788645, + "grad_norm": 0.8127065300941467, + "learning_rate": 0.00024805076886734906, + "loss": 6.0912, + "step": 1609 + }, + { + "epoch": 0.5531933170124125, + "grad_norm": 0.6119105815887451, + "learning_rate": 0.00024775089470587057, + "loss": 6.2048, + "step": 1610 + }, + { + "epoch": 0.5535369153459606, + "grad_norm": 0.5875973105430603, + "learning_rate": 0.00024745102378061543, + "loss": 6.0123, + "step": 1611 + }, + { + "epoch": 0.5538805136795086, + "grad_norm": 0.837634265422821, + "learning_rate": 0.00024715115652306586, + "loss": 6.1173, + "step": 1612 + }, + { + "epoch": 0.5542241120130568, + "grad_norm": 0.7039463520050049, + "learning_rate": 0.0002468512933646992, + "loss": 6.1897, + "step": 1613 + }, + { + "epoch": 0.5545677103466048, + "grad_norm": 0.6563109159469604, + "learning_rate": 0.00024655143473698655, + "loss": 6.2105, + "step": 1614 + }, + { + "epoch": 0.5549113086801529, + "grad_norm": 0.5650578737258911, + "learning_rate": 0.00024625158107139246, + "loss": 6.1727, + "step": 1615 + }, + { + "epoch": 0.555254907013701, + "grad_norm": 0.6338276863098145, + "learning_rate": 0.0002459517327993746, + "loss": 6.0853, + "step": 1616 + }, + { + "epoch": 0.5555985053472491, + "grad_norm": 0.8130798935890198, + "learning_rate": 0.0002456518903523828, + "loss": 6.2192, + "step": 1617 + }, + { + "epoch": 0.5559421036807971, + "grad_norm": 0.47761037945747375, + "learning_rate": 0.00024535205416185827, + "loss": 6.2542, + "step": 1618 + }, + { + "epoch": 0.5562857020143452, + "grad_norm": 0.5506361126899719, + "learning_rate": 0.00024505222465923354, + "loss": 6.1728, + "step": 1619 + }, + { + "epoch": 0.5566293003478933, + "grad_norm": 0.6886653304100037, + "learning_rate": 0.0002447524022759313, + "loss": 6.1607, + "step": 1620 + }, + { + "epoch": 0.5569728986814414, + "grad_norm": 0.5948178768157959, + "learning_rate": 0.0002444525874433642, + "loss": 6.1875, + "step": 1621 + }, + { + "epoch": 0.5573164970149895, + "grad_norm": 0.5316332578659058, + "learning_rate": 0.0002441527805929338, + "loss": 6.1377, + "step": 1622 + }, + { + "epoch": 0.5576600953485376, + "grad_norm": 0.6047093868255615, + "learning_rate": 0.00024385298215603017, + "loss": 6.1663, + "step": 1623 + }, + { + "epoch": 0.5580036936820857, + "grad_norm": 0.7805117964744568, + "learning_rate": 0.00024355319256403156, + "loss": 6.2092, + "step": 1624 + }, + { + "epoch": 0.5583472920156337, + "grad_norm": 0.5755521655082703, + "learning_rate": 0.0002432534122483033, + "loss": 6.1914, + "step": 1625 + }, + { + "epoch": 0.5586908903491818, + "grad_norm": 0.6552025675773621, + "learning_rate": 0.0002429536416401972, + "loss": 6.1984, + "step": 1626 + }, + { + "epoch": 0.5590344886827299, + "grad_norm": 0.6080207824707031, + "learning_rate": 0.00024265388117105153, + "loss": 6.1819, + "step": 1627 + }, + { + "epoch": 0.559378087016278, + "grad_norm": 0.6865605711936951, + "learning_rate": 0.0002423541312721896, + "loss": 6.1663, + "step": 1628 + }, + { + "epoch": 0.559721685349826, + "grad_norm": 0.6106628179550171, + "learning_rate": 0.00024205439237491949, + "loss": 6.2782, + "step": 1629 + }, + { + "epoch": 0.5600652836833742, + "grad_norm": 0.5570806264877319, + "learning_rate": 0.00024175466491053392, + "loss": 6.1906, + "step": 1630 + }, + { + "epoch": 0.5604088820169222, + "grad_norm": 0.6362578272819519, + "learning_rate": 0.0002414549493103086, + "loss": 6.2011, + "step": 1631 + }, + { + "epoch": 0.5607524803504703, + "grad_norm": 0.7689515352249146, + "learning_rate": 0.00024115524600550243, + "loss": 6.0611, + "step": 1632 + }, + { + "epoch": 0.5610960786840183, + "grad_norm": 0.5351979732513428, + "learning_rate": 0.0002408555554273567, + "loss": 6.1061, + "step": 1633 + }, + { + "epoch": 0.5614396770175665, + "grad_norm": 0.5828037261962891, + "learning_rate": 0.0002405558780070942, + "loss": 6.2728, + "step": 1634 + }, + { + "epoch": 0.5617832753511145, + "grad_norm": 0.6618168354034424, + "learning_rate": 0.00024025621417591886, + "loss": 6.2246, + "step": 1635 + }, + { + "epoch": 0.5621268736846626, + "grad_norm": 0.7272453904151917, + "learning_rate": 0.0002399565643650151, + "loss": 6.0693, + "step": 1636 + }, + { + "epoch": 0.5624704720182108, + "grad_norm": 0.7130446434020996, + "learning_rate": 0.00023965692900554712, + "loss": 6.2137, + "step": 1637 + }, + { + "epoch": 0.5628140703517588, + "grad_norm": 0.4798155128955841, + "learning_rate": 0.0002393573085286583, + "loss": 6.2844, + "step": 1638 + }, + { + "epoch": 0.5631576686853069, + "grad_norm": 0.6884756088256836, + "learning_rate": 0.0002390577033654707, + "loss": 6.1711, + "step": 1639 + }, + { + "epoch": 0.563501267018855, + "grad_norm": 0.682988703250885, + "learning_rate": 0.0002387581139470843, + "loss": 6.2269, + "step": 1640 + }, + { + "epoch": 0.5638448653524031, + "grad_norm": 0.7200033068656921, + "learning_rate": 0.00023845854070457623, + "loss": 6.2101, + "step": 1641 + }, + { + "epoch": 0.5641884636859511, + "grad_norm": 0.6693586111068726, + "learning_rate": 0.0002381589840690008, + "loss": 6.2992, + "step": 1642 + }, + { + "epoch": 0.5645320620194992, + "grad_norm": 0.6025054454803467, + "learning_rate": 0.000237859444471388, + "loss": 6.2408, + "step": 1643 + }, + { + "epoch": 0.5648756603530473, + "grad_norm": 0.7449381351470947, + "learning_rate": 0.0002375599223427434, + "loss": 6.2862, + "step": 1644 + }, + { + "epoch": 0.5652192586865954, + "grad_norm": 1.0429128408432007, + "learning_rate": 0.00023726041811404766, + "loss": 6.2646, + "step": 1645 + }, + { + "epoch": 0.5655628570201434, + "grad_norm": 0.7181035876274109, + "learning_rate": 0.00023696093221625532, + "loss": 6.1514, + "step": 1646 + }, + { + "epoch": 0.5659064553536916, + "grad_norm": 0.838463306427002, + "learning_rate": 0.0002366614650802949, + "loss": 6.4686, + "step": 1647 + }, + { + "epoch": 0.5662500536872396, + "grad_norm": 1.0257136821746826, + "learning_rate": 0.00023636201713706772, + "loss": 6.3583, + "step": 1648 + }, + { + "epoch": 0.5665936520207877, + "grad_norm": 0.8851714134216309, + "learning_rate": 0.00023606258881744745, + "loss": 6.3048, + "step": 1649 + }, + { + "epoch": 0.5669372503543357, + "grad_norm": 1.000500202178955, + "learning_rate": 0.00023576318055227975, + "loss": 6.3964, + "step": 1650 + }, + { + "epoch": 0.5672808486878839, + "grad_norm": 1.0033082962036133, + "learning_rate": 0.00023546379277238105, + "loss": 6.1371, + "step": 1651 + }, + { + "epoch": 0.567624447021432, + "grad_norm": 0.7857053875923157, + "learning_rate": 0.0002351644259085387, + "loss": 6.0755, + "step": 1652 + }, + { + "epoch": 0.56796804535498, + "grad_norm": 0.7347760200500488, + "learning_rate": 0.00023486508039150976, + "loss": 6.1028, + "step": 1653 + }, + { + "epoch": 0.5683116436885282, + "grad_norm": 0.5923128128051758, + "learning_rate": 0.00023456575665202053, + "loss": 6.0312, + "step": 1654 + }, + { + "epoch": 0.5686552420220762, + "grad_norm": 0.7084698677062988, + "learning_rate": 0.000234266455120766, + "loss": 6.0676, + "step": 1655 + }, + { + "epoch": 0.5689988403556243, + "grad_norm": 0.7293703556060791, + "learning_rate": 0.0002339671762284094, + "loss": 6.1787, + "step": 1656 + }, + { + "epoch": 0.5693424386891723, + "grad_norm": 0.604590654373169, + "learning_rate": 0.00023366792040558113, + "loss": 6.1567, + "step": 1657 + }, + { + "epoch": 0.5696860370227205, + "grad_norm": 0.6981744766235352, + "learning_rate": 0.00023336868808287843, + "loss": 5.9918, + "step": 1658 + }, + { + "epoch": 0.5700296353562685, + "grad_norm": 0.6478411555290222, + "learning_rate": 0.00023306947969086494, + "loss": 6.1123, + "step": 1659 + }, + { + "epoch": 0.5703732336898166, + "grad_norm": 0.6853176951408386, + "learning_rate": 0.00023277029566006965, + "loss": 6.1282, + "step": 1660 + }, + { + "epoch": 0.5707168320233647, + "grad_norm": 0.7653744220733643, + "learning_rate": 0.00023247113642098648, + "loss": 6.0782, + "step": 1661 + }, + { + "epoch": 0.5710604303569128, + "grad_norm": 0.6941654086112976, + "learning_rate": 0.00023217200240407387, + "loss": 6.0358, + "step": 1662 + }, + { + "epoch": 0.5714040286904608, + "grad_norm": 0.6170705556869507, + "learning_rate": 0.0002318728940397539, + "loss": 6.2664, + "step": 1663 + }, + { + "epoch": 0.5717476270240089, + "grad_norm": 0.6748530864715576, + "learning_rate": 0.00023157381175841144, + "loss": 6.1793, + "step": 1664 + }, + { + "epoch": 0.572091225357557, + "grad_norm": 0.7513784170150757, + "learning_rate": 0.0002312747559903944, + "loss": 6.1495, + "step": 1665 + }, + { + "epoch": 0.5724348236911051, + "grad_norm": 0.7271863222122192, + "learning_rate": 0.0002309757271660121, + "loss": 6.1181, + "step": 1666 + }, + { + "epoch": 0.5727784220246532, + "grad_norm": 0.7138211131095886, + "learning_rate": 0.00023067672571553514, + "loss": 6.2254, + "step": 1667 + }, + { + "epoch": 0.5731220203582013, + "grad_norm": 0.7385585904121399, + "learning_rate": 0.00023037775206919493, + "loss": 6.2181, + "step": 1668 + }, + { + "epoch": 0.5734656186917494, + "grad_norm": 0.6244990229606628, + "learning_rate": 0.00023007880665718263, + "loss": 6.2382, + "step": 1669 + }, + { + "epoch": 0.5738092170252974, + "grad_norm": 0.6192176938056946, + "learning_rate": 0.00022977988990964898, + "loss": 6.2845, + "step": 1670 + }, + { + "epoch": 0.5741528153588455, + "grad_norm": 0.6722596287727356, + "learning_rate": 0.0002294810022567034, + "loss": 6.2133, + "step": 1671 + }, + { + "epoch": 0.5744964136923936, + "grad_norm": 0.7086919546127319, + "learning_rate": 0.0002291821441284133, + "loss": 6.0343, + "step": 1672 + }, + { + "epoch": 0.5748400120259417, + "grad_norm": 0.7796761989593506, + "learning_rate": 0.000228883315954804, + "loss": 6.1843, + "step": 1673 + }, + { + "epoch": 0.5751836103594897, + "grad_norm": 0.4925467371940613, + "learning_rate": 0.0002285845181658573, + "loss": 6.2025, + "step": 1674 + }, + { + "epoch": 0.5755272086930379, + "grad_norm": 0.6159957051277161, + "learning_rate": 0.00022828575119151134, + "loss": 6.1741, + "step": 1675 + }, + { + "epoch": 0.5758708070265859, + "grad_norm": 0.5439791083335876, + "learning_rate": 0.00022798701546166024, + "loss": 6.2161, + "step": 1676 + }, + { + "epoch": 0.576214405360134, + "grad_norm": 0.6240994930267334, + "learning_rate": 0.00022768831140615285, + "loss": 6.2162, + "step": 1677 + }, + { + "epoch": 0.576558003693682, + "grad_norm": 0.5322961807250977, + "learning_rate": 0.0002273896394547924, + "loss": 6.1087, + "step": 1678 + }, + { + "epoch": 0.5769016020272302, + "grad_norm": 0.6332693696022034, + "learning_rate": 0.00022709100003733636, + "loss": 6.1368, + "step": 1679 + }, + { + "epoch": 0.5772452003607782, + "grad_norm": 0.6597732901573181, + "learning_rate": 0.0002267923935834949, + "loss": 6.1741, + "step": 1680 + }, + { + "epoch": 0.5775887986943263, + "grad_norm": 0.6494370102882385, + "learning_rate": 0.0002264938205229309, + "loss": 6.1665, + "step": 1681 + }, + { + "epoch": 0.5779323970278745, + "grad_norm": 0.565851628780365, + "learning_rate": 0.0002261952812852594, + "loss": 6.1571, + "step": 1682 + }, + { + "epoch": 0.5782759953614225, + "grad_norm": 0.4857479929924011, + "learning_rate": 0.0002258967763000465, + "loss": 6.2028, + "step": 1683 + }, + { + "epoch": 0.5786195936949706, + "grad_norm": 0.6660141944885254, + "learning_rate": 0.00022559830599680914, + "loss": 6.1034, + "step": 1684 + }, + { + "epoch": 0.5789631920285186, + "grad_norm": 0.7688015103340149, + "learning_rate": 0.0002252998708050144, + "loss": 6.1557, + "step": 1685 + }, + { + "epoch": 0.5793067903620668, + "grad_norm": 0.5798430442810059, + "learning_rate": 0.0002250014711540788, + "loss": 6.1569, + "step": 1686 + }, + { + "epoch": 0.5796503886956148, + "grad_norm": 0.8817176222801208, + "learning_rate": 0.0002247031074733675, + "loss": 6.0853, + "step": 1687 + }, + { + "epoch": 0.5799939870291629, + "grad_norm": 0.6133841872215271, + "learning_rate": 0.00022440478019219437, + "loss": 6.1425, + "step": 1688 + }, + { + "epoch": 0.580337585362711, + "grad_norm": 0.7280254364013672, + "learning_rate": 0.00022410648973982057, + "loss": 6.1649, + "step": 1689 + }, + { + "epoch": 0.5806811836962591, + "grad_norm": 0.7156420946121216, + "learning_rate": 0.00022380823654545416, + "loss": 6.0785, + "step": 1690 + }, + { + "epoch": 0.5810247820298071, + "grad_norm": 0.7227360010147095, + "learning_rate": 0.00022351002103825003, + "loss": 6.1744, + "step": 1691 + }, + { + "epoch": 0.5813683803633553, + "grad_norm": 0.6410605311393738, + "learning_rate": 0.00022321184364730847, + "loss": 6.2956, + "step": 1692 + }, + { + "epoch": 0.5817119786969033, + "grad_norm": 0.7240776419639587, + "learning_rate": 0.00022291370480167485, + "loss": 6.2184, + "step": 1693 + }, + { + "epoch": 0.5820555770304514, + "grad_norm": 0.7092165946960449, + "learning_rate": 0.0002226156049303396, + "loss": 6.3391, + "step": 1694 + }, + { + "epoch": 0.5823991753639994, + "grad_norm": 0.692663311958313, + "learning_rate": 0.00022231754446223656, + "loss": 6.1169, + "step": 1695 + }, + { + "epoch": 0.5827427736975476, + "grad_norm": 0.6744980812072754, + "learning_rate": 0.00022201952382624294, + "loss": 6.2275, + "step": 1696 + }, + { + "epoch": 0.5830863720310957, + "grad_norm": 0.6746431589126587, + "learning_rate": 0.00022172154345117894, + "loss": 6.2227, + "step": 1697 + }, + { + "epoch": 0.5834299703646437, + "grad_norm": 0.9164922833442688, + "learning_rate": 0.0002214236037658065, + "loss": 6.2579, + "step": 1698 + }, + { + "epoch": 0.5837735686981919, + "grad_norm": 0.8660106658935547, + "learning_rate": 0.00022112570519882923, + "loss": 6.2453, + "step": 1699 + }, + { + "epoch": 0.5841171670317399, + "grad_norm": 1.112679123878479, + "learning_rate": 0.00022082784817889148, + "loss": 6.1501, + "step": 1700 + }, + { + "epoch": 0.584460765365288, + "grad_norm": 1.1230827569961548, + "learning_rate": 0.00022053003313457763, + "loss": 6.0107, + "step": 1701 + }, + { + "epoch": 0.584804363698836, + "grad_norm": 1.0337865352630615, + "learning_rate": 0.00022023226049441218, + "loss": 6.1094, + "step": 1702 + }, + { + "epoch": 0.5851479620323842, + "grad_norm": 0.7783079743385315, + "learning_rate": 0.0002199345306868581, + "loss": 5.9931, + "step": 1703 + }, + { + "epoch": 0.5854915603659322, + "grad_norm": 0.6345077157020569, + "learning_rate": 0.0002196368441403168, + "loss": 5.9898, + "step": 1704 + }, + { + "epoch": 0.5858351586994803, + "grad_norm": 0.8554926514625549, + "learning_rate": 0.00021933920128312784, + "loss": 6.1712, + "step": 1705 + }, + { + "epoch": 0.5861787570330284, + "grad_norm": 0.8038747906684875, + "learning_rate": 0.0002190416025435675, + "loss": 6.1559, + "step": 1706 + }, + { + "epoch": 0.5865223553665765, + "grad_norm": 0.63972008228302, + "learning_rate": 0.0002187440483498486, + "loss": 6.1405, + "step": 1707 + }, + { + "epoch": 0.5868659537001245, + "grad_norm": 0.7269260883331299, + "learning_rate": 0.00021844653913012026, + "loss": 5.9317, + "step": 1708 + }, + { + "epoch": 0.5872095520336726, + "grad_norm": 0.6801608204841614, + "learning_rate": 0.00021814907531246642, + "loss": 6.08, + "step": 1709 + }, + { + "epoch": 0.5875531503672207, + "grad_norm": 0.6652011275291443, + "learning_rate": 0.0002178516573249058, + "loss": 6.1866, + "step": 1710 + }, + { + "epoch": 0.5878967487007688, + "grad_norm": 0.7540636658668518, + "learning_rate": 0.00021755428559539145, + "loss": 6.1272, + "step": 1711 + }, + { + "epoch": 0.5882403470343169, + "grad_norm": 0.7460317611694336, + "learning_rate": 0.0002172569605518096, + "loss": 6.0443, + "step": 1712 + }, + { + "epoch": 0.588583945367865, + "grad_norm": 0.5961695909500122, + "learning_rate": 0.00021695968262197928, + "loss": 5.9874, + "step": 1713 + }, + { + "epoch": 0.5889275437014131, + "grad_norm": 0.7832739353179932, + "learning_rate": 0.00021666245223365193, + "loss": 6.1076, + "step": 1714 + }, + { + "epoch": 0.5892711420349611, + "grad_norm": 0.7737155556678772, + "learning_rate": 0.00021636526981451038, + "loss": 6.0668, + "step": 1715 + }, + { + "epoch": 0.5896147403685092, + "grad_norm": 0.7129299640655518, + "learning_rate": 0.00021606813579216856, + "loss": 6.2231, + "step": 1716 + }, + { + "epoch": 0.5899583387020573, + "grad_norm": 0.7148540019989014, + "learning_rate": 0.00021577105059417077, + "loss": 6.0411, + "step": 1717 + }, + { + "epoch": 0.5903019370356054, + "grad_norm": 0.7563623785972595, + "learning_rate": 0.000215474014647991, + "loss": 6.2887, + "step": 1718 + }, + { + "epoch": 0.5906455353691534, + "grad_norm": 0.569948136806488, + "learning_rate": 0.00021517702838103224, + "loss": 6.1062, + "step": 1719 + }, + { + "epoch": 0.5909891337027016, + "grad_norm": 0.5353204011917114, + "learning_rate": 0.00021488009222062637, + "loss": 6.1717, + "step": 1720 + }, + { + "epoch": 0.5913327320362496, + "grad_norm": 0.592048168182373, + "learning_rate": 0.00021458320659403289, + "loss": 6.1654, + "step": 1721 + }, + { + "epoch": 0.5916763303697977, + "grad_norm": 0.526983380317688, + "learning_rate": 0.00021428637192843843, + "loss": 6.0983, + "step": 1722 + }, + { + "epoch": 0.5920199287033457, + "grad_norm": 0.5798513889312744, + "learning_rate": 0.00021398958865095682, + "loss": 6.1792, + "step": 1723 + }, + { + "epoch": 0.5923635270368939, + "grad_norm": 0.5458791255950928, + "learning_rate": 0.0002136928571886275, + "loss": 6.1895, + "step": 1724 + }, + { + "epoch": 0.5927071253704419, + "grad_norm": 0.5700400471687317, + "learning_rate": 0.00021339617796841534, + "loss": 6.1609, + "step": 1725 + }, + { + "epoch": 0.59305072370399, + "grad_norm": 0.4680607318878174, + "learning_rate": 0.00021309955141721044, + "loss": 6.1472, + "step": 1726 + }, + { + "epoch": 0.5933943220375382, + "grad_norm": 0.5086695551872253, + "learning_rate": 0.00021280297796182667, + "loss": 6.236, + "step": 1727 + }, + { + "epoch": 0.5937379203710862, + "grad_norm": 0.4671761691570282, + "learning_rate": 0.00021250645802900183, + "loss": 6.1338, + "step": 1728 + }, + { + "epoch": 0.5940815187046343, + "grad_norm": 0.5281257033348083, + "learning_rate": 0.0002122099920453965, + "loss": 6.1005, + "step": 1729 + }, + { + "epoch": 0.5944251170381823, + "grad_norm": 0.5001750588417053, + "learning_rate": 0.00021191358043759368, + "loss": 6.162, + "step": 1730 + }, + { + "epoch": 0.5947687153717305, + "grad_norm": 0.46611008048057556, + "learning_rate": 0.0002116172236320982, + "loss": 6.0662, + "step": 1731 + }, + { + "epoch": 0.5951123137052785, + "grad_norm": 0.5261972546577454, + "learning_rate": 0.00021132092205533598, + "loss": 6.2476, + "step": 1732 + }, + { + "epoch": 0.5954559120388266, + "grad_norm": 0.557052731513977, + "learning_rate": 0.00021102467613365336, + "loss": 6.0415, + "step": 1733 + }, + { + "epoch": 0.5957995103723747, + "grad_norm": 0.6007571816444397, + "learning_rate": 0.00021072848629331693, + "loss": 6.1562, + "step": 1734 + }, + { + "epoch": 0.5961431087059228, + "grad_norm": 0.5048114061355591, + "learning_rate": 0.00021043235296051225, + "loss": 6.1526, + "step": 1735 + }, + { + "epoch": 0.5964867070394708, + "grad_norm": 0.5392985939979553, + "learning_rate": 0.0002101362765613436, + "loss": 6.1723, + "step": 1736 + }, + { + "epoch": 0.596830305373019, + "grad_norm": 0.6697172522544861, + "learning_rate": 0.00020984025752183365, + "loss": 6.1767, + "step": 1737 + }, + { + "epoch": 0.597173903706567, + "grad_norm": 0.5006392598152161, + "learning_rate": 0.00020954429626792215, + "loss": 6.1836, + "step": 1738 + }, + { + "epoch": 0.5975175020401151, + "grad_norm": 0.665244996547699, + "learning_rate": 0.00020924839322546585, + "loss": 6.1117, + "step": 1739 + }, + { + "epoch": 0.5978611003736631, + "grad_norm": 0.6159089207649231, + "learning_rate": 0.00020895254882023791, + "loss": 6.1428, + "step": 1740 + }, + { + "epoch": 0.5982046987072113, + "grad_norm": 0.5167410969734192, + "learning_rate": 0.00020865676347792692, + "loss": 6.1017, + "step": 1741 + }, + { + "epoch": 0.5985482970407594, + "grad_norm": 0.6677618622779846, + "learning_rate": 0.0002083610376241364, + "loss": 6.1412, + "step": 1742 + }, + { + "epoch": 0.5988918953743074, + "grad_norm": 0.6453651785850525, + "learning_rate": 0.00020806537168438456, + "loss": 6.2316, + "step": 1743 + }, + { + "epoch": 0.5992354937078556, + "grad_norm": 0.7127484679222107, + "learning_rate": 0.00020776976608410317, + "loss": 6.1927, + "step": 1744 + }, + { + "epoch": 0.5995790920414036, + "grad_norm": 0.8041994571685791, + "learning_rate": 0.00020747422124863725, + "loss": 6.2754, + "step": 1745 + }, + { + "epoch": 0.5999226903749517, + "grad_norm": 0.6847991347312927, + "learning_rate": 0.00020717873760324443, + "loss": 6.2191, + "step": 1746 + }, + { + "epoch": 0.6002662887084997, + "grad_norm": 0.7265855073928833, + "learning_rate": 0.0002068833155730942, + "loss": 6.2995, + "step": 1747 + }, + { + "epoch": 0.6006098870420479, + "grad_norm": 0.7762460708618164, + "learning_rate": 0.00020658795558326743, + "loss": 6.2802, + "step": 1748 + }, + { + "epoch": 0.6009534853755959, + "grad_norm": 0.972196102142334, + "learning_rate": 0.00020629265805875585, + "loss": 6.3219, + "step": 1749 + }, + { + "epoch": 0.601297083709144, + "grad_norm": 1.1169794797897339, + "learning_rate": 0.0002059974234244611, + "loss": 6.2547, + "step": 1750 + }, + { + "epoch": 0.601640682042692, + "grad_norm": 0.9174626469612122, + "learning_rate": 0.00020570225210519432, + "loss": 6.1758, + "step": 1751 + }, + { + "epoch": 0.6019842803762402, + "grad_norm": 0.9274932146072388, + "learning_rate": 0.00020540714452567589, + "loss": 6.0432, + "step": 1752 + }, + { + "epoch": 0.6023278787097882, + "grad_norm": 0.7927635908126831, + "learning_rate": 0.000205112101110534, + "loss": 6.0737, + "step": 1753 + }, + { + "epoch": 0.6026714770433363, + "grad_norm": 0.7221388220787048, + "learning_rate": 0.00020481712228430493, + "loss": 6.1436, + "step": 1754 + }, + { + "epoch": 0.6030150753768844, + "grad_norm": 0.8973913192749023, + "learning_rate": 0.00020452220847143177, + "loss": 6.0908, + "step": 1755 + }, + { + "epoch": 0.6033586737104325, + "grad_norm": 1.0738279819488525, + "learning_rate": 0.00020422736009626405, + "loss": 6.1207, + "step": 1756 + }, + { + "epoch": 0.6037022720439806, + "grad_norm": 0.872310221195221, + "learning_rate": 0.00020393257758305738, + "loss": 6.0342, + "step": 1757 + }, + { + "epoch": 0.6040458703775287, + "grad_norm": 0.7840533256530762, + "learning_rate": 0.00020363786135597236, + "loss": 6.0474, + "step": 1758 + }, + { + "epoch": 0.6043894687110768, + "grad_norm": 0.6296777725219727, + "learning_rate": 0.00020334321183907428, + "loss": 6.1445, + "step": 1759 + }, + { + "epoch": 0.6047330670446248, + "grad_norm": 0.9561694860458374, + "learning_rate": 0.0002030486294563325, + "loss": 6.017, + "step": 1760 + }, + { + "epoch": 0.605076665378173, + "grad_norm": 0.6782475113868713, + "learning_rate": 0.0002027541146316197, + "loss": 6.2128, + "step": 1761 + }, + { + "epoch": 0.605420263711721, + "grad_norm": 0.8592099547386169, + "learning_rate": 0.00020245966778871145, + "loss": 5.9947, + "step": 1762 + }, + { + "epoch": 0.6057638620452691, + "grad_norm": 0.681970477104187, + "learning_rate": 0.00020216528935128542, + "loss": 5.98, + "step": 1763 + }, + { + "epoch": 0.6061074603788171, + "grad_norm": 0.7359707355499268, + "learning_rate": 0.00020187097974292087, + "loss": 6.0558, + "step": 1764 + }, + { + "epoch": 0.6064510587123653, + "grad_norm": 0.6512293815612793, + "learning_rate": 0.00020157673938709793, + "loss": 6.0842, + "step": 1765 + }, + { + "epoch": 0.6067946570459133, + "grad_norm": 0.7809913158416748, + "learning_rate": 0.00020128256870719736, + "loss": 6.0179, + "step": 1766 + }, + { + "epoch": 0.6071382553794614, + "grad_norm": 0.6424116492271423, + "learning_rate": 0.0002009884681264994, + "loss": 6.089, + "step": 1767 + }, + { + "epoch": 0.6074818537130094, + "grad_norm": 0.46616989374160767, + "learning_rate": 0.00020069443806818339, + "loss": 6.1271, + "step": 1768 + }, + { + "epoch": 0.6078254520465576, + "grad_norm": 0.6025314331054688, + "learning_rate": 0.00020040047895532754, + "loss": 6.153, + "step": 1769 + }, + { + "epoch": 0.6081690503801056, + "grad_norm": 0.7727947235107422, + "learning_rate": 0.00020010659121090765, + "loss": 6.0856, + "step": 1770 + }, + { + "epoch": 0.6085126487136537, + "grad_norm": 0.6243574619293213, + "learning_rate": 0.00019981277525779682, + "loss": 6.1613, + "step": 1771 + }, + { + "epoch": 0.6088562470472019, + "grad_norm": 0.5759406685829163, + "learning_rate": 0.00019951903151876516, + "loss": 6.1344, + "step": 1772 + }, + { + "epoch": 0.6091998453807499, + "grad_norm": 0.5405790209770203, + "learning_rate": 0.00019922536041647854, + "loss": 6.0394, + "step": 1773 + }, + { + "epoch": 0.609543443714298, + "grad_norm": 0.6683335304260254, + "learning_rate": 0.00019893176237349838, + "loss": 6.055, + "step": 1774 + }, + { + "epoch": 0.609887042047846, + "grad_norm": 0.6566500067710876, + "learning_rate": 0.00019863823781228127, + "loss": 5.9496, + "step": 1775 + }, + { + "epoch": 0.6102306403813942, + "grad_norm": 0.4817597270011902, + "learning_rate": 0.00019834478715517767, + "loss": 6.0593, + "step": 1776 + }, + { + "epoch": 0.6105742387149422, + "grad_norm": 0.5774630904197693, + "learning_rate": 0.00019805141082443188, + "loss": 6.1018, + "step": 1777 + }, + { + "epoch": 0.6109178370484903, + "grad_norm": 0.6785969138145447, + "learning_rate": 0.00019775810924218125, + "loss": 6.199, + "step": 1778 + }, + { + "epoch": 0.6112614353820384, + "grad_norm": 0.7792773842811584, + "learning_rate": 0.0001974648828304556, + "loss": 6.09, + "step": 1779 + }, + { + "epoch": 0.6116050337155865, + "grad_norm": 0.5020087361335754, + "learning_rate": 0.0001971717320111765, + "loss": 6.0291, + "step": 1780 + }, + { + "epoch": 0.6119486320491345, + "grad_norm": 0.764399528503418, + "learning_rate": 0.0001968786572061569, + "loss": 6.0347, + "step": 1781 + }, + { + "epoch": 0.6122922303826827, + "grad_norm": 0.656197726726532, + "learning_rate": 0.00019658565883710005, + "loss": 6.0684, + "step": 1782 + }, + { + "epoch": 0.6126358287162307, + "grad_norm": 0.6660633683204651, + "learning_rate": 0.00019629273732559973, + "loss": 6.0719, + "step": 1783 + }, + { + "epoch": 0.6129794270497788, + "grad_norm": 0.6405670642852783, + "learning_rate": 0.0001959998930931387, + "loss": 6.0928, + "step": 1784 + }, + { + "epoch": 0.6133230253833268, + "grad_norm": 0.6059765219688416, + "learning_rate": 0.0001957071265610886, + "loss": 6.2629, + "step": 1785 + }, + { + "epoch": 0.613666623716875, + "grad_norm": 0.6842978000640869, + "learning_rate": 0.00019541443815070952, + "loss": 6.3053, + "step": 1786 + }, + { + "epoch": 0.6140102220504231, + "grad_norm": 0.6792712211608887, + "learning_rate": 0.00019512182828314883, + "loss": 6.1212, + "step": 1787 + }, + { + "epoch": 0.6143538203839711, + "grad_norm": 0.6276842951774597, + "learning_rate": 0.00019482929737944094, + "loss": 6.1975, + "step": 1788 + }, + { + "epoch": 0.6146974187175193, + "grad_norm": 0.6486376523971558, + "learning_rate": 0.00019453684586050692, + "loss": 6.0693, + "step": 1789 + }, + { + "epoch": 0.6150410170510673, + "grad_norm": 0.6076637506484985, + "learning_rate": 0.00019424447414715323, + "loss": 6.1387, + "step": 1790 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.7046791911125183, + "learning_rate": 0.0001939521826600717, + "loss": 6.1665, + "step": 1791 + }, + { + "epoch": 0.6157282137181634, + "grad_norm": 0.7905498147010803, + "learning_rate": 0.00019365997181983874, + "loss": 6.2077, + "step": 1792 + }, + { + "epoch": 0.6160718120517116, + "grad_norm": 0.6241534352302551, + "learning_rate": 0.0001933678420469147, + "loss": 6.1993, + "step": 1793 + }, + { + "epoch": 0.6164154103852596, + "grad_norm": 0.7369571328163147, + "learning_rate": 0.00019307579376164313, + "loss": 6.2529, + "step": 1794 + }, + { + "epoch": 0.6167590087188077, + "grad_norm": 0.8129594326019287, + "learning_rate": 0.00019278382738425063, + "loss": 6.305, + "step": 1795 + }, + { + "epoch": 0.6171026070523558, + "grad_norm": 0.750092089176178, + "learning_rate": 0.00019249194333484566, + "loss": 6.3213, + "step": 1796 + }, + { + "epoch": 0.6174462053859039, + "grad_norm": 0.9499820470809937, + "learning_rate": 0.00019220014203341824, + "loss": 6.2478, + "step": 1797 + }, + { + "epoch": 0.6177898037194519, + "grad_norm": 0.764100193977356, + "learning_rate": 0.0001919084238998396, + "loss": 6.2935, + "step": 1798 + }, + { + "epoch": 0.618133402053, + "grad_norm": 0.8200209140777588, + "learning_rate": 0.00019161678935386098, + "loss": 6.2591, + "step": 1799 + }, + { + "epoch": 0.6184770003865481, + "grad_norm": 1.2245675325393677, + "learning_rate": 0.00019132523881511344, + "loss": 6.0823, + "step": 1800 + }, + { + "epoch": 0.6188205987200962, + "grad_norm": 0.81374192237854, + "learning_rate": 0.0001910337727031074, + "loss": 6.0214, + "step": 1801 + }, + { + "epoch": 0.6191641970536443, + "grad_norm": 0.6380704641342163, + "learning_rate": 0.00019074239143723144, + "loss": 5.9451, + "step": 1802 + }, + { + "epoch": 0.6195077953871924, + "grad_norm": 0.7890001535415649, + "learning_rate": 0.00019045109543675215, + "loss": 6.1033, + "step": 1803 + }, + { + "epoch": 0.6198513937207405, + "grad_norm": 0.7154576778411865, + "learning_rate": 0.00019015988512081369, + "loss": 6.0945, + "step": 1804 + }, + { + "epoch": 0.6201949920542885, + "grad_norm": 0.6763778924942017, + "learning_rate": 0.00018986876090843667, + "loss": 5.9776, + "step": 1805 + }, + { + "epoch": 0.6205385903878367, + "grad_norm": 0.6738643050193787, + "learning_rate": 0.00018957772321851767, + "loss": 6.0804, + "step": 1806 + }, + { + "epoch": 0.6208821887213847, + "grad_norm": 0.7186247706413269, + "learning_rate": 0.0001892867724698293, + "loss": 5.9855, + "step": 1807 + }, + { + "epoch": 0.6212257870549328, + "grad_norm": 0.556603729724884, + "learning_rate": 0.00018899590908101851, + "loss": 6.1443, + "step": 1808 + }, + { + "epoch": 0.6215693853884808, + "grad_norm": 0.5243987441062927, + "learning_rate": 0.0001887051334706069, + "loss": 6.032, + "step": 1809 + }, + { + "epoch": 0.621912983722029, + "grad_norm": 0.7323964834213257, + "learning_rate": 0.0001884144460569896, + "loss": 6.0841, + "step": 1810 + }, + { + "epoch": 0.622256582055577, + "grad_norm": 0.6673109531402588, + "learning_rate": 0.00018812384725843488, + "loss": 6.0665, + "step": 1811 + }, + { + "epoch": 0.6226001803891251, + "grad_norm": 0.5664012432098389, + "learning_rate": 0.00018783333749308357, + "loss": 6.0865, + "step": 1812 + }, + { + "epoch": 0.6229437787226731, + "grad_norm": 0.6237545013427734, + "learning_rate": 0.00018754291717894826, + "loss": 6.1261, + "step": 1813 + }, + { + "epoch": 0.6232873770562213, + "grad_norm": 0.7225657105445862, + "learning_rate": 0.00018725258673391281, + "loss": 6.1274, + "step": 1814 + }, + { + "epoch": 0.6236309753897693, + "grad_norm": 0.5414710640907288, + "learning_rate": 0.00018696234657573208, + "loss": 6.1915, + "step": 1815 + }, + { + "epoch": 0.6239745737233174, + "grad_norm": 0.587091326713562, + "learning_rate": 0.00018667219712203064, + "loss": 6.088, + "step": 1816 + }, + { + "epoch": 0.6243181720568656, + "grad_norm": 0.6084412932395935, + "learning_rate": 0.00018638213879030265, + "loss": 6.1797, + "step": 1817 + }, + { + "epoch": 0.6246617703904136, + "grad_norm": 0.6020777821540833, + "learning_rate": 0.00018609217199791136, + "loss": 6.1391, + "step": 1818 + }, + { + "epoch": 0.6250053687239617, + "grad_norm": 0.5605757832527161, + "learning_rate": 0.00018580229716208806, + "loss": 6.1469, + "step": 1819 + }, + { + "epoch": 0.6253489670575098, + "grad_norm": 0.634785532951355, + "learning_rate": 0.00018551251469993175, + "loss": 6.044, + "step": 1820 + }, + { + "epoch": 0.6256925653910579, + "grad_norm": 0.5692383050918579, + "learning_rate": 0.00018522282502840873, + "loss": 6.1452, + "step": 1821 + }, + { + "epoch": 0.6260361637246059, + "grad_norm": 0.5231955051422119, + "learning_rate": 0.00018493322856435155, + "loss": 6.0816, + "step": 1822 + }, + { + "epoch": 0.626379762058154, + "grad_norm": 0.5089519619941711, + "learning_rate": 0.00018464372572445865, + "loss": 6.1039, + "step": 1823 + }, + { + "epoch": 0.6267233603917021, + "grad_norm": 0.5622355341911316, + "learning_rate": 0.00018435431692529386, + "loss": 6.1048, + "step": 1824 + }, + { + "epoch": 0.6270669587252502, + "grad_norm": 0.543976366519928, + "learning_rate": 0.0001840650025832858, + "loss": 6.132, + "step": 1825 + }, + { + "epoch": 0.6274105570587982, + "grad_norm": 0.609881579875946, + "learning_rate": 0.00018377578311472683, + "loss": 6.0715, + "step": 1826 + }, + { + "epoch": 0.6277541553923464, + "grad_norm": 0.44123703241348267, + "learning_rate": 0.0001834866589357732, + "loss": 6.1373, + "step": 1827 + }, + { + "epoch": 0.6280977537258944, + "grad_norm": 0.6479792594909668, + "learning_rate": 0.0001831976304624438, + "loss": 6.1722, + "step": 1828 + }, + { + "epoch": 0.6284413520594425, + "grad_norm": 0.4903275966644287, + "learning_rate": 0.00018290869811061968, + "loss": 6.2031, + "step": 1829 + }, + { + "epoch": 0.6287849503929906, + "grad_norm": 0.4645865261554718, + "learning_rate": 0.00018261986229604402, + "loss": 6.1344, + "step": 1830 + }, + { + "epoch": 0.6291285487265387, + "grad_norm": 0.5052070617675781, + "learning_rate": 0.00018233112343432077, + "loss": 6.0615, + "step": 1831 + }, + { + "epoch": 0.6294721470600868, + "grad_norm": 0.6264731287956238, + "learning_rate": 0.00018204248194091428, + "loss": 6.1618, + "step": 1832 + }, + { + "epoch": 0.6298157453936348, + "grad_norm": 0.5955216884613037, + "learning_rate": 0.0001817539382311492, + "loss": 6.1757, + "step": 1833 + }, + { + "epoch": 0.630159343727183, + "grad_norm": 0.523066520690918, + "learning_rate": 0.00018146549272020918, + "loss": 6.1458, + "step": 1834 + }, + { + "epoch": 0.630502942060731, + "grad_norm": 0.583475649356842, + "learning_rate": 0.0001811771458231365, + "loss": 6.1725, + "step": 1835 + }, + { + "epoch": 0.6308465403942791, + "grad_norm": 0.6626928448677063, + "learning_rate": 0.00018088889795483184, + "loss": 6.1341, + "step": 1836 + }, + { + "epoch": 0.6311901387278271, + "grad_norm": 0.6540782451629639, + "learning_rate": 0.00018060074953005307, + "loss": 6.2446, + "step": 1837 + }, + { + "epoch": 0.6315337370613753, + "grad_norm": 0.6838895082473755, + "learning_rate": 0.00018031270096341534, + "loss": 6.1862, + "step": 1838 + }, + { + "epoch": 0.6318773353949233, + "grad_norm": 0.6299639940261841, + "learning_rate": 0.00018002475266938977, + "loss": 6.19, + "step": 1839 + }, + { + "epoch": 0.6322209337284714, + "grad_norm": 0.6744266152381897, + "learning_rate": 0.0001797369050623033, + "loss": 6.2666, + "step": 1840 + }, + { + "epoch": 0.6325645320620195, + "grad_norm": 0.5881931185722351, + "learning_rate": 0.0001794491585563381, + "loss": 6.0901, + "step": 1841 + }, + { + "epoch": 0.6329081303955676, + "grad_norm": 0.5639910697937012, + "learning_rate": 0.00017916151356553073, + "loss": 6.0917, + "step": 1842 + }, + { + "epoch": 0.6332517287291156, + "grad_norm": 0.8729662895202637, + "learning_rate": 0.0001788739705037718, + "loss": 6.1398, + "step": 1843 + }, + { + "epoch": 0.6335953270626637, + "grad_norm": 0.6799281239509583, + "learning_rate": 0.00017858652978480516, + "loss": 6.1564, + "step": 1844 + }, + { + "epoch": 0.6339389253962119, + "grad_norm": 0.7408438324928284, + "learning_rate": 0.00017829919182222752, + "loss": 6.1424, + "step": 1845 + }, + { + "epoch": 0.6342825237297599, + "grad_norm": 0.6003269553184509, + "learning_rate": 0.00017801195702948742, + "loss": 6.1305, + "step": 1846 + }, + { + "epoch": 0.634626122063308, + "grad_norm": 0.8122835755348206, + "learning_rate": 0.00017772482581988544, + "loss": 6.2311, + "step": 1847 + }, + { + "epoch": 0.6349697203968561, + "grad_norm": 0.8266429901123047, + "learning_rate": 0.0001774377986065728, + "loss": 6.1746, + "step": 1848 + }, + { + "epoch": 0.6353133187304042, + "grad_norm": 0.9495203495025635, + "learning_rate": 0.0001771508758025509, + "loss": 6.2391, + "step": 1849 + }, + { + "epoch": 0.6356569170639522, + "grad_norm": 1.0640305280685425, + "learning_rate": 0.0001768640578206715, + "loss": 6.1779, + "step": 1850 + }, + { + "epoch": 0.6360005153975004, + "grad_norm": 1.1024693250656128, + "learning_rate": 0.00017657734507363498, + "loss": 5.9968, + "step": 1851 + }, + { + "epoch": 0.6363441137310484, + "grad_norm": 1.1050010919570923, + "learning_rate": 0.00017629073797399036, + "loss": 6.1062, + "step": 1852 + }, + { + "epoch": 0.6366877120645965, + "grad_norm": 0.9328210949897766, + "learning_rate": 0.00017600423693413508, + "loss": 6.0665, + "step": 1853 + }, + { + "epoch": 0.6370313103981445, + "grad_norm": 0.6804933547973633, + "learning_rate": 0.00017571784236631351, + "loss": 5.9694, + "step": 1854 + }, + { + "epoch": 0.6373749087316927, + "grad_norm": 0.836593747138977, + "learning_rate": 0.00017543155468261696, + "loss": 5.8933, + "step": 1855 + }, + { + "epoch": 0.6377185070652407, + "grad_norm": 0.7389679551124573, + "learning_rate": 0.00017514537429498295, + "loss": 6.0701, + "step": 1856 + }, + { + "epoch": 0.6380621053987888, + "grad_norm": 0.769934356212616, + "learning_rate": 0.0001748593016151947, + "loss": 6.0735, + "step": 1857 + }, + { + "epoch": 0.6384057037323368, + "grad_norm": 0.7202440500259399, + "learning_rate": 0.00017457333705488026, + "loss": 5.9477, + "step": 1858 + }, + { + "epoch": 0.638749302065885, + "grad_norm": 0.7156488299369812, + "learning_rate": 0.00017428748102551236, + "loss": 6.1103, + "step": 1859 + }, + { + "epoch": 0.6390929003994331, + "grad_norm": 0.7631789445877075, + "learning_rate": 0.00017400173393840735, + "loss": 6.0724, + "step": 1860 + }, + { + "epoch": 0.6394364987329811, + "grad_norm": 0.6912445425987244, + "learning_rate": 0.00017371609620472477, + "loss": 5.9004, + "step": 1861 + }, + { + "epoch": 0.6397800970665293, + "grad_norm": 0.6902914047241211, + "learning_rate": 0.00017343056823546725, + "loss": 6.1013, + "step": 1862 + }, + { + "epoch": 0.6401236954000773, + "grad_norm": 0.5509893894195557, + "learning_rate": 0.00017314515044147884, + "loss": 6.0479, + "step": 1863 + }, + { + "epoch": 0.6404672937336254, + "grad_norm": 0.5896221995353699, + "learning_rate": 0.00017285984323344567, + "loss": 6.0818, + "step": 1864 + }, + { + "epoch": 0.6408108920671735, + "grad_norm": 0.6542482972145081, + "learning_rate": 0.00017257464702189433, + "loss": 6.1131, + "step": 1865 + }, + { + "epoch": 0.6411544904007216, + "grad_norm": 0.5327386856079102, + "learning_rate": 0.00017228956221719178, + "loss": 6.1203, + "step": 1866 + }, + { + "epoch": 0.6414980887342696, + "grad_norm": 0.5413427352905273, + "learning_rate": 0.00017200458922954486, + "loss": 6.2124, + "step": 1867 + }, + { + "epoch": 0.6418416870678177, + "grad_norm": 0.4703226685523987, + "learning_rate": 0.00017171972846899942, + "loss": 6.1323, + "step": 1868 + }, + { + "epoch": 0.6421852854013658, + "grad_norm": 0.4837634563446045, + "learning_rate": 0.00017143498034543958, + "loss": 6.0326, + "step": 1869 + }, + { + "epoch": 0.6425288837349139, + "grad_norm": 0.6183565258979797, + "learning_rate": 0.00017115034526858785, + "loss": 6.0834, + "step": 1870 + }, + { + "epoch": 0.6428724820684619, + "grad_norm": 0.6575108170509338, + "learning_rate": 0.00017086582364800375, + "loss": 6.0392, + "step": 1871 + }, + { + "epoch": 0.6432160804020101, + "grad_norm": 0.5329729318618774, + "learning_rate": 0.00017058141589308356, + "loss": 6.2179, + "step": 1872 + }, + { + "epoch": 0.6435596787355581, + "grad_norm": 0.5143067240715027, + "learning_rate": 0.0001702971224130599, + "loss": 5.9996, + "step": 1873 + }, + { + "epoch": 0.6439032770691062, + "grad_norm": 0.61328125, + "learning_rate": 0.0001700129436170008, + "loss": 6.1188, + "step": 1874 + }, + { + "epoch": 0.6442468754026544, + "grad_norm": 0.5242812037467957, + "learning_rate": 0.0001697288799138093, + "loss": 6.0961, + "step": 1875 + }, + { + "epoch": 0.6445904737362024, + "grad_norm": 0.45744621753692627, + "learning_rate": 0.00016944493171222296, + "loss": 6.0911, + "step": 1876 + }, + { + "epoch": 0.6449340720697505, + "grad_norm": 0.4443244934082031, + "learning_rate": 0.00016916109942081292, + "loss": 6.1137, + "step": 1877 + }, + { + "epoch": 0.6452776704032985, + "grad_norm": 0.668917179107666, + "learning_rate": 0.0001688773834479837, + "loss": 5.9674, + "step": 1878 + }, + { + "epoch": 0.6456212687368467, + "grad_norm": 0.5518603920936584, + "learning_rate": 0.00016859378420197246, + "loss": 6.0861, + "step": 1879 + }, + { + "epoch": 0.6459648670703947, + "grad_norm": 0.5576813817024231, + "learning_rate": 0.0001683103020908484, + "loss": 6.0854, + "step": 1880 + }, + { + "epoch": 0.6463084654039428, + "grad_norm": 0.5512084364891052, + "learning_rate": 0.00016802693752251187, + "loss": 5.9926, + "step": 1881 + }, + { + "epoch": 0.6466520637374908, + "grad_norm": 0.7495642304420471, + "learning_rate": 0.0001677436909046947, + "loss": 6.1177, + "step": 1882 + }, + { + "epoch": 0.646995662071039, + "grad_norm": 0.6886982917785645, + "learning_rate": 0.00016746056264495846, + "loss": 6.1576, + "step": 1883 + }, + { + "epoch": 0.647339260404587, + "grad_norm": 0.5660991072654724, + "learning_rate": 0.00016717755315069456, + "loss": 6.015, + "step": 1884 + }, + { + "epoch": 0.6476828587381351, + "grad_norm": 0.6272467374801636, + "learning_rate": 0.00016689466282912368, + "loss": 6.2081, + "step": 1885 + }, + { + "epoch": 0.6480264570716832, + "grad_norm": 0.6678543090820312, + "learning_rate": 0.0001666118920872949, + "loss": 6.1756, + "step": 1886 + }, + { + "epoch": 0.6483700554052313, + "grad_norm": 0.6672770977020264, + "learning_rate": 0.00016632924133208515, + "loss": 6.1607, + "step": 1887 + }, + { + "epoch": 0.6487136537387793, + "grad_norm": 0.8277221918106079, + "learning_rate": 0.00016604671097019885, + "loss": 6.0318, + "step": 1888 + }, + { + "epoch": 0.6490572520723275, + "grad_norm": 0.600071370601654, + "learning_rate": 0.00016576430140816716, + "loss": 6.1854, + "step": 1889 + }, + { + "epoch": 0.6494008504058756, + "grad_norm": 0.6369357705116272, + "learning_rate": 0.0001654820130523475, + "loss": 6.2219, + "step": 1890 + }, + { + "epoch": 0.6497444487394236, + "grad_norm": 0.6288464665412903, + "learning_rate": 0.00016519984630892264, + "loss": 6.0491, + "step": 1891 + }, + { + "epoch": 0.6500880470729717, + "grad_norm": 0.7976536750793457, + "learning_rate": 0.0001649178015839005, + "loss": 6.0651, + "step": 1892 + }, + { + "epoch": 0.6504316454065198, + "grad_norm": 0.7433154582977295, + "learning_rate": 0.00016463587928311363, + "loss": 6.2634, + "step": 1893 + }, + { + "epoch": 0.6507752437400679, + "grad_norm": 0.8158239722251892, + "learning_rate": 0.0001643540798122181, + "loss": 6.0948, + "step": 1894 + }, + { + "epoch": 0.6511188420736159, + "grad_norm": 0.9072747230529785, + "learning_rate": 0.00016407240357669333, + "loss": 6.1638, + "step": 1895 + }, + { + "epoch": 0.6514624404071641, + "grad_norm": 0.8244097828865051, + "learning_rate": 0.00016379085098184166, + "loss": 6.1314, + "step": 1896 + }, + { + "epoch": 0.6518060387407121, + "grad_norm": 0.7535924911499023, + "learning_rate": 0.0001635094224327872, + "loss": 6.1109, + "step": 1897 + }, + { + "epoch": 0.6521496370742602, + "grad_norm": 1.0342903137207031, + "learning_rate": 0.0001632281183344756, + "loss": 6.2378, + "step": 1898 + }, + { + "epoch": 0.6524932354078082, + "grad_norm": 0.8878806233406067, + "learning_rate": 0.00016294693909167378, + "loss": 6.2589, + "step": 1899 + }, + { + "epoch": 0.6528368337413564, + "grad_norm": 0.8649120330810547, + "learning_rate": 0.00016266588510896864, + "loss": 6.335, + "step": 1900 + }, + { + "epoch": 0.6531804320749044, + "grad_norm": 1.5248429775238037, + "learning_rate": 0.00016238495679076688, + "loss": 6.0537, + "step": 1901 + }, + { + "epoch": 0.6535240304084525, + "grad_norm": 1.4273432493209839, + "learning_rate": 0.00016210415454129463, + "loss": 5.9864, + "step": 1902 + }, + { + "epoch": 0.6538676287420006, + "grad_norm": 0.9812554717063904, + "learning_rate": 0.00016182347876459648, + "loss": 6.0068, + "step": 1903 + }, + { + "epoch": 0.6542112270755487, + "grad_norm": 0.7029387354850769, + "learning_rate": 0.00016154292986453485, + "loss": 6.1058, + "step": 1904 + }, + { + "epoch": 0.6545548254090968, + "grad_norm": 1.040303111076355, + "learning_rate": 0.0001612625082447899, + "loss": 6.0546, + "step": 1905 + }, + { + "epoch": 0.6548984237426448, + "grad_norm": 1.0513960123062134, + "learning_rate": 0.00016098221430885844, + "loss": 5.9321, + "step": 1906 + }, + { + "epoch": 0.655242022076193, + "grad_norm": 0.7275641560554504, + "learning_rate": 0.00016070204846005374, + "loss": 6.0139, + "step": 1907 + }, + { + "epoch": 0.655585620409741, + "grad_norm": 0.8591195940971375, + "learning_rate": 0.0001604220111015046, + "loss": 5.992, + "step": 1908 + }, + { + "epoch": 0.6559292187432891, + "grad_norm": 0.9006431698799133, + "learning_rate": 0.00016014210263615505, + "loss": 5.9939, + "step": 1909 + }, + { + "epoch": 0.6562728170768372, + "grad_norm": 0.611177384853363, + "learning_rate": 0.00015986232346676345, + "loss": 5.9714, + "step": 1910 + }, + { + "epoch": 0.6566164154103853, + "grad_norm": 0.555124819278717, + "learning_rate": 0.00015958267399590243, + "loss": 5.9323, + "step": 1911 + }, + { + "epoch": 0.6569600137439333, + "grad_norm": 0.6982326507568359, + "learning_rate": 0.0001593031546259578, + "loss": 5.9022, + "step": 1912 + }, + { + "epoch": 0.6573036120774814, + "grad_norm": 0.7364804744720459, + "learning_rate": 0.00015902376575912814, + "loss": 5.9534, + "step": 1913 + }, + { + "epoch": 0.6576472104110295, + "grad_norm": 0.6368590593338013, + "learning_rate": 0.0001587445077974244, + "loss": 6.0573, + "step": 1914 + }, + { + "epoch": 0.6579908087445776, + "grad_norm": 0.596760094165802, + "learning_rate": 0.00015846538114266912, + "loss": 6.1416, + "step": 1915 + }, + { + "epoch": 0.6583344070781256, + "grad_norm": 0.576379656791687, + "learning_rate": 0.00015818638619649567, + "loss": 6.0386, + "step": 1916 + }, + { + "epoch": 0.6586780054116738, + "grad_norm": 0.5851263999938965, + "learning_rate": 0.00015790752336034835, + "loss": 6.1689, + "step": 1917 + }, + { + "epoch": 0.6590216037452218, + "grad_norm": 0.6037353873252869, + "learning_rate": 0.00015762879303548094, + "loss": 6.1273, + "step": 1918 + }, + { + "epoch": 0.6593652020787699, + "grad_norm": 0.533187747001648, + "learning_rate": 0.00015735019562295688, + "loss": 5.9624, + "step": 1919 + }, + { + "epoch": 0.659708800412318, + "grad_norm": 0.6187282204627991, + "learning_rate": 0.00015707173152364816, + "loss": 6.1505, + "step": 1920 + }, + { + "epoch": 0.6600523987458661, + "grad_norm": 0.5342614054679871, + "learning_rate": 0.00015679340113823495, + "loss": 6.2201, + "step": 1921 + }, + { + "epoch": 0.6603959970794142, + "grad_norm": 0.536963939666748, + "learning_rate": 0.00015651520486720515, + "loss": 6.1204, + "step": 1922 + }, + { + "epoch": 0.6607395954129622, + "grad_norm": 0.5395856499671936, + "learning_rate": 0.00015623714311085364, + "loss": 5.9842, + "step": 1923 + }, + { + "epoch": 0.6610831937465104, + "grad_norm": 0.5402804613113403, + "learning_rate": 0.0001559592162692815, + "loss": 6.1283, + "step": 1924 + }, + { + "epoch": 0.6614267920800584, + "grad_norm": 0.4991236925125122, + "learning_rate": 0.00015568142474239622, + "loss": 5.9874, + "step": 1925 + }, + { + "epoch": 0.6617703904136065, + "grad_norm": 0.5533581376075745, + "learning_rate": 0.00015540376892991004, + "loss": 6.0261, + "step": 1926 + }, + { + "epoch": 0.6621139887471545, + "grad_norm": 0.549507737159729, + "learning_rate": 0.0001551262492313401, + "loss": 6.097, + "step": 1927 + }, + { + "epoch": 0.6624575870807027, + "grad_norm": 0.5345736742019653, + "learning_rate": 0.00015484886604600796, + "loss": 6.1086, + "step": 1928 + }, + { + "epoch": 0.6628011854142507, + "grad_norm": 0.4900307357311249, + "learning_rate": 0.0001545716197730384, + "loss": 6.0988, + "step": 1929 + }, + { + "epoch": 0.6631447837477988, + "grad_norm": 0.468268483877182, + "learning_rate": 0.00015429451081135922, + "loss": 6.0424, + "step": 1930 + }, + { + "epoch": 0.6634883820813469, + "grad_norm": 0.5394709706306458, + "learning_rate": 0.00015401753955970095, + "loss": 6.1305, + "step": 1931 + }, + { + "epoch": 0.663831980414895, + "grad_norm": 0.5436084866523743, + "learning_rate": 0.00015374070641659566, + "loss": 6.0656, + "step": 1932 + }, + { + "epoch": 0.664175578748443, + "grad_norm": 0.5724718570709229, + "learning_rate": 0.00015346401178037672, + "loss": 5.9819, + "step": 1933 + }, + { + "epoch": 0.6645191770819912, + "grad_norm": 0.5867025256156921, + "learning_rate": 0.00015318745604917848, + "loss": 6.1125, + "step": 1934 + }, + { + "epoch": 0.6648627754155393, + "grad_norm": 0.5411561131477356, + "learning_rate": 0.0001529110396209351, + "loss": 6.1245, + "step": 1935 + }, + { + "epoch": 0.6652063737490873, + "grad_norm": 0.5726490616798401, + "learning_rate": 0.0001526347628933804, + "loss": 6.1578, + "step": 1936 + }, + { + "epoch": 0.6655499720826354, + "grad_norm": 0.615553081035614, + "learning_rate": 0.00015235862626404727, + "loss": 6.0445, + "step": 1937 + }, + { + "epoch": 0.6658935704161835, + "grad_norm": 0.5673866868019104, + "learning_rate": 0.00015208263013026692, + "loss": 6.1863, + "step": 1938 + }, + { + "epoch": 0.6662371687497316, + "grad_norm": 0.5758171677589417, + "learning_rate": 0.00015180677488916845, + "loss": 6.1639, + "step": 1939 + }, + { + "epoch": 0.6665807670832796, + "grad_norm": 0.5809786319732666, + "learning_rate": 0.00015153106093767827, + "loss": 6.0779, + "step": 1940 + }, + { + "epoch": 0.6669243654168278, + "grad_norm": 0.6912338733673096, + "learning_rate": 0.00015125548867251935, + "loss": 6.134, + "step": 1941 + }, + { + "epoch": 0.6672679637503758, + "grad_norm": 0.7445138096809387, + "learning_rate": 0.0001509800584902108, + "loss": 6.1787, + "step": 1942 + }, + { + "epoch": 0.6676115620839239, + "grad_norm": 0.665468156337738, + "learning_rate": 0.00015070477078706757, + "loss": 6.1019, + "step": 1943 + }, + { + "epoch": 0.6679551604174719, + "grad_norm": 0.6977120041847229, + "learning_rate": 0.00015042962595919918, + "loss": 6.1073, + "step": 1944 + }, + { + "epoch": 0.6682987587510201, + "grad_norm": 0.6922124028205872, + "learning_rate": 0.00015015462440250997, + "loss": 6.21, + "step": 1945 + }, + { + "epoch": 0.6686423570845681, + "grad_norm": 0.6119797825813293, + "learning_rate": 0.00014987976651269788, + "loss": 6.154, + "step": 1946 + }, + { + "epoch": 0.6689859554181162, + "grad_norm": 0.6788223385810852, + "learning_rate": 0.000149605052685254, + "loss": 6.0747, + "step": 1947 + }, + { + "epoch": 0.6693295537516643, + "grad_norm": 0.9458711743354797, + "learning_rate": 0.00014933048331546258, + "loss": 6.2254, + "step": 1948 + }, + { + "epoch": 0.6696731520852124, + "grad_norm": 0.9792423844337463, + "learning_rate": 0.0001490560587983996, + "loss": 6.296, + "step": 1949 + }, + { + "epoch": 0.6700167504187605, + "grad_norm": 1.0456666946411133, + "learning_rate": 0.00014878177952893276, + "loss": 6.3253, + "step": 1950 + }, + { + "epoch": 0.6703603487523085, + "grad_norm": 0.954923689365387, + "learning_rate": 0.00014850764590172076, + "loss": 5.9976, + "step": 1951 + }, + { + "epoch": 0.6707039470858567, + "grad_norm": 0.8935602307319641, + "learning_rate": 0.00014823365831121278, + "loss": 6.1013, + "step": 1952 + }, + { + "epoch": 0.6710475454194047, + "grad_norm": 0.7865163087844849, + "learning_rate": 0.00014795981715164775, + "loss": 5.9458, + "step": 1953 + }, + { + "epoch": 0.6713911437529528, + "grad_norm": 0.7201636433601379, + "learning_rate": 0.00014768612281705406, + "loss": 6.1155, + "step": 1954 + }, + { + "epoch": 0.6717347420865009, + "grad_norm": 0.7660732269287109, + "learning_rate": 0.00014741257570124875, + "loss": 6.0565, + "step": 1955 + }, + { + "epoch": 0.672078340420049, + "grad_norm": 0.6856514811515808, + "learning_rate": 0.00014713917619783685, + "loss": 6.012, + "step": 1956 + }, + { + "epoch": 0.672421938753597, + "grad_norm": 0.716894805431366, + "learning_rate": 0.00014686592470021143, + "loss": 6.1342, + "step": 1957 + }, + { + "epoch": 0.6727655370871451, + "grad_norm": 0.715986430644989, + "learning_rate": 0.00014659282160155222, + "loss": 6.0448, + "step": 1958 + }, + { + "epoch": 0.6731091354206932, + "grad_norm": 0.6127437949180603, + "learning_rate": 0.0001463198672948254, + "loss": 6.1887, + "step": 1959 + }, + { + "epoch": 0.6734527337542413, + "grad_norm": 0.7476091384887695, + "learning_rate": 0.00014604706217278345, + "loss": 6.022, + "step": 1960 + }, + { + "epoch": 0.6737963320877893, + "grad_norm": 0.5875628590583801, + "learning_rate": 0.0001457744066279637, + "loss": 6.0752, + "step": 1961 + }, + { + "epoch": 0.6741399304213375, + "grad_norm": 0.6040765047073364, + "learning_rate": 0.00014550190105268863, + "loss": 6.0603, + "step": 1962 + }, + { + "epoch": 0.6744835287548855, + "grad_norm": 0.5486308336257935, + "learning_rate": 0.0001452295458390648, + "loss": 6.2029, + "step": 1963 + }, + { + "epoch": 0.6748271270884336, + "grad_norm": 0.5594898462295532, + "learning_rate": 0.00014495734137898227, + "loss": 6.077, + "step": 1964 + }, + { + "epoch": 0.6751707254219818, + "grad_norm": 0.5191746354103088, + "learning_rate": 0.0001446852880641143, + "loss": 6.0956, + "step": 1965 + }, + { + "epoch": 0.6755143237555298, + "grad_norm": 0.5390771627426147, + "learning_rate": 0.0001444133862859168, + "loss": 6.0423, + "step": 1966 + }, + { + "epoch": 0.6758579220890779, + "grad_norm": 0.48739349842071533, + "learning_rate": 0.00014414163643562756, + "loss": 6.0822, + "step": 1967 + }, + { + "epoch": 0.6762015204226259, + "grad_norm": 0.5168871283531189, + "learning_rate": 0.00014387003890426538, + "loss": 6.106, + "step": 1968 + }, + { + "epoch": 0.6765451187561741, + "grad_norm": 0.5096775889396667, + "learning_rate": 0.00014359859408263068, + "loss": 6.1346, + "step": 1969 + }, + { + "epoch": 0.6768887170897221, + "grad_norm": 0.48305463790893555, + "learning_rate": 0.00014332730236130337, + "loss": 6.122, + "step": 1970 + }, + { + "epoch": 0.6772323154232702, + "grad_norm": 0.5125235915184021, + "learning_rate": 0.00014305616413064345, + "loss": 6.2059, + "step": 1971 + }, + { + "epoch": 0.6775759137568182, + "grad_norm": 0.5879007577896118, + "learning_rate": 0.00014278517978079006, + "loss": 5.9098, + "step": 1972 + }, + { + "epoch": 0.6779195120903664, + "grad_norm": 0.48073089122772217, + "learning_rate": 0.00014251434970166083, + "loss": 6.0738, + "step": 1973 + }, + { + "epoch": 0.6782631104239144, + "grad_norm": 0.5409465432167053, + "learning_rate": 0.00014224367428295143, + "loss": 5.9913, + "step": 1974 + }, + { + "epoch": 0.6786067087574625, + "grad_norm": 0.5101212859153748, + "learning_rate": 0.00014197315391413512, + "loss": 6.0706, + "step": 1975 + }, + { + "epoch": 0.6789503070910106, + "grad_norm": 0.540166974067688, + "learning_rate": 0.00014170278898446175, + "loss": 6.094, + "step": 1976 + }, + { + "epoch": 0.6792939054245587, + "grad_norm": 0.5894446969032288, + "learning_rate": 0.00014143257988295777, + "loss": 6.1305, + "step": 1977 + }, + { + "epoch": 0.6796375037581067, + "grad_norm": 0.5489010214805603, + "learning_rate": 0.00014116252699842546, + "loss": 6.1303, + "step": 1978 + }, + { + "epoch": 0.6799811020916549, + "grad_norm": 0.5311551690101624, + "learning_rate": 0.00014089263071944192, + "loss": 6.1514, + "step": 1979 + }, + { + "epoch": 0.680324700425203, + "grad_norm": 0.5869585871696472, + "learning_rate": 0.00014062289143435957, + "loss": 6.0273, + "step": 1980 + }, + { + "epoch": 0.680668298758751, + "grad_norm": 0.6781054735183716, + "learning_rate": 0.00014035330953130422, + "loss": 6.0957, + "step": 1981 + }, + { + "epoch": 0.6810118970922991, + "grad_norm": 0.6006012558937073, + "learning_rate": 0.00014008388539817575, + "loss": 6.0071, + "step": 1982 + }, + { + "epoch": 0.6813554954258472, + "grad_norm": 0.5926790833473206, + "learning_rate": 0.00013981461942264673, + "loss": 6.1698, + "step": 1983 + }, + { + "epoch": 0.6816990937593953, + "grad_norm": 0.596321165561676, + "learning_rate": 0.00013954551199216246, + "loss": 6.046, + "step": 1984 + }, + { + "epoch": 0.6820426920929433, + "grad_norm": 0.5697970986366272, + "learning_rate": 0.00013927656349393952, + "loss": 6.1458, + "step": 1985 + }, + { + "epoch": 0.6823862904264915, + "grad_norm": 0.6854418516159058, + "learning_rate": 0.00013900777431496666, + "loss": 6.0886, + "step": 1986 + }, + { + "epoch": 0.6827298887600395, + "grad_norm": 0.7039015889167786, + "learning_rate": 0.00013873914484200262, + "loss": 6.1631, + "step": 1987 + }, + { + "epoch": 0.6830734870935876, + "grad_norm": 0.6003180146217346, + "learning_rate": 0.00013847067546157672, + "loss": 6.0595, + "step": 1988 + }, + { + "epoch": 0.6834170854271356, + "grad_norm": 0.6838335394859314, + "learning_rate": 0.00013820236655998785, + "loss": 6.1989, + "step": 1989 + }, + { + "epoch": 0.6837606837606838, + "grad_norm": 0.6573136448860168, + "learning_rate": 0.0001379342185233041, + "loss": 6.0901, + "step": 1990 + }, + { + "epoch": 0.6841042820942318, + "grad_norm": 0.5662709474563599, + "learning_rate": 0.00013766623173736177, + "loss": 6.1881, + "step": 1991 + }, + { + "epoch": 0.6844478804277799, + "grad_norm": 0.6747244596481323, + "learning_rate": 0.0001373984065877654, + "loss": 6.0781, + "step": 1992 + }, + { + "epoch": 0.684791478761328, + "grad_norm": 0.624338686466217, + "learning_rate": 0.000137130743459887, + "loss": 6.1607, + "step": 1993 + }, + { + "epoch": 0.6851350770948761, + "grad_norm": 0.7191410064697266, + "learning_rate": 0.0001368632427388653, + "loss": 6.0574, + "step": 1994 + }, + { + "epoch": 0.6854786754284242, + "grad_norm": 0.6128062009811401, + "learning_rate": 0.00013659590480960543, + "loss": 6.1612, + "step": 1995 + }, + { + "epoch": 0.6858222737619722, + "grad_norm": 0.7554376721382141, + "learning_rate": 0.0001363287300567781, + "loss": 6.1125, + "step": 1996 + }, + { + "epoch": 0.6861658720955204, + "grad_norm": 0.7863120436668396, + "learning_rate": 0.00013606171886481943, + "loss": 6.1154, + "step": 1997 + }, + { + "epoch": 0.6865094704290684, + "grad_norm": 0.7225969433784485, + "learning_rate": 0.00013579487161793018, + "loss": 6.1776, + "step": 1998 + }, + { + "epoch": 0.6868530687626165, + "grad_norm": 0.9673846364021301, + "learning_rate": 0.00013552818870007514, + "loss": 6.1753, + "step": 1999 + }, + { + "epoch": 0.6871966670961646, + "grad_norm": 1.0428494215011597, + "learning_rate": 0.00013526167049498263, + "loss": 6.1562, + "step": 2000 + }, + { + "epoch": 0.6875402654297127, + "grad_norm": 0.9498250484466553, + "learning_rate": 0.00013499531738614414, + "loss": 6.0596, + "step": 2001 + }, + { + "epoch": 0.6878838637632607, + "grad_norm": 0.8821714520454407, + "learning_rate": 0.00013472912975681317, + "loss": 6.052, + "step": 2002 + }, + { + "epoch": 0.6882274620968089, + "grad_norm": 0.7349355220794678, + "learning_rate": 0.00013446310799000578, + "loss": 6.0121, + "step": 2003 + }, + { + "epoch": 0.6885710604303569, + "grad_norm": 0.5704671740531921, + "learning_rate": 0.00013419725246849873, + "loss": 6.0056, + "step": 2004 + }, + { + "epoch": 0.688914658763905, + "grad_norm": 0.5243590474128723, + "learning_rate": 0.00013393156357482993, + "loss": 6.0066, + "step": 2005 + }, + { + "epoch": 0.689258257097453, + "grad_norm": 0.679142415523529, + "learning_rate": 0.00013366604169129742, + "loss": 5.9651, + "step": 2006 + }, + { + "epoch": 0.6896018554310012, + "grad_norm": 0.7514729499816895, + "learning_rate": 0.00013340068719995912, + "loss": 6.0792, + "step": 2007 + }, + { + "epoch": 0.6899454537645492, + "grad_norm": 0.7526649832725525, + "learning_rate": 0.00013313550048263168, + "loss": 5.9845, + "step": 2008 + }, + { + "epoch": 0.6902890520980973, + "grad_norm": 0.7030274868011475, + "learning_rate": 0.00013287048192089064, + "loss": 5.9931, + "step": 2009 + }, + { + "epoch": 0.6906326504316455, + "grad_norm": 0.7123146653175354, + "learning_rate": 0.0001326056318960697, + "loss": 5.9611, + "step": 2010 + }, + { + "epoch": 0.6909762487651935, + "grad_norm": 0.7373788356781006, + "learning_rate": 0.00013234095078925952, + "loss": 5.9859, + "step": 2011 + }, + { + "epoch": 0.6913198470987416, + "grad_norm": 0.6924092769622803, + "learning_rate": 0.00013207643898130854, + "loss": 6.1251, + "step": 2012 + }, + { + "epoch": 0.6916634454322896, + "grad_norm": 0.6907500624656677, + "learning_rate": 0.00013181209685282074, + "loss": 5.9978, + "step": 2013 + }, + { + "epoch": 0.6920070437658378, + "grad_norm": 0.4841921329498291, + "learning_rate": 0.00013154792478415646, + "loss": 6.1378, + "step": 2014 + }, + { + "epoch": 0.6923506420993858, + "grad_norm": 0.5584970712661743, + "learning_rate": 0.00013128392315543125, + "loss": 6.0776, + "step": 2015 + }, + { + "epoch": 0.6926942404329339, + "grad_norm": 0.6407040357589722, + "learning_rate": 0.00013102009234651542, + "loss": 6.0172, + "step": 2016 + }, + { + "epoch": 0.693037838766482, + "grad_norm": 0.663905918598175, + "learning_rate": 0.00013075643273703316, + "loss": 5.8666, + "step": 2017 + }, + { + "epoch": 0.6933814371000301, + "grad_norm": 0.6265025734901428, + "learning_rate": 0.00013049294470636303, + "loss": 6.1306, + "step": 2018 + }, + { + "epoch": 0.6937250354335781, + "grad_norm": 0.5826771259307861, + "learning_rate": 0.00013022962863363597, + "loss": 6.0611, + "step": 2019 + }, + { + "epoch": 0.6940686337671262, + "grad_norm": 0.515347421169281, + "learning_rate": 0.00012996648489773595, + "loss": 6.1146, + "step": 2020 + }, + { + "epoch": 0.6944122321006743, + "grad_norm": 0.623602569103241, + "learning_rate": 0.00012970351387729873, + "loss": 6.0535, + "step": 2021 + }, + { + "epoch": 0.6947558304342224, + "grad_norm": 0.4509178102016449, + "learning_rate": 0.0001294407159507118, + "loss": 6.0843, + "step": 2022 + }, + { + "epoch": 0.6950994287677704, + "grad_norm": 0.5675387382507324, + "learning_rate": 0.00012917809149611323, + "loss": 6.0791, + "step": 2023 + }, + { + "epoch": 0.6954430271013186, + "grad_norm": 0.4827669858932495, + "learning_rate": 0.0001289156408913918, + "loss": 6.0356, + "step": 2024 + }, + { + "epoch": 0.6957866254348667, + "grad_norm": 0.558542788028717, + "learning_rate": 0.00012865336451418593, + "loss": 6.0506, + "step": 2025 + }, + { + "epoch": 0.6961302237684147, + "grad_norm": 0.5373152494430542, + "learning_rate": 0.00012839126274188353, + "loss": 5.9674, + "step": 2026 + }, + { + "epoch": 0.6964738221019628, + "grad_norm": 0.5774347186088562, + "learning_rate": 0.00012812933595162125, + "loss": 6.0237, + "step": 2027 + }, + { + "epoch": 0.6968174204355109, + "grad_norm": 0.5224197506904602, + "learning_rate": 0.00012786758452028354, + "loss": 6.1522, + "step": 2028 + }, + { + "epoch": 0.697161018769059, + "grad_norm": 0.6040776371955872, + "learning_rate": 0.00012760600882450335, + "loss": 6.1, + "step": 2029 + }, + { + "epoch": 0.697504617102607, + "grad_norm": 0.44605669379234314, + "learning_rate": 0.00012734460924065992, + "loss": 6.1146, + "step": 2030 + }, + { + "epoch": 0.6978482154361552, + "grad_norm": 0.5897433161735535, + "learning_rate": 0.00012708338614487958, + "loss": 5.9674, + "step": 2031 + }, + { + "epoch": 0.6981918137697032, + "grad_norm": 0.5256454348564148, + "learning_rate": 0.00012682233991303458, + "loss": 6.1621, + "step": 2032 + }, + { + "epoch": 0.6985354121032513, + "grad_norm": 0.566554069519043, + "learning_rate": 0.00012656147092074277, + "loss": 6.0676, + "step": 2033 + }, + { + "epoch": 0.6988790104367993, + "grad_norm": 0.4864864945411682, + "learning_rate": 0.00012630077954336666, + "loss": 6.1398, + "step": 2034 + }, + { + "epoch": 0.6992226087703475, + "grad_norm": 0.5105878710746765, + "learning_rate": 0.0001260402661560137, + "loss": 6.1083, + "step": 2035 + }, + { + "epoch": 0.6995662071038955, + "grad_norm": 0.5783962607383728, + "learning_rate": 0.00012577993113353474, + "loss": 5.9497, + "step": 2036 + }, + { + "epoch": 0.6999098054374436, + "grad_norm": 0.5311532616615295, + "learning_rate": 0.00012551977485052428, + "loss": 6.0643, + "step": 2037 + }, + { + "epoch": 0.7002534037709917, + "grad_norm": 0.550070583820343, + "learning_rate": 0.0001252597976813195, + "loss": 6.0993, + "step": 2038 + }, + { + "epoch": 0.7005970021045398, + "grad_norm": 0.7595521211624146, + "learning_rate": 0.00012500000000000006, + "loss": 6.2794, + "step": 2039 + }, + { + "epoch": 0.7009406004380879, + "grad_norm": 0.575773298740387, + "learning_rate": 0.00012474038218038695, + "loss": 5.9778, + "step": 2040 + }, + { + "epoch": 0.701284198771636, + "grad_norm": 0.6314275860786438, + "learning_rate": 0.00012448094459604266, + "loss": 6.0772, + "step": 2041 + }, + { + "epoch": 0.7016277971051841, + "grad_norm": 0.565686821937561, + "learning_rate": 0.0001242216876202705, + "loss": 6.1339, + "step": 2042 + }, + { + "epoch": 0.7019713954387321, + "grad_norm": 0.6472426652908325, + "learning_rate": 0.0001239626116261133, + "loss": 6.0905, + "step": 2043 + }, + { + "epoch": 0.7023149937722802, + "grad_norm": 0.5603044629096985, + "learning_rate": 0.00012370371698635426, + "loss": 6.1049, + "step": 2044 + }, + { + "epoch": 0.7026585921058283, + "grad_norm": 0.573658287525177, + "learning_rate": 0.000123445004073515, + "loss": 6.1619, + "step": 2045 + }, + { + "epoch": 0.7030021904393764, + "grad_norm": 0.7000228762626648, + "learning_rate": 0.00012318647325985593, + "loss": 6.1155, + "step": 2046 + }, + { + "epoch": 0.7033457887729244, + "grad_norm": 0.7718038558959961, + "learning_rate": 0.00012292812491737542, + "loss": 6.1795, + "step": 2047 + }, + { + "epoch": 0.7036893871064726, + "grad_norm": 0.7203500866889954, + "learning_rate": 0.00012266995941780933, + "loss": 6.1553, + "step": 2048 + }, + { + "epoch": 0.7040329854400206, + "grad_norm": 0.9468965530395508, + "learning_rate": 0.0001224119771326301, + "loss": 6.3386, + "step": 2049 + }, + { + "epoch": 0.7043765837735687, + "grad_norm": 1.2192009687423706, + "learning_rate": 0.0001221541784330472, + "loss": 6.2315, + "step": 2050 + }, + { + "epoch": 0.7047201821071167, + "grad_norm": 0.7734485268592834, + "learning_rate": 0.00012189656369000518, + "loss": 6.0635, + "step": 2051 + }, + { + "epoch": 0.7050637804406649, + "grad_norm": 0.8719174265861511, + "learning_rate": 0.00012163913327418443, + "loss": 6.0625, + "step": 2052 + }, + { + "epoch": 0.7054073787742129, + "grad_norm": 0.7755087614059448, + "learning_rate": 0.00012138188755599994, + "loss": 5.9011, + "step": 2053 + }, + { + "epoch": 0.705750977107761, + "grad_norm": 0.5290982723236084, + "learning_rate": 0.00012112482690560089, + "loss": 5.9983, + "step": 2054 + }, + { + "epoch": 0.7060945754413092, + "grad_norm": 0.6337197422981262, + "learning_rate": 0.00012086795169287032, + "loss": 6.0545, + "step": 2055 + }, + { + "epoch": 0.7064381737748572, + "grad_norm": 0.7570812702178955, + "learning_rate": 0.00012061126228742419, + "loss": 5.9831, + "step": 2056 + }, + { + "epoch": 0.7067817721084053, + "grad_norm": 0.5576236248016357, + "learning_rate": 0.00012035475905861134, + "loss": 5.9773, + "step": 2057 + }, + { + "epoch": 0.7071253704419533, + "grad_norm": 0.544528067111969, + "learning_rate": 0.00012009844237551265, + "loss": 6.0095, + "step": 2058 + }, + { + "epoch": 0.7074689687755015, + "grad_norm": 0.6879101395606995, + "learning_rate": 0.00011984231260694061, + "loss": 5.9356, + "step": 2059 + }, + { + "epoch": 0.7078125671090495, + "grad_norm": 0.5554669499397278, + "learning_rate": 0.00011958637012143847, + "loss": 6.0969, + "step": 2060 + }, + { + "epoch": 0.7081561654425976, + "grad_norm": 0.6475436687469482, + "learning_rate": 0.00011933061528728062, + "loss": 5.9936, + "step": 2061 + }, + { + "epoch": 0.7084997637761457, + "grad_norm": 0.6078757643699646, + "learning_rate": 0.00011907504847247081, + "loss": 5.9177, + "step": 2062 + }, + { + "epoch": 0.7088433621096938, + "grad_norm": 0.5807406306266785, + "learning_rate": 0.00011881967004474257, + "loss": 5.9959, + "step": 2063 + }, + { + "epoch": 0.7091869604432418, + "grad_norm": 0.7721190452575684, + "learning_rate": 0.00011856448037155828, + "loss": 6.0585, + "step": 2064 + }, + { + "epoch": 0.7095305587767899, + "grad_norm": 0.6295862793922424, + "learning_rate": 0.00011830947982010889, + "loss": 6.0483, + "step": 2065 + }, + { + "epoch": 0.709874157110338, + "grad_norm": 0.5060104131698608, + "learning_rate": 0.00011805466875731277, + "loss": 6.0596, + "step": 2066 + }, + { + "epoch": 0.7102177554438861, + "grad_norm": 0.5339775681495667, + "learning_rate": 0.0001178000475498163, + "loss": 6.1276, + "step": 2067 + }, + { + "epoch": 0.7105613537774342, + "grad_norm": 0.6627702713012695, + "learning_rate": 0.00011754561656399204, + "loss": 6.0535, + "step": 2068 + }, + { + "epoch": 0.7109049521109823, + "grad_norm": 0.49420684576034546, + "learning_rate": 0.00011729137616593922, + "loss": 6.1306, + "step": 2069 + }, + { + "epoch": 0.7112485504445304, + "grad_norm": 0.6272643804550171, + "learning_rate": 0.00011703732672148274, + "loss": 6.0656, + "step": 2070 + }, + { + "epoch": 0.7115921487780784, + "grad_norm": 0.4226469099521637, + "learning_rate": 0.00011678346859617283, + "loss": 6.0244, + "step": 2071 + }, + { + "epoch": 0.7119357471116265, + "grad_norm": 0.5564938187599182, + "learning_rate": 0.00011652980215528414, + "loss": 6.0609, + "step": 2072 + }, + { + "epoch": 0.7122793454451746, + "grad_norm": 0.47332972288131714, + "learning_rate": 0.00011627632776381577, + "loss": 6.1597, + "step": 2073 + }, + { + "epoch": 0.7126229437787227, + "grad_norm": 0.5332189798355103, + "learning_rate": 0.00011602304578649056, + "loss": 6.0788, + "step": 2074 + }, + { + "epoch": 0.7129665421122707, + "grad_norm": 0.5555863976478577, + "learning_rate": 0.00011576995658775405, + "loss": 6.0669, + "step": 2075 + }, + { + "epoch": 0.7133101404458189, + "grad_norm": 0.5416020154953003, + "learning_rate": 0.000115517060531775, + "loss": 6.1068, + "step": 2076 + }, + { + "epoch": 0.7136537387793669, + "grad_norm": 0.5631720423698425, + "learning_rate": 0.0001152643579824437, + "loss": 6.058, + "step": 2077 + }, + { + "epoch": 0.713997337112915, + "grad_norm": 0.5092825293540955, + "learning_rate": 0.00011501184930337235, + "loss": 6.0472, + "step": 2078 + }, + { + "epoch": 0.714340935446463, + "grad_norm": 0.5118112564086914, + "learning_rate": 0.00011475953485789406, + "loss": 6.188, + "step": 2079 + }, + { + "epoch": 0.7146845337800112, + "grad_norm": 0.5366310477256775, + "learning_rate": 0.00011450741500906248, + "loss": 6.1434, + "step": 2080 + }, + { + "epoch": 0.7150281321135592, + "grad_norm": 0.5202789902687073, + "learning_rate": 0.00011425549011965128, + "loss": 5.9162, + "step": 2081 + }, + { + "epoch": 0.7153717304471073, + "grad_norm": 0.5178118944168091, + "learning_rate": 0.00011400376055215367, + "loss": 6.096, + "step": 2082 + }, + { + "epoch": 0.7157153287806555, + "grad_norm": 0.6584262251853943, + "learning_rate": 0.00011375222666878143, + "loss": 6.0834, + "step": 2083 + }, + { + "epoch": 0.7160589271142035, + "grad_norm": 0.6938084363937378, + "learning_rate": 0.00011350088883146548, + "loss": 6.0291, + "step": 2084 + }, + { + "epoch": 0.7164025254477516, + "grad_norm": 0.5790725350379944, + "learning_rate": 0.00011324974740185392, + "loss": 6.0404, + "step": 2085 + }, + { + "epoch": 0.7167461237812996, + "grad_norm": 0.571308970451355, + "learning_rate": 0.00011299880274131269, + "loss": 6.0675, + "step": 2086 + }, + { + "epoch": 0.7170897221148478, + "grad_norm": 0.6977917551994324, + "learning_rate": 0.00011274805521092452, + "loss": 6.1419, + "step": 2087 + }, + { + "epoch": 0.7174333204483958, + "grad_norm": 0.5644780397415161, + "learning_rate": 0.00011249750517148826, + "loss": 6.1105, + "step": 2088 + }, + { + "epoch": 0.7177769187819439, + "grad_norm": 0.6917959451675415, + "learning_rate": 0.00011224715298351889, + "loss": 6.2568, + "step": 2089 + }, + { + "epoch": 0.718120517115492, + "grad_norm": 0.7160729765892029, + "learning_rate": 0.00011199699900724659, + "loss": 6.0182, + "step": 2090 + }, + { + "epoch": 0.7184641154490401, + "grad_norm": 0.5881943106651306, + "learning_rate": 0.00011174704360261636, + "loss": 6.1496, + "step": 2091 + }, + { + "epoch": 0.7188077137825881, + "grad_norm": 0.6279813647270203, + "learning_rate": 0.00011149728712928724, + "loss": 6.165, + "step": 2092 + }, + { + "epoch": 0.7191513121161363, + "grad_norm": 0.6775481700897217, + "learning_rate": 0.00011124772994663257, + "loss": 6.165, + "step": 2093 + }, + { + "epoch": 0.7194949104496843, + "grad_norm": 0.7559806704521179, + "learning_rate": 0.00011099837241373831, + "loss": 6.2015, + "step": 2094 + }, + { + "epoch": 0.7198385087832324, + "grad_norm": 0.7788186073303223, + "learning_rate": 0.00011074921488940353, + "loss": 6.111, + "step": 2095 + }, + { + "epoch": 0.7201821071167804, + "grad_norm": 0.6424675583839417, + "learning_rate": 0.00011050025773213943, + "loss": 6.179, + "step": 2096 + }, + { + "epoch": 0.7205257054503286, + "grad_norm": 0.7416768670082092, + "learning_rate": 0.00011025150130016895, + "loss": 6.1499, + "step": 2097 + }, + { + "epoch": 0.7208693037838767, + "grad_norm": 0.7270972728729248, + "learning_rate": 0.00011000294595142591, + "loss": 6.1627, + "step": 2098 + }, + { + "epoch": 0.7212129021174247, + "grad_norm": 0.8351125121116638, + "learning_rate": 0.00010975459204355531, + "loss": 6.1789, + "step": 2099 + }, + { + "epoch": 0.7215565004509729, + "grad_norm": 1.0598890781402588, + "learning_rate": 0.0001095064399339118, + "loss": 6.4582, + "step": 2100 + }, + { + "epoch": 0.7219000987845209, + "grad_norm": 1.0137444734573364, + "learning_rate": 0.00010925848997955995, + "loss": 6.0051, + "step": 2101 + }, + { + "epoch": 0.722243697118069, + "grad_norm": 0.8496506810188293, + "learning_rate": 0.00010901074253727336, + "loss": 6.0438, + "step": 2102 + }, + { + "epoch": 0.722587295451617, + "grad_norm": 0.6422009468078613, + "learning_rate": 0.00010876319796353437, + "loss": 6.0723, + "step": 2103 + }, + { + "epoch": 0.7229308937851652, + "grad_norm": 0.6015517711639404, + "learning_rate": 0.00010851585661453309, + "loss": 5.9828, + "step": 2104 + }, + { + "epoch": 0.7232744921187132, + "grad_norm": 0.5879690051078796, + "learning_rate": 0.00010826871884616751, + "loss": 5.9908, + "step": 2105 + }, + { + "epoch": 0.7236180904522613, + "grad_norm": 0.5813199281692505, + "learning_rate": 0.00010802178501404272, + "loss": 6.1323, + "step": 2106 + }, + { + "epoch": 0.7239616887858094, + "grad_norm": 0.7803506851196289, + "learning_rate": 0.00010777505547346994, + "loss": 5.8687, + "step": 2107 + }, + { + "epoch": 0.7243052871193575, + "grad_norm": 0.6409358382225037, + "learning_rate": 0.0001075285305794671, + "loss": 6.0781, + "step": 2108 + }, + { + "epoch": 0.7246488854529055, + "grad_norm": 0.5260980129241943, + "learning_rate": 0.00010728221068675695, + "loss": 5.8364, + "step": 2109 + }, + { + "epoch": 0.7249924837864536, + "grad_norm": 0.5710437893867493, + "learning_rate": 0.00010703609614976798, + "loss": 5.9737, + "step": 2110 + }, + { + "epoch": 0.7253360821200017, + "grad_norm": 0.5062072277069092, + "learning_rate": 0.00010679018732263257, + "loss": 6.0028, + "step": 2111 + }, + { + "epoch": 0.7256796804535498, + "grad_norm": 0.6027816534042358, + "learning_rate": 0.00010654448455918747, + "loss": 6.0024, + "step": 2112 + }, + { + "epoch": 0.7260232787870979, + "grad_norm": 0.584230363368988, + "learning_rate": 0.00010629898821297279, + "loss": 6.0539, + "step": 2113 + }, + { + "epoch": 0.726366877120646, + "grad_norm": 0.5171947479248047, + "learning_rate": 0.0001060536986372318, + "loss": 6.0555, + "step": 2114 + }, + { + "epoch": 0.7267104754541941, + "grad_norm": 0.5520440936088562, + "learning_rate": 0.0001058086161849098, + "loss": 6.0871, + "step": 2115 + }, + { + "epoch": 0.7270540737877421, + "grad_norm": 0.5782375335693359, + "learning_rate": 0.00010556374120865477, + "loss": 5.9704, + "step": 2116 + }, + { + "epoch": 0.7273976721212903, + "grad_norm": 0.4811157286167145, + "learning_rate": 0.00010531907406081548, + "loss": 6.08, + "step": 2117 + }, + { + "epoch": 0.7277412704548383, + "grad_norm": 0.5225687623023987, + "learning_rate": 0.00010507461509344199, + "loss": 6.0993, + "step": 2118 + }, + { + "epoch": 0.7280848687883864, + "grad_norm": 0.5056168437004089, + "learning_rate": 0.00010483036465828492, + "loss": 6.0458, + "step": 2119 + }, + { + "epoch": 0.7284284671219344, + "grad_norm": 0.5187274217605591, + "learning_rate": 0.00010458632310679439, + "loss": 5.953, + "step": 2120 + }, + { + "epoch": 0.7287720654554826, + "grad_norm": 0.5479382276535034, + "learning_rate": 0.00010434249079012043, + "loss": 6.0058, + "step": 2121 + }, + { + "epoch": 0.7291156637890306, + "grad_norm": 0.4890539050102234, + "learning_rate": 0.00010409886805911175, + "loss": 6.0959, + "step": 2122 + }, + { + "epoch": 0.7294592621225787, + "grad_norm": 0.6457909941673279, + "learning_rate": 0.00010385545526431567, + "loss": 6.0905, + "step": 2123 + }, + { + "epoch": 0.7298028604561267, + "grad_norm": 0.66217440366745, + "learning_rate": 0.00010361225275597702, + "loss": 6.1174, + "step": 2124 + }, + { + "epoch": 0.7301464587896749, + "grad_norm": 0.6101366281509399, + "learning_rate": 0.00010336926088403873, + "loss": 6.0577, + "step": 2125 + }, + { + "epoch": 0.7304900571232229, + "grad_norm": 0.49836745858192444, + "learning_rate": 0.00010312647999813998, + "loss": 6.0062, + "step": 2126 + }, + { + "epoch": 0.730833655456771, + "grad_norm": 0.5868933200836182, + "learning_rate": 0.00010288391044761675, + "loss": 5.9208, + "step": 2127 + }, + { + "epoch": 0.7311772537903192, + "grad_norm": 0.479012131690979, + "learning_rate": 0.00010264155258150079, + "loss": 6.0414, + "step": 2128 + }, + { + "epoch": 0.7315208521238672, + "grad_norm": 0.5518100261688232, + "learning_rate": 0.00010239940674851941, + "loss": 6.116, + "step": 2129 + }, + { + "epoch": 0.7318644504574153, + "grad_norm": 0.5689438581466675, + "learning_rate": 0.00010215747329709446, + "loss": 6.0154, + "step": 2130 + }, + { + "epoch": 0.7322080487909634, + "grad_norm": 0.5302237868309021, + "learning_rate": 0.00010191575257534277, + "loss": 6.0576, + "step": 2131 + }, + { + "epoch": 0.7325516471245115, + "grad_norm": 0.6240813136100769, + "learning_rate": 0.00010167424493107449, + "loss": 6.0377, + "step": 2132 + }, + { + "epoch": 0.7328952454580595, + "grad_norm": 0.5226022601127625, + "learning_rate": 0.00010143295071179357, + "loss": 5.975, + "step": 2133 + }, + { + "epoch": 0.7332388437916076, + "grad_norm": 0.501981258392334, + "learning_rate": 0.00010119187026469668, + "loss": 5.9928, + "step": 2134 + }, + { + "epoch": 0.7335824421251557, + "grad_norm": 0.47182825207710266, + "learning_rate": 0.00010095100393667294, + "loss": 6.0488, + "step": 2135 + }, + { + "epoch": 0.7339260404587038, + "grad_norm": 0.6065984964370728, + "learning_rate": 0.00010071035207430351, + "loss": 5.9399, + "step": 2136 + }, + { + "epoch": 0.7342696387922518, + "grad_norm": 0.5377668142318726, + "learning_rate": 0.00010046991502386063, + "loss": 6.1058, + "step": 2137 + }, + { + "epoch": 0.7346132371258, + "grad_norm": 0.617766261100769, + "learning_rate": 0.00010022969313130773, + "loss": 6.1166, + "step": 2138 + }, + { + "epoch": 0.734956835459348, + "grad_norm": 0.6241163015365601, + "learning_rate": 9.998968674229855e-05, + "loss": 6.0711, + "step": 2139 + }, + { + "epoch": 0.7353004337928961, + "grad_norm": 0.5719399452209473, + "learning_rate": 9.974989620217689e-05, + "loss": 6.1984, + "step": 2140 + }, + { + "epoch": 0.7356440321264441, + "grad_norm": 0.6313961148262024, + "learning_rate": 9.951032185597553e-05, + "loss": 6.0932, + "step": 2141 + }, + { + "epoch": 0.7359876304599923, + "grad_norm": 0.7259103655815125, + "learning_rate": 9.927096404841688e-05, + "loss": 6.127, + "step": 2142 + }, + { + "epoch": 0.7363312287935404, + "grad_norm": 0.6876815557479858, + "learning_rate": 9.903182312391104e-05, + "loss": 6.0632, + "step": 2143 + }, + { + "epoch": 0.7366748271270884, + "grad_norm": 0.678907036781311, + "learning_rate": 9.87928994265565e-05, + "loss": 6.1833, + "step": 2144 + }, + { + "epoch": 0.7370184254606366, + "grad_norm": 0.7564148902893066, + "learning_rate": 9.855419330013904e-05, + "loss": 6.1087, + "step": 2145 + }, + { + "epoch": 0.7373620237941846, + "grad_norm": 0.7315529584884644, + "learning_rate": 9.831570508813149e-05, + "loss": 6.1149, + "step": 2146 + }, + { + "epoch": 0.7377056221277327, + "grad_norm": 0.8211150765419006, + "learning_rate": 9.807743513369271e-05, + "loss": 6.27, + "step": 2147 + }, + { + "epoch": 0.7380492204612807, + "grad_norm": 0.8686621189117432, + "learning_rate": 9.783938377966825e-05, + "loss": 6.142, + "step": 2148 + }, + { + "epoch": 0.7383928187948289, + "grad_norm": 0.8419855237007141, + "learning_rate": 9.760155136858839e-05, + "loss": 6.1696, + "step": 2149 + }, + { + "epoch": 0.7387364171283769, + "grad_norm": 1.0530507564544678, + "learning_rate": 9.736393824266876e-05, + "loss": 5.9706, + "step": 2150 + }, + { + "epoch": 0.739080015461925, + "grad_norm": 0.8615719079971313, + "learning_rate": 9.712654474380947e-05, + "loss": 5.9201, + "step": 2151 + }, + { + "epoch": 0.7394236137954731, + "grad_norm": 0.7670858502388, + "learning_rate": 9.688937121359434e-05, + "loss": 5.991, + "step": 2152 + }, + { + "epoch": 0.7397672121290212, + "grad_norm": 0.7319602966308594, + "learning_rate": 9.665241799329098e-05, + "loss": 6.0728, + "step": 2153 + }, + { + "epoch": 0.7401108104625692, + "grad_norm": 0.6953431963920593, + "learning_rate": 9.641568542384982e-05, + "loss": 6.0657, + "step": 2154 + }, + { + "epoch": 0.7404544087961173, + "grad_norm": 0.6169705390930176, + "learning_rate": 9.617917384590397e-05, + "loss": 6.0031, + "step": 2155 + }, + { + "epoch": 0.7407980071296654, + "grad_norm": 0.6087819933891296, + "learning_rate": 9.594288359976817e-05, + "loss": 5.9738, + "step": 2156 + }, + { + "epoch": 0.7411416054632135, + "grad_norm": 0.7109917998313904, + "learning_rate": 9.570681502543929e-05, + "loss": 6.0986, + "step": 2157 + }, + { + "epoch": 0.7414852037967616, + "grad_norm": 0.725594162940979, + "learning_rate": 9.547096846259467e-05, + "loss": 6.0714, + "step": 2158 + }, + { + "epoch": 0.7418288021303097, + "grad_norm": 0.6579272150993347, + "learning_rate": 9.523534425059252e-05, + "loss": 6.0939, + "step": 2159 + }, + { + "epoch": 0.7421724004638578, + "grad_norm": 0.61977618932724, + "learning_rate": 9.499994272847099e-05, + "loss": 5.9499, + "step": 2160 + }, + { + "epoch": 0.7425159987974058, + "grad_norm": 0.5611152052879333, + "learning_rate": 9.476476423494792e-05, + "loss": 6.0905, + "step": 2161 + }, + { + "epoch": 0.742859597130954, + "grad_norm": 0.5907820463180542, + "learning_rate": 9.452980910841993e-05, + "loss": 5.9445, + "step": 2162 + }, + { + "epoch": 0.743203195464502, + "grad_norm": 0.45404481887817383, + "learning_rate": 9.42950776869628e-05, + "loss": 6.0217, + "step": 2163 + }, + { + "epoch": 0.7435467937980501, + "grad_norm": 0.5082436800003052, + "learning_rate": 9.40605703083298e-05, + "loss": 5.9823, + "step": 2164 + }, + { + "epoch": 0.7438903921315981, + "grad_norm": 0.5696332454681396, + "learning_rate": 9.382628730995222e-05, + "loss": 6.1404, + "step": 2165 + }, + { + "epoch": 0.7442339904651463, + "grad_norm": 0.5783550143241882, + "learning_rate": 9.359222902893832e-05, + "loss": 6.0203, + "step": 2166 + }, + { + "epoch": 0.7445775887986943, + "grad_norm": 0.516828179359436, + "learning_rate": 9.335839580207317e-05, + "loss": 6.099, + "step": 2167 + }, + { + "epoch": 0.7449211871322424, + "grad_norm": 0.5123862624168396, + "learning_rate": 9.312478796581792e-05, + "loss": 6.0458, + "step": 2168 + }, + { + "epoch": 0.7452647854657904, + "grad_norm": 0.5932062864303589, + "learning_rate": 9.289140585630926e-05, + "loss": 6.0813, + "step": 2169 + }, + { + "epoch": 0.7456083837993386, + "grad_norm": 0.4866209626197815, + "learning_rate": 9.265824980935933e-05, + "loss": 6.1021, + "step": 2170 + }, + { + "epoch": 0.7459519821328866, + "grad_norm": 0.5570813417434692, + "learning_rate": 9.242532016045485e-05, + "loss": 6.1042, + "step": 2171 + }, + { + "epoch": 0.7462955804664347, + "grad_norm": 0.5078079104423523, + "learning_rate": 9.219261724475692e-05, + "loss": 6.0633, + "step": 2172 + }, + { + "epoch": 0.7466391787999829, + "grad_norm": 0.419669508934021, + "learning_rate": 9.196014139710005e-05, + "loss": 6.0652, + "step": 2173 + }, + { + "epoch": 0.7469827771335309, + "grad_norm": 0.511519193649292, + "learning_rate": 9.172789295199255e-05, + "loss": 6.1272, + "step": 2174 + }, + { + "epoch": 0.747326375467079, + "grad_norm": 0.4874584972858429, + "learning_rate": 9.149587224361503e-05, + "loss": 6.1074, + "step": 2175 + }, + { + "epoch": 0.747669973800627, + "grad_norm": 0.4667309820652008, + "learning_rate": 9.126407960582067e-05, + "loss": 6.0939, + "step": 2176 + }, + { + "epoch": 0.7480135721341752, + "grad_norm": 0.498703271150589, + "learning_rate": 9.103251537213445e-05, + "loss": 5.9496, + "step": 2177 + }, + { + "epoch": 0.7483571704677232, + "grad_norm": 0.536368727684021, + "learning_rate": 9.080117987575271e-05, + "loss": 5.9855, + "step": 2178 + }, + { + "epoch": 0.7487007688012713, + "grad_norm": 0.5683017373085022, + "learning_rate": 9.057007344954244e-05, + "loss": 6.095, + "step": 2179 + }, + { + "epoch": 0.7490443671348194, + "grad_norm": 0.6125717163085938, + "learning_rate": 9.033919642604149e-05, + "loss": 6.13, + "step": 2180 + }, + { + "epoch": 0.7493879654683675, + "grad_norm": 0.5337144732475281, + "learning_rate": 9.010854913745712e-05, + "loss": 6.0017, + "step": 2181 + }, + { + "epoch": 0.7497315638019155, + "grad_norm": 0.4696179926395416, + "learning_rate": 8.987813191566632e-05, + "loss": 6.0913, + "step": 2182 + }, + { + "epoch": 0.7500751621354637, + "grad_norm": 0.6470948457717896, + "learning_rate": 8.964794509221508e-05, + "loss": 6.0583, + "step": 2183 + }, + { + "epoch": 0.7504187604690117, + "grad_norm": 0.5472028851509094, + "learning_rate": 8.941798899831757e-05, + "loss": 6.0463, + "step": 2184 + }, + { + "epoch": 0.7507623588025598, + "grad_norm": 0.5236582159996033, + "learning_rate": 8.918826396485624e-05, + "loss": 6.0631, + "step": 2185 + }, + { + "epoch": 0.7511059571361078, + "grad_norm": 0.5017586946487427, + "learning_rate": 8.895877032238095e-05, + "loss": 6.1308, + "step": 2186 + }, + { + "epoch": 0.751449555469656, + "grad_norm": 0.736363410949707, + "learning_rate": 8.872950840110879e-05, + "loss": 6.0674, + "step": 2187 + }, + { + "epoch": 0.7517931538032041, + "grad_norm": 0.5078322291374207, + "learning_rate": 8.8500478530923e-05, + "loss": 6.0312, + "step": 2188 + }, + { + "epoch": 0.7521367521367521, + "grad_norm": 0.564947783946991, + "learning_rate": 8.827168104137353e-05, + "loss": 6.1036, + "step": 2189 + }, + { + "epoch": 0.7524803504703003, + "grad_norm": 0.6403129696846008, + "learning_rate": 8.804311626167533e-05, + "loss": 6.0244, + "step": 2190 + }, + { + "epoch": 0.7528239488038483, + "grad_norm": 0.675612211227417, + "learning_rate": 8.781478452070912e-05, + "loss": 6.142, + "step": 2191 + }, + { + "epoch": 0.7531675471373964, + "grad_norm": 0.6955165266990662, + "learning_rate": 8.758668614701973e-05, + "loss": 5.9662, + "step": 2192 + }, + { + "epoch": 0.7535111454709444, + "grad_norm": 0.58688884973526, + "learning_rate": 8.735882146881661e-05, + "loss": 5.996, + "step": 2193 + }, + { + "epoch": 0.7538547438044926, + "grad_norm": 0.7645385265350342, + "learning_rate": 8.713119081397273e-05, + "loss": 6.0944, + "step": 2194 + }, + { + "epoch": 0.7541983421380406, + "grad_norm": 0.8334187269210815, + "learning_rate": 8.690379451002448e-05, + "loss": 6.0435, + "step": 2195 + }, + { + "epoch": 0.7545419404715887, + "grad_norm": 0.7340932488441467, + "learning_rate": 8.667663288417082e-05, + "loss": 6.26, + "step": 2196 + }, + { + "epoch": 0.7548855388051368, + "grad_norm": 0.6961662173271179, + "learning_rate": 8.644970626327329e-05, + "loss": 6.204, + "step": 2197 + }, + { + "epoch": 0.7552291371386849, + "grad_norm": 0.87010657787323, + "learning_rate": 8.622301497385507e-05, + "loss": 6.2819, + "step": 2198 + }, + { + "epoch": 0.7555727354722329, + "grad_norm": 0.9306342601776123, + "learning_rate": 8.599655934210088e-05, + "loss": 6.1762, + "step": 2199 + }, + { + "epoch": 0.755916333805781, + "grad_norm": 1.0517386198043823, + "learning_rate": 8.577033969385639e-05, + "loss": 6.3565, + "step": 2200 + }, + { + "epoch": 0.7562599321393291, + "grad_norm": 0.8607641458511353, + "learning_rate": 8.55443563546274e-05, + "loss": 5.9011, + "step": 2201 + }, + { + "epoch": 0.7566035304728772, + "grad_norm": 0.789115846157074, + "learning_rate": 8.531860964958002e-05, + "loss": 5.9841, + "step": 2202 + }, + { + "epoch": 0.7569471288064253, + "grad_norm": 0.7245513796806335, + "learning_rate": 8.509309990353973e-05, + "loss": 6.0235, + "step": 2203 + }, + { + "epoch": 0.7572907271399734, + "grad_norm": 0.5847095251083374, + "learning_rate": 8.486782744099117e-05, + "loss": 5.931, + "step": 2204 + }, + { + "epoch": 0.7576343254735215, + "grad_norm": 0.5829930901527405, + "learning_rate": 8.464279258607718e-05, + "loss": 5.9971, + "step": 2205 + }, + { + "epoch": 0.7579779238070695, + "grad_norm": 0.5951849818229675, + "learning_rate": 8.441799566259937e-05, + "loss": 6.0391, + "step": 2206 + }, + { + "epoch": 0.7583215221406177, + "grad_norm": 0.6464923024177551, + "learning_rate": 8.41934369940163e-05, + "loss": 5.9902, + "step": 2207 + }, + { + "epoch": 0.7586651204741657, + "grad_norm": 0.6821208000183105, + "learning_rate": 8.396911690344411e-05, + "loss": 5.9381, + "step": 2208 + }, + { + "epoch": 0.7590087188077138, + "grad_norm": 0.651321530342102, + "learning_rate": 8.37450357136556e-05, + "loss": 5.9518, + "step": 2209 + }, + { + "epoch": 0.7593523171412618, + "grad_norm": 0.5200663805007935, + "learning_rate": 8.352119374707978e-05, + "loss": 5.8669, + "step": 2210 + }, + { + "epoch": 0.75969591547481, + "grad_norm": 0.5066912174224854, + "learning_rate": 8.329759132580126e-05, + "loss": 5.8757, + "step": 2211 + }, + { + "epoch": 0.760039513808358, + "grad_norm": 0.5652316212654114, + "learning_rate": 8.30742287715604e-05, + "loss": 5.8752, + "step": 2212 + }, + { + "epoch": 0.7603831121419061, + "grad_norm": 0.6964173316955566, + "learning_rate": 8.285110640575199e-05, + "loss": 5.9425, + "step": 2213 + }, + { + "epoch": 0.7607267104754541, + "grad_norm": 0.5349513292312622, + "learning_rate": 8.262822454942542e-05, + "loss": 6.1252, + "step": 2214 + }, + { + "epoch": 0.7610703088090023, + "grad_norm": 0.5608907341957092, + "learning_rate": 8.240558352328406e-05, + "loss": 6.0566, + "step": 2215 + }, + { + "epoch": 0.7614139071425503, + "grad_norm": 0.6297080516815186, + "learning_rate": 8.218318364768451e-05, + "loss": 6.0614, + "step": 2216 + }, + { + "epoch": 0.7617575054760984, + "grad_norm": 0.5819994807243347, + "learning_rate": 8.196102524263666e-05, + "loss": 5.9989, + "step": 2217 + }, + { + "epoch": 0.7621011038096466, + "grad_norm": 0.463117778301239, + "learning_rate": 8.173910862780275e-05, + "loss": 6.1422, + "step": 2218 + }, + { + "epoch": 0.7624447021431946, + "grad_norm": 0.5110581517219543, + "learning_rate": 8.15174341224973e-05, + "loss": 5.9919, + "step": 2219 + }, + { + "epoch": 0.7627883004767427, + "grad_norm": 0.5026494264602661, + "learning_rate": 8.129600204568624e-05, + "loss": 6.0571, + "step": 2220 + }, + { + "epoch": 0.7631318988102908, + "grad_norm": 0.5578264594078064, + "learning_rate": 8.10748127159869e-05, + "loss": 6.0543, + "step": 2221 + }, + { + "epoch": 0.7634754971438389, + "grad_norm": 0.6005034446716309, + "learning_rate": 8.085386645166699e-05, + "loss": 6.054, + "step": 2222 + }, + { + "epoch": 0.7638190954773869, + "grad_norm": 0.5268194675445557, + "learning_rate": 8.063316357064496e-05, + "loss": 6.0657, + "step": 2223 + }, + { + "epoch": 0.764162693810935, + "grad_norm": 0.43706896901130676, + "learning_rate": 8.041270439048857e-05, + "loss": 5.9922, + "step": 2224 + }, + { + "epoch": 0.7645062921444831, + "grad_norm": 0.4229617416858673, + "learning_rate": 8.019248922841518e-05, + "loss": 5.9731, + "step": 2225 + }, + { + "epoch": 0.7648498904780312, + "grad_norm": 0.44867685437202454, + "learning_rate": 7.997251840129105e-05, + "loss": 6.084, + "step": 2226 + }, + { + "epoch": 0.7651934888115792, + "grad_norm": 0.6207936406135559, + "learning_rate": 7.975279222563086e-05, + "loss": 6.0844, + "step": 2227 + }, + { + "epoch": 0.7655370871451274, + "grad_norm": 0.5367064476013184, + "learning_rate": 7.953331101759707e-05, + "loss": 6.0374, + "step": 2228 + }, + { + "epoch": 0.7658806854786754, + "grad_norm": 0.5211548805236816, + "learning_rate": 7.931407509299982e-05, + "loss": 6.1057, + "step": 2229 + }, + { + "epoch": 0.7662242838122235, + "grad_norm": 0.4982808530330658, + "learning_rate": 7.909508476729632e-05, + "loss": 5.9777, + "step": 2230 + }, + { + "epoch": 0.7665678821457715, + "grad_norm": 0.5660802721977234, + "learning_rate": 7.887634035559036e-05, + "loss": 6.0857, + "step": 2231 + }, + { + "epoch": 0.7669114804793197, + "grad_norm": 0.6563755869865417, + "learning_rate": 7.865784217263197e-05, + "loss": 6.051, + "step": 2232 + }, + { + "epoch": 0.7672550788128678, + "grad_norm": 0.5743918418884277, + "learning_rate": 7.843959053281663e-05, + "loss": 6.0887, + "step": 2233 + }, + { + "epoch": 0.7675986771464158, + "grad_norm": 0.4557611644268036, + "learning_rate": 7.822158575018534e-05, + "loss": 6.0547, + "step": 2234 + }, + { + "epoch": 0.767942275479964, + "grad_norm": 0.6070594787597656, + "learning_rate": 7.800382813842377e-05, + "loss": 6.0169, + "step": 2235 + }, + { + "epoch": 0.768285873813512, + "grad_norm": 0.5235620141029358, + "learning_rate": 7.778631801086209e-05, + "loss": 6.086, + "step": 2236 + }, + { + "epoch": 0.7686294721470601, + "grad_norm": 0.5600701570510864, + "learning_rate": 7.756905568047393e-05, + "loss": 5.9655, + "step": 2237 + }, + { + "epoch": 0.7689730704806081, + "grad_norm": 0.6043455004692078, + "learning_rate": 7.735204145987704e-05, + "loss": 6.0636, + "step": 2238 + }, + { + "epoch": 0.7693166688141563, + "grad_norm": 0.618334949016571, + "learning_rate": 7.713527566133158e-05, + "loss": 6.1404, + "step": 2239 + }, + { + "epoch": 0.7696602671477043, + "grad_norm": 0.5381041169166565, + "learning_rate": 7.691875859674053e-05, + "loss": 6.1488, + "step": 2240 + }, + { + "epoch": 0.7700038654812524, + "grad_norm": 0.6507089734077454, + "learning_rate": 7.670249057764894e-05, + "loss": 6.1051, + "step": 2241 + }, + { + "epoch": 0.7703474638148005, + "grad_norm": 0.5733596086502075, + "learning_rate": 7.648647191524355e-05, + "loss": 6.1027, + "step": 2242 + }, + { + "epoch": 0.7706910621483486, + "grad_norm": 0.6554251313209534, + "learning_rate": 7.627070292035201e-05, + "loss": 6.012, + "step": 2243 + }, + { + "epoch": 0.7710346604818966, + "grad_norm": 0.6997339129447937, + "learning_rate": 7.605518390344333e-05, + "loss": 6.0476, + "step": 2244 + }, + { + "epoch": 0.7713782588154448, + "grad_norm": 0.5664951205253601, + "learning_rate": 7.58399151746261e-05, + "loss": 6.0055, + "step": 2245 + }, + { + "epoch": 0.7717218571489928, + "grad_norm": 0.8235125541687012, + "learning_rate": 7.56248970436493e-05, + "loss": 6.1593, + "step": 2246 + }, + { + "epoch": 0.7720654554825409, + "grad_norm": 0.7746663689613342, + "learning_rate": 7.541012981990122e-05, + "loss": 6.0848, + "step": 2247 + }, + { + "epoch": 0.772409053816089, + "grad_norm": 0.7530941367149353, + "learning_rate": 7.519561381240878e-05, + "loss": 6.2576, + "step": 2248 + }, + { + "epoch": 0.7727526521496371, + "grad_norm": 0.9261403679847717, + "learning_rate": 7.498134932983805e-05, + "loss": 6.2184, + "step": 2249 + }, + { + "epoch": 0.7730962504831852, + "grad_norm": 1.2425649166107178, + "learning_rate": 7.476733668049259e-05, + "loss": 6.1733, + "step": 2250 + }, + { + "epoch": 0.7734398488167332, + "grad_norm": 1.1316070556640625, + "learning_rate": 7.455357617231392e-05, + "loss": 5.9885, + "step": 2251 + }, + { + "epoch": 0.7737834471502814, + "grad_norm": 0.9721357822418213, + "learning_rate": 7.434006811288069e-05, + "loss": 5.9354, + "step": 2252 + }, + { + "epoch": 0.7741270454838294, + "grad_norm": 0.9695913791656494, + "learning_rate": 7.412681280940834e-05, + "loss": 5.8127, + "step": 2253 + }, + { + "epoch": 0.7744706438173775, + "grad_norm": 0.6883681416511536, + "learning_rate": 7.391381056874835e-05, + "loss": 6.0319, + "step": 2254 + }, + { + "epoch": 0.7748142421509255, + "grad_norm": 0.6744367480278015, + "learning_rate": 7.37010616973886e-05, + "loss": 6.0038, + "step": 2255 + }, + { + "epoch": 0.7751578404844737, + "grad_norm": 0.5574299097061157, + "learning_rate": 7.348856650145188e-05, + "loss": 5.9949, + "step": 2256 + }, + { + "epoch": 0.7755014388180217, + "grad_norm": 0.7782508730888367, + "learning_rate": 7.327632528669625e-05, + "loss": 5.9506, + "step": 2257 + }, + { + "epoch": 0.7758450371515698, + "grad_norm": 0.6571906805038452, + "learning_rate": 7.306433835851423e-05, + "loss": 5.9716, + "step": 2258 + }, + { + "epoch": 0.7761886354851179, + "grad_norm": 0.6952961683273315, + "learning_rate": 7.285260602193256e-05, + "loss": 5.9193, + "step": 2259 + }, + { + "epoch": 0.776532233818666, + "grad_norm": 0.6619623303413391, + "learning_rate": 7.264112858161137e-05, + "loss": 6.0133, + "step": 2260 + }, + { + "epoch": 0.776875832152214, + "grad_norm": 0.6246268153190613, + "learning_rate": 7.242990634184432e-05, + "loss": 5.9537, + "step": 2261 + }, + { + "epoch": 0.7772194304857621, + "grad_norm": 0.5367864370346069, + "learning_rate": 7.221893960655773e-05, + "loss": 5.9849, + "step": 2262 + }, + { + "epoch": 0.7775630288193103, + "grad_norm": 0.5722734928131104, + "learning_rate": 7.200822867931032e-05, + "loss": 5.9398, + "step": 2263 + }, + { + "epoch": 0.7779066271528583, + "grad_norm": 0.6350281834602356, + "learning_rate": 7.179777386329276e-05, + "loss": 6.0259, + "step": 2264 + }, + { + "epoch": 0.7782502254864064, + "grad_norm": 0.6227221488952637, + "learning_rate": 7.158757546132696e-05, + "loss": 6.0207, + "step": 2265 + }, + { + "epoch": 0.7785938238199545, + "grad_norm": 0.6056457757949829, + "learning_rate": 7.13776337758662e-05, + "loss": 6.1297, + "step": 2266 + }, + { + "epoch": 0.7789374221535026, + "grad_norm": 0.571882963180542, + "learning_rate": 7.116794910899424e-05, + "loss": 6.1116, + "step": 2267 + }, + { + "epoch": 0.7792810204870506, + "grad_norm": 0.4469764530658722, + "learning_rate": 7.095852176242503e-05, + "loss": 6.0374, + "step": 2268 + }, + { + "epoch": 0.7796246188205987, + "grad_norm": 0.6148665547370911, + "learning_rate": 7.07493520375021e-05, + "loss": 5.9959, + "step": 2269 + }, + { + "epoch": 0.7799682171541468, + "grad_norm": 0.6256452798843384, + "learning_rate": 7.05404402351987e-05, + "loss": 5.9674, + "step": 2270 + }, + { + "epoch": 0.7803118154876949, + "grad_norm": 0.6558529138565063, + "learning_rate": 7.033178665611639e-05, + "loss": 6.028, + "step": 2271 + }, + { + "epoch": 0.7806554138212429, + "grad_norm": 0.6320913434028625, + "learning_rate": 7.012339160048578e-05, + "loss": 5.9793, + "step": 2272 + }, + { + "epoch": 0.7809990121547911, + "grad_norm": 0.5069714188575745, + "learning_rate": 6.991525536816498e-05, + "loss": 6.0383, + "step": 2273 + }, + { + "epoch": 0.7813426104883391, + "grad_norm": 0.6668805480003357, + "learning_rate": 6.970737825863999e-05, + "loss": 6.1826, + "step": 2274 + }, + { + "epoch": 0.7816862088218872, + "grad_norm": 0.5856713652610779, + "learning_rate": 6.949976057102384e-05, + "loss": 5.9855, + "step": 2275 + }, + { + "epoch": 0.7820298071554352, + "grad_norm": 0.5079368948936462, + "learning_rate": 6.929240260405634e-05, + "loss": 6.1041, + "step": 2276 + }, + { + "epoch": 0.7823734054889834, + "grad_norm": 0.5541518330574036, + "learning_rate": 6.908530465610347e-05, + "loss": 6.0746, + "step": 2277 + }, + { + "epoch": 0.7827170038225315, + "grad_norm": 0.49500662088394165, + "learning_rate": 6.887846702515718e-05, + "loss": 6.0113, + "step": 2278 + }, + { + "epoch": 0.7830606021560795, + "grad_norm": 0.7125208973884583, + "learning_rate": 6.867189000883495e-05, + "loss": 5.8986, + "step": 2279 + }, + { + "epoch": 0.7834042004896277, + "grad_norm": 0.49747544527053833, + "learning_rate": 6.846557390437883e-05, + "loss": 6.203, + "step": 2280 + }, + { + "epoch": 0.7837477988231757, + "grad_norm": 0.4951108694076538, + "learning_rate": 6.825951900865612e-05, + "loss": 6.0585, + "step": 2281 + }, + { + "epoch": 0.7840913971567238, + "grad_norm": 0.6161938905715942, + "learning_rate": 6.805372561815768e-05, + "loss": 5.951, + "step": 2282 + }, + { + "epoch": 0.7844349954902718, + "grad_norm": 0.5577975511550903, + "learning_rate": 6.784819402899833e-05, + "loss": 6.0114, + "step": 2283 + }, + { + "epoch": 0.78477859382382, + "grad_norm": 0.5856155157089233, + "learning_rate": 6.764292453691622e-05, + "loss": 6.0278, + "step": 2284 + }, + { + "epoch": 0.785122192157368, + "grad_norm": 0.5486255288124084, + "learning_rate": 6.74379174372724e-05, + "loss": 6.1078, + "step": 2285 + }, + { + "epoch": 0.7854657904909161, + "grad_norm": 0.6295291185379028, + "learning_rate": 6.723317302505e-05, + "loss": 6.1005, + "step": 2286 + }, + { + "epoch": 0.7858093888244642, + "grad_norm": 0.5545480251312256, + "learning_rate": 6.702869159485481e-05, + "loss": 6.1214, + "step": 2287 + }, + { + "epoch": 0.7861529871580123, + "grad_norm": 0.5096721649169922, + "learning_rate": 6.682447344091364e-05, + "loss": 6.0126, + "step": 2288 + }, + { + "epoch": 0.7864965854915603, + "grad_norm": 0.6050777435302734, + "learning_rate": 6.66205188570747e-05, + "loss": 6.1272, + "step": 2289 + }, + { + "epoch": 0.7868401838251085, + "grad_norm": 0.5570022463798523, + "learning_rate": 6.641682813680705e-05, + "loss": 6.156, + "step": 2290 + }, + { + "epoch": 0.7871837821586565, + "grad_norm": 0.5773416757583618, + "learning_rate": 6.621340157319997e-05, + "loss": 6.0434, + "step": 2291 + }, + { + "epoch": 0.7875273804922046, + "grad_norm": 0.6278853416442871, + "learning_rate": 6.60102394589625e-05, + "loss": 6.1401, + "step": 2292 + }, + { + "epoch": 0.7878709788257527, + "grad_norm": 0.8446879386901855, + "learning_rate": 6.580734208642344e-05, + "loss": 6.2182, + "step": 2293 + }, + { + "epoch": 0.7882145771593008, + "grad_norm": 0.6405084133148193, + "learning_rate": 6.560470974753053e-05, + "loss": 6.1942, + "step": 2294 + }, + { + "epoch": 0.7885581754928489, + "grad_norm": 0.6068698763847351, + "learning_rate": 6.54023427338501e-05, + "loss": 6.1913, + "step": 2295 + }, + { + "epoch": 0.7889017738263969, + "grad_norm": 0.6595099568367004, + "learning_rate": 6.520024133656687e-05, + "loss": 6.062, + "step": 2296 + }, + { + "epoch": 0.7892453721599451, + "grad_norm": 0.7970278859138489, + "learning_rate": 6.499840584648315e-05, + "loss": 6.1699, + "step": 2297 + }, + { + "epoch": 0.7895889704934931, + "grad_norm": 0.8106145262718201, + "learning_rate": 6.479683655401875e-05, + "loss": 6.2438, + "step": 2298 + }, + { + "epoch": 0.7899325688270412, + "grad_norm": 1.0167194604873657, + "learning_rate": 6.459553374921045e-05, + "loss": 6.4, + "step": 2299 + }, + { + "epoch": 0.7902761671605892, + "grad_norm": 1.1539579629898071, + "learning_rate": 6.439449772171163e-05, + "loss": 6.2757, + "step": 2300 + }, + { + "epoch": 0.7906197654941374, + "grad_norm": 0.8855049014091492, + "learning_rate": 6.419372876079174e-05, + "loss": 6.0337, + "step": 2301 + }, + { + "epoch": 0.7909633638276854, + "grad_norm": 1.005344271659851, + "learning_rate": 6.399322715533601e-05, + "loss": 5.8928, + "step": 2302 + }, + { + "epoch": 0.7913069621612335, + "grad_norm": 0.9247390627861023, + "learning_rate": 6.379299319384471e-05, + "loss": 5.9571, + "step": 2303 + }, + { + "epoch": 0.7916505604947816, + "grad_norm": 0.7353301644325256, + "learning_rate": 6.359302716443352e-05, + "loss": 6.0767, + "step": 2304 + }, + { + "epoch": 0.7919941588283297, + "grad_norm": 0.5871204733848572, + "learning_rate": 6.339332935483206e-05, + "loss": 5.9553, + "step": 2305 + }, + { + "epoch": 0.7923377571618778, + "grad_norm": 0.5191056728363037, + "learning_rate": 6.319390005238432e-05, + "loss": 6.0189, + "step": 2306 + }, + { + "epoch": 0.7926813554954258, + "grad_norm": 0.5726730227470398, + "learning_rate": 6.299473954404788e-05, + "loss": 5.9178, + "step": 2307 + }, + { + "epoch": 0.793024953828974, + "grad_norm": 0.6066359877586365, + "learning_rate": 6.279584811639357e-05, + "loss": 5.9549, + "step": 2308 + }, + { + "epoch": 0.793368552162522, + "grad_norm": 0.5796366930007935, + "learning_rate": 6.259722605560488e-05, + "loss": 6.0092, + "step": 2309 + }, + { + "epoch": 0.7937121504960701, + "grad_norm": 0.5898374915122986, + "learning_rate": 6.23988736474779e-05, + "loss": 5.9802, + "step": 2310 + }, + { + "epoch": 0.7940557488296182, + "grad_norm": 0.7097289562225342, + "learning_rate": 6.220079117742064e-05, + "loss": 6.0136, + "step": 2311 + }, + { + "epoch": 0.7943993471631663, + "grad_norm": 0.669428288936615, + "learning_rate": 6.20029789304527e-05, + "loss": 5.8873, + "step": 2312 + }, + { + "epoch": 0.7947429454967143, + "grad_norm": 0.6120542287826538, + "learning_rate": 6.180543719120496e-05, + "loss": 6.0431, + "step": 2313 + }, + { + "epoch": 0.7950865438302624, + "grad_norm": 0.5299326777458191, + "learning_rate": 6.160816624391886e-05, + "loss": 5.9385, + "step": 2314 + }, + { + "epoch": 0.7954301421638105, + "grad_norm": 0.49728134274482727, + "learning_rate": 6.141116637244631e-05, + "loss": 6.1362, + "step": 2315 + }, + { + "epoch": 0.7957737404973586, + "grad_norm": 0.6035153269767761, + "learning_rate": 6.121443786024921e-05, + "loss": 5.9053, + "step": 2316 + }, + { + "epoch": 0.7961173388309066, + "grad_norm": 0.5510128736495972, + "learning_rate": 6.101798099039907e-05, + "loss": 6.0034, + "step": 2317 + }, + { + "epoch": 0.7964609371644548, + "grad_norm": 0.49103474617004395, + "learning_rate": 6.082179604557617e-05, + "loss": 6.0575, + "step": 2318 + }, + { + "epoch": 0.7968045354980028, + "grad_norm": 0.5344441533088684, + "learning_rate": 6.062588330807009e-05, + "loss": 6.1826, + "step": 2319 + }, + { + "epoch": 0.7971481338315509, + "grad_norm": 0.5215442776679993, + "learning_rate": 6.043024305977823e-05, + "loss": 6.0573, + "step": 2320 + }, + { + "epoch": 0.797491732165099, + "grad_norm": 0.4892186224460602, + "learning_rate": 6.023487558220614e-05, + "loss": 6.0188, + "step": 2321 + }, + { + "epoch": 0.7978353304986471, + "grad_norm": 0.549307644367218, + "learning_rate": 6.003978115646683e-05, + "loss": 6.0237, + "step": 2322 + }, + { + "epoch": 0.7981789288321952, + "grad_norm": 0.5364143252372742, + "learning_rate": 5.984496006328055e-05, + "loss": 6.0695, + "step": 2323 + }, + { + "epoch": 0.7985225271657432, + "grad_norm": 0.4920170307159424, + "learning_rate": 5.965041258297396e-05, + "loss": 5.9426, + "step": 2324 + }, + { + "epoch": 0.7988661254992914, + "grad_norm": 0.6157858371734619, + "learning_rate": 5.94561389954803e-05, + "loss": 6.0046, + "step": 2325 + }, + { + "epoch": 0.7992097238328394, + "grad_norm": 0.47127848863601685, + "learning_rate": 5.926213958033855e-05, + "loss": 6.125, + "step": 2326 + }, + { + "epoch": 0.7995533221663875, + "grad_norm": 0.4696691632270813, + "learning_rate": 5.9068414616693264e-05, + "loss": 6.0563, + "step": 2327 + }, + { + "epoch": 0.7998969204999355, + "grad_norm": 0.5054667592048645, + "learning_rate": 5.887496438329412e-05, + "loss": 6.0715, + "step": 2328 + }, + { + "epoch": 0.8002405188334837, + "grad_norm": 0.5439390540122986, + "learning_rate": 5.868178915849526e-05, + "loss": 5.9687, + "step": 2329 + }, + { + "epoch": 0.8005841171670317, + "grad_norm": 0.5408337712287903, + "learning_rate": 5.848888922025553e-05, + "loss": 5.9605, + "step": 2330 + }, + { + "epoch": 0.8009277155005798, + "grad_norm": 0.6336498856544495, + "learning_rate": 5.82962648461372e-05, + "loss": 5.9145, + "step": 2331 + }, + { + "epoch": 0.8012713138341279, + "grad_norm": 0.46218758821487427, + "learning_rate": 5.810391631330639e-05, + "loss": 6.0208, + "step": 2332 + }, + { + "epoch": 0.801614912167676, + "grad_norm": 0.495760053396225, + "learning_rate": 5.791184389853213e-05, + "loss": 6.0446, + "step": 2333 + }, + { + "epoch": 0.801958510501224, + "grad_norm": 0.6889287829399109, + "learning_rate": 5.77200478781863e-05, + "loss": 5.9667, + "step": 2334 + }, + { + "epoch": 0.8023021088347722, + "grad_norm": 0.5764407515525818, + "learning_rate": 5.752852852824275e-05, + "loss": 6.0924, + "step": 2335 + }, + { + "epoch": 0.8026457071683203, + "grad_norm": 0.5324522256851196, + "learning_rate": 5.733728612427772e-05, + "loss": 6.0446, + "step": 2336 + }, + { + "epoch": 0.8029893055018683, + "grad_norm": 0.5219273567199707, + "learning_rate": 5.7146320941468515e-05, + "loss": 6.0894, + "step": 2337 + }, + { + "epoch": 0.8033329038354164, + "grad_norm": 0.6107932329177856, + "learning_rate": 5.695563325459377e-05, + "loss": 6.0145, + "step": 2338 + }, + { + "epoch": 0.8036765021689645, + "grad_norm": 0.74781733751297, + "learning_rate": 5.6765223338032804e-05, + "loss": 5.9561, + "step": 2339 + }, + { + "epoch": 0.8040201005025126, + "grad_norm": 0.5779476165771484, + "learning_rate": 5.6575091465765313e-05, + "loss": 6.1911, + "step": 2340 + }, + { + "epoch": 0.8043636988360606, + "grad_norm": 0.5426329970359802, + "learning_rate": 5.6385237911370654e-05, + "loss": 6.1045, + "step": 2341 + }, + { + "epoch": 0.8047072971696088, + "grad_norm": 0.6034143567085266, + "learning_rate": 5.6195662948028024e-05, + "loss": 6.0011, + "step": 2342 + }, + { + "epoch": 0.8050508955031568, + "grad_norm": 0.6380503177642822, + "learning_rate": 5.600636684851562e-05, + "loss": 6.0992, + "step": 2343 + }, + { + "epoch": 0.8053944938367049, + "grad_norm": 0.6098598837852478, + "learning_rate": 5.5817349885210395e-05, + "loss": 6.1604, + "step": 2344 + }, + { + "epoch": 0.8057380921702529, + "grad_norm": 0.7238516211509705, + "learning_rate": 5.562861233008773e-05, + "loss": 6.1236, + "step": 2345 + }, + { + "epoch": 0.8060816905038011, + "grad_norm": 0.6985032558441162, + "learning_rate": 5.5440154454720726e-05, + "loss": 6.0547, + "step": 2346 + }, + { + "epoch": 0.8064252888373491, + "grad_norm": 0.8203953504562378, + "learning_rate": 5.525197653028033e-05, + "loss": 6.0589, + "step": 2347 + }, + { + "epoch": 0.8067688871708972, + "grad_norm": 0.9466792941093445, + "learning_rate": 5.506407882753456e-05, + "loss": 6.2681, + "step": 2348 + }, + { + "epoch": 0.8071124855044453, + "grad_norm": 0.886099100112915, + "learning_rate": 5.4876461616848256e-05, + "loss": 6.2795, + "step": 2349 + }, + { + "epoch": 0.8074560838379934, + "grad_norm": 1.1105183362960815, + "learning_rate": 5.4689125168182504e-05, + "loss": 5.997, + "step": 2350 + }, + { + "epoch": 0.8077996821715415, + "grad_norm": 0.8322456479072571, + "learning_rate": 5.450206975109473e-05, + "loss": 6.0151, + "step": 2351 + }, + { + "epoch": 0.8081432805050895, + "grad_norm": 0.8582592010498047, + "learning_rate": 5.431529563473758e-05, + "loss": 6.0131, + "step": 2352 + }, + { + "epoch": 0.8084868788386377, + "grad_norm": 0.7677584290504456, + "learning_rate": 5.412880308785928e-05, + "loss": 5.9629, + "step": 2353 + }, + { + "epoch": 0.8088304771721857, + "grad_norm": 0.6369113922119141, + "learning_rate": 5.394259237880272e-05, + "loss": 6.0089, + "step": 2354 + }, + { + "epoch": 0.8091740755057338, + "grad_norm": 0.5916951298713684, + "learning_rate": 5.375666377550534e-05, + "loss": 6.0501, + "step": 2355 + }, + { + "epoch": 0.8095176738392819, + "grad_norm": 0.5984398722648621, + "learning_rate": 5.357101754549864e-05, + "loss": 5.9582, + "step": 2356 + }, + { + "epoch": 0.80986127217283, + "grad_norm": 0.6078851222991943, + "learning_rate": 5.338565395590772e-05, + "loss": 5.9114, + "step": 2357 + }, + { + "epoch": 0.810204870506378, + "grad_norm": 0.4389530420303345, + "learning_rate": 5.320057327345112e-05, + "loss": 5.9859, + "step": 2358 + }, + { + "epoch": 0.8105484688399262, + "grad_norm": 0.6114437580108643, + "learning_rate": 5.301577576444025e-05, + "loss": 6.1864, + "step": 2359 + }, + { + "epoch": 0.8108920671734742, + "grad_norm": 0.6425684690475464, + "learning_rate": 5.2831261694779144e-05, + "loss": 5.974, + "step": 2360 + }, + { + "epoch": 0.8112356655070223, + "grad_norm": 0.5639946460723877, + "learning_rate": 5.264703132996376e-05, + "loss": 5.9636, + "step": 2361 + }, + { + "epoch": 0.8115792638405703, + "grad_norm": 0.5804059505462646, + "learning_rate": 5.24630849350822e-05, + "loss": 5.9089, + "step": 2362 + }, + { + "epoch": 0.8119228621741185, + "grad_norm": 0.5132018327713013, + "learning_rate": 5.2279422774813624e-05, + "loss": 5.8329, + "step": 2363 + }, + { + "epoch": 0.8122664605076665, + "grad_norm": 0.5105260014533997, + "learning_rate": 5.2096045113428385e-05, + "loss": 6.1746, + "step": 2364 + }, + { + "epoch": 0.8126100588412146, + "grad_norm": 0.5083038806915283, + "learning_rate": 5.191295221478745e-05, + "loss": 6.044, + "step": 2365 + }, + { + "epoch": 0.8129536571747628, + "grad_norm": 0.7395703792572021, + "learning_rate": 5.1730144342342076e-05, + "loss": 6.0428, + "step": 2366 + }, + { + "epoch": 0.8132972555083108, + "grad_norm": 0.4767551124095917, + "learning_rate": 5.1547621759133165e-05, + "loss": 6.0285, + "step": 2367 + }, + { + "epoch": 0.8136408538418589, + "grad_norm": 0.4748813807964325, + "learning_rate": 5.136538472779156e-05, + "loss": 6.1975, + "step": 2368 + }, + { + "epoch": 0.8139844521754069, + "grad_norm": 0.5720754861831665, + "learning_rate": 5.118343351053681e-05, + "loss": 6.0841, + "step": 2369 + }, + { + "epoch": 0.8143280505089551, + "grad_norm": 0.49053552746772766, + "learning_rate": 5.100176836917736e-05, + "loss": 5.9318, + "step": 2370 + }, + { + "epoch": 0.8146716488425031, + "grad_norm": 0.5451865792274475, + "learning_rate": 5.0820389565110095e-05, + "loss": 6.0171, + "step": 2371 + }, + { + "epoch": 0.8150152471760512, + "grad_norm": 0.4565944969654083, + "learning_rate": 5.063929735931985e-05, + "loss": 6.1135, + "step": 2372 + }, + { + "epoch": 0.8153588455095993, + "grad_norm": 0.5404878854751587, + "learning_rate": 5.045849201237893e-05, + "loss": 6.024, + "step": 2373 + }, + { + "epoch": 0.8157024438431474, + "grad_norm": 0.46480605006217957, + "learning_rate": 5.027797378444707e-05, + "loss": 6.0778, + "step": 2374 + }, + { + "epoch": 0.8160460421766954, + "grad_norm": 0.5615720748901367, + "learning_rate": 5.0097742935270776e-05, + "loss": 6.095, + "step": 2375 + }, + { + "epoch": 0.8163896405102435, + "grad_norm": 0.5113908648490906, + "learning_rate": 4.991779972418315e-05, + "loss": 6.0162, + "step": 2376 + }, + { + "epoch": 0.8167332388437916, + "grad_norm": 0.4773457646369934, + "learning_rate": 4.97381444101033e-05, + "loss": 6.0966, + "step": 2377 + }, + { + "epoch": 0.8170768371773397, + "grad_norm": 0.4736880660057068, + "learning_rate": 4.9558777251536043e-05, + "loss": 5.9657, + "step": 2378 + }, + { + "epoch": 0.8174204355108877, + "grad_norm": 0.49745383858680725, + "learning_rate": 4.9379698506571705e-05, + "loss": 6.0504, + "step": 2379 + }, + { + "epoch": 0.8177640338444359, + "grad_norm": 0.49785807728767395, + "learning_rate": 4.920090843288558e-05, + "loss": 5.9924, + "step": 2380 + }, + { + "epoch": 0.818107632177984, + "grad_norm": 0.4761577546596527, + "learning_rate": 4.902240728773749e-05, + "loss": 6.0144, + "step": 2381 + }, + { + "epoch": 0.818451230511532, + "grad_norm": 0.5067473649978638, + "learning_rate": 4.884419532797169e-05, + "loss": 5.9129, + "step": 2382 + }, + { + "epoch": 0.8187948288450801, + "grad_norm": 0.5061054229736328, + "learning_rate": 4.866627281001626e-05, + "loss": 6.008, + "step": 2383 + }, + { + "epoch": 0.8191384271786282, + "grad_norm": 0.5565149784088135, + "learning_rate": 4.8488639989882596e-05, + "loss": 6.1808, + "step": 2384 + }, + { + "epoch": 0.8194820255121763, + "grad_norm": 0.5727932453155518, + "learning_rate": 4.8311297123165676e-05, + "loss": 6.1582, + "step": 2385 + }, + { + "epoch": 0.8198256238457243, + "grad_norm": 0.49524804949760437, + "learning_rate": 4.813424446504283e-05, + "loss": 6.0394, + "step": 2386 + }, + { + "epoch": 0.8201692221792725, + "grad_norm": 0.5881836414337158, + "learning_rate": 4.7957482270274106e-05, + "loss": 5.9767, + "step": 2387 + }, + { + "epoch": 0.8205128205128205, + "grad_norm": 0.5645787715911865, + "learning_rate": 4.778101079320152e-05, + "loss": 6.1487, + "step": 2388 + }, + { + "epoch": 0.8208564188463686, + "grad_norm": 0.5424336194992065, + "learning_rate": 4.760483028774868e-05, + "loss": 6.0081, + "step": 2389 + }, + { + "epoch": 0.8212000171799166, + "grad_norm": 0.6262788772583008, + "learning_rate": 4.742894100742062e-05, + "loss": 6.1402, + "step": 2390 + }, + { + "epoch": 0.8215436155134648, + "grad_norm": 0.6485010385513306, + "learning_rate": 4.725334320530333e-05, + "loss": 6.0652, + "step": 2391 + }, + { + "epoch": 0.8218872138470128, + "grad_norm": 0.657599925994873, + "learning_rate": 4.707803713406344e-05, + "loss": 6.1892, + "step": 2392 + }, + { + "epoch": 0.8222308121805609, + "grad_norm": 0.6848873496055603, + "learning_rate": 4.6903023045947544e-05, + "loss": 6.0109, + "step": 2393 + }, + { + "epoch": 0.822574410514109, + "grad_norm": 0.6060543060302734, + "learning_rate": 4.672830119278257e-05, + "loss": 6.0803, + "step": 2394 + }, + { + "epoch": 0.8229180088476571, + "grad_norm": 0.681524395942688, + "learning_rate": 4.655387182597445e-05, + "loss": 6.1263, + "step": 2395 + }, + { + "epoch": 0.8232616071812052, + "grad_norm": 0.689354419708252, + "learning_rate": 4.6379735196508596e-05, + "loss": 6.225, + "step": 2396 + }, + { + "epoch": 0.8236052055147532, + "grad_norm": 0.7622163891792297, + "learning_rate": 4.620589155494911e-05, + "loss": 6.2745, + "step": 2397 + }, + { + "epoch": 0.8239488038483014, + "grad_norm": 0.9404433369636536, + "learning_rate": 4.6032341151438536e-05, + "loss": 6.1798, + "step": 2398 + }, + { + "epoch": 0.8242924021818494, + "grad_norm": 0.8369949460029602, + "learning_rate": 4.585908423569724e-05, + "loss": 6.2033, + "step": 2399 + }, + { + "epoch": 0.8246360005153975, + "grad_norm": 1.1605521440505981, + "learning_rate": 4.5686121057023797e-05, + "loss": 6.1901, + "step": 2400 + }, + { + "epoch": 0.8249795988489456, + "grad_norm": 0.7132769823074341, + "learning_rate": 4.551345186429362e-05, + "loss": 5.9783, + "step": 2401 + }, + { + "epoch": 0.8253231971824937, + "grad_norm": 0.6558107137680054, + "learning_rate": 4.534107690595937e-05, + "loss": 5.9746, + "step": 2402 + }, + { + "epoch": 0.8256667955160417, + "grad_norm": 0.6440469026565552, + "learning_rate": 4.516899643005032e-05, + "loss": 6.0365, + "step": 2403 + }, + { + "epoch": 0.8260103938495899, + "grad_norm": 0.6780853271484375, + "learning_rate": 4.499721068417198e-05, + "loss": 5.869, + "step": 2404 + }, + { + "epoch": 0.8263539921831379, + "grad_norm": 0.5479860305786133, + "learning_rate": 4.482571991550566e-05, + "loss": 6.0475, + "step": 2405 + }, + { + "epoch": 0.826697590516686, + "grad_norm": 0.4628962576389313, + "learning_rate": 4.4654524370808415e-05, + "loss": 5.9255, + "step": 2406 + }, + { + "epoch": 0.827041188850234, + "grad_norm": 0.4671342670917511, + "learning_rate": 4.4483624296412425e-05, + "loss": 5.908, + "step": 2407 + }, + { + "epoch": 0.8273847871837822, + "grad_norm": 0.5757843255996704, + "learning_rate": 4.4313019938224703e-05, + "loss": 5.8512, + "step": 2408 + }, + { + "epoch": 0.8277283855173302, + "grad_norm": 0.5119126439094543, + "learning_rate": 4.414271154172686e-05, + "loss": 6.0938, + "step": 2409 + }, + { + "epoch": 0.8280719838508783, + "grad_norm": 0.49391672015190125, + "learning_rate": 4.3972699351974374e-05, + "loss": 5.9021, + "step": 2410 + }, + { + "epoch": 0.8284155821844265, + "grad_norm": 0.5547550916671753, + "learning_rate": 4.380298361359697e-05, + "loss": 6.0918, + "step": 2411 + }, + { + "epoch": 0.8287591805179745, + "grad_norm": 0.555704653263092, + "learning_rate": 4.363356457079734e-05, + "loss": 6.0062, + "step": 2412 + }, + { + "epoch": 0.8291027788515226, + "grad_norm": 0.6617487668991089, + "learning_rate": 4.346444246735151e-05, + "loss": 5.8591, + "step": 2413 + }, + { + "epoch": 0.8294463771850706, + "grad_norm": 0.5502761602401733, + "learning_rate": 4.329561754660827e-05, + "loss": 5.9567, + "step": 2414 + }, + { + "epoch": 0.8297899755186188, + "grad_norm": 0.5624343752861023, + "learning_rate": 4.312709005148871e-05, + "loss": 5.9204, + "step": 2415 + }, + { + "epoch": 0.8301335738521668, + "grad_norm": 0.5430840849876404, + "learning_rate": 4.295886022448583e-05, + "loss": 6.0215, + "step": 2416 + }, + { + "epoch": 0.8304771721857149, + "grad_norm": 0.49531492590904236, + "learning_rate": 4.279092830766471e-05, + "loss": 6.0789, + "step": 2417 + }, + { + "epoch": 0.830820770519263, + "grad_norm": 0.5341028571128845, + "learning_rate": 4.262329454266131e-05, + "loss": 5.973, + "step": 2418 + }, + { + "epoch": 0.8311643688528111, + "grad_norm": 0.4793427884578705, + "learning_rate": 4.2455959170682874e-05, + "loss": 5.9221, + "step": 2419 + }, + { + "epoch": 0.8315079671863591, + "grad_norm": 0.5773627161979675, + "learning_rate": 4.228892243250726e-05, + "loss": 5.9938, + "step": 2420 + }, + { + "epoch": 0.8318515655199072, + "grad_norm": 0.5644697546958923, + "learning_rate": 4.212218456848243e-05, + "loss": 6.0866, + "step": 2421 + }, + { + "epoch": 0.8321951638534553, + "grad_norm": 0.5478450059890747, + "learning_rate": 4.195574581852654e-05, + "loss": 5.9969, + "step": 2422 + }, + { + "epoch": 0.8325387621870034, + "grad_norm": 0.4832424819469452, + "learning_rate": 4.178960642212723e-05, + "loss": 5.9835, + "step": 2423 + }, + { + "epoch": 0.8328823605205514, + "grad_norm": 0.5325506925582886, + "learning_rate": 4.162376661834147e-05, + "loss": 6.0678, + "step": 2424 + }, + { + "epoch": 0.8332259588540996, + "grad_norm": 0.4787379503250122, + "learning_rate": 4.145822664579491e-05, + "loss": 6.122, + "step": 2425 + }, + { + "epoch": 0.8335695571876477, + "grad_norm": 0.45453977584838867, + "learning_rate": 4.129298674268226e-05, + "loss": 6.0835, + "step": 2426 + }, + { + "epoch": 0.8339131555211957, + "grad_norm": 0.6428323984146118, + "learning_rate": 4.112804714676593e-05, + "loss": 5.9736, + "step": 2427 + }, + { + "epoch": 0.8342567538547438, + "grad_norm": 0.49018287658691406, + "learning_rate": 4.096340809537655e-05, + "loss": 5.9848, + "step": 2428 + }, + { + "epoch": 0.8346003521882919, + "grad_norm": 0.5041020512580872, + "learning_rate": 4.0799069825412176e-05, + "loss": 6.096, + "step": 2429 + }, + { + "epoch": 0.83494395052184, + "grad_norm": 0.4534924030303955, + "learning_rate": 4.06350325733382e-05, + "loss": 6.0244, + "step": 2430 + }, + { + "epoch": 0.835287548855388, + "grad_norm": 0.45006832480430603, + "learning_rate": 4.047129657518658e-05, + "loss": 6.0738, + "step": 2431 + }, + { + "epoch": 0.8356311471889362, + "grad_norm": 0.5864267945289612, + "learning_rate": 4.030786206655626e-05, + "loss": 6.0116, + "step": 2432 + }, + { + "epoch": 0.8359747455224842, + "grad_norm": 0.49015316367149353, + "learning_rate": 4.014472928261193e-05, + "loss": 6.035, + "step": 2433 + }, + { + "epoch": 0.8363183438560323, + "grad_norm": 0.6772815585136414, + "learning_rate": 3.998189845808437e-05, + "loss": 6.1424, + "step": 2434 + }, + { + "epoch": 0.8366619421895803, + "grad_norm": 0.5230357646942139, + "learning_rate": 3.98193698272698e-05, + "loss": 6.0487, + "step": 2435 + }, + { + "epoch": 0.8370055405231285, + "grad_norm": 0.5348901748657227, + "learning_rate": 3.9657143624029665e-05, + "loss": 6.0033, + "step": 2436 + }, + { + "epoch": 0.8373491388566765, + "grad_norm": 0.5133422613143921, + "learning_rate": 3.9495220081790297e-05, + "loss": 6.1628, + "step": 2437 + }, + { + "epoch": 0.8376927371902246, + "grad_norm": 0.4899173974990845, + "learning_rate": 3.9333599433542284e-05, + "loss": 6.0278, + "step": 2438 + }, + { + "epoch": 0.8380363355237727, + "grad_norm": 0.4960936903953552, + "learning_rate": 3.9172281911840636e-05, + "loss": 6.0515, + "step": 2439 + }, + { + "epoch": 0.8383799338573208, + "grad_norm": 0.6296806931495667, + "learning_rate": 3.901126774880412e-05, + "loss": 6.104, + "step": 2440 + }, + { + "epoch": 0.8387235321908689, + "grad_norm": 0.5749461054801941, + "learning_rate": 3.885055717611505e-05, + "loss": 6.0408, + "step": 2441 + }, + { + "epoch": 0.839067130524417, + "grad_norm": 0.5432246327400208, + "learning_rate": 3.869015042501864e-05, + "loss": 6.146, + "step": 2442 + }, + { + "epoch": 0.8394107288579651, + "grad_norm": 0.6659320592880249, + "learning_rate": 3.85300477263234e-05, + "loss": 6.2118, + "step": 2443 + }, + { + "epoch": 0.8397543271915131, + "grad_norm": 0.6398926973342896, + "learning_rate": 3.8370249310399955e-05, + "loss": 6.1283, + "step": 2444 + }, + { + "epoch": 0.8400979255250612, + "grad_norm": 0.6013005375862122, + "learning_rate": 3.821075540718122e-05, + "loss": 5.9936, + "step": 2445 + }, + { + "epoch": 0.8404415238586093, + "grad_norm": 0.7379624247550964, + "learning_rate": 3.805156624616199e-05, + "loss": 6.2026, + "step": 2446 + }, + { + "epoch": 0.8407851221921574, + "grad_norm": 0.7793958187103271, + "learning_rate": 3.789268205639859e-05, + "loss": 6.1806, + "step": 2447 + }, + { + "epoch": 0.8411287205257054, + "grad_norm": 0.8797317147254944, + "learning_rate": 3.773410306650832e-05, + "loss": 6.1008, + "step": 2448 + }, + { + "epoch": 0.8414723188592536, + "grad_norm": 1.1056629419326782, + "learning_rate": 3.757582950466967e-05, + "loss": 6.2515, + "step": 2449 + }, + { + "epoch": 0.8418159171928016, + "grad_norm": 1.1044436693191528, + "learning_rate": 3.7417861598621345e-05, + "loss": 6.3385, + "step": 2450 + }, + { + "epoch": 0.8421595155263497, + "grad_norm": 0.6891606450080872, + "learning_rate": 3.72601995756624e-05, + "loss": 5.9451, + "step": 2451 + }, + { + "epoch": 0.8425031138598977, + "grad_norm": 0.7626943588256836, + "learning_rate": 3.710284366265168e-05, + "loss": 6.032, + "step": 2452 + }, + { + "epoch": 0.8428467121934459, + "grad_norm": 0.6951205730438232, + "learning_rate": 3.69457940860077e-05, + "loss": 5.9814, + "step": 2453 + }, + { + "epoch": 0.8431903105269939, + "grad_norm": 0.6206390857696533, + "learning_rate": 3.6789051071708016e-05, + "loss": 5.9554, + "step": 2454 + }, + { + "epoch": 0.843533908860542, + "grad_norm": 0.6567794680595398, + "learning_rate": 3.6632614845289154e-05, + "loss": 5.8821, + "step": 2455 + }, + { + "epoch": 0.8438775071940902, + "grad_norm": 0.5018622875213623, + "learning_rate": 3.64764856318463e-05, + "loss": 5.9425, + "step": 2456 + }, + { + "epoch": 0.8442211055276382, + "grad_norm": 0.5784042477607727, + "learning_rate": 3.632066365603259e-05, + "loss": 5.925, + "step": 2457 + }, + { + "epoch": 0.8445647038611863, + "grad_norm": 0.5264183878898621, + "learning_rate": 3.616514914205954e-05, + "loss": 5.9962, + "step": 2458 + }, + { + "epoch": 0.8449083021947343, + "grad_norm": 0.49773308634757996, + "learning_rate": 3.600994231369578e-05, + "loss": 6.0242, + "step": 2459 + }, + { + "epoch": 0.8452519005282825, + "grad_norm": 0.4520735740661621, + "learning_rate": 3.585504339426754e-05, + "loss": 5.9737, + "step": 2460 + }, + { + "epoch": 0.8455954988618305, + "grad_norm": 0.4381536841392517, + "learning_rate": 3.5700452606657855e-05, + "loss": 6.0548, + "step": 2461 + }, + { + "epoch": 0.8459390971953786, + "grad_norm": 0.5472382307052612, + "learning_rate": 3.5546170173306444e-05, + "loss": 6.0331, + "step": 2462 + }, + { + "epoch": 0.8462826955289267, + "grad_norm": 0.5395277738571167, + "learning_rate": 3.53921963162093e-05, + "loss": 5.9856, + "step": 2463 + }, + { + "epoch": 0.8466262938624748, + "grad_norm": 0.5448597073554993, + "learning_rate": 3.5238531256918506e-05, + "loss": 6.0873, + "step": 2464 + }, + { + "epoch": 0.8469698921960228, + "grad_norm": 0.5200133323669434, + "learning_rate": 3.5085175216541614e-05, + "loss": 6.0775, + "step": 2465 + }, + { + "epoch": 0.847313490529571, + "grad_norm": 0.5575097799301147, + "learning_rate": 3.493212841574173e-05, + "loss": 6.0285, + "step": 2466 + }, + { + "epoch": 0.847657088863119, + "grad_norm": 0.5278059840202332, + "learning_rate": 3.4779391074736905e-05, + "loss": 6.0, + "step": 2467 + }, + { + "epoch": 0.8480006871966671, + "grad_norm": 0.5051273107528687, + "learning_rate": 3.462696341329996e-05, + "loss": 5.9783, + "step": 2468 + }, + { + "epoch": 0.8483442855302151, + "grad_norm": 0.45483526587486267, + "learning_rate": 3.4474845650758094e-05, + "loss": 6.0531, + "step": 2469 + }, + { + "epoch": 0.8486878838637633, + "grad_norm": 0.5154264569282532, + "learning_rate": 3.432303800599254e-05, + "loss": 6.053, + "step": 2470 + }, + { + "epoch": 0.8490314821973114, + "grad_norm": 0.4888317584991455, + "learning_rate": 3.4171540697438356e-05, + "loss": 6.1426, + "step": 2471 + }, + { + "epoch": 0.8493750805308594, + "grad_norm": 0.5659269690513611, + "learning_rate": 3.4020353943084087e-05, + "loss": 5.9778, + "step": 2472 + }, + { + "epoch": 0.8497186788644076, + "grad_norm": 0.4910965859889984, + "learning_rate": 3.386947796047144e-05, + "loss": 6.0154, + "step": 2473 + }, + { + "epoch": 0.8500622771979556, + "grad_norm": 0.5661794543266296, + "learning_rate": 3.371891296669474e-05, + "loss": 6.0507, + "step": 2474 + }, + { + "epoch": 0.8504058755315037, + "grad_norm": 0.4793378710746765, + "learning_rate": 3.356865917840124e-05, + "loss": 6.1263, + "step": 2475 + }, + { + "epoch": 0.8507494738650517, + "grad_norm": 0.5714482069015503, + "learning_rate": 3.3418716811789956e-05, + "loss": 6.0975, + "step": 2476 + }, + { + "epoch": 0.8510930721985999, + "grad_norm": 0.5254719257354736, + "learning_rate": 3.326908608261212e-05, + "loss": 6.033, + "step": 2477 + }, + { + "epoch": 0.8514366705321479, + "grad_norm": 0.40919145941734314, + "learning_rate": 3.311976720617038e-05, + "loss": 5.9691, + "step": 2478 + }, + { + "epoch": 0.851780268865696, + "grad_norm": 0.5050671100616455, + "learning_rate": 3.297076039731883e-05, + "loss": 6.1006, + "step": 2479 + }, + { + "epoch": 0.852123867199244, + "grad_norm": 0.49542513489723206, + "learning_rate": 3.2822065870462214e-05, + "loss": 6.0539, + "step": 2480 + }, + { + "epoch": 0.8524674655327922, + "grad_norm": 0.5702757239341736, + "learning_rate": 3.2673683839556376e-05, + "loss": 6.1042, + "step": 2481 + }, + { + "epoch": 0.8528110638663402, + "grad_norm": 0.5085723400115967, + "learning_rate": 3.252561451810712e-05, + "loss": 5.9759, + "step": 2482 + }, + { + "epoch": 0.8531546621998883, + "grad_norm": 0.5009546279907227, + "learning_rate": 3.237785811917049e-05, + "loss": 6.0489, + "step": 2483 + }, + { + "epoch": 0.8534982605334364, + "grad_norm": 0.489520400762558, + "learning_rate": 3.223041485535225e-05, + "loss": 5.9973, + "step": 2484 + }, + { + "epoch": 0.8538418588669845, + "grad_norm": 0.5538635849952698, + "learning_rate": 3.208328493880763e-05, + "loss": 6.1599, + "step": 2485 + }, + { + "epoch": 0.8541854572005326, + "grad_norm": 0.5288501977920532, + "learning_rate": 3.19364685812408e-05, + "loss": 6.0023, + "step": 2486 + }, + { + "epoch": 0.8545290555340807, + "grad_norm": 0.5383173823356628, + "learning_rate": 3.178996599390499e-05, + "loss": 6.0028, + "step": 2487 + }, + { + "epoch": 0.8548726538676288, + "grad_norm": 0.5179246664047241, + "learning_rate": 3.164377738760182e-05, + "loss": 6.1045, + "step": 2488 + }, + { + "epoch": 0.8552162522011768, + "grad_norm": 0.5900654792785645, + "learning_rate": 3.149790297268107e-05, + "loss": 6.1438, + "step": 2489 + }, + { + "epoch": 0.8555598505347249, + "grad_norm": 0.6226531863212585, + "learning_rate": 3.135234295904066e-05, + "loss": 6.0877, + "step": 2490 + }, + { + "epoch": 0.855903448868273, + "grad_norm": 0.639117419719696, + "learning_rate": 3.1207097556125777e-05, + "loss": 6.0413, + "step": 2491 + }, + { + "epoch": 0.8562470472018211, + "grad_norm": 0.6743376851081848, + "learning_rate": 3.106216697292932e-05, + "loss": 6.1535, + "step": 2492 + }, + { + "epoch": 0.8565906455353691, + "grad_norm": 0.6806616187095642, + "learning_rate": 3.0917551417990854e-05, + "loss": 6.0679, + "step": 2493 + }, + { + "epoch": 0.8569342438689173, + "grad_norm": 0.6853029131889343, + "learning_rate": 3.0773251099396773e-05, + "loss": 6.0792, + "step": 2494 + }, + { + "epoch": 0.8572778422024653, + "grad_norm": 0.6363233923912048, + "learning_rate": 3.062926622477996e-05, + "loss": 6.1392, + "step": 2495 + }, + { + "epoch": 0.8576214405360134, + "grad_norm": 0.6565764546394348, + "learning_rate": 3.0485597001319366e-05, + "loss": 6.0897, + "step": 2496 + }, + { + "epoch": 0.8579650388695614, + "grad_norm": 0.8439285755157471, + "learning_rate": 3.0342243635739593e-05, + "loss": 6.0918, + "step": 2497 + }, + { + "epoch": 0.8583086372031096, + "grad_norm": 0.7531304955482483, + "learning_rate": 3.0199206334310948e-05, + "loss": 6.1271, + "step": 2498 + }, + { + "epoch": 0.8586522355366576, + "grad_norm": 0.8806336522102356, + "learning_rate": 3.0056485302848934e-05, + "loss": 6.2187, + "step": 2499 + }, + { + "epoch": 0.8589958338702057, + "grad_norm": 1.2116683721542358, + "learning_rate": 2.9914080746713896e-05, + "loss": 6.2619, + "step": 2500 + }, + { + "epoch": 0.8589958338702057, + "eval_loss": 6.026234149932861, + "eval_runtime": 724.3408, + "eval_samples_per_second": 25.713, + "eval_steps_per_second": 6.429, + "step": 2500 + }, + { + "epoch": 0.8593394322037539, + "grad_norm": 0.7180514335632324, + "learning_rate": 2.9771992870810894e-05, + "loss": 5.8551, + "step": 2501 + }, + { + "epoch": 0.8596830305373019, + "grad_norm": 0.6997862458229065, + "learning_rate": 2.963022187958922e-05, + "loss": 5.9469, + "step": 2502 + }, + { + "epoch": 0.86002662887085, + "grad_norm": 0.7439790368080139, + "learning_rate": 2.9488767977042253e-05, + "loss": 5.9154, + "step": 2503 + }, + { + "epoch": 0.860370227204398, + "grad_norm": 0.670836865901947, + "learning_rate": 2.9347631366707124e-05, + "loss": 6.1126, + "step": 2504 + }, + { + "epoch": 0.8607138255379462, + "grad_norm": 0.664583683013916, + "learning_rate": 2.9206812251664492e-05, + "loss": 5.935, + "step": 2505 + }, + { + "epoch": 0.8610574238714942, + "grad_norm": 0.5612172484397888, + "learning_rate": 2.9066310834537867e-05, + "loss": 6.044, + "step": 2506 + }, + { + "epoch": 0.8614010222050423, + "grad_norm": 0.5193923115730286, + "learning_rate": 2.892612731749414e-05, + "loss": 5.9687, + "step": 2507 + }, + { + "epoch": 0.8617446205385904, + "grad_norm": 0.4159911572933197, + "learning_rate": 2.8786261902242232e-05, + "loss": 5.9301, + "step": 2508 + }, + { + "epoch": 0.8620882188721385, + "grad_norm": 0.442753404378891, + "learning_rate": 2.8646714790033752e-05, + "loss": 5.9968, + "step": 2509 + }, + { + "epoch": 0.8624318172056865, + "grad_norm": 0.5828900337219238, + "learning_rate": 2.8507486181662075e-05, + "loss": 5.9093, + "step": 2510 + }, + { + "epoch": 0.8627754155392346, + "grad_norm": 0.6334620118141174, + "learning_rate": 2.8368576277462422e-05, + "loss": 5.9837, + "step": 2511 + }, + { + "epoch": 0.8631190138727827, + "grad_norm": 0.5630239844322205, + "learning_rate": 2.822998527731127e-05, + "loss": 5.9947, + "step": 2512 + }, + { + "epoch": 0.8634626122063308, + "grad_norm": 0.6125161647796631, + "learning_rate": 2.8091713380626492e-05, + "loss": 5.9299, + "step": 2513 + }, + { + "epoch": 0.8638062105398788, + "grad_norm": 0.6197602152824402, + "learning_rate": 2.7953760786366493e-05, + "loss": 6.0298, + "step": 2514 + }, + { + "epoch": 0.864149808873427, + "grad_norm": 0.6317358016967773, + "learning_rate": 2.7816127693030462e-05, + "loss": 6.0517, + "step": 2515 + }, + { + "epoch": 0.8644934072069751, + "grad_norm": 0.6413782835006714, + "learning_rate": 2.7678814298657734e-05, + "loss": 6.0724, + "step": 2516 + }, + { + "epoch": 0.8648370055405231, + "grad_norm": 0.5042860507965088, + "learning_rate": 2.7541820800827733e-05, + "loss": 6.0285, + "step": 2517 + }, + { + "epoch": 0.8651806038740713, + "grad_norm": 0.49634793400764465, + "learning_rate": 2.7405147396659557e-05, + "loss": 6.0163, + "step": 2518 + }, + { + "epoch": 0.8655242022076193, + "grad_norm": 0.4745235741138458, + "learning_rate": 2.7268794282811595e-05, + "loss": 6.0094, + "step": 2519 + }, + { + "epoch": 0.8658678005411674, + "grad_norm": 0.48123598098754883, + "learning_rate": 2.7132761655481537e-05, + "loss": 6.1046, + "step": 2520 + }, + { + "epoch": 0.8662113988747154, + "grad_norm": 0.7108414769172668, + "learning_rate": 2.699704971040587e-05, + "loss": 5.9763, + "step": 2521 + }, + { + "epoch": 0.8665549972082636, + "grad_norm": 0.538608968257904, + "learning_rate": 2.6861658642859693e-05, + "loss": 5.9979, + "step": 2522 + }, + { + "epoch": 0.8668985955418116, + "grad_norm": 0.49514704942703247, + "learning_rate": 2.6726588647656204e-05, + "loss": 5.9548, + "step": 2523 + }, + { + "epoch": 0.8672421938753597, + "grad_norm": 0.589779794216156, + "learning_rate": 2.659183991914696e-05, + "loss": 6.0988, + "step": 2524 + }, + { + "epoch": 0.8675857922089077, + "grad_norm": 0.5593236684799194, + "learning_rate": 2.6457412651220896e-05, + "loss": 6.1345, + "step": 2525 + }, + { + "epoch": 0.8679293905424559, + "grad_norm": 0.49659085273742676, + "learning_rate": 2.6323307037304624e-05, + "loss": 6.073, + "step": 2526 + }, + { + "epoch": 0.8682729888760039, + "grad_norm": 0.6141493320465088, + "learning_rate": 2.6189523270361865e-05, + "loss": 6.0561, + "step": 2527 + }, + { + "epoch": 0.868616587209552, + "grad_norm": 0.4329065978527069, + "learning_rate": 2.605606154289322e-05, + "loss": 5.9661, + "step": 2528 + }, + { + "epoch": 0.8689601855431001, + "grad_norm": 0.4631238281726837, + "learning_rate": 2.5922922046935914e-05, + "loss": 5.9962, + "step": 2529 + }, + { + "epoch": 0.8693037838766482, + "grad_norm": 0.5284918546676636, + "learning_rate": 2.5790104974063505e-05, + "loss": 6.0944, + "step": 2530 + }, + { + "epoch": 0.8696473822101963, + "grad_norm": 0.5931186676025391, + "learning_rate": 2.5657610515385647e-05, + "loss": 6.0207, + "step": 2531 + }, + { + "epoch": 0.8699909805437444, + "grad_norm": 0.5561034679412842, + "learning_rate": 2.552543886154779e-05, + "loss": 6.044, + "step": 2532 + }, + { + "epoch": 0.8703345788772925, + "grad_norm": 0.5106163620948792, + "learning_rate": 2.539359020273094e-05, + "loss": 5.9788, + "step": 2533 + }, + { + "epoch": 0.8706781772108405, + "grad_norm": 0.5758001804351807, + "learning_rate": 2.5262064728651197e-05, + "loss": 6.0796, + "step": 2534 + }, + { + "epoch": 0.8710217755443886, + "grad_norm": 0.4941878020763397, + "learning_rate": 2.5130862628559765e-05, + "loss": 6.0645, + "step": 2535 + }, + { + "epoch": 0.8713653738779367, + "grad_norm": 0.6312124133110046, + "learning_rate": 2.499998409124252e-05, + "loss": 5.9077, + "step": 2536 + }, + { + "epoch": 0.8717089722114848, + "grad_norm": 0.6568105816841125, + "learning_rate": 2.4869429305019785e-05, + "loss": 6.0868, + "step": 2537 + }, + { + "epoch": 0.8720525705450328, + "grad_norm": 0.5947930812835693, + "learning_rate": 2.4739198457745915e-05, + "loss": 6.1068, + "step": 2538 + }, + { + "epoch": 0.872396168878581, + "grad_norm": 0.589493989944458, + "learning_rate": 2.4609291736809397e-05, + "loss": 5.9835, + "step": 2539 + }, + { + "epoch": 0.872739767212129, + "grad_norm": 0.6117597818374634, + "learning_rate": 2.447970932913207e-05, + "loss": 6.1012, + "step": 2540 + }, + { + "epoch": 0.8730833655456771, + "grad_norm": 0.6265408396720886, + "learning_rate": 2.435045142116929e-05, + "loss": 6.0829, + "step": 2541 + }, + { + "epoch": 0.8734269638792251, + "grad_norm": 0.5803533792495728, + "learning_rate": 2.4221518198909415e-05, + "loss": 6.0835, + "step": 2542 + }, + { + "epoch": 0.8737705622127733, + "grad_norm": 0.5373610854148865, + "learning_rate": 2.409290984787371e-05, + "loss": 6.0519, + "step": 2543 + }, + { + "epoch": 0.8741141605463213, + "grad_norm": 0.6814734935760498, + "learning_rate": 2.3964626553115766e-05, + "loss": 5.9762, + "step": 2544 + }, + { + "epoch": 0.8744577588798694, + "grad_norm": 0.7417898774147034, + "learning_rate": 2.3836668499221752e-05, + "loss": 6.1121, + "step": 2545 + }, + { + "epoch": 0.8748013572134176, + "grad_norm": 0.5553413033485413, + "learning_rate": 2.370903587030965e-05, + "loss": 6.1261, + "step": 2546 + }, + { + "epoch": 0.8751449555469656, + "grad_norm": 0.8009063005447388, + "learning_rate": 2.3581728850029182e-05, + "loss": 6.2087, + "step": 2547 + }, + { + "epoch": 0.8754885538805137, + "grad_norm": 0.7052872180938721, + "learning_rate": 2.345474762156169e-05, + "loss": 6.2921, + "step": 2548 + }, + { + "epoch": 0.8758321522140617, + "grad_norm": 0.9335289597511292, + "learning_rate": 2.3328092367619596e-05, + "loss": 6.1382, + "step": 2549 + }, + { + "epoch": 0.8761757505476099, + "grad_norm": 1.0865561962127686, + "learning_rate": 2.3201763270446457e-05, + "loss": 6.0753, + "step": 2550 + }, + { + "epoch": 0.8765193488811579, + "grad_norm": 0.7006704807281494, + "learning_rate": 2.3075760511816257e-05, + "loss": 5.7694, + "step": 2551 + }, + { + "epoch": 0.876862947214706, + "grad_norm": 0.6684394478797913, + "learning_rate": 2.2950084273033632e-05, + "loss": 5.835, + "step": 2552 + }, + { + "epoch": 0.8772065455482541, + "grad_norm": 0.6840022206306458, + "learning_rate": 2.2824734734933322e-05, + "loss": 5.8673, + "step": 2553 + }, + { + "epoch": 0.8775501438818022, + "grad_norm": 0.6259867548942566, + "learning_rate": 2.2699712077880046e-05, + "loss": 6.0112, + "step": 2554 + }, + { + "epoch": 0.8778937422153502, + "grad_norm": 0.5849865674972534, + "learning_rate": 2.2575016481767936e-05, + "loss": 5.988, + "step": 2555 + }, + { + "epoch": 0.8782373405488983, + "grad_norm": 0.5193759799003601, + "learning_rate": 2.2450648126020907e-05, + "loss": 5.9885, + "step": 2556 + }, + { + "epoch": 0.8785809388824464, + "grad_norm": 0.4761309325695038, + "learning_rate": 2.2326607189591676e-05, + "loss": 5.8873, + "step": 2557 + }, + { + "epoch": 0.8789245372159945, + "grad_norm": 0.5957377552986145, + "learning_rate": 2.220289385096194e-05, + "loss": 5.8765, + "step": 2558 + }, + { + "epoch": 0.8792681355495426, + "grad_norm": 0.5031549334526062, + "learning_rate": 2.2079508288142092e-05, + "loss": 6.0119, + "step": 2559 + }, + { + "epoch": 0.8796117338830907, + "grad_norm": 0.4540199041366577, + "learning_rate": 2.195645067867086e-05, + "loss": 5.9878, + "step": 2560 + }, + { + "epoch": 0.8799553322166388, + "grad_norm": 0.48119738698005676, + "learning_rate": 2.183372119961499e-05, + "loss": 5.9127, + "step": 2561 + }, + { + "epoch": 0.8802989305501868, + "grad_norm": 0.5656386613845825, + "learning_rate": 2.171132002756915e-05, + "loss": 6.0097, + "step": 2562 + }, + { + "epoch": 0.880642528883735, + "grad_norm": 0.49384355545043945, + "learning_rate": 2.1589247338655666e-05, + "loss": 5.9723, + "step": 2563 + }, + { + "epoch": 0.880986127217283, + "grad_norm": 0.5116087794303894, + "learning_rate": 2.1467503308524096e-05, + "loss": 6.0294, + "step": 2564 + }, + { + "epoch": 0.8813297255508311, + "grad_norm": 0.511194109916687, + "learning_rate": 2.1346088112351252e-05, + "loss": 5.9535, + "step": 2565 + }, + { + "epoch": 0.8816733238843791, + "grad_norm": 0.4660123586654663, + "learning_rate": 2.122500192484056e-05, + "loss": 6.0129, + "step": 2566 + }, + { + "epoch": 0.8820169222179273, + "grad_norm": 0.507530689239502, + "learning_rate": 2.1104244920222226e-05, + "loss": 5.9942, + "step": 2567 + }, + { + "epoch": 0.8823605205514753, + "grad_norm": 0.4880560338497162, + "learning_rate": 2.0983817272252737e-05, + "loss": 5.9319, + "step": 2568 + }, + { + "epoch": 0.8827041188850234, + "grad_norm": 0.47986316680908203, + "learning_rate": 2.08637191542147e-05, + "loss": 6.0592, + "step": 2569 + }, + { + "epoch": 0.8830477172185714, + "grad_norm": 0.4974256157875061, + "learning_rate": 2.074395073891644e-05, + "loss": 6.1117, + "step": 2570 + }, + { + "epoch": 0.8833913155521196, + "grad_norm": 0.41185539960861206, + "learning_rate": 2.06245121986921e-05, + "loss": 6.0907, + "step": 2571 + }, + { + "epoch": 0.8837349138856676, + "grad_norm": 0.47288328409194946, + "learning_rate": 2.0505403705400883e-05, + "loss": 6.0898, + "step": 2572 + }, + { + "epoch": 0.8840785122192157, + "grad_norm": 0.4536161422729492, + "learning_rate": 2.0386625430427436e-05, + "loss": 6.0654, + "step": 2573 + }, + { + "epoch": 0.8844221105527639, + "grad_norm": 0.4487610459327698, + "learning_rate": 2.026817754468091e-05, + "loss": 6.0253, + "step": 2574 + }, + { + "epoch": 0.8847657088863119, + "grad_norm": 0.4847327768802643, + "learning_rate": 2.015006021859528e-05, + "loss": 6.1225, + "step": 2575 + }, + { + "epoch": 0.88510930721986, + "grad_norm": 0.5354999899864197, + "learning_rate": 2.0032273622128784e-05, + "loss": 6.0677, + "step": 2576 + }, + { + "epoch": 0.885452905553408, + "grad_norm": 0.6489529013633728, + "learning_rate": 1.9914817924763878e-05, + "loss": 6.0719, + "step": 2577 + }, + { + "epoch": 0.8857965038869562, + "grad_norm": 0.4547826051712036, + "learning_rate": 1.9797693295506735e-05, + "loss": 6.0456, + "step": 2578 + }, + { + "epoch": 0.8861401022205042, + "grad_norm": 0.48801037669181824, + "learning_rate": 1.9680899902887266e-05, + "loss": 6.0169, + "step": 2579 + }, + { + "epoch": 0.8864837005540523, + "grad_norm": 0.5203239917755127, + "learning_rate": 1.9564437914958765e-05, + "loss": 6.0425, + "step": 2580 + }, + { + "epoch": 0.8868272988876004, + "grad_norm": 0.501476526260376, + "learning_rate": 1.94483074992976e-05, + "loss": 6.0669, + "step": 2581 + }, + { + "epoch": 0.8871708972211485, + "grad_norm": 0.49509742856025696, + "learning_rate": 1.9332508823003192e-05, + "loss": 5.947, + "step": 2582 + }, + { + "epoch": 0.8875144955546965, + "grad_norm": 0.5070309042930603, + "learning_rate": 1.9217042052697393e-05, + "loss": 6.1098, + "step": 2583 + }, + { + "epoch": 0.8878580938882447, + "grad_norm": 0.47886112332344055, + "learning_rate": 1.910190735452466e-05, + "loss": 6.0617, + "step": 2584 + }, + { + "epoch": 0.8882016922217927, + "grad_norm": 0.6764759421348572, + "learning_rate": 1.8987104894151592e-05, + "loss": 5.942, + "step": 2585 + }, + { + "epoch": 0.8885452905553408, + "grad_norm": 0.7424550652503967, + "learning_rate": 1.8872634836766768e-05, + "loss": 6.0808, + "step": 2586 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.49124762415885925, + "learning_rate": 1.8758497347080266e-05, + "loss": 5.9537, + "step": 2587 + }, + { + "epoch": 0.889232487222437, + "grad_norm": 0.5248420834541321, + "learning_rate": 1.864469258932397e-05, + "loss": 6.011, + "step": 2588 + }, + { + "epoch": 0.8895760855559851, + "grad_norm": 0.5532990097999573, + "learning_rate": 1.853122072725072e-05, + "loss": 6.0058, + "step": 2589 + }, + { + "epoch": 0.8899196838895331, + "grad_norm": 0.5646762847900391, + "learning_rate": 1.8418081924134494e-05, + "loss": 6.1114, + "step": 2590 + }, + { + "epoch": 0.8902632822230813, + "grad_norm": 0.5970567464828491, + "learning_rate": 1.8305276342770015e-05, + "loss": 6.0876, + "step": 2591 + }, + { + "epoch": 0.8906068805566293, + "grad_norm": 0.6650955080986023, + "learning_rate": 1.8192804145472502e-05, + "loss": 6.0022, + "step": 2592 + }, + { + "epoch": 0.8909504788901774, + "grad_norm": 0.6694637537002563, + "learning_rate": 1.8080665494077468e-05, + "loss": 6.1372, + "step": 2593 + }, + { + "epoch": 0.8912940772237254, + "grad_norm": 0.6882001757621765, + "learning_rate": 1.7968860549940512e-05, + "loss": 6.0342, + "step": 2594 + }, + { + "epoch": 0.8916376755572736, + "grad_norm": 0.6639662384986877, + "learning_rate": 1.7857389473937058e-05, + "loss": 6.13, + "step": 2595 + }, + { + "epoch": 0.8919812738908216, + "grad_norm": 0.7884045243263245, + "learning_rate": 1.7746252426462134e-05, + "loss": 6.2426, + "step": 2596 + }, + { + "epoch": 0.8923248722243697, + "grad_norm": 0.8145104050636292, + "learning_rate": 1.7635449567430185e-05, + "loss": 6.1495, + "step": 2597 + }, + { + "epoch": 0.8926684705579178, + "grad_norm": 0.8474352955818176, + "learning_rate": 1.7524981056274647e-05, + "loss": 6.2624, + "step": 2598 + }, + { + "epoch": 0.8930120688914659, + "grad_norm": 0.9839531183242798, + "learning_rate": 1.7414847051948012e-05, + "loss": 6.2734, + "step": 2599 + }, + { + "epoch": 0.8933556672250139, + "grad_norm": 1.2112486362457275, + "learning_rate": 1.730504771292138e-05, + "loss": 6.0494, + "step": 2600 + }, + { + "epoch": 0.893699265558562, + "grad_norm": 0.7058858275413513, + "learning_rate": 1.719558319718434e-05, + "loss": 5.9493, + "step": 2601 + }, + { + "epoch": 0.8940428638921101, + "grad_norm": 0.7136659622192383, + "learning_rate": 1.7086453662244678e-05, + "loss": 5.9466, + "step": 2602 + }, + { + "epoch": 0.8943864622256582, + "grad_norm": 0.6175331473350525, + "learning_rate": 1.697765926512823e-05, + "loss": 6.0478, + "step": 2603 + }, + { + "epoch": 0.8947300605592063, + "grad_norm": 0.7400083541870117, + "learning_rate": 1.6869200162378474e-05, + "loss": 5.9836, + "step": 2604 + }, + { + "epoch": 0.8950736588927544, + "grad_norm": 0.6715840697288513, + "learning_rate": 1.6761076510056623e-05, + "loss": 5.8835, + "step": 2605 + }, + { + "epoch": 0.8954172572263025, + "grad_norm": 0.6418355703353882, + "learning_rate": 1.665328846374106e-05, + "loss": 6.082, + "step": 2606 + }, + { + "epoch": 0.8957608555598505, + "grad_norm": 0.6705430150032043, + "learning_rate": 1.6545836178527313e-05, + "loss": 5.9216, + "step": 2607 + }, + { + "epoch": 0.8961044538933987, + "grad_norm": 0.5584115386009216, + "learning_rate": 1.6438719809027806e-05, + "loss": 5.9343, + "step": 2608 + }, + { + "epoch": 0.8964480522269467, + "grad_norm": 0.6551584005355835, + "learning_rate": 1.6331939509371647e-05, + "loss": 5.989, + "step": 2609 + }, + { + "epoch": 0.8967916505604948, + "grad_norm": 0.5179411172866821, + "learning_rate": 1.6225495433204256e-05, + "loss": 5.9579, + "step": 2610 + }, + { + "epoch": 0.8971352488940428, + "grad_norm": 0.5057214498519897, + "learning_rate": 1.6119387733687374e-05, + "loss": 6.0373, + "step": 2611 + }, + { + "epoch": 0.897478847227591, + "grad_norm": 0.4302932918071747, + "learning_rate": 1.60136165634987e-05, + "loss": 5.9966, + "step": 2612 + }, + { + "epoch": 0.897822445561139, + "grad_norm": 0.5576332807540894, + "learning_rate": 1.590818207483169e-05, + "loss": 5.9207, + "step": 2613 + }, + { + "epoch": 0.8981660438946871, + "grad_norm": 0.5023398995399475, + "learning_rate": 1.58030844193954e-05, + "loss": 5.8995, + "step": 2614 + }, + { + "epoch": 0.8985096422282352, + "grad_norm": 0.555558443069458, + "learning_rate": 1.5698323748414122e-05, + "loss": 6.0222, + "step": 2615 + }, + { + "epoch": 0.8988532405617833, + "grad_norm": 0.46205613017082214, + "learning_rate": 1.5593900212627326e-05, + "loss": 6.1039, + "step": 2616 + }, + { + "epoch": 0.8991968388953313, + "grad_norm": 0.5434067249298096, + "learning_rate": 1.548981396228938e-05, + "loss": 5.9208, + "step": 2617 + }, + { + "epoch": 0.8995404372288794, + "grad_norm": 0.5215374231338501, + "learning_rate": 1.5386065147169392e-05, + "loss": 5.9905, + "step": 2618 + }, + { + "epoch": 0.8998840355624276, + "grad_norm": 0.5159684419631958, + "learning_rate": 1.528265391655076e-05, + "loss": 5.9615, + "step": 2619 + }, + { + "epoch": 0.9002276338959756, + "grad_norm": 0.5324832797050476, + "learning_rate": 1.5179580419231371e-05, + "loss": 5.9843, + "step": 2620 + }, + { + "epoch": 0.9005712322295237, + "grad_norm": 0.49988093972206116, + "learning_rate": 1.5076844803522921e-05, + "loss": 6.0481, + "step": 2621 + }, + { + "epoch": 0.9009148305630718, + "grad_norm": 0.4379015266895294, + "learning_rate": 1.497444721725108e-05, + "loss": 6.0757, + "step": 2622 + }, + { + "epoch": 0.9012584288966199, + "grad_norm": 0.45072677731513977, + "learning_rate": 1.4872387807755072e-05, + "loss": 6.0539, + "step": 2623 + }, + { + "epoch": 0.9016020272301679, + "grad_norm": 0.5357469916343689, + "learning_rate": 1.4770666721887622e-05, + "loss": 6.1215, + "step": 2624 + }, + { + "epoch": 0.901945625563716, + "grad_norm": 0.48705291748046875, + "learning_rate": 1.4669284106014369e-05, + "loss": 6.0271, + "step": 2625 + }, + { + "epoch": 0.9022892238972641, + "grad_norm": 0.4232194721698761, + "learning_rate": 1.4568240106014291e-05, + "loss": 5.9916, + "step": 2626 + }, + { + "epoch": 0.9026328222308122, + "grad_norm": 0.49429526925086975, + "learning_rate": 1.4467534867278864e-05, + "loss": 5.9512, + "step": 2627 + }, + { + "epoch": 0.9029764205643602, + "grad_norm": 0.47070807218551636, + "learning_rate": 1.436716853471226e-05, + "loss": 6.0667, + "step": 2628 + }, + { + "epoch": 0.9033200188979084, + "grad_norm": 0.5034219026565552, + "learning_rate": 1.4267141252730958e-05, + "loss": 6.0875, + "step": 2629 + }, + { + "epoch": 0.9036636172314564, + "grad_norm": 0.49576908349990845, + "learning_rate": 1.4167453165263495e-05, + "loss": 5.9769, + "step": 2630 + }, + { + "epoch": 0.9040072155650045, + "grad_norm": 0.6180304288864136, + "learning_rate": 1.4068104415750572e-05, + "loss": 6.1032, + "step": 2631 + }, + { + "epoch": 0.9043508138985525, + "grad_norm": 0.49888142943382263, + "learning_rate": 1.3969095147144339e-05, + "loss": 6.1171, + "step": 2632 + }, + { + "epoch": 0.9046944122321007, + "grad_norm": 0.5140848755836487, + "learning_rate": 1.3870425501908674e-05, + "loss": 6.1556, + "step": 2633 + }, + { + "epoch": 0.9050380105656488, + "grad_norm": 0.5649397969245911, + "learning_rate": 1.3772095622018698e-05, + "loss": 6.0178, + "step": 2634 + }, + { + "epoch": 0.9053816088991968, + "grad_norm": 0.5044685006141663, + "learning_rate": 1.3674105648960683e-05, + "loss": 6.1763, + "step": 2635 + }, + { + "epoch": 0.905725207232745, + "grad_norm": 0.59349524974823, + "learning_rate": 1.3576455723731645e-05, + "loss": 6.0187, + "step": 2636 + }, + { + "epoch": 0.906068805566293, + "grad_norm": 0.6226266622543335, + "learning_rate": 1.3479145986839636e-05, + "loss": 6.0291, + "step": 2637 + }, + { + "epoch": 0.9064124038998411, + "grad_norm": 0.6408131718635559, + "learning_rate": 1.3382176578302846e-05, + "loss": 6.0087, + "step": 2638 + }, + { + "epoch": 0.9067560022333891, + "grad_norm": 0.5794918537139893, + "learning_rate": 1.3285547637650052e-05, + "loss": 6.0192, + "step": 2639 + }, + { + "epoch": 0.9070996005669373, + "grad_norm": 0.5764051675796509, + "learning_rate": 1.3189259303919954e-05, + "loss": 6.1282, + "step": 2640 + }, + { + "epoch": 0.9074431989004853, + "grad_norm": 0.5949744582176208, + "learning_rate": 1.3093311715661304e-05, + "loss": 6.0212, + "step": 2641 + }, + { + "epoch": 0.9077867972340334, + "grad_norm": 0.5786007046699524, + "learning_rate": 1.2997705010932393e-05, + "loss": 6.059, + "step": 2642 + }, + { + "epoch": 0.9081303955675815, + "grad_norm": 0.6125741600990295, + "learning_rate": 1.2902439327301146e-05, + "loss": 6.076, + "step": 2643 + }, + { + "epoch": 0.9084739939011296, + "grad_norm": 0.5794528722763062, + "learning_rate": 1.2807514801844723e-05, + "loss": 6.0601, + "step": 2644 + }, + { + "epoch": 0.9088175922346776, + "grad_norm": 0.6618969440460205, + "learning_rate": 1.2712931571149444e-05, + "loss": 6.0093, + "step": 2645 + }, + { + "epoch": 0.9091611905682258, + "grad_norm": 0.7055357694625854, + "learning_rate": 1.261868977131056e-05, + "loss": 6.2117, + "step": 2646 + }, + { + "epoch": 0.9095047889017738, + "grad_norm": 0.6832464933395386, + "learning_rate": 1.252478953793193e-05, + "loss": 6.1221, + "step": 2647 + }, + { + "epoch": 0.9098483872353219, + "grad_norm": 0.8896463513374329, + "learning_rate": 1.2431231006126003e-05, + "loss": 6.1605, + "step": 2648 + }, + { + "epoch": 0.91019198556887, + "grad_norm": 0.8412816524505615, + "learning_rate": 1.2338014310513596e-05, + "loss": 6.1005, + "step": 2649 + }, + { + "epoch": 0.9105355839024181, + "grad_norm": 1.1724843978881836, + "learning_rate": 1.2245139585223636e-05, + "loss": 6.2482, + "step": 2650 + }, + { + "epoch": 0.9108791822359662, + "grad_norm": 0.6163187026977539, + "learning_rate": 1.2152606963892864e-05, + "loss": 5.7662, + "step": 2651 + }, + { + "epoch": 0.9112227805695142, + "grad_norm": 0.5164371132850647, + "learning_rate": 1.2060416579666028e-05, + "loss": 5.9042, + "step": 2652 + }, + { + "epoch": 0.9115663789030624, + "grad_norm": 0.5768353939056396, + "learning_rate": 1.1968568565195182e-05, + "loss": 5.9316, + "step": 2653 + }, + { + "epoch": 0.9119099772366104, + "grad_norm": 0.5464223623275757, + "learning_rate": 1.1877063052639913e-05, + "loss": 5.8956, + "step": 2654 + }, + { + "epoch": 0.9122535755701585, + "grad_norm": 0.5711444616317749, + "learning_rate": 1.1785900173666903e-05, + "loss": 6.0076, + "step": 2655 + }, + { + "epoch": 0.9125971739037065, + "grad_norm": 0.4955626130104065, + "learning_rate": 1.169508005944983e-05, + "loss": 6.0148, + "step": 2656 + }, + { + "epoch": 0.9129407722372547, + "grad_norm": 0.40896841883659363, + "learning_rate": 1.1604602840669164e-05, + "loss": 6.0712, + "step": 2657 + }, + { + "epoch": 0.9132843705708027, + "grad_norm": 0.5791770219802856, + "learning_rate": 1.1514468647512072e-05, + "loss": 5.7973, + "step": 2658 + }, + { + "epoch": 0.9136279689043508, + "grad_norm": 0.5330086350440979, + "learning_rate": 1.1424677609671974e-05, + "loss": 5.939, + "step": 2659 + }, + { + "epoch": 0.9139715672378989, + "grad_norm": 0.44915223121643066, + "learning_rate": 1.1335229856348689e-05, + "loss": 5.9109, + "step": 2660 + }, + { + "epoch": 0.914315165571447, + "grad_norm": 0.49018964171409607, + "learning_rate": 1.124612551624804e-05, + "loss": 5.9027, + "step": 2661 + }, + { + "epoch": 0.914658763904995, + "grad_norm": 0.4549656808376312, + "learning_rate": 1.115736471758158e-05, + "loss": 5.9732, + "step": 2662 + }, + { + "epoch": 0.9150023622385431, + "grad_norm": 0.471829891204834, + "learning_rate": 1.1068947588066813e-05, + "loss": 6.0014, + "step": 2663 + }, + { + "epoch": 0.9153459605720913, + "grad_norm": 0.49442002177238464, + "learning_rate": 1.09808742549265e-05, + "loss": 5.9958, + "step": 2664 + }, + { + "epoch": 0.9156895589056393, + "grad_norm": 0.4691806733608246, + "learning_rate": 1.0893144844888791e-05, + "loss": 6.0186, + "step": 2665 + }, + { + "epoch": 0.9160331572391874, + "grad_norm": 0.49828535318374634, + "learning_rate": 1.0805759484186994e-05, + "loss": 6.0637, + "step": 2666 + }, + { + "epoch": 0.9163767555727355, + "grad_norm": 0.48614802956581116, + "learning_rate": 1.0718718298559389e-05, + "loss": 6.1132, + "step": 2667 + }, + { + "epoch": 0.9167203539062836, + "grad_norm": 0.4641305208206177, + "learning_rate": 1.06320214132489e-05, + "loss": 6.0557, + "step": 2668 + }, + { + "epoch": 0.9170639522398316, + "grad_norm": 0.4682670831680298, + "learning_rate": 1.054566895300324e-05, + "loss": 5.9688, + "step": 2669 + }, + { + "epoch": 0.9174075505733797, + "grad_norm": 0.4661727547645569, + "learning_rate": 1.0459661042074326e-05, + "loss": 6.13, + "step": 2670 + }, + { + "epoch": 0.9177511489069278, + "grad_norm": 0.4203856885433197, + "learning_rate": 1.0373997804218411e-05, + "loss": 6.0238, + "step": 2671 + }, + { + "epoch": 0.9180947472404759, + "grad_norm": 0.4406087100505829, + "learning_rate": 1.0288679362695786e-05, + "loss": 6.0538, + "step": 2672 + }, + { + "epoch": 0.9184383455740239, + "grad_norm": 0.5143980383872986, + "learning_rate": 1.0203705840270665e-05, + "loss": 6.1826, + "step": 2673 + }, + { + "epoch": 0.9187819439075721, + "grad_norm": 0.44605106115341187, + "learning_rate": 1.0119077359210832e-05, + "loss": 5.9918, + "step": 2674 + }, + { + "epoch": 0.9191255422411201, + "grad_norm": 0.43419891595840454, + "learning_rate": 1.0034794041287709e-05, + "loss": 6.0827, + "step": 2675 + }, + { + "epoch": 0.9194691405746682, + "grad_norm": 0.4553866982460022, + "learning_rate": 9.950856007776011e-06, + "loss": 6.0581, + "step": 2676 + }, + { + "epoch": 0.9198127389082162, + "grad_norm": 0.46902498602867126, + "learning_rate": 9.867263379453677e-06, + "loss": 6.0497, + "step": 2677 + }, + { + "epoch": 0.9201563372417644, + "grad_norm": 0.4728301167488098, + "learning_rate": 9.78401627660161e-06, + "loss": 6.0556, + "step": 2678 + }, + { + "epoch": 0.9204999355753125, + "grad_norm": 0.4771122634410858, + "learning_rate": 9.701114819003486e-06, + "loss": 5.9773, + "step": 2679 + }, + { + "epoch": 0.9208435339088605, + "grad_norm": 0.470584511756897, + "learning_rate": 9.618559125945725e-06, + "loss": 6.0586, + "step": 2680 + }, + { + "epoch": 0.9211871322424087, + "grad_norm": 0.4982456862926483, + "learning_rate": 9.536349316217163e-06, + "loss": 6.0214, + "step": 2681 + }, + { + "epoch": 0.9215307305759567, + "grad_norm": 0.47169244289398193, + "learning_rate": 9.454485508109012e-06, + "loss": 6.1075, + "step": 2682 + }, + { + "epoch": 0.9218743289095048, + "grad_norm": 0.4510200321674347, + "learning_rate": 9.372967819414547e-06, + "loss": 6.1195, + "step": 2683 + }, + { + "epoch": 0.9222179272430528, + "grad_norm": 0.4280446171760559, + "learning_rate": 9.291796367429107e-06, + "loss": 5.9874, + "step": 2684 + }, + { + "epoch": 0.922561525576601, + "grad_norm": 0.5145493149757385, + "learning_rate": 9.21097126894968e-06, + "loss": 6.0329, + "step": 2685 + }, + { + "epoch": 0.922905123910149, + "grad_norm": 0.5257851481437683, + "learning_rate": 9.130492640275129e-06, + "loss": 6.0867, + "step": 2686 + }, + { + "epoch": 0.9232487222436971, + "grad_norm": 0.5830054879188538, + "learning_rate": 9.050360597205515e-06, + "loss": 5.9751, + "step": 2687 + }, + { + "epoch": 0.9235923205772452, + "grad_norm": 0.652542769908905, + "learning_rate": 8.970575255042385e-06, + "loss": 5.9654, + "step": 2688 + }, + { + "epoch": 0.9239359189107933, + "grad_norm": 0.5067824125289917, + "learning_rate": 8.891136728588323e-06, + "loss": 6.0476, + "step": 2689 + }, + { + "epoch": 0.9242795172443413, + "grad_norm": 0.5378184914588928, + "learning_rate": 8.812045132147007e-06, + "loss": 5.9726, + "step": 2690 + }, + { + "epoch": 0.9246231155778895, + "grad_norm": 0.5374602675437927, + "learning_rate": 8.733300579522707e-06, + "loss": 6.141, + "step": 2691 + }, + { + "epoch": 0.9249667139114375, + "grad_norm": 0.6461105942726135, + "learning_rate": 8.654903184020568e-06, + "loss": 5.9509, + "step": 2692 + }, + { + "epoch": 0.9253103122449856, + "grad_norm": 0.6112871766090393, + "learning_rate": 8.576853058446077e-06, + "loss": 5.9735, + "step": 2693 + }, + { + "epoch": 0.9256539105785337, + "grad_norm": 0.5804199576377869, + "learning_rate": 8.49915031510498e-06, + "loss": 6.1318, + "step": 2694 + }, + { + "epoch": 0.9259975089120818, + "grad_norm": 0.7698377966880798, + "learning_rate": 8.4217950658034e-06, + "loss": 6.1104, + "step": 2695 + }, + { + "epoch": 0.9263411072456299, + "grad_norm": 0.7038789391517639, + "learning_rate": 8.344787421847217e-06, + "loss": 6.1166, + "step": 2696 + }, + { + "epoch": 0.9266847055791779, + "grad_norm": 0.7374392151832581, + "learning_rate": 8.268127494042265e-06, + "loss": 6.1314, + "step": 2697 + }, + { + "epoch": 0.9270283039127261, + "grad_norm": 0.7966888546943665, + "learning_rate": 8.191815392694035e-06, + "loss": 6.1515, + "step": 2698 + }, + { + "epoch": 0.9273719022462741, + "grad_norm": 0.9581894278526306, + "learning_rate": 8.115851227607552e-06, + "loss": 6.2336, + "step": 2699 + }, + { + "epoch": 0.9277155005798222, + "grad_norm": 1.1803251504898071, + "learning_rate": 8.040235108087075e-06, + "loss": 6.0879, + "step": 2700 + }, + { + "epoch": 0.9280590989133702, + "grad_norm": 0.4471254050731659, + "learning_rate": 7.964967142936263e-06, + "loss": 6.0322, + "step": 2701 + }, + { + "epoch": 0.9284026972469184, + "grad_norm": 0.6071027517318726, + "learning_rate": 7.890047440457682e-06, + "loss": 5.865, + "step": 2702 + }, + { + "epoch": 0.9287462955804664, + "grad_norm": 0.5928388833999634, + "learning_rate": 7.81547610845279e-06, + "loss": 6.0027, + "step": 2703 + }, + { + "epoch": 0.9290898939140145, + "grad_norm": 0.5783106088638306, + "learning_rate": 7.741253254221841e-06, + "loss": 5.8402, + "step": 2704 + }, + { + "epoch": 0.9294334922475626, + "grad_norm": 0.508571207523346, + "learning_rate": 7.6673789845636e-06, + "loss": 5.8952, + "step": 2705 + }, + { + "epoch": 0.9297770905811107, + "grad_norm": 0.585649311542511, + "learning_rate": 7.593853405775286e-06, + "loss": 5.9212, + "step": 2706 + }, + { + "epoch": 0.9301206889146587, + "grad_norm": 0.5307704210281372, + "learning_rate": 7.520676623652411e-06, + "loss": 5.8958, + "step": 2707 + }, + { + "epoch": 0.9304642872482068, + "grad_norm": 0.48460954427719116, + "learning_rate": 7.4478487434885554e-06, + "loss": 6.0424, + "step": 2708 + }, + { + "epoch": 0.930807885581755, + "grad_norm": 0.45228397846221924, + "learning_rate": 7.3753698700753105e-06, + "loss": 6.0324, + "step": 2709 + }, + { + "epoch": 0.931151483915303, + "grad_norm": 0.5303856730461121, + "learning_rate": 7.303240107702086e-06, + "loss": 6.0299, + "step": 2710 + }, + { + "epoch": 0.9314950822488511, + "grad_norm": 0.46378111839294434, + "learning_rate": 7.2314595601558346e-06, + "loss": 5.9614, + "step": 2711 + }, + { + "epoch": 0.9318386805823992, + "grad_norm": 0.5033019185066223, + "learning_rate": 7.160028330721297e-06, + "loss": 6.0099, + "step": 2712 + }, + { + "epoch": 0.9321822789159473, + "grad_norm": 0.5241475701332092, + "learning_rate": 7.088946522180284e-06, + "loss": 6.0135, + "step": 2713 + }, + { + "epoch": 0.9325258772494953, + "grad_norm": 0.44836217164993286, + "learning_rate": 7.018214236812009e-06, + "loss": 5.9512, + "step": 2714 + }, + { + "epoch": 0.9328694755830435, + "grad_norm": 0.40336859226226807, + "learning_rate": 6.947831576392727e-06, + "loss": 5.9761, + "step": 2715 + }, + { + "epoch": 0.9332130739165915, + "grad_norm": 0.38975638151168823, + "learning_rate": 6.877798642195565e-06, + "loss": 5.9835, + "step": 2716 + }, + { + "epoch": 0.9335566722501396, + "grad_norm": 0.5378215312957764, + "learning_rate": 6.808115534990445e-06, + "loss": 6.0024, + "step": 2717 + }, + { + "epoch": 0.9339002705836876, + "grad_norm": 0.4532417953014374, + "learning_rate": 6.7387823550440485e-06, + "loss": 6.0132, + "step": 2718 + }, + { + "epoch": 0.9342438689172358, + "grad_norm": 0.4508761763572693, + "learning_rate": 6.669799202119353e-06, + "loss": 6.1408, + "step": 2719 + }, + { + "epoch": 0.9345874672507838, + "grad_norm": 0.48313215374946594, + "learning_rate": 6.601166175475792e-06, + "loss": 5.9936, + "step": 2720 + }, + { + "epoch": 0.9349310655843319, + "grad_norm": 0.520293653011322, + "learning_rate": 6.532883373869009e-06, + "loss": 6.0829, + "step": 2721 + }, + { + "epoch": 0.93527466391788, + "grad_norm": 0.45849609375, + "learning_rate": 6.464950895550742e-06, + "loss": 5.9805, + "step": 2722 + }, + { + "epoch": 0.9356182622514281, + "grad_norm": 0.5732299089431763, + "learning_rate": 6.3973688382684965e-06, + "loss": 6.0153, + "step": 2723 + }, + { + "epoch": 0.9359618605849762, + "grad_norm": 0.4997122585773468, + "learning_rate": 6.330137299265737e-06, + "loss": 5.9576, + "step": 2724 + }, + { + "epoch": 0.9363054589185242, + "grad_norm": 0.4114673435688019, + "learning_rate": 6.263256375281523e-06, + "loss": 6.0066, + "step": 2725 + }, + { + "epoch": 0.9366490572520724, + "grad_norm": 0.4480920135974884, + "learning_rate": 6.196726162550292e-06, + "loss": 6.0656, + "step": 2726 + }, + { + "epoch": 0.9369926555856204, + "grad_norm": 0.4622270166873932, + "learning_rate": 6.130546756802053e-06, + "loss": 5.9519, + "step": 2727 + }, + { + "epoch": 0.9373362539191685, + "grad_norm": 0.44009390473365784, + "learning_rate": 6.064718253261852e-06, + "loss": 6.0834, + "step": 2728 + }, + { + "epoch": 0.9376798522527166, + "grad_norm": 0.5133486986160278, + "learning_rate": 5.999240746649953e-06, + "loss": 6.1036, + "step": 2729 + }, + { + "epoch": 0.9380234505862647, + "grad_norm": 0.4361947774887085, + "learning_rate": 5.93411433118149e-06, + "loss": 6.0289, + "step": 2730 + }, + { + "epoch": 0.9383670489198127, + "grad_norm": 0.503173291683197, + "learning_rate": 5.8693391005665005e-06, + "loss": 5.9717, + "step": 2731 + }, + { + "epoch": 0.9387106472533608, + "grad_norm": 0.5202142000198364, + "learning_rate": 5.804915148009571e-06, + "loss": 6.0946, + "step": 2732 + }, + { + "epoch": 0.9390542455869089, + "grad_norm": 0.45355695486068726, + "learning_rate": 5.740842566210019e-06, + "loss": 6.0567, + "step": 2733 + }, + { + "epoch": 0.939397843920457, + "grad_norm": 0.47078660130500793, + "learning_rate": 5.677121447361405e-06, + "loss": 6.0976, + "step": 2734 + }, + { + "epoch": 0.939741442254005, + "grad_norm": 0.57381671667099, + "learning_rate": 5.613751883151663e-06, + "loss": 6.1569, + "step": 2735 + }, + { + "epoch": 0.9400850405875532, + "grad_norm": 0.5258800983428955, + "learning_rate": 5.550733964762855e-06, + "loss": 6.0503, + "step": 2736 + }, + { + "epoch": 0.9404286389211012, + "grad_norm": 0.5274262428283691, + "learning_rate": 5.488067782871059e-06, + "loss": 5.9954, + "step": 2737 + }, + { + "epoch": 0.9407722372546493, + "grad_norm": 0.55703204870224, + "learning_rate": 5.425753427646257e-06, + "loss": 6.0501, + "step": 2738 + }, + { + "epoch": 0.9411158355881974, + "grad_norm": 0.6053146123886108, + "learning_rate": 5.3637909887521705e-06, + "loss": 6.0434, + "step": 2739 + }, + { + "epoch": 0.9414594339217455, + "grad_norm": 0.5558188557624817, + "learning_rate": 5.302180555346175e-06, + "loss": 5.9769, + "step": 2740 + }, + { + "epoch": 0.9418030322552936, + "grad_norm": 0.6423223614692688, + "learning_rate": 5.24092221607908e-06, + "loss": 6.0606, + "step": 2741 + }, + { + "epoch": 0.9421466305888416, + "grad_norm": 0.7581862211227417, + "learning_rate": 5.18001605909521e-06, + "loss": 6.0086, + "step": 2742 + }, + { + "epoch": 0.9424902289223898, + "grad_norm": 0.7525710463523865, + "learning_rate": 5.119462172031963e-06, + "loss": 6.0993, + "step": 2743 + }, + { + "epoch": 0.9428338272559378, + "grad_norm": 0.6466197371482849, + "learning_rate": 5.059260642020003e-06, + "loss": 6.1109, + "step": 2744 + }, + { + "epoch": 0.9431774255894859, + "grad_norm": 0.6755435466766357, + "learning_rate": 4.99941155568287e-06, + "loss": 6.1111, + "step": 2745 + }, + { + "epoch": 0.9435210239230339, + "grad_norm": 0.6645384430885315, + "learning_rate": 4.939914999137096e-06, + "loss": 6.0707, + "step": 2746 + }, + { + "epoch": 0.9438646222565821, + "grad_norm": 0.770700216293335, + "learning_rate": 4.8807710579918394e-06, + "loss": 6.1119, + "step": 2747 + }, + { + "epoch": 0.9442082205901301, + "grad_norm": 0.8000778555870056, + "learning_rate": 4.8219798173490255e-06, + "loss": 5.9846, + "step": 2748 + }, + { + "epoch": 0.9445518189236782, + "grad_norm": 0.9796259999275208, + "learning_rate": 4.763541361802875e-06, + "loss": 6.3486, + "step": 2749 + }, + { + "epoch": 0.9448954172572263, + "grad_norm": 1.1820958852767944, + "learning_rate": 4.705455775440237e-06, + "loss": 6.272, + "step": 2750 + }, + { + "epoch": 0.9452390155907744, + "grad_norm": 0.48769789934158325, + "learning_rate": 4.647723141840033e-06, + "loss": 5.9491, + "step": 2751 + }, + { + "epoch": 0.9455826139243224, + "grad_norm": 0.5634806156158447, + "learning_rate": 4.590343544073367e-06, + "loss": 6.0842, + "step": 2752 + }, + { + "epoch": 0.9459262122578705, + "grad_norm": 0.5970801711082458, + "learning_rate": 4.533317064703391e-06, + "loss": 5.836, + "step": 2753 + }, + { + "epoch": 0.9462698105914187, + "grad_norm": 0.4978366792201996, + "learning_rate": 4.476643785785162e-06, + "loss": 5.9155, + "step": 2754 + }, + { + "epoch": 0.9466134089249667, + "grad_norm": 0.4581436216831207, + "learning_rate": 4.420323788865476e-06, + "loss": 5.9773, + "step": 2755 + }, + { + "epoch": 0.9469570072585148, + "grad_norm": 0.630322277545929, + "learning_rate": 4.364357154982846e-06, + "loss": 5.8643, + "step": 2756 + }, + { + "epoch": 0.9473006055920629, + "grad_norm": 0.6597774028778076, + "learning_rate": 4.308743964667294e-06, + "loss": 6.0093, + "step": 2757 + }, + { + "epoch": 0.947644203925611, + "grad_norm": 0.5160709619522095, + "learning_rate": 4.2534842979402575e-06, + "loss": 5.8845, + "step": 2758 + }, + { + "epoch": 0.947987802259159, + "grad_norm": 0.47501340508461, + "learning_rate": 4.198578234314604e-06, + "loss": 5.9749, + "step": 2759 + }, + { + "epoch": 0.9483314005927072, + "grad_norm": 0.4054076075553894, + "learning_rate": 4.14402585279422e-06, + "loss": 6.036, + "step": 2760 + }, + { + "epoch": 0.9486749989262552, + "grad_norm": 0.487505704164505, + "learning_rate": 4.0898272318742324e-06, + "loss": 5.9956, + "step": 2761 + }, + { + "epoch": 0.9490185972598033, + "grad_norm": 0.4308890402317047, + "learning_rate": 4.035982449540676e-06, + "loss": 5.9623, + "step": 2762 + }, + { + "epoch": 0.9493621955933513, + "grad_norm": 0.44951295852661133, + "learning_rate": 3.982491583270492e-06, + "loss": 5.9991, + "step": 2763 + }, + { + "epoch": 0.9497057939268995, + "grad_norm": 0.45850709080696106, + "learning_rate": 3.9293547100313075e-06, + "loss": 6.0219, + "step": 2764 + }, + { + "epoch": 0.9500493922604475, + "grad_norm": 0.4028855860233307, + "learning_rate": 3.87657190628149e-06, + "loss": 6.0092, + "step": 2765 + }, + { + "epoch": 0.9503929905939956, + "grad_norm": 0.3813883662223816, + "learning_rate": 3.824143247969813e-06, + "loss": 6.1796, + "step": 2766 + }, + { + "epoch": 0.9507365889275436, + "grad_norm": 0.4750194847583771, + "learning_rate": 3.7720688105356005e-06, + "loss": 6.0574, + "step": 2767 + }, + { + "epoch": 0.9510801872610918, + "grad_norm": 0.47151845693588257, + "learning_rate": 3.7203486689083854e-06, + "loss": 6.0058, + "step": 2768 + }, + { + "epoch": 0.9514237855946399, + "grad_norm": 0.41912469267845154, + "learning_rate": 3.668982897507972e-06, + "loss": 5.9304, + "step": 2769 + }, + { + "epoch": 0.9517673839281879, + "grad_norm": 0.4504321217536926, + "learning_rate": 3.617971570244266e-06, + "loss": 5.9236, + "step": 2770 + }, + { + "epoch": 0.9521109822617361, + "grad_norm": 0.49030429124832153, + "learning_rate": 3.5673147605170817e-06, + "loss": 6.0394, + "step": 2771 + }, + { + "epoch": 0.9524545805952841, + "grad_norm": 0.4937245845794678, + "learning_rate": 3.5170125412162247e-06, + "loss": 5.9422, + "step": 2772 + }, + { + "epoch": 0.9527981789288322, + "grad_norm": 0.4641237258911133, + "learning_rate": 3.467064984721241e-06, + "loss": 6.0364, + "step": 2773 + }, + { + "epoch": 0.9531417772623803, + "grad_norm": 0.44841110706329346, + "learning_rate": 3.417472162901336e-06, + "loss": 5.9775, + "step": 2774 + }, + { + "epoch": 0.9534853755959284, + "grad_norm": 0.4379451870918274, + "learning_rate": 3.36823414711529e-06, + "loss": 5.992, + "step": 2775 + }, + { + "epoch": 0.9538289739294764, + "grad_norm": 0.46499103307724, + "learning_rate": 3.3193510082114297e-06, + "loss": 5.9937, + "step": 2776 + }, + { + "epoch": 0.9541725722630245, + "grad_norm": 0.49435579776763916, + "learning_rate": 3.2708228165273244e-06, + "loss": 6.0961, + "step": 2777 + }, + { + "epoch": 0.9545161705965726, + "grad_norm": 0.47890639305114746, + "learning_rate": 3.2226496418899244e-06, + "loss": 6.0472, + "step": 2778 + }, + { + "epoch": 0.9548597689301207, + "grad_norm": 0.5395154356956482, + "learning_rate": 3.1748315536153094e-06, + "loss": 5.9907, + "step": 2779 + }, + { + "epoch": 0.9552033672636687, + "grad_norm": 0.45388686656951904, + "learning_rate": 3.127368620508608e-06, + "loss": 5.9788, + "step": 2780 + }, + { + "epoch": 0.9555469655972169, + "grad_norm": 0.44846445322036743, + "learning_rate": 3.0802609108638858e-06, + "loss": 6.0075, + "step": 2781 + }, + { + "epoch": 0.9558905639307649, + "grad_norm": 0.5502305626869202, + "learning_rate": 3.0335084924642263e-06, + "loss": 6.0372, + "step": 2782 + }, + { + "epoch": 0.956234162264313, + "grad_norm": 0.4417821168899536, + "learning_rate": 2.987111432581291e-06, + "loss": 5.9533, + "step": 2783 + }, + { + "epoch": 0.9565777605978611, + "grad_norm": 0.5430399775505066, + "learning_rate": 2.9410697979755928e-06, + "loss": 6.0111, + "step": 2784 + }, + { + "epoch": 0.9569213589314092, + "grad_norm": 0.5501893162727356, + "learning_rate": 2.8953836548960834e-06, + "loss": 5.9851, + "step": 2785 + }, + { + "epoch": 0.9572649572649573, + "grad_norm": 0.6305771470069885, + "learning_rate": 2.850053069080344e-06, + "loss": 6.0965, + "step": 2786 + }, + { + "epoch": 0.9576085555985053, + "grad_norm": 0.6068539023399353, + "learning_rate": 2.805078105754172e-06, + "loss": 6.0559, + "step": 2787 + }, + { + "epoch": 0.9579521539320535, + "grad_norm": 0.5408775806427002, + "learning_rate": 2.760458829631801e-06, + "loss": 5.9709, + "step": 2788 + }, + { + "epoch": 0.9582957522656015, + "grad_norm": 0.5502400398254395, + "learning_rate": 2.716195304915653e-06, + "loss": 6.0905, + "step": 2789 + }, + { + "epoch": 0.9586393505991496, + "grad_norm": 0.534099817276001, + "learning_rate": 2.672287595296169e-06, + "loss": 6.0971, + "step": 2790 + }, + { + "epoch": 0.9589829489326976, + "grad_norm": 0.5350020527839661, + "learning_rate": 2.6287357639519504e-06, + "loss": 6.1302, + "step": 2791 + }, + { + "epoch": 0.9593265472662458, + "grad_norm": 0.6089175939559937, + "learning_rate": 2.5855398735493697e-06, + "loss": 6.0019, + "step": 2792 + }, + { + "epoch": 0.9596701455997938, + "grad_norm": 0.6447976231575012, + "learning_rate": 2.5426999862427914e-06, + "loss": 5.9295, + "step": 2793 + }, + { + "epoch": 0.9600137439333419, + "grad_norm": 0.5977621674537659, + "learning_rate": 2.5002161636742125e-06, + "loss": 6.0715, + "step": 2794 + }, + { + "epoch": 0.96035734226689, + "grad_norm": 0.7416554689407349, + "learning_rate": 2.458088466973346e-06, + "loss": 6.1905, + "step": 2795 + }, + { + "epoch": 0.9607009406004381, + "grad_norm": 0.7069945931434631, + "learning_rate": 2.4163169567574526e-06, + "loss": 6.1577, + "step": 2796 + }, + { + "epoch": 0.9610445389339862, + "grad_norm": 0.8261065483093262, + "learning_rate": 2.3749016931313426e-06, + "loss": 6.1972, + "step": 2797 + }, + { + "epoch": 0.9613881372675342, + "grad_norm": 0.9036763310432434, + "learning_rate": 2.333842735687097e-06, + "loss": 6.0924, + "step": 2798 + }, + { + "epoch": 0.9617317356010824, + "grad_norm": 0.9265655279159546, + "learning_rate": 2.293140143504291e-06, + "loss": 6.1676, + "step": 2799 + }, + { + "epoch": 0.9620753339346304, + "grad_norm": 1.5305782556533813, + "learning_rate": 2.2527939751495476e-06, + "loss": 6.1288, + "step": 2800 + }, + { + "epoch": 0.9624189322681785, + "grad_norm": 0.5339880585670471, + "learning_rate": 2.212804288676706e-06, + "loss": 5.9065, + "step": 2801 + }, + { + "epoch": 0.9627625306017266, + "grad_norm": 0.5260118842124939, + "learning_rate": 2.1731711416267396e-06, + "loss": 5.9645, + "step": 2802 + }, + { + "epoch": 0.9631061289352747, + "grad_norm": 0.587617814540863, + "learning_rate": 2.1338945910274743e-06, + "loss": 5.8985, + "step": 2803 + }, + { + "epoch": 0.9634497272688227, + "grad_norm": 0.5828947424888611, + "learning_rate": 2.094974693393731e-06, + "loss": 5.9318, + "step": 2804 + }, + { + "epoch": 0.9637933256023709, + "grad_norm": 0.4665927290916443, + "learning_rate": 2.0564115047270458e-06, + "loss": 6.1154, + "step": 2805 + }, + { + "epoch": 0.9641369239359189, + "grad_norm": 0.5141355991363525, + "learning_rate": 2.0182050805158115e-06, + "loss": 5.991, + "step": 2806 + }, + { + "epoch": 0.964480522269467, + "grad_norm": 0.5290352702140808, + "learning_rate": 1.9803554757349685e-06, + "loss": 5.9607, + "step": 2807 + }, + { + "epoch": 0.964824120603015, + "grad_norm": 0.48139309883117676, + "learning_rate": 1.942862744846091e-06, + "loss": 5.9097, + "step": 2808 + }, + { + "epoch": 0.9651677189365632, + "grad_norm": 0.6123693585395813, + "learning_rate": 1.90572694179722e-06, + "loss": 5.9149, + "step": 2809 + }, + { + "epoch": 0.9655113172701112, + "grad_norm": 0.4561925530433655, + "learning_rate": 1.8689481200228064e-06, + "loss": 5.8467, + "step": 2810 + }, + { + "epoch": 0.9658549156036593, + "grad_norm": 0.526841402053833, + "learning_rate": 1.8325263324437125e-06, + "loss": 6.0141, + "step": 2811 + }, + { + "epoch": 0.9661985139372075, + "grad_norm": 0.49907776713371277, + "learning_rate": 1.796461631466989e-06, + "loss": 6.0447, + "step": 2812 + }, + { + "epoch": 0.9665421122707555, + "grad_norm": 0.6142306923866272, + "learning_rate": 1.7607540689859035e-06, + "loss": 5.8905, + "step": 2813 + }, + { + "epoch": 0.9668857106043036, + "grad_norm": 0.45012393593788147, + "learning_rate": 1.7254036963798569e-06, + "loss": 6.0046, + "step": 2814 + }, + { + "epoch": 0.9672293089378516, + "grad_norm": 0.4007306396961212, + "learning_rate": 1.6904105645142442e-06, + "loss": 6.0132, + "step": 2815 + }, + { + "epoch": 0.9675729072713998, + "grad_norm": 0.3214866816997528, + "learning_rate": 1.6557747237405107e-06, + "loss": 6.032, + "step": 2816 + }, + { + "epoch": 0.9679165056049478, + "grad_norm": 0.4845171868801117, + "learning_rate": 1.621496223895902e-06, + "loss": 5.9634, + "step": 2817 + }, + { + "epoch": 0.9682601039384959, + "grad_norm": 0.4626264274120331, + "learning_rate": 1.5875751143035465e-06, + "loss": 5.8288, + "step": 2818 + }, + { + "epoch": 0.968603702272044, + "grad_norm": 0.42864149808883667, + "learning_rate": 1.5540114437723185e-06, + "loss": 6.0096, + "step": 2819 + }, + { + "epoch": 0.9689473006055921, + "grad_norm": 0.3970453143119812, + "learning_rate": 1.5208052605967804e-06, + "loss": 5.9898, + "step": 2820 + }, + { + "epoch": 0.9692908989391401, + "grad_norm": 0.3872493505477905, + "learning_rate": 1.4879566125570732e-06, + "loss": 6.0657, + "step": 2821 + }, + { + "epoch": 0.9696344972726882, + "grad_norm": 0.4018920660018921, + "learning_rate": 1.4554655469189438e-06, + "loss": 5.975, + "step": 2822 + }, + { + "epoch": 0.9699780956062363, + "grad_norm": 0.45553165674209595, + "learning_rate": 1.4233321104335506e-06, + "loss": 6.0283, + "step": 2823 + }, + { + "epoch": 0.9703216939397844, + "grad_norm": 0.46554648876190186, + "learning_rate": 1.391556349337464e-06, + "loss": 6.0846, + "step": 2824 + }, + { + "epoch": 0.9706652922733324, + "grad_norm": 0.4684000313282013, + "learning_rate": 1.3601383093526931e-06, + "loss": 6.0688, + "step": 2825 + }, + { + "epoch": 0.9710088906068806, + "grad_norm": 0.4713064134120941, + "learning_rate": 1.3290780356864374e-06, + "loss": 6.0044, + "step": 2826 + }, + { + "epoch": 0.9713524889404287, + "grad_norm": 0.4442633092403412, + "learning_rate": 1.2983755730310854e-06, + "loss": 6.0409, + "step": 2827 + }, + { + "epoch": 0.9716960872739767, + "grad_norm": 0.4553023874759674, + "learning_rate": 1.2680309655642431e-06, + "loss": 6.0703, + "step": 2828 + }, + { + "epoch": 0.9720396856075249, + "grad_norm": 0.41858962178230286, + "learning_rate": 1.238044256948595e-06, + "loss": 6.0488, + "step": 2829 + }, + { + "epoch": 0.9723832839410729, + "grad_norm": 0.4985685348510742, + "learning_rate": 1.2084154903317934e-06, + "loss": 6.0478, + "step": 2830 + }, + { + "epoch": 0.972726882274621, + "grad_norm": 0.5059686899185181, + "learning_rate": 1.1791447083465134e-06, + "loss": 5.9557, + "step": 2831 + }, + { + "epoch": 0.973070480608169, + "grad_norm": 0.5063678026199341, + "learning_rate": 1.150231953110259e-06, + "loss": 6.1021, + "step": 2832 + }, + { + "epoch": 0.9734140789417172, + "grad_norm": 0.4942188858985901, + "learning_rate": 1.1216772662254182e-06, + "loss": 6.0075, + "step": 2833 + }, + { + "epoch": 0.9737576772752652, + "grad_norm": 0.5138784646987915, + "learning_rate": 1.0934806887791803e-06, + "loss": 5.968, + "step": 2834 + }, + { + "epoch": 0.9741012756088133, + "grad_norm": 0.4592101275920868, + "learning_rate": 1.065642261343397e-06, + "loss": 6.1217, + "step": 2835 + }, + { + "epoch": 0.9744448739423613, + "grad_norm": 0.5657181739807129, + "learning_rate": 1.0381620239746093e-06, + "loss": 5.928, + "step": 2836 + }, + { + "epoch": 0.9747884722759095, + "grad_norm": 0.5013971924781799, + "learning_rate": 1.0110400162139377e-06, + "loss": 5.9431, + "step": 2837 + }, + { + "epoch": 0.9751320706094575, + "grad_norm": 0.5016094446182251, + "learning_rate": 9.842762770871094e-07, + "loss": 6.0844, + "step": 2838 + }, + { + "epoch": 0.9754756689430056, + "grad_norm": 0.5339999794960022, + "learning_rate": 9.57870845104264e-07, + "loss": 5.9993, + "step": 2839 + }, + { + "epoch": 0.9758192672765537, + "grad_norm": 0.5596013069152832, + "learning_rate": 9.318237582600086e-07, + "loss": 6.081, + "step": 2840 + }, + { + "epoch": 0.9761628656101018, + "grad_norm": 0.6841441988945007, + "learning_rate": 9.061350540333635e-07, + "loss": 5.903, + "step": 2841 + }, + { + "epoch": 0.9765064639436499, + "grad_norm": 0.6830129623413086, + "learning_rate": 8.80804769387622e-07, + "loss": 6.1143, + "step": 2842 + }, + { + "epoch": 0.976850062277198, + "grad_norm": 0.6157656311988831, + "learning_rate": 8.558329407703514e-07, + "loss": 6.0526, + "step": 2843 + }, + { + "epoch": 0.9771936606107461, + "grad_norm": 0.6276822686195374, + "learning_rate": 8.312196041133923e-07, + "loss": 6.06, + "step": 2844 + }, + { + "epoch": 0.9775372589442941, + "grad_norm": 0.6318298578262329, + "learning_rate": 8.069647948326653e-07, + "loss": 6.0182, + "step": 2845 + }, + { + "epoch": 0.9778808572778422, + "grad_norm": 0.73749178647995, + "learning_rate": 7.830685478283362e-07, + "loss": 6.1867, + "step": 2846 + }, + { + "epoch": 0.9782244556113903, + "grad_norm": 0.8540349006652832, + "learning_rate": 7.595308974845117e-07, + "loss": 6.0417, + "step": 2847 + }, + { + "epoch": 0.9785680539449384, + "grad_norm": 0.9874235987663269, + "learning_rate": 7.363518776694056e-07, + "loss": 6.0435, + "step": 2848 + }, + { + "epoch": 0.9789116522784864, + "grad_norm": 0.8798337578773499, + "learning_rate": 7.135315217350891e-07, + "loss": 6.1899, + "step": 2849 + }, + { + "epoch": 0.9792552506120346, + "grad_norm": 1.329888939857483, + "learning_rate": 6.910698625177126e-07, + "loss": 6.2289, + "step": 2850 + }, + { + "epoch": 0.9795988489455826, + "grad_norm": 0.4526410400867462, + "learning_rate": 6.689669323371728e-07, + "loss": 6.0513, + "step": 2851 + }, + { + "epoch": 0.9799424472791307, + "grad_norm": 0.4708828628063202, + "learning_rate": 6.472227629972239e-07, + "loss": 6.0642, + "step": 2852 + }, + { + "epoch": 0.9802860456126787, + "grad_norm": 0.3837355077266693, + "learning_rate": 6.258373857854494e-07, + "loss": 5.9422, + "step": 2853 + }, + { + "epoch": 0.9806296439462269, + "grad_norm": 0.47510474920272827, + "learning_rate": 6.04810831473096e-07, + "loss": 5.9057, + "step": 2854 + }, + { + "epoch": 0.9809732422797749, + "grad_norm": 0.42741143703460693, + "learning_rate": 5.841431303151845e-07, + "loss": 5.9465, + "step": 2855 + }, + { + "epoch": 0.981316840613323, + "grad_norm": 0.5458832383155823, + "learning_rate": 5.638343120502598e-07, + "loss": 6.0429, + "step": 2856 + }, + { + "epoch": 0.9816604389468712, + "grad_norm": 0.46792909502983093, + "learning_rate": 5.438844059006409e-07, + "loss": 6.0272, + "step": 2857 + }, + { + "epoch": 0.9820040372804192, + "grad_norm": 0.6238328218460083, + "learning_rate": 5.242934405720879e-07, + "loss": 5.818, + "step": 2858 + }, + { + "epoch": 0.9823476356139673, + "grad_norm": 0.5959128737449646, + "learning_rate": 5.050614442538848e-07, + "loss": 5.7514, + "step": 2859 + }, + { + "epoch": 0.9826912339475153, + "grad_norm": 0.45926153659820557, + "learning_rate": 4.86188444618868e-07, + "loss": 5.8914, + "step": 2860 + }, + { + "epoch": 0.9830348322810635, + "grad_norm": 0.39194220304489136, + "learning_rate": 4.6767446882328703e-07, + "loss": 6.0244, + "step": 2861 + }, + { + "epoch": 0.9833784306146115, + "grad_norm": 0.4128912389278412, + "learning_rate": 4.495195435067212e-07, + "loss": 6.0298, + "step": 2862 + }, + { + "epoch": 0.9837220289481596, + "grad_norm": 0.5608128905296326, + "learning_rate": 4.317236947922465e-07, + "loss": 5.8793, + "step": 2863 + }, + { + "epoch": 0.9840656272817077, + "grad_norm": 0.42504721879959106, + "learning_rate": 4.142869482861578e-07, + "loss": 5.8194, + "step": 2864 + }, + { + "epoch": 0.9844092256152558, + "grad_norm": 0.33117401599884033, + "learning_rate": 3.972093290781076e-07, + "loss": 6.0172, + "step": 2865 + }, + { + "epoch": 0.9847528239488038, + "grad_norm": 0.4305660128593445, + "learning_rate": 3.8049086174093973e-07, + "loss": 6.0165, + "step": 2866 + }, + { + "epoch": 0.985096422282352, + "grad_norm": 0.37048450112342834, + "learning_rate": 3.641315703307724e-07, + "loss": 5.973, + "step": 2867 + }, + { + "epoch": 0.9854400206159, + "grad_norm": 0.4533330500125885, + "learning_rate": 3.481314783868594e-07, + "loss": 6.0333, + "step": 2868 + }, + { + "epoch": 0.9857836189494481, + "grad_norm": 0.3933621048927307, + "learning_rate": 3.324906089316737e-07, + "loss": 6.0991, + "step": 2869 + }, + { + "epoch": 0.9861272172829961, + "grad_norm": 0.4046175479888916, + "learning_rate": 3.172089844707404e-07, + "loss": 6.0109, + "step": 2870 + }, + { + "epoch": 0.9864708156165443, + "grad_norm": 0.42547526955604553, + "learning_rate": 3.0228662699266494e-07, + "loss": 5.8854, + "step": 2871 + }, + { + "epoch": 0.9868144139500924, + "grad_norm": 0.4869686961174011, + "learning_rate": 2.8772355796918836e-07, + "loss": 5.8482, + "step": 2872 + }, + { + "epoch": 0.9871580122836404, + "grad_norm": 0.5427522659301758, + "learning_rate": 2.7351979835496534e-07, + "loss": 6.1293, + "step": 2873 + }, + { + "epoch": 0.9875016106171886, + "grad_norm": 0.408896267414093, + "learning_rate": 2.596753685877584e-07, + "loss": 6.0766, + "step": 2874 + }, + { + "epoch": 0.9878452089507366, + "grad_norm": 0.45009753108024597, + "learning_rate": 2.461902885881606e-07, + "loss": 6.1401, + "step": 2875 + }, + { + "epoch": 0.9881888072842847, + "grad_norm": 0.45075657963752747, + "learning_rate": 2.3306457775981728e-07, + "loss": 5.9555, + "step": 2876 + }, + { + "epoch": 0.9885324056178327, + "grad_norm": 0.4548248052597046, + "learning_rate": 2.202982549892041e-07, + "loss": 6.0435, + "step": 2877 + }, + { + "epoch": 0.9888760039513809, + "grad_norm": 0.44134655594825745, + "learning_rate": 2.0789133864571042e-07, + "loss": 5.957, + "step": 2878 + }, + { + "epoch": 0.9892196022849289, + "grad_norm": 0.44243818521499634, + "learning_rate": 1.9584384658158371e-07, + "loss": 6.0225, + "step": 2879 + }, + { + "epoch": 0.989563200618477, + "grad_norm": 0.5458806157112122, + "learning_rate": 1.841557961318463e-07, + "loss": 5.9614, + "step": 2880 + }, + { + "epoch": 0.989906798952025, + "grad_norm": 0.4366854131221771, + "learning_rate": 1.7282720411437858e-07, + "loss": 6.0524, + "step": 2881 + }, + { + "epoch": 0.9902503972855732, + "grad_norm": 0.4618584215641022, + "learning_rate": 1.6185808682986358e-07, + "loss": 6.0012, + "step": 2882 + }, + { + "epoch": 0.9905939956191212, + "grad_norm": 0.49428001046180725, + "learning_rate": 1.512484600616204e-07, + "loss": 6.0846, + "step": 2883 + }, + { + "epoch": 0.9909375939526693, + "grad_norm": 0.5537979602813721, + "learning_rate": 1.4099833907582627e-07, + "loss": 5.92, + "step": 2884 + }, + { + "epoch": 0.9912811922862174, + "grad_norm": 0.45787203311920166, + "learning_rate": 1.3110773862126668e-07, + "loss": 6.1011, + "step": 2885 + }, + { + "epoch": 0.9916247906197655, + "grad_norm": 0.48102542757987976, + "learning_rate": 1.2157667292952978e-07, + "loss": 5.9676, + "step": 2886 + }, + { + "epoch": 0.9919683889533136, + "grad_norm": 0.518682062625885, + "learning_rate": 1.1240515571475651e-07, + "loss": 6.0037, + "step": 2887 + }, + { + "epoch": 0.9923119872868617, + "grad_norm": 0.5656102299690247, + "learning_rate": 1.0359320017377937e-07, + "loss": 5.9474, + "step": 2888 + }, + { + "epoch": 0.9926555856204098, + "grad_norm": 0.4793003797531128, + "learning_rate": 9.514081898612247e-08, + "loss": 6.1492, + "step": 2889 + }, + { + "epoch": 0.9929991839539578, + "grad_norm": 0.6143734455108643, + "learning_rate": 8.704802431377945e-08, + "loss": 5.9602, + "step": 2890 + }, + { + "epoch": 0.9933427822875059, + "grad_norm": 0.5799821019172668, + "learning_rate": 7.931482780149102e-08, + "loss": 6.0795, + "step": 2891 + }, + { + "epoch": 0.993686380621054, + "grad_norm": 0.6480079889297485, + "learning_rate": 7.194124057649521e-08, + "loss": 6.1525, + "step": 2892 + }, + { + "epoch": 0.9940299789546021, + "grad_norm": 0.5958256125450134, + "learning_rate": 6.492727324855508e-08, + "loss": 6.0316, + "step": 2893 + }, + { + "epoch": 0.9943735772881501, + "grad_norm": 0.7720398902893066, + "learning_rate": 5.8272935910069765e-08, + "loss": 6.2484, + "step": 2894 + }, + { + "epoch": 0.9947171756216983, + "grad_norm": 0.5963339805603027, + "learning_rate": 5.1978238135907965e-08, + "loss": 6.1121, + "step": 2895 + }, + { + "epoch": 0.9950607739552463, + "grad_norm": 0.8526974320411682, + "learning_rate": 4.604318898346338e-08, + "loss": 6.0864, + "step": 2896 + }, + { + "epoch": 0.9954043722887944, + "grad_norm": 0.820821225643158, + "learning_rate": 4.0467796992654795e-08, + "loss": 6.1486, + "step": 2897 + }, + { + "epoch": 0.9957479706223424, + "grad_norm": 0.8554714918136597, + "learning_rate": 3.5252070185870514e-08, + "loss": 6.2766, + "step": 2898 + }, + { + "epoch": 0.9960915689558906, + "grad_norm": 0.914206862449646, + "learning_rate": 3.039601606796838e-08, + "loss": 6.222, + "step": 2899 + }, + { + "epoch": 0.9964351672894386, + "grad_norm": 1.1582309007644653, + "learning_rate": 2.5899641626331295e-08, + "loss": 6.2757, + "step": 2900 + }, + { + "epoch": 0.9967787656229867, + "grad_norm": 0.46385082602500916, + "learning_rate": 2.1762953330728417e-08, + "loss": 5.9457, + "step": 2901 + }, + { + "epoch": 0.9971223639565349, + "grad_norm": 0.5638375282287598, + "learning_rate": 1.798595713342621e-08, + "loss": 5.8336, + "step": 2902 + }, + { + "epoch": 0.9974659622900829, + "grad_norm": 0.42126670479774475, + "learning_rate": 1.4568658469132912e-08, + "loss": 5.9258, + "step": 2903 + }, + { + "epoch": 0.997809560623631, + "grad_norm": 0.44771361351013184, + "learning_rate": 1.1511062254970784e-08, + "loss": 5.9978, + "step": 2904 + }, + { + "epoch": 0.998153158957179, + "grad_norm": 0.44968634843826294, + "learning_rate": 8.813172890503874e-09, + "loss": 6.0372, + "step": 2905 + }, + { + "epoch": 0.9984967572907272, + "grad_norm": 0.48755761981010437, + "learning_rate": 6.4749942576824986e-09, + "loss": 5.919, + "step": 2906 + }, + { + "epoch": 0.9988403556242752, + "grad_norm": 0.5411795973777771, + "learning_rate": 4.496529720926512e-09, + "loss": 6.0362, + "step": 2907 + }, + { + "epoch": 0.9991839539578233, + "grad_norm": 0.5617440938949585, + "learning_rate": 2.8777821270142835e-09, + "loss": 6.0123, + "step": 2908 + }, + { + "epoch": 0.9995275522913714, + "grad_norm": 0.5796970725059509, + "learning_rate": 1.6187538051382067e-09, + "loss": 6.0778, + "step": 2909 + }, + { + "epoch": 0.9998711506249195, + "grad_norm": 0.6862174868583679, + "learning_rate": 7.194465669602135e-10, + "loss": 6.1623, + "step": 2910 + }, + { + "epoch": 1.0, + "grad_norm": 1.5478452444076538, + "learning_rate": 1.7986170644523901e-10, + "loss": 6.0827, + "step": 2911 + } + ], + "logging_steps": 1, + "max_steps": 2911, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 102226707072000.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}