diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -10,13612 +10,13612 @@ "log_history": [ { "epoch": 0.005865102639296188, - "grad_norm": 36.14509413690048, + "grad_norm": 37.9880537632157, "learning_rate": 7.843137254901962e-07, - "loss": 2.5787, - "mean_token_accuracy": 0.4923335835337639, + "loss": 2.3824, + "mean_token_accuracy": 0.5152561776340008, "step": 1 }, { "epoch": 0.011730205278592375, - "grad_norm": 36.4475794468042, + "grad_norm": 38.24663692242419, "learning_rate": 1.5686274509803923e-06, - "loss": 2.6539, - "mean_token_accuracy": 0.48734448477625847, + "loss": 2.458, + "mean_token_accuracy": 0.5099295005202293, "step": 2 }, { "epoch": 0.017595307917888565, - "grad_norm": 32.10454485992091, + "grad_norm": 34.01933715185712, "learning_rate": 2.3529411764705885e-06, - "loss": 2.543, - "mean_token_accuracy": 0.48166975751519203, + "loss": 2.3636, + "mean_token_accuracy": 0.5083074681460857, "step": 3 }, { "epoch": 0.02346041055718475, - "grad_norm": 26.242796758548412, + "grad_norm": 27.051177114583112, "learning_rate": 3.1372549019607846e-06, - "loss": 2.4279, - "mean_token_accuracy": 0.5031268112361431, + "loss": 2.251, + "mean_token_accuracy": 0.5226383097469807, "step": 4 }, { "epoch": 0.02932551319648094, - "grad_norm": 15.599962593546184, + "grad_norm": 19.727174289533878, "learning_rate": 3.92156862745098e-06, - "loss": 2.1324, - "mean_token_accuracy": 0.5300704799592495, + "loss": 1.998, + "mean_token_accuracy": 0.555543951690197, "step": 5 }, { "epoch": 0.03519061583577713, - "grad_norm": 12.006705982946158, + "grad_norm": 21.744702794564137, "learning_rate": 4.705882352941177e-06, - "loss": 1.8898, - "mean_token_accuracy": 0.572486124932766, + "loss": 1.7848, + "mean_token_accuracy": 0.592386782169342, "step": 6 }, { "epoch": 0.04105571847507331, - "grad_norm": 11.765766521790566, + "grad_norm": 17.275924410129907, "learning_rate": 5.4901960784313735e-06, - "loss": 1.8913, - "mean_token_accuracy": 0.5782412365078926, + "loss": 1.7557, + "mean_token_accuracy": 0.6163813918828964, "step": 7 }, { "epoch": 0.0469208211143695, - "grad_norm": 9.957672917685553, + "grad_norm": 10.019425176366797, "learning_rate": 6.274509803921569e-06, - "loss": 1.6377, - "mean_token_accuracy": 0.6252397820353508, + "loss": 1.5142, + "mean_token_accuracy": 0.6524599567055702, "step": 8 }, { "epoch": 0.05278592375366569, - "grad_norm": 7.539104060500998, + "grad_norm": 9.550167622452179, "learning_rate": 7.058823529411766e-06, - "loss": 1.5152, - "mean_token_accuracy": 0.64116121083498, + "loss": 1.3688, + "mean_token_accuracy": 0.6706466227769852, "step": 9 }, { "epoch": 0.05865102639296188, - "grad_norm": 6.311700758053688, + "grad_norm": 7.641040982752429, "learning_rate": 7.84313725490196e-06, - "loss": 1.4626, - "mean_token_accuracy": 0.6572283431887627, + "loss": 1.3224, + "mean_token_accuracy": 0.6872824132442474, "step": 10 }, { "epoch": 0.06451612903225806, - "grad_norm": 6.03423316811303, + "grad_norm": 8.570091560611116, "learning_rate": 8.627450980392157e-06, - "loss": 1.32, - "mean_token_accuracy": 0.6869097501039505, + "loss": 1.2134, + "mean_token_accuracy": 0.7061515673995018, "step": 11 }, { "epoch": 0.07038123167155426, - "grad_norm": 5.8161864206350735, + "grad_norm": 11.290944541776646, "learning_rate": 9.411764705882354e-06, - "loss": 1.2741, - "mean_token_accuracy": 0.6884993687272072, + "loss": 1.2342, + "mean_token_accuracy": 0.69711734354496, "step": 12 }, { "epoch": 0.07624633431085044, - "grad_norm": 10.50308433461848, + "grad_norm": 7.038682211553209, "learning_rate": 1.0196078431372549e-05, - "loss": 1.2259, - "mean_token_accuracy": 0.6989131048321724, + "loss": 1.1575, + "mean_token_accuracy": 0.7149083167314529, "step": 13 }, { "epoch": 0.08211143695014662, - "grad_norm": 5.059241917511266, + "grad_norm": 6.932933409214858, "learning_rate": 1.0980392156862747e-05, - "loss": 1.3513, - "mean_token_accuracy": 0.6826794818043709, + "loss": 1.2858, + "mean_token_accuracy": 0.6853161156177521, "step": 14 }, { "epoch": 0.08797653958944282, - "grad_norm": 5.89298633359988, + "grad_norm": 5.731782601062065, "learning_rate": 1.1764705882352942e-05, - "loss": 1.3141, - "mean_token_accuracy": 0.6757656708359718, + "loss": 1.2554, + "mean_token_accuracy": 0.6931049451231956, "step": 15 }, { "epoch": 0.093841642228739, - "grad_norm": 5.168080959685619, + "grad_norm": 4.916389732621989, "learning_rate": 1.2549019607843138e-05, - "loss": 1.2261, - "mean_token_accuracy": 0.691883496940136, + "loss": 1.148, + "mean_token_accuracy": 0.7178131565451622, "step": 16 }, { "epoch": 0.09970674486803519, - "grad_norm": 4.484141065615124, + "grad_norm": 4.330030082475362, "learning_rate": 1.3333333333333333e-05, - "loss": 1.1443, - "mean_token_accuracy": 0.7193374708294868, + "loss": 1.0816, + "mean_token_accuracy": 0.7251480892300606, "step": 17 }, { "epoch": 0.10557184750733138, - "grad_norm": 5.453207713810213, + "grad_norm": 3.649363120902478, "learning_rate": 1.4117647058823532e-05, - "loss": 1.1347, - "mean_token_accuracy": 0.7149711772799492, + "loss": 1.0693, + "mean_token_accuracy": 0.7300560250878334, "step": 18 }, { "epoch": 0.11143695014662756, - "grad_norm": 4.536133715127149, + "grad_norm": 4.1519920206320196, "learning_rate": 1.4901960784313726e-05, - "loss": 1.2998, - "mean_token_accuracy": 0.6797390431165695, + "loss": 1.2506, + "mean_token_accuracy": 0.6863887533545494, "step": 19 }, { "epoch": 0.11730205278592376, - "grad_norm": 4.246131189412244, + "grad_norm": 3.8767361271495604, "learning_rate": 1.568627450980392e-05, - "loss": 1.1751, - "mean_token_accuracy": 0.7038690000772476, + "loss": 1.121, + "mean_token_accuracy": 0.7109393775463104, "step": 20 }, { "epoch": 0.12316715542521994, - "grad_norm": 4.073009596887261, + "grad_norm": 3.6104844493465396, "learning_rate": 1.647058823529412e-05, - "loss": 1.1252, - "mean_token_accuracy": 0.7140218988060951, + "loss": 1.0827, + "mean_token_accuracy": 0.7218014225363731, "step": 21 }, { "epoch": 0.12903225806451613, - "grad_norm": 3.699907899806749, + "grad_norm": 4.007119265101829, "learning_rate": 1.7254901960784314e-05, - "loss": 1.149, - "mean_token_accuracy": 0.714351512491703, + "loss": 1.1092, + "mean_token_accuracy": 0.7199599072337151, "step": 22 }, { "epoch": 0.1348973607038123, - "grad_norm": 3.972868223544853, + "grad_norm": 3.6042930404260085, "learning_rate": 1.8039215686274513e-05, - "loss": 1.0669, - "mean_token_accuracy": 0.7354029938578606, + "loss": 1.0332, + "mean_token_accuracy": 0.7422607615590096, "step": 23 }, { "epoch": 0.14076246334310852, - "grad_norm": 3.7388993855253982, + "grad_norm": 4.219241012714524, "learning_rate": 1.8823529411764708e-05, - "loss": 1.0795, - "mean_token_accuracy": 0.7333876341581345, + "loss": 1.0517, + "mean_token_accuracy": 0.74069644510746, "step": 24 }, { "epoch": 0.1466275659824047, - "grad_norm": 4.046285000278155, + "grad_norm": 3.9305087432057513, "learning_rate": 1.9607843137254903e-05, - "loss": 1.0917, - "mean_token_accuracy": 0.7187787368893623, + "loss": 1.0245, + "mean_token_accuracy": 0.7364556193351746, "step": 25 }, { "epoch": 0.15249266862170088, - "grad_norm": 4.194005650841938, + "grad_norm": 4.6730992254929635, "learning_rate": 2.0392156862745097e-05, - "loss": 1.1695, - "mean_token_accuracy": 0.7088376209139824, + "loss": 1.0677, + "mean_token_accuracy": 0.7419489920139313, "step": 26 }, { "epoch": 0.15835777126099707, - "grad_norm": 4.162135874639218, + "grad_norm": 3.785097641304545, "learning_rate": 2.1176470588235296e-05, - "loss": 1.0882, - "mean_token_accuracy": 0.7265072241425514, + "loss": 1.0868, + "mean_token_accuracy": 0.7323751673102379, "step": 27 }, { "epoch": 0.16422287390029325, - "grad_norm": 3.6630230495700165, + "grad_norm": 3.481309479312975, "learning_rate": 2.1960784313725494e-05, - "loss": 0.9752, - "mean_token_accuracy": 0.7463881373405457, + "loss": 0.9842, + "mean_token_accuracy": 0.7453407794237137, "step": 28 }, { "epoch": 0.17008797653958943, - "grad_norm": 4.192134812090267, + "grad_norm": 3.6592311495703087, "learning_rate": 2.274509803921569e-05, - "loss": 1.0355, - "mean_token_accuracy": 0.7389096990227699, + "loss": 1.0105, + "mean_token_accuracy": 0.7409603744745255, "step": 29 }, { "epoch": 0.17595307917888564, - "grad_norm": 4.153201939553374, + "grad_norm": 4.514364484724581, "learning_rate": 2.3529411764705884e-05, - "loss": 1.08, - "mean_token_accuracy": 0.723046787083149, + "loss": 1.1021, + "mean_token_accuracy": 0.7238170579075813, "step": 30 }, { "epoch": 0.18181818181818182, - "grad_norm": 4.158994764338232, + "grad_norm": 3.463973104001894, "learning_rate": 2.431372549019608e-05, - "loss": 1.2401, - "mean_token_accuracy": 0.7062453478574753, + "loss": 1.2255, + "mean_token_accuracy": 0.7069308832287788, "step": 31 }, { "epoch": 0.187683284457478, - "grad_norm": 4.056181311918012, + "grad_norm": 3.6412915937094206, "learning_rate": 2.5098039215686277e-05, - "loss": 0.967, - "mean_token_accuracy": 0.750753328204155, + "loss": 0.979, + "mean_token_accuracy": 0.7486892268061638, "step": 32 }, { "epoch": 0.1935483870967742, - "grad_norm": 4.093997127140846, + "grad_norm": 3.771238738745046, "learning_rate": 2.5882352941176475e-05, - "loss": 1.0725, - "mean_token_accuracy": 0.7313473895192146, + "loss": 1.0749, + "mean_token_accuracy": 0.7376544252038002, "step": 33 }, { "epoch": 0.19941348973607037, - "grad_norm": 3.2432440297693557, + "grad_norm": 2.849785727292785, "learning_rate": 2.6666666666666667e-05, - "loss": 0.9912, - "mean_token_accuracy": 0.7401512935757637, + "loss": 0.9826, + "mean_token_accuracy": 0.7478941157460213, "step": 34 }, { "epoch": 0.20527859237536658, - "grad_norm": 3.8928776327849253, + "grad_norm": 3.0734278923556877, "learning_rate": 2.7450980392156865e-05, - "loss": 1.023, - "mean_token_accuracy": 0.7389869540929794, + "loss": 1.0049, + "mean_token_accuracy": 0.7469039931893349, "step": 35 }, { "epoch": 0.21114369501466276, - "grad_norm": 3.8559916286631495, + "grad_norm": 4.114646538131496, "learning_rate": 2.8235294117647063e-05, - "loss": 1.045, - "mean_token_accuracy": 0.7432608231902122, + "loss": 1.0529, + "mean_token_accuracy": 0.7404080405831337, "step": 36 }, { "epoch": 0.21700879765395895, - "grad_norm": 3.420716935604122, + "grad_norm": 3.3388236941344567, "learning_rate": 2.9019607843137258e-05, - "loss": 0.9842, - "mean_token_accuracy": 0.7604367211461067, + "loss": 0.9828, + "mean_token_accuracy": 0.7534837573766708, "step": 37 }, { "epoch": 0.22287390029325513, - "grad_norm": 3.5100168737573747, + "grad_norm": 3.4638437496108825, "learning_rate": 2.9803921568627453e-05, - "loss": 0.921, - "mean_token_accuracy": 0.7548864558339119, + "loss": 0.9247, + "mean_token_accuracy": 0.7604290992021561, "step": 38 }, { "epoch": 0.2287390029325513, - "grad_norm": 3.7074432289739767, + "grad_norm": 3.7752117816398703, "learning_rate": 3.0588235294117644e-05, - "loss": 1.1716, - "mean_token_accuracy": 0.7088168561458588, + "loss": 1.1895, + "mean_token_accuracy": 0.7074295058846474, "step": 39 }, { "epoch": 0.23460410557184752, - "grad_norm": 3.3102650365950748, + "grad_norm": 3.528996257191355, "learning_rate": 3.137254901960784e-05, - "loss": 0.9172, - "mean_token_accuracy": 0.7633765339851379, + "loss": 0.9425, + "mean_token_accuracy": 0.7592396438121796, "step": 40 }, { "epoch": 0.2404692082111437, - "grad_norm": 3.149920605006581, + "grad_norm": 3.0732857130253364, "learning_rate": 3.215686274509804e-05, - "loss": 0.9249, - "mean_token_accuracy": 0.7586075663566589, + "loss": 0.9358, + "mean_token_accuracy": 0.7570799216628075, "step": 41 }, { "epoch": 0.24633431085043989, - "grad_norm": 3.2393738963150964, + "grad_norm": 3.279368475473708, "learning_rate": 3.294117647058824e-05, - "loss": 0.8995, - "mean_token_accuracy": 0.7706887423992157, + "loss": 0.8826, + "mean_token_accuracy": 0.7769674882292747, "step": 42 }, { "epoch": 0.25219941348973607, - "grad_norm": 2.952349602555809, + "grad_norm": 3.443187648731257, "learning_rate": 3.372549019607844e-05, - "loss": 0.7726, - "mean_token_accuracy": 0.7944382950663567, + "loss": 0.797, + "mean_token_accuracy": 0.7896388620138168, "step": 43 }, { "epoch": 0.25806451612903225, - "grad_norm": 3.252665163092252, + "grad_norm": 3.0279904907512933, "learning_rate": 3.450980392156863e-05, - "loss": 0.9039, - "mean_token_accuracy": 0.7575281262397766, + "loss": 0.9077, + "mean_token_accuracy": 0.7659174352884293, "step": 44 }, { "epoch": 0.26392961876832843, - "grad_norm": 3.3763731210881596, + "grad_norm": 3.2577062755111563, "learning_rate": 3.529411764705883e-05, - "loss": 0.8264, - "mean_token_accuracy": 0.7781133502721786, + "loss": 0.8252, + "mean_token_accuracy": 0.7776849120855331, "step": 45 }, { "epoch": 0.2697947214076246, - "grad_norm": 3.30879019930114, + "grad_norm": 3.3968975278177354, "learning_rate": 3.6078431372549025e-05, - "loss": 0.9723, - "mean_token_accuracy": 0.7479118257761002, + "loss": 0.9826, + "mean_token_accuracy": 0.7461246326565742, "step": 46 }, { "epoch": 0.2756598240469208, - "grad_norm": 3.464519745062525, + "grad_norm": 3.151238810305185, "learning_rate": 3.686274509803922e-05, - "loss": 0.9331, - "mean_token_accuracy": 0.7574068754911423, + "loss": 0.9504, + "mean_token_accuracy": 0.754719465970993, "step": 47 }, { "epoch": 0.28152492668621704, - "grad_norm": 3.1553239901431693, + "grad_norm": 2.989913005627508, "learning_rate": 3.7647058823529415e-05, - "loss": 0.8816, - "mean_token_accuracy": 0.7704252302646637, + "loss": 0.8824, + "mean_token_accuracy": 0.7735645473003387, "step": 48 }, { "epoch": 0.2873900293255132, - "grad_norm": 3.346562743926702, + "grad_norm": 3.0581268945006874, "learning_rate": 3.8431372549019614e-05, - "loss": 0.8737, - "mean_token_accuracy": 0.7910923138260841, + "loss": 0.8797, + "mean_token_accuracy": 0.7825532108545303, "step": 49 }, { "epoch": 0.2932551319648094, - "grad_norm": 3.3482663437109106, + "grad_norm": 3.3747466422512318, "learning_rate": 3.9215686274509805e-05, - "loss": 1.0582, - "mean_token_accuracy": 0.7400793433189392, + "loss": 1.0821, + "mean_token_accuracy": 0.7373353093862534, "step": 50 }, { "epoch": 0.2991202346041056, - "grad_norm": 2.808453690268802, + "grad_norm": 2.750915239042727, "learning_rate": 4e-05, - "loss": 0.8091, - "mean_token_accuracy": 0.7851236239075661, + "loss": 0.8237, + "mean_token_accuracy": 0.7809458151459694, "step": 51 }, { "epoch": 0.30498533724340177, - "grad_norm": 3.3563707690930378, + "grad_norm": 3.1589022742143533, "learning_rate": 3.999996733363487e-05, - "loss": 0.9554, - "mean_token_accuracy": 0.759026862680912, + "loss": 0.9686, + "mean_token_accuracy": 0.759947344660759, "step": 52 }, { "epoch": 0.31085043988269795, - "grad_norm": 3.177868427473436, + "grad_norm": 2.9486143286261512, "learning_rate": 3.9999869334658026e-05, - "loss": 0.8829, - "mean_token_accuracy": 0.7689605951309204, + "loss": 0.8867, + "mean_token_accuracy": 0.7693730369210243, "step": 53 }, { "epoch": 0.31671554252199413, - "grad_norm": 3.155245803816592, + "grad_norm": 2.871851544463913, "learning_rate": 3.9999706003425177e-05, - "loss": 0.9032, - "mean_token_accuracy": 0.7667737677693367, + "loss": 0.9387, + "mean_token_accuracy": 0.7589974477887154, "step": 54 }, { "epoch": 0.3225806451612903, - "grad_norm": 3.178614863392828, + "grad_norm": 3.450921621641314, "learning_rate": 3.999947734052915e-05, - "loss": 1.0082, - "mean_token_accuracy": 0.746660441160202, + "loss": 1.0382, + "mean_token_accuracy": 0.7438068389892578, "step": 55 }, { "epoch": 0.3284457478005865, - "grad_norm": 2.8354174033280786, + "grad_norm": 2.887445480842238, "learning_rate": 3.999918334679989e-05, - "loss": 0.9165, - "mean_token_accuracy": 0.7646413072943687, + "loss": 0.9559, + "mean_token_accuracy": 0.7529247477650642, "step": 56 }, { "epoch": 0.3343108504398827, - "grad_norm": 3.050044566712993, + "grad_norm": 3.0892814108067017, "learning_rate": 3.999882402330448e-05, - "loss": 0.8559, - "mean_token_accuracy": 0.7663176953792572, + "loss": 0.8971, + "mean_token_accuracy": 0.7540364935994148, "step": 57 }, { "epoch": 0.34017595307917886, - "grad_norm": 2.766027156435859, + "grad_norm": 3.0094437217519134, "learning_rate": 3.999839937134712e-05, - "loss": 0.8021, - "mean_token_accuracy": 0.7877992242574692, + "loss": 0.8167, + "mean_token_accuracy": 0.7749749794602394, "step": 58 }, { "epoch": 0.3460410557184751, - "grad_norm": 3.1089150664448253, + "grad_norm": 3.3487134647597356, "learning_rate": 3.999790939246912e-05, - "loss": 1.0441, - "mean_token_accuracy": 0.7402208596467972, + "loss": 1.0873, + "mean_token_accuracy": 0.7266764640808105, "step": 59 }, { "epoch": 0.3519061583577713, - "grad_norm": 3.070052989823742, + "grad_norm": 2.6453420164279966, "learning_rate": 3.999735408844892e-05, - "loss": 0.8303, - "mean_token_accuracy": 0.7838614583015442, + "loss": 0.8314, + "mean_token_accuracy": 0.7851419448852539, "step": 60 }, { "epoch": 0.35777126099706746, - "grad_norm": 2.996347966135584, + "grad_norm": 2.616232246648729, "learning_rate": 3.999673346130203e-05, - "loss": 0.9312, - "mean_token_accuracy": 0.7652290537953377, + "loss": 0.9273, + "mean_token_accuracy": 0.7678155675530434, "step": 61 }, { "epoch": 0.36363636363636365, - "grad_norm": 2.690721520008225, + "grad_norm": 2.7639102371481785, "learning_rate": 3.999604751328109e-05, - "loss": 0.7596, - "mean_token_accuracy": 0.8071479573845863, + "loss": 0.7851, + "mean_token_accuracy": 0.7944702953100204, "step": 62 }, { "epoch": 0.36950146627565983, - "grad_norm": 2.773174553879102, + "grad_norm": 2.9253735230565963, "learning_rate": 3.999529624687581e-05, - "loss": 0.7103, - "mean_token_accuracy": 0.8112666308879852, + "loss": 0.7166, + "mean_token_accuracy": 0.812786877155304, "step": 63 }, { "epoch": 0.375366568914956, - "grad_norm": 2.93779834083201, + "grad_norm": 3.035668751194949, "learning_rate": 3.999447966481298e-05, - "loss": 0.8697, - "mean_token_accuracy": 0.7921107411384583, + "loss": 0.8926, + "mean_token_accuracy": 0.7874413877725601, "step": 64 }, { "epoch": 0.3812316715542522, - "grad_norm": 3.1672195550770197, + "grad_norm": 3.081461551291182, "learning_rate": 3.999359777005647e-05, - "loss": 0.9436, - "mean_token_accuracy": 0.7524725720286369, + "loss": 0.9703, + "mean_token_accuracy": 0.7480718046426773, "step": 65 }, { "epoch": 0.3870967741935484, - "grad_norm": 2.8351133608014525, + "grad_norm": 2.8533613171820695, "learning_rate": 3.999265056580719e-05, - "loss": 0.7133, - "mean_token_accuracy": 0.8027519956231117, + "loss": 0.7026, + "mean_token_accuracy": 0.8061640858650208, "step": 66 }, { "epoch": 0.39296187683284456, - "grad_norm": 3.3754028291704237, + "grad_norm": 3.962990986339149, "learning_rate": 3.999163805550313e-05, - "loss": 1.0152, - "mean_token_accuracy": 0.763345830142498, + "loss": 1.0364, + "mean_token_accuracy": 0.7610009163618088, "step": 67 }, { "epoch": 0.39882697947214074, - "grad_norm": 3.0381916656684504, + "grad_norm": 4.444865136952274, "learning_rate": 3.9990560242819274e-05, - "loss": 0.8966, - "mean_token_accuracy": 0.7718857899308205, + "loss": 0.9392, + "mean_token_accuracy": 0.7688358500599861, "step": 68 }, { "epoch": 0.4046920821114369, - "grad_norm": 2.5365722428064412, + "grad_norm": 3.3184784161882837, "learning_rate": 3.9989417131667647e-05, - "loss": 0.7638, - "mean_token_accuracy": 0.8163110539317131, + "loss": 0.78, + "mean_token_accuracy": 0.8103457912802696, "step": 69 }, { "epoch": 0.41055718475073316, - "grad_norm": 2.7915162380323855, + "grad_norm": 2.7543975612278193, "learning_rate": 3.9988208726197293e-05, - "loss": 0.7726, - "mean_token_accuracy": 0.7922361120581627, + "loss": 0.7794, + "mean_token_accuracy": 0.7881907448172569, "step": 70 }, { "epoch": 0.41642228739002934, - "grad_norm": 2.978173227270339, + "grad_norm": 3.511969410814499, "learning_rate": 3.998693503079423e-05, - "loss": 0.927, - "mean_token_accuracy": 0.7751563489437103, + "loss": 0.9841, + "mean_token_accuracy": 0.7602343857288361, "step": 71 }, { "epoch": 0.4222873900293255, - "grad_norm": 2.8448079415335235, + "grad_norm": 3.189897347653094, "learning_rate": 3.998559605008146e-05, - "loss": 0.801, - "mean_token_accuracy": 0.779315672814846, + "loss": 0.8277, + "mean_token_accuracy": 0.7756155356764793, "step": 72 }, { "epoch": 0.4281524926686217, - "grad_norm": 2.8387570831780815, + "grad_norm": 2.9550650318433926, "learning_rate": 3.9984191788918936e-05, - "loss": 0.8389, - "mean_token_accuracy": 0.7829952985048294, + "loss": 0.8583, + "mean_token_accuracy": 0.7815645411610603, "step": 73 }, { "epoch": 0.4340175953079179, - "grad_norm": 2.5878507908913915, + "grad_norm": 3.119757664916935, "learning_rate": 3.998272225240356e-05, - "loss": 0.9174, - "mean_token_accuracy": 0.7716861665248871, + "loss": 0.9466, + "mean_token_accuracy": 0.7682439014315605, "step": 74 }, { "epoch": 0.4398826979472141, - "grad_norm": 2.6510951948539994, + "grad_norm": 3.031490133308079, "learning_rate": 3.9981187445869165e-05, - "loss": 0.7813, - "mean_token_accuracy": 0.8107101172208786, + "loss": 0.7991, + "mean_token_accuracy": 0.8088371828198433, "step": 75 }, { "epoch": 0.44574780058651026, - "grad_norm": 2.697548713971357, + "grad_norm": 2.9007072042172592, "learning_rate": 3.9979587374886466e-05, - "loss": 0.8972, - "mean_token_accuracy": 0.7646167501807213, + "loss": 0.9103, + "mean_token_accuracy": 0.7687933221459389, "step": 76 }, { "epoch": 0.45161290322580644, - "grad_norm": 2.8139069796203504, + "grad_norm": 38.67934100607019, "learning_rate": 3.997792204526309e-05, - "loss": 0.8033, - "mean_token_accuracy": 0.7897998914122581, + "loss": 0.9832, + "mean_token_accuracy": 0.7548127546906471, "step": 77 }, { "epoch": 0.4574780058651026, - "grad_norm": 2.3436223362656476, + "grad_norm": 3.3035061898149007, "learning_rate": 3.99761914630435e-05, - "loss": 0.7414, - "mean_token_accuracy": 0.7999845147132874, + "loss": 0.7777, + "mean_token_accuracy": 0.7979958653450012, "step": 78 }, { "epoch": 0.4633431085043988, - "grad_norm": 2.594588586928223, + "grad_norm": 22.781762493620928, "learning_rate": 3.997439563450901e-05, - "loss": 0.7127, - "mean_token_accuracy": 0.8027607202529907, + "loss": 0.8503, + "mean_token_accuracy": 0.7586963549256325, "step": 79 }, { "epoch": 0.46920821114369504, - "grad_norm": 2.8704349406315517, + "grad_norm": 2.7622593230896664, "learning_rate": 3.997253456617775e-05, - "loss": 0.7686, - "mean_token_accuracy": 0.8007281050086021, + "loss": 0.79, + "mean_token_accuracy": 0.7944292947649956, "step": 80 }, { "epoch": 0.4750733137829912, - "grad_norm": 2.5510137739446686, + "grad_norm": 2.5762465699762074, "learning_rate": 3.997060826480465e-05, - "loss": 0.6935, - "mean_token_accuracy": 0.8144687339663506, + "loss": 0.6844, + "mean_token_accuracy": 0.8140356242656708, "step": 81 }, { "epoch": 0.4809384164222874, - "grad_norm": 2.360869808267721, + "grad_norm": 2.8204659759274575, "learning_rate": 3.9968616737381414e-05, - "loss": 0.7855, - "mean_token_accuracy": 0.8035428002476692, + "loss": 0.8264, + "mean_token_accuracy": 0.7992580607533455, "step": 82 }, { "epoch": 0.4868035190615836, - "grad_norm": 2.2447686355133514, + "grad_norm": 2.841335055037953, "learning_rate": 3.996655999113647e-05, - "loss": 0.6925, - "mean_token_accuracy": 0.8235335797071457, + "loss": 0.6823, + "mean_token_accuracy": 0.82578344643116, "step": 83 }, { "epoch": 0.49266862170087977, - "grad_norm": 2.2547861026647382, + "grad_norm": 2.8147807305657264, "learning_rate": 3.9964438033534994e-05, - "loss": 0.5884, - "mean_token_accuracy": 0.8370934575796127, + "loss": 0.6119, + "mean_token_accuracy": 0.8287858068943024, "step": 84 }, { "epoch": 0.49853372434017595, - "grad_norm": 2.309713135086869, + "grad_norm": 2.376089124661705, "learning_rate": 3.996225087227881e-05, - "loss": 0.736, - "mean_token_accuracy": 0.8138050213456154, + "loss": 0.7493, + "mean_token_accuracy": 0.8083062320947647, "step": 85 }, { "epoch": 0.5043988269794721, - "grad_norm": 2.344634268841188, + "grad_norm": 2.544616939836254, "learning_rate": 3.995999851530645e-05, - "loss": 0.6836, - "mean_token_accuracy": 0.8341879695653915, + "loss": 0.7059, + "mean_token_accuracy": 0.8272803351283073, "step": 86 }, { "epoch": 0.5102639296187683, - "grad_norm": 2.5276817825702507, + "grad_norm": 2.3739375156900784, "learning_rate": 3.995768097079305e-05, - "loss": 0.7201, - "mean_token_accuracy": 0.8180172145366669, + "loss": 0.7219, + "mean_token_accuracy": 0.8119369223713875, "step": 87 }, { "epoch": 0.5161290322580645, - "grad_norm": 2.930450810837515, + "grad_norm": 2.724016943326451, "learning_rate": 3.9955298247150365e-05, - "loss": 0.8532, - "mean_token_accuracy": 0.7764740958809853, + "loss": 0.9042, + "mean_token_accuracy": 0.7662067860364914, "step": 88 }, { "epoch": 0.5219941348973607, - "grad_norm": 2.635872197106337, + "grad_norm": 2.4239661286688463, "learning_rate": 3.9952850353026715e-05, - "loss": 0.7202, - "mean_token_accuracy": 0.7964513823390007, + "loss": 0.7767, + "mean_token_accuracy": 0.7790317609906197, "step": 89 }, { "epoch": 0.5278592375366569, - "grad_norm": 2.6560892710983475, + "grad_norm": 2.665098789911307, "learning_rate": 3.9950337297306976e-05, - "loss": 0.7718, - "mean_token_accuracy": 0.8056656494736671, + "loss": 0.7754, + "mean_token_accuracy": 0.8018990531563759, "step": 90 }, { "epoch": 0.533724340175953, - "grad_norm": 2.9785297087153233, + "grad_norm": 2.8426441072209805, "learning_rate": 3.994775908911251e-05, - "loss": 0.8443, - "mean_token_accuracy": 0.7823826372623444, + "loss": 0.858, + "mean_token_accuracy": 0.7857545912265778, "step": 91 }, { "epoch": 0.5395894428152492, - "grad_norm": 2.6848005659586094, + "grad_norm": 2.436499001766652, "learning_rate": 3.9945115737801183e-05, - "loss": 0.6993, - "mean_token_accuracy": 0.8107479214668274, + "loss": 0.688, + "mean_token_accuracy": 0.8172339051961899, "step": 92 }, { "epoch": 0.5454545454545454, - "grad_norm": 2.6298393012226247, + "grad_norm": 2.5390866336120874, "learning_rate": 3.99424072529673e-05, - "loss": 0.8409, - "mean_token_accuracy": 0.7889999225735664, + "loss": 0.8688, + "mean_token_accuracy": 0.7890106663107872, "step": 93 }, { "epoch": 0.5513196480938416, - "grad_norm": 2.579937157375281, + "grad_norm": 2.6076783165281094, "learning_rate": 3.993963364444155e-05, - "loss": 0.7234, - "mean_token_accuracy": 0.8055694922804832, + "loss": 0.7134, + "mean_token_accuracy": 0.8137485533952713, "step": 94 }, { "epoch": 0.5571847507331378, - "grad_norm": 2.955654530084866, + "grad_norm": 2.4257871259207993, "learning_rate": 3.9936794922291015e-05, - "loss": 0.8405, - "mean_token_accuracy": 0.7774112895131111, + "loss": 0.8723, + "mean_token_accuracy": 0.7773057669401169, "step": 95 }, { "epoch": 0.5630498533724341, - "grad_norm": 3.0436689004178534, + "grad_norm": 2.8574887984110875, "learning_rate": 3.993389109681912e-05, - "loss": 0.7653, - "mean_token_accuracy": 0.79569511115551, + "loss": 0.788, + "mean_token_accuracy": 0.7993652150034904, "step": 96 }, { "epoch": 0.5689149560117303, - "grad_norm": 2.817659396803083, + "grad_norm": 2.616691569677792, "learning_rate": 3.993092217856557e-05, - "loss": 0.7067, - "mean_token_accuracy": 0.8145394548773766, + "loss": 0.6777, + "mean_token_accuracy": 0.8250965252518654, "step": 97 }, { "epoch": 0.5747800586510264, - "grad_norm": 2.7515378637269468, + "grad_norm": 2.6843846148715405, "learning_rate": 3.9927888178306346e-05, - "loss": 0.8013, - "mean_token_accuracy": 0.791895680129528, + "loss": 0.8095, + "mean_token_accuracy": 0.803053617477417, "step": 98 }, { "epoch": 0.5806451612903226, - "grad_norm": 2.907537751643136, + "grad_norm": 2.5522882845707833, "learning_rate": 3.992478910705364e-05, - "loss": 0.8031, - "mean_token_accuracy": 0.7878992408514023, + "loss": 0.7908, + "mean_token_accuracy": 0.7967691794037819, "step": 99 }, { "epoch": 0.5865102639296188, - "grad_norm": 2.979279035766898, + "grad_norm": 2.5520682672510016, "learning_rate": 3.992162497605583e-05, - "loss": 0.6806, - "mean_token_accuracy": 0.8274494782090187, + "loss": 0.6946, + "mean_token_accuracy": 0.8279445469379425, "step": 100 }, { "epoch": 0.592375366568915, - "grad_norm": 2.4353950711528043, + "grad_norm": 2.578013850702768, "learning_rate": 3.991839579679742e-05, - "loss": 0.7353, - "mean_token_accuracy": 0.806957870721817, + "loss": 0.7428, + "mean_token_accuracy": 0.8000684455037117, "step": 101 }, { "epoch": 0.5982404692082112, - "grad_norm": 2.4572327476853957, + "grad_norm": 2.3830631117684575, "learning_rate": 3.991510158099905e-05, - "loss": 0.5704, - "mean_token_accuracy": 0.846699096262455, + "loss": 0.5722, + "mean_token_accuracy": 0.8460334986448288, "step": 102 }, { "epoch": 0.6041055718475073, - "grad_norm": 2.3286640449972222, + "grad_norm": 2.414928415499407, "learning_rate": 3.991174234061738e-05, - "loss": 0.6006, - "mean_token_accuracy": 0.8506223112344742, + "loss": 0.6108, + "mean_token_accuracy": 0.8433133065700531, "step": 103 }, { "epoch": 0.6099706744868035, - "grad_norm": 2.631876593176074, + "grad_norm": 2.509706778446123, "learning_rate": 3.9908318087845104e-05, - "loss": 0.7379, - "mean_token_accuracy": 0.8100381121039391, + "loss": 0.7385, + "mean_token_accuracy": 0.8071222379803658, "step": 104 }, { "epoch": 0.6158357771260997, - "grad_norm": 2.2789202678122584, + "grad_norm": 2.6079854919350964, "learning_rate": 3.990482883511086e-05, - "loss": 0.5341, - "mean_token_accuracy": 0.8537792935967445, + "loss": 0.5505, + "mean_token_accuracy": 0.852523498237133, "step": 105 }, { "epoch": 0.6217008797653959, - "grad_norm": 2.120418551508169, + "grad_norm": 2.4027206543454045, "learning_rate": 3.990127459507924e-05, - "loss": 0.6053, - "mean_token_accuracy": 0.8266285732388496, + "loss": 0.6085, + "mean_token_accuracy": 0.8315910547971725, "step": 106 }, { "epoch": 0.6275659824046921, - "grad_norm": 2.364686651816796, + "grad_norm": 2.4987169867308654, "learning_rate": 3.98976553806507e-05, - "loss": 0.5868, - "mean_token_accuracy": 0.840075246989727, + "loss": 0.5936, + "mean_token_accuracy": 0.8380607068538666, "step": 107 }, { "epoch": 0.6334310850439883, - "grad_norm": 2.451948661442587, + "grad_norm": 2.4015675582275002, "learning_rate": 3.989397120496152e-05, - "loss": 0.5193, - "mean_token_accuracy": 0.8647155538201332, + "loss": 0.4922, + "mean_token_accuracy": 0.87723558396101, "step": 108 }, { "epoch": 0.6392961876832844, - "grad_norm": 2.355709310543082, + "grad_norm": 2.3732266526127455, "learning_rate": 3.989022208138377e-05, - "loss": 0.5675, - "mean_token_accuracy": 0.8488794639706612, + "loss": 0.56, + "mean_token_accuracy": 0.8504580110311508, "step": 109 }, { "epoch": 0.6451612903225806, - "grad_norm": 3.02749013010203, + "grad_norm": 2.8710876340305647, "learning_rate": 3.9886408023525256e-05, - "loss": 0.7619, - "mean_token_accuracy": 0.8114860579371452, + "loss": 0.7691, + "mean_token_accuracy": 0.8154280558228493, "step": 110 }, { "epoch": 0.6510263929618768, - "grad_norm": 2.8000919382354432, + "grad_norm": 2.8221688147494093, "learning_rate": 3.9882529045229475e-05, - "loss": 0.7982, - "mean_token_accuracy": 0.7895509079098701, + "loss": 0.8413, + "mean_token_accuracy": 0.7811410203576088, "step": 111 }, { "epoch": 0.656891495601173, - "grad_norm": 3.219052282134806, + "grad_norm": 2.5505094230738092, "learning_rate": 3.987858516057554e-05, - "loss": 0.5985, - "mean_token_accuracy": 0.8427421972155571, + "loss": 0.5915, + "mean_token_accuracy": 0.844106875360012, "step": 112 }, { "epoch": 0.6627565982404692, - "grad_norm": 2.433256738390738, + "grad_norm": 2.7933909899663676, "learning_rate": 3.9874576383878165e-05, - "loss": 0.6474, - "mean_token_accuracy": 0.8321207016706467, + "loss": 0.6512, + "mean_token_accuracy": 0.8320295438170433, "step": 113 }, { "epoch": 0.6686217008797654, - "grad_norm": 2.3493562113379274, + "grad_norm": 2.3083034683512342, "learning_rate": 3.9870502729687594e-05, - "loss": 0.6348, - "mean_token_accuracy": 0.8373970687389374, + "loss": 0.6401, + "mean_token_accuracy": 0.8332142606377602, "step": 114 }, { "epoch": 0.6744868035190615, - "grad_norm": 2.331602788927495, + "grad_norm": 2.621814147997228, "learning_rate": 3.986636421278954e-05, - "loss": 0.6854, - "mean_token_accuracy": 0.8225691393017769, + "loss": 0.6927, + "mean_token_accuracy": 0.8254417553544044, "step": 115 }, { "epoch": 0.6803519061583577, - "grad_norm": 2.142107119555709, + "grad_norm": 2.2582220425128785, "learning_rate": 3.986216084820515e-05, - "loss": 0.5011, - "mean_token_accuracy": 0.8588138148188591, + "loss": 0.5051, + "mean_token_accuracy": 0.8626674264669418, "step": 116 }, { "epoch": 0.6862170087976539, - "grad_norm": 2.419032052988893, + "grad_norm": 2.521627222816184, "learning_rate": 3.985789265119095e-05, - "loss": 0.5726, - "mean_token_accuracy": 0.8352588415145874, + "loss": 0.5789, + "mean_token_accuracy": 0.8454054147005081, "step": 117 }, { "epoch": 0.6920821114369502, - "grad_norm": 2.09085744769477, + "grad_norm": 2.1375840083749145, "learning_rate": 3.985355963723875e-05, - "loss": 0.4849, - "mean_token_accuracy": 0.8706546425819397, + "loss": 0.483, + "mean_token_accuracy": 0.8748066648840904, "step": 118 }, { "epoch": 0.6979472140762464, - "grad_norm": 2.1422570833131664, + "grad_norm": 3.7422524710601444, "learning_rate": 3.9849161822075655e-05, - "loss": 0.5376, - "mean_token_accuracy": 0.8550170734524727, + "loss": 0.5609, + "mean_token_accuracy": 0.8508216217160225, "step": 119 }, { "epoch": 0.7038123167155426, - "grad_norm": 2.388752967701632, + "grad_norm": 2.350290493183303, "learning_rate": 3.984469922166396e-05, - "loss": 0.6278, - "mean_token_accuracy": 0.842079646885395, + "loss": 0.6774, + "mean_token_accuracy": 0.8322641178965569, "step": 120 }, { "epoch": 0.7096774193548387, - "grad_norm": 2.518159286077506, + "grad_norm": 3.5137372539507252, "learning_rate": 3.984017185220109e-05, - "loss": 0.7772, - "mean_token_accuracy": 0.8071364387869835, + "loss": 0.8092, + "mean_token_accuracy": 0.8016931265592575, "step": 121 }, { "epoch": 0.7155425219941349, - "grad_norm": 2.2531929182116746, + "grad_norm": 2.5192379831163567, "learning_rate": 3.9835579730119576e-05, - "loss": 0.6705, - "mean_token_accuracy": 0.8298157975077629, + "loss": 0.7129, + "mean_token_accuracy": 0.818251371383667, "step": 122 }, { "epoch": 0.7214076246334311, - "grad_norm": 2.207859071920299, + "grad_norm": 2.193025994665411, "learning_rate": 3.9830922872086974e-05, - "loss": 0.6655, - "mean_token_accuracy": 0.8434372246265411, + "loss": 0.6943, + "mean_token_accuracy": 0.8356373012065887, "step": 123 }, { "epoch": 0.7272727272727273, - "grad_norm": 2.2972010787323818, + "grad_norm": 2.685832581829903, "learning_rate": 3.9826201295005784e-05, - "loss": 0.7564, - "mean_token_accuracy": 0.8130914643406868, + "loss": 0.7694, + "mean_token_accuracy": 0.809048131108284, "step": 124 }, { "epoch": 0.7331378299120235, - "grad_norm": 2.6764034052539807, + "grad_norm": 2.5160714877492736, "learning_rate": 3.982141501601343e-05, - "loss": 0.7236, - "mean_token_accuracy": 0.8133985474705696, + "loss": 0.7365, + "mean_token_accuracy": 0.8147779926657677, "step": 125 }, { "epoch": 0.7390029325513197, - "grad_norm": 2.2683967522097817, + "grad_norm": 2.237880574653323, "learning_rate": 3.9816564052482164e-05, - "loss": 0.646, - "mean_token_accuracy": 0.825585164129734, + "loss": 0.6554, + "mean_token_accuracy": 0.8241963237524033, "step": 126 }, { "epoch": 0.7448680351906158, - "grad_norm": 2.2683787938151263, + "grad_norm": 2.174659296975228, "learning_rate": 3.981164842201904e-05, - "loss": 0.6917, - "mean_token_accuracy": 0.8327226713299751, + "loss": 0.709, + "mean_token_accuracy": 0.826836347579956, "step": 127 }, { "epoch": 0.750733137829912, - "grad_norm": 2.3009219430061982, + "grad_norm": 2.4261092807260836, "learning_rate": 3.9806668142465804e-05, - "loss": 0.7333, - "mean_token_accuracy": 0.8195018395781517, + "loss": 0.7414, + "mean_token_accuracy": 0.8207696005702019, "step": 128 }, { "epoch": 0.7565982404692082, - "grad_norm": 2.045832816656146, + "grad_norm": 2.721891836616963, "learning_rate": 3.9801623231898856e-05, - "loss": 0.5323, - "mean_token_accuracy": 0.8589048609137535, + "loss": 0.5605, + "mean_token_accuracy": 0.8457466587424278, "step": 129 }, { "epoch": 0.7624633431085044, - "grad_norm": 2.0363179289257833, + "grad_norm": 2.3378897432690726, "learning_rate": 3.9796513708629186e-05, - "loss": 0.5484, - "mean_token_accuracy": 0.8502952381968498, + "loss": 0.5664, + "mean_token_accuracy": 0.8508183881640434, "step": 130 }, { "epoch": 0.7683284457478006, - "grad_norm": 2.1787352594889122, + "grad_norm": 2.3272679541979198, "learning_rate": 3.979133959120229e-05, - "loss": 0.535, - "mean_token_accuracy": 0.857828326523304, + "loss": 0.5489, + "mean_token_accuracy": 0.8570074439048767, "step": 131 }, { "epoch": 0.7741935483870968, - "grad_norm": 2.077917146910636, + "grad_norm": 2.315567352712671, "learning_rate": 3.9786100898398145e-05, - "loss": 0.5617, - "mean_token_accuracy": 0.8493303209543228, + "loss": 0.576, + "mean_token_accuracy": 0.8477486073970795, "step": 132 }, { "epoch": 0.7800586510263929, - "grad_norm": 2.097361535539421, + "grad_norm": 2.5536758744972543, "learning_rate": 3.9780797649231085e-05, - "loss": 0.5901, - "mean_token_accuracy": 0.8467446342110634, + "loss": 0.6087, + "mean_token_accuracy": 0.8472266495227814, "step": 133 }, { "epoch": 0.7859237536656891, - "grad_norm": 2.3198793806949154, + "grad_norm": 2.518398139484913, "learning_rate": 3.9775429862949745e-05, - "loss": 0.6641, - "mean_token_accuracy": 0.8338883817195892, + "loss": 0.7258, + "mean_token_accuracy": 0.8246461376547813, "step": 134 }, { "epoch": 0.7917888563049853, - "grad_norm": 2.260193065434171, + "grad_norm": 2.532155725945346, "learning_rate": 3.976999755903704e-05, - "loss": 0.6673, - "mean_token_accuracy": 0.827575221657753, + "loss": 0.7288, + "mean_token_accuracy": 0.8153877630829811, "step": 135 }, { "epoch": 0.7976539589442815, - "grad_norm": 2.1386712402376036, + "grad_norm": 2.3151492007502483, "learning_rate": 3.976450075721003e-05, - "loss": 0.5606, - "mean_token_accuracy": 0.8533760160207748, + "loss": 0.5781, + "mean_token_accuracy": 0.855253241956234, "step": 136 }, { "epoch": 0.8035190615835777, - "grad_norm": 2.145352123969171, + "grad_norm": 2.844190481561306, "learning_rate": 3.975893947741989e-05, - "loss": 0.5056, - "mean_token_accuracy": 0.8680669069290161, + "loss": 0.5248, + "mean_token_accuracy": 0.8630795776844025, "step": 137 }, { "epoch": 0.8093841642228738, - "grad_norm": 2.1768056211172295, + "grad_norm": 2.315966717331691, "learning_rate": 3.9753313739851824e-05, - "loss": 0.6832, - "mean_token_accuracy": 0.8207377269864082, + "loss": 0.6877, + "mean_token_accuracy": 0.8183143883943558, "step": 138 }, { "epoch": 0.8152492668621701, - "grad_norm": 2.3681293407151314, + "grad_norm": 2.32614866768996, "learning_rate": 3.974762356492498e-05, - "loss": 0.7659, - "mean_token_accuracy": 0.8131817951798439, + "loss": 0.8115, + "mean_token_accuracy": 0.8048359602689743, "step": 139 }, { "epoch": 0.8211143695014663, - "grad_norm": 2.379227773317173, + "grad_norm": 2.7089994032119775, "learning_rate": 3.974186897329239e-05, - "loss": 0.5398, - "mean_token_accuracy": 0.8676523044705391, + "loss": 0.5536, + "mean_token_accuracy": 0.8649086877703667, "step": 140 }, { "epoch": 0.8269794721407625, - "grad_norm": 2.1423522624381, + "grad_norm": 2.4396347849087587, "learning_rate": 3.97360499858409e-05, - "loss": 0.5587, - "mean_token_accuracy": 0.8550106212496758, + "loss": 0.5594, + "mean_token_accuracy": 0.8538892492651939, "step": 141 }, { "epoch": 0.8328445747800587, - "grad_norm": 2.3739550490799304, + "grad_norm": 2.696807756771278, "learning_rate": 3.9730166623691096e-05, - "loss": 0.7078, - "mean_token_accuracy": 0.8150490075349808, + "loss": 0.7182, + "mean_token_accuracy": 0.8146277740597725, "step": 142 }, { "epoch": 0.8387096774193549, - "grad_norm": 2.280723224876568, + "grad_norm": 2.307485749164851, "learning_rate": 3.9724218908197194e-05, - "loss": 0.5059, - "mean_token_accuracy": 0.853252723813057, + "loss": 0.5172, + "mean_token_accuracy": 0.8577979356050491, "step": 143 }, { "epoch": 0.844574780058651, - "grad_norm": 2.7177341329057483, + "grad_norm": 2.9013073880532487, "learning_rate": 3.971820686094701e-05, - "loss": 0.7745, - "mean_token_accuracy": 0.8030019998550415, + "loss": 0.8143, + "mean_token_accuracy": 0.8073533028364182, "step": 144 }, { "epoch": 0.8504398826979472, - "grad_norm": 2.2866660183054055, + "grad_norm": 2.545940900051437, "learning_rate": 3.971213050376183e-05, - "loss": 0.6888, - "mean_token_accuracy": 0.8160409331321716, + "loss": 0.6929, + "mean_token_accuracy": 0.8187995925545692, "step": 145 }, { "epoch": 0.8563049853372434, - "grad_norm": 1.7729411173268295, + "grad_norm": 2.07164598138888, "learning_rate": 3.9705989858696387e-05, - "loss": 0.5162, - "mean_token_accuracy": 0.8643370196223259, + "loss": 0.5164, + "mean_token_accuracy": 0.8661198765039444, "step": 146 }, { "epoch": 0.8621700879765396, - "grad_norm": 1.9544676092787794, + "grad_norm": 1.9877579647660435, "learning_rate": 3.969978494803876e-05, - "loss": 0.5033, - "mean_token_accuracy": 0.8608671575784683, + "loss": 0.504, + "mean_token_accuracy": 0.8651574477553368, "step": 147 }, { "epoch": 0.8680351906158358, - "grad_norm": 2.0416284223126797, + "grad_norm": 2.164735875266452, "learning_rate": 3.969351579431024e-05, - "loss": 0.5282, - "mean_token_accuracy": 0.8565196245908737, + "loss": 0.5236, + "mean_token_accuracy": 0.8638859316706657, "step": 148 }, { "epoch": 0.873900293255132, - "grad_norm": 1.9202190012827038, + "grad_norm": 2.387480098965745, "learning_rate": 3.968718242026533e-05, - "loss": 0.4962, - "mean_token_accuracy": 0.8656453415751457, + "loss": 0.5327, + "mean_token_accuracy": 0.8672320023179054, "step": 149 }, { "epoch": 0.8797653958944281, - "grad_norm": 1.7711919486309162, + "grad_norm": 1.7906670972986842, "learning_rate": 3.968078484889163e-05, - "loss": 0.3981, - "mean_token_accuracy": 0.8840658068656921, + "loss": 0.398, + "mean_token_accuracy": 0.8894071653485298, "step": 150 }, { "epoch": 0.8856304985337243, - "grad_norm": 2.103353063920332, + "grad_norm": 2.720776873508094, "learning_rate": 3.9674323103409736e-05, - "loss": 0.5512, - "mean_token_accuracy": 0.8531129956245422, + "loss": 0.6053, + "mean_token_accuracy": 0.8414165899157524, "step": 151 }, { "epoch": 0.8914956011730205, - "grad_norm": 2.441157781778918, + "grad_norm": 2.340371554995659, "learning_rate": 3.966779720727317e-05, - "loss": 0.6903, - "mean_token_accuracy": 0.8275642022490501, + "loss": 0.6905, + "mean_token_accuracy": 0.8298128694295883, "step": 152 }, { "epoch": 0.8973607038123167, - "grad_norm": 2.1644819233748693, + "grad_norm": 2.222842499416126, "learning_rate": 3.9661207184168305e-05, - "loss": 0.535, - "mean_token_accuracy": 0.8525630459189415, + "loss": 0.538, + "mean_token_accuracy": 0.852678582072258, "step": 153 }, { "epoch": 0.9032258064516129, - "grad_norm": 2.2864455907051577, + "grad_norm": 2.307131700118274, "learning_rate": 3.9654553058014265e-05, - "loss": 0.6607, - "mean_token_accuracy": 0.8330980539321899, + "loss": 0.6771, + "mean_token_accuracy": 0.8301350399851799, "step": 154 }, { "epoch": 0.9090909090909091, - "grad_norm": 2.0226335451161654, + "grad_norm": 2.383888745225798, "learning_rate": 3.9647834852962825e-05, - "loss": 0.542, - "mean_token_accuracy": 0.8625759854912758, + "loss": 0.5538, + "mean_token_accuracy": 0.8536617681384087, "step": 155 }, { "epoch": 0.9149560117302052, - "grad_norm": 2.420244762324832, + "grad_norm": 5.137805263388332, "learning_rate": 3.964105259339838e-05, - "loss": 0.7424, - "mean_token_accuracy": 0.8057239204645157, + "loss": 0.7932, + "mean_token_accuracy": 0.7996788993477821, "step": 156 }, { "epoch": 0.9208211143695014, - "grad_norm": 1.8979120793832391, + "grad_norm": 2.161972657457864, "learning_rate": 3.9634206303937773e-05, - "loss": 0.4512, - "mean_token_accuracy": 0.875034749507904, + "loss": 0.4423, + "mean_token_accuracy": 0.8784089833498001, "step": 157 }, { "epoch": 0.9266862170087976, - "grad_norm": 1.7284509709305393, + "grad_norm": 1.7747982918472702, "learning_rate": 3.962729600943028e-05, - "loss": 0.448, - "mean_token_accuracy": 0.8827922642230988, + "loss": 0.454, + "mean_token_accuracy": 0.8777899146080017, "step": 158 }, { "epoch": 0.9325513196480938, - "grad_norm": 2.3569337927624945, + "grad_norm": 1.994086972253065, "learning_rate": 3.962032173495748e-05, - "loss": 0.4919, - "mean_token_accuracy": 0.8744383007287979, + "loss": 0.4916, + "mean_token_accuracy": 0.8769926801323891, "step": 159 }, { "epoch": 0.9384164222873901, - "grad_norm": 1.9017225299582532, + "grad_norm": 2.0290404806475317, "learning_rate": 3.961328350583316e-05, - "loss": 0.4726, - "mean_token_accuracy": 0.871865801513195, + "loss": 0.4613, + "mean_token_accuracy": 0.8767556101083755, "step": 160 }, { "epoch": 0.9442815249266863, - "grad_norm": 2.1065821737511614, + "grad_norm": 2.2184277122166987, "learning_rate": 3.960618134760327e-05, - "loss": 0.5991, - "mean_token_accuracy": 0.8471841290593147, + "loss": 0.6018, + "mean_token_accuracy": 0.8447824791073799, "step": 161 }, { "epoch": 0.9501466275659824, - "grad_norm": 1.7468566820440716, + "grad_norm": 1.8714319974846567, "learning_rate": 3.959901528604575e-05, - "loss": 0.3703, - "mean_token_accuracy": 0.8899563401937485, + "loss": 0.3716, + "mean_token_accuracy": 0.8887424990534782, "step": 162 }, { "epoch": 0.9560117302052786, - "grad_norm": 2.0026819483535525, + "grad_norm": 2.1972929020277974, "learning_rate": 3.959178534717053e-05, - "loss": 0.6124, - "mean_token_accuracy": 0.844733901321888, + "loss": 0.6059, + "mean_token_accuracy": 0.8424908146262169, "step": 163 }, { "epoch": 0.9618768328445748, - "grad_norm": 1.8429810966574525, + "grad_norm": 1.9980815359087727, "learning_rate": 3.9584491557219366e-05, - "loss": 0.6114, - "mean_token_accuracy": 0.8512536585330963, + "loss": 0.5901, + "mean_token_accuracy": 0.8520945236086845, "step": 164 }, { "epoch": 0.967741935483871, - "grad_norm": 1.9068504240620572, + "grad_norm": 2.072248270926409, "learning_rate": 3.957713394266576e-05, - "loss": 0.5225, - "mean_token_accuracy": 0.8591367825865746, + "loss": 0.5486, + "mean_token_accuracy": 0.8482609316706657, "step": 165 }, { "epoch": 0.9736070381231672, - "grad_norm": 2.03904846981795, + "grad_norm": 2.1173865352854455, "learning_rate": 3.956971253021489e-05, - "loss": 0.4543, - "mean_token_accuracy": 0.8774362131953239, + "loss": 0.4607, + "mean_token_accuracy": 0.8749275654554367, "step": 166 }, { "epoch": 0.9794721407624634, - "grad_norm": 1.9730559274041801, + "grad_norm": 1.9428629083216018, "learning_rate": 3.956222734680348e-05, - "loss": 0.5391, - "mean_token_accuracy": 0.8570215106010437, + "loss": 0.5424, + "mean_token_accuracy": 0.857122503221035, "step": 167 }, { "epoch": 0.9853372434017595, - "grad_norm": 1.9611601962690912, + "grad_norm": 2.1694805344323673, "learning_rate": 3.955467841959972e-05, - "loss": 0.5588, - "mean_token_accuracy": 0.8605179488658905, + "loss": 0.5818, + "mean_token_accuracy": 0.8583057522773743, "step": 168 }, { "epoch": 0.9912023460410557, - "grad_norm": 1.8542913981304092, + "grad_norm": 2.1011172758309984, "learning_rate": 3.954706577600318e-05, - "loss": 0.5001, - "mean_token_accuracy": 0.8608422949910164, + "loss": 0.5309, + "mean_token_accuracy": 0.8498675376176834, "step": 169 }, { "epoch": 0.9970674486803519, - "grad_norm": 2.0514447581437483, + "grad_norm": 2.135249455720304, "learning_rate": 3.953938944364467e-05, - "loss": 0.6395, - "mean_token_accuracy": 0.8432887569069862, + "loss": 0.6643, + "mean_token_accuracy": 0.8334744796156883, "step": 170 }, { "epoch": 1.0, - "grad_norm": 2.0514447581437483, + "grad_norm": 2.135249455720304, "learning_rate": 3.953164945038618e-05, - "loss": 0.56, - "mean_token_accuracy": 0.8685450553894043, + "loss": 0.5537, + "mean_token_accuracy": 0.8696076571941376, "step": 171 }, { "epoch": 1.0058651026392962, - "grad_norm": 2.798511260901564, + "grad_norm": 2.9426546093236556, "learning_rate": 3.952384582432076e-05, - "loss": 0.3523, - "mean_token_accuracy": 0.8961983993649483, + "loss": 0.3773, + "mean_token_accuracy": 0.8955628573894501, "step": 172 }, { "epoch": 1.0117302052785924, - "grad_norm": 1.6143327247401649, + "grad_norm": 1.9429159271699736, "learning_rate": 3.9515978593772426e-05, - "loss": 0.2906, - "mean_token_accuracy": 0.9175504371523857, + "loss": 0.3063, + "mean_token_accuracy": 0.9107492417097092, "step": 173 }, { "epoch": 1.0175953079178885, - "grad_norm": 1.5392085231630535, + "grad_norm": 2.0141991872288383, "learning_rate": 3.9508047787296034e-05, - "loss": 0.2327, - "mean_token_accuracy": 0.9279068484902382, + "loss": 0.2379, + "mean_token_accuracy": 0.9283656999468803, "step": 174 }, { "epoch": 1.0234604105571847, - "grad_norm": 1.4196610216082999, + "grad_norm": 1.626380022632123, "learning_rate": 3.9500053433677226e-05, - "loss": 0.238, - "mean_token_accuracy": 0.9234108552336693, + "loss": 0.2344, + "mean_token_accuracy": 0.9287891238927841, "step": 175 }, { "epoch": 1.029325513196481, - "grad_norm": 1.739223331966883, + "grad_norm": 2.373020615301983, "learning_rate": 3.949199556193226e-05, - "loss": 0.3358, - "mean_token_accuracy": 0.8978307694196701, + "loss": 0.3377, + "mean_token_accuracy": 0.9061623066663742, "step": 176 }, { "epoch": 1.035190615835777, - "grad_norm": 1.6733699048918613, + "grad_norm": 2.1356888697655325, "learning_rate": 3.948387420130796e-05, - "loss": 0.2404, - "mean_token_accuracy": 0.9289108365774155, + "loss": 0.2294, + "mean_token_accuracy": 0.9320363029837608, "step": 177 }, { "epoch": 1.0410557184750733, - "grad_norm": 1.6691732023766321, + "grad_norm": 1.795075757224302, "learning_rate": 3.94756893812816e-05, - "loss": 0.317, - "mean_token_accuracy": 0.9076567217707634, + "loss": 0.3389, + "mean_token_accuracy": 0.9035566374659538, "step": 178 }, { "epoch": 1.0469208211143695, - "grad_norm": 1.8180580779574043, + "grad_norm": 2.025624174532658, "learning_rate": 3.946744113156075e-05, - "loss": 0.2832, - "mean_token_accuracy": 0.9121890664100647, + "loss": 0.269, + "mean_token_accuracy": 0.9122130498290062, "step": 179 }, { "epoch": 1.0527859237536656, - "grad_norm": 2.060502078131102, + "grad_norm": 2.0849133530495467, "learning_rate": 3.945912948208324e-05, - "loss": 0.391, - "mean_token_accuracy": 0.893414668738842, + "loss": 0.4147, + "mean_token_accuracy": 0.8821691945195198, "step": 180 }, { "epoch": 1.0586510263929618, - "grad_norm": 1.8802598380610254, + "grad_norm": 2.3538480856162236, "learning_rate": 3.9450754463016994e-05, - "loss": 0.3546, - "mean_token_accuracy": 0.8969884589314461, + "loss": 0.3542, + "mean_token_accuracy": 0.9003543704748154, "step": 181 }, { "epoch": 1.064516129032258, - "grad_norm": 1.9657434071689015, + "grad_norm": 2.0146157111462637, "learning_rate": 3.9442316104759955e-05, - "loss": 0.3342, - "mean_token_accuracy": 0.906390093266964, + "loss": 0.3418, + "mean_token_accuracy": 0.9114131703972816, "step": 182 }, { "epoch": 1.0703812316715542, - "grad_norm": 1.488874908366716, + "grad_norm": 1.5651301109137348, "learning_rate": 3.943381443793994e-05, - "loss": 0.3531, - "mean_token_accuracy": 0.904318280518055, + "loss": 0.3668, + "mean_token_accuracy": 0.9008131772279739, "step": 183 }, { "epoch": 1.0762463343108504, - "grad_norm": 1.8571080458413325, + "grad_norm": 2.9788519961880753, "learning_rate": 3.9425249493414585e-05, - "loss": 0.4322, - "mean_token_accuracy": 0.8704885244369507, + "loss": 0.4622, + "mean_token_accuracy": 0.8579861670732498, "step": 184 }, { "epoch": 1.0821114369501466, - "grad_norm": 1.7341671914679213, + "grad_norm": 139.27043531147197, "learning_rate": 3.941662130227118e-05, - "loss": 0.4563, - "mean_token_accuracy": 0.868084505200386, + "loss": 0.4682, + "mean_token_accuracy": 0.8681460916996002, "step": 185 }, { "epoch": 1.0879765395894427, - "grad_norm": 1.9301135643943996, + "grad_norm": 2.5171759946289662, "learning_rate": 3.940792989582654e-05, - "loss": 0.3557, - "mean_token_accuracy": 0.8965374007821083, + "loss": 0.3631, + "mean_token_accuracy": 0.8955680355429649, "step": 186 }, { "epoch": 1.093841642228739, - "grad_norm": 1.9752759214040492, + "grad_norm": 2.142201626535042, "learning_rate": 3.939917530562701e-05, - "loss": 0.2913, - "mean_token_accuracy": 0.9181935787200928, + "loss": 0.2754, + "mean_token_accuracy": 0.9177270233631134, "step": 187 }, { "epoch": 1.099706744868035, - "grad_norm": 1.80853857916468, + "grad_norm": 1.9141489035930843, "learning_rate": 3.939035756344818e-05, - "loss": 0.3053, - "mean_token_accuracy": 0.9187766760587692, + "loss": 0.3297, + "mean_token_accuracy": 0.9090648591518402, "step": 188 }, { "epoch": 1.1055718475073313, - "grad_norm": 1.7923257882566426, + "grad_norm": 2.0379286037072895, "learning_rate": 3.93814767012949e-05, - "loss": 0.3427, - "mean_token_accuracy": 0.8995430916547775, + "loss": 0.3305, + "mean_token_accuracy": 0.9021382927894592, "step": 189 }, { "epoch": 1.1114369501466275, - "grad_norm": 1.5136850309097942, + "grad_norm": 1.7753703302965071, "learning_rate": 3.937253275140113e-05, - "loss": 0.253, - "mean_token_accuracy": 0.9268705397844315, + "loss": 0.2498, + "mean_token_accuracy": 0.9244972839951515, "step": 190 }, { "epoch": 1.1173020527859236, - "grad_norm": 1.6967629405583817, + "grad_norm": 2.180640664647315, "learning_rate": 3.936352574622978e-05, - "loss": 0.2441, - "mean_token_accuracy": 0.9279790148139, + "loss": 0.2766, + "mean_token_accuracy": 0.9234517589211464, "step": 191 }, { "epoch": 1.1231671554252198, - "grad_norm": 1.3723631113386732, + "grad_norm": 1.5381199668110206, "learning_rate": 3.9354455718472646e-05, - "loss": 0.3622, - "mean_token_accuracy": 0.9010487943887711, + "loss": 0.3604, + "mean_token_accuracy": 0.8974525555968285, "step": 192 }, { "epoch": 1.129032258064516, - "grad_norm": 2.033274073582061, + "grad_norm": 2.1119703774116343, "learning_rate": 3.934532270105026e-05, - "loss": 0.4103, - "mean_token_accuracy": 0.8935407474637032, + "loss": 0.3612, + "mean_token_accuracy": 0.9062921851873398, "step": 193 }, { "epoch": 1.1348973607038122, - "grad_norm": 2.155900058898323, + "grad_norm": 2.0957303962988205, "learning_rate": 3.933612672711179e-05, - "loss": 0.3699, - "mean_token_accuracy": 0.8951970860362053, + "loss": 0.3643, + "mean_token_accuracy": 0.8999098464846611, "step": 194 }, { "epoch": 1.1407624633431086, - "grad_norm": 1.6616746411097532, + "grad_norm": 1.9050538760986877, "learning_rate": 3.9326867830034915e-05, - "loss": 0.3785, - "mean_token_accuracy": 0.8921806812286377, + "loss": 0.3696, + "mean_token_accuracy": 0.8904466778039932, "step": 195 }, { "epoch": 1.1466275659824048, - "grad_norm": 1.702931865108539, + "grad_norm": 2.0245298774846123, "learning_rate": 3.931754604342568e-05, - "loss": 0.2909, - "mean_token_accuracy": 0.9116301015019417, + "loss": 0.294, + "mean_token_accuracy": 0.9096431136131287, "step": 196 }, { "epoch": 1.152492668621701, - "grad_norm": 1.7232592227816033, + "grad_norm": 1.748128150837437, "learning_rate": 3.930816140111842e-05, - "loss": 0.2667, - "mean_token_accuracy": 0.9185249134898186, + "loss": 0.2688, + "mean_token_accuracy": 0.9198426976799965, "step": 197 }, { "epoch": 1.1583577712609971, - "grad_norm": 1.840797018145673, + "grad_norm": 2.0707144710674763, "learning_rate": 3.929871393717558e-05, - "loss": 0.336, - "mean_token_accuracy": 0.9130111038684845, + "loss": 0.3557, + "mean_token_accuracy": 0.9080502316355705, "step": 198 }, { "epoch": 1.1642228739002933, - "grad_norm": 1.917246893078202, + "grad_norm": 2.672512494774031, "learning_rate": 3.9289203685887644e-05, - "loss": 0.3626, - "mean_token_accuracy": 0.8997843265533447, + "loss": 0.399, + "mean_token_accuracy": 0.8905297890305519, "step": 199 }, { "epoch": 1.1700879765395895, - "grad_norm": 1.734990074144959, + "grad_norm": 1.818463575818881, "learning_rate": 3.927963068177299e-05, - "loss": 0.3718, - "mean_token_accuracy": 0.8979229480028152, + "loss": 0.3702, + "mean_token_accuracy": 0.8925187587738037, "step": 200 }, { "epoch": 1.1759530791788857, - "grad_norm": 1.8303417823422754, + "grad_norm": 1.8940957765717514, "learning_rate": 3.926999495957775e-05, - "loss": 0.4617, - "mean_token_accuracy": 0.8707368224859238, + "loss": 0.4394, + "mean_token_accuracy": 0.8725065067410469, "step": 201 }, { "epoch": 1.1818181818181819, - "grad_norm": 1.820508589510773, + "grad_norm": 1.9020997930436756, "learning_rate": 3.9260296554275704e-05, - "loss": 0.4704, - "mean_token_accuracy": 0.8739859238266945, + "loss": 0.461, + "mean_token_accuracy": 0.880850076675415, "step": 202 }, { "epoch": 1.187683284457478, - "grad_norm": 1.9745656184822862, + "grad_norm": 1.795499779219037, "learning_rate": 3.925053550106815e-05, - "loss": 0.3245, - "mean_token_accuracy": 0.9014926105737686, + "loss": 0.3238, + "mean_token_accuracy": 0.9072358906269073, "step": 203 }, { "epoch": 1.1935483870967742, - "grad_norm": 1.6927826737387262, + "grad_norm": 1.7087252975022835, "learning_rate": 3.9240711835383766e-05, - "loss": 0.2987, - "mean_token_accuracy": 0.9058414027094841, + "loss": 0.2935, + "mean_token_accuracy": 0.9104657173156738, "step": 204 }, { "epoch": 1.1994134897360704, - "grad_norm": 1.5770657862401005, + "grad_norm": 1.6303327633815248, "learning_rate": 3.9230825592878494e-05, - "loss": 0.313, - "mean_token_accuracy": 0.9111779928207397, + "loss": 0.3019, + "mean_token_accuracy": 0.9101001620292664, "step": 205 }, { "epoch": 1.2052785923753666, - "grad_norm": 1.675854486117753, + "grad_norm": 1.7889524668856331, "learning_rate": 3.92208768094354e-05, - "loss": 0.2895, - "mean_token_accuracy": 0.9199853986501694, + "loss": 0.3232, + "mean_token_accuracy": 0.9161754846572876, "step": 206 }, { "epoch": 1.2111436950146628, - "grad_norm": 1.6015878865124002, + "grad_norm": 1.8625830016750662, "learning_rate": 3.921086552116455e-05, - "loss": 0.2811, - "mean_token_accuracy": 0.9138716906309128, + "loss": 0.2943, + "mean_token_accuracy": 0.9140777736902237, "step": 207 }, { "epoch": 1.217008797653959, - "grad_norm": 1.718206784339055, + "grad_norm": 2.08984197350988, "learning_rate": 3.920079176440288e-05, - "loss": 0.2725, - "mean_token_accuracy": 0.9215174838900566, + "loss": 0.2892, + "mean_token_accuracy": 0.9188559651374817, "step": 208 }, { "epoch": 1.2228739002932552, - "grad_norm": 1.7698754302048199, + "grad_norm": 1.911462834786022, "learning_rate": 3.9190655575714045e-05, - "loss": 0.4332, - "mean_token_accuracy": 0.8880220949649811, + "loss": 0.425, + "mean_token_accuracy": 0.8878646790981293, "step": 209 }, { "epoch": 1.2287390029325513, - "grad_norm": 1.9266880843929173, + "grad_norm": 2.1199749653836246, "learning_rate": 3.918045699188833e-05, - "loss": 0.3224, - "mean_token_accuracy": 0.9085577055811882, + "loss": 0.3309, + "mean_token_accuracy": 0.9033946841955185, "step": 210 }, { "epoch": 1.2346041055718475, - "grad_norm": 1.6249028390550062, + "grad_norm": 1.7772550268734622, "learning_rate": 3.9170196049942474e-05, - "loss": 0.2676, - "mean_token_accuracy": 0.9211436435580254, + "loss": 0.2935, + "mean_token_accuracy": 0.9201069548726082, "step": 211 }, { "epoch": 1.2404692082111437, - "grad_norm": 1.533372145565326, + "grad_norm": 1.4880613708640376, "learning_rate": 3.915987278711954e-05, - "loss": 0.2888, - "mean_token_accuracy": 0.9172268733382225, + "loss": 0.2814, + "mean_token_accuracy": 0.9159248098731041, "step": 212 }, { "epoch": 1.2463343108504399, - "grad_norm": 1.4691488921950937, + "grad_norm": 1.4341780634467447, "learning_rate": 3.914948724088883e-05, - "loss": 0.3936, - "mean_token_accuracy": 0.8931452110409737, + "loss": 0.3993, + "mean_token_accuracy": 0.8969456627964973, "step": 213 }, { "epoch": 1.252199413489736, - "grad_norm": 1.8993586750719262, + "grad_norm": 1.9915246600304684, "learning_rate": 3.913903944894565e-05, - "loss": 0.3314, - "mean_token_accuracy": 0.9030940681695938, + "loss": 0.3357, + "mean_token_accuracy": 0.9005481451749802, "step": 214 }, { "epoch": 1.2580645161290323, - "grad_norm": 1.61898052839045, + "grad_norm": 1.580420874953109, "learning_rate": 3.912852944921129e-05, - "loss": 0.2963, - "mean_token_accuracy": 0.9144224375486374, + "loss": 0.2969, + "mean_token_accuracy": 0.9120687544345856, "step": 215 }, { "epoch": 1.2639296187683284, - "grad_norm": 1.949574828408539, + "grad_norm": 1.750899975898394, "learning_rate": 3.911795727983279e-05, - "loss": 0.3361, - "mean_token_accuracy": 0.9106857255101204, + "loss": 0.3287, + "mean_token_accuracy": 0.907410055398941, "step": 216 }, { "epoch": 1.2697947214076246, - "grad_norm": 1.8205919307098928, + "grad_norm": 1.928943141809165, "learning_rate": 3.910732297918285e-05, - "loss": 0.3859, - "mean_token_accuracy": 0.9019497409462929, + "loss": 0.3847, + "mean_token_accuracy": 0.8974480405449867, "step": 217 }, { "epoch": 1.2756598240469208, - "grad_norm": 2.1271420763487634, + "grad_norm": 2.1252213170886898, "learning_rate": 3.90966265858597e-05, - "loss": 0.3987, - "mean_token_accuracy": 0.8942231386899948, + "loss": 0.3873, + "mean_token_accuracy": 0.9002740904688835, "step": 218 }, { "epoch": 1.281524926686217, - "grad_norm": 1.8506277761965328, + "grad_norm": 2.109343454012867, "learning_rate": 3.908586813868693e-05, - "loss": 0.3845, - "mean_token_accuracy": 0.8941864669322968, + "loss": 0.403, + "mean_token_accuracy": 0.8951247781515121, "step": 219 }, { "epoch": 1.2873900293255132, - "grad_norm": 1.8340544441465951, + "grad_norm": 2.061638595149616, "learning_rate": 3.9075047676713354e-05, - "loss": 0.3838, - "mean_token_accuracy": 0.8961951732635498, + "loss": 0.3771, + "mean_token_accuracy": 0.896357998251915, "step": 220 }, { "epoch": 1.2932551319648093, - "grad_norm": 1.7279379977091884, + "grad_norm": 1.7021693465570895, "learning_rate": 3.9064165239212874e-05, - "loss": 0.3649, - "mean_token_accuracy": 0.8994920030236244, + "loss": 0.3778, + "mean_token_accuracy": 0.89422857016325, "step": 221 }, { "epoch": 1.2991202346041055, - "grad_norm": 1.7761193410253977, + "grad_norm": 1.849147955241084, "learning_rate": 3.905322086568434e-05, - "loss": 0.3971, - "mean_token_accuracy": 0.8978786915540695, + "loss": 0.3871, + "mean_token_accuracy": 0.8929040059447289, "step": 222 }, { "epoch": 1.3049853372434017, - "grad_norm": 2.1577492725825773, + "grad_norm": 2.0252993158406034, "learning_rate": 3.904221459585142e-05, - "loss": 0.3386, - "mean_token_accuracy": 0.9025338441133499, + "loss": 0.3396, + "mean_token_accuracy": 0.9043144136667252, "step": 223 }, { "epoch": 1.310850439882698, - "grad_norm": 2.0991144733466127, + "grad_norm": 1.9166105249433603, "learning_rate": 3.903114646966242e-05, - "loss": 0.394, - "mean_token_accuracy": 0.8991600722074509, + "loss": 0.3879, + "mean_token_accuracy": 0.8941874876618385, "step": 224 }, { "epoch": 1.316715542521994, - "grad_norm": 1.5430466222950896, + "grad_norm": 1.8159618403624833, "learning_rate": 3.9020016527290166e-05, - "loss": 0.3554, - "mean_token_accuracy": 0.8956394866108894, + "loss": 0.3898, + "mean_token_accuracy": 0.886252149939537, "step": 225 }, { "epoch": 1.3225806451612903, - "grad_norm": 1.5321878304784884, + "grad_norm": 1.8271070306390285, "learning_rate": 3.900882480913185e-05, - "loss": 0.2586, - "mean_token_accuracy": 0.9249091520905495, + "loss": 0.2508, + "mean_token_accuracy": 0.9200474843382835, "step": 226 }, { "epoch": 1.3284457478005864, - "grad_norm": 1.7052024839790294, + "grad_norm": 1.8641984333480377, "learning_rate": 3.899757135580891e-05, - "loss": 0.4302, - "mean_token_accuracy": 0.8868528082966805, + "loss": 0.4347, + "mean_token_accuracy": 0.8843434602022171, "step": 227 }, { "epoch": 1.3343108504398826, - "grad_norm": 1.8879042217654005, + "grad_norm": 1.9751688939339136, "learning_rate": 3.898625620816681e-05, - "loss": 0.3146, - "mean_token_accuracy": 0.9062324613332748, + "loss": 0.3196, + "mean_token_accuracy": 0.9012826457619667, "step": 228 }, { "epoch": 1.3401759530791788, - "grad_norm": 1.895441937578749, + "grad_norm": 1.7860530094904565, "learning_rate": 3.8974879407275e-05, - "loss": 0.4562, - "mean_token_accuracy": 0.8757164552807808, + "loss": 0.4468, + "mean_token_accuracy": 0.8767973035573959, "step": 229 }, { "epoch": 1.3460410557184752, - "grad_norm": 2.1290916797705077, + "grad_norm": 1.94722176291669, "learning_rate": 3.896344099442663e-05, - "loss": 0.3177, - "mean_token_accuracy": 0.904730461537838, + "loss": 0.3302, + "mean_token_accuracy": 0.9074369296431541, "step": 230 }, { "epoch": 1.3519061583577714, - "grad_norm": 1.5925408878888774, + "grad_norm": 1.6187074828159356, "learning_rate": 3.895194101113855e-05, - "loss": 0.2663, - "mean_token_accuracy": 0.9067297652363777, + "loss": 0.2696, + "mean_token_accuracy": 0.9088193848729134, "step": 231 }, { "epoch": 1.3577712609970676, - "grad_norm": 1.6755193524016967, + "grad_norm": 2.6096489592144936, "learning_rate": 3.894037949915104e-05, - "loss": 0.3058, - "mean_token_accuracy": 0.9183206856250763, + "loss": 0.2944, + "mean_token_accuracy": 0.9228176847100258, "step": 232 }, { "epoch": 1.3636363636363638, - "grad_norm": 1.4261041380447086, + "grad_norm": 1.5634486259914928, "learning_rate": 3.8928756500427735e-05, - "loss": 0.3274, - "mean_token_accuracy": 0.9030020982027054, + "loss": 0.3238, + "mean_token_accuracy": 0.8987634629011154, "step": 233 }, { "epoch": 1.36950146627566, - "grad_norm": 1.7424232651440439, + "grad_norm": 1.9297888485342156, "learning_rate": 3.89170720571554e-05, - "loss": 0.2836, - "mean_token_accuracy": 0.9171391725540161, + "loss": 0.2825, + "mean_token_accuracy": 0.919132649898529, "step": 234 }, { "epoch": 1.3753665689149561, - "grad_norm": 1.5473986942703324, + "grad_norm": 5.705912737037372, "learning_rate": 3.890532621174387e-05, - "loss": 0.311, - "mean_token_accuracy": 0.9047387689352036, + "loss": 0.309, + "mean_token_accuracy": 0.9044925644993782, "step": 235 }, { "epoch": 1.3812316715542523, - "grad_norm": 1.4904609752021027, + "grad_norm": 1.8759082661293862, "learning_rate": 3.8893519006825806e-05, - "loss": 0.2811, - "mean_token_accuracy": 0.9175030738115311, + "loss": 0.3026, + "mean_token_accuracy": 0.9114593118429184, "step": 236 }, { "epoch": 1.3870967741935485, - "grad_norm": 1.6614527615879586, + "grad_norm": 3.0350786549552233, "learning_rate": 3.88816504852566e-05, - "loss": 0.2803, - "mean_token_accuracy": 0.9204106256365776, + "loss": 0.2832, + "mean_token_accuracy": 0.9205159991979599, "step": 237 }, { "epoch": 1.3929618768328447, - "grad_norm": 1.660701923358152, + "grad_norm": 1.8811808391254248, "learning_rate": 3.886972069011419e-05, - "loss": 0.4732, - "mean_token_accuracy": 0.878779798746109, + "loss": 0.495, + "mean_token_accuracy": 0.8736753240227699, "step": 238 }, { "epoch": 1.3988269794721409, - "grad_norm": 1.8193960520526216, + "grad_norm": 2.326861975246291, "learning_rate": 3.885772966469891e-05, - "loss": 0.3177, - "mean_token_accuracy": 0.9024636000394821, + "loss": 0.3113, + "mean_token_accuracy": 0.9055031910538673, "step": 239 }, { "epoch": 1.404692082111437, - "grad_norm": 1.6110661415608767, + "grad_norm": 1.6166513093632866, "learning_rate": 3.884567745253335e-05, - "loss": 0.2395, - "mean_token_accuracy": 0.9258132427930832, + "loss": 0.2422, + "mean_token_accuracy": 0.9216142147779465, "step": 240 }, { "epoch": 1.4105571847507332, - "grad_norm": 1.4342539536793646, + "grad_norm": 1.6509216370463153, "learning_rate": 3.8833564097362157e-05, - "loss": 0.3794, - "mean_token_accuracy": 0.8934561610221863, + "loss": 0.375, + "mean_token_accuracy": 0.8933239132165909, "step": 241 }, { "epoch": 1.4164222873900294, - "grad_norm": 1.5191845667740436, + "grad_norm": 1.5963842065424707, "learning_rate": 3.8821389643151924e-05, - "loss": 0.2483, - "mean_token_accuracy": 0.9304336309432983, + "loss": 0.2731, + "mean_token_accuracy": 0.9260216429829597, "step": 242 }, { "epoch": 1.4222873900293256, - "grad_norm": 1.5906029983066425, + "grad_norm": 1.7409972319657108, "learning_rate": 3.880915413409102e-05, - "loss": 0.289, - "mean_token_accuracy": 0.9240436926484108, + "loss": 0.2994, + "mean_token_accuracy": 0.9160123020410538, "step": 243 }, { "epoch": 1.4281524926686218, - "grad_norm": 1.5750449999867253, + "grad_norm": 1.685305669968149, "learning_rate": 3.879685761458938e-05, - "loss": 0.4022, - "mean_token_accuracy": 0.8791051730513573, + "loss": 0.3998, + "mean_token_accuracy": 0.8791949227452278, "step": 244 }, { "epoch": 1.434017595307918, - "grad_norm": 1.6746780717366057, + "grad_norm": 1.714247881724903, "learning_rate": 3.8784500129278405e-05, - "loss": 0.2427, - "mean_token_accuracy": 0.9278705045580864, + "loss": 0.2456, + "mean_token_accuracy": 0.9248116835951805, "step": 245 }, { "epoch": 1.4398826979472141, - "grad_norm": 1.70626429733012, + "grad_norm": 1.8283775935111375, "learning_rate": 3.877208172301079e-05, - "loss": 0.4178, - "mean_token_accuracy": 0.8769783824682236, + "loss": 0.4024, + "mean_token_accuracy": 0.8830369934439659, "step": 246 }, { "epoch": 1.4457478005865103, - "grad_norm": 1.5645704363193036, + "grad_norm": 1.641693773681782, "learning_rate": 3.875960244086032e-05, - "loss": 0.3022, - "mean_token_accuracy": 0.9060285091400146, + "loss": 0.2956, + "mean_token_accuracy": 0.9141576886177063, "step": 247 }, { "epoch": 1.4516129032258065, - "grad_norm": 1.5927324346376226, + "grad_norm": 1.8072855331044935, "learning_rate": 3.8747062328121756e-05, - "loss": 0.3328, - "mean_token_accuracy": 0.9143104031682014, + "loss": 0.3248, + "mean_token_accuracy": 0.9176624119281769, "step": 248 }, { "epoch": 1.4574780058651027, - "grad_norm": 1.3295756265415142, + "grad_norm": 1.4529714307869996, "learning_rate": 3.873446143031064e-05, - "loss": 0.2642, - "mean_token_accuracy": 0.9250845462083817, + "loss": 0.233, + "mean_token_accuracy": 0.9335721209645271, "step": 249 }, { "epoch": 1.4633431085043989, - "grad_norm": 1.6451577919637062, + "grad_norm": 1.6109952281207667, "learning_rate": 3.872179979316314e-05, - "loss": 0.2576, - "mean_token_accuracy": 0.9231655597686768, + "loss": 0.2647, + "mean_token_accuracy": 0.9189906045794487, "step": 250 }, { "epoch": 1.469208211143695, - "grad_norm": 1.374617140630625, + "grad_norm": 1.5387664784956696, "learning_rate": 3.870907746263589e-05, - "loss": 0.2344, - "mean_token_accuracy": 0.9281893447041512, + "loss": 0.2368, + "mean_token_accuracy": 0.9255321696400642, "step": 251 }, { "epoch": 1.4750733137829912, - "grad_norm": 1.5385521582071795, + "grad_norm": 1.5682341766556274, "learning_rate": 3.869629448490582e-05, - "loss": 0.3019, - "mean_token_accuracy": 0.9170176237821579, + "loss": 0.3074, + "mean_token_accuracy": 0.9108378365635872, "step": 252 }, { "epoch": 1.4809384164222874, - "grad_norm": 1.4332428265169206, + "grad_norm": 1.605199961656554, "learning_rate": 3.868345090636995e-05, - "loss": 0.32, - "mean_token_accuracy": 0.9136760458350182, + "loss": 0.3372, + "mean_token_accuracy": 0.9136556684970856, "step": 253 }, { "epoch": 1.4868035190615836, - "grad_norm": 1.8145070360131068, + "grad_norm": 1.702260159704336, "learning_rate": 3.867054677364531e-05, - "loss": 0.3077, - "mean_token_accuracy": 0.9085892364382744, + "loss": 0.2964, + "mean_token_accuracy": 0.9146860465407372, "step": 254 }, { "epoch": 1.4926686217008798, - "grad_norm": 1.4540625255590833, + "grad_norm": 1.6758389465527221, "learning_rate": 3.865758213356868e-05, - "loss": 0.3122, - "mean_token_accuracy": 0.9093359783291817, + "loss": 0.3302, + "mean_token_accuracy": 0.9049453809857368, "step": 255 }, { "epoch": 1.498533724340176, - "grad_norm": 1.6678383017411527, + "grad_norm": 1.8470944243244563, "learning_rate": 3.8644557033196456e-05, - "loss": 0.3175, - "mean_token_accuracy": 0.9082972332835197, + "loss": 0.308, + "mean_token_accuracy": 0.9096508920192719, "step": 256 }, { "epoch": 1.5043988269794721, - "grad_norm": 1.5866601580017656, + "grad_norm": 1.6459334975457287, "learning_rate": 3.8631471519804514e-05, - "loss": 0.3432, - "mean_token_accuracy": 0.9088873639702797, + "loss": 0.3409, + "mean_token_accuracy": 0.908280074596405, "step": 257 }, { "epoch": 1.5102639296187683, - "grad_norm": 1.6958942000186437, + "grad_norm": 1.8846190933749478, "learning_rate": 3.861832564088797e-05, - "loss": 0.3633, - "mean_token_accuracy": 0.8945498690009117, + "loss": 0.3908, + "mean_token_accuracy": 0.8908302560448647, "step": 258 }, { "epoch": 1.5161290322580645, - "grad_norm": 1.6901057610483312, + "grad_norm": 1.8194930732382166, "learning_rate": 3.860511944416105e-05, - "loss": 0.2474, - "mean_token_accuracy": 0.9269101545214653, + "loss": 0.237, + "mean_token_accuracy": 0.9297190383076668, "step": 259 }, { "epoch": 1.5219941348973607, - "grad_norm": 1.4062959541358844, + "grad_norm": 1.4686546503831466, "learning_rate": 3.859185297755693e-05, - "loss": 0.2571, - "mean_token_accuracy": 0.9234707877039909, + "loss": 0.2434, + "mean_token_accuracy": 0.926492266356945, "step": 260 }, { "epoch": 1.5278592375366569, - "grad_norm": 1.293753749817753, + "grad_norm": 1.4178478581133294, "learning_rate": 3.857852628922751e-05, - "loss": 0.224, - "mean_token_accuracy": 0.9362157136201859, + "loss": 0.2247, + "mean_token_accuracy": 0.9361571744084358, "step": 261 }, { "epoch": 1.533724340175953, - "grad_norm": 1.7768770225023713, + "grad_norm": 1.8264634652027574, "learning_rate": 3.856513942754329e-05, - "loss": 0.2784, - "mean_token_accuracy": 0.9158712923526764, + "loss": 0.2798, + "mean_token_accuracy": 0.9201556444168091, "step": 262 }, { "epoch": 1.5395894428152492, - "grad_norm": 1.5368924710192076, + "grad_norm": 1.3877603216406522, "learning_rate": 3.8551692441093183e-05, - "loss": 0.2147, - "mean_token_accuracy": 0.9343990013003349, + "loss": 0.2084, + "mean_token_accuracy": 0.9382101222872734, "step": 263 }, { "epoch": 1.5454545454545454, - "grad_norm": 1.5208867150448775, + "grad_norm": 2.1242475529664406, "learning_rate": 3.85381853786843e-05, - "loss": 0.3668, - "mean_token_accuracy": 0.8892128467559814, + "loss": 0.3805, + "mean_token_accuracy": 0.8912002816796303, "step": 264 }, { "epoch": 1.5513196480938416, - "grad_norm": 1.615170986550086, + "grad_norm": 1.9343373311067464, "learning_rate": 3.852461828934184e-05, - "loss": 0.3435, - "mean_token_accuracy": 0.907134085893631, + "loss": 0.3576, + "mean_token_accuracy": 0.9037280976772308, "step": 265 }, { "epoch": 1.5571847507331378, - "grad_norm": 1.5028390308172306, + "grad_norm": 1.7197714668657997, "learning_rate": 3.851099122230885e-05, - "loss": 0.2613, - "mean_token_accuracy": 0.9213738068938255, + "loss": 0.2661, + "mean_token_accuracy": 0.9237689301371574, "step": 266 }, { "epoch": 1.563049853372434, - "grad_norm": 1.5527776825198953, + "grad_norm": 1.2968944947332304, "learning_rate": 3.849730422704608e-05, - "loss": 0.3699, - "mean_token_accuracy": 0.9007752239704132, + "loss": 0.367, + "mean_token_accuracy": 0.9015661999583244, "step": 267 }, { "epoch": 1.5689149560117301, - "grad_norm": 1.68946384944025, + "grad_norm": 1.8354778354820311, "learning_rate": 3.84835573532318e-05, - "loss": 0.2226, - "mean_token_accuracy": 0.9297270327806473, + "loss": 0.2307, + "mean_token_accuracy": 0.930214174091816, "step": 268 }, { "epoch": 1.5747800586510263, - "grad_norm": 1.8568160263580067, + "grad_norm": 1.6095719868962666, "learning_rate": 3.84697506507616e-05, - "loss": 0.3495, - "mean_token_accuracy": 0.8976611867547035, + "loss": 0.3754, + "mean_token_accuracy": 0.8950481489300728, "step": 269 }, { "epoch": 1.5806451612903225, - "grad_norm": 1.9946826389284182, + "grad_norm": 1.8589562104326827, "learning_rate": 3.845588416974824e-05, - "loss": 0.3764, - "mean_token_accuracy": 0.9064979031682014, + "loss": 0.3538, + "mean_token_accuracy": 0.9128340929746628, "step": 270 }, { "epoch": 1.5865102639296187, - "grad_norm": 1.6656738473748323, + "grad_norm": 1.5115174690077389, "learning_rate": 3.844195796052144e-05, - "loss": 0.3061, - "mean_token_accuracy": 0.9155899211764336, + "loss": 0.2958, + "mean_token_accuracy": 0.9182360097765923, "step": 271 }, { "epoch": 1.5923753665689149, - "grad_norm": 1.4699049383173204, + "grad_norm": 1.5391526339443062, "learning_rate": 3.8427972073627724e-05, - "loss": 0.4471, - "mean_token_accuracy": 0.8815479129552841, + "loss": 0.4294, + "mean_token_accuracy": 0.886161781847477, "step": 272 }, { "epoch": 1.598240469208211, - "grad_norm": 1.7825727959157462, + "grad_norm": 1.7890963365765244, "learning_rate": 3.841392655983021e-05, - "loss": 0.2179, - "mean_token_accuracy": 0.9388237595558167, + "loss": 0.2038, + "mean_token_accuracy": 0.9358106032013893, "step": 273 }, { "epoch": 1.6041055718475072, - "grad_norm": 1.2309725165752188, + "grad_norm": 1.0409480351181137, "learning_rate": 3.8399821470108444e-05, - "loss": 0.1733, - "mean_token_accuracy": 0.9496222510933876, + "loss": 0.169, + "mean_token_accuracy": 0.9508898928761482, "step": 274 }, { "epoch": 1.6099706744868034, - "grad_norm": 1.7497475431193252, + "grad_norm": 1.69462838247509, "learning_rate": 3.838565685565819e-05, - "loss": 0.41, - "mean_token_accuracy": 0.892163947224617, + "loss": 0.4235, + "mean_token_accuracy": 0.8907356783747673, "step": 275 }, { "epoch": 1.6158357771260996, - "grad_norm": 1.6032760832968165, + "grad_norm": 1.6504210584720351, "learning_rate": 3.8371432767891295e-05, - "loss": 0.3103, - "mean_token_accuracy": 0.9191496223211288, + "loss": 0.3019, + "mean_token_accuracy": 0.9208194762468338, "step": 276 }, { "epoch": 1.6217008797653958, - "grad_norm": 1.475523533191468, + "grad_norm": 1.5605291392635074, "learning_rate": 3.8357149258435444e-05, - "loss": 0.2458, - "mean_token_accuracy": 0.9312805011868477, + "loss": 0.2417, + "mean_token_accuracy": 0.9278599768877029, "step": 277 }, { "epoch": 1.627565982404692, - "grad_norm": 1.4874016779495678, + "grad_norm": 1.5359970078910663, "learning_rate": 3.8342806379134005e-05, - "loss": 0.3746, - "mean_token_accuracy": 0.8982912823557854, + "loss": 0.3813, + "mean_token_accuracy": 0.8966976255178452, "step": 278 }, { "epoch": 1.6334310850439882, - "grad_norm": 1.5396252294388457, + "grad_norm": 1.6548981878718405, "learning_rate": 3.8328404182045854e-05, - "loss": 0.3274, - "mean_token_accuracy": 0.9162414520978928, + "loss": 0.3248, + "mean_token_accuracy": 0.9139015227556229, "step": 279 }, { "epoch": 1.6392961876832843, - "grad_norm": 1.9220523269412597, + "grad_norm": 1.791651867333591, "learning_rate": 3.831394271944512e-05, - "loss": 0.3118, - "mean_token_accuracy": 0.9187277778983116, + "loss": 0.3067, + "mean_token_accuracy": 0.9152898713946342, "step": 280 }, { "epoch": 1.6451612903225805, - "grad_norm": 1.5408962396890666, + "grad_norm": 1.5338974063867743, "learning_rate": 3.82994220438211e-05, - "loss": 0.2972, - "mean_token_accuracy": 0.9149600267410278, + "loss": 0.3007, + "mean_token_accuracy": 0.9149986058473587, "step": 281 }, { "epoch": 1.6510263929618767, - "grad_norm": 1.6552152776718903, + "grad_norm": 1.6743377942352997, "learning_rate": 3.828484220787797e-05, - "loss": 0.3117, - "mean_token_accuracy": 0.9124673455953598, + "loss": 0.313, + "mean_token_accuracy": 0.9125666543841362, "step": 282 }, { "epoch": 1.6568914956011729, - "grad_norm": 1.9823703515758113, + "grad_norm": 1.87863609348132, "learning_rate": 3.8270203264534644e-05, - "loss": 0.3958, - "mean_token_accuracy": 0.8929010257124901, + "loss": 0.4001, + "mean_token_accuracy": 0.89353808760643, "step": 283 }, { "epoch": 1.662756598240469, - "grad_norm": 1.477782103664971, + "grad_norm": 1.5221720369596754, "learning_rate": 3.8255505266924585e-05, - "loss": 0.3019, - "mean_token_accuracy": 0.9156582951545715, + "loss": 0.2979, + "mean_token_accuracy": 0.9145095869898796, "step": 284 }, { "epoch": 1.6686217008797652, - "grad_norm": 1.3242833914955063, + "grad_norm": 1.4436482743787307, "learning_rate": 3.824074826839557e-05, - "loss": 0.2404, - "mean_token_accuracy": 0.93580362200737, + "loss": 0.234, + "mean_token_accuracy": 0.935015395283699, "step": 285 }, { "epoch": 1.6744868035190614, - "grad_norm": 2.0670290487108134, + "grad_norm": 2.0983241426301733, "learning_rate": 3.822593232250956e-05, - "loss": 0.4481, - "mean_token_accuracy": 0.8762071877717972, + "loss": 0.4151, + "mean_token_accuracy": 0.8874623849987984, "step": 286 }, { "epoch": 1.6803519061583576, - "grad_norm": 1.8797288846958915, + "grad_norm": 2.091453827833945, "learning_rate": 3.8211057483042446e-05, - "loss": 0.4279, - "mean_token_accuracy": 0.8910372480750084, + "loss": 0.4583, + "mean_token_accuracy": 0.8864308521151543, "step": 287 }, { "epoch": 1.6862170087976538, - "grad_norm": 1.5498598053807129, + "grad_norm": 1.7623769296533551, "learning_rate": 3.8196123803983895e-05, - "loss": 0.3023, - "mean_token_accuracy": 0.917064480483532, + "loss": 0.3116, + "mean_token_accuracy": 0.9158060252666473, "step": 288 }, { "epoch": 1.6920821114369502, - "grad_norm": 1.6259661356334203, + "grad_norm": 1.8634816515022323, "learning_rate": 3.818113133953712e-05, - "loss": 0.2943, - "mean_token_accuracy": 0.9172269403934479, + "loss": 0.2798, + "mean_token_accuracy": 0.9182194173336029, "step": 289 }, { "epoch": 1.6979472140762464, - "grad_norm": 1.3857325306672972, + "grad_norm": 1.3844096892784188, "learning_rate": 3.816608014411872e-05, - "loss": 0.2045, - "mean_token_accuracy": 0.9397373795509338, + "loss": 0.2108, + "mean_token_accuracy": 0.9366880431771278, "step": 290 }, { "epoch": 1.7038123167155426, - "grad_norm": 1.3545393801192716, + "grad_norm": 1.374168812023236, "learning_rate": 3.815097027235845e-05, - "loss": 0.3132, - "mean_token_accuracy": 0.9108520373702049, + "loss": 0.2938, + "mean_token_accuracy": 0.9153807386755943, "step": 291 }, { "epoch": 1.7096774193548387, - "grad_norm": 1.6993235166452834, + "grad_norm": 1.7034775235235637, "learning_rate": 3.813580177909906e-05, - "loss": 0.242, - "mean_token_accuracy": 0.9247912764549255, + "loss": 0.2667, + "mean_token_accuracy": 0.9222086817026138, "step": 292 }, { "epoch": 1.715542521994135, - "grad_norm": 1.062509305690891, + "grad_norm": 1.3790984180175314, "learning_rate": 3.8120574719396023e-05, - "loss": 0.2618, - "mean_token_accuracy": 0.9303434118628502, + "loss": 0.2556, + "mean_token_accuracy": 0.9333324581384659, "step": 293 }, { "epoch": 1.721407624633431, - "grad_norm": 1.7701722344281323, + "grad_norm": 1.8884841762010651, "learning_rate": 3.810528914851745e-05, - "loss": 0.3753, - "mean_token_accuracy": 0.900924563407898, + "loss": 0.3924, + "mean_token_accuracy": 0.8967092782258987, "step": 294 }, { "epoch": 1.7272727272727273, - "grad_norm": 1.5877419428096278, + "grad_norm": 1.7491440284079216, "learning_rate": 3.808994512194376e-05, - "loss": 0.3402, - "mean_token_accuracy": 0.8950434923171997, + "loss": 0.341, + "mean_token_accuracy": 0.8986316025257111, "step": 295 }, { "epoch": 1.7331378299120235, - "grad_norm": 1.6610643623647858, + "grad_norm": 1.7932797238017746, "learning_rate": 3.807454269536758e-05, - "loss": 0.34, - "mean_token_accuracy": 0.9047321453690529, + "loss": 0.3458, + "mean_token_accuracy": 0.9050753563642502, "step": 296 }, { "epoch": 1.7390029325513197, - "grad_norm": 1.4505169368504018, + "grad_norm": 1.6441635146631224, "learning_rate": 3.805908192469351e-05, - "loss": 0.224, - "mean_token_accuracy": 0.9305636957287788, + "loss": 0.2323, + "mean_token_accuracy": 0.9215081483125687, "step": 297 }, { "epoch": 1.7448680351906158, - "grad_norm": 1.4096184186739809, + "grad_norm": 1.9068409230817305, "learning_rate": 3.80435628660379e-05, - "loss": 0.3201, - "mean_token_accuracy": 0.9103965312242508, + "loss": 0.318, + "mean_token_accuracy": 0.9142180681228638, "step": 298 }, { "epoch": 1.750733137829912, - "grad_norm": 1.1931765360526814, + "grad_norm": 1.5615974684408795, "learning_rate": 3.802798557572867e-05, - "loss": 0.311, - "mean_token_accuracy": 0.9164270684123039, + "loss": 0.3187, + "mean_token_accuracy": 0.9167973101139069, "step": 299 }, { "epoch": 1.7565982404692082, - "grad_norm": 1.6327117487156346, + "grad_norm": 1.8792330981634289, "learning_rate": 3.801235011030506e-05, - "loss": 0.3023, - "mean_token_accuracy": 0.9153658151626587, + "loss": 0.3168, + "mean_token_accuracy": 0.9097247943282127, "step": 300 }, { "epoch": 1.7624633431085044, - "grad_norm": 1.3440051164242026, + "grad_norm": 1.4882463951833638, "learning_rate": 3.799665652651754e-05, - "loss": 0.1817, - "mean_token_accuracy": 0.951392151415348, + "loss": 0.1745, + "mean_token_accuracy": 0.9463199898600578, "step": 301 }, { "epoch": 1.7683284457478006, - "grad_norm": 1.2516019247625034, + "grad_norm": 1.3776003810458817, "learning_rate": 3.7980904881327446e-05, - "loss": 0.2478, - "mean_token_accuracy": 0.9341270625591278, + "loss": 0.259, + "mean_token_accuracy": 0.9295109435915947, "step": 302 }, { "epoch": 1.7741935483870968, - "grad_norm": 1.5459031897179347, + "grad_norm": 1.804906347600171, "learning_rate": 3.796509523190691e-05, - "loss": 0.2693, - "mean_token_accuracy": 0.918998509645462, + "loss": 0.2784, + "mean_token_accuracy": 0.9167697057127953, "step": 303 }, { "epoch": 1.780058651026393, - "grad_norm": 1.3298611087881873, + "grad_norm": 1.269707072306634, "learning_rate": 3.794922763563857e-05, - "loss": 0.2135, - "mean_token_accuracy": 0.9386330172419548, + "loss": 0.2092, + "mean_token_accuracy": 0.9398507922887802, "step": 304 }, { "epoch": 1.7859237536656891, - "grad_norm": 1.7138314899426605, + "grad_norm": 1.8243916364746902, "learning_rate": 3.793330215011538e-05, - "loss": 0.3072, - "mean_token_accuracy": 0.9276101067662239, + "loss": 0.3314, + "mean_token_accuracy": 0.923354484140873, "step": 305 }, { "epoch": 1.7917888563049853, - "grad_norm": 1.4545273510274497, + "grad_norm": 1.7139096688789461, "learning_rate": 3.791731883314043e-05, - "loss": 0.2681, - "mean_token_accuracy": 0.922805443406105, + "loss": 0.2919, + "mean_token_accuracy": 0.9158714264631271, "step": 306 }, { "epoch": 1.7976539589442815, - "grad_norm": 1.557251488291821, + "grad_norm": 1.5960616358194517, "learning_rate": 3.790127774272671e-05, - "loss": 0.233, - "mean_token_accuracy": 0.9324622675776482, + "loss": 0.2381, + "mean_token_accuracy": 0.9304629862308502, "step": 307 }, { "epoch": 1.8035190615835777, - "grad_norm": 1.4492526540365742, + "grad_norm": 1.5533548503357937, "learning_rate": 3.7885178937096884e-05, - "loss": 0.3703, - "mean_token_accuracy": 0.90493393689394, + "loss": 0.3685, + "mean_token_accuracy": 0.8978663310408592, "step": 308 }, { "epoch": 1.8093841642228738, - "grad_norm": 1.5281139962268382, + "grad_norm": 1.7461610002504417, "learning_rate": 3.7869022474683125e-05, - "loss": 0.4118, - "mean_token_accuracy": 0.9041909128427505, + "loss": 0.4383, + "mean_token_accuracy": 0.9004004076123238, "step": 309 }, { "epoch": 1.8152492668621703, - "grad_norm": 1.93639219005886, + "grad_norm": 2.06432042277161, "learning_rate": 3.7852808414126876e-05, - "loss": 0.3565, - "mean_token_accuracy": 0.9011876359581947, + "loss": 0.352, + "mean_token_accuracy": 0.9019398987293243, "step": 310 }, { "epoch": 1.8211143695014664, - "grad_norm": 1.4288238334885668, + "grad_norm": 1.4176538924862974, "learning_rate": 3.783653681427861e-05, - "loss": 0.2322, - "mean_token_accuracy": 0.9318009614944458, + "loss": 0.2086, + "mean_token_accuracy": 0.9427406489849091, "step": 311 }, { "epoch": 1.8269794721407626, - "grad_norm": 1.5419872027394301, + "grad_norm": 1.335202318172332, "learning_rate": 3.7820207734197676e-05, - "loss": 0.3153, - "mean_token_accuracy": 0.915338508784771, + "loss": 0.3102, + "mean_token_accuracy": 0.9114719554781914, "step": 312 }, { "epoch": 1.8328445747800588, - "grad_norm": 1.4488167015309188, + "grad_norm": 1.507155874356392, "learning_rate": 3.780382123315203e-05, - "loss": 0.1989, - "mean_token_accuracy": 0.9429738447070122, + "loss": 0.2132, + "mean_token_accuracy": 0.9400037527084351, "step": 313 }, { "epoch": 1.838709677419355, - "grad_norm": 1.3031287573895856, + "grad_norm": 1.3042920075399829, "learning_rate": 3.778737737061807e-05, - "loss": 0.3113, - "mean_token_accuracy": 0.9149032607674599, + "loss": 0.2946, + "mean_token_accuracy": 0.9200368896126747, "step": 314 }, { "epoch": 1.8445747800586512, - "grad_norm": 1.4920076612556301, + "grad_norm": 1.5791832331088969, "learning_rate": 3.777087620628035e-05, - "loss": 0.2359, - "mean_token_accuracy": 0.9325834512710571, + "loss": 0.2362, + "mean_token_accuracy": 0.9357219859957695, "step": 315 }, { "epoch": 1.8504398826979473, - "grad_norm": 1.385078335478815, + "grad_norm": 1.5908555105002338, "learning_rate": 3.775431780003145e-05, - "loss": 0.2044, - "mean_token_accuracy": 0.9399889931082726, + "loss": 0.2167, + "mean_token_accuracy": 0.9437434077262878, "step": 316 }, { "epoch": 1.8563049853372435, - "grad_norm": 1.269824397497335, + "grad_norm": 1.4150651589598287, "learning_rate": 3.7737702211971684e-05, - "loss": 0.2496, - "mean_token_accuracy": 0.9346247911453247, + "loss": 0.266, + "mean_token_accuracy": 0.9321496710181236, "step": 317 }, { "epoch": 1.8621700879765397, - "grad_norm": 1.4411880057880286, + "grad_norm": 2.0256566853788027, "learning_rate": 3.772102950240895e-05, - "loss": 0.2563, - "mean_token_accuracy": 0.9334022998809814, + "loss": 0.2481, + "mean_token_accuracy": 0.9347328022122383, "step": 318 }, { "epoch": 1.868035190615836, - "grad_norm": 1.532918693941707, + "grad_norm": 1.5023088590721538, "learning_rate": 3.770429973185842e-05, - "loss": 0.2856, - "mean_token_accuracy": 0.9228765368461609, + "loss": 0.2938, + "mean_token_accuracy": 0.9203558340668678, "step": 319 }, { "epoch": 1.873900293255132, - "grad_norm": 1.508327365783947, + "grad_norm": 1.496439833933689, "learning_rate": 3.768751296104243e-05, - "loss": 0.1919, - "mean_token_accuracy": 0.9403479546308517, + "loss": 0.1895, + "mean_token_accuracy": 0.9429217278957367, "step": 320 }, { "epoch": 1.8797653958944283, - "grad_norm": 1.2191592802013955, + "grad_norm": 1.1444559280831719, "learning_rate": 3.767066925089017e-05, - "loss": 0.2863, - "mean_token_accuracy": 0.9149582833051682, + "loss": 0.284, + "mean_token_accuracy": 0.9179976284503937, "step": 321 }, { "epoch": 1.8856304985337244, - "grad_norm": 1.4714233419752547, + "grad_norm": 1.520002548329364, "learning_rate": 3.765376866253749e-05, - "loss": 0.199, - "mean_token_accuracy": 0.9367243573069572, + "loss": 0.2249, + "mean_token_accuracy": 0.9298429787158966, "step": 322 }, { "epoch": 1.8914956011730206, - "grad_norm": 1.3785113383476932, + "grad_norm": 1.6152013282405222, "learning_rate": 3.763681125732672e-05, - "loss": 0.2652, - "mean_token_accuracy": 0.9183213263750076, + "loss": 0.2711, + "mean_token_accuracy": 0.9168659150600433, "step": 323 }, { "epoch": 1.8973607038123168, - "grad_norm": 1.5448358403304276, + "grad_norm": 1.6738843600847169, "learning_rate": 3.7619797096806386e-05, - "loss": 0.2859, - "mean_token_accuracy": 0.9174121469259262, + "loss": 0.2835, + "mean_token_accuracy": 0.920739158987999, "step": 324 }, { "epoch": 1.903225806451613, - "grad_norm": 1.4396711929830184, + "grad_norm": 1.468790562074948, "learning_rate": 3.7602726242731016e-05, - "loss": 0.3124, - "mean_token_accuracy": 0.9157911166548729, + "loss": 0.3202, + "mean_token_accuracy": 0.916345439851284, "step": 325 }, { "epoch": 1.9090909090909092, - "grad_norm": 1.437625652493676, + "grad_norm": 1.540640147913931, "learning_rate": 3.758559875706092e-05, - "loss": 0.2302, - "mean_token_accuracy": 0.9349333196878433, + "loss": 0.2337, + "mean_token_accuracy": 0.9352636113762856, "step": 326 }, { "epoch": 1.9149560117302054, - "grad_norm": 1.2452960419013337, + "grad_norm": 1.2452611699041465, "learning_rate": 3.756841470196195e-05, - "loss": 0.3109, - "mean_token_accuracy": 0.9157072603702545, + "loss": 0.321, + "mean_token_accuracy": 0.9145397022366524, "step": 327 }, { "epoch": 1.9208211143695015, - "grad_norm": 1.3037538478148276, + "grad_norm": 1.4066519231199601, "learning_rate": 3.7551174139805284e-05, - "loss": 0.3212, - "mean_token_accuracy": 0.9096843525767326, + "loss": 0.3281, + "mean_token_accuracy": 0.9094234853982925, "step": 328 }, { "epoch": 1.9266862170087977, - "grad_norm": 1.625979083841291, + "grad_norm": 1.5838261059071022, "learning_rate": 3.75338771331672e-05, - "loss": 0.3253, - "mean_token_accuracy": 0.9081972911953926, + "loss": 0.3144, + "mean_token_accuracy": 0.910196490585804, "step": 329 }, { "epoch": 1.932551319648094, - "grad_norm": 1.5169434445509558, + "grad_norm": 1.4176124121116533, "learning_rate": 3.7516523744828856e-05, - "loss": 0.3588, - "mean_token_accuracy": 0.9001770913600922, + "loss": 0.3585, + "mean_token_accuracy": 0.9048479348421097, "step": 330 }, { "epoch": 1.93841642228739, - "grad_norm": 1.530352955863984, + "grad_norm": 1.6180761569415234, "learning_rate": 3.7499114037776036e-05, - "loss": 0.2789, - "mean_token_accuracy": 0.9110410585999489, + "loss": 0.2809, + "mean_token_accuracy": 0.9143272861838341, "step": 331 }, { "epoch": 1.9442815249266863, - "grad_norm": 1.6088019528257314, + "grad_norm": 1.6370252877103721, "learning_rate": 3.748164807519894e-05, - "loss": 0.4174, - "mean_token_accuracy": 0.8930394127964973, + "loss": 0.4283, + "mean_token_accuracy": 0.8924917504191399, "step": 332 }, { "epoch": 1.9501466275659824, - "grad_norm": 1.8565709071738816, + "grad_norm": 2.0716920896437254, "learning_rate": 3.746412592049197e-05, - "loss": 0.3197, - "mean_token_accuracy": 0.9104765355587006, + "loss": 0.3132, + "mean_token_accuracy": 0.9136724919080734, "step": 333 }, { "epoch": 1.9560117302052786, - "grad_norm": 1.33049768118604, + "grad_norm": 1.340735087489091, "learning_rate": 3.7446547637253464e-05, - "loss": 0.1996, - "mean_token_accuracy": 0.9462396278977394, + "loss": 0.2091, + "mean_token_accuracy": 0.9418945610523224, "step": 334 }, { "epoch": 1.9618768328445748, - "grad_norm": 1.4386241294013715, + "grad_norm": 1.6719209132212014, "learning_rate": 3.742891328928549e-05, - "loss": 0.284, - "mean_token_accuracy": 0.9256806001067162, + "loss": 0.2879, + "mean_token_accuracy": 0.9235949739813805, "step": 335 }, { "epoch": 1.967741935483871, - "grad_norm": 1.1259642533453769, + "grad_norm": 1.147281989168185, "learning_rate": 3.74112229405936e-05, - "loss": 0.2623, - "mean_token_accuracy": 0.9194123968482018, + "loss": 0.2679, + "mean_token_accuracy": 0.9211752265691757, "step": 336 }, { "epoch": 1.9736070381231672, - "grad_norm": 1.2580114294563631, + "grad_norm": 1.3481384626672739, "learning_rate": 3.739347665538664e-05, - "loss": 0.2717, - "mean_token_accuracy": 0.9298633262515068, + "loss": 0.2892, + "mean_token_accuracy": 0.9232519641518593, "step": 337 }, { "epoch": 1.9794721407624634, - "grad_norm": 1.5397240772403886, + "grad_norm": 1.62371246747971, "learning_rate": 3.7375674498076445e-05, - "loss": 0.3439, - "mean_token_accuracy": 0.9038892313838005, + "loss": 0.3896, + "mean_token_accuracy": 0.8951458856463432, "step": 338 }, { "epoch": 1.9853372434017595, - "grad_norm": 1.6069200337143423, + "grad_norm": 1.6604679516789678, "learning_rate": 3.7357816533277646e-05, - "loss": 0.2785, - "mean_token_accuracy": 0.9279282689094543, + "loss": 0.2822, + "mean_token_accuracy": 0.9289601370692253, "step": 339 }, { "epoch": 1.9912023460410557, - "grad_norm": 1.3029156392114332, + "grad_norm": 1.4263291403576979, "learning_rate": 3.733990282580745e-05, - "loss": 0.2791, - "mean_token_accuracy": 0.9194482937455177, + "loss": 0.2715, + "mean_token_accuracy": 0.9229650124907494, "step": 340 }, { "epoch": 1.997067448680352, - "grad_norm": 1.559895084315268, + "grad_norm": 1.747667099197608, "learning_rate": 3.732193344068539e-05, - "loss": 0.2702, - "mean_token_accuracy": 0.9247054308652878, + "loss": 0.307, + "mean_token_accuracy": 0.9177920147776604, "step": 341 }, { "epoch": 2.0, - "grad_norm": 1.9400108632625268, + "grad_norm": 2.381256341303474, "learning_rate": 3.7303908443133054e-05, - "loss": 0.1662, - "mean_token_accuracy": 0.9493132084608078, + "loss": 0.1861, + "mean_token_accuracy": 0.9418904036283493, "step": 342 }, { "epoch": 2.005865102639296, - "grad_norm": 1.2407168217525242, + "grad_norm": 1.4058318833471946, "learning_rate": 3.728582789857393e-05, - "loss": 0.1643, - "mean_token_accuracy": 0.9579492285847664, + "loss": 0.1791, + "mean_token_accuracy": 0.95208640396595, "step": 343 }, { "epoch": 2.0117302052785924, - "grad_norm": 1.3093362788237175, + "grad_norm": 1.3080268747508232, "learning_rate": 3.726769187263308e-05, - "loss": 0.1865, - "mean_token_accuracy": 0.9434748664498329, + "loss": 0.2021, + "mean_token_accuracy": 0.9402620419859886, "step": 344 }, { "epoch": 2.0175953079178885, - "grad_norm": 1.1648794190467764, + "grad_norm": 1.1752970803269882, "learning_rate": 3.724950043113695e-05, - "loss": 0.1359, - "mean_token_accuracy": 0.9601836279034615, + "loss": 0.1354, + "mean_token_accuracy": 0.9612007439136505, "step": 345 }, { "epoch": 2.0234604105571847, - "grad_norm": 1.1333150827756964, + "grad_norm": 1.2121178056268758, "learning_rate": 3.723125364011313e-05, - "loss": 0.1379, - "mean_token_accuracy": 0.95941511541605, + "loss": 0.1272, + "mean_token_accuracy": 0.9641102254390717, "step": 346 }, { "epoch": 2.029325513196481, - "grad_norm": 1.1964853813194998, + "grad_norm": 1.5579896750905702, "learning_rate": 3.7212951565790094e-05, - "loss": 0.1448, - "mean_token_accuracy": 0.9546831250190735, + "loss": 0.1594, + "mean_token_accuracy": 0.9491528794169426, "step": 347 }, { "epoch": 2.035190615835777, - "grad_norm": 1.2734515348322968, + "grad_norm": 1.6099605117684852, "learning_rate": 3.7194594274597e-05, - "loss": 0.1495, - "mean_token_accuracy": 0.9548632949590683, + "loss": 0.1617, + "mean_token_accuracy": 0.9526640698313713, "step": 348 }, { "epoch": 2.0410557184750733, - "grad_norm": 1.2877766399858757, + "grad_norm": 1.5932197664707426, "learning_rate": 3.7176181833163385e-05, - "loss": 0.1739, - "mean_token_accuracy": 0.9461784809827805, + "loss": 0.1908, + "mean_token_accuracy": 0.9443105757236481, "step": 349 }, { "epoch": 2.0469208211143695, - "grad_norm": 1.413810988292415, + "grad_norm": 1.418900526462659, "learning_rate": 3.7157714308318966e-05, - "loss": 0.1596, - "mean_token_accuracy": 0.9559041485190392, + "loss": 0.1557, + "mean_token_accuracy": 0.9594366997480392, "step": 350 }, { "epoch": 2.0527859237536656, - "grad_norm": 1.5026104069307236, + "grad_norm": 1.3139191474546121, "learning_rate": 3.713919176709343e-05, - "loss": 0.1985, - "mean_token_accuracy": 0.9448290690779686, + "loss": 0.1671, + "mean_token_accuracy": 0.9532913491129875, "step": 351 }, { "epoch": 2.058651026392962, - "grad_norm": 1.2187901250703708, + "grad_norm": 1.1508507293987567, "learning_rate": 3.712061427671609e-05, - "loss": 0.1305, - "mean_token_accuracy": 0.9609132781624794, + "loss": 0.1363, + "mean_token_accuracy": 0.9612403735518456, "step": 352 }, { "epoch": 2.064516129032258, - "grad_norm": 1.2188630469947228, + "grad_norm": 1.5612158263654403, "learning_rate": 3.710198190461575e-05, - "loss": 0.1763, - "mean_token_accuracy": 0.955159068107605, + "loss": 0.1613, + "mean_token_accuracy": 0.9548765048384666, "step": 353 }, { "epoch": 2.070381231671554, - "grad_norm": 1.2160320840757712, + "grad_norm": 1.2155961492327827, "learning_rate": 3.7083294718420394e-05, - "loss": 0.1674, - "mean_token_accuracy": 0.9511153474450111, + "loss": 0.1573, + "mean_token_accuracy": 0.951531894505024, "step": 354 }, { "epoch": 2.0762463343108504, - "grad_norm": 1.4125365150094613, + "grad_norm": 1.5294208187687017, "learning_rate": 3.706455278595696e-05, - "loss": 0.1646, - "mean_token_accuracy": 0.9504409730434418, + "loss": 0.1628, + "mean_token_accuracy": 0.9525769129395485, "step": 355 }, { "epoch": 2.0821114369501466, - "grad_norm": 1.2271156324554804, + "grad_norm": 1.482122437822726, "learning_rate": 3.7045756175251086e-05, - "loss": 0.1575, - "mean_token_accuracy": 0.9546771794557571, + "loss": 0.1678, + "mean_token_accuracy": 0.9500371441245079, "step": 356 }, { "epoch": 2.0879765395894427, - "grad_norm": 1.2028870317780662, + "grad_norm": 1.1697605549768837, "learning_rate": 3.7026904954526884e-05, - "loss": 0.1424, - "mean_token_accuracy": 0.9531672671437263, + "loss": 0.1359, + "mean_token_accuracy": 0.9565827995538712, "step": 357 }, { "epoch": 2.093841642228739, - "grad_norm": 1.3273793231793187, + "grad_norm": 1.2020120209369713, "learning_rate": 3.7007999192206676e-05, - "loss": 0.1488, - "mean_token_accuracy": 0.9596338272094727, + "loss": 0.1433, + "mean_token_accuracy": 0.9575584605336189, "step": 358 }, { "epoch": 2.099706744868035, - "grad_norm": 1.1844630480638296, + "grad_norm": 1.144292059243363, "learning_rate": 3.698903895691073e-05, - "loss": 0.17, - "mean_token_accuracy": 0.9442361816763878, + "loss": 0.1611, + "mean_token_accuracy": 0.9512758925557137, "step": 359 }, { "epoch": 2.1055718475073313, - "grad_norm": 1.2591299685961057, + "grad_norm": 1.4488296727051815, "learning_rate": 3.697002431745706e-05, - "loss": 0.1597, - "mean_token_accuracy": 0.9520756751298904, + "loss": 0.1646, + "mean_token_accuracy": 0.9537394344806671, "step": 360 }, { "epoch": 2.1114369501466275, - "grad_norm": 1.1925719447582808, + "grad_norm": 1.311459494656687, "learning_rate": 3.695095534286111e-05, - "loss": 0.1782, - "mean_token_accuracy": 0.9533992558717728, + "loss": 0.1752, + "mean_token_accuracy": 0.9506743401288986, "step": 361 }, { "epoch": 2.1173020527859236, - "grad_norm": 1.2744217950123338, + "grad_norm": 1.1579677983869936, "learning_rate": 3.693183210233557e-05, - "loss": 0.1712, - "mean_token_accuracy": 0.9536459594964981, + "loss": 0.1485, + "mean_token_accuracy": 0.9572027623653412, "step": 362 }, { "epoch": 2.12316715542522, - "grad_norm": 1.306131992732695, + "grad_norm": 1.3670234083261403, "learning_rate": 3.691265466529007e-05, - "loss": 0.1806, - "mean_token_accuracy": 0.9424840211868286, + "loss": 0.1597, + "mean_token_accuracy": 0.9484404474496841, "step": 363 }, { "epoch": 2.129032258064516, - "grad_norm": 1.1582207478757602, + "grad_norm": 1.198713935476129, "learning_rate": 3.689342310133097e-05, - "loss": 0.1433, - "mean_token_accuracy": 0.9588482677936554, + "loss": 0.1412, + "mean_token_accuracy": 0.9606487452983856, "step": 364 }, { "epoch": 2.134897360703812, - "grad_norm": 1.0936038450708818, + "grad_norm": 1.5849456974588698, "learning_rate": 3.687413748026108e-05, - "loss": 0.1545, - "mean_token_accuracy": 0.9539923518896103, + "loss": 0.1752, + "mean_token_accuracy": 0.9527452364563942, "step": 365 }, { "epoch": 2.1407624633431084, - "grad_norm": 1.1150867305503824, + "grad_norm": 1.1717821506314905, "learning_rate": 3.68547978720794e-05, - "loss": 0.1487, - "mean_token_accuracy": 0.9566026851534843, + "loss": 0.1506, + "mean_token_accuracy": 0.9555543065071106, "step": 366 }, { "epoch": 2.1466275659824046, - "grad_norm": 1.2512990373026573, + "grad_norm": 1.2840089722257935, "learning_rate": 3.683540434698093e-05, - "loss": 0.1438, - "mean_token_accuracy": 0.9571522250771523, + "loss": 0.158, + "mean_token_accuracy": 0.9536146819591522, "step": 367 }, { "epoch": 2.1524926686217007, - "grad_norm": 1.1303528659447613, + "grad_norm": 1.3393472371316106, "learning_rate": 3.681595697535629e-05, - "loss": 0.1417, - "mean_token_accuracy": 0.9597708955407143, + "loss": 0.1443, + "mean_token_accuracy": 0.959295816719532, "step": 368 }, { "epoch": 2.158357771260997, - "grad_norm": 1.1288199363197544, + "grad_norm": 1.2426306397717, "learning_rate": 3.6796455827791614e-05, - "loss": 0.1367, - "mean_token_accuracy": 0.9587919190526009, + "loss": 0.1314, + "mean_token_accuracy": 0.9598246365785599, "step": 369 }, { "epoch": 2.164222873900293, - "grad_norm": 1.3375789572979553, + "grad_norm": 1.4126630090119772, "learning_rate": 3.677690097506819e-05, - "loss": 0.1657, - "mean_token_accuracy": 0.952460877597332, + "loss": 0.1659, + "mean_token_accuracy": 0.9537074118852615, "step": 370 }, { "epoch": 2.1700879765395893, - "grad_norm": 1.409154566117624, + "grad_norm": 1.3683138743411407, "learning_rate": 3.6757292488162224e-05, - "loss": 0.1692, - "mean_token_accuracy": 0.9513570293784142, + "loss": 0.166, + "mean_token_accuracy": 0.951523669064045, "step": 371 }, { "epoch": 2.1759530791788855, - "grad_norm": 1.2987241443422721, + "grad_norm": 1.4416888125441545, "learning_rate": 3.673763043824461e-05, - "loss": 0.1854, - "mean_token_accuracy": 0.9414068311452866, + "loss": 0.1892, + "mean_token_accuracy": 0.9457840546965599, "step": 372 }, { "epoch": 2.1818181818181817, - "grad_norm": 1.3236180598652694, + "grad_norm": 1.394980107167206, "learning_rate": 3.671791489668065e-05, - "loss": 0.1627, - "mean_token_accuracy": 0.9539598226547241, + "loss": 0.1634, + "mean_token_accuracy": 0.9523271024227142, "step": 373 }, { "epoch": 2.187683284457478, - "grad_norm": 1.2943073517775734, + "grad_norm": 1.2232114963409995, "learning_rate": 3.6698145935029794e-05, - "loss": 0.1418, - "mean_token_accuracy": 0.9585407078266144, + "loss": 0.1403, + "mean_token_accuracy": 0.9606116563081741, "step": 374 }, { "epoch": 2.193548387096774, - "grad_norm": 1.1789057411236086, + "grad_norm": 1.4676627974529237, "learning_rate": 3.66783236250454e-05, - "loss": 0.1518, - "mean_token_accuracy": 0.953452080488205, + "loss": 0.1568, + "mean_token_accuracy": 0.9536681324243546, "step": 375 }, { "epoch": 2.19941348973607, - "grad_norm": 1.389218125126014, + "grad_norm": 1.454629264354781, "learning_rate": 3.665844803867443e-05, - "loss": 0.1719, - "mean_token_accuracy": 0.9476408511400223, + "loss": 0.1879, + "mean_token_accuracy": 0.948579914867878, "step": 376 }, { "epoch": 2.2052785923753664, - "grad_norm": 1.3030670440092282, + "grad_norm": 1.449142524225023, "learning_rate": 3.663851924805725e-05, - "loss": 0.1798, - "mean_token_accuracy": 0.9471158385276794, + "loss": 0.1754, + "mean_token_accuracy": 0.9484867602586746, "step": 377 }, { "epoch": 2.2111436950146626, - "grad_norm": 1.2566016948623684, + "grad_norm": 1.2527495507514164, "learning_rate": 3.66185373255273e-05, - "loss": 0.166, - "mean_token_accuracy": 0.9486287534236908, + "loss": 0.1642, + "mean_token_accuracy": 0.949437327682972, "step": 378 }, { "epoch": 2.2170087976539588, - "grad_norm": 1.0379347645872854, + "grad_norm": 1.0280532148483927, "learning_rate": 3.6598502343610906e-05, - "loss": 0.1297, - "mean_token_accuracy": 0.9611979499459267, + "loss": 0.1376, + "mean_token_accuracy": 0.9621856659650803, "step": 379 }, { "epoch": 2.222873900293255, - "grad_norm": 1.3188761964469562, + "grad_norm": 1.5389105332308326, "learning_rate": 3.657841437502697e-05, - "loss": 0.2066, - "mean_token_accuracy": 0.9354860931634903, + "loss": 0.2034, + "mean_token_accuracy": 0.9333431124687195, "step": 380 }, { "epoch": 2.228739002932551, - "grad_norm": 1.43705231218096, + "grad_norm": 1.3670480368678344, "learning_rate": 3.6558273492686686e-05, - "loss": 0.1769, - "mean_token_accuracy": 0.946281909942627, + "loss": 0.1718, + "mean_token_accuracy": 0.9464050829410553, "step": 381 }, { "epoch": 2.2346041055718473, - "grad_norm": 1.1828401594348896, + "grad_norm": 1.183493824096973, "learning_rate": 3.6538079769693334e-05, - "loss": 0.1548, - "mean_token_accuracy": 0.9557049721479416, + "loss": 0.1427, + "mean_token_accuracy": 0.9561670497059822, "step": 382 }, { "epoch": 2.2404692082111435, - "grad_norm": 1.1169926579410214, + "grad_norm": 1.0132265198889023, "learning_rate": 3.6517833279341954e-05, - "loss": 0.1304, - "mean_token_accuracy": 0.9621347039937973, + "loss": 0.1216, + "mean_token_accuracy": 0.9647147804498672, "step": 383 }, { "epoch": 2.2463343108504397, - "grad_norm": 1.1389674170183997, + "grad_norm": 1.2169060218138956, "learning_rate": 3.649753409511916e-05, - "loss": 0.1398, - "mean_token_accuracy": 0.9618229940533638, + "loss": 0.1347, + "mean_token_accuracy": 0.9609787911176682, "step": 384 }, { "epoch": 2.252199413489736, - "grad_norm": 1.2539839985656354, + "grad_norm": 1.467320488504986, "learning_rate": 3.6477182290702766e-05, - "loss": 0.1722, - "mean_token_accuracy": 0.9477048069238663, + "loss": 0.1898, + "mean_token_accuracy": 0.945230670273304, "step": 385 }, { "epoch": 2.258064516129032, - "grad_norm": 1.180531998527333, + "grad_norm": 1.3016584136231526, "learning_rate": 3.645677793996161e-05, - "loss": 0.1736, - "mean_token_accuracy": 0.9495566114783287, + "loss": 0.1682, + "mean_token_accuracy": 0.9491082802414894, "step": 386 }, { "epoch": 2.263929618768328, - "grad_norm": 1.2558424458444957, + "grad_norm": 1.285509412490406, "learning_rate": 3.643632111695525e-05, - "loss": 0.1739, - "mean_token_accuracy": 0.9519843608140945, + "loss": 0.1583, + "mean_token_accuracy": 0.9523500502109528, "step": 387 }, { "epoch": 2.2697947214076244, - "grad_norm": 1.178692770955397, + "grad_norm": 1.3664830453923649, "learning_rate": 3.6415811895933685e-05, - "loss": 0.1586, - "mean_token_accuracy": 0.9524863511323929, + "loss": 0.1749, + "mean_token_accuracy": 0.9494151324033737, "step": 388 }, { "epoch": 2.2756598240469206, - "grad_norm": 1.0834395414342137, + "grad_norm": 1.1957263297883518, "learning_rate": 3.639525035133712e-05, - "loss": 0.1353, - "mean_token_accuracy": 0.9634513407945633, + "loss": 0.1299, + "mean_token_accuracy": 0.964408628642559, "step": 389 }, { "epoch": 2.281524926686217, - "grad_norm": 1.2781183197507804, + "grad_norm": 1.4050014432096367, "learning_rate": 3.637463655779563e-05, - "loss": 0.1813, - "mean_token_accuracy": 0.9502886831760406, + "loss": 0.1712, + "mean_token_accuracy": 0.951262354850769, "step": 390 }, { "epoch": 2.2873900293255134, - "grad_norm": 1.1027963255369508, + "grad_norm": 1.2409827279857322, "learning_rate": 3.6353970590128975e-05, - "loss": 0.1387, - "mean_token_accuracy": 0.9600658416748047, + "loss": 0.1535, + "mean_token_accuracy": 0.9573537707328796, "step": 391 }, { "epoch": 2.2932551319648096, - "grad_norm": 1.0503905361604666, + "grad_norm": 1.1811796197757145, "learning_rate": 3.633325252334628e-05, - "loss": 0.1462, - "mean_token_accuracy": 0.9516377374529839, + "loss": 0.1593, + "mean_token_accuracy": 0.9544741660356522, "step": 392 }, { "epoch": 2.2991202346041058, - "grad_norm": 1.3599511368264618, + "grad_norm": 1.4371670107191854, "learning_rate": 3.6312482432645746e-05, - "loss": 0.1947, - "mean_token_accuracy": 0.9380109906196594, + "loss": 0.1796, + "mean_token_accuracy": 0.9452351406216621, "step": 393 }, { "epoch": 2.304985337243402, - "grad_norm": 1.2159860550424557, + "grad_norm": 1.3934811764793158, "learning_rate": 3.6291660393414414e-05, - "loss": 0.1361, - "mean_token_accuracy": 0.9593810513615608, + "loss": 0.1535, + "mean_token_accuracy": 0.9559235870838165, "step": 394 }, { "epoch": 2.310850439882698, - "grad_norm": 1.199526169764911, + "grad_norm": 1.4259479703893028, "learning_rate": 3.6270786481227885e-05, - "loss": 0.1561, - "mean_token_accuracy": 0.9567776471376419, + "loss": 0.1751, + "mean_token_accuracy": 0.9490588009357452, "step": 395 }, { "epoch": 2.3167155425219943, - "grad_norm": 1.2570141761370908, + "grad_norm": 1.3992750205909668, "learning_rate": 3.624986077185003e-05, - "loss": 0.1582, - "mean_token_accuracy": 0.9552087634801865, + "loss": 0.1763, + "mean_token_accuracy": 0.9504696577787399, "step": 396 }, { "epoch": 2.3225806451612905, - "grad_norm": 1.1242395760244872, + "grad_norm": 1.1506772929960787, "learning_rate": 3.622888334123272e-05, - "loss": 0.1484, - "mean_token_accuracy": 0.9601116627454758, + "loss": 0.1445, + "mean_token_accuracy": 0.9598350897431374, "step": 397 }, { "epoch": 2.3284457478005867, - "grad_norm": 1.0414112129082755, + "grad_norm": 1.2826340913795922, "learning_rate": 3.620785426551555e-05, - "loss": 0.1372, - "mean_token_accuracy": 0.9608481675386429, + "loss": 0.154, + "mean_token_accuracy": 0.9541028887033463, "step": 398 }, { "epoch": 2.334310850439883, - "grad_norm": 1.1917239913966557, + "grad_norm": 1.154037385683219, "learning_rate": 3.618677362102558e-05, - "loss": 0.1343, - "mean_token_accuracy": 0.9602387845516205, + "loss": 0.1237, + "mean_token_accuracy": 0.9641017764806747, "step": 399 }, { "epoch": 2.340175953079179, - "grad_norm": 1.4226312444436942, + "grad_norm": 1.5501617825015497, "learning_rate": 3.616564148427703e-05, - "loss": 0.1568, - "mean_token_accuracy": 0.9512313082814217, + "loss": 0.1634, + "mean_token_accuracy": 0.950783371925354, "step": 400 }, { "epoch": 2.346041055718475, - "grad_norm": 1.2485026183547716, + "grad_norm": 1.3857185206015648, "learning_rate": 3.614445793197103e-05, - "loss": 0.1535, - "mean_token_accuracy": 0.9574841260910034, + "loss": 0.1666, + "mean_token_accuracy": 0.9557419568300247, "step": 401 }, { "epoch": 2.3519061583577714, - "grad_norm": 1.166319971957769, + "grad_norm": 1.4872535829714033, "learning_rate": 3.61232230409953e-05, - "loss": 0.1503, - "mean_token_accuracy": 0.9557152092456818, + "loss": 0.1606, + "mean_token_accuracy": 0.9534048959612846, "step": 402 }, { "epoch": 2.3577712609970676, - "grad_norm": 1.2712176223501275, + "grad_norm": 1.3796518663478101, "learning_rate": 3.6101936888423936e-05, - "loss": 0.1664, - "mean_token_accuracy": 0.9563668668270111, + "loss": 0.1766, + "mean_token_accuracy": 0.9547050073742867, "step": 403 }, { "epoch": 2.3636363636363638, - "grad_norm": 1.2891487616197648, + "grad_norm": 1.1751577653503666, "learning_rate": 3.6080599551517076e-05, - "loss": 0.1767, - "mean_token_accuracy": 0.9508347064256668, + "loss": 0.1601, + "mean_token_accuracy": 0.954358346760273, "step": 404 }, { "epoch": 2.36950146627566, - "grad_norm": 1.3686551265636433, + "grad_norm": 1.3861933824324206, "learning_rate": 3.605921110772063e-05, - "loss": 0.1799, - "mean_token_accuracy": 0.9483750611543655, + "loss": 0.17, + "mean_token_accuracy": 0.9526757672429085, "step": 405 }, { "epoch": 2.375366568914956, - "grad_norm": 1.2386733512977435, + "grad_norm": 1.3179185241316302, "learning_rate": 3.603777163466601e-05, - "loss": 0.1483, - "mean_token_accuracy": 0.9566259980201721, + "loss": 0.1599, + "mean_token_accuracy": 0.9561528638005257, "step": 406 }, { "epoch": 2.3812316715542523, - "grad_norm": 1.224756896302551, + "grad_norm": 1.209038746705086, "learning_rate": 3.6016281210169844e-05, - "loss": 0.1653, - "mean_token_accuracy": 0.9489512741565704, + "loss": 0.1734, + "mean_token_accuracy": 0.9518474340438843, "step": 407 }, { "epoch": 2.3870967741935485, - "grad_norm": 1.230326595109465, + "grad_norm": 1.3778876028119813, "learning_rate": 3.599473991223369e-05, - "loss": 0.1672, - "mean_token_accuracy": 0.9534252062439919, + "loss": 0.168, + "mean_token_accuracy": 0.9560379460453987, "step": 408 }, { "epoch": 2.3929618768328447, - "grad_norm": 1.2204737382987605, + "grad_norm": 1.1460376456761232, "learning_rate": 3.5973147819043765e-05, - "loss": 0.1725, - "mean_token_accuracy": 0.9434708282351494, + "loss": 0.1727, + "mean_token_accuracy": 0.9465153738856316, "step": 409 }, { "epoch": 2.398826979472141, - "grad_norm": 1.3957014903180014, + "grad_norm": 1.2468196399221245, "learning_rate": 3.595150500897065e-05, - "loss": 0.1932, - "mean_token_accuracy": 0.939488522708416, + "loss": 0.1954, + "mean_token_accuracy": 0.9417481794953346, "step": 410 }, { "epoch": 2.404692082111437, - "grad_norm": 1.1384701654699962, + "grad_norm": 1.291756928304674, "learning_rate": 3.5929811560569e-05, - "loss": 0.1584, - "mean_token_accuracy": 0.9575295448303223, + "loss": 0.1736, + "mean_token_accuracy": 0.9531151354312897, "step": 411 }, { "epoch": 2.410557184750733, - "grad_norm": 0.9579104530696713, + "grad_norm": 0.9431878971402365, "learning_rate": 3.590806755257726e-05, - "loss": 0.1333, - "mean_token_accuracy": 0.9575985744595528, + "loss": 0.1344, + "mean_token_accuracy": 0.956610195338726, "step": 412 }, { "epoch": 2.4164222873900294, - "grad_norm": 1.1734333582678909, + "grad_norm": 1.1558042696519284, "learning_rate": 3.5886273063917426e-05, - "loss": 0.1621, - "mean_token_accuracy": 0.9481822401285172, + "loss": 0.1494, + "mean_token_accuracy": 0.951548233628273, "step": 413 }, { "epoch": 2.4222873900293256, - "grad_norm": 1.1544037143542194, + "grad_norm": 1.5001844599940708, "learning_rate": 3.586442817369467e-05, - "loss": 0.1548, - "mean_token_accuracy": 0.9499908536672592, + "loss": 0.1635, + "mean_token_accuracy": 0.9464687779545784, "step": 414 }, { "epoch": 2.4281524926686218, - "grad_norm": 1.128417594334492, + "grad_norm": 1.0155866534384104, "learning_rate": 3.5842532961197114e-05, - "loss": 0.1466, - "mean_token_accuracy": 0.955817773938179, + "loss": 0.1273, + "mean_token_accuracy": 0.9602777808904648, "step": 415 }, { "epoch": 2.434017595307918, - "grad_norm": 1.2635081826287935, + "grad_norm": 1.5871630279548767, "learning_rate": 3.582058750589555e-05, - "loss": 0.1821, - "mean_token_accuracy": 0.9489640519022942, + "loss": 0.1898, + "mean_token_accuracy": 0.9440827742218971, "step": 416 }, { "epoch": 2.439882697947214, - "grad_norm": 1.3902977463993567, + "grad_norm": 1.6902190906292036, "learning_rate": 3.579859188744311e-05, - "loss": 0.2305, - "mean_token_accuracy": 0.9296863749623299, + "loss": 0.2219, + "mean_token_accuracy": 0.9334882199764252, "step": 417 }, { "epoch": 2.4457478005865103, - "grad_norm": 1.3034135953208241, + "grad_norm": 1.5061195306058557, "learning_rate": 3.5776546185675014e-05, - "loss": 0.1831, - "mean_token_accuracy": 0.9438828676939011, + "loss": 0.1789, + "mean_token_accuracy": 0.9478380531072617, "step": 418 }, { "epoch": 2.4516129032258065, - "grad_norm": 1.2546254507595782, + "grad_norm": 1.5587729299927646, "learning_rate": 3.5754450480608244e-05, - "loss": 0.1882, - "mean_token_accuracy": 0.9402789622545242, + "loss": 0.1854, + "mean_token_accuracy": 0.9462536200881004, "step": 419 }, { "epoch": 2.4574780058651027, - "grad_norm": 1.1880320034830234, + "grad_norm": 1.4376717265649912, "learning_rate": 3.5732304852441294e-05, - "loss": 0.187, - "mean_token_accuracy": 0.9429607689380646, + "loss": 0.1903, + "mean_token_accuracy": 0.9399577379226685, "step": 420 }, { "epoch": 2.463343108504399, - "grad_norm": 1.3907667610919618, + "grad_norm": 1.6157233488459077, "learning_rate": 3.571010938155386e-05, - "loss": 0.2268, - "mean_token_accuracy": 0.9306723326444626, + "loss": 0.2223, + "mean_token_accuracy": 0.9368812292814255, "step": 421 }, { "epoch": 2.469208211143695, - "grad_norm": 1.3355415014834187, + "grad_norm": 1.5802906558158325, "learning_rate": 3.5687864148506515e-05, - "loss": 0.1807, - "mean_token_accuracy": 0.9420250505208969, + "loss": 0.1722, + "mean_token_accuracy": 0.9450098648667336, "step": 422 }, { "epoch": 2.4750733137829912, - "grad_norm": 1.1013453172706291, + "grad_norm": 1.676334985706252, "learning_rate": 3.566556923404048e-05, - "loss": 0.1451, - "mean_token_accuracy": 0.9562686085700989, + "loss": 0.1412, + "mean_token_accuracy": 0.9557937532663345, "step": 423 }, { "epoch": 2.4809384164222874, - "grad_norm": 1.070627821312052, + "grad_norm": 1.127094474337832, "learning_rate": 3.5643224719077294e-05, - "loss": 0.1409, - "mean_token_accuracy": 0.9597943052649498, + "loss": 0.1393, + "mean_token_accuracy": 0.9604131132364273, "step": 424 }, { "epoch": 2.4868035190615836, - "grad_norm": 1.051505185810034, + "grad_norm": 1.3114913834919428, "learning_rate": 3.5620830684718515e-05, - "loss": 0.1443, - "mean_token_accuracy": 0.956302635371685, + "loss": 0.1644, + "mean_token_accuracy": 0.9522961899638176, "step": 425 }, { "epoch": 2.4926686217008798, - "grad_norm": 1.154259044158569, + "grad_norm": 1.1702325818953316, "learning_rate": 3.5598387212245456e-05, - "loss": 0.1595, - "mean_token_accuracy": 0.9494320005178452, + "loss": 0.1488, + "mean_token_accuracy": 0.9550611451268196, "step": 426 }, { "epoch": 2.498533724340176, - "grad_norm": 1.2471720681070686, + "grad_norm": 1.3403047895969575, "learning_rate": 3.5575894383118846e-05, - "loss": 0.1666, - "mean_token_accuracy": 0.9528159871697426, + "loss": 0.1787, + "mean_token_accuracy": 0.9506218433380127, "step": 427 }, { "epoch": 2.504398826979472, - "grad_norm": 1.1939193991897141, + "grad_norm": 1.2216800382995856, "learning_rate": 3.5553352278978574e-05, - "loss": 0.152, - "mean_token_accuracy": 0.9526803568005562, + "loss": 0.1558, + "mean_token_accuracy": 0.9495013952255249, "step": 428 }, { "epoch": 2.5102639296187683, - "grad_norm": 1.2727638707528373, + "grad_norm": 1.243593690786214, "learning_rate": 3.553076098164337e-05, - "loss": 0.1536, - "mean_token_accuracy": 0.9583421349525452, + "loss": 0.1425, + "mean_token_accuracy": 0.9575006663799286, "step": 429 }, { "epoch": 2.5161290322580645, - "grad_norm": 1.2422536760050964, + "grad_norm": 1.1984832556836194, "learning_rate": 3.5508120573110516e-05, - "loss": 0.1731, - "mean_token_accuracy": 0.9483218640089035, + "loss": 0.1614, + "mean_token_accuracy": 0.9521933421492577, "step": 430 }, { "epoch": 2.5219941348973607, - "grad_norm": 1.2403912167108455, + "grad_norm": 1.190960324660938, "learning_rate": 3.548543113555557e-05, - "loss": 0.1267, - "mean_token_accuracy": 0.9638039022684097, + "loss": 0.1304, + "mean_token_accuracy": 0.9650034531950951, "step": 431 }, { "epoch": 2.527859237536657, - "grad_norm": 1.3152817380517734, + "grad_norm": 1.2663063521365974, "learning_rate": 3.5462692751332014e-05, - "loss": 0.1791, - "mean_token_accuracy": 0.948051743209362, + "loss": 0.1505, + "mean_token_accuracy": 0.9575837999582291, "step": 432 }, { "epoch": 2.533724340175953, - "grad_norm": 1.085975867278954, + "grad_norm": 1.0193172996429314, "learning_rate": 3.5439905502970996e-05, - "loss": 0.1229, - "mean_token_accuracy": 0.9650994911789894, + "loss": 0.1106, + "mean_token_accuracy": 0.9644302129745483, "step": 433 }, { "epoch": 2.5395894428152492, - "grad_norm": 1.149985648209585, + "grad_norm": 1.369428546273513, "learning_rate": 3.541706947318103e-05, - "loss": 0.1543, - "mean_token_accuracy": 0.9525493830442429, + "loss": 0.1645, + "mean_token_accuracy": 0.954724870622158, "step": 434 }, { "epoch": 2.5454545454545454, - "grad_norm": 1.470381191104711, + "grad_norm": 1.483217761683716, "learning_rate": 3.539418474484768e-05, - "loss": 0.2024, - "mean_token_accuracy": 0.9437252059578896, + "loss": 0.1845, + "mean_token_accuracy": 0.9467028453946114, "step": 435 }, { "epoch": 2.5513196480938416, - "grad_norm": 1.12497151321389, + "grad_norm": 1.4574276394557277, "learning_rate": 3.537125140103327e-05, - "loss": 0.1568, - "mean_token_accuracy": 0.9551517963409424, + "loss": 0.1648, + "mean_token_accuracy": 0.9520954340696335, "step": 436 }, { "epoch": 2.557184750733138, - "grad_norm": 1.1347803729002162, + "grad_norm": 1.1343411307366602, "learning_rate": 3.534826952497657e-05, - "loss": 0.1281, - "mean_token_accuracy": 0.9624199569225311, + "loss": 0.1405, + "mean_token_accuracy": 0.9605788439512253, "step": 437 }, { "epoch": 2.563049853372434, - "grad_norm": 1.1951246836206368, + "grad_norm": 1.514912796637678, "learning_rate": 3.5325239200092505e-05, - "loss": 0.1647, - "mean_token_accuracy": 0.9496383666992188, + "loss": 0.1757, + "mean_token_accuracy": 0.9462239667773247, "step": 438 }, { "epoch": 2.56891495601173, - "grad_norm": 1.1809247397642806, + "grad_norm": 1.18115144712489, "learning_rate": 3.5302160509971866e-05, - "loss": 0.172, - "mean_token_accuracy": 0.9477546736598015, + "loss": 0.1685, + "mean_token_accuracy": 0.9481217116117477, "step": 439 }, { "epoch": 2.5747800586510263, - "grad_norm": 1.1596906219556977, + "grad_norm": 1.4046582203069407, "learning_rate": 3.5279033538380974e-05, - "loss": 0.1639, - "mean_token_accuracy": 0.9496021196246147, + "loss": 0.1703, + "mean_token_accuracy": 0.951741062104702, "step": 440 }, { "epoch": 2.5806451612903225, - "grad_norm": 0.9920483376297324, + "grad_norm": 1.0803369653160437, "learning_rate": 3.5255858369261385e-05, - "loss": 0.1189, - "mean_token_accuracy": 0.9629802703857422, + "loss": 0.1139, + "mean_token_accuracy": 0.9662863165140152, "step": 441 }, { "epoch": 2.5865102639296187, - "grad_norm": 1.367720649975484, + "grad_norm": 1.5965941102064563, "learning_rate": 3.523263508672961e-05, - "loss": 0.1885, - "mean_token_accuracy": 0.9483638033270836, + "loss": 0.1896, + "mean_token_accuracy": 0.9497494995594025, "step": 442 }, { "epoch": 2.592375366568915, - "grad_norm": 1.1849168973571333, + "grad_norm": 1.1603543822071827, "learning_rate": 3.520936377507679e-05, - "loss": 0.1537, - "mean_token_accuracy": 0.9526606574654579, + "loss": 0.1526, + "mean_token_accuracy": 0.9524945765733719, "step": 443 }, { "epoch": 2.598240469208211, - "grad_norm": 1.3702847278864858, + "grad_norm": 1.4268486498262112, "learning_rate": 3.5186044518768376e-05, - "loss": 0.2026, - "mean_token_accuracy": 0.9368810132145882, + "loss": 0.1856, + "mean_token_accuracy": 0.9400829747319221, "step": 444 }, { "epoch": 2.6041055718475072, - "grad_norm": 1.4593973761132104, + "grad_norm": 1.3811688946855898, "learning_rate": 3.5162677402443864e-05, - "loss": 0.1669, - "mean_token_accuracy": 0.9492918252944946, + "loss": 0.1716, + "mean_token_accuracy": 0.9469016790390015, "step": 445 }, { "epoch": 2.6099706744868034, - "grad_norm": 1.1882833288447554, + "grad_norm": 1.3500359157080903, "learning_rate": 3.513926251091644e-05, - "loss": 0.1538, - "mean_token_accuracy": 0.9531656056642532, + "loss": 0.168, + "mean_token_accuracy": 0.9493364244699478, "step": 446 }, { "epoch": 2.6158357771260996, - "grad_norm": 1.2494601852431129, + "grad_norm": 1.2057292019423687, "learning_rate": 3.51157999291727e-05, - "loss": 0.1726, - "mean_token_accuracy": 0.9487390294671059, + "loss": 0.1643, + "mean_token_accuracy": 0.9480642750859261, "step": 447 }, { "epoch": 2.621700879765396, - "grad_norm": 1.4261144649471686, + "grad_norm": 1.6243160945531778, "learning_rate": 3.509228974237235e-05, - "loss": 0.2139, - "mean_token_accuracy": 0.9373802468180656, + "loss": 0.2073, + "mean_token_accuracy": 0.9356926083564758, "step": 448 }, { "epoch": 2.627565982404692, - "grad_norm": 1.20200077833905, + "grad_norm": 1.2687211559386322, "learning_rate": 3.506873203584787e-05, - "loss": 0.16, - "mean_token_accuracy": 0.9530724361538887, + "loss": 0.1671, + "mean_token_accuracy": 0.9515552073717117, "step": 449 }, { "epoch": 2.633431085043988, - "grad_norm": 0.9474239286630374, + "grad_norm": 0.9855010264474293, "learning_rate": 3.504512689510422e-05, - "loss": 0.1243, - "mean_token_accuracy": 0.9638230577111244, + "loss": 0.124, + "mean_token_accuracy": 0.9672811701893806, "step": 450 }, { "epoch": 2.6392961876832843, - "grad_norm": 1.1867300274109467, + "grad_norm": 1.1552540803707831, "learning_rate": 3.5021474405818525e-05, - "loss": 0.1468, - "mean_token_accuracy": 0.9534016251564026, + "loss": 0.1327, + "mean_token_accuracy": 0.9567637592554092, "step": 451 }, { "epoch": 2.6451612903225805, - "grad_norm": 1.249095861980521, + "grad_norm": 1.378193372806992, "learning_rate": 3.499777465383977e-05, - "loss": 0.1821, - "mean_token_accuracy": 0.9496468231081963, + "loss": 0.1734, + "mean_token_accuracy": 0.9506984949111938, "step": 452 }, { "epoch": 2.6510263929618767, - "grad_norm": 1.2012362608906981, + "grad_norm": 1.425693391123194, "learning_rate": 3.497402772518848e-05, - "loss": 0.183, - "mean_token_accuracy": 0.9402816966176033, + "loss": 0.1919, + "mean_token_accuracy": 0.9396786019206047, "step": 453 }, { "epoch": 2.656891495601173, - "grad_norm": 1.1225945446252723, + "grad_norm": 1.2031173797533838, "learning_rate": 3.4950233706056415e-05, - "loss": 0.1433, - "mean_token_accuracy": 0.9563997834920883, + "loss": 0.1421, + "mean_token_accuracy": 0.9579765349626541, "step": 454 }, { "epoch": 2.662756598240469, - "grad_norm": 1.3042750039691624, + "grad_norm": 1.470261873197383, "learning_rate": 3.4926392682806265e-05, - "loss": 0.1905, - "mean_token_accuracy": 0.9459337666630745, + "loss": 0.1859, + "mean_token_accuracy": 0.9434613659977913, "step": 455 }, { "epoch": 2.6686217008797652, - "grad_norm": 1.232909377449062, + "grad_norm": 1.366478758555936, "learning_rate": 3.490250474197131e-05, - "loss": 0.1782, - "mean_token_accuracy": 0.9491490572690964, + "loss": 0.1796, + "mean_token_accuracy": 0.9481052234768867, "step": 456 }, { "epoch": 2.6744868035190614, - "grad_norm": 1.1867787570349237, + "grad_norm": 1.1912300344703033, "learning_rate": 3.4878569970255116e-05, - "loss": 0.1541, - "mean_token_accuracy": 0.9519700258970261, + "loss": 0.146, + "mean_token_accuracy": 0.9556678831577301, "step": 457 }, { "epoch": 2.6803519061583576, - "grad_norm": 1.247851879583845, + "grad_norm": 1.254578474168094, "learning_rate": 3.485458845453125e-05, - "loss": 0.1759, - "mean_token_accuracy": 0.9479285329580307, + "loss": 0.1638, + "mean_token_accuracy": 0.9488549754023552, "step": 458 }, { "epoch": 2.686217008797654, - "grad_norm": 1.0354665171969741, + "grad_norm": 1.1552424595084878, "learning_rate": 3.483056028184293e-05, - "loss": 0.1247, - "mean_token_accuracy": 0.9654245972633362, + "loss": 0.1226, + "mean_token_accuracy": 0.9643560871481895, "step": 459 }, { "epoch": 2.6920821114369504, - "grad_norm": 1.1760698781621217, + "grad_norm": 1.222050122616804, "learning_rate": 3.4806485539402716e-05, - "loss": 0.1507, - "mean_token_accuracy": 0.9528908804059029, + "loss": 0.1702, + "mean_token_accuracy": 0.9485805332660675, "step": 460 }, { "epoch": 2.6979472140762466, - "grad_norm": 1.0691808653279993, + "grad_norm": 1.04173325799313, "learning_rate": 3.4782364314592186e-05, - "loss": 0.1437, - "mean_token_accuracy": 0.9560307934880257, + "loss": 0.1315, + "mean_token_accuracy": 0.9575633853673935, "step": 461 }, { "epoch": 2.703812316715543, - "grad_norm": 1.17729816637266, + "grad_norm": 1.2620647522862372, "learning_rate": 3.475819669496167e-05, - "loss": 0.1363, - "mean_token_accuracy": 0.9557816758751869, + "loss": 0.1386, + "mean_token_accuracy": 0.9566176012158394, "step": 462 }, { "epoch": 2.709677419354839, - "grad_norm": 1.0634781787620604, + "grad_norm": 1.1878640456791556, "learning_rate": 3.473398276822985e-05, - "loss": 0.1467, - "mean_token_accuracy": 0.9533882141113281, + "loss": 0.1583, + "mean_token_accuracy": 0.9529377073049545, "step": 463 }, { "epoch": 2.715542521994135, - "grad_norm": 1.2950839756637311, + "grad_norm": 1.417678695883431, "learning_rate": 3.47097226222835e-05, - "loss": 0.1824, - "mean_token_accuracy": 0.9495379999279976, + "loss": 0.1802, + "mean_token_accuracy": 0.948061428964138, "step": 464 }, { "epoch": 2.7214076246334313, - "grad_norm": 1.2423471841197542, + "grad_norm": 1.3277295409980292, "learning_rate": 3.468541634517716e-05, - "loss": 0.155, - "mean_token_accuracy": 0.9580182358622551, + "loss": 0.1663, + "mean_token_accuracy": 0.9515725001692772, "step": 465 }, { "epoch": 2.7272727272727275, - "grad_norm": 1.0744705318053995, + "grad_norm": 1.0754314703112438, "learning_rate": 3.4661064025132796e-05, - "loss": 0.1206, - "mean_token_accuracy": 0.958877831697464, + "loss": 0.1313, + "mean_token_accuracy": 0.9582964032888412, "step": 466 }, { "epoch": 2.7331378299120237, - "grad_norm": 1.5727806483902393, + "grad_norm": 1.4666907651133738, "learning_rate": 3.463666575053949e-05, - "loss": 0.2087, - "mean_token_accuracy": 0.9415034204721451, + "loss": 0.2119, + "mean_token_accuracy": 0.9404854699969292, "step": 467 }, { "epoch": 2.73900293255132, - "grad_norm": 0.950722552564766, + "grad_norm": 0.9664273130964783, "learning_rate": 3.4612221609953126e-05, - "loss": 0.1352, - "mean_token_accuracy": 0.961692214012146, + "loss": 0.123, + "mean_token_accuracy": 0.9659903347492218, "step": 468 }, { "epoch": 2.744868035190616, - "grad_norm": 1.1223316535506394, + "grad_norm": 1.1357008791720598, "learning_rate": 3.4587731692096065e-05, - "loss": 0.1565, - "mean_token_accuracy": 0.9538895487785339, + "loss": 0.1531, + "mean_token_accuracy": 0.9576428085565567, "step": 469 }, { "epoch": 2.7507331378299122, - "grad_norm": 1.3022612967089928, + "grad_norm": 1.3120260141761082, "learning_rate": 3.4563196085856815e-05, - "loss": 0.1817, - "mean_token_accuracy": 0.9450778216123581, + "loss": 0.1869, + "mean_token_accuracy": 0.9426525309681892, "step": 470 }, { "epoch": 2.7565982404692084, - "grad_norm": 1.07230491048182, + "grad_norm": 1.3740403971580124, "learning_rate": 3.4538614880289724e-05, - "loss": 0.1615, - "mean_token_accuracy": 0.9541483297944069, + "loss": 0.2036, + "mean_token_accuracy": 0.9509467408061028, "step": 471 }, { "epoch": 2.7624633431085046, - "grad_norm": 0.9956217514306797, + "grad_norm": 1.0963128912837212, "learning_rate": 3.4513988164614635e-05, - "loss": 0.1227, - "mean_token_accuracy": 0.9635503962635994, + "loss": 0.1398, + "mean_token_accuracy": 0.9593295454978943, "step": 472 }, { "epoch": 2.768328445747801, - "grad_norm": 1.0071708374540242, + "grad_norm": 1.1019956064171137, "learning_rate": 3.4489316028216584e-05, - "loss": 0.1317, - "mean_token_accuracy": 0.9630229771137238, + "loss": 0.1247, + "mean_token_accuracy": 0.961910292506218, "step": 473 }, { "epoch": 2.774193548387097, - "grad_norm": 0.9934398270519144, + "grad_norm": 1.0889658684678445, "learning_rate": 3.446459856064545e-05, - "loss": 0.1364, - "mean_token_accuracy": 0.9594153240323067, + "loss": 0.1416, + "mean_token_accuracy": 0.9593067914247513, "step": 474 }, { "epoch": 2.780058651026393, - "grad_norm": 1.4123399599829125, + "grad_norm": 1.5364222822362106, "learning_rate": 3.443983585161568e-05, - "loss": 0.1758, - "mean_token_accuracy": 0.9463600069284439, + "loss": 0.2065, + "mean_token_accuracy": 0.9389885291457176, "step": 475 }, { "epoch": 2.7859237536656893, - "grad_norm": 1.2042742464006473, + "grad_norm": 1.2105587532674789, "learning_rate": 3.441502799100588e-05, - "loss": 0.1598, - "mean_token_accuracy": 0.959763303399086, + "loss": 0.1433, + "mean_token_accuracy": 0.95948126912117, "step": 476 }, { "epoch": 2.7917888563049855, - "grad_norm": 1.1516859677521762, + "grad_norm": 1.2951438817508967, "learning_rate": 3.439017506885858e-05, - "loss": 0.1616, - "mean_token_accuracy": 0.9520630687475204, + "loss": 0.1646, + "mean_token_accuracy": 0.9550016522407532, "step": 477 }, { "epoch": 2.7976539589442817, - "grad_norm": 1.2254942875301806, + "grad_norm": 1.311465052855291, "learning_rate": 3.436527717537985e-05, - "loss": 0.1617, - "mean_token_accuracy": 0.9591241255402565, + "loss": 0.1637, + "mean_token_accuracy": 0.9561468809843063, "step": 478 }, { "epoch": 2.803519061583578, - "grad_norm": 1.1073823095384847, + "grad_norm": 1.2479330923352838, "learning_rate": 3.434033440093899e-05, - "loss": 0.1639, - "mean_token_accuracy": 0.9487877935171127, + "loss": 0.1831, + "mean_token_accuracy": 0.9408740624785423, "step": 479 }, { "epoch": 2.809384164222874, - "grad_norm": 1.25742736357608, + "grad_norm": 1.418713289265348, "learning_rate": 3.431534683606818e-05, - "loss": 0.1864, - "mean_token_accuracy": 0.948534868657589, + "loss": 0.1842, + "mean_token_accuracy": 0.9517565071582794, "step": 480 }, { "epoch": 2.8152492668621703, - "grad_norm": 1.0344032720648466, + "grad_norm": 1.080464369466574, "learning_rate": 3.4290314571462214e-05, - "loss": 0.1417, - "mean_token_accuracy": 0.9601919278502464, + "loss": 0.149, + "mean_token_accuracy": 0.9588718563318253, "step": 481 }, { "epoch": 2.8211143695014664, - "grad_norm": 0.9930463384326542, + "grad_norm": 1.0261580719738286, "learning_rate": 3.426523769797808e-05, - "loss": 0.1359, - "mean_token_accuracy": 0.9583753347396851, + "loss": 0.1343, + "mean_token_accuracy": 0.9594571813941002, "step": 482 }, { "epoch": 2.8269794721407626, - "grad_norm": 1.2042277956276288, + "grad_norm": 1.256975848089233, "learning_rate": 3.424011630663472e-05, - "loss": 0.1653, - "mean_token_accuracy": 0.9464479833841324, + "loss": 0.1617, + "mean_token_accuracy": 0.9473337456583977, "step": 483 }, { "epoch": 2.832844574780059, - "grad_norm": 1.194262739676029, + "grad_norm": 1.1887187248938638, "learning_rate": 3.421495048861262e-05, - "loss": 0.171, - "mean_token_accuracy": 0.9502200856804848, + "loss": 0.1579, + "mean_token_accuracy": 0.9534508436918259, "step": 484 }, { "epoch": 2.838709677419355, - "grad_norm": 1.1544939224845419, + "grad_norm": 1.108974947781916, "learning_rate": 3.418974033525355e-05, - "loss": 0.1409, - "mean_token_accuracy": 0.9586869552731514, + "loss": 0.1456, + "mean_token_accuracy": 0.9567419737577438, "step": 485 }, { "epoch": 2.844574780058651, - "grad_norm": 1.187689984945221, + "grad_norm": 1.3884218537085193, "learning_rate": 3.416448593806019e-05, - "loss": 0.1751, - "mean_token_accuracy": 0.9496021121740341, + "loss": 0.1734, + "mean_token_accuracy": 0.9510295391082764, "step": 486 }, { "epoch": 2.8504398826979473, - "grad_norm": 1.1330699826268225, + "grad_norm": 1.265301003883797, "learning_rate": 3.4139187388695774e-05, - "loss": 0.1551, - "mean_token_accuracy": 0.950385794043541, + "loss": 0.152, + "mean_token_accuracy": 0.9507285058498383, "step": 487 }, { "epoch": 2.8563049853372435, - "grad_norm": 1.307262909941581, + "grad_norm": 1.4518823332516295, "learning_rate": 3.411384477898385e-05, - "loss": 0.1655, - "mean_token_accuracy": 0.9537685662508011, + "loss": 0.1708, + "mean_token_accuracy": 0.9520754367113113, "step": 488 }, { "epoch": 2.8621700879765397, - "grad_norm": 1.0577486778486105, + "grad_norm": 1.3170349542649433, "learning_rate": 3.408845820090784e-05, - "loss": 0.1544, - "mean_token_accuracy": 0.9556203186511993, + "loss": 0.1505, + "mean_token_accuracy": 0.9542975649237633, "step": 489 }, { "epoch": 2.868035190615836, - "grad_norm": 1.246360384808443, + "grad_norm": 1.4862687077718277, "learning_rate": 3.406302774661077e-05, - "loss": 0.1983, - "mean_token_accuracy": 0.939469151198864, + "loss": 0.1879, + "mean_token_accuracy": 0.9435409009456635, "step": 490 }, { "epoch": 2.873900293255132, - "grad_norm": 1.2815397210697366, + "grad_norm": 1.5510248888013005, "learning_rate": 3.403755350839492e-05, - "loss": 0.1934, - "mean_token_accuracy": 0.945215106010437, + "loss": 0.1953, + "mean_token_accuracy": 0.9470630958676338, "step": 491 }, { "epoch": 2.8797653958944283, - "grad_norm": 0.9463210261831989, + "grad_norm": 0.9083867690185883, "learning_rate": 3.401203557872149e-05, - "loss": 0.1094, - "mean_token_accuracy": 0.9667675942182541, + "loss": 0.1102, + "mean_token_accuracy": 0.9688790738582611, "step": 492 }, { "epoch": 2.8856304985337244, - "grad_norm": 1.1206820564307887, + "grad_norm": 1.20388441326265, "learning_rate": 3.398647405021026e-05, - "loss": 0.1511, - "mean_token_accuracy": 0.9558945000171661, + "loss": 0.157, + "mean_token_accuracy": 0.9560961946845055, "step": 493 }, { "epoch": 2.8914956011730206, - "grad_norm": 1.3956556588522069, + "grad_norm": 1.6131126124579498, "learning_rate": 3.396086901563925e-05, - "loss": 0.2011, - "mean_token_accuracy": 0.9396672174334526, + "loss": 0.2093, + "mean_token_accuracy": 0.9383601620793343, "step": 494 }, { "epoch": 2.897360703812317, - "grad_norm": 0.9893183505790021, + "grad_norm": 1.1132311008689093, "learning_rate": 3.3935220567944395e-05, - "loss": 0.1346, - "mean_token_accuracy": 0.9598894119262695, + "loss": 0.1368, + "mean_token_accuracy": 0.9590755850076675, "step": 495 }, { "epoch": 2.903225806451613, - "grad_norm": 1.2616449700071117, + "grad_norm": 1.388157178476272, "learning_rate": 3.39095288002192e-05, - "loss": 0.195, - "mean_token_accuracy": 0.9443835839629173, + "loss": 0.1933, + "mean_token_accuracy": 0.9421152919530869, "step": 496 }, { "epoch": 2.909090909090909, - "grad_norm": 1.0424261885719452, + "grad_norm": 1.2419877458073016, "learning_rate": 3.3883793805714406e-05, - "loss": 0.1502, - "mean_token_accuracy": 0.9556261077523232, + "loss": 0.1441, + "mean_token_accuracy": 0.95684465020895, "step": 497 }, { "epoch": 2.9149560117302054, - "grad_norm": 1.3618750738536685, + "grad_norm": 1.2687959153746118, "learning_rate": 3.3858015677837656e-05, - "loss": 0.18, - "mean_token_accuracy": 0.9481714516878128, + "loss": 0.1952, + "mean_token_accuracy": 0.9452904909849167, "step": 498 }, { "epoch": 2.9208211143695015, - "grad_norm": 1.2006128437590662, + "grad_norm": 1.3664534250202576, "learning_rate": 3.3832194510153126e-05, - "loss": 0.1718, - "mean_token_accuracy": 0.9532595574855804, + "loss": 0.1753, + "mean_token_accuracy": 0.9477646723389626, "step": 499 }, { "epoch": 2.9266862170087977, - "grad_norm": 1.2296015739991573, + "grad_norm": 1.2649573147861883, "learning_rate": 3.380633039638125e-05, - "loss": 0.164, - "mean_token_accuracy": 0.9541523456573486, + "loss": 0.1555, + "mean_token_accuracy": 0.9552437961101532, "step": 500 }, { "epoch": 2.932551319648094, - "grad_norm": 1.2323869227106137, + "grad_norm": 1.461470546166254, "learning_rate": 3.37804234303983e-05, - "loss": 0.1802, - "mean_token_accuracy": 0.946508027613163, + "loss": 0.1846, + "mean_token_accuracy": 0.9439187347888947, "step": 501 }, { "epoch": 2.93841642228739, - "grad_norm": 1.2142191669529525, + "grad_norm": 1.3620091302607829, "learning_rate": 3.37544737062361e-05, - "loss": 0.1738, - "mean_token_accuracy": 0.9512768238782883, + "loss": 0.1803, + "mean_token_accuracy": 0.9506632015109062, "step": 502 }, { "epoch": 2.9442815249266863, - "grad_norm": 1.192114759053001, + "grad_norm": 1.3089948493246146, "learning_rate": 3.372848131808167e-05, - "loss": 0.1685, - "mean_token_accuracy": 0.9535468518733978, + "loss": 0.1635, + "mean_token_accuracy": 0.9532084316015244, "step": 503 }, { "epoch": 2.9501466275659824, - "grad_norm": 1.2900442027550065, + "grad_norm": 1.4021607966733816, "learning_rate": 3.370244636027688e-05, - "loss": 0.1652, - "mean_token_accuracy": 0.9492153376340866, + "loss": 0.1589, + "mean_token_accuracy": 0.9532099440693855, "step": 504 }, { "epoch": 2.9560117302052786, - "grad_norm": 1.4383269282382705, + "grad_norm": 1.273314099966914, "learning_rate": 3.367636892731812e-05, - "loss": 0.1692, - "mean_token_accuracy": 0.9460426717996597, + "loss": 0.1536, + "mean_token_accuracy": 0.9473722502589226, "step": 505 }, { "epoch": 2.961876832844575, - "grad_norm": 0.965502377956891, + "grad_norm": 1.2111921824318914, "learning_rate": 3.365024911385593e-05, - "loss": 0.1286, - "mean_token_accuracy": 0.963849164545536, + "loss": 0.1473, + "mean_token_accuracy": 0.9612649232149124, "step": 506 }, { "epoch": 2.967741935483871, - "grad_norm": 1.0151333575737922, + "grad_norm": 1.1166189528554271, "learning_rate": 3.362408701469469e-05, - "loss": 0.1506, - "mean_token_accuracy": 0.9497946873307228, + "loss": 0.1512, + "mean_token_accuracy": 0.9526806995272636, "step": 507 }, { "epoch": 2.973607038123167, - "grad_norm": 1.2314813082134561, + "grad_norm": 1.536757210363506, "learning_rate": 3.359788272479225e-05, - "loss": 0.1796, - "mean_token_accuracy": 0.9487903341650963, + "loss": 0.1924, + "mean_token_accuracy": 0.9464055821299553, "step": 508 }, { "epoch": 2.9794721407624634, - "grad_norm": 1.2284872298249723, + "grad_norm": 1.2454395159603193, "learning_rate": 3.35716363392596e-05, - "loss": 0.1735, - "mean_token_accuracy": 0.946588970720768, + "loss": 0.1672, + "mean_token_accuracy": 0.9483352527022362, "step": 509 }, { "epoch": 2.9853372434017595, - "grad_norm": 1.3942128527138467, + "grad_norm": 1.3846359032452442, "learning_rate": 3.354534795336052e-05, - "loss": 0.2326, - "mean_token_accuracy": 0.9368415027856827, + "loss": 0.2416, + "mean_token_accuracy": 0.9301293268799782, "step": 510 }, { "epoch": 2.9912023460410557, - "grad_norm": 1.0199724160589165, + "grad_norm": 1.0907635693009343, "learning_rate": 3.351901766251123e-05, - "loss": 0.1612, - "mean_token_accuracy": 0.9494053423404694, + "loss": 0.1584, + "mean_token_accuracy": 0.9516775086522102, "step": 511 }, { "epoch": 2.997067448680352, - "grad_norm": 1.4236896466420417, + "grad_norm": 1.52351732920836, "learning_rate": 3.349264556228006e-05, - "loss": 0.2099, - "mean_token_accuracy": 0.9403877630829811, + "loss": 0.2106, + "mean_token_accuracy": 0.9392689317464828, "step": 512 }, { "epoch": 3.0, - "grad_norm": 1.4236896466420417, + "grad_norm": 1.52351732920836, "learning_rate": 3.3466231748387077e-05, - "loss": 0.2159, - "mean_token_accuracy": 0.929660826921463, + "loss": 0.2709, + "mean_token_accuracy": 0.9210451692342758, "step": 513 }, { "epoch": 3.005865102639296, - "grad_norm": 1.917706001626474, + "grad_norm": 1.968651229970697, "learning_rate": 3.343977631670376e-05, - "loss": 0.0928, - "mean_token_accuracy": 0.9745519906282425, + "loss": 0.1029, + "mean_token_accuracy": 0.9711232706904411, "step": 514 }, { "epoch": 3.0117302052785924, - "grad_norm": 0.9353206453623386, + "grad_norm": 0.9652696996950184, "learning_rate": 3.341327936325264e-05, - "loss": 0.1087, - "mean_token_accuracy": 0.9702077433466911, + "loss": 0.1143, + "mean_token_accuracy": 0.9680498614907265, "step": 515 }, { "epoch": 3.0175953079178885, - "grad_norm": 0.793270262682745, + "grad_norm": 0.8502990782157536, "learning_rate": 3.338674098420695e-05, - "loss": 0.0927, - "mean_token_accuracy": 0.9723505601286888, + "loss": 0.0908, + "mean_token_accuracy": 0.9716523513197899, "step": 516 }, { "epoch": 3.0234604105571847, - "grad_norm": 0.9442060709437615, + "grad_norm": 0.7630744145217659, "learning_rate": 3.33601612758903e-05, - "loss": 0.1154, - "mean_token_accuracy": 0.9676948711276054, + "loss": 0.0976, + "mean_token_accuracy": 0.970447264611721, "step": 517 }, { "epoch": 3.029325513196481, - "grad_norm": 0.9089470810917948, + "grad_norm": 0.81504254903722, "learning_rate": 3.3333540334776286e-05, - "loss": 0.1085, - "mean_token_accuracy": 0.9652741998434067, + "loss": 0.1127, + "mean_token_accuracy": 0.9646006375551224, "step": 518 }, { "epoch": 3.035190615835777, - "grad_norm": 0.9535344298120216, + "grad_norm": 0.9343172049366141, "learning_rate": 3.330687825748818e-05, - "loss": 0.0966, - "mean_token_accuracy": 0.9707278311252594, + "loss": 0.1117, + "mean_token_accuracy": 0.9674835652112961, "step": 519 }, { "epoch": 3.0410557184750733, - "grad_norm": 1.0173869551440817, + "grad_norm": 1.1971850042909595, "learning_rate": 3.328017514079855e-05, - "loss": 0.119, - "mean_token_accuracy": 0.9670073837041855, + "loss": 0.1253, + "mean_token_accuracy": 0.9672354459762573, "step": 520 }, { "epoch": 3.0469208211143695, - "grad_norm": 0.7792951102956577, + "grad_norm": 1.0563212992287103, "learning_rate": 3.325343108162893e-05, - "loss": 0.0937, - "mean_token_accuracy": 0.9708864092826843, + "loss": 0.0988, + "mean_token_accuracy": 0.9684354662895203, "step": 521 }, { "epoch": 3.0527859237536656, - "grad_norm": 0.945465442856844, + "grad_norm": 1.0716926109268614, "learning_rate": 3.3226646177049446e-05, - "loss": 0.115, - "mean_token_accuracy": 0.9687144085764885, + "loss": 0.1124, + "mean_token_accuracy": 0.9679635167121887, "step": 522 }, { "epoch": 3.058651026392962, - "grad_norm": 0.9843530411347534, + "grad_norm": 1.1051242844259859, "learning_rate": 3.3199820524278485e-05, - "loss": 0.1165, - "mean_token_accuracy": 0.9642806574702263, + "loss": 0.115, + "mean_token_accuracy": 0.9673113599419594, "step": 523 }, { "epoch": 3.064516129032258, - "grad_norm": 1.084726474246644, + "grad_norm": 1.106421479204272, "learning_rate": 3.317295422068234e-05, - "loss": 0.1176, - "mean_token_accuracy": 0.965855173766613, + "loss": 0.1172, + "mean_token_accuracy": 0.9651734456419945, "step": 524 }, { "epoch": 3.070381231671554, - "grad_norm": 0.9445207089649317, + "grad_norm": 1.1415664510277006, "learning_rate": 3.314604736377484e-05, - "loss": 0.0873, - "mean_token_accuracy": 0.9730576723814011, + "loss": 0.1021, + "mean_token_accuracy": 0.9700004607439041, "step": 525 }, { "epoch": 3.0762463343108504, - "grad_norm": 0.8091369892946971, + "grad_norm": 0.9404143331641928, "learning_rate": 3.3119100051217005e-05, - "loss": 0.0891, - "mean_token_accuracy": 0.9751559272408485, + "loss": 0.0886, + "mean_token_accuracy": 0.975829653441906, "step": 526 }, { "epoch": 3.0821114369501466, - "grad_norm": 0.94656315451539, + "grad_norm": 1.1187988373557514, "learning_rate": 3.3092112380816696e-05, - "loss": 0.104, - "mean_token_accuracy": 0.9682408720254898, + "loss": 0.1185, + "mean_token_accuracy": 0.9644615799188614, "step": 527 }, { "epoch": 3.0879765395894427, - "grad_norm": 0.8332801420075987, + "grad_norm": 1.0020657220144769, "learning_rate": 3.306508445052826e-05, - "loss": 0.1142, - "mean_token_accuracy": 0.9660920351743698, + "loss": 0.1211, + "mean_token_accuracy": 0.964887946844101, "step": 528 }, { "epoch": 3.093841642228739, - "grad_norm": 1.2155753046929505, + "grad_norm": 1.2143706108607386, "learning_rate": 3.303801635845216e-05, - "loss": 0.11, - "mean_token_accuracy": 0.9683258086442947, + "loss": 0.113, + "mean_token_accuracy": 0.9642840325832367, "step": 529 }, { "epoch": 3.099706744868035, - "grad_norm": 1.0637098841625154, + "grad_norm": 0.9975861638054916, "learning_rate": 3.301090820283465e-05, - "loss": 0.1197, - "mean_token_accuracy": 0.9640811383724213, + "loss": 0.1149, + "mean_token_accuracy": 0.9669056162238121, "step": 530 }, { "epoch": 3.1055718475073313, - "grad_norm": 1.0473569018050832, + "grad_norm": 1.0802501908744115, "learning_rate": 3.298376008206739e-05, - "loss": 0.111, - "mean_token_accuracy": 0.9675813242793083, + "loss": 0.1124, + "mean_token_accuracy": 0.9703796878457069, "step": 531 }, { "epoch": 3.1114369501466275, - "grad_norm": 0.6922733863108391, + "grad_norm": 0.8021624899339075, "learning_rate": 3.295657209468707e-05, - "loss": 0.0863, - "mean_token_accuracy": 0.9753805994987488, + "loss": 0.0943, + "mean_token_accuracy": 0.9735233038663864, "step": 532 }, { "epoch": 3.1173020527859236, - "grad_norm": 0.922894718374272, + "grad_norm": 0.9633823163777017, "learning_rate": 3.2929344339375125e-05, - "loss": 0.1141, - "mean_token_accuracy": 0.9667019098997116, + "loss": 0.1189, + "mean_token_accuracy": 0.9634987413883209, "step": 533 }, { "epoch": 3.12316715542522, - "grad_norm": 1.0319575218914387, + "grad_norm": 0.9351429992484981, "learning_rate": 3.290207691495731e-05, - "loss": 0.1078, - "mean_token_accuracy": 0.969107136130333, + "loss": 0.1223, + "mean_token_accuracy": 0.9647831618785858, "step": 534 }, { "epoch": 3.129032258064516, - "grad_norm": 0.9278719667767502, + "grad_norm": 1.0205424257089983, "learning_rate": 3.2874769920403355e-05, - "loss": 0.1026, - "mean_token_accuracy": 0.9660426154732704, + "loss": 0.1094, + "mean_token_accuracy": 0.9677336141467094, "step": 535 }, { "epoch": 3.134897360703812, - "grad_norm": 0.7507543958848601, + "grad_norm": 0.7996391695402355, "learning_rate": 3.2847423454826616e-05, - "loss": 0.0985, - "mean_token_accuracy": 0.9724163636565208, + "loss": 0.1029, + "mean_token_accuracy": 0.9708555638790131, "step": 536 }, { "epoch": 3.1407624633431084, - "grad_norm": 0.9242090726097795, + "grad_norm": 0.9929429090487744, "learning_rate": 3.2820037617483734e-05, - "loss": 0.1248, - "mean_token_accuracy": 0.9671787321567535, + "loss": 0.1326, + "mean_token_accuracy": 0.9639134034514427, "step": 537 }, { "epoch": 3.1466275659824046, - "grad_norm": 1.0582856339832543, + "grad_norm": 1.0550635107253659, "learning_rate": 3.2792612507774224e-05, - "loss": 0.1082, - "mean_token_accuracy": 0.9701619669795036, + "loss": 0.1091, + "mean_token_accuracy": 0.9677369222044945, "step": 538 }, { "epoch": 3.1524926686217007, - "grad_norm": 0.7962224695055707, + "grad_norm": 0.9231451013069485, "learning_rate": 3.2765148225240176e-05, - "loss": 0.1022, - "mean_token_accuracy": 0.9693987816572189, + "loss": 0.1127, + "mean_token_accuracy": 0.9681195095181465, "step": 539 }, { "epoch": 3.158357771260997, - "grad_norm": 0.8913562259790775, + "grad_norm": 0.862498716572109, "learning_rate": 3.273764486956583e-05, - "loss": 0.1188, - "mean_token_accuracy": 0.9641912281513214, + "loss": 0.1109, + "mean_token_accuracy": 0.9684896096587181, "step": 540 }, { "epoch": 3.164222873900293, - "grad_norm": 0.9391686716445263, + "grad_norm": 1.0956873032349619, "learning_rate": 3.2710102540577256e-05, - "loss": 0.1015, - "mean_token_accuracy": 0.9703034535050392, + "loss": 0.1088, + "mean_token_accuracy": 0.9692259877920151, "step": 541 }, { "epoch": 3.1700879765395893, - "grad_norm": 1.1425921994518553, + "grad_norm": 1.0839458966945006, "learning_rate": 3.268252133824198e-05, - "loss": 0.1396, - "mean_token_accuracy": 0.9599046036601067, + "loss": 0.1313, + "mean_token_accuracy": 0.9646385759115219, "step": 542 }, { "epoch": 3.1759530791788855, - "grad_norm": 0.9837980520671072, + "grad_norm": 0.9457995681638467, "learning_rate": 3.2654901362668656e-05, - "loss": 0.105, - "mean_token_accuracy": 0.9691286087036133, + "loss": 0.1034, + "mean_token_accuracy": 0.9699193686246872, "step": 543 }, { "epoch": 3.1818181818181817, - "grad_norm": 1.0478243618422738, + "grad_norm": 0.9874968177745931, "learning_rate": 3.262724271410661e-05, - "loss": 0.1177, - "mean_token_accuracy": 0.964763417840004, + "loss": 0.1211, + "mean_token_accuracy": 0.9654932543635368, "step": 544 }, { "epoch": 3.187683284457478, - "grad_norm": 1.051440350841896, + "grad_norm": 1.1532260866014505, "learning_rate": 3.2599545492945584e-05, - "loss": 0.1281, - "mean_token_accuracy": 0.9659412503242493, + "loss": 0.1291, + "mean_token_accuracy": 0.9629618749022484, "step": 545 }, { "epoch": 3.193548387096774, - "grad_norm": 1.2381984787570743, + "grad_norm": 1.244455403777776, "learning_rate": 3.257180979971529e-05, - "loss": 0.122, - "mean_token_accuracy": 0.96320890635252, + "loss": 0.1133, + "mean_token_accuracy": 0.9672550708055496, "step": 546 }, { "epoch": 3.19941348973607, - "grad_norm": 0.9566730062260478, + "grad_norm": 0.9864752841266905, "learning_rate": 3.25440357350851e-05, - "loss": 0.1227, - "mean_token_accuracy": 0.9624106585979462, + "loss": 0.1285, + "mean_token_accuracy": 0.9629417359828949, "step": 547 }, { "epoch": 3.2052785923753664, - "grad_norm": 1.0303596937757689, + "grad_norm": 1.1105194518868626, "learning_rate": 3.251622339986366e-05, - "loss": 0.1208, - "mean_token_accuracy": 0.9645057767629623, + "loss": 0.1348, + "mean_token_accuracy": 0.9636048302054405, "step": 548 }, { "epoch": 3.2111436950146626, - "grad_norm": 1.1499267317362676, + "grad_norm": 1.273551629134977, "learning_rate": 3.24883728949985e-05, - "loss": 0.1289, - "mean_token_accuracy": 0.9617469310760498, + "loss": 0.1324, + "mean_token_accuracy": 0.9609309211373329, "step": 549 }, { "epoch": 3.2170087976539588, - "grad_norm": 0.8579142698468588, + "grad_norm": 0.9952288620484804, "learning_rate": 3.2460484321575714e-05, - "loss": 0.0921, - "mean_token_accuracy": 0.9694699496030807, + "loss": 0.1098, + "mean_token_accuracy": 0.9686953574419022, "step": 550 }, { "epoch": 3.222873900293255, - "grad_norm": 1.2241087162872863, + "grad_norm": 0.9799228318089337, "learning_rate": 3.2432557780819556e-05, - "loss": 0.0932, - "mean_token_accuracy": 0.9721159860491753, + "loss": 0.1036, + "mean_token_accuracy": 0.9690500125288963, "step": 551 }, { "epoch": 3.228739002932551, - "grad_norm": 0.8167439933486885, + "grad_norm": 1.0078911048435029, "learning_rate": 3.240459337409209e-05, - "loss": 0.1116, - "mean_token_accuracy": 0.9674685597419739, + "loss": 0.1305, + "mean_token_accuracy": 0.9617227241396904, "step": 552 }, { "epoch": 3.2346041055718473, - "grad_norm": 0.7997732243833637, + "grad_norm": 0.948818618424717, "learning_rate": 3.237659120289282e-05, - "loss": 0.1017, - "mean_token_accuracy": 0.9701759144663811, + "loss": 0.1127, + "mean_token_accuracy": 0.9691289365291595, "step": 553 }, { "epoch": 3.2404692082111435, - "grad_norm": 1.1095187159507571, + "grad_norm": 1.2609074258141768, "learning_rate": 3.2348551368858315e-05, - "loss": 0.1156, - "mean_token_accuracy": 0.9673218578100204, + "loss": 0.1118, + "mean_token_accuracy": 0.9663236141204834, "step": 554 }, { "epoch": 3.2463343108504397, - "grad_norm": 0.9606996629666116, + "grad_norm": 0.986737754077552, "learning_rate": 3.2320473973761845e-05, - "loss": 0.1153, - "mean_token_accuracy": 0.9676761701703072, + "loss": 0.1086, + "mean_token_accuracy": 0.9680015295743942, "step": 555 }, { "epoch": 3.252199413489736, - "grad_norm": 0.9247316094180343, + "grad_norm": 0.9498740915078302, "learning_rate": 3.229235911951303e-05, - "loss": 0.1153, - "mean_token_accuracy": 0.9680519327521324, + "loss": 0.1131, + "mean_token_accuracy": 0.9682961478829384, "step": 556 }, { "epoch": 3.258064516129032, - "grad_norm": 1.1072521549049836, + "grad_norm": 0.975893859118113, "learning_rate": 3.2264206908157425e-05, - "loss": 0.101, - "mean_token_accuracy": 0.9698139801621437, + "loss": 0.097, + "mean_token_accuracy": 0.9698783755302429, "step": 557 }, { "epoch": 3.263929618768328, - "grad_norm": 0.8418708276056709, + "grad_norm": 0.8112438014718453, "learning_rate": 3.2236017441876185e-05, - "loss": 0.1188, - "mean_token_accuracy": 0.9673982262611389, + "loss": 0.1277, + "mean_token_accuracy": 0.9649044126272202, "step": 558 }, { "epoch": 3.2697947214076244, - "grad_norm": 1.0299813252012295, + "grad_norm": 1.233497752038787, "learning_rate": 3.220779082298569e-05, - "loss": 0.109, - "mean_token_accuracy": 0.9702173173427582, + "loss": 0.1244, + "mean_token_accuracy": 0.9652732387185097, "step": 559 }, { "epoch": 3.2756598240469206, - "grad_norm": 1.0202554844724807, + "grad_norm": 1.2473037646309517, "learning_rate": 3.2179527153937165e-05, - "loss": 0.1288, - "mean_token_accuracy": 0.9627135470509529, + "loss": 0.1388, + "mean_token_accuracy": 0.9597837403416634, "step": 560 }, { "epoch": 3.281524926686217, - "grad_norm": 0.9077193846639583, + "grad_norm": 1.1321059040397468, "learning_rate": 3.2151226537316315e-05, - "loss": 0.0963, - "mean_token_accuracy": 0.9713364169001579, + "loss": 0.1021, + "mean_token_accuracy": 0.9714441522955894, "step": 561 }, { "epoch": 3.2873900293255134, - "grad_norm": 0.8778878752612806, + "grad_norm": 0.8602666974497257, "learning_rate": 3.212288907584296e-05, - "loss": 0.103, - "mean_token_accuracy": 0.9681121036410332, + "loss": 0.1064, + "mean_token_accuracy": 0.9680186361074448, "step": 562 }, { "epoch": 3.2932551319648096, - "grad_norm": 0.9509437842453761, + "grad_norm": 1.2042098687516025, "learning_rate": 3.209451487237062e-05, - "loss": 0.1296, - "mean_token_accuracy": 0.9637468382716179, + "loss": 0.1371, + "mean_token_accuracy": 0.9599383175373077, "step": 563 }, { "epoch": 3.2991202346041058, - "grad_norm": 1.0297542251177307, + "grad_norm": 0.9050169219632497, "learning_rate": 3.206610402988621e-05, - "loss": 0.1095, - "mean_token_accuracy": 0.9680195823311806, + "loss": 0.1065, + "mean_token_accuracy": 0.9682442918419838, "step": 564 }, { "epoch": 3.304985337243402, - "grad_norm": 0.927223963783926, + "grad_norm": 1.026829641767951, "learning_rate": 3.20376566515096e-05, - "loss": 0.0991, - "mean_token_accuracy": 0.9703445583581924, + "loss": 0.1183, + "mean_token_accuracy": 0.9679248631000519, "step": 565 }, { "epoch": 3.310850439882698, - "grad_norm": 0.8239448328897864, + "grad_norm": 0.9674573819125384, "learning_rate": 3.20091728404933e-05, - "loss": 0.0992, - "mean_token_accuracy": 0.9700045213103294, + "loss": 0.0955, + "mean_token_accuracy": 0.9733124375343323, "step": 566 }, { "epoch": 3.3167155425219943, - "grad_norm": 0.9180503831666103, + "grad_norm": 0.8804094604877748, "learning_rate": 3.1980652700222024e-05, - "loss": 0.105, - "mean_token_accuracy": 0.9704331159591675, + "loss": 0.111, + "mean_token_accuracy": 0.968389168381691, "step": 567 }, { "epoch": 3.3225806451612905, - "grad_norm": 0.8526038871722106, + "grad_norm": 0.8181882227316067, "learning_rate": 3.195209633421237e-05, - "loss": 0.1126, - "mean_token_accuracy": 0.9635952338576317, + "loss": 0.1057, + "mean_token_accuracy": 0.9647178426384926, "step": 568 }, { "epoch": 3.3284457478005867, - "grad_norm": 0.9429157822338116, + "grad_norm": 1.144091624189587, "learning_rate": 3.192350384611242e-05, - "loss": 0.1249, - "mean_token_accuracy": 0.9621228873729706, + "loss": 0.1263, + "mean_token_accuracy": 0.960628017783165, "step": 569 }, { "epoch": 3.334310850439883, - "grad_norm": 1.0853666053637696, + "grad_norm": 0.8750690128095869, "learning_rate": 3.1894875339701354e-05, - "loss": 0.1161, - "mean_token_accuracy": 0.9705541431903839, + "loss": 0.1055, + "mean_token_accuracy": 0.9733807370066643, "step": 570 }, { "epoch": 3.340175953079179, - "grad_norm": 0.9698729643334536, + "grad_norm": 0.9491730861419487, "learning_rate": 3.186621091888909e-05, - "loss": 0.1228, - "mean_token_accuracy": 0.9655315577983856, + "loss": 0.123, + "mean_token_accuracy": 0.9650392830371857, "step": 571 }, { "epoch": 3.346041055718475, - "grad_norm": 0.9453776118037494, + "grad_norm": 0.8267533896847664, "learning_rate": 3.183751068771588e-05, - "loss": 0.1184, - "mean_token_accuracy": 0.9671064466238022, + "loss": 0.1136, + "mean_token_accuracy": 0.9666124507784843, "step": 572 }, { "epoch": 3.3519061583577714, - "grad_norm": 0.9290593976617697, + "grad_norm": 1.0647393991266463, "learning_rate": 3.180877475035199e-05, - "loss": 0.1112, - "mean_token_accuracy": 0.9667282104492188, + "loss": 0.1184, + "mean_token_accuracy": 0.9674052000045776, "step": 573 }, { "epoch": 3.3577712609970676, - "grad_norm": 0.8475132554914343, + "grad_norm": 0.8434998247171931, "learning_rate": 3.178000321109727e-05, - "loss": 0.1171, - "mean_token_accuracy": 0.9664184153079987, + "loss": 0.1183, + "mean_token_accuracy": 0.9672171697020531, "step": 574 }, { "epoch": 3.3636363636363638, - "grad_norm": 0.9901954629417643, + "grad_norm": 1.8503993478263725, "learning_rate": 3.175119617438078e-05, - "loss": 0.1193, - "mean_token_accuracy": 0.9652402922511101, + "loss": 0.126, + "mean_token_accuracy": 0.9652078375220299, "step": 575 }, { "epoch": 3.36950146627566, - "grad_norm": 1.0773481785229146, + "grad_norm": 1.2024058090106011, "learning_rate": 3.172235374476043e-05, - "loss": 0.1095, - "mean_token_accuracy": 0.9663127958774567, + "loss": 0.1164, + "mean_token_accuracy": 0.9678644984960556, "step": 576 }, { "epoch": 3.375366568914956, - "grad_norm": 0.8410303889573532, + "grad_norm": 1.0479012272568091, "learning_rate": 3.169347602692259e-05, - "loss": 0.1155, - "mean_token_accuracy": 0.9649907350540161, + "loss": 0.144, + "mean_token_accuracy": 0.9602736532688141, "step": 577 }, { "epoch": 3.3812316715542523, - "grad_norm": 0.919889189853559, + "grad_norm": 1.2562006259403447, "learning_rate": 3.166456312568171e-05, - "loss": 0.1066, - "mean_token_accuracy": 0.9651471823453903, + "loss": 0.1198, + "mean_token_accuracy": 0.9649272710084915, "step": 578 }, { "epoch": 3.3870967741935485, - "grad_norm": 0.9847134009233571, + "grad_norm": 1.0474486488431458, "learning_rate": 3.1635615145979955e-05, - "loss": 0.1325, - "mean_token_accuracy": 0.9620075672864914, + "loss": 0.1279, + "mean_token_accuracy": 0.9652384221553802, "step": 579 }, { "epoch": 3.3929618768328447, - "grad_norm": 0.9140552479884303, + "grad_norm": 0.9124821179146846, "learning_rate": 3.160663219288679e-05, - "loss": 0.0929, - "mean_token_accuracy": 0.9710179567337036, + "loss": 0.1068, + "mean_token_accuracy": 0.9672473296523094, "step": 580 }, { "epoch": 3.398826979472141, - "grad_norm": 0.8655879512510697, + "grad_norm": 0.9757202501702933, "learning_rate": 3.157761437159863e-05, - "loss": 0.1227, - "mean_token_accuracy": 0.9625556096434593, + "loss": 0.13, + "mean_token_accuracy": 0.9603447467088699, "step": 581 }, { "epoch": 3.404692082111437, - "grad_norm": 1.016148263569575, + "grad_norm": 1.0652616179562135, "learning_rate": 3.1548561787438445e-05, "loss": 0.1038, - "mean_token_accuracy": 0.970151960849762, + "mean_token_accuracy": 0.9710996374487877, "step": 582 }, { "epoch": 3.410557184750733, - "grad_norm": 0.9990010783150937, + "grad_norm": 0.836489733941965, "learning_rate": 3.15194745458554e-05, - "loss": 0.1094, - "mean_token_accuracy": 0.9681411162018776, + "loss": 0.1073, + "mean_token_accuracy": 0.9714136347174644, "step": 583 }, { "epoch": 3.4164222873900294, - "grad_norm": 0.8982758800759956, + "grad_norm": 1.0714963240647486, "learning_rate": 3.149035275242441e-05, - "loss": 0.1008, - "mean_token_accuracy": 0.970494419336319, + "loss": 0.1094, + "mean_token_accuracy": 0.9690913036465645, "step": 584 }, { "epoch": 3.4222873900293256, - "grad_norm": 0.9411992221855227, + "grad_norm": 0.9137331585277465, "learning_rate": 3.1461196512845834e-05, - "loss": 0.1134, - "mean_token_accuracy": 0.9663120433688164, + "loss": 0.1144, + "mean_token_accuracy": 0.9658253341913223, "step": 585 }, { "epoch": 3.4281524926686218, - "grad_norm": 1.130870863841404, + "grad_norm": 0.9997324531190634, "learning_rate": 3.143200593294504e-05, - "loss": 0.1154, - "mean_token_accuracy": 0.9686842858791351, + "loss": 0.1186, + "mean_token_accuracy": 0.9656195119023323, "step": 586 }, { "epoch": 3.434017595307918, - "grad_norm": 1.266553022930829, + "grad_norm": 1.1040122559087995, "learning_rate": 3.1402781118672065e-05, - "loss": 0.1313, - "mean_token_accuracy": 0.9647129997611046, + "loss": 0.113, + "mean_token_accuracy": 0.9666641429066658, "step": 587 }, { "epoch": 3.439882697947214, - "grad_norm": 1.0333390497650004, + "grad_norm": 1.0199551083482483, "learning_rate": 3.137352217610115e-05, - "loss": 0.1119, - "mean_token_accuracy": 0.967375859618187, + "loss": 0.1136, + "mean_token_accuracy": 0.9654843956232071, "step": 588 }, { "epoch": 3.4457478005865103, - "grad_norm": 0.8920677150256165, + "grad_norm": 0.7932841443360936, "learning_rate": 3.1344229211430465e-05, - "loss": 0.1126, - "mean_token_accuracy": 0.96572595089674, + "loss": 0.1118, + "mean_token_accuracy": 0.9717729911208153, "step": 589 }, { "epoch": 3.4516129032258065, - "grad_norm": 0.9100521404888532, + "grad_norm": 1.0038381415796183, "learning_rate": 3.131490233098164e-05, - "loss": 0.099, - "mean_token_accuracy": 0.973532646894455, + "loss": 0.0892, + "mean_token_accuracy": 0.9750974550843239, "step": 590 }, { "epoch": 3.4574780058651027, - "grad_norm": 1.0560831237985342, + "grad_norm": 0.9161828715428822, "learning_rate": 3.1285541641199383e-05, - "loss": 0.119, - "mean_token_accuracy": 0.9654569253325462, + "loss": 0.1179, + "mean_token_accuracy": 0.9669305682182312, "step": 591 }, { "epoch": 3.463343108504399, - "grad_norm": 0.9819382175267215, + "grad_norm": 1.0353554299516425, "learning_rate": 3.1256147248651166e-05, - "loss": 0.1105, - "mean_token_accuracy": 0.9697766527533531, + "loss": 0.1037, + "mean_token_accuracy": 0.9700469970703125, "step": 592 }, { "epoch": 3.469208211143695, - "grad_norm": 0.9456952920282133, + "grad_norm": 0.8583882324406463, "learning_rate": 3.122671926002675e-05, - "loss": 0.1141, - "mean_token_accuracy": 0.962925061583519, + "loss": 0.1116, + "mean_token_accuracy": 0.9654911234974861, "step": 593 }, { "epoch": 3.4750733137829912, - "grad_norm": 0.8886708651599997, + "grad_norm": 0.8653675322495964, "learning_rate": 3.119725778213785e-05, - "loss": 0.119, - "mean_token_accuracy": 0.9643419906497002, + "loss": 0.1131, + "mean_token_accuracy": 0.964490219950676, "step": 594 }, { "epoch": 3.4809384164222874, - "grad_norm": 1.2042272320453875, + "grad_norm": 1.1050450659698694, "learning_rate": 3.116776292191774e-05, - "loss": 0.1284, - "mean_token_accuracy": 0.9641223028302193, + "loss": 0.1248, + "mean_token_accuracy": 0.9631328731775284, "step": 595 }, { "epoch": 3.4868035190615836, - "grad_norm": 0.868426298920179, + "grad_norm": 0.9206209235146391, "learning_rate": 3.1138234786420834e-05, - "loss": 0.1075, - "mean_token_accuracy": 0.9682093411684036, + "loss": 0.1006, + "mean_token_accuracy": 0.9688453301787376, "step": 596 }, { "epoch": 3.4926686217008798, - "grad_norm": 0.8636859419474582, + "grad_norm": 0.9717559056575034, "learning_rate": 3.110867348282235e-05, - "loss": 0.1231, - "mean_token_accuracy": 0.9666341170668602, + "loss": 0.1285, + "mean_token_accuracy": 0.9620684534311295, "step": 597 }, { "epoch": 3.498533724340176, - "grad_norm": 1.0945410380452534, + "grad_norm": 1.0752755933385176, "learning_rate": 3.107907911841787e-05, - "loss": 0.1133, - "mean_token_accuracy": 0.9635356739163399, + "loss": 0.1042, + "mean_token_accuracy": 0.9659099653363228, "step": 598 }, { "epoch": 3.504398826979472, - "grad_norm": 0.8736645011251645, + "grad_norm": 0.7770239700047303, "learning_rate": 3.104945180062301e-05, - "loss": 0.1013, - "mean_token_accuracy": 0.9706991836428642, + "loss": 0.1076, + "mean_token_accuracy": 0.9701974913477898, "step": 599 }, { "epoch": 3.5102639296187683, - "grad_norm": 0.8599394312237839, + "grad_norm": 1.109185174777014, "learning_rate": 3.1019791636972936e-05, - "loss": 0.1088, - "mean_token_accuracy": 0.9657791554927826, + "loss": 0.1048, + "mean_token_accuracy": 0.966381847858429, "step": 600 }, { "epoch": 3.5161290322580645, - "grad_norm": 0.9734042928382984, + "grad_norm": 1.0263635177331654, "learning_rate": 3.099009873512208e-05, - "loss": 0.1147, - "mean_token_accuracy": 0.9690323546528816, + "loss": 0.1346, + "mean_token_accuracy": 0.9632927849888802, "step": 601 }, { "epoch": 3.5219941348973607, - "grad_norm": 0.8671838676434842, + "grad_norm": 0.871857986594143, "learning_rate": 3.0960373202843685e-05, - "loss": 0.0987, - "mean_token_accuracy": 0.9714084416627884, + "loss": 0.0982, + "mean_token_accuracy": 0.9723653644323349, "step": 602 }, { "epoch": 3.527859237536657, - "grad_norm": 1.0600541718141812, + "grad_norm": 1.143952454054944, "learning_rate": 3.093061514802943e-05, - "loss": 0.1223, - "mean_token_accuracy": 0.9637552127242088, + "loss": 0.1194, + "mean_token_accuracy": 0.9627049937844276, "step": 603 }, { "epoch": 3.533724340175953, - "grad_norm": 0.9672322618677217, + "grad_norm": 1.0299654159664773, "learning_rate": 3.090082467868901e-05, - "loss": 0.1004, - "mean_token_accuracy": 0.9703481644392014, + "loss": 0.0984, + "mean_token_accuracy": 0.9705034419894218, "step": 604 }, { "epoch": 3.5395894428152492, - "grad_norm": 0.9215427402997913, + "grad_norm": 0.9442694873564053, "learning_rate": 3.087100190294983e-05, - "loss": 0.1118, - "mean_token_accuracy": 0.9667570516467094, + "loss": 0.1195, + "mean_token_accuracy": 0.9644537046551704, "step": 605 }, { "epoch": 3.5454545454545454, - "grad_norm": 1.044225936500776, + "grad_norm": 1.0579964676689142, "learning_rate": 3.0841146929056505e-05, - "loss": 0.1282, - "mean_token_accuracy": 0.9646147862076759, + "loss": 0.124, + "mean_token_accuracy": 0.9656395390629768, "step": 606 }, { "epoch": 3.5513196480938416, - "grad_norm": 1.1406441411754127, + "grad_norm": 1.0119617464563753, "learning_rate": 3.0811259865370535e-05, - "loss": 0.1021, - "mean_token_accuracy": 0.97073944658041, + "loss": 0.1025, + "mean_token_accuracy": 0.9686230942606926, "step": 607 }, { "epoch": 3.557184750733138, - "grad_norm": 0.8384047045794155, + "grad_norm": 1.0545508663891536, "learning_rate": 3.07813408203699e-05, - "loss": 0.1001, - "mean_token_accuracy": 0.9701420590281487, + "loss": 0.1114, + "mean_token_accuracy": 0.9672858342528343, "step": 608 }, { "epoch": 3.563049853372434, - "grad_norm": 0.7890069517164189, + "grad_norm": 0.83374859697724, "learning_rate": 3.075138990264863e-05, - "loss": 0.1221, - "mean_token_accuracy": 0.9611668586730957, + "loss": 0.134, + "mean_token_accuracy": 0.9622107297182083, "step": 609 }, { "epoch": 3.56891495601173, - "grad_norm": 0.8128150056909983, + "grad_norm": 0.9945934991118207, "learning_rate": 3.072140722091648e-05, - "loss": 0.0897, - "mean_token_accuracy": 0.9721631705760956, + "loss": 0.0984, + "mean_token_accuracy": 0.9695856347680092, "step": 610 }, { "epoch": 3.5747800586510263, - "grad_norm": 0.982204128026469, + "grad_norm": 0.9385842099807644, "learning_rate": 3.0691392883998455e-05, - "loss": 0.1357, - "mean_token_accuracy": 0.9621530324220657, + "loss": 0.1245, + "mean_token_accuracy": 0.9652091339230537, "step": 611 }, { "epoch": 3.5806451612903225, - "grad_norm": 1.0337495257797795, + "grad_norm": 0.9443437934567439, "learning_rate": 3.0661347000834496e-05, - "loss": 0.1034, - "mean_token_accuracy": 0.9698660597205162, + "loss": 0.1139, + "mean_token_accuracy": 0.9681995660066605, "step": 612 }, { "epoch": 3.5865102639296187, - "grad_norm": 0.8802096358922299, + "grad_norm": 0.9825906807831453, "learning_rate": 3.063126968047901e-05, - "loss": 0.1073, - "mean_token_accuracy": 0.9643291085958481, + "loss": 0.1165, + "mean_token_accuracy": 0.9647257775068283, "step": 613 }, { "epoch": 3.592375366568915, - "grad_norm": 0.9070710629182922, + "grad_norm": 0.9344625985940249, "learning_rate": 3.060116103210053e-05, - "loss": 0.0907, - "mean_token_accuracy": 0.9729605987668037, + "loss": 0.0927, + "mean_token_accuracy": 0.9714125767350197, "step": 614 }, { "epoch": 3.598240469208211, - "grad_norm": 0.7858682826254871, + "grad_norm": 0.8666974286644824, "learning_rate": 3.057102116498129e-05, - "loss": 0.1061, - "mean_token_accuracy": 0.9679286181926727, + "loss": 0.1037, + "mean_token_accuracy": 0.9673436060547829, "step": 615 }, { "epoch": 3.6041055718475072, - "grad_norm": 1.116732342857861, + "grad_norm": 2.226359878211702, "learning_rate": 3.0540850188516826e-05, - "loss": 0.1317, - "mean_token_accuracy": 0.9623885974287987, + "loss": 0.1174, + "mean_token_accuracy": 0.9670251458883286, "step": 616 }, { "epoch": 3.6099706744868034, - "grad_norm": 0.9723503508992479, + "grad_norm": 0.9103387358090266, "learning_rate": 3.051064821221561e-05, - "loss": 0.095, - "mean_token_accuracy": 0.9739877283573151, + "loss": 0.0985, + "mean_token_accuracy": 0.9705724716186523, "step": 617 }, { "epoch": 3.6158357771260996, - "grad_norm": 0.8932015441700736, + "grad_norm": 1.0486044217213084, "learning_rate": 3.0480415345698606e-05, - "loss": 0.136, - "mean_token_accuracy": 0.958889864385128, + "loss": 0.1383, + "mean_token_accuracy": 0.9586537033319473, "step": 618 }, { "epoch": 3.621700879765396, - "grad_norm": 0.978385970902145, + "grad_norm": 1.0576298255454961, "learning_rate": 3.045015169869892e-05, - "loss": 0.1021, - "mean_token_accuracy": 0.9719918370246887, + "loss": 0.1036, + "mean_token_accuracy": 0.9723232910037041, "step": 619 }, { "epoch": 3.627565982404692, - "grad_norm": 1.111054861659078, + "grad_norm": 0.9867562775894735, "learning_rate": 3.0419857381061355e-05, - "loss": 0.1262, - "mean_token_accuracy": 0.9625189378857613, + "loss": 0.1179, + "mean_token_accuracy": 0.9649086743593216, "step": 620 }, { "epoch": 3.633431085043988, - "grad_norm": 0.7988893400125326, + "grad_norm": 0.8724859401368507, "learning_rate": 3.0389532502742066e-05, - "loss": 0.1131, - "mean_token_accuracy": 0.9654566794633865, + "loss": 0.0978, + "mean_token_accuracy": 0.9671208187937737, "step": 621 }, { "epoch": 3.6392961876832843, - "grad_norm": 0.9734473777514459, + "grad_norm": 1.07839803120773, "learning_rate": 3.0359177173808104e-05, - "loss": 0.1251, - "mean_token_accuracy": 0.9642170071601868, + "loss": 0.1221, + "mean_token_accuracy": 0.9660301432013512, "step": 622 }, { "epoch": 3.6451612903225805, - "grad_norm": 0.9400850428078926, + "grad_norm": 1.0170718623865844, "learning_rate": 3.032879150443705e-05, - "loss": 0.114, - "mean_token_accuracy": 0.9686667993664742, + "loss": 0.1105, + "mean_token_accuracy": 0.9689130410552025, "step": 623 }, { "epoch": 3.6510263929618767, - "grad_norm": 0.9535498861769731, + "grad_norm": 0.9733375577875363, "learning_rate": 3.029837560491662e-05, - "loss": 0.096, - "mean_token_accuracy": 0.9681509435176849, + "loss": 0.0968, + "mean_token_accuracy": 0.9714087471365929, "step": 624 }, { "epoch": 3.656891495601173, - "grad_norm": 0.938181261351189, + "grad_norm": 1.0174522668129806, "learning_rate": 3.0267929585644236e-05, - "loss": 0.1285, - "mean_token_accuracy": 0.9615221172571182, + "loss": 0.1192, + "mean_token_accuracy": 0.9647278562188148, "step": 625 }, { "epoch": 3.662756598240469, - "grad_norm": 0.9923272982279318, + "grad_norm": 1.0062347655942014, "learning_rate": 3.0237453557126656e-05, - "loss": 0.1001, - "mean_token_accuracy": 0.9676861017942429, + "loss": 0.1081, + "mean_token_accuracy": 0.9678944423794746, "step": 626 }, { "epoch": 3.6686217008797652, - "grad_norm": 0.8665048960994197, + "grad_norm": 0.9967334120614507, "learning_rate": 3.020694762997956e-05, - "loss": 0.1054, - "mean_token_accuracy": 0.967039056122303, + "loss": 0.1125, + "mean_token_accuracy": 0.9661891385912895, "step": 627 }, { "epoch": 3.6744868035190614, - "grad_norm": 0.8363872341156405, + "grad_norm": 0.9244227695101611, "learning_rate": 3.017641191492714e-05, - "loss": 0.0958, - "mean_token_accuracy": 0.9716126248240471, + "loss": 0.0937, + "mean_token_accuracy": 0.9720464572310448, "step": 628 }, { "epoch": 3.6803519061583576, - "grad_norm": 0.8150922734512929, + "grad_norm": 1.002358011210733, "learning_rate": 3.0145846522801703e-05, - "loss": 0.0956, - "mean_token_accuracy": 0.971979595720768, + "loss": 0.0919, + "mean_token_accuracy": 0.9704943671822548, "step": 629 }, { "epoch": 3.686217008797654, - "grad_norm": 0.9190785469098514, + "grad_norm": 1.198277332535796, "learning_rate": 3.0115251564543287e-05, - "loss": 0.1333, - "mean_token_accuracy": 0.9590764716267586, + "loss": 0.1403, + "mean_token_accuracy": 0.9590641185641289, "step": 630 }, { "epoch": 3.6920821114369504, - "grad_norm": 1.0778968802891915, + "grad_norm": 1.3056710213263667, "learning_rate": 3.008462715119922e-05, - "loss": 0.1461, - "mean_token_accuracy": 0.9540571868419647, + "loss": 0.158, + "mean_token_accuracy": 0.9538592919707298, "step": 631 }, { "epoch": 3.6979472140762466, - "grad_norm": 1.135241704691624, + "grad_norm": 1.3780337572816563, "learning_rate": 3.0053973393923768e-05, - "loss": 0.0949, - "mean_token_accuracy": 0.9684988856315613, + "loss": 0.1009, + "mean_token_accuracy": 0.9703397899866104, "step": 632 }, { "epoch": 3.703812316715543, - "grad_norm": 0.8026943444777539, + "grad_norm": 0.8557816240918312, "learning_rate": 3.0023290403977694e-05, - "loss": 0.1205, - "mean_token_accuracy": 0.9627582207322121, + "loss": 0.1209, + "mean_token_accuracy": 0.9657327905297279, "step": 633 }, { "epoch": 3.709677419354839, - "grad_norm": 1.060469864983061, + "grad_norm": 0.9844224729335839, "learning_rate": 2.9992578292727842e-05, - "loss": 0.1132, - "mean_token_accuracy": 0.965522937476635, + "loss": 0.1179, + "mean_token_accuracy": 0.9640586525201797, "step": 634 }, { "epoch": 3.715542521994135, - "grad_norm": 0.8622809489621395, + "grad_norm": 0.9674484215989431, "learning_rate": 2.9961837171646778e-05, - "loss": 0.1159, - "mean_token_accuracy": 0.967415414750576, + "loss": 0.1136, + "mean_token_accuracy": 0.9681580066680908, "step": 635 }, { "epoch": 3.7214076246334313, - "grad_norm": 0.8840405653301492, + "grad_norm": 0.9424755969637976, "learning_rate": 2.993106715231237e-05, - "loss": 0.1132, - "mean_token_accuracy": 0.9685205817222595, + "loss": 0.1201, + "mean_token_accuracy": 0.9686867296695709, "step": 636 }, { "epoch": 3.7272727272727275, - "grad_norm": 1.1001761901330231, + "grad_norm": 0.9455408526650082, "learning_rate": 2.9900268346407336e-05, - "loss": 0.1206, - "mean_token_accuracy": 0.9662887156009674, + "loss": 0.1317, + "mean_token_accuracy": 0.9632778465747833, "step": 637 }, { "epoch": 3.7331378299120237, - "grad_norm": 0.9880058176105925, + "grad_norm": 1.1249676030385811, "learning_rate": 2.986944086571893e-05, - "loss": 0.131, - "mean_token_accuracy": 0.9618512764573097, + "loss": 0.1395, + "mean_token_accuracy": 0.9588330909609795, "step": 638 }, { "epoch": 3.73900293255132, - "grad_norm": 0.9708212836272636, + "grad_norm": 1.0055233050654055, "learning_rate": 2.983858482213843e-05, - "loss": 0.0967, - "mean_token_accuracy": 0.9714419692754745, + "loss": 0.0966, + "mean_token_accuracy": 0.9707036763429642, "step": 639 }, { "epoch": 3.744868035190616, - "grad_norm": 0.7569383879074512, + "grad_norm": 0.7933758824565834, "learning_rate": 2.9807700327660834e-05, - "loss": 0.1072, - "mean_token_accuracy": 0.9683928042650223, + "loss": 0.1108, + "mean_token_accuracy": 0.9678425192832947, "step": 640 }, { "epoch": 3.7507331378299122, - "grad_norm": 0.9030504868168383, + "grad_norm": 0.8528929707688316, "learning_rate": 2.977678749438437e-05, - "loss": 0.1194, - "mean_token_accuracy": 0.9649059996008873, + "loss": 0.1145, + "mean_token_accuracy": 0.9663086980581284, "step": 641 }, { "epoch": 3.7565982404692084, - "grad_norm": 1.0884667040903802, + "grad_norm": 0.9774312078342523, "learning_rate": 2.9745846434510146e-05, - "loss": 0.1105, - "mean_token_accuracy": 0.9691413938999176, + "loss": 0.1084, + "mean_token_accuracy": 0.9694355204701424, "step": 642 }, { "epoch": 3.7624633431085046, - "grad_norm": 0.9753513539536118, + "grad_norm": 0.9193243474398831, "learning_rate": 2.9714877260341705e-05, - "loss": 0.1186, - "mean_token_accuracy": 0.9606969803571701, + "loss": 0.1268, + "mean_token_accuracy": 0.9626642465591431, "step": 643 }, { "epoch": 3.768328445747801, - "grad_norm": 0.7492296526886614, + "grad_norm": 0.8728109093263364, "learning_rate": 2.9683880084284648e-05, - "loss": 0.077, - "mean_token_accuracy": 0.9752858132123947, + "loss": 0.0955, + "mean_token_accuracy": 0.9732156321406364, "step": 644 }, { "epoch": 3.774193548387097, - "grad_norm": 0.8541278759879399, + "grad_norm": 0.9244646931723717, "learning_rate": 2.96528550188462e-05, - "loss": 0.1225, - "mean_token_accuracy": 0.9665696918964386, + "loss": 0.1175, + "mean_token_accuracy": 0.9670339375734329, "step": 645 }, { "epoch": 3.780058651026393, - "grad_norm": 0.8695425380218671, + "grad_norm": 0.8008396977844219, "learning_rate": 2.962180217663483e-05, - "loss": 0.1141, - "mean_token_accuracy": 0.9651920199394226, + "loss": 0.1114, + "mean_token_accuracy": 0.9655383676290512, "step": 646 }, { "epoch": 3.7859237536656893, - "grad_norm": 0.965732138124322, + "grad_norm": 1.0307500015042614, "learning_rate": 2.95907216703598e-05, - "loss": 0.1194, - "mean_token_accuracy": 0.9652410075068474, + "loss": 0.1124, + "mean_token_accuracy": 0.9662497565150261, "step": 647 }, { "epoch": 3.7917888563049855, - "grad_norm": 1.0337349786728662, + "grad_norm": 1.0372077539310007, "learning_rate": 2.9559613612830797e-05, - "loss": 0.1222, - "mean_token_accuracy": 0.9637459143996239, + "loss": 0.1255, + "mean_token_accuracy": 0.961214154958725, "step": 648 }, { "epoch": 3.7976539589442817, - "grad_norm": 0.8407085586625942, + "grad_norm": 0.8351390987420513, "learning_rate": 2.952847811695751e-05, - "loss": 0.1065, - "mean_token_accuracy": 0.9705112278461456, + "loss": 0.1121, + "mean_token_accuracy": 0.9706973433494568, "step": 649 }, { "epoch": 3.803519061583578, - "grad_norm": 0.7987034943568222, + "grad_norm": 0.9879339227027757, "learning_rate": 2.9497315295749218e-05, - "loss": 0.1165, - "mean_token_accuracy": 0.9658530652523041, + "loss": 0.116, + "mean_token_accuracy": 0.9665387272834778, "step": 650 }, { "epoch": 3.809384164222874, - "grad_norm": 0.9905673663074058, + "grad_norm": 1.2087606003229365, "learning_rate": 2.9466125262314368e-05, - "loss": 0.1365, - "mean_token_accuracy": 0.9607385098934174, + "loss": 0.142, + "mean_token_accuracy": 0.9571318477392197, "step": 651 }, { "epoch": 3.8152492668621703, - "grad_norm": 0.8431989173157322, + "grad_norm": 0.9043237278983707, "learning_rate": 2.9434908129860193e-05, - "loss": 0.1026, - "mean_token_accuracy": 0.9705355390906334, + "loss": 0.1054, + "mean_token_accuracy": 0.9690318033099174, "step": 652 }, { "epoch": 3.8211143695014664, - "grad_norm": 0.9761533220193974, + "grad_norm": 1.2355269720420694, "learning_rate": 2.9403664011692276e-05, - "loss": 0.1341, - "mean_token_accuracy": 0.9603022783994675, + "loss": 0.1375, + "mean_token_accuracy": 0.9594130963087082, "step": 653 }, { "epoch": 3.8269794721407626, - "grad_norm": 1.0057652946436697, + "grad_norm": 0.9958573729656797, "learning_rate": 2.9372393021214134e-05, - "loss": 0.138, - "mean_token_accuracy": 0.9568366184830666, + "loss": 0.1399, + "mean_token_accuracy": 0.9584333077073097, "step": 654 }, { "epoch": 3.832844574780059, - "grad_norm": 1.1193327300180262, + "grad_norm": 1.2287650649576085, "learning_rate": 2.9341095271926842e-05, - "loss": 0.1083, - "mean_token_accuracy": 0.9681970700621605, + "loss": 0.1284, + "mean_token_accuracy": 0.9632050767540932, "step": 655 }, { "epoch": 3.838709677419355, - "grad_norm": 1.211332723198081, + "grad_norm": 1.1734165312438198, "learning_rate": 2.930977087742859e-05, - "loss": 0.1119, - "mean_token_accuracy": 0.9678284898400307, + "loss": 0.1023, + "mean_token_accuracy": 0.9691994562745094, "step": 656 }, { "epoch": 3.844574780058651, - "grad_norm": 0.9825614982183972, + "grad_norm": 0.8472146928331651, "learning_rate": 2.9278419951414277e-05, - "loss": 0.1261, - "mean_token_accuracy": 0.9617257192730904, + "loss": 0.1158, + "mean_token_accuracy": 0.9662778377532959, "step": 657 }, { "epoch": 3.8504398826979473, - "grad_norm": 0.8315738446465553, + "grad_norm": 0.9489012330648977, "learning_rate": 2.9247042607675105e-05, - "loss": 0.1169, - "mean_token_accuracy": 0.9657503962516785, + "loss": 0.1195, + "mean_token_accuracy": 0.9632801786065102, "step": 658 }, { "epoch": 3.8563049853372435, - "grad_norm": 0.8775305493698721, + "grad_norm": 0.8468422513424982, "learning_rate": 2.9215638960098164e-05, - "loss": 0.0755, - "mean_token_accuracy": 0.9759255200624466, + "loss": 0.073, + "mean_token_accuracy": 0.9786935448646545, "step": 659 }, { "epoch": 3.8621700879765397, - "grad_norm": 0.6996032422889692, + "grad_norm": 0.6091605564057077, "learning_rate": 2.9184209122665996e-05, - "loss": 0.1072, - "mean_token_accuracy": 0.9670997187495232, + "loss": 0.0966, + "mean_token_accuracy": 0.969777375459671, "step": 660 }, { "epoch": 3.868035190615836, - "grad_norm": 0.8460534329988707, + "grad_norm": 0.7295390267906454, "learning_rate": 2.915275320945623e-05, - "loss": 0.122, - "mean_token_accuracy": 0.9646456241607666, + "loss": 0.1271, + "mean_token_accuracy": 0.9638236686587334, "step": 661 }, { "epoch": 3.873900293255132, - "grad_norm": 1.1089879954428397, + "grad_norm": 1.2158300799396626, "learning_rate": 2.9121271334641127e-05, - "loss": 0.1148, - "mean_token_accuracy": 0.9666710719466209, + "loss": 0.1282, + "mean_token_accuracy": 0.9644179865717888, "step": 662 }, { "epoch": 3.8797653958944283, - "grad_norm": 0.9487108641230017, + "grad_norm": 1.0737471767682187, "learning_rate": 2.908976361248717e-05, - "loss": 0.1017, - "mean_token_accuracy": 0.9736130684614182, + "loss": 0.114, + "mean_token_accuracy": 0.9729373753070831, "step": 663 }, { "epoch": 3.8856304985337244, - "grad_norm": 0.701535256126763, + "grad_norm": 0.7938043248994715, "learning_rate": 2.9058230157354674e-05, - "loss": 0.1162, - "mean_token_accuracy": 0.963954895734787, + "loss": 0.1261, + "mean_token_accuracy": 0.9626899287104607, "step": 664 }, { "epoch": 3.8914956011730206, - "grad_norm": 1.2184520006523096, + "grad_norm": 1.1900366046432855, "learning_rate": 2.902667108369734e-05, - "loss": 0.1125, - "mean_token_accuracy": 0.9652879014611244, + "loss": 0.1219, + "mean_token_accuracy": 0.9647004157304764, "step": 665 }, { "epoch": 3.897360703812317, - "grad_norm": 1.050805435187101, + "grad_norm": 0.9392537748131257, "learning_rate": 2.8995086506061862e-05, - "loss": 0.1169, - "mean_token_accuracy": 0.9657002538442612, + "loss": 0.1062, + "mean_token_accuracy": 0.9695572704076767, "step": 666 }, { "epoch": 3.903225806451613, - "grad_norm": 1.0753573344224259, + "grad_norm": 0.8882779542609907, "learning_rate": 2.896347653908749e-05, - "loss": 0.1065, - "mean_token_accuracy": 0.9706609547138214, + "loss": 0.098, + "mean_token_accuracy": 0.973405510187149, "step": 667 }, { "epoch": 3.909090909090909, - "grad_norm": 0.6759719386032113, + "grad_norm": 0.7571815374914855, "learning_rate": 2.8931841297505657e-05, - "loss": 0.1053, - "mean_token_accuracy": 0.9689249470829964, + "loss": 0.0982, + "mean_token_accuracy": 0.969622477889061, "step": 668 }, { "epoch": 3.9149560117302054, - "grad_norm": 0.7879426759278125, + "grad_norm": 0.8781968094346194, "learning_rate": 2.8900180896139503e-05, - "loss": 0.0825, - "mean_token_accuracy": 0.9760407134890556, + "loss": 0.0988, + "mean_token_accuracy": 0.972535714507103, "step": 669 }, { "epoch": 3.9208211143695015, - "grad_norm": 0.8961566450949507, + "grad_norm": 0.981652722231685, "learning_rate": 2.8868495449903498e-05, - "loss": 0.0891, - "mean_token_accuracy": 0.9749082326889038, + "loss": 0.0809, + "mean_token_accuracy": 0.9754110649228096, "step": 670 }, { "epoch": 3.9266862170087977, - "grad_norm": 0.7145724309301391, + "grad_norm": 0.7083811647983249, "learning_rate": 2.8836785073803014e-05, - "loss": 0.0968, - "mean_token_accuracy": 0.9699864313006401, + "loss": 0.0993, + "mean_token_accuracy": 0.9723174721002579, "step": 671 }, { "epoch": 3.932551319648094, - "grad_norm": 0.7112042624615512, + "grad_norm": 0.7916889704987639, "learning_rate": 2.880504988293391e-05, - "loss": 0.1009, - "mean_token_accuracy": 0.9703424945473671, + "loss": 0.1041, + "mean_token_accuracy": 0.968935064971447, "step": 672 }, { "epoch": 3.93841642228739, - "grad_norm": 0.8598718711890393, + "grad_norm": 0.8461912090206244, "learning_rate": 2.8773289992482115e-05, - "loss": 0.106, - "mean_token_accuracy": 0.9680499285459518, + "loss": 0.1058, + "mean_token_accuracy": 0.9667421951889992, "step": 673 }, { "epoch": 3.9442815249266863, - "grad_norm": 0.9698263823669294, + "grad_norm": 0.821497051407434, "learning_rate": 2.87415055177232e-05, - "loss": 0.1056, - "mean_token_accuracy": 0.9698052480816841, + "loss": 0.1099, + "mean_token_accuracy": 0.9673136919736862, "step": 674 }, { "epoch": 3.9501466275659824, - "grad_norm": 0.8335613687508117, + "grad_norm": 0.7737302184142852, "learning_rate": 2.870969657402197e-05, - "loss": 0.1152, - "mean_token_accuracy": 0.9647799134254456, + "loss": 0.1204, + "mean_token_accuracy": 0.9651020839810371, "step": 675 }, { "epoch": 3.9560117302052786, - "grad_norm": 1.2113603594671711, + "grad_norm": 1.0755731560585788, "learning_rate": 2.867786327683205e-05, - "loss": 0.1552, - "mean_token_accuracy": 0.9564560130238533, + "loss": 0.153, + "mean_token_accuracy": 0.9559778422117233, "step": 676 }, { "epoch": 3.961876832844575, - "grad_norm": 0.9088547489871039, + "grad_norm": 1.072752247884178, "learning_rate": 2.864600574169545e-05, - "loss": 0.1184, - "mean_token_accuracy": 0.9664463996887207, + "loss": 0.1122, + "mean_token_accuracy": 0.9674321562051773, "step": 677 }, { "epoch": 3.967741935483871, - "grad_norm": 1.050048176179559, + "grad_norm": 0.9687439947338238, "learning_rate": 2.861412408424216e-05, - "loss": 0.1118, - "mean_token_accuracy": 0.966325081884861, + "loss": 0.1152, + "mean_token_accuracy": 0.9655726253986359, "step": 678 }, { "epoch": 3.973607038123167, - "grad_norm": 0.994917923182441, + "grad_norm": 1.2078607637774967, "learning_rate": 2.8582218420189706e-05, - "loss": 0.1158, - "mean_token_accuracy": 0.9656457379460335, + "loss": 0.1288, + "mean_token_accuracy": 0.9655884131789207, "step": 679 }, { "epoch": 3.9794721407624634, - "grad_norm": 1.0408318438711954, + "grad_norm": 1.067708348114871, "learning_rate": 2.855028886534278e-05, - "loss": 0.1272, - "mean_token_accuracy": 0.9610341414809227, + "loss": 0.1357, + "mean_token_accuracy": 0.9624500647187233, "step": 680 }, { "epoch": 3.9853372434017595, - "grad_norm": 0.9371789995160021, + "grad_norm": 1.0105927291093553, "learning_rate": 2.851833553559276e-05, - "loss": 0.1091, - "mean_token_accuracy": 0.966740570962429, + "loss": 0.1121, + "mean_token_accuracy": 0.9678521081805229, "step": 681 }, { "epoch": 3.9912023460410557, - "grad_norm": 0.9273694196009427, + "grad_norm": 0.9189958182692042, "learning_rate": 2.848635854691733e-05, - "loss": 0.1184, - "mean_token_accuracy": 0.9653659835457802, + "loss": 0.1068, + "mean_token_accuracy": 0.9670966193079948, "step": 682 }, { "epoch": 3.997067448680352, - "grad_norm": 0.7340375512532604, + "grad_norm": 1.0148596552604907, "learning_rate": 2.8454358015380046e-05, - "loss": 0.088, - "mean_token_accuracy": 0.9720573499798775, + "loss": 0.0923, + "mean_token_accuracy": 0.9698840975761414, "step": 683 }, { "epoch": 4.0, - "grad_norm": 1.2796567322551584, + "grad_norm": 1.8462979410466998, "learning_rate": 2.8422334057129913e-05, - "loss": 0.1123, - "mean_token_accuracy": 0.9657177776098251, + "loss": 0.1143, + "mean_token_accuracy": 0.9644208699464798, "step": 684 }, { "epoch": 4.005865102639296, - "grad_norm": 0.5814062084412526, + "grad_norm": 0.8309715222349474, "learning_rate": 2.8390286788400967e-05, - "loss": 0.0807, - "mean_token_accuracy": 0.973865695297718, + "loss": 0.0824, + "mean_token_accuracy": 0.9704834669828415, "step": 685 }, { "epoch": 4.011730205278592, - "grad_norm": 0.6533584141705436, + "grad_norm": 0.870173230690534, "learning_rate": 2.8358216325511847e-05, - "loss": 0.0723, - "mean_token_accuracy": 0.9766295105218887, + "loss": 0.0762, + "mean_token_accuracy": 0.9775184169411659, "step": 686 }, { "epoch": 4.0175953079178885, - "grad_norm": 0.9162087220426504, + "grad_norm": 0.8724933903065529, "learning_rate": 2.832612278486538e-05, - "loss": 0.1061, - "mean_token_accuracy": 0.9709026291966438, + "loss": 0.1006, + "mean_token_accuracy": 0.9713100343942642, "step": 687 }, { "epoch": 4.023460410557185, - "grad_norm": 0.7375794680352793, + "grad_norm": 0.7725998310377216, "learning_rate": 2.8294006282948165e-05, - "loss": 0.0891, - "mean_token_accuracy": 0.976003848016262, + "loss": 0.0825, + "mean_token_accuracy": 0.9770268797874451, "step": 688 }, { "epoch": 4.029325513196481, - "grad_norm": 0.7494579181986135, + "grad_norm": 0.9213705846046193, "learning_rate": 2.8261866936330123e-05, - "loss": 0.0816, - "mean_token_accuracy": 0.9738617315888405, + "loss": 0.0928, + "mean_token_accuracy": 0.9697828590869904, "step": 689 }, { "epoch": 4.035190615835777, - "grad_norm": 0.6929603989933999, + "grad_norm": 0.9164311785394971, "learning_rate": 2.8229704861664113e-05, - "loss": 0.0801, - "mean_token_accuracy": 0.9780551716685295, + "loss": 0.0884, + "mean_token_accuracy": 0.974254809319973, "step": 690 }, { "epoch": 4.041055718475073, - "grad_norm": 0.7164876946150348, + "grad_norm": 0.9964223054765102, "learning_rate": 2.8197520175685462e-05, - "loss": 0.0788, - "mean_token_accuracy": 0.9771693646907806, + "loss": 0.0928, + "mean_token_accuracy": 0.9761571735143661, "step": 691 }, { "epoch": 4.0469208211143695, - "grad_norm": 0.8679669696761695, + "grad_norm": 0.6997071585836834, "learning_rate": 2.8165312995211596e-05, - "loss": 0.0799, - "mean_token_accuracy": 0.9789220467209816, + "loss": 0.0805, + "mean_token_accuracy": 0.9769442081451416, "step": 692 }, { "epoch": 4.052785923753666, - "grad_norm": 0.5785509783140412, + "grad_norm": 0.6214295435479037, "learning_rate": 2.813308343714156e-05, - "loss": 0.0706, - "mean_token_accuracy": 0.9771093800663948, + "loss": 0.0696, + "mean_token_accuracy": 0.9754916131496429, "step": 693 }, { "epoch": 4.058651026392962, - "grad_norm": 0.7587374823959739, + "grad_norm": 0.9160480861099328, "learning_rate": 2.810083161845564e-05, - "loss": 0.09, - "mean_token_accuracy": 0.9711701348423958, + "loss": 0.0843, + "mean_token_accuracy": 0.9728389903903008, "step": 694 }, { "epoch": 4.064516129032258, - "grad_norm": 0.8604246722345931, + "grad_norm": 0.8228292525646084, "learning_rate": 2.8068557656214913e-05, - "loss": 0.0861, - "mean_token_accuracy": 0.9758541658520699, + "loss": 0.0877, + "mean_token_accuracy": 0.975266806781292, "step": 695 }, { "epoch": 4.070381231671554, - "grad_norm": 0.6621420685564046, + "grad_norm": 0.7567610887777896, "learning_rate": 2.8036261667560826e-05, - "loss": 0.0766, - "mean_token_accuracy": 0.9776150584220886, + "loss": 0.0857, + "mean_token_accuracy": 0.9754011780023575, "step": 696 }, { "epoch": 4.07624633431085, - "grad_norm": 0.7637582164149367, + "grad_norm": 0.8298182774230494, "learning_rate": 2.8003943769714776e-05, - "loss": 0.0956, - "mean_token_accuracy": 0.9732154309749603, + "loss": 0.0987, + "mean_token_accuracy": 0.9720055907964706, "step": 697 }, { "epoch": 4.0821114369501466, - "grad_norm": 1.0199977041425903, + "grad_norm": 0.8355014577188172, "learning_rate": 2.7971604079977673e-05, - "loss": 0.1004, - "mean_token_accuracy": 0.9695825353264809, + "loss": 0.103, + "mean_token_accuracy": 0.969631053507328, "step": 698 }, { "epoch": 4.087976539589443, - "grad_norm": 0.750195052728083, + "grad_norm": 0.7652439863283277, "learning_rate": 2.793924271572954e-05, - "loss": 0.0785, - "mean_token_accuracy": 0.975655235350132, + "loss": 0.0877, + "mean_token_accuracy": 0.9737903922796249, "step": 699 }, { "epoch": 4.093841642228739, - "grad_norm": 0.6454840574121126, + "grad_norm": 0.8251456929986012, "learning_rate": 2.7906859794429047e-05, - "loss": 0.0812, - "mean_token_accuracy": 0.9726183488965034, + "loss": 0.0907, + "mean_token_accuracy": 0.9713805019855499, "step": 700 }, { "epoch": 4.099706744868035, - "grad_norm": 0.7783150151101518, + "grad_norm": 0.8578244497152002, "learning_rate": 2.787445543361313e-05, - "loss": 0.0801, - "mean_token_accuracy": 0.9766353219747543, + "loss": 0.0817, + "mean_token_accuracy": 0.9748812690377235, "step": 701 }, { "epoch": 4.105571847507331, - "grad_norm": 0.8354381215669707, + "grad_norm": 2.0012386249383654, "learning_rate": 2.7842029750896525e-05, - "loss": 0.091, - "mean_token_accuracy": 0.9731776043772697, + "loss": 0.1236, + "mean_token_accuracy": 0.9688002988696098, "step": 702 }, { "epoch": 4.1114369501466275, - "grad_norm": 0.8831663612555645, + "grad_norm": 0.8169472384017665, "learning_rate": 2.7809582863971373e-05, - "loss": 0.0903, - "mean_token_accuracy": 0.9750881195068359, + "loss": 0.1007, + "mean_token_accuracy": 0.9726080745458603, "step": 703 }, { "epoch": 4.117302052785924, - "grad_norm": 0.8696118136211701, + "grad_norm": 0.7344538146094689, "learning_rate": 2.777711489060676e-05, - "loss": 0.0903, - "mean_token_accuracy": 0.971781887114048, + "loss": 0.0952, + "mean_token_accuracy": 0.9706440791487694, "step": 704 }, { "epoch": 4.12316715542522, - "grad_norm": 0.6913327616601556, + "grad_norm": 0.6827415837619415, "learning_rate": 2.7744625948648316e-05, - "loss": 0.0735, - "mean_token_accuracy": 0.9773849919438362, + "loss": 0.0814, + "mean_token_accuracy": 0.9768778160214424, "step": 705 }, { "epoch": 4.129032258064516, - "grad_norm": 0.7555233699938674, + "grad_norm": 0.6892663949268744, "learning_rate": 2.7712116156017783e-05, - "loss": 0.0843, - "mean_token_accuracy": 0.9777982458472252, + "loss": 0.083, + "mean_token_accuracy": 0.976724274456501, "step": 706 }, { "epoch": 4.134897360703812, - "grad_norm": 0.9696476917690434, + "grad_norm": 0.8833845705107489, "learning_rate": 2.7679585630712585e-05, - "loss": 0.0975, - "mean_token_accuracy": 0.9709056839346886, + "loss": 0.1002, + "mean_token_accuracy": 0.9703218340873718, "step": 707 }, { "epoch": 4.140762463343108, - "grad_norm": 0.5938696022344896, + "grad_norm": 0.6951848212640516, "learning_rate": 2.764703449080538e-05, - "loss": 0.0811, - "mean_token_accuracy": 0.9765899106860161, + "loss": 0.0852, + "mean_token_accuracy": 0.9751351922750473, "step": 708 }, { "epoch": 4.146627565982405, - "grad_norm": 0.9067730169164242, + "grad_norm": 0.7844537288917677, "learning_rate": 2.761446285444366e-05, - "loss": 0.0987, - "mean_token_accuracy": 0.9722139462828636, + "loss": 0.094, + "mean_token_accuracy": 0.9713953137397766, "step": 709 }, { "epoch": 4.152492668621701, - "grad_norm": 0.639869651782152, + "grad_norm": 0.6073561410934927, "learning_rate": 2.758187083984931e-05, - "loss": 0.0679, - "mean_token_accuracy": 0.9808940887451172, + "loss": 0.0742, + "mean_token_accuracy": 0.979846678674221, "step": 710 }, { "epoch": 4.158357771260997, - "grad_norm": 0.783349736332696, + "grad_norm": 1.02646903244523, "learning_rate": 2.754925856531819e-05, - "loss": 0.1062, - "mean_token_accuracy": 0.9705143421888351, + "loss": 0.1017, + "mean_token_accuracy": 0.9713460206985474, "step": 711 }, { "epoch": 4.164222873900293, - "grad_norm": 1.7638873091647072, + "grad_norm": 0.791224278335533, "learning_rate": 2.7516626149219678e-05, - "loss": 0.0963, - "mean_token_accuracy": 0.9750856310129166, + "loss": 0.0822, + "mean_token_accuracy": 0.9769997522234917, "step": 712 }, { "epoch": 4.170087976539589, - "grad_norm": 0.6835305264635491, + "grad_norm": 0.6766217855079016, "learning_rate": 2.7483973709996267e-05, - "loss": 0.0873, - "mean_token_accuracy": 0.9742805510759354, + "loss": 0.0882, + "mean_token_accuracy": 0.9750803634524345, "step": 713 }, { "epoch": 4.1759530791788855, - "grad_norm": 0.7878929380998676, + "grad_norm": 0.8810581764225817, "learning_rate": 2.7451301366163116e-05, - "loss": 0.0978, - "mean_token_accuracy": 0.9704447388648987, + "loss": 0.103, + "mean_token_accuracy": 0.9694948419928551, "step": 714 }, { "epoch": 4.181818181818182, - "grad_norm": 0.6196814435565455, + "grad_norm": 0.6091010162338775, "learning_rate": 2.741860923630765e-05, - "loss": 0.0733, - "mean_token_accuracy": 0.9794884473085403, + "loss": 0.0722, + "mean_token_accuracy": 0.9799239709973335, "step": 715 }, { "epoch": 4.187683284457478, - "grad_norm": 0.7265891736306469, + "grad_norm": 0.9397306887291648, "learning_rate": 2.7385897439089086e-05, - "loss": 0.0862, - "mean_token_accuracy": 0.9740516096353531, + "loss": 0.0897, + "mean_token_accuracy": 0.9735899195075035, "step": 716 }, { "epoch": 4.193548387096774, - "grad_norm": 0.9562760180488723, + "grad_norm": 0.9305247718186483, "learning_rate": 2.735316609323804e-05, - "loss": 0.1026, - "mean_token_accuracy": 0.9694742858409882, + "loss": 0.105, + "mean_token_accuracy": 0.9696411564946175, "step": 717 }, { "epoch": 4.19941348973607, - "grad_norm": 0.7996872026272548, + "grad_norm": 0.839558099374855, "learning_rate": 2.7320415317556085e-05, - "loss": 0.0878, - "mean_token_accuracy": 0.9754326492547989, + "loss": 0.0926, + "mean_token_accuracy": 0.9743531122803688, "step": 718 }, { "epoch": 4.205278592375366, - "grad_norm": 0.6396887850496302, + "grad_norm": 0.8015165041637531, "learning_rate": 2.72876452309153e-05, - "loss": 0.071, - "mean_token_accuracy": 0.9772866442799568, + "loss": 0.0769, + "mean_token_accuracy": 0.9769142642617226, "step": 719 }, { "epoch": 4.211143695014663, - "grad_norm": 0.7719783389407485, + "grad_norm": 0.904863126847494, "learning_rate": 2.7254855952257867e-05, - "loss": 0.0898, - "mean_token_accuracy": 0.9726490750908852, + "loss": 0.0965, + "mean_token_accuracy": 0.9713497161865234, "step": 720 }, { "epoch": 4.217008797653959, - "grad_norm": 1.0644091045972202, + "grad_norm": 1.1447287147619216, "learning_rate": 2.7222047600595626e-05, - "loss": 0.1087, - "mean_token_accuracy": 0.9672816544771194, + "loss": 0.1125, + "mean_token_accuracy": 0.9678602889180183, "step": 721 }, { "epoch": 4.222873900293255, - "grad_norm": 0.7154307567634154, + "grad_norm": 0.6811363601209449, "learning_rate": 2.718922029500965e-05, - "loss": 0.084, - "mean_token_accuracy": 0.974772721529007, + "loss": 0.082, + "mean_token_accuracy": 0.9760172292590141, "step": 722 }, { "epoch": 4.228739002932551, - "grad_norm": 0.6908956165564543, + "grad_norm": 0.8682321737802231, "learning_rate": 2.7156374154649787e-05, - "loss": 0.0815, - "mean_token_accuracy": 0.973622277379036, + "loss": 0.0895, + "mean_token_accuracy": 0.972284585237503, "step": 723 }, { "epoch": 4.234604105571847, - "grad_norm": 0.7365960380961718, + "grad_norm": 0.7858842014740195, "learning_rate": 2.7123509298734267e-05, - "loss": 0.0858, - "mean_token_accuracy": 0.972634956240654, + "loss": 0.0896, + "mean_token_accuracy": 0.9748860970139503, "step": 724 }, { "epoch": 4.2404692082111435, - "grad_norm": 0.8876489162788943, + "grad_norm": 0.7803207391492687, "learning_rate": 2.7090625846549247e-05, - "loss": 0.0949, - "mean_token_accuracy": 0.972042515873909, + "loss": 0.0925, + "mean_token_accuracy": 0.9706885814666748, "step": 725 }, { "epoch": 4.24633431085044, - "grad_norm": 0.8351301909607796, + "grad_norm": 1.1677543957359848, "learning_rate": 2.705772391744837e-05, - "loss": 0.0912, - "mean_token_accuracy": 0.9738112688064575, + "loss": 0.1145, + "mean_token_accuracy": 0.9681753516197205, "step": 726 }, { "epoch": 4.252199413489736, - "grad_norm": 0.7667250445002752, + "grad_norm": 0.9170706318759467, "learning_rate": 2.7024803630852362e-05, - "loss": 0.0872, - "mean_token_accuracy": 0.9735923185944557, + "loss": 0.0921, + "mean_token_accuracy": 0.9747585207223892, "step": 727 }, { "epoch": 4.258064516129032, - "grad_norm": 0.9337786078065189, + "grad_norm": 0.7132222699945785, "learning_rate": 2.699186510624856e-05, - "loss": 0.1078, - "mean_token_accuracy": 0.969543345272541, + "loss": 0.0989, + "mean_token_accuracy": 0.9715269804000854, "step": 728 }, { "epoch": 4.263929618768328, - "grad_norm": 0.7401750526879216, + "grad_norm": 0.7219852764632368, "learning_rate": 2.6958908463190506e-05, - "loss": 0.0922, - "mean_token_accuracy": 0.9721928238868713, + "loss": 0.092, + "mean_token_accuracy": 0.9742382988333702, "step": 729 }, { "epoch": 4.269794721407624, - "grad_norm": 0.8159479602529707, + "grad_norm": 0.7543221985745602, "learning_rate": 2.6925933821297497e-05, - "loss": 0.0906, - "mean_token_accuracy": 0.9723092764616013, + "loss": 0.091, + "mean_token_accuracy": 0.9712286666035652, "step": 730 }, { "epoch": 4.275659824046921, - "grad_norm": 0.7863328763793721, + "grad_norm": 0.8374221740590484, "learning_rate": 2.6892941300254176e-05, - "loss": 0.0877, - "mean_token_accuracy": 0.9762269631028175, + "loss": 0.1058, + "mean_token_accuracy": 0.9711539074778557, "step": 731 }, { "epoch": 4.281524926686217, - "grad_norm": 0.7226575436605356, + "grad_norm": 0.753573683888248, "learning_rate": 2.685993101981007e-05, - "loss": 0.0842, - "mean_token_accuracy": 0.9757591262459755, + "loss": 0.0914, + "mean_token_accuracy": 0.9735663160681725, "step": 732 }, { "epoch": 4.287390029325513, - "grad_norm": 0.6717494608206148, + "grad_norm": 0.8643191548607165, "learning_rate": 2.6826903099779157e-05, - "loss": 0.0814, - "mean_token_accuracy": 0.9769570678472519, + "loss": 0.0925, + "mean_token_accuracy": 0.976229302585125, "step": 733 }, { "epoch": 4.293255131964809, - "grad_norm": 0.8798299785191601, + "grad_norm": 0.8183124471156848, "learning_rate": 2.679385766003945e-05, - "loss": 0.0998, - "mean_token_accuracy": 0.9717613831162453, + "loss": 0.0961, + "mean_token_accuracy": 0.971428208053112, "step": 734 }, { "epoch": 4.299120234604105, - "grad_norm": 0.6860539528931006, + "grad_norm": 0.7393638741111308, "learning_rate": 2.676079482053255e-05, - "loss": 0.0919, - "mean_token_accuracy": 0.9732666462659836, + "loss": 0.1018, + "mean_token_accuracy": 0.9702118262648582, "step": 735 }, { "epoch": 4.3049853372434015, - "grad_norm": 0.7965978126466666, + "grad_norm": 0.8476429324489293, "learning_rate": 2.6727714701263212e-05, - "loss": 0.0911, - "mean_token_accuracy": 0.9728346392512321, + "loss": 0.0952, + "mean_token_accuracy": 0.973472535610199, "step": 736 }, { "epoch": 4.310850439882698, - "grad_norm": 0.6745321116193526, + "grad_norm": 0.8099782154425992, "learning_rate": 2.669461742229891e-05, - "loss": 0.0872, - "mean_token_accuracy": 0.9757064208388329, + "loss": 0.1012, + "mean_token_accuracy": 0.9737088829278946, "step": 737 }, { "epoch": 4.316715542521994, - "grad_norm": 0.7052793622815522, + "grad_norm": 0.6707361657569837, "learning_rate": 2.6661503103769404e-05, - "loss": 0.0773, - "mean_token_accuracy": 0.9765629544854164, + "loss": 0.0792, + "mean_token_accuracy": 0.9772538915276527, "step": 738 }, { "epoch": 4.32258064516129, - "grad_norm": 0.8439535536764042, + "grad_norm": 0.8535587162942073, "learning_rate": 2.6628371865866286e-05, - "loss": 0.0976, - "mean_token_accuracy": 0.9718929752707481, + "loss": 0.1011, + "mean_token_accuracy": 0.9715052172541618, "step": 739 }, { "epoch": 4.328445747800586, - "grad_norm": 0.7590924148224023, + "grad_norm": 0.7764991300467, "learning_rate": 2.6595223828842578e-05, - "loss": 0.0942, - "mean_token_accuracy": 0.9718786254525185, + "loss": 0.0966, + "mean_token_accuracy": 0.9695071950554848, "step": 740 }, { "epoch": 4.334310850439882, - "grad_norm": 0.7115452306720939, + "grad_norm": 0.7804651105373259, "learning_rate": 2.6562059113012253e-05, - "loss": 0.0842, - "mean_token_accuracy": 0.9750241562724113, + "loss": 0.0933, + "mean_token_accuracy": 0.9718307480216026, "step": 741 }, { "epoch": 4.340175953079179, - "grad_norm": 0.5307398800874134, + "grad_norm": 0.8407216614420148, "learning_rate": 2.6528877838749853e-05, - "loss": 0.0699, - "mean_token_accuracy": 0.9786670580506325, + "loss": 0.0867, + "mean_token_accuracy": 0.9775464683771133, "step": 742 }, { "epoch": 4.346041055718475, - "grad_norm": 0.7855965606038025, + "grad_norm": 0.8618094586938339, "learning_rate": 2.6495680126489984e-05, - "loss": 0.0878, - "mean_token_accuracy": 0.9722012206912041, + "loss": 0.0919, + "mean_token_accuracy": 0.971205361187458, "step": 743 }, { "epoch": 4.351906158357771, - "grad_norm": 0.8072504806635915, + "grad_norm": 0.8163478228495167, "learning_rate": 2.6462466096726954e-05, - "loss": 0.1004, - "mean_token_accuracy": 0.9716598242521286, + "loss": 0.1073, + "mean_token_accuracy": 0.9718838185071945, "step": 744 }, { "epoch": 4.357771260997067, - "grad_norm": 0.7450796282399247, + "grad_norm": 0.8594960287919026, "learning_rate": 2.6429235870014256e-05, - "loss": 0.0853, - "mean_token_accuracy": 0.9746398106217384, + "loss": 0.0949, + "mean_token_accuracy": 0.9723696932196617, "step": 745 }, { "epoch": 4.363636363636363, - "grad_norm": 0.8608235347633818, + "grad_norm": 1.1525397696475228, "learning_rate": 2.639598956696421e-05, - "loss": 0.0979, - "mean_token_accuracy": 0.9726268425583839, + "loss": 0.1052, + "mean_token_accuracy": 0.9725246876478195, "step": 746 }, { "epoch": 4.3695014662756595, - "grad_norm": 0.6571077470596837, + "grad_norm": 0.6454962182852607, "learning_rate": 2.6362727308247458e-05, - "loss": 0.0809, - "mean_token_accuracy": 0.9742465242743492, + "loss": 0.08, + "mean_token_accuracy": 0.9751030281186104, "step": 747 }, { "epoch": 4.375366568914956, - "grad_norm": 0.7732181445332282, + "grad_norm": 0.7198413708470565, "learning_rate": 2.6329449214592568e-05, - "loss": 0.0977, - "mean_token_accuracy": 0.9746908023953438, + "loss": 0.0947, + "mean_token_accuracy": 0.9728525876998901, "step": 748 }, { "epoch": 4.381231671554252, - "grad_norm": 0.7521486012712132, + "grad_norm": 0.8073069554708537, "learning_rate": 2.6296155406785578e-05, - "loss": 0.0979, - "mean_token_accuracy": 0.9705567210912704, + "loss": 0.0943, + "mean_token_accuracy": 0.9722790792584419, "step": 749 }, { "epoch": 4.387096774193548, - "grad_norm": 0.736741257255026, + "grad_norm": 0.7273915769534642, "learning_rate": 2.6262846005669572e-05, - "loss": 0.0822, - "mean_token_accuracy": 0.9748862311244011, + "loss": 0.0872, + "mean_token_accuracy": 0.9755823463201523, "step": 750 }, { "epoch": 4.392961876832844, - "grad_norm": 0.7540120050721504, + "grad_norm": 0.6222972696609765, "learning_rate": 2.6229521132144212e-05, - "loss": 0.0864, - "mean_token_accuracy": 0.9734518304467201, + "loss": 0.0795, + "mean_token_accuracy": 0.9756253361701965, "step": 751 }, { "epoch": 4.39882697947214, - "grad_norm": 0.6830790352544612, + "grad_norm": 0.6886258769301212, "learning_rate": 2.619618090716534e-05, - "loss": 0.0886, - "mean_token_accuracy": 0.9746551960706711, + "loss": 0.0855, + "mean_token_accuracy": 0.973938450217247, "step": 752 }, { "epoch": 4.404692082111437, - "grad_norm": 0.6953309994974072, + "grad_norm": 0.8210958542755064, "learning_rate": 2.61628254517445e-05, - "loss": 0.0784, - "mean_token_accuracy": 0.9740583300590515, + "loss": 0.0831, + "mean_token_accuracy": 0.9736155867576599, "step": 753 }, { "epoch": 4.410557184750733, - "grad_norm": 0.693352579440526, + "grad_norm": 0.804804559262862, "learning_rate": 2.612945488694853e-05, - "loss": 0.0938, - "mean_token_accuracy": 0.9734242856502533, + "loss": 0.0997, + "mean_token_accuracy": 0.9714327901601791, "step": 754 }, { "epoch": 4.416422287390029, - "grad_norm": 0.8207186438354438, + "grad_norm": 1.007434669319931, "learning_rate": 2.6096069333899094e-05, - "loss": 0.0891, - "mean_token_accuracy": 0.9744479283690453, + "loss": 0.0922, + "mean_token_accuracy": 0.9747961461544037, "step": 755 }, { "epoch": 4.422287390029325, - "grad_norm": 0.9881527516230675, + "grad_norm": 0.9735266563411731, "learning_rate": 2.6062668913772275e-05, - "loss": 0.113, - "mean_token_accuracy": 0.9657848328351974, + "loss": 0.1074, + "mean_token_accuracy": 0.9690496772527695, "step": 756 }, { "epoch": 4.428152492668621, - "grad_norm": 0.8892111964841636, + "grad_norm": 0.7776668124323017, "learning_rate": 2.60292537477981e-05, - "loss": 0.097, - "mean_token_accuracy": 0.970199853181839, + "loss": 0.0912, + "mean_token_accuracy": 0.9735496193170547, "step": 757 }, { "epoch": 4.4340175953079175, - "grad_norm": 0.6005235109234973, + "grad_norm": 0.9820708255779215, "learning_rate": 2.5995823957260132e-05, - "loss": 0.0896, - "mean_token_accuracy": 0.970848336815834, + "loss": 0.1184, + "mean_token_accuracy": 0.9666420146822929, "step": 758 }, { "epoch": 4.439882697947214, - "grad_norm": 0.971010752823258, + "grad_norm": 0.6577310837958226, "learning_rate": 2.596237966349501e-05, - "loss": 0.0902, - "mean_token_accuracy": 0.9717428460717201, + "loss": 0.09, + "mean_token_accuracy": 0.9744406044483185, "step": 759 }, { "epoch": 4.44574780058651, - "grad_norm": 0.6147265684629026, + "grad_norm": 0.7130928227326236, "learning_rate": 2.592892098789201e-05, - "loss": 0.0732, - "mean_token_accuracy": 0.9768884256482124, + "loss": 0.0724, + "mean_token_accuracy": 0.9780396446585655, "step": 760 }, { "epoch": 4.451612903225806, - "grad_norm": 0.6909027369872971, + "grad_norm": 0.7194850985370883, "learning_rate": 2.589544805189261e-05, - "loss": 0.0786, - "mean_token_accuracy": 0.9766824841499329, + "loss": 0.0873, + "mean_token_accuracy": 0.9740453436970711, "step": 761 }, { "epoch": 4.457478005865102, - "grad_norm": 0.8361117235474076, + "grad_norm": 0.7484956451034829, "learning_rate": 2.5861960976990056e-05, - "loss": 0.0792, - "mean_token_accuracy": 0.9764246940612793, + "loss": 0.0887, + "mean_token_accuracy": 0.9758842661976814, "step": 762 }, { "epoch": 4.463343108504398, - "grad_norm": 0.8572632181135827, + "grad_norm": 0.8637894057449226, "learning_rate": 2.5828459884728898e-05, - "loss": 0.0957, - "mean_token_accuracy": 0.9725618660449982, + "loss": 0.1057, + "mean_token_accuracy": 0.9708142057061195, "step": 763 }, { "epoch": 4.469208211143695, - "grad_norm": 0.7140301153665474, + "grad_norm": 0.6269525817460034, "learning_rate": 2.5794944896704572e-05, - "loss": 0.0818, - "mean_token_accuracy": 0.9770683497190475, + "loss": 0.0769, + "mean_token_accuracy": 0.9770293310284615, "step": 764 }, { "epoch": 4.475073313782991, - "grad_norm": 0.7204009133906272, + "grad_norm": 0.7346587688130491, "learning_rate": 2.5761416134562955e-05, - "loss": 0.0871, - "mean_token_accuracy": 0.9739867746829987, + "loss": 0.088, + "mean_token_accuracy": 0.9742003232240677, "step": 765 }, { "epoch": 4.480938416422287, - "grad_norm": 0.714625922360012, + "grad_norm": 0.7913759302559801, "learning_rate": 2.5727873719999904e-05, - "loss": 0.0821, - "mean_token_accuracy": 0.9775990322232246, + "loss": 0.0835, + "mean_token_accuracy": 0.9742173254489899, "step": 766 }, { "epoch": 4.486803519061583, - "grad_norm": 0.6987722867821043, + "grad_norm": 0.937686464289818, "learning_rate": 2.569431777476084e-05, - "loss": 0.0864, - "mean_token_accuracy": 0.9749646931886673, + "loss": 0.1006, + "mean_token_accuracy": 0.9709660932421684, "step": 767 }, { "epoch": 4.492668621700879, - "grad_norm": 0.5888159677898139, + "grad_norm": 0.5744331833304884, "learning_rate": 2.566074842064029e-05, - "loss": 0.0711, - "mean_token_accuracy": 0.9794390574097633, + "loss": 0.0745, + "mean_token_accuracy": 0.9800982251763344, "step": 768 }, { "epoch": 4.4985337243401755, - "grad_norm": 0.5808906320753181, + "grad_norm": 0.5713487078602227, "learning_rate": 2.562716577948145e-05, - "loss": 0.0768, - "mean_token_accuracy": 0.9758628606796265, + "loss": 0.0816, + "mean_token_accuracy": 0.9764345660805702, "step": 769 }, { "epoch": 4.504398826979472, - "grad_norm": 0.6265757199170996, + "grad_norm": 0.6901009094958664, "learning_rate": 2.5593569973175757e-05, - "loss": 0.0819, - "mean_token_accuracy": 0.9730802923440933, + "loss": 0.0818, + "mean_token_accuracy": 0.9730259105563164, "step": 770 }, { "epoch": 4.510263929618768, - "grad_norm": 0.6979620561688771, + "grad_norm": 0.6757180129841208, "learning_rate": 2.5559961123662405e-05, - "loss": 0.0826, - "mean_token_accuracy": 0.9771312400698662, + "loss": 0.0834, + "mean_token_accuracy": 0.976299561560154, "step": 771 }, { "epoch": 4.516129032258064, - "grad_norm": 0.7445338952150568, + "grad_norm": 0.802104682961351, "learning_rate": 2.5526339352927956e-05, - "loss": 0.0901, - "mean_token_accuracy": 0.9738541170954704, + "loss": 0.0972, + "mean_token_accuracy": 0.9707585051655769, "step": 772 }, { "epoch": 4.52199413489736, - "grad_norm": 0.8162017405595724, + "grad_norm": 0.5581852249169775, "learning_rate": 2.5492704783005847e-05, - "loss": 0.0855, - "mean_token_accuracy": 0.9749506264925003, + "loss": 0.0856, + "mean_token_accuracy": 0.9735557809472084, "step": 773 }, { "epoch": 4.527859237536656, - "grad_norm": 0.8311724502956283, + "grad_norm": 0.9420509929703049, "learning_rate": 2.5459057535975985e-05, - "loss": 0.0909, - "mean_token_accuracy": 0.9752181246876717, + "loss": 0.1087, + "mean_token_accuracy": 0.9727348163723946, "step": 774 }, { "epoch": 4.533724340175953, - "grad_norm": 0.8339294840277889, + "grad_norm": 0.8421175681685341, "learning_rate": 2.542539773396429e-05, - "loss": 0.0929, - "mean_token_accuracy": 0.9707833006978035, + "loss": 0.0951, + "mean_token_accuracy": 0.9689114019274712, "step": 775 }, { "epoch": 4.539589442815249, - "grad_norm": 0.8200285823098186, + "grad_norm": 0.822523743674553, "learning_rate": 2.5391725499142253e-05, - "loss": 0.0995, - "mean_token_accuracy": 0.9717404097318649, + "loss": 0.1019, + "mean_token_accuracy": 0.9720019325613976, "step": 776 }, { "epoch": 4.545454545454545, - "grad_norm": 0.6868234256963321, + "grad_norm": 0.7865828546712166, "learning_rate": 2.535804095372648e-05, - "loss": 0.0848, - "mean_token_accuracy": 0.9748669788241386, + "loss": 0.0888, + "mean_token_accuracy": 0.9751103669404984, "step": 777 }, { "epoch": 4.551319648093841, - "grad_norm": 0.7683932196268053, + "grad_norm": 0.8382216485189787, "learning_rate": 2.5324344219978273e-05, - "loss": 0.0909, - "mean_token_accuracy": 0.9719524756073952, + "loss": 0.0917, + "mean_token_accuracy": 0.9712260961532593, "step": 778 }, { "epoch": 4.557184750733137, - "grad_norm": 0.8618259518246234, + "grad_norm": 0.8165510195730435, "learning_rate": 2.5290635420203162e-05, - "loss": 0.0941, - "mean_token_accuracy": 0.9723029881715775, + "loss": 0.0929, + "mean_token_accuracy": 0.973741427063942, "step": 779 }, { "epoch": 4.563049853372434, - "grad_norm": 0.6999101687652451, + "grad_norm": 0.6822046725758385, "learning_rate": 2.525691467675048e-05, - "loss": 0.0877, - "mean_token_accuracy": 0.9769936203956604, + "loss": 0.0922, + "mean_token_accuracy": 0.9758124351501465, "step": 780 }, { "epoch": 4.568914956011731, - "grad_norm": 0.6504309188678539, + "grad_norm": 0.7259563170378975, "learning_rate": 2.5223182112012897e-05, - "loss": 0.085, - "mean_token_accuracy": 0.9745290204882622, + "loss": 0.082, + "mean_token_accuracy": 0.9766591638326645, "step": 781 }, { "epoch": 4.574780058651027, - "grad_norm": 0.5824157915995132, + "grad_norm": 0.5910130365914025, "learning_rate": 2.5189437848426016e-05, - "loss": 0.0671, - "mean_token_accuracy": 0.9795333445072174, + "loss": 0.0731, + "mean_token_accuracy": 0.9782171174883842, "step": 782 }, { "epoch": 4.580645161290323, - "grad_norm": 0.8952614421196792, + "grad_norm": 1.024807120825259, "learning_rate": 2.515568200846787e-05, - "loss": 0.1028, - "mean_token_accuracy": 0.9680057391524315, + "loss": 0.1114, + "mean_token_accuracy": 0.9679220989346504, "step": 783 }, { "epoch": 4.586510263929619, - "grad_norm": 0.715413936449577, + "grad_norm": 0.8080385542671041, "learning_rate": 2.5121914714658526e-05, - "loss": 0.0877, - "mean_token_accuracy": 0.9725442752242088, + "loss": 0.0928, + "mean_token_accuracy": 0.9725649207830429, "step": 784 }, { "epoch": 4.592375366568915, - "grad_norm": 0.6999767791173405, + "grad_norm": 0.6671413109766742, "learning_rate": 2.5088136089559636e-05, - "loss": 0.0772, - "mean_token_accuracy": 0.9761156216263771, + "loss": 0.0768, + "mean_token_accuracy": 0.977656364440918, "step": 785 }, { "epoch": 4.5982404692082115, - "grad_norm": 0.7377420435043683, + "grad_norm": 0.6696595768279495, "learning_rate": 2.5054346255773952e-05, - "loss": 0.0711, - "mean_token_accuracy": 0.9770561680197716, + "loss": 0.0743, + "mean_token_accuracy": 0.977767638862133, "step": 786 }, { "epoch": 4.604105571847508, - "grad_norm": 0.8856700189046613, + "grad_norm": 0.8235186961278428, "learning_rate": 2.502054533594493e-05, - "loss": 0.0823, - "mean_token_accuracy": 0.975522093474865, + "loss": 0.0919, + "mean_token_accuracy": 0.9731400832533836, "step": 787 }, { "epoch": 4.609970674486804, - "grad_norm": 0.8051438952416194, + "grad_norm": 0.8741694871387966, "learning_rate": 2.4986733452756264e-05, - "loss": 0.098, - "mean_token_accuracy": 0.974338486790657, + "loss": 0.0935, + "mean_token_accuracy": 0.975216455757618, "step": 788 }, { "epoch": 4.6158357771261, - "grad_norm": 0.6892859753641255, + "grad_norm": 0.6077795411761349, "learning_rate": 2.495291072893142e-05, - "loss": 0.0865, - "mean_token_accuracy": 0.9751565381884575, + "loss": 0.0945, + "mean_token_accuracy": 0.9746409058570862, "step": 789 }, { "epoch": 4.621700879765396, - "grad_norm": 0.731544286085826, + "grad_norm": 0.9026723260115013, "learning_rate": 2.4919077287233237e-05, - "loss": 0.091, - "mean_token_accuracy": 0.975447840988636, + "loss": 0.0936, + "mean_token_accuracy": 0.9729236662387848, "step": 790 }, { "epoch": 4.627565982404692, - "grad_norm": 0.7861172613577541, + "grad_norm": 0.8316840710889603, "learning_rate": 2.4885233250463445e-05, - "loss": 0.0939, - "mean_token_accuracy": 0.9728690907359123, + "loss": 0.0924, + "mean_token_accuracy": 0.9731330275535583, "step": 791 }, { "epoch": 4.633431085043989, - "grad_norm": 0.7402637083225126, + "grad_norm": 0.8262082417744588, "learning_rate": 2.485137874146222e-05, - "loss": 0.0921, - "mean_token_accuracy": 0.9696981385350227, + "loss": 0.0927, + "mean_token_accuracy": 0.9719960913062096, "step": 792 }, { "epoch": 4.639296187683285, - "grad_norm": 0.8249445529463938, + "grad_norm": 0.7938102499987799, "learning_rate": 2.4817513883107762e-05, - "loss": 0.1127, - "mean_token_accuracy": 0.9655818939208984, + "loss": 0.1038, + "mean_token_accuracy": 0.9677957743406296, "step": 793 }, { "epoch": 4.645161290322581, - "grad_norm": 0.7805981393705633, + "grad_norm": 0.6991244736753061, "learning_rate": 2.4783638798315822e-05, - "loss": 0.0865, - "mean_token_accuracy": 0.9749421775341034, + "loss": 0.0836, + "mean_token_accuracy": 0.9754368960857391, "step": 794 }, { "epoch": 4.651026392961877, - "grad_norm": 0.693516849302169, + "grad_norm": 0.7542382340948242, "learning_rate": 2.4749753610039288e-05, - "loss": 0.0767, - "mean_token_accuracy": 0.9765638262033463, + "loss": 0.0806, + "mean_token_accuracy": 0.9750531688332558, "step": 795 }, { "epoch": 4.656891495601173, - "grad_norm": 0.6592370960932631, + "grad_norm": 0.7190292471649612, "learning_rate": 2.4715858441267706e-05, - "loss": 0.0848, - "mean_token_accuracy": 0.9731608182191849, + "loss": 0.0862, + "mean_token_accuracy": 0.9736650809645653, "step": 796 }, { "epoch": 4.6627565982404695, - "grad_norm": 0.8410351105686258, + "grad_norm": 0.9829058369454252, "learning_rate": 2.4681953415026845e-05, - "loss": 0.0947, - "mean_token_accuracy": 0.9714157283306122, + "loss": 0.1087, + "mean_token_accuracy": 0.9707975834608078, "step": 797 }, { "epoch": 4.668621700879766, - "grad_norm": 0.6292824938428746, + "grad_norm": 0.6830295915194216, "learning_rate": 2.464803865437826e-05, - "loss": 0.0805, - "mean_token_accuracy": 0.9759851396083832, + "loss": 0.0882, + "mean_token_accuracy": 0.9731217548251152, "step": 798 }, { "epoch": 4.674486803519062, - "grad_norm": 0.8578069940125966, + "grad_norm": 0.8978252666950851, "learning_rate": 2.461411428241883e-05, - "loss": 0.0975, - "mean_token_accuracy": 0.9713935777544975, + "loss": 0.1053, + "mean_token_accuracy": 0.9693214297294617, "step": 799 }, { "epoch": 4.680351906158358, - "grad_norm": 0.6320034192833647, + "grad_norm": 0.5435088680199814, "learning_rate": 2.4580180422280325e-05, - "loss": 0.0824, - "mean_token_accuracy": 0.9738278761506081, + "loss": 0.0811, + "mean_token_accuracy": 0.9754936322569847, "step": 800 }, { "epoch": 4.686217008797654, - "grad_norm": 0.7988988939828763, + "grad_norm": 0.7490474220611864, "learning_rate": 2.4546237197128955e-05, - "loss": 0.0945, - "mean_token_accuracy": 0.9745538905262947, + "loss": 0.096, + "mean_token_accuracy": 0.973475269973278, "step": 801 }, { "epoch": 4.69208211143695, - "grad_norm": 0.7108652667816605, + "grad_norm": 0.7093663497421644, "learning_rate": 2.451228473016492e-05, - "loss": 0.0821, - "mean_token_accuracy": 0.9754965752363205, + "loss": 0.0856, + "mean_token_accuracy": 0.9768622368574142, "step": 802 }, { "epoch": 4.697947214076247, - "grad_norm": 0.7536562662597699, + "grad_norm": 0.6479956911087104, "learning_rate": 2.447832314462196e-05, - "loss": 0.0919, - "mean_token_accuracy": 0.9743342474102974, + "loss": 0.0902, + "mean_token_accuracy": 0.9753652885556221, "step": 803 }, { "epoch": 4.703812316715543, - "grad_norm": 0.6242244952368606, + "grad_norm": 0.6097972135855906, "learning_rate": 2.444435256376692e-05, - "loss": 0.0798, - "mean_token_accuracy": 0.976905569434166, + "loss": 0.0799, + "mean_token_accuracy": 0.9760509207844734, "step": 804 }, { "epoch": 4.709677419354839, - "grad_norm": 0.6397473344753726, + "grad_norm": 0.7427447154179425, "learning_rate": 2.4410373110899278e-05, - "loss": 0.0712, - "mean_token_accuracy": 0.9791742563247681, + "loss": 0.0771, + "mean_token_accuracy": 0.978294849395752, "step": 805 }, { "epoch": 4.715542521994135, - "grad_norm": 0.7440219387317061, + "grad_norm": 0.6884469910478106, "learning_rate": 2.4376384909350735e-05, - "loss": 0.1014, - "mean_token_accuracy": 0.9721665903925896, + "loss": 0.0932, + "mean_token_accuracy": 0.9721456244587898, "step": 806 }, { "epoch": 4.721407624633431, - "grad_norm": 0.6325421602189405, + "grad_norm": 0.7936450834045352, "learning_rate": 2.434238808248472e-05, - "loss": 0.0807, - "mean_token_accuracy": 0.9754432812333107, + "loss": 0.0833, + "mean_token_accuracy": 0.974332258105278, "step": 807 }, { "epoch": 4.7272727272727275, - "grad_norm": 0.6613651496769305, + "grad_norm": 0.8518884554725347, "learning_rate": 2.4308382753696e-05, - "loss": 0.0844, - "mean_token_accuracy": 0.9745671674609184, + "loss": 0.0917, + "mean_token_accuracy": 0.973288893699646, "step": 808 }, { "epoch": 4.733137829912024, - "grad_norm": 0.655517420614535, + "grad_norm": 0.7828384801271718, "learning_rate": 2.4274369046410183e-05, - "loss": 0.0873, - "mean_token_accuracy": 0.9765310138463974, + "loss": 0.098, + "mean_token_accuracy": 0.9744915068149567, "step": 809 }, { "epoch": 4.73900293255132, - "grad_norm": 0.6824568045638673, + "grad_norm": 0.615178927181312, "learning_rate": 2.4240347084083284e-05, - "loss": 0.0883, - "mean_token_accuracy": 0.9757503718137741, + "loss": 0.08, + "mean_token_accuracy": 0.974980354309082, "step": 810 }, { "epoch": 4.744868035190616, - "grad_norm": 0.7344137920344779, + "grad_norm": 0.8838518898607127, "learning_rate": 2.4206316990201288e-05, - "loss": 0.091, - "mean_token_accuracy": 0.9691428020596504, + "loss": 0.0936, + "mean_token_accuracy": 0.9703791290521622, "step": 811 }, { "epoch": 4.750733137829912, - "grad_norm": 0.7658530881044803, + "grad_norm": 0.6907232787969249, "learning_rate": 2.4172278888279686e-05, - "loss": 0.1013, - "mean_token_accuracy": 0.972835585474968, + "loss": 0.0984, + "mean_token_accuracy": 0.9722641780972481, "step": 812 }, { "epoch": 4.756598240469208, - "grad_norm": 0.682396678860371, + "grad_norm": 1.029666605767164, "learning_rate": 2.4138232901863053e-05, - "loss": 0.0838, - "mean_token_accuracy": 0.9742336198687553, + "loss": 0.0943, + "mean_token_accuracy": 0.971537858247757, "step": 813 }, { "epoch": 4.762463343108505, - "grad_norm": 0.7462145098935278, + "grad_norm": 0.7159086498080418, "learning_rate": 2.4104179154524557e-05, - "loss": 0.0843, - "mean_token_accuracy": 0.9767311811447144, + "loss": 0.082, + "mean_token_accuracy": 0.9783857464790344, "step": 814 }, { "epoch": 4.768328445747801, - "grad_norm": 0.647595797993278, + "grad_norm": 0.649220942218314, "learning_rate": 2.4070117769865554e-05, - "loss": 0.074, - "mean_token_accuracy": 0.9771558046340942, + "loss": 0.0831, + "mean_token_accuracy": 0.9754119515419006, "step": 815 }, { "epoch": 4.774193548387097, - "grad_norm": 0.6375854586116831, + "grad_norm": 0.7457451847390401, "learning_rate": 2.403604887151512e-05, - "loss": 0.0825, - "mean_token_accuracy": 0.9733422249555588, + "loss": 0.0912, + "mean_token_accuracy": 0.9736931174993515, "step": 816 }, { "epoch": 4.780058651026393, - "grad_norm": 0.6430806428065273, + "grad_norm": 0.8083203917249773, "learning_rate": 2.400197258312959e-05, - "loss": 0.0761, - "mean_token_accuracy": 0.9772883579134941, + "loss": 0.0766, + "mean_token_accuracy": 0.9776452630758286, "step": 817 }, { "epoch": 4.785923753665689, - "grad_norm": 0.7027690903069118, + "grad_norm": 0.5884376674496129, "learning_rate": 2.3967889028392115e-05, - "loss": 0.0732, - "mean_token_accuracy": 0.9762856140732765, + "loss": 0.0779, + "mean_token_accuracy": 0.9779343828558922, "step": 818 }, { "epoch": 4.7917888563049855, - "grad_norm": 0.7915351422852497, + "grad_norm": 0.71170792279826, "learning_rate": 2.3933798331012255e-05, - "loss": 0.0941, - "mean_token_accuracy": 0.971655435860157, + "loss": 0.086, + "mean_token_accuracy": 0.9734879434108734, "step": 819 }, { "epoch": 4.797653958944282, - "grad_norm": 0.8230077673870994, + "grad_norm": 0.7493548129924873, "learning_rate": 2.3899700614725458e-05, - "loss": 0.0973, - "mean_token_accuracy": 0.9686101600527763, + "loss": 0.0921, + "mean_token_accuracy": 0.9694371744990349, "step": 820 }, { "epoch": 4.803519061583578, - "grad_norm": 0.8204373001199515, + "grad_norm": 0.8035228450448255, "learning_rate": 2.3865596003292674e-05, - "loss": 0.097, - "mean_token_accuracy": 0.9719519168138504, + "loss": 0.0914, + "mean_token_accuracy": 0.9717112332582474, "step": 821 }, { "epoch": 4.809384164222874, - "grad_norm": 0.7083813400624878, + "grad_norm": 0.7789660984026828, "learning_rate": 2.3831484620499867e-05, - "loss": 0.0842, - "mean_token_accuracy": 0.976848654448986, + "loss": 0.0859, + "mean_token_accuracy": 0.9748634770512581, "step": 822 }, { "epoch": 4.81524926686217, - "grad_norm": 0.7705104881876454, + "grad_norm": 0.712380771905728, "learning_rate": 2.3797366590157565e-05, - "loss": 0.0997, - "mean_token_accuracy": 0.9676300510764122, + "loss": 0.1038, + "mean_token_accuracy": 0.9669318273663521, "step": 823 }, { "epoch": 4.821114369501466, - "grad_norm": 0.726991332245595, + "grad_norm": 0.6611721434012169, "learning_rate": 2.3763242036100457e-05, - "loss": 0.0831, - "mean_token_accuracy": 0.974996529519558, + "loss": 0.0814, + "mean_token_accuracy": 0.9744805321097374, "step": 824 }, { "epoch": 4.826979472140763, - "grad_norm": 0.6741623993634527, + "grad_norm": 0.6735485939113598, "learning_rate": 2.372911108218688e-05, - "loss": 0.0863, - "mean_token_accuracy": 0.9751091077923775, + "loss": 0.0835, + "mean_token_accuracy": 0.9754609763622284, "step": 825 }, { "epoch": 4.832844574780059, - "grad_norm": 0.899437782138063, + "grad_norm": 0.7840099401445052, "learning_rate": 2.3694973852298425e-05, - "loss": 0.1008, - "mean_token_accuracy": 0.9714139476418495, + "loss": 0.0952, + "mean_token_accuracy": 0.9717969074845314, "step": 826 }, { "epoch": 4.838709677419355, - "grad_norm": 0.702151665184058, + "grad_norm": 0.7131184393140502, "learning_rate": 2.3660830470339436e-05, - "loss": 0.0814, - "mean_token_accuracy": 0.9744185507297516, + "loss": 0.0806, + "mean_token_accuracy": 0.9741077572107315, "step": 827 }, { "epoch": 4.844574780058651, - "grad_norm": 0.6507730374741707, + "grad_norm": 0.6852088395751421, "learning_rate": 2.362668106023661e-05, - "loss": 0.0829, - "mean_token_accuracy": 0.9753250107169151, + "loss": 0.0905, + "mean_token_accuracy": 0.9730967953801155, "step": 828 }, { "epoch": 4.850439882697947, - "grad_norm": 0.9585116172423198, + "grad_norm": 0.9163240926554376, "learning_rate": 2.3592525745938515e-05, - "loss": 0.0916, - "mean_token_accuracy": 0.974070705473423, + "loss": 0.094, + "mean_token_accuracy": 0.9740098342299461, "step": 829 }, { "epoch": 4.8563049853372435, - "grad_norm": 0.6456555295026087, + "grad_norm": 0.7099935602934261, "learning_rate": 2.355836465141513e-05, - "loss": 0.0765, - "mean_token_accuracy": 0.9768242910504341, + "loss": 0.0752, + "mean_token_accuracy": 0.9769460931420326, "step": 830 }, { "epoch": 4.86217008797654, - "grad_norm": 0.7629455851838094, + "grad_norm": 1.0067753814550802, "learning_rate": 2.3524197900657447e-05, - "loss": 0.1012, - "mean_token_accuracy": 0.9716575890779495, + "loss": 0.1104, + "mean_token_accuracy": 0.9696153551340103, "step": 831 }, { "epoch": 4.868035190615836, - "grad_norm": 0.6385751398243038, + "grad_norm": 0.6432755158012772, "learning_rate": 2.3490025617676966e-05, - "loss": 0.0785, - "mean_token_accuracy": 0.978433296084404, + "loss": 0.077, + "mean_token_accuracy": 0.9786104187369347, "step": 832 }, { "epoch": 4.873900293255132, - "grad_norm": 0.7762741748913455, + "grad_norm": 0.804250963741737, "learning_rate": 2.3455847926505283e-05, - "loss": 0.0998, - "mean_token_accuracy": 0.9690258279442787, + "loss": 0.0955, + "mean_token_accuracy": 0.9707959890365601, "step": 833 }, { "epoch": 4.879765395894428, - "grad_norm": 0.6712371371046408, + "grad_norm": 0.6226123101954701, "learning_rate": 2.3421664951193596e-05, - "loss": 0.0875, - "mean_token_accuracy": 0.9740894809365273, + "loss": 0.0913, + "mean_token_accuracy": 0.9721181318163872, "step": 834 }, { "epoch": 4.885630498533724, - "grad_norm": 0.7441124698561299, + "grad_norm": 0.6065233416312935, "learning_rate": 2.3387476815812313e-05, - "loss": 0.0927, - "mean_token_accuracy": 0.9717249646782875, + "loss": 0.0852, + "mean_token_accuracy": 0.9747011289000511, "step": 835 }, { "epoch": 4.891495601173021, - "grad_norm": 0.8250944839663914, + "grad_norm": 0.8683808500129969, "learning_rate": 2.3353283644450556e-05, - "loss": 0.1051, - "mean_token_accuracy": 0.9686624780297279, + "loss": 0.0979, + "mean_token_accuracy": 0.9719295874238014, "step": 836 }, { "epoch": 4.897360703812317, - "grad_norm": 0.7052455188209211, + "grad_norm": 0.6831530034869415, "learning_rate": 2.3319085561215724e-05, - "loss": 0.0904, - "mean_token_accuracy": 0.9723523110151291, + "loss": 0.0878, + "mean_token_accuracy": 0.9732778072357178, "step": 837 }, { "epoch": 4.903225806451613, - "grad_norm": 0.7239462558423183, + "grad_norm": 0.8048778653317876, "learning_rate": 2.328488269023305e-05, - "loss": 0.0825, - "mean_token_accuracy": 0.9772523939609528, + "loss": 0.0865, + "mean_token_accuracy": 0.9767483249306679, "step": 838 }, { "epoch": 4.909090909090909, - "grad_norm": 0.7180840126524404, + "grad_norm": 0.8918128316293171, "learning_rate": 2.3250675155645136e-05, - "loss": 0.0864, - "mean_token_accuracy": 0.9757138639688492, + "loss": 0.0925, + "mean_token_accuracy": 0.9727633222937584, "step": 839 }, { "epoch": 4.914956011730205, - "grad_norm": 0.6810604724605698, + "grad_norm": 0.6235830647708027, "learning_rate": 2.3216463081611525e-05, - "loss": 0.0734, - "mean_token_accuracy": 0.9767258539795876, + "loss": 0.0764, + "mean_token_accuracy": 0.9749378114938736, "step": 840 }, { "epoch": 4.9208211143695015, - "grad_norm": 0.9004959050046102, + "grad_norm": 1.0727573864438122, "learning_rate": 2.3182246592308235e-05, - "loss": 0.1048, - "mean_token_accuracy": 0.9709803834557533, + "loss": 0.1065, + "mean_token_accuracy": 0.971137098968029, "step": 841 }, { "epoch": 4.926686217008798, - "grad_norm": 0.6903589644398147, + "grad_norm": 0.6587157219709209, "learning_rate": 2.314802581192728e-05, - "loss": 0.0861, - "mean_token_accuracy": 0.9735254496335983, + "loss": 0.0892, + "mean_token_accuracy": 0.9734735786914825, "step": 842 }, { "epoch": 4.932551319648094, - "grad_norm": 0.9388649809230527, + "grad_norm": 0.9278686915137605, "learning_rate": 2.311380086467629e-05, - "loss": 0.1087, - "mean_token_accuracy": 0.9675555154681206, + "loss": 0.1097, + "mean_token_accuracy": 0.9702532961964607, "step": 843 }, { "epoch": 4.93841642228739, - "grad_norm": 0.6728193407529811, + "grad_norm": 0.6510783115585961, "learning_rate": 2.3079571874778e-05, - "loss": 0.0933, - "mean_token_accuracy": 0.9730138704180717, + "loss": 0.0923, + "mean_token_accuracy": 0.972200870513916, "step": 844 }, { "epoch": 4.944281524926686, - "grad_norm": 0.6533598028959027, + "grad_norm": 0.6796666479114434, "learning_rate": 2.304533896646981e-05, - "loss": 0.0847, - "mean_token_accuracy": 0.9751003682613373, + "loss": 0.0896, + "mean_token_accuracy": 0.9741808176040649, "step": 845 }, { "epoch": 4.9501466275659824, - "grad_norm": 0.6164310940206862, + "grad_norm": 0.5588006016435534, "learning_rate": 2.3011102264003354e-05, - "loss": 0.0775, - "mean_token_accuracy": 0.9751841053366661, + "loss": 0.0866, + "mean_token_accuracy": 0.9756291657686234, "step": 846 }, { "epoch": 4.956011730205279, - "grad_norm": 0.6639517976692416, + "grad_norm": 0.6515199213439168, "learning_rate": 2.2976861891644045e-05, - "loss": 0.0836, - "mean_token_accuracy": 0.9767726510763168, + "loss": 0.0862, + "mean_token_accuracy": 0.9755071625113487, "step": 847 }, { "epoch": 4.961876832844575, - "grad_norm": 0.62090714661712, + "grad_norm": 0.6597011754000468, "learning_rate": 2.2942617973670596e-05, - "loss": 0.0728, - "mean_token_accuracy": 0.9758273363113403, + "loss": 0.0841, + "mean_token_accuracy": 0.9742411524057388, "step": 848 }, { "epoch": 4.967741935483871, - "grad_norm": 0.8452829280392841, + "grad_norm": 0.8595960189097062, "learning_rate": 2.2908370634374603e-05, - "loss": 0.1073, - "mean_token_accuracy": 0.9687742963433266, + "loss": 0.1119, + "mean_token_accuracy": 0.9660947695374489, "step": 849 }, { "epoch": 4.973607038123167, - "grad_norm": 0.6759201564046626, + "grad_norm": 0.6326753578882204, "learning_rate": 2.287411999806007e-05, - "loss": 0.0801, - "mean_token_accuracy": 0.9753365591168404, + "loss": 0.0803, + "mean_token_accuracy": 0.9757585823535919, "step": 850 }, { "epoch": 4.979472140762463, - "grad_norm": 0.8874012153781123, + "grad_norm": 0.8954416714362815, "learning_rate": 2.2839866189042983e-05, - "loss": 0.0851, - "mean_token_accuracy": 0.9758308529853821, + "loss": 0.0928, + "mean_token_accuracy": 0.9719807282090187, "step": 851 }, { "epoch": 4.9853372434017595, - "grad_norm": 0.6394628435704744, + "grad_norm": 0.7040679174576174, "learning_rate": 2.2805609331650826e-05, - "loss": 0.0887, - "mean_token_accuracy": 0.9720895141363144, + "loss": 0.0987, + "mean_token_accuracy": 0.9714400470256805, "step": 852 }, { "epoch": 4.991202346041056, - "grad_norm": 0.6712758849886379, + "grad_norm": 0.7406739814087293, "learning_rate": 2.2771349550222158e-05, - "loss": 0.0802, - "mean_token_accuracy": 0.9771791622042656, + "loss": 0.0817, + "mean_token_accuracy": 0.9749223962426186, "step": 853 }, { "epoch": 4.997067448680352, - "grad_norm": 0.6076118168062584, + "grad_norm": 0.5768671268589982, "learning_rate": 2.273708696910616e-05, - "loss": 0.0766, - "mean_token_accuracy": 0.9765864163637161, + "loss": 0.0804, + "mean_token_accuracy": 0.9751258715987206, "step": 854 }, { "epoch": 5.0, - "grad_norm": 0.6076118168062584, + "grad_norm": 0.5768671268589982, "learning_rate": 2.2702821712662147e-05, - "loss": 0.073, - "mean_token_accuracy": 0.9801962226629257, + "loss": 0.0683, + "mean_token_accuracy": 0.9784054607152939, "step": 855 }, { "epoch": 5.005865102639296, - "grad_norm": 1.0026765048680202, + "grad_norm": 0.8099033611603849, "learning_rate": 2.2668553905259168e-05, - "loss": 0.0693, - "mean_token_accuracy": 0.9787380993366241, + "loss": 0.0681, + "mean_token_accuracy": 0.978220023214817, "step": 856 }, { "epoch": 5.011730205278592, - "grad_norm": 0.4886645531830975, + "grad_norm": 0.4450190361943267, "learning_rate": 2.2634283671275523e-05, - "loss": 0.0589, - "mean_token_accuracy": 0.9821438789367676, + "loss": 0.06, + "mean_token_accuracy": 0.9821461737155914, "step": 857 }, { "epoch": 5.0175953079178885, - "grad_norm": 0.5221300385899804, + "grad_norm": 0.5790492440675818, "learning_rate": 2.2600011135098323e-05, - "loss": 0.0647, - "mean_token_accuracy": 0.980193242430687, + "loss": 0.0715, + "mean_token_accuracy": 0.9782145693898201, "step": 858 }, { "epoch": 5.023460410557185, - "grad_norm": 0.506920475477819, + "grad_norm": 0.369951752998347, "learning_rate": 2.2565736421123035e-05, - "loss": 0.0759, - "mean_token_accuracy": 0.9781730622053146, + "loss": 0.0697, + "mean_token_accuracy": 0.9796851649880409, "step": 859 }, { "epoch": 5.029325513196481, - "grad_norm": 0.7195873400454875, + "grad_norm": 0.6060126099707575, "learning_rate": 2.253145965375302e-05, - "loss": 0.0839, - "mean_token_accuracy": 0.974876344203949, + "loss": 0.0875, + "mean_token_accuracy": 0.9720205143094063, "step": 860 }, { "epoch": 5.035190615835777, - "grad_norm": 0.5533130891308242, + "grad_norm": 0.6165958564660802, "learning_rate": 2.2497180957399108e-05, - "loss": 0.0788, - "mean_token_accuracy": 0.9747349694371223, + "loss": 0.0866, + "mean_token_accuracy": 0.9744477048516273, "step": 861 }, { "epoch": 5.041055718475073, - "grad_norm": 0.6804425923592532, + "grad_norm": 0.6771859544459322, "learning_rate": 2.246290045647912e-05, - "loss": 0.0643, - "mean_token_accuracy": 0.9811800122261047, + "loss": 0.0723, + "mean_token_accuracy": 0.9792094677686691, "step": 862 }, { "epoch": 5.0469208211143695, - "grad_norm": 0.5660705321290153, + "grad_norm": 0.7193259034964399, "learning_rate": 2.242861827541742e-05, - "loss": 0.0598, - "mean_token_accuracy": 0.9806637167930603, + "loss": 0.0634, + "mean_token_accuracy": 0.9807204157114029, "step": 863 }, { "epoch": 5.052785923753666, - "grad_norm": 0.4533661141133845, + "grad_norm": 0.5659196503147593, "learning_rate": 2.2394334538644494e-05, - "loss": 0.0683, - "mean_token_accuracy": 0.9784892648458481, + "loss": 0.0772, + "mean_token_accuracy": 0.976197637617588, "step": 864 }, { "epoch": 5.058651026392962, - "grad_norm": 0.7256930320993769, + "grad_norm": 0.7669216428482878, "learning_rate": 2.2360049370596454e-05, - "loss": 0.0753, - "mean_token_accuracy": 0.9793033376336098, + "loss": 0.0726, + "mean_token_accuracy": 0.9803265184164047, "step": 865 }, { "epoch": 5.064516129032258, - "grad_norm": 0.7563840557949948, + "grad_norm": 0.6675976013272769, "learning_rate": 2.2325762895714616e-05, - "loss": 0.0776, - "mean_token_accuracy": 0.9749964252114296, + "loss": 0.0763, + "mean_token_accuracy": 0.9768341109156609, "step": 866 }, { "epoch": 5.070381231671554, - "grad_norm": 0.6027240259420076, + "grad_norm": 0.6310094477428734, "learning_rate": 2.2291475238445033e-05, - "loss": 0.0748, - "mean_token_accuracy": 0.977846160531044, + "loss": 0.0768, + "mean_token_accuracy": 0.9780902415513992, "step": 867 }, { "epoch": 5.07624633431085, - "grad_norm": 0.7312019193066984, + "grad_norm": 0.551208157538957, "learning_rate": 2.225718652323805e-05, - "loss": 0.0748, - "mean_token_accuracy": 0.9755967482924461, + "loss": 0.0774, + "mean_token_accuracy": 0.9757518768310547, "step": 868 }, { "epoch": 5.0821114369501466, - "grad_norm": 0.5400054143225983, + "grad_norm": 0.7407751441472306, "learning_rate": 2.2222896874547856e-05, - "loss": 0.08, - "mean_token_accuracy": 0.9775899350643158, + "loss": 0.096, + "mean_token_accuracy": 0.9750194400548935, "step": 869 }, { "epoch": 5.087976539589443, - "grad_norm": 0.749736952140823, + "grad_norm": 0.6068040244873181, "learning_rate": 2.2188606416832035e-05, - "loss": 0.063, - "mean_token_accuracy": 0.9818405732512474, + "loss": 0.0636, + "mean_token_accuracy": 0.9805742874741554, "step": 870 }, { "epoch": 5.093841642228739, - "grad_norm": 0.7238913429587271, + "grad_norm": 0.6185459789558916, "learning_rate": 2.2154315274551093e-05, - "loss": 0.0829, - "mean_token_accuracy": 0.9755804762244225, + "loss": 0.0812, + "mean_token_accuracy": 0.9773125946521759, "step": 871 }, { "epoch": 5.099706744868035, - "grad_norm": 0.7005006356881465, + "grad_norm": 0.5449089650374084, "learning_rate": 2.2120023572168026e-05, - "loss": 0.0678, - "mean_token_accuracy": 0.9808993488550186, + "loss": 0.0681, + "mean_token_accuracy": 0.9807572066783905, "step": 872 }, { "epoch": 5.105571847507331, - "grad_norm": 0.6109157423351927, + "grad_norm": 0.5466347088828135, "learning_rate": 2.208573143414787e-05, - "loss": 0.0637, - "mean_token_accuracy": 0.9813826605677605, + "loss": 0.0653, + "mean_token_accuracy": 0.9815317243337631, "step": 873 }, { "epoch": 5.1114369501466275, - "grad_norm": 0.5061420246936063, + "grad_norm": 0.6933178932755342, "learning_rate": 2.2051438984957234e-05, - "loss": 0.0657, - "mean_token_accuracy": 0.9802054837346077, + "loss": 0.0758, + "mean_token_accuracy": 0.9780973717570305, "step": 874 }, { "epoch": 5.117302052785924, - "grad_norm": 0.553717575776044, + "grad_norm": 0.5794805869329679, "learning_rate": 2.2017146349063855e-05, - "loss": 0.0792, - "mean_token_accuracy": 0.9763247072696686, + "loss": 0.0703, + "mean_token_accuracy": 0.9793791621923447, "step": 875 }, { "epoch": 5.12316715542522, - "grad_norm": 0.6863136450685561, + "grad_norm": 0.49280867315149984, "learning_rate": 2.1982853650936154e-05, - "loss": 0.0713, - "mean_token_accuracy": 0.9781040996313095, + "loss": 0.0778, + "mean_token_accuracy": 0.9783745110034943, "step": 876 }, { "epoch": 5.129032258064516, - "grad_norm": 0.5124269447535285, + "grad_norm": 0.6399855244458721, "learning_rate": 2.1948561015042772e-05, - "loss": 0.0733, - "mean_token_accuracy": 0.978808619081974, + "loss": 0.0769, + "mean_token_accuracy": 0.9801036417484283, "step": 877 }, { "epoch": 5.134897360703812, - "grad_norm": 0.6562503546420304, + "grad_norm": 0.658746744986044, "learning_rate": 2.1914268565852134e-05, - "loss": 0.0768, - "mean_token_accuracy": 0.9771819114685059, + "loss": 0.0792, + "mean_token_accuracy": 0.9752889424562454, "step": 878 }, { "epoch": 5.140762463343108, - "grad_norm": 0.5648878023806803, + "grad_norm": 0.6889876053257266, "learning_rate": 2.1879976427831983e-05, - "loss": 0.0746, - "mean_token_accuracy": 0.9797395169734955, + "loss": 0.0791, + "mean_token_accuracy": 0.9774133116006851, "step": 879 }, { "epoch": 5.146627565982405, - "grad_norm": 0.5567734178801955, + "grad_norm": 0.6564951399065133, "learning_rate": 2.1845684725448916e-05, - "loss": 0.0774, - "mean_token_accuracy": 0.9762084037065506, + "loss": 0.0853, + "mean_token_accuracy": 0.9751299694180489, "step": 880 }, { "epoch": 5.152492668621701, - "grad_norm": 0.604016753073881, + "grad_norm": 0.6990710395104928, "learning_rate": 2.181139358316797e-05, - "loss": 0.0799, - "mean_token_accuracy": 0.9763386994600296, + "loss": 0.0802, + "mean_token_accuracy": 0.9749646335840225, "step": 881 }, { "epoch": 5.158357771260997, - "grad_norm": 0.7020291349192862, + "grad_norm": 0.7796329399858077, "learning_rate": 2.1777103125452146e-05, - "loss": 0.0716, - "mean_token_accuracy": 0.9784782081842422, + "loss": 0.0746, + "mean_token_accuracy": 0.9780853018164635, "step": 882 }, { "epoch": 5.164222873900293, - "grad_norm": 0.7007843726478299, + "grad_norm": 0.7991341320122649, "learning_rate": 2.1742813476761958e-05, - "loss": 0.0827, - "mean_token_accuracy": 0.9760249108076096, + "loss": 0.0923, + "mean_token_accuracy": 0.973864234983921, "step": 883 }, { "epoch": 5.170087976539589, - "grad_norm": 0.5702016510876317, + "grad_norm": 0.5346267401371, "learning_rate": 2.1708524761554973e-05, - "loss": 0.0754, - "mean_token_accuracy": 0.9762247651815414, + "loss": 0.084, + "mean_token_accuracy": 0.9747622609138489, "step": 884 }, { "epoch": 5.1759530791788855, - "grad_norm": 0.5730616842732398, + "grad_norm": 0.7536037260952937, "learning_rate": 2.1674237104285393e-05, - "loss": 0.0668, - "mean_token_accuracy": 0.9794589728116989, + "loss": 0.0718, + "mean_token_accuracy": 0.9780940935015678, "step": 885 }, { "epoch": 5.181818181818182, - "grad_norm": 0.49051489661277814, + "grad_norm": 0.4613117403210394, "learning_rate": 2.1639950629403552e-05, - "loss": 0.0627, - "mean_token_accuracy": 0.9801739826798439, + "loss": 0.0623, + "mean_token_accuracy": 0.9808356538414955, "step": 886 }, { "epoch": 5.187683284457478, - "grad_norm": 0.6147350528773317, + "grad_norm": 0.5208424966805913, "learning_rate": 2.1605665461355515e-05, - "loss": 0.0748, - "mean_token_accuracy": 0.9772293791174889, + "loss": 0.076, + "mean_token_accuracy": 0.9781753346323967, "step": 887 }, { "epoch": 5.193548387096774, - "grad_norm": 0.574887287790943, + "grad_norm": 0.6828043686906896, "learning_rate": 2.1571381724582588e-05, - "loss": 0.0737, - "mean_token_accuracy": 0.9775480031967163, + "loss": 0.0966, + "mean_token_accuracy": 0.9742318615317345, "step": 888 }, { "epoch": 5.19941348973607, - "grad_norm": 0.5802402780421291, + "grad_norm": 0.6401984996207497, "learning_rate": 2.153709954352089e-05, - "loss": 0.0707, - "mean_token_accuracy": 0.9791269749403, + "loss": 0.0688, + "mean_token_accuracy": 0.978904701769352, "step": 889 }, { "epoch": 5.205278592375366, - "grad_norm": 0.5892378515440839, + "grad_norm": 0.6584127670692117, "learning_rate": 2.15028190426009e-05, "loss": 0.0795, - "mean_token_accuracy": 0.9762731716036797, + "mean_token_accuracy": 0.9756882935762405, "step": 890 }, { "epoch": 5.211143695014663, - "grad_norm": 0.561441923874465, + "grad_norm": 0.5242045901075086, "learning_rate": 2.1468540346246986e-05, - "loss": 0.0825, - "mean_token_accuracy": 0.9746378436684608, + "loss": 0.0732, + "mean_token_accuracy": 0.9772383645176888, "step": 891 }, { "epoch": 5.217008797653959, - "grad_norm": 0.7693860021671447, + "grad_norm": 0.7677673529791075, "learning_rate": 2.143426357887697e-05, - "loss": 0.0765, - "mean_token_accuracy": 0.9771846383810043, + "loss": 0.0838, + "mean_token_accuracy": 0.9748184457421303, "step": 892 }, { "epoch": 5.222873900293255, - "grad_norm": 0.5880074332718885, + "grad_norm": 0.6457074194873286, "learning_rate": 2.139998886490169e-05, - "loss": 0.0632, - "mean_token_accuracy": 0.9813052341341972, + "loss": 0.0644, + "mean_token_accuracy": 0.9803335294127464, "step": 893 }, { "epoch": 5.228739002932551, - "grad_norm": 0.5589521393363575, + "grad_norm": 0.5107497247898192, "learning_rate": 2.136571632872449e-05, - "loss": 0.0783, - "mean_token_accuracy": 0.9764862582087517, + "loss": 0.0784, + "mean_token_accuracy": 0.9764648675918579, "step": 894 }, { "epoch": 5.234604105571847, - "grad_norm": 0.6958118598247672, + "grad_norm": 0.6904728447258073, "learning_rate": 2.1331446094740845e-05, - "loss": 0.0813, - "mean_token_accuracy": 0.9756343215703964, + "loss": 0.0885, + "mean_token_accuracy": 0.974873885512352, "step": 895 }, { "epoch": 5.2404692082111435, - "grad_norm": 0.648605914341562, + "grad_norm": 0.7503481221592241, "learning_rate": 2.1297178287337865e-05, - "loss": 0.0736, - "mean_token_accuracy": 0.9772472456097603, + "loss": 0.0799, + "mean_token_accuracy": 0.9755254238843918, "step": 896 }, { "epoch": 5.24633431085044, - "grad_norm": 0.6859493516947144, + "grad_norm": 0.6840723971970181, "learning_rate": 2.1262913030893855e-05, - "loss": 0.0773, - "mean_token_accuracy": 0.9786031097173691, + "loss": 0.0876, + "mean_token_accuracy": 0.9740219637751579, "step": 897 }, { "epoch": 5.252199413489736, - "grad_norm": 0.6803915454335052, + "grad_norm": 0.8127484480176936, "learning_rate": 2.1228650449777848e-05, - "loss": 0.0798, - "mean_token_accuracy": 0.9753960222005844, + "loss": 0.0853, + "mean_token_accuracy": 0.9751054421067238, "step": 898 }, { "epoch": 5.258064516129032, - "grad_norm": 0.5949700617476797, + "grad_norm": 0.6498239243812226, "learning_rate": 2.1194390668349186e-05, - "loss": 0.0771, - "mean_token_accuracy": 0.9780300334095955, + "loss": 0.0796, + "mean_token_accuracy": 0.9772773683071136, "step": 899 }, { "epoch": 5.263929618768328, - "grad_norm": 0.7410245541650864, + "grad_norm": 1.1008053864242204, "learning_rate": 2.116013381095703e-05, - "loss": 0.0697, - "mean_token_accuracy": 0.9797634407877922, + "loss": 0.07, + "mean_token_accuracy": 0.9801924675703049, "step": 900 }, { "epoch": 5.269794721407624, - "grad_norm": 0.4455821502764853, + "grad_norm": 0.547889815369571, "learning_rate": 2.112588000193994e-05, - "loss": 0.0712, - "mean_token_accuracy": 0.9782530590891838, + "loss": 0.0771, + "mean_token_accuracy": 0.9764879941940308, "step": 901 }, { "epoch": 5.275659824046921, - "grad_norm": 0.6951534941196735, + "grad_norm": 0.8521673135262312, "learning_rate": 2.1091629365625403e-05, - "loss": 0.067, - "mean_token_accuracy": 0.979590117931366, + "loss": 0.078, + "mean_token_accuracy": 0.976124718785286, "step": 902 }, { "epoch": 5.281524926686217, - "grad_norm": 0.5693519513539006, + "grad_norm": 0.6082701446387903, "learning_rate": 2.105738202632941e-05, - "loss": 0.0795, - "mean_token_accuracy": 0.97862908244133, + "loss": 0.0874, + "mean_token_accuracy": 0.9764643460512161, "step": 903 }, { "epoch": 5.287390029325513, - "grad_norm": 0.6074837432744669, + "grad_norm": 0.6600651650672481, "learning_rate": 2.1023138108355957e-05, - "loss": 0.0611, - "mean_token_accuracy": 0.9821363985538483, + "loss": 0.0713, + "mean_token_accuracy": 0.9807606041431427, "step": 904 }, { "epoch": 5.293255131964809, - "grad_norm": 0.49778279285216703, + "grad_norm": 0.49312738828651276, "learning_rate": 2.098889773599665e-05, - "loss": 0.0843, - "mean_token_accuracy": 0.9768336862325668, + "loss": 0.0859, + "mean_token_accuracy": 0.9751582518219948, "step": 905 }, { "epoch": 5.299120234604105, - "grad_norm": 0.7254806511844373, + "grad_norm": 0.6702665428374299, "learning_rate": 2.0954661033530193e-05, - "loss": 0.0663, - "mean_token_accuracy": 0.9805325642228127, + "loss": 0.0695, + "mean_token_accuracy": 0.9784847497940063, "step": 906 }, { "epoch": 5.3049853372434015, - "grad_norm": 0.4628049857510997, + "grad_norm": 0.45413216504967313, "learning_rate": 2.0920428125222004e-05, - "loss": 0.0717, - "mean_token_accuracy": 0.9796890690922737, + "loss": 0.0733, + "mean_token_accuracy": 0.9797236323356628, "step": 907 }, { "epoch": 5.310850439882698, - "grad_norm": 0.48912434388430215, + "grad_norm": 0.5443492720347446, "learning_rate": 2.0886199135323712e-05, - "loss": 0.0773, - "mean_token_accuracy": 0.9784531816840172, + "loss": 0.0874, + "mean_token_accuracy": 0.9746130779385567, "step": 908 }, { "epoch": 5.316715542521994, - "grad_norm": 0.6523411765325368, + "grad_norm": 0.6727309078114576, "learning_rate": 2.085197418807272e-05, - "loss": 0.0677, - "mean_token_accuracy": 0.9798686727881432, + "loss": 0.0735, + "mean_token_accuracy": 0.9795581921935081, "step": 909 }, { "epoch": 5.32258064516129, - "grad_norm": 0.6280379928431413, + "grad_norm": 0.48912171922210684, "learning_rate": 2.0817753407691774e-05, - "loss": 0.0765, - "mean_token_accuracy": 0.9757416620850563, + "loss": 0.0766, + "mean_token_accuracy": 0.9777567386627197, "step": 910 }, { "epoch": 5.328445747800586, - "grad_norm": 0.6506838547506919, + "grad_norm": 0.5828476592220785, "learning_rate": 2.0783536918388477e-05, - "loss": 0.0823, - "mean_token_accuracy": 0.9750187322497368, + "loss": 0.0911, + "mean_token_accuracy": 0.9743940532207489, "step": 911 }, { "epoch": 5.334310850439882, - "grad_norm": 0.5740998141514729, + "grad_norm": 0.6715410606054701, "learning_rate": 2.0749324844354867e-05, - "loss": 0.0721, - "mean_token_accuracy": 0.9790320321917534, + "loss": 0.0719, + "mean_token_accuracy": 0.9782103151082993, "step": 912 }, { "epoch": 5.340175953079179, - "grad_norm": 0.6100250700673111, + "grad_norm": 0.672571913081858, "learning_rate": 2.0715117309766953e-05, - "loss": 0.0801, - "mean_token_accuracy": 0.9779443517327309, + "loss": 0.0692, + "mean_token_accuracy": 0.9785708412528038, "step": 913 }, { "epoch": 5.346041055718475, - "grad_norm": 0.6254016260556631, + "grad_norm": 0.4654116633126373, "learning_rate": 2.068091443878428e-05, - "loss": 0.0806, - "mean_token_accuracy": 0.9745582416653633, + "loss": 0.0776, + "mean_token_accuracy": 0.9766899198293686, "step": 914 }, { "epoch": 5.351906158357771, - "grad_norm": 0.6948395876064227, + "grad_norm": 0.7860851552775493, "learning_rate": 2.064671635554945e-05, - "loss": 0.0972, - "mean_token_accuracy": 0.9737641960382462, + "loss": 0.0908, + "mean_token_accuracy": 0.9766247346997261, "step": 915 }, { "epoch": 5.357771260997067, - "grad_norm": 0.7081713644165885, + "grad_norm": 0.670661561387845, "learning_rate": 2.0612523184187693e-05, - "loss": 0.066, - "mean_token_accuracy": 0.9779355525970459, + "loss": 0.0747, + "mean_token_accuracy": 0.9768082797527313, "step": 916 }, { "epoch": 5.363636363636363, - "grad_norm": 0.5471530801482298, + "grad_norm": 0.5827806964669583, "learning_rate": 2.057833504880641e-05, - "loss": 0.0723, - "mean_token_accuracy": 0.9762101992964745, + "loss": 0.0793, + "mean_token_accuracy": 0.974205270409584, "step": 917 }, { "epoch": 5.3695014662756595, - "grad_norm": 0.5378564258248137, + "grad_norm": 0.6679178730365597, "learning_rate": 2.054415207349473e-05, - "loss": 0.0774, - "mean_token_accuracy": 0.976969949901104, + "loss": 0.0842, + "mean_token_accuracy": 0.9742937907576561, "step": 918 }, { "epoch": 5.375366568914956, - "grad_norm": 0.5600324469227966, + "grad_norm": 0.9882368279810648, "learning_rate": 2.0509974382323043e-05, - "loss": 0.0712, - "mean_token_accuracy": 0.9783627018332481, + "loss": 0.0726, + "mean_token_accuracy": 0.9780319705605507, "step": 919 }, { "epoch": 5.381231671554252, - "grad_norm": 0.5079271037640322, + "grad_norm": 0.471815730617989, "learning_rate": 2.047580209934256e-05, - "loss": 0.078, - "mean_token_accuracy": 0.9777331501245499, + "loss": 0.0701, + "mean_token_accuracy": 0.9799467623233795, "step": 920 }, { "epoch": 5.387096774193548, - "grad_norm": 0.6585224453075607, + "grad_norm": 0.47451791208039407, "learning_rate": 2.0441635348584876e-05, - "loss": 0.0773, - "mean_token_accuracy": 0.9772825464606285, + "loss": 0.076, + "mean_token_accuracy": 0.9799469783902168, "step": 921 }, { "epoch": 5.392961876832844, - "grad_norm": 0.6247906925375318, + "grad_norm": 0.5023809611511383, "learning_rate": 2.0407474254061498e-05, - "loss": 0.0831, - "mean_token_accuracy": 0.9741897881031036, + "loss": 0.0826, + "mean_token_accuracy": 0.9753458574414253, "step": 922 }, { "epoch": 5.39882697947214, - "grad_norm": 0.5472561096522515, + "grad_norm": 0.5075686409018196, "learning_rate": 2.0373318939763397e-05, - "loss": 0.0785, - "mean_token_accuracy": 0.976148895919323, + "loss": 0.0765, + "mean_token_accuracy": 0.9750477820634842, "step": 923 }, { "epoch": 5.404692082111437, - "grad_norm": 0.6294772303187749, + "grad_norm": 0.6435726318443664, "learning_rate": 2.033916952966057e-05, - "loss": 0.0683, - "mean_token_accuracy": 0.9790761917829514, + "loss": 0.0712, + "mean_token_accuracy": 0.9771361127495766, "step": 924 }, { "epoch": 5.410557184750733, - "grad_norm": 0.47662725471278244, + "grad_norm": 0.5962098923490655, "learning_rate": 2.0305026147701584e-05, - "loss": 0.0747, - "mean_token_accuracy": 0.9755368903279305, + "loss": 0.0816, + "mean_token_accuracy": 0.9740441292524338, "step": 925 }, { "epoch": 5.416422287390029, - "grad_norm": 0.6610622738741952, + "grad_norm": 0.5837016205213843, "learning_rate": 2.0270888917813124e-05, - "loss": 0.0671, - "mean_token_accuracy": 0.9788747951388359, + "loss": 0.0655, + "mean_token_accuracy": 0.9783652052283287, "step": 926 }, { "epoch": 5.422287390029325, - "grad_norm": 0.5589675836077449, + "grad_norm": 0.5544663430780863, "learning_rate": 2.0236757963899548e-05, - "loss": 0.0738, - "mean_token_accuracy": 0.9767239764332771, + "loss": 0.0791, + "mean_token_accuracy": 0.9762725159525871, "step": 927 }, { "epoch": 5.428152492668621, - "grad_norm": 0.6019998421138993, + "grad_norm": 0.5937736275356819, "learning_rate": 2.020263340984244e-05, - "loss": 0.0742, - "mean_token_accuracy": 0.9795248135924339, + "loss": 0.0814, + "mean_token_accuracy": 0.9787080064415932, "step": 928 }, { "epoch": 5.4340175953079175, - "grad_norm": 0.5166261136130894, + "grad_norm": 0.5482338781621862, "learning_rate": 2.0168515379500145e-05, - "loss": 0.0701, - "mean_token_accuracy": 0.9759251549839973, + "loss": 0.0664, + "mean_token_accuracy": 0.9771486297249794, "step": 929 }, { "epoch": 5.439882697947214, - "grad_norm": 0.6006084519966477, + "grad_norm": 0.6214123829163545, "learning_rate": 2.0134403996707338e-05, - "loss": 0.065, - "mean_token_accuracy": 0.9794113636016846, + "loss": 0.0691, + "mean_token_accuracy": 0.9789487943053246, "step": 930 }, { "epoch": 5.44574780058651, - "grad_norm": 0.45582821622973035, + "grad_norm": 0.39382140581253744, "learning_rate": 2.0100299385274547e-05, - "loss": 0.0713, - "mean_token_accuracy": 0.9784344360232353, + "loss": 0.0688, + "mean_token_accuracy": 0.9780780300498009, "step": 931 }, { "epoch": 5.451612903225806, - "grad_norm": 0.652447127156715, + "grad_norm": 0.7154256082721208, "learning_rate": 2.0066201668987757e-05, - "loss": 0.0875, - "mean_token_accuracy": 0.9729999005794525, + "loss": 0.0936, + "mean_token_accuracy": 0.973826490342617, "step": 932 }, { "epoch": 5.457478005865102, - "grad_norm": 0.5812504477926977, + "grad_norm": 0.5851811926501702, "learning_rate": 2.0032110971607894e-05, - "loss": 0.0688, - "mean_token_accuracy": 0.980715274810791, + "loss": 0.0715, + "mean_token_accuracy": 0.9797235950827599, "step": 933 }, { "epoch": 5.463343108504398, - "grad_norm": 0.5882443082261385, + "grad_norm": 0.5534379033832495, "learning_rate": 1.999802741687042e-05, - "loss": 0.0768, - "mean_token_accuracy": 0.9780777394771576, + "loss": 0.0838, + "mean_token_accuracy": 0.9770290404558182, "step": 934 }, { "epoch": 5.469208211143695, - "grad_norm": 0.48843000233363765, + "grad_norm": 0.7164411676483846, "learning_rate": 1.9963951128484886e-05, - "loss": 0.0613, - "mean_token_accuracy": 0.980921059846878, + "loss": 0.0649, + "mean_token_accuracy": 0.9806244075298309, "step": 935 }, { "epoch": 5.475073313782991, - "grad_norm": 0.578207102400483, + "grad_norm": 0.47696779969791275, "learning_rate": 1.9929882230134452e-05, - "loss": 0.0716, - "mean_token_accuracy": 0.9766323640942574, + "loss": 0.0748, + "mean_token_accuracy": 0.9765933528542519, "step": 936 }, { "epoch": 5.480938416422287, - "grad_norm": 0.5585603420730529, + "grad_norm": 0.5470776765030727, "learning_rate": 1.9895820845475445e-05, - "loss": 0.074, - "mean_token_accuracy": 0.9760611280798912, + "loss": 0.0878, + "mean_token_accuracy": 0.9753799438476562, "step": 937 }, { "epoch": 5.486803519061583, - "grad_norm": 0.5630265678683493, + "grad_norm": 0.7100920429755581, "learning_rate": 1.9861767098136956e-05, - "loss": 0.0668, - "mean_token_accuracy": 0.9797999039292336, + "loss": 0.0665, + "mean_token_accuracy": 0.9792315140366554, "step": 938 }, { "epoch": 5.492668621700879, - "grad_norm": 0.4600091835994956, + "grad_norm": 0.4029396388053985, "learning_rate": 1.982772111172032e-05, - "loss": 0.0723, - "mean_token_accuracy": 0.979090228676796, + "loss": 0.0722, + "mean_token_accuracy": 0.9785656332969666, "step": 939 }, { "epoch": 5.4985337243401755, - "grad_norm": 0.6002054250965584, + "grad_norm": 0.5709639222922572, "learning_rate": 1.9793683009798718e-05, - "loss": 0.0679, - "mean_token_accuracy": 0.9809942319989204, + "loss": 0.0647, + "mean_token_accuracy": 0.9799299314618111, "step": 940 }, { "epoch": 5.504398826979472, - "grad_norm": 0.6319575155464214, + "grad_norm": 0.4227698091411673, "learning_rate": 1.975965291591672e-05, - "loss": 0.0921, - "mean_token_accuracy": 0.9735964983701706, + "loss": 0.0838, + "mean_token_accuracy": 0.9743452370166779, "step": 941 }, { "epoch": 5.510263929618768, - "grad_norm": 0.7814944840670485, + "grad_norm": 0.5747683090929688, "learning_rate": 1.9725630953589823e-05, - "loss": 0.0766, - "mean_token_accuracy": 0.9783463180065155, + "loss": 0.0749, + "mean_token_accuracy": 0.9801840931177139, "step": 942 }, { "epoch": 5.516129032258064, - "grad_norm": 0.4898867119107616, + "grad_norm": 0.5511712501852833, "learning_rate": 1.9691617246304007e-05, - "loss": 0.0705, - "mean_token_accuracy": 0.9760597050189972, + "loss": 0.0699, + "mean_token_accuracy": 0.9766572862863541, "step": 943 }, { "epoch": 5.52199413489736, - "grad_norm": 0.6544194466244087, + "grad_norm": 0.48550957447325593, "learning_rate": 1.9657611917515287e-05, - "loss": 0.0774, - "mean_token_accuracy": 0.9778344482183456, + "loss": 0.0802, + "mean_token_accuracy": 0.977307416498661, "step": 944 }, { "epoch": 5.527859237536656, - "grad_norm": 0.5424147306490898, + "grad_norm": 0.4962656544581569, "learning_rate": 1.962361509064928e-05, - "loss": 0.0651, - "mean_token_accuracy": 0.9800689145922661, + "loss": 0.067, + "mean_token_accuracy": 0.9810660406947136, "step": 945 }, { "epoch": 5.533724340175953, - "grad_norm": 0.42614971645103156, + "grad_norm": 0.4585981984962692, "learning_rate": 1.958962688910073e-05, - "loss": 0.0621, - "mean_token_accuracy": 0.980566717684269, + "loss": 0.0664, + "mean_token_accuracy": 0.9795481562614441, "step": 946 }, { "epoch": 5.539589442815249, - "grad_norm": 0.4885883014556131, + "grad_norm": 0.8789699963455241, "learning_rate": 1.9555647436233093e-05, - "loss": 0.071, - "mean_token_accuracy": 0.9800935760140419, + "loss": 0.0811, + "mean_token_accuracy": 0.9794306084513664, "step": 947 }, { "epoch": 5.545454545454545, - "grad_norm": 0.5623124965585408, + "grad_norm": 0.6589501732448013, "learning_rate": 1.9521676855378045e-05, - "loss": 0.0704, - "mean_token_accuracy": 0.9787362143397331, + "loss": 0.0756, + "mean_token_accuracy": 0.9795940294861794, "step": 948 }, { "epoch": 5.551319648093841, - "grad_norm": 0.5876377449692796, + "grad_norm": 0.47109469222419814, "learning_rate": 1.9487715269835082e-05, - "loss": 0.0651, - "mean_token_accuracy": 0.9789851978421211, + "loss": 0.0673, + "mean_token_accuracy": 0.9784109368920326, "step": 949 }, { "epoch": 5.557184750733137, - "grad_norm": 0.5720532607343585, + "grad_norm": 0.6780584444910192, "learning_rate": 1.945376280287105e-05, - "loss": 0.0793, - "mean_token_accuracy": 0.9742227792739868, + "loss": 0.0789, + "mean_token_accuracy": 0.9758977070450783, "step": 950 }, { "epoch": 5.563049853372434, - "grad_norm": 0.6241120555707173, + "grad_norm": 0.547538371775988, "learning_rate": 1.9419819577719684e-05, - "loss": 0.0703, - "mean_token_accuracy": 0.979230061173439, + "loss": 0.075, + "mean_token_accuracy": 0.9771006405353546, "step": 951 }, { "epoch": 5.568914956011731, - "grad_norm": 0.6436812121385808, + "grad_norm": 0.7026091797815018, "learning_rate": 1.9385885717581182e-05, - "loss": 0.0865, - "mean_token_accuracy": 0.973546139895916, + "loss": 0.0831, + "mean_token_accuracy": 0.9737556874752045, "step": 952 }, { "epoch": 5.574780058651027, - "grad_norm": 0.5147412146073209, + "grad_norm": 0.6594999054981713, "learning_rate": 1.935196134562175e-05, - "loss": 0.0669, - "mean_token_accuracy": 0.9807008281350136, + "loss": 0.0659, + "mean_token_accuracy": 0.9800437390804291, "step": 953 }, { "epoch": 5.580645161290323, - "grad_norm": 0.5922549441597481, + "grad_norm": 0.5616998292882237, "learning_rate": 1.931804658497316e-05, - "loss": 0.0686, - "mean_token_accuracy": 0.9795825853943825, + "loss": 0.0795, + "mean_token_accuracy": 0.9787877053022385, "step": 954 }, { "epoch": 5.586510263929619, - "grad_norm": 0.6564154522631755, + "grad_norm": 0.7521827670887291, "learning_rate": 1.9284141558732296e-05, - "loss": 0.0764, - "mean_token_accuracy": 0.9765199050307274, + "loss": 0.0814, + "mean_token_accuracy": 0.9771803990006447, "step": 955 }, { "epoch": 5.592375366568915, - "grad_norm": 0.6294040843192674, + "grad_norm": 0.780497739247058, "learning_rate": 1.925024638996071e-05, - "loss": 0.0696, - "mean_token_accuracy": 0.9779465198516846, + "loss": 0.0801, + "mean_token_accuracy": 0.976881816983223, "step": 956 }, { "epoch": 5.5982404692082115, - "grad_norm": 0.46897658325826863, + "grad_norm": 0.4162052339210939, "learning_rate": 1.9216361201684174e-05, - "loss": 0.0753, - "mean_token_accuracy": 0.9799123182892799, + "loss": 0.0726, + "mean_token_accuracy": 0.9800549224019051, "step": 957 }, { "epoch": 5.604105571847508, - "grad_norm": 0.6226020057368947, + "grad_norm": 0.6287866231807394, "learning_rate": 1.918248611689224e-05, - "loss": 0.0669, - "mean_token_accuracy": 0.9796951934695244, + "loss": 0.076, + "mean_token_accuracy": 0.9785019308328629, "step": 958 }, { "epoch": 5.609970674486804, - "grad_norm": 0.5494371989177863, + "grad_norm": 0.49146726443913197, "learning_rate": 1.9148621258537782e-05, - "loss": 0.0755, - "mean_token_accuracy": 0.9753241837024689, + "loss": 0.0759, + "mean_token_accuracy": 0.9774388298392296, "step": 959 }, { "epoch": 5.6158357771261, - "grad_norm": 0.6032363669586224, + "grad_norm": 0.6072290541914621, "learning_rate": 1.911476674953656e-05, - "loss": 0.0579, - "mean_token_accuracy": 0.980271153151989, + "loss": 0.0683, + "mean_token_accuracy": 0.9797725901007652, "step": 960 }, { "epoch": 5.621700879765396, - "grad_norm": 0.4847213278354494, + "grad_norm": 0.5459267155440238, "learning_rate": 1.9080922712766762e-05, - "loss": 0.0723, - "mean_token_accuracy": 0.9749229624867439, + "loss": 0.0767, + "mean_token_accuracy": 0.9748610332608223, "step": 961 }, { "epoch": 5.627565982404692, - "grad_norm": 0.491493201519099, + "grad_norm": 0.46593356248507284, "learning_rate": 1.904708927106858e-05, - "loss": 0.0792, - "mean_token_accuracy": 0.9770414680242538, + "loss": 0.0821, + "mean_token_accuracy": 0.9751980304718018, "step": 962 }, { "epoch": 5.633431085043989, - "grad_norm": 0.8035666675533314, + "grad_norm": 0.6913652665852186, "learning_rate": 1.9013266547243742e-05, - "loss": 0.0693, - "mean_token_accuracy": 0.9798395037651062, + "loss": 0.068, + "mean_token_accuracy": 0.9801571518182755, "step": 963 }, { "epoch": 5.639296187683285, - "grad_norm": 0.5056374819958188, + "grad_norm": 0.5509304382779889, "learning_rate": 1.8979454664055068e-05, - "loss": 0.0748, - "mean_token_accuracy": 0.9769620299339294, + "loss": 0.073, + "mean_token_accuracy": 0.9766935929656029, "step": 964 }, { "epoch": 5.645161290322581, - "grad_norm": 0.6874918618082315, + "grad_norm": 0.635061720229963, "learning_rate": 1.894565374422605e-05, - "loss": 0.0675, - "mean_token_accuracy": 0.9785389676690102, + "loss": 0.067, + "mean_token_accuracy": 0.9784405454993248, "step": 965 }, { "epoch": 5.651026392961877, - "grad_norm": 0.4157605250823316, + "grad_norm": 0.39936389348973156, "learning_rate": 1.891186391044037e-05, - "loss": 0.0731, - "mean_token_accuracy": 0.975774921476841, + "loss": 0.0803, + "mean_token_accuracy": 0.9737694561481476, "step": 966 }, { "epoch": 5.656891495601173, - "grad_norm": 0.5823531742180683, + "grad_norm": 0.6400412396273522, "learning_rate": 1.887808528534148e-05, - "loss": 0.0955, - "mean_token_accuracy": 0.9756604135036469, + "loss": 0.0714, + "mean_token_accuracy": 0.9779755920171738, "step": 967 }, { "epoch": 5.6627565982404695, - "grad_norm": 1.5182787676052627, + "grad_norm": 0.7067667447837913, "learning_rate": 1.884431799153214e-05, - "loss": 0.0642, - "mean_token_accuracy": 0.9804680868983269, + "loss": 0.0772, + "mean_token_accuracy": 0.9778576120734215, "step": 968 }, { "epoch": 5.668621700879766, - "grad_norm": 0.5599066990953598, + "grad_norm": 1.012390712709044, "learning_rate": 1.8810562151573993e-05, - "loss": 0.0745, - "mean_token_accuracy": 0.9779726639389992, + "loss": 0.089, + "mean_token_accuracy": 0.9746776595711708, "step": 969 }, { "epoch": 5.674486803519062, - "grad_norm": 0.6070035232067619, + "grad_norm": 0.7423043220181863, "learning_rate": 1.8776817887987105e-05, - "loss": 0.0758, - "mean_token_accuracy": 0.9777623414993286, + "loss": 0.0877, + "mean_token_accuracy": 0.9754098355770111, "step": 970 }, { "epoch": 5.680351906158358, - "grad_norm": 0.5298345268067092, + "grad_norm": 0.6077907639730374, "learning_rate": 1.8743085323249527e-05, - "loss": 0.0743, - "mean_token_accuracy": 0.9766695126891136, + "loss": 0.0774, + "mean_token_accuracy": 0.9754339978098869, "step": 971 }, { "epoch": 5.686217008797654, - "grad_norm": 0.5296462474949736, + "grad_norm": 0.49881643744105175, "learning_rate": 1.870936457979684e-05, - "loss": 0.074, - "mean_token_accuracy": 0.9775098264217377, + "loss": 0.0797, + "mean_token_accuracy": 0.9773345366120338, "step": 972 }, { "epoch": 5.69208211143695, - "grad_norm": 0.5596310685328338, + "grad_norm": 0.4582361113723667, "learning_rate": 1.8675655780021733e-05, - "loss": 0.0634, - "mean_token_accuracy": 0.9791462272405624, + "loss": 0.0684, + "mean_token_accuracy": 0.9787016063928604, "step": 973 }, { "epoch": 5.697947214076247, - "grad_norm": 0.6286572843009949, + "grad_norm": 0.568024453102634, "learning_rate": 1.8641959046273525e-05, - "loss": 0.0784, - "mean_token_accuracy": 0.9750925973057747, + "loss": 0.0851, + "mean_token_accuracy": 0.9769276455044746, "step": 974 }, { "epoch": 5.703812316715543, - "grad_norm": 0.5273708760096594, + "grad_norm": 0.4968249690754107, "learning_rate": 1.8608274500857756e-05, - "loss": 0.0766, - "mean_token_accuracy": 0.9776112586259842, + "loss": 0.071, + "mean_token_accuracy": 0.9795166626572609, "step": 975 }, { "epoch": 5.709677419354839, - "grad_norm": 0.5577384204329816, + "grad_norm": 0.4002229392999077, "learning_rate": 1.8574602266035714e-05, - "loss": 0.0608, - "mean_token_accuracy": 0.9810106307268143, + "loss": 0.0681, + "mean_token_accuracy": 0.9806966185569763, "step": 976 }, { "epoch": 5.715542521994135, - "grad_norm": 0.5544850178756708, + "grad_norm": 0.4984740210286059, "learning_rate": 1.854094246402402e-05, - "loss": 0.0821, - "mean_token_accuracy": 0.9750274196267128, + "loss": 0.0872, + "mean_token_accuracy": 0.974516287446022, "step": 977 }, { "epoch": 5.721407624633431, - "grad_norm": 0.6178456682643919, + "grad_norm": 0.5848984981840332, "learning_rate": 1.8507295216994162e-05, - "loss": 0.0628, - "mean_token_accuracy": 0.9818970337510109, + "loss": 0.0715, + "mean_token_accuracy": 0.9809279665350914, "step": 978 }, { "epoch": 5.7272727272727275, - "grad_norm": 0.5142270627968069, + "grad_norm": 0.4250499033716092, "learning_rate": 1.8473660647072053e-05, - "loss": 0.0751, - "mean_token_accuracy": 0.9759154841303825, + "loss": 0.0715, + "mean_token_accuracy": 0.9773638993501663, "step": 979 }, { "epoch": 5.733137829912024, - "grad_norm": 0.5021352568135602, + "grad_norm": 0.46841773113513163, "learning_rate": 1.8440038876337597e-05, - "loss": 0.0677, - "mean_token_accuracy": 0.9790510535240173, + "loss": 0.0624, + "mean_token_accuracy": 0.9801914915442467, "step": 980 }, { "epoch": 5.73900293255132, - "grad_norm": 0.5400003674319198, + "grad_norm": 0.5038261427174479, "learning_rate": 1.8406430026824252e-05, - "loss": 0.0727, - "mean_token_accuracy": 0.976162277162075, + "loss": 0.075, + "mean_token_accuracy": 0.9773709028959274, "step": 981 }, { "epoch": 5.744868035190616, - "grad_norm": 0.6653802174420181, + "grad_norm": 0.47075990803551643, "learning_rate": 1.837283422051855e-05, - "loss": 0.0708, - "mean_token_accuracy": 0.9791212901473045, + "loss": 0.0743, + "mean_token_accuracy": 0.9782584458589554, "step": 982 }, { "epoch": 5.750733137829912, - "grad_norm": 0.5336134949253077, + "grad_norm": 0.567572930616802, "learning_rate": 1.8339251579359713e-05, - "loss": 0.0736, - "mean_token_accuracy": 0.9781318008899689, + "loss": 0.0778, + "mean_token_accuracy": 0.9760931208729744, "step": 983 }, { "epoch": 5.756598240469208, - "grad_norm": 0.46625413729314213, + "grad_norm": 0.3936380859190155, "learning_rate": 1.8305682225239167e-05, - "loss": 0.0691, - "mean_token_accuracy": 0.9801111742854118, + "loss": 0.0713, + "mean_token_accuracy": 0.9801273196935654, "step": 984 }, { "epoch": 5.762463343108505, - "grad_norm": 0.7918478373807947, + "grad_norm": 0.7903767668109325, "learning_rate": 1.8272126280000102e-05, - "loss": 0.0939, - "mean_token_accuracy": 0.9726522043347359, + "loss": 0.0884, + "mean_token_accuracy": 0.9739851281046867, "step": 985 }, { "epoch": 5.768328445747801, - "grad_norm": 0.6659373208659011, + "grad_norm": 0.6608741086050125, "learning_rate": 1.823858386543705e-05, - "loss": 0.0705, - "mean_token_accuracy": 0.9795112237334251, + "loss": 0.0652, + "mean_token_accuracy": 0.9803629368543625, "step": 986 }, { "epoch": 5.774193548387097, - "grad_norm": 0.6937079334829163, + "grad_norm": 0.4998320602993573, "learning_rate": 1.8205055103295434e-05, - "loss": 0.0821, - "mean_token_accuracy": 0.9742537960410118, + "loss": 0.0768, + "mean_token_accuracy": 0.9743634089827538, "step": 987 }, { "epoch": 5.780058651026393, - "grad_norm": 0.5502882133997387, + "grad_norm": 0.4671426923936314, "learning_rate": 1.8171540115271108e-05, - "loss": 0.074, - "mean_token_accuracy": 0.9749346300959587, + "loss": 0.0841, + "mean_token_accuracy": 0.9734274372458458, "step": 988 }, { "epoch": 5.785923753665689, - "grad_norm": 0.5723847973030097, + "grad_norm": 0.593095309506939, "learning_rate": 1.813803902300995e-05, - "loss": 0.0767, - "mean_token_accuracy": 0.9775624573230743, + "loss": 0.0737, + "mean_token_accuracy": 0.9768568873405457, "step": 989 }, { "epoch": 5.7917888563049855, - "grad_norm": 0.48559336446340945, + "grad_norm": 0.5184984999658933, "learning_rate": 1.8104551948107395e-05, - "loss": 0.0689, - "mean_token_accuracy": 0.9818530306220055, + "loss": 0.0856, + "mean_token_accuracy": 0.97777359187603, "step": 990 }, { "epoch": 5.797653958944282, - "grad_norm": 0.43809664758169425, + "grad_norm": 0.5641853565258578, "learning_rate": 1.8071079012107997e-05, - "loss": 0.0641, - "mean_token_accuracy": 0.9799798876047134, + "loss": 0.0633, + "mean_token_accuracy": 0.9801298975944519, "step": 991 }, { "epoch": 5.803519061583578, - "grad_norm": 0.5446369284916783, + "grad_norm": 0.4631484693838226, "learning_rate": 1.8037620336504993e-05, - "loss": 0.07, - "mean_token_accuracy": 0.9789104983210564, + "loss": 0.0706, + "mean_token_accuracy": 0.9805011823773384, "step": 992 }, { "epoch": 5.809384164222874, - "grad_norm": 0.4733802615698864, + "grad_norm": 0.44416688250468606, "learning_rate": 1.8004176042739877e-05, - "loss": 0.0732, - "mean_token_accuracy": 0.9795755222439766, + "loss": 0.071, + "mean_token_accuracy": 0.9795691072940826, "step": 993 }, { "epoch": 5.81524926686217, - "grad_norm": 0.6076123349283594, + "grad_norm": 0.5965136059432004, "learning_rate": 1.797074625220191e-05, - "loss": 0.0715, - "mean_token_accuracy": 0.9788392633199692, + "loss": 0.0748, + "mean_token_accuracy": 0.9784457981586456, "step": 994 }, { "epoch": 5.821114369501466, - "grad_norm": 0.6527755849095698, + "grad_norm": 0.5438818438785552, "learning_rate": 1.7937331086227737e-05, - "loss": 0.0828, - "mean_token_accuracy": 0.9729266539216042, + "loss": 0.0763, + "mean_token_accuracy": 0.9762759432196617, "step": 995 }, { "epoch": 5.826979472140763, - "grad_norm": 0.6987750283606413, + "grad_norm": 0.4977872314812647, "learning_rate": 1.790393066610091e-05, "loss": 0.0826, - "mean_token_accuracy": 0.9741199016571045, + "mean_token_accuracy": 0.9762104973196983, "step": 996 }, { "epoch": 5.832844574780059, - "grad_norm": 0.673623466884957, + "grad_norm": 0.6131847389744154, "learning_rate": 1.787054511305148e-05, - "loss": 0.0852, - "mean_token_accuracy": 0.9762526527047157, + "loss": 0.0815, + "mean_token_accuracy": 0.9775402843952179, "step": 997 }, { "epoch": 5.838709677419355, - "grad_norm": 0.6730592209203903, + "grad_norm": 0.5577275741311593, "learning_rate": 1.7837174548255504e-05, - "loss": 0.075, - "mean_token_accuracy": 0.9776707738637924, + "loss": 0.0814, + "mean_token_accuracy": 0.9762129485607147, "step": 998 }, { "epoch": 5.844574780058651, - "grad_norm": 0.45848872494150783, + "grad_norm": 0.5592068604336378, "learning_rate": 1.7803819092834668e-05, - "loss": 0.0687, - "mean_token_accuracy": 0.9793645292520523, + "loss": 0.0645, + "mean_token_accuracy": 0.9782524555921555, "step": 999 }, { "epoch": 5.850439882697947, - "grad_norm": 0.7023448385378745, + "grad_norm": 0.6354990519774383, "learning_rate": 1.7770478867855797e-05, - "loss": 0.0761, - "mean_token_accuracy": 0.9792755618691444, + "loss": 0.0839, + "mean_token_accuracy": 0.9766361862421036, "step": 1000 }, { "epoch": 5.8563049853372435, - "grad_norm": 0.5175131551283948, + "grad_norm": 0.6256455849819228, "learning_rate": 1.7737153994330437e-05, - "loss": 0.0865, - "mean_token_accuracy": 0.9750565141439438, + "loss": 0.0896, + "mean_token_accuracy": 0.9751609116792679, "step": 1001 }, { "epoch": 5.86217008797654, - "grad_norm": 0.6262907861375655, + "grad_norm": 0.5452759134932382, "learning_rate": 1.7703844593214427e-05, - "loss": 0.0628, - "mean_token_accuracy": 0.9800690039992332, + "loss": 0.075, + "mean_token_accuracy": 0.9791750013828278, "step": 1002 }, { "epoch": 5.868035190615836, - "grad_norm": 0.44221893487398767, + "grad_norm": 0.5075678241957828, "learning_rate": 1.7670550785407444e-05, - "loss": 0.0572, - "mean_token_accuracy": 0.9825234487652779, + "loss": 0.0631, + "mean_token_accuracy": 0.9794988706707954, "step": 1003 }, { "epoch": 5.873900293255132, - "grad_norm": 0.4987557150663674, + "grad_norm": 0.6731979059855191, "learning_rate": 1.7637272691752548e-05, - "loss": 0.0771, - "mean_token_accuracy": 0.975949339568615, + "loss": 0.0794, + "mean_token_accuracy": 0.974234364926815, "step": 1004 }, { "epoch": 5.879765395894428, - "grad_norm": 0.5103138713112173, + "grad_norm": 0.4480038296462438, "learning_rate": 1.7604010433035793e-05, - "loss": 0.0816, - "mean_token_accuracy": 0.9757112711668015, + "loss": 0.0869, + "mean_token_accuracy": 0.9753069505095482, "step": 1005 }, { "epoch": 5.885630498533724, - "grad_norm": 0.5757247782780586, + "grad_norm": 0.5462367128398795, "learning_rate": 1.7570764129985747e-05, - "loss": 0.0714, - "mean_token_accuracy": 0.9774347543716431, + "loss": 0.0824, + "mean_token_accuracy": 0.9754479303956032, "step": 1006 }, { "epoch": 5.891495601173021, - "grad_norm": 0.5735144828579501, + "grad_norm": 0.5481811313348861, "learning_rate": 1.7537533903273055e-05, - "loss": 0.0672, - "mean_token_accuracy": 0.9788115695118904, + "loss": 0.079, + "mean_token_accuracy": 0.9769573882222176, "step": 1007 }, { "epoch": 5.897360703812317, - "grad_norm": 0.5276596129193636, + "grad_norm": 0.8764032211615499, "learning_rate": 1.7504319873510014e-05, - "loss": 0.0816, - "mean_token_accuracy": 0.9764761105179787, + "loss": 0.0921, + "mean_token_accuracy": 0.9756467416882515, "step": 1008 }, { "epoch": 5.903225806451613, - "grad_norm": 0.5934904035247996, + "grad_norm": 0.5742525162178889, "learning_rate": 1.7471122161250153e-05, - "loss": 0.0829, - "mean_token_accuracy": 0.9750788882374763, + "loss": 0.0796, + "mean_token_accuracy": 0.9754255339503288, "step": 1009 }, { "epoch": 5.909090909090909, - "grad_norm": 0.7581092342018619, + "grad_norm": 0.7279542813059033, "learning_rate": 1.743794088698775e-05, - "loss": 0.0792, - "mean_token_accuracy": 0.9779830947518349, + "loss": 0.0754, + "mean_token_accuracy": 0.9784489199519157, "step": 1010 }, { "epoch": 5.914956011730205, - "grad_norm": 0.46297684327502575, + "grad_norm": 0.4522109182504477, "learning_rate": 1.7404776171157428e-05, - "loss": 0.0764, - "mean_token_accuracy": 0.9766190350055695, + "loss": 0.0823, + "mean_token_accuracy": 0.9760701656341553, "step": 1011 }, { "epoch": 5.9208211143695015, - "grad_norm": 0.5333324483571935, + "grad_norm": 0.5006876106993897, "learning_rate": 1.7371628134133716e-05, - "loss": 0.0875, - "mean_token_accuracy": 0.9737675860524178, + "loss": 0.0814, + "mean_token_accuracy": 0.9751926735043526, "step": 1012 }, { "epoch": 5.926686217008798, - "grad_norm": 0.6875722103361684, + "grad_norm": 0.4417156692330552, "learning_rate": 1.73384968962306e-05, - "loss": 0.0731, - "mean_token_accuracy": 0.9765476137399673, + "loss": 0.0756, + "mean_token_accuracy": 0.975707083940506, "step": 1013 }, { "epoch": 5.932551319648094, - "grad_norm": 0.5498994219533493, + "grad_norm": 0.6094248911195603, "learning_rate": 1.7305382577701088e-05, - "loss": 0.0786, - "mean_token_accuracy": 0.9759142473340034, + "loss": 0.0808, + "mean_token_accuracy": 0.9764841720461845, "step": 1014 }, { "epoch": 5.93841642228739, - "grad_norm": 0.5849125695448263, + "grad_norm": 0.551520586810897, "learning_rate": 1.7272285298736787e-05, - "loss": 0.069, - "mean_token_accuracy": 0.9774723574519157, + "loss": 0.0782, + "mean_token_accuracy": 0.9761537089943886, "step": 1015 }, { "epoch": 5.944281524926686, - "grad_norm": 0.5757882291325982, + "grad_norm": 0.6403501488586054, "learning_rate": 1.7239205179467453e-05, - "loss": 0.0811, - "mean_token_accuracy": 0.9770863503217697, + "loss": 0.0844, + "mean_token_accuracy": 0.974406361579895, "step": 1016 }, { "epoch": 5.9501466275659824, - "grad_norm": 0.6063631339216475, + "grad_norm": 0.5514850408949352, "learning_rate": 1.720614233996056e-05, - "loss": 0.0939, - "mean_token_accuracy": 0.9728550314903259, + "loss": 0.083, + "mean_token_accuracy": 0.9733666777610779, "step": 1017 }, { "epoch": 5.956011730205279, - "grad_norm": 0.7121493997252688, + "grad_norm": 0.5165899623290655, "learning_rate": 1.7173096900220852e-05, - "loss": 0.0716, - "mean_token_accuracy": 0.9775163680315018, + "loss": 0.0789, + "mean_token_accuracy": 0.9754836708307266, "step": 1018 }, { "epoch": 5.961876832844575, - "grad_norm": 0.5713158877121456, + "grad_norm": 0.49632223872221254, "learning_rate": 1.7140068980189943e-05, - "loss": 0.0855, - "mean_token_accuracy": 0.9739682152867317, + "loss": 0.0811, + "mean_token_accuracy": 0.974688708782196, "step": 1019 }, { "epoch": 5.967741935483871, - "grad_norm": 0.5823469676430719, + "grad_norm": 0.5819762705527881, "learning_rate": 1.710705869974583e-05, - "loss": 0.081, - "mean_token_accuracy": 0.9755722358822823, + "loss": 0.0771, + "mean_token_accuracy": 0.9771495833992958, "step": 1020 }, { "epoch": 5.973607038123167, - "grad_norm": 0.5279782610228617, + "grad_norm": 0.6180454944945127, "learning_rate": 1.7074066178702512e-05, - "loss": 0.0579, - "mean_token_accuracy": 0.980400986969471, + "loss": 0.0661, + "mean_token_accuracy": 0.9801411479711533, "step": 1021 }, { "epoch": 5.979472140762463, - "grad_norm": 0.5112568511100889, + "grad_norm": 0.4411679881699406, "learning_rate": 1.7041091536809506e-05, - "loss": 0.0688, - "mean_token_accuracy": 0.9795641005039215, + "loss": 0.0715, + "mean_token_accuracy": 0.9799164235591888, "step": 1022 }, { "epoch": 5.9853372434017595, - "grad_norm": 0.5481954671151484, + "grad_norm": 0.5065972988067083, "learning_rate": 1.7008134893751446e-05, - "loss": 0.0703, - "mean_token_accuracy": 0.980062872171402, + "loss": 0.0688, + "mean_token_accuracy": 0.979424737393856, "step": 1023 }, { "epoch": 5.991202346041056, - "grad_norm": 0.5077431608410231, + "grad_norm": 0.4240940682692869, "learning_rate": 1.697519636914765e-05, - "loss": 0.0627, - "mean_token_accuracy": 0.9807908609509468, + "loss": 0.071, + "mean_token_accuracy": 0.9785429760813713, "step": 1024 }, { "epoch": 5.997067448680352, - "grad_norm": 0.5942347658987599, + "grad_norm": 0.611593671142202, "learning_rate": 1.6942276082551634e-05, - "loss": 0.0818, - "mean_token_accuracy": 0.9745178669691086, + "loss": 0.0876, + "mean_token_accuracy": 0.973278783261776, "step": 1025 }, { "epoch": 6.0, - "grad_norm": 0.9379871683914409, + "grad_norm": 0.989390280244205, "learning_rate": 1.6909374153450762e-05, - "loss": 0.0758, - "mean_token_accuracy": 0.9805418103933334, + "loss": 0.081, + "mean_token_accuracy": 0.9779550731182098, "step": 1026 }, { "epoch": 6.005865102639296, - "grad_norm": 0.41625170627035957, + "grad_norm": 0.4230070349459214, "learning_rate": 1.6876490701265736e-05, - "loss": 0.0557, - "mean_token_accuracy": 0.9833519533276558, + "loss": 0.0602, + "mean_token_accuracy": 0.9809990078210831, "step": 1027 }, { "epoch": 6.011730205278592, - "grad_norm": 0.5625854280142911, + "grad_norm": 0.4809667816759392, "learning_rate": 1.684362584535022e-05, - "loss": 0.0651, - "mean_token_accuracy": 0.980522520840168, + "loss": 0.0686, + "mean_token_accuracy": 0.9796543195843697, "step": 1028 }, { "epoch": 6.0175953079178885, - "grad_norm": 0.4589596228148454, + "grad_norm": 0.9016883836801177, "learning_rate": 1.6810779704990358e-05, "loss": 0.064, - "mean_token_accuracy": 0.980443462729454, + "mean_token_accuracy": 0.9786367863416672, "step": 1029 }, { "epoch": 6.023460410557185, - "grad_norm": 0.4710321685654739, + "grad_norm": 0.5120683418670421, "learning_rate": 1.677795239940438e-05, - "loss": 0.0526, - "mean_token_accuracy": 0.9833626300096512, + "loss": 0.0589, + "mean_token_accuracy": 0.9816581308841705, "step": 1030 }, { "epoch": 6.029325513196481, - "grad_norm": 0.47115827056288706, + "grad_norm": 0.48081169024458126, "learning_rate": 1.674514404774214e-05, - "loss": 0.0682, - "mean_token_accuracy": 0.9806164056062698, + "loss": 0.066, + "mean_token_accuracy": 0.9813550114631653, "step": 1031 }, { "epoch": 6.035190615835777, - "grad_norm": 0.5007221500943095, + "grad_norm": 0.5765323121918493, "learning_rate": 1.671235476908471e-05, - "loss": 0.0638, - "mean_token_accuracy": 0.9794806391000748, + "loss": 0.0659, + "mean_token_accuracy": 0.9793925806879997, "step": 1032 }, { "epoch": 6.041055718475073, - "grad_norm": 0.46139880948373124, + "grad_norm": 0.4326327601941797, "learning_rate": 1.6679584682443924e-05, - "loss": 0.059, - "mean_token_accuracy": 0.9817759990692139, + "loss": 0.0591, + "mean_token_accuracy": 0.9802664965391159, "step": 1033 }, { "epoch": 6.0469208211143695, - "grad_norm": 0.47833827548137275, + "grad_norm": 0.5269803365906425, "learning_rate": 1.6646833906761965e-05, - "loss": 0.061, - "mean_token_accuracy": 0.9800918996334076, + "loss": 0.0653, + "mean_token_accuracy": 0.9782321378588676, "step": 1034 }, { "epoch": 6.052785923753666, - "grad_norm": 0.4157611090168959, + "grad_norm": 0.4138760129091301, "learning_rate": 1.661410256091092e-05, - "loss": 0.0576, - "mean_token_accuracy": 0.9824439659714699, + "loss": 0.0641, + "mean_token_accuracy": 0.9822335913777351, "step": 1035 }, { "epoch": 6.058651026392962, - "grad_norm": 0.5513319881808698, + "grad_norm": 0.3503802526401633, "learning_rate": 1.658139076369236e-05, - "loss": 0.0726, - "mean_token_accuracy": 0.9797214195132256, + "loss": 0.0664, + "mean_token_accuracy": 0.9802732914686203, "step": 1036 }, { "epoch": 6.064516129032258, - "grad_norm": 0.5997799870227748, + "grad_norm": 0.5238336475389761, "learning_rate": 1.6548698633836893e-05, - "loss": 0.0641, - "mean_token_accuracy": 0.9777173176407814, + "loss": 0.0654, + "mean_token_accuracy": 0.9764923080801964, "step": 1037 }, { "epoch": 6.070381231671554, - "grad_norm": 0.47333625171664667, + "grad_norm": 0.6481246518288559, "learning_rate": 1.6516026290003746e-05, - "loss": 0.0584, - "mean_token_accuracy": 0.9834053292870522, + "loss": 0.0655, + "mean_token_accuracy": 0.9822241738438606, "step": 1038 }, { "epoch": 6.07624633431085, - "grad_norm": 0.4424032802775701, + "grad_norm": 0.4818098827810952, "learning_rate": 1.6483373850780328e-05, - "loss": 0.0613, - "mean_token_accuracy": 0.9812067598104477, + "loss": 0.0671, + "mean_token_accuracy": 0.9789280816912651, "step": 1039 }, { "epoch": 6.0821114369501466, - "grad_norm": 0.3632789659233089, + "grad_norm": 0.3031875450319505, "learning_rate": 1.645074143468181e-05, - "loss": 0.055, - "mean_token_accuracy": 0.9817801341414452, + "loss": 0.0523, + "mean_token_accuracy": 0.9837686270475388, "step": 1040 }, { "epoch": 6.087976539589443, - "grad_norm": 0.6360564965039187, + "grad_norm": 0.5254248298346764, "learning_rate": 1.6418129160150692e-05, - "loss": 0.07, - "mean_token_accuracy": 0.978939987719059, + "loss": 0.0642, + "mean_token_accuracy": 0.9779577627778053, "step": 1041 }, { "epoch": 6.093841642228739, - "grad_norm": 0.42344045495487054, + "grad_norm": 0.4566878185169901, "learning_rate": 1.6385537145556346e-05, - "loss": 0.0548, - "mean_token_accuracy": 0.9843300357460976, + "loss": 0.0597, + "mean_token_accuracy": 0.9836693927645683, "step": 1042 }, { "epoch": 6.099706744868035, - "grad_norm": 0.45984406695397945, + "grad_norm": 0.41107292338776835, "learning_rate": 1.6352965509194634e-05, - "loss": 0.0556, - "mean_token_accuracy": 0.9822628200054169, + "loss": 0.0577, + "mean_token_accuracy": 0.9834092557430267, "step": 1043 }, { "epoch": 6.105571847507331, - "grad_norm": 0.4323284440143135, + "grad_norm": 0.46469592304351814, "learning_rate": 1.6320414369287427e-05, - "loss": 0.0557, - "mean_token_accuracy": 0.9819125235080719, + "loss": 0.0549, + "mean_token_accuracy": 0.9818287864327431, "step": 1044 }, { "epoch": 6.1114369501466275, - "grad_norm": 0.47646310093981065, + "grad_norm": 0.3758735824688024, "learning_rate": 1.6287883843982223e-05, - "loss": 0.063, - "mean_token_accuracy": 0.9816362336277962, + "loss": 0.064, + "mean_token_accuracy": 0.9818753302097321, "step": 1045 }, { "epoch": 6.117302052785924, - "grad_norm": 0.6185661703570164, + "grad_norm": 0.5436745352949359, "learning_rate": 1.625537405135169e-05, - "loss": 0.0797, - "mean_token_accuracy": 0.9750150516629219, + "loss": 0.081, + "mean_token_accuracy": 0.9740028008818626, "step": 1046 }, { "epoch": 6.12316715542522, - "grad_norm": 0.556525046678587, + "grad_norm": 0.4545248918295238, "learning_rate": 1.622288510939325e-05, - "loss": 0.0678, - "mean_token_accuracy": 0.9790779277682304, + "loss": 0.0656, + "mean_token_accuracy": 0.9772230237722397, "step": 1047 }, { "epoch": 6.129032258064516, - "grad_norm": 0.6626182528269104, + "grad_norm": 0.45739493592200814, "learning_rate": 1.619041713602864e-05, - "loss": 0.0774, - "mean_token_accuracy": 0.9791093915700912, + "loss": 0.0728, + "mean_token_accuracy": 0.9779245927929878, "step": 1048 }, { "epoch": 6.134897360703812, - "grad_norm": 0.5662890345517463, + "grad_norm": 0.51162293891879, "learning_rate": 1.6157970249103484e-05, - "loss": 0.0694, - "mean_token_accuracy": 0.9782344624400139, + "loss": 0.0743, + "mean_token_accuracy": 0.9762742295861244, "step": 1049 }, { "epoch": 6.140762463343108, - "grad_norm": 0.5645945744859859, + "grad_norm": 0.5507465476783818, "learning_rate": 1.612554456638688e-05, "loss": 0.0721, - "mean_token_accuracy": 0.9771447703242302, + "mean_token_accuracy": 0.9776153936982155, "step": 1050 }, { "epoch": 6.146627565982405, - "grad_norm": 0.6274940318220598, + "grad_norm": 0.5145632814275362, "learning_rate": 1.6093140205570962e-05, - "loss": 0.0753, - "mean_token_accuracy": 0.976951114833355, + "loss": 0.0791, + "mean_token_accuracy": 0.976268582046032, "step": 1051 }, { "epoch": 6.152492668621701, - "grad_norm": 0.4898064028531864, + "grad_norm": 0.9051351509370605, "learning_rate": 1.6060757284270474e-05, - "loss": 0.0724, - "mean_token_accuracy": 0.9772902429103851, + "loss": 0.0805, + "mean_token_accuracy": 0.9752627164125443, "step": 1052 }, { "epoch": 6.158357771260997, - "grad_norm": 0.4672713017679259, + "grad_norm": 0.4403044486083453, "learning_rate": 1.6028395920022336e-05, - "loss": 0.0549, - "mean_token_accuracy": 0.9808409512042999, + "loss": 0.0574, + "mean_token_accuracy": 0.9793043285608292, "step": 1053 }, { "epoch": 6.164222873900293, - "grad_norm": 0.5108336954770324, + "grad_norm": 0.3794117905310881, "learning_rate": 1.5996056230285237e-05, - "loss": 0.0622, - "mean_token_accuracy": 0.9801520705223083, + "loss": 0.0607, + "mean_token_accuracy": 0.9801375046372414, "step": 1054 }, { "epoch": 6.170087976539589, - "grad_norm": 0.41505936098873364, + "grad_norm": 0.4175007624888782, "learning_rate": 1.596373833243918e-05, - "loss": 0.0617, - "mean_token_accuracy": 0.978906974196434, + "loss": 0.0643, + "mean_token_accuracy": 0.978547640144825, "step": 1055 }, { "epoch": 6.1759530791788855, - "grad_norm": 0.6576675728809507, + "grad_norm": 0.45255458686021044, "learning_rate": 1.593144234378509e-05, - "loss": 0.0687, - "mean_token_accuracy": 0.978096179664135, + "loss": 0.0656, + "mean_token_accuracy": 0.9782090336084366, "step": 1056 }, { "epoch": 6.181818181818182, - "grad_norm": 0.4119939746598458, + "grad_norm": 0.49640241962018194, "learning_rate": 1.5899168381544362e-05, - "loss": 0.0592, - "mean_token_accuracy": 0.9820670709013939, + "loss": 0.0651, + "mean_token_accuracy": 0.9813453853130341, "step": 1057 }, { "epoch": 6.187683284457478, - "grad_norm": 0.38406056548322093, + "grad_norm": 0.4012097342615111, "learning_rate": 1.5866916562858444e-05, - "loss": 0.0579, - "mean_token_accuracy": 0.9805167242884636, + "loss": 0.0597, + "mean_token_accuracy": 0.9798336252570152, "step": 1058 }, { "epoch": 6.193548387096774, - "grad_norm": 0.42331392587902517, + "grad_norm": 0.3832795337648082, "learning_rate": 1.5834687004788406e-05, - "loss": 0.0636, - "mean_token_accuracy": 0.9793353825807571, + "loss": 0.061, + "mean_token_accuracy": 0.9790323451161385, "step": 1059 }, { "epoch": 6.19941348973607, - "grad_norm": 0.5444805201415094, + "grad_norm": 0.45921292178284495, "learning_rate": 1.5802479824314537e-05, - "loss": 0.0663, - "mean_token_accuracy": 0.9753135293722153, + "loss": 0.0713, + "mean_token_accuracy": 0.9753189608454704, "step": 1060 }, { "epoch": 6.205278592375366, - "grad_norm": 0.5768143603726456, + "grad_norm": 0.37110042750500344, "learning_rate": 1.5770295138335896e-05, - "loss": 0.0633, - "mean_token_accuracy": 0.9811239168047905, + "loss": 0.0557, + "mean_token_accuracy": 0.9829725921154022, "step": 1061 }, { "epoch": 6.211143695014663, - "grad_norm": 0.5086983780677398, + "grad_norm": 0.5292919165677671, "learning_rate": 1.573813306366988e-05, - "loss": 0.0591, - "mean_token_accuracy": 0.9830747321248055, + "loss": 0.0607, + "mean_token_accuracy": 0.9827964678406715, "step": 1062 }, { "epoch": 6.217008797653959, - "grad_norm": 0.49230527321663736, + "grad_norm": 0.5901800906736643, "learning_rate": 1.5705993717051838e-05, - "loss": 0.0721, - "mean_token_accuracy": 0.976339653134346, + "loss": 0.0803, + "mean_token_accuracy": 0.9757646322250366, "step": 1063 }, { "epoch": 6.222873900293255, - "grad_norm": 0.6986236541493921, + "grad_norm": 0.5673704851630527, "learning_rate": 1.567387721513462e-05, - "loss": 0.0698, - "mean_token_accuracy": 0.9772974252700806, + "loss": 0.0742, + "mean_token_accuracy": 0.9788571149110794, "step": 1064 }, { "epoch": 6.228739002932551, - "grad_norm": 0.42866239632289127, + "grad_norm": 0.517815339963219, "learning_rate": 1.5641783674488155e-05, - "loss": 0.0613, - "mean_token_accuracy": 0.9813599810004234, + "loss": 0.068, + "mean_token_accuracy": 0.9804918318986893, "step": 1065 }, { "epoch": 6.234604105571847, - "grad_norm": 0.3950681428606381, + "grad_norm": 0.6981822028647853, "learning_rate": 1.5609713211599035e-05, - "loss": 0.0691, - "mean_token_accuracy": 0.9793985933065414, + "loss": 0.0771, + "mean_token_accuracy": 0.9772537648677826, "step": 1066 }, { "epoch": 6.2404692082111435, - "grad_norm": 0.49277456950892906, + "grad_norm": 0.5645231236916332, "learning_rate": 1.557766594287009e-05, - "loss": 0.0722, - "mean_token_accuracy": 0.9784559234976768, + "loss": 0.0783, + "mean_token_accuracy": 0.9776250943541527, "step": 1067 }, { "epoch": 6.24633431085044, - "grad_norm": 0.5974984569237553, + "grad_norm": 0.425211654077209, "learning_rate": 1.554564198461996e-05, - "loss": 0.0773, - "mean_token_accuracy": 0.9730274602770805, + "loss": 0.0772, + "mean_token_accuracy": 0.9736573025584221, "step": 1068 }, { "epoch": 6.252199413489736, - "grad_norm": 0.5298257752867717, + "grad_norm": 0.6755364232320741, "learning_rate": 1.5513641453082672e-05, - "loss": 0.0652, - "mean_token_accuracy": 0.9812508746981621, + "loss": 0.0715, + "mean_token_accuracy": 0.9788224399089813, "step": 1069 }, { "epoch": 6.258064516129032, - "grad_norm": 0.4929415822759338, + "grad_norm": 0.4802248171400769, "learning_rate": 1.5481664464407246e-05, - "loss": 0.0619, - "mean_token_accuracy": 0.9832079485058784, + "loss": 0.0638, + "mean_token_accuracy": 0.9831561893224716, "step": 1070 }, { "epoch": 6.263929618768328, - "grad_norm": 0.45181732363229793, + "grad_norm": 1.7321569882434473, "learning_rate": 1.5449711134657224e-05, - "loss": 0.0687, - "mean_token_accuracy": 0.9794114828109741, + "loss": 0.0757, + "mean_token_accuracy": 0.9757629260420799, "step": 1071 }, { "epoch": 6.269794721407624, - "grad_norm": 0.4301733956357397, + "grad_norm": 0.44523777582660745, "learning_rate": 1.5417781579810296e-05, - "loss": 0.0676, - "mean_token_accuracy": 0.9799632504582405, + "loss": 0.0691, + "mean_token_accuracy": 0.9794281423091888, "step": 1072 }, { "epoch": 6.275659824046921, - "grad_norm": 0.45380792420397464, + "grad_norm": 0.5126014904862785, "learning_rate": 1.5385875915757846e-05, - "loss": 0.0565, - "mean_token_accuracy": 0.9798811078071594, + "loss": 0.0624, + "mean_token_accuracy": 0.9795156866312027, "step": 1073 }, { "epoch": 6.281524926686217, - "grad_norm": 0.4889342003194968, + "grad_norm": 0.3975436578422233, "learning_rate": 1.535399425830456e-05, - "loss": 0.062, - "mean_token_accuracy": 0.9813030734658241, + "loss": 0.0642, + "mean_token_accuracy": 0.9801967516541481, "step": 1074 }, { "epoch": 6.287390029325513, - "grad_norm": 0.49157956071263753, + "grad_norm": 0.5115367202459034, "learning_rate": 1.5322136723167957e-05, - "loss": 0.0678, - "mean_token_accuracy": 0.9778005704283714, + "loss": 0.0669, + "mean_token_accuracy": 0.9776618257164955, "step": 1075 }, { "epoch": 6.293255131964809, - "grad_norm": 0.36148981245399664, + "grad_norm": 0.4162534346021217, "learning_rate": 1.5290303425978036e-05, - "loss": 0.0578, - "mean_token_accuracy": 0.9815500751137733, + "loss": 0.0636, + "mean_token_accuracy": 0.9809231907129288, "step": 1076 }, { "epoch": 6.299120234604105, - "grad_norm": 0.5078662394006407, + "grad_norm": 0.5679322341771472, "learning_rate": 1.525849448227681e-05, - "loss": 0.0649, - "mean_token_accuracy": 0.9803764596581459, + "loss": 0.0721, + "mean_token_accuracy": 0.9788202419877052, "step": 1077 }, { "epoch": 6.3049853372434015, - "grad_norm": 0.44875417467973094, + "grad_norm": 0.33264597776146443, "learning_rate": 1.5226710007517894e-05, "loss": 0.0728, - "mean_token_accuracy": 0.977813683450222, + "mean_token_accuracy": 0.9753846675157547, "step": 1078 }, { "epoch": 6.310850439882698, - "grad_norm": 0.43324133716142355, + "grad_norm": 0.3700642741877526, "learning_rate": 1.5194950117066097e-05, - "loss": 0.0594, - "mean_token_accuracy": 0.979973241686821, + "loss": 0.0602, + "mean_token_accuracy": 0.9789994210004807, "step": 1079 }, { "epoch": 6.316715542521994, - "grad_norm": 0.4151240808065745, + "grad_norm": 0.4538963744798111, "learning_rate": 1.5163214926196995e-05, - "loss": 0.0701, - "mean_token_accuracy": 0.9771018698811531, + "loss": 0.0795, + "mean_token_accuracy": 0.9746260717511177, "step": 1080 }, { "epoch": 6.32258064516129, - "grad_norm": 0.4555302943525885, + "grad_norm": 0.41619435583306597, "learning_rate": 1.5131504550096515e-05, - "loss": 0.0669, - "mean_token_accuracy": 0.978136457502842, + "loss": 0.0677, + "mean_token_accuracy": 0.9778345227241516, "step": 1081 }, { "epoch": 6.328445747800586, - "grad_norm": 0.6519430545614232, + "grad_norm": 0.2562840436018545, "learning_rate": 1.5099819103860504e-05, - "loss": 0.0629, - "mean_token_accuracy": 0.9802589863538742, + "loss": 0.0603, + "mean_token_accuracy": 0.9805175438523293, "step": 1082 }, { "epoch": 6.334310850439882, - "grad_norm": 0.4967967194966807, + "grad_norm": 0.48366360471108993, "learning_rate": 1.5068158702494348e-05, - "loss": 0.0574, - "mean_token_accuracy": 0.9817872196435928, + "loss": 0.0588, + "mean_token_accuracy": 0.9807134121656418, "step": 1083 }, { "epoch": 6.340175953079179, - "grad_norm": 0.44345834591558786, + "grad_norm": 0.43233510759403737, "learning_rate": 1.5036523460912511e-05, - "loss": 0.0564, - "mean_token_accuracy": 0.9838694632053375, + "loss": 0.0589, + "mean_token_accuracy": 0.982612282037735, "step": 1084 }, { "epoch": 6.346041055718475, - "grad_norm": 0.4715058201780604, + "grad_norm": 0.41436575580785434, "learning_rate": 1.5004913493938147e-05, - "loss": 0.0634, - "mean_token_accuracy": 0.9793806448578835, + "loss": 0.0672, + "mean_token_accuracy": 0.9788793921470642, "step": 1085 }, { "epoch": 6.351906158357771, - "grad_norm": 0.5443439102135127, + "grad_norm": 0.5154873539617718, "learning_rate": 1.4973328916302667e-05, - "loss": 0.0737, - "mean_token_accuracy": 0.9753685146570206, + "loss": 0.0746, + "mean_token_accuracy": 0.9756126180291176, "step": 1086 }, { "epoch": 6.357771260997067, - "grad_norm": 0.5384867941568995, + "grad_norm": 0.40160039748906046, "learning_rate": 1.4941769842645335e-05, - "loss": 0.0658, - "mean_token_accuracy": 0.9772131741046906, + "loss": 0.0694, + "mean_token_accuracy": 0.9775484576821327, "step": 1087 }, { "epoch": 6.363636363636363, - "grad_norm": 0.4575840235851356, + "grad_norm": 0.3751129300388227, "learning_rate": 1.4910236387512837e-05, - "loss": 0.0635, - "mean_token_accuracy": 0.9794055670499802, + "loss": 0.06, + "mean_token_accuracy": 0.9810876697301865, "step": 1088 }, { "epoch": 6.3695014662756595, - "grad_norm": 0.40669986519786894, + "grad_norm": 0.38549860564235966, "learning_rate": 1.487872866535888e-05, - "loss": 0.0586, - "mean_token_accuracy": 0.9820075482130051, + "loss": 0.0633, + "mean_token_accuracy": 0.981402188539505, "step": 1089 }, { "epoch": 6.375366568914956, - "grad_norm": 0.5175296953254443, + "grad_norm": 0.4941639857655589, "learning_rate": 1.4847246790543773e-05, - "loss": 0.0618, - "mean_token_accuracy": 0.9794023782014847, + "loss": 0.067, + "mean_token_accuracy": 0.9788068756461143, "step": 1090 }, { "epoch": 6.381231671554252, - "grad_norm": 0.4715228486396621, + "grad_norm": 0.4267367405789239, "learning_rate": 1.4815790877334007e-05, - "loss": 0.0566, - "mean_token_accuracy": 0.9806531295180321, + "loss": 0.0585, + "mean_token_accuracy": 0.9816467389464378, "step": 1091 }, { "epoch": 6.387096774193548, - "grad_norm": 0.5288849918324824, + "grad_norm": 0.4632733601898166, "learning_rate": 1.4784361039901844e-05, - "loss": 0.0684, - "mean_token_accuracy": 0.9798084422945976, + "loss": 0.0699, + "mean_token_accuracy": 0.9788152202963829, "step": 1092 }, { "epoch": 6.392961876832844, - "grad_norm": 0.4773056484959844, + "grad_norm": 0.4401889419648337, "learning_rate": 1.47529573923249e-05, - "loss": 0.0602, - "mean_token_accuracy": 0.9808967262506485, + "loss": 0.0603, + "mean_token_accuracy": 0.98129902780056, "step": 1093 }, { "epoch": 6.39882697947214, - "grad_norm": 0.5215802713663142, + "grad_norm": 0.4408537773426437, "learning_rate": 1.472158004858573e-05, - "loss": 0.0623, - "mean_token_accuracy": 0.9784137681126595, + "loss": 0.0672, + "mean_token_accuracy": 0.979188434779644, "step": 1094 }, { "epoch": 6.404692082111437, - "grad_norm": 0.594013580780295, + "grad_norm": 0.40532565045994134, "learning_rate": 1.4690229122571419e-05, - "loss": 0.0749, - "mean_token_accuracy": 0.9752750173211098, + "loss": 0.0748, + "mean_token_accuracy": 0.9763947352766991, "step": 1095 }, { "epoch": 6.410557184750733, - "grad_norm": 0.45076680417605564, + "grad_norm": 0.467733449302367, "learning_rate": 1.4658904728073169e-05, - "loss": 0.0579, - "mean_token_accuracy": 0.9810296148061752, + "loss": 0.0619, + "mean_token_accuracy": 0.9803221970796585, "step": 1096 }, { "epoch": 6.416422287390029, - "grad_norm": 0.4686177977484101, + "grad_norm": 0.4423734507777753, "learning_rate": 1.4627606978785878e-05, - "loss": 0.0653, - "mean_token_accuracy": 0.9801298379898071, + "loss": 0.07, + "mean_token_accuracy": 0.9782881215214729, "step": 1097 }, { "epoch": 6.422287390029325, - "grad_norm": 0.473498323409717, + "grad_norm": 0.5813032813305795, "learning_rate": 1.4596335988307736e-05, - "loss": 0.0715, - "mean_token_accuracy": 0.9790318608283997, + "loss": 0.075, + "mean_token_accuracy": 0.9799735173583031, "step": 1098 }, { "epoch": 6.428152492668621, - "grad_norm": 0.3829985516560962, + "grad_norm": 0.3990593770762677, "learning_rate": 1.4565091870139814e-05, - "loss": 0.0544, - "mean_token_accuracy": 0.9823189005255699, + "loss": 0.058, + "mean_token_accuracy": 0.9816461279988289, "step": 1099 }, { "epoch": 6.4340175953079175, - "grad_norm": 0.6985399778014056, + "grad_norm": 0.5096360075436533, "learning_rate": 1.4533874737685638e-05, - "loss": 0.0832, - "mean_token_accuracy": 0.9754642993211746, + "loss": 0.0827, + "mean_token_accuracy": 0.9770070910453796, "step": 1100 }, { "epoch": 6.439882697947214, - "grad_norm": 0.47408544720881507, + "grad_norm": 0.4034576006966357, "learning_rate": 1.450268470425079e-05, - "loss": 0.0654, - "mean_token_accuracy": 0.9818674698472023, + "loss": 0.0623, + "mean_token_accuracy": 0.9812736809253693, "step": 1101 }, { "epoch": 6.44574780058651, - "grad_norm": 0.4132311752624299, + "grad_norm": 0.5083541291281424, "learning_rate": 1.4471521883042492e-05, - "loss": 0.0595, - "mean_token_accuracy": 0.9819449707865715, + "loss": 0.0656, + "mean_token_accuracy": 0.9808938726782799, "step": 1102 }, { "epoch": 6.451612903225806, - "grad_norm": 0.5282825996755145, + "grad_norm": 0.547718663444142, "learning_rate": 1.4440386387169207e-05, - "loss": 0.0676, - "mean_token_accuracy": 0.9805441722273827, + "loss": 0.0721, + "mean_token_accuracy": 0.9780811071395874, "step": 1103 }, { "epoch": 6.457478005865102, - "grad_norm": 0.5093444055532642, + "grad_norm": 0.471186574564861, "learning_rate": 1.4409278329640218e-05, - "loss": 0.0696, - "mean_token_accuracy": 0.9782739505171776, + "loss": 0.069, + "mean_token_accuracy": 0.9777532294392586, "step": 1104 }, { "epoch": 6.463343108504398, - "grad_norm": 0.40896293549583035, + "grad_norm": 0.39531628982526346, "learning_rate": 1.4378197823365186e-05, - "loss": 0.0653, - "mean_token_accuracy": 0.980708159506321, + "loss": 0.067, + "mean_token_accuracy": 0.9800560027360916, "step": 1105 }, { "epoch": 6.469208211143695, - "grad_norm": 0.5368030574271555, + "grad_norm": 0.5769580026963542, "learning_rate": 1.4347144981153807e-05, - "loss": 0.0763, - "mean_token_accuracy": 0.9757898151874542, + "loss": 0.0824, + "mean_token_accuracy": 0.9750961661338806, "step": 1106 }, { "epoch": 6.475073313782991, - "grad_norm": 0.33963125667874944, + "grad_norm": 0.41834706105462965, "learning_rate": 1.4316119915715363e-05, - "loss": 0.0552, - "mean_token_accuracy": 0.9820173308253288, + "loss": 0.0598, + "mean_token_accuracy": 0.9810346961021423, "step": 1107 }, { "epoch": 6.480938416422287, - "grad_norm": 0.5695780400518249, + "grad_norm": 0.5247956654805205, "learning_rate": 1.42851227396583e-05, - "loss": 0.0724, - "mean_token_accuracy": 0.9782184883952141, + "loss": 0.076, + "mean_token_accuracy": 0.9773802384734154, "step": 1108 }, { "epoch": 6.486803519061583, - "grad_norm": 0.4686135496909153, + "grad_norm": 0.5182710062953582, "learning_rate": 1.4254153565489861e-05, - "loss": 0.0694, - "mean_token_accuracy": 0.9768488556146622, + "loss": 0.074, + "mean_token_accuracy": 0.9766635000705719, "step": 1109 }, { "epoch": 6.492668621700879, - "grad_norm": 0.4444438012165556, + "grad_norm": 0.4548167468262035, "learning_rate": 1.4223212505615634e-05, - "loss": 0.0636, - "mean_token_accuracy": 0.981613002717495, + "loss": 0.0656, + "mean_token_accuracy": 0.9804381132125854, "step": 1110 }, { "epoch": 6.4985337243401755, - "grad_norm": 0.4325209107027411, + "grad_norm": 0.40430833242136927, "learning_rate": 1.4192299672339167e-05, - "loss": 0.0583, - "mean_token_accuracy": 0.9803315699100494, + "loss": 0.0578, + "mean_token_accuracy": 0.9801406562328339, "step": 1111 }, { "epoch": 6.504398826979472, - "grad_norm": 0.5013136187861221, + "grad_norm": 0.5370970451033729, "learning_rate": 1.4161415177861568e-05, - "loss": 0.0656, - "mean_token_accuracy": 0.9776363521814346, + "loss": 0.0713, + "mean_token_accuracy": 0.9772634580731392, "step": 1112 }, { "epoch": 6.510263929618768, - "grad_norm": 0.3588687094600502, + "grad_norm": 0.3364637357817881, "learning_rate": 1.4130559134281074e-05, - "loss": 0.0566, - "mean_token_accuracy": 0.9824674054980278, + "loss": 0.0592, + "mean_token_accuracy": 0.9810837209224701, "step": 1113 }, { "epoch": 6.516129032258064, - "grad_norm": 0.5881958388687754, + "grad_norm": 0.43331710088554903, "learning_rate": 1.4099731653592668e-05, - "loss": 0.0662, - "mean_token_accuracy": 0.9796800762414932, + "loss": 0.066, + "mean_token_accuracy": 0.9798248186707497, "step": 1114 }, { "epoch": 6.52199413489736, - "grad_norm": 0.5935746416719274, + "grad_norm": 0.4495256260222478, "learning_rate": 1.406893284768764e-05, - "loss": 0.0772, - "mean_token_accuracy": 0.9757950976490974, + "loss": 0.0781, + "mean_token_accuracy": 0.9766861349344254, "step": 1115 }, { "epoch": 6.527859237536656, - "grad_norm": 0.6106151329018146, + "grad_norm": 0.5395741783044383, "learning_rate": 1.4038162828353223e-05, - "loss": 0.0767, - "mean_token_accuracy": 0.9741943404078484, + "loss": 0.0753, + "mean_token_accuracy": 0.9756533950567245, "step": 1116 }, { "epoch": 6.533724340175953, - "grad_norm": 0.5190014325868866, + "grad_norm": 0.5422962438171246, "learning_rate": 1.4007421707272167e-05, - "loss": 0.0673, - "mean_token_accuracy": 0.9787988364696503, + "loss": 0.0716, + "mean_token_accuracy": 0.9782907888293266, "step": 1117 }, { "epoch": 6.539589442815249, - "grad_norm": 0.47127351993031963, + "grad_norm": 0.5546002790540274, "learning_rate": 1.3976709596022313e-05, - "loss": 0.0648, - "mean_token_accuracy": 0.9789974242448807, + "loss": 0.0709, + "mean_token_accuracy": 0.9770197793841362, "step": 1118 }, { "epoch": 6.545454545454545, - "grad_norm": 0.5045724179125629, + "grad_norm": 0.518015113413407, "learning_rate": 1.3946026606076232e-05, - "loss": 0.0682, - "mean_token_accuracy": 0.9812200665473938, + "loss": 0.0716, + "mean_token_accuracy": 0.9807683080434799, "step": 1119 }, { "epoch": 6.551319648093841, - "grad_norm": 0.5047482899527544, + "grad_norm": 0.6932508207570297, "learning_rate": 1.3915372848800784e-05, - "loss": 0.0631, - "mean_token_accuracy": 0.981167197227478, + "loss": 0.0776, + "mean_token_accuracy": 0.9786146953701973, "step": 1120 }, { "epoch": 6.557184750733137, - "grad_norm": 0.4417462758826341, + "grad_norm": 0.46596073614841477, "learning_rate": 1.388474843545672e-05, - "loss": 0.0552, - "mean_token_accuracy": 0.9817292094230652, + "loss": 0.0615, + "mean_token_accuracy": 0.979241244494915, "step": 1121 }, { "epoch": 6.563049853372434, - "grad_norm": 0.465911409594195, + "grad_norm": 0.48023768969255687, "learning_rate": 1.3854153477198305e-05, - "loss": 0.0776, - "mean_token_accuracy": 0.9728592708706856, + "loss": 0.083, + "mean_token_accuracy": 0.9719856157898903, "step": 1122 }, { "epoch": 6.568914956011731, - "grad_norm": 0.3971582032508505, + "grad_norm": 0.4138687687316157, "learning_rate": 1.3823588085072865e-05, - "loss": 0.0595, - "mean_token_accuracy": 0.9787317886948586, + "loss": 0.0661, + "mean_token_accuracy": 0.9797427132725716, "step": 1123 }, { "epoch": 6.574780058651027, - "grad_norm": 0.5127794810231362, + "grad_norm": 0.47858914117559365, "learning_rate": 1.3793052370020441e-05, - "loss": 0.0732, - "mean_token_accuracy": 0.9790031611919403, + "loss": 0.0718, + "mean_token_accuracy": 0.9773967415094376, "step": 1124 }, { "epoch": 6.580645161290323, - "grad_norm": 0.5745357026449952, + "grad_norm": 0.6243697871259365, "learning_rate": 1.3762546442873343e-05, - "loss": 0.0706, - "mean_token_accuracy": 0.9805065914988518, + "loss": 0.0692, + "mean_token_accuracy": 0.9803427085280418, "step": 1125 }, { "epoch": 6.586510263929619, - "grad_norm": 0.5035709411175152, + "grad_norm": 0.4875492799742511, "learning_rate": 1.3732070414355766e-05, - "loss": 0.069, - "mean_token_accuracy": 0.9779914170503616, + "loss": 0.0705, + "mean_token_accuracy": 0.9797642901539803, "step": 1126 }, { "epoch": 6.592375366568915, - "grad_norm": 0.49677571366761347, + "grad_norm": 0.3718015452532362, "learning_rate": 1.370162439508339e-05, - "loss": 0.0628, - "mean_token_accuracy": 0.9810082614421844, + "loss": 0.0606, + "mean_token_accuracy": 0.9801923632621765, "step": 1127 }, { "epoch": 6.5982404692082115, - "grad_norm": 0.4300492329897516, + "grad_norm": 0.5704303577432631, "learning_rate": 1.367120849556296e-05, - "loss": 0.0628, - "mean_token_accuracy": 0.9806487932801247, + "loss": 0.0724, + "mean_token_accuracy": 0.9793426245450974, "step": 1128 }, { "epoch": 6.604105571847508, - "grad_norm": 0.3446649815945842, + "grad_norm": 0.5146825324740882, "learning_rate": 1.3640822826191907e-05, - "loss": 0.0504, - "mean_token_accuracy": 0.9841926470398903, + "loss": 0.0631, + "mean_token_accuracy": 0.9816078171133995, "step": 1129 }, { "epoch": 6.609970674486804, - "grad_norm": 0.49179030281896613, + "grad_norm": 0.5643347398770314, "learning_rate": 1.361046749725794e-05, - "loss": 0.0683, - "mean_token_accuracy": 0.9768203571438789, + "loss": 0.0727, + "mean_token_accuracy": 0.9767194986343384, "step": 1130 }, { "epoch": 6.6158357771261, - "grad_norm": 0.3978124224369657, + "grad_norm": 0.38975674311350267, "learning_rate": 1.3580142618938647e-05, - "loss": 0.0538, - "mean_token_accuracy": 0.9839759469032288, + "loss": 0.0566, + "mean_token_accuracy": 0.9839957654476166, "step": 1131 }, { "epoch": 6.621700879765396, - "grad_norm": 0.4851032507862954, + "grad_norm": 0.34822313778323327, "learning_rate": 1.354984830130109e-05, - "loss": 0.0703, - "mean_token_accuracy": 0.9769893512129784, + "loss": 0.0696, + "mean_token_accuracy": 0.9769606366753578, "step": 1132 }, { "epoch": 6.627565982404692, - "grad_norm": 0.46896793554361244, + "grad_norm": 0.4365409164065618, "learning_rate": 1.3519584654301401e-05, - "loss": 0.0669, - "mean_token_accuracy": 0.979073740541935, + "loss": 0.0655, + "mean_token_accuracy": 0.9780000522732735, "step": 1133 }, { "epoch": 6.633431085043989, - "grad_norm": 0.4433020660589846, + "grad_norm": 0.4528538426658204, "learning_rate": 1.3489351787784398e-05, - "loss": 0.0641, - "mean_token_accuracy": 0.9796401932835579, + "loss": 0.0673, + "mean_token_accuracy": 0.9787932485342026, "step": 1134 }, { "epoch": 6.639296187683285, - "grad_norm": 0.6547657796915215, + "grad_norm": 0.48020115114867207, "learning_rate": 1.3459149811483178e-05, - "loss": 0.0826, - "mean_token_accuracy": 0.9772631600499153, + "loss": 0.0779, + "mean_token_accuracy": 0.9763984754681587, "step": 1135 }, { "epoch": 6.645161290322581, - "grad_norm": 0.6772433811470678, + "grad_norm": 0.5851000334081872, "learning_rate": 1.342897883501872e-05, - "loss": 0.0732, - "mean_token_accuracy": 0.9791544526815414, + "loss": 0.0796, + "mean_token_accuracy": 0.9779474139213562, "step": 1136 }, { "epoch": 6.651026392961877, - "grad_norm": 0.5203169937735215, + "grad_norm": 0.3951030203418182, "learning_rate": 1.3398838967899477e-05, - "loss": 0.0642, - "mean_token_accuracy": 0.978813648223877, + "loss": 0.0641, + "mean_token_accuracy": 0.9799654707312584, "step": 1137 }, { "epoch": 6.656891495601173, - "grad_norm": 0.42941729523725874, + "grad_norm": 0.41805758317928976, "learning_rate": 1.3368730319520992e-05, - "loss": 0.0642, - "mean_token_accuracy": 0.9808278232812881, + "loss": 0.0638, + "mean_token_accuracy": 0.9813594818115234, "step": 1138 }, { "epoch": 6.6627565982404695, - "grad_norm": 0.4520583507512642, + "grad_norm": 0.43689086975785124, "learning_rate": 1.3338652999165511e-05, - "loss": 0.0672, - "mean_token_accuracy": 0.9790240898728371, + "loss": 0.0725, + "mean_token_accuracy": 0.9788738563656807, "step": 1139 }, { "epoch": 6.668621700879766, - "grad_norm": 0.3642854711728884, + "grad_norm": 0.31790770823484493, "learning_rate": 1.3308607116001549e-05, - "loss": 0.0572, - "mean_token_accuracy": 0.98244908452034, + "loss": 0.0588, + "mean_token_accuracy": 0.9820768386125565, "step": 1140 }, { "epoch": 6.674486803519062, - "grad_norm": 0.521170844330364, + "grad_norm": 0.36071681861745586, "learning_rate": 1.3278592779083534e-05, - "loss": 0.0565, - "mean_token_accuracy": 0.983853779733181, + "loss": 0.059, + "mean_token_accuracy": 0.9846365228295326, "step": 1141 }, { "epoch": 6.680351906158358, - "grad_norm": 0.39924193832736776, + "grad_norm": 0.406874459719353, "learning_rate": 1.324861009735138e-05, - "loss": 0.0607, - "mean_token_accuracy": 0.9806713908910751, + "loss": 0.0657, + "mean_token_accuracy": 0.9802506789565086, "step": 1142 }, { "epoch": 6.686217008797654, - "grad_norm": 0.4039864199512111, + "grad_norm": 0.4026037779692382, "learning_rate": 1.3218659179630112e-05, - "loss": 0.0645, - "mean_token_accuracy": 0.9802507907152176, + "loss": 0.0689, + "mean_token_accuracy": 0.9794812649488449, "step": 1143 }, { "epoch": 6.69208211143695, - "grad_norm": 0.5196436019363877, + "grad_norm": 0.3609274332773618, "learning_rate": 1.3188740134629469e-05, - "loss": 0.0665, - "mean_token_accuracy": 0.9800485447049141, + "loss": 0.0651, + "mean_token_accuracy": 0.9809269681572914, "step": 1144 }, { "epoch": 6.697947214076247, - "grad_norm": 0.42089828963999687, + "grad_norm": 0.43413406807039673, "learning_rate": 1.3158853070943499e-05, - "loss": 0.0553, - "mean_token_accuracy": 0.9817090556025505, + "loss": 0.058, + "mean_token_accuracy": 0.981363870203495, "step": 1145 }, { "epoch": 6.703812316715543, - "grad_norm": 0.5007957363563511, + "grad_norm": 0.3905644328801645, "learning_rate": 1.3128998097050174e-05, - "loss": 0.0659, - "mean_token_accuracy": 0.9803542569279671, + "loss": 0.0639, + "mean_token_accuracy": 0.9799892753362656, "step": 1146 }, { "epoch": 6.709677419354839, - "grad_norm": 0.5010376117207007, + "grad_norm": 0.4415899012250326, "learning_rate": 1.3099175321310993e-05, - "loss": 0.066, - "mean_token_accuracy": 0.979380339384079, + "loss": 0.0663, + "mean_token_accuracy": 0.9794251173734665, "step": 1147 }, { "epoch": 6.715542521994135, - "grad_norm": 0.3592596677655769, + "grad_norm": 0.3303445901009437, "learning_rate": 1.3069384851970584e-05, - "loss": 0.0554, - "mean_token_accuracy": 0.9809225648641586, + "loss": 0.0577, + "mean_token_accuracy": 0.9799608886241913, "step": 1148 }, { "epoch": 6.721407624633431, - "grad_norm": 0.47254547050064377, + "grad_norm": 0.542470612619514, "learning_rate": 1.3039626797156321e-05, - "loss": 0.0613, - "mean_token_accuracy": 0.9800109416246414, + "loss": 0.0684, + "mean_token_accuracy": 0.978044830262661, "step": 1149 }, { "epoch": 6.7272727272727275, - "grad_norm": 0.40129886920431546, + "grad_norm": 0.4478625244312105, "learning_rate": 1.3009901264877924e-05, - "loss": 0.0588, - "mean_token_accuracy": 0.9832498729228973, + "loss": 0.0635, + "mean_token_accuracy": 0.9832303300499916, "step": 1150 }, { "epoch": 6.733137829912024, - "grad_norm": 0.3962732126618426, + "grad_norm": 0.3070246305139214, "learning_rate": 1.298020836302707e-05, - "loss": 0.0617, - "mean_token_accuracy": 0.9801494553685188, + "loss": 0.0634, + "mean_token_accuracy": 0.9798863604664803, "step": 1151 }, { "epoch": 6.73900293255132, - "grad_norm": 0.4094241439229563, + "grad_norm": 0.3787118140466253, "learning_rate": 1.2950548199376999e-05, - "loss": 0.0576, - "mean_token_accuracy": 0.9825976863503456, + "loss": 0.0619, + "mean_token_accuracy": 0.9830320551991463, "step": 1152 }, { "epoch": 6.744868035190616, - "grad_norm": 0.5378068116582913, + "grad_norm": 0.6262382974166212, "learning_rate": 1.292092088158213e-05, - "loss": 0.0676, - "mean_token_accuracy": 0.9790525138378143, + "loss": 0.0724, + "mean_token_accuracy": 0.9787451773881912, "step": 1153 }, { "epoch": 6.750733137829912, - "grad_norm": 0.3968231076570657, + "grad_norm": 0.4031409936415585, "learning_rate": 1.2891326517177663e-05, - "loss": 0.056, - "mean_token_accuracy": 0.9852932840585709, + "loss": 0.057, + "mean_token_accuracy": 0.9842765405774117, "step": 1154 }, { "epoch": 6.756598240469208, - "grad_norm": 0.48590510329215825, + "grad_norm": 0.370569744572269, "learning_rate": 1.2861765213579177e-05, - "loss": 0.069, - "mean_token_accuracy": 0.9758260548114777, + "loss": 0.0673, + "mean_token_accuracy": 0.9774162173271179, "step": 1155 }, { "epoch": 6.762463343108505, - "grad_norm": 0.4499456893763865, + "grad_norm": 0.4504539339758509, "learning_rate": 1.2832237078082272e-05, - "loss": 0.0588, - "mean_token_accuracy": 0.9807997494935989, + "loss": 0.0646, + "mean_token_accuracy": 0.9795903638005257, "step": 1156 }, { "epoch": 6.768328445747801, - "grad_norm": 0.4467830869566874, + "grad_norm": 0.3525672565690468, "learning_rate": 1.2802742217862156e-05, - "loss": 0.0659, - "mean_token_accuracy": 0.9799171090126038, + "loss": 0.068, + "mean_token_accuracy": 0.9798642173409462, "step": 1157 }, { "epoch": 6.774193548387097, - "grad_norm": 0.4405266054873109, + "grad_norm": 0.4219177264271062, "learning_rate": 1.2773280739973255e-05, - "loss": 0.0671, - "mean_token_accuracy": 0.977348655462265, + "loss": 0.0664, + "mean_token_accuracy": 0.978237122297287, "step": 1158 }, { "epoch": 6.780058651026393, - "grad_norm": 0.400408649403103, + "grad_norm": 0.3757554934232093, "learning_rate": 1.2743852751348833e-05, - "loss": 0.0553, - "mean_token_accuracy": 0.9837123081088066, + "loss": 0.0558, + "mean_token_accuracy": 0.9824602827429771, "step": 1159 }, { "epoch": 6.785923753665689, - "grad_norm": 0.3887758632837316, + "grad_norm": 0.44869090627023644, "learning_rate": 1.2714458358800612e-05, - "loss": 0.0471, - "mean_token_accuracy": 0.9850385040044785, + "loss": 0.0508, + "mean_token_accuracy": 0.9844018071889877, "step": 1160 }, { "epoch": 6.7917888563049855, - "grad_norm": 0.5172676745480713, + "grad_norm": 0.599932829838675, "learning_rate": 1.2685097669018362e-05, - "loss": 0.0689, - "mean_token_accuracy": 0.9788277596235275, + "loss": 0.0835, + "mean_token_accuracy": 0.976402185857296, "step": 1161 }, { "epoch": 6.797653958944282, - "grad_norm": 0.432827905221815, + "grad_norm": 0.35639663652954295, "learning_rate": 1.265577078856953e-05, - "loss": 0.0671, - "mean_token_accuracy": 0.9756351113319397, + "loss": 0.0708, + "mean_token_accuracy": 0.9760590195655823, "step": 1162 }, { "epoch": 6.803519061583578, - "grad_norm": 0.5482725408821638, + "grad_norm": 0.4358245772006115, "learning_rate": 1.2626477823898843e-05, - "loss": 0.072, - "mean_token_accuracy": 0.979312427341938, + "loss": 0.0737, + "mean_token_accuracy": 0.979936808347702, "step": 1163 }, { "epoch": 6.809384164222874, - "grad_norm": 0.5005950667187236, + "grad_norm": 0.4794803687851331, "learning_rate": 1.2597218881327944e-05, - "loss": 0.0644, - "mean_token_accuracy": 0.9781336486339569, + "loss": 0.0706, + "mean_token_accuracy": 0.9768901541829109, "step": 1164 }, { "epoch": 6.81524926686217, - "grad_norm": 0.3979272375106252, + "grad_norm": 0.32937163939726377, "learning_rate": 1.2567994067054961e-05, - "loss": 0.0624, - "mean_token_accuracy": 0.9795544818043709, + "loss": 0.063, + "mean_token_accuracy": 0.9787830784916878, "step": 1165 }, { "epoch": 6.821114369501466, - "grad_norm": 0.47983624496521715, + "grad_norm": 0.39110647767955126, "learning_rate": 1.2538803487154177e-05, - "loss": 0.0646, - "mean_token_accuracy": 0.9791703373193741, + "loss": 0.0643, + "mean_token_accuracy": 0.979418084025383, "step": 1166 }, { "epoch": 6.826979472140763, - "grad_norm": 0.4792233882496042, + "grad_norm": 0.33247603921786206, "learning_rate": 1.25096472475756e-05, - "loss": 0.0643, - "mean_token_accuracy": 0.9790126904845238, + "loss": 0.0644, + "mean_token_accuracy": 0.9782299101352692, "step": 1167 }, { "epoch": 6.832844574780059, - "grad_norm": 0.44047811584075663, + "grad_norm": 0.35810789676260385, "learning_rate": 1.248052545414461e-05, "loss": 0.0666, - "mean_token_accuracy": 0.9815377816557884, + "mean_token_accuracy": 0.9819423481822014, "step": 1168 }, { "epoch": 6.838709677419355, - "grad_norm": 0.45039377868298586, + "grad_norm": 0.4906003508933705, "learning_rate": 1.2451438212561556e-05, - "loss": 0.0728, - "mean_token_accuracy": 0.9747223258018494, + "loss": 0.0734, + "mean_token_accuracy": 0.9736126214265823, "step": 1169 }, { "epoch": 6.844574780058651, - "grad_norm": 0.437389204436552, + "grad_norm": 0.37903277572822597, "learning_rate": 1.2422385628401377e-05, - "loss": 0.064, - "mean_token_accuracy": 0.9793856963515282, + "loss": 0.0676, + "mean_token_accuracy": 0.9789053797721863, "step": 1170 }, { "epoch": 6.850439882697947, - "grad_norm": 0.488370718643388, + "grad_norm": 0.7759463426777434, "learning_rate": 1.2393367807113217e-05, - "loss": 0.0658, - "mean_token_accuracy": 0.9805554449558258, + "loss": 0.0752, + "mean_token_accuracy": 0.9790110364556313, "step": 1171 }, { "epoch": 6.8563049853372435, - "grad_norm": 0.39447639676794705, + "grad_norm": 0.43252889379637655, "learning_rate": 1.236438485402005e-05, - "loss": 0.0632, - "mean_token_accuracy": 0.9815583750605583, + "loss": 0.066, + "mean_token_accuracy": 0.9801287278532982, "step": 1172 }, { "epoch": 6.86217008797654, - "grad_norm": 0.3808687072825325, + "grad_norm": 0.5288479160944122, "learning_rate": 1.2335436874318293e-05, - "loss": 0.0593, - "mean_token_accuracy": 0.9811095669865608, + "loss": 0.0623, + "mean_token_accuracy": 0.9801551327109337, "step": 1173 }, { "epoch": 6.868035190615836, - "grad_norm": 0.45599887703857295, + "grad_norm": 0.35953238759821077, "learning_rate": 1.2306523973077416e-05, - "loss": 0.0714, - "mean_token_accuracy": 0.9796115532517433, + "loss": 0.0751, + "mean_token_accuracy": 0.9780450016260147, "step": 1174 }, { "epoch": 6.873900293255132, - "grad_norm": 0.47569451377053196, + "grad_norm": 0.44824241707674634, "learning_rate": 1.2277646255239572e-05, - "loss": 0.0694, - "mean_token_accuracy": 0.98045764118433, + "loss": 0.0742, + "mean_token_accuracy": 0.9785097539424896, "step": 1175 }, { "epoch": 6.879765395894428, - "grad_norm": 0.46631060127689494, + "grad_norm": 0.3386479051039518, "learning_rate": 1.2248803825619224e-05, - "loss": 0.0648, - "mean_token_accuracy": 0.9787806421518326, + "loss": 0.0652, + "mean_token_accuracy": 0.9789741039276123, "step": 1176 }, { "epoch": 6.885630498533724, - "grad_norm": 0.6885890063590842, + "grad_norm": 0.3734375985346956, "learning_rate": 1.2219996788902734e-05, - "loss": 0.0689, - "mean_token_accuracy": 0.978795975446701, + "loss": 0.0643, + "mean_token_accuracy": 0.980387419462204, "step": 1177 }, { "epoch": 6.891495601173021, - "grad_norm": 0.4303213452595518, + "grad_norm": 0.40272307561699655, "learning_rate": 1.2191225249648016e-05, - "loss": 0.0602, - "mean_token_accuracy": 0.9798460155725479, + "loss": 0.0616, + "mean_token_accuracy": 0.9799608364701271, "step": 1178 }, { "epoch": 6.897360703812317, - "grad_norm": 0.3937230216041198, + "grad_norm": 0.4925163928492032, "learning_rate": 1.216248931228413e-05, - "loss": 0.0667, - "mean_token_accuracy": 0.977984793484211, + "loss": 0.0707, + "mean_token_accuracy": 0.9777872562408447, "step": 1179 }, { "epoch": 6.903225806451613, - "grad_norm": 0.38130916152890315, + "grad_norm": 0.3894825168802102, "learning_rate": 1.2133789081110927e-05, - "loss": 0.0595, - "mean_token_accuracy": 0.981584794819355, + "loss": 0.062, + "mean_token_accuracy": 0.981035090982914, "step": 1180 }, { "epoch": 6.909090909090909, - "grad_norm": 0.46142872676320174, + "grad_norm": 0.5362801208592477, "learning_rate": 1.2105124660298655e-05, - "loss": 0.0625, - "mean_token_accuracy": 0.9815917834639549, + "loss": 0.0669, + "mean_token_accuracy": 0.9792104065418243, "step": 1181 }, { "epoch": 6.914956011730205, - "grad_norm": 0.43182919024308025, + "grad_norm": 0.44854169792906545, "learning_rate": 1.2076496153887587e-05, - "loss": 0.0591, - "mean_token_accuracy": 0.9817899540066719, + "loss": 0.066, + "mean_token_accuracy": 0.9807733446359634, "step": 1182 }, { "epoch": 6.9208211143695015, - "grad_norm": 0.42635933383608365, + "grad_norm": 0.3430754024750591, "learning_rate": 1.2047903665787633e-05, - "loss": 0.0618, - "mean_token_accuracy": 0.9816203713417053, + "loss": 0.0623, + "mean_token_accuracy": 0.9828178733587265, "step": 1183 }, { "epoch": 6.926686217008798, - "grad_norm": 0.41330290477227893, + "grad_norm": 0.4263487835060375, "learning_rate": 1.2019347299777981e-05, - "loss": 0.0557, - "mean_token_accuracy": 0.9830499663949013, + "loss": 0.0564, + "mean_token_accuracy": 0.9828721657395363, "step": 1184 }, { "epoch": 6.932551319648094, - "grad_norm": 0.4565062990930812, + "grad_norm": 0.4364783671104784, "learning_rate": 1.199082715950671e-05, - "loss": 0.071, - "mean_token_accuracy": 0.9775069504976273, + "loss": 0.074, + "mean_token_accuracy": 0.9775600358843803, "step": 1185 }, { "epoch": 6.93841642228739, - "grad_norm": 0.4137224208578769, + "grad_norm": 0.3615151589709061, "learning_rate": 1.1962343348490407e-05, - "loss": 0.0578, - "mean_token_accuracy": 0.980716660618782, + "loss": 0.0591, + "mean_token_accuracy": 0.9815037250518799, "step": 1186 }, { "epoch": 6.944281524926686, - "grad_norm": 0.3948613864092428, + "grad_norm": 0.34842137870991524, "learning_rate": 1.1933895970113798e-05, - "loss": 0.0652, - "mean_token_accuracy": 0.9824345782399178, + "loss": 0.07, + "mean_token_accuracy": 0.9812806695699692, "step": 1187 }, { "epoch": 6.9501466275659824, - "grad_norm": 0.5122310403133853, + "grad_norm": 0.39779087229671267, "learning_rate": 1.1905485127629387e-05, - "loss": 0.0709, - "mean_token_accuracy": 0.9782714620232582, + "loss": 0.0687, + "mean_token_accuracy": 0.9787686914205551, "step": 1188 }, { "epoch": 6.956011730205279, - "grad_norm": 0.4604201900693111, + "grad_norm": 0.37156810873265006, "learning_rate": 1.1877110924157046e-05, - "loss": 0.0612, - "mean_token_accuracy": 0.9789381250739098, + "loss": 0.064, + "mean_token_accuracy": 0.9793610125780106, "step": 1189 }, { "epoch": 6.961876832844575, - "grad_norm": 0.4017710192115634, + "grad_norm": 0.36891675930420137, "learning_rate": 1.1848773462683684e-05, - "loss": 0.0673, - "mean_token_accuracy": 0.9794546961784363, + "loss": 0.0691, + "mean_token_accuracy": 0.9793200343847275, "step": 1190 }, { "epoch": 6.967741935483871, - "grad_norm": 0.47930547919907945, + "grad_norm": 0.3973752929987383, "learning_rate": 1.1820472846062842e-05, - "loss": 0.0628, - "mean_token_accuracy": 0.9796619564294815, + "loss": 0.0612, + "mean_token_accuracy": 0.980794683098793, "step": 1191 }, { "epoch": 6.973607038123167, - "grad_norm": 0.4406226699463579, + "grad_norm": 0.334569020236037, "learning_rate": 1.1792209177014317e-05, - "loss": 0.0625, - "mean_token_accuracy": 0.9820540770888329, + "loss": 0.0626, + "mean_token_accuracy": 0.9811923578381538, "step": 1192 }, { "epoch": 6.979472140762463, - "grad_norm": 0.4166350298425514, + "grad_norm": 0.349284009378184, "learning_rate": 1.1763982558123823e-05, - "loss": 0.0643, - "mean_token_accuracy": 0.9803269580006599, + "loss": 0.0673, + "mean_token_accuracy": 0.9803842082619667, "step": 1193 }, { "epoch": 6.9853372434017595, - "grad_norm": 0.5067924863320438, + "grad_norm": 0.48509516426421756, "learning_rate": 1.1735793091842583e-05, - "loss": 0.0673, - "mean_token_accuracy": 0.9796130433678627, + "loss": 0.0661, + "mean_token_accuracy": 0.9790410473942757, "step": 1194 }, { "epoch": 6.991202346041056, - "grad_norm": 0.5131689327156483, + "grad_norm": 0.6182624878074343, "learning_rate": 1.1707640880486975e-05, - "loss": 0.0786, - "mean_token_accuracy": 0.9753068685531616, + "loss": 0.083, + "mean_token_accuracy": 0.9752562269568443, "step": 1195 }, { "epoch": 6.997067448680352, - "grad_norm": 0.27953688453676967, + "grad_norm": 0.26860092970951027, "learning_rate": 1.1679526026238155e-05, - "loss": 0.0494, - "mean_token_accuracy": 0.9843882694840431, + "loss": 0.0506, + "mean_token_accuracy": 0.9840497225522995, "step": 1196 }, { "epoch": 7.0, - "grad_norm": 0.27953688453676967, + "grad_norm": 0.26860092970951027, "learning_rate": 1.165144863114169e-05, "loss": 0.0556, - "mean_token_accuracy": 0.9817256480455399, + "mean_token_accuracy": 0.9838097840547562, "step": 1197 }, { "epoch": 7.005865102639296, - "grad_norm": 0.6373555062592029, + "grad_norm": 0.543930531637957, "learning_rate": 1.1623408797107185e-05, - "loss": 0.064, - "mean_token_accuracy": 0.9785372838377953, + "loss": 0.068, + "mean_token_accuracy": 0.9765876084566116, "step": 1198 }, { "epoch": 7.011730205278592, - "grad_norm": 0.43666510254787677, + "grad_norm": 0.3599707466085984, "learning_rate": 1.1595406625907914e-05, - "loss": 0.0548, - "mean_token_accuracy": 0.9841993674635887, + "loss": 0.0596, + "mean_token_accuracy": 0.9841873943805695, "step": 1199 }, { "epoch": 7.0175953079178885, - "grad_norm": 0.36290262869575624, + "grad_norm": 0.7986070629261153, "learning_rate": 1.1567442219180446e-05, - "loss": 0.0528, - "mean_token_accuracy": 0.9837799668312073, + "loss": 0.0552, + "mean_token_accuracy": 0.9836561754345894, "step": 1200 }, { "epoch": 7.023460410557185, - "grad_norm": 0.3421833694866547, + "grad_norm": 0.2858257280488765, "learning_rate": 1.153951567842429e-05, - "loss": 0.0487, - "mean_token_accuracy": 0.9848127514123917, + "loss": 0.0511, + "mean_token_accuracy": 0.9851352721452713, "step": 1201 }, { "epoch": 7.029325513196481, - "grad_norm": 0.33545395972804354, + "grad_norm": 0.43882380033807117, "learning_rate": 1.1511627105001501e-05, - "loss": 0.0581, - "mean_token_accuracy": 0.9827064424753189, + "loss": 0.062, + "mean_token_accuracy": 0.9812534153461456, "step": 1202 }, { "epoch": 7.035190615835777, - "grad_norm": 0.45031796131371543, + "grad_norm": 0.32376660322769785, "learning_rate": 1.1483776600136344e-05, - "loss": 0.0636, - "mean_token_accuracy": 0.9784261807799339, + "loss": 0.0642, + "mean_token_accuracy": 0.9787701517343521, "step": 1203 }, { "epoch": 7.041055718475073, - "grad_norm": 0.43121710516783884, + "grad_norm": 0.2845445707837671, "learning_rate": 1.1455964264914906e-05, - "loss": 0.0512, - "mean_token_accuracy": 0.9828193038702011, + "loss": 0.0529, + "mean_token_accuracy": 0.9829073548316956, "step": 1204 }, { "epoch": 7.0469208211143695, - "grad_norm": 0.3947272877786806, + "grad_norm": 0.3581481876523457, "learning_rate": 1.142819020028472e-05, - "loss": 0.0632, - "mean_token_accuracy": 0.9800242558121681, + "loss": 0.0659, + "mean_token_accuracy": 0.9776208847761154, "step": 1205 }, { "epoch": 7.052785923753666, - "grad_norm": 0.4274474592379843, + "grad_norm": 0.31359656987227214, "learning_rate": 1.140045450705443e-05, - "loss": 0.0564, - "mean_token_accuracy": 0.9798077195882797, + "loss": 0.0594, + "mean_token_accuracy": 0.9799343273043633, "step": 1206 }, { "epoch": 7.058651026392962, - "grad_norm": 0.3834033075631306, + "grad_norm": 0.34541167440285503, "learning_rate": 1.13727572858934e-05, - "loss": 0.0511, - "mean_token_accuracy": 0.983852356672287, + "loss": 0.055, + "mean_token_accuracy": 0.9838607162237167, "step": 1207 }, { "epoch": 7.064516129032258, - "grad_norm": 0.3655976301540036, + "grad_norm": 0.2656366030140541, "learning_rate": 1.1345098637331356e-05, - "loss": 0.0508, - "mean_token_accuracy": 0.9838704839348793, + "loss": 0.0515, + "mean_token_accuracy": 0.9840966165065765, "step": 1208 }, { "epoch": 7.070381231671554, - "grad_norm": 0.5229786475813798, + "grad_norm": 0.3732026940010526, "learning_rate": 1.1317478661758022e-05, - "loss": 0.0712, - "mean_token_accuracy": 0.9760942086577415, + "loss": 0.0738, + "mean_token_accuracy": 0.9770562797784805, "step": 1209 }, { "epoch": 7.07624633431085, - "grad_norm": 0.4346420863510563, + "grad_norm": 0.3620841580037238, "learning_rate": 1.1289897459422756e-05, - "loss": 0.0536, - "mean_token_accuracy": 0.9831833392381668, + "loss": 0.053, + "mean_token_accuracy": 0.9833652973175049, "step": 1210 }, { "epoch": 7.0821114369501466, - "grad_norm": 0.5593461337456281, + "grad_norm": 0.3890273971843419, "learning_rate": 1.126235513043418e-05, - "loss": 0.0642, - "mean_token_accuracy": 0.9789699912071228, + "loss": 0.064, + "mean_token_accuracy": 0.9798462018370628, "step": 1211 }, { "epoch": 7.087976539589443, - "grad_norm": 0.43900643121666977, + "grad_norm": 0.40304516205749985, "learning_rate": 1.1234851774759828e-05, - "loss": 0.0524, - "mean_token_accuracy": 0.9849435314536095, + "loss": 0.0535, + "mean_token_accuracy": 0.9842343777418137, "step": 1212 }, { "epoch": 7.093841642228739, - "grad_norm": 0.34888430248322444, + "grad_norm": 0.2898298813640276, "learning_rate": 1.1207387492225772e-05, - "loss": 0.0545, - "mean_token_accuracy": 0.9824554324150085, + "loss": 0.0577, + "mean_token_accuracy": 0.9817237481474876, "step": 1213 }, { "epoch": 7.099706744868035, - "grad_norm": 0.3819215648450502, + "grad_norm": 0.346760418469883, "learning_rate": 1.1179962382516268e-05, - "loss": 0.0648, - "mean_token_accuracy": 0.9815644100308418, + "loss": 0.0677, + "mean_token_accuracy": 0.9793826043605804, "step": 1214 }, { "epoch": 7.105571847507331, - "grad_norm": 0.4018294483259087, + "grad_norm": 0.32802116910426204, "learning_rate": 1.1152576545173388e-05, - "loss": 0.0559, - "mean_token_accuracy": 0.9836894050240517, + "loss": 0.0571, + "mean_token_accuracy": 0.9839992597699165, "step": 1215 }, { "epoch": 7.1114369501466275, - "grad_norm": 0.3447430653572727, + "grad_norm": 0.33448701048483737, "learning_rate": 1.1125230079596654e-05, - "loss": 0.0511, - "mean_token_accuracy": 0.9840358719229698, + "loss": 0.0515, + "mean_token_accuracy": 0.9829333648085594, "step": 1216 }, { "epoch": 7.117302052785924, - "grad_norm": 0.4191025482663613, + "grad_norm": 0.47043879663050175, "learning_rate": 1.10979230850427e-05, - "loss": 0.0551, - "mean_token_accuracy": 0.9832866340875626, + "loss": 0.0567, + "mean_token_accuracy": 0.9834040552377701, "step": 1217 }, { "epoch": 7.12316715542522, - "grad_norm": 0.37400953837380885, + "grad_norm": 0.3169648164702154, "learning_rate": 1.1070655660624876e-05, - "loss": 0.0607, - "mean_token_accuracy": 0.980600893497467, + "loss": 0.0677, + "mean_token_accuracy": 0.9791783094406128, "step": 1218 }, { "epoch": 7.129032258064516, - "grad_norm": 0.37485779498534494, + "grad_norm": 0.42639076528279224, "learning_rate": 1.1043427905312933e-05, - "loss": 0.0632, - "mean_token_accuracy": 0.9809056371450424, + "loss": 0.0678, + "mean_token_accuracy": 0.9799103289842606, "step": 1219 }, { "epoch": 7.134897360703812, - "grad_norm": 0.37279266122405813, + "grad_norm": 0.41740113237596277, "learning_rate": 1.1016239917932618e-05, - "loss": 0.0572, - "mean_token_accuracy": 0.9826710894703865, + "loss": 0.0577, + "mean_token_accuracy": 0.9818514063954353, "step": 1220 }, { "epoch": 7.140762463343108, - "grad_norm": 0.3877743126318198, + "grad_norm": 0.2721497498601549, "learning_rate": 1.098909179716535e-05, - "loss": 0.0617, - "mean_token_accuracy": 0.9800911992788315, + "loss": 0.063, + "mean_token_accuracy": 0.979342058300972, "step": 1221 }, { "epoch": 7.146627565982405, - "grad_norm": 0.48036986394012565, + "grad_norm": 0.37368651106908646, "learning_rate": 1.096198364154784e-05, - "loss": 0.0602, - "mean_token_accuracy": 0.9828939959406853, + "loss": 0.0566, + "mean_token_accuracy": 0.9820699691772461, "step": 1222 }, { "epoch": 7.152492668621701, - "grad_norm": 0.3866461897489377, + "grad_norm": 0.46464039008643165, "learning_rate": 1.0934915549471747e-05, - "loss": 0.0526, - "mean_token_accuracy": 0.9831520467996597, + "loss": 0.0549, + "mean_token_accuracy": 0.9828799217939377, "step": 1223 }, { "epoch": 7.158357771260997, - "grad_norm": 0.3775525332017602, + "grad_norm": 0.24275403590533381, "learning_rate": 1.0907887619183308e-05, - "loss": 0.053, - "mean_token_accuracy": 0.9841544181108475, + "loss": 0.0531, + "mean_token_accuracy": 0.9846633151173592, "step": 1224 }, { "epoch": 7.164222873900293, - "grad_norm": 0.3836873587260523, + "grad_norm": 0.24404744608544118, "learning_rate": 1.0880899948783002e-05, - "loss": 0.0592, - "mean_token_accuracy": 0.9817483797669411, + "loss": 0.0593, + "mean_token_accuracy": 0.9814930409193039, "step": 1225 }, { "epoch": 7.170087976539589, - "grad_norm": 0.41100260358880575, + "grad_norm": 0.4054052415960616, "learning_rate": 1.0853952636225165e-05, - "loss": 0.0613, - "mean_token_accuracy": 0.9828031435608864, + "loss": 0.0665, + "mean_token_accuracy": 0.9808385893702507, "step": 1226 }, { "epoch": 7.1759530791788855, - "grad_norm": 0.4219853989912052, + "grad_norm": 0.4408475594615136, "learning_rate": 1.0827045779317662e-05, - "loss": 0.0543, - "mean_token_accuracy": 0.9837125688791275, + "loss": 0.0611, + "mean_token_accuracy": 0.9818901941180229, "step": 1227 }, { "epoch": 7.181818181818182, - "grad_norm": 0.43702873517895335, + "grad_norm": 0.3809276180859335, "learning_rate": 1.080017947572152e-05, - "loss": 0.0543, - "mean_token_accuracy": 0.9834098666906357, + "loss": 0.0563, + "mean_token_accuracy": 0.9834899082779884, "step": 1228 }, { "epoch": 7.187683284457478, - "grad_norm": 0.41016359666667, + "grad_norm": 0.23664871794200695, "learning_rate": 1.0773353822950563e-05, - "loss": 0.0609, - "mean_token_accuracy": 0.9809712171554565, + "loss": 0.0651, + "mean_token_accuracy": 0.9821924492716789, "step": 1229 }, { "epoch": 7.193548387096774, - "grad_norm": 0.49383511140721953, + "grad_norm": 0.39673119822118297, "learning_rate": 1.074656891837108e-05, - "loss": 0.0515, - "mean_token_accuracy": 0.9839422553777695, + "loss": 0.0547, + "mean_token_accuracy": 0.9831197336316109, "step": 1230 }, { "epoch": 7.19941348973607, - "grad_norm": 0.3956069878697063, + "grad_norm": 0.23952062843645364, "learning_rate": 1.0719824859201457e-05, - "loss": 0.058, - "mean_token_accuracy": 0.9819058403372765, + "loss": 0.0595, + "mean_token_accuracy": 0.9816852286458015, "step": 1231 }, { "epoch": 7.205278592375366, - "grad_norm": 0.4792058762225757, + "grad_norm": 0.4291529890704876, "learning_rate": 1.0693121742511828e-05, - "loss": 0.0668, - "mean_token_accuracy": 0.9765582084655762, + "loss": 0.0699, + "mean_token_accuracy": 0.9776555746793747, "step": 1232 }, { "epoch": 7.211143695014663, - "grad_norm": 0.4338995192160771, + "grad_norm": 0.3259261714222968, "learning_rate": 1.0666459665223718e-05, - "loss": 0.0593, - "mean_token_accuracy": 0.981715977191925, + "loss": 0.0615, + "mean_token_accuracy": 0.9814717844128609, "step": 1233 }, { "epoch": 7.217008797653959, - "grad_norm": 0.3855492769098808, + "grad_norm": 0.3219563587981844, "learning_rate": 1.0639838724109708e-05, - "loss": 0.0575, - "mean_token_accuracy": 0.9826395660638809, + "loss": 0.0606, + "mean_token_accuracy": 0.9811657071113586, "step": 1234 }, { "epoch": 7.222873900293255, - "grad_norm": 0.3437414453827899, + "grad_norm": 0.2887441757620735, "learning_rate": 1.0613259015793056e-05, - "loss": 0.0506, - "mean_token_accuracy": 0.9838635697960854, + "loss": 0.0526, + "mean_token_accuracy": 0.9813622236251831, "step": 1235 }, { "epoch": 7.228739002932551, - "grad_norm": 0.37946171096010817, + "grad_norm": 0.5016128124855813, "learning_rate": 1.0586720636747368e-05, - "loss": 0.0612, - "mean_token_accuracy": 0.9806694537401199, + "loss": 0.0614, + "mean_token_accuracy": 0.980540856719017, "step": 1236 }, { "epoch": 7.234604105571847, - "grad_norm": 0.38714008229821795, + "grad_norm": 0.32704366790322426, "learning_rate": 1.0560223683296244e-05, - "loss": 0.0565, - "mean_token_accuracy": 0.9813293144106865, + "loss": 0.0615, + "mean_token_accuracy": 0.979736678302288, "step": 1237 }, { "epoch": 7.2404692082111435, - "grad_norm": 0.40380335937441236, + "grad_norm": 0.24588497441247914, "learning_rate": 1.0533768251612924e-05, - "loss": 0.0631, - "mean_token_accuracy": 0.9799295514822006, + "loss": 0.0606, + "mean_token_accuracy": 0.9808222204446793, "step": 1238 }, { "epoch": 7.24633431085044, - "grad_norm": 0.4046737544797061, + "grad_norm": 0.3175117728124407, "learning_rate": 1.0507354437719938e-05, - "loss": 0.0512, - "mean_token_accuracy": 0.9842623844742775, + "loss": 0.0527, + "mean_token_accuracy": 0.9847884327173233, "step": 1239 }, { "epoch": 7.252199413489736, - "grad_norm": 0.3344455462825253, + "grad_norm": 0.3383051888288448, "learning_rate": 1.0480982337488768e-05, - "loss": 0.0553, - "mean_token_accuracy": 0.9825280457735062, + "loss": 0.057, + "mean_token_accuracy": 0.9824387729167938, "step": 1240 }, { "epoch": 7.258064516129032, - "grad_norm": 0.40570311112116275, + "grad_norm": 0.2760561608413442, "learning_rate": 1.0454652046639486e-05, - "loss": 0.0612, - "mean_token_accuracy": 0.9812857285141945, + "loss": 0.0627, + "mean_token_accuracy": 0.9802872315049171, "step": 1241 }, { "epoch": 7.263929618768328, - "grad_norm": 0.47975992267665485, + "grad_norm": 0.34101044843139294, "learning_rate": 1.0428363660740407e-05, - "loss": 0.0592, - "mean_token_accuracy": 0.9803736358880997, + "loss": 0.0621, + "mean_token_accuracy": 0.9803706407546997, "step": 1242 }, { "epoch": 7.269794721407624, - "grad_norm": 0.4288916334704027, + "grad_norm": 0.3854063802655829, "learning_rate": 1.0402117275207757e-05, - "loss": 0.0628, - "mean_token_accuracy": 0.980836883187294, + "loss": 0.0657, + "mean_token_accuracy": 0.9794178232550621, "step": 1243 }, { "epoch": 7.275659824046921, - "grad_norm": 0.4048834204911636, + "grad_norm": 0.3362556785057891, "learning_rate": 1.0375912985305319e-05, - "loss": 0.058, - "mean_token_accuracy": 0.9816715195775032, + "loss": 0.059, + "mean_token_accuracy": 0.9811040312051773, "step": 1244 }, { "epoch": 7.281524926686217, - "grad_norm": 0.33766750492275605, + "grad_norm": 0.24692741649753694, "learning_rate": 1.0349750886144077e-05, - "loss": 0.0565, - "mean_token_accuracy": 0.9801446571946144, + "loss": 0.0587, + "mean_token_accuracy": 0.980044037103653, "step": 1245 }, { "epoch": 7.287390029325513, - "grad_norm": 0.3553392017238559, + "grad_norm": 0.2366994852389697, "learning_rate": 1.0323631072681888e-05, - "loss": 0.0561, - "mean_token_accuracy": 0.982564315199852, + "loss": 0.058, + "mean_token_accuracy": 0.982071615755558, "step": 1246 }, { "epoch": 7.293255131964809, - "grad_norm": 0.383982297271317, + "grad_norm": 0.31542740583531625, "learning_rate": 1.0297553639723123e-05, - "loss": 0.0536, - "mean_token_accuracy": 0.9845903739333153, + "loss": 0.0533, + "mean_token_accuracy": 0.9839030206203461, "step": 1247 }, { "epoch": 7.299120234604105, - "grad_norm": 0.4060031394599371, + "grad_norm": 0.3706887349147297, "learning_rate": 1.027151868191834e-05, - "loss": 0.0609, - "mean_token_accuracy": 0.9797234684228897, + "loss": 0.0634, + "mean_token_accuracy": 0.9778928458690643, "step": 1248 }, { "epoch": 7.3049853372434015, - "grad_norm": 0.42793813736937725, + "grad_norm": 0.4809721901376831, "learning_rate": 1.0245526293763908e-05, - "loss": 0.0694, - "mean_token_accuracy": 0.9779497757554054, + "loss": 0.0692, + "mean_token_accuracy": 0.9787572473287582, "step": 1249 }, { "epoch": 7.310850439882698, - "grad_norm": 0.4167380663869755, + "grad_norm": 0.31650509706650065, "learning_rate": 1.0219576569601707e-05, - "loss": 0.0681, - "mean_token_accuracy": 0.9799009039998055, + "loss": 0.0714, + "mean_token_accuracy": 0.9798088297247887, "step": 1250 }, { "epoch": 7.316715542521994, - "grad_norm": 0.4377973731399899, + "grad_norm": 0.5798554299170097, "learning_rate": 1.0193669603618757e-05, - "loss": 0.0605, - "mean_token_accuracy": 0.9804759994149208, + "loss": 0.065, + "mean_token_accuracy": 0.9797441512346268, "step": 1251 }, { "epoch": 7.32258064516129, - "grad_norm": 0.313681809223758, + "grad_norm": 0.23068366144962413, "learning_rate": 1.0167805489846873e-05, - "loss": 0.0548, - "mean_token_accuracy": 0.9833528101444244, + "loss": 0.0597, + "mean_token_accuracy": 0.9825217947363853, "step": 1252 }, { "epoch": 7.328445747800586, - "grad_norm": 0.35044614156187415, + "grad_norm": 0.38274632853117607, "learning_rate": 1.0141984322162353e-05, - "loss": 0.0569, - "mean_token_accuracy": 0.9819622039794922, + "loss": 0.0563, + "mean_token_accuracy": 0.98308215290308, "step": 1253 }, { "epoch": 7.334310850439882, - "grad_norm": 0.37793729635473716, + "grad_norm": 0.2780363898982002, "learning_rate": 1.0116206194285598e-05, - "loss": 0.0604, - "mean_token_accuracy": 0.9819350242614746, + "loss": 0.0612, + "mean_token_accuracy": 0.9808923453092575, "step": 1254 }, { "epoch": 7.340175953079179, - "grad_norm": 0.40558850664867346, + "grad_norm": 0.43432483337721645, "learning_rate": 1.0090471199780812e-05, - "loss": 0.0635, - "mean_token_accuracy": 0.9799067080020905, + "loss": 0.0704, + "mean_token_accuracy": 0.9796313270926476, "step": 1255 }, { "epoch": 7.346041055718475, - "grad_norm": 0.3360663531138135, + "grad_norm": 0.44244848233473943, "learning_rate": 1.0064779432055616e-05, - "loss": 0.0586, - "mean_token_accuracy": 0.9792345017194748, + "loss": 0.0593, + "mean_token_accuracy": 0.9802607670426369, "step": 1256 }, { "epoch": 7.351906158357771, - "grad_norm": 0.37918955936120036, + "grad_norm": 0.25207004596265176, "learning_rate": 1.0039130984360761e-05, - "loss": 0.0584, - "mean_token_accuracy": 0.9800141528248787, + "loss": 0.0594, + "mean_token_accuracy": 0.9812619239091873, "step": 1257 }, { "epoch": 7.357771260997067, - "grad_norm": 0.4157661176569054, + "grad_norm": 0.2515940949933918, "learning_rate": 1.0013525949789745e-05, - "loss": 0.0593, - "mean_token_accuracy": 0.9802781492471695, + "loss": 0.0716, + "mean_token_accuracy": 0.9782749563455582, "step": 1258 }, { "epoch": 7.363636363636363, - "grad_norm": 0.45087026261043445, + "grad_norm": 0.46435768646393183, "learning_rate": 9.987964421278512e-06, - "loss": 0.0555, - "mean_token_accuracy": 0.9836331158876419, + "loss": 0.0551, + "mean_token_accuracy": 0.9833088368177414, "step": 1259 }, { "epoch": 7.3695014662756595, - "grad_norm": 0.3804640426965221, + "grad_norm": 0.26816012997514577, "learning_rate": 9.962446491605084e-06, - "loss": 0.0592, - "mean_token_accuracy": 0.978935495018959, + "loss": 0.0638, + "mean_token_accuracy": 0.9795294478535652, "step": 1260 }, { "epoch": 7.375366568914956, - "grad_norm": 0.46616629391754066, + "grad_norm": 0.3559008264487101, "learning_rate": 9.936972253389235e-06, - "loss": 0.0548, - "mean_token_accuracy": 0.9817307665944099, + "loss": 0.0581, + "mean_token_accuracy": 0.9806554466485977, "step": 1261 }, { "epoch": 7.381231671554252, - "grad_norm": 0.44707097128486495, + "grad_norm": 0.39486851602267986, "learning_rate": 9.911541799092162e-06, - "loss": 0.0618, - "mean_token_accuracy": 0.9799565002322197, + "loss": 0.0659, + "mean_token_accuracy": 0.9786085188388824, "step": 1262 }, { "epoch": 7.387096774193548, - "grad_norm": 0.3101720172363353, + "grad_norm": 0.335786358952341, "learning_rate": 9.88615522101615e-06, - "loss": 0.0537, - "mean_token_accuracy": 0.9818929210305214, + "loss": 0.0565, + "mean_token_accuracy": 0.9813254401087761, "step": 1263 }, { "epoch": 7.392961876832844, - "grad_norm": 0.3549119289206168, + "grad_norm": 0.2878142458194373, "learning_rate": 9.860812611304225e-06, - "loss": 0.0531, - "mean_token_accuracy": 0.9823939129710197, + "loss": 0.0571, + "mean_token_accuracy": 0.9820192754268646, "step": 1264 }, { "epoch": 7.39882697947214, - "grad_norm": 0.32942986436762367, + "grad_norm": 0.3408757371760065, "learning_rate": 9.835514061939822e-06, - "loss": 0.0507, - "mean_token_accuracy": 0.9844275042414665, + "loss": 0.0545, + "mean_token_accuracy": 0.9838089644908905, "step": 1265 }, { "epoch": 7.404692082111437, - "grad_norm": 0.36175422344149233, + "grad_norm": 0.24216134788014795, "learning_rate": 9.810259664746454e-06, - "loss": 0.0576, - "mean_token_accuracy": 0.9809439033269882, + "loss": 0.0607, + "mean_token_accuracy": 0.9823042750358582, "step": 1266 }, { "epoch": 7.410557184750733, - "grad_norm": 0.37478557515730054, + "grad_norm": 0.43774220429405847, "learning_rate": 9.785049511387383e-06, - "loss": 0.0552, - "mean_token_accuracy": 0.9803262427449226, + "loss": 0.0569, + "mean_token_accuracy": 0.9796285480260849, "step": 1267 }, { "epoch": 7.416422287390029, - "grad_norm": 0.44581561472846937, + "grad_norm": 0.3108898649235883, "learning_rate": 9.759883693365287e-06, - "loss": 0.0642, - "mean_token_accuracy": 0.9802709370851517, + "loss": 0.067, + "mean_token_accuracy": 0.9790755361318588, "step": 1268 }, { "epoch": 7.422287390029325, - "grad_norm": 0.41500145999728155, + "grad_norm": 0.45200323631058287, "learning_rate": 9.734762302021923e-06, - "loss": 0.0484, - "mean_token_accuracy": 0.9843206256628036, + "loss": 0.05, + "mean_token_accuracy": 0.9834395200014114, "step": 1269 }, { "epoch": 7.428152492668621, - "grad_norm": 0.3840956000039933, + "grad_norm": 0.3597198573653463, "learning_rate": 9.709685428537794e-06, - "loss": 0.056, - "mean_token_accuracy": 0.9839591979980469, + "loss": 0.0556, + "mean_token_accuracy": 0.9842453375458717, "step": 1270 }, { "epoch": 7.4340175953079175, - "grad_norm": 0.4723303673366163, + "grad_norm": 0.3091752948831815, "learning_rate": 9.684653163931823e-06, - "loss": 0.063, - "mean_token_accuracy": 0.9816402345895767, + "loss": 0.0634, + "mean_token_accuracy": 0.9818849861621857, "step": 1271 }, { "epoch": 7.439882697947214, - "grad_norm": 0.4899840550170524, + "grad_norm": 0.28491665560399426, "learning_rate": 9.659665599061019e-06, - "loss": 0.068, - "mean_token_accuracy": 0.9774189367890358, + "loss": 0.0646, + "mean_token_accuracy": 0.9791956692934036, "step": 1272 }, { "epoch": 7.44574780058651, - "grad_norm": 0.3857766522115467, + "grad_norm": 0.3585720040589971, "learning_rate": 9.634722824620154e-06, - "loss": 0.0545, - "mean_token_accuracy": 0.9810511991381645, + "loss": 0.0546, + "mean_token_accuracy": 0.9820611029863358, "step": 1273 }, { "epoch": 7.451612903225806, - "grad_norm": 0.3758751737714007, + "grad_norm": 0.22510237073905628, "learning_rate": 9.609824931141423e-06, - "loss": 0.0548, - "mean_token_accuracy": 0.9822421818971634, + "loss": 0.0542, + "mean_token_accuracy": 0.9816134124994278, "step": 1274 }, { "epoch": 7.457478005865102, - "grad_norm": 0.40199226438816, + "grad_norm": 0.2976725521857549, "learning_rate": 9.584972008994123e-06, - "loss": 0.0565, - "mean_token_accuracy": 0.9830095544457436, + "loss": 0.0608, + "mean_token_accuracy": 0.9823539853096008, "step": 1275 }, { "epoch": 7.463343108504398, - "grad_norm": 0.41133104233326856, + "grad_norm": 0.381204550554821, "learning_rate": 9.560164148384328e-06, - "loss": 0.0679, - "mean_token_accuracy": 0.9806720837950706, + "loss": 0.0706, + "mean_token_accuracy": 0.9788034185767174, "step": 1276 }, { "epoch": 7.469208211143695, - "grad_norm": 0.3711841671942756, + "grad_norm": 0.3902598224526612, "learning_rate": 9.53540143935455e-06, - "loss": 0.0603, - "mean_token_accuracy": 0.979490227997303, + "loss": 0.0625, + "mean_token_accuracy": 0.9800604283809662, "step": 1277 }, { "epoch": 7.475073313782991, - "grad_norm": 0.4353724058654228, + "grad_norm": 0.32985512212133755, "learning_rate": 9.510683971783425e-06, - "loss": 0.072, - "mean_token_accuracy": 0.9790943786501884, + "loss": 0.0772, + "mean_token_accuracy": 0.9780596867203712, "step": 1278 }, { "epoch": 7.480938416422287, - "grad_norm": 0.3262040671207362, + "grad_norm": 0.30612003390693643, "learning_rate": 9.486011835385372e-06, - "loss": 0.0432, - "mean_token_accuracy": 0.9866240471601486, + "loss": 0.0444, + "mean_token_accuracy": 0.9867977499961853, "step": 1279 }, { "epoch": 7.486803519061583, - "grad_norm": 0.40084758967083595, + "grad_norm": 0.23145050100573172, "learning_rate": 9.461385119710282e-06, - "loss": 0.0649, - "mean_token_accuracy": 0.9794245511293411, + "loss": 0.0646, + "mean_token_accuracy": 0.9781039953231812, "step": 1280 }, { "epoch": 7.492668621700879, - "grad_norm": 0.37179990528957857, + "grad_norm": 0.3258792257767987, "learning_rate": 9.436803914143189e-06, - "loss": 0.0649, - "mean_token_accuracy": 0.978649728000164, + "loss": 0.0682, + "mean_token_accuracy": 0.9785692542791367, "step": 1281 }, { "epoch": 7.4985337243401755, - "grad_norm": 0.4144971947017167, + "grad_norm": 0.3648664332592761, "learning_rate": 9.41226830790394e-06, - "loss": 0.0535, - "mean_token_accuracy": 0.9819266125559807, + "loss": 0.0572, + "mean_token_accuracy": 0.9803744629025459, "step": 1282 }, { "epoch": 7.504398826979472, - "grad_norm": 0.41089012248038864, + "grad_norm": 0.3949674277365223, "learning_rate": 9.387778390046881e-06, - "loss": 0.0567, - "mean_token_accuracy": 0.9813234284520149, + "loss": 0.0601, + "mean_token_accuracy": 0.9812920317053795, "step": 1283 }, { "epoch": 7.510263929618768, - "grad_norm": 0.3299345875181496, + "grad_norm": 0.41069422084481105, "learning_rate": 9.363334249460519e-06, - "loss": 0.0559, - "mean_token_accuracy": 0.9827776104211807, + "loss": 0.06, + "mean_token_accuracy": 0.9833412170410156, "step": 1284 }, { "epoch": 7.516129032258064, - "grad_norm": 0.3343562585006663, + "grad_norm": 0.3014214279503322, "learning_rate": 9.338935974867213e-06, - "loss": 0.0558, - "mean_token_accuracy": 0.9819636717438698, + "loss": 0.0626, + "mean_token_accuracy": 0.9807936921715736, "step": 1285 }, { "epoch": 7.52199413489736, - "grad_norm": 0.4643635750642076, + "grad_norm": 0.33124061212021144, "learning_rate": 9.314583654822844e-06, - "loss": 0.0632, - "mean_token_accuracy": 0.9794372394680977, + "loss": 0.062, + "mean_token_accuracy": 0.9802020415663719, "step": 1286 }, { "epoch": 7.527859237536656, - "grad_norm": 0.4244359984781323, + "grad_norm": 0.29393245792581324, "learning_rate": 9.290277377716503e-06, - "loss": 0.066, - "mean_token_accuracy": 0.9777902364730835, + "loss": 0.0653, + "mean_token_accuracy": 0.9785585701465607, "step": 1287 }, { "epoch": 7.533724340175953, - "grad_norm": 0.3949368748032798, + "grad_norm": 0.27560201878621987, "learning_rate": 9.266017231770155e-06, - "loss": 0.0524, - "mean_token_accuracy": 0.9815265461802483, + "loss": 0.0534, + "mean_token_accuracy": 0.9817678108811378, "step": 1288 }, { "epoch": 7.539589442815249, - "grad_norm": 0.37548934551322455, + "grad_norm": 0.4356751897841947, "learning_rate": 9.241803305038333e-06, - "loss": 0.062, - "mean_token_accuracy": 0.980858251452446, + "loss": 0.0621, + "mean_token_accuracy": 0.9805284291505814, "step": 1289 }, { "epoch": 7.545454545454545, - "grad_norm": 0.38084864607631785, + "grad_norm": 0.3108842343684137, "learning_rate": 9.217635685407813e-06, - "loss": 0.057, - "mean_token_accuracy": 0.9823618158698082, + "loss": 0.0592, + "mean_token_accuracy": 0.9813688769936562, "step": 1290 }, { "epoch": 7.551319648093841, - "grad_norm": 0.3614666166755709, + "grad_norm": 0.23658671346849064, "learning_rate": 9.19351446059729e-06, - "loss": 0.0509, - "mean_token_accuracy": 0.9841725453734398, + "loss": 0.0518, + "mean_token_accuracy": 0.9847207292914391, "step": 1291 }, { "epoch": 7.557184750733137, - "grad_norm": 0.32334267634784053, + "grad_norm": 0.31455427308213524, "learning_rate": 9.16943971815708e-06, - "loss": 0.0547, - "mean_token_accuracy": 0.9836216494441032, + "loss": 0.0584, + "mean_token_accuracy": 0.9820144027471542, "step": 1292 }, { "epoch": 7.563049853372434, - "grad_norm": 0.3549020438357482, + "grad_norm": 0.31471404836881833, "learning_rate": 9.145411545468756e-06, - "loss": 0.0537, - "mean_token_accuracy": 0.9822108149528503, + "loss": 0.0522, + "mean_token_accuracy": 0.981374055147171, "step": 1293 }, { "epoch": 7.568914956011731, - "grad_norm": 0.414896679893959, + "grad_norm": 0.49747244115547473, "learning_rate": 9.121430029744893e-06, - "loss": 0.0551, - "mean_token_accuracy": 0.9838257804512978, + "loss": 0.0573, + "mean_token_accuracy": 0.982080303132534, "step": 1294 }, { "epoch": 7.574780058651027, - "grad_norm": 0.4664630449633319, + "grad_norm": 0.4291736011020313, "learning_rate": 9.097495258028703e-06, - "loss": 0.0613, - "mean_token_accuracy": 0.9797477498650551, + "loss": 0.0622, + "mean_token_accuracy": 0.9802661165595055, "step": 1295 }, { "epoch": 7.580645161290323, - "grad_norm": 0.3624415444639565, + "grad_norm": 0.3477579573197191, "learning_rate": 9.073607317193742e-06, - "loss": 0.0513, - "mean_token_accuracy": 0.9819745272397995, + "loss": 0.0541, + "mean_token_accuracy": 0.9810939654707909, "step": 1296 }, { "epoch": 7.586510263929619, - "grad_norm": 0.362922738638366, + "grad_norm": 0.24442389845319185, "learning_rate": 9.049766293943589e-06, - "loss": 0.0573, - "mean_token_accuracy": 0.9807760417461395, + "loss": 0.0638, + "mean_token_accuracy": 0.980124719440937, "step": 1297 }, { "epoch": 7.592375366568915, - "grad_norm": 0.4243136296793758, + "grad_norm": 0.45274589019511546, "learning_rate": 9.025972274811527e-06, - "loss": 0.0557, - "mean_token_accuracy": 0.9818306267261505, + "loss": 0.0575, + "mean_token_accuracy": 0.9824511334300041, "step": 1298 }, { "epoch": 7.5982404692082115, - "grad_norm": 0.4425658209939251, + "grad_norm": 0.38649873217195, "learning_rate": 9.002225346160238e-06, - "loss": 0.0577, - "mean_token_accuracy": 0.9808589443564415, + "loss": 0.0617, + "mean_token_accuracy": 0.9798657670617104, "step": 1299 }, { "epoch": 7.604105571847508, - "grad_norm": 0.3514744376386846, + "grad_norm": 0.37509806099205384, "learning_rate": 8.97852559418148e-06, - "loss": 0.0531, - "mean_token_accuracy": 0.9827908128499985, + "loss": 0.0557, + "mean_token_accuracy": 0.9814462289214134, "step": 1300 }, { "epoch": 7.609970674486804, - "grad_norm": 0.33745243601412556, + "grad_norm": 0.2609647266083045, "learning_rate": 8.954873104895787e-06, - "loss": 0.0538, - "mean_token_accuracy": 0.9833011403679848, + "loss": 0.0545, + "mean_token_accuracy": 0.9840743094682693, "step": 1301 }, { "epoch": 7.6158357771261, - "grad_norm": 0.3333868665559321, + "grad_norm": 0.2692253345151723, "learning_rate": 8.931267964152132e-06, - "loss": 0.0573, - "mean_token_accuracy": 0.980949267745018, + "loss": 0.0607, + "mean_token_accuracy": 0.9792167991399765, "step": 1302 }, { "epoch": 7.621700879765396, - "grad_norm": 0.39314271223090547, + "grad_norm": 0.29179020681580975, "learning_rate": 8.907710257627651e-06, - "loss": 0.0574, - "mean_token_accuracy": 0.9824113622307777, + "loss": 0.0564, + "mean_token_accuracy": 0.9830718636512756, "step": 1303 }, { "epoch": 7.627565982404692, - "grad_norm": 0.3388898421743972, + "grad_norm": 0.25775769089056455, "learning_rate": 8.884200070827303e-06, - "loss": 0.0554, - "mean_token_accuracy": 0.9804334491491318, + "loss": 0.0563, + "mean_token_accuracy": 0.9800078570842743, "step": 1304 }, { "epoch": 7.633431085043989, - "grad_norm": 0.37499471768779674, + "grad_norm": 0.4126676449382726, "learning_rate": 8.86073748908357e-06, - "loss": 0.0552, - "mean_token_accuracy": 0.981963500380516, + "loss": 0.0557, + "mean_token_accuracy": 0.9821161031723022, "step": 1305 }, { "epoch": 7.639296187683285, - "grad_norm": 0.38935220042170704, + "grad_norm": 0.29909785778360326, "learning_rate": 8.837322597556146e-06, - "loss": 0.0546, - "mean_token_accuracy": 0.9819600731134415, + "loss": 0.059, + "mean_token_accuracy": 0.9825484231114388, "step": 1306 }, { "epoch": 7.645161290322581, - "grad_norm": 0.4422890162616974, + "grad_norm": 0.41894554694492725, "learning_rate": 8.813955481231633e-06, - "loss": 0.0642, - "mean_token_accuracy": 0.9796778559684753, + "loss": 0.0706, + "mean_token_accuracy": 0.9785730615258217, "step": 1307 }, { "epoch": 7.651026392961877, - "grad_norm": 0.35255417861474725, + "grad_norm": 0.2968772340024343, "learning_rate": 8.790636224923221e-06, - "loss": 0.058, - "mean_token_accuracy": 0.9814234897494316, + "loss": 0.0602, + "mean_token_accuracy": 0.9801525101065636, "step": 1308 }, { "epoch": 7.656891495601173, - "grad_norm": 0.39985957065533995, + "grad_norm": 0.3409136197791452, "learning_rate": 8.767364913270399e-06, - "loss": 0.0592, - "mean_token_accuracy": 0.981181763112545, + "loss": 0.0607, + "mean_token_accuracy": 0.9808881357312202, "step": 1309 }, { "epoch": 7.6627565982404695, - "grad_norm": 0.4134732505479203, + "grad_norm": 0.243416715563901, "learning_rate": 8.744141630738624e-06, - "loss": 0.0681, - "mean_token_accuracy": 0.9794332608580589, + "loss": 0.0629, + "mean_token_accuracy": 0.9808500409126282, "step": 1310 }, { "epoch": 7.668621700879766, - "grad_norm": 0.4356893955326623, + "grad_norm": 0.33758735751589425, "learning_rate": 8.720966461619038e-06, - "loss": 0.0617, - "mean_token_accuracy": 0.9818313270807266, + "loss": 0.0707, + "mean_token_accuracy": 0.9802355766296387, "step": 1311 }, { "epoch": 7.674486803519062, - "grad_norm": 0.3989989585543621, + "grad_norm": 0.4117699771693344, "learning_rate": 8.69783949002814e-06, - "loss": 0.054, - "mean_token_accuracy": 0.9826485440135002, + "loss": 0.0572, + "mean_token_accuracy": 0.9822903126478195, "step": 1312 }, { "epoch": 7.680351906158358, - "grad_norm": 0.3359307886658749, + "grad_norm": 0.331202089534723, "learning_rate": 8.6747607999075e-06, - "loss": 0.0478, - "mean_token_accuracy": 0.9843815788626671, + "loss": 0.0482, + "mean_token_accuracy": 0.984705850481987, "step": 1313 }, { "epoch": 7.686217008797654, - "grad_norm": 0.3668758276797067, + "grad_norm": 0.2749224689840326, "learning_rate": 8.651730475023435e-06, - "loss": 0.0621, - "mean_token_accuracy": 0.9808582216501236, + "loss": 0.0655, + "mean_token_accuracy": 0.9800895005464554, "step": 1314 }, { "epoch": 7.69208211143695, - "grad_norm": 0.3993334587891021, + "grad_norm": 0.37140200419744357, "learning_rate": 8.628748598966739e-06, - "loss": 0.0564, - "mean_token_accuracy": 0.9802764654159546, + "loss": 0.0567, + "mean_token_accuracy": 0.9798331558704376, "step": 1315 }, { "epoch": 7.697947214076247, - "grad_norm": 0.4545815190595137, + "grad_norm": 0.3956036602768011, "learning_rate": 8.605815255152323e-06, - "loss": 0.0635, - "mean_token_accuracy": 0.9794041439890862, + "loss": 0.0665, + "mean_token_accuracy": 0.9770996049046516, "step": 1316 }, { "epoch": 7.703812316715543, - "grad_norm": 0.4487036384939457, + "grad_norm": 0.3731369409155563, "learning_rate": 8.582930526818973e-06, - "loss": 0.0621, - "mean_token_accuracy": 0.9806603714823723, + "loss": 0.0667, + "mean_token_accuracy": 0.979367084801197, "step": 1317 }, { "epoch": 7.709677419354839, - "grad_norm": 0.4666847229368787, + "grad_norm": 0.35123845927188907, "learning_rate": 8.560094497029008e-06, - "loss": 0.0586, - "mean_token_accuracy": 0.9819168671965599, + "loss": 0.061, + "mean_token_accuracy": 0.9824322983622551, "step": 1318 }, { "epoch": 7.715542521994135, - "grad_norm": 0.40195668204742174, + "grad_norm": 0.3332715311648278, "learning_rate": 8.537307248667992e-06, - "loss": 0.0545, - "mean_token_accuracy": 0.9826847463846207, + "loss": 0.0605, + "mean_token_accuracy": 0.9805798158049583, "step": 1319 }, { "epoch": 7.721407624633431, - "grad_norm": 0.3985988940480605, + "grad_norm": 0.2682497870558032, "learning_rate": 8.514568864444432e-06, - "loss": 0.0659, - "mean_token_accuracy": 0.9786344021558762, + "loss": 0.0667, + "mean_token_accuracy": 0.9799603000283241, "step": 1320 }, { "epoch": 7.7272727272727275, - "grad_norm": 0.3803942159665741, + "grad_norm": 0.26843962718932374, "learning_rate": 8.491879426889483e-06, - "loss": 0.0524, - "mean_token_accuracy": 0.9822396486997604, + "loss": 0.0555, + "mean_token_accuracy": 0.9824989885091782, "step": 1321 }, { "epoch": 7.733137829912024, - "grad_norm": 0.4314813792648232, + "grad_norm": 0.3124980756288478, "learning_rate": 8.469239018356636e-06, - "loss": 0.0669, - "mean_token_accuracy": 0.9790932461619377, + "loss": 0.0645, + "mean_token_accuracy": 0.9792848080396652, "step": 1322 }, { "epoch": 7.73900293255132, - "grad_norm": 0.4256794549163884, + "grad_norm": 0.2547941571233965, "learning_rate": 8.446647721021435e-06, - "loss": 0.0704, - "mean_token_accuracy": 0.9789668694138527, + "loss": 0.0705, + "mean_token_accuracy": 0.9791503623127937, "step": 1323 }, { "epoch": 7.744868035190616, - "grad_norm": 0.4015989362398405, + "grad_norm": 0.2767295357262834, "learning_rate": 8.424105616881161e-06, - "loss": 0.0559, - "mean_token_accuracy": 0.9835130125284195, + "loss": 0.0598, + "mean_token_accuracy": 0.9829093441367149, "step": 1324 }, { "epoch": 7.750733137829912, - "grad_norm": 0.3748051299880915, + "grad_norm": 0.4218056274326058, "learning_rate": 8.40161278775455e-06, - "loss": 0.059, - "mean_token_accuracy": 0.9828163757920265, + "loss": 0.0611, + "mean_token_accuracy": 0.9812097251415253, "step": 1325 }, { "epoch": 7.756598240469208, - "grad_norm": 0.4175935071524493, + "grad_norm": 0.44385073200436737, "learning_rate": 8.379169315281485e-06, - "loss": 0.0638, - "mean_token_accuracy": 0.9799411669373512, + "loss": 0.0626, + "mean_token_accuracy": 0.9803273901343346, "step": 1326 }, { "epoch": 7.762463343108505, - "grad_norm": 0.38437401738180915, + "grad_norm": 0.23419463894081147, "learning_rate": 8.356775280922708e-06, - "loss": 0.065, - "mean_token_accuracy": 0.9807204306125641, + "loss": 0.0663, + "mean_token_accuracy": 0.9805652052164078, "step": 1327 }, { "epoch": 7.768328445747801, - "grad_norm": 0.3427453782640103, + "grad_norm": 0.4823100451315824, "learning_rate": 8.334430765959522e-06, - "loss": 0.0617, - "mean_token_accuracy": 0.97793348133564, + "loss": 0.0657, + "mean_token_accuracy": 0.9788806363940239, "step": 1328 }, { "epoch": 7.774193548387097, - "grad_norm": 0.36726795137742246, + "grad_norm": 0.3331813586088315, "learning_rate": 8.312135851493494e-06, - "loss": 0.0625, - "mean_token_accuracy": 0.9796304106712341, + "loss": 0.068, + "mean_token_accuracy": 0.9783801510930061, "step": 1329 }, { "epoch": 7.780058651026393, - "grad_norm": 0.33690389206076704, + "grad_norm": 0.37329187672139424, "learning_rate": 8.28989061844615e-06, - "loss": 0.0493, - "mean_token_accuracy": 0.9848483875393867, + "loss": 0.0512, + "mean_token_accuracy": 0.9843535274267197, "step": 1330 }, { "epoch": 7.785923753665689, - "grad_norm": 0.3730528248631496, + "grad_norm": 0.35298920655862964, "learning_rate": 8.267695147558705e-06, - "loss": 0.0664, - "mean_token_accuracy": 0.9790797233581543, + "loss": 0.0666, + "mean_token_accuracy": 0.9796379655599594, "step": 1331 }, { "epoch": 7.7917888563049855, - "grad_norm": 0.4445863650560794, + "grad_norm": 0.3039385846017442, "learning_rate": 8.245549519391758e-06, - "loss": 0.0618, - "mean_token_accuracy": 0.9811621233820915, + "loss": 0.0658, + "mean_token_accuracy": 0.9795048087835312, "step": 1332 }, { "epoch": 7.797653958944282, - "grad_norm": 0.3886988477025664, + "grad_norm": 0.2917560128374259, "learning_rate": 8.22345381432499e-06, - "loss": 0.0612, - "mean_token_accuracy": 0.9816948473453522, + "loss": 0.0619, + "mean_token_accuracy": 0.9815476760268211, "step": 1333 }, { "epoch": 7.803519061583578, - "grad_norm": 0.34021133386988633, + "grad_norm": 0.2569382185033971, "learning_rate": 8.201408112556893e-06, - "loss": 0.0573, - "mean_token_accuracy": 0.9828803986310959, + "loss": 0.0583, + "mean_token_accuracy": 0.9816262051463127, "step": 1334 }, { "epoch": 7.809384164222874, - "grad_norm": 0.3722898209413316, + "grad_norm": 0.2582699795899447, "learning_rate": 8.179412494104457e-06, - "loss": 0.059, - "mean_token_accuracy": 0.9795228019356728, + "loss": 0.0604, + "mean_token_accuracy": 0.9799763336777687, "step": 1335 }, { "epoch": 7.81524926686217, - "grad_norm": 0.3530389348112938, + "grad_norm": 0.33387844068648576, "learning_rate": 8.15746703880289e-06, - "loss": 0.058, - "mean_token_accuracy": 0.981398917734623, + "loss": 0.0593, + "mean_token_accuracy": 0.9802704155445099, "step": 1336 }, { "epoch": 7.821114369501466, - "grad_norm": 0.35299669138074313, + "grad_norm": 0.2281458888766081, "learning_rate": 8.135571826305339e-06, - "loss": 0.0524, - "mean_token_accuracy": 0.9821040034294128, + "loss": 0.0534, + "mean_token_accuracy": 0.9816608652472496, "step": 1337 }, { "epoch": 7.826979472140763, - "grad_norm": 0.3591917463620187, + "grad_norm": 0.2814723881376163, "learning_rate": 8.113726936082576e-06, - "loss": 0.0655, - "mean_token_accuracy": 0.9790498167276382, + "loss": 0.066, + "mean_token_accuracy": 0.9795441180467606, "step": 1338 }, { "epoch": 7.832844574780059, - "grad_norm": 0.5220426396663527, + "grad_norm": 0.3449545305260099, "learning_rate": 8.091932447422737e-06, - "loss": 0.0617, - "mean_token_accuracy": 0.978603184223175, + "loss": 0.0622, + "mean_token_accuracy": 0.9795658588409424, "step": 1339 }, { "epoch": 7.838709677419355, - "grad_norm": 0.34756347572701596, + "grad_norm": 0.25471326027068014, "learning_rate": 8.070188439431005e-06, - "loss": 0.0575, - "mean_token_accuracy": 0.9806881099939346, + "loss": 0.0572, + "mean_token_accuracy": 0.9819264709949493, "step": 1340 }, { "epoch": 7.844574780058651, - "grad_norm": 0.41402266812479505, + "grad_norm": 0.2074367412183043, "learning_rate": 8.048494991029352e-06, - "loss": 0.052, - "mean_token_accuracy": 0.9825706034898758, + "loss": 0.0561, + "mean_token_accuracy": 0.9814460799098015, "step": 1341 }, { "epoch": 7.850439882697947, - "grad_norm": 0.34986065635759345, + "grad_norm": 0.40755999120243874, "learning_rate": 8.02685218095624e-06, - "loss": 0.0576, - "mean_token_accuracy": 0.9828275814652443, + "loss": 0.0627, + "mean_token_accuracy": 0.9809698536992073, "step": 1342 }, { "epoch": 7.8563049853372435, - "grad_norm": 0.35008434025300506, + "grad_norm": 0.3097454344053614, "learning_rate": 8.005260087766318e-06, - "loss": 0.0619, - "mean_token_accuracy": 0.9789273515343666, + "loss": 0.0639, + "mean_token_accuracy": 0.9798981621861458, "step": 1343 }, { "epoch": 7.86217008797654, - "grad_norm": 0.3628035072463874, + "grad_norm": 0.33298759837101477, "learning_rate": 7.983718789830167e-06, - "loss": 0.061, - "mean_token_accuracy": 0.9795840755105019, + "loss": 0.0627, + "mean_token_accuracy": 0.9803447499871254, "step": 1344 }, { "epoch": 7.868035190615836, - "grad_norm": 0.4052159012085872, + "grad_norm": 0.32489000595638734, "learning_rate": 7.962228365333999e-06, - "loss": 0.064, - "mean_token_accuracy": 0.9813359454274178, + "loss": 0.0649, + "mean_token_accuracy": 0.9803356230258942, "step": 1345 }, { "epoch": 7.873900293255132, - "grad_norm": 0.3695739612140032, + "grad_norm": 0.2251061914600319, "learning_rate": 7.940788892279375e-06, - "loss": 0.0616, - "mean_token_accuracy": 0.9815887585282326, + "loss": 0.0631, + "mean_token_accuracy": 0.981585755944252, "step": 1346 }, { "epoch": 7.879765395894428, - "grad_norm": 0.36628043012480255, + "grad_norm": 0.23969743573221897, "learning_rate": 7.919400448482928e-06, - "loss": 0.0577, - "mean_token_accuracy": 0.9804845973849297, + "loss": 0.059, + "mean_token_accuracy": 0.9801178500056267, "step": 1347 }, { "epoch": 7.885630498533724, - "grad_norm": 0.3773279787626722, + "grad_norm": 0.3011598459943263, "learning_rate": 7.898063111576066e-06, - "loss": 0.06, - "mean_token_accuracy": 0.9801003411412239, + "loss": 0.0593, + "mean_token_accuracy": 0.9817268326878548, "step": 1348 }, { "epoch": 7.891495601173021, - "grad_norm": 0.387347408588769, + "grad_norm": 0.22688728416225185, "learning_rate": 7.876776959004706e-06, - "loss": 0.0705, - "mean_token_accuracy": 0.976474940776825, + "loss": 0.0746, + "mean_token_accuracy": 0.9755787774920464, "step": 1349 }, { "epoch": 7.897360703812317, - "grad_norm": 0.3600666347332053, + "grad_norm": 0.29581805400389327, "learning_rate": 7.855542068028981e-06, - "loss": 0.0525, - "mean_token_accuracy": 0.9814046397805214, + "loss": 0.0549, + "mean_token_accuracy": 0.9815276637673378, "step": 1350 }, { "epoch": 7.903225806451613, - "grad_norm": 0.3168307759527497, + "grad_norm": 0.3134630356485751, "learning_rate": 7.834358515722977e-06, - "loss": 0.0558, - "mean_token_accuracy": 0.9831864088773727, + "loss": 0.0571, + "mean_token_accuracy": 0.9830066785216331, "step": 1351 }, { "epoch": 7.909090909090909, - "grad_norm": 0.34381906155090736, + "grad_norm": 0.31023619130265473, "learning_rate": 7.813226378974427e-06, - "loss": 0.0603, - "mean_token_accuracy": 0.9802919253706932, + "loss": 0.0613, + "mean_token_accuracy": 0.9804256185889244, "step": 1352 }, { "epoch": 7.914956011730205, - "grad_norm": 0.38700412978550913, + "grad_norm": 0.3167269601942049, "learning_rate": 7.792145734484455e-06, - "loss": 0.0575, - "mean_token_accuracy": 0.9791939035058022, + "loss": 0.0583, + "mean_token_accuracy": 0.9800725728273392, "step": 1353 }, { "epoch": 7.9208211143695015, - "grad_norm": 0.3526459336872874, + "grad_norm": 0.3579728764570681, "learning_rate": 7.771116658767286e-06, - "loss": 0.0627, - "mean_token_accuracy": 0.9802243933081627, + "loss": 0.0673, + "mean_token_accuracy": 0.9805974140763283, "step": 1354 }, { "epoch": 7.926686217008798, - "grad_norm": 0.399072301995629, + "grad_norm": 0.28576336706359023, "learning_rate": 7.750139228149978e-06, - "loss": 0.0658, - "mean_token_accuracy": 0.9786151126027107, + "loss": 0.069, + "mean_token_accuracy": 0.9772385880351067, "step": 1355 }, { "epoch": 7.932551319648094, - "grad_norm": 0.44580675695069405, + "grad_norm": 0.3127217642738492, "learning_rate": 7.729213518772121e-06, - "loss": 0.0558, - "mean_token_accuracy": 0.9831471219658852, + "loss": 0.0566, + "mean_token_accuracy": 0.9827322363853455, "step": 1356 }, { "epoch": 7.93841642228739, - "grad_norm": 0.38551496553122244, + "grad_norm": 0.3143837781915121, "learning_rate": 7.708339606585591e-06, - "loss": 0.0603, - "mean_token_accuracy": 0.9787757843732834, + "loss": 0.0623, + "mean_token_accuracy": 0.9770531207323074, "step": 1357 }, { "epoch": 7.944281524926686, - "grad_norm": 0.3707055933975031, + "grad_norm": 0.3220229393336074, "learning_rate": 7.687517567354266e-06, - "loss": 0.0679, - "mean_token_accuracy": 0.9794500693678856, + "loss": 0.0747, + "mean_token_accuracy": 0.9770351573824883, "step": 1358 }, { "epoch": 7.9501466275659824, - "grad_norm": 0.3770154187455032, + "grad_norm": 0.425406819172546, "learning_rate": 7.66674747665373e-06, - "loss": 0.0533, - "mean_token_accuracy": 0.981170766055584, + "loss": 0.0567, + "mean_token_accuracy": 0.9808967933058739, "step": 1359 }, { "epoch": 7.956011730205279, - "grad_norm": 0.3490341048905599, + "grad_norm": 0.4078670819905106, "learning_rate": 7.646029409871029e-06, - "loss": 0.0596, - "mean_token_accuracy": 0.9797269105911255, + "loss": 0.0641, + "mean_token_accuracy": 0.9776945933699608, "step": 1360 }, { "epoch": 7.961876832844575, - "grad_norm": 0.3213083693973658, + "grad_norm": 0.25029954580605374, "learning_rate": 7.625363442204379e-06, - "loss": 0.0485, - "mean_token_accuracy": 0.9845825582742691, + "loss": 0.0494, + "mean_token_accuracy": 0.984387032687664, "step": 1361 }, { "epoch": 7.967741935483871, - "grad_norm": 0.31915132992627365, + "grad_norm": 0.3067346484833591, "learning_rate": 7.604749648662892e-06, - "loss": 0.0568, - "mean_token_accuracy": 0.9816582277417183, + "loss": 0.0599, + "mean_token_accuracy": 0.9821083396673203, "step": 1362 }, { "epoch": 7.973607038123167, - "grad_norm": 0.41843021589666207, + "grad_norm": 0.3034537440779158, "learning_rate": 7.584188104066317e-06, - "loss": 0.0526, - "mean_token_accuracy": 0.9819516390562057, + "loss": 0.0537, + "mean_token_accuracy": 0.9811761453747749, "step": 1363 }, { "epoch": 7.979472140762463, - "grad_norm": 0.36578962992205144, + "grad_norm": 0.2676727146076349, "learning_rate": 7.563678883044754e-06, - "loss": 0.0669, - "mean_token_accuracy": 0.9804994836449623, + "loss": 0.0693, + "mean_token_accuracy": 0.9785163179039955, "step": 1364 }, { "epoch": 7.9853372434017595, - "grad_norm": 0.37137732144693464, + "grad_norm": 0.2628017567235926, "learning_rate": 7.5432220600383935e-06, - "loss": 0.0677, - "mean_token_accuracy": 0.978873997926712, + "loss": 0.0701, + "mean_token_accuracy": 0.9787680581212044, "step": 1365 }, { "epoch": 7.991202346041056, - "grad_norm": 0.32129501323857224, + "grad_norm": 0.24809405335761472, "learning_rate": 7.522817709297241e-06, "loss": 0.0577, - "mean_token_accuracy": 0.9820384383201599, + "mean_token_accuracy": 0.9816685542464256, "step": 1366 }, { "epoch": 7.997067448680352, - "grad_norm": 0.3778910440397723, + "grad_norm": 0.2842505637743014, "learning_rate": 7.502465904880849e-06, - "loss": 0.0592, - "mean_token_accuracy": 0.9810664132237434, + "loss": 0.0612, + "mean_token_accuracy": 0.9808202460408211, "step": 1367 }, { "epoch": 8.0, - "grad_norm": 0.5872724133391657, + "grad_norm": 0.41987617913313496, "learning_rate": 7.482166720658046e-06, - "loss": 0.0557, - "mean_token_accuracy": 0.9852104634046555, + "loss": 0.0578, + "mean_token_accuracy": 0.9846701323986053, "step": 1368 }, { "epoch": 8.005865102639296, - "grad_norm": 0.3580615397900473, + "grad_norm": 0.214916141983213, "learning_rate": 7.461920230306674e-06, - "loss": 0.0573, - "mean_token_accuracy": 0.9798811301589012, + "loss": 0.0578, + "mean_token_accuracy": 0.9811231270432472, "step": 1369 }, { "epoch": 8.011730205278592, - "grad_norm": 0.30243346768551127, + "grad_norm": 0.2648143776987859, "learning_rate": 7.441726507313318e-06, - "loss": 0.0495, - "mean_token_accuracy": 0.9828371107578278, + "loss": 0.0509, + "mean_token_accuracy": 0.9820636063814163, "step": 1370 }, { "epoch": 8.017595307917889, - "grad_norm": 0.3074075490353524, + "grad_norm": 0.2201043806212357, "learning_rate": 7.421585624973033e-06, - "loss": 0.0568, - "mean_token_accuracy": 0.9812061563134193, + "loss": 0.0578, + "mean_token_accuracy": 0.9809773191809654, "step": 1371 }, { "epoch": 8.023460410557185, - "grad_norm": 0.2869959547346391, + "grad_norm": 0.19563315326882122, "learning_rate": 7.4014976563890915e-06, - "loss": 0.0482, - "mean_token_accuracy": 0.9833677485585213, + "loss": 0.0485, + "mean_token_accuracy": 0.9839671179652214, "step": 1372 }, { "epoch": 8.029325513196481, - "grad_norm": 0.3157738216475847, + "grad_norm": 0.20556851418206967, "learning_rate": 7.381462674472702e-06, - "loss": 0.0493, - "mean_token_accuracy": 0.9853304252028465, + "loss": 0.0498, + "mean_token_accuracy": 0.9850044846534729, "step": 1373 }, { "epoch": 8.035190615835777, - "grad_norm": 0.2917261306391619, + "grad_norm": 0.19766018394025286, "learning_rate": 7.36148075194276e-06, - "loss": 0.0495, - "mean_token_accuracy": 0.9836459308862686, + "loss": 0.051, + "mean_token_accuracy": 0.9836961477994919, "step": 1374 }, { "epoch": 8.041055718475073, - "grad_norm": 0.31842109310012173, + "grad_norm": 0.25314626449489763, "learning_rate": 7.341551961325574e-06, - "loss": 0.0507, - "mean_token_accuracy": 0.9838737025856972, + "loss": 0.051, + "mean_token_accuracy": 0.98270533233881, "step": 1375 }, { "epoch": 8.04692082111437, - "grad_norm": 0.30617736349863633, + "grad_norm": 0.3115274548014052, "learning_rate": 7.3216763749546025e-06, - "loss": 0.0464, - "mean_token_accuracy": 0.9870840385556221, + "loss": 0.0502, + "mean_token_accuracy": 0.9859990701079369, "step": 1376 }, { "epoch": 8.052785923753666, - "grad_norm": 0.3244368902054701, + "grad_norm": 0.21757031889947906, "learning_rate": 7.301854064970202e-06, - "loss": 0.0527, - "mean_token_accuracy": 0.9844281673431396, + "loss": 0.0529, + "mean_token_accuracy": 0.9847300946712494, "step": 1377 }, { "epoch": 8.058651026392962, - "grad_norm": 0.33740527739959525, + "grad_norm": 0.23951796689802396, "learning_rate": 7.282085103319349e-06, - "loss": 0.0522, - "mean_token_accuracy": 0.9837864488363266, + "loss": 0.0524, + "mean_token_accuracy": 0.9825025796890259, "step": 1378 }, { "epoch": 8.064516129032258, - "grad_norm": 0.35600976208331997, + "grad_norm": 0.23680698926945376, "learning_rate": 7.2623695617553934e-06, "loss": 0.0565, - "mean_token_accuracy": 0.9829199686646461, + "mean_token_accuracy": 0.9824689403176308, "step": 1379 }, { "epoch": 8.070381231671554, - "grad_norm": 0.34147147543660433, + "grad_norm": 0.22535881218387918, "learning_rate": 7.242707511837781e-06, - "loss": 0.0476, - "mean_token_accuracy": 0.9852808564901352, + "loss": 0.0491, + "mean_token_accuracy": 0.9835941642522812, "step": 1380 }, { "epoch": 8.07624633431085, - "grad_norm": 0.33417906439790784, + "grad_norm": 0.2785402711221106, "learning_rate": 7.223099024931817e-06, - "loss": 0.0474, - "mean_token_accuracy": 0.9858130812644958, + "loss": 0.0468, + "mean_token_accuracy": 0.9862391352653503, "step": 1381 }, { "epoch": 8.082111436950147, - "grad_norm": 0.39225200349168615, + "grad_norm": 0.2993996716485423, "learning_rate": 7.203544172208387e-06, - "loss": 0.0522, - "mean_token_accuracy": 0.9837429746985435, + "loss": 0.0551, + "mean_token_accuracy": 0.9826711639761925, "step": 1382 }, { "epoch": 8.087976539589443, - "grad_norm": 0.3559651753311173, + "grad_norm": 0.24507275921485255, "learning_rate": 7.184043024643712e-06, - "loss": 0.054, - "mean_token_accuracy": 0.9832148253917694, + "loss": 0.0532, + "mean_token_accuracy": 0.9831705465912819, "step": 1383 }, { "epoch": 8.093841642228739, - "grad_norm": 0.32846826889276376, + "grad_norm": 0.3103281371140337, "learning_rate": 7.16459565301908e-06, - "loss": 0.0508, - "mean_token_accuracy": 0.9839031621813774, + "loss": 0.0532, + "mean_token_accuracy": 0.9835177212953568, "step": 1384 }, { "epoch": 8.099706744868035, - "grad_norm": 0.36300700993630225, + "grad_norm": 0.25126193604629965, "learning_rate": 7.145202127920598e-06, - "loss": 0.0595, - "mean_token_accuracy": 0.9801753610372543, + "loss": 0.0607, + "mean_token_accuracy": 0.9802353978157043, "step": 1385 }, { "epoch": 8.105571847507331, - "grad_norm": 0.32115006218827374, + "grad_norm": 0.31079694307967204, "learning_rate": 7.125862519738924e-06, - "loss": 0.0516, - "mean_token_accuracy": 0.9814363420009613, + "loss": 0.0548, + "mean_token_accuracy": 0.981208898127079, "step": 1386 }, { "epoch": 8.111436950146627, - "grad_norm": 0.3703822810157945, + "grad_norm": 0.243430501615171, "learning_rate": 7.106576898669031e-06, - "loss": 0.0551, - "mean_token_accuracy": 0.9836417734622955, + "loss": 0.0565, + "mean_token_accuracy": 0.9828232526779175, "step": 1387 }, { "epoch": 8.117302052785924, - "grad_norm": 0.3596990409161751, + "grad_norm": 0.29248007573235113, "learning_rate": 7.087345334709931e-06, - "loss": 0.0559, - "mean_token_accuracy": 0.9793681129813194, + "loss": 0.0581, + "mean_token_accuracy": 0.9782914444804192, "step": 1388 }, { "epoch": 8.12316715542522, - "grad_norm": 0.439970724935907, + "grad_norm": 0.23983735227252295, "learning_rate": 7.068167897664433e-06, - "loss": 0.0576, - "mean_token_accuracy": 0.9794286787509918, + "loss": 0.0574, + "mean_token_accuracy": 0.9791986420750618, "step": 1389 }, { "epoch": 8.129032258064516, - "grad_norm": 0.34715662800736086, + "grad_norm": 0.25763068213117407, "learning_rate": 7.0490446571388925e-06, - "loss": 0.0592, - "mean_token_accuracy": 0.982256643474102, + "loss": 0.0607, + "mean_token_accuracy": 0.982701949775219, "step": 1390 }, { "epoch": 8.134897360703812, - "grad_norm": 0.3397432205528309, + "grad_norm": 0.31395890478128136, "learning_rate": 7.0299756825429465e-06, - "loss": 0.0507, - "mean_token_accuracy": 0.9833914712071419, + "loss": 0.0535, + "mean_token_accuracy": 0.9816442281007767, "step": 1391 }, { "epoch": 8.140762463343108, - "grad_norm": 0.2717469828760133, + "grad_norm": 0.36881180696205157, "learning_rate": 7.010961043089277e-06, - "loss": 0.0436, - "mean_token_accuracy": 0.9865109696984291, + "loss": 0.0457, + "mean_token_accuracy": 0.9867331683635712, "step": 1392 }, { "epoch": 8.146627565982405, - "grad_norm": 0.3481112115021732, + "grad_norm": 0.22371030335071296, "learning_rate": 6.992000807793333e-06, - "loss": 0.0492, - "mean_token_accuracy": 0.9850409254431725, + "loss": 0.0504, + "mean_token_accuracy": 0.9831229224801064, "step": 1393 }, { "epoch": 8.1524926686217, - "grad_norm": 0.34563812676429884, + "grad_norm": 0.23527634028093716, "learning_rate": 6.973095045473124e-06, - "loss": 0.0555, - "mean_token_accuracy": 0.982826754450798, + "loss": 0.057, + "mean_token_accuracy": 0.9822418987751007, "step": 1394 }, { "epoch": 8.158357771260997, - "grad_norm": 0.35087836917639226, + "grad_norm": 0.3098862024215743, "learning_rate": 6.954243824748922e-06, - "loss": 0.0578, - "mean_token_accuracy": 0.9829492494463921, + "loss": 0.0597, + "mean_token_accuracy": 0.9828020483255386, "step": 1395 }, { "epoch": 8.164222873900293, - "grad_norm": 0.2905320705631627, + "grad_norm": 0.2078992554645412, "learning_rate": 6.93544721404305e-06, - "loss": 0.05, - "mean_token_accuracy": 0.9838858619332314, + "loss": 0.0521, + "mean_token_accuracy": 0.9825119003653526, "step": 1396 }, { "epoch": 8.17008797653959, - "grad_norm": 0.3082826739718294, + "grad_norm": 0.432128734720523, "learning_rate": 6.916705281579612e-06, - "loss": 0.0519, - "mean_token_accuracy": 0.9831016063690186, + "loss": 0.0567, + "mean_token_accuracy": 0.9820001050829887, "step": 1397 }, { "epoch": 8.175953079178885, - "grad_norm": 0.36435966366625105, + "grad_norm": 0.24054569811011123, "learning_rate": 6.898018095384252e-06, - "loss": 0.0633, - "mean_token_accuracy": 0.9801773875951767, + "loss": 0.0657, + "mean_token_accuracy": 0.9803327471017838, "step": 1398 }, { "epoch": 8.181818181818182, - "grad_norm": 0.3803019536472587, + "grad_norm": 0.30936846417548536, "learning_rate": 6.879385723283913e-06, - "loss": 0.0521, - "mean_token_accuracy": 0.9834791570901871, + "loss": 0.0549, + "mean_token_accuracy": 0.9830240309238434, "step": 1399 }, { "epoch": 8.187683284457478, - "grad_norm": 0.3532847547848805, + "grad_norm": 0.20708706141150313, "learning_rate": 6.8608082329065775e-06, - "loss": 0.0532, - "mean_token_accuracy": 0.9834351092576981, + "loss": 0.0536, + "mean_token_accuracy": 0.9839501082897186, "step": 1400 }, { "epoch": 8.193548387096774, - "grad_norm": 0.38911936414789794, + "grad_norm": 0.4109806587663491, "learning_rate": 6.842285691681032e-06, - "loss": 0.0591, - "mean_token_accuracy": 0.9815716445446014, + "loss": 0.0672, + "mean_token_accuracy": 0.9807805866003036, "step": 1401 }, { "epoch": 8.19941348973607, - "grad_norm": 0.3428278171357225, + "grad_norm": 0.33459586080943454, "learning_rate": 6.8238181668366244e-06, - "loss": 0.0478, - "mean_token_accuracy": 0.9838706254959106, + "loss": 0.0506, + "mean_token_accuracy": 0.9823162779211998, "step": 1402 }, { "epoch": 8.205278592375366, - "grad_norm": 0.33191960396480547, + "grad_norm": 0.23204687344932642, "learning_rate": 6.805405725403006e-06, - "loss": 0.0572, - "mean_token_accuracy": 0.9818863347172737, + "loss": 0.059, + "mean_token_accuracy": 0.981462337076664, "step": 1403 }, { "epoch": 8.211143695014663, - "grad_norm": 0.36925272182757807, + "grad_norm": 0.24387787551349369, "learning_rate": 6.787048434209906e-06, - "loss": 0.0552, - "mean_token_accuracy": 0.9810345396399498, + "loss": 0.0559, + "mean_token_accuracy": 0.9810043722391129, "step": 1404 }, { "epoch": 8.217008797653959, - "grad_norm": 0.382501882480973, + "grad_norm": 0.24111998738404342, "learning_rate": 6.768746359886882e-06, - "loss": 0.0546, - "mean_token_accuracy": 0.9816075041890144, + "loss": 0.0552, + "mean_token_accuracy": 0.9823629483580589, "step": 1405 }, { "epoch": 8.222873900293255, - "grad_norm": 0.3595201095049301, + "grad_norm": 0.2164729282188346, "learning_rate": 6.750499568863061e-06, - "loss": 0.0556, - "mean_token_accuracy": 0.9821967929601669, + "loss": 0.0554, + "mean_token_accuracy": 0.9823937118053436, "step": 1406 }, { "epoch": 8.228739002932551, - "grad_norm": 0.38016578185329586, + "grad_norm": 0.3701551999536545, "learning_rate": 6.732308127366931e-06, - "loss": 0.0624, - "mean_token_accuracy": 0.9800859242677689, + "loss": 0.0639, + "mean_token_accuracy": 0.9803007692098618, "step": 1407 }, { "epoch": 8.234604105571847, - "grad_norm": 0.38842597342794766, + "grad_norm": 0.2602007603930705, "learning_rate": 6.714172101426077e-06, - "loss": 0.0582, - "mean_token_accuracy": 0.9795337095856667, + "loss": 0.0586, + "mean_token_accuracy": 0.9811219796538353, "step": 1408 }, { "epoch": 8.240469208211143, - "grad_norm": 0.26062246060123856, + "grad_norm": 0.1814905190223246, "learning_rate": 6.696091556866948e-06, - "loss": 0.0435, - "mean_token_accuracy": 0.9856739714741707, + "loss": 0.0446, + "mean_token_accuracy": 0.9854880273342133, "step": 1409 }, { "epoch": 8.24633431085044, - "grad_norm": 0.37049498473441145, + "grad_norm": 0.2478857316703125, "learning_rate": 6.678066559314622e-06, - "loss": 0.0606, - "mean_token_accuracy": 0.9812714830040932, + "loss": 0.0615, + "mean_token_accuracy": 0.9804988503456116, "step": 1410 }, { "epoch": 8.252199413489736, - "grad_norm": 0.3989598961651288, + "grad_norm": 0.23841278471860133, "learning_rate": 6.660097174192556e-06, - "loss": 0.056, - "mean_token_accuracy": 0.9807810261845589, + "loss": 0.0572, + "mean_token_accuracy": 0.9817399382591248, "step": 1411 }, { "epoch": 8.258064516129032, - "grad_norm": 0.3749156284590479, + "grad_norm": 0.2565046678967367, "learning_rate": 6.642183466722363e-06, - "loss": 0.0565, - "mean_token_accuracy": 0.9821376726031303, + "loss": 0.0573, + "mean_token_accuracy": 0.980897068977356, "step": 1412 }, { "epoch": 8.263929618768328, - "grad_norm": 0.35702668344077504, + "grad_norm": 0.2463098769541463, "learning_rate": 6.624325501923565e-06, - "loss": 0.0563, - "mean_token_accuracy": 0.9820006415247917, + "loss": 0.0573, + "mean_token_accuracy": 0.9820313602685928, "step": 1413 }, { "epoch": 8.269794721407624, - "grad_norm": 0.40663026140935876, + "grad_norm": 0.24504153100466936, "learning_rate": 6.606523344613362e-06, - "loss": 0.0597, - "mean_token_accuracy": 0.9796445891261101, + "loss": 0.061, + "mean_token_accuracy": 0.9795975983142853, "step": 1414 }, { "epoch": 8.27565982404692, - "grad_norm": 0.34723841867587746, + "grad_norm": 0.22361167881403213, "learning_rate": 6.588777059406397e-06, - "loss": 0.0559, - "mean_token_accuracy": 0.9835962206125259, + "loss": 0.0571, + "mean_token_accuracy": 0.9831408709287643, "step": 1415 }, { "epoch": 8.281524926686217, - "grad_norm": 0.312293892386182, + "grad_norm": 0.18291052131426874, "learning_rate": 6.571086710714516e-06, - "loss": 0.0465, - "mean_token_accuracy": 0.9838526993989944, + "loss": 0.0478, + "mean_token_accuracy": 0.9837886691093445, "step": 1416 }, { "epoch": 8.287390029325513, - "grad_norm": 0.3524237439107806, + "grad_norm": 0.23482128316494943, "learning_rate": 6.553452362746543e-06, - "loss": 0.0585, - "mean_token_accuracy": 0.980175569653511, + "loss": 0.0592, + "mean_token_accuracy": 0.9811000153422356, "step": 1417 }, { "epoch": 8.29325513196481, - "grad_norm": 0.39261505836168864, + "grad_norm": 0.2571114682551005, "learning_rate": 6.5358740795080335e-06, - "loss": 0.0639, - "mean_token_accuracy": 0.9783350303769112, + "loss": 0.0659, + "mean_token_accuracy": 0.9779541790485382, "step": 1418 }, { "epoch": 8.299120234604105, - "grad_norm": 0.4192403382994383, + "grad_norm": 0.3411374440781876, "learning_rate": 6.518351924801061e-06, - "loss": 0.0619, - "mean_token_accuracy": 0.9807342141866684, + "loss": 0.0623, + "mean_token_accuracy": 0.9813627898693085, "step": 1419 }, { "epoch": 8.304985337243401, - "grad_norm": 0.28497911177981083, + "grad_norm": 0.32170991105288976, "learning_rate": 6.500885962223969e-06, - "loss": 0.0511, - "mean_token_accuracy": 0.9853582382202148, + "loss": 0.0569, + "mean_token_accuracy": 0.9852675199508667, "step": 1420 }, { "epoch": 8.310850439882698, - "grad_norm": 0.3635257303812234, + "grad_norm": 0.2520903886069418, "learning_rate": 6.483476255171146e-06, - "loss": 0.0613, - "mean_token_accuracy": 0.9806642904877663, + "loss": 0.0631, + "mean_token_accuracy": 0.9805245026946068, "step": 1421 }, { "epoch": 8.316715542521994, - "grad_norm": 0.3284725590763597, + "grad_norm": 0.35309257782861436, "learning_rate": 6.4661228668328015e-06, - "loss": 0.0504, - "mean_token_accuracy": 0.9835583493113518, + "loss": 0.0543, + "mean_token_accuracy": 0.983853705227375, "step": 1422 }, { "epoch": 8.32258064516129, - "grad_norm": 0.3809611371335918, + "grad_norm": 0.27103047498821015, "learning_rate": 6.448825860194722e-06, - "loss": 0.0596, - "mean_token_accuracy": 0.9815945476293564, + "loss": 0.0609, + "mean_token_accuracy": 0.981744721531868, "step": 1423 }, { "epoch": 8.328445747800586, - "grad_norm": 0.26700828855473413, + "grad_norm": 0.3668875312033516, "learning_rate": 6.431585298038057e-06, - "loss": 0.0416, - "mean_token_accuracy": 0.9862992838025093, + "loss": 0.0436, + "mean_token_accuracy": 0.9858937785029411, "step": 1424 }, { "epoch": 8.334310850439882, - "grad_norm": 0.31427705205892353, + "grad_norm": 0.21839281047639716, "learning_rate": 6.414401242939087e-06, - "loss": 0.0579, - "mean_token_accuracy": 0.9815279394388199, + "loss": 0.0598, + "mean_token_accuracy": 0.9812084659934044, "step": 1425 }, { "epoch": 8.340175953079179, - "grad_norm": 0.36011252789263, + "grad_norm": 0.2888554299502811, "learning_rate": 6.397273757268987e-06, - "loss": 0.0508, - "mean_token_accuracy": 0.9852896630764008, + "loss": 0.0579, + "mean_token_accuracy": 0.9844140931963921, "step": 1426 }, { "epoch": 8.346041055718475, - "grad_norm": 0.3739186167424333, + "grad_norm": 0.24613171679315907, "learning_rate": 6.380202903193616e-06, - "loss": 0.0626, - "mean_token_accuracy": 0.9816398844122887, + "loss": 0.0631, + "mean_token_accuracy": 0.9819495007395744, "step": 1427 }, { "epoch": 8.351906158357771, - "grad_norm": 0.3711990215340053, + "grad_norm": 0.22714519725510895, "learning_rate": 6.363188742673281e-06, - "loss": 0.0556, - "mean_token_accuracy": 0.9811735600233078, + "loss": 0.0567, + "mean_token_accuracy": 0.9815320670604706, "step": 1428 }, { "epoch": 8.357771260997067, - "grad_norm": 0.33932075687055624, + "grad_norm": 0.3202332452186219, "learning_rate": 6.346231337462513e-06, - "loss": 0.051, - "mean_token_accuracy": 0.9825574532151222, + "loss": 0.0554, + "mean_token_accuracy": 0.982030026614666, "step": 1429 }, { "epoch": 8.363636363636363, - "grad_norm": 0.44161637530546255, + "grad_norm": 0.316039144956545, "learning_rate": 6.329330749109839e-06, - "loss": 0.0667, - "mean_token_accuracy": 0.978769101202488, + "loss": 0.0691, + "mean_token_accuracy": 0.9775633811950684, "step": 1430 }, { "epoch": 8.36950146627566, - "grad_norm": 0.346684374066376, + "grad_norm": 0.2181845443857865, "learning_rate": 6.312487038957573e-06, - "loss": 0.0558, - "mean_token_accuracy": 0.9824666082859039, + "loss": 0.056, + "mean_token_accuracy": 0.9824205189943314, "step": 1431 }, { "epoch": 8.375366568914956, - "grad_norm": 0.32798515060125477, + "grad_norm": 0.22328961022302585, "learning_rate": 6.295700268141579e-06, - "loss": 0.0479, - "mean_token_accuracy": 0.9828667864203453, + "loss": 0.0496, + "mean_token_accuracy": 0.9836017712950706, "step": 1432 }, { "epoch": 8.381231671554252, - "grad_norm": 0.3050242296207711, + "grad_norm": 0.19937915686885815, "learning_rate": 6.2789704975910574e-06, - "loss": 0.0472, - "mean_token_accuracy": 0.9842007234692574, + "loss": 0.049, + "mean_token_accuracy": 0.982590489089489, "step": 1433 }, { "epoch": 8.387096774193548, - "grad_norm": 0.342916785749358, + "grad_norm": 0.24150867763099138, "learning_rate": 6.262297788028316e-06, - "loss": 0.0489, - "mean_token_accuracy": 0.9815072119235992, + "loss": 0.0513, + "mean_token_accuracy": 0.981402613222599, "step": 1434 }, { "epoch": 8.392961876832844, - "grad_norm": 0.36151402107915315, + "grad_norm": 0.2795051735249161, "learning_rate": 6.245682199968556e-06, - "loss": 0.0568, - "mean_token_accuracy": 0.9811645448207855, + "loss": 0.0607, + "mean_token_accuracy": 0.9804069846868515, "step": 1435 }, { "epoch": 8.39882697947214, - "grad_norm": 0.33322534234356715, + "grad_norm": 0.25783232931813, "learning_rate": 6.229123793719656e-06, - "loss": 0.0532, - "mean_token_accuracy": 0.9831674918532372, + "loss": 0.0569, + "mean_token_accuracy": 0.9817790612578392, "step": 1436 }, { "epoch": 8.404692082111437, - "grad_norm": 0.3283361999533351, + "grad_norm": 0.30486166707710477, "learning_rate": 6.21262262938194e-06, - "loss": 0.0514, - "mean_token_accuracy": 0.9837821051478386, + "loss": 0.0536, + "mean_token_accuracy": 0.9836645349860191, "step": 1437 }, { "epoch": 8.410557184750733, - "grad_norm": 0.3401690287440309, + "grad_norm": 0.27233965999418625, "learning_rate": 6.196178766847969e-06, - "loss": 0.0534, - "mean_token_accuracy": 0.9818862527608871, + "loss": 0.0551, + "mean_token_accuracy": 0.9816263765096664, "step": 1438 }, { "epoch": 8.416422287390029, - "grad_norm": 0.37528738834672287, + "grad_norm": 0.24723015413758212, "learning_rate": 6.1797922658023264e-06, - "loss": 0.0635, - "mean_token_accuracy": 0.978727675974369, + "loss": 0.0639, + "mean_token_accuracy": 0.9785666167736053, "step": 1439 }, { "epoch": 8.422287390029325, - "grad_norm": 0.3149172294095641, + "grad_norm": 0.21982320056204013, "learning_rate": 6.16346318572139e-06, - "loss": 0.0541, - "mean_token_accuracy": 0.9831294119358063, + "loss": 0.0547, + "mean_token_accuracy": 0.982805423438549, "step": 1440 }, { "epoch": 8.428152492668621, - "grad_norm": 0.3802962509030587, + "grad_norm": 0.28335201007211935, "learning_rate": 6.147191585873128e-06, - "loss": 0.0605, - "mean_token_accuracy": 0.981091320514679, + "loss": 0.0626, + "mean_token_accuracy": 0.9803605228662491, "step": 1441 }, { "epoch": 8.434017595307918, - "grad_norm": 0.325742841513319, + "grad_norm": 0.2103051058451436, "learning_rate": 6.130977525316878e-06, - "loss": 0.0567, - "mean_token_accuracy": 0.9818108677864075, + "loss": 0.0578, + "mean_token_accuracy": 0.9820494800806046, "step": 1442 }, { "epoch": 8.439882697947214, - "grad_norm": 0.350918927222633, + "grad_norm": 0.205299410579196, "learning_rate": 6.114821062903125e-06, - "loss": 0.0559, - "mean_token_accuracy": 0.9823700860142708, + "loss": 0.0548, + "mean_token_accuracy": 0.982606902718544, "step": 1443 }, { "epoch": 8.44574780058651, - "grad_norm": 0.33888867438286, + "grad_norm": 0.22390637291501272, "learning_rate": 6.098722257273303e-06, - "loss": 0.054, - "mean_token_accuracy": 0.980469599366188, + "loss": 0.0547, + "mean_token_accuracy": 0.9798173382878304, "step": 1444 }, { "epoch": 8.451612903225806, - "grad_norm": 0.3986416341368361, + "grad_norm": 0.2771499590517391, "learning_rate": 6.082681166859579e-06, - "loss": 0.0608, - "mean_token_accuracy": 0.9823858961462975, + "loss": 0.0636, + "mean_token_accuracy": 0.9806607887148857, "step": 1445 }, { "epoch": 8.457478005865102, - "grad_norm": 0.3321438872721226, + "grad_norm": 0.4554475097871347, "learning_rate": 6.066697849884629e-06, - "loss": 0.0588, - "mean_token_accuracy": 0.9816719517111778, + "loss": 0.0633, + "mean_token_accuracy": 0.9796369150280952, "step": 1446 }, { "epoch": 8.463343108504398, - "grad_norm": 0.29731617560561274, + "grad_norm": 0.19216615352509395, "learning_rate": 6.0507723643614415e-06, - "loss": 0.043, - "mean_token_accuracy": 0.9855259880423546, + "loss": 0.0436, + "mean_token_accuracy": 0.9855980426073074, "step": 1447 }, { "epoch": 8.469208211143695, - "grad_norm": 0.3544585856707004, + "grad_norm": 0.22727652768343684, "learning_rate": 6.034904768093095e-06, - "loss": 0.0564, - "mean_token_accuracy": 0.9806393161416054, + "loss": 0.057, + "mean_token_accuracy": 0.9814287722110748, "step": 1448 }, { "epoch": 8.47507331378299, - "grad_norm": 0.3566904543758243, + "grad_norm": 0.22752398618013056, "learning_rate": 6.019095118672557e-06, - "loss": 0.0607, - "mean_token_accuracy": 0.9797836020588875, + "loss": 0.0618, + "mean_token_accuracy": 0.9799441695213318, "step": 1449 }, { "epoch": 8.480938416422287, - "grad_norm": 0.4011506112780418, + "grad_norm": 0.24193075900923788, "learning_rate": 6.003343473482469e-06, - "loss": 0.0561, - "mean_token_accuracy": 0.9821067750453949, + "loss": 0.0557, + "mean_token_accuracy": 0.9827790260314941, "step": 1450 }, { "epoch": 8.486803519061583, - "grad_norm": 0.39239441137933195, + "grad_norm": 0.24392059844002878, "learning_rate": 5.98764988969494e-06, - "loss": 0.059, - "mean_token_accuracy": 0.9805739000439644, + "loss": 0.0595, + "mean_token_accuracy": 0.9794244691729546, "step": 1451 }, { "epoch": 8.49266862170088, - "grad_norm": 0.29869881818062005, + "grad_norm": 0.21220940879325176, "learning_rate": 5.972014424271344e-06, - "loss": 0.0486, - "mean_token_accuracy": 0.9846675246953964, + "loss": 0.0502, + "mean_token_accuracy": 0.9840523451566696, "step": 1452 }, { "epoch": 8.498533724340176, - "grad_norm": 0.32409437830897814, + "grad_norm": 0.3847730117232794, "learning_rate": 5.956437133962103e-06, - "loss": 0.0545, - "mean_token_accuracy": 0.9832024946808815, + "loss": 0.0594, + "mean_token_accuracy": 0.9817788973450661, "step": 1453 }, { "epoch": 8.504398826979472, - "grad_norm": 0.44350748801496986, + "grad_norm": 0.35439580902846674, "learning_rate": 5.94091807530649e-06, - "loss": 0.0573, - "mean_token_accuracy": 0.9812595695257187, + "loss": 0.0592, + "mean_token_accuracy": 0.9799836799502373, "step": 1454 }, { "epoch": 8.510263929618768, - "grad_norm": 0.375249797161884, + "grad_norm": 0.2448678361420796, "learning_rate": 5.925457304632421e-06, - "loss": 0.0581, - "mean_token_accuracy": 0.9809895157814026, + "loss": 0.0594, + "mean_token_accuracy": 0.9810876250267029, "step": 1455 }, { "epoch": 8.516129032258064, - "grad_norm": 0.3697637091968322, + "grad_norm": 0.2546365410929796, "learning_rate": 5.91005487805625e-06, - "loss": 0.0626, - "mean_token_accuracy": 0.9802481904625893, + "loss": 0.0655, + "mean_token_accuracy": 0.9798986539244652, "step": 1456 }, { "epoch": 8.52199413489736, - "grad_norm": 0.350870030605446, + "grad_norm": 0.22510297002392768, "learning_rate": 5.894710851482563e-06, - "loss": 0.0547, - "mean_token_accuracy": 0.9832234531641006, + "loss": 0.0556, + "mean_token_accuracy": 0.983493484556675, "step": 1457 }, { "epoch": 8.527859237536656, - "grad_norm": 0.365055384156501, + "grad_norm": 0.24029196708436434, "learning_rate": 5.879425280603981e-06, - "loss": 0.0576, - "mean_token_accuracy": 0.9823313876986504, + "loss": 0.0585, + "mean_token_accuracy": 0.9817355275154114, "step": 1458 }, { "epoch": 8.533724340175953, - "grad_norm": 0.3566574779135655, + "grad_norm": 0.2948271453952541, "learning_rate": 5.864198220900952e-06, - "loss": 0.0523, - "mean_token_accuracy": 0.9819063544273376, + "loss": 0.054, + "mean_token_accuracy": 0.9828294068574905, "step": 1459 }, { "epoch": 8.539589442815249, - "grad_norm": 0.36841885684673653, + "grad_norm": 0.24715359464949893, "learning_rate": 5.849029727641552e-06, - "loss": 0.0551, - "mean_token_accuracy": 0.9813630729913712, + "loss": 0.0573, + "mean_token_accuracy": 0.9808838665485382, "step": 1460 }, { "epoch": 8.545454545454545, - "grad_norm": 0.3518709497305404, + "grad_norm": 0.22895910866616212, "learning_rate": 5.833919855881286e-06, - "loss": 0.0567, - "mean_token_accuracy": 0.9804489463567734, + "loss": 0.0587, + "mean_token_accuracy": 0.9810523241758347, "step": 1461 }, { "epoch": 8.551319648093841, - "grad_norm": 0.3453349736449673, + "grad_norm": 0.23923347448065085, "learning_rate": 5.818868660462886e-06, - "loss": 0.0518, - "mean_token_accuracy": 0.9826655164361, + "loss": 0.053, + "mean_token_accuracy": 0.9833957925438881, "step": 1462 }, { "epoch": 8.557184750733137, - "grad_norm": 0.3147211495267202, + "grad_norm": 0.20995234181528383, "learning_rate": 5.803876196016114e-06, - "loss": 0.0525, - "mean_token_accuracy": 0.9852809086441994, + "loss": 0.0536, + "mean_token_accuracy": 0.9848415032029152, "step": 1463 }, { "epoch": 8.563049853372434, - "grad_norm": 0.3292651610864462, + "grad_norm": 0.3313140616468372, "learning_rate": 5.788942516957561e-06, - "loss": 0.0521, - "mean_token_accuracy": 0.9832091629505157, + "loss": 0.0554, + "mean_token_accuracy": 0.9830712005496025, "step": 1464 }, { "epoch": 8.56891495601173, - "grad_norm": 0.4001132741069757, + "grad_norm": 0.25299926535599937, "learning_rate": 5.774067677490448e-06, - "loss": 0.0609, - "mean_token_accuracy": 0.9808182790875435, + "loss": 0.0618, + "mean_token_accuracy": 0.9809750616550446, "step": 1465 }, { "epoch": 8.574780058651026, - "grad_norm": 0.3394619974830389, + "grad_norm": 0.26249947027843634, "learning_rate": 5.759251731604435e-06, - "loss": 0.0483, - "mean_token_accuracy": 0.9834600687026978, + "loss": 0.05, + "mean_token_accuracy": 0.9828970730304718, "step": 1466 }, { "epoch": 8.580645161290322, - "grad_norm": 0.3843990415825973, + "grad_norm": 0.2515681497508488, "learning_rate": 5.744494733075424e-06, - "loss": 0.0569, - "mean_token_accuracy": 0.9812219887971878, + "loss": 0.057, + "mean_token_accuracy": 0.9812851548194885, "step": 1467 }, { "epoch": 8.586510263929618, - "grad_norm": 0.313015761345117, + "grad_norm": 0.20645851852940889, "learning_rate": 5.729796735465359e-06, - "loss": 0.0556, - "mean_token_accuracy": 0.9805843010544777, + "loss": 0.0563, + "mean_token_accuracy": 0.9807659089565277, "step": 1468 }, { "epoch": 8.592375366568914, - "grad_norm": 0.397873067980578, + "grad_norm": 0.23265625943253596, "learning_rate": 5.7151577921220356e-06, - "loss": 0.0565, - "mean_token_accuracy": 0.980902798473835, + "loss": 0.0578, + "mean_token_accuracy": 0.981408916413784, "step": 1469 }, { "epoch": 8.59824046920821, - "grad_norm": 0.2775157728708668, + "grad_norm": 0.18593301851214442, "learning_rate": 5.7005779561789046e-06, - "loss": 0.046, - "mean_token_accuracy": 0.9848226681351662, + "loss": 0.0479, + "mean_token_accuracy": 0.9842170625925064, "step": 1470 }, { "epoch": 8.604105571847507, - "grad_norm": 0.32262469073810285, + "grad_norm": 0.38192119047399087, "learning_rate": 5.686057280554882e-06, - "loss": 0.0508, - "mean_token_accuracy": 0.9834897667169571, + "loss": 0.0535, + "mean_token_accuracy": 0.9830747321248055, "step": 1471 }, { "epoch": 8.609970674486803, - "grad_norm": 0.3381801962057254, + "grad_norm": 0.22481461210294473, "learning_rate": 5.671595817954157e-06, - "loss": 0.0549, - "mean_token_accuracy": 0.9829668179154396, + "loss": 0.0554, + "mean_token_accuracy": 0.9830892011523247, "step": 1472 }, { "epoch": 8.6158357771261, - "grad_norm": 0.4125328725756552, + "grad_norm": 0.2898055226555559, "learning_rate": 5.657193620865997e-06, - "loss": 0.0539, - "mean_token_accuracy": 0.9830298721790314, + "loss": 0.0542, + "mean_token_accuracy": 0.9828846156597137, "step": 1473 }, { "epoch": 8.621700879765395, - "grad_norm": 0.42397175137698584, + "grad_norm": 0.4735079320278208, "learning_rate": 5.642850741564562e-06, - "loss": 0.0597, - "mean_token_accuracy": 0.981752060353756, + "loss": 0.0672, + "mean_token_accuracy": 0.9807828441262245, "step": 1474 }, { "epoch": 8.627565982404692, - "grad_norm": 0.3678506175376002, + "grad_norm": 0.23634331953871995, "learning_rate": 5.62856723210871e-06, - "loss": 0.0569, - "mean_token_accuracy": 0.9808289110660553, + "loss": 0.0577, + "mean_token_accuracy": 0.9808989763259888, "step": 1475 }, { "epoch": 8.633431085043988, - "grad_norm": 0.38044929459613297, + "grad_norm": 0.30034970883591433, "learning_rate": 5.614343144341814e-06, - "loss": 0.0579, - "mean_token_accuracy": 0.9798820838332176, + "loss": 0.0607, + "mean_token_accuracy": 0.9796464666724205, "step": 1476 }, { "epoch": 8.639296187683284, - "grad_norm": 0.3173567806897864, + "grad_norm": 0.1988418114892257, "learning_rate": 5.600178529891564e-06, - "loss": 0.0492, - "mean_token_accuracy": 0.9839482828974724, + "loss": 0.05, + "mean_token_accuracy": 0.9832219183444977, "step": 1477 }, { "epoch": 8.64516129032258, - "grad_norm": 0.39392861788504485, + "grad_norm": 0.2733706899404459, "learning_rate": 5.58607344016979e-06, - "loss": 0.0637, - "mean_token_accuracy": 0.9784178957343102, + "loss": 0.0655, + "mean_token_accuracy": 0.9781039133667946, "step": 1478 }, { "epoch": 8.651026392961876, - "grad_norm": 0.3357796568293705, + "grad_norm": 0.21042727535719943, "learning_rate": 5.5720279263722795e-06, - "loss": 0.0528, - "mean_token_accuracy": 0.981262743473053, + "loss": 0.0533, + "mean_token_accuracy": 0.9818021357059479, "step": 1479 }, { "epoch": 8.656891495601172, - "grad_norm": 0.3159822158252555, + "grad_norm": 0.24661902402025798, "learning_rate": 5.558042039478564e-06, - "loss": 0.0514, - "mean_token_accuracy": 0.9824788197875023, + "loss": 0.0531, + "mean_token_accuracy": 0.9816571772098541, "step": 1480 }, { "epoch": 8.662756598240469, - "grad_norm": 0.38617284468702967, + "grad_norm": 0.2470785047648527, "learning_rate": 5.544115830251769e-06, - "loss": 0.0626, - "mean_token_accuracy": 0.9803685322403908, + "loss": 0.0638, + "mean_token_accuracy": 0.9801130592823029, "step": 1481 }, { "epoch": 8.668621700879765, - "grad_norm": 0.33796336603448435, + "grad_norm": 0.276592842472206, "learning_rate": 5.530249349238407e-06, - "loss": 0.0559, - "mean_token_accuracy": 0.9826963320374489, + "loss": 0.0572, + "mean_token_accuracy": 0.9824761971831322, "step": 1482 }, { "epoch": 8.674486803519061, - "grad_norm": 0.3953276485009274, + "grad_norm": 0.27445814157641885, "learning_rate": 5.516442646768207e-06, - "loss": 0.0616, - "mean_token_accuracy": 0.9777121841907501, + "loss": 0.063, + "mean_token_accuracy": 0.9772898554801941, "step": 1483 }, { "epoch": 8.680351906158357, - "grad_norm": 0.3483816517700947, + "grad_norm": 0.2249695783332849, "learning_rate": 5.502695772953922e-06, - "loss": 0.0615, - "mean_token_accuracy": 0.979569785296917, + "loss": 0.063, + "mean_token_accuracy": 0.9796239584684372, "step": 1484 }, { "epoch": 8.686217008797653, - "grad_norm": 0.3637996842940104, + "grad_norm": 0.24349980523835593, "learning_rate": 5.489008777691151e-06, - "loss": 0.0549, - "mean_token_accuracy": 0.9833445623517036, + "loss": 0.0559, + "mean_token_accuracy": 0.9837216436862946, "step": 1485 }, { "epoch": 8.69208211143695, - "grad_norm": 0.354900472955803, + "grad_norm": 0.440521910978946, "learning_rate": 5.475381710658161e-06, - "loss": 0.0557, - "mean_token_accuracy": 0.9816870614886284, + "loss": 0.0607, + "mean_token_accuracy": 0.9804883822798729, "step": 1486 }, { "epoch": 8.697947214076246, - "grad_norm": 0.4715464896596745, + "grad_norm": 0.2554597984182961, "learning_rate": 5.4618146213157e-06, - "loss": 0.064, - "mean_token_accuracy": 0.9779196679592133, + "loss": 0.0641, + "mean_token_accuracy": 0.9781029522418976, "step": 1487 }, { "epoch": 8.703812316715542, - "grad_norm": 0.34964552430972173, + "grad_norm": 0.47861781564254957, "learning_rate": 5.448307558906822e-06, - "loss": 0.059, - "mean_token_accuracy": 0.9810210913419724, + "loss": 0.0646, + "mean_token_accuracy": 0.9812514930963516, "step": 1488 }, { "epoch": 8.709677419354838, - "grad_norm": 0.35199334637919955, + "grad_norm": 0.2930027638485088, "learning_rate": 5.434860572456711e-06, - "loss": 0.0532, - "mean_token_accuracy": 0.9808976799249649, + "loss": 0.0559, + "mean_token_accuracy": 0.9799815565347672, "step": 1489 }, { "epoch": 8.715542521994134, - "grad_norm": 0.34772290388116905, + "grad_norm": 0.22677059107025838, "learning_rate": 5.421473710772496e-06, - "loss": 0.0574, - "mean_token_accuracy": 0.9827618896961212, + "loss": 0.0569, + "mean_token_accuracy": 0.9823431074619293, "step": 1490 }, { "epoch": 8.72140762463343, - "grad_norm": 0.3152598232442663, + "grad_norm": 0.258704788971786, "learning_rate": 5.408147022443077e-06, - "loss": 0.0509, - "mean_token_accuracy": 0.9823957309126854, + "loss": 0.0524, + "mean_token_accuracy": 0.9814721569418907, "step": 1491 }, { "epoch": 8.727272727272727, - "grad_norm": 0.3297027726384655, + "grad_norm": 0.2273206164374934, "learning_rate": 5.39488055583895e-06, - "loss": 0.0581, - "mean_token_accuracy": 0.9830864146351814, + "loss": 0.0601, + "mean_token_accuracy": 0.9824408814311028, "step": 1492 }, { "epoch": 8.733137829912023, - "grad_norm": 0.40568193564006616, + "grad_norm": 0.2502689684955035, "learning_rate": 5.3816743591120365e-06, "loss": 0.0565, - "mean_token_accuracy": 0.9806106314063072, + "mean_token_accuracy": 0.9813073575496674, "step": 1493 }, { "epoch": 8.739002932551319, - "grad_norm": 0.33951733127086026, + "grad_norm": 0.23368312729472407, "learning_rate": 5.368528480195492e-06, - "loss": 0.0574, - "mean_token_accuracy": 0.9826337546110153, + "loss": 0.0593, + "mean_token_accuracy": 0.9821663275361061, "step": 1494 }, { "epoch": 8.744868035190615, - "grad_norm": 0.27702511846746336, + "grad_norm": 0.1687899775907795, "learning_rate": 5.355442966803544e-06, - "loss": 0.0455, - "mean_token_accuracy": 0.9840430989861488, + "loss": 0.0459, + "mean_token_accuracy": 0.9838021844625473, "step": 1495 }, { "epoch": 8.750733137829911, - "grad_norm": 0.37541384151540996, + "grad_norm": 0.430656839260121, "learning_rate": 5.342417866431326e-06, - "loss": 0.0607, - "mean_token_accuracy": 0.9778427630662918, + "loss": 0.0645, + "mean_token_accuracy": 0.978046216070652, "step": 1496 }, { "epoch": 8.756598240469208, - "grad_norm": 0.37394179575858666, + "grad_norm": 0.2379693424056092, "learning_rate": 5.329453226354692e-06, - "loss": 0.0584, - "mean_token_accuracy": 0.9819178581237793, + "loss": 0.0589, + "mean_token_accuracy": 0.9819848984479904, "step": 1497 }, { "epoch": 8.762463343108504, - "grad_norm": 0.34900072180139713, + "grad_norm": 0.21809785004029345, "learning_rate": 5.31654909363005e-06, - "loss": 0.055, - "mean_token_accuracy": 0.9832709729671478, + "loss": 0.0559, + "mean_token_accuracy": 0.9828894585371017, "step": 1498 }, { "epoch": 8.7683284457478, - "grad_norm": 0.40720615945503313, + "grad_norm": 0.27329055037352745, "learning_rate": 5.303705515094187e-06, - "loss": 0.0674, - "mean_token_accuracy": 0.9800705909729004, + "loss": 0.0687, + "mean_token_accuracy": 0.9799009263515472, "step": 1499 }, { "epoch": 8.774193548387096, - "grad_norm": 0.419760149260306, + "grad_norm": 0.2757990646414536, "learning_rate": 5.290922537364109e-06, - "loss": 0.0663, - "mean_token_accuracy": 0.9766133427619934, + "loss": 0.0671, + "mean_token_accuracy": 0.9762155041098595, "step": 1500 }, { "epoch": 8.780058651026392, - "grad_norm": 0.3368715613053454, + "grad_norm": 0.22185210710048459, "learning_rate": 5.278200206836861e-06, - "loss": 0.0578, - "mean_token_accuracy": 0.9801322594285011, + "loss": 0.0591, + "mean_token_accuracy": 0.980154275894165, "step": 1501 }, { "epoch": 8.785923753665688, - "grad_norm": 0.3759270192655754, + "grad_norm": 0.22047265984608966, "learning_rate": 5.265538569689365e-06, - "loss": 0.0546, - "mean_token_accuracy": 0.9813186898827553, + "loss": 0.0538, + "mean_token_accuracy": 0.9819122850894928, "step": 1502 }, { "epoch": 8.791788856304985, - "grad_norm": 0.3253811690531697, + "grad_norm": 0.20721791327320876, "learning_rate": 5.25293767187825e-06, - "loss": 0.0521, - "mean_token_accuracy": 0.9838348925113678, + "loss": 0.0529, + "mean_token_accuracy": 0.9842581674456596, "step": 1503 }, { "epoch": 8.79765395894428, - "grad_norm": 0.40681946489582455, + "grad_norm": 0.2591529652741556, "learning_rate": 5.240397559139685e-06, - "loss": 0.0582, - "mean_token_accuracy": 0.9799980223178864, + "loss": 0.0594, + "mean_token_accuracy": 0.9804480522871017, "step": 1504 }, { "epoch": 8.803519061583577, - "grad_norm": 0.3113077584917748, + "grad_norm": 0.19629905391668, "learning_rate": 5.227918276989215e-06, - "loss": 0.0542, - "mean_token_accuracy": 0.9809886813163757, + "loss": 0.0541, + "mean_token_accuracy": 0.9810454174876213, "step": 1505 }, { "epoch": 8.809384164222873, - "grad_norm": 0.31490285722753797, + "grad_norm": 0.24900270526978613, "learning_rate": 5.2154998707215976e-06, - "loss": 0.0537, - "mean_token_accuracy": 0.9806480631232262, + "loss": 0.0554, + "mean_token_accuracy": 0.9797688350081444, "step": 1506 }, { "epoch": 8.81524926686217, - "grad_norm": 0.3335702216609551, + "grad_norm": 0.35646628836269056, "learning_rate": 5.203142385410628e-06, - "loss": 0.0524, - "mean_token_accuracy": 0.9840300157666206, + "loss": 0.0542, + "mean_token_accuracy": 0.9833614900708199, "step": 1507 }, { "epoch": 8.821114369501466, - "grad_norm": 0.4849332504235522, + "grad_norm": 0.21844551824759018, "learning_rate": 5.190845865908987e-06, - "loss": 0.0528, - "mean_token_accuracy": 0.9802124425768852, + "loss": 0.0527, + "mean_token_accuracy": 0.9799956232309341, "step": 1508 }, { "epoch": 8.826979472140762, - "grad_norm": 0.3769541452912475, + "grad_norm": 0.27531658663437253, "learning_rate": 5.178610356848075e-06, - "loss": 0.0567, - "mean_token_accuracy": 0.9822128117084503, + "loss": 0.0588, + "mean_token_accuracy": 0.9806870818138123, "step": 1509 }, { "epoch": 8.832844574780058, - "grad_norm": 0.37508843809475206, + "grad_norm": 0.27548218874842867, "learning_rate": 5.166435902637848e-06, - "loss": 0.0517, - "mean_token_accuracy": 0.9822444394230843, + "loss": 0.0533, + "mean_token_accuracy": 0.9819175824522972, "step": 1510 }, { "epoch": 8.838709677419354, - "grad_norm": 0.3221405938386644, + "grad_norm": 0.21735665267404067, "learning_rate": 5.154322547466658e-06, - "loss": 0.0517, - "mean_token_accuracy": 0.9838709086179733, + "loss": 0.0524, + "mean_token_accuracy": 0.9834935739636421, "step": 1511 }, { "epoch": 8.84457478005865, - "grad_norm": 0.3561073312103975, + "grad_norm": 0.23058685157888828, "learning_rate": 5.142270335301095e-06, - "loss": 0.0517, - "mean_token_accuracy": 0.9825182780623436, + "loss": 0.053, + "mean_token_accuracy": 0.9823649004101753, "step": 1512 }, { "epoch": 8.850439882697946, - "grad_norm": 0.34042481371976013, + "grad_norm": 0.24988833053602405, "learning_rate": 5.130279309885817e-06, - "loss": 0.0527, - "mean_token_accuracy": 0.9820515289902687, + "loss": 0.0537, + "mean_token_accuracy": 0.9819278046488762, "step": 1513 }, { "epoch": 8.856304985337243, - "grad_norm": 0.4953533693345446, + "grad_norm": 0.26317833073022107, "learning_rate": 5.118349514743404e-06, - "loss": 0.064, - "mean_token_accuracy": 0.979572020471096, + "loss": 0.0642, + "mean_token_accuracy": 0.9782841205596924, "step": 1514 }, { "epoch": 8.862170087976539, - "grad_norm": 0.45447927852562464, + "grad_norm": 0.2734025499041476, "learning_rate": 5.1064809931741975e-06, - "loss": 0.0669, - "mean_token_accuracy": 0.9793067052960396, + "loss": 0.0676, + "mean_token_accuracy": 0.9794860184192657, "step": 1515 }, { "epoch": 8.868035190615835, - "grad_norm": 0.31567146504603205, + "grad_norm": 0.2055012084171587, "learning_rate": 5.094673788256137e-06, - "loss": 0.0519, - "mean_token_accuracy": 0.9847134873270988, + "loss": 0.0527, + "mean_token_accuracy": 0.9848830178380013, "step": 1516 }, { "epoch": 8.873900293255131, - "grad_norm": 0.41749467992256695, + "grad_norm": 0.2619226204611417, "learning_rate": 5.082927942844603e-06, - "loss": 0.062, - "mean_token_accuracy": 0.9803328365087509, + "loss": 0.0635, + "mean_token_accuracy": 0.9811034798622131, "step": 1517 }, { "epoch": 8.879765395894427, - "grad_norm": 0.3155755372513205, + "grad_norm": 0.22565654238279362, "learning_rate": 5.0712434995722734e-06, - "loss": 0.056, - "mean_token_accuracy": 0.9794782549142838, + "loss": 0.0566, + "mean_token_accuracy": 0.9808684885501862, "step": 1518 }, { "epoch": 8.885630498533724, - "grad_norm": 0.38709916701582664, + "grad_norm": 0.3079929186156211, "learning_rate": 5.059620500848964e-06, - "loss": 0.057, - "mean_token_accuracy": 0.9829104915261269, + "loss": 0.06, + "mean_token_accuracy": 0.9821002259850502, "step": 1519 }, { "epoch": 8.89149560117302, - "grad_norm": 0.35821044241742506, + "grad_norm": 0.24670531669936663, "learning_rate": 5.048058988861455e-06, - "loss": 0.0562, - "mean_token_accuracy": 0.9820261895656586, + "loss": 0.0573, + "mean_token_accuracy": 0.9818818718194962, "step": 1520 }, { "epoch": 8.897360703812316, - "grad_norm": 0.3115758973353694, + "grad_norm": 0.21141946255543642, "learning_rate": 5.0365590055733715e-06, - "loss": 0.0525, - "mean_token_accuracy": 0.9838507696986198, + "loss": 0.0535, + "mean_token_accuracy": 0.9837949275970459, "step": 1521 }, { "epoch": 8.903225806451612, - "grad_norm": 0.399494434274281, + "grad_norm": 0.27230889518189605, "learning_rate": 5.025120592725009e-06, - "loss": 0.0622, - "mean_token_accuracy": 0.980305053293705, + "loss": 0.0628, + "mean_token_accuracy": 0.9804145693778992, "step": 1522 }, { "epoch": 8.909090909090908, - "grad_norm": 0.38255414855046266, + "grad_norm": 0.2255957667866212, "learning_rate": 5.013743791833187e-06, - "loss": 0.0581, - "mean_token_accuracy": 0.9823561608791351, + "loss": 0.0595, + "mean_token_accuracy": 0.9818353727459908, "step": 1523 }, { "epoch": 8.914956011730204, - "grad_norm": 0.3281961203877841, + "grad_norm": 0.3038137068972412, "learning_rate": 5.002428644191094e-06, - "loss": 0.0557, - "mean_token_accuracy": 0.9815320670604706, + "loss": 0.0584, + "mean_token_accuracy": 0.9827196970582008, "step": 1524 }, { "epoch": 8.9208211143695, - "grad_norm": 0.3245875923187815, + "grad_norm": 0.3100553005258722, "learning_rate": 4.991175190868148e-06, - "loss": 0.0573, - "mean_token_accuracy": 0.9829541444778442, + "loss": 0.0591, + "mean_token_accuracy": 0.9823226183652878, "step": 1525 }, { "epoch": 8.926686217008797, - "grad_norm": 0.3390424334484287, + "grad_norm": 0.22422743936182463, "learning_rate": 4.9799834727098415e-06, - "loss": 0.0501, - "mean_token_accuracy": 0.9831917360424995, + "loss": 0.0527, + "mean_token_accuracy": 0.9827243983745575, "step": 1526 }, { "epoch": 8.932551319648093, - "grad_norm": 0.3645654304118876, + "grad_norm": 0.263359825362434, "learning_rate": 4.968853530337587e-06, - "loss": 0.0584, - "mean_token_accuracy": 0.9815365374088287, + "loss": 0.0599, + "mean_token_accuracy": 0.981306865811348, "step": 1527 }, { "epoch": 8.93841642228739, - "grad_norm": 0.29681078383119797, + "grad_norm": 0.19382255427509223, "learning_rate": 4.957785404148585e-06, - "loss": 0.0487, - "mean_token_accuracy": 0.9808258190751076, + "loss": 0.0499, + "mean_token_accuracy": 0.9806587025523186, "step": 1528 }, { "epoch": 8.944281524926687, - "grad_norm": 0.3672868154121987, + "grad_norm": 0.25528185564658207, "learning_rate": 4.946779134315662e-06, - "loss": 0.0609, - "mean_token_accuracy": 0.9804074466228485, + "loss": 0.0628, + "mean_token_accuracy": 0.9796261489391327, "step": 1529 }, { "epoch": 8.950146627565982, - "grad_norm": 0.42562270878531583, + "grad_norm": 0.3398157063816798, "learning_rate": 4.935834760787133e-06, - "loss": 0.0588, - "mean_token_accuracy": 0.9829668998718262, + "loss": 0.0583, + "mean_token_accuracy": 0.9820915162563324, "step": 1530 }, { "epoch": 8.95601173020528, - "grad_norm": 0.3675536193118254, + "grad_norm": 0.2587597902562318, "learning_rate": 4.924952323286651e-06, - "loss": 0.0561, - "mean_token_accuracy": 0.9807698279619217, + "loss": 0.0585, + "mean_token_accuracy": 0.9807931408286095, "step": 1531 }, { "epoch": 8.961876832844574, - "grad_norm": 0.36281675499172533, + "grad_norm": 0.24341885443424222, "learning_rate": 4.91413186131307e-06, - "loss": 0.0566, - "mean_token_accuracy": 0.9821875244379044, + "loss": 0.0581, + "mean_token_accuracy": 0.9816311299800873, "step": 1532 }, { "epoch": 8.967741935483872, - "grad_norm": 0.34587044361172914, + "grad_norm": 0.23486359475035062, "learning_rate": 4.9033734141402964e-06, - "loss": 0.0571, - "mean_token_accuracy": 0.9813329204916954, + "loss": 0.0584, + "mean_token_accuracy": 0.9809052050113678, "step": 1533 }, { "epoch": 8.973607038123166, - "grad_norm": 0.32779198026920486, + "grad_norm": 0.21784879087173917, "learning_rate": 4.892677020817151e-06, - "loss": 0.0545, - "mean_token_accuracy": 0.9812613651156425, + "loss": 0.0557, + "mean_token_accuracy": 0.980447068810463, "step": 1534 }, { "epoch": 8.979472140762464, - "grad_norm": 0.372802439575984, + "grad_norm": 0.3280892494362136, "learning_rate": 4.8820427201672195e-06, - "loss": 0.0563, - "mean_token_accuracy": 0.9804784283041954, + "loss": 0.0594, + "mean_token_accuracy": 0.9797915667295456, "step": 1535 }, { "epoch": 8.985337243401759, - "grad_norm": 0.41235127004454536, + "grad_norm": 0.25362092329151553, "learning_rate": 4.871470550788717e-06, - "loss": 0.063, - "mean_token_accuracy": 0.9769391268491745, + "loss": 0.064, + "mean_token_accuracy": 0.9781641364097595, "step": 1536 }, { "epoch": 8.991202346041057, - "grad_norm": 0.35330775445880935, + "grad_norm": 0.23260324944709893, "learning_rate": 4.860960551054352e-06, - "loss": 0.0576, - "mean_token_accuracy": 0.9815262779593468, + "loss": 0.0582, + "mean_token_accuracy": 0.9819280281662941, "step": 1537 }, { "epoch": 8.997067448680351, - "grad_norm": 0.3310851925728053, + "grad_norm": 0.2142235937949736, "learning_rate": 4.850512759111177e-06, - "loss": 0.0554, - "mean_token_accuracy": 0.9816920757293701, + "loss": 0.0566, + "mean_token_accuracy": 0.980823814868927, "step": 1538 }, { "epoch": 9.0, - "grad_norm": 0.3310851925728053, + "grad_norm": 0.2142235937949736, "learning_rate": 4.840127212880457e-06, - "loss": 0.0489, - "mean_token_accuracy": 0.9838157296180725, + "loss": 0.0498, + "mean_token_accuracy": 0.9837545901536942, "step": 1539 }, { "epoch": 9.005865102639296, - "grad_norm": 0.4652301353877543, + "grad_norm": 0.3027656081008066, "learning_rate": 4.82980395005753e-06, - "loss": 0.0539, - "mean_token_accuracy": 0.9824711456894875, + "loss": 0.0544, + "mean_token_accuracy": 0.983098641037941, "step": 1540 }, { "epoch": 9.011730205278592, - "grad_norm": 0.3871394835094803, + "grad_norm": 0.2411809393362278, "learning_rate": 4.8195430081116715e-06, - "loss": 0.0566, - "mean_token_accuracy": 0.9818312674760818, + "loss": 0.0573, + "mean_token_accuracy": 0.9815821573138237, "step": 1541 }, { "epoch": 9.017595307917889, - "grad_norm": 0.34228612380666135, + "grad_norm": 0.23641839942255174, "learning_rate": 4.809344424285959e-06, - "loss": 0.0467, - "mean_token_accuracy": 0.9852696433663368, + "loss": 0.0476, + "mean_token_accuracy": 0.9846600145101547, "step": 1542 }, { "epoch": 9.023460410557185, - "grad_norm": 0.3733639316356061, + "grad_norm": 0.23003438448199545, "learning_rate": 4.799208235597129e-06, - "loss": 0.0579, - "mean_token_accuracy": 0.9798446521162987, + "loss": 0.0582, + "mean_token_accuracy": 0.9806374609470367, "step": 1543 }, { "epoch": 9.029325513196481, - "grad_norm": 0.37824565482765427, + "grad_norm": 0.23956336736168363, "learning_rate": 4.7891344788354535e-06, - "loss": 0.0546, - "mean_token_accuracy": 0.9813016727566719, + "loss": 0.056, + "mean_token_accuracy": 0.9814530238509178, "step": 1544 }, { "epoch": 9.035190615835777, - "grad_norm": 0.3711710253715166, + "grad_norm": 0.23610643970912315, "learning_rate": 4.779123190564601e-06, - "loss": 0.0615, - "mean_token_accuracy": 0.9825925230979919, + "loss": 0.0624, + "mean_token_accuracy": 0.9810673967003822, "step": 1545 }, { "epoch": 9.041055718475073, - "grad_norm": 0.363510890139849, + "grad_norm": 0.22732253531742116, "learning_rate": 4.769174407121508e-06, - "loss": 0.0514, - "mean_token_accuracy": 0.982509970664978, + "loss": 0.0518, + "mean_token_accuracy": 0.9831937104463577, "step": 1546 }, { "epoch": 9.04692082111437, - "grad_norm": 0.3243226197829895, + "grad_norm": 0.1962674139871125, "learning_rate": 4.7592881646162336e-06, - "loss": 0.0623, - "mean_token_accuracy": 0.9804322570562363, + "loss": 0.0627, + "mean_token_accuracy": 0.979876346886158, "step": 1547 }, { "epoch": 9.052785923753666, - "grad_norm": 0.3894419167340949, + "grad_norm": 0.2539982090762478, "learning_rate": 4.749464498931852e-06, - "loss": 0.0456, - "mean_token_accuracy": 0.9836910218000412, + "loss": 0.0461, + "mean_token_accuracy": 0.9838007912039757, "step": 1548 }, { "epoch": 9.058651026392962, - "grad_norm": 0.3150320531942675, + "grad_norm": 0.2087119802614843, "learning_rate": 4.739703445724296e-06, - "loss": 0.0538, - "mean_token_accuracy": 0.9851875305175781, + "loss": 0.0558, + "mean_token_accuracy": 0.9843602925539017, "step": 1549 }, { "epoch": 9.064516129032258, - "grad_norm": 0.32325587902059943, + "grad_norm": 0.329300231457707, "learning_rate": 4.730005040422253e-06, - "loss": 0.0491, - "mean_token_accuracy": 0.9839693009853363, + "loss": 0.0505, + "mean_token_accuracy": 0.9836215823888779, "step": 1550 }, { "epoch": 9.070381231671554, - "grad_norm": 0.33343639062744773, + "grad_norm": 0.22188010507420589, "learning_rate": 4.720369318227014e-06, - "loss": 0.049, - "mean_token_accuracy": 0.9838002175092697, + "loss": 0.0496, + "mean_token_accuracy": 0.9835997819900513, "step": 1551 }, { "epoch": 9.07624633431085, - "grad_norm": 0.3335968019145587, + "grad_norm": 0.2046135402728503, "learning_rate": 4.710796314112358e-06, - "loss": 0.0527, - "mean_token_accuracy": 0.9830645993351936, + "loss": 0.0535, + "mean_token_accuracy": 0.9827775880694389, "step": 1552 }, { "epoch": 9.082111436950147, - "grad_norm": 0.35686275798991973, + "grad_norm": 0.22732327773733813, "learning_rate": 4.701286062824425e-06, - "loss": 0.0507, - "mean_token_accuracy": 0.9826326817274094, + "loss": 0.051, + "mean_token_accuracy": 0.9828957170248032, "step": 1553 }, { "epoch": 9.087976539589443, - "grad_norm": 0.39345801699794963, + "grad_norm": 0.242034790415479, "learning_rate": 4.691838598881587e-06, - "loss": 0.0547, - "mean_token_accuracy": 0.9825649484992027, + "loss": 0.0563, + "mean_token_accuracy": 0.9820270240306854, "step": 1554 }, { "epoch": 9.093841642228739, - "grad_norm": 0.3179764795801783, + "grad_norm": 0.20508126696203113, "learning_rate": 4.68245395657432e-06, "loss": 0.0518, - "mean_token_accuracy": 0.9854598566889763, + "mean_token_accuracy": 0.984300933778286, "step": 1555 }, { "epoch": 9.099706744868035, - "grad_norm": 0.3236361805001554, + "grad_norm": 0.20201114000920736, "learning_rate": 4.673132169965089e-06, - "loss": 0.0509, - "mean_token_accuracy": 0.9836238846182823, + "loss": 0.0516, + "mean_token_accuracy": 0.984064131975174, "step": 1556 }, { "epoch": 9.105571847507331, - "grad_norm": 0.30731096453465834, + "grad_norm": 0.18703109160266437, "learning_rate": 4.663873272888212e-06, - "loss": 0.0474, - "mean_token_accuracy": 0.986683115363121, + "loss": 0.0475, + "mean_token_accuracy": 0.9858269169926643, "step": 1557 }, { "epoch": 9.111436950146627, - "grad_norm": 0.306870446012294, + "grad_norm": 0.1948057279735745, "learning_rate": 4.654677298949746e-06, - "loss": 0.0514, - "mean_token_accuracy": 0.9820300340652466, + "loss": 0.0524, + "mean_token_accuracy": 0.9818159490823746, "step": 1558 }, { "epoch": 9.117302052785924, - "grad_norm": 0.3221981747722701, + "grad_norm": 0.2596533136977546, "learning_rate": 4.645544281527362e-06, - "loss": 0.0506, - "mean_token_accuracy": 0.9824572280049324, + "loss": 0.0547, + "mean_token_accuracy": 0.9818684533238411, "step": 1559 }, { "epoch": 9.12316715542522, - "grad_norm": 0.30207536729210593, + "grad_norm": 0.20182485325445904, "learning_rate": 4.636474253770226e-06, - "loss": 0.0453, - "mean_token_accuracy": 0.9838585555553436, + "loss": 0.0456, + "mean_token_accuracy": 0.9834398329257965, "step": 1560 }, { "epoch": 9.129032258064516, - "grad_norm": 0.3241825961068261, + "grad_norm": 0.21517039851897404, "learning_rate": 4.627467248598876e-06, - "loss": 0.0524, - "mean_token_accuracy": 0.9826252236962318, + "loss": 0.0532, + "mean_token_accuracy": 0.9830020219087601, "step": 1561 }, { "epoch": 9.134897360703812, - "grad_norm": 0.3384107887549902, + "grad_norm": 0.22131489560960235, "learning_rate": 4.618523298705101e-06, - "loss": 0.0512, - "mean_token_accuracy": 0.9828469306230545, + "loss": 0.0524, + "mean_token_accuracy": 0.98325315117836, "step": 1562 }, { "epoch": 9.140762463343108, - "grad_norm": 0.3418975656638684, + "grad_norm": 0.252039124629233, "learning_rate": 4.609642436551828e-06, - "loss": 0.0503, - "mean_token_accuracy": 0.9833681285381317, + "loss": 0.0516, + "mean_token_accuracy": 0.9828849211335182, "step": 1563 }, { "epoch": 9.146627565982405, - "grad_norm": 0.3144109232730043, + "grad_norm": 0.19354071592940614, "learning_rate": 4.600824694373e-06, - "loss": 0.0476, - "mean_token_accuracy": 0.9852636978030205, + "loss": 0.0482, + "mean_token_accuracy": 0.9855695217847824, "step": 1564 }, { "epoch": 9.1524926686217, - "grad_norm": 0.34715116832048837, + "grad_norm": 0.21751293046724127, "learning_rate": 4.592070104173461e-06, - "loss": 0.0502, - "mean_token_accuracy": 0.9837682098150253, + "loss": 0.0514, + "mean_token_accuracy": 0.9841759204864502, "step": 1565 }, { "epoch": 9.158357771260997, - "grad_norm": 0.32008000943713794, + "grad_norm": 0.20587402678942707, "learning_rate": 4.583378697728835e-06, - "loss": 0.0523, - "mean_token_accuracy": 0.9823531731963158, + "loss": 0.0527, + "mean_token_accuracy": 0.9819434061646461, "step": 1566 }, { "epoch": 9.164222873900293, - "grad_norm": 0.3357093036143234, + "grad_norm": 0.20305161489535564, "learning_rate": 4.574750506585419e-06, - "loss": 0.0478, - "mean_token_accuracy": 0.982313483953476, + "loss": 0.0481, + "mean_token_accuracy": 0.9825012981891632, "step": 1567 }, { "epoch": 9.17008797653959, - "grad_norm": 0.35978845877064625, + "grad_norm": 0.235658681817122, "learning_rate": 4.566185562060062e-06, - "loss": 0.0562, - "mean_token_accuracy": 0.9809844046831131, + "loss": 0.0568, + "mean_token_accuracy": 0.9813750684261322, "step": 1568 }, { "epoch": 9.175953079178885, - "grad_norm": 0.33646551155384047, + "grad_norm": 0.23612311691624208, "learning_rate": 4.557683895240052e-06, - "loss": 0.0552, - "mean_token_accuracy": 0.9836227148771286, + "loss": 0.0568, + "mean_token_accuracy": 0.9832897707819939, "step": 1569 }, { "epoch": 9.181818181818182, - "grad_norm": 0.4084517717867541, + "grad_norm": 0.3283150789529255, "learning_rate": 4.549245536983009e-06, - "loss": 0.0523, - "mean_token_accuracy": 0.9832220077514648, + "loss": 0.0528, + "mean_token_accuracy": 0.9832657352089882, "step": 1570 }, { "epoch": 9.187683284457478, - "grad_norm": 0.3800460400014487, + "grad_norm": 0.2399360533000912, "learning_rate": 4.540870517916765e-06, - "loss": 0.0515, - "mean_token_accuracy": 0.9843539819121361, + "loss": 0.0525, + "mean_token_accuracy": 0.9828476384282112, "step": 1571 }, { "epoch": 9.193548387096774, - "grad_norm": 0.3875506291047543, + "grad_norm": 0.22784069298248552, "learning_rate": 4.532558868439249e-06, - "loss": 0.0538, - "mean_token_accuracy": 0.9836299493908882, + "loss": 0.0543, + "mean_token_accuracy": 0.9832234308123589, "step": 1572 }, { "epoch": 9.19941348973607, - "grad_norm": 0.3339053484134062, + "grad_norm": 0.2026105144234623, "learning_rate": 4.524310618718403e-06, - "loss": 0.0506, - "mean_token_accuracy": 0.9838271215558052, + "loss": 0.0517, + "mean_token_accuracy": 0.9834396690130234, "step": 1573 }, { "epoch": 9.205278592375366, - "grad_norm": 0.3442412710070444, + "grad_norm": 0.22618016171675648, "learning_rate": 4.516125798692037e-06, - "loss": 0.052, - "mean_token_accuracy": 0.983096070587635, + "loss": 0.0523, + "mean_token_accuracy": 0.9837358370423317, "step": 1574 }, { "epoch": 9.211143695014663, - "grad_norm": 0.3751785309738875, + "grad_norm": 0.23485766896731308, "learning_rate": 4.508004438067742e-06, - "loss": 0.0556, - "mean_token_accuracy": 0.9821300804615021, + "loss": 0.0568, + "mean_token_accuracy": 0.9811383709311485, "step": 1575 }, { "epoch": 9.217008797653959, - "grad_norm": 0.3431266472138059, + "grad_norm": 0.21454767504403688, "learning_rate": 4.4999465663227785e-06, - "loss": 0.0487, - "mean_token_accuracy": 0.9844409078359604, + "loss": 0.0494, + "mean_token_accuracy": 0.9843669608235359, "step": 1576 }, { "epoch": 9.222873900293255, - "grad_norm": 0.31684104303608773, + "grad_norm": 0.30069431080169556, "learning_rate": 4.491952212703964e-06, - "loss": 0.0522, - "mean_token_accuracy": 0.9832335263490677, + "loss": 0.055, + "mean_token_accuracy": 0.9814217612147331, "step": 1577 }, { "epoch": 9.228739002932551, - "grad_norm": 0.3484048489842224, + "grad_norm": 0.21290384341664184, "learning_rate": 4.484021406227576e-06, - "loss": 0.0521, - "mean_token_accuracy": 0.9830200746655464, + "loss": 0.0535, + "mean_token_accuracy": 0.9824668392539024, "step": 1578 }, { "epoch": 9.234604105571847, - "grad_norm": 0.37348106814548215, + "grad_norm": 0.2559674489279034, "learning_rate": 4.476154175679239e-06, - "loss": 0.0566, - "mean_token_accuracy": 0.9807815030217171, + "loss": 0.0569, + "mean_token_accuracy": 0.9815589785575867, "step": 1579 }, { "epoch": 9.240469208211143, - "grad_norm": 0.3465806459457609, + "grad_norm": 0.21187515075412042, "learning_rate": 4.468350549613822e-06, - "loss": 0.0449, - "mean_token_accuracy": 0.9852471351623535, + "loss": 0.0457, + "mean_token_accuracy": 0.9843628779053688, "step": 1580 }, { "epoch": 9.24633431085044, - "grad_norm": 0.3801664719699823, + "grad_norm": 0.2721970935911764, "learning_rate": 4.460610556355333e-06, - "loss": 0.0576, - "mean_token_accuracy": 0.9800804182887077, + "loss": 0.0578, + "mean_token_accuracy": 0.9794795215129852, "step": 1581 }, { "epoch": 9.252199413489736, - "grad_norm": 0.3195371027598601, + "grad_norm": 0.20751982847364947, "learning_rate": 4.452934223996824e-06, - "loss": 0.0483, - "mean_token_accuracy": 0.9841778427362442, + "loss": 0.0491, + "mean_token_accuracy": 0.9839389324188232, "step": 1582 }, { "epoch": 9.258064516129032, - "grad_norm": 0.31523364079250155, + "grad_norm": 0.3425707212666822, "learning_rate": 4.445321580400281e-06, - "loss": 0.0504, - "mean_token_accuracy": 0.9817590713500977, + "loss": 0.0518, + "mean_token_accuracy": 0.9814067259430885, "step": 1583 }, { "epoch": 9.263929618768328, - "grad_norm": 0.34571299346795814, + "grad_norm": 0.2169421398120668, "learning_rate": 4.437772653196527e-06, - "loss": 0.0555, - "mean_token_accuracy": 0.9840084314346313, + "loss": 0.0558, + "mean_token_accuracy": 0.9849090501666069, "step": 1584 }, { "epoch": 9.269794721407624, - "grad_norm": 0.3826595302078001, + "grad_norm": 0.2442231991992943, "learning_rate": 4.430287469785118e-06, - "loss": 0.063, - "mean_token_accuracy": 0.9781069308519363, + "loss": 0.0639, + "mean_token_accuracy": 0.9791273474693298, "step": 1585 }, { "epoch": 9.27565982404692, - "grad_norm": 0.45325381772938556, + "grad_norm": 0.2636751350455331, "learning_rate": 4.422866057334246e-06, - "loss": 0.0571, - "mean_token_accuracy": 0.9835334494709969, + "loss": 0.0563, + "mean_token_accuracy": 0.9829349890351295, "step": 1586 }, { "epoch": 9.281524926686217, - "grad_norm": 0.3544263748847487, + "grad_norm": 0.22338220479863272, "learning_rate": 4.415508442780642e-06, - "loss": 0.0571, - "mean_token_accuracy": 0.9812069460749626, + "loss": 0.057, + "mean_token_accuracy": 0.9812466502189636, "step": 1587 }, { "epoch": 9.287390029325513, - "grad_norm": 0.366957343280142, + "grad_norm": 0.2211547461370018, "learning_rate": 4.408214652829473e-06, "loss": 0.0557, - "mean_token_accuracy": 0.9829774498939514, + "mean_token_accuracy": 0.9828761592507362, "step": 1588 }, { "epoch": 9.29325513196481, - "grad_norm": 0.31643775338518065, + "grad_norm": 0.19198791621262234, "learning_rate": 4.400984713954253e-06, - "loss": 0.044, - "mean_token_accuracy": 0.9852609634399414, + "loss": 0.0444, + "mean_token_accuracy": 0.985915943980217, "step": 1589 }, { "epoch": 9.299120234604105, - "grad_norm": 0.3258433860675811, + "grad_norm": 0.21221727583615227, "learning_rate": 4.39381865239674e-06, - "loss": 0.0577, - "mean_token_accuracy": 0.9815997928380966, + "loss": 0.0594, + "mean_token_accuracy": 0.9805979430675507, "step": 1590 }, { "epoch": 9.304985337243401, - "grad_norm": 0.3809434963849625, + "grad_norm": 0.24745045411421357, "learning_rate": 4.386716494166842e-06, - "loss": 0.055, - "mean_token_accuracy": 0.981240376830101, + "loss": 0.0566, + "mean_token_accuracy": 0.9807746484875679, "step": 1591 }, { "epoch": 9.310850439882698, - "grad_norm": 0.39655430748262316, + "grad_norm": 0.2576436309116627, "learning_rate": 4.379678265042529e-06, - "loss": 0.0544, - "mean_token_accuracy": 0.9810396283864975, + "loss": 0.0557, + "mean_token_accuracy": 0.9808614030480385, "step": 1592 }, { "epoch": 9.316715542521994, - "grad_norm": 0.3706407474165295, + "grad_norm": 0.23874035411062744, "learning_rate": 4.372703990569725e-06, - "loss": 0.0543, - "mean_token_accuracy": 0.9823009446263313, + "loss": 0.0561, + "mean_token_accuracy": 0.9833986386656761, "step": 1593 }, { "epoch": 9.32258064516129, - "grad_norm": 0.36956199306402604, + "grad_norm": 0.23684781792614845, "learning_rate": 4.365793696062231e-06, - "loss": 0.056, - "mean_token_accuracy": 0.9810444936156273, + "loss": 0.0575, + "mean_token_accuracy": 0.9813083857297897, "step": 1594 }, { "epoch": 9.328445747800586, - "grad_norm": 0.36114052744290454, + "grad_norm": 0.23172577740881103, "learning_rate": 4.358947406601626e-06, - "loss": 0.0496, - "mean_token_accuracy": 0.9843268916010857, + "loss": 0.0505, + "mean_token_accuracy": 0.9838670641183853, "step": 1595 }, { "epoch": 9.334310850439882, - "grad_norm": 0.29903243274355235, + "grad_norm": 0.32230792952388126, "learning_rate": 4.352165147037177e-06, - "loss": 0.0532, - "mean_token_accuracy": 0.9818791821599007, + "loss": 0.054, + "mean_token_accuracy": 0.9821716770529747, "step": 1596 }, { "epoch": 9.340175953079179, - "grad_norm": 0.3402033677627262, + "grad_norm": 0.22343750102594195, "learning_rate": 4.345446941985741e-06, - "loss": 0.0497, - "mean_token_accuracy": 0.9822636842727661, + "loss": 0.0507, + "mean_token_accuracy": 0.9818222150206566, "step": 1597 }, { "epoch": 9.346041055718475, - "grad_norm": 0.3216326624615543, + "grad_norm": 0.20437357064777317, "learning_rate": 4.338792815831698e-06, - "loss": 0.05, - "mean_token_accuracy": 0.9808618873357773, + "loss": 0.0531, + "mean_token_accuracy": 0.9799600020051003, "step": 1598 }, { "epoch": 9.351906158357771, - "grad_norm": 0.3981133789009082, + "grad_norm": 0.3676726838740866, "learning_rate": 4.332202792726832e-06, - "loss": 0.06, - "mean_token_accuracy": 0.9808676987886429, + "loss": 0.0602, + "mean_token_accuracy": 0.9805522412061691, "step": 1599 }, { "epoch": 9.357771260997067, - "grad_norm": 0.3730359853930091, + "grad_norm": 0.23353133360423867, "learning_rate": 4.3256768965902684e-06, - "loss": 0.056, - "mean_token_accuracy": 0.9798526018857956, + "loss": 0.058, + "mean_token_accuracy": 0.9794843345880508, "step": 1600 }, { "epoch": 9.363636363636363, - "grad_norm": 0.38448222563283363, + "grad_norm": 0.27706490550836466, "learning_rate": 4.319215151108373e-06, - "loss": 0.0645, - "mean_token_accuracy": 0.9782325327396393, + "loss": 0.066, + "mean_token_accuracy": 0.978282243013382, "step": 1601 }, { "epoch": 9.36950146627566, - "grad_norm": 0.34869744775882194, + "grad_norm": 0.23756310821917054, "learning_rate": 4.312817579734673e-06, - "loss": 0.0515, - "mean_token_accuracy": 0.9844043850898743, + "loss": 0.0527, + "mean_token_accuracy": 0.9850875586271286, "step": 1602 }, { "epoch": 9.375366568914956, - "grad_norm": 0.35794045824975934, + "grad_norm": 0.22823060889515634, "learning_rate": 4.306484205689768e-06, - "loss": 0.0575, - "mean_token_accuracy": 0.9812995940446854, + "loss": 0.0584, + "mean_token_accuracy": 0.9804537519812584, "step": 1603 }, { "epoch": 9.381231671554252, - "grad_norm": 0.3536277186740108, + "grad_norm": 0.22815559277400072, "learning_rate": 4.300215051961248e-06, - "loss": 0.0574, - "mean_token_accuracy": 0.9826265349984169, + "loss": 0.058, + "mean_token_accuracy": 0.9819799065589905, "step": 1604 }, { "epoch": 9.387096774193548, - "grad_norm": 0.34843689510087195, + "grad_norm": 0.21826257691327555, "learning_rate": 4.2940101413036115e-06, - "loss": 0.0499, - "mean_token_accuracy": 0.9845296069979668, + "loss": 0.051, + "mean_token_accuracy": 0.9844619482755661, "step": 1605 }, { "epoch": 9.392961876832844, - "grad_norm": 0.3464031945943138, + "grad_norm": 0.2655912410779273, "learning_rate": 4.287869496238174e-06, - "loss": 0.0597, - "mean_token_accuracy": 0.9806225821375847, + "loss": 0.0606, + "mean_token_accuracy": 0.9803177416324615, "step": 1606 }, { "epoch": 9.39882697947214, - "grad_norm": 0.33197516109156383, + "grad_norm": 0.2083047435091801, "learning_rate": 4.281793139053001e-06, - "loss": 0.0522, - "mean_token_accuracy": 0.9814345613121986, + "loss": 0.0533, + "mean_token_accuracy": 0.9825778752565384, "step": 1607 }, { "epoch": 9.404692082111437, - "grad_norm": 0.38784078972064945, + "grad_norm": 0.22682792436802207, "learning_rate": 4.275781091802811e-06, - "loss": 0.0671, - "mean_token_accuracy": 0.9796320497989655, + "loss": 0.0675, + "mean_token_accuracy": 0.9798205196857452, "step": 1608 }, { "epoch": 9.410557184750733, - "grad_norm": 0.4182267076466508, + "grad_norm": 0.26954582422977497, "learning_rate": 4.26983337630891e-06, - "loss": 0.0552, - "mean_token_accuracy": 0.9823887571692467, + "loss": 0.0562, + "mean_token_accuracy": 0.9811637997627258, "step": 1609 }, { "epoch": 9.416422287390029, - "grad_norm": 0.40263024742952774, + "grad_norm": 0.2744435039549544, "learning_rate": 4.263950014159103e-06, - "loss": 0.0566, - "mean_token_accuracy": 0.9805941879749298, + "loss": 0.0586, + "mean_token_accuracy": 0.9808680191636086, "step": 1610 }, { "epoch": 9.422287390029325, - "grad_norm": 0.3184552151743527, + "grad_norm": 0.2074828213741738, "learning_rate": 4.258131026707618e-06, - "loss": 0.0486, - "mean_token_accuracy": 0.9834803640842438, + "loss": 0.0492, + "mean_token_accuracy": 0.9836182445287704, "step": 1611 }, { "epoch": 9.428152492668621, - "grad_norm": 0.34850095126657826, + "grad_norm": 0.21893167411728395, "learning_rate": 4.2523764350750305e-06, - "loss": 0.0577, - "mean_token_accuracy": 0.9823050573468208, + "loss": 0.0587, + "mean_token_accuracy": 0.9815453141927719, "step": 1612 }, { "epoch": 9.434017595307918, - "grad_norm": 0.35877944489864333, + "grad_norm": 0.20889606834325938, "learning_rate": 4.246686260148179e-06, "loss": 0.0529, - "mean_token_accuracy": 0.9828868806362152, + "mean_token_accuracy": 0.9829302057623863, "step": 1613 }, { "epoch": 9.439882697947214, - "grad_norm": 0.41578422834006884, + "grad_norm": 0.2541914199711263, "learning_rate": 4.241060522580108e-06, - "loss": 0.0644, - "mean_token_accuracy": 0.9790749028325081, + "loss": 0.065, + "mean_token_accuracy": 0.9794387221336365, "step": 1614 }, { "epoch": 9.44574780058651, - "grad_norm": 0.3807165143700813, + "grad_norm": 0.24296870473167126, "learning_rate": 4.2354992427899674e-06, - "loss": 0.0504, - "mean_token_accuracy": 0.9833511561155319, + "loss": 0.0513, + "mean_token_accuracy": 0.9835287109017372, "step": 1615 }, { "epoch": 9.451612903225806, - "grad_norm": 0.3508137711756749, + "grad_norm": 0.24498569083766547, "learning_rate": 4.23000244096296e-06, - "loss": 0.053, - "mean_token_accuracy": 0.9821149632334709, + "loss": 0.0545, + "mean_token_accuracy": 0.981740228831768, "step": 1616 }, { "epoch": 9.457478005865102, - "grad_norm": 0.34790844523575315, + "grad_norm": 0.2085082101365372, "learning_rate": 4.224570137050254e-06, - "loss": 0.0439, - "mean_token_accuracy": 0.9859587997198105, + "loss": 0.0443, + "mean_token_accuracy": 0.9854484051465988, "step": 1617 }, { "epoch": 9.463343108504398, - "grad_norm": 0.3069321947925801, + "grad_norm": 0.19598121580283992, "learning_rate": 4.219202350768919e-06, - "loss": 0.0538, - "mean_token_accuracy": 0.9819561988115311, + "loss": 0.0545, + "mean_token_accuracy": 0.9815690070390701, "step": 1618 }, { "epoch": 9.469208211143695, - "grad_norm": 0.33995148789180835, + "grad_norm": 0.20469125288940168, "learning_rate": 4.213899101601853e-06, "loss": 0.0546, - "mean_token_accuracy": 0.9821692854166031, + "mean_token_accuracy": 0.9825413748621941, "step": 1619 }, { "epoch": 9.47507331378299, - "grad_norm": 0.33004647287525, + "grad_norm": 0.2160546209456866, "learning_rate": 4.208660408797708e-06, - "loss": 0.054, - "mean_token_accuracy": 0.9837897270917892, + "loss": 0.0546, + "mean_token_accuracy": 0.9825544431805611, "step": 1620 }, { "epoch": 9.480938416422287, - "grad_norm": 0.34405905205242354, + "grad_norm": 0.2126729448550127, "learning_rate": 4.203486291370821e-06, "loss": 0.0526, - "mean_token_accuracy": 0.9838336259126663, + "mean_token_accuracy": 0.9841529130935669, "step": 1621 }, { "epoch": 9.486803519061583, - "grad_norm": 0.3825920637539007, + "grad_norm": 0.24411839934891127, "learning_rate": 4.198376768101149e-06, - "loss": 0.0611, - "mean_token_accuracy": 0.9824720919132233, + "loss": 0.0627, + "mean_token_accuracy": 0.9811530858278275, "step": 1622 }, { "epoch": 9.49266862170088, - "grad_norm": 0.39337900293357947, + "grad_norm": 0.23545992883807737, "learning_rate": 4.193331857534198e-06, - "loss": 0.0506, - "mean_token_accuracy": 0.9829727709293365, + "loss": 0.0515, + "mean_token_accuracy": 0.9828750714659691, "step": 1623 }, { "epoch": 9.498533724340176, - "grad_norm": 0.33098384606131753, + "grad_norm": 0.2004862274519284, "learning_rate": 4.188351577980961e-06, - "loss": 0.048, - "mean_token_accuracy": 0.9843248054385185, + "loss": 0.0486, + "mean_token_accuracy": 0.9842849373817444, "step": 1624 }, { "epoch": 9.504398826979472, - "grad_norm": 0.3400217049112287, + "grad_norm": 0.20266526447683697, "learning_rate": 4.183435947517836e-06, - "loss": 0.0504, - "mean_token_accuracy": 0.9832568988204002, + "loss": 0.0511, + "mean_token_accuracy": 0.9830767437815666, "step": 1625 }, { "epoch": 9.510263929618768, - "grad_norm": 0.318142451201231, + "grad_norm": 0.19607531767878525, "learning_rate": 4.178584983986575e-06, - "loss": 0.0451, - "mean_token_accuracy": 0.9853277578949928, + "loss": 0.0457, + "mean_token_accuracy": 0.9845829010009766, "step": 1626 }, { "epoch": 9.516129032258064, - "grad_norm": 0.31249425970359535, + "grad_norm": 0.1831840832037411, "learning_rate": 4.173798704994221e-06, - "loss": 0.05, - "mean_token_accuracy": 0.9839732199907303, + "loss": 0.0502, + "mean_token_accuracy": 0.984029233455658, "step": 1627 }, { "epoch": 9.52199413489736, - "grad_norm": 0.34213460131037277, + "grad_norm": 0.22600559226813188, "learning_rate": 4.169077127913031e-06, - "loss": 0.0569, - "mean_token_accuracy": 0.9797552153468132, + "loss": 0.0611, + "mean_token_accuracy": 0.9794282466173172, "step": 1628 }, { "epoch": 9.527859237536656, - "grad_norm": 0.34521954141682165, + "grad_norm": 0.26377141694821216, "learning_rate": 4.164420269880422e-06, - "loss": 0.0537, - "mean_token_accuracy": 0.9796594232320786, + "loss": 0.0541, + "mean_token_accuracy": 0.9801126793026924, "step": 1629 }, { "epoch": 9.533724340175953, - "grad_norm": 0.3561614042212147, + "grad_norm": 0.222192265353475, "learning_rate": 4.159828147798914e-06, - "loss": 0.0495, - "mean_token_accuracy": 0.9844975918531418, + "loss": 0.0502, + "mean_token_accuracy": 0.98480124771595, "step": 1630 }, { "epoch": 9.539589442815249, - "grad_norm": 0.33458897730895226, + "grad_norm": 0.19974100668537378, "learning_rate": 4.155300778336047e-06, - "loss": 0.053, - "mean_token_accuracy": 0.9825539439916611, + "loss": 0.0541, + "mean_token_accuracy": 0.9811222404241562, "step": 1631 }, { "epoch": 9.545454545454545, - "grad_norm": 0.41386586180304163, + "grad_norm": 0.24575424610156923, "learning_rate": 4.150838177924349e-06, - "loss": 0.0516, - "mean_token_accuracy": 0.9856827855110168, + "loss": 0.0525, + "mean_token_accuracy": 0.9847328960895538, "step": 1632 }, { "epoch": 9.551319648093841, - "grad_norm": 0.2858406837497521, + "grad_norm": 0.19911462815470113, "learning_rate": 4.146440362761256e-06, - "loss": 0.0526, - "mean_token_accuracy": 0.9834114909172058, + "loss": 0.0539, + "mean_token_accuracy": 0.9825382307171822, "step": 1633 }, { "epoch": 9.557184750733137, - "grad_norm": 0.3319322824824308, + "grad_norm": 0.20741651683611387, "learning_rate": 4.142107348809058e-06, - "loss": 0.0591, - "mean_token_accuracy": 0.9804484695196152, + "loss": 0.0595, + "mean_token_accuracy": 0.9808006957173347, "step": 1634 }, { "epoch": 9.563049853372434, - "grad_norm": 0.36873534696604005, + "grad_norm": 0.2183255912044281, "learning_rate": 4.1378391517948505e-06, - "loss": 0.0489, - "mean_token_accuracy": 0.9854116439819336, + "loss": 0.0492, + "mean_token_accuracy": 0.9852641671895981, "step": 1635 }, { "epoch": 9.56891495601173, - "grad_norm": 0.371287763627244, + "grad_norm": 0.20745898164282808, "learning_rate": 4.1336357872104614e-06, - "loss": 0.0545, - "mean_token_accuracy": 0.982661671936512, + "loss": 0.0556, + "mean_token_accuracy": 0.983629435300827, "step": 1636 }, { "epoch": 9.574780058651026, - "grad_norm": 0.31981684222677925, + "grad_norm": 0.22035964377547398, "learning_rate": 4.12949727031241e-06, - "loss": 0.0552, - "mean_token_accuracy": 0.9830471277236938, + "loss": 0.0556, + "mean_token_accuracy": 0.9830387458205223, "step": 1637 }, { "epoch": 9.580645161290322, - "grad_norm": 0.3496716369106886, + "grad_norm": 0.24377604786682897, "learning_rate": 4.125423616121837e-06, - "loss": 0.0492, - "mean_token_accuracy": 0.984118863940239, + "loss": 0.0509, + "mean_token_accuracy": 0.9837193712592125, "step": 1638 }, { "epoch": 9.586510263929618, - "grad_norm": 0.3019605956233339, + "grad_norm": 0.17631889494983286, "learning_rate": 4.121414839424464e-06, - "loss": 0.0512, - "mean_token_accuracy": 0.983857087790966, + "loss": 0.0516, + "mean_token_accuracy": 0.9836119785904884, "step": 1639 }, { "epoch": 9.592375366568914, - "grad_norm": 0.38743933966546673, + "grad_norm": 0.2222094804818117, "learning_rate": 4.117470954770529e-06, - "loss": 0.0575, - "mean_token_accuracy": 0.9823267832398415, + "loss": 0.0576, + "mean_token_accuracy": 0.9822694733738899, "step": 1640 }, { "epoch": 9.59824046920821, - "grad_norm": 0.2820952797381847, + "grad_norm": 0.17607834866947364, "learning_rate": 4.1135919764747454e-06, - "loss": 0.0487, - "mean_token_accuracy": 0.9831160977482796, + "loss": 0.0496, + "mean_token_accuracy": 0.9828854873776436, "step": 1641 }, { "epoch": 9.604105571847507, - "grad_norm": 0.33032888287751866, + "grad_norm": 0.19893350799495865, "learning_rate": 4.109777918616235e-06, - "loss": 0.0546, - "mean_token_accuracy": 0.9848859757184982, + "loss": 0.0557, + "mean_token_accuracy": 0.9839456379413605, "step": 1642 }, { "epoch": 9.609970674486803, - "grad_norm": 0.3447439897843936, + "grad_norm": 0.21733849157484478, "learning_rate": 4.106028795038487e-06, - "loss": 0.0559, - "mean_token_accuracy": 0.9810968413949013, + "loss": 0.0569, + "mean_token_accuracy": 0.9807590320706367, "step": 1643 }, { "epoch": 9.6158357771261, - "grad_norm": 0.39380441477471423, + "grad_norm": 0.2568971208282156, "learning_rate": 4.102344619349307e-06, - "loss": 0.064, - "mean_token_accuracy": 0.9785462468862534, + "loss": 0.065, + "mean_token_accuracy": 0.9786679521203041, "step": 1644 }, { "epoch": 9.621700879765395, - "grad_norm": 0.3795143444733662, + "grad_norm": 0.2360837765560061, "learning_rate": 4.098725404920763e-06, - "loss": 0.0614, - "mean_token_accuracy": 0.9801774621009827, + "loss": 0.0621, + "mean_token_accuracy": 0.9808361381292343, "step": 1645 }, { "epoch": 9.627565982404692, - "grad_norm": 0.4239571722097151, + "grad_norm": 0.2611634274793871, "learning_rate": 4.095171164889143e-06, - "loss": 0.0524, - "mean_token_accuracy": 0.9822197332978249, + "loss": 0.0536, + "mean_token_accuracy": 0.9824008718132973, "step": 1646 }, { "epoch": 9.633431085043988, - "grad_norm": 0.3279893196301856, + "grad_norm": 0.21117695398237793, "learning_rate": 4.091681912154903e-06, - "loss": 0.0525, - "mean_token_accuracy": 0.9810535982251167, + "loss": 0.0545, + "mean_token_accuracy": 0.9811904802918434, "step": 1647 }, { "epoch": 9.639296187683284, - "grad_norm": 0.4146256842051382, + "grad_norm": 0.2557544201554423, "learning_rate": 4.088257659382619e-06, - "loss": 0.0678, - "mean_token_accuracy": 0.9781135395169258, + "loss": 0.069, + "mean_token_accuracy": 0.9788208901882172, "step": 1648 }, { "epoch": 9.64516129032258, - "grad_norm": 0.41514329438471853, + "grad_norm": 0.2513606648386977, "learning_rate": 4.0848984190009495e-06, - "loss": 0.0587, - "mean_token_accuracy": 0.9801534190773964, + "loss": 0.06, + "mean_token_accuracy": 0.9795089364051819, "step": 1649 }, { "epoch": 9.651026392961876, - "grad_norm": 0.2900172472788311, + "grad_norm": 0.18464453228546868, "learning_rate": 4.081604203202577e-06, - "loss": 0.0467, - "mean_token_accuracy": 0.9854440614581108, + "loss": 0.0473, + "mean_token_accuracy": 0.985549159348011, "step": 1650 }, { "epoch": 9.656891495601172, - "grad_norm": 0.3171073659721082, + "grad_norm": 0.18744575277483974, "learning_rate": 4.078375023944175e-06, - "loss": 0.0528, - "mean_token_accuracy": 0.983460322022438, + "loss": 0.053, + "mean_token_accuracy": 0.983508974313736, "step": 1651 }, { "epoch": 9.662756598240469, - "grad_norm": 0.3618376086906897, + "grad_norm": 0.2210808736941966, "learning_rate": 4.0752108929463625e-06, - "loss": 0.0608, - "mean_token_accuracy": 0.9778873026371002, + "loss": 0.0615, + "mean_token_accuracy": 0.9779364317655563, "step": 1652 }, { "epoch": 9.668621700879765, - "grad_norm": 0.4064811886215041, + "grad_norm": 0.2443414624296323, "learning_rate": 4.072111821693655e-06, - "loss": 0.0569, - "mean_token_accuracy": 0.982071690261364, + "loss": 0.0579, + "mean_token_accuracy": 0.9824973717331886, "step": 1653 }, { "epoch": 9.674486803519061, - "grad_norm": 0.3487674151385284, + "grad_norm": 0.22949116375142498, "learning_rate": 4.069077821434429e-06, - "loss": 0.0603, - "mean_token_accuracy": 0.9813675135374069, + "loss": 0.06, + "mean_token_accuracy": 0.9825471565127373, "step": 1654 }, { "epoch": 9.680351906158357, - "grad_norm": 0.48683525747641127, + "grad_norm": 0.2321839216945802, "learning_rate": 4.06610890318088e-06, - "loss": 0.0505, - "mean_token_accuracy": 0.9826664626598358, + "loss": 0.0513, + "mean_token_accuracy": 0.9831982627511024, "step": 1655 }, { "epoch": 9.686217008797653, - "grad_norm": 0.27565303241857414, + "grad_norm": 0.17846779813822697, "learning_rate": 4.063205077708986e-06, - "loss": 0.0512, - "mean_token_accuracy": 0.9829757288098335, + "loss": 0.0515, + "mean_token_accuracy": 0.984035462141037, "step": 1656 }, { "epoch": 9.69208211143695, - "grad_norm": 0.4138815422448716, + "grad_norm": 0.26334148518013256, "learning_rate": 4.060366355558456e-06, - "loss": 0.0561, - "mean_token_accuracy": 0.9805786311626434, + "loss": 0.057, + "mean_token_accuracy": 0.9798618927598, "step": 1657 }, { "epoch": 9.697947214076246, - "grad_norm": 0.3521931887112191, + "grad_norm": 0.22995803605804366, "learning_rate": 4.057592747032707e-06, - "loss": 0.0651, - "mean_token_accuracy": 0.9799509420990944, + "loss": 0.0672, + "mean_token_accuracy": 0.9782466292381287, "step": 1658 }, { "epoch": 9.703812316715542, - "grad_norm": 0.36290266721596814, + "grad_norm": 0.2818986728639512, "learning_rate": 4.054884262198816e-06, - "loss": 0.0479, - "mean_token_accuracy": 0.9829519093036652, + "loss": 0.0484, + "mean_token_accuracy": 0.982783667743206, "step": 1659 }, { "epoch": 9.709677419354838, - "grad_norm": 0.29181690040474056, + "grad_norm": 0.18386103736306297, "learning_rate": 4.052240910887493e-06, - "loss": 0.0515, - "mean_token_accuracy": 0.9840708523988724, + "loss": 0.0526, + "mean_token_accuracy": 0.984276570379734, "step": 1660 }, { "epoch": 9.715542521994134, - "grad_norm": 0.3198053513374907, + "grad_norm": 0.2116890836344024, "learning_rate": 4.049662702693031e-06, - "loss": 0.0517, - "mean_token_accuracy": 0.9815531522035599, + "loss": 0.0525, + "mean_token_accuracy": 0.9818199425935745, "step": 1661 }, { "epoch": 9.72140762463343, - "grad_norm": 0.3567193819398695, + "grad_norm": 0.21630284866054866, "learning_rate": 4.047149646973288e-06, - "loss": 0.0534, - "mean_token_accuracy": 0.9812785014510155, + "loss": 0.0542, + "mean_token_accuracy": 0.9806619063019753, "step": 1662 }, { "epoch": 9.727272727272727, - "grad_norm": 0.3365250481082472, + "grad_norm": 0.21371977729637454, "learning_rate": 4.044701752849639e-06, - "loss": 0.05, - "mean_token_accuracy": 0.9830398857593536, + "loss": 0.0517, + "mean_token_accuracy": 0.9830491915345192, "step": 1663 }, { "epoch": 9.733137829912023, - "grad_norm": 0.3168117861192591, + "grad_norm": 0.19545948640696634, "learning_rate": 4.042319029206954e-06, - "loss": 0.0496, - "mean_token_accuracy": 0.9836679548025131, + "loss": 0.0497, + "mean_token_accuracy": 0.9841062873601913, "step": 1664 }, { "epoch": 9.739002932551319, - "grad_norm": 0.3083134474689967, + "grad_norm": 0.19578503009417925, "learning_rate": 4.040001484693553e-06, - "loss": 0.0487, - "mean_token_accuracy": 0.9842707514762878, + "loss": 0.0489, + "mean_token_accuracy": 0.9841093346476555, "step": 1665 }, { "epoch": 9.744868035190615, - "grad_norm": 0.3919955952416545, + "grad_norm": 0.2371983315755813, "learning_rate": 4.037749127721191e-06, - "loss": 0.0519, - "mean_token_accuracy": 0.9838557988405228, + "loss": 0.053, + "mean_token_accuracy": 0.9833494052290916, "step": 1666 }, { "epoch": 9.750733137829911, - "grad_norm": 0.29872178905205415, + "grad_norm": 0.19259742429604967, "learning_rate": 4.03556196646501e-06, - "loss": 0.049, - "mean_token_accuracy": 0.9841638430953026, + "loss": 0.0496, + "mean_token_accuracy": 0.9846142753958702, "step": 1667 }, { "epoch": 9.756598240469208, - "grad_norm": 0.33607599431555435, + "grad_norm": 0.22286796311809612, "learning_rate": 4.033440008863528e-06, - "loss": 0.0584, - "mean_token_accuracy": 0.9812700152397156, + "loss": 0.0594, + "mean_token_accuracy": 0.9808065742254257, "step": 1668 }, { "epoch": 9.762463343108504, - "grad_norm": 0.3394818397662197, + "grad_norm": 0.22025704687640948, "learning_rate": 4.031383262618588e-06, - "loss": 0.0586, - "mean_token_accuracy": 0.9813343957066536, + "loss": 0.0595, + "mean_token_accuracy": 0.9809862375259399, "step": 1669 }, { "epoch": 9.7683284457478, - "grad_norm": 0.38828667419079976, + "grad_norm": 0.24252690715905784, "learning_rate": 4.0293917351953505e-06, - "loss": 0.0538, - "mean_token_accuracy": 0.9834257811307907, + "loss": 0.0548, + "mean_token_accuracy": 0.9827341809868813, "step": 1670 }, { "epoch": 9.774193548387096, - "grad_norm": 0.33338280065712544, + "grad_norm": 0.20870009759134936, "learning_rate": 4.027465433822255e-06, - "loss": 0.0504, - "mean_token_accuracy": 0.98256666213274, + "loss": 0.0505, + "mean_token_accuracy": 0.9823991134762764, "step": 1671 }, { "epoch": 9.780058651026392, - "grad_norm": 0.3411185677369752, + "grad_norm": 0.21778727564935893, "learning_rate": 4.025604365490999e-06, - "loss": 0.0525, - "mean_token_accuracy": 0.9825597852468491, + "loss": 0.0533, + "mean_token_accuracy": 0.9838661327958107, "step": 1672 }, { "epoch": 9.785923753665688, - "grad_norm": 0.3212285265038135, + "grad_norm": 0.20561112970775772, "learning_rate": 4.0238085369565085e-06, - "loss": 0.0522, - "mean_token_accuracy": 0.9835484176874161, + "loss": 0.0531, + "mean_token_accuracy": 0.9828332811594009, "step": 1673 }, { "epoch": 9.791788856304985, - "grad_norm": 0.294204389979182, + "grad_norm": 0.18298749159069347, "learning_rate": 4.022077954736916e-06, - "loss": 0.0519, - "mean_token_accuracy": 0.9838820695877075, + "loss": 0.0529, + "mean_token_accuracy": 0.9837350472807884, "step": 1674 }, { "epoch": 9.79765395894428, - "grad_norm": 0.38254802196013876, + "grad_norm": 0.23044165771776254, "learning_rate": 4.020412625113535e-06, - "loss": 0.053, - "mean_token_accuracy": 0.9837944954633713, + "loss": 0.0531, + "mean_token_accuracy": 0.9837679117918015, "step": 1675 }, { "epoch": 9.803519061583577, - "grad_norm": 0.36617266625723327, + "grad_norm": 0.22929050117194524, "learning_rate": 4.018812554130839e-06, - "loss": 0.0638, - "mean_token_accuracy": 0.9815091416239738, + "loss": 0.0642, + "mean_token_accuracy": 0.9814090430736542, "step": 1676 }, { "epoch": 9.809384164222873, - "grad_norm": 0.39530260374994275, + "grad_norm": 0.24258828305446048, "learning_rate": 4.01727774759644e-06, - "loss": 0.0568, - "mean_token_accuracy": 0.9812266975641251, + "loss": 0.0579, + "mean_token_accuracy": 0.9815564751625061, "step": 1677 }, { "epoch": 9.81524926686217, - "grad_norm": 0.36224793191945875, + "grad_norm": 0.25774821859783087, "learning_rate": 4.0158082110810695e-06, - "loss": 0.0506, - "mean_token_accuracy": 0.9833802804350853, + "loss": 0.0522, + "mean_token_accuracy": 0.9827143624424934, "step": 1678 }, { "epoch": 9.821114369501466, - "grad_norm": 0.3461078001305434, + "grad_norm": 0.22573904277170584, "learning_rate": 4.014403949918545e-06, - "loss": 0.0518, - "mean_token_accuracy": 0.9832278341054916, + "loss": 0.0527, + "mean_token_accuracy": 0.9831245169043541, "step": 1679 }, { "epoch": 9.826979472140762, - "grad_norm": 0.3639700603722806, + "grad_norm": 0.22468139839540996, "learning_rate": 4.0130649692057715e-06, - "loss": 0.0556, - "mean_token_accuracy": 0.9809284582734108, + "loss": 0.0564, + "mean_token_accuracy": 0.9813647791743279, "step": 1680 }, { "epoch": 9.832844574780058, - "grad_norm": 0.3633034118576501, + "grad_norm": 0.22674431582862384, "learning_rate": 4.01179127380271e-06, - "loss": 0.0585, - "mean_token_accuracy": 0.9802731797099113, + "loss": 0.0601, + "mean_token_accuracy": 0.9805619269609451, "step": 1681 }, { "epoch": 9.838709677419354, - "grad_norm": 0.32362509615190976, + "grad_norm": 0.24116266207619552, "learning_rate": 4.010582868332353e-06, - "loss": 0.0474, - "mean_token_accuracy": 0.9848815277218819, + "loss": 0.0479, + "mean_token_accuracy": 0.9846414774656296, "step": 1682 }, { "epoch": 9.84457478005865, - "grad_norm": 0.3477758631730095, + "grad_norm": 0.2480489468253077, "learning_rate": 4.009439757180732e-06, - "loss": 0.0541, - "mean_token_accuracy": 0.9802240058779716, + "loss": 0.0554, + "mean_token_accuracy": 0.9797031283378601, "step": 1683 }, { "epoch": 9.850439882697946, - "grad_norm": 0.3854795876675265, + "grad_norm": 0.2612134501490512, "learning_rate": 4.008361944496875e-06, - "loss": 0.0545, - "mean_token_accuracy": 0.982623852789402, + "loss": 0.0554, + "mean_token_accuracy": 0.981481671333313, "step": 1684 }, { "epoch": 9.856304985337243, - "grad_norm": 0.3885500734464125, + "grad_norm": 0.2626478258672696, "learning_rate": 4.00734943419281e-06, - "loss": 0.062, - "mean_token_accuracy": 0.9794270023703575, + "loss": 0.0633, + "mean_token_accuracy": 0.9795757532119751, "step": 1685 }, { "epoch": 9.862170087976539, - "grad_norm": 0.38759202956879846, + "grad_norm": 0.25961503347240517, "learning_rate": 4.006402229943534e-06, - "loss": 0.0549, - "mean_token_accuracy": 0.9819841310381889, + "loss": 0.0571, + "mean_token_accuracy": 0.9808043912053108, "step": 1686 }, { "epoch": 9.868035190615835, - "grad_norm": 0.3150886823164933, + "grad_norm": 0.20225098537745445, "learning_rate": 4.005520335187023e-06, - "loss": 0.0564, - "mean_token_accuracy": 0.9822871387004852, + "loss": 0.0571, + "mean_token_accuracy": 0.9823412969708443, "step": 1687 }, { "epoch": 9.873900293255131, - "grad_norm": 0.35350832587231334, + "grad_norm": 0.24773203659800125, "learning_rate": 4.004703753124195e-06, - "loss": 0.0565, - "mean_token_accuracy": 0.9831016659736633, + "loss": 0.058, + "mean_token_accuracy": 0.9820492267608643, "step": 1688 }, { "epoch": 9.879765395894427, - "grad_norm": 0.3193894482554477, + "grad_norm": 0.19402283681662774, "learning_rate": 4.003952486718913e-06, - "loss": 0.0485, - "mean_token_accuracy": 0.9836105778813362, + "loss": 0.0495, + "mean_token_accuracy": 0.9836301207542419, "step": 1689 }, { "epoch": 9.885630498533724, - "grad_norm": 0.30675392858895184, + "grad_norm": 0.20474303626023752, "learning_rate": 4.003266538697973e-06, - "loss": 0.0523, - "mean_token_accuracy": 0.9822279661893845, + "loss": 0.0533, + "mean_token_accuracy": 0.9827889949083328, "step": 1690 }, { "epoch": 9.89149560117302, - "grad_norm": 0.31085526491264526, + "grad_norm": 0.21002275948618682, "learning_rate": 4.002645911551086e-06, - "loss": 0.0486, - "mean_token_accuracy": 0.9831148758530617, + "loss": 0.0494, + "mean_token_accuracy": 0.9827375486493111, "step": 1691 }, { "epoch": 9.897360703812316, - "grad_norm": 0.310609074510971, + "grad_norm": 0.1929295301975364, "learning_rate": 4.002090607530882e-06, - "loss": 0.0536, - "mean_token_accuracy": 0.9823655262589455, + "loss": 0.0549, + "mean_token_accuracy": 0.9815536439418793, "step": 1692 }, { "epoch": 9.903225806451612, - "grad_norm": 0.5082679092863019, + "grad_norm": 0.23064941531004654, "learning_rate": 4.001600628652887e-06, - "loss": 0.0684, - "mean_token_accuracy": 0.9778107851743698, + "loss": 0.0648, + "mean_token_accuracy": 0.9779929518699646, "step": 1693 }, { "epoch": 9.909090909090908, - "grad_norm": 0.3580467978873467, + "grad_norm": 0.43219738924036005, "learning_rate": 4.001175976695527e-06, - "loss": 0.0587, - "mean_token_accuracy": 0.9787137806415558, + "loss": 0.0601, + "mean_token_accuracy": 0.9785007312893867, "step": 1694 }, { "epoch": 9.914956011730204, - "grad_norm": 0.3474754641798292, + "grad_norm": 0.21205310141711015, "learning_rate": 4.000816653200117e-06, - "loss": 0.047, - "mean_token_accuracy": 0.9860777705907822, + "loss": 0.0474, + "mean_token_accuracy": 0.986022487282753, "step": 1695 }, { "epoch": 9.9208211143695, - "grad_norm": 0.3480833124779761, + "grad_norm": 0.22338455794977216, "learning_rate": 4.000522659470857e-06, - "loss": 0.0521, - "mean_token_accuracy": 0.9829437881708145, + "loss": 0.0532, + "mean_token_accuracy": 0.9828405454754829, "step": 1696 }, { "epoch": 9.926686217008797, - "grad_norm": 0.4096410573101437, + "grad_norm": 0.27487995501734275, "learning_rate": 4.000293996574826e-06, - "loss": 0.0666, - "mean_token_accuracy": 0.9793807342648506, + "loss": 0.0671, + "mean_token_accuracy": 0.9787180423736572, "step": 1697 }, { "epoch": 9.932551319648093, - "grad_norm": 0.3812968594988111, + "grad_norm": 0.2502076477797535, "learning_rate": 4.000130665341977e-06, - "loss": 0.0637, - "mean_token_accuracy": 0.9800157248973846, + "loss": 0.0645, + "mean_token_accuracy": 0.9796425253152847, "step": 1698 }, { "epoch": 9.93841642228739, - "grad_norm": 0.3406586288767792, + "grad_norm": 0.23598585484626972, "learning_rate": 4.000032666365136e-06, - "loss": 0.0514, - "mean_token_accuracy": 0.9838827252388, + "loss": 0.0522, + "mean_token_accuracy": 0.9837642535567284, "step": 1699 }, { "epoch": 9.944281524926687, - "grad_norm": 0.32809422893372536, + "grad_norm": 0.22474153991468065, "learning_rate": 4.000000000000001e-06, - "loss": 0.0515, - "mean_token_accuracy": 0.9818282052874565, + "loss": 0.0534, + "mean_token_accuracy": 0.9821964651346207, "step": 1700 }, { "epoch": 9.944281524926687, "step": 1700, - "total_flos": 18857631034368.0, - "train_loss": 0.18426384230746942, - "train_runtime": 35740.9083, - "train_samples_per_second": 1.526, - "train_steps_per_second": 0.048 + "total_flos": 15136392560640.0, + "train_loss": 0.18609273691826006, + "train_runtime": 35924.8672, + "train_samples_per_second": 1.518, + "train_steps_per_second": 0.047 } ], "logging_steps": 1, @@ -13635,7 +13635,7 @@ "attributes": {} } }, - "total_flos": 18857631034368.0, + "total_flos": 15136392560640.0, "train_batch_size": 1, "trial_name": null, "trial_params": null