SALAD_RTLLM_Contamination / trainer_state.json
zwSyc's picture
Upload VerilogEval contamination
7240f51 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9870598856455013,
"eval_steps": 500,
"global_step": 621,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004814926271441469,
"grad_norm": 0.3190668225288391,
"learning_rate": 9.98389694041868e-06,
"loss": 14.3722,
"step": 1
},
{
"epoch": 0.009629852542882938,
"grad_norm": 0.704807698726654,
"learning_rate": 9.96779388083736e-06,
"loss": 17.3468,
"step": 2
},
{
"epoch": 0.014444778814324405,
"grad_norm": 0.5227249264717102,
"learning_rate": 9.95169082125604e-06,
"loss": 18.7806,
"step": 3
},
{
"epoch": 0.019259705085765876,
"grad_norm": 0.34830373525619507,
"learning_rate": 9.93558776167472e-06,
"loss": 18.8868,
"step": 4
},
{
"epoch": 0.024074631357207343,
"grad_norm": 0.36558371782302856,
"learning_rate": 9.919484702093398e-06,
"loss": 17.0113,
"step": 5
},
{
"epoch": 0.02888955762864881,
"grad_norm": 0.464693546295166,
"learning_rate": 9.903381642512077e-06,
"loss": 16.2247,
"step": 6
},
{
"epoch": 0.03370448390009028,
"grad_norm": 0.45501771569252014,
"learning_rate": 9.887278582930757e-06,
"loss": 15.7179,
"step": 7
},
{
"epoch": 0.03851941017153175,
"grad_norm": 0.6688278317451477,
"learning_rate": 9.871175523349438e-06,
"loss": 18.7794,
"step": 8
},
{
"epoch": 0.043334336442973215,
"grad_norm": 0.40696507692337036,
"learning_rate": 9.855072463768118e-06,
"loss": 16.3889,
"step": 9
},
{
"epoch": 0.048149262714414685,
"grad_norm": 0.38113319873809814,
"learning_rate": 9.838969404186796e-06,
"loss": 16.7515,
"step": 10
},
{
"epoch": 0.052964188985856156,
"grad_norm": 0.35052913427352905,
"learning_rate": 9.822866344605476e-06,
"loss": 15.7069,
"step": 11
},
{
"epoch": 0.05777911525729762,
"grad_norm": 0.47708237171173096,
"learning_rate": 9.806763285024155e-06,
"loss": 16.9549,
"step": 12
},
{
"epoch": 0.0625940415287391,
"grad_norm": 0.4960598945617676,
"learning_rate": 9.790660225442835e-06,
"loss": 16.8876,
"step": 13
},
{
"epoch": 0.06740896780018056,
"grad_norm": 0.39951273798942566,
"learning_rate": 9.774557165861515e-06,
"loss": 15.1888,
"step": 14
},
{
"epoch": 0.07222389407162202,
"grad_norm": 0.21550379693508148,
"learning_rate": 9.758454106280194e-06,
"loss": 14.6054,
"step": 15
},
{
"epoch": 0.0770388203430635,
"grad_norm": 0.30602937936782837,
"learning_rate": 9.742351046698874e-06,
"loss": 16.4524,
"step": 16
},
{
"epoch": 0.08185374661450497,
"grad_norm": 0.30777233839035034,
"learning_rate": 9.726247987117554e-06,
"loss": 14.3263,
"step": 17
},
{
"epoch": 0.08666867288594643,
"grad_norm": 0.35533130168914795,
"learning_rate": 9.710144927536233e-06,
"loss": 16.2459,
"step": 18
},
{
"epoch": 0.09148359915738791,
"grad_norm": 0.23820991814136505,
"learning_rate": 9.694041867954911e-06,
"loss": 14.2147,
"step": 19
},
{
"epoch": 0.09629852542882937,
"grad_norm": 0.2193877398967743,
"learning_rate": 9.677938808373591e-06,
"loss": 14.4833,
"step": 20
},
{
"epoch": 0.10111345170027083,
"grad_norm": 0.24645549058914185,
"learning_rate": 9.66183574879227e-06,
"loss": 14.6868,
"step": 21
},
{
"epoch": 0.10592837797171231,
"grad_norm": 0.2614218592643738,
"learning_rate": 9.64573268921095e-06,
"loss": 15.4011,
"step": 22
},
{
"epoch": 0.11074330424315378,
"grad_norm": 0.3114742040634155,
"learning_rate": 9.62962962962963e-06,
"loss": 14.9497,
"step": 23
},
{
"epoch": 0.11555823051459524,
"grad_norm": 0.20465250313282013,
"learning_rate": 9.61352657004831e-06,
"loss": 13.4598,
"step": 24
},
{
"epoch": 0.12037315678603672,
"grad_norm": 0.3349449336528778,
"learning_rate": 9.59742351046699e-06,
"loss": 14.3437,
"step": 25
},
{
"epoch": 0.1251880830574782,
"grad_norm": 0.4164576828479767,
"learning_rate": 9.581320450885669e-06,
"loss": 14.5754,
"step": 26
},
{
"epoch": 0.13000300932891964,
"grad_norm": 0.3533851206302643,
"learning_rate": 9.565217391304349e-06,
"loss": 14.7727,
"step": 27
},
{
"epoch": 0.13481793560036112,
"grad_norm": 0.3998354375362396,
"learning_rate": 9.549114331723028e-06,
"loss": 13.1538,
"step": 28
},
{
"epoch": 0.1396328618718026,
"grad_norm": 0.3069708049297333,
"learning_rate": 9.533011272141708e-06,
"loss": 13.2611,
"step": 29
},
{
"epoch": 0.14444778814324405,
"grad_norm": 0.20584744215011597,
"learning_rate": 9.516908212560388e-06,
"loss": 14.7522,
"step": 30
},
{
"epoch": 0.14926271441468553,
"grad_norm": 0.2097318172454834,
"learning_rate": 9.500805152979067e-06,
"loss": 15.0184,
"step": 31
},
{
"epoch": 0.154077640686127,
"grad_norm": 0.3266746401786804,
"learning_rate": 9.484702093397747e-06,
"loss": 13.2641,
"step": 32
},
{
"epoch": 0.15889256695756845,
"grad_norm": 0.2459367960691452,
"learning_rate": 9.468599033816425e-06,
"loss": 15.1449,
"step": 33
},
{
"epoch": 0.16370749322900993,
"grad_norm": 0.4554983973503113,
"learning_rate": 9.452495974235105e-06,
"loss": 14.6538,
"step": 34
},
{
"epoch": 0.1685224195004514,
"grad_norm": 0.3142286241054535,
"learning_rate": 9.436392914653784e-06,
"loss": 14.0927,
"step": 35
},
{
"epoch": 0.17333734577189286,
"grad_norm": 0.2828330993652344,
"learning_rate": 9.420289855072464e-06,
"loss": 14.5898,
"step": 36
},
{
"epoch": 0.17815227204333434,
"grad_norm": 0.25663697719573975,
"learning_rate": 9.404186795491144e-06,
"loss": 12.7267,
"step": 37
},
{
"epoch": 0.18296719831477581,
"grad_norm": 0.4929574728012085,
"learning_rate": 9.388083735909823e-06,
"loss": 14.2236,
"step": 38
},
{
"epoch": 0.18778212458621726,
"grad_norm": 0.414725661277771,
"learning_rate": 9.371980676328503e-06,
"loss": 13.9036,
"step": 39
},
{
"epoch": 0.19259705085765874,
"grad_norm": 0.2808246910572052,
"learning_rate": 9.355877616747183e-06,
"loss": 14.7754,
"step": 40
},
{
"epoch": 0.19741197712910022,
"grad_norm": 0.2846072316169739,
"learning_rate": 9.339774557165862e-06,
"loss": 13.519,
"step": 41
},
{
"epoch": 0.20222690340054167,
"grad_norm": 0.2638435661792755,
"learning_rate": 9.323671497584542e-06,
"loss": 14.5145,
"step": 42
},
{
"epoch": 0.20704182967198315,
"grad_norm": 0.22342148423194885,
"learning_rate": 9.307568438003222e-06,
"loss": 12.7617,
"step": 43
},
{
"epoch": 0.21185675594342462,
"grad_norm": 0.2732909619808197,
"learning_rate": 9.291465378421901e-06,
"loss": 13.6826,
"step": 44
},
{
"epoch": 0.21667168221486607,
"grad_norm": 0.23550738394260406,
"learning_rate": 9.275362318840581e-06,
"loss": 13.1958,
"step": 45
},
{
"epoch": 0.22148660848630755,
"grad_norm": 0.2673870027065277,
"learning_rate": 9.25925925925926e-06,
"loss": 14.1934,
"step": 46
},
{
"epoch": 0.22630153475774903,
"grad_norm": 0.303568571805954,
"learning_rate": 9.243156199677939e-06,
"loss": 11.8819,
"step": 47
},
{
"epoch": 0.23111646102919048,
"grad_norm": 0.27822041511535645,
"learning_rate": 9.227053140096618e-06,
"loss": 11.8434,
"step": 48
},
{
"epoch": 0.23593138730063196,
"grad_norm": 0.21598580479621887,
"learning_rate": 9.210950080515298e-06,
"loss": 13.0898,
"step": 49
},
{
"epoch": 0.24074631357207343,
"grad_norm": 0.22329603135585785,
"learning_rate": 9.194847020933978e-06,
"loss": 12.1004,
"step": 50
},
{
"epoch": 0.24556123984351488,
"grad_norm": 0.22152170538902283,
"learning_rate": 9.178743961352658e-06,
"loss": 13.8467,
"step": 51
},
{
"epoch": 0.2503761661149564,
"grad_norm": 0.2549304664134979,
"learning_rate": 9.162640901771337e-06,
"loss": 13.3853,
"step": 52
},
{
"epoch": 0.2551910923863978,
"grad_norm": 0.2308962047100067,
"learning_rate": 9.146537842190017e-06,
"loss": 12.9115,
"step": 53
},
{
"epoch": 0.2600060186578393,
"grad_norm": 0.19011437892913818,
"learning_rate": 9.130434782608697e-06,
"loss": 12.7455,
"step": 54
},
{
"epoch": 0.26482094492928077,
"grad_norm": 0.21280792355537415,
"learning_rate": 9.114331723027376e-06,
"loss": 13.4157,
"step": 55
},
{
"epoch": 0.26963587120072224,
"grad_norm": 0.35571521520614624,
"learning_rate": 9.098228663446056e-06,
"loss": 13.961,
"step": 56
},
{
"epoch": 0.2744507974721637,
"grad_norm": 0.25055205821990967,
"learning_rate": 9.082125603864736e-06,
"loss": 13.5945,
"step": 57
},
{
"epoch": 0.2792657237436052,
"grad_norm": 0.22618041932582855,
"learning_rate": 9.066022544283415e-06,
"loss": 13.6034,
"step": 58
},
{
"epoch": 0.2840806500150466,
"grad_norm": 0.2419959455728531,
"learning_rate": 9.049919484702095e-06,
"loss": 13.5142,
"step": 59
},
{
"epoch": 0.2888955762864881,
"grad_norm": 0.3027523458003998,
"learning_rate": 9.033816425120775e-06,
"loss": 12.9825,
"step": 60
},
{
"epoch": 0.2937105025579296,
"grad_norm": 0.1812627613544464,
"learning_rate": 9.017713365539453e-06,
"loss": 12.4197,
"step": 61
},
{
"epoch": 0.29852542882937105,
"grad_norm": 0.2510731518268585,
"learning_rate": 9.001610305958132e-06,
"loss": 13.2717,
"step": 62
},
{
"epoch": 0.30334035510081253,
"grad_norm": 0.2064312994480133,
"learning_rate": 8.985507246376812e-06,
"loss": 13.3493,
"step": 63
},
{
"epoch": 0.308155281372254,
"grad_norm": 0.2627861797809601,
"learning_rate": 8.969404186795492e-06,
"loss": 13.3364,
"step": 64
},
{
"epoch": 0.31297020764369543,
"grad_norm": 0.22463975846767426,
"learning_rate": 8.953301127214171e-06,
"loss": 12.0759,
"step": 65
},
{
"epoch": 0.3177851339151369,
"grad_norm": 0.3166675865650177,
"learning_rate": 8.937198067632851e-06,
"loss": 12.8393,
"step": 66
},
{
"epoch": 0.3226000601865784,
"grad_norm": 0.16428841650485992,
"learning_rate": 8.92109500805153e-06,
"loss": 12.6723,
"step": 67
},
{
"epoch": 0.32741498645801986,
"grad_norm": 0.1815037578344345,
"learning_rate": 8.90499194847021e-06,
"loss": 12.6212,
"step": 68
},
{
"epoch": 0.33222991272946134,
"grad_norm": 0.2504093050956726,
"learning_rate": 8.888888888888888e-06,
"loss": 12.6547,
"step": 69
},
{
"epoch": 0.3370448390009028,
"grad_norm": 0.17379416525363922,
"learning_rate": 8.87278582930757e-06,
"loss": 10.5245,
"step": 70
},
{
"epoch": 0.34185976527234424,
"grad_norm": 0.20780153572559357,
"learning_rate": 8.85668276972625e-06,
"loss": 11.1868,
"step": 71
},
{
"epoch": 0.3466746915437857,
"grad_norm": 0.2680881917476654,
"learning_rate": 8.840579710144929e-06,
"loss": 11.9582,
"step": 72
},
{
"epoch": 0.3514896178152272,
"grad_norm": 0.1777425855398178,
"learning_rate": 8.824476650563609e-06,
"loss": 11.3178,
"step": 73
},
{
"epoch": 0.3563045440866687,
"grad_norm": 0.20199166238307953,
"learning_rate": 8.808373590982288e-06,
"loss": 12.5066,
"step": 74
},
{
"epoch": 0.36111947035811015,
"grad_norm": 0.23542606830596924,
"learning_rate": 8.792270531400966e-06,
"loss": 11.885,
"step": 75
},
{
"epoch": 0.36593439662955163,
"grad_norm": 0.23038695752620697,
"learning_rate": 8.776167471819646e-06,
"loss": 11.1026,
"step": 76
},
{
"epoch": 0.3707493229009931,
"grad_norm": 0.2536081075668335,
"learning_rate": 8.760064412238326e-06,
"loss": 13.065,
"step": 77
},
{
"epoch": 0.37556424917243453,
"grad_norm": 0.2599170207977295,
"learning_rate": 8.743961352657005e-06,
"loss": 12.4683,
"step": 78
},
{
"epoch": 0.380379175443876,
"grad_norm": 0.23882345855236053,
"learning_rate": 8.727858293075685e-06,
"loss": 11.6778,
"step": 79
},
{
"epoch": 0.3851941017153175,
"grad_norm": 0.23855774104595184,
"learning_rate": 8.711755233494365e-06,
"loss": 13.026,
"step": 80
},
{
"epoch": 0.39000902798675896,
"grad_norm": 0.26537057757377625,
"learning_rate": 8.695652173913044e-06,
"loss": 12.4535,
"step": 81
},
{
"epoch": 0.39482395425820044,
"grad_norm": 0.21693478524684906,
"learning_rate": 8.679549114331724e-06,
"loss": 12.9436,
"step": 82
},
{
"epoch": 0.3996388805296419,
"grad_norm": 0.162302166223526,
"learning_rate": 8.663446054750402e-06,
"loss": 11.3558,
"step": 83
},
{
"epoch": 0.40445380680108334,
"grad_norm": 0.271846741437912,
"learning_rate": 8.647342995169082e-06,
"loss": 11.0237,
"step": 84
},
{
"epoch": 0.4092687330725248,
"grad_norm": 0.16958190500736237,
"learning_rate": 8.631239935587761e-06,
"loss": 11.3822,
"step": 85
},
{
"epoch": 0.4140836593439663,
"grad_norm": 0.19066102802753448,
"learning_rate": 8.615136876006443e-06,
"loss": 11.6137,
"step": 86
},
{
"epoch": 0.41889858561540777,
"grad_norm": 0.21410760283470154,
"learning_rate": 8.599033816425122e-06,
"loss": 11.1353,
"step": 87
},
{
"epoch": 0.42371351188684925,
"grad_norm": 0.17947272956371307,
"learning_rate": 8.582930756843802e-06,
"loss": 11.0955,
"step": 88
},
{
"epoch": 0.4285284381582907,
"grad_norm": 0.2798727750778198,
"learning_rate": 8.56682769726248e-06,
"loss": 11.5026,
"step": 89
},
{
"epoch": 0.43334336442973215,
"grad_norm": 0.19547878205776215,
"learning_rate": 8.55072463768116e-06,
"loss": 11.2341,
"step": 90
},
{
"epoch": 0.4381582907011736,
"grad_norm": 0.20346851646900177,
"learning_rate": 8.53462157809984e-06,
"loss": 11.7612,
"step": 91
},
{
"epoch": 0.4429732169726151,
"grad_norm": 0.22177843749523163,
"learning_rate": 8.518518518518519e-06,
"loss": 12.027,
"step": 92
},
{
"epoch": 0.4477881432440566,
"grad_norm": 0.14566639065742493,
"learning_rate": 8.502415458937199e-06,
"loss": 12.1414,
"step": 93
},
{
"epoch": 0.45260306951549806,
"grad_norm": 0.19193682074546814,
"learning_rate": 8.486312399355879e-06,
"loss": 11.2928,
"step": 94
},
{
"epoch": 0.45741799578693954,
"grad_norm": 0.18830566108226776,
"learning_rate": 8.470209339774558e-06,
"loss": 12.3402,
"step": 95
},
{
"epoch": 0.46223292205838096,
"grad_norm": 0.19319747388362885,
"learning_rate": 8.454106280193238e-06,
"loss": 11.4159,
"step": 96
},
{
"epoch": 0.46704784832982243,
"grad_norm": 0.2581634521484375,
"learning_rate": 8.438003220611916e-06,
"loss": 12.5042,
"step": 97
},
{
"epoch": 0.4718627746012639,
"grad_norm": 0.2127319574356079,
"learning_rate": 8.421900161030596e-06,
"loss": 11.8059,
"step": 98
},
{
"epoch": 0.4766777008727054,
"grad_norm": 0.18906573951244354,
"learning_rate": 8.405797101449275e-06,
"loss": 12.773,
"step": 99
},
{
"epoch": 0.48149262714414687,
"grad_norm": 0.2039322406053543,
"learning_rate": 8.389694041867955e-06,
"loss": 11.2793,
"step": 100
},
{
"epoch": 0.48630755341558835,
"grad_norm": 0.17869459092617035,
"learning_rate": 8.373590982286636e-06,
"loss": 12.1488,
"step": 101
},
{
"epoch": 0.49112247968702977,
"grad_norm": 0.24505895376205444,
"learning_rate": 8.357487922705316e-06,
"loss": 12.6911,
"step": 102
},
{
"epoch": 0.49593740595847124,
"grad_norm": 0.24129539728164673,
"learning_rate": 8.341384863123994e-06,
"loss": 11.1825,
"step": 103
},
{
"epoch": 0.5007523322299128,
"grad_norm": 0.20321142673492432,
"learning_rate": 8.325281803542674e-06,
"loss": 11.3817,
"step": 104
},
{
"epoch": 0.5055672585013542,
"grad_norm": 0.2557075321674347,
"learning_rate": 8.309178743961353e-06,
"loss": 13.0008,
"step": 105
},
{
"epoch": 0.5103821847727956,
"grad_norm": 0.27801477909088135,
"learning_rate": 8.293075684380033e-06,
"loss": 10.5208,
"step": 106
},
{
"epoch": 0.5151971110442372,
"grad_norm": 0.18863140046596527,
"learning_rate": 8.276972624798713e-06,
"loss": 11.17,
"step": 107
},
{
"epoch": 0.5200120373156786,
"grad_norm": 0.1997506022453308,
"learning_rate": 8.260869565217392e-06,
"loss": 11.094,
"step": 108
},
{
"epoch": 0.5248269635871201,
"grad_norm": 0.17764043807983398,
"learning_rate": 8.244766505636072e-06,
"loss": 10.9546,
"step": 109
},
{
"epoch": 0.5296418898585615,
"grad_norm": 0.22004744410514832,
"learning_rate": 8.228663446054752e-06,
"loss": 10.8977,
"step": 110
},
{
"epoch": 0.534456816130003,
"grad_norm": 0.20619215071201324,
"learning_rate": 8.212560386473431e-06,
"loss": 12.0217,
"step": 111
},
{
"epoch": 0.5392717424014445,
"grad_norm": 0.1944962590932846,
"learning_rate": 8.19645732689211e-06,
"loss": 11.5528,
"step": 112
},
{
"epoch": 0.5440866686728859,
"grad_norm": 0.13986949622631073,
"learning_rate": 8.180354267310789e-06,
"loss": 10.8501,
"step": 113
},
{
"epoch": 0.5489015949443274,
"grad_norm": 0.18104106187820435,
"learning_rate": 8.164251207729469e-06,
"loss": 12.0401,
"step": 114
},
{
"epoch": 0.5537165212157689,
"grad_norm": 0.22354455292224884,
"learning_rate": 8.148148148148148e-06,
"loss": 12.3038,
"step": 115
},
{
"epoch": 0.5585314474872104,
"grad_norm": 0.21359990537166595,
"learning_rate": 8.132045088566828e-06,
"loss": 10.9812,
"step": 116
},
{
"epoch": 0.5633463737586518,
"grad_norm": 0.25966572761535645,
"learning_rate": 8.115942028985508e-06,
"loss": 11.0717,
"step": 117
},
{
"epoch": 0.5681613000300932,
"grad_norm": 0.18161477148532867,
"learning_rate": 8.099838969404187e-06,
"loss": 10.8503,
"step": 118
},
{
"epoch": 0.5729762263015348,
"grad_norm": 0.30178895592689514,
"learning_rate": 8.083735909822867e-06,
"loss": 12.6225,
"step": 119
},
{
"epoch": 0.5777911525729762,
"grad_norm": 0.13033385574817657,
"learning_rate": 8.067632850241547e-06,
"loss": 11.2823,
"step": 120
},
{
"epoch": 0.5826060788444177,
"grad_norm": 0.2345341593027115,
"learning_rate": 8.051529790660226e-06,
"loss": 10.6418,
"step": 121
},
{
"epoch": 0.5874210051158592,
"grad_norm": 0.23290252685546875,
"learning_rate": 8.035426731078906e-06,
"loss": 10.7231,
"step": 122
},
{
"epoch": 0.5922359313873007,
"grad_norm": 0.19367018342018127,
"learning_rate": 8.019323671497586e-06,
"loss": 10.2351,
"step": 123
},
{
"epoch": 0.5970508576587421,
"grad_norm": 0.22510769963264465,
"learning_rate": 8.003220611916265e-06,
"loss": 10.3216,
"step": 124
},
{
"epoch": 0.6018657839301835,
"grad_norm": 0.21876239776611328,
"learning_rate": 7.987117552334945e-06,
"loss": 11.0453,
"step": 125
},
{
"epoch": 0.6066807102016251,
"grad_norm": 0.23988570272922516,
"learning_rate": 7.971014492753623e-06,
"loss": 10.9186,
"step": 126
},
{
"epoch": 0.6114956364730665,
"grad_norm": 0.1909828633069992,
"learning_rate": 7.954911433172303e-06,
"loss": 10.7444,
"step": 127
},
{
"epoch": 0.616310562744508,
"grad_norm": 0.2268180102109909,
"learning_rate": 7.938808373590982e-06,
"loss": 12.1826,
"step": 128
},
{
"epoch": 0.6211254890159494,
"grad_norm": 0.18531453609466553,
"learning_rate": 7.922705314009662e-06,
"loss": 11.0919,
"step": 129
},
{
"epoch": 0.6259404152873909,
"grad_norm": 0.24563215672969818,
"learning_rate": 7.906602254428342e-06,
"loss": 10.825,
"step": 130
},
{
"epoch": 0.6307553415588324,
"grad_norm": 0.26069939136505127,
"learning_rate": 7.890499194847021e-06,
"loss": 10.9237,
"step": 131
},
{
"epoch": 0.6355702678302738,
"grad_norm": 0.18118217587471008,
"learning_rate": 7.874396135265701e-06,
"loss": 11.2994,
"step": 132
},
{
"epoch": 0.6403851941017153,
"grad_norm": 0.2178242951631546,
"learning_rate": 7.85829307568438e-06,
"loss": 10.6764,
"step": 133
},
{
"epoch": 0.6452001203731568,
"grad_norm": 0.18861421942710876,
"learning_rate": 7.84219001610306e-06,
"loss": 11.7684,
"step": 134
},
{
"epoch": 0.6500150466445983,
"grad_norm": 0.2540731430053711,
"learning_rate": 7.82608695652174e-06,
"loss": 10.4613,
"step": 135
},
{
"epoch": 0.6548299729160397,
"grad_norm": 0.22468675673007965,
"learning_rate": 7.80998389694042e-06,
"loss": 11.0479,
"step": 136
},
{
"epoch": 0.6596448991874811,
"grad_norm": 0.18307951092720032,
"learning_rate": 7.7938808373591e-06,
"loss": 11.7074,
"step": 137
},
{
"epoch": 0.6644598254589227,
"grad_norm": 0.2777751088142395,
"learning_rate": 7.77777777777778e-06,
"loss": 11.5034,
"step": 138
},
{
"epoch": 0.6692747517303641,
"grad_norm": 0.20376338064670563,
"learning_rate": 7.761674718196459e-06,
"loss": 11.178,
"step": 139
},
{
"epoch": 0.6740896780018056,
"grad_norm": 0.19434967637062073,
"learning_rate": 7.745571658615137e-06,
"loss": 10.198,
"step": 140
},
{
"epoch": 0.6789046042732471,
"grad_norm": 0.28449344635009766,
"learning_rate": 7.729468599033817e-06,
"loss": 11.1956,
"step": 141
},
{
"epoch": 0.6837195305446885,
"grad_norm": 0.18125340342521667,
"learning_rate": 7.713365539452496e-06,
"loss": 12.0773,
"step": 142
},
{
"epoch": 0.68853445681613,
"grad_norm": 0.2260919064283371,
"learning_rate": 7.697262479871176e-06,
"loss": 11.2763,
"step": 143
},
{
"epoch": 0.6933493830875714,
"grad_norm": 0.23274123668670654,
"learning_rate": 7.681159420289856e-06,
"loss": 11.9908,
"step": 144
},
{
"epoch": 0.698164309359013,
"grad_norm": 0.16333813965320587,
"learning_rate": 7.665056360708535e-06,
"loss": 12.1754,
"step": 145
},
{
"epoch": 0.7029792356304544,
"grad_norm": 0.19147440791130066,
"learning_rate": 7.648953301127215e-06,
"loss": 10.8012,
"step": 146
},
{
"epoch": 0.7077941619018959,
"grad_norm": 0.24757863581180573,
"learning_rate": 7.632850241545895e-06,
"loss": 11.222,
"step": 147
},
{
"epoch": 0.7126090881733373,
"grad_norm": 0.2936674952507019,
"learning_rate": 7.616747181964574e-06,
"loss": 10.7104,
"step": 148
},
{
"epoch": 0.7174240144447788,
"grad_norm": 0.25289615988731384,
"learning_rate": 7.600644122383254e-06,
"loss": 11.2394,
"step": 149
},
{
"epoch": 0.7222389407162203,
"grad_norm": 0.16242274641990662,
"learning_rate": 7.584541062801934e-06,
"loss": 10.9335,
"step": 150
},
{
"epoch": 0.7270538669876617,
"grad_norm": 0.16051234304904938,
"learning_rate": 7.568438003220613e-06,
"loss": 10.5078,
"step": 151
},
{
"epoch": 0.7318687932591033,
"grad_norm": 0.19001922011375427,
"learning_rate": 7.552334943639292e-06,
"loss": 10.2024,
"step": 152
},
{
"epoch": 0.7366837195305447,
"grad_norm": 0.1944311112165451,
"learning_rate": 7.536231884057972e-06,
"loss": 10.3439,
"step": 153
},
{
"epoch": 0.7414986458019862,
"grad_norm": 0.22597943246364594,
"learning_rate": 7.5201288244766514e-06,
"loss": 9.8315,
"step": 154
},
{
"epoch": 0.7463135720734276,
"grad_norm": 0.16061653196811676,
"learning_rate": 7.504025764895331e-06,
"loss": 10.1577,
"step": 155
},
{
"epoch": 0.7511284983448691,
"grad_norm": 0.18217833340168,
"learning_rate": 7.48792270531401e-06,
"loss": 11.1101,
"step": 156
},
{
"epoch": 0.7559434246163106,
"grad_norm": 0.24722352623939514,
"learning_rate": 7.47181964573269e-06,
"loss": 9.7077,
"step": 157
},
{
"epoch": 0.760758350887752,
"grad_norm": 0.19641828536987305,
"learning_rate": 7.455716586151369e-06,
"loss": 10.4689,
"step": 158
},
{
"epoch": 0.7655732771591935,
"grad_norm": 0.2800208330154419,
"learning_rate": 7.439613526570049e-06,
"loss": 11.402,
"step": 159
},
{
"epoch": 0.770388203430635,
"grad_norm": 0.19170229136943817,
"learning_rate": 7.423510466988728e-06,
"loss": 10.1995,
"step": 160
},
{
"epoch": 0.7752031297020764,
"grad_norm": 0.1706549972295761,
"learning_rate": 7.4074074074074075e-06,
"loss": 10.3468,
"step": 161
},
{
"epoch": 0.7800180559735179,
"grad_norm": 0.21024712920188904,
"learning_rate": 7.391304347826087e-06,
"loss": 11.7541,
"step": 162
},
{
"epoch": 0.7848329822449593,
"grad_norm": 0.22287265956401825,
"learning_rate": 7.375201288244767e-06,
"loss": 11.3616,
"step": 163
},
{
"epoch": 0.7896479085164009,
"grad_norm": 0.195387065410614,
"learning_rate": 7.359098228663447e-06,
"loss": 9.8747,
"step": 164
},
{
"epoch": 0.7944628347878423,
"grad_norm": 0.2072424590587616,
"learning_rate": 7.342995169082127e-06,
"loss": 11.2013,
"step": 165
},
{
"epoch": 0.7992777610592838,
"grad_norm": 0.17055857181549072,
"learning_rate": 7.326892109500806e-06,
"loss": 9.3551,
"step": 166
},
{
"epoch": 0.8040926873307253,
"grad_norm": 0.2913988530635834,
"learning_rate": 7.3107890499194855e-06,
"loss": 9.5701,
"step": 167
},
{
"epoch": 0.8089076136021667,
"grad_norm": 0.27838587760925293,
"learning_rate": 7.294685990338165e-06,
"loss": 9.1273,
"step": 168
},
{
"epoch": 0.8137225398736082,
"grad_norm": 0.16759181022644043,
"learning_rate": 7.278582930756845e-06,
"loss": 12.0986,
"step": 169
},
{
"epoch": 0.8185374661450496,
"grad_norm": 0.2335626184940338,
"learning_rate": 7.262479871175524e-06,
"loss": 9.4595,
"step": 170
},
{
"epoch": 0.8233523924164912,
"grad_norm": 0.22770944237709045,
"learning_rate": 7.246376811594203e-06,
"loss": 11.482,
"step": 171
},
{
"epoch": 0.8281673186879326,
"grad_norm": 0.16300161182880402,
"learning_rate": 7.230273752012883e-06,
"loss": 10.084,
"step": 172
},
{
"epoch": 0.832982244959374,
"grad_norm": 0.1577334851026535,
"learning_rate": 7.214170692431563e-06,
"loss": 11.7738,
"step": 173
},
{
"epoch": 0.8377971712308155,
"grad_norm": 0.26999086141586304,
"learning_rate": 7.1980676328502416e-06,
"loss": 11.7664,
"step": 174
},
{
"epoch": 0.842612097502257,
"grad_norm": 0.17184922099113464,
"learning_rate": 7.181964573268921e-06,
"loss": 10.0153,
"step": 175
},
{
"epoch": 0.8474270237736985,
"grad_norm": 0.19260835647583008,
"learning_rate": 7.165861513687601e-06,
"loss": 11.0206,
"step": 176
},
{
"epoch": 0.8522419500451399,
"grad_norm": 0.13800834119319916,
"learning_rate": 7.149758454106281e-06,
"loss": 11.2491,
"step": 177
},
{
"epoch": 0.8570568763165815,
"grad_norm": 0.18511894345283508,
"learning_rate": 7.1336553945249594e-06,
"loss": 10.0002,
"step": 178
},
{
"epoch": 0.8618718025880229,
"grad_norm": 0.19319257140159607,
"learning_rate": 7.117552334943641e-06,
"loss": 10.3143,
"step": 179
},
{
"epoch": 0.8666867288594643,
"grad_norm": 0.23793131113052368,
"learning_rate": 7.10144927536232e-06,
"loss": 9.9405,
"step": 180
},
{
"epoch": 0.8715016551309058,
"grad_norm": 0.22520898282527924,
"learning_rate": 7.085346215780999e-06,
"loss": 11.7933,
"step": 181
},
{
"epoch": 0.8763165814023472,
"grad_norm": 0.1998303085565567,
"learning_rate": 7.069243156199679e-06,
"loss": 10.8949,
"step": 182
},
{
"epoch": 0.8811315076737888,
"grad_norm": 0.2205827236175537,
"learning_rate": 7.053140096618359e-06,
"loss": 11.2439,
"step": 183
},
{
"epoch": 0.8859464339452302,
"grad_norm": 0.18895015120506287,
"learning_rate": 7.0370370370370375e-06,
"loss": 11.1144,
"step": 184
},
{
"epoch": 0.8907613602166716,
"grad_norm": 0.17686723172664642,
"learning_rate": 7.020933977455717e-06,
"loss": 10.2206,
"step": 185
},
{
"epoch": 0.8955762864881132,
"grad_norm": 0.2033979296684265,
"learning_rate": 7.004830917874397e-06,
"loss": 11.4571,
"step": 186
},
{
"epoch": 0.9003912127595546,
"grad_norm": 0.19752806425094604,
"learning_rate": 6.9887278582930765e-06,
"loss": 10.0299,
"step": 187
},
{
"epoch": 0.9052061390309961,
"grad_norm": 0.26918548345565796,
"learning_rate": 6.972624798711755e-06,
"loss": 10.0987,
"step": 188
},
{
"epoch": 0.9100210653024375,
"grad_norm": 0.14812156558036804,
"learning_rate": 6.956521739130435e-06,
"loss": 10.1505,
"step": 189
},
{
"epoch": 0.9148359915738791,
"grad_norm": 0.21255257725715637,
"learning_rate": 6.940418679549115e-06,
"loss": 10.48,
"step": 190
},
{
"epoch": 0.9196509178453205,
"grad_norm": 0.20056240260601044,
"learning_rate": 6.924315619967794e-06,
"loss": 10.615,
"step": 191
},
{
"epoch": 0.9244658441167619,
"grad_norm": 0.2510916590690613,
"learning_rate": 6.908212560386473e-06,
"loss": 9.2769,
"step": 192
},
{
"epoch": 0.9292807703882034,
"grad_norm": 0.19624245166778564,
"learning_rate": 6.892109500805153e-06,
"loss": 10.3494,
"step": 193
},
{
"epoch": 0.9340956966596449,
"grad_norm": 0.19198696315288544,
"learning_rate": 6.8760064412238326e-06,
"loss": 11.5993,
"step": 194
},
{
"epoch": 0.9389106229310864,
"grad_norm": 0.18541178107261658,
"learning_rate": 6.859903381642513e-06,
"loss": 11.1401,
"step": 195
},
{
"epoch": 0.9437255492025278,
"grad_norm": 0.2111266553401947,
"learning_rate": 6.843800322061193e-06,
"loss": 10.4827,
"step": 196
},
{
"epoch": 0.9485404754739694,
"grad_norm": 0.19431617856025696,
"learning_rate": 6.8276972624798724e-06,
"loss": 9.9673,
"step": 197
},
{
"epoch": 0.9533554017454108,
"grad_norm": 0.20121034979820251,
"learning_rate": 6.811594202898551e-06,
"loss": 10.0902,
"step": 198
},
{
"epoch": 0.9581703280168522,
"grad_norm": 0.24719102680683136,
"learning_rate": 6.795491143317231e-06,
"loss": 9.7116,
"step": 199
},
{
"epoch": 0.9629852542882937,
"grad_norm": 0.14550495147705078,
"learning_rate": 6.779388083735911e-06,
"loss": 10.3404,
"step": 200
},
{
"epoch": 0.9678001805597352,
"grad_norm": 0.19170908629894257,
"learning_rate": 6.76328502415459e-06,
"loss": 10.6444,
"step": 201
},
{
"epoch": 0.9726151068311767,
"grad_norm": 0.23954305052757263,
"learning_rate": 6.747181964573269e-06,
"loss": 10.3116,
"step": 202
},
{
"epoch": 0.9774300331026181,
"grad_norm": 0.15414614975452423,
"learning_rate": 6.731078904991949e-06,
"loss": 10.4329,
"step": 203
},
{
"epoch": 0.9822449593740595,
"grad_norm": 0.19790370762348175,
"learning_rate": 6.7149758454106285e-06,
"loss": 10.6693,
"step": 204
},
{
"epoch": 0.9870598856455011,
"grad_norm": 0.23332847654819489,
"learning_rate": 6.698872785829308e-06,
"loss": 11.7078,
"step": 205
},
{
"epoch": 0.9918748119169425,
"grad_norm": 0.1728251725435257,
"learning_rate": 6.682769726247987e-06,
"loss": 10.1582,
"step": 206
},
{
"epoch": 0.996689738188384,
"grad_norm": 0.18887676298618317,
"learning_rate": 6.666666666666667e-06,
"loss": 10.0072,
"step": 207
},
{
"epoch": 1.0,
"grad_norm": 0.18887676298618317,
"learning_rate": 6.666666666666667e-06,
"loss": 7.3169,
"step": 208
},
{
"epoch": 1.0048149262714414,
"grad_norm": 0.22321873903274536,
"learning_rate": 6.650563607085346e-06,
"loss": 9.9937,
"step": 209
},
{
"epoch": 1.0096298525428828,
"grad_norm": 0.23789285123348236,
"learning_rate": 6.634460547504026e-06,
"loss": 10.1548,
"step": 210
},
{
"epoch": 1.0144447788143245,
"grad_norm": 0.2545947730541229,
"learning_rate": 6.6183574879227065e-06,
"loss": 10.1225,
"step": 211
},
{
"epoch": 1.019259705085766,
"grad_norm": 0.19479095935821533,
"learning_rate": 6.602254428341386e-06,
"loss": 9.4662,
"step": 212
},
{
"epoch": 1.0240746313572073,
"grad_norm": 0.1563379466533661,
"learning_rate": 6.586151368760065e-06,
"loss": 10.5629,
"step": 213
},
{
"epoch": 1.0288895576286488,
"grad_norm": 0.25045880675315857,
"learning_rate": 6.570048309178745e-06,
"loss": 9.0693,
"step": 214
},
{
"epoch": 1.0337044839000902,
"grad_norm": 0.20094619691371918,
"learning_rate": 6.553945249597424e-06,
"loss": 10.7736,
"step": 215
},
{
"epoch": 1.0385194101715318,
"grad_norm": 0.2038065642118454,
"learning_rate": 6.537842190016104e-06,
"loss": 11.2619,
"step": 216
},
{
"epoch": 1.0433343364429732,
"grad_norm": 0.1970120072364807,
"learning_rate": 6.521739130434783e-06,
"loss": 10.3251,
"step": 217
},
{
"epoch": 1.0481492627144147,
"grad_norm": 0.19979062676429749,
"learning_rate": 6.5056360708534626e-06,
"loss": 10.6034,
"step": 218
},
{
"epoch": 1.052964188985856,
"grad_norm": 0.16085349023342133,
"learning_rate": 6.489533011272142e-06,
"loss": 9.5934,
"step": 219
},
{
"epoch": 1.0577791152572975,
"grad_norm": 0.20374242961406708,
"learning_rate": 6.473429951690822e-06,
"loss": 8.9761,
"step": 220
},
{
"epoch": 1.0625940415287392,
"grad_norm": 0.19417604804039001,
"learning_rate": 6.457326892109501e-06,
"loss": 9.8925,
"step": 221
},
{
"epoch": 1.0674089678001806,
"grad_norm": 0.1641014963388443,
"learning_rate": 6.44122383252818e-06,
"loss": 10.019,
"step": 222
},
{
"epoch": 1.072223894071622,
"grad_norm": 0.15444359183311462,
"learning_rate": 6.42512077294686e-06,
"loss": 9.8515,
"step": 223
},
{
"epoch": 1.0770388203430634,
"grad_norm": 0.31960421800613403,
"learning_rate": 6.40901771336554e-06,
"loss": 9.6703,
"step": 224
},
{
"epoch": 1.081853746614505,
"grad_norm": 0.18809086084365845,
"learning_rate": 6.3929146537842194e-06,
"loss": 10.5771,
"step": 225
},
{
"epoch": 1.0866686728859465,
"grad_norm": 0.2899991571903229,
"learning_rate": 6.376811594202898e-06,
"loss": 9.9195,
"step": 226
},
{
"epoch": 1.091483599157388,
"grad_norm": 0.20936541259288788,
"learning_rate": 6.360708534621579e-06,
"loss": 9.9802,
"step": 227
},
{
"epoch": 1.0962985254288293,
"grad_norm": 0.20921356976032257,
"learning_rate": 6.3446054750402585e-06,
"loss": 10.7929,
"step": 228
},
{
"epoch": 1.1011134517002708,
"grad_norm": 0.16953137516975403,
"learning_rate": 6.328502415458938e-06,
"loss": 9.2881,
"step": 229
},
{
"epoch": 1.1059283779717124,
"grad_norm": 0.16596080362796783,
"learning_rate": 6.312399355877618e-06,
"loss": 9.9087,
"step": 230
},
{
"epoch": 1.1107433042431538,
"grad_norm": 0.17415396869182587,
"learning_rate": 6.296296296296297e-06,
"loss": 10.5827,
"step": 231
},
{
"epoch": 1.1155582305145952,
"grad_norm": 0.1941956877708435,
"learning_rate": 6.280193236714976e-06,
"loss": 10.6707,
"step": 232
},
{
"epoch": 1.1203731567860367,
"grad_norm": 0.2597000300884247,
"learning_rate": 6.264090177133656e-06,
"loss": 12.2734,
"step": 233
},
{
"epoch": 1.1251880830574783,
"grad_norm": 0.1953185349702835,
"learning_rate": 6.247987117552336e-06,
"loss": 9.6042,
"step": 234
},
{
"epoch": 1.1300030093289197,
"grad_norm": 0.19797232747077942,
"learning_rate": 6.2318840579710145e-06,
"loss": 10.6643,
"step": 235
},
{
"epoch": 1.1348179356003611,
"grad_norm": 0.18180033564567566,
"learning_rate": 6.215780998389694e-06,
"loss": 10.2799,
"step": 236
},
{
"epoch": 1.1396328618718026,
"grad_norm": 0.17393337190151215,
"learning_rate": 6.199677938808374e-06,
"loss": 9.4656,
"step": 237
},
{
"epoch": 1.144447788143244,
"grad_norm": 0.1834544539451599,
"learning_rate": 6.1835748792270535e-06,
"loss": 8.9495,
"step": 238
},
{
"epoch": 1.1492627144146854,
"grad_norm": 0.14842462539672852,
"learning_rate": 6.167471819645733e-06,
"loss": 9.6697,
"step": 239
},
{
"epoch": 1.154077640686127,
"grad_norm": 0.2158040702342987,
"learning_rate": 6.151368760064412e-06,
"loss": 9.1501,
"step": 240
},
{
"epoch": 1.1588925669575685,
"grad_norm": 0.18131056427955627,
"learning_rate": 6.135265700483092e-06,
"loss": 10.3998,
"step": 241
},
{
"epoch": 1.16370749322901,
"grad_norm": 0.22484710812568665,
"learning_rate": 6.119162640901772e-06,
"loss": 10.0693,
"step": 242
},
{
"epoch": 1.1685224195004513,
"grad_norm": 0.18370361626148224,
"learning_rate": 6.103059581320452e-06,
"loss": 10.4377,
"step": 243
},
{
"epoch": 1.173337345771893,
"grad_norm": 0.2081800103187561,
"learning_rate": 6.086956521739132e-06,
"loss": 9.8576,
"step": 244
},
{
"epoch": 1.1781522720433344,
"grad_norm": 0.1726984828710556,
"learning_rate": 6.0708534621578104e-06,
"loss": 10.4158,
"step": 245
},
{
"epoch": 1.1829671983147758,
"grad_norm": 0.22167733311653137,
"learning_rate": 6.05475040257649e-06,
"loss": 9.2583,
"step": 246
},
{
"epoch": 1.1877821245862172,
"grad_norm": 0.24326634407043457,
"learning_rate": 6.03864734299517e-06,
"loss": 10.1546,
"step": 247
},
{
"epoch": 1.1925970508576587,
"grad_norm": 0.20624417066574097,
"learning_rate": 6.0225442834138495e-06,
"loss": 10.3518,
"step": 248
},
{
"epoch": 1.1974119771291003,
"grad_norm": 0.22262895107269287,
"learning_rate": 6.006441223832528e-06,
"loss": 10.2671,
"step": 249
},
{
"epoch": 1.2022269034005417,
"grad_norm": 0.18244238197803497,
"learning_rate": 5.990338164251208e-06,
"loss": 10.3412,
"step": 250
},
{
"epoch": 1.2070418296719831,
"grad_norm": 0.12642191350460052,
"learning_rate": 5.974235104669888e-06,
"loss": 8.3136,
"step": 251
},
{
"epoch": 1.2118567559434246,
"grad_norm": 0.22949941456317902,
"learning_rate": 5.958132045088567e-06,
"loss": 9.8669,
"step": 252
},
{
"epoch": 1.216671682214866,
"grad_norm": 0.17824606597423553,
"learning_rate": 5.942028985507247e-06,
"loss": 8.9438,
"step": 253
},
{
"epoch": 1.2214866084863076,
"grad_norm": 0.21706126630306244,
"learning_rate": 5.925925925925926e-06,
"loss": 10.5671,
"step": 254
},
{
"epoch": 1.226301534757749,
"grad_norm": 0.1777815967798233,
"learning_rate": 5.9098228663446055e-06,
"loss": 9.0566,
"step": 255
},
{
"epoch": 1.2311164610291905,
"grad_norm": 0.16943249106407166,
"learning_rate": 5.893719806763285e-06,
"loss": 10.5991,
"step": 256
},
{
"epoch": 1.235931387300632,
"grad_norm": 0.19475321471691132,
"learning_rate": 5.877616747181965e-06,
"loss": 9.7751,
"step": 257
},
{
"epoch": 1.2407463135720733,
"grad_norm": 0.15499532222747803,
"learning_rate": 5.861513687600645e-06,
"loss": 9.2155,
"step": 258
},
{
"epoch": 1.245561239843515,
"grad_norm": 0.21997332572937012,
"learning_rate": 5.845410628019324e-06,
"loss": 9.9653,
"step": 259
},
{
"epoch": 1.2503761661149564,
"grad_norm": 0.2071482390165329,
"learning_rate": 5.829307568438004e-06,
"loss": 9.9446,
"step": 260
},
{
"epoch": 1.2551910923863978,
"grad_norm": 0.18931487202644348,
"learning_rate": 5.8132045088566835e-06,
"loss": 10.322,
"step": 261
},
{
"epoch": 1.2600060186578392,
"grad_norm": 0.14098307490348816,
"learning_rate": 5.797101449275363e-06,
"loss": 9.7707,
"step": 262
},
{
"epoch": 1.2648209449292809,
"grad_norm": 0.22090758383274078,
"learning_rate": 5.780998389694043e-06,
"loss": 9.702,
"step": 263
},
{
"epoch": 1.2696358712007223,
"grad_norm": 0.181729257106781,
"learning_rate": 5.764895330112722e-06,
"loss": 9.5123,
"step": 264
},
{
"epoch": 1.2744507974721637,
"grad_norm": 0.1258496791124344,
"learning_rate": 5.748792270531401e-06,
"loss": 9.0927,
"step": 265
},
{
"epoch": 1.2792657237436051,
"grad_norm": 0.21762683987617493,
"learning_rate": 5.732689210950081e-06,
"loss": 8.5398,
"step": 266
},
{
"epoch": 1.2840806500150466,
"grad_norm": 0.14968731999397278,
"learning_rate": 5.716586151368761e-06,
"loss": 9.486,
"step": 267
},
{
"epoch": 1.288895576286488,
"grad_norm": 0.17779159545898438,
"learning_rate": 5.70048309178744e-06,
"loss": 10.4383,
"step": 268
},
{
"epoch": 1.2937105025579296,
"grad_norm": 0.19466915726661682,
"learning_rate": 5.684380032206119e-06,
"loss": 10.2354,
"step": 269
},
{
"epoch": 1.298525428829371,
"grad_norm": 0.22139185667037964,
"learning_rate": 5.668276972624799e-06,
"loss": 9.7784,
"step": 270
},
{
"epoch": 1.3033403551008125,
"grad_norm": 0.21013078093528748,
"learning_rate": 5.652173913043479e-06,
"loss": 11.4605,
"step": 271
},
{
"epoch": 1.3081552813722541,
"grad_norm": 0.17095215618610382,
"learning_rate": 5.6360708534621574e-06,
"loss": 9.6968,
"step": 272
},
{
"epoch": 1.3129702076436955,
"grad_norm": 0.15703898668289185,
"learning_rate": 5.619967793880838e-06,
"loss": 8.9945,
"step": 273
},
{
"epoch": 1.317785133915137,
"grad_norm": 0.16166311502456665,
"learning_rate": 5.603864734299518e-06,
"loss": 8.9587,
"step": 274
},
{
"epoch": 1.3226000601865784,
"grad_norm": 0.18226633965969086,
"learning_rate": 5.587761674718197e-06,
"loss": 9.5976,
"step": 275
},
{
"epoch": 1.3274149864580198,
"grad_norm": 0.16516032814979553,
"learning_rate": 5.571658615136877e-06,
"loss": 8.9366,
"step": 276
},
{
"epoch": 1.3322299127294612,
"grad_norm": 0.18485237658023834,
"learning_rate": 5.555555555555557e-06,
"loss": 8.6867,
"step": 277
},
{
"epoch": 1.3370448390009029,
"grad_norm": 0.16183756291866302,
"learning_rate": 5.5394524959742355e-06,
"loss": 10.0091,
"step": 278
},
{
"epoch": 1.3418597652723443,
"grad_norm": 0.18236857652664185,
"learning_rate": 5.523349436392915e-06,
"loss": 11.0512,
"step": 279
},
{
"epoch": 1.3466746915437857,
"grad_norm": 0.16111883521080017,
"learning_rate": 5.507246376811595e-06,
"loss": 9.9661,
"step": 280
},
{
"epoch": 1.3514896178152271,
"grad_norm": 0.17416836321353912,
"learning_rate": 5.4911433172302745e-06,
"loss": 8.5023,
"step": 281
},
{
"epoch": 1.3563045440866688,
"grad_norm": 0.1845031976699829,
"learning_rate": 5.475040257648953e-06,
"loss": 8.1757,
"step": 282
},
{
"epoch": 1.3611194703581102,
"grad_norm": 0.14829057455062866,
"learning_rate": 5.458937198067633e-06,
"loss": 10.4167,
"step": 283
},
{
"epoch": 1.3659343966295516,
"grad_norm": 0.18102510273456573,
"learning_rate": 5.442834138486313e-06,
"loss": 10.1862,
"step": 284
},
{
"epoch": 1.370749322900993,
"grad_norm": 0.1877845823764801,
"learning_rate": 5.426731078904992e-06,
"loss": 10.4398,
"step": 285
},
{
"epoch": 1.3755642491724345,
"grad_norm": 0.19289150834083557,
"learning_rate": 5.410628019323671e-06,
"loss": 10.1863,
"step": 286
},
{
"epoch": 1.380379175443876,
"grad_norm": 0.14551950991153717,
"learning_rate": 5.394524959742351e-06,
"loss": 8.6225,
"step": 287
},
{
"epoch": 1.3851941017153175,
"grad_norm": 0.15998440980911255,
"learning_rate": 5.3784219001610306e-06,
"loss": 9.6626,
"step": 288
},
{
"epoch": 1.390009027986759,
"grad_norm": 0.15218336880207062,
"learning_rate": 5.362318840579711e-06,
"loss": 9.0365,
"step": 289
},
{
"epoch": 1.3948239542582004,
"grad_norm": 0.19268082082271576,
"learning_rate": 5.346215780998391e-06,
"loss": 9.069,
"step": 290
},
{
"epoch": 1.399638880529642,
"grad_norm": 0.15415695309638977,
"learning_rate": 5.3301127214170704e-06,
"loss": 10.0018,
"step": 291
},
{
"epoch": 1.4044538068010834,
"grad_norm": 0.1783796101808548,
"learning_rate": 5.314009661835749e-06,
"loss": 8.5794,
"step": 292
},
{
"epoch": 1.4092687330725249,
"grad_norm": 0.23539525270462036,
"learning_rate": 5.297906602254429e-06,
"loss": 9.2077,
"step": 293
},
{
"epoch": 1.4140836593439663,
"grad_norm": 0.19150039553642273,
"learning_rate": 5.281803542673109e-06,
"loss": 8.6828,
"step": 294
},
{
"epoch": 1.4188985856154077,
"grad_norm": 0.18820087611675262,
"learning_rate": 5.265700483091788e-06,
"loss": 9.4677,
"step": 295
},
{
"epoch": 1.4237135118868491,
"grad_norm": 0.5018635988235474,
"learning_rate": 5.249597423510467e-06,
"loss": 9.6455,
"step": 296
},
{
"epoch": 1.4285284381582908,
"grad_norm": 0.17721492052078247,
"learning_rate": 5.233494363929147e-06,
"loss": 8.3182,
"step": 297
},
{
"epoch": 1.4333433644297322,
"grad_norm": 0.20144477486610413,
"learning_rate": 5.2173913043478265e-06,
"loss": 9.1568,
"step": 298
},
{
"epoch": 1.4381582907011736,
"grad_norm": 0.18805253505706787,
"learning_rate": 5.201288244766506e-06,
"loss": 9.7496,
"step": 299
},
{
"epoch": 1.442973216972615,
"grad_norm": 0.1500595659017563,
"learning_rate": 5.185185185185185e-06,
"loss": 9.9708,
"step": 300
},
{
"epoch": 1.4477881432440567,
"grad_norm": 0.19444873929023743,
"learning_rate": 5.169082125603865e-06,
"loss": 9.5143,
"step": 301
},
{
"epoch": 1.4526030695154981,
"grad_norm": 0.18682818114757538,
"learning_rate": 5.152979066022544e-06,
"loss": 10.2596,
"step": 302
},
{
"epoch": 1.4574179957869395,
"grad_norm": 0.17984358966350555,
"learning_rate": 5.136876006441224e-06,
"loss": 10.2697,
"step": 303
},
{
"epoch": 1.462232922058381,
"grad_norm": 0.17564424872398376,
"learning_rate": 5.1207729468599045e-06,
"loss": 9.9508,
"step": 304
},
{
"epoch": 1.4670478483298224,
"grad_norm": 0.1954619437456131,
"learning_rate": 5.104669887278584e-06,
"loss": 10.603,
"step": 305
},
{
"epoch": 1.4718627746012638,
"grad_norm": 0.16032911837100983,
"learning_rate": 5.088566827697263e-06,
"loss": 10.1388,
"step": 306
},
{
"epoch": 1.4766777008727054,
"grad_norm": 0.18712233006954193,
"learning_rate": 5.072463768115943e-06,
"loss": 10.603,
"step": 307
},
{
"epoch": 1.4814926271441469,
"grad_norm": 0.18479761481285095,
"learning_rate": 5.056360708534622e-06,
"loss": 9.8074,
"step": 308
},
{
"epoch": 1.4863075534155883,
"grad_norm": 0.14700675010681152,
"learning_rate": 5.040257648953302e-06,
"loss": 10.4248,
"step": 309
},
{
"epoch": 1.4911224796870297,
"grad_norm": 0.13533058762550354,
"learning_rate": 5.024154589371981e-06,
"loss": 9.1913,
"step": 310
},
{
"epoch": 1.4959374059584714,
"grad_norm": 0.1617136150598526,
"learning_rate": 5.0080515297906606e-06,
"loss": 8.7997,
"step": 311
},
{
"epoch": 1.5007523322299128,
"grad_norm": 0.14999867975711823,
"learning_rate": 4.99194847020934e-06,
"loss": 10.7806,
"step": 312
},
{
"epoch": 1.5055672585013542,
"grad_norm": 0.1483631134033203,
"learning_rate": 4.97584541062802e-06,
"loss": 8.9508,
"step": 313
},
{
"epoch": 1.5103821847727956,
"grad_norm": 0.1401262730360031,
"learning_rate": 4.959742351046699e-06,
"loss": 9.3086,
"step": 314
},
{
"epoch": 1.515197111044237,
"grad_norm": 0.20340582728385925,
"learning_rate": 4.9436392914653784e-06,
"loss": 9.6934,
"step": 315
},
{
"epoch": 1.5200120373156785,
"grad_norm": 0.10809484124183655,
"learning_rate": 4.927536231884059e-06,
"loss": 8.5021,
"step": 316
},
{
"epoch": 1.52482696358712,
"grad_norm": 0.18179920315742493,
"learning_rate": 4.911433172302738e-06,
"loss": 8.7153,
"step": 317
},
{
"epoch": 1.5296418898585615,
"grad_norm": 0.1383148580789566,
"learning_rate": 4.8953301127214175e-06,
"loss": 10.6062,
"step": 318
},
{
"epoch": 1.534456816130003,
"grad_norm": 0.21121209859848022,
"learning_rate": 4.879227053140097e-06,
"loss": 8.7374,
"step": 319
},
{
"epoch": 1.5392717424014446,
"grad_norm": 0.19276529550552368,
"learning_rate": 4.863123993558777e-06,
"loss": 8.7942,
"step": 320
},
{
"epoch": 1.544086668672886,
"grad_norm": 0.18534629046916962,
"learning_rate": 4.847020933977456e-06,
"loss": 7.3924,
"step": 321
},
{
"epoch": 1.5489015949443274,
"grad_norm": 0.11499077826738358,
"learning_rate": 4.830917874396135e-06,
"loss": 9.042,
"step": 322
},
{
"epoch": 1.5537165212157689,
"grad_norm": 0.19323264062404633,
"learning_rate": 4.814814814814815e-06,
"loss": 9.5308,
"step": 323
},
{
"epoch": 1.5585314474872103,
"grad_norm": 0.163632333278656,
"learning_rate": 4.798711755233495e-06,
"loss": 9.5654,
"step": 324
},
{
"epoch": 1.5633463737586517,
"grad_norm": 0.24960660934448242,
"learning_rate": 4.782608695652174e-06,
"loss": 8.9639,
"step": 325
},
{
"epoch": 1.5681613000300931,
"grad_norm": 0.13659049570560455,
"learning_rate": 4.766505636070854e-06,
"loss": 9.2311,
"step": 326
},
{
"epoch": 1.5729762263015348,
"grad_norm": 0.19566506147384644,
"learning_rate": 4.750402576489534e-06,
"loss": 9.9125,
"step": 327
},
{
"epoch": 1.5777911525729762,
"grad_norm": 0.13559715449810028,
"learning_rate": 4.7342995169082125e-06,
"loss": 10.2432,
"step": 328
},
{
"epoch": 1.5826060788444178,
"grad_norm": 0.20595477521419525,
"learning_rate": 4.718196457326892e-06,
"loss": 9.3677,
"step": 329
},
{
"epoch": 1.5874210051158593,
"grad_norm": 0.1580948680639267,
"learning_rate": 4.702093397745572e-06,
"loss": 10.0316,
"step": 330
},
{
"epoch": 1.5922359313873007,
"grad_norm": 0.1536228209733963,
"learning_rate": 4.6859903381642516e-06,
"loss": 10.8169,
"step": 331
},
{
"epoch": 1.597050857658742,
"grad_norm": 0.17159651219844818,
"learning_rate": 4.669887278582931e-06,
"loss": 9.8849,
"step": 332
},
{
"epoch": 1.6018657839301835,
"grad_norm": 0.14754590392112732,
"learning_rate": 4.653784219001611e-06,
"loss": 8.2419,
"step": 333
},
{
"epoch": 1.606680710201625,
"grad_norm": 0.15272633731365204,
"learning_rate": 4.637681159420291e-06,
"loss": 10.358,
"step": 334
},
{
"epoch": 1.6114956364730664,
"grad_norm": 0.23571325838565826,
"learning_rate": 4.621578099838969e-06,
"loss": 8.9846,
"step": 335
},
{
"epoch": 1.616310562744508,
"grad_norm": 0.1520383059978485,
"learning_rate": 4.605475040257649e-06,
"loss": 9.627,
"step": 336
},
{
"epoch": 1.6211254890159494,
"grad_norm": 0.16789157688617706,
"learning_rate": 4.589371980676329e-06,
"loss": 8.9584,
"step": 337
},
{
"epoch": 1.6259404152873909,
"grad_norm": 0.23156379163265228,
"learning_rate": 4.5732689210950084e-06,
"loss": 9.4165,
"step": 338
},
{
"epoch": 1.6307553415588325,
"grad_norm": 0.2569849491119385,
"learning_rate": 4.557165861513688e-06,
"loss": 8.6545,
"step": 339
},
{
"epoch": 1.635570267830274,
"grad_norm": 0.1471448540687561,
"learning_rate": 4.541062801932368e-06,
"loss": 9.7279,
"step": 340
},
{
"epoch": 1.6403851941017153,
"grad_norm": 0.19168996810913086,
"learning_rate": 4.5249597423510475e-06,
"loss": 8.6823,
"step": 341
},
{
"epoch": 1.6452001203731568,
"grad_norm": 0.16900351643562317,
"learning_rate": 4.508856682769726e-06,
"loss": 9.3926,
"step": 342
},
{
"epoch": 1.6500150466445982,
"grad_norm": 0.1279803216457367,
"learning_rate": 4.492753623188406e-06,
"loss": 8.8222,
"step": 343
},
{
"epoch": 1.6548299729160396,
"grad_norm": 0.16592150926589966,
"learning_rate": 4.476650563607086e-06,
"loss": 9.3739,
"step": 344
},
{
"epoch": 1.659644899187481,
"grad_norm": 0.18117226660251617,
"learning_rate": 4.460547504025765e-06,
"loss": 8.6028,
"step": 345
},
{
"epoch": 1.6644598254589227,
"grad_norm": 0.15843939781188965,
"learning_rate": 4.444444444444444e-06,
"loss": 7.3552,
"step": 346
},
{
"epoch": 1.669274751730364,
"grad_norm": 0.1672333925962448,
"learning_rate": 4.428341384863125e-06,
"loss": 8.1781,
"step": 347
},
{
"epoch": 1.6740896780018057,
"grad_norm": 0.1798122376203537,
"learning_rate": 4.412238325281804e-06,
"loss": 9.5976,
"step": 348
},
{
"epoch": 1.6789046042732472,
"grad_norm": 0.15125727653503418,
"learning_rate": 4.396135265700483e-06,
"loss": 9.2915,
"step": 349
},
{
"epoch": 1.6837195305446886,
"grad_norm": 0.15909244120121002,
"learning_rate": 4.380032206119163e-06,
"loss": 8.5171,
"step": 350
},
{
"epoch": 1.68853445681613,
"grad_norm": 0.19835767149925232,
"learning_rate": 4.3639291465378425e-06,
"loss": 9.3835,
"step": 351
},
{
"epoch": 1.6933493830875714,
"grad_norm": 0.30680009722709656,
"learning_rate": 4.347826086956522e-06,
"loss": 9.3733,
"step": 352
},
{
"epoch": 1.6981643093590129,
"grad_norm": 0.13429707288742065,
"learning_rate": 4.331723027375201e-06,
"loss": 9.6266,
"step": 353
},
{
"epoch": 1.7029792356304543,
"grad_norm": 0.16423039138317108,
"learning_rate": 4.315619967793881e-06,
"loss": 9.4546,
"step": 354
},
{
"epoch": 1.707794161901896,
"grad_norm": 0.14078310132026672,
"learning_rate": 4.299516908212561e-06,
"loss": 9.1663,
"step": 355
},
{
"epoch": 1.7126090881733373,
"grad_norm": 0.2016141414642334,
"learning_rate": 4.28341384863124e-06,
"loss": 8.6698,
"step": 356
},
{
"epoch": 1.7174240144447788,
"grad_norm": 0.13229703903198242,
"learning_rate": 4.26731078904992e-06,
"loss": 8.5468,
"step": 357
},
{
"epoch": 1.7222389407162204,
"grad_norm": 0.22356487810611725,
"learning_rate": 4.251207729468599e-06,
"loss": 9.311,
"step": 358
},
{
"epoch": 1.7270538669876618,
"grad_norm": 0.19844292104244232,
"learning_rate": 4.235104669887279e-06,
"loss": 9.054,
"step": 359
},
{
"epoch": 1.7318687932591033,
"grad_norm": 0.18081983923912048,
"learning_rate": 4.219001610305958e-06,
"loss": 8.9678,
"step": 360
},
{
"epoch": 1.7366837195305447,
"grad_norm": 0.2216968685388565,
"learning_rate": 4.202898550724638e-06,
"loss": 8.6121,
"step": 361
},
{
"epoch": 1.741498645801986,
"grad_norm": 0.14121295511722565,
"learning_rate": 4.186795491143318e-06,
"loss": 9.2074,
"step": 362
},
{
"epoch": 1.7463135720734275,
"grad_norm": 0.148764505982399,
"learning_rate": 4.170692431561997e-06,
"loss": 9.1965,
"step": 363
},
{
"epoch": 1.751128498344869,
"grad_norm": 0.20818910002708435,
"learning_rate": 4.154589371980677e-06,
"loss": 8.5382,
"step": 364
},
{
"epoch": 1.7559434246163106,
"grad_norm": 0.1755458116531372,
"learning_rate": 4.138486312399356e-06,
"loss": 9.0389,
"step": 365
},
{
"epoch": 1.760758350887752,
"grad_norm": 0.15656408667564392,
"learning_rate": 4.122383252818036e-06,
"loss": 9.226,
"step": 366
},
{
"epoch": 1.7655732771591937,
"grad_norm": 0.14213398098945618,
"learning_rate": 4.106280193236716e-06,
"loss": 8.2302,
"step": 367
},
{
"epoch": 1.770388203430635,
"grad_norm": 0.1693073809146881,
"learning_rate": 4.0901771336553945e-06,
"loss": 9.6989,
"step": 368
},
{
"epoch": 1.7752031297020765,
"grad_norm": 0.15878278017044067,
"learning_rate": 4.074074074074074e-06,
"loss": 9.4632,
"step": 369
},
{
"epoch": 1.780018055973518,
"grad_norm": 0.22463774681091309,
"learning_rate": 4.057971014492754e-06,
"loss": 9.7328,
"step": 370
},
{
"epoch": 1.7848329822449593,
"grad_norm": 0.4724883437156677,
"learning_rate": 4.0418679549114335e-06,
"loss": 10.2544,
"step": 371
},
{
"epoch": 1.7896479085164008,
"grad_norm": 0.17619994282722473,
"learning_rate": 4.025764895330113e-06,
"loss": 9.6433,
"step": 372
},
{
"epoch": 1.7944628347878422,
"grad_norm": 0.16114237904548645,
"learning_rate": 4.009661835748793e-06,
"loss": 10.651,
"step": 373
},
{
"epoch": 1.7992777610592838,
"grad_norm": 0.2053680568933487,
"learning_rate": 3.9935587761674725e-06,
"loss": 8.8208,
"step": 374
},
{
"epoch": 1.8040926873307253,
"grad_norm": 0.17200101912021637,
"learning_rate": 3.977455716586151e-06,
"loss": 8.9841,
"step": 375
},
{
"epoch": 1.8089076136021667,
"grad_norm": 0.12033673375844955,
"learning_rate": 3.961352657004831e-06,
"loss": 8.552,
"step": 376
},
{
"epoch": 1.8137225398736083,
"grad_norm": 0.17469695210456848,
"learning_rate": 3.945249597423511e-06,
"loss": 8.6438,
"step": 377
},
{
"epoch": 1.8185374661450497,
"grad_norm": 0.19993340969085693,
"learning_rate": 3.92914653784219e-06,
"loss": 9.326,
"step": 378
},
{
"epoch": 1.8233523924164912,
"grad_norm": 0.18282270431518555,
"learning_rate": 3.91304347826087e-06,
"loss": 8.6382,
"step": 379
},
{
"epoch": 1.8281673186879326,
"grad_norm": 0.21918214857578278,
"learning_rate": 3.89694041867955e-06,
"loss": 9.7637,
"step": 380
},
{
"epoch": 1.832982244959374,
"grad_norm": 0.19311483204364777,
"learning_rate": 3.880837359098229e-06,
"loss": 9.7215,
"step": 381
},
{
"epoch": 1.8377971712308154,
"grad_norm": 0.2024223506450653,
"learning_rate": 3.864734299516908e-06,
"loss": 9.1813,
"step": 382
},
{
"epoch": 1.8426120975022569,
"grad_norm": 0.15196166932582855,
"learning_rate": 3.848631239935588e-06,
"loss": 9.4212,
"step": 383
},
{
"epoch": 1.8474270237736985,
"grad_norm": 0.20014698803424835,
"learning_rate": 3.832528180354268e-06,
"loss": 9.0854,
"step": 384
},
{
"epoch": 1.85224195004514,
"grad_norm": 0.2045230120420456,
"learning_rate": 3.816425120772947e-06,
"loss": 9.3168,
"step": 385
},
{
"epoch": 1.8570568763165816,
"grad_norm": 0.13044817745685577,
"learning_rate": 3.800322061191627e-06,
"loss": 8.4085,
"step": 386
},
{
"epoch": 1.861871802588023,
"grad_norm": 0.19362546503543854,
"learning_rate": 3.7842190016103066e-06,
"loss": 8.5539,
"step": 387
},
{
"epoch": 1.8666867288594644,
"grad_norm": 0.19143155217170715,
"learning_rate": 3.768115942028986e-06,
"loss": 9.0545,
"step": 388
},
{
"epoch": 1.8715016551309058,
"grad_norm": 0.18278856575489044,
"learning_rate": 3.7520128824476656e-06,
"loss": 8.6361,
"step": 389
},
{
"epoch": 1.8763165814023472,
"grad_norm": 0.20836183428764343,
"learning_rate": 3.735909822866345e-06,
"loss": 9.0898,
"step": 390
},
{
"epoch": 1.8811315076737887,
"grad_norm": 0.18853327631950378,
"learning_rate": 3.7198067632850245e-06,
"loss": 9.8428,
"step": 391
},
{
"epoch": 1.88594643394523,
"grad_norm": 0.13650333881378174,
"learning_rate": 3.7037037037037037e-06,
"loss": 8.8437,
"step": 392
},
{
"epoch": 1.8907613602166715,
"grad_norm": 0.20635420083999634,
"learning_rate": 3.6876006441223834e-06,
"loss": 9.0691,
"step": 393
},
{
"epoch": 1.8955762864881132,
"grad_norm": 0.16736768186092377,
"learning_rate": 3.6714975845410635e-06,
"loss": 9.318,
"step": 394
},
{
"epoch": 1.9003912127595546,
"grad_norm": 0.21544639766216278,
"learning_rate": 3.6553945249597428e-06,
"loss": 8.9387,
"step": 395
},
{
"epoch": 1.9052061390309962,
"grad_norm": 0.17389844357967377,
"learning_rate": 3.6392914653784224e-06,
"loss": 9.549,
"step": 396
},
{
"epoch": 1.9100210653024376,
"grad_norm": 0.21728019416332245,
"learning_rate": 3.6231884057971017e-06,
"loss": 8.0753,
"step": 397
},
{
"epoch": 1.914835991573879,
"grad_norm": 0.199959859251976,
"learning_rate": 3.6070853462157814e-06,
"loss": 9.9683,
"step": 398
},
{
"epoch": 1.9196509178453205,
"grad_norm": 0.16808640956878662,
"learning_rate": 3.5909822866344606e-06,
"loss": 9.2667,
"step": 399
},
{
"epoch": 1.924465844116762,
"grad_norm": 0.15371474623680115,
"learning_rate": 3.5748792270531403e-06,
"loss": 8.9782,
"step": 400
},
{
"epoch": 1.9292807703882033,
"grad_norm": 0.22420039772987366,
"learning_rate": 3.5587761674718204e-06,
"loss": 9.5041,
"step": 401
},
{
"epoch": 1.9340956966596448,
"grad_norm": 0.19234929978847504,
"learning_rate": 3.5426731078904997e-06,
"loss": 8.8785,
"step": 402
},
{
"epoch": 1.9389106229310864,
"grad_norm": 0.13435740768909454,
"learning_rate": 3.5265700483091793e-06,
"loss": 9.3544,
"step": 403
},
{
"epoch": 1.9437255492025278,
"grad_norm": 0.21900928020477295,
"learning_rate": 3.5104669887278586e-06,
"loss": 8.1942,
"step": 404
},
{
"epoch": 1.9485404754739695,
"grad_norm": 0.16180120408535004,
"learning_rate": 3.4943639291465383e-06,
"loss": 9.4132,
"step": 405
},
{
"epoch": 1.953355401745411,
"grad_norm": 0.2743014991283417,
"learning_rate": 3.4782608695652175e-06,
"loss": 10.1588,
"step": 406
},
{
"epoch": 1.9581703280168523,
"grad_norm": 0.14160144329071045,
"learning_rate": 3.462157809983897e-06,
"loss": 9.0612,
"step": 407
},
{
"epoch": 1.9629852542882937,
"grad_norm": 0.1383216828107834,
"learning_rate": 3.4460547504025764e-06,
"loss": 7.8391,
"step": 408
},
{
"epoch": 1.9678001805597352,
"grad_norm": 0.16990961134433746,
"learning_rate": 3.4299516908212565e-06,
"loss": 9.6392,
"step": 409
},
{
"epoch": 1.9726151068311766,
"grad_norm": 0.17103661596775055,
"learning_rate": 3.4138486312399362e-06,
"loss": 9.8119,
"step": 410
},
{
"epoch": 1.977430033102618,
"grad_norm": 0.13866282999515533,
"learning_rate": 3.3977455716586155e-06,
"loss": 8.2033,
"step": 411
},
{
"epoch": 1.9822449593740594,
"grad_norm": 0.21080395579338074,
"learning_rate": 3.381642512077295e-06,
"loss": 10.3113,
"step": 412
},
{
"epoch": 1.987059885645501,
"grad_norm": 0.19845469295978546,
"learning_rate": 3.3655394524959744e-06,
"loss": 8.2103,
"step": 413
},
{
"epoch": 1.9918748119169425,
"grad_norm": 0.1903708279132843,
"learning_rate": 3.349436392914654e-06,
"loss": 9.1371,
"step": 414
},
{
"epoch": 1.9966897381883841,
"grad_norm": 0.16223041713237762,
"learning_rate": 3.3333333333333333e-06,
"loss": 7.368,
"step": 415
},
{
"epoch": 2.0,
"grad_norm": 0.14595092833042145,
"learning_rate": 3.317230273752013e-06,
"loss": 5.8598,
"step": 416
},
{
"epoch": 2.0048149262714414,
"grad_norm": 0.15010525286197662,
"learning_rate": 3.301127214170693e-06,
"loss": 8.1315,
"step": 417
},
{
"epoch": 2.009629852542883,
"grad_norm": 0.23141905665397644,
"learning_rate": 3.2850241545893724e-06,
"loss": 9.3878,
"step": 418
},
{
"epoch": 2.0144447788143243,
"grad_norm": 0.11268898099660873,
"learning_rate": 3.268921095008052e-06,
"loss": 7.8098,
"step": 419
},
{
"epoch": 2.0192597050857657,
"grad_norm": 0.16212859749794006,
"learning_rate": 3.2528180354267313e-06,
"loss": 8.5319,
"step": 420
},
{
"epoch": 2.0240746313572076,
"grad_norm": 0.1565706580877304,
"learning_rate": 3.236714975845411e-06,
"loss": 7.8942,
"step": 421
},
{
"epoch": 2.028889557628649,
"grad_norm": 0.1680455058813095,
"learning_rate": 3.22061191626409e-06,
"loss": 8.2839,
"step": 422
},
{
"epoch": 2.0337044839000904,
"grad_norm": 0.2539815306663513,
"learning_rate": 3.20450885668277e-06,
"loss": 8.9937,
"step": 423
},
{
"epoch": 2.038519410171532,
"grad_norm": 0.238030806183815,
"learning_rate": 3.188405797101449e-06,
"loss": 8.4321,
"step": 424
},
{
"epoch": 2.0433343364429732,
"grad_norm": 0.19473034143447876,
"learning_rate": 3.1723027375201292e-06,
"loss": 8.0307,
"step": 425
},
{
"epoch": 2.0481492627144147,
"grad_norm": 0.16554652154445648,
"learning_rate": 3.156199677938809e-06,
"loss": 9.5945,
"step": 426
},
{
"epoch": 2.052964188985856,
"grad_norm": 0.19130951166152954,
"learning_rate": 3.140096618357488e-06,
"loss": 8.0234,
"step": 427
},
{
"epoch": 2.0577791152572975,
"grad_norm": 0.14681276679039001,
"learning_rate": 3.123993558776168e-06,
"loss": 8.6784,
"step": 428
},
{
"epoch": 2.062594041528739,
"grad_norm": 0.10328257828950882,
"learning_rate": 3.107890499194847e-06,
"loss": 8.0287,
"step": 429
},
{
"epoch": 2.0674089678001804,
"grad_norm": 0.19125495851039886,
"learning_rate": 3.0917874396135268e-06,
"loss": 8.0046,
"step": 430
},
{
"epoch": 2.072223894071622,
"grad_norm": 0.1793103963136673,
"learning_rate": 3.075684380032206e-06,
"loss": 7.0518,
"step": 431
},
{
"epoch": 2.0770388203430636,
"grad_norm": 0.2568497657775879,
"learning_rate": 3.059581320450886e-06,
"loss": 8.2794,
"step": 432
},
{
"epoch": 2.081853746614505,
"grad_norm": 0.18120069801807404,
"learning_rate": 3.043478260869566e-06,
"loss": 10.1022,
"step": 433
},
{
"epoch": 2.0866686728859465,
"grad_norm": 0.27532005310058594,
"learning_rate": 3.027375201288245e-06,
"loss": 9.3059,
"step": 434
},
{
"epoch": 2.091483599157388,
"grad_norm": 0.15648192167282104,
"learning_rate": 3.0112721417069247e-06,
"loss": 7.8617,
"step": 435
},
{
"epoch": 2.0962985254288293,
"grad_norm": 0.17381350696086884,
"learning_rate": 2.995169082125604e-06,
"loss": 9.0082,
"step": 436
},
{
"epoch": 2.1011134517002708,
"grad_norm": 0.13711951673030853,
"learning_rate": 2.9790660225442837e-06,
"loss": 7.764,
"step": 437
},
{
"epoch": 2.105928377971712,
"grad_norm": 0.23948128521442413,
"learning_rate": 2.962962962962963e-06,
"loss": 8.4041,
"step": 438
},
{
"epoch": 2.1107433042431536,
"grad_norm": 0.15631070733070374,
"learning_rate": 2.9468599033816426e-06,
"loss": 8.1708,
"step": 439
},
{
"epoch": 2.115558230514595,
"grad_norm": 0.1608411967754364,
"learning_rate": 2.9307568438003227e-06,
"loss": 8.301,
"step": 440
},
{
"epoch": 2.120373156786037,
"grad_norm": 0.16660411655902863,
"learning_rate": 2.914653784219002e-06,
"loss": 9.5365,
"step": 441
},
{
"epoch": 2.1251880830574783,
"grad_norm": 0.17191386222839355,
"learning_rate": 2.8985507246376816e-06,
"loss": 8.9256,
"step": 442
},
{
"epoch": 2.1300030093289197,
"grad_norm": 0.18492081761360168,
"learning_rate": 2.882447665056361e-06,
"loss": 9.0577,
"step": 443
},
{
"epoch": 2.134817935600361,
"grad_norm": 0.2561168670654297,
"learning_rate": 2.8663446054750405e-06,
"loss": 8.8758,
"step": 444
},
{
"epoch": 2.1396328618718026,
"grad_norm": 0.1588340848684311,
"learning_rate": 2.85024154589372e-06,
"loss": 8.1364,
"step": 445
},
{
"epoch": 2.144447788143244,
"grad_norm": 0.1650805026292801,
"learning_rate": 2.8341384863123995e-06,
"loss": 8.337,
"step": 446
},
{
"epoch": 2.1492627144146854,
"grad_norm": 0.2011885941028595,
"learning_rate": 2.8180354267310787e-06,
"loss": 8.6296,
"step": 447
},
{
"epoch": 2.154077640686127,
"grad_norm": 0.18557001650333405,
"learning_rate": 2.801932367149759e-06,
"loss": 8.6218,
"step": 448
},
{
"epoch": 2.1588925669575683,
"grad_norm": 0.1598547399044037,
"learning_rate": 2.7858293075684385e-06,
"loss": 8.4741,
"step": 449
},
{
"epoch": 2.16370749322901,
"grad_norm": 0.17089636623859406,
"learning_rate": 2.7697262479871177e-06,
"loss": 9.0788,
"step": 450
},
{
"epoch": 2.1685224195004515,
"grad_norm": 0.1817985475063324,
"learning_rate": 2.7536231884057974e-06,
"loss": 9.9717,
"step": 451
},
{
"epoch": 2.173337345771893,
"grad_norm": 0.23914600908756256,
"learning_rate": 2.7375201288244767e-06,
"loss": 8.7548,
"step": 452
},
{
"epoch": 2.1781522720433344,
"grad_norm": 0.17113572359085083,
"learning_rate": 2.7214170692431564e-06,
"loss": 7.7566,
"step": 453
},
{
"epoch": 2.182967198314776,
"grad_norm": 0.14485716819763184,
"learning_rate": 2.7053140096618356e-06,
"loss": 8.9532,
"step": 454
},
{
"epoch": 2.1877821245862172,
"grad_norm": 0.14129236340522766,
"learning_rate": 2.6892109500805153e-06,
"loss": 9.2833,
"step": 455
},
{
"epoch": 2.1925970508576587,
"grad_norm": 0.23692472279071808,
"learning_rate": 2.6731078904991954e-06,
"loss": 8.3895,
"step": 456
},
{
"epoch": 2.1974119771291,
"grad_norm": 0.16027197241783142,
"learning_rate": 2.6570048309178746e-06,
"loss": 7.7012,
"step": 457
},
{
"epoch": 2.2022269034005415,
"grad_norm": 0.1416737139225006,
"learning_rate": 2.6409017713365543e-06,
"loss": 9.0799,
"step": 458
},
{
"epoch": 2.207041829671983,
"grad_norm": 0.20678099989891052,
"learning_rate": 2.6247987117552336e-06,
"loss": 9.3679,
"step": 459
},
{
"epoch": 2.211856755943425,
"grad_norm": 0.1649148017168045,
"learning_rate": 2.6086956521739132e-06,
"loss": 8.0503,
"step": 460
},
{
"epoch": 2.216671682214866,
"grad_norm": 0.21159884333610535,
"learning_rate": 2.5925925925925925e-06,
"loss": 9.0968,
"step": 461
},
{
"epoch": 2.2214866084863076,
"grad_norm": 0.13705681264400482,
"learning_rate": 2.576489533011272e-06,
"loss": 8.9948,
"step": 462
},
{
"epoch": 2.226301534757749,
"grad_norm": 0.16624397039413452,
"learning_rate": 2.5603864734299523e-06,
"loss": 8.6079,
"step": 463
},
{
"epoch": 2.2311164610291905,
"grad_norm": 0.1475958675146103,
"learning_rate": 2.5442834138486315e-06,
"loss": 8.0187,
"step": 464
},
{
"epoch": 2.235931387300632,
"grad_norm": 0.13494673371315002,
"learning_rate": 2.528180354267311e-06,
"loss": 8.6545,
"step": 465
},
{
"epoch": 2.2407463135720733,
"grad_norm": 0.17623811960220337,
"learning_rate": 2.5120772946859904e-06,
"loss": 9.4341,
"step": 466
},
{
"epoch": 2.2455612398435147,
"grad_norm": 0.1706833392381668,
"learning_rate": 2.49597423510467e-06,
"loss": 8.7199,
"step": 467
},
{
"epoch": 2.2503761661149566,
"grad_norm": 0.1953025609254837,
"learning_rate": 2.4798711755233494e-06,
"loss": 8.9361,
"step": 468
},
{
"epoch": 2.255191092386398,
"grad_norm": 0.20142245292663574,
"learning_rate": 2.4637681159420295e-06,
"loss": 8.0552,
"step": 469
},
{
"epoch": 2.2600060186578395,
"grad_norm": 0.20138177275657654,
"learning_rate": 2.4476650563607087e-06,
"loss": 8.5942,
"step": 470
},
{
"epoch": 2.264820944929281,
"grad_norm": 0.16559800505638123,
"learning_rate": 2.4315619967793884e-06,
"loss": 8.8228,
"step": 471
},
{
"epoch": 2.2696358712007223,
"grad_norm": 0.19990870356559753,
"learning_rate": 2.4154589371980677e-06,
"loss": 8.8207,
"step": 472
},
{
"epoch": 2.2744507974721637,
"grad_norm": 0.21723681688308716,
"learning_rate": 2.3993558776167473e-06,
"loss": 8.4973,
"step": 473
},
{
"epoch": 2.279265723743605,
"grad_norm": 0.17915472388267517,
"learning_rate": 2.383252818035427e-06,
"loss": 9.6049,
"step": 474
},
{
"epoch": 2.2840806500150466,
"grad_norm": 0.16757084429264069,
"learning_rate": 2.3671497584541063e-06,
"loss": 9.7332,
"step": 475
},
{
"epoch": 2.288895576286488,
"grad_norm": 0.16891081631183624,
"learning_rate": 2.351046698872786e-06,
"loss": 8.8673,
"step": 476
},
{
"epoch": 2.2937105025579294,
"grad_norm": 0.20567509531974792,
"learning_rate": 2.3349436392914656e-06,
"loss": 7.8363,
"step": 477
},
{
"epoch": 2.298525428829371,
"grad_norm": 0.1999160349369049,
"learning_rate": 2.3188405797101453e-06,
"loss": 8.7728,
"step": 478
},
{
"epoch": 2.3033403551008127,
"grad_norm": 0.2348831444978714,
"learning_rate": 2.3027375201288245e-06,
"loss": 9.1277,
"step": 479
},
{
"epoch": 2.308155281372254,
"grad_norm": 0.1700768917798996,
"learning_rate": 2.2866344605475042e-06,
"loss": 8.6687,
"step": 480
},
{
"epoch": 2.3129702076436955,
"grad_norm": 0.16349351406097412,
"learning_rate": 2.270531400966184e-06,
"loss": 8.0606,
"step": 481
},
{
"epoch": 2.317785133915137,
"grad_norm": 0.1540592461824417,
"learning_rate": 2.254428341384863e-06,
"loss": 7.3249,
"step": 482
},
{
"epoch": 2.3226000601865784,
"grad_norm": 0.1774080991744995,
"learning_rate": 2.238325281803543e-06,
"loss": 8.3134,
"step": 483
},
{
"epoch": 2.32741498645802,
"grad_norm": 0.14969424903392792,
"learning_rate": 2.222222222222222e-06,
"loss": 7.8938,
"step": 484
},
{
"epoch": 2.3322299127294612,
"grad_norm": 0.20331765711307526,
"learning_rate": 2.206119162640902e-06,
"loss": 8.3293,
"step": 485
},
{
"epoch": 2.3370448390009027,
"grad_norm": 0.1849997490644455,
"learning_rate": 2.1900161030595814e-06,
"loss": 7.811,
"step": 486
},
{
"epoch": 2.341859765272344,
"grad_norm": 0.1732867807149887,
"learning_rate": 2.173913043478261e-06,
"loss": 10.516,
"step": 487
},
{
"epoch": 2.346674691543786,
"grad_norm": 0.21279215812683105,
"learning_rate": 2.1578099838969404e-06,
"loss": 9.1675,
"step": 488
},
{
"epoch": 2.3514896178152274,
"grad_norm": 0.1616515964269638,
"learning_rate": 2.14170692431562e-06,
"loss": 7.8694,
"step": 489
},
{
"epoch": 2.356304544086669,
"grad_norm": 0.1548496037721634,
"learning_rate": 2.1256038647342997e-06,
"loss": 9.4236,
"step": 490
},
{
"epoch": 2.36111947035811,
"grad_norm": 0.19034922122955322,
"learning_rate": 2.109500805152979e-06,
"loss": 8.6999,
"step": 491
},
{
"epoch": 2.3659343966295516,
"grad_norm": 0.15850062668323517,
"learning_rate": 2.093397745571659e-06,
"loss": 9.3389,
"step": 492
},
{
"epoch": 2.370749322900993,
"grad_norm": 0.17764140665531158,
"learning_rate": 2.0772946859903383e-06,
"loss": 8.3777,
"step": 493
},
{
"epoch": 2.3755642491724345,
"grad_norm": 0.1516241729259491,
"learning_rate": 2.061191626409018e-06,
"loss": 8.215,
"step": 494
},
{
"epoch": 2.380379175443876,
"grad_norm": 0.19306409358978271,
"learning_rate": 2.0450885668276972e-06,
"loss": 8.5159,
"step": 495
},
{
"epoch": 2.3851941017153173,
"grad_norm": 0.18563927710056305,
"learning_rate": 2.028985507246377e-06,
"loss": 9.1431,
"step": 496
},
{
"epoch": 2.3900090279867587,
"grad_norm": 0.2177901268005371,
"learning_rate": 2.0128824476650566e-06,
"loss": 8.7708,
"step": 497
},
{
"epoch": 2.3948239542582006,
"grad_norm": 0.18854300677776337,
"learning_rate": 1.9967793880837363e-06,
"loss": 7.7517,
"step": 498
},
{
"epoch": 2.399638880529642,
"grad_norm": 0.19311924278736115,
"learning_rate": 1.9806763285024155e-06,
"loss": 8.5485,
"step": 499
},
{
"epoch": 2.4044538068010834,
"grad_norm": 0.1653197556734085,
"learning_rate": 1.964573268921095e-06,
"loss": 8.2121,
"step": 500
},
{
"epoch": 2.409268733072525,
"grad_norm": 0.14467386901378632,
"learning_rate": 1.948470209339775e-06,
"loss": 7.212,
"step": 501
},
{
"epoch": 2.4140836593439663,
"grad_norm": 0.127033531665802,
"learning_rate": 1.932367149758454e-06,
"loss": 7.9942,
"step": 502
},
{
"epoch": 2.4188985856154077,
"grad_norm": 0.22416523098945618,
"learning_rate": 1.916264090177134e-06,
"loss": 9.2825,
"step": 503
},
{
"epoch": 2.423713511886849,
"grad_norm": 0.15797053277492523,
"learning_rate": 1.9001610305958135e-06,
"loss": 9.0045,
"step": 504
},
{
"epoch": 2.4285284381582906,
"grad_norm": 0.16567374765872955,
"learning_rate": 1.884057971014493e-06,
"loss": 7.8467,
"step": 505
},
{
"epoch": 2.433343364429732,
"grad_norm": 0.2187729775905609,
"learning_rate": 1.8679549114331724e-06,
"loss": 6.8691,
"step": 506
},
{
"epoch": 2.438158290701174,
"grad_norm": 0.1330510675907135,
"learning_rate": 1.8518518518518519e-06,
"loss": 8.6586,
"step": 507
},
{
"epoch": 2.4429732169726153,
"grad_norm": 0.18938250839710236,
"learning_rate": 1.8357487922705318e-06,
"loss": 8.7543,
"step": 508
},
{
"epoch": 2.4477881432440567,
"grad_norm": 0.16788271069526672,
"learning_rate": 1.8196457326892112e-06,
"loss": 7.234,
"step": 509
},
{
"epoch": 2.452603069515498,
"grad_norm": 0.13278517127037048,
"learning_rate": 1.8035426731078907e-06,
"loss": 8.4826,
"step": 510
},
{
"epoch": 2.4574179957869395,
"grad_norm": 0.12632611393928528,
"learning_rate": 1.7874396135265702e-06,
"loss": 8.6997,
"step": 511
},
{
"epoch": 2.462232922058381,
"grad_norm": 0.21339954435825348,
"learning_rate": 1.7713365539452498e-06,
"loss": 8.8112,
"step": 512
},
{
"epoch": 2.4670478483298224,
"grad_norm": 0.17126010358333588,
"learning_rate": 1.7552334943639293e-06,
"loss": 7.5743,
"step": 513
},
{
"epoch": 2.471862774601264,
"grad_norm": 0.13244563341140747,
"learning_rate": 1.7391304347826088e-06,
"loss": 7.7622,
"step": 514
},
{
"epoch": 2.4766777008727052,
"grad_norm": 0.21267832815647125,
"learning_rate": 1.7230273752012882e-06,
"loss": 6.7007,
"step": 515
},
{
"epoch": 2.4814926271441466,
"grad_norm": 0.12102889269590378,
"learning_rate": 1.7069243156199681e-06,
"loss": 9.1424,
"step": 516
},
{
"epoch": 2.4863075534155885,
"grad_norm": 0.13392595946788788,
"learning_rate": 1.6908212560386476e-06,
"loss": 8.5965,
"step": 517
},
{
"epoch": 2.49112247968703,
"grad_norm": 0.1512872725725174,
"learning_rate": 1.674718196457327e-06,
"loss": 8.3953,
"step": 518
},
{
"epoch": 2.4959374059584714,
"grad_norm": 0.13532410562038422,
"learning_rate": 1.6586151368760065e-06,
"loss": 8.184,
"step": 519
},
{
"epoch": 2.5007523322299128,
"grad_norm": 0.1816960871219635,
"learning_rate": 1.6425120772946862e-06,
"loss": 9.9139,
"step": 520
},
{
"epoch": 2.505567258501354,
"grad_norm": 0.11753327399492264,
"learning_rate": 1.6264090177133656e-06,
"loss": 8.4936,
"step": 521
},
{
"epoch": 2.5103821847727956,
"grad_norm": 0.20234891772270203,
"learning_rate": 1.610305958132045e-06,
"loss": 8.1004,
"step": 522
},
{
"epoch": 2.515197111044237,
"grad_norm": 0.14017826318740845,
"learning_rate": 1.5942028985507246e-06,
"loss": 8.4294,
"step": 523
},
{
"epoch": 2.5200120373156785,
"grad_norm": 0.1481131762266159,
"learning_rate": 1.5780998389694045e-06,
"loss": 8.3886,
"step": 524
},
{
"epoch": 2.5248269635871203,
"grad_norm": 0.2701749801635742,
"learning_rate": 1.561996779388084e-06,
"loss": 8.1065,
"step": 525
},
{
"epoch": 2.5296418898585618,
"grad_norm": 0.16109466552734375,
"learning_rate": 1.5458937198067634e-06,
"loss": 8.4212,
"step": 526
},
{
"epoch": 2.534456816130003,
"grad_norm": 0.18063953518867493,
"learning_rate": 1.529790660225443e-06,
"loss": 7.8353,
"step": 527
},
{
"epoch": 2.5392717424014446,
"grad_norm": 0.16267195343971252,
"learning_rate": 1.5136876006441225e-06,
"loss": 7.9062,
"step": 528
},
{
"epoch": 2.544086668672886,
"grad_norm": 0.1997467577457428,
"learning_rate": 1.497584541062802e-06,
"loss": 8.6919,
"step": 529
},
{
"epoch": 2.5489015949443274,
"grad_norm": 0.15415464341640472,
"learning_rate": 1.4814814814814815e-06,
"loss": 8.2469,
"step": 530
},
{
"epoch": 2.553716521215769,
"grad_norm": 0.1869962513446808,
"learning_rate": 1.4653784219001613e-06,
"loss": 8.216,
"step": 531
},
{
"epoch": 2.5585314474872103,
"grad_norm": 0.14521171152591705,
"learning_rate": 1.4492753623188408e-06,
"loss": 8.2956,
"step": 532
},
{
"epoch": 2.5633463737586517,
"grad_norm": 0.1761654019355774,
"learning_rate": 1.4331723027375203e-06,
"loss": 8.3107,
"step": 533
},
{
"epoch": 2.568161300030093,
"grad_norm": 0.1776813566684723,
"learning_rate": 1.4170692431561997e-06,
"loss": 7.2249,
"step": 534
},
{
"epoch": 2.5729762263015346,
"grad_norm": 0.19041168689727783,
"learning_rate": 1.4009661835748794e-06,
"loss": 8.6567,
"step": 535
},
{
"epoch": 2.577791152572976,
"grad_norm": 0.1729832887649536,
"learning_rate": 1.3848631239935589e-06,
"loss": 9.1775,
"step": 536
},
{
"epoch": 2.582606078844418,
"grad_norm": 0.1917349100112915,
"learning_rate": 1.3687600644122383e-06,
"loss": 8.1724,
"step": 537
},
{
"epoch": 2.5874210051158593,
"grad_norm": 0.19829866290092468,
"learning_rate": 1.3526570048309178e-06,
"loss": 9.4741,
"step": 538
},
{
"epoch": 2.5922359313873007,
"grad_norm": 0.17467886209487915,
"learning_rate": 1.3365539452495977e-06,
"loss": 8.3608,
"step": 539
},
{
"epoch": 2.597050857658742,
"grad_norm": 0.25771814584732056,
"learning_rate": 1.3204508856682772e-06,
"loss": 8.5837,
"step": 540
},
{
"epoch": 2.6018657839301835,
"grad_norm": 0.13524986803531647,
"learning_rate": 1.3043478260869566e-06,
"loss": 7.8586,
"step": 541
},
{
"epoch": 2.606680710201625,
"grad_norm": 0.20528331398963928,
"learning_rate": 1.288244766505636e-06,
"loss": 8.9202,
"step": 542
},
{
"epoch": 2.6114956364730664,
"grad_norm": 0.18491816520690918,
"learning_rate": 1.2721417069243158e-06,
"loss": 8.1091,
"step": 543
},
{
"epoch": 2.6163105627445082,
"grad_norm": 0.14208512008190155,
"learning_rate": 1.2560386473429952e-06,
"loss": 8.8829,
"step": 544
},
{
"epoch": 2.6211254890159497,
"grad_norm": 0.22715114057064056,
"learning_rate": 1.2399355877616747e-06,
"loss": 8.4472,
"step": 545
},
{
"epoch": 2.625940415287391,
"grad_norm": 0.18286040425300598,
"learning_rate": 1.2238325281803544e-06,
"loss": 8.129,
"step": 546
},
{
"epoch": 2.6307553415588325,
"grad_norm": 0.18549402058124542,
"learning_rate": 1.2077294685990338e-06,
"loss": 8.3491,
"step": 547
},
{
"epoch": 2.635570267830274,
"grad_norm": 0.16227751970291138,
"learning_rate": 1.1916264090177135e-06,
"loss": 8.0191,
"step": 548
},
{
"epoch": 2.6403851941017153,
"grad_norm": 0.17795391380786896,
"learning_rate": 1.175523349436393e-06,
"loss": 7.1223,
"step": 549
},
{
"epoch": 2.6452001203731568,
"grad_norm": 0.17126573622226715,
"learning_rate": 1.1594202898550726e-06,
"loss": 8.2455,
"step": 550
},
{
"epoch": 2.650015046644598,
"grad_norm": 0.17369426786899567,
"learning_rate": 1.1433172302737521e-06,
"loss": 9.6182,
"step": 551
},
{
"epoch": 2.6548299729160396,
"grad_norm": 0.14956361055374146,
"learning_rate": 1.1272141706924316e-06,
"loss": 7.7231,
"step": 552
},
{
"epoch": 2.659644899187481,
"grad_norm": 0.17787741124629974,
"learning_rate": 1.111111111111111e-06,
"loss": 8.531,
"step": 553
},
{
"epoch": 2.6644598254589225,
"grad_norm": 0.16423143446445465,
"learning_rate": 1.0950080515297907e-06,
"loss": 9.07,
"step": 554
},
{
"epoch": 2.669274751730364,
"grad_norm": 0.18575292825698853,
"learning_rate": 1.0789049919484702e-06,
"loss": 9.2116,
"step": 555
},
{
"epoch": 2.6740896780018057,
"grad_norm": 0.1774529069662094,
"learning_rate": 1.0628019323671499e-06,
"loss": 7.6522,
"step": 556
},
{
"epoch": 2.678904604273247,
"grad_norm": 0.12618403136730194,
"learning_rate": 1.0466988727858295e-06,
"loss": 8.2429,
"step": 557
},
{
"epoch": 2.6837195305446886,
"grad_norm": 0.1379764825105667,
"learning_rate": 1.030595813204509e-06,
"loss": 9.0101,
"step": 558
},
{
"epoch": 2.68853445681613,
"grad_norm": 0.1804221123456955,
"learning_rate": 1.0144927536231885e-06,
"loss": 7.1618,
"step": 559
},
{
"epoch": 2.6933493830875714,
"grad_norm": 0.2020816057920456,
"learning_rate": 9.983896940418681e-07,
"loss": 8.0899,
"step": 560
},
{
"epoch": 2.698164309359013,
"grad_norm": 0.1975187063217163,
"learning_rate": 9.822866344605476e-07,
"loss": 6.9638,
"step": 561
},
{
"epoch": 2.7029792356304543,
"grad_norm": 0.21582917869091034,
"learning_rate": 9.66183574879227e-07,
"loss": 8.3501,
"step": 562
},
{
"epoch": 2.707794161901896,
"grad_norm": 0.1378657966852188,
"learning_rate": 9.500805152979067e-07,
"loss": 7.8302,
"step": 563
},
{
"epoch": 2.7126090881733376,
"grad_norm": 0.17029066383838654,
"learning_rate": 9.339774557165862e-07,
"loss": 8.3873,
"step": 564
},
{
"epoch": 2.717424014444779,
"grad_norm": 0.1723220944404602,
"learning_rate": 9.178743961352659e-07,
"loss": 7.8308,
"step": 565
},
{
"epoch": 2.7222389407162204,
"grad_norm": 0.15351563692092896,
"learning_rate": 9.017713365539453e-07,
"loss": 7.8458,
"step": 566
},
{
"epoch": 2.727053866987662,
"grad_norm": 0.15215708315372467,
"learning_rate": 8.856682769726249e-07,
"loss": 8.4561,
"step": 567
},
{
"epoch": 2.7318687932591033,
"grad_norm": 0.18015427887439728,
"learning_rate": 8.695652173913044e-07,
"loss": 7.986,
"step": 568
},
{
"epoch": 2.7366837195305447,
"grad_norm": 0.18228279054164886,
"learning_rate": 8.534621578099841e-07,
"loss": 8.3809,
"step": 569
},
{
"epoch": 2.741498645801986,
"grad_norm": 0.17855818569660187,
"learning_rate": 8.373590982286635e-07,
"loss": 7.2222,
"step": 570
},
{
"epoch": 2.7463135720734275,
"grad_norm": 0.12565724551677704,
"learning_rate": 8.212560386473431e-07,
"loss": 8.319,
"step": 571
},
{
"epoch": 2.751128498344869,
"grad_norm": 0.1549467146396637,
"learning_rate": 8.051529790660226e-07,
"loss": 7.4813,
"step": 572
},
{
"epoch": 2.7559434246163104,
"grad_norm": 0.19094257056713104,
"learning_rate": 7.890499194847022e-07,
"loss": 7.5127,
"step": 573
},
{
"epoch": 2.760758350887752,
"grad_norm": 0.18528102338314056,
"learning_rate": 7.729468599033817e-07,
"loss": 8.4165,
"step": 574
},
{
"epoch": 2.7655732771591937,
"grad_norm": 0.17467372119426727,
"learning_rate": 7.568438003220613e-07,
"loss": 8.4328,
"step": 575
},
{
"epoch": 2.770388203430635,
"grad_norm": 0.1786053627729416,
"learning_rate": 7.407407407407407e-07,
"loss": 7.6748,
"step": 576
},
{
"epoch": 2.7752031297020765,
"grad_norm": 0.2303641140460968,
"learning_rate": 7.246376811594204e-07,
"loss": 7.6819,
"step": 577
},
{
"epoch": 2.780018055973518,
"grad_norm": 0.20672529935836792,
"learning_rate": 7.085346215780999e-07,
"loss": 7.6466,
"step": 578
},
{
"epoch": 2.7848329822449593,
"grad_norm": 0.20678630471229553,
"learning_rate": 6.924315619967794e-07,
"loss": 7.83,
"step": 579
},
{
"epoch": 2.7896479085164008,
"grad_norm": 0.22579342126846313,
"learning_rate": 6.763285024154589e-07,
"loss": 8.352,
"step": 580
},
{
"epoch": 2.794462834787842,
"grad_norm": 0.21970775723457336,
"learning_rate": 6.602254428341386e-07,
"loss": 9.0786,
"step": 581
},
{
"epoch": 2.799277761059284,
"grad_norm": 0.15649309754371643,
"learning_rate": 6.44122383252818e-07,
"loss": 8.0365,
"step": 582
},
{
"epoch": 2.8040926873307255,
"grad_norm": 0.15020275115966797,
"learning_rate": 6.280193236714976e-07,
"loss": 7.5352,
"step": 583
},
{
"epoch": 2.808907613602167,
"grad_norm": 0.1699695736169815,
"learning_rate": 6.119162640901772e-07,
"loss": 7.9234,
"step": 584
},
{
"epoch": 2.8137225398736083,
"grad_norm": 0.14597013592720032,
"learning_rate": 5.958132045088568e-07,
"loss": 8.406,
"step": 585
},
{
"epoch": 2.8185374661450497,
"grad_norm": 0.1936945766210556,
"learning_rate": 5.797101449275363e-07,
"loss": 7.1909,
"step": 586
},
{
"epoch": 2.823352392416491,
"grad_norm": 0.1677147001028061,
"learning_rate": 5.636070853462158e-07,
"loss": 8.3866,
"step": 587
},
{
"epoch": 2.8281673186879326,
"grad_norm": 0.1816486269235611,
"learning_rate": 5.475040257648954e-07,
"loss": 8.0837,
"step": 588
},
{
"epoch": 2.832982244959374,
"grad_norm": 0.18202394247055054,
"learning_rate": 5.314009661835749e-07,
"loss": 9.1406,
"step": 589
},
{
"epoch": 2.8377971712308154,
"grad_norm": 0.2390686720609665,
"learning_rate": 5.152979066022545e-07,
"loss": 8.5755,
"step": 590
},
{
"epoch": 2.842612097502257,
"grad_norm": 0.18315307796001434,
"learning_rate": 4.991948470209341e-07,
"loss": 8.1501,
"step": 591
},
{
"epoch": 2.8474270237736983,
"grad_norm": 0.17412015795707703,
"learning_rate": 4.830917874396135e-07,
"loss": 8.2024,
"step": 592
},
{
"epoch": 2.8522419500451397,
"grad_norm": 0.18761633336544037,
"learning_rate": 4.669887278582931e-07,
"loss": 7.597,
"step": 593
},
{
"epoch": 2.8570568763165816,
"grad_norm": 0.1563250869512558,
"learning_rate": 4.5088566827697267e-07,
"loss": 8.4617,
"step": 594
},
{
"epoch": 2.861871802588023,
"grad_norm": 0.13112574815750122,
"learning_rate": 4.347826086956522e-07,
"loss": 7.6717,
"step": 595
},
{
"epoch": 2.8666867288594644,
"grad_norm": 0.1944950670003891,
"learning_rate": 4.1867954911433176e-07,
"loss": 8.5494,
"step": 596
},
{
"epoch": 2.871501655130906,
"grad_norm": 0.18215830624103546,
"learning_rate": 4.025764895330113e-07,
"loss": 8.4453,
"step": 597
},
{
"epoch": 2.8763165814023472,
"grad_norm": 0.15392394363880157,
"learning_rate": 3.8647342995169085e-07,
"loss": 7.0528,
"step": 598
},
{
"epoch": 2.8811315076737887,
"grad_norm": 0.17069800198078156,
"learning_rate": 3.7037037037037036e-07,
"loss": 9.8743,
"step": 599
},
{
"epoch": 2.88594643394523,
"grad_norm": 0.13186608254909515,
"learning_rate": 3.5426731078904993e-07,
"loss": 7.9741,
"step": 600
},
{
"epoch": 2.8907613602166715,
"grad_norm": 0.15300041437149048,
"learning_rate": 3.3816425120772945e-07,
"loss": 8.788,
"step": 601
},
{
"epoch": 2.8955762864881134,
"grad_norm": 0.15090717375278473,
"learning_rate": 3.22061191626409e-07,
"loss": 7.801,
"step": 602
},
{
"epoch": 2.900391212759555,
"grad_norm": 0.1606573611497879,
"learning_rate": 3.059581320450886e-07,
"loss": 8.5165,
"step": 603
},
{
"epoch": 2.9052061390309962,
"grad_norm": 0.15746456384658813,
"learning_rate": 2.8985507246376816e-07,
"loss": 7.3699,
"step": 604
},
{
"epoch": 2.9100210653024376,
"grad_norm": 0.1550646871328354,
"learning_rate": 2.737520128824477e-07,
"loss": 7.6848,
"step": 605
},
{
"epoch": 2.914835991573879,
"grad_norm": 0.14871163666248322,
"learning_rate": 2.5764895330112725e-07,
"loss": 7.6812,
"step": 606
},
{
"epoch": 2.9196509178453205,
"grad_norm": 0.2426673322916031,
"learning_rate": 2.4154589371980677e-07,
"loss": 7.7157,
"step": 607
},
{
"epoch": 2.924465844116762,
"grad_norm": 0.19695597887039185,
"learning_rate": 2.2544283413848634e-07,
"loss": 7.9716,
"step": 608
},
{
"epoch": 2.9292807703882033,
"grad_norm": 0.18192477524280548,
"learning_rate": 2.0933977455716588e-07,
"loss": 7.4395,
"step": 609
},
{
"epoch": 2.9340956966596448,
"grad_norm": 0.18087869882583618,
"learning_rate": 1.9323671497584542e-07,
"loss": 7.6177,
"step": 610
},
{
"epoch": 2.938910622931086,
"grad_norm": 0.1489817202091217,
"learning_rate": 1.7713365539452497e-07,
"loss": 7.4342,
"step": 611
},
{
"epoch": 2.9437255492025276,
"grad_norm": 0.12941974401474,
"learning_rate": 1.610305958132045e-07,
"loss": 9.0655,
"step": 612
},
{
"epoch": 2.9485404754739695,
"grad_norm": 0.1680421680212021,
"learning_rate": 1.4492753623188408e-07,
"loss": 7.2567,
"step": 613
},
{
"epoch": 2.953355401745411,
"grad_norm": 0.18065397441387177,
"learning_rate": 1.2882447665056362e-07,
"loss": 7.9862,
"step": 614
},
{
"epoch": 2.9581703280168523,
"grad_norm": 0.1599837988615036,
"learning_rate": 1.1272141706924317e-07,
"loss": 8.2424,
"step": 615
},
{
"epoch": 2.9629852542882937,
"grad_norm": 0.1959857940673828,
"learning_rate": 9.661835748792271e-08,
"loss": 7.4787,
"step": 616
},
{
"epoch": 2.967800180559735,
"grad_norm": 0.15649034082889557,
"learning_rate": 8.051529790660226e-08,
"loss": 8.1575,
"step": 617
},
{
"epoch": 2.9726151068311766,
"grad_norm": 0.1679297834634781,
"learning_rate": 6.441223832528181e-08,
"loss": 7.2253,
"step": 618
},
{
"epoch": 2.977430033102618,
"grad_norm": 0.15790539979934692,
"learning_rate": 4.8309178743961356e-08,
"loss": 7.6764,
"step": 619
},
{
"epoch": 2.9822449593740594,
"grad_norm": 0.1694169044494629,
"learning_rate": 3.2206119162640906e-08,
"loss": 8.6771,
"step": 620
},
{
"epoch": 2.9870598856455013,
"grad_norm": 0.17527279257774353,
"learning_rate": 1.6103059581320453e-08,
"loss": 10.7562,
"step": 621
},
{
"epoch": 2.9870598856455013,
"step": 621,
"total_flos": 2.802876100504453e+18,
"train_loss": 10.002562027622536,
"train_runtime": 59343.0737,
"train_samples_per_second": 1.344,
"train_steps_per_second": 0.01
}
],
"logging_steps": 1.0,
"max_steps": 621,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.802876100504453e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}