{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.997262581596125,
"eval_steps": 500,
"global_step": 1480,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0033691303432301536,
"grad_norm": 6.109744437799009,
"learning_rate": 5.405405405405406e-07,
"loss": 0.8395,
"step": 1
},
{
"epoch": 0.006738260686460307,
"grad_norm": 6.213278091341755,
"learning_rate": 1.0810810810810812e-06,
"loss": 0.8539,
"step": 2
},
{
"epoch": 0.01010739102969046,
"grad_norm": 6.2309201627607536,
"learning_rate": 1.6216216216216219e-06,
"loss": 0.862,
"step": 3
},
{
"epoch": 0.013476521372920615,
"grad_norm": 5.960777622679009,
"learning_rate": 2.1621621621621623e-06,
"loss": 0.8418,
"step": 4
},
{
"epoch": 0.01684565171615077,
"grad_norm": 5.5645531135403825,
"learning_rate": 2.702702702702703e-06,
"loss": 0.8303,
"step": 5
},
{
"epoch": 0.02021478205938092,
"grad_norm": 4.368766112287261,
"learning_rate": 3.2432432432432437e-06,
"loss": 0.7949,
"step": 6
},
{
"epoch": 0.023583912402611075,
"grad_norm": 2.4282762566778153,
"learning_rate": 3.7837837837837844e-06,
"loss": 0.7538,
"step": 7
},
{
"epoch": 0.02695304274584123,
"grad_norm": 2.286826804105074,
"learning_rate": 4.324324324324325e-06,
"loss": 0.7537,
"step": 8
},
{
"epoch": 0.030322173089071383,
"grad_norm": 1.7230122995757746,
"learning_rate": 4.864864864864866e-06,
"loss": 0.7465,
"step": 9
},
{
"epoch": 0.03369130343230154,
"grad_norm": 4.178444915489739,
"learning_rate": 5.405405405405406e-06,
"loss": 0.7473,
"step": 10
},
{
"epoch": 0.03706043377553169,
"grad_norm": 4.418045830813186,
"learning_rate": 5.945945945945947e-06,
"loss": 0.7374,
"step": 11
},
{
"epoch": 0.04042956411876184,
"grad_norm": 4.465657225144453,
"learning_rate": 6.486486486486487e-06,
"loss": 0.7306,
"step": 12
},
{
"epoch": 0.043798694461992,
"grad_norm": 4.089235560381299,
"learning_rate": 7.027027027027028e-06,
"loss": 0.6953,
"step": 13
},
{
"epoch": 0.04716782480522215,
"grad_norm": 3.9135602100241798,
"learning_rate": 7.567567567567569e-06,
"loss": 0.6963,
"step": 14
},
{
"epoch": 0.05053695514845231,
"grad_norm": 2.799692430785198,
"learning_rate": 8.108108108108109e-06,
"loss": 0.6717,
"step": 15
},
{
"epoch": 0.05390608549168246,
"grad_norm": 1.7463496609273474,
"learning_rate": 8.64864864864865e-06,
"loss": 0.6544,
"step": 16
},
{
"epoch": 0.057275215834912616,
"grad_norm": 1.788941312078948,
"learning_rate": 9.189189189189191e-06,
"loss": 0.6521,
"step": 17
},
{
"epoch": 0.060644346178142766,
"grad_norm": 2.287622821921788,
"learning_rate": 9.729729729729732e-06,
"loss": 0.6429,
"step": 18
},
{
"epoch": 0.06401347652137292,
"grad_norm": 1.960430434087058,
"learning_rate": 1.027027027027027e-05,
"loss": 0.6392,
"step": 19
},
{
"epoch": 0.06738260686460308,
"grad_norm": 1.349903658959779,
"learning_rate": 1.0810810810810812e-05,
"loss": 0.6281,
"step": 20
},
{
"epoch": 0.07075173720783323,
"grad_norm": 1.2019647976807795,
"learning_rate": 1.1351351351351352e-05,
"loss": 0.6151,
"step": 21
},
{
"epoch": 0.07412086755106338,
"grad_norm": 0.9434380017683085,
"learning_rate": 1.1891891891891894e-05,
"loss": 0.604,
"step": 22
},
{
"epoch": 0.07748999789429353,
"grad_norm": 1.1709993819720563,
"learning_rate": 1.2432432432432433e-05,
"loss": 0.6027,
"step": 23
},
{
"epoch": 0.08085912823752368,
"grad_norm": 0.7729753637518647,
"learning_rate": 1.2972972972972975e-05,
"loss": 0.6009,
"step": 24
},
{
"epoch": 0.08422825858075385,
"grad_norm": 0.9253134596999972,
"learning_rate": 1.3513513513513515e-05,
"loss": 0.5784,
"step": 25
},
{
"epoch": 0.087597388923984,
"grad_norm": 0.7203325434817915,
"learning_rate": 1.4054054054054055e-05,
"loss": 0.5784,
"step": 26
},
{
"epoch": 0.09096651926721415,
"grad_norm": 0.7738717415397595,
"learning_rate": 1.4594594594594596e-05,
"loss": 0.5724,
"step": 27
},
{
"epoch": 0.0943356496104443,
"grad_norm": 0.756189527437592,
"learning_rate": 1.5135135135135138e-05,
"loss": 0.5751,
"step": 28
},
{
"epoch": 0.09770477995367446,
"grad_norm": 0.5619223236064955,
"learning_rate": 1.5675675675675676e-05,
"loss": 0.5632,
"step": 29
},
{
"epoch": 0.10107391029690461,
"grad_norm": 0.6416604397150266,
"learning_rate": 1.6216216216216218e-05,
"loss": 0.5624,
"step": 30
},
{
"epoch": 0.10444304064013477,
"grad_norm": 0.5213594465327983,
"learning_rate": 1.6756756756756757e-05,
"loss": 0.5618,
"step": 31
},
{
"epoch": 0.10781217098336492,
"grad_norm": 0.6096320957570693,
"learning_rate": 1.72972972972973e-05,
"loss": 0.5569,
"step": 32
},
{
"epoch": 0.11118130132659507,
"grad_norm": 0.45054132971017113,
"learning_rate": 1.783783783783784e-05,
"loss": 0.547,
"step": 33
},
{
"epoch": 0.11455043166982523,
"grad_norm": 0.4599454116974351,
"learning_rate": 1.8378378378378383e-05,
"loss": 0.5484,
"step": 34
},
{
"epoch": 0.11791956201305538,
"grad_norm": 0.5174247359394964,
"learning_rate": 1.891891891891892e-05,
"loss": 0.5493,
"step": 35
},
{
"epoch": 0.12128869235628553,
"grad_norm": 0.37366751205930016,
"learning_rate": 1.9459459459459463e-05,
"loss": 0.5391,
"step": 36
},
{
"epoch": 0.12465782269951568,
"grad_norm": 0.48653413640763127,
"learning_rate": 2e-05,
"loss": 0.5403,
"step": 37
},
{
"epoch": 0.12802695304274583,
"grad_norm": 0.3651703371460431,
"learning_rate": 2.054054054054054e-05,
"loss": 0.5432,
"step": 38
},
{
"epoch": 0.131396083385976,
"grad_norm": 0.5149214857154895,
"learning_rate": 2.1081081081081082e-05,
"loss": 0.5351,
"step": 39
},
{
"epoch": 0.13476521372920616,
"grad_norm": 0.3233162021565579,
"learning_rate": 2.1621621621621624e-05,
"loss": 0.5358,
"step": 40
},
{
"epoch": 0.1381343440724363,
"grad_norm": 0.3994220446903501,
"learning_rate": 2.2162162162162163e-05,
"loss": 0.5215,
"step": 41
},
{
"epoch": 0.14150347441566646,
"grad_norm": 0.3613182742062023,
"learning_rate": 2.2702702702702705e-05,
"loss": 0.5306,
"step": 42
},
{
"epoch": 0.1448726047588966,
"grad_norm": 0.3444241276930956,
"learning_rate": 2.3243243243243243e-05,
"loss": 0.5255,
"step": 43
},
{
"epoch": 0.14824173510212676,
"grad_norm": 0.4493831957153716,
"learning_rate": 2.378378378378379e-05,
"loss": 0.525,
"step": 44
},
{
"epoch": 0.15161086544535693,
"grad_norm": 0.44813168016549754,
"learning_rate": 2.4324324324324327e-05,
"loss": 0.5182,
"step": 45
},
{
"epoch": 0.15497999578858707,
"grad_norm": 0.6714598609090919,
"learning_rate": 2.4864864864864866e-05,
"loss": 0.5236,
"step": 46
},
{
"epoch": 0.15834912613181723,
"grad_norm": 0.8782480079092542,
"learning_rate": 2.5405405405405404e-05,
"loss": 0.5189,
"step": 47
},
{
"epoch": 0.16171825647504737,
"grad_norm": 0.933082396567328,
"learning_rate": 2.594594594594595e-05,
"loss": 0.5136,
"step": 48
},
{
"epoch": 0.16508738681827753,
"grad_norm": 0.770716538259162,
"learning_rate": 2.6486486486486488e-05,
"loss": 0.515,
"step": 49
},
{
"epoch": 0.1684565171615077,
"grad_norm": 0.5162725525339301,
"learning_rate": 2.702702702702703e-05,
"loss": 0.5102,
"step": 50
},
{
"epoch": 0.17182564750473783,
"grad_norm": 1.015431854648304,
"learning_rate": 2.756756756756757e-05,
"loss": 0.5155,
"step": 51
},
{
"epoch": 0.175194777847968,
"grad_norm": 0.9581799901627799,
"learning_rate": 2.810810810810811e-05,
"loss": 0.5144,
"step": 52
},
{
"epoch": 0.17856390819119813,
"grad_norm": 0.7963519939531664,
"learning_rate": 2.8648648648648653e-05,
"loss": 0.5097,
"step": 53
},
{
"epoch": 0.1819330385344283,
"grad_norm": 1.2154735765132731,
"learning_rate": 2.918918918918919e-05,
"loss": 0.5,
"step": 54
},
{
"epoch": 0.18530216887765846,
"grad_norm": 0.5396425181695459,
"learning_rate": 2.972972972972973e-05,
"loss": 0.5028,
"step": 55
},
{
"epoch": 0.1886712992208886,
"grad_norm": 1.02978967009914,
"learning_rate": 3.0270270270270275e-05,
"loss": 0.5081,
"step": 56
},
{
"epoch": 0.19204042956411876,
"grad_norm": 0.8713408712796716,
"learning_rate": 3.081081081081082e-05,
"loss": 0.5117,
"step": 57
},
{
"epoch": 0.19540955990734893,
"grad_norm": 0.5039355010657219,
"learning_rate": 3.135135135135135e-05,
"loss": 0.5062,
"step": 58
},
{
"epoch": 0.19877869025057907,
"grad_norm": 0.8903957505195225,
"learning_rate": 3.1891891891891894e-05,
"loss": 0.5065,
"step": 59
},
{
"epoch": 0.20214782059380923,
"grad_norm": 0.6438414779422286,
"learning_rate": 3.2432432432432436e-05,
"loss": 0.505,
"step": 60
},
{
"epoch": 0.20551695093703937,
"grad_norm": 0.7196844068315907,
"learning_rate": 3.297297297297298e-05,
"loss": 0.5027,
"step": 61
},
{
"epoch": 0.20888608128026953,
"grad_norm": 0.768470871542238,
"learning_rate": 3.351351351351351e-05,
"loss": 0.4952,
"step": 62
},
{
"epoch": 0.2122552116234997,
"grad_norm": 1.2385285643157924,
"learning_rate": 3.4054054054054055e-05,
"loss": 0.4998,
"step": 63
},
{
"epoch": 0.21562434196672983,
"grad_norm": 1.2926026603680456,
"learning_rate": 3.45945945945946e-05,
"loss": 0.5059,
"step": 64
},
{
"epoch": 0.21899347230996,
"grad_norm": 0.5270463926051342,
"learning_rate": 3.513513513513514e-05,
"loss": 0.4882,
"step": 65
},
{
"epoch": 0.22236260265319013,
"grad_norm": 0.9232242256936453,
"learning_rate": 3.567567567567568e-05,
"loss": 0.5001,
"step": 66
},
{
"epoch": 0.2257317329964203,
"grad_norm": 1.1802567310747412,
"learning_rate": 3.6216216216216216e-05,
"loss": 0.4988,
"step": 67
},
{
"epoch": 0.22910086333965046,
"grad_norm": 0.6824243910642153,
"learning_rate": 3.6756756756756765e-05,
"loss": 0.4968,
"step": 68
},
{
"epoch": 0.2324699936828806,
"grad_norm": 1.1492977773270214,
"learning_rate": 3.72972972972973e-05,
"loss": 0.492,
"step": 69
},
{
"epoch": 0.23583912402611076,
"grad_norm": 0.6936995427226362,
"learning_rate": 3.783783783783784e-05,
"loss": 0.484,
"step": 70
},
{
"epoch": 0.2392082543693409,
"grad_norm": 0.8248787799708727,
"learning_rate": 3.837837837837838e-05,
"loss": 0.4875,
"step": 71
},
{
"epoch": 0.24257738471257106,
"grad_norm": 0.9148972625023225,
"learning_rate": 3.8918918918918926e-05,
"loss": 0.4936,
"step": 72
},
{
"epoch": 0.24594651505580123,
"grad_norm": 0.9090987008379405,
"learning_rate": 3.945945945945946e-05,
"loss": 0.4915,
"step": 73
},
{
"epoch": 0.24931564539903137,
"grad_norm": 1.3515657782891444,
"learning_rate": 4e-05,
"loss": 0.4898,
"step": 74
},
{
"epoch": 0.2526847757422615,
"grad_norm": 0.9152494686227338,
"learning_rate": 4.0540540540540545e-05,
"loss": 0.492,
"step": 75
},
{
"epoch": 0.25605390608549167,
"grad_norm": 1.5681418879764368,
"learning_rate": 4.108108108108108e-05,
"loss": 0.4927,
"step": 76
},
{
"epoch": 0.25942303642872183,
"grad_norm": 0.7357882817582275,
"learning_rate": 4.162162162162163e-05,
"loss": 0.4916,
"step": 77
},
{
"epoch": 0.262792166771952,
"grad_norm": 1.8207664011692413,
"learning_rate": 4.2162162162162164e-05,
"loss": 0.496,
"step": 78
},
{
"epoch": 0.26616129711518216,
"grad_norm": 1.1821288274944997,
"learning_rate": 4.2702702702702706e-05,
"loss": 0.4853,
"step": 79
},
{
"epoch": 0.2695304274584123,
"grad_norm": 1.6328730676456176,
"learning_rate": 4.324324324324325e-05,
"loss": 0.4942,
"step": 80
},
{
"epoch": 0.27289955780164243,
"grad_norm": 1.5462148913519038,
"learning_rate": 4.3783783783783783e-05,
"loss": 0.5016,
"step": 81
},
{
"epoch": 0.2762686881448726,
"grad_norm": 1.0513902425500052,
"learning_rate": 4.4324324324324325e-05,
"loss": 0.4816,
"step": 82
},
{
"epoch": 0.27963781848810276,
"grad_norm": 1.6621005750940228,
"learning_rate": 4.4864864864864874e-05,
"loss": 0.4999,
"step": 83
},
{
"epoch": 0.2830069488313329,
"grad_norm": 1.0166569610477454,
"learning_rate": 4.540540540540541e-05,
"loss": 0.4909,
"step": 84
},
{
"epoch": 0.2863760791745631,
"grad_norm": 1.3532816330257298,
"learning_rate": 4.594594594594595e-05,
"loss": 0.4853,
"step": 85
},
{
"epoch": 0.2897452095177932,
"grad_norm": 1.1921249261435982,
"learning_rate": 4.6486486486486486e-05,
"loss": 0.4907,
"step": 86
},
{
"epoch": 0.29311433986102337,
"grad_norm": 0.7446095855395451,
"learning_rate": 4.702702702702703e-05,
"loss": 0.4829,
"step": 87
},
{
"epoch": 0.29648347020425353,
"grad_norm": 1.13704598401882,
"learning_rate": 4.756756756756758e-05,
"loss": 0.4923,
"step": 88
},
{
"epoch": 0.2998526005474837,
"grad_norm": 0.9773983706335432,
"learning_rate": 4.810810810810811e-05,
"loss": 0.4925,
"step": 89
},
{
"epoch": 0.30322173089071386,
"grad_norm": 1.405804106766561,
"learning_rate": 4.8648648648648654e-05,
"loss": 0.4988,
"step": 90
},
{
"epoch": 0.30659086123394397,
"grad_norm": 1.2120207198526078,
"learning_rate": 4.9189189189189196e-05,
"loss": 0.493,
"step": 91
},
{
"epoch": 0.30995999157717413,
"grad_norm": 1.209860911666279,
"learning_rate": 4.972972972972973e-05,
"loss": 0.4871,
"step": 92
},
{
"epoch": 0.3133291219204043,
"grad_norm": 1.1742086251606596,
"learning_rate": 5.027027027027027e-05,
"loss": 0.4892,
"step": 93
},
{
"epoch": 0.31669825226363446,
"grad_norm": 0.9715216714272247,
"learning_rate": 5.081081081081081e-05,
"loss": 0.4869,
"step": 94
},
{
"epoch": 0.3200673826068646,
"grad_norm": 1.1287315453457836,
"learning_rate": 5.135135135135136e-05,
"loss": 0.483,
"step": 95
},
{
"epoch": 0.32343651295009473,
"grad_norm": 1.3753216500561123,
"learning_rate": 5.18918918918919e-05,
"loss": 0.5011,
"step": 96
},
{
"epoch": 0.3268056432933249,
"grad_norm": 1.0595415038900966,
"learning_rate": 5.2432432432432434e-05,
"loss": 0.4872,
"step": 97
},
{
"epoch": 0.33017477363655506,
"grad_norm": 0.993959867595984,
"learning_rate": 5.2972972972972976e-05,
"loss": 0.4874,
"step": 98
},
{
"epoch": 0.33354390397978523,
"grad_norm": 0.9824629794475075,
"learning_rate": 5.3513513513513525e-05,
"loss": 0.4813,
"step": 99
},
{
"epoch": 0.3369130343230154,
"grad_norm": 1.1711893920755474,
"learning_rate": 5.405405405405406e-05,
"loss": 0.4799,
"step": 100
},
{
"epoch": 0.3402821646662455,
"grad_norm": 0.6759536241348655,
"learning_rate": 5.45945945945946e-05,
"loss": 0.472,
"step": 101
},
{
"epoch": 0.34365129500947567,
"grad_norm": 0.7703133612780192,
"learning_rate": 5.513513513513514e-05,
"loss": 0.484,
"step": 102
},
{
"epoch": 0.34702042535270583,
"grad_norm": 0.7769913812178919,
"learning_rate": 5.567567567567568e-05,
"loss": 0.4758,
"step": 103
},
{
"epoch": 0.350389555695936,
"grad_norm": 0.8455984342427874,
"learning_rate": 5.621621621621622e-05,
"loss": 0.4807,
"step": 104
},
{
"epoch": 0.35375868603916616,
"grad_norm": 1.0743068767474058,
"learning_rate": 5.6756756756756757e-05,
"loss": 0.4845,
"step": 105
},
{
"epoch": 0.35712781638239627,
"grad_norm": 1.098648574354644,
"learning_rate": 5.7297297297297305e-05,
"loss": 0.4912,
"step": 106
},
{
"epoch": 0.36049694672562643,
"grad_norm": 1.6110582797161879,
"learning_rate": 5.783783783783785e-05,
"loss": 0.4975,
"step": 107
},
{
"epoch": 0.3638660770688566,
"grad_norm": 0.8739341305512099,
"learning_rate": 5.837837837837838e-05,
"loss": 0.4785,
"step": 108
},
{
"epoch": 0.36723520741208676,
"grad_norm": 1.267103393588983,
"learning_rate": 5.8918918918918924e-05,
"loss": 0.4922,
"step": 109
},
{
"epoch": 0.3706043377553169,
"grad_norm": 1.2547562478396024,
"learning_rate": 5.945945945945946e-05,
"loss": 0.4865,
"step": 110
},
{
"epoch": 0.37397346809854703,
"grad_norm": 1.1629837530431066,
"learning_rate": 6.000000000000001e-05,
"loss": 0.4815,
"step": 111
},
{
"epoch": 0.3773425984417772,
"grad_norm": 0.8450218760805568,
"learning_rate": 6.054054054054055e-05,
"loss": 0.4795,
"step": 112
},
{
"epoch": 0.38071172878500736,
"grad_norm": 1.116338785518496,
"learning_rate": 6.108108108108108e-05,
"loss": 0.4806,
"step": 113
},
{
"epoch": 0.38408085912823753,
"grad_norm": 0.8964893535442878,
"learning_rate": 6.162162162162163e-05,
"loss": 0.4756,
"step": 114
},
{
"epoch": 0.3874499894714677,
"grad_norm": 0.5881211331403472,
"learning_rate": 6.216216216216216e-05,
"loss": 0.4732,
"step": 115
},
{
"epoch": 0.39081911981469786,
"grad_norm": 0.7134703598403237,
"learning_rate": 6.27027027027027e-05,
"loss": 0.4788,
"step": 116
},
{
"epoch": 0.39418825015792797,
"grad_norm": 0.5897113644451194,
"learning_rate": 6.324324324324325e-05,
"loss": 0.4728,
"step": 117
},
{
"epoch": 0.39755738050115813,
"grad_norm": 0.6261295369849983,
"learning_rate": 6.378378378378379e-05,
"loss": 0.4773,
"step": 118
},
{
"epoch": 0.4009265108443883,
"grad_norm": 0.6923893019220413,
"learning_rate": 6.432432432432433e-05,
"loss": 0.4762,
"step": 119
},
{
"epoch": 0.40429564118761846,
"grad_norm": 0.9773203912887567,
"learning_rate": 6.486486486486487e-05,
"loss": 0.4835,
"step": 120
},
{
"epoch": 0.4076647715308486,
"grad_norm": 1.2845541454753984,
"learning_rate": 6.540540540540541e-05,
"loss": 0.4742,
"step": 121
},
{
"epoch": 0.41103390187407873,
"grad_norm": 0.7789854415201711,
"learning_rate": 6.594594594594596e-05,
"loss": 0.4693,
"step": 122
},
{
"epoch": 0.4144030322173089,
"grad_norm": 0.8431373845569413,
"learning_rate": 6.648648648648648e-05,
"loss": 0.4831,
"step": 123
},
{
"epoch": 0.41777216256053906,
"grad_norm": 0.8809025032679428,
"learning_rate": 6.702702702702703e-05,
"loss": 0.48,
"step": 124
},
{
"epoch": 0.4211412929037692,
"grad_norm": 0.7787166669908304,
"learning_rate": 6.756756756756758e-05,
"loss": 0.4861,
"step": 125
},
{
"epoch": 0.4245104232469994,
"grad_norm": 0.7332477510232297,
"learning_rate": 6.810810810810811e-05,
"loss": 0.478,
"step": 126
},
{
"epoch": 0.4278795535902295,
"grad_norm": 1.3066224708152618,
"learning_rate": 6.864864864864865e-05,
"loss": 0.4813,
"step": 127
},
{
"epoch": 0.43124868393345966,
"grad_norm": 0.9836726211958252,
"learning_rate": 6.91891891891892e-05,
"loss": 0.4714,
"step": 128
},
{
"epoch": 0.43461781427668983,
"grad_norm": 0.9119614187689974,
"learning_rate": 6.972972972972974e-05,
"loss": 0.4769,
"step": 129
},
{
"epoch": 0.43798694461992,
"grad_norm": 0.6243806457837586,
"learning_rate": 7.027027027027028e-05,
"loss": 0.4794,
"step": 130
},
{
"epoch": 0.44135607496315016,
"grad_norm": 0.7687415551391915,
"learning_rate": 7.081081081081081e-05,
"loss": 0.4732,
"step": 131
},
{
"epoch": 0.44472520530638027,
"grad_norm": 1.0092750754274926,
"learning_rate": 7.135135135135136e-05,
"loss": 0.4776,
"step": 132
},
{
"epoch": 0.44809433564961043,
"grad_norm": 0.8380704714410115,
"learning_rate": 7.18918918918919e-05,
"loss": 0.4648,
"step": 133
},
{
"epoch": 0.4514634659928406,
"grad_norm": 0.6767690752757246,
"learning_rate": 7.243243243243243e-05,
"loss": 0.4609,
"step": 134
},
{
"epoch": 0.45483259633607076,
"grad_norm": 0.9804619524504721,
"learning_rate": 7.297297297297297e-05,
"loss": 0.4713,
"step": 135
},
{
"epoch": 0.4582017266793009,
"grad_norm": 1.3776587580151205,
"learning_rate": 7.351351351351353e-05,
"loss": 0.4794,
"step": 136
},
{
"epoch": 0.46157085702253103,
"grad_norm": 0.5714903502719861,
"learning_rate": 7.405405405405406e-05,
"loss": 0.4693,
"step": 137
},
{
"epoch": 0.4649399873657612,
"grad_norm": 1.1165381199232975,
"learning_rate": 7.45945945945946e-05,
"loss": 0.4789,
"step": 138
},
{
"epoch": 0.46830911770899136,
"grad_norm": 1.1391177830520112,
"learning_rate": 7.513513513513514e-05,
"loss": 0.4763,
"step": 139
},
{
"epoch": 0.4716782480522215,
"grad_norm": 0.8405758310660678,
"learning_rate": 7.567567567567568e-05,
"loss": 0.4708,
"step": 140
},
{
"epoch": 0.4750473783954517,
"grad_norm": 0.673259823629261,
"learning_rate": 7.621621621621623e-05,
"loss": 0.4709,
"step": 141
},
{
"epoch": 0.4784165087386818,
"grad_norm": 0.9085593163135716,
"learning_rate": 7.675675675675675e-05,
"loss": 0.4658,
"step": 142
},
{
"epoch": 0.48178563908191196,
"grad_norm": 0.9065492978219933,
"learning_rate": 7.729729729729731e-05,
"loss": 0.4661,
"step": 143
},
{
"epoch": 0.48515476942514213,
"grad_norm": 0.8751584723634406,
"learning_rate": 7.783783783783785e-05,
"loss": 0.4725,
"step": 144
},
{
"epoch": 0.4885238997683723,
"grad_norm": 0.6907562652250656,
"learning_rate": 7.837837837837838e-05,
"loss": 0.4684,
"step": 145
},
{
"epoch": 0.49189303011160246,
"grad_norm": 0.5990641326148477,
"learning_rate": 7.891891891891892e-05,
"loss": 0.4672,
"step": 146
},
{
"epoch": 0.4952621604548326,
"grad_norm": 0.6994191437855128,
"learning_rate": 7.945945945945946e-05,
"loss": 0.4662,
"step": 147
},
{
"epoch": 0.49863129079806273,
"grad_norm": 0.5573598940486624,
"learning_rate": 8e-05,
"loss": 0.4665,
"step": 148
},
{
"epoch": 0.502000421141293,
"grad_norm": 0.6145912929008095,
"learning_rate": 7.999988874460243e-05,
"loss": 0.4669,
"step": 149
},
{
"epoch": 0.505369551484523,
"grad_norm": 0.8011807879753905,
"learning_rate": 7.999955497902857e-05,
"loss": 0.4669,
"step": 150
},
{
"epoch": 0.5087386818277532,
"grad_norm": 0.8616234404683479,
"learning_rate": 7.99989987051351e-05,
"loss": 0.4721,
"step": 151
},
{
"epoch": 0.5121078121709833,
"grad_norm": 0.7813814403741567,
"learning_rate": 7.999821992601645e-05,
"loss": 0.4753,
"step": 152
},
{
"epoch": 0.5154769425142135,
"grad_norm": 0.8208221005516424,
"learning_rate": 7.999721864600476e-05,
"loss": 0.4648,
"step": 153
},
{
"epoch": 0.5188460728574437,
"grad_norm": 0.7471274236204338,
"learning_rate": 7.999599487066996e-05,
"loss": 0.4665,
"step": 154
},
{
"epoch": 0.5222152032006738,
"grad_norm": 0.6025705010343646,
"learning_rate": 7.999454860681961e-05,
"loss": 0.4646,
"step": 155
},
{
"epoch": 0.525584333543904,
"grad_norm": 0.6278670733672859,
"learning_rate": 7.999287986249894e-05,
"loss": 0.4582,
"step": 156
},
{
"epoch": 0.5289534638871342,
"grad_norm": 0.7363381482182718,
"learning_rate": 7.999098864699078e-05,
"loss": 0.4644,
"step": 157
},
{
"epoch": 0.5323225942303643,
"grad_norm": 0.5321478176964342,
"learning_rate": 7.998887497081555e-05,
"loss": 0.4558,
"step": 158
},
{
"epoch": 0.5356917245735945,
"grad_norm": 0.5084748356369074,
"learning_rate": 7.998653884573114e-05,
"loss": 0.4576,
"step": 159
},
{
"epoch": 0.5390608549168246,
"grad_norm": 0.44466968144745794,
"learning_rate": 7.998398028473287e-05,
"loss": 0.4628,
"step": 160
},
{
"epoch": 0.5424299852600547,
"grad_norm": 0.5300560878644925,
"learning_rate": 7.998119930205342e-05,
"loss": 0.4587,
"step": 161
},
{
"epoch": 0.5457991156032849,
"grad_norm": 0.4482671223105369,
"learning_rate": 7.997819591316278e-05,
"loss": 0.4595,
"step": 162
},
{
"epoch": 0.549168245946515,
"grad_norm": 0.3831134887002804,
"learning_rate": 7.997497013476808e-05,
"loss": 0.4621,
"step": 163
},
{
"epoch": 0.5525373762897452,
"grad_norm": 0.42236120459010645,
"learning_rate": 7.99715219848136e-05,
"loss": 0.4574,
"step": 164
},
{
"epoch": 0.5559065066329754,
"grad_norm": 0.4457523958461928,
"learning_rate": 7.996785148248062e-05,
"loss": 0.4597,
"step": 165
},
{
"epoch": 0.5592756369762055,
"grad_norm": 0.5006917210647484,
"learning_rate": 7.996395864818727e-05,
"loss": 0.4594,
"step": 166
},
{
"epoch": 0.5626447673194357,
"grad_norm": 0.5999241658726214,
"learning_rate": 7.995984350358851e-05,
"loss": 0.4578,
"step": 167
},
{
"epoch": 0.5660138976626659,
"grad_norm": 0.7291489485043735,
"learning_rate": 7.995550607157592e-05,
"loss": 0.4538,
"step": 168
},
{
"epoch": 0.569383028005896,
"grad_norm": 0.7577464603442905,
"learning_rate": 7.995094637627767e-05,
"loss": 0.4507,
"step": 169
},
{
"epoch": 0.5727521583491262,
"grad_norm": 0.5373115669466836,
"learning_rate": 7.994616444305826e-05,
"loss": 0.4602,
"step": 170
},
{
"epoch": 0.5761212886923562,
"grad_norm": 0.5783937366804819,
"learning_rate": 7.994116029851852e-05,
"loss": 0.4621,
"step": 171
},
{
"epoch": 0.5794904190355864,
"grad_norm": 0.7289647138839453,
"learning_rate": 7.993593397049533e-05,
"loss": 0.4569,
"step": 172
},
{
"epoch": 0.5828595493788166,
"grad_norm": 0.7726864760162053,
"learning_rate": 7.993048548806155e-05,
"loss": 0.4609,
"step": 173
},
{
"epoch": 0.5862286797220467,
"grad_norm": 0.7101749816908381,
"learning_rate": 7.992481488152585e-05,
"loss": 0.4628,
"step": 174
},
{
"epoch": 0.5895978100652769,
"grad_norm": 0.7787526674806393,
"learning_rate": 7.991892218243251e-05,
"loss": 0.4664,
"step": 175
},
{
"epoch": 0.5929669404085071,
"grad_norm": 0.9193285112654672,
"learning_rate": 7.991280742356124e-05,
"loss": 0.4583,
"step": 176
},
{
"epoch": 0.5963360707517372,
"grad_norm": 0.863766394540256,
"learning_rate": 7.990647063892704e-05,
"loss": 0.4532,
"step": 177
},
{
"epoch": 0.5997052010949674,
"grad_norm": 0.7969950754484971,
"learning_rate": 7.989991186378e-05,
"loss": 0.4649,
"step": 178
},
{
"epoch": 0.6030743314381976,
"grad_norm": 0.9175228695778532,
"learning_rate": 7.989313113460506e-05,
"loss": 0.4598,
"step": 179
},
{
"epoch": 0.6064434617814277,
"grad_norm": 1.189324294096932,
"learning_rate": 7.988612848912186e-05,
"loss": 0.4616,
"step": 180
},
{
"epoch": 0.6098125921246578,
"grad_norm": 0.5502633850375939,
"learning_rate": 7.987890396628451e-05,
"loss": 0.4506,
"step": 181
},
{
"epoch": 0.6131817224678879,
"grad_norm": 0.6418325300837303,
"learning_rate": 7.987145760628138e-05,
"loss": 0.4589,
"step": 182
},
{
"epoch": 0.6165508528111181,
"grad_norm": 0.8487957991579048,
"learning_rate": 7.986378945053483e-05,
"loss": 0.4534,
"step": 183
},
{
"epoch": 0.6199199831543483,
"grad_norm": 0.729090543693198,
"learning_rate": 7.985589954170107e-05,
"loss": 0.4502,
"step": 184
},
{
"epoch": 0.6232891134975784,
"grad_norm": 0.564140229622775,
"learning_rate": 7.984778792366983e-05,
"loss": 0.4561,
"step": 185
},
{
"epoch": 0.6266582438408086,
"grad_norm": 0.5489014465662102,
"learning_rate": 7.983945464156419e-05,
"loss": 0.4511,
"step": 186
},
{
"epoch": 0.6300273741840388,
"grad_norm": 0.4439092473485429,
"learning_rate": 7.983089974174026e-05,
"loss": 0.4592,
"step": 187
},
{
"epoch": 0.6333965045272689,
"grad_norm": 0.4899343556871492,
"learning_rate": 7.982212327178699e-05,
"loss": 0.4576,
"step": 188
},
{
"epoch": 0.6367656348704991,
"grad_norm": 0.4429930723656228,
"learning_rate": 7.981312528052587e-05,
"loss": 0.4527,
"step": 189
},
{
"epoch": 0.6401347652137293,
"grad_norm": 0.3517045134537643,
"learning_rate": 7.980390581801064e-05,
"loss": 0.4533,
"step": 190
},
{
"epoch": 0.6435038955569593,
"grad_norm": 0.35596532078238685,
"learning_rate": 7.979446493552708e-05,
"loss": 0.4512,
"step": 191
},
{
"epoch": 0.6468730259001895,
"grad_norm": 0.4117813215790183,
"learning_rate": 7.97848026855926e-05,
"loss": 0.4427,
"step": 192
},
{
"epoch": 0.6502421562434196,
"grad_norm": 0.42570694906406503,
"learning_rate": 7.977491912195611e-05,
"loss": 0.4559,
"step": 193
},
{
"epoch": 0.6536112865866498,
"grad_norm": 0.32926038316817535,
"learning_rate": 7.976481429959758e-05,
"loss": 0.4525,
"step": 194
},
{
"epoch": 0.65698041692988,
"grad_norm": 0.3352588049162969,
"learning_rate": 7.975448827472782e-05,
"loss": 0.4465,
"step": 195
},
{
"epoch": 0.6603495472731101,
"grad_norm": 0.3121745237951815,
"learning_rate": 7.974394110478813e-05,
"loss": 0.4504,
"step": 196
},
{
"epoch": 0.6637186776163403,
"grad_norm": 0.3514443346936628,
"learning_rate": 7.973317284844998e-05,
"loss": 0.4543,
"step": 197
},
{
"epoch": 0.6670878079595705,
"grad_norm": 0.36563500765518064,
"learning_rate": 7.972218356561471e-05,
"loss": 0.4466,
"step": 198
},
{
"epoch": 0.6704569383028006,
"grad_norm": 0.36993328537084813,
"learning_rate": 7.971097331741318e-05,
"loss": 0.447,
"step": 199
},
{
"epoch": 0.6738260686460308,
"grad_norm": 0.4218574088374599,
"learning_rate": 7.96995421662054e-05,
"loss": 0.4456,
"step": 200
},
{
"epoch": 0.677195198989261,
"grad_norm": 0.5127127798248658,
"learning_rate": 7.968789017558026e-05,
"loss": 0.4367,
"step": 201
},
{
"epoch": 0.680564329332491,
"grad_norm": 0.5533862982628416,
"learning_rate": 7.967601741035507e-05,
"loss": 0.4464,
"step": 202
},
{
"epoch": 0.6839334596757212,
"grad_norm": 0.5128169904646379,
"learning_rate": 7.966392393657533e-05,
"loss": 0.4493,
"step": 203
},
{
"epoch": 0.6873025900189513,
"grad_norm": 0.47256773564418525,
"learning_rate": 7.965160982151422e-05,
"loss": 0.4536,
"step": 204
},
{
"epoch": 0.6906717203621815,
"grad_norm": 0.452879409095403,
"learning_rate": 7.963907513367234e-05,
"loss": 0.4589,
"step": 205
},
{
"epoch": 0.6940408507054117,
"grad_norm": 0.455219584683228,
"learning_rate": 7.962631994277728e-05,
"loss": 0.4414,
"step": 206
},
{
"epoch": 0.6974099810486418,
"grad_norm": 0.47863589957769587,
"learning_rate": 7.961334431978321e-05,
"loss": 0.4486,
"step": 207
},
{
"epoch": 0.700779111391872,
"grad_norm": 0.5110385780704738,
"learning_rate": 7.960014833687055e-05,
"loss": 0.4495,
"step": 208
},
{
"epoch": 0.7041482417351022,
"grad_norm": 0.4683257451933529,
"learning_rate": 7.958673206744553e-05,
"loss": 0.4522,
"step": 209
},
{
"epoch": 0.7075173720783323,
"grad_norm": 0.4506553993940309,
"learning_rate": 7.957309558613974e-05,
"loss": 0.4452,
"step": 210
},
{
"epoch": 0.7108865024215625,
"grad_norm": 0.4526028368594711,
"learning_rate": 7.955923896880982e-05,
"loss": 0.4456,
"step": 211
},
{
"epoch": 0.7142556327647925,
"grad_norm": 0.5212859488073646,
"learning_rate": 7.954516229253691e-05,
"loss": 0.4482,
"step": 212
},
{
"epoch": 0.7176247631080227,
"grad_norm": 0.4908480827080424,
"learning_rate": 7.953086563562635e-05,
"loss": 0.4404,
"step": 213
},
{
"epoch": 0.7209938934512529,
"grad_norm": 0.43474906852801837,
"learning_rate": 7.951634907760713e-05,
"loss": 0.4415,
"step": 214
},
{
"epoch": 0.724363023794483,
"grad_norm": 0.5465543422325746,
"learning_rate": 7.950161269923153e-05,
"loss": 0.453,
"step": 215
},
{
"epoch": 0.7277321541377132,
"grad_norm": 0.5191090578880476,
"learning_rate": 7.948665658247463e-05,
"loss": 0.4511,
"step": 216
},
{
"epoch": 0.7311012844809434,
"grad_norm": 0.41056922017028197,
"learning_rate": 7.947148081053388e-05,
"loss": 0.4428,
"step": 217
},
{
"epoch": 0.7344704148241735,
"grad_norm": 0.4280367756173325,
"learning_rate": 7.945608546782858e-05,
"loss": 0.4552,
"step": 218
},
{
"epoch": 0.7378395451674037,
"grad_norm": 0.44143498781875934,
"learning_rate": 7.944047063999952e-05,
"loss": 0.4461,
"step": 219
},
{
"epoch": 0.7412086755106339,
"grad_norm": 0.4671020826488003,
"learning_rate": 7.942463641390834e-05,
"loss": 0.433,
"step": 220
},
{
"epoch": 0.744577805853864,
"grad_norm": 0.4802991806753108,
"learning_rate": 7.940858287763724e-05,
"loss": 0.4487,
"step": 221
},
{
"epoch": 0.7479469361970941,
"grad_norm": 0.4271821601132076,
"learning_rate": 7.939231012048833e-05,
"loss": 0.4509,
"step": 222
},
{
"epoch": 0.7513160665403242,
"grad_norm": 0.38123610223687315,
"learning_rate": 7.93758182329832e-05,
"loss": 0.4372,
"step": 223
},
{
"epoch": 0.7546851968835544,
"grad_norm": 0.465830487650423,
"learning_rate": 7.935910730686246e-05,
"loss": 0.4444,
"step": 224
},
{
"epoch": 0.7580543272267846,
"grad_norm": 0.5651393352119582,
"learning_rate": 7.934217743508513e-05,
"loss": 0.4468,
"step": 225
},
{
"epoch": 0.7614234575700147,
"grad_norm": 0.6526912705793722,
"learning_rate": 7.932502871182818e-05,
"loss": 0.4509,
"step": 226
},
{
"epoch": 0.7647925879132449,
"grad_norm": 0.7684525411435036,
"learning_rate": 7.930766123248602e-05,
"loss": 0.4475,
"step": 227
},
{
"epoch": 0.7681617182564751,
"grad_norm": 0.8868257582573387,
"learning_rate": 7.929007509366994e-05,
"loss": 0.4486,
"step": 228
},
{
"epoch": 0.7715308485997052,
"grad_norm": 0.9592751619745519,
"learning_rate": 7.927227039320758e-05,
"loss": 0.442,
"step": 229
},
{
"epoch": 0.7748999789429354,
"grad_norm": 0.8928159966805775,
"learning_rate": 7.925424723014239e-05,
"loss": 0.4541,
"step": 230
},
{
"epoch": 0.7782691092861656,
"grad_norm": 0.7880900131568054,
"learning_rate": 7.923600570473308e-05,
"loss": 0.4514,
"step": 231
},
{
"epoch": 0.7816382396293957,
"grad_norm": 0.4783123604515285,
"learning_rate": 7.921754591845307e-05,
"loss": 0.4442,
"step": 232
},
{
"epoch": 0.7850073699726258,
"grad_norm": 0.4520386015737669,
"learning_rate": 7.91988679739899e-05,
"loss": 0.448,
"step": 233
},
{
"epoch": 0.7883765003158559,
"grad_norm": 0.6605527609379506,
"learning_rate": 7.917997197524467e-05,
"loss": 0.4435,
"step": 234
},
{
"epoch": 0.7917456306590861,
"grad_norm": 0.7089385732745206,
"learning_rate": 7.916085802733147e-05,
"loss": 0.4449,
"step": 235
},
{
"epoch": 0.7951147610023163,
"grad_norm": 0.5904512970852802,
"learning_rate": 7.914152623657678e-05,
"loss": 0.448,
"step": 236
},
{
"epoch": 0.7984838913455464,
"grad_norm": 0.5165195483185807,
"learning_rate": 7.912197671051894e-05,
"loss": 0.4475,
"step": 237
},
{
"epoch": 0.8018530216887766,
"grad_norm": 0.47278629514591364,
"learning_rate": 7.910220955790746e-05,
"loss": 0.447,
"step": 238
},
{
"epoch": 0.8052221520320068,
"grad_norm": 0.4466680465677497,
"learning_rate": 7.908222488870243e-05,
"loss": 0.4471,
"step": 239
},
{
"epoch": 0.8085912823752369,
"grad_norm": 0.40052321749076436,
"learning_rate": 7.906202281407398e-05,
"loss": 0.4453,
"step": 240
},
{
"epoch": 0.8119604127184671,
"grad_norm": 0.3808574042244712,
"learning_rate": 7.90416034464016e-05,
"loss": 0.4467,
"step": 241
},
{
"epoch": 0.8153295430616972,
"grad_norm": 0.3009379630644614,
"learning_rate": 7.902096689927355e-05,
"loss": 0.4405,
"step": 242
},
{
"epoch": 0.8186986734049273,
"grad_norm": 0.4006333439696202,
"learning_rate": 7.900011328748619e-05,
"loss": 0.441,
"step": 243
},
{
"epoch": 0.8220678037481575,
"grad_norm": 0.36250537572683333,
"learning_rate": 7.897904272704333e-05,
"loss": 0.4382,
"step": 244
},
{
"epoch": 0.8254369340913876,
"grad_norm": 0.37232144501481734,
"learning_rate": 7.895775533515569e-05,
"loss": 0.4455,
"step": 245
},
{
"epoch": 0.8288060644346178,
"grad_norm": 0.4169869556836039,
"learning_rate": 7.893625123024011e-05,
"loss": 0.4356,
"step": 246
},
{
"epoch": 0.832175194777848,
"grad_norm": 0.3864353557408192,
"learning_rate": 7.891453053191898e-05,
"loss": 0.4435,
"step": 247
},
{
"epoch": 0.8355443251210781,
"grad_norm": 0.3608352846793135,
"learning_rate": 7.889259336101957e-05,
"loss": 0.4462,
"step": 248
},
{
"epoch": 0.8389134554643083,
"grad_norm": 0.32373631118958723,
"learning_rate": 7.887043983957327e-05,
"loss": 0.4375,
"step": 249
},
{
"epoch": 0.8422825858075385,
"grad_norm": 0.26424914090383317,
"learning_rate": 7.884807009081506e-05,
"loss": 0.4375,
"step": 250
},
{
"epoch": 0.8456517161507686,
"grad_norm": 0.22444081020907958,
"learning_rate": 7.882548423918268e-05,
"loss": 0.4413,
"step": 251
},
{
"epoch": 0.8490208464939988,
"grad_norm": 0.26045857383329957,
"learning_rate": 7.880268241031604e-05,
"loss": 0.4317,
"step": 252
},
{
"epoch": 0.8523899768372288,
"grad_norm": 0.30550339254012787,
"learning_rate": 7.877966473105645e-05,
"loss": 0.4458,
"step": 253
},
{
"epoch": 0.855759107180459,
"grad_norm": 0.34559528308231324,
"learning_rate": 7.875643132944599e-05,
"loss": 0.4403,
"step": 254
},
{
"epoch": 0.8591282375236892,
"grad_norm": 0.35710994685108394,
"learning_rate": 7.873298233472671e-05,
"loss": 0.4394,
"step": 255
},
{
"epoch": 0.8624973678669193,
"grad_norm": 0.42956681122910056,
"learning_rate": 7.870931787733996e-05,
"loss": 0.4403,
"step": 256
},
{
"epoch": 0.8658664982101495,
"grad_norm": 0.5626197718228877,
"learning_rate": 7.868543808892569e-05,
"loss": 0.4387,
"step": 257
},
{
"epoch": 0.8692356285533797,
"grad_norm": 0.6076789146858117,
"learning_rate": 7.866134310232167e-05,
"loss": 0.4439,
"step": 258
},
{
"epoch": 0.8726047588966098,
"grad_norm": 0.5742280027785791,
"learning_rate": 7.863703305156273e-05,
"loss": 0.4455,
"step": 259
},
{
"epoch": 0.87597388923984,
"grad_norm": 0.5069317059933754,
"learning_rate": 7.861250807188014e-05,
"loss": 0.4476,
"step": 260
},
{
"epoch": 0.8793430195830702,
"grad_norm": 0.4288223928021788,
"learning_rate": 7.858776829970069e-05,
"loss": 0.4379,
"step": 261
},
{
"epoch": 0.8827121499263003,
"grad_norm": 0.5442592728854474,
"learning_rate": 7.856281387264603e-05,
"loss": 0.4379,
"step": 262
},
{
"epoch": 0.8860812802695305,
"grad_norm": 0.5638482346313414,
"learning_rate": 7.853764492953192e-05,
"loss": 0.4444,
"step": 263
},
{
"epoch": 0.8894504106127605,
"grad_norm": 0.4523819114426828,
"learning_rate": 7.851226161036739e-05,
"loss": 0.4394,
"step": 264
},
{
"epoch": 0.8928195409559907,
"grad_norm": 0.5349306408767115,
"learning_rate": 7.848666405635398e-05,
"loss": 0.441,
"step": 265
},
{
"epoch": 0.8961886712992209,
"grad_norm": 0.5452142089194884,
"learning_rate": 7.846085240988503e-05,
"loss": 0.4483,
"step": 266
},
{
"epoch": 0.899557801642451,
"grad_norm": 0.4222443920522887,
"learning_rate": 7.843482681454476e-05,
"loss": 0.4407,
"step": 267
},
{
"epoch": 0.9029269319856812,
"grad_norm": 0.5310106072977896,
"learning_rate": 7.840858741510758e-05,
"loss": 0.4442,
"step": 268
},
{
"epoch": 0.9062960623289114,
"grad_norm": 0.5876077411696179,
"learning_rate": 7.838213435753724e-05,
"loss": 0.4438,
"step": 269
},
{
"epoch": 0.9096651926721415,
"grad_norm": 0.6100738415200538,
"learning_rate": 7.835546778898599e-05,
"loss": 0.4465,
"step": 270
},
{
"epoch": 0.9130343230153717,
"grad_norm": 0.6760561504138676,
"learning_rate": 7.832858785779383e-05,
"loss": 0.4338,
"step": 271
},
{
"epoch": 0.9164034533586018,
"grad_norm": 0.45392830007094576,
"learning_rate": 7.830149471348763e-05,
"loss": 0.431,
"step": 272
},
{
"epoch": 0.919772583701832,
"grad_norm": 0.30596440551036547,
"learning_rate": 7.827418850678034e-05,
"loss": 0.4396,
"step": 273
},
{
"epoch": 0.9231417140450621,
"grad_norm": 0.4969999175377505,
"learning_rate": 7.824666938957004e-05,
"loss": 0.4375,
"step": 274
},
{
"epoch": 0.9265108443882922,
"grad_norm": 0.5437640388773309,
"learning_rate": 7.82189375149393e-05,
"loss": 0.444,
"step": 275
},
{
"epoch": 0.9298799747315224,
"grad_norm": 0.4134501055661062,
"learning_rate": 7.819099303715414e-05,
"loss": 0.4385,
"step": 276
},
{
"epoch": 0.9332491050747526,
"grad_norm": 0.3810051790575615,
"learning_rate": 7.816283611166328e-05,
"loss": 0.4339,
"step": 277
},
{
"epoch": 0.9366182354179827,
"grad_norm": 0.4135193612689647,
"learning_rate": 7.813446689509718e-05,
"loss": 0.4413,
"step": 278
},
{
"epoch": 0.9399873657612129,
"grad_norm": 0.5154216890519913,
"learning_rate": 7.810588554526728e-05,
"loss": 0.4409,
"step": 279
},
{
"epoch": 0.943356496104443,
"grad_norm": 0.5335234306967277,
"learning_rate": 7.807709222116506e-05,
"loss": 0.4392,
"step": 280
},
{
"epoch": 0.9467256264476732,
"grad_norm": 0.4582890089443176,
"learning_rate": 7.804808708296116e-05,
"loss": 0.44,
"step": 281
},
{
"epoch": 0.9500947567909034,
"grad_norm": 0.41636142631229706,
"learning_rate": 7.801887029200448e-05,
"loss": 0.4359,
"step": 282
},
{
"epoch": 0.9534638871341335,
"grad_norm": 0.3777680522962764,
"learning_rate": 7.798944201082128e-05,
"loss": 0.4305,
"step": 283
},
{
"epoch": 0.9568330174773636,
"grad_norm": 0.31197040692277506,
"learning_rate": 7.795980240311436e-05,
"loss": 0.4378,
"step": 284
},
{
"epoch": 0.9602021478205938,
"grad_norm": 0.2615719658181643,
"learning_rate": 7.7929951633762e-05,
"loss": 0.4349,
"step": 285
},
{
"epoch": 0.9635712781638239,
"grad_norm": 0.27255928093352183,
"learning_rate": 7.789988986881719e-05,
"loss": 0.4324,
"step": 286
},
{
"epoch": 0.9669404085070541,
"grad_norm": 0.3086259327892651,
"learning_rate": 7.78696172755066e-05,
"loss": 0.4338,
"step": 287
},
{
"epoch": 0.9703095388502843,
"grad_norm": 0.3128738492807504,
"learning_rate": 7.78391340222297e-05,
"loss": 0.4327,
"step": 288
},
{
"epoch": 0.9736786691935144,
"grad_norm": 0.28991557468061835,
"learning_rate": 7.78084402785578e-05,
"loss": 0.4368,
"step": 289
},
{
"epoch": 0.9770477995367446,
"grad_norm": 0.3462635013389902,
"learning_rate": 7.777753621523316e-05,
"loss": 0.4376,
"step": 290
},
{
"epoch": 0.9804169298799748,
"grad_norm": 0.41703460759212563,
"learning_rate": 7.774642200416795e-05,
"loss": 0.4364,
"step": 291
},
{
"epoch": 0.9837860602232049,
"grad_norm": 0.5058437435233563,
"learning_rate": 7.771509781844338e-05,
"loss": 0.4392,
"step": 292
},
{
"epoch": 0.9871551905664351,
"grad_norm": 0.49478795685868665,
"learning_rate": 7.768356383230868e-05,
"loss": 0.4387,
"step": 293
},
{
"epoch": 0.9905243209096652,
"grad_norm": 0.4745986454402833,
"learning_rate": 7.765182022118014e-05,
"loss": 0.435,
"step": 294
},
{
"epoch": 0.9938934512528953,
"grad_norm": 0.4611674206006931,
"learning_rate": 7.761986716164019e-05,
"loss": 0.4379,
"step": 295
},
{
"epoch": 0.9972625815961255,
"grad_norm": 0.42674160555276347,
"learning_rate": 7.758770483143634e-05,
"loss": 0.4408,
"step": 296
},
{
"epoch": 1.0033691303432302,
"grad_norm": 0.4680506210026581,
"learning_rate": 7.755533340948024e-05,
"loss": 0.4223,
"step": 297
},
{
"epoch": 1.0067382606864603,
"grad_norm": 0.5238195474514908,
"learning_rate": 7.752275307584664e-05,
"loss": 0.4295,
"step": 298
},
{
"epoch": 1.0101073910296905,
"grad_norm": 0.5889650401759404,
"learning_rate": 7.748996401177244e-05,
"loss": 0.4275,
"step": 299
},
{
"epoch": 1.0134765213729207,
"grad_norm": 0.5507636965946558,
"learning_rate": 7.745696639965569e-05,
"loss": 0.4194,
"step": 300
},
{
"epoch": 1.0168456517161508,
"grad_norm": 0.6157938140990948,
"learning_rate": 7.742376042305449e-05,
"loss": 0.433,
"step": 301
},
{
"epoch": 1.020214782059381,
"grad_norm": 0.6252887191974348,
"learning_rate": 7.739034626668605e-05,
"loss": 0.4262,
"step": 302
},
{
"epoch": 1.0235839124026112,
"grad_norm": 0.5880567265579987,
"learning_rate": 7.735672411642562e-05,
"loss": 0.4233,
"step": 303
},
{
"epoch": 1.0269530427458413,
"grad_norm": 0.5726576390809455,
"learning_rate": 7.732289415930549e-05,
"loss": 0.424,
"step": 304
},
{
"epoch": 1.0303221730890715,
"grad_norm": 0.47054865900441445,
"learning_rate": 7.728885658351395e-05,
"loss": 0.4176,
"step": 305
},
{
"epoch": 1.0336913034323016,
"grad_norm": 0.4115743947585953,
"learning_rate": 7.725461157839417e-05,
"loss": 0.4292,
"step": 306
},
{
"epoch": 1.0370604337755316,
"grad_norm": 0.4210186740776416,
"learning_rate": 7.722015933444325e-05,
"loss": 0.4247,
"step": 307
},
{
"epoch": 1.0404295641187618,
"grad_norm": 0.36198066116445515,
"learning_rate": 7.71855000433111e-05,
"loss": 0.4193,
"step": 308
},
{
"epoch": 1.043798694461992,
"grad_norm": 0.43778422486602975,
"learning_rate": 7.715063389779936e-05,
"loss": 0.4238,
"step": 309
},
{
"epoch": 1.047167824805222,
"grad_norm": 0.45920696669429717,
"learning_rate": 7.711556109186039e-05,
"loss": 0.4237,
"step": 310
},
{
"epoch": 1.0505369551484522,
"grad_norm": 0.3341781881526272,
"learning_rate": 7.708028182059612e-05,
"loss": 0.4239,
"step": 311
},
{
"epoch": 1.0539060854916824,
"grad_norm": 0.3082296798332506,
"learning_rate": 7.704479628025704e-05,
"loss": 0.4167,
"step": 312
},
{
"epoch": 1.0572752158349126,
"grad_norm": 0.33284929605340835,
"learning_rate": 7.700910466824104e-05,
"loss": 0.4233,
"step": 313
},
{
"epoch": 1.0606443461781427,
"grad_norm": 0.333894193250551,
"learning_rate": 7.697320718309235e-05,
"loss": 0.4177,
"step": 314
},
{
"epoch": 1.064013476521373,
"grad_norm": 0.39528163268670363,
"learning_rate": 7.69371040245004e-05,
"loss": 0.4188,
"step": 315
},
{
"epoch": 1.067382606864603,
"grad_norm": 0.28394188370498197,
"learning_rate": 7.690079539329875e-05,
"loss": 0.4129,
"step": 316
},
{
"epoch": 1.0707517372078332,
"grad_norm": 0.2953142618107928,
"learning_rate": 7.686428149146398e-05,
"loss": 0.4188,
"step": 317
},
{
"epoch": 1.0741208675510634,
"grad_norm": 0.2804966323905774,
"learning_rate": 7.682756252211453e-05,
"loss": 0.4171,
"step": 318
},
{
"epoch": 1.0774899978942936,
"grad_norm": 0.2510744302434169,
"learning_rate": 7.679063868950955e-05,
"loss": 0.4182,
"step": 319
},
{
"epoch": 1.0808591282375237,
"grad_norm": 0.307553691452299,
"learning_rate": 7.675351019904785e-05,
"loss": 0.4177,
"step": 320
},
{
"epoch": 1.084228258580754,
"grad_norm": 0.30605445723544605,
"learning_rate": 7.671617725726666e-05,
"loss": 0.4158,
"step": 321
},
{
"epoch": 1.087597388923984,
"grad_norm": 0.3011711568174157,
"learning_rate": 7.667864007184054e-05,
"loss": 0.4141,
"step": 322
},
{
"epoch": 1.0909665192672142,
"grad_norm": 0.286345264353555,
"learning_rate": 7.664089885158023e-05,
"loss": 0.4187,
"step": 323
},
{
"epoch": 1.0943356496104444,
"grad_norm": 0.3592297464995333,
"learning_rate": 7.660295380643144e-05,
"loss": 0.4175,
"step": 324
},
{
"epoch": 1.0977047799536745,
"grad_norm": 0.4650620300291997,
"learning_rate": 7.656480514747374e-05,
"loss": 0.4258,
"step": 325
},
{
"epoch": 1.1010739102969047,
"grad_norm": 0.6004744804102117,
"learning_rate": 7.652645308691933e-05,
"loss": 0.419,
"step": 326
},
{
"epoch": 1.1044430406401347,
"grad_norm": 0.6574099091252659,
"learning_rate": 7.648789783811191e-05,
"loss": 0.4217,
"step": 327
},
{
"epoch": 1.1078121709833648,
"grad_norm": 0.6456572845078092,
"learning_rate": 7.644913961552544e-05,
"loss": 0.4207,
"step": 328
},
{
"epoch": 1.111181301326595,
"grad_norm": 0.5305128144292774,
"learning_rate": 7.641017863476298e-05,
"loss": 0.4215,
"step": 329
},
{
"epoch": 1.1145504316698251,
"grad_norm": 0.3067892240749629,
"learning_rate": 7.637101511255554e-05,
"loss": 0.4127,
"step": 330
},
{
"epoch": 1.1179195620130553,
"grad_norm": 0.3619393126280428,
"learning_rate": 7.633164926676076e-05,
"loss": 0.4144,
"step": 331
},
{
"epoch": 1.1212886923562855,
"grad_norm": 0.5099026443948101,
"learning_rate": 7.629208131636179e-05,
"loss": 0.4247,
"step": 332
},
{
"epoch": 1.1246578226995156,
"grad_norm": 0.4600109550186491,
"learning_rate": 7.625231148146601e-05,
"loss": 0.4277,
"step": 333
},
{
"epoch": 1.1280269530427458,
"grad_norm": 0.34171771897702874,
"learning_rate": 7.621233998330387e-05,
"loss": 0.4111,
"step": 334
},
{
"epoch": 1.131396083385976,
"grad_norm": 0.3788094599826585,
"learning_rate": 7.617216704422763e-05,
"loss": 0.4238,
"step": 335
},
{
"epoch": 1.1347652137292061,
"grad_norm": 0.38930545757784435,
"learning_rate": 7.61317928877101e-05,
"loss": 0.4266,
"step": 336
},
{
"epoch": 1.1381343440724363,
"grad_norm": 0.3433858843038104,
"learning_rate": 7.609121773834341e-05,
"loss": 0.4113,
"step": 337
},
{
"epoch": 1.1415034744156665,
"grad_norm": 0.3664698026597823,
"learning_rate": 7.605044182183779e-05,
"loss": 0.4215,
"step": 338
},
{
"epoch": 1.1448726047588966,
"grad_norm": 0.3669871707797489,
"learning_rate": 7.600946536502028e-05,
"loss": 0.4187,
"step": 339
},
{
"epoch": 1.1482417351021268,
"grad_norm": 0.36360389026471707,
"learning_rate": 7.596828859583347e-05,
"loss": 0.4179,
"step": 340
},
{
"epoch": 1.151610865445357,
"grad_norm": 0.34303435626525425,
"learning_rate": 7.592691174333426e-05,
"loss": 0.4166,
"step": 341
},
{
"epoch": 1.1549799957885871,
"grad_norm": 0.37256557828106257,
"learning_rate": 7.588533503769257e-05,
"loss": 0.4181,
"step": 342
},
{
"epoch": 1.1583491261318173,
"grad_norm": 0.41467336010702505,
"learning_rate": 7.584355871019002e-05,
"loss": 0.4195,
"step": 343
},
{
"epoch": 1.1617182564750475,
"grad_norm": 0.37682711741626357,
"learning_rate": 7.580158299321872e-05,
"loss": 0.4226,
"step": 344
},
{
"epoch": 1.1650873868182776,
"grad_norm": 0.2646890963052802,
"learning_rate": 7.575940812027993e-05,
"loss": 0.4094,
"step": 345
},
{
"epoch": 1.1684565171615078,
"grad_norm": 0.23766486308489482,
"learning_rate": 7.571703432598275e-05,
"loss": 0.42,
"step": 346
},
{
"epoch": 1.171825647504738,
"grad_norm": 0.23844909696838593,
"learning_rate": 7.567446184604285e-05,
"loss": 0.4189,
"step": 347
},
{
"epoch": 1.175194777847968,
"grad_norm": 0.23287909504956197,
"learning_rate": 7.563169091728115e-05,
"loss": 0.4123,
"step": 348
},
{
"epoch": 1.178563908191198,
"grad_norm": 0.21692865660818864,
"learning_rate": 7.558872177762246e-05,
"loss": 0.4193,
"step": 349
},
{
"epoch": 1.1819330385344282,
"grad_norm": 0.2191653011493883,
"learning_rate": 7.554555466609425e-05,
"loss": 0.4271,
"step": 350
},
{
"epoch": 1.1853021688776584,
"grad_norm": 0.23748843252543808,
"learning_rate": 7.550218982282518e-05,
"loss": 0.4196,
"step": 351
},
{
"epoch": 1.1886712992208885,
"grad_norm": 0.24616974908904704,
"learning_rate": 7.545862748904394e-05,
"loss": 0.4146,
"step": 352
},
{
"epoch": 1.1920404295641187,
"grad_norm": 0.25196821392623664,
"learning_rate": 7.541486790707776e-05,
"loss": 0.4266,
"step": 353
},
{
"epoch": 1.1954095599073489,
"grad_norm": 0.2470207075626988,
"learning_rate": 7.537091132035111e-05,
"loss": 0.4148,
"step": 354
},
{
"epoch": 1.198778690250579,
"grad_norm": 0.2314470630644042,
"learning_rate": 7.532675797338438e-05,
"loss": 0.4033,
"step": 355
},
{
"epoch": 1.2021478205938092,
"grad_norm": 0.23746828407475515,
"learning_rate": 7.528240811179245e-05,
"loss": 0.4203,
"step": 356
},
{
"epoch": 1.2055169509370394,
"grad_norm": 0.28754749236703137,
"learning_rate": 7.523786198228344e-05,
"loss": 0.4182,
"step": 357
},
{
"epoch": 1.2088860812802695,
"grad_norm": 0.3151739472415091,
"learning_rate": 7.519311983265718e-05,
"loss": 0.4222,
"step": 358
},
{
"epoch": 1.2122552116234997,
"grad_norm": 0.34818120293706006,
"learning_rate": 7.514818191180397e-05,
"loss": 0.4162,
"step": 359
},
{
"epoch": 1.2156243419667299,
"grad_norm": 0.39359185740609565,
"learning_rate": 7.510304846970311e-05,
"loss": 0.4179,
"step": 360
},
{
"epoch": 1.21899347230996,
"grad_norm": 0.49169697889587877,
"learning_rate": 7.505771975742157e-05,
"loss": 0.42,
"step": 361
},
{
"epoch": 1.2223626026531902,
"grad_norm": 0.6588501716182329,
"learning_rate": 7.501219602711253e-05,
"loss": 0.4207,
"step": 362
},
{
"epoch": 1.2257317329964204,
"grad_norm": 0.6704211038154936,
"learning_rate": 7.496647753201403e-05,
"loss": 0.419,
"step": 363
},
{
"epoch": 1.2291008633396505,
"grad_norm": 0.5759654898267196,
"learning_rate": 7.492056452644753e-05,
"loss": 0.418,
"step": 364
},
{
"epoch": 1.2324699936828807,
"grad_norm": 0.46697984648682656,
"learning_rate": 7.487445726581654e-05,
"loss": 0.4202,
"step": 365
},
{
"epoch": 1.2358391240261108,
"grad_norm": 0.4072897949316555,
"learning_rate": 7.48281560066051e-05,
"loss": 0.416,
"step": 366
},
{
"epoch": 1.2392082543693408,
"grad_norm": 0.3677992679979327,
"learning_rate": 7.47816610063765e-05,
"loss": 0.4184,
"step": 367
},
{
"epoch": 1.242577384712571,
"grad_norm": 0.39067043330522583,
"learning_rate": 7.473497252377171e-05,
"loss": 0.4246,
"step": 368
},
{
"epoch": 1.2459465150558011,
"grad_norm": 0.4371263217453357,
"learning_rate": 7.468809081850802e-05,
"loss": 0.4154,
"step": 369
},
{
"epoch": 1.2493156453990313,
"grad_norm": 0.466667275005644,
"learning_rate": 7.464101615137756e-05,
"loss": 0.4221,
"step": 370
},
{
"epoch": 1.2526847757422614,
"grad_norm": 0.40517656554168363,
"learning_rate": 7.459374878424585e-05,
"loss": 0.4149,
"step": 371
},
{
"epoch": 1.2560539060854916,
"grad_norm": 0.318301658777578,
"learning_rate": 7.454628898005043e-05,
"loss": 0.4117,
"step": 372
},
{
"epoch": 1.2594230364287218,
"grad_norm": 0.27016373739597804,
"learning_rate": 7.449863700279923e-05,
"loss": 0.4151,
"step": 373
},
{
"epoch": 1.262792166771952,
"grad_norm": 0.27155592369639425,
"learning_rate": 7.445079311756924e-05,
"loss": 0.4121,
"step": 374
},
{
"epoch": 1.266161297115182,
"grad_norm": 0.3009069725735746,
"learning_rate": 7.440275759050499e-05,
"loss": 0.4209,
"step": 375
},
{
"epoch": 1.2695304274584123,
"grad_norm": 0.2981596620618354,
"learning_rate": 7.435453068881706e-05,
"loss": 0.4127,
"step": 376
},
{
"epoch": 1.2728995578016424,
"grad_norm": 0.323823588891271,
"learning_rate": 7.430611268078059e-05,
"loss": 0.4097,
"step": 377
},
{
"epoch": 1.2762686881448726,
"grad_norm": 0.3872478492699807,
"learning_rate": 7.425750383573384e-05,
"loss": 0.4142,
"step": 378
},
{
"epoch": 1.2796378184881028,
"grad_norm": 0.38340080550258643,
"learning_rate": 7.420870442407662e-05,
"loss": 0.4158,
"step": 379
},
{
"epoch": 1.283006948831333,
"grad_norm": 0.33826799912854405,
"learning_rate": 7.415971471726884e-05,
"loss": 0.4181,
"step": 380
},
{
"epoch": 1.286376079174563,
"grad_norm": 0.3412122527192401,
"learning_rate": 7.411053498782893e-05,
"loss": 0.4115,
"step": 381
},
{
"epoch": 1.2897452095177933,
"grad_norm": 0.339753253979875,
"learning_rate": 7.406116550933246e-05,
"loss": 0.414,
"step": 382
},
{
"epoch": 1.2931143398610234,
"grad_norm": 0.27036940059101494,
"learning_rate": 7.401160655641044e-05,
"loss": 0.4134,
"step": 383
},
{
"epoch": 1.2964834702042536,
"grad_norm": 0.26771109575539487,
"learning_rate": 7.396185840474792e-05,
"loss": 0.4145,
"step": 384
},
{
"epoch": 1.2998526005474837,
"grad_norm": 0.26853385874933655,
"learning_rate": 7.391192133108243e-05,
"loss": 0.4196,
"step": 385
},
{
"epoch": 1.303221730890714,
"grad_norm": 0.25978626330709237,
"learning_rate": 7.386179561320243e-05,
"loss": 0.4179,
"step": 386
},
{
"epoch": 1.306590861233944,
"grad_norm": 0.27982894045702544,
"learning_rate": 7.381148152994573e-05,
"loss": 0.4134,
"step": 387
},
{
"epoch": 1.3099599915771742,
"grad_norm": 0.2323947422520883,
"learning_rate": 7.376097936119803e-05,
"loss": 0.4125,
"step": 388
},
{
"epoch": 1.3133291219204044,
"grad_norm": 0.30635653504253,
"learning_rate": 7.371028938789122e-05,
"loss": 0.4169,
"step": 389
},
{
"epoch": 1.3166982522636346,
"grad_norm": 0.30800676867326193,
"learning_rate": 7.365941189200201e-05,
"loss": 0.4124,
"step": 390
},
{
"epoch": 1.3200673826068647,
"grad_norm": 0.26291762070095065,
"learning_rate": 7.360834715655019e-05,
"loss": 0.4163,
"step": 391
},
{
"epoch": 1.3234365129500947,
"grad_norm": 0.28430697703638136,
"learning_rate": 7.35570954655971e-05,
"loss": 0.4126,
"step": 392
},
{
"epoch": 1.3268056432933248,
"grad_norm": 0.26422175633500083,
"learning_rate": 7.350565710424414e-05,
"loss": 0.4089,
"step": 393
},
{
"epoch": 1.330174773636555,
"grad_norm": 0.2426759795255368,
"learning_rate": 7.345403235863105e-05,
"loss": 0.4164,
"step": 394
},
{
"epoch": 1.3335439039797852,
"grad_norm": 0.25948928094236345,
"learning_rate": 7.340222151593443e-05,
"loss": 0.4184,
"step": 395
},
{
"epoch": 1.3369130343230153,
"grad_norm": 0.3377516247942669,
"learning_rate": 7.335022486436608e-05,
"loss": 0.4169,
"step": 396
},
{
"epoch": 1.3402821646662455,
"grad_norm": 0.39062210665305114,
"learning_rate": 7.329804269317137e-05,
"loss": 0.4212,
"step": 397
},
{
"epoch": 1.3436512950094757,
"grad_norm": 0.43725817266447464,
"learning_rate": 7.324567529262775e-05,
"loss": 0.4162,
"step": 398
},
{
"epoch": 1.3470204253527058,
"grad_norm": 0.3940951458992484,
"learning_rate": 7.319312295404301e-05,
"loss": 0.4109,
"step": 399
},
{
"epoch": 1.350389555695936,
"grad_norm": 0.30866362073984877,
"learning_rate": 7.31403859697537e-05,
"loss": 0.4138,
"step": 400
},
{
"epoch": 1.3537586860391662,
"grad_norm": 0.26875958761791713,
"learning_rate": 7.308746463312353e-05,
"loss": 0.417,
"step": 401
},
{
"epoch": 1.3571278163823963,
"grad_norm": 0.3115107888080639,
"learning_rate": 7.303435923854172e-05,
"loss": 0.4122,
"step": 402
},
{
"epoch": 1.3604969467256265,
"grad_norm": 0.36714492695394935,
"learning_rate": 7.298107008142139e-05,
"loss": 0.4159,
"step": 403
},
{
"epoch": 1.3638660770688567,
"grad_norm": 0.3981685397894353,
"learning_rate": 7.292759745819781e-05,
"loss": 0.4133,
"step": 404
},
{
"epoch": 1.3672352074120868,
"grad_norm": 0.3069454557345131,
"learning_rate": 7.287394166632691e-05,
"loss": 0.4208,
"step": 405
},
{
"epoch": 1.370604337755317,
"grad_norm": 0.24748441489038914,
"learning_rate": 7.282010300428351e-05,
"loss": 0.4104,
"step": 406
},
{
"epoch": 1.373973468098547,
"grad_norm": 0.2118791200536055,
"learning_rate": 7.276608177155968e-05,
"loss": 0.4124,
"step": 407
},
{
"epoch": 1.377342598441777,
"grad_norm": 0.24963361520147168,
"learning_rate": 7.271187826866312e-05,
"loss": 0.4149,
"step": 408
},
{
"epoch": 1.3807117287850073,
"grad_norm": 0.31609061336459937,
"learning_rate": 7.265749279711543e-05,
"loss": 0.4266,
"step": 409
},
{
"epoch": 1.3840808591282374,
"grad_norm": 0.35611885888992273,
"learning_rate": 7.260292565945049e-05,
"loss": 0.4144,
"step": 410
},
{
"epoch": 1.3874499894714676,
"grad_norm": 0.36813941496856034,
"learning_rate": 7.254817715921273e-05,
"loss": 0.4148,
"step": 411
},
{
"epoch": 1.3908191198146977,
"grad_norm": 0.3386310771485794,
"learning_rate": 7.249324760095544e-05,
"loss": 0.4157,
"step": 412
},
{
"epoch": 1.394188250157928,
"grad_norm": 0.3286044748961816,
"learning_rate": 7.243813729023913e-05,
"loss": 0.418,
"step": 413
},
{
"epoch": 1.397557380501158,
"grad_norm": 0.36106557337509826,
"learning_rate": 7.238284653362977e-05,
"loss": 0.4127,
"step": 414
},
{
"epoch": 1.4009265108443882,
"grad_norm": 0.3713183412724868,
"learning_rate": 7.232737563869711e-05,
"loss": 0.4223,
"step": 415
},
{
"epoch": 1.4042956411876184,
"grad_norm": 0.4108948812505769,
"learning_rate": 7.227172491401299e-05,
"loss": 0.4159,
"step": 416
},
{
"epoch": 1.4076647715308486,
"grad_norm": 0.42746635756913826,
"learning_rate": 7.221589466914955e-05,
"loss": 0.4183,
"step": 417
},
{
"epoch": 1.4110339018740787,
"grad_norm": 0.4281754377696307,
"learning_rate": 7.215988521467763e-05,
"loss": 0.4143,
"step": 418
},
{
"epoch": 1.414403032217309,
"grad_norm": 0.34581190375042925,
"learning_rate": 7.210369686216492e-05,
"loss": 0.4232,
"step": 419
},
{
"epoch": 1.417772162560539,
"grad_norm": 0.24817011068233216,
"learning_rate": 7.204732992417431e-05,
"loss": 0.4203,
"step": 420
},
{
"epoch": 1.4211412929037692,
"grad_norm": 0.2703015486109723,
"learning_rate": 7.199078471426208e-05,
"loss": 0.4188,
"step": 421
},
{
"epoch": 1.4245104232469994,
"grad_norm": 0.3376907597722382,
"learning_rate": 7.193406154697625e-05,
"loss": 0.4123,
"step": 422
},
{
"epoch": 1.4278795535902296,
"grad_norm": 0.35688284736368614,
"learning_rate": 7.187716073785471e-05,
"loss": 0.4073,
"step": 423
},
{
"epoch": 1.4312486839334597,
"grad_norm": 0.29210262958830335,
"learning_rate": 7.18200826034236e-05,
"loss": 0.4155,
"step": 424
},
{
"epoch": 1.4346178142766899,
"grad_norm": 0.20624868853539452,
"learning_rate": 7.176282746119544e-05,
"loss": 0.4082,
"step": 425
},
{
"epoch": 1.43798694461992,
"grad_norm": 0.21431087254932987,
"learning_rate": 7.17053956296674e-05,
"loss": 0.4072,
"step": 426
},
{
"epoch": 1.4413560749631502,
"grad_norm": 0.25982900003092185,
"learning_rate": 7.164778742831954e-05,
"loss": 0.4113,
"step": 427
},
{
"epoch": 1.4447252053063804,
"grad_norm": 0.3503298873194117,
"learning_rate": 7.159000317761305e-05,
"loss": 0.4128,
"step": 428
},
{
"epoch": 1.4480943356496105,
"grad_norm": 0.4693051629184559,
"learning_rate": 7.153204319898839e-05,
"loss": 0.4138,
"step": 429
},
{
"epoch": 1.4514634659928407,
"grad_norm": 0.502991287048126,
"learning_rate": 7.14739078148636e-05,
"loss": 0.4157,
"step": 430
},
{
"epoch": 1.4548325963360709,
"grad_norm": 0.5001041791172387,
"learning_rate": 7.141559734863245e-05,
"loss": 0.4082,
"step": 431
},
{
"epoch": 1.458201726679301,
"grad_norm": 0.4696810029288007,
"learning_rate": 7.135711212466264e-05,
"loss": 0.4198,
"step": 432
},
{
"epoch": 1.461570857022531,
"grad_norm": 0.43034902073433023,
"learning_rate": 7.1298452468294e-05,
"loss": 0.4165,
"step": 433
},
{
"epoch": 1.4649399873657611,
"grad_norm": 0.4022839654121198,
"learning_rate": 7.123961870583671e-05,
"loss": 0.4096,
"step": 434
},
{
"epoch": 1.4683091177089913,
"grad_norm": 0.3107712308577315,
"learning_rate": 7.118061116456944e-05,
"loss": 0.4137,
"step": 435
},
{
"epoch": 1.4716782480522215,
"grad_norm": 0.3140180702883453,
"learning_rate": 7.112143017273759e-05,
"loss": 0.4108,
"step": 436
},
{
"epoch": 1.4750473783954516,
"grad_norm": 0.40495663409539695,
"learning_rate": 7.106207605955136e-05,
"loss": 0.4166,
"step": 437
},
{
"epoch": 1.4784165087386818,
"grad_norm": 0.4652370041483942,
"learning_rate": 7.100254915518408e-05,
"loss": 0.414,
"step": 438
},
{
"epoch": 1.481785639081912,
"grad_norm": 0.41391982007664,
"learning_rate": 7.094284979077015e-05,
"loss": 0.4131,
"step": 439
},
{
"epoch": 1.4851547694251421,
"grad_norm": 0.34516805959620245,
"learning_rate": 7.088297829840346e-05,
"loss": 0.4129,
"step": 440
},
{
"epoch": 1.4885238997683723,
"grad_norm": 0.32652038382328485,
"learning_rate": 7.08229350111353e-05,
"loss": 0.413,
"step": 441
},
{
"epoch": 1.4918930301116025,
"grad_norm": 0.22506092814882847,
"learning_rate": 7.076272026297268e-05,
"loss": 0.4127,
"step": 442
},
{
"epoch": 1.4952621604548326,
"grad_norm": 0.2282536847065667,
"learning_rate": 7.070233438887639e-05,
"loss": 0.4071,
"step": 443
},
{
"epoch": 1.4986312907980628,
"grad_norm": 0.2446847320184482,
"learning_rate": 7.064177772475912e-05,
"loss": 0.4138,
"step": 444
},
{
"epoch": 1.502000421141293,
"grad_norm": 0.25152698752852437,
"learning_rate": 7.05810506074837e-05,
"loss": 0.4141,
"step": 445
},
{
"epoch": 1.505369551484523,
"grad_norm": 0.2548217617366647,
"learning_rate": 7.052015337486109e-05,
"loss": 0.4098,
"step": 446
},
{
"epoch": 1.508738681827753,
"grad_norm": 0.2731777853595498,
"learning_rate": 7.045908636564858e-05,
"loss": 0.4118,
"step": 447
},
{
"epoch": 1.5121078121709832,
"grad_norm": 0.3121024086238583,
"learning_rate": 7.03978499195479e-05,
"loss": 0.4111,
"step": 448
},
{
"epoch": 1.5154769425142134,
"grad_norm": 0.28013154989340816,
"learning_rate": 7.03364443772033e-05,
"loss": 0.4123,
"step": 449
},
{
"epoch": 1.5188460728574436,
"grad_norm": 0.20045789950968235,
"learning_rate": 7.027487008019969e-05,
"loss": 0.41,
"step": 450
},
{
"epoch": 1.5222152032006737,
"grad_norm": 0.1935253416836786,
"learning_rate": 7.021312737106068e-05,
"loss": 0.4184,
"step": 451
},
{
"epoch": 1.5255843335439039,
"grad_norm": 0.2182563856327327,
"learning_rate": 7.015121659324678e-05,
"loss": 0.4121,
"step": 452
},
{
"epoch": 1.528953463887134,
"grad_norm": 0.20129933934815375,
"learning_rate": 7.00891380911534e-05,
"loss": 0.4136,
"step": 453
},
{
"epoch": 1.5323225942303642,
"grad_norm": 0.19011310030838788,
"learning_rate": 7.002689221010897e-05,
"loss": 0.4113,
"step": 454
},
{
"epoch": 1.5356917245735944,
"grad_norm": 0.19585723303180483,
"learning_rate": 6.9964479296373e-05,
"loss": 0.4139,
"step": 455
},
{
"epoch": 1.5390608549168245,
"grad_norm": 0.1740680287737997,
"learning_rate": 6.990189969713416e-05,
"loss": 0.4141,
"step": 456
},
{
"epoch": 1.5424299852600547,
"grad_norm": 0.2068670733390012,
"learning_rate": 6.983915376050833e-05,
"loss": 0.4093,
"step": 457
},
{
"epoch": 1.5457991156032849,
"grad_norm": 0.2583283837253456,
"learning_rate": 6.977624183553676e-05,
"loss": 0.4192,
"step": 458
},
{
"epoch": 1.549168245946515,
"grad_norm": 0.28194252885557924,
"learning_rate": 6.971316427218399e-05,
"loss": 0.412,
"step": 459
},
{
"epoch": 1.5525373762897452,
"grad_norm": 0.27071463569696164,
"learning_rate": 6.964992142133602e-05,
"loss": 0.4207,
"step": 460
},
{
"epoch": 1.5559065066329754,
"grad_norm": 0.27470579632282327,
"learning_rate": 6.958651363479822e-05,
"loss": 0.4165,
"step": 461
},
{
"epoch": 1.5592756369762055,
"grad_norm": 0.2703402600040993,
"learning_rate": 6.952294126529356e-05,
"loss": 0.4134,
"step": 462
},
{
"epoch": 1.5626447673194357,
"grad_norm": 0.26465479604538705,
"learning_rate": 6.94592046664605e-05,
"loss": 0.4136,
"step": 463
},
{
"epoch": 1.5660138976626659,
"grad_norm": 0.31132857636043815,
"learning_rate": 6.939530419285104e-05,
"loss": 0.4163,
"step": 464
},
{
"epoch": 1.569383028005896,
"grad_norm": 0.4012221142274,
"learning_rate": 6.933124019992884e-05,
"loss": 0.4138,
"step": 465
},
{
"epoch": 1.5727521583491262,
"grad_norm": 0.5021621002447393,
"learning_rate": 6.926701304406713e-05,
"loss": 0.4105,
"step": 466
},
{
"epoch": 1.5761212886923563,
"grad_norm": 0.5905418251776803,
"learning_rate": 6.920262308254683e-05,
"loss": 0.4147,
"step": 467
},
{
"epoch": 1.5794904190355865,
"grad_norm": 0.6182317762023337,
"learning_rate": 6.913807067355445e-05,
"loss": 0.4128,
"step": 468
},
{
"epoch": 1.5828595493788167,
"grad_norm": 0.4945917435433832,
"learning_rate": 6.907335617618018e-05,
"loss": 0.4167,
"step": 469
},
{
"epoch": 1.5862286797220468,
"grad_norm": 0.3166116083838581,
"learning_rate": 6.90084799504159e-05,
"loss": 0.4136,
"step": 470
},
{
"epoch": 1.589597810065277,
"grad_norm": 0.2848441164225104,
"learning_rate": 6.894344235715311e-05,
"loss": 0.4127,
"step": 471
},
{
"epoch": 1.5929669404085072,
"grad_norm": 0.35210847111444277,
"learning_rate": 6.887824375818099e-05,
"loss": 0.4125,
"step": 472
},
{
"epoch": 1.5963360707517373,
"grad_norm": 0.36122192833869504,
"learning_rate": 6.881288451618431e-05,
"loss": 0.4175,
"step": 473
},
{
"epoch": 1.5997052010949675,
"grad_norm": 0.30874010342588315,
"learning_rate": 6.874736499474154e-05,
"loss": 0.4123,
"step": 474
},
{
"epoch": 1.6030743314381977,
"grad_norm": 0.2415425383601781,
"learning_rate": 6.868168555832266e-05,
"loss": 0.409,
"step": 475
},
{
"epoch": 1.6064434617814278,
"grad_norm": 0.2777593930598247,
"learning_rate": 6.861584657228728e-05,
"loss": 0.4109,
"step": 476
},
{
"epoch": 1.6098125921246578,
"grad_norm": 0.2552160489277856,
"learning_rate": 6.854984840288253e-05,
"loss": 0.4063,
"step": 477
},
{
"epoch": 1.613181722467888,
"grad_norm": 0.21292379117303817,
"learning_rate": 6.848369141724104e-05,
"loss": 0.4113,
"step": 478
},
{
"epoch": 1.616550852811118,
"grad_norm": 0.25826725556041485,
"learning_rate": 6.841737598337886e-05,
"loss": 0.4162,
"step": 479
},
{
"epoch": 1.6199199831543483,
"grad_norm": 0.24587379643844692,
"learning_rate": 6.835090247019354e-05,
"loss": 0.4098,
"step": 480
},
{
"epoch": 1.6232891134975784,
"grad_norm": 0.22506059025604672,
"learning_rate": 6.828427124746191e-05,
"loss": 0.4177,
"step": 481
},
{
"epoch": 1.6266582438408086,
"grad_norm": 0.2625291980432951,
"learning_rate": 6.821748268583813e-05,
"loss": 0.4138,
"step": 482
},
{
"epoch": 1.6300273741840388,
"grad_norm": 0.2899682108073399,
"learning_rate": 6.815053715685161e-05,
"loss": 0.4112,
"step": 483
},
{
"epoch": 1.633396504527269,
"grad_norm": 0.24684733944107418,
"learning_rate": 6.808343503290491e-05,
"loss": 0.4084,
"step": 484
},
{
"epoch": 1.636765634870499,
"grad_norm": 0.22856568944562095,
"learning_rate": 6.80161766872717e-05,
"loss": 0.4099,
"step": 485
},
{
"epoch": 1.6401347652137293,
"grad_norm": 0.2528553309235842,
"learning_rate": 6.79487624940947e-05,
"loss": 0.4074,
"step": 486
},
{
"epoch": 1.6435038955569592,
"grad_norm": 0.24954291821287325,
"learning_rate": 6.788119282838355e-05,
"loss": 0.4156,
"step": 487
},
{
"epoch": 1.6468730259001894,
"grad_norm": 0.2486958212588815,
"learning_rate": 6.781346806601273e-05,
"loss": 0.4148,
"step": 488
},
{
"epoch": 1.6502421562434195,
"grad_norm": 0.20838834765340428,
"learning_rate": 6.774558858371952e-05,
"loss": 0.4107,
"step": 489
},
{
"epoch": 1.6536112865866497,
"grad_norm": 0.157993940020379,
"learning_rate": 6.767755475910185e-05,
"loss": 0.4112,
"step": 490
},
{
"epoch": 1.6569804169298799,
"grad_norm": 0.24383891745288697,
"learning_rate": 6.760936697061626e-05,
"loss": 0.4117,
"step": 491
},
{
"epoch": 1.66034954727311,
"grad_norm": 0.28630859094765176,
"learning_rate": 6.754102559757569e-05,
"loss": 0.4108,
"step": 492
},
{
"epoch": 1.6637186776163402,
"grad_norm": 0.2744705368738465,
"learning_rate": 6.74725310201475e-05,
"loss": 0.4068,
"step": 493
},
{
"epoch": 1.6670878079595703,
"grad_norm": 0.2832510381791776,
"learning_rate": 6.740388361935125e-05,
"loss": 0.4072,
"step": 494
},
{
"epoch": 1.6704569383028005,
"grad_norm": 0.2988249231230451,
"learning_rate": 6.733508377705661e-05,
"loss": 0.4077,
"step": 495
},
{
"epoch": 1.6738260686460307,
"grad_norm": 0.24557523045791532,
"learning_rate": 6.726613187598132e-05,
"loss": 0.416,
"step": 496
},
{
"epoch": 1.6771951989892608,
"grad_norm": 0.21450213834423756,
"learning_rate": 6.71970282996889e-05,
"loss": 0.4099,
"step": 497
},
{
"epoch": 1.680564329332491,
"grad_norm": 0.2564463597465919,
"learning_rate": 6.712777343258666e-05,
"loss": 0.4113,
"step": 498
},
{
"epoch": 1.6839334596757212,
"grad_norm": 0.28973958295073354,
"learning_rate": 6.705836765992348e-05,
"loss": 0.4173,
"step": 499
},
{
"epoch": 1.6873025900189513,
"grad_norm": 0.3093418967185147,
"learning_rate": 6.698881136778771e-05,
"loss": 0.4173,
"step": 500
},
{
"epoch": 1.6906717203621815,
"grad_norm": 0.30710292961925306,
"learning_rate": 6.691910494310499e-05,
"loss": 0.4202,
"step": 501
},
{
"epoch": 1.6940408507054117,
"grad_norm": 0.298386372490933,
"learning_rate": 6.684924877363613e-05,
"loss": 0.4063,
"step": 502
},
{
"epoch": 1.6974099810486418,
"grad_norm": 0.31358421654801716,
"learning_rate": 6.67792432479749e-05,
"loss": 0.4117,
"step": 503
},
{
"epoch": 1.700779111391872,
"grad_norm": 0.34684913918298366,
"learning_rate": 6.670908875554594e-05,
"loss": 0.4103,
"step": 504
},
{
"epoch": 1.7041482417351022,
"grad_norm": 0.3071849696400485,
"learning_rate": 6.663878568660258e-05,
"loss": 0.4064,
"step": 505
},
{
"epoch": 1.7075173720783323,
"grad_norm": 0.25934260311596186,
"learning_rate": 6.656833443222458e-05,
"loss": 0.4026,
"step": 506
},
{
"epoch": 1.7108865024215625,
"grad_norm": 0.254331135385578,
"learning_rate": 6.649773538431605e-05,
"loss": 0.4123,
"step": 507
},
{
"epoch": 1.7142556327647926,
"grad_norm": 0.2696672284837906,
"learning_rate": 6.642698893560327e-05,
"loss": 0.4135,
"step": 508
},
{
"epoch": 1.7176247631080228,
"grad_norm": 0.3170338993835499,
"learning_rate": 6.635609547963243e-05,
"loss": 0.4078,
"step": 509
},
{
"epoch": 1.720993893451253,
"grad_norm": 0.34598694657993484,
"learning_rate": 6.628505541076755e-05,
"loss": 0.4143,
"step": 510
},
{
"epoch": 1.7243630237944831,
"grad_norm": 0.3659302514618013,
"learning_rate": 6.621386912418816e-05,
"loss": 0.413,
"step": 511
},
{
"epoch": 1.7277321541377133,
"grad_norm": 0.3036155922766547,
"learning_rate": 6.614253701588718e-05,
"loss": 0.413,
"step": 512
},
{
"epoch": 1.7311012844809435,
"grad_norm": 0.26442302840915777,
"learning_rate": 6.607105948266872e-05,
"loss": 0.4141,
"step": 513
},
{
"epoch": 1.7344704148241736,
"grad_norm": 0.2820703196464,
"learning_rate": 6.599943692214587e-05,
"loss": 0.4154,
"step": 514
},
{
"epoch": 1.7378395451674038,
"grad_norm": 0.2716579783783052,
"learning_rate": 6.592766973273843e-05,
"loss": 0.418,
"step": 515
},
{
"epoch": 1.741208675510634,
"grad_norm": 0.2320214556767005,
"learning_rate": 6.585575831367078e-05,
"loss": 0.4136,
"step": 516
},
{
"epoch": 1.7445778058538641,
"grad_norm": 0.20790915888905742,
"learning_rate": 6.578370306496957e-05,
"loss": 0.4126,
"step": 517
},
{
"epoch": 1.747946936197094,
"grad_norm": 0.2165582926633229,
"learning_rate": 6.571150438746157e-05,
"loss": 0.4112,
"step": 518
},
{
"epoch": 1.7513160665403242,
"grad_norm": 0.24261057128754013,
"learning_rate": 6.563916268277144e-05,
"loss": 0.413,
"step": 519
},
{
"epoch": 1.7546851968835544,
"grad_norm": 0.2755800264624728,
"learning_rate": 6.55666783533194e-05,
"loss": 0.4166,
"step": 520
},
{
"epoch": 1.7580543272267846,
"grad_norm": 0.28813858434017786,
"learning_rate": 6.549405180231911e-05,
"loss": 0.404,
"step": 521
},
{
"epoch": 1.7614234575700147,
"grad_norm": 0.24090919880210407,
"learning_rate": 6.542128343377536e-05,
"loss": 0.4075,
"step": 522
},
{
"epoch": 1.764792587913245,
"grad_norm": 0.21389800108034238,
"learning_rate": 6.534837365248185e-05,
"loss": 0.4124,
"step": 523
},
{
"epoch": 1.768161718256475,
"grad_norm": 0.2562042134322129,
"learning_rate": 6.527532286401889e-05,
"loss": 0.4174,
"step": 524
},
{
"epoch": 1.7715308485997052,
"grad_norm": 0.2571401145743441,
"learning_rate": 6.520213147475123e-05,
"loss": 0.4144,
"step": 525
},
{
"epoch": 1.7748999789429354,
"grad_norm": 0.2423820773625362,
"learning_rate": 6.51287998918257e-05,
"loss": 0.4046,
"step": 526
},
{
"epoch": 1.7782691092861656,
"grad_norm": 0.2310131148631897,
"learning_rate": 6.505532852316904e-05,
"loss": 0.407,
"step": 527
},
{
"epoch": 1.7816382396293957,
"grad_norm": 0.2467085051059651,
"learning_rate": 6.498171777748557e-05,
"loss": 0.4134,
"step": 528
},
{
"epoch": 1.7850073699726257,
"grad_norm": 0.2429312927228722,
"learning_rate": 6.49079680642549e-05,
"loss": 0.4136,
"step": 529
},
{
"epoch": 1.7883765003158558,
"grad_norm": 0.18962286619000535,
"learning_rate": 6.483407979372975e-05,
"loss": 0.4094,
"step": 530
},
{
"epoch": 1.791745630659086,
"grad_norm": 0.17276030637120937,
"learning_rate": 6.476005337693355e-05,
"loss": 0.4127,
"step": 531
},
{
"epoch": 1.7951147610023162,
"grad_norm": 0.1991873488324741,
"learning_rate": 6.468588922565822e-05,
"loss": 0.407,
"step": 532
},
{
"epoch": 1.7984838913455463,
"grad_norm": 0.23230143768755912,
"learning_rate": 6.461158775246186e-05,
"loss": 0.4069,
"step": 533
},
{
"epoch": 1.8018530216887765,
"grad_norm": 0.25362081452848795,
"learning_rate": 6.453714937066648e-05,
"loss": 0.4089,
"step": 534
},
{
"epoch": 1.8052221520320066,
"grad_norm": 0.20024317986028692,
"learning_rate": 6.446257449435566e-05,
"loss": 0.4062,
"step": 535
},
{
"epoch": 1.8085912823752368,
"grad_norm": 0.16636181558776822,
"learning_rate": 6.438786353837228e-05,
"loss": 0.4061,
"step": 536
},
{
"epoch": 1.811960412718467,
"grad_norm": 0.20687002125002474,
"learning_rate": 6.43130169183162e-05,
"loss": 0.4131,
"step": 537
},
{
"epoch": 1.8153295430616971,
"grad_norm": 0.2568138645034864,
"learning_rate": 6.423803505054193e-05,
"loss": 0.411,
"step": 538
},
{
"epoch": 1.8186986734049273,
"grad_norm": 0.3369872578212292,
"learning_rate": 6.416291835215636e-05,
"loss": 0.4077,
"step": 539
},
{
"epoch": 1.8220678037481575,
"grad_norm": 0.41379320932213953,
"learning_rate": 6.408766724101638e-05,
"loss": 0.4077,
"step": 540
},
{
"epoch": 1.8254369340913876,
"grad_norm": 0.43767998472550695,
"learning_rate": 6.401228213572663e-05,
"loss": 0.4151,
"step": 541
},
{
"epoch": 1.8288060644346178,
"grad_norm": 0.4536984763596022,
"learning_rate": 6.393676345563708e-05,
"loss": 0.42,
"step": 542
},
{
"epoch": 1.832175194777848,
"grad_norm": 0.4692529959956868,
"learning_rate": 6.386111162084078e-05,
"loss": 0.4002,
"step": 543
},
{
"epoch": 1.8355443251210781,
"grad_norm": 0.34237321055490366,
"learning_rate": 6.378532705217148e-05,
"loss": 0.406,
"step": 544
},
{
"epoch": 1.8389134554643083,
"grad_norm": 0.2659729255014706,
"learning_rate": 6.370941017120127e-05,
"loss": 0.4135,
"step": 545
},
{
"epoch": 1.8422825858075385,
"grad_norm": 0.32797296963486666,
"learning_rate": 6.363336140023833e-05,
"loss": 0.4088,
"step": 546
},
{
"epoch": 1.8456517161507686,
"grad_norm": 0.35579650932418716,
"learning_rate": 6.355718116232444e-05,
"loss": 0.4093,
"step": 547
},
{
"epoch": 1.8490208464939988,
"grad_norm": 0.2907411351475013,
"learning_rate": 6.348086988123274e-05,
"loss": 0.4116,
"step": 548
},
{
"epoch": 1.852389976837229,
"grad_norm": 0.2732388318681213,
"learning_rate": 6.340442798146535e-05,
"loss": 0.4091,
"step": 549
},
{
"epoch": 1.855759107180459,
"grad_norm": 0.35761144655913124,
"learning_rate": 6.332785588825094e-05,
"loss": 0.4037,
"step": 550
},
{
"epoch": 1.8591282375236893,
"grad_norm": 0.3014328362434633,
"learning_rate": 6.325115402754245e-05,
"loss": 0.4072,
"step": 551
},
{
"epoch": 1.8624973678669194,
"grad_norm": 0.2340334979203501,
"learning_rate": 6.317432282601469e-05,
"loss": 0.403,
"step": 552
},
{
"epoch": 1.8658664982101496,
"grad_norm": 0.33855256005840595,
"learning_rate": 6.309736271106193e-05,
"loss": 0.4106,
"step": 553
},
{
"epoch": 1.8692356285533798,
"grad_norm": 0.31482993852294594,
"learning_rate": 6.302027411079562e-05,
"loss": 0.4079,
"step": 554
},
{
"epoch": 1.87260475889661,
"grad_norm": 0.21683415129270545,
"learning_rate": 6.294305745404185e-05,
"loss": 0.4032,
"step": 555
},
{
"epoch": 1.87597388923984,
"grad_norm": 0.209469978649313,
"learning_rate": 6.286571317033915e-05,
"loss": 0.4088,
"step": 556
},
{
"epoch": 1.8793430195830703,
"grad_norm": 0.2816343476274617,
"learning_rate": 6.278824168993596e-05,
"loss": 0.4126,
"step": 557
},
{
"epoch": 1.8827121499263004,
"grad_norm": 0.32252631746288557,
"learning_rate": 6.271064344378832e-05,
"loss": 0.4086,
"step": 558
},
{
"epoch": 1.8860812802695306,
"grad_norm": 0.2900131891387989,
"learning_rate": 6.263291886355738e-05,
"loss": 0.4086,
"step": 559
},
{
"epoch": 1.8894504106127605,
"grad_norm": 0.26445922268042416,
"learning_rate": 6.255506838160711e-05,
"loss": 0.4093,
"step": 560
},
{
"epoch": 1.8928195409559907,
"grad_norm": 0.2561028521945913,
"learning_rate": 6.247709243100185e-05,
"loss": 0.4136,
"step": 561
},
{
"epoch": 1.8961886712992209,
"grad_norm": 0.23899571940882475,
"learning_rate": 6.239899144550383e-05,
"loss": 0.4058,
"step": 562
},
{
"epoch": 1.899557801642451,
"grad_norm": 0.2338421290415243,
"learning_rate": 6.232076585957087e-05,
"loss": 0.4074,
"step": 563
},
{
"epoch": 1.9029269319856812,
"grad_norm": 0.18752299712254275,
"learning_rate": 6.224241610835391e-05,
"loss": 0.4096,
"step": 564
},
{
"epoch": 1.9062960623289114,
"grad_norm": 0.19324708447438393,
"learning_rate": 6.216394262769459e-05,
"loss": 0.4096,
"step": 565
},
{
"epoch": 1.9096651926721415,
"grad_norm": 0.21276012461948887,
"learning_rate": 6.208534585412282e-05,
"loss": 0.4033,
"step": 566
},
{
"epoch": 1.9130343230153717,
"grad_norm": 0.18970083289771164,
"learning_rate": 6.200662622485435e-05,
"loss": 0.4054,
"step": 567
},
{
"epoch": 1.9164034533586018,
"grad_norm": 0.1696360552220803,
"learning_rate": 6.19277841777884e-05,
"loss": 0.4069,
"step": 568
},
{
"epoch": 1.919772583701832,
"grad_norm": 0.19478504599245822,
"learning_rate": 6.18488201515051e-05,
"loss": 0.4054,
"step": 569
},
{
"epoch": 1.923141714045062,
"grad_norm": 0.16721019486842992,
"learning_rate": 6.176973458526317e-05,
"loss": 0.4142,
"step": 570
},
{
"epoch": 1.9265108443882921,
"grad_norm": 0.18059816629328238,
"learning_rate": 6.169052791899742e-05,
"loss": 0.4047,
"step": 571
},
{
"epoch": 1.9298799747315223,
"grad_norm": 0.2125539453111369,
"learning_rate": 6.161120059331628e-05,
"loss": 0.4074,
"step": 572
},
{
"epoch": 1.9332491050747524,
"grad_norm": 0.19087275687720429,
"learning_rate": 6.153175304949946e-05,
"loss": 0.411,
"step": 573
},
{
"epoch": 1.9366182354179826,
"grad_norm": 0.18049162279809125,
"learning_rate": 6.14521857294953e-05,
"loss": 0.4055,
"step": 574
},
{
"epoch": 1.9399873657612128,
"grad_norm": 0.17375875826436044,
"learning_rate": 6.137249907591855e-05,
"loss": 0.4065,
"step": 575
},
{
"epoch": 1.943356496104443,
"grad_norm": 0.1739704448036202,
"learning_rate": 6.129269353204769e-05,
"loss": 0.4055,
"step": 576
},
{
"epoch": 1.946725626447673,
"grad_norm": 0.18538527661707113,
"learning_rate": 6.121276954182261e-05,
"loss": 0.4097,
"step": 577
},
{
"epoch": 1.9500947567909033,
"grad_norm": 0.15156397322647622,
"learning_rate": 6.113272754984206e-05,
"loss": 0.4061,
"step": 578
},
{
"epoch": 1.9534638871341334,
"grad_norm": 0.18018187705246097,
"learning_rate": 6.105256800136125e-05,
"loss": 0.4086,
"step": 579
},
{
"epoch": 1.9568330174773636,
"grad_norm": 0.1842284584819115,
"learning_rate": 6.0972291342289274e-05,
"loss": 0.413,
"step": 580
},
{
"epoch": 1.9602021478205938,
"grad_norm": 0.20065268901018266,
"learning_rate": 6.0891898019186726e-05,
"loss": 0.4068,
"step": 581
},
{
"epoch": 1.963571278163824,
"grad_norm": 0.20725303582942523,
"learning_rate": 6.081138847926317e-05,
"loss": 0.4102,
"step": 582
},
{
"epoch": 1.966940408507054,
"grad_norm": 0.19644421357341532,
"learning_rate": 6.0730763170374636e-05,
"loss": 0.4053,
"step": 583
},
{
"epoch": 1.9703095388502843,
"grad_norm": 0.20950085034614344,
"learning_rate": 6.065002254102116e-05,
"loss": 0.4043,
"step": 584
},
{
"epoch": 1.9736786691935144,
"grad_norm": 0.22898989423400687,
"learning_rate": 6.056916704034429e-05,
"loss": 0.4038,
"step": 585
},
{
"epoch": 1.9770477995367446,
"grad_norm": 0.2379556008347109,
"learning_rate": 6.048819711812457e-05,
"loss": 0.4075,
"step": 586
},
{
"epoch": 1.9804169298799748,
"grad_norm": 0.23608922426333814,
"learning_rate": 6.040711322477906e-05,
"loss": 0.4074,
"step": 587
},
{
"epoch": 1.983786060223205,
"grad_norm": 0.2036587578092891,
"learning_rate": 6.032591581135878e-05,
"loss": 0.4116,
"step": 588
},
{
"epoch": 1.987155190566435,
"grad_norm": 0.1851902404809834,
"learning_rate": 6.024460532954626e-05,
"loss": 0.4015,
"step": 589
},
{
"epoch": 1.9905243209096652,
"grad_norm": 0.18802588423448818,
"learning_rate": 6.0163182231652985e-05,
"loss": 0.4054,
"step": 590
},
{
"epoch": 1.9938934512528954,
"grad_norm": 0.22345260630855865,
"learning_rate": 6.008164697061695e-05,
"loss": 0.4055,
"step": 591
},
{
"epoch": 1.9972625815961256,
"grad_norm": 0.23969549917986255,
"learning_rate": 6.000000000000001e-05,
"loss": 0.4015,
"step": 592
},
{
"epoch": 2.00336913034323,
"grad_norm": 0.2867299003150961,
"learning_rate": 5.991824177398549e-05,
"loss": 0.3913,
"step": 593
},
{
"epoch": 2.0067382606864603,
"grad_norm": 0.34375862252314415,
"learning_rate": 5.983637274737558e-05,
"loss": 0.391,
"step": 594
},
{
"epoch": 2.0101073910296905,
"grad_norm": 0.3635152444198319,
"learning_rate": 5.975439337558886e-05,
"loss": 0.3799,
"step": 595
},
{
"epoch": 2.0134765213729207,
"grad_norm": 0.3422619581016819,
"learning_rate": 5.967230411465768e-05,
"loss": 0.388,
"step": 596
},
{
"epoch": 2.016845651716151,
"grad_norm": 0.32857568135445225,
"learning_rate": 5.9590105421225715e-05,
"loss": 0.3873,
"step": 597
},
{
"epoch": 2.020214782059381,
"grad_norm": 0.34465546224144156,
"learning_rate": 5.950779775254539e-05,
"loss": 0.3864,
"step": 598
},
{
"epoch": 2.023583912402611,
"grad_norm": 0.3318091541966093,
"learning_rate": 5.9425381566475316e-05,
"loss": 0.3901,
"step": 599
},
{
"epoch": 2.0269530427458413,
"grad_norm": 0.3211852458337534,
"learning_rate": 5.934285732147778e-05,
"loss": 0.3865,
"step": 600
},
{
"epoch": 2.0303221730890715,
"grad_norm": 0.28372803606540153,
"learning_rate": 5.9260225476616157e-05,
"loss": 0.3809,
"step": 601
},
{
"epoch": 2.0336913034323016,
"grad_norm": 0.26378333051858827,
"learning_rate": 5.91774864915524e-05,
"loss": 0.3825,
"step": 602
},
{
"epoch": 2.037060433775532,
"grad_norm": 0.2699942011391507,
"learning_rate": 5.909464082654442e-05,
"loss": 0.3814,
"step": 603
},
{
"epoch": 2.040429564118762,
"grad_norm": 0.32423565538212784,
"learning_rate": 5.90116889424436e-05,
"loss": 0.3949,
"step": 604
},
{
"epoch": 2.043798694461992,
"grad_norm": 0.3504800062724603,
"learning_rate": 5.8928631300692185e-05,
"loss": 0.3919,
"step": 605
},
{
"epoch": 2.0471678248052223,
"grad_norm": 0.28670213447600656,
"learning_rate": 5.884546836332072e-05,
"loss": 0.3848,
"step": 606
},
{
"epoch": 2.0505369551484525,
"grad_norm": 0.24765267252916567,
"learning_rate": 5.8762200592945484e-05,
"loss": 0.3862,
"step": 607
},
{
"epoch": 2.0539060854916826,
"grad_norm": 0.25397158563496697,
"learning_rate": 5.867882845276593e-05,
"loss": 0.384,
"step": 608
},
{
"epoch": 2.057275215834913,
"grad_norm": 0.19777815923412465,
"learning_rate": 5.859535240656208e-05,
"loss": 0.385,
"step": 609
},
{
"epoch": 2.060644346178143,
"grad_norm": 0.25257499668230105,
"learning_rate": 5.851177291869197e-05,
"loss": 0.3902,
"step": 610
},
{
"epoch": 2.064013476521373,
"grad_norm": 0.23438152088089984,
"learning_rate": 5.842809045408905e-05,
"loss": 0.3828,
"step": 611
},
{
"epoch": 2.0673826068646033,
"grad_norm": 0.24579596547862945,
"learning_rate": 5.834430547825964e-05,
"loss": 0.3895,
"step": 612
},
{
"epoch": 2.070751737207833,
"grad_norm": 0.254567202187919,
"learning_rate": 5.826041845728026e-05,
"loss": 0.3884,
"step": 613
},
{
"epoch": 2.074120867551063,
"grad_norm": 0.26694805867978466,
"learning_rate": 5.8176429857795104e-05,
"loss": 0.3884,
"step": 614
},
{
"epoch": 2.0774899978942933,
"grad_norm": 0.292686078529123,
"learning_rate": 5.809234014701342e-05,
"loss": 0.3869,
"step": 615
},
{
"epoch": 2.0808591282375235,
"grad_norm": 0.2543773210365024,
"learning_rate": 5.8008149792706936e-05,
"loss": 0.3841,
"step": 616
},
{
"epoch": 2.0842282585807537,
"grad_norm": 0.23117543050120432,
"learning_rate": 5.7923859263207205e-05,
"loss": 0.3839,
"step": 617
},
{
"epoch": 2.087597388923984,
"grad_norm": 0.32949270894440474,
"learning_rate": 5.783946902740304e-05,
"loss": 0.3848,
"step": 618
},
{
"epoch": 2.090966519267214,
"grad_norm": 0.3487344164810163,
"learning_rate": 5.7754979554737924e-05,
"loss": 0.3841,
"step": 619
},
{
"epoch": 2.094335649610444,
"grad_norm": 0.23249972606551436,
"learning_rate": 5.767039131520733e-05,
"loss": 0.3808,
"step": 620
},
{
"epoch": 2.0977047799536743,
"grad_norm": 0.1642526127565639,
"learning_rate": 5.758570477935618e-05,
"loss": 0.3852,
"step": 621
},
{
"epoch": 2.1010739102969045,
"grad_norm": 0.22737138050339126,
"learning_rate": 5.750092041827618e-05,
"loss": 0.3862,
"step": 622
},
{
"epoch": 2.1044430406401347,
"grad_norm": 0.22187422496371617,
"learning_rate": 5.7416038703603216e-05,
"loss": 0.39,
"step": 623
},
{
"epoch": 2.107812170983365,
"grad_norm": 0.1976542359852637,
"learning_rate": 5.7331060107514754e-05,
"loss": 0.3828,
"step": 624
},
{
"epoch": 2.111181301326595,
"grad_norm": 0.22929255732564582,
"learning_rate": 5.724598510272714e-05,
"loss": 0.3865,
"step": 625
},
{
"epoch": 2.114550431669825,
"grad_norm": 0.2281829564525587,
"learning_rate": 5.716081416249307e-05,
"loss": 0.3834,
"step": 626
},
{
"epoch": 2.1179195620130553,
"grad_norm": 0.1711530750792344,
"learning_rate": 5.707554776059886e-05,
"loss": 0.3864,
"step": 627
},
{
"epoch": 2.1212886923562855,
"grad_norm": 0.1952598465412235,
"learning_rate": 5.699018637136192e-05,
"loss": 0.3853,
"step": 628
},
{
"epoch": 2.1246578226995156,
"grad_norm": 0.21178404694012465,
"learning_rate": 5.6904730469627985e-05,
"loss": 0.394,
"step": 629
},
{
"epoch": 2.128026953042746,
"grad_norm": 0.2291084803798316,
"learning_rate": 5.681918053076858e-05,
"loss": 0.3851,
"step": 630
},
{
"epoch": 2.131396083385976,
"grad_norm": 0.2550272051240587,
"learning_rate": 5.673353703067832e-05,
"loss": 0.3872,
"step": 631
},
{
"epoch": 2.134765213729206,
"grad_norm": 0.2497998419444254,
"learning_rate": 5.664780044577231e-05,
"loss": 0.3881,
"step": 632
},
{
"epoch": 2.1381343440724363,
"grad_norm": 0.2222082480877385,
"learning_rate": 5.6561971252983424e-05,
"loss": 0.388,
"step": 633
},
{
"epoch": 2.1415034744156665,
"grad_norm": 0.18680744639544267,
"learning_rate": 5.6476049929759714e-05,
"loss": 0.3891,
"step": 634
},
{
"epoch": 2.1448726047588966,
"grad_norm": 0.21245971460544757,
"learning_rate": 5.6390036954061726e-05,
"loss": 0.3863,
"step": 635
},
{
"epoch": 2.148241735102127,
"grad_norm": 0.2162219122370638,
"learning_rate": 5.6303932804359857e-05,
"loss": 0.3909,
"step": 636
},
{
"epoch": 2.151610865445357,
"grad_norm": 0.15581628741660436,
"learning_rate": 5.621773795963166e-05,
"loss": 0.3879,
"step": 637
},
{
"epoch": 2.154979995788587,
"grad_norm": 0.22990888646168536,
"learning_rate": 5.613145289935926e-05,
"loss": 0.3882,
"step": 638
},
{
"epoch": 2.1583491261318173,
"grad_norm": 0.24959544004712048,
"learning_rate": 5.6045078103526545e-05,
"loss": 0.3799,
"step": 639
},
{
"epoch": 2.1617182564750475,
"grad_norm": 0.2308113655952683,
"learning_rate": 5.595861405261666e-05,
"loss": 0.3879,
"step": 640
},
{
"epoch": 2.1650873868182776,
"grad_norm": 0.2092244335914582,
"learning_rate": 5.58720612276092e-05,
"loss": 0.3871,
"step": 641
},
{
"epoch": 2.168456517161508,
"grad_norm": 0.2134067897632055,
"learning_rate": 5.578542010997764e-05,
"loss": 0.3822,
"step": 642
},
{
"epoch": 2.171825647504738,
"grad_norm": 0.20839647987055449,
"learning_rate": 5.569869118168655e-05,
"loss": 0.3848,
"step": 643
},
{
"epoch": 2.175194777847968,
"grad_norm": 0.16985344503865618,
"learning_rate": 5.561187492518903e-05,
"loss": 0.3858,
"step": 644
},
{
"epoch": 2.1785639081911983,
"grad_norm": 0.20941799721128232,
"learning_rate": 5.5524971823423905e-05,
"loss": 0.392,
"step": 645
},
{
"epoch": 2.1819330385344284,
"grad_norm": 0.21048667694813664,
"learning_rate": 5.5437982359813156e-05,
"loss": 0.3837,
"step": 646
},
{
"epoch": 2.1853021688776586,
"grad_norm": 0.17246060013503955,
"learning_rate": 5.5350907018259135e-05,
"loss": 0.3863,
"step": 647
},
{
"epoch": 2.1886712992208888,
"grad_norm": 0.1808917523018754,
"learning_rate": 5.526374628314195e-05,
"loss": 0.3873,
"step": 648
},
{
"epoch": 2.192040429564119,
"grad_norm": 0.16962189075007583,
"learning_rate": 5.5176500639316693e-05,
"loss": 0.3806,
"step": 649
},
{
"epoch": 2.195409559907349,
"grad_norm": 0.15829489129124838,
"learning_rate": 5.50891705721108e-05,
"loss": 0.3912,
"step": 650
},
{
"epoch": 2.1987786902505793,
"grad_norm": 0.20128590320313494,
"learning_rate": 5.5001756567321355e-05,
"loss": 0.3792,
"step": 651
},
{
"epoch": 2.2021478205938094,
"grad_norm": 0.218877863583923,
"learning_rate": 5.4914259111212355e-05,
"loss": 0.3865,
"step": 652
},
{
"epoch": 2.2055169509370396,
"grad_norm": 0.17606235529471279,
"learning_rate": 5.482667869051199e-05,
"loss": 0.3917,
"step": 653
},
{
"epoch": 2.2088860812802693,
"grad_norm": 0.14890556371643418,
"learning_rate": 5.473901579241e-05,
"loss": 0.38,
"step": 654
},
{
"epoch": 2.2122552116235,
"grad_norm": 0.1654643380961197,
"learning_rate": 5.4651270904554915e-05,
"loss": 0.394,
"step": 655
},
{
"epoch": 2.2156243419667296,
"grad_norm": 0.1570214426630876,
"learning_rate": 5.4563444515051354e-05,
"loss": 0.3854,
"step": 656
},
{
"epoch": 2.21899347230996,
"grad_norm": 0.1691883131216727,
"learning_rate": 5.44755371124573e-05,
"loss": 0.3851,
"step": 657
},
{
"epoch": 2.22236260265319,
"grad_norm": 0.17557198906026328,
"learning_rate": 5.438754918578144e-05,
"loss": 0.3913,
"step": 658
},
{
"epoch": 2.22573173299642,
"grad_norm": 0.16768631591392807,
"learning_rate": 5.429948122448031e-05,
"loss": 0.386,
"step": 659
},
{
"epoch": 2.2291008633396503,
"grad_norm": 0.14731731125382688,
"learning_rate": 5.4211333718455756e-05,
"loss": 0.3922,
"step": 660
},
{
"epoch": 2.2324699936828805,
"grad_norm": 0.17746489461476853,
"learning_rate": 5.4123107158052034e-05,
"loss": 0.387,
"step": 661
},
{
"epoch": 2.2358391240261106,
"grad_norm": 0.1903089984499793,
"learning_rate": 5.4034802034053223e-05,
"loss": 0.3833,
"step": 662
},
{
"epoch": 2.239208254369341,
"grad_norm": 0.17184011460057994,
"learning_rate": 5.394641883768041e-05,
"loss": 0.39,
"step": 663
},
{
"epoch": 2.242577384712571,
"grad_norm": 0.20233097347593668,
"learning_rate": 5.3857958060588955e-05,
"loss": 0.3891,
"step": 664
},
{
"epoch": 2.245946515055801,
"grad_norm": 0.21958650033217517,
"learning_rate": 5.3769420194865806e-05,
"loss": 0.3856,
"step": 665
},
{
"epoch": 2.2493156453990313,
"grad_norm": 0.18358377095064263,
"learning_rate": 5.368080573302676e-05,
"loss": 0.3828,
"step": 666
},
{
"epoch": 2.2526847757422614,
"grad_norm": 0.17979672984272335,
"learning_rate": 5.359211516801365e-05,
"loss": 0.3804,
"step": 667
},
{
"epoch": 2.2560539060854916,
"grad_norm": 0.16294334924828324,
"learning_rate": 5.3503348993191706e-05,
"loss": 0.3825,
"step": 668
},
{
"epoch": 2.2594230364287218,
"grad_norm": 0.1508454226549176,
"learning_rate": 5.34145077023467e-05,
"loss": 0.385,
"step": 669
},
{
"epoch": 2.262792166771952,
"grad_norm": 0.15470462637665758,
"learning_rate": 5.332559178968231e-05,
"loss": 0.3778,
"step": 670
},
{
"epoch": 2.266161297115182,
"grad_norm": 0.1359656397629021,
"learning_rate": 5.3236601749817296e-05,
"loss": 0.3896,
"step": 671
},
{
"epoch": 2.2695304274584123,
"grad_norm": 0.15226695399087686,
"learning_rate": 5.314753807778276e-05,
"loss": 0.3874,
"step": 672
},
{
"epoch": 2.2728995578016424,
"grad_norm": 0.14503332183422835,
"learning_rate": 5.3058401269019415e-05,
"loss": 0.3878,
"step": 673
},
{
"epoch": 2.2762686881448726,
"grad_norm": 0.15318787409886342,
"learning_rate": 5.296919181937485e-05,
"loss": 0.3857,
"step": 674
},
{
"epoch": 2.2796378184881028,
"grad_norm": 0.16971373493795616,
"learning_rate": 5.2879910225100655e-05,
"loss": 0.3855,
"step": 675
},
{
"epoch": 2.283006948831333,
"grad_norm": 0.1654804092839339,
"learning_rate": 5.279055698284982e-05,
"loss": 0.3877,
"step": 676
},
{
"epoch": 2.286376079174563,
"grad_norm": 0.1505186583674958,
"learning_rate": 5.270113258967386e-05,
"loss": 0.3832,
"step": 677
},
{
"epoch": 2.2897452095177933,
"grad_norm": 0.16676666984467559,
"learning_rate": 5.261163754302011e-05,
"loss": 0.386,
"step": 678
},
{
"epoch": 2.2931143398610234,
"grad_norm": 0.18567032268425918,
"learning_rate": 5.2522072340728896e-05,
"loss": 0.3907,
"step": 679
},
{
"epoch": 2.2964834702042536,
"grad_norm": 0.1765483695468527,
"learning_rate": 5.2432437481030855e-05,
"loss": 0.3882,
"step": 680
},
{
"epoch": 2.2998526005474837,
"grad_norm": 0.165430115440251,
"learning_rate": 5.234273346254406e-05,
"loss": 0.3946,
"step": 681
},
{
"epoch": 2.303221730890714,
"grad_norm": 0.1690494896953244,
"learning_rate": 5.225296078427135e-05,
"loss": 0.3857,
"step": 682
},
{
"epoch": 2.306590861233944,
"grad_norm": 0.201198083663681,
"learning_rate": 5.216311994559744e-05,
"loss": 0.389,
"step": 683
},
{
"epoch": 2.3099599915771742,
"grad_norm": 0.20812621009650192,
"learning_rate": 5.207321144628628e-05,
"loss": 0.3865,
"step": 684
},
{
"epoch": 2.3133291219204044,
"grad_norm": 0.21426999240641148,
"learning_rate": 5.198323578647813e-05,
"loss": 0.3867,
"step": 685
},
{
"epoch": 2.3166982522636346,
"grad_norm": 0.213657425755296,
"learning_rate": 5.18931934666869e-05,
"loss": 0.3922,
"step": 686
},
{
"epoch": 2.3200673826068647,
"grad_norm": 0.17137164943244815,
"learning_rate": 5.180308498779728e-05,
"loss": 0.3789,
"step": 687
},
{
"epoch": 2.323436512950095,
"grad_norm": 0.18022826820320403,
"learning_rate": 5.171291085106202e-05,
"loss": 0.3815,
"step": 688
},
{
"epoch": 2.326805643293325,
"grad_norm": 0.1755115364994259,
"learning_rate": 5.162267155809908e-05,
"loss": 0.389,
"step": 689
},
{
"epoch": 2.3301747736365552,
"grad_norm": 0.2011673377143987,
"learning_rate": 5.153236761088888e-05,
"loss": 0.3894,
"step": 690
},
{
"epoch": 2.3335439039797854,
"grad_norm": 0.2305809255417625,
"learning_rate": 5.14419995117715e-05,
"loss": 0.3811,
"step": 691
},
{
"epoch": 2.3369130343230156,
"grad_norm": 0.2115835801437973,
"learning_rate": 5.135156776344389e-05,
"loss": 0.3892,
"step": 692
},
{
"epoch": 2.3402821646662453,
"grad_norm": 0.19470845993737926,
"learning_rate": 5.126107286895702e-05,
"loss": 0.3832,
"step": 693
},
{
"epoch": 2.343651295009476,
"grad_norm": 0.16438102517886552,
"learning_rate": 5.117051533171321e-05,
"loss": 0.3863,
"step": 694
},
{
"epoch": 2.3470204253527056,
"grad_norm": 0.17475480058915455,
"learning_rate": 5.1079895655463177e-05,
"loss": 0.3859,
"step": 695
},
{
"epoch": 2.350389555695936,
"grad_norm": 0.18741810484417695,
"learning_rate": 5.098921434430333e-05,
"loss": 0.3825,
"step": 696
},
{
"epoch": 2.353758686039166,
"grad_norm": 0.1687881382681767,
"learning_rate": 5.0898471902672917e-05,
"loss": 0.3758,
"step": 697
},
{
"epoch": 2.357127816382396,
"grad_norm": 0.18436298872908952,
"learning_rate": 5.080766883535129e-05,
"loss": 0.3852,
"step": 698
},
{
"epoch": 2.3604969467256263,
"grad_norm": 0.19845837669577285,
"learning_rate": 5.0716805647455006e-05,
"loss": 0.3854,
"step": 699
},
{
"epoch": 2.3638660770688564,
"grad_norm": 0.18343761135804904,
"learning_rate": 5.062588284443505e-05,
"loss": 0.3825,
"step": 700
},
{
"epoch": 2.3672352074120866,
"grad_norm": 0.13923107512819735,
"learning_rate": 5.053490093207408e-05,
"loss": 0.3797,
"step": 701
},
{
"epoch": 2.3706043377553168,
"grad_norm": 0.1783129344294203,
"learning_rate": 5.0443860416483536e-05,
"loss": 0.3813,
"step": 702
},
{
"epoch": 2.373973468098547,
"grad_norm": 0.2047126526455967,
"learning_rate": 5.0352761804100835e-05,
"loss": 0.3869,
"step": 703
},
{
"epoch": 2.377342598441777,
"grad_norm": 0.18677317936073162,
"learning_rate": 5.026160560168661e-05,
"loss": 0.3829,
"step": 704
},
{
"epoch": 2.3807117287850073,
"grad_norm": 0.15858411985283818,
"learning_rate": 5.0170392316321826e-05,
"loss": 0.3906,
"step": 705
},
{
"epoch": 2.3840808591282374,
"grad_norm": 0.1542922309469812,
"learning_rate": 5.0079122455405014e-05,
"loss": 0.3898,
"step": 706
},
{
"epoch": 2.3874499894714676,
"grad_norm": 0.16034757146153225,
"learning_rate": 4.9987796526649394e-05,
"loss": 0.3856,
"step": 707
},
{
"epoch": 2.3908191198146977,
"grad_norm": 0.17396513204876746,
"learning_rate": 4.989641503808011e-05,
"loss": 0.3845,
"step": 708
},
{
"epoch": 2.394188250157928,
"grad_norm": 0.14385199298465493,
"learning_rate": 4.9804978498031326e-05,
"loss": 0.383,
"step": 709
},
{
"epoch": 2.397557380501158,
"grad_norm": 0.1424278412585639,
"learning_rate": 4.971348741514349e-05,
"loss": 0.3923,
"step": 710
},
{
"epoch": 2.4009265108443882,
"grad_norm": 0.18492577887926495,
"learning_rate": 4.962194229836045e-05,
"loss": 0.3841,
"step": 711
},
{
"epoch": 2.4042956411876184,
"grad_norm": 0.1732020596072231,
"learning_rate": 4.95303436569266e-05,
"loss": 0.3915,
"step": 712
},
{
"epoch": 2.4076647715308486,
"grad_norm": 0.12301305622548196,
"learning_rate": 4.943869200038413e-05,
"loss": 0.384,
"step": 713
},
{
"epoch": 2.4110339018740787,
"grad_norm": 0.18053993824097098,
"learning_rate": 4.934698783857011e-05,
"loss": 0.3817,
"step": 714
},
{
"epoch": 2.414403032217309,
"grad_norm": 0.21725687137817615,
"learning_rate": 4.9255231681613674e-05,
"loss": 0.3887,
"step": 715
},
{
"epoch": 2.417772162560539,
"grad_norm": 0.17070860183839026,
"learning_rate": 4.91634240399332e-05,
"loss": 0.3842,
"step": 716
},
{
"epoch": 2.4211412929037692,
"grad_norm": 0.16062080472612222,
"learning_rate": 4.907156542423351e-05,
"loss": 0.3753,
"step": 717
},
{
"epoch": 2.4245104232469994,
"grad_norm": 0.16452143222682503,
"learning_rate": 4.8979656345502904e-05,
"loss": 0.3819,
"step": 718
},
{
"epoch": 2.4278795535902296,
"grad_norm": 0.17121464354448115,
"learning_rate": 4.888769731501047e-05,
"loss": 0.3829,
"step": 719
},
{
"epoch": 2.4312486839334597,
"grad_norm": 0.1588530781256576,
"learning_rate": 4.8795688844303114e-05,
"loss": 0.3872,
"step": 720
},
{
"epoch": 2.43461781427669,
"grad_norm": 0.15259487087295576,
"learning_rate": 4.870363144520279e-05,
"loss": 0.3878,
"step": 721
},
{
"epoch": 2.43798694461992,
"grad_norm": 0.15808052014003177,
"learning_rate": 4.861152562980362e-05,
"loss": 0.3827,
"step": 722
},
{
"epoch": 2.44135607496315,
"grad_norm": 0.18095527833139824,
"learning_rate": 4.851937191046906e-05,
"loss": 0.3828,
"step": 723
},
{
"epoch": 2.4447252053063804,
"grad_norm": 0.17700515235134065,
"learning_rate": 4.8427170799829055e-05,
"loss": 0.3849,
"step": 724
},
{
"epoch": 2.4480943356496105,
"grad_norm": 0.15108262997817984,
"learning_rate": 4.833492281077717e-05,
"loss": 0.3827,
"step": 725
},
{
"epoch": 2.4514634659928407,
"grad_norm": 0.14610122044801815,
"learning_rate": 4.824262845646771e-05,
"loss": 0.3891,
"step": 726
},
{
"epoch": 2.454832596336071,
"grad_norm": 0.17949690552168968,
"learning_rate": 4.815028825031295e-05,
"loss": 0.3824,
"step": 727
},
{
"epoch": 2.458201726679301,
"grad_norm": 0.17860414349949053,
"learning_rate": 4.805790270598021e-05,
"loss": 0.3859,
"step": 728
},
{
"epoch": 2.461570857022531,
"grad_norm": 0.15714664302158635,
"learning_rate": 4.796547233738901e-05,
"loss": 0.3805,
"step": 729
},
{
"epoch": 2.4649399873657614,
"grad_norm": 0.13409742518350323,
"learning_rate": 4.787299765870822e-05,
"loss": 0.3894,
"step": 730
},
{
"epoch": 2.4683091177089915,
"grad_norm": 0.1375698590454868,
"learning_rate": 4.77804791843532e-05,
"loss": 0.3885,
"step": 731
},
{
"epoch": 2.4716782480522217,
"grad_norm": 0.1382618240475382,
"learning_rate": 4.768791742898292e-05,
"loss": 0.3875,
"step": 732
},
{
"epoch": 2.475047378395452,
"grad_norm": 0.1398622806337096,
"learning_rate": 4.7595312907497135e-05,
"loss": 0.3853,
"step": 733
},
{
"epoch": 2.4784165087386816,
"grad_norm": 0.14539506330457003,
"learning_rate": 4.7502666135033486e-05,
"loss": 0.3935,
"step": 734
},
{
"epoch": 2.481785639081912,
"grad_norm": 0.13109075183048932,
"learning_rate": 4.7409977626964666e-05,
"loss": 0.3848,
"step": 735
},
{
"epoch": 2.485154769425142,
"grad_norm": 0.12988278807806955,
"learning_rate": 4.731724789889547e-05,
"loss": 0.3839,
"step": 736
},
{
"epoch": 2.4885238997683725,
"grad_norm": 0.1578289932884262,
"learning_rate": 4.722447746666008e-05,
"loss": 0.3836,
"step": 737
},
{
"epoch": 2.4918930301116022,
"grad_norm": 0.1696600549846316,
"learning_rate": 4.7131666846319036e-05,
"loss": 0.3825,
"step": 738
},
{
"epoch": 2.495262160454833,
"grad_norm": 0.13151686953984587,
"learning_rate": 4.7038816554156484e-05,
"loss": 0.3879,
"step": 739
},
{
"epoch": 2.4986312907980626,
"grad_norm": 0.19638702203051203,
"learning_rate": 4.694592710667723e-05,
"loss": 0.3873,
"step": 740
},
{
"epoch": 2.502000421141293,
"grad_norm": 0.18899466534966777,
"learning_rate": 4.6852999020603864e-05,
"loss": 0.3808,
"step": 741
},
{
"epoch": 2.505369551484523,
"grad_norm": 0.12219071702355794,
"learning_rate": 4.676003281287397e-05,
"loss": 0.3876,
"step": 742
},
{
"epoch": 2.508738681827753,
"grad_norm": 0.18236706911247189,
"learning_rate": 4.6667029000637164e-05,
"loss": 0.3846,
"step": 743
},
{
"epoch": 2.5121078121709832,
"grad_norm": 0.1684130303158305,
"learning_rate": 4.657398810125225e-05,
"loss": 0.3888,
"step": 744
},
{
"epoch": 2.5154769425142134,
"grad_norm": 0.16891778570455948,
"learning_rate": 4.648091063228435e-05,
"loss": 0.3878,
"step": 745
},
{
"epoch": 2.5188460728574436,
"grad_norm": 0.16123369621023537,
"learning_rate": 4.638779711150198e-05,
"loss": 0.3888,
"step": 746
},
{
"epoch": 2.5222152032006737,
"grad_norm": 0.13513366343949626,
"learning_rate": 4.629464805687426e-05,
"loss": 0.3826,
"step": 747
},
{
"epoch": 2.525584333543904,
"grad_norm": 0.1460461212872677,
"learning_rate": 4.620146398656792e-05,
"loss": 0.3841,
"step": 748
},
{
"epoch": 2.528953463887134,
"grad_norm": 0.16497117181141158,
"learning_rate": 4.610824541894452e-05,
"loss": 0.3842,
"step": 749
},
{
"epoch": 2.532322594230364,
"grad_norm": 0.16290788207612428,
"learning_rate": 4.601499287255748e-05,
"loss": 0.3885,
"step": 750
},
{
"epoch": 2.5356917245735944,
"grad_norm": 0.14489151093892186,
"learning_rate": 4.592170686614926e-05,
"loss": 0.3909,
"step": 751
},
{
"epoch": 2.5390608549168245,
"grad_norm": 0.1464122207528577,
"learning_rate": 4.582838791864846e-05,
"loss": 0.3864,
"step": 752
},
{
"epoch": 2.5424299852600547,
"grad_norm": 0.1543922436683134,
"learning_rate": 4.5735036549166907e-05,
"loss": 0.3781,
"step": 753
},
{
"epoch": 2.545799115603285,
"grad_norm": 0.1511363443793848,
"learning_rate": 4.5641653276996774e-05,
"loss": 0.388,
"step": 754
},
{
"epoch": 2.549168245946515,
"grad_norm": 0.14775900613642287,
"learning_rate": 4.5548238621607735e-05,
"loss": 0.3829,
"step": 755
},
{
"epoch": 2.552537376289745,
"grad_norm": 0.1609040357156897,
"learning_rate": 4.5454793102644006e-05,
"loss": 0.3913,
"step": 756
},
{
"epoch": 2.5559065066329754,
"grad_norm": 0.17452716126040962,
"learning_rate": 4.5361317239921515e-05,
"loss": 0.387,
"step": 757
},
{
"epoch": 2.5592756369762055,
"grad_norm": 0.15479208730353294,
"learning_rate": 4.5267811553424945e-05,
"loss": 0.3794,
"step": 758
},
{
"epoch": 2.5626447673194357,
"grad_norm": 0.16977092756406884,
"learning_rate": 4.517427656330496e-05,
"loss": 0.3813,
"step": 759
},
{
"epoch": 2.566013897662666,
"grad_norm": 0.15943557512689435,
"learning_rate": 4.5080712789875154e-05,
"loss": 0.3886,
"step": 760
},
{
"epoch": 2.569383028005896,
"grad_norm": 0.15146661036703893,
"learning_rate": 4.498712075360929e-05,
"loss": 0.3779,
"step": 761
},
{
"epoch": 2.572752158349126,
"grad_norm": 0.1583016214192411,
"learning_rate": 4.489350097513829e-05,
"loss": 0.3861,
"step": 762
},
{
"epoch": 2.5761212886923563,
"grad_norm": 0.18203713661130738,
"learning_rate": 4.479985397524748e-05,
"loss": 0.3872,
"step": 763
},
{
"epoch": 2.5794904190355865,
"grad_norm": 0.1411770309939346,
"learning_rate": 4.470618027487354e-05,
"loss": 0.3833,
"step": 764
},
{
"epoch": 2.5828595493788167,
"grad_norm": 0.15778048291503943,
"learning_rate": 4.4612480395101736e-05,
"loss": 0.3835,
"step": 765
},
{
"epoch": 2.586228679722047,
"grad_norm": 0.20283325612723238,
"learning_rate": 4.451875485716292e-05,
"loss": 0.3804,
"step": 766
},
{
"epoch": 2.589597810065277,
"grad_norm": 0.15957667387644875,
"learning_rate": 4.44250041824307e-05,
"loss": 0.3759,
"step": 767
},
{
"epoch": 2.592966940408507,
"grad_norm": 0.15580437360078891,
"learning_rate": 4.4331228892418473e-05,
"loss": 0.3869,
"step": 768
},
{
"epoch": 2.5963360707517373,
"grad_norm": 0.16733864762153852,
"learning_rate": 4.4237429508776645e-05,
"loss": 0.3901,
"step": 769
},
{
"epoch": 2.5997052010949675,
"grad_norm": 0.16840382892762462,
"learning_rate": 4.414360655328957e-05,
"loss": 0.3887,
"step": 770
},
{
"epoch": 2.6030743314381977,
"grad_norm": 0.16500477542253614,
"learning_rate": 4.4049760547872786e-05,
"loss": 0.3821,
"step": 771
},
{
"epoch": 2.606443461781428,
"grad_norm": 0.17637661287184536,
"learning_rate": 4.395589201457e-05,
"loss": 0.3901,
"step": 772
},
{
"epoch": 2.6098125921246575,
"grad_norm": 0.1426864712324038,
"learning_rate": 4.386200147555027e-05,
"loss": 0.3822,
"step": 773
},
{
"epoch": 2.613181722467888,
"grad_norm": 0.1359883054124575,
"learning_rate": 4.376808945310505e-05,
"loss": 0.3907,
"step": 774
},
{
"epoch": 2.616550852811118,
"grad_norm": 0.15390613245324686,
"learning_rate": 4.3674156469645335e-05,
"loss": 0.3844,
"step": 775
},
{
"epoch": 2.6199199831543485,
"grad_norm": 0.12544051069791048,
"learning_rate": 4.358020304769867e-05,
"loss": 0.3848,
"step": 776
},
{
"epoch": 2.623289113497578,
"grad_norm": 0.12982821849005882,
"learning_rate": 4.348622970990634e-05,
"loss": 0.386,
"step": 777
},
{
"epoch": 2.626658243840809,
"grad_norm": 0.15120996993879657,
"learning_rate": 4.339223697902037e-05,
"loss": 0.3809,
"step": 778
},
{
"epoch": 2.6300273741840385,
"grad_norm": 0.13233029817309008,
"learning_rate": 4.329822537790073e-05,
"loss": 0.3841,
"step": 779
},
{
"epoch": 2.633396504527269,
"grad_norm": 0.14136223246926025,
"learning_rate": 4.320419542951228e-05,
"loss": 0.3838,
"step": 780
},
{
"epoch": 2.636765634870499,
"grad_norm": 0.1228901057783663,
"learning_rate": 4.3110147656922034e-05,
"loss": 0.3802,
"step": 781
},
{
"epoch": 2.6401347652137295,
"grad_norm": 0.13251524939594994,
"learning_rate": 4.3016082583296067e-05,
"loss": 0.378,
"step": 782
},
{
"epoch": 2.643503895556959,
"grad_norm": 0.13001677701359055,
"learning_rate": 4.292200073189676e-05,
"loss": 0.3841,
"step": 783
},
{
"epoch": 2.6468730259001894,
"grad_norm": 0.15991064871524435,
"learning_rate": 4.2827902626079784e-05,
"loss": 0.3875,
"step": 784
},
{
"epoch": 2.6502421562434195,
"grad_norm": 0.12111670308432425,
"learning_rate": 4.2733788789291275e-05,
"loss": 0.3873,
"step": 785
},
{
"epoch": 2.6536112865866497,
"grad_norm": 0.1593860904845142,
"learning_rate": 4.263965974506483e-05,
"loss": 0.3864,
"step": 786
},
{
"epoch": 2.65698041692988,
"grad_norm": 0.16167383614529757,
"learning_rate": 4.254551601701866e-05,
"loss": 0.3845,
"step": 787
},
{
"epoch": 2.66034954727311,
"grad_norm": 0.13801503703615994,
"learning_rate": 4.2451358128852654e-05,
"loss": 0.3876,
"step": 788
},
{
"epoch": 2.66371867761634,
"grad_norm": 0.13674433021590243,
"learning_rate": 4.23571866043455e-05,
"loss": 0.3836,
"step": 789
},
{
"epoch": 2.6670878079595703,
"grad_norm": 0.1567228984572654,
"learning_rate": 4.22630019673517e-05,
"loss": 0.3819,
"step": 790
},
{
"epoch": 2.6704569383028005,
"grad_norm": 0.13292233430502193,
"learning_rate": 4.216880474179871e-05,
"loss": 0.3772,
"step": 791
},
{
"epoch": 2.6738260686460307,
"grad_norm": 0.14610126476091434,
"learning_rate": 4.207459545168405e-05,
"loss": 0.391,
"step": 792
},
{
"epoch": 2.677195198989261,
"grad_norm": 0.1295036399986597,
"learning_rate": 4.198037462107228e-05,
"loss": 0.39,
"step": 793
},
{
"epoch": 2.680564329332491,
"grad_norm": 0.14286486693120076,
"learning_rate": 4.188614277409224e-05,
"loss": 0.3824,
"step": 794
},
{
"epoch": 2.683933459675721,
"grad_norm": 0.1395089402065071,
"learning_rate": 4.179190043493397e-05,
"loss": 0.3893,
"step": 795
},
{
"epoch": 2.6873025900189513,
"grad_norm": 0.1312675673324047,
"learning_rate": 4.169764812784594e-05,
"loss": 0.3839,
"step": 796
},
{
"epoch": 2.6906717203621815,
"grad_norm": 0.15056150493927153,
"learning_rate": 4.1603386377132045e-05,
"loss": 0.3766,
"step": 797
},
{
"epoch": 2.6940408507054117,
"grad_norm": 0.15234002339266034,
"learning_rate": 4.1509115707148695e-05,
"loss": 0.3875,
"step": 798
},
{
"epoch": 2.697409981048642,
"grad_norm": 0.14172473902716337,
"learning_rate": 4.1414836642301954e-05,
"loss": 0.3835,
"step": 799
},
{
"epoch": 2.700779111391872,
"grad_norm": 0.1244063349961557,
"learning_rate": 4.132054970704454e-05,
"loss": 0.384,
"step": 800
},
{
"epoch": 2.704148241735102,
"grad_norm": 0.13151454461470574,
"learning_rate": 4.122625542587301e-05,
"loss": 0.3814,
"step": 801
},
{
"epoch": 2.7075173720783323,
"grad_norm": 0.13472018853386267,
"learning_rate": 4.1131954323324734e-05,
"loss": 0.3832,
"step": 802
},
{
"epoch": 2.7108865024215625,
"grad_norm": 0.14391402812007115,
"learning_rate": 4.103764692397504e-05,
"loss": 0.3907,
"step": 803
},
{
"epoch": 2.7142556327647926,
"grad_norm": 0.1204377593661656,
"learning_rate": 4.094333375243428e-05,
"loss": 0.3779,
"step": 804
},
{
"epoch": 2.717624763108023,
"grad_norm": 0.1345036853381592,
"learning_rate": 4.084901533334495e-05,
"loss": 0.3837,
"step": 805
},
{
"epoch": 2.720993893451253,
"grad_norm": 0.151432229349483,
"learning_rate": 4.075469219137868e-05,
"loss": 0.3867,
"step": 806
},
{
"epoch": 2.724363023794483,
"grad_norm": 0.13412559508113278,
"learning_rate": 4.066036485123344e-05,
"loss": 0.3809,
"step": 807
},
{
"epoch": 2.7277321541377133,
"grad_norm": 0.1407083620047968,
"learning_rate": 4.056603383763049e-05,
"loss": 0.3893,
"step": 808
},
{
"epoch": 2.7311012844809435,
"grad_norm": 0.1304023157361848,
"learning_rate": 4.0471699675311564e-05,
"loss": 0.3873,
"step": 809
},
{
"epoch": 2.7344704148241736,
"grad_norm": 0.13069329962842927,
"learning_rate": 4.0377362889035875e-05,
"loss": 0.3845,
"step": 810
},
{
"epoch": 2.737839545167404,
"grad_norm": 0.134836479542485,
"learning_rate": 4.0283024003577284e-05,
"loss": 0.3806,
"step": 811
},
{
"epoch": 2.741208675510634,
"grad_norm": 0.12753418534583713,
"learning_rate": 4.0188683543721295e-05,
"loss": 0.3797,
"step": 812
},
{
"epoch": 2.744577805853864,
"grad_norm": 0.13228859664320883,
"learning_rate": 4.009434203426215e-05,
"loss": 0.3856,
"step": 813
},
{
"epoch": 2.747946936197094,
"grad_norm": 0.14892311316819778,
"learning_rate": 4e-05,
"loss": 0.3838,
"step": 814
},
{
"epoch": 2.7513160665403245,
"grad_norm": 0.13386473278676905,
"learning_rate": 3.9905657965737854e-05,
"loss": 0.3829,
"step": 815
},
{
"epoch": 2.754685196883554,
"grad_norm": 0.14219980607382138,
"learning_rate": 3.981131645627872e-05,
"loss": 0.3819,
"step": 816
},
{
"epoch": 2.758054327226785,
"grad_norm": 0.1388449346696737,
"learning_rate": 3.971697599642273e-05,
"loss": 0.3834,
"step": 817
},
{
"epoch": 2.7614234575700145,
"grad_norm": 0.12977851410941868,
"learning_rate": 3.9622637110964125e-05,
"loss": 0.3831,
"step": 818
},
{
"epoch": 2.764792587913245,
"grad_norm": 0.13978459681010671,
"learning_rate": 3.9528300324688456e-05,
"loss": 0.383,
"step": 819
},
{
"epoch": 2.768161718256475,
"grad_norm": 0.13570459222433323,
"learning_rate": 3.943396616236953e-05,
"loss": 0.3851,
"step": 820
},
{
"epoch": 2.7715308485997054,
"grad_norm": 0.1347307304770039,
"learning_rate": 3.933963514876657e-05,
"loss": 0.3872,
"step": 821
},
{
"epoch": 2.774899978942935,
"grad_norm": 0.14708015270111557,
"learning_rate": 3.9245307808621325e-05,
"loss": 0.385,
"step": 822
},
{
"epoch": 2.7782691092861658,
"grad_norm": 0.12260128131766068,
"learning_rate": 3.915098466665506e-05,
"loss": 0.3855,
"step": 823
},
{
"epoch": 2.7816382396293955,
"grad_norm": 0.12292579106408079,
"learning_rate": 3.905666624756573e-05,
"loss": 0.3869,
"step": 824
},
{
"epoch": 2.7850073699726257,
"grad_norm": 0.14492807851132256,
"learning_rate": 3.8962353076024984e-05,
"loss": 0.3821,
"step": 825
},
{
"epoch": 2.788376500315856,
"grad_norm": 0.14449210295060477,
"learning_rate": 3.886804567667528e-05,
"loss": 0.3808,
"step": 826
},
{
"epoch": 2.791745630659086,
"grad_norm": 0.12971902156372891,
"learning_rate": 3.8773744574127e-05,
"loss": 0.3878,
"step": 827
},
{
"epoch": 2.795114761002316,
"grad_norm": 0.14230416274316593,
"learning_rate": 3.867945029295546e-05,
"loss": 0.3814,
"step": 828
},
{
"epoch": 2.7984838913455463,
"grad_norm": 0.1224339186137515,
"learning_rate": 3.858516335769806e-05,
"loss": 0.3819,
"step": 829
},
{
"epoch": 2.8018530216887765,
"grad_norm": 0.16733669157218356,
"learning_rate": 3.8490884292851325e-05,
"loss": 0.3825,
"step": 830
},
{
"epoch": 2.8052221520320066,
"grad_norm": 0.13398557625334945,
"learning_rate": 3.839661362286797e-05,
"loss": 0.3785,
"step": 831
},
{
"epoch": 2.808591282375237,
"grad_norm": 0.14930405489150408,
"learning_rate": 3.830235187215408e-05,
"loss": 0.3806,
"step": 832
},
{
"epoch": 2.811960412718467,
"grad_norm": 0.14534442897149916,
"learning_rate": 3.820809956506604e-05,
"loss": 0.3869,
"step": 833
},
{
"epoch": 2.815329543061697,
"grad_norm": 0.14294161233646072,
"learning_rate": 3.8113857225907783e-05,
"loss": 0.3834,
"step": 834
},
{
"epoch": 2.8186986734049273,
"grad_norm": 0.1304818403113972,
"learning_rate": 3.801962537892773e-05,
"loss": 0.3917,
"step": 835
},
{
"epoch": 2.8220678037481575,
"grad_norm": 0.16153213081562928,
"learning_rate": 3.792540454831596e-05,
"loss": 0.3877,
"step": 836
},
{
"epoch": 2.8254369340913876,
"grad_norm": 0.12199316427929723,
"learning_rate": 3.7831195258201295e-05,
"loss": 0.3836,
"step": 837
},
{
"epoch": 2.828806064434618,
"grad_norm": 0.14527010576989632,
"learning_rate": 3.7736998032648305e-05,
"loss": 0.3827,
"step": 838
},
{
"epoch": 2.832175194777848,
"grad_norm": 0.15971096124557288,
"learning_rate": 3.7642813395654504e-05,
"loss": 0.3801,
"step": 839
},
{
"epoch": 2.835544325121078,
"grad_norm": 0.12345484787366505,
"learning_rate": 3.754864187114736e-05,
"loss": 0.3855,
"step": 840
},
{
"epoch": 2.8389134554643083,
"grad_norm": 0.13837193216510435,
"learning_rate": 3.745448398298135e-05,
"loss": 0.3828,
"step": 841
},
{
"epoch": 2.8422825858075385,
"grad_norm": 0.1545419687841436,
"learning_rate": 3.736034025493519e-05,
"loss": 0.3821,
"step": 842
},
{
"epoch": 2.8456517161507686,
"grad_norm": 0.12965815907805744,
"learning_rate": 3.726621121070873e-05,
"loss": 0.3885,
"step": 843
},
{
"epoch": 2.849020846493999,
"grad_norm": 0.14437205080738458,
"learning_rate": 3.717209737392022e-05,
"loss": 0.3757,
"step": 844
},
{
"epoch": 2.852389976837229,
"grad_norm": 0.13760242198629977,
"learning_rate": 3.707799926810326e-05,
"loss": 0.3841,
"step": 845
},
{
"epoch": 2.855759107180459,
"grad_norm": 0.16923959033588096,
"learning_rate": 3.698391741670394e-05,
"loss": 0.3837,
"step": 846
},
{
"epoch": 2.8591282375236893,
"grad_norm": 0.1483758913428858,
"learning_rate": 3.688985234307798e-05,
"loss": 0.3854,
"step": 847
},
{
"epoch": 2.8624973678669194,
"grad_norm": 0.1409446277936609,
"learning_rate": 3.679580457048772e-05,
"loss": 0.3865,
"step": 848
},
{
"epoch": 2.8658664982101496,
"grad_norm": 0.13848959127311186,
"learning_rate": 3.6701774622099286e-05,
"loss": 0.3847,
"step": 849
},
{
"epoch": 2.8692356285533798,
"grad_norm": 0.13440901679008035,
"learning_rate": 3.660776302097965e-05,
"loss": 0.3809,
"step": 850
},
{
"epoch": 2.87260475889661,
"grad_norm": 0.13528288220600784,
"learning_rate": 3.6513770290093674e-05,
"loss": 0.3844,
"step": 851
},
{
"epoch": 2.87597388923984,
"grad_norm": 0.11930769920642463,
"learning_rate": 3.641979695230135e-05,
"loss": 0.3853,
"step": 852
},
{
"epoch": 2.8793430195830703,
"grad_norm": 0.1302640412084013,
"learning_rate": 3.632584353035467e-05,
"loss": 0.3834,
"step": 853
},
{
"epoch": 2.8827121499263004,
"grad_norm": 0.12093299855424389,
"learning_rate": 3.6231910546894956e-05,
"loss": 0.3851,
"step": 854
},
{
"epoch": 2.8860812802695306,
"grad_norm": 0.1342477899550942,
"learning_rate": 3.613799852444975e-05,
"loss": 0.3883,
"step": 855
},
{
"epoch": 2.8894504106127608,
"grad_norm": 0.11778883888529185,
"learning_rate": 3.6044107985430015e-05,
"loss": 0.3823,
"step": 856
},
{
"epoch": 2.8928195409559905,
"grad_norm": 0.12271043639462616,
"learning_rate": 3.595023945212723e-05,
"loss": 0.3816,
"step": 857
},
{
"epoch": 2.896188671299221,
"grad_norm": 0.12188701757865371,
"learning_rate": 3.585639344671043e-05,
"loss": 0.3863,
"step": 858
},
{
"epoch": 2.899557801642451,
"grad_norm": 0.12511895990769892,
"learning_rate": 3.576257049122336e-05,
"loss": 0.3829,
"step": 859
},
{
"epoch": 2.9029269319856814,
"grad_norm": 0.12002503720509249,
"learning_rate": 3.5668771107581526e-05,
"loss": 0.377,
"step": 860
},
{
"epoch": 2.906296062328911,
"grad_norm": 0.12993074211163566,
"learning_rate": 3.5574995817569317e-05,
"loss": 0.3755,
"step": 861
},
{
"epoch": 2.9096651926721417,
"grad_norm": 0.10532634808065627,
"learning_rate": 3.5481245142837095e-05,
"loss": 0.3869,
"step": 862
},
{
"epoch": 2.9130343230153715,
"grad_norm": 0.1296191433786778,
"learning_rate": 3.5387519604898264e-05,
"loss": 0.382,
"step": 863
},
{
"epoch": 2.916403453358602,
"grad_norm": 0.10734185230078218,
"learning_rate": 3.5293819725126464e-05,
"loss": 0.3849,
"step": 864
},
{
"epoch": 2.919772583701832,
"grad_norm": 0.1077939586524133,
"learning_rate": 3.520014602475252e-05,
"loss": 0.3828,
"step": 865
},
{
"epoch": 2.923141714045062,
"grad_norm": 0.12191898052299041,
"learning_rate": 3.5106499024861715e-05,
"loss": 0.3809,
"step": 866
},
{
"epoch": 2.926510844388292,
"grad_norm": 0.12081068176606237,
"learning_rate": 3.501287924639074e-05,
"loss": 0.3892,
"step": 867
},
{
"epoch": 2.9298799747315223,
"grad_norm": 0.13361270574401832,
"learning_rate": 3.491928721012485e-05,
"loss": 0.3818,
"step": 868
},
{
"epoch": 2.9332491050747524,
"grad_norm": 0.12126810590661805,
"learning_rate": 3.482572343669506e-05,
"loss": 0.3834,
"step": 869
},
{
"epoch": 2.9366182354179826,
"grad_norm": 0.1258581729968798,
"learning_rate": 3.4732188446575055e-05,
"loss": 0.3822,
"step": 870
},
{
"epoch": 2.939987365761213,
"grad_norm": 0.11858345315742196,
"learning_rate": 3.4638682760078505e-05,
"loss": 0.3922,
"step": 871
},
{
"epoch": 2.943356496104443,
"grad_norm": 0.11372309799338015,
"learning_rate": 3.454520689735602e-05,
"loss": 0.3824,
"step": 872
},
{
"epoch": 2.946725626447673,
"grad_norm": 0.14113850726940133,
"learning_rate": 3.445176137839227e-05,
"loss": 0.3796,
"step": 873
},
{
"epoch": 2.9500947567909033,
"grad_norm": 0.11612037625898579,
"learning_rate": 3.435834672300324e-05,
"loss": 0.3873,
"step": 874
},
{
"epoch": 2.9534638871341334,
"grad_norm": 0.12263857158882245,
"learning_rate": 3.426496345083309e-05,
"loss": 0.3807,
"step": 875
},
{
"epoch": 2.9568330174773636,
"grad_norm": 0.13787793243918434,
"learning_rate": 3.417161208135155e-05,
"loss": 0.3865,
"step": 876
},
{
"epoch": 2.9602021478205938,
"grad_norm": 0.12537808395950803,
"learning_rate": 3.407829313385075e-05,
"loss": 0.3887,
"step": 877
},
{
"epoch": 2.963571278163824,
"grad_norm": 0.1233586121783003,
"learning_rate": 3.398500712744254e-05,
"loss": 0.3831,
"step": 878
},
{
"epoch": 2.966940408507054,
"grad_norm": 0.127510517027595,
"learning_rate": 3.38917545810555e-05,
"loss": 0.3855,
"step": 879
},
{
"epoch": 2.9703095388502843,
"grad_norm": 0.12958054002462321,
"learning_rate": 3.379853601343209e-05,
"loss": 0.3867,
"step": 880
},
{
"epoch": 2.9736786691935144,
"grad_norm": 0.11339310625974686,
"learning_rate": 3.3705351943125755e-05,
"loss": 0.381,
"step": 881
},
{
"epoch": 2.9770477995367446,
"grad_norm": 0.1441132631100554,
"learning_rate": 3.361220288849804e-05,
"loss": 0.3853,
"step": 882
},
{
"epoch": 2.9804169298799748,
"grad_norm": 0.12590761879480403,
"learning_rate": 3.351908936771566e-05,
"loss": 0.3821,
"step": 883
},
{
"epoch": 2.983786060223205,
"grad_norm": 0.12580062137496578,
"learning_rate": 3.342601189874777e-05,
"loss": 0.3912,
"step": 884
},
{
"epoch": 2.987155190566435,
"grad_norm": 0.1375861040816144,
"learning_rate": 3.3332970999362836e-05,
"loss": 0.3843,
"step": 885
},
{
"epoch": 2.9905243209096652,
"grad_norm": 0.11745115999108842,
"learning_rate": 3.323996718712605e-05,
"loss": 0.3793,
"step": 886
},
{
"epoch": 2.9938934512528954,
"grad_norm": 0.1154957553487754,
"learning_rate": 3.3147000979396156e-05,
"loss": 0.386,
"step": 887
},
{
"epoch": 2.9972625815961256,
"grad_norm": 0.14419491852541183,
"learning_rate": 3.305407289332279e-05,
"loss": 0.387,
"step": 888
},
{
"epoch": 3.00336913034323,
"grad_norm": 0.17453356323499444,
"learning_rate": 3.296118344584352e-05,
"loss": 0.3658,
"step": 889
},
{
"epoch": 3.0067382606864603,
"grad_norm": 0.19958889229278365,
"learning_rate": 3.2868333153680964e-05,
"loss": 0.3563,
"step": 890
},
{
"epoch": 3.0101073910296905,
"grad_norm": 0.14823110731719627,
"learning_rate": 3.277552253333993e-05,
"loss": 0.3592,
"step": 891
},
{
"epoch": 3.0134765213729207,
"grad_norm": 0.15078557386759514,
"learning_rate": 3.2682752101104536e-05,
"loss": 0.3648,
"step": 892
},
{
"epoch": 3.016845651716151,
"grad_norm": 0.15261085897213972,
"learning_rate": 3.259002237303535e-05,
"loss": 0.365,
"step": 893
},
{
"epoch": 3.020214782059381,
"grad_norm": 0.12773087702299238,
"learning_rate": 3.249733386496653e-05,
"loss": 0.359,
"step": 894
},
{
"epoch": 3.023583912402611,
"grad_norm": 0.13787164527794113,
"learning_rate": 3.2404687092502865e-05,
"loss": 0.361,
"step": 895
},
{
"epoch": 3.0269530427458413,
"grad_norm": 0.15396809320630023,
"learning_rate": 3.231208257101709e-05,
"loss": 0.3639,
"step": 896
},
{
"epoch": 3.0303221730890715,
"grad_norm": 0.13565056548247828,
"learning_rate": 3.221952081564682e-05,
"loss": 0.3632,
"step": 897
},
{
"epoch": 3.0336913034323016,
"grad_norm": 0.16070873287428322,
"learning_rate": 3.212700234129179e-05,
"loss": 0.3594,
"step": 898
},
{
"epoch": 3.037060433775532,
"grad_norm": 0.14022297658804933,
"learning_rate": 3.2034527662611e-05,
"loss": 0.363,
"step": 899
},
{
"epoch": 3.040429564118762,
"grad_norm": 0.14407779140042834,
"learning_rate": 3.194209729401979e-05,
"loss": 0.3612,
"step": 900
},
{
"epoch": 3.043798694461992,
"grad_norm": 0.13752049086764745,
"learning_rate": 3.184971174968705e-05,
"loss": 0.3645,
"step": 901
},
{
"epoch": 3.0471678248052223,
"grad_norm": 0.13823706755645496,
"learning_rate": 3.175737154353231e-05,
"loss": 0.3626,
"step": 902
},
{
"epoch": 3.0505369551484525,
"grad_norm": 0.16264110826907188,
"learning_rate": 3.166507718922285e-05,
"loss": 0.3566,
"step": 903
},
{
"epoch": 3.0539060854916826,
"grad_norm": 0.15511577954565434,
"learning_rate": 3.157282920017096e-05,
"loss": 0.361,
"step": 904
},
{
"epoch": 3.057275215834913,
"grad_norm": 0.15232517037403773,
"learning_rate": 3.1480628089530943e-05,
"loss": 0.3662,
"step": 905
},
{
"epoch": 3.060644346178143,
"grad_norm": 0.17112367414740937,
"learning_rate": 3.1388474370196395e-05,
"loss": 0.3638,
"step": 906
},
{
"epoch": 3.064013476521373,
"grad_norm": 0.12748688705449465,
"learning_rate": 3.129636855479723e-05,
"loss": 0.3579,
"step": 907
},
{
"epoch": 3.0673826068646033,
"grad_norm": 0.14714355107055627,
"learning_rate": 3.12043111556969e-05,
"loss": 0.3582,
"step": 908
},
{
"epoch": 3.070751737207833,
"grad_norm": 0.13462631797401237,
"learning_rate": 3.111230268498954e-05,
"loss": 0.367,
"step": 909
},
{
"epoch": 3.074120867551063,
"grad_norm": 0.1372418048121636,
"learning_rate": 3.1020343654497096e-05,
"loss": 0.3588,
"step": 910
},
{
"epoch": 3.0774899978942933,
"grad_norm": 0.13072048530956415,
"learning_rate": 3.0928434575766505e-05,
"loss": 0.361,
"step": 911
},
{
"epoch": 3.0808591282375235,
"grad_norm": 0.12852995212281998,
"learning_rate": 3.083657596006681e-05,
"loss": 0.3543,
"step": 912
},
{
"epoch": 3.0842282585807537,
"grad_norm": 0.12589969103284174,
"learning_rate": 3.0744768318386346e-05,
"loss": 0.3573,
"step": 913
},
{
"epoch": 3.087597388923984,
"grad_norm": 0.1042227599830766,
"learning_rate": 3.065301216142991e-05,
"loss": 0.3571,
"step": 914
},
{
"epoch": 3.090966519267214,
"grad_norm": 0.12641784465437736,
"learning_rate": 3.056130799961587e-05,
"loss": 0.361,
"step": 915
},
{
"epoch": 3.094335649610444,
"grad_norm": 0.1189011090318916,
"learning_rate": 3.046965634307341e-05,
"loss": 0.3653,
"step": 916
},
{
"epoch": 3.0977047799536743,
"grad_norm": 0.11559017960748716,
"learning_rate": 3.0378057701639575e-05,
"loss": 0.371,
"step": 917
},
{
"epoch": 3.1010739102969045,
"grad_norm": 0.1198695027252497,
"learning_rate": 3.028651258485652e-05,
"loss": 0.3667,
"step": 918
},
{
"epoch": 3.1044430406401347,
"grad_norm": 0.11196979369755074,
"learning_rate": 3.019502150196869e-05,
"loss": 0.3575,
"step": 919
},
{
"epoch": 3.107812170983365,
"grad_norm": 0.12042692289106809,
"learning_rate": 3.010358496191991e-05,
"loss": 0.3618,
"step": 920
},
{
"epoch": 3.111181301326595,
"grad_norm": 0.1238521643735063,
"learning_rate": 3.0012203473350616e-05,
"loss": 0.3672,
"step": 921
},
{
"epoch": 3.114550431669825,
"grad_norm": 0.11597672612469004,
"learning_rate": 2.9920877544595002e-05,
"loss": 0.3577,
"step": 922
},
{
"epoch": 3.1179195620130553,
"grad_norm": 0.11363631100554263,
"learning_rate": 2.982960768367818e-05,
"loss": 0.3637,
"step": 923
},
{
"epoch": 3.1212886923562855,
"grad_norm": 0.12223781700368476,
"learning_rate": 2.9738394398313405e-05,
"loss": 0.3575,
"step": 924
},
{
"epoch": 3.1246578226995156,
"grad_norm": 0.11310391813366659,
"learning_rate": 2.9647238195899168e-05,
"loss": 0.3666,
"step": 925
},
{
"epoch": 3.128026953042746,
"grad_norm": 0.12749001851980382,
"learning_rate": 2.955613958351647e-05,
"loss": 0.3577,
"step": 926
},
{
"epoch": 3.131396083385976,
"grad_norm": 0.11106465012495607,
"learning_rate": 2.946509906792593e-05,
"loss": 0.3661,
"step": 927
},
{
"epoch": 3.134765213729206,
"grad_norm": 0.13265615613597764,
"learning_rate": 2.9374117155564957e-05,
"loss": 0.3613,
"step": 928
},
{
"epoch": 3.1381343440724363,
"grad_norm": 0.1062334645184232,
"learning_rate": 2.928319435254501e-05,
"loss": 0.3601,
"step": 929
},
{
"epoch": 3.1415034744156665,
"grad_norm": 0.13654759521524176,
"learning_rate": 2.919233116464872e-05,
"loss": 0.357,
"step": 930
},
{
"epoch": 3.1448726047588966,
"grad_norm": 0.12274484555896063,
"learning_rate": 2.9101528097327093e-05,
"loss": 0.3659,
"step": 931
},
{
"epoch": 3.148241735102127,
"grad_norm": 0.11432950773248603,
"learning_rate": 2.9010785655696698e-05,
"loss": 0.3638,
"step": 932
},
{
"epoch": 3.151610865445357,
"grad_norm": 0.11354842248203202,
"learning_rate": 2.892010434453684e-05,
"loss": 0.36,
"step": 933
},
{
"epoch": 3.154979995788587,
"grad_norm": 0.12098639250864718,
"learning_rate": 2.88294846682868e-05,
"loss": 0.3591,
"step": 934
},
{
"epoch": 3.1583491261318173,
"grad_norm": 0.11027079481756498,
"learning_rate": 2.873892713104298e-05,
"loss": 0.3595,
"step": 935
},
{
"epoch": 3.1617182564750475,
"grad_norm": 0.12568594872253705,
"learning_rate": 2.864843223655613e-05,
"loss": 0.3678,
"step": 936
},
{
"epoch": 3.1650873868182776,
"grad_norm": 0.11667961571614835,
"learning_rate": 2.855800048822852e-05,
"loss": 0.3608,
"step": 937
},
{
"epoch": 3.168456517161508,
"grad_norm": 0.11058294572640527,
"learning_rate": 2.8467632389111126e-05,
"loss": 0.3683,
"step": 938
},
{
"epoch": 3.171825647504738,
"grad_norm": 0.1187950796415824,
"learning_rate": 2.837732844190094e-05,
"loss": 0.3644,
"step": 939
},
{
"epoch": 3.175194777847968,
"grad_norm": 0.10656085663558766,
"learning_rate": 2.828708914893799e-05,
"loss": 0.3671,
"step": 940
},
{
"epoch": 3.1785639081911983,
"grad_norm": 0.10817099139196962,
"learning_rate": 2.8196915012202728e-05,
"loss": 0.3672,
"step": 941
},
{
"epoch": 3.1819330385344284,
"grad_norm": 0.10075876050195509,
"learning_rate": 2.8106806533313106e-05,
"loss": 0.3631,
"step": 942
},
{
"epoch": 3.1853021688776586,
"grad_norm": 0.11551691063136907,
"learning_rate": 2.8016764213521875e-05,
"loss": 0.3608,
"step": 943
},
{
"epoch": 3.1886712992208888,
"grad_norm": 0.10092150997385874,
"learning_rate": 2.7926788553713734e-05,
"loss": 0.3652,
"step": 944
},
{
"epoch": 3.192040429564119,
"grad_norm": 0.11020311291162539,
"learning_rate": 2.783688005440256e-05,
"loss": 0.3656,
"step": 945
},
{
"epoch": 3.195409559907349,
"grad_norm": 0.10850184905841719,
"learning_rate": 2.7747039215728667e-05,
"loss": 0.3648,
"step": 946
},
{
"epoch": 3.1987786902505793,
"grad_norm": 0.10954311066114457,
"learning_rate": 2.7657266537455938e-05,
"loss": 0.3651,
"step": 947
},
{
"epoch": 3.2021478205938094,
"grad_norm": 0.10365234676829252,
"learning_rate": 2.7567562518969155e-05,
"loss": 0.3533,
"step": 948
},
{
"epoch": 3.2055169509370396,
"grad_norm": 0.10204242463146666,
"learning_rate": 2.7477927659271117e-05,
"loss": 0.3622,
"step": 949
},
{
"epoch": 3.2088860812802693,
"grad_norm": 0.10799341793471445,
"learning_rate": 2.7388362456979906e-05,
"loss": 0.3625,
"step": 950
},
{
"epoch": 3.2122552116235,
"grad_norm": 0.11115544373524708,
"learning_rate": 2.7298867410326155e-05,
"loss": 0.3629,
"step": 951
},
{
"epoch": 3.2156243419667296,
"grad_norm": 0.10949003369065348,
"learning_rate": 2.7209443017150193e-05,
"loss": 0.3635,
"step": 952
},
{
"epoch": 3.21899347230996,
"grad_norm": 0.10963161775177817,
"learning_rate": 2.712008977489936e-05,
"loss": 0.3594,
"step": 953
},
{
"epoch": 3.22236260265319,
"grad_norm": 0.11805544027584379,
"learning_rate": 2.703080818062517e-05,
"loss": 0.3635,
"step": 954
},
{
"epoch": 3.22573173299642,
"grad_norm": 0.10196046146217858,
"learning_rate": 2.694159873098058e-05,
"loss": 0.3626,
"step": 955
},
{
"epoch": 3.2291008633396503,
"grad_norm": 0.1120026689331707,
"learning_rate": 2.6852461922217253e-05,
"loss": 0.3649,
"step": 956
},
{
"epoch": 3.2324699936828805,
"grad_norm": 0.10926346301227147,
"learning_rate": 2.6763398250182714e-05,
"loss": 0.3579,
"step": 957
},
{
"epoch": 3.2358391240261106,
"grad_norm": 0.10913175373351278,
"learning_rate": 2.66744082103177e-05,
"loss": 0.3639,
"step": 958
},
{
"epoch": 3.239208254369341,
"grad_norm": 0.10485736112258066,
"learning_rate": 2.658549229765332e-05,
"loss": 0.3592,
"step": 959
},
{
"epoch": 3.242577384712571,
"grad_norm": 0.12101416878728995,
"learning_rate": 2.6496651006808308e-05,
"loss": 0.3574,
"step": 960
},
{
"epoch": 3.245946515055801,
"grad_norm": 0.1071236277697119,
"learning_rate": 2.6407884831986367e-05,
"loss": 0.3627,
"step": 961
},
{
"epoch": 3.2493156453990313,
"grad_norm": 0.11778875174805165,
"learning_rate": 2.6319194266973256e-05,
"loss": 0.365,
"step": 962
},
{
"epoch": 3.2526847757422614,
"grad_norm": 0.12437906053481307,
"learning_rate": 2.6230579805134203e-05,
"loss": 0.3582,
"step": 963
},
{
"epoch": 3.2560539060854916,
"grad_norm": 0.11016391828701566,
"learning_rate": 2.614204193941107e-05,
"loss": 0.3628,
"step": 964
},
{
"epoch": 3.2594230364287218,
"grad_norm": 0.131288542140626,
"learning_rate": 2.6053581162319606e-05,
"loss": 0.3634,
"step": 965
},
{
"epoch": 3.262792166771952,
"grad_norm": 0.10515921544500577,
"learning_rate": 2.5965197965946783e-05,
"loss": 0.3649,
"step": 966
},
{
"epoch": 3.266161297115182,
"grad_norm": 0.12739731098762894,
"learning_rate": 2.587689284194797e-05,
"loss": 0.3703,
"step": 967
},
{
"epoch": 3.2695304274584123,
"grad_norm": 0.10406377203116793,
"learning_rate": 2.5788666281544258e-05,
"loss": 0.3657,
"step": 968
},
{
"epoch": 3.2728995578016424,
"grad_norm": 0.11191689402983139,
"learning_rate": 2.5700518775519702e-05,
"loss": 0.359,
"step": 969
},
{
"epoch": 3.2762686881448726,
"grad_norm": 0.10680144927044027,
"learning_rate": 2.561245081421857e-05,
"loss": 0.3604,
"step": 970
},
{
"epoch": 3.2796378184881028,
"grad_norm": 0.11505057898142523,
"learning_rate": 2.5524462887542703e-05,
"loss": 0.3599,
"step": 971
},
{
"epoch": 3.283006948831333,
"grad_norm": 0.10674300454641518,
"learning_rate": 2.5436555484948643e-05,
"loss": 0.3625,
"step": 972
},
{
"epoch": 3.286376079174563,
"grad_norm": 0.10772282874724956,
"learning_rate": 2.534872909544509e-05,
"loss": 0.3586,
"step": 973
},
{
"epoch": 3.2897452095177933,
"grad_norm": 0.11061913724144044,
"learning_rate": 2.5260984207590015e-05,
"loss": 0.3695,
"step": 974
},
{
"epoch": 3.2931143398610234,
"grad_norm": 0.11314868048581533,
"learning_rate": 2.517332130948802e-05,
"loss": 0.3597,
"step": 975
},
{
"epoch": 3.2964834702042536,
"grad_norm": 0.10483488263899578,
"learning_rate": 2.5085740888787662e-05,
"loss": 0.3583,
"step": 976
},
{
"epoch": 3.2998526005474837,
"grad_norm": 0.10912778564330813,
"learning_rate": 2.4998243432678644e-05,
"loss": 0.3601,
"step": 977
},
{
"epoch": 3.303221730890714,
"grad_norm": 0.11466754101476578,
"learning_rate": 2.4910829427889205e-05,
"loss": 0.3643,
"step": 978
},
{
"epoch": 3.306590861233944,
"grad_norm": 0.10733537636590312,
"learning_rate": 2.4823499360683333e-05,
"loss": 0.3651,
"step": 979
},
{
"epoch": 3.3099599915771742,
"grad_norm": 0.1161393261879057,
"learning_rate": 2.473625371685806e-05,
"loss": 0.3599,
"step": 980
},
{
"epoch": 3.3133291219204044,
"grad_norm": 0.0982571093572832,
"learning_rate": 2.464909298174088e-05,
"loss": 0.3526,
"step": 981
},
{
"epoch": 3.3166982522636346,
"grad_norm": 0.1100159657444912,
"learning_rate": 2.4562017640186847e-05,
"loss": 0.3626,
"step": 982
},
{
"epoch": 3.3200673826068647,
"grad_norm": 0.09926349760672294,
"learning_rate": 2.4475028176576102e-05,
"loss": 0.3677,
"step": 983
},
{
"epoch": 3.323436512950095,
"grad_norm": 0.12050759797842048,
"learning_rate": 2.4388125074810986e-05,
"loss": 0.359,
"step": 984
},
{
"epoch": 3.326805643293325,
"grad_norm": 0.09987805749588798,
"learning_rate": 2.430130881831345e-05,
"loss": 0.3618,
"step": 985
},
{
"epoch": 3.3301747736365552,
"grad_norm": 0.1091783241310202,
"learning_rate": 2.4214579890022373e-05,
"loss": 0.3696,
"step": 986
},
{
"epoch": 3.3335439039797854,
"grad_norm": 0.10898707191962656,
"learning_rate": 2.41279387723908e-05,
"loss": 0.3638,
"step": 987
},
{
"epoch": 3.3369130343230156,
"grad_norm": 0.10558034784682291,
"learning_rate": 2.404138594738335e-05,
"loss": 0.357,
"step": 988
},
{
"epoch": 3.3402821646662453,
"grad_norm": 0.10689449489731055,
"learning_rate": 2.395492189647347e-05,
"loss": 0.3594,
"step": 989
},
{
"epoch": 3.343651295009476,
"grad_norm": 0.11118497131539316,
"learning_rate": 2.386854710064075e-05,
"loss": 0.3542,
"step": 990
},
{
"epoch": 3.3470204253527056,
"grad_norm": 0.10782085280238568,
"learning_rate": 2.3782262040368344e-05,
"loss": 0.3608,
"step": 991
},
{
"epoch": 3.350389555695936,
"grad_norm": 0.10697566924440428,
"learning_rate": 2.369606719564015e-05,
"loss": 0.3551,
"step": 992
},
{
"epoch": 3.353758686039166,
"grad_norm": 0.09605638199170409,
"learning_rate": 2.3609963045938288e-05,
"loss": 0.3618,
"step": 993
},
{
"epoch": 3.357127816382396,
"grad_norm": 0.10827169360976367,
"learning_rate": 2.35239500702403e-05,
"loss": 0.3565,
"step": 994
},
{
"epoch": 3.3604969467256263,
"grad_norm": 0.10198375263244171,
"learning_rate": 2.3438028747016586e-05,
"loss": 0.3626,
"step": 995
},
{
"epoch": 3.3638660770688564,
"grad_norm": 0.1159958447674676,
"learning_rate": 2.3352199554227698e-05,
"loss": 0.3629,
"step": 996
},
{
"epoch": 3.3672352074120866,
"grad_norm": 0.10457139377595129,
"learning_rate": 2.326646296932168e-05,
"loss": 0.3638,
"step": 997
},
{
"epoch": 3.3706043377553168,
"grad_norm": 0.10333006497152411,
"learning_rate": 2.318081946923144e-05,
"loss": 0.3612,
"step": 998
},
{
"epoch": 3.373973468098547,
"grad_norm": 0.10461115888151253,
"learning_rate": 2.3095269530372032e-05,
"loss": 0.362,
"step": 999
},
{
"epoch": 3.377342598441777,
"grad_norm": 0.10087292499347122,
"learning_rate": 2.3009813628638085e-05,
"loss": 0.3603,
"step": 1000
},
{
"epoch": 3.3807117287850073,
"grad_norm": 0.09894098741998586,
"learning_rate": 2.2924452239401153e-05,
"loss": 0.3635,
"step": 1001
},
{
"epoch": 3.3840808591282374,
"grad_norm": 0.10636129988239897,
"learning_rate": 2.283918583750695e-05,
"loss": 0.3589,
"step": 1002
},
{
"epoch": 3.3874499894714676,
"grad_norm": 0.1087735124770059,
"learning_rate": 2.2754014897272868e-05,
"loss": 0.3603,
"step": 1003
},
{
"epoch": 3.3908191198146977,
"grad_norm": 0.1045786633320159,
"learning_rate": 2.266893989248527e-05,
"loss": 0.3634,
"step": 1004
},
{
"epoch": 3.394188250157928,
"grad_norm": 0.10630134016191294,
"learning_rate": 2.258396129639679e-05,
"loss": 0.3626,
"step": 1005
},
{
"epoch": 3.397557380501158,
"grad_norm": 0.10814614823364664,
"learning_rate": 2.2499079581723846e-05,
"loss": 0.3682,
"step": 1006
},
{
"epoch": 3.4009265108443882,
"grad_norm": 0.10249665362012134,
"learning_rate": 2.2414295220643822e-05,
"loss": 0.361,
"step": 1007
},
{
"epoch": 3.4042956411876184,
"grad_norm": 0.10378027402659071,
"learning_rate": 2.2329608684792676e-05,
"loss": 0.3606,
"step": 1008
},
{
"epoch": 3.4076647715308486,
"grad_norm": 0.10027376191210695,
"learning_rate": 2.22450204452621e-05,
"loss": 0.3608,
"step": 1009
},
{
"epoch": 3.4110339018740787,
"grad_norm": 0.10689722485945972,
"learning_rate": 2.216053097259697e-05,
"loss": 0.3706,
"step": 1010
},
{
"epoch": 3.414403032217309,
"grad_norm": 0.10357939152860053,
"learning_rate": 2.2076140736792805e-05,
"loss": 0.3623,
"step": 1011
},
{
"epoch": 3.417772162560539,
"grad_norm": 0.0902315706129379,
"learning_rate": 2.1991850207293064e-05,
"loss": 0.3596,
"step": 1012
},
{
"epoch": 3.4211412929037692,
"grad_norm": 0.10842563552035595,
"learning_rate": 2.1907659852986588e-05,
"loss": 0.3637,
"step": 1013
},
{
"epoch": 3.4245104232469994,
"grad_norm": 0.09666903812158173,
"learning_rate": 2.1823570142204902e-05,
"loss": 0.3624,
"step": 1014
},
{
"epoch": 3.4278795535902296,
"grad_norm": 0.100083090000888,
"learning_rate": 2.1739581542719748e-05,
"loss": 0.3624,
"step": 1015
},
{
"epoch": 3.4312486839334597,
"grad_norm": 0.10755809720758686,
"learning_rate": 2.1655694521740376e-05,
"loss": 0.3624,
"step": 1016
},
{
"epoch": 3.43461781427669,
"grad_norm": 0.1024231010803628,
"learning_rate": 2.1571909545910953e-05,
"loss": 0.3621,
"step": 1017
},
{
"epoch": 3.43798694461992,
"grad_norm": 0.10562299735859218,
"learning_rate": 2.1488227081308054e-05,
"loss": 0.3626,
"step": 1018
},
{
"epoch": 3.44135607496315,
"grad_norm": 0.0993759886031881,
"learning_rate": 2.140464759343794e-05,
"loss": 0.3654,
"step": 1019
},
{
"epoch": 3.4447252053063804,
"grad_norm": 0.09933521966725083,
"learning_rate": 2.132117154723408e-05,
"loss": 0.356,
"step": 1020
},
{
"epoch": 3.4480943356496105,
"grad_norm": 0.09953034686873165,
"learning_rate": 2.123779940705453e-05,
"loss": 0.366,
"step": 1021
},
{
"epoch": 3.4514634659928407,
"grad_norm": 0.10175075266526791,
"learning_rate": 2.115453163667929e-05,
"loss": 0.3583,
"step": 1022
},
{
"epoch": 3.454832596336071,
"grad_norm": 0.09594990983302608,
"learning_rate": 2.1071368699307818e-05,
"loss": 0.3584,
"step": 1023
},
{
"epoch": 3.458201726679301,
"grad_norm": 0.10219150476255269,
"learning_rate": 2.0988311057556397e-05,
"loss": 0.3597,
"step": 1024
},
{
"epoch": 3.461570857022531,
"grad_norm": 0.09691112693809913,
"learning_rate": 2.0905359173455593e-05,
"loss": 0.3621,
"step": 1025
},
{
"epoch": 3.4649399873657614,
"grad_norm": 0.09661009238935536,
"learning_rate": 2.0822513508447608e-05,
"loss": 0.3567,
"step": 1026
},
{
"epoch": 3.4683091177089915,
"grad_norm": 0.09590582546596066,
"learning_rate": 2.073977452338384e-05,
"loss": 0.3646,
"step": 1027
},
{
"epoch": 3.4716782480522217,
"grad_norm": 0.09606588648905236,
"learning_rate": 2.065714267852223e-05,
"loss": 0.3641,
"step": 1028
},
{
"epoch": 3.475047378395452,
"grad_norm": 0.10295819559523817,
"learning_rate": 2.057461843352469e-05,
"loss": 0.3557,
"step": 1029
},
{
"epoch": 3.4784165087386816,
"grad_norm": 0.09150758299366415,
"learning_rate": 2.049220224745463e-05,
"loss": 0.3636,
"step": 1030
},
{
"epoch": 3.481785639081912,
"grad_norm": 0.10198222794968945,
"learning_rate": 2.0409894578774302e-05,
"loss": 0.3642,
"step": 1031
},
{
"epoch": 3.485154769425142,
"grad_norm": 0.09986839616807734,
"learning_rate": 2.032769588534233e-05,
"loss": 0.3673,
"step": 1032
},
{
"epoch": 3.4885238997683725,
"grad_norm": 0.09939024454656914,
"learning_rate": 2.0245606624411165e-05,
"loss": 0.3591,
"step": 1033
},
{
"epoch": 3.4918930301116022,
"grad_norm": 0.09144469769761462,
"learning_rate": 2.0163627252624427e-05,
"loss": 0.3683,
"step": 1034
},
{
"epoch": 3.495262160454833,
"grad_norm": 0.09038126728850328,
"learning_rate": 2.0081758226014516e-05,
"loss": 0.3585,
"step": 1035
},
{
"epoch": 3.4986312907980626,
"grad_norm": 0.09791848188862595,
"learning_rate": 2.0000000000000012e-05,
"loss": 0.3633,
"step": 1036
},
{
"epoch": 3.502000421141293,
"grad_norm": 0.09451335114654308,
"learning_rate": 1.9918353029383065e-05,
"loss": 0.3563,
"step": 1037
},
{
"epoch": 3.505369551484523,
"grad_norm": 0.09766694825632033,
"learning_rate": 1.9836817768347015e-05,
"loss": 0.3634,
"step": 1038
},
{
"epoch": 3.508738681827753,
"grad_norm": 0.09523718569743331,
"learning_rate": 1.9755394670453745e-05,
"loss": 0.364,
"step": 1039
},
{
"epoch": 3.5121078121709832,
"grad_norm": 0.09716008155147098,
"learning_rate": 1.9674084188641235e-05,
"loss": 0.3614,
"step": 1040
},
{
"epoch": 3.5154769425142134,
"grad_norm": 0.09307186314834033,
"learning_rate": 1.9592886775220957e-05,
"loss": 0.3663,
"step": 1041
},
{
"epoch": 3.5188460728574436,
"grad_norm": 0.0966569916505279,
"learning_rate": 1.9511802881875438e-05,
"loss": 0.3628,
"step": 1042
},
{
"epoch": 3.5222152032006737,
"grad_norm": 0.09953663178152124,
"learning_rate": 1.943083295965572e-05,
"loss": 0.3653,
"step": 1043
},
{
"epoch": 3.525584333543904,
"grad_norm": 0.09000177069317349,
"learning_rate": 1.9349977458978846e-05,
"loss": 0.357,
"step": 1044
},
{
"epoch": 3.528953463887134,
"grad_norm": 0.09693735378111683,
"learning_rate": 1.9269236829625387e-05,
"loss": 0.3623,
"step": 1045
},
{
"epoch": 3.532322594230364,
"grad_norm": 0.1010678964013295,
"learning_rate": 1.9188611520736846e-05,
"loss": 0.3631,
"step": 1046
},
{
"epoch": 3.5356917245735944,
"grad_norm": 0.08709082546898574,
"learning_rate": 1.9108101980813277e-05,
"loss": 0.3559,
"step": 1047
},
{
"epoch": 3.5390608549168245,
"grad_norm": 0.09973595422763583,
"learning_rate": 1.902770865771074e-05,
"loss": 0.3572,
"step": 1048
},
{
"epoch": 3.5424299852600547,
"grad_norm": 0.0932062947908472,
"learning_rate": 1.8947431998638762e-05,
"loss": 0.3703,
"step": 1049
},
{
"epoch": 3.545799115603285,
"grad_norm": 0.0927283626151012,
"learning_rate": 1.886727245015794e-05,
"loss": 0.3604,
"step": 1050
},
{
"epoch": 3.549168245946515,
"grad_norm": 0.0899928028286008,
"learning_rate": 1.8787230458177408e-05,
"loss": 0.3596,
"step": 1051
},
{
"epoch": 3.552537376289745,
"grad_norm": 0.09291563797483152,
"learning_rate": 1.8707306467952323e-05,
"loss": 0.3602,
"step": 1052
},
{
"epoch": 3.5559065066329754,
"grad_norm": 0.0873067226862915,
"learning_rate": 1.862750092408147e-05,
"loss": 0.3632,
"step": 1053
},
{
"epoch": 3.5592756369762055,
"grad_norm": 0.09201291502685034,
"learning_rate": 1.8547814270504705e-05,
"loss": 0.3665,
"step": 1054
},
{
"epoch": 3.5626447673194357,
"grad_norm": 0.08672862490756593,
"learning_rate": 1.8468246950500556e-05,
"loss": 0.3595,
"step": 1055
},
{
"epoch": 3.566013897662666,
"grad_norm": 0.08745897822977576,
"learning_rate": 1.838879940668373e-05,
"loss": 0.3605,
"step": 1056
},
{
"epoch": 3.569383028005896,
"grad_norm": 0.09233748547600358,
"learning_rate": 1.83094720810026e-05,
"loss": 0.36,
"step": 1057
},
{
"epoch": 3.572752158349126,
"grad_norm": 0.09364157413857832,
"learning_rate": 1.823026541473684e-05,
"loss": 0.3642,
"step": 1058
},
{
"epoch": 3.5761212886923563,
"grad_norm": 0.0919710962105762,
"learning_rate": 1.8151179848494905e-05,
"loss": 0.3629,
"step": 1059
},
{
"epoch": 3.5794904190355865,
"grad_norm": 0.09562060747735483,
"learning_rate": 1.8072215822211613e-05,
"loss": 0.3623,
"step": 1060
},
{
"epoch": 3.5828595493788167,
"grad_norm": 0.08937202924753714,
"learning_rate": 1.7993373775145663e-05,
"loss": 0.3608,
"step": 1061
},
{
"epoch": 3.586228679722047,
"grad_norm": 0.09294261879238143,
"learning_rate": 1.7914654145877187e-05,
"loss": 0.3605,
"step": 1062
},
{
"epoch": 3.589597810065277,
"grad_norm": 0.08234694139654683,
"learning_rate": 1.7836057372305423e-05,
"loss": 0.3628,
"step": 1063
},
{
"epoch": 3.592966940408507,
"grad_norm": 0.09244388782945556,
"learning_rate": 1.77575838916461e-05,
"loss": 0.3584,
"step": 1064
},
{
"epoch": 3.5963360707517373,
"grad_norm": 0.0993638664940156,
"learning_rate": 1.767923414042915e-05,
"loss": 0.3614,
"step": 1065
},
{
"epoch": 3.5997052010949675,
"grad_norm": 0.08789579064875053,
"learning_rate": 1.760100855449619e-05,
"loss": 0.3603,
"step": 1066
},
{
"epoch": 3.6030743314381977,
"grad_norm": 0.10411154322718334,
"learning_rate": 1.752290756899816e-05,
"loss": 0.3624,
"step": 1067
},
{
"epoch": 3.606443461781428,
"grad_norm": 0.08047024019249209,
"learning_rate": 1.7444931618392894e-05,
"loss": 0.3585,
"step": 1068
},
{
"epoch": 3.6098125921246575,
"grad_norm": 0.10614805370086325,
"learning_rate": 1.736708113644262e-05,
"loss": 0.363,
"step": 1069
},
{
"epoch": 3.613181722467888,
"grad_norm": 0.08611033060017922,
"learning_rate": 1.7289356556211687e-05,
"loss": 0.3637,
"step": 1070
},
{
"epoch": 3.616550852811118,
"grad_norm": 0.0873884543774505,
"learning_rate": 1.7211758310064042e-05,
"loss": 0.3578,
"step": 1071
},
{
"epoch": 3.6199199831543485,
"grad_norm": 0.09185990558523179,
"learning_rate": 1.7134286829660855e-05,
"loss": 0.3677,
"step": 1072
},
{
"epoch": 3.623289113497578,
"grad_norm": 0.08742862884585491,
"learning_rate": 1.7056942545958167e-05,
"loss": 0.3657,
"step": 1073
},
{
"epoch": 3.626658243840809,
"grad_norm": 0.08793012887671753,
"learning_rate": 1.697972588920439e-05,
"loss": 0.3655,
"step": 1074
},
{
"epoch": 3.6300273741840385,
"grad_norm": 0.09014218865360733,
"learning_rate": 1.6902637288938074e-05,
"loss": 0.364,
"step": 1075
},
{
"epoch": 3.633396504527269,
"grad_norm": 0.08892601725042051,
"learning_rate": 1.6825677173985332e-05,
"loss": 0.3665,
"step": 1076
},
{
"epoch": 3.636765634870499,
"grad_norm": 0.0878924041737089,
"learning_rate": 1.6748845972457562e-05,
"loss": 0.3563,
"step": 1077
},
{
"epoch": 3.6401347652137295,
"grad_norm": 0.09417513459021953,
"learning_rate": 1.6672144111749066e-05,
"loss": 0.3657,
"step": 1078
},
{
"epoch": 3.643503895556959,
"grad_norm": 0.09041822306873473,
"learning_rate": 1.659557201853465e-05,
"loss": 0.3687,
"step": 1079
},
{
"epoch": 3.6468730259001894,
"grad_norm": 0.08690354592106783,
"learning_rate": 1.6519130118767258e-05,
"loss": 0.3601,
"step": 1080
},
{
"epoch": 3.6502421562434195,
"grad_norm": 0.08875815871506505,
"learning_rate": 1.6442818837675578e-05,
"loss": 0.3602,
"step": 1081
},
{
"epoch": 3.6536112865866497,
"grad_norm": 0.08649489906143072,
"learning_rate": 1.6366638599761676e-05,
"loss": 0.362,
"step": 1082
},
{
"epoch": 3.65698041692988,
"grad_norm": 0.08572937873824316,
"learning_rate": 1.6290589828798736e-05,
"loss": 0.3614,
"step": 1083
},
{
"epoch": 3.66034954727311,
"grad_norm": 0.09798247509524252,
"learning_rate": 1.621467294782854e-05,
"loss": 0.3608,
"step": 1084
},
{
"epoch": 3.66371867761634,
"grad_norm": 0.08586977784228111,
"learning_rate": 1.6138888379159238e-05,
"loss": 0.3602,
"step": 1085
},
{
"epoch": 3.6670878079595703,
"grad_norm": 0.09119393564711122,
"learning_rate": 1.606323654436293e-05,
"loss": 0.3641,
"step": 1086
},
{
"epoch": 3.6704569383028005,
"grad_norm": 0.09035766592284558,
"learning_rate": 1.5987717864273377e-05,
"loss": 0.366,
"step": 1087
},
{
"epoch": 3.6738260686460307,
"grad_norm": 0.08837128914166983,
"learning_rate": 1.591233275898363e-05,
"loss": 0.3621,
"step": 1088
},
{
"epoch": 3.677195198989261,
"grad_norm": 0.09501671591225473,
"learning_rate": 1.5837081647843652e-05,
"loss": 0.3655,
"step": 1089
},
{
"epoch": 3.680564329332491,
"grad_norm": 0.08827549191913663,
"learning_rate": 1.5761964949458076e-05,
"loss": 0.3664,
"step": 1090
},
{
"epoch": 3.683933459675721,
"grad_norm": 0.08653211178416792,
"learning_rate": 1.5686983081683816e-05,
"loss": 0.3613,
"step": 1091
},
{
"epoch": 3.6873025900189513,
"grad_norm": 0.09007377723059057,
"learning_rate": 1.5612136461627726e-05,
"loss": 0.3605,
"step": 1092
},
{
"epoch": 3.6906717203621815,
"grad_norm": 0.08875691955803114,
"learning_rate": 1.5537425505644358e-05,
"loss": 0.3692,
"step": 1093
},
{
"epoch": 3.6940408507054117,
"grad_norm": 0.0865487137501953,
"learning_rate": 1.546285062933352e-05,
"loss": 0.3637,
"step": 1094
},
{
"epoch": 3.697409981048642,
"grad_norm": 0.08441316995604657,
"learning_rate": 1.5388412247538148e-05,
"loss": 0.3566,
"step": 1095
},
{
"epoch": 3.700779111391872,
"grad_norm": 0.08795109300895403,
"learning_rate": 1.5314110774341803e-05,
"loss": 0.3649,
"step": 1096
},
{
"epoch": 3.704148241735102,
"grad_norm": 0.08660466649366423,
"learning_rate": 1.5239946623066466e-05,
"loss": 0.3656,
"step": 1097
},
{
"epoch": 3.7075173720783323,
"grad_norm": 0.08962594968540999,
"learning_rate": 1.5165920206270257e-05,
"loss": 0.3578,
"step": 1098
},
{
"epoch": 3.7108865024215625,
"grad_norm": 0.08885368968596062,
"learning_rate": 1.5092031935745102e-05,
"loss": 0.362,
"step": 1099
},
{
"epoch": 3.7142556327647926,
"grad_norm": 0.0928847110975983,
"learning_rate": 1.5018282222514451e-05,
"loss": 0.3673,
"step": 1100
},
{
"epoch": 3.717624763108023,
"grad_norm": 0.08764355173587007,
"learning_rate": 1.4944671476830967e-05,
"loss": 0.3559,
"step": 1101
},
{
"epoch": 3.720993893451253,
"grad_norm": 0.08858832328866184,
"learning_rate": 1.4871200108174306e-05,
"loss": 0.3621,
"step": 1102
},
{
"epoch": 3.724363023794483,
"grad_norm": 0.08763675374855749,
"learning_rate": 1.479786852524879e-05,
"loss": 0.3588,
"step": 1103
},
{
"epoch": 3.7277321541377133,
"grad_norm": 0.08909210540103846,
"learning_rate": 1.4724677135981118e-05,
"loss": 0.3625,
"step": 1104
},
{
"epoch": 3.7311012844809435,
"grad_norm": 0.08178768871920535,
"learning_rate": 1.4651626347518169e-05,
"loss": 0.3621,
"step": 1105
},
{
"epoch": 3.7344704148241736,
"grad_norm": 0.09156080753526148,
"learning_rate": 1.457871656622463e-05,
"loss": 0.359,
"step": 1106
},
{
"epoch": 3.737839545167404,
"grad_norm": 0.08325843071720376,
"learning_rate": 1.4505948197680892e-05,
"loss": 0.3607,
"step": 1107
},
{
"epoch": 3.741208675510634,
"grad_norm": 0.08448426244553969,
"learning_rate": 1.4433321646680614e-05,
"loss": 0.3648,
"step": 1108
},
{
"epoch": 3.744577805853864,
"grad_norm": 0.08086940191673836,
"learning_rate": 1.4360837317228571e-05,
"loss": 0.3588,
"step": 1109
},
{
"epoch": 3.747946936197094,
"grad_norm": 0.08083156500148386,
"learning_rate": 1.4288495612538427e-05,
"loss": 0.3571,
"step": 1110
},
{
"epoch": 3.7513160665403245,
"grad_norm": 0.08098967397738577,
"learning_rate": 1.4216296935030433e-05,
"loss": 0.3661,
"step": 1111
},
{
"epoch": 3.754685196883554,
"grad_norm": 0.08420676698532144,
"learning_rate": 1.4144241686329236e-05,
"loss": 0.3667,
"step": 1112
},
{
"epoch": 3.758054327226785,
"grad_norm": 0.08205757559067017,
"learning_rate": 1.4072330267261585e-05,
"loss": 0.3538,
"step": 1113
},
{
"epoch": 3.7614234575700145,
"grad_norm": 0.08130986735808181,
"learning_rate": 1.400056307785413e-05,
"loss": 0.358,
"step": 1114
},
{
"epoch": 3.764792587913245,
"grad_norm": 0.08513069994957134,
"learning_rate": 1.3928940517331282e-05,
"loss": 0.363,
"step": 1115
},
{
"epoch": 3.768161718256475,
"grad_norm": 0.08636038178242018,
"learning_rate": 1.3857462984112831e-05,
"loss": 0.3625,
"step": 1116
},
{
"epoch": 3.7715308485997054,
"grad_norm": 0.08228222146136721,
"learning_rate": 1.3786130875811864e-05,
"loss": 0.3643,
"step": 1117
},
{
"epoch": 3.774899978942935,
"grad_norm": 0.08245367651432615,
"learning_rate": 1.371494458923246e-05,
"loss": 0.3611,
"step": 1118
},
{
"epoch": 3.7782691092861658,
"grad_norm": 0.08984002063694464,
"learning_rate": 1.3643904520367568e-05,
"loss": 0.3665,
"step": 1119
},
{
"epoch": 3.7816382396293955,
"grad_norm": 0.08004513922265995,
"learning_rate": 1.3573011064396751e-05,
"loss": 0.3626,
"step": 1120
},
{
"epoch": 3.7850073699726257,
"grad_norm": 0.08501294133045856,
"learning_rate": 1.3502264615683966e-05,
"loss": 0.3584,
"step": 1121
},
{
"epoch": 3.788376500315856,
"grad_norm": 0.08838080335200914,
"learning_rate": 1.3431665567775439e-05,
"loss": 0.3584,
"step": 1122
},
{
"epoch": 3.791745630659086,
"grad_norm": 0.08330553315133392,
"learning_rate": 1.3361214313397444e-05,
"loss": 0.36,
"step": 1123
},
{
"epoch": 3.795114761002316,
"grad_norm": 0.08670146765162016,
"learning_rate": 1.3290911244454066e-05,
"loss": 0.3661,
"step": 1124
},
{
"epoch": 3.7984838913455463,
"grad_norm": 0.0841408453670069,
"learning_rate": 1.3220756752025126e-05,
"loss": 0.363,
"step": 1125
},
{
"epoch": 3.8018530216887765,
"grad_norm": 0.08384047682221397,
"learning_rate": 1.3150751226363886e-05,
"loss": 0.3622,
"step": 1126
},
{
"epoch": 3.8052221520320066,
"grad_norm": 0.08347244270329462,
"learning_rate": 1.3080895056895022e-05,
"loss": 0.3618,
"step": 1127
},
{
"epoch": 3.808591282375237,
"grad_norm": 0.0851964538331852,
"learning_rate": 1.3011188632212307e-05,
"loss": 0.3639,
"step": 1128
},
{
"epoch": 3.811960412718467,
"grad_norm": 0.08389988414632749,
"learning_rate": 1.2941632340076531e-05,
"loss": 0.3656,
"step": 1129
},
{
"epoch": 3.815329543061697,
"grad_norm": 0.0818943745196087,
"learning_rate": 1.2872226567413346e-05,
"loss": 0.3595,
"step": 1130
},
{
"epoch": 3.8186986734049273,
"grad_norm": 0.07744154226291297,
"learning_rate": 1.2802971700311103e-05,
"loss": 0.3595,
"step": 1131
},
{
"epoch": 3.8220678037481575,
"grad_norm": 0.08550107649728135,
"learning_rate": 1.2733868124018694e-05,
"loss": 0.3614,
"step": 1132
},
{
"epoch": 3.8254369340913876,
"grad_norm": 0.07860069500089853,
"learning_rate": 1.2664916222943392e-05,
"loss": 0.3552,
"step": 1133
},
{
"epoch": 3.828806064434618,
"grad_norm": 0.08126878361185912,
"learning_rate": 1.2596116380648761e-05,
"loss": 0.3622,
"step": 1134
},
{
"epoch": 3.832175194777848,
"grad_norm": 0.08610190267035886,
"learning_rate": 1.2527468979852513e-05,
"loss": 0.3645,
"step": 1135
},
{
"epoch": 3.835544325121078,
"grad_norm": 0.0815351952289208,
"learning_rate": 1.2458974402424312e-05,
"loss": 0.36,
"step": 1136
},
{
"epoch": 3.8389134554643083,
"grad_norm": 0.08556542224799225,
"learning_rate": 1.239063302938376e-05,
"loss": 0.3581,
"step": 1137
},
{
"epoch": 3.8422825858075385,
"grad_norm": 0.0863667479775735,
"learning_rate": 1.2322445240898158e-05,
"loss": 0.3592,
"step": 1138
},
{
"epoch": 3.8456517161507686,
"grad_norm": 0.09012131069922455,
"learning_rate": 1.2254411416280494e-05,
"loss": 0.3608,
"step": 1139
},
{
"epoch": 3.849020846493999,
"grad_norm": 0.0813110318115451,
"learning_rate": 1.2186531933987294e-05,
"loss": 0.3617,
"step": 1140
},
{
"epoch": 3.852389976837229,
"grad_norm": 0.08621918893656133,
"learning_rate": 1.2118807171616469e-05,
"loss": 0.3632,
"step": 1141
},
{
"epoch": 3.855759107180459,
"grad_norm": 0.08876046697132542,
"learning_rate": 1.2051237505905302e-05,
"loss": 0.363,
"step": 1142
},
{
"epoch": 3.8591282375236893,
"grad_norm": 0.08486782812443205,
"learning_rate": 1.1983823312728306e-05,
"loss": 0.3681,
"step": 1143
},
{
"epoch": 3.8624973678669194,
"grad_norm": 0.08182983032740812,
"learning_rate": 1.19165649670951e-05,
"loss": 0.3635,
"step": 1144
},
{
"epoch": 3.8658664982101496,
"grad_norm": 0.07583529894267067,
"learning_rate": 1.1849462843148398e-05,
"loss": 0.3633,
"step": 1145
},
{
"epoch": 3.8692356285533798,
"grad_norm": 0.09126795810440728,
"learning_rate": 1.1782517314161872e-05,
"loss": 0.3584,
"step": 1146
},
{
"epoch": 3.87260475889661,
"grad_norm": 0.0811651957931282,
"learning_rate": 1.1715728752538103e-05,
"loss": 0.3617,
"step": 1147
},
{
"epoch": 3.87597388923984,
"grad_norm": 0.0843774615335534,
"learning_rate": 1.164909752980648e-05,
"loss": 0.3644,
"step": 1148
},
{
"epoch": 3.8793430195830703,
"grad_norm": 0.08099134098178276,
"learning_rate": 1.1582624016621154e-05,
"loss": 0.3595,
"step": 1149
},
{
"epoch": 3.8827121499263004,
"grad_norm": 0.07907946240714192,
"learning_rate": 1.1516308582758983e-05,
"loss": 0.3614,
"step": 1150
},
{
"epoch": 3.8860812802695306,
"grad_norm": 0.08423868561227663,
"learning_rate": 1.1450151597117479e-05,
"loss": 0.3613,
"step": 1151
},
{
"epoch": 3.8894504106127608,
"grad_norm": 0.08033686574245383,
"learning_rate": 1.1384153427712729e-05,
"loss": 0.3642,
"step": 1152
},
{
"epoch": 3.8928195409559905,
"grad_norm": 0.07677407670189697,
"learning_rate": 1.1318314441677348e-05,
"loss": 0.3569,
"step": 1153
},
{
"epoch": 3.896188671299221,
"grad_norm": 0.07906769729135289,
"learning_rate": 1.1252635005258466e-05,
"loss": 0.3595,
"step": 1154
},
{
"epoch": 3.899557801642451,
"grad_norm": 0.08225694582677316,
"learning_rate": 1.1187115483815693e-05,
"loss": 0.3644,
"step": 1155
},
{
"epoch": 3.9029269319856814,
"grad_norm": 0.08435086211540141,
"learning_rate": 1.1121756241819023e-05,
"loss": 0.3629,
"step": 1156
},
{
"epoch": 3.906296062328911,
"grad_norm": 0.0779208137414844,
"learning_rate": 1.105655764284689e-05,
"loss": 0.3594,
"step": 1157
},
{
"epoch": 3.9096651926721417,
"grad_norm": 0.07917404294021134,
"learning_rate": 1.0991520049584112e-05,
"loss": 0.3649,
"step": 1158
},
{
"epoch": 3.9130343230153715,
"grad_norm": 0.07819778959894405,
"learning_rate": 1.0926643823819827e-05,
"loss": 0.3643,
"step": 1159
},
{
"epoch": 3.916403453358602,
"grad_norm": 0.0822711836526933,
"learning_rate": 1.0861929326445572e-05,
"loss": 0.3627,
"step": 1160
},
{
"epoch": 3.919772583701832,
"grad_norm": 0.07971405947853387,
"learning_rate": 1.0797376917453187e-05,
"loss": 0.3599,
"step": 1161
},
{
"epoch": 3.923141714045062,
"grad_norm": 0.08341374870417605,
"learning_rate": 1.0732986955932869e-05,
"loss": 0.3555,
"step": 1162
},
{
"epoch": 3.926510844388292,
"grad_norm": 0.07752760209485876,
"learning_rate": 1.0668759800071174e-05,
"loss": 0.3591,
"step": 1163
},
{
"epoch": 3.9298799747315223,
"grad_norm": 0.0795643390039002,
"learning_rate": 1.0604695807148971e-05,
"loss": 0.3568,
"step": 1164
},
{
"epoch": 3.9332491050747524,
"grad_norm": 0.07806803915038091,
"learning_rate": 1.0540795333539515e-05,
"loss": 0.3629,
"step": 1165
},
{
"epoch": 3.9366182354179826,
"grad_norm": 0.07851161347206628,
"learning_rate": 1.0477058734706436e-05,
"loss": 0.3611,
"step": 1166
},
{
"epoch": 3.939987365761213,
"grad_norm": 0.07762641296833782,
"learning_rate": 1.0413486365201785e-05,
"loss": 0.3613,
"step": 1167
},
{
"epoch": 3.943356496104443,
"grad_norm": 0.08535005147447429,
"learning_rate": 1.0350078578664005e-05,
"loss": 0.3591,
"step": 1168
},
{
"epoch": 3.946725626447673,
"grad_norm": 0.07824237520005016,
"learning_rate": 1.0286835727816001e-05,
"loss": 0.363,
"step": 1169
},
{
"epoch": 3.9500947567909033,
"grad_norm": 0.0725027774844816,
"learning_rate": 1.0223758164463246e-05,
"loss": 0.361,
"step": 1170
},
{
"epoch": 3.9534638871341334,
"grad_norm": 0.08250211916215387,
"learning_rate": 1.0160846239491673e-05,
"loss": 0.3706,
"step": 1171
},
{
"epoch": 3.9568330174773636,
"grad_norm": 0.07768057857668437,
"learning_rate": 1.0098100302865865e-05,
"loss": 0.358,
"step": 1172
},
{
"epoch": 3.9602021478205938,
"grad_norm": 0.0743357334386284,
"learning_rate": 1.003552070362701e-05,
"loss": 0.3588,
"step": 1173
},
{
"epoch": 3.963571278163824,
"grad_norm": 0.08538154828312804,
"learning_rate": 9.973107789891024e-06,
"loss": 0.3687,
"step": 1174
},
{
"epoch": 3.966940408507054,
"grad_norm": 0.08474253190258095,
"learning_rate": 9.910861908846598e-06,
"loss": 0.36,
"step": 1175
},
{
"epoch": 3.9703095388502843,
"grad_norm": 0.07698260800417392,
"learning_rate": 9.848783406753224e-06,
"loss": 0.3655,
"step": 1176
},
{
"epoch": 3.9736786691935144,
"grad_norm": 0.07875068992732076,
"learning_rate": 9.786872628939329e-06,
"loss": 0.3605,
"step": 1177
},
{
"epoch": 3.9770477995367446,
"grad_norm": 0.08337836249305365,
"learning_rate": 9.725129919800339e-06,
"loss": 0.3653,
"step": 1178
},
{
"epoch": 3.9804169298799748,
"grad_norm": 0.0799444611097984,
"learning_rate": 9.66355562279671e-06,
"loss": 0.3604,
"step": 1179
},
{
"epoch": 3.983786060223205,
"grad_norm": 0.08618283586928363,
"learning_rate": 9.60215008045211e-06,
"loss": 0.3637,
"step": 1180
},
{
"epoch": 3.987155190566435,
"grad_norm": 0.08302579845358256,
"learning_rate": 9.540913634351408e-06,
"loss": 0.3602,
"step": 1181
},
{
"epoch": 3.9905243209096652,
"grad_norm": 0.07735294324245658,
"learning_rate": 9.479846625138909e-06,
"loss": 0.3596,
"step": 1182
},
{
"epoch": 3.9938934512528954,
"grad_norm": 0.07471734423709958,
"learning_rate": 9.418949392516307e-06,
"loss": 0.3611,
"step": 1183
},
{
"epoch": 3.9972625815961256,
"grad_norm": 0.08214012704171592,
"learning_rate": 9.358222275240884e-06,
"loss": 0.3648,
"step": 1184
},
{
"epoch": 4.00336913034323,
"grad_norm": 0.11292758122904588,
"learning_rate": 9.297665611123628e-06,
"loss": 0.3527,
"step": 1185
},
{
"epoch": 4.00673826068646,
"grad_norm": 0.0941098295127884,
"learning_rate": 9.237279737027326e-06,
"loss": 0.3472,
"step": 1186
},
{
"epoch": 4.01010739102969,
"grad_norm": 0.09639154458998347,
"learning_rate": 9.177064988864712e-06,
"loss": 0.3425,
"step": 1187
},
{
"epoch": 4.013476521372921,
"grad_norm": 0.09835304863889502,
"learning_rate": 9.117021701596567e-06,
"loss": 0.3446,
"step": 1188
},
{
"epoch": 4.01684565171615,
"grad_norm": 0.08987244503280054,
"learning_rate": 9.057150209229845e-06,
"loss": 0.3513,
"step": 1189
},
{
"epoch": 4.020214782059381,
"grad_norm": 0.10031177854257561,
"learning_rate": 8.99745084481594e-06,
"loss": 0.3516,
"step": 1190
},
{
"epoch": 4.023583912402611,
"grad_norm": 0.10651297976200229,
"learning_rate": 8.937923940448634e-06,
"loss": 0.3489,
"step": 1191
},
{
"epoch": 4.026953042745841,
"grad_norm": 0.08656835316363745,
"learning_rate": 8.87856982726243e-06,
"loss": 0.3402,
"step": 1192
},
{
"epoch": 4.030322173089071,
"grad_norm": 0.0977560831877126,
"learning_rate": 8.819388835430569e-06,
"loss": 0.348,
"step": 1193
},
{
"epoch": 4.033691303432302,
"grad_norm": 0.09746909055035731,
"learning_rate": 8.7603812941633e-06,
"loss": 0.3492,
"step": 1194
},
{
"epoch": 4.037060433775531,
"grad_norm": 0.08395050874481182,
"learning_rate": 8.701547531706018e-06,
"loss": 0.3482,
"step": 1195
},
{
"epoch": 4.040429564118762,
"grad_norm": 0.09139581639425662,
"learning_rate": 8.642887875337376e-06,
"loss": 0.3509,
"step": 1196
},
{
"epoch": 4.043798694461992,
"grad_norm": 0.09015094643326858,
"learning_rate": 8.584402651367556e-06,
"loss": 0.3445,
"step": 1197
},
{
"epoch": 4.047167824805222,
"grad_norm": 0.08067803096785321,
"learning_rate": 8.526092185136394e-06,
"loss": 0.345,
"step": 1198
},
{
"epoch": 4.050536955148452,
"grad_norm": 0.08630631888609785,
"learning_rate": 8.467956801011618e-06,
"loss": 0.338,
"step": 1199
},
{
"epoch": 4.053906085491683,
"grad_norm": 0.08433690244909006,
"learning_rate": 8.409996822386972e-06,
"loss": 0.343,
"step": 1200
},
{
"epoch": 4.057275215834912,
"grad_norm": 0.07920044123514752,
"learning_rate": 8.352212571680458e-06,
"loss": 0.3473,
"step": 1201
},
{
"epoch": 4.060644346178143,
"grad_norm": 0.07927154455223241,
"learning_rate": 8.294604370332613e-06,
"loss": 0.3482,
"step": 1202
},
{
"epoch": 4.064013476521373,
"grad_norm": 0.08109057542606768,
"learning_rate": 8.23717253880457e-06,
"loss": 0.3428,
"step": 1203
},
{
"epoch": 4.067382606864603,
"grad_norm": 0.08569342844895425,
"learning_rate": 8.17991739657641e-06,
"loss": 0.3474,
"step": 1204
},
{
"epoch": 4.070751737207833,
"grad_norm": 0.08637139957757115,
"learning_rate": 8.122839262145294e-06,
"loss": 0.3467,
"step": 1205
},
{
"epoch": 4.074120867551064,
"grad_norm": 0.07781808041765698,
"learning_rate": 8.06593845302376e-06,
"loss": 0.3395,
"step": 1206
},
{
"epoch": 4.077489997894293,
"grad_norm": 0.08111376806052889,
"learning_rate": 8.00921528573793e-06,
"loss": 0.3389,
"step": 1207
},
{
"epoch": 4.080859128237524,
"grad_norm": 0.08619767447901233,
"learning_rate": 7.952670075825702e-06,
"loss": 0.348,
"step": 1208
},
{
"epoch": 4.084228258580754,
"grad_norm": 0.07737321565650793,
"learning_rate": 7.896303137835084e-06,
"loss": 0.3373,
"step": 1209
},
{
"epoch": 4.087597388923984,
"grad_norm": 0.07775405743530504,
"learning_rate": 7.840114785322384e-06,
"loss": 0.3443,
"step": 1210
},
{
"epoch": 4.090966519267214,
"grad_norm": 0.07816418598625743,
"learning_rate": 7.78410533085046e-06,
"loss": 0.345,
"step": 1211
},
{
"epoch": 4.094335649610445,
"grad_norm": 0.08021420493935687,
"learning_rate": 7.728275085987041e-06,
"loss": 0.3445,
"step": 1212
},
{
"epoch": 4.097704779953674,
"grad_norm": 0.07501876010838501,
"learning_rate": 7.672624361302894e-06,
"loss": 0.345,
"step": 1213
},
{
"epoch": 4.101073910296905,
"grad_norm": 0.07616193917641446,
"learning_rate": 7.6171534663702416e-06,
"loss": 0.3451,
"step": 1214
},
{
"epoch": 4.104443040640135,
"grad_norm": 0.08197274858236898,
"learning_rate": 7.5618627097608835e-06,
"loss": 0.3481,
"step": 1215
},
{
"epoch": 4.107812170983365,
"grad_norm": 0.07483017111226394,
"learning_rate": 7.50675239904457e-06,
"loss": 0.3454,
"step": 1216
},
{
"epoch": 4.111181301326595,
"grad_norm": 0.07441931083866478,
"learning_rate": 7.451822840787279e-06,
"loss": 0.3469,
"step": 1217
},
{
"epoch": 4.114550431669826,
"grad_norm": 0.08142190767207858,
"learning_rate": 7.397074340549508e-06,
"loss": 0.3431,
"step": 1218
},
{
"epoch": 4.117919562013055,
"grad_norm": 0.07876869644542178,
"learning_rate": 7.342507202884577e-06,
"loss": 0.3462,
"step": 1219
},
{
"epoch": 4.121288692356286,
"grad_norm": 0.07845687277699909,
"learning_rate": 7.288121731336901e-06,
"loss": 0.3456,
"step": 1220
},
{
"epoch": 4.124657822699516,
"grad_norm": 0.07817574483354851,
"learning_rate": 7.233918228440324e-06,
"loss": 0.3436,
"step": 1221
},
{
"epoch": 4.128026953042746,
"grad_norm": 0.07876507958828823,
"learning_rate": 7.1798969957165025e-06,
"loss": 0.3493,
"step": 1222
},
{
"epoch": 4.131396083385976,
"grad_norm": 0.07707210638891601,
"learning_rate": 7.126058333673094e-06,
"loss": 0.3402,
"step": 1223
},
{
"epoch": 4.134765213729207,
"grad_norm": 0.07947117463971737,
"learning_rate": 7.072402541802197e-06,
"loss": 0.3478,
"step": 1224
},
{
"epoch": 4.138134344072436,
"grad_norm": 0.07708906857469865,
"learning_rate": 7.018929918578621e-06,
"loss": 0.3457,
"step": 1225
},
{
"epoch": 4.141503474415666,
"grad_norm": 0.08008450821251828,
"learning_rate": 6.965640761458274e-06,
"loss": 0.3414,
"step": 1226
},
{
"epoch": 4.144872604758897,
"grad_norm": 0.07732322409168987,
"learning_rate": 6.912535366876483e-06,
"loss": 0.3427,
"step": 1227
},
{
"epoch": 4.148241735102126,
"grad_norm": 0.07450575669616548,
"learning_rate": 6.859614030246318e-06,
"loss": 0.3477,
"step": 1228
},
{
"epoch": 4.151610865445357,
"grad_norm": 0.08433118593640568,
"learning_rate": 6.806877045957003e-06,
"loss": 0.3425,
"step": 1229
},
{
"epoch": 4.154979995788587,
"grad_norm": 0.07513389398253724,
"learning_rate": 6.754324707372264e-06,
"loss": 0.3443,
"step": 1230
},
{
"epoch": 4.158349126131817,
"grad_norm": 0.07536890804885507,
"learning_rate": 6.701957306828637e-06,
"loss": 0.3438,
"step": 1231
},
{
"epoch": 4.161718256475047,
"grad_norm": 0.07685668754719273,
"learning_rate": 6.649775135633944e-06,
"loss": 0.3401,
"step": 1232
},
{
"epoch": 4.165087386818278,
"grad_norm": 0.07956673529792976,
"learning_rate": 6.597778484065571e-06,
"loss": 0.3503,
"step": 1233
},
{
"epoch": 4.168456517161507,
"grad_norm": 0.07209527381971025,
"learning_rate": 6.545967641368958e-06,
"loss": 0.3434,
"step": 1234
},
{
"epoch": 4.171825647504738,
"grad_norm": 0.07458918014634688,
"learning_rate": 6.494342895755879e-06,
"loss": 0.343,
"step": 1235
},
{
"epoch": 4.175194777847968,
"grad_norm": 0.08077306421411162,
"learning_rate": 6.4429045344029136e-06,
"loss": 0.3513,
"step": 1236
},
{
"epoch": 4.178563908191198,
"grad_norm": 0.08065308092284855,
"learning_rate": 6.391652843449829e-06,
"loss": 0.3434,
"step": 1237
},
{
"epoch": 4.181933038534428,
"grad_norm": 0.0731775502872814,
"learning_rate": 6.340588107997994e-06,
"loss": 0.3443,
"step": 1238
},
{
"epoch": 4.185302168877659,
"grad_norm": 0.07546567416391478,
"learning_rate": 6.289710612108786e-06,
"loss": 0.3434,
"step": 1239
},
{
"epoch": 4.188671299220888,
"grad_norm": 0.07650397977406549,
"learning_rate": 6.239020638801987e-06,
"loss": 0.3452,
"step": 1240
},
{
"epoch": 4.192040429564119,
"grad_norm": 0.07431679145535366,
"learning_rate": 6.18851847005427e-06,
"loss": 0.3484,
"step": 1241
},
{
"epoch": 4.195409559907349,
"grad_norm": 0.07416827387620398,
"learning_rate": 6.1382043867975836e-06,
"loss": 0.3452,
"step": 1242
},
{
"epoch": 4.198778690250579,
"grad_norm": 0.07754320392922942,
"learning_rate": 6.088078668917572e-06,
"loss": 0.3491,
"step": 1243
},
{
"epoch": 4.202147820593809,
"grad_norm": 0.07827458851806732,
"learning_rate": 6.038141595252094e-06,
"loss": 0.3406,
"step": 1244
},
{
"epoch": 4.20551695093704,
"grad_norm": 0.0725724426162921,
"learning_rate": 5.9883934435895774e-06,
"loss": 0.3496,
"step": 1245
},
{
"epoch": 4.208886081280269,
"grad_norm": 0.0719909369345341,
"learning_rate": 5.9388344906675485e-06,
"loss": 0.3526,
"step": 1246
},
{
"epoch": 4.2122552116235,
"grad_norm": 0.07567213228800986,
"learning_rate": 5.889465012171069e-06,
"loss": 0.3468,
"step": 1247
},
{
"epoch": 4.21562434196673,
"grad_norm": 0.07098076354440293,
"learning_rate": 5.840285282731173e-06,
"loss": 0.3466,
"step": 1248
},
{
"epoch": 4.21899347230996,
"grad_norm": 0.07019771893928237,
"learning_rate": 5.791295575923382e-06,
"loss": 0.3448,
"step": 1249
},
{
"epoch": 4.22236260265319,
"grad_norm": 0.07471579252214251,
"learning_rate": 5.742496164266174e-06,
"loss": 0.3491,
"step": 1250
},
{
"epoch": 4.225731732996421,
"grad_norm": 0.07236549423445121,
"learning_rate": 5.693887319219422e-06,
"loss": 0.3499,
"step": 1251
},
{
"epoch": 4.22910086333965,
"grad_norm": 0.07134479537520134,
"learning_rate": 5.645469311182958e-06,
"loss": 0.3459,
"step": 1252
},
{
"epoch": 4.232469993682881,
"grad_norm": 0.07072016749147457,
"learning_rate": 5.597242409495018e-06,
"loss": 0.3438,
"step": 1253
},
{
"epoch": 4.235839124026111,
"grad_norm": 0.07179051070856982,
"learning_rate": 5.549206882430773e-06,
"loss": 0.3419,
"step": 1254
},
{
"epoch": 4.239208254369341,
"grad_norm": 0.07302770625869862,
"learning_rate": 5.501362997200787e-06,
"loss": 0.3487,
"step": 1255
},
{
"epoch": 4.242577384712571,
"grad_norm": 0.06976392401988353,
"learning_rate": 5.453711019949581e-06,
"loss": 0.344,
"step": 1256
},
{
"epoch": 4.245946515055802,
"grad_norm": 0.07078499285712887,
"learning_rate": 5.406251215754146e-06,
"loss": 0.3465,
"step": 1257
},
{
"epoch": 4.249315645399031,
"grad_norm": 0.07118826571789505,
"learning_rate": 5.358983848622452e-06,
"loss": 0.3504,
"step": 1258
},
{
"epoch": 4.252684775742262,
"grad_norm": 0.0686563097499576,
"learning_rate": 5.311909181491994e-06,
"loss": 0.3433,
"step": 1259
},
{
"epoch": 4.256053906085492,
"grad_norm": 0.06836729686980945,
"learning_rate": 5.265027476228297e-06,
"loss": 0.3428,
"step": 1260
},
{
"epoch": 4.259423036428722,
"grad_norm": 0.07026205200909408,
"learning_rate": 5.218338993623499e-06,
"loss": 0.3475,
"step": 1261
},
{
"epoch": 4.262792166771952,
"grad_norm": 0.07032323091306557,
"learning_rate": 5.171843993394903e-06,
"loss": 0.3431,
"step": 1262
},
{
"epoch": 4.2661612971151825,
"grad_norm": 0.07423746533959613,
"learning_rate": 5.125542734183473e-06,
"loss": 0.3445,
"step": 1263
},
{
"epoch": 4.269530427458412,
"grad_norm": 0.07841448579779874,
"learning_rate": 5.079435473552474e-06,
"loss": 0.3481,
"step": 1264
},
{
"epoch": 4.272899557801642,
"grad_norm": 0.07040437269579536,
"learning_rate": 5.033522467985985e-06,
"loss": 0.3422,
"step": 1265
},
{
"epoch": 4.276268688144873,
"grad_norm": 0.07271729651198641,
"learning_rate": 4.987803972887482e-06,
"loss": 0.3433,
"step": 1266
},
{
"epoch": 4.279637818488103,
"grad_norm": 0.07717082685197238,
"learning_rate": 4.9422802425784475e-06,
"loss": 0.3459,
"step": 1267
},
{
"epoch": 4.283006948831333,
"grad_norm": 0.07646859752104176,
"learning_rate": 4.896951530296896e-06,
"loss": 0.3487,
"step": 1268
},
{
"epoch": 4.286376079174563,
"grad_norm": 0.07196146666335995,
"learning_rate": 4.851818088196041e-06,
"loss": 0.3451,
"step": 1269
},
{
"epoch": 4.289745209517793,
"grad_norm": 0.07601088345941356,
"learning_rate": 4.806880167342831e-06,
"loss": 0.346,
"step": 1270
},
{
"epoch": 4.293114339861023,
"grad_norm": 0.0730390084111676,
"learning_rate": 4.762138017716571e-06,
"loss": 0.3451,
"step": 1271
},
{
"epoch": 4.296483470204254,
"grad_norm": 0.08370554873202815,
"learning_rate": 4.7175918882075465e-06,
"loss": 0.3413,
"step": 1272
},
{
"epoch": 4.299852600547483,
"grad_norm": 0.07165140458981821,
"learning_rate": 4.673242026615627e-06,
"loss": 0.3413,
"step": 1273
},
{
"epoch": 4.303221730890714,
"grad_norm": 0.07124644667052794,
"learning_rate": 4.6290886796488946e-06,
"loss": 0.3474,
"step": 1274
},
{
"epoch": 4.306590861233944,
"grad_norm": 0.07331931362741691,
"learning_rate": 4.58513209292224e-06,
"loss": 0.3445,
"step": 1275
},
{
"epoch": 4.309959991577174,
"grad_norm": 0.07237727500497035,
"learning_rate": 4.54137251095605e-06,
"loss": 0.3511,
"step": 1276
},
{
"epoch": 4.313329121920404,
"grad_norm": 0.07038492284926416,
"learning_rate": 4.4978101771748195e-06,
"loss": 0.3429,
"step": 1277
},
{
"epoch": 4.316698252263635,
"grad_norm": 0.07186746493744087,
"learning_rate": 4.454445333905768e-06,
"loss": 0.3423,
"step": 1278
},
{
"epoch": 4.320067382606864,
"grad_norm": 0.07185532233373727,
"learning_rate": 4.411278222377551e-06,
"loss": 0.3416,
"step": 1279
},
{
"epoch": 4.323436512950095,
"grad_norm": 0.0702075072689657,
"learning_rate": 4.3683090827188666e-06,
"loss": 0.3452,
"step": 1280
},
{
"epoch": 4.326805643293325,
"grad_norm": 0.0752614715082349,
"learning_rate": 4.325538153957158e-06,
"loss": 0.3475,
"step": 1281
},
{
"epoch": 4.330174773636555,
"grad_norm": 0.07050331941427515,
"learning_rate": 4.282965674017265e-06,
"loss": 0.3477,
"step": 1282
},
{
"epoch": 4.333543903979785,
"grad_norm": 0.07219368807869528,
"learning_rate": 4.240591879720084e-06,
"loss": 0.3497,
"step": 1283
},
{
"epoch": 4.336913034323016,
"grad_norm": 0.06956963675751204,
"learning_rate": 4.198417006781283e-06,
"loss": 0.3474,
"step": 1284
},
{
"epoch": 4.340282164666245,
"grad_norm": 0.06960098578843016,
"learning_rate": 4.156441289809983e-06,
"loss": 0.3445,
"step": 1285
},
{
"epoch": 4.343651295009476,
"grad_norm": 0.07648526368534525,
"learning_rate": 4.114664962307439e-06,
"loss": 0.3479,
"step": 1286
},
{
"epoch": 4.347020425352706,
"grad_norm": 0.07088809269875901,
"learning_rate": 4.073088256665742e-06,
"loss": 0.3421,
"step": 1287
},
{
"epoch": 4.350389555695936,
"grad_norm": 0.07273421779811111,
"learning_rate": 4.031711404166525e-06,
"loss": 0.344,
"step": 1288
},
{
"epoch": 4.353758686039166,
"grad_norm": 0.07174713114445853,
"learning_rate": 3.9905346349797234e-06,
"loss": 0.3441,
"step": 1289
},
{
"epoch": 4.3571278163823965,
"grad_norm": 0.07290897068132188,
"learning_rate": 3.949558178162209e-06,
"loss": 0.3462,
"step": 1290
},
{
"epoch": 4.360496946725626,
"grad_norm": 0.07194649852054723,
"learning_rate": 3.9087822616565984e-06,
"loss": 0.3478,
"step": 1291
},
{
"epoch": 4.363866077068857,
"grad_norm": 0.07337266992394913,
"learning_rate": 3.86820711228991e-06,
"loss": 0.3447,
"step": 1292
},
{
"epoch": 4.367235207412087,
"grad_norm": 0.07030690021581439,
"learning_rate": 3.827832955772372e-06,
"loss": 0.3456,
"step": 1293
},
{
"epoch": 4.370604337755317,
"grad_norm": 0.07201158711941352,
"learning_rate": 3.7876600166961353e-06,
"loss": 0.3465,
"step": 1294
},
{
"epoch": 4.373973468098547,
"grad_norm": 0.07511999851456955,
"learning_rate": 3.747688518534003e-06,
"loss": 0.3509,
"step": 1295
},
{
"epoch": 4.3773425984417775,
"grad_norm": 0.07172350904328591,
"learning_rate": 3.707918683638223e-06,
"loss": 0.345,
"step": 1296
},
{
"epoch": 4.380711728785007,
"grad_norm": 0.0693885503387989,
"learning_rate": 3.6683507332392476e-06,
"loss": 0.3453,
"step": 1297
},
{
"epoch": 4.384080859128238,
"grad_norm": 0.07019744686285931,
"learning_rate": 3.628984887444462e-06,
"loss": 0.3432,
"step": 1298
},
{
"epoch": 4.387449989471468,
"grad_norm": 0.06892399615992918,
"learning_rate": 3.589821365237023e-06,
"loss": 0.3422,
"step": 1299
},
{
"epoch": 4.390819119814698,
"grad_norm": 0.0711323225202878,
"learning_rate": 3.550860384474568e-06,
"loss": 0.3468,
"step": 1300
},
{
"epoch": 4.394188250157928,
"grad_norm": 0.07222951484982641,
"learning_rate": 3.5121021618881e-06,
"loss": 0.3444,
"step": 1301
},
{
"epoch": 4.3975573805011585,
"grad_norm": 0.07011816955357002,
"learning_rate": 3.473546913080674e-06,
"loss": 0.3417,
"step": 1302
},
{
"epoch": 4.400926510844388,
"grad_norm": 0.06918135608237871,
"learning_rate": 3.4351948525262625e-06,
"loss": 0.3431,
"step": 1303
},
{
"epoch": 4.404295641187619,
"grad_norm": 0.07183448949638974,
"learning_rate": 3.397046193568558e-06,
"loss": 0.3454,
"step": 1304
},
{
"epoch": 4.407664771530849,
"grad_norm": 0.06841029875272973,
"learning_rate": 3.3591011484197744e-06,
"loss": 0.3471,
"step": 1305
},
{
"epoch": 4.411033901874079,
"grad_norm": 0.07008578728288764,
"learning_rate": 3.3213599281594688e-06,
"loss": 0.3469,
"step": 1306
},
{
"epoch": 4.414403032217309,
"grad_norm": 0.06784411674661273,
"learning_rate": 3.28382274273336e-06,
"loss": 0.3452,
"step": 1307
},
{
"epoch": 4.417772162560539,
"grad_norm": 0.06727601165426443,
"learning_rate": 3.246489800952155e-06,
"loss": 0.3513,
"step": 1308
},
{
"epoch": 4.421141292903769,
"grad_norm": 0.06930299868926686,
"learning_rate": 3.209361310490451e-06,
"loss": 0.344,
"step": 1309
},
{
"epoch": 4.424510423247,
"grad_norm": 0.06983137546711997,
"learning_rate": 3.172437477885475e-06,
"loss": 0.3432,
"step": 1310
},
{
"epoch": 4.4278795535902296,
"grad_norm": 0.06738405898147315,
"learning_rate": 3.1357185085360233e-06,
"loss": 0.3412,
"step": 1311
},
{
"epoch": 4.431248683933459,
"grad_norm": 0.069114436702608,
"learning_rate": 3.099204606701256e-06,
"loss": 0.3438,
"step": 1312
},
{
"epoch": 4.43461781427669,
"grad_norm": 0.07063250147224803,
"learning_rate": 3.062895975499616e-06,
"loss": 0.3449,
"step": 1313
},
{
"epoch": 4.43798694461992,
"grad_norm": 0.06869203050534661,
"learning_rate": 3.026792816907671e-06,
"loss": 0.347,
"step": 1314
},
{
"epoch": 4.44135607496315,
"grad_norm": 0.06790795340800003,
"learning_rate": 2.9908953317589675e-06,
"loss": 0.3511,
"step": 1315
},
{
"epoch": 4.44472520530638,
"grad_norm": 0.06801706888897209,
"learning_rate": 2.955203719742965e-06,
"loss": 0.3499,
"step": 1316
},
{
"epoch": 4.4480943356496105,
"grad_norm": 0.06703090567229934,
"learning_rate": 2.9197181794038896e-06,
"loss": 0.3409,
"step": 1317
},
{
"epoch": 4.45146346599284,
"grad_norm": 0.06845785402581211,
"learning_rate": 2.884438908139626e-06,
"loss": 0.3451,
"step": 1318
},
{
"epoch": 4.454832596336071,
"grad_norm": 0.06809288242514337,
"learning_rate": 2.8493661022006615e-06,
"loss": 0.349,
"step": 1319
},
{
"epoch": 4.458201726679301,
"grad_norm": 0.06993068933675987,
"learning_rate": 2.814499956688912e-06,
"loss": 0.3457,
"step": 1320
},
{
"epoch": 4.461570857022531,
"grad_norm": 0.06709969061038806,
"learning_rate": 2.7798406655567565e-06,
"loss": 0.3512,
"step": 1321
},
{
"epoch": 4.464939987365761,
"grad_norm": 0.06978980053246452,
"learning_rate": 2.7453884216058368e-06,
"loss": 0.3452,
"step": 1322
},
{
"epoch": 4.4683091177089915,
"grad_norm": 0.06806425403838408,
"learning_rate": 2.7111434164860573e-06,
"loss": 0.3489,
"step": 1323
},
{
"epoch": 4.471678248052221,
"grad_norm": 0.07023315792460011,
"learning_rate": 2.677105840694507e-06,
"loss": 0.3484,
"step": 1324
},
{
"epoch": 4.475047378395452,
"grad_norm": 0.0671632913864402,
"learning_rate": 2.6432758835743854e-06,
"loss": 0.3475,
"step": 1325
},
{
"epoch": 4.478416508738682,
"grad_norm": 0.0668737342617598,
"learning_rate": 2.6096537333139616e-06,
"loss": 0.3402,
"step": 1326
},
{
"epoch": 4.481785639081912,
"grad_norm": 0.06731813732301019,
"learning_rate": 2.5762395769455183e-06,
"loss": 0.3472,
"step": 1327
},
{
"epoch": 4.485154769425142,
"grad_norm": 0.06962894223132757,
"learning_rate": 2.5430336003443045e-06,
"loss": 0.3411,
"step": 1328
},
{
"epoch": 4.4885238997683725,
"grad_norm": 0.06651868659879541,
"learning_rate": 2.5100359882275526e-06,
"loss": 0.3463,
"step": 1329
},
{
"epoch": 4.491893030111602,
"grad_norm": 0.06589574436809537,
"learning_rate": 2.4772469241533648e-06,
"loss": 0.3449,
"step": 1330
},
{
"epoch": 4.495262160454833,
"grad_norm": 0.06851573366912253,
"learning_rate": 2.444666590519775e-06,
"loss": 0.3478,
"step": 1331
},
{
"epoch": 4.498631290798063,
"grad_norm": 0.06812334086330306,
"learning_rate": 2.4122951685636674e-06,
"loss": 0.3493,
"step": 1332
},
{
"epoch": 4.502000421141293,
"grad_norm": 0.06783544762672909,
"learning_rate": 2.380132838359819e-06,
"loss": 0.3458,
"step": 1333
},
{
"epoch": 4.505369551484523,
"grad_norm": 0.06645851016955091,
"learning_rate": 2.3481797788198745e-06,
"loss": 0.3487,
"step": 1334
},
{
"epoch": 4.5087386818277535,
"grad_norm": 0.06691716361429041,
"learning_rate": 2.3164361676913406e-06,
"loss": 0.3461,
"step": 1335
},
{
"epoch": 4.512107812170983,
"grad_norm": 0.066561132769546,
"learning_rate": 2.284902181556632e-06,
"loss": 0.3451,
"step": 1336
},
{
"epoch": 4.515476942514214,
"grad_norm": 0.06972464277014613,
"learning_rate": 2.2535779958320614e-06,
"loss": 0.3363,
"step": 1337
},
{
"epoch": 4.5188460728574436,
"grad_norm": 0.06662582951723346,
"learning_rate": 2.2224637847668484e-06,
"loss": 0.3462,
"step": 1338
},
{
"epoch": 4.522215203200674,
"grad_norm": 0.06683364588110276,
"learning_rate": 2.1915597214422048e-06,
"loss": 0.345,
"step": 1339
},
{
"epoch": 4.525584333543904,
"grad_norm": 0.06973071855720024,
"learning_rate": 2.1608659777703033e-06,
"loss": 0.3486,
"step": 1340
},
{
"epoch": 4.5289534638871345,
"grad_norm": 0.06547912030107868,
"learning_rate": 2.130382724493405e-06,
"loss": 0.3481,
"step": 1341
},
{
"epoch": 4.532322594230364,
"grad_norm": 0.06796161455803124,
"learning_rate": 2.100110131182813e-06,
"loss": 0.3488,
"step": 1342
},
{
"epoch": 4.535691724573595,
"grad_norm": 0.06643717641974535,
"learning_rate": 2.070048366238e-06,
"loss": 0.3453,
"step": 1343
},
{
"epoch": 4.5390608549168245,
"grad_norm": 0.0657312313993076,
"learning_rate": 2.0401975968856514e-06,
"loss": 0.3364,
"step": 1344
},
{
"epoch": 4.542429985260055,
"grad_norm": 0.0662991056630753,
"learning_rate": 2.010557989178725e-06,
"loss": 0.3456,
"step": 1345
},
{
"epoch": 4.545799115603285,
"grad_norm": 0.06723548381525182,
"learning_rate": 1.981129707995542e-06,
"loss": 0.3428,
"step": 1346
},
{
"epoch": 4.549168245946515,
"grad_norm": 0.06854275132803765,
"learning_rate": 1.9519129170388496e-06,
"loss": 0.3519,
"step": 1347
},
{
"epoch": 4.552537376289745,
"grad_norm": 0.0687917997485082,
"learning_rate": 1.9229077788349393e-06,
"loss": 0.342,
"step": 1348
},
{
"epoch": 4.555906506632976,
"grad_norm": 0.06728042882661939,
"learning_rate": 1.8941144547327228e-06,
"loss": 0.3513,
"step": 1349
},
{
"epoch": 4.5592756369762055,
"grad_norm": 0.06733086071107253,
"learning_rate": 1.865533104902828e-06,
"loss": 0.3432,
"step": 1350
},
{
"epoch": 4.562644767319435,
"grad_norm": 0.06653132755662035,
"learning_rate": 1.8371638883367371e-06,
"loss": 0.3455,
"step": 1351
},
{
"epoch": 4.566013897662666,
"grad_norm": 0.07062467690102314,
"learning_rate": 1.8090069628458583e-06,
"loss": 0.3513,
"step": 1352
},
{
"epoch": 4.5693830280058965,
"grad_norm": 0.06749739958232552,
"learning_rate": 1.7810624850607007e-06,
"loss": 0.3422,
"step": 1353
},
{
"epoch": 4.572752158349126,
"grad_norm": 0.06715174264716953,
"learning_rate": 1.7533306104299663e-06,
"loss": 0.3427,
"step": 1354
},
{
"epoch": 4.576121288692356,
"grad_norm": 0.06825607468688703,
"learning_rate": 1.7258114932196824e-06,
"loss": 0.3484,
"step": 1355
},
{
"epoch": 4.5794904190355865,
"grad_norm": 0.0662384762896948,
"learning_rate": 1.6985052865123641e-06,
"loss": 0.344,
"step": 1356
},
{
"epoch": 4.582859549378816,
"grad_norm": 0.06749795339121123,
"learning_rate": 1.6714121422061636e-06,
"loss": 0.348,
"step": 1357
},
{
"epoch": 4.586228679722047,
"grad_norm": 0.06937799589584792,
"learning_rate": 1.6445322110140116e-06,
"loss": 0.3473,
"step": 1358
},
{
"epoch": 4.589597810065277,
"grad_norm": 0.06748221547140407,
"learning_rate": 1.617865642462766e-06,
"loss": 0.3414,
"step": 1359
},
{
"epoch": 4.592966940408507,
"grad_norm": 0.06814928775630703,
"learning_rate": 1.59141258489242e-06,
"loss": 0.345,
"step": 1360
},
{
"epoch": 4.596336070751737,
"grad_norm": 0.07057379791962957,
"learning_rate": 1.5651731854552466e-06,
"loss": 0.3432,
"step": 1361
},
{
"epoch": 4.5997052010949675,
"grad_norm": 0.06665029276024906,
"learning_rate": 1.53914759011498e-06,
"loss": 0.3524,
"step": 1362
},
{
"epoch": 4.603074331438197,
"grad_norm": 0.06906650342043347,
"learning_rate": 1.513335943646026e-06,
"loss": 0.3457,
"step": 1363
},
{
"epoch": 4.606443461781428,
"grad_norm": 0.06942705785663987,
"learning_rate": 1.4877383896326269e-06,
"loss": 0.3435,
"step": 1364
},
{
"epoch": 4.6098125921246575,
"grad_norm": 0.06819335124159634,
"learning_rate": 1.4623550704680889e-06,
"loss": 0.3508,
"step": 1365
},
{
"epoch": 4.613181722467888,
"grad_norm": 0.06742489592183823,
"learning_rate": 1.4371861273539778e-06,
"loss": 0.3457,
"step": 1366
},
{
"epoch": 4.616550852811118,
"grad_norm": 0.064467972456891,
"learning_rate": 1.4122317002993247e-06,
"loss": 0.3437,
"step": 1367
},
{
"epoch": 4.6199199831543485,
"grad_norm": 0.06450585611276803,
"learning_rate": 1.3874919281198662e-06,
"loss": 0.3471,
"step": 1368
},
{
"epoch": 4.623289113497578,
"grad_norm": 0.06675137780602221,
"learning_rate": 1.3629669484372722e-06,
"loss": 0.3497,
"step": 1369
},
{
"epoch": 4.626658243840809,
"grad_norm": 0.06713388756947067,
"learning_rate": 1.3386568976783453e-06,
"loss": 0.3423,
"step": 1370
},
{
"epoch": 4.6300273741840385,
"grad_norm": 0.0647734710561896,
"learning_rate": 1.3145619110743169e-06,
"loss": 0.3451,
"step": 1371
},
{
"epoch": 4.633396504527269,
"grad_norm": 0.06580879452568121,
"learning_rate": 1.2906821226600453e-06,
"loss": 0.3429,
"step": 1372
},
{
"epoch": 4.636765634870499,
"grad_norm": 0.06578978457756152,
"learning_rate": 1.2670176652733023e-06,
"loss": 0.342,
"step": 1373
},
{
"epoch": 4.6401347652137295,
"grad_norm": 0.06786565921397064,
"learning_rate": 1.2435686705540228e-06,
"loss": 0.3458,
"step": 1374
},
{
"epoch": 4.643503895556959,
"grad_norm": 0.06730192180307096,
"learning_rate": 1.2203352689435532e-06,
"loss": 0.3505,
"step": 1375
},
{
"epoch": 4.64687302590019,
"grad_norm": 0.06442684402191479,
"learning_rate": 1.1973175896839684e-06,
"loss": 0.3417,
"step": 1376
},
{
"epoch": 4.6502421562434195,
"grad_norm": 0.06497046470832643,
"learning_rate": 1.1745157608173253e-06,
"loss": 0.3429,
"step": 1377
},
{
"epoch": 4.65361128658665,
"grad_norm": 0.0655614246650691,
"learning_rate": 1.1519299091849523e-06,
"loss": 0.3405,
"step": 1378
},
{
"epoch": 4.65698041692988,
"grad_norm": 0.06746924444935623,
"learning_rate": 1.1295601604267348e-06,
"loss": 0.347,
"step": 1379
},
{
"epoch": 4.6603495472731105,
"grad_norm": 0.06671677812012947,
"learning_rate": 1.1074066389804395e-06,
"loss": 0.348,
"step": 1380
},
{
"epoch": 4.66371867761634,
"grad_norm": 0.06798688584484958,
"learning_rate": 1.0854694680810175e-06,
"loss": 0.3468,
"step": 1381
},
{
"epoch": 4.667087807959571,
"grad_norm": 0.06373690906496436,
"learning_rate": 1.0637487697598937e-06,
"loss": 0.3391,
"step": 1382
},
{
"epoch": 4.6704569383028005,
"grad_norm": 0.06902986516002681,
"learning_rate": 1.0422446648443142e-06,
"loss": 0.3449,
"step": 1383
},
{
"epoch": 4.673826068646031,
"grad_norm": 0.06783886040134948,
"learning_rate": 1.0209572729566708e-06,
"loss": 0.3469,
"step": 1384
},
{
"epoch": 4.677195198989261,
"grad_norm": 0.06789415607732335,
"learning_rate": 9.998867125138223e-07,
"loss": 0.3483,
"step": 1385
},
{
"epoch": 4.680564329332491,
"grad_norm": 0.06478682570392917,
"learning_rate": 9.790331007264543e-07,
"loss": 0.3465,
"step": 1386
},
{
"epoch": 4.683933459675721,
"grad_norm": 0.06659198241596209,
"learning_rate": 9.583965535983997e-07,
"loss": 0.3377,
"step": 1387
},
{
"epoch": 4.687302590018952,
"grad_norm": 0.06679774424195298,
"learning_rate": 9.379771859260267e-07,
"loss": 0.3474,
"step": 1388
},
{
"epoch": 4.6906717203621815,
"grad_norm": 0.06562337466888649,
"learning_rate": 9.177751112975853e-07,
"loss": 0.3378,
"step": 1389
},
{
"epoch": 4.694040850705411,
"grad_norm": 0.0643058634496552,
"learning_rate": 8.977904420925543e-07,
"loss": 0.3401,
"step": 1390
},
{
"epoch": 4.697409981048642,
"grad_norm": 0.06520681777558435,
"learning_rate": 8.780232894810558e-07,
"loss": 0.3476,
"step": 1391
},
{
"epoch": 4.700779111391872,
"grad_norm": 0.06652677782803126,
"learning_rate": 8.584737634232154e-07,
"loss": 0.3445,
"step": 1392
},
{
"epoch": 4.704148241735102,
"grad_norm": 0.06513347952901734,
"learning_rate": 8.391419726685446e-07,
"loss": 0.3486,
"step": 1393
},
{
"epoch": 4.707517372078332,
"grad_norm": 0.06577657248355921,
"learning_rate": 8.200280247553461e-07,
"loss": 0.3461,
"step": 1394
},
{
"epoch": 4.7108865024215625,
"grad_norm": 0.06369190711960318,
"learning_rate": 8.011320260101052e-07,
"loss": 0.3478,
"step": 1395
},
{
"epoch": 4.714255632764792,
"grad_norm": 0.06569207225402134,
"learning_rate": 7.824540815469306e-07,
"loss": 0.3496,
"step": 1396
},
{
"epoch": 4.717624763108023,
"grad_norm": 0.0636558204987421,
"learning_rate": 7.639942952669232e-07,
"loss": 0.3462,
"step": 1397
},
{
"epoch": 4.7209938934512525,
"grad_norm": 0.06451389941556673,
"learning_rate": 7.457527698576217e-07,
"loss": 0.3454,
"step": 1398
},
{
"epoch": 4.724363023794483,
"grad_norm": 0.06490245056573639,
"learning_rate": 7.277296067924377e-07,
"loss": 0.345,
"step": 1399
},
{
"epoch": 4.727732154137713,
"grad_norm": 0.06421211046867673,
"learning_rate": 7.099249063300751e-07,
"loss": 0.3509,
"step": 1400
},
{
"epoch": 4.7311012844809435,
"grad_norm": 0.06376468633122387,
"learning_rate": 6.923387675139958e-07,
"loss": 0.3449,
"step": 1401
},
{
"epoch": 4.734470414824173,
"grad_norm": 0.06306595288457956,
"learning_rate": 6.749712881718306e-07,
"loss": 0.3438,
"step": 1402
},
{
"epoch": 4.737839545167404,
"grad_norm": 0.064531257088043,
"learning_rate": 6.578225649148806e-07,
"loss": 0.3459,
"step": 1403
},
{
"epoch": 4.7412086755106335,
"grad_norm": 0.06475033645731526,
"learning_rate": 6.408926931375403e-07,
"loss": 0.3489,
"step": 1404
},
{
"epoch": 4.744577805853864,
"grad_norm": 0.06725279891073008,
"learning_rate": 6.241817670167961e-07,
"loss": 0.3517,
"step": 1405
},
{
"epoch": 4.747946936197094,
"grad_norm": 0.06576628806576036,
"learning_rate": 6.076898795116792e-07,
"loss": 0.3476,
"step": 1406
},
{
"epoch": 4.7513160665403245,
"grad_norm": 0.06636084321383787,
"learning_rate": 5.914171223627652e-07,
"loss": 0.3431,
"step": 1407
},
{
"epoch": 4.754685196883554,
"grad_norm": 0.06307439592979396,
"learning_rate": 5.753635860916617e-07,
"loss": 0.344,
"step": 1408
},
{
"epoch": 4.758054327226785,
"grad_norm": 0.06354853186497929,
"learning_rate": 5.595293600004948e-07,
"loss": 0.3452,
"step": 1409
},
{
"epoch": 4.7614234575700145,
"grad_norm": 0.06640861850363539,
"learning_rate": 5.43914532171419e-07,
"loss": 0.3498,
"step": 1410
},
{
"epoch": 4.764792587913245,
"grad_norm": 0.06432227056221736,
"learning_rate": 5.285191894661257e-07,
"loss": 0.3448,
"step": 1411
},
{
"epoch": 4.768161718256475,
"grad_norm": 0.0650298496723325,
"learning_rate": 5.133434175253715e-07,
"loss": 0.348,
"step": 1412
},
{
"epoch": 4.771530848599705,
"grad_norm": 0.0642338741687956,
"learning_rate": 4.983873007684769e-07,
"loss": 0.3504,
"step": 1413
},
{
"epoch": 4.774899978942935,
"grad_norm": 0.06597221985673193,
"learning_rate": 4.83650922392882e-07,
"loss": 0.3443,
"step": 1414
},
{
"epoch": 4.778269109286166,
"grad_norm": 0.06414310328903884,
"learning_rate": 4.691343643736579e-07,
"loss": 0.3498,
"step": 1415
},
{
"epoch": 4.7816382396293955,
"grad_norm": 0.06423727553913079,
"learning_rate": 4.5483770746309383e-07,
"loss": 0.3462,
"step": 1416
},
{
"epoch": 4.785007369972626,
"grad_norm": 0.06712703203955196,
"learning_rate": 4.4076103119018666e-07,
"loss": 0.344,
"step": 1417
},
{
"epoch": 4.788376500315856,
"grad_norm": 0.06406676946222813,
"learning_rate": 4.269044138602585e-07,
"loss": 0.3424,
"step": 1418
},
{
"epoch": 4.791745630659086,
"grad_norm": 0.0650048525731774,
"learning_rate": 4.132679325544775e-07,
"loss": 0.3434,
"step": 1419
},
{
"epoch": 4.795114761002316,
"grad_norm": 0.06381393163242242,
"learning_rate": 3.998516631294491e-07,
"loss": 0.3464,
"step": 1420
},
{
"epoch": 4.798483891345547,
"grad_norm": 0.062168147457412865,
"learning_rate": 3.866556802167942e-07,
"loss": 0.3447,
"step": 1421
},
{
"epoch": 4.8018530216887765,
"grad_norm": 0.06359774281703022,
"learning_rate": 3.736800572227317e-07,
"loss": 0.3452,
"step": 1422
},
{
"epoch": 4.805222152032007,
"grad_norm": 0.06777082256384792,
"learning_rate": 3.6092486632766543e-07,
"loss": 0.3405,
"step": 1423
},
{
"epoch": 4.808591282375237,
"grad_norm": 0.06518391137080269,
"learning_rate": 3.483901784857846e-07,
"loss": 0.3499,
"step": 1424
},
{
"epoch": 4.811960412718467,
"grad_norm": 0.06360491257484012,
"learning_rate": 3.3607606342467293e-07,
"loss": 0.3464,
"step": 1425
},
{
"epoch": 4.815329543061697,
"grad_norm": 0.0630016058736709,
"learning_rate": 3.239825896449267e-07,
"loss": 0.3493,
"step": 1426
},
{
"epoch": 4.818698673404928,
"grad_norm": 0.06424370898677036,
"learning_rate": 3.1210982441974623e-07,
"loss": 0.3424,
"step": 1427
},
{
"epoch": 4.8220678037481575,
"grad_norm": 0.06333420103209184,
"learning_rate": 3.004578337945985e-07,
"loss": 0.3444,
"step": 1428
},
{
"epoch": 4.825436934091387,
"grad_norm": 0.06413449663730773,
"learning_rate": 2.8902668258683043e-07,
"loss": 0.3465,
"step": 1429
},
{
"epoch": 4.828806064434618,
"grad_norm": 0.06372049815441223,
"learning_rate": 2.778164343852918e-07,
"loss": 0.3478,
"step": 1430
},
{
"epoch": 4.832175194777848,
"grad_norm": 0.06414269762017184,
"learning_rate": 2.668271515500287e-07,
"loss": 0.3502,
"step": 1431
},
{
"epoch": 4.835544325121078,
"grad_norm": 0.06533137367117652,
"learning_rate": 2.5605889521188364e-07,
"loss": 0.3491,
"step": 1432
},
{
"epoch": 4.838913455464308,
"grad_norm": 0.06350312986484183,
"learning_rate": 2.455117252721895e-07,
"loss": 0.3453,
"step": 1433
},
{
"epoch": 4.8422825858075385,
"grad_norm": 0.06475788404284327,
"learning_rate": 2.351857004024316e-07,
"loss": 0.3503,
"step": 1434
},
{
"epoch": 4.845651716150769,
"grad_norm": 0.0631781774805789,
"learning_rate": 2.2508087804390178e-07,
"loss": 0.3446,
"step": 1435
},
{
"epoch": 4.849020846493999,
"grad_norm": 0.06379282423784381,
"learning_rate": 2.1519731440740487e-07,
"loss": 0.3474,
"step": 1436
},
{
"epoch": 4.8523899768372285,
"grad_norm": 0.06402172658556064,
"learning_rate": 2.055350644729348e-07,
"loss": 0.3511,
"step": 1437
},
{
"epoch": 4.855759107180459,
"grad_norm": 0.06513215066751245,
"learning_rate": 1.9609418198935916e-07,
"loss": 0.3471,
"step": 1438
},
{
"epoch": 4.859128237523689,
"grad_norm": 0.06283559414952865,
"learning_rate": 1.8687471947413495e-07,
"loss": 0.3446,
"step": 1439
},
{
"epoch": 4.862497367866919,
"grad_norm": 0.06309493725276366,
"learning_rate": 1.778767282130156e-07,
"loss": 0.3431,
"step": 1440
},
{
"epoch": 4.865866498210149,
"grad_norm": 0.06560809934411752,
"learning_rate": 1.691002582597534e-07,
"loss": 0.3526,
"step": 1441
},
{
"epoch": 4.86923562855338,
"grad_norm": 0.06433417193452762,
"learning_rate": 1.6054535843582854e-07,
"loss": 0.3507,
"step": 1442
},
{
"epoch": 4.8726047588966095,
"grad_norm": 0.06442999780392818,
"learning_rate": 1.522120763301782e-07,
"loss": 0.3492,
"step": 1443
},
{
"epoch": 4.87597388923984,
"grad_norm": 0.06306148601810407,
"learning_rate": 1.4410045829893915e-07,
"loss": 0.3434,
"step": 1444
},
{
"epoch": 4.87934301958307,
"grad_norm": 0.06308220046755993,
"learning_rate": 1.3621054946517666e-07,
"loss": 0.3445,
"step": 1445
},
{
"epoch": 4.8827121499263,
"grad_norm": 0.06305097370353915,
"learning_rate": 1.2854239371863142e-07,
"loss": 0.3431,
"step": 1446
},
{
"epoch": 4.88608128026953,
"grad_norm": 0.06293090962933129,
"learning_rate": 1.2109603371548873e-07,
"loss": 0.3397,
"step": 1447
},
{
"epoch": 4.889450410612761,
"grad_norm": 0.06368330582611549,
"learning_rate": 1.1387151087814297e-07,
"loss": 0.3468,
"step": 1448
},
{
"epoch": 4.8928195409559905,
"grad_norm": 0.0642396525858067,
"learning_rate": 1.06868865394949e-07,
"loss": 0.3419,
"step": 1449
},
{
"epoch": 4.896188671299221,
"grad_norm": 0.06286580837917152,
"learning_rate": 1.0008813622001345e-07,
"loss": 0.3465,
"step": 1450
},
{
"epoch": 4.899557801642451,
"grad_norm": 0.0646704999704258,
"learning_rate": 9.352936107296817e-08,
"loss": 0.3515,
"step": 1451
},
{
"epoch": 4.902926931985681,
"grad_norm": 0.06254527612862122,
"learning_rate": 8.719257643877044e-08,
"loss": 0.3418,
"step": 1452
},
{
"epoch": 4.906296062328911,
"grad_norm": 0.06265534232163783,
"learning_rate": 8.107781756749866e-08,
"loss": 0.3417,
"step": 1453
},
{
"epoch": 4.909665192672142,
"grad_norm": 0.06417368994248919,
"learning_rate": 7.51851184741481e-08,
"loss": 0.3451,
"step": 1454
},
{
"epoch": 4.9130343230153715,
"grad_norm": 0.06427635001716354,
"learning_rate": 6.951451193844883e-08,
"loss": 0.3517,
"step": 1455
},
{
"epoch": 4.916403453358602,
"grad_norm": 0.06446286415220177,
"learning_rate": 6.40660295046791e-08,
"loss": 0.3499,
"step": 1456
},
{
"epoch": 4.919772583701832,
"grad_norm": 0.06325304997383964,
"learning_rate": 5.8839701481487875e-08,
"loss": 0.3437,
"step": 1457
},
{
"epoch": 4.923141714045062,
"grad_norm": 0.06376968784671593,
"learning_rate": 5.3835556941743695e-08,
"loss": 0.3423,
"step": 1458
},
{
"epoch": 4.926510844388292,
"grad_norm": 0.06529781285688359,
"learning_rate": 4.905362372234379e-08,
"loss": 0.3492,
"step": 1459
},
{
"epoch": 4.929879974731523,
"grad_norm": 0.06414078488995091,
"learning_rate": 4.449392842408529e-08,
"loss": 0.3479,
"step": 1460
},
{
"epoch": 4.9332491050747524,
"grad_norm": 0.06362859239383568,
"learning_rate": 4.015649641150976e-08,
"loss": 0.3492,
"step": 1461
},
{
"epoch": 4.936618235417983,
"grad_norm": 0.06341769294492185,
"learning_rate": 3.6041351812743374e-08,
"loss": 0.351,
"step": 1462
},
{
"epoch": 4.939987365761213,
"grad_norm": 0.06486183719402762,
"learning_rate": 3.21485175193903e-08,
"loss": 0.3511,
"step": 1463
},
{
"epoch": 4.943356496104443,
"grad_norm": 0.06360741943701602,
"learning_rate": 2.8478015186399477e-08,
"loss": 0.3471,
"step": 1464
},
{
"epoch": 4.946725626447673,
"grad_norm": 0.06343696624954866,
"learning_rate": 2.5029865231922524e-08,
"loss": 0.3448,
"step": 1465
},
{
"epoch": 4.950094756790904,
"grad_norm": 0.06343915127065658,
"learning_rate": 2.1804086837229344e-08,
"loss": 0.3416,
"step": 1466
},
{
"epoch": 4.953463887134133,
"grad_norm": 0.06487303485827695,
"learning_rate": 1.880069794657935e-08,
"loss": 0.3444,
"step": 1467
},
{
"epoch": 4.956833017477363,
"grad_norm": 0.062408603956769386,
"learning_rate": 1.601971526713708e-08,
"loss": 0.341,
"step": 1468
},
{
"epoch": 4.960202147820594,
"grad_norm": 0.06255760369115392,
"learning_rate": 1.3461154268865628e-08,
"loss": 0.3445,
"step": 1469
},
{
"epoch": 4.963571278163824,
"grad_norm": 0.062112638608570706,
"learning_rate": 1.112502918445113e-08,
"loss": 0.3391,
"step": 1470
},
{
"epoch": 4.966940408507054,
"grad_norm": 0.06398681422452646,
"learning_rate": 9.011353009222846e-09,
"loss": 0.3455,
"step": 1471
},
{
"epoch": 4.970309538850284,
"grad_norm": 0.0637738300165632,
"learning_rate": 7.12013750107321e-09,
"loss": 0.3438,
"step": 1472
},
{
"epoch": 4.973678669193514,
"grad_norm": 0.06456086790149927,
"learning_rate": 5.451393180400111e-09,
"loss": 0.3486,
"step": 1473
},
{
"epoch": 4.977047799536745,
"grad_norm": 0.06334490636848067,
"learning_rate": 4.00512933004471e-09,
"loss": 0.3456,
"step": 1474
},
{
"epoch": 4.980416929879975,
"grad_norm": 0.06295292438577572,
"learning_rate": 2.7813539952381563e-09,
"loss": 0.3445,
"step": 1475
},
{
"epoch": 4.9837860602232045,
"grad_norm": 0.0633108315280129,
"learning_rate": 1.7800739835616143e-09,
"loss": 0.3451,
"step": 1476
},
{
"epoch": 4.987155190566435,
"grad_norm": 0.0630218856533905,
"learning_rate": 1.0012948649018584e-09,
"loss": 0.3497,
"step": 1477
},
{
"epoch": 4.990524320909666,
"grad_norm": 0.06351551674205162,
"learning_rate": 4.450209714379483e-10,
"loss": 0.3382,
"step": 1478
},
{
"epoch": 4.993893451252895,
"grad_norm": 0.06362931363383374,
"learning_rate": 1.1125539757905756e-10,
"loss": 0.3436,
"step": 1479
},
{
"epoch": 4.997262581596125,
"grad_norm": 0.0635855435860357,
"learning_rate": 0.0,
"loss": 0.3456,
"step": 1480
},
{
"epoch": 4.997262581596125,
"step": 1480,
"total_flos": 3.94117975967185e+19,
"train_loss": 0.06913654437741718,
"train_runtime": 69116.03,
"train_samples_per_second": 10.993,
"train_steps_per_second": 0.021
}
],
"logging_steps": 1,
"max_steps": 1480,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.94117975967185e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}