WavGPT-1.0 / trainer_state.json
Hack337's picture
Upload 14 files
7620edc verified
Invalid JSON: Unexpected token 'N', ..."ad_norm": NaN, "... is not valid JSON
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 9123,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005480653293872629,
"grad_norm": 9.609877586364746,
"learning_rate": 4.999996294265421e-05,
"loss": 5.868,
"num_input_tokens_seen": 3944,
"step": 5
},
{
"epoch": 0.0010961306587745259,
"grad_norm": 8.435359001159668,
"learning_rate": 4.999985177072669e-05,
"loss": 5.1519,
"num_input_tokens_seen": 7552,
"step": 10
},
{
"epoch": 0.001644195988161789,
"grad_norm": 4.555312156677246,
"learning_rate": 4.999966648454702e-05,
"loss": 4.5297,
"num_input_tokens_seen": 10552,
"step": 15
},
{
"epoch": 0.0021922613175490518,
"grad_norm": 5.34758186340332,
"learning_rate": 4.9999407084664514e-05,
"loss": 4.1016,
"num_input_tokens_seen": 14720,
"step": 20
},
{
"epoch": 0.002740326646936315,
"grad_norm": 4.284458160400391,
"learning_rate": 4.999907357184816e-05,
"loss": 4.0075,
"num_input_tokens_seen": 17648,
"step": 25
},
{
"epoch": 0.003288391976323578,
"grad_norm": 6.062355041503906,
"learning_rate": 4.99986659470867e-05,
"loss": 3.9682,
"num_input_tokens_seen": 21192,
"step": 30
},
{
"epoch": 0.003836457305710841,
"grad_norm": 3.1782262325286865,
"learning_rate": 4.9998184211588574e-05,
"loss": 3.6158,
"num_input_tokens_seen": 24680,
"step": 35
},
{
"epoch": 0.0043845226350981035,
"grad_norm": 4.492194652557373,
"learning_rate": 4.999762836678192e-05,
"loss": 4.4312,
"num_input_tokens_seen": 27304,
"step": 40
},
{
"epoch": 0.004932587964485367,
"grad_norm": 4.35511589050293,
"learning_rate": 4.99969984143146e-05,
"loss": 4.0391,
"num_input_tokens_seen": 29824,
"step": 45
},
{
"epoch": 0.00548065329387263,
"grad_norm": 4.070927619934082,
"learning_rate": 4.999629435605416e-05,
"loss": 3.9559,
"num_input_tokens_seen": 32496,
"step": 50
},
{
"epoch": 0.006028718623259892,
"grad_norm": 3.5581634044647217,
"learning_rate": 4.9995516194087845e-05,
"loss": 3.6342,
"num_input_tokens_seen": 35624,
"step": 55
},
{
"epoch": 0.006576783952647156,
"grad_norm": 3.646406888961792,
"learning_rate": 4.999466393072258e-05,
"loss": 3.8581,
"num_input_tokens_seen": 38896,
"step": 60
},
{
"epoch": 0.007124849282034418,
"grad_norm": 3.964329719543457,
"learning_rate": 4.9993737568484967e-05,
"loss": 4.0054,
"num_input_tokens_seen": 42736,
"step": 65
},
{
"epoch": 0.007672914611421682,
"grad_norm": 4.500335693359375,
"learning_rate": 4.99927371101213e-05,
"loss": 3.3325,
"num_input_tokens_seen": 45256,
"step": 70
},
{
"epoch": 0.008220979940808944,
"grad_norm": 4.3628315925598145,
"learning_rate": 4.999166255859752e-05,
"loss": 3.5725,
"num_input_tokens_seen": 48576,
"step": 75
},
{
"epoch": 0.008769045270196207,
"grad_norm": 3.4167840480804443,
"learning_rate": 4.9990513917099225e-05,
"loss": 3.7729,
"num_input_tokens_seen": 52736,
"step": 80
},
{
"epoch": 0.00931711059958347,
"grad_norm": 4.027678489685059,
"learning_rate": 4.998929118903167e-05,
"loss": 3.7879,
"num_input_tokens_seen": 56256,
"step": 85
},
{
"epoch": 0.009865175928970734,
"grad_norm": 4.3075056076049805,
"learning_rate": 4.9987994378019746e-05,
"loss": 3.5822,
"num_input_tokens_seen": 59448,
"step": 90
},
{
"epoch": 0.010413241258357997,
"grad_norm": 3.550978899002075,
"learning_rate": 4.9986623487907955e-05,
"loss": 3.8015,
"num_input_tokens_seen": 63424,
"step": 95
},
{
"epoch": 0.01096130658774526,
"grad_norm": 3.6582727432250977,
"learning_rate": 4.998517852276042e-05,
"loss": 3.7712,
"num_input_tokens_seen": 66720,
"step": 100
},
{
"epoch": 0.011509371917132522,
"grad_norm": 5.284353733062744,
"learning_rate": 4.9983659486860865e-05,
"loss": 3.5192,
"num_input_tokens_seen": 69280,
"step": 105
},
{
"epoch": 0.012057437246519784,
"grad_norm": 3.712407350540161,
"learning_rate": 4.998206638471261e-05,
"loss": 3.9006,
"num_input_tokens_seen": 72488,
"step": 110
},
{
"epoch": 0.012605502575907049,
"grad_norm": 5.380141258239746,
"learning_rate": 4.9980399221038544e-05,
"loss": 3.7691,
"num_input_tokens_seen": 75728,
"step": 115
},
{
"epoch": 0.013153567905294311,
"grad_norm": 6.7210693359375,
"learning_rate": 4.997865800078112e-05,
"loss": 3.4306,
"num_input_tokens_seen": 78456,
"step": 120
},
{
"epoch": 0.013701633234681574,
"grad_norm": 3.6822457313537598,
"learning_rate": 4.997684272910233e-05,
"loss": 3.7098,
"num_input_tokens_seen": 81912,
"step": 125
},
{
"epoch": 0.014249698564068837,
"grad_norm": 4.587904453277588,
"learning_rate": 4.997495341138373e-05,
"loss": 3.7503,
"num_input_tokens_seen": 85768,
"step": 130
},
{
"epoch": 0.0147977638934561,
"grad_norm": 4.4221510887146,
"learning_rate": 4.997299005322634e-05,
"loss": 3.6916,
"num_input_tokens_seen": 89744,
"step": 135
},
{
"epoch": 0.015345829222843364,
"grad_norm": 4.955567359924316,
"learning_rate": 4.9970952660450734e-05,
"loss": 3.8345,
"num_input_tokens_seen": 93584,
"step": 140
},
{
"epoch": 0.015893894552230625,
"grad_norm": 3.8360307216644287,
"learning_rate": 4.996884123909692e-05,
"loss": 3.8622,
"num_input_tokens_seen": 96880,
"step": 145
},
{
"epoch": 0.01644195988161789,
"grad_norm": 4.293831825256348,
"learning_rate": 4.996665579542439e-05,
"loss": 3.6978,
"num_input_tokens_seen": 99736,
"step": 150
},
{
"epoch": 0.016990025211005153,
"grad_norm": 3.8615922927856445,
"learning_rate": 4.99643963359121e-05,
"loss": 3.7886,
"num_input_tokens_seen": 102768,
"step": 155
},
{
"epoch": 0.017538090540392414,
"grad_norm": 4.592337608337402,
"learning_rate": 4.996206286725841e-05,
"loss": 3.4776,
"num_input_tokens_seen": 107960,
"step": 160
},
{
"epoch": 0.01808615586977968,
"grad_norm": 5.695650577545166,
"learning_rate": 4.995965539638108e-05,
"loss": 3.9904,
"num_input_tokens_seen": 110712,
"step": 165
},
{
"epoch": 0.01863422119916694,
"grad_norm": 6.341024398803711,
"learning_rate": 4.995717393041729e-05,
"loss": 3.727,
"num_input_tokens_seen": 114496,
"step": 170
},
{
"epoch": 0.019182286528554204,
"grad_norm": 5.523504734039307,
"learning_rate": 4.995461847672354e-05,
"loss": 3.5366,
"num_input_tokens_seen": 118408,
"step": 175
},
{
"epoch": 0.019730351857941468,
"grad_norm": 4.576908111572266,
"learning_rate": 4.995198904287572e-05,
"loss": 3.4552,
"num_input_tokens_seen": 122024,
"step": 180
},
{
"epoch": 0.02027841718732873,
"grad_norm": 4.912643909454346,
"learning_rate": 4.9949285636669e-05,
"loss": 3.878,
"num_input_tokens_seen": 125680,
"step": 185
},
{
"epoch": 0.020826482516715993,
"grad_norm": 3.790379047393799,
"learning_rate": 4.994650826611787e-05,
"loss": 3.7852,
"num_input_tokens_seen": 129056,
"step": 190
},
{
"epoch": 0.021374547846103254,
"grad_norm": 4.877086162567139,
"learning_rate": 4.9943656939456094e-05,
"loss": 3.7977,
"num_input_tokens_seen": 132072,
"step": 195
},
{
"epoch": 0.02192261317549052,
"grad_norm": 4.675802230834961,
"learning_rate": 4.994073166513667e-05,
"loss": 3.6024,
"num_input_tokens_seen": 134448,
"step": 200
},
{
"epoch": 0.022470678504877783,
"grad_norm": 9.45524787902832,
"learning_rate": 4.9937732451831845e-05,
"loss": 3.9247,
"num_input_tokens_seen": 137808,
"step": 205
},
{
"epoch": 0.023018743834265044,
"grad_norm": 4.349103927612305,
"learning_rate": 4.9934659308433024e-05,
"loss": 3.5971,
"num_input_tokens_seen": 140752,
"step": 210
},
{
"epoch": 0.023566809163652308,
"grad_norm": 3.90029239654541,
"learning_rate": 4.993151224405084e-05,
"loss": 3.656,
"num_input_tokens_seen": 143328,
"step": 215
},
{
"epoch": 0.02411487449303957,
"grad_norm": 3.4128267765045166,
"learning_rate": 4.992829126801502e-05,
"loss": 3.7457,
"num_input_tokens_seen": 146792,
"step": 220
},
{
"epoch": 0.024662939822426833,
"grad_norm": 5.266091346740723,
"learning_rate": 4.9924996389874435e-05,
"loss": 3.3972,
"num_input_tokens_seen": 150352,
"step": 225
},
{
"epoch": 0.025211005151814098,
"grad_norm": 3.7570605278015137,
"learning_rate": 4.992162761939704e-05,
"loss": 2.8386,
"num_input_tokens_seen": 153688,
"step": 230
},
{
"epoch": 0.02575907048120136,
"grad_norm": 3.587785243988037,
"learning_rate": 4.991818496656986e-05,
"loss": 3.909,
"num_input_tokens_seen": 156824,
"step": 235
},
{
"epoch": 0.026307135810588623,
"grad_norm": 4.7243757247924805,
"learning_rate": 4.991466844159893e-05,
"loss": 3.7806,
"num_input_tokens_seen": 159728,
"step": 240
},
{
"epoch": 0.026855201139975884,
"grad_norm": 4.537757396697998,
"learning_rate": 4.99110780549093e-05,
"loss": 3.7949,
"num_input_tokens_seen": 162456,
"step": 245
},
{
"epoch": 0.027403266469363148,
"grad_norm": 5.187793731689453,
"learning_rate": 4.990741381714498e-05,
"loss": 3.7304,
"num_input_tokens_seen": 165176,
"step": 250
},
{
"epoch": 0.027951331798750412,
"grad_norm": 5.144887447357178,
"learning_rate": 4.990367573916894e-05,
"loss": 3.7232,
"num_input_tokens_seen": 168824,
"step": 255
},
{
"epoch": 0.028499397128137673,
"grad_norm": 5.238748550415039,
"learning_rate": 4.989986383206302e-05,
"loss": 3.5484,
"num_input_tokens_seen": 172512,
"step": 260
},
{
"epoch": 0.029047462457524938,
"grad_norm": 4.251674652099609,
"learning_rate": 4.9895978107127975e-05,
"loss": 3.3929,
"num_input_tokens_seen": 175544,
"step": 265
},
{
"epoch": 0.0295955277869122,
"grad_norm": 7.541206359863281,
"learning_rate": 4.9892018575883354e-05,
"loss": 3.5038,
"num_input_tokens_seen": 178784,
"step": 270
},
{
"epoch": 0.030143593116299463,
"grad_norm": 3.8806400299072266,
"learning_rate": 4.988798525006755e-05,
"loss": 3.9488,
"num_input_tokens_seen": 181112,
"step": 275
},
{
"epoch": 0.030691658445686727,
"grad_norm": 3.7918715476989746,
"learning_rate": 4.988387814163771e-05,
"loss": 3.4375,
"num_input_tokens_seen": 185416,
"step": 280
},
{
"epoch": 0.031239723775073988,
"grad_norm": 4.9953813552856445,
"learning_rate": 4.9879697262769706e-05,
"loss": 3.7866,
"num_input_tokens_seen": 188528,
"step": 285
},
{
"epoch": 0.03178778910446125,
"grad_norm": 4.683384418487549,
"learning_rate": 4.9875442625858125e-05,
"loss": 3.4738,
"num_input_tokens_seen": 191472,
"step": 290
},
{
"epoch": 0.03233585443384852,
"grad_norm": 3.5414726734161377,
"learning_rate": 4.987111424351622e-05,
"loss": 3.6306,
"num_input_tokens_seen": 195416,
"step": 295
},
{
"epoch": 0.03288391976323578,
"grad_norm": 6.5463547706604,
"learning_rate": 4.9866712128575855e-05,
"loss": 3.6409,
"num_input_tokens_seen": 198576,
"step": 300
},
{
"epoch": 0.03343198509262304,
"grad_norm": 4.8504180908203125,
"learning_rate": 4.9862236294087485e-05,
"loss": 3.9698,
"num_input_tokens_seen": 201432,
"step": 305
},
{
"epoch": 0.033980050422010306,
"grad_norm": 4.2637739181518555,
"learning_rate": 4.98576867533201e-05,
"loss": 3.4978,
"num_input_tokens_seen": 204776,
"step": 310
},
{
"epoch": 0.03452811575139757,
"grad_norm": 6.201929569244385,
"learning_rate": 4.9853063519761234e-05,
"loss": 3.5306,
"num_input_tokens_seen": 207984,
"step": 315
},
{
"epoch": 0.03507618108078483,
"grad_norm": 5.745614528656006,
"learning_rate": 4.984836660711686e-05,
"loss": 3.4114,
"num_input_tokens_seen": 211304,
"step": 320
},
{
"epoch": 0.035624246410172096,
"grad_norm": 7.258711338043213,
"learning_rate": 4.9843596029311386e-05,
"loss": 3.5909,
"num_input_tokens_seen": 214680,
"step": 325
},
{
"epoch": 0.03617231173955936,
"grad_norm": 5.421024799346924,
"learning_rate": 4.9838751800487606e-05,
"loss": 3.9625,
"num_input_tokens_seen": 217472,
"step": 330
},
{
"epoch": 0.03672037706894662,
"grad_norm": 4.33311653137207,
"learning_rate": 4.983383393500667e-05,
"loss": 3.1581,
"num_input_tokens_seen": 220824,
"step": 335
},
{
"epoch": 0.03726844239833388,
"grad_norm": 3.667479991912842,
"learning_rate": 4.982884244744801e-05,
"loss": 3.6578,
"num_input_tokens_seen": 224464,
"step": 340
},
{
"epoch": 0.037816507727721146,
"grad_norm": 4.797352313995361,
"learning_rate": 4.982377735260933e-05,
"loss": 3.4615,
"num_input_tokens_seen": 228120,
"step": 345
},
{
"epoch": 0.03836457305710841,
"grad_norm": 6.432485103607178,
"learning_rate": 4.981863866550656e-05,
"loss": 3.7862,
"num_input_tokens_seen": 231112,
"step": 350
},
{
"epoch": 0.03891263838649567,
"grad_norm": 5.501232624053955,
"learning_rate": 4.981342640137377e-05,
"loss": 3.5962,
"num_input_tokens_seen": 234456,
"step": 355
},
{
"epoch": 0.039460703715882936,
"grad_norm": 4.993545055389404,
"learning_rate": 4.9808140575663186e-05,
"loss": 3.4178,
"num_input_tokens_seen": 237744,
"step": 360
},
{
"epoch": 0.0400087690452702,
"grad_norm": 4.6652421951293945,
"learning_rate": 4.98027812040451e-05,
"loss": 3.3215,
"num_input_tokens_seen": 240240,
"step": 365
},
{
"epoch": 0.04055683437465746,
"grad_norm": 7.660661220550537,
"learning_rate": 4.979734830240784e-05,
"loss": 3.4482,
"num_input_tokens_seen": 243344,
"step": 370
},
{
"epoch": 0.041104899704044726,
"grad_norm": 5.362435340881348,
"learning_rate": 4.979184188685772e-05,
"loss": 3.6152,
"num_input_tokens_seen": 246928,
"step": 375
},
{
"epoch": 0.041652965033431986,
"grad_norm": 4.019466876983643,
"learning_rate": 4.9786261973718984e-05,
"loss": 3.4659,
"num_input_tokens_seen": 250592,
"step": 380
},
{
"epoch": 0.04220103036281925,
"grad_norm": 3.5128304958343506,
"learning_rate": 4.9780608579533774e-05,
"loss": 3.369,
"num_input_tokens_seen": 254136,
"step": 385
},
{
"epoch": 0.04274909569220651,
"grad_norm": 5.328804969787598,
"learning_rate": 4.9774881721062083e-05,
"loss": 3.396,
"num_input_tokens_seen": 257000,
"step": 390
},
{
"epoch": 0.043297161021593776,
"grad_norm": 3.9344732761383057,
"learning_rate": 4.976908141528168e-05,
"loss": 3.5748,
"num_input_tokens_seen": 259544,
"step": 395
},
{
"epoch": 0.04384522635098104,
"grad_norm": 6.34092903137207,
"learning_rate": 4.976320767938808e-05,
"loss": 3.2784,
"num_input_tokens_seen": 262648,
"step": 400
},
{
"epoch": 0.0443932916803683,
"grad_norm": 6.228747367858887,
"learning_rate": 4.975726053079448e-05,
"loss": 3.7733,
"num_input_tokens_seen": 265800,
"step": 405
},
{
"epoch": 0.044941357009755566,
"grad_norm": 6.360103130340576,
"learning_rate": 4.9751239987131735e-05,
"loss": 3.3795,
"num_input_tokens_seen": 268352,
"step": 410
},
{
"epoch": 0.045489422339142827,
"grad_norm": 5.080907821655273,
"learning_rate": 4.9745146066248275e-05,
"loss": 3.4467,
"num_input_tokens_seen": 271416,
"step": 415
},
{
"epoch": 0.04603748766853009,
"grad_norm": 4.075165271759033,
"learning_rate": 4.973897878621005e-05,
"loss": 3.4581,
"num_input_tokens_seen": 274912,
"step": 420
},
{
"epoch": 0.046585552997917355,
"grad_norm": 4.517000675201416,
"learning_rate": 4.973273816530051e-05,
"loss": 3.3681,
"num_input_tokens_seen": 279184,
"step": 425
},
{
"epoch": 0.047133618327304616,
"grad_norm": 5.66272497177124,
"learning_rate": 4.9726424222020527e-05,
"loss": 3.8983,
"num_input_tokens_seen": 283008,
"step": 430
},
{
"epoch": 0.04768168365669188,
"grad_norm": 5.277008056640625,
"learning_rate": 4.9720036975088334e-05,
"loss": 3.8482,
"num_input_tokens_seen": 285408,
"step": 435
},
{
"epoch": 0.04822974898607914,
"grad_norm": 5.911515235900879,
"learning_rate": 4.971357644343948e-05,
"loss": 3.7086,
"num_input_tokens_seen": 287672,
"step": 440
},
{
"epoch": 0.048777814315466406,
"grad_norm": 5.71356725692749,
"learning_rate": 4.9707042646226784e-05,
"loss": 3.7235,
"num_input_tokens_seen": 290608,
"step": 445
},
{
"epoch": 0.04932587964485367,
"grad_norm": 4.606592178344727,
"learning_rate": 4.9700435602820276e-05,
"loss": 3.5481,
"num_input_tokens_seen": 293688,
"step": 450
},
{
"epoch": 0.04987394497424093,
"grad_norm": 5.814152240753174,
"learning_rate": 4.969375533280708e-05,
"loss": 3.38,
"num_input_tokens_seen": 297160,
"step": 455
},
{
"epoch": 0.050422010303628195,
"grad_norm": 5.669627666473389,
"learning_rate": 4.968700185599147e-05,
"loss": 3.5052,
"num_input_tokens_seen": 300608,
"step": 460
},
{
"epoch": 0.050970075633015456,
"grad_norm": 4.943079471588135,
"learning_rate": 4.96801751923947e-05,
"loss": 3.5689,
"num_input_tokens_seen": 303680,
"step": 465
},
{
"epoch": 0.05151814096240272,
"grad_norm": 5.5774664878845215,
"learning_rate": 4.9673275362255035e-05,
"loss": 3.1872,
"num_input_tokens_seen": 306664,
"step": 470
},
{
"epoch": 0.052066206291789985,
"grad_norm": 5.742215633392334,
"learning_rate": 4.966630238602761e-05,
"loss": 3.873,
"num_input_tokens_seen": 310024,
"step": 475
},
{
"epoch": 0.052614271621177246,
"grad_norm": 5.4475507736206055,
"learning_rate": 4.9659256284384434e-05,
"loss": 3.5306,
"num_input_tokens_seen": 313296,
"step": 480
},
{
"epoch": 0.05316233695056451,
"grad_norm": 5.270495414733887,
"learning_rate": 4.965213707821428e-05,
"loss": 3.3911,
"num_input_tokens_seen": 317528,
"step": 485
},
{
"epoch": 0.05371040227995177,
"grad_norm": 4.345836639404297,
"learning_rate": 4.964494478862267e-05,
"loss": 3.338,
"num_input_tokens_seen": 320224,
"step": 490
},
{
"epoch": 0.054258467609339035,
"grad_norm": 8.715791702270508,
"learning_rate": 4.963767943693178e-05,
"loss": 3.6676,
"num_input_tokens_seen": 323576,
"step": 495
},
{
"epoch": 0.054806532938726296,
"grad_norm": 6.43541955947876,
"learning_rate": 4.9630341044680375e-05,
"loss": 3.4779,
"num_input_tokens_seen": 326840,
"step": 500
},
{
"epoch": 0.05535459826811356,
"grad_norm": 5.299740314483643,
"learning_rate": 4.962292963362376e-05,
"loss": 3.0794,
"num_input_tokens_seen": 330400,
"step": 505
},
{
"epoch": 0.055902663597500825,
"grad_norm": 5.377191543579102,
"learning_rate": 4.9615445225733714e-05,
"loss": 3.3778,
"num_input_tokens_seen": 334264,
"step": 510
},
{
"epoch": 0.056450728926888086,
"grad_norm": 4.671337127685547,
"learning_rate": 4.9607887843198417e-05,
"loss": 3.2423,
"num_input_tokens_seen": 338632,
"step": 515
},
{
"epoch": 0.05699879425627535,
"grad_norm": 4.917747497558594,
"learning_rate": 4.960025750842241e-05,
"loss": 3.2912,
"num_input_tokens_seen": 341576,
"step": 520
},
{
"epoch": 0.057546859585662614,
"grad_norm": 5.633148670196533,
"learning_rate": 4.959255424402647e-05,
"loss": 3.9649,
"num_input_tokens_seen": 343752,
"step": 525
},
{
"epoch": 0.058094924915049875,
"grad_norm": 5.843842506408691,
"learning_rate": 4.9584778072847605e-05,
"loss": 3.5301,
"num_input_tokens_seen": 346768,
"step": 530
},
{
"epoch": 0.058642990244437136,
"grad_norm": 6.019566059112549,
"learning_rate": 4.957692901793896e-05,
"loss": 3.7123,
"num_input_tokens_seen": 349488,
"step": 535
},
{
"epoch": 0.0591910555738244,
"grad_norm": 5.83019495010376,
"learning_rate": 4.9569007102569746e-05,
"loss": 4.0987,
"num_input_tokens_seen": 353448,
"step": 540
},
{
"epoch": 0.059739120903211665,
"grad_norm": 7.744917392730713,
"learning_rate": 4.9561012350225174e-05,
"loss": 3.4271,
"num_input_tokens_seen": 357336,
"step": 545
},
{
"epoch": 0.060287186232598926,
"grad_norm": 6.845799922943115,
"learning_rate": 4.955294478460638e-05,
"loss": 3.7176,
"num_input_tokens_seen": 361272,
"step": 550
},
{
"epoch": 0.06083525156198619,
"grad_norm": 7.8909592628479,
"learning_rate": 4.954480442963038e-05,
"loss": 3.3092,
"num_input_tokens_seen": 364048,
"step": 555
},
{
"epoch": 0.061383316891373454,
"grad_norm": 6.57379674911499,
"learning_rate": 4.953659130942997e-05,
"loss": 4.0073,
"num_input_tokens_seen": 368336,
"step": 560
},
{
"epoch": 0.061931382220760715,
"grad_norm": 5.875579833984375,
"learning_rate": 4.952830544835366e-05,
"loss": 3.4651,
"num_input_tokens_seen": 370824,
"step": 565
},
{
"epoch": 0.062479447550147976,
"grad_norm": 5.310330867767334,
"learning_rate": 4.951994687096562e-05,
"loss": 3.8036,
"num_input_tokens_seen": 374104,
"step": 570
},
{
"epoch": 0.06302751287953524,
"grad_norm": 6.611202239990234,
"learning_rate": 4.9511515602045563e-05,
"loss": 3.2939,
"num_input_tokens_seen": 376176,
"step": 575
},
{
"epoch": 0.0635755782089225,
"grad_norm": 4.5933451652526855,
"learning_rate": 4.950301166658875e-05,
"loss": 3.529,
"num_input_tokens_seen": 378600,
"step": 580
},
{
"epoch": 0.06412364353830977,
"grad_norm": 5.080543518066406,
"learning_rate": 4.9494435089805835e-05,
"loss": 4.0958,
"num_input_tokens_seen": 382584,
"step": 585
},
{
"epoch": 0.06467170886769703,
"grad_norm": 4.658755779266357,
"learning_rate": 4.948578589712283e-05,
"loss": 3.3213,
"num_input_tokens_seen": 386376,
"step": 590
},
{
"epoch": 0.06521977419708429,
"grad_norm": 5.556814670562744,
"learning_rate": 4.9477064114181026e-05,
"loss": 3.5986,
"num_input_tokens_seen": 390784,
"step": 595
},
{
"epoch": 0.06576783952647156,
"grad_norm": 6.1433491706848145,
"learning_rate": 4.946826976683691e-05,
"loss": 3.4305,
"num_input_tokens_seen": 395104,
"step": 600
},
{
"epoch": 0.06631590485585882,
"grad_norm": 4.176370143890381,
"learning_rate": 4.9459402881162095e-05,
"loss": 3.6053,
"num_input_tokens_seen": 398072,
"step": 605
},
{
"epoch": 0.06686397018524608,
"grad_norm": 4.746314525604248,
"learning_rate": 4.945046348344325e-05,
"loss": 3.4613,
"num_input_tokens_seen": 401112,
"step": 610
},
{
"epoch": 0.06741203551463334,
"grad_norm": 6.04541015625,
"learning_rate": 4.9441451600182e-05,
"loss": 3.3843,
"num_input_tokens_seen": 404728,
"step": 615
},
{
"epoch": 0.06796010084402061,
"grad_norm": 4.687957763671875,
"learning_rate": 4.943236725809485e-05,
"loss": 3.6494,
"num_input_tokens_seen": 407824,
"step": 620
},
{
"epoch": 0.06850816617340787,
"grad_norm": 5.392053604125977,
"learning_rate": 4.942321048411314e-05,
"loss": 3.7716,
"num_input_tokens_seen": 410064,
"step": 625
},
{
"epoch": 0.06905623150279513,
"grad_norm": 5.196096420288086,
"learning_rate": 4.9413981305382936e-05,
"loss": 3.7037,
"num_input_tokens_seen": 413664,
"step": 630
},
{
"epoch": 0.0696042968321824,
"grad_norm": 4.464987754821777,
"learning_rate": 4.940467974926493e-05,
"loss": 3.0886,
"num_input_tokens_seen": 416752,
"step": 635
},
{
"epoch": 0.07015236216156966,
"grad_norm": 4.81376838684082,
"learning_rate": 4.939530584333441e-05,
"loss": 3.11,
"num_input_tokens_seen": 420552,
"step": 640
},
{
"epoch": 0.07070042749095692,
"grad_norm": 5.184936046600342,
"learning_rate": 4.938585961538115e-05,
"loss": 3.1776,
"num_input_tokens_seen": 423200,
"step": 645
},
{
"epoch": 0.07124849282034419,
"grad_norm": 7.05800724029541,
"learning_rate": 4.9376341093409305e-05,
"loss": 3.2882,
"num_input_tokens_seen": 426840,
"step": 650
},
{
"epoch": 0.07179655814973145,
"grad_norm": 7.437703609466553,
"learning_rate": 4.9366750305637385e-05,
"loss": 3.3796,
"num_input_tokens_seen": 430168,
"step": 655
},
{
"epoch": 0.07234462347911871,
"grad_norm": 7.665436744689941,
"learning_rate": 4.9357087280498105e-05,
"loss": 3.6646,
"num_input_tokens_seen": 433080,
"step": 660
},
{
"epoch": 0.07289268880850597,
"grad_norm": 7.2700324058532715,
"learning_rate": 4.934735204663835e-05,
"loss": 3.4558,
"num_input_tokens_seen": 436600,
"step": 665
},
{
"epoch": 0.07344075413789324,
"grad_norm": 4.932444095611572,
"learning_rate": 4.9337544632919085e-05,
"loss": 3.1135,
"num_input_tokens_seen": 439552,
"step": 670
},
{
"epoch": 0.0739888194672805,
"grad_norm": 6.515824794769287,
"learning_rate": 4.9327665068415254e-05,
"loss": 3.3952,
"num_input_tokens_seen": 442776,
"step": 675
},
{
"epoch": 0.07453688479666776,
"grad_norm": 6.392978668212891,
"learning_rate": 4.931771338241566e-05,
"loss": 3.5728,
"num_input_tokens_seen": 445344,
"step": 680
},
{
"epoch": 0.07508495012605503,
"grad_norm": 5.692570209503174,
"learning_rate": 4.930768960442299e-05,
"loss": 3.3921,
"num_input_tokens_seen": 449360,
"step": 685
},
{
"epoch": 0.07563301545544229,
"grad_norm": 10.294317245483398,
"learning_rate": 4.929759376415358e-05,
"loss": 3.6814,
"num_input_tokens_seen": 452736,
"step": 690
},
{
"epoch": 0.07618108078482955,
"grad_norm": 7.613968849182129,
"learning_rate": 4.9287425891537454e-05,
"loss": 3.5298,
"num_input_tokens_seen": 455648,
"step": 695
},
{
"epoch": 0.07672914611421681,
"grad_norm": 5.538883209228516,
"learning_rate": 4.927718601671816e-05,
"loss": 3.4538,
"num_input_tokens_seen": 458256,
"step": 700
},
{
"epoch": 0.07727721144360408,
"grad_norm": 5.105963706970215,
"learning_rate": 4.926687417005268e-05,
"loss": 3.3759,
"num_input_tokens_seen": 461984,
"step": 705
},
{
"epoch": 0.07782527677299134,
"grad_norm": 5.424991130828857,
"learning_rate": 4.925649038211142e-05,
"loss": 3.4941,
"num_input_tokens_seen": 465216,
"step": 710
},
{
"epoch": 0.0783733421023786,
"grad_norm": 6.287330627441406,
"learning_rate": 4.924603468367801e-05,
"loss": 3.3536,
"num_input_tokens_seen": 468496,
"step": 715
},
{
"epoch": 0.07892140743176587,
"grad_norm": 7.270327568054199,
"learning_rate": 4.923550710574929e-05,
"loss": 3.1898,
"num_input_tokens_seen": 471784,
"step": 720
},
{
"epoch": 0.07946947276115313,
"grad_norm": 5.402751922607422,
"learning_rate": 4.922490767953519e-05,
"loss": 3.7645,
"num_input_tokens_seen": 474928,
"step": 725
},
{
"epoch": 0.0800175380905404,
"grad_norm": 5.472609996795654,
"learning_rate": 4.921423643645863e-05,
"loss": 3.5023,
"num_input_tokens_seen": 479376,
"step": 730
},
{
"epoch": 0.08056560341992766,
"grad_norm": 4.318566799163818,
"learning_rate": 4.9203493408155455e-05,
"loss": 3.1444,
"num_input_tokens_seen": 482328,
"step": 735
},
{
"epoch": 0.08111366874931492,
"grad_norm": 6.903258800506592,
"learning_rate": 4.919267862647431e-05,
"loss": 3.8837,
"num_input_tokens_seen": 486248,
"step": 740
},
{
"epoch": 0.08166173407870218,
"grad_norm": 4.821303844451904,
"learning_rate": 4.918179212347657e-05,
"loss": 3.7363,
"num_input_tokens_seen": 489736,
"step": 745
},
{
"epoch": 0.08220979940808945,
"grad_norm": 4.108252048492432,
"learning_rate": 4.917083393143621e-05,
"loss": 3.0709,
"num_input_tokens_seen": 492784,
"step": 750
},
{
"epoch": 0.0827578647374767,
"grad_norm": 6.259218215942383,
"learning_rate": 4.915980408283977e-05,
"loss": 3.4733,
"num_input_tokens_seen": 496528,
"step": 755
},
{
"epoch": 0.08330593006686397,
"grad_norm": 5.9338531494140625,
"learning_rate": 4.91487026103862e-05,
"loss": 3.8987,
"num_input_tokens_seen": 500832,
"step": 760
},
{
"epoch": 0.08385399539625123,
"grad_norm": 5.397777557373047,
"learning_rate": 4.913752954698677e-05,
"loss": 3.3764,
"num_input_tokens_seen": 503744,
"step": 765
},
{
"epoch": 0.0844020607256385,
"grad_norm": 5.536934852600098,
"learning_rate": 4.912628492576503e-05,
"loss": 3.7953,
"num_input_tokens_seen": 507656,
"step": 770
},
{
"epoch": 0.08495012605502576,
"grad_norm": 5.932541847229004,
"learning_rate": 4.9114968780056635e-05,
"loss": 3.4254,
"num_input_tokens_seen": 511216,
"step": 775
},
{
"epoch": 0.08549819138441302,
"grad_norm": 5.971353530883789,
"learning_rate": 4.910358114340929e-05,
"loss": 3.6466,
"num_input_tokens_seen": 514328,
"step": 780
},
{
"epoch": 0.08604625671380028,
"grad_norm": 8.010024070739746,
"learning_rate": 4.9092122049582636e-05,
"loss": 3.9475,
"num_input_tokens_seen": 518200,
"step": 785
},
{
"epoch": 0.08659432204318755,
"grad_norm": 6.520806312561035,
"learning_rate": 4.9080591532548175e-05,
"loss": 3.4056,
"num_input_tokens_seen": 521704,
"step": 790
},
{
"epoch": 0.0871423873725748,
"grad_norm": 5.646440029144287,
"learning_rate": 4.9068989626489126e-05,
"loss": 3.5912,
"num_input_tokens_seen": 524456,
"step": 795
},
{
"epoch": 0.08769045270196207,
"grad_norm": 4.937885284423828,
"learning_rate": 4.9057316365800366e-05,
"loss": 3.4854,
"num_input_tokens_seen": 526920,
"step": 800
},
{
"epoch": 0.08823851803134934,
"grad_norm": 6.204067230224609,
"learning_rate": 4.904557178508829e-05,
"loss": 3.3649,
"num_input_tokens_seen": 530544,
"step": 805
},
{
"epoch": 0.0887865833607366,
"grad_norm": 6.427296161651611,
"learning_rate": 4.9033755919170733e-05,
"loss": 3.8582,
"num_input_tokens_seen": 532832,
"step": 810
},
{
"epoch": 0.08933464869012386,
"grad_norm": 7.1010589599609375,
"learning_rate": 4.9021868803076875e-05,
"loss": 3.5353,
"num_input_tokens_seen": 536056,
"step": 815
},
{
"epoch": 0.08988271401951113,
"grad_norm": 4.813199043273926,
"learning_rate": 4.900991047204712e-05,
"loss": 3.2529,
"num_input_tokens_seen": 539248,
"step": 820
},
{
"epoch": 0.09043077934889839,
"grad_norm": 7.545267581939697,
"learning_rate": 4.899788096153297e-05,
"loss": 3.0758,
"num_input_tokens_seen": 543584,
"step": 825
},
{
"epoch": 0.09097884467828565,
"grad_norm": 5.574884414672852,
"learning_rate": 4.898578030719698e-05,
"loss": 3.0291,
"num_input_tokens_seen": 546792,
"step": 830
},
{
"epoch": 0.09152691000767292,
"grad_norm": 5.587398529052734,
"learning_rate": 4.897360854491259e-05,
"loss": 3.2747,
"num_input_tokens_seen": 549296,
"step": 835
},
{
"epoch": 0.09207497533706017,
"grad_norm": 6.558215618133545,
"learning_rate": 4.896136571076406e-05,
"loss": 3.4765,
"num_input_tokens_seen": 551784,
"step": 840
},
{
"epoch": 0.09262304066644744,
"grad_norm": 5.221803188323975,
"learning_rate": 4.894905184104634e-05,
"loss": 3.3299,
"num_input_tokens_seen": 555608,
"step": 845
},
{
"epoch": 0.09317110599583471,
"grad_norm": null,
"learning_rate": 4.8939149624187016e-05,
"loss": 3.5208,
"num_input_tokens_seen": 558848,
"step": 850
},
{
"epoch": 0.09371917132522196,
"grad_norm": 5.915983200073242,
"learning_rate": 4.8926707982580194e-05,
"loss": 3.5031,
"num_input_tokens_seen": 562384,
"step": 855
},
{
"epoch": 0.09426723665460923,
"grad_norm": 6.868443965911865,
"learning_rate": 4.891419540815006e-05,
"loss": 3.5194,
"num_input_tokens_seen": 565648,
"step": 860
},
{
"epoch": 0.09481530198399649,
"grad_norm": 6.696837902069092,
"learning_rate": 4.8901611937991244e-05,
"loss": 3.4405,
"num_input_tokens_seen": 568384,
"step": 865
},
{
"epoch": 0.09536336731338375,
"grad_norm": 6.879650592803955,
"learning_rate": 4.8888957609408535e-05,
"loss": 3.2062,
"num_input_tokens_seen": 571184,
"step": 870
},
{
"epoch": 0.09591143264277102,
"grad_norm": 5.235931396484375,
"learning_rate": 4.8876232459916805e-05,
"loss": 3.351,
"num_input_tokens_seen": 575328,
"step": 875
},
{
"epoch": 0.09645949797215828,
"grad_norm": 6.496284008026123,
"learning_rate": 4.886343652724088e-05,
"loss": 3.3753,
"num_input_tokens_seen": 578520,
"step": 880
},
{
"epoch": 0.09700756330154554,
"grad_norm": 8.708456039428711,
"learning_rate": 4.8850569849315414e-05,
"loss": 3.4456,
"num_input_tokens_seen": 581688,
"step": 885
},
{
"epoch": 0.09755562863093281,
"grad_norm": 5.558722496032715,
"learning_rate": 4.883763246428481e-05,
"loss": 3.3753,
"num_input_tokens_seen": 584736,
"step": 890
},
{
"epoch": 0.09810369396032007,
"grad_norm": 6.443663597106934,
"learning_rate": 4.882462441050308e-05,
"loss": 3.5381,
"num_input_tokens_seen": 587952,
"step": 895
},
{
"epoch": 0.09865175928970733,
"grad_norm": 6.3144073486328125,
"learning_rate": 4.881154572653373e-05,
"loss": 3.5416,
"num_input_tokens_seen": 590704,
"step": 900
},
{
"epoch": 0.0991998246190946,
"grad_norm": 5.615172386169434,
"learning_rate": 4.8798396451149676e-05,
"loss": 3.5944,
"num_input_tokens_seen": 593056,
"step": 905
},
{
"epoch": 0.09974788994848185,
"grad_norm": 6.011329174041748,
"learning_rate": 4.8785176623333094e-05,
"loss": 3.2378,
"num_input_tokens_seen": 596584,
"step": 910
},
{
"epoch": 0.10029595527786912,
"grad_norm": 5.445102214813232,
"learning_rate": 4.8771886282275324e-05,
"loss": 3.6375,
"num_input_tokens_seen": 600080,
"step": 915
},
{
"epoch": 0.10084402060725639,
"grad_norm": 6.635453701019287,
"learning_rate": 4.875852546737675e-05,
"loss": 3.5498,
"num_input_tokens_seen": 602696,
"step": 920
},
{
"epoch": 0.10139208593664364,
"grad_norm": 5.236489772796631,
"learning_rate": 4.874509421824667e-05,
"loss": 3.4216,
"num_input_tokens_seen": 606200,
"step": 925
},
{
"epoch": 0.10194015126603091,
"grad_norm": 6.734245300292969,
"learning_rate": 4.87315925747032e-05,
"loss": 3.3747,
"num_input_tokens_seen": 609848,
"step": 930
},
{
"epoch": 0.10248821659541818,
"grad_norm": 6.802552223205566,
"learning_rate": 4.871802057677315e-05,
"loss": 3.2441,
"num_input_tokens_seen": 613440,
"step": 935
},
{
"epoch": 0.10303628192480543,
"grad_norm": 6.780172824859619,
"learning_rate": 4.8704378264691894e-05,
"loss": 3.4606,
"num_input_tokens_seen": 617088,
"step": 940
},
{
"epoch": 0.1035843472541927,
"grad_norm": 6.527922630310059,
"learning_rate": 4.869066567890327e-05,
"loss": 3.4019,
"num_input_tokens_seen": 619952,
"step": 945
},
{
"epoch": 0.10413241258357997,
"grad_norm": 6.2412214279174805,
"learning_rate": 4.867688286005944e-05,
"loss": 3.2408,
"num_input_tokens_seen": 623088,
"step": 950
},
{
"epoch": 0.10468047791296722,
"grad_norm": 6.477228164672852,
"learning_rate": 4.8663029849020775e-05,
"loss": 3.2491,
"num_input_tokens_seen": 626376,
"step": 955
},
{
"epoch": 0.10522854324235449,
"grad_norm": 5.359529495239258,
"learning_rate": 4.864910668685574e-05,
"loss": 3.1534,
"num_input_tokens_seen": 628800,
"step": 960
},
{
"epoch": 0.10577660857174175,
"grad_norm": 5.2979960441589355,
"learning_rate": 4.863511341484077e-05,
"loss": 3.4653,
"num_input_tokens_seen": 631312,
"step": 965
},
{
"epoch": 0.10632467390112901,
"grad_norm": 12.67263126373291,
"learning_rate": 4.8621050074460136e-05,
"loss": 3.8407,
"num_input_tokens_seen": 634144,
"step": 970
},
{
"epoch": 0.10687273923051628,
"grad_norm": 4.020299434661865,
"learning_rate": 4.860691670740587e-05,
"loss": 3.6273,
"num_input_tokens_seen": 637568,
"step": 975
},
{
"epoch": 0.10742080455990353,
"grad_norm": 5.12907075881958,
"learning_rate": 4.8592713355577555e-05,
"loss": 2.9803,
"num_input_tokens_seen": 640368,
"step": 980
},
{
"epoch": 0.1079688698892908,
"grad_norm": 5.088891983032227,
"learning_rate": 4.8578440061082275e-05,
"loss": 3.0532,
"num_input_tokens_seen": 643928,
"step": 985
},
{
"epoch": 0.10851693521867807,
"grad_norm": 6.150454521179199,
"learning_rate": 4.856409686623447e-05,
"loss": 3.5733,
"num_input_tokens_seen": 648192,
"step": 990
},
{
"epoch": 0.10906500054806532,
"grad_norm": 6.601188659667969,
"learning_rate": 4.85496838135558e-05,
"loss": 3.4824,
"num_input_tokens_seen": 652272,
"step": 995
},
{
"epoch": 0.10961306587745259,
"grad_norm": 6.9974141120910645,
"learning_rate": 4.8535200945775016e-05,
"loss": 3.516,
"num_input_tokens_seen": 655696,
"step": 1000
},
{
"epoch": 0.11016113120683986,
"grad_norm": 7.116706371307373,
"learning_rate": 4.8520648305827855e-05,
"loss": 3.4208,
"num_input_tokens_seen": 658560,
"step": 1005
},
{
"epoch": 0.11070919653622711,
"grad_norm": 5.209189414978027,
"learning_rate": 4.850602593685689e-05,
"loss": 3.353,
"num_input_tokens_seen": 662152,
"step": 1010
},
{
"epoch": 0.11125726186561438,
"grad_norm": 5.9092278480529785,
"learning_rate": 4.8491333882211416e-05,
"loss": 3.2833,
"num_input_tokens_seen": 665968,
"step": 1015
},
{
"epoch": 0.11180532719500165,
"grad_norm": 7.026948928833008,
"learning_rate": 4.847657218544732e-05,
"loss": 3.291,
"num_input_tokens_seen": 668808,
"step": 1020
},
{
"epoch": 0.1123533925243889,
"grad_norm": 6.154213905334473,
"learning_rate": 4.8461740890326936e-05,
"loss": 3.3035,
"num_input_tokens_seen": 672280,
"step": 1025
},
{
"epoch": 0.11290145785377617,
"grad_norm": 6.6929521560668945,
"learning_rate": 4.844684004081895e-05,
"loss": 3.6387,
"num_input_tokens_seen": 675184,
"step": 1030
},
{
"epoch": 0.11344952318316344,
"grad_norm": 5.449969291687012,
"learning_rate": 4.843186968109823e-05,
"loss": 3.1393,
"num_input_tokens_seen": 677824,
"step": 1035
},
{
"epoch": 0.1139975885125507,
"grad_norm": 3.6720149517059326,
"learning_rate": 4.841682985554573e-05,
"loss": 3.2646,
"num_input_tokens_seen": 682856,
"step": 1040
},
{
"epoch": 0.11454565384193796,
"grad_norm": 5.606584072113037,
"learning_rate": 4.8401720608748324e-05,
"loss": 3.3697,
"num_input_tokens_seen": 687680,
"step": 1045
},
{
"epoch": 0.11509371917132523,
"grad_norm": 5.044498920440674,
"learning_rate": 4.83865419854987e-05,
"loss": 3.3275,
"num_input_tokens_seen": 690616,
"step": 1050
},
{
"epoch": 0.11564178450071248,
"grad_norm": 5.938497543334961,
"learning_rate": 4.83712940307952e-05,
"loss": 3.1055,
"num_input_tokens_seen": 693808,
"step": 1055
},
{
"epoch": 0.11618984983009975,
"grad_norm": 7.216318607330322,
"learning_rate": 4.8355976789841754e-05,
"loss": 3.5388,
"num_input_tokens_seen": 696992,
"step": 1060
},
{
"epoch": 0.116737915159487,
"grad_norm": 5.2063164710998535,
"learning_rate": 4.834059030804764e-05,
"loss": 3.3436,
"num_input_tokens_seen": 700448,
"step": 1065
},
{
"epoch": 0.11728598048887427,
"grad_norm": 6.457626819610596,
"learning_rate": 4.832513463102745e-05,
"loss": 3.281,
"num_input_tokens_seen": 702928,
"step": 1070
},
{
"epoch": 0.11783404581826154,
"grad_norm": 5.837212562561035,
"learning_rate": 4.8309609804600886e-05,
"loss": 3.3414,
"num_input_tokens_seen": 707064,
"step": 1075
},
{
"epoch": 0.1183821111476488,
"grad_norm": 5.227325439453125,
"learning_rate": 4.829401587479265e-05,
"loss": 3.0907,
"num_input_tokens_seen": 711056,
"step": 1080
},
{
"epoch": 0.11893017647703606,
"grad_norm": 7.185408115386963,
"learning_rate": 4.8278352887832326e-05,
"loss": 3.159,
"num_input_tokens_seen": 714472,
"step": 1085
},
{
"epoch": 0.11947824180642333,
"grad_norm": 7.311601638793945,
"learning_rate": 4.82626208901542e-05,
"loss": 3.5405,
"num_input_tokens_seen": 717400,
"step": 1090
},
{
"epoch": 0.12002630713581058,
"grad_norm": 4.9710693359375,
"learning_rate": 4.824681992839717e-05,
"loss": 3.3058,
"num_input_tokens_seen": 720472,
"step": 1095
},
{
"epoch": 0.12057437246519785,
"grad_norm": 4.5781779289245605,
"learning_rate": 4.823095004940456e-05,
"loss": 3.1374,
"num_input_tokens_seen": 723808,
"step": 1100
},
{
"epoch": 0.12112243779458512,
"grad_norm": 6.077118396759033,
"learning_rate": 4.8215011300224027e-05,
"loss": 3.1628,
"num_input_tokens_seen": 727576,
"step": 1105
},
{
"epoch": 0.12167050312397237,
"grad_norm": 6.6747870445251465,
"learning_rate": 4.819900372810739e-05,
"loss": 3.5095,
"num_input_tokens_seen": 730536,
"step": 1110
},
{
"epoch": 0.12221856845335964,
"grad_norm": 5.468014240264893,
"learning_rate": 4.818292738051049e-05,
"loss": 3.521,
"num_input_tokens_seen": 733024,
"step": 1115
},
{
"epoch": 0.12276663378274691,
"grad_norm": 6.263638019561768,
"learning_rate": 4.816678230509308e-05,
"loss": 3.2318,
"num_input_tokens_seen": 736048,
"step": 1120
},
{
"epoch": 0.12331469911213416,
"grad_norm": 5.998656272888184,
"learning_rate": 4.8150568549718655e-05,
"loss": 3.0286,
"num_input_tokens_seen": 739264,
"step": 1125
},
{
"epoch": 0.12386276444152143,
"grad_norm": 6.395206928253174,
"learning_rate": 4.81342861624543e-05,
"loss": 3.4223,
"num_input_tokens_seen": 742008,
"step": 1130
},
{
"epoch": 0.1244108297709087,
"grad_norm": 6.199779510498047,
"learning_rate": 4.811793519157059e-05,
"loss": 3.5237,
"num_input_tokens_seen": 745064,
"step": 1135
},
{
"epoch": 0.12495889510029595,
"grad_norm": 6.504228115081787,
"learning_rate": 4.81015156855414e-05,
"loss": 3.4249,
"num_input_tokens_seen": 748104,
"step": 1140
},
{
"epoch": 0.1255069604296832,
"grad_norm": 6.280592441558838,
"learning_rate": 4.80850276930438e-05,
"loss": 3.0411,
"num_input_tokens_seen": 752032,
"step": 1145
},
{
"epoch": 0.1260550257590705,
"grad_norm": 8.529096603393555,
"learning_rate": 4.806847126295789e-05,
"loss": 3.1457,
"num_input_tokens_seen": 755400,
"step": 1150
},
{
"epoch": 0.12660309108845774,
"grad_norm": 6.454196453094482,
"learning_rate": 4.8051846444366676e-05,
"loss": 3.0008,
"num_input_tokens_seen": 758392,
"step": 1155
},
{
"epoch": 0.127151156417845,
"grad_norm": 6.862017631530762,
"learning_rate": 4.803515328655586e-05,
"loss": 3.3972,
"num_input_tokens_seen": 760824,
"step": 1160
},
{
"epoch": 0.12769922174723228,
"grad_norm": 6.56373929977417,
"learning_rate": 4.8018391839013784e-05,
"loss": 3.4338,
"num_input_tokens_seen": 763680,
"step": 1165
},
{
"epoch": 0.12824728707661953,
"grad_norm": 5.431229114532471,
"learning_rate": 4.800156215143124e-05,
"loss": 3.2619,
"num_input_tokens_seen": 767352,
"step": 1170
},
{
"epoch": 0.12879535240600679,
"grad_norm": 5.761483192443848,
"learning_rate": 4.7984664273701305e-05,
"loss": 3.3616,
"num_input_tokens_seen": 771096,
"step": 1175
},
{
"epoch": 0.12934341773539407,
"grad_norm": 7.804869651794434,
"learning_rate": 4.796769825591921e-05,
"loss": 3.2658,
"num_input_tokens_seen": 774192,
"step": 1180
},
{
"epoch": 0.12989148306478132,
"grad_norm": 5.688300609588623,
"learning_rate": 4.7950664148382205e-05,
"loss": 3.7069,
"num_input_tokens_seen": 777712,
"step": 1185
},
{
"epoch": 0.13043954839416858,
"grad_norm": 4.980658054351807,
"learning_rate": 4.793356200158941e-05,
"loss": 3.0386,
"num_input_tokens_seen": 780680,
"step": 1190
},
{
"epoch": 0.13098761372355586,
"grad_norm": 6.9450249671936035,
"learning_rate": 4.791639186624162e-05,
"loss": 3.4293,
"num_input_tokens_seen": 783664,
"step": 1195
},
{
"epoch": 0.1315356790529431,
"grad_norm": 6.7938408851623535,
"learning_rate": 4.789915379324121e-05,
"loss": 3.2908,
"num_input_tokens_seen": 787480,
"step": 1200
},
{
"epoch": 0.13208374438233036,
"grad_norm": 5.833454608917236,
"learning_rate": 4.788184783369196e-05,
"loss": 3.3431,
"num_input_tokens_seen": 791560,
"step": 1205
},
{
"epoch": 0.13263180971171765,
"grad_norm": 6.020946502685547,
"learning_rate": 4.786447403889891e-05,
"loss": 3.1235,
"num_input_tokens_seen": 794600,
"step": 1210
},
{
"epoch": 0.1331798750411049,
"grad_norm": 9.639689445495605,
"learning_rate": 4.78470324603682e-05,
"loss": 3.357,
"num_input_tokens_seen": 796976,
"step": 1215
},
{
"epoch": 0.13372794037049215,
"grad_norm": 5.102296829223633,
"learning_rate": 4.782952314980691e-05,
"loss": 3.4762,
"num_input_tokens_seen": 801208,
"step": 1220
},
{
"epoch": 0.13427600569987944,
"grad_norm": 6.015713214874268,
"learning_rate": 4.781194615912292e-05,
"loss": 3.2738,
"num_input_tokens_seen": 804472,
"step": 1225
},
{
"epoch": 0.1348240710292667,
"grad_norm": 7.88398551940918,
"learning_rate": 4.7794301540424774e-05,
"loss": 3.3333,
"num_input_tokens_seen": 807568,
"step": 1230
},
{
"epoch": 0.13537213635865394,
"grad_norm": 6.841670989990234,
"learning_rate": 4.7776589346021486e-05,
"loss": 3.5167,
"num_input_tokens_seen": 811016,
"step": 1235
},
{
"epoch": 0.13592020168804123,
"grad_norm": 6.089728355407715,
"learning_rate": 4.775880962842241e-05,
"loss": 3.703,
"num_input_tokens_seen": 814536,
"step": 1240
},
{
"epoch": 0.13646826701742848,
"grad_norm": 6.35260009765625,
"learning_rate": 4.774096244033707e-05,
"loss": 3.1131,
"num_input_tokens_seen": 817496,
"step": 1245
},
{
"epoch": 0.13701633234681573,
"grad_norm": 5.8579254150390625,
"learning_rate": 4.772304783467503e-05,
"loss": 3.2992,
"num_input_tokens_seen": 821712,
"step": 1250
},
{
"epoch": 0.13756439767620302,
"grad_norm": 5.486454963684082,
"learning_rate": 4.7705065864545695e-05,
"loss": 3.1721,
"num_input_tokens_seen": 824688,
"step": 1255
},
{
"epoch": 0.13811246300559027,
"grad_norm": 6.544208526611328,
"learning_rate": 4.7687016583258203e-05,
"loss": 3.4493,
"num_input_tokens_seen": 828400,
"step": 1260
},
{
"epoch": 0.13866052833497752,
"grad_norm": 4.948637008666992,
"learning_rate": 4.7668900044321236e-05,
"loss": 3.0927,
"num_input_tokens_seen": 831936,
"step": 1265
},
{
"epoch": 0.1392085936643648,
"grad_norm": 6.64813756942749,
"learning_rate": 4.7650716301442856e-05,
"loss": 3.6065,
"num_input_tokens_seen": 834912,
"step": 1270
},
{
"epoch": 0.13975665899375206,
"grad_norm": 7.289310455322266,
"learning_rate": 4.763246540853035e-05,
"loss": 3.3871,
"num_input_tokens_seen": 839072,
"step": 1275
},
{
"epoch": 0.1403047243231393,
"grad_norm": 5.887922763824463,
"learning_rate": 4.761414741969011e-05,
"loss": 3.1424,
"num_input_tokens_seen": 842568,
"step": 1280
},
{
"epoch": 0.1408527896525266,
"grad_norm": 6.820570468902588,
"learning_rate": 4.7595762389227406e-05,
"loss": 3.0197,
"num_input_tokens_seen": 845808,
"step": 1285
},
{
"epoch": 0.14140085498191385,
"grad_norm": 6.593437671661377,
"learning_rate": 4.757731037164628e-05,
"loss": 3.2013,
"num_input_tokens_seen": 849184,
"step": 1290
},
{
"epoch": 0.1419489203113011,
"grad_norm": 8.89852523803711,
"learning_rate": 4.7558791421649354e-05,
"loss": 3.5085,
"num_input_tokens_seen": 852392,
"step": 1295
},
{
"epoch": 0.14249698564068838,
"grad_norm": 7.368271827697754,
"learning_rate": 4.754020559413768e-05,
"loss": 3.3167,
"num_input_tokens_seen": 855376,
"step": 1300
},
{
"epoch": 0.14304505097007564,
"grad_norm": 5.54932975769043,
"learning_rate": 4.752155294421056e-05,
"loss": 3.0516,
"num_input_tokens_seen": 858720,
"step": 1305
},
{
"epoch": 0.1435931162994629,
"grad_norm": 8.180092811584473,
"learning_rate": 4.750283352716543e-05,
"loss": 3.4647,
"num_input_tokens_seen": 861312,
"step": 1310
},
{
"epoch": 0.14414118162885015,
"grad_norm": 6.608414173126221,
"learning_rate": 4.748404739849763e-05,
"loss": 3.3686,
"num_input_tokens_seen": 864368,
"step": 1315
},
{
"epoch": 0.14468924695823743,
"grad_norm": 6.880706787109375,
"learning_rate": 4.746519461390029e-05,
"loss": 3.0061,
"num_input_tokens_seen": 868000,
"step": 1320
},
{
"epoch": 0.14523731228762468,
"grad_norm": 4.034643650054932,
"learning_rate": 4.744627522926414e-05,
"loss": 3.3709,
"num_input_tokens_seen": 871648,
"step": 1325
},
{
"epoch": 0.14578537761701194,
"grad_norm": 5.335696220397949,
"learning_rate": 4.742728930067736e-05,
"loss": 3.0955,
"num_input_tokens_seen": 875440,
"step": 1330
},
{
"epoch": 0.14633344294639922,
"grad_norm": 8.005532264709473,
"learning_rate": 4.7408236884425396e-05,
"loss": 3.6277,
"num_input_tokens_seen": 879208,
"step": 1335
},
{
"epoch": 0.14688150827578647,
"grad_norm": 7.770083904266357,
"learning_rate": 4.7389118036990795e-05,
"loss": 3.5794,
"num_input_tokens_seen": 882040,
"step": 1340
},
{
"epoch": 0.14742957360517372,
"grad_norm": 6.539053916931152,
"learning_rate": 4.736993281505307e-05,
"loss": 3.2326,
"num_input_tokens_seen": 884984,
"step": 1345
},
{
"epoch": 0.147977638934561,
"grad_norm": 7.831300258636475,
"learning_rate": 4.73506812754885e-05,
"loss": 3.2767,
"num_input_tokens_seen": 888128,
"step": 1350
},
{
"epoch": 0.14852570426394826,
"grad_norm": 5.242404937744141,
"learning_rate": 4.733136347536995e-05,
"loss": 3.4698,
"num_input_tokens_seen": 890520,
"step": 1355
},
{
"epoch": 0.14907376959333551,
"grad_norm": 5.803912162780762,
"learning_rate": 4.731197947196673e-05,
"loss": 3.4711,
"num_input_tokens_seen": 893464,
"step": 1360
},
{
"epoch": 0.1496218349227228,
"grad_norm": 8.300127983093262,
"learning_rate": 4.7292529322744416e-05,
"loss": 3.2302,
"num_input_tokens_seen": 897520,
"step": 1365
},
{
"epoch": 0.15016990025211005,
"grad_norm": 5.02566385269165,
"learning_rate": 4.7273013085364694e-05,
"loss": 3.2959,
"num_input_tokens_seen": 901416,
"step": 1370
},
{
"epoch": 0.1507179655814973,
"grad_norm": 4.600845813751221,
"learning_rate": 4.725343081768514e-05,
"loss": 3.3303,
"num_input_tokens_seen": 906432,
"step": 1375
},
{
"epoch": 0.15126603091088459,
"grad_norm": 6.849578380584717,
"learning_rate": 4.723378257775912e-05,
"loss": 3.1125,
"num_input_tokens_seen": 909264,
"step": 1380
},
{
"epoch": 0.15181409624027184,
"grad_norm": 7.15298318862915,
"learning_rate": 4.7214068423835566e-05,
"loss": 3.2795,
"num_input_tokens_seen": 912464,
"step": 1385
},
{
"epoch": 0.1523621615696591,
"grad_norm": 5.415898323059082,
"learning_rate": 4.7194288414358804e-05,
"loss": 3.1385,
"num_input_tokens_seen": 915960,
"step": 1390
},
{
"epoch": 0.15291022689904638,
"grad_norm": 6.559721946716309,
"learning_rate": 4.717444260796841e-05,
"loss": 3.4027,
"num_input_tokens_seen": 918984,
"step": 1395
},
{
"epoch": 0.15345829222843363,
"grad_norm": 5.312758922576904,
"learning_rate": 4.715453106349902e-05,
"loss": 3.4349,
"num_input_tokens_seen": 921912,
"step": 1400
},
{
"epoch": 0.15400635755782088,
"grad_norm": 6.985774040222168,
"learning_rate": 4.7134553839980143e-05,
"loss": 3.7019,
"num_input_tokens_seen": 925848,
"step": 1405
},
{
"epoch": 0.15455442288720816,
"grad_norm": 6.191575527191162,
"learning_rate": 4.711451099663603e-05,
"loss": 3.4276,
"num_input_tokens_seen": 929792,
"step": 1410
},
{
"epoch": 0.15510248821659542,
"grad_norm": 6.040350437164307,
"learning_rate": 4.709440259288542e-05,
"loss": 2.9173,
"num_input_tokens_seen": 932400,
"step": 1415
},
{
"epoch": 0.15565055354598267,
"grad_norm": 6.164414405822754,
"learning_rate": 4.707422868834146e-05,
"loss": 3.1684,
"num_input_tokens_seen": 935408,
"step": 1420
},
{
"epoch": 0.15619861887536995,
"grad_norm": 7.248453140258789,
"learning_rate": 4.705398934281145e-05,
"loss": 3.6365,
"num_input_tokens_seen": 938184,
"step": 1425
},
{
"epoch": 0.1567466842047572,
"grad_norm": 5.813863754272461,
"learning_rate": 4.70336846162967e-05,
"loss": 3.405,
"num_input_tokens_seen": 941272,
"step": 1430
},
{
"epoch": 0.15729474953414446,
"grad_norm": 6.239504337310791,
"learning_rate": 4.701331456899236e-05,
"loss": 3.0722,
"num_input_tokens_seen": 944728,
"step": 1435
},
{
"epoch": 0.15784281486353174,
"grad_norm": 9.224727630615234,
"learning_rate": 4.6992879261287226e-05,
"loss": 3.2262,
"num_input_tokens_seen": 947528,
"step": 1440
},
{
"epoch": 0.158390880192919,
"grad_norm": 7.570671558380127,
"learning_rate": 4.6972378753763545e-05,
"loss": 3.2116,
"num_input_tokens_seen": 950128,
"step": 1445
},
{
"epoch": 0.15893894552230625,
"grad_norm": 4.781320095062256,
"learning_rate": 4.6951813107196874e-05,
"loss": 3.2953,
"num_input_tokens_seen": 954336,
"step": 1450
},
{
"epoch": 0.15948701085169353,
"grad_norm": 7.117349147796631,
"learning_rate": 4.693118238255587e-05,
"loss": 3.2755,
"num_input_tokens_seen": 957704,
"step": 1455
},
{
"epoch": 0.1600350761810808,
"grad_norm": 6.41115665435791,
"learning_rate": 4.6910486641002136e-05,
"loss": 3.2523,
"num_input_tokens_seen": 960184,
"step": 1460
},
{
"epoch": 0.16058314151046804,
"grad_norm": 8.865285873413086,
"learning_rate": 4.688972594389001e-05,
"loss": 3.3998,
"num_input_tokens_seen": 963264,
"step": 1465
},
{
"epoch": 0.16113120683985532,
"grad_norm": 4.722679615020752,
"learning_rate": 4.6868900352766394e-05,
"loss": 3.0958,
"num_input_tokens_seen": 966536,
"step": 1470
},
{
"epoch": 0.16167927216924258,
"grad_norm": 8.334817886352539,
"learning_rate": 4.6848009929370575e-05,
"loss": 3.2969,
"num_input_tokens_seen": 969008,
"step": 1475
},
{
"epoch": 0.16222733749862983,
"grad_norm": 6.063559055328369,
"learning_rate": 4.682705473563406e-05,
"loss": 3.0186,
"num_input_tokens_seen": 972168,
"step": 1480
},
{
"epoch": 0.1627754028280171,
"grad_norm": 6.434414386749268,
"learning_rate": 4.680603483368033e-05,
"loss": 3.4689,
"num_input_tokens_seen": 976096,
"step": 1485
},
{
"epoch": 0.16332346815740437,
"grad_norm": 8.82730770111084,
"learning_rate": 4.678495028582476e-05,
"loss": 3.2562,
"num_input_tokens_seen": 979080,
"step": 1490
},
{
"epoch": 0.16387153348679162,
"grad_norm": 6.3244171142578125,
"learning_rate": 4.676380115457431e-05,
"loss": 3.0127,
"num_input_tokens_seen": 981896,
"step": 1495
},
{
"epoch": 0.1644195988161789,
"grad_norm": 6.033606052398682,
"learning_rate": 4.674258750262745e-05,
"loss": 3.1823,
"num_input_tokens_seen": 985072,
"step": 1500
},
{
"epoch": 0.16496766414556616,
"grad_norm": 4.211119174957275,
"learning_rate": 4.6721309392873926e-05,
"loss": 3.1351,
"num_input_tokens_seen": 987448,
"step": 1505
},
{
"epoch": 0.1655157294749534,
"grad_norm": 6.105933666229248,
"learning_rate": 4.669996688839453e-05,
"loss": 3.2884,
"num_input_tokens_seen": 990840,
"step": 1510
},
{
"epoch": 0.16606379480434066,
"grad_norm": 8.247055053710938,
"learning_rate": 4.6678560052460994e-05,
"loss": 3.1378,
"num_input_tokens_seen": 994768,
"step": 1515
},
{
"epoch": 0.16661186013372795,
"grad_norm": 5.653783798217773,
"learning_rate": 4.6657088948535776e-05,
"loss": 3.7376,
"num_input_tokens_seen": 997840,
"step": 1520
},
{
"epoch": 0.1671599254631152,
"grad_norm": 5.42575216293335,
"learning_rate": 4.6635553640271835e-05,
"loss": 3.4831,
"num_input_tokens_seen": 1000536,
"step": 1525
},
{
"epoch": 0.16770799079250245,
"grad_norm": 7.640921115875244,
"learning_rate": 4.6613954191512474e-05,
"loss": 3.5714,
"num_input_tokens_seen": 1003952,
"step": 1530
},
{
"epoch": 0.16825605612188974,
"grad_norm": 5.931758880615234,
"learning_rate": 4.6592290666291163e-05,
"loss": 3.4493,
"num_input_tokens_seen": 1006544,
"step": 1535
},
{
"epoch": 0.168804121451277,
"grad_norm": 4.96866512298584,
"learning_rate": 4.657056312883132e-05,
"loss": 3.0963,
"num_input_tokens_seen": 1009920,
"step": 1540
},
{
"epoch": 0.16935218678066424,
"grad_norm": 7.009856224060059,
"learning_rate": 4.6548771643546134e-05,
"loss": 3.0819,
"num_input_tokens_seen": 1012544,
"step": 1545
},
{
"epoch": 0.16990025211005153,
"grad_norm": 6.719354629516602,
"learning_rate": 4.652691627503837e-05,
"loss": 3.3187,
"num_input_tokens_seen": 1015248,
"step": 1550
},
{
"epoch": 0.17044831743943878,
"grad_norm": 7.1751837730407715,
"learning_rate": 4.650499708810018e-05,
"loss": 3.6579,
"num_input_tokens_seen": 1018720,
"step": 1555
},
{
"epoch": 0.17099638276882603,
"grad_norm": 11.277824401855469,
"learning_rate": 4.648301414771293e-05,
"loss": 3.5192,
"num_input_tokens_seen": 1021424,
"step": 1560
},
{
"epoch": 0.17154444809821331,
"grad_norm": 9.307093620300293,
"learning_rate": 4.646096751904696e-05,
"loss": 3.2431,
"num_input_tokens_seen": 1024192,
"step": 1565
},
{
"epoch": 0.17209251342760057,
"grad_norm": 6.657312393188477,
"learning_rate": 4.643885726746143e-05,
"loss": 3.1878,
"num_input_tokens_seen": 1027600,
"step": 1570
},
{
"epoch": 0.17264057875698782,
"grad_norm": 5.908510208129883,
"learning_rate": 4.641668345850414e-05,
"loss": 3.67,
"num_input_tokens_seen": 1030168,
"step": 1575
},
{
"epoch": 0.1731886440863751,
"grad_norm": 6.540554046630859,
"learning_rate": 4.639444615791128e-05,
"loss": 2.9285,
"num_input_tokens_seen": 1034472,
"step": 1580
},
{
"epoch": 0.17373670941576236,
"grad_norm": 6.857239723205566,
"learning_rate": 4.6372145431607264e-05,
"loss": 3.3879,
"num_input_tokens_seen": 1038520,
"step": 1585
},
{
"epoch": 0.1742847747451496,
"grad_norm": 5.343799591064453,
"learning_rate": 4.634978134570456e-05,
"loss": 3.3824,
"num_input_tokens_seen": 1041864,
"step": 1590
},
{
"epoch": 0.1748328400745369,
"grad_norm": 5.971281051635742,
"learning_rate": 4.632735396650346e-05,
"loss": 3.5344,
"num_input_tokens_seen": 1045192,
"step": 1595
},
{
"epoch": 0.17538090540392415,
"grad_norm": 5.474274158477783,
"learning_rate": 4.6304863360491906e-05,
"loss": 3.0682,
"num_input_tokens_seen": 1048680,
"step": 1600
},
{
"epoch": 0.1759289707333114,
"grad_norm": 6.720623970031738,
"learning_rate": 4.6282309594345266e-05,
"loss": 3.0808,
"num_input_tokens_seen": 1051776,
"step": 1605
},
{
"epoch": 0.17647703606269868,
"grad_norm": 6.88260555267334,
"learning_rate": 4.625969273492614e-05,
"loss": 3.5346,
"num_input_tokens_seen": 1054256,
"step": 1610
},
{
"epoch": 0.17702510139208594,
"grad_norm": 6.154021263122559,
"learning_rate": 4.623701284928421e-05,
"loss": 3.2947,
"num_input_tokens_seen": 1057536,
"step": 1615
},
{
"epoch": 0.1775731667214732,
"grad_norm": 6.108212471008301,
"learning_rate": 4.6214270004655985e-05,
"loss": 3.3287,
"num_input_tokens_seen": 1060872,
"step": 1620
},
{
"epoch": 0.17812123205086047,
"grad_norm": 4.82647705078125,
"learning_rate": 4.6191464268464614e-05,
"loss": 3.3231,
"num_input_tokens_seen": 1063536,
"step": 1625
},
{
"epoch": 0.17866929738024773,
"grad_norm": 6.965377330780029,
"learning_rate": 4.61685957083197e-05,
"loss": 3.5096,
"num_input_tokens_seen": 1066392,
"step": 1630
},
{
"epoch": 0.17921736270963498,
"grad_norm": 7.133657455444336,
"learning_rate": 4.6145664392017096e-05,
"loss": 3.2534,
"num_input_tokens_seen": 1068920,
"step": 1635
},
{
"epoch": 0.17976542803902226,
"grad_norm": 8.859077453613281,
"learning_rate": 4.6122670387538704e-05,
"loss": 3.2012,
"num_input_tokens_seen": 1071696,
"step": 1640
},
{
"epoch": 0.18031349336840952,
"grad_norm": 6.119090557098389,
"learning_rate": 4.6099613763052264e-05,
"loss": 3.6088,
"num_input_tokens_seen": 1074720,
"step": 1645
},
{
"epoch": 0.18086155869779677,
"grad_norm": 6.804201126098633,
"learning_rate": 4.607649458691115e-05,
"loss": 3.2794,
"num_input_tokens_seen": 1077944,
"step": 1650
},
{
"epoch": 0.18140962402718405,
"grad_norm": 7.389477729797363,
"learning_rate": 4.60533129276542e-05,
"loss": 3.4432,
"num_input_tokens_seen": 1080792,
"step": 1655
},
{
"epoch": 0.1819576893565713,
"grad_norm": 5.930356502532959,
"learning_rate": 4.6030068854005476e-05,
"loss": 3.2158,
"num_input_tokens_seen": 1083520,
"step": 1660
},
{
"epoch": 0.18250575468595856,
"grad_norm": 6.847218036651611,
"learning_rate": 4.6006762434874065e-05,
"loss": 3.4395,
"num_input_tokens_seen": 1086128,
"step": 1665
},
{
"epoch": 0.18305382001534584,
"grad_norm": 9.511390686035156,
"learning_rate": 4.598339373935389e-05,
"loss": 3.2795,
"num_input_tokens_seen": 1088560,
"step": 1670
},
{
"epoch": 0.1836018853447331,
"grad_norm": 4.90114688873291,
"learning_rate": 4.595996283672349e-05,
"loss": 3.2474,
"num_input_tokens_seen": 1091832,
"step": 1675
},
{
"epoch": 0.18414995067412035,
"grad_norm": 9.29576301574707,
"learning_rate": 4.5936469796445854e-05,
"loss": 3.3011,
"num_input_tokens_seen": 1095048,
"step": 1680
},
{
"epoch": 0.18469801600350763,
"grad_norm": 6.643434524536133,
"learning_rate": 4.5912914688168134e-05,
"loss": 3.4029,
"num_input_tokens_seen": 1097704,
"step": 1685
},
{
"epoch": 0.18524608133289489,
"grad_norm": 4.961350440979004,
"learning_rate": 4.5889297581721526e-05,
"loss": 3.0958,
"num_input_tokens_seen": 1100736,
"step": 1690
},
{
"epoch": 0.18579414666228214,
"grad_norm": 7.057353496551514,
"learning_rate": 4.5865618547121016e-05,
"loss": 3.1003,
"num_input_tokens_seen": 1104184,
"step": 1695
},
{
"epoch": 0.18634221199166942,
"grad_norm": 3.688004970550537,
"learning_rate": 4.584187765456516e-05,
"loss": 3.5992,
"num_input_tokens_seen": 1107880,
"step": 1700
},
{
"epoch": 0.18689027732105667,
"grad_norm": 6.79044246673584,
"learning_rate": 4.5818074974435935e-05,
"loss": 3.5112,
"num_input_tokens_seen": 1110728,
"step": 1705
},
{
"epoch": 0.18743834265044393,
"grad_norm": 5.125957489013672,
"learning_rate": 4.579421057729846e-05,
"loss": 3.4606,
"num_input_tokens_seen": 1113632,
"step": 1710
},
{
"epoch": 0.18798640797983118,
"grad_norm": 6.708007335662842,
"learning_rate": 4.577028453390084e-05,
"loss": 3.4139,
"num_input_tokens_seen": 1117248,
"step": 1715
},
{
"epoch": 0.18853447330921846,
"grad_norm": 4.76835298538208,
"learning_rate": 4.5746296915173924e-05,
"loss": 3.4408,
"num_input_tokens_seen": 1120600,
"step": 1720
},
{
"epoch": 0.18908253863860572,
"grad_norm": 6.29659366607666,
"learning_rate": 4.572224779223111e-05,
"loss": 3.4817,
"num_input_tokens_seen": 1123856,
"step": 1725
},
{
"epoch": 0.18963060396799297,
"grad_norm": 9.75003433227539,
"learning_rate": 4.569813723636813e-05,
"loss": 3.5152,
"num_input_tokens_seen": 1127872,
"step": 1730
},
{
"epoch": 0.19017866929738025,
"grad_norm": 6.846242427825928,
"learning_rate": 4.567396531906285e-05,
"loss": 3.4197,
"num_input_tokens_seen": 1131656,
"step": 1735
},
{
"epoch": 0.1907267346267675,
"grad_norm": 6.956099033355713,
"learning_rate": 4.564973211197503e-05,
"loss": 3.5098,
"num_input_tokens_seen": 1135160,
"step": 1740
},
{
"epoch": 0.19127479995615476,
"grad_norm": 5.187982559204102,
"learning_rate": 4.562543768694614e-05,
"loss": 3.2708,
"num_input_tokens_seen": 1137640,
"step": 1745
},
{
"epoch": 0.19182286528554204,
"grad_norm": 6.0655035972595215,
"learning_rate": 4.5601082115999126e-05,
"loss": 3.1415,
"num_input_tokens_seen": 1140624,
"step": 1750
},
{
"epoch": 0.1923709306149293,
"grad_norm": 7.111659049987793,
"learning_rate": 4.557666547133822e-05,
"loss": 3.419,
"num_input_tokens_seen": 1143352,
"step": 1755
},
{
"epoch": 0.19291899594431655,
"grad_norm": 5.601785659790039,
"learning_rate": 4.55521878253487e-05,
"loss": 3.1537,
"num_input_tokens_seen": 1146552,
"step": 1760
},
{
"epoch": 0.19346706127370383,
"grad_norm": 5.885753154754639,
"learning_rate": 4.5527649250596705e-05,
"loss": 3.1606,
"num_input_tokens_seen": 1150064,
"step": 1765
},
{
"epoch": 0.1940151266030911,
"grad_norm": 7.787903785705566,
"learning_rate": 4.5503049819828975e-05,
"loss": 3.5314,
"num_input_tokens_seen": 1152720,
"step": 1770
},
{
"epoch": 0.19456319193247834,
"grad_norm": 6.6935133934021,
"learning_rate": 4.5478389605972695e-05,
"loss": 3.2798,
"num_input_tokens_seen": 1155704,
"step": 1775
},
{
"epoch": 0.19511125726186562,
"grad_norm": 5.613322734832764,
"learning_rate": 4.545366868213521e-05,
"loss": 2.9432,
"num_input_tokens_seen": 1159064,
"step": 1780
},
{
"epoch": 0.19565932259125288,
"grad_norm": 5.332114219665527,
"learning_rate": 4.542888712160389e-05,
"loss": 3.417,
"num_input_tokens_seen": 1162384,
"step": 1785
},
{
"epoch": 0.19620738792064013,
"grad_norm": 5.810116291046143,
"learning_rate": 4.540404499784582e-05,
"loss": 3.4744,
"num_input_tokens_seen": 1165168,
"step": 1790
},
{
"epoch": 0.1967554532500274,
"grad_norm": 6.959201335906982,
"learning_rate": 4.537914238450768e-05,
"loss": 3.6205,
"num_input_tokens_seen": 1168288,
"step": 1795
},
{
"epoch": 0.19730351857941467,
"grad_norm": 7.266166687011719,
"learning_rate": 4.535417935541543e-05,
"loss": 3.5834,
"num_input_tokens_seen": 1170536,
"step": 1800
},
{
"epoch": 0.19785158390880192,
"grad_norm": 6.565328598022461,
"learning_rate": 4.5329155984574154e-05,
"loss": 3.094,
"num_input_tokens_seen": 1174016,
"step": 1805
},
{
"epoch": 0.1983996492381892,
"grad_norm": 6.1436944007873535,
"learning_rate": 4.5304072346167846e-05,
"loss": 3.6874,
"num_input_tokens_seen": 1177584,
"step": 1810
},
{
"epoch": 0.19894771456757646,
"grad_norm": 6.344284534454346,
"learning_rate": 4.527892851455915e-05,
"loss": 3.5916,
"num_input_tokens_seen": 1180544,
"step": 1815
},
{
"epoch": 0.1994957798969637,
"grad_norm": 6.047328472137451,
"learning_rate": 4.5253724564289144e-05,
"loss": 3.1019,
"num_input_tokens_seen": 1184376,
"step": 1820
},
{
"epoch": 0.200043845226351,
"grad_norm": 5.976099491119385,
"learning_rate": 4.522846057007716e-05,
"loss": 3.0793,
"num_input_tokens_seen": 1187280,
"step": 1825
},
{
"epoch": 0.20059191055573825,
"grad_norm": 6.050201892852783,
"learning_rate": 4.5203136606820515e-05,
"loss": 3.1914,
"num_input_tokens_seen": 1190952,
"step": 1830
},
{
"epoch": 0.2011399758851255,
"grad_norm": 5.573675632476807,
"learning_rate": 4.517775274959434e-05,
"loss": 3.3849,
"num_input_tokens_seen": 1194568,
"step": 1835
},
{
"epoch": 0.20168804121451278,
"grad_norm": 10.978282928466797,
"learning_rate": 4.5152309073651266e-05,
"loss": 3.3821,
"num_input_tokens_seen": 1197992,
"step": 1840
},
{
"epoch": 0.20223610654390003,
"grad_norm": 6.215994358062744,
"learning_rate": 4.512680565442133e-05,
"loss": 2.9822,
"num_input_tokens_seen": 1201456,
"step": 1845
},
{
"epoch": 0.2027841718732873,
"grad_norm": 5.15269660949707,
"learning_rate": 4.510124256751166e-05,
"loss": 3.0034,
"num_input_tokens_seen": 1205552,
"step": 1850
},
{
"epoch": 0.20333223720267457,
"grad_norm": 8.590337753295898,
"learning_rate": 4.507561988870624e-05,
"loss": 3.3385,
"num_input_tokens_seen": 1208496,
"step": 1855
},
{
"epoch": 0.20388030253206182,
"grad_norm": 6.038626194000244,
"learning_rate": 4.5049937693965764e-05,
"loss": 3.3063,
"num_input_tokens_seen": 1211856,
"step": 1860
},
{
"epoch": 0.20442836786144908,
"grad_norm": 6.621918678283691,
"learning_rate": 4.502419605942735e-05,
"loss": 3.2243,
"num_input_tokens_seen": 1216152,
"step": 1865
},
{
"epoch": 0.20497643319083636,
"grad_norm": 6.029962062835693,
"learning_rate": 4.499839506140433e-05,
"loss": 3.4138,
"num_input_tokens_seen": 1219840,
"step": 1870
},
{
"epoch": 0.20552449852022361,
"grad_norm": 7.1330952644348145,
"learning_rate": 4.497253477638602e-05,
"loss": 3.3366,
"num_input_tokens_seen": 1222888,
"step": 1875
},
{
"epoch": 0.20607256384961087,
"grad_norm": 7.775686264038086,
"learning_rate": 4.494661528103751e-05,
"loss": 3.1706,
"num_input_tokens_seen": 1227096,
"step": 1880
},
{
"epoch": 0.20662062917899815,
"grad_norm": 8.789952278137207,
"learning_rate": 4.492063665219941e-05,
"loss": 3.4648,
"num_input_tokens_seen": 1230856,
"step": 1885
},
{
"epoch": 0.2071686945083854,
"grad_norm": 7.492274284362793,
"learning_rate": 4.489459896688764e-05,
"loss": 3.6099,
"num_input_tokens_seen": 1234160,
"step": 1890
},
{
"epoch": 0.20771675983777266,
"grad_norm": 6.971865177154541,
"learning_rate": 4.48685023022932e-05,
"loss": 3.037,
"num_input_tokens_seen": 1236904,
"step": 1895
},
{
"epoch": 0.20826482516715994,
"grad_norm": 9.107683181762695,
"learning_rate": 4.484234673578196e-05,
"loss": 3.435,
"num_input_tokens_seen": 1239936,
"step": 1900
},
{
"epoch": 0.2088128904965472,
"grad_norm": 6.467232704162598,
"learning_rate": 4.4816132344894354e-05,
"loss": 3.6629,
"num_input_tokens_seen": 1242952,
"step": 1905
},
{
"epoch": 0.20936095582593445,
"grad_norm": 6.295756816864014,
"learning_rate": 4.4789859207345274e-05,
"loss": 3.1083,
"num_input_tokens_seen": 1246560,
"step": 1910
},
{
"epoch": 0.2099090211553217,
"grad_norm": 5.817240238189697,
"learning_rate": 4.4763527401023724e-05,
"loss": 3.2389,
"num_input_tokens_seen": 1249904,
"step": 1915
},
{
"epoch": 0.21045708648470898,
"grad_norm": 7.3531317710876465,
"learning_rate": 4.473713700399266e-05,
"loss": 3.1022,
"num_input_tokens_seen": 1252272,
"step": 1920
},
{
"epoch": 0.21100515181409624,
"grad_norm": 7.078802108764648,
"learning_rate": 4.471068809448872e-05,
"loss": 3.2372,
"num_input_tokens_seen": 1255904,
"step": 1925
},
{
"epoch": 0.2115532171434835,
"grad_norm": 5.776179313659668,
"learning_rate": 4.468418075092201e-05,
"loss": 3.2817,
"num_input_tokens_seen": 1259024,
"step": 1930
},
{
"epoch": 0.21210128247287077,
"grad_norm": 9.986640930175781,
"learning_rate": 4.465761505187589e-05,
"loss": 3.349,
"num_input_tokens_seen": 1262584,
"step": 1935
},
{
"epoch": 0.21264934780225803,
"grad_norm": 8.421146392822266,
"learning_rate": 4.463099107610669e-05,
"loss": 3.2711,
"num_input_tokens_seen": 1266072,
"step": 1940
},
{
"epoch": 0.21319741313164528,
"grad_norm": 8.646468162536621,
"learning_rate": 4.460430890254353e-05,
"loss": 3.264,
"num_input_tokens_seen": 1269528,
"step": 1945
},
{
"epoch": 0.21374547846103256,
"grad_norm": 6.439562797546387,
"learning_rate": 4.457756861028804e-05,
"loss": 3.2899,
"num_input_tokens_seen": 1272200,
"step": 1950
},
{
"epoch": 0.21429354379041982,
"grad_norm": 8.170503616333008,
"learning_rate": 4.455077027861417e-05,
"loss": 3.3649,
"num_input_tokens_seen": 1275360,
"step": 1955
},
{
"epoch": 0.21484160911980707,
"grad_norm": 6.329521179199219,
"learning_rate": 4.452391398696794e-05,
"loss": 3.4714,
"num_input_tokens_seen": 1278480,
"step": 1960
},
{
"epoch": 0.21538967444919435,
"grad_norm": 7.618672847747803,
"learning_rate": 4.449699981496714e-05,
"loss": 3.1889,
"num_input_tokens_seen": 1281312,
"step": 1965
},
{
"epoch": 0.2159377397785816,
"grad_norm": 5.937787055969238,
"learning_rate": 4.447002784240122e-05,
"loss": 3.2998,
"num_input_tokens_seen": 1284456,
"step": 1970
},
{
"epoch": 0.21648580510796886,
"grad_norm": 6.004344463348389,
"learning_rate": 4.444299814923096e-05,
"loss": 3.5535,
"num_input_tokens_seen": 1287512,
"step": 1975
},
{
"epoch": 0.21703387043735614,
"grad_norm": 6.512199878692627,
"learning_rate": 4.4415910815588235e-05,
"loss": 3.4036,
"num_input_tokens_seen": 1290336,
"step": 1980
},
{
"epoch": 0.2175819357667434,
"grad_norm": 6.4987616539001465,
"learning_rate": 4.438876592177584e-05,
"loss": 3.6318,
"num_input_tokens_seen": 1292832,
"step": 1985
},
{
"epoch": 0.21813000109613065,
"grad_norm": 5.955297946929932,
"learning_rate": 4.4361563548267186e-05,
"loss": 3.4087,
"num_input_tokens_seen": 1296336,
"step": 1990
},
{
"epoch": 0.21867806642551793,
"grad_norm": 9.001585960388184,
"learning_rate": 4.4334303775706087e-05,
"loss": 3.0256,
"num_input_tokens_seen": 1299928,
"step": 1995
},
{
"epoch": 0.21922613175490518,
"grad_norm": 8.543002128601074,
"learning_rate": 4.4306986684906534e-05,
"loss": 3.0983,
"num_input_tokens_seen": 1303344,
"step": 2000
},
{
"epoch": 0.21977419708429244,
"grad_norm": 5.445712089538574,
"learning_rate": 4.427961235685245e-05,
"loss": 3.5193,
"num_input_tokens_seen": 1306536,
"step": 2005
},
{
"epoch": 0.22032226241367972,
"grad_norm": 4.273796558380127,
"learning_rate": 4.4252180872697403e-05,
"loss": 3.036,
"num_input_tokens_seen": 1311056,
"step": 2010
},
{
"epoch": 0.22087032774306697,
"grad_norm": 5.357060432434082,
"learning_rate": 4.422469231376445e-05,
"loss": 3.2927,
"num_input_tokens_seen": 1314432,
"step": 2015
},
{
"epoch": 0.22141839307245423,
"grad_norm": 6.554574012756348,
"learning_rate": 4.4197146761545825e-05,
"loss": 3.4088,
"num_input_tokens_seen": 1317568,
"step": 2020
},
{
"epoch": 0.2219664584018415,
"grad_norm": 5.920197486877441,
"learning_rate": 4.4169544297702745e-05,
"loss": 3.1075,
"num_input_tokens_seen": 1321288,
"step": 2025
},
{
"epoch": 0.22251452373122876,
"grad_norm": 5.399965763092041,
"learning_rate": 4.414188500406513e-05,
"loss": 3.023,
"num_input_tokens_seen": 1324832,
"step": 2030
},
{
"epoch": 0.22306258906061602,
"grad_norm": 4.449610710144043,
"learning_rate": 4.411416896263137e-05,
"loss": 3.2649,
"num_input_tokens_seen": 1327992,
"step": 2035
},
{
"epoch": 0.2236106543900033,
"grad_norm": 5.2429304122924805,
"learning_rate": 4.408639625556812e-05,
"loss": 3.2027,
"num_input_tokens_seen": 1331448,
"step": 2040
},
{
"epoch": 0.22415871971939055,
"grad_norm": 5.563135623931885,
"learning_rate": 4.405856696520998e-05,
"loss": 3.0106,
"num_input_tokens_seen": 1334672,
"step": 2045
},
{
"epoch": 0.2247067850487778,
"grad_norm": 9.401083946228027,
"learning_rate": 4.403068117405933e-05,
"loss": 3.5604,
"num_input_tokens_seen": 1338664,
"step": 2050
},
{
"epoch": 0.2252548503781651,
"grad_norm": 6.381105899810791,
"learning_rate": 4.4002738964786047e-05,
"loss": 3.1456,
"num_input_tokens_seen": 1341320,
"step": 2055
},
{
"epoch": 0.22580291570755234,
"grad_norm": 8.379097938537598,
"learning_rate": 4.397474042022727e-05,
"loss": 3.7295,
"num_input_tokens_seen": 1344712,
"step": 2060
},
{
"epoch": 0.2263509810369396,
"grad_norm": 5.414994239807129,
"learning_rate": 4.394668562338711e-05,
"loss": 3.2339,
"num_input_tokens_seen": 1348704,
"step": 2065
},
{
"epoch": 0.22689904636632688,
"grad_norm": 6.6783447265625,
"learning_rate": 4.391857465743649e-05,
"loss": 3.1633,
"num_input_tokens_seen": 1352136,
"step": 2070
},
{
"epoch": 0.22744711169571413,
"grad_norm": 6.781215667724609,
"learning_rate": 4.389040760571284e-05,
"loss": 3.2454,
"num_input_tokens_seen": 1355704,
"step": 2075
},
{
"epoch": 0.2279951770251014,
"grad_norm": 8.376158714294434,
"learning_rate": 4.386218455171984e-05,
"loss": 3.2688,
"num_input_tokens_seen": 1358224,
"step": 2080
},
{
"epoch": 0.22854324235448867,
"grad_norm": 6.815377712249756,
"learning_rate": 4.383390557912722e-05,
"loss": 3.2047,
"num_input_tokens_seen": 1361624,
"step": 2085
},
{
"epoch": 0.22909130768387592,
"grad_norm": 9.893330574035645,
"learning_rate": 4.380557077177046e-05,
"loss": 3.3861,
"num_input_tokens_seen": 1365672,
"step": 2090
},
{
"epoch": 0.22963937301326318,
"grad_norm": 5.984465599060059,
"learning_rate": 4.3777180213650587e-05,
"loss": 3.2901,
"num_input_tokens_seen": 1368440,
"step": 2095
},
{
"epoch": 0.23018743834265046,
"grad_norm": 8.21902847290039,
"learning_rate": 4.37487339889339e-05,
"loss": 3.135,
"num_input_tokens_seen": 1370736,
"step": 2100
},
{
"epoch": 0.2307355036720377,
"grad_norm": 7.617781639099121,
"learning_rate": 4.3720232181951726e-05,
"loss": 3.2967,
"num_input_tokens_seen": 1373632,
"step": 2105
},
{
"epoch": 0.23128356900142497,
"grad_norm": 5.901704788208008,
"learning_rate": 4.3691674877200164e-05,
"loss": 3.0304,
"num_input_tokens_seen": 1376840,
"step": 2110
},
{
"epoch": 0.23183163433081222,
"grad_norm": 7.1147074699401855,
"learning_rate": 4.3663062159339855e-05,
"loss": 3.2797,
"num_input_tokens_seen": 1380024,
"step": 2115
},
{
"epoch": 0.2323796996601995,
"grad_norm": 6.9793243408203125,
"learning_rate": 4.363439411319571e-05,
"loss": 3.6079,
"num_input_tokens_seen": 1382992,
"step": 2120
},
{
"epoch": 0.23292776498958676,
"grad_norm": 5.454427242279053,
"learning_rate": 4.360567082375666e-05,
"loss": 3.1035,
"num_input_tokens_seen": 1385936,
"step": 2125
},
{
"epoch": 0.233475830318974,
"grad_norm": 9.776113510131836,
"learning_rate": 4.3576892376175414e-05,
"loss": 3.1049,
"num_input_tokens_seen": 1389176,
"step": 2130
},
{
"epoch": 0.2340238956483613,
"grad_norm": 5.588262557983398,
"learning_rate": 4.3553829961575053e-05,
"loss": 3.0589,
"num_input_tokens_seen": 1392080,
"step": 2135
},
{
"epoch": 0.23457196097774854,
"grad_norm": 7.208589553833008,
"learning_rate": 4.352495244444449e-05,
"loss": 3.3501,
"num_input_tokens_seen": 1395360,
"step": 2140
},
{
"epoch": 0.2351200263071358,
"grad_norm": 5.150116920471191,
"learning_rate": 4.349602000846844e-05,
"loss": 3.4204,
"num_input_tokens_seen": 1398760,
"step": 2145
},
{
"epoch": 0.23566809163652308,
"grad_norm": 7.456035137176514,
"learning_rate": 4.346703273941965e-05,
"loss": 2.9937,
"num_input_tokens_seen": 1402384,
"step": 2150
},
{
"epoch": 0.23621615696591033,
"grad_norm": 5.8624067306518555,
"learning_rate": 4.3437990723233416e-05,
"loss": 3.233,
"num_input_tokens_seen": 1406152,
"step": 2155
},
{
"epoch": 0.2367642222952976,
"grad_norm": 5.129085063934326,
"learning_rate": 4.3408894046007354e-05,
"loss": 3.3833,
"num_input_tokens_seen": 1409704,
"step": 2160
},
{
"epoch": 0.23731228762468487,
"grad_norm": 7.074642658233643,
"learning_rate": 4.337974279400111e-05,
"loss": 3.2288,
"num_input_tokens_seen": 1412984,
"step": 2165
},
{
"epoch": 0.23786035295407212,
"grad_norm": 7.073869228363037,
"learning_rate": 4.335053705363611e-05,
"loss": 3.1338,
"num_input_tokens_seen": 1416232,
"step": 2170
},
{
"epoch": 0.23840841828345938,
"grad_norm": 6.7071990966796875,
"learning_rate": 4.332127691149535e-05,
"loss": 3.1272,
"num_input_tokens_seen": 1419904,
"step": 2175
},
{
"epoch": 0.23895648361284666,
"grad_norm": 8.463297843933105,
"learning_rate": 4.3291962454323076e-05,
"loss": 3.3227,
"num_input_tokens_seen": 1423048,
"step": 2180
},
{
"epoch": 0.2395045489422339,
"grad_norm": 7.098794460296631,
"learning_rate": 4.3262593769024576e-05,
"loss": 3.1422,
"num_input_tokens_seen": 1425568,
"step": 2185
},
{
"epoch": 0.24005261427162117,
"grad_norm": 5.919711589813232,
"learning_rate": 4.323317094266589e-05,
"loss": 3.0584,
"num_input_tokens_seen": 1429464,
"step": 2190
},
{
"epoch": 0.24060067960100845,
"grad_norm": 5.311784267425537,
"learning_rate": 4.320369406247356e-05,
"loss": 2.8391,
"num_input_tokens_seen": 1432832,
"step": 2195
},
{
"epoch": 0.2411487449303957,
"grad_norm": 6.239211559295654,
"learning_rate": 4.317416321583437e-05,
"loss": 3.1701,
"num_input_tokens_seen": 1435960,
"step": 2200
},
{
"epoch": 0.24169681025978296,
"grad_norm": 9.268356323242188,
"learning_rate": 4.314457849029513e-05,
"loss": 3.3796,
"num_input_tokens_seen": 1439752,
"step": 2205
},
{
"epoch": 0.24224487558917024,
"grad_norm": 7.6005449295043945,
"learning_rate": 4.311493997356234e-05,
"loss": 3.189,
"num_input_tokens_seen": 1442488,
"step": 2210
},
{
"epoch": 0.2427929409185575,
"grad_norm": 6.128123760223389,
"learning_rate": 4.308524775350198e-05,
"loss": 3.2867,
"num_input_tokens_seen": 1445800,
"step": 2215
},
{
"epoch": 0.24334100624794475,
"grad_norm": 6.555956840515137,
"learning_rate": 4.305550191813923e-05,
"loss": 3.1985,
"num_input_tokens_seen": 1448992,
"step": 2220
},
{
"epoch": 0.24388907157733203,
"grad_norm": 6.0009446144104,
"learning_rate": 4.302570255565825e-05,
"loss": 3.1752,
"num_input_tokens_seen": 1452104,
"step": 2225
},
{
"epoch": 0.24443713690671928,
"grad_norm": 5.329344749450684,
"learning_rate": 4.299584975440184e-05,
"loss": 2.9533,
"num_input_tokens_seen": 1457016,
"step": 2230
},
{
"epoch": 0.24498520223610654,
"grad_norm": 4.869180202484131,
"learning_rate": 4.296594360287126e-05,
"loss": 2.9869,
"num_input_tokens_seen": 1459624,
"step": 2235
},
{
"epoch": 0.24553326756549382,
"grad_norm": 6.4714202880859375,
"learning_rate": 4.293598418972592e-05,
"loss": 3.2594,
"num_input_tokens_seen": 1462696,
"step": 2240
},
{
"epoch": 0.24608133289488107,
"grad_norm": 10.35406494140625,
"learning_rate": 4.2905971603783116e-05,
"loss": 3.164,
"num_input_tokens_seen": 1466832,
"step": 2245
},
{
"epoch": 0.24662939822426833,
"grad_norm": 5.773983001708984,
"learning_rate": 4.287590593401778e-05,
"loss": 3.2342,
"num_input_tokens_seen": 1470288,
"step": 2250
},
{
"epoch": 0.2471774635536556,
"grad_norm": 5.758610248565674,
"learning_rate": 4.284578726956225e-05,
"loss": 3.38,
"num_input_tokens_seen": 1473032,
"step": 2255
},
{
"epoch": 0.24772552888304286,
"grad_norm": 7.092349529266357,
"learning_rate": 4.2815615699705943e-05,
"loss": 3.1884,
"num_input_tokens_seen": 1476104,
"step": 2260
},
{
"epoch": 0.24827359421243012,
"grad_norm": 8.047478675842285,
"learning_rate": 4.2785391313895103e-05,
"loss": 3.3215,
"num_input_tokens_seen": 1479376,
"step": 2265
},
{
"epoch": 0.2488216595418174,
"grad_norm": 7.5882439613342285,
"learning_rate": 4.27551142017326e-05,
"loss": 3.0476,
"num_input_tokens_seen": 1482248,
"step": 2270
},
{
"epoch": 0.24936972487120465,
"grad_norm": 5.922421932220459,
"learning_rate": 4.2724784452977565e-05,
"loss": 3.3373,
"num_input_tokens_seen": 1485232,
"step": 2275
},
{
"epoch": 0.2499177902005919,
"grad_norm": 6.161900520324707,
"learning_rate": 4.26944021575452e-05,
"loss": 3.0011,
"num_input_tokens_seen": 1488896,
"step": 2280
},
{
"epoch": 0.2504658555299792,
"grad_norm": 7.3562397956848145,
"learning_rate": 4.2663967405506486e-05,
"loss": 2.9991,
"num_input_tokens_seen": 1492072,
"step": 2285
},
{
"epoch": 0.2510139208593664,
"grad_norm": 6.788776397705078,
"learning_rate": 4.263348028708792e-05,
"loss": 2.9735,
"num_input_tokens_seen": 1495224,
"step": 2290
},
{
"epoch": 0.2515619861887537,
"grad_norm": 8.632386207580566,
"learning_rate": 4.260294089267123e-05,
"loss": 3.2221,
"num_input_tokens_seen": 1498256,
"step": 2295
},
{
"epoch": 0.252110051518141,
"grad_norm": 6.462652683258057,
"learning_rate": 4.257234931279313e-05,
"loss": 2.8929,
"num_input_tokens_seen": 1501824,
"step": 2300
},
{
"epoch": 0.2526581168475282,
"grad_norm": 7.380079746246338,
"learning_rate": 4.254170563814505e-05,
"loss": 3.2545,
"num_input_tokens_seen": 1504768,
"step": 2305
},
{
"epoch": 0.2532061821769155,
"grad_norm": 5.370420455932617,
"learning_rate": 4.2511009959572826e-05,
"loss": 3.4558,
"num_input_tokens_seen": 1508056,
"step": 2310
},
{
"epoch": 0.25375424750630277,
"grad_norm": 5.953249454498291,
"learning_rate": 4.2480262368076504e-05,
"loss": 3.2177,
"num_input_tokens_seen": 1511920,
"step": 2315
},
{
"epoch": 0.25430231283569,
"grad_norm": 5.694786548614502,
"learning_rate": 4.244946295481001e-05,
"loss": 3.2378,
"num_input_tokens_seen": 1514936,
"step": 2320
},
{
"epoch": 0.2548503781650773,
"grad_norm": 7.257277965545654,
"learning_rate": 4.241861181108092e-05,
"loss": 3.616,
"num_input_tokens_seen": 1518416,
"step": 2325
},
{
"epoch": 0.25539844349446456,
"grad_norm": 6.388315200805664,
"learning_rate": 4.238770902835013e-05,
"loss": 3.2898,
"num_input_tokens_seen": 1521960,
"step": 2330
},
{
"epoch": 0.2559465088238518,
"grad_norm": 8.813338279724121,
"learning_rate": 4.235675469823166e-05,
"loss": 3.4491,
"num_input_tokens_seen": 1525312,
"step": 2335
},
{
"epoch": 0.25649457415323906,
"grad_norm": 6.0403947830200195,
"learning_rate": 4.232574891249234e-05,
"loss": 3.0747,
"num_input_tokens_seen": 1528632,
"step": 2340
},
{
"epoch": 0.25704263948262634,
"grad_norm": 6.77452278137207,
"learning_rate": 4.229469176305153e-05,
"loss": 3.2356,
"num_input_tokens_seen": 1532200,
"step": 2345
},
{
"epoch": 0.25759070481201357,
"grad_norm": 6.781161785125732,
"learning_rate": 4.2263583341980885e-05,
"loss": 3.1273,
"num_input_tokens_seen": 1535624,
"step": 2350
},
{
"epoch": 0.25813877014140085,
"grad_norm": 6.070975303649902,
"learning_rate": 4.223242374150402e-05,
"loss": 3.0905,
"num_input_tokens_seen": 1538504,
"step": 2355
},
{
"epoch": 0.25868683547078813,
"grad_norm": 6.770239353179932,
"learning_rate": 4.220121305399634e-05,
"loss": 3.2115,
"num_input_tokens_seen": 1541520,
"step": 2360
},
{
"epoch": 0.25923490080017536,
"grad_norm": 6.523434638977051,
"learning_rate": 4.216995137198463e-05,
"loss": 3.2605,
"num_input_tokens_seen": 1545656,
"step": 2365
},
{
"epoch": 0.25978296612956264,
"grad_norm": 6.475868225097656,
"learning_rate": 4.213863878814691e-05,
"loss": 3.2498,
"num_input_tokens_seen": 1549464,
"step": 2370
},
{
"epoch": 0.2603310314589499,
"grad_norm": 7.743395805358887,
"learning_rate": 4.210727539531206e-05,
"loss": 3.0166,
"num_input_tokens_seen": 1553408,
"step": 2375
},
{
"epoch": 0.26087909678833715,
"grad_norm": 6.206083297729492,
"learning_rate": 4.207586128645963e-05,
"loss": 3.2151,
"num_input_tokens_seen": 1557112,
"step": 2380
},
{
"epoch": 0.26142716211772443,
"grad_norm": 7.58196496963501,
"learning_rate": 4.204439655471949e-05,
"loss": 3.5573,
"num_input_tokens_seen": 1560984,
"step": 2385
},
{
"epoch": 0.2619752274471117,
"grad_norm": 8.101637840270996,
"learning_rate": 4.201288129337158e-05,
"loss": 3.4451,
"num_input_tokens_seen": 1563808,
"step": 2390
},
{
"epoch": 0.26252329277649894,
"grad_norm": 9.19637680053711,
"learning_rate": 4.1981315595845684e-05,
"loss": 3.191,
"num_input_tokens_seen": 1567344,
"step": 2395
},
{
"epoch": 0.2630713581058862,
"grad_norm": 7.602110862731934,
"learning_rate": 4.194969955572105e-05,
"loss": 3.7303,
"num_input_tokens_seen": 1570104,
"step": 2400
},
{
"epoch": 0.2636194234352735,
"grad_norm": 10.502030372619629,
"learning_rate": 4.191803326672622e-05,
"loss": 3.2205,
"num_input_tokens_seen": 1572864,
"step": 2405
},
{
"epoch": 0.26416748876466073,
"grad_norm": 5.903884410858154,
"learning_rate": 4.188631682273868e-05,
"loss": 3.5156,
"num_input_tokens_seen": 1575720,
"step": 2410
},
{
"epoch": 0.264715554094048,
"grad_norm": 5.067075729370117,
"learning_rate": 4.1854550317784604e-05,
"loss": 3.1053,
"num_input_tokens_seen": 1579008,
"step": 2415
},
{
"epoch": 0.2652636194234353,
"grad_norm": 6.393657207489014,
"learning_rate": 4.1822733846038584e-05,
"loss": 3.1813,
"num_input_tokens_seen": 1582216,
"step": 2420
},
{
"epoch": 0.2658116847528225,
"grad_norm": 10.575018882751465,
"learning_rate": 4.1790867501823345e-05,
"loss": 3.7197,
"num_input_tokens_seen": 1585440,
"step": 2425
},
{
"epoch": 0.2663597500822098,
"grad_norm": 7.280240535736084,
"learning_rate": 4.175895137960945e-05,
"loss": 3.0196,
"num_input_tokens_seen": 1588248,
"step": 2430
},
{
"epoch": 0.2669078154115971,
"grad_norm": 6.695456504821777,
"learning_rate": 4.172698557401503e-05,
"loss": 2.9587,
"num_input_tokens_seen": 1591288,
"step": 2435
},
{
"epoch": 0.2674558807409843,
"grad_norm": 6.2725653648376465,
"learning_rate": 4.169497017980555e-05,
"loss": 3.3583,
"num_input_tokens_seen": 1595056,
"step": 2440
},
{
"epoch": 0.2680039460703716,
"grad_norm": 6.505600929260254,
"learning_rate": 4.166290529189342e-05,
"loss": 3.474,
"num_input_tokens_seen": 1598096,
"step": 2445
},
{
"epoch": 0.26855201139975887,
"grad_norm": 7.131421089172363,
"learning_rate": 4.163079100533783e-05,
"loss": 3.2172,
"num_input_tokens_seen": 1602648,
"step": 2450
},
{
"epoch": 0.2691000767291461,
"grad_norm": 5.818497657775879,
"learning_rate": 4.1598627415344394e-05,
"loss": 3.2497,
"num_input_tokens_seen": 1605776,
"step": 2455
},
{
"epoch": 0.2696481420585334,
"grad_norm": 8.350225448608398,
"learning_rate": 4.156641461726489e-05,
"loss": 3.2372,
"num_input_tokens_seen": 1609960,
"step": 2460
},
{
"epoch": 0.27019620738792066,
"grad_norm": 10.619945526123047,
"learning_rate": 4.153415270659699e-05,
"loss": 3.0958,
"num_input_tokens_seen": 1612808,
"step": 2465
},
{
"epoch": 0.2707442727173079,
"grad_norm": 6.475553035736084,
"learning_rate": 4.150184177898394e-05,
"loss": 3.4121,
"num_input_tokens_seen": 1616104,
"step": 2470
},
{
"epoch": 0.27129233804669517,
"grad_norm": 9.670978546142578,
"learning_rate": 4.1469481930214335e-05,
"loss": 3.1002,
"num_input_tokens_seen": 1618920,
"step": 2475
},
{
"epoch": 0.27184040337608245,
"grad_norm": 5.271237850189209,
"learning_rate": 4.1437073256221784e-05,
"loss": 3.1366,
"num_input_tokens_seen": 1622272,
"step": 2480
},
{
"epoch": 0.2723884687054697,
"grad_norm": 6.107699394226074,
"learning_rate": 4.1404615853084626e-05,
"loss": 3.5266,
"num_input_tokens_seen": 1624928,
"step": 2485
},
{
"epoch": 0.27293653403485696,
"grad_norm": 8.945226669311523,
"learning_rate": 4.137210981702568e-05,
"loss": 3.627,
"num_input_tokens_seen": 1628632,
"step": 2490
},
{
"epoch": 0.27348459936424424,
"grad_norm": 5.393161296844482,
"learning_rate": 4.133955524441196e-05,
"loss": 3.6371,
"num_input_tokens_seen": 1631272,
"step": 2495
},
{
"epoch": 0.27403266469363147,
"grad_norm": 7.735115051269531,
"learning_rate": 4.130695223175434e-05,
"loss": 3.4529,
"num_input_tokens_seen": 1634272,
"step": 2500
},
{
"epoch": 0.27458073002301875,
"grad_norm": 9.375452041625977,
"learning_rate": 4.1274300875707295e-05,
"loss": 3.2474,
"num_input_tokens_seen": 1638000,
"step": 2505
},
{
"epoch": 0.27512879535240603,
"grad_norm": 6.957891464233398,
"learning_rate": 4.124160127306864e-05,
"loss": 3.0279,
"num_input_tokens_seen": 1641896,
"step": 2510
},
{
"epoch": 0.27567686068179326,
"grad_norm": 6.637111663818359,
"learning_rate": 4.120885352077922e-05,
"loss": 3.5516,
"num_input_tokens_seen": 1645288,
"step": 2515
},
{
"epoch": 0.27622492601118054,
"grad_norm": 6.921294212341309,
"learning_rate": 4.1176057715922624e-05,
"loss": 3.2415,
"num_input_tokens_seen": 1648800,
"step": 2520
},
{
"epoch": 0.2767729913405678,
"grad_norm": 6.21347713470459,
"learning_rate": 4.114321395572488e-05,
"loss": 3.3217,
"num_input_tokens_seen": 1652416,
"step": 2525
},
{
"epoch": 0.27732105666995505,
"grad_norm": 7.985599040985107,
"learning_rate": 4.111032233755418e-05,
"loss": 3.0362,
"num_input_tokens_seen": 1655720,
"step": 2530
},
{
"epoch": 0.27786912199934233,
"grad_norm": 6.855371952056885,
"learning_rate": 4.107738295892063e-05,
"loss": 3.0962,
"num_input_tokens_seen": 1659440,
"step": 2535
},
{
"epoch": 0.2784171873287296,
"grad_norm": 7.123937129974365,
"learning_rate": 4.104439591747591e-05,
"loss": 3.102,
"num_input_tokens_seen": 1662400,
"step": 2540
},
{
"epoch": 0.27896525265811684,
"grad_norm": 6.53096866607666,
"learning_rate": 4.101136131101297e-05,
"loss": 2.9064,
"num_input_tokens_seen": 1665336,
"step": 2545
},
{
"epoch": 0.2795133179875041,
"grad_norm": 8.0481538772583,
"learning_rate": 4.0978279237465825e-05,
"loss": 3.103,
"num_input_tokens_seen": 1668288,
"step": 2550
},
{
"epoch": 0.2800613833168914,
"grad_norm": 4.704191207885742,
"learning_rate": 4.094514979490917e-05,
"loss": 2.9912,
"num_input_tokens_seen": 1671840,
"step": 2555
},
{
"epoch": 0.2806094486462786,
"grad_norm": 6.396568775177002,
"learning_rate": 4.091197308155814e-05,
"loss": 3.0125,
"num_input_tokens_seen": 1675512,
"step": 2560
},
{
"epoch": 0.2811575139756659,
"grad_norm": 6.377243518829346,
"learning_rate": 4.087874919576801e-05,
"loss": 2.9588,
"num_input_tokens_seen": 1679232,
"step": 2565
},
{
"epoch": 0.2817055793050532,
"grad_norm": 7.850512981414795,
"learning_rate": 4.084547823603391e-05,
"loss": 3.1181,
"num_input_tokens_seen": 1682432,
"step": 2570
},
{
"epoch": 0.2822536446344404,
"grad_norm": 7.351206302642822,
"learning_rate": 4.08121603009905e-05,
"loss": 3.2493,
"num_input_tokens_seen": 1686064,
"step": 2575
},
{
"epoch": 0.2828017099638277,
"grad_norm": 6.765766620635986,
"learning_rate": 4.077879548941172e-05,
"loss": 2.9447,
"num_input_tokens_seen": 1689312,
"step": 2580
},
{
"epoch": 0.283349775293215,
"grad_norm": 6.162474155426025,
"learning_rate": 4.0745383900210514e-05,
"loss": 3.0923,
"num_input_tokens_seen": 1692976,
"step": 2585
},
{
"epoch": 0.2838978406226022,
"grad_norm": 6.094540119171143,
"learning_rate": 4.071192563243843e-05,
"loss": 3.4034,
"num_input_tokens_seen": 1695344,
"step": 2590
},
{
"epoch": 0.2844459059519895,
"grad_norm": 9.006319999694824,
"learning_rate": 4.0678420785285446e-05,
"loss": 3.3876,
"num_input_tokens_seen": 1698336,
"step": 2595
},
{
"epoch": 0.28499397128137677,
"grad_norm": 7.306302070617676,
"learning_rate": 4.064486945807963e-05,
"loss": 2.9591,
"num_input_tokens_seen": 1703912,
"step": 2600
},
{
"epoch": 0.285542036610764,
"grad_norm": 5.706150054931641,
"learning_rate": 4.0611271750286805e-05,
"loss": 3.0137,
"num_input_tokens_seen": 1707664,
"step": 2605
},
{
"epoch": 0.2860901019401513,
"grad_norm": 7.290525436401367,
"learning_rate": 4.057762776151035e-05,
"loss": 3.4755,
"num_input_tokens_seen": 1710832,
"step": 2610
},
{
"epoch": 0.2866381672695385,
"grad_norm": 7.548462867736816,
"learning_rate": 4.054393759149081e-05,
"loss": 3.1482,
"num_input_tokens_seen": 1713616,
"step": 2615
},
{
"epoch": 0.2871862325989258,
"grad_norm": 7.191598415374756,
"learning_rate": 4.051020134010564e-05,
"loss": 3.5189,
"num_input_tokens_seen": 1717328,
"step": 2620
},
{
"epoch": 0.28773429792831307,
"grad_norm": 5.576016426086426,
"learning_rate": 4.0476419107368924e-05,
"loss": 3.1058,
"num_input_tokens_seen": 1720976,
"step": 2625
},
{
"epoch": 0.2882823632577003,
"grad_norm": 5.512149333953857,
"learning_rate": 4.044259099343104e-05,
"loss": 3.3606,
"num_input_tokens_seen": 1723840,
"step": 2630
},
{
"epoch": 0.2888304285870876,
"grad_norm": 6.475109100341797,
"learning_rate": 4.040871709857842e-05,
"loss": 3.2876,
"num_input_tokens_seen": 1726944,
"step": 2635
},
{
"epoch": 0.28937849391647485,
"grad_norm": 6.24223518371582,
"learning_rate": 4.037479752323317e-05,
"loss": 3.2583,
"num_input_tokens_seen": 1730056,
"step": 2640
},
{
"epoch": 0.2899265592458621,
"grad_norm": 7.499751091003418,
"learning_rate": 4.034083236795286e-05,
"loss": 3.6548,
"num_input_tokens_seen": 1733800,
"step": 2645
},
{
"epoch": 0.29047462457524936,
"grad_norm": 5.272352695465088,
"learning_rate": 4.030682173343016e-05,
"loss": 3.345,
"num_input_tokens_seen": 1738176,
"step": 2650
},
{
"epoch": 0.29102268990463664,
"grad_norm": 4.747354030609131,
"learning_rate": 4.027276572049259e-05,
"loss": 2.8691,
"num_input_tokens_seen": 1742088,
"step": 2655
},
{
"epoch": 0.29157075523402387,
"grad_norm": 4.695064544677734,
"learning_rate": 4.0238664430102175e-05,
"loss": 3.3259,
"num_input_tokens_seen": 1746032,
"step": 2660
},
{
"epoch": 0.29211882056341115,
"grad_norm": 5.169468402862549,
"learning_rate": 4.020451796335518e-05,
"loss": 3.193,
"num_input_tokens_seen": 1749336,
"step": 2665
},
{
"epoch": 0.29266688589279843,
"grad_norm": 6.7505340576171875,
"learning_rate": 4.017032642148181e-05,
"loss": 3.1603,
"num_input_tokens_seen": 1752808,
"step": 2670
},
{
"epoch": 0.29321495122218566,
"grad_norm": 8.776106834411621,
"learning_rate": 4.0136089905845874e-05,
"loss": 3.065,
"num_input_tokens_seen": 1756768,
"step": 2675
},
{
"epoch": 0.29376301655157294,
"grad_norm": 5.4388203620910645,
"learning_rate": 4.010180851794453e-05,
"loss": 3.3523,
"num_input_tokens_seen": 1759960,
"step": 2680
},
{
"epoch": 0.2943110818809602,
"grad_norm": 7.309511661529541,
"learning_rate": 4.006748235940796e-05,
"loss": 3.1897,
"num_input_tokens_seen": 1763848,
"step": 2685
},
{
"epoch": 0.29485914721034745,
"grad_norm": 7.108086109161377,
"learning_rate": 4.003311153199908e-05,
"loss": 3.2525,
"num_input_tokens_seen": 1767224,
"step": 2690
},
{
"epoch": 0.29540721253973473,
"grad_norm": 6.940639495849609,
"learning_rate": 3.99986961376132e-05,
"loss": 3.0928,
"num_input_tokens_seen": 1770816,
"step": 2695
},
{
"epoch": 0.295955277869122,
"grad_norm": 8.109939575195312,
"learning_rate": 3.996423627827778e-05,
"loss": 3.2992,
"num_input_tokens_seen": 1775144,
"step": 2700
},
{
"epoch": 0.29650334319850924,
"grad_norm": 8.848753929138184,
"learning_rate": 3.9929732056152104e-05,
"loss": 3.1256,
"num_input_tokens_seen": 1777888,
"step": 2705
},
{
"epoch": 0.2970514085278965,
"grad_norm": 6.489472389221191,
"learning_rate": 3.989518357352695e-05,
"loss": 3.0047,
"num_input_tokens_seen": 1782160,
"step": 2710
},
{
"epoch": 0.2975994738572838,
"grad_norm": 7.247778415679932,
"learning_rate": 3.986059093282433e-05,
"loss": 3.075,
"num_input_tokens_seen": 1784824,
"step": 2715
},
{
"epoch": 0.29814753918667103,
"grad_norm": 7.691065788269043,
"learning_rate": 3.982595423659716e-05,
"loss": 3.4486,
"num_input_tokens_seen": 1788072,
"step": 2720
},
{
"epoch": 0.2986956045160583,
"grad_norm": 7.700766086578369,
"learning_rate": 3.979127358752897e-05,
"loss": 3.4979,
"num_input_tokens_seen": 1790944,
"step": 2725
},
{
"epoch": 0.2992436698454456,
"grad_norm": 5.059070110321045,
"learning_rate": 3.975654908843356e-05,
"loss": 3.305,
"num_input_tokens_seen": 1794368,
"step": 2730
},
{
"epoch": 0.2997917351748328,
"grad_norm": 6.1541595458984375,
"learning_rate": 3.972178084225478e-05,
"loss": 3.2146,
"num_input_tokens_seen": 1798760,
"step": 2735
},
{
"epoch": 0.3003398005042201,
"grad_norm": 8.040989875793457,
"learning_rate": 3.968696895206613e-05,
"loss": 3.482,
"num_input_tokens_seen": 1801512,
"step": 2740
},
{
"epoch": 0.3008878658336074,
"grad_norm": 5.050278186798096,
"learning_rate": 3.9652113521070513e-05,
"loss": 3.3143,
"num_input_tokens_seen": 1805240,
"step": 2745
},
{
"epoch": 0.3014359311629946,
"grad_norm": 5.1891279220581055,
"learning_rate": 3.9617214652599904e-05,
"loss": 2.8368,
"num_input_tokens_seen": 1809040,
"step": 2750
},
{
"epoch": 0.3019839964923819,
"grad_norm": 6.89003849029541,
"learning_rate": 3.958227245011506e-05,
"loss": 3.3205,
"num_input_tokens_seen": 1812536,
"step": 2755
},
{
"epoch": 0.30253206182176917,
"grad_norm": 6.001296043395996,
"learning_rate": 3.954728701720521e-05,
"loss": 3.4753,
"num_input_tokens_seen": 1816296,
"step": 2760
},
{
"epoch": 0.3030801271511564,
"grad_norm": 4.202249050140381,
"learning_rate": 3.951225845758773e-05,
"loss": 3.3659,
"num_input_tokens_seen": 1819896,
"step": 2765
},
{
"epoch": 0.3036281924805437,
"grad_norm": 6.209683418273926,
"learning_rate": 3.9477186875107865e-05,
"loss": 3.5706,
"num_input_tokens_seen": 1823960,
"step": 2770
},
{
"epoch": 0.30417625780993096,
"grad_norm": 5.219339847564697,
"learning_rate": 3.944207237373838e-05,
"loss": 3.121,
"num_input_tokens_seen": 1827176,
"step": 2775
},
{
"epoch": 0.3047243231393182,
"grad_norm": 6.556133270263672,
"learning_rate": 3.940691505757931e-05,
"loss": 3.1289,
"num_input_tokens_seen": 1830016,
"step": 2780
},
{
"epoch": 0.30527238846870547,
"grad_norm": 5.480815887451172,
"learning_rate": 3.9371715030857595e-05,
"loss": 2.8851,
"num_input_tokens_seen": 1833280,
"step": 2785
},
{
"epoch": 0.30582045379809275,
"grad_norm": 4.781624794006348,
"learning_rate": 3.933647239792679e-05,
"loss": 3.066,
"num_input_tokens_seen": 1836784,
"step": 2790
},
{
"epoch": 0.30636851912748,
"grad_norm": 5.901027202606201,
"learning_rate": 3.930118726326678e-05,
"loss": 3.0618,
"num_input_tokens_seen": 1840600,
"step": 2795
},
{
"epoch": 0.30691658445686726,
"grad_norm": 4.3098649978637695,
"learning_rate": 3.926585973148344e-05,
"loss": 3.0273,
"num_input_tokens_seen": 1844456,
"step": 2800
},
{
"epoch": 0.30746464978625454,
"grad_norm": 7.2452521324157715,
"learning_rate": 3.923048990730832e-05,
"loss": 3.3328,
"num_input_tokens_seen": 1847648,
"step": 2805
},
{
"epoch": 0.30801271511564177,
"grad_norm": 9.102137565612793,
"learning_rate": 3.9195077895598385e-05,
"loss": 3.4577,
"num_input_tokens_seen": 1851080,
"step": 2810
},
{
"epoch": 0.30856078044502905,
"grad_norm": 7.165421009063721,
"learning_rate": 3.9159623801335635e-05,
"loss": 3.2345,
"num_input_tokens_seen": 1854544,
"step": 2815
},
{
"epoch": 0.30910884577441633,
"grad_norm": 6.918674468994141,
"learning_rate": 3.912412772962685e-05,
"loss": 3.3151,
"num_input_tokens_seen": 1857488,
"step": 2820
},
{
"epoch": 0.30965691110380356,
"grad_norm": 7.7270660400390625,
"learning_rate": 3.908858978570324e-05,
"loss": 3.0722,
"num_input_tokens_seen": 1859744,
"step": 2825
},
{
"epoch": 0.31020497643319084,
"grad_norm": 5.471165657043457,
"learning_rate": 3.905301007492016e-05,
"loss": 3.3752,
"num_input_tokens_seen": 1862520,
"step": 2830
},
{
"epoch": 0.3107530417625781,
"grad_norm": 8.547778129577637,
"learning_rate": 3.9017388702756766e-05,
"loss": 3.4572,
"num_input_tokens_seen": 1865688,
"step": 2835
},
{
"epoch": 0.31130110709196535,
"grad_norm": 5.8289289474487305,
"learning_rate": 3.898172577481577e-05,
"loss": 3.0442,
"num_input_tokens_seen": 1869008,
"step": 2840
},
{
"epoch": 0.3118491724213526,
"grad_norm": 5.646442413330078,
"learning_rate": 3.894602139682301e-05,
"loss": 3.3365,
"num_input_tokens_seen": 1872200,
"step": 2845
},
{
"epoch": 0.3123972377507399,
"grad_norm": 5.7611565589904785,
"learning_rate": 3.891027567462727e-05,
"loss": 3.0501,
"num_input_tokens_seen": 1874936,
"step": 2850
},
{
"epoch": 0.31294530308012714,
"grad_norm": 6.07964563369751,
"learning_rate": 3.8874488714199874e-05,
"loss": 3.1584,
"num_input_tokens_seen": 1877880,
"step": 2855
},
{
"epoch": 0.3134933684095144,
"grad_norm": 6.76899528503418,
"learning_rate": 3.883866062163439e-05,
"loss": 3.2215,
"num_input_tokens_seen": 1880632,
"step": 2860
},
{
"epoch": 0.3140414337389017,
"grad_norm": 9.11755657196045,
"learning_rate": 3.880279150314636e-05,
"loss": 3.4992,
"num_input_tokens_seen": 1883792,
"step": 2865
},
{
"epoch": 0.3145894990682889,
"grad_norm": 4.672335147857666,
"learning_rate": 3.876688146507291e-05,
"loss": 3.2378,
"num_input_tokens_seen": 1887984,
"step": 2870
},
{
"epoch": 0.3151375643976762,
"grad_norm": 8.21897029876709,
"learning_rate": 3.873093061387251e-05,
"loss": 3.4215,
"num_input_tokens_seen": 1890952,
"step": 2875
},
{
"epoch": 0.3156856297270635,
"grad_norm": 6.4296674728393555,
"learning_rate": 3.869493905612461e-05,
"loss": 3.1436,
"num_input_tokens_seen": 1894376,
"step": 2880
},
{
"epoch": 0.3162336950564507,
"grad_norm": 6.088110446929932,
"learning_rate": 3.8658906898529325e-05,
"loss": 3.1597,
"num_input_tokens_seen": 1897632,
"step": 2885
},
{
"epoch": 0.316781760385838,
"grad_norm": 7.144382953643799,
"learning_rate": 3.8622834247907155e-05,
"loss": 3.3071,
"num_input_tokens_seen": 1899992,
"step": 2890
},
{
"epoch": 0.3173298257152253,
"grad_norm": 5.95371675491333,
"learning_rate": 3.858672121119863e-05,
"loss": 3.1272,
"num_input_tokens_seen": 1902928,
"step": 2895
},
{
"epoch": 0.3178778910446125,
"grad_norm": 5.033254623413086,
"learning_rate": 3.855056789546402e-05,
"loss": 3.5104,
"num_input_tokens_seen": 1905872,
"step": 2900
},
{
"epoch": 0.3184259563739998,
"grad_norm": 9.2310209274292,
"learning_rate": 3.8514374407883e-05,
"loss": 3.22,
"num_input_tokens_seen": 1910456,
"step": 2905
},
{
"epoch": 0.31897402170338707,
"grad_norm": 13.305641174316406,
"learning_rate": 3.847814085575432e-05,
"loss": 3.5537,
"num_input_tokens_seen": 1914432,
"step": 2910
},
{
"epoch": 0.3195220870327743,
"grad_norm": 4.90524959564209,
"learning_rate": 3.844186734649554e-05,
"loss": 3.1428,
"num_input_tokens_seen": 1917176,
"step": 2915
},
{
"epoch": 0.3200701523621616,
"grad_norm": 7.605042457580566,
"learning_rate": 3.840555398764265e-05,
"loss": 2.6933,
"num_input_tokens_seen": 1919488,
"step": 2920
},
{
"epoch": 0.32061821769154886,
"grad_norm": 6.435617923736572,
"learning_rate": 3.836920088684979e-05,
"loss": 3.1942,
"num_input_tokens_seen": 1922184,
"step": 2925
},
{
"epoch": 0.3211662830209361,
"grad_norm": 5.5276288986206055,
"learning_rate": 3.8332808151888906e-05,
"loss": 3.3987,
"num_input_tokens_seen": 1925760,
"step": 2930
},
{
"epoch": 0.32171434835032336,
"grad_norm": 7.981554985046387,
"learning_rate": 3.829637589064946e-05,
"loss": 3.107,
"num_input_tokens_seen": 1928024,
"step": 2935
},
{
"epoch": 0.32226241367971065,
"grad_norm": 6.667475700378418,
"learning_rate": 3.8259904211138074e-05,
"loss": 2.8259,
"num_input_tokens_seen": 1931992,
"step": 2940
},
{
"epoch": 0.3228104790090979,
"grad_norm": 6.904677867889404,
"learning_rate": 3.8223393221478257e-05,
"loss": 3.3099,
"num_input_tokens_seen": 1934432,
"step": 2945
},
{
"epoch": 0.32335854433848515,
"grad_norm": 6.4357008934021,
"learning_rate": 3.818684302991001e-05,
"loss": 3.5156,
"num_input_tokens_seen": 1938288,
"step": 2950
},
{
"epoch": 0.32390660966787244,
"grad_norm": 6.910282611846924,
"learning_rate": 3.8150253744789624e-05,
"loss": 3.7432,
"num_input_tokens_seen": 1941552,
"step": 2955
},
{
"epoch": 0.32445467499725966,
"grad_norm": 6.355223178863525,
"learning_rate": 3.811362547458919e-05,
"loss": 3.3951,
"num_input_tokens_seen": 1944848,
"step": 2960
},
{
"epoch": 0.32500274032664694,
"grad_norm": 5.630364418029785,
"learning_rate": 3.807695832789646e-05,
"loss": 3.1733,
"num_input_tokens_seen": 1947576,
"step": 2965
},
{
"epoch": 0.3255508056560342,
"grad_norm": 7.782848358154297,
"learning_rate": 3.80402524134144e-05,
"loss": 2.9549,
"num_input_tokens_seen": 1950920,
"step": 2970
},
{
"epoch": 0.32609887098542145,
"grad_norm": 6.886142730712891,
"learning_rate": 3.8003507839960895e-05,
"loss": 3.1884,
"num_input_tokens_seen": 1954424,
"step": 2975
},
{
"epoch": 0.32664693631480873,
"grad_norm": 6.035950660705566,
"learning_rate": 3.796672471646848e-05,
"loss": 2.9874,
"num_input_tokens_seen": 1957928,
"step": 2980
},
{
"epoch": 0.327195001644196,
"grad_norm": 8.303248405456543,
"learning_rate": 3.7929903151983934e-05,
"loss": 3.4268,
"num_input_tokens_seen": 1961240,
"step": 2985
},
{
"epoch": 0.32774306697358324,
"grad_norm": 6.161063194274902,
"learning_rate": 3.789304325566801e-05,
"loss": 2.8965,
"num_input_tokens_seen": 1963864,
"step": 2990
},
{
"epoch": 0.3282911323029705,
"grad_norm": 5.629215717315674,
"learning_rate": 3.7856145136795104e-05,
"loss": 3.0241,
"num_input_tokens_seen": 1967656,
"step": 2995
},
{
"epoch": 0.3288391976323578,
"grad_norm": 9.494491577148438,
"learning_rate": 3.781920890475294e-05,
"loss": 3.2297,
"num_input_tokens_seen": 1970608,
"step": 3000
},
{
"epoch": 0.32938726296174503,
"grad_norm": 4.975097179412842,
"learning_rate": 3.7782234669042186e-05,
"loss": 3.1757,
"num_input_tokens_seen": 1973664,
"step": 3005
},
{
"epoch": 0.3299353282911323,
"grad_norm": 7.1082258224487305,
"learning_rate": 3.7745222539276224e-05,
"loss": 3.1921,
"num_input_tokens_seen": 1976944,
"step": 3010
},
{
"epoch": 0.33048339362051954,
"grad_norm": 11.492435455322266,
"learning_rate": 3.770817262518076e-05,
"loss": 3.1751,
"num_input_tokens_seen": 1980160,
"step": 3015
},
{
"epoch": 0.3310314589499068,
"grad_norm": 6.560080051422119,
"learning_rate": 3.76710850365935e-05,
"loss": 3.0906,
"num_input_tokens_seen": 1983576,
"step": 3020
},
{
"epoch": 0.3315795242792941,
"grad_norm": 7.438432216644287,
"learning_rate": 3.763395988346386e-05,
"loss": 3.1074,
"num_input_tokens_seen": 1985784,
"step": 3025
},
{
"epoch": 0.33212758960868133,
"grad_norm": 7.6575164794921875,
"learning_rate": 3.759679727585262e-05,
"loss": 3.1625,
"num_input_tokens_seen": 1989344,
"step": 3030
},
{
"epoch": 0.3326756549380686,
"grad_norm": 6.756874084472656,
"learning_rate": 3.7559597323931566e-05,
"loss": 3.2758,
"num_input_tokens_seen": 1992304,
"step": 3035
},
{
"epoch": 0.3332237202674559,
"grad_norm": 5.427942276000977,
"learning_rate": 3.7522360137983235e-05,
"loss": 3.1905,
"num_input_tokens_seen": 1996120,
"step": 3040
},
{
"epoch": 0.3337717855968431,
"grad_norm": 5.814554691314697,
"learning_rate": 3.748508582840052e-05,
"loss": 2.8693,
"num_input_tokens_seen": 1999176,
"step": 3045
},
{
"epoch": 0.3343198509262304,
"grad_norm": 7.720613956451416,
"learning_rate": 3.744777450568638e-05,
"loss": 3.3644,
"num_input_tokens_seen": 2002112,
"step": 3050
},
{
"epoch": 0.3348679162556177,
"grad_norm": 5.780377388000488,
"learning_rate": 3.7410426280453505e-05,
"loss": 2.8918,
"num_input_tokens_seen": 2005800,
"step": 3055
},
{
"epoch": 0.3354159815850049,
"grad_norm": 5.939544677734375,
"learning_rate": 3.737304126342398e-05,
"loss": 3.0217,
"num_input_tokens_seen": 2009192,
"step": 3060
},
{
"epoch": 0.3359640469143922,
"grad_norm": 6.661081314086914,
"learning_rate": 3.7335619565428964e-05,
"loss": 3.2056,
"num_input_tokens_seen": 2012280,
"step": 3065
},
{
"epoch": 0.33651211224377947,
"grad_norm": 4.9228620529174805,
"learning_rate": 3.729816129740836e-05,
"loss": 3.106,
"num_input_tokens_seen": 2014984,
"step": 3070
},
{
"epoch": 0.3370601775731667,
"grad_norm": 6.285070896148682,
"learning_rate": 3.726066657041051e-05,
"loss": 3.1639,
"num_input_tokens_seen": 2019048,
"step": 3075
},
{
"epoch": 0.337608242902554,
"grad_norm": 6.625104904174805,
"learning_rate": 3.7223135495591776e-05,
"loss": 3.2258,
"num_input_tokens_seen": 2022776,
"step": 3080
},
{
"epoch": 0.33815630823194126,
"grad_norm": 8.347160339355469,
"learning_rate": 3.718556818421636e-05,
"loss": 3.4006,
"num_input_tokens_seen": 2026304,
"step": 3085
},
{
"epoch": 0.3387043735613285,
"grad_norm": 9.37065601348877,
"learning_rate": 3.7147964747655836e-05,
"loss": 3.2778,
"num_input_tokens_seen": 2030200,
"step": 3090
},
{
"epoch": 0.33925243889071577,
"grad_norm": 6.341724872589111,
"learning_rate": 3.711032529738887e-05,
"loss": 3.5654,
"num_input_tokens_seen": 2033656,
"step": 3095
},
{
"epoch": 0.33980050422010305,
"grad_norm": 6.54714298248291,
"learning_rate": 3.7072649945000936e-05,
"loss": 3.0664,
"num_input_tokens_seen": 2037328,
"step": 3100
},
{
"epoch": 0.3403485695494903,
"grad_norm": 6.289731979370117,
"learning_rate": 3.703493880218391e-05,
"loss": 2.8214,
"num_input_tokens_seen": 2040488,
"step": 3105
},
{
"epoch": 0.34089663487887756,
"grad_norm": 8.150530815124512,
"learning_rate": 3.699719198073578e-05,
"loss": 3.2654,
"num_input_tokens_seen": 2043256,
"step": 3110
},
{
"epoch": 0.34144470020826484,
"grad_norm": 7.053910255432129,
"learning_rate": 3.6959409592560304e-05,
"loss": 3.3008,
"num_input_tokens_seen": 2046064,
"step": 3115
},
{
"epoch": 0.34199276553765207,
"grad_norm": 5.083940505981445,
"learning_rate": 3.69215917496667e-05,
"loss": 3.0999,
"num_input_tokens_seen": 2049568,
"step": 3120
},
{
"epoch": 0.34254083086703935,
"grad_norm": 5.558229446411133,
"learning_rate": 3.6883738564169254e-05,
"loss": 3.4491,
"num_input_tokens_seen": 2052400,
"step": 3125
},
{
"epoch": 0.34308889619642663,
"grad_norm": 7.365407466888428,
"learning_rate": 3.684585014828708e-05,
"loss": 3.1569,
"num_input_tokens_seen": 2055864,
"step": 3130
},
{
"epoch": 0.34363696152581386,
"grad_norm": 7.316169738769531,
"learning_rate": 3.680792661434368e-05,
"loss": 3.1274,
"num_input_tokens_seen": 2058856,
"step": 3135
},
{
"epoch": 0.34418502685520114,
"grad_norm": 8.32957935333252,
"learning_rate": 3.676996807476671e-05,
"loss": 2.9842,
"num_input_tokens_seen": 2062056,
"step": 3140
},
{
"epoch": 0.3447330921845884,
"grad_norm": 7.238974094390869,
"learning_rate": 3.673197464208759e-05,
"loss": 3.1055,
"num_input_tokens_seen": 2064760,
"step": 3145
},
{
"epoch": 0.34528115751397565,
"grad_norm": 8.2353515625,
"learning_rate": 3.669394642894118e-05,
"loss": 2.7765,
"num_input_tokens_seen": 2068440,
"step": 3150
},
{
"epoch": 0.3458292228433629,
"grad_norm": 7.214339256286621,
"learning_rate": 3.665588354806545e-05,
"loss": 3.0102,
"num_input_tokens_seen": 2072136,
"step": 3155
},
{
"epoch": 0.3463772881727502,
"grad_norm": 6.484249114990234,
"learning_rate": 3.661778611230114e-05,
"loss": 3.2456,
"num_input_tokens_seen": 2074560,
"step": 3160
},
{
"epoch": 0.34692535350213743,
"grad_norm": 6.298303604125977,
"learning_rate": 3.657965423459145e-05,
"loss": 3.3588,
"num_input_tokens_seen": 2077248,
"step": 3165
},
{
"epoch": 0.3474734188315247,
"grad_norm": 8.595486640930176,
"learning_rate": 3.6541488027981675e-05,
"loss": 2.9303,
"num_input_tokens_seen": 2080160,
"step": 3170
},
{
"epoch": 0.348021484160912,
"grad_norm": 7.8414740562438965,
"learning_rate": 3.650328760561887e-05,
"loss": 3.5767,
"num_input_tokens_seen": 2082320,
"step": 3175
},
{
"epoch": 0.3485695494902992,
"grad_norm": 5.1522908210754395,
"learning_rate": 3.646505308075154e-05,
"loss": 3.1739,
"num_input_tokens_seen": 2085104,
"step": 3180
},
{
"epoch": 0.3491176148196865,
"grad_norm": 9.065922737121582,
"learning_rate": 3.642678456672929e-05,
"loss": 3.3567,
"num_input_tokens_seen": 2087800,
"step": 3185
},
{
"epoch": 0.3496656801490738,
"grad_norm": 11.175498962402344,
"learning_rate": 3.638848217700248e-05,
"loss": 3.3376,
"num_input_tokens_seen": 2090776,
"step": 3190
},
{
"epoch": 0.350213745478461,
"grad_norm": 7.90383768081665,
"learning_rate": 3.63501460251219e-05,
"loss": 2.9388,
"num_input_tokens_seen": 2093152,
"step": 3195
},
{
"epoch": 0.3507618108078483,
"grad_norm": 7.013014316558838,
"learning_rate": 3.6311776224738435e-05,
"loss": 3.0298,
"num_input_tokens_seen": 2096192,
"step": 3200
},
{
"epoch": 0.3513098761372356,
"grad_norm": 4.87260103225708,
"learning_rate": 3.627337288960272e-05,
"loss": 3.3596,
"num_input_tokens_seen": 2100256,
"step": 3205
},
{
"epoch": 0.3518579414666228,
"grad_norm": 7.644909858703613,
"learning_rate": 3.6234936133564823e-05,
"loss": 3.1154,
"num_input_tokens_seen": 2102928,
"step": 3210
},
{
"epoch": 0.3524060067960101,
"grad_norm": 5.678354263305664,
"learning_rate": 3.619646607057386e-05,
"loss": 2.8941,
"num_input_tokens_seen": 2106944,
"step": 3215
},
{
"epoch": 0.35295407212539737,
"grad_norm": 5.123593330383301,
"learning_rate": 3.61579628146777e-05,
"loss": 3.1417,
"num_input_tokens_seen": 2111496,
"step": 3220
},
{
"epoch": 0.3535021374547846,
"grad_norm": 5.542695999145508,
"learning_rate": 3.611942648002265e-05,
"loss": 3.1733,
"num_input_tokens_seen": 2114960,
"step": 3225
},
{
"epoch": 0.3540502027841719,
"grad_norm": 8.204092025756836,
"learning_rate": 3.6080857180853025e-05,
"loss": 3.4422,
"num_input_tokens_seen": 2117528,
"step": 3230
},
{
"epoch": 0.35459826811355916,
"grad_norm": 6.3048014640808105,
"learning_rate": 3.6042255031510895e-05,
"loss": 3.3049,
"num_input_tokens_seen": 2121312,
"step": 3235
},
{
"epoch": 0.3551463334429464,
"grad_norm": 8.287495613098145,
"learning_rate": 3.600362014643573e-05,
"loss": 3.2349,
"num_input_tokens_seen": 2125296,
"step": 3240
},
{
"epoch": 0.35569439877233366,
"grad_norm": 7.690340995788574,
"learning_rate": 3.5964952640164016e-05,
"loss": 3.4982,
"num_input_tokens_seen": 2127944,
"step": 3245
},
{
"epoch": 0.35624246410172095,
"grad_norm": 5.382369518280029,
"learning_rate": 3.592625262732898e-05,
"loss": 3.3248,
"num_input_tokens_seen": 2131200,
"step": 3250
},
{
"epoch": 0.35679052943110817,
"grad_norm": 7.964527606964111,
"learning_rate": 3.58875202226602e-05,
"loss": 3.2188,
"num_input_tokens_seen": 2133648,
"step": 3255
},
{
"epoch": 0.35733859476049545,
"grad_norm": 5.458812236785889,
"learning_rate": 3.5848755540983286e-05,
"loss": 3.3385,
"num_input_tokens_seen": 2136960,
"step": 3260
},
{
"epoch": 0.35788666008988274,
"grad_norm": 7.087930679321289,
"learning_rate": 3.580995869721953e-05,
"loss": 3.0703,
"num_input_tokens_seen": 2140656,
"step": 3265
},
{
"epoch": 0.35843472541926996,
"grad_norm": 6.762202262878418,
"learning_rate": 3.577112980638557e-05,
"loss": 2.9214,
"num_input_tokens_seen": 2143360,
"step": 3270
},
{
"epoch": 0.35898279074865724,
"grad_norm": 6.3621649742126465,
"learning_rate": 3.573226898359308e-05,
"loss": 3.4276,
"num_input_tokens_seen": 2146456,
"step": 3275
},
{
"epoch": 0.3595308560780445,
"grad_norm": 8.797203063964844,
"learning_rate": 3.5693376344048344e-05,
"loss": 3.0474,
"num_input_tokens_seen": 2149336,
"step": 3280
},
{
"epoch": 0.36007892140743175,
"grad_norm": 7.268299579620361,
"learning_rate": 3.5654452003052033e-05,
"loss": 2.8497,
"num_input_tokens_seen": 2152960,
"step": 3285
},
{
"epoch": 0.36062698673681903,
"grad_norm": 8.053544044494629,
"learning_rate": 3.5615496075998744e-05,
"loss": 3.6495,
"num_input_tokens_seen": 2157104,
"step": 3290
},
{
"epoch": 0.3611750520662063,
"grad_norm": 6.6186604499816895,
"learning_rate": 3.5576508678376743e-05,
"loss": 2.9909,
"num_input_tokens_seen": 2159576,
"step": 3295
},
{
"epoch": 0.36172311739559354,
"grad_norm": 6.244167327880859,
"learning_rate": 3.55374899257676e-05,
"loss": 3.064,
"num_input_tokens_seen": 2163112,
"step": 3300
},
{
"epoch": 0.3622711827249808,
"grad_norm": 7.658557891845703,
"learning_rate": 3.549843993384582e-05,
"loss": 3.1039,
"num_input_tokens_seen": 2166048,
"step": 3305
},
{
"epoch": 0.3628192480543681,
"grad_norm": 5.7698140144348145,
"learning_rate": 3.545935881837852e-05,
"loss": 2.9442,
"num_input_tokens_seen": 2169192,
"step": 3310
},
{
"epoch": 0.36336731338375533,
"grad_norm": 6.534774303436279,
"learning_rate": 3.542024669522511e-05,
"loss": 2.9845,
"num_input_tokens_seen": 2172544,
"step": 3315
},
{
"epoch": 0.3639153787131426,
"grad_norm": 5.373234748840332,
"learning_rate": 3.538110368033689e-05,
"loss": 3.0865,
"num_input_tokens_seen": 2176280,
"step": 3320
},
{
"epoch": 0.3644634440425299,
"grad_norm": 6.9778547286987305,
"learning_rate": 3.5341929889756775e-05,
"loss": 3.1341,
"num_input_tokens_seen": 2179792,
"step": 3325
},
{
"epoch": 0.3650115093719171,
"grad_norm": 10.10000991821289,
"learning_rate": 3.530272543961888e-05,
"loss": 3.3558,
"num_input_tokens_seen": 2182776,
"step": 3330
},
{
"epoch": 0.3655595747013044,
"grad_norm": 6.022150993347168,
"learning_rate": 3.526349044614826e-05,
"loss": 3.1005,
"num_input_tokens_seen": 2186112,
"step": 3335
},
{
"epoch": 0.3661076400306917,
"grad_norm": 6.781782150268555,
"learning_rate": 3.522422502566047e-05,
"loss": 3.3438,
"num_input_tokens_seen": 2188600,
"step": 3340
},
{
"epoch": 0.3666557053600789,
"grad_norm": 4.399787425994873,
"learning_rate": 3.51849292945613e-05,
"loss": 3.0477,
"num_input_tokens_seen": 2191600,
"step": 3345
},
{
"epoch": 0.3672037706894662,
"grad_norm": 6.852601528167725,
"learning_rate": 3.51456033693464e-05,
"loss": 2.8756,
"num_input_tokens_seen": 2194544,
"step": 3350
},
{
"epoch": 0.3677518360188535,
"grad_norm": 7.015017509460449,
"learning_rate": 3.510624736660091e-05,
"loss": 3.6253,
"num_input_tokens_seen": 2198296,
"step": 3355
},
{
"epoch": 0.3682999013482407,
"grad_norm": 4.540085792541504,
"learning_rate": 3.506686140299915e-05,
"loss": 2.9568,
"num_input_tokens_seen": 2201384,
"step": 3360
},
{
"epoch": 0.368847966677628,
"grad_norm": 9.393879890441895,
"learning_rate": 3.502744559530426e-05,
"loss": 3.1794,
"num_input_tokens_seen": 2205720,
"step": 3365
},
{
"epoch": 0.36939603200701526,
"grad_norm": 7.7508344650268555,
"learning_rate": 3.498800006036788e-05,
"loss": 3.0188,
"num_input_tokens_seen": 2210344,
"step": 3370
},
{
"epoch": 0.3699440973364025,
"grad_norm": 5.801796913146973,
"learning_rate": 3.4948524915129726e-05,
"loss": 3.1028,
"num_input_tokens_seen": 2213264,
"step": 3375
},
{
"epoch": 0.37049216266578977,
"grad_norm": 6.9859938621521,
"learning_rate": 3.490902027661734e-05,
"loss": 3.5774,
"num_input_tokens_seen": 2216560,
"step": 3380
},
{
"epoch": 0.37104022799517705,
"grad_norm": 5.871939659118652,
"learning_rate": 3.4869486261945695e-05,
"loss": 3.3648,
"num_input_tokens_seen": 2219376,
"step": 3385
},
{
"epoch": 0.3715882933245643,
"grad_norm": 6.051314830780029,
"learning_rate": 3.482992298831682e-05,
"loss": 3.2641,
"num_input_tokens_seen": 2222568,
"step": 3390
},
{
"epoch": 0.37213635865395156,
"grad_norm": 7.149409294128418,
"learning_rate": 3.4790330573019524e-05,
"loss": 3.0127,
"num_input_tokens_seen": 2225232,
"step": 3395
},
{
"epoch": 0.37268442398333884,
"grad_norm": 5.8362650871276855,
"learning_rate": 3.4750709133429e-05,
"loss": 3.2417,
"num_input_tokens_seen": 2228360,
"step": 3400
},
{
"epoch": 0.37323248931272607,
"grad_norm": 6.061380386352539,
"learning_rate": 3.471105878700646e-05,
"loss": 3.4256,
"num_input_tokens_seen": 2231864,
"step": 3405
},
{
"epoch": 0.37378055464211335,
"grad_norm": 7.543921947479248,
"learning_rate": 3.467137965129884e-05,
"loss": 3.1154,
"num_input_tokens_seen": 2234400,
"step": 3410
},
{
"epoch": 0.3743286199715006,
"grad_norm": 4.8110151290893555,
"learning_rate": 3.463167184393843e-05,
"loss": 3.1221,
"num_input_tokens_seen": 2238056,
"step": 3415
},
{
"epoch": 0.37487668530088786,
"grad_norm": 7.194852352142334,
"learning_rate": 3.459193548264248e-05,
"loss": 3.4609,
"num_input_tokens_seen": 2240472,
"step": 3420
},
{
"epoch": 0.37542475063027514,
"grad_norm": 7.457151889801025,
"learning_rate": 3.4552170685212936e-05,
"loss": 3.1907,
"num_input_tokens_seen": 2243944,
"step": 3425
},
{
"epoch": 0.37597281595966237,
"grad_norm": 8.671926498413086,
"learning_rate": 3.4512377569536025e-05,
"loss": 3.0142,
"num_input_tokens_seen": 2246376,
"step": 3430
},
{
"epoch": 0.37652088128904965,
"grad_norm": 6.243984222412109,
"learning_rate": 3.447255625358191e-05,
"loss": 3.094,
"num_input_tokens_seen": 2249288,
"step": 3435
},
{
"epoch": 0.37706894661843693,
"grad_norm": 7.37971830368042,
"learning_rate": 3.443270685540439e-05,
"loss": 3.4606,
"num_input_tokens_seen": 2252536,
"step": 3440
},
{
"epoch": 0.37761701194782415,
"grad_norm": 6.270237445831299,
"learning_rate": 3.43928294931405e-05,
"loss": 3.1928,
"num_input_tokens_seen": 2255576,
"step": 3445
},
{
"epoch": 0.37816507727721144,
"grad_norm": 5.272236347198486,
"learning_rate": 3.435292428501016e-05,
"loss": 3.4196,
"num_input_tokens_seen": 2258456,
"step": 3450
},
{
"epoch": 0.3787131426065987,
"grad_norm": 6.378783226013184,
"learning_rate": 3.431299134931587e-05,
"loss": 3.3069,
"num_input_tokens_seen": 2261160,
"step": 3455
},
{
"epoch": 0.37926120793598594,
"grad_norm": 7.296474456787109,
"learning_rate": 3.427303080444232e-05,
"loss": 3.3306,
"num_input_tokens_seen": 2263808,
"step": 3460
},
{
"epoch": 0.3798092732653732,
"grad_norm": 6.654740333557129,
"learning_rate": 3.423304276885605e-05,
"loss": 2.871,
"num_input_tokens_seen": 2267280,
"step": 3465
},
{
"epoch": 0.3803573385947605,
"grad_norm": 7.27192497253418,
"learning_rate": 3.419302736110508e-05,
"loss": 3.3171,
"num_input_tokens_seen": 2270632,
"step": 3470
},
{
"epoch": 0.38090540392414773,
"grad_norm": 5.948354721069336,
"learning_rate": 3.4152984699818614e-05,
"loss": 3.4794,
"num_input_tokens_seen": 2273960,
"step": 3475
},
{
"epoch": 0.381453469253535,
"grad_norm": 6.537465572357178,
"learning_rate": 3.4112914903706616e-05,
"loss": 3.1609,
"num_input_tokens_seen": 2277568,
"step": 3480
},
{
"epoch": 0.3820015345829223,
"grad_norm": 13.15424919128418,
"learning_rate": 3.4072818091559524e-05,
"loss": 3.0777,
"num_input_tokens_seen": 2279976,
"step": 3485
},
{
"epoch": 0.3825495999123095,
"grad_norm": 5.581765174865723,
"learning_rate": 3.403269438224784e-05,
"loss": 3.1242,
"num_input_tokens_seen": 2282912,
"step": 3490
},
{
"epoch": 0.3830976652416968,
"grad_norm": 5.730728626251221,
"learning_rate": 3.3992543894721825e-05,
"loss": 3.2418,
"num_input_tokens_seen": 2286272,
"step": 3495
},
{
"epoch": 0.3836457305710841,
"grad_norm": 9.713155746459961,
"learning_rate": 3.3952366748011114e-05,
"loss": 3.17,
"num_input_tokens_seen": 2289944,
"step": 3500
},
{
"epoch": 0.3841937959004713,
"grad_norm": 6.645389556884766,
"learning_rate": 3.391216306122439e-05,
"loss": 3.3796,
"num_input_tokens_seen": 2292688,
"step": 3505
},
{
"epoch": 0.3847418612298586,
"grad_norm": 7.148984432220459,
"learning_rate": 3.3871932953549005e-05,
"loss": 3.282,
"num_input_tokens_seen": 2295584,
"step": 3510
},
{
"epoch": 0.3852899265592459,
"grad_norm": 5.25370979309082,
"learning_rate": 3.3831676544250616e-05,
"loss": 2.9293,
"num_input_tokens_seen": 2298440,
"step": 3515
},
{
"epoch": 0.3858379918886331,
"grad_norm": 5.668978214263916,
"learning_rate": 3.3791393952672915e-05,
"loss": 3.0635,
"num_input_tokens_seen": 2301024,
"step": 3520
},
{
"epoch": 0.3863860572180204,
"grad_norm": 4.52470064163208,
"learning_rate": 3.375108529823715e-05,
"loss": 3.0398,
"num_input_tokens_seen": 2304392,
"step": 3525
},
{
"epoch": 0.38693412254740767,
"grad_norm": 5.700072288513184,
"learning_rate": 3.371075070044186e-05,
"loss": 3.0855,
"num_input_tokens_seen": 2307688,
"step": 3530
},
{
"epoch": 0.3874821878767949,
"grad_norm": 5.35679292678833,
"learning_rate": 3.367039027886252e-05,
"loss": 3.2953,
"num_input_tokens_seen": 2312384,
"step": 3535
},
{
"epoch": 0.3880302532061822,
"grad_norm": 6.735170841217041,
"learning_rate": 3.363000415315111e-05,
"loss": 3.1434,
"num_input_tokens_seen": 2315864,
"step": 3540
},
{
"epoch": 0.38857831853556946,
"grad_norm": 6.647335052490234,
"learning_rate": 3.358959244303585e-05,
"loss": 3.2033,
"num_input_tokens_seen": 2319744,
"step": 3545
},
{
"epoch": 0.3891263838649567,
"grad_norm": 6.841831684112549,
"learning_rate": 3.354915526832082e-05,
"loss": 3.3414,
"num_input_tokens_seen": 2322856,
"step": 3550
},
{
"epoch": 0.38967444919434396,
"grad_norm": 7.023780822753906,
"learning_rate": 3.350869274888554e-05,
"loss": 3.1525,
"num_input_tokens_seen": 2326016,
"step": 3555
},
{
"epoch": 0.39022251452373125,
"grad_norm": 8.96906852722168,
"learning_rate": 3.3468205004684695e-05,
"loss": 3.2852,
"num_input_tokens_seen": 2330120,
"step": 3560
},
{
"epoch": 0.39077057985311847,
"grad_norm": 7.874572277069092,
"learning_rate": 3.3427692155747766e-05,
"loss": 2.9457,
"num_input_tokens_seen": 2332776,
"step": 3565
},
{
"epoch": 0.39131864518250575,
"grad_norm": 6.962822914123535,
"learning_rate": 3.338715432217865e-05,
"loss": 3.0687,
"num_input_tokens_seen": 2336856,
"step": 3570
},
{
"epoch": 0.39186671051189303,
"grad_norm": 6.802676200866699,
"learning_rate": 3.334659162415529e-05,
"loss": 3.6562,
"num_input_tokens_seen": 2339768,
"step": 3575
},
{
"epoch": 0.39241477584128026,
"grad_norm": 7.828624725341797,
"learning_rate": 3.3306004181929375e-05,
"loss": 3.2111,
"num_input_tokens_seen": 2342920,
"step": 3580
},
{
"epoch": 0.39296284117066754,
"grad_norm": 7.1746320724487305,
"learning_rate": 3.326539211582592e-05,
"loss": 3.2333,
"num_input_tokens_seen": 2346656,
"step": 3585
},
{
"epoch": 0.3935109065000548,
"grad_norm": 7.000988006591797,
"learning_rate": 3.3224755546242967e-05,
"loss": 3.3291,
"num_input_tokens_seen": 2351008,
"step": 3590
},
{
"epoch": 0.39405897182944205,
"grad_norm": 6.557620048522949,
"learning_rate": 3.3184094593651196e-05,
"loss": 2.7686,
"num_input_tokens_seen": 2354160,
"step": 3595
},
{
"epoch": 0.39460703715882933,
"grad_norm": 7.011937618255615,
"learning_rate": 3.314340937859356e-05,
"loss": 3.4913,
"num_input_tokens_seen": 2357464,
"step": 3600
},
{
"epoch": 0.3951551024882166,
"grad_norm": 6.284838676452637,
"learning_rate": 3.310270002168493e-05,
"loss": 2.835,
"num_input_tokens_seen": 2360488,
"step": 3605
},
{
"epoch": 0.39570316781760384,
"grad_norm": 7.415198802947998,
"learning_rate": 3.306196664361178e-05,
"loss": 2.9347,
"num_input_tokens_seen": 2363448,
"step": 3610
},
{
"epoch": 0.3962512331469911,
"grad_norm": 7.382150650024414,
"learning_rate": 3.302120936513177e-05,
"loss": 3.3669,
"num_input_tokens_seen": 2365800,
"step": 3615
},
{
"epoch": 0.3967992984763784,
"grad_norm": 5.894745349884033,
"learning_rate": 3.2980428307073435e-05,
"loss": 2.8094,
"num_input_tokens_seen": 2369016,
"step": 3620
},
{
"epoch": 0.39734736380576563,
"grad_norm": 6.539662837982178,
"learning_rate": 3.29396235903358e-05,
"loss": 3.1544,
"num_input_tokens_seen": 2372144,
"step": 3625
},
{
"epoch": 0.3978954291351529,
"grad_norm": 6.1463799476623535,
"learning_rate": 3.2898795335888005e-05,
"loss": 3.2679,
"num_input_tokens_seen": 2374656,
"step": 3630
},
{
"epoch": 0.3984434944645402,
"grad_norm": 8.810948371887207,
"learning_rate": 3.2857943664769e-05,
"loss": 3.394,
"num_input_tokens_seen": 2378056,
"step": 3635
},
{
"epoch": 0.3989915597939274,
"grad_norm": 10.048519134521484,
"learning_rate": 3.2817068698087164e-05,
"loss": 3.4094,
"num_input_tokens_seen": 2380792,
"step": 3640
},
{
"epoch": 0.3995396251233147,
"grad_norm": 8.441570281982422,
"learning_rate": 3.277617055701989e-05,
"loss": 2.9142,
"num_input_tokens_seen": 2383912,
"step": 3645
},
{
"epoch": 0.400087690452702,
"grad_norm": 5.723228931427002,
"learning_rate": 3.273524936281331e-05,
"loss": 3.2162,
"num_input_tokens_seen": 2386592,
"step": 3650
},
{
"epoch": 0.4006357557820892,
"grad_norm": 5.869374752044678,
"learning_rate": 3.2694305236781904e-05,
"loss": 3.301,
"num_input_tokens_seen": 2390144,
"step": 3655
},
{
"epoch": 0.4011838211114765,
"grad_norm": 6.342257499694824,
"learning_rate": 3.26533383003081e-05,
"loss": 3.2055,
"num_input_tokens_seen": 2393872,
"step": 3660
},
{
"epoch": 0.4017318864408638,
"grad_norm": 6.534188270568848,
"learning_rate": 3.2612348674841995e-05,
"loss": 3.0935,
"num_input_tokens_seen": 2396648,
"step": 3665
},
{
"epoch": 0.402279951770251,
"grad_norm": 7.0050272941589355,
"learning_rate": 3.2571336481900926e-05,
"loss": 3.2582,
"num_input_tokens_seen": 2400328,
"step": 3670
},
{
"epoch": 0.4028280170996383,
"grad_norm": 8.4814453125,
"learning_rate": 3.253030184306912e-05,
"loss": 3.3026,
"num_input_tokens_seen": 2403080,
"step": 3675
},
{
"epoch": 0.40337608242902556,
"grad_norm": 7.716960906982422,
"learning_rate": 3.248924487999737e-05,
"loss": 3.052,
"num_input_tokens_seen": 2406352,
"step": 3680
},
{
"epoch": 0.4039241477584128,
"grad_norm": 6.716127395629883,
"learning_rate": 3.244816571440265e-05,
"loss": 3.2428,
"num_input_tokens_seen": 2409496,
"step": 3685
},
{
"epoch": 0.40447221308780007,
"grad_norm": 8.213761329650879,
"learning_rate": 3.240706446806773e-05,
"loss": 2.9107,
"num_input_tokens_seen": 2414032,
"step": 3690
},
{
"epoch": 0.40502027841718735,
"grad_norm": 6.492610931396484,
"learning_rate": 3.236594126284086e-05,
"loss": 3.293,
"num_input_tokens_seen": 2417472,
"step": 3695
},
{
"epoch": 0.4055683437465746,
"grad_norm": 6.562194347381592,
"learning_rate": 3.23247962206354e-05,
"loss": 3.4693,
"num_input_tokens_seen": 2420224,
"step": 3700
},
{
"epoch": 0.40611640907596186,
"grad_norm": 6.379699230194092,
"learning_rate": 3.228362946342942e-05,
"loss": 3.2036,
"num_input_tokens_seen": 2425376,
"step": 3705
},
{
"epoch": 0.40666447440534914,
"grad_norm": 8.669161796569824,
"learning_rate": 3.2242441113265395e-05,
"loss": 3.3417,
"num_input_tokens_seen": 2429616,
"step": 3710
},
{
"epoch": 0.40721253973473637,
"grad_norm": 4.813148021697998,
"learning_rate": 3.220123129224979e-05,
"loss": 2.9484,
"num_input_tokens_seen": 2433168,
"step": 3715
},
{
"epoch": 0.40776060506412365,
"grad_norm": 6.526965141296387,
"learning_rate": 3.216000012255273e-05,
"loss": 3.5202,
"num_input_tokens_seen": 2435880,
"step": 3720
},
{
"epoch": 0.40830867039351093,
"grad_norm": 7.899510860443115,
"learning_rate": 3.211874772640765e-05,
"loss": 3.2844,
"num_input_tokens_seen": 2439232,
"step": 3725
},
{
"epoch": 0.40885673572289816,
"grad_norm": 6.932427406311035,
"learning_rate": 3.2077474226110866e-05,
"loss": 3.5213,
"num_input_tokens_seen": 2443400,
"step": 3730
},
{
"epoch": 0.40940480105228544,
"grad_norm": 6.4443793296813965,
"learning_rate": 3.203617974402131e-05,
"loss": 3.4504,
"num_input_tokens_seen": 2446448,
"step": 3735
},
{
"epoch": 0.4099528663816727,
"grad_norm": 6.693415641784668,
"learning_rate": 3.199486440256009e-05,
"loss": 3.6388,
"num_input_tokens_seen": 2450016,
"step": 3740
},
{
"epoch": 0.41050093171105995,
"grad_norm": 6.27035665512085,
"learning_rate": 3.195352832421015e-05,
"loss": 3.4589,
"num_input_tokens_seen": 2452584,
"step": 3745
},
{
"epoch": 0.41104899704044723,
"grad_norm": 6.987046241760254,
"learning_rate": 3.191217163151593e-05,
"loss": 3.484,
"num_input_tokens_seen": 2455440,
"step": 3750
},
{
"epoch": 0.4115970623698345,
"grad_norm": 5.9024200439453125,
"learning_rate": 3.187079444708296e-05,
"loss": 2.9859,
"num_input_tokens_seen": 2459048,
"step": 3755
},
{
"epoch": 0.41214512769922174,
"grad_norm": 5.624914646148682,
"learning_rate": 3.182939689357753e-05,
"loss": 3.317,
"num_input_tokens_seen": 2463488,
"step": 3760
},
{
"epoch": 0.412693193028609,
"grad_norm": 5.933727264404297,
"learning_rate": 3.1787979093726314e-05,
"loss": 3.1318,
"num_input_tokens_seen": 2466560,
"step": 3765
},
{
"epoch": 0.4132412583579963,
"grad_norm": 8.507558822631836,
"learning_rate": 3.1746541170316036e-05,
"loss": 3.5896,
"num_input_tokens_seen": 2469072,
"step": 3770
},
{
"epoch": 0.4137893236873835,
"grad_norm": 6.940069198608398,
"learning_rate": 3.1705083246193015e-05,
"loss": 3.5636,
"num_input_tokens_seen": 2471528,
"step": 3775
},
{
"epoch": 0.4143373890167708,
"grad_norm": 7.710633277893066,
"learning_rate": 3.166360544426293e-05,
"loss": 3.373,
"num_input_tokens_seen": 2474672,
"step": 3780
},
{
"epoch": 0.4148854543461581,
"grad_norm": 6.710258960723877,
"learning_rate": 3.1622107887490354e-05,
"loss": 2.9773,
"num_input_tokens_seen": 2478184,
"step": 3785
},
{
"epoch": 0.4154335196755453,
"grad_norm": 6.593062400817871,
"learning_rate": 3.158059069889843e-05,
"loss": 3.1045,
"num_input_tokens_seen": 2481016,
"step": 3790
},
{
"epoch": 0.4159815850049326,
"grad_norm": 8.369247436523438,
"learning_rate": 3.1539054001568493e-05,
"loss": 2.7624,
"num_input_tokens_seen": 2483976,
"step": 3795
},
{
"epoch": 0.4165296503343199,
"grad_norm": 5.184842586517334,
"learning_rate": 3.149749791863974e-05,
"loss": 3.2427,
"num_input_tokens_seen": 2486960,
"step": 3800
},
{
"epoch": 0.4170777156637071,
"grad_norm": 5.449498653411865,
"learning_rate": 3.145592257330881e-05,
"loss": 3.3931,
"num_input_tokens_seen": 2490928,
"step": 3805
},
{
"epoch": 0.4176257809930944,
"grad_norm": 7.610599994659424,
"learning_rate": 3.141432808882946e-05,
"loss": 3.3562,
"num_input_tokens_seen": 2494760,
"step": 3810
},
{
"epoch": 0.4181738463224816,
"grad_norm": 6.789968490600586,
"learning_rate": 3.13727145885122e-05,
"loss": 2.823,
"num_input_tokens_seen": 2498352,
"step": 3815
},
{
"epoch": 0.4187219116518689,
"grad_norm": 6.654449462890625,
"learning_rate": 3.133108219572388e-05,
"loss": 3.2867,
"num_input_tokens_seen": 2501440,
"step": 3820
},
{
"epoch": 0.4192699769812562,
"grad_norm": 6.487675189971924,
"learning_rate": 3.1289431033887386e-05,
"loss": 3.3113,
"num_input_tokens_seen": 2504560,
"step": 3825
},
{
"epoch": 0.4198180423106434,
"grad_norm": 7.911233901977539,
"learning_rate": 3.1247761226481244e-05,
"loss": 2.8476,
"num_input_tokens_seen": 2507984,
"step": 3830
},
{
"epoch": 0.4203661076400307,
"grad_norm": 7.292878150939941,
"learning_rate": 3.120607289703925e-05,
"loss": 2.9229,
"num_input_tokens_seen": 2511632,
"step": 3835
},
{
"epoch": 0.42091417296941797,
"grad_norm": 7.699312686920166,
"learning_rate": 3.11643661691501e-05,
"loss": 3.2728,
"num_input_tokens_seen": 2514512,
"step": 3840
},
{
"epoch": 0.4214622382988052,
"grad_norm": 7.424167156219482,
"learning_rate": 3.112264116645705e-05,
"loss": 3.0013,
"num_input_tokens_seen": 2517840,
"step": 3845
},
{
"epoch": 0.4220103036281925,
"grad_norm": 6.991738796234131,
"learning_rate": 3.1080898012657536e-05,
"loss": 2.9434,
"num_input_tokens_seen": 2521296,
"step": 3850
},
{
"epoch": 0.42255836895757976,
"grad_norm": 6.644684314727783,
"learning_rate": 3.103913683150278e-05,
"loss": 3.4346,
"num_input_tokens_seen": 2523800,
"step": 3855
},
{
"epoch": 0.423106434286967,
"grad_norm": 6.666325092315674,
"learning_rate": 3.099735774679749e-05,
"loss": 3.2123,
"num_input_tokens_seen": 2526096,
"step": 3860
},
{
"epoch": 0.42365449961635426,
"grad_norm": 9.987031936645508,
"learning_rate": 3.09555608823994e-05,
"loss": 3.2205,
"num_input_tokens_seen": 2528464,
"step": 3865
},
{
"epoch": 0.42420256494574154,
"grad_norm": 8.114043235778809,
"learning_rate": 3.091374636221899e-05,
"loss": 3.1648,
"num_input_tokens_seen": 2530808,
"step": 3870
},
{
"epoch": 0.42475063027512877,
"grad_norm": 7.4291229248046875,
"learning_rate": 3.087191431021908e-05,
"loss": 2.874,
"num_input_tokens_seen": 2534400,
"step": 3875
},
{
"epoch": 0.42529869560451605,
"grad_norm": 6.414401054382324,
"learning_rate": 3.083006485041444e-05,
"loss": 3.0927,
"num_input_tokens_seen": 2538584,
"step": 3880
},
{
"epoch": 0.42584676093390333,
"grad_norm": 12.14594554901123,
"learning_rate": 3.078819810687147e-05,
"loss": 3.1133,
"num_input_tokens_seen": 2542184,
"step": 3885
},
{
"epoch": 0.42639482626329056,
"grad_norm": 6.391221046447754,
"learning_rate": 3.074631420370779e-05,
"loss": 3.0244,
"num_input_tokens_seen": 2545592,
"step": 3890
},
{
"epoch": 0.42694289159267784,
"grad_norm": 6.802542686462402,
"learning_rate": 3.0704413265091916e-05,
"loss": 3.2812,
"num_input_tokens_seen": 2548816,
"step": 3895
},
{
"epoch": 0.4274909569220651,
"grad_norm": 7.281493186950684,
"learning_rate": 3.066249541524285e-05,
"loss": 3.3321,
"num_input_tokens_seen": 2552352,
"step": 3900
},
{
"epoch": 0.42803902225145235,
"grad_norm": 6.2967047691345215,
"learning_rate": 3.0620560778429736e-05,
"loss": 3.1571,
"num_input_tokens_seen": 2556072,
"step": 3905
},
{
"epoch": 0.42858708758083963,
"grad_norm": 5.46196174621582,
"learning_rate": 3.0578609478971474e-05,
"loss": 2.9312,
"num_input_tokens_seen": 2559680,
"step": 3910
},
{
"epoch": 0.4291351529102269,
"grad_norm": 6.703193664550781,
"learning_rate": 3.0536641641236366e-05,
"loss": 3.1173,
"num_input_tokens_seen": 2564072,
"step": 3915
},
{
"epoch": 0.42968321823961414,
"grad_norm": 6.250140190124512,
"learning_rate": 3.0494657389641763e-05,
"loss": 2.8173,
"num_input_tokens_seen": 2567848,
"step": 3920
},
{
"epoch": 0.4302312835690014,
"grad_norm": 8.19283676147461,
"learning_rate": 3.0452656848653643e-05,
"loss": 3.1555,
"num_input_tokens_seen": 2570760,
"step": 3925
},
{
"epoch": 0.4307793488983887,
"grad_norm": 4.393120288848877,
"learning_rate": 3.041064014278629e-05,
"loss": 3.3082,
"num_input_tokens_seen": 2574112,
"step": 3930
},
{
"epoch": 0.43132741422777593,
"grad_norm": 7.910434246063232,
"learning_rate": 3.036860739660193e-05,
"loss": 3.0528,
"num_input_tokens_seen": 2578144,
"step": 3935
},
{
"epoch": 0.4318754795571632,
"grad_norm": 8.536887168884277,
"learning_rate": 3.0326558734710304e-05,
"loss": 3.224,
"num_input_tokens_seen": 2581008,
"step": 3940
},
{
"epoch": 0.4324235448865505,
"grad_norm": 5.810432434082031,
"learning_rate": 3.028449428176836e-05,
"loss": 3.2157,
"num_input_tokens_seen": 2583616,
"step": 3945
},
{
"epoch": 0.4329716102159377,
"grad_norm": 7.819321632385254,
"learning_rate": 3.024241416247987e-05,
"loss": 3.3845,
"num_input_tokens_seen": 2587680,
"step": 3950
},
{
"epoch": 0.433519675545325,
"grad_norm": 7.583765506744385,
"learning_rate": 3.0200318501595028e-05,
"loss": 3.4347,
"num_input_tokens_seen": 2590536,
"step": 3955
},
{
"epoch": 0.4340677408747123,
"grad_norm": 6.201939105987549,
"learning_rate": 3.01582074239101e-05,
"loss": 3.0368,
"num_input_tokens_seen": 2593560,
"step": 3960
},
{
"epoch": 0.4346158062040995,
"grad_norm": 6.4165425300598145,
"learning_rate": 3.0116081054267086e-05,
"loss": 3.1866,
"num_input_tokens_seen": 2597464,
"step": 3965
},
{
"epoch": 0.4351638715334868,
"grad_norm": 5.670197486877441,
"learning_rate": 3.007393951755329e-05,
"loss": 3.1721,
"num_input_tokens_seen": 2600616,
"step": 3970
},
{
"epoch": 0.43571193686287407,
"grad_norm": 6.542341709136963,
"learning_rate": 3.0031782938701004e-05,
"loss": 3.1902,
"num_input_tokens_seen": 2603832,
"step": 3975
},
{
"epoch": 0.4362600021922613,
"grad_norm": 11.36231803894043,
"learning_rate": 2.9989611442687087e-05,
"loss": 3.1505,
"num_input_tokens_seen": 2607032,
"step": 3980
},
{
"epoch": 0.4368080675216486,
"grad_norm": 8.223766326904297,
"learning_rate": 2.994742515453264e-05,
"loss": 3.2596,
"num_input_tokens_seen": 2609848,
"step": 3985
},
{
"epoch": 0.43735613285103586,
"grad_norm": 6.220792770385742,
"learning_rate": 2.9905224199302612e-05,
"loss": 3.105,
"num_input_tokens_seen": 2613072,
"step": 3990
},
{
"epoch": 0.4379041981804231,
"grad_norm": 9.295598983764648,
"learning_rate": 2.9863008702105444e-05,
"loss": 3.5309,
"num_input_tokens_seen": 2617216,
"step": 3995
},
{
"epoch": 0.43845226350981037,
"grad_norm": 7.482667446136475,
"learning_rate": 2.9820778788092662e-05,
"loss": 3.0894,
"num_input_tokens_seen": 2620440,
"step": 4000
},
{
"epoch": 0.43900032883919765,
"grad_norm": 8.263635635375977,
"learning_rate": 2.9778534582458563e-05,
"loss": 3.2592,
"num_input_tokens_seen": 2624136,
"step": 4005
},
{
"epoch": 0.4395483941685849,
"grad_norm": 6.1141180992126465,
"learning_rate": 2.973627621043979e-05,
"loss": 2.9611,
"num_input_tokens_seen": 2628416,
"step": 4010
},
{
"epoch": 0.44009645949797216,
"grad_norm": 5.068775653839111,
"learning_rate": 2.969400379731499e-05,
"loss": 3.2408,
"num_input_tokens_seen": 2632360,
"step": 4015
},
{
"epoch": 0.44064452482735944,
"grad_norm": 4.8074049949646,
"learning_rate": 2.965171746840445e-05,
"loss": 3.3503,
"num_input_tokens_seen": 2635144,
"step": 4020
},
{
"epoch": 0.44119259015674667,
"grad_norm": 5.924848556518555,
"learning_rate": 2.9609417349069685e-05,
"loss": 2.8347,
"num_input_tokens_seen": 2638880,
"step": 4025
},
{
"epoch": 0.44174065548613395,
"grad_norm": 6.371955871582031,
"learning_rate": 2.9567103564713107e-05,
"loss": 3.0076,
"num_input_tokens_seen": 2642200,
"step": 4030
},
{
"epoch": 0.44228872081552123,
"grad_norm": 6.616983890533447,
"learning_rate": 2.952477624077764e-05,
"loss": 3.1063,
"num_input_tokens_seen": 2647008,
"step": 4035
},
{
"epoch": 0.44283678614490846,
"grad_norm": 6.057950973510742,
"learning_rate": 2.9482435502746363e-05,
"loss": 2.9816,
"num_input_tokens_seen": 2649824,
"step": 4040
},
{
"epoch": 0.44338485147429574,
"grad_norm": 5.292036533355713,
"learning_rate": 2.944008147614208e-05,
"loss": 2.9774,
"num_input_tokens_seen": 2652424,
"step": 4045
},
{
"epoch": 0.443932916803683,
"grad_norm": 6.374473571777344,
"learning_rate": 2.9397714286527034e-05,
"loss": 2.9106,
"num_input_tokens_seen": 2655792,
"step": 4050
},
{
"epoch": 0.44448098213307025,
"grad_norm": 5.729962348937988,
"learning_rate": 2.9355334059502472e-05,
"loss": 3.1529,
"num_input_tokens_seen": 2658608,
"step": 4055
},
{
"epoch": 0.4450290474624575,
"grad_norm": 8.748932838439941,
"learning_rate": 2.9312940920708277e-05,
"loss": 3.236,
"num_input_tokens_seen": 2661312,
"step": 4060
},
{
"epoch": 0.4455771127918448,
"grad_norm": 8.778289794921875,
"learning_rate": 2.927053499582264e-05,
"loss": 3.1197,
"num_input_tokens_seen": 2665256,
"step": 4065
},
{
"epoch": 0.44612517812123204,
"grad_norm": 8.748550415039062,
"learning_rate": 2.922811641056164e-05,
"loss": 3.2486,
"num_input_tokens_seen": 2669288,
"step": 4070
},
{
"epoch": 0.4466732434506193,
"grad_norm": 5.559131145477295,
"learning_rate": 2.9185685290678888e-05,
"loss": 2.9932,
"num_input_tokens_seen": 2672312,
"step": 4075
},
{
"epoch": 0.4472213087800066,
"grad_norm": 5.6860575675964355,
"learning_rate": 2.9143241761965155e-05,
"loss": 3.1337,
"num_input_tokens_seen": 2676312,
"step": 4080
},
{
"epoch": 0.4477693741093938,
"grad_norm": 7.295080184936523,
"learning_rate": 2.9100785950248015e-05,
"loss": 2.9724,
"num_input_tokens_seen": 2679592,
"step": 4085
},
{
"epoch": 0.4483174394387811,
"grad_norm": 9.514237403869629,
"learning_rate": 2.9058317981391437e-05,
"loss": 3.1765,
"num_input_tokens_seen": 2682472,
"step": 4090
},
{
"epoch": 0.4488655047681684,
"grad_norm": 7.216882705688477,
"learning_rate": 2.901583798129543e-05,
"loss": 3.3707,
"num_input_tokens_seen": 2685328,
"step": 4095
},
{
"epoch": 0.4494135700975556,
"grad_norm": 7.9535298347473145,
"learning_rate": 2.8973346075895695e-05,
"loss": 3.4585,
"num_input_tokens_seen": 2688080,
"step": 4100
},
{
"epoch": 0.4499616354269429,
"grad_norm": 7.782059669494629,
"learning_rate": 2.8930842391163192e-05,
"loss": 2.9516,
"num_input_tokens_seen": 2691112,
"step": 4105
},
{
"epoch": 0.4505097007563302,
"grad_norm": 6.065903186798096,
"learning_rate": 2.8888327053103836e-05,
"loss": 3.0919,
"num_input_tokens_seen": 2694328,
"step": 4110
},
{
"epoch": 0.4510577660857174,
"grad_norm": 6.912715435028076,
"learning_rate": 2.884580018775807e-05,
"loss": 2.9052,
"num_input_tokens_seen": 2696856,
"step": 4115
},
{
"epoch": 0.4516058314151047,
"grad_norm": 8.30929946899414,
"learning_rate": 2.8803261921200503e-05,
"loss": 3.3268,
"num_input_tokens_seen": 2699968,
"step": 4120
},
{
"epoch": 0.45215389674449197,
"grad_norm": 8.51347541809082,
"learning_rate": 2.8760712379539567e-05,
"loss": 3.3617,
"num_input_tokens_seen": 2702416,
"step": 4125
},
{
"epoch": 0.4527019620738792,
"grad_norm": 6.167294979095459,
"learning_rate": 2.8718151688917105e-05,
"loss": 3.1805,
"num_input_tokens_seen": 2705440,
"step": 4130
},
{
"epoch": 0.4532500274032665,
"grad_norm": 8.299149513244629,
"learning_rate": 2.867557997550801e-05,
"loss": 3.2122,
"num_input_tokens_seen": 2708248,
"step": 4135
},
{
"epoch": 0.45379809273265376,
"grad_norm": 8.19796085357666,
"learning_rate": 2.8632997365519877e-05,
"loss": 3.0817,
"num_input_tokens_seen": 2712464,
"step": 4140
},
{
"epoch": 0.454346158062041,
"grad_norm": 6.964700698852539,
"learning_rate": 2.859040398519256e-05,
"loss": 3.4051,
"num_input_tokens_seen": 2715048,
"step": 4145
},
{
"epoch": 0.45489422339142827,
"grad_norm": 6.310876846313477,
"learning_rate": 2.8547799960797883e-05,
"loss": 2.7846,
"num_input_tokens_seen": 2718192,
"step": 4150
},
{
"epoch": 0.45544228872081555,
"grad_norm": 6.786360263824463,
"learning_rate": 2.8505185418639212e-05,
"loss": 2.829,
"num_input_tokens_seen": 2722064,
"step": 4155
},
{
"epoch": 0.4559903540502028,
"grad_norm": 7.1503520011901855,
"learning_rate": 2.8462560485051098e-05,
"loss": 2.9883,
"num_input_tokens_seen": 2725640,
"step": 4160
},
{
"epoch": 0.45653841937959005,
"grad_norm": 5.350907802581787,
"learning_rate": 2.841992528639888e-05,
"loss": 3.0743,
"num_input_tokens_seen": 2729992,
"step": 4165
},
{
"epoch": 0.45708648470897734,
"grad_norm": 5.482122421264648,
"learning_rate": 2.837727994907835e-05,
"loss": 3.2459,
"num_input_tokens_seen": 2733424,
"step": 4170
},
{
"epoch": 0.45763455003836456,
"grad_norm": 4.941489219665527,
"learning_rate": 2.833462459951534e-05,
"loss": 3.2963,
"num_input_tokens_seen": 2736656,
"step": 4175
},
{
"epoch": 0.45818261536775184,
"grad_norm": 10.229253768920898,
"learning_rate": 2.8291959364165387e-05,
"loss": 3.2607,
"num_input_tokens_seen": 2739808,
"step": 4180
},
{
"epoch": 0.4587306806971391,
"grad_norm": 5.911849498748779,
"learning_rate": 2.824928436951332e-05,
"loss": 3.3887,
"num_input_tokens_seen": 2742752,
"step": 4185
},
{
"epoch": 0.45927874602652635,
"grad_norm": 6.14879846572876,
"learning_rate": 2.8206599742072883e-05,
"loss": 3.0095,
"num_input_tokens_seen": 2746256,
"step": 4190
},
{
"epoch": 0.45982681135591363,
"grad_norm": 6.8150529861450195,
"learning_rate": 2.8163905608386415e-05,
"loss": 3.0599,
"num_input_tokens_seen": 2750736,
"step": 4195
},
{
"epoch": 0.4603748766853009,
"grad_norm": 5.578204154968262,
"learning_rate": 2.812120209502441e-05,
"loss": 3.4177,
"num_input_tokens_seen": 2753832,
"step": 4200
},
{
"epoch": 0.46092294201468814,
"grad_norm": 7.075170040130615,
"learning_rate": 2.8078489328585184e-05,
"loss": 3.2787,
"num_input_tokens_seen": 2757176,
"step": 4205
},
{
"epoch": 0.4614710073440754,
"grad_norm": 7.633877754211426,
"learning_rate": 2.803576743569447e-05,
"loss": 3.2838,
"num_input_tokens_seen": 2760632,
"step": 4210
},
{
"epoch": 0.46201907267346265,
"grad_norm": 7.296063423156738,
"learning_rate": 2.7993036543005073e-05,
"loss": 3.2533,
"num_input_tokens_seen": 2763160,
"step": 4215
},
{
"epoch": 0.46256713800284993,
"grad_norm": 9.778048515319824,
"learning_rate": 2.7950296777196454e-05,
"loss": 3.2876,
"num_input_tokens_seen": 2766304,
"step": 4220
},
{
"epoch": 0.4631152033322372,
"grad_norm": 6.1279826164245605,
"learning_rate": 2.7907548264974408e-05,
"loss": 3.3613,
"num_input_tokens_seen": 2769112,
"step": 4225
},
{
"epoch": 0.46366326866162444,
"grad_norm": 7.0411458015441895,
"learning_rate": 2.7864791133070655e-05,
"loss": 2.9218,
"num_input_tokens_seen": 2773120,
"step": 4230
},
{
"epoch": 0.4642113339910117,
"grad_norm": 7.575366497039795,
"learning_rate": 2.782202550824244e-05,
"loss": 2.7816,
"num_input_tokens_seen": 2775712,
"step": 4235
},
{
"epoch": 0.464759399320399,
"grad_norm": 4.21223258972168,
"learning_rate": 2.777925151727222e-05,
"loss": 2.913,
"num_input_tokens_seen": 2778872,
"step": 4240
},
{
"epoch": 0.46530746464978623,
"grad_norm": 7.198635101318359,
"learning_rate": 2.7736469286967244e-05,
"loss": 3.3944,
"num_input_tokens_seen": 2783424,
"step": 4245
},
{
"epoch": 0.4658555299791735,
"grad_norm": 6.785750389099121,
"learning_rate": 2.7693678944159168e-05,
"loss": 3.0493,
"num_input_tokens_seen": 2787720,
"step": 4250
},
{
"epoch": 0.4664035953085608,
"grad_norm": 5.799097061157227,
"learning_rate": 2.7650880615703735e-05,
"loss": 3.043,
"num_input_tokens_seen": 2790528,
"step": 4255
},
{
"epoch": 0.466951660637948,
"grad_norm": 5.558688163757324,
"learning_rate": 2.760807442848033e-05,
"loss": 3.0476,
"num_input_tokens_seen": 2794088,
"step": 4260
},
{
"epoch": 0.4674997259673353,
"grad_norm": 7.959995269775391,
"learning_rate": 2.7565260509391644e-05,
"loss": 3.3705,
"num_input_tokens_seen": 2797168,
"step": 4265
},
{
"epoch": 0.4680477912967226,
"grad_norm": 5.836214542388916,
"learning_rate": 2.7522438985363297e-05,
"loss": 3.1173,
"num_input_tokens_seen": 2799752,
"step": 4270
},
{
"epoch": 0.4685958566261098,
"grad_norm": 5.6099348068237305,
"learning_rate": 2.7479609983343457e-05,
"loss": 3.4298,
"num_input_tokens_seen": 2803560,
"step": 4275
},
{
"epoch": 0.4691439219554971,
"grad_norm": 6.971024513244629,
"learning_rate": 2.7436773630302448e-05,
"loss": 3.4299,
"num_input_tokens_seen": 2806360,
"step": 4280
},
{
"epoch": 0.46969198728488437,
"grad_norm": 5.738091945648193,
"learning_rate": 2.7393930053232393e-05,
"loss": 3.0872,
"num_input_tokens_seen": 2809408,
"step": 4285
},
{
"epoch": 0.4702400526142716,
"grad_norm": 10.746182441711426,
"learning_rate": 2.7351079379146844e-05,
"loss": 3.5487,
"num_input_tokens_seen": 2812752,
"step": 4290
},
{
"epoch": 0.4707881179436589,
"grad_norm": 6.557742595672607,
"learning_rate": 2.7308221735080363e-05,
"loss": 3.1006,
"num_input_tokens_seen": 2816432,
"step": 4295
},
{
"epoch": 0.47133618327304616,
"grad_norm": 7.124549865722656,
"learning_rate": 2.726535724808821e-05,
"loss": 3.2491,
"num_input_tokens_seen": 2819608,
"step": 4300
},
{
"epoch": 0.4718842486024334,
"grad_norm": 8.328391075134277,
"learning_rate": 2.7222486045245905e-05,
"loss": 2.9571,
"num_input_tokens_seen": 2822304,
"step": 4305
},
{
"epoch": 0.47243231393182067,
"grad_norm": 8.121037483215332,
"learning_rate": 2.717960825364888e-05,
"loss": 3.0946,
"num_input_tokens_seen": 2826112,
"step": 4310
},
{
"epoch": 0.47298037926120795,
"grad_norm": 7.5214715003967285,
"learning_rate": 2.7136724000412122e-05,
"loss": 3.2682,
"num_input_tokens_seen": 2829640,
"step": 4315
},
{
"epoch": 0.4735284445905952,
"grad_norm": 5.765413761138916,
"learning_rate": 2.709383341266975e-05,
"loss": 3.3871,
"num_input_tokens_seen": 2832536,
"step": 4320
},
{
"epoch": 0.47407650991998246,
"grad_norm": 7.573315143585205,
"learning_rate": 2.7050936617574674e-05,
"loss": 3.0505,
"num_input_tokens_seen": 2835312,
"step": 4325
},
{
"epoch": 0.47462457524936974,
"grad_norm": 5.444807052612305,
"learning_rate": 2.70080337422982e-05,
"loss": 3.1385,
"num_input_tokens_seen": 2839520,
"step": 4330
},
{
"epoch": 0.47517264057875697,
"grad_norm": 5.842774868011475,
"learning_rate": 2.696512491402967e-05,
"loss": 3.0295,
"num_input_tokens_seen": 2842096,
"step": 4335
},
{
"epoch": 0.47572070590814425,
"grad_norm": 6.1106157302856445,
"learning_rate": 2.692221025997606e-05,
"loss": 3.0393,
"num_input_tokens_seen": 2845424,
"step": 4340
},
{
"epoch": 0.47626877123753153,
"grad_norm": 7.988515377044678,
"learning_rate": 2.687928990736163e-05,
"loss": 3.3657,
"num_input_tokens_seen": 2847648,
"step": 4345
},
{
"epoch": 0.47681683656691876,
"grad_norm": 7.0514655113220215,
"learning_rate": 2.683636398342753e-05,
"loss": 3.4438,
"num_input_tokens_seen": 2850432,
"step": 4350
},
{
"epoch": 0.47736490189630604,
"grad_norm": 5.54784631729126,
"learning_rate": 2.6793432615431406e-05,
"loss": 2.9583,
"num_input_tokens_seen": 2854176,
"step": 4355
},
{
"epoch": 0.4779129672256933,
"grad_norm": 6.001830577850342,
"learning_rate": 2.6750495930647083e-05,
"loss": 3.4694,
"num_input_tokens_seen": 2857368,
"step": 4360
},
{
"epoch": 0.47846103255508055,
"grad_norm": 7.455556392669678,
"learning_rate": 2.670755405636412e-05,
"loss": 3.0839,
"num_input_tokens_seen": 2860064,
"step": 4365
},
{
"epoch": 0.4790090978844678,
"grad_norm": 6.409590721130371,
"learning_rate": 2.6664607119887462e-05,
"loss": 3.0962,
"num_input_tokens_seen": 2863128,
"step": 4370
},
{
"epoch": 0.4795571632138551,
"grad_norm": 5.903439044952393,
"learning_rate": 2.6621655248537075e-05,
"loss": 3.0613,
"num_input_tokens_seen": 2866720,
"step": 4375
},
{
"epoch": 0.48010522854324233,
"grad_norm": 7.286397457122803,
"learning_rate": 2.657869856964754e-05,
"loss": 2.9673,
"num_input_tokens_seen": 2869568,
"step": 4380
},
{
"epoch": 0.4806532938726296,
"grad_norm": 7.941439151763916,
"learning_rate": 2.6535737210567707e-05,
"loss": 3.3656,
"num_input_tokens_seen": 2874584,
"step": 4385
},
{
"epoch": 0.4812013592020169,
"grad_norm": 3.8733413219451904,
"learning_rate": 2.6492771298660286e-05,
"loss": 2.8012,
"num_input_tokens_seen": 2879248,
"step": 4390
},
{
"epoch": 0.4817494245314041,
"grad_norm": 4.492478370666504,
"learning_rate": 2.6449800961301485e-05,
"loss": 2.9495,
"num_input_tokens_seen": 2882824,
"step": 4395
},
{
"epoch": 0.4822974898607914,
"grad_norm": 7.726132392883301,
"learning_rate": 2.640682632588064e-05,
"loss": 3.1087,
"num_input_tokens_seen": 2886440,
"step": 4400
},
{
"epoch": 0.4828455551901787,
"grad_norm": 6.549642562866211,
"learning_rate": 2.6363847519799822e-05,
"loss": 2.985,
"num_input_tokens_seen": 2889808,
"step": 4405
},
{
"epoch": 0.4833936205195659,
"grad_norm": 8.789740562438965,
"learning_rate": 2.632086467047348e-05,
"loss": 3.1352,
"num_input_tokens_seen": 2893680,
"step": 4410
},
{
"epoch": 0.4839416858489532,
"grad_norm": 8.024590492248535,
"learning_rate": 2.6277877905328023e-05,
"loss": 3.3008,
"num_input_tokens_seen": 2895872,
"step": 4415
},
{
"epoch": 0.4844897511783405,
"grad_norm": 6.235259532928467,
"learning_rate": 2.623488735180149e-05,
"loss": 3.1758,
"num_input_tokens_seen": 2898680,
"step": 4420
},
{
"epoch": 0.4850378165077277,
"grad_norm": 7.674651145935059,
"learning_rate": 2.619189313734316e-05,
"loss": 2.9519,
"num_input_tokens_seen": 2903496,
"step": 4425
},
{
"epoch": 0.485585881837115,
"grad_norm": 5.884274959564209,
"learning_rate": 2.614889538941313e-05,
"loss": 3.3259,
"num_input_tokens_seen": 2906248,
"step": 4430
},
{
"epoch": 0.48613394716650227,
"grad_norm": 5.681421279907227,
"learning_rate": 2.610589423548201e-05,
"loss": 3.4432,
"num_input_tokens_seen": 2909352,
"step": 4435
},
{
"epoch": 0.4866820124958895,
"grad_norm": 8.08205795288086,
"learning_rate": 2.6062889803030477e-05,
"loss": 3.6165,
"num_input_tokens_seen": 2911960,
"step": 4440
},
{
"epoch": 0.4872300778252768,
"grad_norm": 7.7329277992248535,
"learning_rate": 2.601988221954894e-05,
"loss": 3.2172,
"num_input_tokens_seen": 2915256,
"step": 4445
},
{
"epoch": 0.48777814315466406,
"grad_norm": 6.208625793457031,
"learning_rate": 2.5976871612537164e-05,
"loss": 3.2373,
"num_input_tokens_seen": 2919040,
"step": 4450
},
{
"epoch": 0.4883262084840513,
"grad_norm": 8.127032279968262,
"learning_rate": 2.593385810950386e-05,
"loss": 2.9402,
"num_input_tokens_seen": 2922272,
"step": 4455
},
{
"epoch": 0.48887427381343856,
"grad_norm": 6.481329441070557,
"learning_rate": 2.589084183796632e-05,
"loss": 3.0208,
"num_input_tokens_seen": 2926072,
"step": 4460
},
{
"epoch": 0.48942233914282585,
"grad_norm": 6.350535869598389,
"learning_rate": 2.5847822925450055e-05,
"loss": 3.1026,
"num_input_tokens_seen": 2928760,
"step": 4465
},
{
"epoch": 0.4899704044722131,
"grad_norm": 7.3511457443237305,
"learning_rate": 2.5804801499488407e-05,
"loss": 2.9358,
"num_input_tokens_seen": 2932088,
"step": 4470
},
{
"epoch": 0.49051846980160035,
"grad_norm": 5.9759521484375,
"learning_rate": 2.576177768762216e-05,
"loss": 3.1564,
"num_input_tokens_seen": 2935272,
"step": 4475
},
{
"epoch": 0.49106653513098764,
"grad_norm": 7.138418674468994,
"learning_rate": 2.5718751617399182e-05,
"loss": 3.0998,
"num_input_tokens_seen": 2938280,
"step": 4480
},
{
"epoch": 0.49161460046037486,
"grad_norm": 10.551050186157227,
"learning_rate": 2.5675723416374026e-05,
"loss": 3.1874,
"num_input_tokens_seen": 2941648,
"step": 4485
},
{
"epoch": 0.49216266578976214,
"grad_norm": 6.085887432098389,
"learning_rate": 2.5632693212107567e-05,
"loss": 2.8506,
"num_input_tokens_seen": 2944680,
"step": 4490
},
{
"epoch": 0.4927107311191494,
"grad_norm": 6.314172267913818,
"learning_rate": 2.5589661132166613e-05,
"loss": 2.8206,
"num_input_tokens_seen": 2948744,
"step": 4495
},
{
"epoch": 0.49325879644853665,
"grad_norm": 6.3680853843688965,
"learning_rate": 2.5546627304123545e-05,
"loss": 2.85,
"num_input_tokens_seen": 2951256,
"step": 4500
},
{
"epoch": 0.49380686177792393,
"grad_norm": 6.314942359924316,
"learning_rate": 2.5503591855555908e-05,
"loss": 3.2021,
"num_input_tokens_seen": 2954536,
"step": 4505
},
{
"epoch": 0.4943549271073112,
"grad_norm": 6.349035739898682,
"learning_rate": 2.546055491404607e-05,
"loss": 2.9022,
"num_input_tokens_seen": 2958112,
"step": 4510
},
{
"epoch": 0.49490299243669844,
"grad_norm": 6.812668800354004,
"learning_rate": 2.5417516607180825e-05,
"loss": 3.2304,
"num_input_tokens_seen": 2961024,
"step": 4515
},
{
"epoch": 0.4954510577660857,
"grad_norm": 4.483590126037598,
"learning_rate": 2.5374477062550984e-05,
"loss": 2.8489,
"num_input_tokens_seen": 2964344,
"step": 4520
},
{
"epoch": 0.495999123095473,
"grad_norm": 6.769683837890625,
"learning_rate": 2.5331436407751074e-05,
"loss": 3.1946,
"num_input_tokens_seen": 2967608,
"step": 4525
},
{
"epoch": 0.49654718842486023,
"grad_norm": 9.059048652648926,
"learning_rate": 2.528839477037887e-05,
"loss": 3.2895,
"num_input_tokens_seen": 2970488,
"step": 4530
},
{
"epoch": 0.4970952537542475,
"grad_norm": 9.555692672729492,
"learning_rate": 2.5245352278035095e-05,
"loss": 3.0595,
"num_input_tokens_seen": 2973200,
"step": 4535
},
{
"epoch": 0.4976433190836348,
"grad_norm": 8.808011054992676,
"learning_rate": 2.520230905832298e-05,
"loss": 3.1939,
"num_input_tokens_seen": 2976576,
"step": 4540
},
{
"epoch": 0.498191384413022,
"grad_norm": 7.059693336486816,
"learning_rate": 2.515926523884792e-05,
"loss": 3.3154,
"num_input_tokens_seen": 2980624,
"step": 4545
},
{
"epoch": 0.4987394497424093,
"grad_norm": 5.0204973220825195,
"learning_rate": 2.5116220947217107e-05,
"loss": 3.2012,
"num_input_tokens_seen": 2983328,
"step": 4550
},
{
"epoch": 0.4992875150717966,
"grad_norm": 8.473772048950195,
"learning_rate": 2.507317631103911e-05,
"loss": 3.3448,
"num_input_tokens_seen": 2986664,
"step": 4555
},
{
"epoch": 0.4998355804011838,
"grad_norm": 5.891829490661621,
"learning_rate": 2.5030131457923512e-05,
"loss": 3.0624,
"num_input_tokens_seen": 2990088,
"step": 4560
},
{
"epoch": 0.500383645730571,
"grad_norm": 8.812019348144531,
"learning_rate": 2.498708651548057e-05,
"loss": 3.1606,
"num_input_tokens_seen": 2993152,
"step": 4565
},
{
"epoch": 0.5009317110599584,
"grad_norm": 6.772736549377441,
"learning_rate": 2.494404161132079e-05,
"loss": 2.6401,
"num_input_tokens_seen": 2996104,
"step": 4570
},
{
"epoch": 0.5014797763893456,
"grad_norm": 6.640130996704102,
"learning_rate": 2.490099687305455e-05,
"loss": 2.8047,
"num_input_tokens_seen": 3000664,
"step": 4575
},
{
"epoch": 0.5020278417187328,
"grad_norm": 8.050363540649414,
"learning_rate": 2.485795242829177e-05,
"loss": 2.9757,
"num_input_tokens_seen": 3004312,
"step": 4580
},
{
"epoch": 0.5025759070481202,
"grad_norm": 7.689075469970703,
"learning_rate": 2.481490840464147e-05,
"loss": 3.6823,
"num_input_tokens_seen": 3008056,
"step": 4585
},
{
"epoch": 0.5031239723775074,
"grad_norm": 7.890453815460205,
"learning_rate": 2.4771864929711414e-05,
"loss": 3.5555,
"num_input_tokens_seen": 3010640,
"step": 4590
},
{
"epoch": 0.5036720377068946,
"grad_norm": 8.07981014251709,
"learning_rate": 2.4728822131107784e-05,
"loss": 2.9504,
"num_input_tokens_seen": 3013752,
"step": 4595
},
{
"epoch": 0.504220103036282,
"grad_norm": 5.753955364227295,
"learning_rate": 2.468578013643472e-05,
"loss": 3.1703,
"num_input_tokens_seen": 3016248,
"step": 4600
},
{
"epoch": 0.5047681683656692,
"grad_norm": 5.296700954437256,
"learning_rate": 2.4642739073293978e-05,
"loss": 2.8482,
"num_input_tokens_seen": 3019256,
"step": 4605
},
{
"epoch": 0.5053162336950564,
"grad_norm": 11.357376098632812,
"learning_rate": 2.459969906928458e-05,
"loss": 2.8125,
"num_input_tokens_seen": 3021936,
"step": 4610
},
{
"epoch": 0.5058642990244437,
"grad_norm": 9.2806396484375,
"learning_rate": 2.4556660252002384e-05,
"loss": 3.1294,
"num_input_tokens_seen": 3025888,
"step": 4615
},
{
"epoch": 0.506412364353831,
"grad_norm": 7.156399250030518,
"learning_rate": 2.451362274903973e-05,
"loss": 3.202,
"num_input_tokens_seen": 3029752,
"step": 4620
},
{
"epoch": 0.5069604296832182,
"grad_norm": 7.298778533935547,
"learning_rate": 2.4470586687985077e-05,
"loss": 3.2958,
"num_input_tokens_seen": 3033576,
"step": 4625
},
{
"epoch": 0.5075084950126055,
"grad_norm": 7.478179454803467,
"learning_rate": 2.4427552196422602e-05,
"loss": 3.1416,
"num_input_tokens_seen": 3037016,
"step": 4630
},
{
"epoch": 0.5080565603419928,
"grad_norm": 8.109244346618652,
"learning_rate": 2.438451940193181e-05,
"loss": 2.7633,
"num_input_tokens_seen": 3040640,
"step": 4635
},
{
"epoch": 0.50860462567138,
"grad_norm": 6.991682052612305,
"learning_rate": 2.434148843208722e-05,
"loss": 2.9995,
"num_input_tokens_seen": 3043424,
"step": 4640
},
{
"epoch": 0.5091526910007673,
"grad_norm": 5.315702438354492,
"learning_rate": 2.4298459414457896e-05,
"loss": 2.9122,
"num_input_tokens_seen": 3046672,
"step": 4645
},
{
"epoch": 0.5097007563301545,
"grad_norm": 8.090765953063965,
"learning_rate": 2.425543247660713e-05,
"loss": 3.3741,
"num_input_tokens_seen": 3049736,
"step": 4650
},
{
"epoch": 0.5102488216595418,
"grad_norm": 9.288080215454102,
"learning_rate": 2.4212407746092066e-05,
"loss": 3.4609,
"num_input_tokens_seen": 3053656,
"step": 4655
},
{
"epoch": 0.5107968869889291,
"grad_norm": 5.754721164703369,
"learning_rate": 2.4169385350463282e-05,
"loss": 2.9946,
"num_input_tokens_seen": 3056144,
"step": 4660
},
{
"epoch": 0.5113449523183163,
"grad_norm": 6.588372230529785,
"learning_rate": 2.412636541726444e-05,
"loss": 3.0074,
"num_input_tokens_seen": 3059712,
"step": 4665
},
{
"epoch": 0.5118930176477036,
"grad_norm": 7.401770114898682,
"learning_rate": 2.4083348074031904e-05,
"loss": 3.4029,
"num_input_tokens_seen": 3062288,
"step": 4670
},
{
"epoch": 0.5124410829770909,
"grad_norm": 5.612600803375244,
"learning_rate": 2.4040333448294364e-05,
"loss": 3.2012,
"num_input_tokens_seen": 3065728,
"step": 4675
},
{
"epoch": 0.5129891483064781,
"grad_norm": 5.925127983093262,
"learning_rate": 2.399732166757243e-05,
"loss": 3.0461,
"num_input_tokens_seen": 3068632,
"step": 4680
},
{
"epoch": 0.5135372136358654,
"grad_norm": 8.738677978515625,
"learning_rate": 2.3954312859378325e-05,
"loss": 3.4782,
"num_input_tokens_seen": 3070968,
"step": 4685
},
{
"epoch": 0.5140852789652527,
"grad_norm": 9.27092170715332,
"learning_rate": 2.3911307151215413e-05,
"loss": 3.2625,
"num_input_tokens_seen": 3074696,
"step": 4690
},
{
"epoch": 0.5146333442946399,
"grad_norm": 5.855086326599121,
"learning_rate": 2.3868304670577886e-05,
"loss": 3.045,
"num_input_tokens_seen": 3078584,
"step": 4695
},
{
"epoch": 0.5151814096240271,
"grad_norm": 8.794078826904297,
"learning_rate": 2.3825305544950374e-05,
"loss": 2.7209,
"num_input_tokens_seen": 3081624,
"step": 4700
},
{
"epoch": 0.5157294749534145,
"grad_norm": 7.675835132598877,
"learning_rate": 2.3782309901807555e-05,
"loss": 3.3431,
"num_input_tokens_seen": 3084152,
"step": 4705
},
{
"epoch": 0.5162775402828017,
"grad_norm": 7.583930969238281,
"learning_rate": 2.3739317868613776e-05,
"loss": 3.1141,
"num_input_tokens_seen": 3087040,
"step": 4710
},
{
"epoch": 0.5168256056121889,
"grad_norm": 7.561563968658447,
"learning_rate": 2.369632957282269e-05,
"loss": 3.4023,
"num_input_tokens_seen": 3090352,
"step": 4715
},
{
"epoch": 0.5173736709415763,
"grad_norm": 6.868551254272461,
"learning_rate": 2.365334514187687e-05,
"loss": 3.0766,
"num_input_tokens_seen": 3093552,
"step": 4720
},
{
"epoch": 0.5179217362709635,
"grad_norm": 5.663219928741455,
"learning_rate": 2.3610364703207432e-05,
"loss": 3.1136,
"num_input_tokens_seen": 3097168,
"step": 4725
},
{
"epoch": 0.5184698016003507,
"grad_norm": 7.611098766326904,
"learning_rate": 2.3567388384233648e-05,
"loss": 3.0911,
"num_input_tokens_seen": 3101648,
"step": 4730
},
{
"epoch": 0.5190178669297381,
"grad_norm": 6.850576877593994,
"learning_rate": 2.352441631236259e-05,
"loss": 2.9311,
"num_input_tokens_seen": 3105888,
"step": 4735
},
{
"epoch": 0.5195659322591253,
"grad_norm": 5.57901668548584,
"learning_rate": 2.348144861498873e-05,
"loss": 3.0239,
"num_input_tokens_seen": 3110648,
"step": 4740
},
{
"epoch": 0.5201139975885125,
"grad_norm": 6.950675010681152,
"learning_rate": 2.343848541949356e-05,
"loss": 3.053,
"num_input_tokens_seen": 3113400,
"step": 4745
},
{
"epoch": 0.5206620629178998,
"grad_norm": 5.661995887756348,
"learning_rate": 2.3395526853245264e-05,
"loss": 3.2619,
"num_input_tokens_seen": 3117000,
"step": 4750
},
{
"epoch": 0.5212101282472871,
"grad_norm": 6.956995010375977,
"learning_rate": 2.3352573043598267e-05,
"loss": 3.6572,
"num_input_tokens_seen": 3121664,
"step": 4755
},
{
"epoch": 0.5217581935766743,
"grad_norm": 4.707006454467773,
"learning_rate": 2.3309624117892885e-05,
"loss": 2.9066,
"num_input_tokens_seen": 3124872,
"step": 4760
},
{
"epoch": 0.5223062589060616,
"grad_norm": 5.503338813781738,
"learning_rate": 2.3266680203455004e-05,
"loss": 3.2066,
"num_input_tokens_seen": 3128760,
"step": 4765
},
{
"epoch": 0.5228543242354489,
"grad_norm": 7.054602146148682,
"learning_rate": 2.322374142759561e-05,
"loss": 2.8683,
"num_input_tokens_seen": 3131480,
"step": 4770
},
{
"epoch": 0.5234023895648361,
"grad_norm": 8.06494140625,
"learning_rate": 2.318080791761046e-05,
"loss": 3.2634,
"num_input_tokens_seen": 3135040,
"step": 4775
},
{
"epoch": 0.5239504548942234,
"grad_norm": 8.718894958496094,
"learning_rate": 2.313787980077972e-05,
"loss": 3.3735,
"num_input_tokens_seen": 3137816,
"step": 4780
},
{
"epoch": 0.5244985202236107,
"grad_norm": 6.601426124572754,
"learning_rate": 2.309495720436755e-05,
"loss": 3.0622,
"num_input_tokens_seen": 3141752,
"step": 4785
},
{
"epoch": 0.5250465855529979,
"grad_norm": 7.08184814453125,
"learning_rate": 2.305204025562174e-05,
"loss": 2.6361,
"num_input_tokens_seen": 3144792,
"step": 4790
},
{
"epoch": 0.5255946508823852,
"grad_norm": 8.298012733459473,
"learning_rate": 2.3009129081773366e-05,
"loss": 2.8071,
"num_input_tokens_seen": 3147904,
"step": 4795
},
{
"epoch": 0.5261427162117724,
"grad_norm": 7.070413589477539,
"learning_rate": 2.2966223810036357e-05,
"loss": 3.2667,
"num_input_tokens_seen": 3150344,
"step": 4800
},
{
"epoch": 0.5266907815411597,
"grad_norm": 8.037806510925293,
"learning_rate": 2.292332456760714e-05,
"loss": 3.3148,
"num_input_tokens_seen": 3154328,
"step": 4805
},
{
"epoch": 0.527238846870547,
"grad_norm": 5.284430980682373,
"learning_rate": 2.2880431481664306e-05,
"loss": 2.6196,
"num_input_tokens_seen": 3157392,
"step": 4810
},
{
"epoch": 0.5277869121999342,
"grad_norm": 7.804793357849121,
"learning_rate": 2.283754467936815e-05,
"loss": 2.9899,
"num_input_tokens_seen": 3160304,
"step": 4815
},
{
"epoch": 0.5283349775293215,
"grad_norm": 8.394335746765137,
"learning_rate": 2.279466428786035e-05,
"loss": 3.2071,
"num_input_tokens_seen": 3163736,
"step": 4820
},
{
"epoch": 0.5288830428587088,
"grad_norm": 6.269372463226318,
"learning_rate": 2.2751790434263608e-05,
"loss": 3.1003,
"num_input_tokens_seen": 3166368,
"step": 4825
},
{
"epoch": 0.529431108188096,
"grad_norm": 7.112332820892334,
"learning_rate": 2.2708923245681203e-05,
"loss": 3.2725,
"num_input_tokens_seen": 3169960,
"step": 4830
},
{
"epoch": 0.5299791735174832,
"grad_norm": 8.58667278289795,
"learning_rate": 2.266606284919667e-05,
"loss": 2.7479,
"num_input_tokens_seen": 3172744,
"step": 4835
},
{
"epoch": 0.5305272388468706,
"grad_norm": 7.745898723602295,
"learning_rate": 2.262320937187344e-05,
"loss": 3.4911,
"num_input_tokens_seen": 3175984,
"step": 4840
},
{
"epoch": 0.5310753041762578,
"grad_norm": 6.885601997375488,
"learning_rate": 2.258036294075438e-05,
"loss": 2.8831,
"num_input_tokens_seen": 3178800,
"step": 4845
},
{
"epoch": 0.531623369505645,
"grad_norm": 6.387146472930908,
"learning_rate": 2.2537523682861484e-05,
"loss": 3.0745,
"num_input_tokens_seen": 3182328,
"step": 4850
},
{
"epoch": 0.5321714348350324,
"grad_norm": 4.868107795715332,
"learning_rate": 2.249469172519551e-05,
"loss": 3.0048,
"num_input_tokens_seen": 3185912,
"step": 4855
},
{
"epoch": 0.5327195001644196,
"grad_norm": 8.075777053833008,
"learning_rate": 2.2451867194735542e-05,
"loss": 3.3234,
"num_input_tokens_seen": 3189352,
"step": 4860
},
{
"epoch": 0.5332675654938068,
"grad_norm": 5.830811500549316,
"learning_rate": 2.2409050218438645e-05,
"loss": 3.0588,
"num_input_tokens_seen": 3193072,
"step": 4865
},
{
"epoch": 0.5338156308231942,
"grad_norm": 5.349551200866699,
"learning_rate": 2.2366240923239514e-05,
"loss": 2.7223,
"num_input_tokens_seen": 3196104,
"step": 4870
},
{
"epoch": 0.5343636961525814,
"grad_norm": 8.454142570495605,
"learning_rate": 2.2323439436050054e-05,
"loss": 3.1157,
"num_input_tokens_seen": 3198648,
"step": 4875
},
{
"epoch": 0.5349117614819686,
"grad_norm": 7.110290050506592,
"learning_rate": 2.2280645883759006e-05,
"loss": 3.0379,
"num_input_tokens_seen": 3201056,
"step": 4880
},
{
"epoch": 0.535459826811356,
"grad_norm": 5.1915154457092285,
"learning_rate": 2.2237860393231634e-05,
"loss": 3.575,
"num_input_tokens_seen": 3203712,
"step": 4885
},
{
"epoch": 0.5360078921407432,
"grad_norm": 8.497429847717285,
"learning_rate": 2.219508309130927e-05,
"loss": 2.9379,
"num_input_tokens_seen": 3206288,
"step": 4890
},
{
"epoch": 0.5365559574701304,
"grad_norm": 8.26462173461914,
"learning_rate": 2.2152314104808956e-05,
"loss": 3.1587,
"num_input_tokens_seen": 3209928,
"step": 4895
},
{
"epoch": 0.5371040227995177,
"grad_norm": 6.499933242797852,
"learning_rate": 2.210955356052313e-05,
"loss": 2.9181,
"num_input_tokens_seen": 3213336,
"step": 4900
},
{
"epoch": 0.537652088128905,
"grad_norm": 5.8398590087890625,
"learning_rate": 2.2066801585219156e-05,
"loss": 2.8303,
"num_input_tokens_seen": 3216464,
"step": 4905
},
{
"epoch": 0.5382001534582922,
"grad_norm": 6.813495635986328,
"learning_rate": 2.2024058305639015e-05,
"loss": 2.9079,
"num_input_tokens_seen": 3221256,
"step": 4910
},
{
"epoch": 0.5387482187876795,
"grad_norm": 8.064513206481934,
"learning_rate": 2.198132384849891e-05,
"loss": 3.2373,
"num_input_tokens_seen": 3224320,
"step": 4915
},
{
"epoch": 0.5392962841170668,
"grad_norm": 7.14154577255249,
"learning_rate": 2.1938598340488886e-05,
"loss": 3.0737,
"num_input_tokens_seen": 3227128,
"step": 4920
},
{
"epoch": 0.539844349446454,
"grad_norm": 6.514719009399414,
"learning_rate": 2.1895881908272446e-05,
"loss": 2.8825,
"num_input_tokens_seen": 3230352,
"step": 4925
},
{
"epoch": 0.5403924147758413,
"grad_norm": 7.076175212860107,
"learning_rate": 2.1853174678486213e-05,
"loss": 2.8721,
"num_input_tokens_seen": 3234440,
"step": 4930
},
{
"epoch": 0.5409404801052285,
"grad_norm": 5.526149749755859,
"learning_rate": 2.1810476777739508e-05,
"loss": 3.1112,
"num_input_tokens_seen": 3238176,
"step": 4935
},
{
"epoch": 0.5414885454346158,
"grad_norm": 8.458449363708496,
"learning_rate": 2.176778833261399e-05,
"loss": 3.2798,
"num_input_tokens_seen": 3241728,
"step": 4940
},
{
"epoch": 0.5420366107640031,
"grad_norm": 7.216832160949707,
"learning_rate": 2.1725109469663318e-05,
"loss": 3.1847,
"num_input_tokens_seen": 3244416,
"step": 4945
},
{
"epoch": 0.5425846760933903,
"grad_norm": 5.6720147132873535,
"learning_rate": 2.168244031541271e-05,
"loss": 3.4552,
"num_input_tokens_seen": 3247816,
"step": 4950
},
{
"epoch": 0.5431327414227776,
"grad_norm": 7.452066898345947,
"learning_rate": 2.163978099635861e-05,
"loss": 2.958,
"num_input_tokens_seen": 3250432,
"step": 4955
},
{
"epoch": 0.5436808067521649,
"grad_norm": 6.589701175689697,
"learning_rate": 2.159713163896832e-05,
"loss": 3.4633,
"num_input_tokens_seen": 3253376,
"step": 4960
},
{
"epoch": 0.5442288720815521,
"grad_norm": 4.926830768585205,
"learning_rate": 2.1554492369679598e-05,
"loss": 3.0458,
"num_input_tokens_seen": 3257640,
"step": 4965
},
{
"epoch": 0.5447769374109394,
"grad_norm": 8.084177017211914,
"learning_rate": 2.1511863314900275e-05,
"loss": 2.992,
"num_input_tokens_seen": 3261952,
"step": 4970
},
{
"epoch": 0.5453250027403267,
"grad_norm": 5.291374683380127,
"learning_rate": 2.146924460100795e-05,
"loss": 2.5116,
"num_input_tokens_seen": 3265912,
"step": 4975
},
{
"epoch": 0.5458730680697139,
"grad_norm": 9.101826667785645,
"learning_rate": 2.1426636354349523e-05,
"loss": 3.0809,
"num_input_tokens_seen": 3269624,
"step": 4980
},
{
"epoch": 0.5464211333991011,
"grad_norm": 9.933355331420898,
"learning_rate": 2.1384038701240865e-05,
"loss": 2.6956,
"num_input_tokens_seen": 3273112,
"step": 4985
},
{
"epoch": 0.5469691987284885,
"grad_norm": 8.288704872131348,
"learning_rate": 2.1341451767966475e-05,
"loss": 3.319,
"num_input_tokens_seen": 3275624,
"step": 4990
},
{
"epoch": 0.5475172640578757,
"grad_norm": 6.39847469329834,
"learning_rate": 2.129887568077904e-05,
"loss": 3.0552,
"num_input_tokens_seen": 3279792,
"step": 4995
},
{
"epoch": 0.5480653293872629,
"grad_norm": 6.739533424377441,
"learning_rate": 2.12563105658991e-05,
"loss": 3.1218,
"num_input_tokens_seen": 3283560,
"step": 5000
},
{
"epoch": 0.5486133947166503,
"grad_norm": 7.888918399810791,
"learning_rate": 2.1213756549514674e-05,
"loss": 3.0369,
"num_input_tokens_seen": 3286504,
"step": 5005
},
{
"epoch": 0.5491614600460375,
"grad_norm": 6.957367897033691,
"learning_rate": 2.1171213757780873e-05,
"loss": 2.9968,
"num_input_tokens_seen": 3289512,
"step": 5010
},
{
"epoch": 0.5497095253754247,
"grad_norm": 6.351596355438232,
"learning_rate": 2.1128682316819522e-05,
"loss": 3.0657,
"num_input_tokens_seen": 3293512,
"step": 5015
},
{
"epoch": 0.5502575907048121,
"grad_norm": 7.056116104125977,
"learning_rate": 2.1086162352718825e-05,
"loss": 3.029,
"num_input_tokens_seen": 3298024,
"step": 5020
},
{
"epoch": 0.5508056560341993,
"grad_norm": 6.343071937561035,
"learning_rate": 2.1043653991532934e-05,
"loss": 2.8398,
"num_input_tokens_seen": 3301000,
"step": 5025
},
{
"epoch": 0.5513537213635865,
"grad_norm": 8.5012788772583,
"learning_rate": 2.1001157359281605e-05,
"loss": 3.1406,
"num_input_tokens_seen": 3304064,
"step": 5030
},
{
"epoch": 0.5519017866929739,
"grad_norm": 5.8669819831848145,
"learning_rate": 2.095867258194984e-05,
"loss": 2.7844,
"num_input_tokens_seen": 3308616,
"step": 5035
},
{
"epoch": 0.5524498520223611,
"grad_norm": 6.373290061950684,
"learning_rate": 2.0916199785487488e-05,
"loss": 3.346,
"num_input_tokens_seen": 3312128,
"step": 5040
},
{
"epoch": 0.5529979173517483,
"grad_norm": 7.038343906402588,
"learning_rate": 2.0873739095808865e-05,
"loss": 3.1385,
"num_input_tokens_seen": 3315040,
"step": 5045
},
{
"epoch": 0.5535459826811356,
"grad_norm": 7.340169429779053,
"learning_rate": 2.083129063879242e-05,
"loss": 2.9194,
"num_input_tokens_seen": 3319432,
"step": 5050
},
{
"epoch": 0.5540940480105229,
"grad_norm": 5.199733734130859,
"learning_rate": 2.0788854540280315e-05,
"loss": 3.5487,
"num_input_tokens_seen": 3322568,
"step": 5055
},
{
"epoch": 0.5546421133399101,
"grad_norm": 7.935201168060303,
"learning_rate": 2.0746430926078086e-05,
"loss": 2.8886,
"num_input_tokens_seen": 3325536,
"step": 5060
},
{
"epoch": 0.5551901786692974,
"grad_norm": 7.43034029006958,
"learning_rate": 2.0704019921954264e-05,
"loss": 3.0405,
"num_input_tokens_seen": 3329312,
"step": 5065
},
{
"epoch": 0.5557382439986847,
"grad_norm": 5.411002159118652,
"learning_rate": 2.0661621653639987e-05,
"loss": 3.1599,
"num_input_tokens_seen": 3333232,
"step": 5070
},
{
"epoch": 0.5562863093280719,
"grad_norm": 8.897222518920898,
"learning_rate": 2.0619236246828622e-05,
"loss": 2.8413,
"num_input_tokens_seen": 3336312,
"step": 5075
},
{
"epoch": 0.5568343746574592,
"grad_norm": 8.512425422668457,
"learning_rate": 2.0576863827175447e-05,
"loss": 2.9528,
"num_input_tokens_seen": 3339344,
"step": 5080
},
{
"epoch": 0.5573824399868464,
"grad_norm": 7.003962516784668,
"learning_rate": 2.0534504520297203e-05,
"loss": 3.3579,
"num_input_tokens_seen": 3342520,
"step": 5085
},
{
"epoch": 0.5579305053162337,
"grad_norm": 6.14302396774292,
"learning_rate": 2.0492158451771767e-05,
"loss": 3.3721,
"num_input_tokens_seen": 3346272,
"step": 5090
},
{
"epoch": 0.558478570645621,
"grad_norm": 8.199108123779297,
"learning_rate": 2.0449825747137778e-05,
"loss": 2.9852,
"num_input_tokens_seen": 3350232,
"step": 5095
},
{
"epoch": 0.5590266359750082,
"grad_norm": 7.849426746368408,
"learning_rate": 2.0407506531894245e-05,
"loss": 3.1338,
"num_input_tokens_seen": 3353144,
"step": 5100
},
{
"epoch": 0.5595747013043955,
"grad_norm": 6.752470016479492,
"learning_rate": 2.0365200931500177e-05,
"loss": 2.9589,
"num_input_tokens_seen": 3356952,
"step": 5105
},
{
"epoch": 0.5601227666337828,
"grad_norm": 7.846312046051025,
"learning_rate": 2.0322909071374265e-05,
"loss": 3.2629,
"num_input_tokens_seen": 3360424,
"step": 5110
},
{
"epoch": 0.56067083196317,
"grad_norm": 6.629732131958008,
"learning_rate": 2.028063107689442e-05,
"loss": 3.2232,
"num_input_tokens_seen": 3363544,
"step": 5115
},
{
"epoch": 0.5612188972925573,
"grad_norm": 7.26005220413208,
"learning_rate": 2.023836707339745e-05,
"loss": 3.2771,
"num_input_tokens_seen": 3366664,
"step": 5120
},
{
"epoch": 0.5617669626219446,
"grad_norm": 7.383485317230225,
"learning_rate": 2.0196117186178727e-05,
"loss": 2.8273,
"num_input_tokens_seen": 3369848,
"step": 5125
},
{
"epoch": 0.5623150279513318,
"grad_norm": 7.374210357666016,
"learning_rate": 2.015388154049173e-05,
"loss": 3.2708,
"num_input_tokens_seen": 3373208,
"step": 5130
},
{
"epoch": 0.562863093280719,
"grad_norm": 6.803157329559326,
"learning_rate": 2.0111660261547728e-05,
"loss": 3.1036,
"num_input_tokens_seen": 3376872,
"step": 5135
},
{
"epoch": 0.5634111586101064,
"grad_norm": 6.192258358001709,
"learning_rate": 2.006945347451541e-05,
"loss": 3.0572,
"num_input_tokens_seen": 3382136,
"step": 5140
},
{
"epoch": 0.5639592239394936,
"grad_norm": 9.468875885009766,
"learning_rate": 2.00272613045205e-05,
"loss": 3.2346,
"num_input_tokens_seen": 3385456,
"step": 5145
},
{
"epoch": 0.5645072892688808,
"grad_norm": 6.274002552032471,
"learning_rate": 1.9985083876645368e-05,
"loss": 3.1731,
"num_input_tokens_seen": 3388976,
"step": 5150
},
{
"epoch": 0.5650553545982682,
"grad_norm": 5.550570487976074,
"learning_rate": 1.994292131592872e-05,
"loss": 3.2257,
"num_input_tokens_seen": 3392736,
"step": 5155
},
{
"epoch": 0.5656034199276554,
"grad_norm": 8.218210220336914,
"learning_rate": 1.990077374736515e-05,
"loss": 3.0855,
"num_input_tokens_seen": 3396128,
"step": 5160
},
{
"epoch": 0.5661514852570426,
"grad_norm": 7.721156597137451,
"learning_rate": 1.9858641295904813e-05,
"loss": 2.9721,
"num_input_tokens_seen": 3399376,
"step": 5165
},
{
"epoch": 0.56669955058643,
"grad_norm": 6.2414231300354,
"learning_rate": 1.981652408645307e-05,
"loss": 3.3822,
"num_input_tokens_seen": 3401928,
"step": 5170
},
{
"epoch": 0.5672476159158172,
"grad_norm": 8.496658325195312,
"learning_rate": 1.9774422243870078e-05,
"loss": 3.0474,
"num_input_tokens_seen": 3404744,
"step": 5175
},
{
"epoch": 0.5677956812452044,
"grad_norm": 7.224369049072266,
"learning_rate": 1.9732335892970427e-05,
"loss": 3.259,
"num_input_tokens_seen": 3407824,
"step": 5180
},
{
"epoch": 0.5683437465745917,
"grad_norm": 9.386946678161621,
"learning_rate": 1.969026515852281e-05,
"loss": 3.0473,
"num_input_tokens_seen": 3410608,
"step": 5185
},
{
"epoch": 0.568891811903979,
"grad_norm": 8.189655303955078,
"learning_rate": 1.96482101652496e-05,
"loss": 3.3926,
"num_input_tokens_seen": 3413592,
"step": 5190
},
{
"epoch": 0.5694398772333662,
"grad_norm": 6.405150890350342,
"learning_rate": 1.9606171037826502e-05,
"loss": 2.9921,
"num_input_tokens_seen": 3417320,
"step": 5195
},
{
"epoch": 0.5699879425627535,
"grad_norm": 6.89292573928833,
"learning_rate": 1.9564147900882213e-05,
"loss": 2.9261,
"num_input_tokens_seen": 3420888,
"step": 5200
},
{
"epoch": 0.5705360078921408,
"grad_norm": 6.517080307006836,
"learning_rate": 1.9522140878997995e-05,
"loss": 3.3255,
"num_input_tokens_seen": 3424336,
"step": 5205
},
{
"epoch": 0.571084073221528,
"grad_norm": 8.910572052001953,
"learning_rate": 1.9480150096707344e-05,
"loss": 2.9723,
"num_input_tokens_seen": 3428120,
"step": 5210
},
{
"epoch": 0.5716321385509152,
"grad_norm": 8.455070495605469,
"learning_rate": 1.943817567849563e-05,
"loss": 3.0703,
"num_input_tokens_seen": 3430880,
"step": 5215
},
{
"epoch": 0.5721802038803026,
"grad_norm": 6.948888778686523,
"learning_rate": 1.9396217748799682e-05,
"loss": 2.9862,
"num_input_tokens_seen": 3435560,
"step": 5220
},
{
"epoch": 0.5727282692096898,
"grad_norm": 6.147201061248779,
"learning_rate": 1.935427643200746e-05,
"loss": 3.0719,
"num_input_tokens_seen": 3438352,
"step": 5225
},
{
"epoch": 0.573276334539077,
"grad_norm": 7.213772773742676,
"learning_rate": 1.9312351852457686e-05,
"loss": 2.9474,
"num_input_tokens_seen": 3441216,
"step": 5230
},
{
"epoch": 0.5738243998684643,
"grad_norm": 6.16003942489624,
"learning_rate": 1.9270444134439434e-05,
"loss": 3.0849,
"num_input_tokens_seen": 3444944,
"step": 5235
},
{
"epoch": 0.5743724651978516,
"grad_norm": 7.64081335067749,
"learning_rate": 1.9228553402191822e-05,
"loss": 3.0799,
"num_input_tokens_seen": 3449568,
"step": 5240
},
{
"epoch": 0.5749205305272388,
"grad_norm": 7.353094577789307,
"learning_rate": 1.91866797799036e-05,
"loss": 3.3501,
"num_input_tokens_seen": 3452544,
"step": 5245
},
{
"epoch": 0.5754685958566261,
"grad_norm": 7.696213722229004,
"learning_rate": 1.9144823391712785e-05,
"loss": 3.2286,
"num_input_tokens_seen": 3455600,
"step": 5250
},
{
"epoch": 0.5760166611860134,
"grad_norm": 9.90982723236084,
"learning_rate": 1.91029843617063e-05,
"loss": 3.3799,
"num_input_tokens_seen": 3458728,
"step": 5255
},
{
"epoch": 0.5765647265154006,
"grad_norm": 6.676484107971191,
"learning_rate": 1.9061162813919637e-05,
"loss": 3.2611,
"num_input_tokens_seen": 3461888,
"step": 5260
},
{
"epoch": 0.5771127918447879,
"grad_norm": 6.546321868896484,
"learning_rate": 1.9019358872336428e-05,
"loss": 2.9518,
"num_input_tokens_seen": 3464880,
"step": 5265
},
{
"epoch": 0.5776608571741751,
"grad_norm": 5.9848151206970215,
"learning_rate": 1.8977572660888122e-05,
"loss": 3.1144,
"num_input_tokens_seen": 3467712,
"step": 5270
},
{
"epoch": 0.5782089225035624,
"grad_norm": 6.030148506164551,
"learning_rate": 1.8935804303453612e-05,
"loss": 3.0001,
"num_input_tokens_seen": 3471760,
"step": 5275
},
{
"epoch": 0.5787569878329497,
"grad_norm": 9.319378852844238,
"learning_rate": 1.8894053923858857e-05,
"loss": 2.7935,
"num_input_tokens_seen": 3475928,
"step": 5280
},
{
"epoch": 0.5793050531623369,
"grad_norm": 7.607476711273193,
"learning_rate": 1.8852321645876507e-05,
"loss": 2.9319,
"num_input_tokens_seen": 3478968,
"step": 5285
},
{
"epoch": 0.5798531184917242,
"grad_norm": 7.065295219421387,
"learning_rate": 1.8810607593225567e-05,
"loss": 2.9655,
"num_input_tokens_seen": 3482160,
"step": 5290
},
{
"epoch": 0.5804011838211115,
"grad_norm": 6.528260707855225,
"learning_rate": 1.8768911889571002e-05,
"loss": 3.0625,
"num_input_tokens_seen": 3486016,
"step": 5295
},
{
"epoch": 0.5809492491504987,
"grad_norm": 8.56631851196289,
"learning_rate": 1.8727234658523368e-05,
"loss": 3.1642,
"num_input_tokens_seen": 3488552,
"step": 5300
},
{
"epoch": 0.581497314479886,
"grad_norm": 6.70935583114624,
"learning_rate": 1.8685576023638495e-05,
"loss": 2.908,
"num_input_tokens_seen": 3492192,
"step": 5305
},
{
"epoch": 0.5820453798092733,
"grad_norm": 9.139800071716309,
"learning_rate": 1.864393610841704e-05,
"loss": 3.0694,
"num_input_tokens_seen": 3495032,
"step": 5310
},
{
"epoch": 0.5825934451386605,
"grad_norm": 6.343008041381836,
"learning_rate": 1.8602315036304175e-05,
"loss": 2.939,
"num_input_tokens_seen": 3498288,
"step": 5315
},
{
"epoch": 0.5831415104680477,
"grad_norm": 6.961386203765869,
"learning_rate": 1.8560712930689238e-05,
"loss": 2.7722,
"num_input_tokens_seen": 3501112,
"step": 5320
},
{
"epoch": 0.5836895757974351,
"grad_norm": 8.582582473754883,
"learning_rate": 1.851912991490531e-05,
"loss": 3.0957,
"num_input_tokens_seen": 3504384,
"step": 5325
},
{
"epoch": 0.5842376411268223,
"grad_norm": 6.227029800415039,
"learning_rate": 1.8477566112228878e-05,
"loss": 3.2204,
"num_input_tokens_seen": 3508024,
"step": 5330
},
{
"epoch": 0.5847857064562095,
"grad_norm": 6.587297439575195,
"learning_rate": 1.8436021645879494e-05,
"loss": 3.1471,
"num_input_tokens_seen": 3511392,
"step": 5335
},
{
"epoch": 0.5853337717855969,
"grad_norm": 5.520746231079102,
"learning_rate": 1.839449663901936e-05,
"loss": 2.9406,
"num_input_tokens_seen": 3514568,
"step": 5340
},
{
"epoch": 0.5858818371149841,
"grad_norm": 5.80632209777832,
"learning_rate": 1.8352991214752983e-05,
"loss": 2.9652,
"num_input_tokens_seen": 3517672,
"step": 5345
},
{
"epoch": 0.5864299024443713,
"grad_norm": 4.704535484313965,
"learning_rate": 1.8311505496126868e-05,
"loss": 2.7212,
"num_input_tokens_seen": 3522392,
"step": 5350
},
{
"epoch": 0.5869779677737587,
"grad_norm": 12.650748252868652,
"learning_rate": 1.8270039606129045e-05,
"loss": 3.7118,
"num_input_tokens_seen": 3526336,
"step": 5355
},
{
"epoch": 0.5875260331031459,
"grad_norm": 9.578808784484863,
"learning_rate": 1.8228593667688772e-05,
"loss": 3.2441,
"num_input_tokens_seen": 3530656,
"step": 5360
},
{
"epoch": 0.5880740984325331,
"grad_norm": 6.2789812088012695,
"learning_rate": 1.818716780367618e-05,
"loss": 2.7651,
"num_input_tokens_seen": 3533184,
"step": 5365
},
{
"epoch": 0.5886221637619204,
"grad_norm": 8.422161102294922,
"learning_rate": 1.8145762136901874e-05,
"loss": 3.3134,
"num_input_tokens_seen": 3536976,
"step": 5370
},
{
"epoch": 0.5891702290913077,
"grad_norm": 7.674281597137451,
"learning_rate": 1.8104376790116572e-05,
"loss": 3.1223,
"num_input_tokens_seen": 3540496,
"step": 5375
},
{
"epoch": 0.5897182944206949,
"grad_norm": 7.617640495300293,
"learning_rate": 1.8063011886010777e-05,
"loss": 3.4106,
"num_input_tokens_seen": 3542952,
"step": 5380
},
{
"epoch": 0.5902663597500822,
"grad_norm": 6.847158908843994,
"learning_rate": 1.8021667547214367e-05,
"loss": 3.4031,
"num_input_tokens_seen": 3545952,
"step": 5385
},
{
"epoch": 0.5908144250794695,
"grad_norm": 7.656712532043457,
"learning_rate": 1.7980343896296243e-05,
"loss": 3.1261,
"num_input_tokens_seen": 3548960,
"step": 5390
},
{
"epoch": 0.5913624904088567,
"grad_norm": 6.854838848114014,
"learning_rate": 1.7939041055764015e-05,
"loss": 2.8715,
"num_input_tokens_seen": 3552888,
"step": 5395
},
{
"epoch": 0.591910555738244,
"grad_norm": 7.809703350067139,
"learning_rate": 1.789775914806357e-05,
"loss": 3.0002,
"num_input_tokens_seen": 3556448,
"step": 5400
},
{
"epoch": 0.5924586210676313,
"grad_norm": 9.405502319335938,
"learning_rate": 1.785649829557873e-05,
"loss": 3.4519,
"num_input_tokens_seen": 3560392,
"step": 5405
},
{
"epoch": 0.5930066863970185,
"grad_norm": 9.429394721984863,
"learning_rate": 1.781525862063092e-05,
"loss": 3.2288,
"num_input_tokens_seen": 3563680,
"step": 5410
},
{
"epoch": 0.5935547517264058,
"grad_norm": 6.114898204803467,
"learning_rate": 1.7774040245478767e-05,
"loss": 3.3265,
"num_input_tokens_seen": 3567200,
"step": 5415
},
{
"epoch": 0.594102817055793,
"grad_norm": 6.565958499908447,
"learning_rate": 1.7732843292317757e-05,
"loss": 3.0318,
"num_input_tokens_seen": 3570120,
"step": 5420
},
{
"epoch": 0.5946508823851803,
"grad_norm": 7.470787048339844,
"learning_rate": 1.7691667883279877e-05,
"loss": 2.9758,
"num_input_tokens_seen": 3573704,
"step": 5425
},
{
"epoch": 0.5951989477145676,
"grad_norm": 6.305603504180908,
"learning_rate": 1.7650514140433226e-05,
"loss": 2.8946,
"num_input_tokens_seen": 3577472,
"step": 5430
},
{
"epoch": 0.5957470130439548,
"grad_norm": 7.486173629760742,
"learning_rate": 1.760938218578168e-05,
"loss": 3.0453,
"num_input_tokens_seen": 3579928,
"step": 5435
},
{
"epoch": 0.5962950783733421,
"grad_norm": 5.27332067489624,
"learning_rate": 1.7568272141264542e-05,
"loss": 3.0027,
"num_input_tokens_seen": 3582744,
"step": 5440
},
{
"epoch": 0.5968431437027294,
"grad_norm": 5.261857986450195,
"learning_rate": 1.752718412875613e-05,
"loss": 3.373,
"num_input_tokens_seen": 3586344,
"step": 5445
},
{
"epoch": 0.5973912090321166,
"grad_norm": 7.151644706726074,
"learning_rate": 1.748611827006545e-05,
"loss": 3.0059,
"num_input_tokens_seen": 3590696,
"step": 5450
},
{
"epoch": 0.5979392743615038,
"grad_norm": 6.867771148681641,
"learning_rate": 1.7445074686935865e-05,
"loss": 2.9594,
"num_input_tokens_seen": 3593960,
"step": 5455
},
{
"epoch": 0.5984873396908912,
"grad_norm": 10.243605613708496,
"learning_rate": 1.740405350104466e-05,
"loss": 3.1614,
"num_input_tokens_seen": 3597248,
"step": 5460
},
{
"epoch": 0.5990354050202784,
"grad_norm": 7.2442827224731445,
"learning_rate": 1.736305483400273e-05,
"loss": 3.444,
"num_input_tokens_seen": 3600048,
"step": 5465
},
{
"epoch": 0.5995834703496656,
"grad_norm": 8.634395599365234,
"learning_rate": 1.7322078807354232e-05,
"loss": 3.6502,
"num_input_tokens_seen": 3603160,
"step": 5470
},
{
"epoch": 0.600131535679053,
"grad_norm": 7.339416027069092,
"learning_rate": 1.728112554257618e-05,
"loss": 2.9444,
"num_input_tokens_seen": 3606976,
"step": 5475
},
{
"epoch": 0.6006796010084402,
"grad_norm": 6.438117027282715,
"learning_rate": 1.7240195161078112e-05,
"loss": 2.7825,
"num_input_tokens_seen": 3610368,
"step": 5480
},
{
"epoch": 0.6012276663378274,
"grad_norm": 8.13581657409668,
"learning_rate": 1.7199287784201752e-05,
"loss": 3.1469,
"num_input_tokens_seen": 3613240,
"step": 5485
},
{
"epoch": 0.6017757316672148,
"grad_norm": 9.25243854522705,
"learning_rate": 1.715840353322059e-05,
"loss": 3.1494,
"num_input_tokens_seen": 3616384,
"step": 5490
},
{
"epoch": 0.602323796996602,
"grad_norm": 6.846777439117432,
"learning_rate": 1.7117542529339564e-05,
"loss": 3.0651,
"num_input_tokens_seen": 3620600,
"step": 5495
},
{
"epoch": 0.6028718623259892,
"grad_norm": 9.576505661010742,
"learning_rate": 1.7076704893694725e-05,
"loss": 3.2062,
"num_input_tokens_seen": 3624184,
"step": 5500
},
{
"epoch": 0.6034199276553766,
"grad_norm": 5.831842422485352,
"learning_rate": 1.7035890747352812e-05,
"loss": 2.9302,
"num_input_tokens_seen": 3628160,
"step": 5505
},
{
"epoch": 0.6039679929847638,
"grad_norm": 6.526121139526367,
"learning_rate": 1.699510021131093e-05,
"loss": 3.0619,
"num_input_tokens_seen": 3632144,
"step": 5510
},
{
"epoch": 0.604516058314151,
"grad_norm": 8.087743759155273,
"learning_rate": 1.695433340649622e-05,
"loss": 3.1402,
"num_input_tokens_seen": 3635512,
"step": 5515
},
{
"epoch": 0.6050641236435383,
"grad_norm": 4.840604305267334,
"learning_rate": 1.6913590453765436e-05,
"loss": 3.0223,
"num_input_tokens_seen": 3638824,
"step": 5520
},
{
"epoch": 0.6056121889729256,
"grad_norm": 7.919428825378418,
"learning_rate": 1.687287147390463e-05,
"loss": 2.7976,
"num_input_tokens_seen": 3642704,
"step": 5525
},
{
"epoch": 0.6061602543023128,
"grad_norm": 5.97782039642334,
"learning_rate": 1.6832176587628784e-05,
"loss": 2.9795,
"num_input_tokens_seen": 3645432,
"step": 5530
},
{
"epoch": 0.6067083196317001,
"grad_norm": 7.9558539390563965,
"learning_rate": 1.6791505915581474e-05,
"loss": 3.0965,
"num_input_tokens_seen": 3647912,
"step": 5535
},
{
"epoch": 0.6072563849610874,
"grad_norm": 7.399658203125,
"learning_rate": 1.675085957833446e-05,
"loss": 3.0064,
"num_input_tokens_seen": 3651176,
"step": 5540
},
{
"epoch": 0.6078044502904746,
"grad_norm": 5.475082874298096,
"learning_rate": 1.6710237696387364e-05,
"loss": 3.0204,
"num_input_tokens_seen": 3653864,
"step": 5545
},
{
"epoch": 0.6083525156198619,
"grad_norm": 7.328055381774902,
"learning_rate": 1.666964039016734e-05,
"loss": 3.4209,
"num_input_tokens_seen": 3656896,
"step": 5550
},
{
"epoch": 0.6089005809492491,
"grad_norm": 6.844607353210449,
"learning_rate": 1.6629067780028643e-05,
"loss": 2.8587,
"num_input_tokens_seen": 3660032,
"step": 5555
},
{
"epoch": 0.6094486462786364,
"grad_norm": 8.957280158996582,
"learning_rate": 1.6588519986252334e-05,
"loss": 3.3932,
"num_input_tokens_seen": 3662592,
"step": 5560
},
{
"epoch": 0.6099967116080237,
"grad_norm": 6.236993789672852,
"learning_rate": 1.6547997129045907e-05,
"loss": 2.8217,
"num_input_tokens_seen": 3665480,
"step": 5565
},
{
"epoch": 0.6105447769374109,
"grad_norm": 6.7575201988220215,
"learning_rate": 1.6507499328542926e-05,
"loss": 3.1285,
"num_input_tokens_seen": 3668296,
"step": 5570
},
{
"epoch": 0.6110928422667982,
"grad_norm": 6.297115802764893,
"learning_rate": 1.6467026704802652e-05,
"loss": 3.0519,
"num_input_tokens_seen": 3671088,
"step": 5575
},
{
"epoch": 0.6116409075961855,
"grad_norm": 5.6386003494262695,
"learning_rate": 1.6426579377809755e-05,
"loss": 3.0005,
"num_input_tokens_seen": 3674856,
"step": 5580
},
{
"epoch": 0.6121889729255727,
"grad_norm": 5.507198333740234,
"learning_rate": 1.6386157467473867e-05,
"loss": 3.0995,
"num_input_tokens_seen": 3677256,
"step": 5585
},
{
"epoch": 0.61273703825496,
"grad_norm": 6.467530250549316,
"learning_rate": 1.6345761093629276e-05,
"loss": 3.1279,
"num_input_tokens_seen": 3680248,
"step": 5590
},
{
"epoch": 0.6132851035843473,
"grad_norm": 6.12019681930542,
"learning_rate": 1.630539037603459e-05,
"loss": 3.0768,
"num_input_tokens_seen": 3683464,
"step": 5595
},
{
"epoch": 0.6138331689137345,
"grad_norm": 6.198227882385254,
"learning_rate": 1.626504543437234e-05,
"loss": 3.1144,
"num_input_tokens_seen": 3686448,
"step": 5600
},
{
"epoch": 0.6143812342431217,
"grad_norm": 8.729185104370117,
"learning_rate": 1.6224726388248622e-05,
"loss": 3.2992,
"num_input_tokens_seen": 3690360,
"step": 5605
},
{
"epoch": 0.6149292995725091,
"grad_norm": 8.366303443908691,
"learning_rate": 1.618443335719281e-05,
"loss": 3.1796,
"num_input_tokens_seen": 3693344,
"step": 5610
},
{
"epoch": 0.6154773649018963,
"grad_norm": 5.997150897979736,
"learning_rate": 1.614416646065711e-05,
"loss": 3.0782,
"num_input_tokens_seen": 3696488,
"step": 5615
},
{
"epoch": 0.6160254302312835,
"grad_norm": 6.210281848907471,
"learning_rate": 1.6103925818016257e-05,
"loss": 3.0592,
"num_input_tokens_seen": 3700080,
"step": 5620
},
{
"epoch": 0.6165734955606709,
"grad_norm": 10.414953231811523,
"learning_rate": 1.606371154856719e-05,
"loss": 2.9467,
"num_input_tokens_seen": 3703264,
"step": 5625
},
{
"epoch": 0.6171215608900581,
"grad_norm": 6.666655540466309,
"learning_rate": 1.6023523771528623e-05,
"loss": 3.3406,
"num_input_tokens_seen": 3706232,
"step": 5630
},
{
"epoch": 0.6176696262194453,
"grad_norm": 6.776188373565674,
"learning_rate": 1.5983362606040733e-05,
"loss": 2.9584,
"num_input_tokens_seen": 3709728,
"step": 5635
},
{
"epoch": 0.6182176915488327,
"grad_norm": 6.977499008178711,
"learning_rate": 1.5943228171164837e-05,
"loss": 3.607,
"num_input_tokens_seen": 3713824,
"step": 5640
},
{
"epoch": 0.6187657568782199,
"grad_norm": 6.040121555328369,
"learning_rate": 1.5903120585882974e-05,
"loss": 3.4444,
"num_input_tokens_seen": 3718048,
"step": 5645
},
{
"epoch": 0.6193138222076071,
"grad_norm": 7.120656967163086,
"learning_rate": 1.5863039969097592e-05,
"loss": 3.3153,
"num_input_tokens_seen": 3720360,
"step": 5650
},
{
"epoch": 0.6198618875369944,
"grad_norm": 10.212481498718262,
"learning_rate": 1.5822986439631207e-05,
"loss": 3.0222,
"num_input_tokens_seen": 3723136,
"step": 5655
},
{
"epoch": 0.6204099528663817,
"grad_norm": 6.770248889923096,
"learning_rate": 1.5782960116226007e-05,
"loss": 2.9785,
"num_input_tokens_seen": 3726064,
"step": 5660
},
{
"epoch": 0.6209580181957689,
"grad_norm": 5.595423221588135,
"learning_rate": 1.574296111754353e-05,
"loss": 3.03,
"num_input_tokens_seen": 3729800,
"step": 5665
},
{
"epoch": 0.6215060835251562,
"grad_norm": 6.7276225090026855,
"learning_rate": 1.5702989562164337e-05,
"loss": 3.2465,
"num_input_tokens_seen": 3733608,
"step": 5670
},
{
"epoch": 0.6220541488545435,
"grad_norm": 7.501856327056885,
"learning_rate": 1.5663045568587592e-05,
"loss": 2.8702,
"num_input_tokens_seen": 3736928,
"step": 5675
},
{
"epoch": 0.6226022141839307,
"grad_norm": 4.790249824523926,
"learning_rate": 1.562312925523076e-05,
"loss": 3.0023,
"num_input_tokens_seen": 3740256,
"step": 5680
},
{
"epoch": 0.623150279513318,
"grad_norm": 6.182326316833496,
"learning_rate": 1.5583240740429266e-05,
"loss": 2.9844,
"num_input_tokens_seen": 3743504,
"step": 5685
},
{
"epoch": 0.6236983448427053,
"grad_norm": 8.316134452819824,
"learning_rate": 1.5543380142436108e-05,
"loss": 3.1194,
"num_input_tokens_seen": 3746976,
"step": 5690
},
{
"epoch": 0.6242464101720925,
"grad_norm": 4.825036525726318,
"learning_rate": 1.5503547579421507e-05,
"loss": 2.9029,
"num_input_tokens_seen": 3749736,
"step": 5695
},
{
"epoch": 0.6247944755014798,
"grad_norm": 5.379034996032715,
"learning_rate": 1.5463743169472604e-05,
"loss": 2.813,
"num_input_tokens_seen": 3754312,
"step": 5700
},
{
"epoch": 0.625342540830867,
"grad_norm": 7.649238586425781,
"learning_rate": 1.5423967030593054e-05,
"loss": 2.9726,
"num_input_tokens_seen": 3757320,
"step": 5705
},
{
"epoch": 0.6258906061602543,
"grad_norm": 8.456625938415527,
"learning_rate": 1.5384219280702707e-05,
"loss": 2.9852,
"num_input_tokens_seen": 3761320,
"step": 5710
},
{
"epoch": 0.6264386714896416,
"grad_norm": 5.238711833953857,
"learning_rate": 1.534450003763726e-05,
"loss": 2.8722,
"num_input_tokens_seen": 3764536,
"step": 5715
},
{
"epoch": 0.6269867368190288,
"grad_norm": 7.77496337890625,
"learning_rate": 1.5304809419147885e-05,
"loss": 3.0119,
"num_input_tokens_seen": 3766832,
"step": 5720
},
{
"epoch": 0.6275348021484161,
"grad_norm": 6.092039108276367,
"learning_rate": 1.526514754290089e-05,
"loss": 3.1644,
"num_input_tokens_seen": 3770960,
"step": 5725
},
{
"epoch": 0.6280828674778034,
"grad_norm": 8.289813995361328,
"learning_rate": 1.5225514526477408e-05,
"loss": 3.0392,
"num_input_tokens_seen": 3774184,
"step": 5730
},
{
"epoch": 0.6286309328071906,
"grad_norm": 7.361676216125488,
"learning_rate": 1.5185910487372973e-05,
"loss": 2.9171,
"num_input_tokens_seen": 3778784,
"step": 5735
},
{
"epoch": 0.6291789981365778,
"grad_norm": 6.253126621246338,
"learning_rate": 1.514633554299723e-05,
"loss": 2.9294,
"num_input_tokens_seen": 3781568,
"step": 5740
},
{
"epoch": 0.6297270634659652,
"grad_norm": 10.453216552734375,
"learning_rate": 1.5106789810673578e-05,
"loss": 3.2064,
"num_input_tokens_seen": 3784152,
"step": 5745
},
{
"epoch": 0.6302751287953524,
"grad_norm": 7.798788547515869,
"learning_rate": 1.506727340763881e-05,
"loss": 2.9679,
"num_input_tokens_seen": 3786864,
"step": 5750
},
{
"epoch": 0.6308231941247396,
"grad_norm": 7.438601493835449,
"learning_rate": 1.5027786451042758e-05,
"loss": 2.9835,
"num_input_tokens_seen": 3790360,
"step": 5755
},
{
"epoch": 0.631371259454127,
"grad_norm": 8.202717781066895,
"learning_rate": 1.498832905794797e-05,
"loss": 3.1209,
"num_input_tokens_seen": 3793160,
"step": 5760
},
{
"epoch": 0.6319193247835142,
"grad_norm": 7.448530673980713,
"learning_rate": 1.4948901345329352e-05,
"loss": 3.1779,
"num_input_tokens_seen": 3797568,
"step": 5765
},
{
"epoch": 0.6324673901129014,
"grad_norm": 5.029766082763672,
"learning_rate": 1.4909503430073796e-05,
"loss": 2.8519,
"num_input_tokens_seen": 3801096,
"step": 5770
},
{
"epoch": 0.6330154554422888,
"grad_norm": 5.234902858734131,
"learning_rate": 1.48701354289799e-05,
"loss": 3.1461,
"num_input_tokens_seen": 3806256,
"step": 5775
},
{
"epoch": 0.633563520771676,
"grad_norm": 8.089512825012207,
"learning_rate": 1.4830797458757544e-05,
"loss": 3.12,
"num_input_tokens_seen": 3808880,
"step": 5780
},
{
"epoch": 0.6341115861010632,
"grad_norm": 5.7707839012146,
"learning_rate": 1.4791489636027583e-05,
"loss": 2.7087,
"num_input_tokens_seen": 3813584,
"step": 5785
},
{
"epoch": 0.6346596514304506,
"grad_norm": 6.020088195800781,
"learning_rate": 1.475221207732151e-05,
"loss": 2.9224,
"num_input_tokens_seen": 3816848,
"step": 5790
},
{
"epoch": 0.6352077167598378,
"grad_norm": 6.976149082183838,
"learning_rate": 1.4712964899081093e-05,
"loss": 3.0359,
"num_input_tokens_seen": 3820368,
"step": 5795
},
{
"epoch": 0.635755782089225,
"grad_norm": 7.066904544830322,
"learning_rate": 1.4673748217658026e-05,
"loss": 3.0753,
"num_input_tokens_seen": 3823064,
"step": 5800
},
{
"epoch": 0.6363038474186123,
"grad_norm": 5.929400444030762,
"learning_rate": 1.4634562149313607e-05,
"loss": 3.1222,
"num_input_tokens_seen": 3826048,
"step": 5805
},
{
"epoch": 0.6368519127479996,
"grad_norm": 6.900379657745361,
"learning_rate": 1.459540681021836e-05,
"loss": 3.4275,
"num_input_tokens_seen": 3829584,
"step": 5810
},
{
"epoch": 0.6373999780773868,
"grad_norm": 6.451569080352783,
"learning_rate": 1.4556282316451733e-05,
"loss": 3.0381,
"num_input_tokens_seen": 3832848,
"step": 5815
},
{
"epoch": 0.6379480434067741,
"grad_norm": 6.459670066833496,
"learning_rate": 1.4517188784001712e-05,
"loss": 2.9231,
"num_input_tokens_seen": 3835392,
"step": 5820
},
{
"epoch": 0.6384961087361614,
"grad_norm": 9.6491117477417,
"learning_rate": 1.4478126328764496e-05,
"loss": 3.1121,
"num_input_tokens_seen": 3839016,
"step": 5825
},
{
"epoch": 0.6390441740655486,
"grad_norm": 6.9248552322387695,
"learning_rate": 1.4439095066544154e-05,
"loss": 3.0439,
"num_input_tokens_seen": 3841424,
"step": 5830
},
{
"epoch": 0.6395922393949359,
"grad_norm": 8.927162170410156,
"learning_rate": 1.44000951130523e-05,
"loss": 2.9511,
"num_input_tokens_seen": 3843624,
"step": 5835
},
{
"epoch": 0.6401403047243232,
"grad_norm": 7.547786712646484,
"learning_rate": 1.4361126583907708e-05,
"loss": 3.2556,
"num_input_tokens_seen": 3846024,
"step": 5840
},
{
"epoch": 0.6406883700537104,
"grad_norm": 9.325125694274902,
"learning_rate": 1.432218959463599e-05,
"loss": 3.2518,
"num_input_tokens_seen": 3849176,
"step": 5845
},
{
"epoch": 0.6412364353830977,
"grad_norm": 7.831711292266846,
"learning_rate": 1.4283284260669282e-05,
"loss": 3.3252,
"num_input_tokens_seen": 3851496,
"step": 5850
},
{
"epoch": 0.6417845007124849,
"grad_norm": 5.674088001251221,
"learning_rate": 1.4244410697345845e-05,
"loss": 3.1402,
"num_input_tokens_seen": 3854384,
"step": 5855
},
{
"epoch": 0.6423325660418722,
"grad_norm": 5.759450912475586,
"learning_rate": 1.4205569019909759e-05,
"loss": 3.2573,
"num_input_tokens_seen": 3857336,
"step": 5860
},
{
"epoch": 0.6428806313712595,
"grad_norm": 6.425468921661377,
"learning_rate": 1.4166759343510599e-05,
"loss": 2.994,
"num_input_tokens_seen": 3860008,
"step": 5865
},
{
"epoch": 0.6434286967006467,
"grad_norm": 8.979571342468262,
"learning_rate": 1.4127981783203049e-05,
"loss": 2.8518,
"num_input_tokens_seen": 3863232,
"step": 5870
},
{
"epoch": 0.643976762030034,
"grad_norm": 7.848270416259766,
"learning_rate": 1.4089236453946563e-05,
"loss": 3.312,
"num_input_tokens_seen": 3867768,
"step": 5875
},
{
"epoch": 0.6445248273594213,
"grad_norm": 6.893942832946777,
"learning_rate": 1.4050523470605099e-05,
"loss": 3.0278,
"num_input_tokens_seen": 3870384,
"step": 5880
},
{
"epoch": 0.6450728926888085,
"grad_norm": 6.547880172729492,
"learning_rate": 1.4011842947946674e-05,
"loss": 2.7762,
"num_input_tokens_seen": 3873064,
"step": 5885
},
{
"epoch": 0.6456209580181957,
"grad_norm": 8.624503135681152,
"learning_rate": 1.397319500064308e-05,
"loss": 2.8362,
"num_input_tokens_seen": 3876656,
"step": 5890
},
{
"epoch": 0.6461690233475831,
"grad_norm": 7.134870529174805,
"learning_rate": 1.3934579743269561e-05,
"loss": 2.6202,
"num_input_tokens_seen": 3880296,
"step": 5895
},
{
"epoch": 0.6467170886769703,
"grad_norm": 7.61886739730835,
"learning_rate": 1.389599729030443e-05,
"loss": 2.9104,
"num_input_tokens_seen": 3883280,
"step": 5900
},
{
"epoch": 0.6472651540063575,
"grad_norm": 6.761881351470947,
"learning_rate": 1.3857447756128744e-05,
"loss": 2.9658,
"num_input_tokens_seen": 3885848,
"step": 5905
},
{
"epoch": 0.6478132193357449,
"grad_norm": 9.020877838134766,
"learning_rate": 1.381893125502598e-05,
"loss": 3.1887,
"num_input_tokens_seen": 3889168,
"step": 5910
},
{
"epoch": 0.6483612846651321,
"grad_norm": 7.6226091384887695,
"learning_rate": 1.3780447901181681e-05,
"loss": 3.2913,
"num_input_tokens_seen": 3892368,
"step": 5915
},
{
"epoch": 0.6489093499945193,
"grad_norm": 6.327563285827637,
"learning_rate": 1.374199780868311e-05,
"loss": 2.868,
"num_input_tokens_seen": 3895192,
"step": 5920
},
{
"epoch": 0.6494574153239067,
"grad_norm": 7.200982093811035,
"learning_rate": 1.3703581091518964e-05,
"loss": 2.9841,
"num_input_tokens_seen": 3899104,
"step": 5925
},
{
"epoch": 0.6500054806532939,
"grad_norm": 7.297597885131836,
"learning_rate": 1.3665197863578954e-05,
"loss": 3.1225,
"num_input_tokens_seen": 3901696,
"step": 5930
},
{
"epoch": 0.6505535459826811,
"grad_norm": 6.203746318817139,
"learning_rate": 1.3626848238653516e-05,
"loss": 3.082,
"num_input_tokens_seen": 3905192,
"step": 5935
},
{
"epoch": 0.6511016113120685,
"grad_norm": 7.677253246307373,
"learning_rate": 1.358853233043349e-05,
"loss": 3.2795,
"num_input_tokens_seen": 3908456,
"step": 5940
},
{
"epoch": 0.6516496766414557,
"grad_norm": 6.703474044799805,
"learning_rate": 1.3550250252509744e-05,
"loss": 3.123,
"num_input_tokens_seen": 3910504,
"step": 5945
},
{
"epoch": 0.6521977419708429,
"grad_norm": 7.855628967285156,
"learning_rate": 1.3512002118372835e-05,
"loss": 2.8393,
"num_input_tokens_seen": 3913032,
"step": 5950
},
{
"epoch": 0.6527458073002302,
"grad_norm": 7.922531604766846,
"learning_rate": 1.3473788041412732e-05,
"loss": 2.7007,
"num_input_tokens_seen": 3916392,
"step": 5955
},
{
"epoch": 0.6532938726296175,
"grad_norm": 10.957340240478516,
"learning_rate": 1.3435608134918412e-05,
"loss": 2.9213,
"num_input_tokens_seen": 3919248,
"step": 5960
},
{
"epoch": 0.6538419379590047,
"grad_norm": 5.184296607971191,
"learning_rate": 1.3397462512077535e-05,
"loss": 3.203,
"num_input_tokens_seen": 3922528,
"step": 5965
},
{
"epoch": 0.654390003288392,
"grad_norm": 8.037724494934082,
"learning_rate": 1.3359351285976174e-05,
"loss": 3.1737,
"num_input_tokens_seen": 3925200,
"step": 5970
},
{
"epoch": 0.6549380686177793,
"grad_norm": 7.275876045227051,
"learning_rate": 1.3321274569598382e-05,
"loss": 2.848,
"num_input_tokens_seen": 3928128,
"step": 5975
},
{
"epoch": 0.6554861339471665,
"grad_norm": 5.043073654174805,
"learning_rate": 1.3283232475825916e-05,
"loss": 2.8843,
"num_input_tokens_seen": 3931696,
"step": 5980
},
{
"epoch": 0.6560341992765538,
"grad_norm": 8.235861778259277,
"learning_rate": 1.3245225117437918e-05,
"loss": 3.3592,
"num_input_tokens_seen": 3934656,
"step": 5985
},
{
"epoch": 0.656582264605941,
"grad_norm": 7.135794162750244,
"learning_rate": 1.3207252607110521e-05,
"loss": 3.263,
"num_input_tokens_seen": 3937536,
"step": 5990
},
{
"epoch": 0.6571303299353283,
"grad_norm": 8.360773086547852,
"learning_rate": 1.3169315057416564e-05,
"loss": 3.1673,
"num_input_tokens_seen": 3940200,
"step": 5995
},
{
"epoch": 0.6576783952647156,
"grad_norm": 9.115818977355957,
"learning_rate": 1.3131412580825236e-05,
"loss": 3.1802,
"num_input_tokens_seen": 3942688,
"step": 6000
},
{
"epoch": 0.6582264605941028,
"grad_norm": 8.476052284240723,
"learning_rate": 1.3093545289701747e-05,
"loss": 3.1919,
"num_input_tokens_seen": 3945760,
"step": 6005
},
{
"epoch": 0.6587745259234901,
"grad_norm": 6.621984004974365,
"learning_rate": 1.3055713296307016e-05,
"loss": 2.8701,
"num_input_tokens_seen": 3948512,
"step": 6010
},
{
"epoch": 0.6593225912528773,
"grad_norm": 8.03313159942627,
"learning_rate": 1.3017916712797293e-05,
"loss": 3.3227,
"num_input_tokens_seen": 3951520,
"step": 6015
},
{
"epoch": 0.6598706565822646,
"grad_norm": 7.0439677238464355,
"learning_rate": 1.2980155651223867e-05,
"loss": 2.8738,
"num_input_tokens_seen": 3955392,
"step": 6020
},
{
"epoch": 0.6604187219116519,
"grad_norm": 7.3785529136657715,
"learning_rate": 1.2942430223532703e-05,
"loss": 3.3427,
"num_input_tokens_seen": 3959592,
"step": 6025
},
{
"epoch": 0.6609667872410391,
"grad_norm": 5.641672134399414,
"learning_rate": 1.2904740541564159e-05,
"loss": 3.0156,
"num_input_tokens_seen": 3963064,
"step": 6030
},
{
"epoch": 0.6615148525704264,
"grad_norm": 6.209802150726318,
"learning_rate": 1.286708671705259e-05,
"loss": 3.0553,
"num_input_tokens_seen": 3965552,
"step": 6035
},
{
"epoch": 0.6620629178998136,
"grad_norm": 6.092316627502441,
"learning_rate": 1.2829468861626052e-05,
"loss": 2.9092,
"num_input_tokens_seen": 3968480,
"step": 6040
},
{
"epoch": 0.6626109832292009,
"grad_norm": 10.323710441589355,
"learning_rate": 1.2791887086805993e-05,
"loss": 3.4687,
"num_input_tokens_seen": 3971464,
"step": 6045
},
{
"epoch": 0.6631590485585882,
"grad_norm": 6.506869792938232,
"learning_rate": 1.2754341504006872e-05,
"loss": 3.0349,
"num_input_tokens_seen": 3975640,
"step": 6050
},
{
"epoch": 0.6637071138879754,
"grad_norm": 6.929319381713867,
"learning_rate": 1.2716832224535847e-05,
"loss": 3.1761,
"num_input_tokens_seen": 3978928,
"step": 6055
},
{
"epoch": 0.6642551792173627,
"grad_norm": 6.731025218963623,
"learning_rate": 1.2679359359592488e-05,
"loss": 2.7582,
"num_input_tokens_seen": 3984016,
"step": 6060
},
{
"epoch": 0.66480324454675,
"grad_norm": 7.775283336639404,
"learning_rate": 1.2641923020268377e-05,
"loss": 3.222,
"num_input_tokens_seen": 3986544,
"step": 6065
},
{
"epoch": 0.6653513098761372,
"grad_norm": 9.189234733581543,
"learning_rate": 1.2604523317546813e-05,
"loss": 2.7329,
"num_input_tokens_seen": 3989440,
"step": 6070
},
{
"epoch": 0.6658993752055244,
"grad_norm": 6.482409954071045,
"learning_rate": 1.2567160362302515e-05,
"loss": 3.0355,
"num_input_tokens_seen": 3993928,
"step": 6075
},
{
"epoch": 0.6664474405349118,
"grad_norm": 6.9843878746032715,
"learning_rate": 1.2529834265301227e-05,
"loss": 3.1331,
"num_input_tokens_seen": 3997312,
"step": 6080
},
{
"epoch": 0.666995505864299,
"grad_norm": 7.9999308586120605,
"learning_rate": 1.2492545137199426e-05,
"loss": 3.2756,
"num_input_tokens_seen": 4000160,
"step": 6085
},
{
"epoch": 0.6675435711936862,
"grad_norm": 5.13596773147583,
"learning_rate": 1.2455293088544023e-05,
"loss": 3.382,
"num_input_tokens_seen": 4003720,
"step": 6090
},
{
"epoch": 0.6680916365230736,
"grad_norm": 6.42021369934082,
"learning_rate": 1.2418078229771973e-05,
"loss": 2.9692,
"num_input_tokens_seen": 4006680,
"step": 6095
},
{
"epoch": 0.6686397018524608,
"grad_norm": 9.268325805664062,
"learning_rate": 1.2380900671209984e-05,
"loss": 2.9399,
"num_input_tokens_seen": 4009632,
"step": 6100
},
{
"epoch": 0.669187767181848,
"grad_norm": 5.049006938934326,
"learning_rate": 1.2343760523074186e-05,
"loss": 3.0858,
"num_input_tokens_seen": 4012552,
"step": 6105
},
{
"epoch": 0.6697358325112354,
"grad_norm": 6.255411148071289,
"learning_rate": 1.2306657895469809e-05,
"loss": 3.16,
"num_input_tokens_seen": 4016240,
"step": 6110
},
{
"epoch": 0.6702838978406226,
"grad_norm": 10.016054153442383,
"learning_rate": 1.2269592898390833e-05,
"loss": 3.0065,
"num_input_tokens_seen": 4019680,
"step": 6115
},
{
"epoch": 0.6708319631700098,
"grad_norm": 7.499462604522705,
"learning_rate": 1.223256564171971e-05,
"loss": 3.3602,
"num_input_tokens_seen": 4022288,
"step": 6120
},
{
"epoch": 0.6713800284993972,
"grad_norm": 7.838258266448975,
"learning_rate": 1.2195576235226977e-05,
"loss": 2.7866,
"num_input_tokens_seen": 4025216,
"step": 6125
},
{
"epoch": 0.6719280938287844,
"grad_norm": 7.931380271911621,
"learning_rate": 1.2158624788570965e-05,
"loss": 3.4889,
"num_input_tokens_seen": 4029376,
"step": 6130
},
{
"epoch": 0.6724761591581716,
"grad_norm": 5.675364971160889,
"learning_rate": 1.2121711411297498e-05,
"loss": 3.3344,
"num_input_tokens_seen": 4031616,
"step": 6135
},
{
"epoch": 0.6730242244875589,
"grad_norm": 5.3835577964782715,
"learning_rate": 1.2084836212839507e-05,
"loss": 3.1429,
"num_input_tokens_seen": 4034840,
"step": 6140
},
{
"epoch": 0.6735722898169462,
"grad_norm": 7.542428016662598,
"learning_rate": 1.2047999302516737e-05,
"loss": 2.9853,
"num_input_tokens_seen": 4037792,
"step": 6145
},
{
"epoch": 0.6741203551463334,
"grad_norm": 7.841860771179199,
"learning_rate": 1.2011200789535464e-05,
"loss": 3.011,
"num_input_tokens_seen": 4041272,
"step": 6150
},
{
"epoch": 0.6746684204757207,
"grad_norm": 10.116206169128418,
"learning_rate": 1.1974440782988094e-05,
"loss": 3.1755,
"num_input_tokens_seen": 4044360,
"step": 6155
},
{
"epoch": 0.675216485805108,
"grad_norm": 6.566442489624023,
"learning_rate": 1.1937719391852877e-05,
"loss": 3.0532,
"num_input_tokens_seen": 4047544,
"step": 6160
},
{
"epoch": 0.6757645511344952,
"grad_norm": 6.767369747161865,
"learning_rate": 1.1901036724993616e-05,
"loss": 2.9114,
"num_input_tokens_seen": 4050584,
"step": 6165
},
{
"epoch": 0.6763126164638825,
"grad_norm": 5.782663822174072,
"learning_rate": 1.1864392891159284e-05,
"loss": 3.4902,
"num_input_tokens_seen": 4053392,
"step": 6170
},
{
"epoch": 0.6768606817932697,
"grad_norm": 7.807350158691406,
"learning_rate": 1.1827787998983731e-05,
"loss": 3.1896,
"num_input_tokens_seen": 4056184,
"step": 6175
},
{
"epoch": 0.677408747122657,
"grad_norm": 8.840995788574219,
"learning_rate": 1.1791222156985382e-05,
"loss": 3.4261,
"num_input_tokens_seen": 4060616,
"step": 6180
},
{
"epoch": 0.6779568124520443,
"grad_norm": 5.441840171813965,
"learning_rate": 1.1754695473566877e-05,
"loss": 2.8645,
"num_input_tokens_seen": 4065008,
"step": 6185
},
{
"epoch": 0.6785048777814315,
"grad_norm": 7.820642471313477,
"learning_rate": 1.1718208057014768e-05,
"loss": 3.1664,
"num_input_tokens_seen": 4068872,
"step": 6190
},
{
"epoch": 0.6790529431108188,
"grad_norm": 7.290872573852539,
"learning_rate": 1.1681760015499201e-05,
"loss": 3.4087,
"num_input_tokens_seen": 4071376,
"step": 6195
},
{
"epoch": 0.6796010084402061,
"grad_norm": 5.5174360275268555,
"learning_rate": 1.1645351457073594e-05,
"loss": 3.3074,
"num_input_tokens_seen": 4074528,
"step": 6200
},
{
"epoch": 0.6801490737695933,
"grad_norm": 6.114542484283447,
"learning_rate": 1.1608982489674295e-05,
"loss": 3.0535,
"num_input_tokens_seen": 4077600,
"step": 6205
},
{
"epoch": 0.6806971390989806,
"grad_norm": 8.515054702758789,
"learning_rate": 1.1572653221120316e-05,
"loss": 3.2291,
"num_input_tokens_seen": 4080664,
"step": 6210
},
{
"epoch": 0.6812452044283679,
"grad_norm": 8.11023235321045,
"learning_rate": 1.1536363759112952e-05,
"loss": 3.1448,
"num_input_tokens_seen": 4083256,
"step": 6215
},
{
"epoch": 0.6817932697577551,
"grad_norm": 7.834672927856445,
"learning_rate": 1.1500114211235482e-05,
"loss": 3.1213,
"num_input_tokens_seen": 4085568,
"step": 6220
},
{
"epoch": 0.6823413350871423,
"grad_norm": 6.758762836456299,
"learning_rate": 1.146390468495289e-05,
"loss": 3.0515,
"num_input_tokens_seen": 4088248,
"step": 6225
},
{
"epoch": 0.6828894004165297,
"grad_norm": 6.3487372398376465,
"learning_rate": 1.1427735287611477e-05,
"loss": 2.5775,
"num_input_tokens_seen": 4090848,
"step": 6230
},
{
"epoch": 0.6834374657459169,
"grad_norm": 5.81227445602417,
"learning_rate": 1.1391606126438586e-05,
"loss": 3.0297,
"num_input_tokens_seen": 4094232,
"step": 6235
},
{
"epoch": 0.6839855310753041,
"grad_norm": 7.857996463775635,
"learning_rate": 1.1355517308542301e-05,
"loss": 3.0582,
"num_input_tokens_seen": 4097096,
"step": 6240
},
{
"epoch": 0.6845335964046915,
"grad_norm": 5.819544792175293,
"learning_rate": 1.1319468940911079e-05,
"loss": 2.8814,
"num_input_tokens_seen": 4099912,
"step": 6245
},
{
"epoch": 0.6850816617340787,
"grad_norm": 9.14799976348877,
"learning_rate": 1.1283461130413453e-05,
"loss": 3.3229,
"num_input_tokens_seen": 4102320,
"step": 6250
},
{
"epoch": 0.6856297270634659,
"grad_norm": 7.087406158447266,
"learning_rate": 1.1247493983797754e-05,
"loss": 2.8581,
"num_input_tokens_seen": 4106480,
"step": 6255
},
{
"epoch": 0.6861777923928533,
"grad_norm": 7.298010349273682,
"learning_rate": 1.1218749616158092e-05,
"loss": 3.1186,
"num_input_tokens_seen": 4110064,
"step": 6260
},
{
"epoch": 0.6867258577222405,
"grad_norm": 6.6678290367126465,
"learning_rate": 1.1182855933150582e-05,
"loss": 2.971,
"num_input_tokens_seen": 4113304,
"step": 6265
},
{
"epoch": 0.6872739230516277,
"grad_norm": 8.044167518615723,
"learning_rate": 1.1147003212277912e-05,
"loss": 3.3036,
"num_input_tokens_seen": 4115752,
"step": 6270
},
{
"epoch": 0.687821988381015,
"grad_norm": 6.803138256072998,
"learning_rate": 1.1111191559828627e-05,
"loss": 2.7812,
"num_input_tokens_seen": 4119488,
"step": 6275
},
{
"epoch": 0.6883700537104023,
"grad_norm": 5.070322513580322,
"learning_rate": 1.1075421081969502e-05,
"loss": 3.152,
"num_input_tokens_seen": 4122168,
"step": 6280
},
{
"epoch": 0.6889181190397895,
"grad_norm": 6.463720321655273,
"learning_rate": 1.1039691884745252e-05,
"loss": 2.9657,
"num_input_tokens_seen": 4125704,
"step": 6285
},
{
"epoch": 0.6894661843691768,
"grad_norm": 9.405960083007812,
"learning_rate": 1.1004004074078223e-05,
"loss": 3.5484,
"num_input_tokens_seen": 4128608,
"step": 6290
},
{
"epoch": 0.6900142496985641,
"grad_norm": 6.504082679748535,
"learning_rate": 1.0968357755768051e-05,
"loss": 2.7744,
"num_input_tokens_seen": 4131416,
"step": 6295
},
{
"epoch": 0.6905623150279513,
"grad_norm": 7.679104804992676,
"learning_rate": 1.093275303549137e-05,
"loss": 3.1396,
"num_input_tokens_seen": 4135168,
"step": 6300
},
{
"epoch": 0.6911103803573386,
"grad_norm": 10.499975204467773,
"learning_rate": 1.0897190018801503e-05,
"loss": 3.4244,
"num_input_tokens_seen": 4138320,
"step": 6305
},
{
"epoch": 0.6916584456867259,
"grad_norm": 5.967805862426758,
"learning_rate": 1.0861668811128129e-05,
"loss": 3.0676,
"num_input_tokens_seen": 4140880,
"step": 6310
},
{
"epoch": 0.6922065110161131,
"grad_norm": 6.552985668182373,
"learning_rate": 1.0826189517776975e-05,
"loss": 3.0805,
"num_input_tokens_seen": 4143912,
"step": 6315
},
{
"epoch": 0.6927545763455004,
"grad_norm": 8.34593677520752,
"learning_rate": 1.0790752243929523e-05,
"loss": 3.2587,
"num_input_tokens_seen": 4147320,
"step": 6320
},
{
"epoch": 0.6933026416748876,
"grad_norm": 6.536946773529053,
"learning_rate": 1.0755357094642674e-05,
"loss": 3.0053,
"num_input_tokens_seen": 4150928,
"step": 6325
},
{
"epoch": 0.6938507070042749,
"grad_norm": 7.138943672180176,
"learning_rate": 1.0720004174848444e-05,
"loss": 2.9898,
"num_input_tokens_seen": 4154120,
"step": 6330
},
{
"epoch": 0.6943987723336622,
"grad_norm": 9.60561466217041,
"learning_rate": 1.0684693589353678e-05,
"loss": 3.4849,
"num_input_tokens_seen": 4156832,
"step": 6335
},
{
"epoch": 0.6949468376630494,
"grad_norm": 8.691582679748535,
"learning_rate": 1.0649425442839697e-05,
"loss": 3.1178,
"num_input_tokens_seen": 4159704,
"step": 6340
},
{
"epoch": 0.6954949029924367,
"grad_norm": 8.004415512084961,
"learning_rate": 1.0614199839862002e-05,
"loss": 3.0848,
"num_input_tokens_seen": 4162168,
"step": 6345
},
{
"epoch": 0.696042968321824,
"grad_norm": 12.674962043762207,
"learning_rate": 1.0579016884849999e-05,
"loss": 3.4026,
"num_input_tokens_seen": 4165384,
"step": 6350
},
{
"epoch": 0.6965910336512112,
"grad_norm": 7.9511284828186035,
"learning_rate": 1.0543876682106632e-05,
"loss": 3.0329,
"num_input_tokens_seen": 4168128,
"step": 6355
},
{
"epoch": 0.6971390989805984,
"grad_norm": 9.268970489501953,
"learning_rate": 1.0508779335808105e-05,
"loss": 3.1994,
"num_input_tokens_seen": 4171888,
"step": 6360
},
{
"epoch": 0.6976871643099858,
"grad_norm": 6.21211051940918,
"learning_rate": 1.04737249500036e-05,
"loss": 3.1242,
"num_input_tokens_seen": 4174896,
"step": 6365
},
{
"epoch": 0.698235229639373,
"grad_norm": 7.668500900268555,
"learning_rate": 1.04387136286149e-05,
"loss": 3.0467,
"num_input_tokens_seen": 4178504,
"step": 6370
},
{
"epoch": 0.6987832949687602,
"grad_norm": 5.02815580368042,
"learning_rate": 1.040374547543613e-05,
"loss": 2.9279,
"num_input_tokens_seen": 4182040,
"step": 6375
},
{
"epoch": 0.6993313602981476,
"grad_norm": 5.940211772918701,
"learning_rate": 1.0368820594133466e-05,
"loss": 2.968,
"num_input_tokens_seen": 4185880,
"step": 6380
},
{
"epoch": 0.6998794256275348,
"grad_norm": 6.044907093048096,
"learning_rate": 1.0333939088244771e-05,
"loss": 3.3093,
"num_input_tokens_seen": 4189000,
"step": 6385
},
{
"epoch": 0.700427490956922,
"grad_norm": 6.427306652069092,
"learning_rate": 1.0299101061179317e-05,
"loss": 3.2814,
"num_input_tokens_seen": 4191736,
"step": 6390
},
{
"epoch": 0.7009755562863094,
"grad_norm": 7.336453914642334,
"learning_rate": 1.0264306616217507e-05,
"loss": 2.8437,
"num_input_tokens_seen": 4194360,
"step": 6395
},
{
"epoch": 0.7015236216156966,
"grad_norm": 7.562320709228516,
"learning_rate": 1.0229555856510512e-05,
"loss": 2.828,
"num_input_tokens_seen": 4197920,
"step": 6400
},
{
"epoch": 0.7020716869450838,
"grad_norm": 7.142042636871338,
"learning_rate": 1.0194848885080011e-05,
"loss": 3.1228,
"num_input_tokens_seen": 4201984,
"step": 6405
},
{
"epoch": 0.7026197522744712,
"grad_norm": 6.18742036819458,
"learning_rate": 1.0160185804817859e-05,
"loss": 2.8393,
"num_input_tokens_seen": 4205328,
"step": 6410
},
{
"epoch": 0.7031678176038584,
"grad_norm": 7.195977687835693,
"learning_rate": 1.0125566718485788e-05,
"loss": 2.9868,
"num_input_tokens_seen": 4208312,
"step": 6415
},
{
"epoch": 0.7037158829332456,
"grad_norm": 10.329099655151367,
"learning_rate": 1.0090991728715132e-05,
"loss": 2.829,
"num_input_tokens_seen": 4211312,
"step": 6420
},
{
"epoch": 0.7042639482626329,
"grad_norm": 6.6712236404418945,
"learning_rate": 1.0056460938006473e-05,
"loss": 2.9549,
"num_input_tokens_seen": 4213800,
"step": 6425
},
{
"epoch": 0.7048120135920202,
"grad_norm": 4.803092002868652,
"learning_rate": 1.0021974448729365e-05,
"loss": 3.3355,
"num_input_tokens_seen": 4217200,
"step": 6430
},
{
"epoch": 0.7053600789214074,
"grad_norm": 6.527164459228516,
"learning_rate": 9.987532363122018e-06,
"loss": 2.9652,
"num_input_tokens_seen": 4220768,
"step": 6435
},
{
"epoch": 0.7059081442507947,
"grad_norm": 7.362782955169678,
"learning_rate": 9.953134783291036e-06,
"loss": 2.8684,
"num_input_tokens_seen": 4224224,
"step": 6440
},
{
"epoch": 0.706456209580182,
"grad_norm": 9.984780311584473,
"learning_rate": 9.918781811211045e-06,
"loss": 2.8968,
"num_input_tokens_seen": 4229272,
"step": 6445
},
{
"epoch": 0.7070042749095692,
"grad_norm": 6.219121932983398,
"learning_rate": 9.884473548724441e-06,
"loss": 3.1832,
"num_input_tokens_seen": 4232096,
"step": 6450
},
{
"epoch": 0.7075523402389565,
"grad_norm": 6.208556652069092,
"learning_rate": 9.850210097541085e-06,
"loss": 3.108,
"num_input_tokens_seen": 4235496,
"step": 6455
},
{
"epoch": 0.7081004055683437,
"grad_norm": 7.7808003425598145,
"learning_rate": 9.81599155923798e-06,
"loss": 3.0694,
"num_input_tokens_seen": 4238320,
"step": 6460
},
{
"epoch": 0.708648470897731,
"grad_norm": 8.587124824523926,
"learning_rate": 9.781818035258972e-06,
"loss": 3.1773,
"num_input_tokens_seen": 4240792,
"step": 6465
},
{
"epoch": 0.7091965362271183,
"grad_norm": 11.057994842529297,
"learning_rate": 9.747689626914483e-06,
"loss": 3.4154,
"num_input_tokens_seen": 4244904,
"step": 6470
},
{
"epoch": 0.7097446015565055,
"grad_norm": 6.430279731750488,
"learning_rate": 9.713606435381165e-06,
"loss": 3.1772,
"num_input_tokens_seen": 4247632,
"step": 6475
},
{
"epoch": 0.7102926668858928,
"grad_norm": 7.846237659454346,
"learning_rate": 9.679568561701615e-06,
"loss": 2.9962,
"num_input_tokens_seen": 4250768,
"step": 6480
},
{
"epoch": 0.7108407322152801,
"grad_norm": 8.467151641845703,
"learning_rate": 9.645576106784118e-06,
"loss": 2.8687,
"num_input_tokens_seen": 4253904,
"step": 6485
},
{
"epoch": 0.7113887975446673,
"grad_norm": 16.991235733032227,
"learning_rate": 9.611629171402273e-06,
"loss": 3.1696,
"num_input_tokens_seen": 4256768,
"step": 6490
},
{
"epoch": 0.7119368628740546,
"grad_norm": 7.091182231903076,
"learning_rate": 9.577727856194746e-06,
"loss": 2.7567,
"num_input_tokens_seen": 4260192,
"step": 6495
},
{
"epoch": 0.7124849282034419,
"grad_norm": 7.963916778564453,
"learning_rate": 9.543872261664952e-06,
"loss": 2.9586,
"num_input_tokens_seen": 4263560,
"step": 6500
},
{
"epoch": 0.7130329935328291,
"grad_norm": 6.632905006408691,
"learning_rate": 9.510062488180781e-06,
"loss": 2.8122,
"num_input_tokens_seen": 4266624,
"step": 6505
},
{
"epoch": 0.7135810588622163,
"grad_norm": 8.157563209533691,
"learning_rate": 9.476298635974265e-06,
"loss": 2.9458,
"num_input_tokens_seen": 4269488,
"step": 6510
},
{
"epoch": 0.7141291241916037,
"grad_norm": 7.982326507568359,
"learning_rate": 9.442580805141305e-06,
"loss": 3.172,
"num_input_tokens_seen": 4272592,
"step": 6515
},
{
"epoch": 0.7146771895209909,
"grad_norm": 5.6351423263549805,
"learning_rate": 9.408909095641363e-06,
"loss": 3.139,
"num_input_tokens_seen": 4275552,
"step": 6520
},
{
"epoch": 0.7152252548503781,
"grad_norm": 7.883710861206055,
"learning_rate": 9.375283607297175e-06,
"loss": 3.3458,
"num_input_tokens_seen": 4277912,
"step": 6525
},
{
"epoch": 0.7157733201797655,
"grad_norm": 5.036897659301758,
"learning_rate": 9.341704439794441e-06,
"loss": 2.9759,
"num_input_tokens_seen": 4280520,
"step": 6530
},
{
"epoch": 0.7163213855091527,
"grad_norm": 6.539727687835693,
"learning_rate": 9.308171692681565e-06,
"loss": 2.7201,
"num_input_tokens_seen": 4284248,
"step": 6535
},
{
"epoch": 0.7168694508385399,
"grad_norm": 7.108365058898926,
"learning_rate": 9.274685465369303e-06,
"loss": 3.1882,
"num_input_tokens_seen": 4288664,
"step": 6540
},
{
"epoch": 0.7174175161679273,
"grad_norm": 5.567689418792725,
"learning_rate": 9.241245857130507e-06,
"loss": 3.3889,
"num_input_tokens_seen": 4292104,
"step": 6545
},
{
"epoch": 0.7179655814973145,
"grad_norm": 7.539772033691406,
"learning_rate": 9.207852967099841e-06,
"loss": 3.2677,
"num_input_tokens_seen": 4296664,
"step": 6550
},
{
"epoch": 0.7185136468267017,
"grad_norm": 11.019807815551758,
"learning_rate": 9.174506894273448e-06,
"loss": 3.2587,
"num_input_tokens_seen": 4298936,
"step": 6555
},
{
"epoch": 0.719061712156089,
"grad_norm": 4.87662935256958,
"learning_rate": 9.141207737508677e-06,
"loss": 3.4056,
"num_input_tokens_seen": 4301872,
"step": 6560
},
{
"epoch": 0.7196097774854763,
"grad_norm": 7.396250247955322,
"learning_rate": 9.107955595523812e-06,
"loss": 3.0741,
"num_input_tokens_seen": 4305096,
"step": 6565
},
{
"epoch": 0.7201578428148635,
"grad_norm": 9.769874572753906,
"learning_rate": 9.074750566897733e-06,
"loss": 2.8083,
"num_input_tokens_seen": 4309576,
"step": 6570
},
{
"epoch": 0.7207059081442508,
"grad_norm": 7.023451805114746,
"learning_rate": 9.041592750069652e-06,
"loss": 3.162,
"num_input_tokens_seen": 4313728,
"step": 6575
},
{
"epoch": 0.7212539734736381,
"grad_norm": 7.67805814743042,
"learning_rate": 9.008482243338841e-06,
"loss": 3.1487,
"num_input_tokens_seen": 4316864,
"step": 6580
},
{
"epoch": 0.7218020388030253,
"grad_norm": 5.812924385070801,
"learning_rate": 8.975419144864292e-06,
"loss": 2.6071,
"num_input_tokens_seen": 4320688,
"step": 6585
},
{
"epoch": 0.7223501041324126,
"grad_norm": 9.005423545837402,
"learning_rate": 8.94240355266445e-06,
"loss": 3.2333,
"num_input_tokens_seen": 4323184,
"step": 6590
},
{
"epoch": 0.7228981694617999,
"grad_norm": 5.683709144592285,
"learning_rate": 8.909435564616944e-06,
"loss": 2.9484,
"num_input_tokens_seen": 4326304,
"step": 6595
},
{
"epoch": 0.7234462347911871,
"grad_norm": 9.263490676879883,
"learning_rate": 8.876515278458265e-06,
"loss": 3.2337,
"num_input_tokens_seen": 4329120,
"step": 6600
},
{
"epoch": 0.7239943001205744,
"grad_norm": 6.478157997131348,
"learning_rate": 8.84364279178348e-06,
"loss": 3.0925,
"num_input_tokens_seen": 4332440,
"step": 6605
},
{
"epoch": 0.7245423654499616,
"grad_norm": 8.741613388061523,
"learning_rate": 8.810818202045962e-06,
"loss": 3.3093,
"num_input_tokens_seen": 4335440,
"step": 6610
},
{
"epoch": 0.7250904307793489,
"grad_norm": 7.031724452972412,
"learning_rate": 8.77804160655708e-06,
"loss": 3.3767,
"num_input_tokens_seen": 4337912,
"step": 6615
},
{
"epoch": 0.7256384961087362,
"grad_norm": 8.763786315917969,
"learning_rate": 8.745313102485923e-06,
"loss": 3.201,
"num_input_tokens_seen": 4341472,
"step": 6620
},
{
"epoch": 0.7261865614381234,
"grad_norm": 5.877601623535156,
"learning_rate": 8.712632786859021e-06,
"loss": 2.7422,
"num_input_tokens_seen": 4345304,
"step": 6625
},
{
"epoch": 0.7267346267675107,
"grad_norm": 7.608758926391602,
"learning_rate": 8.68000075656003e-06,
"loss": 3.2688,
"num_input_tokens_seen": 4348264,
"step": 6630
},
{
"epoch": 0.727282692096898,
"grad_norm": 6.207149982452393,
"learning_rate": 8.647417108329454e-06,
"loss": 3.1522,
"num_input_tokens_seen": 4352144,
"step": 6635
},
{
"epoch": 0.7278307574262852,
"grad_norm": 6.543735504150391,
"learning_rate": 8.61488193876439e-06,
"loss": 2.968,
"num_input_tokens_seen": 4355840,
"step": 6640
},
{
"epoch": 0.7283788227556725,
"grad_norm": 7.882357597351074,
"learning_rate": 8.582395344318197e-06,
"loss": 2.8674,
"num_input_tokens_seen": 4358640,
"step": 6645
},
{
"epoch": 0.7289268880850598,
"grad_norm": 10.999910354614258,
"learning_rate": 8.54995742130022e-06,
"loss": 3.2327,
"num_input_tokens_seen": 4361656,
"step": 6650
},
{
"epoch": 0.729474953414447,
"grad_norm": 8.629473686218262,
"learning_rate": 8.517568265875541e-06,
"loss": 3.1042,
"num_input_tokens_seen": 4363968,
"step": 6655
},
{
"epoch": 0.7300230187438342,
"grad_norm": 8.353252410888672,
"learning_rate": 8.485227974064647e-06,
"loss": 2.7692,
"num_input_tokens_seen": 4367200,
"step": 6660
},
{
"epoch": 0.7305710840732216,
"grad_norm": 7.927604675292969,
"learning_rate": 8.452936641743156e-06,
"loss": 3.2321,
"num_input_tokens_seen": 4370096,
"step": 6665
},
{
"epoch": 0.7311191494026088,
"grad_norm": 5.507778644561768,
"learning_rate": 8.42069436464157e-06,
"loss": 3.1024,
"num_input_tokens_seen": 4374264,
"step": 6670
},
{
"epoch": 0.731667214731996,
"grad_norm": 6.3533172607421875,
"learning_rate": 8.38850123834494e-06,
"loss": 2.7559,
"num_input_tokens_seen": 4378824,
"step": 6675
},
{
"epoch": 0.7322152800613834,
"grad_norm": 6.395352840423584,
"learning_rate": 8.356357358292601e-06,
"loss": 3.243,
"num_input_tokens_seen": 4382616,
"step": 6680
},
{
"epoch": 0.7327633453907706,
"grad_norm": 8.324797630310059,
"learning_rate": 8.32426281977792e-06,
"loss": 3.6588,
"num_input_tokens_seen": 4385488,
"step": 6685
},
{
"epoch": 0.7333114107201578,
"grad_norm": 6.711746692657471,
"learning_rate": 8.292217717947962e-06,
"loss": 3.1062,
"num_input_tokens_seen": 4388592,
"step": 6690
},
{
"epoch": 0.7338594760495452,
"grad_norm": 11.369217872619629,
"learning_rate": 8.26022214780324e-06,
"loss": 3.0253,
"num_input_tokens_seen": 4391640,
"step": 6695
},
{
"epoch": 0.7344075413789324,
"grad_norm": 7.522586822509766,
"learning_rate": 8.228276204197427e-06,
"loss": 3.3273,
"num_input_tokens_seen": 4394456,
"step": 6700
},
{
"epoch": 0.7349556067083196,
"grad_norm": 7.1993207931518555,
"learning_rate": 8.196379981837071e-06,
"loss": 2.9679,
"num_input_tokens_seen": 4397352,
"step": 6705
},
{
"epoch": 0.735503672037707,
"grad_norm": 9.711231231689453,
"learning_rate": 8.164533575281316e-06,
"loss": 3.5035,
"num_input_tokens_seen": 4400744,
"step": 6710
},
{
"epoch": 0.7360517373670942,
"grad_norm": 8.696206092834473,
"learning_rate": 8.132737078941642e-06,
"loss": 2.8264,
"num_input_tokens_seen": 4404712,
"step": 6715
},
{
"epoch": 0.7365998026964814,
"grad_norm": 8.558262825012207,
"learning_rate": 8.100990587081536e-06,
"loss": 3.0127,
"num_input_tokens_seen": 4407448,
"step": 6720
},
{
"epoch": 0.7371478680258687,
"grad_norm": 7.874935626983643,
"learning_rate": 8.069294193816252e-06,
"loss": 2.9852,
"num_input_tokens_seen": 4410096,
"step": 6725
},
{
"epoch": 0.737695933355256,
"grad_norm": 10.938785552978516,
"learning_rate": 8.037647993112543e-06,
"loss": 2.8523,
"num_input_tokens_seen": 4413248,
"step": 6730
},
{
"epoch": 0.7382439986846432,
"grad_norm": 6.2363786697387695,
"learning_rate": 8.006052078788335e-06,
"loss": 3.5423,
"num_input_tokens_seen": 4417016,
"step": 6735
},
{
"epoch": 0.7387920640140305,
"grad_norm": 7.439382553100586,
"learning_rate": 7.974506544512478e-06,
"loss": 3.0829,
"num_input_tokens_seen": 4420144,
"step": 6740
},
{
"epoch": 0.7393401293434178,
"grad_norm": 8.05595588684082,
"learning_rate": 7.943011483804494e-06,
"loss": 2.8291,
"num_input_tokens_seen": 4422672,
"step": 6745
},
{
"epoch": 0.739888194672805,
"grad_norm": 7.396727561950684,
"learning_rate": 7.91156699003424e-06,
"loss": 3.1015,
"num_input_tokens_seen": 4425368,
"step": 6750
},
{
"epoch": 0.7404362600021923,
"grad_norm": 5.773197650909424,
"learning_rate": 7.880173156421661e-06,
"loss": 3.0124,
"num_input_tokens_seen": 4427720,
"step": 6755
},
{
"epoch": 0.7409843253315795,
"grad_norm": 7.078009128570557,
"learning_rate": 7.848830076036556e-06,
"loss": 3.007,
"num_input_tokens_seen": 4430872,
"step": 6760
},
{
"epoch": 0.7415323906609668,
"grad_norm": 6.219594478607178,
"learning_rate": 7.817537841798216e-06,
"loss": 3.0966,
"num_input_tokens_seen": 4434816,
"step": 6765
},
{
"epoch": 0.7420804559903541,
"grad_norm": 7.2829365730285645,
"learning_rate": 7.786296546475213e-06,
"loss": 3.4504,
"num_input_tokens_seen": 4437960,
"step": 6770
},
{
"epoch": 0.7426285213197413,
"grad_norm": 7.280004978179932,
"learning_rate": 7.755106282685118e-06,
"loss": 3.0042,
"num_input_tokens_seen": 4440624,
"step": 6775
},
{
"epoch": 0.7431765866491286,
"grad_norm": 6.213809490203857,
"learning_rate": 7.723967142894195e-06,
"loss": 3.0603,
"num_input_tokens_seen": 4444120,
"step": 6780
},
{
"epoch": 0.7437246519785159,
"grad_norm": 6.277675628662109,
"learning_rate": 7.69287921941715e-06,
"loss": 2.9716,
"num_input_tokens_seen": 4447152,
"step": 6785
},
{
"epoch": 0.7442727173079031,
"grad_norm": 8.690731048583984,
"learning_rate": 7.661842604416863e-06,
"loss": 3.2242,
"num_input_tokens_seen": 4450720,
"step": 6790
},
{
"epoch": 0.7448207826372903,
"grad_norm": 6.518171787261963,
"learning_rate": 7.630857389904095e-06,
"loss": 2.8793,
"num_input_tokens_seen": 4454448,
"step": 6795
},
{
"epoch": 0.7453688479666777,
"grad_norm": 10.606318473815918,
"learning_rate": 7.599923667737227e-06,
"loss": 2.9673,
"num_input_tokens_seen": 4457816,
"step": 6800
},
{
"epoch": 0.7459169132960649,
"grad_norm": 10.472159385681152,
"learning_rate": 7.5690415296220035e-06,
"loss": 3.0352,
"num_input_tokens_seen": 4460936,
"step": 6805
},
{
"epoch": 0.7464649786254521,
"grad_norm": 7.0004496574401855,
"learning_rate": 7.538211067111223e-06,
"loss": 3.165,
"num_input_tokens_seen": 4463688,
"step": 6810
},
{
"epoch": 0.7470130439548394,
"grad_norm": 7.692315101623535,
"learning_rate": 7.5074323716044835e-06,
"loss": 3.3064,
"num_input_tokens_seen": 4466616,
"step": 6815
},
{
"epoch": 0.7475611092842267,
"grad_norm": 5.7364702224731445,
"learning_rate": 7.476705534347947e-06,
"loss": 3.2443,
"num_input_tokens_seen": 4470464,
"step": 6820
},
{
"epoch": 0.7481091746136139,
"grad_norm": 6.589802265167236,
"learning_rate": 7.446030646434008e-06,
"loss": 2.9859,
"num_input_tokens_seen": 4472944,
"step": 6825
},
{
"epoch": 0.7486572399430012,
"grad_norm": 8.241453170776367,
"learning_rate": 7.4154077988010466e-06,
"loss": 3.1194,
"num_input_tokens_seen": 4475896,
"step": 6830
},
{
"epoch": 0.7492053052723885,
"grad_norm": 7.177932262420654,
"learning_rate": 7.3848370822332005e-06,
"loss": 2.9095,
"num_input_tokens_seen": 4478424,
"step": 6835
},
{
"epoch": 0.7497533706017757,
"grad_norm": 6.683755397796631,
"learning_rate": 7.354318587360029e-06,
"loss": 2.8105,
"num_input_tokens_seen": 4481120,
"step": 6840
},
{
"epoch": 0.7503014359311629,
"grad_norm": 7.998584747314453,
"learning_rate": 7.323852404656279e-06,
"loss": 2.5817,
"num_input_tokens_seen": 4484912,
"step": 6845
},
{
"epoch": 0.7508495012605503,
"grad_norm": 5.244688034057617,
"learning_rate": 7.293438624441637e-06,
"loss": 3.1018,
"num_input_tokens_seen": 4488416,
"step": 6850
},
{
"epoch": 0.7513975665899375,
"grad_norm": 7.417481422424316,
"learning_rate": 7.263077336880406e-06,
"loss": 3.2385,
"num_input_tokens_seen": 4491392,
"step": 6855
},
{
"epoch": 0.7519456319193247,
"grad_norm": 5.952940464019775,
"learning_rate": 7.232768631981285e-06,
"loss": 2.5967,
"num_input_tokens_seen": 4494608,
"step": 6860
},
{
"epoch": 0.7524936972487121,
"grad_norm": 7.974299907684326,
"learning_rate": 7.202512599597097e-06,
"loss": 3.3131,
"num_input_tokens_seen": 4497952,
"step": 6865
},
{
"epoch": 0.7530417625780993,
"grad_norm": 10.40588092803955,
"learning_rate": 7.172309329424495e-06,
"loss": 2.8735,
"num_input_tokens_seen": 4500792,
"step": 6870
},
{
"epoch": 0.7535898279074865,
"grad_norm": 7.208824634552002,
"learning_rate": 7.142158911003724e-06,
"loss": 3.3135,
"num_input_tokens_seen": 4504032,
"step": 6875
},
{
"epoch": 0.7541378932368739,
"grad_norm": 7.409761428833008,
"learning_rate": 7.112061433718339e-06,
"loss": 2.955,
"num_input_tokens_seen": 4506784,
"step": 6880
},
{
"epoch": 0.7546859585662611,
"grad_norm": 6.84408712387085,
"learning_rate": 7.082016986794951e-06,
"loss": 3.3193,
"num_input_tokens_seen": 4510016,
"step": 6885
},
{
"epoch": 0.7552340238956483,
"grad_norm": 5.721726417541504,
"learning_rate": 7.052025659302952e-06,
"loss": 3.1054,
"num_input_tokens_seen": 4512496,
"step": 6890
},
{
"epoch": 0.7557820892250356,
"grad_norm": 7.73302698135376,
"learning_rate": 7.022087540154274e-06,
"loss": 3.0514,
"num_input_tokens_seen": 4515040,
"step": 6895
},
{
"epoch": 0.7563301545544229,
"grad_norm": 8.347733497619629,
"learning_rate": 6.992202718103086e-06,
"loss": 2.9805,
"num_input_tokens_seen": 4517944,
"step": 6900
},
{
"epoch": 0.7568782198838101,
"grad_norm": 7.3970255851745605,
"learning_rate": 6.962371281745561e-06,
"loss": 3.3263,
"num_input_tokens_seen": 4520568,
"step": 6905
},
{
"epoch": 0.7574262852131974,
"grad_norm": 7.3923797607421875,
"learning_rate": 6.932593319519618e-06,
"loss": 3.2219,
"num_input_tokens_seen": 4524592,
"step": 6910
},
{
"epoch": 0.7579743505425847,
"grad_norm": 7.414371490478516,
"learning_rate": 6.902868919704627e-06,
"loss": 2.5203,
"num_input_tokens_seen": 4528528,
"step": 6915
},
{
"epoch": 0.7585224158719719,
"grad_norm": 7.776823043823242,
"learning_rate": 6.873198170421175e-06,
"loss": 3.1746,
"num_input_tokens_seen": 4532008,
"step": 6920
},
{
"epoch": 0.7590704812013592,
"grad_norm": 7.0230889320373535,
"learning_rate": 6.84358115963081e-06,
"loss": 3.0865,
"num_input_tokens_seen": 4536232,
"step": 6925
},
{
"epoch": 0.7596185465307465,
"grad_norm": 4.996485233306885,
"learning_rate": 6.814017975135753e-06,
"loss": 3.2363,
"num_input_tokens_seen": 4539680,
"step": 6930
},
{
"epoch": 0.7601666118601337,
"grad_norm": 9.683207511901855,
"learning_rate": 6.784508704578646e-06,
"loss": 3.2016,
"num_input_tokens_seen": 4542848,
"step": 6935
},
{
"epoch": 0.760714677189521,
"grad_norm": 5.796095848083496,
"learning_rate": 6.755053435442324e-06,
"loss": 2.9563,
"num_input_tokens_seen": 4547104,
"step": 6940
},
{
"epoch": 0.7612627425189082,
"grad_norm": 7.686697959899902,
"learning_rate": 6.725652255049508e-06,
"loss": 2.7968,
"num_input_tokens_seen": 4550392,
"step": 6945
},
{
"epoch": 0.7618108078482955,
"grad_norm": 7.243149280548096,
"learning_rate": 6.696305250562562e-06,
"loss": 2.9016,
"num_input_tokens_seen": 4553760,
"step": 6950
},
{
"epoch": 0.7623588731776828,
"grad_norm": 5.771494388580322,
"learning_rate": 6.667012508983278e-06,
"loss": 3.1646,
"num_input_tokens_seen": 4558080,
"step": 6955
},
{
"epoch": 0.76290693850707,
"grad_norm": 7.9829816818237305,
"learning_rate": 6.63777411715254e-06,
"loss": 2.946,
"num_input_tokens_seen": 4560904,
"step": 6960
},
{
"epoch": 0.7634550038364573,
"grad_norm": 6.072175979614258,
"learning_rate": 6.608590161750131e-06,
"loss": 3.2183,
"num_input_tokens_seen": 4563864,
"step": 6965
},
{
"epoch": 0.7640030691658446,
"grad_norm": 6.895592212677002,
"learning_rate": 6.579460729294429e-06,
"loss": 3.2887,
"num_input_tokens_seen": 4566800,
"step": 6970
},
{
"epoch": 0.7645511344952318,
"grad_norm": 7.528575897216797,
"learning_rate": 6.550385906142212e-06,
"loss": 3.0147,
"num_input_tokens_seen": 4569680,
"step": 6975
},
{
"epoch": 0.765099199824619,
"grad_norm": 5.899028301239014,
"learning_rate": 6.521365778488331e-06,
"loss": 2.9008,
"num_input_tokens_seen": 4573704,
"step": 6980
},
{
"epoch": 0.7656472651540064,
"grad_norm": 7.313390254974365,
"learning_rate": 6.492400432365503e-06,
"loss": 3.1414,
"num_input_tokens_seen": 4576368,
"step": 6985
},
{
"epoch": 0.7661953304833936,
"grad_norm": 7.083227634429932,
"learning_rate": 6.463489953644031e-06,
"loss": 2.7539,
"num_input_tokens_seen": 4578936,
"step": 6990
},
{
"epoch": 0.7667433958127808,
"grad_norm": 7.272182941436768,
"learning_rate": 6.434634428031558e-06,
"loss": 3.1749,
"num_input_tokens_seen": 4582096,
"step": 6995
},
{
"epoch": 0.7672914611421682,
"grad_norm": 9.697888374328613,
"learning_rate": 6.405833941072834e-06,
"loss": 3.1397,
"num_input_tokens_seen": 4584400,
"step": 7000
},
{
"epoch": 0.7678395264715554,
"grad_norm": 7.066343307495117,
"learning_rate": 6.377088578149418e-06,
"loss": 2.8686,
"num_input_tokens_seen": 4587688,
"step": 7005
},
{
"epoch": 0.7683875918009426,
"grad_norm": 5.80040979385376,
"learning_rate": 6.348398424479454e-06,
"loss": 2.7322,
"num_input_tokens_seen": 4591120,
"step": 7010
},
{
"epoch": 0.76893565713033,
"grad_norm": 8.803409576416016,
"learning_rate": 6.319763565117432e-06,
"loss": 3.2123,
"num_input_tokens_seen": 4594456,
"step": 7015
},
{
"epoch": 0.7694837224597172,
"grad_norm": 6.382712364196777,
"learning_rate": 6.291184084953894e-06,
"loss": 3.3465,
"num_input_tokens_seen": 4597120,
"step": 7020
},
{
"epoch": 0.7700317877891044,
"grad_norm": 6.3958740234375,
"learning_rate": 6.2626600687152064e-06,
"loss": 2.9045,
"num_input_tokens_seen": 4599416,
"step": 7025
},
{
"epoch": 0.7705798531184918,
"grad_norm": 5.454673767089844,
"learning_rate": 6.234191600963335e-06,
"loss": 3.1258,
"num_input_tokens_seen": 4602760,
"step": 7030
},
{
"epoch": 0.771127918447879,
"grad_norm": 4.992536544799805,
"learning_rate": 6.205778766095533e-06,
"loss": 3.0881,
"num_input_tokens_seen": 4605312,
"step": 7035
},
{
"epoch": 0.7716759837772662,
"grad_norm": 7.264188766479492,
"learning_rate": 6.1774216483441394e-06,
"loss": 3.117,
"num_input_tokens_seen": 4608784,
"step": 7040
},
{
"epoch": 0.7722240491066535,
"grad_norm": 7.106401443481445,
"learning_rate": 6.149120331776329e-06,
"loss": 2.8674,
"num_input_tokens_seen": 4612728,
"step": 7045
},
{
"epoch": 0.7727721144360408,
"grad_norm": 8.04111385345459,
"learning_rate": 6.120874900293827e-06,
"loss": 3.0187,
"num_input_tokens_seen": 4616096,
"step": 7050
},
{
"epoch": 0.773320179765428,
"grad_norm": 7.114358901977539,
"learning_rate": 6.092685437632683e-06,
"loss": 2.9277,
"num_input_tokens_seen": 4619312,
"step": 7055
},
{
"epoch": 0.7738682450948153,
"grad_norm": 6.135927200317383,
"learning_rate": 6.064552027363049e-06,
"loss": 2.8,
"num_input_tokens_seen": 4623080,
"step": 7060
},
{
"epoch": 0.7744163104242026,
"grad_norm": 9.407398223876953,
"learning_rate": 6.0364747528888734e-06,
"loss": 2.8471,
"num_input_tokens_seen": 4625720,
"step": 7065
},
{
"epoch": 0.7749643757535898,
"grad_norm": 8.590024948120117,
"learning_rate": 6.0084536974476995e-06,
"loss": 3.1369,
"num_input_tokens_seen": 4628368,
"step": 7070
},
{
"epoch": 0.7755124410829771,
"grad_norm": 6.1918721199035645,
"learning_rate": 5.980488944110408e-06,
"loss": 2.9941,
"num_input_tokens_seen": 4631128,
"step": 7075
},
{
"epoch": 0.7760605064123643,
"grad_norm": 6.956912994384766,
"learning_rate": 5.9525805757809524e-06,
"loss": 3.3899,
"num_input_tokens_seen": 4634672,
"step": 7080
},
{
"epoch": 0.7766085717417516,
"grad_norm": 6.198210716247559,
"learning_rate": 5.9247286751961366e-06,
"loss": 3.165,
"num_input_tokens_seen": 4638184,
"step": 7085
},
{
"epoch": 0.7771566370711389,
"grad_norm": 6.877211570739746,
"learning_rate": 5.896933324925372e-06,
"loss": 3.1694,
"num_input_tokens_seen": 4641976,
"step": 7090
},
{
"epoch": 0.7777047024005261,
"grad_norm": 6.007309436798096,
"learning_rate": 5.869194607370409e-06,
"loss": 3.1036,
"num_input_tokens_seen": 4645280,
"step": 7095
},
{
"epoch": 0.7782527677299134,
"grad_norm": 7.9656572341918945,
"learning_rate": 5.8415126047650955e-06,
"loss": 3.2545,
"num_input_tokens_seen": 4648904,
"step": 7100
},
{
"epoch": 0.7788008330593007,
"grad_norm": 7.05634069442749,
"learning_rate": 5.813887399175169e-06,
"loss": 2.912,
"num_input_tokens_seen": 4651232,
"step": 7105
},
{
"epoch": 0.7793488983886879,
"grad_norm": 8.77833080291748,
"learning_rate": 5.7863190724979695e-06,
"loss": 3.0476,
"num_input_tokens_seen": 4654288,
"step": 7110
},
{
"epoch": 0.7798969637180752,
"grad_norm": 6.191843032836914,
"learning_rate": 5.75880770646221e-06,
"loss": 3.1158,
"num_input_tokens_seen": 4657808,
"step": 7115
},
{
"epoch": 0.7804450290474625,
"grad_norm": 5.634969234466553,
"learning_rate": 5.73135338262776e-06,
"loss": 2.8591,
"num_input_tokens_seen": 4661440,
"step": 7120
},
{
"epoch": 0.7809930943768497,
"grad_norm": 6.004340648651123,
"learning_rate": 5.7039561823853615e-06,
"loss": 2.8518,
"num_input_tokens_seen": 4665104,
"step": 7125
},
{
"epoch": 0.7815411597062369,
"grad_norm": 7.3791680335998535,
"learning_rate": 5.676616186956413e-06,
"loss": 3.1628,
"num_input_tokens_seen": 4668432,
"step": 7130
},
{
"epoch": 0.7820892250356243,
"grad_norm": 9.166860580444336,
"learning_rate": 5.649333477392735e-06,
"loss": 3.3455,
"num_input_tokens_seen": 4671688,
"step": 7135
},
{
"epoch": 0.7826372903650115,
"grad_norm": 6.651597023010254,
"learning_rate": 5.622108134576312e-06,
"loss": 3.4196,
"num_input_tokens_seen": 4675408,
"step": 7140
},
{
"epoch": 0.7831853556943987,
"grad_norm": 7.5387797355651855,
"learning_rate": 5.594940239219049e-06,
"loss": 3.2571,
"num_input_tokens_seen": 4678440,
"step": 7145
},
{
"epoch": 0.7837334210237861,
"grad_norm": 9.256987571716309,
"learning_rate": 5.5678298718625674e-06,
"loss": 3.1553,
"num_input_tokens_seen": 4681320,
"step": 7150
},
{
"epoch": 0.7842814863531733,
"grad_norm": 8.727250099182129,
"learning_rate": 5.54077711287792e-06,
"loss": 3.2874,
"num_input_tokens_seen": 4685024,
"step": 7155
},
{
"epoch": 0.7848295516825605,
"grad_norm": 8.900041580200195,
"learning_rate": 5.513782042465385e-06,
"loss": 2.8368,
"num_input_tokens_seen": 4687568,
"step": 7160
},
{
"epoch": 0.7853776170119479,
"grad_norm": 10.776511192321777,
"learning_rate": 5.4868447406542125e-06,
"loss": 2.9062,
"num_input_tokens_seen": 4690632,
"step": 7165
},
{
"epoch": 0.7859256823413351,
"grad_norm": 6.669962406158447,
"learning_rate": 5.459965287302396e-06,
"loss": 3.3375,
"num_input_tokens_seen": 4694528,
"step": 7170
},
{
"epoch": 0.7864737476707223,
"grad_norm": 8.748539924621582,
"learning_rate": 5.4331437620964235e-06,
"loss": 3.1538,
"num_input_tokens_seen": 4697304,
"step": 7175
},
{
"epoch": 0.7870218130001096,
"grad_norm": 6.20130729675293,
"learning_rate": 5.406380244551077e-06,
"loss": 3.3296,
"num_input_tokens_seen": 4701400,
"step": 7180
},
{
"epoch": 0.7875698783294969,
"grad_norm": 6.8918304443359375,
"learning_rate": 5.379674814009133e-06,
"loss": 2.9058,
"num_input_tokens_seen": 4704688,
"step": 7185
},
{
"epoch": 0.7881179436588841,
"grad_norm": 8.053811073303223,
"learning_rate": 5.353027549641185e-06,
"loss": 3.19,
"num_input_tokens_seen": 4707832,
"step": 7190
},
{
"epoch": 0.7886660089882714,
"grad_norm": 8.722176551818848,
"learning_rate": 5.326438530445394e-06,
"loss": 3.1039,
"num_input_tokens_seen": 4711272,
"step": 7195
},
{
"epoch": 0.7892140743176587,
"grad_norm": 8.22156810760498,
"learning_rate": 5.299907835247228e-06,
"loss": 2.9179,
"num_input_tokens_seen": 4714584,
"step": 7200
},
{
"epoch": 0.7897621396470459,
"grad_norm": 8.812997817993164,
"learning_rate": 5.273435542699259e-06,
"loss": 2.9421,
"num_input_tokens_seen": 4717960,
"step": 7205
},
{
"epoch": 0.7903102049764332,
"grad_norm": 7.295377731323242,
"learning_rate": 5.247021731280927e-06,
"loss": 3.1538,
"num_input_tokens_seen": 4721208,
"step": 7210
},
{
"epoch": 0.7908582703058205,
"grad_norm": 6.8964762687683105,
"learning_rate": 5.220666479298283e-06,
"loss": 2.9399,
"num_input_tokens_seen": 4723760,
"step": 7215
},
{
"epoch": 0.7914063356352077,
"grad_norm": 8.851302146911621,
"learning_rate": 5.194369864883783e-06,
"loss": 3.0368,
"num_input_tokens_seen": 4727808,
"step": 7220
},
{
"epoch": 0.791954400964595,
"grad_norm": 6.765636444091797,
"learning_rate": 5.168131965996051e-06,
"loss": 2.5498,
"num_input_tokens_seen": 4730984,
"step": 7225
},
{
"epoch": 0.7925024662939822,
"grad_norm": 6.0574750900268555,
"learning_rate": 5.1419528604196385e-06,
"loss": 2.9546,
"num_input_tokens_seen": 4734472,
"step": 7230
},
{
"epoch": 0.7930505316233695,
"grad_norm": 6.703484535217285,
"learning_rate": 5.1158326257647855e-06,
"loss": 3.0816,
"num_input_tokens_seen": 4736976,
"step": 7235
},
{
"epoch": 0.7935985969527568,
"grad_norm": 5.429347038269043,
"learning_rate": 5.089771339467236e-06,
"loss": 2.8567,
"num_input_tokens_seen": 4740592,
"step": 7240
},
{
"epoch": 0.794146662282144,
"grad_norm": 6.805422306060791,
"learning_rate": 5.06376907878795e-06,
"loss": 3.0524,
"num_input_tokens_seen": 4744232,
"step": 7245
},
{
"epoch": 0.7946947276115313,
"grad_norm": 7.566915512084961,
"learning_rate": 5.0378259208129054e-06,
"loss": 2.7767,
"num_input_tokens_seen": 4748392,
"step": 7250
},
{
"epoch": 0.7952427929409186,
"grad_norm": 8.171722412109375,
"learning_rate": 5.011941942452872e-06,
"loss": 2.9925,
"num_input_tokens_seen": 4751496,
"step": 7255
},
{
"epoch": 0.7957908582703058,
"grad_norm": 9.192333221435547,
"learning_rate": 4.986117220443173e-06,
"loss": 3.3195,
"num_input_tokens_seen": 4754624,
"step": 7260
},
{
"epoch": 0.796338923599693,
"grad_norm": 6.089689254760742,
"learning_rate": 4.960351831343452e-06,
"loss": 3.3298,
"num_input_tokens_seen": 4758304,
"step": 7265
},
{
"epoch": 0.7968869889290804,
"grad_norm": 7.405531883239746,
"learning_rate": 4.9346458515374785e-06,
"loss": 3.3122,
"num_input_tokens_seen": 4760592,
"step": 7270
},
{
"epoch": 0.7974350542584676,
"grad_norm": 7.917971611022949,
"learning_rate": 4.908999357232874e-06,
"loss": 3.0276,
"num_input_tokens_seen": 4763392,
"step": 7275
},
{
"epoch": 0.7979831195878548,
"grad_norm": 8.550086975097656,
"learning_rate": 4.8834124244609145e-06,
"loss": 3.2591,
"num_input_tokens_seen": 4766544,
"step": 7280
},
{
"epoch": 0.7985311849172422,
"grad_norm": 7.939424514770508,
"learning_rate": 4.857885129076317e-06,
"loss": 2.8357,
"num_input_tokens_seen": 4769408,
"step": 7285
},
{
"epoch": 0.7990792502466294,
"grad_norm": 6.404162406921387,
"learning_rate": 4.8324175467569845e-06,
"loss": 3.0799,
"num_input_tokens_seen": 4773344,
"step": 7290
},
{
"epoch": 0.7996273155760166,
"grad_norm": 7.251323699951172,
"learning_rate": 4.807009753003791e-06,
"loss": 3.1363,
"num_input_tokens_seen": 4776640,
"step": 7295
},
{
"epoch": 0.800175380905404,
"grad_norm": 8.667237281799316,
"learning_rate": 4.781661823140366e-06,
"loss": 3.2124,
"num_input_tokens_seen": 4779376,
"step": 7300
},
{
"epoch": 0.8007234462347912,
"grad_norm": 8.147212028503418,
"learning_rate": 4.756373832312879e-06,
"loss": 2.874,
"num_input_tokens_seen": 4781952,
"step": 7305
},
{
"epoch": 0.8012715115641784,
"grad_norm": 8.90487003326416,
"learning_rate": 4.731145855489794e-06,
"loss": 3.2025,
"num_input_tokens_seen": 4784816,
"step": 7310
},
{
"epoch": 0.8018195768935658,
"grad_norm": 7.192740440368652,
"learning_rate": 4.70597796746165e-06,
"loss": 2.9843,
"num_input_tokens_seen": 4787592,
"step": 7315
},
{
"epoch": 0.802367642222953,
"grad_norm": 6.346043586730957,
"learning_rate": 4.6808702428408706e-06,
"loss": 3.1331,
"num_input_tokens_seen": 4790256,
"step": 7320
},
{
"epoch": 0.8029157075523402,
"grad_norm": 8.076735496520996,
"learning_rate": 4.655822756061503e-06,
"loss": 3.1571,
"num_input_tokens_seen": 4792768,
"step": 7325
},
{
"epoch": 0.8034637728817275,
"grad_norm": 7.521450519561768,
"learning_rate": 4.630835581379006e-06,
"loss": 2.929,
"num_input_tokens_seen": 4796152,
"step": 7330
},
{
"epoch": 0.8040118382111148,
"grad_norm": 12.113771438598633,
"learning_rate": 4.605908792870067e-06,
"loss": 3.1268,
"num_input_tokens_seen": 4798376,
"step": 7335
},
{
"epoch": 0.804559903540502,
"grad_norm": 5.997092247009277,
"learning_rate": 4.581042464432328e-06,
"loss": 2.8665,
"num_input_tokens_seen": 4802104,
"step": 7340
},
{
"epoch": 0.8051079688698893,
"grad_norm": 6.922906875610352,
"learning_rate": 4.556236669784197e-06,
"loss": 3.3316,
"num_input_tokens_seen": 4805648,
"step": 7345
},
{
"epoch": 0.8056560341992766,
"grad_norm": 9.63893985748291,
"learning_rate": 4.531491482464628e-06,
"loss": 3.2614,
"num_input_tokens_seen": 4810112,
"step": 7350
},
{
"epoch": 0.8062040995286638,
"grad_norm": 8.894881248474121,
"learning_rate": 4.5068069758329e-06,
"loss": 3.2695,
"num_input_tokens_seen": 4813192,
"step": 7355
},
{
"epoch": 0.8067521648580511,
"grad_norm": 6.436181545257568,
"learning_rate": 4.482183223068387e-06,
"loss": 2.8622,
"num_input_tokens_seen": 4815768,
"step": 7360
},
{
"epoch": 0.8073002301874384,
"grad_norm": 7.975905895233154,
"learning_rate": 4.457620297170381e-06,
"loss": 3.3166,
"num_input_tokens_seen": 4819144,
"step": 7365
},
{
"epoch": 0.8078482955168256,
"grad_norm": 7.515452861785889,
"learning_rate": 4.433118270957818e-06,
"loss": 2.5207,
"num_input_tokens_seen": 4822152,
"step": 7370
},
{
"epoch": 0.8083963608462129,
"grad_norm": 6.722434997558594,
"learning_rate": 4.408677217069096e-06,
"loss": 3.1815,
"num_input_tokens_seen": 4825920,
"step": 7375
},
{
"epoch": 0.8089444261756001,
"grad_norm": 6.1937031745910645,
"learning_rate": 4.3842972079618765e-06,
"loss": 3.0536,
"num_input_tokens_seen": 4829224,
"step": 7380
},
{
"epoch": 0.8094924915049874,
"grad_norm": 7.4900898933410645,
"learning_rate": 4.359978315912827e-06,
"loss": 2.9555,
"num_input_tokens_seen": 4832576,
"step": 7385
},
{
"epoch": 0.8100405568343747,
"grad_norm": 7.267132759094238,
"learning_rate": 4.33572061301743e-06,
"loss": 3.376,
"num_input_tokens_seen": 4834896,
"step": 7390
},
{
"epoch": 0.8105886221637619,
"grad_norm": 6.553824424743652,
"learning_rate": 4.311524171189782e-06,
"loss": 3.1203,
"num_input_tokens_seen": 4838536,
"step": 7395
},
{
"epoch": 0.8111366874931492,
"grad_norm": 6.04332971572876,
"learning_rate": 4.28738906216235e-06,
"loss": 2.898,
"num_input_tokens_seen": 4842312,
"step": 7400
},
{
"epoch": 0.8116847528225365,
"grad_norm": 6.300970077514648,
"learning_rate": 4.263315357485775e-06,
"loss": 3.2478,
"num_input_tokens_seen": 4845640,
"step": 7405
},
{
"epoch": 0.8122328181519237,
"grad_norm": 8.834260940551758,
"learning_rate": 4.2393031285286796e-06,
"loss": 3.1214,
"num_input_tokens_seen": 4848880,
"step": 7410
},
{
"epoch": 0.812780883481311,
"grad_norm": 7.611583709716797,
"learning_rate": 4.215352446477413e-06,
"loss": 2.8593,
"num_input_tokens_seen": 4852904,
"step": 7415
},
{
"epoch": 0.8133289488106983,
"grad_norm": 5.708853244781494,
"learning_rate": 4.191463382335867e-06,
"loss": 3.1984,
"num_input_tokens_seen": 4855720,
"step": 7420
},
{
"epoch": 0.8138770141400855,
"grad_norm": 5.545560836791992,
"learning_rate": 4.167636006925274e-06,
"loss": 3.1826,
"num_input_tokens_seen": 4859488,
"step": 7425
},
{
"epoch": 0.8144250794694727,
"grad_norm": 9.735588073730469,
"learning_rate": 4.143870390883978e-06,
"loss": 2.8356,
"num_input_tokens_seen": 4862808,
"step": 7430
},
{
"epoch": 0.8149731447988601,
"grad_norm": 10.298928260803223,
"learning_rate": 4.120166604667225e-06,
"loss": 2.9738,
"num_input_tokens_seen": 4866608,
"step": 7435
},
{
"epoch": 0.8155212101282473,
"grad_norm": 8.623414039611816,
"learning_rate": 4.096524718546974e-06,
"loss": 3.0776,
"num_input_tokens_seen": 4868832,
"step": 7440
},
{
"epoch": 0.8160692754576345,
"grad_norm": 10.033533096313477,
"learning_rate": 4.072944802611655e-06,
"loss": 3.1786,
"num_input_tokens_seen": 4872536,
"step": 7445
},
{
"epoch": 0.8166173407870219,
"grad_norm": 8.511270523071289,
"learning_rate": 4.0494269267660144e-06,
"loss": 3.4183,
"num_input_tokens_seen": 4876032,
"step": 7450
},
{
"epoch": 0.8171654061164091,
"grad_norm": 6.882598876953125,
"learning_rate": 4.025971160730846e-06,
"loss": 3.0995,
"num_input_tokens_seen": 4878536,
"step": 7455
},
{
"epoch": 0.8177134714457963,
"grad_norm": 6.228262901306152,
"learning_rate": 4.002577574042829e-06,
"loss": 2.8603,
"num_input_tokens_seen": 4880976,
"step": 7460
},
{
"epoch": 0.8182615367751837,
"grad_norm": 9.165740013122559,
"learning_rate": 3.9792462360542935e-06,
"loss": 2.8565,
"num_input_tokens_seen": 4884688,
"step": 7465
},
{
"epoch": 0.8188096021045709,
"grad_norm": 7.1637701988220215,
"learning_rate": 3.955977215933046e-06,
"loss": 2.9947,
"num_input_tokens_seen": 4888200,
"step": 7470
},
{
"epoch": 0.8193576674339581,
"grad_norm": 7.321343421936035,
"learning_rate": 3.932770582662135e-06,
"loss": 3.1105,
"num_input_tokens_seen": 4890856,
"step": 7475
},
{
"epoch": 0.8199057327633454,
"grad_norm": 7.804381847381592,
"learning_rate": 3.9096264050396485e-06,
"loss": 2.9519,
"num_input_tokens_seen": 4893712,
"step": 7480
},
{
"epoch": 0.8204537980927327,
"grad_norm": 6.569583415985107,
"learning_rate": 3.886544751678547e-06,
"loss": 3.0457,
"num_input_tokens_seen": 4897104,
"step": 7485
},
{
"epoch": 0.8210018634221199,
"grad_norm": 10.908699035644531,
"learning_rate": 3.863525691006406e-06,
"loss": 3.5541,
"num_input_tokens_seen": 4900616,
"step": 7490
},
{
"epoch": 0.8215499287515072,
"grad_norm": 8.427760124206543,
"learning_rate": 3.840569291265242e-06,
"loss": 2.9541,
"num_input_tokens_seen": 4902848,
"step": 7495
},
{
"epoch": 0.8220979940808945,
"grad_norm": 10.59475040435791,
"learning_rate": 3.817675620511329e-06,
"loss": 2.932,
"num_input_tokens_seen": 4905424,
"step": 7500
},
{
"epoch": 0.8226460594102817,
"grad_norm": 8.56042194366455,
"learning_rate": 3.794844746614956e-06,
"loss": 3.3314,
"num_input_tokens_seen": 4908016,
"step": 7505
},
{
"epoch": 0.823194124739669,
"grad_norm": 8.957588195800781,
"learning_rate": 3.772076737260241e-06,
"loss": 3.4287,
"num_input_tokens_seen": 4912944,
"step": 7510
},
{
"epoch": 0.8237421900690562,
"grad_norm": 8.641453742980957,
"learning_rate": 3.7493716599449557e-06,
"loss": 2.7836,
"num_input_tokens_seen": 4915344,
"step": 7515
},
{
"epoch": 0.8242902553984435,
"grad_norm": 9.905373573303223,
"learning_rate": 3.726729581980287e-06,
"loss": 3.3792,
"num_input_tokens_seen": 4918280,
"step": 7520
},
{
"epoch": 0.8248383207278308,
"grad_norm": 6.359044075012207,
"learning_rate": 3.7041505704906554e-06,
"loss": 2.6283,
"num_input_tokens_seen": 4923056,
"step": 7525
},
{
"epoch": 0.825386386057218,
"grad_norm": 8.611063957214355,
"learning_rate": 3.681634692413527e-06,
"loss": 3.0805,
"num_input_tokens_seen": 4925992,
"step": 7530
},
{
"epoch": 0.8259344513866053,
"grad_norm": 6.022265911102295,
"learning_rate": 3.659182014499199e-06,
"loss": 2.9173,
"num_input_tokens_seen": 4928312,
"step": 7535
},
{
"epoch": 0.8264825167159926,
"grad_norm": 7.828344821929932,
"learning_rate": 3.636792603310593e-06,
"loss": 3.3786,
"num_input_tokens_seen": 4931816,
"step": 7540
},
{
"epoch": 0.8270305820453798,
"grad_norm": 9.197246551513672,
"learning_rate": 3.6144665252230897e-06,
"loss": 3.1869,
"num_input_tokens_seen": 4934904,
"step": 7545
},
{
"epoch": 0.827578647374767,
"grad_norm": 6.626698017120361,
"learning_rate": 3.5922038464243e-06,
"loss": 2.864,
"num_input_tokens_seen": 4937320,
"step": 7550
},
{
"epoch": 0.8281267127041544,
"grad_norm": 6.149302959442139,
"learning_rate": 3.570004632913884e-06,
"loss": 2.9841,
"num_input_tokens_seen": 4940472,
"step": 7555
},
{
"epoch": 0.8286747780335416,
"grad_norm": 5.897488117218018,
"learning_rate": 3.5478689505033635e-06,
"loss": 3.0083,
"num_input_tokens_seen": 4943240,
"step": 7560
},
{
"epoch": 0.8292228433629288,
"grad_norm": 5.379867076873779,
"learning_rate": 3.5257968648159085e-06,
"loss": 3.2044,
"num_input_tokens_seen": 4947448,
"step": 7565
},
{
"epoch": 0.8297709086923162,
"grad_norm": 8.127168655395508,
"learning_rate": 3.503788441286143e-06,
"loss": 3.0341,
"num_input_tokens_seen": 4950720,
"step": 7570
},
{
"epoch": 0.8303189740217034,
"grad_norm": 7.3780364990234375,
"learning_rate": 3.4818437451599796e-06,
"loss": 3.2321,
"num_input_tokens_seen": 4954728,
"step": 7575
},
{
"epoch": 0.8308670393510906,
"grad_norm": 6.4768757820129395,
"learning_rate": 3.459962841494391e-06,
"loss": 3.1017,
"num_input_tokens_seen": 4957936,
"step": 7580
},
{
"epoch": 0.831415104680478,
"grad_norm": 7.365682125091553,
"learning_rate": 3.4381457951572245e-06,
"loss": 2.8212,
"num_input_tokens_seen": 4961240,
"step": 7585
},
{
"epoch": 0.8319631700098652,
"grad_norm": 7.922868251800537,
"learning_rate": 3.41639267082704e-06,
"loss": 2.8681,
"num_input_tokens_seen": 4964016,
"step": 7590
},
{
"epoch": 0.8325112353392524,
"grad_norm": 4.56962251663208,
"learning_rate": 3.3947035329928768e-06,
"loss": 3.0944,
"num_input_tokens_seen": 4966208,
"step": 7595
},
{
"epoch": 0.8330593006686398,
"grad_norm": 8.027546882629395,
"learning_rate": 3.3730784459540755e-06,
"loss": 2.62,
"num_input_tokens_seen": 4969656,
"step": 7600
},
{
"epoch": 0.833607365998027,
"grad_norm": 9.634477615356445,
"learning_rate": 3.3515174738201204e-06,
"loss": 3.0848,
"num_input_tokens_seen": 4972656,
"step": 7605
},
{
"epoch": 0.8341554313274142,
"grad_norm": 6.137497901916504,
"learning_rate": 3.3300206805103902e-06,
"loss": 2.8019,
"num_input_tokens_seen": 4976816,
"step": 7610
},
{
"epoch": 0.8347034966568014,
"grad_norm": 6.958483695983887,
"learning_rate": 3.3085881297540143e-06,
"loss": 3.1585,
"num_input_tokens_seen": 4979448,
"step": 7615
},
{
"epoch": 0.8352515619861888,
"grad_norm": 6.135876178741455,
"learning_rate": 3.2872198850896763e-06,
"loss": 3.4485,
"num_input_tokens_seen": 4982096,
"step": 7620
},
{
"epoch": 0.835799627315576,
"grad_norm": 5.784817218780518,
"learning_rate": 3.265916009865405e-06,
"loss": 2.5781,
"num_input_tokens_seen": 4987624,
"step": 7625
},
{
"epoch": 0.8363476926449632,
"grad_norm": 7.2112603187561035,
"learning_rate": 3.2446765672384083e-06,
"loss": 3.1842,
"num_input_tokens_seen": 4991016,
"step": 7630
},
{
"epoch": 0.8368957579743506,
"grad_norm": 8.30711555480957,
"learning_rate": 3.223501620174871e-06,
"loss": 2.8567,
"num_input_tokens_seen": 4994496,
"step": 7635
},
{
"epoch": 0.8374438233037378,
"grad_norm": 5.6931915283203125,
"learning_rate": 3.2023912314497835e-06,
"loss": 3.109,
"num_input_tokens_seen": 4997176,
"step": 7640
},
{
"epoch": 0.837991888633125,
"grad_norm": 7.178470611572266,
"learning_rate": 3.18134546364674e-06,
"loss": 3.1472,
"num_input_tokens_seen": 5001168,
"step": 7645
},
{
"epoch": 0.8385399539625124,
"grad_norm": 6.247611045837402,
"learning_rate": 3.160364379157771e-06,
"loss": 3.0272,
"num_input_tokens_seen": 5004928,
"step": 7650
},
{
"epoch": 0.8390880192918996,
"grad_norm": 8.314835548400879,
"learning_rate": 3.1394480401831376e-06,
"loss": 3.1062,
"num_input_tokens_seen": 5007976,
"step": 7655
},
{
"epoch": 0.8396360846212868,
"grad_norm": 8.253650665283203,
"learning_rate": 3.118596508731153e-06,
"loss": 3.1373,
"num_input_tokens_seen": 5010840,
"step": 7660
},
{
"epoch": 0.8401841499506741,
"grad_norm": 8.37070083618164,
"learning_rate": 3.0978098466180246e-06,
"loss": 3.1474,
"num_input_tokens_seen": 5013264,
"step": 7665
},
{
"epoch": 0.8407322152800614,
"grad_norm": 7.3890700340271,
"learning_rate": 3.0770881154676244e-06,
"loss": 2.9336,
"num_input_tokens_seen": 5016288,
"step": 7670
},
{
"epoch": 0.8412802806094486,
"grad_norm": 9.55408000946045,
"learning_rate": 3.056431376711341e-06,
"loss": 3.1662,
"num_input_tokens_seen": 5019184,
"step": 7675
},
{
"epoch": 0.8418283459388359,
"grad_norm": 9.764185905456543,
"learning_rate": 3.035839691587891e-06,
"loss": 3.3416,
"num_input_tokens_seen": 5022032,
"step": 7680
},
{
"epoch": 0.8423764112682232,
"grad_norm": 6.572988510131836,
"learning_rate": 3.015313121143132e-06,
"loss": 3.44,
"num_input_tokens_seen": 5025704,
"step": 7685
},
{
"epoch": 0.8429244765976104,
"grad_norm": 6.35365629196167,
"learning_rate": 2.994851726229872e-06,
"loss": 2.8245,
"num_input_tokens_seen": 5029360,
"step": 7690
},
{
"epoch": 0.8434725419269977,
"grad_norm": 5.579585552215576,
"learning_rate": 2.9744555675077195e-06,
"loss": 2.9123,
"num_input_tokens_seen": 5032232,
"step": 7695
},
{
"epoch": 0.844020607256385,
"grad_norm": 9.263272285461426,
"learning_rate": 2.9541247054428732e-06,
"loss": 3.1231,
"num_input_tokens_seen": 5034616,
"step": 7700
},
{
"epoch": 0.8445686725857722,
"grad_norm": 6.095417022705078,
"learning_rate": 2.933859200307948e-06,
"loss": 2.822,
"num_input_tokens_seen": 5037736,
"step": 7705
},
{
"epoch": 0.8451167379151595,
"grad_norm": 7.388354778289795,
"learning_rate": 2.913659112181824e-06,
"loss": 2.8813,
"num_input_tokens_seen": 5040224,
"step": 7710
},
{
"epoch": 0.8456648032445467,
"grad_norm": 5.476953983306885,
"learning_rate": 2.893524500949424e-06,
"loss": 2.9058,
"num_input_tokens_seen": 5042920,
"step": 7715
},
{
"epoch": 0.846212868573934,
"grad_norm": 8.243193626403809,
"learning_rate": 2.8734554263015717e-06,
"loss": 3.0815,
"num_input_tokens_seen": 5046384,
"step": 7720
},
{
"epoch": 0.8467609339033213,
"grad_norm": 5.285266399383545,
"learning_rate": 2.853451947734795e-06,
"loss": 2.8613,
"num_input_tokens_seen": 5050096,
"step": 7725
},
{
"epoch": 0.8473089992327085,
"grad_norm": 7.07433557510376,
"learning_rate": 2.833514124551162e-06,
"loss": 3.2751,
"num_input_tokens_seen": 5053016,
"step": 7730
},
{
"epoch": 0.8478570645620958,
"grad_norm": 7.447408676147461,
"learning_rate": 2.8136420158580923e-06,
"loss": 3.199,
"num_input_tokens_seen": 5055816,
"step": 7735
},
{
"epoch": 0.8484051298914831,
"grad_norm": 6.6446757316589355,
"learning_rate": 2.793835680568202e-06,
"loss": 2.9382,
"num_input_tokens_seen": 5059872,
"step": 7740
},
{
"epoch": 0.8489531952208703,
"grad_norm": 6.634135723114014,
"learning_rate": 2.774095177399108e-06,
"loss": 2.7486,
"num_input_tokens_seen": 5063104,
"step": 7745
},
{
"epoch": 0.8495012605502575,
"grad_norm": 6.349103927612305,
"learning_rate": 2.75442056487325e-06,
"loss": 2.8114,
"num_input_tokens_seen": 5067312,
"step": 7750
},
{
"epoch": 0.8500493258796449,
"grad_norm": 9.979939460754395,
"learning_rate": 2.7348119013177605e-06,
"loss": 3.0652,
"num_input_tokens_seen": 5070232,
"step": 7755
},
{
"epoch": 0.8505973912090321,
"grad_norm": 9.005098342895508,
"learning_rate": 2.7152692448642297e-06,
"loss": 2.7476,
"num_input_tokens_seen": 5073736,
"step": 7760
},
{
"epoch": 0.8511454565384193,
"grad_norm": 7.502773761749268,
"learning_rate": 2.695792653448573e-06,
"loss": 2.6705,
"num_input_tokens_seen": 5076032,
"step": 7765
},
{
"epoch": 0.8516935218678067,
"grad_norm": 6.317687511444092,
"learning_rate": 2.6763821848108634e-06,
"loss": 2.7642,
"num_input_tokens_seen": 5078736,
"step": 7770
},
{
"epoch": 0.8522415871971939,
"grad_norm": 6.520786762237549,
"learning_rate": 2.6570378964951322e-06,
"loss": 2.9362,
"num_input_tokens_seen": 5081560,
"step": 7775
},
{
"epoch": 0.8527896525265811,
"grad_norm": 7.41638708114624,
"learning_rate": 2.637759845849211e-06,
"loss": 2.9981,
"num_input_tokens_seen": 5084504,
"step": 7780
},
{
"epoch": 0.8533377178559685,
"grad_norm": 7.572868824005127,
"learning_rate": 2.6185480900245836e-06,
"loss": 2.7595,
"num_input_tokens_seen": 5088232,
"step": 7785
},
{
"epoch": 0.8538857831853557,
"grad_norm": 6.104272842407227,
"learning_rate": 2.5994026859761766e-06,
"loss": 2.9084,
"num_input_tokens_seen": 5090552,
"step": 7790
},
{
"epoch": 0.8544338485147429,
"grad_norm": 8.887699127197266,
"learning_rate": 2.5803236904622134e-06,
"loss": 3.3633,
"num_input_tokens_seen": 5093720,
"step": 7795
},
{
"epoch": 0.8549819138441302,
"grad_norm": 7.048088550567627,
"learning_rate": 2.5613111600440637e-06,
"loss": 2.94,
"num_input_tokens_seen": 5096984,
"step": 7800
},
{
"epoch": 0.8555299791735175,
"grad_norm": 7.457699775695801,
"learning_rate": 2.5423651510860292e-06,
"loss": 2.9086,
"num_input_tokens_seen": 5100088,
"step": 7805
},
{
"epoch": 0.8560780445029047,
"grad_norm": 7.127599239349365,
"learning_rate": 2.5234857197552197e-06,
"loss": 3.2513,
"num_input_tokens_seen": 5102776,
"step": 7810
},
{
"epoch": 0.856626109832292,
"grad_norm": 6.716034412384033,
"learning_rate": 2.5046729220213615e-06,
"loss": 3.1929,
"num_input_tokens_seen": 5106680,
"step": 7815
},
{
"epoch": 0.8571741751616793,
"grad_norm": 8.033172607421875,
"learning_rate": 2.4859268136566415e-06,
"loss": 3.2828,
"num_input_tokens_seen": 5110400,
"step": 7820
},
{
"epoch": 0.8577222404910665,
"grad_norm": 7.232936859130859,
"learning_rate": 2.4672474502355406e-06,
"loss": 2.9178,
"num_input_tokens_seen": 5113896,
"step": 7825
},
{
"epoch": 0.8582703058204538,
"grad_norm": 7.433042526245117,
"learning_rate": 2.4486348871346738e-06,
"loss": 3.2398,
"num_input_tokens_seen": 5116440,
"step": 7830
},
{
"epoch": 0.858818371149841,
"grad_norm": 6.7432756423950195,
"learning_rate": 2.4300891795326157e-06,
"loss": 2.8448,
"num_input_tokens_seen": 5119296,
"step": 7835
},
{
"epoch": 0.8593664364792283,
"grad_norm": 6.955072402954102,
"learning_rate": 2.4116103824097345e-06,
"loss": 3.0554,
"num_input_tokens_seen": 5122136,
"step": 7840
},
{
"epoch": 0.8599145018086156,
"grad_norm": 7.900850296020508,
"learning_rate": 2.3931985505480564e-06,
"loss": 2.9951,
"num_input_tokens_seen": 5125056,
"step": 7845
},
{
"epoch": 0.8604625671380028,
"grad_norm": 5.292073726654053,
"learning_rate": 2.374853738531063e-06,
"loss": 3.1992,
"num_input_tokens_seen": 5128688,
"step": 7850
},
{
"epoch": 0.8610106324673901,
"grad_norm": 6.894753932952881,
"learning_rate": 2.356576000743557e-06,
"loss": 3.2569,
"num_input_tokens_seen": 5132184,
"step": 7855
},
{
"epoch": 0.8615586977967774,
"grad_norm": 6.101509094238281,
"learning_rate": 2.3383653913714996e-06,
"loss": 2.8422,
"num_input_tokens_seen": 5136352,
"step": 7860
},
{
"epoch": 0.8621067631261646,
"grad_norm": 6.467989444732666,
"learning_rate": 2.3202219644018365e-06,
"loss": 3.0615,
"num_input_tokens_seen": 5139152,
"step": 7865
},
{
"epoch": 0.8626548284555519,
"grad_norm": 6.982528209686279,
"learning_rate": 2.3021457736223412e-06,
"loss": 3.0371,
"num_input_tokens_seen": 5142336,
"step": 7870
},
{
"epoch": 0.8632028937849392,
"grad_norm": 5.719668388366699,
"learning_rate": 2.2841368726214755e-06,
"loss": 3.1793,
"num_input_tokens_seen": 5145504,
"step": 7875
},
{
"epoch": 0.8637509591143264,
"grad_norm": 6.815168380737305,
"learning_rate": 2.2661953147882024e-06,
"loss": 3.2501,
"num_input_tokens_seen": 5148672,
"step": 7880
},
{
"epoch": 0.8642990244437136,
"grad_norm": 6.836389541625977,
"learning_rate": 2.2483211533118357e-06,
"loss": 3.2825,
"num_input_tokens_seen": 5152104,
"step": 7885
},
{
"epoch": 0.864847089773101,
"grad_norm": 9.11992359161377,
"learning_rate": 2.2305144411819052e-06,
"loss": 3.1458,
"num_input_tokens_seen": 5154840,
"step": 7890
},
{
"epoch": 0.8653951551024882,
"grad_norm": 7.1421308517456055,
"learning_rate": 2.212775231187966e-06,
"loss": 3.2977,
"num_input_tokens_seen": 5157496,
"step": 7895
},
{
"epoch": 0.8659432204318754,
"grad_norm": 6.900385856628418,
"learning_rate": 2.1951035759194605e-06,
"loss": 2.9658,
"num_input_tokens_seen": 5161824,
"step": 7900
},
{
"epoch": 0.8664912857612628,
"grad_norm": 8.681853294372559,
"learning_rate": 2.1774995277655556e-06,
"loss": 2.9868,
"num_input_tokens_seen": 5164840,
"step": 7905
},
{
"epoch": 0.86703935109065,
"grad_norm": 6.421346187591553,
"learning_rate": 2.1599631389150027e-06,
"loss": 3.3,
"num_input_tokens_seen": 5169320,
"step": 7910
},
{
"epoch": 0.8675874164200372,
"grad_norm": 6.86265754699707,
"learning_rate": 2.1424944613559537e-06,
"loss": 3.1633,
"num_input_tokens_seen": 5172784,
"step": 7915
},
{
"epoch": 0.8681354817494246,
"grad_norm": 4.766587257385254,
"learning_rate": 2.1250935468758446e-06,
"loss": 3.2877,
"num_input_tokens_seen": 5175600,
"step": 7920
},
{
"epoch": 0.8686835470788118,
"grad_norm": 6.533714771270752,
"learning_rate": 2.1077604470612106e-06,
"loss": 2.9995,
"num_input_tokens_seen": 5178624,
"step": 7925
},
{
"epoch": 0.869231612408199,
"grad_norm": 7.438570022583008,
"learning_rate": 2.0904952132975386e-06,
"loss": 2.7973,
"num_input_tokens_seen": 5181688,
"step": 7930
},
{
"epoch": 0.8697796777375864,
"grad_norm": 7.600935459136963,
"learning_rate": 2.0732978967691357e-06,
"loss": 3.4927,
"num_input_tokens_seen": 5184008,
"step": 7935
},
{
"epoch": 0.8703277430669736,
"grad_norm": 10.930978775024414,
"learning_rate": 2.0561685484589506e-06,
"loss": 3.0121,
"num_input_tokens_seen": 5187600,
"step": 7940
},
{
"epoch": 0.8708758083963608,
"grad_norm": 8.671449661254883,
"learning_rate": 2.0391072191484338e-06,
"loss": 3.1692,
"num_input_tokens_seen": 5190976,
"step": 7945
},
{
"epoch": 0.8714238737257481,
"grad_norm": 9.432777404785156,
"learning_rate": 2.0221139594174018e-06,
"loss": 3.0802,
"num_input_tokens_seen": 5193664,
"step": 7950
},
{
"epoch": 0.8719719390551354,
"grad_norm": 8.096484184265137,
"learning_rate": 2.0051888196438552e-06,
"loss": 2.8438,
"num_input_tokens_seen": 5196696,
"step": 7955
},
{
"epoch": 0.8725200043845226,
"grad_norm": 8.458807945251465,
"learning_rate": 1.988331850003855e-06,
"loss": 3.4075,
"num_input_tokens_seen": 5200640,
"step": 7960
},
{
"epoch": 0.8730680697139099,
"grad_norm": 9.191377639770508,
"learning_rate": 1.971543100471368e-06,
"loss": 3.276,
"num_input_tokens_seen": 5204240,
"step": 7965
},
{
"epoch": 0.8736161350432972,
"grad_norm": 6.790607929229736,
"learning_rate": 1.954822620818114e-06,
"loss": 2.9706,
"num_input_tokens_seen": 5208024,
"step": 7970
},
{
"epoch": 0.8741642003726844,
"grad_norm": 7.511916637420654,
"learning_rate": 1.938170460613417e-06,
"loss": 2.8037,
"num_input_tokens_seen": 5211272,
"step": 7975
},
{
"epoch": 0.8747122657020717,
"grad_norm": 6.600817680358887,
"learning_rate": 1.921586669224071e-06,
"loss": 3.3576,
"num_input_tokens_seen": 5215392,
"step": 7980
},
{
"epoch": 0.875260331031459,
"grad_norm": 5.347980976104736,
"learning_rate": 1.9050712958141758e-06,
"loss": 3.3071,
"num_input_tokens_seen": 5217928,
"step": 7985
},
{
"epoch": 0.8758083963608462,
"grad_norm": 6.689899921417236,
"learning_rate": 1.8886243893450061e-06,
"loss": 3.2119,
"num_input_tokens_seen": 5220984,
"step": 7990
},
{
"epoch": 0.8763564616902335,
"grad_norm": 6.363076210021973,
"learning_rate": 1.8722459985748563e-06,
"loss": 2.9524,
"num_input_tokens_seen": 5224504,
"step": 7995
},
{
"epoch": 0.8769045270196207,
"grad_norm": 7.521759986877441,
"learning_rate": 1.8559361720588974e-06,
"loss": 3.1379,
"num_input_tokens_seen": 5227336,
"step": 8000
},
{
"epoch": 0.877452592349008,
"grad_norm": 8.488334655761719,
"learning_rate": 1.8396949581490463e-06,
"loss": 3.2758,
"num_input_tokens_seen": 5229968,
"step": 8005
},
{
"epoch": 0.8780006576783953,
"grad_norm": 7.164643287658691,
"learning_rate": 1.8235224049938049e-06,
"loss": 3.0142,
"num_input_tokens_seen": 5233280,
"step": 8010
},
{
"epoch": 0.8785487230077825,
"grad_norm": 8.150335311889648,
"learning_rate": 1.8074185605381239e-06,
"loss": 3.2278,
"num_input_tokens_seen": 5236408,
"step": 8015
},
{
"epoch": 0.8790967883371698,
"grad_norm": 9.74315357208252,
"learning_rate": 1.791383472523256e-06,
"loss": 3.3009,
"num_input_tokens_seen": 5240040,
"step": 8020
},
{
"epoch": 0.8796448536665571,
"grad_norm": 6.548309326171875,
"learning_rate": 1.7754171884866362e-06,
"loss": 3.0949,
"num_input_tokens_seen": 5243480,
"step": 8025
},
{
"epoch": 0.8801929189959443,
"grad_norm": 6.918182373046875,
"learning_rate": 1.7595197557617044e-06,
"loss": 3.1496,
"num_input_tokens_seen": 5246664,
"step": 8030
},
{
"epoch": 0.8807409843253315,
"grad_norm": 6.263129711151123,
"learning_rate": 1.7436912214777945e-06,
"loss": 2.9099,
"num_input_tokens_seen": 5249392,
"step": 8035
},
{
"epoch": 0.8812890496547189,
"grad_norm": 8.55476188659668,
"learning_rate": 1.7279316325599898e-06,
"loss": 2.8569,
"num_input_tokens_seen": 5252584,
"step": 8040
},
{
"epoch": 0.8818371149841061,
"grad_norm": 7.661272048950195,
"learning_rate": 1.7122410357289703e-06,
"loss": 2.9037,
"num_input_tokens_seen": 5256184,
"step": 8045
},
{
"epoch": 0.8823851803134933,
"grad_norm": 5.52952766418457,
"learning_rate": 1.6966194775008798e-06,
"loss": 3.0452,
"num_input_tokens_seen": 5260048,
"step": 8050
},
{
"epoch": 0.8829332456428807,
"grad_norm": 8.354534149169922,
"learning_rate": 1.6810670041872062e-06,
"loss": 3.005,
"num_input_tokens_seen": 5264288,
"step": 8055
},
{
"epoch": 0.8834813109722679,
"grad_norm": 7.364735126495361,
"learning_rate": 1.6655836618946151e-06,
"loss": 3.1181,
"num_input_tokens_seen": 5268000,
"step": 8060
},
{
"epoch": 0.8840293763016551,
"grad_norm": 7.844119071960449,
"learning_rate": 1.650169496524831e-06,
"loss": 2.9376,
"num_input_tokens_seen": 5270984,
"step": 8065
},
{
"epoch": 0.8845774416310425,
"grad_norm": 5.87100076675415,
"learning_rate": 1.6348245537745028e-06,
"loss": 3.1916,
"num_input_tokens_seen": 5274448,
"step": 8070
},
{
"epoch": 0.8851255069604297,
"grad_norm": 7.44371223449707,
"learning_rate": 1.6195488791350548e-06,
"loss": 2.9924,
"num_input_tokens_seen": 5277432,
"step": 8075
},
{
"epoch": 0.8856735722898169,
"grad_norm": 6.34487771987915,
"learning_rate": 1.6043425178925652e-06,
"loss": 3.0224,
"num_input_tokens_seen": 5279944,
"step": 8080
},
{
"epoch": 0.8862216376192042,
"grad_norm": 5.726871490478516,
"learning_rate": 1.5892055151276258e-06,
"loss": 2.7579,
"num_input_tokens_seen": 5283720,
"step": 8085
},
{
"epoch": 0.8867697029485915,
"grad_norm": 9.92805004119873,
"learning_rate": 1.574137915715207e-06,
"loss": 3.0515,
"num_input_tokens_seen": 5286392,
"step": 8090
},
{
"epoch": 0.8873177682779787,
"grad_norm": 9.383995056152344,
"learning_rate": 1.559139764324527e-06,
"loss": 3.3639,
"num_input_tokens_seen": 5289440,
"step": 8095
},
{
"epoch": 0.887865833607366,
"grad_norm": 6.371479034423828,
"learning_rate": 1.5442111054189246e-06,
"loss": 3.0694,
"num_input_tokens_seen": 5293168,
"step": 8100
},
{
"epoch": 0.8884138989367533,
"grad_norm": 7.600619316101074,
"learning_rate": 1.5293519832557113e-06,
"loss": 3.1645,
"num_input_tokens_seen": 5296272,
"step": 8105
},
{
"epoch": 0.8889619642661405,
"grad_norm": 10.624588966369629,
"learning_rate": 1.5145624418860637e-06,
"loss": 2.9331,
"num_input_tokens_seen": 5299248,
"step": 8110
},
{
"epoch": 0.8895100295955278,
"grad_norm": 6.536969184875488,
"learning_rate": 1.4998425251548654e-06,
"loss": 2.962,
"num_input_tokens_seen": 5302376,
"step": 8115
},
{
"epoch": 0.890058094924915,
"grad_norm": 5.556844234466553,
"learning_rate": 1.4851922767006088e-06,
"loss": 2.9318,
"num_input_tokens_seen": 5305704,
"step": 8120
},
{
"epoch": 0.8906061602543023,
"grad_norm": 7.522222995758057,
"learning_rate": 1.4706117399552383e-06,
"loss": 3.0438,
"num_input_tokens_seen": 5308112,
"step": 8125
},
{
"epoch": 0.8911542255836896,
"grad_norm": 9.176352500915527,
"learning_rate": 1.4561009581440272e-06,
"loss": 3.0732,
"num_input_tokens_seen": 5310768,
"step": 8130
},
{
"epoch": 0.8917022909130768,
"grad_norm": 6.739439010620117,
"learning_rate": 1.441659974285467e-06,
"loss": 3.0154,
"num_input_tokens_seen": 5313544,
"step": 8135
},
{
"epoch": 0.8922503562424641,
"grad_norm": 6.810214042663574,
"learning_rate": 1.4272888311911176e-06,
"loss": 3.0619,
"num_input_tokens_seen": 5316352,
"step": 8140
},
{
"epoch": 0.8927984215718514,
"grad_norm": 5.931697368621826,
"learning_rate": 1.4129875714654905e-06,
"loss": 3.3196,
"num_input_tokens_seen": 5320160,
"step": 8145
},
{
"epoch": 0.8933464869012386,
"grad_norm": 7.526365280151367,
"learning_rate": 1.398756237505927e-06,
"loss": 2.9404,
"num_input_tokens_seen": 5323560,
"step": 8150
},
{
"epoch": 0.8938945522306259,
"grad_norm": 6.762884616851807,
"learning_rate": 1.3845948715024648e-06,
"loss": 3.2493,
"num_input_tokens_seen": 5326504,
"step": 8155
},
{
"epoch": 0.8944426175600132,
"grad_norm": 4.969104290008545,
"learning_rate": 1.37050351543771e-06,
"loss": 3.3379,
"num_input_tokens_seen": 5329424,
"step": 8160
},
{
"epoch": 0.8949906828894004,
"grad_norm": 6.4593586921691895,
"learning_rate": 1.3564822110867264e-06,
"loss": 3.2228,
"num_input_tokens_seen": 5332600,
"step": 8165
},
{
"epoch": 0.8955387482187877,
"grad_norm": 7.721135139465332,
"learning_rate": 1.3425310000169028e-06,
"loss": 3.2133,
"num_input_tokens_seen": 5335792,
"step": 8170
},
{
"epoch": 0.896086813548175,
"grad_norm": 8.572230339050293,
"learning_rate": 1.3286499235878214e-06,
"loss": 3.1945,
"num_input_tokens_seen": 5339616,
"step": 8175
},
{
"epoch": 0.8966348788775622,
"grad_norm": 7.773857593536377,
"learning_rate": 1.3148390229511532e-06,
"loss": 2.9125,
"num_input_tokens_seen": 5342320,
"step": 8180
},
{
"epoch": 0.8971829442069494,
"grad_norm": 7.451086521148682,
"learning_rate": 1.3010983390505244e-06,
"loss": 3.1514,
"num_input_tokens_seen": 5345336,
"step": 8185
},
{
"epoch": 0.8977310095363368,
"grad_norm": 7.28810453414917,
"learning_rate": 1.2874279126213973e-06,
"loss": 3.1191,
"num_input_tokens_seen": 5348880,
"step": 8190
},
{
"epoch": 0.898279074865724,
"grad_norm": 4.2049078941345215,
"learning_rate": 1.2738277841909479e-06,
"loss": 2.9685,
"num_input_tokens_seen": 5352936,
"step": 8195
},
{
"epoch": 0.8988271401951112,
"grad_norm": 7.404577732086182,
"learning_rate": 1.2602979940779524e-06,
"loss": 3.107,
"num_input_tokens_seen": 5355952,
"step": 8200
},
{
"epoch": 0.8993752055244986,
"grad_norm": 11.230597496032715,
"learning_rate": 1.2468385823926481e-06,
"loss": 2.9561,
"num_input_tokens_seen": 5359608,
"step": 8205
},
{
"epoch": 0.8999232708538858,
"grad_norm": 8.928146362304688,
"learning_rate": 1.233449589036656e-06,
"loss": 3.172,
"num_input_tokens_seen": 5363024,
"step": 8210
},
{
"epoch": 0.900471336183273,
"grad_norm": 5.939243316650391,
"learning_rate": 1.2201310537028138e-06,
"loss": 3.0996,
"num_input_tokens_seen": 5366928,
"step": 8215
},
{
"epoch": 0.9010194015126604,
"grad_norm": 7.374519348144531,
"learning_rate": 1.206883015875085e-06,
"loss": 3.0966,
"num_input_tokens_seen": 5369984,
"step": 8220
},
{
"epoch": 0.9015674668420476,
"grad_norm": 8.059386253356934,
"learning_rate": 1.1937055148284444e-06,
"loss": 3.0717,
"num_input_tokens_seen": 5372632,
"step": 8225
},
{
"epoch": 0.9021155321714348,
"grad_norm": 8.80373764038086,
"learning_rate": 1.1805985896287452e-06,
"loss": 3.1543,
"num_input_tokens_seen": 5375544,
"step": 8230
},
{
"epoch": 0.9026635975008221,
"grad_norm": 6.8497443199157715,
"learning_rate": 1.1675622791326169e-06,
"loss": 2.9531,
"num_input_tokens_seen": 5378856,
"step": 8235
},
{
"epoch": 0.9032116628302094,
"grad_norm": 7.791383266448975,
"learning_rate": 1.1545966219873444e-06,
"loss": 2.9187,
"num_input_tokens_seen": 5382752,
"step": 8240
},
{
"epoch": 0.9037597281595966,
"grad_norm": 6.825507640838623,
"learning_rate": 1.1417016566307586e-06,
"loss": 2.8782,
"num_input_tokens_seen": 5386080,
"step": 8245
},
{
"epoch": 0.9043077934889839,
"grad_norm": 6.135127544403076,
"learning_rate": 1.1288774212911052e-06,
"loss": 2.8879,
"num_input_tokens_seen": 5389680,
"step": 8250
},
{
"epoch": 0.9048558588183712,
"grad_norm": 8.292460441589355,
"learning_rate": 1.1161239539869668e-06,
"loss": 2.9108,
"num_input_tokens_seen": 5393112,
"step": 8255
},
{
"epoch": 0.9054039241477584,
"grad_norm": 6.192307949066162,
"learning_rate": 1.1034412925271075e-06,
"loss": 2.72,
"num_input_tokens_seen": 5397056,
"step": 8260
},
{
"epoch": 0.9059519894771457,
"grad_norm": 6.773381233215332,
"learning_rate": 1.0908294745103882e-06,
"loss": 2.7747,
"num_input_tokens_seen": 5400928,
"step": 8265
},
{
"epoch": 0.906500054806533,
"grad_norm": 9.411810874938965,
"learning_rate": 1.078288537325653e-06,
"loss": 3.1762,
"num_input_tokens_seen": 5403744,
"step": 8270
},
{
"epoch": 0.9070481201359202,
"grad_norm": 5.909646511077881,
"learning_rate": 1.0658185181516094e-06,
"loss": 2.9356,
"num_input_tokens_seen": 5406888,
"step": 8275
},
{
"epoch": 0.9075961854653075,
"grad_norm": 8.18594741821289,
"learning_rate": 1.0534194539567194e-06,
"loss": 3.0487,
"num_input_tokens_seen": 5409856,
"step": 8280
},
{
"epoch": 0.9081442507946947,
"grad_norm": 10.775045394897461,
"learning_rate": 1.0410913814990985e-06,
"loss": 2.8025,
"num_input_tokens_seen": 5412416,
"step": 8285
},
{
"epoch": 0.908692316124082,
"grad_norm": 8.237727165222168,
"learning_rate": 1.0288343373263954e-06,
"loss": 3.0227,
"num_input_tokens_seen": 5415176,
"step": 8290
},
{
"epoch": 0.9092403814534693,
"grad_norm": 7.0511884689331055,
"learning_rate": 1.016648357775693e-06,
"loss": 2.8189,
"num_input_tokens_seen": 5418552,
"step": 8295
},
{
"epoch": 0.9097884467828565,
"grad_norm": 6.959300518035889,
"learning_rate": 1.004533478973399e-06,
"loss": 3.3864,
"num_input_tokens_seen": 5421712,
"step": 8300
},
{
"epoch": 0.9103365121122438,
"grad_norm": 7.333334922790527,
"learning_rate": 9.924897368351282e-07,
"loss": 3.1543,
"num_input_tokens_seen": 5425312,
"step": 8305
},
{
"epoch": 0.9108845774416311,
"grad_norm": 7.005816459655762,
"learning_rate": 9.805171670656117e-07,
"loss": 3.1113,
"num_input_tokens_seen": 5428680,
"step": 8310
},
{
"epoch": 0.9114326427710183,
"grad_norm": 5.512388229370117,
"learning_rate": 9.686158051585874e-07,
"loss": 3.0001,
"num_input_tokens_seen": 5431848,
"step": 8315
},
{
"epoch": 0.9119807081004055,
"grad_norm": 6.378774642944336,
"learning_rate": 9.56785686396683e-07,
"loss": 3.1063,
"num_input_tokens_seen": 5434648,
"step": 8320
},
{
"epoch": 0.9125287734297929,
"grad_norm": 6.719765663146973,
"learning_rate": 9.450268458513156e-07,
"loss": 2.7967,
"num_input_tokens_seen": 5438728,
"step": 8325
},
{
"epoch": 0.9130768387591801,
"grad_norm": 8.518233299255371,
"learning_rate": 9.333393183826089e-07,
"loss": 2.7597,
"num_input_tokens_seen": 5442232,
"step": 8330
},
{
"epoch": 0.9136249040885673,
"grad_norm": 7.718142986297607,
"learning_rate": 9.217231386392577e-07,
"loss": 3.5149,
"num_input_tokens_seen": 5445320,
"step": 8335
},
{
"epoch": 0.9141729694179547,
"grad_norm": 7.286013603210449,
"learning_rate": 9.101783410584458e-07,
"loss": 3.2542,
"num_input_tokens_seen": 5448280,
"step": 8340
},
{
"epoch": 0.9147210347473419,
"grad_norm": 6.524003028869629,
"learning_rate": 8.987049598657398e-07,
"loss": 3.0042,
"num_input_tokens_seen": 5452360,
"step": 8345
},
{
"epoch": 0.9152691000767291,
"grad_norm": 6.262417316436768,
"learning_rate": 8.87303029074979e-07,
"loss": 2.6819,
"num_input_tokens_seen": 5455872,
"step": 8350
},
{
"epoch": 0.9158171654061165,
"grad_norm": 6.51323127746582,
"learning_rate": 8.75972582488191e-07,
"loss": 3.1662,
"num_input_tokens_seen": 5458616,
"step": 8355
},
{
"epoch": 0.9163652307355037,
"grad_norm": 7.502628803253174,
"learning_rate": 8.647136536954787e-07,
"loss": 2.4922,
"num_input_tokens_seen": 5461408,
"step": 8360
},
{
"epoch": 0.9169132960648909,
"grad_norm": 6.768873691558838,
"learning_rate": 8.535262760749202e-07,
"loss": 2.7696,
"num_input_tokens_seen": 5466664,
"step": 8365
},
{
"epoch": 0.9174613613942783,
"grad_norm": 9.054154396057129,
"learning_rate": 8.4241048279248e-07,
"loss": 3.3125,
"num_input_tokens_seen": 5469400,
"step": 8370
},
{
"epoch": 0.9180094267236655,
"grad_norm": 7.729340076446533,
"learning_rate": 8.313663068019007e-07,
"loss": 3.383,
"num_input_tokens_seen": 5472936,
"step": 8375
},
{
"epoch": 0.9185574920530527,
"grad_norm": 8.844609260559082,
"learning_rate": 8.203937808446083e-07,
"loss": 2.7089,
"num_input_tokens_seen": 5476176,
"step": 8380
},
{
"epoch": 0.91910555738244,
"grad_norm": 7.043740272521973,
"learning_rate": 8.094929374496185e-07,
"loss": 3.2024,
"num_input_tokens_seen": 5479576,
"step": 8385
},
{
"epoch": 0.9196536227118273,
"grad_norm": 8.144498825073242,
"learning_rate": 7.986638089334392e-07,
"loss": 3.4681,
"num_input_tokens_seen": 5483592,
"step": 8390
},
{
"epoch": 0.9202016880412145,
"grad_norm": 7.295477867126465,
"learning_rate": 7.879064273999731e-07,
"loss": 3.3592,
"num_input_tokens_seen": 5486736,
"step": 8395
},
{
"epoch": 0.9207497533706018,
"grad_norm": 6.9401960372924805,
"learning_rate": 7.772208247404128e-07,
"loss": 2.8916,
"num_input_tokens_seen": 5489720,
"step": 8400
},
{
"epoch": 0.9212978186999891,
"grad_norm": 5.044391632080078,
"learning_rate": 7.666070326331709e-07,
"loss": 2.9984,
"num_input_tokens_seen": 5494312,
"step": 8405
},
{
"epoch": 0.9218458840293763,
"grad_norm": 7.426214218139648,
"learning_rate": 7.560650825437637e-07,
"loss": 2.6398,
"num_input_tokens_seen": 5498536,
"step": 8410
},
{
"epoch": 0.9223939493587635,
"grad_norm": 6.066382884979248,
"learning_rate": 7.455950057247252e-07,
"loss": 3.0293,
"num_input_tokens_seen": 5501256,
"step": 8415
},
{
"epoch": 0.9229420146881508,
"grad_norm": 6.4779181480407715,
"learning_rate": 7.351968332155152e-07,
"loss": 3.0215,
"num_input_tokens_seen": 5504440,
"step": 8420
},
{
"epoch": 0.9234900800175381,
"grad_norm": 5.473248481750488,
"learning_rate": 7.248705958424307e-07,
"loss": 2.9114,
"num_input_tokens_seen": 5507752,
"step": 8425
},
{
"epoch": 0.9240381453469253,
"grad_norm": 7.87445592880249,
"learning_rate": 7.146163242185033e-07,
"loss": 3.0642,
"num_input_tokens_seen": 5511168,
"step": 8430
},
{
"epoch": 0.9245862106763126,
"grad_norm": 7.2715959548950195,
"learning_rate": 7.044340487434242e-07,
"loss": 3.0391,
"num_input_tokens_seen": 5513984,
"step": 8435
},
{
"epoch": 0.9251342760056999,
"grad_norm": 7.839521408081055,
"learning_rate": 6.943237996034386e-07,
"loss": 3.2316,
"num_input_tokens_seen": 5516632,
"step": 8440
},
{
"epoch": 0.9256823413350871,
"grad_norm": 7.8146820068359375,
"learning_rate": 6.842856067712677e-07,
"loss": 3.0688,
"num_input_tokens_seen": 5520488,
"step": 8445
},
{
"epoch": 0.9262304066644744,
"grad_norm": 7.480862140655518,
"learning_rate": 6.743195000060154e-07,
"loss": 2.8072,
"num_input_tokens_seen": 5524136,
"step": 8450
},
{
"epoch": 0.9267784719938617,
"grad_norm": 6.187289237976074,
"learning_rate": 6.644255088530782e-07,
"loss": 3.1597,
"num_input_tokens_seen": 5528256,
"step": 8455
},
{
"epoch": 0.9273265373232489,
"grad_norm": 7.108201026916504,
"learning_rate": 6.546036626440599e-07,
"loss": 2.8195,
"num_input_tokens_seen": 5531368,
"step": 8460
},
{
"epoch": 0.9278746026526362,
"grad_norm": 9.429540634155273,
"learning_rate": 6.448539904966827e-07,
"loss": 3.1321,
"num_input_tokens_seen": 5534144,
"step": 8465
},
{
"epoch": 0.9284226679820234,
"grad_norm": 6.745710849761963,
"learning_rate": 6.351765213147037e-07,
"loss": 2.8217,
"num_input_tokens_seen": 5536848,
"step": 8470
},
{
"epoch": 0.9289707333114107,
"grad_norm": 6.650664806365967,
"learning_rate": 6.255712837878347e-07,
"loss": 3.1658,
"num_input_tokens_seen": 5540136,
"step": 8475
},
{
"epoch": 0.929518798640798,
"grad_norm": 7.63946008682251,
"learning_rate": 6.160383063916419e-07,
"loss": 3.1177,
"num_input_tokens_seen": 5543192,
"step": 8480
},
{
"epoch": 0.9300668639701852,
"grad_norm": 7.223082542419434,
"learning_rate": 6.065776173874687e-07,
"loss": 3.6049,
"num_input_tokens_seen": 5547392,
"step": 8485
},
{
"epoch": 0.9306149292995725,
"grad_norm": 7.673356533050537,
"learning_rate": 5.971892448223576e-07,
"loss": 2.8851,
"num_input_tokens_seen": 5550056,
"step": 8490
},
{
"epoch": 0.9311629946289598,
"grad_norm": 7.799294471740723,
"learning_rate": 5.878732165289668e-07,
"loss": 3.2135,
"num_input_tokens_seen": 5552728,
"step": 8495
},
{
"epoch": 0.931711059958347,
"grad_norm": 5.8991312980651855,
"learning_rate": 5.786295601254765e-07,
"loss": 3.5495,
"num_input_tokens_seen": 5556008,
"step": 8500
},
{
"epoch": 0.9322591252877342,
"grad_norm": 8.919817924499512,
"learning_rate": 5.694583030155131e-07,
"loss": 3.2696,
"num_input_tokens_seen": 5558680,
"step": 8505
},
{
"epoch": 0.9328071906171216,
"grad_norm": 6.0595293045043945,
"learning_rate": 5.60359472388075e-07,
"loss": 3.1983,
"num_input_tokens_seen": 5561976,
"step": 8510
},
{
"epoch": 0.9333552559465088,
"grad_norm": 7.8532185554504395,
"learning_rate": 5.513330952174462e-07,
"loss": 2.8831,
"num_input_tokens_seen": 5565032,
"step": 8515
},
{
"epoch": 0.933903321275896,
"grad_norm": 6.592312335968018,
"learning_rate": 5.423791982631071e-07,
"loss": 3.2783,
"num_input_tokens_seen": 5567976,
"step": 8520
},
{
"epoch": 0.9344513866052834,
"grad_norm": 5.455694198608398,
"learning_rate": 5.334978080696773e-07,
"loss": 2.3299,
"num_input_tokens_seen": 5572544,
"step": 8525
},
{
"epoch": 0.9349994519346706,
"grad_norm": 6.956151008605957,
"learning_rate": 5.246889509668118e-07,
"loss": 3.0221,
"num_input_tokens_seen": 5575256,
"step": 8530
},
{
"epoch": 0.9355475172640578,
"grad_norm": 7.278057098388672,
"learning_rate": 5.159526530691378e-07,
"loss": 3.2783,
"num_input_tokens_seen": 5577928,
"step": 8535
},
{
"epoch": 0.9360955825934452,
"grad_norm": 5.909106731414795,
"learning_rate": 5.072889402761821e-07,
"loss": 3.2452,
"num_input_tokens_seen": 5580632,
"step": 8540
},
{
"epoch": 0.9366436479228324,
"grad_norm": 6.952794075012207,
"learning_rate": 4.986978382722773e-07,
"loss": 3.0232,
"num_input_tokens_seen": 5584824,
"step": 8545
},
{
"epoch": 0.9371917132522196,
"grad_norm": 8.14654541015625,
"learning_rate": 4.901793725264975e-07,
"loss": 3.0803,
"num_input_tokens_seen": 5589208,
"step": 8550
},
{
"epoch": 0.937739778581607,
"grad_norm": 6.610713958740234,
"learning_rate": 4.817335682925805e-07,
"loss": 2.8802,
"num_input_tokens_seen": 5592056,
"step": 8555
},
{
"epoch": 0.9382878439109942,
"grad_norm": 10.567109107971191,
"learning_rate": 4.73360450608859e-07,
"loss": 3.3952,
"num_input_tokens_seen": 5595120,
"step": 8560
},
{
"epoch": 0.9388359092403814,
"grad_norm": 7.1954545974731445,
"learning_rate": 4.6506004429817117e-07,
"loss": 3.2835,
"num_input_tokens_seen": 5598408,
"step": 8565
},
{
"epoch": 0.9393839745697687,
"grad_norm": 7.200895309448242,
"learning_rate": 4.568323739677971e-07,
"loss": 3.2721,
"num_input_tokens_seen": 5602328,
"step": 8570
},
{
"epoch": 0.939932039899156,
"grad_norm": 7.637218952178955,
"learning_rate": 4.486774640093894e-07,
"loss": 3.0411,
"num_input_tokens_seen": 5606096,
"step": 8575
},
{
"epoch": 0.9404801052285432,
"grad_norm": 8.214374542236328,
"learning_rate": 4.405953385988898e-07,
"loss": 3.1399,
"num_input_tokens_seen": 5608544,
"step": 8580
},
{
"epoch": 0.9410281705579305,
"grad_norm": 7.163279056549072,
"learning_rate": 4.325860216964711e-07,
"loss": 2.7451,
"num_input_tokens_seen": 5611872,
"step": 8585
},
{
"epoch": 0.9415762358873178,
"grad_norm": 7.930347919464111,
"learning_rate": 4.2464953704645647e-07,
"loss": 2.9838,
"num_input_tokens_seen": 5614440,
"step": 8590
},
{
"epoch": 0.942124301216705,
"grad_norm": 4.849373817443848,
"learning_rate": 4.167859081772446e-07,
"loss": 2.9805,
"num_input_tokens_seen": 5617856,
"step": 8595
},
{
"epoch": 0.9426723665460923,
"grad_norm": 8.461563110351562,
"learning_rate": 4.0899515840125966e-07,
"loss": 3.2951,
"num_input_tokens_seen": 5620824,
"step": 8600
},
{
"epoch": 0.9432204318754795,
"grad_norm": 8.734384536743164,
"learning_rate": 4.0127731081485987e-07,
"loss": 3.3802,
"num_input_tokens_seen": 5624696,
"step": 8605
},
{
"epoch": 0.9437684972048668,
"grad_norm": 9.480766296386719,
"learning_rate": 3.936323882982762e-07,
"loss": 2.8742,
"num_input_tokens_seen": 5628648,
"step": 8610
},
{
"epoch": 0.9443165625342541,
"grad_norm": 8.393555641174316,
"learning_rate": 3.8606041351555986e-07,
"loss": 3.3445,
"num_input_tokens_seen": 5631048,
"step": 8615
},
{
"epoch": 0.9448646278636413,
"grad_norm": 5.754420757293701,
"learning_rate": 3.785614089144879e-07,
"loss": 3.2994,
"num_input_tokens_seen": 5634840,
"step": 8620
},
{
"epoch": 0.9454126931930286,
"grad_norm": 7.406842231750488,
"learning_rate": 3.7113539672651853e-07,
"loss": 3.2169,
"num_input_tokens_seen": 5639056,
"step": 8625
},
{
"epoch": 0.9459607585224159,
"grad_norm": 8.346644401550293,
"learning_rate": 3.637823989667166e-07,
"loss": 3.5016,
"num_input_tokens_seen": 5642368,
"step": 8630
},
{
"epoch": 0.9465088238518031,
"grad_norm": 6.256731033325195,
"learning_rate": 3.565024374336895e-07,
"loss": 2.9251,
"num_input_tokens_seen": 5645288,
"step": 8635
},
{
"epoch": 0.9470568891811904,
"grad_norm": 8.30922794342041,
"learning_rate": 3.4929553370951496e-07,
"loss": 2.897,
"num_input_tokens_seen": 5648256,
"step": 8640
},
{
"epoch": 0.9476049545105777,
"grad_norm": 5.839921951293945,
"learning_rate": 3.421617091596996e-07,
"loss": 3.0709,
"num_input_tokens_seen": 5651456,
"step": 8645
},
{
"epoch": 0.9481530198399649,
"grad_norm": 8.873268127441406,
"learning_rate": 3.3510098493308715e-07,
"loss": 2.8349,
"num_input_tokens_seen": 5654936,
"step": 8650
},
{
"epoch": 0.9487010851693521,
"grad_norm": 7.447127342224121,
"learning_rate": 3.2811338196181706e-07,
"loss": 3.1457,
"num_input_tokens_seen": 5658344,
"step": 8655
},
{
"epoch": 0.9492491504987395,
"grad_norm": 7.901216506958008,
"learning_rate": 3.211989209612437e-07,
"loss": 3.0331,
"num_input_tokens_seen": 5661088,
"step": 8660
},
{
"epoch": 0.9497972158281267,
"grad_norm": 6.363575458526611,
"learning_rate": 3.1435762242990053e-07,
"loss": 3.0904,
"num_input_tokens_seen": 5664544,
"step": 8665
},
{
"epoch": 0.9503452811575139,
"grad_norm": 8.245457649230957,
"learning_rate": 3.0758950664940833e-07,
"loss": 2.9634,
"num_input_tokens_seen": 5667704,
"step": 8670
},
{
"epoch": 0.9508933464869013,
"grad_norm": 6.969222068786621,
"learning_rate": 3.008945936844504e-07,
"loss": 2.9006,
"num_input_tokens_seen": 5671088,
"step": 8675
},
{
"epoch": 0.9514414118162885,
"grad_norm": 9.956710815429688,
"learning_rate": 2.942729033826752e-07,
"loss": 3.3092,
"num_input_tokens_seen": 5673784,
"step": 8680
},
{
"epoch": 0.9519894771456757,
"grad_norm": 6.730470657348633,
"learning_rate": 2.877244553746633e-07,
"loss": 2.8794,
"num_input_tokens_seen": 5677024,
"step": 8685
},
{
"epoch": 0.9525375424750631,
"grad_norm": 7.628656387329102,
"learning_rate": 2.8124926907386885e-07,
"loss": 2.9683,
"num_input_tokens_seen": 5680552,
"step": 8690
},
{
"epoch": 0.9530856078044503,
"grad_norm": 8.587575912475586,
"learning_rate": 2.748473636765475e-07,
"loss": 3.0311,
"num_input_tokens_seen": 5684128,
"step": 8695
},
{
"epoch": 0.9536336731338375,
"grad_norm": 8.781567573547363,
"learning_rate": 2.6851875816170655e-07,
"loss": 2.9722,
"num_input_tokens_seen": 5687784,
"step": 8700
},
{
"epoch": 0.9541817384632248,
"grad_norm": 6.88287353515625,
"learning_rate": 2.622634712910521e-07,
"loss": 3.3128,
"num_input_tokens_seen": 5690464,
"step": 8705
},
{
"epoch": 0.9547298037926121,
"grad_norm": 7.1090874671936035,
"learning_rate": 2.560815216089335e-07,
"loss": 3.0189,
"num_input_tokens_seen": 5693312,
"step": 8710
},
{
"epoch": 0.9552778691219993,
"grad_norm": 7.3000168800354,
"learning_rate": 2.499729274422796e-07,
"loss": 3.5534,
"num_input_tokens_seen": 5697232,
"step": 8715
},
{
"epoch": 0.9558259344513866,
"grad_norm": 8.97269344329834,
"learning_rate": 2.439377069005544e-07,
"loss": 3.5597,
"num_input_tokens_seen": 5699808,
"step": 8720
},
{
"epoch": 0.9563739997807739,
"grad_norm": 8.973227500915527,
"learning_rate": 2.3797587787569852e-07,
"loss": 3.0848,
"num_input_tokens_seen": 5703784,
"step": 8725
},
{
"epoch": 0.9569220651101611,
"grad_norm": 7.142612934112549,
"learning_rate": 2.3208745804207398e-07,
"loss": 2.8029,
"num_input_tokens_seen": 5706344,
"step": 8730
},
{
"epoch": 0.9574701304395484,
"grad_norm": 8.567402839660645,
"learning_rate": 2.262724648564224e-07,
"loss": 3.3482,
"num_input_tokens_seen": 5710600,
"step": 8735
},
{
"epoch": 0.9580181957689357,
"grad_norm": 11.277481079101562,
"learning_rate": 2.2053091555779837e-07,
"loss": 3.0415,
"num_input_tokens_seen": 5714152,
"step": 8740
},
{
"epoch": 0.9585662610983229,
"grad_norm": 7.343226432800293,
"learning_rate": 2.1486282716752791e-07,
"loss": 3.0087,
"num_input_tokens_seen": 5716376,
"step": 8745
},
{
"epoch": 0.9591143264277102,
"grad_norm": 6.354895114898682,
"learning_rate": 2.0926821648915574e-07,
"loss": 3.0672,
"num_input_tokens_seen": 5719152,
"step": 8750
},
{
"epoch": 0.9596623917570974,
"grad_norm": 7.212831497192383,
"learning_rate": 2.0374710010839793e-07,
"loss": 3.3,
"num_input_tokens_seen": 5723064,
"step": 8755
},
{
"epoch": 0.9602104570864847,
"grad_norm": 6.967692852020264,
"learning_rate": 1.982994943930838e-07,
"loss": 3.1401,
"num_input_tokens_seen": 5725768,
"step": 8760
},
{
"epoch": 0.960758522415872,
"grad_norm": 8.500665664672852,
"learning_rate": 1.9292541549311983e-07,
"loss": 3.2358,
"num_input_tokens_seen": 5728104,
"step": 8765
},
{
"epoch": 0.9613065877452592,
"grad_norm": 7.204361915588379,
"learning_rate": 1.876248793404367e-07,
"loss": 2.9241,
"num_input_tokens_seen": 5730688,
"step": 8770
},
{
"epoch": 0.9618546530746465,
"grad_norm": 7.031684398651123,
"learning_rate": 1.8239790164893412e-07,
"loss": 3.2293,
"num_input_tokens_seen": 5733936,
"step": 8775
},
{
"epoch": 0.9624027184040338,
"grad_norm": 8.101325035095215,
"learning_rate": 1.7724449791444997e-07,
"loss": 2.7716,
"num_input_tokens_seen": 5737880,
"step": 8780
},
{
"epoch": 0.962950783733421,
"grad_norm": 6.74721622467041,
"learning_rate": 1.721646834146967e-07,
"loss": 2.715,
"num_input_tokens_seen": 5741936,
"step": 8785
},
{
"epoch": 0.9634988490628082,
"grad_norm": 9.26173210144043,
"learning_rate": 1.671584732092335e-07,
"loss": 2.8224,
"num_input_tokens_seen": 5746160,
"step": 8790
},
{
"epoch": 0.9640469143921956,
"grad_norm": 5.797330856323242,
"learning_rate": 1.6222588213940792e-07,
"loss": 3.3261,
"num_input_tokens_seen": 5750696,
"step": 8795
},
{
"epoch": 0.9645949797215828,
"grad_norm": 9.205500602722168,
"learning_rate": 1.5736692482831995e-07,
"loss": 2.9268,
"num_input_tokens_seen": 5753384,
"step": 8800
},
{
"epoch": 0.96514304505097,
"grad_norm": 6.270941257476807,
"learning_rate": 1.5258161568077188e-07,
"loss": 2.8041,
"num_input_tokens_seen": 5756640,
"step": 8805
},
{
"epoch": 0.9656911103803574,
"grad_norm": 7.947140693664551,
"learning_rate": 1.4786996888323524e-07,
"loss": 3.1006,
"num_input_tokens_seen": 5759848,
"step": 8810
},
{
"epoch": 0.9662391757097446,
"grad_norm": 8.765256881713867,
"learning_rate": 1.4323199840380053e-07,
"loss": 3.2065,
"num_input_tokens_seen": 5763416,
"step": 8815
},
{
"epoch": 0.9667872410391318,
"grad_norm": 5.335040092468262,
"learning_rate": 1.3866771799213307e-07,
"loss": 2.9768,
"num_input_tokens_seen": 5766160,
"step": 8820
},
{
"epoch": 0.9673353063685192,
"grad_norm": 5.483620643615723,
"learning_rate": 1.3417714117944513e-07,
"loss": 2.8682,
"num_input_tokens_seen": 5771024,
"step": 8825
},
{
"epoch": 0.9678833716979064,
"grad_norm": 8.511704444885254,
"learning_rate": 1.2976028127844597e-07,
"loss": 3.1851,
"num_input_tokens_seen": 5774632,
"step": 8830
},
{
"epoch": 0.9684314370272936,
"grad_norm": 6.916325569152832,
"learning_rate": 1.25417151383303e-07,
"loss": 3.2018,
"num_input_tokens_seen": 5778048,
"step": 8835
},
{
"epoch": 0.968979502356681,
"grad_norm": 6.791527271270752,
"learning_rate": 1.2114776436960294e-07,
"loss": 3.1153,
"num_input_tokens_seen": 5781288,
"step": 8840
},
{
"epoch": 0.9695275676860682,
"grad_norm": 7.304278373718262,
"learning_rate": 1.1695213289432406e-07,
"loss": 2.7359,
"num_input_tokens_seen": 5783776,
"step": 8845
},
{
"epoch": 0.9700756330154554,
"grad_norm": 7.467769145965576,
"learning_rate": 1.128302693957778e-07,
"loss": 3.1941,
"num_input_tokens_seen": 5786120,
"step": 8850
},
{
"epoch": 0.9706236983448427,
"grad_norm": 8.969725608825684,
"learning_rate": 1.0878218609359502e-07,
"loss": 3.0654,
"num_input_tokens_seen": 5789672,
"step": 8855
},
{
"epoch": 0.97117176367423,
"grad_norm": 8.292722702026367,
"learning_rate": 1.0480789498866772e-07,
"loss": 2.9517,
"num_input_tokens_seen": 5792480,
"step": 8860
},
{
"epoch": 0.9717198290036172,
"grad_norm": 5.788974285125732,
"learning_rate": 1.0090740786313502e-07,
"loss": 2.9964,
"num_input_tokens_seen": 5796848,
"step": 8865
},
{
"epoch": 0.9722678943330045,
"grad_norm": 8.003725051879883,
"learning_rate": 9.708073628033055e-08,
"loss": 2.8592,
"num_input_tokens_seen": 5801376,
"step": 8870
},
{
"epoch": 0.9728159596623918,
"grad_norm": 6.711467742919922,
"learning_rate": 9.332789158476018e-08,
"loss": 2.9653,
"num_input_tokens_seen": 5804480,
"step": 8875
},
{
"epoch": 0.973364024991779,
"grad_norm": 5.3671417236328125,
"learning_rate": 8.964888490205769e-08,
"loss": 3.1577,
"num_input_tokens_seen": 5807632,
"step": 8880
},
{
"epoch": 0.9739120903211663,
"grad_norm": 6.408278942108154,
"learning_rate": 8.604372713896247e-08,
"loss": 2.7764,
"num_input_tokens_seen": 5810096,
"step": 8885
},
{
"epoch": 0.9744601556505536,
"grad_norm": 8.041277885437012,
"learning_rate": 8.251242898328071e-08,
"loss": 3.2175,
"num_input_tokens_seen": 5813808,
"step": 8890
},
{
"epoch": 0.9750082209799408,
"grad_norm": 6.138535499572754,
"learning_rate": 7.905500090385487e-08,
"loss": 2.9364,
"num_input_tokens_seen": 5816552,
"step": 8895
},
{
"epoch": 0.9755562863093281,
"grad_norm": 8.328486442565918,
"learning_rate": 7.567145315053314e-08,
"loss": 3.163,
"num_input_tokens_seen": 5820568,
"step": 8900
},
{
"epoch": 0.9761043516387153,
"grad_norm": 9.473198890686035,
"learning_rate": 7.236179575414448e-08,
"loss": 3.2253,
"num_input_tokens_seen": 5823808,
"step": 8905
},
{
"epoch": 0.9766524169681026,
"grad_norm": 5.804590225219727,
"learning_rate": 6.912603852645138e-08,
"loss": 3.0782,
"num_input_tokens_seen": 5826744,
"step": 8910
},
{
"epoch": 0.9772004822974899,
"grad_norm": 5.613870620727539,
"learning_rate": 6.596419106014163e-08,
"loss": 2.9843,
"num_input_tokens_seen": 5831144,
"step": 8915
},
{
"epoch": 0.9777485476268771,
"grad_norm": 8.519886016845703,
"learning_rate": 6.28762627287921e-08,
"loss": 3.0685,
"num_input_tokens_seen": 5834792,
"step": 8920
},
{
"epoch": 0.9782966129562644,
"grad_norm": 7.168541431427002,
"learning_rate": 5.986226268683282e-08,
"loss": 3.2515,
"num_input_tokens_seen": 5838368,
"step": 8925
},
{
"epoch": 0.9788446782856517,
"grad_norm": 10.949654579162598,
"learning_rate": 5.692219986953573e-08,
"loss": 2.9654,
"num_input_tokens_seen": 5842120,
"step": 8930
},
{
"epoch": 0.9793927436150389,
"grad_norm": 6.906786918640137,
"learning_rate": 5.4056082992973155e-08,
"loss": 3.0675,
"num_input_tokens_seen": 5845248,
"step": 8935
},
{
"epoch": 0.9799408089444261,
"grad_norm": 5.457529067993164,
"learning_rate": 5.1263920553998315e-08,
"loss": 2.9989,
"num_input_tokens_seen": 5848536,
"step": 8940
},
{
"epoch": 0.9804888742738135,
"grad_norm": 9.393891334533691,
"learning_rate": 4.854572083022313e-08,
"loss": 3.1355,
"num_input_tokens_seen": 5851824,
"step": 8945
},
{
"epoch": 0.9810369396032007,
"grad_norm": 8.42390251159668,
"learning_rate": 4.5901491879984934e-08,
"loss": 3.0677,
"num_input_tokens_seen": 5855152,
"step": 8950
},
{
"epoch": 0.9815850049325879,
"grad_norm": 7.749826908111572,
"learning_rate": 4.3331241542340916e-08,
"loss": 3.1391,
"num_input_tokens_seen": 5858576,
"step": 8955
},
{
"epoch": 0.9821330702619753,
"grad_norm": 8.214120864868164,
"learning_rate": 4.083497743701259e-08,
"loss": 2.8317,
"num_input_tokens_seen": 5861528,
"step": 8960
},
{
"epoch": 0.9826811355913625,
"grad_norm": 6.369811058044434,
"learning_rate": 3.8412706964402465e-08,
"loss": 2.9487,
"num_input_tokens_seen": 5865128,
"step": 8965
},
{
"epoch": 0.9832292009207497,
"grad_norm": 8.29269027709961,
"learning_rate": 3.606443730554132e-08,
"loss": 3.0666,
"num_input_tokens_seen": 5867928,
"step": 8970
},
{
"epoch": 0.9837772662501371,
"grad_norm": 7.444830417633057,
"learning_rate": 3.379017542207707e-08,
"loss": 3.0067,
"num_input_tokens_seen": 5870968,
"step": 8975
},
{
"epoch": 0.9843253315795243,
"grad_norm": 7.021453380584717,
"learning_rate": 3.1589928056263704e-08,
"loss": 3.1972,
"num_input_tokens_seen": 5874496,
"step": 8980
},
{
"epoch": 0.9848733969089115,
"grad_norm": 7.41176176071167,
"learning_rate": 2.9463701730922388e-08,
"loss": 2.826,
"num_input_tokens_seen": 5878088,
"step": 8985
},
{
"epoch": 0.9854214622382989,
"grad_norm": 9.515088081359863,
"learning_rate": 2.7411502749441488e-08,
"loss": 3.1693,
"num_input_tokens_seen": 5881752,
"step": 8990
},
{
"epoch": 0.9859695275676861,
"grad_norm": 8.658610343933105,
"learning_rate": 2.5433337195743258e-08,
"loss": 2.8453,
"num_input_tokens_seen": 5884816,
"step": 8995
},
{
"epoch": 0.9865175928970733,
"grad_norm": 7.5331830978393555,
"learning_rate": 2.3529210934272738e-08,
"loss": 2.8423,
"num_input_tokens_seen": 5887864,
"step": 9000
},
{
"epoch": 0.9870656582264606,
"grad_norm": 8.601006507873535,
"learning_rate": 2.2059222016279636e-08,
"loss": 3.5074,
"num_input_tokens_seen": 5892776,
"step": 9005
},
{
"epoch": 0.9876137235558479,
"grad_norm": 9.700572967529297,
"learning_rate": 2.0288380558580732e-08,
"loss": 2.9729,
"num_input_tokens_seen": 5895976,
"step": 9010
},
{
"epoch": 0.9881617888852351,
"grad_norm": 7.793155193328857,
"learning_rate": 1.859159364578089e-08,
"loss": 3.1164,
"num_input_tokens_seen": 5897952,
"step": 9015
},
{
"epoch": 0.9887098542146224,
"grad_norm": 6.612551212310791,
"learning_rate": 1.696886630815908e-08,
"loss": 2.9729,
"num_input_tokens_seen": 5901264,
"step": 9020
},
{
"epoch": 0.9892579195440097,
"grad_norm": 7.382999897003174,
"learning_rate": 1.5420203356431018e-08,
"loss": 3.2611,
"num_input_tokens_seen": 5904096,
"step": 9025
},
{
"epoch": 0.9898059848733969,
"grad_norm": 6.810866832733154,
"learning_rate": 1.3945609381743607e-08,
"loss": 2.8127,
"num_input_tokens_seen": 5907072,
"step": 9030
},
{
"epoch": 0.9903540502027842,
"grad_norm": 7.927409648895264,
"learning_rate": 1.2545088755658296e-08,
"loss": 3.2365,
"num_input_tokens_seen": 5910056,
"step": 9035
},
{
"epoch": 0.9909021155321714,
"grad_norm": 7.214841842651367,
"learning_rate": 1.121864563014552e-08,
"loss": 3.0081,
"num_input_tokens_seen": 5913112,
"step": 9040
},
{
"epoch": 0.9914501808615587,
"grad_norm": 8.652878761291504,
"learning_rate": 9.966283937559716e-09,
"loss": 3.0332,
"num_input_tokens_seen": 5916360,
"step": 9045
},
{
"epoch": 0.991998246190946,
"grad_norm": 8.960352897644043,
"learning_rate": 8.78800739063379e-09,
"loss": 2.6109,
"num_input_tokens_seen": 5918704,
"step": 9050
},
{
"epoch": 0.9925463115203332,
"grad_norm": 7.337709903717041,
"learning_rate": 7.683819482479094e-09,
"loss": 2.7987,
"num_input_tokens_seen": 5921928,
"step": 9055
},
{
"epoch": 0.9930943768497205,
"grad_norm": 7.972464561462402,
"learning_rate": 6.653723486549357e-09,
"loss": 3.1164,
"num_input_tokens_seen": 5924176,
"step": 9060
},
{
"epoch": 0.9936424421791078,
"grad_norm": 5.17326021194458,
"learning_rate": 5.69772245666289e-09,
"loss": 2.8857,
"num_input_tokens_seen": 5927832,
"step": 9065
},
{
"epoch": 0.994190507508495,
"grad_norm": 9.227761268615723,
"learning_rate": 4.815819226960949e-09,
"loss": 3.0089,
"num_input_tokens_seen": 5931264,
"step": 9070
},
{
"epoch": 0.9947385728378823,
"grad_norm": 8.926158905029297,
"learning_rate": 4.008016411927162e-09,
"loss": 3.3191,
"num_input_tokens_seen": 5933904,
"step": 9075
},
{
"epoch": 0.9952866381672696,
"grad_norm": 10.433160781860352,
"learning_rate": 3.274316406362554e-09,
"loss": 3.447,
"num_input_tokens_seen": 5936464,
"step": 9080
},
{
"epoch": 0.9958347034966568,
"grad_norm": 7.052779197692871,
"learning_rate": 2.6147213853855436e-09,
"loss": 3.0385,
"num_input_tokens_seen": 5939544,
"step": 9085
},
{
"epoch": 0.996382768826044,
"grad_norm": 5.819647789001465,
"learning_rate": 2.0292333044236166e-09,
"loss": 3.3745,
"num_input_tokens_seen": 5943312,
"step": 9090
},
{
"epoch": 0.9969308341554314,
"grad_norm": 7.4259748458862305,
"learning_rate": 1.5178538992050018e-09,
"loss": 2.8346,
"num_input_tokens_seen": 5946248,
"step": 9095
},
{
"epoch": 0.9974788994848186,
"grad_norm": 9.022146224975586,
"learning_rate": 1.0805846857642188e-09,
"loss": 2.969,
"num_input_tokens_seen": 5949520,
"step": 9100
},
{
"epoch": 0.9980269648142058,
"grad_norm": 7.631455898284912,
"learning_rate": 7.174269604171002e-10,
"loss": 3.0908,
"num_input_tokens_seen": 5953392,
"step": 9105
},
{
"epoch": 0.9985750301435932,
"grad_norm": 8.837788581848145,
"learning_rate": 4.283817997829953e-10,
"loss": 2.8613,
"num_input_tokens_seen": 5957048,
"step": 9110
},
{
"epoch": 0.9991230954729804,
"grad_norm": 6.420173645019531,
"learning_rate": 2.1345006075979e-10,
"loss": 2.8579,
"num_input_tokens_seen": 5959744,
"step": 9115
},
{
"epoch": 0.9996711608023676,
"grad_norm": 8.133180618286133,
"learning_rate": 7.263238052668264e-11,
"loss": 3.1424,
"num_input_tokens_seen": 5962752,
"step": 9120
}
],
"logging_steps": 5,
"max_steps": 9123,
"num_input_tokens_seen": 5964208,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.722124677282202e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}