{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 100000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001,
"grad_norm": 3.9445953369140625,
"learning_rate": 2.97e-05,
"loss": 6.7008,
"num_input_tokens_seen": 6553600,
"step": 100,
"train_runtime": 61.1942,
"train_tokens_per_second": 107095.166
},
{
"epoch": 0.002,
"grad_norm": 0.6828203797340393,
"learning_rate": 5.97e-05,
"loss": 3.3177,
"num_input_tokens_seen": 13107200,
"step": 200,
"train_runtime": 107.6856,
"train_tokens_per_second": 121717.274
},
{
"epoch": 0.003,
"grad_norm": 16.05720329284668,
"learning_rate": 8.969999999999998e-05,
"loss": 3.0024,
"num_input_tokens_seen": 19660800,
"step": 300,
"train_runtime": 154.3564,
"train_tokens_per_second": 127372.748
},
{
"epoch": 0.004,
"grad_norm": 13.74783706665039,
"learning_rate": 0.0001197,
"loss": 2.6797,
"num_input_tokens_seen": 26214400,
"step": 400,
"train_runtime": 200.698,
"train_tokens_per_second": 130616.167
},
{
"epoch": 0.005,
"grad_norm": 12.893468856811523,
"learning_rate": 0.00014969999999999998,
"loss": 2.4588,
"num_input_tokens_seen": 32768000,
"step": 500,
"train_runtime": 252.1632,
"train_tokens_per_second": 129947.566
},
{
"epoch": 0.006,
"grad_norm": 9.021939277648926,
"learning_rate": 0.00017969999999999998,
"loss": 2.276,
"num_input_tokens_seen": 39321600,
"step": 600,
"train_runtime": 299.2712,
"train_tokens_per_second": 131391.184
},
{
"epoch": 0.007,
"grad_norm": 8.669090270996094,
"learning_rate": 0.00020969999999999997,
"loss": 2.1203,
"num_input_tokens_seen": 45875200,
"step": 700,
"train_runtime": 346.3366,
"train_tokens_per_second": 132458.429
},
{
"epoch": 0.008,
"grad_norm": 7.335177898406982,
"learning_rate": 0.0002397,
"loss": 1.9886,
"num_input_tokens_seen": 52428800,
"step": 800,
"train_runtime": 393.5299,
"train_tokens_per_second": 133226.965
},
{
"epoch": 0.009,
"grad_norm": 6.051175117492676,
"learning_rate": 0.0002697,
"loss": 1.9128,
"num_input_tokens_seen": 58982400,
"step": 900,
"train_runtime": 440.0136,
"train_tokens_per_second": 134046.765
},
{
"epoch": 0.01,
"grad_norm": 5.503482818603516,
"learning_rate": 0.00029969999999999997,
"loss": 1.8296,
"num_input_tokens_seen": 65536000,
"step": 1000,
"train_runtime": 492.2662,
"train_tokens_per_second": 133131.222
},
{
"epoch": 0.011,
"grad_norm": 2.8459227085113525,
"learning_rate": 0.00029999925978027874,
"loss": 1.779,
"num_input_tokens_seen": 72089600,
"step": 1100,
"train_runtime": 538.0301,
"train_tokens_per_second": 133988.032
},
{
"epoch": 0.012,
"grad_norm": 2.292707920074463,
"learning_rate": 0.0002999970091452017,
"loss": 1.7037,
"num_input_tokens_seen": 78643200,
"step": 1200,
"train_runtime": 585.618,
"train_tokens_per_second": 134290.951
},
{
"epoch": 0.013,
"grad_norm": 3.362025737762451,
"learning_rate": 0.00029999324804190795,
"loss": 1.6688,
"num_input_tokens_seen": 85196800,
"step": 1300,
"train_runtime": 632.1008,
"train_tokens_per_second": 134783.565
},
{
"epoch": 0.014,
"grad_norm": 2.2756998538970947,
"learning_rate": 0.0002999879765082716,
"loss": 1.6397,
"num_input_tokens_seen": 91750400,
"step": 1400,
"train_runtime": 684.3545,
"train_tokens_per_second": 134068.525
},
{
"epoch": 0.015,
"grad_norm": 2.5730831623077393,
"learning_rate": 0.000299981194597377,
"loss": 1.605,
"num_input_tokens_seen": 98304000,
"step": 1500,
"train_runtime": 730.5087,
"train_tokens_per_second": 134569.247
},
{
"epoch": 0.016,
"grad_norm": 1.7514433860778809,
"learning_rate": 0.0002999729023775179,
"loss": 1.5838,
"num_input_tokens_seen": 104857600,
"step": 1600,
"train_runtime": 781.9407,
"train_tokens_per_second": 134099.179
},
{
"epoch": 0.017,
"grad_norm": 1.8343929052352905,
"learning_rate": 0.0002999630999321969,
"loss": 1.6037,
"num_input_tokens_seen": 111411200,
"step": 1700,
"train_runtime": 824.7241,
"train_tokens_per_second": 135089.057
},
{
"epoch": 0.018,
"grad_norm": 1.5672227144241333,
"learning_rate": 0.00029995178736012443,
"loss": 1.5627,
"num_input_tokens_seen": 117964800,
"step": 1800,
"train_runtime": 871.9564,
"train_tokens_per_second": 135287.497
},
{
"epoch": 0.019,
"grad_norm": 1.6202061176300049,
"learning_rate": 0.0002999389647752181,
"loss": 1.5398,
"num_input_tokens_seen": 124518400,
"step": 1900,
"train_runtime": 923.402,
"train_tokens_per_second": 134847.439
},
{
"epoch": 0.02,
"grad_norm": 1.5145666599273682,
"learning_rate": 0.00029992463230660104,
"loss": 1.5389,
"num_input_tokens_seen": 131072000,
"step": 2000,
"train_runtime": 968.9283,
"train_tokens_per_second": 135275.229
},
{
"epoch": 0.021,
"grad_norm": 1.0306257009506226,
"learning_rate": 0.00029990879009860117,
"loss": 1.5098,
"num_input_tokens_seen": 137625600,
"step": 2100,
"train_runtime": 1020.8371,
"train_tokens_per_second": 134816.412
},
{
"epoch": 0.022,
"grad_norm": 2.0710599422454834,
"learning_rate": 0.0002998914383107493,
"loss": 1.5081,
"num_input_tokens_seen": 144179200,
"step": 2200,
"train_runtime": 1067.2796,
"train_tokens_per_second": 135090.368
},
{
"epoch": 0.023,
"grad_norm": 1.4022581577301025,
"learning_rate": 0.0002998725771177778,
"loss": 1.521,
"num_input_tokens_seen": 150732800,
"step": 2300,
"train_runtime": 1114.7094,
"train_tokens_per_second": 135221.616
},
{
"epoch": 0.024,
"grad_norm": 1.4328904151916504,
"learning_rate": 0.00029985220670961847,
"loss": 1.4855,
"num_input_tokens_seen": 157286400,
"step": 2400,
"train_runtime": 1160.6217,
"train_tokens_per_second": 135519.092
},
{
"epoch": 0.025,
"grad_norm": 1.3760366439819336,
"learning_rate": 0.0002998303272914014,
"loss": 1.4966,
"num_input_tokens_seen": 163840000,
"step": 2500,
"train_runtime": 1212.6489,
"train_tokens_per_second": 135109.18
},
{
"epoch": 0.026,
"grad_norm": 0.9530190825462341,
"learning_rate": 0.00029980693908345185,
"loss": 1.4795,
"num_input_tokens_seen": 170393600,
"step": 2600,
"train_runtime": 1258.3106,
"train_tokens_per_second": 135414.576
},
{
"epoch": 0.027,
"grad_norm": 0.8715839385986328,
"learning_rate": 0.00029978204232128895,
"loss": 1.4601,
"num_input_tokens_seen": 176947200,
"step": 2700,
"train_runtime": 1304.6837,
"train_tokens_per_second": 135624.597
},
{
"epoch": 0.028,
"grad_norm": 1.1879854202270508,
"learning_rate": 0.0002997556372556227,
"loss": 1.487,
"num_input_tokens_seen": 183500800,
"step": 2800,
"train_runtime": 1358.2195,
"train_tokens_per_second": 135103.938
},
{
"epoch": 0.029,
"grad_norm": 1.0949848890304565,
"learning_rate": 0.0002997277241523519,
"loss": 1.4658,
"num_input_tokens_seen": 190054400,
"step": 2900,
"train_runtime": 1404.4203,
"train_tokens_per_second": 135325.869
},
{
"epoch": 0.03,
"grad_norm": 1.465809941291809,
"learning_rate": 0.00029969830329256125,
"loss": 1.4463,
"num_input_tokens_seen": 196608000,
"step": 3000,
"train_runtime": 1451.3838,
"train_tokens_per_second": 135462.45
},
{
"epoch": 0.031,
"grad_norm": 0.9500088095664978,
"learning_rate": 0.00029966737497251836,
"loss": 1.4533,
"num_input_tokens_seen": 203161600,
"step": 3100,
"train_runtime": 1496.7114,
"train_tokens_per_second": 135738.657
},
{
"epoch": 0.032,
"grad_norm": 1.3393683433532715,
"learning_rate": 0.0002996349395036711,
"loss": 1.4402,
"num_input_tokens_seen": 209715200,
"step": 3200,
"train_runtime": 1549.2536,
"train_tokens_per_second": 135365.316
},
{
"epoch": 0.033,
"grad_norm": 0.7998270988464355,
"learning_rate": 0.00029960099721264435,
"loss": 1.4467,
"num_input_tokens_seen": 216268800,
"step": 3300,
"train_runtime": 1596.5035,
"train_tokens_per_second": 135464.03
},
{
"epoch": 0.034,
"grad_norm": 0.8441318273544312,
"learning_rate": 0.0002995655484412365,
"loss": 1.4353,
"num_input_tokens_seen": 222822400,
"step": 3400,
"train_runtime": 1642.6114,
"train_tokens_per_second": 135651.317
},
{
"epoch": 0.035,
"grad_norm": 0.7577129006385803,
"learning_rate": 0.00029952859354641636,
"loss": 1.4253,
"num_input_tokens_seen": 229376000,
"step": 3500,
"train_runtime": 1690.0779,
"train_tokens_per_second": 135719.187
},
{
"epoch": 0.036,
"grad_norm": 0.8359817862510681,
"learning_rate": 0.00029949013290031924,
"loss": 1.4348,
"num_input_tokens_seen": 235929600,
"step": 3600,
"train_runtime": 1736.0232,
"train_tokens_per_second": 135902.33
},
{
"epoch": 0.037,
"grad_norm": 0.7565376162528992,
"learning_rate": 0.00029945016689024353,
"loss": 1.4114,
"num_input_tokens_seen": 242483200,
"step": 3700,
"train_runtime": 1788.0113,
"train_tokens_per_second": 135616.148
},
{
"epoch": 0.038,
"grad_norm": 0.9537010788917542,
"learning_rate": 0.0002994086959186464,
"loss": 1.4134,
"num_input_tokens_seen": 249036800,
"step": 3800,
"train_runtime": 1835.9254,
"train_tokens_per_second": 135646.47
},
{
"epoch": 0.039,
"grad_norm": 0.8911266922950745,
"learning_rate": 0.00029936572040314014,
"loss": 1.4224,
"num_input_tokens_seen": 255590400,
"step": 3900,
"train_runtime": 1882.537,
"train_tokens_per_second": 135769.123
},
{
"epoch": 0.04,
"grad_norm": 0.7832906246185303,
"learning_rate": 0.0002993212407764877,
"loss": 1.4177,
"num_input_tokens_seen": 262144000,
"step": 4000,
"train_runtime": 1928.8118,
"train_tokens_per_second": 135909.579
},
{
"epoch": 0.041,
"grad_norm": 0.8426671624183655,
"learning_rate": 0.00029927525748659834,
"loss": 1.4194,
"num_input_tokens_seen": 268697600,
"step": 4100,
"train_runtime": 1981.7143,
"train_tokens_per_second": 135588.467
},
{
"epoch": 0.042,
"grad_norm": 0.9675344824790955,
"learning_rate": 0.0002992277709965234,
"loss": 1.4059,
"num_input_tokens_seen": 275251200,
"step": 4200,
"train_runtime": 2027.927,
"train_tokens_per_second": 135730.33
},
{
"epoch": 0.043,
"grad_norm": 1.1866440773010254,
"learning_rate": 0.0002991787817844513,
"loss": 1.4065,
"num_input_tokens_seen": 281804800,
"step": 4300,
"train_runtime": 2074.708,
"train_tokens_per_second": 135828.659
},
{
"epoch": 0.044,
"grad_norm": 0.8417257070541382,
"learning_rate": 0.0002991282903437028,
"loss": 1.397,
"num_input_tokens_seen": 288358400,
"step": 4400,
"train_runtime": 2126.0513,
"train_tokens_per_second": 135630.972
},
{
"epoch": 0.045,
"grad_norm": 0.8226633071899414,
"learning_rate": 0.0002990762971827262,
"loss": 1.3996,
"num_input_tokens_seen": 294912000,
"step": 4500,
"train_runtime": 2172.3837,
"train_tokens_per_second": 135755.024
},
{
"epoch": 0.046,
"grad_norm": 0.8411224484443665,
"learning_rate": 0.00029902280282509197,
"loss": 1.4002,
"num_input_tokens_seen": 301465600,
"step": 4600,
"train_runtime": 2220.1775,
"train_tokens_per_second": 135784.456
},
{
"epoch": 0.047,
"grad_norm": 0.7082719802856445,
"learning_rate": 0.0002989678078094878,
"loss": 1.3804,
"num_input_tokens_seen": 308019200,
"step": 4700,
"train_runtime": 2266.6848,
"train_tokens_per_second": 135889.739
},
{
"epoch": 0.048,
"grad_norm": 0.7628137469291687,
"learning_rate": 0.00029891131268971284,
"loss": 1.3795,
"num_input_tokens_seen": 314572800,
"step": 4800,
"train_runtime": 2318.5885,
"train_tokens_per_second": 135674.269
},
{
"epoch": 0.049,
"grad_norm": 0.7231079936027527,
"learning_rate": 0.0002988533180346723,
"loss": 1.3789,
"num_input_tokens_seen": 321126400,
"step": 4900,
"train_runtime": 2364.3453,
"train_tokens_per_second": 135820.432
},
{
"epoch": 0.05,
"grad_norm": 0.7210503816604614,
"learning_rate": 0.0002987938244283717,
"loss": 1.3641,
"num_input_tokens_seen": 327680000,
"step": 5000,
"train_runtime": 2410.3286,
"train_tokens_per_second": 135948.267
},
{
"epoch": 0.051,
"grad_norm": 0.729364275932312,
"learning_rate": 0.00029873283246991105,
"loss": 1.3756,
"num_input_tokens_seen": 334233600,
"step": 5100,
"train_runtime": 2458.4762,
"train_tokens_per_second": 135951.532
},
{
"epoch": 0.052,
"grad_norm": 0.7513293027877808,
"learning_rate": 0.0002986703427734787,
"loss": 1.3778,
"num_input_tokens_seen": 340787200,
"step": 5200,
"train_runtime": 2506.9032,
"train_tokens_per_second": 135939.511
},
{
"epoch": 0.053,
"grad_norm": 0.7382386326789856,
"learning_rate": 0.00029860635596834517,
"loss": 1.3807,
"num_input_tokens_seen": 347340800,
"step": 5300,
"train_runtime": 2559.5035,
"train_tokens_per_second": 135706.321
},
{
"epoch": 0.054,
"grad_norm": 0.5869194269180298,
"learning_rate": 0.0002985408726988569,
"loss": 1.3695,
"num_input_tokens_seen": 353894400,
"step": 5400,
"train_runtime": 2605.4484,
"train_tokens_per_second": 135828.598
},
{
"epoch": 0.055,
"grad_norm": 0.7805973291397095,
"learning_rate": 0.0002984738936244296,
"loss": 1.3746,
"num_input_tokens_seen": 360448000,
"step": 5500,
"train_runtime": 2655.8515,
"train_tokens_per_second": 135718.431
},
{
"epoch": 0.056,
"grad_norm": 0.6918448209762573,
"learning_rate": 0.0002984054194195419,
"loss": 1.3855,
"num_input_tokens_seen": 367001600,
"step": 5600,
"train_runtime": 2703.0299,
"train_tokens_per_second": 135774.155
},
{
"epoch": 0.057,
"grad_norm": 0.6129201054573059,
"learning_rate": 0.0002983354507737283,
"loss": 1.3816,
"num_input_tokens_seen": 373555200,
"step": 5700,
"train_runtime": 2750.071,
"train_tokens_per_second": 135834.747
},
{
"epoch": 0.058,
"grad_norm": 0.7457948923110962,
"learning_rate": 0.00029826398839157215,
"loss": 1.3748,
"num_input_tokens_seen": 380108800,
"step": 5800,
"train_runtime": 2795.4164,
"train_tokens_per_second": 135975.735
},
{
"epoch": 0.059,
"grad_norm": 0.6171481013298035,
"learning_rate": 0.000298191032992699,
"loss": 1.3725,
"num_input_tokens_seen": 386662400,
"step": 5900,
"train_runtime": 2842.5021,
"train_tokens_per_second": 136028.889
},
{
"epoch": 0.06,
"grad_norm": 0.6233596205711365,
"learning_rate": 0.0002981165853117688,
"loss": 1.3624,
"num_input_tokens_seen": 393216000,
"step": 6000,
"train_runtime": 2892.8273,
"train_tokens_per_second": 135927.922
},
{
"epoch": 0.061,
"grad_norm": 0.5645745396614075,
"learning_rate": 0.000298040646098469,
"loss": 1.356,
"num_input_tokens_seen": 399769600,
"step": 6100,
"train_runtime": 2940.1153,
"train_tokens_per_second": 135970.721
},
{
"epoch": 0.062,
"grad_norm": 0.6580554246902466,
"learning_rate": 0.0002979632161175064,
"loss": 1.3627,
"num_input_tokens_seen": 406323200,
"step": 6200,
"train_runtime": 2986.9073,
"train_tokens_per_second": 136034.754
},
{
"epoch": 0.063,
"grad_norm": 0.6815545558929443,
"learning_rate": 0.0002978842961486003,
"loss": 1.3562,
"num_input_tokens_seen": 412876800,
"step": 6300,
"train_runtime": 3038.4238,
"train_tokens_per_second": 135885.191
},
{
"epoch": 0.064,
"grad_norm": 0.9602898955345154,
"learning_rate": 0.0002978038869864738,
"loss": 1.3562,
"num_input_tokens_seen": 419430400,
"step": 6400,
"train_runtime": 3085.1228,
"train_tokens_per_second": 135952.578
},
{
"epoch": 0.065,
"grad_norm": 0.7086384892463684,
"learning_rate": 0.0002977219894408463,
"loss": 1.3579,
"num_input_tokens_seen": 425984000,
"step": 6500,
"train_runtime": 3130.8346,
"train_tokens_per_second": 136060.844
},
{
"epoch": 0.066,
"grad_norm": 0.5864439010620117,
"learning_rate": 0.0002976386043364251,
"loss": 1.3563,
"num_input_tokens_seen": 432537600,
"step": 6600,
"train_runtime": 3182.4893,
"train_tokens_per_second": 135911.72
},
{
"epoch": 0.067,
"grad_norm": 0.6041991114616394,
"learning_rate": 0.00029755373251289733,
"loss": 1.3753,
"num_input_tokens_seen": 439091200,
"step": 6700,
"train_runtime": 3229.4118,
"train_tokens_per_second": 135966.308
},
{
"epoch": 0.068,
"grad_norm": 0.7153160572052002,
"learning_rate": 0.0002974673748249213,
"loss": 1.3475,
"num_input_tokens_seen": 445644800,
"step": 6800,
"train_runtime": 3276.7034,
"train_tokens_per_second": 136004.008
},
{
"epoch": 0.069,
"grad_norm": 0.5409119725227356,
"learning_rate": 0.00029737953214211804,
"loss": 1.3464,
"num_input_tokens_seen": 452198400,
"step": 6900,
"train_runtime": 3324.3119,
"train_tokens_per_second": 136027.67
},
{
"epoch": 0.07,
"grad_norm": 0.6369441151618958,
"learning_rate": 0.0002972902053490623,
"loss": 1.3546,
"num_input_tokens_seen": 458752000,
"step": 7000,
"train_runtime": 3370.6322,
"train_tokens_per_second": 136102.657
},
{
"epoch": 0.071,
"grad_norm": 0.8589248061180115,
"learning_rate": 0.00029719939534527393,
"loss": 1.3479,
"num_input_tokens_seen": 465305600,
"step": 7100,
"train_runtime": 3424.7139,
"train_tokens_per_second": 135867.0
},
{
"epoch": 0.072,
"grad_norm": 0.8014613389968872,
"learning_rate": 0.00029710710304520866,
"loss": 1.3667,
"num_input_tokens_seen": 471859200,
"step": 7200,
"train_runtime": 3472.985,
"train_tokens_per_second": 135865.601
},
{
"epoch": 0.073,
"grad_norm": 0.5970280766487122,
"learning_rate": 0.00029701332937824885,
"loss": 1.3423,
"num_input_tokens_seen": 478412800,
"step": 7300,
"train_runtime": 3519.3052,
"train_tokens_per_second": 135939.558
},
{
"epoch": 0.074,
"grad_norm": 0.6963617205619812,
"learning_rate": 0.0002969180752886944,
"loss": 1.3443,
"num_input_tokens_seen": 484966400,
"step": 7400,
"train_runtime": 3565.8739,
"train_tokens_per_second": 136002.118
},
{
"epoch": 0.075,
"grad_norm": 0.5769393444061279,
"learning_rate": 0.0002968213417357529,
"loss": 1.3576,
"num_input_tokens_seen": 491520000,
"step": 7500,
"train_runtime": 3611.5043,
"train_tokens_per_second": 136098.411
},
{
"epoch": 0.076,
"grad_norm": 0.5492929816246033,
"learning_rate": 0.00029672312969353015,
"loss": 1.3422,
"num_input_tokens_seen": 498073600,
"step": 7600,
"train_runtime": 3664.3633,
"train_tokens_per_second": 135923.642
},
{
"epoch": 0.077,
"grad_norm": 0.8065637946128845,
"learning_rate": 0.00029662344015102027,
"loss": 1.3395,
"num_input_tokens_seen": 504627200,
"step": 7700,
"train_runtime": 3711.2689,
"train_tokens_per_second": 135971.608
},
{
"epoch": 0.078,
"grad_norm": 0.552871584892273,
"learning_rate": 0.00029652227411209594,
"loss": 1.3427,
"num_input_tokens_seen": 511180800,
"step": 7800,
"train_runtime": 3758.1209,
"train_tokens_per_second": 136020.319
},
{
"epoch": 0.079,
"grad_norm": 0.6378001570701599,
"learning_rate": 0.0002964196325954979,
"loss": 1.3339,
"num_input_tokens_seen": 517734400,
"step": 7900,
"train_runtime": 3804.2295,
"train_tokens_per_second": 136094.417
},
{
"epoch": 0.08,
"grad_norm": 0.6196131706237793,
"learning_rate": 0.0002963155166348253,
"loss": 1.341,
"num_input_tokens_seen": 524288000,
"step": 8000,
"train_runtime": 3855.6562,
"train_tokens_per_second": 135978.93
},
{
"epoch": 0.081,
"grad_norm": 0.5841253399848938,
"learning_rate": 0.0002962099272785246,
"loss": 1.3366,
"num_input_tokens_seen": 530841600,
"step": 8100,
"train_runtime": 3903.5348,
"train_tokens_per_second": 135989.977
},
{
"epoch": 0.082,
"grad_norm": 0.5912770628929138,
"learning_rate": 0.0002961028655898794,
"loss": 1.3417,
"num_input_tokens_seen": 537395200,
"step": 8200,
"train_runtime": 3951.3698,
"train_tokens_per_second": 136002.255
},
{
"epoch": 0.083,
"grad_norm": 0.5480249524116516,
"learning_rate": 0.0002959943326469998,
"loss": 1.3419,
"num_input_tokens_seen": 543948800,
"step": 8300,
"train_runtime": 3997.3554,
"train_tokens_per_second": 136077.166
},
{
"epoch": 0.084,
"grad_norm": 0.49880343675613403,
"learning_rate": 0.0002958843295428112,
"loss": 1.3165,
"num_input_tokens_seen": 550502400,
"step": 8400,
"train_runtime": 4044.3967,
"train_tokens_per_second": 136114.838
},
{
"epoch": 0.085,
"grad_norm": 0.5670176148414612,
"learning_rate": 0.0002957728573850438,
"loss": 1.3314,
"num_input_tokens_seen": 557056000,
"step": 8500,
"train_runtime": 4095.7201,
"train_tokens_per_second": 136009.294
},
{
"epoch": 0.086,
"grad_norm": 2.3274426460266113,
"learning_rate": 0.0002956599172962209,
"loss": 1.3323,
"num_input_tokens_seen": 563609600,
"step": 8600,
"train_runtime": 4143.1443,
"train_tokens_per_second": 136034.268
},
{
"epoch": 0.087,
"grad_norm": 0.7660558819770813,
"learning_rate": 0.0002955455104136479,
"loss": 1.3382,
"num_input_tokens_seen": 570163200,
"step": 8700,
"train_runtime": 4190.7065,
"train_tokens_per_second": 136054.194
},
{
"epoch": 0.088,
"grad_norm": 0.5114762783050537,
"learning_rate": 0.00029542963788940096,
"loss": 1.3252,
"num_input_tokens_seen": 576716800,
"step": 8800,
"train_runtime": 4237.8545,
"train_tokens_per_second": 136086.974
},
{
"epoch": 0.089,
"grad_norm": 0.6698548197746277,
"learning_rate": 0.00029531230089031505,
"loss": 1.3449,
"num_input_tokens_seen": 583270400,
"step": 8900,
"train_runtime": 4285.2299,
"train_tokens_per_second": 136111.81
},
{
"epoch": 0.09,
"grad_norm": 0.5562598705291748,
"learning_rate": 0.0002951935005979724,
"loss": 1.3204,
"num_input_tokens_seen": 589824000,
"step": 9000,
"train_runtime": 4336.4907,
"train_tokens_per_second": 136014.126
},
{
"epoch": 0.091,
"grad_norm": 0.6327181458473206,
"learning_rate": 0.0002950732382086907,
"loss": 1.3178,
"num_input_tokens_seen": 596377600,
"step": 9100,
"train_runtime": 4383.0811,
"train_tokens_per_second": 136063.555
},
{
"epoch": 0.092,
"grad_norm": 0.6857426166534424,
"learning_rate": 0.0002949515149335108,
"loss": 1.3332,
"num_input_tokens_seen": 602931200,
"step": 9200,
"train_runtime": 4431.4231,
"train_tokens_per_second": 136058.142
},
{
"epoch": 0.093,
"grad_norm": 0.6040679812431335,
"learning_rate": 0.0002948283319981848,
"loss": 1.307,
"num_input_tokens_seen": 609484800,
"step": 9300,
"train_runtime": 4478.1663,
"train_tokens_per_second": 136101.423
},
{
"epoch": 0.094,
"grad_norm": 1.0060901641845703,
"learning_rate": 0.00029470369064316354,
"loss": 1.3108,
"num_input_tokens_seen": 616038400,
"step": 9400,
"train_runtime": 4524.7167,
"train_tokens_per_second": 136149.607
},
{
"epoch": 0.095,
"grad_norm": 0.504460871219635,
"learning_rate": 0.00029457759212358397,
"loss": 1.3169,
"num_input_tokens_seen": 622592000,
"step": 9500,
"train_runtime": 4575.869,
"train_tokens_per_second": 136059.84
},
{
"epoch": 0.096,
"grad_norm": 0.5062097907066345,
"learning_rate": 0.00029445003770925686,
"loss": 1.3137,
"num_input_tokens_seen": 629145600,
"step": 9600,
"train_runtime": 4621.4422,
"train_tokens_per_second": 136136.203
},
{
"epoch": 0.097,
"grad_norm": 0.5388786792755127,
"learning_rate": 0.00029432102868465367,
"loss": 1.3128,
"num_input_tokens_seen": 635699200,
"step": 9700,
"train_runtime": 4668.6149,
"train_tokens_per_second": 136164.411
},
{
"epoch": 0.098,
"grad_norm": 0.5705980062484741,
"learning_rate": 0.0002941905663488939,
"loss": 1.3065,
"num_input_tokens_seen": 642252800,
"step": 9800,
"train_runtime": 4715.2389,
"train_tokens_per_second": 136207.903
},
{
"epoch": 0.099,
"grad_norm": 0.5500839352607727,
"learning_rate": 0.0002940586520157318,
"loss": 1.3222,
"num_input_tokens_seen": 648806400,
"step": 9900,
"train_runtime": 4767.1995,
"train_tokens_per_second": 136098.019
},
{
"epoch": 0.1,
"grad_norm": 0.5740068554878235,
"learning_rate": 0.00029392528701354325,
"loss": 1.3173,
"num_input_tokens_seen": 655360000,
"step": 10000,
"train_runtime": 4814.2762,
"train_tokens_per_second": 136128.458
},
{
"epoch": 0.101,
"grad_norm": 0.47691279649734497,
"learning_rate": 0.00029379047268531243,
"loss": 1.3084,
"num_input_tokens_seen": 661913600,
"step": 10100,
"train_runtime": 4861.0919,
"train_tokens_per_second": 136165.622
},
{
"epoch": 0.102,
"grad_norm": 0.5993319153785706,
"learning_rate": 0.00029365421038861795,
"loss": 1.3299,
"num_input_tokens_seen": 668467200,
"step": 10200,
"train_runtime": 4908.6949,
"train_tokens_per_second": 136180.229
},
{
"epoch": 0.103,
"grad_norm": 0.556516170501709,
"learning_rate": 0.0002935165014956198,
"loss": 1.316,
"num_input_tokens_seen": 675020800,
"step": 10300,
"train_runtime": 4956.5309,
"train_tokens_per_second": 136188.156
},
{
"epoch": 0.104,
"grad_norm": 0.6757346391677856,
"learning_rate": 0.0002933773473930448,
"loss": 1.3048,
"num_input_tokens_seen": 681574400,
"step": 10400,
"train_runtime": 5003.7965,
"train_tokens_per_second": 136211.454
},
{
"epoch": 0.105,
"grad_norm": 0.9610360860824585,
"learning_rate": 0.0002932367494821734,
"loss": 1.3043,
"num_input_tokens_seen": 688128000,
"step": 10500,
"train_runtime": 5050.8058,
"train_tokens_per_second": 136241.232
},
{
"epoch": 0.106,
"grad_norm": 0.5780071020126343,
"learning_rate": 0.00029309470917882497,
"loss": 1.3015,
"num_input_tokens_seen": 694681600,
"step": 10600,
"train_runtime": 5104.0171,
"train_tokens_per_second": 136104.873
},
{
"epoch": 0.107,
"grad_norm": 0.6387894749641418,
"learning_rate": 0.0002929512279133437,
"loss": 1.3342,
"num_input_tokens_seen": 701235200,
"step": 10700,
"train_runtime": 5151.2508,
"train_tokens_per_second": 136129.112
},
{
"epoch": 0.108,
"grad_norm": 0.48744165897369385,
"learning_rate": 0.0002928063071305844,
"loss": 1.2999,
"num_input_tokens_seen": 707788800,
"step": 10800,
"train_runtime": 5198.4813,
"train_tokens_per_second": 136152.995
},
{
"epoch": 0.109,
"grad_norm": 0.5223510265350342,
"learning_rate": 0.0002926599482898978,
"loss": 1.2996,
"num_input_tokens_seen": 714342400,
"step": 10900,
"train_runtime": 5244.0735,
"train_tokens_per_second": 136218.99
},
{
"epoch": 0.11,
"grad_norm": 0.6020687222480774,
"learning_rate": 0.00029251215286511573,
"loss": 1.3029,
"num_input_tokens_seen": 720896000,
"step": 11000,
"train_runtime": 5291.0983,
"train_tokens_per_second": 136246.948
},
{
"epoch": 0.111,
"grad_norm": 0.5317751169204712,
"learning_rate": 0.00029236292234453647,
"loss": 1.316,
"num_input_tokens_seen": 727449600,
"step": 11100,
"train_runtime": 5342.4851,
"train_tokens_per_second": 136163.15
},
{
"epoch": 0.112,
"grad_norm": 1.2369730472564697,
"learning_rate": 0.0002922122582309097,
"loss": 1.298,
"num_input_tokens_seen": 734003200,
"step": 11200,
"train_runtime": 5391.0041,
"train_tokens_per_second": 136153.338
},
{
"epoch": 0.113,
"grad_norm": 0.5294257998466492,
"learning_rate": 0.0002920601620414215,
"loss": 1.316,
"num_input_tokens_seen": 740556800,
"step": 11300,
"train_runtime": 5437.8422,
"train_tokens_per_second": 136185.784
},
{
"epoch": 0.114,
"grad_norm": 0.5318885445594788,
"learning_rate": 0.0002919066353076786,
"loss": 1.2993,
"num_input_tokens_seen": 747110400,
"step": 11400,
"train_runtime": 5484.1183,
"train_tokens_per_second": 136231.635
},
{
"epoch": 0.115,
"grad_norm": 0.5208443403244019,
"learning_rate": 0.00029175167957569366,
"loss": 1.3066,
"num_input_tokens_seen": 753664000,
"step": 11500,
"train_runtime": 5531.5155,
"train_tokens_per_second": 136249.099
},
{
"epoch": 0.116,
"grad_norm": 0.5068408250808716,
"learning_rate": 0.0002915952964058691,
"loss": 1.3041,
"num_input_tokens_seen": 760217600,
"step": 11600,
"train_runtime": 5578.6188,
"train_tokens_per_second": 136273.445
},
{
"epoch": 0.117,
"grad_norm": 0.6206523776054382,
"learning_rate": 0.00029143748737298173,
"loss": 1.3061,
"num_input_tokens_seen": 766771200,
"step": 11700,
"train_runtime": 5631.31,
"train_tokens_per_second": 136162.136
},
{
"epoch": 0.118,
"grad_norm": 0.5741725564002991,
"learning_rate": 0.00029127825406616677,
"loss": 1.3097,
"num_input_tokens_seen": 773324800,
"step": 11800,
"train_runtime": 5678.817,
"train_tokens_per_second": 136177.096
},
{
"epoch": 0.119,
"grad_norm": 0.5251154899597168,
"learning_rate": 0.0002911175980889019,
"loss": 1.3054,
"num_input_tokens_seen": 779878400,
"step": 11900,
"train_runtime": 5725.8659,
"train_tokens_per_second": 136202.701
},
{
"epoch": 0.12,
"grad_norm": 0.4509083032608032,
"learning_rate": 0.00029095552105899095,
"loss": 1.301,
"num_input_tokens_seen": 786432000,
"step": 12000,
"train_runtime": 5772.0962,
"train_tokens_per_second": 136247.211
},
{
"epoch": 0.121,
"grad_norm": 0.4560108184814453,
"learning_rate": 0.0002907920246085478,
"loss": 1.2981,
"num_input_tokens_seen": 792985600,
"step": 12100,
"train_runtime": 5817.8977,
"train_tokens_per_second": 136301.056
},
{
"epoch": 0.122,
"grad_norm": 1.227121114730835,
"learning_rate": 0.00029062711038397996,
"loss": 1.302,
"num_input_tokens_seen": 799539200,
"step": 12200,
"train_runtime": 5870.3451,
"train_tokens_per_second": 136199.693
},
{
"epoch": 0.123,
"grad_norm": 0.4861258864402771,
"learning_rate": 0.00029046078004597175,
"loss": 1.318,
"num_input_tokens_seen": 806092800,
"step": 12300,
"train_runtime": 5916.8489,
"train_tokens_per_second": 136236.84
},
{
"epoch": 0.124,
"grad_norm": 0.9702387452125549,
"learning_rate": 0.00029029303526946796,
"loss": 1.2869,
"num_input_tokens_seen": 812646400,
"step": 12400,
"train_runtime": 5964.0243,
"train_tokens_per_second": 136258.063
},
{
"epoch": 0.125,
"grad_norm": 0.4712119400501251,
"learning_rate": 0.0002901238777436565,
"loss": 1.2924,
"num_input_tokens_seen": 819200000,
"step": 12500,
"train_runtime": 6009.6089,
"train_tokens_per_second": 136315.026
},
{
"epoch": 0.126,
"grad_norm": 0.4670332372188568,
"learning_rate": 0.00028995330917195184,
"loss": 1.2942,
"num_input_tokens_seen": 825753600,
"step": 12600,
"train_runtime": 6061.3166,
"train_tokens_per_second": 136233.371
},
{
"epoch": 0.127,
"grad_norm": 0.4821685552597046,
"learning_rate": 0.00028978133127197765,
"loss": 1.2856,
"num_input_tokens_seen": 832307200,
"step": 12700,
"train_runtime": 6108.5206,
"train_tokens_per_second": 136253.481
},
{
"epoch": 0.128,
"grad_norm": 0.5634518265724182,
"learning_rate": 0.0002896079457755493,
"loss": 1.2982,
"num_input_tokens_seen": 838860800,
"step": 12800,
"train_runtime": 6155.2503,
"train_tokens_per_second": 136283.785
},
{
"epoch": 0.129,
"grad_norm": 0.45673057436943054,
"learning_rate": 0.000289433154428657,
"loss": 1.2997,
"num_input_tokens_seen": 845414400,
"step": 12900,
"train_runtime": 6202.1106,
"train_tokens_per_second": 136310.758
},
{
"epoch": 0.13,
"grad_norm": 0.4386661648750305,
"learning_rate": 0.0002892569589914476,
"loss": 1.2985,
"num_input_tokens_seen": 851968000,
"step": 13000,
"train_runtime": 6249.4681,
"train_tokens_per_second": 136326.482
},
{
"epoch": 0.131,
"grad_norm": 0.4749270975589752,
"learning_rate": 0.0002890793612382072,
"loss": 1.2946,
"num_input_tokens_seen": 858521600,
"step": 13100,
"train_runtime": 6301.6638,
"train_tokens_per_second": 136237.291
},
{
"epoch": 0.132,
"grad_norm": 0.5405780673027039,
"learning_rate": 0.0002889003629573432,
"loss": 1.2857,
"num_input_tokens_seen": 865075200,
"step": 13200,
"train_runtime": 6349.664,
"train_tokens_per_second": 136239.523
},
{
"epoch": 0.133,
"grad_norm": 0.4045722782611847,
"learning_rate": 0.00028871996595136626,
"loss": 1.3009,
"num_input_tokens_seen": 871628800,
"step": 13300,
"train_runtime": 6396.2349,
"train_tokens_per_second": 136272.169
},
{
"epoch": 0.134,
"grad_norm": 0.5851114392280579,
"learning_rate": 0.0002885381720368723,
"loss": 1.3026,
"num_input_tokens_seen": 878182400,
"step": 13400,
"train_runtime": 6442.8884,
"train_tokens_per_second": 136302.594
},
{
"epoch": 0.135,
"grad_norm": 0.5135608315467834,
"learning_rate": 0.000288354983044524,
"loss": 1.2778,
"num_input_tokens_seen": 884736000,
"step": 13500,
"train_runtime": 6489.2417,
"train_tokens_per_second": 136338.889
},
{
"epoch": 0.136,
"grad_norm": 0.4828953742980957,
"learning_rate": 0.00028817040081903245,
"loss": 1.2864,
"num_input_tokens_seen": 891289600,
"step": 13600,
"train_runtime": 6540.9813,
"train_tokens_per_second": 136262.368
},
{
"epoch": 0.137,
"grad_norm": 0.5756350755691528,
"learning_rate": 0.00028798442721913867,
"loss": 1.2858,
"num_input_tokens_seen": 897843200,
"step": 13700,
"train_runtime": 6588.3179,
"train_tokens_per_second": 136278.063
},
{
"epoch": 0.138,
"grad_norm": 0.5231483578681946,
"learning_rate": 0.00028779706411759465,
"loss": 1.282,
"num_input_tokens_seen": 904396800,
"step": 13800,
"train_runtime": 6635.0521,
"train_tokens_per_second": 136305.909
},
{
"epoch": 0.139,
"grad_norm": 0.5475858449935913,
"learning_rate": 0.00028760831340114484,
"loss": 1.2797,
"num_input_tokens_seen": 910950400,
"step": 13900,
"train_runtime": 6681.4731,
"train_tokens_per_second": 136339.754
},
{
"epoch": 0.14,
"grad_norm": 0.7064163684844971,
"learning_rate": 0.00028741817697050683,
"loss": 1.2927,
"num_input_tokens_seen": 917504000,
"step": 14000,
"train_runtime": 6730.4553,
"train_tokens_per_second": 136321.238
},
{
"epoch": 0.141,
"grad_norm": 0.5267386436462402,
"learning_rate": 0.00028722665674035233,
"loss": 1.2815,
"num_input_tokens_seen": 924057600,
"step": 14100,
"train_runtime": 6782.7717,
"train_tokens_per_second": 136235.987
},
{
"epoch": 0.142,
"grad_norm": 0.5816136598587036,
"learning_rate": 0.0002870337546392879,
"loss": 1.2983,
"num_input_tokens_seen": 930611200,
"step": 14200,
"train_runtime": 6829.7567,
"train_tokens_per_second": 136258.323
},
{
"epoch": 0.143,
"grad_norm": 0.4982451796531677,
"learning_rate": 0.00028683947260983576,
"loss": 1.3026,
"num_input_tokens_seen": 937164800,
"step": 14300,
"train_runtime": 6877.8163,
"train_tokens_per_second": 136259.063
},
{
"epoch": 0.144,
"grad_norm": 0.49408379197120667,
"learning_rate": 0.00028664381260841356,
"loss": 1.2869,
"num_input_tokens_seen": 943718400,
"step": 14400,
"train_runtime": 6923.5994,
"train_tokens_per_second": 136304.593
},
{
"epoch": 0.145,
"grad_norm": 0.4885796904563904,
"learning_rate": 0.0002864467766053154,
"loss": 1.2768,
"num_input_tokens_seen": 950272000,
"step": 14500,
"train_runtime": 6969.9199,
"train_tokens_per_second": 136339.014
},
{
"epoch": 0.146,
"grad_norm": 0.5424348711967468,
"learning_rate": 0.00028624836658469165,
"loss": 1.2806,
"num_input_tokens_seen": 956825600,
"step": 14600,
"train_runtime": 7020.7829,
"train_tokens_per_second": 136284.743
},
{
"epoch": 0.147,
"grad_norm": 0.4333992898464203,
"learning_rate": 0.00028604858454452906,
"loss": 1.2776,
"num_input_tokens_seen": 963379200,
"step": 14700,
"train_runtime": 7066.7012,
"train_tokens_per_second": 136326.58
},
{
"epoch": 0.148,
"grad_norm": 1.3118066787719727,
"learning_rate": 0.00028584743249663057,
"loss": 1.3039,
"num_input_tokens_seen": 969932800,
"step": 14800,
"train_runtime": 7115.8691,
"train_tokens_per_second": 136305.6
},
{
"epoch": 0.149,
"grad_norm": 0.5320950150489807,
"learning_rate": 0.000285644912466595,
"loss": 1.2801,
"num_input_tokens_seen": 976486400,
"step": 14900,
"train_runtime": 7162.6662,
"train_tokens_per_second": 136330.016
},
{
"epoch": 0.15,
"grad_norm": 0.6902542114257812,
"learning_rate": 0.00028544102649379684,
"loss": 1.2832,
"num_input_tokens_seen": 983040000,
"step": 15000,
"train_runtime": 7209.6657,
"train_tokens_per_second": 136350.29
},
{
"epoch": 0.151,
"grad_norm": 0.544683039188385,
"learning_rate": 0.00028523577663136556,
"loss": 1.2948,
"num_input_tokens_seen": 989593600,
"step": 15100,
"train_runtime": 7261.0326,
"train_tokens_per_second": 136288.275
},
{
"epoch": 0.152,
"grad_norm": 0.500091552734375,
"learning_rate": 0.000285029164946165,
"loss": 1.2746,
"num_input_tokens_seen": 996147200,
"step": 15200,
"train_runtime": 7306.6445,
"train_tokens_per_second": 136334.427
},
{
"epoch": 0.153,
"grad_norm": 0.4995329678058624,
"learning_rate": 0.0002848211935187725,
"loss": 1.2893,
"num_input_tokens_seen": 1002700800,
"step": 15300,
"train_runtime": 7353.2711,
"train_tokens_per_second": 136361.19
},
{
"epoch": 0.154,
"grad_norm": 0.42985284328460693,
"learning_rate": 0.0002846118644434581,
"loss": 1.3077,
"num_input_tokens_seen": 1009254400,
"step": 15400,
"train_runtime": 7400.7889,
"train_tokens_per_second": 136371.192
},
{
"epoch": 0.155,
"grad_norm": 0.4847468137741089,
"learning_rate": 0.00028440117982816326,
"loss": 1.2723,
"num_input_tokens_seen": 1015808000,
"step": 15500,
"train_runtime": 7452.7433,
"train_tokens_per_second": 136299.877
},
{
"epoch": 0.156,
"grad_norm": 0.47867411375045776,
"learning_rate": 0.0002841891417944796,
"loss": 1.2754,
"num_input_tokens_seen": 1022361600,
"step": 15600,
"train_runtime": 7498.8195,
"train_tokens_per_second": 136336.339
},
{
"epoch": 0.157,
"grad_norm": 0.43365904688835144,
"learning_rate": 0.0002839757524776279,
"loss": 1.2737,
"num_input_tokens_seen": 1028915200,
"step": 15700,
"train_runtime": 7545.0284,
"train_tokens_per_second": 136369.957
},
{
"epoch": 0.158,
"grad_norm": 0.5739541053771973,
"learning_rate": 0.0002837610140264361,
"loss": 1.286,
"num_input_tokens_seen": 1035468800,
"step": 15800,
"train_runtime": 7597.8039,
"train_tokens_per_second": 136285.275
},
{
"epoch": 0.159,
"grad_norm": 0.4836307168006897,
"learning_rate": 0.0002835449286033182,
"loss": 1.2779,
"num_input_tokens_seen": 1042022400,
"step": 15900,
"train_runtime": 7643.6023,
"train_tokens_per_second": 136326.088
},
{
"epoch": 0.16,
"grad_norm": 0.5712729692459106,
"learning_rate": 0.0002833274983842518,
"loss": 1.2702,
"num_input_tokens_seen": 1048576000,
"step": 16000,
"train_runtime": 7691.0096,
"train_tokens_per_second": 136337.887
},
{
"epoch": 0.161,
"grad_norm": 0.48568034172058105,
"learning_rate": 0.0002831087255587569,
"loss": 1.2696,
"num_input_tokens_seen": 1055129600,
"step": 16100,
"train_runtime": 7737.6132,
"train_tokens_per_second": 136363.705
},
{
"epoch": 0.162,
"grad_norm": 0.5240116715431213,
"learning_rate": 0.0002828886123298734,
"loss": 1.2636,
"num_input_tokens_seen": 1061683200,
"step": 16200,
"train_runtime": 7790.0975,
"train_tokens_per_second": 136286.253
},
{
"epoch": 0.163,
"grad_norm": 0.4505080580711365,
"learning_rate": 0.00028266716091413906,
"loss": 1.2679,
"num_input_tokens_seen": 1068236800,
"step": 16300,
"train_runtime": 7837.0156,
"train_tokens_per_second": 136306.581
},
{
"epoch": 0.164,
"grad_norm": 0.38184958696365356,
"learning_rate": 0.0002824443735415673,
"loss": 1.2801,
"num_input_tokens_seen": 1074790400,
"step": 16400,
"train_runtime": 7884.0198,
"train_tokens_per_second": 136325.178
},
{
"epoch": 0.165,
"grad_norm": 0.860382616519928,
"learning_rate": 0.0002822202524556243,
"loss": 1.2737,
"num_input_tokens_seen": 1081344000,
"step": 16500,
"train_runtime": 7930.486,
"train_tokens_per_second": 136352.803
},
{
"epoch": 0.166,
"grad_norm": 0.771594226360321,
"learning_rate": 0.00028199479991320695,
"loss": 1.2876,
"num_input_tokens_seen": 1087897600,
"step": 16600,
"train_runtime": 7977.0943,
"train_tokens_per_second": 136377.678
},
{
"epoch": 0.167,
"grad_norm": 0.4533759653568268,
"learning_rate": 0.00028176801818461994,
"loss": 1.2769,
"num_input_tokens_seen": 1094451200,
"step": 16700,
"train_runtime": 8024.6165,
"train_tokens_per_second": 136386.73
},
{
"epoch": 0.168,
"grad_norm": 0.548772394657135,
"learning_rate": 0.00028153990955355273,
"loss": 1.2647,
"num_input_tokens_seen": 1101004800,
"step": 16800,
"train_runtime": 8077.0632,
"train_tokens_per_second": 136312.515
},
{
"epoch": 0.169,
"grad_norm": 0.5390068888664246,
"learning_rate": 0.00028131047631705665,
"loss": 1.2799,
"num_input_tokens_seen": 1107558400,
"step": 16900,
"train_runtime": 8123.3347,
"train_tokens_per_second": 136342.824
},
{
"epoch": 0.17,
"grad_norm": 0.4429817795753479,
"learning_rate": 0.00028107972078552187,
"loss": 1.2727,
"num_input_tokens_seen": 1114112000,
"step": 17000,
"train_runtime": 8169.0719,
"train_tokens_per_second": 136381.71
},
{
"epoch": 0.171,
"grad_norm": 0.6212127208709717,
"learning_rate": 0.0002808476452826541,
"loss": 1.2743,
"num_input_tokens_seen": 1120665600,
"step": 17100,
"train_runtime": 8217.1136,
"train_tokens_per_second": 136381.904
},
{
"epoch": 0.172,
"grad_norm": 0.44569867849349976,
"learning_rate": 0.00028061425214545094,
"loss": 1.2628,
"num_input_tokens_seen": 1127219200,
"step": 17200,
"train_runtime": 8268.2495,
"train_tokens_per_second": 136331.057
},
{
"epoch": 0.173,
"grad_norm": 0.5025371313095093,
"learning_rate": 0.00028037954372417883,
"loss": 1.2651,
"num_input_tokens_seen": 1133772800,
"step": 17300,
"train_runtime": 8315.4333,
"train_tokens_per_second": 136345.607
},
{
"epoch": 0.174,
"grad_norm": 0.5257975459098816,
"learning_rate": 0.0002801435223823488,
"loss": 1.2701,
"num_input_tokens_seen": 1140326400,
"step": 17400,
"train_runtime": 8361.8666,
"train_tokens_per_second": 136372.23
},
{
"epoch": 0.175,
"grad_norm": 0.6858969926834106,
"learning_rate": 0.00027990619049669336,
"loss": 1.2759,
"num_input_tokens_seen": 1146880000,
"step": 17500,
"train_runtime": 8408.7431,
"train_tokens_per_second": 136391.371
},
{
"epoch": 0.176,
"grad_norm": 0.5586578845977783,
"learning_rate": 0.00027966755045714177,
"loss": 1.2782,
"num_input_tokens_seen": 1153433600,
"step": 17600,
"train_runtime": 8455.5155,
"train_tokens_per_second": 136411.978
},
{
"epoch": 0.177,
"grad_norm": 0.583242654800415,
"learning_rate": 0.00027942760466679673,
"loss": 1.287,
"num_input_tokens_seen": 1159987200,
"step": 17700,
"train_runtime": 8508.2754,
"train_tokens_per_second": 136336.349
},
{
"epoch": 0.178,
"grad_norm": 0.5521747469902039,
"learning_rate": 0.00027918635554190956,
"loss": 1.2704,
"num_input_tokens_seen": 1166540800,
"step": 17800,
"train_runtime": 8555.5497,
"train_tokens_per_second": 136349.018
},
{
"epoch": 0.179,
"grad_norm": 0.6325215697288513,
"learning_rate": 0.00027894380551185636,
"loss": 1.2912,
"num_input_tokens_seen": 1173094400,
"step": 17900,
"train_runtime": 8602.3857,
"train_tokens_per_second": 136368.495
},
{
"epoch": 0.18,
"grad_norm": 0.44643789529800415,
"learning_rate": 0.00027869995701911314,
"loss": 1.2762,
"num_input_tokens_seen": 1179648000,
"step": 18000,
"train_runtime": 8649.7648,
"train_tokens_per_second": 136379.2
},
{
"epoch": 0.181,
"grad_norm": 0.49556615948677063,
"learning_rate": 0.0002784548125192316,
"loss": 1.2577,
"num_input_tokens_seen": 1186201600,
"step": 18100,
"train_runtime": 8701.0558,
"train_tokens_per_second": 136328.467
},
{
"epoch": 0.182,
"grad_norm": 0.5336231589317322,
"learning_rate": 0.0002782083744808141,
"loss": 1.2629,
"num_input_tokens_seen": 1192755200,
"step": 18200,
"train_runtime": 8748.3794,
"train_tokens_per_second": 136340.131
},
{
"epoch": 0.183,
"grad_norm": 0.3993295431137085,
"learning_rate": 0.000277960645385489,
"loss": 1.2621,
"num_input_tokens_seen": 1199308800,
"step": 18300,
"train_runtime": 8795.9903,
"train_tokens_per_second": 136347.217
},
{
"epoch": 0.184,
"grad_norm": 0.5608197450637817,
"learning_rate": 0.00027771162772788544,
"loss": 1.2746,
"num_input_tokens_seen": 1205862400,
"step": 18400,
"train_runtime": 8844.0918,
"train_tokens_per_second": 136346.663
},
{
"epoch": 0.185,
"grad_norm": 0.5299677848815918,
"learning_rate": 0.00027746132401560857,
"loss": 1.2608,
"num_input_tokens_seen": 1212416000,
"step": 18500,
"train_runtime": 8890.974,
"train_tokens_per_second": 136364.812
},
{
"epoch": 0.186,
"grad_norm": 0.5247559547424316,
"learning_rate": 0.0002772097367692139,
"loss": 1.2628,
"num_input_tokens_seen": 1218969600,
"step": 18600,
"train_runtime": 8937.3092,
"train_tokens_per_second": 136391.119
},
{
"epoch": 0.187,
"grad_norm": 0.4991471469402313,
"learning_rate": 0.00027695686852218226,
"loss": 1.2617,
"num_input_tokens_seen": 1225523200,
"step": 18700,
"train_runtime": 8984.1463,
"train_tokens_per_second": 136409.532
},
{
"epoch": 0.188,
"grad_norm": 0.4922790229320526,
"learning_rate": 0.00027670272182089416,
"loss": 1.277,
"num_input_tokens_seen": 1232076800,
"step": 18800,
"train_runtime": 9036.4876,
"train_tokens_per_second": 136344.656
},
{
"epoch": 0.189,
"grad_norm": 0.49377188086509705,
"learning_rate": 0.0002764472992246039,
"loss": 1.2767,
"num_input_tokens_seen": 1238630400,
"step": 18900,
"train_runtime": 9084.3866,
"train_tokens_per_second": 136347.169
},
{
"epoch": 0.19,
"grad_norm": 0.6417357921600342,
"learning_rate": 0.0002761906033054143,
"loss": 1.2616,
"num_input_tokens_seen": 1245184000,
"step": 19000,
"train_runtime": 9130.7221,
"train_tokens_per_second": 136373.004
},
{
"epoch": 0.191,
"grad_norm": 0.44580140709877014,
"learning_rate": 0.00027593263664825045,
"loss": 1.2686,
"num_input_tokens_seen": 1251737600,
"step": 19100,
"train_runtime": 9176.6051,
"train_tokens_per_second": 136405.303
},
{
"epoch": 0.192,
"grad_norm": 0.5867856740951538,
"learning_rate": 0.00027567340185083363,
"loss": 1.2638,
"num_input_tokens_seen": 1258291200,
"step": 19200,
"train_runtime": 9229.719,
"train_tokens_per_second": 136330.391
},
{
"epoch": 0.193,
"grad_norm": 0.4900195896625519,
"learning_rate": 0.00027541290152365537,
"loss": 1.263,
"num_input_tokens_seen": 1264844800,
"step": 19300,
"train_runtime": 9276.2421,
"train_tokens_per_second": 136353.147
},
{
"epoch": 0.194,
"grad_norm": 0.49572521448135376,
"learning_rate": 0.00027515113828995117,
"loss": 1.273,
"num_input_tokens_seen": 1271398400,
"step": 19400,
"train_runtime": 9323.5363,
"train_tokens_per_second": 136364.396
},
{
"epoch": 0.195,
"grad_norm": 0.440213680267334,
"learning_rate": 0.00027488811478567374,
"loss": 1.2657,
"num_input_tokens_seen": 1277952000,
"step": 19500,
"train_runtime": 9371.4717,
"train_tokens_per_second": 136366.201
},
{
"epoch": 0.196,
"grad_norm": 0.5604475736618042,
"learning_rate": 0.0002746238336594671,
"loss": 1.2619,
"num_input_tokens_seen": 1284505600,
"step": 19600,
"train_runtime": 9417.129,
"train_tokens_per_second": 136400.978
},
{
"epoch": 0.197,
"grad_norm": 0.45344123244285583,
"learning_rate": 0.00027435829757263894,
"loss": 1.2573,
"num_input_tokens_seen": 1291059200,
"step": 19700,
"train_runtime": 9468.5748,
"train_tokens_per_second": 136352.009
},
{
"epoch": 0.198,
"grad_norm": 0.7260287404060364,
"learning_rate": 0.0002740915091991349,
"loss": 1.2668,
"num_input_tokens_seen": 1297612800,
"step": 19800,
"train_runtime": 9515.3702,
"train_tokens_per_second": 136370.186
},
{
"epoch": 0.199,
"grad_norm": 0.47865310311317444,
"learning_rate": 0.0002738234712255109,
"loss": 1.2674,
"num_input_tokens_seen": 1304166400,
"step": 19900,
"train_runtime": 9562.0606,
"train_tokens_per_second": 136389.682
},
{
"epoch": 0.2,
"grad_norm": 0.8422930240631104,
"learning_rate": 0.00027355418635090635,
"loss": 1.2671,
"num_input_tokens_seen": 1310720000,
"step": 20000,
"train_runtime": 9614.8867,
"train_tokens_per_second": 136321.939
},
{
"epoch": 0.201,
"grad_norm": 0.8500565886497498,
"learning_rate": 0.000273283657287017,
"loss": 1.2722,
"num_input_tokens_seen": 1317273600,
"step": 20100,
"train_runtime": 9662.5316,
"train_tokens_per_second": 136327.999
},
{
"epoch": 0.202,
"grad_norm": 0.4511219263076782,
"learning_rate": 0.00027301188675806745,
"loss": 1.257,
"num_input_tokens_seen": 1323827200,
"step": 20200,
"train_runtime": 9710.3614,
"train_tokens_per_second": 136331.404
},
{
"epoch": 0.203,
"grad_norm": 0.6040441393852234,
"learning_rate": 0.0002727388775007839,
"loss": 1.2787,
"num_input_tokens_seen": 1330380800,
"step": 20300,
"train_runtime": 9757.2415,
"train_tokens_per_second": 136348.045
},
{
"epoch": 0.204,
"grad_norm": 0.531548798084259,
"learning_rate": 0.0002724646322643666,
"loss": 1.2567,
"num_input_tokens_seen": 1336934400,
"step": 20400,
"train_runtime": 9803.907,
"train_tokens_per_second": 136367.512
},
{
"epoch": 0.205,
"grad_norm": 0.5128377079963684,
"learning_rate": 0.000272189153810462,
"loss": 1.2634,
"num_input_tokens_seen": 1343488000,
"step": 20500,
"train_runtime": 9849.6975,
"train_tokens_per_second": 136398.909
},
{
"epoch": 0.206,
"grad_norm": 0.5763120651245117,
"learning_rate": 0.0002719124449131351,
"loss": 1.2708,
"num_input_tokens_seen": 1350041600,
"step": 20600,
"train_runtime": 9902.5747,
"train_tokens_per_second": 136332.382
},
{
"epoch": 0.207,
"grad_norm": 0.5266316533088684,
"learning_rate": 0.00027163450835884144,
"loss": 1.2579,
"num_input_tokens_seen": 1356595200,
"step": 20700,
"train_runtime": 9950.4471,
"train_tokens_per_second": 136335.1
},
{
"epoch": 0.208,
"grad_norm": 0.6279749274253845,
"learning_rate": 0.00027135534694639894,
"loss": 1.2566,
"num_input_tokens_seen": 1363148800,
"step": 20800,
"train_runtime": 9997.0613,
"train_tokens_per_second": 136354.951
},
{
"epoch": 0.209,
"grad_norm": 0.5421542525291443,
"learning_rate": 0.00027107496348696003,
"loss": 1.2687,
"num_input_tokens_seen": 1369702400,
"step": 20900,
"train_runtime": 10044.3146,
"train_tokens_per_second": 136365.939
},
{
"epoch": 0.21,
"grad_norm": 0.5376498699188232,
"learning_rate": 0.00027079336080398296,
"loss": 1.2772,
"num_input_tokens_seen": 1376256000,
"step": 21000,
"train_runtime": 10090.6051,
"train_tokens_per_second": 136389.839
},
{
"epoch": 0.211,
"grad_norm": 0.41719597578048706,
"learning_rate": 0.00027051054173320366,
"loss": 1.2502,
"num_input_tokens_seen": 1382809600,
"step": 21100,
"train_runtime": 10143.3243,
"train_tokens_per_second": 136327.063
},
{
"epoch": 0.212,
"grad_norm": 0.4714694321155548,
"learning_rate": 0.000270226509122607,
"loss": 1.2537,
"num_input_tokens_seen": 1389363200,
"step": 21200,
"train_runtime": 10188.8874,
"train_tokens_per_second": 136360.639
},
{
"epoch": 0.213,
"grad_norm": 0.4616274833679199,
"learning_rate": 0.0002699412658323983,
"loss": 1.2571,
"num_input_tokens_seen": 1395916800,
"step": 21300,
"train_runtime": 10236.5378,
"train_tokens_per_second": 136366.107
},
{
"epoch": 0.214,
"grad_norm": 0.4215717911720276,
"learning_rate": 0.00026965481473497423,
"loss": 1.2687,
"num_input_tokens_seen": 1402470400,
"step": 21400,
"train_runtime": 10282.9404,
"train_tokens_per_second": 136388.071
},
{
"epoch": 0.215,
"grad_norm": 0.5976271033287048,
"learning_rate": 0.0002693671587148942,
"loss": 1.2573,
"num_input_tokens_seen": 1409024000,
"step": 21500,
"train_runtime": 10329.955,
"train_tokens_per_second": 136401.756
},
{
"epoch": 0.216,
"grad_norm": 0.5200098752975464,
"learning_rate": 0.0002690783006688511,
"loss": 1.247,
"num_input_tokens_seen": 1415577600,
"step": 21600,
"train_runtime": 10382.0767,
"train_tokens_per_second": 136348.213
},
{
"epoch": 0.217,
"grad_norm": 0.8170623779296875,
"learning_rate": 0.0002687882435056423,
"loss": 1.2562,
"num_input_tokens_seen": 1422131200,
"step": 21700,
"train_runtime": 10429.827,
"train_tokens_per_second": 136352.329
},
{
"epoch": 0.218,
"grad_norm": 0.52497398853302,
"learning_rate": 0.0002684969901461402,
"loss": 1.2533,
"num_input_tokens_seen": 1428684800,
"step": 21800,
"train_runtime": 10476.8104,
"train_tokens_per_second": 136366.388
},
{
"epoch": 0.219,
"grad_norm": 0.4417087137699127,
"learning_rate": 0.000268204543523263,
"loss": 1.2721,
"num_input_tokens_seen": 1435238400,
"step": 21900,
"train_runtime": 10524.1028,
"train_tokens_per_second": 136376.319
},
{
"epoch": 0.22,
"grad_norm": 0.5729189515113831,
"learning_rate": 0.0002679109065819447,
"loss": 1.2654,
"num_input_tokens_seen": 1441792000,
"step": 22000,
"train_runtime": 10572.3447,
"train_tokens_per_second": 136373.911
},
{
"epoch": 0.221,
"grad_norm": 0.5111753940582275,
"learning_rate": 0.0002676160822791062,
"loss": 1.2581,
"num_input_tokens_seen": 1448345600,
"step": 22100,
"train_runtime": 10619.3771,
"train_tokens_per_second": 136387.057
},
{
"epoch": 0.222,
"grad_norm": 0.4302677512168884,
"learning_rate": 0.00026732007358362496,
"loss": 1.2581,
"num_input_tokens_seen": 1454899200,
"step": 22200,
"train_runtime": 10666.0714,
"train_tokens_per_second": 136404.413
},
{
"epoch": 0.223,
"grad_norm": 3.9242477416992188,
"learning_rate": 0.0002670228834763052,
"loss": 1.2872,
"num_input_tokens_seen": 1461452800,
"step": 22300,
"train_runtime": 10719.3985,
"train_tokens_per_second": 136337.203
},
{
"epoch": 0.224,
"grad_norm": 0.7662601470947266,
"learning_rate": 0.00026672451494984804,
"loss": 1.2602,
"num_input_tokens_seen": 1468006400,
"step": 22400,
"train_runtime": 10767.2807,
"train_tokens_per_second": 136339.568
},
{
"epoch": 0.225,
"grad_norm": 0.48544740676879883,
"learning_rate": 0.0002664249710088213,
"loss": 1.257,
"num_input_tokens_seen": 1474560000,
"step": 22500,
"train_runtime": 10813.982,
"train_tokens_per_second": 136356.802
},
{
"epoch": 0.226,
"grad_norm": 0.4495686888694763,
"learning_rate": 0.00026612425466962893,
"loss": 1.2552,
"num_input_tokens_seen": 1481113600,
"step": 22600,
"train_runtime": 10860.2948,
"train_tokens_per_second": 136378.766
},
{
"epoch": 0.227,
"grad_norm": 0.5733143091201782,
"learning_rate": 0.00026582236896048134,
"loss": 1.2403,
"num_input_tokens_seen": 1487667200,
"step": 22700,
"train_runtime": 10907.2107,
"train_tokens_per_second": 136393.001
},
{
"epoch": 0.228,
"grad_norm": 0.7318263649940491,
"learning_rate": 0.00026551931692136413,
"loss": 1.2468,
"num_input_tokens_seen": 1494220800,
"step": 22800,
"train_runtime": 10953.9499,
"train_tokens_per_second": 136409.315
},
{
"epoch": 0.229,
"grad_norm": 0.5192084312438965,
"learning_rate": 0.00026521510160400804,
"loss": 1.2458,
"num_input_tokens_seen": 1500774400,
"step": 22900,
"train_runtime": 11006.6198,
"train_tokens_per_second": 136351.98
},
{
"epoch": 0.23,
"grad_norm": 0.4651305079460144,
"learning_rate": 0.00026490972607185793,
"loss": 1.2601,
"num_input_tokens_seen": 1507328000,
"step": 23000,
"train_runtime": 11053.8305,
"train_tokens_per_second": 136362.504
},
{
"epoch": 0.231,
"grad_norm": 0.5470275282859802,
"learning_rate": 0.0002646031934000421,
"loss": 1.2405,
"num_input_tokens_seen": 1513881600,
"step": 23100,
"train_runtime": 11099.6418,
"train_tokens_per_second": 136390.132
},
{
"epoch": 0.232,
"grad_norm": 0.519235372543335,
"learning_rate": 0.00026429550667534095,
"loss": 1.2586,
"num_input_tokens_seen": 1520435200,
"step": 23200,
"train_runtime": 11152.1986,
"train_tokens_per_second": 136335.018
},
{
"epoch": 0.233,
"grad_norm": 0.4892626404762268,
"learning_rate": 0.0002639866689961565,
"loss": 1.2595,
"num_input_tokens_seen": 1526988800,
"step": 23300,
"train_runtime": 11199.2653,
"train_tokens_per_second": 136347.23
},
{
"epoch": 0.234,
"grad_norm": 0.4089221656322479,
"learning_rate": 0.00026367668347248083,
"loss": 1.2393,
"num_input_tokens_seen": 1533542400,
"step": 23400,
"train_runtime": 11247.6635,
"train_tokens_per_second": 136343.196
},
{
"epoch": 0.235,
"grad_norm": 0.467582106590271,
"learning_rate": 0.0002633655532258646,
"loss": 1.2534,
"num_input_tokens_seen": 1540096000,
"step": 23500,
"train_runtime": 11294.1646,
"train_tokens_per_second": 136362.099
},
{
"epoch": 0.236,
"grad_norm": 0.48117080330848694,
"learning_rate": 0.000263053281389386,
"loss": 1.2644,
"num_input_tokens_seen": 1546649600,
"step": 23600,
"train_runtime": 11340.9021,
"train_tokens_per_second": 136378.004
},
{
"epoch": 0.237,
"grad_norm": 0.4495629072189331,
"learning_rate": 0.0002627398711076189,
"loss": 1.2442,
"num_input_tokens_seen": 1553203200,
"step": 23700,
"train_runtime": 11387.7566,
"train_tokens_per_second": 136392.377
},
{
"epoch": 0.238,
"grad_norm": 0.4376384913921356,
"learning_rate": 0.0002624253255366014,
"loss": 1.2489,
"num_input_tokens_seen": 1559756800,
"step": 23800,
"train_runtime": 11439.8893,
"train_tokens_per_second": 136343.696
},
{
"epoch": 0.239,
"grad_norm": 0.4419648349285126,
"learning_rate": 0.0002621096478438039,
"loss": 1.2353,
"num_input_tokens_seen": 1566310400,
"step": 23900,
"train_runtime": 11486.001,
"train_tokens_per_second": 136366.904
},
{
"epoch": 0.24,
"grad_norm": 0.669739305973053,
"learning_rate": 0.00026179284120809727,
"loss": 1.2528,
"num_input_tokens_seen": 1572864000,
"step": 24000,
"train_runtime": 11533.9608,
"train_tokens_per_second": 136368.072
},
{
"epoch": 0.241,
"grad_norm": 0.4047415554523468,
"learning_rate": 0.0002614749088197208,
"loss": 1.2679,
"num_input_tokens_seen": 1579417600,
"step": 24100,
"train_runtime": 11582.9583,
"train_tokens_per_second": 136357.013
},
{
"epoch": 0.242,
"grad_norm": 0.5224933624267578,
"learning_rate": 0.00026115585388025015,
"loss": 1.2425,
"num_input_tokens_seen": 1585971200,
"step": 24200,
"train_runtime": 11630.022,
"train_tokens_per_second": 136368.719
},
{
"epoch": 0.243,
"grad_norm": 0.5125856399536133,
"learning_rate": 0.00026083567960256493,
"loss": 1.2423,
"num_input_tokens_seen": 1592524800,
"step": 24300,
"train_runtime": 11677.13,
"train_tokens_per_second": 136379.813
},
{
"epoch": 0.244,
"grad_norm": 0.5344144701957703,
"learning_rate": 0.00026051438921081667,
"loss": 1.2431,
"num_input_tokens_seen": 1599078400,
"step": 24400,
"train_runtime": 11723.5349,
"train_tokens_per_second": 136398.997
},
{
"epoch": 0.245,
"grad_norm": 0.4386890232563019,
"learning_rate": 0.00026019198594039595,
"loss": 1.2426,
"num_input_tokens_seen": 1605632000,
"step": 24500,
"train_runtime": 11773.1296,
"train_tokens_per_second": 136381.069
},
{
"epoch": 0.246,
"grad_norm": 0.4986630082130432,
"learning_rate": 0.00025986847303790026,
"loss": 1.2531,
"num_input_tokens_seen": 1612185600,
"step": 24600,
"train_runtime": 11820.6579,
"train_tokens_per_second": 136387.13
},
{
"epoch": 0.247,
"grad_norm": 0.5271715521812439,
"learning_rate": 0.00025954385376110076,
"loss": 1.249,
"num_input_tokens_seen": 1618739200,
"step": 24700,
"train_runtime": 11867.4874,
"train_tokens_per_second": 136401.172
},
{
"epoch": 0.248,
"grad_norm": 0.45263609290122986,
"learning_rate": 0.00025921813137891005,
"loss": 1.2507,
"num_input_tokens_seen": 1625292800,
"step": 24800,
"train_runtime": 11919.9131,
"train_tokens_per_second": 136351.061
},
{
"epoch": 0.249,
"grad_norm": 0.5932081937789917,
"learning_rate": 0.000258891309171349,
"loss": 1.2438,
"num_input_tokens_seen": 1631846400,
"step": 24900,
"train_runtime": 11962.6395,
"train_tokens_per_second": 136411.902
},
{
"epoch": 0.25,
"grad_norm": 0.5539859533309937,
"learning_rate": 0.00025856339042951344,
"loss": 1.2548,
"num_input_tokens_seen": 1638400000,
"step": 25000,
"train_runtime": 12014.9411,
"train_tokens_per_second": 136363.548
},
{
"epoch": 0.251,
"grad_norm": 0.5236772298812866,
"learning_rate": 0.0002582343784555415,
"loss": 1.2386,
"num_input_tokens_seen": 1644953600,
"step": 25100,
"train_runtime": 12062.3997,
"train_tokens_per_second": 136370.344
},
{
"epoch": 0.252,
"grad_norm": 0.5913048982620239,
"learning_rate": 0.00025790427656258017,
"loss": 1.2354,
"num_input_tokens_seen": 1651507200,
"step": 25200,
"train_runtime": 12108.5333,
"train_tokens_per_second": 136392.01
},
{
"epoch": 0.253,
"grad_norm": 0.5929732322692871,
"learning_rate": 0.00025757308807475185,
"loss": 1.2582,
"num_input_tokens_seen": 1658060800,
"step": 25300,
"train_runtime": 12154.8252,
"train_tokens_per_second": 136411.736
},
{
"epoch": 0.254,
"grad_norm": 0.4542764723300934,
"learning_rate": 0.00025724081632712086,
"loss": 1.2488,
"num_input_tokens_seen": 1664614400,
"step": 25400,
"train_runtime": 12207.8935,
"train_tokens_per_second": 136355.58
},
{
"epoch": 0.255,
"grad_norm": 1.0848513841629028,
"learning_rate": 0.0002569074646656601,
"loss": 1.2375,
"num_input_tokens_seen": 1671168000,
"step": 25500,
"train_runtime": 12254.3162,
"train_tokens_per_second": 136373.827
},
{
"epoch": 0.256,
"grad_norm": 0.5190780162811279,
"learning_rate": 0.00025657303644721695,
"loss": 1.236,
"num_input_tokens_seen": 1677721600,
"step": 25600,
"train_runtime": 12301.2378,
"train_tokens_per_second": 136386.405
},
{
"epoch": 0.257,
"grad_norm": 0.43418362736701965,
"learning_rate": 0.00025623753503948004,
"loss": 1.2484,
"num_input_tokens_seen": 1684275200,
"step": 25700,
"train_runtime": 12347.684,
"train_tokens_per_second": 136404.138
},
{
"epoch": 0.258,
"grad_norm": 0.4586409032344818,
"learning_rate": 0.00025590096382094475,
"loss": 1.2674,
"num_input_tokens_seen": 1690828800,
"step": 25800,
"train_runtime": 12394.5809,
"train_tokens_per_second": 136416.778
},
{
"epoch": 0.259,
"grad_norm": 0.5069702863693237,
"learning_rate": 0.00025556332618087945,
"loss": 1.2428,
"num_input_tokens_seen": 1697382400,
"step": 25900,
"train_runtime": 12447.2116,
"train_tokens_per_second": 136366.478
},
{
"epoch": 0.26,
"grad_norm": 0.591788649559021,
"learning_rate": 0.00025522462551929155,
"loss": 1.2417,
"num_input_tokens_seen": 1703936000,
"step": 26000,
"train_runtime": 12492.8891,
"train_tokens_per_second": 136392.47
},
{
"epoch": 0.261,
"grad_norm": 0.6001791954040527,
"learning_rate": 0.00025488486524689283,
"loss": 1.2407,
"num_input_tokens_seen": 1710489600,
"step": 26100,
"train_runtime": 12539.4548,
"train_tokens_per_second": 136408.61
},
{
"epoch": 0.262,
"grad_norm": 0.47005897760391235,
"learning_rate": 0.00025454404878506555,
"loss": 1.2558,
"num_input_tokens_seen": 1717043200,
"step": 26200,
"train_runtime": 12587.1655,
"train_tokens_per_second": 136412.221
},
{
"epoch": 0.263,
"grad_norm": 0.42708972096443176,
"learning_rate": 0.0002542021795658276,
"loss": 1.2445,
"num_input_tokens_seen": 1723596800,
"step": 26300,
"train_runtime": 12634.1294,
"train_tokens_per_second": 136423.868
},
{
"epoch": 0.264,
"grad_norm": 0.48100486397743225,
"learning_rate": 0.0002538592610317984,
"loss": 1.2416,
"num_input_tokens_seen": 1730150400,
"step": 26400,
"train_runtime": 12686.5075,
"train_tokens_per_second": 136377.202
},
{
"epoch": 0.265,
"grad_norm": 0.5689502954483032,
"learning_rate": 0.00025351529663616355,
"loss": 1.2476,
"num_input_tokens_seen": 1736704000,
"step": 26500,
"train_runtime": 12733.1403,
"train_tokens_per_second": 136392.435
},
{
"epoch": 0.266,
"grad_norm": 0.3999510705471039,
"learning_rate": 0.00025317028984264087,
"loss": 1.2507,
"num_input_tokens_seen": 1743257600,
"step": 26600,
"train_runtime": 12780.4326,
"train_tokens_per_second": 136400.515
},
{
"epoch": 0.267,
"grad_norm": 0.4349440336227417,
"learning_rate": 0.0002528242441254448,
"loss": 1.2359,
"num_input_tokens_seen": 1749811200,
"step": 26700,
"train_runtime": 12826.6298,
"train_tokens_per_second": 136420.184
},
{
"epoch": 0.268,
"grad_norm": 0.40468648076057434,
"learning_rate": 0.000252477162969252,
"loss": 1.2463,
"num_input_tokens_seen": 1756364800,
"step": 26800,
"train_runtime": 12873.4848,
"train_tokens_per_second": 136432.739
},
{
"epoch": 0.269,
"grad_norm": 0.5858653783798218,
"learning_rate": 0.00025212904986916584,
"loss": 1.2385,
"num_input_tokens_seen": 1762918400,
"step": 26900,
"train_runtime": 12926.2009,
"train_tokens_per_second": 136383.336
},
{
"epoch": 0.27,
"grad_norm": 0.4621046483516693,
"learning_rate": 0.00025177990833068133,
"loss": 1.2366,
"num_input_tokens_seen": 1769472000,
"step": 27000,
"train_runtime": 12973.4952,
"train_tokens_per_second": 136391.31
},
{
"epoch": 0.271,
"grad_norm": 0.4884892404079437,
"learning_rate": 0.0002514297418696499,
"loss": 1.2436,
"num_input_tokens_seen": 1776025600,
"step": 27100,
"train_runtime": 13021.2871,
"train_tokens_per_second": 136394.013
},
{
"epoch": 0.272,
"grad_norm": 0.5108981132507324,
"learning_rate": 0.0002510785540122439,
"loss": 1.2423,
"num_input_tokens_seen": 1782579200,
"step": 27200,
"train_runtime": 13068.0423,
"train_tokens_per_second": 136407.517
},
{
"epoch": 0.273,
"grad_norm": 0.3898067772388458,
"learning_rate": 0.0002507263482949212,
"loss": 1.2415,
"num_input_tokens_seen": 1789132800,
"step": 27300,
"train_runtime": 13113.8421,
"train_tokens_per_second": 136430.864
},
{
"epoch": 0.274,
"grad_norm": 0.5622383952140808,
"learning_rate": 0.0002503731282643894,
"loss": 1.2378,
"num_input_tokens_seen": 1795686400,
"step": 27400,
"train_runtime": 13161.1635,
"train_tokens_per_second": 136438.272
},
{
"epoch": 0.275,
"grad_norm": 0.7748796343803406,
"learning_rate": 0.0002500188974775704,
"loss": 1.248,
"num_input_tokens_seen": 1802240000,
"step": 27500,
"train_runtime": 13209.4471,
"train_tokens_per_second": 136435.688
},
{
"epoch": 0.276,
"grad_norm": 0.8867826461791992,
"learning_rate": 0.00024966365950156416,
"loss": 1.2409,
"num_input_tokens_seen": 1808793600,
"step": 27600,
"train_runtime": 13256.4066,
"train_tokens_per_second": 136446.751
},
{
"epoch": 0.277,
"grad_norm": 0.49997836351394653,
"learning_rate": 0.00024930741791361326,
"loss": 1.2382,
"num_input_tokens_seen": 1815347200,
"step": 27700,
"train_runtime": 13309.6196,
"train_tokens_per_second": 136393.62
},
{
"epoch": 0.278,
"grad_norm": 0.5048521161079407,
"learning_rate": 0.0002489501763010664,
"loss": 1.2351,
"num_input_tokens_seen": 1821900800,
"step": 27800,
"train_runtime": 13356.706,
"train_tokens_per_second": 136403.451
},
{
"epoch": 0.279,
"grad_norm": 0.5528578162193298,
"learning_rate": 0.00024859193826134285,
"loss": 1.2404,
"num_input_tokens_seen": 1828454400,
"step": 27900,
"train_runtime": 13405.5813,
"train_tokens_per_second": 136395.01
},
{
"epoch": 0.28,
"grad_norm": 0.44376805424690247,
"learning_rate": 0.00024823270740189556,
"loss": 1.2461,
"num_input_tokens_seen": 1835008000,
"step": 28000,
"train_runtime": 13452.7686,
"train_tokens_per_second": 136403.743
},
{
"epoch": 0.281,
"grad_norm": 0.5072674751281738,
"learning_rate": 0.00024787248734017527,
"loss": 1.2301,
"num_input_tokens_seen": 1841561600,
"step": 28100,
"train_runtime": 13501.0413,
"train_tokens_per_second": 136401.449
},
{
"epoch": 0.282,
"grad_norm": 0.46835577487945557,
"learning_rate": 0.0002475112817035941,
"loss": 1.237,
"num_input_tokens_seen": 1848115200,
"step": 28200,
"train_runtime": 13547.4814,
"train_tokens_per_second": 136417.622
},
{
"epoch": 0.283,
"grad_norm": 0.4893036186695099,
"learning_rate": 0.0002471490941294887,
"loss": 1.2612,
"num_input_tokens_seen": 1854668800,
"step": 28300,
"train_runtime": 13593.9904,
"train_tokens_per_second": 136432.993
},
{
"epoch": 0.284,
"grad_norm": 0.66542649269104,
"learning_rate": 0.000246785928265084,
"loss": 1.2405,
"num_input_tokens_seen": 1861222400,
"step": 28400,
"train_runtime": 13646.3147,
"train_tokens_per_second": 136390.113
},
{
"epoch": 0.285,
"grad_norm": 0.669306755065918,
"learning_rate": 0.0002464217877674562,
"loss": 1.2409,
"num_input_tokens_seen": 1867776000,
"step": 28500,
"train_runtime": 13692.502,
"train_tokens_per_second": 136408.671
},
{
"epoch": 0.286,
"grad_norm": 0.43464845418930054,
"learning_rate": 0.0002460566763034961,
"loss": 1.2435,
"num_input_tokens_seen": 1874329600,
"step": 28600,
"train_runtime": 13738.7564,
"train_tokens_per_second": 136426.438
},
{
"epoch": 0.287,
"grad_norm": 0.5084187388420105,
"learning_rate": 0.00024569059754987196,
"loss": 1.2572,
"num_input_tokens_seen": 1880883200,
"step": 28700,
"train_runtime": 13785.6191,
"train_tokens_per_second": 136438.065
},
{
"epoch": 0.288,
"grad_norm": 0.473603755235672,
"learning_rate": 0.00024532355519299296,
"loss": 1.2459,
"num_input_tokens_seen": 1887436800,
"step": 28800,
"train_runtime": 13838.5181,
"train_tokens_per_second": 136390.095
},
{
"epoch": 0.289,
"grad_norm": 0.493012011051178,
"learning_rate": 0.0002449555529289714,
"loss": 1.243,
"num_input_tokens_seen": 1893990400,
"step": 28900,
"train_runtime": 13886.1283,
"train_tokens_per_second": 136394.419
},
{
"epoch": 0.29,
"grad_norm": 0.7421333193778992,
"learning_rate": 0.0002445865944635861,
"loss": 1.2455,
"num_input_tokens_seen": 1900544000,
"step": 29000,
"train_runtime": 13931.9406,
"train_tokens_per_second": 136416.315
},
{
"epoch": 0.291,
"grad_norm": 0.5027185678482056,
"learning_rate": 0.0002442166835122446,
"loss": 1.2686,
"num_input_tokens_seen": 1907097600,
"step": 29100,
"train_runtime": 13980.446,
"train_tokens_per_second": 136411.785
},
{
"epoch": 0.292,
"grad_norm": 0.48427557945251465,
"learning_rate": 0.00024384582379994614,
"loss": 1.2369,
"num_input_tokens_seen": 1913651200,
"step": 29200,
"train_runtime": 14028.0456,
"train_tokens_per_second": 136416.095
},
{
"epoch": 0.293,
"grad_norm": 0.6620755195617676,
"learning_rate": 0.00024347401906124388,
"loss": 1.2317,
"num_input_tokens_seen": 1920204800,
"step": 29300,
"train_runtime": 14074.3372,
"train_tokens_per_second": 136433.054
},
{
"epoch": 0.294,
"grad_norm": 0.5745883584022522,
"learning_rate": 0.0002431012730402075,
"loss": 1.2443,
"num_input_tokens_seen": 1926758400,
"step": 29400,
"train_runtime": 14125.645,
"train_tokens_per_second": 136401.446
},
{
"epoch": 0.295,
"grad_norm": 0.441680908203125,
"learning_rate": 0.00024272758949038517,
"loss": 1.2393,
"num_input_tokens_seen": 1933312000,
"step": 29500,
"train_runtime": 14172.5336,
"train_tokens_per_second": 136412.588
},
{
"epoch": 0.296,
"grad_norm": 0.4417046904563904,
"learning_rate": 0.00024235297217476616,
"loss": 1.2371,
"num_input_tokens_seen": 1939865600,
"step": 29600,
"train_runtime": 14220.1572,
"train_tokens_per_second": 136416.608
},
{
"epoch": 0.297,
"grad_norm": 0.5888639688491821,
"learning_rate": 0.00024197742486574268,
"loss": 1.2344,
"num_input_tokens_seen": 1946419200,
"step": 29700,
"train_runtime": 14267.366,
"train_tokens_per_second": 136424.565
},
{
"epoch": 0.298,
"grad_norm": 0.4625283479690552,
"learning_rate": 0.0002416009513450719,
"loss": 1.2373,
"num_input_tokens_seen": 1952972800,
"step": 29800,
"train_runtime": 14318.8989,
"train_tokens_per_second": 136391.27
},
{
"epoch": 0.299,
"grad_norm": 0.47661375999450684,
"learning_rate": 0.00024122355540383806,
"loss": 1.2454,
"num_input_tokens_seen": 1959526400,
"step": 29900,
"train_runtime": 14365.8797,
"train_tokens_per_second": 136401.42
},
{
"epoch": 0.3,
"grad_norm": 0.727032482624054,
"learning_rate": 0.00024084524084241405,
"loss": 1.2379,
"num_input_tokens_seen": 1966080000,
"step": 30000,
"train_runtime": 14415.1273,
"train_tokens_per_second": 136390.055
},
{
"epoch": 0.301,
"grad_norm": 0.45500555634498596,
"learning_rate": 0.00024046601147042332,
"loss": 1.2358,
"num_input_tokens_seen": 1972633600,
"step": 30100,
"train_runtime": 14461.5845,
"train_tokens_per_second": 136405.08
},
{
"epoch": 0.302,
"grad_norm": 0.44596830010414124,
"learning_rate": 0.0002400858711067015,
"loss": 1.2301,
"num_input_tokens_seen": 1979187200,
"step": 30200,
"train_runtime": 14508.0707,
"train_tokens_per_second": 136419.737
},
{
"epoch": 0.303,
"grad_norm": 0.4207491874694824,
"learning_rate": 0.00023970482357925772,
"loss": 1.2441,
"num_input_tokens_seen": 1985740800,
"step": 30300,
"train_runtime": 14555.5751,
"train_tokens_per_second": 136424.757
},
{
"epoch": 0.304,
"grad_norm": 0.4833202064037323,
"learning_rate": 0.00023932287272523646,
"loss": 1.2351,
"num_input_tokens_seen": 1992294400,
"step": 30400,
"train_runtime": 14601.9546,
"train_tokens_per_second": 136440.255
},
{
"epoch": 0.305,
"grad_norm": 0.5268282294273376,
"learning_rate": 0.00023894002239087847,
"loss": 1.2384,
"num_input_tokens_seen": 1998848000,
"step": 30500,
"train_runtime": 14654.2539,
"train_tokens_per_second": 136400.53
},
{
"epoch": 0.306,
"grad_norm": 0.4639832377433777,
"learning_rate": 0.0002385562764314825,
"loss": 1.3007,
"num_input_tokens_seen": 2005401600,
"step": 30600,
"train_runtime": 14702.026,
"train_tokens_per_second": 136403.078
},
{
"epoch": 0.307,
"grad_norm": 0.526703953742981,
"learning_rate": 0.00023817163871136596,
"loss": 1.2481,
"num_input_tokens_seen": 2011955200,
"step": 30700,
"train_runtime": 14749.4458,
"train_tokens_per_second": 136408.868
},
{
"epoch": 0.308,
"grad_norm": 0.43404075503349304,
"learning_rate": 0.00023778611310382652,
"loss": 1.2273,
"num_input_tokens_seen": 2018508800,
"step": 30800,
"train_runtime": 14796.5936,
"train_tokens_per_second": 136417.128
},
{
"epoch": 0.309,
"grad_norm": 0.39956456422805786,
"learning_rate": 0.0002373997034911027,
"loss": 1.2275,
"num_input_tokens_seen": 2025062400,
"step": 30900,
"train_runtime": 14843.3887,
"train_tokens_per_second": 136428.578
},
{
"epoch": 0.31,
"grad_norm": 0.46024298667907715,
"learning_rate": 0.00023701241376433506,
"loss": 1.2353,
"num_input_tokens_seen": 2031616000,
"step": 31000,
"train_runtime": 14890.8282,
"train_tokens_per_second": 136434.05
},
{
"epoch": 0.311,
"grad_norm": 0.38429203629493713,
"learning_rate": 0.0002366242478235268,
"loss": 1.2403,
"num_input_tokens_seen": 2038169600,
"step": 31100,
"train_runtime": 14937.8781,
"train_tokens_per_second": 136443.047
},
{
"epoch": 0.312,
"grad_norm": 0.5401485562324524,
"learning_rate": 0.00023623520957750471,
"loss": 1.2273,
"num_input_tokens_seen": 2044723200,
"step": 31200,
"train_runtime": 14990.0842,
"train_tokens_per_second": 136405.051
},
{
"epoch": 0.313,
"grad_norm": 0.5360187888145447,
"learning_rate": 0.00023584530294387953,
"loss": 1.2312,
"num_input_tokens_seen": 2051276800,
"step": 31300,
"train_runtime": 15037.4257,
"train_tokens_per_second": 136411.434
},
{
"epoch": 0.314,
"grad_norm": 0.4468795359134674,
"learning_rate": 0.00023545453184900682,
"loss": 1.2383,
"num_input_tokens_seen": 2057830400,
"step": 31400,
"train_runtime": 15083.4771,
"train_tokens_per_second": 136429.444
},
{
"epoch": 0.315,
"grad_norm": 0.4575517177581787,
"learning_rate": 0.00023506290022794706,
"loss": 1.2354,
"num_input_tokens_seen": 2064384000,
"step": 31500,
"train_runtime": 15131.2692,
"train_tokens_per_second": 136431.648
},
{
"epoch": 0.316,
"grad_norm": 0.7983475923538208,
"learning_rate": 0.00023467041202442643,
"loss": 1.2309,
"num_input_tokens_seen": 2070937600,
"step": 31600,
"train_runtime": 15178.6218,
"train_tokens_per_second": 136437.789
},
{
"epoch": 0.317,
"grad_norm": 0.4316498339176178,
"learning_rate": 0.00023427707119079669,
"loss": 1.2462,
"num_input_tokens_seen": 2077491200,
"step": 31700,
"train_runtime": 15225.1881,
"train_tokens_per_second": 136450.938
},
{
"epoch": 0.318,
"grad_norm": 0.5765666365623474,
"learning_rate": 0.0002338828816879957,
"loss": 1.2367,
"num_input_tokens_seen": 2084044800,
"step": 31800,
"train_runtime": 15277.5735,
"train_tokens_per_second": 136412.029
},
{
"epoch": 0.319,
"grad_norm": 0.44825831055641174,
"learning_rate": 0.00023348784748550744,
"loss": 1.2354,
"num_input_tokens_seen": 2090598400,
"step": 31900,
"train_runtime": 15324.8285,
"train_tokens_per_second": 136419.04
},
{
"epoch": 0.32,
"grad_norm": 0.5602436661720276,
"learning_rate": 0.00023309197256132184,
"loss": 1.2324,
"num_input_tokens_seen": 2097152000,
"step": 32000,
"train_runtime": 15371.4775,
"train_tokens_per_second": 136431.387
},
{
"epoch": 0.321,
"grad_norm": 0.4002476930618286,
"learning_rate": 0.00023269526090189505,
"loss": 1.2396,
"num_input_tokens_seen": 2103705600,
"step": 32100,
"train_runtime": 15419.2672,
"train_tokens_per_second": 136433.565
},
{
"epoch": 0.322,
"grad_norm": 0.4306688606739044,
"learning_rate": 0.00023229771650210907,
"loss": 1.2468,
"num_input_tokens_seen": 2110259200,
"step": 32200,
"train_runtime": 15466.1068,
"train_tokens_per_second": 136444.111
},
{
"epoch": 0.323,
"grad_norm": 0.584658145904541,
"learning_rate": 0.00023189934336523163,
"loss": 1.2459,
"num_input_tokens_seen": 2116812800,
"step": 32300,
"train_runtime": 15513.277,
"train_tokens_per_second": 136451.686
},
{
"epoch": 0.324,
"grad_norm": 0.4049496352672577,
"learning_rate": 0.00023150014550287574,
"loss": 1.2455,
"num_input_tokens_seen": 2123366400,
"step": 32400,
"train_runtime": 15565.7808,
"train_tokens_per_second": 136412.456
},
{
"epoch": 0.325,
"grad_norm": 0.45713433623313904,
"learning_rate": 0.00023110012693495943,
"loss": 1.2308,
"num_input_tokens_seen": 2129920000,
"step": 32500,
"train_runtime": 15610.6324,
"train_tokens_per_second": 136440.341
},
{
"epoch": 0.326,
"grad_norm": 0.5710960030555725,
"learning_rate": 0.00023069929168966527,
"loss": 1.2434,
"num_input_tokens_seen": 2136473600,
"step": 32600,
"train_runtime": 15657.7335,
"train_tokens_per_second": 136448.458
},
{
"epoch": 0.327,
"grad_norm": 0.5807371735572815,
"learning_rate": 0.0002302976438033997,
"loss": 1.2292,
"num_input_tokens_seen": 2143027200,
"step": 32700,
"train_runtime": 15710.1819,
"train_tokens_per_second": 136410.082
},
{
"epoch": 0.328,
"grad_norm": 0.4462313652038574,
"learning_rate": 0.0002298951873207525,
"loss": 1.2427,
"num_input_tokens_seen": 2149580800,
"step": 32800,
"train_runtime": 15757.3708,
"train_tokens_per_second": 136417.479
},
{
"epoch": 0.329,
"grad_norm": 0.6099971532821655,
"learning_rate": 0.00022949192629445606,
"loss": 1.2313,
"num_input_tokens_seen": 2156134400,
"step": 32900,
"train_runtime": 15804.1823,
"train_tokens_per_second": 136428.089
},
{
"epoch": 0.33,
"grad_norm": 0.8630947470664978,
"learning_rate": 0.0002290878647853443,
"loss": 1.247,
"num_input_tokens_seen": 2162688000,
"step": 33000,
"train_runtime": 15852.2039,
"train_tokens_per_second": 136428.223
},
{
"epoch": 0.331,
"grad_norm": 0.5154317021369934,
"learning_rate": 0.00022868300686231224,
"loss": 1.2246,
"num_input_tokens_seen": 2169241600,
"step": 33100,
"train_runtime": 15899.5617,
"train_tokens_per_second": 136434.05
},
{
"epoch": 0.332,
"grad_norm": 0.5033185482025146,
"learning_rate": 0.00022827735660227457,
"loss": 1.2271,
"num_input_tokens_seen": 2175795200,
"step": 33200,
"train_runtime": 15947.1716,
"train_tokens_per_second": 136437.686
},
{
"epoch": 0.333,
"grad_norm": 0.7760284543037415,
"learning_rate": 0.000227870918090125,
"loss": 1.2445,
"num_input_tokens_seen": 2182348800,
"step": 33300,
"train_runtime": 16000.1889,
"train_tokens_per_second": 136395.189
},
{
"epoch": 0.334,
"grad_norm": 0.5042400360107422,
"learning_rate": 0.00022746369541869476,
"loss": 1.223,
"num_input_tokens_seen": 2188902400,
"step": 33400,
"train_runtime": 16047.8873,
"train_tokens_per_second": 136398.166
},
{
"epoch": 0.335,
"grad_norm": 0.421273410320282,
"learning_rate": 0.00022705569268871163,
"loss": 1.2222,
"num_input_tokens_seen": 2195456000,
"step": 33500,
"train_runtime": 16094.6711,
"train_tokens_per_second": 136408.876
},
{
"epoch": 0.336,
"grad_norm": 0.48292359709739685,
"learning_rate": 0.00022664691400875865,
"loss": 1.222,
"num_input_tokens_seen": 2202009600,
"step": 33600,
"train_runtime": 16143.6943,
"train_tokens_per_second": 136400.601
},
{
"epoch": 0.337,
"grad_norm": 0.4301004409790039,
"learning_rate": 0.00022623736349523254,
"loss": 1.2308,
"num_input_tokens_seen": 2208563200,
"step": 33700,
"train_runtime": 16189.7469,
"train_tokens_per_second": 136417.401
},
{
"epoch": 0.338,
"grad_norm": 0.6592893600463867,
"learning_rate": 0.00022582704527230238,
"loss": 1.2401,
"num_input_tokens_seen": 2215116800,
"step": 33800,
"train_runtime": 16235.6512,
"train_tokens_per_second": 136435.353
},
{
"epoch": 0.339,
"grad_norm": 0.6183221340179443,
"learning_rate": 0.0002254159634718682,
"loss": 1.2364,
"num_input_tokens_seen": 2221670400,
"step": 33900,
"train_runtime": 16283.1306,
"train_tokens_per_second": 136440.003
},
{
"epoch": 0.34,
"grad_norm": 0.529971182346344,
"learning_rate": 0.00022500412223351915,
"loss": 1.2222,
"num_input_tokens_seen": 2228224000,
"step": 34000,
"train_runtime": 16330.1955,
"train_tokens_per_second": 136448.091
},
{
"epoch": 0.341,
"grad_norm": 0.41906896233558655,
"learning_rate": 0.0002245915257044919,
"loss": 1.2261,
"num_input_tokens_seen": 2234777600,
"step": 34100,
"train_runtime": 16381.7912,
"train_tokens_per_second": 136418.391
},
{
"epoch": 0.342,
"grad_norm": 0.4326164722442627,
"learning_rate": 0.00022417817803962892,
"loss": 1.2452,
"num_input_tokens_seen": 2241331200,
"step": 34200,
"train_runtime": 16429.3997,
"train_tokens_per_second": 136421.978
},
{
"epoch": 0.343,
"grad_norm": 0.8329346179962158,
"learning_rate": 0.0002237640834013366,
"loss": 1.2197,
"num_input_tokens_seen": 2247884800,
"step": 34300,
"train_runtime": 16476.2139,
"train_tokens_per_second": 136432.121
},
{
"epoch": 0.344,
"grad_norm": 0.4649752378463745,
"learning_rate": 0.0002233492459595434,
"loss": 1.2255,
"num_input_tokens_seen": 2254438400,
"step": 34400,
"train_runtime": 16523.092,
"train_tokens_per_second": 136441.678
},
{
"epoch": 0.345,
"grad_norm": 0.5218563675880432,
"learning_rate": 0.00022293366989165772,
"loss": 1.2365,
"num_input_tokens_seen": 2260992000,
"step": 34500,
"train_runtime": 16575.1624,
"train_tokens_per_second": 136408.437
},
{
"epoch": 0.346,
"grad_norm": 0.8002403974533081,
"learning_rate": 0.00022251735938252587,
"loss": 1.2179,
"num_input_tokens_seen": 2267545600,
"step": 34600,
"train_runtime": 16622.274,
"train_tokens_per_second": 136416.088
},
{
"epoch": 0.347,
"grad_norm": 0.5648475289344788,
"learning_rate": 0.0002221003186243902,
"loss": 1.2301,
"num_input_tokens_seen": 2274099200,
"step": 34700,
"train_runtime": 16668.9107,
"train_tokens_per_second": 136427.583
},
{
"epoch": 0.348,
"grad_norm": 0.4631340801715851,
"learning_rate": 0.00022168255181684643,
"loss": 1.2292,
"num_input_tokens_seen": 2280652800,
"step": 34800,
"train_runtime": 16715.4649,
"train_tokens_per_second": 136439.687
},
{
"epoch": 0.349,
"grad_norm": 0.4492770731449127,
"learning_rate": 0.00022126406316680172,
"loss": 1.226,
"num_input_tokens_seen": 2287206400,
"step": 34900,
"train_runtime": 16761.744,
"train_tokens_per_second": 136453.963
},
{
"epoch": 0.35,
"grad_norm": 0.5984812378883362,
"learning_rate": 0.00022084485688843208,
"loss": 1.2332,
"num_input_tokens_seen": 2293760000,
"step": 35000,
"train_runtime": 16816.4332,
"train_tokens_per_second": 136399.912
},
{
"epoch": 0.351,
"grad_norm": 0.6245887875556946,
"learning_rate": 0.00022042493720314003,
"loss": 1.2324,
"num_input_tokens_seen": 2300313600,
"step": 35100,
"train_runtime": 16864.2018,
"train_tokens_per_second": 136402.163
},
{
"epoch": 0.352,
"grad_norm": 0.6719664335250854,
"learning_rate": 0.00022000430833951228,
"loss": 1.2272,
"num_input_tokens_seen": 2306867200,
"step": 35200,
"train_runtime": 16910.313,
"train_tokens_per_second": 136417.77
},
{
"epoch": 0.353,
"grad_norm": 0.43880173563957214,
"learning_rate": 0.00021958297453327673,
"loss": 1.2572,
"num_input_tokens_seen": 2313420800,
"step": 35300,
"train_runtime": 16958.9376,
"train_tokens_per_second": 136413.073
},
{
"epoch": 0.354,
"grad_norm": 0.6195557713508606,
"learning_rate": 0.00021916094002726012,
"loss": 1.2299,
"num_input_tokens_seen": 2319974400,
"step": 35400,
"train_runtime": 17005.9814,
"train_tokens_per_second": 136421.083
},
{
"epoch": 0.355,
"grad_norm": 0.5288188457489014,
"learning_rate": 0.00021873820907134534,
"loss": 1.2157,
"num_input_tokens_seen": 2326528000,
"step": 35500,
"train_runtime": 17053.3579,
"train_tokens_per_second": 136426.387
},
{
"epoch": 0.356,
"grad_norm": 0.4962466061115265,
"learning_rate": 0.0002183147859224283,
"loss": 1.2282,
"num_input_tokens_seen": 2333081600,
"step": 35600,
"train_runtime": 17099.0541,
"train_tokens_per_second": 136445.068
},
{
"epoch": 0.357,
"grad_norm": 0.4940129518508911,
"learning_rate": 0.00021789067484437544,
"loss": 1.2349,
"num_input_tokens_seen": 2339635200,
"step": 35700,
"train_runtime": 17146.892,
"train_tokens_per_second": 136446.605
},
{
"epoch": 0.358,
"grad_norm": 0.5929033160209656,
"learning_rate": 0.00021746588010798068,
"loss": 1.2368,
"num_input_tokens_seen": 2346188800,
"step": 35800,
"train_runtime": 17199.6266,
"train_tokens_per_second": 136409.287
},
{
"epoch": 0.359,
"grad_norm": 0.4825666546821594,
"learning_rate": 0.00021704040599092216,
"loss": 1.2215,
"num_input_tokens_seen": 2352742400,
"step": 35900,
"train_runtime": 17246.2748,
"train_tokens_per_second": 136420.324
},
{
"epoch": 0.36,
"grad_norm": 0.4572449028491974,
"learning_rate": 0.00021661425677771965,
"loss": 1.2291,
"num_input_tokens_seen": 2359296000,
"step": 36000,
"train_runtime": 17292.1332,
"train_tokens_per_second": 136437.533
},
{
"epoch": 0.361,
"grad_norm": 0.467132568359375,
"learning_rate": 0.00021618743675969095,
"loss": 1.2295,
"num_input_tokens_seen": 2365849600,
"step": 36100,
"train_runtime": 17339.1599,
"train_tokens_per_second": 136445.457
},
{
"epoch": 0.362,
"grad_norm": 0.4863705635070801,
"learning_rate": 0.0002157599502349089,
"loss": 1.2154,
"num_input_tokens_seen": 2372403200,
"step": 36200,
"train_runtime": 17386.7454,
"train_tokens_per_second": 136448.952
},
{
"epoch": 0.363,
"grad_norm": 0.43923652172088623,
"learning_rate": 0.00021533180150815802,
"loss": 1.2268,
"num_input_tokens_seen": 2378956800,
"step": 36300,
"train_runtime": 17439.0785,
"train_tokens_per_second": 136415.282
},
{
"epoch": 0.364,
"grad_norm": 0.5028465390205383,
"learning_rate": 0.00021490299489089132,
"loss": 1.2293,
"num_input_tokens_seen": 2385510400,
"step": 36400,
"train_runtime": 17485.9662,
"train_tokens_per_second": 136424.283
},
{
"epoch": 0.365,
"grad_norm": 0.4366530478000641,
"learning_rate": 0.00021447353470118656,
"loss": 1.2276,
"num_input_tokens_seen": 2392064000,
"step": 36500,
"train_runtime": 17533.3809,
"train_tokens_per_second": 136429.136
},
{
"epoch": 0.366,
"grad_norm": 0.46415793895721436,
"learning_rate": 0.00021404342526370326,
"loss": 1.2227,
"num_input_tokens_seen": 2398617600,
"step": 36600,
"train_runtime": 17580.8443,
"train_tokens_per_second": 136433.584
},
{
"epoch": 0.367,
"grad_norm": 0.6382859349250793,
"learning_rate": 0.00021361267090963846,
"loss": 1.2212,
"num_input_tokens_seen": 2405171200,
"step": 36700,
"train_runtime": 17626.7905,
"train_tokens_per_second": 136449.753
},
{
"epoch": 0.368,
"grad_norm": 0.6642177700996399,
"learning_rate": 0.0002131812759766839,
"loss": 1.2317,
"num_input_tokens_seen": 2411724800,
"step": 36800,
"train_runtime": 17679.381,
"train_tokens_per_second": 136414.55
},
{
"epoch": 0.369,
"grad_norm": 0.4071521461009979,
"learning_rate": 0.00021274924480898169,
"loss": 1.2262,
"num_input_tokens_seen": 2418278400,
"step": 36900,
"train_runtime": 17726.5473,
"train_tokens_per_second": 136421.288
},
{
"epoch": 0.37,
"grad_norm": 0.5301467776298523,
"learning_rate": 0.00021231658175708087,
"loss": 1.2192,
"num_input_tokens_seen": 2424832000,
"step": 37000,
"train_runtime": 17772.7667,
"train_tokens_per_second": 136435.258
},
{
"epoch": 0.371,
"grad_norm": 0.5216257572174072,
"learning_rate": 0.00021188329117789357,
"loss": 1.213,
"num_input_tokens_seen": 2431385600,
"step": 37100,
"train_runtime": 17824.6083,
"train_tokens_per_second": 136406.116
},
{
"epoch": 0.372,
"grad_norm": 0.5098195672035217,
"learning_rate": 0.0002114493774346512,
"loss": 1.2311,
"num_input_tokens_seen": 2437939200,
"step": 37200,
"train_runtime": 17870.9901,
"train_tokens_per_second": 136418.81
},
{
"epoch": 0.373,
"grad_norm": 0.47295039892196655,
"learning_rate": 0.00021101484489686025,
"loss": 1.2211,
"num_input_tokens_seen": 2444492800,
"step": 37300,
"train_runtime": 17918.4906,
"train_tokens_per_second": 136422.919
},
{
"epoch": 0.374,
"grad_norm": 0.49752944707870483,
"learning_rate": 0.00021057969794025866,
"loss": 1.2292,
"num_input_tokens_seen": 2451046400,
"step": 37400,
"train_runtime": 17965.5373,
"train_tokens_per_second": 136430.453
},
{
"epoch": 0.375,
"grad_norm": 0.9500930905342102,
"learning_rate": 0.00021014394094677128,
"loss": 1.2187,
"num_input_tokens_seen": 2457600000,
"step": 37500,
"train_runtime": 18012.267,
"train_tokens_per_second": 136440.349
},
{
"epoch": 0.376,
"grad_norm": 0.4800110459327698,
"learning_rate": 0.00020970757830446633,
"loss": 1.2336,
"num_input_tokens_seen": 2464153600,
"step": 37600,
"train_runtime": 18059.6653,
"train_tokens_per_second": 136445.143
},
{
"epoch": 0.377,
"grad_norm": 0.48905813694000244,
"learning_rate": 0.00020927061440751072,
"loss": 1.2189,
"num_input_tokens_seen": 2470707200,
"step": 37700,
"train_runtime": 18111.7548,
"train_tokens_per_second": 136414.567
},
{
"epoch": 0.378,
"grad_norm": 0.593604564666748,
"learning_rate": 0.00020883305365612602,
"loss": 1.2178,
"num_input_tokens_seen": 2477260800,
"step": 37800,
"train_runtime": 18157.6424,
"train_tokens_per_second": 136430.751
},
{
"epoch": 0.379,
"grad_norm": 0.46399399638175964,
"learning_rate": 0.00020839490045654425,
"loss": 1.2141,
"num_input_tokens_seen": 2483814400,
"step": 37900,
"train_runtime": 18204.4326,
"train_tokens_per_second": 136440.089
},
{
"epoch": 0.38,
"grad_norm": 0.5679593086242676,
"learning_rate": 0.00020795615922096313,
"loss": 1.2332,
"num_input_tokens_seen": 2490368000,
"step": 38000,
"train_runtime": 18252.6627,
"train_tokens_per_second": 136438.614
},
{
"epoch": 0.381,
"grad_norm": 0.48073315620422363,
"learning_rate": 0.00020751683436750207,
"loss": 1.2369,
"num_input_tokens_seen": 2496921600,
"step": 38100,
"train_runtime": 18300.6025,
"train_tokens_per_second": 136439.311
},
{
"epoch": 0.382,
"grad_norm": 0.4134567677974701,
"learning_rate": 0.00020707693032015752,
"loss": 1.2168,
"num_input_tokens_seen": 2503475200,
"step": 38200,
"train_runtime": 18351.6848,
"train_tokens_per_second": 136416.641
},
{
"epoch": 0.383,
"grad_norm": 0.4675845503807068,
"learning_rate": 0.00020663645150875834,
"loss": 1.2272,
"num_input_tokens_seen": 2510028800,
"step": 38300,
"train_runtime": 18398.2852,
"train_tokens_per_second": 136427.323
},
{
"epoch": 0.384,
"grad_norm": 0.4632211923599243,
"learning_rate": 0.00020619540236892125,
"loss": 1.2444,
"num_input_tokens_seen": 2516582400,
"step": 38400,
"train_runtime": 18445.2271,
"train_tokens_per_second": 136435.425
},
{
"epoch": 0.385,
"grad_norm": 0.5543389916419983,
"learning_rate": 0.00020575378734200616,
"loss": 1.22,
"num_input_tokens_seen": 2523136000,
"step": 38500,
"train_runtime": 18492.3307,
"train_tokens_per_second": 136442.292
},
{
"epoch": 0.386,
"grad_norm": 0.5775281190872192,
"learning_rate": 0.0002053116108750715,
"loss": 1.2277,
"num_input_tokens_seen": 2529689600,
"step": 38600,
"train_runtime": 18544.2017,
"train_tokens_per_second": 136414.047
},
{
"epoch": 0.387,
"grad_norm": 0.5202789306640625,
"learning_rate": 0.0002048688774208294,
"loss": 1.2203,
"num_input_tokens_seen": 2536243200,
"step": 38700,
"train_runtime": 18591.8641,
"train_tokens_per_second": 136416.832
},
{
"epoch": 0.388,
"grad_norm": 0.44833704829216003,
"learning_rate": 0.0002044255914376009,
"loss": 1.2209,
"num_input_tokens_seen": 2542796800,
"step": 38800,
"train_runtime": 18637.8905,
"train_tokens_per_second": 136431.577
},
{
"epoch": 0.389,
"grad_norm": 0.5180789828300476,
"learning_rate": 0.00020398175738927082,
"loss": 1.2105,
"num_input_tokens_seen": 2549350400,
"step": 38900,
"train_runtime": 18684.0663,
"train_tokens_per_second": 136445.159
},
{
"epoch": 0.39,
"grad_norm": 0.6083468794822693,
"learning_rate": 0.00020353737974524312,
"loss": 1.2136,
"num_input_tokens_seen": 2555904000,
"step": 39000,
"train_runtime": 18730.572,
"train_tokens_per_second": 136456.27
},
{
"epoch": 0.391,
"grad_norm": 0.39693883061408997,
"learning_rate": 0.00020309246298039584,
"loss": 1.2285,
"num_input_tokens_seen": 2562457600,
"step": 39100,
"train_runtime": 18784.1544,
"train_tokens_per_second": 136415.914
},
{
"epoch": 0.392,
"grad_norm": 0.5166248679161072,
"learning_rate": 0.0002026470115750357,
"loss": 1.223,
"num_input_tokens_seen": 2569011200,
"step": 39200,
"train_runtime": 18830.687,
"train_tokens_per_second": 136426.844
},
{
"epoch": 0.393,
"grad_norm": 0.4967111051082611,
"learning_rate": 0.0002022010300148535,
"loss": 1.2163,
"num_input_tokens_seen": 2575564800,
"step": 39300,
"train_runtime": 18876.8963,
"train_tokens_per_second": 136440.057
},
{
"epoch": 0.394,
"grad_norm": 0.627816915512085,
"learning_rate": 0.0002017545227908786,
"loss": 1.2328,
"num_input_tokens_seen": 2582118400,
"step": 39400,
"train_runtime": 18923.6736,
"train_tokens_per_second": 136449.109
},
{
"epoch": 0.395,
"grad_norm": 0.489969938993454,
"learning_rate": 0.00020130749439943376,
"loss": 1.224,
"num_input_tokens_seen": 2588672000,
"step": 39500,
"train_runtime": 18970.0964,
"train_tokens_per_second": 136460.666
},
{
"epoch": 0.396,
"grad_norm": 0.6713995933532715,
"learning_rate": 0.00020085994934208998,
"loss": 1.2156,
"num_input_tokens_seen": 2595225600,
"step": 39600,
"train_runtime": 19023.1241,
"train_tokens_per_second": 136424.784
},
{
"epoch": 0.397,
"grad_norm": 0.4549367427825928,
"learning_rate": 0.00020041189212562094,
"loss": 1.2094,
"num_input_tokens_seen": 2601779200,
"step": 39700,
"train_runtime": 19070.6234,
"train_tokens_per_second": 136428.639
},
{
"epoch": 0.398,
"grad_norm": 0.47548773884773254,
"learning_rate": 0.0001999633272619579,
"loss": 1.2244,
"num_input_tokens_seen": 2608332800,
"step": 39800,
"train_runtime": 19117.4992,
"train_tokens_per_second": 136436.925
},
{
"epoch": 0.399,
"grad_norm": 0.46569159626960754,
"learning_rate": 0.00019951425926814404,
"loss": 1.2189,
"num_input_tokens_seen": 2614886400,
"step": 39900,
"train_runtime": 19164.3173,
"train_tokens_per_second": 136445.581
},
{
"epoch": 0.4,
"grad_norm": 0.5518438220024109,
"learning_rate": 0.00019906469266628904,
"loss": 1.2097,
"num_input_tokens_seen": 2621440000,
"step": 40000,
"train_runtime": 19211.1586,
"train_tokens_per_second": 136454.029
},
{
"epoch": 0.401,
"grad_norm": 0.4615115821361542,
"learning_rate": 0.0001986146319835236,
"loss": 1.2177,
"num_input_tokens_seen": 2627993600,
"step": 40100,
"train_runtime": 19263.5816,
"train_tokens_per_second": 136422.897
},
{
"epoch": 0.402,
"grad_norm": 0.4154411554336548,
"learning_rate": 0.00019816408175195383,
"loss": 1.2262,
"num_input_tokens_seen": 2634547200,
"step": 40200,
"train_runtime": 19310.6242,
"train_tokens_per_second": 136429.935
},
{
"epoch": 0.403,
"grad_norm": 0.48504838347435,
"learning_rate": 0.0001977130465086155,
"loss": 1.2205,
"num_input_tokens_seen": 2641100800,
"step": 40300,
"train_runtime": 19356.9428,
"train_tokens_per_second": 136442.042
},
{
"epoch": 0.404,
"grad_norm": 0.477006196975708,
"learning_rate": 0.0001972615307954286,
"loss": 1.2099,
"num_input_tokens_seen": 2647654400,
"step": 40400,
"train_runtime": 19403.4467,
"train_tokens_per_second": 136452.788
},
{
"epoch": 0.405,
"grad_norm": 0.46401214599609375,
"learning_rate": 0.00019680953915915124,
"loss": 1.2142,
"num_input_tokens_seen": 2654208000,
"step": 40500,
"train_runtime": 19456.0604,
"train_tokens_per_second": 136420.629
},
{
"epoch": 0.406,
"grad_norm": 0.4205267131328583,
"learning_rate": 0.00019635707615133427,
"loss": 1.2233,
"num_input_tokens_seen": 2660761600,
"step": 40600,
"train_runtime": 19503.129,
"train_tokens_per_second": 136427.422
},
{
"epoch": 0.407,
"grad_norm": 0.7298253178596497,
"learning_rate": 0.00019590414632827513,
"loss": 1.2143,
"num_input_tokens_seen": 2667315200,
"step": 40700,
"train_runtime": 19550.1113,
"train_tokens_per_second": 136434.783
},
{
"epoch": 0.408,
"grad_norm": 0.47734642028808594,
"learning_rate": 0.00019545075425097204,
"loss": 1.222,
"num_input_tokens_seen": 2673868800,
"step": 40800,
"train_runtime": 19596.9887,
"train_tokens_per_second": 136442.84
},
{
"epoch": 0.409,
"grad_norm": 0.4535351097583771,
"learning_rate": 0.00019499690448507827,
"loss": 1.2373,
"num_input_tokens_seen": 2680422400,
"step": 40900,
"train_runtime": 19649.1805,
"train_tokens_per_second": 136413.954
},
{
"epoch": 0.41,
"grad_norm": 0.572079062461853,
"learning_rate": 0.00019454260160085588,
"loss": 1.2125,
"num_input_tokens_seen": 2686976000,
"step": 41000,
"train_runtime": 19697.7854,
"train_tokens_per_second": 136410.056
},
{
"epoch": 0.411,
"grad_norm": 0.4487378001213074,
"learning_rate": 0.0001940878501731299,
"loss": 1.2124,
"num_input_tokens_seen": 2693529600,
"step": 41100,
"train_runtime": 19744.9135,
"train_tokens_per_second": 136416.379
},
{
"epoch": 0.412,
"grad_norm": 0.47419917583465576,
"learning_rate": 0.00019363265478124214,
"loss": 1.2037,
"num_input_tokens_seen": 2700083200,
"step": 41200,
"train_runtime": 19791.8314,
"train_tokens_per_second": 136424.121
},
{
"epoch": 0.413,
"grad_norm": 0.6295040845870972,
"learning_rate": 0.00019317702000900516,
"loss": 1.2246,
"num_input_tokens_seen": 2706636800,
"step": 41300,
"train_runtime": 19838.5236,
"train_tokens_per_second": 136433.379
},
{
"epoch": 0.414,
"grad_norm": 0.53326016664505,
"learning_rate": 0.000192720950444656,
"loss": 1.2192,
"num_input_tokens_seen": 2713190400,
"step": 41400,
"train_runtime": 19885.4264,
"train_tokens_per_second": 136441.147
},
{
"epoch": 0.415,
"grad_norm": 0.49727046489715576,
"learning_rate": 0.00019226445068081018,
"loss": 1.2279,
"num_input_tokens_seen": 2719744000,
"step": 41500,
"train_runtime": 19937.4737,
"train_tokens_per_second": 136413.672
},
{
"epoch": 0.416,
"grad_norm": 0.47963398694992065,
"learning_rate": 0.00019180752531441523,
"loss": 1.2226,
"num_input_tokens_seen": 2726297600,
"step": 41600,
"train_runtime": 19984.6667,
"train_tokens_per_second": 136419.468
},
{
"epoch": 0.417,
"grad_norm": 0.4789304733276367,
"learning_rate": 0.00019135017894670456,
"loss": 1.2222,
"num_input_tokens_seen": 2732851200,
"step": 41700,
"train_runtime": 20032.7071,
"train_tokens_per_second": 136419.465
},
{
"epoch": 0.418,
"grad_norm": 0.6693325638771057,
"learning_rate": 0.0001908924161831509,
"loss": 1.2366,
"num_input_tokens_seen": 2739404800,
"step": 41800,
"train_runtime": 20078.7138,
"train_tokens_per_second": 136433.281
},
{
"epoch": 0.419,
"grad_norm": 0.41989439725875854,
"learning_rate": 0.0001904342416334203,
"loss": 1.2212,
"num_input_tokens_seen": 2745958400,
"step": 41900,
"train_runtime": 20125.0521,
"train_tokens_per_second": 136444.785
},
{
"epoch": 0.42,
"grad_norm": 0.5444014072418213,
"learning_rate": 0.00018997565991132532,
"loss": 1.2164,
"num_input_tokens_seen": 2752512000,
"step": 42000,
"train_runtime": 20177.4596,
"train_tokens_per_second": 136415.191
},
{
"epoch": 0.421,
"grad_norm": 0.5790873169898987,
"learning_rate": 0.0001895166756347789,
"loss": 1.215,
"num_input_tokens_seen": 2759065600,
"step": 42100,
"train_runtime": 20224.878,
"train_tokens_per_second": 136419.394
},
{
"epoch": 0.422,
"grad_norm": 0.4666343927383423,
"learning_rate": 0.0001890572934257475,
"loss": 1.2229,
"num_input_tokens_seen": 2765619200,
"step": 42200,
"train_runtime": 20270.922,
"train_tokens_per_second": 136432.827
},
{
"epoch": 0.423,
"grad_norm": 0.4322357177734375,
"learning_rate": 0.00018859751791020497,
"loss": 1.2258,
"num_input_tokens_seen": 2772172800,
"step": 42300,
"train_runtime": 20317.4494,
"train_tokens_per_second": 136442.954
},
{
"epoch": 0.424,
"grad_norm": 0.6240208148956299,
"learning_rate": 0.0001881373537180856,
"loss": 1.221,
"num_input_tokens_seen": 2778726400,
"step": 42400,
"train_runtime": 20364.5753,
"train_tokens_per_second": 136449.023
},
{
"epoch": 0.425,
"grad_norm": 0.5865579843521118,
"learning_rate": 0.00018767680548323766,
"loss": 1.2244,
"num_input_tokens_seen": 2785280000,
"step": 42500,
"train_runtime": 20417.9029,
"train_tokens_per_second": 136413.617
},
{
"epoch": 0.426,
"grad_norm": 0.5201649069786072,
"learning_rate": 0.0001872158778433768,
"loss": 1.2076,
"num_input_tokens_seen": 2791833600,
"step": 42600,
"train_runtime": 20464.7135,
"train_tokens_per_second": 136421.827
},
{
"epoch": 0.427,
"grad_norm": 0.5092735290527344,
"learning_rate": 0.0001867545754400392,
"loss": 1.2057,
"num_input_tokens_seen": 2798387200,
"step": 42700,
"train_runtime": 20511.0273,
"train_tokens_per_second": 136433.303
},
{
"epoch": 0.428,
"grad_norm": 0.4439486265182495,
"learning_rate": 0.000186292902918535,
"loss": 1.209,
"num_input_tokens_seen": 2804940800,
"step": 42800,
"train_runtime": 20558.3684,
"train_tokens_per_second": 136437.909
},
{
"epoch": 0.429,
"grad_norm": 0.4466177225112915,
"learning_rate": 0.00018583086492790136,
"loss": 1.218,
"num_input_tokens_seen": 2811494400,
"step": 42900,
"train_runtime": 20605.5543,
"train_tokens_per_second": 136443.522
},
{
"epoch": 0.43,
"grad_norm": 0.5813594460487366,
"learning_rate": 0.00018536846612085566,
"loss": 1.2161,
"num_input_tokens_seen": 2818048000,
"step": 43000,
"train_runtime": 20658.6134,
"train_tokens_per_second": 136410.317
},
{
"epoch": 0.431,
"grad_norm": 0.49140629172325134,
"learning_rate": 0.00018490571115374878,
"loss": 1.227,
"num_input_tokens_seen": 2824601600,
"step": 43100,
"train_runtime": 20705.6255,
"train_tokens_per_second": 136417.11
},
{
"epoch": 0.432,
"grad_norm": 0.4938826858997345,
"learning_rate": 0.00018444260468651816,
"loss": 1.2252,
"num_input_tokens_seen": 2831155200,
"step": 43200,
"train_runtime": 20752.3571,
"train_tokens_per_second": 136425.717
},
{
"epoch": 0.433,
"grad_norm": 0.5228791832923889,
"learning_rate": 0.00018397915138264068,
"loss": 1.2274,
"num_input_tokens_seen": 2837708800,
"step": 43300,
"train_runtime": 20799.4436,
"train_tokens_per_second": 136431.957
},
{
"epoch": 0.434,
"grad_norm": 0.46896296739578247,
"learning_rate": 0.00018351535590908606,
"loss": 1.2043,
"num_input_tokens_seen": 2844262400,
"step": 43400,
"train_runtime": 20845.6184,
"train_tokens_per_second": 136444.137
},
{
"epoch": 0.435,
"grad_norm": 0.4269004464149475,
"learning_rate": 0.00018305122293626948,
"loss": 1.2213,
"num_input_tokens_seen": 2850816000,
"step": 43500,
"train_runtime": 20897.7485,
"train_tokens_per_second": 136417.375
},
{
"epoch": 0.436,
"grad_norm": 0.6213890314102173,
"learning_rate": 0.00018258675713800492,
"loss": 1.2096,
"num_input_tokens_seen": 2857369600,
"step": 43600,
"train_runtime": 20944.9642,
"train_tokens_per_second": 136422.749
},
{
"epoch": 0.437,
"grad_norm": 0.4281384348869324,
"learning_rate": 0.00018212196319145773,
"loss": 1.2111,
"num_input_tokens_seen": 2863923200,
"step": 43700,
"train_runtime": 20992.0443,
"train_tokens_per_second": 136428.98
},
{
"epoch": 0.438,
"grad_norm": 1.044310212135315,
"learning_rate": 0.00018165684577709778,
"loss": 1.2142,
"num_input_tokens_seen": 2870476800,
"step": 43800,
"train_runtime": 21039.718,
"train_tokens_per_second": 136431.334
},
{
"epoch": 0.439,
"grad_norm": 0.445425808429718,
"learning_rate": 0.0001811914095786524,
"loss": 1.218,
"num_input_tokens_seen": 2877030400,
"step": 43900,
"train_runtime": 21088.215,
"train_tokens_per_second": 136428.351
},
{
"epoch": 0.44,
"grad_norm": 0.43947216868400574,
"learning_rate": 0.0001807256592830588,
"loss": 1.2124,
"num_input_tokens_seen": 2883584000,
"step": 44000,
"train_runtime": 21136.0286,
"train_tokens_per_second": 136429.793
},
{
"epoch": 0.441,
"grad_norm": 0.5147203803062439,
"learning_rate": 0.00018025959958041732,
"loss": 1.2227,
"num_input_tokens_seen": 2890137600,
"step": 44100,
"train_runtime": 21182.9913,
"train_tokens_per_second": 136436.708
},
{
"epoch": 0.442,
"grad_norm": 0.473652184009552,
"learning_rate": 0.00017979323516394407,
"loss": 1.2277,
"num_input_tokens_seen": 2896691200,
"step": 44200,
"train_runtime": 21236.5796,
"train_tokens_per_second": 136401.024
},
{
"epoch": 0.443,
"grad_norm": 0.4356568157672882,
"learning_rate": 0.00017932657072992344,
"loss": 1.2018,
"num_input_tokens_seen": 2903244800,
"step": 44300,
"train_runtime": 21282.9387,
"train_tokens_per_second": 136411.838
},
{
"epoch": 0.444,
"grad_norm": 0.4458017647266388,
"learning_rate": 0.00017885961097766117,
"loss": 1.2124,
"num_input_tokens_seen": 2909798400,
"step": 44400,
"train_runtime": 21331.1223,
"train_tokens_per_second": 136410.938
},
{
"epoch": 0.445,
"grad_norm": 0.5065773725509644,
"learning_rate": 0.00017839236060943674,
"loss": 1.2262,
"num_input_tokens_seen": 2916352000,
"step": 44500,
"train_runtime": 21377.5493,
"train_tokens_per_second": 136421.25
},
{
"epoch": 0.446,
"grad_norm": 0.5424425601959229,
"learning_rate": 0.0001779248243304562,
"loss": 1.2171,
"num_input_tokens_seen": 2922905600,
"step": 44600,
"train_runtime": 21424.9021,
"train_tokens_per_second": 136425.622
},
{
"epoch": 0.447,
"grad_norm": 0.4595748484134674,
"learning_rate": 0.00017745700684880465,
"loss": 1.2039,
"num_input_tokens_seen": 2929459200,
"step": 44700,
"train_runtime": 21472.2167,
"train_tokens_per_second": 136430.218
},
{
"epoch": 0.448,
"grad_norm": 0.5353960990905762,
"learning_rate": 0.000176988912875399,
"loss": 1.2075,
"num_input_tokens_seen": 2936012800,
"step": 44800,
"train_runtime": 21524.5148,
"train_tokens_per_second": 136403.205
},
{
"epoch": 0.449,
"grad_norm": 0.4949302673339844,
"learning_rate": 0.00017652054712394028,
"loss": 1.2174,
"num_input_tokens_seen": 2942566400,
"step": 44900,
"train_runtime": 21571.6626,
"train_tokens_per_second": 136408.883
},
{
"epoch": 0.45,
"grad_norm": 0.5596060752868652,
"learning_rate": 0.0001760519143108665,
"loss": 1.2178,
"num_input_tokens_seen": 2949120000,
"step": 45000,
"train_runtime": 21618.3195,
"train_tokens_per_second": 136417.634
},
{
"epoch": 0.451,
"grad_norm": 0.5348083972930908,
"learning_rate": 0.00017558301915530483,
"loss": 1.215,
"num_input_tokens_seen": 2955673600,
"step": 45100,
"train_runtime": 21666.1069,
"train_tokens_per_second": 136419.229
},
{
"epoch": 0.452,
"grad_norm": 0.46748441457748413,
"learning_rate": 0.00017511386637902428,
"loss": 1.2104,
"num_input_tokens_seen": 2962227200,
"step": 45200,
"train_runtime": 21713.1957,
"train_tokens_per_second": 136425.206
},
{
"epoch": 0.453,
"grad_norm": 0.47188806533813477,
"learning_rate": 0.00017464446070638814,
"loss": 1.213,
"num_input_tokens_seen": 2968780800,
"step": 45300,
"train_runtime": 21760.1393,
"train_tokens_per_second": 136432.068
},
{
"epoch": 0.454,
"grad_norm": 0.5225762128829956,
"learning_rate": 0.00017417480686430622,
"loss": 1.2152,
"num_input_tokens_seen": 2975334400,
"step": 45400,
"train_runtime": 21812.7666,
"train_tokens_per_second": 136403.348
},
{
"epoch": 0.455,
"grad_norm": 0.5889186263084412,
"learning_rate": 0.00017370490958218765,
"loss": 1.2214,
"num_input_tokens_seen": 2981888000,
"step": 45500,
"train_runtime": 21859.0263,
"train_tokens_per_second": 136414.493
},
{
"epoch": 0.456,
"grad_norm": 0.6613258719444275,
"learning_rate": 0.00017323477359189272,
"loss": 1.2334,
"num_input_tokens_seen": 2988441600,
"step": 45600,
"train_runtime": 21905.9003,
"train_tokens_per_second": 136421.766
},
{
"epoch": 0.457,
"grad_norm": 0.4657646715641022,
"learning_rate": 0.00017276440362768564,
"loss": 1.2132,
"num_input_tokens_seen": 2994995200,
"step": 45700,
"train_runtime": 21952.9851,
"train_tokens_per_second": 136427.697
},
{
"epoch": 0.458,
"grad_norm": 0.8410550355911255,
"learning_rate": 0.0001722938044261868,
"loss": 1.2073,
"num_input_tokens_seen": 3001548800,
"step": 45800,
"train_runtime": 22005.352,
"train_tokens_per_second": 136400.854
},
{
"epoch": 0.459,
"grad_norm": 0.7687750458717346,
"learning_rate": 0.0001718229807263249,
"loss": 1.2116,
"num_input_tokens_seen": 3008102400,
"step": 45900,
"train_runtime": 22051.2762,
"train_tokens_per_second": 136413.982
},
{
"epoch": 0.46,
"grad_norm": 0.40700653195381165,
"learning_rate": 0.0001713519372692894,
"loss": 1.2082,
"num_input_tokens_seen": 3014656000,
"step": 46000,
"train_runtime": 22102.8898,
"train_tokens_per_second": 136391.939
},
{
"epoch": 0.461,
"grad_norm": 0.44239944219589233,
"learning_rate": 0.0001708806787984826,
"loss": 1.2177,
"num_input_tokens_seen": 3021209600,
"step": 46100,
"train_runtime": 22149.1222,
"train_tokens_per_second": 136403.13
},
{
"epoch": 0.462,
"grad_norm": 0.4981868267059326,
"learning_rate": 0.00017040921005947212,
"loss": 1.2073,
"num_input_tokens_seen": 3027763200,
"step": 46200,
"train_runtime": 22195.5009,
"train_tokens_per_second": 136413.376
},
{
"epoch": 0.463,
"grad_norm": 0.5651112198829651,
"learning_rate": 0.0001699375357999429,
"loss": 1.2098,
"num_input_tokens_seen": 3034316800,
"step": 46300,
"train_runtime": 22241.367,
"train_tokens_per_second": 136426.722
},
{
"epoch": 0.464,
"grad_norm": 1.1314237117767334,
"learning_rate": 0.0001694656607696496,
"loss": 1.2335,
"num_input_tokens_seen": 3040870400,
"step": 46400,
"train_runtime": 22294.8896,
"train_tokens_per_second": 136393.158
},
{
"epoch": 0.465,
"grad_norm": 0.568980872631073,
"learning_rate": 0.0001689935897203684,
"loss": 1.2096,
"num_input_tokens_seen": 3047424000,
"step": 46500,
"train_runtime": 22342.7849,
"train_tokens_per_second": 136394.098
},
{
"epoch": 0.466,
"grad_norm": 0.7110226154327393,
"learning_rate": 0.0001685213274058496,
"loss": 1.2136,
"num_input_tokens_seen": 3053977600,
"step": 46600,
"train_runtime": 22393.3193,
"train_tokens_per_second": 136378.96
},
{
"epoch": 0.467,
"grad_norm": 0.5052018761634827,
"learning_rate": 0.00016804887858176944,
"loss": 1.2237,
"num_input_tokens_seen": 3060531200,
"step": 46700,
"train_runtime": 22441.2606,
"train_tokens_per_second": 136379.647
},
{
"epoch": 0.468,
"grad_norm": 0.4663156270980835,
"learning_rate": 0.00016757624800568238,
"loss": 1.2071,
"num_input_tokens_seen": 3067084800,
"step": 46800,
"train_runtime": 22487.9084,
"train_tokens_per_second": 136388.175
},
{
"epoch": 0.469,
"grad_norm": 0.5441033840179443,
"learning_rate": 0.00016710344043697301,
"loss": 1.2078,
"num_input_tokens_seen": 3073638400,
"step": 46900,
"train_runtime": 22534.6023,
"train_tokens_per_second": 136396.39
},
{
"epoch": 0.47,
"grad_norm": 0.4578142464160919,
"learning_rate": 0.0001666304606368083,
"loss": 1.1956,
"num_input_tokens_seen": 3080192000,
"step": 47000,
"train_runtime": 22587.0441,
"train_tokens_per_second": 136369.858
},
{
"epoch": 0.471,
"grad_norm": 0.6252749562263489,
"learning_rate": 0.00016615731336808962,
"loss": 1.1911,
"num_input_tokens_seen": 3086745600,
"step": 47100,
"train_runtime": 22634.7186,
"train_tokens_per_second": 136372.166
},
{
"epoch": 0.472,
"grad_norm": 0.45418813824653625,
"learning_rate": 0.0001656840033954047,
"loss": 1.22,
"num_input_tokens_seen": 3093299200,
"step": 47200,
"train_runtime": 22681.221,
"train_tokens_per_second": 136381.511
},
{
"epoch": 0.473,
"grad_norm": 0.55946284532547,
"learning_rate": 0.00016521053548497973,
"loss": 1.2073,
"num_input_tokens_seen": 3099852800,
"step": 47300,
"train_runtime": 22728.7635,
"train_tokens_per_second": 136384.577
},
{
"epoch": 0.474,
"grad_norm": 0.508859395980835,
"learning_rate": 0.0001647369144046313,
"loss": 1.1957,
"num_input_tokens_seen": 3106406400,
"step": 47400,
"train_runtime": 22775.8652,
"train_tokens_per_second": 136390.27
},
{
"epoch": 0.475,
"grad_norm": 0.5557622313499451,
"learning_rate": 0.00016426314492371842,
"loss": 1.1996,
"num_input_tokens_seen": 3112960000,
"step": 47500,
"train_runtime": 22823.5391,
"train_tokens_per_second": 136392.519
},
{
"epoch": 0.476,
"grad_norm": 0.5686858296394348,
"learning_rate": 0.0001637892318130945,
"loss": 1.201,
"num_input_tokens_seen": 3119513600,
"step": 47600,
"train_runtime": 22875.0526,
"train_tokens_per_second": 136371.866
},
{
"epoch": 0.477,
"grad_norm": 0.47568413615226746,
"learning_rate": 0.00016331517984505934,
"loss": 1.2132,
"num_input_tokens_seen": 3126067200,
"step": 47700,
"train_runtime": 22923.2754,
"train_tokens_per_second": 136370.878
},
{
"epoch": 0.478,
"grad_norm": 0.40612325072288513,
"learning_rate": 0.00016284099379331092,
"loss": 1.2085,
"num_input_tokens_seen": 3132620800,
"step": 47800,
"train_runtime": 22970.2831,
"train_tokens_per_second": 136377.109
},
{
"epoch": 0.479,
"grad_norm": 0.491755872964859,
"learning_rate": 0.00016236667843289759,
"loss": 1.206,
"num_input_tokens_seen": 3139174400,
"step": 47900,
"train_runtime": 23016.8676,
"train_tokens_per_second": 136385.821
},
{
"epoch": 0.48,
"grad_norm": 1.2421867847442627,
"learning_rate": 0.00016189223854016973,
"loss": 1.1991,
"num_input_tokens_seen": 3145728000,
"step": 48000,
"train_runtime": 23070.1067,
"train_tokens_per_second": 136355.156
},
{
"epoch": 0.481,
"grad_norm": 0.44709935784339905,
"learning_rate": 0.00016141767889273182,
"loss": 1.1987,
"num_input_tokens_seen": 3152281600,
"step": 48100,
"train_runtime": 23117.6704,
"train_tokens_per_second": 136358.1
},
{
"epoch": 0.482,
"grad_norm": 0.6956078410148621,
"learning_rate": 0.00016094300426939417,
"loss": 1.206,
"num_input_tokens_seen": 3158835200,
"step": 48200,
"train_runtime": 23164.6084,
"train_tokens_per_second": 136364.714
},
{
"epoch": 0.483,
"grad_norm": 0.4756148159503937,
"learning_rate": 0.00016046821945012505,
"loss": 1.213,
"num_input_tokens_seen": 3165388800,
"step": 48300,
"train_runtime": 23212.4256,
"train_tokens_per_second": 136366.137
},
{
"epoch": 0.484,
"grad_norm": 0.4668136239051819,
"learning_rate": 0.00015999332921600226,
"loss": 1.2027,
"num_input_tokens_seen": 3171942400,
"step": 48400,
"train_runtime": 23260.1957,
"train_tokens_per_second": 136367.829
},
{
"epoch": 0.485,
"grad_norm": 0.48166415095329285,
"learning_rate": 0.00015951833834916532,
"loss": 1.1885,
"num_input_tokens_seen": 3178496000,
"step": 48500,
"train_runtime": 23308.4042,
"train_tokens_per_second": 136366.951
},
{
"epoch": 0.486,
"grad_norm": 1.4835230112075806,
"learning_rate": 0.00015904325163276672,
"loss": 1.2144,
"num_input_tokens_seen": 3185049600,
"step": 48600,
"train_runtime": 23355.0119,
"train_tokens_per_second": 136375.422
},
{
"epoch": 0.487,
"grad_norm": 0.47993043065071106,
"learning_rate": 0.00015856807385092466,
"loss": 1.2092,
"num_input_tokens_seen": 3191603200,
"step": 48700,
"train_runtime": 23408.2289,
"train_tokens_per_second": 136345.352
},
{
"epoch": 0.488,
"grad_norm": 0.4617721736431122,
"learning_rate": 0.00015809280978867405,
"loss": 1.2079,
"num_input_tokens_seen": 3198156800,
"step": 48800,
"train_runtime": 23456.3091,
"train_tokens_per_second": 136345.27
},
{
"epoch": 0.489,
"grad_norm": 0.4698822796344757,
"learning_rate": 0.0001576174642319187,
"loss": 1.2221,
"num_input_tokens_seen": 3204710400,
"step": 48900,
"train_runtime": 23502.92,
"train_tokens_per_second": 136353.713
},
{
"epoch": 0.49,
"grad_norm": 0.5454009771347046,
"learning_rate": 0.0001571420419673831,
"loss": 1.201,
"num_input_tokens_seen": 3211264000,
"step": 49000,
"train_runtime": 23550.5868,
"train_tokens_per_second": 136356.008
},
{
"epoch": 0.491,
"grad_norm": 0.9021556973457336,
"learning_rate": 0.0001566665477825642,
"loss": 1.2047,
"num_input_tokens_seen": 3217817600,
"step": 49100,
"train_runtime": 23597.4655,
"train_tokens_per_second": 136362.848
},
{
"epoch": 0.492,
"grad_norm": 0.4959240257740021,
"learning_rate": 0.0001561909864656831,
"loss": 1.2042,
"num_input_tokens_seen": 3224371200,
"step": 49200,
"train_runtime": 23650.6048,
"train_tokens_per_second": 136333.562
},
{
"epoch": 0.493,
"grad_norm": 0.554251492023468,
"learning_rate": 0.00015571536280563705,
"loss": 1.2163,
"num_input_tokens_seen": 3230924800,
"step": 49300,
"train_runtime": 23697.3685,
"train_tokens_per_second": 136341.079
},
{
"epoch": 0.494,
"grad_norm": 0.5000952482223511,
"learning_rate": 0.000155239681591951,
"loss": 1.2086,
"num_input_tokens_seen": 3237478400,
"step": 49400,
"train_runtime": 23745.4988,
"train_tokens_per_second": 136340.72
},
{
"epoch": 0.495,
"grad_norm": 0.7438832521438599,
"learning_rate": 0.00015476394761472953,
"loss": 1.1999,
"num_input_tokens_seen": 3244032000,
"step": 49500,
"train_runtime": 23793.3349,
"train_tokens_per_second": 136342.048
},
{
"epoch": 0.496,
"grad_norm": 0.4872761368751526,
"learning_rate": 0.00015428816566460843,
"loss": 1.194,
"num_input_tokens_seen": 3250585600,
"step": 49600,
"train_runtime": 23839.649,
"train_tokens_per_second": 136352.074
},
{
"epoch": 0.497,
"grad_norm": 0.48635321855545044,
"learning_rate": 0.00015381234053270669,
"loss": 1.1957,
"num_input_tokens_seen": 3257139200,
"step": 49700,
"train_runtime": 23886.4418,
"train_tokens_per_second": 136359.33
},
{
"epoch": 0.498,
"grad_norm": 0.899361252784729,
"learning_rate": 0.0001533364770105781,
"loss": 1.201,
"num_input_tokens_seen": 3263692800,
"step": 49800,
"train_runtime": 23933.6337,
"train_tokens_per_second": 136364.283
},
{
"epoch": 0.499,
"grad_norm": 0.5460925698280334,
"learning_rate": 0.0001528605798901631,
"loss": 1.2086,
"num_input_tokens_seen": 3270246400,
"step": 49900,
"train_runtime": 23985.6033,
"train_tokens_per_second": 136342.053
},
{
"epoch": 0.5,
"grad_norm": 0.4763907194137573,
"learning_rate": 0.00015238465396374027,
"loss": 1.1987,
"num_input_tokens_seen": 3276800000,
"step": 50000,
"train_runtime": 24033.0829,
"train_tokens_per_second": 136345.388
},
{
"epoch": 0.501,
"grad_norm": 0.4716530442237854,
"learning_rate": 0.00015190870402387858,
"loss": 1.2083,
"num_input_tokens_seen": 3283353600,
"step": 50100,
"train_runtime": 24080.0017,
"train_tokens_per_second": 136351.884
},
{
"epoch": 0.502,
"grad_norm": 0.65655517578125,
"learning_rate": 0.00015143273486338857,
"loss": 1.2026,
"num_input_tokens_seen": 3289907200,
"step": 50200,
"train_runtime": 24132.759,
"train_tokens_per_second": 136325.366
},
{
"epoch": 0.503,
"grad_norm": 0.494205117225647,
"learning_rate": 0.00015095675127527438,
"loss": 1.208,
"num_input_tokens_seen": 3296460800,
"step": 50300,
"train_runtime": 24179.9126,
"train_tokens_per_second": 136330.551
},
{
"epoch": 0.504,
"grad_norm": 0.485307902097702,
"learning_rate": 0.00015048075805268547,
"loss": 1.1965,
"num_input_tokens_seen": 3303014400,
"step": 50400,
"train_runtime": 24227.2685,
"train_tokens_per_second": 136334.577
},
{
"epoch": 0.505,
"grad_norm": 0.4843132793903351,
"learning_rate": 0.00015000475998886825,
"loss": 1.2028,
"num_input_tokens_seen": 3309568000,
"step": 50500,
"train_runtime": 24274.7092,
"train_tokens_per_second": 136338.111
},
{
"epoch": 0.506,
"grad_norm": 0.4654887020587921,
"learning_rate": 0.00014952876187711804,
"loss": 1.2151,
"num_input_tokens_seen": 3316121600,
"step": 50600,
"train_runtime": 24321.273,
"train_tokens_per_second": 136346.547
},
{
"epoch": 0.507,
"grad_norm": 0.4625457525253296,
"learning_rate": 0.00014905276851073053,
"loss": 1.209,
"num_input_tokens_seen": 3322675200,
"step": 50700,
"train_runtime": 24374.7609,
"train_tokens_per_second": 136316.217
},
{
"epoch": 0.508,
"grad_norm": 0.527594268321991,
"learning_rate": 0.00014857678468295352,
"loss": 1.2043,
"num_input_tokens_seen": 3329228800,
"step": 50800,
"train_runtime": 24422.59,
"train_tokens_per_second": 136317.598
},
{
"epoch": 0.509,
"grad_norm": 0.4604775011539459,
"learning_rate": 0.00014810081518693902,
"loss": 1.1895,
"num_input_tokens_seen": 3335782400,
"step": 50900,
"train_runtime": 24468.7673,
"train_tokens_per_second": 136328.176
},
{
"epoch": 0.51,
"grad_norm": 0.4973219335079193,
"learning_rate": 0.0001476248648156945,
"loss": 1.1977,
"num_input_tokens_seen": 3342336000,
"step": 51000,
"train_runtime": 24516.9703,
"train_tokens_per_second": 136327.448
},
{
"epoch": 0.511,
"grad_norm": 0.42552006244659424,
"learning_rate": 0.00014714893836203485,
"loss": 1.2109,
"num_input_tokens_seen": 3348889600,
"step": 51100,
"train_runtime": 24564.5614,
"train_tokens_per_second": 136330.12
},
{
"epoch": 0.512,
"grad_norm": 0.5027197003364563,
"learning_rate": 0.0001466730406185343,
"loss": 1.1949,
"num_input_tokens_seen": 3355443200,
"step": 51200,
"train_runtime": 24611.9784,
"train_tokens_per_second": 136333.745
},
{
"epoch": 0.513,
"grad_norm": 0.6097121238708496,
"learning_rate": 0.0001461971763774778,
"loss": 1.2,
"num_input_tokens_seen": 3361996800,
"step": 51300,
"train_runtime": 24665.0046,
"train_tokens_per_second": 136306.352
},
{
"epoch": 0.514,
"grad_norm": 0.9953346848487854,
"learning_rate": 0.0001457213504308129,
"loss": 1.1919,
"num_input_tokens_seen": 3368550400,
"step": 51400,
"train_runtime": 24711.3817,
"train_tokens_per_second": 136315.745
},
{
"epoch": 0.515,
"grad_norm": 0.5582478642463684,
"learning_rate": 0.00014524556757010177,
"loss": 1.1924,
"num_input_tokens_seen": 3375104000,
"step": 51500,
"train_runtime": 24758.0554,
"train_tokens_per_second": 136323.469
},
{
"epoch": 0.516,
"grad_norm": 0.5084798336029053,
"learning_rate": 0.00014476983258647234,
"loss": 1.2068,
"num_input_tokens_seen": 3381657600,
"step": 51600,
"train_runtime": 24807.6959,
"train_tokens_per_second": 136314.86
},
{
"epoch": 0.517,
"grad_norm": 0.6907379627227783,
"learning_rate": 0.0001442941502705707,
"loss": 1.1945,
"num_input_tokens_seen": 3388211200,
"step": 51700,
"train_runtime": 24855.3849,
"train_tokens_per_second": 136316.988
},
{
"epoch": 0.518,
"grad_norm": 0.6037150025367737,
"learning_rate": 0.0001438185254125125,
"loss": 1.2053,
"num_input_tokens_seen": 3394764800,
"step": 51800,
"train_runtime": 24901.8712,
"train_tokens_per_second": 136325.691
},
{
"epoch": 0.519,
"grad_norm": 0.6816796064376831,
"learning_rate": 0.00014334296280183473,
"loss": 1.2019,
"num_input_tokens_seen": 3401318400,
"step": 51900,
"train_runtime": 24955.4949,
"train_tokens_per_second": 136295.37
},
{
"epoch": 0.52,
"grad_norm": 0.5201036930084229,
"learning_rate": 0.00014286746722744768,
"loss": 1.206,
"num_input_tokens_seen": 3407872000,
"step": 52000,
"train_runtime": 25002.9753,
"train_tokens_per_second": 136298.659
},
{
"epoch": 0.521,
"grad_norm": 0.5104642510414124,
"learning_rate": 0.00014239204347758647,
"loss": 1.2029,
"num_input_tokens_seen": 3414425600,
"step": 52100,
"train_runtime": 25051.9745,
"train_tokens_per_second": 136293.672
},
{
"epoch": 0.522,
"grad_norm": 0.4965505003929138,
"learning_rate": 0.00014191669633976294,
"loss": 1.1961,
"num_input_tokens_seen": 3420979200,
"step": 52200,
"train_runtime": 25099.3949,
"train_tokens_per_second": 136297.278
},
{
"epoch": 0.523,
"grad_norm": 0.5390327572822571,
"learning_rate": 0.00014144143060071756,
"loss": 1.194,
"num_input_tokens_seen": 3427532800,
"step": 52300,
"train_runtime": 25146.6291,
"train_tokens_per_second": 136301.879
},
{
"epoch": 0.524,
"grad_norm": 2.647089719772339,
"learning_rate": 0.000140966251046371,
"loss": 1.2006,
"num_input_tokens_seen": 3434086400,
"step": 52400,
"train_runtime": 25194.2742,
"train_tokens_per_second": 136304.24
},
{
"epoch": 0.525,
"grad_norm": 0.46030643582344055,
"learning_rate": 0.0001404911624617761,
"loss": 1.2071,
"num_input_tokens_seen": 3440640000,
"step": 52500,
"train_runtime": 25247.7567,
"train_tokens_per_second": 136275.077
},
{
"epoch": 0.526,
"grad_norm": 0.487699031829834,
"learning_rate": 0.00014001616963106966,
"loss": 1.2046,
"num_input_tokens_seen": 3447193600,
"step": 52600,
"train_runtime": 25295.5054,
"train_tokens_per_second": 136276.921
},
{
"epoch": 0.527,
"grad_norm": 0.4782906472682953,
"learning_rate": 0.00013954127733742416,
"loss": 1.1891,
"num_input_tokens_seen": 3453747200,
"step": 52700,
"train_runtime": 25344.1317,
"train_tokens_per_second": 136274.039
},
{
"epoch": 0.528,
"grad_norm": 0.595632016658783,
"learning_rate": 0.0001390664903629998,
"loss": 1.1867,
"num_input_tokens_seen": 3460300800,
"step": 52800,
"train_runtime": 25391.6777,
"train_tokens_per_second": 136276.966
},
{
"epoch": 0.529,
"grad_norm": 0.5201537609100342,
"learning_rate": 0.0001385918134888961,
"loss": 1.1955,
"num_input_tokens_seen": 3466854400,
"step": 52900,
"train_runtime": 25439.3874,
"train_tokens_per_second": 136279.005
},
{
"epoch": 0.53,
"grad_norm": 0.4726644456386566,
"learning_rate": 0.00013811725149510387,
"loss": 1.206,
"num_input_tokens_seen": 3473408000,
"step": 53000,
"train_runtime": 25492.0415,
"train_tokens_per_second": 136254.603
},
{
"epoch": 0.531,
"grad_norm": 0.5846008062362671,
"learning_rate": 0.0001376428091604572,
"loss": 1.2117,
"num_input_tokens_seen": 3479961600,
"step": 53100,
"train_runtime": 25540.3083,
"train_tokens_per_second": 136253.704
},
{
"epoch": 0.532,
"grad_norm": 0.4758647382259369,
"learning_rate": 0.00013716849126258512,
"loss": 1.2042,
"num_input_tokens_seen": 3486515200,
"step": 53200,
"train_runtime": 25589.0853,
"train_tokens_per_second": 136250.091
},
{
"epoch": 0.533,
"grad_norm": 0.4607105255126953,
"learning_rate": 0.00013669430257786354,
"loss": 1.1992,
"num_input_tokens_seen": 3493068800,
"step": 53300,
"train_runtime": 25636.4376,
"train_tokens_per_second": 136254.063
},
{
"epoch": 0.534,
"grad_norm": 0.6885077357292175,
"learning_rate": 0.00013622024788136728,
"loss": 1.2006,
"num_input_tokens_seen": 3499622400,
"step": 53400,
"train_runtime": 25684.4816,
"train_tokens_per_second": 136254.352
},
{
"epoch": 0.535,
"grad_norm": 0.6578366160392761,
"learning_rate": 0.00013574633194682185,
"loss": 1.1948,
"num_input_tokens_seen": 3506176000,
"step": 53500,
"train_runtime": 25730.7322,
"train_tokens_per_second": 136264.136
},
{
"epoch": 0.536,
"grad_norm": 0.4718693196773529,
"learning_rate": 0.0001352725595465555,
"loss": 1.2,
"num_input_tokens_seen": 3512729600,
"step": 53600,
"train_runtime": 25783.9922,
"train_tokens_per_second": 136236.839
},
{
"epoch": 0.537,
"grad_norm": 0.5561531186103821,
"learning_rate": 0.000134798935451451,
"loss": 1.2052,
"num_input_tokens_seen": 3519283200,
"step": 53700,
"train_runtime": 25832.8858,
"train_tokens_per_second": 136232.677
},
{
"epoch": 0.538,
"grad_norm": 0.5250628590583801,
"learning_rate": 0.00013432546443089768,
"loss": 1.2,
"num_input_tokens_seen": 3525836800,
"step": 53800,
"train_runtime": 25880.0084,
"train_tokens_per_second": 136237.854
},
{
"epoch": 0.539,
"grad_norm": 0.5457636117935181,
"learning_rate": 0.0001338521512527436,
"loss": 1.1944,
"num_input_tokens_seen": 3532390400,
"step": 53900,
"train_runtime": 25927.8228,
"train_tokens_per_second": 136239.376
},
{
"epoch": 0.54,
"grad_norm": 0.4437522292137146,
"learning_rate": 0.00013337900068324712,
"loss": 1.1912,
"num_input_tokens_seen": 3538944000,
"step": 54000,
"train_runtime": 25975.6777,
"train_tokens_per_second": 136240.68
},
{
"epoch": 0.541,
"grad_norm": 0.5343025326728821,
"learning_rate": 0.00013290601748702918,
"loss": 1.188,
"num_input_tokens_seen": 3545497600,
"step": 54100,
"train_runtime": 26027.6243,
"train_tokens_per_second": 136220.562
},
{
"epoch": 0.542,
"grad_norm": 0.4907335042953491,
"learning_rate": 0.00013243320642702543,
"loss": 1.1909,
"num_input_tokens_seen": 3552051200,
"step": 54200,
"train_runtime": 26075.5648,
"train_tokens_per_second": 136221.448
},
{
"epoch": 0.543,
"grad_norm": 0.7268043160438538,
"learning_rate": 0.0001319605722644379,
"loss": 1.1911,
"num_input_tokens_seen": 3558604800,
"step": 54300,
"train_runtime": 26122.2114,
"train_tokens_per_second": 136229.079
},
{
"epoch": 0.544,
"grad_norm": 1.3769776821136475,
"learning_rate": 0.0001314881197586874,
"loss": 1.224,
"num_input_tokens_seen": 3565158400,
"step": 54400,
"train_runtime": 26170.2324,
"train_tokens_per_second": 136229.528
},
{
"epoch": 0.545,
"grad_norm": 0.7141419649124146,
"learning_rate": 0.0001310158536673654,
"loss": 1.2025,
"num_input_tokens_seen": 3571712000,
"step": 54500,
"train_runtime": 26217.6992,
"train_tokens_per_second": 136232.854
},
{
"epoch": 0.546,
"grad_norm": 0.5124280452728271,
"learning_rate": 0.0001305437787461862,
"loss": 1.1972,
"num_input_tokens_seen": 3578265600,
"step": 54600,
"train_runtime": 26264.9719,
"train_tokens_per_second": 136237.176
},
{
"epoch": 0.547,
"grad_norm": 0.5609524250030518,
"learning_rate": 0.00013007189974893903,
"loss": 1.1924,
"num_input_tokens_seen": 3584819200,
"step": 54700,
"train_runtime": 26319.2824,
"train_tokens_per_second": 136205.051
},
{
"epoch": 0.548,
"grad_norm": 0.5220986604690552,
"learning_rate": 0.00012960022142744016,
"loss": 1.188,
"num_input_tokens_seen": 3591372800,
"step": 54800,
"train_runtime": 26367.119,
"train_tokens_per_second": 136206.493
},
{
"epoch": 0.549,
"grad_norm": 0.5159165263175964,
"learning_rate": 0.00012912874853148506,
"loss": 1.1891,
"num_input_tokens_seen": 3597926400,
"step": 54900,
"train_runtime": 26415.2651,
"train_tokens_per_second": 136206.333
},
{
"epoch": 0.55,
"grad_norm": 0.5019519925117493,
"learning_rate": 0.00012865748580880053,
"loss": 1.1827,
"num_input_tokens_seen": 3604480000,
"step": 55000,
"train_runtime": 26462.5595,
"train_tokens_per_second": 136210.558
},
{
"epoch": 0.551,
"grad_norm": 0.5309172868728638,
"learning_rate": 0.0001281864380049969,
"loss": 1.1876,
"num_input_tokens_seen": 3611033600,
"step": 55100,
"train_runtime": 26514.9513,
"train_tokens_per_second": 136188.581
},
{
"epoch": 0.552,
"grad_norm": 0.5431755781173706,
"learning_rate": 0.00012771560986352042,
"loss": 1.2038,
"num_input_tokens_seen": 3617587200,
"step": 55200,
"train_runtime": 26562.7975,
"train_tokens_per_second": 136189.993
},
{
"epoch": 0.553,
"grad_norm": 0.5063371658325195,
"learning_rate": 0.0001272450061256052,
"loss": 1.1837,
"num_input_tokens_seen": 3624140800,
"step": 55300,
"train_runtime": 26609.2594,
"train_tokens_per_second": 136198.484
},
{
"epoch": 0.554,
"grad_norm": 0.502314567565918,
"learning_rate": 0.00012677463153022565,
"loss": 1.1988,
"num_input_tokens_seen": 3630694400,
"step": 55400,
"train_runtime": 26655.8656,
"train_tokens_per_second": 136206.209
},
{
"epoch": 0.555,
"grad_norm": 0.5824739336967468,
"learning_rate": 0.0001263044908140488,
"loss": 1.1917,
"num_input_tokens_seen": 3637248000,
"step": 55500,
"train_runtime": 26707.6694,
"train_tokens_per_second": 136187.398
},
{
"epoch": 0.556,
"grad_norm": 0.5498598217964172,
"learning_rate": 0.00012583458871138632,
"loss": 1.1908,
"num_input_tokens_seen": 3643801600,
"step": 55600,
"train_runtime": 26755.8413,
"train_tokens_per_second": 136187.144
},
{
"epoch": 0.557,
"grad_norm": 0.5867239832878113,
"learning_rate": 0.00012536492995414723,
"loss": 1.193,
"num_input_tokens_seen": 3650355200,
"step": 55700,
"train_runtime": 26804.5182,
"train_tokens_per_second": 136184.324
},
{
"epoch": 0.558,
"grad_norm": 0.5584626197814941,
"learning_rate": 0.00012489551927179007,
"loss": 1.1833,
"num_input_tokens_seen": 3656908800,
"step": 55800,
"train_runtime": 26850.8981,
"train_tokens_per_second": 136193.165
},
{
"epoch": 0.559,
"grad_norm": 0.48578086495399475,
"learning_rate": 0.00012442636139127508,
"loss": 1.1919,
"num_input_tokens_seen": 3663462400,
"step": 55900,
"train_runtime": 26898.2376,
"train_tokens_per_second": 136197.116
},
{
"epoch": 0.56,
"grad_norm": 0.5344805121421814,
"learning_rate": 0.00012395746103701695,
"loss": 1.1978,
"num_input_tokens_seen": 3670016000,
"step": 56000,
"train_runtime": 26951.1383,
"train_tokens_per_second": 136172.95
},
{
"epoch": 0.561,
"grad_norm": 0.5378079414367676,
"learning_rate": 0.00012348882293083708,
"loss": 1.192,
"num_input_tokens_seen": 3676569600,
"step": 56100,
"train_runtime": 26999.7429,
"train_tokens_per_second": 136170.541
},
{
"epoch": 0.562,
"grad_norm": 0.6195780038833618,
"learning_rate": 0.00012302045179191594,
"loss": 1.1919,
"num_input_tokens_seen": 3683123200,
"step": 56200,
"train_runtime": 27047.827,
"train_tokens_per_second": 136170.761
},
{
"epoch": 0.563,
"grad_norm": 0.5348559617996216,
"learning_rate": 0.00012255235233674572,
"loss": 1.1875,
"num_input_tokens_seen": 3689676800,
"step": 56300,
"train_runtime": 27094.1422,
"train_tokens_per_second": 136179.871
},
{
"epoch": 0.564,
"grad_norm": 0.48098888993263245,
"learning_rate": 0.00012208452927908278,
"loss": 1.1818,
"num_input_tokens_seen": 3696230400,
"step": 56400,
"train_runtime": 27141.6856,
"train_tokens_per_second": 136182.787
},
{
"epoch": 0.565,
"grad_norm": 0.585021436214447,
"learning_rate": 0.00012161698732990003,
"loss": 1.1887,
"num_input_tokens_seen": 3702784000,
"step": 56500,
"train_runtime": 27194.4825,
"train_tokens_per_second": 136159.385
},
{
"epoch": 0.566,
"grad_norm": 0.5269266963005066,
"learning_rate": 0.00012114973119733987,
"loss": 1.187,
"num_input_tokens_seen": 3709337600,
"step": 56600,
"train_runtime": 27242.6521,
"train_tokens_per_second": 136159.196
},
{
"epoch": 0.567,
"grad_norm": 0.5563040971755981,
"learning_rate": 0.00012068276558666616,
"loss": 1.1996,
"num_input_tokens_seen": 3715891200,
"step": 56700,
"train_runtime": 27290.3101,
"train_tokens_per_second": 136161.56
},
{
"epoch": 0.568,
"grad_norm": 0.6131460666656494,
"learning_rate": 0.00012021609520021752,
"loss": 1.195,
"num_input_tokens_seen": 3722444800,
"step": 56800,
"train_runtime": 27337.7804,
"train_tokens_per_second": 136164.851
},
{
"epoch": 0.569,
"grad_norm": 0.5921023488044739,
"learning_rate": 0.00011974972473735957,
"loss": 1.2018,
"num_input_tokens_seen": 3728998400,
"step": 56900,
"train_runtime": 27384.9126,
"train_tokens_per_second": 136169.812
},
{
"epoch": 0.57,
"grad_norm": 0.4582422375679016,
"learning_rate": 0.00011928365889443764,
"loss": 1.1914,
"num_input_tokens_seen": 3735552000,
"step": 57000,
"train_runtime": 27436.2125,
"train_tokens_per_second": 136154.07
},
{
"epoch": 0.571,
"grad_norm": 0.6521887183189392,
"learning_rate": 0.00011881790236472966,
"loss": 1.2041,
"num_input_tokens_seen": 3742105600,
"step": 57100,
"train_runtime": 27484.9505,
"train_tokens_per_second": 136151.076
},
{
"epoch": 0.572,
"grad_norm": 0.5971055030822754,
"learning_rate": 0.00011835245983839869,
"loss": 1.1992,
"num_input_tokens_seen": 3748659200,
"step": 57200,
"train_runtime": 27531.7756,
"train_tokens_per_second": 136157.553
},
{
"epoch": 0.573,
"grad_norm": 0.5187013745307922,
"learning_rate": 0.00011788733600244575,
"loss": 1.193,
"num_input_tokens_seen": 3755212800,
"step": 57300,
"train_runtime": 27579.3239,
"train_tokens_per_second": 136160.437
},
{
"epoch": 0.574,
"grad_norm": 0.5805628299713135,
"learning_rate": 0.00011742253554066278,
"loss": 1.1925,
"num_input_tokens_seen": 3761766400,
"step": 57400,
"train_runtime": 27633.4529,
"train_tokens_per_second": 136130.885
},
{
"epoch": 0.575,
"grad_norm": 0.5242844223976135,
"learning_rate": 0.00011695806313358523,
"loss": 1.1991,
"num_input_tokens_seen": 3768320000,
"step": 57500,
"train_runtime": 27681.3237,
"train_tokens_per_second": 136132.218
},
{
"epoch": 0.576,
"grad_norm": 0.7652018666267395,
"learning_rate": 0.00011649392345844506,
"loss": 1.192,
"num_input_tokens_seen": 3774873600,
"step": 57600,
"train_runtime": 27728.8266,
"train_tokens_per_second": 136135.353
},
{
"epoch": 0.577,
"grad_norm": 0.5232011675834656,
"learning_rate": 0.00011603012118912372,
"loss": 1.2019,
"num_input_tokens_seen": 3781427200,
"step": 57700,
"train_runtime": 27778.1555,
"train_tokens_per_second": 136129.528
},
{
"epoch": 0.578,
"grad_norm": 0.5537053942680359,
"learning_rate": 0.00011556666099610485,
"loss": 1.1948,
"num_input_tokens_seen": 3787980800,
"step": 57800,
"train_runtime": 27824.9287,
"train_tokens_per_second": 136136.227
},
{
"epoch": 0.579,
"grad_norm": 0.6031852960586548,
"learning_rate": 0.00011510354754642745,
"loss": 1.1888,
"num_input_tokens_seen": 3794534400,
"step": 57900,
"train_runtime": 27872.2044,
"train_tokens_per_second": 136140.448
},
{
"epoch": 0.58,
"grad_norm": 0.5748854875564575,
"learning_rate": 0.00011464078550363887,
"loss": 1.1921,
"num_input_tokens_seen": 3801088000,
"step": 58000,
"train_runtime": 27925.2055,
"train_tokens_per_second": 136116.742
},
{
"epoch": 0.581,
"grad_norm": 0.5586141347885132,
"learning_rate": 0.0001141783795277477,
"loss": 1.2024,
"num_input_tokens_seen": 3807641600,
"step": 58100,
"train_runtime": 27972.7534,
"train_tokens_per_second": 136119.657
},
{
"epoch": 0.582,
"grad_norm": 0.4893476366996765,
"learning_rate": 0.00011371633427517696,
"loss": 1.2034,
"num_input_tokens_seen": 3814195200,
"step": 58200,
"train_runtime": 28020.2529,
"train_tokens_per_second": 136122.797
},
{
"epoch": 0.583,
"grad_norm": 0.5007518529891968,
"learning_rate": 0.00011325465439871731,
"loss": 1.1885,
"num_input_tokens_seen": 3820748800,
"step": 58300,
"train_runtime": 28067.154,
"train_tokens_per_second": 136128.829
},
{
"epoch": 0.584,
"grad_norm": 0.5260310769081116,
"learning_rate": 0.00011279334454747989,
"loss": 1.1931,
"num_input_tokens_seen": 3827302400,
"step": 58400,
"train_runtime": 28120.6157,
"train_tokens_per_second": 136103.08
},
{
"epoch": 0.585,
"grad_norm": 0.5364392399787903,
"learning_rate": 0.00011233240936684981,
"loss": 1.1928,
"num_input_tokens_seen": 3833856000,
"step": 58500,
"train_runtime": 28168.5149,
"train_tokens_per_second": 136104.3
},
{
"epoch": 0.586,
"grad_norm": 0.49333399534225464,
"learning_rate": 0.00011187185349843916,
"loss": 1.1935,
"num_input_tokens_seen": 3840409600,
"step": 58600,
"train_runtime": 28215.0596,
"train_tokens_per_second": 136112.05
},
{
"epoch": 0.587,
"grad_norm": 0.5711957216262817,
"learning_rate": 0.00011141168158004053,
"loss": 1.1812,
"num_input_tokens_seen": 3846963200,
"step": 58700,
"train_runtime": 28264.2863,
"train_tokens_per_second": 136106.858
},
{
"epoch": 0.588,
"grad_norm": 1.0157184600830078,
"learning_rate": 0.00011095189824557998,
"loss": 1.1929,
"num_input_tokens_seen": 3853516800,
"step": 58800,
"train_runtime": 28311.6057,
"train_tokens_per_second": 136110.853
},
{
"epoch": 0.589,
"grad_norm": 0.552700936794281,
"learning_rate": 0.00011049250812507054,
"loss": 1.1909,
"num_input_tokens_seen": 3860070400,
"step": 58900,
"train_runtime": 28359.0956,
"train_tokens_per_second": 136114.016
},
{
"epoch": 0.59,
"grad_norm": 0.46860748529434204,
"learning_rate": 0.00011003351584456571,
"loss": 1.1972,
"num_input_tokens_seen": 3866624000,
"step": 59000,
"train_runtime": 28412.8978,
"train_tokens_per_second": 136086.929
},
{
"epoch": 0.591,
"grad_norm": 0.5399055480957031,
"learning_rate": 0.0001095749260261126,
"loss": 1.1895,
"num_input_tokens_seen": 3873177600,
"step": 59100,
"train_runtime": 28462.0603,
"train_tokens_per_second": 136082.123
},
{
"epoch": 0.592,
"grad_norm": 0.49921005964279175,
"learning_rate": 0.00010911674328770559,
"loss": 1.1968,
"num_input_tokens_seen": 3879731200,
"step": 59200,
"train_runtime": 28510.9551,
"train_tokens_per_second": 136078.612
},
{
"epoch": 0.593,
"grad_norm": 0.5357686877250671,
"learning_rate": 0.00010865897224323979,
"loss": 1.1889,
"num_input_tokens_seen": 3886284800,
"step": 59300,
"train_runtime": 28558.3344,
"train_tokens_per_second": 136082.334
},
{
"epoch": 0.594,
"grad_norm": 0.5710283517837524,
"learning_rate": 0.00010820161750246453,
"loss": 1.1864,
"num_input_tokens_seen": 3892838400,
"step": 59400,
"train_runtime": 28606.454,
"train_tokens_per_second": 136082.522
},
{
"epoch": 0.595,
"grad_norm": 0.6333475112915039,
"learning_rate": 0.00010774468367093696,
"loss": 1.2009,
"num_input_tokens_seen": 3899392000,
"step": 59500,
"train_runtime": 28653.986,
"train_tokens_per_second": 136085.5
},
{
"epoch": 0.596,
"grad_norm": 0.5585243701934814,
"learning_rate": 0.00010728817534997573,
"loss": 1.1877,
"num_input_tokens_seen": 3905945600,
"step": 59600,
"train_runtime": 28701.832,
"train_tokens_per_second": 136086.979
},
{
"epoch": 0.597,
"grad_norm": 0.5805736184120178,
"learning_rate": 0.00010683209713661453,
"loss": 1.211,
"num_input_tokens_seen": 3912499200,
"step": 59700,
"train_runtime": 28751.7229,
"train_tokens_per_second": 136078.774
},
{
"epoch": 0.598,
"grad_norm": 0.5607670545578003,
"learning_rate": 0.00010637645362355589,
"loss": 1.196,
"num_input_tokens_seen": 3919052800,
"step": 59800,
"train_runtime": 28798.1873,
"train_tokens_per_second": 136086.788
},
{
"epoch": 0.599,
"grad_norm": 0.4962175488471985,
"learning_rate": 0.00010592124939912497,
"loss": 1.1889,
"num_input_tokens_seen": 3925606400,
"step": 59900,
"train_runtime": 28852.3337,
"train_tokens_per_second": 136058.54
},
{
"epoch": 0.6,
"grad_norm": 0.6488810777664185,
"learning_rate": 0.00010546648904722326,
"loss": 1.1968,
"num_input_tokens_seen": 3932160000,
"step": 60000,
"train_runtime": 28898.713,
"train_tokens_per_second": 136066.959
},
{
"epoch": 0.601,
"grad_norm": 0.9370976686477661,
"learning_rate": 0.0001050121771472824,
"loss": 1.183,
"num_input_tokens_seen": 3938713600,
"step": 60100,
"train_runtime": 28946.5523,
"train_tokens_per_second": 136068.488
},
{
"epoch": 0.602,
"grad_norm": 0.5040610432624817,
"learning_rate": 0.0001045583182742182,
"loss": 1.2023,
"num_input_tokens_seen": 3945267200,
"step": 60200,
"train_runtime": 28994.2594,
"train_tokens_per_second": 136070.632
},
{
"epoch": 0.603,
"grad_norm": 0.5120612382888794,
"learning_rate": 0.00010410491699838448,
"loss": 1.1865,
"num_input_tokens_seen": 3951820800,
"step": 60300,
"train_runtime": 29042.095,
"train_tokens_per_second": 136072.167
},
{
"epoch": 0.604,
"grad_norm": 0.8983064889907837,
"learning_rate": 0.00010365197788552707,
"loss": 1.1734,
"num_input_tokens_seen": 3958374400,
"step": 60400,
"train_runtime": 29090.1772,
"train_tokens_per_second": 136072.543
},
{
"epoch": 0.605,
"grad_norm": 0.5155735015869141,
"learning_rate": 0.00010319950549673778,
"loss": 1.1923,
"num_input_tokens_seen": 3964928000,
"step": 60500,
"train_runtime": 29143.642,
"train_tokens_per_second": 136047.787
},
{
"epoch": 0.606,
"grad_norm": 1.5562913417816162,
"learning_rate": 0.00010274750438840855,
"loss": 1.1877,
"num_input_tokens_seen": 3971481600,
"step": 60600,
"train_runtime": 29191.8256,
"train_tokens_per_second": 136047.73
},
{
"epoch": 0.607,
"grad_norm": 0.5603190064430237,
"learning_rate": 0.00010229597911218554,
"loss": 1.1862,
"num_input_tokens_seen": 3978035200,
"step": 60700,
"train_runtime": 29240.4534,
"train_tokens_per_second": 136045.606
},
{
"epoch": 0.608,
"grad_norm": 0.550956130027771,
"learning_rate": 0.00010184493421492324,
"loss": 1.1869,
"num_input_tokens_seen": 3984588800,
"step": 60800,
"train_runtime": 29287.1822,
"train_tokens_per_second": 136052.31
},
{
"epoch": 0.609,
"grad_norm": 0.5152813196182251,
"learning_rate": 0.0001013943742386388,
"loss": 1.1902,
"num_input_tokens_seen": 3991142400,
"step": 60900,
"train_runtime": 29335.0152,
"train_tokens_per_second": 136053.872
},
{
"epoch": 0.61,
"grad_norm": 0.5258508324623108,
"learning_rate": 0.00010094430372046616,
"loss": 1.1843,
"num_input_tokens_seen": 3997696000,
"step": 61000,
"train_runtime": 29387.778,
"train_tokens_per_second": 136032.605
},
{
"epoch": 0.611,
"grad_norm": 0.5804030895233154,
"learning_rate": 0.0001004947271926104,
"loss": 1.1872,
"num_input_tokens_seen": 4004249600,
"step": 61100,
"train_runtime": 29435.5024,
"train_tokens_per_second": 136034.695
},
{
"epoch": 0.612,
"grad_norm": 0.5679774284362793,
"learning_rate": 0.00010004564918230222,
"loss": 1.1933,
"num_input_tokens_seen": 4010803200,
"step": 61200,
"train_runtime": 29483.504,
"train_tokens_per_second": 136035.5
},
{
"epoch": 0.613,
"grad_norm": 0.611191987991333,
"learning_rate": 9.959707421175217e-05,
"loss": 1.1926,
"num_input_tokens_seen": 4017356800,
"step": 61300,
"train_runtime": 29529.7223,
"train_tokens_per_second": 136044.517
},
{
"epoch": 0.614,
"grad_norm": 0.5725626945495605,
"learning_rate": 9.914900679810522e-05,
"loss": 1.1812,
"num_input_tokens_seen": 4023910400,
"step": 61400,
"train_runtime": 29577.4052,
"train_tokens_per_second": 136046.769
},
{
"epoch": 0.615,
"grad_norm": 0.6058773398399353,
"learning_rate": 9.870145145339529e-05,
"loss": 1.1904,
"num_input_tokens_seen": 4030464000,
"step": 61500,
"train_runtime": 29630.1636,
"train_tokens_per_second": 136025.708
},
{
"epoch": 0.616,
"grad_norm": 0.5151665806770325,
"learning_rate": 9.825441268449969e-05,
"loss": 1.1783,
"num_input_tokens_seen": 4037017600,
"step": 61600,
"train_runtime": 29677.4813,
"train_tokens_per_second": 136029.657
},
{
"epoch": 0.617,
"grad_norm": 0.5461622476577759,
"learning_rate": 9.780789499309391e-05,
"loss": 1.1825,
"num_input_tokens_seen": 4043571200,
"step": 61700,
"train_runtime": 29725.432,
"train_tokens_per_second": 136030.696
},
{
"epoch": 0.618,
"grad_norm": 0.8243169784545898,
"learning_rate": 9.736190287560608e-05,
"loss": 1.1933,
"num_input_tokens_seen": 4050124800,
"step": 61800,
"train_runtime": 29772.1739,
"train_tokens_per_second": 136037.255
},
{
"epoch": 0.619,
"grad_norm": 0.4877258539199829,
"learning_rate": 9.691644082317186e-05,
"loss": 1.1881,
"num_input_tokens_seen": 4056678400,
"step": 61900,
"train_runtime": 29825.721,
"train_tokens_per_second": 136012.752
},
{
"epoch": 0.62,
"grad_norm": 0.5376379489898682,
"learning_rate": 9.647151332158926e-05,
"loss": 1.1812,
"num_input_tokens_seen": 4063232000,
"step": 62000,
"train_runtime": 29872.1612,
"train_tokens_per_second": 136020.691
},
{
"epoch": 0.621,
"grad_norm": 0.5128985643386841,
"learning_rate": 9.60271248512732e-05,
"loss": 1.1719,
"num_input_tokens_seen": 4069785600,
"step": 62100,
"train_runtime": 29919.8698,
"train_tokens_per_second": 136022.838
},
{
"epoch": 0.622,
"grad_norm": 0.6911051273345947,
"learning_rate": 9.558327988721068e-05,
"loss": 1.199,
"num_input_tokens_seen": 4076339200,
"step": 62200,
"train_runtime": 29967.7263,
"train_tokens_per_second": 136024.307
},
{
"epoch": 0.623,
"grad_norm": 0.5334423184394836,
"learning_rate": 9.513998289891559e-05,
"loss": 1.1922,
"num_input_tokens_seen": 4082892800,
"step": 62300,
"train_runtime": 30014.7483,
"train_tokens_per_second": 136029.553
},
{
"epoch": 0.624,
"grad_norm": 0.47934290766716003,
"learning_rate": 9.469723835038361e-05,
"loss": 1.1864,
"num_input_tokens_seen": 4089446400,
"step": 62400,
"train_runtime": 30062.3944,
"train_tokens_per_second": 136031.959
},
{
"epoch": 0.625,
"grad_norm": 0.6690011620521545,
"learning_rate": 9.42550507000475e-05,
"loss": 1.1887,
"num_input_tokens_seen": 4096000000,
"step": 62500,
"train_runtime": 30115.1503,
"train_tokens_per_second": 136011.275
},
{
"epoch": 0.626,
"grad_norm": 0.5379562973976135,
"learning_rate": 9.381342440073194e-05,
"loss": 1.1873,
"num_input_tokens_seen": 4102553600,
"step": 62600,
"train_runtime": 30162.8214,
"train_tokens_per_second": 136013.589
},
{
"epoch": 0.627,
"grad_norm": 0.5619449615478516,
"learning_rate": 9.337236389960886e-05,
"loss": 1.184,
"num_input_tokens_seen": 4109107200,
"step": 62700,
"train_runtime": 30211.3171,
"train_tokens_per_second": 136012.183
},
{
"epoch": 0.628,
"grad_norm": 0.9017994999885559,
"learning_rate": 9.293187363815265e-05,
"loss": 1.1869,
"num_input_tokens_seen": 4115660800,
"step": 62800,
"train_runtime": 30263.5761,
"train_tokens_per_second": 135993.869
},
{
"epoch": 0.629,
"grad_norm": 0.6502019762992859,
"learning_rate": 9.249195805209533e-05,
"loss": 1.1944,
"num_input_tokens_seen": 4122214400,
"step": 62900,
"train_runtime": 30310.6247,
"train_tokens_per_second": 135998.992
},
{
"epoch": 0.63,
"grad_norm": 0.5749123096466064,
"learning_rate": 9.205262157138192e-05,
"loss": 1.1896,
"num_input_tokens_seen": 4128768000,
"step": 63000,
"train_runtime": 30359.0787,
"train_tokens_per_second": 135997.803
},
{
"epoch": 0.631,
"grad_norm": 0.4843611419200897,
"learning_rate": 9.161386862012601e-05,
"loss": 1.1932,
"num_input_tokens_seen": 4135321600,
"step": 63100,
"train_runtime": 30406.8492,
"train_tokens_per_second": 135999.675
},
{
"epoch": 0.632,
"grad_norm": 0.634504497051239,
"learning_rate": 9.11757036165649e-05,
"loss": 1.181,
"num_input_tokens_seen": 4141875200,
"step": 63200,
"train_runtime": 30453.794,
"train_tokens_per_second": 136005.228
},
{
"epoch": 0.633,
"grad_norm": 0.605948269367218,
"learning_rate": 9.073813097301521e-05,
"loss": 1.1742,
"num_input_tokens_seen": 4148428800,
"step": 63300,
"train_runtime": 30506.719,
"train_tokens_per_second": 135984.102
},
{
"epoch": 0.634,
"grad_norm": 0.5731847882270813,
"learning_rate": 9.030115509582883e-05,
"loss": 1.1809,
"num_input_tokens_seen": 4154982400,
"step": 63400,
"train_runtime": 30554.7018,
"train_tokens_per_second": 135985.042
},
{
"epoch": 0.635,
"grad_norm": 0.9707246422767639,
"learning_rate": 8.986478038534775e-05,
"loss": 1.1981,
"num_input_tokens_seen": 4161536000,
"step": 63500,
"train_runtime": 30602.1945,
"train_tokens_per_second": 135988.156
},
{
"epoch": 0.636,
"grad_norm": 0.7120965719223022,
"learning_rate": 8.942901123586059e-05,
"loss": 1.1816,
"num_input_tokens_seen": 4168089600,
"step": 63600,
"train_runtime": 30649.6499,
"train_tokens_per_second": 135991.426
},
{
"epoch": 0.637,
"grad_norm": 0.5136720538139343,
"learning_rate": 8.899385203555781e-05,
"loss": 1.177,
"num_input_tokens_seen": 4174643200,
"step": 63700,
"train_runtime": 30696.6221,
"train_tokens_per_second": 135996.827
},
{
"epoch": 0.638,
"grad_norm": 0.5284336805343628,
"learning_rate": 8.855930716648774e-05,
"loss": 1.184,
"num_input_tokens_seen": 4181196800,
"step": 63800,
"train_runtime": 30745.5123,
"train_tokens_per_second": 135993.727
},
{
"epoch": 0.639,
"grad_norm": 0.5269259810447693,
"learning_rate": 8.812538100451239e-05,
"loss": 1.2174,
"num_input_tokens_seen": 4187750400,
"step": 63900,
"train_runtime": 30792.1632,
"train_tokens_per_second": 136000.526
},
{
"epoch": 0.64,
"grad_norm": 0.5354572534561157,
"learning_rate": 8.769207791926338e-05,
"loss": 1.1771,
"num_input_tokens_seen": 4194304000,
"step": 64000,
"train_runtime": 30846.5823,
"train_tokens_per_second": 135973.054
},
{
"epoch": 0.641,
"grad_norm": 0.7058772444725037,
"learning_rate": 8.725940227409797e-05,
"loss": 1.179,
"num_input_tokens_seen": 4200857600,
"step": 64100,
"train_runtime": 30893.4429,
"train_tokens_per_second": 135978.94
},
{
"epoch": 0.642,
"grad_norm": 0.5777366161346436,
"learning_rate": 8.682735842605509e-05,
"loss": 1.182,
"num_input_tokens_seen": 4207411200,
"step": 64200,
"train_runtime": 30940.3826,
"train_tokens_per_second": 135984.459
},
{
"epoch": 0.643,
"grad_norm": 0.5608710646629333,
"learning_rate": 8.639595072581158e-05,
"loss": 1.1904,
"num_input_tokens_seen": 4213964800,
"step": 64300,
"train_runtime": 30988.4894,
"train_tokens_per_second": 135984.841
},
{
"epoch": 0.644,
"grad_norm": 0.6048064231872559,
"learning_rate": 8.596518351763806e-05,
"loss": 1.1851,
"num_input_tokens_seen": 4220518400,
"step": 64400,
"train_runtime": 31041.3711,
"train_tokens_per_second": 135964.304
},
{
"epoch": 0.645,
"grad_norm": 0.47835734486579895,
"learning_rate": 8.553506113935561e-05,
"loss": 1.1803,
"num_input_tokens_seen": 4227072000,
"step": 64500,
"train_runtime": 31089.9624,
"train_tokens_per_second": 135962.596
},
{
"epoch": 0.646,
"grad_norm": 1.1150704622268677,
"learning_rate": 8.510558792229183e-05,
"loss": 1.1878,
"num_input_tokens_seen": 4233625600,
"step": 64600,
"train_runtime": 31137.4325,
"train_tokens_per_second": 135965.79
},
{
"epoch": 0.647,
"grad_norm": 0.6650880575180054,
"learning_rate": 8.467676819123716e-05,
"loss": 1.1951,
"num_input_tokens_seen": 4240179200,
"step": 64700,
"train_runtime": 31185.0957,
"train_tokens_per_second": 135968.132
},
{
"epoch": 0.648,
"grad_norm": 0.7750310897827148,
"learning_rate": 8.424860626440158e-05,
"loss": 1.1829,
"num_input_tokens_seen": 4246732800,
"step": 64800,
"train_runtime": 31237.5852,
"train_tokens_per_second": 135949.459
},
{
"epoch": 0.649,
"grad_norm": 0.595783531665802,
"learning_rate": 8.382110645337102e-05,
"loss": 1.1856,
"num_input_tokens_seen": 4253286400,
"step": 64900,
"train_runtime": 31285.0064,
"train_tokens_per_second": 135952.87
},
{
"epoch": 0.65,
"grad_norm": 0.6093938946723938,
"learning_rate": 8.339427306306365e-05,
"loss": 1.1842,
"num_input_tokens_seen": 4259840000,
"step": 65000,
"train_runtime": 31332.1176,
"train_tokens_per_second": 135957.615
},
{
"epoch": 0.651,
"grad_norm": 0.6823499798774719,
"learning_rate": 8.296811039168716e-05,
"loss": 1.1818,
"num_input_tokens_seen": 4266393600,
"step": 65100,
"train_runtime": 31381.0925,
"train_tokens_per_second": 135954.273
},
{
"epoch": 0.652,
"grad_norm": 0.5052744746208191,
"learning_rate": 8.254262273069477e-05,
"loss": 1.2034,
"num_input_tokens_seen": 4272947200,
"step": 65200,
"train_runtime": 31428.8012,
"train_tokens_per_second": 135956.417
},
{
"epoch": 0.653,
"grad_norm": 0.5003641247749329,
"learning_rate": 8.211781436474263e-05,
"loss": 1.177,
"num_input_tokens_seen": 4279500800,
"step": 65300,
"train_runtime": 31476.0702,
"train_tokens_per_second": 135960.454
},
{
"epoch": 0.654,
"grad_norm": 0.5675527453422546,
"learning_rate": 8.169368957164613e-05,
"loss": 1.1707,
"num_input_tokens_seen": 4286054400,
"step": 65400,
"train_runtime": 31524.8831,
"train_tokens_per_second": 135957.82
},
{
"epoch": 0.655,
"grad_norm": 0.5109818577766418,
"learning_rate": 8.127025262233731e-05,
"loss": 1.187,
"num_input_tokens_seen": 4292608000,
"step": 65500,
"train_runtime": 31578.0721,
"train_tokens_per_second": 135936.354
},
{
"epoch": 0.656,
"grad_norm": 0.6228885054588318,
"learning_rate": 8.084750778082159e-05,
"loss": 1.1944,
"num_input_tokens_seen": 4299161600,
"step": 65600,
"train_runtime": 31626.6624,
"train_tokens_per_second": 135934.723
},
{
"epoch": 0.657,
"grad_norm": 0.6139951348304749,
"learning_rate": 8.042545930413473e-05,
"loss": 1.1788,
"num_input_tokens_seen": 4305715200,
"step": 65700,
"train_runtime": 31673.2442,
"train_tokens_per_second": 135941.717
},
{
"epoch": 0.658,
"grad_norm": 0.6792371273040771,
"learning_rate": 8.000411144230025e-05,
"loss": 1.2019,
"num_input_tokens_seen": 4312268800,
"step": 65800,
"train_runtime": 31721.455,
"train_tokens_per_second": 135941.709
},
{
"epoch": 0.659,
"grad_norm": 0.546470582485199,
"learning_rate": 7.95834684382865e-05,
"loss": 1.1905,
"num_input_tokens_seen": 4318822400,
"step": 65900,
"train_runtime": 31770.1998,
"train_tokens_per_second": 135939.416
},
{
"epoch": 0.66,
"grad_norm": 0.5273057818412781,
"learning_rate": 7.916353452796378e-05,
"loss": 1.1769,
"num_input_tokens_seen": 4325376000,
"step": 66000,
"train_runtime": 31818.123,
"train_tokens_per_second": 135940.64
},
{
"epoch": 0.661,
"grad_norm": 0.5213398933410645,
"learning_rate": 7.874431394006188e-05,
"loss": 1.1834,
"num_input_tokens_seen": 4331929600,
"step": 66100,
"train_runtime": 31870.8187,
"train_tokens_per_second": 135921.504
},
{
"epoch": 0.662,
"grad_norm": 0.5762707591056824,
"learning_rate": 7.832581089612762e-05,
"loss": 1.1875,
"num_input_tokens_seen": 4338483200,
"step": 66200,
"train_runtime": 31918.6258,
"train_tokens_per_second": 135923.245
},
{
"epoch": 0.663,
"grad_norm": 0.6153529286384583,
"learning_rate": 7.790802961048183e-05,
"loss": 1.1895,
"num_input_tokens_seen": 4345036800,
"step": 66300,
"train_runtime": 31967.5441,
"train_tokens_per_second": 135920.257
},
{
"epoch": 0.664,
"grad_norm": 0.6668293476104736,
"learning_rate": 7.749097429017749e-05,
"loss": 1.1835,
"num_input_tokens_seen": 4351590400,
"step": 66400,
"train_runtime": 32014.502,
"train_tokens_per_second": 135925.6
},
{
"epoch": 0.665,
"grad_norm": 0.49117180705070496,
"learning_rate": 7.70746491349571e-05,
"loss": 1.1762,
"num_input_tokens_seen": 4358144000,
"step": 66500,
"train_runtime": 32062.234,
"train_tokens_per_second": 135927.646
},
{
"epoch": 0.666,
"grad_norm": 0.5580335259437561,
"learning_rate": 7.665905833721025e-05,
"loss": 1.1751,
"num_input_tokens_seen": 4364697600,
"step": 66600,
"train_runtime": 32116.4057,
"train_tokens_per_second": 135902.431
},
{
"epoch": 0.667,
"grad_norm": 0.4941908121109009,
"learning_rate": 7.624420608193171e-05,
"loss": 1.1991,
"num_input_tokens_seen": 4371251200,
"step": 66700,
"train_runtime": 32164.7962,
"train_tokens_per_second": 135901.722
},
{
"epoch": 0.668,
"grad_norm": 0.5203377604484558,
"learning_rate": 7.583009654667912e-05,
"loss": 1.1892,
"num_input_tokens_seen": 4377804800,
"step": 66800,
"train_runtime": 32211.7614,
"train_tokens_per_second": 135907.029
},
{
"epoch": 0.669,
"grad_norm": 0.5924380421638489,
"learning_rate": 7.541673390153087e-05,
"loss": 1.1749,
"num_input_tokens_seen": 4384358400,
"step": 66900,
"train_runtime": 32259.5523,
"train_tokens_per_second": 135908.842
},
{
"epoch": 0.67,
"grad_norm": 0.5180861353874207,
"learning_rate": 7.500412230904416e-05,
"loss": 1.1833,
"num_input_tokens_seen": 4390912000,
"step": 67000,
"train_runtime": 32305.7062,
"train_tokens_per_second": 135917.536
},
{
"epoch": 0.671,
"grad_norm": 0.5575404167175293,
"learning_rate": 7.459226592421318e-05,
"loss": 1.1908,
"num_input_tokens_seen": 4397465600,
"step": 67100,
"train_runtime": 32353.5616,
"train_tokens_per_second": 135919.058
},
{
"epoch": 0.672,
"grad_norm": 0.519868016242981,
"learning_rate": 7.418116889442721e-05,
"loss": 1.191,
"num_input_tokens_seen": 4404019200,
"step": 67200,
"train_runtime": 32407.2129,
"train_tokens_per_second": 135896.265
},
{
"epoch": 0.673,
"grad_norm": 0.5036019086837769,
"learning_rate": 7.377083535942868e-05,
"loss": 1.1771,
"num_input_tokens_seen": 4410572800,
"step": 67300,
"train_runtime": 32454.4825,
"train_tokens_per_second": 135900.266
},
{
"epoch": 0.674,
"grad_norm": 0.5349675416946411,
"learning_rate": 7.336126945127178e-05,
"loss": 1.1834,
"num_input_tokens_seen": 4417126400,
"step": 67400,
"train_runtime": 32501.8427,
"train_tokens_per_second": 135903.876
},
{
"epoch": 0.675,
"grad_norm": 0.675538957118988,
"learning_rate": 7.29524752942807e-05,
"loss": 1.1852,
"num_input_tokens_seen": 4423680000,
"step": 67500,
"train_runtime": 32550.3797,
"train_tokens_per_second": 135902.562
},
{
"epoch": 0.676,
"grad_norm": 0.5116747617721558,
"learning_rate": 7.254445700500798e-05,
"loss": 1.1816,
"num_input_tokens_seen": 4430233600,
"step": 67600,
"train_runtime": 32598.0387,
"train_tokens_per_second": 135904.913
},
{
"epoch": 0.677,
"grad_norm": 0.5892815589904785,
"learning_rate": 7.213721869219329e-05,
"loss": 1.1827,
"num_input_tokens_seen": 4436787200,
"step": 67700,
"train_runtime": 32650.3715,
"train_tokens_per_second": 135887.802
},
{
"epoch": 0.678,
"grad_norm": 0.6862092614173889,
"learning_rate": 7.173076445672198e-05,
"loss": 1.1801,
"num_input_tokens_seen": 4443340800,
"step": 67800,
"train_runtime": 32698.6817,
"train_tokens_per_second": 135887.46
},
{
"epoch": 0.679,
"grad_norm": 0.8308249115943909,
"learning_rate": 7.132509839158359e-05,
"loss": 1.1887,
"num_input_tokens_seen": 4449894400,
"step": 67900,
"train_runtime": 32745.9782,
"train_tokens_per_second": 135891.326
},
{
"epoch": 0.68,
"grad_norm": 0.5063105225563049,
"learning_rate": 7.092022458183096e-05,
"loss": 1.1949,
"num_input_tokens_seen": 4456448000,
"step": 68000,
"train_runtime": 32794.3077,
"train_tokens_per_second": 135890.9
},
{
"epoch": 0.681,
"grad_norm": 0.6090216040611267,
"learning_rate": 7.051614710453888e-05,
"loss": 1.1827,
"num_input_tokens_seen": 4463001600,
"step": 68100,
"train_runtime": 32841.7871,
"train_tokens_per_second": 135893.993
},
{
"epoch": 0.682,
"grad_norm": 0.5802315473556519,
"learning_rate": 7.011287002876296e-05,
"loss": 1.1808,
"num_input_tokens_seen": 4469555200,
"step": 68200,
"train_runtime": 32889.3297,
"train_tokens_per_second": 135896.816
},
{
"epoch": 0.683,
"grad_norm": 0.5431249141693115,
"learning_rate": 6.971039741549894e-05,
"loss": 1.1872,
"num_input_tokens_seen": 4476108800,
"step": 68300,
"train_runtime": 32943.0615,
"train_tokens_per_second": 135874.099
},
{
"epoch": 0.684,
"grad_norm": 0.8621413111686707,
"learning_rate": 6.930873331764162e-05,
"loss": 1.1776,
"num_input_tokens_seen": 4482662400,
"step": 68400,
"train_runtime": 32991.0019,
"train_tokens_per_second": 135875.304
},
{
"epoch": 0.685,
"grad_norm": 0.6102387309074402,
"learning_rate": 6.890788177994391e-05,
"loss": 1.18,
"num_input_tokens_seen": 4489216000,
"step": 68500,
"train_runtime": 33039.2288,
"train_tokens_per_second": 135875.326
},
{
"epoch": 0.686,
"grad_norm": 0.5266649723052979,
"learning_rate": 6.850784683897641e-05,
"loss": 1.1743,
"num_input_tokens_seen": 4495769600,
"step": 68600,
"train_runtime": 33086.8363,
"train_tokens_per_second": 135877.893
},
{
"epoch": 0.687,
"grad_norm": 0.5879511833190918,
"learning_rate": 6.810863252308653e-05,
"loss": 1.1803,
"num_input_tokens_seen": 4502323200,
"step": 68700,
"train_runtime": 33133.6328,
"train_tokens_per_second": 135883.778
},
{
"epoch": 0.688,
"grad_norm": 0.5183672308921814,
"learning_rate": 6.771024285235792e-05,
"loss": 1.1834,
"num_input_tokens_seen": 4508876800,
"step": 68800,
"train_runtime": 33182.6281,
"train_tokens_per_second": 135880.642
},
{
"epoch": 0.689,
"grad_norm": 0.5091114640235901,
"learning_rate": 6.73126818385702e-05,
"loss": 1.1913,
"num_input_tokens_seen": 4515430400,
"step": 68900,
"train_runtime": 33236.4019,
"train_tokens_per_second": 135857.979
},
{
"epoch": 0.69,
"grad_norm": 0.7696628570556641,
"learning_rate": 6.691595348515837e-05,
"loss": 1.1786,
"num_input_tokens_seen": 4521984000,
"step": 69000,
"train_runtime": 33285.7582,
"train_tokens_per_second": 135853.417
},
{
"epoch": 0.691,
"grad_norm": 0.5338857769966125,
"learning_rate": 6.65200617871726e-05,
"loss": 1.1832,
"num_input_tokens_seen": 4528537600,
"step": 69100,
"train_runtime": 33332.8826,
"train_tokens_per_second": 135857.965
},
{
"epoch": 0.692,
"grad_norm": 0.7705228328704834,
"learning_rate": 6.612501073123775e-05,
"loss": 1.1762,
"num_input_tokens_seen": 4535091200,
"step": 69200,
"train_runtime": 33380.8611,
"train_tokens_per_second": 135859.024
},
{
"epoch": 0.693,
"grad_norm": 0.5423911213874817,
"learning_rate": 6.573080429551368e-05,
"loss": 1.19,
"num_input_tokens_seen": 4541644800,
"step": 69300,
"train_runtime": 33429.7481,
"train_tokens_per_second": 135856.387
},
{
"epoch": 0.694,
"grad_norm": 0.5332856774330139,
"learning_rate": 6.533744644965482e-05,
"loss": 1.1753,
"num_input_tokens_seen": 4548198400,
"step": 69400,
"train_runtime": 33476.6955,
"train_tokens_per_second": 135861.629
},
{
"epoch": 0.695,
"grad_norm": 0.5862846970558167,
"learning_rate": 6.494494115477023e-05,
"loss": 1.1799,
"num_input_tokens_seen": 4554752000,
"step": 69500,
"train_runtime": 33523.7618,
"train_tokens_per_second": 135866.375
},
{
"epoch": 0.696,
"grad_norm": 0.658592164516449,
"learning_rate": 6.455329236338394e-05,
"loss": 1.1846,
"num_input_tokens_seen": 4561305600,
"step": 69600,
"train_runtime": 33571.8888,
"train_tokens_per_second": 135866.815
},
{
"epoch": 0.697,
"grad_norm": 0.5558256506919861,
"learning_rate": 6.416250401939496e-05,
"loss": 1.1873,
"num_input_tokens_seen": 4567859200,
"step": 69700,
"train_runtime": 33620.7189,
"train_tokens_per_second": 135864.412
},
{
"epoch": 0.698,
"grad_norm": 0.5283026099205017,
"learning_rate": 6.377258005803746e-05,
"loss": 1.1743,
"num_input_tokens_seen": 4574412800,
"step": 69800,
"train_runtime": 33674.4741,
"train_tokens_per_second": 135842.145
},
{
"epoch": 0.699,
"grad_norm": 0.802412211894989,
"learning_rate": 6.338352440584149e-05,
"loss": 1.1782,
"num_input_tokens_seen": 4580966400,
"step": 69900,
"train_runtime": 33722.7187,
"train_tokens_per_second": 135842.144
},
{
"epoch": 0.7,
"grad_norm": 0.5585867762565613,
"learning_rate": 6.299534098059318e-05,
"loss": 1.1809,
"num_input_tokens_seen": 4587520000,
"step": 70000,
"train_runtime": 33770.2671,
"train_tokens_per_second": 135844.943
},
{
"epoch": 0.701,
"grad_norm": 0.6285941004753113,
"learning_rate": 6.260803369129522e-05,
"loss": 1.1807,
"num_input_tokens_seen": 4594073600,
"step": 70100,
"train_runtime": 33818.7011,
"train_tokens_per_second": 135844.176
},
{
"epoch": 0.702,
"grad_norm": 0.9580085277557373,
"learning_rate": 6.222160643812774e-05,
"loss": 1.1802,
"num_input_tokens_seen": 4600627200,
"step": 70200,
"train_runtime": 33866.618,
"train_tokens_per_second": 135845.487
},
{
"epoch": 0.703,
"grad_norm": 0.6520081162452698,
"learning_rate": 6.183606311240901e-05,
"loss": 1.1879,
"num_input_tokens_seen": 4607180800,
"step": 70300,
"train_runtime": 33915.2388,
"train_tokens_per_second": 135843.973
},
{
"epoch": 0.704,
"grad_norm": 0.520710289478302,
"learning_rate": 6.145140759655585e-05,
"loss": 1.179,
"num_input_tokens_seen": 4613734400,
"step": 70400,
"train_runtime": 33968.6026,
"train_tokens_per_second": 135823.497
},
{
"epoch": 0.705,
"grad_norm": 0.5945906639099121,
"learning_rate": 6.10676437640451e-05,
"loss": 1.192,
"num_input_tokens_seen": 4620288000,
"step": 70500,
"train_runtime": 34016.7254,
"train_tokens_per_second": 135824.008
},
{
"epoch": 0.706,
"grad_norm": 0.5285692811012268,
"learning_rate": 6.068477547937436e-05,
"loss": 1.1855,
"num_input_tokens_seen": 4626841600,
"step": 70600,
"train_runtime": 34064.6033,
"train_tokens_per_second": 135825.495
},
{
"epoch": 0.707,
"grad_norm": 0.6492000222206116,
"learning_rate": 6.030280659802294e-05,
"loss": 1.192,
"num_input_tokens_seen": 4633395200,
"step": 70700,
"train_runtime": 34111.1694,
"train_tokens_per_second": 135832.2
},
{
"epoch": 0.708,
"grad_norm": 0.5521112084388733,
"learning_rate": 5.9921740966413204e-05,
"loss": 1.1781,
"num_input_tokens_seen": 4639948800,
"step": 70800,
"train_runtime": 34162.8893,
"train_tokens_per_second": 135818.395
},
{
"epoch": 0.709,
"grad_norm": 0.9012600183486938,
"learning_rate": 5.954158242187197e-05,
"loss": 1.1748,
"num_input_tokens_seen": 4646502400,
"step": 70900,
"train_runtime": 34211.5739,
"train_tokens_per_second": 135816.68
},
{
"epoch": 0.71,
"grad_norm": 0.4976861774921417,
"learning_rate": 5.91623347925914e-05,
"loss": 1.1902,
"num_input_tokens_seen": 4653056000,
"step": 71000,
"train_runtime": 34258.4131,
"train_tokens_per_second": 135822.287
},
{
"epoch": 0.711,
"grad_norm": 0.5690837502479553,
"learning_rate": 5.8784001897590996e-05,
"loss": 1.1767,
"num_input_tokens_seen": 4659609600,
"step": 71100,
"train_runtime": 34307.7023,
"train_tokens_per_second": 135818.177
},
{
"epoch": 0.712,
"grad_norm": 0.5648302435874939,
"learning_rate": 5.840658754667877e-05,
"loss": 1.182,
"num_input_tokens_seen": 4666163200,
"step": 71200,
"train_runtime": 34355.8058,
"train_tokens_per_second": 135818.768
},
{
"epoch": 0.713,
"grad_norm": 0.5309351086616516,
"learning_rate": 5.8030095540413144e-05,
"loss": 1.1755,
"num_input_tokens_seen": 4672716800,
"step": 71300,
"train_runtime": 34402.7961,
"train_tokens_per_second": 135823.751
},
{
"epoch": 0.714,
"grad_norm": 1.0066486597061157,
"learning_rate": 5.7654529670064326e-05,
"loss": 1.2073,
"num_input_tokens_seen": 4679270400,
"step": 71400,
"train_runtime": 34458.8447,
"train_tokens_per_second": 135793.015
},
{
"epoch": 0.715,
"grad_norm": 0.625823974609375,
"learning_rate": 5.7279893717576485e-05,
"loss": 1.2012,
"num_input_tokens_seen": 4685824000,
"step": 71500,
"train_runtime": 34506.5957,
"train_tokens_per_second": 135795.024
},
{
"epoch": 0.716,
"grad_norm": 0.512055516242981,
"learning_rate": 5.690619145552958e-05,
"loss": 1.1702,
"num_input_tokens_seen": 4692377600,
"step": 71600,
"train_runtime": 34554.5393,
"train_tokens_per_second": 135796.271
},
{
"epoch": 0.717,
"grad_norm": 0.749454915523529,
"learning_rate": 5.6533426647101135e-05,
"loss": 1.1812,
"num_input_tokens_seen": 4698931200,
"step": 71700,
"train_runtime": 34601.4153,
"train_tokens_per_second": 135801.705
},
{
"epoch": 0.718,
"grad_norm": 0.5417782068252563,
"learning_rate": 5.6161603046028674e-05,
"loss": 1.1681,
"num_input_tokens_seen": 4705484800,
"step": 71800,
"train_runtime": 34650.0822,
"train_tokens_per_second": 135800.105
},
{
"epoch": 0.719,
"grad_norm": 0.7127480506896973,
"learning_rate": 5.579072439657179e-05,
"loss": 1.1946,
"num_input_tokens_seen": 4712038400,
"step": 71900,
"train_runtime": 34698.539,
"train_tokens_per_second": 135799.331
},
{
"epoch": 0.72,
"grad_norm": 0.5434790253639221,
"learning_rate": 5.542079443347431e-05,
"loss": 1.1761,
"num_input_tokens_seen": 4718592000,
"step": 72000,
"train_runtime": 34745.7766,
"train_tokens_per_second": 135803.325
},
{
"epoch": 0.721,
"grad_norm": 0.5872786045074463,
"learning_rate": 5.505181688192682e-05,
"loss": 1.1758,
"num_input_tokens_seen": 4725145600,
"step": 72100,
"train_runtime": 34797.942,
"train_tokens_per_second": 135788.076
},
{
"epoch": 0.722,
"grad_norm": 0.5440493822097778,
"learning_rate": 5.468379545752925e-05,
"loss": 1.2086,
"num_input_tokens_seen": 4731699200,
"step": 72200,
"train_runtime": 34846.6082,
"train_tokens_per_second": 135786.507
},
{
"epoch": 0.723,
"grad_norm": 0.5699992775917053,
"learning_rate": 5.4316733866253166e-05,
"loss": 1.1705,
"num_input_tokens_seen": 4738252800,
"step": 72300,
"train_runtime": 34894.2941,
"train_tokens_per_second": 135788.756
},
{
"epoch": 0.724,
"grad_norm": 0.7067492604255676,
"learning_rate": 5.3950635804404754e-05,
"loss": 1.1788,
"num_input_tokens_seen": 4744806400,
"step": 72400,
"train_runtime": 34943.1279,
"train_tokens_per_second": 135786.539
},
{
"epoch": 0.725,
"grad_norm": 0.4926595389842987,
"learning_rate": 5.358550495858751e-05,
"loss": 1.1712,
"num_input_tokens_seen": 4751360000,
"step": 72500,
"train_runtime": 34988.8033,
"train_tokens_per_second": 135796.585
},
{
"epoch": 0.726,
"grad_norm": 0.6217764616012573,
"learning_rate": 5.322134500566487e-05,
"loss": 1.199,
"num_input_tokens_seen": 4757913600,
"step": 72600,
"train_runtime": 35043.098,
"train_tokens_per_second": 135773.201
},
{
"epoch": 0.727,
"grad_norm": 0.5704054236412048,
"learning_rate": 5.285815961272359e-05,
"loss": 1.1782,
"num_input_tokens_seen": 4764467200,
"step": 72700,
"train_runtime": 35090.0359,
"train_tokens_per_second": 135778.351
},
{
"epoch": 0.728,
"grad_norm": 0.6081520915031433,
"learning_rate": 5.249595243703658e-05,
"loss": 1.1679,
"num_input_tokens_seen": 4771020800,
"step": 72800,
"train_runtime": 35136.6254,
"train_tokens_per_second": 135784.833
},
{
"epoch": 0.729,
"grad_norm": 0.6235555410385132,
"learning_rate": 5.213472712602598e-05,
"loss": 1.1707,
"num_input_tokens_seen": 4777574400,
"step": 72900,
"train_runtime": 35185.4188,
"train_tokens_per_second": 135782.792
},
{
"epoch": 0.73,
"grad_norm": 0.5777461528778076,
"learning_rate": 5.17744873172267e-05,
"loss": 1.1816,
"num_input_tokens_seen": 4784128000,
"step": 73000,
"train_runtime": 35238.2318,
"train_tokens_per_second": 135765.268
},
{
"epoch": 0.731,
"grad_norm": 0.569218635559082,
"learning_rate": 5.1415236638249694e-05,
"loss": 1.1757,
"num_input_tokens_seen": 4790681600,
"step": 73100,
"train_runtime": 35286.0257,
"train_tokens_per_second": 135767.106
},
{
"epoch": 0.732,
"grad_norm": 1.2679173946380615,
"learning_rate": 5.105697870674519e-05,
"loss": 1.1686,
"num_input_tokens_seen": 4797235200,
"step": 73200,
"train_runtime": 35333.5517,
"train_tokens_per_second": 135769.969
},
{
"epoch": 0.733,
"grad_norm": 0.5663115382194519,
"learning_rate": 5.069971713036664e-05,
"loss": 1.1699,
"num_input_tokens_seen": 4803788800,
"step": 73300,
"train_runtime": 35380.3642,
"train_tokens_per_second": 135775.561
},
{
"epoch": 0.734,
"grad_norm": 0.5404617190361023,
"learning_rate": 5.034345550673415e-05,
"loss": 1.1916,
"num_input_tokens_seen": 4810342400,
"step": 73400,
"train_runtime": 35434.8234,
"train_tokens_per_second": 135751.838
},
{
"epoch": 0.735,
"grad_norm": 0.7994534373283386,
"learning_rate": 4.998819742339835e-05,
"loss": 1.1842,
"num_input_tokens_seen": 4816896000,
"step": 73500,
"train_runtime": 35482.3263,
"train_tokens_per_second": 135754.797
},
{
"epoch": 0.736,
"grad_norm": 0.6482565402984619,
"learning_rate": 4.963394645780411e-05,
"loss": 1.1789,
"num_input_tokens_seen": 4823449600,
"step": 73600,
"train_runtime": 35530.782,
"train_tokens_per_second": 135754.107
},
{
"epoch": 0.737,
"grad_norm": 0.5401994585990906,
"learning_rate": 4.928070617725482e-05,
"loss": 1.1832,
"num_input_tokens_seen": 4830003200,
"step": 73700,
"train_runtime": 35578.1016,
"train_tokens_per_second": 135757.755
},
{
"epoch": 0.738,
"grad_norm": 0.5170857906341553,
"learning_rate": 4.892848013887613e-05,
"loss": 1.1804,
"num_input_tokens_seen": 4836556800,
"step": 73800,
"train_runtime": 35625.1017,
"train_tokens_per_second": 135762.61
},
{
"epoch": 0.739,
"grad_norm": 0.5744811296463013,
"learning_rate": 4.857727188958031e-05,
"loss": 1.181,
"num_input_tokens_seen": 4843110400,
"step": 73900,
"train_runtime": 35672.7413,
"train_tokens_per_second": 135765.019
},
{
"epoch": 0.74,
"grad_norm": 0.6613340377807617,
"learning_rate": 4.822708496603052e-05,
"loss": 1.1879,
"num_input_tokens_seen": 4849664000,
"step": 74000,
"train_runtime": 35721.0554,
"train_tokens_per_second": 135764.858
},
{
"epoch": 0.741,
"grad_norm": 0.5571849346160889,
"learning_rate": 4.7877922894605304e-05,
"loss": 1.1781,
"num_input_tokens_seen": 4856217600,
"step": 74100,
"train_runtime": 35771.1997,
"train_tokens_per_second": 135757.75
},
{
"epoch": 0.742,
"grad_norm": 0.6960323452949524,
"learning_rate": 4.752978919136273e-05,
"loss": 1.1702,
"num_input_tokens_seen": 4862771200,
"step": 74200,
"train_runtime": 35823.4168,
"train_tokens_per_second": 135742.808
},
{
"epoch": 0.743,
"grad_norm": 0.5823075175285339,
"learning_rate": 4.7182687362005337e-05,
"loss": 1.1762,
"num_input_tokens_seen": 4869324800,
"step": 74300,
"train_runtime": 35872.0393,
"train_tokens_per_second": 135741.511
},
{
"epoch": 0.744,
"grad_norm": 0.5310567021369934,
"learning_rate": 4.6836620901844794e-05,
"loss": 1.1737,
"num_input_tokens_seen": 4875878400,
"step": 74400,
"train_runtime": 35918.2124,
"train_tokens_per_second": 135749.473
},
{
"epoch": 0.745,
"grad_norm": 0.560118556022644,
"learning_rate": 4.64915932957664e-05,
"loss": 1.1746,
"num_input_tokens_seen": 4882432000,
"step": 74500,
"train_runtime": 35972.3831,
"train_tokens_per_second": 135727.232
},
{
"epoch": 0.746,
"grad_norm": 0.5729120969772339,
"learning_rate": 4.614760801819433e-05,
"loss": 1.1729,
"num_input_tokens_seen": 4888985600,
"step": 74600,
"train_runtime": 36018.4093,
"train_tokens_per_second": 135735.744
},
{
"epoch": 0.747,
"grad_norm": 0.5329717397689819,
"learning_rate": 4.58046685330566e-05,
"loss": 1.1969,
"num_input_tokens_seen": 4895539200,
"step": 74700,
"train_runtime": 36066.8487,
"train_tokens_per_second": 135735.152
},
{
"epoch": 0.748,
"grad_norm": 0.5714908838272095,
"learning_rate": 4.546277829374993e-05,
"loss": 1.172,
"num_input_tokens_seen": 4902092800,
"step": 74800,
"train_runtime": 36115.3648,
"train_tokens_per_second": 135734.273
},
{
"epoch": 0.749,
"grad_norm": 0.5672817826271057,
"learning_rate": 4.5121940743105246e-05,
"loss": 1.1813,
"num_input_tokens_seen": 4908646400,
"step": 74900,
"train_runtime": 36164.0493,
"train_tokens_per_second": 135732.765
},
{
"epoch": 0.75,
"grad_norm": 0.5890370607376099,
"learning_rate": 4.478215931335295e-05,
"loss": 1.1667,
"num_input_tokens_seen": 4915200000,
"step": 75000,
"train_runtime": 36215.8524,
"train_tokens_per_second": 135719.572
},
{
"epoch": 0.751,
"grad_norm": 0.6215245127677917,
"learning_rate": 4.4443437426088205e-05,
"loss": 1.179,
"num_input_tokens_seen": 4921753600,
"step": 75100,
"train_runtime": 36264.1849,
"train_tokens_per_second": 135719.405
},
{
"epoch": 0.752,
"grad_norm": 1.4719446897506714,
"learning_rate": 4.410577849223666e-05,
"loss": 1.1847,
"num_input_tokens_seen": 4928307200,
"step": 75200,
"train_runtime": 36312.9781,
"train_tokens_per_second": 135717.516
},
{
"epoch": 0.753,
"grad_norm": 1.3475043773651123,
"learning_rate": 4.376918591202006e-05,
"loss": 1.1745,
"num_input_tokens_seen": 4934860800,
"step": 75300,
"train_runtime": 36359.7761,
"train_tokens_per_second": 135723.08
},
{
"epoch": 0.754,
"grad_norm": 0.9558594822883606,
"learning_rate": 4.3433663074922046e-05,
"loss": 1.181,
"num_input_tokens_seen": 4941414400,
"step": 75400,
"train_runtime": 36406.8385,
"train_tokens_per_second": 135727.644
},
{
"epoch": 0.755,
"grad_norm": 0.5916360020637512,
"learning_rate": 4.309921335965367e-05,
"loss": 1.1706,
"num_input_tokens_seen": 4947968000,
"step": 75500,
"train_runtime": 36460.2599,
"train_tokens_per_second": 135708.522
},
{
"epoch": 0.756,
"grad_norm": 0.5985275506973267,
"learning_rate": 4.276584013411992e-05,
"loss": 1.1758,
"num_input_tokens_seen": 4954521600,
"step": 75600,
"train_runtime": 36507.6786,
"train_tokens_per_second": 135711.768
},
{
"epoch": 0.757,
"grad_norm": 0.5550095438957214,
"learning_rate": 4.243354675538555e-05,
"loss": 1.1705,
"num_input_tokens_seen": 4961075200,
"step": 75700,
"train_runtime": 36554.9962,
"train_tokens_per_second": 135715.38
},
{
"epoch": 0.758,
"grad_norm": 0.5496001243591309,
"learning_rate": 4.210233656964111e-05,
"loss": 1.1746,
"num_input_tokens_seen": 4967628800,
"step": 75800,
"train_runtime": 36602.3493,
"train_tokens_per_second": 135718.851
},
{
"epoch": 0.759,
"grad_norm": 0.570070743560791,
"learning_rate": 4.1772212912169516e-05,
"loss": 1.1771,
"num_input_tokens_seen": 4974182400,
"step": 75900,
"train_runtime": 36656.3482,
"train_tokens_per_second": 135697.707
},
{
"epoch": 0.76,
"grad_norm": 0.7570028305053711,
"learning_rate": 4.14431791073124e-05,
"loss": 1.1756,
"num_input_tokens_seen": 4980736000,
"step": 76000,
"train_runtime": 36704.1036,
"train_tokens_per_second": 135699.704
},
{
"epoch": 0.761,
"grad_norm": 0.6243161559104919,
"learning_rate": 4.111523846843639e-05,
"loss": 1.1667,
"num_input_tokens_seen": 4987289600,
"step": 76100,
"train_runtime": 36753.037,
"train_tokens_per_second": 135697.347
},
{
"epoch": 0.762,
"grad_norm": 0.5531216263771057,
"learning_rate": 4.078839429790019e-05,
"loss": 1.1755,
"num_input_tokens_seen": 4993843200,
"step": 76200,
"train_runtime": 36800.3039,
"train_tokens_per_second": 135701.14
},
{
"epoch": 0.763,
"grad_norm": 0.5894837379455566,
"learning_rate": 4.046264988702097e-05,
"loss": 1.1778,
"num_input_tokens_seen": 5000396800,
"step": 76300,
"train_runtime": 36847.8696,
"train_tokens_per_second": 135703.824
},
{
"epoch": 0.764,
"grad_norm": 0.6210083365440369,
"learning_rate": 4.013800851604123e-05,
"loss": 1.1729,
"num_input_tokens_seen": 5006950400,
"step": 76400,
"train_runtime": 36901.2456,
"train_tokens_per_second": 135685.133
},
{
"epoch": 0.765,
"grad_norm": 0.5929700136184692,
"learning_rate": 3.981447345409606e-05,
"loss": 1.171,
"num_input_tokens_seen": 5013504000,
"step": 76500,
"train_runtime": 36949.2788,
"train_tokens_per_second": 135686.113
},
{
"epoch": 0.766,
"grad_norm": 0.5809143781661987,
"learning_rate": 3.949204795917995e-05,
"loss": 1.1775,
"num_input_tokens_seen": 5020057600,
"step": 76600,
"train_runtime": 36996.6957,
"train_tokens_per_second": 135689.35
},
{
"epoch": 0.767,
"grad_norm": 0.5398791432380676,
"learning_rate": 3.917073527811399e-05,
"loss": 1.1765,
"num_input_tokens_seen": 5026611200,
"step": 76700,
"train_runtime": 37044.9859,
"train_tokens_per_second": 135689.381
},
{
"epoch": 0.768,
"grad_norm": 0.8559983372688293,
"learning_rate": 3.885053864651334e-05,
"loss": 1.1661,
"num_input_tokens_seen": 5033164800,
"step": 76800,
"train_runtime": 37092.5707,
"train_tokens_per_second": 135691.992
},
{
"epoch": 0.769,
"grad_norm": 1.0961577892303467,
"learning_rate": 3.8531461288754564e-05,
"loss": 1.1734,
"num_input_tokens_seen": 5039718400,
"step": 76900,
"train_runtime": 37145.642,
"train_tokens_per_second": 135674.554
},
{
"epoch": 0.77,
"grad_norm": 0.5564078688621521,
"learning_rate": 3.821350641794305e-05,
"loss": 1.1783,
"num_input_tokens_seen": 5046272000,
"step": 77000,
"train_runtime": 37194.2194,
"train_tokens_per_second": 135673.556
},
{
"epoch": 0.771,
"grad_norm": 0.6036384701728821,
"learning_rate": 3.789667723588087e-05,
"loss": 1.1651,
"num_input_tokens_seen": 5052825600,
"step": 77100,
"train_runtime": 37242.6728,
"train_tokens_per_second": 135673.012
},
{
"epoch": 0.772,
"grad_norm": 1.4465519189834595,
"learning_rate": 3.758097693303431e-05,
"loss": 1.1783,
"num_input_tokens_seen": 5059379200,
"step": 77200,
"train_runtime": 37290.7014,
"train_tokens_per_second": 135674.015
},
{
"epoch": 0.773,
"grad_norm": 0.5566693544387817,
"learning_rate": 3.7266408688502005e-05,
"loss": 1.1751,
"num_input_tokens_seen": 5065932800,
"step": 77300,
"train_runtime": 37338.6452,
"train_tokens_per_second": 135675.324
},
{
"epoch": 0.774,
"grad_norm": 0.653806209564209,
"learning_rate": 3.695297566998256e-05,
"loss": 1.1709,
"num_input_tokens_seen": 5072486400,
"step": 77400,
"train_runtime": 37386.3122,
"train_tokens_per_second": 135677.634
},
{
"epoch": 0.775,
"grad_norm": 0.8704593777656555,
"learning_rate": 3.664068103374307e-05,
"loss": 1.1794,
"num_input_tokens_seen": 5079040000,
"step": 77500,
"train_runtime": 37436.1356,
"train_tokens_per_second": 135672.123
},
{
"epoch": 0.776,
"grad_norm": 0.6627979874610901,
"learning_rate": 3.63295279245871e-05,
"loss": 1.175,
"num_input_tokens_seen": 5085593600,
"step": 77600,
"train_runtime": 37484.0969,
"train_tokens_per_second": 135673.366
},
{
"epoch": 0.777,
"grad_norm": 0.6232652068138123,
"learning_rate": 3.601951947582291e-05,
"loss": 1.1665,
"num_input_tokens_seen": 5092147200,
"step": 77700,
"train_runtime": 37536.8508,
"train_tokens_per_second": 135657.283
},
{
"epoch": 0.778,
"grad_norm": 0.5873488187789917,
"learning_rate": 3.571065880923216e-05,
"loss": 1.1734,
"num_input_tokens_seen": 5098700800,
"step": 77800,
"train_runtime": 37584.0839,
"train_tokens_per_second": 135661.17
},
{
"epoch": 0.779,
"grad_norm": 0.56858891248703,
"learning_rate": 3.540294903503841e-05,
"loss": 1.1696,
"num_input_tokens_seen": 5105254400,
"step": 77900,
"train_runtime": 37631.6286,
"train_tokens_per_second": 135663.924
},
{
"epoch": 0.78,
"grad_norm": 0.5939886569976807,
"learning_rate": 3.5096393251875566e-05,
"loss": 1.1784,
"num_input_tokens_seen": 5111808000,
"step": 78000,
"train_runtime": 37679.4424,
"train_tokens_per_second": 135665.702
},
{
"epoch": 0.781,
"grad_norm": 0.5839298367500305,
"learning_rate": 3.479099454675701e-05,
"loss": 1.1672,
"num_input_tokens_seen": 5118361600,
"step": 78100,
"train_runtime": 37733.7363,
"train_tokens_per_second": 135644.177
},
{
"epoch": 0.782,
"grad_norm": 0.6057742238044739,
"learning_rate": 3.448675599504434e-05,
"loss": 1.1767,
"num_input_tokens_seen": 5124915200,
"step": 78200,
"train_runtime": 37781.8162,
"train_tokens_per_second": 135645.019
},
{
"epoch": 0.783,
"grad_norm": 0.9875990748405457,
"learning_rate": 3.418368066041633e-05,
"loss": 1.1619,
"num_input_tokens_seen": 5131468800,
"step": 78300,
"train_runtime": 37829.8727,
"train_tokens_per_second": 135645.944
},
{
"epoch": 0.784,
"grad_norm": 0.5806832313537598,
"learning_rate": 3.388177159483826e-05,
"loss": 1.1747,
"num_input_tokens_seen": 5138022400,
"step": 78400,
"train_runtime": 37877.7351,
"train_tokens_per_second": 135647.561
},
{
"epoch": 0.785,
"grad_norm": 0.7016937136650085,
"learning_rate": 3.3581031838531116e-05,
"loss": 1.1664,
"num_input_tokens_seen": 5144576000,
"step": 78500,
"train_runtime": 37924.0105,
"train_tokens_per_second": 135654.851
},
{
"epoch": 0.786,
"grad_norm": 0.7171750664710999,
"learning_rate": 3.328146441994084e-05,
"loss": 1.1905,
"num_input_tokens_seen": 5151129600,
"step": 78600,
"train_runtime": 37971.9481,
"train_tokens_per_second": 135656.184
},
{
"epoch": 0.787,
"grad_norm": 0.5550017356872559,
"learning_rate": 3.2983072355708026e-05,
"loss": 1.1741,
"num_input_tokens_seen": 5157683200,
"step": 78700,
"train_runtime": 38021.3003,
"train_tokens_per_second": 135652.467
},
{
"epoch": 0.788,
"grad_norm": 0.5833317637443542,
"learning_rate": 3.2685858650637486e-05,
"loss": 1.176,
"num_input_tokens_seen": 5164236800,
"step": 78800,
"train_runtime": 38074.1209,
"train_tokens_per_second": 135636.403
},
{
"epoch": 0.789,
"grad_norm": 0.9918714165687561,
"learning_rate": 3.238982629766793e-05,
"loss": 1.1653,
"num_input_tokens_seen": 5170790400,
"step": 78900,
"train_runtime": 38121.5575,
"train_tokens_per_second": 135639.537
},
{
"epoch": 0.79,
"grad_norm": 1.2304959297180176,
"learning_rate": 3.209497827784177e-05,
"loss": 1.177,
"num_input_tokens_seen": 5177344000,
"step": 79000,
"train_runtime": 38168.8984,
"train_tokens_per_second": 135643.003
},
{
"epoch": 0.791,
"grad_norm": 0.5920888185501099,
"learning_rate": 3.1801317560275394e-05,
"loss": 1.1717,
"num_input_tokens_seen": 5183897600,
"step": 79100,
"train_runtime": 38223.2691,
"train_tokens_per_second": 135621.513
},
{
"epoch": 0.792,
"grad_norm": 0.5991621017456055,
"learning_rate": 3.150884710212895e-05,
"loss": 1.1933,
"num_input_tokens_seen": 5190451200,
"step": 79200,
"train_runtime": 38270.5225,
"train_tokens_per_second": 135625.303
},
{
"epoch": 0.793,
"grad_norm": 0.6007819175720215,
"learning_rate": 3.121756984857665e-05,
"loss": 1.1721,
"num_input_tokens_seen": 5197004800,
"step": 79300,
"train_runtime": 38316.5535,
"train_tokens_per_second": 135633.41
},
{
"epoch": 0.794,
"grad_norm": 0.6040635704994202,
"learning_rate": 3.092748873277725e-05,
"loss": 1.1784,
"num_input_tokens_seen": 5203558400,
"step": 79400,
"train_runtime": 38364.1371,
"train_tokens_per_second": 135636.008
},
{
"epoch": 0.795,
"grad_norm": 1.8925070762634277,
"learning_rate": 3.06386066758444e-05,
"loss": 1.179,
"num_input_tokens_seen": 5210112000,
"step": 79500,
"train_runtime": 38412.6561,
"train_tokens_per_second": 135635.297
},
{
"epoch": 0.796,
"grad_norm": 0.6026915311813354,
"learning_rate": 3.0350926586817127e-05,
"loss": 1.1706,
"num_input_tokens_seen": 5216665600,
"step": 79600,
"train_runtime": 38465.3514,
"train_tokens_per_second": 135619.861
},
{
"epoch": 0.797,
"grad_norm": 0.7981861233711243,
"learning_rate": 3.0064451362630765e-05,
"loss": 1.1796,
"num_input_tokens_seen": 5223219200,
"step": 79700,
"train_runtime": 38512.271,
"train_tokens_per_second": 135624.804
},
{
"epoch": 0.798,
"grad_norm": 1.3739973306655884,
"learning_rate": 2.9779183888087683e-05,
"loss": 1.1827,
"num_input_tokens_seen": 5229772800,
"step": 79800,
"train_runtime": 38560.5377,
"train_tokens_per_second": 135624.997
},
{
"epoch": 0.799,
"grad_norm": 0.7507041692733765,
"learning_rate": 2.9495127035828103e-05,
"loss": 1.164,
"num_input_tokens_seen": 5236326400,
"step": 79900,
"train_runtime": 38608.5419,
"train_tokens_per_second": 135626.111
},
{
"epoch": 0.8,
"grad_norm": 0.5848426818847656,
"learning_rate": 2.921228366630144e-05,
"loss": 1.1746,
"num_input_tokens_seen": 5242880000,
"step": 80000,
"train_runtime": 38660.3487,
"train_tokens_per_second": 135613.883
},
{
"epoch": 0.801,
"grad_norm": 0.5851396322250366,
"learning_rate": 2.8930656627737276e-05,
"loss": 1.1999,
"num_input_tokens_seen": 5249433600,
"step": 80100,
"train_runtime": 38707.849,
"train_tokens_per_second": 135616.774
},
{
"epoch": 0.802,
"grad_norm": 0.5581755638122559,
"learning_rate": 2.8650248756116727e-05,
"loss": 1.1657,
"num_input_tokens_seen": 5255987200,
"step": 80200,
"train_runtime": 38755.0614,
"train_tokens_per_second": 135620.665
},
{
"epoch": 0.803,
"grad_norm": 0.8737390637397766,
"learning_rate": 2.8371062875143968e-05,
"loss": 1.168,
"num_input_tokens_seen": 5262540800,
"step": 80300,
"train_runtime": 38809.0814,
"train_tokens_per_second": 135600.757
},
{
"epoch": 0.804,
"grad_norm": 0.6018446683883667,
"learning_rate": 2.809310179621776e-05,
"loss": 1.1603,
"num_input_tokens_seen": 5269094400,
"step": 80400,
"train_runtime": 38856.5205,
"train_tokens_per_second": 135603.866
},
{
"epoch": 0.805,
"grad_norm": 0.5673835873603821,
"learning_rate": 2.781636831840303e-05,
"loss": 1.1748,
"num_input_tokens_seen": 5275648000,
"step": 80500,
"train_runtime": 38904.9212,
"train_tokens_per_second": 135603.616
},
{
"epoch": 0.806,
"grad_norm": 0.5929433703422546,
"learning_rate": 2.754086522840282e-05,
"loss": 1.1663,
"num_input_tokens_seen": 5282201600,
"step": 80600,
"train_runtime": 38952.3955,
"train_tokens_per_second": 135606.592
},
{
"epoch": 0.807,
"grad_norm": 0.555366039276123,
"learning_rate": 2.7266595300530204e-05,
"loss": 1.1665,
"num_input_tokens_seen": 5288755200,
"step": 80700,
"train_runtime": 39001.4372,
"train_tokens_per_second": 135604.11
},
{
"epoch": 0.808,
"grad_norm": 0.5364073514938354,
"learning_rate": 2.6993561296680342e-05,
"loss": 1.1687,
"num_input_tokens_seen": 5295308800,
"step": 80800,
"train_runtime": 39048.23,
"train_tokens_per_second": 135609.445
},
{
"epoch": 0.809,
"grad_norm": 0.9588598608970642,
"learning_rate": 2.672176596630258e-05,
"loss": 1.1831,
"num_input_tokens_seen": 5301862400,
"step": 80900,
"train_runtime": 39096.7929,
"train_tokens_per_second": 135608.627
},
{
"epoch": 0.81,
"grad_norm": 0.6481744050979614,
"learning_rate": 2.6451212046372883e-05,
"loss": 1.1686,
"num_input_tokens_seen": 5308416000,
"step": 81000,
"train_runtime": 39152.1435,
"train_tokens_per_second": 135584.301
},
{
"epoch": 0.811,
"grad_norm": 0.5828465819358826,
"learning_rate": 2.6181902261366256e-05,
"loss": 1.1662,
"num_input_tokens_seen": 5314969600,
"step": 81100,
"train_runtime": 39199.715,
"train_tokens_per_second": 135586.945
},
{
"epoch": 0.812,
"grad_norm": 0.5715954899787903,
"learning_rate": 2.5913839323229195e-05,
"loss": 1.1623,
"num_input_tokens_seen": 5321523200,
"step": 81200,
"train_runtime": 39246.528,
"train_tokens_per_second": 135592.203
},
{
"epoch": 0.813,
"grad_norm": 0.8631576299667358,
"learning_rate": 2.564702593135253e-05,
"loss": 1.1896,
"num_input_tokens_seen": 5328076800,
"step": 81300,
"train_runtime": 39294.7731,
"train_tokens_per_second": 135592.507
},
{
"epoch": 0.814,
"grad_norm": 0.5882650017738342,
"learning_rate": 2.538146477254419e-05,
"loss": 1.1728,
"num_input_tokens_seen": 5334630400,
"step": 81400,
"train_runtime": 39341.8017,
"train_tokens_per_second": 135597.003
},
{
"epoch": 0.815,
"grad_norm": 0.5567020773887634,
"learning_rate": 2.5117158521002033e-05,
"loss": 1.1669,
"num_input_tokens_seen": 5341184000,
"step": 81500,
"train_runtime": 39389.3033,
"train_tokens_per_second": 135599.86
},
{
"epoch": 0.816,
"grad_norm": 0.7412062883377075,
"learning_rate": 2.4854109838287116e-05,
"loss": 1.1629,
"num_input_tokens_seen": 5347737600,
"step": 81600,
"train_runtime": 39443.4282,
"train_tokens_per_second": 135579.939
},
{
"epoch": 0.817,
"grad_norm": 0.6353700757026672,
"learning_rate": 2.459232137329679e-05,
"loss": 1.1676,
"num_input_tokens_seen": 5354291200,
"step": 81700,
"train_runtime": 39490.3956,
"train_tokens_per_second": 135584.643
},
{
"epoch": 0.818,
"grad_norm": 0.6541226506233215,
"learning_rate": 2.4331795762237894e-05,
"loss": 1.1669,
"num_input_tokens_seen": 5360844800,
"step": 81800,
"train_runtime": 39539.3049,
"train_tokens_per_second": 135582.677
},
{
"epoch": 0.819,
"grad_norm": 0.684333086013794,
"learning_rate": 2.4072535628600514e-05,
"loss": 1.1623,
"num_input_tokens_seen": 5367398400,
"step": 81900,
"train_runtime": 39587.3713,
"train_tokens_per_second": 135583.602
},
{
"epoch": 0.82,
"grad_norm": 0.5568915605545044,
"learning_rate": 2.3814543583131306e-05,
"loss": 1.1662,
"num_input_tokens_seen": 5373952000,
"step": 82000,
"train_runtime": 39636.1132,
"train_tokens_per_second": 135582.214
},
{
"epoch": 0.821,
"grad_norm": 0.6357592940330505,
"learning_rate": 2.3557822223807287e-05,
"loss": 1.1617,
"num_input_tokens_seen": 5380505600,
"step": 82100,
"train_runtime": 39683.9299,
"train_tokens_per_second": 135583.991
},
{
"epoch": 0.822,
"grad_norm": 0.6660736203193665,
"learning_rate": 2.3302374135809727e-05,
"loss": 1.1788,
"num_input_tokens_seen": 5387059200,
"step": 82200,
"train_runtime": 39731.7683,
"train_tokens_per_second": 135585.69
},
{
"epoch": 0.823,
"grad_norm": 0.6093869805335999,
"learning_rate": 2.304820189149798e-05,
"loss": 1.1823,
"num_input_tokens_seen": 5393612800,
"step": 82300,
"train_runtime": 39780.5498,
"train_tokens_per_second": 135584.169
},
{
"epoch": 0.824,
"grad_norm": 1.0343610048294067,
"learning_rate": 2.2795308050383787e-05,
"loss": 1.1942,
"num_input_tokens_seen": 5400166400,
"step": 82400,
"train_runtime": 39833.9775,
"train_tokens_per_second": 135566.839
},
{
"epoch": 0.825,
"grad_norm": 0.5363211035728455,
"learning_rate": 2.2543695159105248e-05,
"loss": 1.1659,
"num_input_tokens_seen": 5406720000,
"step": 82500,
"train_runtime": 39881.8503,
"train_tokens_per_second": 135568.434
},
{
"epoch": 0.826,
"grad_norm": 0.9732265472412109,
"learning_rate": 2.2293365751401443e-05,
"loss": 1.1757,
"num_input_tokens_seen": 5413273600,
"step": 82600,
"train_runtime": 39929.975,
"train_tokens_per_second": 135569.171
},
{
"epoch": 0.827,
"grad_norm": 0.5309200286865234,
"learning_rate": 2.2044322348086735e-05,
"loss": 1.1651,
"num_input_tokens_seen": 5419827200,
"step": 82700,
"train_runtime": 39978.229,
"train_tokens_per_second": 135569.467
},
{
"epoch": 0.828,
"grad_norm": 0.543769121170044,
"learning_rate": 2.1796567457025372e-05,
"loss": 1.1685,
"num_input_tokens_seen": 5426380800,
"step": 82800,
"train_runtime": 40026.0125,
"train_tokens_per_second": 135571.356
},
{
"epoch": 0.829,
"grad_norm": 0.5210631489753723,
"learning_rate": 2.15501035731064e-05,
"loss": 1.1778,
"num_input_tokens_seen": 5432934400,
"step": 82900,
"train_runtime": 40075.0654,
"train_tokens_per_second": 135568.947
},
{
"epoch": 0.83,
"grad_norm": 1.3538480997085571,
"learning_rate": 2.1304933178218426e-05,
"loss": 1.1655,
"num_input_tokens_seen": 5439488000,
"step": 83000,
"train_runtime": 40123.2015,
"train_tokens_per_second": 135569.64
},
{
"epoch": 0.831,
"grad_norm": 1.2901802062988281,
"learning_rate": 2.1061058741224518e-05,
"loss": 1.1668,
"num_input_tokens_seen": 5446041600,
"step": 83100,
"train_runtime": 40170.8312,
"train_tokens_per_second": 135572.042
},
{
"epoch": 0.832,
"grad_norm": 0.6960340142250061,
"learning_rate": 2.0818482717937596e-05,
"loss": 1.163,
"num_input_tokens_seen": 5452595200,
"step": 83200,
"train_runtime": 40225.6882,
"train_tokens_per_second": 135550.079
},
{
"epoch": 0.833,
"grad_norm": 0.537268340587616,
"learning_rate": 2.0577207551095552e-05,
"loss": 1.1689,
"num_input_tokens_seen": 5459148800,
"step": 83300,
"train_runtime": 40273.4785,
"train_tokens_per_second": 135551.956
},
{
"epoch": 0.834,
"grad_norm": 0.564239501953125,
"learning_rate": 2.0337235670336584e-05,
"loss": 1.1662,
"num_input_tokens_seen": 5465702400,
"step": 83400,
"train_runtime": 40320.9705,
"train_tokens_per_second": 135554.832
},
{
"epoch": 0.835,
"grad_norm": 0.520041823387146,
"learning_rate": 2.0098569492174887e-05,
"loss": 1.1642,
"num_input_tokens_seen": 5472256000,
"step": 83500,
"train_runtime": 40369.1468,
"train_tokens_per_second": 135555.404
},
{
"epoch": 0.836,
"grad_norm": 0.616112232208252,
"learning_rate": 1.9861211419976258e-05,
"loss": 1.1671,
"num_input_tokens_seen": 5478809600,
"step": 83600,
"train_runtime": 40416.0661,
"train_tokens_per_second": 135560.19
},
{
"epoch": 0.837,
"grad_norm": 1.3083754777908325,
"learning_rate": 1.962516384393377e-05,
"loss": 1.1778,
"num_input_tokens_seen": 5485363200,
"step": 83700,
"train_runtime": 40465.3405,
"train_tokens_per_second": 135557.075
},
{
"epoch": 0.838,
"grad_norm": 0.5721991062164307,
"learning_rate": 1.939042914104396e-05,
"loss": 1.179,
"num_input_tokens_seen": 5491916800,
"step": 83800,
"train_runtime": 40513.1256,
"train_tokens_per_second": 135558.951
},
{
"epoch": 0.839,
"grad_norm": 0.8014708161354065,
"learning_rate": 1.9157009675082702e-05,
"loss": 1.1698,
"num_input_tokens_seen": 5498470400,
"step": 83900,
"train_runtime": 40567.2253,
"train_tokens_per_second": 135539.721
},
{
"epoch": 0.84,
"grad_norm": 0.7554424405097961,
"learning_rate": 1.8924907796581363e-05,
"loss": 1.1689,
"num_input_tokens_seen": 5505024000,
"step": 84000,
"train_runtime": 40615.2949,
"train_tokens_per_second": 135540.663
},
{
"epoch": 0.841,
"grad_norm": 0.6026338934898376,
"learning_rate": 1.869412584280329e-05,
"loss": 1.1727,
"num_input_tokens_seen": 5511577600,
"step": 84100,
"train_runtime": 40664.1179,
"train_tokens_per_second": 135539.091
},
{
"epoch": 0.842,
"grad_norm": 0.6569694876670837,
"learning_rate": 1.8464666137720208e-05,
"loss": 1.1717,
"num_input_tokens_seen": 5518131200,
"step": 84200,
"train_runtime": 40713.1869,
"train_tokens_per_second": 135536.705
},
{
"epoch": 0.843,
"grad_norm": 0.5886375904083252,
"learning_rate": 1.823653099198884e-05,
"loss": 1.1764,
"num_input_tokens_seen": 5524684800,
"step": 84300,
"train_runtime": 40759.1071,
"train_tokens_per_second": 135544.795
},
{
"epoch": 0.844,
"grad_norm": 0.6782867908477783,
"learning_rate": 1.800972270292749e-05,
"loss": 1.1637,
"num_input_tokens_seen": 5531238400,
"step": 84400,
"train_runtime": 40811.77,
"train_tokens_per_second": 135530.471
},
{
"epoch": 0.845,
"grad_norm": 0.6513829231262207,
"learning_rate": 1.778424355449317e-05,
"loss": 1.165,
"num_input_tokens_seen": 5537792000,
"step": 84500,
"train_runtime": 40858.6857,
"train_tokens_per_second": 135535.245
},
{
"epoch": 0.846,
"grad_norm": 0.6192531585693359,
"learning_rate": 1.756009581725841e-05,
"loss": 1.1589,
"num_input_tokens_seen": 5544345600,
"step": 84600,
"train_runtime": 40906.0609,
"train_tokens_per_second": 135538.487
},
{
"epoch": 0.847,
"grad_norm": 0.5640349388122559,
"learning_rate": 1.7337281748388387e-05,
"loss": 1.1653,
"num_input_tokens_seen": 5550899200,
"step": 84700,
"train_runtime": 40952.689,
"train_tokens_per_second": 135544.194
},
{
"epoch": 0.848,
"grad_norm": 0.5606239438056946,
"learning_rate": 1.7115803591618312e-05,
"loss": 1.1734,
"num_input_tokens_seen": 5557452800,
"step": 84800,
"train_runtime": 41006.8402,
"train_tokens_per_second": 135525.019
},
{
"epoch": 0.849,
"grad_norm": 0.5700273513793945,
"learning_rate": 1.6895663577230816e-05,
"loss": 1.1755,
"num_input_tokens_seen": 5564006400,
"step": 84900,
"train_runtime": 41054.6511,
"train_tokens_per_second": 135526.822
},
{
"epoch": 0.85,
"grad_norm": 0.7111489176750183,
"learning_rate": 1.667686392203333e-05,
"loss": 1.1673,
"num_input_tokens_seen": 5570560000,
"step": 85000,
"train_runtime": 41102.7763,
"train_tokens_per_second": 135527.585
},
{
"epoch": 0.851,
"grad_norm": 0.5908454060554504,
"learning_rate": 1.6459406829335996e-05,
"loss": 1.1767,
"num_input_tokens_seen": 5577113600,
"step": 85100,
"train_runtime": 41150.3215,
"train_tokens_per_second": 135530.256
},
{
"epoch": 0.852,
"grad_norm": 0.6215232610702515,
"learning_rate": 1.624329448892932e-05,
"loss": 1.171,
"num_input_tokens_seen": 5583667200,
"step": 85200,
"train_runtime": 41205.5284,
"train_tokens_per_second": 135507.72
},
{
"epoch": 0.853,
"grad_norm": 0.6203814744949341,
"learning_rate": 1.6028529077062163e-05,
"loss": 1.1591,
"num_input_tokens_seen": 5590220800,
"step": 85300,
"train_runtime": 41253.0291,
"train_tokens_per_second": 135510.553
},
{
"epoch": 0.854,
"grad_norm": 0.5267207026481628,
"learning_rate": 1.5815112756419805e-05,
"loss": 1.185,
"num_input_tokens_seen": 5596774400,
"step": 85400,
"train_runtime": 41301.2794,
"train_tokens_per_second": 135510.921
},
{
"epoch": 0.855,
"grad_norm": 0.5815737843513489,
"learning_rate": 1.5603047676102313e-05,
"loss": 1.173,
"num_input_tokens_seen": 5603328000,
"step": 85500,
"train_runtime": 41349.0127,
"train_tokens_per_second": 135512.982
},
{
"epoch": 0.856,
"grad_norm": 0.6342357397079468,
"learning_rate": 1.5392335971602638e-05,
"loss": 1.1568,
"num_input_tokens_seen": 5609881600,
"step": 85600,
"train_runtime": 41397.1556,
"train_tokens_per_second": 135513.697
},
{
"epoch": 0.857,
"grad_norm": 0.6623713970184326,
"learning_rate": 1.5182979764785258e-05,
"loss": 1.1649,
"num_input_tokens_seen": 5616435200,
"step": 85700,
"train_runtime": 41450.3243,
"train_tokens_per_second": 135497.98
},
{
"epoch": 0.858,
"grad_norm": 0.6217081546783447,
"learning_rate": 1.4974981163864896e-05,
"loss": 1.1772,
"num_input_tokens_seen": 5622988800,
"step": 85800,
"train_runtime": 41497.5379,
"train_tokens_per_second": 135501.745
},
{
"epoch": 0.859,
"grad_norm": 0.6180946826934814,
"learning_rate": 1.4768342263385192e-05,
"loss": 1.1601,
"num_input_tokens_seen": 5629542400,
"step": 85900,
"train_runtime": 41546.6611,
"train_tokens_per_second": 135499.274
},
{
"epoch": 0.86,
"grad_norm": 0.5609486103057861,
"learning_rate": 1.4563065144197517e-05,
"loss": 1.1866,
"num_input_tokens_seen": 5636096000,
"step": 86000,
"train_runtime": 41594.2678,
"train_tokens_per_second": 135501.748
},
{
"epoch": 0.861,
"grad_norm": 0.5352550148963928,
"learning_rate": 1.4359151873440216e-05,
"loss": 1.1732,
"num_input_tokens_seen": 5642649600,
"step": 86100,
"train_runtime": 41640.8053,
"train_tokens_per_second": 135507.696
},
{
"epoch": 0.862,
"grad_norm": 0.5788577198982239,
"learning_rate": 1.415660450451767e-05,
"loss": 1.1785,
"num_input_tokens_seen": 5649203200,
"step": 86200,
"train_runtime": 41695.0254,
"train_tokens_per_second": 135488.662
},
{
"epoch": 0.863,
"grad_norm": 0.5672028064727783,
"learning_rate": 1.3955425077079595e-05,
"loss": 1.1692,
"num_input_tokens_seen": 5655756800,
"step": 86300,
"train_runtime": 41742.7936,
"train_tokens_per_second": 135490.615
},
{
"epoch": 0.864,
"grad_norm": 0.577563464641571,
"learning_rate": 1.375561561700061e-05,
"loss": 1.1662,
"num_input_tokens_seen": 5662310400,
"step": 86400,
"train_runtime": 41789.652,
"train_tokens_per_second": 135495.515
},
{
"epoch": 0.865,
"grad_norm": 0.544994592666626,
"learning_rate": 1.3557178136359798e-05,
"loss": 1.1665,
"num_input_tokens_seen": 5668864000,
"step": 86500,
"train_runtime": 41842.8709,
"train_tokens_per_second": 135479.805
},
{
"epoch": 0.866,
"grad_norm": 0.5978608727455139,
"learning_rate": 1.3360114633420333e-05,
"loss": 1.1644,
"num_input_tokens_seen": 5675417600,
"step": 86600,
"train_runtime": 41891.5143,
"train_tokens_per_second": 135478.932
},
{
"epoch": 0.867,
"grad_norm": 0.6005887985229492,
"learning_rate": 1.3164427092609503e-05,
"loss": 1.1742,
"num_input_tokens_seen": 5681971200,
"step": 86700,
"train_runtime": 41939.4895,
"train_tokens_per_second": 135480.218
},
{
"epoch": 0.868,
"grad_norm": 0.5312247276306152,
"learning_rate": 1.2970117484498732e-05,
"loss": 1.1575,
"num_input_tokens_seen": 5688524800,
"step": 86800,
"train_runtime": 41987.1811,
"train_tokens_per_second": 135482.418
},
{
"epoch": 0.869,
"grad_norm": 0.9317598342895508,
"learning_rate": 1.2777187765783558e-05,
"loss": 1.1668,
"num_input_tokens_seen": 5695078400,
"step": 86900,
"train_runtime": 42034.5611,
"train_tokens_per_second": 135485.616
},
{
"epoch": 0.87,
"grad_norm": 0.5501394271850586,
"learning_rate": 1.2585639879264103e-05,
"loss": 1.1741,
"num_input_tokens_seen": 5701632000,
"step": 87000,
"train_runtime": 42082.1201,
"train_tokens_per_second": 135488.231
},
{
"epoch": 0.871,
"grad_norm": 0.6144236326217651,
"learning_rate": 1.2395475753825518e-05,
"loss": 1.1665,
"num_input_tokens_seen": 5708185600,
"step": 87100,
"train_runtime": 42136.7086,
"train_tokens_per_second": 135468.236
},
{
"epoch": 0.872,
"grad_norm": 0.6324082612991333,
"learning_rate": 1.2206697304418367e-05,
"loss": 1.1523,
"num_input_tokens_seen": 5714739200,
"step": 87200,
"train_runtime": 42184.2095,
"train_tokens_per_second": 135471.051
},
{
"epoch": 0.873,
"grad_norm": 0.6486518979072571,
"learning_rate": 1.2019306432039594e-05,
"loss": 1.1872,
"num_input_tokens_seen": 5721292800,
"step": 87300,
"train_runtime": 42230.9222,
"train_tokens_per_second": 135476.388
},
{
"epoch": 0.874,
"grad_norm": 0.5755148530006409,
"learning_rate": 1.1833305023713153e-05,
"loss": 1.1963,
"num_input_tokens_seen": 5727846400,
"step": 87400,
"train_runtime": 42278.9901,
"train_tokens_per_second": 135477.37
},
{
"epoch": 0.875,
"grad_norm": 0.6408706307411194,
"learning_rate": 1.1648694952471205e-05,
"loss": 1.163,
"num_input_tokens_seen": 5734400000,
"step": 87500,
"train_runtime": 42326.8376,
"train_tokens_per_second": 135479.056
},
{
"epoch": 0.876,
"grad_norm": 0.6233325600624084,
"learning_rate": 1.1465478077335088e-05,
"loss": 1.1591,
"num_input_tokens_seen": 5740953600,
"step": 87600,
"train_runtime": 42379.4952,
"train_tokens_per_second": 135465.36
},
{
"epoch": 0.877,
"grad_norm": 0.8282228708267212,
"learning_rate": 1.1283656243296695e-05,
"loss": 1.1799,
"num_input_tokens_seen": 5747507200,
"step": 87700,
"train_runtime": 42427.8149,
"train_tokens_per_second": 135465.548
},
{
"epoch": 0.878,
"grad_norm": 0.7755045294761658,
"learning_rate": 1.1103231281299923e-05,
"loss": 1.1565,
"num_input_tokens_seen": 5754060800,
"step": 87800,
"train_runtime": 42474.6192,
"train_tokens_per_second": 135470.568
},
{
"epoch": 0.879,
"grad_norm": 0.6230588555335999,
"learning_rate": 1.0924205008222086e-05,
"loss": 1.1673,
"num_input_tokens_seen": 5760614400,
"step": 87900,
"train_runtime": 42522.6205,
"train_tokens_per_second": 135471.764
},
{
"epoch": 0.88,
"grad_norm": 0.5966441035270691,
"learning_rate": 1.0746579226855768e-05,
"loss": 1.1628,
"num_input_tokens_seen": 5767168000,
"step": 88000,
"train_runtime": 42576.1454,
"train_tokens_per_second": 135455.381
},
{
"epoch": 0.881,
"grad_norm": 0.6604552865028381,
"learning_rate": 1.0570355725890678e-05,
"loss": 1.1769,
"num_input_tokens_seen": 5773721600,
"step": 88100,
"train_runtime": 42624.8502,
"train_tokens_per_second": 135454.355
},
{
"epoch": 0.882,
"grad_norm": 0.5727500319480896,
"learning_rate": 1.0395536279895428e-05,
"loss": 1.1571,
"num_input_tokens_seen": 5780275200,
"step": 88200,
"train_runtime": 42673.6883,
"train_tokens_per_second": 135452.909
},
{
"epoch": 0.883,
"grad_norm": 0.5748215317726135,
"learning_rate": 1.0222122649299952e-05,
"loss": 1.1666,
"num_input_tokens_seen": 5786828800,
"step": 88300,
"train_runtime": 42720.0242,
"train_tokens_per_second": 135459.399
},
{
"epoch": 0.884,
"grad_norm": 0.6671021580696106,
"learning_rate": 1.0050116580377593e-05,
"loss": 1.1887,
"num_input_tokens_seen": 5793382400,
"step": 88400,
"train_runtime": 42766.9841,
"train_tokens_per_second": 135463.899
},
{
"epoch": 0.885,
"grad_norm": 0.7352688908576965,
"learning_rate": 9.879519805227515e-06,
"loss": 1.173,
"num_input_tokens_seen": 5799936000,
"step": 88500,
"train_runtime": 42820.4689,
"train_tokens_per_second": 135447.746
},
{
"epoch": 0.886,
"grad_norm": 0.5779001712799072,
"learning_rate": 9.710334041757351e-06,
"loss": 1.1612,
"num_input_tokens_seen": 5806489600,
"step": 88600,
"train_runtime": 42866.8877,
"train_tokens_per_second": 135453.958
},
{
"epoch": 0.887,
"grad_norm": 0.7246189713478088,
"learning_rate": 9.542560993665932e-06,
"loss": 1.1926,
"num_input_tokens_seen": 5813043200,
"step": 88700,
"train_runtime": 42915.9912,
"train_tokens_per_second": 135451.682
},
{
"epoch": 0.888,
"grad_norm": 0.5459685921669006,
"learning_rate": 9.376202350425888e-06,
"loss": 1.1698,
"num_input_tokens_seen": 5819596800,
"step": 88800,
"train_runtime": 42964.4051,
"train_tokens_per_second": 135451.586
},
{
"epoch": 0.889,
"grad_norm": 0.5574699640274048,
"learning_rate": 9.211259787266972e-06,
"loss": 1.1627,
"num_input_tokens_seen": 5826150400,
"step": 88900,
"train_runtime": 43011.9797,
"train_tokens_per_second": 135454.133
},
{
"epoch": 0.89,
"grad_norm": 0.5637386441230774,
"learning_rate": 9.047734965158966e-06,
"loss": 1.1659,
"num_input_tokens_seen": 5832704000,
"step": 89000,
"train_runtime": 43065.5789,
"train_tokens_per_second": 135437.724
},
{
"epoch": 0.891,
"grad_norm": 0.5420241951942444,
"learning_rate": 8.885629530794997e-06,
"loss": 1.1693,
"num_input_tokens_seen": 5839257600,
"step": 89100,
"train_runtime": 43113.8932,
"train_tokens_per_second": 135437.957
},
{
"epoch": 0.892,
"grad_norm": 0.5701260566711426,
"learning_rate": 8.724945116574983e-06,
"loss": 1.1592,
"num_input_tokens_seen": 5845811200,
"step": 89200,
"train_runtime": 43161.415,
"train_tokens_per_second": 135440.675
},
{
"epoch": 0.893,
"grad_norm": 0.5882892608642578,
"learning_rate": 8.565683340589185e-06,
"loss": 1.1601,
"num_input_tokens_seen": 5852364800,
"step": 89300,
"train_runtime": 43209.5307,
"train_tokens_per_second": 135441.527
},
{
"epoch": 0.894,
"grad_norm": 0.5708109736442566,
"learning_rate": 8.40784580660196e-06,
"loss": 1.1684,
"num_input_tokens_seen": 5858918400,
"step": 89400,
"train_runtime": 43257.3597,
"train_tokens_per_second": 135443.273
},
{
"epoch": 0.895,
"grad_norm": 0.5796698927879333,
"learning_rate": 8.251434104035465e-06,
"loss": 1.1753,
"num_input_tokens_seen": 5865472000,
"step": 89500,
"train_runtime": 43305.3116,
"train_tokens_per_second": 135444.632
},
{
"epoch": 0.896,
"grad_norm": 0.9602819681167603,
"learning_rate": 8.09644980795383e-06,
"loss": 1.1672,
"num_input_tokens_seen": 5872025600,
"step": 89600,
"train_runtime": 43360.788,
"train_tokens_per_second": 135422.484
},
{
"epoch": 0.897,
"grad_norm": 0.6962534189224243,
"learning_rate": 7.942894479047252e-06,
"loss": 1.1622,
"num_input_tokens_seen": 5878579200,
"step": 89700,
"train_runtime": 43407.8503,
"train_tokens_per_second": 135426.637
},
{
"epoch": 0.898,
"grad_norm": 0.6292552351951599,
"learning_rate": 7.790769663616098e-06,
"loss": 1.1632,
"num_input_tokens_seen": 5885132800,
"step": 89800,
"train_runtime": 43455.9389,
"train_tokens_per_second": 135427.584
},
{
"epoch": 0.899,
"grad_norm": 0.5883670449256897,
"learning_rate": 7.64007689355563e-06,
"loss": 1.1632,
"num_input_tokens_seen": 5891686400,
"step": 89900,
"train_runtime": 43504.2315,
"train_tokens_per_second": 135427.893
},
{
"epoch": 0.9,
"grad_norm": 0.8059070706367493,
"learning_rate": 7.490817686340361e-06,
"loss": 1.1728,
"num_input_tokens_seen": 5898240000,
"step": 90000,
"train_runtime": 43552.1457,
"train_tokens_per_second": 135429.378
},
{
"epoch": 0.901,
"grad_norm": 0.5949374437332153,
"learning_rate": 7.342993545008818e-06,
"loss": 1.1732,
"num_input_tokens_seen": 5904793600,
"step": 90100,
"train_runtime": 43599.6931,
"train_tokens_per_second": 135431.999
},
{
"epoch": 0.902,
"grad_norm": 0.6094557642936707,
"learning_rate": 7.196605958148505e-06,
"loss": 1.1713,
"num_input_tokens_seen": 5911347200,
"step": 90200,
"train_runtime": 43653.2541,
"train_tokens_per_second": 135415.957
},
{
"epoch": 0.903,
"grad_norm": 0.6275845170021057,
"learning_rate": 7.051656399880778e-06,
"loss": 1.1743,
"num_input_tokens_seen": 5917900800,
"step": 90300,
"train_runtime": 43702.1275,
"train_tokens_per_second": 135414.478
},
{
"epoch": 0.904,
"grad_norm": 0.7113337516784668,
"learning_rate": 6.9081463298460815e-06,
"loss": 1.162,
"num_input_tokens_seen": 5924454400,
"step": 90400,
"train_runtime": 43749.6704,
"train_tokens_per_second": 135417.121
},
{
"epoch": 0.905,
"grad_norm": 0.6237180233001709,
"learning_rate": 6.766077193189201e-06,
"loss": 1.159,
"num_input_tokens_seen": 5931008000,
"step": 90500,
"train_runtime": 43797.6522,
"train_tokens_per_second": 135418.4
},
{
"epoch": 0.906,
"grad_norm": 0.9803968667984009,
"learning_rate": 6.625450420544831e-06,
"loss": 1.1788,
"num_input_tokens_seen": 5937561600,
"step": 90600,
"train_runtime": 43846.1111,
"train_tokens_per_second": 135418.203
},
{
"epoch": 0.907,
"grad_norm": 0.5648267269134521,
"learning_rate": 6.486267428022967e-06,
"loss": 1.1581,
"num_input_tokens_seen": 5944115200,
"step": 90700,
"train_runtime": 43893.4216,
"train_tokens_per_second": 135421.55
},
{
"epoch": 0.908,
"grad_norm": 0.610898494720459,
"learning_rate": 6.34852961719477e-06,
"loss": 1.1557,
"num_input_tokens_seen": 5950668800,
"step": 90800,
"train_runtime": 43947.4481,
"train_tokens_per_second": 135404.194
},
{
"epoch": 0.909,
"grad_norm": 0.732876718044281,
"learning_rate": 6.212238375078521e-06,
"loss": 1.1683,
"num_input_tokens_seen": 5957222400,
"step": 90900,
"train_runtime": 43996.4271,
"train_tokens_per_second": 135402.413
},
{
"epoch": 0.91,
"grad_norm": 0.5793011784553528,
"learning_rate": 6.077395074125491e-06,
"loss": 1.1747,
"num_input_tokens_seen": 5963776000,
"step": 91000,
"train_runtime": 44044.5112,
"train_tokens_per_second": 135403.387
},
{
"epoch": 0.911,
"grad_norm": 0.6567527651786804,
"learning_rate": 5.944001072206212e-06,
"loss": 1.1594,
"num_input_tokens_seen": 5970329600,
"step": 91100,
"train_runtime": 44091.43,
"train_tokens_per_second": 135407.938
},
{
"epoch": 0.912,
"grad_norm": 0.6197203397750854,
"learning_rate": 5.812057712596807e-06,
"loss": 1.1504,
"num_input_tokens_seen": 5976883200,
"step": 91200,
"train_runtime": 44140.2623,
"train_tokens_per_second": 135406.608
},
{
"epoch": 0.913,
"grad_norm": 0.6190736889839172,
"learning_rate": 5.681566323965486e-06,
"loss": 1.1645,
"num_input_tokens_seen": 5983436800,
"step": 91300,
"train_runtime": 44194.3429,
"train_tokens_per_second": 135389.202
},
{
"epoch": 0.914,
"grad_norm": 0.5632036924362183,
"learning_rate": 5.552528220359004e-06,
"loss": 1.1691,
"num_input_tokens_seen": 5989990400,
"step": 91400,
"train_runtime": 44242.165,
"train_tokens_per_second": 135390.987
},
{
"epoch": 0.915,
"grad_norm": 0.6650084257125854,
"learning_rate": 5.424944701189704e-06,
"loss": 1.1587,
"num_input_tokens_seen": 5996544000,
"step": 91500,
"train_runtime": 44290.3253,
"train_tokens_per_second": 135391.735
},
{
"epoch": 0.916,
"grad_norm": 0.6665343642234802,
"learning_rate": 5.298817051222182e-06,
"loss": 1.16,
"num_input_tokens_seen": 6003097600,
"step": 91600,
"train_runtime": 44344.1461,
"train_tokens_per_second": 135375.199
},
{
"epoch": 0.917,
"grad_norm": 0.9934324026107788,
"learning_rate": 5.174146540560442e-06,
"loss": 1.186,
"num_input_tokens_seen": 6009651200,
"step": 91700,
"train_runtime": 44386.6411,
"train_tokens_per_second": 135393.241
},
{
"epoch": 0.918,
"grad_norm": 0.587840735912323,
"learning_rate": 5.050934424635195e-06,
"loss": 1.1685,
"num_input_tokens_seen": 6016204800,
"step": 91800,
"train_runtime": 44440.2445,
"train_tokens_per_second": 135377.401
},
{
"epoch": 0.919,
"grad_norm": 0.6308780312538147,
"learning_rate": 4.9291819441910465e-06,
"loss": 1.1593,
"num_input_tokens_seen": 6022758400,
"step": 91900,
"train_runtime": 44487.4748,
"train_tokens_per_second": 135380.99
},
{
"epoch": 0.92,
"grad_norm": 0.6875436305999756,
"learning_rate": 4.808890325274129e-06,
"loss": 1.1686,
"num_input_tokens_seen": 6029312000,
"step": 92000,
"train_runtime": 44535.4396,
"train_tokens_per_second": 135382.339
},
{
"epoch": 0.921,
"grad_norm": 0.6450539231300354,
"learning_rate": 4.690060779219723e-06,
"loss": 1.1669,
"num_input_tokens_seen": 6035865600,
"step": 92100,
"train_runtime": 44583.0204,
"train_tokens_per_second": 135384.852
},
{
"epoch": 0.922,
"grad_norm": 1.0118526220321655,
"learning_rate": 4.572694502640023e-06,
"loss": 1.1601,
"num_input_tokens_seen": 6042419200,
"step": 92200,
"train_runtime": 44632.4327,
"train_tokens_per_second": 135381.803
},
{
"epoch": 0.923,
"grad_norm": 0.5630050897598267,
"learning_rate": 4.456792677412141e-06,
"loss": 1.164,
"num_input_tokens_seen": 6048972800,
"step": 92300,
"train_runtime": 44685.5287,
"train_tokens_per_second": 135367.6
},
{
"epoch": 0.924,
"grad_norm": 0.5819036364555359,
"learning_rate": 4.342356470666153e-06,
"loss": 1.177,
"num_input_tokens_seen": 6055526400,
"step": 92400,
"train_runtime": 44733.1102,
"train_tokens_per_second": 135370.118
},
{
"epoch": 0.925,
"grad_norm": 0.5852016806602478,
"learning_rate": 4.22938703477344e-06,
"loss": 1.1846,
"num_input_tokens_seen": 6062080000,
"step": 92500,
"train_runtime": 44781.2518,
"train_tokens_per_second": 135370.937
},
{
"epoch": 0.926,
"grad_norm": 0.7466326355934143,
"learning_rate": 4.117885507334884e-06,
"loss": 1.1564,
"num_input_tokens_seen": 6068633600,
"step": 92600,
"train_runtime": 44829.0669,
"train_tokens_per_second": 135372.739
},
{
"epoch": 0.927,
"grad_norm": 0.7777779698371887,
"learning_rate": 4.007853011169687e-06,
"loss": 1.1654,
"num_input_tokens_seen": 6075187200,
"step": 92700,
"train_runtime": 44882.4041,
"train_tokens_per_second": 135357.883
},
{
"epoch": 0.928,
"grad_norm": 0.9159000515937805,
"learning_rate": 3.899290654303855e-06,
"loss": 1.1854,
"num_input_tokens_seen": 6081740800,
"step": 92800,
"train_runtime": 44929.6625,
"train_tokens_per_second": 135361.373
},
{
"epoch": 0.929,
"grad_norm": 0.5948230028152466,
"learning_rate": 3.7921995299591168e-06,
"loss": 1.1602,
"num_input_tokens_seen": 6088294400,
"step": 92900,
"train_runtime": 44977.4717,
"train_tokens_per_second": 135363.198
},
{
"epoch": 0.93,
"grad_norm": 0.5999124646186829,
"learning_rate": 3.686580716541887e-06,
"loss": 1.1484,
"num_input_tokens_seen": 6094848000,
"step": 93000,
"train_runtime": 45026.2424,
"train_tokens_per_second": 135362.128
},
{
"epoch": 0.931,
"grad_norm": 0.6015925407409668,
"learning_rate": 3.582435277632456e-06,
"loss": 1.1638,
"num_input_tokens_seen": 6101401600,
"step": 93100,
"train_runtime": 45073.6825,
"train_tokens_per_second": 135365.057
},
{
"epoch": 0.932,
"grad_norm": 0.5493288040161133,
"learning_rate": 3.479764261974266e-06,
"loss": 1.1644,
"num_input_tokens_seen": 6107955200,
"step": 93200,
"train_runtime": 45131.734,
"train_tokens_per_second": 135336.152
},
{
"epoch": 0.933,
"grad_norm": 0.5847836136817932,
"learning_rate": 3.3785687034632523e-06,
"loss": 1.1528,
"num_input_tokens_seen": 6114508800,
"step": 93300,
"train_runtime": 45180.4411,
"train_tokens_per_second": 135335.305
},
{
"epoch": 0.934,
"grad_norm": 0.6086737513542175,
"learning_rate": 3.2788496211376024e-06,
"loss": 1.1525,
"num_input_tokens_seen": 6121062400,
"step": 93400,
"train_runtime": 45228.3556,
"train_tokens_per_second": 135336.833
},
{
"epoch": 0.935,
"grad_norm": 0.6097891330718994,
"learning_rate": 3.180608019167363e-06,
"loss": 1.1681,
"num_input_tokens_seen": 6127616000,
"step": 93500,
"train_runtime": 45275.6501,
"train_tokens_per_second": 135340.21
},
{
"epoch": 0.936,
"grad_norm": 0.5980057716369629,
"learning_rate": 3.0838448868443665e-06,
"loss": 1.1603,
"num_input_tokens_seen": 6134169600,
"step": 93600,
"train_runtime": 45322.6488,
"train_tokens_per_second": 135344.464
},
{
"epoch": 0.937,
"grad_norm": 0.7306444048881531,
"learning_rate": 2.988561198572287e-06,
"loss": 1.1702,
"num_input_tokens_seen": 6140723200,
"step": 93700,
"train_runtime": 45376.9708,
"train_tokens_per_second": 135326.865
},
{
"epoch": 0.938,
"grad_norm": 0.9187434911727905,
"learning_rate": 2.8947579138567987e-06,
"loss": 1.1654,
"num_input_tokens_seen": 6147276800,
"step": 93800,
"train_runtime": 45427.1088,
"train_tokens_per_second": 135321.771
},
{
"epoch": 0.939,
"grad_norm": 0.6403319835662842,
"learning_rate": 2.8024359772959525e-06,
"loss": 1.1581,
"num_input_tokens_seen": 6153830400,
"step": 93900,
"train_runtime": 45475.34,
"train_tokens_per_second": 135322.362
},
{
"epoch": 0.94,
"grad_norm": 0.7088416218757629,
"learning_rate": 2.711596318570597e-06,
"loss": 1.1683,
"num_input_tokens_seen": 6160384000,
"step": 94000,
"train_runtime": 45523.8789,
"train_tokens_per_second": 135322.037
},
{
"epoch": 0.941,
"grad_norm": 0.6289553642272949,
"learning_rate": 2.6222398524351206e-06,
"loss": 1.1538,
"num_input_tokens_seen": 6166937600,
"step": 94100,
"train_runtime": 45571.6907,
"train_tokens_per_second": 135323.871
},
{
"epoch": 0.942,
"grad_norm": 0.8788822889328003,
"learning_rate": 2.5343674787081435e-06,
"loss": 1.1666,
"num_input_tokens_seen": 6173491200,
"step": 94200,
"train_runtime": 45621.3271,
"train_tokens_per_second": 135320.29
},
{
"epoch": 0.943,
"grad_norm": 0.575515866279602,
"learning_rate": 2.4479800822634565e-06,
"loss": 1.1685,
"num_input_tokens_seen": 6180044800,
"step": 94300,
"train_runtime": 45670.6842,
"train_tokens_per_second": 135317.543
},
{
"epoch": 0.944,
"grad_norm": 0.5740439891815186,
"learning_rate": 2.3630785330212286e-06,
"loss": 1.1588,
"num_input_tokens_seen": 6186598400,
"step": 94400,
"train_runtime": 45717.875,
"train_tokens_per_second": 135321.215
},
{
"epoch": 0.945,
"grad_norm": 0.6576538681983948,
"learning_rate": 2.2796636859390815e-06,
"loss": 1.1492,
"num_input_tokens_seen": 6193152000,
"step": 94500,
"train_runtime": 45766.0209,
"train_tokens_per_second": 135322.055
},
{
"epoch": 0.946,
"grad_norm": 0.5781713128089905,
"learning_rate": 2.197736381003612e-06,
"loss": 1.1725,
"num_input_tokens_seen": 6199705600,
"step": 94600,
"train_runtime": 45819.6687,
"train_tokens_per_second": 135306.644
},
{
"epoch": 0.947,
"grad_norm": 0.6812490820884705,
"learning_rate": 2.1172974432218826e-06,
"loss": 1.1509,
"num_input_tokens_seen": 6206259200,
"step": 94700,
"train_runtime": 45866.8187,
"train_tokens_per_second": 135310.435
},
{
"epoch": 0.948,
"grad_norm": 0.8884466886520386,
"learning_rate": 2.0383476826130786e-06,
"loss": 1.157,
"num_input_tokens_seen": 6212812800,
"step": 94800,
"train_runtime": 45915.7744,
"train_tokens_per_second": 135308.897
},
{
"epoch": 0.949,
"grad_norm": 0.6096293926239014,
"learning_rate": 1.96088789420043e-06,
"loss": 1.1609,
"num_input_tokens_seen": 6219366400,
"step": 94900,
"train_runtime": 45963.3824,
"train_tokens_per_second": 135311.33
},
{
"epoch": 0.95,
"grad_norm": 0.5762118697166443,
"learning_rate": 1.8849188580031539e-06,
"loss": 1.1621,
"num_input_tokens_seen": 6225920000,
"step": 95000,
"train_runtime": 46012.4538,
"train_tokens_per_second": 135309.454
},
{
"epoch": 0.951,
"grad_norm": 0.5296618938446045,
"learning_rate": 1.8104413390286066e-06,
"loss": 1.157,
"num_input_tokens_seen": 6232473600,
"step": 95100,
"train_runtime": 46059.2761,
"train_tokens_per_second": 135314.189
},
{
"epoch": 0.952,
"grad_norm": 0.6025533676147461,
"learning_rate": 1.7374560872645438e-06,
"loss": 1.1507,
"num_input_tokens_seen": 6239027200,
"step": 95200,
"train_runtime": 46113.68,
"train_tokens_per_second": 135296.667
},
{
"epoch": 0.953,
"grad_norm": 0.616148829460144,
"learning_rate": 1.6659638376716578e-06,
"loss": 1.1711,
"num_input_tokens_seen": 6245580800,
"step": 95300,
"train_runtime": 46162.0494,
"train_tokens_per_second": 135296.87
},
{
"epoch": 0.954,
"grad_norm": 0.6661262512207031,
"learning_rate": 1.5959653101761172e-06,
"loss": 1.1604,
"num_input_tokens_seen": 6252134400,
"step": 95400,
"train_runtime": 46208.848,
"train_tokens_per_second": 135301.672
},
{
"epoch": 0.955,
"grad_norm": 0.8173303604125977,
"learning_rate": 1.5274612096623063e-06,
"loss": 1.1498,
"num_input_tokens_seen": 6258688000,
"step": 95500,
"train_runtime": 46256.5159,
"train_tokens_per_second": 135303.922
},
{
"epoch": 0.956,
"grad_norm": 0.6189817786216736,
"learning_rate": 1.4604522259657635e-06,
"loss": 1.1602,
"num_input_tokens_seen": 6265241600,
"step": 95600,
"train_runtime": 46309.4141,
"train_tokens_per_second": 135290.885
},
{
"epoch": 0.957,
"grad_norm": 0.7523248195648193,
"learning_rate": 1.3949390338662047e-06,
"loss": 1.1655,
"num_input_tokens_seen": 6271795200,
"step": 95700,
"train_runtime": 46357.4405,
"train_tokens_per_second": 135292.094
},
{
"epoch": 0.958,
"grad_norm": 0.5935103297233582,
"learning_rate": 1.330922293080744e-06,
"loss": 1.1702,
"num_input_tokens_seen": 6278348800,
"step": 95800,
"train_runtime": 46406.0604,
"train_tokens_per_second": 135291.571
},
{
"epoch": 0.959,
"grad_norm": 0.8042653203010559,
"learning_rate": 1.2684026482572662e-06,
"loss": 1.1623,
"num_input_tokens_seen": 6284902400,
"step": 95900,
"train_runtime": 46454.8491,
"train_tokens_per_second": 135290.557
},
{
"epoch": 0.96,
"grad_norm": 0.5935735106468201,
"learning_rate": 1.2073807289678993e-06,
"loss": 1.1441,
"num_input_tokens_seen": 6291456000,
"step": 96000,
"train_runtime": 46502.688,
"train_tokens_per_second": 135292.308
},
{
"epoch": 0.961,
"grad_norm": 0.5718377828598022,
"learning_rate": 1.147857149702669e-06,
"loss": 1.1618,
"num_input_tokens_seen": 6298009600,
"step": 96100,
"train_runtime": 46555.2337,
"train_tokens_per_second": 135280.378
},
{
"epoch": 0.962,
"grad_norm": 0.6801995635032654,
"learning_rate": 1.0898325098633697e-06,
"loss": 1.1479,
"num_input_tokens_seen": 6304563200,
"step": 96200,
"train_runtime": 46603.2751,
"train_tokens_per_second": 135281.548
},
{
"epoch": 0.963,
"grad_norm": 0.5564619898796082,
"learning_rate": 1.0333073937575043e-06,
"loss": 1.1582,
"num_input_tokens_seen": 6311116800,
"step": 96300,
"train_runtime": 46652.5681,
"train_tokens_per_second": 135279.087
},
{
"epoch": 0.964,
"grad_norm": 0.6501321792602539,
"learning_rate": 9.782823705923204e-07,
"loss": 1.1617,
"num_input_tokens_seen": 6317670400,
"step": 96400,
"train_runtime": 46700.1727,
"train_tokens_per_second": 135281.521
},
{
"epoch": 0.965,
"grad_norm": 0.6728459596633911,
"learning_rate": 9.247579944692162e-07,
"loss": 1.1592,
"num_input_tokens_seen": 6324224000,
"step": 96500,
"train_runtime": 46748.7553,
"train_tokens_per_second": 135281.12
},
{
"epoch": 0.966,
"grad_norm": 0.5893784761428833,
"learning_rate": 8.72734804378078e-07,
"loss": 1.1691,
"num_input_tokens_seen": 6330777600,
"step": 96600,
"train_runtime": 46801.015,
"train_tokens_per_second": 135270.092
},
{
"epoch": 0.967,
"grad_norm": 0.8625339269638062,
"learning_rate": 8.222133241918172e-07,
"loss": 1.1518,
"num_input_tokens_seen": 6337331200,
"step": 96700,
"train_runtime": 46847.2237,
"train_tokens_per_second": 135276.559
},
{
"epoch": 0.968,
"grad_norm": 0.6501858830451965,
"learning_rate": 7.731940626612088e-07,
"loss": 1.1693,
"num_input_tokens_seen": 6343884800,
"step": 96800,
"train_runtime": 46895.3712,
"train_tokens_per_second": 135277.419
},
{
"epoch": 0.969,
"grad_norm": 0.6575475335121155,
"learning_rate": 7.256775134096615e-07,
"loss": 1.1552,
"num_input_tokens_seen": 6350438400,
"step": 96900,
"train_runtime": 46942.8491,
"train_tokens_per_second": 135280.208
},
{
"epoch": 0.97,
"grad_norm": 0.5287050604820251,
"learning_rate": 6.796641549283055e-07,
"loss": 1.1946,
"num_input_tokens_seen": 6356992000,
"step": 97000,
"train_runtime": 46991.8919,
"train_tokens_per_second": 135278.486
},
{
"epoch": 0.971,
"grad_norm": 0.568566083908081,
"learning_rate": 6.351544505711292e-07,
"loss": 1.1559,
"num_input_tokens_seen": 6363545600,
"step": 97100,
"train_runtime": 47040.0316,
"train_tokens_per_second": 135279.365
},
{
"epoch": 0.972,
"grad_norm": 0.9329395890235901,
"learning_rate": 5.921488485503833e-07,
"loss": 1.1603,
"num_input_tokens_seen": 6370099200,
"step": 97200,
"train_runtime": 47092.2725,
"train_tokens_per_second": 135268.46
},
{
"epoch": 0.973,
"grad_norm": 0.6256415843963623,
"learning_rate": 5.506477819319843e-07,
"loss": 1.1571,
"num_input_tokens_seen": 6376652800,
"step": 97300,
"train_runtime": 47139.4068,
"train_tokens_per_second": 135272.233
},
{
"epoch": 0.974,
"grad_norm": 0.7202081680297852,
"learning_rate": 5.106516686312345e-07,
"loss": 1.1638,
"num_input_tokens_seen": 6383206400,
"step": 97400,
"train_runtime": 47191.9059,
"train_tokens_per_second": 135260.619
},
{
"epoch": 0.975,
"grad_norm": 1.2700363397598267,
"learning_rate": 4.721609114085256e-07,
"loss": 1.1649,
"num_input_tokens_seen": 6389760000,
"step": 97500,
"train_runtime": 47240.0777,
"train_tokens_per_second": 135261.42
},
{
"epoch": 0.976,
"grad_norm": 0.5555500388145447,
"learning_rate": 4.3517589786539186e-07,
"loss": 1.1505,
"num_input_tokens_seen": 6396313600,
"step": 97600,
"train_runtime": 47287.972,
"train_tokens_per_second": 135263.013
},
{
"epoch": 0.977,
"grad_norm": 0.6499391198158264,
"learning_rate": 3.996970004404798e-07,
"loss": 1.153,
"num_input_tokens_seen": 6402867200,
"step": 97700,
"train_runtime": 47335.8726,
"train_tokens_per_second": 135264.586
},
{
"epoch": 0.978,
"grad_norm": 0.6353591084480286,
"learning_rate": 3.657245764058847e-07,
"loss": 1.1621,
"num_input_tokens_seen": 6409420800,
"step": 97800,
"train_runtime": 47382.5196,
"train_tokens_per_second": 135269.733
},
{
"epoch": 0.979,
"grad_norm": 0.62052321434021,
"learning_rate": 3.3325896786355334e-07,
"loss": 1.1539,
"num_input_tokens_seen": 6415974400,
"step": 97900,
"train_runtime": 47435.6023,
"train_tokens_per_second": 135256.518
},
{
"epoch": 0.98,
"grad_norm": 0.5979087352752686,
"learning_rate": 3.023005017418201e-07,
"loss": 1.1615,
"num_input_tokens_seen": 6422528000,
"step": 98000,
"train_runtime": 47484.0018,
"train_tokens_per_second": 135256.671
},
{
"epoch": 0.981,
"grad_norm": 1.0899096727371216,
"learning_rate": 2.7284948979205967e-07,
"loss": 1.166,
"num_input_tokens_seen": 6429081600,
"step": 98100,
"train_runtime": 47531.611,
"train_tokens_per_second": 135259.072
},
{
"epoch": 0.982,
"grad_norm": 0.6240010857582092,
"learning_rate": 2.449062285856729e-07,
"loss": 1.1565,
"num_input_tokens_seen": 6435635200,
"step": 98200,
"train_runtime": 47578.8884,
"train_tokens_per_second": 135262.412
},
{
"epoch": 0.983,
"grad_norm": 0.7941544651985168,
"learning_rate": 2.184709995109557e-07,
"loss": 1.1572,
"num_input_tokens_seen": 6442188800,
"step": 98300,
"train_runtime": 47627.3828,
"train_tokens_per_second": 135262.289
},
{
"epoch": 0.984,
"grad_norm": 0.5704551339149475,
"learning_rate": 1.9354406877038487e-07,
"loss": 1.1629,
"num_input_tokens_seen": 6448742400,
"step": 98400,
"train_runtime": 47679.6586,
"train_tokens_per_second": 135251.438
},
{
"epoch": 0.985,
"grad_norm": 0.5758212208747864,
"learning_rate": 1.7012568737788668e-07,
"loss": 1.1892,
"num_input_tokens_seen": 6455296000,
"step": 98500,
"train_runtime": 47728.7818,
"train_tokens_per_second": 135249.545
},
{
"epoch": 0.986,
"grad_norm": 0.5768951773643494,
"learning_rate": 1.4821609115630574e-07,
"loss": 1.1617,
"num_input_tokens_seen": 6461849600,
"step": 98600,
"train_runtime": 47775.3275,
"train_tokens_per_second": 135254.952
},
{
"epoch": 0.987,
"grad_norm": 0.5714033842086792,
"learning_rate": 1.278155007350068e-07,
"loss": 1.1712,
"num_input_tokens_seen": 6468403200,
"step": 98700,
"train_runtime": 47823.1467,
"train_tokens_per_second": 135256.746
},
{
"epoch": 0.988,
"grad_norm": 1.029975414276123,
"learning_rate": 1.089241215477099e-07,
"loss": 1.1621,
"num_input_tokens_seen": 6474956800,
"step": 98800,
"train_runtime": 47875.5087,
"train_tokens_per_second": 135245.702
},
{
"epoch": 0.989,
"grad_norm": 0.5554516315460205,
"learning_rate": 9.154214383042535e-08,
"loss": 1.1489,
"num_input_tokens_seen": 6481510400,
"step": 98900,
"train_runtime": 47923.8409,
"train_tokens_per_second": 135246.055
},
{
"epoch": 0.99,
"grad_norm": 0.6340943574905396,
"learning_rate": 7.566974261945524e-08,
"loss": 1.1721,
"num_input_tokens_seen": 6488064000,
"step": 99000,
"train_runtime": 47972.1937,
"train_tokens_per_second": 135246.348
},
{
"epoch": 0.991,
"grad_norm": 0.582399845123291,
"learning_rate": 6.13070777496949e-08,
"loss": 1.1497,
"num_input_tokens_seen": 6494617600,
"step": 99100,
"train_runtime": 48020.3976,
"train_tokens_per_second": 135247.06
},
{
"epoch": 0.992,
"grad_norm": 0.6133337020874023,
"learning_rate": 4.845429385303412e-08,
"loss": 1.1601,
"num_input_tokens_seen": 6501171200,
"step": 99200,
"train_runtime": 48068.6895,
"train_tokens_per_second": 135247.523
},
{
"epoch": 0.993,
"grad_norm": 0.5691381096839905,
"learning_rate": 3.711152035685838e-08,
"loss": 1.1571,
"num_input_tokens_seen": 6507724800,
"step": 99300,
"train_runtime": 48115.7967,
"train_tokens_per_second": 135251.315
},
{
"epoch": 0.994,
"grad_norm": 0.6613404750823975,
"learning_rate": 2.727887148278318e-08,
"loss": 1.1569,
"num_input_tokens_seen": 6514278400,
"step": 99400,
"train_runtime": 48169.6246,
"train_tokens_per_second": 135236.229
},
{
"epoch": 0.995,
"grad_norm": 0.5285235047340393,
"learning_rate": 1.8956446245455005e-08,
"loss": 1.1722,
"num_input_tokens_seen": 6520832000,
"step": 99500,
"train_runtime": 48217.4936,
"train_tokens_per_second": 135237.888
},
{
"epoch": 0.996,
"grad_norm": 0.8071156144142151,
"learning_rate": 1.2144328451618724e-08,
"loss": 1.1571,
"num_input_tokens_seen": 6527385600,
"step": 99600,
"train_runtime": 48264.7605,
"train_tokens_per_second": 135241.231
},
{
"epoch": 0.997,
"grad_norm": 0.5775815844535828,
"learning_rate": 6.84258669920168e-09,
"loss": 1.1634,
"num_input_tokens_seen": 6533939200,
"step": 99700,
"train_runtime": 48314.0709,
"train_tokens_per_second": 135238.846
},
{
"epoch": 0.998,
"grad_norm": 0.5299545526504517,
"learning_rate": 3.0512743767141524e-09,
"loss": 1.1563,
"num_input_tokens_seen": 6540492800,
"step": 99800,
"train_runtime": 48364.7142,
"train_tokens_per_second": 135232.74
},
{
"epoch": 0.999,
"grad_norm": 0.636650800704956,
"learning_rate": 7.70429662616534e-10,
"loss": 1.1653,
"num_input_tokens_seen": 6547046400,
"step": 99900,
"train_runtime": 48412.6126,
"train_tokens_per_second": 135234.313
},
{
"epoch": 1.0,
"grad_norm": 0.5705932974815369,
"learning_rate": 7.552498626495208e-14,
"loss": 1.1814,
"num_input_tokens_seen": 6553600000,
"step": 100000,
"train_runtime": 48460.0302,
"train_tokens_per_second": 135237.225
},
{
"epoch": 1.0,
"num_input_tokens_seen": 6553600000,
"step": 100000,
"total_flos": 1.23866185728e+17,
"train_loss": 1.241861473388672,
"train_runtime": 48460.2218,
"train_samples_per_second": 528.268,
"train_steps_per_second": 2.064
}
],
"logging_steps": 100,
"max_steps": 100000,
"num_input_tokens_seen": 6553600000,
"num_train_epochs": 9223372036854775807,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.23866185728e+17,
"train_batch_size": 256,
"trial_name": null,
"trial_params": null
}