| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 500, |
| "global_step": 938, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.005330490405117271, |
| "grad_norm": 0.3734353482723236, |
| "learning_rate": 2e-05, |
| "loss": 1.7104, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.010660980810234541, |
| "grad_norm": 0.2635837197303772, |
| "learning_rate": 2e-05, |
| "loss": 1.6538, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.015991471215351813, |
| "grad_norm": 0.19964125752449036, |
| "learning_rate": 2e-05, |
| "loss": 1.4818, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.021321961620469083, |
| "grad_norm": 0.2099446952342987, |
| "learning_rate": 2e-05, |
| "loss": 1.3886, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.026652452025586353, |
| "grad_norm": 0.34373781085014343, |
| "learning_rate": 2e-05, |
| "loss": 1.3364, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.031982942430703626, |
| "grad_norm": 0.18734169006347656, |
| "learning_rate": 2e-05, |
| "loss": 1.2899, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.03731343283582089, |
| "grad_norm": 0.16565224528312683, |
| "learning_rate": 2e-05, |
| "loss": 1.2788, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.042643923240938165, |
| "grad_norm": 0.20943662524223328, |
| "learning_rate": 2e-05, |
| "loss": 1.2335, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.04797441364605544, |
| "grad_norm": 0.3209005892276764, |
| "learning_rate": 2e-05, |
| "loss": 1.1854, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.053304904051172705, |
| "grad_norm": 0.3170112669467926, |
| "learning_rate": 2e-05, |
| "loss": 1.0846, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.05863539445628998, |
| "grad_norm": 0.19068752229213715, |
| "learning_rate": 2e-05, |
| "loss": 1.0976, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.06396588486140725, |
| "grad_norm": 0.19504626095294952, |
| "learning_rate": 2e-05, |
| "loss": 1.0597, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.06929637526652452, |
| "grad_norm": 0.2150755077600479, |
| "learning_rate": 2e-05, |
| "loss": 1.0369, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.07462686567164178, |
| "grad_norm": 0.3036380410194397, |
| "learning_rate": 2e-05, |
| "loss": 0.992, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.07995735607675906, |
| "grad_norm": 0.3687366545200348, |
| "learning_rate": 2e-05, |
| "loss": 0.9768, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.08528784648187633, |
| "grad_norm": 0.20827943086624146, |
| "learning_rate": 2e-05, |
| "loss": 1.0187, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.0906183368869936, |
| "grad_norm": 0.19415774941444397, |
| "learning_rate": 2e-05, |
| "loss": 0.9108, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.09594882729211088, |
| "grad_norm": 0.23162966966629028, |
| "learning_rate": 2e-05, |
| "loss": 0.9096, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.10127931769722814, |
| "grad_norm": 0.30539438128471375, |
| "learning_rate": 2e-05, |
| "loss": 0.9119, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.10660980810234541, |
| "grad_norm": 0.3614450693130493, |
| "learning_rate": 2e-05, |
| "loss": 0.9107, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.11194029850746269, |
| "grad_norm": 0.20569603145122528, |
| "learning_rate": 2e-05, |
| "loss": 0.9069, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.11727078891257996, |
| "grad_norm": 0.1889234036207199, |
| "learning_rate": 2e-05, |
| "loss": 0.867, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.12260127931769722, |
| "grad_norm": 0.26535317301750183, |
| "learning_rate": 2e-05, |
| "loss": 0.9013, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.1279317697228145, |
| "grad_norm": 0.32258930802345276, |
| "learning_rate": 2e-05, |
| "loss": 0.8905, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.13326226012793177, |
| "grad_norm": 0.3419113755226135, |
| "learning_rate": 2e-05, |
| "loss": 0.8779, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.13859275053304904, |
| "grad_norm": 0.23869968950748444, |
| "learning_rate": 2e-05, |
| "loss": 0.894, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.1439232409381663, |
| "grad_norm": 0.2865942418575287, |
| "learning_rate": 2e-05, |
| "loss": 0.8571, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.14925373134328357, |
| "grad_norm": 0.2303633689880371, |
| "learning_rate": 2e-05, |
| "loss": 0.8787, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.15458422174840086, |
| "grad_norm": 0.3257729113101959, |
| "learning_rate": 2e-05, |
| "loss": 0.8533, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.15991471215351813, |
| "grad_norm": 0.34180137515068054, |
| "learning_rate": 2e-05, |
| "loss": 0.8456, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.1652452025586354, |
| "grad_norm": 0.22115589678287506, |
| "learning_rate": 2e-05, |
| "loss": 0.8752, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.17057569296375266, |
| "grad_norm": 0.2092457115650177, |
| "learning_rate": 2e-05, |
| "loss": 0.8318, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.17590618336886993, |
| "grad_norm": 0.24170993268489838, |
| "learning_rate": 2e-05, |
| "loss": 0.8441, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.1812366737739872, |
| "grad_norm": 0.3601376414299011, |
| "learning_rate": 2e-05, |
| "loss": 0.8457, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.1865671641791045, |
| "grad_norm": 0.4388696849346161, |
| "learning_rate": 2e-05, |
| "loss": 0.8473, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.19189765458422176, |
| "grad_norm": 0.25066882371902466, |
| "learning_rate": 2e-05, |
| "loss": 0.8794, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.19722814498933902, |
| "grad_norm": 0.2521555721759796, |
| "learning_rate": 2e-05, |
| "loss": 0.8257, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.2025586353944563, |
| "grad_norm": 0.24965791404247284, |
| "learning_rate": 2e-05, |
| "loss": 0.836, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.20788912579957355, |
| "grad_norm": 0.3060144782066345, |
| "learning_rate": 2e-05, |
| "loss": 0.8342, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.21321961620469082, |
| "grad_norm": 0.29012420773506165, |
| "learning_rate": 2e-05, |
| "loss": 0.8282, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.21855010660980811, |
| "grad_norm": 0.2453121393918991, |
| "learning_rate": 2e-05, |
| "loss": 0.8353, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.22388059701492538, |
| "grad_norm": 0.23359909653663635, |
| "learning_rate": 2e-05, |
| "loss": 0.8301, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.22921108742004265, |
| "grad_norm": 0.2844357192516327, |
| "learning_rate": 2e-05, |
| "loss": 0.8246, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.2345415778251599, |
| "grad_norm": 0.33136624097824097, |
| "learning_rate": 2e-05, |
| "loss": 0.8207, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.23987206823027718, |
| "grad_norm": 0.3795192837715149, |
| "learning_rate": 2e-05, |
| "loss": 0.827, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.24520255863539445, |
| "grad_norm": 0.2862633168697357, |
| "learning_rate": 2e-05, |
| "loss": 0.813, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.2505330490405117, |
| "grad_norm": 0.2595326602458954, |
| "learning_rate": 2e-05, |
| "loss": 0.7736, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.255863539445629, |
| "grad_norm": 0.28593310713768005, |
| "learning_rate": 2e-05, |
| "loss": 0.7976, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.26119402985074625, |
| "grad_norm": 0.3470572531223297, |
| "learning_rate": 2e-05, |
| "loss": 0.8094, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.26652452025586354, |
| "grad_norm": 0.35488802194595337, |
| "learning_rate": 2e-05, |
| "loss": 0.8174, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.27185501066098083, |
| "grad_norm": 0.28033536672592163, |
| "learning_rate": 2e-05, |
| "loss": 0.8377, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.2771855010660981, |
| "grad_norm": 0.23253943026065826, |
| "learning_rate": 2e-05, |
| "loss": 0.7725, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.28251599147121537, |
| "grad_norm": 0.2662801146507263, |
| "learning_rate": 2e-05, |
| "loss": 0.779, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.2878464818763326, |
| "grad_norm": 0.33942538499832153, |
| "learning_rate": 2e-05, |
| "loss": 0.8234, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.2931769722814499, |
| "grad_norm": 0.34840917587280273, |
| "learning_rate": 2e-05, |
| "loss": 0.8095, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.29850746268656714, |
| "grad_norm": 0.28402140736579895, |
| "learning_rate": 2e-05, |
| "loss": 0.8321, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.30383795309168443, |
| "grad_norm": 0.26083698868751526, |
| "learning_rate": 2e-05, |
| "loss": 0.761, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.3091684434968017, |
| "grad_norm": 0.25820574164390564, |
| "learning_rate": 2e-05, |
| "loss": 0.8052, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.31449893390191896, |
| "grad_norm": 0.3496425449848175, |
| "learning_rate": 2e-05, |
| "loss": 0.7961, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.31982942430703626, |
| "grad_norm": 0.36911600828170776, |
| "learning_rate": 2e-05, |
| "loss": 0.8297, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.3251599147121535, |
| "grad_norm": 0.2791857421398163, |
| "learning_rate": 2e-05, |
| "loss": 0.8117, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.3304904051172708, |
| "grad_norm": 0.2672363817691803, |
| "learning_rate": 2e-05, |
| "loss": 0.7098, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.3358208955223881, |
| "grad_norm": 0.2714046239852905, |
| "learning_rate": 2e-05, |
| "loss": 0.7758, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.3411513859275053, |
| "grad_norm": 0.3329773247241974, |
| "learning_rate": 2e-05, |
| "loss": 0.7929, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.3464818763326226, |
| "grad_norm": 0.3871755599975586, |
| "learning_rate": 2e-05, |
| "loss": 0.8303, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.35181236673773986, |
| "grad_norm": 0.2991976737976074, |
| "learning_rate": 2e-05, |
| "loss": 0.8265, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.35714285714285715, |
| "grad_norm": 0.2702910304069519, |
| "learning_rate": 2e-05, |
| "loss": 0.7564, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.3624733475479744, |
| "grad_norm": 0.24136558175086975, |
| "learning_rate": 2e-05, |
| "loss": 0.7688, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.3678038379530917, |
| "grad_norm": 0.3107840418815613, |
| "learning_rate": 2e-05, |
| "loss": 0.7912, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.373134328358209, |
| "grad_norm": 0.3215864598751068, |
| "learning_rate": 2e-05, |
| "loss": 0.8324, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.3784648187633262, |
| "grad_norm": 0.31696921586990356, |
| "learning_rate": 2e-05, |
| "loss": 0.8032, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.3837953091684435, |
| "grad_norm": 0.2812045216560364, |
| "learning_rate": 2e-05, |
| "loss": 0.7391, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.38912579957356075, |
| "grad_norm": 0.2580372393131256, |
| "learning_rate": 2e-05, |
| "loss": 0.7912, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.39445628997867804, |
| "grad_norm": 0.32673174142837524, |
| "learning_rate": 2e-05, |
| "loss": 0.802, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.3997867803837953, |
| "grad_norm": 0.36080583930015564, |
| "learning_rate": 2e-05, |
| "loss": 0.8197, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.4051172707889126, |
| "grad_norm": 0.32174256443977356, |
| "learning_rate": 2e-05, |
| "loss": 0.7906, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.41044776119402987, |
| "grad_norm": 0.288492888212204, |
| "learning_rate": 2e-05, |
| "loss": 0.7471, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.4157782515991471, |
| "grad_norm": 0.2705792188644409, |
| "learning_rate": 2e-05, |
| "loss": 0.7552, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.4211087420042644, |
| "grad_norm": 0.35100996494293213, |
| "learning_rate": 2e-05, |
| "loss": 0.8007, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.42643923240938164, |
| "grad_norm": 0.3444612920284271, |
| "learning_rate": 2e-05, |
| "loss": 0.8095, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.43176972281449894, |
| "grad_norm": 0.3082815706729889, |
| "learning_rate": 2e-05, |
| "loss": 0.7817, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.43710021321961623, |
| "grad_norm": 0.27543190121650696, |
| "learning_rate": 2e-05, |
| "loss": 0.7575, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.44243070362473347, |
| "grad_norm": 0.26944637298583984, |
| "learning_rate": 2e-05, |
| "loss": 0.7727, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.44776119402985076, |
| "grad_norm": 0.3868255019187927, |
| "learning_rate": 2e-05, |
| "loss": 0.779, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.453091684434968, |
| "grad_norm": 0.3912637233734131, |
| "learning_rate": 2e-05, |
| "loss": 0.7755, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.4584221748400853, |
| "grad_norm": 0.29420873522758484, |
| "learning_rate": 2e-05, |
| "loss": 0.7715, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.46375266524520253, |
| "grad_norm": 0.2589830160140991, |
| "learning_rate": 2e-05, |
| "loss": 0.7388, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.4690831556503198, |
| "grad_norm": 0.2840547561645508, |
| "learning_rate": 2e-05, |
| "loss": 0.748, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.4744136460554371, |
| "grad_norm": 0.30651238560676575, |
| "learning_rate": 2e-05, |
| "loss": 0.7478, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.47974413646055436, |
| "grad_norm": 0.36977389454841614, |
| "learning_rate": 2e-05, |
| "loss": 0.8058, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.48507462686567165, |
| "grad_norm": 0.28890499472618103, |
| "learning_rate": 2e-05, |
| "loss": 0.7683, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.4904051172707889, |
| "grad_norm": 0.2896074652671814, |
| "learning_rate": 2e-05, |
| "loss": 0.7317, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.4957356076759062, |
| "grad_norm": 0.3064037263393402, |
| "learning_rate": 2e-05, |
| "loss": 0.7531, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.5010660980810234, |
| "grad_norm": 0.41676151752471924, |
| "learning_rate": 2e-05, |
| "loss": 0.7766, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.5063965884861408, |
| "grad_norm": 0.3909554183483124, |
| "learning_rate": 2e-05, |
| "loss": 0.7852, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.511727078891258, |
| "grad_norm": 0.3161725103855133, |
| "learning_rate": 2e-05, |
| "loss": 0.7755, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.5170575692963753, |
| "grad_norm": 0.2737087607383728, |
| "learning_rate": 2e-05, |
| "loss": 0.7484, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.5223880597014925, |
| "grad_norm": 0.3294726610183716, |
| "learning_rate": 2e-05, |
| "loss": 0.7421, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.5277185501066098, |
| "grad_norm": 0.38085660338401794, |
| "learning_rate": 2e-05, |
| "loss": 0.8126, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.5330490405117271, |
| "grad_norm": 0.3341747522354126, |
| "learning_rate": 2e-05, |
| "loss": 0.7578, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.5383795309168443, |
| "grad_norm": 0.34995269775390625, |
| "learning_rate": 2e-05, |
| "loss": 0.7874, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.5437100213219617, |
| "grad_norm": 0.28949394822120667, |
| "learning_rate": 2e-05, |
| "loss": 0.7031, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.5490405117270789, |
| "grad_norm": 0.5440807342529297, |
| "learning_rate": 2e-05, |
| "loss": 0.7623, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.5543710021321961, |
| "grad_norm": 0.3526187837123871, |
| "learning_rate": 2e-05, |
| "loss": 0.7682, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.5597014925373134, |
| "grad_norm": 0.4043067991733551, |
| "learning_rate": 2e-05, |
| "loss": 0.7551, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.5650319829424307, |
| "grad_norm": 0.30657297372817993, |
| "learning_rate": 2e-05, |
| "loss": 0.7768, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.570362473347548, |
| "grad_norm": 0.27761197090148926, |
| "learning_rate": 2e-05, |
| "loss": 0.7453, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.5756929637526652, |
| "grad_norm": 0.30321934819221497, |
| "learning_rate": 2e-05, |
| "loss": 0.7569, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.5810234541577826, |
| "grad_norm": 0.3802403509616852, |
| "learning_rate": 2e-05, |
| "loss": 0.7485, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.5863539445628998, |
| "grad_norm": 0.4318316876888275, |
| "learning_rate": 2e-05, |
| "loss": 0.7839, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.591684434968017, |
| "grad_norm": 0.31432363390922546, |
| "learning_rate": 2e-05, |
| "loss": 0.7571, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.5970149253731343, |
| "grad_norm": 0.2873448431491852, |
| "learning_rate": 2e-05, |
| "loss": 0.721, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.6023454157782516, |
| "grad_norm": 0.3069778084754944, |
| "learning_rate": 2e-05, |
| "loss": 0.7283, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.6076759061833689, |
| "grad_norm": 0.35373619198799133, |
| "learning_rate": 2e-05, |
| "loss": 0.7622, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.6130063965884861, |
| "grad_norm": 0.35318872332572937, |
| "learning_rate": 2e-05, |
| "loss": 0.7821, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.6183368869936035, |
| "grad_norm": 0.30835840106010437, |
| "learning_rate": 2e-05, |
| "loss": 0.7511, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.6236673773987207, |
| "grad_norm": 0.27320486307144165, |
| "learning_rate": 2e-05, |
| "loss": 0.7308, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.6289978678038379, |
| "grad_norm": 0.3529856503009796, |
| "learning_rate": 2e-05, |
| "loss": 0.7652, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.6343283582089553, |
| "grad_norm": 0.33610275387763977, |
| "learning_rate": 2e-05, |
| "loss": 0.741, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.6396588486140725, |
| "grad_norm": 0.3909617066383362, |
| "learning_rate": 2e-05, |
| "loss": 0.7445, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.6449893390191898, |
| "grad_norm": 0.3135911226272583, |
| "learning_rate": 2e-05, |
| "loss": 0.7638, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.650319829424307, |
| "grad_norm": 0.2903372347354889, |
| "learning_rate": 2e-05, |
| "loss": 0.7371, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.6556503198294243, |
| "grad_norm": 0.3075706958770752, |
| "learning_rate": 2e-05, |
| "loss": 0.7528, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.6609808102345416, |
| "grad_norm": 0.3804391622543335, |
| "learning_rate": 2e-05, |
| "loss": 0.7421, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.6663113006396588, |
| "grad_norm": 0.3764164447784424, |
| "learning_rate": 2e-05, |
| "loss": 0.7793, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.6716417910447762, |
| "grad_norm": 0.3079053461551666, |
| "learning_rate": 2e-05, |
| "loss": 0.7762, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.6769722814498934, |
| "grad_norm": 0.2808702886104584, |
| "learning_rate": 2e-05, |
| "loss": 0.7294, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.6823027718550106, |
| "grad_norm": 0.3023492395877838, |
| "learning_rate": 2e-05, |
| "loss": 0.7608, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.6876332622601279, |
| "grad_norm": 0.3514968752861023, |
| "learning_rate": 2e-05, |
| "loss": 0.7506, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.6929637526652452, |
| "grad_norm": 0.3713417947292328, |
| "learning_rate": 2e-05, |
| "loss": 0.7403, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.6982942430703625, |
| "grad_norm": 0.30928298830986023, |
| "learning_rate": 2e-05, |
| "loss": 0.7598, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.7036247334754797, |
| "grad_norm": 0.30625951290130615, |
| "learning_rate": 2e-05, |
| "loss": 0.7462, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.7089552238805971, |
| "grad_norm": 0.3103967010974884, |
| "learning_rate": 2e-05, |
| "loss": 0.7442, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.7142857142857143, |
| "grad_norm": 0.3370122015476227, |
| "learning_rate": 2e-05, |
| "loss": 0.7794, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.7196162046908315, |
| "grad_norm": 0.39418521523475647, |
| "learning_rate": 2e-05, |
| "loss": 0.7613, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.7249466950959488, |
| "grad_norm": 0.3453653156757355, |
| "learning_rate": 2e-05, |
| "loss": 0.7648, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.7302771855010661, |
| "grad_norm": 0.33969759941101074, |
| "learning_rate": 2e-05, |
| "loss": 0.6901, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.7356076759061834, |
| "grad_norm": 0.29864346981048584, |
| "learning_rate": 2e-05, |
| "loss": 0.735, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.7409381663113006, |
| "grad_norm": 0.417368620634079, |
| "learning_rate": 2e-05, |
| "loss": 0.7609, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.746268656716418, |
| "grad_norm": 0.3895966708660126, |
| "learning_rate": 2e-05, |
| "loss": 0.773, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.7515991471215352, |
| "grad_norm": 0.33772552013397217, |
| "learning_rate": 2e-05, |
| "loss": 0.746, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.7569296375266524, |
| "grad_norm": 0.29536184668540955, |
| "learning_rate": 2e-05, |
| "loss": 0.7263, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.7622601279317697, |
| "grad_norm": 0.2753921449184418, |
| "learning_rate": 2e-05, |
| "loss": 0.7659, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.767590618336887, |
| "grad_norm": 0.34762871265411377, |
| "learning_rate": 2e-05, |
| "loss": 0.7584, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.7729211087420043, |
| "grad_norm": 0.3880026340484619, |
| "learning_rate": 2e-05, |
| "loss": 0.7844, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.7782515991471215, |
| "grad_norm": 0.3299189507961273, |
| "learning_rate": 2e-05, |
| "loss": 0.7381, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.7835820895522388, |
| "grad_norm": 0.30803337693214417, |
| "learning_rate": 2e-05, |
| "loss": 0.7058, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.7889125799573561, |
| "grad_norm": 0.3036564290523529, |
| "learning_rate": 2e-05, |
| "loss": 0.7196, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.7942430703624733, |
| "grad_norm": 0.40265294909477234, |
| "learning_rate": 2e-05, |
| "loss": 0.7381, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.7995735607675906, |
| "grad_norm": 0.4096594452857971, |
| "learning_rate": 2e-05, |
| "loss": 0.7541, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.8049040511727079, |
| "grad_norm": 0.36740148067474365, |
| "learning_rate": 2e-05, |
| "loss": 0.7608, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.8102345415778252, |
| "grad_norm": 0.32650887966156006, |
| "learning_rate": 2e-05, |
| "loss": 0.7351, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.8155650319829424, |
| "grad_norm": 0.3586374521255493, |
| "learning_rate": 2e-05, |
| "loss": 0.7604, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.8208955223880597, |
| "grad_norm": 0.3661261796951294, |
| "learning_rate": 2e-05, |
| "loss": 0.7452, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.826226012793177, |
| "grad_norm": 0.35194748640060425, |
| "learning_rate": 2e-05, |
| "loss": 0.7327, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.8315565031982942, |
| "grad_norm": 0.3226638734340668, |
| "learning_rate": 2e-05, |
| "loss": 0.758, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.8368869936034116, |
| "grad_norm": 0.3084862530231476, |
| "learning_rate": 2e-05, |
| "loss": 0.7265, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.8422174840085288, |
| "grad_norm": 0.3141063451766968, |
| "learning_rate": 2e-05, |
| "loss": 0.7373, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.847547974413646, |
| "grad_norm": 0.48754021525382996, |
| "learning_rate": 2e-05, |
| "loss": 0.7344, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.8528784648187633, |
| "grad_norm": 0.36847469210624695, |
| "learning_rate": 2e-05, |
| "loss": 0.742, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.8582089552238806, |
| "grad_norm": 0.31983086466789246, |
| "learning_rate": 2e-05, |
| "loss": 0.7499, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.8635394456289979, |
| "grad_norm": 0.2929735481739044, |
| "learning_rate": 2e-05, |
| "loss": 0.7025, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.8688699360341151, |
| "grad_norm": 0.33163875341415405, |
| "learning_rate": 2e-05, |
| "loss": 0.7158, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.8742004264392325, |
| "grad_norm": 0.3640444874763489, |
| "learning_rate": 2e-05, |
| "loss": 0.7089, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.8795309168443497, |
| "grad_norm": 0.37700313329696655, |
| "learning_rate": 2e-05, |
| "loss": 0.7389, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.8848614072494669, |
| "grad_norm": 0.3186069130897522, |
| "learning_rate": 2e-05, |
| "loss": 0.721, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.8901918976545842, |
| "grad_norm": 0.30727624893188477, |
| "learning_rate": 2e-05, |
| "loss": 0.719, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.8955223880597015, |
| "grad_norm": 0.3146567642688751, |
| "learning_rate": 2e-05, |
| "loss": 0.7147, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.9008528784648188, |
| "grad_norm": 0.3437662720680237, |
| "learning_rate": 2e-05, |
| "loss": 0.7312, |
| "step": 845 |
| }, |
| { |
| "epoch": 0.906183368869936, |
| "grad_norm": 0.3754054009914398, |
| "learning_rate": 2e-05, |
| "loss": 0.743, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.9115138592750534, |
| "grad_norm": 0.30614492297172546, |
| "learning_rate": 2e-05, |
| "loss": 0.7415, |
| "step": 855 |
| }, |
| { |
| "epoch": 0.9168443496801706, |
| "grad_norm": 0.293458491563797, |
| "learning_rate": 2e-05, |
| "loss": 0.7, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.9221748400852878, |
| "grad_norm": 0.2879101037979126, |
| "learning_rate": 2e-05, |
| "loss": 0.7572, |
| "step": 865 |
| }, |
| { |
| "epoch": 0.9275053304904051, |
| "grad_norm": 0.30240631103515625, |
| "learning_rate": 2e-05, |
| "loss": 0.736, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.9328358208955224, |
| "grad_norm": 0.40850868821144104, |
| "learning_rate": 2e-05, |
| "loss": 0.744, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.9381663113006397, |
| "grad_norm": 0.3220716416835785, |
| "learning_rate": 2e-05, |
| "loss": 0.7444, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.9434968017057569, |
| "grad_norm": 0.3097483813762665, |
| "learning_rate": 2e-05, |
| "loss": 0.7366, |
| "step": 885 |
| }, |
| { |
| "epoch": 0.9488272921108742, |
| "grad_norm": 0.3266868591308594, |
| "learning_rate": 2e-05, |
| "loss": 0.7318, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.9541577825159915, |
| "grad_norm": 0.33244094252586365, |
| "learning_rate": 2e-05, |
| "loss": 0.7311, |
| "step": 895 |
| }, |
| { |
| "epoch": 0.9594882729211087, |
| "grad_norm": 0.37434569001197815, |
| "learning_rate": 2e-05, |
| "loss": 0.7468, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.964818763326226, |
| "grad_norm": 0.33542153239250183, |
| "learning_rate": 2e-05, |
| "loss": 0.7464, |
| "step": 905 |
| }, |
| { |
| "epoch": 0.9701492537313433, |
| "grad_norm": 0.3000001907348633, |
| "learning_rate": 2e-05, |
| "loss": 0.6872, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.9754797441364605, |
| "grad_norm": 0.3260563313961029, |
| "learning_rate": 2e-05, |
| "loss": 0.7265, |
| "step": 915 |
| }, |
| { |
| "epoch": 0.9808102345415778, |
| "grad_norm": 0.33576592803001404, |
| "learning_rate": 2e-05, |
| "loss": 0.7254, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.9861407249466951, |
| "grad_norm": 0.41705191135406494, |
| "learning_rate": 2e-05, |
| "loss": 0.734, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.9914712153518124, |
| "grad_norm": 0.3626074194908142, |
| "learning_rate": 2e-05, |
| "loss": 0.749, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.9968017057569296, |
| "grad_norm": 0.3002621531486511, |
| "learning_rate": 2e-05, |
| "loss": 0.7157, |
| "step": 935 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 938, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 99999, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.798520507990016e+18, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|