{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 938, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005330490405117271, "grad_norm": 0.3734353482723236, "learning_rate": 2e-05, "loss": 1.7104, "step": 5 }, { "epoch": 0.010660980810234541, "grad_norm": 0.2635837197303772, "learning_rate": 2e-05, "loss": 1.6538, "step": 10 }, { "epoch": 0.015991471215351813, "grad_norm": 0.19964125752449036, "learning_rate": 2e-05, "loss": 1.4818, "step": 15 }, { "epoch": 0.021321961620469083, "grad_norm": 0.2099446952342987, "learning_rate": 2e-05, "loss": 1.3886, "step": 20 }, { "epoch": 0.026652452025586353, "grad_norm": 0.34373781085014343, "learning_rate": 2e-05, "loss": 1.3364, "step": 25 }, { "epoch": 0.031982942430703626, "grad_norm": 0.18734169006347656, "learning_rate": 2e-05, "loss": 1.2899, "step": 30 }, { "epoch": 0.03731343283582089, "grad_norm": 0.16565224528312683, "learning_rate": 2e-05, "loss": 1.2788, "step": 35 }, { "epoch": 0.042643923240938165, "grad_norm": 0.20943662524223328, "learning_rate": 2e-05, "loss": 1.2335, "step": 40 }, { "epoch": 0.04797441364605544, "grad_norm": 0.3209005892276764, "learning_rate": 2e-05, "loss": 1.1854, "step": 45 }, { "epoch": 0.053304904051172705, "grad_norm": 0.3170112669467926, "learning_rate": 2e-05, "loss": 1.0846, "step": 50 }, { "epoch": 0.05863539445628998, "grad_norm": 0.19068752229213715, "learning_rate": 2e-05, "loss": 1.0976, "step": 55 }, { "epoch": 0.06396588486140725, "grad_norm": 0.19504626095294952, "learning_rate": 2e-05, "loss": 1.0597, "step": 60 }, { "epoch": 0.06929637526652452, "grad_norm": 0.2150755077600479, "learning_rate": 2e-05, "loss": 1.0369, "step": 65 }, { "epoch": 0.07462686567164178, "grad_norm": 0.3036380410194397, "learning_rate": 2e-05, "loss": 0.992, "step": 70 }, { "epoch": 0.07995735607675906, "grad_norm": 0.3687366545200348, "learning_rate": 2e-05, "loss": 0.9768, "step": 75 }, { "epoch": 0.08528784648187633, "grad_norm": 0.20827943086624146, "learning_rate": 2e-05, "loss": 1.0187, "step": 80 }, { "epoch": 0.0906183368869936, "grad_norm": 0.19415774941444397, "learning_rate": 2e-05, "loss": 0.9108, "step": 85 }, { "epoch": 0.09594882729211088, "grad_norm": 0.23162966966629028, "learning_rate": 2e-05, "loss": 0.9096, "step": 90 }, { "epoch": 0.10127931769722814, "grad_norm": 0.30539438128471375, "learning_rate": 2e-05, "loss": 0.9119, "step": 95 }, { "epoch": 0.10660980810234541, "grad_norm": 0.3614450693130493, "learning_rate": 2e-05, "loss": 0.9107, "step": 100 }, { "epoch": 0.11194029850746269, "grad_norm": 0.20569603145122528, "learning_rate": 2e-05, "loss": 0.9069, "step": 105 }, { "epoch": 0.11727078891257996, "grad_norm": 0.1889234036207199, "learning_rate": 2e-05, "loss": 0.867, "step": 110 }, { "epoch": 0.12260127931769722, "grad_norm": 0.26535317301750183, "learning_rate": 2e-05, "loss": 0.9013, "step": 115 }, { "epoch": 0.1279317697228145, "grad_norm": 0.32258930802345276, "learning_rate": 2e-05, "loss": 0.8905, "step": 120 }, { "epoch": 0.13326226012793177, "grad_norm": 0.3419113755226135, "learning_rate": 2e-05, "loss": 0.8779, "step": 125 }, { "epoch": 0.13859275053304904, "grad_norm": 0.23869968950748444, "learning_rate": 2e-05, "loss": 0.894, "step": 130 }, { "epoch": 0.1439232409381663, "grad_norm": 0.2865942418575287, "learning_rate": 2e-05, "loss": 0.8571, "step": 135 }, { "epoch": 0.14925373134328357, "grad_norm": 0.2303633689880371, "learning_rate": 2e-05, "loss": 0.8787, "step": 140 }, { "epoch": 0.15458422174840086, "grad_norm": 0.3257729113101959, "learning_rate": 2e-05, "loss": 0.8533, "step": 145 }, { "epoch": 0.15991471215351813, "grad_norm": 0.34180137515068054, "learning_rate": 2e-05, "loss": 0.8456, "step": 150 }, { "epoch": 0.1652452025586354, "grad_norm": 0.22115589678287506, "learning_rate": 2e-05, "loss": 0.8752, "step": 155 }, { "epoch": 0.17057569296375266, "grad_norm": 0.2092457115650177, "learning_rate": 2e-05, "loss": 0.8318, "step": 160 }, { "epoch": 0.17590618336886993, "grad_norm": 0.24170993268489838, "learning_rate": 2e-05, "loss": 0.8441, "step": 165 }, { "epoch": 0.1812366737739872, "grad_norm": 0.3601376414299011, "learning_rate": 2e-05, "loss": 0.8457, "step": 170 }, { "epoch": 0.1865671641791045, "grad_norm": 0.4388696849346161, "learning_rate": 2e-05, "loss": 0.8473, "step": 175 }, { "epoch": 0.19189765458422176, "grad_norm": 0.25066882371902466, "learning_rate": 2e-05, "loss": 0.8794, "step": 180 }, { "epoch": 0.19722814498933902, "grad_norm": 0.2521555721759796, "learning_rate": 2e-05, "loss": 0.8257, "step": 185 }, { "epoch": 0.2025586353944563, "grad_norm": 0.24965791404247284, "learning_rate": 2e-05, "loss": 0.836, "step": 190 }, { "epoch": 0.20788912579957355, "grad_norm": 0.3060144782066345, "learning_rate": 2e-05, "loss": 0.8342, "step": 195 }, { "epoch": 0.21321961620469082, "grad_norm": 0.29012420773506165, "learning_rate": 2e-05, "loss": 0.8282, "step": 200 }, { "epoch": 0.21855010660980811, "grad_norm": 0.2453121393918991, "learning_rate": 2e-05, "loss": 0.8353, "step": 205 }, { "epoch": 0.22388059701492538, "grad_norm": 0.23359909653663635, "learning_rate": 2e-05, "loss": 0.8301, "step": 210 }, { "epoch": 0.22921108742004265, "grad_norm": 0.2844357192516327, "learning_rate": 2e-05, "loss": 0.8246, "step": 215 }, { "epoch": 0.2345415778251599, "grad_norm": 0.33136624097824097, "learning_rate": 2e-05, "loss": 0.8207, "step": 220 }, { "epoch": 0.23987206823027718, "grad_norm": 0.3795192837715149, "learning_rate": 2e-05, "loss": 0.827, "step": 225 }, { "epoch": 0.24520255863539445, "grad_norm": 0.2862633168697357, "learning_rate": 2e-05, "loss": 0.813, "step": 230 }, { "epoch": 0.2505330490405117, "grad_norm": 0.2595326602458954, "learning_rate": 2e-05, "loss": 0.7736, "step": 235 }, { "epoch": 0.255863539445629, "grad_norm": 0.28593310713768005, "learning_rate": 2e-05, "loss": 0.7976, "step": 240 }, { "epoch": 0.26119402985074625, "grad_norm": 0.3470572531223297, "learning_rate": 2e-05, "loss": 0.8094, "step": 245 }, { "epoch": 0.26652452025586354, "grad_norm": 0.35488802194595337, "learning_rate": 2e-05, "loss": 0.8174, "step": 250 }, { "epoch": 0.27185501066098083, "grad_norm": 0.28033536672592163, "learning_rate": 2e-05, "loss": 0.8377, "step": 255 }, { "epoch": 0.2771855010660981, "grad_norm": 0.23253943026065826, "learning_rate": 2e-05, "loss": 0.7725, "step": 260 }, { "epoch": 0.28251599147121537, "grad_norm": 0.2662801146507263, "learning_rate": 2e-05, "loss": 0.779, "step": 265 }, { "epoch": 0.2878464818763326, "grad_norm": 0.33942538499832153, "learning_rate": 2e-05, "loss": 0.8234, "step": 270 }, { "epoch": 0.2931769722814499, "grad_norm": 0.34840917587280273, "learning_rate": 2e-05, "loss": 0.8095, "step": 275 }, { "epoch": 0.29850746268656714, "grad_norm": 0.28402140736579895, "learning_rate": 2e-05, "loss": 0.8321, "step": 280 }, { "epoch": 0.30383795309168443, "grad_norm": 0.26083698868751526, "learning_rate": 2e-05, "loss": 0.761, "step": 285 }, { "epoch": 0.3091684434968017, "grad_norm": 0.25820574164390564, "learning_rate": 2e-05, "loss": 0.8052, "step": 290 }, { "epoch": 0.31449893390191896, "grad_norm": 0.3496425449848175, "learning_rate": 2e-05, "loss": 0.7961, "step": 295 }, { "epoch": 0.31982942430703626, "grad_norm": 0.36911600828170776, "learning_rate": 2e-05, "loss": 0.8297, "step": 300 }, { "epoch": 0.3251599147121535, "grad_norm": 0.2791857421398163, "learning_rate": 2e-05, "loss": 0.8117, "step": 305 }, { "epoch": 0.3304904051172708, "grad_norm": 0.2672363817691803, "learning_rate": 2e-05, "loss": 0.7098, "step": 310 }, { "epoch": 0.3358208955223881, "grad_norm": 0.2714046239852905, "learning_rate": 2e-05, "loss": 0.7758, "step": 315 }, { "epoch": 0.3411513859275053, "grad_norm": 0.3329773247241974, "learning_rate": 2e-05, "loss": 0.7929, "step": 320 }, { "epoch": 0.3464818763326226, "grad_norm": 0.3871755599975586, "learning_rate": 2e-05, "loss": 0.8303, "step": 325 }, { "epoch": 0.35181236673773986, "grad_norm": 0.2991976737976074, "learning_rate": 2e-05, "loss": 0.8265, "step": 330 }, { "epoch": 0.35714285714285715, "grad_norm": 0.2702910304069519, "learning_rate": 2e-05, "loss": 0.7564, "step": 335 }, { "epoch": 0.3624733475479744, "grad_norm": 0.24136558175086975, "learning_rate": 2e-05, "loss": 0.7688, "step": 340 }, { "epoch": 0.3678038379530917, "grad_norm": 0.3107840418815613, "learning_rate": 2e-05, "loss": 0.7912, "step": 345 }, { "epoch": 0.373134328358209, "grad_norm": 0.3215864598751068, "learning_rate": 2e-05, "loss": 0.8324, "step": 350 }, { "epoch": 0.3784648187633262, "grad_norm": 0.31696921586990356, "learning_rate": 2e-05, "loss": 0.8032, "step": 355 }, { "epoch": 0.3837953091684435, "grad_norm": 0.2812045216560364, "learning_rate": 2e-05, "loss": 0.7391, "step": 360 }, { "epoch": 0.38912579957356075, "grad_norm": 0.2580372393131256, "learning_rate": 2e-05, "loss": 0.7912, "step": 365 }, { "epoch": 0.39445628997867804, "grad_norm": 0.32673174142837524, "learning_rate": 2e-05, "loss": 0.802, "step": 370 }, { "epoch": 0.3997867803837953, "grad_norm": 0.36080583930015564, "learning_rate": 2e-05, "loss": 0.8197, "step": 375 }, { "epoch": 0.4051172707889126, "grad_norm": 0.32174256443977356, "learning_rate": 2e-05, "loss": 0.7906, "step": 380 }, { "epoch": 0.41044776119402987, "grad_norm": 0.288492888212204, "learning_rate": 2e-05, "loss": 0.7471, "step": 385 }, { "epoch": 0.4157782515991471, "grad_norm": 0.2705792188644409, "learning_rate": 2e-05, "loss": 0.7552, "step": 390 }, { "epoch": 0.4211087420042644, "grad_norm": 0.35100996494293213, "learning_rate": 2e-05, "loss": 0.8007, "step": 395 }, { "epoch": 0.42643923240938164, "grad_norm": 0.3444612920284271, "learning_rate": 2e-05, "loss": 0.8095, "step": 400 }, { "epoch": 0.43176972281449894, "grad_norm": 0.3082815706729889, "learning_rate": 2e-05, "loss": 0.7817, "step": 405 }, { "epoch": 0.43710021321961623, "grad_norm": 0.27543190121650696, "learning_rate": 2e-05, "loss": 0.7575, "step": 410 }, { "epoch": 0.44243070362473347, "grad_norm": 0.26944637298583984, "learning_rate": 2e-05, "loss": 0.7727, "step": 415 }, { "epoch": 0.44776119402985076, "grad_norm": 0.3868255019187927, "learning_rate": 2e-05, "loss": 0.779, "step": 420 }, { "epoch": 0.453091684434968, "grad_norm": 0.3912637233734131, "learning_rate": 2e-05, "loss": 0.7755, "step": 425 }, { "epoch": 0.4584221748400853, "grad_norm": 0.29420873522758484, "learning_rate": 2e-05, "loss": 0.7715, "step": 430 }, { "epoch": 0.46375266524520253, "grad_norm": 0.2589830160140991, "learning_rate": 2e-05, "loss": 0.7388, "step": 435 }, { "epoch": 0.4690831556503198, "grad_norm": 0.2840547561645508, "learning_rate": 2e-05, "loss": 0.748, "step": 440 }, { "epoch": 0.4744136460554371, "grad_norm": 0.30651238560676575, "learning_rate": 2e-05, "loss": 0.7478, "step": 445 }, { "epoch": 0.47974413646055436, "grad_norm": 0.36977389454841614, "learning_rate": 2e-05, "loss": 0.8058, "step": 450 }, { "epoch": 0.48507462686567165, "grad_norm": 0.28890499472618103, "learning_rate": 2e-05, "loss": 0.7683, "step": 455 }, { "epoch": 0.4904051172707889, "grad_norm": 0.2896074652671814, "learning_rate": 2e-05, "loss": 0.7317, "step": 460 }, { "epoch": 0.4957356076759062, "grad_norm": 0.3064037263393402, "learning_rate": 2e-05, "loss": 0.7531, "step": 465 }, { "epoch": 0.5010660980810234, "grad_norm": 0.41676151752471924, "learning_rate": 2e-05, "loss": 0.7766, "step": 470 }, { "epoch": 0.5063965884861408, "grad_norm": 0.3909554183483124, "learning_rate": 2e-05, "loss": 0.7852, "step": 475 }, { "epoch": 0.511727078891258, "grad_norm": 0.3161725103855133, "learning_rate": 2e-05, "loss": 0.7755, "step": 480 }, { "epoch": 0.5170575692963753, "grad_norm": 0.2737087607383728, "learning_rate": 2e-05, "loss": 0.7484, "step": 485 }, { "epoch": 0.5223880597014925, "grad_norm": 0.3294726610183716, "learning_rate": 2e-05, "loss": 0.7421, "step": 490 }, { "epoch": 0.5277185501066098, "grad_norm": 0.38085660338401794, "learning_rate": 2e-05, "loss": 0.8126, "step": 495 }, { "epoch": 0.5330490405117271, "grad_norm": 0.3341747522354126, "learning_rate": 2e-05, "loss": 0.7578, "step": 500 }, { "epoch": 0.5383795309168443, "grad_norm": 0.34995269775390625, "learning_rate": 2e-05, "loss": 0.7874, "step": 505 }, { "epoch": 0.5437100213219617, "grad_norm": 0.28949394822120667, "learning_rate": 2e-05, "loss": 0.7031, "step": 510 }, { "epoch": 0.5490405117270789, "grad_norm": 0.5440807342529297, "learning_rate": 2e-05, "loss": 0.7623, "step": 515 }, { "epoch": 0.5543710021321961, "grad_norm": 0.3526187837123871, "learning_rate": 2e-05, "loss": 0.7682, "step": 520 }, { "epoch": 0.5597014925373134, "grad_norm": 0.4043067991733551, "learning_rate": 2e-05, "loss": 0.7551, "step": 525 }, { "epoch": 0.5650319829424307, "grad_norm": 0.30657297372817993, "learning_rate": 2e-05, "loss": 0.7768, "step": 530 }, { "epoch": 0.570362473347548, "grad_norm": 0.27761197090148926, "learning_rate": 2e-05, "loss": 0.7453, "step": 535 }, { "epoch": 0.5756929637526652, "grad_norm": 0.30321934819221497, "learning_rate": 2e-05, "loss": 0.7569, "step": 540 }, { "epoch": 0.5810234541577826, "grad_norm": 0.3802403509616852, "learning_rate": 2e-05, "loss": 0.7485, "step": 545 }, { "epoch": 0.5863539445628998, "grad_norm": 0.4318316876888275, "learning_rate": 2e-05, "loss": 0.7839, "step": 550 }, { "epoch": 0.591684434968017, "grad_norm": 0.31432363390922546, "learning_rate": 2e-05, "loss": 0.7571, "step": 555 }, { "epoch": 0.5970149253731343, "grad_norm": 0.2873448431491852, "learning_rate": 2e-05, "loss": 0.721, "step": 560 }, { "epoch": 0.6023454157782516, "grad_norm": 0.3069778084754944, "learning_rate": 2e-05, "loss": 0.7283, "step": 565 }, { "epoch": 0.6076759061833689, "grad_norm": 0.35373619198799133, "learning_rate": 2e-05, "loss": 0.7622, "step": 570 }, { "epoch": 0.6130063965884861, "grad_norm": 0.35318872332572937, "learning_rate": 2e-05, "loss": 0.7821, "step": 575 }, { "epoch": 0.6183368869936035, "grad_norm": 0.30835840106010437, "learning_rate": 2e-05, "loss": 0.7511, "step": 580 }, { "epoch": 0.6236673773987207, "grad_norm": 0.27320486307144165, "learning_rate": 2e-05, "loss": 0.7308, "step": 585 }, { "epoch": 0.6289978678038379, "grad_norm": 0.3529856503009796, "learning_rate": 2e-05, "loss": 0.7652, "step": 590 }, { "epoch": 0.6343283582089553, "grad_norm": 0.33610275387763977, "learning_rate": 2e-05, "loss": 0.741, "step": 595 }, { "epoch": 0.6396588486140725, "grad_norm": 0.3909617066383362, "learning_rate": 2e-05, "loss": 0.7445, "step": 600 }, { "epoch": 0.6449893390191898, "grad_norm": 0.3135911226272583, "learning_rate": 2e-05, "loss": 0.7638, "step": 605 }, { "epoch": 0.650319829424307, "grad_norm": 0.2903372347354889, "learning_rate": 2e-05, "loss": 0.7371, "step": 610 }, { "epoch": 0.6556503198294243, "grad_norm": 0.3075706958770752, "learning_rate": 2e-05, "loss": 0.7528, "step": 615 }, { "epoch": 0.6609808102345416, "grad_norm": 0.3804391622543335, "learning_rate": 2e-05, "loss": 0.7421, "step": 620 }, { "epoch": 0.6663113006396588, "grad_norm": 0.3764164447784424, "learning_rate": 2e-05, "loss": 0.7793, "step": 625 }, { "epoch": 0.6716417910447762, "grad_norm": 0.3079053461551666, "learning_rate": 2e-05, "loss": 0.7762, "step": 630 }, { "epoch": 0.6769722814498934, "grad_norm": 0.2808702886104584, "learning_rate": 2e-05, "loss": 0.7294, "step": 635 }, { "epoch": 0.6823027718550106, "grad_norm": 0.3023492395877838, "learning_rate": 2e-05, "loss": 0.7608, "step": 640 }, { "epoch": 0.6876332622601279, "grad_norm": 0.3514968752861023, "learning_rate": 2e-05, "loss": 0.7506, "step": 645 }, { "epoch": 0.6929637526652452, "grad_norm": 0.3713417947292328, "learning_rate": 2e-05, "loss": 0.7403, "step": 650 }, { "epoch": 0.6982942430703625, "grad_norm": 0.30928298830986023, "learning_rate": 2e-05, "loss": 0.7598, "step": 655 }, { "epoch": 0.7036247334754797, "grad_norm": 0.30625951290130615, "learning_rate": 2e-05, "loss": 0.7462, "step": 660 }, { "epoch": 0.7089552238805971, "grad_norm": 0.3103967010974884, "learning_rate": 2e-05, "loss": 0.7442, "step": 665 }, { "epoch": 0.7142857142857143, "grad_norm": 0.3370122015476227, "learning_rate": 2e-05, "loss": 0.7794, "step": 670 }, { "epoch": 0.7196162046908315, "grad_norm": 0.39418521523475647, "learning_rate": 2e-05, "loss": 0.7613, "step": 675 }, { "epoch": 0.7249466950959488, "grad_norm": 0.3453653156757355, "learning_rate": 2e-05, "loss": 0.7648, "step": 680 }, { "epoch": 0.7302771855010661, "grad_norm": 0.33969759941101074, "learning_rate": 2e-05, "loss": 0.6901, "step": 685 }, { "epoch": 0.7356076759061834, "grad_norm": 0.29864346981048584, "learning_rate": 2e-05, "loss": 0.735, "step": 690 }, { "epoch": 0.7409381663113006, "grad_norm": 0.417368620634079, "learning_rate": 2e-05, "loss": 0.7609, "step": 695 }, { "epoch": 0.746268656716418, "grad_norm": 0.3895966708660126, "learning_rate": 2e-05, "loss": 0.773, "step": 700 }, { "epoch": 0.7515991471215352, "grad_norm": 0.33772552013397217, "learning_rate": 2e-05, "loss": 0.746, "step": 705 }, { "epoch": 0.7569296375266524, "grad_norm": 0.29536184668540955, "learning_rate": 2e-05, "loss": 0.7263, "step": 710 }, { "epoch": 0.7622601279317697, "grad_norm": 0.2753921449184418, "learning_rate": 2e-05, "loss": 0.7659, "step": 715 }, { "epoch": 0.767590618336887, "grad_norm": 0.34762871265411377, "learning_rate": 2e-05, "loss": 0.7584, "step": 720 }, { "epoch": 0.7729211087420043, "grad_norm": 0.3880026340484619, "learning_rate": 2e-05, "loss": 0.7844, "step": 725 }, { "epoch": 0.7782515991471215, "grad_norm": 0.3299189507961273, "learning_rate": 2e-05, "loss": 0.7381, "step": 730 }, { "epoch": 0.7835820895522388, "grad_norm": 0.30803337693214417, "learning_rate": 2e-05, "loss": 0.7058, "step": 735 }, { "epoch": 0.7889125799573561, "grad_norm": 0.3036564290523529, "learning_rate": 2e-05, "loss": 0.7196, "step": 740 }, { "epoch": 0.7942430703624733, "grad_norm": 0.40265294909477234, "learning_rate": 2e-05, "loss": 0.7381, "step": 745 }, { "epoch": 0.7995735607675906, "grad_norm": 0.4096594452857971, "learning_rate": 2e-05, "loss": 0.7541, "step": 750 }, { "epoch": 0.8049040511727079, "grad_norm": 0.36740148067474365, "learning_rate": 2e-05, "loss": 0.7608, "step": 755 }, { "epoch": 0.8102345415778252, "grad_norm": 0.32650887966156006, "learning_rate": 2e-05, "loss": 0.7351, "step": 760 }, { "epoch": 0.8155650319829424, "grad_norm": 0.3586374521255493, "learning_rate": 2e-05, "loss": 0.7604, "step": 765 }, { "epoch": 0.8208955223880597, "grad_norm": 0.3661261796951294, "learning_rate": 2e-05, "loss": 0.7452, "step": 770 }, { "epoch": 0.826226012793177, "grad_norm": 0.35194748640060425, "learning_rate": 2e-05, "loss": 0.7327, "step": 775 }, { "epoch": 0.8315565031982942, "grad_norm": 0.3226638734340668, "learning_rate": 2e-05, "loss": 0.758, "step": 780 }, { "epoch": 0.8368869936034116, "grad_norm": 0.3084862530231476, "learning_rate": 2e-05, "loss": 0.7265, "step": 785 }, { "epoch": 0.8422174840085288, "grad_norm": 0.3141063451766968, "learning_rate": 2e-05, "loss": 0.7373, "step": 790 }, { "epoch": 0.847547974413646, "grad_norm": 0.48754021525382996, "learning_rate": 2e-05, "loss": 0.7344, "step": 795 }, { "epoch": 0.8528784648187633, "grad_norm": 0.36847469210624695, "learning_rate": 2e-05, "loss": 0.742, "step": 800 }, { "epoch": 0.8582089552238806, "grad_norm": 0.31983086466789246, "learning_rate": 2e-05, "loss": 0.7499, "step": 805 }, { "epoch": 0.8635394456289979, "grad_norm": 0.2929735481739044, "learning_rate": 2e-05, "loss": 0.7025, "step": 810 }, { "epoch": 0.8688699360341151, "grad_norm": 0.33163875341415405, "learning_rate": 2e-05, "loss": 0.7158, "step": 815 }, { "epoch": 0.8742004264392325, "grad_norm": 0.3640444874763489, "learning_rate": 2e-05, "loss": 0.7089, "step": 820 }, { "epoch": 0.8795309168443497, "grad_norm": 0.37700313329696655, "learning_rate": 2e-05, "loss": 0.7389, "step": 825 }, { "epoch": 0.8848614072494669, "grad_norm": 0.3186069130897522, "learning_rate": 2e-05, "loss": 0.721, "step": 830 }, { "epoch": 0.8901918976545842, "grad_norm": 0.30727624893188477, "learning_rate": 2e-05, "loss": 0.719, "step": 835 }, { "epoch": 0.8955223880597015, "grad_norm": 0.3146567642688751, "learning_rate": 2e-05, "loss": 0.7147, "step": 840 }, { "epoch": 0.9008528784648188, "grad_norm": 0.3437662720680237, "learning_rate": 2e-05, "loss": 0.7312, "step": 845 }, { "epoch": 0.906183368869936, "grad_norm": 0.3754054009914398, "learning_rate": 2e-05, "loss": 0.743, "step": 850 }, { "epoch": 0.9115138592750534, "grad_norm": 0.30614492297172546, "learning_rate": 2e-05, "loss": 0.7415, "step": 855 }, { "epoch": 0.9168443496801706, "grad_norm": 0.293458491563797, "learning_rate": 2e-05, "loss": 0.7, "step": 860 }, { "epoch": 0.9221748400852878, "grad_norm": 0.2879101037979126, "learning_rate": 2e-05, "loss": 0.7572, "step": 865 }, { "epoch": 0.9275053304904051, "grad_norm": 0.30240631103515625, "learning_rate": 2e-05, "loss": 0.736, "step": 870 }, { "epoch": 0.9328358208955224, "grad_norm": 0.40850868821144104, "learning_rate": 2e-05, "loss": 0.744, "step": 875 }, { "epoch": 0.9381663113006397, "grad_norm": 0.3220716416835785, "learning_rate": 2e-05, "loss": 0.7444, "step": 880 }, { "epoch": 0.9434968017057569, "grad_norm": 0.3097483813762665, "learning_rate": 2e-05, "loss": 0.7366, "step": 885 }, { "epoch": 0.9488272921108742, "grad_norm": 0.3266868591308594, "learning_rate": 2e-05, "loss": 0.7318, "step": 890 }, { "epoch": 0.9541577825159915, "grad_norm": 0.33244094252586365, "learning_rate": 2e-05, "loss": 0.7311, "step": 895 }, { "epoch": 0.9594882729211087, "grad_norm": 0.37434569001197815, "learning_rate": 2e-05, "loss": 0.7468, "step": 900 }, { "epoch": 0.964818763326226, "grad_norm": 0.33542153239250183, "learning_rate": 2e-05, "loss": 0.7464, "step": 905 }, { "epoch": 0.9701492537313433, "grad_norm": 0.3000001907348633, "learning_rate": 2e-05, "loss": 0.6872, "step": 910 }, { "epoch": 0.9754797441364605, "grad_norm": 0.3260563313961029, "learning_rate": 2e-05, "loss": 0.7265, "step": 915 }, { "epoch": 0.9808102345415778, "grad_norm": 0.33576592803001404, "learning_rate": 2e-05, "loss": 0.7254, "step": 920 }, { "epoch": 0.9861407249466951, "grad_norm": 0.41705191135406494, "learning_rate": 2e-05, "loss": 0.734, "step": 925 }, { "epoch": 0.9914712153518124, "grad_norm": 0.3626074194908142, "learning_rate": 2e-05, "loss": 0.749, "step": 930 }, { "epoch": 0.9968017057569296, "grad_norm": 0.3002621531486511, "learning_rate": 2e-05, "loss": 0.7157, "step": 935 } ], "logging_steps": 5, "max_steps": 938, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 99999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.798520507990016e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }