{ "best_global_step": 2000, "best_metric": 0.6596935206968907, "best_model_checkpoint": "/workspace/output/resnet50/checkpoint-2000", "epoch": 0.28388928317955997, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014194464158978, "grad_norm": 3.2342276573181152, "learning_rate": 9.999872249822569e-05, "loss": 5.98863525390625, "step": 10 }, { "epoch": 0.0028388928317956, "grad_norm": 3.3994972705841064, "learning_rate": 9.99973030518098e-05, "loss": 5.97633056640625, "step": 20 }, { "epoch": 0.0042583392476933995, "grad_norm": 3.3180341720581055, "learning_rate": 9.99958836053939e-05, "loss": 5.97711181640625, "step": 30 }, { "epoch": 0.0056777856635912, "grad_norm": 2.9379143714904785, "learning_rate": 9.999446415897801e-05, "loss": 5.9991455078125, "step": 40 }, { "epoch": 0.007097232079488999, "grad_norm": 2.2698018550872803, "learning_rate": 9.99930447125621e-05, "loss": 5.96363525390625, "step": 50 }, { "epoch": 0.008516678495386799, "grad_norm": 2.0626659393310547, "learning_rate": 9.99916252661462e-05, "loss": 5.96995849609375, "step": 60 }, { "epoch": 0.0099361249112846, "grad_norm": 2.814460277557373, "learning_rate": 9.999020581973031e-05, "loss": 5.9493408203125, "step": 70 }, { "epoch": 0.0113555713271824, "grad_norm": 2.871051788330078, "learning_rate": 9.998878637331441e-05, "loss": 5.9510498046875, "step": 80 }, { "epoch": 0.0127750177430802, "grad_norm": 2.3897151947021484, "learning_rate": 9.998736692689852e-05, "loss": 5.94254150390625, "step": 90 }, { "epoch": 0.014194464158977998, "grad_norm": 2.9910531044006348, "learning_rate": 9.99859474804826e-05, "loss": 5.9062255859375, "step": 100 }, { "epoch": 0.015613910574875798, "grad_norm": 3.137518882751465, "learning_rate": 9.998452803406672e-05, "loss": 5.9070068359375, "step": 110 }, { "epoch": 0.017033356990773598, "grad_norm": 3.021024703979492, "learning_rate": 9.998310858765082e-05, "loss": 5.87197265625, "step": 120 }, { "epoch": 0.018452803406671398, "grad_norm": 3.499450445175171, "learning_rate": 9.998168914123493e-05, "loss": 5.8237548828125, "step": 130 }, { "epoch": 0.0198722498225692, "grad_norm": 3.87576961517334, "learning_rate": 9.998026969481902e-05, "loss": 5.754150390625, "step": 140 }, { "epoch": 0.021291696238467, "grad_norm": 3.9846458435058594, "learning_rate": 9.997885024840313e-05, "loss": 5.697198486328125, "step": 150 }, { "epoch": 0.0227111426543648, "grad_norm": 4.339130878448486, "learning_rate": 9.997743080198723e-05, "loss": 5.63760986328125, "step": 160 }, { "epoch": 0.0241305890702626, "grad_norm": 4.891483783721924, "learning_rate": 9.997601135557133e-05, "loss": 5.5271728515625, "step": 170 }, { "epoch": 0.0255500354861604, "grad_norm": 5.147222995758057, "learning_rate": 9.997459190915544e-05, "loss": 5.45938720703125, "step": 180 }, { "epoch": 0.0269694819020582, "grad_norm": 5.365755558013916, "learning_rate": 9.997317246273954e-05, "loss": 5.355255126953125, "step": 190 }, { "epoch": 0.028388928317955996, "grad_norm": 5.888001918792725, "learning_rate": 9.997175301632365e-05, "loss": 5.1554931640625, "step": 200 }, { "epoch": 0.029808374733853796, "grad_norm": 6.100172996520996, "learning_rate": 9.997033356990773e-05, "loss": 5.035284423828125, "step": 210 }, { "epoch": 0.031227821149751596, "grad_norm": 6.491486549377441, "learning_rate": 9.996891412349184e-05, "loss": 4.899530029296875, "step": 220 }, { "epoch": 0.032647267565649396, "grad_norm": 6.916806697845459, "learning_rate": 9.996749467707594e-05, "loss": 4.851350402832031, "step": 230 }, { "epoch": 0.034066713981547196, "grad_norm": 6.837950706481934, "learning_rate": 9.996607523066005e-05, "loss": 4.726431274414063, "step": 240 }, { "epoch": 0.035486160397444996, "grad_norm": 7.554074287414551, "learning_rate": 9.996465578424415e-05, "loss": 4.4839630126953125, "step": 250 }, { "epoch": 0.036905606813342796, "grad_norm": 7.574995994567871, "learning_rate": 9.996323633782825e-05, "loss": 4.506732177734375, "step": 260 }, { "epoch": 0.0383250532292406, "grad_norm": 7.498238563537598, "learning_rate": 9.996181689141236e-05, "loss": 4.319998168945313, "step": 270 }, { "epoch": 0.0397444996451384, "grad_norm": 7.978142261505127, "learning_rate": 9.996039744499645e-05, "loss": 4.214613342285157, "step": 280 }, { "epoch": 0.0411639460610362, "grad_norm": 8.194511413574219, "learning_rate": 9.995897799858057e-05, "loss": 4.212762451171875, "step": 290 }, { "epoch": 0.042583392476934, "grad_norm": 8.136639595031738, "learning_rate": 9.995755855216466e-05, "loss": 4.009028625488281, "step": 300 }, { "epoch": 0.0440028388928318, "grad_norm": 8.684012413024902, "learning_rate": 9.995613910574876e-05, "loss": 3.9817459106445314, "step": 310 }, { "epoch": 0.0454222853087296, "grad_norm": 8.888952255249023, "learning_rate": 9.995471965933286e-05, "loss": 3.94019775390625, "step": 320 }, { "epoch": 0.0468417317246274, "grad_norm": 8.79919719696045, "learning_rate": 9.995330021291697e-05, "loss": 3.9265777587890627, "step": 330 }, { "epoch": 0.0482611781405252, "grad_norm": 8.571785926818848, "learning_rate": 9.995188076650107e-05, "loss": 3.7262115478515625, "step": 340 }, { "epoch": 0.049680624556423, "grad_norm": 8.640142440795898, "learning_rate": 9.995046132008518e-05, "loss": 3.644915771484375, "step": 350 }, { "epoch": 0.0511000709723208, "grad_norm": 9.322779655456543, "learning_rate": 9.994904187366927e-05, "loss": 3.644049072265625, "step": 360 }, { "epoch": 0.0525195173882186, "grad_norm": 8.790424346923828, "learning_rate": 9.994762242725337e-05, "loss": 3.4869285583496095, "step": 370 }, { "epoch": 0.0539389638041164, "grad_norm": 9.344154357910156, "learning_rate": 9.994620298083748e-05, "loss": 3.55142822265625, "step": 380 }, { "epoch": 0.05535841022001419, "grad_norm": 8.807840347290039, "learning_rate": 9.994478353442158e-05, "loss": 3.4293190002441407, "step": 390 }, { "epoch": 0.05677785663591199, "grad_norm": 9.36971378326416, "learning_rate": 9.994336408800569e-05, "loss": 3.429082489013672, "step": 400 }, { "epoch": 0.05819730305180979, "grad_norm": 9.73521900177002, "learning_rate": 9.994194464158977e-05, "loss": 3.408639907836914, "step": 410 }, { "epoch": 0.05961674946770759, "grad_norm": 9.646844863891602, "learning_rate": 9.994052519517389e-05, "loss": 3.1950119018554686, "step": 420 }, { "epoch": 0.06103619588360539, "grad_norm": 9.722207069396973, "learning_rate": 9.993910574875798e-05, "loss": 3.4140243530273438, "step": 430 }, { "epoch": 0.06245564229950319, "grad_norm": 10.609601020812988, "learning_rate": 9.99376863023421e-05, "loss": 3.320109558105469, "step": 440 }, { "epoch": 0.063875088715401, "grad_norm": 10.271575927734375, "learning_rate": 9.993626685592619e-05, "loss": 3.232251739501953, "step": 450 }, { "epoch": 0.06529453513129879, "grad_norm": 9.766585350036621, "learning_rate": 9.993484740951029e-05, "loss": 3.149517059326172, "step": 460 }, { "epoch": 0.0667139815471966, "grad_norm": 10.358244895935059, "learning_rate": 9.99334279630944e-05, "loss": 3.1863967895507814, "step": 470 }, { "epoch": 0.06813342796309439, "grad_norm": 10.473136901855469, "learning_rate": 9.99320085166785e-05, "loss": 3.222390365600586, "step": 480 }, { "epoch": 0.0695528743789922, "grad_norm": 9.905110359191895, "learning_rate": 9.993058907026261e-05, "loss": 3.1823768615722656, "step": 490 }, { "epoch": 0.07097232079488999, "grad_norm": 9.858973503112793, "learning_rate": 9.99291696238467e-05, "loss": 2.9202560424804687, "step": 500 }, { "epoch": 0.07097232079488999, "eval_accuracy": 0.1867489031601704, "eval_loss": 3.0744524002075195, "eval_runtime": 31.2289, "eval_samples_per_second": 503.605, "eval_steps_per_second": 15.755, "step": 500 }, { "epoch": 0.0723917672107878, "grad_norm": 10.224215507507324, "learning_rate": 9.992775017743082e-05, "loss": 3.0410499572753906, "step": 510 }, { "epoch": 0.07381121362668559, "grad_norm": 9.867650032043457, "learning_rate": 9.99263307310149e-05, "loss": 3.116912078857422, "step": 520 }, { "epoch": 0.07523066004258339, "grad_norm": 10.343064308166504, "learning_rate": 9.992491128459901e-05, "loss": 3.06390266418457, "step": 530 }, { "epoch": 0.0766501064584812, "grad_norm": 10.38116455078125, "learning_rate": 9.992349183818311e-05, "loss": 2.973680114746094, "step": 540 }, { "epoch": 0.07806955287437899, "grad_norm": 10.979643821716309, "learning_rate": 9.992207239176722e-05, "loss": 3.0906436920166014, "step": 550 }, { "epoch": 0.0794889992902768, "grad_norm": 10.06657886505127, "learning_rate": 9.992065294535132e-05, "loss": 3.0091484069824217, "step": 560 }, { "epoch": 0.08090844570617459, "grad_norm": 10.663322448730469, "learning_rate": 9.991923349893541e-05, "loss": 2.862255859375, "step": 570 }, { "epoch": 0.0823278921220724, "grad_norm": 9.277785301208496, "learning_rate": 9.991781405251952e-05, "loss": 2.8638259887695314, "step": 580 }, { "epoch": 0.08374733853797019, "grad_norm": 10.807332038879395, "learning_rate": 9.991639460610362e-05, "loss": 2.732352066040039, "step": 590 }, { "epoch": 0.085166784953868, "grad_norm": 9.970373153686523, "learning_rate": 9.991497515968773e-05, "loss": 2.736968231201172, "step": 600 }, { "epoch": 0.08658623136976579, "grad_norm": 11.008269309997559, "learning_rate": 9.991355571327183e-05, "loss": 2.7735246658325194, "step": 610 }, { "epoch": 0.0880056777856636, "grad_norm": 8.758193969726562, "learning_rate": 9.991213626685593e-05, "loss": 2.5436214447021483, "step": 620 }, { "epoch": 0.08942512420156139, "grad_norm": 11.253259658813477, "learning_rate": 9.991071682044003e-05, "loss": 2.748835563659668, "step": 630 }, { "epoch": 0.0908445706174592, "grad_norm": 10.979547500610352, "learning_rate": 9.990929737402414e-05, "loss": 2.7314834594726562, "step": 640 }, { "epoch": 0.09226401703335699, "grad_norm": 11.182887077331543, "learning_rate": 9.990787792760823e-05, "loss": 2.645678901672363, "step": 650 }, { "epoch": 0.0936834634492548, "grad_norm": 10.636208534240723, "learning_rate": 9.990645848119234e-05, "loss": 2.5704013824462892, "step": 660 }, { "epoch": 0.09510290986515259, "grad_norm": 10.351170539855957, "learning_rate": 9.990503903477644e-05, "loss": 2.5628406524658205, "step": 670 }, { "epoch": 0.0965223562810504, "grad_norm": 9.914809226989746, "learning_rate": 9.990361958836054e-05, "loss": 2.5872230529785156, "step": 680 }, { "epoch": 0.09794180269694819, "grad_norm": 10.839837074279785, "learning_rate": 9.990220014194465e-05, "loss": 2.490940475463867, "step": 690 }, { "epoch": 0.099361249112846, "grad_norm": 11.259613990783691, "learning_rate": 9.990078069552875e-05, "loss": 2.64483585357666, "step": 700 }, { "epoch": 0.10078069552874379, "grad_norm": 11.213078498840332, "learning_rate": 9.989936124911286e-05, "loss": 2.5397150039672853, "step": 710 }, { "epoch": 0.1022001419446416, "grad_norm": 10.366206169128418, "learning_rate": 9.989794180269694e-05, "loss": 2.457781219482422, "step": 720 }, { "epoch": 0.10361958836053939, "grad_norm": 11.44458293914795, "learning_rate": 9.989652235628105e-05, "loss": 2.5090484619140625, "step": 730 }, { "epoch": 0.1050390347764372, "grad_norm": 11.689805030822754, "learning_rate": 9.989510290986515e-05, "loss": 2.409171485900879, "step": 740 }, { "epoch": 0.10645848119233499, "grad_norm": 10.568279266357422, "learning_rate": 9.989368346344926e-05, "loss": 2.3308380126953123, "step": 750 }, { "epoch": 0.1078779276082328, "grad_norm": 11.917696952819824, "learning_rate": 9.989226401703337e-05, "loss": 2.3733493804931642, "step": 760 }, { "epoch": 0.10929737402413059, "grad_norm": 9.960722923278809, "learning_rate": 9.989098651525906e-05, "loss": 2.4058095932006838, "step": 770 }, { "epoch": 0.11071682044002838, "grad_norm": 11.068999290466309, "learning_rate": 9.988956706884315e-05, "loss": 2.4371658325195313, "step": 780 }, { "epoch": 0.11213626685592619, "grad_norm": 10.340009689331055, "learning_rate": 9.988814762242725e-05, "loss": 2.2587520599365236, "step": 790 }, { "epoch": 0.11355571327182398, "grad_norm": 9.941303253173828, "learning_rate": 9.988672817601136e-05, "loss": 2.268446350097656, "step": 800 }, { "epoch": 0.11497515968772179, "grad_norm": 11.490272521972656, "learning_rate": 9.988530872959546e-05, "loss": 2.471067428588867, "step": 810 }, { "epoch": 0.11639460610361958, "grad_norm": 10.67241382598877, "learning_rate": 9.988388928317957e-05, "loss": 2.3497791290283203, "step": 820 }, { "epoch": 0.11781405251951739, "grad_norm": 10.710894584655762, "learning_rate": 9.988246983676367e-05, "loss": 2.1724626541137697, "step": 830 }, { "epoch": 0.11923349893541518, "grad_norm": 10.985452651977539, "learning_rate": 9.988105039034778e-05, "loss": 2.1848114013671873, "step": 840 }, { "epoch": 0.12065294535131299, "grad_norm": 10.063145637512207, "learning_rate": 9.987963094393186e-05, "loss": 2.180558776855469, "step": 850 }, { "epoch": 0.12207239176721078, "grad_norm": 11.236614227294922, "learning_rate": 9.987821149751597e-05, "loss": 2.282668876647949, "step": 860 }, { "epoch": 0.12349183818310859, "grad_norm": 10.98898983001709, "learning_rate": 9.987679205110007e-05, "loss": 2.235186767578125, "step": 870 }, { "epoch": 0.12491128459900638, "grad_norm": 11.805492401123047, "learning_rate": 9.987537260468418e-05, "loss": 2.2264921188354494, "step": 880 }, { "epoch": 0.1263307310149042, "grad_norm": 10.717041015625, "learning_rate": 9.987395315826828e-05, "loss": 2.1385255813598634, "step": 890 }, { "epoch": 0.127750177430802, "grad_norm": 9.613192558288574, "learning_rate": 9.987253371185238e-05, "loss": 2.1964336395263673, "step": 900 }, { "epoch": 0.12916962384669978, "grad_norm": 10.594833374023438, "learning_rate": 9.987111426543649e-05, "loss": 2.050688362121582, "step": 910 }, { "epoch": 0.13058907026259758, "grad_norm": 11.596671104431152, "learning_rate": 9.986969481902059e-05, "loss": 2.077385139465332, "step": 920 }, { "epoch": 0.1320085166784954, "grad_norm": 10.779032707214355, "learning_rate": 9.98682753726047e-05, "loss": 2.0280479431152343, "step": 930 }, { "epoch": 0.1334279630943932, "grad_norm": 10.522924423217773, "learning_rate": 9.98668559261888e-05, "loss": 1.9384689331054688, "step": 940 }, { "epoch": 0.13484740951029098, "grad_norm": 9.86844539642334, "learning_rate": 9.986543647977289e-05, "loss": 2.0612548828125, "step": 950 }, { "epoch": 0.13626685592618878, "grad_norm": 12.521405220031738, "learning_rate": 9.986401703335699e-05, "loss": 2.139466094970703, "step": 960 }, { "epoch": 0.1376863023420866, "grad_norm": 11.292656898498535, "learning_rate": 9.98625975869411e-05, "loss": 2.077956199645996, "step": 970 }, { "epoch": 0.1391057487579844, "grad_norm": 11.186986923217773, "learning_rate": 9.98611781405252e-05, "loss": 2.028730010986328, "step": 980 }, { "epoch": 0.14052519517388218, "grad_norm": 10.553022384643555, "learning_rate": 9.985975869410931e-05, "loss": 1.9375551223754883, "step": 990 }, { "epoch": 0.14194464158977999, "grad_norm": 11.089204788208008, "learning_rate": 9.98583392476934e-05, "loss": 2.0689823150634767, "step": 1000 }, { "epoch": 0.14194464158977999, "eval_accuracy": 0.42239460799898265, "eval_loss": 1.9010688066482544, "eval_runtime": 31.4593, "eval_samples_per_second": 499.916, "eval_steps_per_second": 15.639, "step": 1000 }, { "epoch": 0.1433640880056778, "grad_norm": 10.988676071166992, "learning_rate": 9.98569198012775e-05, "loss": 1.9830604553222657, "step": 1010 }, { "epoch": 0.1447835344215756, "grad_norm": 11.2459077835083, "learning_rate": 9.985550035486161e-05, "loss": 1.9190074920654296, "step": 1020 }, { "epoch": 0.14620298083747338, "grad_norm": 10.437894821166992, "learning_rate": 9.985408090844571e-05, "loss": 1.8999460220336915, "step": 1030 }, { "epoch": 0.14762242725337119, "grad_norm": 10.94793701171875, "learning_rate": 9.985266146202982e-05, "loss": 1.8579456329345703, "step": 1040 }, { "epoch": 0.149041873669269, "grad_norm": 11.168233871459961, "learning_rate": 9.98512420156139e-05, "loss": 1.8979732513427734, "step": 1050 }, { "epoch": 0.15046132008516677, "grad_norm": 10.14195728302002, "learning_rate": 9.984982256919802e-05, "loss": 1.7833553314208985, "step": 1060 }, { "epoch": 0.15188076650106458, "grad_norm": 9.160737991333008, "learning_rate": 9.984840312278211e-05, "loss": 1.8624576568603515, "step": 1070 }, { "epoch": 0.1533002129169624, "grad_norm": 11.151049613952637, "learning_rate": 9.984698367636623e-05, "loss": 1.8210905075073243, "step": 1080 }, { "epoch": 0.1547196593328602, "grad_norm": 10.053725242614746, "learning_rate": 9.984556422995032e-05, "loss": 1.7738643646240235, "step": 1090 }, { "epoch": 0.15613910574875797, "grad_norm": 10.97727108001709, "learning_rate": 9.984414478353442e-05, "loss": 1.866429328918457, "step": 1100 }, { "epoch": 0.15755855216465578, "grad_norm": 12.384384155273438, "learning_rate": 9.984272533711853e-05, "loss": 1.8680984497070312, "step": 1110 }, { "epoch": 0.1589779985805536, "grad_norm": 11.387879371643066, "learning_rate": 9.984130589070263e-05, "loss": 1.8034194946289062, "step": 1120 }, { "epoch": 0.1603974449964514, "grad_norm": 10.6587495803833, "learning_rate": 9.983988644428674e-05, "loss": 1.772690773010254, "step": 1130 }, { "epoch": 0.16181689141234917, "grad_norm": 12.721858024597168, "learning_rate": 9.983846699787084e-05, "loss": 1.7724496841430664, "step": 1140 }, { "epoch": 0.16323633782824698, "grad_norm": 11.116838455200195, "learning_rate": 9.983704755145493e-05, "loss": 1.7527042388916017, "step": 1150 }, { "epoch": 0.1646557842441448, "grad_norm": 10.033406257629395, "learning_rate": 9.983562810503903e-05, "loss": 1.674898338317871, "step": 1160 }, { "epoch": 0.1660752306600426, "grad_norm": 11.121773719787598, "learning_rate": 9.983420865862314e-05, "loss": 1.741505241394043, "step": 1170 }, { "epoch": 0.16749467707594037, "grad_norm": 11.052094459533691, "learning_rate": 9.983278921220724e-05, "loss": 1.7749841690063477, "step": 1180 }, { "epoch": 0.16891412349183818, "grad_norm": 10.183452606201172, "learning_rate": 9.983136976579135e-05, "loss": 1.6881484985351562, "step": 1190 }, { "epoch": 0.170333569907736, "grad_norm": 11.106999397277832, "learning_rate": 9.982995031937545e-05, "loss": 1.814961051940918, "step": 1200 }, { "epoch": 0.1717530163236338, "grad_norm": 12.08647632598877, "learning_rate": 9.982853087295955e-05, "loss": 1.682515525817871, "step": 1210 }, { "epoch": 0.17317246273953157, "grad_norm": 13.744584083557129, "learning_rate": 9.982711142654366e-05, "loss": 1.6713733673095703, "step": 1220 }, { "epoch": 0.17459190915542938, "grad_norm": 9.970173835754395, "learning_rate": 9.982569198012775e-05, "loss": 1.711156463623047, "step": 1230 }, { "epoch": 0.1760113555713272, "grad_norm": 11.027495384216309, "learning_rate": 9.982427253371186e-05, "loss": 1.759619140625, "step": 1240 }, { "epoch": 0.177430801987225, "grad_norm": 10.876315116882324, "learning_rate": 9.982285308729596e-05, "loss": 1.618482780456543, "step": 1250 }, { "epoch": 0.17885024840312277, "grad_norm": 10.26490592956543, "learning_rate": 9.982143364088006e-05, "loss": 1.6674427032470702, "step": 1260 }, { "epoch": 0.18026969481902058, "grad_norm": 11.872292518615723, "learning_rate": 9.982001419446416e-05, "loss": 1.6325908660888673, "step": 1270 }, { "epoch": 0.1816891412349184, "grad_norm": 9.946234703063965, "learning_rate": 9.981859474804827e-05, "loss": 1.5453743934631348, "step": 1280 }, { "epoch": 0.18310858765081617, "grad_norm": 11.03128719329834, "learning_rate": 9.981717530163236e-05, "loss": 1.658684539794922, "step": 1290 }, { "epoch": 0.18452803406671398, "grad_norm": 12.145915031433105, "learning_rate": 9.981575585521648e-05, "loss": 1.5792274475097656, "step": 1300 }, { "epoch": 0.18594748048261178, "grad_norm": 11.820379257202148, "learning_rate": 9.981433640880057e-05, "loss": 1.5301803588867187, "step": 1310 }, { "epoch": 0.1873669268985096, "grad_norm": 11.046746253967285, "learning_rate": 9.981291696238467e-05, "loss": 1.6124080657958983, "step": 1320 }, { "epoch": 0.18878637331440737, "grad_norm": 9.545868873596191, "learning_rate": 9.981149751596878e-05, "loss": 1.5502593994140625, "step": 1330 }, { "epoch": 0.19020581973030518, "grad_norm": 11.999979019165039, "learning_rate": 9.981007806955288e-05, "loss": 1.5360203742980958, "step": 1340 }, { "epoch": 0.19162526614620298, "grad_norm": 9.949675559997559, "learning_rate": 9.980865862313699e-05, "loss": 1.353858470916748, "step": 1350 }, { "epoch": 0.1930447125621008, "grad_norm": 11.573400497436523, "learning_rate": 9.980723917672107e-05, "loss": 1.3946660995483398, "step": 1360 }, { "epoch": 0.19446415897799857, "grad_norm": 10.249485969543457, "learning_rate": 9.980581973030518e-05, "loss": 1.518262004852295, "step": 1370 }, { "epoch": 0.19588360539389638, "grad_norm": 10.011629104614258, "learning_rate": 9.980440028388928e-05, "loss": 1.5000194549560546, "step": 1380 }, { "epoch": 0.19730305180979418, "grad_norm": 12.186440467834473, "learning_rate": 9.980298083747339e-05, "loss": 1.554741382598877, "step": 1390 }, { "epoch": 0.198722498225692, "grad_norm": 11.845844268798828, "learning_rate": 9.980156139105749e-05, "loss": 1.4599843978881837, "step": 1400 }, { "epoch": 0.20014194464158977, "grad_norm": 10.98592472076416, "learning_rate": 9.980014194464159e-05, "loss": 1.4062080383300781, "step": 1410 }, { "epoch": 0.20156139105748758, "grad_norm": 11.54171371459961, "learning_rate": 9.97987224982257e-05, "loss": 1.5128003120422364, "step": 1420 }, { "epoch": 0.20298083747338538, "grad_norm": 10.248682022094727, "learning_rate": 9.97973030518098e-05, "loss": 1.5022719383239747, "step": 1430 }, { "epoch": 0.2044002838892832, "grad_norm": 8.78536319732666, "learning_rate": 9.97958836053939e-05, "loss": 1.4118841171264649, "step": 1440 }, { "epoch": 0.20581973030518097, "grad_norm": 9.993626594543457, "learning_rate": 9.9794464158978e-05, "loss": 1.3945957183837892, "step": 1450 }, { "epoch": 0.20723917672107878, "grad_norm": 11.31412124633789, "learning_rate": 9.97930447125621e-05, "loss": 1.26229887008667, "step": 1460 }, { "epoch": 0.20865862313697658, "grad_norm": 11.182840347290039, "learning_rate": 9.97916252661462e-05, "loss": 1.3171740531921388, "step": 1470 }, { "epoch": 0.2100780695528744, "grad_norm": 12.25224781036377, "learning_rate": 9.979020581973031e-05, "loss": 1.3310781478881837, "step": 1480 }, { "epoch": 0.21149751596877217, "grad_norm": 11.81201457977295, "learning_rate": 9.978878637331441e-05, "loss": 1.3043070793151856, "step": 1490 }, { "epoch": 0.21291696238466998, "grad_norm": 10.484480857849121, "learning_rate": 9.978736692689852e-05, "loss": 1.2629288673400878, "step": 1500 }, { "epoch": 0.21291696238466998, "eval_accuracy": 0.5395180263241559, "eval_loss": 1.438815712928772, "eval_runtime": 32.1456, "eval_samples_per_second": 489.242, "eval_steps_per_second": 15.305, "step": 1500 }, { "epoch": 0.21433640880056778, "grad_norm": 10.796157836914062, "learning_rate": 9.978594748048262e-05, "loss": 1.3752121925354004, "step": 1510 }, { "epoch": 0.2157558552164656, "grad_norm": 10.1256742477417, "learning_rate": 9.978452803406671e-05, "loss": 1.3005435943603516, "step": 1520 }, { "epoch": 0.21717530163236337, "grad_norm": 11.182530403137207, "learning_rate": 9.978310858765082e-05, "loss": 1.3048934936523438, "step": 1530 }, { "epoch": 0.21859474804826118, "grad_norm": 10.190278053283691, "learning_rate": 9.978168914123492e-05, "loss": 1.3993605613708495, "step": 1540 }, { "epoch": 0.22001419446415899, "grad_norm": 10.497735977172852, "learning_rate": 9.978026969481903e-05, "loss": 1.303945541381836, "step": 1550 }, { "epoch": 0.22143364088005676, "grad_norm": 10.535606384277344, "learning_rate": 9.977885024840313e-05, "loss": 1.2210904121398927, "step": 1560 }, { "epoch": 0.22285308729595457, "grad_norm": 11.385029792785645, "learning_rate": 9.977743080198723e-05, "loss": 1.3508376121520995, "step": 1570 }, { "epoch": 0.22427253371185238, "grad_norm": 9.528643608093262, "learning_rate": 9.977601135557132e-05, "loss": 1.2278815269470216, "step": 1580 }, { "epoch": 0.22569198012775019, "grad_norm": 13.161009788513184, "learning_rate": 9.977459190915544e-05, "loss": 1.254448413848877, "step": 1590 }, { "epoch": 0.22711142654364797, "grad_norm": 11.288809776306152, "learning_rate": 9.977317246273953e-05, "loss": 1.271047878265381, "step": 1600 }, { "epoch": 0.22853087295954577, "grad_norm": 11.30105209350586, "learning_rate": 9.977175301632364e-05, "loss": 1.3242988586425781, "step": 1610 }, { "epoch": 0.22995031937544358, "grad_norm": 10.600774765014648, "learning_rate": 9.977033356990774e-05, "loss": 1.3170942306518554, "step": 1620 }, { "epoch": 0.2313697657913414, "grad_norm": 10.652543067932129, "learning_rate": 9.976891412349184e-05, "loss": 1.3998719215393067, "step": 1630 }, { "epoch": 0.23278921220723917, "grad_norm": 11.354793548583984, "learning_rate": 9.976749467707595e-05, "loss": 1.270443820953369, "step": 1640 }, { "epoch": 0.23420865862313697, "grad_norm": 9.926568031311035, "learning_rate": 9.976607523066005e-05, "loss": 1.117215347290039, "step": 1650 }, { "epoch": 0.23562810503903478, "grad_norm": 11.167335510253906, "learning_rate": 9.976465578424416e-05, "loss": 1.348717212677002, "step": 1660 }, { "epoch": 0.2370475514549326, "grad_norm": 11.364425659179688, "learning_rate": 9.976323633782824e-05, "loss": 1.2113998413085938, "step": 1670 }, { "epoch": 0.23846699787083037, "grad_norm": 10.315034866333008, "learning_rate": 9.976181689141235e-05, "loss": 1.2621678352355956, "step": 1680 }, { "epoch": 0.23988644428672817, "grad_norm": 11.332146644592285, "learning_rate": 9.976039744499645e-05, "loss": 1.2919418334960937, "step": 1690 }, { "epoch": 0.24130589070262598, "grad_norm": 9.863037109375, "learning_rate": 9.975897799858056e-05, "loss": 1.262222957611084, "step": 1700 }, { "epoch": 0.2427253371185238, "grad_norm": 13.898163795471191, "learning_rate": 9.975755855216467e-05, "loss": 1.349098300933838, "step": 1710 }, { "epoch": 0.24414478353442157, "grad_norm": 9.008386611938477, "learning_rate": 9.975613910574876e-05, "loss": 1.1653017044067382, "step": 1720 }, { "epoch": 0.24556422995031937, "grad_norm": 9.755669593811035, "learning_rate": 9.975471965933287e-05, "loss": 1.304057788848877, "step": 1730 }, { "epoch": 0.24698367636621718, "grad_norm": 10.742278099060059, "learning_rate": 9.975330021291696e-05, "loss": 1.1656038284301757, "step": 1740 }, { "epoch": 0.248403122782115, "grad_norm": 11.937880516052246, "learning_rate": 9.975188076650107e-05, "loss": 1.2565963745117188, "step": 1750 }, { "epoch": 0.24982256919801277, "grad_norm": 9.80545711517334, "learning_rate": 9.975046132008517e-05, "loss": 1.1316876411437988, "step": 1760 }, { "epoch": 0.2512420156139106, "grad_norm": 11.162557601928711, "learning_rate": 9.974904187366927e-05, "loss": 1.2094581604003907, "step": 1770 }, { "epoch": 0.2526614620298084, "grad_norm": 12.278450965881348, "learning_rate": 9.974762242725337e-05, "loss": 1.2499947547912598, "step": 1780 }, { "epoch": 0.2540809084457062, "grad_norm": 10.95953369140625, "learning_rate": 9.974620298083748e-05, "loss": 1.1540046691894532, "step": 1790 }, { "epoch": 0.255500354861604, "grad_norm": 7.865696430206299, "learning_rate": 9.974478353442159e-05, "loss": 1.1665989875793457, "step": 1800 }, { "epoch": 0.25691980127750175, "grad_norm": 12.1609468460083, "learning_rate": 9.974336408800569e-05, "loss": 1.120746898651123, "step": 1810 }, { "epoch": 0.25833924769339955, "grad_norm": 9.554359436035156, "learning_rate": 9.974194464158978e-05, "loss": 1.3381189346313476, "step": 1820 }, { "epoch": 0.25975869410929736, "grad_norm": 9.497129440307617, "learning_rate": 9.974052519517388e-05, "loss": 1.1758546829223633, "step": 1830 }, { "epoch": 0.26117814052519517, "grad_norm": 10.584992408752441, "learning_rate": 9.973910574875799e-05, "loss": 1.0787659645080567, "step": 1840 }, { "epoch": 0.262597586941093, "grad_norm": 9.558980941772461, "learning_rate": 9.973768630234209e-05, "loss": 0.9334567070007325, "step": 1850 }, { "epoch": 0.2640170333569908, "grad_norm": 9.41112995147705, "learning_rate": 9.97362668559262e-05, "loss": 1.1376053810119628, "step": 1860 }, { "epoch": 0.2654364797728886, "grad_norm": 11.666831970214844, "learning_rate": 9.973484740951028e-05, "loss": 1.207914447784424, "step": 1870 }, { "epoch": 0.2668559261887864, "grad_norm": 11.217955589294434, "learning_rate": 9.97334279630944e-05, "loss": 1.052849578857422, "step": 1880 }, { "epoch": 0.26827537260468415, "grad_norm": 8.3615083694458, "learning_rate": 9.97320085166785e-05, "loss": 0.9782976150512696, "step": 1890 }, { "epoch": 0.26969481902058196, "grad_norm": 10.69944953918457, "learning_rate": 9.97305890702626e-05, "loss": 0.9639101982116699, "step": 1900 }, { "epoch": 0.27111426543647976, "grad_norm": 11.15194034576416, "learning_rate": 9.972916962384671e-05, "loss": 1.0744239807128906, "step": 1910 }, { "epoch": 0.27253371185237757, "grad_norm": 10.363690376281738, "learning_rate": 9.972775017743081e-05, "loss": 1.1180108070373536, "step": 1920 }, { "epoch": 0.2739531582682754, "grad_norm": 10.816513061523438, "learning_rate": 9.972633073101491e-05, "loss": 1.118791103363037, "step": 1930 }, { "epoch": 0.2753726046841732, "grad_norm": 8.64388656616211, "learning_rate": 9.9724911284599e-05, "loss": 1.1368459701538085, "step": 1940 }, { "epoch": 0.276792051100071, "grad_norm": 9.002252578735352, "learning_rate": 9.972349183818312e-05, "loss": 1.1344121932983398, "step": 1950 }, { "epoch": 0.2782114975159688, "grad_norm": 11.083386421203613, "learning_rate": 9.972207239176721e-05, "loss": 1.1827295303344727, "step": 1960 }, { "epoch": 0.27963094393186655, "grad_norm": 8.360145568847656, "learning_rate": 9.972065294535133e-05, "loss": 0.9954969406127929, "step": 1970 }, { "epoch": 0.28105039034776436, "grad_norm": 12.982026100158691, "learning_rate": 9.971923349893542e-05, "loss": 0.9865982055664062, "step": 1980 }, { "epoch": 0.28246983676366216, "grad_norm": 9.3854341506958, "learning_rate": 9.971781405251952e-05, "loss": 0.9238475799560547, "step": 1990 }, { "epoch": 0.28388928317955997, "grad_norm": 10.693597793579102, "learning_rate": 9.971639460610363e-05, "loss": 0.9660484313964843, "step": 2000 }, { "epoch": 0.28388928317955997, "eval_accuracy": 0.6596935206968907, "eval_loss": 1.0827350616455078, "eval_runtime": 31.44, "eval_samples_per_second": 500.222, "eval_steps_per_second": 15.649, "step": 2000 } ], "logging_steps": 10, "max_steps": 704500, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }