diff --git "a/adapters/checkpoint-11240/trainer_state.json" "b/adapters/checkpoint-11240/trainer_state.json" new file mode 100644--- /dev/null +++ "b/adapters/checkpoint-11240/trainer_state.json" @@ -0,0 +1,8078 @@ +{ + "best_global_step": 11000, + "best_metric": 0.6082284450531006, + "best_model_checkpoint": "/workspace/rails-finetune/adapters-qwen3-8b/checkpoint-11000", + "epoch": 2.0, + "eval_steps": 500, + "global_step": 11240, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0017793594306049821, + "grad_norm": 1.1857829093933105, + "learning_rate": 9.000000000000001e-07, + "loss": 1.7382530212402343, + "step": 10 + }, + { + "epoch": 0.0035587188612099642, + "grad_norm": 1.1971900463104248, + "learning_rate": 1.9000000000000002e-06, + "loss": 1.7001752853393555, + "step": 20 + }, + { + "epoch": 0.005338078291814947, + "grad_norm": 1.2064718008041382, + "learning_rate": 2.9e-06, + "loss": 1.7585294723510743, + "step": 30 + }, + { + "epoch": 0.0071174377224199285, + "grad_norm": 1.1466728448867798, + "learning_rate": 3.900000000000001e-06, + "loss": 1.6992141723632812, + "step": 40 + }, + { + "epoch": 0.008896797153024912, + "grad_norm": 1.0449944734573364, + "learning_rate": 4.9000000000000005e-06, + "loss": 1.7329919815063477, + "step": 50 + }, + { + "epoch": 0.010676156583629894, + "grad_norm": 1.25763738155365, + "learning_rate": 5.9e-06, + "loss": 1.5657649040222168, + "step": 60 + }, + { + "epoch": 0.012455516014234875, + "grad_norm": 0.9314267039299011, + "learning_rate": 6.9e-06, + "loss": 1.5626873970031738, + "step": 70 + }, + { + "epoch": 0.014234875444839857, + "grad_norm": 0.8502155542373657, + "learning_rate": 7.9e-06, + "loss": 1.3948446273803712, + "step": 80 + }, + { + "epoch": 0.01601423487544484, + "grad_norm": 0.505824625492096, + "learning_rate": 8.900000000000001e-06, + "loss": 1.318073558807373, + "step": 90 + }, + { + "epoch": 0.017793594306049824, + "grad_norm": 0.4828319251537323, + "learning_rate": 9.9e-06, + "loss": 1.2034348487854003, + "step": 100 + }, + { + "epoch": 0.019572953736654804, + "grad_norm": 0.4351405203342438, + "learning_rate": 1.0900000000000002e-05, + "loss": 1.2164586067199707, + "step": 110 + }, + { + "epoch": 0.021352313167259787, + "grad_norm": 0.3060351014137268, + "learning_rate": 1.1900000000000001e-05, + "loss": 1.1403413772583009, + "step": 120 + }, + { + "epoch": 0.023131672597864767, + "grad_norm": 0.30274662375450134, + "learning_rate": 1.2900000000000002e-05, + "loss": 1.0329069137573241, + "step": 130 + }, + { + "epoch": 0.02491103202846975, + "grad_norm": 0.27452540397644043, + "learning_rate": 1.39e-05, + "loss": 1.0137288093566894, + "step": 140 + }, + { + "epoch": 0.026690391459074734, + "grad_norm": 0.22874200344085693, + "learning_rate": 1.4900000000000001e-05, + "loss": 0.9552822113037109, + "step": 150 + }, + { + "epoch": 0.028469750889679714, + "grad_norm": 0.20503273606300354, + "learning_rate": 1.5900000000000004e-05, + "loss": 0.9580234527587891, + "step": 160 + }, + { + "epoch": 0.030249110320284697, + "grad_norm": 0.20734967291355133, + "learning_rate": 1.69e-05, + "loss": 0.9269493103027344, + "step": 170 + }, + { + "epoch": 0.03202846975088968, + "grad_norm": 0.2046293467283249, + "learning_rate": 1.79e-05, + "loss": 0.898008155822754, + "step": 180 + }, + { + "epoch": 0.033807829181494664, + "grad_norm": 0.20902672410011292, + "learning_rate": 1.8900000000000002e-05, + "loss": 0.8822259902954102, + "step": 190 + }, + { + "epoch": 0.03558718861209965, + "grad_norm": 0.265747606754303, + "learning_rate": 1.9900000000000003e-05, + "loss": 0.9367627143859864, + "step": 200 + }, + { + "epoch": 0.037366548042704624, + "grad_norm": 0.21941998600959778, + "learning_rate": 1.9999967204339314e-05, + "loss": 0.8784577369689941, + "step": 210 + }, + { + "epoch": 0.03914590747330961, + "grad_norm": 0.23806284368038177, + "learning_rate": 1.999985383689953e-05, + "loss": 0.8498974800109863, + "step": 220 + }, + { + "epoch": 0.04092526690391459, + "grad_norm": 0.2539118826389313, + "learning_rate": 1.9999659493713742e-05, + "loss": 0.8517349243164063, + "step": 230 + }, + { + "epoch": 0.042704626334519574, + "grad_norm": 0.279153436422348, + "learning_rate": 1.9999384176355685e-05, + "loss": 0.8303974151611329, + "step": 240 + }, + { + "epoch": 0.04448398576512456, + "grad_norm": 0.2661009132862091, + "learning_rate": 1.9999027887054793e-05, + "loss": 0.8362269401550293, + "step": 250 + }, + { + "epoch": 0.046263345195729534, + "grad_norm": 0.2643781006336212, + "learning_rate": 1.9998590628696186e-05, + "loss": 0.7995978832244873, + "step": 260 + }, + { + "epoch": 0.04804270462633452, + "grad_norm": 0.22854603826999664, + "learning_rate": 1.9998072404820648e-05, + "loss": 0.8313385009765625, + "step": 270 + }, + { + "epoch": 0.0498220640569395, + "grad_norm": 0.25225695967674255, + "learning_rate": 1.99974732196246e-05, + "loss": 0.8484455108642578, + "step": 280 + }, + { + "epoch": 0.051601423487544484, + "grad_norm": 0.2538894712924957, + "learning_rate": 1.999679307796006e-05, + "loss": 0.8153322219848633, + "step": 290 + }, + { + "epoch": 0.05338078291814947, + "grad_norm": 0.257697194814682, + "learning_rate": 1.9996031985334604e-05, + "loss": 0.8145216941833496, + "step": 300 + }, + { + "epoch": 0.05516014234875445, + "grad_norm": 0.2465839385986328, + "learning_rate": 1.9995189947911323e-05, + "loss": 0.8290293693542481, + "step": 310 + }, + { + "epoch": 0.05693950177935943, + "grad_norm": 0.2330036759376526, + "learning_rate": 1.9994266972508786e-05, + "loss": 0.8094453811645508, + "step": 320 + }, + { + "epoch": 0.05871886120996441, + "grad_norm": 0.24966399371623993, + "learning_rate": 1.9993263066600956e-05, + "loss": 0.7964043140411377, + "step": 330 + }, + { + "epoch": 0.060498220640569395, + "grad_norm": 0.2663300335407257, + "learning_rate": 1.9992178238317156e-05, + "loss": 0.7674037456512451, + "step": 340 + }, + { + "epoch": 0.06227758007117438, + "grad_norm": 0.29889997839927673, + "learning_rate": 1.9991012496442e-05, + "loss": 0.7757882118225098, + "step": 350 + }, + { + "epoch": 0.06405693950177936, + "grad_norm": 0.2283693104982376, + "learning_rate": 1.9989765850415303e-05, + "loss": 0.7695906162261963, + "step": 360 + }, + { + "epoch": 0.06583629893238434, + "grad_norm": 0.2109132707118988, + "learning_rate": 1.9988438310332015e-05, + "loss": 0.7833895683288574, + "step": 370 + }, + { + "epoch": 0.06761565836298933, + "grad_norm": 0.2546916604042053, + "learning_rate": 1.998702988694216e-05, + "loss": 0.8128045082092286, + "step": 380 + }, + { + "epoch": 0.0693950177935943, + "grad_norm": 0.2531549632549286, + "learning_rate": 1.998554059165071e-05, + "loss": 0.7902643203735351, + "step": 390 + }, + { + "epoch": 0.0711743772241993, + "grad_norm": 0.21756362915039062, + "learning_rate": 1.9983970436517523e-05, + "loss": 0.7454084873199462, + "step": 400 + }, + { + "epoch": 0.07295373665480427, + "grad_norm": 0.2294779121875763, + "learning_rate": 1.9982319434257236e-05, + "loss": 0.7487451553344726, + "step": 410 + }, + { + "epoch": 0.07473309608540925, + "grad_norm": 0.26356714963912964, + "learning_rate": 1.9980587598239155e-05, + "loss": 0.748570442199707, + "step": 420 + }, + { + "epoch": 0.07651245551601424, + "grad_norm": 0.22382904589176178, + "learning_rate": 1.9978774942487155e-05, + "loss": 0.7261887073516846, + "step": 430 + }, + { + "epoch": 0.07829181494661921, + "grad_norm": 0.22544927895069122, + "learning_rate": 1.997688148167957e-05, + "loss": 0.7761906623840332, + "step": 440 + }, + { + "epoch": 0.0800711743772242, + "grad_norm": 0.2679840922355652, + "learning_rate": 1.997490723114906e-05, + "loss": 0.7872249603271484, + "step": 450 + }, + { + "epoch": 0.08185053380782918, + "grad_norm": 0.20914621651172638, + "learning_rate": 1.9972852206882504e-05, + "loss": 0.7876029014587402, + "step": 460 + }, + { + "epoch": 0.08362989323843416, + "grad_norm": 0.36782026290893555, + "learning_rate": 1.9970716425520854e-05, + "loss": 0.7812703132629395, + "step": 470 + }, + { + "epoch": 0.08540925266903915, + "grad_norm": 0.2317405343055725, + "learning_rate": 1.9968499904359017e-05, + "loss": 0.7694793701171875, + "step": 480 + }, + { + "epoch": 0.08718861209964412, + "grad_norm": 0.23587286472320557, + "learning_rate": 1.9966202661345694e-05, + "loss": 0.7593471050262451, + "step": 490 + }, + { + "epoch": 0.08896797153024912, + "grad_norm": 0.3179067075252533, + "learning_rate": 1.9963824715083255e-05, + "loss": 0.804302978515625, + "step": 500 + }, + { + "epoch": 0.08896797153024912, + "eval_loss": 0.7566477656364441, + "eval_runtime": 406.3608, + "eval_samples_per_second": 12.376, + "eval_steps_per_second": 6.189, + "step": 500 + }, + { + "epoch": 0.09074733096085409, + "grad_norm": 0.2583659291267395, + "learning_rate": 1.9961366084827584e-05, + "loss": 0.7231699466705322, + "step": 510 + }, + { + "epoch": 0.09252669039145907, + "grad_norm": 0.3117372393608093, + "learning_rate": 1.9958826790487905e-05, + "loss": 0.7830834865570069, + "step": 520 + }, + { + "epoch": 0.09430604982206406, + "grad_norm": 0.26633507013320923, + "learning_rate": 1.995620685262665e-05, + "loss": 0.7565378189086914, + "step": 530 + }, + { + "epoch": 0.09608540925266904, + "grad_norm": 0.2491220384836197, + "learning_rate": 1.9953506292459275e-05, + "loss": 0.770139503479004, + "step": 540 + }, + { + "epoch": 0.09786476868327403, + "grad_norm": 0.33897528052330017, + "learning_rate": 1.9950725131854082e-05, + "loss": 0.8139609336853028, + "step": 550 + }, + { + "epoch": 0.099644128113879, + "grad_norm": 0.27655312418937683, + "learning_rate": 1.9947863393332053e-05, + "loss": 0.7632327556610108, + "step": 560 + }, + { + "epoch": 0.10142348754448399, + "grad_norm": 0.2421133816242218, + "learning_rate": 1.994492110006667e-05, + "loss": 0.7782410621643067, + "step": 570 + }, + { + "epoch": 0.10320284697508897, + "grad_norm": 0.26968705654144287, + "learning_rate": 1.994189827588372e-05, + "loss": 0.7438684940338135, + "step": 580 + }, + { + "epoch": 0.10498220640569395, + "grad_norm": 0.28697794675827026, + "learning_rate": 1.993879494526111e-05, + "loss": 0.7320491313934326, + "step": 590 + }, + { + "epoch": 0.10676156583629894, + "grad_norm": 0.327286034822464, + "learning_rate": 1.9935611133328657e-05, + "loss": 0.7248147010803223, + "step": 600 + }, + { + "epoch": 0.10854092526690391, + "grad_norm": 0.3679325580596924, + "learning_rate": 1.9932346865867885e-05, + "loss": 0.7461765766143799, + "step": 610 + }, + { + "epoch": 0.1103202846975089, + "grad_norm": 0.3118029832839966, + "learning_rate": 1.992900216931184e-05, + "loss": 0.764728593826294, + "step": 620 + }, + { + "epoch": 0.11209964412811388, + "grad_norm": 0.2683190107345581, + "learning_rate": 1.992557707074484e-05, + "loss": 0.7256179809570312, + "step": 630 + }, + { + "epoch": 0.11387900355871886, + "grad_norm": 0.32056325674057007, + "learning_rate": 1.9922071597902286e-05, + "loss": 0.7553198337554932, + "step": 640 + }, + { + "epoch": 0.11565836298932385, + "grad_norm": 0.4004197418689728, + "learning_rate": 1.9918485779170417e-05, + "loss": 0.7440505027770996, + "step": 650 + }, + { + "epoch": 0.11743772241992882, + "grad_norm": 0.3497128486633301, + "learning_rate": 1.9914819643586096e-05, + "loss": 0.7672629833221436, + "step": 660 + }, + { + "epoch": 0.11921708185053381, + "grad_norm": 0.3187069594860077, + "learning_rate": 1.9911073220836562e-05, + "loss": 0.7420164585113526, + "step": 670 + }, + { + "epoch": 0.12099644128113879, + "grad_norm": 0.29900017380714417, + "learning_rate": 1.9907246541259194e-05, + "loss": 0.7458691120147705, + "step": 680 + }, + { + "epoch": 0.12277580071174377, + "grad_norm": 0.298501193523407, + "learning_rate": 1.9903339635841274e-05, + "loss": 0.7346842765808106, + "step": 690 + }, + { + "epoch": 0.12455516014234876, + "grad_norm": 0.38077300786972046, + "learning_rate": 1.9899352536219713e-05, + "loss": 0.7980701446533203, + "step": 700 + }, + { + "epoch": 0.12633451957295375, + "grad_norm": 0.2950842082500458, + "learning_rate": 1.9895285274680826e-05, + "loss": 0.7282920837402344, + "step": 710 + }, + { + "epoch": 0.12811387900355872, + "grad_norm": 0.2746928632259369, + "learning_rate": 1.989113788416005e-05, + "loss": 0.7119527816772461, + "step": 720 + }, + { + "epoch": 0.1298932384341637, + "grad_norm": 0.30003389716148376, + "learning_rate": 1.9886910398241673e-05, + "loss": 0.7120148181915283, + "step": 730 + }, + { + "epoch": 0.13167259786476868, + "grad_norm": 0.3238595128059387, + "learning_rate": 1.9882602851158584e-05, + "loss": 0.7219894886016845, + "step": 740 + }, + { + "epoch": 0.13345195729537365, + "grad_norm": 0.4421483278274536, + "learning_rate": 1.9878215277791977e-05, + "loss": 0.6878085613250733, + "step": 750 + }, + { + "epoch": 0.13523131672597866, + "grad_norm": 0.3009251058101654, + "learning_rate": 1.9873747713671073e-05, + "loss": 0.777537488937378, + "step": 760 + }, + { + "epoch": 0.13701067615658363, + "grad_norm": 0.3649790287017822, + "learning_rate": 1.9869200194972828e-05, + "loss": 0.7608931541442872, + "step": 770 + }, + { + "epoch": 0.1387900355871886, + "grad_norm": 0.38479843735694885, + "learning_rate": 1.986457275852166e-05, + "loss": 0.7644641876220704, + "step": 780 + }, + { + "epoch": 0.14056939501779359, + "grad_norm": 0.3267346918582916, + "learning_rate": 1.9859865441789126e-05, + "loss": 0.7437977313995361, + "step": 790 + }, + { + "epoch": 0.1423487544483986, + "grad_norm": 0.3238297402858734, + "learning_rate": 1.985507828289363e-05, + "loss": 0.7596290111541748, + "step": 800 + }, + { + "epoch": 0.14412811387900357, + "grad_norm": 0.4067242443561554, + "learning_rate": 1.985021132060012e-05, + "loss": 0.723486328125, + "step": 810 + }, + { + "epoch": 0.14590747330960854, + "grad_norm": 0.3294743299484253, + "learning_rate": 1.9845264594319755e-05, + "loss": 0.8035991668701172, + "step": 820 + }, + { + "epoch": 0.14768683274021352, + "grad_norm": 0.2861204445362091, + "learning_rate": 1.9840238144109613e-05, + "loss": 0.7118996620178223, + "step": 830 + }, + { + "epoch": 0.1494661921708185, + "grad_norm": 0.3281143605709076, + "learning_rate": 1.9835132010672334e-05, + "loss": 0.7610855102539062, + "step": 840 + }, + { + "epoch": 0.1512455516014235, + "grad_norm": 0.3176390826702118, + "learning_rate": 1.982994623535583e-05, + "loss": 0.7489484310150146, + "step": 850 + }, + { + "epoch": 0.15302491103202848, + "grad_norm": 0.34338897466659546, + "learning_rate": 1.9824680860152914e-05, + "loss": 0.7180755615234375, + "step": 860 + }, + { + "epoch": 0.15480427046263345, + "grad_norm": 0.41952571272850037, + "learning_rate": 1.9819335927700975e-05, + "loss": 0.6884951591491699, + "step": 870 + }, + { + "epoch": 0.15658362989323843, + "grad_norm": 0.29155367612838745, + "learning_rate": 1.9813911481281637e-05, + "loss": 0.683270263671875, + "step": 880 + }, + { + "epoch": 0.1583629893238434, + "grad_norm": 0.3815101981163025, + "learning_rate": 1.98084075648204e-05, + "loss": 0.714734411239624, + "step": 890 + }, + { + "epoch": 0.1601423487544484, + "grad_norm": 0.364044189453125, + "learning_rate": 1.980282422288629e-05, + "loss": 0.7451518535614013, + "step": 900 + }, + { + "epoch": 0.1619217081850534, + "grad_norm": 0.4181061387062073, + "learning_rate": 1.9797161500691496e-05, + "loss": 0.7484359741210938, + "step": 910 + }, + { + "epoch": 0.16370106761565836, + "grad_norm": 0.3803650140762329, + "learning_rate": 1.9791419444091006e-05, + "loss": 0.7413453578948974, + "step": 920 + }, + { + "epoch": 0.16548042704626334, + "grad_norm": 0.3450170159339905, + "learning_rate": 1.9785598099582225e-05, + "loss": 0.732274341583252, + "step": 930 + }, + { + "epoch": 0.16725978647686832, + "grad_norm": 0.31141409277915955, + "learning_rate": 1.9779697514304624e-05, + "loss": 0.768674898147583, + "step": 940 + }, + { + "epoch": 0.16903914590747332, + "grad_norm": 0.33961221575737, + "learning_rate": 1.977371773603932e-05, + "loss": 0.7539153099060059, + "step": 950 + }, + { + "epoch": 0.1708185053380783, + "grad_norm": 0.3468526303768158, + "learning_rate": 1.9767658813208725e-05, + "loss": 0.6995216369628906, + "step": 960 + }, + { + "epoch": 0.17259786476868327, + "grad_norm": 0.3776821196079254, + "learning_rate": 1.976152079487614e-05, + "loss": 0.7344133853912354, + "step": 970 + }, + { + "epoch": 0.17437722419928825, + "grad_norm": 0.43669673800468445, + "learning_rate": 1.9755303730745344e-05, + "loss": 0.7037209510803223, + "step": 980 + }, + { + "epoch": 0.17615658362989323, + "grad_norm": 0.43197065591812134, + "learning_rate": 1.9749007671160223e-05, + "loss": 0.7632620334625244, + "step": 990 + }, + { + "epoch": 0.17793594306049823, + "grad_norm": 0.36352500319480896, + "learning_rate": 1.9742632667104332e-05, + "loss": 0.7493629455566406, + "step": 1000 + }, + { + "epoch": 0.17793594306049823, + "eval_loss": 0.7127183079719543, + "eval_runtime": 407.5587, + "eval_samples_per_second": 12.339, + "eval_steps_per_second": 6.171, + "step": 1000 + }, + { + "epoch": 0.1797153024911032, + "grad_norm": 0.4177298843860626, + "learning_rate": 1.9736178770200492e-05, + "loss": 0.7822850227355957, + "step": 1010 + }, + { + "epoch": 0.18149466192170818, + "grad_norm": 0.4624420404434204, + "learning_rate": 1.972964603271038e-05, + "loss": 0.7450732707977294, + "step": 1020 + }, + { + "epoch": 0.18327402135231316, + "grad_norm": 0.35002920031547546, + "learning_rate": 1.97230345075341e-05, + "loss": 0.7509373188018799, + "step": 1030 + }, + { + "epoch": 0.18505338078291814, + "grad_norm": 0.4136241674423218, + "learning_rate": 1.9716344248209754e-05, + "loss": 0.6727419853210449, + "step": 1040 + }, + { + "epoch": 0.18683274021352314, + "grad_norm": 0.41807156801223755, + "learning_rate": 1.9709575308913004e-05, + "loss": 0.7258425712585449, + "step": 1050 + }, + { + "epoch": 0.18861209964412812, + "grad_norm": 0.4383244514465332, + "learning_rate": 1.9702727744456645e-05, + "loss": 0.7470430374145508, + "step": 1060 + }, + { + "epoch": 0.1903914590747331, + "grad_norm": 0.44305190443992615, + "learning_rate": 1.969580161029015e-05, + "loss": 0.6925637722015381, + "step": 1070 + }, + { + "epoch": 0.19217081850533807, + "grad_norm": 0.45914319157600403, + "learning_rate": 1.9688796962499228e-05, + "loss": 0.7320804595947266, + "step": 1080 + }, + { + "epoch": 0.19395017793594305, + "grad_norm": 0.37220245599746704, + "learning_rate": 1.9681713857805367e-05, + "loss": 0.6863605499267578, + "step": 1090 + }, + { + "epoch": 0.19572953736654805, + "grad_norm": 0.473320871591568, + "learning_rate": 1.9674552353565374e-05, + "loss": 0.7069521427154541, + "step": 1100 + }, + { + "epoch": 0.19750889679715303, + "grad_norm": 0.34435564279556274, + "learning_rate": 1.9667312507770905e-05, + "loss": 0.7488323211669922, + "step": 1110 + }, + { + "epoch": 0.199288256227758, + "grad_norm": 0.4495692551136017, + "learning_rate": 1.9659994379048015e-05, + "loss": 0.7306941032409668, + "step": 1120 + }, + { + "epoch": 0.20106761565836298, + "grad_norm": 0.37990689277648926, + "learning_rate": 1.9652598026656666e-05, + "loss": 0.6781065464019775, + "step": 1130 + }, + { + "epoch": 0.20284697508896798, + "grad_norm": 0.374970942735672, + "learning_rate": 1.9645123510490242e-05, + "loss": 0.732900333404541, + "step": 1140 + }, + { + "epoch": 0.20462633451957296, + "grad_norm": 0.42331087589263916, + "learning_rate": 1.963757089107508e-05, + "loss": 0.7150296211242676, + "step": 1150 + }, + { + "epoch": 0.20640569395017794, + "grad_norm": 0.39357990026474, + "learning_rate": 1.962994022956998e-05, + "loss": 0.70610032081604, + "step": 1160 + }, + { + "epoch": 0.20818505338078291, + "grad_norm": 0.3616408407688141, + "learning_rate": 1.9622231587765688e-05, + "loss": 0.6808771610260009, + "step": 1170 + }, + { + "epoch": 0.2099644128113879, + "grad_norm": 0.4579455554485321, + "learning_rate": 1.9614445028084424e-05, + "loss": 0.7024923801422119, + "step": 1180 + }, + { + "epoch": 0.2117437722419929, + "grad_norm": 0.47354868054389954, + "learning_rate": 1.9606580613579352e-05, + "loss": 0.7523046493530273, + "step": 1190 + }, + { + "epoch": 0.21352313167259787, + "grad_norm": 0.3599710166454315, + "learning_rate": 1.9598638407934096e-05, + "loss": 0.7396236419677734, + "step": 1200 + }, + { + "epoch": 0.21530249110320285, + "grad_norm": 0.37311187386512756, + "learning_rate": 1.959061847546219e-05, + "loss": 0.6676182746887207, + "step": 1210 + }, + { + "epoch": 0.21708185053380782, + "grad_norm": 0.4571494460105896, + "learning_rate": 1.9582520881106585e-05, + "loss": 0.7176971435546875, + "step": 1220 + }, + { + "epoch": 0.2188612099644128, + "grad_norm": 0.39556118845939636, + "learning_rate": 1.9574345690439113e-05, + "loss": 0.6899125576019287, + "step": 1230 + }, + { + "epoch": 0.2206405693950178, + "grad_norm": 0.4152670204639435, + "learning_rate": 1.9566092969659964e-05, + "loss": 0.7056239128112793, + "step": 1240 + }, + { + "epoch": 0.22241992882562278, + "grad_norm": 0.5533052086830139, + "learning_rate": 1.9557762785597133e-05, + "loss": 0.7325549602508545, + "step": 1250 + }, + { + "epoch": 0.22419928825622776, + "grad_norm": 0.457738995552063, + "learning_rate": 1.9549355205705895e-05, + "loss": 0.7097938060760498, + "step": 1260 + }, + { + "epoch": 0.22597864768683273, + "grad_norm": 0.4812074303627014, + "learning_rate": 1.9540870298068247e-05, + "loss": 0.6999053478240966, + "step": 1270 + }, + { + "epoch": 0.2277580071174377, + "grad_norm": 0.42034661769866943, + "learning_rate": 1.9532308131392365e-05, + "loss": 0.7124747276306153, + "step": 1280 + }, + { + "epoch": 0.22953736654804271, + "grad_norm": 0.44799792766571045, + "learning_rate": 1.9523668775012053e-05, + "loss": 0.7096034049987793, + "step": 1290 + }, + { + "epoch": 0.2313167259786477, + "grad_norm": 0.4658997654914856, + "learning_rate": 1.9514952298886157e-05, + "loss": 0.7080921649932861, + "step": 1300 + }, + { + "epoch": 0.23309608540925267, + "grad_norm": 0.5269479751586914, + "learning_rate": 1.9506158773598035e-05, + "loss": 0.6962251663208008, + "step": 1310 + }, + { + "epoch": 0.23487544483985764, + "grad_norm": 0.45521607995033264, + "learning_rate": 1.9497288270354944e-05, + "loss": 0.7323726177215576, + "step": 1320 + }, + { + "epoch": 0.23665480427046262, + "grad_norm": 0.4322509169578552, + "learning_rate": 1.9488340860987504e-05, + "loss": 0.7227589607238769, + "step": 1330 + }, + { + "epoch": 0.23843416370106763, + "grad_norm": 0.47730758786201477, + "learning_rate": 1.9479316617949084e-05, + "loss": 0.702051305770874, + "step": 1340 + }, + { + "epoch": 0.2402135231316726, + "grad_norm": 0.42763814330101013, + "learning_rate": 1.9470215614315232e-05, + "loss": 0.7380130767822266, + "step": 1350 + }, + { + "epoch": 0.24199288256227758, + "grad_norm": 0.4630064070224762, + "learning_rate": 1.9461037923783087e-05, + "loss": 0.7470481395721436, + "step": 1360 + }, + { + "epoch": 0.24377224199288255, + "grad_norm": 0.367767870426178, + "learning_rate": 1.9451783620670767e-05, + "loss": 0.7009376049041748, + "step": 1370 + }, + { + "epoch": 0.24555160142348753, + "grad_norm": 0.38925161957740784, + "learning_rate": 1.9442452779916775e-05, + "loss": 0.7043869972229004, + "step": 1380 + }, + { + "epoch": 0.24733096085409254, + "grad_norm": 0.37959718704223633, + "learning_rate": 1.943304547707939e-05, + "loss": 0.7024062633514404, + "step": 1390 + }, + { + "epoch": 0.2491103202846975, + "grad_norm": 0.3639119267463684, + "learning_rate": 1.9423561788336073e-05, + "loss": 0.7025011539459228, + "step": 1400 + }, + { + "epoch": 0.2508896797153025, + "grad_norm": 0.5123258233070374, + "learning_rate": 1.9414001790482815e-05, + "loss": 0.7173181533813476, + "step": 1410 + }, + { + "epoch": 0.2526690391459075, + "grad_norm": 0.5145444273948669, + "learning_rate": 1.940436556093355e-05, + "loss": 0.6975203514099121, + "step": 1420 + }, + { + "epoch": 0.25444839857651247, + "grad_norm": 0.41073641180992126, + "learning_rate": 1.93946531777195e-05, + "loss": 0.7116940021514893, + "step": 1430 + }, + { + "epoch": 0.25622775800711745, + "grad_norm": 0.5241482853889465, + "learning_rate": 1.9384864719488562e-05, + "loss": 0.7322525024414063, + "step": 1440 + }, + { + "epoch": 0.2580071174377224, + "grad_norm": 0.41522547602653503, + "learning_rate": 1.9375000265504673e-05, + "loss": 0.7099958419799804, + "step": 1450 + }, + { + "epoch": 0.2597864768683274, + "grad_norm": 0.4306512773036957, + "learning_rate": 1.9365059895647146e-05, + "loss": 0.6963861465454102, + "step": 1460 + }, + { + "epoch": 0.2615658362989324, + "grad_norm": 0.4515567421913147, + "learning_rate": 1.935504369041004e-05, + "loss": 0.7317886829376221, + "step": 1470 + }, + { + "epoch": 0.26334519572953735, + "grad_norm": 0.46133843064308167, + "learning_rate": 1.9344951730901523e-05, + "loss": 0.6936720371246338, + "step": 1480 + }, + { + "epoch": 0.26512455516014233, + "grad_norm": 0.4332071840763092, + "learning_rate": 1.933478409884317e-05, + "loss": 0.7092292308807373, + "step": 1490 + }, + { + "epoch": 0.2669039145907473, + "grad_norm": 0.44672346115112305, + "learning_rate": 1.9324540876569356e-05, + "loss": 0.7138745784759521, + "step": 1500 + }, + { + "epoch": 0.2669039145907473, + "eval_loss": 0.6928849816322327, + "eval_runtime": 409.2608, + "eval_samples_per_second": 12.288, + "eval_steps_per_second": 6.145, + "step": 1500 + }, + { + "epoch": 0.26868327402135234, + "grad_norm": 0.4207611680030823, + "learning_rate": 1.9314222147026538e-05, + "loss": 0.682267141342163, + "step": 1510 + }, + { + "epoch": 0.2704626334519573, + "grad_norm": 0.4715620279312134, + "learning_rate": 1.9303827993772627e-05, + "loss": 0.6928095340728759, + "step": 1520 + }, + { + "epoch": 0.2722419928825623, + "grad_norm": 0.5061793327331543, + "learning_rate": 1.9293358500976284e-05, + "loss": 0.7393241405487061, + "step": 1530 + }, + { + "epoch": 0.27402135231316727, + "grad_norm": 0.46995067596435547, + "learning_rate": 1.9282813753416247e-05, + "loss": 0.7240311622619628, + "step": 1540 + }, + { + "epoch": 0.27580071174377224, + "grad_norm": 0.35846948623657227, + "learning_rate": 1.927219383648064e-05, + "loss": 0.6961266994476318, + "step": 1550 + }, + { + "epoch": 0.2775800711743772, + "grad_norm": 0.4667574465274811, + "learning_rate": 1.9261498836166297e-05, + "loss": 0.7295796871185303, + "step": 1560 + }, + { + "epoch": 0.2793594306049822, + "grad_norm": 0.5029377341270447, + "learning_rate": 1.9250728839078043e-05, + "loss": 0.7046424865722656, + "step": 1570 + }, + { + "epoch": 0.28113879003558717, + "grad_norm": 0.46402570605278015, + "learning_rate": 1.9239883932428002e-05, + "loss": 0.7341272830963135, + "step": 1580 + }, + { + "epoch": 0.28291814946619215, + "grad_norm": 0.45273569226264954, + "learning_rate": 1.9228964204034906e-05, + "loss": 0.7231084823608398, + "step": 1590 + }, + { + "epoch": 0.2846975088967972, + "grad_norm": 0.44853758811950684, + "learning_rate": 1.9217969742323358e-05, + "loss": 0.7228631019592285, + "step": 1600 + }, + { + "epoch": 0.28647686832740216, + "grad_norm": 0.4073372483253479, + "learning_rate": 1.9206900636323138e-05, + "loss": 0.7120136737823486, + "step": 1610 + }, + { + "epoch": 0.28825622775800713, + "grad_norm": 0.4837400019168854, + "learning_rate": 1.9195756975668463e-05, + "loss": 0.7246061325073242, + "step": 1620 + }, + { + "epoch": 0.2900355871886121, + "grad_norm": 0.48374027013778687, + "learning_rate": 1.918453885059728e-05, + "loss": 0.6975275039672851, + "step": 1630 + }, + { + "epoch": 0.2918149466192171, + "grad_norm": 0.40771543979644775, + "learning_rate": 1.9173246351950515e-05, + "loss": 0.7376579284667969, + "step": 1640 + }, + { + "epoch": 0.29359430604982206, + "grad_norm": 0.47605007886886597, + "learning_rate": 1.916187957117136e-05, + "loss": 0.7509649753570556, + "step": 1650 + }, + { + "epoch": 0.29537366548042704, + "grad_norm": 0.4246625006198883, + "learning_rate": 1.9150438600304514e-05, + "loss": 0.7086214065551758, + "step": 1660 + }, + { + "epoch": 0.297153024911032, + "grad_norm": 0.4986574053764343, + "learning_rate": 1.9138923531995448e-05, + "loss": 0.6780657768249512, + "step": 1670 + }, + { + "epoch": 0.298932384341637, + "grad_norm": 0.5241477489471436, + "learning_rate": 1.912733445948965e-05, + "loss": 0.7178135395050049, + "step": 1680 + }, + { + "epoch": 0.30071174377224197, + "grad_norm": 0.48822927474975586, + "learning_rate": 1.9115671476631865e-05, + "loss": 0.6914261817932129, + "step": 1690 + }, + { + "epoch": 0.302491103202847, + "grad_norm": 0.4603191018104553, + "learning_rate": 1.910393467786535e-05, + "loss": 0.7189798355102539, + "step": 1700 + }, + { + "epoch": 0.304270462633452, + "grad_norm": 0.5957316160202026, + "learning_rate": 1.90921241582311e-05, + "loss": 0.6929316520690918, + "step": 1710 + }, + { + "epoch": 0.30604982206405695, + "grad_norm": 0.4563639163970947, + "learning_rate": 1.9080240013367075e-05, + "loss": 0.6874090194702148, + "step": 1720 + }, + { + "epoch": 0.30782918149466193, + "grad_norm": 0.4933296740055084, + "learning_rate": 1.9068282339507433e-05, + "loss": 0.7047487258911133, + "step": 1730 + }, + { + "epoch": 0.3096085409252669, + "grad_norm": 0.46199893951416016, + "learning_rate": 1.9056251233481747e-05, + "loss": 0.6585260391235351, + "step": 1740 + }, + { + "epoch": 0.3113879003558719, + "grad_norm": 0.4224775433540344, + "learning_rate": 1.904414679271421e-05, + "loss": 0.7062453269958496, + "step": 1750 + }, + { + "epoch": 0.31316725978647686, + "grad_norm": 0.5592194199562073, + "learning_rate": 1.9031969115222876e-05, + "loss": 0.7372538089752197, + "step": 1760 + }, + { + "epoch": 0.31494661921708184, + "grad_norm": 0.447518527507782, + "learning_rate": 1.9019718299618836e-05, + "loss": 0.6815076351165772, + "step": 1770 + }, + { + "epoch": 0.3167259786476868, + "grad_norm": 0.4872969686985016, + "learning_rate": 1.9007394445105433e-05, + "loss": 0.6925329685211181, + "step": 1780 + }, + { + "epoch": 0.3185053380782918, + "grad_norm": 0.5796515941619873, + "learning_rate": 1.8994997651477457e-05, + "loss": 0.6911862850189209, + "step": 1790 + }, + { + "epoch": 0.3202846975088968, + "grad_norm": 0.45499077439308167, + "learning_rate": 1.8982528019120335e-05, + "loss": 0.689606761932373, + "step": 1800 + }, + { + "epoch": 0.3220640569395018, + "grad_norm": 0.5108110308647156, + "learning_rate": 1.8969985649009325e-05, + "loss": 0.698862886428833, + "step": 1810 + }, + { + "epoch": 0.3238434163701068, + "grad_norm": 0.5635261535644531, + "learning_rate": 1.8957370642708682e-05, + "loss": 0.7093265533447266, + "step": 1820 + }, + { + "epoch": 0.32562277580071175, + "grad_norm": 0.5035433769226074, + "learning_rate": 1.8944683102370862e-05, + "loss": 0.6829179763793946, + "step": 1830 + }, + { + "epoch": 0.3274021352313167, + "grad_norm": 0.5030480623245239, + "learning_rate": 1.8931923130735667e-05, + "loss": 0.6931506633758545, + "step": 1840 + }, + { + "epoch": 0.3291814946619217, + "grad_norm": 0.42200684547424316, + "learning_rate": 1.891909083112943e-05, + "loss": 0.7182873725891114, + "step": 1850 + }, + { + "epoch": 0.3309608540925267, + "grad_norm": 0.5390200614929199, + "learning_rate": 1.8906186307464168e-05, + "loss": 0.7040542602539063, + "step": 1860 + }, + { + "epoch": 0.33274021352313166, + "grad_norm": 0.4840771555900574, + "learning_rate": 1.889320966423676e-05, + "loss": 0.71949143409729, + "step": 1870 + }, + { + "epoch": 0.33451957295373663, + "grad_norm": 0.46363523602485657, + "learning_rate": 1.8880161006528075e-05, + "loss": 0.7095215797424317, + "step": 1880 + }, + { + "epoch": 0.33629893238434166, + "grad_norm": 0.5143831372261047, + "learning_rate": 1.8867040440002137e-05, + "loss": 0.6844244003295898, + "step": 1890 + }, + { + "epoch": 0.33807829181494664, + "grad_norm": 0.42344361543655396, + "learning_rate": 1.8853848070905264e-05, + "loss": 0.7266733169555664, + "step": 1900 + }, + { + "epoch": 0.3398576512455516, + "grad_norm": 0.5283271074295044, + "learning_rate": 1.884058400606521e-05, + "loss": 0.6759650707244873, + "step": 1910 + }, + { + "epoch": 0.3416370106761566, + "grad_norm": 0.6041154861450195, + "learning_rate": 1.88272483528903e-05, + "loss": 0.7138604164123535, + "step": 1920 + }, + { + "epoch": 0.34341637010676157, + "grad_norm": 0.4607132077217102, + "learning_rate": 1.8813841219368562e-05, + "loss": 0.7108243465423584, + "step": 1930 + }, + { + "epoch": 0.34519572953736655, + "grad_norm": 0.4770022928714752, + "learning_rate": 1.880036271406684e-05, + "loss": 0.700477123260498, + "step": 1940 + }, + { + "epoch": 0.3469750889679715, + "grad_norm": 0.5347304940223694, + "learning_rate": 1.8786812946129934e-05, + "loss": 0.7176999092102051, + "step": 1950 + }, + { + "epoch": 0.3487544483985765, + "grad_norm": 0.4638780355453491, + "learning_rate": 1.8773192025279712e-05, + "loss": 0.6859623908996582, + "step": 1960 + }, + { + "epoch": 0.3505338078291815, + "grad_norm": 0.4621883034706116, + "learning_rate": 1.87595000618142e-05, + "loss": 0.6809545516967773, + "step": 1970 + }, + { + "epoch": 0.35231316725978645, + "grad_norm": 0.4492979943752289, + "learning_rate": 1.8745737166606716e-05, + "loss": 0.7484791278839111, + "step": 1980 + }, + { + "epoch": 0.3540925266903915, + "grad_norm": 0.5058820843696594, + "learning_rate": 1.873190345110496e-05, + "loss": 0.6860589027404785, + "step": 1990 + }, + { + "epoch": 0.35587188612099646, + "grad_norm": 0.467847615480423, + "learning_rate": 1.8717999027330114e-05, + "loss": 0.6946381092071533, + "step": 2000 + }, + { + "epoch": 0.35587188612099646, + "eval_loss": 0.6786053776741028, + "eval_runtime": 408.2053, + "eval_samples_per_second": 12.32, + "eval_steps_per_second": 6.161, + "step": 2000 + }, + { + "epoch": 0.35765124555160144, + "grad_norm": 0.4924924373626709, + "learning_rate": 1.870402400787593e-05, + "loss": 0.7328177452087402, + "step": 2010 + }, + { + "epoch": 0.3594306049822064, + "grad_norm": 0.529029369354248, + "learning_rate": 1.8689978505907828e-05, + "loss": 0.6788232803344727, + "step": 2020 + }, + { + "epoch": 0.3612099644128114, + "grad_norm": 0.5123940110206604, + "learning_rate": 1.8675862635161968e-05, + "loss": 0.6493151664733887, + "step": 2030 + }, + { + "epoch": 0.36298932384341637, + "grad_norm": 0.4411087930202484, + "learning_rate": 1.866167650994434e-05, + "loss": 0.6652609825134277, + "step": 2040 + }, + { + "epoch": 0.36476868327402134, + "grad_norm": 0.5237643122673035, + "learning_rate": 1.8647420245129822e-05, + "loss": 0.6707428932189942, + "step": 2050 + }, + { + "epoch": 0.3665480427046263, + "grad_norm": 0.5926516056060791, + "learning_rate": 1.863309395616128e-05, + "loss": 0.7144505023956299, + "step": 2060 + }, + { + "epoch": 0.3683274021352313, + "grad_norm": 0.5331065058708191, + "learning_rate": 1.86186977590486e-05, + "loss": 0.6472876071929932, + "step": 2070 + }, + { + "epoch": 0.3701067615658363, + "grad_norm": 0.6008846163749695, + "learning_rate": 1.860423177036776e-05, + "loss": 0.6628296852111817, + "step": 2080 + }, + { + "epoch": 0.3718861209964413, + "grad_norm": 0.49397945404052734, + "learning_rate": 1.85896961072599e-05, + "loss": 0.7032846927642822, + "step": 2090 + }, + { + "epoch": 0.3736654804270463, + "grad_norm": 0.5322824716567993, + "learning_rate": 1.8575090887430354e-05, + "loss": 0.6903145790100098, + "step": 2100 + }, + { + "epoch": 0.37544483985765126, + "grad_norm": 0.45118311047554016, + "learning_rate": 1.8560416229147718e-05, + "loss": 0.7186954975128174, + "step": 2110 + }, + { + "epoch": 0.37722419928825623, + "grad_norm": 0.47973230481147766, + "learning_rate": 1.8545672251242855e-05, + "loss": 0.677478837966919, + "step": 2120 + }, + { + "epoch": 0.3790035587188612, + "grad_norm": 0.5959491729736328, + "learning_rate": 1.8530859073107973e-05, + "loss": 0.7285795211791992, + "step": 2130 + }, + { + "epoch": 0.3807829181494662, + "grad_norm": 0.5627938508987427, + "learning_rate": 1.851597681469565e-05, + "loss": 0.6971286773681641, + "step": 2140 + }, + { + "epoch": 0.38256227758007116, + "grad_norm": 0.5175175666809082, + "learning_rate": 1.850102559651784e-05, + "loss": 0.7243900775909424, + "step": 2150 + }, + { + "epoch": 0.38434163701067614, + "grad_norm": 0.5537509918212891, + "learning_rate": 1.848600553964491e-05, + "loss": 0.6653794765472412, + "step": 2160 + }, + { + "epoch": 0.3861209964412811, + "grad_norm": 0.6098501086235046, + "learning_rate": 1.847091676570468e-05, + "loss": 0.6803691864013672, + "step": 2170 + }, + { + "epoch": 0.3879003558718861, + "grad_norm": 0.5026776790618896, + "learning_rate": 1.8455759396881402e-05, + "loss": 0.7138524055480957, + "step": 2180 + }, + { + "epoch": 0.3896797153024911, + "grad_norm": 0.5244454741477966, + "learning_rate": 1.8440533555914795e-05, + "loss": 0.7204444885253907, + "step": 2190 + }, + { + "epoch": 0.3914590747330961, + "grad_norm": 0.5315225124359131, + "learning_rate": 1.842523936609905e-05, + "loss": 0.7181321144104004, + "step": 2200 + }, + { + "epoch": 0.3932384341637011, + "grad_norm": 0.5039063096046448, + "learning_rate": 1.8409876951281814e-05, + "loss": 0.7282841205596924, + "step": 2210 + }, + { + "epoch": 0.39501779359430605, + "grad_norm": 0.47619393467903137, + "learning_rate": 1.8394446435863206e-05, + "loss": 0.7016836643218994, + "step": 2220 + }, + { + "epoch": 0.39679715302491103, + "grad_norm": 0.5733043551445007, + "learning_rate": 1.8378947944794806e-05, + "loss": 0.6978562355041504, + "step": 2230 + }, + { + "epoch": 0.398576512455516, + "grad_norm": 0.5004534125328064, + "learning_rate": 1.8363381603578628e-05, + "loss": 0.6713067531585694, + "step": 2240 + }, + { + "epoch": 0.400355871886121, + "grad_norm": 0.6110662817955017, + "learning_rate": 1.8347747538266133e-05, + "loss": 0.6862231254577636, + "step": 2250 + }, + { + "epoch": 0.40213523131672596, + "grad_norm": 0.5308374166488647, + "learning_rate": 1.8332045875457174e-05, + "loss": 0.6602601528167724, + "step": 2260 + }, + { + "epoch": 0.40391459074733094, + "grad_norm": 0.6257572174072266, + "learning_rate": 1.8316276742299e-05, + "loss": 0.6580804347991943, + "step": 2270 + }, + { + "epoch": 0.40569395017793597, + "grad_norm": 0.6403529047966003, + "learning_rate": 1.830044026648521e-05, + "loss": 0.6985883712768555, + "step": 2280 + }, + { + "epoch": 0.40747330960854095, + "grad_norm": 0.5609093308448792, + "learning_rate": 1.828453657625472e-05, + "loss": 0.7306273937225342, + "step": 2290 + }, + { + "epoch": 0.4092526690391459, + "grad_norm": 0.5089350342750549, + "learning_rate": 1.8268565800390733e-05, + "loss": 0.6552363395690918, + "step": 2300 + }, + { + "epoch": 0.4110320284697509, + "grad_norm": 0.5988994240760803, + "learning_rate": 1.8252528068219683e-05, + "loss": 0.6836632251739502, + "step": 2310 + }, + { + "epoch": 0.4128113879003559, + "grad_norm": 0.558757483959198, + "learning_rate": 1.8236423509610207e-05, + "loss": 0.6945361137390137, + "step": 2320 + }, + { + "epoch": 0.41459074733096085, + "grad_norm": 0.5903414487838745, + "learning_rate": 1.8220252254972077e-05, + "loss": 0.6987195014953613, + "step": 2330 + }, + { + "epoch": 0.41637010676156583, + "grad_norm": 0.5187668800354004, + "learning_rate": 1.8204014435255136e-05, + "loss": 0.7381620407104492, + "step": 2340 + }, + { + "epoch": 0.4181494661921708, + "grad_norm": 0.5883176922798157, + "learning_rate": 1.8187710181948274e-05, + "loss": 0.6803225994110107, + "step": 2350 + }, + { + "epoch": 0.4199288256227758, + "grad_norm": 0.5465121269226074, + "learning_rate": 1.817133962707833e-05, + "loss": 0.6955878257751464, + "step": 2360 + }, + { + "epoch": 0.42170818505338076, + "grad_norm": 0.5035154223442078, + "learning_rate": 1.815490290320902e-05, + "loss": 0.6446046829223633, + "step": 2370 + }, + { + "epoch": 0.4234875444839858, + "grad_norm": 0.561638593673706, + "learning_rate": 1.8138400143439892e-05, + "loss": 0.6994437217712403, + "step": 2380 + }, + { + "epoch": 0.42526690391459077, + "grad_norm": 0.547978937625885, + "learning_rate": 1.812183148140523e-05, + "loss": 0.6748724937438965, + "step": 2390 + }, + { + "epoch": 0.42704626334519574, + "grad_norm": 0.7230744957923889, + "learning_rate": 1.8105197051272974e-05, + "loss": 0.6960064888000488, + "step": 2400 + }, + { + "epoch": 0.4288256227758007, + "grad_norm": 0.6158634424209595, + "learning_rate": 1.8088496987743623e-05, + "loss": 0.6599089622497558, + "step": 2410 + }, + { + "epoch": 0.4306049822064057, + "grad_norm": 0.5255537629127502, + "learning_rate": 1.807173142604917e-05, + "loss": 0.7354787349700928, + "step": 2420 + }, + { + "epoch": 0.43238434163701067, + "grad_norm": 0.5891067385673523, + "learning_rate": 1.8054900501951988e-05, + "loss": 0.6725128650665283, + "step": 2430 + }, + { + "epoch": 0.43416370106761565, + "grad_norm": 0.6183168888092041, + "learning_rate": 1.8038004351743726e-05, + "loss": 0.6672306537628174, + "step": 2440 + }, + { + "epoch": 0.4359430604982206, + "grad_norm": 0.5241208672523499, + "learning_rate": 1.8021043112244222e-05, + "loss": 0.6435032367706299, + "step": 2450 + }, + { + "epoch": 0.4377224199288256, + "grad_norm": 0.47406890988349915, + "learning_rate": 1.8004016920800392e-05, + "loss": 0.7117865085601807, + "step": 2460 + }, + { + "epoch": 0.4395017793594306, + "grad_norm": 0.45818889141082764, + "learning_rate": 1.7986925915285098e-05, + "loss": 0.7240960597991943, + "step": 2470 + }, + { + "epoch": 0.4412811387900356, + "grad_norm": 0.5167953372001648, + "learning_rate": 1.796977023409606e-05, + "loss": 0.6828316211700439, + "step": 2480 + }, + { + "epoch": 0.4430604982206406, + "grad_norm": 0.48911231756210327, + "learning_rate": 1.795255001615472e-05, + "loss": 0.6907057285308837, + "step": 2490 + }, + { + "epoch": 0.44483985765124556, + "grad_norm": 0.4384164810180664, + "learning_rate": 1.7935265400905107e-05, + "loss": 0.6647283554077148, + "step": 2500 + }, + { + "epoch": 0.44483985765124556, + "eval_loss": 0.667682409286499, + "eval_runtime": 408.4893, + "eval_samples_per_second": 12.311, + "eval_steps_per_second": 6.157, + "step": 2500 + }, + { + "epoch": 0.44661921708185054, + "grad_norm": 0.5545853972434998, + "learning_rate": 1.7917916528312735e-05, + "loss": 0.6821264743804931, + "step": 2510 + }, + { + "epoch": 0.4483985765124555, + "grad_norm": 0.5834780335426331, + "learning_rate": 1.7900503538863446e-05, + "loss": 0.6818623542785645, + "step": 2520 + }, + { + "epoch": 0.4501779359430605, + "grad_norm": 0.6038497090339661, + "learning_rate": 1.7883026573562278e-05, + "loss": 0.6842663764953614, + "step": 2530 + }, + { + "epoch": 0.45195729537366547, + "grad_norm": 0.5673312544822693, + "learning_rate": 1.7865485773932336e-05, + "loss": 0.6788026809692382, + "step": 2540 + }, + { + "epoch": 0.45373665480427045, + "grad_norm": 0.6115579605102539, + "learning_rate": 1.7847881282013623e-05, + "loss": 0.724776029586792, + "step": 2550 + }, + { + "epoch": 0.4555160142348754, + "grad_norm": 0.6251904964447021, + "learning_rate": 1.7830213240361916e-05, + "loss": 0.6907171249389649, + "step": 2560 + }, + { + "epoch": 0.45729537366548045, + "grad_norm": 0.5565618872642517, + "learning_rate": 1.7812481792047587e-05, + "loss": 0.6535940647125245, + "step": 2570 + }, + { + "epoch": 0.45907473309608543, + "grad_norm": 0.5508270263671875, + "learning_rate": 1.7794687080654462e-05, + "loss": 0.6779204845428467, + "step": 2580 + }, + { + "epoch": 0.4608540925266904, + "grad_norm": 0.6121755242347717, + "learning_rate": 1.777682925027865e-05, + "loss": 0.6833740234375, + "step": 2590 + }, + { + "epoch": 0.4626334519572954, + "grad_norm": 0.5710318684577942, + "learning_rate": 1.7758908445527376e-05, + "loss": 0.6820470333099365, + "step": 2600 + }, + { + "epoch": 0.46441281138790036, + "grad_norm": 0.624721884727478, + "learning_rate": 1.774092481151782e-05, + "loss": 0.7131676197052002, + "step": 2610 + }, + { + "epoch": 0.46619217081850534, + "grad_norm": 0.5539907813072205, + "learning_rate": 1.7722878493875922e-05, + "loss": 0.6834908962249756, + "step": 2620 + }, + { + "epoch": 0.4679715302491103, + "grad_norm": 0.7775362730026245, + "learning_rate": 1.7704769638735225e-05, + "loss": 0.7060842990875245, + "step": 2630 + }, + { + "epoch": 0.4697508896797153, + "grad_norm": 0.5198112726211548, + "learning_rate": 1.7686598392735678e-05, + "loss": 0.6912449836730957, + "step": 2640 + }, + { + "epoch": 0.47153024911032027, + "grad_norm": 0.5231760144233704, + "learning_rate": 1.766836490302245e-05, + "loss": 0.7053235054016114, + "step": 2650 + }, + { + "epoch": 0.47330960854092524, + "grad_norm": 0.7802064418792725, + "learning_rate": 1.7650069317244743e-05, + "loss": 0.6698042392730713, + "step": 2660 + }, + { + "epoch": 0.4750889679715303, + "grad_norm": 0.5244280099868774, + "learning_rate": 1.763171178355459e-05, + "loss": 0.6572854042053222, + "step": 2670 + }, + { + "epoch": 0.47686832740213525, + "grad_norm": 0.6323163509368896, + "learning_rate": 1.761329245060567e-05, + "loss": 0.6909306526184082, + "step": 2680 + }, + { + "epoch": 0.4786476868327402, + "grad_norm": 0.636985182762146, + "learning_rate": 1.7594811467552076e-05, + "loss": 0.7432591915130615, + "step": 2690 + }, + { + "epoch": 0.4804270462633452, + "grad_norm": 0.5935755372047424, + "learning_rate": 1.7576268984047146e-05, + "loss": 0.6589434146881104, + "step": 2700 + }, + { + "epoch": 0.4822064056939502, + "grad_norm": 0.5581539869308472, + "learning_rate": 1.7557665150242205e-05, + "loss": 0.6235795974731445, + "step": 2710 + }, + { + "epoch": 0.48398576512455516, + "grad_norm": 0.5456061363220215, + "learning_rate": 1.7539000116785402e-05, + "loss": 0.6965986251831054, + "step": 2720 + }, + { + "epoch": 0.48576512455516013, + "grad_norm": 0.6723548769950867, + "learning_rate": 1.752027403482043e-05, + "loss": 0.6792643070220947, + "step": 2730 + }, + { + "epoch": 0.4875444839857651, + "grad_norm": 0.5704858899116516, + "learning_rate": 1.7501487055985364e-05, + "loss": 0.6697192192077637, + "step": 2740 + }, + { + "epoch": 0.4893238434163701, + "grad_norm": 0.559579074382782, + "learning_rate": 1.7482639332411386e-05, + "loss": 0.6839393615722656, + "step": 2750 + }, + { + "epoch": 0.49110320284697506, + "grad_norm": 0.6328978538513184, + "learning_rate": 1.7463731016721574e-05, + "loss": 0.6757444858551025, + "step": 2760 + }, + { + "epoch": 0.4928825622775801, + "grad_norm": 0.5843029022216797, + "learning_rate": 1.744476226202966e-05, + "loss": 0.6616747379302979, + "step": 2770 + }, + { + "epoch": 0.49466192170818507, + "grad_norm": 0.593262791633606, + "learning_rate": 1.7425733221938802e-05, + "loss": 0.6773699283599853, + "step": 2780 + }, + { + "epoch": 0.49644128113879005, + "grad_norm": 0.4597082734107971, + "learning_rate": 1.740664405054032e-05, + "loss": 0.6922731876373291, + "step": 2790 + }, + { + "epoch": 0.498220640569395, + "grad_norm": 0.5115553140640259, + "learning_rate": 1.7387494902412462e-05, + "loss": 0.6650360107421875, + "step": 2800 + }, + { + "epoch": 0.5, + "grad_norm": 0.5256341695785522, + "learning_rate": 1.7368285932619152e-05, + "loss": 0.6744340896606446, + "step": 2810 + }, + { + "epoch": 0.501779359430605, + "grad_norm": 0.5476647019386292, + "learning_rate": 1.734901729670873e-05, + "loss": 0.6778338432312012, + "step": 2820 + }, + { + "epoch": 0.50355871886121, + "grad_norm": 0.5267303586006165, + "learning_rate": 1.7329689150712692e-05, + "loss": 0.6595091342926025, + "step": 2830 + }, + { + "epoch": 0.505338078291815, + "grad_norm": 0.49511632323265076, + "learning_rate": 1.7310301651144427e-05, + "loss": 0.6996967315673828, + "step": 2840 + }, + { + "epoch": 0.5071174377224199, + "grad_norm": 0.5894142985343933, + "learning_rate": 1.729085495499796e-05, + "loss": 0.7025831699371338, + "step": 2850 + }, + { + "epoch": 0.5088967971530249, + "grad_norm": 0.5248763561248779, + "learning_rate": 1.727134921974666e-05, + "loss": 0.6990334510803222, + "step": 2860 + }, + { + "epoch": 0.5106761565836299, + "grad_norm": 0.6369486451148987, + "learning_rate": 1.7251784603341984e-05, + "loss": 0.6775143146514893, + "step": 2870 + }, + { + "epoch": 0.5124555160142349, + "grad_norm": 0.7855329513549805, + "learning_rate": 1.723216126421219e-05, + "loss": 0.6654418468475342, + "step": 2880 + }, + { + "epoch": 0.5142348754448398, + "grad_norm": 0.5383808016777039, + "learning_rate": 1.7212479361261047e-05, + "loss": 0.7224104404449463, + "step": 2890 + }, + { + "epoch": 0.5160142348754448, + "grad_norm": 0.5875473618507385, + "learning_rate": 1.7192739053866568e-05, + "loss": 0.6993866920471191, + "step": 2900 + }, + { + "epoch": 0.5177935943060499, + "grad_norm": 0.5973880290985107, + "learning_rate": 1.7172940501879702e-05, + "loss": 0.6883405685424805, + "step": 2910 + }, + { + "epoch": 0.5195729537366548, + "grad_norm": 0.7815128564834595, + "learning_rate": 1.715308386562304e-05, + "loss": 0.6863636493682861, + "step": 2920 + }, + { + "epoch": 0.5213523131672598, + "grad_norm": 0.5661780834197998, + "learning_rate": 1.7133169305889526e-05, + "loss": 0.6976628303527832, + "step": 2930 + }, + { + "epoch": 0.5231316725978647, + "grad_norm": 0.6116606593132019, + "learning_rate": 1.7113196983941152e-05, + "loss": 0.7353427410125732, + "step": 2940 + }, + { + "epoch": 0.5249110320284698, + "grad_norm": 0.5908172726631165, + "learning_rate": 1.709316706150765e-05, + "loss": 0.7118365287780761, + "step": 2950 + }, + { + "epoch": 0.5266903914590747, + "grad_norm": 0.5671530365943909, + "learning_rate": 1.707307970078518e-05, + "loss": 0.6777832508087158, + "step": 2960 + }, + { + "epoch": 0.5284697508896797, + "grad_norm": 0.635502815246582, + "learning_rate": 1.7052935064435023e-05, + "loss": 0.6588366985321045, + "step": 2970 + }, + { + "epoch": 0.5302491103202847, + "grad_norm": 0.674394428730011, + "learning_rate": 1.7032733315582254e-05, + "loss": 0.6987817764282227, + "step": 2980 + }, + { + "epoch": 0.5320284697508897, + "grad_norm": 0.6007933020591736, + "learning_rate": 1.7012474617814433e-05, + "loss": 0.6640087604522705, + "step": 2990 + }, + { + "epoch": 0.5338078291814946, + "grad_norm": 0.5124489665031433, + "learning_rate": 1.6992159135180283e-05, + "loss": 0.642765474319458, + "step": 3000 + }, + { + "epoch": 0.5338078291814946, + "eval_loss": 0.6583885550498962, + "eval_runtime": 409.5482, + "eval_samples_per_second": 12.279, + "eval_steps_per_second": 6.141, + "step": 3000 + }, + { + "epoch": 0.5355871886120996, + "grad_norm": 0.6412521004676819, + "learning_rate": 1.6971787032188336e-05, + "loss": 0.6574789047241211, + "step": 3010 + }, + { + "epoch": 0.5373665480427047, + "grad_norm": 0.5690603852272034, + "learning_rate": 1.6951358473805633e-05, + "loss": 0.7070611476898193, + "step": 3020 + }, + { + "epoch": 0.5391459074733096, + "grad_norm": 0.5345459580421448, + "learning_rate": 1.6930873625456362e-05, + "loss": 0.6888082027435303, + "step": 3030 + }, + { + "epoch": 0.5409252669039146, + "grad_norm": 0.67616868019104, + "learning_rate": 1.6910332653020536e-05, + "loss": 0.6946770668029785, + "step": 3040 + }, + { + "epoch": 0.5427046263345195, + "grad_norm": 0.6721035838127136, + "learning_rate": 1.6889735722832643e-05, + "loss": 0.6662865161895752, + "step": 3050 + }, + { + "epoch": 0.5444839857651246, + "grad_norm": 0.6241431832313538, + "learning_rate": 1.6869083001680304e-05, + "loss": 0.6311937808990479, + "step": 3060 + }, + { + "epoch": 0.5462633451957295, + "grad_norm": 0.6454595923423767, + "learning_rate": 1.6848374656802912e-05, + "loss": 0.6600630283355713, + "step": 3070 + }, + { + "epoch": 0.5480427046263345, + "grad_norm": 0.5625023245811462, + "learning_rate": 1.6827610855890278e-05, + "loss": 0.7191053867340088, + "step": 3080 + }, + { + "epoch": 0.5498220640569395, + "grad_norm": 0.6803082823753357, + "learning_rate": 1.6806791767081296e-05, + "loss": 0.6553170680999756, + "step": 3090 + }, + { + "epoch": 0.5516014234875445, + "grad_norm": 0.6465727686882019, + "learning_rate": 1.6785917558962552e-05, + "loss": 0.6733019351959229, + "step": 3100 + }, + { + "epoch": 0.5533807829181495, + "grad_norm": 0.5595579743385315, + "learning_rate": 1.6764988400566973e-05, + "loss": 0.6531811714172363, + "step": 3110 + }, + { + "epoch": 0.5551601423487544, + "grad_norm": 0.6368974447250366, + "learning_rate": 1.6744004461372455e-05, + "loss": 0.6765477657318115, + "step": 3120 + }, + { + "epoch": 0.5569395017793595, + "grad_norm": 0.5315082669258118, + "learning_rate": 1.67229659113005e-05, + "loss": 0.6876490116119385, + "step": 3130 + }, + { + "epoch": 0.5587188612099644, + "grad_norm": 0.6011325716972351, + "learning_rate": 1.6701872920714822e-05, + "loss": 0.671757698059082, + "step": 3140 + }, + { + "epoch": 0.5604982206405694, + "grad_norm": 0.5254577994346619, + "learning_rate": 1.6680725660419987e-05, + "loss": 0.6709373950958252, + "step": 3150 + }, + { + "epoch": 0.5622775800711743, + "grad_norm": 0.5257652401924133, + "learning_rate": 1.6659524301660014e-05, + "loss": 0.6314177513122559, + "step": 3160 + }, + { + "epoch": 0.5640569395017794, + "grad_norm": 0.577984094619751, + "learning_rate": 1.6638269016116995e-05, + "loss": 0.6948809623718262, + "step": 3170 + }, + { + "epoch": 0.5658362989323843, + "grad_norm": 0.5697906613349915, + "learning_rate": 1.6616959975909706e-05, + "loss": 0.6561762809753418, + "step": 3180 + }, + { + "epoch": 0.5676156583629893, + "grad_norm": 0.5052287578582764, + "learning_rate": 1.6595597353592216e-05, + "loss": 0.6577863693237305, + "step": 3190 + }, + { + "epoch": 0.5693950177935944, + "grad_norm": 0.5690486431121826, + "learning_rate": 1.6574181322152477e-05, + "loss": 0.6753673553466797, + "step": 3200 + }, + { + "epoch": 0.5711743772241993, + "grad_norm": 0.6763755679130554, + "learning_rate": 1.6552712055010935e-05, + "loss": 0.6881022453308105, + "step": 3210 + }, + { + "epoch": 0.5729537366548043, + "grad_norm": 0.7030187249183655, + "learning_rate": 1.6531189726019127e-05, + "loss": 0.6971624851226806, + "step": 3220 + }, + { + "epoch": 0.5747330960854092, + "grad_norm": 0.5229400992393494, + "learning_rate": 1.6509614509458263e-05, + "loss": 0.6652966976165772, + "step": 3230 + }, + { + "epoch": 0.5765124555160143, + "grad_norm": 0.5338436961174011, + "learning_rate": 1.6487986580037822e-05, + "loss": 0.6946625709533691, + "step": 3240 + }, + { + "epoch": 0.5782918149466192, + "grad_norm": 0.5740528702735901, + "learning_rate": 1.646630611289414e-05, + "loss": 0.6570149421691894, + "step": 3250 + }, + { + "epoch": 0.5800711743772242, + "grad_norm": 0.5782309174537659, + "learning_rate": 1.6444573283588977e-05, + "loss": 0.657336950302124, + "step": 3260 + }, + { + "epoch": 0.5818505338078291, + "grad_norm": 0.5372493863105774, + "learning_rate": 1.6422788268108112e-05, + "loss": 0.7015529632568359, + "step": 3270 + }, + { + "epoch": 0.5836298932384342, + "grad_norm": 0.734725832939148, + "learning_rate": 1.6400951242859915e-05, + "loss": 0.7008133888244629, + "step": 3280 + }, + { + "epoch": 0.5854092526690391, + "grad_norm": 0.6092361211776733, + "learning_rate": 1.6379062384673914e-05, + "loss": 0.6785021305084229, + "step": 3290 + }, + { + "epoch": 0.5871886120996441, + "grad_norm": 0.5179940462112427, + "learning_rate": 1.6357121870799357e-05, + "loss": 0.6846163749694825, + "step": 3300 + }, + { + "epoch": 0.5889679715302492, + "grad_norm": 0.6822018027305603, + "learning_rate": 1.633512987890379e-05, + "loss": 0.6979034423828125, + "step": 3310 + }, + { + "epoch": 0.5907473309608541, + "grad_norm": 0.6708216667175293, + "learning_rate": 1.631308658707161e-05, + "loss": 0.616365623474121, + "step": 3320 + }, + { + "epoch": 0.5925266903914591, + "grad_norm": 0.5766979455947876, + "learning_rate": 1.6290992173802628e-05, + "loss": 0.6930478096008301, + "step": 3330 + }, + { + "epoch": 0.594306049822064, + "grad_norm": 0.5902812480926514, + "learning_rate": 1.6268846818010615e-05, + "loss": 0.6487626552581787, + "step": 3340 + }, + { + "epoch": 0.5960854092526691, + "grad_norm": 0.45683762431144714, + "learning_rate": 1.6246650699021866e-05, + "loss": 0.670246696472168, + "step": 3350 + }, + { + "epoch": 0.597864768683274, + "grad_norm": 0.4878489077091217, + "learning_rate": 1.6224403996573743e-05, + "loss": 0.6784106254577636, + "step": 3360 + }, + { + "epoch": 0.599644128113879, + "grad_norm": 0.6697226166725159, + "learning_rate": 1.62021068908132e-05, + "loss": 0.7080551147460937, + "step": 3370 + }, + { + "epoch": 0.6014234875444839, + "grad_norm": 0.6667253971099854, + "learning_rate": 1.6179759562295356e-05, + "loss": 0.6681561470031738, + "step": 3380 + }, + { + "epoch": 0.603202846975089, + "grad_norm": 0.560409426689148, + "learning_rate": 1.6157362191982025e-05, + "loss": 0.7013211727142334, + "step": 3390 + }, + { + "epoch": 0.604982206405694, + "grad_norm": 0.6660729050636292, + "learning_rate": 1.6134914961240224e-05, + "loss": 0.6561143398284912, + "step": 3400 + }, + { + "epoch": 0.6067615658362989, + "grad_norm": 0.5322085618972778, + "learning_rate": 1.6112418051840745e-05, + "loss": 0.6231056213378906, + "step": 3410 + }, + { + "epoch": 0.608540925266904, + "grad_norm": 0.6110619902610779, + "learning_rate": 1.6089871645956644e-05, + "loss": 0.696910810470581, + "step": 3420 + }, + { + "epoch": 0.6103202846975089, + "grad_norm": 0.7193135619163513, + "learning_rate": 1.6067275926161792e-05, + "loss": 0.6709835052490234, + "step": 3430 + }, + { + "epoch": 0.6120996441281139, + "grad_norm": 0.5821454524993896, + "learning_rate": 1.6044631075429406e-05, + "loss": 0.6750143527984619, + "step": 3440 + }, + { + "epoch": 0.6138790035587188, + "grad_norm": 0.6266823410987854, + "learning_rate": 1.6021937277130516e-05, + "loss": 0.6660190105438233, + "step": 3450 + }, + { + "epoch": 0.6156583629893239, + "grad_norm": 0.6361811757087708, + "learning_rate": 1.5999194715032543e-05, + "loss": 0.6241551399230957, + "step": 3460 + }, + { + "epoch": 0.6174377224199288, + "grad_norm": 0.5690405964851379, + "learning_rate": 1.5976403573297767e-05, + "loss": 0.6768513679504394, + "step": 3470 + }, + { + "epoch": 0.6192170818505338, + "grad_norm": 0.6438109278678894, + "learning_rate": 1.595356403648186e-05, + "loss": 0.6852968215942383, + "step": 3480 + }, + { + "epoch": 0.6209964412811388, + "grad_norm": 0.5949437022209167, + "learning_rate": 1.5930676289532373e-05, + "loss": 0.6737981319427491, + "step": 3490 + }, + { + "epoch": 0.6227758007117438, + "grad_norm": 0.6436291933059692, + "learning_rate": 1.590774051778726e-05, + "loss": 0.6780194282531739, + "step": 3500 + }, + { + "epoch": 0.6227758007117438, + "eval_loss": 0.6505803465843201, + "eval_runtime": 412.7337, + "eval_samples_per_second": 12.185, + "eval_steps_per_second": 6.094, + "step": 3500 + }, + { + "epoch": 0.6245551601423488, + "grad_norm": 0.5366589426994324, + "learning_rate": 1.588475690697335e-05, + "loss": 0.6628384590148926, + "step": 3510 + }, + { + "epoch": 0.6263345195729537, + "grad_norm": 0.5512357950210571, + "learning_rate": 1.5861725643204876e-05, + "loss": 0.6656132221221924, + "step": 3520 + }, + { + "epoch": 0.6281138790035588, + "grad_norm": 0.5239390134811401, + "learning_rate": 1.5838646912981937e-05, + "loss": 0.6585372447967529, + "step": 3530 + }, + { + "epoch": 0.6298932384341637, + "grad_norm": 0.611584484577179, + "learning_rate": 1.5815520903188998e-05, + "loss": 0.6700050354003906, + "step": 3540 + }, + { + "epoch": 0.6316725978647687, + "grad_norm": 0.5967355966567993, + "learning_rate": 1.5792347801093393e-05, + "loss": 0.6763735294342041, + "step": 3550 + }, + { + "epoch": 0.6334519572953736, + "grad_norm": 0.5486634373664856, + "learning_rate": 1.576912779434379e-05, + "loss": 0.6555093765258789, + "step": 3560 + }, + { + "epoch": 0.6352313167259787, + "grad_norm": 0.6155397891998291, + "learning_rate": 1.5745861070968667e-05, + "loss": 0.660044813156128, + "step": 3570 + }, + { + "epoch": 0.6370106761565836, + "grad_norm": 0.6666173338890076, + "learning_rate": 1.5722547819374807e-05, + "loss": 0.6919036388397217, + "step": 3580 + }, + { + "epoch": 0.6387900355871886, + "grad_norm": 0.628685474395752, + "learning_rate": 1.5699188228345765e-05, + "loss": 0.6886429786682129, + "step": 3590 + }, + { + "epoch": 0.6405693950177936, + "grad_norm": 0.5950825810432434, + "learning_rate": 1.5675782487040337e-05, + "loss": 0.6522699356079101, + "step": 3600 + }, + { + "epoch": 0.6423487544483986, + "grad_norm": 0.6900691390037537, + "learning_rate": 1.565233078499103e-05, + "loss": 0.6545799255371094, + "step": 3610 + }, + { + "epoch": 0.6441281138790036, + "grad_norm": 0.6600697040557861, + "learning_rate": 1.5628833312102526e-05, + "loss": 0.690045976638794, + "step": 3620 + }, + { + "epoch": 0.6459074733096085, + "grad_norm": 0.7045279145240784, + "learning_rate": 1.5605290258650144e-05, + "loss": 0.6638887405395508, + "step": 3630 + }, + { + "epoch": 0.6476868327402135, + "grad_norm": 0.6415700316429138, + "learning_rate": 1.5581701815278302e-05, + "loss": 0.6451498508453369, + "step": 3640 + }, + { + "epoch": 0.6494661921708185, + "grad_norm": 0.6846213340759277, + "learning_rate": 1.5558068172998977e-05, + "loss": 0.6113666534423828, + "step": 3650 + }, + { + "epoch": 0.6512455516014235, + "grad_norm": 0.584511935710907, + "learning_rate": 1.5534389523190142e-05, + "loss": 0.7053666591644288, + "step": 3660 + }, + { + "epoch": 0.6530249110320284, + "grad_norm": 0.7102410793304443, + "learning_rate": 1.551066605759424e-05, + "loss": 0.6806889533996582, + "step": 3670 + }, + { + "epoch": 0.6548042704626335, + "grad_norm": 0.5693709254264832, + "learning_rate": 1.5486897968316604e-05, + "loss": 0.6717594623565674, + "step": 3680 + }, + { + "epoch": 0.6565836298932385, + "grad_norm": 0.6451858878135681, + "learning_rate": 1.546308544782392e-05, + "loss": 0.69324951171875, + "step": 3690 + }, + { + "epoch": 0.6583629893238434, + "grad_norm": 0.5983235836029053, + "learning_rate": 1.543922868894268e-05, + "loss": 0.6728285312652588, + "step": 3700 + }, + { + "epoch": 0.6601423487544484, + "grad_norm": 0.5725272297859192, + "learning_rate": 1.541532788485758e-05, + "loss": 0.6560508728027343, + "step": 3710 + }, + { + "epoch": 0.6619217081850534, + "grad_norm": 0.6461522579193115, + "learning_rate": 1.5391383229110005e-05, + "loss": 0.6883309364318848, + "step": 3720 + }, + { + "epoch": 0.6637010676156584, + "grad_norm": 0.6512270569801331, + "learning_rate": 1.5367394915596414e-05, + "loss": 0.6622300624847413, + "step": 3730 + }, + { + "epoch": 0.6654804270462633, + "grad_norm": 0.6106224656105042, + "learning_rate": 1.534336313856681e-05, + "loss": 0.6878883361816406, + "step": 3740 + }, + { + "epoch": 0.6672597864768683, + "grad_norm": 0.6902608275413513, + "learning_rate": 1.5319288092623142e-05, + "loss": 0.6434782981872559, + "step": 3750 + }, + { + "epoch": 0.6690391459074733, + "grad_norm": 0.5784608721733093, + "learning_rate": 1.5295169972717743e-05, + "loss": 0.6367124557495117, + "step": 3760 + }, + { + "epoch": 0.6708185053380783, + "grad_norm": 0.6913280487060547, + "learning_rate": 1.5271008974151744e-05, + "loss": 0.630396556854248, + "step": 3770 + }, + { + "epoch": 0.6725978647686833, + "grad_norm": 0.7483834624290466, + "learning_rate": 1.5246805292573487e-05, + "loss": 0.6295557975769043, + "step": 3780 + }, + { + "epoch": 0.6743772241992882, + "grad_norm": 0.7149993777275085, + "learning_rate": 1.5222559123976962e-05, + "loss": 0.6769547939300538, + "step": 3790 + }, + { + "epoch": 0.6761565836298933, + "grad_norm": 0.6534271836280823, + "learning_rate": 1.5198270664700187e-05, + "loss": 0.7023432731628418, + "step": 3800 + }, + { + "epoch": 0.6779359430604982, + "grad_norm": 0.6507487297058105, + "learning_rate": 1.5173940111423657e-05, + "loss": 0.6808289527893067, + "step": 3810 + }, + { + "epoch": 0.6797153024911032, + "grad_norm": 0.5977747440338135, + "learning_rate": 1.5149567661168715e-05, + "loss": 0.6398194313049317, + "step": 3820 + }, + { + "epoch": 0.6814946619217082, + "grad_norm": 0.6036016941070557, + "learning_rate": 1.5125153511295989e-05, + "loss": 0.6317630767822265, + "step": 3830 + }, + { + "epoch": 0.6832740213523132, + "grad_norm": 0.8481062054634094, + "learning_rate": 1.5100697859503762e-05, + "loss": 0.6741732597351074, + "step": 3840 + }, + { + "epoch": 0.6850533807829181, + "grad_norm": 0.5988038182258606, + "learning_rate": 1.5076200903826391e-05, + "loss": 0.6638431072235107, + "step": 3850 + }, + { + "epoch": 0.6868327402135231, + "grad_norm": 0.6785585880279541, + "learning_rate": 1.5051662842632709e-05, + "loss": 0.6523032665252686, + "step": 3860 + }, + { + "epoch": 0.6886120996441281, + "grad_norm": 0.6749284863471985, + "learning_rate": 1.5027083874624392e-05, + "loss": 0.6517146110534668, + "step": 3870 + }, + { + "epoch": 0.6903914590747331, + "grad_norm": 0.6880838871002197, + "learning_rate": 1.5002464198834383e-05, + "loss": 0.6895311355590821, + "step": 3880 + }, + { + "epoch": 0.6921708185053381, + "grad_norm": 0.6296641826629639, + "learning_rate": 1.4977804014625257e-05, + "loss": 0.691303300857544, + "step": 3890 + }, + { + "epoch": 0.693950177935943, + "grad_norm": 0.62867671251297, + "learning_rate": 1.4953103521687612e-05, + "loss": 0.6549241065979003, + "step": 3900 + }, + { + "epoch": 0.6957295373665481, + "grad_norm": 0.6810404658317566, + "learning_rate": 1.4928362920038455e-05, + "loss": 0.6266399383544922, + "step": 3910 + }, + { + "epoch": 0.697508896797153, + "grad_norm": 0.7017929553985596, + "learning_rate": 1.4903582410019586e-05, + "loss": 0.6477387905120849, + "step": 3920 + }, + { + "epoch": 0.699288256227758, + "grad_norm": 0.5611311197280884, + "learning_rate": 1.487876219229596e-05, + "loss": 0.6562673568725585, + "step": 3930 + }, + { + "epoch": 0.701067615658363, + "grad_norm": 0.5652351975440979, + "learning_rate": 1.4853902467854084e-05, + "loss": 0.6768126487731934, + "step": 3940 + }, + { + "epoch": 0.702846975088968, + "grad_norm": 0.7303992509841919, + "learning_rate": 1.4829003438000374e-05, + "loss": 0.6467350959777832, + "step": 3950 + }, + { + "epoch": 0.7046263345195729, + "grad_norm": 0.5587636232376099, + "learning_rate": 1.4804065304359525e-05, + "loss": 0.6700319766998291, + "step": 3960 + }, + { + "epoch": 0.7064056939501779, + "grad_norm": 0.7100324630737305, + "learning_rate": 1.477908826887289e-05, + "loss": 0.6796034812927246, + "step": 3970 + }, + { + "epoch": 0.708185053380783, + "grad_norm": 0.6912830471992493, + "learning_rate": 1.4754072533796833e-05, + "loss": 0.6907623291015625, + "step": 3980 + }, + { + "epoch": 0.7099644128113879, + "grad_norm": 0.744030773639679, + "learning_rate": 1.4729018301701093e-05, + "loss": 0.6747735023498536, + "step": 3990 + }, + { + "epoch": 0.7117437722419929, + "grad_norm": 0.6915676593780518, + "learning_rate": 1.4703925775467149e-05, + "loss": 0.6172840595245361, + "step": 4000 + }, + { + "epoch": 0.7117437722419929, + "eval_loss": 0.6436223387718201, + "eval_runtime": 412.5315, + "eval_samples_per_second": 12.191, + "eval_steps_per_second": 6.097, + "step": 4000 + }, + { + "epoch": 0.7135231316725978, + "grad_norm": 0.5881261229515076, + "learning_rate": 1.4678795158286579e-05, + "loss": 0.6607324123382569, + "step": 4010 + }, + { + "epoch": 0.7153024911032029, + "grad_norm": 0.7214511036872864, + "learning_rate": 1.4653626653659399e-05, + "loss": 0.6573444366455078, + "step": 4020 + }, + { + "epoch": 0.7170818505338078, + "grad_norm": 0.6598398089408875, + "learning_rate": 1.4628420465392432e-05, + "loss": 0.6589923858642578, + "step": 4030 + }, + { + "epoch": 0.7188612099644128, + "grad_norm": 0.6202049255371094, + "learning_rate": 1.4603176797597654e-05, + "loss": 0.6494176387786865, + "step": 4040 + }, + { + "epoch": 0.7206405693950177, + "grad_norm": 0.6085755825042725, + "learning_rate": 1.4577895854690536e-05, + "loss": 0.6625244140625, + "step": 4050 + }, + { + "epoch": 0.7224199288256228, + "grad_norm": 0.799647331237793, + "learning_rate": 1.4552577841388388e-05, + "loss": 0.6426272392272949, + "step": 4060 + }, + { + "epoch": 0.7241992882562278, + "grad_norm": 0.5650002956390381, + "learning_rate": 1.4527222962708714e-05, + "loss": 0.6512536525726318, + "step": 4070 + }, + { + "epoch": 0.7259786476868327, + "grad_norm": 0.7105417251586914, + "learning_rate": 1.4501831423967531e-05, + "loss": 0.639466667175293, + "step": 4080 + }, + { + "epoch": 0.7277580071174378, + "grad_norm": 0.6997768878936768, + "learning_rate": 1.4476403430777729e-05, + "loss": 0.6274962425231934, + "step": 4090 + }, + { + "epoch": 0.7295373665480427, + "grad_norm": 0.6622691750526428, + "learning_rate": 1.4450939189047379e-05, + "loss": 0.6450656890869141, + "step": 4100 + }, + { + "epoch": 0.7313167259786477, + "grad_norm": 0.669937789440155, + "learning_rate": 1.4425438904978103e-05, + "loss": 0.6467487335205078, + "step": 4110 + }, + { + "epoch": 0.7330960854092526, + "grad_norm": 0.6928410530090332, + "learning_rate": 1.4399902785063366e-05, + "loss": 0.6678302764892579, + "step": 4120 + }, + { + "epoch": 0.7348754448398577, + "grad_norm": 0.7608679533004761, + "learning_rate": 1.4374331036086831e-05, + "loss": 0.6973666191101074, + "step": 4130 + }, + { + "epoch": 0.7366548042704626, + "grad_norm": 0.7009332180023193, + "learning_rate": 1.4348723865120663e-05, + "loss": 0.6456516742706299, + "step": 4140 + }, + { + "epoch": 0.7384341637010676, + "grad_norm": 0.7158817648887634, + "learning_rate": 1.4323081479523878e-05, + "loss": 0.6287565708160401, + "step": 4150 + }, + { + "epoch": 0.7402135231316725, + "grad_norm": 0.5314562916755676, + "learning_rate": 1.4297404086940635e-05, + "loss": 0.6649733543395996, + "step": 4160 + }, + { + "epoch": 0.7419928825622776, + "grad_norm": 0.6606214642524719, + "learning_rate": 1.4271691895298573e-05, + "loss": 0.6175446033477783, + "step": 4170 + }, + { + "epoch": 0.7437722419928826, + "grad_norm": 0.618217945098877, + "learning_rate": 1.4245945112807133e-05, + "loss": 0.6604631423950196, + "step": 4180 + }, + { + "epoch": 0.7455516014234875, + "grad_norm": 0.5399601459503174, + "learning_rate": 1.422016394795585e-05, + "loss": 0.6682997226715088, + "step": 4190 + }, + { + "epoch": 0.7473309608540926, + "grad_norm": 0.5808553695678711, + "learning_rate": 1.419434860951268e-05, + "loss": 0.6625009059906006, + "step": 4200 + }, + { + "epoch": 0.7491103202846975, + "grad_norm": 0.670625627040863, + "learning_rate": 1.416849930652231e-05, + "loss": 0.6778800010681152, + "step": 4210 + }, + { + "epoch": 0.7508896797153025, + "grad_norm": 0.6508112549781799, + "learning_rate": 1.4142616248304459e-05, + "loss": 0.6265085220336915, + "step": 4220 + }, + { + "epoch": 0.7526690391459074, + "grad_norm": 0.5993587970733643, + "learning_rate": 1.4116699644452182e-05, + "loss": 0.656840181350708, + "step": 4230 + }, + { + "epoch": 0.7544483985765125, + "grad_norm": 0.6819363236427307, + "learning_rate": 1.4090749704830184e-05, + "loss": 0.6575029373168946, + "step": 4240 + }, + { + "epoch": 0.7562277580071174, + "grad_norm": 0.6625942587852478, + "learning_rate": 1.4064766639573104e-05, + "loss": 0.6340457916259765, + "step": 4250 + }, + { + "epoch": 0.7580071174377224, + "grad_norm": 0.8185866475105286, + "learning_rate": 1.4038750659083831e-05, + "loss": 0.6835246086120605, + "step": 4260 + }, + { + "epoch": 0.7597864768683275, + "grad_norm": 0.8232684135437012, + "learning_rate": 1.4012701974031782e-05, + "loss": 0.6450761795043946, + "step": 4270 + }, + { + "epoch": 0.7615658362989324, + "grad_norm": 0.6913644671440125, + "learning_rate": 1.3986620795351214e-05, + "loss": 0.6313485145568848, + "step": 4280 + }, + { + "epoch": 0.7633451957295374, + "grad_norm": 0.641167163848877, + "learning_rate": 1.3960507334239501e-05, + "loss": 0.6450904846191406, + "step": 4290 + }, + { + "epoch": 0.7651245551601423, + "grad_norm": 0.5932332873344421, + "learning_rate": 1.3934361802155436e-05, + "loss": 0.6665386199951172, + "step": 4300 + }, + { + "epoch": 0.7669039145907474, + "grad_norm": 0.5669364929199219, + "learning_rate": 1.3908184410817511e-05, + "loss": 0.6564301013946533, + "step": 4310 + }, + { + "epoch": 0.7686832740213523, + "grad_norm": 0.6267620325088501, + "learning_rate": 1.3881975372202201e-05, + "loss": 0.6448751449584961, + "step": 4320 + }, + { + "epoch": 0.7704626334519573, + "grad_norm": 0.8893764615058899, + "learning_rate": 1.3855734898542252e-05, + "loss": 0.6395359516143799, + "step": 4330 + }, + { + "epoch": 0.7722419928825622, + "grad_norm": 0.6805179119110107, + "learning_rate": 1.3829463202324967e-05, + "loss": 0.6458981990814209, + "step": 4340 + }, + { + "epoch": 0.7740213523131673, + "grad_norm": 0.6416231393814087, + "learning_rate": 1.3803160496290472e-05, + "loss": 0.6462121963500976, + "step": 4350 + }, + { + "epoch": 0.7758007117437722, + "grad_norm": 0.7718709707260132, + "learning_rate": 1.3776826993430006e-05, + "loss": 0.6271074295043946, + "step": 4360 + }, + { + "epoch": 0.7775800711743772, + "grad_norm": 0.6111568212509155, + "learning_rate": 1.375046290698419e-05, + "loss": 0.6282791137695313, + "step": 4370 + }, + { + "epoch": 0.7793594306049823, + "grad_norm": 0.7178627252578735, + "learning_rate": 1.3724068450441303e-05, + "loss": 0.6567965507507324, + "step": 4380 + }, + { + "epoch": 0.7811387900355872, + "grad_norm": 0.6303468346595764, + "learning_rate": 1.3697643837535546e-05, + "loss": 0.6168845653533935, + "step": 4390 + }, + { + "epoch": 0.7829181494661922, + "grad_norm": 0.6654033660888672, + "learning_rate": 1.3671189282245326e-05, + "loss": 0.6331443309783935, + "step": 4400 + }, + { + "epoch": 0.7846975088967971, + "grad_norm": 0.7051156163215637, + "learning_rate": 1.3644704998791501e-05, + "loss": 0.6948952198028564, + "step": 4410 + }, + { + "epoch": 0.7864768683274022, + "grad_norm": 0.5967740416526794, + "learning_rate": 1.361819120163567e-05, + "loss": 0.6140963077545166, + "step": 4420 + }, + { + "epoch": 0.7882562277580071, + "grad_norm": 0.688831090927124, + "learning_rate": 1.3591648105478423e-05, + "loss": 0.6627942085266113, + "step": 4430 + }, + { + "epoch": 0.7900355871886121, + "grad_norm": 0.5357785820960999, + "learning_rate": 1.3565075925257605e-05, + "loss": 0.6558830261230468, + "step": 4440 + }, + { + "epoch": 0.791814946619217, + "grad_norm": 0.6449471712112427, + "learning_rate": 1.3538474876146567e-05, + "loss": 0.6528484344482421, + "step": 4450 + }, + { + "epoch": 0.7935943060498221, + "grad_norm": 0.6996321678161621, + "learning_rate": 1.3511845173552446e-05, + "loss": 0.6519684314727783, + "step": 4460 + }, + { + "epoch": 0.7953736654804271, + "grad_norm": 0.6798763871192932, + "learning_rate": 1.348518703311439e-05, + "loss": 0.6224774360656739, + "step": 4470 + }, + { + "epoch": 0.797153024911032, + "grad_norm": 0.698722243309021, + "learning_rate": 1.3458500670701833e-05, + "loss": 0.6481215953826904, + "step": 4480 + }, + { + "epoch": 0.798932384341637, + "grad_norm": 0.6421968340873718, + "learning_rate": 1.3431786302412749e-05, + "loss": 0.6016243934631348, + "step": 4490 + }, + { + "epoch": 0.800711743772242, + "grad_norm": 0.6529412269592285, + "learning_rate": 1.3405044144571888e-05, + "loss": 0.6958633899688721, + "step": 4500 + }, + { + "epoch": 0.800711743772242, + "eval_loss": 0.6375713348388672, + "eval_runtime": 411.392, + "eval_samples_per_second": 12.224, + "eval_steps_per_second": 6.113, + "step": 4500 + }, + { + "epoch": 0.802491103202847, + "grad_norm": 0.7270268797874451, + "learning_rate": 1.3378274413729036e-05, + "loss": 0.6834945201873779, + "step": 4510 + }, + { + "epoch": 0.8042704626334519, + "grad_norm": 0.6442169547080994, + "learning_rate": 1.335147732665725e-05, + "loss": 0.6535075187683106, + "step": 4520 + }, + { + "epoch": 0.806049822064057, + "grad_norm": 0.7817319631576538, + "learning_rate": 1.3324653100351117e-05, + "loss": 0.6588070869445801, + "step": 4530 + }, + { + "epoch": 0.8078291814946619, + "grad_norm": 0.6025936603546143, + "learning_rate": 1.3297801952024983e-05, + "loss": 0.6654253482818604, + "step": 4540 + }, + { + "epoch": 0.8096085409252669, + "grad_norm": 0.6981809735298157, + "learning_rate": 1.3270924099111204e-05, + "loss": 0.6565302848815918, + "step": 4550 + }, + { + "epoch": 0.8113879003558719, + "grad_norm": 0.7100082635879517, + "learning_rate": 1.3244019759258378e-05, + "loss": 0.6451606273651123, + "step": 4560 + }, + { + "epoch": 0.8131672597864769, + "grad_norm": 0.7197165489196777, + "learning_rate": 1.3217089150329589e-05, + "loss": 0.6707216739654541, + "step": 4570 + }, + { + "epoch": 0.8149466192170819, + "grad_norm": 0.6494991183280945, + "learning_rate": 1.3190132490400642e-05, + "loss": 0.6538206577301026, + "step": 4580 + }, + { + "epoch": 0.8167259786476868, + "grad_norm": 0.610467255115509, + "learning_rate": 1.316314999775829e-05, + "loss": 0.6370253562927246, + "step": 4590 + }, + { + "epoch": 0.8185053380782918, + "grad_norm": 0.765133261680603, + "learning_rate": 1.3136141890898473e-05, + "loss": 0.6969471454620362, + "step": 4600 + }, + { + "epoch": 0.8202846975088968, + "grad_norm": 0.6949911117553711, + "learning_rate": 1.3109108388524551e-05, + "loss": 0.6275270462036133, + "step": 4610 + }, + { + "epoch": 0.8220640569395018, + "grad_norm": 0.6720005869865417, + "learning_rate": 1.3082049709545524e-05, + "loss": 0.6380300521850586, + "step": 4620 + }, + { + "epoch": 0.8238434163701067, + "grad_norm": 0.7357544898986816, + "learning_rate": 1.3054966073074264e-05, + "loss": 0.6391136169433593, + "step": 4630 + }, + { + "epoch": 0.8256227758007118, + "grad_norm": 0.6352887749671936, + "learning_rate": 1.3027857698425748e-05, + "loss": 0.6527080059051513, + "step": 4640 + }, + { + "epoch": 0.8274021352313167, + "grad_norm": 0.6615159511566162, + "learning_rate": 1.3000724805115265e-05, + "loss": 0.6786240100860595, + "step": 4650 + }, + { + "epoch": 0.8291814946619217, + "grad_norm": 0.7004484534263611, + "learning_rate": 1.2973567612856659e-05, + "loss": 0.6390516281127929, + "step": 4660 + }, + { + "epoch": 0.8309608540925267, + "grad_norm": 0.7463746070861816, + "learning_rate": 1.294638634156053e-05, + "loss": 0.6174628734588623, + "step": 4670 + }, + { + "epoch": 0.8327402135231317, + "grad_norm": 0.7666842341423035, + "learning_rate": 1.2919181211332474e-05, + "loss": 0.6169525623321533, + "step": 4680 + }, + { + "epoch": 0.8345195729537367, + "grad_norm": 0.6636411547660828, + "learning_rate": 1.2891952442471274e-05, + "loss": 0.6446310043334961, + "step": 4690 + }, + { + "epoch": 0.8362989323843416, + "grad_norm": 0.9001480937004089, + "learning_rate": 1.2864700255467148e-05, + "loss": 0.6689016342163085, + "step": 4700 + }, + { + "epoch": 0.8380782918149466, + "grad_norm": 0.551397442817688, + "learning_rate": 1.2837424870999933e-05, + "loss": 0.6616122245788574, + "step": 4710 + }, + { + "epoch": 0.8398576512455516, + "grad_norm": 0.7142363786697388, + "learning_rate": 1.281012650993732e-05, + "loss": 0.690334415435791, + "step": 4720 + }, + { + "epoch": 0.8416370106761566, + "grad_norm": 0.701836347579956, + "learning_rate": 1.2782805393333054e-05, + "loss": 0.6755175113677978, + "step": 4730 + }, + { + "epoch": 0.8434163701067615, + "grad_norm": 0.6752656102180481, + "learning_rate": 1.2755461742425147e-05, + "loss": 0.6132485866546631, + "step": 4740 + }, + { + "epoch": 0.8451957295373665, + "grad_norm": 0.6739411354064941, + "learning_rate": 1.2728095778634094e-05, + "loss": 0.6823519706726074, + "step": 4750 + }, + { + "epoch": 0.8469750889679716, + "grad_norm": 0.6728504300117493, + "learning_rate": 1.2700707723561064e-05, + "loss": 0.6264513492584228, + "step": 4760 + }, + { + "epoch": 0.8487544483985765, + "grad_norm": 0.6379041075706482, + "learning_rate": 1.2673297798986118e-05, + "loss": 0.6372091770172119, + "step": 4770 + }, + { + "epoch": 0.8505338078291815, + "grad_norm": 0.6896198987960815, + "learning_rate": 1.2645866226866405e-05, + "loss": 0.6795585632324219, + "step": 4780 + }, + { + "epoch": 0.8523131672597865, + "grad_norm": 0.6732060313224792, + "learning_rate": 1.261841322933438e-05, + "loss": 0.6782153129577637, + "step": 4790 + }, + { + "epoch": 0.8540925266903915, + "grad_norm": 0.7170758247375488, + "learning_rate": 1.2590939028695987e-05, + "loss": 0.6489102840423584, + "step": 4800 + }, + { + "epoch": 0.8558718861209964, + "grad_norm": 0.7446131110191345, + "learning_rate": 1.2563443847428862e-05, + "loss": 0.6556113719940185, + "step": 4810 + }, + { + "epoch": 0.8576512455516014, + "grad_norm": 0.5690045952796936, + "learning_rate": 1.2535927908180547e-05, + "loss": 0.6207235813140869, + "step": 4820 + }, + { + "epoch": 0.8594306049822064, + "grad_norm": 0.7088342308998108, + "learning_rate": 1.2508391433766667e-05, + "loss": 0.6439788341522217, + "step": 4830 + }, + { + "epoch": 0.8612099644128114, + "grad_norm": 0.573747992515564, + "learning_rate": 1.2480834647169134e-05, + "loss": 0.6493591785430908, + "step": 4840 + }, + { + "epoch": 0.8629893238434164, + "grad_norm": 0.6367696523666382, + "learning_rate": 1.2453257771534348e-05, + "loss": 0.6573034286499023, + "step": 4850 + }, + { + "epoch": 0.8647686832740213, + "grad_norm": 0.6127105355262756, + "learning_rate": 1.2425661030171382e-05, + "loss": 0.6419090747833252, + "step": 4860 + }, + { + "epoch": 0.8665480427046264, + "grad_norm": 0.6644669771194458, + "learning_rate": 1.2398044646550167e-05, + "loss": 0.6521550178527832, + "step": 4870 + }, + { + "epoch": 0.8683274021352313, + "grad_norm": 0.6144851446151733, + "learning_rate": 1.2370408844299705e-05, + "loss": 0.6388635158538818, + "step": 4880 + }, + { + "epoch": 0.8701067615658363, + "grad_norm": 0.748590886592865, + "learning_rate": 1.2342753847206236e-05, + "loss": 0.6553171157836915, + "step": 4890 + }, + { + "epoch": 0.8718861209964412, + "grad_norm": 0.7603781819343567, + "learning_rate": 1.2315079879211435e-05, + "loss": 0.639424467086792, + "step": 4900 + }, + { + "epoch": 0.8736654804270463, + "grad_norm": 0.60029137134552, + "learning_rate": 1.2287387164410597e-05, + "loss": 0.6580222606658935, + "step": 4910 + }, + { + "epoch": 0.8754448398576512, + "grad_norm": 0.5586308836936951, + "learning_rate": 1.2259675927050829e-05, + "loss": 0.6166585445404053, + "step": 4920 + }, + { + "epoch": 0.8772241992882562, + "grad_norm": 0.6883022785186768, + "learning_rate": 1.2231946391529213e-05, + "loss": 0.651360559463501, + "step": 4930 + }, + { + "epoch": 0.8790035587188612, + "grad_norm": 0.7347468733787537, + "learning_rate": 1.2204198782391018e-05, + "loss": 0.6587865352630615, + "step": 4940 + }, + { + "epoch": 0.8807829181494662, + "grad_norm": 0.8330339193344116, + "learning_rate": 1.2176433324327868e-05, + "loss": 0.6858234405517578, + "step": 4950 + }, + { + "epoch": 0.8825622775800712, + "grad_norm": 0.6569383144378662, + "learning_rate": 1.2148650242175908e-05, + "loss": 0.7256248950958252, + "step": 4960 + }, + { + "epoch": 0.8843416370106761, + "grad_norm": 0.7147039771080017, + "learning_rate": 1.2120849760914013e-05, + "loss": 0.6601557254791259, + "step": 4970 + }, + { + "epoch": 0.8861209964412812, + "grad_norm": 0.7715662717819214, + "learning_rate": 1.2093032105661944e-05, + "loss": 0.6112511634826661, + "step": 4980 + }, + { + "epoch": 0.8879003558718861, + "grad_norm": 0.7267486453056335, + "learning_rate": 1.2065197501678529e-05, + "loss": 0.6230842590332031, + "step": 4990 + }, + { + "epoch": 0.8896797153024911, + "grad_norm": 0.7366806268692017, + "learning_rate": 1.203734617435985e-05, + "loss": 0.6849907875061035, + "step": 5000 + }, + { + "epoch": 0.8896797153024911, + "eval_loss": 0.6321616172790527, + "eval_runtime": 411.8441, + "eval_samples_per_second": 12.211, + "eval_steps_per_second": 6.107, + "step": 5000 + }, + { + "epoch": 0.891459074733096, + "grad_norm": 0.7812427282333374, + "learning_rate": 1.2009478349237397e-05, + "loss": 0.6636211395263671, + "step": 5010 + }, + { + "epoch": 0.8932384341637011, + "grad_norm": 0.6625553965568542, + "learning_rate": 1.1981594251976265e-05, + "loss": 0.6543920040130615, + "step": 5020 + }, + { + "epoch": 0.895017793594306, + "grad_norm": 0.6391006112098694, + "learning_rate": 1.1953694108373313e-05, + "loss": 0.653505516052246, + "step": 5030 + }, + { + "epoch": 0.896797153024911, + "grad_norm": 0.763852059841156, + "learning_rate": 1.1925778144355338e-05, + "loss": 0.6871216773986817, + "step": 5040 + }, + { + "epoch": 0.8985765124555161, + "grad_norm": 0.7798473834991455, + "learning_rate": 1.189784658597724e-05, + "loss": 0.6243946552276611, + "step": 5050 + }, + { + "epoch": 0.900355871886121, + "grad_norm": 0.6348104476928711, + "learning_rate": 1.1869899659420208e-05, + "loss": 0.6567151069641113, + "step": 5060 + }, + { + "epoch": 0.902135231316726, + "grad_norm": 0.6433530449867249, + "learning_rate": 1.1841937590989873e-05, + "loss": 0.677039909362793, + "step": 5070 + }, + { + "epoch": 0.9039145907473309, + "grad_norm": 0.8366072177886963, + "learning_rate": 1.1813960607114476e-05, + "loss": 0.6381283760070801, + "step": 5080 + }, + { + "epoch": 0.905693950177936, + "grad_norm": 0.7176342606544495, + "learning_rate": 1.1785968934343045e-05, + "loss": 0.6601722717285157, + "step": 5090 + }, + { + "epoch": 0.9074733096085409, + "grad_norm": 0.6531491279602051, + "learning_rate": 1.1757962799343548e-05, + "loss": 0.6662145137786866, + "step": 5100 + }, + { + "epoch": 0.9092526690391459, + "grad_norm": 0.6918512582778931, + "learning_rate": 1.1729942428901068e-05, + "loss": 0.6182730674743653, + "step": 5110 + }, + { + "epoch": 0.9110320284697508, + "grad_norm": 0.5597010850906372, + "learning_rate": 1.1701908049915964e-05, + "loss": 0.6443261623382568, + "step": 5120 + }, + { + "epoch": 0.9128113879003559, + "grad_norm": 0.6807442307472229, + "learning_rate": 1.1673859889402028e-05, + "loss": 0.6334492206573487, + "step": 5130 + }, + { + "epoch": 0.9145907473309609, + "grad_norm": 0.6249061226844788, + "learning_rate": 1.1645798174484653e-05, + "loss": 0.614483642578125, + "step": 5140 + }, + { + "epoch": 0.9163701067615658, + "grad_norm": 0.5527177453041077, + "learning_rate": 1.161772313239899e-05, + "loss": 0.650807237625122, + "step": 5150 + }, + { + "epoch": 0.9181494661921709, + "grad_norm": 0.7590207457542419, + "learning_rate": 1.1589634990488107e-05, + "loss": 0.6382132053375245, + "step": 5160 + }, + { + "epoch": 0.9199288256227758, + "grad_norm": 0.7238802909851074, + "learning_rate": 1.1561533976201157e-05, + "loss": 0.6336176872253418, + "step": 5170 + }, + { + "epoch": 0.9217081850533808, + "grad_norm": 0.8075314164161682, + "learning_rate": 1.1533420317091519e-05, + "loss": 0.6544758796691894, + "step": 5180 + }, + { + "epoch": 0.9234875444839857, + "grad_norm": 0.7770307064056396, + "learning_rate": 1.1505294240814979e-05, + "loss": 0.6220970153808594, + "step": 5190 + }, + { + "epoch": 0.9252669039145908, + "grad_norm": 0.5794429779052734, + "learning_rate": 1.1477155975127866e-05, + "loss": 0.6260251045227051, + "step": 5200 + }, + { + "epoch": 0.9270462633451957, + "grad_norm": 0.6863322257995605, + "learning_rate": 1.1449005747885212e-05, + "loss": 0.6177238464355469, + "step": 5210 + }, + { + "epoch": 0.9288256227758007, + "grad_norm": 0.6806472539901733, + "learning_rate": 1.142084378703892e-05, + "loss": 0.6731130599975585, + "step": 5220 + }, + { + "epoch": 0.9306049822064056, + "grad_norm": 0.6430651545524597, + "learning_rate": 1.1392670320635894e-05, + "loss": 0.6535952091217041, + "step": 5230 + }, + { + "epoch": 0.9323843416370107, + "grad_norm": 0.6920055150985718, + "learning_rate": 1.1364485576816225e-05, + "loss": 0.6301285266876221, + "step": 5240 + }, + { + "epoch": 0.9341637010676157, + "grad_norm": 0.7043541669845581, + "learning_rate": 1.1336289783811303e-05, + "loss": 0.6545778751373291, + "step": 5250 + }, + { + "epoch": 0.9359430604982206, + "grad_norm": 0.7749147415161133, + "learning_rate": 1.130808316994201e-05, + "loss": 0.6474496841430664, + "step": 5260 + }, + { + "epoch": 0.9377224199288257, + "grad_norm": 0.6907658576965332, + "learning_rate": 1.127986596361684e-05, + "loss": 0.6155929565429688, + "step": 5270 + }, + { + "epoch": 0.9395017793594306, + "grad_norm": 0.6255580186843872, + "learning_rate": 1.125163839333007e-05, + "loss": 0.6644204139709473, + "step": 5280 + }, + { + "epoch": 0.9412811387900356, + "grad_norm": 0.5674806833267212, + "learning_rate": 1.1223400687659898e-05, + "loss": 0.6196834564208984, + "step": 5290 + }, + { + "epoch": 0.9430604982206405, + "grad_norm": 0.7296446561813354, + "learning_rate": 1.1195153075266591e-05, + "loss": 0.6685366153717041, + "step": 5300 + }, + { + "epoch": 0.9448398576512456, + "grad_norm": 0.5960925817489624, + "learning_rate": 1.1166895784890644e-05, + "loss": 0.6431320190429688, + "step": 5310 + }, + { + "epoch": 0.9466192170818505, + "grad_norm": 0.6691383719444275, + "learning_rate": 1.1138629045350911e-05, + "loss": 0.6147652626037597, + "step": 5320 + }, + { + "epoch": 0.9483985765124555, + "grad_norm": 0.7819799184799194, + "learning_rate": 1.1110353085542778e-05, + "loss": 0.6021285057067871, + "step": 5330 + }, + { + "epoch": 0.9501779359430605, + "grad_norm": 0.690168023109436, + "learning_rate": 1.1082068134436281e-05, + "loss": 0.6405185699462891, + "step": 5340 + }, + { + "epoch": 0.9519572953736655, + "grad_norm": 0.6121358871459961, + "learning_rate": 1.1053774421074272e-05, + "loss": 0.6638952255249023, + "step": 5350 + }, + { + "epoch": 0.9537366548042705, + "grad_norm": 0.6227392554283142, + "learning_rate": 1.1025472174570554e-05, + "loss": 0.6735183715820312, + "step": 5360 + }, + { + "epoch": 0.9555160142348754, + "grad_norm": 0.7254964113235474, + "learning_rate": 1.099716162410803e-05, + "loss": 0.6874561309814453, + "step": 5370 + }, + { + "epoch": 0.9572953736654805, + "grad_norm": 0.916182816028595, + "learning_rate": 1.0968842998936843e-05, + "loss": 0.6294379234313965, + "step": 5380 + }, + { + "epoch": 0.9590747330960854, + "grad_norm": 0.7955939769744873, + "learning_rate": 1.0940516528372527e-05, + "loss": 0.648458194732666, + "step": 5390 + }, + { + "epoch": 0.9608540925266904, + "grad_norm": 0.5914123058319092, + "learning_rate": 1.0912182441794141e-05, + "loss": 0.641319465637207, + "step": 5400 + }, + { + "epoch": 0.9626334519572953, + "grad_norm": 0.8009176850318909, + "learning_rate": 1.0883840968642416e-05, + "loss": 0.6294963836669922, + "step": 5410 + }, + { + "epoch": 0.9644128113879004, + "grad_norm": 0.8268579244613647, + "learning_rate": 1.0855492338417905e-05, + "loss": 0.6227912425994873, + "step": 5420 + }, + { + "epoch": 0.9661921708185054, + "grad_norm": 0.7227009534835815, + "learning_rate": 1.0827136780679109e-05, + "loss": 0.6420284271240234, + "step": 5430 + }, + { + "epoch": 0.9679715302491103, + "grad_norm": 0.7250238060951233, + "learning_rate": 1.0798774525040628e-05, + "loss": 0.6643415927886963, + "step": 5440 + }, + { + "epoch": 0.9697508896797153, + "grad_norm": 0.6804770827293396, + "learning_rate": 1.07704058011713e-05, + "loss": 0.6299617290496826, + "step": 5450 + }, + { + "epoch": 0.9715302491103203, + "grad_norm": 0.8592752814292908, + "learning_rate": 1.0742030838792343e-05, + "loss": 0.6287535667419434, + "step": 5460 + }, + { + "epoch": 0.9733096085409253, + "grad_norm": 0.6754792928695679, + "learning_rate": 1.0713649867675483e-05, + "loss": 0.6633755207061768, + "step": 5470 + }, + { + "epoch": 0.9750889679715302, + "grad_norm": 0.8593308925628662, + "learning_rate": 1.0685263117641117e-05, + "loss": 0.6187657833099365, + "step": 5480 + }, + { + "epoch": 0.9768683274021353, + "grad_norm": 0.6328932642936707, + "learning_rate": 1.065687081855643e-05, + "loss": 0.618121862411499, + "step": 5490 + }, + { + "epoch": 0.9786476868327402, + "grad_norm": 0.7381909489631653, + "learning_rate": 1.062847320033354e-05, + "loss": 0.6680717945098877, + "step": 5500 + }, + { + "epoch": 0.9786476868327402, + "eval_loss": 0.6279275417327881, + "eval_runtime": 413.1424, + "eval_samples_per_second": 12.173, + "eval_steps_per_second": 6.087, + "step": 5500 + }, + { + "epoch": 0.9804270462633452, + "grad_norm": 0.8082507848739624, + "learning_rate": 1.0600070492927644e-05, + "loss": 0.619424295425415, + "step": 5510 + }, + { + "epoch": 0.9822064056939501, + "grad_norm": 0.782515823841095, + "learning_rate": 1.0571662926335138e-05, + "loss": 0.6681380271911621, + "step": 5520 + }, + { + "epoch": 0.9839857651245552, + "grad_norm": 0.756851851940155, + "learning_rate": 1.0543250730591778e-05, + "loss": 0.648841381072998, + "step": 5530 + }, + { + "epoch": 0.9857651245551602, + "grad_norm": 0.6018344759941101, + "learning_rate": 1.0514834135770805e-05, + "loss": 0.6303011417388916, + "step": 5540 + }, + { + "epoch": 0.9875444839857651, + "grad_norm": 0.6782673597335815, + "learning_rate": 1.0486413371981077e-05, + "loss": 0.6756137371063232, + "step": 5550 + }, + { + "epoch": 0.9893238434163701, + "grad_norm": 0.680172860622406, + "learning_rate": 1.045798866936521e-05, + "loss": 0.62680082321167, + "step": 5560 + }, + { + "epoch": 0.9911032028469751, + "grad_norm": 0.7947545647621155, + "learning_rate": 1.042956025809772e-05, + "loss": 0.6698834419250488, + "step": 5570 + }, + { + "epoch": 0.9928825622775801, + "grad_norm": 0.6484361290931702, + "learning_rate": 1.0401128368383151e-05, + "loss": 0.6242072582244873, + "step": 5580 + }, + { + "epoch": 0.994661921708185, + "grad_norm": 0.8095009326934814, + "learning_rate": 1.037269323045422e-05, + "loss": 0.6368332862854004, + "step": 5590 + }, + { + "epoch": 0.99644128113879, + "grad_norm": 0.7750351428985596, + "learning_rate": 1.034425507456994e-05, + "loss": 0.6159787654876709, + "step": 5600 + }, + { + "epoch": 0.998220640569395, + "grad_norm": 0.6969480514526367, + "learning_rate": 1.0315814131013768e-05, + "loss": 0.6518408298492432, + "step": 5610 + }, + { + "epoch": 1.0, + "grad_norm": 0.7439318299293518, + "learning_rate": 1.0287370630091731e-05, + "loss": 0.6812664031982422, + "step": 5620 + }, + { + "epoch": 1.001779359430605, + "grad_norm": 0.7261248826980591, + "learning_rate": 1.0258924802130565e-05, + "loss": 0.6570511817932129, + "step": 5630 + }, + { + "epoch": 1.00355871886121, + "grad_norm": 0.6743627190589905, + "learning_rate": 1.0230476877475854e-05, + "loss": 0.6013021469116211, + "step": 5640 + }, + { + "epoch": 1.0053380782918149, + "grad_norm": 0.698407769203186, + "learning_rate": 1.0202027086490154e-05, + "loss": 0.6084145069122314, + "step": 5650 + }, + { + "epoch": 1.00711743772242, + "grad_norm": 0.7206895351409912, + "learning_rate": 1.0173575659551137e-05, + "loss": 0.642551326751709, + "step": 5660 + }, + { + "epoch": 1.008896797153025, + "grad_norm": 0.8103647232055664, + "learning_rate": 1.0145122827049725e-05, + "loss": 0.5881889820098877, + "step": 5670 + }, + { + "epoch": 1.01067615658363, + "grad_norm": 0.7496983408927917, + "learning_rate": 1.0116668819388218e-05, + "loss": 0.6021871089935302, + "step": 5680 + }, + { + "epoch": 1.0124555160142348, + "grad_norm": 0.6843361258506775, + "learning_rate": 1.0088213866978435e-05, + "loss": 0.6468405723571777, + "step": 5690 + }, + { + "epoch": 1.0142348754448398, + "grad_norm": 0.6858223676681519, + "learning_rate": 1.0059758200239842e-05, + "loss": 0.6358915328979492, + "step": 5700 + }, + { + "epoch": 1.0160142348754448, + "grad_norm": 0.9082915186882019, + "learning_rate": 1.0031302049597691e-05, + "loss": 0.6340798377990723, + "step": 5710 + }, + { + "epoch": 1.0177935943060499, + "grad_norm": 0.6842049360275269, + "learning_rate": 1.0002845645481152e-05, + "loss": 0.6286153793334961, + "step": 5720 + }, + { + "epoch": 1.019572953736655, + "grad_norm": 0.7649215459823608, + "learning_rate": 9.974389218321453e-06, + "loss": 0.6225019931793213, + "step": 5730 + }, + { + "epoch": 1.0213523131672597, + "grad_norm": 0.6515229344367981, + "learning_rate": 9.945932998549996e-06, + "loss": 0.6120016574859619, + "step": 5740 + }, + { + "epoch": 1.0231316725978647, + "grad_norm": 0.5917989015579224, + "learning_rate": 9.917477216596521e-06, + "loss": 0.6404437065124512, + "step": 5750 + }, + { + "epoch": 1.0249110320284698, + "grad_norm": 0.7759416699409485, + "learning_rate": 9.889022102887205e-06, + "loss": 0.6269901275634766, + "step": 5760 + }, + { + "epoch": 1.0266903914590748, + "grad_norm": 0.7149049043655396, + "learning_rate": 9.860567887842827e-06, + "loss": 0.6032210350036621, + "step": 5770 + }, + { + "epoch": 1.0284697508896796, + "grad_norm": 0.6775475144386292, + "learning_rate": 9.832114801876877e-06, + "loss": 0.6163185119628907, + "step": 5780 + }, + { + "epoch": 1.0302491103202847, + "grad_norm": 0.7221093773841858, + "learning_rate": 9.80366307539372e-06, + "loss": 0.5833635330200195, + "step": 5790 + }, + { + "epoch": 1.0320284697508897, + "grad_norm": 0.7127135396003723, + "learning_rate": 9.775212938786689e-06, + "loss": 0.6052863597869873, + "step": 5800 + }, + { + "epoch": 1.0338078291814947, + "grad_norm": 0.7285045981407166, + "learning_rate": 9.746764622436265e-06, + "loss": 0.6380964756011963, + "step": 5810 + }, + { + "epoch": 1.0355871886120998, + "grad_norm": 0.7396540641784668, + "learning_rate": 9.718318356708172e-06, + "loss": 0.6616352081298829, + "step": 5820 + }, + { + "epoch": 1.0373665480427046, + "grad_norm": 0.6225027441978455, + "learning_rate": 9.689874371951541e-06, + "loss": 0.6689383029937744, + "step": 5830 + }, + { + "epoch": 1.0391459074733096, + "grad_norm": 0.8353135585784912, + "learning_rate": 9.661432898497024e-06, + "loss": 0.6042938232421875, + "step": 5840 + }, + { + "epoch": 1.0409252669039146, + "grad_norm": 0.6610475778579712, + "learning_rate": 9.632994166654941e-06, + "loss": 0.6156399726867676, + "step": 5850 + }, + { + "epoch": 1.0427046263345197, + "grad_norm": 0.7322497963905334, + "learning_rate": 9.604558406713413e-06, + "loss": 0.6352502822875976, + "step": 5860 + }, + { + "epoch": 1.0444839857651245, + "grad_norm": 0.7838056087493896, + "learning_rate": 9.576125848936484e-06, + "loss": 0.6298691272735596, + "step": 5870 + }, + { + "epoch": 1.0462633451957295, + "grad_norm": 0.81569504737854, + "learning_rate": 9.547696723562289e-06, + "loss": 0.6307301998138428, + "step": 5880 + }, + { + "epoch": 1.0480427046263345, + "grad_norm": 0.8897931575775146, + "learning_rate": 9.519271260801146e-06, + "loss": 0.6494285106658936, + "step": 5890 + }, + { + "epoch": 1.0498220640569396, + "grad_norm": 0.7175611257553101, + "learning_rate": 9.490849690833732e-06, + "loss": 0.5930293083190918, + "step": 5900 + }, + { + "epoch": 1.0516014234875444, + "grad_norm": 0.6708109378814697, + "learning_rate": 9.462432243809191e-06, + "loss": 0.595799732208252, + "step": 5910 + }, + { + "epoch": 1.0533807829181494, + "grad_norm": 0.6808570623397827, + "learning_rate": 9.434019149843285e-06, + "loss": 0.6564432621002197, + "step": 5920 + }, + { + "epoch": 1.0551601423487544, + "grad_norm": 0.8115124106407166, + "learning_rate": 9.405610639016522e-06, + "loss": 0.6025516033172608, + "step": 5930 + }, + { + "epoch": 1.0569395017793595, + "grad_norm": 0.7574615478515625, + "learning_rate": 9.377206941372306e-06, + "loss": 0.6670190811157226, + "step": 5940 + }, + { + "epoch": 1.0587188612099645, + "grad_norm": 0.8795053362846375, + "learning_rate": 9.34880828691505e-06, + "loss": 0.6606448650360107, + "step": 5950 + }, + { + "epoch": 1.0604982206405693, + "grad_norm": 0.7287918925285339, + "learning_rate": 9.320414905608348e-06, + "loss": 0.6295819759368897, + "step": 5960 + }, + { + "epoch": 1.0622775800711743, + "grad_norm": 0.7272006869316101, + "learning_rate": 9.292027027373075e-06, + "loss": 0.6425396919250488, + "step": 5970 + }, + { + "epoch": 1.0640569395017794, + "grad_norm": 0.6797536015510559, + "learning_rate": 9.263644882085564e-06, + "loss": 0.5847621917724609, + "step": 5980 + }, + { + "epoch": 1.0658362989323844, + "grad_norm": 0.6698694229125977, + "learning_rate": 9.235268699575704e-06, + "loss": 0.6509596824645996, + "step": 5990 + }, + { + "epoch": 1.0676156583629894, + "grad_norm": 0.7406882643699646, + "learning_rate": 9.206898709625109e-06, + "loss": 0.6330312252044678, + "step": 6000 + }, + { + "epoch": 1.0676156583629894, + "eval_loss": 0.6239920854568481, + "eval_runtime": 411.471, + "eval_samples_per_second": 12.222, + "eval_steps_per_second": 6.112, + "step": 6000 + }, + { + "epoch": 1.0693950177935942, + "grad_norm": 0.695398211479187, + "learning_rate": 9.17853514196525e-06, + "loss": 0.6093502998352051, + "step": 6010 + }, + { + "epoch": 1.0711743772241993, + "grad_norm": 0.839644730091095, + "learning_rate": 9.150178226275584e-06, + "loss": 0.6351033687591553, + "step": 6020 + }, + { + "epoch": 1.0729537366548043, + "grad_norm": 0.6793309450149536, + "learning_rate": 9.121828192181716e-06, + "loss": 0.6527360916137696, + "step": 6030 + }, + { + "epoch": 1.0747330960854093, + "grad_norm": 0.8601770401000977, + "learning_rate": 9.093485269253508e-06, + "loss": 0.6375271320343018, + "step": 6040 + }, + { + "epoch": 1.0765124555160142, + "grad_norm": 0.8268119692802429, + "learning_rate": 9.065149687003256e-06, + "loss": 0.6043989181518554, + "step": 6050 + }, + { + "epoch": 1.0782918149466192, + "grad_norm": 0.6872331500053406, + "learning_rate": 9.0368216748838e-06, + "loss": 0.5880636215209961, + "step": 6060 + }, + { + "epoch": 1.0800711743772242, + "grad_norm": 0.7637034058570862, + "learning_rate": 9.008501462286682e-06, + "loss": 0.6080229759216309, + "step": 6070 + }, + { + "epoch": 1.0818505338078293, + "grad_norm": 0.7357282042503357, + "learning_rate": 8.980189278540294e-06, + "loss": 0.6133236408233642, + "step": 6080 + }, + { + "epoch": 1.083629893238434, + "grad_norm": 0.7250421643257141, + "learning_rate": 8.951885352908006e-06, + "loss": 0.6319203853607178, + "step": 6090 + }, + { + "epoch": 1.085409252669039, + "grad_norm": 0.6590055227279663, + "learning_rate": 8.92358991458631e-06, + "loss": 0.649744987487793, + "step": 6100 + }, + { + "epoch": 1.0871886120996441, + "grad_norm": 0.7239564061164856, + "learning_rate": 8.895303192702988e-06, + "loss": 0.6277432918548584, + "step": 6110 + }, + { + "epoch": 1.0889679715302492, + "grad_norm": 0.8234860301017761, + "learning_rate": 8.867025416315221e-06, + "loss": 0.6343496799468994, + "step": 6120 + }, + { + "epoch": 1.0907473309608542, + "grad_norm": 0.8464725613594055, + "learning_rate": 8.838756814407766e-06, + "loss": 0.6384105205535888, + "step": 6130 + }, + { + "epoch": 1.092526690391459, + "grad_norm": 0.8043434619903564, + "learning_rate": 8.810497615891078e-06, + "loss": 0.6212152957916259, + "step": 6140 + }, + { + "epoch": 1.094306049822064, + "grad_norm": 0.7087031602859497, + "learning_rate": 8.782248049599468e-06, + "loss": 0.5984238624572754, + "step": 6150 + }, + { + "epoch": 1.096085409252669, + "grad_norm": 0.7631204724311829, + "learning_rate": 8.754008344289253e-06, + "loss": 0.6375543594360351, + "step": 6160 + }, + { + "epoch": 1.097864768683274, + "grad_norm": 0.6403396725654602, + "learning_rate": 8.725778728636893e-06, + "loss": 0.6312360763549805, + "step": 6170 + }, + { + "epoch": 1.099644128113879, + "grad_norm": 0.8257510662078857, + "learning_rate": 8.697559431237153e-06, + "loss": 0.6133918762207031, + "step": 6180 + }, + { + "epoch": 1.101423487544484, + "grad_norm": 0.6782218813896179, + "learning_rate": 8.66935068060123e-06, + "loss": 0.6308522701263428, + "step": 6190 + }, + { + "epoch": 1.103202846975089, + "grad_norm": 0.7291736602783203, + "learning_rate": 8.641152705154935e-06, + "loss": 0.6390894889831543, + "step": 6200 + }, + { + "epoch": 1.104982206405694, + "grad_norm": 0.7014942169189453, + "learning_rate": 8.612965733236811e-06, + "loss": 0.6089354038238526, + "step": 6210 + }, + { + "epoch": 1.106761565836299, + "grad_norm": 0.6617055535316467, + "learning_rate": 8.584789993096303e-06, + "loss": 0.6288234710693359, + "step": 6220 + }, + { + "epoch": 1.1085409252669038, + "grad_norm": 0.7303082942962646, + "learning_rate": 8.5566257128919e-06, + "loss": 0.6383206367492675, + "step": 6230 + }, + { + "epoch": 1.1103202846975089, + "grad_norm": 0.7110517024993896, + "learning_rate": 8.528473120689302e-06, + "loss": 0.6590532779693603, + "step": 6240 + }, + { + "epoch": 1.112099644128114, + "grad_norm": 0.779214084148407, + "learning_rate": 8.50033244445955e-06, + "loss": 0.6261368274688721, + "step": 6250 + }, + { + "epoch": 1.113879003558719, + "grad_norm": 0.819240152835846, + "learning_rate": 8.472203912077205e-06, + "loss": 0.6361266136169433, + "step": 6260 + }, + { + "epoch": 1.1156583629893237, + "grad_norm": 0.7131257653236389, + "learning_rate": 8.444087751318484e-06, + "loss": 0.6407057285308838, + "step": 6270 + }, + { + "epoch": 1.1174377224199288, + "grad_norm": 0.9261246919631958, + "learning_rate": 8.415984189859418e-06, + "loss": 0.5832745552062988, + "step": 6280 + }, + { + "epoch": 1.1192170818505338, + "grad_norm": 0.8466585278511047, + "learning_rate": 8.38789345527403e-06, + "loss": 0.6238195419311523, + "step": 6290 + }, + { + "epoch": 1.1209964412811388, + "grad_norm": 0.7208281755447388, + "learning_rate": 8.359815775032457e-06, + "loss": 0.6571403980255127, + "step": 6300 + }, + { + "epoch": 1.1227758007117439, + "grad_norm": 0.6916828155517578, + "learning_rate": 8.331751376499131e-06, + "loss": 0.6353542327880859, + "step": 6310 + }, + { + "epoch": 1.1245551601423487, + "grad_norm": 0.804423987865448, + "learning_rate": 8.303700486930935e-06, + "loss": 0.6343297481536865, + "step": 6320 + }, + { + "epoch": 1.1263345195729537, + "grad_norm": 0.8437130451202393, + "learning_rate": 8.275663333475365e-06, + "loss": 0.6212164878845214, + "step": 6330 + }, + { + "epoch": 1.1281138790035588, + "grad_norm": 0.8472525477409363, + "learning_rate": 8.247640143168675e-06, + "loss": 0.6239662647247315, + "step": 6340 + }, + { + "epoch": 1.1298932384341638, + "grad_norm": 0.696539044380188, + "learning_rate": 8.219631142934062e-06, + "loss": 0.6344510078430176, + "step": 6350 + }, + { + "epoch": 1.1316725978647686, + "grad_norm": 0.7422542572021484, + "learning_rate": 8.191636559579802e-06, + "loss": 0.6504097938537597, + "step": 6360 + }, + { + "epoch": 1.1334519572953736, + "grad_norm": 0.801693856716156, + "learning_rate": 8.163656619797444e-06, + "loss": 0.6314446449279785, + "step": 6370 + }, + { + "epoch": 1.1352313167259787, + "grad_norm": 0.786159098148346, + "learning_rate": 8.135691550159943e-06, + "loss": 0.6034214973449707, + "step": 6380 + }, + { + "epoch": 1.1370106761565837, + "grad_norm": 0.7948549389839172, + "learning_rate": 8.107741577119853e-06, + "loss": 0.653587532043457, + "step": 6390 + }, + { + "epoch": 1.1387900355871885, + "grad_norm": 0.6635374426841736, + "learning_rate": 8.079806927007469e-06, + "loss": 0.6300637722015381, + "step": 6400 + }, + { + "epoch": 1.1405693950177935, + "grad_norm": 0.7938548922538757, + "learning_rate": 8.051887826029006e-06, + "loss": 0.6417815685272217, + "step": 6410 + }, + { + "epoch": 1.1423487544483986, + "grad_norm": 0.7719388008117676, + "learning_rate": 8.023984500264782e-06, + "loss": 0.624653434753418, + "step": 6420 + }, + { + "epoch": 1.1441281138790036, + "grad_norm": 0.8489981293678284, + "learning_rate": 7.996097175667352e-06, + "loss": 0.6066908836364746, + "step": 6430 + }, + { + "epoch": 1.1459074733096086, + "grad_norm": 0.7810853123664856, + "learning_rate": 7.968226078059716e-06, + "loss": 0.5920337677001953, + "step": 6440 + }, + { + "epoch": 1.1476868327402134, + "grad_norm": 0.7887650728225708, + "learning_rate": 7.940371433133459e-06, + "loss": 0.6641497135162353, + "step": 6450 + }, + { + "epoch": 1.1494661921708185, + "grad_norm": 0.7652443051338196, + "learning_rate": 7.912533466446947e-06, + "loss": 0.6473179340362549, + "step": 6460 + }, + { + "epoch": 1.1512455516014235, + "grad_norm": 0.7206646800041199, + "learning_rate": 7.88471240342348e-06, + "loss": 0.5841953754425049, + "step": 6470 + }, + { + "epoch": 1.1530249110320285, + "grad_norm": 0.7867498993873596, + "learning_rate": 7.856908469349495e-06, + "loss": 0.6486004829406739, + "step": 6480 + }, + { + "epoch": 1.1548042704626336, + "grad_norm": 0.8296671509742737, + "learning_rate": 7.829121889372702e-06, + "loss": 0.6224873065948486, + "step": 6490 + }, + { + "epoch": 1.1565836298932384, + "grad_norm": 0.8746348023414612, + "learning_rate": 7.801352888500304e-06, + "loss": 0.640509843826294, + "step": 6500 + }, + { + "epoch": 1.1565836298932384, + "eval_loss": 0.620580792427063, + "eval_runtime": 411.917, + "eval_samples_per_second": 12.209, + "eval_steps_per_second": 6.106, + "step": 6500 + }, + { + "epoch": 1.1583629893238434, + "grad_norm": 0.7378506660461426, + "learning_rate": 7.773601691597135e-06, + "loss": 0.6328612327575683, + "step": 6510 + }, + { + "epoch": 1.1601423487544484, + "grad_norm": 0.7263290882110596, + "learning_rate": 7.745868523383876e-06, + "loss": 0.6071421146392822, + "step": 6520 + }, + { + "epoch": 1.1619217081850535, + "grad_norm": 0.8653813004493713, + "learning_rate": 7.7181536084352e-06, + "loss": 0.6067211627960205, + "step": 6530 + }, + { + "epoch": 1.1637010676156583, + "grad_norm": 0.9452764987945557, + "learning_rate": 7.690457171177984e-06, + "loss": 0.6291831970214844, + "step": 6540 + }, + { + "epoch": 1.1654804270462633, + "grad_norm": 0.65375155210495, + "learning_rate": 7.66277943588947e-06, + "loss": 0.6606705665588379, + "step": 6550 + }, + { + "epoch": 1.1672597864768683, + "grad_norm": 0.978212833404541, + "learning_rate": 7.635120626695456e-06, + "loss": 0.5956210613250732, + "step": 6560 + }, + { + "epoch": 1.1690391459074734, + "grad_norm": 0.7400839328765869, + "learning_rate": 7.607480967568497e-06, + "loss": 0.6242622852325439, + "step": 6570 + }, + { + "epoch": 1.1708185053380782, + "grad_norm": 1.0911694765090942, + "learning_rate": 7.579860682326055e-06, + "loss": 0.6239947319030762, + "step": 6580 + }, + { + "epoch": 1.1725978647686832, + "grad_norm": 0.7858565449714661, + "learning_rate": 7.552259994628728e-06, + "loss": 0.6366580963134766, + "step": 6590 + }, + { + "epoch": 1.1743772241992882, + "grad_norm": 0.8578958511352539, + "learning_rate": 7.5246791279784056e-06, + "loss": 0.6067252635955811, + "step": 6600 + }, + { + "epoch": 1.1761565836298933, + "grad_norm": 0.7615208029747009, + "learning_rate": 7.4971183057164785e-06, + "loss": 0.6032675266265869, + "step": 6610 + }, + { + "epoch": 1.1779359430604983, + "grad_norm": 0.8318284749984741, + "learning_rate": 7.469577751022024e-06, + "loss": 0.5974441528320312, + "step": 6620 + }, + { + "epoch": 1.1797153024911031, + "grad_norm": 0.882125198841095, + "learning_rate": 7.442057686909998e-06, + "loss": 0.6573843002319336, + "step": 6630 + }, + { + "epoch": 1.1814946619217082, + "grad_norm": 0.7975313663482666, + "learning_rate": 7.414558336229426e-06, + "loss": 0.6807239055633545, + "step": 6640 + }, + { + "epoch": 1.1832740213523132, + "grad_norm": 0.7822281122207642, + "learning_rate": 7.387079921661614e-06, + "loss": 0.6663013458251953, + "step": 6650 + }, + { + "epoch": 1.1850533807829182, + "grad_norm": 0.8564392328262329, + "learning_rate": 7.359622665718317e-06, + "loss": 0.6218353271484375, + "step": 6660 + }, + { + "epoch": 1.1868327402135233, + "grad_norm": 0.7377424836158752, + "learning_rate": 7.332186790739973e-06, + "loss": 0.5968504905700683, + "step": 6670 + }, + { + "epoch": 1.188612099644128, + "grad_norm": 0.730351984500885, + "learning_rate": 7.304772518893866e-06, + "loss": 0.6158394813537598, + "step": 6680 + }, + { + "epoch": 1.190391459074733, + "grad_norm": 0.7883412837982178, + "learning_rate": 7.277380072172354e-06, + "loss": 0.6515108585357666, + "step": 6690 + }, + { + "epoch": 1.1921708185053381, + "grad_norm": 0.9434972405433655, + "learning_rate": 7.250009672391063e-06, + "loss": 0.6276324272155762, + "step": 6700 + }, + { + "epoch": 1.193950177935943, + "grad_norm": 0.8042780756950378, + "learning_rate": 7.2226615411870796e-06, + "loss": 0.6224194526672363, + "step": 6710 + }, + { + "epoch": 1.195729537366548, + "grad_norm": 0.8094464540481567, + "learning_rate": 7.195335900017181e-06, + "loss": 0.6250015735626221, + "step": 6720 + }, + { + "epoch": 1.197508896797153, + "grad_norm": 0.8359760046005249, + "learning_rate": 7.168032970156011e-06, + "loss": 0.6454795837402344, + "step": 6730 + }, + { + "epoch": 1.199288256227758, + "grad_norm": 0.728487491607666, + "learning_rate": 7.140752972694325e-06, + "loss": 0.6341996192932129, + "step": 6740 + }, + { + "epoch": 1.201067615658363, + "grad_norm": 0.8493318557739258, + "learning_rate": 7.113496128537154e-06, + "loss": 0.6078832626342774, + "step": 6750 + }, + { + "epoch": 1.2028469750889679, + "grad_norm": 0.7177495360374451, + "learning_rate": 7.086262658402059e-06, + "loss": 0.6244680881500244, + "step": 6760 + }, + { + "epoch": 1.204626334519573, + "grad_norm": 0.7511991858482361, + "learning_rate": 7.059052782817317e-06, + "loss": 0.6233652591705322, + "step": 6770 + }, + { + "epoch": 1.206405693950178, + "grad_norm": 1.0159603357315063, + "learning_rate": 7.0318667221201505e-06, + "loss": 0.6203540802001953, + "step": 6780 + }, + { + "epoch": 1.208185053380783, + "grad_norm": 0.9800230264663696, + "learning_rate": 7.004704696454924e-06, + "loss": 0.621929931640625, + "step": 6790 + }, + { + "epoch": 1.209964412811388, + "grad_norm": 0.740306556224823, + "learning_rate": 6.97756692577139e-06, + "loss": 0.5935471534729004, + "step": 6800 + }, + { + "epoch": 1.2117437722419928, + "grad_norm": 0.7026015520095825, + "learning_rate": 6.950453629822874e-06, + "loss": 0.5996420383453369, + "step": 6810 + }, + { + "epoch": 1.2135231316725978, + "grad_norm": 0.7063615918159485, + "learning_rate": 6.923365028164532e-06, + "loss": 0.6140129089355468, + "step": 6820 + }, + { + "epoch": 1.2153024911032029, + "grad_norm": 0.6547632217407227, + "learning_rate": 6.8963013401515365e-06, + "loss": 0.6896812915802002, + "step": 6830 + }, + { + "epoch": 1.217081850533808, + "grad_norm": 0.7407922744750977, + "learning_rate": 6.8692627849373226e-06, + "loss": 0.6203756332397461, + "step": 6840 + }, + { + "epoch": 1.2188612099644127, + "grad_norm": 0.7740945219993591, + "learning_rate": 6.842249581471814e-06, + "loss": 0.6261481761932373, + "step": 6850 + }, + { + "epoch": 1.2206405693950177, + "grad_norm": 0.8180769085884094, + "learning_rate": 6.815261948499628e-06, + "loss": 0.6112605094909668, + "step": 6860 + }, + { + "epoch": 1.2224199288256228, + "grad_norm": 0.7909367084503174, + "learning_rate": 6.788300104558341e-06, + "loss": 0.6290336132049561, + "step": 6870 + }, + { + "epoch": 1.2241992882562278, + "grad_norm": 0.7958800792694092, + "learning_rate": 6.76136426797668e-06, + "loss": 0.6461628913879395, + "step": 6880 + }, + { + "epoch": 1.2259786476868326, + "grad_norm": 0.83876633644104, + "learning_rate": 6.734454656872784e-06, + "loss": 0.6532453060150146, + "step": 6890 + }, + { + "epoch": 1.2277580071174377, + "grad_norm": 0.8222100138664246, + "learning_rate": 6.707571489152418e-06, + "loss": 0.6066458225250244, + "step": 6900 + }, + { + "epoch": 1.2295373665480427, + "grad_norm": 0.7338365316390991, + "learning_rate": 6.680714982507219e-06, + "loss": 0.6187913417816162, + "step": 6910 + }, + { + "epoch": 1.2313167259786477, + "grad_norm": 1.0070669651031494, + "learning_rate": 6.653885354412935e-06, + "loss": 0.638453197479248, + "step": 6920 + }, + { + "epoch": 1.2330960854092528, + "grad_norm": 0.8759623169898987, + "learning_rate": 6.627082822127657e-06, + "loss": 0.6360817909240722, + "step": 6930 + }, + { + "epoch": 1.2348754448398576, + "grad_norm": 0.7884645462036133, + "learning_rate": 6.600307602690057e-06, + "loss": 0.6332939147949219, + "step": 6940 + }, + { + "epoch": 1.2366548042704626, + "grad_norm": 0.7928617596626282, + "learning_rate": 6.573559912917648e-06, + "loss": 0.6071663856506347, + "step": 6950 + }, + { + "epoch": 1.2384341637010676, + "grad_norm": 0.8969078063964844, + "learning_rate": 6.546839969405003e-06, + "loss": 0.6088360786437989, + "step": 6960 + }, + { + "epoch": 1.2402135231316727, + "grad_norm": 0.6923132538795471, + "learning_rate": 6.520147988522021e-06, + "loss": 0.602042293548584, + "step": 6970 + }, + { + "epoch": 1.2419928825622777, + "grad_norm": 0.8759524822235107, + "learning_rate": 6.493484186412172e-06, + "loss": 0.6699591636657715, + "step": 6980 + }, + { + "epoch": 1.2437722419928825, + "grad_norm": 0.9501714706420898, + "learning_rate": 6.466848778990734e-06, + "loss": 0.631440258026123, + "step": 6990 + }, + { + "epoch": 1.2455516014234875, + "grad_norm": 0.8564425706863403, + "learning_rate": 6.440241981943063e-06, + "loss": 0.6175348281860351, + "step": 7000 + }, + { + "epoch": 1.2455516014234875, + "eval_loss": 0.6177652478218079, + "eval_runtime": 410.1105, + "eval_samples_per_second": 12.263, + "eval_steps_per_second": 6.132, + "step": 7000 + }, + { + "epoch": 1.2473309608540926, + "grad_norm": 0.8425670266151428, + "learning_rate": 6.413664010722825e-06, + "loss": 0.6334220886230468, + "step": 7010 + }, + { + "epoch": 1.2491103202846976, + "grad_norm": 0.6265377998352051, + "learning_rate": 6.3871150805502765e-06, + "loss": 0.6206889629364014, + "step": 7020 + }, + { + "epoch": 1.2508896797153026, + "grad_norm": 0.7120670676231384, + "learning_rate": 6.360595406410497e-06, + "loss": 0.609261417388916, + "step": 7030 + }, + { + "epoch": 1.2526690391459074, + "grad_norm": 0.7951711416244507, + "learning_rate": 6.334105203051673e-06, + "loss": 0.6361230850219727, + "step": 7040 + }, + { + "epoch": 1.2544483985765125, + "grad_norm": 0.9487454891204834, + "learning_rate": 6.307644684983329e-06, + "loss": 0.5716474533081055, + "step": 7050 + }, + { + "epoch": 1.2562277580071175, + "grad_norm": 0.7447927594184875, + "learning_rate": 6.281214066474623e-06, + "loss": 0.6536383152008056, + "step": 7060 + }, + { + "epoch": 1.2580071174377223, + "grad_norm": 0.8137961626052856, + "learning_rate": 6.254813561552585e-06, + "loss": 0.6361209869384765, + "step": 7070 + }, + { + "epoch": 1.2597864768683273, + "grad_norm": 0.7065114378929138, + "learning_rate": 6.2284433840004e-06, + "loss": 0.6489431858062744, + "step": 7080 + }, + { + "epoch": 1.2615658362989324, + "grad_norm": 0.7409806847572327, + "learning_rate": 6.202103747355667e-06, + "loss": 0.6485908031463623, + "step": 7090 + }, + { + "epoch": 1.2633451957295374, + "grad_norm": 0.662041962146759, + "learning_rate": 6.175794864908672e-06, + "loss": 0.6204521179199218, + "step": 7100 + }, + { + "epoch": 1.2651245551601424, + "grad_norm": 0.7687442898750305, + "learning_rate": 6.1495169497006755e-06, + "loss": 0.6361071109771729, + "step": 7110 + }, + { + "epoch": 1.2669039145907472, + "grad_norm": 0.9222472906112671, + "learning_rate": 6.123270214522159e-06, + "loss": 0.6162978172302246, + "step": 7120 + }, + { + "epoch": 1.2686832740213523, + "grad_norm": 0.819275975227356, + "learning_rate": 6.097054871911132e-06, + "loss": 0.6023138523101806, + "step": 7130 + }, + { + "epoch": 1.2704626334519573, + "grad_norm": 0.8842992782592773, + "learning_rate": 6.070871134151386e-06, + "loss": 0.658526086807251, + "step": 7140 + }, + { + "epoch": 1.2722419928825623, + "grad_norm": 0.7101578116416931, + "learning_rate": 6.044719213270791e-06, + "loss": 0.6004268169403076, + "step": 7150 + }, + { + "epoch": 1.2740213523131674, + "grad_norm": 0.7537800669670105, + "learning_rate": 6.018599321039569e-06, + "loss": 0.6223325729370117, + "step": 7160 + }, + { + "epoch": 1.2758007117437722, + "grad_norm": 0.6810758709907532, + "learning_rate": 5.9925116689685925e-06, + "loss": 0.6302705764770508, + "step": 7170 + }, + { + "epoch": 1.2775800711743772, + "grad_norm": 0.8179675936698914, + "learning_rate": 5.966456468307653e-06, + "loss": 0.6177227020263671, + "step": 7180 + }, + { + "epoch": 1.2793594306049823, + "grad_norm": 0.7833004593849182, + "learning_rate": 5.940433930043772e-06, + "loss": 0.6414045333862305, + "step": 7190 + }, + { + "epoch": 1.281138790035587, + "grad_norm": 0.9323675036430359, + "learning_rate": 5.914444264899466e-06, + "loss": 0.603922176361084, + "step": 7200 + }, + { + "epoch": 1.282918149466192, + "grad_norm": 0.844142735004425, + "learning_rate": 5.888487683331072e-06, + "loss": 0.6308831691741943, + "step": 7210 + }, + { + "epoch": 1.2846975088967971, + "grad_norm": 0.8182924389839172, + "learning_rate": 5.862564395527013e-06, + "loss": 0.6259811401367188, + "step": 7220 + }, + { + "epoch": 1.2864768683274022, + "grad_norm": 0.8091204166412354, + "learning_rate": 5.836674611406117e-06, + "loss": 0.6158743858337402, + "step": 7230 + }, + { + "epoch": 1.2882562277580072, + "grad_norm": 0.9385861754417419, + "learning_rate": 5.810818540615903e-06, + "loss": 0.6304144382476806, + "step": 7240 + }, + { + "epoch": 1.290035587188612, + "grad_norm": 1.0828267335891724, + "learning_rate": 5.784996392530892e-06, + "loss": 0.6548801898956299, + "step": 7250 + }, + { + "epoch": 1.291814946619217, + "grad_norm": 0.7489603161811829, + "learning_rate": 5.759208376250916e-06, + "loss": 0.6102576732635498, + "step": 7260 + }, + { + "epoch": 1.293594306049822, + "grad_norm": 0.7216728329658508, + "learning_rate": 5.733454700599408e-06, + "loss": 0.6362377166748047, + "step": 7270 + }, + { + "epoch": 1.295373665480427, + "grad_norm": 0.8442836403846741, + "learning_rate": 5.707735574121732e-06, + "loss": 0.6299904346466064, + "step": 7280 + }, + { + "epoch": 1.2971530249110321, + "grad_norm": 0.6905636787414551, + "learning_rate": 5.68205120508347e-06, + "loss": 0.6162694454193115, + "step": 7290 + }, + { + "epoch": 1.298932384341637, + "grad_norm": 0.8710734844207764, + "learning_rate": 5.656401801468764e-06, + "loss": 0.5905096054077148, + "step": 7300 + }, + { + "epoch": 1.300711743772242, + "grad_norm": 0.7497642040252686, + "learning_rate": 5.630787570978601e-06, + "loss": 0.6081714630126953, + "step": 7310 + }, + { + "epoch": 1.302491103202847, + "grad_norm": 0.7930024266242981, + "learning_rate": 5.605208721029162e-06, + "loss": 0.6393218517303467, + "step": 7320 + }, + { + "epoch": 1.304270462633452, + "grad_norm": 0.622921347618103, + "learning_rate": 5.579665458750119e-06, + "loss": 0.6169804096221924, + "step": 7330 + }, + { + "epoch": 1.306049822064057, + "grad_norm": 0.7385269403457642, + "learning_rate": 5.554157990982964e-06, + "loss": 0.6091341495513916, + "step": 7340 + }, + { + "epoch": 1.3078291814946619, + "grad_norm": 0.839229941368103, + "learning_rate": 5.528686524279336e-06, + "loss": 0.6216320037841797, + "step": 7350 + }, + { + "epoch": 1.309608540925267, + "grad_norm": 0.8395034670829773, + "learning_rate": 5.503251264899361e-06, + "loss": 0.6268163204193116, + "step": 7360 + }, + { + "epoch": 1.311387900355872, + "grad_norm": 0.7301527261734009, + "learning_rate": 5.477852418809955e-06, + "loss": 0.6226423740386963, + "step": 7370 + }, + { + "epoch": 1.3131672597864767, + "grad_norm": 0.9010869860649109, + "learning_rate": 5.452490191683171e-06, + "loss": 0.6155654907226562, + "step": 7380 + }, + { + "epoch": 1.3149466192170818, + "grad_norm": 0.7506121397018433, + "learning_rate": 5.427164788894543e-06, + "loss": 0.6015159130096436, + "step": 7390 + }, + { + "epoch": 1.3167259786476868, + "grad_norm": 0.8243475556373596, + "learning_rate": 5.401876415521402e-06, + "loss": 0.6120582103729248, + "step": 7400 + }, + { + "epoch": 1.3185053380782918, + "grad_norm": 0.877004086971283, + "learning_rate": 5.3766252763412375e-06, + "loss": 0.6368375301361084, + "step": 7410 + }, + { + "epoch": 1.3202846975088969, + "grad_norm": 0.6776864528656006, + "learning_rate": 5.351411575830015e-06, + "loss": 0.5798074722290039, + "step": 7420 + }, + { + "epoch": 1.3220640569395017, + "grad_norm": 0.7413604259490967, + "learning_rate": 5.326235518160543e-06, + "loss": 0.6417413711547851, + "step": 7430 + }, + { + "epoch": 1.3238434163701067, + "grad_norm": 0.680547297000885, + "learning_rate": 5.301097307200804e-06, + "loss": 0.6709810256958008, + "step": 7440 + }, + { + "epoch": 1.3256227758007118, + "grad_norm": 0.8982633352279663, + "learning_rate": 5.275997146512317e-06, + "loss": 0.6244979858398437, + "step": 7450 + }, + { + "epoch": 1.3274021352313168, + "grad_norm": 0.7878574728965759, + "learning_rate": 5.250935239348469e-06, + "loss": 0.6555228233337402, + "step": 7460 + }, + { + "epoch": 1.3291814946619218, + "grad_norm": 0.7958076000213623, + "learning_rate": 5.2259117886528974e-06, + "loss": 0.6336334228515625, + "step": 7470 + }, + { + "epoch": 1.3309608540925266, + "grad_norm": 0.8020284175872803, + "learning_rate": 5.200926997057818e-06, + "loss": 0.6250798225402832, + "step": 7480 + }, + { + "epoch": 1.3327402135231317, + "grad_norm": 0.9492243528366089, + "learning_rate": 5.175981066882403e-06, + "loss": 0.5951490879058838, + "step": 7490 + }, + { + "epoch": 1.3345195729537367, + "grad_norm": 0.8778184056282043, + "learning_rate": 5.15107420013113e-06, + "loss": 0.5949514865875244, + "step": 7500 + }, + { + "epoch": 1.3345195729537367, + "eval_loss": 0.6148350238800049, + "eval_runtime": 411.3736, + "eval_samples_per_second": 12.225, + "eval_steps_per_second": 6.114, + "step": 7500 + }, + { + "epoch": 1.3362989323843417, + "grad_norm": 0.888278603553772, + "learning_rate": 5.126206598492167e-06, + "loss": 0.6134849071502686, + "step": 7510 + }, + { + "epoch": 1.3380782918149468, + "grad_norm": 0.8123595714569092, + "learning_rate": 5.101378463335713e-06, + "loss": 0.6279965877532959, + "step": 7520 + }, + { + "epoch": 1.3398576512455516, + "grad_norm": 0.7082033753395081, + "learning_rate": 5.07658999571238e-06, + "loss": 0.6239530086517334, + "step": 7530 + }, + { + "epoch": 1.3416370106761566, + "grad_norm": 0.8400927782058716, + "learning_rate": 5.051841396351574e-06, + "loss": 0.5917630195617676, + "step": 7540 + }, + { + "epoch": 1.3434163701067616, + "grad_norm": 0.8962631225585938, + "learning_rate": 5.027132865659847e-06, + "loss": 0.6083420753479004, + "step": 7550 + }, + { + "epoch": 1.3451957295373664, + "grad_norm": 0.9693284630775452, + "learning_rate": 5.0024646037193025e-06, + "loss": 0.606035566329956, + "step": 7560 + }, + { + "epoch": 1.3469750889679715, + "grad_norm": 0.8629393577575684, + "learning_rate": 4.977836810285942e-06, + "loss": 0.6264309406280517, + "step": 7570 + }, + { + "epoch": 1.3487544483985765, + "grad_norm": 0.7961446642875671, + "learning_rate": 4.953249684788084e-06, + "loss": 0.6090545654296875, + "step": 7580 + }, + { + "epoch": 1.3505338078291815, + "grad_norm": 0.7653380036354065, + "learning_rate": 4.928703426324712e-06, + "loss": 0.6106627941131592, + "step": 7590 + }, + { + "epoch": 1.3523131672597866, + "grad_norm": 0.8642345070838928, + "learning_rate": 4.9041982336639004e-06, + "loss": 0.5813540458679199, + "step": 7600 + }, + { + "epoch": 1.3540925266903914, + "grad_norm": 0.7263400554656982, + "learning_rate": 4.8797343052411675e-06, + "loss": 0.685100507736206, + "step": 7610 + }, + { + "epoch": 1.3558718861209964, + "grad_norm": 0.8232746720314026, + "learning_rate": 4.855311839157899e-06, + "loss": 0.6377671718597412, + "step": 7620 + }, + { + "epoch": 1.3576512455516014, + "grad_norm": 0.73633873462677, + "learning_rate": 4.830931033179725e-06, + "loss": 0.6779125690460205, + "step": 7630 + }, + { + "epoch": 1.3594306049822065, + "grad_norm": 0.8550395965576172, + "learning_rate": 4.806592084734928e-06, + "loss": 0.6058492660522461, + "step": 7640 + }, + { + "epoch": 1.3612099644128115, + "grad_norm": 0.7828453779220581, + "learning_rate": 4.782295190912831e-06, + "loss": 0.6215229034423828, + "step": 7650 + }, + { + "epoch": 1.3629893238434163, + "grad_norm": 0.8796820640563965, + "learning_rate": 4.758040548462233e-06, + "loss": 0.5914856433868408, + "step": 7660 + }, + { + "epoch": 1.3647686832740213, + "grad_norm": 0.7559328675270081, + "learning_rate": 4.733828353789772e-06, + "loss": 0.6324697017669678, + "step": 7670 + }, + { + "epoch": 1.3665480427046264, + "grad_norm": 0.9165831804275513, + "learning_rate": 4.709658802958366e-06, + "loss": 0.6462049961090088, + "step": 7680 + }, + { + "epoch": 1.3683274021352312, + "grad_norm": 0.9339064359664917, + "learning_rate": 4.685532091685625e-06, + "loss": 0.6461282253265381, + "step": 7690 + }, + { + "epoch": 1.3701067615658362, + "grad_norm": 0.8191850781440735, + "learning_rate": 4.66144841534224e-06, + "loss": 0.6682465553283692, + "step": 7700 + }, + { + "epoch": 1.3718861209964412, + "grad_norm": 0.9715204238891602, + "learning_rate": 4.637407968950434e-06, + "loss": 0.5913180351257324, + "step": 7710 + }, + { + "epoch": 1.3736654804270463, + "grad_norm": 0.7584412693977356, + "learning_rate": 4.613410947182354e-06, + "loss": 0.5695419311523438, + "step": 7720 + }, + { + "epoch": 1.3754448398576513, + "grad_norm": 0.6991924047470093, + "learning_rate": 4.589457544358521e-06, + "loss": 0.6356719493865967, + "step": 7730 + }, + { + "epoch": 1.3772241992882561, + "grad_norm": 0.8555135726928711, + "learning_rate": 4.56554795444623e-06, + "loss": 0.5793665409088135, + "step": 7740 + }, + { + "epoch": 1.3790035587188612, + "grad_norm": 0.9497565031051636, + "learning_rate": 4.5416823710580046e-06, + "loss": 0.6186023712158203, + "step": 7750 + }, + { + "epoch": 1.3807829181494662, + "grad_norm": 0.8282408118247986, + "learning_rate": 4.517860987450007e-06, + "loss": 0.6287422180175781, + "step": 7760 + }, + { + "epoch": 1.3825622775800712, + "grad_norm": 0.7512138485908508, + "learning_rate": 4.494083996520492e-06, + "loss": 0.6511450290679932, + "step": 7770 + }, + { + "epoch": 1.3843416370106763, + "grad_norm": 0.8342724442481995, + "learning_rate": 4.470351590808228e-06, + "loss": 0.6355900764465332, + "step": 7780 + }, + { + "epoch": 1.386120996441281, + "grad_norm": 0.9951474666595459, + "learning_rate": 4.446663962490951e-06, + "loss": 0.6075921058654785, + "step": 7790 + }, + { + "epoch": 1.387900355871886, + "grad_norm": 0.7905099391937256, + "learning_rate": 4.423021303383799e-06, + "loss": 0.5925492286682129, + "step": 7800 + }, + { + "epoch": 1.3896797153024911, + "grad_norm": 0.8992406129837036, + "learning_rate": 4.3994238049377715e-06, + "loss": 0.6204987049102784, + "step": 7810 + }, + { + "epoch": 1.3914590747330962, + "grad_norm": 1.124568223953247, + "learning_rate": 4.375871658238163e-06, + "loss": 0.6461961746215821, + "step": 7820 + }, + { + "epoch": 1.3932384341637012, + "grad_norm": 0.7916883230209351, + "learning_rate": 4.352365054003022e-06, + "loss": 0.6402833461761475, + "step": 7830 + }, + { + "epoch": 1.395017793594306, + "grad_norm": 1.0332368612289429, + "learning_rate": 4.328904182581619e-06, + "loss": 0.6191961288452148, + "step": 7840 + }, + { + "epoch": 1.396797153024911, + "grad_norm": 1.0284537076950073, + "learning_rate": 4.305489233952881e-06, + "loss": 0.634841012954712, + "step": 7850 + }, + { + "epoch": 1.398576512455516, + "grad_norm": 0.8202362656593323, + "learning_rate": 4.282120397723879e-06, + "loss": 0.5904129028320313, + "step": 7860 + }, + { + "epoch": 1.4003558718861209, + "grad_norm": 0.8149864077568054, + "learning_rate": 4.258797863128266e-06, + "loss": 0.6212067127227783, + "step": 7870 + }, + { + "epoch": 1.402135231316726, + "grad_norm": 0.750307023525238, + "learning_rate": 4.235521819024776e-06, + "loss": 0.6193465232849121, + "step": 7880 + }, + { + "epoch": 1.403914590747331, + "grad_norm": 0.719935417175293, + "learning_rate": 4.212292453895658e-06, + "loss": 0.6318532466888428, + "step": 7890 + }, + { + "epoch": 1.405693950177936, + "grad_norm": 0.8568369150161743, + "learning_rate": 4.189109955845186e-06, + "loss": 0.6267857074737548, + "step": 7900 + }, + { + "epoch": 1.407473309608541, + "grad_norm": 0.8335803151130676, + "learning_rate": 4.165974512598102e-06, + "loss": 0.6253261089324951, + "step": 7910 + }, + { + "epoch": 1.4092526690391458, + "grad_norm": 0.9222790598869324, + "learning_rate": 4.142886311498133e-06, + "loss": 0.6295660972595215, + "step": 7920 + }, + { + "epoch": 1.4110320284697508, + "grad_norm": 0.7769973874092102, + "learning_rate": 4.119845539506436e-06, + "loss": 0.6295949935913085, + "step": 7930 + }, + { + "epoch": 1.4128113879003559, + "grad_norm": 0.8693393468856812, + "learning_rate": 4.096852383200106e-06, + "loss": 0.5903539180755615, + "step": 7940 + }, + { + "epoch": 1.414590747330961, + "grad_norm": 0.8232077956199646, + "learning_rate": 4.073907028770671e-06, + "loss": 0.5986703395843506, + "step": 7950 + }, + { + "epoch": 1.416370106761566, + "grad_norm": 0.9507026672363281, + "learning_rate": 4.051009662022559e-06, + "loss": 0.6164099693298339, + "step": 7960 + }, + { + "epoch": 1.4181494661921707, + "grad_norm": 0.9032939672470093, + "learning_rate": 4.028160468371618e-06, + "loss": 0.5873546600341797, + "step": 7970 + }, + { + "epoch": 1.4199288256227758, + "grad_norm": 0.7442892789840698, + "learning_rate": 4.005359632843598e-06, + "loss": 0.6300751209259033, + "step": 7980 + }, + { + "epoch": 1.4217081850533808, + "grad_norm": 0.870611310005188, + "learning_rate": 3.982607340072673e-06, + "loss": 0.6464887142181397, + "step": 7990 + }, + { + "epoch": 1.4234875444839858, + "grad_norm": 0.9346739053726196, + "learning_rate": 3.959903774299914e-06, + "loss": 0.5978512763977051, + "step": 8000 + }, + { + "epoch": 1.4234875444839858, + "eval_loss": 0.6126887202262878, + "eval_runtime": 408.0866, + "eval_samples_per_second": 12.323, + "eval_steps_per_second": 6.163, + "step": 8000 + }, + { + "epoch": 1.4252669039145909, + "grad_norm": 0.7768172025680542, + "learning_rate": 3.937249119371837e-06, + "loss": 0.6360022544860839, + "step": 8010 + }, + { + "epoch": 1.4270462633451957, + "grad_norm": 0.9232711791992188, + "learning_rate": 3.914643558738871e-06, + "loss": 0.6255642414093018, + "step": 8020 + }, + { + "epoch": 1.4288256227758007, + "grad_norm": 0.6983018517494202, + "learning_rate": 3.892087275453913e-06, + "loss": 0.5994945526123047, + "step": 8030 + }, + { + "epoch": 1.4306049822064058, + "grad_norm": 1.073339819908142, + "learning_rate": 3.869580452170813e-06, + "loss": 0.6356189250946045, + "step": 8040 + }, + { + "epoch": 1.4323843416370106, + "grad_norm": 0.6503099799156189, + "learning_rate": 3.847123271142921e-06, + "loss": 0.6280940055847168, + "step": 8050 + }, + { + "epoch": 1.4341637010676156, + "grad_norm": 0.7652536630630493, + "learning_rate": 3.824715914221593e-06, + "loss": 0.633913803100586, + "step": 8060 + }, + { + "epoch": 1.4359430604982206, + "grad_norm": 0.7776927351951599, + "learning_rate": 3.802358562854719e-06, + "loss": 0.6014345169067383, + "step": 8070 + }, + { + "epoch": 1.4377224199288257, + "grad_norm": 0.9608566164970398, + "learning_rate": 3.780051398085274e-06, + "loss": 0.6504060745239257, + "step": 8080 + }, + { + "epoch": 1.4395017793594307, + "grad_norm": 1.0493940114974976, + "learning_rate": 3.7577946005498224e-06, + "loss": 0.6023014545440674, + "step": 8090 + }, + { + "epoch": 1.4412811387900355, + "grad_norm": 0.6732901930809021, + "learning_rate": 3.735588350477083e-06, + "loss": 0.618662166595459, + "step": 8100 + }, + { + "epoch": 1.4430604982206405, + "grad_norm": 0.7948004007339478, + "learning_rate": 3.7134328276864474e-06, + "loss": 0.6384446144104003, + "step": 8110 + }, + { + "epoch": 1.4448398576512456, + "grad_norm": 0.8854271769523621, + "learning_rate": 3.691328211586537e-06, + "loss": 0.6317539215087891, + "step": 8120 + }, + { + "epoch": 1.4466192170818506, + "grad_norm": 0.849312961101532, + "learning_rate": 3.669274681173741e-06, + "loss": 0.632611894607544, + "step": 8130 + }, + { + "epoch": 1.4483985765124556, + "grad_norm": 0.8737841844558716, + "learning_rate": 3.647272415030787e-06, + "loss": 0.6378211498260498, + "step": 8140 + }, + { + "epoch": 1.4501779359430604, + "grad_norm": 0.9041767716407776, + "learning_rate": 3.6253215913252614e-06, + "loss": 0.6225557804107666, + "step": 8150 + }, + { + "epoch": 1.4519572953736655, + "grad_norm": 0.9240919947624207, + "learning_rate": 3.603422387808203e-06, + "loss": 0.6011790752410888, + "step": 8160 + }, + { + "epoch": 1.4537366548042705, + "grad_norm": 0.7381271719932556, + "learning_rate": 3.5815749818126298e-06, + "loss": 0.6100322723388671, + "step": 8170 + }, + { + "epoch": 1.4555160142348753, + "grad_norm": 0.766522228717804, + "learning_rate": 3.559779550252135e-06, + "loss": 0.6223373413085938, + "step": 8180 + }, + { + "epoch": 1.4572953736654806, + "grad_norm": 0.722597599029541, + "learning_rate": 3.5380362696194246e-06, + "loss": 0.6096580982208252, + "step": 8190 + }, + { + "epoch": 1.4590747330960854, + "grad_norm": 0.848345935344696, + "learning_rate": 3.5163453159849158e-06, + "loss": 0.5932037830352783, + "step": 8200 + }, + { + "epoch": 1.4608540925266904, + "grad_norm": 0.8649978637695312, + "learning_rate": 3.4947068649952875e-06, + "loss": 0.616939926147461, + "step": 8210 + }, + { + "epoch": 1.4626334519572954, + "grad_norm": 1.0290734767913818, + "learning_rate": 3.473121091872068e-06, + "loss": 0.5814998149871826, + "step": 8220 + }, + { + "epoch": 1.4644128113879002, + "grad_norm": 1.0387070178985596, + "learning_rate": 3.4515881714102283e-06, + "loss": 0.602289867401123, + "step": 8230 + }, + { + "epoch": 1.4661921708185053, + "grad_norm": 0.7532253861427307, + "learning_rate": 3.4301082779767394e-06, + "loss": 0.6335249423980713, + "step": 8240 + }, + { + "epoch": 1.4679715302491103, + "grad_norm": 0.8629032373428345, + "learning_rate": 3.4086815855091906e-06, + "loss": 0.6227278709411621, + "step": 8250 + }, + { + "epoch": 1.4697508896797153, + "grad_norm": 0.9435595870018005, + "learning_rate": 3.3873082675143533e-06, + "loss": 0.6437982559204102, + "step": 8260 + }, + { + "epoch": 1.4715302491103204, + "grad_norm": 0.9069509506225586, + "learning_rate": 3.3659884970667955e-06, + "loss": 0.5948871612548828, + "step": 8270 + }, + { + "epoch": 1.4733096085409252, + "grad_norm": 0.7571467757225037, + "learning_rate": 3.344722446807469e-06, + "loss": 0.6302636623382568, + "step": 8280 + }, + { + "epoch": 1.4750889679715302, + "grad_norm": 0.7709717154502869, + "learning_rate": 3.3235102889423263e-06, + "loss": 0.6287346839904785, + "step": 8290 + }, + { + "epoch": 1.4768683274021353, + "grad_norm": 0.9050244688987732, + "learning_rate": 3.302352195240901e-06, + "loss": 0.6431692123413086, + "step": 8300 + }, + { + "epoch": 1.4786476868327403, + "grad_norm": 0.8429788947105408, + "learning_rate": 3.281248337034947e-06, + "loss": 0.6204410076141358, + "step": 8310 + }, + { + "epoch": 1.4804270462633453, + "grad_norm": 0.7680916786193848, + "learning_rate": 3.2601988852170207e-06, + "loss": 0.6185726642608642, + "step": 8320 + }, + { + "epoch": 1.4822064056939501, + "grad_norm": 0.7400604486465454, + "learning_rate": 3.2392040102391278e-06, + "loss": 0.711566686630249, + "step": 8330 + }, + { + "epoch": 1.4839857651245552, + "grad_norm": 0.7004432082176208, + "learning_rate": 3.2182638821113156e-06, + "loss": 0.5973163604736328, + "step": 8340 + }, + { + "epoch": 1.4857651245551602, + "grad_norm": 0.8438174724578857, + "learning_rate": 3.1973786704003086e-06, + "loss": 0.6273365497589112, + "step": 8350 + }, + { + "epoch": 1.487544483985765, + "grad_norm": 0.9420303702354431, + "learning_rate": 3.1765485442281453e-06, + "loss": 0.6080061435699463, + "step": 8360 + }, + { + "epoch": 1.48932384341637, + "grad_norm": 0.8402919769287109, + "learning_rate": 3.1557736722707843e-06, + "loss": 0.6027824401855468, + "step": 8370 + }, + { + "epoch": 1.491103202846975, + "grad_norm": 1.0150177478790283, + "learning_rate": 3.1350542227567693e-06, + "loss": 0.629277515411377, + "step": 8380 + }, + { + "epoch": 1.49288256227758, + "grad_norm": 0.8253095149993896, + "learning_rate": 3.1143903634658314e-06, + "loss": 0.6055563926696778, + "step": 8390 + }, + { + "epoch": 1.4946619217081851, + "grad_norm": 0.7219249606132507, + "learning_rate": 3.093782261727567e-06, + "loss": 0.6305870532989502, + "step": 8400 + }, + { + "epoch": 1.49644128113879, + "grad_norm": 0.7794970870018005, + "learning_rate": 3.073230084420051e-06, + "loss": 0.6157556533813476, + "step": 8410 + }, + { + "epoch": 1.498220640569395, + "grad_norm": 0.8257132172584534, + "learning_rate": 3.0527339979685068e-06, + "loss": 0.628666877746582, + "step": 8420 + }, + { + "epoch": 1.5, + "grad_norm": 0.7853142619132996, + "learning_rate": 3.0322941683439455e-06, + "loss": 0.6387495040893555, + "step": 8430 + }, + { + "epoch": 1.501779359430605, + "grad_norm": 0.9834029674530029, + "learning_rate": 3.011910761061837e-06, + "loss": 0.6235998153686524, + "step": 8440 + }, + { + "epoch": 1.50355871886121, + "grad_norm": 0.7815040349960327, + "learning_rate": 2.99158394118075e-06, + "loss": 0.6229785919189453, + "step": 8450 + }, + { + "epoch": 1.5053380782918149, + "grad_norm": 0.6993287801742554, + "learning_rate": 2.9713138733010373e-06, + "loss": 0.6498080730438233, + "step": 8460 + }, + { + "epoch": 1.50711743772242, + "grad_norm": 0.8505419492721558, + "learning_rate": 2.951100721563479e-06, + "loss": 0.6163151264190674, + "step": 8470 + }, + { + "epoch": 1.508896797153025, + "grad_norm": 0.7740962505340576, + "learning_rate": 2.930944649647971e-06, + "loss": 0.6463754177093506, + "step": 8480 + }, + { + "epoch": 1.5106761565836297, + "grad_norm": 0.9081360101699829, + "learning_rate": 2.9108458207722013e-06, + "loss": 0.5963332653045654, + "step": 8490 + }, + { + "epoch": 1.512455516014235, + "grad_norm": 0.7486013770103455, + "learning_rate": 2.8908043976903065e-06, + "loss": 0.5907905101776123, + "step": 8500 + }, + { + "epoch": 1.512455516014235, + "eval_loss": 0.6111557483673096, + "eval_runtime": 408.1089, + "eval_samples_per_second": 12.323, + "eval_steps_per_second": 6.163, + "step": 8500 + }, + { + "epoch": 1.5142348754448398, + "grad_norm": 0.6754533648490906, + "learning_rate": 2.8708205426915858e-06, + "loss": 0.5735606670379638, + "step": 8510 + }, + { + "epoch": 1.5160142348754448, + "grad_norm": 0.9227625727653503, + "learning_rate": 2.850894417599154e-06, + "loss": 0.586240005493164, + "step": 8520 + }, + { + "epoch": 1.5177935943060499, + "grad_norm": 1.1501375436782837, + "learning_rate": 2.8310261837686594e-06, + "loss": 0.5986839771270752, + "step": 8530 + }, + { + "epoch": 1.5195729537366547, + "grad_norm": 0.8052434325218201, + "learning_rate": 2.811216002086954e-06, + "loss": 0.6387444972991944, + "step": 8540 + }, + { + "epoch": 1.52135231316726, + "grad_norm": 0.9873703718185425, + "learning_rate": 2.791464032970812e-06, + "loss": 0.6114506244659423, + "step": 8550 + }, + { + "epoch": 1.5231316725978647, + "grad_norm": 0.8305763006210327, + "learning_rate": 2.771770436365612e-06, + "loss": 0.65102219581604, + "step": 8560 + }, + { + "epoch": 1.5249110320284698, + "grad_norm": 0.9540684223175049, + "learning_rate": 2.7521353717440523e-06, + "loss": 0.6242643833160401, + "step": 8570 + }, + { + "epoch": 1.5266903914590748, + "grad_norm": 0.764707624912262, + "learning_rate": 2.732558998104855e-06, + "loss": 0.6340816020965576, + "step": 8580 + }, + { + "epoch": 1.5284697508896796, + "grad_norm": 0.8800462484359741, + "learning_rate": 2.7130414739714884e-06, + "loss": 0.6269487857818603, + "step": 8590 + }, + { + "epoch": 1.5302491103202847, + "grad_norm": 0.7068182826042175, + "learning_rate": 2.6935829573908645e-06, + "loss": 0.6376915454864502, + "step": 8600 + }, + { + "epoch": 1.5320284697508897, + "grad_norm": 0.9243912100791931, + "learning_rate": 2.6741836059320813e-06, + "loss": 0.6650017261505127, + "step": 8610 + }, + { + "epoch": 1.5338078291814945, + "grad_norm": 0.8034710884094238, + "learning_rate": 2.654843576685129e-06, + "loss": 0.6140787601470947, + "step": 8620 + }, + { + "epoch": 1.5355871886120998, + "grad_norm": 0.6924305558204651, + "learning_rate": 2.635563026259622e-06, + "loss": 0.592177438735962, + "step": 8630 + }, + { + "epoch": 1.5373665480427046, + "grad_norm": 0.9960291385650635, + "learning_rate": 2.616342110783544e-06, + "loss": 0.620767879486084, + "step": 8640 + }, + { + "epoch": 1.5391459074733096, + "grad_norm": 0.9700925350189209, + "learning_rate": 2.5971809859019616e-06, + "loss": 0.6059544563293457, + "step": 8650 + }, + { + "epoch": 1.5409252669039146, + "grad_norm": 0.8979527354240417, + "learning_rate": 2.578079806775786e-06, + "loss": 0.6165768146514893, + "step": 8660 + }, + { + "epoch": 1.5427046263345194, + "grad_norm": 0.7694889903068542, + "learning_rate": 2.559038728080495e-06, + "loss": 0.6269434452056885, + "step": 8670 + }, + { + "epoch": 1.5444839857651247, + "grad_norm": 0.7812192440032959, + "learning_rate": 2.5400579040049045e-06, + "loss": 0.6156674385070801, + "step": 8680 + }, + { + "epoch": 1.5462633451957295, + "grad_norm": 0.8593633770942688, + "learning_rate": 2.521137488249892e-06, + "loss": 0.5995303630828858, + "step": 8690 + }, + { + "epoch": 1.5480427046263345, + "grad_norm": 0.7315478920936584, + "learning_rate": 2.5022776340271827e-06, + "loss": 0.6255430698394775, + "step": 8700 + }, + { + "epoch": 1.5498220640569396, + "grad_norm": 0.7979263663291931, + "learning_rate": 2.483478494058081e-06, + "loss": 0.617695426940918, + "step": 8710 + }, + { + "epoch": 1.5516014234875444, + "grad_norm": 0.791521430015564, + "learning_rate": 2.4647402205722513e-06, + "loss": 0.5985545158386231, + "step": 8720 + }, + { + "epoch": 1.5533807829181496, + "grad_norm": 0.8115281462669373, + "learning_rate": 2.446062965306476e-06, + "loss": 0.6137414932250976, + "step": 8730 + }, + { + "epoch": 1.5551601423487544, + "grad_norm": 0.73805832862854, + "learning_rate": 2.427446879503439e-06, + "loss": 0.6292818069458008, + "step": 8740 + }, + { + "epoch": 1.5569395017793595, + "grad_norm": 0.7681689262390137, + "learning_rate": 2.408892113910485e-06, + "loss": 0.6184986114501954, + "step": 8750 + }, + { + "epoch": 1.5587188612099645, + "grad_norm": 0.8392589688301086, + "learning_rate": 2.390398818778403e-06, + "loss": 0.6432001113891601, + "step": 8760 + }, + { + "epoch": 1.5604982206405693, + "grad_norm": 0.7140183448791504, + "learning_rate": 2.3719671438602287e-06, + "loss": 0.5931034088134766, + "step": 8770 + }, + { + "epoch": 1.5622775800711743, + "grad_norm": 1.0340561866760254, + "learning_rate": 2.353597238409997e-06, + "loss": 0.6184981346130372, + "step": 8780 + }, + { + "epoch": 1.5640569395017794, + "grad_norm": 1.3788442611694336, + "learning_rate": 2.3352892511815695e-06, + "loss": 0.6254217147827148, + "step": 8790 + }, + { + "epoch": 1.5658362989323842, + "grad_norm": 0.8685413599014282, + "learning_rate": 2.3170433304274e-06, + "loss": 0.6440535545349121, + "step": 8800 + }, + { + "epoch": 1.5676156583629894, + "grad_norm": 0.8243607878684998, + "learning_rate": 2.298859623897357e-06, + "loss": 0.6024580001831055, + "step": 8810 + }, + { + "epoch": 1.5693950177935942, + "grad_norm": 0.9618055820465088, + "learning_rate": 2.280738278837508e-06, + "loss": 0.6213048934936524, + "step": 8820 + }, + { + "epoch": 1.5711743772241993, + "grad_norm": 0.8591263294219971, + "learning_rate": 2.2626794419889463e-06, + "loss": 0.6279497146606445, + "step": 8830 + }, + { + "epoch": 1.5729537366548043, + "grad_norm": 0.8621962070465088, + "learning_rate": 2.2446832595865833e-06, + "loss": 0.5932358741760254, + "step": 8840 + }, + { + "epoch": 1.5747330960854091, + "grad_norm": 0.9073888659477234, + "learning_rate": 2.226749877357983e-06, + "loss": 0.6451629161834717, + "step": 8850 + }, + { + "epoch": 1.5765124555160144, + "grad_norm": 0.9106960892677307, + "learning_rate": 2.208879440522167e-06, + "loss": 0.6058315753936767, + "step": 8860 + }, + { + "epoch": 1.5782918149466192, + "grad_norm": 0.7805183529853821, + "learning_rate": 2.1910720937884432e-06, + "loss": 0.6308177947998047, + "step": 8870 + }, + { + "epoch": 1.5800711743772242, + "grad_norm": 0.8798645734786987, + "learning_rate": 2.1733279813552386e-06, + "loss": 0.6120688438415527, + "step": 8880 + }, + { + "epoch": 1.5818505338078293, + "grad_norm": 0.7852058410644531, + "learning_rate": 2.1556472469089305e-06, + "loss": 0.6294644832611084, + "step": 8890 + }, + { + "epoch": 1.583629893238434, + "grad_norm": 1.0219141244888306, + "learning_rate": 2.1380300336226756e-06, + "loss": 0.6379860877990723, + "step": 8900 + }, + { + "epoch": 1.585409252669039, + "grad_norm": 0.8301982879638672, + "learning_rate": 2.120476484155255e-06, + "loss": 0.5950196743011474, + "step": 8910 + }, + { + "epoch": 1.5871886120996441, + "grad_norm": 0.781911313533783, + "learning_rate": 2.102986740649928e-06, + "loss": 0.6155390739440918, + "step": 8920 + }, + { + "epoch": 1.5889679715302492, + "grad_norm": 0.8553647994995117, + "learning_rate": 2.0855609447332635e-06, + "loss": 0.6030520439147949, + "step": 8930 + }, + { + "epoch": 1.5907473309608542, + "grad_norm": 0.8010233640670776, + "learning_rate": 2.06819923751401e-06, + "loss": 0.5782717227935791, + "step": 8940 + }, + { + "epoch": 1.592526690391459, + "grad_norm": 0.7767868041992188, + "learning_rate": 2.050901759581937e-06, + "loss": 0.5862733364105225, + "step": 8950 + }, + { + "epoch": 1.594306049822064, + "grad_norm": 0.894256591796875, + "learning_rate": 2.033668651006715e-06, + "loss": 0.6170249938964844, + "step": 8960 + }, + { + "epoch": 1.596085409252669, + "grad_norm": 0.8816112875938416, + "learning_rate": 2.0165000513367604e-06, + "loss": 0.5960803508758545, + "step": 8970 + }, + { + "epoch": 1.5978647686832739, + "grad_norm": 0.7646706700325012, + "learning_rate": 1.9993960995981287e-06, + "loss": 0.609787130355835, + "step": 8980 + }, + { + "epoch": 1.5996441281138791, + "grad_norm": 0.8380435109138489, + "learning_rate": 1.9823569342933624e-06, + "loss": 0.5809425354003906, + "step": 8990 + }, + { + "epoch": 1.601423487544484, + "grad_norm": 0.7763661742210388, + "learning_rate": 1.965382693400396e-06, + "loss": 0.6282608032226562, + "step": 9000 + }, + { + "epoch": 1.601423487544484, + "eval_loss": 0.6098406314849854, + "eval_runtime": 407.9599, + "eval_samples_per_second": 12.327, + "eval_steps_per_second": 6.165, + "step": 9000 + }, + { + "epoch": 1.603202846975089, + "grad_norm": 0.7980552315711975, + "learning_rate": 1.9484735143714184e-06, + "loss": 0.5892675876617431, + "step": 9010 + }, + { + "epoch": 1.604982206405694, + "grad_norm": 0.7350971102714539, + "learning_rate": 1.931629534131769e-06, + "loss": 0.615026330947876, + "step": 9020 + }, + { + "epoch": 1.6067615658362988, + "grad_norm": 0.9350213408470154, + "learning_rate": 1.9148508890788263e-06, + "loss": 0.5945257663726806, + "step": 9030 + }, + { + "epoch": 1.608540925266904, + "grad_norm": 0.8562370538711548, + "learning_rate": 1.8981377150809111e-06, + "loss": 0.6524335384368897, + "step": 9040 + }, + { + "epoch": 1.6103202846975089, + "grad_norm": 0.8481477499008179, + "learning_rate": 1.8814901474761704e-06, + "loss": 0.572668981552124, + "step": 9050 + }, + { + "epoch": 1.612099644128114, + "grad_norm": 0.8405110239982605, + "learning_rate": 1.8649083210714946e-06, + "loss": 0.6025730609893799, + "step": 9060 + }, + { + "epoch": 1.613879003558719, + "grad_norm": 0.6442246437072754, + "learning_rate": 1.8483923701414274e-06, + "loss": 0.619115161895752, + "step": 9070 + }, + { + "epoch": 1.6156583629893237, + "grad_norm": 0.9033611416816711, + "learning_rate": 1.8319424284270638e-06, + "loss": 0.6313360214233399, + "step": 9080 + }, + { + "epoch": 1.6174377224199288, + "grad_norm": 0.8982630968093872, + "learning_rate": 1.8155586291349836e-06, + "loss": 0.6707229137420654, + "step": 9090 + }, + { + "epoch": 1.6192170818505338, + "grad_norm": 0.8548517823219299, + "learning_rate": 1.7992411049361612e-06, + "loss": 0.5913959503173828, + "step": 9100 + }, + { + "epoch": 1.6209964412811388, + "grad_norm": 0.8158797025680542, + "learning_rate": 1.7829899879649005e-06, + "loss": 0.6455716133117676, + "step": 9110 + }, + { + "epoch": 1.6227758007117439, + "grad_norm": 1.0375958681106567, + "learning_rate": 1.7668054098177512e-06, + "loss": 0.6378812789916992, + "step": 9120 + }, + { + "epoch": 1.6245551601423487, + "grad_norm": 0.9441247582435608, + "learning_rate": 1.7506875015524649e-06, + "loss": 0.6197398662567138, + "step": 9130 + }, + { + "epoch": 1.6263345195729537, + "grad_norm": 0.7956865429878235, + "learning_rate": 1.7346363936869082e-06, + "loss": 0.6085912704467773, + "step": 9140 + }, + { + "epoch": 1.6281138790035588, + "grad_norm": 0.7922634482383728, + "learning_rate": 1.7186522161980279e-06, + "loss": 0.6055495738983154, + "step": 9150 + }, + { + "epoch": 1.6298932384341636, + "grad_norm": 0.9479956030845642, + "learning_rate": 1.7027350985207847e-06, + "loss": 0.5977864265441895, + "step": 9160 + }, + { + "epoch": 1.6316725978647688, + "grad_norm": 0.8339085578918457, + "learning_rate": 1.68688516954711e-06, + "loss": 0.6118191242218017, + "step": 9170 + }, + { + "epoch": 1.6334519572953736, + "grad_norm": 0.8879317045211792, + "learning_rate": 1.6711025576248586e-06, + "loss": 0.607643461227417, + "step": 9180 + }, + { + "epoch": 1.6352313167259787, + "grad_norm": 0.7563620805740356, + "learning_rate": 1.655387390556782e-06, + "loss": 0.566606092453003, + "step": 9190 + }, + { + "epoch": 1.6370106761565837, + "grad_norm": 0.8990421295166016, + "learning_rate": 1.6397397955994742e-06, + "loss": 0.6038555145263672, + "step": 9200 + }, + { + "epoch": 1.6387900355871885, + "grad_norm": 0.8692913055419922, + "learning_rate": 1.624159899462353e-06, + "loss": 0.6045926570892334, + "step": 9210 + }, + { + "epoch": 1.6405693950177938, + "grad_norm": 0.7604990005493164, + "learning_rate": 1.6086478283066386e-06, + "loss": 0.6289362907409668, + "step": 9220 + }, + { + "epoch": 1.6423487544483986, + "grad_norm": 0.8552067875862122, + "learning_rate": 1.5932037077443163e-06, + "loss": 0.5995774269104004, + "step": 9230 + }, + { + "epoch": 1.6441281138790036, + "grad_norm": 0.9507073760032654, + "learning_rate": 1.577827662837136e-06, + "loss": 0.6592823028564453, + "step": 9240 + }, + { + "epoch": 1.6459074733096086, + "grad_norm": 0.7741204500198364, + "learning_rate": 1.5625198180955836e-06, + "loss": 0.5817698001861572, + "step": 9250 + }, + { + "epoch": 1.6476868327402134, + "grad_norm": 0.906514048576355, + "learning_rate": 1.5472802974778911e-06, + "loss": 0.6105581760406494, + "step": 9260 + }, + { + "epoch": 1.6494661921708185, + "grad_norm": 1.1253999471664429, + "learning_rate": 1.5321092243890112e-06, + "loss": 0.6198730945587159, + "step": 9270 + }, + { + "epoch": 1.6512455516014235, + "grad_norm": 0.7494739294052124, + "learning_rate": 1.5170067216796425e-06, + "loss": 0.5921574115753174, + "step": 9280 + }, + { + "epoch": 1.6530249110320283, + "grad_norm": 0.674996018409729, + "learning_rate": 1.5019729116452086e-06, + "loss": 0.6231479644775391, + "step": 9290 + }, + { + "epoch": 1.6548042704626336, + "grad_norm": 0.8674888610839844, + "learning_rate": 1.4870079160248918e-06, + "loss": 0.5954656600952148, + "step": 9300 + }, + { + "epoch": 1.6565836298932384, + "grad_norm": 0.8911182284355164, + "learning_rate": 1.4721118560006298e-06, + "loss": 0.5914658069610595, + "step": 9310 + }, + { + "epoch": 1.6583629893238434, + "grad_norm": 0.9357933402061462, + "learning_rate": 1.4572848521961414e-06, + "loss": 0.6310736656188964, + "step": 9320 + }, + { + "epoch": 1.6601423487544484, + "grad_norm": 0.936082124710083, + "learning_rate": 1.4425270246759549e-06, + "loss": 0.5978594303131104, + "step": 9330 + }, + { + "epoch": 1.6619217081850532, + "grad_norm": 0.8404752016067505, + "learning_rate": 1.4278384929444233e-06, + "loss": 0.5916398048400879, + "step": 9340 + }, + { + "epoch": 1.6637010676156585, + "grad_norm": 0.6900134086608887, + "learning_rate": 1.4132193759447665e-06, + "loss": 0.6098292350769043, + "step": 9350 + }, + { + "epoch": 1.6654804270462633, + "grad_norm": 0.8345168828964233, + "learning_rate": 1.398669792058105e-06, + "loss": 0.6169853687286377, + "step": 9360 + }, + { + "epoch": 1.6672597864768683, + "grad_norm": 0.8186418414115906, + "learning_rate": 1.384189859102505e-06, + "loss": 0.6554735660552978, + "step": 9370 + }, + { + "epoch": 1.6690391459074734, + "grad_norm": 0.8094502687454224, + "learning_rate": 1.369779694332012e-06, + "loss": 0.57339506149292, + "step": 9380 + }, + { + "epoch": 1.6708185053380782, + "grad_norm": 0.8150178790092468, + "learning_rate": 1.3554394144357218e-06, + "loss": 0.5624317169189453, + "step": 9390 + }, + { + "epoch": 1.6725978647686834, + "grad_norm": 0.7492024898529053, + "learning_rate": 1.341169135536814e-06, + "loss": 0.619264554977417, + "step": 9400 + }, + { + "epoch": 1.6743772241992882, + "grad_norm": 0.7049137949943542, + "learning_rate": 1.326968973191628e-06, + "loss": 0.5918323040008545, + "step": 9410 + }, + { + "epoch": 1.6761565836298933, + "grad_norm": 0.8279508948326111, + "learning_rate": 1.312839042388716e-06, + "loss": 0.6143953800201416, + "step": 9420 + }, + { + "epoch": 1.6779359430604983, + "grad_norm": 1.0093584060668945, + "learning_rate": 1.2987794575479218e-06, + "loss": 0.6140541076660156, + "step": 9430 + }, + { + "epoch": 1.6797153024911031, + "grad_norm": 0.9923004508018494, + "learning_rate": 1.2847903325194455e-06, + "loss": 0.6000154495239258, + "step": 9440 + }, + { + "epoch": 1.6814946619217082, + "grad_norm": 0.9289199113845825, + "learning_rate": 1.270871780582924e-06, + "loss": 0.6113490104675293, + "step": 9450 + }, + { + "epoch": 1.6832740213523132, + "grad_norm": 0.6759699583053589, + "learning_rate": 1.2570239144465212e-06, + "loss": 0.6019508838653564, + "step": 9460 + }, + { + "epoch": 1.685053380782918, + "grad_norm": 0.8999039530754089, + "learning_rate": 1.2432468462460024e-06, + "loss": 0.6267662048339844, + "step": 9470 + }, + { + "epoch": 1.6868327402135233, + "grad_norm": 0.9530540704727173, + "learning_rate": 1.2295406875438377e-06, + "loss": 0.5737581729888916, + "step": 9480 + }, + { + "epoch": 1.688612099644128, + "grad_norm": 0.9010872840881348, + "learning_rate": 1.2159055493282911e-06, + "loss": 0.6142421722412109, + "step": 9490 + }, + { + "epoch": 1.690391459074733, + "grad_norm": 0.9528436660766602, + "learning_rate": 1.2023415420125262e-06, + "loss": 0.5961336612701416, + "step": 9500 + }, + { + "epoch": 1.690391459074733, + "eval_loss": 0.6090449094772339, + "eval_runtime": 409.4467, + "eval_samples_per_second": 12.282, + "eval_steps_per_second": 6.142, + "step": 9500 + }, + { + "epoch": 1.6921708185053381, + "grad_norm": 0.863777220249176, + "learning_rate": 1.1888487754337052e-06, + "loss": 0.6345890045166016, + "step": 9510 + }, + { + "epoch": 1.693950177935943, + "grad_norm": 1.0439646244049072, + "learning_rate": 1.1754273588521149e-06, + "loss": 0.6450316429138183, + "step": 9520 + }, + { + "epoch": 1.6957295373665482, + "grad_norm": 0.8258497714996338, + "learning_rate": 1.162077400950261e-06, + "loss": 0.6218976497650146, + "step": 9530 + }, + { + "epoch": 1.697508896797153, + "grad_norm": 0.8564954996109009, + "learning_rate": 1.148799009832009e-06, + "loss": 0.6231951713562012, + "step": 9540 + }, + { + "epoch": 1.699288256227758, + "grad_norm": 0.7925037741661072, + "learning_rate": 1.1355922930216867e-06, + "loss": 0.5838134765625, + "step": 9550 + }, + { + "epoch": 1.701067615658363, + "grad_norm": 0.8268294930458069, + "learning_rate": 1.122457357463238e-06, + "loss": 0.6027390480041503, + "step": 9560 + }, + { + "epoch": 1.7028469750889679, + "grad_norm": 0.9756558537483215, + "learning_rate": 1.1093943095193328e-06, + "loss": 0.6181758403778076, + "step": 9570 + }, + { + "epoch": 1.704626334519573, + "grad_norm": 0.8576071262359619, + "learning_rate": 1.096403254970526e-06, + "loss": 0.6374680519104003, + "step": 9580 + }, + { + "epoch": 1.706405693950178, + "grad_norm": 0.9196457266807556, + "learning_rate": 1.083484299014389e-06, + "loss": 0.6039985179901123, + "step": 9590 + }, + { + "epoch": 1.708185053380783, + "grad_norm": 0.9652523994445801, + "learning_rate": 1.0706375462646557e-06, + "loss": 0.6314795970916748, + "step": 9600 + }, + { + "epoch": 1.709964412811388, + "grad_norm": 0.9042948484420776, + "learning_rate": 1.0578631007503914e-06, + "loss": 0.6144063472747803, + "step": 9610 + }, + { + "epoch": 1.7117437722419928, + "grad_norm": 1.0348018407821655, + "learning_rate": 1.045161065915129e-06, + "loss": 0.6448927879333496, + "step": 9620 + }, + { + "epoch": 1.7135231316725978, + "grad_norm": 0.9222016334533691, + "learning_rate": 1.03253154461605e-06, + "loss": 0.6264961719512939, + "step": 9630 + }, + { + "epoch": 1.7153024911032029, + "grad_norm": 0.8399068117141724, + "learning_rate": 1.019974639123138e-06, + "loss": 0.6070163726806641, + "step": 9640 + }, + { + "epoch": 1.7170818505338077, + "grad_norm": 0.8186811208724976, + "learning_rate": 1.0074904511183592e-06, + "loss": 0.6530916690826416, + "step": 9650 + }, + { + "epoch": 1.718861209964413, + "grad_norm": 0.8317114114761353, + "learning_rate": 9.95079081694832e-07, + "loss": 0.6258822917938233, + "step": 9660 + }, + { + "epoch": 1.7206405693950177, + "grad_norm": 0.8472376465797424, + "learning_rate": 9.827406313560194e-07, + "loss": 0.6343019485473633, + "step": 9670 + }, + { + "epoch": 1.7224199288256228, + "grad_norm": 0.7907516956329346, + "learning_rate": 9.704752000148998e-07, + "loss": 0.5924992561340332, + "step": 9680 + }, + { + "epoch": 1.7241992882562278, + "grad_norm": 0.8266814351081848, + "learning_rate": 9.582828869931749e-07, + "loss": 0.63353590965271, + "step": 9690 + }, + { + "epoch": 1.7259786476868326, + "grad_norm": 0.8828052878379822, + "learning_rate": 9.461637910204468e-07, + "loss": 0.5913454532623291, + "step": 9700 + }, + { + "epoch": 1.7277580071174379, + "grad_norm": 0.795305609703064, + "learning_rate": 9.341180102334391e-07, + "loss": 0.625618314743042, + "step": 9710 + }, + { + "epoch": 1.7295373665480427, + "grad_norm": 0.8134214878082275, + "learning_rate": 9.221456421751828e-07, + "loss": 0.6241415023803711, + "step": 9720 + }, + { + "epoch": 1.7313167259786477, + "grad_norm": 0.8993518948554993, + "learning_rate": 9.102467837942364e-07, + "loss": 0.6320163249969483, + "step": 9730 + }, + { + "epoch": 1.7330960854092528, + "grad_norm": 0.8220566511154175, + "learning_rate": 8.984215314439071e-07, + "loss": 0.5945559501647949, + "step": 9740 + }, + { + "epoch": 1.7348754448398576, + "grad_norm": 0.8112704753875732, + "learning_rate": 8.866699808814516e-07, + "loss": 0.5934808731079102, + "step": 9750 + }, + { + "epoch": 1.7366548042704626, + "grad_norm": 0.9084371328353882, + "learning_rate": 8.749922272673228e-07, + "loss": 0.614768123626709, + "step": 9760 + }, + { + "epoch": 1.7384341637010676, + "grad_norm": 1.0116448402404785, + "learning_rate": 8.633883651643815e-07, + "loss": 0.6284510612487793, + "step": 9770 + }, + { + "epoch": 1.7402135231316724, + "grad_norm": 0.7356001734733582, + "learning_rate": 8.518584885371417e-07, + "loss": 0.5880512237548828, + "step": 9780 + }, + { + "epoch": 1.7419928825622777, + "grad_norm": 1.018543004989624, + "learning_rate": 8.404026907510043e-07, + "loss": 0.6093903064727784, + "step": 9790 + }, + { + "epoch": 1.7437722419928825, + "grad_norm": 0.7327563166618347, + "learning_rate": 8.290210645715002e-07, + "loss": 0.6004554748535156, + "step": 9800 + }, + { + "epoch": 1.7455516014234875, + "grad_norm": 0.8729904294013977, + "learning_rate": 8.177137021635439e-07, + "loss": 0.6066318035125733, + "step": 9810 + }, + { + "epoch": 1.7473309608540926, + "grad_norm": 0.9604983925819397, + "learning_rate": 8.064806950906867e-07, + "loss": 0.617271900177002, + "step": 9820 + }, + { + "epoch": 1.7491103202846974, + "grad_norm": 0.8511675000190735, + "learning_rate": 7.95322134314368e-07, + "loss": 0.6143304824829101, + "step": 9830 + }, + { + "epoch": 1.7508896797153026, + "grad_norm": 1.240357518196106, + "learning_rate": 7.842381101931895e-07, + "loss": 0.5943353652954102, + "step": 9840 + }, + { + "epoch": 1.7526690391459074, + "grad_norm": 0.7550404667854309, + "learning_rate": 7.732287124821747e-07, + "loss": 0.6033238887786865, + "step": 9850 + }, + { + "epoch": 1.7544483985765125, + "grad_norm": 0.8071495294570923, + "learning_rate": 7.622940303320426e-07, + "loss": 0.6051639080047607, + "step": 9860 + }, + { + "epoch": 1.7562277580071175, + "grad_norm": 1.0281059741973877, + "learning_rate": 7.514341522884971e-07, + "loss": 0.6654407501220703, + "step": 9870 + }, + { + "epoch": 1.7580071174377223, + "grad_norm": 0.8573912978172302, + "learning_rate": 7.406491662914917e-07, + "loss": 0.5752899646759033, + "step": 9880 + }, + { + "epoch": 1.7597864768683276, + "grad_norm": 0.9136864542961121, + "learning_rate": 7.299391596745342e-07, + "loss": 0.5918805599212646, + "step": 9890 + }, + { + "epoch": 1.7615658362989324, + "grad_norm": 0.7273184657096863, + "learning_rate": 7.193042191639677e-07, + "loss": 0.6129786491394043, + "step": 9900 + }, + { + "epoch": 1.7633451957295374, + "grad_norm": 0.8380533456802368, + "learning_rate": 7.087444308782787e-07, + "loss": 0.6275459289550781, + "step": 9910 + }, + { + "epoch": 1.7651245551601424, + "grad_norm": 0.8067348003387451, + "learning_rate": 6.982598803273854e-07, + "loss": 0.6274401187896729, + "step": 9920 + }, + { + "epoch": 1.7669039145907472, + "grad_norm": 0.9172580242156982, + "learning_rate": 6.878506524119644e-07, + "loss": 0.6016909122467041, + "step": 9930 + }, + { + "epoch": 1.7686832740213523, + "grad_norm": 0.8141267895698547, + "learning_rate": 6.775168314227442e-07, + "loss": 0.5785222053527832, + "step": 9940 + }, + { + "epoch": 1.7704626334519573, + "grad_norm": 0.8267232179641724, + "learning_rate": 6.672585010398347e-07, + "loss": 0.59950852394104, + "step": 9950 + }, + { + "epoch": 1.7722419928825621, + "grad_norm": 1.015816569328308, + "learning_rate": 6.570757443320441e-07, + "loss": 0.5815055847167969, + "step": 9960 + }, + { + "epoch": 1.7740213523131674, + "grad_norm": 0.9419827461242676, + "learning_rate": 6.469686437562117e-07, + "loss": 0.6218266963958741, + "step": 9970 + }, + { + "epoch": 1.7758007117437722, + "grad_norm": 0.8901230692863464, + "learning_rate": 6.369372811565333e-07, + "loss": 0.6288963317871094, + "step": 9980 + }, + { + "epoch": 1.7775800711743772, + "grad_norm": 0.8378164172172546, + "learning_rate": 6.269817377639054e-07, + "loss": 0.5956534385681153, + "step": 9990 + }, + { + "epoch": 1.7793594306049823, + "grad_norm": 0.6490439772605896, + "learning_rate": 6.171020941952611e-07, + "loss": 0.6209733486175537, + "step": 10000 + }, + { + "epoch": 1.7793594306049823, + "eval_loss": 0.6085147261619568, + "eval_runtime": 413.8517, + "eval_samples_per_second": 12.152, + "eval_steps_per_second": 6.077, + "step": 10000 + }, + { + "epoch": 1.781138790035587, + "grad_norm": 0.7866795063018799, + "learning_rate": 6.072984304529172e-07, + "loss": 0.620451545715332, + "step": 10010 + }, + { + "epoch": 1.7829181494661923, + "grad_norm": 0.7238859534263611, + "learning_rate": 5.975708259239366e-07, + "loss": 0.6792274951934815, + "step": 10020 + }, + { + "epoch": 1.7846975088967971, + "grad_norm": 0.7588909268379211, + "learning_rate": 5.879193593794708e-07, + "loss": 0.5726376056671143, + "step": 10030 + }, + { + "epoch": 1.7864768683274022, + "grad_norm": 0.7037184834480286, + "learning_rate": 5.783441089741359e-07, + "loss": 0.617862319946289, + "step": 10040 + }, + { + "epoch": 1.7882562277580072, + "grad_norm": 0.9185910224914551, + "learning_rate": 5.688451522453664e-07, + "loss": 0.6003572463989257, + "step": 10050 + }, + { + "epoch": 1.790035587188612, + "grad_norm": 0.8440210223197937, + "learning_rate": 5.594225661128005e-07, + "loss": 0.6071921825408936, + "step": 10060 + }, + { + "epoch": 1.791814946619217, + "grad_norm": 0.8852623105049133, + "learning_rate": 5.500764268776437e-07, + "loss": 0.5917786121368408, + "step": 10070 + }, + { + "epoch": 1.793594306049822, + "grad_norm": 0.9420585632324219, + "learning_rate": 5.408068102220665e-07, + "loss": 0.5636492252349854, + "step": 10080 + }, + { + "epoch": 1.795373665480427, + "grad_norm": 0.8379149436950684, + "learning_rate": 5.316137912085761e-07, + "loss": 0.5862648963928223, + "step": 10090 + }, + { + "epoch": 1.7971530249110321, + "grad_norm": 0.9554212689399719, + "learning_rate": 5.224974442794162e-07, + "loss": 0.5997903347015381, + "step": 10100 + }, + { + "epoch": 1.798932384341637, + "grad_norm": 0.784550666809082, + "learning_rate": 5.134578432559667e-07, + "loss": 0.6203852653503418, + "step": 10110 + }, + { + "epoch": 1.800711743772242, + "grad_norm": 0.9477099180221558, + "learning_rate": 5.044950613381417e-07, + "loss": 0.6295520782470703, + "step": 10120 + }, + { + "epoch": 1.802491103202847, + "grad_norm": 0.8329566717147827, + "learning_rate": 4.956091711037969e-07, + "loss": 0.6353004932403564, + "step": 10130 + }, + { + "epoch": 1.8042704626334518, + "grad_norm": 0.8794506788253784, + "learning_rate": 4.868002445081421e-07, + "loss": 0.6215813636779786, + "step": 10140 + }, + { + "epoch": 1.806049822064057, + "grad_norm": 0.9077814221382141, + "learning_rate": 4.780683528831609e-07, + "loss": 0.6175337314605713, + "step": 10150 + }, + { + "epoch": 1.8078291814946619, + "grad_norm": 0.8299528956413269, + "learning_rate": 4.6941356693703034e-07, + "loss": 0.6105076789855957, + "step": 10160 + }, + { + "epoch": 1.809608540925267, + "grad_norm": 0.8642259240150452, + "learning_rate": 4.608359567535503e-07, + "loss": 0.6132296085357666, + "step": 10170 + }, + { + "epoch": 1.811387900355872, + "grad_norm": 0.7558045387268066, + "learning_rate": 4.523355917915739e-07, + "loss": 0.5516393661499024, + "step": 10180 + }, + { + "epoch": 1.8131672597864767, + "grad_norm": 1.0017114877700806, + "learning_rate": 4.4391254088444693e-07, + "loss": 0.6016965866088867, + "step": 10190 + }, + { + "epoch": 1.814946619217082, + "grad_norm": 0.8784421682357788, + "learning_rate": 4.35566872239449e-07, + "loss": 0.6034076690673829, + "step": 10200 + }, + { + "epoch": 1.8167259786476868, + "grad_norm": 0.9458897113800049, + "learning_rate": 4.2729865343724254e-07, + "loss": 0.6276726722717285, + "step": 10210 + }, + { + "epoch": 1.8185053380782918, + "grad_norm": 0.7058622241020203, + "learning_rate": 4.1910795143132456e-07, + "loss": 0.6476336002349854, + "step": 10220 + }, + { + "epoch": 1.8202846975088969, + "grad_norm": 0.8122621774673462, + "learning_rate": 4.109948325474855e-07, + "loss": 0.6218099594116211, + "step": 10230 + }, + { + "epoch": 1.8220640569395017, + "grad_norm": 0.7891798615455627, + "learning_rate": 4.029593624832706e-07, + "loss": 0.6232967853546143, + "step": 10240 + }, + { + "epoch": 1.8238434163701067, + "grad_norm": 0.9203746914863586, + "learning_rate": 3.9500160630745023e-07, + "loss": 0.6128015518188477, + "step": 10250 + }, + { + "epoch": 1.8256227758007118, + "grad_norm": 0.8149620294570923, + "learning_rate": 3.871216284594881e-07, + "loss": 0.6312131881713867, + "step": 10260 + }, + { + "epoch": 1.8274021352313166, + "grad_norm": 0.8598204255104065, + "learning_rate": 3.7931949274902737e-07, + "loss": 0.5871327877044678, + "step": 10270 + }, + { + "epoch": 1.8291814946619218, + "grad_norm": 0.8834412693977356, + "learning_rate": 3.715952623553676e-07, + "loss": 0.5824168682098388, + "step": 10280 + }, + { + "epoch": 1.8309608540925266, + "grad_norm": 1.0543700456619263, + "learning_rate": 3.6394899982695297e-07, + "loss": 0.6174266338348389, + "step": 10290 + }, + { + "epoch": 1.8327402135231317, + "grad_norm": 0.8103786706924438, + "learning_rate": 3.563807670808717e-07, + "loss": 0.6298078536987305, + "step": 10300 + }, + { + "epoch": 1.8345195729537367, + "grad_norm": 0.9107673764228821, + "learning_rate": 3.488906254023472e-07, + "loss": 0.6039840698242187, + "step": 10310 + }, + { + "epoch": 1.8362989323843415, + "grad_norm": 0.8083145022392273, + "learning_rate": 3.4147863544424896e-07, + "loss": 0.6395132064819335, + "step": 10320 + }, + { + "epoch": 1.8380782918149468, + "grad_norm": 0.9567161202430725, + "learning_rate": 3.3414485722659374e-07, + "loss": 0.6278172969818115, + "step": 10330 + }, + { + "epoch": 1.8398576512455516, + "grad_norm": 0.8990734219551086, + "learning_rate": 3.2688935013606706e-07, + "loss": 0.6095623016357422, + "step": 10340 + }, + { + "epoch": 1.8416370106761566, + "grad_norm": 0.8392049670219421, + "learning_rate": 3.19712172925537e-07, + "loss": 0.5831768989562989, + "step": 10350 + }, + { + "epoch": 1.8434163701067616, + "grad_norm": 0.8746337890625, + "learning_rate": 3.126133837135825e-07, + "loss": 0.6429958820343018, + "step": 10360 + }, + { + "epoch": 1.8451957295373664, + "grad_norm": 0.842336893081665, + "learning_rate": 3.055930399840168e-07, + "loss": 0.6127379417419434, + "step": 10370 + }, + { + "epoch": 1.8469750889679717, + "grad_norm": 0.8559570908546448, + "learning_rate": 2.9865119858542904e-07, + "loss": 0.5989023208618164, + "step": 10380 + }, + { + "epoch": 1.8487544483985765, + "grad_norm": 0.9535056352615356, + "learning_rate": 2.9178791573071907e-07, + "loss": 0.6665677070617676, + "step": 10390 + }, + { + "epoch": 1.8505338078291815, + "grad_norm": 0.781708300113678, + "learning_rate": 2.8500324699664575e-07, + "loss": 0.6113322257995606, + "step": 10400 + }, + { + "epoch": 1.8523131672597866, + "grad_norm": 0.6821208000183105, + "learning_rate": 2.7829724732336914e-07, + "loss": 0.6359220027923584, + "step": 10410 + }, + { + "epoch": 1.8540925266903914, + "grad_norm": 0.8178049921989441, + "learning_rate": 2.7166997101402025e-07, + "loss": 0.605013656616211, + "step": 10420 + }, + { + "epoch": 1.8558718861209964, + "grad_norm": 0.8356890678405762, + "learning_rate": 2.651214717342454e-07, + "loss": 0.5625947475433349, + "step": 10430 + }, + { + "epoch": 1.8576512455516014, + "grad_norm": 0.9983499646186829, + "learning_rate": 2.5865180251178014e-07, + "loss": 0.623445463180542, + "step": 10440 + }, + { + "epoch": 1.8594306049822062, + "grad_norm": 0.7427167296409607, + "learning_rate": 2.522610157360228e-07, + "loss": 0.6319187641143799, + "step": 10450 + }, + { + "epoch": 1.8612099644128115, + "grad_norm": 0.7787860631942749, + "learning_rate": 2.459491631575983e-07, + "loss": 0.616677713394165, + "step": 10460 + }, + { + "epoch": 1.8629893238434163, + "grad_norm": 0.9523722529411316, + "learning_rate": 2.397162958879551e-07, + "loss": 0.6324071884155273, + "step": 10470 + }, + { + "epoch": 1.8647686832740213, + "grad_norm": 0.9004319310188293, + "learning_rate": 2.3356246439893425e-07, + "loss": 0.6087874412536621, + "step": 10480 + }, + { + "epoch": 1.8665480427046264, + "grad_norm": 0.8960275053977966, + "learning_rate": 2.2748771852237672e-07, + "loss": 0.5879695415496826, + "step": 10490 + }, + { + "epoch": 1.8683274021352312, + "grad_norm": 0.9242817759513855, + "learning_rate": 2.2149210744970896e-07, + "loss": 0.6239465713500977, + "step": 10500 + }, + { + "epoch": 1.8683274021352312, + "eval_loss": 0.6083104014396667, + "eval_runtime": 425.8922, + "eval_samples_per_second": 11.808, + "eval_steps_per_second": 5.905, + "step": 10500 + }, + { + "epoch": 1.8701067615658364, + "grad_norm": 0.7920718789100647, + "learning_rate": 2.155756797315478e-07, + "loss": 0.5853692531585694, + "step": 10510 + }, + { + "epoch": 1.8718861209964412, + "grad_norm": 0.8980622291564941, + "learning_rate": 2.097384832773075e-07, + "loss": 0.6284151077270508, + "step": 10520 + }, + { + "epoch": 1.8736654804270463, + "grad_norm": 0.8127864599227905, + "learning_rate": 2.0398056535481547e-07, + "loss": 0.6136427402496338, + "step": 10530 + }, + { + "epoch": 1.8754448398576513, + "grad_norm": 0.7731335759162903, + "learning_rate": 1.9830197258992158e-07, + "loss": 0.6308210849761963, + "step": 10540 + }, + { + "epoch": 1.8772241992882561, + "grad_norm": 0.7324168086051941, + "learning_rate": 1.9270275096612612e-07, + "loss": 0.6134955406188964, + "step": 10550 + }, + { + "epoch": 1.8790035587188612, + "grad_norm": 0.8311629891395569, + "learning_rate": 1.871829458242058e-07, + "loss": 0.6565740585327149, + "step": 10560 + }, + { + "epoch": 1.8807829181494662, + "grad_norm": 0.9405824542045593, + "learning_rate": 1.8174260186184956e-07, + "loss": 0.5937122344970703, + "step": 10570 + }, + { + "epoch": 1.8825622775800712, + "grad_norm": 0.977630615234375, + "learning_rate": 1.7638176313329203e-07, + "loss": 0.5619380474090576, + "step": 10580 + }, + { + "epoch": 1.8843416370106763, + "grad_norm": 0.8144553303718567, + "learning_rate": 1.7110047304895738e-07, + "loss": 0.5915993213653564, + "step": 10590 + }, + { + "epoch": 1.886120996441281, + "grad_norm": 0.7680257558822632, + "learning_rate": 1.6589877437511502e-07, + "loss": 0.6170778751373291, + "step": 10600 + }, + { + "epoch": 1.887900355871886, + "grad_norm": 0.7631038427352905, + "learning_rate": 1.6077670923351884e-07, + "loss": 0.6260135173797607, + "step": 10610 + }, + { + "epoch": 1.8896797153024911, + "grad_norm": 0.8002687692642212, + "learning_rate": 1.5573431910108404e-07, + "loss": 0.6207282543182373, + "step": 10620 + }, + { + "epoch": 1.891459074733096, + "grad_norm": 0.8984543681144714, + "learning_rate": 1.5077164480953422e-07, + "loss": 0.6157234668731689, + "step": 10630 + }, + { + "epoch": 1.8932384341637012, + "grad_norm": 0.8822320103645325, + "learning_rate": 1.458887265450859e-07, + "loss": 0.6343502044677735, + "step": 10640 + }, + { + "epoch": 1.895017793594306, + "grad_norm": 0.8803090453147888, + "learning_rate": 1.4108560384810787e-07, + "loss": 0.6439435005187988, + "step": 10650 + }, + { + "epoch": 1.896797153024911, + "grad_norm": 0.9255337715148926, + "learning_rate": 1.3636231561281464e-07, + "loss": 0.614884328842163, + "step": 10660 + }, + { + "epoch": 1.898576512455516, + "grad_norm": 0.9993085861206055, + "learning_rate": 1.3171890008694454e-07, + "loss": 0.5709932327270508, + "step": 10670 + }, + { + "epoch": 1.9003558718861209, + "grad_norm": 0.7870404720306396, + "learning_rate": 1.2715539487145212e-07, + "loss": 0.6171539783477783, + "step": 10680 + }, + { + "epoch": 1.9021352313167261, + "grad_norm": 0.7632233500480652, + "learning_rate": 1.2267183692019845e-07, + "loss": 0.6215362548828125, + "step": 10690 + }, + { + "epoch": 1.903914590747331, + "grad_norm": 0.8149640560150146, + "learning_rate": 1.1826826253966029e-07, + "loss": 0.6112423419952393, + "step": 10700 + }, + { + "epoch": 1.905693950177936, + "grad_norm": 0.875933825969696, + "learning_rate": 1.1394470738863128e-07, + "loss": 0.6336851119995117, + "step": 10710 + }, + { + "epoch": 1.907473309608541, + "grad_norm": 0.8472994565963745, + "learning_rate": 1.0970120647793347e-07, + "loss": 0.6054853916168212, + "step": 10720 + }, + { + "epoch": 1.9092526690391458, + "grad_norm": 0.8871169090270996, + "learning_rate": 1.055377941701341e-07, + "loss": 0.6998202323913574, + "step": 10730 + }, + { + "epoch": 1.9110320284697508, + "grad_norm": 0.9006623029708862, + "learning_rate": 1.0145450417926806e-07, + "loss": 0.6072861194610596, + "step": 10740 + }, + { + "epoch": 1.9128113879003559, + "grad_norm": 0.781328558921814, + "learning_rate": 9.745136957056368e-08, + "loss": 0.6054211616516113, + "step": 10750 + }, + { + "epoch": 1.914590747330961, + "grad_norm": 0.8939800262451172, + "learning_rate": 9.352842276017516e-08, + "loss": 0.5997004985809327, + "step": 10760 + }, + { + "epoch": 1.916370106761566, + "grad_norm": 0.8280763030052185, + "learning_rate": 8.968569551492278e-08, + "loss": 0.6360623836517334, + "step": 10770 + }, + { + "epoch": 1.9181494661921707, + "grad_norm": 0.9543731808662415, + "learning_rate": 8.592321895203093e-08, + "loss": 0.5910046100616455, + "step": 10780 + }, + { + "epoch": 1.9199288256227758, + "grad_norm": 0.85121750831604, + "learning_rate": 8.224102353887931e-08, + "loss": 0.6036299228668213, + "step": 10790 + }, + { + "epoch": 1.9217081850533808, + "grad_norm": 0.8640086054801941, + "learning_rate": 7.863913909275656e-08, + "loss": 0.63557448387146, + "step": 10800 + }, + { + "epoch": 1.9234875444839856, + "grad_norm": 0.8530787825584412, + "learning_rate": 7.511759478061709e-08, + "loss": 0.5753629684448243, + "step": 10810 + }, + { + "epoch": 1.9252669039145909, + "grad_norm": 0.799323320388794, + "learning_rate": 7.16764191188446e-08, + "loss": 0.6003875732421875, + "step": 10820 + }, + { + "epoch": 1.9270462633451957, + "grad_norm": 0.8886220455169678, + "learning_rate": 6.831563997302337e-08, + "loss": 0.6101795196533203, + "step": 10830 + }, + { + "epoch": 1.9288256227758007, + "grad_norm": 0.9187102913856506, + "learning_rate": 6.50352845577118e-08, + "loss": 0.5787545680999756, + "step": 10840 + }, + { + "epoch": 1.9306049822064058, + "grad_norm": 0.8966771960258484, + "learning_rate": 6.18353794362192e-08, + "loss": 0.6280184745788574, + "step": 10850 + }, + { + "epoch": 1.9323843416370106, + "grad_norm": 1.0109564065933228, + "learning_rate": 5.8715950520396024e-08, + "loss": 0.5829953193664551, + "step": 10860 + }, + { + "epoch": 1.9341637010676158, + "grad_norm": 0.8951236009597778, + "learning_rate": 5.5677023070420665e-08, + "loss": 0.6363350868225097, + "step": 10870 + }, + { + "epoch": 1.9359430604982206, + "grad_norm": 1.040250539779663, + "learning_rate": 5.271862169459296e-08, + "loss": 0.6194582939147949, + "step": 10880 + }, + { + "epoch": 1.9377224199288257, + "grad_norm": 0.7401016354560852, + "learning_rate": 4.98407703491377e-08, + "loss": 0.6160064220428467, + "step": 10890 + }, + { + "epoch": 1.9395017793594307, + "grad_norm": 0.9136718511581421, + "learning_rate": 4.704349233801475e-08, + "loss": 0.6500599384307861, + "step": 10900 + }, + { + "epoch": 1.9412811387900355, + "grad_norm": 0.9346413612365723, + "learning_rate": 4.4326810312720345e-08, + "loss": 0.5980278968811035, + "step": 10910 + }, + { + "epoch": 1.9430604982206405, + "grad_norm": 0.8895809054374695, + "learning_rate": 4.169074627211278e-08, + "loss": 0.6100972652435303, + "step": 10920 + }, + { + "epoch": 1.9448398576512456, + "grad_norm": 0.8696462512016296, + "learning_rate": 3.913532156222699e-08, + "loss": 0.6204128742218018, + "step": 10930 + }, + { + "epoch": 1.9466192170818504, + "grad_norm": 0.8176571130752563, + "learning_rate": 3.6660556876110256e-08, + "loss": 0.6417241096496582, + "step": 10940 + }, + { + "epoch": 1.9483985765124556, + "grad_norm": 0.6966422200202942, + "learning_rate": 3.426647225364565e-08, + "loss": 0.5693438053131104, + "step": 10950 + }, + { + "epoch": 1.9501779359430604, + "grad_norm": 0.7873883247375488, + "learning_rate": 3.195308708139555e-08, + "loss": 0.5997710704803467, + "step": 10960 + }, + { + "epoch": 1.9519572953736655, + "grad_norm": 0.8650481700897217, + "learning_rate": 2.9720420092442803e-08, + "loss": 0.6134130477905273, + "step": 10970 + }, + { + "epoch": 1.9537366548042705, + "grad_norm": 0.7547999620437622, + "learning_rate": 2.756848936623979e-08, + "loss": 0.5843913555145264, + "step": 10980 + }, + { + "epoch": 1.9555160142348753, + "grad_norm": 1.011824369430542, + "learning_rate": 2.5497312328460756e-08, + "loss": 0.5818317413330079, + "step": 10990 + }, + { + "epoch": 1.9572953736654806, + "grad_norm": 0.9581059217453003, + "learning_rate": 2.3506905750859678e-08, + "loss": 0.5937665939331055, + "step": 11000 + }, + { + "epoch": 1.9572953736654806, + "eval_loss": 0.6082284450531006, + "eval_runtime": 725.2, + "eval_samples_per_second": 6.935, + "eval_steps_per_second": 3.468, + "step": 11000 + }, + { + "epoch": 1.9590747330960854, + "grad_norm": 0.9106253385543823, + "learning_rate": 2.1597285751140397e-08, + "loss": 0.6484842777252198, + "step": 11010 + }, + { + "epoch": 1.9608540925266904, + "grad_norm": 0.8921802639961243, + "learning_rate": 1.9768467792817825e-08, + "loss": 0.5928434371948242, + "step": 11020 + }, + { + "epoch": 1.9626334519572954, + "grad_norm": 0.8079352378845215, + "learning_rate": 1.802046668509916e-08, + "loss": 0.6446592807769775, + "step": 11030 + }, + { + "epoch": 1.9644128113879002, + "grad_norm": 0.9401485323905945, + "learning_rate": 1.6353296582758416e-08, + "loss": 0.6148817539215088, + "step": 11040 + }, + { + "epoch": 1.9661921708185055, + "grad_norm": 0.89174485206604, + "learning_rate": 1.476697098602986e-08, + "loss": 0.6317745685577393, + "step": 11050 + }, + { + "epoch": 1.9679715302491103, + "grad_norm": 0.8186066150665283, + "learning_rate": 1.3261502740490317e-08, + "loss": 0.604107666015625, + "step": 11060 + }, + { + "epoch": 1.9697508896797153, + "grad_norm": 1.0631054639816284, + "learning_rate": 1.1836904036959252e-08, + "loss": 0.6059531688690185, + "step": 11070 + }, + { + "epoch": 1.9715302491103204, + "grad_norm": 0.7882722020149231, + "learning_rate": 1.0493186411398848e-08, + "loss": 0.5915180683135987, + "step": 11080 + }, + { + "epoch": 1.9733096085409252, + "grad_norm": 0.9640932679176331, + "learning_rate": 9.230360744822975e-09, + "loss": 0.6303177356719971, + "step": 11090 + }, + { + "epoch": 1.9750889679715302, + "grad_norm": 0.8806138634681702, + "learning_rate": 8.048437263205033e-09, + "loss": 0.6188820362091064, + "step": 11100 + }, + { + "epoch": 1.9768683274021353, + "grad_norm": 0.7664757370948792, + "learning_rate": 6.947425537400243e-09, + "loss": 0.5683889865875245, + "step": 11110 + }, + { + "epoch": 1.97864768683274, + "grad_norm": 1.0602748394012451, + "learning_rate": 5.927334483061264e-09, + "loss": 0.5750191688537598, + "step": 11120 + }, + { + "epoch": 1.9804270462633453, + "grad_norm": 1.0477946996688843, + "learning_rate": 4.988172360573806e-09, + "loss": 0.6328764438629151, + "step": 11130 + }, + { + "epoch": 1.9822064056939501, + "grad_norm": 0.7248848080635071, + "learning_rate": 4.129946774984461e-09, + "loss": 0.6349906921386719, + "step": 11140 + }, + { + "epoch": 1.9839857651245552, + "grad_norm": 0.8907039761543274, + "learning_rate": 3.352664675938533e-09, + "loss": 0.6227742195129394, + "step": 11150 + }, + { + "epoch": 1.9857651245551602, + "grad_norm": 0.7807096242904663, + "learning_rate": 2.656332357631186e-09, + "loss": 0.6313164234161377, + "step": 11160 + }, + { + "epoch": 1.987544483985765, + "grad_norm": 0.6477228403091431, + "learning_rate": 2.040955458744165e-09, + "loss": 0.6183161258697509, + "step": 11170 + }, + { + "epoch": 1.9893238434163703, + "grad_norm": 0.7875910997390747, + "learning_rate": 1.5065389624124848e-09, + "loss": 0.6141678810119628, + "step": 11180 + }, + { + "epoch": 1.991103202846975, + "grad_norm": 0.8616359829902649, + "learning_rate": 1.0530871961778044e-09, + "loss": 0.6099072933197022, + "step": 11190 + }, + { + "epoch": 1.99288256227758, + "grad_norm": 0.9508731961250305, + "learning_rate": 6.806038319506769e-10, + "loss": 0.6085052967071534, + "step": 11200 + }, + { + "epoch": 1.9946619217081851, + "grad_norm": 0.7681756019592285, + "learning_rate": 3.8909188598945656e-10, + "loss": 0.6388015270233154, + "step": 11210 + }, + { + "epoch": 1.99644128113879, + "grad_norm": 0.7130521535873413, + "learning_rate": 1.7855371886699169e-10, + "loss": 0.643953514099121, + "step": 11220 + }, + { + "epoch": 1.998220640569395, + "grad_norm": 0.8050049543380737, + "learning_rate": 4.8991035457301994e-11, + "loss": 0.6025295257568359, + "step": 11230 + }, + { + "epoch": 2.0, + "grad_norm": 0.8600704669952393, + "learning_rate": 4.04884918925319e-13, + "loss": 0.6060295104980469, + "step": 11240 + } + ], + "logging_steps": 10, + "max_steps": 11240, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.298854192009953e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}