diff --git "a/checkpoint-42000/trainer_state.json" "b/checkpoint-42000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-42000/trainer_state.json" @@ -0,0 +1,29434 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.1678003176220298, + "eval_steps": 500, + "global_step": 42000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 3.995245657667376e-05, + "grad_norm": 20.315261840820312, + "learning_rate": 2e-09, + "loss": 0.1283, + "step": 10 + }, + { + "epoch": 7.990491315334752e-05, + "grad_norm": 12.392072677612305, + "learning_rate": 4e-09, + "loss": 0.1276, + "step": 20 + }, + { + "epoch": 0.00011985736973002127, + "grad_norm": 10.60136890411377, + "learning_rate": 5.999999999999999e-09, + "loss": 0.1152, + "step": 30 + }, + { + "epoch": 0.00015980982630669504, + "grad_norm": 9.118634223937988, + "learning_rate": 8e-09, + "loss": 0.1124, + "step": 40 + }, + { + "epoch": 0.00019976228288336878, + "grad_norm": 9.946356773376465, + "learning_rate": 1e-08, + "loss": 0.1204, + "step": 50 + }, + { + "epoch": 0.00023971473946004254, + "grad_norm": 9.961893081665039, + "learning_rate": 1.1999999999999998e-08, + "loss": 0.1246, + "step": 60 + }, + { + "epoch": 0.0002796671960367163, + "grad_norm": 8.61424446105957, + "learning_rate": 1.4000000000000001e-08, + "loss": 0.1183, + "step": 70 + }, + { + "epoch": 0.0003196196526133901, + "grad_norm": 9.977755546569824, + "learning_rate": 1.6e-08, + "loss": 0.1222, + "step": 80 + }, + { + "epoch": 0.00035957210919006384, + "grad_norm": 14.417372703552246, + "learning_rate": 1.8e-08, + "loss": 0.1184, + "step": 90 + }, + { + "epoch": 0.00039952456576673756, + "grad_norm": 9.235715866088867, + "learning_rate": 2e-08, + "loss": 0.1215, + "step": 100 + }, + { + "epoch": 0.0004394770223434113, + "grad_norm": 10.29968547821045, + "learning_rate": 2.2e-08, + "loss": 0.1267, + "step": 110 + }, + { + "epoch": 0.0004794294789200851, + "grad_norm": 15.24200439453125, + "learning_rate": 2.3999999999999997e-08, + "loss": 0.1207, + "step": 120 + }, + { + "epoch": 0.0005193819354967589, + "grad_norm": 47.8303108215332, + "learning_rate": 2.6e-08, + "loss": 0.1198, + "step": 130 + }, + { + "epoch": 0.0005593343920734326, + "grad_norm": 11.697916030883789, + "learning_rate": 2.8000000000000003e-08, + "loss": 0.1195, + "step": 140 + }, + { + "epoch": 0.0005992868486501063, + "grad_norm": 8.443284034729004, + "learning_rate": 3e-08, + "loss": 0.1162, + "step": 150 + }, + { + "epoch": 0.0006392393052267802, + "grad_norm": 12.932719230651855, + "learning_rate": 3.2e-08, + "loss": 0.1152, + "step": 160 + }, + { + "epoch": 0.0006791917618034539, + "grad_norm": 9.250285148620605, + "learning_rate": 3.4e-08, + "loss": 0.1139, + "step": 170 + }, + { + "epoch": 0.0007191442183801277, + "grad_norm": 10.522610664367676, + "learning_rate": 3.6e-08, + "loss": 0.1183, + "step": 180 + }, + { + "epoch": 0.0007590966749568014, + "grad_norm": 9.422713279724121, + "learning_rate": 3.7999999999999996e-08, + "loss": 0.1106, + "step": 190 + }, + { + "epoch": 0.0007990491315334751, + "grad_norm": 6.509659290313721, + "learning_rate": 4e-08, + "loss": 0.1116, + "step": 200 + }, + { + "epoch": 0.0008390015881101489, + "grad_norm": 9.737756729125977, + "learning_rate": 4.2e-08, + "loss": 0.1099, + "step": 210 + }, + { + "epoch": 0.0008789540446868226, + "grad_norm": 5.644337177276611, + "learning_rate": 4.4e-08, + "loss": 0.1089, + "step": 220 + }, + { + "epoch": 0.0009189065012634965, + "grad_norm": 4.686175346374512, + "learning_rate": 4.6e-08, + "loss": 0.1034, + "step": 230 + }, + { + "epoch": 0.0009588589578401702, + "grad_norm": 7.596631050109863, + "learning_rate": 4.799999999999999e-08, + "loss": 0.0999, + "step": 240 + }, + { + "epoch": 0.000998811414416844, + "grad_norm": 6.625464916229248, + "learning_rate": 5e-08, + "loss": 0.1052, + "step": 250 + }, + { + "epoch": 0.0010387638709935178, + "grad_norm": 6.050367832183838, + "learning_rate": 5.2e-08, + "loss": 0.1031, + "step": 260 + }, + { + "epoch": 0.0010787163275701914, + "grad_norm": 4.522998809814453, + "learning_rate": 5.4e-08, + "loss": 0.1005, + "step": 270 + }, + { + "epoch": 0.0011186687841468652, + "grad_norm": 3.8976027965545654, + "learning_rate": 5.6000000000000005e-08, + "loss": 0.1009, + "step": 280 + }, + { + "epoch": 0.001158621240723539, + "grad_norm": 9.10118293762207, + "learning_rate": 5.7999999999999997e-08, + "loss": 0.1002, + "step": 290 + }, + { + "epoch": 0.0011985736973002127, + "grad_norm": 5.121732234954834, + "learning_rate": 6e-08, + "loss": 0.0974, + "step": 300 + }, + { + "epoch": 0.0012385261538768865, + "grad_norm": 4.138407230377197, + "learning_rate": 6.2e-08, + "loss": 0.0939, + "step": 310 + }, + { + "epoch": 0.0012784786104535603, + "grad_norm": 4.201289176940918, + "learning_rate": 6.4e-08, + "loss": 0.0963, + "step": 320 + }, + { + "epoch": 0.0013184310670302341, + "grad_norm": 3.457749605178833, + "learning_rate": 6.6e-08, + "loss": 0.0989, + "step": 330 + }, + { + "epoch": 0.0013583835236069077, + "grad_norm": 3.5751171112060547, + "learning_rate": 6.8e-08, + "loss": 0.0967, + "step": 340 + }, + { + "epoch": 0.0013983359801835816, + "grad_norm": 3.3785996437072754, + "learning_rate": 6.999999999999999e-08, + "loss": 0.0939, + "step": 350 + }, + { + "epoch": 0.0014382884367602554, + "grad_norm": 4.1580071449279785, + "learning_rate": 7.2e-08, + "loss": 0.0975, + "step": 360 + }, + { + "epoch": 0.001478240893336929, + "grad_norm": 3.2523224353790283, + "learning_rate": 7.4e-08, + "loss": 0.0961, + "step": 370 + }, + { + "epoch": 0.0015181933499136028, + "grad_norm": 2.7915871143341064, + "learning_rate": 7.599999999999999e-08, + "loss": 0.094, + "step": 380 + }, + { + "epoch": 0.0015581458064902766, + "grad_norm": 3.255483388900757, + "learning_rate": 7.8e-08, + "loss": 0.0956, + "step": 390 + }, + { + "epoch": 0.0015980982630669502, + "grad_norm": 3.6536667346954346, + "learning_rate": 8e-08, + "loss": 0.0946, + "step": 400 + }, + { + "epoch": 0.001638050719643624, + "grad_norm": 5.193263530731201, + "learning_rate": 8.199999999999999e-08, + "loss": 0.0953, + "step": 410 + }, + { + "epoch": 0.0016780031762202979, + "grad_norm": 3.2381672859191895, + "learning_rate": 8.4e-08, + "loss": 0.0942, + "step": 420 + }, + { + "epoch": 0.0017179556327969717, + "grad_norm": 3.1204910278320312, + "learning_rate": 8.599999999999999e-08, + "loss": 0.0919, + "step": 430 + }, + { + "epoch": 0.0017579080893736453, + "grad_norm": 3.854353427886963, + "learning_rate": 8.8e-08, + "loss": 0.0962, + "step": 440 + }, + { + "epoch": 0.001797860545950319, + "grad_norm": 3.0543718338012695, + "learning_rate": 9e-08, + "loss": 0.0961, + "step": 450 + }, + { + "epoch": 0.001837813002526993, + "grad_norm": 16.88381004333496, + "learning_rate": 9.2e-08, + "loss": 0.095, + "step": 460 + }, + { + "epoch": 0.0018777654591036665, + "grad_norm": 3.970008373260498, + "learning_rate": 9.4e-08, + "loss": 0.0955, + "step": 470 + }, + { + "epoch": 0.0019177179156803404, + "grad_norm": 3.068601369857788, + "learning_rate": 9.599999999999999e-08, + "loss": 0.0945, + "step": 480 + }, + { + "epoch": 0.001957670372257014, + "grad_norm": 12.934038162231445, + "learning_rate": 9.799999999999999e-08, + "loss": 0.0962, + "step": 490 + }, + { + "epoch": 0.001997622828833688, + "grad_norm": 3.24997615814209, + "learning_rate": 1e-07, + "loss": 0.0932, + "step": 500 + }, + { + "epoch": 0.002037575285410362, + "grad_norm": 5.041693210601807, + "learning_rate": 1.02e-07, + "loss": 0.0943, + "step": 510 + }, + { + "epoch": 0.0020775277419870356, + "grad_norm": 2.0988104343414307, + "learning_rate": 1.04e-07, + "loss": 0.0922, + "step": 520 + }, + { + "epoch": 0.002117480198563709, + "grad_norm": 3.1149399280548096, + "learning_rate": 1.06e-07, + "loss": 0.0939, + "step": 530 + }, + { + "epoch": 0.002157432655140383, + "grad_norm": 2.965878963470459, + "learning_rate": 1.08e-07, + "loss": 0.0939, + "step": 540 + }, + { + "epoch": 0.0021973851117170567, + "grad_norm": 2.6551949977874756, + "learning_rate": 1.1e-07, + "loss": 0.093, + "step": 550 + }, + { + "epoch": 0.0022373375682937305, + "grad_norm": 3.9663093090057373, + "learning_rate": 1.1200000000000001e-07, + "loss": 0.0948, + "step": 560 + }, + { + "epoch": 0.0022772900248704043, + "grad_norm": 4.39984655380249, + "learning_rate": 1.1399999999999999e-07, + "loss": 0.0922, + "step": 570 + }, + { + "epoch": 0.002317242481447078, + "grad_norm": 2.172233819961548, + "learning_rate": 1.1599999999999999e-07, + "loss": 0.094, + "step": 580 + }, + { + "epoch": 0.002357194938023752, + "grad_norm": 3.2275962829589844, + "learning_rate": 1.1799999999999998e-07, + "loss": 0.0927, + "step": 590 + }, + { + "epoch": 0.0023971473946004253, + "grad_norm": 3.7049944400787354, + "learning_rate": 1.2e-07, + "loss": 0.0933, + "step": 600 + }, + { + "epoch": 0.002437099851177099, + "grad_norm": 2.302375316619873, + "learning_rate": 1.2199999999999998e-07, + "loss": 0.0926, + "step": 610 + }, + { + "epoch": 0.002477052307753773, + "grad_norm": 6.9334330558776855, + "learning_rate": 1.24e-07, + "loss": 0.0945, + "step": 620 + }, + { + "epoch": 0.002517004764330447, + "grad_norm": 3.6118319034576416, + "learning_rate": 1.26e-07, + "loss": 0.0925, + "step": 630 + }, + { + "epoch": 0.0025569572209071206, + "grad_norm": 5.224996089935303, + "learning_rate": 1.28e-07, + "loss": 0.0929, + "step": 640 + }, + { + "epoch": 0.0025969096774837944, + "grad_norm": 2.4619297981262207, + "learning_rate": 1.3e-07, + "loss": 0.0922, + "step": 650 + }, + { + "epoch": 0.0026368621340604683, + "grad_norm": 2.932446002960205, + "learning_rate": 1.32e-07, + "loss": 0.0908, + "step": 660 + }, + { + "epoch": 0.0026768145906371416, + "grad_norm": 3.371274471282959, + "learning_rate": 1.34e-07, + "loss": 0.0915, + "step": 670 + }, + { + "epoch": 0.0027167670472138155, + "grad_norm": 7.18813943862915, + "learning_rate": 1.36e-07, + "loss": 0.093, + "step": 680 + }, + { + "epoch": 0.0027567195037904893, + "grad_norm": 2.6801059246063232, + "learning_rate": 1.38e-07, + "loss": 0.0936, + "step": 690 + }, + { + "epoch": 0.002796671960367163, + "grad_norm": 2.498303174972534, + "learning_rate": 1.3999999999999998e-07, + "loss": 0.0907, + "step": 700 + }, + { + "epoch": 0.002836624416943837, + "grad_norm": 2.889376640319824, + "learning_rate": 1.4199999999999997e-07, + "loss": 0.0958, + "step": 710 + }, + { + "epoch": 0.0028765768735205107, + "grad_norm": 3.3200151920318604, + "learning_rate": 1.44e-07, + "loss": 0.0917, + "step": 720 + }, + { + "epoch": 0.0029165293300971846, + "grad_norm": 2.1992945671081543, + "learning_rate": 1.4599999999999998e-07, + "loss": 0.0895, + "step": 730 + }, + { + "epoch": 0.002956481786673858, + "grad_norm": 2.7956016063690186, + "learning_rate": 1.48e-07, + "loss": 0.0893, + "step": 740 + }, + { + "epoch": 0.0029964342432505318, + "grad_norm": 3.785031795501709, + "learning_rate": 1.5e-07, + "loss": 0.0882, + "step": 750 + }, + { + "epoch": 0.0030363866998272056, + "grad_norm": 4.1994404792785645, + "learning_rate": 1.5199999999999998e-07, + "loss": 0.09, + "step": 760 + }, + { + "epoch": 0.0030763391564038794, + "grad_norm": 19.363445281982422, + "learning_rate": 1.54e-07, + "loss": 0.0899, + "step": 770 + }, + { + "epoch": 0.0031162916129805532, + "grad_norm": 2.3020198345184326, + "learning_rate": 1.56e-07, + "loss": 0.0891, + "step": 780 + }, + { + "epoch": 0.003156244069557227, + "grad_norm": 7.748526573181152, + "learning_rate": 1.58e-07, + "loss": 0.0888, + "step": 790 + }, + { + "epoch": 0.0031961965261339004, + "grad_norm": 4.087902545928955, + "learning_rate": 1.6e-07, + "loss": 0.0908, + "step": 800 + }, + { + "epoch": 0.0032361489827105743, + "grad_norm": 3.1431095600128174, + "learning_rate": 1.62e-07, + "loss": 0.0896, + "step": 810 + }, + { + "epoch": 0.003276101439287248, + "grad_norm": 5.284871578216553, + "learning_rate": 1.6399999999999999e-07, + "loss": 0.0931, + "step": 820 + }, + { + "epoch": 0.003316053895863922, + "grad_norm": 6.127146244049072, + "learning_rate": 1.6599999999999998e-07, + "loss": 0.0878, + "step": 830 + }, + { + "epoch": 0.0033560063524405957, + "grad_norm": 5.513176441192627, + "learning_rate": 1.68e-07, + "loss": 0.0877, + "step": 840 + }, + { + "epoch": 0.0033959588090172695, + "grad_norm": 4.571882247924805, + "learning_rate": 1.7e-07, + "loss": 0.0889, + "step": 850 + }, + { + "epoch": 0.0034359112655939434, + "grad_norm": 4.611965656280518, + "learning_rate": 1.7199999999999998e-07, + "loss": 0.0871, + "step": 860 + }, + { + "epoch": 0.0034758637221706168, + "grad_norm": 3.6662094593048096, + "learning_rate": 1.74e-07, + "loss": 0.0917, + "step": 870 + }, + { + "epoch": 0.0035158161787472906, + "grad_norm": 3.2792646884918213, + "learning_rate": 1.76e-07, + "loss": 0.0865, + "step": 880 + }, + { + "epoch": 0.0035557686353239644, + "grad_norm": 4.663689136505127, + "learning_rate": 1.78e-07, + "loss": 0.0875, + "step": 890 + }, + { + "epoch": 0.003595721091900638, + "grad_norm": 4.592284679412842, + "learning_rate": 1.8e-07, + "loss": 0.0835, + "step": 900 + }, + { + "epoch": 0.003635673548477312, + "grad_norm": 3.564134359359741, + "learning_rate": 1.82e-07, + "loss": 0.0878, + "step": 910 + }, + { + "epoch": 0.003675626005053986, + "grad_norm": 3.7273480892181396, + "learning_rate": 1.84e-07, + "loss": 0.0889, + "step": 920 + }, + { + "epoch": 0.0037155784616306597, + "grad_norm": 3.845996618270874, + "learning_rate": 1.86e-07, + "loss": 0.0862, + "step": 930 + }, + { + "epoch": 0.003755530918207333, + "grad_norm": 4.088866710662842, + "learning_rate": 1.88e-07, + "loss": 0.0837, + "step": 940 + }, + { + "epoch": 0.003795483374784007, + "grad_norm": 3.820903778076172, + "learning_rate": 1.8999999999999998e-07, + "loss": 0.0842, + "step": 950 + }, + { + "epoch": 0.0038354358313606807, + "grad_norm": 14.669225692749023, + "learning_rate": 1.9199999999999997e-07, + "loss": 0.0852, + "step": 960 + }, + { + "epoch": 0.0038753882879373545, + "grad_norm": 3.9636285305023193, + "learning_rate": 1.94e-07, + "loss": 0.0871, + "step": 970 + }, + { + "epoch": 0.003915340744514028, + "grad_norm": 3.5666964054107666, + "learning_rate": 1.9599999999999998e-07, + "loss": 0.0846, + "step": 980 + }, + { + "epoch": 0.003955293201090702, + "grad_norm": 3.4256503582000732, + "learning_rate": 1.98e-07, + "loss": 0.0855, + "step": 990 + }, + { + "epoch": 0.003995245657667376, + "grad_norm": 10.105947494506836, + "learning_rate": 2e-07, + "loss": 0.0855, + "step": 1000 + }, + { + "epoch": 0.004035198114244049, + "grad_norm": 4.789931774139404, + "learning_rate": 1.9999999980228855e-07, + "loss": 0.0858, + "step": 1010 + }, + { + "epoch": 0.004075150570820724, + "grad_norm": 4.466113567352295, + "learning_rate": 1.9999999920915415e-07, + "loss": 0.0862, + "step": 1020 + }, + { + "epoch": 0.004115103027397397, + "grad_norm": 5.084461688995361, + "learning_rate": 1.9999999822059685e-07, + "loss": 0.0859, + "step": 1030 + }, + { + "epoch": 0.004155055483974071, + "grad_norm": 6.006556510925293, + "learning_rate": 1.9999999683661664e-07, + "loss": 0.0881, + "step": 1040 + }, + { + "epoch": 0.004195007940550745, + "grad_norm": 3.2521345615386963, + "learning_rate": 1.999999950572135e-07, + "loss": 0.0849, + "step": 1050 + }, + { + "epoch": 0.004234960397127418, + "grad_norm": 4.059595108032227, + "learning_rate": 1.9999999288238748e-07, + "loss": 0.0838, + "step": 1060 + }, + { + "epoch": 0.004274912853704092, + "grad_norm": 4.865835189819336, + "learning_rate": 1.9999999031213858e-07, + "loss": 0.0869, + "step": 1070 + }, + { + "epoch": 0.004314865310280766, + "grad_norm": 3.5474905967712402, + "learning_rate": 1.9999998734646678e-07, + "loss": 0.0875, + "step": 1080 + }, + { + "epoch": 0.00435481776685744, + "grad_norm": 3.235589027404785, + "learning_rate": 1.999999839853721e-07, + "loss": 0.0833, + "step": 1090 + }, + { + "epoch": 0.004394770223434113, + "grad_norm": 3.4536237716674805, + "learning_rate": 1.999999802288546e-07, + "loss": 0.0808, + "step": 1100 + }, + { + "epoch": 0.004434722680010788, + "grad_norm": 3.480510711669922, + "learning_rate": 1.999999760769142e-07, + "loss": 0.085, + "step": 1110 + }, + { + "epoch": 0.004474675136587461, + "grad_norm": 5.696749210357666, + "learning_rate": 1.9999997152955101e-07, + "loss": 0.0833, + "step": 1120 + }, + { + "epoch": 0.004514627593164134, + "grad_norm": 5.169352054595947, + "learning_rate": 1.99999966586765e-07, + "loss": 0.0801, + "step": 1130 + }, + { + "epoch": 0.004554580049740809, + "grad_norm": 4.78287410736084, + "learning_rate": 1.9999996124855622e-07, + "loss": 0.0804, + "step": 1140 + }, + { + "epoch": 0.004594532506317482, + "grad_norm": 7.360321521759033, + "learning_rate": 1.9999995551492463e-07, + "loss": 0.0847, + "step": 1150 + }, + { + "epoch": 0.004634484962894156, + "grad_norm": 4.4191179275512695, + "learning_rate": 1.9999994938587032e-07, + "loss": 0.0843, + "step": 1160 + }, + { + "epoch": 0.00467443741947083, + "grad_norm": 4.392884254455566, + "learning_rate": 1.999999428613933e-07, + "loss": 0.0826, + "step": 1170 + }, + { + "epoch": 0.004714389876047504, + "grad_norm": 4.025437355041504, + "learning_rate": 1.9999993594149358e-07, + "loss": 0.0808, + "step": 1180 + }, + { + "epoch": 0.004754342332624177, + "grad_norm": 4.164750576019287, + "learning_rate": 1.999999286261712e-07, + "loss": 0.0843, + "step": 1190 + }, + { + "epoch": 0.004794294789200851, + "grad_norm": 4.162405490875244, + "learning_rate": 1.9999992091542614e-07, + "loss": 0.0831, + "step": 1200 + }, + { + "epoch": 0.004834247245777525, + "grad_norm": 4.556960582733154, + "learning_rate": 1.9999991280925852e-07, + "loss": 0.0793, + "step": 1210 + }, + { + "epoch": 0.004874199702354198, + "grad_norm": 5.228780746459961, + "learning_rate": 1.999999043076683e-07, + "loss": 0.0806, + "step": 1220 + }, + { + "epoch": 0.0049141521589308726, + "grad_norm": 3.104085683822632, + "learning_rate": 1.9999989541065553e-07, + "loss": 0.0817, + "step": 1230 + }, + { + "epoch": 0.004954104615507546, + "grad_norm": 3.8059277534484863, + "learning_rate": 1.9999988611822026e-07, + "loss": 0.0804, + "step": 1240 + }, + { + "epoch": 0.00499405707208422, + "grad_norm": 6.938385009765625, + "learning_rate": 1.9999987643036253e-07, + "loss": 0.0788, + "step": 1250 + }, + { + "epoch": 0.005034009528660894, + "grad_norm": 4.4594011306762695, + "learning_rate": 1.9999986634708237e-07, + "loss": 0.0812, + "step": 1260 + }, + { + "epoch": 0.005073961985237567, + "grad_norm": 2.942603826522827, + "learning_rate": 1.9999985586837979e-07, + "loss": 0.0819, + "step": 1270 + }, + { + "epoch": 0.005113914441814241, + "grad_norm": 3.9391515254974365, + "learning_rate": 1.9999984499425485e-07, + "loss": 0.0824, + "step": 1280 + }, + { + "epoch": 0.005153866898390915, + "grad_norm": 4.9759931564331055, + "learning_rate": 1.9999983372470765e-07, + "loss": 0.0814, + "step": 1290 + }, + { + "epoch": 0.005193819354967589, + "grad_norm": 4.812597751617432, + "learning_rate": 1.9999982205973817e-07, + "loss": 0.0776, + "step": 1300 + }, + { + "epoch": 0.005233771811544262, + "grad_norm": 3.4965343475341797, + "learning_rate": 1.9999980999934648e-07, + "loss": 0.0808, + "step": 1310 + }, + { + "epoch": 0.0052737242681209365, + "grad_norm": 3.5398149490356445, + "learning_rate": 1.9999979754353257e-07, + "loss": 0.0791, + "step": 1320 + }, + { + "epoch": 0.00531367672469761, + "grad_norm": 3.435899019241333, + "learning_rate": 1.999997846922966e-07, + "loss": 0.0748, + "step": 1330 + }, + { + "epoch": 0.005353629181274283, + "grad_norm": 4.133796215057373, + "learning_rate": 1.9999977144563852e-07, + "loss": 0.083, + "step": 1340 + }, + { + "epoch": 0.0053935816378509575, + "grad_norm": 3.5189433097839355, + "learning_rate": 1.9999975780355843e-07, + "loss": 0.0791, + "step": 1350 + }, + { + "epoch": 0.005433534094427631, + "grad_norm": 3.971168041229248, + "learning_rate": 1.999997437660564e-07, + "loss": 0.0807, + "step": 1360 + }, + { + "epoch": 0.005473486551004305, + "grad_norm": 8.99303913116455, + "learning_rate": 1.9999972933313246e-07, + "loss": 0.0804, + "step": 1370 + }, + { + "epoch": 0.005513439007580979, + "grad_norm": 6.399931907653809, + "learning_rate": 1.9999971450478665e-07, + "loss": 0.0798, + "step": 1380 + }, + { + "epoch": 0.005553391464157653, + "grad_norm": 4.760232448577881, + "learning_rate": 1.9999969928101902e-07, + "loss": 0.076, + "step": 1390 + }, + { + "epoch": 0.005593343920734326, + "grad_norm": 5.587004661560059, + "learning_rate": 1.999996836618297e-07, + "loss": 0.08, + "step": 1400 + }, + { + "epoch": 0.005633296377311, + "grad_norm": 3.0035483837127686, + "learning_rate": 1.9999966764721872e-07, + "loss": 0.0816, + "step": 1410 + }, + { + "epoch": 0.005673248833887674, + "grad_norm": 5.433310508728027, + "learning_rate": 1.999996512371861e-07, + "loss": 0.0733, + "step": 1420 + }, + { + "epoch": 0.005713201290464347, + "grad_norm": 5.407934665679932, + "learning_rate": 1.9999963443173197e-07, + "loss": 0.077, + "step": 1430 + }, + { + "epoch": 0.0057531537470410215, + "grad_norm": 5.5297136306762695, + "learning_rate": 1.9999961723085632e-07, + "loss": 0.0744, + "step": 1440 + }, + { + "epoch": 0.005793106203617695, + "grad_norm": 7.000801086425781, + "learning_rate": 1.999995996345593e-07, + "loss": 0.0772, + "step": 1450 + }, + { + "epoch": 0.005833058660194369, + "grad_norm": 7.005748748779297, + "learning_rate": 1.9999958164284094e-07, + "loss": 0.079, + "step": 1460 + }, + { + "epoch": 0.0058730111167710425, + "grad_norm": 4.568352699279785, + "learning_rate": 1.9999956325570132e-07, + "loss": 0.0764, + "step": 1470 + }, + { + "epoch": 0.005912963573347716, + "grad_norm": 4.320087432861328, + "learning_rate": 1.9999954447314046e-07, + "loss": 0.0769, + "step": 1480 + }, + { + "epoch": 0.00595291602992439, + "grad_norm": 3.638920783996582, + "learning_rate": 1.9999952529515852e-07, + "loss": 0.0769, + "step": 1490 + }, + { + "epoch": 0.0059928684865010635, + "grad_norm": 3.999699831008911, + "learning_rate": 1.9999950572175553e-07, + "loss": 0.0737, + "step": 1500 + }, + { + "epoch": 0.006032820943077738, + "grad_norm": 3.887305498123169, + "learning_rate": 1.9999948575293156e-07, + "loss": 0.0782, + "step": 1510 + }, + { + "epoch": 0.006072773399654411, + "grad_norm": 3.590383529663086, + "learning_rate": 1.999994653886867e-07, + "loss": 0.0738, + "step": 1520 + }, + { + "epoch": 0.0061127258562310854, + "grad_norm": 4.09374475479126, + "learning_rate": 1.9999944462902105e-07, + "loss": 0.0786, + "step": 1530 + }, + { + "epoch": 0.006152678312807759, + "grad_norm": 5.750084400177002, + "learning_rate": 1.9999942347393467e-07, + "loss": 0.0788, + "step": 1540 + }, + { + "epoch": 0.006192630769384432, + "grad_norm": 5.532803058624268, + "learning_rate": 1.9999940192342763e-07, + "loss": 0.0726, + "step": 1550 + }, + { + "epoch": 0.0062325832259611065, + "grad_norm": 8.183211326599121, + "learning_rate": 1.9999937997750007e-07, + "loss": 0.0743, + "step": 1560 + }, + { + "epoch": 0.00627253568253778, + "grad_norm": 5.710239410400391, + "learning_rate": 1.99999357636152e-07, + "loss": 0.0795, + "step": 1570 + }, + { + "epoch": 0.006312488139114454, + "grad_norm": 3.8146636486053467, + "learning_rate": 1.999993348993836e-07, + "loss": 0.0756, + "step": 1580 + }, + { + "epoch": 0.0063524405956911275, + "grad_norm": 4.738094329833984, + "learning_rate": 1.9999931176719486e-07, + "loss": 0.0771, + "step": 1590 + }, + { + "epoch": 0.006392393052267801, + "grad_norm": 4.774824619293213, + "learning_rate": 1.9999928823958594e-07, + "loss": 0.077, + "step": 1600 + }, + { + "epoch": 0.006432345508844475, + "grad_norm": 5.525330066680908, + "learning_rate": 1.999992643165569e-07, + "loss": 0.0719, + "step": 1610 + }, + { + "epoch": 0.0064722979654211485, + "grad_norm": 4.31286096572876, + "learning_rate": 1.9999923999810785e-07, + "loss": 0.0745, + "step": 1620 + }, + { + "epoch": 0.006512250421997823, + "grad_norm": 5.983364105224609, + "learning_rate": 1.999992152842389e-07, + "loss": 0.0736, + "step": 1630 + }, + { + "epoch": 0.006552202878574496, + "grad_norm": 6.835084915161133, + "learning_rate": 1.9999919017495012e-07, + "loss": 0.0724, + "step": 1640 + }, + { + "epoch": 0.00659215533515117, + "grad_norm": 5.799070835113525, + "learning_rate": 1.9999916467024166e-07, + "loss": 0.0722, + "step": 1650 + }, + { + "epoch": 0.006632107791727844, + "grad_norm": 4.562288761138916, + "learning_rate": 1.9999913877011353e-07, + "loss": 0.0749, + "step": 1660 + }, + { + "epoch": 0.006672060248304517, + "grad_norm": 4.678937911987305, + "learning_rate": 1.999991124745659e-07, + "loss": 0.0794, + "step": 1670 + }, + { + "epoch": 0.0067120127048811914, + "grad_norm": 5.087986469268799, + "learning_rate": 1.9999908578359887e-07, + "loss": 0.0705, + "step": 1680 + }, + { + "epoch": 0.006751965161457865, + "grad_norm": 3.7558884620666504, + "learning_rate": 1.9999905869721255e-07, + "loss": 0.0783, + "step": 1690 + }, + { + "epoch": 0.006791917618034539, + "grad_norm": 4.804757118225098, + "learning_rate": 1.9999903121540699e-07, + "loss": 0.0735, + "step": 1700 + }, + { + "epoch": 0.0068318700746112125, + "grad_norm": 4.257309913635254, + "learning_rate": 1.9999900333818233e-07, + "loss": 0.0755, + "step": 1710 + }, + { + "epoch": 0.006871822531187887, + "grad_norm": 3.274388313293457, + "learning_rate": 1.9999897506553873e-07, + "loss": 0.0739, + "step": 1720 + }, + { + "epoch": 0.00691177498776456, + "grad_norm": 5.1400299072265625, + "learning_rate": 1.9999894639747625e-07, + "loss": 0.0706, + "step": 1730 + }, + { + "epoch": 0.0069517274443412335, + "grad_norm": 4.5385284423828125, + "learning_rate": 1.99998917333995e-07, + "loss": 0.0768, + "step": 1740 + }, + { + "epoch": 0.006991679900917908, + "grad_norm": 3.6151580810546875, + "learning_rate": 1.9999888787509513e-07, + "loss": 0.0719, + "step": 1750 + }, + { + "epoch": 0.007031632357494581, + "grad_norm": 5.180079936981201, + "learning_rate": 1.9999885802077672e-07, + "loss": 0.0793, + "step": 1760 + }, + { + "epoch": 0.007071584814071255, + "grad_norm": 3.8837599754333496, + "learning_rate": 1.999988277710399e-07, + "loss": 0.0734, + "step": 1770 + }, + { + "epoch": 0.007111537270647929, + "grad_norm": 4.17250919342041, + "learning_rate": 1.9999879712588484e-07, + "loss": 0.0689, + "step": 1780 + }, + { + "epoch": 0.007151489727224603, + "grad_norm": 4.388998031616211, + "learning_rate": 1.9999876608531156e-07, + "loss": 0.071, + "step": 1790 + }, + { + "epoch": 0.007191442183801276, + "grad_norm": 35.300262451171875, + "learning_rate": 1.999987346493203e-07, + "loss": 0.0762, + "step": 1800 + }, + { + "epoch": 0.00723139464037795, + "grad_norm": 4.7005438804626465, + "learning_rate": 1.9999870281791105e-07, + "loss": 0.0716, + "step": 1810 + }, + { + "epoch": 0.007271347096954624, + "grad_norm": 4.587695598602295, + "learning_rate": 1.9999867059108406e-07, + "loss": 0.0758, + "step": 1820 + }, + { + "epoch": 0.0073112995535312975, + "grad_norm": 7.7645063400268555, + "learning_rate": 1.9999863796883938e-07, + "loss": 0.0703, + "step": 1830 + }, + { + "epoch": 0.007351252010107972, + "grad_norm": 5.538671493530273, + "learning_rate": 1.9999860495117717e-07, + "loss": 0.0719, + "step": 1840 + }, + { + "epoch": 0.007391204466684645, + "grad_norm": 4.648557662963867, + "learning_rate": 1.9999857153809753e-07, + "loss": 0.0723, + "step": 1850 + }, + { + "epoch": 0.007431156923261319, + "grad_norm": 5.345491409301758, + "learning_rate": 1.9999853772960065e-07, + "loss": 0.0719, + "step": 1860 + }, + { + "epoch": 0.007471109379837993, + "grad_norm": 4.889125823974609, + "learning_rate": 1.9999850352568663e-07, + "loss": 0.0716, + "step": 1870 + }, + { + "epoch": 0.007511061836414666, + "grad_norm": 5.144402027130127, + "learning_rate": 1.999984689263556e-07, + "loss": 0.0741, + "step": 1880 + }, + { + "epoch": 0.00755101429299134, + "grad_norm": 6.539116382598877, + "learning_rate": 1.999984339316077e-07, + "loss": 0.0714, + "step": 1890 + }, + { + "epoch": 0.007590966749568014, + "grad_norm": 5.308338165283203, + "learning_rate": 1.9999839854144308e-07, + "loss": 0.069, + "step": 1900 + }, + { + "epoch": 0.007630919206144688, + "grad_norm": 9.062039375305176, + "learning_rate": 1.9999836275586187e-07, + "loss": 0.0728, + "step": 1910 + }, + { + "epoch": 0.007670871662721361, + "grad_norm": 4.469082832336426, + "learning_rate": 1.999983265748642e-07, + "loss": 0.0634, + "step": 1920 + }, + { + "epoch": 0.007710824119298036, + "grad_norm": 4.078295707702637, + "learning_rate": 1.9999828999845026e-07, + "loss": 0.0736, + "step": 1930 + }, + { + "epoch": 0.007750776575874709, + "grad_norm": 5.0896992683410645, + "learning_rate": 1.9999825302662013e-07, + "loss": 0.0685, + "step": 1940 + }, + { + "epoch": 0.0077907290324513824, + "grad_norm": 5.298365116119385, + "learning_rate": 1.99998215659374e-07, + "loss": 0.0729, + "step": 1950 + }, + { + "epoch": 0.007830681489028057, + "grad_norm": 9.375609397888184, + "learning_rate": 1.9999817789671195e-07, + "loss": 0.0703, + "step": 1960 + }, + { + "epoch": 0.00787063394560473, + "grad_norm": 4.6998186111450195, + "learning_rate": 1.9999813973863424e-07, + "loss": 0.0729, + "step": 1970 + }, + { + "epoch": 0.007910586402181403, + "grad_norm": 4.949733257293701, + "learning_rate": 1.9999810118514098e-07, + "loss": 0.0642, + "step": 1980 + }, + { + "epoch": 0.007950538858758079, + "grad_norm": 7.436445236206055, + "learning_rate": 1.9999806223623228e-07, + "loss": 0.0652, + "step": 1990 + }, + { + "epoch": 0.007990491315334752, + "grad_norm": 6.345901966094971, + "learning_rate": 1.999980228919083e-07, + "loss": 0.0683, + "step": 2000 + }, + { + "epoch": 0.008030443771911425, + "grad_norm": 8.87732219696045, + "learning_rate": 1.9999798315216924e-07, + "loss": 0.0733, + "step": 2010 + }, + { + "epoch": 0.008070396228488099, + "grad_norm": 5.325031757354736, + "learning_rate": 1.9999794301701524e-07, + "loss": 0.0707, + "step": 2020 + }, + { + "epoch": 0.008110348685064772, + "grad_norm": 8.674820899963379, + "learning_rate": 1.9999790248644645e-07, + "loss": 0.0693, + "step": 2030 + }, + { + "epoch": 0.008150301141641447, + "grad_norm": 5.873802661895752, + "learning_rate": 1.9999786156046303e-07, + "loss": 0.0708, + "step": 2040 + }, + { + "epoch": 0.00819025359821812, + "grad_norm": 4.968667030334473, + "learning_rate": 1.9999782023906515e-07, + "loss": 0.0702, + "step": 2050 + }, + { + "epoch": 0.008230206054794794, + "grad_norm": 7.31874418258667, + "learning_rate": 1.9999777852225294e-07, + "loss": 0.0685, + "step": 2060 + }, + { + "epoch": 0.008270158511371467, + "grad_norm": 6.53287410736084, + "learning_rate": 1.9999773641002664e-07, + "loss": 0.0644, + "step": 2070 + }, + { + "epoch": 0.008310110967948143, + "grad_norm": 6.626532077789307, + "learning_rate": 1.9999769390238635e-07, + "loss": 0.0723, + "step": 2080 + }, + { + "epoch": 0.008350063424524816, + "grad_norm": 3.8849735260009766, + "learning_rate": 1.9999765099933224e-07, + "loss": 0.0715, + "step": 2090 + }, + { + "epoch": 0.00839001588110149, + "grad_norm": 6.144596099853516, + "learning_rate": 1.9999760770086451e-07, + "loss": 0.0728, + "step": 2100 + }, + { + "epoch": 0.008429968337678163, + "grad_norm": 5.362217903137207, + "learning_rate": 1.999975640069833e-07, + "loss": 0.0701, + "step": 2110 + }, + { + "epoch": 0.008469920794254836, + "grad_norm": 4.9148478507995605, + "learning_rate": 1.9999751991768883e-07, + "loss": 0.0692, + "step": 2120 + }, + { + "epoch": 0.008509873250831511, + "grad_norm": 4.821737766265869, + "learning_rate": 1.9999747543298124e-07, + "loss": 0.0668, + "step": 2130 + }, + { + "epoch": 0.008549825707408185, + "grad_norm": 6.6895270347595215, + "learning_rate": 1.9999743055286072e-07, + "loss": 0.0692, + "step": 2140 + }, + { + "epoch": 0.008589778163984858, + "grad_norm": 15.727624893188477, + "learning_rate": 1.9999738527732744e-07, + "loss": 0.0705, + "step": 2150 + }, + { + "epoch": 0.008629730620561531, + "grad_norm": 8.518064498901367, + "learning_rate": 1.9999733960638154e-07, + "loss": 0.0681, + "step": 2160 + }, + { + "epoch": 0.008669683077138205, + "grad_norm": 4.448235034942627, + "learning_rate": 1.9999729354002328e-07, + "loss": 0.0689, + "step": 2170 + }, + { + "epoch": 0.00870963553371488, + "grad_norm": 5.125146389007568, + "learning_rate": 1.9999724707825275e-07, + "loss": 0.0719, + "step": 2180 + }, + { + "epoch": 0.008749587990291553, + "grad_norm": 4.904445171356201, + "learning_rate": 1.9999720022107022e-07, + "loss": 0.0693, + "step": 2190 + }, + { + "epoch": 0.008789540446868227, + "grad_norm": 5.865431308746338, + "learning_rate": 1.9999715296847586e-07, + "loss": 0.0661, + "step": 2200 + }, + { + "epoch": 0.0088294929034449, + "grad_norm": 10.226508140563965, + "learning_rate": 1.999971053204698e-07, + "loss": 0.0637, + "step": 2210 + }, + { + "epoch": 0.008869445360021575, + "grad_norm": 6.67934513092041, + "learning_rate": 1.9999705727705228e-07, + "loss": 0.0678, + "step": 2220 + }, + { + "epoch": 0.008909397816598249, + "grad_norm": 5.464043617248535, + "learning_rate": 1.9999700883822348e-07, + "loss": 0.0618, + "step": 2230 + }, + { + "epoch": 0.008949350273174922, + "grad_norm": 4.38269567489624, + "learning_rate": 1.9999696000398357e-07, + "loss": 0.064, + "step": 2240 + }, + { + "epoch": 0.008989302729751595, + "grad_norm": 7.904757499694824, + "learning_rate": 1.9999691077433278e-07, + "loss": 0.0688, + "step": 2250 + }, + { + "epoch": 0.009029255186328269, + "grad_norm": 5.12589168548584, + "learning_rate": 1.9999686114927128e-07, + "loss": 0.0654, + "step": 2260 + }, + { + "epoch": 0.009069207642904944, + "grad_norm": 14.386942863464355, + "learning_rate": 1.9999681112879924e-07, + "loss": 0.0671, + "step": 2270 + }, + { + "epoch": 0.009109160099481617, + "grad_norm": 7.759122371673584, + "learning_rate": 1.9999676071291694e-07, + "loss": 0.0716, + "step": 2280 + }, + { + "epoch": 0.00914911255605829, + "grad_norm": 4.223438739776611, + "learning_rate": 1.999967099016245e-07, + "loss": 0.067, + "step": 2290 + }, + { + "epoch": 0.009189065012634964, + "grad_norm": 3.5409440994262695, + "learning_rate": 1.9999665869492213e-07, + "loss": 0.0704, + "step": 2300 + }, + { + "epoch": 0.009229017469211637, + "grad_norm": 4.6483612060546875, + "learning_rate": 1.9999660709281008e-07, + "loss": 0.0679, + "step": 2310 + }, + { + "epoch": 0.009268969925788312, + "grad_norm": 3.5321383476257324, + "learning_rate": 1.9999655509528852e-07, + "loss": 0.0612, + "step": 2320 + }, + { + "epoch": 0.009308922382364986, + "grad_norm": 15.838689804077148, + "learning_rate": 1.9999650270235766e-07, + "loss": 0.0633, + "step": 2330 + }, + { + "epoch": 0.00934887483894166, + "grad_norm": 4.161035537719727, + "learning_rate": 1.999964499140177e-07, + "loss": 0.0654, + "step": 2340 + }, + { + "epoch": 0.009388827295518333, + "grad_norm": 7.581517696380615, + "learning_rate": 1.9999639673026887e-07, + "loss": 0.0657, + "step": 2350 + }, + { + "epoch": 0.009428779752095008, + "grad_norm": 5.2406816482543945, + "learning_rate": 1.9999634315111135e-07, + "loss": 0.0666, + "step": 2360 + }, + { + "epoch": 0.009468732208671681, + "grad_norm": 8.651517868041992, + "learning_rate": 1.9999628917654539e-07, + "loss": 0.063, + "step": 2370 + }, + { + "epoch": 0.009508684665248355, + "grad_norm": 5.244176387786865, + "learning_rate": 1.9999623480657115e-07, + "loss": 0.0645, + "step": 2380 + }, + { + "epoch": 0.009548637121825028, + "grad_norm": 4.2036824226379395, + "learning_rate": 1.999961800411889e-07, + "loss": 0.0636, + "step": 2390 + }, + { + "epoch": 0.009588589578401701, + "grad_norm": 4.1927947998046875, + "learning_rate": 1.9999612488039883e-07, + "loss": 0.0651, + "step": 2400 + }, + { + "epoch": 0.009628542034978376, + "grad_norm": 5.556817531585693, + "learning_rate": 1.9999606932420117e-07, + "loss": 0.0686, + "step": 2410 + }, + { + "epoch": 0.00966849449155505, + "grad_norm": 4.338217735290527, + "learning_rate": 1.9999601337259612e-07, + "loss": 0.0646, + "step": 2420 + }, + { + "epoch": 0.009708446948131723, + "grad_norm": 6.011440277099609, + "learning_rate": 1.999959570255839e-07, + "loss": 0.0656, + "step": 2430 + }, + { + "epoch": 0.009748399404708397, + "grad_norm": 4.676085948944092, + "learning_rate": 1.9999590028316478e-07, + "loss": 0.058, + "step": 2440 + }, + { + "epoch": 0.00978835186128507, + "grad_norm": 8.417377471923828, + "learning_rate": 1.9999584314533892e-07, + "loss": 0.0724, + "step": 2450 + }, + { + "epoch": 0.009828304317861745, + "grad_norm": 6.624401569366455, + "learning_rate": 1.9999578561210659e-07, + "loss": 0.0633, + "step": 2460 + }, + { + "epoch": 0.009868256774438419, + "grad_norm": 6.37753963470459, + "learning_rate": 1.9999572768346803e-07, + "loss": 0.0646, + "step": 2470 + }, + { + "epoch": 0.009908209231015092, + "grad_norm": 4.513980388641357, + "learning_rate": 1.999956693594234e-07, + "loss": 0.0729, + "step": 2480 + }, + { + "epoch": 0.009948161687591765, + "grad_norm": 4.258601188659668, + "learning_rate": 1.99995610639973e-07, + "loss": 0.0633, + "step": 2490 + }, + { + "epoch": 0.00998811414416844, + "grad_norm": 4.369763374328613, + "learning_rate": 1.99995551525117e-07, + "loss": 0.0659, + "step": 2500 + }, + { + "epoch": 0.010028066600745114, + "grad_norm": 8.97291088104248, + "learning_rate": 1.999954920148557e-07, + "loss": 0.0654, + "step": 2510 + }, + { + "epoch": 0.010068019057321787, + "grad_norm": 6.192311763763428, + "learning_rate": 1.999954321091893e-07, + "loss": 0.0701, + "step": 2520 + }, + { + "epoch": 0.01010797151389846, + "grad_norm": 10.451215744018555, + "learning_rate": 1.9999537180811804e-07, + "loss": 0.0648, + "step": 2530 + }, + { + "epoch": 0.010147923970475134, + "grad_norm": 49.48408508300781, + "learning_rate": 1.9999531111164215e-07, + "loss": 0.0606, + "step": 2540 + }, + { + "epoch": 0.010187876427051809, + "grad_norm": 6.317320346832275, + "learning_rate": 1.9999525001976188e-07, + "loss": 0.0614, + "step": 2550 + }, + { + "epoch": 0.010227828883628482, + "grad_norm": 29.881383895874023, + "learning_rate": 1.9999518853247747e-07, + "loss": 0.066, + "step": 2560 + }, + { + "epoch": 0.010267781340205156, + "grad_norm": 8.161931991577148, + "learning_rate": 1.999951266497892e-07, + "loss": 0.0656, + "step": 2570 + }, + { + "epoch": 0.01030773379678183, + "grad_norm": 9.390154838562012, + "learning_rate": 1.9999506437169723e-07, + "loss": 0.0622, + "step": 2580 + }, + { + "epoch": 0.010347686253358503, + "grad_norm": 5.921825885772705, + "learning_rate": 1.999950016982019e-07, + "loss": 0.0625, + "step": 2590 + }, + { + "epoch": 0.010387638709935178, + "grad_norm": 6.152169704437256, + "learning_rate": 1.9999493862930336e-07, + "loss": 0.0688, + "step": 2600 + }, + { + "epoch": 0.010427591166511851, + "grad_norm": 5.45894718170166, + "learning_rate": 1.9999487516500195e-07, + "loss": 0.0608, + "step": 2610 + }, + { + "epoch": 0.010467543623088525, + "grad_norm": 5.739716529846191, + "learning_rate": 1.9999481130529787e-07, + "loss": 0.0654, + "step": 2620 + }, + { + "epoch": 0.010507496079665198, + "grad_norm": 8.366711616516113, + "learning_rate": 1.9999474705019138e-07, + "loss": 0.0627, + "step": 2630 + }, + { + "epoch": 0.010547448536241873, + "grad_norm": 5.5183563232421875, + "learning_rate": 1.999946823996828e-07, + "loss": 0.0603, + "step": 2640 + }, + { + "epoch": 0.010587400992818546, + "grad_norm": 5.0801005363464355, + "learning_rate": 1.9999461735377226e-07, + "loss": 0.0608, + "step": 2650 + }, + { + "epoch": 0.01062735344939522, + "grad_norm": 6.439628601074219, + "learning_rate": 1.9999455191246007e-07, + "loss": 0.062, + "step": 2660 + }, + { + "epoch": 0.010667305905971893, + "grad_norm": 4.484591960906982, + "learning_rate": 1.9999448607574656e-07, + "loss": 0.0634, + "step": 2670 + }, + { + "epoch": 0.010707258362548567, + "grad_norm": 4.483346462249756, + "learning_rate": 1.999944198436319e-07, + "loss": 0.0575, + "step": 2680 + }, + { + "epoch": 0.010747210819125242, + "grad_norm": 5.103272914886475, + "learning_rate": 1.999943532161164e-07, + "loss": 0.0662, + "step": 2690 + }, + { + "epoch": 0.010787163275701915, + "grad_norm": 4.706233024597168, + "learning_rate": 1.999942861932003e-07, + "loss": 0.0599, + "step": 2700 + }, + { + "epoch": 0.010827115732278588, + "grad_norm": 16.617538452148438, + "learning_rate": 1.9999421877488385e-07, + "loss": 0.0619, + "step": 2710 + }, + { + "epoch": 0.010867068188855262, + "grad_norm": 5.156278610229492, + "learning_rate": 1.9999415096116738e-07, + "loss": 0.0659, + "step": 2720 + }, + { + "epoch": 0.010907020645431935, + "grad_norm": 5.735390663146973, + "learning_rate": 1.9999408275205108e-07, + "loss": 0.0622, + "step": 2730 + }, + { + "epoch": 0.01094697310200861, + "grad_norm": 4.2900848388671875, + "learning_rate": 1.9999401414753528e-07, + "loss": 0.058, + "step": 2740 + }, + { + "epoch": 0.010986925558585284, + "grad_norm": 7.4003987312316895, + "learning_rate": 1.9999394514762024e-07, + "loss": 0.0643, + "step": 2750 + }, + { + "epoch": 0.011026878015161957, + "grad_norm": 5.71486759185791, + "learning_rate": 1.9999387575230616e-07, + "loss": 0.0617, + "step": 2760 + }, + { + "epoch": 0.01106683047173863, + "grad_norm": 6.0663580894470215, + "learning_rate": 1.9999380596159346e-07, + "loss": 0.0647, + "step": 2770 + }, + { + "epoch": 0.011106782928315306, + "grad_norm": 7.182838439941406, + "learning_rate": 1.9999373577548228e-07, + "loss": 0.0643, + "step": 2780 + }, + { + "epoch": 0.011146735384891979, + "grad_norm": 2.9280507564544678, + "learning_rate": 1.9999366519397297e-07, + "loss": 0.0572, + "step": 2790 + }, + { + "epoch": 0.011186687841468652, + "grad_norm": 5.142223834991455, + "learning_rate": 1.9999359421706577e-07, + "loss": 0.0592, + "step": 2800 + }, + { + "epoch": 0.011226640298045326, + "grad_norm": 9.914444923400879, + "learning_rate": 1.9999352284476096e-07, + "loss": 0.0635, + "step": 2810 + }, + { + "epoch": 0.011266592754622, + "grad_norm": 17.795087814331055, + "learning_rate": 1.9999345107705888e-07, + "loss": 0.0597, + "step": 2820 + }, + { + "epoch": 0.011306545211198674, + "grad_norm": 6.50901985168457, + "learning_rate": 1.9999337891395975e-07, + "loss": 0.0648, + "step": 2830 + }, + { + "epoch": 0.011346497667775348, + "grad_norm": 6.162097454071045, + "learning_rate": 1.999933063554639e-07, + "loss": 0.061, + "step": 2840 + }, + { + "epoch": 0.011386450124352021, + "grad_norm": 4.456450462341309, + "learning_rate": 1.9999323340157158e-07, + "loss": 0.0664, + "step": 2850 + }, + { + "epoch": 0.011426402580928694, + "grad_norm": 5.097443103790283, + "learning_rate": 1.9999316005228312e-07, + "loss": 0.0612, + "step": 2860 + }, + { + "epoch": 0.011466355037505368, + "grad_norm": 4.066407203674316, + "learning_rate": 1.9999308630759875e-07, + "loss": 0.0634, + "step": 2870 + }, + { + "epoch": 0.011506307494082043, + "grad_norm": 5.307518482208252, + "learning_rate": 1.999930121675188e-07, + "loss": 0.0634, + "step": 2880 + }, + { + "epoch": 0.011546259950658716, + "grad_norm": 6.979706287384033, + "learning_rate": 1.999929376320436e-07, + "loss": 0.0604, + "step": 2890 + }, + { + "epoch": 0.01158621240723539, + "grad_norm": 4.886172771453857, + "learning_rate": 1.9999286270117338e-07, + "loss": 0.0615, + "step": 2900 + }, + { + "epoch": 0.011626164863812063, + "grad_norm": 4.774227619171143, + "learning_rate": 1.9999278737490846e-07, + "loss": 0.0621, + "step": 2910 + }, + { + "epoch": 0.011666117320388738, + "grad_norm": 5.1474528312683105, + "learning_rate": 1.9999271165324917e-07, + "loss": 0.0628, + "step": 2920 + }, + { + "epoch": 0.011706069776965412, + "grad_norm": 19.40691375732422, + "learning_rate": 1.9999263553619571e-07, + "loss": 0.0605, + "step": 2930 + }, + { + "epoch": 0.011746022233542085, + "grad_norm": 5.122270584106445, + "learning_rate": 1.999925590237485e-07, + "loss": 0.0593, + "step": 2940 + }, + { + "epoch": 0.011785974690118758, + "grad_norm": 6.664755821228027, + "learning_rate": 1.9999248211590779e-07, + "loss": 0.0594, + "step": 2950 + }, + { + "epoch": 0.011825927146695432, + "grad_norm": 9.390413284301758, + "learning_rate": 1.9999240481267387e-07, + "loss": 0.0582, + "step": 2960 + }, + { + "epoch": 0.011865879603272107, + "grad_norm": 12.886116981506348, + "learning_rate": 1.999923271140471e-07, + "loss": 0.0638, + "step": 2970 + }, + { + "epoch": 0.01190583205984878, + "grad_norm": 5.591165065765381, + "learning_rate": 1.9999224902002768e-07, + "loss": 0.0623, + "step": 2980 + }, + { + "epoch": 0.011945784516425454, + "grad_norm": 9.652422904968262, + "learning_rate": 1.9999217053061605e-07, + "loss": 0.0614, + "step": 2990 + }, + { + "epoch": 0.011985736973002127, + "grad_norm": 7.830957889556885, + "learning_rate": 1.9999209164581243e-07, + "loss": 0.0648, + "step": 3000 + }, + { + "epoch": 0.0120256894295788, + "grad_norm": 6.437974452972412, + "learning_rate": 1.9999201236561716e-07, + "loss": 0.0563, + "step": 3010 + }, + { + "epoch": 0.012065641886155476, + "grad_norm": 4.757688045501709, + "learning_rate": 1.9999193269003057e-07, + "loss": 0.0597, + "step": 3020 + }, + { + "epoch": 0.012105594342732149, + "grad_norm": 7.4366278648376465, + "learning_rate": 1.9999185261905295e-07, + "loss": 0.0619, + "step": 3030 + }, + { + "epoch": 0.012145546799308822, + "grad_norm": 10.12637710571289, + "learning_rate": 1.9999177215268463e-07, + "loss": 0.0588, + "step": 3040 + }, + { + "epoch": 0.012185499255885496, + "grad_norm": 12.869869232177734, + "learning_rate": 1.999916912909259e-07, + "loss": 0.0584, + "step": 3050 + }, + { + "epoch": 0.012225451712462171, + "grad_norm": 5.366079330444336, + "learning_rate": 1.999916100337771e-07, + "loss": 0.0676, + "step": 3060 + }, + { + "epoch": 0.012265404169038844, + "grad_norm": 4.153650760650635, + "learning_rate": 1.9999152838123861e-07, + "loss": 0.0631, + "step": 3070 + }, + { + "epoch": 0.012305356625615518, + "grad_norm": 23.691713333129883, + "learning_rate": 1.9999144633331065e-07, + "loss": 0.0622, + "step": 3080 + }, + { + "epoch": 0.012345309082192191, + "grad_norm": 6.222256660461426, + "learning_rate": 1.999913638899936e-07, + "loss": 0.0629, + "step": 3090 + }, + { + "epoch": 0.012385261538768864, + "grad_norm": 6.147396087646484, + "learning_rate": 1.9999128105128779e-07, + "loss": 0.0613, + "step": 3100 + }, + { + "epoch": 0.01242521399534554, + "grad_norm": 8.302750587463379, + "learning_rate": 1.9999119781719355e-07, + "loss": 0.0661, + "step": 3110 + }, + { + "epoch": 0.012465166451922213, + "grad_norm": 6.202587604522705, + "learning_rate": 1.9999111418771115e-07, + "loss": 0.0598, + "step": 3120 + }, + { + "epoch": 0.012505118908498886, + "grad_norm": 6.227569103240967, + "learning_rate": 1.99991030162841e-07, + "loss": 0.0618, + "step": 3130 + }, + { + "epoch": 0.01254507136507556, + "grad_norm": 4.248029708862305, + "learning_rate": 1.9999094574258338e-07, + "loss": 0.0612, + "step": 3140 + }, + { + "epoch": 0.012585023821652233, + "grad_norm": 5.985173225402832, + "learning_rate": 1.9999086092693865e-07, + "loss": 0.0579, + "step": 3150 + }, + { + "epoch": 0.012624976278228908, + "grad_norm": 4.616217613220215, + "learning_rate": 1.9999077571590712e-07, + "loss": 0.0586, + "step": 3160 + }, + { + "epoch": 0.012664928734805582, + "grad_norm": 8.506669044494629, + "learning_rate": 1.9999069010948917e-07, + "loss": 0.0612, + "step": 3170 + }, + { + "epoch": 0.012704881191382255, + "grad_norm": 4.161624431610107, + "learning_rate": 1.999906041076851e-07, + "loss": 0.0576, + "step": 3180 + }, + { + "epoch": 0.012744833647958928, + "grad_norm": 11.790534019470215, + "learning_rate": 1.9999051771049524e-07, + "loss": 0.0664, + "step": 3190 + }, + { + "epoch": 0.012784786104535602, + "grad_norm": 5.822751998901367, + "learning_rate": 1.9999043091791997e-07, + "loss": 0.0609, + "step": 3200 + }, + { + "epoch": 0.012824738561112277, + "grad_norm": 4.998936653137207, + "learning_rate": 1.9999034372995963e-07, + "loss": 0.0584, + "step": 3210 + }, + { + "epoch": 0.01286469101768895, + "grad_norm": 10.613631248474121, + "learning_rate": 1.9999025614661456e-07, + "loss": 0.0581, + "step": 3220 + }, + { + "epoch": 0.012904643474265624, + "grad_norm": 7.282402992248535, + "learning_rate": 1.9999016816788509e-07, + "loss": 0.062, + "step": 3230 + }, + { + "epoch": 0.012944595930842297, + "grad_norm": 3.915494203567505, + "learning_rate": 1.9999007979377157e-07, + "loss": 0.059, + "step": 3240 + }, + { + "epoch": 0.012984548387418972, + "grad_norm": 7.693503379821777, + "learning_rate": 1.9998999102427435e-07, + "loss": 0.0574, + "step": 3250 + }, + { + "epoch": 0.013024500843995646, + "grad_norm": 4.810811996459961, + "learning_rate": 1.999899018593938e-07, + "loss": 0.0576, + "step": 3260 + }, + { + "epoch": 0.013064453300572319, + "grad_norm": 6.214397430419922, + "learning_rate": 1.9998981229913023e-07, + "loss": 0.0548, + "step": 3270 + }, + { + "epoch": 0.013104405757148992, + "grad_norm": 5.55842924118042, + "learning_rate": 1.9998972234348407e-07, + "loss": 0.0643, + "step": 3280 + }, + { + "epoch": 0.013144358213725666, + "grad_norm": 3.472238063812256, + "learning_rate": 1.9998963199245558e-07, + "loss": 0.0594, + "step": 3290 + }, + { + "epoch": 0.01318431067030234, + "grad_norm": 4.414295196533203, + "learning_rate": 1.9998954124604519e-07, + "loss": 0.0593, + "step": 3300 + }, + { + "epoch": 0.013224263126879014, + "grad_norm": 4.651453971862793, + "learning_rate": 1.9998945010425326e-07, + "loss": 0.0563, + "step": 3310 + }, + { + "epoch": 0.013264215583455688, + "grad_norm": 4.688543796539307, + "learning_rate": 1.9998935856708007e-07, + "loss": 0.0609, + "step": 3320 + }, + { + "epoch": 0.013304168040032361, + "grad_norm": 5.625494480133057, + "learning_rate": 1.9998926663452606e-07, + "loss": 0.0594, + "step": 3330 + }, + { + "epoch": 0.013344120496609034, + "grad_norm": 5.699452877044678, + "learning_rate": 1.9998917430659162e-07, + "loss": 0.0587, + "step": 3340 + }, + { + "epoch": 0.01338407295318571, + "grad_norm": 12.8616943359375, + "learning_rate": 1.99989081583277e-07, + "loss": 0.0607, + "step": 3350 + }, + { + "epoch": 0.013424025409762383, + "grad_norm": 4.09845495223999, + "learning_rate": 1.9998898846458268e-07, + "loss": 0.0629, + "step": 3360 + }, + { + "epoch": 0.013463977866339056, + "grad_norm": 7.667017459869385, + "learning_rate": 1.9998889495050894e-07, + "loss": 0.0626, + "step": 3370 + }, + { + "epoch": 0.01350393032291573, + "grad_norm": 4.188732147216797, + "learning_rate": 1.999888010410562e-07, + "loss": 0.0572, + "step": 3380 + }, + { + "epoch": 0.013543882779492405, + "grad_norm": 19.452119827270508, + "learning_rate": 1.9998870673622484e-07, + "loss": 0.0544, + "step": 3390 + }, + { + "epoch": 0.013583835236069078, + "grad_norm": 3.4278697967529297, + "learning_rate": 1.999886120360152e-07, + "loss": 0.0586, + "step": 3400 + }, + { + "epoch": 0.013623787692645752, + "grad_norm": 10.832151412963867, + "learning_rate": 1.999885169404277e-07, + "loss": 0.0583, + "step": 3410 + }, + { + "epoch": 0.013663740149222425, + "grad_norm": 7.351028919219971, + "learning_rate": 1.9998842144946265e-07, + "loss": 0.0564, + "step": 3420 + }, + { + "epoch": 0.013703692605799098, + "grad_norm": 5.654749393463135, + "learning_rate": 1.9998832556312047e-07, + "loss": 0.057, + "step": 3430 + }, + { + "epoch": 0.013743645062375773, + "grad_norm": 5.403013706207275, + "learning_rate": 1.9998822928140154e-07, + "loss": 0.0549, + "step": 3440 + }, + { + "epoch": 0.013783597518952447, + "grad_norm": 7.070730686187744, + "learning_rate": 1.999881326043062e-07, + "loss": 0.0573, + "step": 3450 + }, + { + "epoch": 0.01382354997552912, + "grad_norm": 5.02528715133667, + "learning_rate": 1.999880355318349e-07, + "loss": 0.0593, + "step": 3460 + }, + { + "epoch": 0.013863502432105794, + "grad_norm": 9.05947208404541, + "learning_rate": 1.9998793806398798e-07, + "loss": 0.0536, + "step": 3470 + }, + { + "epoch": 0.013903454888682467, + "grad_norm": 4.074074745178223, + "learning_rate": 1.9998784020076587e-07, + "loss": 0.0542, + "step": 3480 + }, + { + "epoch": 0.013943407345259142, + "grad_norm": 10.380420684814453, + "learning_rate": 1.999877419421689e-07, + "loss": 0.0615, + "step": 3490 + }, + { + "epoch": 0.013983359801835816, + "grad_norm": 5.242879390716553, + "learning_rate": 1.9998764328819746e-07, + "loss": 0.056, + "step": 3500 + }, + { + "epoch": 0.014023312258412489, + "grad_norm": 5.361444473266602, + "learning_rate": 1.9998754423885195e-07, + "loss": 0.0614, + "step": 3510 + }, + { + "epoch": 0.014063264714989162, + "grad_norm": 3.0353522300720215, + "learning_rate": 1.999874447941328e-07, + "loss": 0.0593, + "step": 3520 + }, + { + "epoch": 0.014103217171565837, + "grad_norm": 3.6585006713867188, + "learning_rate": 1.9998734495404037e-07, + "loss": 0.0561, + "step": 3530 + }, + { + "epoch": 0.01414316962814251, + "grad_norm": 4.330874919891357, + "learning_rate": 1.9998724471857504e-07, + "loss": 0.0573, + "step": 3540 + }, + { + "epoch": 0.014183122084719184, + "grad_norm": 6.229880332946777, + "learning_rate": 1.9998714408773726e-07, + "loss": 0.0574, + "step": 3550 + }, + { + "epoch": 0.014223074541295858, + "grad_norm": 4.216676235198975, + "learning_rate": 1.9998704306152736e-07, + "loss": 0.0603, + "step": 3560 + }, + { + "epoch": 0.014263026997872531, + "grad_norm": 7.729151725769043, + "learning_rate": 1.999869416399458e-07, + "loss": 0.0577, + "step": 3570 + }, + { + "epoch": 0.014302979454449206, + "grad_norm": 3.9965620040893555, + "learning_rate": 1.9998683982299296e-07, + "loss": 0.063, + "step": 3580 + }, + { + "epoch": 0.01434293191102588, + "grad_norm": 9.594494819641113, + "learning_rate": 1.999867376106692e-07, + "loss": 0.0563, + "step": 3590 + }, + { + "epoch": 0.014382884367602553, + "grad_norm": 7.562802314758301, + "learning_rate": 1.99986635002975e-07, + "loss": 0.0598, + "step": 3600 + }, + { + "epoch": 0.014422836824179226, + "grad_norm": 7.098608016967773, + "learning_rate": 1.999865319999107e-07, + "loss": 0.0577, + "step": 3610 + }, + { + "epoch": 0.0144627892807559, + "grad_norm": 6.73671817779541, + "learning_rate": 1.9998642860147674e-07, + "loss": 0.0594, + "step": 3620 + }, + { + "epoch": 0.014502741737332575, + "grad_norm": 6.535707950592041, + "learning_rate": 1.9998632480767354e-07, + "loss": 0.0588, + "step": 3630 + }, + { + "epoch": 0.014542694193909248, + "grad_norm": 5.9286980628967285, + "learning_rate": 1.9998622061850145e-07, + "loss": 0.056, + "step": 3640 + }, + { + "epoch": 0.014582646650485922, + "grad_norm": 5.018116474151611, + "learning_rate": 1.99986116033961e-07, + "loss": 0.0593, + "step": 3650 + }, + { + "epoch": 0.014622599107062595, + "grad_norm": 8.149359703063965, + "learning_rate": 1.9998601105405247e-07, + "loss": 0.0577, + "step": 3660 + }, + { + "epoch": 0.01466255156363927, + "grad_norm": 4.476722717285156, + "learning_rate": 1.9998590567877636e-07, + "loss": 0.0572, + "step": 3670 + }, + { + "epoch": 0.014702504020215943, + "grad_norm": 3.990445375442505, + "learning_rate": 1.9998579990813305e-07, + "loss": 0.0596, + "step": 3680 + }, + { + "epoch": 0.014742456476792617, + "grad_norm": 5.281167030334473, + "learning_rate": 1.9998569374212294e-07, + "loss": 0.0587, + "step": 3690 + }, + { + "epoch": 0.01478240893336929, + "grad_norm": 4.960692405700684, + "learning_rate": 1.9998558718074655e-07, + "loss": 0.0525, + "step": 3700 + }, + { + "epoch": 0.014822361389945964, + "grad_norm": 4.984175205230713, + "learning_rate": 1.9998548022400416e-07, + "loss": 0.0615, + "step": 3710 + }, + { + "epoch": 0.014862313846522639, + "grad_norm": 6.324837684631348, + "learning_rate": 1.999853728718963e-07, + "loss": 0.0587, + "step": 3720 + }, + { + "epoch": 0.014902266303099312, + "grad_norm": 9.661124229431152, + "learning_rate": 1.9998526512442335e-07, + "loss": 0.0581, + "step": 3730 + }, + { + "epoch": 0.014942218759675985, + "grad_norm": 26.455751419067383, + "learning_rate": 1.9998515698158576e-07, + "loss": 0.0533, + "step": 3740 + }, + { + "epoch": 0.014982171216252659, + "grad_norm": 7.365353584289551, + "learning_rate": 1.999850484433839e-07, + "loss": 0.0575, + "step": 3750 + }, + { + "epoch": 0.015022123672829332, + "grad_norm": 6.3583760261535645, + "learning_rate": 1.9998493950981827e-07, + "loss": 0.0584, + "step": 3760 + }, + { + "epoch": 0.015062076129406007, + "grad_norm": 6.970668315887451, + "learning_rate": 1.9998483018088928e-07, + "loss": 0.0536, + "step": 3770 + }, + { + "epoch": 0.01510202858598268, + "grad_norm": 6.991600036621094, + "learning_rate": 1.9998472045659734e-07, + "loss": 0.0545, + "step": 3780 + }, + { + "epoch": 0.015141981042559354, + "grad_norm": 9.549487113952637, + "learning_rate": 1.999846103369429e-07, + "loss": 0.0505, + "step": 3790 + }, + { + "epoch": 0.015181933499136028, + "grad_norm": 6.326900482177734, + "learning_rate": 1.999844998219264e-07, + "loss": 0.0562, + "step": 3800 + }, + { + "epoch": 0.015221885955712703, + "grad_norm": 11.637884140014648, + "learning_rate": 1.9998438891154825e-07, + "loss": 0.0579, + "step": 3810 + }, + { + "epoch": 0.015261838412289376, + "grad_norm": 6.781747341156006, + "learning_rate": 1.9998427760580895e-07, + "loss": 0.0548, + "step": 3820 + }, + { + "epoch": 0.01530179086886605, + "grad_norm": 4.175704002380371, + "learning_rate": 1.9998416590470884e-07, + "loss": 0.0598, + "step": 3830 + }, + { + "epoch": 0.015341743325442723, + "grad_norm": 3.7842535972595215, + "learning_rate": 1.9998405380824845e-07, + "loss": 0.0539, + "step": 3840 + }, + { + "epoch": 0.015381695782019396, + "grad_norm": 11.304333686828613, + "learning_rate": 1.9998394131642819e-07, + "loss": 0.0565, + "step": 3850 + }, + { + "epoch": 0.015421648238596071, + "grad_norm": 6.897860050201416, + "learning_rate": 1.9998382842924852e-07, + "loss": 0.0557, + "step": 3860 + }, + { + "epoch": 0.015461600695172745, + "grad_norm": 4.277543544769287, + "learning_rate": 1.9998371514670986e-07, + "loss": 0.0519, + "step": 3870 + }, + { + "epoch": 0.015501553151749418, + "grad_norm": 7.145922660827637, + "learning_rate": 1.9998360146881268e-07, + "loss": 0.0588, + "step": 3880 + }, + { + "epoch": 0.015541505608326091, + "grad_norm": 5.36540412902832, + "learning_rate": 1.9998348739555744e-07, + "loss": 0.0538, + "step": 3890 + }, + { + "epoch": 0.015581458064902765, + "grad_norm": 5.027917861938477, + "learning_rate": 1.9998337292694455e-07, + "loss": 0.0545, + "step": 3900 + }, + { + "epoch": 0.01562141052147944, + "grad_norm": 8.556904792785645, + "learning_rate": 1.999832580629745e-07, + "loss": 0.0582, + "step": 3910 + }, + { + "epoch": 0.015661362978056113, + "grad_norm": 3.5763332843780518, + "learning_rate": 1.9998314280364772e-07, + "loss": 0.0575, + "step": 3920 + }, + { + "epoch": 0.01570131543463279, + "grad_norm": 4.078769207000732, + "learning_rate": 1.9998302714896468e-07, + "loss": 0.0549, + "step": 3930 + }, + { + "epoch": 0.01574126789120946, + "grad_norm": 12.152727127075195, + "learning_rate": 1.9998291109892582e-07, + "loss": 0.0576, + "step": 3940 + }, + { + "epoch": 0.015781220347786135, + "grad_norm": 5.786967754364014, + "learning_rate": 1.9998279465353162e-07, + "loss": 0.0593, + "step": 3950 + }, + { + "epoch": 0.015821172804362807, + "grad_norm": 6.199632167816162, + "learning_rate": 1.9998267781278255e-07, + "loss": 0.0538, + "step": 3960 + }, + { + "epoch": 0.015861125260939482, + "grad_norm": 8.640288352966309, + "learning_rate": 1.9998256057667903e-07, + "loss": 0.0555, + "step": 3970 + }, + { + "epoch": 0.015901077717516157, + "grad_norm": 7.416386127471924, + "learning_rate": 1.9998244294522157e-07, + "loss": 0.0568, + "step": 3980 + }, + { + "epoch": 0.01594103017409283, + "grad_norm": 8.727124214172363, + "learning_rate": 1.999823249184106e-07, + "loss": 0.0564, + "step": 3990 + }, + { + "epoch": 0.015980982630669504, + "grad_norm": 4.962089538574219, + "learning_rate": 1.999822064962466e-07, + "loss": 0.0547, + "step": 4000 + }, + { + "epoch": 0.016020935087246176, + "grad_norm": 4.818735599517822, + "learning_rate": 1.9998208767873006e-07, + "loss": 0.0566, + "step": 4010 + }, + { + "epoch": 0.01606088754382285, + "grad_norm": 14.160734176635742, + "learning_rate": 1.9998196846586138e-07, + "loss": 0.052, + "step": 4020 + }, + { + "epoch": 0.016100840000399526, + "grad_norm": 28.753080368041992, + "learning_rate": 1.9998184885764114e-07, + "loss": 0.0543, + "step": 4030 + }, + { + "epoch": 0.016140792456976197, + "grad_norm": 4.815136432647705, + "learning_rate": 1.9998172885406972e-07, + "loss": 0.061, + "step": 4040 + }, + { + "epoch": 0.016180744913552873, + "grad_norm": 3.810335874557495, + "learning_rate": 1.9998160845514763e-07, + "loss": 0.0518, + "step": 4050 + }, + { + "epoch": 0.016220697370129544, + "grad_norm": 13.2455415725708, + "learning_rate": 1.9998148766087532e-07, + "loss": 0.0537, + "step": 4060 + }, + { + "epoch": 0.01626064982670622, + "grad_norm": 5.861957550048828, + "learning_rate": 1.9998136647125332e-07, + "loss": 0.0555, + "step": 4070 + }, + { + "epoch": 0.016300602283282895, + "grad_norm": 4.1156005859375, + "learning_rate": 1.9998124488628208e-07, + "loss": 0.0573, + "step": 4080 + }, + { + "epoch": 0.016340554739859566, + "grad_norm": 4.387622833251953, + "learning_rate": 1.9998112290596208e-07, + "loss": 0.0526, + "step": 4090 + }, + { + "epoch": 0.01638050719643624, + "grad_norm": 6.111711502075195, + "learning_rate": 1.999810005302938e-07, + "loss": 0.0545, + "step": 4100 + }, + { + "epoch": 0.016420459653012913, + "grad_norm": 5.214175701141357, + "learning_rate": 1.999808777592777e-07, + "loss": 0.0597, + "step": 4110 + }, + { + "epoch": 0.016460412109589588, + "grad_norm": 3.6139066219329834, + "learning_rate": 1.9998075459291433e-07, + "loss": 0.0572, + "step": 4120 + }, + { + "epoch": 0.016500364566166263, + "grad_norm": 4.450981616973877, + "learning_rate": 1.9998063103120412e-07, + "loss": 0.0584, + "step": 4130 + }, + { + "epoch": 0.016540317022742935, + "grad_norm": 6.515717506408691, + "learning_rate": 1.999805070741476e-07, + "loss": 0.0575, + "step": 4140 + }, + { + "epoch": 0.01658026947931961, + "grad_norm": 4.358113765716553, + "learning_rate": 1.999803827217452e-07, + "loss": 0.0542, + "step": 4150 + }, + { + "epoch": 0.016620221935896285, + "grad_norm": 5.8263678550720215, + "learning_rate": 1.999802579739975e-07, + "loss": 0.0608, + "step": 4160 + }, + { + "epoch": 0.016660174392472957, + "grad_norm": 14.973634719848633, + "learning_rate": 1.999801328309049e-07, + "loss": 0.0552, + "step": 4170 + }, + { + "epoch": 0.016700126849049632, + "grad_norm": 3.962754964828491, + "learning_rate": 1.9998000729246795e-07, + "loss": 0.0582, + "step": 4180 + }, + { + "epoch": 0.016740079305626304, + "grad_norm": 4.576677322387695, + "learning_rate": 1.9997988135868715e-07, + "loss": 0.0497, + "step": 4190 + }, + { + "epoch": 0.01678003176220298, + "grad_norm": 6.045628070831299, + "learning_rate": 1.9997975502956296e-07, + "loss": 0.0551, + "step": 4200 + }, + { + "epoch": 0.016819984218779654, + "grad_norm": 7.284296035766602, + "learning_rate": 1.9997962830509594e-07, + "loss": 0.0566, + "step": 4210 + }, + { + "epoch": 0.016859936675356325, + "grad_norm": 6.021513938903809, + "learning_rate": 1.999795011852865e-07, + "loss": 0.0563, + "step": 4220 + }, + { + "epoch": 0.016899889131933, + "grad_norm": 7.721805572509766, + "learning_rate": 1.9997937367013522e-07, + "loss": 0.0592, + "step": 4230 + }, + { + "epoch": 0.016939841588509672, + "grad_norm": 5.197780132293701, + "learning_rate": 1.999792457596426e-07, + "loss": 0.0534, + "step": 4240 + }, + { + "epoch": 0.016979794045086347, + "grad_norm": 5.810977458953857, + "learning_rate": 1.9997911745380912e-07, + "loss": 0.056, + "step": 4250 + }, + { + "epoch": 0.017019746501663022, + "grad_norm": 6.099017143249512, + "learning_rate": 1.9997898875263526e-07, + "loss": 0.0559, + "step": 4260 + }, + { + "epoch": 0.017059698958239694, + "grad_norm": 12.204468727111816, + "learning_rate": 1.999788596561216e-07, + "loss": 0.0539, + "step": 4270 + }, + { + "epoch": 0.01709965141481637, + "grad_norm": 3.879924774169922, + "learning_rate": 1.999787301642686e-07, + "loss": 0.0551, + "step": 4280 + }, + { + "epoch": 0.01713960387139304, + "grad_norm": 4.906920909881592, + "learning_rate": 1.9997860027707678e-07, + "loss": 0.0652, + "step": 4290 + }, + { + "epoch": 0.017179556327969716, + "grad_norm": 3.3767309188842773, + "learning_rate": 1.9997846999454665e-07, + "loss": 0.0555, + "step": 4300 + }, + { + "epoch": 0.01721950878454639, + "grad_norm": 6.776486396789551, + "learning_rate": 1.9997833931667873e-07, + "loss": 0.0499, + "step": 4310 + }, + { + "epoch": 0.017259461241123063, + "grad_norm": 6.508986949920654, + "learning_rate": 1.9997820824347356e-07, + "loss": 0.0504, + "step": 4320 + }, + { + "epoch": 0.017299413697699738, + "grad_norm": 5.917038440704346, + "learning_rate": 1.9997807677493163e-07, + "loss": 0.0554, + "step": 4330 + }, + { + "epoch": 0.01733936615427641, + "grad_norm": 4.1241631507873535, + "learning_rate": 1.9997794491105347e-07, + "loss": 0.0577, + "step": 4340 + }, + { + "epoch": 0.017379318610853085, + "grad_norm": 5.5418596267700195, + "learning_rate": 1.9997781265183963e-07, + "loss": 0.0507, + "step": 4350 + }, + { + "epoch": 0.01741927106742976, + "grad_norm": 7.255547523498535, + "learning_rate": 1.9997767999729055e-07, + "loss": 0.0536, + "step": 4360 + }, + { + "epoch": 0.01745922352400643, + "grad_norm": 5.058083534240723, + "learning_rate": 1.9997754694740681e-07, + "loss": 0.0511, + "step": 4370 + }, + { + "epoch": 0.017499175980583107, + "grad_norm": 12.02692985534668, + "learning_rate": 1.9997741350218895e-07, + "loss": 0.0527, + "step": 4380 + }, + { + "epoch": 0.017539128437159778, + "grad_norm": 6.463216781616211, + "learning_rate": 1.9997727966163752e-07, + "loss": 0.0576, + "step": 4390 + }, + { + "epoch": 0.017579080893736453, + "grad_norm": 9.245741844177246, + "learning_rate": 1.9997714542575297e-07, + "loss": 0.0558, + "step": 4400 + }, + { + "epoch": 0.01761903335031313, + "grad_norm": 9.142390251159668, + "learning_rate": 1.9997701079453586e-07, + "loss": 0.0531, + "step": 4410 + }, + { + "epoch": 0.0176589858068898, + "grad_norm": 15.368417739868164, + "learning_rate": 1.9997687576798674e-07, + "loss": 0.0564, + "step": 4420 + }, + { + "epoch": 0.017698938263466475, + "grad_norm": 6.798610210418701, + "learning_rate": 1.9997674034610616e-07, + "loss": 0.0569, + "step": 4430 + }, + { + "epoch": 0.01773889072004315, + "grad_norm": 4.176766395568848, + "learning_rate": 1.999766045288946e-07, + "loss": 0.0543, + "step": 4440 + }, + { + "epoch": 0.017778843176619822, + "grad_norm": 12.19363784790039, + "learning_rate": 1.9997646831635264e-07, + "loss": 0.0553, + "step": 4450 + }, + { + "epoch": 0.017818795633196497, + "grad_norm": 6.921565055847168, + "learning_rate": 1.9997633170848083e-07, + "loss": 0.0547, + "step": 4460 + }, + { + "epoch": 0.01785874808977317, + "grad_norm": 9.632270812988281, + "learning_rate": 1.9997619470527969e-07, + "loss": 0.0538, + "step": 4470 + }, + { + "epoch": 0.017898700546349844, + "grad_norm": 4.778588771820068, + "learning_rate": 1.9997605730674974e-07, + "loss": 0.056, + "step": 4480 + }, + { + "epoch": 0.01793865300292652, + "grad_norm": 4.472227096557617, + "learning_rate": 1.9997591951289154e-07, + "loss": 0.0581, + "step": 4490 + }, + { + "epoch": 0.01797860545950319, + "grad_norm": 7.201125144958496, + "learning_rate": 1.9997578132370565e-07, + "loss": 0.0527, + "step": 4500 + }, + { + "epoch": 0.018018557916079866, + "grad_norm": 6.694137096405029, + "learning_rate": 1.9997564273919262e-07, + "loss": 0.0565, + "step": 4510 + }, + { + "epoch": 0.018058510372656537, + "grad_norm": 4.935319900512695, + "learning_rate": 1.9997550375935295e-07, + "loss": 0.0529, + "step": 4520 + }, + { + "epoch": 0.018098462829233213, + "grad_norm": 8.33291244506836, + "learning_rate": 1.9997536438418724e-07, + "loss": 0.054, + "step": 4530 + }, + { + "epoch": 0.018138415285809888, + "grad_norm": 4.439997673034668, + "learning_rate": 1.9997522461369602e-07, + "loss": 0.0486, + "step": 4540 + }, + { + "epoch": 0.01817836774238656, + "grad_norm": 4.137319087982178, + "learning_rate": 1.9997508444787984e-07, + "loss": 0.0568, + "step": 4550 + }, + { + "epoch": 0.018218320198963234, + "grad_norm": 5.443885803222656, + "learning_rate": 1.9997494388673928e-07, + "loss": 0.0499, + "step": 4560 + }, + { + "epoch": 0.018258272655539906, + "grad_norm": 6.272200107574463, + "learning_rate": 1.9997480293027486e-07, + "loss": 0.0552, + "step": 4570 + }, + { + "epoch": 0.01829822511211658, + "grad_norm": 11.901983261108398, + "learning_rate": 1.9997466157848714e-07, + "loss": 0.0506, + "step": 4580 + }, + { + "epoch": 0.018338177568693256, + "grad_norm": 6.200244903564453, + "learning_rate": 1.9997451983137675e-07, + "loss": 0.0538, + "step": 4590 + }, + { + "epoch": 0.018378130025269928, + "grad_norm": 4.97955322265625, + "learning_rate": 1.9997437768894415e-07, + "loss": 0.0528, + "step": 4600 + }, + { + "epoch": 0.018418082481846603, + "grad_norm": 5.510714530944824, + "learning_rate": 1.9997423515118995e-07, + "loss": 0.0491, + "step": 4610 + }, + { + "epoch": 0.018458034938423275, + "grad_norm": 8.431427955627441, + "learning_rate": 1.999740922181147e-07, + "loss": 0.0541, + "step": 4620 + }, + { + "epoch": 0.01849798739499995, + "grad_norm": 7.068202972412109, + "learning_rate": 1.9997394888971897e-07, + "loss": 0.0511, + "step": 4630 + }, + { + "epoch": 0.018537939851576625, + "grad_norm": 4.055573463439941, + "learning_rate": 1.9997380516600334e-07, + "loss": 0.0487, + "step": 4640 + }, + { + "epoch": 0.018577892308153297, + "grad_norm": 10.819625854492188, + "learning_rate": 1.999736610469684e-07, + "loss": 0.051, + "step": 4650 + }, + { + "epoch": 0.018617844764729972, + "grad_norm": 7.983130931854248, + "learning_rate": 1.9997351653261464e-07, + "loss": 0.0503, + "step": 4660 + }, + { + "epoch": 0.018657797221306643, + "grad_norm": 14.434221267700195, + "learning_rate": 1.9997337162294271e-07, + "loss": 0.0554, + "step": 4670 + }, + { + "epoch": 0.01869774967788332, + "grad_norm": 15.442559242248535, + "learning_rate": 1.9997322631795315e-07, + "loss": 0.052, + "step": 4680 + }, + { + "epoch": 0.018737702134459994, + "grad_norm": 4.498370170593262, + "learning_rate": 1.9997308061764654e-07, + "loss": 0.0535, + "step": 4690 + }, + { + "epoch": 0.018777654591036665, + "grad_norm": 7.150179386138916, + "learning_rate": 1.9997293452202345e-07, + "loss": 0.051, + "step": 4700 + }, + { + "epoch": 0.01881760704761334, + "grad_norm": 6.014276027679443, + "learning_rate": 1.9997278803108445e-07, + "loss": 0.0528, + "step": 4710 + }, + { + "epoch": 0.018857559504190016, + "grad_norm": 8.904228210449219, + "learning_rate": 1.9997264114483015e-07, + "loss": 0.053, + "step": 4720 + }, + { + "epoch": 0.018897511960766687, + "grad_norm": 8.693397521972656, + "learning_rate": 1.9997249386326112e-07, + "loss": 0.049, + "step": 4730 + }, + { + "epoch": 0.018937464417343362, + "grad_norm": 10.814845085144043, + "learning_rate": 1.999723461863779e-07, + "loss": 0.0525, + "step": 4740 + }, + { + "epoch": 0.018977416873920034, + "grad_norm": 4.762622356414795, + "learning_rate": 1.9997219811418114e-07, + "loss": 0.0519, + "step": 4750 + }, + { + "epoch": 0.01901736933049671, + "grad_norm": 3.2520346641540527, + "learning_rate": 1.9997204964667138e-07, + "loss": 0.049, + "step": 4760 + }, + { + "epoch": 0.019057321787073384, + "grad_norm": 11.317850112915039, + "learning_rate": 1.999719007838492e-07, + "loss": 0.0562, + "step": 4770 + }, + { + "epoch": 0.019097274243650056, + "grad_norm": 16.8450870513916, + "learning_rate": 1.9997175152571524e-07, + "loss": 0.0573, + "step": 4780 + }, + { + "epoch": 0.01913722670022673, + "grad_norm": 5.594962120056152, + "learning_rate": 1.9997160187227005e-07, + "loss": 0.0547, + "step": 4790 + }, + { + "epoch": 0.019177179156803403, + "grad_norm": 9.333102226257324, + "learning_rate": 1.9997145182351424e-07, + "loss": 0.0493, + "step": 4800 + }, + { + "epoch": 0.019217131613380078, + "grad_norm": 7.660949230194092, + "learning_rate": 1.9997130137944838e-07, + "loss": 0.0509, + "step": 4810 + }, + { + "epoch": 0.019257084069956753, + "grad_norm": 6.180744647979736, + "learning_rate": 1.999711505400731e-07, + "loss": 0.052, + "step": 4820 + }, + { + "epoch": 0.019297036526533425, + "grad_norm": 9.046843528747559, + "learning_rate": 1.9997099930538895e-07, + "loss": 0.0559, + "step": 4830 + }, + { + "epoch": 0.0193369889831101, + "grad_norm": 6.462779521942139, + "learning_rate": 1.9997084767539656e-07, + "loss": 0.053, + "step": 4840 + }, + { + "epoch": 0.01937694143968677, + "grad_norm": 3.8118205070495605, + "learning_rate": 1.9997069565009653e-07, + "loss": 0.0585, + "step": 4850 + }, + { + "epoch": 0.019416893896263446, + "grad_norm": 4.751912593841553, + "learning_rate": 1.9997054322948946e-07, + "loss": 0.0531, + "step": 4860 + }, + { + "epoch": 0.01945684635284012, + "grad_norm": 4.9640655517578125, + "learning_rate": 1.9997039041357593e-07, + "loss": 0.0553, + "step": 4870 + }, + { + "epoch": 0.019496798809416793, + "grad_norm": 4.0941548347473145, + "learning_rate": 1.9997023720235656e-07, + "loss": 0.0471, + "step": 4880 + }, + { + "epoch": 0.01953675126599347, + "grad_norm": 4.968613147735596, + "learning_rate": 1.99970083595832e-07, + "loss": 0.0516, + "step": 4890 + }, + { + "epoch": 0.01957670372257014, + "grad_norm": 3.5731613636016846, + "learning_rate": 1.9996992959400278e-07, + "loss": 0.0546, + "step": 4900 + }, + { + "epoch": 0.019616656179146815, + "grad_norm": 5.621215343475342, + "learning_rate": 1.9996977519686956e-07, + "loss": 0.0554, + "step": 4910 + }, + { + "epoch": 0.01965660863572349, + "grad_norm": 4.17621374130249, + "learning_rate": 1.9996962040443294e-07, + "loss": 0.0498, + "step": 4920 + }, + { + "epoch": 0.019696561092300162, + "grad_norm": 3.883863687515259, + "learning_rate": 1.999694652166935e-07, + "loss": 0.0513, + "step": 4930 + }, + { + "epoch": 0.019736513548876837, + "grad_norm": 8.0399751663208, + "learning_rate": 1.999693096336519e-07, + "loss": 0.0497, + "step": 4940 + }, + { + "epoch": 0.01977646600545351, + "grad_norm": 16.862865447998047, + "learning_rate": 1.9996915365530872e-07, + "loss": 0.0599, + "step": 4950 + }, + { + "epoch": 0.019816418462030184, + "grad_norm": 6.834232330322266, + "learning_rate": 1.999689972816646e-07, + "loss": 0.0565, + "step": 4960 + }, + { + "epoch": 0.01985637091860686, + "grad_norm": 6.377359390258789, + "learning_rate": 1.9996884051272017e-07, + "loss": 0.0523, + "step": 4970 + }, + { + "epoch": 0.01989632337518353, + "grad_norm": 7.963636875152588, + "learning_rate": 1.99968683348476e-07, + "loss": 0.0536, + "step": 4980 + }, + { + "epoch": 0.019936275831760206, + "grad_norm": 6.329069137573242, + "learning_rate": 1.9996852578893274e-07, + "loss": 0.0556, + "step": 4990 + }, + { + "epoch": 0.01997622828833688, + "grad_norm": 6.8979620933532715, + "learning_rate": 1.9996836783409107e-07, + "loss": 0.0506, + "step": 5000 + }, + { + "epoch": 0.020016180744913552, + "grad_norm": 6.618072986602783, + "learning_rate": 1.999682094839515e-07, + "loss": 0.0493, + "step": 5010 + }, + { + "epoch": 0.020056133201490228, + "grad_norm": 7.938747882843018, + "learning_rate": 1.9996805073851472e-07, + "loss": 0.0535, + "step": 5020 + }, + { + "epoch": 0.0200960856580669, + "grad_norm": 7.057828426361084, + "learning_rate": 1.999678915977814e-07, + "loss": 0.0488, + "step": 5030 + }, + { + "epoch": 0.020136038114643574, + "grad_norm": 8.569436073303223, + "learning_rate": 1.9996773206175205e-07, + "loss": 0.0518, + "step": 5040 + }, + { + "epoch": 0.02017599057122025, + "grad_norm": 15.794445991516113, + "learning_rate": 1.9996757213042742e-07, + "loss": 0.0524, + "step": 5050 + }, + { + "epoch": 0.02021594302779692, + "grad_norm": 7.339202404022217, + "learning_rate": 1.9996741180380813e-07, + "loss": 0.0563, + "step": 5060 + }, + { + "epoch": 0.020255895484373596, + "grad_norm": 6.327882766723633, + "learning_rate": 1.999672510818947e-07, + "loss": 0.0532, + "step": 5070 + }, + { + "epoch": 0.020295847940950268, + "grad_norm": 8.023276329040527, + "learning_rate": 1.9996708996468793e-07, + "loss": 0.0497, + "step": 5080 + }, + { + "epoch": 0.020335800397526943, + "grad_norm": 4.3544697761535645, + "learning_rate": 1.999669284521883e-07, + "loss": 0.0486, + "step": 5090 + }, + { + "epoch": 0.020375752854103618, + "grad_norm": 6.104060649871826, + "learning_rate": 1.9996676654439658e-07, + "loss": 0.0509, + "step": 5100 + }, + { + "epoch": 0.02041570531068029, + "grad_norm": 6.5528564453125, + "learning_rate": 1.9996660424131328e-07, + "loss": 0.0474, + "step": 5110 + }, + { + "epoch": 0.020455657767256965, + "grad_norm": 14.309955596923828, + "learning_rate": 1.999664415429392e-07, + "loss": 0.052, + "step": 5120 + }, + { + "epoch": 0.020495610223833637, + "grad_norm": 8.03148078918457, + "learning_rate": 1.9996627844927483e-07, + "loss": 0.0588, + "step": 5130 + }, + { + "epoch": 0.02053556268041031, + "grad_norm": 5.87081241607666, + "learning_rate": 1.999661149603209e-07, + "loss": 0.0453, + "step": 5140 + }, + { + "epoch": 0.020575515136986987, + "grad_norm": 4.932061672210693, + "learning_rate": 1.9996595107607805e-07, + "loss": 0.0541, + "step": 5150 + }, + { + "epoch": 0.02061546759356366, + "grad_norm": 3.4494550228118896, + "learning_rate": 1.999657867965469e-07, + "loss": 0.0476, + "step": 5160 + }, + { + "epoch": 0.020655420050140334, + "grad_norm": 8.88809871673584, + "learning_rate": 1.9996562212172812e-07, + "loss": 0.0498, + "step": 5170 + }, + { + "epoch": 0.020695372506717005, + "grad_norm": 6.585300445556641, + "learning_rate": 1.9996545705162236e-07, + "loss": 0.0486, + "step": 5180 + }, + { + "epoch": 0.02073532496329368, + "grad_norm": 13.496743202209473, + "learning_rate": 1.9996529158623029e-07, + "loss": 0.0514, + "step": 5190 + }, + { + "epoch": 0.020775277419870355, + "grad_norm": 3.518712043762207, + "learning_rate": 1.999651257255525e-07, + "loss": 0.0478, + "step": 5200 + }, + { + "epoch": 0.020815229876447027, + "grad_norm": 4.171905040740967, + "learning_rate": 1.999649594695897e-07, + "loss": 0.0444, + "step": 5210 + }, + { + "epoch": 0.020855182333023702, + "grad_norm": 8.100517272949219, + "learning_rate": 1.9996479281834256e-07, + "loss": 0.0528, + "step": 5220 + }, + { + "epoch": 0.020895134789600374, + "grad_norm": 9.999641418457031, + "learning_rate": 1.9996462577181173e-07, + "loss": 0.0571, + "step": 5230 + }, + { + "epoch": 0.02093508724617705, + "grad_norm": 5.320511817932129, + "learning_rate": 1.999644583299978e-07, + "loss": 0.0483, + "step": 5240 + }, + { + "epoch": 0.020975039702753724, + "grad_norm": 5.94979190826416, + "learning_rate": 1.9996429049290154e-07, + "loss": 0.0472, + "step": 5250 + }, + { + "epoch": 0.021014992159330396, + "grad_norm": 4.8294196128845215, + "learning_rate": 1.9996412226052356e-07, + "loss": 0.0486, + "step": 5260 + }, + { + "epoch": 0.02105494461590707, + "grad_norm": 5.149157524108887, + "learning_rate": 1.9996395363286452e-07, + "loss": 0.0543, + "step": 5270 + }, + { + "epoch": 0.021094897072483746, + "grad_norm": 7.555394172668457, + "learning_rate": 1.9996378460992507e-07, + "loss": 0.0483, + "step": 5280 + }, + { + "epoch": 0.021134849529060418, + "grad_norm": 4.935739517211914, + "learning_rate": 1.999636151917059e-07, + "loss": 0.054, + "step": 5290 + }, + { + "epoch": 0.021174801985637093, + "grad_norm": 4.376059055328369, + "learning_rate": 1.999634453782077e-07, + "loss": 0.0521, + "step": 5300 + }, + { + "epoch": 0.021214754442213764, + "grad_norm": 6.445003509521484, + "learning_rate": 1.9996327516943115e-07, + "loss": 0.0469, + "step": 5310 + }, + { + "epoch": 0.02125470689879044, + "grad_norm": 4.50531005859375, + "learning_rate": 1.9996310456537687e-07, + "loss": 0.0502, + "step": 5320 + }, + { + "epoch": 0.021294659355367115, + "grad_norm": 5.2720046043396, + "learning_rate": 1.9996293356604554e-07, + "loss": 0.0544, + "step": 5330 + }, + { + "epoch": 0.021334611811943786, + "grad_norm": 3.2854228019714355, + "learning_rate": 1.999627621714379e-07, + "loss": 0.0491, + "step": 5340 + }, + { + "epoch": 0.02137456426852046, + "grad_norm": 7.7239885330200195, + "learning_rate": 1.9996259038155456e-07, + "loss": 0.047, + "step": 5350 + }, + { + "epoch": 0.021414516725097133, + "grad_norm": 8.376391410827637, + "learning_rate": 1.9996241819639624e-07, + "loss": 0.0513, + "step": 5360 + }, + { + "epoch": 0.021454469181673808, + "grad_norm": 11.829011917114258, + "learning_rate": 1.999622456159636e-07, + "loss": 0.0501, + "step": 5370 + }, + { + "epoch": 0.021494421638250483, + "grad_norm": 5.673402309417725, + "learning_rate": 1.9996207264025733e-07, + "loss": 0.0566, + "step": 5380 + }, + { + "epoch": 0.021534374094827155, + "grad_norm": 8.002850532531738, + "learning_rate": 1.999618992692781e-07, + "loss": 0.0495, + "step": 5390 + }, + { + "epoch": 0.02157432655140383, + "grad_norm": 3.7710397243499756, + "learning_rate": 1.9996172550302664e-07, + "loss": 0.0529, + "step": 5400 + }, + { + "epoch": 0.021614279007980502, + "grad_norm": 3.5189783573150635, + "learning_rate": 1.9996155134150355e-07, + "loss": 0.0477, + "step": 5410 + }, + { + "epoch": 0.021654231464557177, + "grad_norm": 6.907649993896484, + "learning_rate": 1.999613767847096e-07, + "loss": 0.05, + "step": 5420 + }, + { + "epoch": 0.021694183921133852, + "grad_norm": 4.267192363739014, + "learning_rate": 1.999612018326455e-07, + "loss": 0.049, + "step": 5430 + }, + { + "epoch": 0.021734136377710524, + "grad_norm": 5.154108047485352, + "learning_rate": 1.9996102648531186e-07, + "loss": 0.0489, + "step": 5440 + }, + { + "epoch": 0.0217740888342872, + "grad_norm": 9.884163856506348, + "learning_rate": 1.999608507427094e-07, + "loss": 0.0477, + "step": 5450 + }, + { + "epoch": 0.02181404129086387, + "grad_norm": 5.263055324554443, + "learning_rate": 1.9996067460483884e-07, + "loss": 0.0556, + "step": 5460 + }, + { + "epoch": 0.021853993747440546, + "grad_norm": 5.632888317108154, + "learning_rate": 1.9996049807170081e-07, + "loss": 0.0513, + "step": 5470 + }, + { + "epoch": 0.02189394620401722, + "grad_norm": 4.350039958953857, + "learning_rate": 1.999603211432961e-07, + "loss": 0.0463, + "step": 5480 + }, + { + "epoch": 0.021933898660593892, + "grad_norm": 7.990609645843506, + "learning_rate": 1.9996014381962538e-07, + "loss": 0.0519, + "step": 5490 + }, + { + "epoch": 0.021973851117170567, + "grad_norm": 5.383752346038818, + "learning_rate": 1.9995996610068933e-07, + "loss": 0.0536, + "step": 5500 + }, + { + "epoch": 0.02201380357374724, + "grad_norm": 5.199809551239014, + "learning_rate": 1.9995978798648866e-07, + "loss": 0.0503, + "step": 5510 + }, + { + "epoch": 0.022053756030323914, + "grad_norm": 10.054726600646973, + "learning_rate": 1.9995960947702406e-07, + "loss": 0.053, + "step": 5520 + }, + { + "epoch": 0.02209370848690059, + "grad_norm": 5.247880935668945, + "learning_rate": 1.9995943057229627e-07, + "loss": 0.0428, + "step": 5530 + }, + { + "epoch": 0.02213366094347726, + "grad_norm": 5.113954544067383, + "learning_rate": 1.9995925127230599e-07, + "loss": 0.0433, + "step": 5540 + }, + { + "epoch": 0.022173613400053936, + "grad_norm": 6.664379596710205, + "learning_rate": 1.9995907157705386e-07, + "loss": 0.0521, + "step": 5550 + }, + { + "epoch": 0.02221356585663061, + "grad_norm": 5.542282581329346, + "learning_rate": 1.999588914865407e-07, + "loss": 0.047, + "step": 5560 + }, + { + "epoch": 0.022253518313207283, + "grad_norm": 30.600914001464844, + "learning_rate": 1.9995871100076718e-07, + "loss": 0.051, + "step": 5570 + }, + { + "epoch": 0.022293470769783958, + "grad_norm": 11.56572437286377, + "learning_rate": 1.99958530119734e-07, + "loss": 0.0568, + "step": 5580 + }, + { + "epoch": 0.02233342322636063, + "grad_norm": 7.477743625640869, + "learning_rate": 1.9995834884344186e-07, + "loss": 0.0505, + "step": 5590 + }, + { + "epoch": 0.022373375682937305, + "grad_norm": 6.812231063842773, + "learning_rate": 1.999581671718915e-07, + "loss": 0.0486, + "step": 5600 + }, + { + "epoch": 0.02241332813951398, + "grad_norm": 6.20554256439209, + "learning_rate": 1.9995798510508367e-07, + "loss": 0.0497, + "step": 5610 + }, + { + "epoch": 0.02245328059609065, + "grad_norm": 4.946862697601318, + "learning_rate": 1.99957802643019e-07, + "loss": 0.0524, + "step": 5620 + }, + { + "epoch": 0.022493233052667327, + "grad_norm": 4.0679402351379395, + "learning_rate": 1.999576197856983e-07, + "loss": 0.0476, + "step": 5630 + }, + { + "epoch": 0.022533185509244, + "grad_norm": 8.819137573242188, + "learning_rate": 1.9995743653312226e-07, + "loss": 0.0482, + "step": 5640 + }, + { + "epoch": 0.022573137965820674, + "grad_norm": 6.6104888916015625, + "learning_rate": 1.9995725288529158e-07, + "loss": 0.0472, + "step": 5650 + }, + { + "epoch": 0.02261309042239735, + "grad_norm": 5.4974260330200195, + "learning_rate": 1.9995706884220703e-07, + "loss": 0.0509, + "step": 5660 + }, + { + "epoch": 0.02265304287897402, + "grad_norm": 3.571427345275879, + "learning_rate": 1.9995688440386933e-07, + "loss": 0.0527, + "step": 5670 + }, + { + "epoch": 0.022692995335550695, + "grad_norm": 4.1748576164245605, + "learning_rate": 1.999566995702792e-07, + "loss": 0.0552, + "step": 5680 + }, + { + "epoch": 0.022732947792127367, + "grad_norm": 9.6434965133667, + "learning_rate": 1.9995651434143735e-07, + "loss": 0.0487, + "step": 5690 + }, + { + "epoch": 0.022772900248704042, + "grad_norm": 5.59194803237915, + "learning_rate": 1.9995632871734452e-07, + "loss": 0.0478, + "step": 5700 + }, + { + "epoch": 0.022812852705280717, + "grad_norm": 5.324710845947266, + "learning_rate": 1.9995614269800148e-07, + "loss": 0.0481, + "step": 5710 + }, + { + "epoch": 0.02285280516185739, + "grad_norm": 4.931123733520508, + "learning_rate": 1.999559562834089e-07, + "loss": 0.0458, + "step": 5720 + }, + { + "epoch": 0.022892757618434064, + "grad_norm": 3.9789791107177734, + "learning_rate": 1.9995576947356763e-07, + "loss": 0.0547, + "step": 5730 + }, + { + "epoch": 0.022932710075010736, + "grad_norm": 5.861281394958496, + "learning_rate": 1.9995558226847828e-07, + "loss": 0.048, + "step": 5740 + }, + { + "epoch": 0.02297266253158741, + "grad_norm": 7.420010089874268, + "learning_rate": 1.9995539466814167e-07, + "loss": 0.0498, + "step": 5750 + }, + { + "epoch": 0.023012614988164086, + "grad_norm": 6.531066417694092, + "learning_rate": 1.999552066725585e-07, + "loss": 0.0504, + "step": 5760 + }, + { + "epoch": 0.023052567444740758, + "grad_norm": 5.092825889587402, + "learning_rate": 1.9995501828172954e-07, + "loss": 0.0485, + "step": 5770 + }, + { + "epoch": 0.023092519901317433, + "grad_norm": 10.873908996582031, + "learning_rate": 1.9995482949565555e-07, + "loss": 0.047, + "step": 5780 + }, + { + "epoch": 0.023132472357894104, + "grad_norm": 5.651072025299072, + "learning_rate": 1.999546403143372e-07, + "loss": 0.0457, + "step": 5790 + }, + { + "epoch": 0.02317242481447078, + "grad_norm": 4.596477508544922, + "learning_rate": 1.9995445073777532e-07, + "loss": 0.0466, + "step": 5800 + }, + { + "epoch": 0.023212377271047455, + "grad_norm": 5.580630779266357, + "learning_rate": 1.9995426076597063e-07, + "loss": 0.0447, + "step": 5810 + }, + { + "epoch": 0.023252329727624126, + "grad_norm": 5.789882183074951, + "learning_rate": 1.999540703989239e-07, + "loss": 0.049, + "step": 5820 + }, + { + "epoch": 0.0232922821842008, + "grad_norm": 6.233381748199463, + "learning_rate": 1.9995387963663587e-07, + "loss": 0.0518, + "step": 5830 + }, + { + "epoch": 0.023332234640777477, + "grad_norm": 8.467476844787598, + "learning_rate": 1.9995368847910727e-07, + "loss": 0.0551, + "step": 5840 + }, + { + "epoch": 0.023372187097354148, + "grad_norm": 12.194501876831055, + "learning_rate": 1.9995349692633886e-07, + "loss": 0.0494, + "step": 5850 + }, + { + "epoch": 0.023412139553930823, + "grad_norm": 7.740863800048828, + "learning_rate": 1.9995330497833143e-07, + "loss": 0.0527, + "step": 5860 + }, + { + "epoch": 0.023452092010507495, + "grad_norm": 23.1185359954834, + "learning_rate": 1.9995311263508574e-07, + "loss": 0.0488, + "step": 5870 + }, + { + "epoch": 0.02349204446708417, + "grad_norm": 19.6190128326416, + "learning_rate": 1.999529198966025e-07, + "loss": 0.0467, + "step": 5880 + }, + { + "epoch": 0.023531996923660845, + "grad_norm": 9.024277687072754, + "learning_rate": 1.9995272676288253e-07, + "loss": 0.0513, + "step": 5890 + }, + { + "epoch": 0.023571949380237517, + "grad_norm": 4.810340881347656, + "learning_rate": 1.9995253323392656e-07, + "loss": 0.0492, + "step": 5900 + }, + { + "epoch": 0.023611901836814192, + "grad_norm": 7.728888988494873, + "learning_rate": 1.9995233930973533e-07, + "loss": 0.0462, + "step": 5910 + }, + { + "epoch": 0.023651854293390864, + "grad_norm": 6.717733383178711, + "learning_rate": 1.999521449903097e-07, + "loss": 0.05, + "step": 5920 + }, + { + "epoch": 0.02369180674996754, + "grad_norm": 6.905145168304443, + "learning_rate": 1.9995195027565032e-07, + "loss": 0.048, + "step": 5930 + }, + { + "epoch": 0.023731759206544214, + "grad_norm": 5.004509925842285, + "learning_rate": 1.9995175516575805e-07, + "loss": 0.0438, + "step": 5940 + }, + { + "epoch": 0.023771711663120886, + "grad_norm": 8.355717658996582, + "learning_rate": 1.9995155966063364e-07, + "loss": 0.0477, + "step": 5950 + }, + { + "epoch": 0.02381166411969756, + "grad_norm": 5.910022735595703, + "learning_rate": 1.9995136376027783e-07, + "loss": 0.047, + "step": 5960 + }, + { + "epoch": 0.023851616576274232, + "grad_norm": 4.9337663650512695, + "learning_rate": 1.9995116746469142e-07, + "loss": 0.0497, + "step": 5970 + }, + { + "epoch": 0.023891569032850907, + "grad_norm": 6.242760181427002, + "learning_rate": 1.9995097077387517e-07, + "loss": 0.0484, + "step": 5980 + }, + { + "epoch": 0.023931521489427583, + "grad_norm": 9.45016098022461, + "learning_rate": 1.9995077368782991e-07, + "loss": 0.0505, + "step": 5990 + }, + { + "epoch": 0.023971473946004254, + "grad_norm": 4.398714065551758, + "learning_rate": 1.9995057620655635e-07, + "loss": 0.0534, + "step": 6000 + }, + { + "epoch": 0.02401142640258093, + "grad_norm": 4.338630676269531, + "learning_rate": 1.9995037833005532e-07, + "loss": 0.0463, + "step": 6010 + }, + { + "epoch": 0.0240513788591576, + "grad_norm": 5.78023099899292, + "learning_rate": 1.999501800583276e-07, + "loss": 0.0468, + "step": 6020 + }, + { + "epoch": 0.024091331315734276, + "grad_norm": 8.258600234985352, + "learning_rate": 1.999499813913739e-07, + "loss": 0.0509, + "step": 6030 + }, + { + "epoch": 0.02413128377231095, + "grad_norm": 13.860574722290039, + "learning_rate": 1.9994978232919514e-07, + "loss": 0.0511, + "step": 6040 + }, + { + "epoch": 0.024171236228887623, + "grad_norm": 4.281649112701416, + "learning_rate": 1.9994958287179196e-07, + "loss": 0.0426, + "step": 6050 + }, + { + "epoch": 0.024211188685464298, + "grad_norm": 3.845460891723633, + "learning_rate": 1.9994938301916526e-07, + "loss": 0.0465, + "step": 6060 + }, + { + "epoch": 0.02425114114204097, + "grad_norm": 6.270993232727051, + "learning_rate": 1.9994918277131578e-07, + "loss": 0.0561, + "step": 6070 + }, + { + "epoch": 0.024291093598617645, + "grad_norm": 3.442826747894287, + "learning_rate": 1.9994898212824434e-07, + "loss": 0.0439, + "step": 6080 + }, + { + "epoch": 0.02433104605519432, + "grad_norm": 3.9892542362213135, + "learning_rate": 1.999487810899517e-07, + "loss": 0.0511, + "step": 6090 + }, + { + "epoch": 0.02437099851177099, + "grad_norm": 5.239718914031982, + "learning_rate": 1.9994857965643864e-07, + "loss": 0.0494, + "step": 6100 + }, + { + "epoch": 0.024410950968347667, + "grad_norm": 5.721076965332031, + "learning_rate": 1.9994837782770601e-07, + "loss": 0.0484, + "step": 6110 + }, + { + "epoch": 0.024450903424924342, + "grad_norm": 3.5553338527679443, + "learning_rate": 1.9994817560375461e-07, + "loss": 0.0444, + "step": 6120 + }, + { + "epoch": 0.024490855881501013, + "grad_norm": 4.850107192993164, + "learning_rate": 1.9994797298458518e-07, + "loss": 0.0509, + "step": 6130 + }, + { + "epoch": 0.02453080833807769, + "grad_norm": 5.250258445739746, + "learning_rate": 1.9994776997019857e-07, + "loss": 0.0493, + "step": 6140 + }, + { + "epoch": 0.02457076079465436, + "grad_norm": 5.880673408508301, + "learning_rate": 1.999475665605956e-07, + "loss": 0.0472, + "step": 6150 + }, + { + "epoch": 0.024610713251231035, + "grad_norm": 6.41334867477417, + "learning_rate": 1.9994736275577698e-07, + "loss": 0.0478, + "step": 6160 + }, + { + "epoch": 0.02465066570780771, + "grad_norm": 5.663977146148682, + "learning_rate": 1.999471585557436e-07, + "loss": 0.0471, + "step": 6170 + }, + { + "epoch": 0.024690618164384382, + "grad_norm": 8.224438667297363, + "learning_rate": 1.999469539604963e-07, + "loss": 0.0459, + "step": 6180 + }, + { + "epoch": 0.024730570620961057, + "grad_norm": 7.266992092132568, + "learning_rate": 1.9994674897003576e-07, + "loss": 0.0474, + "step": 6190 + }, + { + "epoch": 0.02477052307753773, + "grad_norm": 13.62252426147461, + "learning_rate": 1.999465435843629e-07, + "loss": 0.0539, + "step": 6200 + }, + { + "epoch": 0.024810475534114404, + "grad_norm": 6.109665393829346, + "learning_rate": 1.999463378034785e-07, + "loss": 0.0503, + "step": 6210 + }, + { + "epoch": 0.02485042799069108, + "grad_norm": 5.504624366760254, + "learning_rate": 1.9994613162738334e-07, + "loss": 0.0506, + "step": 6220 + }, + { + "epoch": 0.02489038044726775, + "grad_norm": 3.7311296463012695, + "learning_rate": 1.999459250560783e-07, + "loss": 0.0468, + "step": 6230 + }, + { + "epoch": 0.024930332903844426, + "grad_norm": 7.991785049438477, + "learning_rate": 1.9994571808956414e-07, + "loss": 0.0449, + "step": 6240 + }, + { + "epoch": 0.024970285360421098, + "grad_norm": 3.7617454528808594, + "learning_rate": 1.9994551072784173e-07, + "loss": 0.052, + "step": 6250 + }, + { + "epoch": 0.025010237816997773, + "grad_norm": 4.627028942108154, + "learning_rate": 1.9994530297091185e-07, + "loss": 0.0466, + "step": 6260 + }, + { + "epoch": 0.025050190273574448, + "grad_norm": 3.9418978691101074, + "learning_rate": 1.9994509481877535e-07, + "loss": 0.0488, + "step": 6270 + }, + { + "epoch": 0.02509014273015112, + "grad_norm": 4.304305076599121, + "learning_rate": 1.99944886271433e-07, + "loss": 0.0487, + "step": 6280 + }, + { + "epoch": 0.025130095186727795, + "grad_norm": 5.13655424118042, + "learning_rate": 1.9994467732888566e-07, + "loss": 0.0483, + "step": 6290 + }, + { + "epoch": 0.025170047643304466, + "grad_norm": 6.739612102508545, + "learning_rate": 1.999444679911342e-07, + "loss": 0.0463, + "step": 6300 + }, + { + "epoch": 0.02521000009988114, + "grad_norm": 5.165511608123779, + "learning_rate": 1.999442582581794e-07, + "loss": 0.0513, + "step": 6310 + }, + { + "epoch": 0.025249952556457816, + "grad_norm": 3.534308910369873, + "learning_rate": 1.9994404813002207e-07, + "loss": 0.0493, + "step": 6320 + }, + { + "epoch": 0.025289905013034488, + "grad_norm": 5.314731121063232, + "learning_rate": 1.9994383760666307e-07, + "loss": 0.0483, + "step": 6330 + }, + { + "epoch": 0.025329857469611163, + "grad_norm": 530.1204833984375, + "learning_rate": 1.999436266881032e-07, + "loss": 0.0447, + "step": 6340 + }, + { + "epoch": 0.025369809926187835, + "grad_norm": 11.158661842346191, + "learning_rate": 1.9994341537434338e-07, + "loss": 0.0489, + "step": 6350 + }, + { + "epoch": 0.02540976238276451, + "grad_norm": 3.1001086235046387, + "learning_rate": 1.9994320366538435e-07, + "loss": 0.05, + "step": 6360 + }, + { + "epoch": 0.025449714839341185, + "grad_norm": 4.639468669891357, + "learning_rate": 1.99942991561227e-07, + "loss": 0.0464, + "step": 6370 + }, + { + "epoch": 0.025489667295917857, + "grad_norm": 8.713321685791016, + "learning_rate": 1.9994277906187213e-07, + "loss": 0.0422, + "step": 6380 + }, + { + "epoch": 0.025529619752494532, + "grad_norm": 4.05655574798584, + "learning_rate": 1.9994256616732063e-07, + "loss": 0.0525, + "step": 6390 + }, + { + "epoch": 0.025569572209071204, + "grad_norm": 6.559214115142822, + "learning_rate": 1.9994235287757331e-07, + "loss": 0.0489, + "step": 6400 + }, + { + "epoch": 0.02560952466564788, + "grad_norm": 4.222221374511719, + "learning_rate": 1.9994213919263102e-07, + "loss": 0.0491, + "step": 6410 + }, + { + "epoch": 0.025649477122224554, + "grad_norm": 5.469334125518799, + "learning_rate": 1.999419251124946e-07, + "loss": 0.0453, + "step": 6420 + }, + { + "epoch": 0.025689429578801225, + "grad_norm": 5.401805877685547, + "learning_rate": 1.999417106371649e-07, + "loss": 0.0488, + "step": 6430 + }, + { + "epoch": 0.0257293820353779, + "grad_norm": 5.982754230499268, + "learning_rate": 1.9994149576664277e-07, + "loss": 0.044, + "step": 6440 + }, + { + "epoch": 0.025769334491954576, + "grad_norm": 6.22883415222168, + "learning_rate": 1.9994128050092902e-07, + "loss": 0.0493, + "step": 6450 + }, + { + "epoch": 0.025809286948531247, + "grad_norm": 4.266751289367676, + "learning_rate": 1.999410648400246e-07, + "loss": 0.0456, + "step": 6460 + }, + { + "epoch": 0.025849239405107922, + "grad_norm": 6.855615615844727, + "learning_rate": 1.9994084878393024e-07, + "loss": 0.0507, + "step": 6470 + }, + { + "epoch": 0.025889191861684594, + "grad_norm": 3.2928202152252197, + "learning_rate": 1.999406323326469e-07, + "loss": 0.0511, + "step": 6480 + }, + { + "epoch": 0.02592914431826127, + "grad_norm": 3.6329526901245117, + "learning_rate": 1.999404154861754e-07, + "loss": 0.0486, + "step": 6490 + }, + { + "epoch": 0.025969096774837944, + "grad_norm": 3.7720136642456055, + "learning_rate": 1.9994019824451658e-07, + "loss": 0.0491, + "step": 6500 + }, + { + "epoch": 0.026009049231414616, + "grad_norm": 3.690717935562134, + "learning_rate": 1.9993998060767127e-07, + "loss": 0.048, + "step": 6510 + }, + { + "epoch": 0.02604900168799129, + "grad_norm": 8.989951133728027, + "learning_rate": 1.9993976257564043e-07, + "loss": 0.0497, + "step": 6520 + }, + { + "epoch": 0.026088954144567963, + "grad_norm": 2.9775304794311523, + "learning_rate": 1.9993954414842483e-07, + "loss": 0.0436, + "step": 6530 + }, + { + "epoch": 0.026128906601144638, + "grad_norm": 8.083571434020996, + "learning_rate": 1.9993932532602535e-07, + "loss": 0.0464, + "step": 6540 + }, + { + "epoch": 0.026168859057721313, + "grad_norm": 4.857685565948486, + "learning_rate": 1.9993910610844292e-07, + "loss": 0.0451, + "step": 6550 + }, + { + "epoch": 0.026208811514297985, + "grad_norm": 4.80525016784668, + "learning_rate": 1.9993888649567832e-07, + "loss": 0.0459, + "step": 6560 + }, + { + "epoch": 0.02624876397087466, + "grad_norm": 5.788197040557861, + "learning_rate": 1.9993866648773246e-07, + "loss": 0.0487, + "step": 6570 + }, + { + "epoch": 0.02628871642745133, + "grad_norm": 4.4205002784729, + "learning_rate": 1.999384460846062e-07, + "loss": 0.0494, + "step": 6580 + }, + { + "epoch": 0.026328668884028007, + "grad_norm": 6.1486005783081055, + "learning_rate": 1.9993822528630041e-07, + "loss": 0.0424, + "step": 6590 + }, + { + "epoch": 0.02636862134060468, + "grad_norm": 6.578733444213867, + "learning_rate": 1.9993800409281598e-07, + "loss": 0.0459, + "step": 6600 + }, + { + "epoch": 0.026408573797181353, + "grad_norm": 6.339469909667969, + "learning_rate": 1.999377825041538e-07, + "loss": 0.0462, + "step": 6610 + }, + { + "epoch": 0.02644852625375803, + "grad_norm": 5.239964008331299, + "learning_rate": 1.9993756052031468e-07, + "loss": 0.0461, + "step": 6620 + }, + { + "epoch": 0.0264884787103347, + "grad_norm": 4.343082904815674, + "learning_rate": 1.9993733814129956e-07, + "loss": 0.0444, + "step": 6630 + }, + { + "epoch": 0.026528431166911375, + "grad_norm": 13.146953582763672, + "learning_rate": 1.999371153671093e-07, + "loss": 0.0453, + "step": 6640 + }, + { + "epoch": 0.02656838362348805, + "grad_norm": 6.201329231262207, + "learning_rate": 1.9993689219774477e-07, + "loss": 0.047, + "step": 6650 + }, + { + "epoch": 0.026608336080064722, + "grad_norm": 4.421543121337891, + "learning_rate": 1.9993666863320685e-07, + "loss": 0.0444, + "step": 6660 + }, + { + "epoch": 0.026648288536641397, + "grad_norm": 4.977525234222412, + "learning_rate": 1.9993644467349648e-07, + "loss": 0.0512, + "step": 6670 + }, + { + "epoch": 0.02668824099321807, + "grad_norm": 7.353769779205322, + "learning_rate": 1.999362203186145e-07, + "loss": 0.0491, + "step": 6680 + }, + { + "epoch": 0.026728193449794744, + "grad_norm": 16.738561630249023, + "learning_rate": 1.9993599556856174e-07, + "loss": 0.0469, + "step": 6690 + }, + { + "epoch": 0.02676814590637142, + "grad_norm": 5.454195499420166, + "learning_rate": 1.999357704233392e-07, + "loss": 0.0437, + "step": 6700 + }, + { + "epoch": 0.02680809836294809, + "grad_norm": 4.483206272125244, + "learning_rate": 1.9993554488294768e-07, + "loss": 0.0448, + "step": 6710 + }, + { + "epoch": 0.026848050819524766, + "grad_norm": 7.553791046142578, + "learning_rate": 1.9993531894738813e-07, + "loss": 0.0497, + "step": 6720 + }, + { + "epoch": 0.02688800327610144, + "grad_norm": 5.343426704406738, + "learning_rate": 1.999350926166614e-07, + "loss": 0.0413, + "step": 6730 + }, + { + "epoch": 0.026927955732678113, + "grad_norm": 3.965423345565796, + "learning_rate": 1.9993486589076844e-07, + "loss": 0.0416, + "step": 6740 + }, + { + "epoch": 0.026967908189254788, + "grad_norm": 5.249233245849609, + "learning_rate": 1.999346387697101e-07, + "loss": 0.0468, + "step": 6750 + }, + { + "epoch": 0.02700786064583146, + "grad_norm": 3.7344627380371094, + "learning_rate": 1.999344112534873e-07, + "loss": 0.0426, + "step": 6760 + }, + { + "epoch": 0.027047813102408134, + "grad_norm": 24.810274124145508, + "learning_rate": 1.9993418334210093e-07, + "loss": 0.0504, + "step": 6770 + }, + { + "epoch": 0.02708776555898481, + "grad_norm": 13.401935577392578, + "learning_rate": 1.9993395503555187e-07, + "loss": 0.0473, + "step": 6780 + }, + { + "epoch": 0.02712771801556148, + "grad_norm": 3.010798215866089, + "learning_rate": 1.9993372633384108e-07, + "loss": 0.0456, + "step": 6790 + }, + { + "epoch": 0.027167670472138156, + "grad_norm": 4.275506973266602, + "learning_rate": 1.999334972369694e-07, + "loss": 0.0497, + "step": 6800 + }, + { + "epoch": 0.027207622928714828, + "grad_norm": 65.73387908935547, + "learning_rate": 1.9993326774493777e-07, + "loss": 0.0471, + "step": 6810 + }, + { + "epoch": 0.027247575385291503, + "grad_norm": 5.430583953857422, + "learning_rate": 1.9993303785774712e-07, + "loss": 0.0431, + "step": 6820 + }, + { + "epoch": 0.027287527841868178, + "grad_norm": 5.368495464324951, + "learning_rate": 1.999328075753983e-07, + "loss": 0.045, + "step": 6830 + }, + { + "epoch": 0.02732748029844485, + "grad_norm": 10.347506523132324, + "learning_rate": 1.9993257689789226e-07, + "loss": 0.0434, + "step": 6840 + }, + { + "epoch": 0.027367432755021525, + "grad_norm": 6.616314888000488, + "learning_rate": 1.999323458252299e-07, + "loss": 0.0493, + "step": 6850 + }, + { + "epoch": 0.027407385211598197, + "grad_norm": 3.66725492477417, + "learning_rate": 1.9993211435741218e-07, + "loss": 0.047, + "step": 6860 + }, + { + "epoch": 0.027447337668174872, + "grad_norm": 7.846648693084717, + "learning_rate": 1.9993188249443993e-07, + "loss": 0.0403, + "step": 6870 + }, + { + "epoch": 0.027487290124751547, + "grad_norm": 9.427396774291992, + "learning_rate": 1.9993165023631414e-07, + "loss": 0.0439, + "step": 6880 + }, + { + "epoch": 0.02752724258132822, + "grad_norm": 6.936969757080078, + "learning_rate": 1.999314175830357e-07, + "loss": 0.0468, + "step": 6890 + }, + { + "epoch": 0.027567195037904894, + "grad_norm": 6.241415500640869, + "learning_rate": 1.9993118453460548e-07, + "loss": 0.0476, + "step": 6900 + }, + { + "epoch": 0.027607147494481565, + "grad_norm": 5.624272346496582, + "learning_rate": 1.999309510910245e-07, + "loss": 0.0464, + "step": 6910 + }, + { + "epoch": 0.02764709995105824, + "grad_norm": 5.167245864868164, + "learning_rate": 1.9993071725229363e-07, + "loss": 0.0484, + "step": 6920 + }, + { + "epoch": 0.027687052407634916, + "grad_norm": 5.06905460357666, + "learning_rate": 1.9993048301841378e-07, + "loss": 0.0446, + "step": 6930 + }, + { + "epoch": 0.027727004864211587, + "grad_norm": 9.074588775634766, + "learning_rate": 1.999302483893859e-07, + "loss": 0.0466, + "step": 6940 + }, + { + "epoch": 0.027766957320788262, + "grad_norm": 5.1984052658081055, + "learning_rate": 1.9993001336521092e-07, + "loss": 0.0432, + "step": 6950 + }, + { + "epoch": 0.027806909777364934, + "grad_norm": 3.495778799057007, + "learning_rate": 1.9992977794588977e-07, + "loss": 0.0459, + "step": 6960 + }, + { + "epoch": 0.02784686223394161, + "grad_norm": 7.78740119934082, + "learning_rate": 1.9992954213142335e-07, + "loss": 0.0485, + "step": 6970 + }, + { + "epoch": 0.027886814690518284, + "grad_norm": 4.514039516448975, + "learning_rate": 1.999293059218126e-07, + "loss": 0.0457, + "step": 6980 + }, + { + "epoch": 0.027926767147094956, + "grad_norm": 5.419587135314941, + "learning_rate": 1.9992906931705852e-07, + "loss": 0.0453, + "step": 6990 + }, + { + "epoch": 0.02796671960367163, + "grad_norm": 4.54557991027832, + "learning_rate": 1.9992883231716195e-07, + "loss": 0.0457, + "step": 7000 + }, + { + "epoch": 0.028006672060248306, + "grad_norm": 4.78110408782959, + "learning_rate": 1.9992859492212393e-07, + "loss": 0.0472, + "step": 7010 + }, + { + "epoch": 0.028046624516824978, + "grad_norm": 8.69890022277832, + "learning_rate": 1.999283571319453e-07, + "loss": 0.0497, + "step": 7020 + }, + { + "epoch": 0.028086576973401653, + "grad_norm": 4.0936598777771, + "learning_rate": 1.9992811894662705e-07, + "loss": 0.05, + "step": 7030 + }, + { + "epoch": 0.028126529429978325, + "grad_norm": 8.446420669555664, + "learning_rate": 1.999278803661701e-07, + "loss": 0.0448, + "step": 7040 + }, + { + "epoch": 0.028166481886555, + "grad_norm": 4.45729923248291, + "learning_rate": 1.999276413905754e-07, + "loss": 0.0459, + "step": 7050 + }, + { + "epoch": 0.028206434343131675, + "grad_norm": 5.573879241943359, + "learning_rate": 1.999274020198439e-07, + "loss": 0.0446, + "step": 7060 + }, + { + "epoch": 0.028246386799708346, + "grad_norm": 5.744200706481934, + "learning_rate": 1.9992716225397658e-07, + "loss": 0.0519, + "step": 7070 + }, + { + "epoch": 0.02828633925628502, + "grad_norm": 5.143575191497803, + "learning_rate": 1.9992692209297433e-07, + "loss": 0.046, + "step": 7080 + }, + { + "epoch": 0.028326291712861693, + "grad_norm": 4.442143440246582, + "learning_rate": 1.9992668153683815e-07, + "loss": 0.0501, + "step": 7090 + }, + { + "epoch": 0.02836624416943837, + "grad_norm": 6.805935859680176, + "learning_rate": 1.9992644058556896e-07, + "loss": 0.0463, + "step": 7100 + }, + { + "epoch": 0.028406196626015043, + "grad_norm": 4.256086349487305, + "learning_rate": 1.999261992391677e-07, + "loss": 0.0434, + "step": 7110 + }, + { + "epoch": 0.028446149082591715, + "grad_norm": 7.170339107513428, + "learning_rate": 1.9992595749763534e-07, + "loss": 0.0455, + "step": 7120 + }, + { + "epoch": 0.02848610153916839, + "grad_norm": 5.770754814147949, + "learning_rate": 1.9992571536097287e-07, + "loss": 0.0478, + "step": 7130 + }, + { + "epoch": 0.028526053995745062, + "grad_norm": 3.2282845973968506, + "learning_rate": 1.9992547282918123e-07, + "loss": 0.0497, + "step": 7140 + }, + { + "epoch": 0.028566006452321737, + "grad_norm": 7.425354480743408, + "learning_rate": 1.9992522990226132e-07, + "loss": 0.0428, + "step": 7150 + }, + { + "epoch": 0.028605958908898412, + "grad_norm": 3.925921678543091, + "learning_rate": 1.9992498658021417e-07, + "loss": 0.0461, + "step": 7160 + }, + { + "epoch": 0.028645911365475084, + "grad_norm": 4.809581756591797, + "learning_rate": 1.999247428630407e-07, + "loss": 0.0447, + "step": 7170 + }, + { + "epoch": 0.02868586382205176, + "grad_norm": 2.94031023979187, + "learning_rate": 1.9992449875074191e-07, + "loss": 0.0464, + "step": 7180 + }, + { + "epoch": 0.02872581627862843, + "grad_norm": 7.016168117523193, + "learning_rate": 1.9992425424331874e-07, + "loss": 0.0443, + "step": 7190 + }, + { + "epoch": 0.028765768735205106, + "grad_norm": 9.977611541748047, + "learning_rate": 1.9992400934077217e-07, + "loss": 0.0364, + "step": 7200 + }, + { + "epoch": 0.02880572119178178, + "grad_norm": 4.1296892166137695, + "learning_rate": 1.9992376404310315e-07, + "loss": 0.0441, + "step": 7210 + }, + { + "epoch": 0.028845673648358452, + "grad_norm": 7.136328220367432, + "learning_rate": 1.999235183503127e-07, + "loss": 0.0434, + "step": 7220 + }, + { + "epoch": 0.028885626104935128, + "grad_norm": 6.309767246246338, + "learning_rate": 1.9992327226240174e-07, + "loss": 0.049, + "step": 7230 + }, + { + "epoch": 0.0289255785615118, + "grad_norm": 2.7057790756225586, + "learning_rate": 1.9992302577937124e-07, + "loss": 0.0476, + "step": 7240 + }, + { + "epoch": 0.028965531018088474, + "grad_norm": 10.383499145507812, + "learning_rate": 1.999227789012222e-07, + "loss": 0.0466, + "step": 7250 + }, + { + "epoch": 0.02900548347466515, + "grad_norm": 4.96394157409668, + "learning_rate": 1.9992253162795558e-07, + "loss": 0.0458, + "step": 7260 + }, + { + "epoch": 0.02904543593124182, + "grad_norm": 3.893139362335205, + "learning_rate": 1.999222839595724e-07, + "loss": 0.0421, + "step": 7270 + }, + { + "epoch": 0.029085388387818496, + "grad_norm": 6.8509955406188965, + "learning_rate": 1.9992203589607356e-07, + "loss": 0.0405, + "step": 7280 + }, + { + "epoch": 0.02912534084439517, + "grad_norm": 9.527997016906738, + "learning_rate": 1.9992178743746012e-07, + "loss": 0.0491, + "step": 7290 + }, + { + "epoch": 0.029165293300971843, + "grad_norm": 4.369818687438965, + "learning_rate": 1.9992153858373302e-07, + "loss": 0.0494, + "step": 7300 + }, + { + "epoch": 0.029205245757548518, + "grad_norm": 23.57316780090332, + "learning_rate": 1.9992128933489324e-07, + "loss": 0.0492, + "step": 7310 + }, + { + "epoch": 0.02924519821412519, + "grad_norm": 3.7531349658966064, + "learning_rate": 1.999210396909418e-07, + "loss": 0.0386, + "step": 7320 + }, + { + "epoch": 0.029285150670701865, + "grad_norm": 3.9437386989593506, + "learning_rate": 1.9992078965187966e-07, + "loss": 0.044, + "step": 7330 + }, + { + "epoch": 0.02932510312727854, + "grad_norm": 5.5322265625, + "learning_rate": 1.9992053921770783e-07, + "loss": 0.0459, + "step": 7340 + }, + { + "epoch": 0.02936505558385521, + "grad_norm": 3.679600715637207, + "learning_rate": 1.9992028838842725e-07, + "loss": 0.0419, + "step": 7350 + }, + { + "epoch": 0.029405008040431887, + "grad_norm": 5.609346866607666, + "learning_rate": 1.9992003716403895e-07, + "loss": 0.049, + "step": 7360 + }, + { + "epoch": 0.02944496049700856, + "grad_norm": 7.371177673339844, + "learning_rate": 1.9991978554454395e-07, + "loss": 0.0483, + "step": 7370 + }, + { + "epoch": 0.029484912953585234, + "grad_norm": 4.4073567390441895, + "learning_rate": 1.999195335299432e-07, + "loss": 0.0438, + "step": 7380 + }, + { + "epoch": 0.02952486541016191, + "grad_norm": 6.6758623123168945, + "learning_rate": 1.999192811202377e-07, + "loss": 0.0454, + "step": 7390 + }, + { + "epoch": 0.02956481786673858, + "grad_norm": 8.937129020690918, + "learning_rate": 1.9991902831542848e-07, + "loss": 0.0473, + "step": 7400 + }, + { + "epoch": 0.029604770323315256, + "grad_norm": 5.5586090087890625, + "learning_rate": 1.999187751155165e-07, + "loss": 0.0449, + "step": 7410 + }, + { + "epoch": 0.029644722779891927, + "grad_norm": 3.13584303855896, + "learning_rate": 1.9991852152050282e-07, + "loss": 0.0453, + "step": 7420 + }, + { + "epoch": 0.029684675236468602, + "grad_norm": 6.769225120544434, + "learning_rate": 1.9991826753038836e-07, + "loss": 0.053, + "step": 7430 + }, + { + "epoch": 0.029724627693045277, + "grad_norm": 4.008066177368164, + "learning_rate": 1.999180131451742e-07, + "loss": 0.046, + "step": 7440 + }, + { + "epoch": 0.02976458014962195, + "grad_norm": 15.3802490234375, + "learning_rate": 1.9991775836486127e-07, + "loss": 0.0454, + "step": 7450 + }, + { + "epoch": 0.029804532606198624, + "grad_norm": 6.527248859405518, + "learning_rate": 1.9991750318945065e-07, + "loss": 0.0436, + "step": 7460 + }, + { + "epoch": 0.029844485062775296, + "grad_norm": 8.488015174865723, + "learning_rate": 1.9991724761894333e-07, + "loss": 0.045, + "step": 7470 + }, + { + "epoch": 0.02988443751935197, + "grad_norm": 5.326269626617432, + "learning_rate": 1.9991699165334028e-07, + "loss": 0.0462, + "step": 7480 + }, + { + "epoch": 0.029924389975928646, + "grad_norm": 4.114626884460449, + "learning_rate": 1.9991673529264257e-07, + "loss": 0.044, + "step": 7490 + }, + { + "epoch": 0.029964342432505318, + "grad_norm": 7.968732833862305, + "learning_rate": 1.9991647853685116e-07, + "loss": 0.0466, + "step": 7500 + }, + { + "epoch": 0.030004294889081993, + "grad_norm": 7.475108623504639, + "learning_rate": 1.9991622138596712e-07, + "loss": 0.047, + "step": 7510 + }, + { + "epoch": 0.030044247345658665, + "grad_norm": 4.319616794586182, + "learning_rate": 1.9991596383999143e-07, + "loss": 0.0445, + "step": 7520 + }, + { + "epoch": 0.03008419980223534, + "grad_norm": 4.2889227867126465, + "learning_rate": 1.9991570589892511e-07, + "loss": 0.0454, + "step": 7530 + }, + { + "epoch": 0.030124152258812015, + "grad_norm": 6.934574127197266, + "learning_rate": 1.999154475627692e-07, + "loss": 0.0465, + "step": 7540 + }, + { + "epoch": 0.030164104715388686, + "grad_norm": 4.612616062164307, + "learning_rate": 1.999151888315247e-07, + "loss": 0.0485, + "step": 7550 + }, + { + "epoch": 0.03020405717196536, + "grad_norm": 4.821898937225342, + "learning_rate": 1.999149297051926e-07, + "loss": 0.0469, + "step": 7560 + }, + { + "epoch": 0.030244009628542037, + "grad_norm": 3.471865177154541, + "learning_rate": 1.9991467018377404e-07, + "loss": 0.0414, + "step": 7570 + }, + { + "epoch": 0.03028396208511871, + "grad_norm": 7.560727596282959, + "learning_rate": 1.9991441026726995e-07, + "loss": 0.0417, + "step": 7580 + }, + { + "epoch": 0.030323914541695383, + "grad_norm": 5.594384670257568, + "learning_rate": 1.9991414995568134e-07, + "loss": 0.0464, + "step": 7590 + }, + { + "epoch": 0.030363866998272055, + "grad_norm": 5.286545276641846, + "learning_rate": 1.999138892490093e-07, + "loss": 0.0442, + "step": 7600 + }, + { + "epoch": 0.03040381945484873, + "grad_norm": 11.793651580810547, + "learning_rate": 1.9991362814725488e-07, + "loss": 0.0444, + "step": 7610 + }, + { + "epoch": 0.030443771911425405, + "grad_norm": 4.939403533935547, + "learning_rate": 1.9991336665041905e-07, + "loss": 0.0456, + "step": 7620 + }, + { + "epoch": 0.030483724368002077, + "grad_norm": 10.83222770690918, + "learning_rate": 1.9991310475850284e-07, + "loss": 0.0421, + "step": 7630 + }, + { + "epoch": 0.030523676824578752, + "grad_norm": 8.376405715942383, + "learning_rate": 1.9991284247150736e-07, + "loss": 0.0467, + "step": 7640 + }, + { + "epoch": 0.030563629281155424, + "grad_norm": 9.207889556884766, + "learning_rate": 1.9991257978943357e-07, + "loss": 0.043, + "step": 7650 + }, + { + "epoch": 0.0306035817377321, + "grad_norm": 4.248324871063232, + "learning_rate": 1.9991231671228253e-07, + "loss": 0.0391, + "step": 7660 + }, + { + "epoch": 0.030643534194308774, + "grad_norm": 4.642838001251221, + "learning_rate": 1.9991205324005528e-07, + "loss": 0.0476, + "step": 7670 + }, + { + "epoch": 0.030683486650885446, + "grad_norm": 4.110138893127441, + "learning_rate": 1.9991178937275291e-07, + "loss": 0.0424, + "step": 7680 + }, + { + "epoch": 0.03072343910746212, + "grad_norm": 6.8642401695251465, + "learning_rate": 1.9991152511037638e-07, + "loss": 0.0417, + "step": 7690 + }, + { + "epoch": 0.030763391564038792, + "grad_norm": 4.3245978355407715, + "learning_rate": 1.9991126045292682e-07, + "loss": 0.0453, + "step": 7700 + }, + { + "epoch": 0.030803344020615468, + "grad_norm": 5.051377773284912, + "learning_rate": 1.999109954004052e-07, + "loss": 0.0453, + "step": 7710 + }, + { + "epoch": 0.030843296477192143, + "grad_norm": 9.759674072265625, + "learning_rate": 1.999107299528126e-07, + "loss": 0.0453, + "step": 7720 + }, + { + "epoch": 0.030883248933768814, + "grad_norm": 3.724529266357422, + "learning_rate": 1.999104641101501e-07, + "loss": 0.047, + "step": 7730 + }, + { + "epoch": 0.03092320139034549, + "grad_norm": 4.939204216003418, + "learning_rate": 1.9991019787241873e-07, + "loss": 0.0453, + "step": 7740 + }, + { + "epoch": 0.03096315384692216, + "grad_norm": 4.330246925354004, + "learning_rate": 1.9990993123961952e-07, + "loss": 0.0479, + "step": 7750 + }, + { + "epoch": 0.031003106303498836, + "grad_norm": 5.311270236968994, + "learning_rate": 1.9990966421175357e-07, + "loss": 0.0439, + "step": 7760 + }, + { + "epoch": 0.03104305876007551, + "grad_norm": 5.402991771697998, + "learning_rate": 1.9990939678882188e-07, + "loss": 0.0438, + "step": 7770 + }, + { + "epoch": 0.031083011216652183, + "grad_norm": 12.780564308166504, + "learning_rate": 1.9990912897082555e-07, + "loss": 0.048, + "step": 7780 + }, + { + "epoch": 0.031122963673228858, + "grad_norm": 7.807069301605225, + "learning_rate": 1.9990886075776562e-07, + "loss": 0.0465, + "step": 7790 + }, + { + "epoch": 0.03116291612980553, + "grad_norm": 6.138376235961914, + "learning_rate": 1.9990859214964314e-07, + "loss": 0.05, + "step": 7800 + }, + { + "epoch": 0.031202868586382205, + "grad_norm": 5.1743011474609375, + "learning_rate": 1.9990832314645917e-07, + "loss": 0.0424, + "step": 7810 + }, + { + "epoch": 0.03124282104295888, + "grad_norm": 3.5580432415008545, + "learning_rate": 1.9990805374821483e-07, + "loss": 0.0458, + "step": 7820 + }, + { + "epoch": 0.03128277349953555, + "grad_norm": 5.461874008178711, + "learning_rate": 1.9990778395491112e-07, + "loss": 0.0426, + "step": 7830 + }, + { + "epoch": 0.03132272595611223, + "grad_norm": 3.1932787895202637, + "learning_rate": 1.9990751376654912e-07, + "loss": 0.0475, + "step": 7840 + }, + { + "epoch": 0.0313626784126889, + "grad_norm": 4.286773681640625, + "learning_rate": 1.9990724318312995e-07, + "loss": 0.0495, + "step": 7850 + }, + { + "epoch": 0.03140263086926558, + "grad_norm": 5.41080379486084, + "learning_rate": 1.9990697220465462e-07, + "loss": 0.0417, + "step": 7860 + }, + { + "epoch": 0.031442583325842245, + "grad_norm": 6.56943941116333, + "learning_rate": 1.9990670083112423e-07, + "loss": 0.0481, + "step": 7870 + }, + { + "epoch": 0.03148253578241892, + "grad_norm": 3.2740559577941895, + "learning_rate": 1.9990642906253982e-07, + "loss": 0.0406, + "step": 7880 + }, + { + "epoch": 0.031522488238995595, + "grad_norm": 6.59450101852417, + "learning_rate": 1.9990615689890255e-07, + "loss": 0.0446, + "step": 7890 + }, + { + "epoch": 0.03156244069557227, + "grad_norm": 5.957405090332031, + "learning_rate": 1.9990588434021337e-07, + "loss": 0.0464, + "step": 7900 + }, + { + "epoch": 0.031602393152148946, + "grad_norm": 8.191832542419434, + "learning_rate": 1.9990561138647343e-07, + "loss": 0.038, + "step": 7910 + }, + { + "epoch": 0.031642345608725614, + "grad_norm": 4.557767391204834, + "learning_rate": 1.9990533803768383e-07, + "loss": 0.0457, + "step": 7920 + }, + { + "epoch": 0.03168229806530229, + "grad_norm": 5.36492395401001, + "learning_rate": 1.999050642938456e-07, + "loss": 0.0427, + "step": 7930 + }, + { + "epoch": 0.031722250521878964, + "grad_norm": 5.60749626159668, + "learning_rate": 1.9990479015495988e-07, + "loss": 0.0398, + "step": 7940 + }, + { + "epoch": 0.03176220297845564, + "grad_norm": 6.976503849029541, + "learning_rate": 1.999045156210277e-07, + "loss": 0.0436, + "step": 7950 + }, + { + "epoch": 0.031802155435032314, + "grad_norm": 5.276156902313232, + "learning_rate": 1.9990424069205018e-07, + "loss": 0.0455, + "step": 7960 + }, + { + "epoch": 0.03184210789160898, + "grad_norm": 3.9917948246002197, + "learning_rate": 1.9990396536802837e-07, + "loss": 0.0468, + "step": 7970 + }, + { + "epoch": 0.03188206034818566, + "grad_norm": 6.211780548095703, + "learning_rate": 1.9990368964896337e-07, + "loss": 0.0474, + "step": 7980 + }, + { + "epoch": 0.03192201280476233, + "grad_norm": 3.481832981109619, + "learning_rate": 1.9990341353485634e-07, + "loss": 0.0372, + "step": 7990 + }, + { + "epoch": 0.03196196526133901, + "grad_norm": 4.580691814422607, + "learning_rate": 1.9990313702570828e-07, + "loss": 0.0425, + "step": 8000 + }, + { + "epoch": 0.03200191771791568, + "grad_norm": 28.095706939697266, + "learning_rate": 1.9990286012152028e-07, + "loss": 0.0438, + "step": 8010 + }, + { + "epoch": 0.03204187017449235, + "grad_norm": 5.868214130401611, + "learning_rate": 1.999025828222935e-07, + "loss": 0.0421, + "step": 8020 + }, + { + "epoch": 0.032081822631069026, + "grad_norm": 3.987368106842041, + "learning_rate": 1.9990230512802904e-07, + "loss": 0.0483, + "step": 8030 + }, + { + "epoch": 0.0321217750876457, + "grad_norm": 5.484925746917725, + "learning_rate": 1.9990202703872794e-07, + "loss": 0.0469, + "step": 8040 + }, + { + "epoch": 0.03216172754422238, + "grad_norm": 5.672989845275879, + "learning_rate": 1.9990174855439132e-07, + "loss": 0.0485, + "step": 8050 + }, + { + "epoch": 0.03220168000079905, + "grad_norm": 3.6686062812805176, + "learning_rate": 1.999014696750203e-07, + "loss": 0.0451, + "step": 8060 + }, + { + "epoch": 0.03224163245737572, + "grad_norm": 3.0937087535858154, + "learning_rate": 1.9990119040061597e-07, + "loss": 0.0468, + "step": 8070 + }, + { + "epoch": 0.032281584913952395, + "grad_norm": 3.9656879901885986, + "learning_rate": 1.9990091073117946e-07, + "loss": 0.0428, + "step": 8080 + }, + { + "epoch": 0.03232153737052907, + "grad_norm": 5.094934463500977, + "learning_rate": 1.999006306667118e-07, + "loss": 0.0467, + "step": 8090 + }, + { + "epoch": 0.032361489827105745, + "grad_norm": 5.617321968078613, + "learning_rate": 1.9990035020721415e-07, + "loss": 0.045, + "step": 8100 + }, + { + "epoch": 0.03240144228368242, + "grad_norm": 5.728837013244629, + "learning_rate": 1.9990006935268763e-07, + "loss": 0.0457, + "step": 8110 + }, + { + "epoch": 0.03244139474025909, + "grad_norm": 5.159755229949951, + "learning_rate": 1.9989978810313336e-07, + "loss": 0.0465, + "step": 8120 + }, + { + "epoch": 0.032481347196835764, + "grad_norm": 5.1893720626831055, + "learning_rate": 1.9989950645855242e-07, + "loss": 0.0425, + "step": 8130 + }, + { + "epoch": 0.03252129965341244, + "grad_norm": 2.9055211544036865, + "learning_rate": 1.9989922441894594e-07, + "loss": 0.0435, + "step": 8140 + }, + { + "epoch": 0.032561252109989114, + "grad_norm": 4.239157676696777, + "learning_rate": 1.99898941984315e-07, + "loss": 0.0381, + "step": 8150 + }, + { + "epoch": 0.03260120456656579, + "grad_norm": 7.810853481292725, + "learning_rate": 1.9989865915466074e-07, + "loss": 0.0458, + "step": 8160 + }, + { + "epoch": 0.03264115702314246, + "grad_norm": 18.205888748168945, + "learning_rate": 1.9989837592998433e-07, + "loss": 0.0435, + "step": 8170 + }, + { + "epoch": 0.03268110947971913, + "grad_norm": 4.28945255279541, + "learning_rate": 1.9989809231028683e-07, + "loss": 0.0445, + "step": 8180 + }, + { + "epoch": 0.03272106193629581, + "grad_norm": 9.878496170043945, + "learning_rate": 1.9989780829556934e-07, + "loss": 0.0394, + "step": 8190 + }, + { + "epoch": 0.03276101439287248, + "grad_norm": 3.6072211265563965, + "learning_rate": 1.9989752388583306e-07, + "loss": 0.0475, + "step": 8200 + }, + { + "epoch": 0.03280096684944916, + "grad_norm": 8.26125431060791, + "learning_rate": 1.9989723908107905e-07, + "loss": 0.0426, + "step": 8210 + }, + { + "epoch": 0.032840919306025826, + "grad_norm": 2.683516025543213, + "learning_rate": 1.9989695388130845e-07, + "loss": 0.0457, + "step": 8220 + }, + { + "epoch": 0.0328808717626025, + "grad_norm": 5.404463768005371, + "learning_rate": 1.9989666828652244e-07, + "loss": 0.0455, + "step": 8230 + }, + { + "epoch": 0.032920824219179176, + "grad_norm": 2.8093130588531494, + "learning_rate": 1.998963822967221e-07, + "loss": 0.0448, + "step": 8240 + }, + { + "epoch": 0.03296077667575585, + "grad_norm": 5.717135429382324, + "learning_rate": 1.998960959119085e-07, + "loss": 0.043, + "step": 8250 + }, + { + "epoch": 0.033000729132332526, + "grad_norm": 3.8091046810150146, + "learning_rate": 1.998958091320829e-07, + "loss": 0.0456, + "step": 8260 + }, + { + "epoch": 0.0330406815889092, + "grad_norm": 7.556629180908203, + "learning_rate": 1.9989552195724635e-07, + "loss": 0.048, + "step": 8270 + }, + { + "epoch": 0.03308063404548587, + "grad_norm": 6.911259651184082, + "learning_rate": 1.998952343874e-07, + "loss": 0.0443, + "step": 8280 + }, + { + "epoch": 0.033120586502062545, + "grad_norm": 4.932618618011475, + "learning_rate": 1.99894946422545e-07, + "loss": 0.0428, + "step": 8290 + }, + { + "epoch": 0.03316053895863922, + "grad_norm": 6.648102760314941, + "learning_rate": 1.9989465806268254e-07, + "loss": 0.0436, + "step": 8300 + }, + { + "epoch": 0.033200491415215895, + "grad_norm": 10.520838737487793, + "learning_rate": 1.9989436930781362e-07, + "loss": 0.0416, + "step": 8310 + }, + { + "epoch": 0.03324044387179257, + "grad_norm": 11.202194213867188, + "learning_rate": 1.9989408015793954e-07, + "loss": 0.0417, + "step": 8320 + }, + { + "epoch": 0.03328039632836924, + "grad_norm": 6.944174766540527, + "learning_rate": 1.9989379061306133e-07, + "loss": 0.0467, + "step": 8330 + }, + { + "epoch": 0.03332034878494591, + "grad_norm": 2.846296787261963, + "learning_rate": 1.998935006731802e-07, + "loss": 0.0422, + "step": 8340 + }, + { + "epoch": 0.03336030124152259, + "grad_norm": 6.126442909240723, + "learning_rate": 1.9989321033829723e-07, + "loss": 0.0421, + "step": 8350 + }, + { + "epoch": 0.033400253698099264, + "grad_norm": 6.239260673522949, + "learning_rate": 1.9989291960841363e-07, + "loss": 0.0414, + "step": 8360 + }, + { + "epoch": 0.03344020615467594, + "grad_norm": 7.422464370727539, + "learning_rate": 1.9989262848353053e-07, + "loss": 0.0442, + "step": 8370 + }, + { + "epoch": 0.03348015861125261, + "grad_norm": 9.61851978302002, + "learning_rate": 1.9989233696364908e-07, + "loss": 0.042, + "step": 8380 + }, + { + "epoch": 0.03352011106782928, + "grad_norm": 5.053872585296631, + "learning_rate": 1.9989204504877044e-07, + "loss": 0.044, + "step": 8390 + }, + { + "epoch": 0.03356006352440596, + "grad_norm": 4.788636207580566, + "learning_rate": 1.9989175273889577e-07, + "loss": 0.0444, + "step": 8400 + }, + { + "epoch": 0.03360001598098263, + "grad_norm": 5.940697193145752, + "learning_rate": 1.998914600340262e-07, + "loss": 0.0451, + "step": 8410 + }, + { + "epoch": 0.03363996843755931, + "grad_norm": 6.103991985321045, + "learning_rate": 1.9989116693416288e-07, + "loss": 0.0362, + "step": 8420 + }, + { + "epoch": 0.033679920894135976, + "grad_norm": 5.4879679679870605, + "learning_rate": 1.99890873439307e-07, + "loss": 0.0411, + "step": 8430 + }, + { + "epoch": 0.03371987335071265, + "grad_norm": 7.0948486328125, + "learning_rate": 1.9989057954945975e-07, + "loss": 0.0427, + "step": 8440 + }, + { + "epoch": 0.033759825807289326, + "grad_norm": 7.239338397979736, + "learning_rate": 1.9989028526462222e-07, + "loss": 0.0446, + "step": 8450 + }, + { + "epoch": 0.033799778263866, + "grad_norm": 4.757110118865967, + "learning_rate": 1.9988999058479559e-07, + "loss": 0.0392, + "step": 8460 + }, + { + "epoch": 0.033839730720442676, + "grad_norm": 4.657273769378662, + "learning_rate": 1.9988969550998104e-07, + "loss": 0.0436, + "step": 8470 + }, + { + "epoch": 0.033879683177019344, + "grad_norm": 3.6115918159484863, + "learning_rate": 1.9988940004017977e-07, + "loss": 0.0437, + "step": 8480 + }, + { + "epoch": 0.03391963563359602, + "grad_norm": 6.405703544616699, + "learning_rate": 1.9988910417539292e-07, + "loss": 0.0453, + "step": 8490 + }, + { + "epoch": 0.033959588090172695, + "grad_norm": 7.548530578613281, + "learning_rate": 1.998888079156216e-07, + "loss": 0.0432, + "step": 8500 + }, + { + "epoch": 0.03399954054674937, + "grad_norm": 6.058099746704102, + "learning_rate": 1.9988851126086709e-07, + "loss": 0.0398, + "step": 8510 + }, + { + "epoch": 0.034039493003326045, + "grad_norm": 6.906967639923096, + "learning_rate": 1.998882142111305e-07, + "loss": 0.0388, + "step": 8520 + }, + { + "epoch": 0.03407944545990271, + "grad_norm": 4.687408447265625, + "learning_rate": 1.9988791676641302e-07, + "loss": 0.0497, + "step": 8530 + }, + { + "epoch": 0.03411939791647939, + "grad_norm": 4.66245174407959, + "learning_rate": 1.998876189267158e-07, + "loss": 0.0409, + "step": 8540 + }, + { + "epoch": 0.03415935037305606, + "grad_norm": 4.677622318267822, + "learning_rate": 1.998873206920401e-07, + "loss": 0.0388, + "step": 8550 + }, + { + "epoch": 0.03419930282963274, + "grad_norm": 5.701066017150879, + "learning_rate": 1.9988702206238696e-07, + "loss": 0.0438, + "step": 8560 + }, + { + "epoch": 0.034239255286209413, + "grad_norm": 4.159022331237793, + "learning_rate": 1.9988672303775765e-07, + "loss": 0.0367, + "step": 8570 + }, + { + "epoch": 0.03427920774278608, + "grad_norm": 6.628027439117432, + "learning_rate": 1.998864236181534e-07, + "loss": 0.0471, + "step": 8580 + }, + { + "epoch": 0.03431916019936276, + "grad_norm": 4.6008806228637695, + "learning_rate": 1.998861238035753e-07, + "loss": 0.0406, + "step": 8590 + }, + { + "epoch": 0.03435911265593943, + "grad_norm": 9.601922988891602, + "learning_rate": 1.9988582359402456e-07, + "loss": 0.0445, + "step": 8600 + }, + { + "epoch": 0.03439906511251611, + "grad_norm": 13.006784439086914, + "learning_rate": 1.998855229895024e-07, + "loss": 0.0417, + "step": 8610 + }, + { + "epoch": 0.03443901756909278, + "grad_norm": 7.047754764556885, + "learning_rate": 1.9988522199000997e-07, + "loss": 0.0428, + "step": 8620 + }, + { + "epoch": 0.03447897002566945, + "grad_norm": 9.206453323364258, + "learning_rate": 1.998849205955485e-07, + "loss": 0.0452, + "step": 8630 + }, + { + "epoch": 0.034518922482246125, + "grad_norm": 7.958896160125732, + "learning_rate": 1.9988461880611912e-07, + "loss": 0.0359, + "step": 8640 + }, + { + "epoch": 0.0345588749388228, + "grad_norm": 6.810109615325928, + "learning_rate": 1.9988431662172308e-07, + "loss": 0.0416, + "step": 8650 + }, + { + "epoch": 0.034598827395399476, + "grad_norm": 5.657182693481445, + "learning_rate": 1.9988401404236156e-07, + "loss": 0.043, + "step": 8660 + }, + { + "epoch": 0.03463877985197615, + "grad_norm": 4.922200679779053, + "learning_rate": 1.9988371106803575e-07, + "loss": 0.0401, + "step": 8670 + }, + { + "epoch": 0.03467873230855282, + "grad_norm": 4.074776649475098, + "learning_rate": 1.9988340769874687e-07, + "loss": 0.0447, + "step": 8680 + }, + { + "epoch": 0.034718684765129494, + "grad_norm": 3.0053231716156006, + "learning_rate": 1.998831039344961e-07, + "loss": 0.0405, + "step": 8690 + }, + { + "epoch": 0.03475863722170617, + "grad_norm": 11.148980140686035, + "learning_rate": 1.9988279977528463e-07, + "loss": 0.0414, + "step": 8700 + }, + { + "epoch": 0.034798589678282844, + "grad_norm": 6.391834735870361, + "learning_rate": 1.998824952211137e-07, + "loss": 0.0448, + "step": 8710 + }, + { + "epoch": 0.03483854213485952, + "grad_norm": 5.426973342895508, + "learning_rate": 1.9988219027198444e-07, + "loss": 0.0424, + "step": 8720 + }, + { + "epoch": 0.03487849459143619, + "grad_norm": 5.320807456970215, + "learning_rate": 1.9988188492789812e-07, + "loss": 0.044, + "step": 8730 + }, + { + "epoch": 0.03491844704801286, + "grad_norm": 5.016045093536377, + "learning_rate": 1.9988157918885595e-07, + "loss": 0.04, + "step": 8740 + }, + { + "epoch": 0.03495839950458954, + "grad_norm": 5.401718616485596, + "learning_rate": 1.9988127305485915e-07, + "loss": 0.0418, + "step": 8750 + }, + { + "epoch": 0.03499835196116621, + "grad_norm": 13.161367416381836, + "learning_rate": 1.9988096652590886e-07, + "loss": 0.0452, + "step": 8760 + }, + { + "epoch": 0.03503830441774289, + "grad_norm": 4.700403690338135, + "learning_rate": 1.9988065960200632e-07, + "loss": 0.0396, + "step": 8770 + }, + { + "epoch": 0.035078256874319556, + "grad_norm": 6.645179271697998, + "learning_rate": 1.9988035228315278e-07, + "loss": 0.0393, + "step": 8780 + }, + { + "epoch": 0.03511820933089623, + "grad_norm": 5.481716156005859, + "learning_rate": 1.9988004456934946e-07, + "loss": 0.048, + "step": 8790 + }, + { + "epoch": 0.03515816178747291, + "grad_norm": 5.112944602966309, + "learning_rate": 1.9987973646059752e-07, + "loss": 0.0445, + "step": 8800 + }, + { + "epoch": 0.03519811424404958, + "grad_norm": 4.463167190551758, + "learning_rate": 1.9987942795689823e-07, + "loss": 0.0393, + "step": 8810 + }, + { + "epoch": 0.03523806670062626, + "grad_norm": 5.276488780975342, + "learning_rate": 1.9987911905825275e-07, + "loss": 0.0416, + "step": 8820 + }, + { + "epoch": 0.03527801915720293, + "grad_norm": 5.612264156341553, + "learning_rate": 1.998788097646624e-07, + "loss": 0.0401, + "step": 8830 + }, + { + "epoch": 0.0353179716137796, + "grad_norm": 3.2566630840301514, + "learning_rate": 1.9987850007612828e-07, + "loss": 0.0391, + "step": 8840 + }, + { + "epoch": 0.035357924070356275, + "grad_norm": 7.780763626098633, + "learning_rate": 1.9987818999265168e-07, + "loss": 0.0462, + "step": 8850 + }, + { + "epoch": 0.03539787652693295, + "grad_norm": 3.179110527038574, + "learning_rate": 1.9987787951423388e-07, + "loss": 0.045, + "step": 8860 + }, + { + "epoch": 0.035437828983509626, + "grad_norm": 5.401456356048584, + "learning_rate": 1.99877568640876e-07, + "loss": 0.0403, + "step": 8870 + }, + { + "epoch": 0.0354777814400863, + "grad_norm": 5.445248126983643, + "learning_rate": 1.9987725737257932e-07, + "loss": 0.0421, + "step": 8880 + }, + { + "epoch": 0.03551773389666297, + "grad_norm": 4.250365257263184, + "learning_rate": 1.9987694570934512e-07, + "loss": 0.0444, + "step": 8890 + }, + { + "epoch": 0.035557686353239644, + "grad_norm": 5.571085453033447, + "learning_rate": 1.9987663365117453e-07, + "loss": 0.0419, + "step": 8900 + }, + { + "epoch": 0.03559763880981632, + "grad_norm": 17.303264617919922, + "learning_rate": 1.9987632119806887e-07, + "loss": 0.0424, + "step": 8910 + }, + { + "epoch": 0.035637591266392994, + "grad_norm": 5.953324794769287, + "learning_rate": 1.9987600835002934e-07, + "loss": 0.0411, + "step": 8920 + }, + { + "epoch": 0.03567754372296967, + "grad_norm": 3.720167875289917, + "learning_rate": 1.9987569510705717e-07, + "loss": 0.0432, + "step": 8930 + }, + { + "epoch": 0.03571749617954634, + "grad_norm": 4.079144477844238, + "learning_rate": 1.9987538146915362e-07, + "loss": 0.0443, + "step": 8940 + }, + { + "epoch": 0.03575744863612301, + "grad_norm": 6.434004783630371, + "learning_rate": 1.998750674363199e-07, + "loss": 0.0398, + "step": 8950 + }, + { + "epoch": 0.03579740109269969, + "grad_norm": 10.054451942443848, + "learning_rate": 1.998747530085573e-07, + "loss": 0.0427, + "step": 8960 + }, + { + "epoch": 0.03583735354927636, + "grad_norm": 5.760168552398682, + "learning_rate": 1.99874438185867e-07, + "loss": 0.0441, + "step": 8970 + }, + { + "epoch": 0.03587730600585304, + "grad_norm": 4.561140537261963, + "learning_rate": 1.998741229682503e-07, + "loss": 0.0421, + "step": 8980 + }, + { + "epoch": 0.035917258462429706, + "grad_norm": 76.51840209960938, + "learning_rate": 1.998738073557084e-07, + "loss": 0.0424, + "step": 8990 + }, + { + "epoch": 0.03595721091900638, + "grad_norm": 3.9611339569091797, + "learning_rate": 1.998734913482426e-07, + "loss": 0.0373, + "step": 9000 + }, + { + "epoch": 0.035997163375583056, + "grad_norm": 4.957666873931885, + "learning_rate": 1.998731749458541e-07, + "loss": 0.0409, + "step": 9010 + }, + { + "epoch": 0.03603711583215973, + "grad_norm": 4.426530361175537, + "learning_rate": 1.9987285814854421e-07, + "loss": 0.0437, + "step": 9020 + }, + { + "epoch": 0.03607706828873641, + "grad_norm": 2.9557745456695557, + "learning_rate": 1.9987254095631415e-07, + "loss": 0.0446, + "step": 9030 + }, + { + "epoch": 0.036117020745313075, + "grad_norm": 4.324443340301514, + "learning_rate": 1.9987222336916514e-07, + "loss": 0.0473, + "step": 9040 + }, + { + "epoch": 0.03615697320188975, + "grad_norm": 25.445621490478516, + "learning_rate": 1.9987190538709848e-07, + "loss": 0.0437, + "step": 9050 + }, + { + "epoch": 0.036196925658466425, + "grad_norm": 7.144325256347656, + "learning_rate": 1.998715870101154e-07, + "loss": 0.041, + "step": 9060 + }, + { + "epoch": 0.0362368781150431, + "grad_norm": 4.832206726074219, + "learning_rate": 1.9987126823821719e-07, + "loss": 0.0413, + "step": 9070 + }, + { + "epoch": 0.036276830571619775, + "grad_norm": 3.231318950653076, + "learning_rate": 1.998709490714051e-07, + "loss": 0.041, + "step": 9080 + }, + { + "epoch": 0.036316783028196443, + "grad_norm": 5.887318134307861, + "learning_rate": 1.9987062950968038e-07, + "loss": 0.0469, + "step": 9090 + }, + { + "epoch": 0.03635673548477312, + "grad_norm": 5.167107105255127, + "learning_rate": 1.9987030955304426e-07, + "loss": 0.0396, + "step": 9100 + }, + { + "epoch": 0.036396687941349794, + "grad_norm": 4.420378684997559, + "learning_rate": 1.998699892014981e-07, + "loss": 0.0414, + "step": 9110 + }, + { + "epoch": 0.03643664039792647, + "grad_norm": 3.985407590866089, + "learning_rate": 1.9986966845504308e-07, + "loss": 0.0393, + "step": 9120 + }, + { + "epoch": 0.036476592854503144, + "grad_norm": 3.0735177993774414, + "learning_rate": 1.998693473136805e-07, + "loss": 0.044, + "step": 9130 + }, + { + "epoch": 0.03651654531107981, + "grad_norm": 5.947671413421631, + "learning_rate": 1.9986902577741164e-07, + "loss": 0.0371, + "step": 9140 + }, + { + "epoch": 0.03655649776765649, + "grad_norm": 2.1852359771728516, + "learning_rate": 1.9986870384623775e-07, + "loss": 0.0439, + "step": 9150 + }, + { + "epoch": 0.03659645022423316, + "grad_norm": 8.30316162109375, + "learning_rate": 1.9986838152016013e-07, + "loss": 0.04, + "step": 9160 + }, + { + "epoch": 0.03663640268080984, + "grad_norm": 4.644260406494141, + "learning_rate": 1.9986805879918003e-07, + "loss": 0.044, + "step": 9170 + }, + { + "epoch": 0.03667635513738651, + "grad_norm": 12.981446266174316, + "learning_rate": 1.9986773568329874e-07, + "loss": 0.0432, + "step": 9180 + }, + { + "epoch": 0.03671630759396318, + "grad_norm": 6.297481060028076, + "learning_rate": 1.9986741217251751e-07, + "loss": 0.038, + "step": 9190 + }, + { + "epoch": 0.036756260050539856, + "grad_norm": 4.664112567901611, + "learning_rate": 1.9986708826683767e-07, + "loss": 0.0415, + "step": 9200 + }, + { + "epoch": 0.03679621250711653, + "grad_norm": 4.5905375480651855, + "learning_rate": 1.9986676396626047e-07, + "loss": 0.0415, + "step": 9210 + }, + { + "epoch": 0.036836164963693206, + "grad_norm": 6.544052600860596, + "learning_rate": 1.998664392707872e-07, + "loss": 0.0406, + "step": 9220 + }, + { + "epoch": 0.03687611742026988, + "grad_norm": 7.172725677490234, + "learning_rate": 1.9986611418041912e-07, + "loss": 0.0435, + "step": 9230 + }, + { + "epoch": 0.03691606987684655, + "grad_norm": 6.414148807525635, + "learning_rate": 1.9986578869515753e-07, + "loss": 0.0424, + "step": 9240 + }, + { + "epoch": 0.036956022333423225, + "grad_norm": 4.3801045417785645, + "learning_rate": 1.9986546281500372e-07, + "loss": 0.0436, + "step": 9250 + }, + { + "epoch": 0.0369959747899999, + "grad_norm": 5.663834095001221, + "learning_rate": 1.99865136539959e-07, + "loss": 0.046, + "step": 9260 + }, + { + "epoch": 0.037035927246576575, + "grad_norm": 7.403417110443115, + "learning_rate": 1.9986480987002464e-07, + "loss": 0.0434, + "step": 9270 + }, + { + "epoch": 0.03707587970315325, + "grad_norm": 4.723799705505371, + "learning_rate": 1.998644828052019e-07, + "loss": 0.044, + "step": 9280 + }, + { + "epoch": 0.03711583215972992, + "grad_norm": 3.8982038497924805, + "learning_rate": 1.9986415534549214e-07, + "loss": 0.0444, + "step": 9290 + }, + { + "epoch": 0.03715578461630659, + "grad_norm": 8.826886177062988, + "learning_rate": 1.998638274908966e-07, + "loss": 0.0432, + "step": 9300 + }, + { + "epoch": 0.03719573707288327, + "grad_norm": 3.1312479972839355, + "learning_rate": 1.998634992414166e-07, + "loss": 0.0412, + "step": 9310 + }, + { + "epoch": 0.037235689529459944, + "grad_norm": 5.4836745262146, + "learning_rate": 1.9986317059705344e-07, + "loss": 0.0411, + "step": 9320 + }, + { + "epoch": 0.03727564198603662, + "grad_norm": 9.161508560180664, + "learning_rate": 1.9986284155780839e-07, + "loss": 0.0386, + "step": 9330 + }, + { + "epoch": 0.03731559444261329, + "grad_norm": 6.577349662780762, + "learning_rate": 1.9986251212368282e-07, + "loss": 0.0398, + "step": 9340 + }, + { + "epoch": 0.03735554689918996, + "grad_norm": 10.735897064208984, + "learning_rate": 1.9986218229467796e-07, + "loss": 0.0367, + "step": 9350 + }, + { + "epoch": 0.03739549935576664, + "grad_norm": 7.8642706871032715, + "learning_rate": 1.9986185207079513e-07, + "loss": 0.041, + "step": 9360 + }, + { + "epoch": 0.03743545181234331, + "grad_norm": 3.775810718536377, + "learning_rate": 1.9986152145203565e-07, + "loss": 0.0396, + "step": 9370 + }, + { + "epoch": 0.03747540426891999, + "grad_norm": 5.940392017364502, + "learning_rate": 1.9986119043840085e-07, + "loss": 0.0421, + "step": 9380 + }, + { + "epoch": 0.037515356725496656, + "grad_norm": 4.77976131439209, + "learning_rate": 1.99860859029892e-07, + "loss": 0.0435, + "step": 9390 + }, + { + "epoch": 0.03755530918207333, + "grad_norm": 5.481605052947998, + "learning_rate": 1.9986052722651042e-07, + "loss": 0.0439, + "step": 9400 + }, + { + "epoch": 0.037595261638650006, + "grad_norm": 2.8992667198181152, + "learning_rate": 1.9986019502825743e-07, + "loss": 0.0353, + "step": 9410 + }, + { + "epoch": 0.03763521409522668, + "grad_norm": 6.296430587768555, + "learning_rate": 1.9985986243513432e-07, + "loss": 0.0491, + "step": 9420 + }, + { + "epoch": 0.037675166551803356, + "grad_norm": 8.620664596557617, + "learning_rate": 1.9985952944714245e-07, + "loss": 0.0356, + "step": 9430 + }, + { + "epoch": 0.03771511900838003, + "grad_norm": 6.192637920379639, + "learning_rate": 1.9985919606428312e-07, + "loss": 0.0436, + "step": 9440 + }, + { + "epoch": 0.0377550714649567, + "grad_norm": 3.137894630432129, + "learning_rate": 1.9985886228655762e-07, + "loss": 0.0408, + "step": 9450 + }, + { + "epoch": 0.037795023921533374, + "grad_norm": 101.87504577636719, + "learning_rate": 1.998585281139673e-07, + "loss": 0.0467, + "step": 9460 + }, + { + "epoch": 0.03783497637811005, + "grad_norm": 3.367908239364624, + "learning_rate": 1.9985819354651347e-07, + "loss": 0.042, + "step": 9470 + }, + { + "epoch": 0.037874928834686725, + "grad_norm": 4.316262722015381, + "learning_rate": 1.9985785858419744e-07, + "loss": 0.0418, + "step": 9480 + }, + { + "epoch": 0.0379148812912634, + "grad_norm": 8.299461364746094, + "learning_rate": 1.9985752322702053e-07, + "loss": 0.0455, + "step": 9490 + }, + { + "epoch": 0.03795483374784007, + "grad_norm": 6.472344875335693, + "learning_rate": 1.9985718747498416e-07, + "loss": 0.044, + "step": 9500 + }, + { + "epoch": 0.03799478620441674, + "grad_norm": 4.34024715423584, + "learning_rate": 1.998568513280895e-07, + "loss": 0.0406, + "step": 9510 + }, + { + "epoch": 0.03803473866099342, + "grad_norm": 10.391509056091309, + "learning_rate": 1.99856514786338e-07, + "loss": 0.0433, + "step": 9520 + }, + { + "epoch": 0.03807469111757009, + "grad_norm": 6.3040995597839355, + "learning_rate": 1.9985617784973098e-07, + "loss": 0.0404, + "step": 9530 + }, + { + "epoch": 0.03811464357414677, + "grad_norm": 5.021454811096191, + "learning_rate": 1.998558405182697e-07, + "loss": 0.0416, + "step": 9540 + }, + { + "epoch": 0.03815459603072344, + "grad_norm": 7.288928031921387, + "learning_rate": 1.9985550279195556e-07, + "loss": 0.0402, + "step": 9550 + }, + { + "epoch": 0.03819454848730011, + "grad_norm": 6.32245397567749, + "learning_rate": 1.9985516467078985e-07, + "loss": 0.0398, + "step": 9560 + }, + { + "epoch": 0.03823450094387679, + "grad_norm": 5.856606960296631, + "learning_rate": 1.9985482615477393e-07, + "loss": 0.0419, + "step": 9570 + }, + { + "epoch": 0.03827445340045346, + "grad_norm": 9.752838134765625, + "learning_rate": 1.9985448724390918e-07, + "loss": 0.0398, + "step": 9580 + }, + { + "epoch": 0.03831440585703014, + "grad_norm": 5.933298110961914, + "learning_rate": 1.9985414793819687e-07, + "loss": 0.0441, + "step": 9590 + }, + { + "epoch": 0.038354358313606805, + "grad_norm": 2.8128058910369873, + "learning_rate": 1.9985380823763838e-07, + "loss": 0.0394, + "step": 9600 + }, + { + "epoch": 0.03839431077018348, + "grad_norm": 12.29112720489502, + "learning_rate": 1.9985346814223503e-07, + "loss": 0.0396, + "step": 9610 + }, + { + "epoch": 0.038434263226760156, + "grad_norm": 6.439874172210693, + "learning_rate": 1.9985312765198817e-07, + "loss": 0.0434, + "step": 9620 + }, + { + "epoch": 0.03847421568333683, + "grad_norm": 3.614041805267334, + "learning_rate": 1.9985278676689918e-07, + "loss": 0.0412, + "step": 9630 + }, + { + "epoch": 0.038514168139913506, + "grad_norm": 5.472325325012207, + "learning_rate": 1.9985244548696941e-07, + "loss": 0.0438, + "step": 9640 + }, + { + "epoch": 0.038554120596490174, + "grad_norm": 5.515043258666992, + "learning_rate": 1.9985210381220016e-07, + "loss": 0.0354, + "step": 9650 + }, + { + "epoch": 0.03859407305306685, + "grad_norm": 8.969761848449707, + "learning_rate": 1.9985176174259277e-07, + "loss": 0.0362, + "step": 9660 + }, + { + "epoch": 0.038634025509643524, + "grad_norm": 4.376002311706543, + "learning_rate": 1.9985141927814867e-07, + "loss": 0.0379, + "step": 9670 + }, + { + "epoch": 0.0386739779662202, + "grad_norm": 4.731054306030273, + "learning_rate": 1.9985107641886914e-07, + "loss": 0.0455, + "step": 9680 + }, + { + "epoch": 0.038713930422796874, + "grad_norm": 6.2753987312316895, + "learning_rate": 1.998507331647556e-07, + "loss": 0.0468, + "step": 9690 + }, + { + "epoch": 0.03875388287937354, + "grad_norm": 6.443223476409912, + "learning_rate": 1.9985038951580936e-07, + "loss": 0.0433, + "step": 9700 + }, + { + "epoch": 0.03879383533595022, + "grad_norm": 4.747934818267822, + "learning_rate": 1.9985004547203178e-07, + "loss": 0.0421, + "step": 9710 + }, + { + "epoch": 0.03883378779252689, + "grad_norm": 7.029856204986572, + "learning_rate": 1.9984970103342422e-07, + "loss": 0.0436, + "step": 9720 + }, + { + "epoch": 0.03887374024910357, + "grad_norm": 3.3462793827056885, + "learning_rate": 1.998493561999881e-07, + "loss": 0.0441, + "step": 9730 + }, + { + "epoch": 0.03891369270568024, + "grad_norm": 6.149877071380615, + "learning_rate": 1.998490109717247e-07, + "loss": 0.0441, + "step": 9740 + }, + { + "epoch": 0.03895364516225691, + "grad_norm": 4.294563293457031, + "learning_rate": 1.9984866534863546e-07, + "loss": 0.0474, + "step": 9750 + }, + { + "epoch": 0.038993597618833586, + "grad_norm": 3.7777650356292725, + "learning_rate": 1.9984831933072168e-07, + "loss": 0.042, + "step": 9760 + }, + { + "epoch": 0.03903355007541026, + "grad_norm": 3.382599115371704, + "learning_rate": 1.9984797291798478e-07, + "loss": 0.0394, + "step": 9770 + }, + { + "epoch": 0.03907350253198694, + "grad_norm": 4.784400463104248, + "learning_rate": 1.998476261104261e-07, + "loss": 0.0414, + "step": 9780 + }, + { + "epoch": 0.03911345498856361, + "grad_norm": 22.11922836303711, + "learning_rate": 1.99847278908047e-07, + "loss": 0.0452, + "step": 9790 + }, + { + "epoch": 0.03915340744514028, + "grad_norm": 5.024914264678955, + "learning_rate": 1.9984693131084888e-07, + "loss": 0.0415, + "step": 9800 + }, + { + "epoch": 0.039193359901716955, + "grad_norm": 4.0563788414001465, + "learning_rate": 1.9984658331883315e-07, + "loss": 0.0368, + "step": 9810 + }, + { + "epoch": 0.03923331235829363, + "grad_norm": 4.917999267578125, + "learning_rate": 1.998462349320011e-07, + "loss": 0.0472, + "step": 9820 + }, + { + "epoch": 0.039273264814870305, + "grad_norm": 3.578237295150757, + "learning_rate": 1.9984588615035416e-07, + "loss": 0.037, + "step": 9830 + }, + { + "epoch": 0.03931321727144698, + "grad_norm": 7.866146564483643, + "learning_rate": 1.998455369738937e-07, + "loss": 0.0396, + "step": 9840 + }, + { + "epoch": 0.03935316972802365, + "grad_norm": 4.114502429962158, + "learning_rate": 1.9984518740262111e-07, + "loss": 0.0429, + "step": 9850 + }, + { + "epoch": 0.039393122184600324, + "grad_norm": 3.9383718967437744, + "learning_rate": 1.9984483743653777e-07, + "loss": 0.0457, + "step": 9860 + }, + { + "epoch": 0.039433074641177, + "grad_norm": 5.975452423095703, + "learning_rate": 1.9984448707564506e-07, + "loss": 0.0427, + "step": 9870 + }, + { + "epoch": 0.039473027097753674, + "grad_norm": 4.147711753845215, + "learning_rate": 1.9984413631994435e-07, + "loss": 0.0423, + "step": 9880 + }, + { + "epoch": 0.03951297955433035, + "grad_norm": 9.578468322753906, + "learning_rate": 1.9984378516943702e-07, + "loss": 0.0455, + "step": 9890 + }, + { + "epoch": 0.03955293201090702, + "grad_norm": 5.065750598907471, + "learning_rate": 1.998434336241245e-07, + "loss": 0.0393, + "step": 9900 + }, + { + "epoch": 0.03959288446748369, + "grad_norm": 3.9796578884124756, + "learning_rate": 1.9984308168400817e-07, + "loss": 0.0412, + "step": 9910 + }, + { + "epoch": 0.03963283692406037, + "grad_norm": 5.1515793800354, + "learning_rate": 1.9984272934908938e-07, + "loss": 0.0432, + "step": 9920 + }, + { + "epoch": 0.03967278938063704, + "grad_norm": 6.010573387145996, + "learning_rate": 1.998423766193696e-07, + "loss": 0.0338, + "step": 9930 + }, + { + "epoch": 0.03971274183721372, + "grad_norm": 4.9055891036987305, + "learning_rate": 1.9984202349485012e-07, + "loss": 0.0348, + "step": 9940 + }, + { + "epoch": 0.039752694293790386, + "grad_norm": 4.0952935218811035, + "learning_rate": 1.998416699755324e-07, + "loss": 0.0346, + "step": 9950 + }, + { + "epoch": 0.03979264675036706, + "grad_norm": 8.057466506958008, + "learning_rate": 1.9984131606141786e-07, + "loss": 0.0405, + "step": 9960 + }, + { + "epoch": 0.039832599206943736, + "grad_norm": 5.919281005859375, + "learning_rate": 1.9984096175250787e-07, + "loss": 0.0444, + "step": 9970 + }, + { + "epoch": 0.03987255166352041, + "grad_norm": 7.792891502380371, + "learning_rate": 1.9984060704880381e-07, + "loss": 0.0435, + "step": 9980 + }, + { + "epoch": 0.039912504120097086, + "grad_norm": 22.224016189575195, + "learning_rate": 1.9984025195030713e-07, + "loss": 0.0405, + "step": 9990 + }, + { + "epoch": 0.03995245657667376, + "grad_norm": 4.409722805023193, + "learning_rate": 1.998398964570192e-07, + "loss": 0.0434, + "step": 10000 + }, + { + "epoch": 0.03999240903325043, + "grad_norm": 1.9897490739822388, + "learning_rate": 1.9983954056894144e-07, + "loss": 0.0372, + "step": 10010 + }, + { + "epoch": 0.040032361489827105, + "grad_norm": 9.55389404296875, + "learning_rate": 1.9983918428607524e-07, + "loss": 0.0442, + "step": 10020 + }, + { + "epoch": 0.04007231394640378, + "grad_norm": 9.745152473449707, + "learning_rate": 1.9983882760842202e-07, + "loss": 0.0376, + "step": 10030 + }, + { + "epoch": 0.040112266402980455, + "grad_norm": 7.532162666320801, + "learning_rate": 1.9983847053598318e-07, + "loss": 0.0419, + "step": 10040 + }, + { + "epoch": 0.04015221885955713, + "grad_norm": 4.4302873611450195, + "learning_rate": 1.9983811306876018e-07, + "loss": 0.0411, + "step": 10050 + }, + { + "epoch": 0.0401921713161338, + "grad_norm": 7.041217803955078, + "learning_rate": 1.998377552067544e-07, + "loss": 0.0366, + "step": 10060 + }, + { + "epoch": 0.040232123772710474, + "grad_norm": 10.470749855041504, + "learning_rate": 1.9983739694996722e-07, + "loss": 0.0403, + "step": 10070 + }, + { + "epoch": 0.04027207622928715, + "grad_norm": 5.260657787322998, + "learning_rate": 1.998370382984001e-07, + "loss": 0.0372, + "step": 10080 + }, + { + "epoch": 0.040312028685863824, + "grad_norm": 4.915506839752197, + "learning_rate": 1.9983667925205444e-07, + "loss": 0.0386, + "step": 10090 + }, + { + "epoch": 0.0403519811424405, + "grad_norm": 4.264202117919922, + "learning_rate": 1.9983631981093167e-07, + "loss": 0.0456, + "step": 10100 + }, + { + "epoch": 0.04039193359901717, + "grad_norm": 4.751368999481201, + "learning_rate": 1.9983595997503323e-07, + "loss": 0.0393, + "step": 10110 + }, + { + "epoch": 0.04043188605559384, + "grad_norm": 5.31189489364624, + "learning_rate": 1.9983559974436048e-07, + "loss": 0.0362, + "step": 10120 + }, + { + "epoch": 0.04047183851217052, + "grad_norm": 4.228819370269775, + "learning_rate": 1.998352391189149e-07, + "loss": 0.0372, + "step": 10130 + }, + { + "epoch": 0.04051179096874719, + "grad_norm": 6.6441168785095215, + "learning_rate": 1.998348780986979e-07, + "loss": 0.0385, + "step": 10140 + }, + { + "epoch": 0.04055174342532387, + "grad_norm": 4.8251752853393555, + "learning_rate": 1.9983451668371095e-07, + "loss": 0.0417, + "step": 10150 + }, + { + "epoch": 0.040591695881900536, + "grad_norm": 8.733756065368652, + "learning_rate": 1.9983415487395538e-07, + "loss": 0.0422, + "step": 10160 + }, + { + "epoch": 0.04063164833847721, + "grad_norm": 5.42238712310791, + "learning_rate": 1.9983379266943272e-07, + "loss": 0.0421, + "step": 10170 + }, + { + "epoch": 0.040671600795053886, + "grad_norm": 6.2550249099731445, + "learning_rate": 1.9983343007014432e-07, + "loss": 0.041, + "step": 10180 + }, + { + "epoch": 0.04071155325163056, + "grad_norm": 8.067708969116211, + "learning_rate": 1.998330670760917e-07, + "loss": 0.0356, + "step": 10190 + }, + { + "epoch": 0.040751505708207236, + "grad_norm": 7.151599884033203, + "learning_rate": 1.9983270368727623e-07, + "loss": 0.0401, + "step": 10200 + }, + { + "epoch": 0.040791458164783904, + "grad_norm": 4.2172160148620605, + "learning_rate": 1.9983233990369937e-07, + "loss": 0.0423, + "step": 10210 + }, + { + "epoch": 0.04083141062136058, + "grad_norm": 4.274363040924072, + "learning_rate": 1.9983197572536254e-07, + "loss": 0.0393, + "step": 10220 + }, + { + "epoch": 0.040871363077937255, + "grad_norm": 6.02797794342041, + "learning_rate": 1.998316111522672e-07, + "loss": 0.041, + "step": 10230 + }, + { + "epoch": 0.04091131553451393, + "grad_norm": 7.846842288970947, + "learning_rate": 1.9983124618441477e-07, + "loss": 0.0408, + "step": 10240 + }, + { + "epoch": 0.040951267991090605, + "grad_norm": 15.180091857910156, + "learning_rate": 1.9983088082180674e-07, + "loss": 0.0426, + "step": 10250 + }, + { + "epoch": 0.04099122044766727, + "grad_norm": 6.9776105880737305, + "learning_rate": 1.9983051506444452e-07, + "loss": 0.0372, + "step": 10260 + }, + { + "epoch": 0.04103117290424395, + "grad_norm": 4.986519813537598, + "learning_rate": 1.9983014891232957e-07, + "loss": 0.0405, + "step": 10270 + }, + { + "epoch": 0.04107112536082062, + "grad_norm": 8.593491554260254, + "learning_rate": 1.998297823654633e-07, + "loss": 0.0437, + "step": 10280 + }, + { + "epoch": 0.0411110778173973, + "grad_norm": 3.5372915267944336, + "learning_rate": 1.9982941542384722e-07, + "loss": 0.0323, + "step": 10290 + }, + { + "epoch": 0.041151030273973974, + "grad_norm": 3.5848894119262695, + "learning_rate": 1.9982904808748273e-07, + "loss": 0.0412, + "step": 10300 + }, + { + "epoch": 0.04119098273055064, + "grad_norm": 6.361612796783447, + "learning_rate": 1.998286803563713e-07, + "loss": 0.0373, + "step": 10310 + }, + { + "epoch": 0.04123093518712732, + "grad_norm": 9.877705574035645, + "learning_rate": 1.998283122305144e-07, + "loss": 0.0415, + "step": 10320 + }, + { + "epoch": 0.04127088764370399, + "grad_norm": 4.144218444824219, + "learning_rate": 1.9982794370991344e-07, + "loss": 0.0343, + "step": 10330 + }, + { + "epoch": 0.04131084010028067, + "grad_norm": 4.3605146408081055, + "learning_rate": 1.9982757479456995e-07, + "loss": 0.0381, + "step": 10340 + }, + { + "epoch": 0.04135079255685734, + "grad_norm": 5.343433380126953, + "learning_rate": 1.9982720548448533e-07, + "loss": 0.0386, + "step": 10350 + }, + { + "epoch": 0.04139074501343401, + "grad_norm": 4.225393295288086, + "learning_rate": 1.9982683577966105e-07, + "loss": 0.0337, + "step": 10360 + }, + { + "epoch": 0.041430697470010686, + "grad_norm": 7.256253719329834, + "learning_rate": 1.9982646568009858e-07, + "loss": 0.0413, + "step": 10370 + }, + { + "epoch": 0.04147064992658736, + "grad_norm": 7.098186016082764, + "learning_rate": 1.998260951857994e-07, + "loss": 0.044, + "step": 10380 + }, + { + "epoch": 0.041510602383164036, + "grad_norm": 8.040099143981934, + "learning_rate": 1.9982572429676496e-07, + "loss": 0.0381, + "step": 10390 + }, + { + "epoch": 0.04155055483974071, + "grad_norm": 38.414180755615234, + "learning_rate": 1.998253530129967e-07, + "loss": 0.0482, + "step": 10400 + }, + { + "epoch": 0.04159050729631738, + "grad_norm": 5.797572612762451, + "learning_rate": 1.9982498133449616e-07, + "loss": 0.0392, + "step": 10410 + }, + { + "epoch": 0.041630459752894054, + "grad_norm": 7.237467288970947, + "learning_rate": 1.998246092612647e-07, + "loss": 0.0376, + "step": 10420 + }, + { + "epoch": 0.04167041220947073, + "grad_norm": 5.287996292114258, + "learning_rate": 1.9982423679330389e-07, + "loss": 0.0375, + "step": 10430 + }, + { + "epoch": 0.041710364666047405, + "grad_norm": 23.13751983642578, + "learning_rate": 1.9982386393061518e-07, + "loss": 0.0368, + "step": 10440 + }, + { + "epoch": 0.04175031712262408, + "grad_norm": 6.17320442199707, + "learning_rate": 1.9982349067319998e-07, + "loss": 0.0432, + "step": 10450 + }, + { + "epoch": 0.04179026957920075, + "grad_norm": 3.478424310684204, + "learning_rate": 1.9982311702105986e-07, + "loss": 0.0422, + "step": 10460 + }, + { + "epoch": 0.04183022203577742, + "grad_norm": 11.880602836608887, + "learning_rate": 1.9982274297419622e-07, + "loss": 0.0392, + "step": 10470 + }, + { + "epoch": 0.0418701744923541, + "grad_norm": 4.732335567474365, + "learning_rate": 1.9982236853261062e-07, + "loss": 0.0444, + "step": 10480 + }, + { + "epoch": 0.04191012694893077, + "grad_norm": 5.376004219055176, + "learning_rate": 1.9982199369630447e-07, + "loss": 0.0456, + "step": 10490 + }, + { + "epoch": 0.04195007940550745, + "grad_norm": 5.608545780181885, + "learning_rate": 1.9982161846527927e-07, + "loss": 0.0389, + "step": 10500 + }, + { + "epoch": 0.041990031862084116, + "grad_norm": 8.584953308105469, + "learning_rate": 1.9982124283953652e-07, + "loss": 0.0394, + "step": 10510 + }, + { + "epoch": 0.04202998431866079, + "grad_norm": 3.007359743118286, + "learning_rate": 1.9982086681907767e-07, + "loss": 0.0414, + "step": 10520 + }, + { + "epoch": 0.04206993677523747, + "grad_norm": 3.882066011428833, + "learning_rate": 1.998204904039043e-07, + "loss": 0.0367, + "step": 10530 + }, + { + "epoch": 0.04210988923181414, + "grad_norm": 4.266863822937012, + "learning_rate": 1.9982011359401776e-07, + "loss": 0.0397, + "step": 10540 + }, + { + "epoch": 0.04214984168839082, + "grad_norm": 5.314237117767334, + "learning_rate": 1.9981973638941966e-07, + "loss": 0.0401, + "step": 10550 + }, + { + "epoch": 0.04218979414496749, + "grad_norm": 3.261385440826416, + "learning_rate": 1.998193587901114e-07, + "loss": 0.0384, + "step": 10560 + }, + { + "epoch": 0.04222974660154416, + "grad_norm": 5.1781182289123535, + "learning_rate": 1.9981898079609453e-07, + "loss": 0.038, + "step": 10570 + }, + { + "epoch": 0.042269699058120835, + "grad_norm": 9.43580150604248, + "learning_rate": 1.9981860240737055e-07, + "loss": 0.0396, + "step": 10580 + }, + { + "epoch": 0.04230965151469751, + "grad_norm": 6.607363224029541, + "learning_rate": 1.9981822362394092e-07, + "loss": 0.0367, + "step": 10590 + }, + { + "epoch": 0.042349603971274186, + "grad_norm": 5.0340046882629395, + "learning_rate": 1.9981784444580716e-07, + "loss": 0.0364, + "step": 10600 + }, + { + "epoch": 0.04238955642785086, + "grad_norm": 5.191564083099365, + "learning_rate": 1.9981746487297077e-07, + "loss": 0.0353, + "step": 10610 + }, + { + "epoch": 0.04242950888442753, + "grad_norm": 9.527766227722168, + "learning_rate": 1.998170849054332e-07, + "loss": 0.0367, + "step": 10620 + }, + { + "epoch": 0.042469461341004204, + "grad_norm": 3.8079116344451904, + "learning_rate": 1.9981670454319607e-07, + "loss": 0.0403, + "step": 10630 + }, + { + "epoch": 0.04250941379758088, + "grad_norm": 4.623437404632568, + "learning_rate": 1.9981632378626077e-07, + "loss": 0.0388, + "step": 10640 + }, + { + "epoch": 0.042549366254157554, + "grad_norm": 6.778859615325928, + "learning_rate": 1.9981594263462884e-07, + "loss": 0.0385, + "step": 10650 + }, + { + "epoch": 0.04258931871073423, + "grad_norm": 6.348071575164795, + "learning_rate": 1.998155610883018e-07, + "loss": 0.0397, + "step": 10660 + }, + { + "epoch": 0.0426292711673109, + "grad_norm": 4.22435998916626, + "learning_rate": 1.9981517914728117e-07, + "loss": 0.0447, + "step": 10670 + }, + { + "epoch": 0.04266922362388757, + "grad_norm": 7.8935723304748535, + "learning_rate": 1.9981479681156844e-07, + "loss": 0.0444, + "step": 10680 + }, + { + "epoch": 0.04270917608046425, + "grad_norm": 4.953075885772705, + "learning_rate": 1.9981441408116512e-07, + "loss": 0.0389, + "step": 10690 + }, + { + "epoch": 0.04274912853704092, + "grad_norm": 4.226785659790039, + "learning_rate": 1.998140309560727e-07, + "loss": 0.0393, + "step": 10700 + }, + { + "epoch": 0.0427890809936176, + "grad_norm": 4.1701483726501465, + "learning_rate": 1.9981364743629273e-07, + "loss": 0.0369, + "step": 10710 + }, + { + "epoch": 0.042829033450194266, + "grad_norm": 5.168369770050049, + "learning_rate": 1.9981326352182674e-07, + "loss": 0.0444, + "step": 10720 + }, + { + "epoch": 0.04286898590677094, + "grad_norm": 3.8677730560302734, + "learning_rate": 1.998128792126762e-07, + "loss": 0.0335, + "step": 10730 + }, + { + "epoch": 0.042908938363347617, + "grad_norm": 5.179389476776123, + "learning_rate": 1.998124945088427e-07, + "loss": 0.0399, + "step": 10740 + }, + { + "epoch": 0.04294889081992429, + "grad_norm": 5.425754070281982, + "learning_rate": 1.9981210941032768e-07, + "loss": 0.0433, + "step": 10750 + }, + { + "epoch": 0.04298884327650097, + "grad_norm": 7.896840572357178, + "learning_rate": 1.9981172391713275e-07, + "loss": 0.0397, + "step": 10760 + }, + { + "epoch": 0.043028795733077635, + "grad_norm": 2.262988328933716, + "learning_rate": 1.9981133802925935e-07, + "loss": 0.0403, + "step": 10770 + }, + { + "epoch": 0.04306874818965431, + "grad_norm": 10.651250839233398, + "learning_rate": 1.9981095174670904e-07, + "loss": 0.0408, + "step": 10780 + }, + { + "epoch": 0.043108700646230985, + "grad_norm": 4.126854419708252, + "learning_rate": 1.9981056506948338e-07, + "loss": 0.0403, + "step": 10790 + }, + { + "epoch": 0.04314865310280766, + "grad_norm": 6.846651554107666, + "learning_rate": 1.998101779975838e-07, + "loss": 0.0388, + "step": 10800 + }, + { + "epoch": 0.043188605559384335, + "grad_norm": 4.617361545562744, + "learning_rate": 1.9980979053101196e-07, + "loss": 0.0401, + "step": 10810 + }, + { + "epoch": 0.043228558015961004, + "grad_norm": 6.3959856033325195, + "learning_rate": 1.9980940266976932e-07, + "loss": 0.0392, + "step": 10820 + }, + { + "epoch": 0.04326851047253768, + "grad_norm": 51.39228439331055, + "learning_rate": 1.9980901441385742e-07, + "loss": 0.0402, + "step": 10830 + }, + { + "epoch": 0.043308462929114354, + "grad_norm": 5.093213081359863, + "learning_rate": 1.998086257632778e-07, + "loss": 0.0383, + "step": 10840 + }, + { + "epoch": 0.04334841538569103, + "grad_norm": 3.6338119506835938, + "learning_rate": 1.9980823671803197e-07, + "loss": 0.0467, + "step": 10850 + }, + { + "epoch": 0.043388367842267704, + "grad_norm": 5.799893379211426, + "learning_rate": 1.9980784727812153e-07, + "loss": 0.0336, + "step": 10860 + }, + { + "epoch": 0.04342832029884437, + "grad_norm": 4.662907123565674, + "learning_rate": 1.9980745744354796e-07, + "loss": 0.0365, + "step": 10870 + }, + { + "epoch": 0.04346827275542105, + "grad_norm": 3.361572265625, + "learning_rate": 1.9980706721431285e-07, + "loss": 0.0402, + "step": 10880 + }, + { + "epoch": 0.04350822521199772, + "grad_norm": 3.8855862617492676, + "learning_rate": 1.998066765904177e-07, + "loss": 0.0381, + "step": 10890 + }, + { + "epoch": 0.0435481776685744, + "grad_norm": 7.214828014373779, + "learning_rate": 1.998062855718641e-07, + "loss": 0.0402, + "step": 10900 + }, + { + "epoch": 0.04358813012515107, + "grad_norm": 2.3154706954956055, + "learning_rate": 1.9980589415865353e-07, + "loss": 0.0393, + "step": 10910 + }, + { + "epoch": 0.04362808258172774, + "grad_norm": 5.5614118576049805, + "learning_rate": 1.9980550235078758e-07, + "loss": 0.0385, + "step": 10920 + }, + { + "epoch": 0.043668035038304416, + "grad_norm": 7.530680179595947, + "learning_rate": 1.9980511014826783e-07, + "loss": 0.0396, + "step": 10930 + }, + { + "epoch": 0.04370798749488109, + "grad_norm": 3.7718653678894043, + "learning_rate": 1.9980471755109577e-07, + "loss": 0.0366, + "step": 10940 + }, + { + "epoch": 0.043747939951457766, + "grad_norm": 5.6751179695129395, + "learning_rate": 1.99804324559273e-07, + "loss": 0.0329, + "step": 10950 + }, + { + "epoch": 0.04378789240803444, + "grad_norm": 5.222137451171875, + "learning_rate": 1.9980393117280107e-07, + "loss": 0.0369, + "step": 10960 + }, + { + "epoch": 0.04382784486461111, + "grad_norm": 5.370736598968506, + "learning_rate": 1.998035373916815e-07, + "loss": 0.0403, + "step": 10970 + }, + { + "epoch": 0.043867797321187785, + "grad_norm": 9.261459350585938, + "learning_rate": 1.9980314321591587e-07, + "loss": 0.0354, + "step": 10980 + }, + { + "epoch": 0.04390774977776446, + "grad_norm": 8.06772518157959, + "learning_rate": 1.9980274864550572e-07, + "loss": 0.0437, + "step": 10990 + }, + { + "epoch": 0.043947702234341135, + "grad_norm": 6.10985803604126, + "learning_rate": 1.9980235368045263e-07, + "loss": 0.0402, + "step": 11000 + }, + { + "epoch": 0.04398765469091781, + "grad_norm": 6.742569446563721, + "learning_rate": 1.9980195832075818e-07, + "loss": 0.034, + "step": 11010 + }, + { + "epoch": 0.04402760714749448, + "grad_norm": 4.980236053466797, + "learning_rate": 1.9980156256642388e-07, + "loss": 0.0336, + "step": 11020 + }, + { + "epoch": 0.04406755960407115, + "grad_norm": 3.70383882522583, + "learning_rate": 1.9980116641745134e-07, + "loss": 0.0385, + "step": 11030 + }, + { + "epoch": 0.04410751206064783, + "grad_norm": 9.321043014526367, + "learning_rate": 1.998007698738421e-07, + "loss": 0.0351, + "step": 11040 + }, + { + "epoch": 0.044147464517224504, + "grad_norm": 6.116376876831055, + "learning_rate": 1.998003729355978e-07, + "loss": 0.0395, + "step": 11050 + }, + { + "epoch": 0.04418741697380118, + "grad_norm": 5.3540263175964355, + "learning_rate": 1.997999756027199e-07, + "loss": 0.0361, + "step": 11060 + }, + { + "epoch": 0.04422736943037785, + "grad_norm": 3.9912922382354736, + "learning_rate": 1.9979957787521e-07, + "loss": 0.0377, + "step": 11070 + }, + { + "epoch": 0.04426732188695452, + "grad_norm": 3.044334650039673, + "learning_rate": 1.9979917975306972e-07, + "loss": 0.034, + "step": 11080 + }, + { + "epoch": 0.0443072743435312, + "grad_norm": 4.004938125610352, + "learning_rate": 1.9979878123630062e-07, + "loss": 0.0376, + "step": 11090 + }, + { + "epoch": 0.04434722680010787, + "grad_norm": 2.5519423484802246, + "learning_rate": 1.9979838232490425e-07, + "loss": 0.0382, + "step": 11100 + }, + { + "epoch": 0.04438717925668455, + "grad_norm": 6.7057271003723145, + "learning_rate": 1.997979830188822e-07, + "loss": 0.0405, + "step": 11110 + }, + { + "epoch": 0.04442713171326122, + "grad_norm": 5.8751349449157715, + "learning_rate": 1.9979758331823607e-07, + "loss": 0.0325, + "step": 11120 + }, + { + "epoch": 0.04446708416983789, + "grad_norm": 7.706109046936035, + "learning_rate": 1.997971832229674e-07, + "loss": 0.0436, + "step": 11130 + }, + { + "epoch": 0.044507036626414566, + "grad_norm": 5.668893337249756, + "learning_rate": 1.997967827330778e-07, + "loss": 0.0409, + "step": 11140 + }, + { + "epoch": 0.04454698908299124, + "grad_norm": 5.649062156677246, + "learning_rate": 1.9979638184856883e-07, + "loss": 0.0396, + "step": 11150 + }, + { + "epoch": 0.044586941539567916, + "grad_norm": 18.903596878051758, + "learning_rate": 1.9979598056944212e-07, + "loss": 0.0358, + "step": 11160 + }, + { + "epoch": 0.04462689399614459, + "grad_norm": 4.475780963897705, + "learning_rate": 1.997955788956992e-07, + "loss": 0.038, + "step": 11170 + }, + { + "epoch": 0.04466684645272126, + "grad_norm": 6.148825168609619, + "learning_rate": 1.9979517682734168e-07, + "loss": 0.0365, + "step": 11180 + }, + { + "epoch": 0.044706798909297935, + "grad_norm": 6.73000431060791, + "learning_rate": 1.9979477436437118e-07, + "loss": 0.0382, + "step": 11190 + }, + { + "epoch": 0.04474675136587461, + "grad_norm": 3.3984313011169434, + "learning_rate": 1.9979437150678925e-07, + "loss": 0.0348, + "step": 11200 + }, + { + "epoch": 0.044786703822451285, + "grad_norm": 4.921637058258057, + "learning_rate": 1.9979396825459752e-07, + "loss": 0.0366, + "step": 11210 + }, + { + "epoch": 0.04482665627902796, + "grad_norm": 11.748395919799805, + "learning_rate": 1.997935646077976e-07, + "loss": 0.0378, + "step": 11220 + }, + { + "epoch": 0.04486660873560463, + "grad_norm": 4.7719855308532715, + "learning_rate": 1.99793160566391e-07, + "loss": 0.0375, + "step": 11230 + }, + { + "epoch": 0.0449065611921813, + "grad_norm": 7.632346153259277, + "learning_rate": 1.9979275613037937e-07, + "loss": 0.0388, + "step": 11240 + }, + { + "epoch": 0.04494651364875798, + "grad_norm": 6.593498229980469, + "learning_rate": 1.9979235129976432e-07, + "loss": 0.0391, + "step": 11250 + }, + { + "epoch": 0.04498646610533465, + "grad_norm": 9.29541015625, + "learning_rate": 1.9979194607454746e-07, + "loss": 0.0393, + "step": 11260 + }, + { + "epoch": 0.04502641856191133, + "grad_norm": 5.111682415008545, + "learning_rate": 1.9979154045473034e-07, + "loss": 0.0345, + "step": 11270 + }, + { + "epoch": 0.045066371018488, + "grad_norm": 8.651838302612305, + "learning_rate": 1.9979113444031462e-07, + "loss": 0.0393, + "step": 11280 + }, + { + "epoch": 0.04510632347506467, + "grad_norm": 8.823320388793945, + "learning_rate": 1.9979072803130185e-07, + "loss": 0.037, + "step": 11290 + }, + { + "epoch": 0.04514627593164135, + "grad_norm": 3.910707950592041, + "learning_rate": 1.9979032122769367e-07, + "loss": 0.0409, + "step": 11300 + }, + { + "epoch": 0.04518622838821802, + "grad_norm": 4.888339042663574, + "learning_rate": 1.9978991402949173e-07, + "loss": 0.0379, + "step": 11310 + }, + { + "epoch": 0.0452261808447947, + "grad_norm": 6.321568965911865, + "learning_rate": 1.9978950643669758e-07, + "loss": 0.0353, + "step": 11320 + }, + { + "epoch": 0.045266133301371365, + "grad_norm": 4.3503313064575195, + "learning_rate": 1.997890984493128e-07, + "loss": 0.0394, + "step": 11330 + }, + { + "epoch": 0.04530608575794804, + "grad_norm": 7.432867527008057, + "learning_rate": 1.9978869006733912e-07, + "loss": 0.0367, + "step": 11340 + }, + { + "epoch": 0.045346038214524716, + "grad_norm": 8.711628913879395, + "learning_rate": 1.9978828129077805e-07, + "loss": 0.0457, + "step": 11350 + }, + { + "epoch": 0.04538599067110139, + "grad_norm": 9.34750747680664, + "learning_rate": 1.9978787211963123e-07, + "loss": 0.0354, + "step": 11360 + }, + { + "epoch": 0.045425943127678066, + "grad_norm": 4.978283405303955, + "learning_rate": 1.9978746255390033e-07, + "loss": 0.0354, + "step": 11370 + }, + { + "epoch": 0.045465895584254734, + "grad_norm": 4.6693878173828125, + "learning_rate": 1.9978705259358692e-07, + "loss": 0.0376, + "step": 11380 + }, + { + "epoch": 0.04550584804083141, + "grad_norm": 5.777257442474365, + "learning_rate": 1.9978664223869263e-07, + "loss": 0.0392, + "step": 11390 + }, + { + "epoch": 0.045545800497408084, + "grad_norm": 4.226143836975098, + "learning_rate": 1.9978623148921908e-07, + "loss": 0.0387, + "step": 11400 + }, + { + "epoch": 0.04558575295398476, + "grad_norm": 6.125042915344238, + "learning_rate": 1.9978582034516788e-07, + "loss": 0.0425, + "step": 11410 + }, + { + "epoch": 0.045625705410561435, + "grad_norm": 6.051574230194092, + "learning_rate": 1.9978540880654068e-07, + "loss": 0.0379, + "step": 11420 + }, + { + "epoch": 0.0456656578671381, + "grad_norm": 4.974116325378418, + "learning_rate": 1.9978499687333911e-07, + "loss": 0.0349, + "step": 11430 + }, + { + "epoch": 0.04570561032371478, + "grad_norm": 5.680203914642334, + "learning_rate": 1.997845845455648e-07, + "loss": 0.041, + "step": 11440 + }, + { + "epoch": 0.04574556278029145, + "grad_norm": 3.307708263397217, + "learning_rate": 1.9978417182321938e-07, + "loss": 0.0402, + "step": 11450 + }, + { + "epoch": 0.04578551523686813, + "grad_norm": 6.049238681793213, + "learning_rate": 1.9978375870630444e-07, + "loss": 0.0388, + "step": 11460 + }, + { + "epoch": 0.0458254676934448, + "grad_norm": 6.473019123077393, + "learning_rate": 1.9978334519482167e-07, + "loss": 0.036, + "step": 11470 + }, + { + "epoch": 0.04586542015002147, + "grad_norm": 4.761008262634277, + "learning_rate": 1.9978293128877267e-07, + "loss": 0.034, + "step": 11480 + }, + { + "epoch": 0.04590537260659815, + "grad_norm": 10.383992195129395, + "learning_rate": 1.9978251698815912e-07, + "loss": 0.0356, + "step": 11490 + }, + { + "epoch": 0.04594532506317482, + "grad_norm": 4.762767791748047, + "learning_rate": 1.997821022929826e-07, + "loss": 0.0371, + "step": 11500 + }, + { + "epoch": 0.0459852775197515, + "grad_norm": 6.442586421966553, + "learning_rate": 1.9978168720324475e-07, + "loss": 0.0408, + "step": 11510 + }, + { + "epoch": 0.04602522997632817, + "grad_norm": 4.5697021484375, + "learning_rate": 1.9978127171894728e-07, + "loss": 0.0354, + "step": 11520 + }, + { + "epoch": 0.04606518243290484, + "grad_norm": 3.78368878364563, + "learning_rate": 1.9978085584009173e-07, + "loss": 0.0369, + "step": 11530 + }, + { + "epoch": 0.046105134889481515, + "grad_norm": 7.075984477996826, + "learning_rate": 1.9978043956667988e-07, + "loss": 0.0431, + "step": 11540 + }, + { + "epoch": 0.04614508734605819, + "grad_norm": 4.776932716369629, + "learning_rate": 1.9978002289871327e-07, + "loss": 0.0404, + "step": 11550 + }, + { + "epoch": 0.046185039802634865, + "grad_norm": 3.4465811252593994, + "learning_rate": 1.9977960583619358e-07, + "loss": 0.0387, + "step": 11560 + }, + { + "epoch": 0.04622499225921154, + "grad_norm": 10.689706802368164, + "learning_rate": 1.9977918837912244e-07, + "loss": 0.0364, + "step": 11570 + }, + { + "epoch": 0.04626494471578821, + "grad_norm": 2.4716131687164307, + "learning_rate": 1.9977877052750154e-07, + "loss": 0.0393, + "step": 11580 + }, + { + "epoch": 0.046304897172364884, + "grad_norm": 3.0888442993164062, + "learning_rate": 1.997783522813325e-07, + "loss": 0.0379, + "step": 11590 + }, + { + "epoch": 0.04634484962894156, + "grad_norm": 4.148375034332275, + "learning_rate": 1.99777933640617e-07, + "loss": 0.0373, + "step": 11600 + }, + { + "epoch": 0.046384802085518234, + "grad_norm": 4.76580810546875, + "learning_rate": 1.9977751460535667e-07, + "loss": 0.0418, + "step": 11610 + }, + { + "epoch": 0.04642475454209491, + "grad_norm": 5.378291130065918, + "learning_rate": 1.9977709517555317e-07, + "loss": 0.0343, + "step": 11620 + }, + { + "epoch": 0.04646470699867158, + "grad_norm": 7.4895734786987305, + "learning_rate": 1.997766753512082e-07, + "loss": 0.0365, + "step": 11630 + }, + { + "epoch": 0.04650465945524825, + "grad_norm": 7.839143753051758, + "learning_rate": 1.9977625513232335e-07, + "loss": 0.0406, + "step": 11640 + }, + { + "epoch": 0.04654461191182493, + "grad_norm": 8.61608600616455, + "learning_rate": 1.9977583451890033e-07, + "loss": 0.0396, + "step": 11650 + }, + { + "epoch": 0.0465845643684016, + "grad_norm": 3.0282042026519775, + "learning_rate": 1.9977541351094077e-07, + "loss": 0.0344, + "step": 11660 + }, + { + "epoch": 0.04662451682497828, + "grad_norm": 2.9160585403442383, + "learning_rate": 1.997749921084464e-07, + "loss": 0.0383, + "step": 11670 + }, + { + "epoch": 0.04666446928155495, + "grad_norm": 6.0823211669921875, + "learning_rate": 1.9977457031141879e-07, + "loss": 0.0345, + "step": 11680 + }, + { + "epoch": 0.04670442173813162, + "grad_norm": 6.5396223068237305, + "learning_rate": 1.997741481198597e-07, + "loss": 0.036, + "step": 11690 + }, + { + "epoch": 0.046744374194708296, + "grad_norm": 4.976490497589111, + "learning_rate": 1.997737255337707e-07, + "loss": 0.0413, + "step": 11700 + }, + { + "epoch": 0.04678432665128497, + "grad_norm": 6.077700614929199, + "learning_rate": 1.9977330255315358e-07, + "loss": 0.0391, + "step": 11710 + }, + { + "epoch": 0.04682427910786165, + "grad_norm": 5.669658660888672, + "learning_rate": 1.9977287917800997e-07, + "loss": 0.0354, + "step": 11720 + }, + { + "epoch": 0.04686423156443832, + "grad_norm": 3.1015145778656006, + "learning_rate": 1.9977245540834146e-07, + "loss": 0.0329, + "step": 11730 + }, + { + "epoch": 0.04690418402101499, + "grad_norm": 6.432806491851807, + "learning_rate": 1.9977203124414984e-07, + "loss": 0.0343, + "step": 11740 + }, + { + "epoch": 0.046944136477591665, + "grad_norm": 4.835119247436523, + "learning_rate": 1.9977160668543671e-07, + "loss": 0.0342, + "step": 11750 + }, + { + "epoch": 0.04698408893416834, + "grad_norm": 6.6479105949401855, + "learning_rate": 1.9977118173220378e-07, + "loss": 0.0385, + "step": 11760 + }, + { + "epoch": 0.047024041390745015, + "grad_norm": 3.6159307956695557, + "learning_rate": 1.9977075638445275e-07, + "loss": 0.0388, + "step": 11770 + }, + { + "epoch": 0.04706399384732169, + "grad_norm": 4.264798641204834, + "learning_rate": 1.9977033064218528e-07, + "loss": 0.0417, + "step": 11780 + }, + { + "epoch": 0.04710394630389836, + "grad_norm": 4.906524181365967, + "learning_rate": 1.9976990450540303e-07, + "loss": 0.0343, + "step": 11790 + }, + { + "epoch": 0.047143898760475034, + "grad_norm": 17.75540542602539, + "learning_rate": 1.9976947797410772e-07, + "loss": 0.0428, + "step": 11800 + }, + { + "epoch": 0.04718385121705171, + "grad_norm": 4.956783294677734, + "learning_rate": 1.9976905104830104e-07, + "loss": 0.0389, + "step": 11810 + }, + { + "epoch": 0.047223803673628384, + "grad_norm": 3.980287790298462, + "learning_rate": 1.9976862372798467e-07, + "loss": 0.0398, + "step": 11820 + }, + { + "epoch": 0.04726375613020506, + "grad_norm": 4.459374904632568, + "learning_rate": 1.9976819601316026e-07, + "loss": 0.0388, + "step": 11830 + }, + { + "epoch": 0.04730370858678173, + "grad_norm": 4.643595218658447, + "learning_rate": 1.9976776790382954e-07, + "loss": 0.0309, + "step": 11840 + }, + { + "epoch": 0.0473436610433584, + "grad_norm": 7.059641361236572, + "learning_rate": 1.9976733939999421e-07, + "loss": 0.0417, + "step": 11850 + }, + { + "epoch": 0.04738361349993508, + "grad_norm": 10.09939193725586, + "learning_rate": 1.9976691050165596e-07, + "loss": 0.0364, + "step": 11860 + }, + { + "epoch": 0.04742356595651175, + "grad_norm": 6.750746250152588, + "learning_rate": 1.9976648120881648e-07, + "loss": 0.0379, + "step": 11870 + }, + { + "epoch": 0.04746351841308843, + "grad_norm": 4.703256130218506, + "learning_rate": 1.9976605152147747e-07, + "loss": 0.0357, + "step": 11880 + }, + { + "epoch": 0.047503470869665096, + "grad_norm": 5.061774730682373, + "learning_rate": 1.9976562143964058e-07, + "loss": 0.039, + "step": 11890 + }, + { + "epoch": 0.04754342332624177, + "grad_norm": 5.56594705581665, + "learning_rate": 1.9976519096330759e-07, + "loss": 0.0381, + "step": 11900 + }, + { + "epoch": 0.047583375782818446, + "grad_norm": 3.117969036102295, + "learning_rate": 1.9976476009248016e-07, + "loss": 0.0386, + "step": 11910 + }, + { + "epoch": 0.04762332823939512, + "grad_norm": 4.3010053634643555, + "learning_rate": 1.9976432882715998e-07, + "loss": 0.0412, + "step": 11920 + }, + { + "epoch": 0.047663280695971796, + "grad_norm": 5.136009216308594, + "learning_rate": 1.9976389716734884e-07, + "loss": 0.0379, + "step": 11930 + }, + { + "epoch": 0.047703233152548465, + "grad_norm": 4.137892246246338, + "learning_rate": 1.9976346511304833e-07, + "loss": 0.038, + "step": 11940 + }, + { + "epoch": 0.04774318560912514, + "grad_norm": 5.927318096160889, + "learning_rate": 1.9976303266426023e-07, + "loss": 0.0335, + "step": 11950 + }, + { + "epoch": 0.047783138065701815, + "grad_norm": 5.983517646789551, + "learning_rate": 1.9976259982098622e-07, + "loss": 0.0358, + "step": 11960 + }, + { + "epoch": 0.04782309052227849, + "grad_norm": 23.603527069091797, + "learning_rate": 1.9976216658322804e-07, + "loss": 0.0362, + "step": 11970 + }, + { + "epoch": 0.047863042978855165, + "grad_norm": 2.95516037940979, + "learning_rate": 1.9976173295098736e-07, + "loss": 0.0368, + "step": 11980 + }, + { + "epoch": 0.04790299543543183, + "grad_norm": 5.98943567276001, + "learning_rate": 1.9976129892426595e-07, + "loss": 0.0322, + "step": 11990 + }, + { + "epoch": 0.04794294789200851, + "grad_norm": 11.412007331848145, + "learning_rate": 1.9976086450306551e-07, + "loss": 0.0369, + "step": 12000 + }, + { + "epoch": 0.047982900348585183, + "grad_norm": 7.7495927810668945, + "learning_rate": 1.9976042968738768e-07, + "loss": 0.0357, + "step": 12010 + }, + { + "epoch": 0.04802285280516186, + "grad_norm": 3.6755735874176025, + "learning_rate": 1.997599944772343e-07, + "loss": 0.0356, + "step": 12020 + }, + { + "epoch": 0.048062805261738534, + "grad_norm": 4.69427490234375, + "learning_rate": 1.9975955887260704e-07, + "loss": 0.039, + "step": 12030 + }, + { + "epoch": 0.0481027577183152, + "grad_norm": 3.787788152694702, + "learning_rate": 1.997591228735076e-07, + "loss": 0.0369, + "step": 12040 + }, + { + "epoch": 0.04814271017489188, + "grad_norm": 5.380619049072266, + "learning_rate": 1.9975868647993772e-07, + "loss": 0.0396, + "step": 12050 + }, + { + "epoch": 0.04818266263146855, + "grad_norm": 3.5342724323272705, + "learning_rate": 1.997582496918991e-07, + "loss": 0.0377, + "step": 12060 + }, + { + "epoch": 0.04822261508804523, + "grad_norm": 2.476850986480713, + "learning_rate": 1.9975781250939353e-07, + "loss": 0.0361, + "step": 12070 + }, + { + "epoch": 0.0482625675446219, + "grad_norm": 4.320591926574707, + "learning_rate": 1.9975737493242272e-07, + "loss": 0.0416, + "step": 12080 + }, + { + "epoch": 0.04830252000119857, + "grad_norm": 3.0253970623016357, + "learning_rate": 1.9975693696098835e-07, + "loss": 0.0356, + "step": 12090 + }, + { + "epoch": 0.048342472457775246, + "grad_norm": 5.606649398803711, + "learning_rate": 1.9975649859509222e-07, + "loss": 0.036, + "step": 12100 + }, + { + "epoch": 0.04838242491435192, + "grad_norm": 4.7577900886535645, + "learning_rate": 1.99756059834736e-07, + "loss": 0.0395, + "step": 12110 + }, + { + "epoch": 0.048422377370928596, + "grad_norm": 6.534089088439941, + "learning_rate": 1.9975562067992148e-07, + "loss": 0.0441, + "step": 12120 + }, + { + "epoch": 0.04846232982750527, + "grad_norm": 3.470907211303711, + "learning_rate": 1.9975518113065033e-07, + "loss": 0.0404, + "step": 12130 + }, + { + "epoch": 0.04850228228408194, + "grad_norm": 7.684278964996338, + "learning_rate": 1.9975474118692436e-07, + "loss": 0.0366, + "step": 12140 + }, + { + "epoch": 0.048542234740658614, + "grad_norm": 7.354087829589844, + "learning_rate": 1.9975430084874528e-07, + "loss": 0.0352, + "step": 12150 + }, + { + "epoch": 0.04858218719723529, + "grad_norm": 8.517842292785645, + "learning_rate": 1.9975386011611482e-07, + "loss": 0.0405, + "step": 12160 + }, + { + "epoch": 0.048622139653811965, + "grad_norm": 3.1344072818756104, + "learning_rate": 1.9975341898903472e-07, + "loss": 0.0346, + "step": 12170 + }, + { + "epoch": 0.04866209211038864, + "grad_norm": 7.965940475463867, + "learning_rate": 1.9975297746750676e-07, + "loss": 0.0449, + "step": 12180 + }, + { + "epoch": 0.04870204456696531, + "grad_norm": 6.340517520904541, + "learning_rate": 1.9975253555153265e-07, + "loss": 0.0343, + "step": 12190 + }, + { + "epoch": 0.04874199702354198, + "grad_norm": 2.942605972290039, + "learning_rate": 1.9975209324111414e-07, + "loss": 0.0406, + "step": 12200 + }, + { + "epoch": 0.04878194948011866, + "grad_norm": 3.1229324340820312, + "learning_rate": 1.9975165053625301e-07, + "loss": 0.0372, + "step": 12210 + }, + { + "epoch": 0.04882190193669533, + "grad_norm": 3.7730588912963867, + "learning_rate": 1.9975120743695098e-07, + "loss": 0.0391, + "step": 12220 + }, + { + "epoch": 0.04886185439327201, + "grad_norm": 9.914134979248047, + "learning_rate": 1.997507639432098e-07, + "loss": 0.0385, + "step": 12230 + }, + { + "epoch": 0.048901806849848684, + "grad_norm": 3.8947882652282715, + "learning_rate": 1.9975032005503125e-07, + "loss": 0.0375, + "step": 12240 + }, + { + "epoch": 0.04894175930642535, + "grad_norm": 3.742812156677246, + "learning_rate": 1.9974987577241707e-07, + "loss": 0.0347, + "step": 12250 + }, + { + "epoch": 0.04898171176300203, + "grad_norm": 3.470212936401367, + "learning_rate": 1.9974943109536903e-07, + "loss": 0.037, + "step": 12260 + }, + { + "epoch": 0.0490216642195787, + "grad_norm": 3.873788356781006, + "learning_rate": 1.9974898602388884e-07, + "loss": 0.0373, + "step": 12270 + }, + { + "epoch": 0.04906161667615538, + "grad_norm": 7.522433280944824, + "learning_rate": 1.9974854055797832e-07, + "loss": 0.0384, + "step": 12280 + }, + { + "epoch": 0.04910156913273205, + "grad_norm": 5.251286506652832, + "learning_rate": 1.9974809469763922e-07, + "loss": 0.0422, + "step": 12290 + }, + { + "epoch": 0.04914152158930872, + "grad_norm": 3.5757033824920654, + "learning_rate": 1.9974764844287326e-07, + "loss": 0.0316, + "step": 12300 + }, + { + "epoch": 0.049181474045885396, + "grad_norm": 5.1237945556640625, + "learning_rate": 1.9974720179368224e-07, + "loss": 0.035, + "step": 12310 + }, + { + "epoch": 0.04922142650246207, + "grad_norm": 4.135406494140625, + "learning_rate": 1.9974675475006793e-07, + "loss": 0.0399, + "step": 12320 + }, + { + "epoch": 0.049261378959038746, + "grad_norm": 4.7161760330200195, + "learning_rate": 1.997463073120321e-07, + "loss": 0.0404, + "step": 12330 + }, + { + "epoch": 0.04930133141561542, + "grad_norm": 6.862948417663574, + "learning_rate": 1.997458594795765e-07, + "loss": 0.0369, + "step": 12340 + }, + { + "epoch": 0.04934128387219209, + "grad_norm": 8.0114164352417, + "learning_rate": 1.9974541125270288e-07, + "loss": 0.0346, + "step": 12350 + }, + { + "epoch": 0.049381236328768764, + "grad_norm": 5.246989727020264, + "learning_rate": 1.9974496263141307e-07, + "loss": 0.0324, + "step": 12360 + }, + { + "epoch": 0.04942118878534544, + "grad_norm": 5.514709949493408, + "learning_rate": 1.997445136157088e-07, + "loss": 0.0326, + "step": 12370 + }, + { + "epoch": 0.049461141241922114, + "grad_norm": 3.7838997840881348, + "learning_rate": 1.997440642055919e-07, + "loss": 0.0357, + "step": 12380 + }, + { + "epoch": 0.04950109369849879, + "grad_norm": 8.688282012939453, + "learning_rate": 1.9974361440106406e-07, + "loss": 0.0385, + "step": 12390 + }, + { + "epoch": 0.04954104615507546, + "grad_norm": 6.945598125457764, + "learning_rate": 1.9974316420212713e-07, + "loss": 0.0329, + "step": 12400 + }, + { + "epoch": 0.04958099861165213, + "grad_norm": 10.362908363342285, + "learning_rate": 1.9974271360878286e-07, + "loss": 0.032, + "step": 12410 + }, + { + "epoch": 0.04962095106822881, + "grad_norm": 7.576197147369385, + "learning_rate": 1.9974226262103303e-07, + "loss": 0.0389, + "step": 12420 + }, + { + "epoch": 0.04966090352480548, + "grad_norm": 6.97221565246582, + "learning_rate": 1.9974181123887946e-07, + "loss": 0.0367, + "step": 12430 + }, + { + "epoch": 0.04970085598138216, + "grad_norm": 9.38051700592041, + "learning_rate": 1.997413594623239e-07, + "loss": 0.0366, + "step": 12440 + }, + { + "epoch": 0.049740808437958826, + "grad_norm": 7.921683311462402, + "learning_rate": 1.997409072913681e-07, + "loss": 0.0389, + "step": 12450 + }, + { + "epoch": 0.0497807608945355, + "grad_norm": 6.253096580505371, + "learning_rate": 1.9974045472601393e-07, + "loss": 0.0396, + "step": 12460 + }, + { + "epoch": 0.04982071335111218, + "grad_norm": 5.214062213897705, + "learning_rate": 1.997400017662631e-07, + "loss": 0.0395, + "step": 12470 + }, + { + "epoch": 0.04986066580768885, + "grad_norm": 3.473515510559082, + "learning_rate": 1.9973954841211752e-07, + "loss": 0.036, + "step": 12480 + }, + { + "epoch": 0.04990061826426553, + "grad_norm": 6.88945198059082, + "learning_rate": 1.9973909466357883e-07, + "loss": 0.0388, + "step": 12490 + }, + { + "epoch": 0.049940570720842195, + "grad_norm": 6.99591588973999, + "learning_rate": 1.9973864052064892e-07, + "loss": 0.037, + "step": 12500 + }, + { + "epoch": 0.04998052317741887, + "grad_norm": 4.152871608734131, + "learning_rate": 1.997381859833296e-07, + "loss": 0.036, + "step": 12510 + }, + { + "epoch": 0.050020475633995545, + "grad_norm": 9.676370620727539, + "learning_rate": 1.9973773105162257e-07, + "loss": 0.0356, + "step": 12520 + }, + { + "epoch": 0.05006042809057222, + "grad_norm": 5.482659339904785, + "learning_rate": 1.9973727572552975e-07, + "loss": 0.0372, + "step": 12530 + }, + { + "epoch": 0.050100380547148896, + "grad_norm": 4.8499603271484375, + "learning_rate": 1.9973682000505284e-07, + "loss": 0.0357, + "step": 12540 + }, + { + "epoch": 0.050140333003725564, + "grad_norm": 4.6550068855285645, + "learning_rate": 1.9973636389019367e-07, + "loss": 0.0379, + "step": 12550 + }, + { + "epoch": 0.05018028546030224, + "grad_norm": 5.952493190765381, + "learning_rate": 1.997359073809541e-07, + "loss": 0.0343, + "step": 12560 + }, + { + "epoch": 0.050220237916878914, + "grad_norm": 6.382560729980469, + "learning_rate": 1.9973545047733586e-07, + "loss": 0.0358, + "step": 12570 + }, + { + "epoch": 0.05026019037345559, + "grad_norm": 6.9215168952941895, + "learning_rate": 1.9973499317934079e-07, + "loss": 0.0311, + "step": 12580 + }, + { + "epoch": 0.050300142830032264, + "grad_norm": 5.659091949462891, + "learning_rate": 1.9973453548697072e-07, + "loss": 0.0413, + "step": 12590 + }, + { + "epoch": 0.05034009528660893, + "grad_norm": 9.18416976928711, + "learning_rate": 1.997340774002274e-07, + "loss": 0.0423, + "step": 12600 + }, + { + "epoch": 0.05038004774318561, + "grad_norm": 3.009695529937744, + "learning_rate": 1.9973361891911272e-07, + "loss": 0.0367, + "step": 12610 + }, + { + "epoch": 0.05042000019976228, + "grad_norm": 4.980035305023193, + "learning_rate": 1.9973316004362845e-07, + "loss": 0.038, + "step": 12620 + }, + { + "epoch": 0.05045995265633896, + "grad_norm": 7.780754566192627, + "learning_rate": 1.9973270077377638e-07, + "loss": 0.0367, + "step": 12630 + }, + { + "epoch": 0.05049990511291563, + "grad_norm": 8.3926362991333, + "learning_rate": 1.9973224110955834e-07, + "loss": 0.0341, + "step": 12640 + }, + { + "epoch": 0.0505398575694923, + "grad_norm": 3.6425817012786865, + "learning_rate": 1.997317810509762e-07, + "loss": 0.037, + "step": 12650 + }, + { + "epoch": 0.050579810026068976, + "grad_norm": 4.722818851470947, + "learning_rate": 1.9973132059803168e-07, + "loss": 0.0336, + "step": 12660 + }, + { + "epoch": 0.05061976248264565, + "grad_norm": 7.417166233062744, + "learning_rate": 1.997308597507267e-07, + "loss": 0.0358, + "step": 12670 + }, + { + "epoch": 0.050659714939222326, + "grad_norm": 3.593705892562866, + "learning_rate": 1.9973039850906303e-07, + "loss": 0.0407, + "step": 12680 + }, + { + "epoch": 0.050699667395799, + "grad_norm": 4.970849514007568, + "learning_rate": 1.997299368730425e-07, + "loss": 0.0374, + "step": 12690 + }, + { + "epoch": 0.05073961985237567, + "grad_norm": 3.0160117149353027, + "learning_rate": 1.9972947484266697e-07, + "loss": 0.0366, + "step": 12700 + }, + { + "epoch": 0.050779572308952345, + "grad_norm": 5.4482102394104, + "learning_rate": 1.9972901241793822e-07, + "loss": 0.035, + "step": 12710 + }, + { + "epoch": 0.05081952476552902, + "grad_norm": 3.4361727237701416, + "learning_rate": 1.997285495988581e-07, + "loss": 0.0378, + "step": 12720 + }, + { + "epoch": 0.050859477222105695, + "grad_norm": 4.0484514236450195, + "learning_rate": 1.9972808638542841e-07, + "loss": 0.0302, + "step": 12730 + }, + { + "epoch": 0.05089942967868237, + "grad_norm": 5.491161823272705, + "learning_rate": 1.9972762277765103e-07, + "loss": 0.0384, + "step": 12740 + }, + { + "epoch": 0.05093938213525904, + "grad_norm": 2.9985287189483643, + "learning_rate": 1.9972715877552777e-07, + "loss": 0.035, + "step": 12750 + }, + { + "epoch": 0.050979334591835714, + "grad_norm": 3.813697338104248, + "learning_rate": 1.9972669437906048e-07, + "loss": 0.037, + "step": 12760 + }, + { + "epoch": 0.05101928704841239, + "grad_norm": 2.8480942249298096, + "learning_rate": 1.9972622958825097e-07, + "loss": 0.0414, + "step": 12770 + }, + { + "epoch": 0.051059239504989064, + "grad_norm": 8.141436576843262, + "learning_rate": 1.9972576440310104e-07, + "loss": 0.0357, + "step": 12780 + }, + { + "epoch": 0.05109919196156574, + "grad_norm": 4.836388111114502, + "learning_rate": 1.9972529882361266e-07, + "loss": 0.0358, + "step": 12790 + }, + { + "epoch": 0.05113914441814241, + "grad_norm": 5.266736030578613, + "learning_rate": 1.9972483284978755e-07, + "loss": 0.0344, + "step": 12800 + }, + { + "epoch": 0.05117909687471908, + "grad_norm": 6.34413480758667, + "learning_rate": 1.997243664816276e-07, + "loss": 0.036, + "step": 12810 + }, + { + "epoch": 0.05121904933129576, + "grad_norm": 7.060769557952881, + "learning_rate": 1.9972389971913462e-07, + "loss": 0.0337, + "step": 12820 + }, + { + "epoch": 0.05125900178787243, + "grad_norm": 5.362131595611572, + "learning_rate": 1.997234325623105e-07, + "loss": 0.0405, + "step": 12830 + }, + { + "epoch": 0.05129895424444911, + "grad_norm": 4.519298076629639, + "learning_rate": 1.9972296501115712e-07, + "loss": 0.0376, + "step": 12840 + }, + { + "epoch": 0.05133890670102578, + "grad_norm": 4.554305553436279, + "learning_rate": 1.997224970656762e-07, + "loss": 0.0385, + "step": 12850 + }, + { + "epoch": 0.05137885915760245, + "grad_norm": 7.060479640960693, + "learning_rate": 1.9972202872586972e-07, + "loss": 0.0364, + "step": 12860 + }, + { + "epoch": 0.051418811614179126, + "grad_norm": 6.606038570404053, + "learning_rate": 1.9972155999173946e-07, + "loss": 0.0355, + "step": 12870 + }, + { + "epoch": 0.0514587640707558, + "grad_norm": 6.979849815368652, + "learning_rate": 1.9972109086328727e-07, + "loss": 0.04, + "step": 12880 + }, + { + "epoch": 0.051498716527332476, + "grad_norm": 6.28257417678833, + "learning_rate": 1.9972062134051507e-07, + "loss": 0.0355, + "step": 12890 + }, + { + "epoch": 0.05153866898390915, + "grad_norm": 6.2827982902526855, + "learning_rate": 1.9972015142342466e-07, + "loss": 0.0428, + "step": 12900 + }, + { + "epoch": 0.05157862144048582, + "grad_norm": 4.069390296936035, + "learning_rate": 1.9971968111201794e-07, + "loss": 0.0416, + "step": 12910 + }, + { + "epoch": 0.051618573897062495, + "grad_norm": 5.422885417938232, + "learning_rate": 1.997192104062967e-07, + "loss": 0.0373, + "step": 12920 + }, + { + "epoch": 0.05165852635363917, + "grad_norm": 11.66561222076416, + "learning_rate": 1.997187393062629e-07, + "loss": 0.04, + "step": 12930 + }, + { + "epoch": 0.051698478810215845, + "grad_norm": 7.8622212409973145, + "learning_rate": 1.997182678119183e-07, + "loss": 0.0375, + "step": 12940 + }, + { + "epoch": 0.05173843126679252, + "grad_norm": 5.40322208404541, + "learning_rate": 1.9971779592326481e-07, + "loss": 0.0375, + "step": 12950 + }, + { + "epoch": 0.05177838372336919, + "grad_norm": 6.207991600036621, + "learning_rate": 1.9971732364030433e-07, + "loss": 0.0357, + "step": 12960 + }, + { + "epoch": 0.05181833617994586, + "grad_norm": 4.366454124450684, + "learning_rate": 1.997168509630387e-07, + "loss": 0.0361, + "step": 12970 + }, + { + "epoch": 0.05185828863652254, + "grad_norm": 5.509269714355469, + "learning_rate": 1.9971637789146976e-07, + "loss": 0.0355, + "step": 12980 + }, + { + "epoch": 0.051898241093099214, + "grad_norm": 2.7901713848114014, + "learning_rate": 1.997159044255994e-07, + "loss": 0.0326, + "step": 12990 + }, + { + "epoch": 0.05193819354967589, + "grad_norm": 6.905604362487793, + "learning_rate": 1.9971543056542952e-07, + "loss": 0.0317, + "step": 13000 + }, + { + "epoch": 0.05197814600625256, + "grad_norm": 7.98009729385376, + "learning_rate": 1.9971495631096195e-07, + "loss": 0.0363, + "step": 13010 + }, + { + "epoch": 0.05201809846282923, + "grad_norm": 3.2150604724884033, + "learning_rate": 1.997144816621986e-07, + "loss": 0.0345, + "step": 13020 + }, + { + "epoch": 0.05205805091940591, + "grad_norm": 7.531830310821533, + "learning_rate": 1.9971400661914136e-07, + "loss": 0.0324, + "step": 13030 + }, + { + "epoch": 0.05209800337598258, + "grad_norm": 4.781637668609619, + "learning_rate": 1.9971353118179204e-07, + "loss": 0.0378, + "step": 13040 + }, + { + "epoch": 0.05213795583255926, + "grad_norm": 3.7450222969055176, + "learning_rate": 1.9971305535015257e-07, + "loss": 0.0367, + "step": 13050 + }, + { + "epoch": 0.052177908289135926, + "grad_norm": 2.6890316009521484, + "learning_rate": 1.9971257912422484e-07, + "loss": 0.0332, + "step": 13060 + }, + { + "epoch": 0.0522178607457126, + "grad_norm": 6.244346618652344, + "learning_rate": 1.997121025040107e-07, + "loss": 0.0373, + "step": 13070 + }, + { + "epoch": 0.052257813202289276, + "grad_norm": 8.633000373840332, + "learning_rate": 1.9971162548951206e-07, + "loss": 0.0379, + "step": 13080 + }, + { + "epoch": 0.05229776565886595, + "grad_norm": 3.197789430618286, + "learning_rate": 1.997111480807308e-07, + "loss": 0.0359, + "step": 13090 + }, + { + "epoch": 0.052337718115442626, + "grad_norm": 5.881928443908691, + "learning_rate": 1.9971067027766878e-07, + "loss": 0.0334, + "step": 13100 + }, + { + "epoch": 0.052377670572019294, + "grad_norm": 4.695040225982666, + "learning_rate": 1.9971019208032796e-07, + "loss": 0.0379, + "step": 13110 + }, + { + "epoch": 0.05241762302859597, + "grad_norm": 6.324784278869629, + "learning_rate": 1.9970971348871016e-07, + "loss": 0.0304, + "step": 13120 + }, + { + "epoch": 0.052457575485172644, + "grad_norm": 3.8689091205596924, + "learning_rate": 1.997092345028173e-07, + "loss": 0.0346, + "step": 13130 + }, + { + "epoch": 0.05249752794174932, + "grad_norm": 7.394871711730957, + "learning_rate": 1.997087551226513e-07, + "loss": 0.0371, + "step": 13140 + }, + { + "epoch": 0.052537480398325995, + "grad_norm": 5.249591827392578, + "learning_rate": 1.9970827534821397e-07, + "loss": 0.0358, + "step": 13150 + }, + { + "epoch": 0.05257743285490266, + "grad_norm": 3.8783648014068604, + "learning_rate": 1.997077951795073e-07, + "loss": 0.0352, + "step": 13160 + }, + { + "epoch": 0.05261738531147934, + "grad_norm": 4.973453044891357, + "learning_rate": 1.9970731461653316e-07, + "loss": 0.0373, + "step": 13170 + }, + { + "epoch": 0.05265733776805601, + "grad_norm": 4.340676307678223, + "learning_rate": 1.9970683365929345e-07, + "loss": 0.0346, + "step": 13180 + }, + { + "epoch": 0.05269729022463269, + "grad_norm": 2.8319714069366455, + "learning_rate": 1.9970635230779002e-07, + "loss": 0.0375, + "step": 13190 + }, + { + "epoch": 0.05273724268120936, + "grad_norm": 6.987737655639648, + "learning_rate": 1.9970587056202484e-07, + "loss": 0.0322, + "step": 13200 + }, + { + "epoch": 0.05277719513778603, + "grad_norm": 5.344844818115234, + "learning_rate": 1.9970538842199981e-07, + "loss": 0.0301, + "step": 13210 + }, + { + "epoch": 0.05281714759436271, + "grad_norm": 34.78802490234375, + "learning_rate": 1.9970490588771681e-07, + "loss": 0.0409, + "step": 13220 + }, + { + "epoch": 0.05285710005093938, + "grad_norm": 6.65936279296875, + "learning_rate": 1.9970442295917778e-07, + "loss": 0.036, + "step": 13230 + }, + { + "epoch": 0.05289705250751606, + "grad_norm": 5.675324440002441, + "learning_rate": 1.9970393963638458e-07, + "loss": 0.0381, + "step": 13240 + }, + { + "epoch": 0.05293700496409273, + "grad_norm": 8.33770751953125, + "learning_rate": 1.9970345591933918e-07, + "loss": 0.0349, + "step": 13250 + }, + { + "epoch": 0.0529769574206694, + "grad_norm": 5.794460773468018, + "learning_rate": 1.997029718080434e-07, + "loss": 0.0337, + "step": 13260 + }, + { + "epoch": 0.053016909877246075, + "grad_norm": 9.829218864440918, + "learning_rate": 1.9970248730249928e-07, + "loss": 0.0391, + "step": 13270 + }, + { + "epoch": 0.05305686233382275, + "grad_norm": 6.669118404388428, + "learning_rate": 1.9970200240270864e-07, + "loss": 0.0328, + "step": 13280 + }, + { + "epoch": 0.053096814790399426, + "grad_norm": 3.6074411869049072, + "learning_rate": 1.9970151710867345e-07, + "loss": 0.0315, + "step": 13290 + }, + { + "epoch": 0.0531367672469761, + "grad_norm": 7.18286657333374, + "learning_rate": 1.997010314203956e-07, + "loss": 0.0309, + "step": 13300 + }, + { + "epoch": 0.05317671970355277, + "grad_norm": 6.009220600128174, + "learning_rate": 1.99700545337877e-07, + "loss": 0.0356, + "step": 13310 + }, + { + "epoch": 0.053216672160129444, + "grad_norm": 4.894280910491943, + "learning_rate": 1.9970005886111963e-07, + "loss": 0.0336, + "step": 13320 + }, + { + "epoch": 0.05325662461670612, + "grad_norm": 262.32501220703125, + "learning_rate": 1.9969957199012535e-07, + "loss": 0.0348, + "step": 13330 + }, + { + "epoch": 0.053296577073282794, + "grad_norm": 7.926087379455566, + "learning_rate": 1.996990847248961e-07, + "loss": 0.0353, + "step": 13340 + }, + { + "epoch": 0.05333652952985947, + "grad_norm": 7.204603672027588, + "learning_rate": 1.9969859706543384e-07, + "loss": 0.0358, + "step": 13350 + }, + { + "epoch": 0.05337648198643614, + "grad_norm": 5.633981704711914, + "learning_rate": 1.9969810901174045e-07, + "loss": 0.0412, + "step": 13360 + }, + { + "epoch": 0.05341643444301281, + "grad_norm": 3.706915855407715, + "learning_rate": 1.9969762056381792e-07, + "loss": 0.034, + "step": 13370 + }, + { + "epoch": 0.05345638689958949, + "grad_norm": 7.230068206787109, + "learning_rate": 1.9969713172166812e-07, + "loss": 0.0343, + "step": 13380 + }, + { + "epoch": 0.05349633935616616, + "grad_norm": 3.870473861694336, + "learning_rate": 1.99696642485293e-07, + "loss": 0.0386, + "step": 13390 + }, + { + "epoch": 0.05353629181274284, + "grad_norm": 3.2633185386657715, + "learning_rate": 1.9969615285469453e-07, + "loss": 0.0379, + "step": 13400 + }, + { + "epoch": 0.05357624426931951, + "grad_norm": 10.95995807647705, + "learning_rate": 1.9969566282987464e-07, + "loss": 0.0375, + "step": 13410 + }, + { + "epoch": 0.05361619672589618, + "grad_norm": 6.077620506286621, + "learning_rate": 1.996951724108352e-07, + "loss": 0.0345, + "step": 13420 + }, + { + "epoch": 0.053656149182472856, + "grad_norm": 6.804446220397949, + "learning_rate": 1.9969468159757822e-07, + "loss": 0.0362, + "step": 13430 + }, + { + "epoch": 0.05369610163904953, + "grad_norm": 4.145209789276123, + "learning_rate": 1.996941903901056e-07, + "loss": 0.0381, + "step": 13440 + }, + { + "epoch": 0.05373605409562621, + "grad_norm": 6.543659210205078, + "learning_rate": 1.996936987884193e-07, + "loss": 0.0366, + "step": 13450 + }, + { + "epoch": 0.05377600655220288, + "grad_norm": 5.165908336639404, + "learning_rate": 1.996932067925213e-07, + "loss": 0.0413, + "step": 13460 + }, + { + "epoch": 0.05381595900877955, + "grad_norm": 5.810034275054932, + "learning_rate": 1.9969271440241348e-07, + "loss": 0.0396, + "step": 13470 + }, + { + "epoch": 0.053855911465356225, + "grad_norm": 4.806185722351074, + "learning_rate": 1.996922216180978e-07, + "loss": 0.0362, + "step": 13480 + }, + { + "epoch": 0.0538958639219329, + "grad_norm": 5.146542549133301, + "learning_rate": 1.9969172843957628e-07, + "loss": 0.035, + "step": 13490 + }, + { + "epoch": 0.053935816378509575, + "grad_norm": 3.219454765319824, + "learning_rate": 1.9969123486685078e-07, + "loss": 0.0351, + "step": 13500 + }, + { + "epoch": 0.05397576883508625, + "grad_norm": 8.781963348388672, + "learning_rate": 1.996907408999233e-07, + "loss": 0.0348, + "step": 13510 + }, + { + "epoch": 0.05401572129166292, + "grad_norm": 7.23284387588501, + "learning_rate": 1.9969024653879573e-07, + "loss": 0.0359, + "step": 13520 + }, + { + "epoch": 0.054055673748239594, + "grad_norm": 7.608070373535156, + "learning_rate": 1.9968975178347015e-07, + "loss": 0.0383, + "step": 13530 + }, + { + "epoch": 0.05409562620481627, + "grad_norm": 3.938473701477051, + "learning_rate": 1.9968925663394837e-07, + "loss": 0.0378, + "step": 13540 + }, + { + "epoch": 0.054135578661392944, + "grad_norm": 5.25358772277832, + "learning_rate": 1.9968876109023248e-07, + "loss": 0.0375, + "step": 13550 + }, + { + "epoch": 0.05417553111796962, + "grad_norm": 4.876136302947998, + "learning_rate": 1.9968826515232436e-07, + "loss": 0.0357, + "step": 13560 + }, + { + "epoch": 0.05421548357454629, + "grad_norm": 6.4702534675598145, + "learning_rate": 1.9968776882022596e-07, + "loss": 0.0403, + "step": 13570 + }, + { + "epoch": 0.05425543603112296, + "grad_norm": 3.7794556617736816, + "learning_rate": 1.9968727209393933e-07, + "loss": 0.0344, + "step": 13580 + }, + { + "epoch": 0.05429538848769964, + "grad_norm": 5.49770450592041, + "learning_rate": 1.996867749734663e-07, + "loss": 0.0416, + "step": 13590 + }, + { + "epoch": 0.05433534094427631, + "grad_norm": 5.44439697265625, + "learning_rate": 1.9968627745880898e-07, + "loss": 0.0366, + "step": 13600 + }, + { + "epoch": 0.05437529340085299, + "grad_norm": 2.1642513275146484, + "learning_rate": 1.9968577954996923e-07, + "loss": 0.0339, + "step": 13610 + }, + { + "epoch": 0.054415245857429656, + "grad_norm": 4.368811130523682, + "learning_rate": 1.9968528124694909e-07, + "loss": 0.0371, + "step": 13620 + }, + { + "epoch": 0.05445519831400633, + "grad_norm": 4.706299304962158, + "learning_rate": 1.9968478254975046e-07, + "loss": 0.0387, + "step": 13630 + }, + { + "epoch": 0.054495150770583006, + "grad_norm": 2.8007113933563232, + "learning_rate": 1.9968428345837537e-07, + "loss": 0.0383, + "step": 13640 + }, + { + "epoch": 0.05453510322715968, + "grad_norm": 3.770853042602539, + "learning_rate": 1.996837839728258e-07, + "loss": 0.0319, + "step": 13650 + }, + { + "epoch": 0.054575055683736357, + "grad_norm": 5.124091625213623, + "learning_rate": 1.9968328409310366e-07, + "loss": 0.0356, + "step": 13660 + }, + { + "epoch": 0.054615008140313025, + "grad_norm": 4.929914951324463, + "learning_rate": 1.99682783819211e-07, + "loss": 0.0374, + "step": 13670 + }, + { + "epoch": 0.0546549605968897, + "grad_norm": 3.209024429321289, + "learning_rate": 1.9968228315114974e-07, + "loss": 0.0361, + "step": 13680 + }, + { + "epoch": 0.054694913053466375, + "grad_norm": 5.6610283851623535, + "learning_rate": 1.9968178208892187e-07, + "loss": 0.0352, + "step": 13690 + }, + { + "epoch": 0.05473486551004305, + "grad_norm": 4.797921657562256, + "learning_rate": 1.9968128063252943e-07, + "loss": 0.0407, + "step": 13700 + }, + { + "epoch": 0.054774817966619725, + "grad_norm": 5.222414970397949, + "learning_rate": 1.9968077878197435e-07, + "loss": 0.0379, + "step": 13710 + }, + { + "epoch": 0.05481477042319639, + "grad_norm": 3.351482391357422, + "learning_rate": 1.996802765372586e-07, + "loss": 0.0385, + "step": 13720 + }, + { + "epoch": 0.05485472287977307, + "grad_norm": 6.8207855224609375, + "learning_rate": 1.9967977389838418e-07, + "loss": 0.0381, + "step": 13730 + }, + { + "epoch": 0.054894675336349744, + "grad_norm": 6.319904327392578, + "learning_rate": 1.996792708653531e-07, + "loss": 0.0358, + "step": 13740 + }, + { + "epoch": 0.05493462779292642, + "grad_norm": 4.442610740661621, + "learning_rate": 1.9967876743816734e-07, + "loss": 0.0335, + "step": 13750 + }, + { + "epoch": 0.054974580249503094, + "grad_norm": 4.605607986450195, + "learning_rate": 1.996782636168289e-07, + "loss": 0.0326, + "step": 13760 + }, + { + "epoch": 0.05501453270607976, + "grad_norm": 4.1917643547058105, + "learning_rate": 1.9967775940133972e-07, + "loss": 0.0327, + "step": 13770 + }, + { + "epoch": 0.05505448516265644, + "grad_norm": 5.950768947601318, + "learning_rate": 1.9967725479170189e-07, + "loss": 0.0352, + "step": 13780 + }, + { + "epoch": 0.05509443761923311, + "grad_norm": 11.662089347839355, + "learning_rate": 1.9967674978791728e-07, + "loss": 0.0326, + "step": 13790 + }, + { + "epoch": 0.05513439007580979, + "grad_norm": 3.357208251953125, + "learning_rate": 1.9967624438998798e-07, + "loss": 0.0288, + "step": 13800 + }, + { + "epoch": 0.05517434253238646, + "grad_norm": 6.691584587097168, + "learning_rate": 1.9967573859791597e-07, + "loss": 0.0341, + "step": 13810 + }, + { + "epoch": 0.05521429498896313, + "grad_norm": 9.418229103088379, + "learning_rate": 1.9967523241170326e-07, + "loss": 0.0345, + "step": 13820 + }, + { + "epoch": 0.055254247445539806, + "grad_norm": 5.668757915496826, + "learning_rate": 1.996747258313518e-07, + "loss": 0.0337, + "step": 13830 + }, + { + "epoch": 0.05529419990211648, + "grad_norm": 3.9080021381378174, + "learning_rate": 1.9967421885686365e-07, + "loss": 0.0392, + "step": 13840 + }, + { + "epoch": 0.055334152358693156, + "grad_norm": 3.4337520599365234, + "learning_rate": 1.996737114882408e-07, + "loss": 0.0335, + "step": 13850 + }, + { + "epoch": 0.05537410481526983, + "grad_norm": 7.439955711364746, + "learning_rate": 1.9967320372548521e-07, + "loss": 0.0392, + "step": 13860 + }, + { + "epoch": 0.0554140572718465, + "grad_norm": 3.6607344150543213, + "learning_rate": 1.9967269556859895e-07, + "loss": 0.0341, + "step": 13870 + }, + { + "epoch": 0.055454009728423174, + "grad_norm": 7.416835308074951, + "learning_rate": 1.99672187017584e-07, + "loss": 0.0298, + "step": 13880 + }, + { + "epoch": 0.05549396218499985, + "grad_norm": 7.481778621673584, + "learning_rate": 1.9967167807244237e-07, + "loss": 0.0317, + "step": 13890 + }, + { + "epoch": 0.055533914641576525, + "grad_norm": 33.536319732666016, + "learning_rate": 1.9967116873317612e-07, + "loss": 0.041, + "step": 13900 + }, + { + "epoch": 0.0555738670981532, + "grad_norm": 5.172069072723389, + "learning_rate": 1.9967065899978718e-07, + "loss": 0.036, + "step": 13910 + }, + { + "epoch": 0.05561381955472987, + "grad_norm": 9.00122356414795, + "learning_rate": 1.9967014887227762e-07, + "loss": 0.0323, + "step": 13920 + }, + { + "epoch": 0.05565377201130654, + "grad_norm": 5.451925277709961, + "learning_rate": 1.9966963835064946e-07, + "loss": 0.0348, + "step": 13930 + }, + { + "epoch": 0.05569372446788322, + "grad_norm": 3.4193506240844727, + "learning_rate": 1.996691274349047e-07, + "loss": 0.0389, + "step": 13940 + }, + { + "epoch": 0.05573367692445989, + "grad_norm": 3.705003499984741, + "learning_rate": 1.9966861612504535e-07, + "loss": 0.0361, + "step": 13950 + }, + { + "epoch": 0.05577362938103657, + "grad_norm": 3.6168766021728516, + "learning_rate": 1.9966810442107346e-07, + "loss": 0.033, + "step": 13960 + }, + { + "epoch": 0.055813581837613244, + "grad_norm": 4.367231845855713, + "learning_rate": 1.9966759232299104e-07, + "loss": 0.0386, + "step": 13970 + }, + { + "epoch": 0.05585353429418991, + "grad_norm": 5.673027515411377, + "learning_rate": 1.996670798308001e-07, + "loss": 0.0362, + "step": 13980 + }, + { + "epoch": 0.05589348675076659, + "grad_norm": 3.916266441345215, + "learning_rate": 1.996665669445027e-07, + "loss": 0.0367, + "step": 13990 + }, + { + "epoch": 0.05593343920734326, + "grad_norm": 2.267015218734741, + "learning_rate": 1.9966605366410082e-07, + "loss": 0.0328, + "step": 14000 + }, + { + "epoch": 0.05597339166391994, + "grad_norm": 6.4910478591918945, + "learning_rate": 1.9966553998959654e-07, + "loss": 0.0336, + "step": 14010 + }, + { + "epoch": 0.05601334412049661, + "grad_norm": 6.011497497558594, + "learning_rate": 1.9966502592099186e-07, + "loss": 0.0312, + "step": 14020 + }, + { + "epoch": 0.05605329657707328, + "grad_norm": 5.3140034675598145, + "learning_rate": 1.9966451145828885e-07, + "loss": 0.0387, + "step": 14030 + }, + { + "epoch": 0.056093249033649956, + "grad_norm": 2.892942428588867, + "learning_rate": 1.996639966014895e-07, + "loss": 0.0351, + "step": 14040 + }, + { + "epoch": 0.05613320149022663, + "grad_norm": 2.1699140071868896, + "learning_rate": 1.9966348135059586e-07, + "loss": 0.0281, + "step": 14050 + }, + { + "epoch": 0.056173153946803306, + "grad_norm": 5.678905487060547, + "learning_rate": 1.9966296570560996e-07, + "loss": 0.0384, + "step": 14060 + }, + { + "epoch": 0.05621310640337998, + "grad_norm": 7.470632076263428, + "learning_rate": 1.9966244966653387e-07, + "loss": 0.0357, + "step": 14070 + }, + { + "epoch": 0.05625305885995665, + "grad_norm": 5.191848278045654, + "learning_rate": 1.996619332333696e-07, + "loss": 0.0349, + "step": 14080 + }, + { + "epoch": 0.056293011316533324, + "grad_norm": 4.030303955078125, + "learning_rate": 1.9966141640611921e-07, + "loss": 0.0374, + "step": 14090 + }, + { + "epoch": 0.05633296377311, + "grad_norm": 4.73069429397583, + "learning_rate": 1.9966089918478475e-07, + "loss": 0.035, + "step": 14100 + }, + { + "epoch": 0.056372916229686675, + "grad_norm": 6.2693963050842285, + "learning_rate": 1.9966038156936822e-07, + "loss": 0.0365, + "step": 14110 + }, + { + "epoch": 0.05641286868626335, + "grad_norm": 4.869785785675049, + "learning_rate": 1.9965986355987171e-07, + "loss": 0.0361, + "step": 14120 + }, + { + "epoch": 0.05645282114284002, + "grad_norm": 4.965143203735352, + "learning_rate": 1.9965934515629725e-07, + "loss": 0.0324, + "step": 14130 + }, + { + "epoch": 0.05649277359941669, + "grad_norm": 4.937821388244629, + "learning_rate": 1.996588263586469e-07, + "loss": 0.0339, + "step": 14140 + }, + { + "epoch": 0.05653272605599337, + "grad_norm": 3.653012752532959, + "learning_rate": 1.9965830716692272e-07, + "loss": 0.0422, + "step": 14150 + }, + { + "epoch": 0.05657267851257004, + "grad_norm": 2.6135001182556152, + "learning_rate": 1.9965778758112672e-07, + "loss": 0.0356, + "step": 14160 + }, + { + "epoch": 0.05661263096914672, + "grad_norm": 5.278318405151367, + "learning_rate": 1.9965726760126103e-07, + "loss": 0.0384, + "step": 14170 + }, + { + "epoch": 0.056652583425723387, + "grad_norm": 3.540865659713745, + "learning_rate": 1.9965674722732764e-07, + "loss": 0.0355, + "step": 14180 + }, + { + "epoch": 0.05669253588230006, + "grad_norm": 3.5045857429504395, + "learning_rate": 1.9965622645932863e-07, + "loss": 0.0301, + "step": 14190 + }, + { + "epoch": 0.05673248833887674, + "grad_norm": 37.530643463134766, + "learning_rate": 1.9965570529726604e-07, + "loss": 0.0346, + "step": 14200 + }, + { + "epoch": 0.05677244079545341, + "grad_norm": 7.336424350738525, + "learning_rate": 1.9965518374114197e-07, + "loss": 0.0344, + "step": 14210 + }, + { + "epoch": 0.05681239325203009, + "grad_norm": 5.6113667488098145, + "learning_rate": 1.9965466179095848e-07, + "loss": 0.0326, + "step": 14220 + }, + { + "epoch": 0.056852345708606755, + "grad_norm": 9.114960670471191, + "learning_rate": 1.9965413944671755e-07, + "loss": 0.0285, + "step": 14230 + }, + { + "epoch": 0.05689229816518343, + "grad_norm": 6.307765483856201, + "learning_rate": 1.9965361670842138e-07, + "loss": 0.0404, + "step": 14240 + }, + { + "epoch": 0.056932250621760105, + "grad_norm": 6.057517051696777, + "learning_rate": 1.9965309357607194e-07, + "loss": 0.0388, + "step": 14250 + }, + { + "epoch": 0.05697220307833678, + "grad_norm": 16.724258422851562, + "learning_rate": 1.996525700496713e-07, + "loss": 0.0344, + "step": 14260 + }, + { + "epoch": 0.057012155534913456, + "grad_norm": 7.452014446258545, + "learning_rate": 1.9965204612922157e-07, + "loss": 0.0331, + "step": 14270 + }, + { + "epoch": 0.057052107991490124, + "grad_norm": 4.077507972717285, + "learning_rate": 1.9965152181472484e-07, + "loss": 0.0309, + "step": 14280 + }, + { + "epoch": 0.0570920604480668, + "grad_norm": 4.096234321594238, + "learning_rate": 1.9965099710618313e-07, + "loss": 0.0316, + "step": 14290 + }, + { + "epoch": 0.057132012904643474, + "grad_norm": 7.39116096496582, + "learning_rate": 1.9965047200359852e-07, + "loss": 0.0387, + "step": 14300 + }, + { + "epoch": 0.05717196536122015, + "grad_norm": 5.44392204284668, + "learning_rate": 1.9964994650697315e-07, + "loss": 0.0392, + "step": 14310 + }, + { + "epoch": 0.057211917817796824, + "grad_norm": 2.7085354328155518, + "learning_rate": 1.99649420616309e-07, + "loss": 0.0324, + "step": 14320 + }, + { + "epoch": 0.05725187027437349, + "grad_norm": 7.401857852935791, + "learning_rate": 1.9964889433160824e-07, + "loss": 0.0299, + "step": 14330 + }, + { + "epoch": 0.05729182273095017, + "grad_norm": 8.876870155334473, + "learning_rate": 1.996483676528729e-07, + "loss": 0.035, + "step": 14340 + }, + { + "epoch": 0.05733177518752684, + "grad_norm": 3.327584981918335, + "learning_rate": 1.9964784058010504e-07, + "loss": 0.0336, + "step": 14350 + }, + { + "epoch": 0.05737172764410352, + "grad_norm": 5.198122978210449, + "learning_rate": 1.9964731311330681e-07, + "loss": 0.0322, + "step": 14360 + }, + { + "epoch": 0.05741168010068019, + "grad_norm": 3.387697696685791, + "learning_rate": 1.9964678525248027e-07, + "loss": 0.0275, + "step": 14370 + }, + { + "epoch": 0.05745163255725686, + "grad_norm": 8.45565414428711, + "learning_rate": 1.9964625699762748e-07, + "loss": 0.0384, + "step": 14380 + }, + { + "epoch": 0.057491585013833536, + "grad_norm": 58.75517654418945, + "learning_rate": 1.9964572834875055e-07, + "loss": 0.0304, + "step": 14390 + }, + { + "epoch": 0.05753153747041021, + "grad_norm": 3.250046730041504, + "learning_rate": 1.9964519930585157e-07, + "loss": 0.032, + "step": 14400 + }, + { + "epoch": 0.05757148992698689, + "grad_norm": 5.1772661209106445, + "learning_rate": 1.9964466986893264e-07, + "loss": 0.0338, + "step": 14410 + }, + { + "epoch": 0.05761144238356356, + "grad_norm": 3.0055994987487793, + "learning_rate": 1.9964414003799585e-07, + "loss": 0.0398, + "step": 14420 + }, + { + "epoch": 0.05765139484014023, + "grad_norm": 2.9095394611358643, + "learning_rate": 1.9964360981304328e-07, + "loss": 0.0335, + "step": 14430 + }, + { + "epoch": 0.057691347296716905, + "grad_norm": 2.577996015548706, + "learning_rate": 1.9964307919407706e-07, + "loss": 0.0383, + "step": 14440 + }, + { + "epoch": 0.05773129975329358, + "grad_norm": 6.2603068351745605, + "learning_rate": 1.9964254818109926e-07, + "loss": 0.0334, + "step": 14450 + }, + { + "epoch": 0.057771252209870255, + "grad_norm": 4.406505107879639, + "learning_rate": 1.9964201677411197e-07, + "loss": 0.0354, + "step": 14460 + }, + { + "epoch": 0.05781120466644693, + "grad_norm": 5.972916126251221, + "learning_rate": 1.996414849731173e-07, + "loss": 0.0341, + "step": 14470 + }, + { + "epoch": 0.0578511571230236, + "grad_norm": 4.035129547119141, + "learning_rate": 1.9964095277811736e-07, + "loss": 0.0364, + "step": 14480 + }, + { + "epoch": 0.057891109579600274, + "grad_norm": 8.16456413269043, + "learning_rate": 1.9964042018911426e-07, + "loss": 0.0344, + "step": 14490 + }, + { + "epoch": 0.05793106203617695, + "grad_norm": 4.382503032684326, + "learning_rate": 1.996398872061101e-07, + "loss": 0.0323, + "step": 14500 + }, + { + "epoch": 0.057971014492753624, + "grad_norm": 13.397266387939453, + "learning_rate": 1.9963935382910702e-07, + "loss": 0.0368, + "step": 14510 + }, + { + "epoch": 0.0580109669493303, + "grad_norm": 3.6122636795043945, + "learning_rate": 1.9963882005810703e-07, + "loss": 0.0375, + "step": 14520 + }, + { + "epoch": 0.058050919405906974, + "grad_norm": 3.719393730163574, + "learning_rate": 1.9963828589311237e-07, + "loss": 0.0346, + "step": 14530 + }, + { + "epoch": 0.05809087186248364, + "grad_norm": 9.22439193725586, + "learning_rate": 1.9963775133412506e-07, + "loss": 0.0355, + "step": 14540 + }, + { + "epoch": 0.05813082431906032, + "grad_norm": 9.891926765441895, + "learning_rate": 1.9963721638114724e-07, + "loss": 0.0367, + "step": 14550 + }, + { + "epoch": 0.05817077677563699, + "grad_norm": 4.411949157714844, + "learning_rate": 1.9963668103418105e-07, + "loss": 0.0359, + "step": 14560 + }, + { + "epoch": 0.05821072923221367, + "grad_norm": 4.101836681365967, + "learning_rate": 1.9963614529322856e-07, + "loss": 0.0318, + "step": 14570 + }, + { + "epoch": 0.05825068168879034, + "grad_norm": 3.0229132175445557, + "learning_rate": 1.9963560915829194e-07, + "loss": 0.0356, + "step": 14580 + }, + { + "epoch": 0.05829063414536701, + "grad_norm": 11.306370735168457, + "learning_rate": 1.9963507262937324e-07, + "loss": 0.0317, + "step": 14590 + }, + { + "epoch": 0.058330586601943686, + "grad_norm": 7.473814487457275, + "learning_rate": 1.9963453570647467e-07, + "loss": 0.0332, + "step": 14600 + }, + { + "epoch": 0.05837053905852036, + "grad_norm": 8.827414512634277, + "learning_rate": 1.996339983895983e-07, + "loss": 0.0282, + "step": 14610 + }, + { + "epoch": 0.058410491515097036, + "grad_norm": 4.987014293670654, + "learning_rate": 1.9963346067874624e-07, + "loss": 0.0348, + "step": 14620 + }, + { + "epoch": 0.05845044397167371, + "grad_norm": 3.6029093265533447, + "learning_rate": 1.9963292257392065e-07, + "loss": 0.0377, + "step": 14630 + }, + { + "epoch": 0.05849039642825038, + "grad_norm": 4.140063285827637, + "learning_rate": 1.9963238407512363e-07, + "loss": 0.0317, + "step": 14640 + }, + { + "epoch": 0.058530348884827055, + "grad_norm": 5.837855815887451, + "learning_rate": 1.9963184518235737e-07, + "loss": 0.0315, + "step": 14650 + }, + { + "epoch": 0.05857030134140373, + "grad_norm": 6.536017417907715, + "learning_rate": 1.9963130589562394e-07, + "loss": 0.0316, + "step": 14660 + }, + { + "epoch": 0.058610253797980405, + "grad_norm": 4.697779655456543, + "learning_rate": 1.9963076621492547e-07, + "loss": 0.0366, + "step": 14670 + }, + { + "epoch": 0.05865020625455708, + "grad_norm": 6.573713302612305, + "learning_rate": 1.9963022614026412e-07, + "loss": 0.0326, + "step": 14680 + }, + { + "epoch": 0.05869015871113375, + "grad_norm": 5.826941967010498, + "learning_rate": 1.9962968567164203e-07, + "loss": 0.0373, + "step": 14690 + }, + { + "epoch": 0.05873011116771042, + "grad_norm": 4.2321624755859375, + "learning_rate": 1.9962914480906135e-07, + "loss": 0.035, + "step": 14700 + }, + { + "epoch": 0.0587700636242871, + "grad_norm": 4.835977077484131, + "learning_rate": 1.9962860355252415e-07, + "loss": 0.0315, + "step": 14710 + }, + { + "epoch": 0.058810016080863774, + "grad_norm": 8.145109176635742, + "learning_rate": 1.9962806190203265e-07, + "loss": 0.0311, + "step": 14720 + }, + { + "epoch": 0.05884996853744045, + "grad_norm": 7.037175178527832, + "learning_rate": 1.9962751985758893e-07, + "loss": 0.0358, + "step": 14730 + }, + { + "epoch": 0.05888992099401712, + "grad_norm": 3.826059103012085, + "learning_rate": 1.9962697741919516e-07, + "loss": 0.0331, + "step": 14740 + }, + { + "epoch": 0.05892987345059379, + "grad_norm": 8.670318603515625, + "learning_rate": 1.9962643458685352e-07, + "loss": 0.0332, + "step": 14750 + }, + { + "epoch": 0.05896982590717047, + "grad_norm": 5.3066606521606445, + "learning_rate": 1.996258913605661e-07, + "loss": 0.0336, + "step": 14760 + }, + { + "epoch": 0.05900977836374714, + "grad_norm": 6.312092304229736, + "learning_rate": 1.9962534774033507e-07, + "loss": 0.0375, + "step": 14770 + }, + { + "epoch": 0.05904973082032382, + "grad_norm": 3.953294038772583, + "learning_rate": 1.9962480372616258e-07, + "loss": 0.03, + "step": 14780 + }, + { + "epoch": 0.059089683276900486, + "grad_norm": 5.222171306610107, + "learning_rate": 1.996242593180508e-07, + "loss": 0.034, + "step": 14790 + }, + { + "epoch": 0.05912963573347716, + "grad_norm": 11.904159545898438, + "learning_rate": 1.9962371451600184e-07, + "loss": 0.0386, + "step": 14800 + }, + { + "epoch": 0.059169588190053836, + "grad_norm": 4.609586238861084, + "learning_rate": 1.9962316932001788e-07, + "loss": 0.0376, + "step": 14810 + }, + { + "epoch": 0.05920954064663051, + "grad_norm": 3.90680193901062, + "learning_rate": 1.9962262373010108e-07, + "loss": 0.0384, + "step": 14820 + }, + { + "epoch": 0.059249493103207186, + "grad_norm": 4.541289806365967, + "learning_rate": 1.9962207774625358e-07, + "loss": 0.0333, + "step": 14830 + }, + { + "epoch": 0.059289445559783854, + "grad_norm": 4.79822301864624, + "learning_rate": 1.9962153136847755e-07, + "loss": 0.0365, + "step": 14840 + }, + { + "epoch": 0.05932939801636053, + "grad_norm": 3.7336559295654297, + "learning_rate": 1.9962098459677516e-07, + "loss": 0.0362, + "step": 14850 + }, + { + "epoch": 0.059369350472937205, + "grad_norm": 5.617799758911133, + "learning_rate": 1.9962043743114857e-07, + "loss": 0.0325, + "step": 14860 + }, + { + "epoch": 0.05940930292951388, + "grad_norm": 4.3133063316345215, + "learning_rate": 1.996198898715999e-07, + "loss": 0.0338, + "step": 14870 + }, + { + "epoch": 0.059449255386090555, + "grad_norm": 7.36416482925415, + "learning_rate": 1.9961934191813139e-07, + "loss": 0.0327, + "step": 14880 + }, + { + "epoch": 0.05948920784266722, + "grad_norm": 10.091732025146484, + "learning_rate": 1.9961879357074515e-07, + "loss": 0.0358, + "step": 14890 + }, + { + "epoch": 0.0595291602992439, + "grad_norm": 2.900958299636841, + "learning_rate": 1.9961824482944337e-07, + "loss": 0.0259, + "step": 14900 + }, + { + "epoch": 0.05956911275582057, + "grad_norm": 6.90102481842041, + "learning_rate": 1.9961769569422824e-07, + "loss": 0.0355, + "step": 14910 + }, + { + "epoch": 0.05960906521239725, + "grad_norm": 4.922801494598389, + "learning_rate": 1.9961714616510187e-07, + "loss": 0.0353, + "step": 14920 + }, + { + "epoch": 0.059649017668973923, + "grad_norm": 3.279049873352051, + "learning_rate": 1.9961659624206648e-07, + "loss": 0.0293, + "step": 14930 + }, + { + "epoch": 0.05968897012555059, + "grad_norm": 3.922060489654541, + "learning_rate": 1.9961604592512423e-07, + "loss": 0.0428, + "step": 14940 + }, + { + "epoch": 0.05972892258212727, + "grad_norm": 3.5555062294006348, + "learning_rate": 1.9961549521427734e-07, + "loss": 0.0314, + "step": 14950 + }, + { + "epoch": 0.05976887503870394, + "grad_norm": 3.7024998664855957, + "learning_rate": 1.9961494410952794e-07, + "loss": 0.0295, + "step": 14960 + }, + { + "epoch": 0.05980882749528062, + "grad_norm": 5.7350335121154785, + "learning_rate": 1.9961439261087817e-07, + "loss": 0.0367, + "step": 14970 + }, + { + "epoch": 0.05984877995185729, + "grad_norm": 6.709193706512451, + "learning_rate": 1.996138407183303e-07, + "loss": 0.0355, + "step": 14980 + }, + { + "epoch": 0.05988873240843396, + "grad_norm": 3.895350217819214, + "learning_rate": 1.9961328843188644e-07, + "loss": 0.0294, + "step": 14990 + }, + { + "epoch": 0.059928684865010635, + "grad_norm": 5.212899684906006, + "learning_rate": 1.9961273575154885e-07, + "loss": 0.036, + "step": 15000 + }, + { + "epoch": 0.05996863732158731, + "grad_norm": 3.7638449668884277, + "learning_rate": 1.9961218267731965e-07, + "loss": 0.0333, + "step": 15010 + }, + { + "epoch": 0.060008589778163986, + "grad_norm": 4.560139179229736, + "learning_rate": 1.9961162920920102e-07, + "loss": 0.0335, + "step": 15020 + }, + { + "epoch": 0.06004854223474066, + "grad_norm": 2.96637225151062, + "learning_rate": 1.996110753471952e-07, + "loss": 0.0321, + "step": 15030 + }, + { + "epoch": 0.06008849469131733, + "grad_norm": 5.354013442993164, + "learning_rate": 1.996105210913044e-07, + "loss": 0.0331, + "step": 15040 + }, + { + "epoch": 0.060128447147894004, + "grad_norm": 9.645467758178711, + "learning_rate": 1.9960996644153068e-07, + "loss": 0.0329, + "step": 15050 + }, + { + "epoch": 0.06016839960447068, + "grad_norm": 3.210808038711548, + "learning_rate": 1.9960941139787637e-07, + "loss": 0.0322, + "step": 15060 + }, + { + "epoch": 0.060208352061047354, + "grad_norm": 9.006463050842285, + "learning_rate": 1.9960885596034362e-07, + "loss": 0.0372, + "step": 15070 + }, + { + "epoch": 0.06024830451762403, + "grad_norm": 4.345051288604736, + "learning_rate": 1.996083001289346e-07, + "loss": 0.0356, + "step": 15080 + }, + { + "epoch": 0.060288256974200705, + "grad_norm": 14.351428031921387, + "learning_rate": 1.9960774390365156e-07, + "loss": 0.0328, + "step": 15090 + }, + { + "epoch": 0.06032820943077737, + "grad_norm": 5.871204376220703, + "learning_rate": 1.9960718728449665e-07, + "loss": 0.0339, + "step": 15100 + }, + { + "epoch": 0.06036816188735405, + "grad_norm": 12.937674522399902, + "learning_rate": 1.9960663027147208e-07, + "loss": 0.0407, + "step": 15110 + }, + { + "epoch": 0.06040811434393072, + "grad_norm": 2.681015729904175, + "learning_rate": 1.9960607286458007e-07, + "loss": 0.0291, + "step": 15120 + }, + { + "epoch": 0.0604480668005074, + "grad_norm": 6.077793598175049, + "learning_rate": 1.996055150638228e-07, + "loss": 0.0351, + "step": 15130 + }, + { + "epoch": 0.06048801925708407, + "grad_norm": 6.35223913192749, + "learning_rate": 1.9960495686920254e-07, + "loss": 0.035, + "step": 15140 + }, + { + "epoch": 0.06052797171366074, + "grad_norm": 6.044998645782471, + "learning_rate": 1.9960439828072142e-07, + "loss": 0.0367, + "step": 15150 + }, + { + "epoch": 0.06056792417023742, + "grad_norm": 2.943432331085205, + "learning_rate": 1.996038392983817e-07, + "loss": 0.0348, + "step": 15160 + }, + { + "epoch": 0.06060787662681409, + "grad_norm": 5.535979270935059, + "learning_rate": 1.9960327992218552e-07, + "loss": 0.0338, + "step": 15170 + }, + { + "epoch": 0.06064782908339077, + "grad_norm": 4.005102157592773, + "learning_rate": 1.9960272015213517e-07, + "loss": 0.0381, + "step": 15180 + }, + { + "epoch": 0.06068778153996744, + "grad_norm": 4.573872089385986, + "learning_rate": 1.9960215998823285e-07, + "loss": 0.0344, + "step": 15190 + }, + { + "epoch": 0.06072773399654411, + "grad_norm": 5.246135711669922, + "learning_rate": 1.9960159943048077e-07, + "loss": 0.0377, + "step": 15200 + }, + { + "epoch": 0.060767686453120785, + "grad_norm": 9.04214859008789, + "learning_rate": 1.996010384788811e-07, + "loss": 0.0356, + "step": 15210 + }, + { + "epoch": 0.06080763890969746, + "grad_norm": 2.240480422973633, + "learning_rate": 1.996004771334361e-07, + "loss": 0.03, + "step": 15220 + }, + { + "epoch": 0.060847591366274136, + "grad_norm": 3.601421594619751, + "learning_rate": 1.9959991539414799e-07, + "loss": 0.0364, + "step": 15230 + }, + { + "epoch": 0.06088754382285081, + "grad_norm": 5.068973064422607, + "learning_rate": 1.9959935326101897e-07, + "loss": 0.0312, + "step": 15240 + }, + { + "epoch": 0.06092749627942748, + "grad_norm": 4.767705917358398, + "learning_rate": 1.995987907340513e-07, + "loss": 0.0365, + "step": 15250 + }, + { + "epoch": 0.060967448736004154, + "grad_norm": 10.087220191955566, + "learning_rate": 1.9959822781324717e-07, + "loss": 0.0339, + "step": 15260 + }, + { + "epoch": 0.06100740119258083, + "grad_norm": 10.937508583068848, + "learning_rate": 1.9959766449860881e-07, + "loss": 0.0349, + "step": 15270 + }, + { + "epoch": 0.061047353649157504, + "grad_norm": 6.839133262634277, + "learning_rate": 1.9959710079013846e-07, + "loss": 0.0356, + "step": 15280 + }, + { + "epoch": 0.06108730610573418, + "grad_norm": 4.552211284637451, + "learning_rate": 1.9959653668783837e-07, + "loss": 0.0383, + "step": 15290 + }, + { + "epoch": 0.06112725856231085, + "grad_norm": 4.760563373565674, + "learning_rate": 1.9959597219171073e-07, + "loss": 0.036, + "step": 15300 + }, + { + "epoch": 0.06116721101888752, + "grad_norm": 5.755191326141357, + "learning_rate": 1.995954073017578e-07, + "loss": 0.0332, + "step": 15310 + }, + { + "epoch": 0.0612071634754642, + "grad_norm": 3.9909369945526123, + "learning_rate": 1.9959484201798179e-07, + "loss": 0.0357, + "step": 15320 + }, + { + "epoch": 0.06124711593204087, + "grad_norm": 5.214570999145508, + "learning_rate": 1.9959427634038493e-07, + "loss": 0.0367, + "step": 15330 + }, + { + "epoch": 0.06128706838861755, + "grad_norm": 6.35009765625, + "learning_rate": 1.995937102689695e-07, + "loss": 0.0374, + "step": 15340 + }, + { + "epoch": 0.061327020845194216, + "grad_norm": 3.5789413452148438, + "learning_rate": 1.9959314380373767e-07, + "loss": 0.0338, + "step": 15350 + }, + { + "epoch": 0.06136697330177089, + "grad_norm": 4.566047668457031, + "learning_rate": 1.9959257694469175e-07, + "loss": 0.0341, + "step": 15360 + }, + { + "epoch": 0.061406925758347566, + "grad_norm": 2.746675968170166, + "learning_rate": 1.9959200969183397e-07, + "loss": 0.0392, + "step": 15370 + }, + { + "epoch": 0.06144687821492424, + "grad_norm": 2.883867025375366, + "learning_rate": 1.9959144204516654e-07, + "loss": 0.0302, + "step": 15380 + }, + { + "epoch": 0.06148683067150092, + "grad_norm": 2.068504810333252, + "learning_rate": 1.995908740046917e-07, + "loss": 0.0352, + "step": 15390 + }, + { + "epoch": 0.061526783128077585, + "grad_norm": 5.023419380187988, + "learning_rate": 1.9959030557041173e-07, + "loss": 0.03, + "step": 15400 + }, + { + "epoch": 0.06156673558465426, + "grad_norm": 6.626621723175049, + "learning_rate": 1.9958973674232887e-07, + "loss": 0.0328, + "step": 15410 + }, + { + "epoch": 0.061606688041230935, + "grad_norm": 5.259562969207764, + "learning_rate": 1.995891675204454e-07, + "loss": 0.0377, + "step": 15420 + }, + { + "epoch": 0.06164664049780761, + "grad_norm": 4.371773719787598, + "learning_rate": 1.9958859790476348e-07, + "loss": 0.0347, + "step": 15430 + }, + { + "epoch": 0.061686592954384285, + "grad_norm": 3.9902522563934326, + "learning_rate": 1.9958802789528542e-07, + "loss": 0.0322, + "step": 15440 + }, + { + "epoch": 0.061726545410960953, + "grad_norm": 5.42441463470459, + "learning_rate": 1.995874574920135e-07, + "loss": 0.0331, + "step": 15450 + }, + { + "epoch": 0.06176649786753763, + "grad_norm": 6.295658588409424, + "learning_rate": 1.9958688669494994e-07, + "loss": 0.0361, + "step": 15460 + }, + { + "epoch": 0.061806450324114304, + "grad_norm": 4.286712646484375, + "learning_rate": 1.99586315504097e-07, + "loss": 0.0328, + "step": 15470 + }, + { + "epoch": 0.06184640278069098, + "grad_norm": 4.421769142150879, + "learning_rate": 1.9958574391945693e-07, + "loss": 0.031, + "step": 15480 + }, + { + "epoch": 0.061886355237267654, + "grad_norm": 3.749305486679077, + "learning_rate": 1.9958517194103201e-07, + "loss": 0.0325, + "step": 15490 + }, + { + "epoch": 0.06192630769384432, + "grad_norm": 5.404612064361572, + "learning_rate": 1.9958459956882447e-07, + "loss": 0.0405, + "step": 15500 + }, + { + "epoch": 0.061966260150421, + "grad_norm": 14.418130874633789, + "learning_rate": 1.9958402680283663e-07, + "loss": 0.0335, + "step": 15510 + }, + { + "epoch": 0.06200621260699767, + "grad_norm": 7.0557990074157715, + "learning_rate": 1.9958345364307072e-07, + "loss": 0.0329, + "step": 15520 + }, + { + "epoch": 0.06204616506357435, + "grad_norm": 3.2791848182678223, + "learning_rate": 1.9958288008952898e-07, + "loss": 0.0337, + "step": 15530 + }, + { + "epoch": 0.06208611752015102, + "grad_norm": 3.738346576690674, + "learning_rate": 1.9958230614221375e-07, + "loss": 0.0346, + "step": 15540 + }, + { + "epoch": 0.06212606997672769, + "grad_norm": 6.758075714111328, + "learning_rate": 1.9958173180112725e-07, + "loss": 0.0394, + "step": 15550 + }, + { + "epoch": 0.062166022433304366, + "grad_norm": 5.838968276977539, + "learning_rate": 1.9958115706627173e-07, + "loss": 0.0348, + "step": 15560 + }, + { + "epoch": 0.06220597488988104, + "grad_norm": 10.315160751342773, + "learning_rate": 1.9958058193764952e-07, + "loss": 0.0348, + "step": 15570 + }, + { + "epoch": 0.062245927346457716, + "grad_norm": 3.8313164710998535, + "learning_rate": 1.9958000641526284e-07, + "loss": 0.0355, + "step": 15580 + }, + { + "epoch": 0.06228587980303439, + "grad_norm": 3.5259108543395996, + "learning_rate": 1.99579430499114e-07, + "loss": 0.0355, + "step": 15590 + }, + { + "epoch": 0.06232583225961106, + "grad_norm": 6.1456170082092285, + "learning_rate": 1.9957885418920526e-07, + "loss": 0.0309, + "step": 15600 + }, + { + "epoch": 0.062365784716187735, + "grad_norm": 5.113983154296875, + "learning_rate": 1.9957827748553891e-07, + "loss": 0.0354, + "step": 15610 + }, + { + "epoch": 0.06240573717276441, + "grad_norm": 3.873997449874878, + "learning_rate": 1.995777003881172e-07, + "loss": 0.0318, + "step": 15620 + }, + { + "epoch": 0.062445689629341085, + "grad_norm": 6.5724897384643555, + "learning_rate": 1.9957712289694246e-07, + "loss": 0.0378, + "step": 15630 + }, + { + "epoch": 0.06248564208591776, + "grad_norm": 23.319108963012695, + "learning_rate": 1.9957654501201697e-07, + "loss": 0.0363, + "step": 15640 + }, + { + "epoch": 0.06252559454249443, + "grad_norm": 4.618587493896484, + "learning_rate": 1.99575966733343e-07, + "loss": 0.0337, + "step": 15650 + }, + { + "epoch": 0.0625655469990711, + "grad_norm": 4.229916572570801, + "learning_rate": 1.995753880609228e-07, + "loss": 0.0335, + "step": 15660 + }, + { + "epoch": 0.06260549945564778, + "grad_norm": 5.49269437789917, + "learning_rate": 1.995748089947587e-07, + "loss": 0.0322, + "step": 15670 + }, + { + "epoch": 0.06264545191222445, + "grad_norm": 13.285565376281738, + "learning_rate": 1.99574229534853e-07, + "loss": 0.0346, + "step": 15680 + }, + { + "epoch": 0.06268540436880113, + "grad_norm": 22.488142013549805, + "learning_rate": 1.9957364968120797e-07, + "loss": 0.0331, + "step": 15690 + }, + { + "epoch": 0.0627253568253778, + "grad_norm": 5.55100679397583, + "learning_rate": 1.995730694338259e-07, + "loss": 0.0304, + "step": 15700 + }, + { + "epoch": 0.06276530928195448, + "grad_norm": 4.881155014038086, + "learning_rate": 1.9957248879270908e-07, + "loss": 0.0308, + "step": 15710 + }, + { + "epoch": 0.06280526173853115, + "grad_norm": 21.45250129699707, + "learning_rate": 1.9957190775785982e-07, + "loss": 0.0294, + "step": 15720 + }, + { + "epoch": 0.06284521419510782, + "grad_norm": 4.389239311218262, + "learning_rate": 1.9957132632928043e-07, + "loss": 0.0273, + "step": 15730 + }, + { + "epoch": 0.06288516665168449, + "grad_norm": 4.7697954177856445, + "learning_rate": 1.9957074450697315e-07, + "loss": 0.0302, + "step": 15740 + }, + { + "epoch": 0.06292511910826117, + "grad_norm": 4.619146823883057, + "learning_rate": 1.9957016229094037e-07, + "loss": 0.0337, + "step": 15750 + }, + { + "epoch": 0.06296507156483784, + "grad_norm": 3.4877572059631348, + "learning_rate": 1.995695796811843e-07, + "loss": 0.0324, + "step": 15760 + }, + { + "epoch": 0.06300502402141452, + "grad_norm": 4.727746963500977, + "learning_rate": 1.9956899667770735e-07, + "loss": 0.0307, + "step": 15770 + }, + { + "epoch": 0.06304497647799119, + "grad_norm": 5.311588764190674, + "learning_rate": 1.9956841328051174e-07, + "loss": 0.0348, + "step": 15780 + }, + { + "epoch": 0.06308492893456787, + "grad_norm": 5.293699741363525, + "learning_rate": 1.9956782948959978e-07, + "loss": 0.0329, + "step": 15790 + }, + { + "epoch": 0.06312488139114454, + "grad_norm": 5.6261372566223145, + "learning_rate": 1.995672453049738e-07, + "loss": 0.032, + "step": 15800 + }, + { + "epoch": 0.06316483384772122, + "grad_norm": 3.7855374813079834, + "learning_rate": 1.9956666072663615e-07, + "loss": 0.034, + "step": 15810 + }, + { + "epoch": 0.06320478630429789, + "grad_norm": 5.377115249633789, + "learning_rate": 1.9956607575458908e-07, + "loss": 0.0342, + "step": 15820 + }, + { + "epoch": 0.06324473876087455, + "grad_norm": 3.35722279548645, + "learning_rate": 1.995654903888349e-07, + "loss": 0.0348, + "step": 15830 + }, + { + "epoch": 0.06328469121745123, + "grad_norm": 5.333555221557617, + "learning_rate": 1.9956490462937597e-07, + "loss": 0.037, + "step": 15840 + }, + { + "epoch": 0.0633246436740279, + "grad_norm": 4.864048004150391, + "learning_rate": 1.9956431847621456e-07, + "loss": 0.0409, + "step": 15850 + }, + { + "epoch": 0.06336459613060458, + "grad_norm": 3.9942846298217773, + "learning_rate": 1.9956373192935306e-07, + "loss": 0.034, + "step": 15860 + }, + { + "epoch": 0.06340454858718125, + "grad_norm": 14.137945175170898, + "learning_rate": 1.9956314498879375e-07, + "loss": 0.0315, + "step": 15870 + }, + { + "epoch": 0.06344450104375793, + "grad_norm": 5.510288238525391, + "learning_rate": 1.9956255765453892e-07, + "loss": 0.0361, + "step": 15880 + }, + { + "epoch": 0.0634844535003346, + "grad_norm": 7.460511207580566, + "learning_rate": 1.9956196992659088e-07, + "loss": 0.0364, + "step": 15890 + }, + { + "epoch": 0.06352440595691128, + "grad_norm": 10.651732444763184, + "learning_rate": 1.9956138180495203e-07, + "loss": 0.0352, + "step": 15900 + }, + { + "epoch": 0.06356435841348795, + "grad_norm": 5.3208794593811035, + "learning_rate": 1.9956079328962464e-07, + "loss": 0.0327, + "step": 15910 + }, + { + "epoch": 0.06360431087006463, + "grad_norm": 5.633444309234619, + "learning_rate": 1.9956020438061107e-07, + "loss": 0.0323, + "step": 15920 + }, + { + "epoch": 0.06364426332664129, + "grad_norm": 6.029275894165039, + "learning_rate": 1.995596150779136e-07, + "loss": 0.0342, + "step": 15930 + }, + { + "epoch": 0.06368421578321797, + "grad_norm": 5.55606746673584, + "learning_rate": 1.9955902538153464e-07, + "loss": 0.0352, + "step": 15940 + }, + { + "epoch": 0.06372416823979464, + "grad_norm": 4.5515618324279785, + "learning_rate": 1.9955843529147643e-07, + "loss": 0.0312, + "step": 15950 + }, + { + "epoch": 0.06376412069637132, + "grad_norm": 4.174352169036865, + "learning_rate": 1.9955784480774137e-07, + "loss": 0.0323, + "step": 15960 + }, + { + "epoch": 0.06380407315294799, + "grad_norm": 10.903118133544922, + "learning_rate": 1.9955725393033176e-07, + "loss": 0.0343, + "step": 15970 + }, + { + "epoch": 0.06384402560952467, + "grad_norm": 7.489772319793701, + "learning_rate": 1.9955666265925e-07, + "loss": 0.0356, + "step": 15980 + }, + { + "epoch": 0.06388397806610134, + "grad_norm": 4.079732894897461, + "learning_rate": 1.995560709944983e-07, + "loss": 0.0384, + "step": 15990 + }, + { + "epoch": 0.06392393052267802, + "grad_norm": 6.114555358886719, + "learning_rate": 1.9955547893607911e-07, + "loss": 0.0343, + "step": 16000 + }, + { + "epoch": 0.06396388297925469, + "grad_norm": 7.240649223327637, + "learning_rate": 1.9955488648399473e-07, + "loss": 0.036, + "step": 16010 + }, + { + "epoch": 0.06400383543583137, + "grad_norm": 5.466864109039307, + "learning_rate": 1.995542936382475e-07, + "loss": 0.0361, + "step": 16020 + }, + { + "epoch": 0.06404378789240804, + "grad_norm": 10.049688339233398, + "learning_rate": 1.995537003988398e-07, + "loss": 0.0326, + "step": 16030 + }, + { + "epoch": 0.0640837403489847, + "grad_norm": 23.10397720336914, + "learning_rate": 1.9955310676577395e-07, + "loss": 0.028, + "step": 16040 + }, + { + "epoch": 0.06412369280556138, + "grad_norm": 4.560031890869141, + "learning_rate": 1.9955251273905228e-07, + "loss": 0.0323, + "step": 16050 + }, + { + "epoch": 0.06416364526213805, + "grad_norm": 5.315290927886963, + "learning_rate": 1.9955191831867715e-07, + "loss": 0.0333, + "step": 16060 + }, + { + "epoch": 0.06420359771871473, + "grad_norm": 2.651266098022461, + "learning_rate": 1.9955132350465092e-07, + "loss": 0.0319, + "step": 16070 + }, + { + "epoch": 0.0642435501752914, + "grad_norm": 4.349170684814453, + "learning_rate": 1.9955072829697595e-07, + "loss": 0.0366, + "step": 16080 + }, + { + "epoch": 0.06428350263186808, + "grad_norm": 4.3765177726745605, + "learning_rate": 1.9955013269565457e-07, + "loss": 0.0342, + "step": 16090 + }, + { + "epoch": 0.06432345508844475, + "grad_norm": 5.692534923553467, + "learning_rate": 1.9954953670068915e-07, + "loss": 0.0347, + "step": 16100 + }, + { + "epoch": 0.06436340754502143, + "grad_norm": 4.5247087478637695, + "learning_rate": 1.9954894031208207e-07, + "loss": 0.038, + "step": 16110 + }, + { + "epoch": 0.0644033600015981, + "grad_norm": 2.0040154457092285, + "learning_rate": 1.9954834352983563e-07, + "loss": 0.0308, + "step": 16120 + }, + { + "epoch": 0.06444331245817478, + "grad_norm": 6.525778293609619, + "learning_rate": 1.9954774635395224e-07, + "loss": 0.0306, + "step": 16130 + }, + { + "epoch": 0.06448326491475144, + "grad_norm": 11.11319351196289, + "learning_rate": 1.9954714878443424e-07, + "loss": 0.0327, + "step": 16140 + }, + { + "epoch": 0.06452321737132811, + "grad_norm": 2.9648358821868896, + "learning_rate": 1.9954655082128398e-07, + "loss": 0.0325, + "step": 16150 + }, + { + "epoch": 0.06456316982790479, + "grad_norm": 2.983877420425415, + "learning_rate": 1.9954595246450386e-07, + "loss": 0.0325, + "step": 16160 + }, + { + "epoch": 0.06460312228448147, + "grad_norm": 3.9356579780578613, + "learning_rate": 1.9954535371409622e-07, + "loss": 0.0342, + "step": 16170 + }, + { + "epoch": 0.06464307474105814, + "grad_norm": 6.504552364349365, + "learning_rate": 1.9954475457006343e-07, + "loss": 0.0346, + "step": 16180 + }, + { + "epoch": 0.06468302719763482, + "grad_norm": 4.527277946472168, + "learning_rate": 1.9954415503240786e-07, + "loss": 0.0333, + "step": 16190 + }, + { + "epoch": 0.06472297965421149, + "grad_norm": 4.214667797088623, + "learning_rate": 1.9954355510113187e-07, + "loss": 0.0304, + "step": 16200 + }, + { + "epoch": 0.06476293211078817, + "grad_norm": 4.867095470428467, + "learning_rate": 1.995429547762379e-07, + "loss": 0.0343, + "step": 16210 + }, + { + "epoch": 0.06480288456736484, + "grad_norm": 5.413068771362305, + "learning_rate": 1.9954235405772822e-07, + "loss": 0.031, + "step": 16220 + }, + { + "epoch": 0.06484283702394152, + "grad_norm": 5.5159502029418945, + "learning_rate": 1.995417529456053e-07, + "loss": 0.0328, + "step": 16230 + }, + { + "epoch": 0.06488278948051818, + "grad_norm": 5.392263889312744, + "learning_rate": 1.9954115143987143e-07, + "loss": 0.0367, + "step": 16240 + }, + { + "epoch": 0.06492274193709485, + "grad_norm": 5.028489589691162, + "learning_rate": 1.9954054954052902e-07, + "loss": 0.0317, + "step": 16250 + }, + { + "epoch": 0.06496269439367153, + "grad_norm": 9.43990707397461, + "learning_rate": 1.9953994724758052e-07, + "loss": 0.0314, + "step": 16260 + }, + { + "epoch": 0.0650026468502482, + "grad_norm": 5.189778804779053, + "learning_rate": 1.9953934456102822e-07, + "loss": 0.0335, + "step": 16270 + }, + { + "epoch": 0.06504259930682488, + "grad_norm": 8.741296768188477, + "learning_rate": 1.9953874148087454e-07, + "loss": 0.0344, + "step": 16280 + }, + { + "epoch": 0.06508255176340155, + "grad_norm": 5.896868705749512, + "learning_rate": 1.9953813800712185e-07, + "loss": 0.034, + "step": 16290 + }, + { + "epoch": 0.06512250421997823, + "grad_norm": 3.992014169692993, + "learning_rate": 1.9953753413977256e-07, + "loss": 0.0308, + "step": 16300 + }, + { + "epoch": 0.0651624566765549, + "grad_norm": 3.9922776222229004, + "learning_rate": 1.9953692987882903e-07, + "loss": 0.0354, + "step": 16310 + }, + { + "epoch": 0.06520240913313158, + "grad_norm": 4.355060577392578, + "learning_rate": 1.995363252242937e-07, + "loss": 0.0233, + "step": 16320 + }, + { + "epoch": 0.06524236158970825, + "grad_norm": 3.0993587970733643, + "learning_rate": 1.995357201761689e-07, + "loss": 0.0295, + "step": 16330 + }, + { + "epoch": 0.06528231404628491, + "grad_norm": 4.741004943847656, + "learning_rate": 1.9953511473445704e-07, + "loss": 0.0328, + "step": 16340 + }, + { + "epoch": 0.06532226650286159, + "grad_norm": 5.582737445831299, + "learning_rate": 1.9953450889916053e-07, + "loss": 0.0376, + "step": 16350 + }, + { + "epoch": 0.06536221895943826, + "grad_norm": 4.5389509201049805, + "learning_rate": 1.9953390267028177e-07, + "loss": 0.0323, + "step": 16360 + }, + { + "epoch": 0.06540217141601494, + "grad_norm": 51.486778259277344, + "learning_rate": 1.9953329604782312e-07, + "loss": 0.0373, + "step": 16370 + }, + { + "epoch": 0.06544212387259161, + "grad_norm": 5.501858234405518, + "learning_rate": 1.99532689031787e-07, + "loss": 0.0313, + "step": 16380 + }, + { + "epoch": 0.06548207632916829, + "grad_norm": 4.264871597290039, + "learning_rate": 1.9953208162217584e-07, + "loss": 0.031, + "step": 16390 + }, + { + "epoch": 0.06552202878574497, + "grad_norm": 5.775469779968262, + "learning_rate": 1.99531473818992e-07, + "loss": 0.0326, + "step": 16400 + }, + { + "epoch": 0.06556198124232164, + "grad_norm": 6.67592191696167, + "learning_rate": 1.9953086562223792e-07, + "loss": 0.0348, + "step": 16410 + }, + { + "epoch": 0.06560193369889832, + "grad_norm": 5.271911144256592, + "learning_rate": 1.9953025703191596e-07, + "loss": 0.0299, + "step": 16420 + }, + { + "epoch": 0.06564188615547499, + "grad_norm": 4.127933502197266, + "learning_rate": 1.9952964804802855e-07, + "loss": 0.0267, + "step": 16430 + }, + { + "epoch": 0.06568183861205165, + "grad_norm": 4.8747878074646, + "learning_rate": 1.9952903867057808e-07, + "loss": 0.0323, + "step": 16440 + }, + { + "epoch": 0.06572179106862833, + "grad_norm": 4.220418930053711, + "learning_rate": 1.9952842889956702e-07, + "loss": 0.0319, + "step": 16450 + }, + { + "epoch": 0.065761743525205, + "grad_norm": 4.1253132820129395, + "learning_rate": 1.995278187349977e-07, + "loss": 0.0329, + "step": 16460 + }, + { + "epoch": 0.06580169598178168, + "grad_norm": 5.357776641845703, + "learning_rate": 1.9952720817687263e-07, + "loss": 0.0316, + "step": 16470 + }, + { + "epoch": 0.06584164843835835, + "grad_norm": 4.432567119598389, + "learning_rate": 1.9952659722519412e-07, + "loss": 0.0334, + "step": 16480 + }, + { + "epoch": 0.06588160089493503, + "grad_norm": 3.840463638305664, + "learning_rate": 1.9952598587996464e-07, + "loss": 0.0404, + "step": 16490 + }, + { + "epoch": 0.0659215533515117, + "grad_norm": 3.9114139080047607, + "learning_rate": 1.995253741411866e-07, + "loss": 0.0294, + "step": 16500 + }, + { + "epoch": 0.06596150580808838, + "grad_norm": 13.38376235961914, + "learning_rate": 1.9952476200886242e-07, + "loss": 0.034, + "step": 16510 + }, + { + "epoch": 0.06600145826466505, + "grad_norm": 4.647912979125977, + "learning_rate": 1.995241494829945e-07, + "loss": 0.0322, + "step": 16520 + }, + { + "epoch": 0.06604141072124173, + "grad_norm": 5.637537956237793, + "learning_rate": 1.9952353656358533e-07, + "loss": 0.0321, + "step": 16530 + }, + { + "epoch": 0.0660813631778184, + "grad_norm": 4.812287330627441, + "learning_rate": 1.9952292325063725e-07, + "loss": 0.0382, + "step": 16540 + }, + { + "epoch": 0.06612131563439506, + "grad_norm": 4.787288188934326, + "learning_rate": 1.995223095441527e-07, + "loss": 0.0316, + "step": 16550 + }, + { + "epoch": 0.06616126809097174, + "grad_norm": 13.104178428649902, + "learning_rate": 1.9952169544413417e-07, + "loss": 0.0354, + "step": 16560 + }, + { + "epoch": 0.06620122054754841, + "grad_norm": 4.022050857543945, + "learning_rate": 1.9952108095058402e-07, + "loss": 0.0347, + "step": 16570 + }, + { + "epoch": 0.06624117300412509, + "grad_norm": 4.48053503036499, + "learning_rate": 1.9952046606350468e-07, + "loss": 0.0355, + "step": 16580 + }, + { + "epoch": 0.06628112546070176, + "grad_norm": 8.250699043273926, + "learning_rate": 1.9951985078289867e-07, + "loss": 0.0305, + "step": 16590 + }, + { + "epoch": 0.06632107791727844, + "grad_norm": 3.5213990211486816, + "learning_rate": 1.995192351087683e-07, + "loss": 0.0326, + "step": 16600 + }, + { + "epoch": 0.06636103037385511, + "grad_norm": 4.499130725860596, + "learning_rate": 1.9951861904111609e-07, + "loss": 0.0338, + "step": 16610 + }, + { + "epoch": 0.06640098283043179, + "grad_norm": 3.4757239818573, + "learning_rate": 1.9951800257994443e-07, + "loss": 0.0313, + "step": 16620 + }, + { + "epoch": 0.06644093528700847, + "grad_norm": 3.014509439468384, + "learning_rate": 1.995173857252558e-07, + "loss": 0.0302, + "step": 16630 + }, + { + "epoch": 0.06648088774358514, + "grad_norm": 6.683443546295166, + "learning_rate": 1.9951676847705261e-07, + "loss": 0.0353, + "step": 16640 + }, + { + "epoch": 0.0665208402001618, + "grad_norm": 6.68731164932251, + "learning_rate": 1.9951615083533731e-07, + "loss": 0.0348, + "step": 16650 + }, + { + "epoch": 0.06656079265673848, + "grad_norm": 3.348966598510742, + "learning_rate": 1.9951553280011232e-07, + "loss": 0.0303, + "step": 16660 + }, + { + "epoch": 0.06660074511331515, + "grad_norm": 5.734196186065674, + "learning_rate": 1.9951491437138012e-07, + "loss": 0.0387, + "step": 16670 + }, + { + "epoch": 0.06664069756989183, + "grad_norm": 3.125372886657715, + "learning_rate": 1.9951429554914314e-07, + "loss": 0.0374, + "step": 16680 + }, + { + "epoch": 0.0666806500264685, + "grad_norm": 3.249130964279175, + "learning_rate": 1.9951367633340383e-07, + "loss": 0.0295, + "step": 16690 + }, + { + "epoch": 0.06672060248304518, + "grad_norm": 3.025574207305908, + "learning_rate": 1.995130567241646e-07, + "loss": 0.031, + "step": 16700 + }, + { + "epoch": 0.06676055493962185, + "grad_norm": 3.5956504344940186, + "learning_rate": 1.9951243672142798e-07, + "loss": 0.0333, + "step": 16710 + }, + { + "epoch": 0.06680050739619853, + "grad_norm": 5.526428699493408, + "learning_rate": 1.9951181632519637e-07, + "loss": 0.0368, + "step": 16720 + }, + { + "epoch": 0.0668404598527752, + "grad_norm": 6.401792049407959, + "learning_rate": 1.995111955354722e-07, + "loss": 0.0333, + "step": 16730 + }, + { + "epoch": 0.06688041230935188, + "grad_norm": 3.120136260986328, + "learning_rate": 1.9951057435225796e-07, + "loss": 0.0377, + "step": 16740 + }, + { + "epoch": 0.06692036476592854, + "grad_norm": 7.109729290008545, + "learning_rate": 1.995099527755561e-07, + "loss": 0.0301, + "step": 16750 + }, + { + "epoch": 0.06696031722250521, + "grad_norm": 7.661312580108643, + "learning_rate": 1.995093308053691e-07, + "loss": 0.0315, + "step": 16760 + }, + { + "epoch": 0.06700026967908189, + "grad_norm": 6.842121601104736, + "learning_rate": 1.995087084416994e-07, + "loss": 0.0328, + "step": 16770 + }, + { + "epoch": 0.06704022213565856, + "grad_norm": 2.9244823455810547, + "learning_rate": 1.9950808568454943e-07, + "loss": 0.0341, + "step": 16780 + }, + { + "epoch": 0.06708017459223524, + "grad_norm": 8.433223724365234, + "learning_rate": 1.995074625339217e-07, + "loss": 0.0297, + "step": 16790 + }, + { + "epoch": 0.06712012704881191, + "grad_norm": 4.176825523376465, + "learning_rate": 1.9950683898981865e-07, + "loss": 0.0338, + "step": 16800 + }, + { + "epoch": 0.06716007950538859, + "grad_norm": 4.547979354858398, + "learning_rate": 1.9950621505224271e-07, + "loss": 0.0311, + "step": 16810 + }, + { + "epoch": 0.06720003196196526, + "grad_norm": 3.397244453430176, + "learning_rate": 1.9950559072119644e-07, + "loss": 0.0312, + "step": 16820 + }, + { + "epoch": 0.06723998441854194, + "grad_norm": 4.080970287322998, + "learning_rate": 1.9950496599668225e-07, + "loss": 0.0323, + "step": 16830 + }, + { + "epoch": 0.06727993687511861, + "grad_norm": 6.489947319030762, + "learning_rate": 1.9950434087870264e-07, + "loss": 0.0325, + "step": 16840 + }, + { + "epoch": 0.06731988933169528, + "grad_norm": 5.586682319641113, + "learning_rate": 1.9950371536726e-07, + "loss": 0.032, + "step": 16850 + }, + { + "epoch": 0.06735984178827195, + "grad_norm": 4.313248157501221, + "learning_rate": 1.9950308946235687e-07, + "loss": 0.0374, + "step": 16860 + }, + { + "epoch": 0.06739979424484863, + "grad_norm": 4.060424327850342, + "learning_rate": 1.9950246316399574e-07, + "loss": 0.0341, + "step": 16870 + }, + { + "epoch": 0.0674397467014253, + "grad_norm": 5.266396522521973, + "learning_rate": 1.995018364721791e-07, + "loss": 0.0327, + "step": 16880 + }, + { + "epoch": 0.06747969915800198, + "grad_norm": 5.949278354644775, + "learning_rate": 1.9950120938690932e-07, + "loss": 0.0371, + "step": 16890 + }, + { + "epoch": 0.06751965161457865, + "grad_norm": 3.426069736480713, + "learning_rate": 1.99500581908189e-07, + "loss": 0.0333, + "step": 16900 + }, + { + "epoch": 0.06755960407115533, + "grad_norm": 4.401515960693359, + "learning_rate": 1.9949995403602052e-07, + "loss": 0.0285, + "step": 16910 + }, + { + "epoch": 0.067599556527732, + "grad_norm": 6.739278316497803, + "learning_rate": 1.9949932577040648e-07, + "loss": 0.0333, + "step": 16920 + }, + { + "epoch": 0.06763950898430868, + "grad_norm": 10.184057235717773, + "learning_rate": 1.9949869711134925e-07, + "loss": 0.032, + "step": 16930 + }, + { + "epoch": 0.06767946144088535, + "grad_norm": 4.752099990844727, + "learning_rate": 1.9949806805885139e-07, + "loss": 0.0303, + "step": 16940 + }, + { + "epoch": 0.06771941389746201, + "grad_norm": 2.5675899982452393, + "learning_rate": 1.9949743861291537e-07, + "loss": 0.0305, + "step": 16950 + }, + { + "epoch": 0.06775936635403869, + "grad_norm": 2.2600769996643066, + "learning_rate": 1.9949680877354368e-07, + "loss": 0.0276, + "step": 16960 + }, + { + "epoch": 0.06779931881061536, + "grad_norm": 5.14525032043457, + "learning_rate": 1.9949617854073875e-07, + "loss": 0.0311, + "step": 16970 + }, + { + "epoch": 0.06783927126719204, + "grad_norm": 14.264254570007324, + "learning_rate": 1.9949554791450317e-07, + "loss": 0.0397, + "step": 16980 + }, + { + "epoch": 0.06787922372376871, + "grad_norm": 6.908894062042236, + "learning_rate": 1.9949491689483937e-07, + "loss": 0.0335, + "step": 16990 + }, + { + "epoch": 0.06791917618034539, + "grad_norm": 3.5037779808044434, + "learning_rate": 1.9949428548174986e-07, + "loss": 0.0356, + "step": 17000 + }, + { + "epoch": 0.06795912863692206, + "grad_norm": 2.366603374481201, + "learning_rate": 1.9949365367523713e-07, + "loss": 0.0347, + "step": 17010 + }, + { + "epoch": 0.06799908109349874, + "grad_norm": 3.8914506435394287, + "learning_rate": 1.9949302147530372e-07, + "loss": 0.0355, + "step": 17020 + }, + { + "epoch": 0.06803903355007541, + "grad_norm": 3.6226541996002197, + "learning_rate": 1.994923888819521e-07, + "loss": 0.0346, + "step": 17030 + }, + { + "epoch": 0.06807898600665209, + "grad_norm": 4.676554203033447, + "learning_rate": 1.9949175589518474e-07, + "loss": 0.0301, + "step": 17040 + }, + { + "epoch": 0.06811893846322875, + "grad_norm": 3.932526111602783, + "learning_rate": 1.9949112251500417e-07, + "loss": 0.0337, + "step": 17050 + }, + { + "epoch": 0.06815889091980543, + "grad_norm": 3.3862240314483643, + "learning_rate": 1.9949048874141293e-07, + "loss": 0.0273, + "step": 17060 + }, + { + "epoch": 0.0681988433763821, + "grad_norm": 3.74690842628479, + "learning_rate": 1.9948985457441348e-07, + "loss": 0.0332, + "step": 17070 + }, + { + "epoch": 0.06823879583295878, + "grad_norm": 4.2274627685546875, + "learning_rate": 1.994892200140083e-07, + "loss": 0.0338, + "step": 17080 + }, + { + "epoch": 0.06827874828953545, + "grad_norm": 5.8503522872924805, + "learning_rate": 1.994885850602e-07, + "loss": 0.0307, + "step": 17090 + }, + { + "epoch": 0.06831870074611213, + "grad_norm": 4.2426557540893555, + "learning_rate": 1.99487949712991e-07, + "loss": 0.0324, + "step": 17100 + }, + { + "epoch": 0.0683586532026888, + "grad_norm": 9.06663990020752, + "learning_rate": 1.9948731397238389e-07, + "loss": 0.0301, + "step": 17110 + }, + { + "epoch": 0.06839860565926548, + "grad_norm": 6.156297206878662, + "learning_rate": 1.9948667783838107e-07, + "loss": 0.0372, + "step": 17120 + }, + { + "epoch": 0.06843855811584215, + "grad_norm": 4.519973278045654, + "learning_rate": 1.9948604131098516e-07, + "loss": 0.0299, + "step": 17130 + }, + { + "epoch": 0.06847851057241883, + "grad_norm": 5.231022834777832, + "learning_rate": 1.9948540439019862e-07, + "loss": 0.0311, + "step": 17140 + }, + { + "epoch": 0.0685184630289955, + "grad_norm": 4.272489070892334, + "learning_rate": 1.99484767076024e-07, + "loss": 0.0328, + "step": 17150 + }, + { + "epoch": 0.06855841548557216, + "grad_norm": 3.590024948120117, + "learning_rate": 1.9948412936846379e-07, + "loss": 0.0284, + "step": 17160 + }, + { + "epoch": 0.06859836794214884, + "grad_norm": 4.151247978210449, + "learning_rate": 1.9948349126752055e-07, + "loss": 0.0335, + "step": 17170 + }, + { + "epoch": 0.06863832039872551, + "grad_norm": 4.129812240600586, + "learning_rate": 1.9948285277319677e-07, + "loss": 0.0341, + "step": 17180 + }, + { + "epoch": 0.06867827285530219, + "grad_norm": 4.660896301269531, + "learning_rate": 1.99482213885495e-07, + "loss": 0.0326, + "step": 17190 + }, + { + "epoch": 0.06871822531187886, + "grad_norm": 5.388843059539795, + "learning_rate": 1.9948157460441775e-07, + "loss": 0.0294, + "step": 17200 + }, + { + "epoch": 0.06875817776845554, + "grad_norm": 5.557071208953857, + "learning_rate": 1.9948093492996754e-07, + "loss": 0.0323, + "step": 17210 + }, + { + "epoch": 0.06879813022503221, + "grad_norm": 3.920119524002075, + "learning_rate": 1.9948029486214692e-07, + "loss": 0.0282, + "step": 17220 + }, + { + "epoch": 0.06883808268160889, + "grad_norm": 6.210538864135742, + "learning_rate": 1.9947965440095842e-07, + "loss": 0.0288, + "step": 17230 + }, + { + "epoch": 0.06887803513818556, + "grad_norm": 5.372893810272217, + "learning_rate": 1.9947901354640457e-07, + "loss": 0.0319, + "step": 17240 + }, + { + "epoch": 0.06891798759476224, + "grad_norm": 2.413724899291992, + "learning_rate": 1.9947837229848788e-07, + "loss": 0.0297, + "step": 17250 + }, + { + "epoch": 0.0689579400513389, + "grad_norm": 4.7887654304504395, + "learning_rate": 1.9947773065721089e-07, + "loss": 0.0345, + "step": 17260 + }, + { + "epoch": 0.06899789250791558, + "grad_norm": 4.909997940063477, + "learning_rate": 1.9947708862257619e-07, + "loss": 0.0341, + "step": 17270 + }, + { + "epoch": 0.06903784496449225, + "grad_norm": 3.6699676513671875, + "learning_rate": 1.9947644619458627e-07, + "loss": 0.0305, + "step": 17280 + }, + { + "epoch": 0.06907779742106893, + "grad_norm": 6.654294490814209, + "learning_rate": 1.9947580337324364e-07, + "loss": 0.0282, + "step": 17290 + }, + { + "epoch": 0.0691177498776456, + "grad_norm": 5.834504127502441, + "learning_rate": 1.9947516015855094e-07, + "loss": 0.0302, + "step": 17300 + }, + { + "epoch": 0.06915770233422228, + "grad_norm": 8.998693466186523, + "learning_rate": 1.994745165505106e-07, + "loss": 0.0332, + "step": 17310 + }, + { + "epoch": 0.06919765479079895, + "grad_norm": 2.8466033935546875, + "learning_rate": 1.9947387254912525e-07, + "loss": 0.032, + "step": 17320 + }, + { + "epoch": 0.06923760724737563, + "grad_norm": 2.2874574661254883, + "learning_rate": 1.994732281543974e-07, + "loss": 0.0274, + "step": 17330 + }, + { + "epoch": 0.0692775597039523, + "grad_norm": 6.664508819580078, + "learning_rate": 1.9947258336632965e-07, + "loss": 0.0275, + "step": 17340 + }, + { + "epoch": 0.06931751216052898, + "grad_norm": 5.563503742218018, + "learning_rate": 1.9947193818492446e-07, + "loss": 0.035, + "step": 17350 + }, + { + "epoch": 0.06935746461710564, + "grad_norm": 8.736769676208496, + "learning_rate": 1.9947129261018443e-07, + "loss": 0.0307, + "step": 17360 + }, + { + "epoch": 0.06939741707368231, + "grad_norm": 10.021705627441406, + "learning_rate": 1.9947064664211213e-07, + "loss": 0.0284, + "step": 17370 + }, + { + "epoch": 0.06943736953025899, + "grad_norm": 7.21409797668457, + "learning_rate": 1.9947000028071005e-07, + "loss": 0.0336, + "step": 17380 + }, + { + "epoch": 0.06947732198683566, + "grad_norm": 5.61565637588501, + "learning_rate": 1.9946935352598084e-07, + "loss": 0.0389, + "step": 17390 + }, + { + "epoch": 0.06951727444341234, + "grad_norm": 5.879587173461914, + "learning_rate": 1.9946870637792698e-07, + "loss": 0.0329, + "step": 17400 + }, + { + "epoch": 0.06955722689998901, + "grad_norm": 8.909744262695312, + "learning_rate": 1.9946805883655107e-07, + "loss": 0.0357, + "step": 17410 + }, + { + "epoch": 0.06959717935656569, + "grad_norm": 7.981522083282471, + "learning_rate": 1.9946741090185565e-07, + "loss": 0.034, + "step": 17420 + }, + { + "epoch": 0.06963713181314236, + "grad_norm": 3.779330015182495, + "learning_rate": 1.9946676257384328e-07, + "loss": 0.035, + "step": 17430 + }, + { + "epoch": 0.06967708426971904, + "grad_norm": 3.551500082015991, + "learning_rate": 1.9946611385251654e-07, + "loss": 0.0384, + "step": 17440 + }, + { + "epoch": 0.06971703672629571, + "grad_norm": 2.8989222049713135, + "learning_rate": 1.9946546473787796e-07, + "loss": 0.0305, + "step": 17450 + }, + { + "epoch": 0.06975698918287238, + "grad_norm": 3.967578887939453, + "learning_rate": 1.9946481522993017e-07, + "loss": 0.0309, + "step": 17460 + }, + { + "epoch": 0.06979694163944905, + "grad_norm": 4.0344929695129395, + "learning_rate": 1.9946416532867566e-07, + "loss": 0.036, + "step": 17470 + }, + { + "epoch": 0.06983689409602573, + "grad_norm": 8.140336036682129, + "learning_rate": 1.994635150341171e-07, + "loss": 0.0288, + "step": 17480 + }, + { + "epoch": 0.0698768465526024, + "grad_norm": 7.396175384521484, + "learning_rate": 1.9946286434625696e-07, + "loss": 0.0316, + "step": 17490 + }, + { + "epoch": 0.06991679900917908, + "grad_norm": 9.998981475830078, + "learning_rate": 1.9946221326509786e-07, + "loss": 0.0302, + "step": 17500 + }, + { + "epoch": 0.06995675146575575, + "grad_norm": 4.901121139526367, + "learning_rate": 1.994615617906424e-07, + "loss": 0.033, + "step": 17510 + }, + { + "epoch": 0.06999670392233243, + "grad_norm": 4.129850387573242, + "learning_rate": 1.994609099228931e-07, + "loss": 0.032, + "step": 17520 + }, + { + "epoch": 0.0700366563789091, + "grad_norm": 2.962146282196045, + "learning_rate": 1.9946025766185256e-07, + "loss": 0.0372, + "step": 17530 + }, + { + "epoch": 0.07007660883548578, + "grad_norm": 4.983671188354492, + "learning_rate": 1.9945960500752338e-07, + "loss": 0.0356, + "step": 17540 + }, + { + "epoch": 0.07011656129206245, + "grad_norm": 3.325476884841919, + "learning_rate": 1.9945895195990813e-07, + "loss": 0.0312, + "step": 17550 + }, + { + "epoch": 0.07015651374863911, + "grad_norm": 10.359166145324707, + "learning_rate": 1.9945829851900936e-07, + "loss": 0.0341, + "step": 17560 + }, + { + "epoch": 0.07019646620521579, + "grad_norm": 6.542383193969727, + "learning_rate": 1.994576446848297e-07, + "loss": 0.0351, + "step": 17570 + }, + { + "epoch": 0.07023641866179246, + "grad_norm": 3.999924421310425, + "learning_rate": 1.9945699045737172e-07, + "loss": 0.0304, + "step": 17580 + }, + { + "epoch": 0.07027637111836914, + "grad_norm": 2.8909735679626465, + "learning_rate": 1.9945633583663798e-07, + "loss": 0.0379, + "step": 17590 + }, + { + "epoch": 0.07031632357494581, + "grad_norm": 6.10703706741333, + "learning_rate": 1.9945568082263113e-07, + "loss": 0.0267, + "step": 17600 + }, + { + "epoch": 0.07035627603152249, + "grad_norm": 13.66802978515625, + "learning_rate": 1.9945502541535372e-07, + "loss": 0.0386, + "step": 17610 + }, + { + "epoch": 0.07039622848809916, + "grad_norm": 7.310481071472168, + "learning_rate": 1.994543696148083e-07, + "loss": 0.0329, + "step": 17620 + }, + { + "epoch": 0.07043618094467584, + "grad_norm": 7.305968284606934, + "learning_rate": 1.9945371342099753e-07, + "loss": 0.0262, + "step": 17630 + }, + { + "epoch": 0.07047613340125251, + "grad_norm": 6.71469259262085, + "learning_rate": 1.9945305683392397e-07, + "loss": 0.034, + "step": 17640 + }, + { + "epoch": 0.07051608585782919, + "grad_norm": 4.815409183502197, + "learning_rate": 1.9945239985359025e-07, + "loss": 0.0325, + "step": 17650 + }, + { + "epoch": 0.07055603831440586, + "grad_norm": 10.783384323120117, + "learning_rate": 1.9945174247999893e-07, + "loss": 0.0338, + "step": 17660 + }, + { + "epoch": 0.07059599077098253, + "grad_norm": 4.979619026184082, + "learning_rate": 1.994510847131526e-07, + "loss": 0.0382, + "step": 17670 + }, + { + "epoch": 0.0706359432275592, + "grad_norm": 18.108789443969727, + "learning_rate": 1.9945042655305392e-07, + "loss": 0.0308, + "step": 17680 + }, + { + "epoch": 0.07067589568413588, + "grad_norm": 4.367242813110352, + "learning_rate": 1.9944976799970545e-07, + "loss": 0.031, + "step": 17690 + }, + { + "epoch": 0.07071584814071255, + "grad_norm": 6.721596717834473, + "learning_rate": 1.994491090531098e-07, + "loss": 0.0308, + "step": 17700 + }, + { + "epoch": 0.07075580059728923, + "grad_norm": 4.558361053466797, + "learning_rate": 1.9944844971326962e-07, + "loss": 0.0305, + "step": 17710 + }, + { + "epoch": 0.0707957530538659, + "grad_norm": 7.289640426635742, + "learning_rate": 1.994477899801874e-07, + "loss": 0.034, + "step": 17720 + }, + { + "epoch": 0.07083570551044258, + "grad_norm": 5.036042213439941, + "learning_rate": 1.994471298538659e-07, + "loss": 0.0329, + "step": 17730 + }, + { + "epoch": 0.07087565796701925, + "grad_norm": 8.122034072875977, + "learning_rate": 1.994464693343076e-07, + "loss": 0.0342, + "step": 17740 + }, + { + "epoch": 0.07091561042359593, + "grad_norm": 4.217157363891602, + "learning_rate": 1.994458084215152e-07, + "loss": 0.0293, + "step": 17750 + }, + { + "epoch": 0.0709555628801726, + "grad_norm": 6.2500200271606445, + "learning_rate": 1.9944514711549125e-07, + "loss": 0.0332, + "step": 17760 + }, + { + "epoch": 0.07099551533674926, + "grad_norm": 5.202104568481445, + "learning_rate": 1.994444854162384e-07, + "loss": 0.0316, + "step": 17770 + }, + { + "epoch": 0.07103546779332594, + "grad_norm": 11.813491821289062, + "learning_rate": 1.9944382332375925e-07, + "loss": 0.0254, + "step": 17780 + }, + { + "epoch": 0.07107542024990261, + "grad_norm": 4.000515937805176, + "learning_rate": 1.9944316083805646e-07, + "loss": 0.0329, + "step": 17790 + }, + { + "epoch": 0.07111537270647929, + "grad_norm": 5.151356220245361, + "learning_rate": 1.994424979591326e-07, + "loss": 0.0323, + "step": 17800 + }, + { + "epoch": 0.07115532516305596, + "grad_norm": 4.217026233673096, + "learning_rate": 1.994418346869903e-07, + "loss": 0.0326, + "step": 17810 + }, + { + "epoch": 0.07119527761963264, + "grad_norm": 5.960097312927246, + "learning_rate": 1.994411710216322e-07, + "loss": 0.031, + "step": 17820 + }, + { + "epoch": 0.07123523007620931, + "grad_norm": 6.223817825317383, + "learning_rate": 1.9944050696306092e-07, + "loss": 0.0333, + "step": 17830 + }, + { + "epoch": 0.07127518253278599, + "grad_norm": 2.352207899093628, + "learning_rate": 1.994398425112791e-07, + "loss": 0.0314, + "step": 17840 + }, + { + "epoch": 0.07131513498936266, + "grad_norm": 2.376627206802368, + "learning_rate": 1.9943917766628932e-07, + "loss": 0.0342, + "step": 17850 + }, + { + "epoch": 0.07135508744593934, + "grad_norm": 6.219802379608154, + "learning_rate": 1.9943851242809426e-07, + "loss": 0.0348, + "step": 17860 + }, + { + "epoch": 0.071395039902516, + "grad_norm": 3.7364940643310547, + "learning_rate": 1.9943784679669652e-07, + "loss": 0.0328, + "step": 17870 + }, + { + "epoch": 0.07143499235909267, + "grad_norm": 3.251006603240967, + "learning_rate": 1.9943718077209875e-07, + "loss": 0.03, + "step": 17880 + }, + { + "epoch": 0.07147494481566935, + "grad_norm": 5.631760120391846, + "learning_rate": 1.9943651435430358e-07, + "loss": 0.0328, + "step": 17890 + }, + { + "epoch": 0.07151489727224603, + "grad_norm": 1.9941624402999878, + "learning_rate": 1.994358475433136e-07, + "loss": 0.0309, + "step": 17900 + }, + { + "epoch": 0.0715548497288227, + "grad_norm": 34.009918212890625, + "learning_rate": 1.994351803391315e-07, + "loss": 0.0319, + "step": 17910 + }, + { + "epoch": 0.07159480218539938, + "grad_norm": 4.4864935874938965, + "learning_rate": 1.9943451274175995e-07, + "loss": 0.035, + "step": 17920 + }, + { + "epoch": 0.07163475464197605, + "grad_norm": 4.558022975921631, + "learning_rate": 1.9943384475120152e-07, + "loss": 0.035, + "step": 17930 + }, + { + "epoch": 0.07167470709855273, + "grad_norm": 4.827419757843018, + "learning_rate": 1.9943317636745887e-07, + "loss": 0.0286, + "step": 17940 + }, + { + "epoch": 0.0717146595551294, + "grad_norm": 2.6064014434814453, + "learning_rate": 1.9943250759053464e-07, + "loss": 0.033, + "step": 17950 + }, + { + "epoch": 0.07175461201170608, + "grad_norm": 3.550555944442749, + "learning_rate": 1.994318384204315e-07, + "loss": 0.0336, + "step": 17960 + }, + { + "epoch": 0.07179456446828274, + "grad_norm": 3.925306558609009, + "learning_rate": 1.994311688571521e-07, + "loss": 0.0345, + "step": 17970 + }, + { + "epoch": 0.07183451692485941, + "grad_norm": 8.440332412719727, + "learning_rate": 1.99430498900699e-07, + "loss": 0.0326, + "step": 17980 + }, + { + "epoch": 0.07187446938143609, + "grad_norm": 7.786309242248535, + "learning_rate": 1.9942982855107497e-07, + "loss": 0.0278, + "step": 17990 + }, + { + "epoch": 0.07191442183801276, + "grad_norm": 3.6763811111450195, + "learning_rate": 1.994291578082826e-07, + "loss": 0.0339, + "step": 18000 + }, + { + "epoch": 0.07195437429458944, + "grad_norm": 2.644946575164795, + "learning_rate": 1.9942848667232456e-07, + "loss": 0.0291, + "step": 18010 + }, + { + "epoch": 0.07199432675116611, + "grad_norm": 3.157585859298706, + "learning_rate": 1.9942781514320346e-07, + "loss": 0.0295, + "step": 18020 + }, + { + "epoch": 0.07203427920774279, + "grad_norm": 6.450218677520752, + "learning_rate": 1.9942714322092201e-07, + "loss": 0.0298, + "step": 18030 + }, + { + "epoch": 0.07207423166431946, + "grad_norm": 6.96489953994751, + "learning_rate": 1.9942647090548284e-07, + "loss": 0.0274, + "step": 18040 + }, + { + "epoch": 0.07211418412089614, + "grad_norm": 5.013808250427246, + "learning_rate": 1.9942579819688864e-07, + "loss": 0.0341, + "step": 18050 + }, + { + "epoch": 0.07215413657747281, + "grad_norm": 2.854428291320801, + "learning_rate": 1.9942512509514203e-07, + "loss": 0.0307, + "step": 18060 + }, + { + "epoch": 0.07219408903404947, + "grad_norm": 3.5474538803100586, + "learning_rate": 1.9942445160024566e-07, + "loss": 0.029, + "step": 18070 + }, + { + "epoch": 0.07223404149062615, + "grad_norm": 2.960975170135498, + "learning_rate": 1.9942377771220223e-07, + "loss": 0.0296, + "step": 18080 + }, + { + "epoch": 0.07227399394720282, + "grad_norm": 6.3573431968688965, + "learning_rate": 1.994231034310144e-07, + "loss": 0.031, + "step": 18090 + }, + { + "epoch": 0.0723139464037795, + "grad_norm": 2.9318788051605225, + "learning_rate": 1.9942242875668485e-07, + "loss": 0.0334, + "step": 18100 + }, + { + "epoch": 0.07235389886035618, + "grad_norm": 7.589346885681152, + "learning_rate": 1.994217536892162e-07, + "loss": 0.0357, + "step": 18110 + }, + { + "epoch": 0.07239385131693285, + "grad_norm": 4.943765163421631, + "learning_rate": 1.9942107822861116e-07, + "loss": 0.0375, + "step": 18120 + }, + { + "epoch": 0.07243380377350953, + "grad_norm": 2.9642770290374756, + "learning_rate": 1.9942040237487237e-07, + "loss": 0.0345, + "step": 18130 + }, + { + "epoch": 0.0724737562300862, + "grad_norm": 4.5025858879089355, + "learning_rate": 1.9941972612800254e-07, + "loss": 0.0351, + "step": 18140 + }, + { + "epoch": 0.07251370868666288, + "grad_norm": 2.0538976192474365, + "learning_rate": 1.994190494880043e-07, + "loss": 0.0319, + "step": 18150 + }, + { + "epoch": 0.07255366114323955, + "grad_norm": 3.9308531284332275, + "learning_rate": 1.994183724548804e-07, + "loss": 0.0338, + "step": 18160 + }, + { + "epoch": 0.07259361359981621, + "grad_norm": 2.01715350151062, + "learning_rate": 1.9941769502863344e-07, + "loss": 0.032, + "step": 18170 + }, + { + "epoch": 0.07263356605639289, + "grad_norm": 13.375420570373535, + "learning_rate": 1.9941701720926613e-07, + "loss": 0.0292, + "step": 18180 + }, + { + "epoch": 0.07267351851296956, + "grad_norm": 6.226934432983398, + "learning_rate": 1.9941633899678112e-07, + "loss": 0.0274, + "step": 18190 + }, + { + "epoch": 0.07271347096954624, + "grad_norm": 3.1634671688079834, + "learning_rate": 1.9941566039118115e-07, + "loss": 0.0278, + "step": 18200 + }, + { + "epoch": 0.07275342342612291, + "grad_norm": 4.7596964836120605, + "learning_rate": 1.9941498139246885e-07, + "loss": 0.0314, + "step": 18210 + }, + { + "epoch": 0.07279337588269959, + "grad_norm": 4.098544120788574, + "learning_rate": 1.9941430200064695e-07, + "loss": 0.0281, + "step": 18220 + }, + { + "epoch": 0.07283332833927626, + "grad_norm": 4.835303783416748, + "learning_rate": 1.994136222157181e-07, + "loss": 0.0308, + "step": 18230 + }, + { + "epoch": 0.07287328079585294, + "grad_norm": 4.273542404174805, + "learning_rate": 1.9941294203768501e-07, + "loss": 0.0311, + "step": 18240 + }, + { + "epoch": 0.07291323325242961, + "grad_norm": 5.596797466278076, + "learning_rate": 1.9941226146655036e-07, + "loss": 0.0369, + "step": 18250 + }, + { + "epoch": 0.07295318570900629, + "grad_norm": 5.280571937561035, + "learning_rate": 1.9941158050231682e-07, + "loss": 0.0261, + "step": 18260 + }, + { + "epoch": 0.07299313816558296, + "grad_norm": 16.817594528198242, + "learning_rate": 1.9941089914498712e-07, + "loss": 0.0294, + "step": 18270 + }, + { + "epoch": 0.07303309062215962, + "grad_norm": 4.650108814239502, + "learning_rate": 1.9941021739456394e-07, + "loss": 0.0318, + "step": 18280 + }, + { + "epoch": 0.0730730430787363, + "grad_norm": 1.6510111093521118, + "learning_rate": 1.9940953525104994e-07, + "loss": 0.0307, + "step": 18290 + }, + { + "epoch": 0.07311299553531297, + "grad_norm": 2.9088215827941895, + "learning_rate": 1.9940885271444791e-07, + "loss": 0.0309, + "step": 18300 + }, + { + "epoch": 0.07315294799188965, + "grad_norm": 5.256911277770996, + "learning_rate": 1.9940816978476045e-07, + "loss": 0.0317, + "step": 18310 + }, + { + "epoch": 0.07319290044846632, + "grad_norm": 2.377007484436035, + "learning_rate": 1.994074864619903e-07, + "loss": 0.032, + "step": 18320 + }, + { + "epoch": 0.073232852905043, + "grad_norm": 2.828054189682007, + "learning_rate": 1.9940680274614018e-07, + "loss": 0.0324, + "step": 18330 + }, + { + "epoch": 0.07327280536161968, + "grad_norm": 4.193475246429443, + "learning_rate": 1.9940611863721278e-07, + "loss": 0.0278, + "step": 18340 + }, + { + "epoch": 0.07331275781819635, + "grad_norm": 3.458343267440796, + "learning_rate": 1.994054341352108e-07, + "loss": 0.0313, + "step": 18350 + }, + { + "epoch": 0.07335271027477303, + "grad_norm": 5.169400691986084, + "learning_rate": 1.9940474924013696e-07, + "loss": 0.0293, + "step": 18360 + }, + { + "epoch": 0.0733926627313497, + "grad_norm": 5.823927402496338, + "learning_rate": 1.9940406395199393e-07, + "loss": 0.0285, + "step": 18370 + }, + { + "epoch": 0.07343261518792636, + "grad_norm": 3.912001848220825, + "learning_rate": 1.9940337827078447e-07, + "loss": 0.0287, + "step": 18380 + }, + { + "epoch": 0.07347256764450304, + "grad_norm": 4.5439677238464355, + "learning_rate": 1.9940269219651124e-07, + "loss": 0.0333, + "step": 18390 + }, + { + "epoch": 0.07351252010107971, + "grad_norm": 5.859480381011963, + "learning_rate": 1.9940200572917701e-07, + "loss": 0.0328, + "step": 18400 + }, + { + "epoch": 0.07355247255765639, + "grad_norm": 10.146559715270996, + "learning_rate": 1.9940131886878445e-07, + "loss": 0.0315, + "step": 18410 + }, + { + "epoch": 0.07359242501423306, + "grad_norm": 6.215435028076172, + "learning_rate": 1.9940063161533628e-07, + "loss": 0.0339, + "step": 18420 + }, + { + "epoch": 0.07363237747080974, + "grad_norm": 5.249377250671387, + "learning_rate": 1.9939994396883523e-07, + "loss": 0.0327, + "step": 18430 + }, + { + "epoch": 0.07367232992738641, + "grad_norm": 55.96763610839844, + "learning_rate": 1.9939925592928405e-07, + "loss": 0.0321, + "step": 18440 + }, + { + "epoch": 0.07371228238396309, + "grad_norm": 4.233126163482666, + "learning_rate": 1.9939856749668542e-07, + "loss": 0.0297, + "step": 18450 + }, + { + "epoch": 0.07375223484053976, + "grad_norm": 11.8255033493042, + "learning_rate": 1.9939787867104206e-07, + "loss": 0.0305, + "step": 18460 + }, + { + "epoch": 0.07379218729711644, + "grad_norm": 3.5481619834899902, + "learning_rate": 1.993971894523567e-07, + "loss": 0.0298, + "step": 18470 + }, + { + "epoch": 0.0738321397536931, + "grad_norm": 5.644193649291992, + "learning_rate": 1.9939649984063208e-07, + "loss": 0.034, + "step": 18480 + }, + { + "epoch": 0.07387209221026977, + "grad_norm": 5.700722694396973, + "learning_rate": 1.9939580983587087e-07, + "loss": 0.0288, + "step": 18490 + }, + { + "epoch": 0.07391204466684645, + "grad_norm": 8.196548461914062, + "learning_rate": 1.9939511943807589e-07, + "loss": 0.0274, + "step": 18500 + }, + { + "epoch": 0.07395199712342312, + "grad_norm": 3.5429933071136475, + "learning_rate": 1.9939442864724983e-07, + "loss": 0.0316, + "step": 18510 + }, + { + "epoch": 0.0739919495799998, + "grad_norm": 5.5551605224609375, + "learning_rate": 1.9939373746339544e-07, + "loss": 0.029, + "step": 18520 + }, + { + "epoch": 0.07403190203657647, + "grad_norm": 3.4301552772521973, + "learning_rate": 1.9939304588651537e-07, + "loss": 0.0319, + "step": 18530 + }, + { + "epoch": 0.07407185449315315, + "grad_norm": 6.505978584289551, + "learning_rate": 1.9939235391661244e-07, + "loss": 0.0325, + "step": 18540 + }, + { + "epoch": 0.07411180694972982, + "grad_norm": 25.043777465820312, + "learning_rate": 1.9939166155368935e-07, + "loss": 0.0301, + "step": 18550 + }, + { + "epoch": 0.0741517594063065, + "grad_norm": 5.53081750869751, + "learning_rate": 1.9939096879774887e-07, + "loss": 0.0293, + "step": 18560 + }, + { + "epoch": 0.07419171186288318, + "grad_norm": 4.500675678253174, + "learning_rate": 1.993902756487937e-07, + "loss": 0.0334, + "step": 18570 + }, + { + "epoch": 0.07423166431945984, + "grad_norm": 2.7694263458251953, + "learning_rate": 1.9938958210682664e-07, + "loss": 0.0328, + "step": 18580 + }, + { + "epoch": 0.07427161677603651, + "grad_norm": 4.887722015380859, + "learning_rate": 1.9938888817185034e-07, + "loss": 0.0293, + "step": 18590 + }, + { + "epoch": 0.07431156923261319, + "grad_norm": 5.541425704956055, + "learning_rate": 1.993881938438676e-07, + "loss": 0.0295, + "step": 18600 + }, + { + "epoch": 0.07435152168918986, + "grad_norm": 7.266062259674072, + "learning_rate": 1.9938749912288118e-07, + "loss": 0.0313, + "step": 18610 + }, + { + "epoch": 0.07439147414576654, + "grad_norm": 4.053021430969238, + "learning_rate": 1.993868040088938e-07, + "loss": 0.0326, + "step": 18620 + }, + { + "epoch": 0.07443142660234321, + "grad_norm": 58.46577453613281, + "learning_rate": 1.993861085019082e-07, + "loss": 0.0318, + "step": 18630 + }, + { + "epoch": 0.07447137905891989, + "grad_norm": 5.2011799812316895, + "learning_rate": 1.9938541260192718e-07, + "loss": 0.035, + "step": 18640 + }, + { + "epoch": 0.07451133151549656, + "grad_norm": 4.068670272827148, + "learning_rate": 1.9938471630895343e-07, + "loss": 0.0295, + "step": 18650 + }, + { + "epoch": 0.07455128397207324, + "grad_norm": 6.214854717254639, + "learning_rate": 1.9938401962298976e-07, + "loss": 0.0348, + "step": 18660 + }, + { + "epoch": 0.07459123642864991, + "grad_norm": 2.9383373260498047, + "learning_rate": 1.993833225440389e-07, + "loss": 0.0307, + "step": 18670 + }, + { + "epoch": 0.07463118888522657, + "grad_norm": 2.566362142562866, + "learning_rate": 1.9938262507210357e-07, + "loss": 0.0319, + "step": 18680 + }, + { + "epoch": 0.07467114134180325, + "grad_norm": 4.524789810180664, + "learning_rate": 1.9938192720718657e-07, + "loss": 0.0396, + "step": 18690 + }, + { + "epoch": 0.07471109379837992, + "grad_norm": 4.532100200653076, + "learning_rate": 1.9938122894929067e-07, + "loss": 0.0338, + "step": 18700 + }, + { + "epoch": 0.0747510462549566, + "grad_norm": 2.2835772037506104, + "learning_rate": 1.9938053029841857e-07, + "loss": 0.0295, + "step": 18710 + }, + { + "epoch": 0.07479099871153327, + "grad_norm": 5.2361273765563965, + "learning_rate": 1.993798312545731e-07, + "loss": 0.0292, + "step": 18720 + }, + { + "epoch": 0.07483095116810995, + "grad_norm": 6.56334924697876, + "learning_rate": 1.99379131817757e-07, + "loss": 0.0347, + "step": 18730 + }, + { + "epoch": 0.07487090362468662, + "grad_norm": 6.257125377655029, + "learning_rate": 1.9937843198797304e-07, + "loss": 0.0328, + "step": 18740 + }, + { + "epoch": 0.0749108560812633, + "grad_norm": 10.40974235534668, + "learning_rate": 1.9937773176522397e-07, + "loss": 0.0313, + "step": 18750 + }, + { + "epoch": 0.07495080853783997, + "grad_norm": 4.239706993103027, + "learning_rate": 1.9937703114951257e-07, + "loss": 0.0357, + "step": 18760 + }, + { + "epoch": 0.07499076099441665, + "grad_norm": 3.768031597137451, + "learning_rate": 1.9937633014084162e-07, + "loss": 0.0387, + "step": 18770 + }, + { + "epoch": 0.07503071345099331, + "grad_norm": 4.554696559906006, + "learning_rate": 1.9937562873921388e-07, + "loss": 0.0332, + "step": 18780 + }, + { + "epoch": 0.07507066590756999, + "grad_norm": 4.356616497039795, + "learning_rate": 1.9937492694463213e-07, + "loss": 0.0361, + "step": 18790 + }, + { + "epoch": 0.07511061836414666, + "grad_norm": 6.403358459472656, + "learning_rate": 1.9937422475709913e-07, + "loss": 0.0275, + "step": 18800 + }, + { + "epoch": 0.07515057082072334, + "grad_norm": 6.190196514129639, + "learning_rate": 1.9937352217661765e-07, + "loss": 0.0289, + "step": 18810 + }, + { + "epoch": 0.07519052327730001, + "grad_norm": 6.249524116516113, + "learning_rate": 1.9937281920319052e-07, + "loss": 0.0348, + "step": 18820 + }, + { + "epoch": 0.07523047573387669, + "grad_norm": 5.092658519744873, + "learning_rate": 1.9937211583682044e-07, + "loss": 0.0326, + "step": 18830 + }, + { + "epoch": 0.07527042819045336, + "grad_norm": 6.324077129364014, + "learning_rate": 1.993714120775103e-07, + "loss": 0.0304, + "step": 18840 + }, + { + "epoch": 0.07531038064703004, + "grad_norm": 4.10156774520874, + "learning_rate": 1.9937070792526278e-07, + "loss": 0.0292, + "step": 18850 + }, + { + "epoch": 0.07535033310360671, + "grad_norm": 8.13530158996582, + "learning_rate": 1.993700033800807e-07, + "loss": 0.0326, + "step": 18860 + }, + { + "epoch": 0.07539028556018339, + "grad_norm": 6.794101715087891, + "learning_rate": 1.9936929844196685e-07, + "loss": 0.0321, + "step": 18870 + }, + { + "epoch": 0.07543023801676006, + "grad_norm": 3.540358066558838, + "learning_rate": 1.9936859311092404e-07, + "loss": 0.0321, + "step": 18880 + }, + { + "epoch": 0.07547019047333672, + "grad_norm": 3.830427885055542, + "learning_rate": 1.9936788738695504e-07, + "loss": 0.0329, + "step": 18890 + }, + { + "epoch": 0.0755101429299134, + "grad_norm": 5.945143699645996, + "learning_rate": 1.9936718127006262e-07, + "loss": 0.0292, + "step": 18900 + }, + { + "epoch": 0.07555009538649007, + "grad_norm": 5.652834892272949, + "learning_rate": 1.993664747602496e-07, + "loss": 0.0297, + "step": 18910 + }, + { + "epoch": 0.07559004784306675, + "grad_norm": 2.5595405101776123, + "learning_rate": 1.9936576785751873e-07, + "loss": 0.0261, + "step": 18920 + }, + { + "epoch": 0.07563000029964342, + "grad_norm": 3.779371976852417, + "learning_rate": 1.9936506056187286e-07, + "loss": 0.0314, + "step": 18930 + }, + { + "epoch": 0.0756699527562201, + "grad_norm": 4.654202938079834, + "learning_rate": 1.9936435287331478e-07, + "loss": 0.03, + "step": 18940 + }, + { + "epoch": 0.07570990521279677, + "grad_norm": 3.4947872161865234, + "learning_rate": 1.9936364479184726e-07, + "loss": 0.031, + "step": 18950 + }, + { + "epoch": 0.07574985766937345, + "grad_norm": 10.669760704040527, + "learning_rate": 1.9936293631747313e-07, + "loss": 0.0335, + "step": 18960 + }, + { + "epoch": 0.07578981012595012, + "grad_norm": 2.882920265197754, + "learning_rate": 1.9936222745019516e-07, + "loss": 0.0273, + "step": 18970 + }, + { + "epoch": 0.0758297625825268, + "grad_norm": 2.6910693645477295, + "learning_rate": 1.9936151819001615e-07, + "loss": 0.0313, + "step": 18980 + }, + { + "epoch": 0.07586971503910346, + "grad_norm": 2.898571491241455, + "learning_rate": 1.9936080853693892e-07, + "loss": 0.0279, + "step": 18990 + }, + { + "epoch": 0.07590966749568014, + "grad_norm": 4.249948024749756, + "learning_rate": 1.993600984909663e-07, + "loss": 0.0277, + "step": 19000 + }, + { + "epoch": 0.07594961995225681, + "grad_norm": 5.3590312004089355, + "learning_rate": 1.9935938805210109e-07, + "loss": 0.0303, + "step": 19010 + }, + { + "epoch": 0.07598957240883349, + "grad_norm": 4.669713973999023, + "learning_rate": 1.9935867722034606e-07, + "loss": 0.0333, + "step": 19020 + }, + { + "epoch": 0.07602952486541016, + "grad_norm": 4.936066627502441, + "learning_rate": 1.9935796599570405e-07, + "loss": 0.0359, + "step": 19030 + }, + { + "epoch": 0.07606947732198684, + "grad_norm": 3.8303980827331543, + "learning_rate": 1.9935725437817787e-07, + "loss": 0.0356, + "step": 19040 + }, + { + "epoch": 0.07610942977856351, + "grad_norm": 4.281059741973877, + "learning_rate": 1.9935654236777033e-07, + "loss": 0.031, + "step": 19050 + }, + { + "epoch": 0.07614938223514019, + "grad_norm": 4.739426136016846, + "learning_rate": 1.9935582996448427e-07, + "loss": 0.0273, + "step": 19060 + }, + { + "epoch": 0.07618933469171686, + "grad_norm": 5.981328964233398, + "learning_rate": 1.9935511716832244e-07, + "loss": 0.0338, + "step": 19070 + }, + { + "epoch": 0.07622928714829354, + "grad_norm": 9.007943153381348, + "learning_rate": 1.9935440397928775e-07, + "loss": 0.0285, + "step": 19080 + }, + { + "epoch": 0.0762692396048702, + "grad_norm": 4.636965274810791, + "learning_rate": 1.9935369039738298e-07, + "loss": 0.0289, + "step": 19090 + }, + { + "epoch": 0.07630919206144687, + "grad_norm": 8.008565902709961, + "learning_rate": 1.993529764226109e-07, + "loss": 0.0328, + "step": 19100 + }, + { + "epoch": 0.07634914451802355, + "grad_norm": 5.238550186157227, + "learning_rate": 1.993522620549744e-07, + "loss": 0.0296, + "step": 19110 + }, + { + "epoch": 0.07638909697460022, + "grad_norm": 4.078050136566162, + "learning_rate": 1.9935154729447627e-07, + "loss": 0.0324, + "step": 19120 + }, + { + "epoch": 0.0764290494311769, + "grad_norm": 3.1894097328186035, + "learning_rate": 1.9935083214111936e-07, + "loss": 0.0324, + "step": 19130 + }, + { + "epoch": 0.07646900188775357, + "grad_norm": 4.833470344543457, + "learning_rate": 1.993501165949065e-07, + "loss": 0.0282, + "step": 19140 + }, + { + "epoch": 0.07650895434433025, + "grad_norm": 3.3836967945098877, + "learning_rate": 1.993494006558405e-07, + "loss": 0.0331, + "step": 19150 + }, + { + "epoch": 0.07654890680090692, + "grad_norm": 7.108190059661865, + "learning_rate": 1.9934868432392418e-07, + "loss": 0.0273, + "step": 19160 + }, + { + "epoch": 0.0765888592574836, + "grad_norm": 4.915879249572754, + "learning_rate": 1.9934796759916038e-07, + "loss": 0.0316, + "step": 19170 + }, + { + "epoch": 0.07662881171406027, + "grad_norm": 4.654725074768066, + "learning_rate": 1.9934725048155197e-07, + "loss": 0.037, + "step": 19180 + }, + { + "epoch": 0.07666876417063694, + "grad_norm": 5.730825424194336, + "learning_rate": 1.9934653297110174e-07, + "loss": 0.0336, + "step": 19190 + }, + { + "epoch": 0.07670871662721361, + "grad_norm": 6.1118950843811035, + "learning_rate": 1.993458150678126e-07, + "loss": 0.0277, + "step": 19200 + }, + { + "epoch": 0.07674866908379029, + "grad_norm": 4.939913272857666, + "learning_rate": 1.993450967716873e-07, + "loss": 0.0289, + "step": 19210 + }, + { + "epoch": 0.07678862154036696, + "grad_norm": 4.205333709716797, + "learning_rate": 1.993443780827287e-07, + "loss": 0.0293, + "step": 19220 + }, + { + "epoch": 0.07682857399694364, + "grad_norm": 12.205085754394531, + "learning_rate": 1.9934365900093967e-07, + "loss": 0.0261, + "step": 19230 + }, + { + "epoch": 0.07686852645352031, + "grad_norm": 4.492034435272217, + "learning_rate": 1.9934293952632304e-07, + "loss": 0.0305, + "step": 19240 + }, + { + "epoch": 0.07690847891009699, + "grad_norm": 5.672061443328857, + "learning_rate": 1.9934221965888166e-07, + "loss": 0.0309, + "step": 19250 + }, + { + "epoch": 0.07694843136667366, + "grad_norm": 3.2150797843933105, + "learning_rate": 1.9934149939861839e-07, + "loss": 0.0297, + "step": 19260 + }, + { + "epoch": 0.07698838382325034, + "grad_norm": 4.379467964172363, + "learning_rate": 1.9934077874553603e-07, + "loss": 0.0296, + "step": 19270 + }, + { + "epoch": 0.07702833627982701, + "grad_norm": 3.649623155593872, + "learning_rate": 1.993400576996375e-07, + "loss": 0.0276, + "step": 19280 + }, + { + "epoch": 0.07706828873640367, + "grad_norm": 5.412050724029541, + "learning_rate": 1.9933933626092557e-07, + "loss": 0.0288, + "step": 19290 + }, + { + "epoch": 0.07710824119298035, + "grad_norm": 3.1777687072753906, + "learning_rate": 1.9933861442940315e-07, + "loss": 0.0271, + "step": 19300 + }, + { + "epoch": 0.07714819364955702, + "grad_norm": 6.6531243324279785, + "learning_rate": 1.993378922050731e-07, + "loss": 0.0334, + "step": 19310 + }, + { + "epoch": 0.0771881461061337, + "grad_norm": 5.872490406036377, + "learning_rate": 1.9933716958793823e-07, + "loss": 0.03, + "step": 19320 + }, + { + "epoch": 0.07722809856271037, + "grad_norm": 3.7594568729400635, + "learning_rate": 1.9933644657800142e-07, + "loss": 0.0322, + "step": 19330 + }, + { + "epoch": 0.07726805101928705, + "grad_norm": 3.2085213661193848, + "learning_rate": 1.9933572317526555e-07, + "loss": 0.0288, + "step": 19340 + }, + { + "epoch": 0.07730800347586372, + "grad_norm": 7.610959053039551, + "learning_rate": 1.9933499937973345e-07, + "loss": 0.0348, + "step": 19350 + }, + { + "epoch": 0.0773479559324404, + "grad_norm": 2.8320109844207764, + "learning_rate": 1.99334275191408e-07, + "loss": 0.0329, + "step": 19360 + }, + { + "epoch": 0.07738790838901707, + "grad_norm": 7.830523490905762, + "learning_rate": 1.9933355061029204e-07, + "loss": 0.0275, + "step": 19370 + }, + { + "epoch": 0.07742786084559375, + "grad_norm": 4.915370464324951, + "learning_rate": 1.9933282563638846e-07, + "loss": 0.0328, + "step": 19380 + }, + { + "epoch": 0.07746781330217042, + "grad_norm": 5.0761799812316895, + "learning_rate": 1.9933210026970013e-07, + "loss": 0.0301, + "step": 19390 + }, + { + "epoch": 0.07750776575874709, + "grad_norm": 5.905306816101074, + "learning_rate": 1.993313745102299e-07, + "loss": 0.0301, + "step": 19400 + }, + { + "epoch": 0.07754771821532376, + "grad_norm": 5.789778709411621, + "learning_rate": 1.9933064835798065e-07, + "loss": 0.0323, + "step": 19410 + }, + { + "epoch": 0.07758767067190044, + "grad_norm": 6.123649597167969, + "learning_rate": 1.9932992181295525e-07, + "loss": 0.0347, + "step": 19420 + }, + { + "epoch": 0.07762762312847711, + "grad_norm": 3.8950507640838623, + "learning_rate": 1.9932919487515657e-07, + "loss": 0.0273, + "step": 19430 + }, + { + "epoch": 0.07766757558505379, + "grad_norm": 3.5783369541168213, + "learning_rate": 1.9932846754458747e-07, + "loss": 0.0286, + "step": 19440 + }, + { + "epoch": 0.07770752804163046, + "grad_norm": 3.639143705368042, + "learning_rate": 1.9932773982125086e-07, + "loss": 0.0262, + "step": 19450 + }, + { + "epoch": 0.07774748049820714, + "grad_norm": 6.7082037925720215, + "learning_rate": 1.9932701170514962e-07, + "loss": 0.031, + "step": 19460 + }, + { + "epoch": 0.07778743295478381, + "grad_norm": 3.784010887145996, + "learning_rate": 1.9932628319628656e-07, + "loss": 0.0327, + "step": 19470 + }, + { + "epoch": 0.07782738541136049, + "grad_norm": 3.2239675521850586, + "learning_rate": 1.9932555429466467e-07, + "loss": 0.0234, + "step": 19480 + }, + { + "epoch": 0.07786733786793716, + "grad_norm": 5.854137420654297, + "learning_rate": 1.9932482500028671e-07, + "loss": 0.0299, + "step": 19490 + }, + { + "epoch": 0.07790729032451382, + "grad_norm": 5.25482177734375, + "learning_rate": 1.9932409531315567e-07, + "loss": 0.0371, + "step": 19500 + }, + { + "epoch": 0.0779472427810905, + "grad_norm": 3.3556909561157227, + "learning_rate": 1.9932336523327437e-07, + "loss": 0.028, + "step": 19510 + }, + { + "epoch": 0.07798719523766717, + "grad_norm": 41.19482421875, + "learning_rate": 1.9932263476064574e-07, + "loss": 0.0295, + "step": 19520 + }, + { + "epoch": 0.07802714769424385, + "grad_norm": 4.002786159515381, + "learning_rate": 1.9932190389527262e-07, + "loss": 0.0302, + "step": 19530 + }, + { + "epoch": 0.07806710015082052, + "grad_norm": 11.57844352722168, + "learning_rate": 1.9932117263715793e-07, + "loss": 0.0321, + "step": 19540 + }, + { + "epoch": 0.0781070526073972, + "grad_norm": 5.326249599456787, + "learning_rate": 1.9932044098630456e-07, + "loss": 0.0316, + "step": 19550 + }, + { + "epoch": 0.07814700506397387, + "grad_norm": 5.312546730041504, + "learning_rate": 1.993197089427154e-07, + "loss": 0.0329, + "step": 19560 + }, + { + "epoch": 0.07818695752055055, + "grad_norm": 4.101090908050537, + "learning_rate": 1.9931897650639336e-07, + "loss": 0.0362, + "step": 19570 + }, + { + "epoch": 0.07822690997712722, + "grad_norm": 4.179448127746582, + "learning_rate": 1.993182436773413e-07, + "loss": 0.0318, + "step": 19580 + }, + { + "epoch": 0.0782668624337039, + "grad_norm": 4.033622741699219, + "learning_rate": 1.9931751045556212e-07, + "loss": 0.0303, + "step": 19590 + }, + { + "epoch": 0.07830681489028056, + "grad_norm": 5.1436920166015625, + "learning_rate": 1.993167768410588e-07, + "loss": 0.0278, + "step": 19600 + }, + { + "epoch": 0.07834676734685724, + "grad_norm": 3.486423969268799, + "learning_rate": 1.993160428338341e-07, + "loss": 0.031, + "step": 19610 + }, + { + "epoch": 0.07838671980343391, + "grad_norm": 4.036117076873779, + "learning_rate": 1.9931530843389108e-07, + "loss": 0.0308, + "step": 19620 + }, + { + "epoch": 0.07842667226001059, + "grad_norm": 8.32155704498291, + "learning_rate": 1.9931457364123255e-07, + "loss": 0.0315, + "step": 19630 + }, + { + "epoch": 0.07846662471658726, + "grad_norm": 6.235025405883789, + "learning_rate": 1.993138384558614e-07, + "loss": 0.0333, + "step": 19640 + }, + { + "epoch": 0.07850657717316394, + "grad_norm": 4.955433368682861, + "learning_rate": 1.993131028777806e-07, + "loss": 0.0288, + "step": 19650 + }, + { + "epoch": 0.07854652962974061, + "grad_norm": 6.3393964767456055, + "learning_rate": 1.9931236690699302e-07, + "loss": 0.035, + "step": 19660 + }, + { + "epoch": 0.07858648208631729, + "grad_norm": 2.2710368633270264, + "learning_rate": 1.9931163054350156e-07, + "loss": 0.0297, + "step": 19670 + }, + { + "epoch": 0.07862643454289396, + "grad_norm": 2.8959269523620605, + "learning_rate": 1.9931089378730917e-07, + "loss": 0.0341, + "step": 19680 + }, + { + "epoch": 0.07866638699947064, + "grad_norm": 7.993409633636475, + "learning_rate": 1.9931015663841872e-07, + "loss": 0.0259, + "step": 19690 + }, + { + "epoch": 0.0787063394560473, + "grad_norm": 4.887556076049805, + "learning_rate": 1.9930941909683316e-07, + "loss": 0.0292, + "step": 19700 + }, + { + "epoch": 0.07874629191262397, + "grad_norm": 7.027575969696045, + "learning_rate": 1.9930868116255543e-07, + "loss": 0.0282, + "step": 19710 + }, + { + "epoch": 0.07878624436920065, + "grad_norm": 3.000983476638794, + "learning_rate": 1.9930794283558834e-07, + "loss": 0.0313, + "step": 19720 + }, + { + "epoch": 0.07882619682577732, + "grad_norm": 13.213887214660645, + "learning_rate": 1.9930720411593493e-07, + "loss": 0.0299, + "step": 19730 + }, + { + "epoch": 0.078866149282354, + "grad_norm": 12.377205848693848, + "learning_rate": 1.9930646500359805e-07, + "loss": 0.0284, + "step": 19740 + }, + { + "epoch": 0.07890610173893067, + "grad_norm": 7.5306715965271, + "learning_rate": 1.9930572549858067e-07, + "loss": 0.0364, + "step": 19750 + }, + { + "epoch": 0.07894605419550735, + "grad_norm": 1.5488656759262085, + "learning_rate": 1.9930498560088567e-07, + "loss": 0.0255, + "step": 19760 + }, + { + "epoch": 0.07898600665208402, + "grad_norm": 6.3472394943237305, + "learning_rate": 1.99304245310516e-07, + "loss": 0.0312, + "step": 19770 + }, + { + "epoch": 0.0790259591086607, + "grad_norm": 2.4783267974853516, + "learning_rate": 1.993035046274746e-07, + "loss": 0.0317, + "step": 19780 + }, + { + "epoch": 0.07906591156523737, + "grad_norm": 4.869140148162842, + "learning_rate": 1.9930276355176434e-07, + "loss": 0.0269, + "step": 19790 + }, + { + "epoch": 0.07910586402181403, + "grad_norm": 9.931827545166016, + "learning_rate": 1.9930202208338822e-07, + "loss": 0.0335, + "step": 19800 + }, + { + "epoch": 0.07914581647839071, + "grad_norm": 2.8274707794189453, + "learning_rate": 1.9930128022234914e-07, + "loss": 0.0251, + "step": 19810 + }, + { + "epoch": 0.07918576893496738, + "grad_norm": 7.017154216766357, + "learning_rate": 1.9930053796865005e-07, + "loss": 0.0274, + "step": 19820 + }, + { + "epoch": 0.07922572139154406, + "grad_norm": 7.301417827606201, + "learning_rate": 1.9929979532229385e-07, + "loss": 0.0304, + "step": 19830 + }, + { + "epoch": 0.07926567384812074, + "grad_norm": 3.493360996246338, + "learning_rate": 1.992990522832835e-07, + "loss": 0.0339, + "step": 19840 + }, + { + "epoch": 0.07930562630469741, + "grad_norm": 2.857882022857666, + "learning_rate": 1.9929830885162195e-07, + "loss": 0.0344, + "step": 19850 + }, + { + "epoch": 0.07934557876127409, + "grad_norm": 9.52408218383789, + "learning_rate": 1.9929756502731213e-07, + "loss": 0.0257, + "step": 19860 + }, + { + "epoch": 0.07938553121785076, + "grad_norm": 4.6855597496032715, + "learning_rate": 1.9929682081035697e-07, + "loss": 0.0313, + "step": 19870 + }, + { + "epoch": 0.07942548367442744, + "grad_norm": 4.38700532913208, + "learning_rate": 1.992960762007594e-07, + "loss": 0.0329, + "step": 19880 + }, + { + "epoch": 0.07946543613100411, + "grad_norm": 3.969442129135132, + "learning_rate": 1.9929533119852242e-07, + "loss": 0.0346, + "step": 19890 + }, + { + "epoch": 0.07950538858758077, + "grad_norm": 3.808331251144409, + "learning_rate": 1.9929458580364891e-07, + "loss": 0.032, + "step": 19900 + }, + { + "epoch": 0.07954534104415745, + "grad_norm": 5.476817607879639, + "learning_rate": 1.9929384001614187e-07, + "loss": 0.0322, + "step": 19910 + }, + { + "epoch": 0.07958529350073412, + "grad_norm": 3.843646764755249, + "learning_rate": 1.9929309383600424e-07, + "loss": 0.0314, + "step": 19920 + }, + { + "epoch": 0.0796252459573108, + "grad_norm": 2.963543653488159, + "learning_rate": 1.992923472632389e-07, + "loss": 0.028, + "step": 19930 + }, + { + "epoch": 0.07966519841388747, + "grad_norm": 11.538369178771973, + "learning_rate": 1.992916002978489e-07, + "loss": 0.0362, + "step": 19940 + }, + { + "epoch": 0.07970515087046415, + "grad_norm": 4.9244608879089355, + "learning_rate": 1.9929085293983716e-07, + "loss": 0.0274, + "step": 19950 + }, + { + "epoch": 0.07974510332704082, + "grad_norm": 5.008912563323975, + "learning_rate": 1.9929010518920664e-07, + "loss": 0.0305, + "step": 19960 + }, + { + "epoch": 0.0797850557836175, + "grad_norm": 2.7604455947875977, + "learning_rate": 1.9928935704596025e-07, + "loss": 0.0304, + "step": 19970 + }, + { + "epoch": 0.07982500824019417, + "grad_norm": 2.9451711177825928, + "learning_rate": 1.9928860851010101e-07, + "loss": 0.0344, + "step": 19980 + }, + { + "epoch": 0.07986496069677085, + "grad_norm": 4.067835807800293, + "learning_rate": 1.9928785958163184e-07, + "loss": 0.0293, + "step": 19990 + }, + { + "epoch": 0.07990491315334752, + "grad_norm": 6.125262260437012, + "learning_rate": 1.992871102605557e-07, + "loss": 0.0297, + "step": 20000 + }, + { + "epoch": 0.07994486560992418, + "grad_norm": 3.3513526916503906, + "learning_rate": 1.992863605468756e-07, + "loss": 0.0307, + "step": 20010 + }, + { + "epoch": 0.07998481806650086, + "grad_norm": 5.2369537353515625, + "learning_rate": 1.9928561044059445e-07, + "loss": 0.0301, + "step": 20020 + }, + { + "epoch": 0.08002477052307753, + "grad_norm": 4.495051383972168, + "learning_rate": 1.9928485994171526e-07, + "loss": 0.0302, + "step": 20030 + }, + { + "epoch": 0.08006472297965421, + "grad_norm": 3.91096568107605, + "learning_rate": 1.9928410905024096e-07, + "loss": 0.0337, + "step": 20040 + }, + { + "epoch": 0.08010467543623088, + "grad_norm": 5.642776966094971, + "learning_rate": 1.9928335776617454e-07, + "loss": 0.0343, + "step": 20050 + }, + { + "epoch": 0.08014462789280756, + "grad_norm": 6.14677619934082, + "learning_rate": 1.9928260608951897e-07, + "loss": 0.0287, + "step": 20060 + }, + { + "epoch": 0.08018458034938424, + "grad_norm": 2.9090936183929443, + "learning_rate": 1.992818540202772e-07, + "loss": 0.033, + "step": 20070 + }, + { + "epoch": 0.08022453280596091, + "grad_norm": 5.4579620361328125, + "learning_rate": 1.9928110155845226e-07, + "loss": 0.0311, + "step": 20080 + }, + { + "epoch": 0.08026448526253759, + "grad_norm": 3.875136375427246, + "learning_rate": 1.9928034870404705e-07, + "loss": 0.0317, + "step": 20090 + }, + { + "epoch": 0.08030443771911426, + "grad_norm": 4.898037910461426, + "learning_rate": 1.992795954570646e-07, + "loss": 0.0325, + "step": 20100 + }, + { + "epoch": 0.08034439017569092, + "grad_norm": 2.89672589302063, + "learning_rate": 1.992788418175079e-07, + "loss": 0.0335, + "step": 20110 + }, + { + "epoch": 0.0803843426322676, + "grad_norm": 3.785813570022583, + "learning_rate": 1.9927808778537987e-07, + "loss": 0.0288, + "step": 20120 + }, + { + "epoch": 0.08042429508884427, + "grad_norm": 3.5722334384918213, + "learning_rate": 1.992773333606835e-07, + "loss": 0.0295, + "step": 20130 + }, + { + "epoch": 0.08046424754542095, + "grad_norm": 5.433884143829346, + "learning_rate": 1.9927657854342184e-07, + "loss": 0.0297, + "step": 20140 + }, + { + "epoch": 0.08050420000199762, + "grad_norm": 12.786368370056152, + "learning_rate": 1.9927582333359782e-07, + "loss": 0.0285, + "step": 20150 + }, + { + "epoch": 0.0805441524585743, + "grad_norm": 5.6891374588012695, + "learning_rate": 1.9927506773121445e-07, + "loss": 0.0327, + "step": 20160 + }, + { + "epoch": 0.08058410491515097, + "grad_norm": 4.939685344696045, + "learning_rate": 1.992743117362747e-07, + "loss": 0.0273, + "step": 20170 + }, + { + "epoch": 0.08062405737172765, + "grad_norm": 8.045799255371094, + "learning_rate": 1.9927355534878155e-07, + "loss": 0.0311, + "step": 20180 + }, + { + "epoch": 0.08066400982830432, + "grad_norm": 3.0450007915496826, + "learning_rate": 1.9927279856873804e-07, + "loss": 0.0327, + "step": 20190 + }, + { + "epoch": 0.080703962284881, + "grad_norm": 2.6919808387756348, + "learning_rate": 1.9927204139614712e-07, + "loss": 0.0278, + "step": 20200 + }, + { + "epoch": 0.08074391474145766, + "grad_norm": 2.9354207515716553, + "learning_rate": 1.992712838310118e-07, + "loss": 0.0338, + "step": 20210 + }, + { + "epoch": 0.08078386719803433, + "grad_norm": 3.3964498043060303, + "learning_rate": 1.9927052587333505e-07, + "loss": 0.0312, + "step": 20220 + }, + { + "epoch": 0.08082381965461101, + "grad_norm": 10.14988899230957, + "learning_rate": 1.992697675231199e-07, + "loss": 0.0307, + "step": 20230 + }, + { + "epoch": 0.08086377211118768, + "grad_norm": 8.322985649108887, + "learning_rate": 1.9926900878036932e-07, + "loss": 0.0326, + "step": 20240 + }, + { + "epoch": 0.08090372456776436, + "grad_norm": 9.55587100982666, + "learning_rate": 1.9926824964508635e-07, + "loss": 0.0334, + "step": 20250 + }, + { + "epoch": 0.08094367702434103, + "grad_norm": 3.0211684703826904, + "learning_rate": 1.9926749011727395e-07, + "loss": 0.0293, + "step": 20260 + }, + { + "epoch": 0.08098362948091771, + "grad_norm": 2.402214765548706, + "learning_rate": 1.9926673019693517e-07, + "loss": 0.0295, + "step": 20270 + }, + { + "epoch": 0.08102358193749438, + "grad_norm": 4.541537761688232, + "learning_rate": 1.9926596988407293e-07, + "loss": 0.0322, + "step": 20280 + }, + { + "epoch": 0.08106353439407106, + "grad_norm": 2.8663346767425537, + "learning_rate": 1.9926520917869033e-07, + "loss": 0.027, + "step": 20290 + }, + { + "epoch": 0.08110348685064774, + "grad_norm": 7.131405353546143, + "learning_rate": 1.9926444808079036e-07, + "loss": 0.032, + "step": 20300 + }, + { + "epoch": 0.0811434393072244, + "grad_norm": 5.362752437591553, + "learning_rate": 1.9926368659037596e-07, + "loss": 0.0336, + "step": 20310 + }, + { + "epoch": 0.08118339176380107, + "grad_norm": 5.061661720275879, + "learning_rate": 1.9926292470745022e-07, + "loss": 0.0312, + "step": 20320 + }, + { + "epoch": 0.08122334422037775, + "grad_norm": 4.269407272338867, + "learning_rate": 1.992621624320161e-07, + "loss": 0.0335, + "step": 20330 + }, + { + "epoch": 0.08126329667695442, + "grad_norm": 3.234576940536499, + "learning_rate": 1.9926139976407668e-07, + "loss": 0.0271, + "step": 20340 + }, + { + "epoch": 0.0813032491335311, + "grad_norm": 4.951357364654541, + "learning_rate": 1.9926063670363488e-07, + "loss": 0.0318, + "step": 20350 + }, + { + "epoch": 0.08134320159010777, + "grad_norm": 8.109390258789062, + "learning_rate": 1.9925987325069381e-07, + "loss": 0.0323, + "step": 20360 + }, + { + "epoch": 0.08138315404668445, + "grad_norm": 5.599213123321533, + "learning_rate": 1.9925910940525642e-07, + "loss": 0.0307, + "step": 20370 + }, + { + "epoch": 0.08142310650326112, + "grad_norm": 4.48295259475708, + "learning_rate": 1.992583451673258e-07, + "loss": 0.0275, + "step": 20380 + }, + { + "epoch": 0.0814630589598378, + "grad_norm": 5.547143936157227, + "learning_rate": 1.9925758053690492e-07, + "loss": 0.0303, + "step": 20390 + }, + { + "epoch": 0.08150301141641447, + "grad_norm": 1.8308138847351074, + "learning_rate": 1.9925681551399678e-07, + "loss": 0.0265, + "step": 20400 + }, + { + "epoch": 0.08154296387299113, + "grad_norm": 2.3221280574798584, + "learning_rate": 1.9925605009860447e-07, + "loss": 0.029, + "step": 20410 + }, + { + "epoch": 0.08158291632956781, + "grad_norm": 3.7587385177612305, + "learning_rate": 1.99255284290731e-07, + "loss": 0.0284, + "step": 20420 + }, + { + "epoch": 0.08162286878614448, + "grad_norm": 4.5888566970825195, + "learning_rate": 1.9925451809037933e-07, + "loss": 0.0296, + "step": 20430 + }, + { + "epoch": 0.08166282124272116, + "grad_norm": 4.565674781799316, + "learning_rate": 1.9925375149755262e-07, + "loss": 0.0298, + "step": 20440 + }, + { + "epoch": 0.08170277369929783, + "grad_norm": 5.855221271514893, + "learning_rate": 1.9925298451225378e-07, + "loss": 0.0333, + "step": 20450 + }, + { + "epoch": 0.08174272615587451, + "grad_norm": 2.9527997970581055, + "learning_rate": 1.992522171344859e-07, + "loss": 0.0286, + "step": 20460 + }, + { + "epoch": 0.08178267861245118, + "grad_norm": 2.765468120574951, + "learning_rate": 1.99251449364252e-07, + "loss": 0.0338, + "step": 20470 + }, + { + "epoch": 0.08182263106902786, + "grad_norm": 2.846670389175415, + "learning_rate": 1.9925068120155514e-07, + "loss": 0.027, + "step": 20480 + }, + { + "epoch": 0.08186258352560453, + "grad_norm": 4.615137577056885, + "learning_rate": 1.9924991264639833e-07, + "loss": 0.0303, + "step": 20490 + }, + { + "epoch": 0.08190253598218121, + "grad_norm": 2.3244593143463135, + "learning_rate": 1.9924914369878458e-07, + "loss": 0.0301, + "step": 20500 + }, + { + "epoch": 0.08194248843875789, + "grad_norm": 4.517736911773682, + "learning_rate": 1.99248374358717e-07, + "loss": 0.0321, + "step": 20510 + }, + { + "epoch": 0.08198244089533455, + "grad_norm": 9.391547203063965, + "learning_rate": 1.9924760462619857e-07, + "loss": 0.0298, + "step": 20520 + }, + { + "epoch": 0.08202239335191122, + "grad_norm": 5.758331298828125, + "learning_rate": 1.992468345012324e-07, + "loss": 0.0266, + "step": 20530 + }, + { + "epoch": 0.0820623458084879, + "grad_norm": 4.431139945983887, + "learning_rate": 1.992460639838215e-07, + "loss": 0.0319, + "step": 20540 + }, + { + "epoch": 0.08210229826506457, + "grad_norm": 2.225196123123169, + "learning_rate": 1.9924529307396889e-07, + "loss": 0.0319, + "step": 20550 + }, + { + "epoch": 0.08214225072164125, + "grad_norm": 5.426529407501221, + "learning_rate": 1.9924452177167766e-07, + "loss": 0.0288, + "step": 20560 + }, + { + "epoch": 0.08218220317821792, + "grad_norm": 2.187887191772461, + "learning_rate": 1.9924375007695083e-07, + "loss": 0.0285, + "step": 20570 + }, + { + "epoch": 0.0822221556347946, + "grad_norm": 4.062244892120361, + "learning_rate": 1.9924297798979146e-07, + "loss": 0.0287, + "step": 20580 + }, + { + "epoch": 0.08226210809137127, + "grad_norm": 6.134011745452881, + "learning_rate": 1.992422055102026e-07, + "loss": 0.0326, + "step": 20590 + }, + { + "epoch": 0.08230206054794795, + "grad_norm": 5.560269832611084, + "learning_rate": 1.9924143263818732e-07, + "loss": 0.0328, + "step": 20600 + }, + { + "epoch": 0.08234201300452462, + "grad_norm": 3.654231548309326, + "learning_rate": 1.992406593737487e-07, + "loss": 0.0312, + "step": 20610 + }, + { + "epoch": 0.08238196546110128, + "grad_norm": 6.461172103881836, + "learning_rate": 1.9923988571688973e-07, + "loss": 0.029, + "step": 20620 + }, + { + "epoch": 0.08242191791767796, + "grad_norm": 3.09200382232666, + "learning_rate": 1.992391116676135e-07, + "loss": 0.0313, + "step": 20630 + }, + { + "epoch": 0.08246187037425463, + "grad_norm": 3.853585958480835, + "learning_rate": 1.9923833722592312e-07, + "loss": 0.0278, + "step": 20640 + }, + { + "epoch": 0.08250182283083131, + "grad_norm": 5.904369354248047, + "learning_rate": 1.9923756239182156e-07, + "loss": 0.0293, + "step": 20650 + }, + { + "epoch": 0.08254177528740798, + "grad_norm": 2.8223657608032227, + "learning_rate": 1.9923678716531195e-07, + "loss": 0.0288, + "step": 20660 + }, + { + "epoch": 0.08258172774398466, + "grad_norm": 4.411504745483398, + "learning_rate": 1.9923601154639735e-07, + "loss": 0.026, + "step": 20670 + }, + { + "epoch": 0.08262168020056133, + "grad_norm": 5.778646469116211, + "learning_rate": 1.992352355350808e-07, + "loss": 0.0299, + "step": 20680 + }, + { + "epoch": 0.08266163265713801, + "grad_norm": 2.445195198059082, + "learning_rate": 1.9923445913136535e-07, + "loss": 0.0299, + "step": 20690 + }, + { + "epoch": 0.08270158511371468, + "grad_norm": 6.461365699768066, + "learning_rate": 1.9923368233525414e-07, + "loss": 0.029, + "step": 20700 + }, + { + "epoch": 0.08274153757029136, + "grad_norm": 10.868332862854004, + "learning_rate": 1.9923290514675023e-07, + "loss": 0.0237, + "step": 20710 + }, + { + "epoch": 0.08278149002686802, + "grad_norm": 5.969699859619141, + "learning_rate": 1.9923212756585663e-07, + "loss": 0.032, + "step": 20720 + }, + { + "epoch": 0.0828214424834447, + "grad_norm": 3.0846450328826904, + "learning_rate": 1.9923134959257646e-07, + "loss": 0.0347, + "step": 20730 + }, + { + "epoch": 0.08286139494002137, + "grad_norm": 4.5443243980407715, + "learning_rate": 1.992305712269128e-07, + "loss": 0.0303, + "step": 20740 + }, + { + "epoch": 0.08290134739659805, + "grad_norm": 3.1168994903564453, + "learning_rate": 1.992297924688687e-07, + "loss": 0.0295, + "step": 20750 + }, + { + "epoch": 0.08294129985317472, + "grad_norm": 3.0230350494384766, + "learning_rate": 1.992290133184473e-07, + "loss": 0.0351, + "step": 20760 + }, + { + "epoch": 0.0829812523097514, + "grad_norm": 2.516590118408203, + "learning_rate": 1.9922823377565156e-07, + "loss": 0.0297, + "step": 20770 + }, + { + "epoch": 0.08302120476632807, + "grad_norm": 4.741844177246094, + "learning_rate": 1.9922745384048468e-07, + "loss": 0.0297, + "step": 20780 + }, + { + "epoch": 0.08306115722290475, + "grad_norm": 4.50992488861084, + "learning_rate": 1.992266735129497e-07, + "loss": 0.0274, + "step": 20790 + }, + { + "epoch": 0.08310110967948142, + "grad_norm": 5.2872090339660645, + "learning_rate": 1.9922589279304973e-07, + "loss": 0.0339, + "step": 20800 + }, + { + "epoch": 0.0831410621360581, + "grad_norm": 11.166078567504883, + "learning_rate": 1.992251116807878e-07, + "loss": 0.0325, + "step": 20810 + }, + { + "epoch": 0.08318101459263476, + "grad_norm": 3.440033197402954, + "learning_rate": 1.9922433017616707e-07, + "loss": 0.0288, + "step": 20820 + }, + { + "epoch": 0.08322096704921143, + "grad_norm": 2.739901065826416, + "learning_rate": 1.9922354827919057e-07, + "loss": 0.0312, + "step": 20830 + }, + { + "epoch": 0.08326091950578811, + "grad_norm": 2.5838634967803955, + "learning_rate": 1.9922276598986142e-07, + "loss": 0.031, + "step": 20840 + }, + { + "epoch": 0.08330087196236478, + "grad_norm": 3.6943154335021973, + "learning_rate": 1.9922198330818272e-07, + "loss": 0.0305, + "step": 20850 + }, + { + "epoch": 0.08334082441894146, + "grad_norm": 4.035580635070801, + "learning_rate": 1.9922120023415757e-07, + "loss": 0.0324, + "step": 20860 + }, + { + "epoch": 0.08338077687551813, + "grad_norm": 1.8706244230270386, + "learning_rate": 1.9922041676778902e-07, + "loss": 0.0285, + "step": 20870 + }, + { + "epoch": 0.08342072933209481, + "grad_norm": 4.06820011138916, + "learning_rate": 1.992196329090802e-07, + "loss": 0.0329, + "step": 20880 + }, + { + "epoch": 0.08346068178867148, + "grad_norm": 1.9070794582366943, + "learning_rate": 1.9921884865803423e-07, + "loss": 0.0235, + "step": 20890 + }, + { + "epoch": 0.08350063424524816, + "grad_norm": 4.9120707511901855, + "learning_rate": 1.9921806401465417e-07, + "loss": 0.0286, + "step": 20900 + }, + { + "epoch": 0.08354058670182483, + "grad_norm": 4.699753761291504, + "learning_rate": 1.9921727897894317e-07, + "loss": 0.0367, + "step": 20910 + }, + { + "epoch": 0.0835805391584015, + "grad_norm": 3.385662794113159, + "learning_rate": 1.992164935509043e-07, + "loss": 0.0342, + "step": 20920 + }, + { + "epoch": 0.08362049161497817, + "grad_norm": 6.25269079208374, + "learning_rate": 1.9921570773054066e-07, + "loss": 0.0315, + "step": 20930 + }, + { + "epoch": 0.08366044407155485, + "grad_norm": 4.676117897033691, + "learning_rate": 1.9921492151785536e-07, + "loss": 0.0297, + "step": 20940 + }, + { + "epoch": 0.08370039652813152, + "grad_norm": 2.198004722595215, + "learning_rate": 1.9921413491285152e-07, + "loss": 0.0309, + "step": 20950 + }, + { + "epoch": 0.0837403489847082, + "grad_norm": 3.409837484359741, + "learning_rate": 1.9921334791553228e-07, + "loss": 0.0329, + "step": 20960 + }, + { + "epoch": 0.08378030144128487, + "grad_norm": 13.39801025390625, + "learning_rate": 1.992125605259007e-07, + "loss": 0.0307, + "step": 20970 + }, + { + "epoch": 0.08382025389786155, + "grad_norm": 2.9631876945495605, + "learning_rate": 1.992117727439599e-07, + "loss": 0.0307, + "step": 20980 + }, + { + "epoch": 0.08386020635443822, + "grad_norm": 4.887350559234619, + "learning_rate": 1.9921098456971305e-07, + "loss": 0.0327, + "step": 20990 + }, + { + "epoch": 0.0839001588110149, + "grad_norm": 5.375432014465332, + "learning_rate": 1.992101960031632e-07, + "loss": 0.0301, + "step": 21000 + }, + { + "epoch": 0.08394011126759157, + "grad_norm": 2.3191912174224854, + "learning_rate": 1.992094070443135e-07, + "loss": 0.0267, + "step": 21010 + }, + { + "epoch": 0.08398006372416823, + "grad_norm": 5.292407512664795, + "learning_rate": 1.9920861769316706e-07, + "loss": 0.0308, + "step": 21020 + }, + { + "epoch": 0.08402001618074491, + "grad_norm": 2.809521198272705, + "learning_rate": 1.99207827949727e-07, + "loss": 0.0242, + "step": 21030 + }, + { + "epoch": 0.08405996863732158, + "grad_norm": 7.180039882659912, + "learning_rate": 1.9920703781399649e-07, + "loss": 0.027, + "step": 21040 + }, + { + "epoch": 0.08409992109389826, + "grad_norm": 3.531507730484009, + "learning_rate": 1.9920624728597856e-07, + "loss": 0.0321, + "step": 21050 + }, + { + "epoch": 0.08413987355047493, + "grad_norm": 3.9547982215881348, + "learning_rate": 1.9920545636567643e-07, + "loss": 0.03, + "step": 21060 + }, + { + "epoch": 0.08417982600705161, + "grad_norm": 36.783966064453125, + "learning_rate": 1.9920466505309316e-07, + "loss": 0.0276, + "step": 21070 + }, + { + "epoch": 0.08421977846362828, + "grad_norm": 10.994132041931152, + "learning_rate": 1.9920387334823192e-07, + "loss": 0.03, + "step": 21080 + }, + { + "epoch": 0.08425973092020496, + "grad_norm": 5.2808637619018555, + "learning_rate": 1.992030812510958e-07, + "loss": 0.0321, + "step": 21090 + }, + { + "epoch": 0.08429968337678163, + "grad_norm": 8.780991554260254, + "learning_rate": 1.99202288761688e-07, + "loss": 0.0275, + "step": 21100 + }, + { + "epoch": 0.08433963583335831, + "grad_norm": 2.442147731781006, + "learning_rate": 1.9920149588001157e-07, + "loss": 0.0274, + "step": 21110 + }, + { + "epoch": 0.08437958828993498, + "grad_norm": 12.275412559509277, + "learning_rate": 1.9920070260606972e-07, + "loss": 0.0343, + "step": 21120 + }, + { + "epoch": 0.08441954074651165, + "grad_norm": 4.726264476776123, + "learning_rate": 1.9919990893986553e-07, + "loss": 0.0268, + "step": 21130 + }, + { + "epoch": 0.08445949320308832, + "grad_norm": 5.4749908447265625, + "learning_rate": 1.991991148814022e-07, + "loss": 0.0301, + "step": 21140 + }, + { + "epoch": 0.084499445659665, + "grad_norm": 7.759589195251465, + "learning_rate": 1.991983204306828e-07, + "loss": 0.0294, + "step": 21150 + }, + { + "epoch": 0.08453939811624167, + "grad_norm": 8.025832176208496, + "learning_rate": 1.9919752558771048e-07, + "loss": 0.0298, + "step": 21160 + }, + { + "epoch": 0.08457935057281835, + "grad_norm": 3.486295461654663, + "learning_rate": 1.9919673035248845e-07, + "loss": 0.0303, + "step": 21170 + }, + { + "epoch": 0.08461930302939502, + "grad_norm": 4.656230926513672, + "learning_rate": 1.991959347250198e-07, + "loss": 0.0295, + "step": 21180 + }, + { + "epoch": 0.0846592554859717, + "grad_norm": 3.8710103034973145, + "learning_rate": 1.991951387053077e-07, + "loss": 0.0333, + "step": 21190 + }, + { + "epoch": 0.08469920794254837, + "grad_norm": 5.2065606117248535, + "learning_rate": 1.9919434229335525e-07, + "loss": 0.0271, + "step": 21200 + }, + { + "epoch": 0.08473916039912505, + "grad_norm": 3.4179227352142334, + "learning_rate": 1.9919354548916565e-07, + "loss": 0.0292, + "step": 21210 + }, + { + "epoch": 0.08477911285570172, + "grad_norm": 3.182903528213501, + "learning_rate": 1.9919274829274204e-07, + "loss": 0.0278, + "step": 21220 + }, + { + "epoch": 0.08481906531227838, + "grad_norm": 5.694780349731445, + "learning_rate": 1.9919195070408757e-07, + "loss": 0.0281, + "step": 21230 + }, + { + "epoch": 0.08485901776885506, + "grad_norm": 3.2846245765686035, + "learning_rate": 1.9919115272320538e-07, + "loss": 0.0285, + "step": 21240 + }, + { + "epoch": 0.08489897022543173, + "grad_norm": 12.316702842712402, + "learning_rate": 1.9919035435009866e-07, + "loss": 0.0314, + "step": 21250 + }, + { + "epoch": 0.08493892268200841, + "grad_norm": 6.759394645690918, + "learning_rate": 1.991895555847705e-07, + "loss": 0.0286, + "step": 21260 + }, + { + "epoch": 0.08497887513858508, + "grad_norm": 5.086463451385498, + "learning_rate": 1.9918875642722413e-07, + "loss": 0.0331, + "step": 21270 + }, + { + "epoch": 0.08501882759516176, + "grad_norm": 3.592823028564453, + "learning_rate": 1.9918795687746268e-07, + "loss": 0.0285, + "step": 21280 + }, + { + "epoch": 0.08505878005173843, + "grad_norm": 4.130397319793701, + "learning_rate": 1.9918715693548932e-07, + "loss": 0.0348, + "step": 21290 + }, + { + "epoch": 0.08509873250831511, + "grad_norm": 3.4720866680145264, + "learning_rate": 1.9918635660130717e-07, + "loss": 0.0233, + "step": 21300 + }, + { + "epoch": 0.08513868496489178, + "grad_norm": 3.5655477046966553, + "learning_rate": 1.9918555587491946e-07, + "loss": 0.0288, + "step": 21310 + }, + { + "epoch": 0.08517863742146846, + "grad_norm": 4.922514915466309, + "learning_rate": 1.9918475475632935e-07, + "loss": 0.0352, + "step": 21320 + }, + { + "epoch": 0.08521858987804512, + "grad_norm": 4.810778617858887, + "learning_rate": 1.9918395324553996e-07, + "loss": 0.029, + "step": 21330 + }, + { + "epoch": 0.0852585423346218, + "grad_norm": 4.7886834144592285, + "learning_rate": 1.991831513425545e-07, + "loss": 0.0295, + "step": 21340 + }, + { + "epoch": 0.08529849479119847, + "grad_norm": 3.084864616394043, + "learning_rate": 1.991823490473761e-07, + "loss": 0.0248, + "step": 21350 + }, + { + "epoch": 0.08533844724777515, + "grad_norm": 2.3347067832946777, + "learning_rate": 1.9918154636000797e-07, + "loss": 0.0295, + "step": 21360 + }, + { + "epoch": 0.08537839970435182, + "grad_norm": 6.95567512512207, + "learning_rate": 1.9918074328045328e-07, + "loss": 0.0341, + "step": 21370 + }, + { + "epoch": 0.0854183521609285, + "grad_norm": 2.384726047515869, + "learning_rate": 1.991799398087152e-07, + "loss": 0.0303, + "step": 21380 + }, + { + "epoch": 0.08545830461750517, + "grad_norm": 8.163933753967285, + "learning_rate": 1.9917913594479688e-07, + "loss": 0.0306, + "step": 21390 + }, + { + "epoch": 0.08549825707408185, + "grad_norm": 5.924780368804932, + "learning_rate": 1.9917833168870155e-07, + "loss": 0.0283, + "step": 21400 + }, + { + "epoch": 0.08553820953065852, + "grad_norm": 2.570429563522339, + "learning_rate": 1.9917752704043234e-07, + "loss": 0.0315, + "step": 21410 + }, + { + "epoch": 0.0855781619872352, + "grad_norm": 4.103671550750732, + "learning_rate": 1.9917672199999246e-07, + "loss": 0.0319, + "step": 21420 + }, + { + "epoch": 0.08561811444381186, + "grad_norm": 6.577326774597168, + "learning_rate": 1.991759165673851e-07, + "loss": 0.029, + "step": 21430 + }, + { + "epoch": 0.08565806690038853, + "grad_norm": 5.091348648071289, + "learning_rate": 1.9917511074261344e-07, + "loss": 0.0283, + "step": 21440 + }, + { + "epoch": 0.08569801935696521, + "grad_norm": 4.076991558074951, + "learning_rate": 1.9917430452568064e-07, + "loss": 0.0281, + "step": 21450 + }, + { + "epoch": 0.08573797181354188, + "grad_norm": 2.8533215522766113, + "learning_rate": 1.9917349791658993e-07, + "loss": 0.0251, + "step": 21460 + }, + { + "epoch": 0.08577792427011856, + "grad_norm": 14.729586601257324, + "learning_rate": 1.9917269091534448e-07, + "loss": 0.0362, + "step": 21470 + }, + { + "epoch": 0.08581787672669523, + "grad_norm": 2.4388928413391113, + "learning_rate": 1.9917188352194745e-07, + "loss": 0.028, + "step": 21480 + }, + { + "epoch": 0.08585782918327191, + "grad_norm": 3.2387280464172363, + "learning_rate": 1.9917107573640206e-07, + "loss": 0.0335, + "step": 21490 + }, + { + "epoch": 0.08589778163984858, + "grad_norm": 4.91099739074707, + "learning_rate": 1.9917026755871153e-07, + "loss": 0.0334, + "step": 21500 + }, + { + "epoch": 0.08593773409642526, + "grad_norm": 4.767516136169434, + "learning_rate": 1.99169458988879e-07, + "loss": 0.0294, + "step": 21510 + }, + { + "epoch": 0.08597768655300193, + "grad_norm": 13.731861114501953, + "learning_rate": 1.9916865002690775e-07, + "loss": 0.0302, + "step": 21520 + }, + { + "epoch": 0.0860176390095786, + "grad_norm": 9.437625885009766, + "learning_rate": 1.991678406728009e-07, + "loss": 0.0292, + "step": 21530 + }, + { + "epoch": 0.08605759146615527, + "grad_norm": 7.044157981872559, + "learning_rate": 1.9916703092656167e-07, + "loss": 0.0287, + "step": 21540 + }, + { + "epoch": 0.08609754392273195, + "grad_norm": 3.3409814834594727, + "learning_rate": 1.9916622078819328e-07, + "loss": 0.0289, + "step": 21550 + }, + { + "epoch": 0.08613749637930862, + "grad_norm": 4.742686748504639, + "learning_rate": 1.991654102576989e-07, + "loss": 0.0331, + "step": 21560 + }, + { + "epoch": 0.0861774488358853, + "grad_norm": 4.436203956604004, + "learning_rate": 1.9916459933508179e-07, + "loss": 0.0268, + "step": 21570 + }, + { + "epoch": 0.08621740129246197, + "grad_norm": 4.9632954597473145, + "learning_rate": 1.991637880203451e-07, + "loss": 0.0305, + "step": 21580 + }, + { + "epoch": 0.08625735374903865, + "grad_norm": 6.778901100158691, + "learning_rate": 1.9916297631349207e-07, + "loss": 0.0292, + "step": 21590 + }, + { + "epoch": 0.08629730620561532, + "grad_norm": 3.224247694015503, + "learning_rate": 1.991621642145259e-07, + "loss": 0.0295, + "step": 21600 + }, + { + "epoch": 0.086337258662192, + "grad_norm": 5.390450954437256, + "learning_rate": 1.9916135172344983e-07, + "loss": 0.0257, + "step": 21610 + }, + { + "epoch": 0.08637721111876867, + "grad_norm": 3.871021270751953, + "learning_rate": 1.9916053884026702e-07, + "loss": 0.034, + "step": 21620 + }, + { + "epoch": 0.08641716357534535, + "grad_norm": 5.126806735992432, + "learning_rate": 1.9915972556498072e-07, + "loss": 0.0341, + "step": 21630 + }, + { + "epoch": 0.08645711603192201, + "grad_norm": 3.77921986579895, + "learning_rate": 1.9915891189759413e-07, + "loss": 0.0278, + "step": 21640 + }, + { + "epoch": 0.08649706848849868, + "grad_norm": 5.103912830352783, + "learning_rate": 1.9915809783811048e-07, + "loss": 0.0298, + "step": 21650 + }, + { + "epoch": 0.08653702094507536, + "grad_norm": 2.8066344261169434, + "learning_rate": 1.9915728338653296e-07, + "loss": 0.0269, + "step": 21660 + }, + { + "epoch": 0.08657697340165203, + "grad_norm": 7.315302848815918, + "learning_rate": 1.9915646854286484e-07, + "loss": 0.0286, + "step": 21670 + }, + { + "epoch": 0.08661692585822871, + "grad_norm": 7.4948835372924805, + "learning_rate": 1.9915565330710932e-07, + "loss": 0.0317, + "step": 21680 + }, + { + "epoch": 0.08665687831480538, + "grad_norm": 3.7436575889587402, + "learning_rate": 1.991548376792696e-07, + "loss": 0.0332, + "step": 21690 + }, + { + "epoch": 0.08669683077138206, + "grad_norm": 3.4914896488189697, + "learning_rate": 1.9915402165934893e-07, + "loss": 0.0287, + "step": 21700 + }, + { + "epoch": 0.08673678322795873, + "grad_norm": 6.204156398773193, + "learning_rate": 1.9915320524735052e-07, + "loss": 0.0242, + "step": 21710 + }, + { + "epoch": 0.08677673568453541, + "grad_norm": 8.49747085571289, + "learning_rate": 1.991523884432776e-07, + "loss": 0.0337, + "step": 21720 + }, + { + "epoch": 0.08681668814111208, + "grad_norm": 5.445048809051514, + "learning_rate": 1.9915157124713347e-07, + "loss": 0.0248, + "step": 21730 + }, + { + "epoch": 0.08685664059768874, + "grad_norm": 4.580132484436035, + "learning_rate": 1.9915075365892123e-07, + "loss": 0.03, + "step": 21740 + }, + { + "epoch": 0.08689659305426542, + "grad_norm": 3.868684768676758, + "learning_rate": 1.991499356786442e-07, + "loss": 0.0284, + "step": 21750 + }, + { + "epoch": 0.0869365455108421, + "grad_norm": 3.3764026165008545, + "learning_rate": 1.9914911730630565e-07, + "loss": 0.026, + "step": 21760 + }, + { + "epoch": 0.08697649796741877, + "grad_norm": 5.130948066711426, + "learning_rate": 1.991482985419087e-07, + "loss": 0.0324, + "step": 21770 + }, + { + "epoch": 0.08701645042399545, + "grad_norm": 2.9651167392730713, + "learning_rate": 1.991474793854567e-07, + "loss": 0.0296, + "step": 21780 + }, + { + "epoch": 0.08705640288057212, + "grad_norm": 5.604620933532715, + "learning_rate": 1.9914665983695282e-07, + "loss": 0.0302, + "step": 21790 + }, + { + "epoch": 0.0870963553371488, + "grad_norm": 2.0848007202148438, + "learning_rate": 1.9914583989640033e-07, + "loss": 0.03, + "step": 21800 + }, + { + "epoch": 0.08713630779372547, + "grad_norm": 3.643782377243042, + "learning_rate": 1.9914501956380246e-07, + "loss": 0.0265, + "step": 21810 + }, + { + "epoch": 0.08717626025030215, + "grad_norm": 5.0743327140808105, + "learning_rate": 1.9914419883916245e-07, + "loss": 0.0248, + "step": 21820 + }, + { + "epoch": 0.08721621270687882, + "grad_norm": 3.6983346939086914, + "learning_rate": 1.9914337772248352e-07, + "loss": 0.0338, + "step": 21830 + }, + { + "epoch": 0.08725616516345548, + "grad_norm": 5.466250896453857, + "learning_rate": 1.9914255621376902e-07, + "loss": 0.0297, + "step": 21840 + }, + { + "epoch": 0.08729611762003216, + "grad_norm": 4.62263298034668, + "learning_rate": 1.9914173431302207e-07, + "loss": 0.0289, + "step": 21850 + }, + { + "epoch": 0.08733607007660883, + "grad_norm": 3.9924798011779785, + "learning_rate": 1.9914091202024601e-07, + "loss": 0.0288, + "step": 21860 + }, + { + "epoch": 0.08737602253318551, + "grad_norm": 3.42214035987854, + "learning_rate": 1.9914008933544405e-07, + "loss": 0.0294, + "step": 21870 + }, + { + "epoch": 0.08741597498976218, + "grad_norm": 4.048675060272217, + "learning_rate": 1.9913926625861944e-07, + "loss": 0.0248, + "step": 21880 + }, + { + "epoch": 0.08745592744633886, + "grad_norm": 2.1240601539611816, + "learning_rate": 1.991384427897755e-07, + "loss": 0.0349, + "step": 21890 + }, + { + "epoch": 0.08749587990291553, + "grad_norm": 3.957717180252075, + "learning_rate": 1.9913761892891537e-07, + "loss": 0.0348, + "step": 21900 + }, + { + "epoch": 0.08753583235949221, + "grad_norm": 7.228148460388184, + "learning_rate": 1.991367946760424e-07, + "loss": 0.0275, + "step": 21910 + }, + { + "epoch": 0.08757578481606888, + "grad_norm": 23.77587890625, + "learning_rate": 1.9913597003115982e-07, + "loss": 0.0299, + "step": 21920 + }, + { + "epoch": 0.08761573727264556, + "grad_norm": 3.47082781791687, + "learning_rate": 1.9913514499427087e-07, + "loss": 0.0326, + "step": 21930 + }, + { + "epoch": 0.08765568972922222, + "grad_norm": 5.165219306945801, + "learning_rate": 1.9913431956537886e-07, + "loss": 0.0303, + "step": 21940 + }, + { + "epoch": 0.0876956421857989, + "grad_norm": 3.125967264175415, + "learning_rate": 1.99133493744487e-07, + "loss": 0.0286, + "step": 21950 + }, + { + "epoch": 0.08773559464237557, + "grad_norm": 8.205841064453125, + "learning_rate": 1.9913266753159862e-07, + "loss": 0.0272, + "step": 21960 + }, + { + "epoch": 0.08777554709895224, + "grad_norm": 2.0752270221710205, + "learning_rate": 1.991318409267169e-07, + "loss": 0.0281, + "step": 21970 + }, + { + "epoch": 0.08781549955552892, + "grad_norm": 4.399165630340576, + "learning_rate": 1.9913101392984522e-07, + "loss": 0.0315, + "step": 21980 + }, + { + "epoch": 0.0878554520121056, + "grad_norm": 2.803875684738159, + "learning_rate": 1.9913018654098673e-07, + "loss": 0.0307, + "step": 21990 + }, + { + "epoch": 0.08789540446868227, + "grad_norm": 5.610924243927002, + "learning_rate": 1.991293587601448e-07, + "loss": 0.0264, + "step": 22000 + }, + { + "epoch": 0.08793535692525895, + "grad_norm": 4.200576305389404, + "learning_rate": 1.9912853058732266e-07, + "loss": 0.0298, + "step": 22010 + }, + { + "epoch": 0.08797530938183562, + "grad_norm": 9.817244529724121, + "learning_rate": 1.9912770202252357e-07, + "loss": 0.0321, + "step": 22020 + }, + { + "epoch": 0.0880152618384123, + "grad_norm": 4.690633773803711, + "learning_rate": 1.9912687306575085e-07, + "loss": 0.0294, + "step": 22030 + }, + { + "epoch": 0.08805521429498896, + "grad_norm": 3.592590093612671, + "learning_rate": 1.9912604371700773e-07, + "loss": 0.0329, + "step": 22040 + }, + { + "epoch": 0.08809516675156563, + "grad_norm": 3.5412986278533936, + "learning_rate": 1.9912521397629754e-07, + "loss": 0.0282, + "step": 22050 + }, + { + "epoch": 0.0881351192081423, + "grad_norm": 7.6807050704956055, + "learning_rate": 1.991243838436235e-07, + "loss": 0.0324, + "step": 22060 + }, + { + "epoch": 0.08817507166471898, + "grad_norm": 7.093482494354248, + "learning_rate": 1.9912355331898895e-07, + "loss": 0.0302, + "step": 22070 + }, + { + "epoch": 0.08821502412129566, + "grad_norm": 4.652997016906738, + "learning_rate": 1.9912272240239714e-07, + "loss": 0.0309, + "step": 22080 + }, + { + "epoch": 0.08825497657787233, + "grad_norm": 5.413193225860596, + "learning_rate": 1.9912189109385139e-07, + "loss": 0.029, + "step": 22090 + }, + { + "epoch": 0.08829492903444901, + "grad_norm": 3.8737056255340576, + "learning_rate": 1.991210593933549e-07, + "loss": 0.0283, + "step": 22100 + }, + { + "epoch": 0.08833488149102568, + "grad_norm": 5.716788291931152, + "learning_rate": 1.9912022730091108e-07, + "loss": 0.0311, + "step": 22110 + }, + { + "epoch": 0.08837483394760236, + "grad_norm": 5.064697265625, + "learning_rate": 1.9911939481652314e-07, + "loss": 0.0304, + "step": 22120 + }, + { + "epoch": 0.08841478640417903, + "grad_norm": 4.658206939697266, + "learning_rate": 1.991185619401944e-07, + "loss": 0.025, + "step": 22130 + }, + { + "epoch": 0.0884547388607557, + "grad_norm": 21.083620071411133, + "learning_rate": 1.9911772867192812e-07, + "loss": 0.0258, + "step": 22140 + }, + { + "epoch": 0.08849469131733237, + "grad_norm": 3.3487560749053955, + "learning_rate": 1.9911689501172766e-07, + "loss": 0.0329, + "step": 22150 + }, + { + "epoch": 0.08853464377390904, + "grad_norm": 5.253650665283203, + "learning_rate": 1.9911606095959627e-07, + "loss": 0.0318, + "step": 22160 + }, + { + "epoch": 0.08857459623048572, + "grad_norm": 3.5285260677337646, + "learning_rate": 1.9911522651553723e-07, + "loss": 0.0282, + "step": 22170 + }, + { + "epoch": 0.0886145486870624, + "grad_norm": 5.020899772644043, + "learning_rate": 1.991143916795539e-07, + "loss": 0.0265, + "step": 22180 + }, + { + "epoch": 0.08865450114363907, + "grad_norm": 2.226538896560669, + "learning_rate": 1.991135564516495e-07, + "loss": 0.026, + "step": 22190 + }, + { + "epoch": 0.08869445360021574, + "grad_norm": 4.574934005737305, + "learning_rate": 1.9911272083182744e-07, + "loss": 0.031, + "step": 22200 + }, + { + "epoch": 0.08873440605679242, + "grad_norm": 3.861339807510376, + "learning_rate": 1.9911188482009094e-07, + "loss": 0.031, + "step": 22210 + }, + { + "epoch": 0.0887743585133691, + "grad_norm": 2.824004650115967, + "learning_rate": 1.9911104841644333e-07, + "loss": 0.0338, + "step": 22220 + }, + { + "epoch": 0.08881431096994577, + "grad_norm": 4.528470516204834, + "learning_rate": 1.991102116208879e-07, + "loss": 0.0293, + "step": 22230 + }, + { + "epoch": 0.08885426342652245, + "grad_norm": 4.819164276123047, + "learning_rate": 1.99109374433428e-07, + "loss": 0.0324, + "step": 22240 + }, + { + "epoch": 0.0888942158830991, + "grad_norm": 6.087536334991455, + "learning_rate": 1.991085368540669e-07, + "loss": 0.0284, + "step": 22250 + }, + { + "epoch": 0.08893416833967578, + "grad_norm": 4.470921516418457, + "learning_rate": 1.9910769888280794e-07, + "loss": 0.0291, + "step": 22260 + }, + { + "epoch": 0.08897412079625246, + "grad_norm": 2.3187062740325928, + "learning_rate": 1.9910686051965443e-07, + "loss": 0.0326, + "step": 22270 + }, + { + "epoch": 0.08901407325282913, + "grad_norm": 4.519500255584717, + "learning_rate": 1.9910602176460963e-07, + "loss": 0.0275, + "step": 22280 + }, + { + "epoch": 0.0890540257094058, + "grad_norm": 2.7656760215759277, + "learning_rate": 1.9910518261767695e-07, + "loss": 0.0294, + "step": 22290 + }, + { + "epoch": 0.08909397816598248, + "grad_norm": 3.459686756134033, + "learning_rate": 1.9910434307885965e-07, + "loss": 0.0266, + "step": 22300 + }, + { + "epoch": 0.08913393062255916, + "grad_norm": 5.065589904785156, + "learning_rate": 1.9910350314816108e-07, + "loss": 0.0306, + "step": 22310 + }, + { + "epoch": 0.08917388307913583, + "grad_norm": 3.9963462352752686, + "learning_rate": 1.991026628255845e-07, + "loss": 0.0287, + "step": 22320 + }, + { + "epoch": 0.08921383553571251, + "grad_norm": 5.598817348480225, + "learning_rate": 1.9910182211113328e-07, + "loss": 0.0282, + "step": 22330 + }, + { + "epoch": 0.08925378799228918, + "grad_norm": 3.0265285968780518, + "learning_rate": 1.9910098100481076e-07, + "loss": 0.0288, + "step": 22340 + }, + { + "epoch": 0.08929374044886584, + "grad_norm": 2.090585708618164, + "learning_rate": 1.9910013950662026e-07, + "loss": 0.0281, + "step": 22350 + }, + { + "epoch": 0.08933369290544252, + "grad_norm": 3.380606174468994, + "learning_rate": 1.9909929761656507e-07, + "loss": 0.0288, + "step": 22360 + }, + { + "epoch": 0.0893736453620192, + "grad_norm": 4.164679527282715, + "learning_rate": 1.9909845533464854e-07, + "loss": 0.0282, + "step": 22370 + }, + { + "epoch": 0.08941359781859587, + "grad_norm": 17.23598289489746, + "learning_rate": 1.9909761266087403e-07, + "loss": 0.0297, + "step": 22380 + }, + { + "epoch": 0.08945355027517254, + "grad_norm": 6.4324493408203125, + "learning_rate": 1.9909676959524484e-07, + "loss": 0.0301, + "step": 22390 + }, + { + "epoch": 0.08949350273174922, + "grad_norm": 12.030893325805664, + "learning_rate": 1.9909592613776427e-07, + "loss": 0.0323, + "step": 22400 + }, + { + "epoch": 0.0895334551883259, + "grad_norm": 3.7154271602630615, + "learning_rate": 1.990950822884357e-07, + "loss": 0.0321, + "step": 22410 + }, + { + "epoch": 0.08957340764490257, + "grad_norm": 5.977232456207275, + "learning_rate": 1.9909423804726252e-07, + "loss": 0.0281, + "step": 22420 + }, + { + "epoch": 0.08961336010147924, + "grad_norm": 6.525287628173828, + "learning_rate": 1.99093393414248e-07, + "loss": 0.0269, + "step": 22430 + }, + { + "epoch": 0.08965331255805592, + "grad_norm": 4.342243194580078, + "learning_rate": 1.9909254838939544e-07, + "loss": 0.0315, + "step": 22440 + }, + { + "epoch": 0.08969326501463258, + "grad_norm": 2.766613721847534, + "learning_rate": 1.9909170297270826e-07, + "loss": 0.0265, + "step": 22450 + }, + { + "epoch": 0.08973321747120926, + "grad_norm": 4.506977558135986, + "learning_rate": 1.9909085716418974e-07, + "loss": 0.0282, + "step": 22460 + }, + { + "epoch": 0.08977316992778593, + "grad_norm": 5.440458297729492, + "learning_rate": 1.990900109638433e-07, + "loss": 0.0314, + "step": 22470 + }, + { + "epoch": 0.0898131223843626, + "grad_norm": 3.9099154472351074, + "learning_rate": 1.990891643716722e-07, + "loss": 0.0292, + "step": 22480 + }, + { + "epoch": 0.08985307484093928, + "grad_norm": 5.046438694000244, + "learning_rate": 1.990883173876799e-07, + "loss": 0.0358, + "step": 22490 + }, + { + "epoch": 0.08989302729751596, + "grad_norm": 6.298309326171875, + "learning_rate": 1.990874700118696e-07, + "loss": 0.0337, + "step": 22500 + }, + { + "epoch": 0.08993297975409263, + "grad_norm": 3.420680522918701, + "learning_rate": 1.9908662224424478e-07, + "loss": 0.0277, + "step": 22510 + }, + { + "epoch": 0.0899729322106693, + "grad_norm": 5.745359420776367, + "learning_rate": 1.9908577408480874e-07, + "loss": 0.0331, + "step": 22520 + }, + { + "epoch": 0.09001288466724598, + "grad_norm": 5.399534225463867, + "learning_rate": 1.9908492553356485e-07, + "loss": 0.0316, + "step": 22530 + }, + { + "epoch": 0.09005283712382266, + "grad_norm": 2.392191171646118, + "learning_rate": 1.990840765905164e-07, + "loss": 0.0266, + "step": 22540 + }, + { + "epoch": 0.09009278958039932, + "grad_norm": 2.8454225063323975, + "learning_rate": 1.9908322725566685e-07, + "loss": 0.0248, + "step": 22550 + }, + { + "epoch": 0.090132742036976, + "grad_norm": 10.068068504333496, + "learning_rate": 1.9908237752901948e-07, + "loss": 0.0292, + "step": 22560 + }, + { + "epoch": 0.09017269449355267, + "grad_norm": 7.975190162658691, + "learning_rate": 1.9908152741057771e-07, + "loss": 0.0274, + "step": 22570 + }, + { + "epoch": 0.09021264695012934, + "grad_norm": 4.461019515991211, + "learning_rate": 1.9908067690034485e-07, + "loss": 0.0341, + "step": 22580 + }, + { + "epoch": 0.09025259940670602, + "grad_norm": 5.3769450187683105, + "learning_rate": 1.9907982599832426e-07, + "loss": 0.0273, + "step": 22590 + }, + { + "epoch": 0.0902925518632827, + "grad_norm": 2.5205929279327393, + "learning_rate": 1.9907897470451935e-07, + "loss": 0.0273, + "step": 22600 + }, + { + "epoch": 0.09033250431985937, + "grad_norm": 3.7353832721710205, + "learning_rate": 1.9907812301893346e-07, + "loss": 0.0282, + "step": 22610 + }, + { + "epoch": 0.09037245677643604, + "grad_norm": 5.909454345703125, + "learning_rate": 1.9907727094156998e-07, + "loss": 0.0269, + "step": 22620 + }, + { + "epoch": 0.09041240923301272, + "grad_norm": 8.17597484588623, + "learning_rate": 1.9907641847243227e-07, + "loss": 0.0276, + "step": 22630 + }, + { + "epoch": 0.0904523616895894, + "grad_norm": 3.8617777824401855, + "learning_rate": 1.9907556561152365e-07, + "loss": 0.0255, + "step": 22640 + }, + { + "epoch": 0.09049231414616606, + "grad_norm": 3.631284236907959, + "learning_rate": 1.9907471235884756e-07, + "loss": 0.0344, + "step": 22650 + }, + { + "epoch": 0.09053226660274273, + "grad_norm": 2.660611867904663, + "learning_rate": 1.9907385871440735e-07, + "loss": 0.0315, + "step": 22660 + }, + { + "epoch": 0.0905722190593194, + "grad_norm": 2.4329891204833984, + "learning_rate": 1.9907300467820638e-07, + "loss": 0.0241, + "step": 22670 + }, + { + "epoch": 0.09061217151589608, + "grad_norm": 7.595456123352051, + "learning_rate": 1.9907215025024806e-07, + "loss": 0.0299, + "step": 22680 + }, + { + "epoch": 0.09065212397247276, + "grad_norm": 3.3907175064086914, + "learning_rate": 1.990712954305357e-07, + "loss": 0.0265, + "step": 22690 + }, + { + "epoch": 0.09069207642904943, + "grad_norm": 8.328044891357422, + "learning_rate": 1.990704402190728e-07, + "loss": 0.0256, + "step": 22700 + }, + { + "epoch": 0.0907320288856261, + "grad_norm": 3.5071980953216553, + "learning_rate": 1.9906958461586262e-07, + "loss": 0.0353, + "step": 22710 + }, + { + "epoch": 0.09077198134220278, + "grad_norm": 5.159221172332764, + "learning_rate": 1.9906872862090862e-07, + "loss": 0.033, + "step": 22720 + }, + { + "epoch": 0.09081193379877946, + "grad_norm": 3.191012382507324, + "learning_rate": 1.9906787223421414e-07, + "loss": 0.0288, + "step": 22730 + }, + { + "epoch": 0.09085188625535613, + "grad_norm": 5.731540203094482, + "learning_rate": 1.9906701545578263e-07, + "loss": 0.0321, + "step": 22740 + }, + { + "epoch": 0.0908918387119328, + "grad_norm": 2.2204625606536865, + "learning_rate": 1.9906615828561737e-07, + "loss": 0.0336, + "step": 22750 + }, + { + "epoch": 0.09093179116850947, + "grad_norm": 3.107001304626465, + "learning_rate": 1.9906530072372188e-07, + "loss": 0.0278, + "step": 22760 + }, + { + "epoch": 0.09097174362508614, + "grad_norm": 3.3498854637145996, + "learning_rate": 1.990644427700994e-07, + "loss": 0.0307, + "step": 22770 + }, + { + "epoch": 0.09101169608166282, + "grad_norm": 9.900470733642578, + "learning_rate": 1.9906358442475347e-07, + "loss": 0.0364, + "step": 22780 + }, + { + "epoch": 0.0910516485382395, + "grad_norm": 8.70706844329834, + "learning_rate": 1.9906272568768744e-07, + "loss": 0.0292, + "step": 22790 + }, + { + "epoch": 0.09109160099481617, + "grad_norm": 3.1795098781585693, + "learning_rate": 1.9906186655890463e-07, + "loss": 0.0272, + "step": 22800 + }, + { + "epoch": 0.09113155345139284, + "grad_norm": 4.822513580322266, + "learning_rate": 1.990610070384085e-07, + "loss": 0.0257, + "step": 22810 + }, + { + "epoch": 0.09117150590796952, + "grad_norm": 17.023841857910156, + "learning_rate": 1.9906014712620247e-07, + "loss": 0.0306, + "step": 22820 + }, + { + "epoch": 0.0912114583645462, + "grad_norm": 4.148003578186035, + "learning_rate": 1.9905928682228989e-07, + "loss": 0.0286, + "step": 22830 + }, + { + "epoch": 0.09125141082112287, + "grad_norm": 11.132587432861328, + "learning_rate": 1.9905842612667422e-07, + "loss": 0.0304, + "step": 22840 + }, + { + "epoch": 0.09129136327769954, + "grad_norm": 4.586612701416016, + "learning_rate": 1.9905756503935878e-07, + "loss": 0.0301, + "step": 22850 + }, + { + "epoch": 0.0913313157342762, + "grad_norm": 5.084049224853516, + "learning_rate": 1.9905670356034704e-07, + "loss": 0.0282, + "step": 22860 + }, + { + "epoch": 0.09137126819085288, + "grad_norm": 5.159307956695557, + "learning_rate": 1.990558416896424e-07, + "loss": 0.0281, + "step": 22870 + }, + { + "epoch": 0.09141122064742956, + "grad_norm": 7.127583980560303, + "learning_rate": 1.9905497942724824e-07, + "loss": 0.0284, + "step": 22880 + }, + { + "epoch": 0.09145117310400623, + "grad_norm": 2.3338911533355713, + "learning_rate": 1.9905411677316802e-07, + "loss": 0.0299, + "step": 22890 + }, + { + "epoch": 0.0914911255605829, + "grad_norm": 4.014945983886719, + "learning_rate": 1.990532537274051e-07, + "loss": 0.0298, + "step": 22900 + }, + { + "epoch": 0.09153107801715958, + "grad_norm": 10.284894943237305, + "learning_rate": 1.9905239028996287e-07, + "loss": 0.0309, + "step": 22910 + }, + { + "epoch": 0.09157103047373626, + "grad_norm": 6.84528923034668, + "learning_rate": 1.9905152646084485e-07, + "loss": 0.0308, + "step": 22920 + }, + { + "epoch": 0.09161098293031293, + "grad_norm": 4.549258708953857, + "learning_rate": 1.9905066224005434e-07, + "loss": 0.0253, + "step": 22930 + }, + { + "epoch": 0.0916509353868896, + "grad_norm": 5.202526569366455, + "learning_rate": 1.9904979762759485e-07, + "loss": 0.0364, + "step": 22940 + }, + { + "epoch": 0.09169088784346628, + "grad_norm": 3.412672758102417, + "learning_rate": 1.990489326234697e-07, + "loss": 0.0294, + "step": 22950 + }, + { + "epoch": 0.09173084030004294, + "grad_norm": 2.399216890335083, + "learning_rate": 1.990480672276824e-07, + "loss": 0.0235, + "step": 22960 + }, + { + "epoch": 0.09177079275661962, + "grad_norm": 3.8786118030548096, + "learning_rate": 1.9904720144023636e-07, + "loss": 0.0269, + "step": 22970 + }, + { + "epoch": 0.0918107452131963, + "grad_norm": 4.190281867980957, + "learning_rate": 1.9904633526113496e-07, + "loss": 0.0269, + "step": 22980 + }, + { + "epoch": 0.09185069766977297, + "grad_norm": 7.129141330718994, + "learning_rate": 1.9904546869038167e-07, + "loss": 0.03, + "step": 22990 + }, + { + "epoch": 0.09189065012634964, + "grad_norm": 6.3112382888793945, + "learning_rate": 1.9904460172797987e-07, + "loss": 0.0254, + "step": 23000 + }, + { + "epoch": 0.09193060258292632, + "grad_norm": 1.543702483177185, + "learning_rate": 1.9904373437393302e-07, + "loss": 0.0301, + "step": 23010 + }, + { + "epoch": 0.091970555039503, + "grad_norm": 2.5702881813049316, + "learning_rate": 1.9904286662824456e-07, + "loss": 0.0258, + "step": 23020 + }, + { + "epoch": 0.09201050749607967, + "grad_norm": 3.8752024173736572, + "learning_rate": 1.9904199849091789e-07, + "loss": 0.0328, + "step": 23030 + }, + { + "epoch": 0.09205045995265634, + "grad_norm": 4.878530025482178, + "learning_rate": 1.9904112996195643e-07, + "loss": 0.0292, + "step": 23040 + }, + { + "epoch": 0.09209041240923302, + "grad_norm": 4.833332061767578, + "learning_rate": 1.9904026104136367e-07, + "loss": 0.0317, + "step": 23050 + }, + { + "epoch": 0.09213036486580968, + "grad_norm": 4.3780012130737305, + "learning_rate": 1.9903939172914298e-07, + "loss": 0.0277, + "step": 23060 + }, + { + "epoch": 0.09217031732238636, + "grad_norm": 3.6928858757019043, + "learning_rate": 1.9903852202529788e-07, + "loss": 0.0258, + "step": 23070 + }, + { + "epoch": 0.09221026977896303, + "grad_norm": 6.134823322296143, + "learning_rate": 1.9903765192983176e-07, + "loss": 0.0288, + "step": 23080 + }, + { + "epoch": 0.0922502222355397, + "grad_norm": 1.920209527015686, + "learning_rate": 1.9903678144274805e-07, + "loss": 0.028, + "step": 23090 + }, + { + "epoch": 0.09229017469211638, + "grad_norm": 10.62780475616455, + "learning_rate": 1.9903591056405017e-07, + "loss": 0.0309, + "step": 23100 + }, + { + "epoch": 0.09233012714869306, + "grad_norm": 14.246769905090332, + "learning_rate": 1.9903503929374165e-07, + "loss": 0.0272, + "step": 23110 + }, + { + "epoch": 0.09237007960526973, + "grad_norm": 4.525990962982178, + "learning_rate": 1.990341676318259e-07, + "loss": 0.0304, + "step": 23120 + }, + { + "epoch": 0.0924100320618464, + "grad_norm": 5.492067813873291, + "learning_rate": 1.990332955783063e-07, + "loss": 0.0319, + "step": 23130 + }, + { + "epoch": 0.09244998451842308, + "grad_norm": 4.05776834487915, + "learning_rate": 1.9903242313318638e-07, + "loss": 0.0319, + "step": 23140 + }, + { + "epoch": 0.09248993697499976, + "grad_norm": 4.716386795043945, + "learning_rate": 1.9903155029646954e-07, + "loss": 0.0296, + "step": 23150 + }, + { + "epoch": 0.09252988943157642, + "grad_norm": 3.6253011226654053, + "learning_rate": 1.9903067706815926e-07, + "loss": 0.028, + "step": 23160 + }, + { + "epoch": 0.09256984188815309, + "grad_norm": 4.925887584686279, + "learning_rate": 1.99029803448259e-07, + "loss": 0.0315, + "step": 23170 + }, + { + "epoch": 0.09260979434472977, + "grad_norm": 5.941005229949951, + "learning_rate": 1.9902892943677218e-07, + "loss": 0.0275, + "step": 23180 + }, + { + "epoch": 0.09264974680130644, + "grad_norm": 3.1749351024627686, + "learning_rate": 1.990280550337023e-07, + "loss": 0.0321, + "step": 23190 + }, + { + "epoch": 0.09268969925788312, + "grad_norm": 4.957430839538574, + "learning_rate": 1.9902718023905273e-07, + "loss": 0.0326, + "step": 23200 + }, + { + "epoch": 0.0927296517144598, + "grad_norm": 14.042521476745605, + "learning_rate": 1.9902630505282706e-07, + "loss": 0.03, + "step": 23210 + }, + { + "epoch": 0.09276960417103647, + "grad_norm": 4.58452033996582, + "learning_rate": 1.9902542947502862e-07, + "loss": 0.0327, + "step": 23220 + }, + { + "epoch": 0.09280955662761314, + "grad_norm": 3.4792566299438477, + "learning_rate": 1.9902455350566098e-07, + "loss": 0.0292, + "step": 23230 + }, + { + "epoch": 0.09284950908418982, + "grad_norm": 1.7829407453536987, + "learning_rate": 1.9902367714472753e-07, + "loss": 0.0279, + "step": 23240 + }, + { + "epoch": 0.0928894615407665, + "grad_norm": 5.54933500289917, + "learning_rate": 1.990228003922318e-07, + "loss": 0.0257, + "step": 23250 + }, + { + "epoch": 0.09292941399734315, + "grad_norm": 2.5600059032440186, + "learning_rate": 1.990219232481772e-07, + "loss": 0.0283, + "step": 23260 + }, + { + "epoch": 0.09296936645391983, + "grad_norm": 2.3814282417297363, + "learning_rate": 1.990210457125672e-07, + "loss": 0.0254, + "step": 23270 + }, + { + "epoch": 0.0930093189104965, + "grad_norm": 3.067052125930786, + "learning_rate": 1.990201677854053e-07, + "loss": 0.0255, + "step": 23280 + }, + { + "epoch": 0.09304927136707318, + "grad_norm": 5.083580017089844, + "learning_rate": 1.9901928946669496e-07, + "loss": 0.0252, + "step": 23290 + }, + { + "epoch": 0.09308922382364986, + "grad_norm": 2.8653531074523926, + "learning_rate": 1.9901841075643967e-07, + "loss": 0.0305, + "step": 23300 + }, + { + "epoch": 0.09312917628022653, + "grad_norm": 4.669271945953369, + "learning_rate": 1.9901753165464287e-07, + "loss": 0.0239, + "step": 23310 + }, + { + "epoch": 0.0931691287368032, + "grad_norm": 4.915122985839844, + "learning_rate": 1.9901665216130805e-07, + "loss": 0.0315, + "step": 23320 + }, + { + "epoch": 0.09320908119337988, + "grad_norm": 106.47334289550781, + "learning_rate": 1.9901577227643874e-07, + "loss": 0.0276, + "step": 23330 + }, + { + "epoch": 0.09324903364995656, + "grad_norm": 10.6792573928833, + "learning_rate": 1.990148920000383e-07, + "loss": 0.0321, + "step": 23340 + }, + { + "epoch": 0.09328898610653323, + "grad_norm": 4.463834762573242, + "learning_rate": 1.9901401133211034e-07, + "loss": 0.0305, + "step": 23350 + }, + { + "epoch": 0.0933289385631099, + "grad_norm": 5.05720853805542, + "learning_rate": 1.9901313027265822e-07, + "loss": 0.0315, + "step": 23360 + }, + { + "epoch": 0.09336889101968657, + "grad_norm": 5.6240739822387695, + "learning_rate": 1.9901224882168555e-07, + "loss": 0.0281, + "step": 23370 + }, + { + "epoch": 0.09340884347626324, + "grad_norm": 2.3791110515594482, + "learning_rate": 1.9901136697919572e-07, + "loss": 0.0298, + "step": 23380 + }, + { + "epoch": 0.09344879593283992, + "grad_norm": 3.204160690307617, + "learning_rate": 1.9901048474519226e-07, + "loss": 0.0239, + "step": 23390 + }, + { + "epoch": 0.09348874838941659, + "grad_norm": 4.338263511657715, + "learning_rate": 1.9900960211967863e-07, + "loss": 0.0261, + "step": 23400 + }, + { + "epoch": 0.09352870084599327, + "grad_norm": 3.6223225593566895, + "learning_rate": 1.9900871910265837e-07, + "loss": 0.0284, + "step": 23410 + }, + { + "epoch": 0.09356865330256994, + "grad_norm": 3.842743396759033, + "learning_rate": 1.9900783569413494e-07, + "loss": 0.0289, + "step": 23420 + }, + { + "epoch": 0.09360860575914662, + "grad_norm": 4.563722133636475, + "learning_rate": 1.9900695189411182e-07, + "loss": 0.0288, + "step": 23430 + }, + { + "epoch": 0.0936485582157233, + "grad_norm": 6.428160667419434, + "learning_rate": 1.9900606770259253e-07, + "loss": 0.0291, + "step": 23440 + }, + { + "epoch": 0.09368851067229997, + "grad_norm": 12.686612129211426, + "learning_rate": 1.9900518311958051e-07, + "loss": 0.029, + "step": 23450 + }, + { + "epoch": 0.09372846312887664, + "grad_norm": 4.785505294799805, + "learning_rate": 1.9900429814507935e-07, + "loss": 0.0283, + "step": 23460 + }, + { + "epoch": 0.0937684155854533, + "grad_norm": 4.129634380340576, + "learning_rate": 1.990034127790925e-07, + "loss": 0.0267, + "step": 23470 + }, + { + "epoch": 0.09380836804202998, + "grad_norm": 3.9306952953338623, + "learning_rate": 1.9900252702162343e-07, + "loss": 0.0267, + "step": 23480 + }, + { + "epoch": 0.09384832049860665, + "grad_norm": 4.470452308654785, + "learning_rate": 1.990016408726757e-07, + "loss": 0.0256, + "step": 23490 + }, + { + "epoch": 0.09388827295518333, + "grad_norm": 2.9731550216674805, + "learning_rate": 1.990007543322528e-07, + "loss": 0.0316, + "step": 23500 + }, + { + "epoch": 0.09392822541176, + "grad_norm": 7.339372634887695, + "learning_rate": 1.989998674003582e-07, + "loss": 0.0338, + "step": 23510 + }, + { + "epoch": 0.09396817786833668, + "grad_norm": 16.14704132080078, + "learning_rate": 1.9899898007699548e-07, + "loss": 0.0283, + "step": 23520 + }, + { + "epoch": 0.09400813032491336, + "grad_norm": 6.336671829223633, + "learning_rate": 1.9899809236216806e-07, + "loss": 0.0286, + "step": 23530 + }, + { + "epoch": 0.09404808278149003, + "grad_norm": 6.047158241271973, + "learning_rate": 1.9899720425587949e-07, + "loss": 0.0244, + "step": 23540 + }, + { + "epoch": 0.0940880352380667, + "grad_norm": 4.674551010131836, + "learning_rate": 1.989963157581333e-07, + "loss": 0.0302, + "step": 23550 + }, + { + "epoch": 0.09412798769464338, + "grad_norm": 7.721298694610596, + "learning_rate": 1.9899542686893298e-07, + "loss": 0.0264, + "step": 23560 + }, + { + "epoch": 0.09416794015122004, + "grad_norm": 33.496437072753906, + "learning_rate": 1.9899453758828204e-07, + "loss": 0.0294, + "step": 23570 + }, + { + "epoch": 0.09420789260779672, + "grad_norm": 4.85537052154541, + "learning_rate": 1.98993647916184e-07, + "loss": 0.0292, + "step": 23580 + }, + { + "epoch": 0.09424784506437339, + "grad_norm": 3.69162654876709, + "learning_rate": 1.9899275785264243e-07, + "loss": 0.0294, + "step": 23590 + }, + { + "epoch": 0.09428779752095007, + "grad_norm": 2.967045545578003, + "learning_rate": 1.9899186739766077e-07, + "loss": 0.0305, + "step": 23600 + }, + { + "epoch": 0.09432774997752674, + "grad_norm": 2.9837570190429688, + "learning_rate": 1.9899097655124256e-07, + "loss": 0.0266, + "step": 23610 + }, + { + "epoch": 0.09436770243410342, + "grad_norm": 2.6987969875335693, + "learning_rate": 1.9899008531339137e-07, + "loss": 0.0272, + "step": 23620 + }, + { + "epoch": 0.09440765489068009, + "grad_norm": 4.7137885093688965, + "learning_rate": 1.9898919368411068e-07, + "loss": 0.0319, + "step": 23630 + }, + { + "epoch": 0.09444760734725677, + "grad_norm": 4.801179885864258, + "learning_rate": 1.9898830166340405e-07, + "loss": 0.0287, + "step": 23640 + }, + { + "epoch": 0.09448755980383344, + "grad_norm": 4.466228485107422, + "learning_rate": 1.9898740925127498e-07, + "loss": 0.0264, + "step": 23650 + }, + { + "epoch": 0.09452751226041012, + "grad_norm": 2.8530240058898926, + "learning_rate": 1.9898651644772698e-07, + "loss": 0.0263, + "step": 23660 + }, + { + "epoch": 0.09456746471698678, + "grad_norm": 5.267767906188965, + "learning_rate": 1.9898562325276362e-07, + "loss": 0.0268, + "step": 23670 + }, + { + "epoch": 0.09460741717356345, + "grad_norm": 5.214029312133789, + "learning_rate": 1.9898472966638843e-07, + "loss": 0.0291, + "step": 23680 + }, + { + "epoch": 0.09464736963014013, + "grad_norm": 4.293683052062988, + "learning_rate": 1.9898383568860487e-07, + "loss": 0.0297, + "step": 23690 + }, + { + "epoch": 0.0946873220867168, + "grad_norm": 4.862778663635254, + "learning_rate": 1.9898294131941663e-07, + "loss": 0.0296, + "step": 23700 + }, + { + "epoch": 0.09472727454329348, + "grad_norm": 3.226255178451538, + "learning_rate": 1.9898204655882708e-07, + "loss": 0.0304, + "step": 23710 + }, + { + "epoch": 0.09476722699987015, + "grad_norm": 1.3968764543533325, + "learning_rate": 1.9898115140683984e-07, + "loss": 0.0268, + "step": 23720 + }, + { + "epoch": 0.09480717945644683, + "grad_norm": 2.6343846321105957, + "learning_rate": 1.9898025586345848e-07, + "loss": 0.0284, + "step": 23730 + }, + { + "epoch": 0.0948471319130235, + "grad_norm": 7.0892462730407715, + "learning_rate": 1.9897935992868645e-07, + "loss": 0.0279, + "step": 23740 + }, + { + "epoch": 0.09488708436960018, + "grad_norm": 2.743563413619995, + "learning_rate": 1.989784636025274e-07, + "loss": 0.0262, + "step": 23750 + }, + { + "epoch": 0.09492703682617686, + "grad_norm": 5.503167152404785, + "learning_rate": 1.9897756688498477e-07, + "loss": 0.0248, + "step": 23760 + }, + { + "epoch": 0.09496698928275352, + "grad_norm": 13.216927528381348, + "learning_rate": 1.9897666977606216e-07, + "loss": 0.0255, + "step": 23770 + }, + { + "epoch": 0.09500694173933019, + "grad_norm": 7.640445232391357, + "learning_rate": 1.9897577227576313e-07, + "loss": 0.0208, + "step": 23780 + }, + { + "epoch": 0.09504689419590687, + "grad_norm": 4.127798557281494, + "learning_rate": 1.989748743840912e-07, + "loss": 0.0283, + "step": 23790 + }, + { + "epoch": 0.09508684665248354, + "grad_norm": 6.137058258056641, + "learning_rate": 1.9897397610104993e-07, + "loss": 0.0345, + "step": 23800 + }, + { + "epoch": 0.09512679910906022, + "grad_norm": 4.922272205352783, + "learning_rate": 1.9897307742664288e-07, + "loss": 0.0277, + "step": 23810 + }, + { + "epoch": 0.09516675156563689, + "grad_norm": 6.559903144836426, + "learning_rate": 1.989721783608736e-07, + "loss": 0.0276, + "step": 23820 + }, + { + "epoch": 0.09520670402221357, + "grad_norm": 2.7836709022521973, + "learning_rate": 1.989712789037456e-07, + "loss": 0.0279, + "step": 23830 + }, + { + "epoch": 0.09524665647879024, + "grad_norm": 6.262801647186279, + "learning_rate": 1.9897037905526253e-07, + "loss": 0.0293, + "step": 23840 + }, + { + "epoch": 0.09528660893536692, + "grad_norm": 5.837818622589111, + "learning_rate": 1.9896947881542787e-07, + "loss": 0.0256, + "step": 23850 + }, + { + "epoch": 0.09532656139194359, + "grad_norm": 2.5654866695404053, + "learning_rate": 1.9896857818424522e-07, + "loss": 0.0285, + "step": 23860 + }, + { + "epoch": 0.09536651384852025, + "grad_norm": 2.800607919692993, + "learning_rate": 1.989676771617181e-07, + "loss": 0.0287, + "step": 23870 + }, + { + "epoch": 0.09540646630509693, + "grad_norm": 4.850164890289307, + "learning_rate": 1.9896677574785012e-07, + "loss": 0.0292, + "step": 23880 + }, + { + "epoch": 0.0954464187616736, + "grad_norm": 5.165111541748047, + "learning_rate": 1.9896587394264483e-07, + "loss": 0.0311, + "step": 23890 + }, + { + "epoch": 0.09548637121825028, + "grad_norm": 4.668185234069824, + "learning_rate": 1.9896497174610578e-07, + "loss": 0.0289, + "step": 23900 + }, + { + "epoch": 0.09552632367482695, + "grad_norm": 5.028717517852783, + "learning_rate": 1.9896406915823655e-07, + "loss": 0.0278, + "step": 23910 + }, + { + "epoch": 0.09556627613140363, + "grad_norm": 3.6407155990600586, + "learning_rate": 1.989631661790407e-07, + "loss": 0.0253, + "step": 23920 + }, + { + "epoch": 0.0956062285879803, + "grad_norm": 4.1379852294921875, + "learning_rate": 1.9896226280852183e-07, + "loss": 0.0215, + "step": 23930 + }, + { + "epoch": 0.09564618104455698, + "grad_norm": 6.015931129455566, + "learning_rate": 1.9896135904668346e-07, + "loss": 0.0261, + "step": 23940 + }, + { + "epoch": 0.09568613350113366, + "grad_norm": 8.867350578308105, + "learning_rate": 1.9896045489352921e-07, + "loss": 0.0288, + "step": 23950 + }, + { + "epoch": 0.09572608595771033, + "grad_norm": 2.6480066776275635, + "learning_rate": 1.9895955034906263e-07, + "loss": 0.0293, + "step": 23960 + }, + { + "epoch": 0.095766038414287, + "grad_norm": 2.0545401573181152, + "learning_rate": 1.9895864541328732e-07, + "loss": 0.0277, + "step": 23970 + }, + { + "epoch": 0.09580599087086367, + "grad_norm": 5.642204284667969, + "learning_rate": 1.9895774008620681e-07, + "loss": 0.032, + "step": 23980 + }, + { + "epoch": 0.09584594332744034, + "grad_norm": 2.9330925941467285, + "learning_rate": 1.9895683436782475e-07, + "loss": 0.0314, + "step": 23990 + }, + { + "epoch": 0.09588589578401702, + "grad_norm": 3.273153305053711, + "learning_rate": 1.989559282581447e-07, + "loss": 0.0285, + "step": 24000 + }, + { + "epoch": 0.09592584824059369, + "grad_norm": 5.1099114418029785, + "learning_rate": 1.9895502175717018e-07, + "loss": 0.0263, + "step": 24010 + }, + { + "epoch": 0.09596580069717037, + "grad_norm": 4.307168006896973, + "learning_rate": 1.9895411486490485e-07, + "loss": 0.0309, + "step": 24020 + }, + { + "epoch": 0.09600575315374704, + "grad_norm": 4.580697536468506, + "learning_rate": 1.9895320758135225e-07, + "loss": 0.028, + "step": 24030 + }, + { + "epoch": 0.09604570561032372, + "grad_norm": 7.881589412689209, + "learning_rate": 1.98952299906516e-07, + "loss": 0.0343, + "step": 24040 + }, + { + "epoch": 0.09608565806690039, + "grad_norm": 4.712338924407959, + "learning_rate": 1.9895139184039968e-07, + "loss": 0.0259, + "step": 24050 + }, + { + "epoch": 0.09612561052347707, + "grad_norm": 2.9436607360839844, + "learning_rate": 1.9895048338300685e-07, + "loss": 0.032, + "step": 24060 + }, + { + "epoch": 0.09616556298005374, + "grad_norm": 6.196104526519775, + "learning_rate": 1.9894957453434116e-07, + "loss": 0.0296, + "step": 24070 + }, + { + "epoch": 0.0962055154366304, + "grad_norm": 1.856370449066162, + "learning_rate": 1.9894866529440615e-07, + "loss": 0.029, + "step": 24080 + }, + { + "epoch": 0.09624546789320708, + "grad_norm": 6.060257434844971, + "learning_rate": 1.9894775566320546e-07, + "loss": 0.0257, + "step": 24090 + }, + { + "epoch": 0.09628542034978375, + "grad_norm": 9.195799827575684, + "learning_rate": 1.9894684564074263e-07, + "loss": 0.0312, + "step": 24100 + }, + { + "epoch": 0.09632537280636043, + "grad_norm": 3.089419364929199, + "learning_rate": 1.989459352270213e-07, + "loss": 0.0287, + "step": 24110 + }, + { + "epoch": 0.0963653252629371, + "grad_norm": 4.336328506469727, + "learning_rate": 1.989450244220451e-07, + "loss": 0.0262, + "step": 24120 + }, + { + "epoch": 0.09640527771951378, + "grad_norm": 9.269365310668945, + "learning_rate": 1.9894411322581758e-07, + "loss": 0.0294, + "step": 24130 + }, + { + "epoch": 0.09644523017609045, + "grad_norm": 4.65213680267334, + "learning_rate": 1.9894320163834232e-07, + "loss": 0.0275, + "step": 24140 + }, + { + "epoch": 0.09648518263266713, + "grad_norm": 4.5808329582214355, + "learning_rate": 1.98942289659623e-07, + "loss": 0.0295, + "step": 24150 + }, + { + "epoch": 0.0965251350892438, + "grad_norm": 5.561981678009033, + "learning_rate": 1.9894137728966316e-07, + "loss": 0.0339, + "step": 24160 + }, + { + "epoch": 0.09656508754582048, + "grad_norm": 7.198792934417725, + "learning_rate": 1.9894046452846647e-07, + "loss": 0.0258, + "step": 24170 + }, + { + "epoch": 0.09660504000239714, + "grad_norm": 4.518613815307617, + "learning_rate": 1.9893955137603646e-07, + "loss": 0.0299, + "step": 24180 + }, + { + "epoch": 0.09664499245897382, + "grad_norm": 3.9162003993988037, + "learning_rate": 1.9893863783237683e-07, + "loss": 0.0299, + "step": 24190 + }, + { + "epoch": 0.09668494491555049, + "grad_norm": 4.140887260437012, + "learning_rate": 1.989377238974911e-07, + "loss": 0.0196, + "step": 24200 + }, + { + "epoch": 0.09672489737212717, + "grad_norm": 4.983668327331543, + "learning_rate": 1.98936809571383e-07, + "loss": 0.0318, + "step": 24210 + }, + { + "epoch": 0.09676484982870384, + "grad_norm": 3.9586780071258545, + "learning_rate": 1.9893589485405603e-07, + "loss": 0.0254, + "step": 24220 + }, + { + "epoch": 0.09680480228528052, + "grad_norm": 4.01473331451416, + "learning_rate": 1.9893497974551386e-07, + "loss": 0.0235, + "step": 24230 + }, + { + "epoch": 0.09684475474185719, + "grad_norm": 8.972138404846191, + "learning_rate": 1.989340642457601e-07, + "loss": 0.0284, + "step": 24240 + }, + { + "epoch": 0.09688470719843387, + "grad_norm": 2.976442575454712, + "learning_rate": 1.9893314835479836e-07, + "loss": 0.0286, + "step": 24250 + }, + { + "epoch": 0.09692465965501054, + "grad_norm": 4.959746360778809, + "learning_rate": 1.9893223207263233e-07, + "loss": 0.0255, + "step": 24260 + }, + { + "epoch": 0.09696461211158722, + "grad_norm": 8.64358139038086, + "learning_rate": 1.989313153992655e-07, + "loss": 0.0266, + "step": 24270 + }, + { + "epoch": 0.09700456456816388, + "grad_norm": 6.232528209686279, + "learning_rate": 1.989303983347016e-07, + "loss": 0.0255, + "step": 24280 + }, + { + "epoch": 0.09704451702474055, + "grad_norm": 3.212221145629883, + "learning_rate": 1.9892948087894427e-07, + "loss": 0.0263, + "step": 24290 + }, + { + "epoch": 0.09708446948131723, + "grad_norm": 3.298417091369629, + "learning_rate": 1.9892856303199706e-07, + "loss": 0.0276, + "step": 24300 + }, + { + "epoch": 0.0971244219378939, + "grad_norm": 4.906227111816406, + "learning_rate": 1.9892764479386366e-07, + "loss": 0.0282, + "step": 24310 + }, + { + "epoch": 0.09716437439447058, + "grad_norm": 7.227405548095703, + "learning_rate": 1.9892672616454767e-07, + "loss": 0.03, + "step": 24320 + }, + { + "epoch": 0.09720432685104725, + "grad_norm": 6.9342803955078125, + "learning_rate": 1.9892580714405273e-07, + "loss": 0.0265, + "step": 24330 + }, + { + "epoch": 0.09724427930762393, + "grad_norm": 8.357393264770508, + "learning_rate": 1.9892488773238246e-07, + "loss": 0.0268, + "step": 24340 + }, + { + "epoch": 0.0972842317642006, + "grad_norm": 11.6072998046875, + "learning_rate": 1.9892396792954052e-07, + "loss": 0.0249, + "step": 24350 + }, + { + "epoch": 0.09732418422077728, + "grad_norm": 4.060388088226318, + "learning_rate": 1.9892304773553053e-07, + "loss": 0.0276, + "step": 24360 + }, + { + "epoch": 0.09736413667735395, + "grad_norm": 4.204684734344482, + "learning_rate": 1.9892212715035615e-07, + "loss": 0.0296, + "step": 24370 + }, + { + "epoch": 0.09740408913393062, + "grad_norm": 3.20978045463562, + "learning_rate": 1.9892120617402098e-07, + "loss": 0.0292, + "step": 24380 + }, + { + "epoch": 0.09744404159050729, + "grad_norm": 3.4304583072662354, + "learning_rate": 1.989202848065287e-07, + "loss": 0.0281, + "step": 24390 + }, + { + "epoch": 0.09748399404708397, + "grad_norm": 3.3792595863342285, + "learning_rate": 1.9891936304788293e-07, + "loss": 0.0337, + "step": 24400 + }, + { + "epoch": 0.09752394650366064, + "grad_norm": 3.2862093448638916, + "learning_rate": 1.9891844089808734e-07, + "loss": 0.0285, + "step": 24410 + }, + { + "epoch": 0.09756389896023732, + "grad_norm": 3.114910125732422, + "learning_rate": 1.9891751835714557e-07, + "loss": 0.0319, + "step": 24420 + }, + { + "epoch": 0.09760385141681399, + "grad_norm": 3.33140230178833, + "learning_rate": 1.9891659542506122e-07, + "loss": 0.0253, + "step": 24430 + }, + { + "epoch": 0.09764380387339067, + "grad_norm": 1.7220983505249023, + "learning_rate": 1.9891567210183798e-07, + "loss": 0.0271, + "step": 24440 + }, + { + "epoch": 0.09768375632996734, + "grad_norm": 5.012001991271973, + "learning_rate": 1.9891474838747955e-07, + "loss": 0.0277, + "step": 24450 + }, + { + "epoch": 0.09772370878654402, + "grad_norm": 2.2257208824157715, + "learning_rate": 1.989138242819895e-07, + "loss": 0.0243, + "step": 24460 + }, + { + "epoch": 0.09776366124312069, + "grad_norm": 5.138134956359863, + "learning_rate": 1.9891289978537152e-07, + "loss": 0.0287, + "step": 24470 + }, + { + "epoch": 0.09780361369969737, + "grad_norm": 6.350701332092285, + "learning_rate": 1.9891197489762928e-07, + "loss": 0.0267, + "step": 24480 + }, + { + "epoch": 0.09784356615627403, + "grad_norm": 8.628104209899902, + "learning_rate": 1.989110496187664e-07, + "loss": 0.0248, + "step": 24490 + }, + { + "epoch": 0.0978835186128507, + "grad_norm": 5.718710422515869, + "learning_rate": 1.9891012394878656e-07, + "loss": 0.0225, + "step": 24500 + }, + { + "epoch": 0.09792347106942738, + "grad_norm": 2.439939022064209, + "learning_rate": 1.9890919788769344e-07, + "loss": 0.0296, + "step": 24510 + }, + { + "epoch": 0.09796342352600405, + "grad_norm": 4.396555423736572, + "learning_rate": 1.9890827143549066e-07, + "loss": 0.0286, + "step": 24520 + }, + { + "epoch": 0.09800337598258073, + "grad_norm": 4.097753047943115, + "learning_rate": 1.9890734459218188e-07, + "loss": 0.0296, + "step": 24530 + }, + { + "epoch": 0.0980433284391574, + "grad_norm": 5.313680171966553, + "learning_rate": 1.989064173577708e-07, + "loss": 0.0289, + "step": 24540 + }, + { + "epoch": 0.09808328089573408, + "grad_norm": 2.1566548347473145, + "learning_rate": 1.9890548973226111e-07, + "loss": 0.0266, + "step": 24550 + }, + { + "epoch": 0.09812323335231075, + "grad_norm": 7.80508279800415, + "learning_rate": 1.989045617156564e-07, + "loss": 0.0283, + "step": 24560 + }, + { + "epoch": 0.09816318580888743, + "grad_norm": 3.574852228164673, + "learning_rate": 1.9890363330796044e-07, + "loss": 0.0305, + "step": 24570 + }, + { + "epoch": 0.0982031382654641, + "grad_norm": 4.882966995239258, + "learning_rate": 1.9890270450917679e-07, + "loss": 0.029, + "step": 24580 + }, + { + "epoch": 0.09824309072204077, + "grad_norm": 5.520013332366943, + "learning_rate": 1.9890177531930918e-07, + "loss": 0.0275, + "step": 24590 + }, + { + "epoch": 0.09828304317861744, + "grad_norm": 4.264495372772217, + "learning_rate": 1.9890084573836133e-07, + "loss": 0.0325, + "step": 24600 + }, + { + "epoch": 0.09832299563519412, + "grad_norm": 3.8093693256378174, + "learning_rate": 1.9889991576633683e-07, + "loss": 0.0269, + "step": 24610 + }, + { + "epoch": 0.09836294809177079, + "grad_norm": 6.49940824508667, + "learning_rate": 1.9889898540323936e-07, + "loss": 0.0297, + "step": 24620 + }, + { + "epoch": 0.09840290054834747, + "grad_norm": 2.2433106899261475, + "learning_rate": 1.988980546490727e-07, + "loss": 0.0249, + "step": 24630 + }, + { + "epoch": 0.09844285300492414, + "grad_norm": 4.2674078941345215, + "learning_rate": 1.9889712350384043e-07, + "loss": 0.0269, + "step": 24640 + }, + { + "epoch": 0.09848280546150082, + "grad_norm": 4.996397495269775, + "learning_rate": 1.9889619196754627e-07, + "loss": 0.0254, + "step": 24650 + }, + { + "epoch": 0.09852275791807749, + "grad_norm": 3.633824586868286, + "learning_rate": 1.988952600401939e-07, + "loss": 0.0271, + "step": 24660 + }, + { + "epoch": 0.09856271037465417, + "grad_norm": 2.294180393218994, + "learning_rate": 1.98894327721787e-07, + "loss": 0.0266, + "step": 24670 + }, + { + "epoch": 0.09860266283123084, + "grad_norm": 2.8874332904815674, + "learning_rate": 1.9889339501232926e-07, + "loss": 0.0309, + "step": 24680 + }, + { + "epoch": 0.0986426152878075, + "grad_norm": 2.3485422134399414, + "learning_rate": 1.9889246191182436e-07, + "loss": 0.0253, + "step": 24690 + }, + { + "epoch": 0.09868256774438418, + "grad_norm": 2.932359218597412, + "learning_rate": 1.9889152842027603e-07, + "loss": 0.0257, + "step": 24700 + }, + { + "epoch": 0.09872252020096085, + "grad_norm": 2.8491218090057373, + "learning_rate": 1.9889059453768787e-07, + "loss": 0.0261, + "step": 24710 + }, + { + "epoch": 0.09876247265753753, + "grad_norm": 3.4065089225769043, + "learning_rate": 1.9888966026406367e-07, + "loss": 0.0268, + "step": 24720 + }, + { + "epoch": 0.0988024251141142, + "grad_norm": 4.473590850830078, + "learning_rate": 1.9888872559940714e-07, + "loss": 0.031, + "step": 24730 + }, + { + "epoch": 0.09884237757069088, + "grad_norm": 3.348212480545044, + "learning_rate": 1.9888779054372184e-07, + "loss": 0.0314, + "step": 24740 + }, + { + "epoch": 0.09888233002726755, + "grad_norm": 3.001941204071045, + "learning_rate": 1.9888685509701159e-07, + "loss": 0.0267, + "step": 24750 + }, + { + "epoch": 0.09892228248384423, + "grad_norm": 8.497267723083496, + "learning_rate": 1.9888591925928004e-07, + "loss": 0.0274, + "step": 24760 + }, + { + "epoch": 0.0989622349404209, + "grad_norm": 5.149484157562256, + "learning_rate": 1.988849830305309e-07, + "loss": 0.027, + "step": 24770 + }, + { + "epoch": 0.09900218739699758, + "grad_norm": 7.558290004730225, + "learning_rate": 1.9888404641076786e-07, + "loss": 0.0284, + "step": 24780 + }, + { + "epoch": 0.09904213985357424, + "grad_norm": 16.8209285736084, + "learning_rate": 1.9888310939999465e-07, + "loss": 0.0285, + "step": 24790 + }, + { + "epoch": 0.09908209231015092, + "grad_norm": 3.7597384452819824, + "learning_rate": 1.9888217199821497e-07, + "loss": 0.0304, + "step": 24800 + }, + { + "epoch": 0.09912204476672759, + "grad_norm": 6.015547752380371, + "learning_rate": 1.9888123420543248e-07, + "loss": 0.0272, + "step": 24810 + }, + { + "epoch": 0.09916199722330427, + "grad_norm": 3.3251829147338867, + "learning_rate": 1.9888029602165096e-07, + "loss": 0.0297, + "step": 24820 + }, + { + "epoch": 0.09920194967988094, + "grad_norm": 5.752717971801758, + "learning_rate": 1.9887935744687406e-07, + "loss": 0.0285, + "step": 24830 + }, + { + "epoch": 0.09924190213645762, + "grad_norm": 7.329909324645996, + "learning_rate": 1.9887841848110553e-07, + "loss": 0.0289, + "step": 24840 + }, + { + "epoch": 0.09928185459303429, + "grad_norm": 6.668873310089111, + "learning_rate": 1.9887747912434905e-07, + "loss": 0.0296, + "step": 24850 + }, + { + "epoch": 0.09932180704961097, + "grad_norm": 7.863513946533203, + "learning_rate": 1.988765393766084e-07, + "loss": 0.0306, + "step": 24860 + }, + { + "epoch": 0.09936175950618764, + "grad_norm": 3.6350250244140625, + "learning_rate": 1.988755992378872e-07, + "loss": 0.0288, + "step": 24870 + }, + { + "epoch": 0.09940171196276432, + "grad_norm": 3.459024429321289, + "learning_rate": 1.9887465870818923e-07, + "loss": 0.0293, + "step": 24880 + }, + { + "epoch": 0.09944166441934098, + "grad_norm": 2.768312931060791, + "learning_rate": 1.988737177875182e-07, + "loss": 0.0289, + "step": 24890 + }, + { + "epoch": 0.09948161687591765, + "grad_norm": 2.5919346809387207, + "learning_rate": 1.9887277647587785e-07, + "loss": 0.0268, + "step": 24900 + }, + { + "epoch": 0.09952156933249433, + "grad_norm": 2.810037136077881, + "learning_rate": 1.9887183477327185e-07, + "loss": 0.0294, + "step": 24910 + }, + { + "epoch": 0.099561521789071, + "grad_norm": 2.51900577545166, + "learning_rate": 1.9887089267970398e-07, + "loss": 0.0304, + "step": 24920 + }, + { + "epoch": 0.09960147424564768, + "grad_norm": 2.293471097946167, + "learning_rate": 1.9886995019517792e-07, + "loss": 0.0308, + "step": 24930 + }, + { + "epoch": 0.09964142670222435, + "grad_norm": 5.65454626083374, + "learning_rate": 1.988690073196974e-07, + "loss": 0.0274, + "step": 24940 + }, + { + "epoch": 0.09968137915880103, + "grad_norm": 22.553386688232422, + "learning_rate": 1.988680640532662e-07, + "loss": 0.0274, + "step": 24950 + }, + { + "epoch": 0.0997213316153777, + "grad_norm": 2.9636473655700684, + "learning_rate": 1.9886712039588797e-07, + "loss": 0.0252, + "step": 24960 + }, + { + "epoch": 0.09976128407195438, + "grad_norm": 1.9934322834014893, + "learning_rate": 1.988661763475665e-07, + "loss": 0.0278, + "step": 24970 + }, + { + "epoch": 0.09980123652853105, + "grad_norm": 11.082646369934082, + "learning_rate": 1.988652319083055e-07, + "loss": 0.0295, + "step": 24980 + }, + { + "epoch": 0.09984118898510772, + "grad_norm": 1.8586386442184448, + "learning_rate": 1.9886428707810875e-07, + "loss": 0.0251, + "step": 24990 + }, + { + "epoch": 0.09988114144168439, + "grad_norm": 5.09521484375, + "learning_rate": 1.9886334185697994e-07, + "loss": 0.0296, + "step": 25000 + }, + { + "epoch": 0.09992109389826107, + "grad_norm": 5.679256916046143, + "learning_rate": 1.9886239624492278e-07, + "loss": 0.0289, + "step": 25010 + }, + { + "epoch": 0.09996104635483774, + "grad_norm": 6.243730545043945, + "learning_rate": 1.9886145024194105e-07, + "loss": 0.0279, + "step": 25020 + }, + { + "epoch": 0.10000099881141442, + "grad_norm": 4.739089012145996, + "learning_rate": 1.9886050384803853e-07, + "loss": 0.0281, + "step": 25030 + }, + { + "epoch": 0.10004095126799109, + "grad_norm": 12.003172874450684, + "learning_rate": 1.9885955706321886e-07, + "loss": 0.0298, + "step": 25040 + }, + { + "epoch": 0.10008090372456777, + "grad_norm": 4.860800266265869, + "learning_rate": 1.9885860988748587e-07, + "loss": 0.0297, + "step": 25050 + }, + { + "epoch": 0.10012085618114444, + "grad_norm": 3.847805976867676, + "learning_rate": 1.9885766232084328e-07, + "loss": 0.0329, + "step": 25060 + }, + { + "epoch": 0.10016080863772112, + "grad_norm": 3.3209433555603027, + "learning_rate": 1.9885671436329483e-07, + "loss": 0.0262, + "step": 25070 + }, + { + "epoch": 0.10020076109429779, + "grad_norm": 2.8408608436584473, + "learning_rate": 1.9885576601484427e-07, + "loss": 0.0275, + "step": 25080 + }, + { + "epoch": 0.10024071355087447, + "grad_norm": 7.41806173324585, + "learning_rate": 1.9885481727549536e-07, + "loss": 0.0291, + "step": 25090 + }, + { + "epoch": 0.10028066600745113, + "grad_norm": 4.789546966552734, + "learning_rate": 1.9885386814525184e-07, + "loss": 0.0265, + "step": 25100 + }, + { + "epoch": 0.1003206184640278, + "grad_norm": 2.3887293338775635, + "learning_rate": 1.9885291862411747e-07, + "loss": 0.0242, + "step": 25110 + }, + { + "epoch": 0.10036057092060448, + "grad_norm": 6.706392288208008, + "learning_rate": 1.9885196871209597e-07, + "loss": 0.0292, + "step": 25120 + }, + { + "epoch": 0.10040052337718115, + "grad_norm": 10.017127990722656, + "learning_rate": 1.9885101840919116e-07, + "loss": 0.028, + "step": 25130 + }, + { + "epoch": 0.10044047583375783, + "grad_norm": 4.877748489379883, + "learning_rate": 1.9885006771540674e-07, + "loss": 0.0298, + "step": 25140 + }, + { + "epoch": 0.1004804282903345, + "grad_norm": 7.099983215332031, + "learning_rate": 1.9884911663074653e-07, + "loss": 0.0322, + "step": 25150 + }, + { + "epoch": 0.10052038074691118, + "grad_norm": 6.730006217956543, + "learning_rate": 1.9884816515521426e-07, + "loss": 0.0303, + "step": 25160 + }, + { + "epoch": 0.10056033320348785, + "grad_norm": 2.0699260234832764, + "learning_rate": 1.9884721328881366e-07, + "loss": 0.0283, + "step": 25170 + }, + { + "epoch": 0.10060028566006453, + "grad_norm": 4.354230880737305, + "learning_rate": 1.9884626103154854e-07, + "loss": 0.0274, + "step": 25180 + }, + { + "epoch": 0.1006402381166412, + "grad_norm": 4.730912208557129, + "learning_rate": 1.9884530838342263e-07, + "loss": 0.0306, + "step": 25190 + }, + { + "epoch": 0.10068019057321786, + "grad_norm": 2.7206103801727295, + "learning_rate": 1.988443553444397e-07, + "loss": 0.0302, + "step": 25200 + }, + { + "epoch": 0.10072014302979454, + "grad_norm": 1.838386058807373, + "learning_rate": 1.9884340191460355e-07, + "loss": 0.0266, + "step": 25210 + }, + { + "epoch": 0.10076009548637122, + "grad_norm": 2.943053722381592, + "learning_rate": 1.9884244809391791e-07, + "loss": 0.0307, + "step": 25220 + }, + { + "epoch": 0.10080004794294789, + "grad_norm": 3.8740928173065186, + "learning_rate": 1.9884149388238664e-07, + "loss": 0.0256, + "step": 25230 + }, + { + "epoch": 0.10084000039952457, + "grad_norm": 4.3987717628479, + "learning_rate": 1.988405392800134e-07, + "loss": 0.0307, + "step": 25240 + }, + { + "epoch": 0.10087995285610124, + "grad_norm": 2.7708988189697266, + "learning_rate": 1.9883958428680202e-07, + "loss": 0.0278, + "step": 25250 + }, + { + "epoch": 0.10091990531267792, + "grad_norm": 4.649082660675049, + "learning_rate": 1.9883862890275628e-07, + "loss": 0.0283, + "step": 25260 + }, + { + "epoch": 0.10095985776925459, + "grad_norm": 2.8994874954223633, + "learning_rate": 1.988376731278799e-07, + "loss": 0.0323, + "step": 25270 + }, + { + "epoch": 0.10099981022583127, + "grad_norm": 2.9358623027801514, + "learning_rate": 1.9883671696217675e-07, + "loss": 0.0258, + "step": 25280 + }, + { + "epoch": 0.10103976268240794, + "grad_norm": 6.153625965118408, + "learning_rate": 1.988357604056506e-07, + "loss": 0.0287, + "step": 25290 + }, + { + "epoch": 0.1010797151389846, + "grad_norm": 3.6180946826934814, + "learning_rate": 1.9883480345830512e-07, + "loss": 0.0253, + "step": 25300 + }, + { + "epoch": 0.10111966759556128, + "grad_norm": 6.995093822479248, + "learning_rate": 1.988338461201442e-07, + "loss": 0.0244, + "step": 25310 + }, + { + "epoch": 0.10115962005213795, + "grad_norm": 2.9237027168273926, + "learning_rate": 1.9883288839117163e-07, + "loss": 0.0277, + "step": 25320 + }, + { + "epoch": 0.10119957250871463, + "grad_norm": 6.7656731605529785, + "learning_rate": 1.9883193027139113e-07, + "loss": 0.0279, + "step": 25330 + }, + { + "epoch": 0.1012395249652913, + "grad_norm": 2.948256492614746, + "learning_rate": 1.9883097176080654e-07, + "loss": 0.0274, + "step": 25340 + }, + { + "epoch": 0.10127947742186798, + "grad_norm": 4.4739251136779785, + "learning_rate": 1.9883001285942163e-07, + "loss": 0.0322, + "step": 25350 + }, + { + "epoch": 0.10131942987844465, + "grad_norm": 6.022429943084717, + "learning_rate": 1.988290535672402e-07, + "loss": 0.0276, + "step": 25360 + }, + { + "epoch": 0.10135938233502133, + "grad_norm": 19.881559371948242, + "learning_rate": 1.9882809388426603e-07, + "loss": 0.0312, + "step": 25370 + }, + { + "epoch": 0.101399334791598, + "grad_norm": 7.21485710144043, + "learning_rate": 1.9882713381050292e-07, + "loss": 0.0265, + "step": 25380 + }, + { + "epoch": 0.10143928724817468, + "grad_norm": 4.4046525955200195, + "learning_rate": 1.9882617334595467e-07, + "loss": 0.0281, + "step": 25390 + }, + { + "epoch": 0.10147923970475134, + "grad_norm": 5.569292068481445, + "learning_rate": 1.988252124906251e-07, + "loss": 0.0281, + "step": 25400 + }, + { + "epoch": 0.10151919216132801, + "grad_norm": 4.068460464477539, + "learning_rate": 1.98824251244518e-07, + "loss": 0.0239, + "step": 25410 + }, + { + "epoch": 0.10155914461790469, + "grad_norm": 7.27683687210083, + "learning_rate": 1.9882328960763713e-07, + "loss": 0.0267, + "step": 25420 + }, + { + "epoch": 0.10159909707448136, + "grad_norm": 6.2538628578186035, + "learning_rate": 1.988223275799863e-07, + "loss": 0.0282, + "step": 25430 + }, + { + "epoch": 0.10163904953105804, + "grad_norm": 13.753068923950195, + "learning_rate": 1.9882136516156937e-07, + "loss": 0.0294, + "step": 25440 + }, + { + "epoch": 0.10167900198763472, + "grad_norm": 5.340479373931885, + "learning_rate": 1.9882040235239012e-07, + "loss": 0.0311, + "step": 25450 + }, + { + "epoch": 0.10171895444421139, + "grad_norm": 4.662692546844482, + "learning_rate": 1.9881943915245233e-07, + "loss": 0.0244, + "step": 25460 + }, + { + "epoch": 0.10175890690078807, + "grad_norm": 4.37068510055542, + "learning_rate": 1.9881847556175982e-07, + "loss": 0.0234, + "step": 25470 + }, + { + "epoch": 0.10179885935736474, + "grad_norm": 3.571040391921997, + "learning_rate": 1.9881751158031643e-07, + "loss": 0.023, + "step": 25480 + }, + { + "epoch": 0.10183881181394142, + "grad_norm": 7.598379611968994, + "learning_rate": 1.9881654720812592e-07, + "loss": 0.0347, + "step": 25490 + }, + { + "epoch": 0.10187876427051808, + "grad_norm": 5.315488815307617, + "learning_rate": 1.9881558244519217e-07, + "loss": 0.0265, + "step": 25500 + }, + { + "epoch": 0.10191871672709475, + "grad_norm": 3.8550801277160645, + "learning_rate": 1.9881461729151892e-07, + "loss": 0.0251, + "step": 25510 + }, + { + "epoch": 0.10195866918367143, + "grad_norm": 3.1213881969451904, + "learning_rate": 1.9881365174711003e-07, + "loss": 0.0272, + "step": 25520 + }, + { + "epoch": 0.1019986216402481, + "grad_norm": 3.5948288440704346, + "learning_rate": 1.9881268581196932e-07, + "loss": 0.024, + "step": 25530 + }, + { + "epoch": 0.10203857409682478, + "grad_norm": 1.438394546508789, + "learning_rate": 1.9881171948610062e-07, + "loss": 0.0265, + "step": 25540 + }, + { + "epoch": 0.10207852655340145, + "grad_norm": 14.62169075012207, + "learning_rate": 1.9881075276950772e-07, + "loss": 0.0275, + "step": 25550 + }, + { + "epoch": 0.10211847900997813, + "grad_norm": 1.947511911392212, + "learning_rate": 1.9880978566219443e-07, + "loss": 0.0273, + "step": 25560 + }, + { + "epoch": 0.1021584314665548, + "grad_norm": 2.042447566986084, + "learning_rate": 1.9880881816416464e-07, + "loss": 0.0285, + "step": 25570 + }, + { + "epoch": 0.10219838392313148, + "grad_norm": 17.523895263671875, + "learning_rate": 1.988078502754221e-07, + "loss": 0.0276, + "step": 25580 + }, + { + "epoch": 0.10223833637970815, + "grad_norm": 7.530393123626709, + "learning_rate": 1.988068819959707e-07, + "loss": 0.0344, + "step": 25590 + }, + { + "epoch": 0.10227828883628481, + "grad_norm": 10.46077823638916, + "learning_rate": 1.9880591332581423e-07, + "loss": 0.0286, + "step": 25600 + }, + { + "epoch": 0.10231824129286149, + "grad_norm": 2.7909905910491943, + "learning_rate": 1.988049442649565e-07, + "loss": 0.032, + "step": 25610 + }, + { + "epoch": 0.10235819374943816, + "grad_norm": 3.2623708248138428, + "learning_rate": 1.9880397481340142e-07, + "loss": 0.0295, + "step": 25620 + }, + { + "epoch": 0.10239814620601484, + "grad_norm": 2.7718546390533447, + "learning_rate": 1.9880300497115273e-07, + "loss": 0.0274, + "step": 25630 + }, + { + "epoch": 0.10243809866259151, + "grad_norm": 5.204047203063965, + "learning_rate": 1.9880203473821436e-07, + "loss": 0.0295, + "step": 25640 + }, + { + "epoch": 0.10247805111916819, + "grad_norm": 3.426187038421631, + "learning_rate": 1.9880106411459009e-07, + "loss": 0.0265, + "step": 25650 + }, + { + "epoch": 0.10251800357574486, + "grad_norm": 3.8072099685668945, + "learning_rate": 1.9880009310028375e-07, + "loss": 0.0273, + "step": 25660 + }, + { + "epoch": 0.10255795603232154, + "grad_norm": 21.09086799621582, + "learning_rate": 1.987991216952992e-07, + "loss": 0.0282, + "step": 25670 + }, + { + "epoch": 0.10259790848889822, + "grad_norm": 2.457852363586426, + "learning_rate": 1.9879814989964027e-07, + "loss": 0.0281, + "step": 25680 + }, + { + "epoch": 0.10263786094547489, + "grad_norm": 3.0236945152282715, + "learning_rate": 1.987971777133108e-07, + "loss": 0.0279, + "step": 25690 + }, + { + "epoch": 0.10267781340205157, + "grad_norm": 4.487567901611328, + "learning_rate": 1.9879620513631468e-07, + "loss": 0.0292, + "step": 25700 + }, + { + "epoch": 0.10271776585862823, + "grad_norm": 8.856450080871582, + "learning_rate": 1.987952321686557e-07, + "loss": 0.0206, + "step": 25710 + }, + { + "epoch": 0.1027577183152049, + "grad_norm": 9.044903755187988, + "learning_rate": 1.9879425881033774e-07, + "loss": 0.0319, + "step": 25720 + }, + { + "epoch": 0.10279767077178158, + "grad_norm": 4.972631931304932, + "learning_rate": 1.987932850613646e-07, + "loss": 0.0251, + "step": 25730 + }, + { + "epoch": 0.10283762322835825, + "grad_norm": 3.2554826736450195, + "learning_rate": 1.987923109217402e-07, + "loss": 0.0275, + "step": 25740 + }, + { + "epoch": 0.10287757568493493, + "grad_norm": 3.0088441371917725, + "learning_rate": 1.9879133639146834e-07, + "loss": 0.029, + "step": 25750 + }, + { + "epoch": 0.1029175281415116, + "grad_norm": 2.392936944961548, + "learning_rate": 1.9879036147055288e-07, + "loss": 0.0275, + "step": 25760 + }, + { + "epoch": 0.10295748059808828, + "grad_norm": 2.778597354888916, + "learning_rate": 1.987893861589977e-07, + "loss": 0.0286, + "step": 25770 + }, + { + "epoch": 0.10299743305466495, + "grad_norm": 3.205664873123169, + "learning_rate": 1.9878841045680667e-07, + "loss": 0.0264, + "step": 25780 + }, + { + "epoch": 0.10303738551124163, + "grad_norm": 3.8408594131469727, + "learning_rate": 1.987874343639836e-07, + "loss": 0.0271, + "step": 25790 + }, + { + "epoch": 0.1030773379678183, + "grad_norm": 10.4679536819458, + "learning_rate": 1.9878645788053237e-07, + "loss": 0.0284, + "step": 25800 + }, + { + "epoch": 0.10311729042439496, + "grad_norm": 5.81333589553833, + "learning_rate": 1.9878548100645681e-07, + "loss": 0.0284, + "step": 25810 + }, + { + "epoch": 0.10315724288097164, + "grad_norm": 4.977374076843262, + "learning_rate": 1.9878450374176087e-07, + "loss": 0.0226, + "step": 25820 + }, + { + "epoch": 0.10319719533754831, + "grad_norm": 3.298175096511841, + "learning_rate": 1.9878352608644833e-07, + "loss": 0.0263, + "step": 25830 + }, + { + "epoch": 0.10323714779412499, + "grad_norm": 2.6284139156341553, + "learning_rate": 1.987825480405231e-07, + "loss": 0.0281, + "step": 25840 + }, + { + "epoch": 0.10327710025070166, + "grad_norm": 4.4218525886535645, + "learning_rate": 1.9878156960398903e-07, + "loss": 0.0264, + "step": 25850 + }, + { + "epoch": 0.10331705270727834, + "grad_norm": 3.4939255714416504, + "learning_rate": 1.9878059077685e-07, + "loss": 0.0228, + "step": 25860 + }, + { + "epoch": 0.10335700516385501, + "grad_norm": 7.772692680358887, + "learning_rate": 1.9877961155910985e-07, + "loss": 0.0259, + "step": 25870 + }, + { + "epoch": 0.10339695762043169, + "grad_norm": 6.016354084014893, + "learning_rate": 1.9877863195077248e-07, + "loss": 0.0288, + "step": 25880 + }, + { + "epoch": 0.10343691007700836, + "grad_norm": 1.7986489534378052, + "learning_rate": 1.9877765195184177e-07, + "loss": 0.0237, + "step": 25890 + }, + { + "epoch": 0.10347686253358504, + "grad_norm": 2.5561578273773193, + "learning_rate": 1.987766715623216e-07, + "loss": 0.0272, + "step": 25900 + }, + { + "epoch": 0.1035168149901617, + "grad_norm": 5.965950012207031, + "learning_rate": 1.9877569078221578e-07, + "loss": 0.0266, + "step": 25910 + }, + { + "epoch": 0.10355676744673838, + "grad_norm": 3.777688503265381, + "learning_rate": 1.9877470961152826e-07, + "loss": 0.0273, + "step": 25920 + }, + { + "epoch": 0.10359671990331505, + "grad_norm": 5.887746810913086, + "learning_rate": 1.987737280502629e-07, + "loss": 0.0233, + "step": 25930 + }, + { + "epoch": 0.10363667235989173, + "grad_norm": 4.412550449371338, + "learning_rate": 1.987727460984236e-07, + "loss": 0.0262, + "step": 25940 + }, + { + "epoch": 0.1036766248164684, + "grad_norm": 7.327163219451904, + "learning_rate": 1.987717637560142e-07, + "loss": 0.0336, + "step": 25950 + }, + { + "epoch": 0.10371657727304508, + "grad_norm": 3.3658909797668457, + "learning_rate": 1.987707810230386e-07, + "loss": 0.0281, + "step": 25960 + }, + { + "epoch": 0.10375652972962175, + "grad_norm": 5.340783596038818, + "learning_rate": 1.987697978995007e-07, + "loss": 0.023, + "step": 25970 + }, + { + "epoch": 0.10379648218619843, + "grad_norm": 5.636935710906982, + "learning_rate": 1.987688143854044e-07, + "loss": 0.0242, + "step": 25980 + }, + { + "epoch": 0.1038364346427751, + "grad_norm": 4.219330787658691, + "learning_rate": 1.9876783048075355e-07, + "loss": 0.0256, + "step": 25990 + }, + { + "epoch": 0.10387638709935178, + "grad_norm": 3.6393537521362305, + "learning_rate": 1.9876684618555206e-07, + "loss": 0.0286, + "step": 26000 + }, + { + "epoch": 0.10391633955592844, + "grad_norm": 2.4842209815979004, + "learning_rate": 1.987658614998038e-07, + "loss": 0.0299, + "step": 26010 + }, + { + "epoch": 0.10395629201250511, + "grad_norm": 4.507482051849365, + "learning_rate": 1.987648764235127e-07, + "loss": 0.029, + "step": 26020 + }, + { + "epoch": 0.10399624446908179, + "grad_norm": 3.1344544887542725, + "learning_rate": 1.9876389095668268e-07, + "loss": 0.0283, + "step": 26030 + }, + { + "epoch": 0.10403619692565846, + "grad_norm": 9.435651779174805, + "learning_rate": 1.9876290509931754e-07, + "loss": 0.0266, + "step": 26040 + }, + { + "epoch": 0.10407614938223514, + "grad_norm": 3.670053720474243, + "learning_rate": 1.9876191885142125e-07, + "loss": 0.0241, + "step": 26050 + }, + { + "epoch": 0.10411610183881181, + "grad_norm": 4.763343334197998, + "learning_rate": 1.9876093221299772e-07, + "loss": 0.0279, + "step": 26060 + }, + { + "epoch": 0.10415605429538849, + "grad_norm": 7.469449996948242, + "learning_rate": 1.9875994518405084e-07, + "loss": 0.0288, + "step": 26070 + }, + { + "epoch": 0.10419600675196516, + "grad_norm": 3.58823299407959, + "learning_rate": 1.9875895776458447e-07, + "loss": 0.0299, + "step": 26080 + }, + { + "epoch": 0.10423595920854184, + "grad_norm": 9.735212326049805, + "learning_rate": 1.987579699546025e-07, + "loss": 0.0317, + "step": 26090 + }, + { + "epoch": 0.10427591166511851, + "grad_norm": 7.366652488708496, + "learning_rate": 1.9875698175410895e-07, + "loss": 0.0296, + "step": 26100 + }, + { + "epoch": 0.10431586412169518, + "grad_norm": 4.625205039978027, + "learning_rate": 1.9875599316310764e-07, + "loss": 0.0263, + "step": 26110 + }, + { + "epoch": 0.10435581657827185, + "grad_norm": 5.726237773895264, + "learning_rate": 1.9875500418160248e-07, + "loss": 0.0246, + "step": 26120 + }, + { + "epoch": 0.10439576903484853, + "grad_norm": 8.52668571472168, + "learning_rate": 1.9875401480959743e-07, + "loss": 0.032, + "step": 26130 + }, + { + "epoch": 0.1044357214914252, + "grad_norm": 2.692918300628662, + "learning_rate": 1.9875302504709635e-07, + "loss": 0.0263, + "step": 26140 + }, + { + "epoch": 0.10447567394800188, + "grad_norm": 2.3756003379821777, + "learning_rate": 1.9875203489410317e-07, + "loss": 0.0295, + "step": 26150 + }, + { + "epoch": 0.10451562640457855, + "grad_norm": 3.3385732173919678, + "learning_rate": 1.9875104435062183e-07, + "loss": 0.0256, + "step": 26160 + }, + { + "epoch": 0.10455557886115523, + "grad_norm": 5.577840328216553, + "learning_rate": 1.9875005341665618e-07, + "loss": 0.0266, + "step": 26170 + }, + { + "epoch": 0.1045955313177319, + "grad_norm": 3.119859457015991, + "learning_rate": 1.9874906209221022e-07, + "loss": 0.0293, + "step": 26180 + }, + { + "epoch": 0.10463548377430858, + "grad_norm": 2.968820810317993, + "learning_rate": 1.9874807037728781e-07, + "loss": 0.022, + "step": 26190 + }, + { + "epoch": 0.10467543623088525, + "grad_norm": 2.377575397491455, + "learning_rate": 1.987470782718929e-07, + "loss": 0.0265, + "step": 26200 + }, + { + "epoch": 0.10471538868746193, + "grad_norm": 8.84180736541748, + "learning_rate": 1.9874608577602947e-07, + "loss": 0.0281, + "step": 26210 + }, + { + "epoch": 0.10475534114403859, + "grad_norm": 4.607086658477783, + "learning_rate": 1.9874509288970132e-07, + "loss": 0.029, + "step": 26220 + }, + { + "epoch": 0.10479529360061526, + "grad_norm": 3.9904797077178955, + "learning_rate": 1.9874409961291244e-07, + "loss": 0.0264, + "step": 26230 + }, + { + "epoch": 0.10483524605719194, + "grad_norm": 3.5439295768737793, + "learning_rate": 1.987431059456668e-07, + "loss": 0.0277, + "step": 26240 + }, + { + "epoch": 0.10487519851376861, + "grad_norm": 4.266513347625732, + "learning_rate": 1.9874211188796822e-07, + "loss": 0.0318, + "step": 26250 + }, + { + "epoch": 0.10491515097034529, + "grad_norm": 4.996826648712158, + "learning_rate": 1.9874111743982077e-07, + "loss": 0.0276, + "step": 26260 + }, + { + "epoch": 0.10495510342692196, + "grad_norm": 2.8123931884765625, + "learning_rate": 1.9874012260122823e-07, + "loss": 0.025, + "step": 26270 + }, + { + "epoch": 0.10499505588349864, + "grad_norm": 8.176513671875, + "learning_rate": 1.9873912737219467e-07, + "loss": 0.0295, + "step": 26280 + }, + { + "epoch": 0.10503500834007531, + "grad_norm": 4.292375564575195, + "learning_rate": 1.9873813175272397e-07, + "loss": 0.0215, + "step": 26290 + }, + { + "epoch": 0.10507496079665199, + "grad_norm": 11.274954795837402, + "learning_rate": 1.9873713574282002e-07, + "loss": 0.0309, + "step": 26300 + }, + { + "epoch": 0.10511491325322866, + "grad_norm": 2.3699393272399902, + "learning_rate": 1.9873613934248686e-07, + "loss": 0.0251, + "step": 26310 + }, + { + "epoch": 0.10515486570980533, + "grad_norm": 4.37923002243042, + "learning_rate": 1.9873514255172833e-07, + "loss": 0.029, + "step": 26320 + }, + { + "epoch": 0.105194818166382, + "grad_norm": 7.09029483795166, + "learning_rate": 1.9873414537054841e-07, + "loss": 0.0276, + "step": 26330 + }, + { + "epoch": 0.10523477062295868, + "grad_norm": 2.5866665840148926, + "learning_rate": 1.9873314779895108e-07, + "loss": 0.0235, + "step": 26340 + }, + { + "epoch": 0.10527472307953535, + "grad_norm": 6.4519782066345215, + "learning_rate": 1.9873214983694025e-07, + "loss": 0.0258, + "step": 26350 + }, + { + "epoch": 0.10531467553611203, + "grad_norm": 3.7830684185028076, + "learning_rate": 1.9873115148451984e-07, + "loss": 0.0272, + "step": 26360 + }, + { + "epoch": 0.1053546279926887, + "grad_norm": 2.5546653270721436, + "learning_rate": 1.9873015274169384e-07, + "loss": 0.0309, + "step": 26370 + }, + { + "epoch": 0.10539458044926538, + "grad_norm": 5.839615821838379, + "learning_rate": 1.987291536084662e-07, + "loss": 0.0298, + "step": 26380 + }, + { + "epoch": 0.10543453290584205, + "grad_norm": 6.888423442840576, + "learning_rate": 1.9872815408484084e-07, + "loss": 0.0301, + "step": 26390 + }, + { + "epoch": 0.10547448536241873, + "grad_norm": 5.997581481933594, + "learning_rate": 1.9872715417082175e-07, + "loss": 0.0262, + "step": 26400 + }, + { + "epoch": 0.1055144378189954, + "grad_norm": 3.0020639896392822, + "learning_rate": 1.9872615386641285e-07, + "loss": 0.0238, + "step": 26410 + }, + { + "epoch": 0.10555439027557206, + "grad_norm": 4.342557907104492, + "learning_rate": 1.9872515317161811e-07, + "loss": 0.029, + "step": 26420 + }, + { + "epoch": 0.10559434273214874, + "grad_norm": 7.003819465637207, + "learning_rate": 1.9872415208644148e-07, + "loss": 0.0272, + "step": 26430 + }, + { + "epoch": 0.10563429518872541, + "grad_norm": 6.924483299255371, + "learning_rate": 1.9872315061088692e-07, + "loss": 0.0304, + "step": 26440 + }, + { + "epoch": 0.10567424764530209, + "grad_norm": 4.927184581756592, + "learning_rate": 1.9872214874495841e-07, + "loss": 0.0279, + "step": 26450 + }, + { + "epoch": 0.10571420010187876, + "grad_norm": 2.478452205657959, + "learning_rate": 1.987211464886599e-07, + "loss": 0.0231, + "step": 26460 + }, + { + "epoch": 0.10575415255845544, + "grad_norm": 4.792454242706299, + "learning_rate": 1.9872014384199534e-07, + "loss": 0.025, + "step": 26470 + }, + { + "epoch": 0.10579410501503211, + "grad_norm": 7.218120098114014, + "learning_rate": 1.987191408049687e-07, + "loss": 0.0242, + "step": 26480 + }, + { + "epoch": 0.10583405747160879, + "grad_norm": 3.956031084060669, + "learning_rate": 1.9871813737758393e-07, + "loss": 0.0287, + "step": 26490 + }, + { + "epoch": 0.10587400992818546, + "grad_norm": 7.695531368255615, + "learning_rate": 1.9871713355984505e-07, + "loss": 0.0231, + "step": 26500 + }, + { + "epoch": 0.10591396238476214, + "grad_norm": 12.49987506866455, + "learning_rate": 1.9871612935175598e-07, + "loss": 0.0262, + "step": 26510 + }, + { + "epoch": 0.1059539148413388, + "grad_norm": 2.2339251041412354, + "learning_rate": 1.987151247533207e-07, + "loss": 0.0249, + "step": 26520 + }, + { + "epoch": 0.10599386729791548, + "grad_norm": 5.577976226806641, + "learning_rate": 1.987141197645432e-07, + "loss": 0.0266, + "step": 26530 + }, + { + "epoch": 0.10603381975449215, + "grad_norm": 5.3416056632995605, + "learning_rate": 1.9871311438542747e-07, + "loss": 0.0234, + "step": 26540 + }, + { + "epoch": 0.10607377221106883, + "grad_norm": 3.9813039302825928, + "learning_rate": 1.9871210861597743e-07, + "loss": 0.0261, + "step": 26550 + }, + { + "epoch": 0.1061137246676455, + "grad_norm": 9.757558822631836, + "learning_rate": 1.9871110245619708e-07, + "loss": 0.0297, + "step": 26560 + }, + { + "epoch": 0.10615367712422218, + "grad_norm": 3.401294708251953, + "learning_rate": 1.9871009590609042e-07, + "loss": 0.0277, + "step": 26570 + }, + { + "epoch": 0.10619362958079885, + "grad_norm": 2.753633975982666, + "learning_rate": 1.987090889656614e-07, + "loss": 0.0264, + "step": 26580 + }, + { + "epoch": 0.10623358203737553, + "grad_norm": 3.835784912109375, + "learning_rate": 1.9870808163491404e-07, + "loss": 0.0332, + "step": 26590 + }, + { + "epoch": 0.1062735344939522, + "grad_norm": 1.915737509727478, + "learning_rate": 1.9870707391385229e-07, + "loss": 0.0274, + "step": 26600 + }, + { + "epoch": 0.10631348695052888, + "grad_norm": 4.356095314025879, + "learning_rate": 1.9870606580248013e-07, + "loss": 0.0307, + "step": 26610 + }, + { + "epoch": 0.10635343940710554, + "grad_norm": 5.953808784484863, + "learning_rate": 1.987050573008016e-07, + "loss": 0.0291, + "step": 26620 + }, + { + "epoch": 0.10639339186368221, + "grad_norm": 3.8007636070251465, + "learning_rate": 1.9870404840882063e-07, + "loss": 0.0268, + "step": 26630 + }, + { + "epoch": 0.10643334432025889, + "grad_norm": 5.4945292472839355, + "learning_rate": 1.987030391265412e-07, + "loss": 0.0232, + "step": 26640 + }, + { + "epoch": 0.10647329677683556, + "grad_norm": 7.2119035720825195, + "learning_rate": 1.9870202945396736e-07, + "loss": 0.0307, + "step": 26650 + }, + { + "epoch": 0.10651324923341224, + "grad_norm": 3.5051116943359375, + "learning_rate": 1.9870101939110305e-07, + "loss": 0.0249, + "step": 26660 + }, + { + "epoch": 0.10655320168998891, + "grad_norm": 3.4030938148498535, + "learning_rate": 1.987000089379523e-07, + "loss": 0.0313, + "step": 26670 + }, + { + "epoch": 0.10659315414656559, + "grad_norm": 3.596802234649658, + "learning_rate": 1.986989980945191e-07, + "loss": 0.0293, + "step": 26680 + }, + { + "epoch": 0.10663310660314226, + "grad_norm": 3.3151938915252686, + "learning_rate": 1.9869798686080742e-07, + "loss": 0.0234, + "step": 26690 + }, + { + "epoch": 0.10667305905971894, + "grad_norm": 5.168748378753662, + "learning_rate": 1.986969752368213e-07, + "loss": 0.0311, + "step": 26700 + }, + { + "epoch": 0.10671301151629561, + "grad_norm": 3.818817377090454, + "learning_rate": 1.986959632225647e-07, + "loss": 0.028, + "step": 26710 + }, + { + "epoch": 0.10675296397287228, + "grad_norm": 5.123197078704834, + "learning_rate": 1.9869495081804163e-07, + "loss": 0.0259, + "step": 26720 + }, + { + "epoch": 0.10679291642944895, + "grad_norm": 4.391772270202637, + "learning_rate": 1.9869393802325614e-07, + "loss": 0.0251, + "step": 26730 + }, + { + "epoch": 0.10683286888602563, + "grad_norm": 4.876038074493408, + "learning_rate": 1.9869292483821216e-07, + "loss": 0.0273, + "step": 26740 + }, + { + "epoch": 0.1068728213426023, + "grad_norm": 3.0409555435180664, + "learning_rate": 1.9869191126291374e-07, + "loss": 0.0253, + "step": 26750 + }, + { + "epoch": 0.10691277379917898, + "grad_norm": 4.002513408660889, + "learning_rate": 1.9869089729736488e-07, + "loss": 0.0289, + "step": 26760 + }, + { + "epoch": 0.10695272625575565, + "grad_norm": 4.036714553833008, + "learning_rate": 1.9868988294156962e-07, + "loss": 0.0283, + "step": 26770 + }, + { + "epoch": 0.10699267871233233, + "grad_norm": 4.003391742706299, + "learning_rate": 1.9868886819553193e-07, + "loss": 0.0284, + "step": 26780 + }, + { + "epoch": 0.107032631168909, + "grad_norm": 3.1063218116760254, + "learning_rate": 1.986878530592558e-07, + "loss": 0.027, + "step": 26790 + }, + { + "epoch": 0.10707258362548568, + "grad_norm": 3.921475887298584, + "learning_rate": 1.9868683753274532e-07, + "loss": 0.0283, + "step": 26800 + }, + { + "epoch": 0.10711253608206235, + "grad_norm": 5.833354473114014, + "learning_rate": 1.9868582161600444e-07, + "loss": 0.0251, + "step": 26810 + }, + { + "epoch": 0.10715248853863903, + "grad_norm": 3.7478280067443848, + "learning_rate": 1.986848053090372e-07, + "loss": 0.0288, + "step": 26820 + }, + { + "epoch": 0.10719244099521569, + "grad_norm": 6.621691703796387, + "learning_rate": 1.986837886118476e-07, + "loss": 0.0309, + "step": 26830 + }, + { + "epoch": 0.10723239345179236, + "grad_norm": 4.509171962738037, + "learning_rate": 1.9868277152443976e-07, + "loss": 0.0267, + "step": 26840 + }, + { + "epoch": 0.10727234590836904, + "grad_norm": 3.2230570316314697, + "learning_rate": 1.9868175404681756e-07, + "loss": 0.0255, + "step": 26850 + }, + { + "epoch": 0.10731229836494571, + "grad_norm": 4.180356979370117, + "learning_rate": 1.986807361789851e-07, + "loss": 0.0316, + "step": 26860 + }, + { + "epoch": 0.10735225082152239, + "grad_norm": 3.3670172691345215, + "learning_rate": 1.9867971792094637e-07, + "loss": 0.0278, + "step": 26870 + }, + { + "epoch": 0.10739220327809906, + "grad_norm": 5.596536636352539, + "learning_rate": 1.9867869927270546e-07, + "loss": 0.0284, + "step": 26880 + }, + { + "epoch": 0.10743215573467574, + "grad_norm": 3.5360262393951416, + "learning_rate": 1.9867768023426632e-07, + "loss": 0.03, + "step": 26890 + }, + { + "epoch": 0.10747210819125241, + "grad_norm": 3.7247745990753174, + "learning_rate": 1.9867666080563303e-07, + "loss": 0.0247, + "step": 26900 + }, + { + "epoch": 0.10751206064782909, + "grad_norm": 1.9713071584701538, + "learning_rate": 1.986756409868096e-07, + "loss": 0.0269, + "step": 26910 + }, + { + "epoch": 0.10755201310440576, + "grad_norm": 10.79904556274414, + "learning_rate": 1.9867462077780007e-07, + "loss": 0.0261, + "step": 26920 + }, + { + "epoch": 0.10759196556098242, + "grad_norm": 3.6289305686950684, + "learning_rate": 1.9867360017860847e-07, + "loss": 0.0241, + "step": 26930 + }, + { + "epoch": 0.1076319180175591, + "grad_norm": 4.3684916496276855, + "learning_rate": 1.9867257918923884e-07, + "loss": 0.0272, + "step": 26940 + }, + { + "epoch": 0.10767187047413578, + "grad_norm": 4.744363784790039, + "learning_rate": 1.9867155780969518e-07, + "loss": 0.0286, + "step": 26950 + }, + { + "epoch": 0.10771182293071245, + "grad_norm": 4.299191474914551, + "learning_rate": 1.986705360399816e-07, + "loss": 0.03, + "step": 26960 + }, + { + "epoch": 0.10775177538728913, + "grad_norm": 3.653930425643921, + "learning_rate": 1.986695138801021e-07, + "loss": 0.0259, + "step": 26970 + }, + { + "epoch": 0.1077917278438658, + "grad_norm": 4.466867446899414, + "learning_rate": 1.9866849133006073e-07, + "loss": 0.027, + "step": 26980 + }, + { + "epoch": 0.10783168030044248, + "grad_norm": 16.481983184814453, + "learning_rate": 1.9866746838986152e-07, + "loss": 0.0255, + "step": 26990 + }, + { + "epoch": 0.10787163275701915, + "grad_norm": 2.724349021911621, + "learning_rate": 1.9866644505950852e-07, + "loss": 0.0239, + "step": 27000 + }, + { + "epoch": 0.10791158521359583, + "grad_norm": 3.325464963912964, + "learning_rate": 1.986654213390058e-07, + "loss": 0.0242, + "step": 27010 + }, + { + "epoch": 0.1079515376701725, + "grad_norm": 1.733672022819519, + "learning_rate": 1.986643972283574e-07, + "loss": 0.0289, + "step": 27020 + }, + { + "epoch": 0.10799149012674916, + "grad_norm": 3.6638684272766113, + "learning_rate": 1.9866337272756732e-07, + "loss": 0.0253, + "step": 27030 + }, + { + "epoch": 0.10803144258332584, + "grad_norm": 3.642035484313965, + "learning_rate": 1.9866234783663966e-07, + "loss": 0.0264, + "step": 27040 + }, + { + "epoch": 0.10807139503990251, + "grad_norm": 4.7474164962768555, + "learning_rate": 1.9866132255557848e-07, + "loss": 0.028, + "step": 27050 + }, + { + "epoch": 0.10811134749647919, + "grad_norm": 2.3218772411346436, + "learning_rate": 1.9866029688438778e-07, + "loss": 0.0253, + "step": 27060 + }, + { + "epoch": 0.10815129995305586, + "grad_norm": 2.5420145988464355, + "learning_rate": 1.986592708230717e-07, + "loss": 0.0281, + "step": 27070 + }, + { + "epoch": 0.10819125240963254, + "grad_norm": 3.1303482055664062, + "learning_rate": 1.986582443716342e-07, + "loss": 0.0261, + "step": 27080 + }, + { + "epoch": 0.10823120486620921, + "grad_norm": 2.2326319217681885, + "learning_rate": 1.9865721753007944e-07, + "loss": 0.0286, + "step": 27090 + }, + { + "epoch": 0.10827115732278589, + "grad_norm": 4.5724406242370605, + "learning_rate": 1.9865619029841138e-07, + "loss": 0.0326, + "step": 27100 + }, + { + "epoch": 0.10831110977936256, + "grad_norm": 3.7288217544555664, + "learning_rate": 1.9865516267663417e-07, + "loss": 0.0253, + "step": 27110 + }, + { + "epoch": 0.10835106223593924, + "grad_norm": 4.773068904876709, + "learning_rate": 1.986541346647518e-07, + "loss": 0.0286, + "step": 27120 + }, + { + "epoch": 0.1083910146925159, + "grad_norm": 4.5517730712890625, + "learning_rate": 1.986531062627684e-07, + "loss": 0.0263, + "step": 27130 + }, + { + "epoch": 0.10843096714909257, + "grad_norm": 3.898563861846924, + "learning_rate": 1.98652077470688e-07, + "loss": 0.0296, + "step": 27140 + }, + { + "epoch": 0.10847091960566925, + "grad_norm": 8.597043991088867, + "learning_rate": 1.9865104828851465e-07, + "loss": 0.0264, + "step": 27150 + }, + { + "epoch": 0.10851087206224592, + "grad_norm": 2.3783719539642334, + "learning_rate": 1.9865001871625244e-07, + "loss": 0.0268, + "step": 27160 + }, + { + "epoch": 0.1085508245188226, + "grad_norm": 4.137610912322998, + "learning_rate": 1.986489887539055e-07, + "loss": 0.0241, + "step": 27170 + }, + { + "epoch": 0.10859077697539928, + "grad_norm": 1.6401265859603882, + "learning_rate": 1.986479584014778e-07, + "loss": 0.0265, + "step": 27180 + }, + { + "epoch": 0.10863072943197595, + "grad_norm": 8.234698295593262, + "learning_rate": 1.9864692765897347e-07, + "loss": 0.0257, + "step": 27190 + }, + { + "epoch": 0.10867068188855263, + "grad_norm": 3.1435327529907227, + "learning_rate": 1.9864589652639655e-07, + "loss": 0.0286, + "step": 27200 + }, + { + "epoch": 0.1087106343451293, + "grad_norm": 2.721997022628784, + "learning_rate": 1.986448650037512e-07, + "loss": 0.024, + "step": 27210 + }, + { + "epoch": 0.10875058680170598, + "grad_norm": 7.364617824554443, + "learning_rate": 1.9864383309104137e-07, + "loss": 0.0303, + "step": 27220 + }, + { + "epoch": 0.10879053925828264, + "grad_norm": 78.19937133789062, + "learning_rate": 1.9864280078827126e-07, + "loss": 0.0259, + "step": 27230 + }, + { + "epoch": 0.10883049171485931, + "grad_norm": 5.5029520988464355, + "learning_rate": 1.986417680954449e-07, + "loss": 0.0259, + "step": 27240 + }, + { + "epoch": 0.10887044417143599, + "grad_norm": 4.095463275909424, + "learning_rate": 1.9864073501256643e-07, + "loss": 0.0276, + "step": 27250 + }, + { + "epoch": 0.10891039662801266, + "grad_norm": 6.929133415222168, + "learning_rate": 1.986397015396398e-07, + "loss": 0.0231, + "step": 27260 + }, + { + "epoch": 0.10895034908458934, + "grad_norm": 2.022120237350464, + "learning_rate": 1.9863866767666923e-07, + "loss": 0.0263, + "step": 27270 + }, + { + "epoch": 0.10899030154116601, + "grad_norm": 2.887474775314331, + "learning_rate": 1.9863763342365872e-07, + "loss": 0.0291, + "step": 27280 + }, + { + "epoch": 0.10903025399774269, + "grad_norm": 1.5554161071777344, + "learning_rate": 1.9863659878061244e-07, + "loss": 0.0267, + "step": 27290 + }, + { + "epoch": 0.10907020645431936, + "grad_norm": 10.142356872558594, + "learning_rate": 1.9863556374753443e-07, + "loss": 0.0314, + "step": 27300 + }, + { + "epoch": 0.10911015891089604, + "grad_norm": 2.2729477882385254, + "learning_rate": 1.986345283244288e-07, + "loss": 0.0228, + "step": 27310 + }, + { + "epoch": 0.10915011136747271, + "grad_norm": 2.7695536613464355, + "learning_rate": 1.9863349251129962e-07, + "loss": 0.0306, + "step": 27320 + }, + { + "epoch": 0.10919006382404939, + "grad_norm": 3.8773884773254395, + "learning_rate": 1.98632456308151e-07, + "loss": 0.0231, + "step": 27330 + }, + { + "epoch": 0.10923001628062605, + "grad_norm": 4.320541858673096, + "learning_rate": 1.9863141971498703e-07, + "loss": 0.0279, + "step": 27340 + }, + { + "epoch": 0.10926996873720272, + "grad_norm": 4.729318618774414, + "learning_rate": 1.9863038273181183e-07, + "loss": 0.0225, + "step": 27350 + }, + { + "epoch": 0.1093099211937794, + "grad_norm": 5.039651870727539, + "learning_rate": 1.986293453586295e-07, + "loss": 0.0296, + "step": 27360 + }, + { + "epoch": 0.10934987365035607, + "grad_norm": 4.043004035949707, + "learning_rate": 1.9862830759544412e-07, + "loss": 0.0225, + "step": 27370 + }, + { + "epoch": 0.10938982610693275, + "grad_norm": 4.381801128387451, + "learning_rate": 1.9862726944225982e-07, + "loss": 0.0224, + "step": 27380 + }, + { + "epoch": 0.10942977856350943, + "grad_norm": 3.1091811656951904, + "learning_rate": 1.986262308990807e-07, + "loss": 0.0253, + "step": 27390 + }, + { + "epoch": 0.1094697310200861, + "grad_norm": 2.7364182472229004, + "learning_rate": 1.9862519196591082e-07, + "loss": 0.0247, + "step": 27400 + }, + { + "epoch": 0.10950968347666278, + "grad_norm": 3.3801991939544678, + "learning_rate": 1.9862415264275434e-07, + "loss": 0.0259, + "step": 27410 + }, + { + "epoch": 0.10954963593323945, + "grad_norm": 2.9533257484436035, + "learning_rate": 1.9862311292961535e-07, + "loss": 0.0246, + "step": 27420 + }, + { + "epoch": 0.10958958838981613, + "grad_norm": 7.317258358001709, + "learning_rate": 1.9862207282649797e-07, + "loss": 0.0234, + "step": 27430 + }, + { + "epoch": 0.10962954084639279, + "grad_norm": 3.826073408126831, + "learning_rate": 1.9862103233340633e-07, + "loss": 0.0251, + "step": 27440 + }, + { + "epoch": 0.10966949330296946, + "grad_norm": 8.934080123901367, + "learning_rate": 1.986199914503445e-07, + "loss": 0.0321, + "step": 27450 + }, + { + "epoch": 0.10970944575954614, + "grad_norm": 2.701096534729004, + "learning_rate": 1.9861895017731662e-07, + "loss": 0.0327, + "step": 27460 + }, + { + "epoch": 0.10974939821612281, + "grad_norm": 9.912062644958496, + "learning_rate": 1.986179085143268e-07, + "loss": 0.0266, + "step": 27470 + }, + { + "epoch": 0.10978935067269949, + "grad_norm": 4.662773132324219, + "learning_rate": 1.986168664613792e-07, + "loss": 0.0293, + "step": 27480 + }, + { + "epoch": 0.10982930312927616, + "grad_norm": 3.2220637798309326, + "learning_rate": 1.9861582401847787e-07, + "loss": 0.023, + "step": 27490 + }, + { + "epoch": 0.10986925558585284, + "grad_norm": 2.3605287075042725, + "learning_rate": 1.98614781185627e-07, + "loss": 0.0251, + "step": 27500 + }, + { + "epoch": 0.10990920804242951, + "grad_norm": 6.113361358642578, + "learning_rate": 1.9861373796283064e-07, + "loss": 0.0226, + "step": 27510 + }, + { + "epoch": 0.10994916049900619, + "grad_norm": 4.072295665740967, + "learning_rate": 1.98612694350093e-07, + "loss": 0.0312, + "step": 27520 + }, + { + "epoch": 0.10998911295558286, + "grad_norm": 3.2521626949310303, + "learning_rate": 1.9861165034741811e-07, + "loss": 0.0249, + "step": 27530 + }, + { + "epoch": 0.11002906541215952, + "grad_norm": 3.995570659637451, + "learning_rate": 1.986106059548102e-07, + "loss": 0.0257, + "step": 27540 + }, + { + "epoch": 0.1100690178687362, + "grad_norm": 3.094017505645752, + "learning_rate": 1.986095611722733e-07, + "loss": 0.0275, + "step": 27550 + }, + { + "epoch": 0.11010897032531287, + "grad_norm": 2.434588670730591, + "learning_rate": 1.9860851599981166e-07, + "loss": 0.028, + "step": 27560 + }, + { + "epoch": 0.11014892278188955, + "grad_norm": 3.1495866775512695, + "learning_rate": 1.986074704374293e-07, + "loss": 0.0248, + "step": 27570 + }, + { + "epoch": 0.11018887523846622, + "grad_norm": 2.621001958847046, + "learning_rate": 1.9860642448513042e-07, + "loss": 0.0247, + "step": 27580 + }, + { + "epoch": 0.1102288276950429, + "grad_norm": 4.253136157989502, + "learning_rate": 1.9860537814291912e-07, + "loss": 0.0288, + "step": 27590 + }, + { + "epoch": 0.11026878015161957, + "grad_norm": 2.745056390762329, + "learning_rate": 1.9860433141079955e-07, + "loss": 0.0276, + "step": 27600 + }, + { + "epoch": 0.11030873260819625, + "grad_norm": 3.846118211746216, + "learning_rate": 1.9860328428877587e-07, + "loss": 0.0281, + "step": 27610 + }, + { + "epoch": 0.11034868506477293, + "grad_norm": 8.806084632873535, + "learning_rate": 1.986022367768522e-07, + "loss": 0.0278, + "step": 27620 + }, + { + "epoch": 0.1103886375213496, + "grad_norm": 4.0377726554870605, + "learning_rate": 1.9860118887503267e-07, + "loss": 0.0232, + "step": 27630 + }, + { + "epoch": 0.11042858997792626, + "grad_norm": 2.070357084274292, + "learning_rate": 1.9860014058332144e-07, + "loss": 0.0309, + "step": 27640 + }, + { + "epoch": 0.11046854243450294, + "grad_norm": 3.4514083862304688, + "learning_rate": 1.985990919017227e-07, + "loss": 0.0215, + "step": 27650 + }, + { + "epoch": 0.11050849489107961, + "grad_norm": 2.032841682434082, + "learning_rate": 1.9859804283024048e-07, + "loss": 0.0234, + "step": 27660 + }, + { + "epoch": 0.11054844734765629, + "grad_norm": 3.2597413063049316, + "learning_rate": 1.9859699336887903e-07, + "loss": 0.0327, + "step": 27670 + }, + { + "epoch": 0.11058839980423296, + "grad_norm": 4.005598068237305, + "learning_rate": 1.9859594351764247e-07, + "loss": 0.028, + "step": 27680 + }, + { + "epoch": 0.11062835226080964, + "grad_norm": 2.8623526096343994, + "learning_rate": 1.9859489327653497e-07, + "loss": 0.0223, + "step": 27690 + }, + { + "epoch": 0.11066830471738631, + "grad_norm": 2.5685088634490967, + "learning_rate": 1.9859384264556062e-07, + "loss": 0.0258, + "step": 27700 + }, + { + "epoch": 0.11070825717396299, + "grad_norm": 4.847537040710449, + "learning_rate": 1.9859279162472364e-07, + "loss": 0.0224, + "step": 27710 + }, + { + "epoch": 0.11074820963053966, + "grad_norm": 4.729526996612549, + "learning_rate": 1.9859174021402817e-07, + "loss": 0.0318, + "step": 27720 + }, + { + "epoch": 0.11078816208711634, + "grad_norm": 2.3926472663879395, + "learning_rate": 1.9859068841347835e-07, + "loss": 0.0266, + "step": 27730 + }, + { + "epoch": 0.110828114543693, + "grad_norm": 2.5160317420959473, + "learning_rate": 1.9858963622307837e-07, + "loss": 0.0241, + "step": 27740 + }, + { + "epoch": 0.11086806700026967, + "grad_norm": 2.5173850059509277, + "learning_rate": 1.9858858364283236e-07, + "loss": 0.0271, + "step": 27750 + }, + { + "epoch": 0.11090801945684635, + "grad_norm": 3.7380614280700684, + "learning_rate": 1.9858753067274448e-07, + "loss": 0.0265, + "step": 27760 + }, + { + "epoch": 0.11094797191342302, + "grad_norm": 3.3096728324890137, + "learning_rate": 1.9858647731281891e-07, + "loss": 0.0245, + "step": 27770 + }, + { + "epoch": 0.1109879243699997, + "grad_norm": 6.698188304901123, + "learning_rate": 1.9858542356305983e-07, + "loss": 0.0293, + "step": 27780 + }, + { + "epoch": 0.11102787682657637, + "grad_norm": 4.212856292724609, + "learning_rate": 1.9858436942347136e-07, + "loss": 0.0267, + "step": 27790 + }, + { + "epoch": 0.11106782928315305, + "grad_norm": 2.6101365089416504, + "learning_rate": 1.985833148940577e-07, + "loss": 0.0261, + "step": 27800 + }, + { + "epoch": 0.11110778173972972, + "grad_norm": 2.825183153152466, + "learning_rate": 1.9858225997482302e-07, + "loss": 0.0275, + "step": 27810 + }, + { + "epoch": 0.1111477341963064, + "grad_norm": 4.510190010070801, + "learning_rate": 1.9858120466577152e-07, + "loss": 0.0295, + "step": 27820 + }, + { + "epoch": 0.11118768665288307, + "grad_norm": 8.794641494750977, + "learning_rate": 1.985801489669073e-07, + "loss": 0.0253, + "step": 27830 + }, + { + "epoch": 0.11122763910945974, + "grad_norm": 5.876779556274414, + "learning_rate": 1.985790928782346e-07, + "loss": 0.0229, + "step": 27840 + }, + { + "epoch": 0.11126759156603641, + "grad_norm": 4.5862717628479, + "learning_rate": 1.9857803639975756e-07, + "loss": 0.0272, + "step": 27850 + }, + { + "epoch": 0.11130754402261309, + "grad_norm": 2.7617881298065186, + "learning_rate": 1.9857697953148035e-07, + "loss": 0.0253, + "step": 27860 + }, + { + "epoch": 0.11134749647918976, + "grad_norm": 3.322721242904663, + "learning_rate": 1.985759222734072e-07, + "loss": 0.0295, + "step": 27870 + }, + { + "epoch": 0.11138744893576644, + "grad_norm": 3.608619451522827, + "learning_rate": 1.9857486462554225e-07, + "loss": 0.0287, + "step": 27880 + }, + { + "epoch": 0.11142740139234311, + "grad_norm": 4.012343406677246, + "learning_rate": 1.985738065878897e-07, + "loss": 0.0274, + "step": 27890 + }, + { + "epoch": 0.11146735384891979, + "grad_norm": 3.921919584274292, + "learning_rate": 1.9857274816045372e-07, + "loss": 0.0284, + "step": 27900 + }, + { + "epoch": 0.11150730630549646, + "grad_norm": 5.322065830230713, + "learning_rate": 1.985716893432385e-07, + "loss": 0.0259, + "step": 27910 + }, + { + "epoch": 0.11154725876207314, + "grad_norm": 2.992708683013916, + "learning_rate": 1.985706301362482e-07, + "loss": 0.0234, + "step": 27920 + }, + { + "epoch": 0.11158721121864981, + "grad_norm": 5.478329181671143, + "learning_rate": 1.9856957053948708e-07, + "loss": 0.0253, + "step": 27930 + }, + { + "epoch": 0.11162716367522649, + "grad_norm": 9.625199317932129, + "learning_rate": 1.9856851055295923e-07, + "loss": 0.0257, + "step": 27940 + }, + { + "epoch": 0.11166711613180315, + "grad_norm": 4.622597694396973, + "learning_rate": 1.9856745017666892e-07, + "loss": 0.027, + "step": 27950 + }, + { + "epoch": 0.11170706858837982, + "grad_norm": 8.71230697631836, + "learning_rate": 1.9856638941062033e-07, + "loss": 0.0262, + "step": 27960 + }, + { + "epoch": 0.1117470210449565, + "grad_norm": 2.498870372772217, + "learning_rate": 1.9856532825481764e-07, + "loss": 0.0267, + "step": 27970 + }, + { + "epoch": 0.11178697350153317, + "grad_norm": 3.5158679485321045, + "learning_rate": 1.9856426670926504e-07, + "loss": 0.0252, + "step": 27980 + }, + { + "epoch": 0.11182692595810985, + "grad_norm": 1.709236741065979, + "learning_rate": 1.9856320477396675e-07, + "loss": 0.0224, + "step": 27990 + }, + { + "epoch": 0.11186687841468652, + "grad_norm": 5.762019157409668, + "learning_rate": 1.9856214244892695e-07, + "loss": 0.0247, + "step": 28000 + }, + { + "epoch": 0.1119068308712632, + "grad_norm": 4.856345176696777, + "learning_rate": 1.9856107973414984e-07, + "loss": 0.028, + "step": 28010 + }, + { + "epoch": 0.11194678332783987, + "grad_norm": 9.02064323425293, + "learning_rate": 1.9856001662963964e-07, + "loss": 0.0231, + "step": 28020 + }, + { + "epoch": 0.11198673578441655, + "grad_norm": 7.13366174697876, + "learning_rate": 1.9855895313540053e-07, + "loss": 0.0291, + "step": 28030 + }, + { + "epoch": 0.11202668824099322, + "grad_norm": 5.453084945678711, + "learning_rate": 1.9855788925143675e-07, + "loss": 0.0224, + "step": 28040 + }, + { + "epoch": 0.11206664069756989, + "grad_norm": 3.2838401794433594, + "learning_rate": 1.9855682497775246e-07, + "loss": 0.0258, + "step": 28050 + }, + { + "epoch": 0.11210659315414656, + "grad_norm": 2.3332769870758057, + "learning_rate": 1.985557603143519e-07, + "loss": 0.0266, + "step": 28060 + }, + { + "epoch": 0.11214654561072324, + "grad_norm": 4.236571788787842, + "learning_rate": 1.985546952612393e-07, + "loss": 0.0244, + "step": 28070 + }, + { + "epoch": 0.11218649806729991, + "grad_norm": 1.795524001121521, + "learning_rate": 1.9855362981841882e-07, + "loss": 0.0282, + "step": 28080 + }, + { + "epoch": 0.11222645052387659, + "grad_norm": 3.975964307785034, + "learning_rate": 1.985525639858947e-07, + "loss": 0.0251, + "step": 28090 + }, + { + "epoch": 0.11226640298045326, + "grad_norm": 4.152592658996582, + "learning_rate": 1.9855149776367116e-07, + "loss": 0.0261, + "step": 28100 + }, + { + "epoch": 0.11230635543702994, + "grad_norm": 2.7634692192077637, + "learning_rate": 1.985504311517524e-07, + "loss": 0.0238, + "step": 28110 + }, + { + "epoch": 0.11234630789360661, + "grad_norm": 3.3691227436065674, + "learning_rate": 1.9854936415014266e-07, + "loss": 0.023, + "step": 28120 + }, + { + "epoch": 0.11238626035018329, + "grad_norm": 6.491100788116455, + "learning_rate": 1.9854829675884615e-07, + "loss": 0.0318, + "step": 28130 + }, + { + "epoch": 0.11242621280675996, + "grad_norm": 6.349086284637451, + "learning_rate": 1.9854722897786708e-07, + "loss": 0.0256, + "step": 28140 + }, + { + "epoch": 0.11246616526333662, + "grad_norm": 9.671690940856934, + "learning_rate": 1.9854616080720965e-07, + "loss": 0.0249, + "step": 28150 + }, + { + "epoch": 0.1125061177199133, + "grad_norm": 3.461264133453369, + "learning_rate": 1.9854509224687815e-07, + "loss": 0.0265, + "step": 28160 + }, + { + "epoch": 0.11254607017648997, + "grad_norm": 4.186182498931885, + "learning_rate": 1.9854402329687675e-07, + "loss": 0.0268, + "step": 28170 + }, + { + "epoch": 0.11258602263306665, + "grad_norm": 6.943350791931152, + "learning_rate": 1.9854295395720967e-07, + "loss": 0.0271, + "step": 28180 + }, + { + "epoch": 0.11262597508964332, + "grad_norm": 3.4615187644958496, + "learning_rate": 1.985418842278812e-07, + "loss": 0.0269, + "step": 28190 + }, + { + "epoch": 0.11266592754622, + "grad_norm": 4.472960948944092, + "learning_rate": 1.9854081410889553e-07, + "loss": 0.0274, + "step": 28200 + }, + { + "epoch": 0.11270588000279667, + "grad_norm": 2.7845804691314697, + "learning_rate": 1.9853974360025688e-07, + "loss": 0.0236, + "step": 28210 + }, + { + "epoch": 0.11274583245937335, + "grad_norm": 5.569211483001709, + "learning_rate": 1.985386727019695e-07, + "loss": 0.0288, + "step": 28220 + }, + { + "epoch": 0.11278578491595002, + "grad_norm": 4.732113361358643, + "learning_rate": 1.9853760141403763e-07, + "loss": 0.0261, + "step": 28230 + }, + { + "epoch": 0.1128257373725267, + "grad_norm": 4.710103988647461, + "learning_rate": 1.9853652973646549e-07, + "loss": 0.0268, + "step": 28240 + }, + { + "epoch": 0.11286568982910336, + "grad_norm": 1.5442793369293213, + "learning_rate": 1.9853545766925732e-07, + "loss": 0.0235, + "step": 28250 + }, + { + "epoch": 0.11290564228568004, + "grad_norm": 4.830338954925537, + "learning_rate": 1.9853438521241738e-07, + "loss": 0.023, + "step": 28260 + }, + { + "epoch": 0.11294559474225671, + "grad_norm": 3.512244462966919, + "learning_rate": 1.9853331236594988e-07, + "loss": 0.0255, + "step": 28270 + }, + { + "epoch": 0.11298554719883339, + "grad_norm": 5.735579490661621, + "learning_rate": 1.985322391298591e-07, + "loss": 0.027, + "step": 28280 + }, + { + "epoch": 0.11302549965541006, + "grad_norm": 3.6391937732696533, + "learning_rate": 1.9853116550414923e-07, + "loss": 0.0249, + "step": 28290 + }, + { + "epoch": 0.11306545211198674, + "grad_norm": 4.758711814880371, + "learning_rate": 1.9853009148882457e-07, + "loss": 0.028, + "step": 28300 + }, + { + "epoch": 0.11310540456856341, + "grad_norm": 3.858341693878174, + "learning_rate": 1.9852901708388935e-07, + "loss": 0.0271, + "step": 28310 + }, + { + "epoch": 0.11314535702514009, + "grad_norm": 7.1041579246521, + "learning_rate": 1.9852794228934782e-07, + "loss": 0.0262, + "step": 28320 + }, + { + "epoch": 0.11318530948171676, + "grad_norm": 4.020003318786621, + "learning_rate": 1.985268671052042e-07, + "loss": 0.0235, + "step": 28330 + }, + { + "epoch": 0.11322526193829344, + "grad_norm": 4.161738395690918, + "learning_rate": 1.9852579153146277e-07, + "loss": 0.0297, + "step": 28340 + }, + { + "epoch": 0.1132652143948701, + "grad_norm": 5.947208404541016, + "learning_rate": 1.9852471556812777e-07, + "loss": 0.0252, + "step": 28350 + }, + { + "epoch": 0.11330516685144677, + "grad_norm": 2.539433479309082, + "learning_rate": 1.9852363921520346e-07, + "loss": 0.0249, + "step": 28360 + }, + { + "epoch": 0.11334511930802345, + "grad_norm": 1.8439832925796509, + "learning_rate": 1.985225624726941e-07, + "loss": 0.0293, + "step": 28370 + }, + { + "epoch": 0.11338507176460012, + "grad_norm": 4.263535022735596, + "learning_rate": 1.9852148534060396e-07, + "loss": 0.0271, + "step": 28380 + }, + { + "epoch": 0.1134250242211768, + "grad_norm": 3.1554436683654785, + "learning_rate": 1.9852040781893726e-07, + "loss": 0.0273, + "step": 28390 + }, + { + "epoch": 0.11346497667775347, + "grad_norm": 3.2993171215057373, + "learning_rate": 1.9851932990769833e-07, + "loss": 0.0214, + "step": 28400 + }, + { + "epoch": 0.11350492913433015, + "grad_norm": 5.726454734802246, + "learning_rate": 1.9851825160689134e-07, + "loss": 0.0314, + "step": 28410 + }, + { + "epoch": 0.11354488159090682, + "grad_norm": 4.73268461227417, + "learning_rate": 1.985171729165206e-07, + "loss": 0.0258, + "step": 28420 + }, + { + "epoch": 0.1135848340474835, + "grad_norm": 3.70870041847229, + "learning_rate": 1.985160938365904e-07, + "loss": 0.0281, + "step": 28430 + }, + { + "epoch": 0.11362478650406017, + "grad_norm": 3.477391242980957, + "learning_rate": 1.98515014367105e-07, + "loss": 0.0251, + "step": 28440 + }, + { + "epoch": 0.11366473896063684, + "grad_norm": 9.323370933532715, + "learning_rate": 1.9851393450806862e-07, + "loss": 0.0288, + "step": 28450 + }, + { + "epoch": 0.11370469141721351, + "grad_norm": 3.0761911869049072, + "learning_rate": 1.9851285425948558e-07, + "loss": 0.0243, + "step": 28460 + }, + { + "epoch": 0.11374464387379019, + "grad_norm": 5.502758026123047, + "learning_rate": 1.9851177362136012e-07, + "loss": 0.0258, + "step": 28470 + }, + { + "epoch": 0.11378459633036686, + "grad_norm": 2.9924862384796143, + "learning_rate": 1.9851069259369655e-07, + "loss": 0.0261, + "step": 28480 + }, + { + "epoch": 0.11382454878694354, + "grad_norm": 3.018148422241211, + "learning_rate": 1.985096111764991e-07, + "loss": 0.0298, + "step": 28490 + }, + { + "epoch": 0.11386450124352021, + "grad_norm": 2.6798858642578125, + "learning_rate": 1.9850852936977208e-07, + "loss": 0.0222, + "step": 28500 + }, + { + "epoch": 0.11390445370009689, + "grad_norm": 3.2327611446380615, + "learning_rate": 1.9850744717351975e-07, + "loss": 0.0233, + "step": 28510 + }, + { + "epoch": 0.11394440615667356, + "grad_norm": 4.902924537658691, + "learning_rate": 1.985063645877464e-07, + "loss": 0.0297, + "step": 28520 + }, + { + "epoch": 0.11398435861325024, + "grad_norm": 4.420677661895752, + "learning_rate": 1.985052816124563e-07, + "loss": 0.0248, + "step": 28530 + }, + { + "epoch": 0.11402431106982691, + "grad_norm": 4.353881359100342, + "learning_rate": 1.9850419824765374e-07, + "loss": 0.0231, + "step": 28540 + }, + { + "epoch": 0.11406426352640359, + "grad_norm": 3.959216833114624, + "learning_rate": 1.9850311449334302e-07, + "loss": 0.0275, + "step": 28550 + }, + { + "epoch": 0.11410421598298025, + "grad_norm": 6.036563873291016, + "learning_rate": 1.9850203034952837e-07, + "loss": 0.0295, + "step": 28560 + }, + { + "epoch": 0.11414416843955692, + "grad_norm": 2.664726734161377, + "learning_rate": 1.9850094581621413e-07, + "loss": 0.0273, + "step": 28570 + }, + { + "epoch": 0.1141841208961336, + "grad_norm": 6.436692714691162, + "learning_rate": 1.984998608934046e-07, + "loss": 0.0267, + "step": 28580 + }, + { + "epoch": 0.11422407335271027, + "grad_norm": 6.888981819152832, + "learning_rate": 1.9849877558110403e-07, + "loss": 0.0222, + "step": 28590 + }, + { + "epoch": 0.11426402580928695, + "grad_norm": 4.400328159332275, + "learning_rate": 1.984976898793167e-07, + "loss": 0.0255, + "step": 28600 + }, + { + "epoch": 0.11430397826586362, + "grad_norm": 3.6007730960845947, + "learning_rate": 1.9849660378804695e-07, + "loss": 0.0213, + "step": 28610 + }, + { + "epoch": 0.1143439307224403, + "grad_norm": 3.6643073558807373, + "learning_rate": 1.98495517307299e-07, + "loss": 0.0343, + "step": 28620 + }, + { + "epoch": 0.11438388317901697, + "grad_norm": 4.034998416900635, + "learning_rate": 1.9849443043707726e-07, + "loss": 0.0256, + "step": 28630 + }, + { + "epoch": 0.11442383563559365, + "grad_norm": 4.398351669311523, + "learning_rate": 1.9849334317738596e-07, + "loss": 0.0274, + "step": 28640 + }, + { + "epoch": 0.11446378809217032, + "grad_norm": 3.028280019760132, + "learning_rate": 1.9849225552822936e-07, + "loss": 0.0264, + "step": 28650 + }, + { + "epoch": 0.11450374054874699, + "grad_norm": 4.883791446685791, + "learning_rate": 1.9849116748961183e-07, + "loss": 0.0269, + "step": 28660 + }, + { + "epoch": 0.11454369300532366, + "grad_norm": 5.687098979949951, + "learning_rate": 1.9849007906153768e-07, + "loss": 0.0246, + "step": 28670 + }, + { + "epoch": 0.11458364546190034, + "grad_norm": 3.6709048748016357, + "learning_rate": 1.9848899024401115e-07, + "loss": 0.0262, + "step": 28680 + }, + { + "epoch": 0.11462359791847701, + "grad_norm": 10.058586120605469, + "learning_rate": 1.9848790103703658e-07, + "loss": 0.0241, + "step": 28690 + }, + { + "epoch": 0.11466355037505369, + "grad_norm": 14.911577224731445, + "learning_rate": 1.9848681144061828e-07, + "loss": 0.0298, + "step": 28700 + }, + { + "epoch": 0.11470350283163036, + "grad_norm": 3.612455368041992, + "learning_rate": 1.9848572145476057e-07, + "loss": 0.0286, + "step": 28710 + }, + { + "epoch": 0.11474345528820704, + "grad_norm": 2.8505630493164062, + "learning_rate": 1.984846310794677e-07, + "loss": 0.0272, + "step": 28720 + }, + { + "epoch": 0.11478340774478371, + "grad_norm": 3.4876155853271484, + "learning_rate": 1.9848354031474407e-07, + "loss": 0.0254, + "step": 28730 + }, + { + "epoch": 0.11482336020136039, + "grad_norm": 2.5420165061950684, + "learning_rate": 1.984824491605939e-07, + "loss": 0.0241, + "step": 28740 + }, + { + "epoch": 0.11486331265793706, + "grad_norm": 3.9202258586883545, + "learning_rate": 1.9848135761702164e-07, + "loss": 0.0249, + "step": 28750 + }, + { + "epoch": 0.11490326511451372, + "grad_norm": 3.3724162578582764, + "learning_rate": 1.9848026568403143e-07, + "loss": 0.0287, + "step": 28760 + }, + { + "epoch": 0.1149432175710904, + "grad_norm": 2.533864736557007, + "learning_rate": 1.984791733616277e-07, + "loss": 0.0268, + "step": 28770 + }, + { + "epoch": 0.11498317002766707, + "grad_norm": 4.3451313972473145, + "learning_rate": 1.9847808064981476e-07, + "loss": 0.0286, + "step": 28780 + }, + { + "epoch": 0.11502312248424375, + "grad_norm": 10.997457504272461, + "learning_rate": 1.9847698754859692e-07, + "loss": 0.0314, + "step": 28790 + }, + { + "epoch": 0.11506307494082042, + "grad_norm": 12.456637382507324, + "learning_rate": 1.9847589405797847e-07, + "loss": 0.0281, + "step": 28800 + }, + { + "epoch": 0.1151030273973971, + "grad_norm": 4.258942604064941, + "learning_rate": 1.984748001779638e-07, + "loss": 0.0261, + "step": 28810 + }, + { + "epoch": 0.11514297985397377, + "grad_norm": 2.0428977012634277, + "learning_rate": 1.9847370590855718e-07, + "loss": 0.0263, + "step": 28820 + }, + { + "epoch": 0.11518293231055045, + "grad_norm": 3.13594651222229, + "learning_rate": 1.9847261124976296e-07, + "loss": 0.0256, + "step": 28830 + }, + { + "epoch": 0.11522288476712712, + "grad_norm": 6.373015880584717, + "learning_rate": 1.9847151620158545e-07, + "loss": 0.0332, + "step": 28840 + }, + { + "epoch": 0.1152628372237038, + "grad_norm": 4.950849533081055, + "learning_rate": 1.98470420764029e-07, + "loss": 0.0296, + "step": 28850 + }, + { + "epoch": 0.11530278968028046, + "grad_norm": 3.5521183013916016, + "learning_rate": 1.9846932493709795e-07, + "loss": 0.0228, + "step": 28860 + }, + { + "epoch": 0.11534274213685713, + "grad_norm": 4.678577423095703, + "learning_rate": 1.9846822872079662e-07, + "loss": 0.0265, + "step": 28870 + }, + { + "epoch": 0.11538269459343381, + "grad_norm": 2.2316229343414307, + "learning_rate": 1.984671321151293e-07, + "loss": 0.0257, + "step": 28880 + }, + { + "epoch": 0.11542264705001049, + "grad_norm": 6.931086540222168, + "learning_rate": 1.9846603512010042e-07, + "loss": 0.0263, + "step": 28890 + }, + { + "epoch": 0.11546259950658716, + "grad_norm": 2.3863704204559326, + "learning_rate": 1.9846493773571424e-07, + "loss": 0.0269, + "step": 28900 + }, + { + "epoch": 0.11550255196316384, + "grad_norm": 3.204249382019043, + "learning_rate": 1.9846383996197515e-07, + "loss": 0.0302, + "step": 28910 + }, + { + "epoch": 0.11554250441974051, + "grad_norm": 9.140707969665527, + "learning_rate": 1.9846274179888747e-07, + "loss": 0.0217, + "step": 28920 + }, + { + "epoch": 0.11558245687631719, + "grad_norm": 4.503727436065674, + "learning_rate": 1.984616432464555e-07, + "loss": 0.0267, + "step": 28930 + }, + { + "epoch": 0.11562240933289386, + "grad_norm": 3.628911256790161, + "learning_rate": 1.9846054430468366e-07, + "loss": 0.0216, + "step": 28940 + }, + { + "epoch": 0.11566236178947054, + "grad_norm": 4.141042232513428, + "learning_rate": 1.9845944497357624e-07, + "loss": 0.0265, + "step": 28950 + }, + { + "epoch": 0.1157023142460472, + "grad_norm": 7.47949743270874, + "learning_rate": 1.984583452531376e-07, + "loss": 0.0296, + "step": 28960 + }, + { + "epoch": 0.11574226670262387, + "grad_norm": 4.589731693267822, + "learning_rate": 1.9845724514337214e-07, + "loss": 0.0285, + "step": 28970 + }, + { + "epoch": 0.11578221915920055, + "grad_norm": 2.6894521713256836, + "learning_rate": 1.9845614464428413e-07, + "loss": 0.0228, + "step": 28980 + }, + { + "epoch": 0.11582217161577722, + "grad_norm": 6.518776893615723, + "learning_rate": 1.9845504375587797e-07, + "loss": 0.0252, + "step": 28990 + }, + { + "epoch": 0.1158621240723539, + "grad_norm": 5.164569854736328, + "learning_rate": 1.98453942478158e-07, + "loss": 0.0244, + "step": 29000 + }, + { + "epoch": 0.11590207652893057, + "grad_norm": 2.6148841381073, + "learning_rate": 1.9845284081112853e-07, + "loss": 0.0246, + "step": 29010 + }, + { + "epoch": 0.11594202898550725, + "grad_norm": 3.5515213012695312, + "learning_rate": 1.98451738754794e-07, + "loss": 0.025, + "step": 29020 + }, + { + "epoch": 0.11598198144208392, + "grad_norm": 3.390441656112671, + "learning_rate": 1.9845063630915875e-07, + "loss": 0.0234, + "step": 29030 + }, + { + "epoch": 0.1160219338986606, + "grad_norm": 3.039564847946167, + "learning_rate": 1.9844953347422707e-07, + "loss": 0.0214, + "step": 29040 + }, + { + "epoch": 0.11606188635523727, + "grad_norm": 3.7877979278564453, + "learning_rate": 1.984484302500034e-07, + "loss": 0.0263, + "step": 29050 + }, + { + "epoch": 0.11610183881181395, + "grad_norm": 2.3479502201080322, + "learning_rate": 1.9844732663649205e-07, + "loss": 0.024, + "step": 29060 + }, + { + "epoch": 0.11614179126839061, + "grad_norm": 3.8294990062713623, + "learning_rate": 1.9844622263369744e-07, + "loss": 0.0273, + "step": 29070 + }, + { + "epoch": 0.11618174372496728, + "grad_norm": 13.254944801330566, + "learning_rate": 1.9844511824162388e-07, + "loss": 0.0292, + "step": 29080 + }, + { + "epoch": 0.11622169618154396, + "grad_norm": 2.7625725269317627, + "learning_rate": 1.9844401346027574e-07, + "loss": 0.0259, + "step": 29090 + }, + { + "epoch": 0.11626164863812063, + "grad_norm": 7.3216166496276855, + "learning_rate": 1.9844290828965743e-07, + "loss": 0.0212, + "step": 29100 + }, + { + "epoch": 0.11630160109469731, + "grad_norm": 3.0759851932525635, + "learning_rate": 1.984418027297733e-07, + "loss": 0.0261, + "step": 29110 + }, + { + "epoch": 0.11634155355127399, + "grad_norm": 5.810417652130127, + "learning_rate": 1.9844069678062771e-07, + "loss": 0.0249, + "step": 29120 + }, + { + "epoch": 0.11638150600785066, + "grad_norm": 6.338130474090576, + "learning_rate": 1.9843959044222504e-07, + "loss": 0.023, + "step": 29130 + }, + { + "epoch": 0.11642145846442734, + "grad_norm": 10.331703186035156, + "learning_rate": 1.9843848371456967e-07, + "loss": 0.0304, + "step": 29140 + }, + { + "epoch": 0.11646141092100401, + "grad_norm": 4.121038436889648, + "learning_rate": 1.9843737659766595e-07, + "loss": 0.0254, + "step": 29150 + }, + { + "epoch": 0.11650136337758069, + "grad_norm": 2.579838514328003, + "learning_rate": 1.9843626909151832e-07, + "loss": 0.0291, + "step": 29160 + }, + { + "epoch": 0.11654131583415735, + "grad_norm": 2.666472911834717, + "learning_rate": 1.9843516119613105e-07, + "loss": 0.0248, + "step": 29170 + }, + { + "epoch": 0.11658126829073402, + "grad_norm": 5.351405143737793, + "learning_rate": 1.9843405291150866e-07, + "loss": 0.0255, + "step": 29180 + }, + { + "epoch": 0.1166212207473107, + "grad_norm": 4.340632915496826, + "learning_rate": 1.9843294423765544e-07, + "loss": 0.0283, + "step": 29190 + }, + { + "epoch": 0.11666117320388737, + "grad_norm": 1.4842169284820557, + "learning_rate": 1.984318351745758e-07, + "loss": 0.0209, + "step": 29200 + }, + { + "epoch": 0.11670112566046405, + "grad_norm": 4.522999286651611, + "learning_rate": 1.984307257222741e-07, + "loss": 0.0237, + "step": 29210 + }, + { + "epoch": 0.11674107811704072, + "grad_norm": 2.273221731185913, + "learning_rate": 1.9842961588075478e-07, + "loss": 0.0209, + "step": 29220 + }, + { + "epoch": 0.1167810305736174, + "grad_norm": 6.054821014404297, + "learning_rate": 1.9842850565002218e-07, + "loss": 0.0248, + "step": 29230 + }, + { + "epoch": 0.11682098303019407, + "grad_norm": 5.578814506530762, + "learning_rate": 1.9842739503008072e-07, + "loss": 0.0255, + "step": 29240 + }, + { + "epoch": 0.11686093548677075, + "grad_norm": 8.437750816345215, + "learning_rate": 1.9842628402093474e-07, + "loss": 0.026, + "step": 29250 + }, + { + "epoch": 0.11690088794334742, + "grad_norm": 2.1578292846679688, + "learning_rate": 1.9842517262258874e-07, + "loss": 0.0265, + "step": 29260 + }, + { + "epoch": 0.11694084039992408, + "grad_norm": 2.900973081588745, + "learning_rate": 1.9842406083504702e-07, + "loss": 0.0247, + "step": 29270 + }, + { + "epoch": 0.11698079285650076, + "grad_norm": 5.370423793792725, + "learning_rate": 1.9842294865831398e-07, + "loss": 0.0274, + "step": 29280 + }, + { + "epoch": 0.11702074531307743, + "grad_norm": 6.227208614349365, + "learning_rate": 1.9842183609239405e-07, + "loss": 0.0319, + "step": 29290 + }, + { + "epoch": 0.11706069776965411, + "grad_norm": 4.687236785888672, + "learning_rate": 1.9842072313729163e-07, + "loss": 0.0308, + "step": 29300 + }, + { + "epoch": 0.11710065022623078, + "grad_norm": 3.902515172958374, + "learning_rate": 1.984196097930111e-07, + "loss": 0.03, + "step": 29310 + }, + { + "epoch": 0.11714060268280746, + "grad_norm": 3.9451022148132324, + "learning_rate": 1.9841849605955687e-07, + "loss": 0.031, + "step": 29320 + }, + { + "epoch": 0.11718055513938413, + "grad_norm": 3.7660131454467773, + "learning_rate": 1.9841738193693337e-07, + "loss": 0.0263, + "step": 29330 + }, + { + "epoch": 0.11722050759596081, + "grad_norm": 3.9686479568481445, + "learning_rate": 1.9841626742514496e-07, + "loss": 0.022, + "step": 29340 + }, + { + "epoch": 0.11726046005253749, + "grad_norm": 2.1783416271209717, + "learning_rate": 1.984151525241961e-07, + "loss": 0.026, + "step": 29350 + }, + { + "epoch": 0.11730041250911416, + "grad_norm": 2.9527041912078857, + "learning_rate": 1.9841403723409113e-07, + "loss": 0.029, + "step": 29360 + }, + { + "epoch": 0.11734036496569082, + "grad_norm": 4.187051773071289, + "learning_rate": 1.9841292155483455e-07, + "loss": 0.0265, + "step": 29370 + }, + { + "epoch": 0.1173803174222675, + "grad_norm": 2.975456714630127, + "learning_rate": 1.984118054864307e-07, + "loss": 0.0211, + "step": 29380 + }, + { + "epoch": 0.11742026987884417, + "grad_norm": 5.15948486328125, + "learning_rate": 1.9841068902888398e-07, + "loss": 0.032, + "step": 29390 + }, + { + "epoch": 0.11746022233542085, + "grad_norm": 3.66445255279541, + "learning_rate": 1.9840957218219888e-07, + "loss": 0.0299, + "step": 29400 + }, + { + "epoch": 0.11750017479199752, + "grad_norm": 3.1095995903015137, + "learning_rate": 1.9840845494637977e-07, + "loss": 0.0231, + "step": 29410 + }, + { + "epoch": 0.1175401272485742, + "grad_norm": 6.040139198303223, + "learning_rate": 1.9840733732143105e-07, + "loss": 0.0282, + "step": 29420 + }, + { + "epoch": 0.11758007970515087, + "grad_norm": 5.313448905944824, + "learning_rate": 1.9840621930735716e-07, + "loss": 0.0312, + "step": 29430 + }, + { + "epoch": 0.11762003216172755, + "grad_norm": 7.053373336791992, + "learning_rate": 1.9840510090416257e-07, + "loss": 0.0274, + "step": 29440 + }, + { + "epoch": 0.11765998461830422, + "grad_norm": 4.04543399810791, + "learning_rate": 1.984039821118516e-07, + "loss": 0.0283, + "step": 29450 + }, + { + "epoch": 0.1176999370748809, + "grad_norm": 3.7842087745666504, + "learning_rate": 1.9840286293042875e-07, + "loss": 0.0253, + "step": 29460 + }, + { + "epoch": 0.11773988953145756, + "grad_norm": 4.744464874267578, + "learning_rate": 1.9840174335989842e-07, + "loss": 0.0289, + "step": 29470 + }, + { + "epoch": 0.11777984198803423, + "grad_norm": 7.5194854736328125, + "learning_rate": 1.9840062340026504e-07, + "loss": 0.027, + "step": 29480 + }, + { + "epoch": 0.11781979444461091, + "grad_norm": 5.836795330047607, + "learning_rate": 1.9839950305153306e-07, + "loss": 0.0244, + "step": 29490 + }, + { + "epoch": 0.11785974690118758, + "grad_norm": 1.3010187149047852, + "learning_rate": 1.983983823137069e-07, + "loss": 0.0229, + "step": 29500 + }, + { + "epoch": 0.11789969935776426, + "grad_norm": 7.305105209350586, + "learning_rate": 1.9839726118679095e-07, + "loss": 0.0211, + "step": 29510 + }, + { + "epoch": 0.11793965181434093, + "grad_norm": 3.9768545627593994, + "learning_rate": 1.9839613967078966e-07, + "loss": 0.0361, + "step": 29520 + }, + { + "epoch": 0.11797960427091761, + "grad_norm": 5.364870071411133, + "learning_rate": 1.9839501776570752e-07, + "loss": 0.0229, + "step": 29530 + }, + { + "epoch": 0.11801955672749428, + "grad_norm": 5.052065849304199, + "learning_rate": 1.9839389547154892e-07, + "loss": 0.0253, + "step": 29540 + }, + { + "epoch": 0.11805950918407096, + "grad_norm": 3.5699260234832764, + "learning_rate": 1.983927727883183e-07, + "loss": 0.0192, + "step": 29550 + }, + { + "epoch": 0.11809946164064763, + "grad_norm": 10.263736724853516, + "learning_rate": 1.983916497160201e-07, + "loss": 0.0291, + "step": 29560 + }, + { + "epoch": 0.1181394140972243, + "grad_norm": 3.354701280593872, + "learning_rate": 1.9839052625465876e-07, + "loss": 0.0277, + "step": 29570 + }, + { + "epoch": 0.11817936655380097, + "grad_norm": 5.560471057891846, + "learning_rate": 1.9838940240423874e-07, + "loss": 0.0301, + "step": 29580 + }, + { + "epoch": 0.11821931901037765, + "grad_norm": 8.503247261047363, + "learning_rate": 1.9838827816476447e-07, + "loss": 0.0247, + "step": 29590 + }, + { + "epoch": 0.11825927146695432, + "grad_norm": 9.720861434936523, + "learning_rate": 1.9838715353624037e-07, + "loss": 0.0297, + "step": 29600 + }, + { + "epoch": 0.118299223923531, + "grad_norm": 2.195321798324585, + "learning_rate": 1.9838602851867095e-07, + "loss": 0.0291, + "step": 29610 + }, + { + "epoch": 0.11833917638010767, + "grad_norm": 4.449635028839111, + "learning_rate": 1.983849031120606e-07, + "loss": 0.0227, + "step": 29620 + }, + { + "epoch": 0.11837912883668435, + "grad_norm": 1.977815866470337, + "learning_rate": 1.983837773164138e-07, + "loss": 0.0231, + "step": 29630 + }, + { + "epoch": 0.11841908129326102, + "grad_norm": 4.661524295806885, + "learning_rate": 1.9838265113173497e-07, + "loss": 0.0292, + "step": 29640 + }, + { + "epoch": 0.1184590337498377, + "grad_norm": 5.200439929962158, + "learning_rate": 1.9838152455802864e-07, + "loss": 0.0287, + "step": 29650 + }, + { + "epoch": 0.11849898620641437, + "grad_norm": 3.129641532897949, + "learning_rate": 1.9838039759529917e-07, + "loss": 0.0261, + "step": 29660 + }, + { + "epoch": 0.11853893866299105, + "grad_norm": 6.4201812744140625, + "learning_rate": 1.9837927024355107e-07, + "loss": 0.0298, + "step": 29670 + }, + { + "epoch": 0.11857889111956771, + "grad_norm": 5.668672561645508, + "learning_rate": 1.983781425027888e-07, + "loss": 0.0264, + "step": 29680 + }, + { + "epoch": 0.11861884357614438, + "grad_norm": 4.43161153793335, + "learning_rate": 1.9837701437301677e-07, + "loss": 0.0227, + "step": 29690 + }, + { + "epoch": 0.11865879603272106, + "grad_norm": 5.622380256652832, + "learning_rate": 1.983758858542395e-07, + "loss": 0.03, + "step": 29700 + }, + { + "epoch": 0.11869874848929773, + "grad_norm": 4.566784858703613, + "learning_rate": 1.9837475694646142e-07, + "loss": 0.0261, + "step": 29710 + }, + { + "epoch": 0.11873870094587441, + "grad_norm": 6.067584037780762, + "learning_rate": 1.9837362764968703e-07, + "loss": 0.022, + "step": 29720 + }, + { + "epoch": 0.11877865340245108, + "grad_norm": 3.8608264923095703, + "learning_rate": 1.9837249796392072e-07, + "loss": 0.0257, + "step": 29730 + }, + { + "epoch": 0.11881860585902776, + "grad_norm": 1.9141911268234253, + "learning_rate": 1.9837136788916702e-07, + "loss": 0.0216, + "step": 29740 + }, + { + "epoch": 0.11885855831560443, + "grad_norm": 4.177089691162109, + "learning_rate": 1.983702374254304e-07, + "loss": 0.0252, + "step": 29750 + }, + { + "epoch": 0.11889851077218111, + "grad_norm": 3.8935649394989014, + "learning_rate": 1.983691065727153e-07, + "loss": 0.0253, + "step": 29760 + }, + { + "epoch": 0.11893846322875778, + "grad_norm": 11.706738471984863, + "learning_rate": 1.983679753310262e-07, + "loss": 0.0248, + "step": 29770 + }, + { + "epoch": 0.11897841568533445, + "grad_norm": 4.718491077423096, + "learning_rate": 1.983668437003676e-07, + "loss": 0.029, + "step": 29780 + }, + { + "epoch": 0.11901836814191112, + "grad_norm": 5.1308135986328125, + "learning_rate": 1.9836571168074397e-07, + "loss": 0.0231, + "step": 29790 + }, + { + "epoch": 0.1190583205984878, + "grad_norm": 6.478251934051514, + "learning_rate": 1.9836457927215973e-07, + "loss": 0.0264, + "step": 29800 + }, + { + "epoch": 0.11909827305506447, + "grad_norm": 1.8301464319229126, + "learning_rate": 1.983634464746194e-07, + "loss": 0.0295, + "step": 29810 + }, + { + "epoch": 0.11913822551164115, + "grad_norm": 2.0773518085479736, + "learning_rate": 1.9836231328812748e-07, + "loss": 0.026, + "step": 29820 + }, + { + "epoch": 0.11917817796821782, + "grad_norm": 4.918954849243164, + "learning_rate": 1.9836117971268839e-07, + "loss": 0.0267, + "step": 29830 + }, + { + "epoch": 0.1192181304247945, + "grad_norm": 4.69482421875, + "learning_rate": 1.983600457483067e-07, + "loss": 0.0283, + "step": 29840 + }, + { + "epoch": 0.11925808288137117, + "grad_norm": 12.317242622375488, + "learning_rate": 1.983589113949868e-07, + "loss": 0.0254, + "step": 29850 + }, + { + "epoch": 0.11929803533794785, + "grad_norm": 3.3825316429138184, + "learning_rate": 1.9835777665273324e-07, + "loss": 0.0252, + "step": 29860 + }, + { + "epoch": 0.11933798779452452, + "grad_norm": 4.592342376708984, + "learning_rate": 1.9835664152155048e-07, + "loss": 0.0225, + "step": 29870 + }, + { + "epoch": 0.11937794025110118, + "grad_norm": 6.05885124206543, + "learning_rate": 1.9835550600144301e-07, + "loss": 0.0286, + "step": 29880 + }, + { + "epoch": 0.11941789270767786, + "grad_norm": 2.6635313034057617, + "learning_rate": 1.9835437009241534e-07, + "loss": 0.026, + "step": 29890 + }, + { + "epoch": 0.11945784516425453, + "grad_norm": 4.3915605545043945, + "learning_rate": 1.9835323379447196e-07, + "loss": 0.0258, + "step": 29900 + }, + { + "epoch": 0.11949779762083121, + "grad_norm": 4.191644191741943, + "learning_rate": 1.9835209710761732e-07, + "loss": 0.0237, + "step": 29910 + }, + { + "epoch": 0.11953775007740788, + "grad_norm": 5.089380741119385, + "learning_rate": 1.9835096003185594e-07, + "loss": 0.0246, + "step": 29920 + }, + { + "epoch": 0.11957770253398456, + "grad_norm": 2.2266225814819336, + "learning_rate": 1.9834982256719234e-07, + "loss": 0.0229, + "step": 29930 + }, + { + "epoch": 0.11961765499056123, + "grad_norm": 2.622830390930176, + "learning_rate": 1.98348684713631e-07, + "loss": 0.027, + "step": 29940 + }, + { + "epoch": 0.11965760744713791, + "grad_norm": 13.037530899047852, + "learning_rate": 1.983475464711764e-07, + "loss": 0.0267, + "step": 29950 + }, + { + "epoch": 0.11969755990371458, + "grad_norm": 1.679103970527649, + "learning_rate": 1.9834640783983305e-07, + "loss": 0.0248, + "step": 29960 + }, + { + "epoch": 0.11973751236029126, + "grad_norm": 5.625346660614014, + "learning_rate": 1.983452688196055e-07, + "loss": 0.027, + "step": 29970 + }, + { + "epoch": 0.11977746481686792, + "grad_norm": 4.563669681549072, + "learning_rate": 1.983441294104982e-07, + "loss": 0.0234, + "step": 29980 + }, + { + "epoch": 0.1198174172734446, + "grad_norm": 10.536704063415527, + "learning_rate": 1.9834298961251565e-07, + "loss": 0.0275, + "step": 29990 + }, + { + "epoch": 0.11985736973002127, + "grad_norm": 4.86958122253418, + "learning_rate": 1.9834184942566238e-07, + "loss": 0.0251, + "step": 30000 + }, + { + "epoch": 0.11989732218659795, + "grad_norm": 5.883798122406006, + "learning_rate": 1.9834070884994292e-07, + "loss": 0.0233, + "step": 30010 + }, + { + "epoch": 0.11993727464317462, + "grad_norm": 2.801987886428833, + "learning_rate": 1.9833956788536175e-07, + "loss": 0.0238, + "step": 30020 + }, + { + "epoch": 0.1199772270997513, + "grad_norm": 3.9979984760284424, + "learning_rate": 1.983384265319234e-07, + "loss": 0.0238, + "step": 30030 + }, + { + "epoch": 0.12001717955632797, + "grad_norm": 5.793106555938721, + "learning_rate": 1.9833728478963234e-07, + "loss": 0.0246, + "step": 30040 + }, + { + "epoch": 0.12005713201290465, + "grad_norm": 2.279597043991089, + "learning_rate": 1.9833614265849315e-07, + "loss": 0.0221, + "step": 30050 + }, + { + "epoch": 0.12009708446948132, + "grad_norm": 3.2133278846740723, + "learning_rate": 1.9833500013851028e-07, + "loss": 0.0272, + "step": 30060 + }, + { + "epoch": 0.120137036926058, + "grad_norm": 3.613976240158081, + "learning_rate": 1.983338572296883e-07, + "loss": 0.0239, + "step": 30070 + }, + { + "epoch": 0.12017698938263466, + "grad_norm": 5.073135852813721, + "learning_rate": 1.983327139320317e-07, + "loss": 0.0284, + "step": 30080 + }, + { + "epoch": 0.12021694183921133, + "grad_norm": 3.5572147369384766, + "learning_rate": 1.9833157024554504e-07, + "loss": 0.0274, + "step": 30090 + }, + { + "epoch": 0.12025689429578801, + "grad_norm": 9.406094551086426, + "learning_rate": 1.9833042617023278e-07, + "loss": 0.0272, + "step": 30100 + }, + { + "epoch": 0.12029684675236468, + "grad_norm": 3.4140231609344482, + "learning_rate": 1.9832928170609947e-07, + "loss": 0.024, + "step": 30110 + }, + { + "epoch": 0.12033679920894136, + "grad_norm": 4.575106620788574, + "learning_rate": 1.9832813685314965e-07, + "loss": 0.0233, + "step": 30120 + }, + { + "epoch": 0.12037675166551803, + "grad_norm": 12.881264686584473, + "learning_rate": 1.9832699161138785e-07, + "loss": 0.0253, + "step": 30130 + }, + { + "epoch": 0.12041670412209471, + "grad_norm": 2.000500440597534, + "learning_rate": 1.983258459808186e-07, + "loss": 0.0261, + "step": 30140 + }, + { + "epoch": 0.12045665657867138, + "grad_norm": 3.9185376167297363, + "learning_rate": 1.983246999614464e-07, + "loss": 0.0275, + "step": 30150 + }, + { + "epoch": 0.12049660903524806, + "grad_norm": 2.9582836627960205, + "learning_rate": 1.9832355355327577e-07, + "loss": 0.0274, + "step": 30160 + }, + { + "epoch": 0.12053656149182473, + "grad_norm": 2.863779067993164, + "learning_rate": 1.9832240675631128e-07, + "loss": 0.0239, + "step": 30170 + }, + { + "epoch": 0.12057651394840141, + "grad_norm": 1.625516414642334, + "learning_rate": 1.983212595705575e-07, + "loss": 0.0283, + "step": 30180 + }, + { + "epoch": 0.12061646640497807, + "grad_norm": 2.3883864879608154, + "learning_rate": 1.983201119960189e-07, + "loss": 0.0294, + "step": 30190 + }, + { + "epoch": 0.12065641886155475, + "grad_norm": 3.602478265762329, + "learning_rate": 1.9831896403270004e-07, + "loss": 0.0272, + "step": 30200 + }, + { + "epoch": 0.12069637131813142, + "grad_norm": 3.681063652038574, + "learning_rate": 1.9831781568060543e-07, + "loss": 0.0272, + "step": 30210 + }, + { + "epoch": 0.1207363237747081, + "grad_norm": 3.5512044429779053, + "learning_rate": 1.983166669397397e-07, + "loss": 0.0251, + "step": 30220 + }, + { + "epoch": 0.12077627623128477, + "grad_norm": 3.2305922508239746, + "learning_rate": 1.983155178101073e-07, + "loss": 0.0253, + "step": 30230 + }, + { + "epoch": 0.12081622868786145, + "grad_norm": 2.4012691974639893, + "learning_rate": 1.9831436829171282e-07, + "loss": 0.0256, + "step": 30240 + }, + { + "epoch": 0.12085618114443812, + "grad_norm": 3.0920965671539307, + "learning_rate": 1.983132183845608e-07, + "loss": 0.0239, + "step": 30250 + }, + { + "epoch": 0.1208961336010148, + "grad_norm": 4.0588603019714355, + "learning_rate": 1.9831206808865575e-07, + "loss": 0.0276, + "step": 30260 + }, + { + "epoch": 0.12093608605759147, + "grad_norm": 7.11159610748291, + "learning_rate": 1.9831091740400226e-07, + "loss": 0.025, + "step": 30270 + }, + { + "epoch": 0.12097603851416815, + "grad_norm": 4.0428314208984375, + "learning_rate": 1.9830976633060485e-07, + "loss": 0.0259, + "step": 30280 + }, + { + "epoch": 0.12101599097074481, + "grad_norm": 3.6243388652801514, + "learning_rate": 1.9830861486846816e-07, + "loss": 0.0272, + "step": 30290 + }, + { + "epoch": 0.12105594342732148, + "grad_norm": 6.045682430267334, + "learning_rate": 1.983074630175966e-07, + "loss": 0.0258, + "step": 30300 + }, + { + "epoch": 0.12109589588389816, + "grad_norm": 5.790445327758789, + "learning_rate": 1.9830631077799483e-07, + "loss": 0.0287, + "step": 30310 + }, + { + "epoch": 0.12113584834047483, + "grad_norm": 2.9511497020721436, + "learning_rate": 1.9830515814966735e-07, + "loss": 0.0208, + "step": 30320 + }, + { + "epoch": 0.12117580079705151, + "grad_norm": 1.9936420917510986, + "learning_rate": 1.9830400513261878e-07, + "loss": 0.0228, + "step": 30330 + }, + { + "epoch": 0.12121575325362818, + "grad_norm": 4.936613082885742, + "learning_rate": 1.9830285172685362e-07, + "loss": 0.0231, + "step": 30340 + }, + { + "epoch": 0.12125570571020486, + "grad_norm": 2.4000492095947266, + "learning_rate": 1.9830169793237645e-07, + "loss": 0.024, + "step": 30350 + }, + { + "epoch": 0.12129565816678153, + "grad_norm": 4.253389835357666, + "learning_rate": 1.9830054374919186e-07, + "loss": 0.0264, + "step": 30360 + }, + { + "epoch": 0.12133561062335821, + "grad_norm": 3.8204166889190674, + "learning_rate": 1.9829938917730433e-07, + "loss": 0.0245, + "step": 30370 + }, + { + "epoch": 0.12137556307993488, + "grad_norm": 4.3851494789123535, + "learning_rate": 1.9829823421671854e-07, + "loss": 0.0275, + "step": 30380 + }, + { + "epoch": 0.12141551553651155, + "grad_norm": 3.585768938064575, + "learning_rate": 1.9829707886743896e-07, + "loss": 0.0244, + "step": 30390 + }, + { + "epoch": 0.12145546799308822, + "grad_norm": 2.924774408340454, + "learning_rate": 1.9829592312947023e-07, + "loss": 0.0226, + "step": 30400 + }, + { + "epoch": 0.1214954204496649, + "grad_norm": 4.669759273529053, + "learning_rate": 1.9829476700281688e-07, + "loss": 0.0247, + "step": 30410 + }, + { + "epoch": 0.12153537290624157, + "grad_norm": 3.6008191108703613, + "learning_rate": 1.982936104874835e-07, + "loss": 0.0233, + "step": 30420 + }, + { + "epoch": 0.12157532536281825, + "grad_norm": 9.031393051147461, + "learning_rate": 1.9829245358347464e-07, + "loss": 0.0305, + "step": 30430 + }, + { + "epoch": 0.12161527781939492, + "grad_norm": 2.76313853263855, + "learning_rate": 1.982912962907949e-07, + "loss": 0.0231, + "step": 30440 + }, + { + "epoch": 0.1216552302759716, + "grad_norm": 7.855586528778076, + "learning_rate": 1.9829013860944884e-07, + "loss": 0.0281, + "step": 30450 + }, + { + "epoch": 0.12169518273254827, + "grad_norm": 1.9519516229629517, + "learning_rate": 1.9828898053944102e-07, + "loss": 0.0278, + "step": 30460 + }, + { + "epoch": 0.12173513518912495, + "grad_norm": 2.366786003112793, + "learning_rate": 1.9828782208077608e-07, + "loss": 0.0245, + "step": 30470 + }, + { + "epoch": 0.12177508764570162, + "grad_norm": 5.360734939575195, + "learning_rate": 1.9828666323345856e-07, + "loss": 0.0261, + "step": 30480 + }, + { + "epoch": 0.12181504010227828, + "grad_norm": 9.166790962219238, + "learning_rate": 1.9828550399749303e-07, + "loss": 0.0264, + "step": 30490 + }, + { + "epoch": 0.12185499255885496, + "grad_norm": 5.380296230316162, + "learning_rate": 1.9828434437288405e-07, + "loss": 0.0234, + "step": 30500 + }, + { + "epoch": 0.12189494501543163, + "grad_norm": 5.040682315826416, + "learning_rate": 1.982831843596363e-07, + "loss": 0.0262, + "step": 30510 + }, + { + "epoch": 0.12193489747200831, + "grad_norm": 7.695793628692627, + "learning_rate": 1.9828202395775432e-07, + "loss": 0.0248, + "step": 30520 + }, + { + "epoch": 0.12197484992858498, + "grad_norm": 5.833404064178467, + "learning_rate": 1.9828086316724266e-07, + "loss": 0.0245, + "step": 30530 + }, + { + "epoch": 0.12201480238516166, + "grad_norm": 4.040808200836182, + "learning_rate": 1.9827970198810594e-07, + "loss": 0.0281, + "step": 30540 + }, + { + "epoch": 0.12205475484173833, + "grad_norm": 2.224146604537964, + "learning_rate": 1.9827854042034875e-07, + "loss": 0.0264, + "step": 30550 + }, + { + "epoch": 0.12209470729831501, + "grad_norm": 3.0244650840759277, + "learning_rate": 1.982773784639757e-07, + "loss": 0.0276, + "step": 30560 + }, + { + "epoch": 0.12213465975489168, + "grad_norm": 4.43235969543457, + "learning_rate": 1.9827621611899136e-07, + "loss": 0.0281, + "step": 30570 + }, + { + "epoch": 0.12217461221146836, + "grad_norm": 7.112714767456055, + "learning_rate": 1.9827505338540034e-07, + "loss": 0.0242, + "step": 30580 + }, + { + "epoch": 0.12221456466804502, + "grad_norm": 8.95909309387207, + "learning_rate": 1.9827389026320722e-07, + "loss": 0.0314, + "step": 30590 + }, + { + "epoch": 0.1222545171246217, + "grad_norm": 5.871224403381348, + "learning_rate": 1.9827272675241662e-07, + "loss": 0.0242, + "step": 30600 + }, + { + "epoch": 0.12229446958119837, + "grad_norm": 3.991833448410034, + "learning_rate": 1.9827156285303316e-07, + "loss": 0.0198, + "step": 30610 + }, + { + "epoch": 0.12233442203777505, + "grad_norm": 2.411550283432007, + "learning_rate": 1.9827039856506137e-07, + "loss": 0.026, + "step": 30620 + }, + { + "epoch": 0.12237437449435172, + "grad_norm": 3.470397710800171, + "learning_rate": 1.9826923388850592e-07, + "loss": 0.0246, + "step": 30630 + }, + { + "epoch": 0.1224143269509284, + "grad_norm": 8.23436450958252, + "learning_rate": 1.982680688233714e-07, + "loss": 0.0286, + "step": 30640 + }, + { + "epoch": 0.12245427940750507, + "grad_norm": 3.6807684898376465, + "learning_rate": 1.9826690336966241e-07, + "loss": 0.0237, + "step": 30650 + }, + { + "epoch": 0.12249423186408175, + "grad_norm": 2.964940309524536, + "learning_rate": 1.9826573752738355e-07, + "loss": 0.022, + "step": 30660 + }, + { + "epoch": 0.12253418432065842, + "grad_norm": 5.007888317108154, + "learning_rate": 1.9826457129653945e-07, + "loss": 0.0259, + "step": 30670 + }, + { + "epoch": 0.1225741367772351, + "grad_norm": 3.2726213932037354, + "learning_rate": 1.982634046771347e-07, + "loss": 0.0265, + "step": 30680 + }, + { + "epoch": 0.12261408923381176, + "grad_norm": 4.006194114685059, + "learning_rate": 1.9826223766917391e-07, + "loss": 0.0227, + "step": 30690 + }, + { + "epoch": 0.12265404169038843, + "grad_norm": 4.21021842956543, + "learning_rate": 1.9826107027266175e-07, + "loss": 0.0271, + "step": 30700 + }, + { + "epoch": 0.12269399414696511, + "grad_norm": 3.089609146118164, + "learning_rate": 1.982599024876028e-07, + "loss": 0.0234, + "step": 30710 + }, + { + "epoch": 0.12273394660354178, + "grad_norm": 2.7303154468536377, + "learning_rate": 1.9825873431400164e-07, + "loss": 0.0222, + "step": 30720 + }, + { + "epoch": 0.12277389906011846, + "grad_norm": 3.991281509399414, + "learning_rate": 1.9825756575186293e-07, + "loss": 0.0252, + "step": 30730 + }, + { + "epoch": 0.12281385151669513, + "grad_norm": 3.121595859527588, + "learning_rate": 1.982563968011913e-07, + "loss": 0.0254, + "step": 30740 + }, + { + "epoch": 0.12285380397327181, + "grad_norm": 2.859830379486084, + "learning_rate": 1.9825522746199133e-07, + "loss": 0.0245, + "step": 30750 + }, + { + "epoch": 0.12289375642984848, + "grad_norm": 2.4791295528411865, + "learning_rate": 1.9825405773426765e-07, + "loss": 0.0236, + "step": 30760 + }, + { + "epoch": 0.12293370888642516, + "grad_norm": 2.8919107913970947, + "learning_rate": 1.9825288761802495e-07, + "loss": 0.0273, + "step": 30770 + }, + { + "epoch": 0.12297366134300183, + "grad_norm": 2.2381088733673096, + "learning_rate": 1.982517171132678e-07, + "loss": 0.0236, + "step": 30780 + }, + { + "epoch": 0.12301361379957851, + "grad_norm": 5.815417289733887, + "learning_rate": 1.9825054622000086e-07, + "loss": 0.0237, + "step": 30790 + }, + { + "epoch": 0.12305356625615517, + "grad_norm": 2.001610279083252, + "learning_rate": 1.9824937493822872e-07, + "loss": 0.0243, + "step": 30800 + }, + { + "epoch": 0.12309351871273184, + "grad_norm": 3.9816949367523193, + "learning_rate": 1.98248203267956e-07, + "loss": 0.0267, + "step": 30810 + }, + { + "epoch": 0.12313347116930852, + "grad_norm": 5.918919563293457, + "learning_rate": 1.982470312091874e-07, + "loss": 0.0272, + "step": 30820 + }, + { + "epoch": 0.1231734236258852, + "grad_norm": 8.038961410522461, + "learning_rate": 1.9824585876192753e-07, + "loss": 0.0251, + "step": 30830 + }, + { + "epoch": 0.12321337608246187, + "grad_norm": 9.71016788482666, + "learning_rate": 1.98244685926181e-07, + "loss": 0.0194, + "step": 30840 + }, + { + "epoch": 0.12325332853903855, + "grad_norm": 7.030154705047607, + "learning_rate": 1.9824351270195244e-07, + "loss": 0.0282, + "step": 30850 + }, + { + "epoch": 0.12329328099561522, + "grad_norm": 2.3177709579467773, + "learning_rate": 1.9824233908924654e-07, + "loss": 0.023, + "step": 30860 + }, + { + "epoch": 0.1233332334521919, + "grad_norm": 5.074477195739746, + "learning_rate": 1.982411650880679e-07, + "loss": 0.0291, + "step": 30870 + }, + { + "epoch": 0.12337318590876857, + "grad_norm": 3.7671444416046143, + "learning_rate": 1.982399906984212e-07, + "loss": 0.0271, + "step": 30880 + }, + { + "epoch": 0.12341313836534525, + "grad_norm": 5.094048500061035, + "learning_rate": 1.9823881592031103e-07, + "loss": 0.0273, + "step": 30890 + }, + { + "epoch": 0.12345309082192191, + "grad_norm": 3.502107858657837, + "learning_rate": 1.9823764075374208e-07, + "loss": 0.0239, + "step": 30900 + }, + { + "epoch": 0.12349304327849858, + "grad_norm": 3.1292598247528076, + "learning_rate": 1.9823646519871898e-07, + "loss": 0.0243, + "step": 30910 + }, + { + "epoch": 0.12353299573507526, + "grad_norm": 1.851166009902954, + "learning_rate": 1.9823528925524636e-07, + "loss": 0.0249, + "step": 30920 + }, + { + "epoch": 0.12357294819165193, + "grad_norm": 2.8047404289245605, + "learning_rate": 1.9823411292332892e-07, + "loss": 0.0212, + "step": 30930 + }, + { + "epoch": 0.12361290064822861, + "grad_norm": 4.46400260925293, + "learning_rate": 1.9823293620297125e-07, + "loss": 0.0251, + "step": 30940 + }, + { + "epoch": 0.12365285310480528, + "grad_norm": 3.769240617752075, + "learning_rate": 1.9823175909417804e-07, + "loss": 0.0246, + "step": 30950 + }, + { + "epoch": 0.12369280556138196, + "grad_norm": 10.1272611618042, + "learning_rate": 1.9823058159695397e-07, + "loss": 0.0241, + "step": 30960 + }, + { + "epoch": 0.12373275801795863, + "grad_norm": 2.66916823387146, + "learning_rate": 1.9822940371130362e-07, + "loss": 0.0215, + "step": 30970 + }, + { + "epoch": 0.12377271047453531, + "grad_norm": 8.624239921569824, + "learning_rate": 1.9822822543723175e-07, + "loss": 0.0239, + "step": 30980 + }, + { + "epoch": 0.12381266293111198, + "grad_norm": 4.934239864349365, + "learning_rate": 1.9822704677474292e-07, + "loss": 0.0259, + "step": 30990 + }, + { + "epoch": 0.12385261538768864, + "grad_norm": 2.6507816314697266, + "learning_rate": 1.9822586772384184e-07, + "loss": 0.0289, + "step": 31000 + }, + { + "epoch": 0.12389256784426532, + "grad_norm": 4.515950679779053, + "learning_rate": 1.9822468828453316e-07, + "loss": 0.0256, + "step": 31010 + }, + { + "epoch": 0.123932520300842, + "grad_norm": 4.782577037811279, + "learning_rate": 1.9822350845682157e-07, + "loss": 0.022, + "step": 31020 + }, + { + "epoch": 0.12397247275741867, + "grad_norm": 3.83223032951355, + "learning_rate": 1.982223282407117e-07, + "loss": 0.0271, + "step": 31030 + }, + { + "epoch": 0.12401242521399534, + "grad_norm": 6.892681121826172, + "learning_rate": 1.982211476362082e-07, + "loss": 0.0264, + "step": 31040 + }, + { + "epoch": 0.12405237767057202, + "grad_norm": 4.537559986114502, + "learning_rate": 1.982199666433158e-07, + "loss": 0.0315, + "step": 31050 + }, + { + "epoch": 0.1240923301271487, + "grad_norm": 3.678162097930908, + "learning_rate": 1.9821878526203912e-07, + "loss": 0.0301, + "step": 31060 + }, + { + "epoch": 0.12413228258372537, + "grad_norm": 5.602746486663818, + "learning_rate": 1.9821760349238287e-07, + "loss": 0.0292, + "step": 31070 + }, + { + "epoch": 0.12417223504030205, + "grad_norm": 3.1480581760406494, + "learning_rate": 1.982164213343517e-07, + "loss": 0.0249, + "step": 31080 + }, + { + "epoch": 0.12421218749687872, + "grad_norm": 4.124526500701904, + "learning_rate": 1.9821523878795028e-07, + "loss": 0.0285, + "step": 31090 + }, + { + "epoch": 0.12425213995345538, + "grad_norm": 3.9495363235473633, + "learning_rate": 1.982140558531833e-07, + "loss": 0.0239, + "step": 31100 + }, + { + "epoch": 0.12429209241003206, + "grad_norm": 5.235224723815918, + "learning_rate": 1.9821287253005545e-07, + "loss": 0.0259, + "step": 31110 + }, + { + "epoch": 0.12433204486660873, + "grad_norm": 7.678544044494629, + "learning_rate": 1.9821168881857135e-07, + "loss": 0.0226, + "step": 31120 + }, + { + "epoch": 0.12437199732318541, + "grad_norm": 5.920469760894775, + "learning_rate": 1.9821050471873575e-07, + "loss": 0.0278, + "step": 31130 + }, + { + "epoch": 0.12441194977976208, + "grad_norm": 6.04869270324707, + "learning_rate": 1.982093202305533e-07, + "loss": 0.0285, + "step": 31140 + }, + { + "epoch": 0.12445190223633876, + "grad_norm": 2.293212890625, + "learning_rate": 1.9820813535402868e-07, + "loss": 0.0267, + "step": 31150 + }, + { + "epoch": 0.12449185469291543, + "grad_norm": 2.6353583335876465, + "learning_rate": 1.982069500891666e-07, + "loss": 0.0226, + "step": 31160 + }, + { + "epoch": 0.12453180714949211, + "grad_norm": 6.085938930511475, + "learning_rate": 1.982057644359717e-07, + "loss": 0.0254, + "step": 31170 + }, + { + "epoch": 0.12457175960606878, + "grad_norm": 5.286264896392822, + "learning_rate": 1.982045783944487e-07, + "loss": 0.027, + "step": 31180 + }, + { + "epoch": 0.12461171206264546, + "grad_norm": 2.549724578857422, + "learning_rate": 1.9820339196460232e-07, + "loss": 0.0248, + "step": 31190 + }, + { + "epoch": 0.12465166451922212, + "grad_norm": 2.6574156284332275, + "learning_rate": 1.982022051464372e-07, + "loss": 0.0229, + "step": 31200 + }, + { + "epoch": 0.1246916169757988, + "grad_norm": 3.0999789237976074, + "learning_rate": 1.9820101793995802e-07, + "loss": 0.0218, + "step": 31210 + }, + { + "epoch": 0.12473156943237547, + "grad_norm": 5.150970935821533, + "learning_rate": 1.9819983034516956e-07, + "loss": 0.0242, + "step": 31220 + }, + { + "epoch": 0.12477152188895214, + "grad_norm": 5.516458034515381, + "learning_rate": 1.9819864236207642e-07, + "loss": 0.0251, + "step": 31230 + }, + { + "epoch": 0.12481147434552882, + "grad_norm": 4.014908790588379, + "learning_rate": 1.9819745399068334e-07, + "loss": 0.0242, + "step": 31240 + }, + { + "epoch": 0.1248514268021055, + "grad_norm": 3.9944562911987305, + "learning_rate": 1.9819626523099506e-07, + "loss": 0.0263, + "step": 31250 + }, + { + "epoch": 0.12489137925868217, + "grad_norm": 1.5109765529632568, + "learning_rate": 1.981950760830162e-07, + "loss": 0.0212, + "step": 31260 + }, + { + "epoch": 0.12493133171525884, + "grad_norm": 3.420461893081665, + "learning_rate": 1.9819388654675153e-07, + "loss": 0.0256, + "step": 31270 + }, + { + "epoch": 0.12497128417183552, + "grad_norm": 5.700145244598389, + "learning_rate": 1.981926966222057e-07, + "loss": 0.026, + "step": 31280 + }, + { + "epoch": 0.12501123662841218, + "grad_norm": 4.567274570465088, + "learning_rate": 1.9819150630938348e-07, + "loss": 0.0266, + "step": 31290 + }, + { + "epoch": 0.12505118908498886, + "grad_norm": 2.672191858291626, + "learning_rate": 1.981903156082895e-07, + "loss": 0.0258, + "step": 31300 + }, + { + "epoch": 0.12509114154156553, + "grad_norm": 7.893247127532959, + "learning_rate": 1.9818912451892854e-07, + "loss": 0.023, + "step": 31310 + }, + { + "epoch": 0.1251310939981422, + "grad_norm": 3.5404751300811768, + "learning_rate": 1.9818793304130521e-07, + "loss": 0.0215, + "step": 31320 + }, + { + "epoch": 0.12517104645471888, + "grad_norm": 2.5844409465789795, + "learning_rate": 1.9818674117542432e-07, + "loss": 0.021, + "step": 31330 + }, + { + "epoch": 0.12521099891129556, + "grad_norm": 3.8845012187957764, + "learning_rate": 1.9818554892129058e-07, + "loss": 0.0226, + "step": 31340 + }, + { + "epoch": 0.12525095136787223, + "grad_norm": 2.727587938308716, + "learning_rate": 1.9818435627890867e-07, + "loss": 0.022, + "step": 31350 + }, + { + "epoch": 0.1252909038244489, + "grad_norm": 5.795429706573486, + "learning_rate": 1.9818316324828328e-07, + "loss": 0.0294, + "step": 31360 + }, + { + "epoch": 0.12533085628102558, + "grad_norm": 1.6398024559020996, + "learning_rate": 1.9818196982941918e-07, + "loss": 0.025, + "step": 31370 + }, + { + "epoch": 0.12537080873760226, + "grad_norm": 4.787398338317871, + "learning_rate": 1.9818077602232104e-07, + "loss": 0.0251, + "step": 31380 + }, + { + "epoch": 0.12541076119417893, + "grad_norm": 4.661961078643799, + "learning_rate": 1.9817958182699364e-07, + "loss": 0.0291, + "step": 31390 + }, + { + "epoch": 0.1254507136507556, + "grad_norm": 4.387763500213623, + "learning_rate": 1.9817838724344165e-07, + "loss": 0.0286, + "step": 31400 + }, + { + "epoch": 0.12549066610733228, + "grad_norm": 3.7336065769195557, + "learning_rate": 1.9817719227166983e-07, + "loss": 0.0262, + "step": 31410 + }, + { + "epoch": 0.12553061856390896, + "grad_norm": 3.729398727416992, + "learning_rate": 1.9817599691168287e-07, + "loss": 0.0207, + "step": 31420 + }, + { + "epoch": 0.12557057102048563, + "grad_norm": 4.170183181762695, + "learning_rate": 1.9817480116348552e-07, + "loss": 0.0294, + "step": 31430 + }, + { + "epoch": 0.1256105234770623, + "grad_norm": 2.6864101886749268, + "learning_rate": 1.9817360502708248e-07, + "loss": 0.0251, + "step": 31440 + }, + { + "epoch": 0.12565047593363898, + "grad_norm": 2.8742318153381348, + "learning_rate": 1.9817240850247856e-07, + "loss": 0.0219, + "step": 31450 + }, + { + "epoch": 0.12569042839021563, + "grad_norm": 7.241616249084473, + "learning_rate": 1.981712115896784e-07, + "loss": 0.027, + "step": 31460 + }, + { + "epoch": 0.1257303808467923, + "grad_norm": 2.5224554538726807, + "learning_rate": 1.9817001428868675e-07, + "loss": 0.026, + "step": 31470 + }, + { + "epoch": 0.12577033330336898, + "grad_norm": 3.4115283489227295, + "learning_rate": 1.981688165995084e-07, + "loss": 0.0255, + "step": 31480 + }, + { + "epoch": 0.12581028575994566, + "grad_norm": 2.105731248855591, + "learning_rate": 1.98167618522148e-07, + "loss": 0.0224, + "step": 31490 + }, + { + "epoch": 0.12585023821652233, + "grad_norm": 1.3861132860183716, + "learning_rate": 1.981664200566104e-07, + "loss": 0.0197, + "step": 31500 + }, + { + "epoch": 0.125890190673099, + "grad_norm": 3.674146890640259, + "learning_rate": 1.9816522120290021e-07, + "loss": 0.0246, + "step": 31510 + }, + { + "epoch": 0.12593014312967568, + "grad_norm": 8.56765079498291, + "learning_rate": 1.981640219610223e-07, + "loss": 0.0249, + "step": 31520 + }, + { + "epoch": 0.12597009558625236, + "grad_norm": 2.9283297061920166, + "learning_rate": 1.981628223309813e-07, + "loss": 0.0232, + "step": 31530 + }, + { + "epoch": 0.12601004804282903, + "grad_norm": 6.038548469543457, + "learning_rate": 1.98161622312782e-07, + "loss": 0.0251, + "step": 31540 + }, + { + "epoch": 0.1260500004994057, + "grad_norm": 3.5183677673339844, + "learning_rate": 1.9816042190642914e-07, + "loss": 0.0285, + "step": 31550 + }, + { + "epoch": 0.12608995295598238, + "grad_norm": 15.188865661621094, + "learning_rate": 1.981592211119275e-07, + "loss": 0.0302, + "step": 31560 + }, + { + "epoch": 0.12612990541255906, + "grad_norm": 3.543917655944824, + "learning_rate": 1.981580199292818e-07, + "loss": 0.0246, + "step": 31570 + }, + { + "epoch": 0.12616985786913573, + "grad_norm": 1.5099226236343384, + "learning_rate": 1.981568183584968e-07, + "loss": 0.0245, + "step": 31580 + }, + { + "epoch": 0.1262098103257124, + "grad_norm": 5.187595844268799, + "learning_rate": 1.9815561639957718e-07, + "loss": 0.024, + "step": 31590 + }, + { + "epoch": 0.12624976278228908, + "grad_norm": 4.4823994636535645, + "learning_rate": 1.981544140525278e-07, + "loss": 0.0293, + "step": 31600 + }, + { + "epoch": 0.12628971523886576, + "grad_norm": 3.659695625305176, + "learning_rate": 1.9815321131735337e-07, + "loss": 0.026, + "step": 31610 + }, + { + "epoch": 0.12632966769544243, + "grad_norm": 1.7553337812423706, + "learning_rate": 1.9815200819405865e-07, + "loss": 0.0249, + "step": 31620 + }, + { + "epoch": 0.1263696201520191, + "grad_norm": 5.511161804199219, + "learning_rate": 1.9815080468264838e-07, + "loss": 0.0258, + "step": 31630 + }, + { + "epoch": 0.12640957260859578, + "grad_norm": 2.4567582607269287, + "learning_rate": 1.9814960078312732e-07, + "loss": 0.0233, + "step": 31640 + }, + { + "epoch": 0.12644952506517246, + "grad_norm": 4.449940204620361, + "learning_rate": 1.9814839649550023e-07, + "loss": 0.02, + "step": 31650 + }, + { + "epoch": 0.1264894775217491, + "grad_norm": 7.7336859703063965, + "learning_rate": 1.9814719181977192e-07, + "loss": 0.0248, + "step": 31660 + }, + { + "epoch": 0.12652942997832578, + "grad_norm": 5.211762428283691, + "learning_rate": 1.9814598675594708e-07, + "loss": 0.0227, + "step": 31670 + }, + { + "epoch": 0.12656938243490246, + "grad_norm": 8.814682006835938, + "learning_rate": 1.9814478130403057e-07, + "loss": 0.0268, + "step": 31680 + }, + { + "epoch": 0.12660933489147913, + "grad_norm": 3.3300137519836426, + "learning_rate": 1.9814357546402704e-07, + "loss": 0.0278, + "step": 31690 + }, + { + "epoch": 0.1266492873480558, + "grad_norm": 5.263627052307129, + "learning_rate": 1.9814236923594136e-07, + "loss": 0.0225, + "step": 31700 + }, + { + "epoch": 0.12668923980463248, + "grad_norm": 4.101144790649414, + "learning_rate": 1.9814116261977822e-07, + "loss": 0.0242, + "step": 31710 + }, + { + "epoch": 0.12672919226120916, + "grad_norm": 2.0097548961639404, + "learning_rate": 1.9813995561554244e-07, + "loss": 0.0211, + "step": 31720 + }, + { + "epoch": 0.12676914471778583, + "grad_norm": 5.110353946685791, + "learning_rate": 1.981387482232388e-07, + "loss": 0.0214, + "step": 31730 + }, + { + "epoch": 0.1268090971743625, + "grad_norm": 4.564248085021973, + "learning_rate": 1.9813754044287205e-07, + "loss": 0.025, + "step": 31740 + }, + { + "epoch": 0.12684904963093918, + "grad_norm": 2.9247186183929443, + "learning_rate": 1.9813633227444694e-07, + "loss": 0.0276, + "step": 31750 + }, + { + "epoch": 0.12688900208751586, + "grad_norm": 18.451419830322266, + "learning_rate": 1.981351237179683e-07, + "loss": 0.0243, + "step": 31760 + }, + { + "epoch": 0.12692895454409253, + "grad_norm": 4.924864768981934, + "learning_rate": 1.981339147734409e-07, + "loss": 0.0266, + "step": 31770 + }, + { + "epoch": 0.1269689070006692, + "grad_norm": 3.4415969848632812, + "learning_rate": 1.981327054408695e-07, + "loss": 0.0225, + "step": 31780 + }, + { + "epoch": 0.12700885945724588, + "grad_norm": 5.447090148925781, + "learning_rate": 1.981314957202589e-07, + "loss": 0.0259, + "step": 31790 + }, + { + "epoch": 0.12704881191382256, + "grad_norm": 2.690577745437622, + "learning_rate": 1.9813028561161381e-07, + "loss": 0.0252, + "step": 31800 + }, + { + "epoch": 0.12708876437039923, + "grad_norm": 3.045470952987671, + "learning_rate": 1.9812907511493914e-07, + "loss": 0.0247, + "step": 31810 + }, + { + "epoch": 0.1271287168269759, + "grad_norm": 4.302901744842529, + "learning_rate": 1.9812786423023962e-07, + "loss": 0.0238, + "step": 31820 + }, + { + "epoch": 0.12716866928355258, + "grad_norm": 2.943524122238159, + "learning_rate": 1.9812665295752e-07, + "loss": 0.0275, + "step": 31830 + }, + { + "epoch": 0.12720862174012926, + "grad_norm": 6.807154178619385, + "learning_rate": 1.9812544129678512e-07, + "loss": 0.0213, + "step": 31840 + }, + { + "epoch": 0.12724857419670593, + "grad_norm": 3.1984338760375977, + "learning_rate": 1.9812422924803975e-07, + "loss": 0.0258, + "step": 31850 + }, + { + "epoch": 0.12728852665328258, + "grad_norm": 11.737739562988281, + "learning_rate": 1.9812301681128867e-07, + "loss": 0.0247, + "step": 31860 + }, + { + "epoch": 0.12732847910985925, + "grad_norm": 2.9529690742492676, + "learning_rate": 1.981218039865367e-07, + "loss": 0.0239, + "step": 31870 + }, + { + "epoch": 0.12736843156643593, + "grad_norm": 8.520682334899902, + "learning_rate": 1.9812059077378863e-07, + "loss": 0.0279, + "step": 31880 + }, + { + "epoch": 0.1274083840230126, + "grad_norm": 7.449371814727783, + "learning_rate": 1.9811937717304925e-07, + "loss": 0.0268, + "step": 31890 + }, + { + "epoch": 0.12744833647958928, + "grad_norm": 1.4915614128112793, + "learning_rate": 1.981181631843234e-07, + "loss": 0.0217, + "step": 31900 + }, + { + "epoch": 0.12748828893616596, + "grad_norm": 2.202767848968506, + "learning_rate": 1.9811694880761582e-07, + "loss": 0.0197, + "step": 31910 + }, + { + "epoch": 0.12752824139274263, + "grad_norm": 4.089975357055664, + "learning_rate": 1.9811573404293132e-07, + "loss": 0.0329, + "step": 31920 + }, + { + "epoch": 0.1275681938493193, + "grad_norm": 3.456818103790283, + "learning_rate": 1.9811451889027472e-07, + "loss": 0.024, + "step": 31930 + }, + { + "epoch": 0.12760814630589598, + "grad_norm": 1.4961016178131104, + "learning_rate": 1.981133033496508e-07, + "loss": 0.0247, + "step": 31940 + }, + { + "epoch": 0.12764809876247266, + "grad_norm": 4.583690166473389, + "learning_rate": 1.981120874210644e-07, + "loss": 0.0249, + "step": 31950 + }, + { + "epoch": 0.12768805121904933, + "grad_norm": 4.906161785125732, + "learning_rate": 1.9811087110452037e-07, + "loss": 0.0278, + "step": 31960 + }, + { + "epoch": 0.127728003675626, + "grad_norm": 5.526266098022461, + "learning_rate": 1.9810965440002344e-07, + "loss": 0.0258, + "step": 31970 + }, + { + "epoch": 0.12776795613220268, + "grad_norm": 4.216318607330322, + "learning_rate": 1.9810843730757843e-07, + "loss": 0.0237, + "step": 31980 + }, + { + "epoch": 0.12780790858877936, + "grad_norm": 2.901221752166748, + "learning_rate": 1.9810721982719016e-07, + "loss": 0.0223, + "step": 31990 + }, + { + "epoch": 0.12784786104535603, + "grad_norm": 3.5233705043792725, + "learning_rate": 1.981060019588635e-07, + "loss": 0.0233, + "step": 32000 + }, + { + "epoch": 0.1278878135019327, + "grad_norm": 3.028592824935913, + "learning_rate": 1.981047837026032e-07, + "loss": 0.0231, + "step": 32010 + }, + { + "epoch": 0.12792776595850938, + "grad_norm": 7.894491672515869, + "learning_rate": 1.9810356505841411e-07, + "loss": 0.0267, + "step": 32020 + }, + { + "epoch": 0.12796771841508606, + "grad_norm": 10.350991249084473, + "learning_rate": 1.98102346026301e-07, + "loss": 0.0257, + "step": 32030 + }, + { + "epoch": 0.12800767087166273, + "grad_norm": 1.9900563955307007, + "learning_rate": 1.9810112660626874e-07, + "loss": 0.0218, + "step": 32040 + }, + { + "epoch": 0.1280476233282394, + "grad_norm": 6.842346668243408, + "learning_rate": 1.9809990679832218e-07, + "loss": 0.0251, + "step": 32050 + }, + { + "epoch": 0.12808757578481608, + "grad_norm": 4.689587116241455, + "learning_rate": 1.9809868660246606e-07, + "loss": 0.0234, + "step": 32060 + }, + { + "epoch": 0.12812752824139273, + "grad_norm": 3.6415984630584717, + "learning_rate": 1.9809746601870527e-07, + "loss": 0.0262, + "step": 32070 + }, + { + "epoch": 0.1281674806979694, + "grad_norm": 4.9456915855407715, + "learning_rate": 1.980962450470446e-07, + "loss": 0.0223, + "step": 32080 + }, + { + "epoch": 0.12820743315454608, + "grad_norm": 2.3123891353607178, + "learning_rate": 1.980950236874889e-07, + "loss": 0.0275, + "step": 32090 + }, + { + "epoch": 0.12824738561112276, + "grad_norm": 2.8561432361602783, + "learning_rate": 1.9809380194004298e-07, + "loss": 0.0254, + "step": 32100 + }, + { + "epoch": 0.12828733806769943, + "grad_norm": 1.9302200078964233, + "learning_rate": 1.980925798047117e-07, + "loss": 0.0303, + "step": 32110 + }, + { + "epoch": 0.1283272905242761, + "grad_norm": 3.20123028755188, + "learning_rate": 1.9809135728149985e-07, + "loss": 0.0241, + "step": 32120 + }, + { + "epoch": 0.12836724298085278, + "grad_norm": 2.1180365085601807, + "learning_rate": 1.9809013437041233e-07, + "loss": 0.025, + "step": 32130 + }, + { + "epoch": 0.12840719543742946, + "grad_norm": 1.6932445764541626, + "learning_rate": 1.9808891107145392e-07, + "loss": 0.0253, + "step": 32140 + }, + { + "epoch": 0.12844714789400613, + "grad_norm": 1.9654618501663208, + "learning_rate": 1.9808768738462945e-07, + "loss": 0.0218, + "step": 32150 + }, + { + "epoch": 0.1284871003505828, + "grad_norm": 6.443096160888672, + "learning_rate": 1.980864633099438e-07, + "loss": 0.0243, + "step": 32160 + }, + { + "epoch": 0.12852705280715948, + "grad_norm": 5.554087162017822, + "learning_rate": 1.9808523884740178e-07, + "loss": 0.023, + "step": 32170 + }, + { + "epoch": 0.12856700526373616, + "grad_norm": 6.256402015686035, + "learning_rate": 1.9808401399700827e-07, + "loss": 0.0326, + "step": 32180 + }, + { + "epoch": 0.12860695772031283, + "grad_norm": 9.923408508300781, + "learning_rate": 1.9808278875876808e-07, + "loss": 0.0228, + "step": 32190 + }, + { + "epoch": 0.1286469101768895, + "grad_norm": 2.859222412109375, + "learning_rate": 1.9808156313268603e-07, + "loss": 0.0294, + "step": 32200 + }, + { + "epoch": 0.12868686263346618, + "grad_norm": 3.4908854961395264, + "learning_rate": 1.98080337118767e-07, + "loss": 0.0231, + "step": 32210 + }, + { + "epoch": 0.12872681509004286, + "grad_norm": 3.2065541744232178, + "learning_rate": 1.980791107170159e-07, + "loss": 0.0252, + "step": 32220 + }, + { + "epoch": 0.12876676754661953, + "grad_norm": 2.799485445022583, + "learning_rate": 1.9807788392743743e-07, + "loss": 0.0216, + "step": 32230 + }, + { + "epoch": 0.1288067200031962, + "grad_norm": 4.646878242492676, + "learning_rate": 1.980766567500366e-07, + "loss": 0.0216, + "step": 32240 + }, + { + "epoch": 0.12884667245977288, + "grad_norm": 1.8887706995010376, + "learning_rate": 1.9807542918481812e-07, + "loss": 0.0223, + "step": 32250 + }, + { + "epoch": 0.12888662491634956, + "grad_norm": 3.34112286567688, + "learning_rate": 1.9807420123178696e-07, + "loss": 0.027, + "step": 32260 + }, + { + "epoch": 0.1289265773729262, + "grad_norm": 4.06746768951416, + "learning_rate": 1.980729728909479e-07, + "loss": 0.0213, + "step": 32270 + }, + { + "epoch": 0.12896652982950288, + "grad_norm": 3.0136735439300537, + "learning_rate": 1.9807174416230586e-07, + "loss": 0.0195, + "step": 32280 + }, + { + "epoch": 0.12900648228607955, + "grad_norm": 22.130029678344727, + "learning_rate": 1.9807051504586566e-07, + "loss": 0.0307, + "step": 32290 + }, + { + "epoch": 0.12904643474265623, + "grad_norm": 5.775410175323486, + "learning_rate": 1.9806928554163215e-07, + "loss": 0.0212, + "step": 32300 + }, + { + "epoch": 0.1290863871992329, + "grad_norm": 9.423768043518066, + "learning_rate": 1.980680556496102e-07, + "loss": 0.0238, + "step": 32310 + }, + { + "epoch": 0.12912633965580958, + "grad_norm": 5.196847915649414, + "learning_rate": 1.9806682536980468e-07, + "loss": 0.0269, + "step": 32320 + }, + { + "epoch": 0.12916629211238626, + "grad_norm": 5.781191825866699, + "learning_rate": 1.9806559470222047e-07, + "loss": 0.0262, + "step": 32330 + }, + { + "epoch": 0.12920624456896293, + "grad_norm": 2.3911547660827637, + "learning_rate": 1.980643636468624e-07, + "loss": 0.0266, + "step": 32340 + }, + { + "epoch": 0.1292461970255396, + "grad_norm": 22.52849769592285, + "learning_rate": 1.9806313220373537e-07, + "loss": 0.0238, + "step": 32350 + }, + { + "epoch": 0.12928614948211628, + "grad_norm": 3.02000093460083, + "learning_rate": 1.9806190037284423e-07, + "loss": 0.0233, + "step": 32360 + }, + { + "epoch": 0.12932610193869296, + "grad_norm": 2.71122670173645, + "learning_rate": 1.9806066815419385e-07, + "loss": 0.025, + "step": 32370 + }, + { + "epoch": 0.12936605439526963, + "grad_norm": 2.7403271198272705, + "learning_rate": 1.9805943554778915e-07, + "loss": 0.0224, + "step": 32380 + }, + { + "epoch": 0.1294060068518463, + "grad_norm": 4.468947410583496, + "learning_rate": 1.9805820255363495e-07, + "loss": 0.0276, + "step": 32390 + }, + { + "epoch": 0.12944595930842298, + "grad_norm": 2.0653347969055176, + "learning_rate": 1.9805696917173612e-07, + "loss": 0.0249, + "step": 32400 + }, + { + "epoch": 0.12948591176499966, + "grad_norm": 6.298212051391602, + "learning_rate": 1.9805573540209758e-07, + "loss": 0.0257, + "step": 32410 + }, + { + "epoch": 0.12952586422157633, + "grad_norm": 1.8548295497894287, + "learning_rate": 1.9805450124472418e-07, + "loss": 0.0241, + "step": 32420 + }, + { + "epoch": 0.129565816678153, + "grad_norm": 5.324073314666748, + "learning_rate": 1.980532666996208e-07, + "loss": 0.0257, + "step": 32430 + }, + { + "epoch": 0.12960576913472968, + "grad_norm": 6.837103843688965, + "learning_rate": 1.9805203176679234e-07, + "loss": 0.0261, + "step": 32440 + }, + { + "epoch": 0.12964572159130636, + "grad_norm": 8.793717384338379, + "learning_rate": 1.9805079644624367e-07, + "loss": 0.0269, + "step": 32450 + }, + { + "epoch": 0.12968567404788303, + "grad_norm": 3.7164671421051025, + "learning_rate": 1.9804956073797967e-07, + "loss": 0.0279, + "step": 32460 + }, + { + "epoch": 0.1297256265044597, + "grad_norm": 3.82368803024292, + "learning_rate": 1.9804832464200522e-07, + "loss": 0.0241, + "step": 32470 + }, + { + "epoch": 0.12976557896103635, + "grad_norm": 3.928201913833618, + "learning_rate": 1.9804708815832526e-07, + "loss": 0.022, + "step": 32480 + }, + { + "epoch": 0.12980553141761303, + "grad_norm": 5.7331132888793945, + "learning_rate": 1.980458512869446e-07, + "loss": 0.0271, + "step": 32490 + }, + { + "epoch": 0.1298454838741897, + "grad_norm": 4.8804521560668945, + "learning_rate": 1.980446140278682e-07, + "loss": 0.023, + "step": 32500 + }, + { + "epoch": 0.12988543633076638, + "grad_norm": 4.27169942855835, + "learning_rate": 1.980433763811009e-07, + "loss": 0.0221, + "step": 32510 + }, + { + "epoch": 0.12992538878734305, + "grad_norm": 4.616256237030029, + "learning_rate": 1.9804213834664764e-07, + "loss": 0.0284, + "step": 32520 + }, + { + "epoch": 0.12996534124391973, + "grad_norm": 3.924711227416992, + "learning_rate": 1.9804089992451327e-07, + "loss": 0.0237, + "step": 32530 + }, + { + "epoch": 0.1300052937004964, + "grad_norm": 4.485348701477051, + "learning_rate": 1.9803966111470275e-07, + "loss": 0.0264, + "step": 32540 + }, + { + "epoch": 0.13004524615707308, + "grad_norm": 2.511676073074341, + "learning_rate": 1.9803842191722086e-07, + "loss": 0.0223, + "step": 32550 + }, + { + "epoch": 0.13008519861364976, + "grad_norm": 5.5210466384887695, + "learning_rate": 1.9803718233207266e-07, + "loss": 0.0243, + "step": 32560 + }, + { + "epoch": 0.13012515107022643, + "grad_norm": 3.8145625591278076, + "learning_rate": 1.9803594235926293e-07, + "loss": 0.0224, + "step": 32570 + }, + { + "epoch": 0.1301651035268031, + "grad_norm": 3.4723727703094482, + "learning_rate": 1.980347019987966e-07, + "loss": 0.0239, + "step": 32580 + }, + { + "epoch": 0.13020505598337978, + "grad_norm": 3.7907614707946777, + "learning_rate": 1.9803346125067862e-07, + "loss": 0.0281, + "step": 32590 + }, + { + "epoch": 0.13024500843995646, + "grad_norm": 1.9160479307174683, + "learning_rate": 1.9803222011491382e-07, + "loss": 0.0234, + "step": 32600 + }, + { + "epoch": 0.13028496089653313, + "grad_norm": 6.284611701965332, + "learning_rate": 1.980309785915072e-07, + "loss": 0.0239, + "step": 32610 + }, + { + "epoch": 0.1303249133531098, + "grad_norm": 6.031239986419678, + "learning_rate": 1.980297366804636e-07, + "loss": 0.0264, + "step": 32620 + }, + { + "epoch": 0.13036486580968648, + "grad_norm": 2.902940511703491, + "learning_rate": 1.9802849438178795e-07, + "loss": 0.0235, + "step": 32630 + }, + { + "epoch": 0.13040481826626316, + "grad_norm": 4.144721508026123, + "learning_rate": 1.980272516954852e-07, + "loss": 0.0255, + "step": 32640 + }, + { + "epoch": 0.13044477072283983, + "grad_norm": 22.110721588134766, + "learning_rate": 1.9802600862156017e-07, + "loss": 0.0244, + "step": 32650 + }, + { + "epoch": 0.1304847231794165, + "grad_norm": 3.0266244411468506, + "learning_rate": 1.9802476516001786e-07, + "loss": 0.023, + "step": 32660 + }, + { + "epoch": 0.13052467563599318, + "grad_norm": 9.044718742370605, + "learning_rate": 1.9802352131086314e-07, + "loss": 0.0223, + "step": 32670 + }, + { + "epoch": 0.13056462809256983, + "grad_norm": 4.289109706878662, + "learning_rate": 1.98022277074101e-07, + "loss": 0.0334, + "step": 32680 + }, + { + "epoch": 0.1306045805491465, + "grad_norm": 1.746596336364746, + "learning_rate": 1.9802103244973628e-07, + "loss": 0.0239, + "step": 32690 + }, + { + "epoch": 0.13064453300572318, + "grad_norm": 1.7506800889968872, + "learning_rate": 1.980197874377739e-07, + "loss": 0.0206, + "step": 32700 + }, + { + "epoch": 0.13068448546229985, + "grad_norm": 1.205881118774414, + "learning_rate": 1.9801854203821884e-07, + "loss": 0.0208, + "step": 32710 + }, + { + "epoch": 0.13072443791887653, + "grad_norm": 4.28289794921875, + "learning_rate": 1.98017296251076e-07, + "loss": 0.0214, + "step": 32720 + }, + { + "epoch": 0.1307643903754532, + "grad_norm": 22.846296310424805, + "learning_rate": 1.9801605007635032e-07, + "loss": 0.0258, + "step": 32730 + }, + { + "epoch": 0.13080434283202988, + "grad_norm": 4.520547389984131, + "learning_rate": 1.9801480351404666e-07, + "loss": 0.0315, + "step": 32740 + }, + { + "epoch": 0.13084429528860655, + "grad_norm": 2.969719409942627, + "learning_rate": 1.9801355656417005e-07, + "loss": 0.0264, + "step": 32750 + }, + { + "epoch": 0.13088424774518323, + "grad_norm": 11.287675857543945, + "learning_rate": 1.9801230922672536e-07, + "loss": 0.0281, + "step": 32760 + }, + { + "epoch": 0.1309242002017599, + "grad_norm": 7.451023101806641, + "learning_rate": 1.9801106150171751e-07, + "loss": 0.0233, + "step": 32770 + }, + { + "epoch": 0.13096415265833658, + "grad_norm": 1.9810092449188232, + "learning_rate": 1.980098133891515e-07, + "loss": 0.0229, + "step": 32780 + }, + { + "epoch": 0.13100410511491326, + "grad_norm": 2.522408962249756, + "learning_rate": 1.9800856488903217e-07, + "loss": 0.0225, + "step": 32790 + }, + { + "epoch": 0.13104405757148993, + "grad_norm": 6.2328009605407715, + "learning_rate": 1.9800731600136453e-07, + "loss": 0.0244, + "step": 32800 + }, + { + "epoch": 0.1310840100280666, + "grad_norm": 3.528076410293579, + "learning_rate": 1.9800606672615352e-07, + "loss": 0.0213, + "step": 32810 + }, + { + "epoch": 0.13112396248464328, + "grad_norm": 2.081374406814575, + "learning_rate": 1.9800481706340407e-07, + "loss": 0.0274, + "step": 32820 + }, + { + "epoch": 0.13116391494121996, + "grad_norm": 6.26076602935791, + "learning_rate": 1.9800356701312106e-07, + "loss": 0.0238, + "step": 32830 + }, + { + "epoch": 0.13120386739779663, + "grad_norm": 3.0236430168151855, + "learning_rate": 1.980023165753095e-07, + "loss": 0.023, + "step": 32840 + }, + { + "epoch": 0.1312438198543733, + "grad_norm": 6.1415019035339355, + "learning_rate": 1.9800106574997433e-07, + "loss": 0.0218, + "step": 32850 + }, + { + "epoch": 0.13128377231094998, + "grad_norm": 5.557791233062744, + "learning_rate": 1.9799981453712047e-07, + "loss": 0.0247, + "step": 32860 + }, + { + "epoch": 0.13132372476752666, + "grad_norm": 4.087674140930176, + "learning_rate": 1.9799856293675287e-07, + "loss": 0.0218, + "step": 32870 + }, + { + "epoch": 0.1313636772241033, + "grad_norm": 5.25404167175293, + "learning_rate": 1.9799731094887652e-07, + "loss": 0.0281, + "step": 32880 + }, + { + "epoch": 0.13140362968067998, + "grad_norm": 3.2685654163360596, + "learning_rate": 1.9799605857349633e-07, + "loss": 0.0187, + "step": 32890 + }, + { + "epoch": 0.13144358213725665, + "grad_norm": 2.465409755706787, + "learning_rate": 1.9799480581061726e-07, + "loss": 0.0244, + "step": 32900 + }, + { + "epoch": 0.13148353459383333, + "grad_norm": 2.1655080318450928, + "learning_rate": 1.9799355266024427e-07, + "loss": 0.022, + "step": 32910 + }, + { + "epoch": 0.13152348705041, + "grad_norm": 3.925851345062256, + "learning_rate": 1.979922991223823e-07, + "loss": 0.0286, + "step": 32920 + }, + { + "epoch": 0.13156343950698668, + "grad_norm": 3.74627423286438, + "learning_rate": 1.9799104519703632e-07, + "loss": 0.024, + "step": 32930 + }, + { + "epoch": 0.13160339196356335, + "grad_norm": 3.775543212890625, + "learning_rate": 1.9798979088421128e-07, + "loss": 0.0261, + "step": 32940 + }, + { + "epoch": 0.13164334442014003, + "grad_norm": 4.005124568939209, + "learning_rate": 1.9798853618391219e-07, + "loss": 0.0217, + "step": 32950 + }, + { + "epoch": 0.1316832968767167, + "grad_norm": 4.467537879943848, + "learning_rate": 1.979872810961439e-07, + "loss": 0.0217, + "step": 32960 + }, + { + "epoch": 0.13172324933329338, + "grad_norm": 12.713835716247559, + "learning_rate": 1.9798602562091151e-07, + "loss": 0.0196, + "step": 32970 + }, + { + "epoch": 0.13176320178987005, + "grad_norm": 3.981092691421509, + "learning_rate": 1.9798476975821987e-07, + "loss": 0.0287, + "step": 32980 + }, + { + "epoch": 0.13180315424644673, + "grad_norm": 7.8936991691589355, + "learning_rate": 1.9798351350807401e-07, + "loss": 0.0229, + "step": 32990 + }, + { + "epoch": 0.1318431067030234, + "grad_norm": 1.8003275394439697, + "learning_rate": 1.9798225687047885e-07, + "loss": 0.0224, + "step": 33000 + }, + { + "epoch": 0.13188305915960008, + "grad_norm": 5.02206563949585, + "learning_rate": 1.9798099984543941e-07, + "loss": 0.0258, + "step": 33010 + }, + { + "epoch": 0.13192301161617676, + "grad_norm": 3.5254955291748047, + "learning_rate": 1.9797974243296065e-07, + "loss": 0.0286, + "step": 33020 + }, + { + "epoch": 0.13196296407275343, + "grad_norm": 3.5721514225006104, + "learning_rate": 1.9797848463304753e-07, + "loss": 0.0267, + "step": 33030 + }, + { + "epoch": 0.1320029165293301, + "grad_norm": 2.7325944900512695, + "learning_rate": 1.97977226445705e-07, + "loss": 0.0193, + "step": 33040 + }, + { + "epoch": 0.13204286898590678, + "grad_norm": 4.630646228790283, + "learning_rate": 1.9797596787093805e-07, + "loss": 0.0276, + "step": 33050 + }, + { + "epoch": 0.13208282144248346, + "grad_norm": 4.239681720733643, + "learning_rate": 1.979747089087517e-07, + "loss": 0.0265, + "step": 33060 + }, + { + "epoch": 0.13212277389906013, + "grad_norm": 4.042301177978516, + "learning_rate": 1.9797344955915086e-07, + "loss": 0.0218, + "step": 33070 + }, + { + "epoch": 0.1321627263556368, + "grad_norm": 4.179530620574951, + "learning_rate": 1.979721898221406e-07, + "loss": 0.0238, + "step": 33080 + }, + { + "epoch": 0.13220267881221345, + "grad_norm": 3.612924575805664, + "learning_rate": 1.979709296977258e-07, + "loss": 0.0224, + "step": 33090 + }, + { + "epoch": 0.13224263126879013, + "grad_norm": 4.491313457489014, + "learning_rate": 1.9796966918591149e-07, + "loss": 0.0255, + "step": 33100 + }, + { + "epoch": 0.1322825837253668, + "grad_norm": 2.7060463428497314, + "learning_rate": 1.9796840828670267e-07, + "loss": 0.024, + "step": 33110 + }, + { + "epoch": 0.13232253618194348, + "grad_norm": 2.1088922023773193, + "learning_rate": 1.9796714700010428e-07, + "loss": 0.0249, + "step": 33120 + }, + { + "epoch": 0.13236248863852015, + "grad_norm": 4.20639705657959, + "learning_rate": 1.9796588532612136e-07, + "loss": 0.0217, + "step": 33130 + }, + { + "epoch": 0.13240244109509683, + "grad_norm": 5.098376274108887, + "learning_rate": 1.9796462326475886e-07, + "loss": 0.0228, + "step": 33140 + }, + { + "epoch": 0.1324423935516735, + "grad_norm": 8.907161712646484, + "learning_rate": 1.979633608160218e-07, + "loss": 0.0238, + "step": 33150 + }, + { + "epoch": 0.13248234600825018, + "grad_norm": 17.54938507080078, + "learning_rate": 1.9796209797991518e-07, + "loss": 0.0214, + "step": 33160 + }, + { + "epoch": 0.13252229846482685, + "grad_norm": 3.797881841659546, + "learning_rate": 1.9796083475644393e-07, + "loss": 0.0234, + "step": 33170 + }, + { + "epoch": 0.13256225092140353, + "grad_norm": 5.137433052062988, + "learning_rate": 1.9795957114561305e-07, + "loss": 0.026, + "step": 33180 + }, + { + "epoch": 0.1326022033779802, + "grad_norm": 1.4437425136566162, + "learning_rate": 1.9795830714742765e-07, + "loss": 0.0237, + "step": 33190 + }, + { + "epoch": 0.13264215583455688, + "grad_norm": 3.0079829692840576, + "learning_rate": 1.9795704276189263e-07, + "loss": 0.0249, + "step": 33200 + }, + { + "epoch": 0.13268210829113355, + "grad_norm": 2.43396258354187, + "learning_rate": 1.9795577798901297e-07, + "loss": 0.0267, + "step": 33210 + }, + { + "epoch": 0.13272206074771023, + "grad_norm": 16.676982879638672, + "learning_rate": 1.9795451282879374e-07, + "loss": 0.0256, + "step": 33220 + }, + { + "epoch": 0.1327620132042869, + "grad_norm": 5.135403156280518, + "learning_rate": 1.9795324728123992e-07, + "loss": 0.0233, + "step": 33230 + }, + { + "epoch": 0.13280196566086358, + "grad_norm": 3.605592727661133, + "learning_rate": 1.9795198134635652e-07, + "loss": 0.0231, + "step": 33240 + }, + { + "epoch": 0.13284191811744026, + "grad_norm": 4.659249782562256, + "learning_rate": 1.9795071502414848e-07, + "loss": 0.0219, + "step": 33250 + }, + { + "epoch": 0.13288187057401693, + "grad_norm": 5.028445243835449, + "learning_rate": 1.979494483146209e-07, + "loss": 0.0267, + "step": 33260 + }, + { + "epoch": 0.1329218230305936, + "grad_norm": 9.546895980834961, + "learning_rate": 1.9794818121777875e-07, + "loss": 0.028, + "step": 33270 + }, + { + "epoch": 0.13296177548717028, + "grad_norm": 4.557383060455322, + "learning_rate": 1.9794691373362704e-07, + "loss": 0.0246, + "step": 33280 + }, + { + "epoch": 0.13300172794374693, + "grad_norm": 2.36018443107605, + "learning_rate": 1.979456458621708e-07, + "loss": 0.0228, + "step": 33290 + }, + { + "epoch": 0.1330416804003236, + "grad_norm": 8.982595443725586, + "learning_rate": 1.97944377603415e-07, + "loss": 0.0253, + "step": 33300 + }, + { + "epoch": 0.13308163285690028, + "grad_norm": 4.4432196617126465, + "learning_rate": 1.9794310895736466e-07, + "loss": 0.0257, + "step": 33310 + }, + { + "epoch": 0.13312158531347695, + "grad_norm": 3.2304844856262207, + "learning_rate": 1.9794183992402487e-07, + "loss": 0.0254, + "step": 33320 + }, + { + "epoch": 0.13316153777005363, + "grad_norm": 1.8717305660247803, + "learning_rate": 1.9794057050340056e-07, + "loss": 0.0247, + "step": 33330 + }, + { + "epoch": 0.1332014902266303, + "grad_norm": 8.090788841247559, + "learning_rate": 1.979393006954968e-07, + "loss": 0.0248, + "step": 33340 + }, + { + "epoch": 0.13324144268320698, + "grad_norm": 4.081297874450684, + "learning_rate": 1.9793803050031858e-07, + "loss": 0.0211, + "step": 33350 + }, + { + "epoch": 0.13328139513978365, + "grad_norm": 3.4349403381347656, + "learning_rate": 1.9793675991787095e-07, + "loss": 0.0224, + "step": 33360 + }, + { + "epoch": 0.13332134759636033, + "grad_norm": 2.746399402618408, + "learning_rate": 1.9793548894815892e-07, + "loss": 0.0216, + "step": 33370 + }, + { + "epoch": 0.133361300052937, + "grad_norm": 2.549433708190918, + "learning_rate": 1.9793421759118752e-07, + "loss": 0.0188, + "step": 33380 + }, + { + "epoch": 0.13340125250951368, + "grad_norm": 5.604772090911865, + "learning_rate": 1.9793294584696176e-07, + "loss": 0.0272, + "step": 33390 + }, + { + "epoch": 0.13344120496609035, + "grad_norm": 6.113785266876221, + "learning_rate": 1.9793167371548673e-07, + "loss": 0.0259, + "step": 33400 + }, + { + "epoch": 0.13348115742266703, + "grad_norm": 5.2335591316223145, + "learning_rate": 1.979304011967674e-07, + "loss": 0.0231, + "step": 33410 + }, + { + "epoch": 0.1335211098792437, + "grad_norm": 5.006474494934082, + "learning_rate": 1.979291282908088e-07, + "loss": 0.0245, + "step": 33420 + }, + { + "epoch": 0.13356106233582038, + "grad_norm": 2.173447370529175, + "learning_rate": 1.9792785499761595e-07, + "loss": 0.0255, + "step": 33430 + }, + { + "epoch": 0.13360101479239705, + "grad_norm": 4.902973651885986, + "learning_rate": 1.9792658131719397e-07, + "loss": 0.0277, + "step": 33440 + }, + { + "epoch": 0.13364096724897373, + "grad_norm": 4.215675354003906, + "learning_rate": 1.9792530724954781e-07, + "loss": 0.0263, + "step": 33450 + }, + { + "epoch": 0.1336809197055504, + "grad_norm": 2.9482288360595703, + "learning_rate": 1.9792403279468256e-07, + "loss": 0.0242, + "step": 33460 + }, + { + "epoch": 0.13372087216212708, + "grad_norm": 4.267545700073242, + "learning_rate": 1.9792275795260323e-07, + "loss": 0.0275, + "step": 33470 + }, + { + "epoch": 0.13376082461870376, + "grad_norm": 3.1655261516571045, + "learning_rate": 1.9792148272331486e-07, + "loss": 0.0246, + "step": 33480 + }, + { + "epoch": 0.1338007770752804, + "grad_norm": 4.51795768737793, + "learning_rate": 1.979202071068225e-07, + "loss": 0.0255, + "step": 33490 + }, + { + "epoch": 0.13384072953185708, + "grad_norm": 9.998356819152832, + "learning_rate": 1.979189311031312e-07, + "loss": 0.0254, + "step": 33500 + }, + { + "epoch": 0.13388068198843375, + "grad_norm": 7.985647201538086, + "learning_rate": 1.9791765471224605e-07, + "loss": 0.028, + "step": 33510 + }, + { + "epoch": 0.13392063444501043, + "grad_norm": 7.032173156738281, + "learning_rate": 1.97916377934172e-07, + "loss": 0.0273, + "step": 33520 + }, + { + "epoch": 0.1339605869015871, + "grad_norm": 6.418251991271973, + "learning_rate": 1.9791510076891414e-07, + "loss": 0.0255, + "step": 33530 + }, + { + "epoch": 0.13400053935816378, + "grad_norm": 4.935680389404297, + "learning_rate": 1.9791382321647756e-07, + "loss": 0.0251, + "step": 33540 + }, + { + "epoch": 0.13404049181474045, + "grad_norm": 10.869719505310059, + "learning_rate": 1.9791254527686725e-07, + "loss": 0.0241, + "step": 33550 + }, + { + "epoch": 0.13408044427131713, + "grad_norm": 4.031705379486084, + "learning_rate": 1.9791126695008834e-07, + "loss": 0.0263, + "step": 33560 + }, + { + "epoch": 0.1341203967278938, + "grad_norm": 8.595075607299805, + "learning_rate": 1.9790998823614578e-07, + "loss": 0.0274, + "step": 33570 + }, + { + "epoch": 0.13416034918447048, + "grad_norm": 1.2283108234405518, + "learning_rate": 1.9790870913504472e-07, + "loss": 0.026, + "step": 33580 + }, + { + "epoch": 0.13420030164104715, + "grad_norm": 7.623051643371582, + "learning_rate": 1.9790742964679017e-07, + "loss": 0.0279, + "step": 33590 + }, + { + "epoch": 0.13424025409762383, + "grad_norm": 2.6161017417907715, + "learning_rate": 1.979061497713872e-07, + "loss": 0.026, + "step": 33600 + }, + { + "epoch": 0.1342802065542005, + "grad_norm": 5.169800281524658, + "learning_rate": 1.9790486950884088e-07, + "loss": 0.0306, + "step": 33610 + }, + { + "epoch": 0.13432015901077718, + "grad_norm": 2.8069727420806885, + "learning_rate": 1.9790358885915625e-07, + "loss": 0.0296, + "step": 33620 + }, + { + "epoch": 0.13436011146735385, + "grad_norm": 1.5143392086029053, + "learning_rate": 1.9790230782233844e-07, + "loss": 0.0226, + "step": 33630 + }, + { + "epoch": 0.13440006392393053, + "grad_norm": 4.213233470916748, + "learning_rate": 1.979010263983924e-07, + "loss": 0.0255, + "step": 33640 + }, + { + "epoch": 0.1344400163805072, + "grad_norm": 2.783130407333374, + "learning_rate": 1.978997445873233e-07, + "loss": 0.0267, + "step": 33650 + }, + { + "epoch": 0.13447996883708388, + "grad_norm": 1.7708677053451538, + "learning_rate": 1.9789846238913617e-07, + "loss": 0.0207, + "step": 33660 + }, + { + "epoch": 0.13451992129366055, + "grad_norm": 5.229522228240967, + "learning_rate": 1.9789717980383604e-07, + "loss": 0.0249, + "step": 33670 + }, + { + "epoch": 0.13455987375023723, + "grad_norm": 6.776401996612549, + "learning_rate": 1.9789589683142807e-07, + "loss": 0.0305, + "step": 33680 + }, + { + "epoch": 0.1345998262068139, + "grad_norm": 4.164223670959473, + "learning_rate": 1.9789461347191727e-07, + "loss": 0.0243, + "step": 33690 + }, + { + "epoch": 0.13463977866339055, + "grad_norm": 2.3928751945495605, + "learning_rate": 1.978933297253087e-07, + "loss": 0.0175, + "step": 33700 + }, + { + "epoch": 0.13467973111996723, + "grad_norm": 4.023430824279785, + "learning_rate": 1.978920455916075e-07, + "loss": 0.026, + "step": 33710 + }, + { + "epoch": 0.1347196835765439, + "grad_norm": 4.418040752410889, + "learning_rate": 1.978907610708187e-07, + "loss": 0.0281, + "step": 33720 + }, + { + "epoch": 0.13475963603312058, + "grad_norm": 8.485740661621094, + "learning_rate": 1.978894761629474e-07, + "loss": 0.0235, + "step": 33730 + }, + { + "epoch": 0.13479958848969725, + "grad_norm": 2.422381639480591, + "learning_rate": 1.9788819086799866e-07, + "loss": 0.0196, + "step": 33740 + }, + { + "epoch": 0.13483954094627393, + "grad_norm": 3.516059160232544, + "learning_rate": 1.978869051859776e-07, + "loss": 0.0266, + "step": 33750 + }, + { + "epoch": 0.1348794934028506, + "grad_norm": 5.098599433898926, + "learning_rate": 1.9788561911688925e-07, + "loss": 0.0305, + "step": 33760 + }, + { + "epoch": 0.13491944585942728, + "grad_norm": 4.108199596405029, + "learning_rate": 1.9788433266073876e-07, + "loss": 0.0247, + "step": 33770 + }, + { + "epoch": 0.13495939831600395, + "grad_norm": 8.31602954864502, + "learning_rate": 1.9788304581753116e-07, + "loss": 0.0224, + "step": 33780 + }, + { + "epoch": 0.13499935077258063, + "grad_norm": 4.407034873962402, + "learning_rate": 1.9788175858727157e-07, + "loss": 0.0257, + "step": 33790 + }, + { + "epoch": 0.1350393032291573, + "grad_norm": 4.409352779388428, + "learning_rate": 1.9788047096996507e-07, + "loss": 0.0251, + "step": 33800 + }, + { + "epoch": 0.13507925568573398, + "grad_norm": 4.270468235015869, + "learning_rate": 1.9787918296561673e-07, + "loss": 0.0213, + "step": 33810 + }, + { + "epoch": 0.13511920814231065, + "grad_norm": 3.4814045429229736, + "learning_rate": 1.978778945742317e-07, + "loss": 0.0231, + "step": 33820 + }, + { + "epoch": 0.13515916059888733, + "grad_norm": 3.182652473449707, + "learning_rate": 1.9787660579581505e-07, + "loss": 0.0276, + "step": 33830 + }, + { + "epoch": 0.135199113055464, + "grad_norm": 3.9483065605163574, + "learning_rate": 1.9787531663037182e-07, + "loss": 0.0223, + "step": 33840 + }, + { + "epoch": 0.13523906551204068, + "grad_norm": 6.127503871917725, + "learning_rate": 1.9787402707790716e-07, + "loss": 0.0275, + "step": 33850 + }, + { + "epoch": 0.13527901796861735, + "grad_norm": 4.041313648223877, + "learning_rate": 1.9787273713842621e-07, + "loss": 0.0231, + "step": 33860 + }, + { + "epoch": 0.13531897042519403, + "grad_norm": 4.70305871963501, + "learning_rate": 1.9787144681193398e-07, + "loss": 0.0228, + "step": 33870 + }, + { + "epoch": 0.1353589228817707, + "grad_norm": 1.435410737991333, + "learning_rate": 1.9787015609843563e-07, + "loss": 0.0231, + "step": 33880 + }, + { + "epoch": 0.13539887533834738, + "grad_norm": 2.348294496536255, + "learning_rate": 1.9786886499793623e-07, + "loss": 0.0272, + "step": 33890 + }, + { + "epoch": 0.13543882779492403, + "grad_norm": 2.7004892826080322, + "learning_rate": 1.9786757351044092e-07, + "loss": 0.0275, + "step": 33900 + }, + { + "epoch": 0.1354787802515007, + "grad_norm": 3.22611665725708, + "learning_rate": 1.9786628163595482e-07, + "loss": 0.0263, + "step": 33910 + }, + { + "epoch": 0.13551873270807738, + "grad_norm": 4.899319171905518, + "learning_rate": 1.9786498937448297e-07, + "loss": 0.0264, + "step": 33920 + }, + { + "epoch": 0.13555868516465405, + "grad_norm": 5.345300674438477, + "learning_rate": 1.9786369672603055e-07, + "loss": 0.0253, + "step": 33930 + }, + { + "epoch": 0.13559863762123073, + "grad_norm": 4.673030853271484, + "learning_rate": 1.9786240369060262e-07, + "loss": 0.0242, + "step": 33940 + }, + { + "epoch": 0.1356385900778074, + "grad_norm": 3.286438226699829, + "learning_rate": 1.978611102682043e-07, + "loss": 0.0226, + "step": 33950 + }, + { + "epoch": 0.13567854253438408, + "grad_norm": 3.3766379356384277, + "learning_rate": 1.9785981645884076e-07, + "loss": 0.0215, + "step": 33960 + }, + { + "epoch": 0.13571849499096075, + "grad_norm": 13.05203914642334, + "learning_rate": 1.9785852226251704e-07, + "loss": 0.0213, + "step": 33970 + }, + { + "epoch": 0.13575844744753743, + "grad_norm": 10.16220474243164, + "learning_rate": 1.9785722767923831e-07, + "loss": 0.0218, + "step": 33980 + }, + { + "epoch": 0.1357983999041141, + "grad_norm": 8.076005935668945, + "learning_rate": 1.9785593270900968e-07, + "loss": 0.0262, + "step": 33990 + }, + { + "epoch": 0.13583835236069078, + "grad_norm": 3.8977084159851074, + "learning_rate": 1.9785463735183626e-07, + "loss": 0.0224, + "step": 34000 + }, + { + "epoch": 0.13587830481726745, + "grad_norm": 8.993185043334961, + "learning_rate": 1.9785334160772315e-07, + "loss": 0.0279, + "step": 34010 + }, + { + "epoch": 0.13591825727384413, + "grad_norm": 4.108241081237793, + "learning_rate": 1.9785204547667554e-07, + "loss": 0.0287, + "step": 34020 + }, + { + "epoch": 0.1359582097304208, + "grad_norm": 3.756319999694824, + "learning_rate": 1.9785074895869847e-07, + "loss": 0.0271, + "step": 34030 + }, + { + "epoch": 0.13599816218699748, + "grad_norm": 3.838879346847534, + "learning_rate": 1.9784945205379714e-07, + "loss": 0.0225, + "step": 34040 + }, + { + "epoch": 0.13603811464357415, + "grad_norm": 12.038919448852539, + "learning_rate": 1.9784815476197661e-07, + "loss": 0.0241, + "step": 34050 + }, + { + "epoch": 0.13607806710015083, + "grad_norm": 4.963292598724365, + "learning_rate": 1.9784685708324208e-07, + "loss": 0.0256, + "step": 34060 + }, + { + "epoch": 0.1361180195567275, + "grad_norm": 2.3073813915252686, + "learning_rate": 1.9784555901759864e-07, + "loss": 0.0256, + "step": 34070 + }, + { + "epoch": 0.13615797201330418, + "grad_norm": 2.329432964324951, + "learning_rate": 1.9784426056505144e-07, + "loss": 0.0239, + "step": 34080 + }, + { + "epoch": 0.13619792446988085, + "grad_norm": 14.714067459106445, + "learning_rate": 1.978429617256056e-07, + "loss": 0.0232, + "step": 34090 + }, + { + "epoch": 0.1362378769264575, + "grad_norm": 7.751311302185059, + "learning_rate": 1.9784166249926628e-07, + "loss": 0.0237, + "step": 34100 + }, + { + "epoch": 0.13627782938303418, + "grad_norm": 49.20457458496094, + "learning_rate": 1.9784036288603857e-07, + "loss": 0.0277, + "step": 34110 + }, + { + "epoch": 0.13631778183961085, + "grad_norm": 3.3991212844848633, + "learning_rate": 1.9783906288592766e-07, + "loss": 0.0234, + "step": 34120 + }, + { + "epoch": 0.13635773429618753, + "grad_norm": 1.5460011959075928, + "learning_rate": 1.9783776249893868e-07, + "loss": 0.0241, + "step": 34130 + }, + { + "epoch": 0.1363976867527642, + "grad_norm": 7.2543044090271, + "learning_rate": 1.9783646172507672e-07, + "loss": 0.0221, + "step": 34140 + }, + { + "epoch": 0.13643763920934088, + "grad_norm": 6.021000862121582, + "learning_rate": 1.97835160564347e-07, + "loss": 0.0266, + "step": 34150 + }, + { + "epoch": 0.13647759166591755, + "grad_norm": 4.147846698760986, + "learning_rate": 1.9783385901675462e-07, + "loss": 0.0246, + "step": 34160 + }, + { + "epoch": 0.13651754412249423, + "grad_norm": 7.097258567810059, + "learning_rate": 1.9783255708230474e-07, + "loss": 0.024, + "step": 34170 + }, + { + "epoch": 0.1365574965790709, + "grad_norm": 4.486091613769531, + "learning_rate": 1.978312547610025e-07, + "loss": 0.0232, + "step": 34180 + }, + { + "epoch": 0.13659744903564758, + "grad_norm": 3.744936466217041, + "learning_rate": 1.9782995205285306e-07, + "loss": 0.0292, + "step": 34190 + }, + { + "epoch": 0.13663740149222425, + "grad_norm": 2.6164772510528564, + "learning_rate": 1.9782864895786155e-07, + "loss": 0.0248, + "step": 34200 + }, + { + "epoch": 0.13667735394880093, + "grad_norm": 2.4239611625671387, + "learning_rate": 1.9782734547603317e-07, + "loss": 0.0242, + "step": 34210 + }, + { + "epoch": 0.1367173064053776, + "grad_norm": 2.2157959938049316, + "learning_rate": 1.9782604160737303e-07, + "loss": 0.023, + "step": 34220 + }, + { + "epoch": 0.13675725886195428, + "grad_norm": 3.1903045177459717, + "learning_rate": 1.978247373518863e-07, + "loss": 0.0272, + "step": 34230 + }, + { + "epoch": 0.13679721131853095, + "grad_norm": 5.019002437591553, + "learning_rate": 1.978234327095781e-07, + "loss": 0.0235, + "step": 34240 + }, + { + "epoch": 0.13683716377510763, + "grad_norm": 7.767727375030518, + "learning_rate": 1.9782212768045364e-07, + "loss": 0.024, + "step": 34250 + }, + { + "epoch": 0.1368771162316843, + "grad_norm": 3.0759072303771973, + "learning_rate": 1.9782082226451812e-07, + "loss": 0.024, + "step": 34260 + }, + { + "epoch": 0.13691706868826098, + "grad_norm": 11.949503898620605, + "learning_rate": 1.9781951646177658e-07, + "loss": 0.0244, + "step": 34270 + }, + { + "epoch": 0.13695702114483765, + "grad_norm": 5.2979326248168945, + "learning_rate": 1.978182102722343e-07, + "loss": 0.0265, + "step": 34280 + }, + { + "epoch": 0.13699697360141433, + "grad_norm": 5.116485595703125, + "learning_rate": 1.9781690369589635e-07, + "loss": 0.0276, + "step": 34290 + }, + { + "epoch": 0.137036926057991, + "grad_norm": 3.1615800857543945, + "learning_rate": 1.9781559673276798e-07, + "loss": 0.0217, + "step": 34300 + }, + { + "epoch": 0.13707687851456765, + "grad_norm": 4.321274280548096, + "learning_rate": 1.978142893828543e-07, + "loss": 0.0238, + "step": 34310 + }, + { + "epoch": 0.13711683097114433, + "grad_norm": 3.603127956390381, + "learning_rate": 1.978129816461605e-07, + "loss": 0.0231, + "step": 34320 + }, + { + "epoch": 0.137156783427721, + "grad_norm": 3.039100408554077, + "learning_rate": 1.9781167352269175e-07, + "loss": 0.0219, + "step": 34330 + }, + { + "epoch": 0.13719673588429768, + "grad_norm": 2.377878189086914, + "learning_rate": 1.9781036501245323e-07, + "loss": 0.0234, + "step": 34340 + }, + { + "epoch": 0.13723668834087435, + "grad_norm": 2.7062172889709473, + "learning_rate": 1.978090561154501e-07, + "loss": 0.0234, + "step": 34350 + }, + { + "epoch": 0.13727664079745103, + "grad_norm": 2.556090831756592, + "learning_rate": 1.9780774683168754e-07, + "loss": 0.025, + "step": 34360 + }, + { + "epoch": 0.1373165932540277, + "grad_norm": 2.7142975330352783, + "learning_rate": 1.9780643716117074e-07, + "loss": 0.0161, + "step": 34370 + }, + { + "epoch": 0.13735654571060438, + "grad_norm": 7.1128058433532715, + "learning_rate": 1.9780512710390487e-07, + "loss": 0.0227, + "step": 34380 + }, + { + "epoch": 0.13739649816718105, + "grad_norm": 7.993420124053955, + "learning_rate": 1.978038166598951e-07, + "loss": 0.0286, + "step": 34390 + }, + { + "epoch": 0.13743645062375773, + "grad_norm": 3.6655147075653076, + "learning_rate": 1.9780250582914665e-07, + "loss": 0.0216, + "step": 34400 + }, + { + "epoch": 0.1374764030803344, + "grad_norm": 4.35040283203125, + "learning_rate": 1.9780119461166462e-07, + "loss": 0.0208, + "step": 34410 + }, + { + "epoch": 0.13751635553691108, + "grad_norm": 3.196428060531616, + "learning_rate": 1.977998830074543e-07, + "loss": 0.024, + "step": 34420 + }, + { + "epoch": 0.13755630799348775, + "grad_norm": 2.0085225105285645, + "learning_rate": 1.977985710165208e-07, + "loss": 0.0305, + "step": 34430 + }, + { + "epoch": 0.13759626045006443, + "grad_norm": 5.28802490234375, + "learning_rate": 1.9779725863886932e-07, + "loss": 0.0255, + "step": 34440 + }, + { + "epoch": 0.1376362129066411, + "grad_norm": 2.055100917816162, + "learning_rate": 1.977959458745051e-07, + "loss": 0.0226, + "step": 34450 + }, + { + "epoch": 0.13767616536321778, + "grad_norm": 3.566770553588867, + "learning_rate": 1.9779463272343328e-07, + "loss": 0.0252, + "step": 34460 + }, + { + "epoch": 0.13771611781979445, + "grad_norm": 7.842406272888184, + "learning_rate": 1.9779331918565906e-07, + "loss": 0.0273, + "step": 34470 + }, + { + "epoch": 0.13775607027637113, + "grad_norm": 4.231375217437744, + "learning_rate": 1.9779200526118766e-07, + "loss": 0.0291, + "step": 34480 + }, + { + "epoch": 0.1377960227329478, + "grad_norm": 1.8254448175430298, + "learning_rate": 1.9779069095002423e-07, + "loss": 0.0185, + "step": 34490 + }, + { + "epoch": 0.13783597518952448, + "grad_norm": 2.067455768585205, + "learning_rate": 1.97789376252174e-07, + "loss": 0.0231, + "step": 34500 + }, + { + "epoch": 0.13787592764610113, + "grad_norm": 1.8332295417785645, + "learning_rate": 1.9778806116764217e-07, + "loss": 0.0275, + "step": 34510 + }, + { + "epoch": 0.1379158801026778, + "grad_norm": 8.138545989990234, + "learning_rate": 1.9778674569643395e-07, + "loss": 0.0266, + "step": 34520 + }, + { + "epoch": 0.13795583255925448, + "grad_norm": 1.7563450336456299, + "learning_rate": 1.9778542983855448e-07, + "loss": 0.0254, + "step": 34530 + }, + { + "epoch": 0.13799578501583115, + "grad_norm": 0.7347117066383362, + "learning_rate": 1.9778411359400904e-07, + "loss": 0.0189, + "step": 34540 + }, + { + "epoch": 0.13803573747240783, + "grad_norm": 12.002326011657715, + "learning_rate": 1.9778279696280278e-07, + "loss": 0.0257, + "step": 34550 + }, + { + "epoch": 0.1380756899289845, + "grad_norm": 4.919247627258301, + "learning_rate": 1.977814799449409e-07, + "loss": 0.0274, + "step": 34560 + }, + { + "epoch": 0.13811564238556118, + "grad_norm": 2.2084972858428955, + "learning_rate": 1.9778016254042872e-07, + "loss": 0.0271, + "step": 34570 + }, + { + "epoch": 0.13815559484213785, + "grad_norm": 3.7197561264038086, + "learning_rate": 1.977788447492713e-07, + "loss": 0.0279, + "step": 34580 + }, + { + "epoch": 0.13819554729871453, + "grad_norm": 4.722288608551025, + "learning_rate": 1.9777752657147393e-07, + "loss": 0.0224, + "step": 34590 + }, + { + "epoch": 0.1382354997552912, + "grad_norm": 3.0534090995788574, + "learning_rate": 1.9777620800704183e-07, + "loss": 0.0214, + "step": 34600 + }, + { + "epoch": 0.13827545221186788, + "grad_norm": 2.9459753036499023, + "learning_rate": 1.9777488905598017e-07, + "loss": 0.023, + "step": 34610 + }, + { + "epoch": 0.13831540466844455, + "grad_norm": 1.8070234060287476, + "learning_rate": 1.977735697182942e-07, + "loss": 0.0306, + "step": 34620 + }, + { + "epoch": 0.13835535712502123, + "grad_norm": 4.567336559295654, + "learning_rate": 1.9777224999398912e-07, + "loss": 0.0246, + "step": 34630 + }, + { + "epoch": 0.1383953095815979, + "grad_norm": 2.0939767360687256, + "learning_rate": 1.9777092988307013e-07, + "loss": 0.0248, + "step": 34640 + }, + { + "epoch": 0.13843526203817458, + "grad_norm": 3.6276209354400635, + "learning_rate": 1.977696093855425e-07, + "loss": 0.0243, + "step": 34650 + }, + { + "epoch": 0.13847521449475125, + "grad_norm": 3.861079454421997, + "learning_rate": 1.977682885014114e-07, + "loss": 0.0259, + "step": 34660 + }, + { + "epoch": 0.13851516695132793, + "grad_norm": 2.492488145828247, + "learning_rate": 1.977669672306821e-07, + "loss": 0.0257, + "step": 34670 + }, + { + "epoch": 0.1385551194079046, + "grad_norm": 1.321135401725769, + "learning_rate": 1.9776564557335979e-07, + "loss": 0.0215, + "step": 34680 + }, + { + "epoch": 0.13859507186448128, + "grad_norm": 4.732155799865723, + "learning_rate": 1.9776432352944973e-07, + "loss": 0.0272, + "step": 34690 + }, + { + "epoch": 0.13863502432105795, + "grad_norm": 1.5884151458740234, + "learning_rate": 1.9776300109895708e-07, + "loss": 0.0277, + "step": 34700 + }, + { + "epoch": 0.1386749767776346, + "grad_norm": 2.6321635246276855, + "learning_rate": 1.9776167828188716e-07, + "loss": 0.0224, + "step": 34710 + }, + { + "epoch": 0.13871492923421128, + "grad_norm": 3.8346407413482666, + "learning_rate": 1.9776035507824513e-07, + "loss": 0.0248, + "step": 34720 + }, + { + "epoch": 0.13875488169078795, + "grad_norm": 1.5566445589065552, + "learning_rate": 1.9775903148803627e-07, + "loss": 0.0215, + "step": 34730 + }, + { + "epoch": 0.13879483414736463, + "grad_norm": 2.3568897247314453, + "learning_rate": 1.977577075112658e-07, + "loss": 0.0264, + "step": 34740 + }, + { + "epoch": 0.1388347866039413, + "grad_norm": 5.923532009124756, + "learning_rate": 1.977563831479389e-07, + "loss": 0.0234, + "step": 34750 + }, + { + "epoch": 0.13887473906051798, + "grad_norm": 5.066998481750488, + "learning_rate": 1.977550583980609e-07, + "loss": 0.0208, + "step": 34760 + }, + { + "epoch": 0.13891469151709465, + "grad_norm": 4.441910743713379, + "learning_rate": 1.9775373326163699e-07, + "loss": 0.0234, + "step": 34770 + }, + { + "epoch": 0.13895464397367133, + "grad_norm": 6.651628494262695, + "learning_rate": 1.977524077386724e-07, + "loss": 0.0239, + "step": 34780 + }, + { + "epoch": 0.138994596430248, + "grad_norm": 3.639585256576538, + "learning_rate": 1.9775108182917237e-07, + "loss": 0.0204, + "step": 34790 + }, + { + "epoch": 0.13903454888682468, + "grad_norm": 2.8234260082244873, + "learning_rate": 1.9774975553314217e-07, + "loss": 0.0275, + "step": 34800 + }, + { + "epoch": 0.13907450134340135, + "grad_norm": 3.0938332080841064, + "learning_rate": 1.9774842885058701e-07, + "loss": 0.0234, + "step": 34810 + }, + { + "epoch": 0.13911445379997803, + "grad_norm": 4.00971794128418, + "learning_rate": 1.977471017815122e-07, + "loss": 0.0244, + "step": 34820 + }, + { + "epoch": 0.1391544062565547, + "grad_norm": 3.401770830154419, + "learning_rate": 1.977457743259229e-07, + "loss": 0.0205, + "step": 34830 + }, + { + "epoch": 0.13919435871313138, + "grad_norm": 4.151263236999512, + "learning_rate": 1.9774444648382446e-07, + "loss": 0.0223, + "step": 34840 + }, + { + "epoch": 0.13923431116970805, + "grad_norm": 6.861757755279541, + "learning_rate": 1.9774311825522202e-07, + "loss": 0.0279, + "step": 34850 + }, + { + "epoch": 0.13927426362628473, + "grad_norm": 9.144564628601074, + "learning_rate": 1.977417896401209e-07, + "loss": 0.0283, + "step": 34860 + }, + { + "epoch": 0.1393142160828614, + "grad_norm": 14.355792999267578, + "learning_rate": 1.9774046063852636e-07, + "loss": 0.0235, + "step": 34870 + }, + { + "epoch": 0.13935416853943808, + "grad_norm": 5.097663402557373, + "learning_rate": 1.977391312504436e-07, + "loss": 0.0241, + "step": 34880 + }, + { + "epoch": 0.13939412099601475, + "grad_norm": 4.94574499130249, + "learning_rate": 1.9773780147587792e-07, + "loss": 0.0266, + "step": 34890 + }, + { + "epoch": 0.13943407345259143, + "grad_norm": 3.3217549324035645, + "learning_rate": 1.977364713148346e-07, + "loss": 0.0304, + "step": 34900 + }, + { + "epoch": 0.1394740259091681, + "grad_norm": 1.2836613655090332, + "learning_rate": 1.9773514076731886e-07, + "loss": 0.0241, + "step": 34910 + }, + { + "epoch": 0.13951397836574475, + "grad_norm": 4.282000541687012, + "learning_rate": 1.9773380983333596e-07, + "loss": 0.0236, + "step": 34920 + }, + { + "epoch": 0.13955393082232143, + "grad_norm": 2.6215388774871826, + "learning_rate": 1.9773247851289118e-07, + "loss": 0.022, + "step": 34930 + }, + { + "epoch": 0.1395938832788981, + "grad_norm": 3.7930266857147217, + "learning_rate": 1.977311468059898e-07, + "loss": 0.0225, + "step": 34940 + }, + { + "epoch": 0.13963383573547478, + "grad_norm": 7.174178600311279, + "learning_rate": 1.9772981471263703e-07, + "loss": 0.0284, + "step": 34950 + }, + { + "epoch": 0.13967378819205145, + "grad_norm": 2.5713837146759033, + "learning_rate": 1.9772848223283817e-07, + "loss": 0.0208, + "step": 34960 + }, + { + "epoch": 0.13971374064862813, + "grad_norm": 5.751051902770996, + "learning_rate": 1.977271493665985e-07, + "loss": 0.0242, + "step": 34970 + }, + { + "epoch": 0.1397536931052048, + "grad_norm": 4.3649797439575195, + "learning_rate": 1.9772581611392329e-07, + "loss": 0.0218, + "step": 34980 + }, + { + "epoch": 0.13979364556178148, + "grad_norm": 1.6961451768875122, + "learning_rate": 1.9772448247481782e-07, + "loss": 0.0177, + "step": 34990 + }, + { + "epoch": 0.13983359801835815, + "grad_norm": 4.921760559082031, + "learning_rate": 1.977231484492873e-07, + "loss": 0.0256, + "step": 35000 + }, + { + "epoch": 0.13987355047493483, + "grad_norm": 6.12150764465332, + "learning_rate": 1.977218140373371e-07, + "loss": 0.0243, + "step": 35010 + }, + { + "epoch": 0.1399135029315115, + "grad_norm": 2.8453853130340576, + "learning_rate": 1.9772047923897244e-07, + "loss": 0.0247, + "step": 35020 + }, + { + "epoch": 0.13995345538808818, + "grad_norm": 5.0122175216674805, + "learning_rate": 1.9771914405419859e-07, + "loss": 0.0285, + "step": 35030 + }, + { + "epoch": 0.13999340784466485, + "grad_norm": 14.428194046020508, + "learning_rate": 1.9771780848302084e-07, + "loss": 0.0266, + "step": 35040 + }, + { + "epoch": 0.14003336030124153, + "grad_norm": 7.570789337158203, + "learning_rate": 1.977164725254445e-07, + "loss": 0.0251, + "step": 35050 + }, + { + "epoch": 0.1400733127578182, + "grad_norm": 5.156691074371338, + "learning_rate": 1.977151361814748e-07, + "loss": 0.024, + "step": 35060 + }, + { + "epoch": 0.14011326521439488, + "grad_norm": 4.117483615875244, + "learning_rate": 1.9771379945111705e-07, + "loss": 0.0225, + "step": 35070 + }, + { + "epoch": 0.14015321767097155, + "grad_norm": 2.8584911823272705, + "learning_rate": 1.9771246233437658e-07, + "loss": 0.0253, + "step": 35080 + }, + { + "epoch": 0.14019317012754823, + "grad_norm": 2.170681953430176, + "learning_rate": 1.9771112483125862e-07, + "loss": 0.0222, + "step": 35090 + }, + { + "epoch": 0.1402331225841249, + "grad_norm": 2.5863726139068604, + "learning_rate": 1.9770978694176845e-07, + "loss": 0.0223, + "step": 35100 + }, + { + "epoch": 0.14027307504070158, + "grad_norm": 3.708726644515991, + "learning_rate": 1.9770844866591144e-07, + "loss": 0.0331, + "step": 35110 + }, + { + "epoch": 0.14031302749727823, + "grad_norm": 1.9058455228805542, + "learning_rate": 1.9770711000369279e-07, + "loss": 0.0233, + "step": 35120 + }, + { + "epoch": 0.1403529799538549, + "grad_norm": 9.438362121582031, + "learning_rate": 1.9770577095511782e-07, + "loss": 0.0248, + "step": 35130 + }, + { + "epoch": 0.14039293241043158, + "grad_norm": 2.2214455604553223, + "learning_rate": 1.9770443152019185e-07, + "loss": 0.0225, + "step": 35140 + }, + { + "epoch": 0.14043288486700825, + "grad_norm": 4.088798999786377, + "learning_rate": 1.9770309169892018e-07, + "loss": 0.0251, + "step": 35150 + }, + { + "epoch": 0.14047283732358493, + "grad_norm": 3.4400503635406494, + "learning_rate": 1.9770175149130804e-07, + "loss": 0.0243, + "step": 35160 + }, + { + "epoch": 0.1405127897801616, + "grad_norm": 6.273932456970215, + "learning_rate": 1.9770041089736085e-07, + "loss": 0.0232, + "step": 35170 + }, + { + "epoch": 0.14055274223673828, + "grad_norm": 3.314335823059082, + "learning_rate": 1.976990699170838e-07, + "loss": 0.0226, + "step": 35180 + }, + { + "epoch": 0.14059269469331495, + "grad_norm": 6.193079948425293, + "learning_rate": 1.9769772855048226e-07, + "loss": 0.0241, + "step": 35190 + }, + { + "epoch": 0.14063264714989163, + "grad_norm": 3.0378224849700928, + "learning_rate": 1.9769638679756148e-07, + "loss": 0.0236, + "step": 35200 + }, + { + "epoch": 0.1406725996064683, + "grad_norm": 5.730050086975098, + "learning_rate": 1.9769504465832682e-07, + "loss": 0.0261, + "step": 35210 + }, + { + "epoch": 0.14071255206304498, + "grad_norm": 15.591375350952148, + "learning_rate": 1.9769370213278353e-07, + "loss": 0.0238, + "step": 35220 + }, + { + "epoch": 0.14075250451962165, + "grad_norm": 6.681907653808594, + "learning_rate": 1.9769235922093698e-07, + "loss": 0.0263, + "step": 35230 + }, + { + "epoch": 0.14079245697619833, + "grad_norm": 5.357916831970215, + "learning_rate": 1.9769101592279242e-07, + "loss": 0.0253, + "step": 35240 + }, + { + "epoch": 0.140832409432775, + "grad_norm": 4.239778995513916, + "learning_rate": 1.976896722383552e-07, + "loss": 0.0229, + "step": 35250 + }, + { + "epoch": 0.14087236188935168, + "grad_norm": 10.143454551696777, + "learning_rate": 1.9768832816763061e-07, + "loss": 0.0166, + "step": 35260 + }, + { + "epoch": 0.14091231434592835, + "grad_norm": 7.376945972442627, + "learning_rate": 1.9768698371062402e-07, + "loss": 0.023, + "step": 35270 + }, + { + "epoch": 0.14095226680250503, + "grad_norm": 3.8397536277770996, + "learning_rate": 1.976856388673407e-07, + "loss": 0.0251, + "step": 35280 + }, + { + "epoch": 0.1409922192590817, + "grad_norm": 3.7506468296051025, + "learning_rate": 1.9768429363778594e-07, + "loss": 0.0213, + "step": 35290 + }, + { + "epoch": 0.14103217171565838, + "grad_norm": 4.126847267150879, + "learning_rate": 1.976829480219651e-07, + "loss": 0.0203, + "step": 35300 + }, + { + "epoch": 0.14107212417223505, + "grad_norm": 5.279444694519043, + "learning_rate": 1.976816020198835e-07, + "loss": 0.023, + "step": 35310 + }, + { + "epoch": 0.14111207662881173, + "grad_norm": 5.017458915710449, + "learning_rate": 1.9768025563154647e-07, + "loss": 0.0212, + "step": 35320 + }, + { + "epoch": 0.14115202908538838, + "grad_norm": 1.1486772298812866, + "learning_rate": 1.9767890885695932e-07, + "loss": 0.0248, + "step": 35330 + }, + { + "epoch": 0.14119198154196505, + "grad_norm": 4.644484043121338, + "learning_rate": 1.9767756169612736e-07, + "loss": 0.0259, + "step": 35340 + }, + { + "epoch": 0.14123193399854173, + "grad_norm": 2.8517022132873535, + "learning_rate": 1.9767621414905592e-07, + "loss": 0.0274, + "step": 35350 + }, + { + "epoch": 0.1412718864551184, + "grad_norm": 4.328672409057617, + "learning_rate": 1.9767486621575035e-07, + "loss": 0.021, + "step": 35360 + }, + { + "epoch": 0.14131183891169508, + "grad_norm": 3.0443246364593506, + "learning_rate": 1.97673517896216e-07, + "loss": 0.0245, + "step": 35370 + }, + { + "epoch": 0.14135179136827175, + "grad_norm": 3.9364864826202393, + "learning_rate": 1.9767216919045814e-07, + "loss": 0.0267, + "step": 35380 + }, + { + "epoch": 0.14139174382484843, + "grad_norm": 4.698602676391602, + "learning_rate": 1.9767082009848214e-07, + "loss": 0.0239, + "step": 35390 + }, + { + "epoch": 0.1414316962814251, + "grad_norm": 6.075052261352539, + "learning_rate": 1.9766947062029332e-07, + "loss": 0.0221, + "step": 35400 + }, + { + "epoch": 0.14147164873800178, + "grad_norm": 2.6886587142944336, + "learning_rate": 1.9766812075589705e-07, + "loss": 0.0233, + "step": 35410 + }, + { + "epoch": 0.14151160119457845, + "grad_norm": 1.6127032041549683, + "learning_rate": 1.9766677050529862e-07, + "loss": 0.0237, + "step": 35420 + }, + { + "epoch": 0.14155155365115513, + "grad_norm": 4.187127590179443, + "learning_rate": 1.9766541986850343e-07, + "loss": 0.0249, + "step": 35430 + }, + { + "epoch": 0.1415915061077318, + "grad_norm": 2.0121591091156006, + "learning_rate": 1.9766406884551676e-07, + "loss": 0.0233, + "step": 35440 + }, + { + "epoch": 0.14163145856430848, + "grad_norm": 5.682636737823486, + "learning_rate": 1.9766271743634396e-07, + "loss": 0.0211, + "step": 35450 + }, + { + "epoch": 0.14167141102088515, + "grad_norm": 3.7941393852233887, + "learning_rate": 1.976613656409904e-07, + "loss": 0.0213, + "step": 35460 + }, + { + "epoch": 0.14171136347746183, + "grad_norm": 2.2896342277526855, + "learning_rate": 1.9766001345946143e-07, + "loss": 0.0244, + "step": 35470 + }, + { + "epoch": 0.1417513159340385, + "grad_norm": 8.265035629272461, + "learning_rate": 1.976586608917624e-07, + "loss": 0.0287, + "step": 35480 + }, + { + "epoch": 0.14179126839061518, + "grad_norm": 10.989958763122559, + "learning_rate": 1.976573079378986e-07, + "loss": 0.0228, + "step": 35490 + }, + { + "epoch": 0.14183122084719185, + "grad_norm": 1.7070534229278564, + "learning_rate": 1.9765595459787542e-07, + "loss": 0.0234, + "step": 35500 + }, + { + "epoch": 0.14187117330376853, + "grad_norm": 2.880136013031006, + "learning_rate": 1.9765460087169824e-07, + "loss": 0.028, + "step": 35510 + }, + { + "epoch": 0.1419111257603452, + "grad_norm": 3.3386669158935547, + "learning_rate": 1.976532467593724e-07, + "loss": 0.0262, + "step": 35520 + }, + { + "epoch": 0.14195107821692185, + "grad_norm": 3.1386213302612305, + "learning_rate": 1.9765189226090318e-07, + "loss": 0.024, + "step": 35530 + }, + { + "epoch": 0.14199103067349853, + "grad_norm": 5.211058616638184, + "learning_rate": 1.9765053737629602e-07, + "loss": 0.0209, + "step": 35540 + }, + { + "epoch": 0.1420309831300752, + "grad_norm": 5.863748073577881, + "learning_rate": 1.9764918210555626e-07, + "loss": 0.0246, + "step": 35550 + }, + { + "epoch": 0.14207093558665188, + "grad_norm": 5.0925679206848145, + "learning_rate": 1.9764782644868924e-07, + "loss": 0.0214, + "step": 35560 + }, + { + "epoch": 0.14211088804322855, + "grad_norm": 6.76969051361084, + "learning_rate": 1.9764647040570036e-07, + "loss": 0.0241, + "step": 35570 + }, + { + "epoch": 0.14215084049980523, + "grad_norm": 4.476104736328125, + "learning_rate": 1.9764511397659493e-07, + "loss": 0.0266, + "step": 35580 + }, + { + "epoch": 0.1421907929563819, + "grad_norm": 12.753828048706055, + "learning_rate": 1.9764375716137834e-07, + "loss": 0.0231, + "step": 35590 + }, + { + "epoch": 0.14223074541295858, + "grad_norm": 6.745530128479004, + "learning_rate": 1.9764239996005595e-07, + "loss": 0.0273, + "step": 35600 + }, + { + "epoch": 0.14227069786953525, + "grad_norm": 6.7454447746276855, + "learning_rate": 1.9764104237263312e-07, + "loss": 0.0235, + "step": 35610 + }, + { + "epoch": 0.14231065032611193, + "grad_norm": 2.6425435543060303, + "learning_rate": 1.9763968439911526e-07, + "loss": 0.0231, + "step": 35620 + }, + { + "epoch": 0.1423506027826886, + "grad_norm": 3.612199544906616, + "learning_rate": 1.9763832603950769e-07, + "loss": 0.0248, + "step": 35630 + }, + { + "epoch": 0.14239055523926528, + "grad_norm": 6.202563762664795, + "learning_rate": 1.976369672938158e-07, + "loss": 0.0213, + "step": 35640 + }, + { + "epoch": 0.14243050769584195, + "grad_norm": 3.84175443649292, + "learning_rate": 1.9763560816204496e-07, + "loss": 0.0196, + "step": 35650 + }, + { + "epoch": 0.14247046015241863, + "grad_norm": 8.560140609741211, + "learning_rate": 1.9763424864420054e-07, + "loss": 0.0226, + "step": 35660 + }, + { + "epoch": 0.1425104126089953, + "grad_norm": 4.194342613220215, + "learning_rate": 1.9763288874028797e-07, + "loss": 0.0221, + "step": 35670 + }, + { + "epoch": 0.14255036506557198, + "grad_norm": 6.854725360870361, + "learning_rate": 1.976315284503125e-07, + "loss": 0.0276, + "step": 35680 + }, + { + "epoch": 0.14259031752214865, + "grad_norm": 4.860105991363525, + "learning_rate": 1.9763016777427962e-07, + "loss": 0.0205, + "step": 35690 + }, + { + "epoch": 0.14263026997872533, + "grad_norm": 6.561639785766602, + "learning_rate": 1.976288067121947e-07, + "loss": 0.0225, + "step": 35700 + }, + { + "epoch": 0.142670222435302, + "grad_norm": 15.707592010498047, + "learning_rate": 1.976274452640631e-07, + "loss": 0.022, + "step": 35710 + }, + { + "epoch": 0.14271017489187868, + "grad_norm": 3.3297860622406006, + "learning_rate": 1.9762608342989017e-07, + "loss": 0.024, + "step": 35720 + }, + { + "epoch": 0.14275012734845532, + "grad_norm": 2.442486047744751, + "learning_rate": 1.9762472120968136e-07, + "loss": 0.0262, + "step": 35730 + }, + { + "epoch": 0.142790079805032, + "grad_norm": 7.114905834197998, + "learning_rate": 1.9762335860344202e-07, + "loss": 0.0243, + "step": 35740 + }, + { + "epoch": 0.14283003226160867, + "grad_norm": 3.8980300426483154, + "learning_rate": 1.9762199561117753e-07, + "loss": 0.0226, + "step": 35750 + }, + { + "epoch": 0.14286998471818535, + "grad_norm": 3.7887353897094727, + "learning_rate": 1.976206322328933e-07, + "loss": 0.0288, + "step": 35760 + }, + { + "epoch": 0.14290993717476203, + "grad_norm": 4.018516540527344, + "learning_rate": 1.9761926846859473e-07, + "loss": 0.026, + "step": 35770 + }, + { + "epoch": 0.1429498896313387, + "grad_norm": 5.139012813568115, + "learning_rate": 1.976179043182872e-07, + "loss": 0.0233, + "step": 35780 + }, + { + "epoch": 0.14298984208791538, + "grad_norm": 5.135836601257324, + "learning_rate": 1.9761653978197606e-07, + "loss": 0.0256, + "step": 35790 + }, + { + "epoch": 0.14302979454449205, + "grad_norm": 1.6182612180709839, + "learning_rate": 1.976151748596668e-07, + "loss": 0.0296, + "step": 35800 + }, + { + "epoch": 0.14306974700106873, + "grad_norm": 3.5682759284973145, + "learning_rate": 1.9761380955136472e-07, + "loss": 0.0269, + "step": 35810 + }, + { + "epoch": 0.1431096994576454, + "grad_norm": 5.166018962860107, + "learning_rate": 1.9761244385707527e-07, + "loss": 0.0282, + "step": 35820 + }, + { + "epoch": 0.14314965191422208, + "grad_norm": 4.268496036529541, + "learning_rate": 1.9761107777680385e-07, + "loss": 0.0203, + "step": 35830 + }, + { + "epoch": 0.14318960437079875, + "grad_norm": 6.670180797576904, + "learning_rate": 1.9760971131055586e-07, + "loss": 0.0232, + "step": 35840 + }, + { + "epoch": 0.14322955682737543, + "grad_norm": 4.438498497009277, + "learning_rate": 1.9760834445833668e-07, + "loss": 0.0271, + "step": 35850 + }, + { + "epoch": 0.1432695092839521, + "grad_norm": 3.4229555130004883, + "learning_rate": 1.9760697722015175e-07, + "loss": 0.0239, + "step": 35860 + }, + { + "epoch": 0.14330946174052878, + "grad_norm": 5.317664623260498, + "learning_rate": 1.9760560959600646e-07, + "loss": 0.027, + "step": 35870 + }, + { + "epoch": 0.14334941419710545, + "grad_norm": 11.485435485839844, + "learning_rate": 1.9760424158590622e-07, + "loss": 0.0271, + "step": 35880 + }, + { + "epoch": 0.14338936665368213, + "grad_norm": 2.104294538497925, + "learning_rate": 1.9760287318985642e-07, + "loss": 0.0243, + "step": 35890 + }, + { + "epoch": 0.1434293191102588, + "grad_norm": 3.4167728424072266, + "learning_rate": 1.976015044078625e-07, + "loss": 0.0215, + "step": 35900 + }, + { + "epoch": 0.14346927156683548, + "grad_norm": 2.5202324390411377, + "learning_rate": 1.9760013523992987e-07, + "loss": 0.0229, + "step": 35910 + }, + { + "epoch": 0.14350922402341215, + "grad_norm": 2.982971429824829, + "learning_rate": 1.9759876568606393e-07, + "loss": 0.0254, + "step": 35920 + }, + { + "epoch": 0.14354917647998883, + "grad_norm": 2.2571587562561035, + "learning_rate": 1.975973957462701e-07, + "loss": 0.0242, + "step": 35930 + }, + { + "epoch": 0.14358912893656547, + "grad_norm": 8.901283264160156, + "learning_rate": 1.975960254205538e-07, + "loss": 0.0263, + "step": 35940 + }, + { + "epoch": 0.14362908139314215, + "grad_norm": 3.88493013381958, + "learning_rate": 1.975946547089204e-07, + "loss": 0.0191, + "step": 35950 + }, + { + "epoch": 0.14366903384971882, + "grad_norm": 1.801302194595337, + "learning_rate": 1.9759328361137542e-07, + "loss": 0.0231, + "step": 35960 + }, + { + "epoch": 0.1437089863062955, + "grad_norm": 5.641369819641113, + "learning_rate": 1.975919121279242e-07, + "loss": 0.0298, + "step": 35970 + }, + { + "epoch": 0.14374893876287217, + "grad_norm": 3.7656943798065186, + "learning_rate": 1.975905402585722e-07, + "loss": 0.025, + "step": 35980 + }, + { + "epoch": 0.14378889121944885, + "grad_norm": 4.345277309417725, + "learning_rate": 1.9758916800332483e-07, + "loss": 0.0289, + "step": 35990 + }, + { + "epoch": 0.14382884367602553, + "grad_norm": 2.1028382778167725, + "learning_rate": 1.9758779536218754e-07, + "loss": 0.0212, + "step": 36000 + }, + { + "epoch": 0.1438687961326022, + "grad_norm": 3.8721184730529785, + "learning_rate": 1.9758642233516572e-07, + "loss": 0.0225, + "step": 36010 + }, + { + "epoch": 0.14390874858917888, + "grad_norm": 2.2488465309143066, + "learning_rate": 1.975850489222648e-07, + "loss": 0.0252, + "step": 36020 + }, + { + "epoch": 0.14394870104575555, + "grad_norm": 3.824909210205078, + "learning_rate": 1.9758367512349025e-07, + "loss": 0.0244, + "step": 36030 + }, + { + "epoch": 0.14398865350233223, + "grad_norm": 2.241901159286499, + "learning_rate": 1.9758230093884749e-07, + "loss": 0.0194, + "step": 36040 + }, + { + "epoch": 0.1440286059589089, + "grad_norm": 4.324286937713623, + "learning_rate": 1.9758092636834194e-07, + "loss": 0.023, + "step": 36050 + }, + { + "epoch": 0.14406855841548558, + "grad_norm": 5.577347755432129, + "learning_rate": 1.9757955141197905e-07, + "loss": 0.0213, + "step": 36060 + }, + { + "epoch": 0.14410851087206225, + "grad_norm": 4.713369846343994, + "learning_rate": 1.975781760697642e-07, + "loss": 0.0277, + "step": 36070 + }, + { + "epoch": 0.14414846332863893, + "grad_norm": 3.4324703216552734, + "learning_rate": 1.9757680034170292e-07, + "loss": 0.024, + "step": 36080 + }, + { + "epoch": 0.1441884157852156, + "grad_norm": 4.28224515914917, + "learning_rate": 1.975754242278006e-07, + "loss": 0.0265, + "step": 36090 + }, + { + "epoch": 0.14422836824179228, + "grad_norm": 2.6101021766662598, + "learning_rate": 1.9757404772806267e-07, + "loss": 0.0219, + "step": 36100 + }, + { + "epoch": 0.14426832069836895, + "grad_norm": 2.100649118423462, + "learning_rate": 1.975726708424946e-07, + "loss": 0.0233, + "step": 36110 + }, + { + "epoch": 0.14430827315494563, + "grad_norm": 3.517669677734375, + "learning_rate": 1.9757129357110185e-07, + "loss": 0.0278, + "step": 36120 + }, + { + "epoch": 0.1443482256115223, + "grad_norm": 3.5617971420288086, + "learning_rate": 1.975699159138898e-07, + "loss": 0.023, + "step": 36130 + }, + { + "epoch": 0.14438817806809895, + "grad_norm": 4.624080657958984, + "learning_rate": 1.9756853787086396e-07, + "loss": 0.0214, + "step": 36140 + }, + { + "epoch": 0.14442813052467562, + "grad_norm": 3.0819242000579834, + "learning_rate": 1.9756715944202976e-07, + "loss": 0.0299, + "step": 36150 + }, + { + "epoch": 0.1444680829812523, + "grad_norm": 4.485870838165283, + "learning_rate": 1.9756578062739263e-07, + "loss": 0.0255, + "step": 36160 + }, + { + "epoch": 0.14450803543782897, + "grad_norm": 5.364537239074707, + "learning_rate": 1.9756440142695805e-07, + "loss": 0.0243, + "step": 36170 + }, + { + "epoch": 0.14454798789440565, + "grad_norm": 1.5159248113632202, + "learning_rate": 1.9756302184073145e-07, + "loss": 0.0269, + "step": 36180 + }, + { + "epoch": 0.14458794035098232, + "grad_norm": 5.189027309417725, + "learning_rate": 1.9756164186871834e-07, + "loss": 0.0256, + "step": 36190 + }, + { + "epoch": 0.144627892807559, + "grad_norm": 2.5698421001434326, + "learning_rate": 1.975602615109241e-07, + "loss": 0.0224, + "step": 36200 + }, + { + "epoch": 0.14466784526413567, + "grad_norm": 2.5640997886657715, + "learning_rate": 1.9755888076735424e-07, + "loss": 0.0257, + "step": 36210 + }, + { + "epoch": 0.14470779772071235, + "grad_norm": 2.2111549377441406, + "learning_rate": 1.9755749963801418e-07, + "loss": 0.0206, + "step": 36220 + }, + { + "epoch": 0.14474775017728903, + "grad_norm": 10.40003490447998, + "learning_rate": 1.9755611812290943e-07, + "loss": 0.0251, + "step": 36230 + }, + { + "epoch": 0.1447877026338657, + "grad_norm": 22.705312728881836, + "learning_rate": 1.9755473622204542e-07, + "loss": 0.0239, + "step": 36240 + }, + { + "epoch": 0.14482765509044238, + "grad_norm": 7.277915954589844, + "learning_rate": 1.9755335393542762e-07, + "loss": 0.0244, + "step": 36250 + }, + { + "epoch": 0.14486760754701905, + "grad_norm": 3.6032874584198, + "learning_rate": 1.975519712630615e-07, + "loss": 0.0244, + "step": 36260 + }, + { + "epoch": 0.14490756000359573, + "grad_norm": 3.3344404697418213, + "learning_rate": 1.9755058820495255e-07, + "loss": 0.0219, + "step": 36270 + }, + { + "epoch": 0.1449475124601724, + "grad_norm": 3.3199870586395264, + "learning_rate": 1.9754920476110618e-07, + "loss": 0.0303, + "step": 36280 + }, + { + "epoch": 0.14498746491674908, + "grad_norm": 3.262826442718506, + "learning_rate": 1.975478209315279e-07, + "loss": 0.0239, + "step": 36290 + }, + { + "epoch": 0.14502741737332575, + "grad_norm": 1.8659439086914062, + "learning_rate": 1.9754643671622317e-07, + "loss": 0.0223, + "step": 36300 + }, + { + "epoch": 0.14506736982990243, + "grad_norm": 3.573730230331421, + "learning_rate": 1.975450521151975e-07, + "loss": 0.0248, + "step": 36310 + }, + { + "epoch": 0.1451073222864791, + "grad_norm": 4.597622394561768, + "learning_rate": 1.9754366712845633e-07, + "loss": 0.0237, + "step": 36320 + }, + { + "epoch": 0.14514727474305578, + "grad_norm": 4.534427642822266, + "learning_rate": 1.9754228175600512e-07, + "loss": 0.0242, + "step": 36330 + }, + { + "epoch": 0.14518722719963242, + "grad_norm": 2.049865484237671, + "learning_rate": 1.9754089599784939e-07, + "loss": 0.0263, + "step": 36340 + }, + { + "epoch": 0.1452271796562091, + "grad_norm": 14.161840438842773, + "learning_rate": 1.9753950985399458e-07, + "loss": 0.0148, + "step": 36350 + }, + { + "epoch": 0.14526713211278577, + "grad_norm": 4.129796981811523, + "learning_rate": 1.975381233244462e-07, + "loss": 0.0258, + "step": 36360 + }, + { + "epoch": 0.14530708456936245, + "grad_norm": 3.428621768951416, + "learning_rate": 1.9753673640920972e-07, + "loss": 0.0282, + "step": 36370 + }, + { + "epoch": 0.14534703702593912, + "grad_norm": 3.4578378200531006, + "learning_rate": 1.9753534910829064e-07, + "loss": 0.0227, + "step": 36380 + }, + { + "epoch": 0.1453869894825158, + "grad_norm": 8.858713150024414, + "learning_rate": 1.9753396142169442e-07, + "loss": 0.0198, + "step": 36390 + }, + { + "epoch": 0.14542694193909247, + "grad_norm": 3.7795660495758057, + "learning_rate": 1.9753257334942657e-07, + "loss": 0.0236, + "step": 36400 + }, + { + "epoch": 0.14546689439566915, + "grad_norm": 5.539763927459717, + "learning_rate": 1.9753118489149257e-07, + "loss": 0.0248, + "step": 36410 + }, + { + "epoch": 0.14550684685224582, + "grad_norm": 3.5225610733032227, + "learning_rate": 1.975297960478979e-07, + "loss": 0.0255, + "step": 36420 + }, + { + "epoch": 0.1455467993088225, + "grad_norm": 6.001698970794678, + "learning_rate": 1.9752840681864806e-07, + "loss": 0.0246, + "step": 36430 + }, + { + "epoch": 0.14558675176539917, + "grad_norm": 2.806865930557251, + "learning_rate": 1.9752701720374856e-07, + "loss": 0.0273, + "step": 36440 + }, + { + "epoch": 0.14562670422197585, + "grad_norm": 3.001862049102783, + "learning_rate": 1.9752562720320486e-07, + "loss": 0.0212, + "step": 36450 + }, + { + "epoch": 0.14566665667855253, + "grad_norm": 1.96480393409729, + "learning_rate": 1.9752423681702252e-07, + "loss": 0.0234, + "step": 36460 + }, + { + "epoch": 0.1457066091351292, + "grad_norm": 4.258877277374268, + "learning_rate": 1.9752284604520693e-07, + "loss": 0.0223, + "step": 36470 + }, + { + "epoch": 0.14574656159170588, + "grad_norm": 3.0243663787841797, + "learning_rate": 1.9752145488776368e-07, + "loss": 0.0188, + "step": 36480 + }, + { + "epoch": 0.14578651404828255, + "grad_norm": 2.9573328495025635, + "learning_rate": 1.9752006334469826e-07, + "loss": 0.0277, + "step": 36490 + }, + { + "epoch": 0.14582646650485923, + "grad_norm": 2.8872272968292236, + "learning_rate": 1.9751867141601614e-07, + "loss": 0.0239, + "step": 36500 + }, + { + "epoch": 0.1458664189614359, + "grad_norm": 2.438018798828125, + "learning_rate": 1.9751727910172287e-07, + "loss": 0.0225, + "step": 36510 + }, + { + "epoch": 0.14590637141801258, + "grad_norm": 11.1793851852417, + "learning_rate": 1.9751588640182387e-07, + "loss": 0.0232, + "step": 36520 + }, + { + "epoch": 0.14594632387458925, + "grad_norm": 6.9751129150390625, + "learning_rate": 1.9751449331632474e-07, + "loss": 0.018, + "step": 36530 + }, + { + "epoch": 0.14598627633116593, + "grad_norm": 5.453727722167969, + "learning_rate": 1.9751309984523096e-07, + "loss": 0.0263, + "step": 36540 + }, + { + "epoch": 0.14602622878774257, + "grad_norm": 4.399521350860596, + "learning_rate": 1.97511705988548e-07, + "loss": 0.0205, + "step": 36550 + }, + { + "epoch": 0.14606618124431925, + "grad_norm": 2.946415901184082, + "learning_rate": 1.9751031174628144e-07, + "loss": 0.0186, + "step": 36560 + }, + { + "epoch": 0.14610613370089592, + "grad_norm": 5.174699783325195, + "learning_rate": 1.9750891711843677e-07, + "loss": 0.0249, + "step": 36570 + }, + { + "epoch": 0.1461460861574726, + "grad_norm": 3.544377088546753, + "learning_rate": 1.9750752210501944e-07, + "loss": 0.0192, + "step": 36580 + }, + { + "epoch": 0.14618603861404927, + "grad_norm": 3.110279083251953, + "learning_rate": 1.9750612670603502e-07, + "loss": 0.0235, + "step": 36590 + }, + { + "epoch": 0.14622599107062595, + "grad_norm": 5.014284610748291, + "learning_rate": 1.9750473092148904e-07, + "loss": 0.0201, + "step": 36600 + }, + { + "epoch": 0.14626594352720262, + "grad_norm": 6.93101692199707, + "learning_rate": 1.9750333475138703e-07, + "loss": 0.0226, + "step": 36610 + }, + { + "epoch": 0.1463058959837793, + "grad_norm": 1.3996944427490234, + "learning_rate": 1.9750193819573447e-07, + "loss": 0.0221, + "step": 36620 + }, + { + "epoch": 0.14634584844035597, + "grad_norm": 5.216516017913818, + "learning_rate": 1.975005412545369e-07, + "loss": 0.0228, + "step": 36630 + }, + { + "epoch": 0.14638580089693265, + "grad_norm": 2.3350441455841064, + "learning_rate": 1.9749914392779982e-07, + "loss": 0.0236, + "step": 36640 + }, + { + "epoch": 0.14642575335350932, + "grad_norm": 6.988284587860107, + "learning_rate": 1.974977462155288e-07, + "loss": 0.019, + "step": 36650 + }, + { + "epoch": 0.146465705810086, + "grad_norm": 4.826686382293701, + "learning_rate": 1.9749634811772932e-07, + "loss": 0.0276, + "step": 36660 + }, + { + "epoch": 0.14650565826666267, + "grad_norm": 2.9414632320404053, + "learning_rate": 1.9749494963440696e-07, + "loss": 0.0245, + "step": 36670 + }, + { + "epoch": 0.14654561072323935, + "grad_norm": 4.370475769042969, + "learning_rate": 1.974935507655672e-07, + "loss": 0.0239, + "step": 36680 + }, + { + "epoch": 0.14658556317981603, + "grad_norm": 4.39299201965332, + "learning_rate": 1.9749215151121562e-07, + "loss": 0.0248, + "step": 36690 + }, + { + "epoch": 0.1466255156363927, + "grad_norm": 2.4573681354522705, + "learning_rate": 1.974907518713577e-07, + "loss": 0.0207, + "step": 36700 + }, + { + "epoch": 0.14666546809296938, + "grad_norm": 2.2203147411346436, + "learning_rate": 1.9748935184599902e-07, + "loss": 0.0279, + "step": 36710 + }, + { + "epoch": 0.14670542054954605, + "grad_norm": 10.217171669006348, + "learning_rate": 1.9748795143514508e-07, + "loss": 0.023, + "step": 36720 + }, + { + "epoch": 0.14674537300612273, + "grad_norm": 2.7481770515441895, + "learning_rate": 1.9748655063880145e-07, + "loss": 0.0262, + "step": 36730 + }, + { + "epoch": 0.1467853254626994, + "grad_norm": 5.365272045135498, + "learning_rate": 1.9748514945697362e-07, + "loss": 0.0213, + "step": 36740 + }, + { + "epoch": 0.14682527791927605, + "grad_norm": 2.8809738159179688, + "learning_rate": 1.974837478896672e-07, + "loss": 0.0204, + "step": 36750 + }, + { + "epoch": 0.14686523037585272, + "grad_norm": 3.9021644592285156, + "learning_rate": 1.9748234593688769e-07, + "loss": 0.0184, + "step": 36760 + }, + { + "epoch": 0.1469051828324294, + "grad_norm": 2.355780601501465, + "learning_rate": 1.9748094359864064e-07, + "loss": 0.0246, + "step": 36770 + }, + { + "epoch": 0.14694513528900607, + "grad_norm": 2.270540952682495, + "learning_rate": 1.9747954087493158e-07, + "loss": 0.0235, + "step": 36780 + }, + { + "epoch": 0.14698508774558275, + "grad_norm": 4.783381938934326, + "learning_rate": 1.974781377657661e-07, + "loss": 0.024, + "step": 36790 + }, + { + "epoch": 0.14702504020215942, + "grad_norm": 1.7100498676300049, + "learning_rate": 1.974767342711497e-07, + "loss": 0.0231, + "step": 36800 + }, + { + "epoch": 0.1470649926587361, + "grad_norm": 2.8766255378723145, + "learning_rate": 1.9747533039108794e-07, + "loss": 0.0238, + "step": 36810 + }, + { + "epoch": 0.14710494511531277, + "grad_norm": 5.094200611114502, + "learning_rate": 1.974739261255864e-07, + "loss": 0.0277, + "step": 36820 + }, + { + "epoch": 0.14714489757188945, + "grad_norm": 4.700295448303223, + "learning_rate": 1.974725214746506e-07, + "loss": 0.0245, + "step": 36830 + }, + { + "epoch": 0.14718485002846612, + "grad_norm": 6.805001258850098, + "learning_rate": 1.9747111643828614e-07, + "loss": 0.0193, + "step": 36840 + }, + { + "epoch": 0.1472248024850428, + "grad_norm": 3.1060874462127686, + "learning_rate": 1.9746971101649853e-07, + "loss": 0.0258, + "step": 36850 + }, + { + "epoch": 0.14726475494161947, + "grad_norm": 3.3026113510131836, + "learning_rate": 1.9746830520929332e-07, + "loss": 0.0224, + "step": 36860 + }, + { + "epoch": 0.14730470739819615, + "grad_norm": 15.951434135437012, + "learning_rate": 1.974668990166761e-07, + "loss": 0.0243, + "step": 36870 + }, + { + "epoch": 0.14734465985477282, + "grad_norm": 3.065429449081421, + "learning_rate": 1.9746549243865244e-07, + "loss": 0.0231, + "step": 36880 + }, + { + "epoch": 0.1473846123113495, + "grad_norm": 6.137859344482422, + "learning_rate": 1.9746408547522785e-07, + "loss": 0.0296, + "step": 36890 + }, + { + "epoch": 0.14742456476792618, + "grad_norm": 2.2138919830322266, + "learning_rate": 1.9746267812640794e-07, + "loss": 0.0226, + "step": 36900 + }, + { + "epoch": 0.14746451722450285, + "grad_norm": 3.2709476947784424, + "learning_rate": 1.9746127039219827e-07, + "loss": 0.0227, + "step": 36910 + }, + { + "epoch": 0.14750446968107953, + "grad_norm": 3.9847986698150635, + "learning_rate": 1.9745986227260437e-07, + "loss": 0.0252, + "step": 36920 + }, + { + "epoch": 0.1475444221376562, + "grad_norm": 2.3641655445098877, + "learning_rate": 1.9745845376763186e-07, + "loss": 0.0232, + "step": 36930 + }, + { + "epoch": 0.14758437459423288, + "grad_norm": 2.865467071533203, + "learning_rate": 1.9745704487728628e-07, + "loss": 0.0245, + "step": 36940 + }, + { + "epoch": 0.14762432705080952, + "grad_norm": 6.663944721221924, + "learning_rate": 1.9745563560157322e-07, + "loss": 0.0269, + "step": 36950 + }, + { + "epoch": 0.1476642795073862, + "grad_norm": 7.181324005126953, + "learning_rate": 1.9745422594049823e-07, + "loss": 0.0282, + "step": 36960 + }, + { + "epoch": 0.14770423196396287, + "grad_norm": 5.914403915405273, + "learning_rate": 1.974528158940669e-07, + "loss": 0.0237, + "step": 36970 + }, + { + "epoch": 0.14774418442053955, + "grad_norm": 9.236377716064453, + "learning_rate": 1.9745140546228477e-07, + "loss": 0.023, + "step": 36980 + }, + { + "epoch": 0.14778413687711622, + "grad_norm": 2.6809000968933105, + "learning_rate": 1.9744999464515746e-07, + "loss": 0.0277, + "step": 36990 + }, + { + "epoch": 0.1478240893336929, + "grad_norm": 5.236153602600098, + "learning_rate": 1.9744858344269055e-07, + "loss": 0.0273, + "step": 37000 + }, + { + "epoch": 0.14786404179026957, + "grad_norm": 2.301499366760254, + "learning_rate": 1.974471718548896e-07, + "loss": 0.0206, + "step": 37010 + }, + { + "epoch": 0.14790399424684625, + "grad_norm": 1.7760995626449585, + "learning_rate": 1.9744575988176017e-07, + "loss": 0.0237, + "step": 37020 + }, + { + "epoch": 0.14794394670342292, + "grad_norm": 4.261663913726807, + "learning_rate": 1.974443475233079e-07, + "loss": 0.0301, + "step": 37030 + }, + { + "epoch": 0.1479838991599996, + "grad_norm": 2.9086999893188477, + "learning_rate": 1.9744293477953832e-07, + "loss": 0.0226, + "step": 37040 + }, + { + "epoch": 0.14802385161657627, + "grad_norm": 3.5912373065948486, + "learning_rate": 1.9744152165045707e-07, + "loss": 0.0228, + "step": 37050 + }, + { + "epoch": 0.14806380407315295, + "grad_norm": 3.800626516342163, + "learning_rate": 1.974401081360697e-07, + "loss": 0.025, + "step": 37060 + }, + { + "epoch": 0.14810375652972962, + "grad_norm": 4.740072250366211, + "learning_rate": 1.974386942363818e-07, + "loss": 0.0247, + "step": 37070 + }, + { + "epoch": 0.1481437089863063, + "grad_norm": 1.83176851272583, + "learning_rate": 1.9743727995139898e-07, + "loss": 0.0238, + "step": 37080 + }, + { + "epoch": 0.14818366144288297, + "grad_norm": 6.66281795501709, + "learning_rate": 1.974358652811268e-07, + "loss": 0.0216, + "step": 37090 + }, + { + "epoch": 0.14822361389945965, + "grad_norm": 3.7396750450134277, + "learning_rate": 1.974344502255709e-07, + "loss": 0.0244, + "step": 37100 + }, + { + "epoch": 0.14826356635603632, + "grad_norm": 4.056804656982422, + "learning_rate": 1.9743303478473683e-07, + "loss": 0.0203, + "step": 37110 + }, + { + "epoch": 0.148303518812613, + "grad_norm": 2.551299810409546, + "learning_rate": 1.974316189586302e-07, + "loss": 0.0232, + "step": 37120 + }, + { + "epoch": 0.14834347126918968, + "grad_norm": 5.182560920715332, + "learning_rate": 1.9743020274725663e-07, + "loss": 0.0267, + "step": 37130 + }, + { + "epoch": 0.14838342372576635, + "grad_norm": 3.5119853019714355, + "learning_rate": 1.9742878615062173e-07, + "loss": 0.019, + "step": 37140 + }, + { + "epoch": 0.14842337618234303, + "grad_norm": 5.283496856689453, + "learning_rate": 1.9742736916873103e-07, + "loss": 0.0239, + "step": 37150 + }, + { + "epoch": 0.14846332863891967, + "grad_norm": 6.276760578155518, + "learning_rate": 1.974259518015902e-07, + "loss": 0.0248, + "step": 37160 + }, + { + "epoch": 0.14850328109549635, + "grad_norm": 3.4047553539276123, + "learning_rate": 1.9742453404920485e-07, + "loss": 0.0171, + "step": 37170 + }, + { + "epoch": 0.14854323355207302, + "grad_norm": 3.086782217025757, + "learning_rate": 1.9742311591158054e-07, + "loss": 0.0217, + "step": 37180 + }, + { + "epoch": 0.1485831860086497, + "grad_norm": 7.778290271759033, + "learning_rate": 1.9742169738872292e-07, + "loss": 0.0229, + "step": 37190 + }, + { + "epoch": 0.14862313846522637, + "grad_norm": 7.972380638122559, + "learning_rate": 1.9742027848063755e-07, + "loss": 0.0241, + "step": 37200 + }, + { + "epoch": 0.14866309092180305, + "grad_norm": 6.191544055938721, + "learning_rate": 1.9741885918733007e-07, + "loss": 0.0263, + "step": 37210 + }, + { + "epoch": 0.14870304337837972, + "grad_norm": 4.011446475982666, + "learning_rate": 1.974174395088061e-07, + "loss": 0.0236, + "step": 37220 + }, + { + "epoch": 0.1487429958349564, + "grad_norm": 6.154690265655518, + "learning_rate": 1.9741601944507125e-07, + "loss": 0.023, + "step": 37230 + }, + { + "epoch": 0.14878294829153307, + "grad_norm": 4.856029987335205, + "learning_rate": 1.974145989961311e-07, + "loss": 0.0243, + "step": 37240 + }, + { + "epoch": 0.14882290074810975, + "grad_norm": 2.0134329795837402, + "learning_rate": 1.9741317816199135e-07, + "loss": 0.0249, + "step": 37250 + }, + { + "epoch": 0.14886285320468642, + "grad_norm": 2.4924674034118652, + "learning_rate": 1.9741175694265752e-07, + "loss": 0.0248, + "step": 37260 + }, + { + "epoch": 0.1489028056612631, + "grad_norm": 2.5818591117858887, + "learning_rate": 1.974103353381353e-07, + "loss": 0.0249, + "step": 37270 + }, + { + "epoch": 0.14894275811783977, + "grad_norm": 2.811244249343872, + "learning_rate": 1.9740891334843028e-07, + "loss": 0.0264, + "step": 37280 + }, + { + "epoch": 0.14898271057441645, + "grad_norm": 2.2618446350097656, + "learning_rate": 1.9740749097354808e-07, + "loss": 0.0229, + "step": 37290 + }, + { + "epoch": 0.14902266303099312, + "grad_norm": 4.1205620765686035, + "learning_rate": 1.9740606821349436e-07, + "loss": 0.0232, + "step": 37300 + }, + { + "epoch": 0.1490626154875698, + "grad_norm": 1.5477849245071411, + "learning_rate": 1.974046450682747e-07, + "loss": 0.0269, + "step": 37310 + }, + { + "epoch": 0.14910256794414647, + "grad_norm": 1.9294304847717285, + "learning_rate": 1.9740322153789474e-07, + "loss": 0.0233, + "step": 37320 + }, + { + "epoch": 0.14914252040072315, + "grad_norm": 5.046880722045898, + "learning_rate": 1.9740179762236015e-07, + "loss": 0.0245, + "step": 37330 + }, + { + "epoch": 0.14918247285729982, + "grad_norm": 12.156357765197754, + "learning_rate": 1.9740037332167647e-07, + "loss": 0.0268, + "step": 37340 + }, + { + "epoch": 0.1492224253138765, + "grad_norm": 3.718733310699463, + "learning_rate": 1.9739894863584943e-07, + "loss": 0.0253, + "step": 37350 + }, + { + "epoch": 0.14926237777045315, + "grad_norm": 4.026338577270508, + "learning_rate": 1.9739752356488463e-07, + "loss": 0.0232, + "step": 37360 + }, + { + "epoch": 0.14930233022702982, + "grad_norm": 2.720050096511841, + "learning_rate": 1.9739609810878766e-07, + "loss": 0.0228, + "step": 37370 + }, + { + "epoch": 0.1493422826836065, + "grad_norm": 3.6438636779785156, + "learning_rate": 1.973946722675642e-07, + "loss": 0.0241, + "step": 37380 + }, + { + "epoch": 0.14938223514018317, + "grad_norm": 2.737497329711914, + "learning_rate": 1.9739324604121992e-07, + "loss": 0.0279, + "step": 37390 + }, + { + "epoch": 0.14942218759675985, + "grad_norm": 2.9159977436065674, + "learning_rate": 1.973918194297604e-07, + "loss": 0.0282, + "step": 37400 + }, + { + "epoch": 0.14946214005333652, + "grad_norm": 2.0627501010894775, + "learning_rate": 1.973903924331913e-07, + "loss": 0.0207, + "step": 37410 + }, + { + "epoch": 0.1495020925099132, + "grad_norm": 5.191338539123535, + "learning_rate": 1.9738896505151826e-07, + "loss": 0.027, + "step": 37420 + }, + { + "epoch": 0.14954204496648987, + "grad_norm": 6.640006065368652, + "learning_rate": 1.9738753728474694e-07, + "loss": 0.0253, + "step": 37430 + }, + { + "epoch": 0.14958199742306655, + "grad_norm": 3.9230642318725586, + "learning_rate": 1.9738610913288297e-07, + "loss": 0.0215, + "step": 37440 + }, + { + "epoch": 0.14962194987964322, + "grad_norm": 15.226836204528809, + "learning_rate": 1.97384680595932e-07, + "loss": 0.024, + "step": 37450 + }, + { + "epoch": 0.1496619023362199, + "grad_norm": 4.18654727935791, + "learning_rate": 1.9738325167389968e-07, + "loss": 0.0245, + "step": 37460 + }, + { + "epoch": 0.14970185479279657, + "grad_norm": 3.3693253993988037, + "learning_rate": 1.9738182236679166e-07, + "loss": 0.0263, + "step": 37470 + }, + { + "epoch": 0.14974180724937325, + "grad_norm": 9.061246871948242, + "learning_rate": 1.973803926746136e-07, + "loss": 0.0233, + "step": 37480 + }, + { + "epoch": 0.14978175970594992, + "grad_norm": 4.39763069152832, + "learning_rate": 1.9737896259737116e-07, + "loss": 0.0219, + "step": 37490 + }, + { + "epoch": 0.1498217121625266, + "grad_norm": 3.5678932666778564, + "learning_rate": 1.9737753213506997e-07, + "loss": 0.0227, + "step": 37500 + }, + { + "epoch": 0.14986166461910327, + "grad_norm": 3.5999765396118164, + "learning_rate": 1.9737610128771568e-07, + "loss": 0.0248, + "step": 37510 + }, + { + "epoch": 0.14990161707567995, + "grad_norm": 7.227781772613525, + "learning_rate": 1.9737467005531395e-07, + "loss": 0.0268, + "step": 37520 + }, + { + "epoch": 0.14994156953225662, + "grad_norm": 6.2920050621032715, + "learning_rate": 1.973732384378705e-07, + "loss": 0.0231, + "step": 37530 + }, + { + "epoch": 0.1499815219888333, + "grad_norm": 3.5838894844055176, + "learning_rate": 1.9737180643539094e-07, + "loss": 0.0258, + "step": 37540 + }, + { + "epoch": 0.15002147444540997, + "grad_norm": 2.244795799255371, + "learning_rate": 1.9737037404788093e-07, + "loss": 0.0219, + "step": 37550 + }, + { + "epoch": 0.15006142690198662, + "grad_norm": 5.283054351806641, + "learning_rate": 1.9736894127534613e-07, + "loss": 0.0228, + "step": 37560 + }, + { + "epoch": 0.1501013793585633, + "grad_norm": 5.310852527618408, + "learning_rate": 1.9736750811779223e-07, + "loss": 0.0314, + "step": 37570 + }, + { + "epoch": 0.15014133181513997, + "grad_norm": 2.7146708965301514, + "learning_rate": 1.973660745752249e-07, + "loss": 0.0233, + "step": 37580 + }, + { + "epoch": 0.15018128427171665, + "grad_norm": 2.563347101211548, + "learning_rate": 1.9736464064764977e-07, + "loss": 0.0188, + "step": 37590 + }, + { + "epoch": 0.15022123672829332, + "grad_norm": 3.6658637523651123, + "learning_rate": 1.9736320633507252e-07, + "loss": 0.0238, + "step": 37600 + }, + { + "epoch": 0.15026118918487, + "grad_norm": 4.1783528327941895, + "learning_rate": 1.9736177163749887e-07, + "loss": 0.0254, + "step": 37610 + }, + { + "epoch": 0.15030114164144667, + "grad_norm": 4.130256652832031, + "learning_rate": 1.9736033655493445e-07, + "loss": 0.0212, + "step": 37620 + }, + { + "epoch": 0.15034109409802335, + "grad_norm": 4.313755512237549, + "learning_rate": 1.9735890108738492e-07, + "loss": 0.0271, + "step": 37630 + }, + { + "epoch": 0.15038104655460002, + "grad_norm": 1.2102051973342896, + "learning_rate": 1.97357465234856e-07, + "loss": 0.0246, + "step": 37640 + }, + { + "epoch": 0.1504209990111767, + "grad_norm": 3.484377145767212, + "learning_rate": 1.9735602899735335e-07, + "loss": 0.0241, + "step": 37650 + }, + { + "epoch": 0.15046095146775337, + "grad_norm": 1.9252504110336304, + "learning_rate": 1.9735459237488266e-07, + "loss": 0.023, + "step": 37660 + }, + { + "epoch": 0.15050090392433005, + "grad_norm": 3.1157796382904053, + "learning_rate": 1.9735315536744958e-07, + "loss": 0.0237, + "step": 37670 + }, + { + "epoch": 0.15054085638090672, + "grad_norm": 1.791231393814087, + "learning_rate": 1.9735171797505982e-07, + "loss": 0.0166, + "step": 37680 + }, + { + "epoch": 0.1505808088374834, + "grad_norm": 12.195718765258789, + "learning_rate": 1.9735028019771902e-07, + "loss": 0.0242, + "step": 37690 + }, + { + "epoch": 0.15062076129406007, + "grad_norm": 3.4967682361602783, + "learning_rate": 1.973488420354329e-07, + "loss": 0.0217, + "step": 37700 + }, + { + "epoch": 0.15066071375063675, + "grad_norm": 3.495807647705078, + "learning_rate": 1.9734740348820717e-07, + "loss": 0.0211, + "step": 37710 + }, + { + "epoch": 0.15070066620721342, + "grad_norm": 5.808645725250244, + "learning_rate": 1.973459645560475e-07, + "loss": 0.0239, + "step": 37720 + }, + { + "epoch": 0.1507406186637901, + "grad_norm": 1.5906858444213867, + "learning_rate": 1.9734452523895956e-07, + "loss": 0.0238, + "step": 37730 + }, + { + "epoch": 0.15078057112036677, + "grad_norm": 3.7045516967773438, + "learning_rate": 1.9734308553694907e-07, + "loss": 0.0187, + "step": 37740 + }, + { + "epoch": 0.15082052357694345, + "grad_norm": 4.777308940887451, + "learning_rate": 1.9734164545002167e-07, + "loss": 0.0225, + "step": 37750 + }, + { + "epoch": 0.15086047603352012, + "grad_norm": 3.5953445434570312, + "learning_rate": 1.973402049781831e-07, + "loss": 0.0237, + "step": 37760 + }, + { + "epoch": 0.15090042849009677, + "grad_norm": 3.2895772457122803, + "learning_rate": 1.973387641214391e-07, + "loss": 0.0218, + "step": 37770 + }, + { + "epoch": 0.15094038094667345, + "grad_norm": 3.8856747150421143, + "learning_rate": 1.9733732287979525e-07, + "loss": 0.023, + "step": 37780 + }, + { + "epoch": 0.15098033340325012, + "grad_norm": 9.432013511657715, + "learning_rate": 1.9733588125325735e-07, + "loss": 0.0234, + "step": 37790 + }, + { + "epoch": 0.1510202858598268, + "grad_norm": 3.367218017578125, + "learning_rate": 1.9733443924183107e-07, + "loss": 0.0218, + "step": 37800 + }, + { + "epoch": 0.15106023831640347, + "grad_norm": 4.860969066619873, + "learning_rate": 1.973329968455221e-07, + "loss": 0.0219, + "step": 37810 + }, + { + "epoch": 0.15110019077298015, + "grad_norm": 4.817014217376709, + "learning_rate": 1.9733155406433616e-07, + "loss": 0.0203, + "step": 37820 + }, + { + "epoch": 0.15114014322955682, + "grad_norm": 3.0078189373016357, + "learning_rate": 1.973301108982789e-07, + "loss": 0.0235, + "step": 37830 + }, + { + "epoch": 0.1511800956861335, + "grad_norm": 3.746394395828247, + "learning_rate": 1.973286673473561e-07, + "loss": 0.0242, + "step": 37840 + }, + { + "epoch": 0.15122004814271017, + "grad_norm": 5.759269714355469, + "learning_rate": 1.9732722341157343e-07, + "loss": 0.0234, + "step": 37850 + }, + { + "epoch": 0.15126000059928685, + "grad_norm": 2.5495660305023193, + "learning_rate": 1.9732577909093666e-07, + "loss": 0.0231, + "step": 37860 + }, + { + "epoch": 0.15129995305586352, + "grad_norm": 3.8824667930603027, + "learning_rate": 1.973243343854514e-07, + "loss": 0.0211, + "step": 37870 + }, + { + "epoch": 0.1513399055124402, + "grad_norm": 3.0612869262695312, + "learning_rate": 1.9732288929512345e-07, + "loss": 0.0207, + "step": 37880 + }, + { + "epoch": 0.15137985796901687, + "grad_norm": 5.289431571960449, + "learning_rate": 1.9732144381995846e-07, + "loss": 0.0263, + "step": 37890 + }, + { + "epoch": 0.15141981042559355, + "grad_norm": 7.7611589431762695, + "learning_rate": 1.9731999795996217e-07, + "loss": 0.0251, + "step": 37900 + }, + { + "epoch": 0.15145976288217022, + "grad_norm": 5.095252513885498, + "learning_rate": 1.9731855171514032e-07, + "loss": 0.0222, + "step": 37910 + }, + { + "epoch": 0.1514997153387469, + "grad_norm": 5.654365539550781, + "learning_rate": 1.9731710508549858e-07, + "loss": 0.0286, + "step": 37920 + }, + { + "epoch": 0.15153966779532357, + "grad_norm": 1.712631106376648, + "learning_rate": 1.9731565807104274e-07, + "loss": 0.0219, + "step": 37930 + }, + { + "epoch": 0.15157962025190025, + "grad_norm": 6.73758602142334, + "learning_rate": 1.9731421067177845e-07, + "loss": 0.0249, + "step": 37940 + }, + { + "epoch": 0.15161957270847692, + "grad_norm": 2.4678518772125244, + "learning_rate": 1.973127628877115e-07, + "loss": 0.0236, + "step": 37950 + }, + { + "epoch": 0.1516595251650536, + "grad_norm": 4.654429912567139, + "learning_rate": 1.9731131471884755e-07, + "loss": 0.0232, + "step": 37960 + }, + { + "epoch": 0.15169947762163025, + "grad_norm": 3.32098650932312, + "learning_rate": 1.9730986616519236e-07, + "loss": 0.0252, + "step": 37970 + }, + { + "epoch": 0.15173943007820692, + "grad_norm": 1.7452661991119385, + "learning_rate": 1.9730841722675166e-07, + "loss": 0.0238, + "step": 37980 + }, + { + "epoch": 0.1517793825347836, + "grad_norm": 3.0706276893615723, + "learning_rate": 1.9730696790353118e-07, + "loss": 0.0283, + "step": 37990 + }, + { + "epoch": 0.15181933499136027, + "grad_norm": 2.110459327697754, + "learning_rate": 1.9730551819553663e-07, + "loss": 0.0205, + "step": 38000 + }, + { + "epoch": 0.15185928744793695, + "grad_norm": 3.2319843769073486, + "learning_rate": 1.9730406810277378e-07, + "loss": 0.0239, + "step": 38010 + }, + { + "epoch": 0.15189923990451362, + "grad_norm": 4.610530376434326, + "learning_rate": 1.9730261762524832e-07, + "loss": 0.0262, + "step": 38020 + }, + { + "epoch": 0.1519391923610903, + "grad_norm": 4.114920139312744, + "learning_rate": 1.9730116676296603e-07, + "loss": 0.0252, + "step": 38030 + }, + { + "epoch": 0.15197914481766697, + "grad_norm": 4.191993236541748, + "learning_rate": 1.972997155159326e-07, + "loss": 0.0275, + "step": 38040 + }, + { + "epoch": 0.15201909727424365, + "grad_norm": 2.185185670852661, + "learning_rate": 1.972982638841538e-07, + "loss": 0.023, + "step": 38050 + }, + { + "epoch": 0.15205904973082032, + "grad_norm": 12.962717056274414, + "learning_rate": 1.9729681186763538e-07, + "loss": 0.0214, + "step": 38060 + }, + { + "epoch": 0.152099002187397, + "grad_norm": 3.0912559032440186, + "learning_rate": 1.9729535946638304e-07, + "loss": 0.0187, + "step": 38070 + }, + { + "epoch": 0.15213895464397367, + "grad_norm": 8.182560920715332, + "learning_rate": 1.9729390668040256e-07, + "loss": 0.0274, + "step": 38080 + }, + { + "epoch": 0.15217890710055035, + "grad_norm": 4.927254676818848, + "learning_rate": 1.9729245350969969e-07, + "loss": 0.0257, + "step": 38090 + }, + { + "epoch": 0.15221885955712702, + "grad_norm": 2.665463447570801, + "learning_rate": 1.9729099995428012e-07, + "loss": 0.0236, + "step": 38100 + }, + { + "epoch": 0.1522588120137037, + "grad_norm": 2.297901153564453, + "learning_rate": 1.9728954601414967e-07, + "loss": 0.0197, + "step": 38110 + }, + { + "epoch": 0.15229876447028037, + "grad_norm": 5.04636812210083, + "learning_rate": 1.97288091689314e-07, + "loss": 0.0245, + "step": 38120 + }, + { + "epoch": 0.15233871692685705, + "grad_norm": 2.480323553085327, + "learning_rate": 1.9728663697977899e-07, + "loss": 0.0233, + "step": 38130 + }, + { + "epoch": 0.15237866938343372, + "grad_norm": 2.6070852279663086, + "learning_rate": 1.9728518188555027e-07, + "loss": 0.0233, + "step": 38140 + }, + { + "epoch": 0.1524186218400104, + "grad_norm": 3.050318956375122, + "learning_rate": 1.9728372640663364e-07, + "loss": 0.0239, + "step": 38150 + }, + { + "epoch": 0.15245857429658707, + "grad_norm": 2.0808358192443848, + "learning_rate": 1.9728227054303487e-07, + "loss": 0.0238, + "step": 38160 + }, + { + "epoch": 0.15249852675316375, + "grad_norm": 6.946518898010254, + "learning_rate": 1.9728081429475973e-07, + "loss": 0.0218, + "step": 38170 + }, + { + "epoch": 0.1525384792097404, + "grad_norm": 7.2495198249816895, + "learning_rate": 1.9727935766181392e-07, + "loss": 0.0215, + "step": 38180 + }, + { + "epoch": 0.15257843166631707, + "grad_norm": 2.2856252193450928, + "learning_rate": 1.9727790064420321e-07, + "loss": 0.0271, + "step": 38190 + }, + { + "epoch": 0.15261838412289375, + "grad_norm": 4.9190168380737305, + "learning_rate": 1.9727644324193343e-07, + "loss": 0.0213, + "step": 38200 + }, + { + "epoch": 0.15265833657947042, + "grad_norm": 5.543955326080322, + "learning_rate": 1.972749854550103e-07, + "loss": 0.0274, + "step": 38210 + }, + { + "epoch": 0.1526982890360471, + "grad_norm": 4.498771667480469, + "learning_rate": 1.9727352728343957e-07, + "loss": 0.0215, + "step": 38220 + }, + { + "epoch": 0.15273824149262377, + "grad_norm": 5.472683429718018, + "learning_rate": 1.97272068727227e-07, + "loss": 0.027, + "step": 38230 + }, + { + "epoch": 0.15277819394920045, + "grad_norm": 2.0857725143432617, + "learning_rate": 1.9727060978637838e-07, + "loss": 0.0206, + "step": 38240 + }, + { + "epoch": 0.15281814640577712, + "grad_norm": 3.8677096366882324, + "learning_rate": 1.9726915046089946e-07, + "loss": 0.0231, + "step": 38250 + }, + { + "epoch": 0.1528580988623538, + "grad_norm": 4.2046003341674805, + "learning_rate": 1.9726769075079606e-07, + "loss": 0.0234, + "step": 38260 + }, + { + "epoch": 0.15289805131893047, + "grad_norm": 2.6273863315582275, + "learning_rate": 1.972662306560739e-07, + "loss": 0.025, + "step": 38270 + }, + { + "epoch": 0.15293800377550715, + "grad_norm": 4.888377666473389, + "learning_rate": 1.9726477017673874e-07, + "loss": 0.0213, + "step": 38280 + }, + { + "epoch": 0.15297795623208382, + "grad_norm": 5.9463019371032715, + "learning_rate": 1.9726330931279642e-07, + "loss": 0.0208, + "step": 38290 + }, + { + "epoch": 0.1530179086886605, + "grad_norm": 11.438782691955566, + "learning_rate": 1.9726184806425268e-07, + "loss": 0.0264, + "step": 38300 + }, + { + "epoch": 0.15305786114523717, + "grad_norm": 3.860434055328369, + "learning_rate": 1.9726038643111328e-07, + "loss": 0.0262, + "step": 38310 + }, + { + "epoch": 0.15309781360181385, + "grad_norm": 3.5877983570098877, + "learning_rate": 1.9725892441338402e-07, + "loss": 0.0221, + "step": 38320 + }, + { + "epoch": 0.15313776605839052, + "grad_norm": 2.9879555702209473, + "learning_rate": 1.972574620110707e-07, + "loss": 0.0213, + "step": 38330 + }, + { + "epoch": 0.1531777185149672, + "grad_norm": 4.628042697906494, + "learning_rate": 1.9725599922417905e-07, + "loss": 0.0249, + "step": 38340 + }, + { + "epoch": 0.15321767097154387, + "grad_norm": 3.4096696376800537, + "learning_rate": 1.9725453605271492e-07, + "loss": 0.0226, + "step": 38350 + }, + { + "epoch": 0.15325762342812055, + "grad_norm": 4.169748306274414, + "learning_rate": 1.9725307249668405e-07, + "loss": 0.0207, + "step": 38360 + }, + { + "epoch": 0.15329757588469722, + "grad_norm": 1.1717791557312012, + "learning_rate": 1.9725160855609225e-07, + "loss": 0.0255, + "step": 38370 + }, + { + "epoch": 0.15333752834127387, + "grad_norm": 4.162323951721191, + "learning_rate": 1.9725014423094527e-07, + "loss": 0.0239, + "step": 38380 + }, + { + "epoch": 0.15337748079785055, + "grad_norm": 5.982388973236084, + "learning_rate": 1.9724867952124896e-07, + "loss": 0.0236, + "step": 38390 + }, + { + "epoch": 0.15341743325442722, + "grad_norm": 7.854784965515137, + "learning_rate": 1.9724721442700906e-07, + "loss": 0.0255, + "step": 38400 + }, + { + "epoch": 0.1534573857110039, + "grad_norm": 7.506379127502441, + "learning_rate": 1.9724574894823142e-07, + "loss": 0.0252, + "step": 38410 + }, + { + "epoch": 0.15349733816758057, + "grad_norm": 3.579601526260376, + "learning_rate": 1.9724428308492173e-07, + "loss": 0.0244, + "step": 38420 + }, + { + "epoch": 0.15353729062415725, + "grad_norm": 3.431586742401123, + "learning_rate": 1.9724281683708593e-07, + "loss": 0.0236, + "step": 38430 + }, + { + "epoch": 0.15357724308073392, + "grad_norm": 4.138676643371582, + "learning_rate": 1.972413502047297e-07, + "loss": 0.0227, + "step": 38440 + }, + { + "epoch": 0.1536171955373106, + "grad_norm": 4.326998710632324, + "learning_rate": 1.972398831878589e-07, + "loss": 0.0212, + "step": 38450 + }, + { + "epoch": 0.15365714799388727, + "grad_norm": 4.3127923011779785, + "learning_rate": 1.972384157864793e-07, + "loss": 0.0241, + "step": 38460 + }, + { + "epoch": 0.15369710045046395, + "grad_norm": 3.349456787109375, + "learning_rate": 1.972369480005967e-07, + "loss": 0.0242, + "step": 38470 + }, + { + "epoch": 0.15373705290704062, + "grad_norm": 7.216701984405518, + "learning_rate": 1.9723547983021696e-07, + "loss": 0.0223, + "step": 38480 + }, + { + "epoch": 0.1537770053636173, + "grad_norm": 4.561606407165527, + "learning_rate": 1.972340112753458e-07, + "loss": 0.0241, + "step": 38490 + }, + { + "epoch": 0.15381695782019397, + "grad_norm": 3.2638678550720215, + "learning_rate": 1.9723254233598913e-07, + "loss": 0.0197, + "step": 38500 + }, + { + "epoch": 0.15385691027677065, + "grad_norm": 1.8726704120635986, + "learning_rate": 1.9723107301215265e-07, + "loss": 0.0222, + "step": 38510 + }, + { + "epoch": 0.15389686273334732, + "grad_norm": 4.464521408081055, + "learning_rate": 1.9722960330384225e-07, + "loss": 0.0221, + "step": 38520 + }, + { + "epoch": 0.153936815189924, + "grad_norm": 5.9991135597229, + "learning_rate": 1.9722813321106367e-07, + "loss": 0.0263, + "step": 38530 + }, + { + "epoch": 0.15397676764650067, + "grad_norm": 32.1702880859375, + "learning_rate": 1.9722666273382283e-07, + "loss": 0.0232, + "step": 38540 + }, + { + "epoch": 0.15401672010307735, + "grad_norm": 2.5080580711364746, + "learning_rate": 1.9722519187212545e-07, + "loss": 0.0185, + "step": 38550 + }, + { + "epoch": 0.15405667255965402, + "grad_norm": 7.813970565795898, + "learning_rate": 1.9722372062597737e-07, + "loss": 0.0258, + "step": 38560 + }, + { + "epoch": 0.1540966250162307, + "grad_norm": 120.81849670410156, + "learning_rate": 1.972222489953844e-07, + "loss": 0.0266, + "step": 38570 + }, + { + "epoch": 0.15413657747280735, + "grad_norm": 2.92917799949646, + "learning_rate": 1.972207769803524e-07, + "loss": 0.0232, + "step": 38580 + }, + { + "epoch": 0.15417652992938402, + "grad_norm": 4.512289047241211, + "learning_rate": 1.9721930458088715e-07, + "loss": 0.0208, + "step": 38590 + }, + { + "epoch": 0.1542164823859607, + "grad_norm": 8.276458740234375, + "learning_rate": 1.9721783179699448e-07, + "loss": 0.0192, + "step": 38600 + }, + { + "epoch": 0.15425643484253737, + "grad_norm": 3.7317261695861816, + "learning_rate": 1.9721635862868023e-07, + "loss": 0.0208, + "step": 38610 + }, + { + "epoch": 0.15429638729911405, + "grad_norm": 8.720733642578125, + "learning_rate": 1.972148850759502e-07, + "loss": 0.0202, + "step": 38620 + }, + { + "epoch": 0.15433633975569072, + "grad_norm": 4.096574306488037, + "learning_rate": 1.9721341113881027e-07, + "loss": 0.0277, + "step": 38630 + }, + { + "epoch": 0.1543762922122674, + "grad_norm": 2.7084805965423584, + "learning_rate": 1.972119368172662e-07, + "loss": 0.0223, + "step": 38640 + }, + { + "epoch": 0.15441624466884407, + "grad_norm": 4.848781108856201, + "learning_rate": 1.9721046211132386e-07, + "loss": 0.0254, + "step": 38650 + }, + { + "epoch": 0.15445619712542075, + "grad_norm": 2.091804265975952, + "learning_rate": 1.9720898702098906e-07, + "loss": 0.02, + "step": 38660 + }, + { + "epoch": 0.15449614958199742, + "grad_norm": 10.902840614318848, + "learning_rate": 1.9720751154626764e-07, + "loss": 0.0236, + "step": 38670 + }, + { + "epoch": 0.1545361020385741, + "grad_norm": 3.815249443054199, + "learning_rate": 1.9720603568716548e-07, + "loss": 0.023, + "step": 38680 + }, + { + "epoch": 0.15457605449515077, + "grad_norm": 9.016788482666016, + "learning_rate": 1.972045594436883e-07, + "loss": 0.0255, + "step": 38690 + }, + { + "epoch": 0.15461600695172745, + "grad_norm": 4.396219253540039, + "learning_rate": 1.9720308281584207e-07, + "loss": 0.0212, + "step": 38700 + }, + { + "epoch": 0.15465595940830412, + "grad_norm": 4.453922748565674, + "learning_rate": 1.9720160580363256e-07, + "loss": 0.0284, + "step": 38710 + }, + { + "epoch": 0.1546959118648808, + "grad_norm": 3.9673054218292236, + "learning_rate": 1.9720012840706562e-07, + "loss": 0.0211, + "step": 38720 + }, + { + "epoch": 0.15473586432145747, + "grad_norm": 4.616243362426758, + "learning_rate": 1.9719865062614708e-07, + "loss": 0.0267, + "step": 38730 + }, + { + "epoch": 0.15477581677803415, + "grad_norm": 4.53894567489624, + "learning_rate": 1.971971724608828e-07, + "loss": 0.0231, + "step": 38740 + }, + { + "epoch": 0.15481576923461082, + "grad_norm": 3.6283047199249268, + "learning_rate": 1.9719569391127864e-07, + "loss": 0.0184, + "step": 38750 + }, + { + "epoch": 0.1548557216911875, + "grad_norm": 4.011991024017334, + "learning_rate": 1.971942149773404e-07, + "loss": 0.0206, + "step": 38760 + }, + { + "epoch": 0.15489567414776417, + "grad_norm": 3.1889917850494385, + "learning_rate": 1.9719273565907398e-07, + "loss": 0.023, + "step": 38770 + }, + { + "epoch": 0.15493562660434085, + "grad_norm": 5.600579261779785, + "learning_rate": 1.9719125595648518e-07, + "loss": 0.0203, + "step": 38780 + }, + { + "epoch": 0.1549755790609175, + "grad_norm": 4.470995903015137, + "learning_rate": 1.971897758695799e-07, + "loss": 0.0231, + "step": 38790 + }, + { + "epoch": 0.15501553151749417, + "grad_norm": 5.013420581817627, + "learning_rate": 1.9718829539836395e-07, + "loss": 0.0264, + "step": 38800 + }, + { + "epoch": 0.15505548397407085, + "grad_norm": 6.9627604484558105, + "learning_rate": 1.9718681454284324e-07, + "loss": 0.0233, + "step": 38810 + }, + { + "epoch": 0.15509543643064752, + "grad_norm": 3.3349499702453613, + "learning_rate": 1.9718533330302357e-07, + "loss": 0.0257, + "step": 38820 + }, + { + "epoch": 0.1551353888872242, + "grad_norm": 37.19544219970703, + "learning_rate": 1.971838516789108e-07, + "loss": 0.0228, + "step": 38830 + }, + { + "epoch": 0.15517534134380087, + "grad_norm": 4.100769996643066, + "learning_rate": 1.971823696705108e-07, + "loss": 0.0198, + "step": 38840 + }, + { + "epoch": 0.15521529380037755, + "grad_norm": 1.9755911827087402, + "learning_rate": 1.9718088727782944e-07, + "loss": 0.0258, + "step": 38850 + }, + { + "epoch": 0.15525524625695422, + "grad_norm": 2.4214062690734863, + "learning_rate": 1.971794045008726e-07, + "loss": 0.0249, + "step": 38860 + }, + { + "epoch": 0.1552951987135309, + "grad_norm": 1.6134446859359741, + "learning_rate": 1.971779213396461e-07, + "loss": 0.0272, + "step": 38870 + }, + { + "epoch": 0.15533515117010757, + "grad_norm": 5.046985149383545, + "learning_rate": 1.9717643779415582e-07, + "loss": 0.021, + "step": 38880 + }, + { + "epoch": 0.15537510362668425, + "grad_norm": 3.195880174636841, + "learning_rate": 1.9717495386440764e-07, + "loss": 0.0207, + "step": 38890 + }, + { + "epoch": 0.15541505608326092, + "grad_norm": 4.505516052246094, + "learning_rate": 1.9717346955040743e-07, + "loss": 0.0231, + "step": 38900 + }, + { + "epoch": 0.1554550085398376, + "grad_norm": 4.07151985168457, + "learning_rate": 1.9717198485216105e-07, + "loss": 0.0204, + "step": 38910 + }, + { + "epoch": 0.15549496099641427, + "grad_norm": 8.789405822753906, + "learning_rate": 1.9717049976967435e-07, + "loss": 0.0237, + "step": 38920 + }, + { + "epoch": 0.15553491345299095, + "grad_norm": 3.184966802597046, + "learning_rate": 1.9716901430295323e-07, + "loss": 0.0221, + "step": 38930 + }, + { + "epoch": 0.15557486590956762, + "grad_norm": 9.441816329956055, + "learning_rate": 1.9716752845200353e-07, + "loss": 0.0232, + "step": 38940 + }, + { + "epoch": 0.1556148183661443, + "grad_norm": 1.5350779294967651, + "learning_rate": 1.9716604221683117e-07, + "loss": 0.0285, + "step": 38950 + }, + { + "epoch": 0.15565477082272097, + "grad_norm": 3.6117708683013916, + "learning_rate": 1.9716455559744202e-07, + "loss": 0.0274, + "step": 38960 + }, + { + "epoch": 0.15569472327929765, + "grad_norm": 3.150804281234741, + "learning_rate": 1.9716306859384195e-07, + "loss": 0.0211, + "step": 38970 + }, + { + "epoch": 0.15573467573587432, + "grad_norm": 1.9973138570785522, + "learning_rate": 1.971615812060368e-07, + "loss": 0.025, + "step": 38980 + }, + { + "epoch": 0.15577462819245097, + "grad_norm": 10.898021697998047, + "learning_rate": 1.971600934340325e-07, + "loss": 0.0212, + "step": 38990 + }, + { + "epoch": 0.15581458064902765, + "grad_norm": 1.5193637609481812, + "learning_rate": 1.9715860527783495e-07, + "loss": 0.0215, + "step": 39000 + }, + { + "epoch": 0.15585453310560432, + "grad_norm": 5.284668445587158, + "learning_rate": 1.9715711673744999e-07, + "loss": 0.0221, + "step": 39010 + }, + { + "epoch": 0.155894485562181, + "grad_norm": 9.259320259094238, + "learning_rate": 1.971556278128835e-07, + "loss": 0.0202, + "step": 39020 + }, + { + "epoch": 0.15593443801875767, + "grad_norm": 3.5339250564575195, + "learning_rate": 1.9715413850414143e-07, + "loss": 0.0206, + "step": 39030 + }, + { + "epoch": 0.15597439047533435, + "grad_norm": 3.2493772506713867, + "learning_rate": 1.971526488112296e-07, + "loss": 0.021, + "step": 39040 + }, + { + "epoch": 0.15601434293191102, + "grad_norm": 5.91404914855957, + "learning_rate": 1.9715115873415393e-07, + "loss": 0.0245, + "step": 39050 + }, + { + "epoch": 0.1560542953884877, + "grad_norm": 5.419095516204834, + "learning_rate": 1.971496682729203e-07, + "loss": 0.0244, + "step": 39060 + }, + { + "epoch": 0.15609424784506437, + "grad_norm": 10.791189193725586, + "learning_rate": 1.9714817742753463e-07, + "loss": 0.0211, + "step": 39070 + }, + { + "epoch": 0.15613420030164105, + "grad_norm": 1.8458092212677002, + "learning_rate": 1.9714668619800278e-07, + "loss": 0.0256, + "step": 39080 + }, + { + "epoch": 0.15617415275821772, + "grad_norm": 2.3905279636383057, + "learning_rate": 1.9714519458433068e-07, + "loss": 0.0249, + "step": 39090 + }, + { + "epoch": 0.1562141052147944, + "grad_norm": 1.004303216934204, + "learning_rate": 1.9714370258652422e-07, + "loss": 0.023, + "step": 39100 + }, + { + "epoch": 0.15625405767137107, + "grad_norm": 4.17534875869751, + "learning_rate": 1.971422102045893e-07, + "loss": 0.0232, + "step": 39110 + }, + { + "epoch": 0.15629401012794775, + "grad_norm": 3.5859532356262207, + "learning_rate": 1.9714071743853178e-07, + "loss": 0.021, + "step": 39120 + }, + { + "epoch": 0.15633396258452442, + "grad_norm": 2.1898622512817383, + "learning_rate": 1.9713922428835762e-07, + "loss": 0.0198, + "step": 39130 + }, + { + "epoch": 0.1563739150411011, + "grad_norm": 7.1107563972473145, + "learning_rate": 1.971377307540727e-07, + "loss": 0.029, + "step": 39140 + }, + { + "epoch": 0.15641386749767777, + "grad_norm": 3.256678342819214, + "learning_rate": 1.9713623683568293e-07, + "loss": 0.0198, + "step": 39150 + }, + { + "epoch": 0.15645381995425445, + "grad_norm": 7.857561111450195, + "learning_rate": 1.9713474253319423e-07, + "loss": 0.0251, + "step": 39160 + }, + { + "epoch": 0.15649377241083112, + "grad_norm": 3.5520288944244385, + "learning_rate": 1.9713324784661244e-07, + "loss": 0.021, + "step": 39170 + }, + { + "epoch": 0.1565337248674078, + "grad_norm": 23.600582122802734, + "learning_rate": 1.9713175277594356e-07, + "loss": 0.0217, + "step": 39180 + }, + { + "epoch": 0.15657367732398444, + "grad_norm": 6.893317222595215, + "learning_rate": 1.971302573211935e-07, + "loss": 0.0206, + "step": 39190 + }, + { + "epoch": 0.15661362978056112, + "grad_norm": 2.555112361907959, + "learning_rate": 1.9712876148236808e-07, + "loss": 0.0203, + "step": 39200 + }, + { + "epoch": 0.1566535822371378, + "grad_norm": 3.2724525928497314, + "learning_rate": 1.971272652594733e-07, + "loss": 0.0271, + "step": 39210 + }, + { + "epoch": 0.15669353469371447, + "grad_norm": 4.291676998138428, + "learning_rate": 1.9712576865251505e-07, + "loss": 0.0182, + "step": 39220 + }, + { + "epoch": 0.15673348715029115, + "grad_norm": 2.979619026184082, + "learning_rate": 1.9712427166149924e-07, + "loss": 0.0248, + "step": 39230 + }, + { + "epoch": 0.15677343960686782, + "grad_norm": 3.387373208999634, + "learning_rate": 1.971227742864318e-07, + "loss": 0.0253, + "step": 39240 + }, + { + "epoch": 0.1568133920634445, + "grad_norm": 9.374722480773926, + "learning_rate": 1.9712127652731864e-07, + "loss": 0.0238, + "step": 39250 + }, + { + "epoch": 0.15685334452002117, + "grad_norm": 1.6784944534301758, + "learning_rate": 1.9711977838416573e-07, + "loss": 0.0217, + "step": 39260 + }, + { + "epoch": 0.15689329697659785, + "grad_norm": 12.254950523376465, + "learning_rate": 1.971182798569789e-07, + "loss": 0.0223, + "step": 39270 + }, + { + "epoch": 0.15693324943317452, + "grad_norm": 3.1751394271850586, + "learning_rate": 1.971167809457642e-07, + "loss": 0.0204, + "step": 39280 + }, + { + "epoch": 0.1569732018897512, + "grad_norm": 3.1382617950439453, + "learning_rate": 1.9711528165052743e-07, + "loss": 0.0224, + "step": 39290 + }, + { + "epoch": 0.15701315434632787, + "grad_norm": 4.31651496887207, + "learning_rate": 1.971137819712746e-07, + "loss": 0.0211, + "step": 39300 + }, + { + "epoch": 0.15705310680290455, + "grad_norm": 7.776539325714111, + "learning_rate": 1.971122819080116e-07, + "loss": 0.0244, + "step": 39310 + }, + { + "epoch": 0.15709305925948122, + "grad_norm": 3.381805658340454, + "learning_rate": 1.971107814607444e-07, + "loss": 0.0234, + "step": 39320 + }, + { + "epoch": 0.1571330117160579, + "grad_norm": 2.3123440742492676, + "learning_rate": 1.9710928062947892e-07, + "loss": 0.0214, + "step": 39330 + }, + { + "epoch": 0.15717296417263457, + "grad_norm": 3.564122438430786, + "learning_rate": 1.9710777941422104e-07, + "loss": 0.0213, + "step": 39340 + }, + { + "epoch": 0.15721291662921125, + "grad_norm": 3.6023926734924316, + "learning_rate": 1.9710627781497676e-07, + "loss": 0.026, + "step": 39350 + }, + { + "epoch": 0.15725286908578792, + "grad_norm": 12.803044319152832, + "learning_rate": 1.9710477583175202e-07, + "loss": 0.025, + "step": 39360 + }, + { + "epoch": 0.1572928215423646, + "grad_norm": 4.296139717102051, + "learning_rate": 1.9710327346455273e-07, + "loss": 0.022, + "step": 39370 + }, + { + "epoch": 0.15733277399894127, + "grad_norm": 4.302359104156494, + "learning_rate": 1.9710177071338483e-07, + "loss": 0.0207, + "step": 39380 + }, + { + "epoch": 0.15737272645551795, + "grad_norm": 6.029001712799072, + "learning_rate": 1.9710026757825428e-07, + "loss": 0.0276, + "step": 39390 + }, + { + "epoch": 0.1574126789120946, + "grad_norm": 2.6727793216705322, + "learning_rate": 1.97098764059167e-07, + "loss": 0.0215, + "step": 39400 + }, + { + "epoch": 0.15745263136867127, + "grad_norm": 3.550934314727783, + "learning_rate": 1.9709726015612898e-07, + "loss": 0.0228, + "step": 39410 + }, + { + "epoch": 0.15749258382524794, + "grad_norm": 3.5140600204467773, + "learning_rate": 1.9709575586914614e-07, + "loss": 0.0212, + "step": 39420 + }, + { + "epoch": 0.15753253628182462, + "grad_norm": 3.4382729530334473, + "learning_rate": 1.970942511982244e-07, + "loss": 0.0238, + "step": 39430 + }, + { + "epoch": 0.1575724887384013, + "grad_norm": 2.6356465816497803, + "learning_rate": 1.9709274614336973e-07, + "loss": 0.025, + "step": 39440 + }, + { + "epoch": 0.15761244119497797, + "grad_norm": 4.728875160217285, + "learning_rate": 1.970912407045881e-07, + "loss": 0.0231, + "step": 39450 + }, + { + "epoch": 0.15765239365155465, + "grad_norm": 15.280255317687988, + "learning_rate": 1.9708973488188546e-07, + "loss": 0.0209, + "step": 39460 + }, + { + "epoch": 0.15769234610813132, + "grad_norm": 2.8932087421417236, + "learning_rate": 1.9708822867526772e-07, + "loss": 0.0229, + "step": 39470 + }, + { + "epoch": 0.157732298564708, + "grad_norm": 4.973537445068359, + "learning_rate": 1.970867220847409e-07, + "loss": 0.0253, + "step": 39480 + }, + { + "epoch": 0.15777225102128467, + "grad_norm": 1.9493350982666016, + "learning_rate": 1.9708521511031093e-07, + "loss": 0.0206, + "step": 39490 + }, + { + "epoch": 0.15781220347786135, + "grad_norm": 4.8284759521484375, + "learning_rate": 1.9708370775198374e-07, + "loss": 0.0202, + "step": 39500 + }, + { + "epoch": 0.15785215593443802, + "grad_norm": 7.249011039733887, + "learning_rate": 1.9708220000976534e-07, + "loss": 0.0215, + "step": 39510 + }, + { + "epoch": 0.1578921083910147, + "grad_norm": 4.1237592697143555, + "learning_rate": 1.9708069188366168e-07, + "loss": 0.0241, + "step": 39520 + }, + { + "epoch": 0.15793206084759137, + "grad_norm": 4.906115531921387, + "learning_rate": 1.9707918337367869e-07, + "loss": 0.023, + "step": 39530 + }, + { + "epoch": 0.15797201330416805, + "grad_norm": 3.861283779144287, + "learning_rate": 1.9707767447982235e-07, + "loss": 0.0239, + "step": 39540 + }, + { + "epoch": 0.15801196576074472, + "grad_norm": 4.640395641326904, + "learning_rate": 1.9707616520209865e-07, + "loss": 0.0251, + "step": 39550 + }, + { + "epoch": 0.1580519182173214, + "grad_norm": 7.53455924987793, + "learning_rate": 1.9707465554051354e-07, + "loss": 0.0195, + "step": 39560 + }, + { + "epoch": 0.15809187067389807, + "grad_norm": 2.3979086875915527, + "learning_rate": 1.97073145495073e-07, + "loss": 0.0214, + "step": 39570 + }, + { + "epoch": 0.15813182313047475, + "grad_norm": 5.717700481414795, + "learning_rate": 1.9707163506578297e-07, + "loss": 0.0222, + "step": 39580 + }, + { + "epoch": 0.15817177558705142, + "grad_norm": 5.098435401916504, + "learning_rate": 1.9707012425264947e-07, + "loss": 0.0201, + "step": 39590 + }, + { + "epoch": 0.15821172804362807, + "grad_norm": 2.338484048843384, + "learning_rate": 1.9706861305567842e-07, + "loss": 0.0252, + "step": 39600 + }, + { + "epoch": 0.15825168050020474, + "grad_norm": 2.051520347595215, + "learning_rate": 1.9706710147487584e-07, + "loss": 0.0187, + "step": 39610 + }, + { + "epoch": 0.15829163295678142, + "grad_norm": 4.96392297744751, + "learning_rate": 1.970655895102477e-07, + "loss": 0.0216, + "step": 39620 + }, + { + "epoch": 0.1583315854133581, + "grad_norm": 5.377878665924072, + "learning_rate": 1.9706407716179995e-07, + "loss": 0.0191, + "step": 39630 + }, + { + "epoch": 0.15837153786993477, + "grad_norm": 4.726428985595703, + "learning_rate": 1.9706256442953862e-07, + "loss": 0.0232, + "step": 39640 + }, + { + "epoch": 0.15841149032651144, + "grad_norm": 1.9428244829177856, + "learning_rate": 1.9706105131346963e-07, + "loss": 0.0166, + "step": 39650 + }, + { + "epoch": 0.15845144278308812, + "grad_norm": 2.7281582355499268, + "learning_rate": 1.9705953781359903e-07, + "loss": 0.0231, + "step": 39660 + }, + { + "epoch": 0.1584913952396648, + "grad_norm": 1.1287589073181152, + "learning_rate": 1.9705802392993274e-07, + "loss": 0.0246, + "step": 39670 + }, + { + "epoch": 0.15853134769624147, + "grad_norm": 5.90299654006958, + "learning_rate": 1.9705650966247678e-07, + "loss": 0.0216, + "step": 39680 + }, + { + "epoch": 0.15857130015281815, + "grad_norm": 3.3538994789123535, + "learning_rate": 1.9705499501123716e-07, + "loss": 0.0243, + "step": 39690 + }, + { + "epoch": 0.15861125260939482, + "grad_norm": 3.165130853652954, + "learning_rate": 1.9705347997621982e-07, + "loss": 0.0262, + "step": 39700 + }, + { + "epoch": 0.1586512050659715, + "grad_norm": 1.999441385269165, + "learning_rate": 1.9705196455743078e-07, + "loss": 0.0225, + "step": 39710 + }, + { + "epoch": 0.15869115752254817, + "grad_norm": 4.9700775146484375, + "learning_rate": 1.9705044875487602e-07, + "loss": 0.0218, + "step": 39720 + }, + { + "epoch": 0.15873110997912485, + "grad_norm": 3.537006139755249, + "learning_rate": 1.9704893256856155e-07, + "loss": 0.0175, + "step": 39730 + }, + { + "epoch": 0.15877106243570152, + "grad_norm": 3.062613010406494, + "learning_rate": 1.9704741599849334e-07, + "loss": 0.023, + "step": 39740 + }, + { + "epoch": 0.1588110148922782, + "grad_norm": 4.246539115905762, + "learning_rate": 1.9704589904467742e-07, + "loss": 0.0198, + "step": 39750 + }, + { + "epoch": 0.15885096734885487, + "grad_norm": 12.157230377197266, + "learning_rate": 1.9704438170711976e-07, + "loss": 0.0222, + "step": 39760 + }, + { + "epoch": 0.15889091980543155, + "grad_norm": 17.683080673217773, + "learning_rate": 1.970428639858264e-07, + "loss": 0.0211, + "step": 39770 + }, + { + "epoch": 0.15893087226200822, + "grad_norm": 2.209165096282959, + "learning_rate": 1.970413458808033e-07, + "loss": 0.0223, + "step": 39780 + }, + { + "epoch": 0.1589708247185849, + "grad_norm": 8.604565620422363, + "learning_rate": 1.9703982739205646e-07, + "loss": 0.024, + "step": 39790 + }, + { + "epoch": 0.15901077717516154, + "grad_norm": 3.118241786956787, + "learning_rate": 1.970383085195919e-07, + "loss": 0.0204, + "step": 39800 + }, + { + "epoch": 0.15905072963173822, + "grad_norm": 1.9785290956497192, + "learning_rate": 1.9703678926341562e-07, + "loss": 0.0233, + "step": 39810 + }, + { + "epoch": 0.1590906820883149, + "grad_norm": 3.0680692195892334, + "learning_rate": 1.9703526962353366e-07, + "loss": 0.0261, + "step": 39820 + }, + { + "epoch": 0.15913063454489157, + "grad_norm": 3.502997875213623, + "learning_rate": 1.9703374959995199e-07, + "loss": 0.0217, + "step": 39830 + }, + { + "epoch": 0.15917058700146824, + "grad_norm": 5.576287269592285, + "learning_rate": 1.9703222919267662e-07, + "loss": 0.0238, + "step": 39840 + }, + { + "epoch": 0.15921053945804492, + "grad_norm": 6.856359958648682, + "learning_rate": 1.970307084017136e-07, + "loss": 0.0213, + "step": 39850 + }, + { + "epoch": 0.1592504919146216, + "grad_norm": 5.653652191162109, + "learning_rate": 1.970291872270689e-07, + "loss": 0.0217, + "step": 39860 + }, + { + "epoch": 0.15929044437119827, + "grad_norm": 7.657471179962158, + "learning_rate": 1.9702766566874854e-07, + "loss": 0.0227, + "step": 39870 + }, + { + "epoch": 0.15933039682777494, + "grad_norm": 3.684795379638672, + "learning_rate": 1.9702614372675855e-07, + "loss": 0.0227, + "step": 39880 + }, + { + "epoch": 0.15937034928435162, + "grad_norm": 2.51953125, + "learning_rate": 1.97024621401105e-07, + "loss": 0.024, + "step": 39890 + }, + { + "epoch": 0.1594103017409283, + "grad_norm": 4.294746398925781, + "learning_rate": 1.9702309869179382e-07, + "loss": 0.0273, + "step": 39900 + }, + { + "epoch": 0.15945025419750497, + "grad_norm": 2.1882917881011963, + "learning_rate": 1.9702157559883106e-07, + "loss": 0.0228, + "step": 39910 + }, + { + "epoch": 0.15949020665408165, + "grad_norm": 2.6875391006469727, + "learning_rate": 1.9702005212222278e-07, + "loss": 0.0233, + "step": 39920 + }, + { + "epoch": 0.15953015911065832, + "grad_norm": 2.1031572818756104, + "learning_rate": 1.9701852826197494e-07, + "loss": 0.0233, + "step": 39930 + }, + { + "epoch": 0.159570111567235, + "grad_norm": 6.24700927734375, + "learning_rate": 1.9701700401809361e-07, + "loss": 0.022, + "step": 39940 + }, + { + "epoch": 0.15961006402381167, + "grad_norm": 8.563042640686035, + "learning_rate": 1.9701547939058482e-07, + "loss": 0.0218, + "step": 39950 + }, + { + "epoch": 0.15965001648038835, + "grad_norm": 2.9382145404815674, + "learning_rate": 1.9701395437945457e-07, + "loss": 0.0245, + "step": 39960 + }, + { + "epoch": 0.15968996893696502, + "grad_norm": 4.2750163078308105, + "learning_rate": 1.9701242898470895e-07, + "loss": 0.0209, + "step": 39970 + }, + { + "epoch": 0.1597299213935417, + "grad_norm": 2.5046634674072266, + "learning_rate": 1.970109032063539e-07, + "loss": 0.0215, + "step": 39980 + }, + { + "epoch": 0.15976987385011837, + "grad_norm": 15.04212760925293, + "learning_rate": 1.9700937704439554e-07, + "loss": 0.0217, + "step": 39990 + }, + { + "epoch": 0.15980982630669505, + "grad_norm": 7.202577590942383, + "learning_rate": 1.9700785049883986e-07, + "loss": 0.0245, + "step": 40000 + }, + { + "epoch": 0.1598497787632717, + "grad_norm": 5.294686317443848, + "learning_rate": 1.970063235696929e-07, + "loss": 0.0221, + "step": 40010 + }, + { + "epoch": 0.15988973121984837, + "grad_norm": 8.199002265930176, + "learning_rate": 1.970047962569607e-07, + "loss": 0.0261, + "step": 40020 + }, + { + "epoch": 0.15992968367642504, + "grad_norm": 2.0829155445098877, + "learning_rate": 1.970032685606493e-07, + "loss": 0.0242, + "step": 40030 + }, + { + "epoch": 0.15996963613300172, + "grad_norm": 7.519226551055908, + "learning_rate": 1.9700174048076472e-07, + "loss": 0.0275, + "step": 40040 + }, + { + "epoch": 0.1600095885895784, + "grad_norm": 3.9898955821990967, + "learning_rate": 1.9700021201731304e-07, + "loss": 0.0238, + "step": 40050 + }, + { + "epoch": 0.16004954104615507, + "grad_norm": 3.445028781890869, + "learning_rate": 1.9699868317030033e-07, + "loss": 0.0213, + "step": 40060 + }, + { + "epoch": 0.16008949350273174, + "grad_norm": 3.890197992324829, + "learning_rate": 1.9699715393973255e-07, + "loss": 0.0241, + "step": 40070 + }, + { + "epoch": 0.16012944595930842, + "grad_norm": 4.831511974334717, + "learning_rate": 1.9699562432561578e-07, + "loss": 0.0249, + "step": 40080 + }, + { + "epoch": 0.1601693984158851, + "grad_norm": 2.054342031478882, + "learning_rate": 1.9699409432795612e-07, + "loss": 0.0221, + "step": 40090 + }, + { + "epoch": 0.16020935087246177, + "grad_norm": 4.836823463439941, + "learning_rate": 1.9699256394675953e-07, + "loss": 0.0209, + "step": 40100 + }, + { + "epoch": 0.16024930332903844, + "grad_norm": 5.7656331062316895, + "learning_rate": 1.9699103318203215e-07, + "loss": 0.0249, + "step": 40110 + }, + { + "epoch": 0.16028925578561512, + "grad_norm": 2.737917184829712, + "learning_rate": 1.9698950203377998e-07, + "loss": 0.0221, + "step": 40120 + }, + { + "epoch": 0.1603292082421918, + "grad_norm": 2.534008741378784, + "learning_rate": 1.9698797050200907e-07, + "loss": 0.0236, + "step": 40130 + }, + { + "epoch": 0.16036916069876847, + "grad_norm": 3.920358419418335, + "learning_rate": 1.9698643858672552e-07, + "loss": 0.0237, + "step": 40140 + }, + { + "epoch": 0.16040911315534515, + "grad_norm": 2.693265676498413, + "learning_rate": 1.9698490628793537e-07, + "loss": 0.0198, + "step": 40150 + }, + { + "epoch": 0.16044906561192182, + "grad_norm": 4.421778202056885, + "learning_rate": 1.9698337360564464e-07, + "loss": 0.0241, + "step": 40160 + }, + { + "epoch": 0.1604890180684985, + "grad_norm": 3.8917696475982666, + "learning_rate": 1.9698184053985943e-07, + "loss": 0.0229, + "step": 40170 + }, + { + "epoch": 0.16052897052507517, + "grad_norm": 3.6518454551696777, + "learning_rate": 1.969803070905858e-07, + "loss": 0.0223, + "step": 40180 + }, + { + "epoch": 0.16056892298165185, + "grad_norm": 4.952287197113037, + "learning_rate": 1.9697877325782982e-07, + "loss": 0.0243, + "step": 40190 + }, + { + "epoch": 0.16060887543822852, + "grad_norm": 3.7513349056243896, + "learning_rate": 1.969772390415975e-07, + "loss": 0.0217, + "step": 40200 + }, + { + "epoch": 0.16064882789480517, + "grad_norm": 1.6722077131271362, + "learning_rate": 1.9697570444189499e-07, + "loss": 0.0235, + "step": 40210 + }, + { + "epoch": 0.16068878035138184, + "grad_norm": 3.772930145263672, + "learning_rate": 1.969741694587283e-07, + "loss": 0.019, + "step": 40220 + }, + { + "epoch": 0.16072873280795852, + "grad_norm": 4.130879878997803, + "learning_rate": 1.969726340921035e-07, + "loss": 0.0247, + "step": 40230 + }, + { + "epoch": 0.1607686852645352, + "grad_norm": 7.951900005340576, + "learning_rate": 1.969710983420267e-07, + "loss": 0.0206, + "step": 40240 + }, + { + "epoch": 0.16080863772111187, + "grad_norm": 2.222496271133423, + "learning_rate": 1.9696956220850393e-07, + "loss": 0.0241, + "step": 40250 + }, + { + "epoch": 0.16084859017768854, + "grad_norm": 7.225752353668213, + "learning_rate": 1.9696802569154131e-07, + "loss": 0.0249, + "step": 40260 + }, + { + "epoch": 0.16088854263426522, + "grad_norm": 6.75662899017334, + "learning_rate": 1.9696648879114488e-07, + "loss": 0.0209, + "step": 40270 + }, + { + "epoch": 0.1609284950908419, + "grad_norm": 4.944123268127441, + "learning_rate": 1.9696495150732072e-07, + "loss": 0.0233, + "step": 40280 + }, + { + "epoch": 0.16096844754741857, + "grad_norm": 3.5615134239196777, + "learning_rate": 1.9696341384007492e-07, + "loss": 0.0231, + "step": 40290 + }, + { + "epoch": 0.16100840000399524, + "grad_norm": 3.7751715183258057, + "learning_rate": 1.9696187578941357e-07, + "loss": 0.0204, + "step": 40300 + }, + { + "epoch": 0.16104835246057192, + "grad_norm": 5.3341965675354, + "learning_rate": 1.9696033735534273e-07, + "loss": 0.0236, + "step": 40310 + }, + { + "epoch": 0.1610883049171486, + "grad_norm": 4.382688045501709, + "learning_rate": 1.9695879853786848e-07, + "loss": 0.0231, + "step": 40320 + }, + { + "epoch": 0.16112825737372527, + "grad_norm": 3.0303759574890137, + "learning_rate": 1.9695725933699692e-07, + "loss": 0.0239, + "step": 40330 + }, + { + "epoch": 0.16116820983030195, + "grad_norm": 5.70577335357666, + "learning_rate": 1.9695571975273414e-07, + "loss": 0.0191, + "step": 40340 + }, + { + "epoch": 0.16120816228687862, + "grad_norm": 3.267605781555176, + "learning_rate": 1.9695417978508622e-07, + "loss": 0.0215, + "step": 40350 + }, + { + "epoch": 0.1612481147434553, + "grad_norm": 8.694793701171875, + "learning_rate": 1.9695263943405927e-07, + "loss": 0.0185, + "step": 40360 + }, + { + "epoch": 0.16128806720003197, + "grad_norm": 2.526245355606079, + "learning_rate": 1.9695109869965934e-07, + "loss": 0.0202, + "step": 40370 + }, + { + "epoch": 0.16132801965660865, + "grad_norm": 2.321990489959717, + "learning_rate": 1.969495575818925e-07, + "loss": 0.0221, + "step": 40380 + }, + { + "epoch": 0.16136797211318532, + "grad_norm": 6.241804122924805, + "learning_rate": 1.9694801608076496e-07, + "loss": 0.0218, + "step": 40390 + }, + { + "epoch": 0.161407924569762, + "grad_norm": 3.6177451610565186, + "learning_rate": 1.969464741962827e-07, + "loss": 0.023, + "step": 40400 + }, + { + "epoch": 0.16144787702633864, + "grad_norm": 6.285497188568115, + "learning_rate": 1.9694493192845188e-07, + "loss": 0.0219, + "step": 40410 + }, + { + "epoch": 0.16148782948291532, + "grad_norm": 2.3659868240356445, + "learning_rate": 1.969433892772786e-07, + "loss": 0.02, + "step": 40420 + }, + { + "epoch": 0.161527781939492, + "grad_norm": 6.68261194229126, + "learning_rate": 1.969418462427689e-07, + "loss": 0.0252, + "step": 40430 + }, + { + "epoch": 0.16156773439606867, + "grad_norm": 3.1467864513397217, + "learning_rate": 1.969403028249289e-07, + "loss": 0.0206, + "step": 40440 + }, + { + "epoch": 0.16160768685264534, + "grad_norm": 2.9065823554992676, + "learning_rate": 1.9693875902376477e-07, + "loss": 0.0208, + "step": 40450 + }, + { + "epoch": 0.16164763930922202, + "grad_norm": 2.3695831298828125, + "learning_rate": 1.9693721483928254e-07, + "loss": 0.0237, + "step": 40460 + }, + { + "epoch": 0.1616875917657987, + "grad_norm": 2.780733823776245, + "learning_rate": 1.9693567027148833e-07, + "loss": 0.024, + "step": 40470 + }, + { + "epoch": 0.16172754422237537, + "grad_norm": 2.5985350608825684, + "learning_rate": 1.9693412532038832e-07, + "loss": 0.021, + "step": 40480 + }, + { + "epoch": 0.16176749667895204, + "grad_norm": 4.858410835266113, + "learning_rate": 1.969325799859885e-07, + "loss": 0.0241, + "step": 40490 + }, + { + "epoch": 0.16180744913552872, + "grad_norm": 1.5437614917755127, + "learning_rate": 1.9693103426829506e-07, + "loss": 0.0227, + "step": 40500 + }, + { + "epoch": 0.1618474015921054, + "grad_norm": 3.9599196910858154, + "learning_rate": 1.9692948816731413e-07, + "loss": 0.0251, + "step": 40510 + }, + { + "epoch": 0.16188735404868207, + "grad_norm": 2.4407496452331543, + "learning_rate": 1.9692794168305174e-07, + "loss": 0.0228, + "step": 40520 + }, + { + "epoch": 0.16192730650525874, + "grad_norm": 5.206437587738037, + "learning_rate": 1.9692639481551406e-07, + "loss": 0.025, + "step": 40530 + }, + { + "epoch": 0.16196725896183542, + "grad_norm": 4.513131618499756, + "learning_rate": 1.969248475647072e-07, + "loss": 0.02, + "step": 40540 + }, + { + "epoch": 0.1620072114184121, + "grad_norm": 5.555194854736328, + "learning_rate": 1.9692329993063726e-07, + "loss": 0.0244, + "step": 40550 + }, + { + "epoch": 0.16204716387498877, + "grad_norm": 5.6486592292785645, + "learning_rate": 1.969217519133104e-07, + "loss": 0.0209, + "step": 40560 + }, + { + "epoch": 0.16208711633156545, + "grad_norm": 3.680288314819336, + "learning_rate": 1.9692020351273272e-07, + "loss": 0.0221, + "step": 40570 + }, + { + "epoch": 0.16212706878814212, + "grad_norm": 3.353807210922241, + "learning_rate": 1.9691865472891034e-07, + "loss": 0.0206, + "step": 40580 + }, + { + "epoch": 0.1621670212447188, + "grad_norm": 4.204568386077881, + "learning_rate": 1.9691710556184937e-07, + "loss": 0.025, + "step": 40590 + }, + { + "epoch": 0.16220697370129547, + "grad_norm": 7.10123872756958, + "learning_rate": 1.9691555601155596e-07, + "loss": 0.0207, + "step": 40600 + }, + { + "epoch": 0.16224692615787215, + "grad_norm": 5.091904163360596, + "learning_rate": 1.9691400607803623e-07, + "loss": 0.025, + "step": 40610 + }, + { + "epoch": 0.1622868786144488, + "grad_norm": 6.760296821594238, + "learning_rate": 1.9691245576129632e-07, + "loss": 0.0208, + "step": 40620 + }, + { + "epoch": 0.16232683107102547, + "grad_norm": 5.17340087890625, + "learning_rate": 1.9691090506134232e-07, + "loss": 0.0221, + "step": 40630 + }, + { + "epoch": 0.16236678352760214, + "grad_norm": 4.377808570861816, + "learning_rate": 1.9690935397818037e-07, + "loss": 0.0191, + "step": 40640 + }, + { + "epoch": 0.16240673598417882, + "grad_norm": 2.8296096324920654, + "learning_rate": 1.9690780251181667e-07, + "loss": 0.0224, + "step": 40650 + }, + { + "epoch": 0.1624466884407555, + "grad_norm": 11.081764221191406, + "learning_rate": 1.9690625066225727e-07, + "loss": 0.024, + "step": 40660 + }, + { + "epoch": 0.16248664089733217, + "grad_norm": 4.167782783508301, + "learning_rate": 1.9690469842950838e-07, + "loss": 0.0213, + "step": 40670 + }, + { + "epoch": 0.16252659335390884, + "grad_norm": 3.0244226455688477, + "learning_rate": 1.9690314581357604e-07, + "loss": 0.0191, + "step": 40680 + }, + { + "epoch": 0.16256654581048552, + "grad_norm": 5.192479610443115, + "learning_rate": 1.969015928144665e-07, + "loss": 0.0221, + "step": 40690 + }, + { + "epoch": 0.1626064982670622, + "grad_norm": 5.204336166381836, + "learning_rate": 1.9690003943218586e-07, + "loss": 0.0344, + "step": 40700 + }, + { + "epoch": 0.16264645072363887, + "grad_norm": 5.383275032043457, + "learning_rate": 1.9689848566674022e-07, + "loss": 0.0241, + "step": 40710 + }, + { + "epoch": 0.16268640318021554, + "grad_norm": 4.68315315246582, + "learning_rate": 1.9689693151813575e-07, + "loss": 0.023, + "step": 40720 + }, + { + "epoch": 0.16272635563679222, + "grad_norm": 4.32759428024292, + "learning_rate": 1.9689537698637863e-07, + "loss": 0.0278, + "step": 40730 + }, + { + "epoch": 0.1627663080933689, + "grad_norm": 4.149475574493408, + "learning_rate": 1.9689382207147495e-07, + "loss": 0.0217, + "step": 40740 + }, + { + "epoch": 0.16280626054994557, + "grad_norm": 3.2928173542022705, + "learning_rate": 1.9689226677343092e-07, + "loss": 0.0234, + "step": 40750 + }, + { + "epoch": 0.16284621300652224, + "grad_norm": 15.039417266845703, + "learning_rate": 1.9689071109225266e-07, + "loss": 0.0222, + "step": 40760 + }, + { + "epoch": 0.16288616546309892, + "grad_norm": 3.575151205062866, + "learning_rate": 1.968891550279463e-07, + "loss": 0.0176, + "step": 40770 + }, + { + "epoch": 0.1629261179196756, + "grad_norm": 2.5365400314331055, + "learning_rate": 1.9688759858051803e-07, + "loss": 0.0214, + "step": 40780 + }, + { + "epoch": 0.16296607037625227, + "grad_norm": 3.755199909210205, + "learning_rate": 1.9688604174997398e-07, + "loss": 0.02, + "step": 40790 + }, + { + "epoch": 0.16300602283282895, + "grad_norm": 4.790876865386963, + "learning_rate": 1.968844845363203e-07, + "loss": 0.0285, + "step": 40800 + }, + { + "epoch": 0.16304597528940562, + "grad_norm": 1.8867501020431519, + "learning_rate": 1.968829269395632e-07, + "loss": 0.0209, + "step": 40810 + }, + { + "epoch": 0.16308592774598227, + "grad_norm": 4.1786112785339355, + "learning_rate": 1.9688136895970875e-07, + "loss": 0.02, + "step": 40820 + }, + { + "epoch": 0.16312588020255894, + "grad_norm": 2.8565967082977295, + "learning_rate": 1.968798105967632e-07, + "loss": 0.0228, + "step": 40830 + }, + { + "epoch": 0.16316583265913562, + "grad_norm": 6.097909450531006, + "learning_rate": 1.9687825185073264e-07, + "loss": 0.0251, + "step": 40840 + }, + { + "epoch": 0.1632057851157123, + "grad_norm": 10.984161376953125, + "learning_rate": 1.9687669272162328e-07, + "loss": 0.0261, + "step": 40850 + }, + { + "epoch": 0.16324573757228897, + "grad_norm": 2.5820703506469727, + "learning_rate": 1.9687513320944128e-07, + "loss": 0.0208, + "step": 40860 + }, + { + "epoch": 0.16328569002886564, + "grad_norm": 4.279748439788818, + "learning_rate": 1.968735733141928e-07, + "loss": 0.0233, + "step": 40870 + }, + { + "epoch": 0.16332564248544232, + "grad_norm": 3.9820313453674316, + "learning_rate": 1.9687201303588398e-07, + "loss": 0.0226, + "step": 40880 + }, + { + "epoch": 0.163365594942019, + "grad_norm": 2.029860019683838, + "learning_rate": 1.9687045237452106e-07, + "loss": 0.0252, + "step": 40890 + }, + { + "epoch": 0.16340554739859567, + "grad_norm": 2.3441388607025146, + "learning_rate": 1.9686889133011014e-07, + "loss": 0.0245, + "step": 40900 + }, + { + "epoch": 0.16344549985517234, + "grad_norm": 2.6291720867156982, + "learning_rate": 1.968673299026574e-07, + "loss": 0.0199, + "step": 40910 + }, + { + "epoch": 0.16348545231174902, + "grad_norm": 4.812448978424072, + "learning_rate": 1.9686576809216909e-07, + "loss": 0.0235, + "step": 40920 + }, + { + "epoch": 0.1635254047683257, + "grad_norm": 5.517206192016602, + "learning_rate": 1.968642058986513e-07, + "loss": 0.0205, + "step": 40930 + }, + { + "epoch": 0.16356535722490237, + "grad_norm": 2.297800302505493, + "learning_rate": 1.968626433221102e-07, + "loss": 0.0197, + "step": 40940 + }, + { + "epoch": 0.16360530968147904, + "grad_norm": 1.8552587032318115, + "learning_rate": 1.9686108036255208e-07, + "loss": 0.0193, + "step": 40950 + }, + { + "epoch": 0.16364526213805572, + "grad_norm": 1.3201109170913696, + "learning_rate": 1.96859517019983e-07, + "loss": 0.0234, + "step": 40960 + }, + { + "epoch": 0.1636852145946324, + "grad_norm": 4.947554588317871, + "learning_rate": 1.968579532944092e-07, + "loss": 0.0251, + "step": 40970 + }, + { + "epoch": 0.16372516705120907, + "grad_norm": 2.7316267490386963, + "learning_rate": 1.968563891858368e-07, + "loss": 0.0235, + "step": 40980 + }, + { + "epoch": 0.16376511950778574, + "grad_norm": 3.2493443489074707, + "learning_rate": 1.968548246942721e-07, + "loss": 0.0205, + "step": 40990 + }, + { + "epoch": 0.16380507196436242, + "grad_norm": 6.443737983703613, + "learning_rate": 1.9685325981972123e-07, + "loss": 0.0221, + "step": 41000 + }, + { + "epoch": 0.1638450244209391, + "grad_norm": 12.064022064208984, + "learning_rate": 1.9685169456219035e-07, + "loss": 0.0187, + "step": 41010 + }, + { + "epoch": 0.16388497687751577, + "grad_norm": 8.41429328918457, + "learning_rate": 1.9685012892168565e-07, + "loss": 0.0202, + "step": 41020 + }, + { + "epoch": 0.16392492933409242, + "grad_norm": 3.1033339500427246, + "learning_rate": 1.9684856289821337e-07, + "loss": 0.0206, + "step": 41030 + }, + { + "epoch": 0.1639648817906691, + "grad_norm": 2.57879376411438, + "learning_rate": 1.9684699649177965e-07, + "loss": 0.0212, + "step": 41040 + }, + { + "epoch": 0.16400483424724577, + "grad_norm": 3.2255396842956543, + "learning_rate": 1.9684542970239072e-07, + "loss": 0.0225, + "step": 41050 + }, + { + "epoch": 0.16404478670382244, + "grad_norm": 3.113497734069824, + "learning_rate": 1.9684386253005277e-07, + "loss": 0.0239, + "step": 41060 + }, + { + "epoch": 0.16408473916039912, + "grad_norm": 2.3847734928131104, + "learning_rate": 1.96842294974772e-07, + "loss": 0.0201, + "step": 41070 + }, + { + "epoch": 0.1641246916169758, + "grad_norm": 3.4261250495910645, + "learning_rate": 1.9684072703655458e-07, + "loss": 0.0242, + "step": 41080 + }, + { + "epoch": 0.16416464407355247, + "grad_norm": 3.6662964820861816, + "learning_rate": 1.968391587154067e-07, + "loss": 0.0239, + "step": 41090 + }, + { + "epoch": 0.16420459653012914, + "grad_norm": 3.2980005741119385, + "learning_rate": 1.9683759001133465e-07, + "loss": 0.0265, + "step": 41100 + }, + { + "epoch": 0.16424454898670582, + "grad_norm": 10.53225326538086, + "learning_rate": 1.9683602092434453e-07, + "loss": 0.0232, + "step": 41110 + }, + { + "epoch": 0.1642845014432825, + "grad_norm": 8.201281547546387, + "learning_rate": 1.968344514544426e-07, + "loss": 0.0219, + "step": 41120 + }, + { + "epoch": 0.16432445389985917, + "grad_norm": 2.2565219402313232, + "learning_rate": 1.9683288160163505e-07, + "loss": 0.0208, + "step": 41130 + }, + { + "epoch": 0.16436440635643584, + "grad_norm": 1.2412267923355103, + "learning_rate": 1.9683131136592807e-07, + "loss": 0.025, + "step": 41140 + }, + { + "epoch": 0.16440435881301252, + "grad_norm": 4.839122772216797, + "learning_rate": 1.968297407473279e-07, + "loss": 0.019, + "step": 41150 + }, + { + "epoch": 0.1644443112695892, + "grad_norm": 4.204844951629639, + "learning_rate": 1.968281697458408e-07, + "loss": 0.0238, + "step": 41160 + }, + { + "epoch": 0.16448426372616587, + "grad_norm": 3.276904344558716, + "learning_rate": 1.9682659836147286e-07, + "loss": 0.0231, + "step": 41170 + }, + { + "epoch": 0.16452421618274254, + "grad_norm": 3.9062299728393555, + "learning_rate": 1.9682502659423038e-07, + "loss": 0.0205, + "step": 41180 + }, + { + "epoch": 0.16456416863931922, + "grad_norm": 3.4271442890167236, + "learning_rate": 1.9682345444411954e-07, + "loss": 0.0234, + "step": 41190 + }, + { + "epoch": 0.1646041210958959, + "grad_norm": 4.054495334625244, + "learning_rate": 1.9682188191114657e-07, + "loss": 0.0225, + "step": 41200 + }, + { + "epoch": 0.16464407355247257, + "grad_norm": 2.5310046672821045, + "learning_rate": 1.968203089953177e-07, + "loss": 0.0249, + "step": 41210 + }, + { + "epoch": 0.16468402600904924, + "grad_norm": 3.692927122116089, + "learning_rate": 1.9681873569663913e-07, + "loss": 0.0236, + "step": 41220 + }, + { + "epoch": 0.1647239784656259, + "grad_norm": 3.5697548389434814, + "learning_rate": 1.9681716201511706e-07, + "loss": 0.0255, + "step": 41230 + }, + { + "epoch": 0.16476393092220257, + "grad_norm": 2.963742256164551, + "learning_rate": 1.9681558795075776e-07, + "loss": 0.0214, + "step": 41240 + }, + { + "epoch": 0.16480388337877924, + "grad_norm": 7.3265180587768555, + "learning_rate": 1.9681401350356747e-07, + "loss": 0.0243, + "step": 41250 + }, + { + "epoch": 0.16484383583535592, + "grad_norm": 6.797448635101318, + "learning_rate": 1.9681243867355232e-07, + "loss": 0.0252, + "step": 41260 + }, + { + "epoch": 0.1648837882919326, + "grad_norm": 4.774101734161377, + "learning_rate": 1.9681086346071862e-07, + "loss": 0.0216, + "step": 41270 + }, + { + "epoch": 0.16492374074850927, + "grad_norm": 3.1106622219085693, + "learning_rate": 1.9680928786507259e-07, + "loss": 0.0249, + "step": 41280 + }, + { + "epoch": 0.16496369320508594, + "grad_norm": 5.600759029388428, + "learning_rate": 1.9680771188662042e-07, + "loss": 0.0256, + "step": 41290 + }, + { + "epoch": 0.16500364566166262, + "grad_norm": 2.5153567790985107, + "learning_rate": 1.9680613552536838e-07, + "loss": 0.0251, + "step": 41300 + }, + { + "epoch": 0.1650435981182393, + "grad_norm": 4.9882307052612305, + "learning_rate": 1.968045587813227e-07, + "loss": 0.0234, + "step": 41310 + }, + { + "epoch": 0.16508355057481597, + "grad_norm": 2.2646420001983643, + "learning_rate": 1.968029816544896e-07, + "loss": 0.021, + "step": 41320 + }, + { + "epoch": 0.16512350303139264, + "grad_norm": 5.336530685424805, + "learning_rate": 1.9680140414487532e-07, + "loss": 0.0256, + "step": 41330 + }, + { + "epoch": 0.16516345548796932, + "grad_norm": 5.0777764320373535, + "learning_rate": 1.967998262524861e-07, + "loss": 0.0187, + "step": 41340 + }, + { + "epoch": 0.165203407944546, + "grad_norm": 1.9329618215560913, + "learning_rate": 1.9679824797732817e-07, + "loss": 0.0212, + "step": 41350 + }, + { + "epoch": 0.16524336040112267, + "grad_norm": 7.094888687133789, + "learning_rate": 1.967966693194078e-07, + "loss": 0.0274, + "step": 41360 + }, + { + "epoch": 0.16528331285769934, + "grad_norm": 4.578734874725342, + "learning_rate": 1.967950902787312e-07, + "loss": 0.0244, + "step": 41370 + }, + { + "epoch": 0.16532326531427602, + "grad_norm": 4.453488826751709, + "learning_rate": 1.9679351085530463e-07, + "loss": 0.0231, + "step": 41380 + }, + { + "epoch": 0.1653632177708527, + "grad_norm": 4.372800827026367, + "learning_rate": 1.967919310491343e-07, + "loss": 0.023, + "step": 41390 + }, + { + "epoch": 0.16540317022742937, + "grad_norm": 3.209566593170166, + "learning_rate": 1.9679035086022652e-07, + "loss": 0.0213, + "step": 41400 + }, + { + "epoch": 0.16544312268400604, + "grad_norm": 1.0620590448379517, + "learning_rate": 1.9678877028858753e-07, + "loss": 0.0239, + "step": 41410 + }, + { + "epoch": 0.16548307514058272, + "grad_norm": 4.595701694488525, + "learning_rate": 1.9678718933422353e-07, + "loss": 0.0261, + "step": 41420 + }, + { + "epoch": 0.16552302759715937, + "grad_norm": 4.874622821807861, + "learning_rate": 1.967856079971408e-07, + "loss": 0.0218, + "step": 41430 + }, + { + "epoch": 0.16556298005373604, + "grad_norm": 5.27595853805542, + "learning_rate": 1.967840262773456e-07, + "loss": 0.0224, + "step": 41440 + }, + { + "epoch": 0.16560293251031272, + "grad_norm": 2.7335267066955566, + "learning_rate": 1.9678244417484416e-07, + "loss": 0.0205, + "step": 41450 + }, + { + "epoch": 0.1656428849668894, + "grad_norm": 2.211862564086914, + "learning_rate": 1.9678086168964276e-07, + "loss": 0.0217, + "step": 41460 + }, + { + "epoch": 0.16568283742346607, + "grad_norm": 2.314974784851074, + "learning_rate": 1.9677927882174766e-07, + "loss": 0.0253, + "step": 41470 + }, + { + "epoch": 0.16572278988004274, + "grad_norm": 7.724504470825195, + "learning_rate": 1.9677769557116509e-07, + "loss": 0.0225, + "step": 41480 + }, + { + "epoch": 0.16576274233661942, + "grad_norm": 3.6390819549560547, + "learning_rate": 1.9677611193790136e-07, + "loss": 0.0226, + "step": 41490 + }, + { + "epoch": 0.1658026947931961, + "grad_norm": 7.233211994171143, + "learning_rate": 1.9677452792196266e-07, + "loss": 0.0244, + "step": 41500 + }, + { + "epoch": 0.16584264724977277, + "grad_norm": 6.685949802398682, + "learning_rate": 1.967729435233553e-07, + "loss": 0.0252, + "step": 41510 + }, + { + "epoch": 0.16588259970634944, + "grad_norm": 3.548733949661255, + "learning_rate": 1.9677135874208558e-07, + "loss": 0.0231, + "step": 41520 + }, + { + "epoch": 0.16592255216292612, + "grad_norm": 4.310141086578369, + "learning_rate": 1.9676977357815968e-07, + "loss": 0.0221, + "step": 41530 + }, + { + "epoch": 0.1659625046195028, + "grad_norm": 3.354367971420288, + "learning_rate": 1.9676818803158395e-07, + "loss": 0.0179, + "step": 41540 + }, + { + "epoch": 0.16600245707607947, + "grad_norm": 5.172255516052246, + "learning_rate": 1.967666021023646e-07, + "loss": 0.019, + "step": 41550 + }, + { + "epoch": 0.16604240953265614, + "grad_norm": 6.004179954528809, + "learning_rate": 1.9676501579050795e-07, + "loss": 0.0209, + "step": 41560 + }, + { + "epoch": 0.16608236198923282, + "grad_norm": 10.784679412841797, + "learning_rate": 1.9676342909602023e-07, + "loss": 0.024, + "step": 41570 + }, + { + "epoch": 0.1661223144458095, + "grad_norm": 1.7954953908920288, + "learning_rate": 1.9676184201890774e-07, + "loss": 0.0222, + "step": 41580 + }, + { + "epoch": 0.16616226690238617, + "grad_norm": 2.0850648880004883, + "learning_rate": 1.9676025455917673e-07, + "loss": 0.0208, + "step": 41590 + }, + { + "epoch": 0.16620221935896284, + "grad_norm": 4.5083136558532715, + "learning_rate": 1.967586667168335e-07, + "loss": 0.0196, + "step": 41600 + }, + { + "epoch": 0.16624217181553952, + "grad_norm": 6.547225475311279, + "learning_rate": 1.9675707849188435e-07, + "loss": 0.0205, + "step": 41610 + }, + { + "epoch": 0.1662821242721162, + "grad_norm": 2.318268299102783, + "learning_rate": 1.967554898843355e-07, + "loss": 0.0256, + "step": 41620 + }, + { + "epoch": 0.16632207672869287, + "grad_norm": 4.302777290344238, + "learning_rate": 1.9675390089419328e-07, + "loss": 0.0233, + "step": 41630 + }, + { + "epoch": 0.16636202918526952, + "grad_norm": 1.9341299533843994, + "learning_rate": 1.9675231152146395e-07, + "loss": 0.019, + "step": 41640 + }, + { + "epoch": 0.1664019816418462, + "grad_norm": 7.383570671081543, + "learning_rate": 1.9675072176615383e-07, + "loss": 0.0207, + "step": 41650 + }, + { + "epoch": 0.16644193409842287, + "grad_norm": 6.057552814483643, + "learning_rate": 1.9674913162826915e-07, + "loss": 0.0238, + "step": 41660 + }, + { + "epoch": 0.16648188655499954, + "grad_norm": 3.4019243717193604, + "learning_rate": 1.9674754110781624e-07, + "loss": 0.0229, + "step": 41670 + }, + { + "epoch": 0.16652183901157622, + "grad_norm": 3.457552671432495, + "learning_rate": 1.9674595020480138e-07, + "loss": 0.0238, + "step": 41680 + }, + { + "epoch": 0.1665617914681529, + "grad_norm": 4.628293514251709, + "learning_rate": 1.9674435891923084e-07, + "loss": 0.0258, + "step": 41690 + }, + { + "epoch": 0.16660174392472957, + "grad_norm": 3.4211440086364746, + "learning_rate": 1.9674276725111097e-07, + "loss": 0.0222, + "step": 41700 + }, + { + "epoch": 0.16664169638130624, + "grad_norm": 2.738976240158081, + "learning_rate": 1.9674117520044797e-07, + "loss": 0.019, + "step": 41710 + }, + { + "epoch": 0.16668164883788292, + "grad_norm": 5.792210578918457, + "learning_rate": 1.9673958276724823e-07, + "loss": 0.0179, + "step": 41720 + }, + { + "epoch": 0.1667216012944596, + "grad_norm": 8.90170955657959, + "learning_rate": 1.96737989951518e-07, + "loss": 0.0199, + "step": 41730 + }, + { + "epoch": 0.16676155375103627, + "grad_norm": 10.758452415466309, + "learning_rate": 1.9673639675326357e-07, + "loss": 0.0208, + "step": 41740 + }, + { + "epoch": 0.16680150620761294, + "grad_norm": 2.1283960342407227, + "learning_rate": 1.9673480317249127e-07, + "loss": 0.0192, + "step": 41750 + }, + { + "epoch": 0.16684145866418962, + "grad_norm": 7.631636142730713, + "learning_rate": 1.9673320920920738e-07, + "loss": 0.0304, + "step": 41760 + }, + { + "epoch": 0.1668814111207663, + "grad_norm": 4.629011154174805, + "learning_rate": 1.967316148634182e-07, + "loss": 0.0288, + "step": 41770 + }, + { + "epoch": 0.16692136357734297, + "grad_norm": 3.1912038326263428, + "learning_rate": 1.9673002013513003e-07, + "loss": 0.0193, + "step": 41780 + }, + { + "epoch": 0.16696131603391964, + "grad_norm": 2.824049949645996, + "learning_rate": 1.967284250243492e-07, + "loss": 0.0221, + "step": 41790 + }, + { + "epoch": 0.16700126849049632, + "grad_norm": 2.640998363494873, + "learning_rate": 1.9672682953108204e-07, + "loss": 0.0206, + "step": 41800 + }, + { + "epoch": 0.167041220947073, + "grad_norm": 5.8817315101623535, + "learning_rate": 1.967252336553348e-07, + "loss": 0.0209, + "step": 41810 + }, + { + "epoch": 0.16708117340364967, + "grad_norm": 3.0803885459899902, + "learning_rate": 1.9672363739711378e-07, + "loss": 0.022, + "step": 41820 + }, + { + "epoch": 0.16712112586022634, + "grad_norm": 4.625086784362793, + "learning_rate": 1.9672204075642538e-07, + "loss": 0.0227, + "step": 41830 + }, + { + "epoch": 0.167161078316803, + "grad_norm": 3.288339853286743, + "learning_rate": 1.9672044373327584e-07, + "loss": 0.0225, + "step": 41840 + }, + { + "epoch": 0.16720103077337967, + "grad_norm": 2.3096585273742676, + "learning_rate": 1.967188463276715e-07, + "loss": 0.0222, + "step": 41850 + }, + { + "epoch": 0.16724098322995634, + "grad_norm": 3.355165719985962, + "learning_rate": 1.9671724853961867e-07, + "loss": 0.021, + "step": 41860 + }, + { + "epoch": 0.16728093568653302, + "grad_norm": 2.5251457691192627, + "learning_rate": 1.9671565036912367e-07, + "loss": 0.0212, + "step": 41870 + }, + { + "epoch": 0.1673208881431097, + "grad_norm": 10.509708404541016, + "learning_rate": 1.967140518161928e-07, + "loss": 0.0178, + "step": 41880 + }, + { + "epoch": 0.16736084059968637, + "grad_norm": 4.828654766082764, + "learning_rate": 1.9671245288083241e-07, + "loss": 0.0178, + "step": 41890 + }, + { + "epoch": 0.16740079305626304, + "grad_norm": 7.18017053604126, + "learning_rate": 1.9671085356304884e-07, + "loss": 0.0296, + "step": 41900 + }, + { + "epoch": 0.16744074551283972, + "grad_norm": 3.4338700771331787, + "learning_rate": 1.9670925386284839e-07, + "loss": 0.0214, + "step": 41910 + }, + { + "epoch": 0.1674806979694164, + "grad_norm": 2.0173709392547607, + "learning_rate": 1.9670765378023736e-07, + "loss": 0.0198, + "step": 41920 + }, + { + "epoch": 0.16752065042599307, + "grad_norm": 1.7911444902420044, + "learning_rate": 1.967060533152221e-07, + "loss": 0.0219, + "step": 41930 + }, + { + "epoch": 0.16756060288256974, + "grad_norm": 7.583564758300781, + "learning_rate": 1.9670445246780892e-07, + "loss": 0.0233, + "step": 41940 + }, + { + "epoch": 0.16760055533914642, + "grad_norm": 5.82903528213501, + "learning_rate": 1.9670285123800424e-07, + "loss": 0.019, + "step": 41950 + }, + { + "epoch": 0.1676405077957231, + "grad_norm": 7.67936372756958, + "learning_rate": 1.9670124962581424e-07, + "loss": 0.0266, + "step": 41960 + }, + { + "epoch": 0.16768046025229977, + "grad_norm": 4.502984523773193, + "learning_rate": 1.9669964763124537e-07, + "loss": 0.0224, + "step": 41970 + }, + { + "epoch": 0.16772041270887644, + "grad_norm": 6.118256092071533, + "learning_rate": 1.9669804525430394e-07, + "loss": 0.0187, + "step": 41980 + }, + { + "epoch": 0.16776036516545312, + "grad_norm": 5.217944622039795, + "learning_rate": 1.9669644249499625e-07, + "loss": 0.0202, + "step": 41990 + }, + { + "epoch": 0.1678003176220298, + "grad_norm": 6.952162742614746, + "learning_rate": 1.9669483935332868e-07, + "loss": 0.0225, + "step": 42000 + } + ], + "logging_steps": 10, + "max_steps": 500596, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 3000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}