Safetensors
renruilong's picture
Add scripts and checkpoints (CosFly-Track release) (#85)
284878d
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 3125,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0032,
"grad_norm": 7.051180674009678,
"learning_rate": 1.437699680511182e-07,
"loss": 0.45998425483703614,
"step": 10
},
{
"epoch": 0.0064,
"grad_norm": 6.386709802443142,
"learning_rate": 3.0351437699680514e-07,
"loss": 0.44952831268310545,
"step": 20
},
{
"epoch": 0.0096,
"grad_norm": 5.3631010908380015,
"learning_rate": 4.6325878594249205e-07,
"loss": 0.3993690013885498,
"step": 30
},
{
"epoch": 0.0128,
"grad_norm": 0.852155255839625,
"learning_rate": 6.230031948881789e-07,
"loss": 0.3118258237838745,
"step": 40
},
{
"epoch": 0.016,
"grad_norm": 0.4705571475990448,
"learning_rate": 7.82747603833866e-07,
"loss": 0.2786674976348877,
"step": 50
},
{
"epoch": 0.0192,
"grad_norm": 0.39720799855122535,
"learning_rate": 9.424920127795528e-07,
"loss": 0.2685645580291748,
"step": 60
},
{
"epoch": 0.0224,
"grad_norm": 0.34144681090493506,
"learning_rate": 1.1022364217252397e-06,
"loss": 0.27388153076171873,
"step": 70
},
{
"epoch": 0.0256,
"grad_norm": 0.29670665469044527,
"learning_rate": 1.2619808306709266e-06,
"loss": 0.2562382221221924,
"step": 80
},
{
"epoch": 0.0288,
"grad_norm": 0.2721949763566226,
"learning_rate": 1.4217252396166134e-06,
"loss": 0.2521932125091553,
"step": 90
},
{
"epoch": 0.032,
"grad_norm": 0.30509418891876505,
"learning_rate": 1.5814696485623005e-06,
"loss": 0.2553669214248657,
"step": 100
},
{
"epoch": 0.0352,
"grad_norm": 0.2710599378904947,
"learning_rate": 1.7412140575079875e-06,
"loss": 0.2548961162567139,
"step": 110
},
{
"epoch": 0.0384,
"grad_norm": 0.3180117403374185,
"learning_rate": 1.9009584664536742e-06,
"loss": 0.2442842960357666,
"step": 120
},
{
"epoch": 0.0416,
"grad_norm": 0.2695352733592907,
"learning_rate": 2.060702875399361e-06,
"loss": 0.24766459465026855,
"step": 130
},
{
"epoch": 0.0448,
"grad_norm": 0.3064535363854503,
"learning_rate": 2.220447284345048e-06,
"loss": 0.23845260143280028,
"step": 140
},
{
"epoch": 0.048,
"grad_norm": 0.29068646435586043,
"learning_rate": 2.380191693290735e-06,
"loss": 0.23559024333953857,
"step": 150
},
{
"epoch": 0.0512,
"grad_norm": 0.3186203915842237,
"learning_rate": 2.539936102236422e-06,
"loss": 0.23029537200927735,
"step": 160
},
{
"epoch": 0.0544,
"grad_norm": 0.32754075011707046,
"learning_rate": 2.699680511182109e-06,
"loss": 0.2385089635848999,
"step": 170
},
{
"epoch": 0.0576,
"grad_norm": 0.38484368515577855,
"learning_rate": 2.8594249201277955e-06,
"loss": 0.23111426830291748,
"step": 180
},
{
"epoch": 0.0608,
"grad_norm": 0.299502856060473,
"learning_rate": 3.0191693290734825e-06,
"loss": 0.23530282974243164,
"step": 190
},
{
"epoch": 0.064,
"grad_norm": 0.3078123289936782,
"learning_rate": 3.17891373801917e-06,
"loss": 0.23611860275268554,
"step": 200
},
{
"epoch": 0.0672,
"grad_norm": 0.30717572422426626,
"learning_rate": 3.3386581469648564e-06,
"loss": 0.23241891860961914,
"step": 210
},
{
"epoch": 0.0704,
"grad_norm": 0.30949630760689323,
"learning_rate": 3.4984025559105434e-06,
"loss": 0.2257370948791504,
"step": 220
},
{
"epoch": 0.0736,
"grad_norm": 0.2734080768093611,
"learning_rate": 3.6581469648562303e-06,
"loss": 0.22820501327514647,
"step": 230
},
{
"epoch": 0.0768,
"grad_norm": 0.31986420438753294,
"learning_rate": 3.817891373801918e-06,
"loss": 0.22324295043945314,
"step": 240
},
{
"epoch": 0.08,
"grad_norm": 0.3271935835910018,
"learning_rate": 3.977635782747604e-06,
"loss": 0.22092509269714355,
"step": 250
},
{
"epoch": 0.0832,
"grad_norm": 0.28164810489138675,
"learning_rate": 4.137380191693291e-06,
"loss": 0.22088565826416015,
"step": 260
},
{
"epoch": 0.0864,
"grad_norm": 0.2806581165168549,
"learning_rate": 4.297124600638978e-06,
"loss": 0.2235860824584961,
"step": 270
},
{
"epoch": 0.0896,
"grad_norm": 0.2818314341404028,
"learning_rate": 4.456869009584665e-06,
"loss": 0.21951718330383302,
"step": 280
},
{
"epoch": 0.0928,
"grad_norm": 0.2755068214230404,
"learning_rate": 4.616613418530352e-06,
"loss": 0.22480430603027343,
"step": 290
},
{
"epoch": 0.096,
"grad_norm": 0.2991295097090295,
"learning_rate": 4.776357827476039e-06,
"loss": 0.22600164413452148,
"step": 300
},
{
"epoch": 0.0992,
"grad_norm": 0.3239664056294863,
"learning_rate": 4.936102236421725e-06,
"loss": 0.21372499465942382,
"step": 310
},
{
"epoch": 0.1024,
"grad_norm": 0.2881723034602484,
"learning_rate": 4.999943833158769e-06,
"loss": 0.21513206958770753,
"step": 320
},
{
"epoch": 0.1056,
"grad_norm": 0.27877645475403023,
"learning_rate": 4.999600600490783e-06,
"loss": 0.22072982788085938,
"step": 330
},
{
"epoch": 0.1088,
"grad_norm": 0.28224550070191395,
"learning_rate": 4.9989453817439345e-06,
"loss": 0.2146312713623047,
"step": 340
},
{
"epoch": 0.112,
"grad_norm": 0.26853026150431764,
"learning_rate": 4.997978258698942e-06,
"loss": 0.21449072360992433,
"step": 350
},
{
"epoch": 0.1152,
"grad_norm": 0.2706003676564934,
"learning_rate": 4.996699352066659e-06,
"loss": 0.2151791572570801,
"step": 360
},
{
"epoch": 0.1184,
"grad_norm": 0.28539700359373177,
"learning_rate": 4.995108821473014e-06,
"loss": 0.21470160484313966,
"step": 370
},
{
"epoch": 0.1216,
"grad_norm": 0.29207494833659137,
"learning_rate": 4.993206865439084e-06,
"loss": 0.21086468696594238,
"step": 380
},
{
"epoch": 0.1248,
"grad_norm": 0.263064572322246,
"learning_rate": 4.990993721356317e-06,
"loss": 0.20984139442443847,
"step": 390
},
{
"epoch": 0.128,
"grad_norm": 0.2865097413347111,
"learning_rate": 4.988469665456901e-06,
"loss": 0.21040558815002441,
"step": 400
},
{
"epoch": 0.1312,
"grad_norm": 0.2738592744136949,
"learning_rate": 4.985635012779288e-06,
"loss": 0.21828360557556153,
"step": 410
},
{
"epoch": 0.1344,
"grad_norm": 0.24970618963972283,
"learning_rate": 4.98249011712887e-06,
"loss": 0.2106489658355713,
"step": 420
},
{
"epoch": 0.1376,
"grad_norm": 0.2629431248486553,
"learning_rate": 4.979035371033824e-06,
"loss": 0.20979018211364747,
"step": 430
},
{
"epoch": 0.1408,
"grad_norm": 0.2725505982701801,
"learning_rate": 4.975271205696115e-06,
"loss": 0.20948367118835448,
"step": 440
},
{
"epoch": 0.144,
"grad_norm": 0.2704053444924022,
"learning_rate": 4.971198090937671e-06,
"loss": 0.2033768653869629,
"step": 450
},
{
"epoch": 0.1472,
"grad_norm": 0.31765035973786815,
"learning_rate": 4.966816535141756e-06,
"loss": 0.20044360160827637,
"step": 460
},
{
"epoch": 0.1504,
"grad_norm": 0.2502347867419884,
"learning_rate": 4.9621270851895035e-06,
"loss": 0.2100567102432251,
"step": 470
},
{
"epoch": 0.1536,
"grad_norm": 0.2934932151321077,
"learning_rate": 4.957130326391662e-06,
"loss": 0.21090621948242189,
"step": 480
},
{
"epoch": 0.1568,
"grad_norm": 0.26660410583968774,
"learning_rate": 4.951826882415544e-06,
"loss": 0.20775444507598878,
"step": 490
},
{
"epoch": 0.16,
"grad_norm": 0.28519626596006936,
"learning_rate": 4.946217415207177e-06,
"loss": 0.20256528854370118,
"step": 500
},
{
"epoch": 0.1632,
"grad_norm": 0.2798675045050625,
"learning_rate": 4.940302624908689e-06,
"loss": 0.20623595714569093,
"step": 510
},
{
"epoch": 0.1664,
"grad_norm": 0.28222884808809434,
"learning_rate": 4.934083249770912e-06,
"loss": 0.20097856521606444,
"step": 520
},
{
"epoch": 0.1696,
"grad_norm": 0.2788085638053828,
"learning_rate": 4.927560066061251e-06,
"loss": 0.20387496948242187,
"step": 530
},
{
"epoch": 0.1728,
"grad_norm": 0.27554368272722524,
"learning_rate": 4.920733887966783e-06,
"loss": 0.21524934768676757,
"step": 540
},
{
"epoch": 0.176,
"grad_norm": 0.26559833530971816,
"learning_rate": 4.913605567492636e-06,
"loss": 0.20402135848999023,
"step": 550
},
{
"epoch": 0.1792,
"grad_norm": 0.26554772115650926,
"learning_rate": 4.906175994355656e-06,
"loss": 0.20598478317260743,
"step": 560
},
{
"epoch": 0.1824,
"grad_norm": 0.3223429392292309,
"learning_rate": 4.898446095873345e-06,
"loss": 0.20747475624084472,
"step": 570
},
{
"epoch": 0.1856,
"grad_norm": 0.24355730693567182,
"learning_rate": 4.890416836848128e-06,
"loss": 0.20512137413024903,
"step": 580
},
{
"epoch": 0.1888,
"grad_norm": 0.3001767473938059,
"learning_rate": 4.882089219446925e-06,
"loss": 0.19992779493331908,
"step": 590
},
{
"epoch": 0.192,
"grad_norm": 0.2835389086432711,
"learning_rate": 4.873464283076074e-06,
"loss": 0.20495295524597168,
"step": 600
},
{
"epoch": 0.1952,
"grad_norm": 0.26019712508927473,
"learning_rate": 4.864543104251587e-06,
"loss": 0.2035728931427002,
"step": 610
},
{
"epoch": 0.1984,
"grad_norm": 0.2657949563176517,
"learning_rate": 4.855326796464798e-06,
"loss": 0.20619282722473145,
"step": 620
},
{
"epoch": 0.2016,
"grad_norm": 0.28295912792439204,
"learning_rate": 4.8458165100433725e-06,
"loss": 0.2016925811767578,
"step": 630
},
{
"epoch": 0.2048,
"grad_norm": 0.2902924299127114,
"learning_rate": 4.836013432007738e-06,
"loss": 0.20164456367492675,
"step": 640
},
{
"epoch": 0.208,
"grad_norm": 0.24256417788990398,
"learning_rate": 4.825918785922921e-06,
"loss": 0.20648303031921386,
"step": 650
},
{
"epoch": 0.2112,
"grad_norm": 0.27122351891055063,
"learning_rate": 4.8155338317458315e-06,
"loss": 0.20356349945068358,
"step": 660
},
{
"epoch": 0.2144,
"grad_norm": 0.2600569122055766,
"learning_rate": 4.804859865668002e-06,
"loss": 0.19959055185317992,
"step": 670
},
{
"epoch": 0.2176,
"grad_norm": 0.25345624369635567,
"learning_rate": 4.793898219953804e-06,
"loss": 0.2007960557937622,
"step": 680
},
{
"epoch": 0.2208,
"grad_norm": 0.2544929334444299,
"learning_rate": 4.782650262774164e-06,
"loss": 0.20300769805908203,
"step": 690
},
{
"epoch": 0.224,
"grad_norm": 0.2897145189307127,
"learning_rate": 4.7711173980357886e-06,
"loss": 0.19880002737045288,
"step": 700
},
{
"epoch": 0.2272,
"grad_norm": 0.2560542526589546,
"learning_rate": 4.759301065205947e-06,
"loss": 0.19960763454437255,
"step": 710
},
{
"epoch": 0.2304,
"grad_norm": 0.3097914904022575,
"learning_rate": 4.7472027391328e-06,
"loss": 0.2003918170928955,
"step": 720
},
{
"epoch": 0.2336,
"grad_norm": 0.2887809607696432,
"learning_rate": 4.734823929861317e-06,
"loss": 0.20292911529541016,
"step": 730
},
{
"epoch": 0.2368,
"grad_norm": 0.28917619670340877,
"learning_rate": 4.722166182444801e-06,
"loss": 0.20004558563232422,
"step": 740
},
{
"epoch": 0.24,
"grad_norm": 0.27043264841658887,
"learning_rate": 4.709231076752045e-06,
"loss": 0.19843683242797852,
"step": 750
},
{
"epoch": 0.2432,
"grad_norm": 0.2645651727770741,
"learning_rate": 4.696020227270142e-06,
"loss": 0.20258240699768065,
"step": 760
},
{
"epoch": 0.2464,
"grad_norm": 0.2777282429222742,
"learning_rate": 4.6825352829029705e-06,
"loss": 0.1994302749633789,
"step": 770
},
{
"epoch": 0.2496,
"grad_norm": 0.28182340391383837,
"learning_rate": 4.668777926765392e-06,
"loss": 0.197939932346344,
"step": 780
},
{
"epoch": 0.2528,
"grad_norm": 0.2390403179508666,
"learning_rate": 4.6547498759731725e-06,
"loss": 0.19328031539916993,
"step": 790
},
{
"epoch": 0.256,
"grad_norm": 0.30761446053746666,
"learning_rate": 4.6404528814286575e-06,
"loss": 0.1962287425994873,
"step": 800
},
{
"epoch": 0.2592,
"grad_norm": 0.26058296777263723,
"learning_rate": 4.6258887276022425e-06,
"loss": 0.20304152965545655,
"step": 810
},
{
"epoch": 0.2624,
"grad_norm": 0.3023946784650888,
"learning_rate": 4.611059232309639e-06,
"loss": 0.19789116382598876,
"step": 820
},
{
"epoch": 0.2656,
"grad_norm": 0.28736962727648746,
"learning_rate": 4.595966246484986e-06,
"loss": 0.19968997240066527,
"step": 830
},
{
"epoch": 0.2688,
"grad_norm": 0.28571881200537336,
"learning_rate": 4.580611653949829e-06,
"loss": 0.20007586479187012,
"step": 840
},
{
"epoch": 0.272,
"grad_norm": 0.295019179491335,
"learning_rate": 4.564997371177992e-06,
"loss": 0.19763822555541993,
"step": 850
},
{
"epoch": 0.2752,
"grad_norm": 0.29653404936460237,
"learning_rate": 4.54912534705637e-06,
"loss": 0.19755616188049316,
"step": 860
},
{
"epoch": 0.2784,
"grad_norm": 0.2642449071502374,
"learning_rate": 4.532997562641683e-06,
"loss": 0.19439829587936402,
"step": 870
},
{
"epoch": 0.2816,
"grad_norm": 0.25657475126133233,
"learning_rate": 4.516616030913214e-06,
"loss": 0.1987127423286438,
"step": 880
},
{
"epoch": 0.2848,
"grad_norm": 0.28458590654874555,
"learning_rate": 4.499982796521556e-06,
"loss": 0.19352295398712158,
"step": 890
},
{
"epoch": 0.288,
"grad_norm": 0.2793448530701338,
"learning_rate": 4.48309993553341e-06,
"loss": 0.1959349274635315,
"step": 900
},
{
"epoch": 0.2912,
"grad_norm": 0.3163250873932861,
"learning_rate": 4.465969555172468e-06,
"loss": 0.1957021713256836,
"step": 910
},
{
"epoch": 0.2944,
"grad_norm": 0.2933329400631374,
"learning_rate": 4.448593793556391e-06,
"loss": 0.20156097412109375,
"step": 920
},
{
"epoch": 0.2976,
"grad_norm": 0.2688085579058971,
"learning_rate": 4.430974819429954e-06,
"loss": 0.1948945164680481,
"step": 930
},
{
"epoch": 0.3008,
"grad_norm": 0.28553708068341715,
"learning_rate": 4.413114831894344e-06,
"loss": 0.18995710611343383,
"step": 940
},
{
"epoch": 0.304,
"grad_norm": 0.26518275753825254,
"learning_rate": 4.3950160601326865e-06,
"loss": 0.18871839046478273,
"step": 950
},
{
"epoch": 0.3072,
"grad_norm": 0.28692003913342795,
"learning_rate": 4.376680763131811e-06,
"loss": 0.19533849954605104,
"step": 960
},
{
"epoch": 0.3104,
"grad_norm": 0.27227233815166896,
"learning_rate": 4.358111229400296e-06,
"loss": 0.19751427173614503,
"step": 970
},
{
"epoch": 0.3136,
"grad_norm": 0.27245831220598377,
"learning_rate": 4.33930977668283e-06,
"loss": 0.20111453533172607,
"step": 980
},
{
"epoch": 0.3168,
"grad_norm": 0.2482632152661181,
"learning_rate": 4.320278751670922e-06,
"loss": 0.19406617879867555,
"step": 990
},
{
"epoch": 0.32,
"grad_norm": 0.2892442073812178,
"learning_rate": 4.301020529710009e-06,
"loss": 0.19525597095489503,
"step": 1000
},
{
"epoch": 0.3232,
"grad_norm": 0.26392559431034407,
"learning_rate": 4.281537514502962e-06,
"loss": 0.19918107986450195,
"step": 1010
},
{
"epoch": 0.3264,
"grad_norm": 0.27003912401002855,
"learning_rate": 4.261832137810093e-06,
"loss": 0.1964997172355652,
"step": 1020
},
{
"epoch": 0.3296,
"grad_norm": 0.2664017566726753,
"learning_rate": 4.241906859145611e-06,
"loss": 0.19660145044326782,
"step": 1030
},
{
"epoch": 0.3328,
"grad_norm": 0.2744161118643581,
"learning_rate": 4.221764165470661e-06,
"loss": 0.1935626745223999,
"step": 1040
},
{
"epoch": 0.336,
"grad_norm": 0.2717693030089869,
"learning_rate": 4.201406570882898e-06,
"loss": 0.19286205768585205,
"step": 1050
},
{
"epoch": 0.3392,
"grad_norm": 0.259292524653773,
"learning_rate": 4.180836616302704e-06,
"loss": 0.1922353148460388,
"step": 1060
},
{
"epoch": 0.3424,
"grad_norm": 0.2739674960468982,
"learning_rate": 4.160056869156041e-06,
"loss": 0.19553282260894775,
"step": 1070
},
{
"epoch": 0.3456,
"grad_norm": 0.272965837223612,
"learning_rate": 4.139069923053995e-06,
"loss": 0.19367674589157105,
"step": 1080
},
{
"epoch": 0.3488,
"grad_norm": 0.2463436566122966,
"learning_rate": 4.117878397469062e-06,
"loss": 0.19772920608520508,
"step": 1090
},
{
"epoch": 0.352,
"grad_norm": 0.24672019869428047,
"learning_rate": 4.096484937408195e-06,
"loss": 0.1892393112182617,
"step": 1100
},
{
"epoch": 0.3552,
"grad_norm": 0.2673060417093708,
"learning_rate": 4.074892213082676e-06,
"loss": 0.1892371416091919,
"step": 1110
},
{
"epoch": 0.3584,
"grad_norm": 0.26767314750680543,
"learning_rate": 4.0531029195748265e-06,
"loss": 0.19717614650726317,
"step": 1120
},
{
"epoch": 0.3616,
"grad_norm": 0.2796524343786416,
"learning_rate": 4.03111977650163e-06,
"loss": 0.19503848552703856,
"step": 1130
},
{
"epoch": 0.3648,
"grad_norm": 0.2816284710404393,
"learning_rate": 4.008945527675281e-06,
"loss": 0.19529366493225098,
"step": 1140
},
{
"epoch": 0.368,
"grad_norm": 0.31949481569871324,
"learning_rate": 3.986582940760717e-06,
"loss": 0.18451136350631714,
"step": 1150
},
{
"epoch": 0.3712,
"grad_norm": 0.2723449306170863,
"learning_rate": 3.9640348069301785e-06,
"loss": 0.191510009765625,
"step": 1160
},
{
"epoch": 0.3744,
"grad_norm": 0.27747112521567696,
"learning_rate": 3.941303940514826e-06,
"loss": 0.19263410568237305,
"step": 1170
},
{
"epoch": 0.3776,
"grad_norm": 0.2719099807762723,
"learning_rate": 3.918393178653472e-06,
"loss": 0.19341590404510497,
"step": 1180
},
{
"epoch": 0.3808,
"grad_norm": 0.29074805846664115,
"learning_rate": 3.895305380938468e-06,
"loss": 0.19099385738372804,
"step": 1190
},
{
"epoch": 0.384,
"grad_norm": 0.2517462589595264,
"learning_rate": 3.872043429058783e-06,
"loss": 0.18874506950378417,
"step": 1200
},
{
"epoch": 0.3872,
"grad_norm": 0.2591827841853763,
"learning_rate": 3.84861022644033e-06,
"loss": 0.19069148302078248,
"step": 1210
},
{
"epoch": 0.3904,
"grad_norm": 0.2702770742629986,
"learning_rate": 3.825008697883574e-06,
"loss": 0.19928838014602662,
"step": 1220
},
{
"epoch": 0.3936,
"grad_norm": 0.27788866885326635,
"learning_rate": 3.8012417891984776e-06,
"loss": 0.19237933158874512,
"step": 1230
},
{
"epoch": 0.3968,
"grad_norm": 0.2656255469668472,
"learning_rate": 3.777312466836819e-06,
"loss": 0.19055767059326173,
"step": 1240
},
{
"epoch": 0.4,
"grad_norm": 0.28446496354107703,
"learning_rate": 3.7532237175219378e-06,
"loss": 0.18940582275390624,
"step": 1250
},
{
"epoch": 0.4032,
"grad_norm": 0.4152862546777316,
"learning_rate": 3.728978547875948e-06,
"loss": 0.19362914562225342,
"step": 1260
},
{
"epoch": 0.4064,
"grad_norm": 0.28537432061728957,
"learning_rate": 3.7045799840444712e-06,
"loss": 0.1886904716491699,
"step": 1270
},
{
"epoch": 0.4096,
"grad_norm": 0.29038310854731697,
"learning_rate": 3.6800310713189258e-06,
"loss": 0.18923617601394654,
"step": 1280
},
{
"epoch": 0.4128,
"grad_norm": 0.32132086585692904,
"learning_rate": 3.6553348737564328e-06,
"loss": 0.19005811214447021,
"step": 1290
},
{
"epoch": 0.416,
"grad_norm": 0.2669423345384319,
"learning_rate": 3.6304944737973794e-06,
"loss": 0.19575085639953613,
"step": 1300
},
{
"epoch": 0.4192,
"grad_norm": 0.28931030301965927,
"learning_rate": 3.6055129718806836e-06,
"loss": 0.18975239992141724,
"step": 1310
},
{
"epoch": 0.4224,
"grad_norm": 0.28948269391746034,
"learning_rate": 3.5803934860568134e-06,
"loss": 0.18510067462921143,
"step": 1320
},
{
"epoch": 0.4256,
"grad_norm": 0.2821484963772758,
"learning_rate": 3.5551391515986163e-06,
"loss": 0.1907583475112915,
"step": 1330
},
{
"epoch": 0.4288,
"grad_norm": 0.27423888046510925,
"learning_rate": 3.529753120609982e-06,
"loss": 0.18690071105957032,
"step": 1340
},
{
"epoch": 0.432,
"grad_norm": 0.30811658453814883,
"learning_rate": 3.5042385616324243e-06,
"loss": 0.19000139236450195,
"step": 1350
},
{
"epoch": 0.4352,
"grad_norm": 0.24402420223179272,
"learning_rate": 3.4785986592495934e-06,
"loss": 0.18803791999816893,
"step": 1360
},
{
"epoch": 0.4384,
"grad_norm": 0.24576039119812526,
"learning_rate": 3.452836613689803e-06,
"loss": 0.1866163969039917,
"step": 1370
},
{
"epoch": 0.4416,
"grad_norm": 0.2949022587874532,
"learning_rate": 3.426955640426584e-06,
"loss": 0.1890486001968384,
"step": 1380
},
{
"epoch": 0.4448,
"grad_norm": 0.2582182081996982,
"learning_rate": 3.4009589697773605e-06,
"loss": 0.18851635456085206,
"step": 1390
},
{
"epoch": 0.448,
"grad_norm": 0.2722482128131903,
"learning_rate": 3.3748498465002475e-06,
"loss": 0.18554195165634155,
"step": 1400
},
{
"epoch": 0.4512,
"grad_norm": 0.27484686642107964,
"learning_rate": 3.3486315293890693e-06,
"loss": 0.19425587654113768,
"step": 1410
},
{
"epoch": 0.4544,
"grad_norm": 0.28258316073925427,
"learning_rate": 3.3223072908666053e-06,
"loss": 0.1843653440475464,
"step": 1420
},
{
"epoch": 0.4576,
"grad_norm": 0.28555979247115143,
"learning_rate": 3.295880416576153e-06,
"loss": 0.1941524863243103,
"step": 1430
},
{
"epoch": 0.4608,
"grad_norm": 0.2969010932820601,
"learning_rate": 3.269354204971427e-06,
"loss": 0.18759560585021973,
"step": 1440
},
{
"epoch": 0.464,
"grad_norm": 0.30795851200957197,
"learning_rate": 3.242731966904865e-06,
"loss": 0.18544803857803344,
"step": 1450
},
{
"epoch": 0.4672,
"grad_norm": 0.28527072571260903,
"learning_rate": 3.2160170252143913e-06,
"loss": 0.18547136783599855,
"step": 1460
},
{
"epoch": 0.4704,
"grad_norm": 0.2533866816866613,
"learning_rate": 3.1892127143086716e-06,
"loss": 0.19228132963180541,
"step": 1470
},
{
"epoch": 0.4736,
"grad_norm": 0.2776942873045479,
"learning_rate": 3.1623223797509347e-06,
"loss": 0.1812342882156372,
"step": 1480
},
{
"epoch": 0.4768,
"grad_norm": 0.2744584915099732,
"learning_rate": 3.135349377841396e-06,
"loss": 0.1853887915611267,
"step": 1490
},
{
"epoch": 0.48,
"grad_norm": 0.2866639604297882,
"learning_rate": 3.1082970751983497e-06,
"loss": 0.1918737769126892,
"step": 1500
},
{
"epoch": 0.4832,
"grad_norm": 0.26310322890713356,
"learning_rate": 3.0811688483379546e-06,
"loss": 0.18995790481567382,
"step": 1510
},
{
"epoch": 0.4864,
"grad_norm": 0.28320054398109096,
"learning_rate": 3.0539680832528074e-06,
"loss": 0.18962399959564208,
"step": 1520
},
{
"epoch": 0.4896,
"grad_norm": 0.2654570443815982,
"learning_rate": 3.026698174989316e-06,
"loss": 0.18734774589538575,
"step": 1530
},
{
"epoch": 0.4928,
"grad_norm": 0.2658181920127404,
"learning_rate": 2.999362527223952e-06,
"loss": 0.1873406410217285,
"step": 1540
},
{
"epoch": 0.496,
"grad_norm": 0.29250213703445505,
"learning_rate": 2.9719645518384194e-06,
"loss": 0.1892526626586914,
"step": 1550
},
{
"epoch": 0.4992,
"grad_norm": 0.3090995402302473,
"learning_rate": 2.944507668493807e-06,
"loss": 0.19257349967956544,
"step": 1560
},
{
"epoch": 0.5024,
"grad_norm": 0.28272052629438726,
"learning_rate": 2.9169953042037623e-06,
"loss": 0.18868753910064698,
"step": 1570
},
{
"epoch": 0.5056,
"grad_norm": 0.3954198531333443,
"learning_rate": 2.889430892906754e-06,
"loss": 0.18459179401397705,
"step": 1580
},
{
"epoch": 0.5088,
"grad_norm": 0.2563261821009193,
"learning_rate": 2.861817875037462e-06,
"loss": 0.18163517713546753,
"step": 1590
},
{
"epoch": 0.512,
"grad_norm": 0.28115388072993086,
"learning_rate": 2.8341596970973683e-06,
"loss": 0.19087796211242675,
"step": 1600
},
{
"epoch": 0.5152,
"grad_norm": 0.27079102831839946,
"learning_rate": 2.80645981122458e-06,
"loss": 0.1863863706588745,
"step": 1610
},
{
"epoch": 0.5184,
"grad_norm": 0.27596423249252744,
"learning_rate": 2.7787216747629508e-06,
"loss": 0.19303735494613647,
"step": 1620
},
{
"epoch": 0.5216,
"grad_norm": 0.2682301223547138,
"learning_rate": 2.7509487498305615e-06,
"loss": 0.18045294284820557,
"step": 1630
},
{
"epoch": 0.5248,
"grad_norm": 0.27817197846381203,
"learning_rate": 2.7231445028875924e-06,
"loss": 0.18653267621994019,
"step": 1640
},
{
"epoch": 0.528,
"grad_norm": 0.25176165708531945,
"learning_rate": 2.6953124043036604e-06,
"loss": 0.18530716896057128,
"step": 1650
},
{
"epoch": 0.5312,
"grad_norm": 0.272299195118528,
"learning_rate": 2.667455927924667e-06,
"loss": 0.18495219945907593,
"step": 1660
},
{
"epoch": 0.5344,
"grad_norm": 0.26513870922757315,
"learning_rate": 2.6395785506392164e-06,
"loss": 0.18016864061355592,
"step": 1670
},
{
"epoch": 0.5376,
"grad_norm": 0.26899577641448663,
"learning_rate": 2.6116837519446407e-06,
"loss": 0.18437364101409912,
"step": 1680
},
{
"epoch": 0.5408,
"grad_norm": 0.29589553270345376,
"learning_rate": 2.5837750135127192e-06,
"loss": 0.18141529560089112,
"step": 1690
},
{
"epoch": 0.544,
"grad_norm": 0.28180995392351926,
"learning_rate": 2.555855818755108e-06,
"loss": 0.18680166006088256,
"step": 1700
},
{
"epoch": 0.5472,
"grad_norm": 0.29608650413456306,
"learning_rate": 2.5279296523885636e-06,
"loss": 0.18486298322677613,
"step": 1710
},
{
"epoch": 0.5504,
"grad_norm": 0.28475957723655715,
"learning_rate": 2.5e-06,
"loss": 0.1850725531578064,
"step": 1720
},
{
"epoch": 0.5536,
"grad_norm": 0.27856833997611247,
"learning_rate": 2.472070347611437e-06,
"loss": 0.1791991949081421,
"step": 1730
},
{
"epoch": 0.5568,
"grad_norm": 0.30516489860119894,
"learning_rate": 2.444144181244893e-06,
"loss": 0.18483606576919556,
"step": 1740
},
{
"epoch": 0.56,
"grad_norm": 0.29804656625996045,
"learning_rate": 2.416224986487282e-06,
"loss": 0.18195321559906005,
"step": 1750
},
{
"epoch": 0.5632,
"grad_norm": 0.30740179095263215,
"learning_rate": 2.3883162480553605e-06,
"loss": 0.17964634895324708,
"step": 1760
},
{
"epoch": 0.5664,
"grad_norm": 0.29672245353605753,
"learning_rate": 2.3604214493607844e-06,
"loss": 0.18308933973312377,
"step": 1770
},
{
"epoch": 0.5696,
"grad_norm": 0.2837212145176832,
"learning_rate": 2.332544072075333e-06,
"loss": 0.18688681125640869,
"step": 1780
},
{
"epoch": 0.5728,
"grad_norm": 0.28451872958084823,
"learning_rate": 2.30468759569634e-06,
"loss": 0.18532857894897461,
"step": 1790
},
{
"epoch": 0.576,
"grad_norm": 0.29734825652467917,
"learning_rate": 2.276855497112408e-06,
"loss": 0.18262310028076173,
"step": 1800
},
{
"epoch": 0.5792,
"grad_norm": 0.3012944650683003,
"learning_rate": 2.2490512501694394e-06,
"loss": 0.17781586647033693,
"step": 1810
},
{
"epoch": 0.5824,
"grad_norm": 0.2692920477116042,
"learning_rate": 2.2212783252370496e-06,
"loss": 0.18318163156509398,
"step": 1820
},
{
"epoch": 0.5856,
"grad_norm": 0.2700619255739624,
"learning_rate": 2.1935401887754213e-06,
"loss": 0.18857367038726808,
"step": 1830
},
{
"epoch": 0.5888,
"grad_norm": 0.2868516489290536,
"learning_rate": 2.165840302902632e-06,
"loss": 0.18190672397613525,
"step": 1840
},
{
"epoch": 0.592,
"grad_norm": 0.28726300225812107,
"learning_rate": 2.1381821249625383e-06,
"loss": 0.1867521286010742,
"step": 1850
},
{
"epoch": 0.5952,
"grad_norm": 0.2995145996099388,
"learning_rate": 2.1105691070932465e-06,
"loss": 0.17851842641830445,
"step": 1860
},
{
"epoch": 0.5984,
"grad_norm": 0.28575212768410063,
"learning_rate": 2.083004695796238e-06,
"loss": 0.17741835117340088,
"step": 1870
},
{
"epoch": 0.6016,
"grad_norm": 0.31284763297048707,
"learning_rate": 2.055492331506194e-06,
"loss": 0.1843113422393799,
"step": 1880
},
{
"epoch": 0.6048,
"grad_norm": 0.3170666816206652,
"learning_rate": 2.0280354481615814e-06,
"loss": 0.18248820304870605,
"step": 1890
},
{
"epoch": 0.608,
"grad_norm": 0.30950907311465886,
"learning_rate": 2.000637472776049e-06,
"loss": 0.1839754819869995,
"step": 1900
},
{
"epoch": 0.6112,
"grad_norm": 0.2536972696685391,
"learning_rate": 1.973301825010685e-06,
"loss": 0.17841637134552002,
"step": 1910
},
{
"epoch": 0.6144,
"grad_norm": 0.291862692607901,
"learning_rate": 1.9460319167471934e-06,
"loss": 0.18339977264404297,
"step": 1920
},
{
"epoch": 0.6176,
"grad_norm": 0.2848109477155621,
"learning_rate": 1.9188311516620466e-06,
"loss": 0.17915148735046388,
"step": 1930
},
{
"epoch": 0.6208,
"grad_norm": 0.3060077712638729,
"learning_rate": 1.891702924801651e-06,
"loss": 0.1848907709121704,
"step": 1940
},
{
"epoch": 0.624,
"grad_norm": 0.27297816434517674,
"learning_rate": 1.864650622158604e-06,
"loss": 0.18888840675354004,
"step": 1950
},
{
"epoch": 0.6272,
"grad_norm": 0.2781302448691454,
"learning_rate": 1.8376776202490666e-06,
"loss": 0.1847243309020996,
"step": 1960
},
{
"epoch": 0.6304,
"grad_norm": 0.31527749144779466,
"learning_rate": 1.8107872856913293e-06,
"loss": 0.17888798713684081,
"step": 1970
},
{
"epoch": 0.6336,
"grad_norm": 0.2981389294211551,
"learning_rate": 1.7839829747856096e-06,
"loss": 0.18081605434417725,
"step": 1980
},
{
"epoch": 0.6368,
"grad_norm": 0.29438595992497246,
"learning_rate": 1.7572680330951359e-06,
"loss": 0.17975808382034303,
"step": 1990
},
{
"epoch": 0.64,
"grad_norm": 0.2777422843592099,
"learning_rate": 1.7306457950285747e-06,
"loss": 0.1812159538269043,
"step": 2000
},
{
"epoch": 0.6432,
"grad_norm": 0.3068388373590525,
"learning_rate": 1.704119583423848e-06,
"loss": 0.17536230087280275,
"step": 2010
},
{
"epoch": 0.6464,
"grad_norm": 0.272885194568128,
"learning_rate": 1.677692709133396e-06,
"loss": 0.18365554809570311,
"step": 2020
},
{
"epoch": 0.6496,
"grad_norm": 0.3023336412584975,
"learning_rate": 1.6513684706109311e-06,
"loss": 0.18368566036224365,
"step": 2030
},
{
"epoch": 0.6528,
"grad_norm": 0.28988866387653284,
"learning_rate": 1.6251501534997529e-06,
"loss": 0.18175660371780394,
"step": 2040
},
{
"epoch": 0.656,
"grad_norm": 0.28123365590903454,
"learning_rate": 1.5990410302226405e-06,
"loss": 0.17483808994293212,
"step": 2050
},
{
"epoch": 0.6592,
"grad_norm": 0.28187049939921544,
"learning_rate": 1.5730443595734162e-06,
"loss": 0.18124582767486572,
"step": 2060
},
{
"epoch": 0.6624,
"grad_norm": 0.31643189708694724,
"learning_rate": 1.5471633863101982e-06,
"loss": 0.18188211917877198,
"step": 2070
},
{
"epoch": 0.6656,
"grad_norm": 0.3071146379480691,
"learning_rate": 1.521401340750407e-06,
"loss": 0.18458983898162842,
"step": 2080
},
{
"epoch": 0.6688,
"grad_norm": 0.30923765962914507,
"learning_rate": 1.495761438367577e-06,
"loss": 0.18291953802108765,
"step": 2090
},
{
"epoch": 0.672,
"grad_norm": 0.31506268222239586,
"learning_rate": 1.4702468793900187e-06,
"loss": 0.18112607002258302,
"step": 2100
},
{
"epoch": 0.6752,
"grad_norm": 0.2991031913192095,
"learning_rate": 1.444860848401384e-06,
"loss": 0.18132129907608033,
"step": 2110
},
{
"epoch": 0.6784,
"grad_norm": 0.306957825954438,
"learning_rate": 1.4196065139431866e-06,
"loss": 0.18091821670532227,
"step": 2120
},
{
"epoch": 0.6816,
"grad_norm": 0.30984784981623864,
"learning_rate": 1.3944870281193178e-06,
"loss": 0.17975277900695802,
"step": 2130
},
{
"epoch": 0.6848,
"grad_norm": 0.33685631116321924,
"learning_rate": 1.3695055262026208e-06,
"loss": 0.18606040477752686,
"step": 2140
},
{
"epoch": 0.688,
"grad_norm": 0.28362188085343176,
"learning_rate": 1.3446651262435679e-06,
"loss": 0.17845985889434815,
"step": 2150
},
{
"epoch": 0.6912,
"grad_norm": 0.28046286761312267,
"learning_rate": 1.3199689286810746e-06,
"loss": 0.18048195838928222,
"step": 2160
},
{
"epoch": 0.6944,
"grad_norm": 0.29900090645940436,
"learning_rate": 1.2954200159555294e-06,
"loss": 0.17538446187973022,
"step": 2170
},
{
"epoch": 0.6976,
"grad_norm": 0.32576508972663926,
"learning_rate": 1.2710214521240527e-06,
"loss": 0.18001599311828614,
"step": 2180
},
{
"epoch": 0.7008,
"grad_norm": 0.30869890145158635,
"learning_rate": 1.246776282478063e-06,
"loss": 0.18135268688201905,
"step": 2190
},
{
"epoch": 0.704,
"grad_norm": 0.28612747319198,
"learning_rate": 1.222687533163181e-06,
"loss": 0.18038851022720337,
"step": 2200
},
{
"epoch": 0.7072,
"grad_norm": 0.32303440375726766,
"learning_rate": 1.1987582108015228e-06,
"loss": 0.18109045028686524,
"step": 2210
},
{
"epoch": 0.7104,
"grad_norm": 0.3093047688685527,
"learning_rate": 1.1749913021164255e-06,
"loss": 0.18254566192626953,
"step": 2220
},
{
"epoch": 0.7136,
"grad_norm": 0.2882548432858515,
"learning_rate": 1.1513897735596702e-06,
"loss": 0.17732615470886232,
"step": 2230
},
{
"epoch": 0.7168,
"grad_norm": 0.29445166285798274,
"learning_rate": 1.127956570941218e-06,
"loss": 0.17425966262817383,
"step": 2240
},
{
"epoch": 0.72,
"grad_norm": 0.3514589237334647,
"learning_rate": 1.104694619061533e-06,
"loss": 0.18296418190002442,
"step": 2250
},
{
"epoch": 0.7232,
"grad_norm": 0.32323021290499837,
"learning_rate": 1.0816068213465295e-06,
"loss": 0.1851881265640259,
"step": 2260
},
{
"epoch": 0.7264,
"grad_norm": 0.30421571681673176,
"learning_rate": 1.0586960594851762e-06,
"loss": 0.180436372756958,
"step": 2270
},
{
"epoch": 0.7296,
"grad_norm": 0.31911631321578676,
"learning_rate": 1.0359651930698217e-06,
"loss": 0.17929892539978026,
"step": 2280
},
{
"epoch": 0.7328,
"grad_norm": 0.30015899620754233,
"learning_rate": 1.0134170592392837e-06,
"loss": 0.18022915124893188,
"step": 2290
},
{
"epoch": 0.736,
"grad_norm": 0.31786084969492157,
"learning_rate": 9.910544723247204e-07,
"loss": 0.17959039211273192,
"step": 2300
},
{
"epoch": 0.7392,
"grad_norm": 0.31599364626026827,
"learning_rate": 9.688802234983706e-07,
"loss": 0.17806137800216676,
"step": 2310
},
{
"epoch": 0.7424,
"grad_norm": 0.3303243768736776,
"learning_rate": 9.468970804251742e-07,
"loss": 0.1811964988708496,
"step": 2320
},
{
"epoch": 0.7456,
"grad_norm": 0.3312986961423255,
"learning_rate": 9.251077869173244e-07,
"loss": 0.17583439350128174,
"step": 2330
},
{
"epoch": 0.7488,
"grad_norm": 0.30030412592967864,
"learning_rate": 9.035150625918054e-07,
"loss": 0.17623555660247803,
"step": 2340
},
{
"epoch": 0.752,
"grad_norm": 0.3177646626866783,
"learning_rate": 8.821216025309395e-07,
"loss": 0.18003884553909302,
"step": 2350
},
{
"epoch": 0.7552,
"grad_norm": 0.3012142976429357,
"learning_rate": 8.609300769460055e-07,
"loss": 0.17543296813964843,
"step": 2360
},
{
"epoch": 0.7584,
"grad_norm": 0.3177168816443014,
"learning_rate": 8.399431308439592e-07,
"loss": 0.18021781444549562,
"step": 2370
},
{
"epoch": 0.7616,
"grad_norm": 0.34248252589513506,
"learning_rate": 8.191633836972962e-07,
"loss": 0.18125417232513427,
"step": 2380
},
{
"epoch": 0.7648,
"grad_norm": 0.29292480325152365,
"learning_rate": 7.985934291171024e-07,
"loss": 0.17757056951522826,
"step": 2390
},
{
"epoch": 0.768,
"grad_norm": 0.3257764859746147,
"learning_rate": 7.7823583452934e-07,
"loss": 0.18096057176589966,
"step": 2400
},
{
"epoch": 0.7712,
"grad_norm": 0.28892062916284306,
"learning_rate": 7.58093140854389e-07,
"loss": 0.18015010356903077,
"step": 2410
},
{
"epoch": 0.7744,
"grad_norm": 0.32360358107292697,
"learning_rate": 7.381678621899077e-07,
"loss": 0.18067935705184937,
"step": 2420
},
{
"epoch": 0.7776,
"grad_norm": 0.3139428787829718,
"learning_rate": 7.184624854970379e-07,
"loss": 0.1768512487411499,
"step": 2430
},
{
"epoch": 0.7808,
"grad_norm": 0.3182311104789415,
"learning_rate": 6.989794702899932e-07,
"loss": 0.17589566707611085,
"step": 2440
},
{
"epoch": 0.784,
"grad_norm": 0.3112954733861784,
"learning_rate": 6.797212483290777e-07,
"loss": 0.177903413772583,
"step": 2450
},
{
"epoch": 0.7872,
"grad_norm": 0.31026727362843554,
"learning_rate": 6.60690223317171e-07,
"loss": 0.17535500526428222,
"step": 2460
},
{
"epoch": 0.7904,
"grad_norm": 0.2855504901999764,
"learning_rate": 6.418887705997046e-07,
"loss": 0.1787285327911377,
"step": 2470
},
{
"epoch": 0.7936,
"grad_norm": 0.33581031525319194,
"learning_rate": 6.23319236868189e-07,
"loss": 0.181508469581604,
"step": 2480
},
{
"epoch": 0.7968,
"grad_norm": 0.30084134655605693,
"learning_rate": 6.049839398673141e-07,
"loss": 0.18244649171829225,
"step": 2490
},
{
"epoch": 0.8,
"grad_norm": 0.3207759323449182,
"learning_rate": 5.868851681056567e-07,
"loss": 0.18296375274658203,
"step": 2500
},
{
"epoch": 0.8032,
"grad_norm": 0.3103299858846911,
"learning_rate": 5.690251805700467e-07,
"loss": 0.18089601993560792,
"step": 2510
},
{
"epoch": 0.8064,
"grad_norm": 0.3310470653200237,
"learning_rate": 5.514062064436096e-07,
"loss": 0.1829407334327698,
"step": 2520
},
{
"epoch": 0.8096,
"grad_norm": 0.31783823046596615,
"learning_rate": 5.34030444827533e-07,
"loss": 0.17886234521865846,
"step": 2530
},
{
"epoch": 0.8128,
"grad_norm": 0.3279151171862584,
"learning_rate": 5.169000644665895e-07,
"loss": 0.17618993520736695,
"step": 2540
},
{
"epoch": 0.816,
"grad_norm": 0.3006249030100123,
"learning_rate": 5.000172034784442e-07,
"loss": 0.17779455184936524,
"step": 2550
},
{
"epoch": 0.8192,
"grad_norm": 0.3164261324675526,
"learning_rate": 4.833839690867853e-07,
"loss": 0.18002912998199463,
"step": 2560
},
{
"epoch": 0.8224,
"grad_norm": 0.31374931318878396,
"learning_rate": 4.6700243735831705e-07,
"loss": 0.173567795753479,
"step": 2570
},
{
"epoch": 0.8256,
"grad_norm": 0.31170459979916293,
"learning_rate": 4.508746529436311e-07,
"loss": 0.1724323034286499,
"step": 2580
},
{
"epoch": 0.8288,
"grad_norm": 0.3080863565290302,
"learning_rate": 4.350026288220083e-07,
"loss": 0.1794981598854065,
"step": 2590
},
{
"epoch": 0.832,
"grad_norm": 0.30618951989415283,
"learning_rate": 4.1938834605017133e-07,
"loss": 0.1761255979537964,
"step": 2600
},
{
"epoch": 0.8352,
"grad_norm": 0.3029510706797137,
"learning_rate": 4.0403375351501515e-07,
"loss": 0.17623082399368287,
"step": 2610
},
{
"epoch": 0.8384,
"grad_norm": 0.336336912959277,
"learning_rate": 3.88940767690362e-07,
"loss": 0.1757615327835083,
"step": 2620
},
{
"epoch": 0.8416,
"grad_norm": 0.32859024308656015,
"learning_rate": 3.7411127239775774e-07,
"loss": 0.17455869913101196,
"step": 2630
},
{
"epoch": 0.8448,
"grad_norm": 0.3174124959768476,
"learning_rate": 3.595471185713431e-07,
"loss": 0.17312180995941162,
"step": 2640
},
{
"epoch": 0.848,
"grad_norm": 0.3247217043719523,
"learning_rate": 3.4525012402682826e-07,
"loss": 0.17421470880508422,
"step": 2650
},
{
"epoch": 0.8512,
"grad_norm": 0.3290462164412991,
"learning_rate": 3.3122207323460804e-07,
"loss": 0.17708632946014405,
"step": 2660
},
{
"epoch": 0.8544,
"grad_norm": 0.3024938333869805,
"learning_rate": 3.1746471709702963e-07,
"loss": 0.17333836555480958,
"step": 2670
},
{
"epoch": 0.8576,
"grad_norm": 0.32678703604131465,
"learning_rate": 3.039797727298585e-07,
"loss": 0.1801586151123047,
"step": 2680
},
{
"epoch": 0.8608,
"grad_norm": 0.32985764106850785,
"learning_rate": 2.9076892324795546e-07,
"loss": 0.17783432006835936,
"step": 2690
},
{
"epoch": 0.864,
"grad_norm": 0.31242585953952057,
"learning_rate": 2.778338175551995e-07,
"loss": 0.17357670068740844,
"step": 2700
},
{
"epoch": 0.8672,
"grad_norm": 0.3220012856306909,
"learning_rate": 2.6517607013868326e-07,
"loss": 0.18131563663482667,
"step": 2710
},
{
"epoch": 0.8704,
"grad_norm": 0.33350326064348024,
"learning_rate": 2.527972608672002e-07,
"loss": 0.17757024765014648,
"step": 2720
},
{
"epoch": 0.8736,
"grad_norm": 0.335919926946263,
"learning_rate": 2.40698934794053e-07,
"loss": 0.17683808803558348,
"step": 2730
},
{
"epoch": 0.8768,
"grad_norm": 0.3209912976041497,
"learning_rate": 2.2888260196421237e-07,
"loss": 0.17635661363601685,
"step": 2740
},
{
"epoch": 0.88,
"grad_norm": 0.3165955269677658,
"learning_rate": 2.1734973722583735e-07,
"loss": 0.17913974523544313,
"step": 2750
},
{
"epoch": 0.8832,
"grad_norm": 0.31474674596852353,
"learning_rate": 2.0610178004619564e-07,
"loss": 0.17095563411712647,
"step": 2760
},
{
"epoch": 0.8864,
"grad_norm": 0.305115903859637,
"learning_rate": 1.9514013433199834e-07,
"loss": 0.18293533325195313,
"step": 2770
},
{
"epoch": 0.8896,
"grad_norm": 0.3164297745100823,
"learning_rate": 1.8446616825416958e-07,
"loss": 0.18138229846954346,
"step": 2780
},
{
"epoch": 0.8928,
"grad_norm": 0.3526140625065779,
"learning_rate": 1.7408121407708007e-07,
"loss": 0.18163397312164306,
"step": 2790
},
{
"epoch": 0.896,
"grad_norm": 0.3224933819196559,
"learning_rate": 1.6398656799226253e-07,
"loss": 0.1705089807510376,
"step": 2800
},
{
"epoch": 0.8992,
"grad_norm": 0.31764589677400257,
"learning_rate": 1.5418348995662773e-07,
"loss": 0.17652597427368164,
"step": 2810
},
{
"epoch": 0.9024,
"grad_norm": 0.3414067132784035,
"learning_rate": 1.4467320353520275e-07,
"loss": 0.17487871646881104,
"step": 2820
},
{
"epoch": 0.9056,
"grad_norm": 0.3138098972679996,
"learning_rate": 1.3545689574841341e-07,
"loss": 0.17592911720275878,
"step": 2830
},
{
"epoch": 0.9088,
"grad_norm": 0.31560573280288073,
"learning_rate": 1.26535716923927e-07,
"loss": 0.18197228908538818,
"step": 2840
},
{
"epoch": 0.912,
"grad_norm": 0.3188962184685744,
"learning_rate": 1.1791078055307493e-07,
"loss": 0.1777464509010315,
"step": 2850
},
{
"epoch": 0.9152,
"grad_norm": 0.31575220367713525,
"learning_rate": 1.0958316315187289e-07,
"loss": 0.17706483602523804,
"step": 2860
},
{
"epoch": 0.9184,
"grad_norm": 0.3131837624055497,
"learning_rate": 1.0155390412665528e-07,
"loss": 0.17496002912521363,
"step": 2870
},
{
"epoch": 0.9216,
"grad_norm": 0.32248583567737266,
"learning_rate": 9.38240056443443e-08,
"loss": 0.17229046821594238,
"step": 2880
},
{
"epoch": 0.9248,
"grad_norm": 0.3101253584845484,
"learning_rate": 8.639443250736402e-08,
"loss": 0.17552309036254882,
"step": 2890
},
{
"epoch": 0.928,
"grad_norm": 0.33217431742972764,
"learning_rate": 7.926611203321777e-08,
"loss": 0.17659810781478882,
"step": 2900
},
{
"epoch": 0.9312,
"grad_norm": 0.33918124282098266,
"learning_rate": 7.243993393874882e-08,
"loss": 0.17737939357757568,
"step": 2910
},
{
"epoch": 0.9344,
"grad_norm": 0.31351790893613213,
"learning_rate": 6.591675022908805e-08,
"loss": 0.1745692253112793,
"step": 2920
},
{
"epoch": 0.9376,
"grad_norm": 0.33783778867129854,
"learning_rate": 5.969737509131241e-08,
"loss": 0.1722058415412903,
"step": 2930
},
{
"epoch": 0.9408,
"grad_norm": 0.308776655874055,
"learning_rate": 5.3782584792823334e-08,
"loss": 0.17710112333297728,
"step": 2940
},
{
"epoch": 0.944,
"grad_norm": 0.3142338038371378,
"learning_rate": 4.817311758445686e-08,
"loss": 0.178252911567688,
"step": 2950
},
{
"epoch": 0.9472,
"grad_norm": 0.33048986218580767,
"learning_rate": 4.286967360833866e-08,
"loss": 0.1782402753829956,
"step": 2960
},
{
"epoch": 0.9504,
"grad_norm": 0.3110909627270251,
"learning_rate": 3.787291481049754e-08,
"loss": 0.17829475402832032,
"step": 2970
},
{
"epoch": 0.9536,
"grad_norm": 0.33726065147122686,
"learning_rate": 3.3183464858244364e-08,
"loss": 0.18406097888946532,
"step": 2980
},
{
"epoch": 0.9568,
"grad_norm": 0.3326393750086487,
"learning_rate": 2.8801909062328992e-08,
"loss": 0.17060396671295167,
"step": 2990
},
{
"epoch": 0.96,
"grad_norm": 0.32948960265922206,
"learning_rate": 2.4728794303886248e-08,
"loss": 0.16899311542510986,
"step": 3000
},
{
"epoch": 0.9632,
"grad_norm": 0.33211982053439487,
"learning_rate": 2.0964628966175794e-08,
"loss": 0.17517964839935302,
"step": 3010
},
{
"epoch": 0.9664,
"grad_norm": 0.30613498697830943,
"learning_rate": 1.750988287113009e-08,
"loss": 0.17458994388580323,
"step": 3020
},
{
"epoch": 0.9696,
"grad_norm": 0.3027770955918648,
"learning_rate": 1.4364987220713278e-08,
"loss": 0.18178436756134034,
"step": 3030
},
{
"epoch": 0.9728,
"grad_norm": 0.3292318037983906,
"learning_rate": 1.1530334543099763e-08,
"loss": 0.1790144681930542,
"step": 3040
},
{
"epoch": 0.976,
"grad_norm": 0.33300755292787143,
"learning_rate": 9.006278643683697e-09,
"loss": 0.1808505654335022,
"step": 3050
},
{
"epoch": 0.9792,
"grad_norm": 0.32631723332989787,
"learning_rate": 6.793134560916514e-09,
"loss": 0.18275127410888672,
"step": 3060
},
{
"epoch": 0.9824,
"grad_norm": 0.3082787331662993,
"learning_rate": 4.891178526986451e-09,
"loss": 0.1783647656440735,
"step": 3070
},
{
"epoch": 0.9856,
"grad_norm": 0.32341550392390483,
"learning_rate": 3.3006479333413943e-09,
"loss": 0.18126009702682494,
"step": 3080
},
{
"epoch": 0.9888,
"grad_norm": 0.30931371888762194,
"learning_rate": 2.021741301058422e-09,
"loss": 0.17681236267089845,
"step": 3090
},
{
"epoch": 0.992,
"grad_norm": 0.3419672311636941,
"learning_rate": 1.0546182560652872e-09,
"loss": 0.17989683151245117,
"step": 3100
},
{
"epoch": 0.9952,
"grad_norm": 0.3111951639834393,
"learning_rate": 3.9939950921774607e-10,
"loss": 0.17482796907424927,
"step": 3110
},
{
"epoch": 0.9984,
"grad_norm": 0.3230413672933209,
"learning_rate": 5.616684123160854e-11,
"loss": 0.17436976432800294,
"step": 3120
},
{
"epoch": 1.0,
"step": 3125,
"total_flos": 1955525886476288.0,
"train_loss": 0.1948647116279602,
"train_runtime": 28413.61,
"train_samples_per_second": 7.039,
"train_steps_per_second": 0.11
}
],
"logging_steps": 10,
"max_steps": 3125,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1955525886476288.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}