{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.7979688066739209,
"eval_steps": 500,
"global_step": 2200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0003627130939426913,
"grad_norm": 0.64821457862854,
"learning_rate": 0.0,
"loss": 0.4482,
"step": 1
},
{
"epoch": 0.0007254261878853826,
"grad_norm": 0.6469861268997192,
"learning_rate": 2e-05,
"loss": 0.4874,
"step": 2
},
{
"epoch": 0.001088139281828074,
"grad_norm": 0.45289790630340576,
"learning_rate": 4e-05,
"loss": 0.4732,
"step": 3
},
{
"epoch": 0.0014508523757707653,
"grad_norm": 0.38072678446769714,
"learning_rate": 6e-05,
"loss": 0.4503,
"step": 4
},
{
"epoch": 0.0018135654697134566,
"grad_norm": 0.4032226800918579,
"learning_rate": 8e-05,
"loss": 0.4312,
"step": 5
},
{
"epoch": 0.002176278563656148,
"grad_norm": 0.3684772551059723,
"learning_rate": 0.0001,
"loss": 0.4055,
"step": 6
},
{
"epoch": 0.0025389916575988395,
"grad_norm": 0.3409311771392822,
"learning_rate": 0.00012,
"loss": 0.4201,
"step": 7
},
{
"epoch": 0.0029017047515415306,
"grad_norm": 0.3772580921649933,
"learning_rate": 0.00014,
"loss": 0.4086,
"step": 8
},
{
"epoch": 0.003264417845484222,
"grad_norm": 0.30869755148887634,
"learning_rate": 0.00016,
"loss": 0.3954,
"step": 9
},
{
"epoch": 0.003627130939426913,
"grad_norm": 0.23723824322223663,
"learning_rate": 0.00018,
"loss": 0.3992,
"step": 10
},
{
"epoch": 0.003989844033369605,
"grad_norm": 0.18091322481632233,
"learning_rate": 0.0002,
"loss": 0.368,
"step": 11
},
{
"epoch": 0.004352557127312296,
"grad_norm": 0.20436523854732513,
"learning_rate": 0.00019999993460381957,
"loss": 0.3711,
"step": 12
},
{
"epoch": 0.004715270221254987,
"grad_norm": 0.19207683205604553,
"learning_rate": 0.00019999973841536373,
"loss": 0.3788,
"step": 13
},
{
"epoch": 0.005077983315197679,
"grad_norm": 0.1436341255903244,
"learning_rate": 0.00019999941143488914,
"loss": 0.3936,
"step": 14
},
{
"epoch": 0.00544069640914037,
"grad_norm": 0.13892005383968353,
"learning_rate": 0.0001999989536628234,
"loss": 0.4062,
"step": 15
},
{
"epoch": 0.005803409503083061,
"grad_norm": 0.12910069525241852,
"learning_rate": 0.00019999836509976534,
"loss": 0.3863,
"step": 16
},
{
"epoch": 0.006166122597025753,
"grad_norm": 0.10763731598854065,
"learning_rate": 0.00019999764574648465,
"loss": 0.3757,
"step": 17
},
{
"epoch": 0.006528835690968444,
"grad_norm": 0.1078948974609375,
"learning_rate": 0.00019999679560392226,
"loss": 0.3342,
"step": 18
},
{
"epoch": 0.006891548784911135,
"grad_norm": 0.10403122007846832,
"learning_rate": 0.00019999581467319006,
"loss": 0.3371,
"step": 19
},
{
"epoch": 0.007254261878853826,
"grad_norm": 0.09776491671800613,
"learning_rate": 0.00019999470295557105,
"loss": 0.3263,
"step": 20
},
{
"epoch": 0.007616974972796518,
"grad_norm": 0.10792049765586853,
"learning_rate": 0.00019999346045251925,
"loss": 0.3447,
"step": 21
},
{
"epoch": 0.00797968806673921,
"grad_norm": 0.1174544170498848,
"learning_rate": 0.00019999208716565977,
"loss": 0.336,
"step": 22
},
{
"epoch": 0.008342401160681901,
"grad_norm": 0.09458769857883453,
"learning_rate": 0.0001999905830967888,
"loss": 0.3262,
"step": 23
},
{
"epoch": 0.008705114254624592,
"grad_norm": 0.09644383937120438,
"learning_rate": 0.0001999889482478735,
"loss": 0.3361,
"step": 24
},
{
"epoch": 0.009067827348567283,
"grad_norm": 0.09843447804450989,
"learning_rate": 0.0001999871826210521,
"loss": 0.3485,
"step": 25
},
{
"epoch": 0.009430540442509974,
"grad_norm": 0.10075519979000092,
"learning_rate": 0.00019998528621863396,
"loss": 0.3461,
"step": 26
},
{
"epoch": 0.009793253536452665,
"grad_norm": 0.09084542095661163,
"learning_rate": 0.00019998325904309946,
"loss": 0.3267,
"step": 27
},
{
"epoch": 0.010155966630395358,
"grad_norm": 0.10560671985149384,
"learning_rate": 0.00019998110109709988,
"loss": 0.3532,
"step": 28
},
{
"epoch": 0.01051867972433805,
"grad_norm": 0.08736245334148407,
"learning_rate": 0.00019997881238345775,
"loss": 0.37,
"step": 29
},
{
"epoch": 0.01088139281828074,
"grad_norm": 0.103543221950531,
"learning_rate": 0.0001999763929051665,
"loss": 0.3093,
"step": 30
},
{
"epoch": 0.011244105912223431,
"grad_norm": 0.09106361120939255,
"learning_rate": 0.0001999738426653906,
"loss": 0.3231,
"step": 31
},
{
"epoch": 0.011606819006166122,
"grad_norm": 0.09385113418102264,
"learning_rate": 0.00019997116166746562,
"loss": 0.3162,
"step": 32
},
{
"epoch": 0.011969532100108813,
"grad_norm": 0.10086266696453094,
"learning_rate": 0.00019996834991489805,
"loss": 0.3105,
"step": 33
},
{
"epoch": 0.012332245194051506,
"grad_norm": 0.08959592878818512,
"learning_rate": 0.00019996540741136544,
"loss": 0.3241,
"step": 34
},
{
"epoch": 0.012694958287994197,
"grad_norm": 0.10446605086326599,
"learning_rate": 0.00019996233416071644,
"loss": 0.338,
"step": 35
},
{
"epoch": 0.013057671381936888,
"grad_norm": 0.08997010439634323,
"learning_rate": 0.00019995913016697053,
"loss": 0.3089,
"step": 36
},
{
"epoch": 0.01342038447587958,
"grad_norm": 0.09340513497591019,
"learning_rate": 0.00019995579543431835,
"loss": 0.3167,
"step": 37
},
{
"epoch": 0.01378309756982227,
"grad_norm": 0.0928172841668129,
"learning_rate": 0.00019995232996712146,
"loss": 0.3236,
"step": 38
},
{
"epoch": 0.014145810663764961,
"grad_norm": 0.10568640381097794,
"learning_rate": 0.0001999487337699124,
"loss": 0.3213,
"step": 39
},
{
"epoch": 0.014508523757707652,
"grad_norm": 0.09213658422231674,
"learning_rate": 0.0001999450068473948,
"loss": 0.3308,
"step": 40
},
{
"epoch": 0.014871236851650345,
"grad_norm": 0.09331916272640228,
"learning_rate": 0.0001999411492044431,
"loss": 0.3265,
"step": 41
},
{
"epoch": 0.015233949945593036,
"grad_norm": 0.0938337966799736,
"learning_rate": 0.00019993716084610284,
"loss": 0.3084,
"step": 42
},
{
"epoch": 0.015596663039535727,
"grad_norm": 0.1006985679268837,
"learning_rate": 0.00019993304177759046,
"loss": 0.3932,
"step": 43
},
{
"epoch": 0.01595937613347842,
"grad_norm": 0.09978915005922318,
"learning_rate": 0.00019992879200429346,
"loss": 0.3147,
"step": 44
},
{
"epoch": 0.01632208922742111,
"grad_norm": 0.095309779047966,
"learning_rate": 0.00019992441153177015,
"loss": 0.3271,
"step": 45
},
{
"epoch": 0.016684802321363802,
"grad_norm": 0.09498284012079239,
"learning_rate": 0.00019991990036574987,
"loss": 0.3138,
"step": 46
},
{
"epoch": 0.017047515415306493,
"grad_norm": 0.09961807727813721,
"learning_rate": 0.0001999152585121329,
"loss": 0.3447,
"step": 47
},
{
"epoch": 0.017410228509249184,
"grad_norm": 0.11686038225889206,
"learning_rate": 0.0001999104859769904,
"loss": 0.3059,
"step": 48
},
{
"epoch": 0.017772941603191875,
"grad_norm": 0.09790387004613876,
"learning_rate": 0.0001999055827665645,
"loss": 0.3241,
"step": 49
},
{
"epoch": 0.018135654697134566,
"grad_norm": 0.0987682044506073,
"learning_rate": 0.00019990054888726824,
"loss": 0.3159,
"step": 50
},
{
"epoch": 0.018498367791077257,
"grad_norm": 0.09558644145727158,
"learning_rate": 0.0001998953843456855,
"loss": 0.3528,
"step": 51
},
{
"epoch": 0.01886108088501995,
"grad_norm": 0.1173083484172821,
"learning_rate": 0.00019989008914857116,
"loss": 0.3138,
"step": 52
},
{
"epoch": 0.01922379397896264,
"grad_norm": 0.09404181689023972,
"learning_rate": 0.0001998846633028509,
"loss": 0.3262,
"step": 53
},
{
"epoch": 0.01958650707290533,
"grad_norm": 0.09235358238220215,
"learning_rate": 0.00019987910681562132,
"loss": 0.3271,
"step": 54
},
{
"epoch": 0.01994922016684802,
"grad_norm": 0.10229232162237167,
"learning_rate": 0.0001998734196941499,
"loss": 0.3098,
"step": 55
},
{
"epoch": 0.020311933260790716,
"grad_norm": 0.08622050285339355,
"learning_rate": 0.0001998676019458749,
"loss": 0.2878,
"step": 56
},
{
"epoch": 0.020674646354733407,
"grad_norm": 0.10718828439712524,
"learning_rate": 0.00019986165357840558,
"loss": 0.319,
"step": 57
},
{
"epoch": 0.0210373594486761,
"grad_norm": 0.09529942274093628,
"learning_rate": 0.00019985557459952188,
"loss": 0.2974,
"step": 58
},
{
"epoch": 0.02140007254261879,
"grad_norm": 0.09532184153795242,
"learning_rate": 0.00019984936501717468,
"loss": 0.3016,
"step": 59
},
{
"epoch": 0.02176278563656148,
"grad_norm": 0.098875492811203,
"learning_rate": 0.00019984302483948562,
"loss": 0.3006,
"step": 60
},
{
"epoch": 0.02212549873050417,
"grad_norm": 0.1071372851729393,
"learning_rate": 0.00019983655407474719,
"loss": 0.2796,
"step": 61
},
{
"epoch": 0.022488211824446862,
"grad_norm": 0.11236250400543213,
"learning_rate": 0.0001998299527314226,
"loss": 0.3067,
"step": 62
},
{
"epoch": 0.022850924918389554,
"grad_norm": 0.10537782311439514,
"learning_rate": 0.00019982322081814596,
"loss": 0.3415,
"step": 63
},
{
"epoch": 0.023213638012332245,
"grad_norm": 0.09595459699630737,
"learning_rate": 0.00019981635834372209,
"loss": 0.3076,
"step": 64
},
{
"epoch": 0.023576351106274936,
"grad_norm": 0.09259078651666641,
"learning_rate": 0.00019980936531712652,
"loss": 0.2913,
"step": 65
},
{
"epoch": 0.023939064200217627,
"grad_norm": 0.20734301209449768,
"learning_rate": 0.00019980224174750568,
"loss": 0.3102,
"step": 66
},
{
"epoch": 0.024301777294160318,
"grad_norm": 0.10769975185394287,
"learning_rate": 0.0001997949876441766,
"loss": 0.3336,
"step": 67
},
{
"epoch": 0.024664490388103012,
"grad_norm": 0.1010124459862709,
"learning_rate": 0.00019978760301662715,
"loss": 0.3305,
"step": 68
},
{
"epoch": 0.025027203482045703,
"grad_norm": 0.09571480005979538,
"learning_rate": 0.0001997800878745158,
"loss": 0.3181,
"step": 69
},
{
"epoch": 0.025389916575988394,
"grad_norm": 0.10125493258237839,
"learning_rate": 0.00019977244222767182,
"loss": 0.2873,
"step": 70
},
{
"epoch": 0.025752629669931085,
"grad_norm": 0.11057893931865692,
"learning_rate": 0.0001997646660860951,
"loss": 0.3125,
"step": 71
},
{
"epoch": 0.026115342763873776,
"grad_norm": 0.1009269654750824,
"learning_rate": 0.00019975675945995626,
"loss": 0.3038,
"step": 72
},
{
"epoch": 0.026478055857816468,
"grad_norm": 0.09274876117706299,
"learning_rate": 0.00019974872235959654,
"loss": 0.292,
"step": 73
},
{
"epoch": 0.02684076895175916,
"grad_norm": 0.09206151217222214,
"learning_rate": 0.00019974055479552791,
"loss": 0.3064,
"step": 74
},
{
"epoch": 0.02720348204570185,
"grad_norm": 0.09455125778913498,
"learning_rate": 0.00019973225677843284,
"loss": 0.3031,
"step": 75
},
{
"epoch": 0.02756619513964454,
"grad_norm": 0.10313421487808228,
"learning_rate": 0.00019972382831916457,
"loss": 0.2975,
"step": 76
},
{
"epoch": 0.02792890823358723,
"grad_norm": 0.08839363604784012,
"learning_rate": 0.00019971526942874686,
"loss": 0.2926,
"step": 77
},
{
"epoch": 0.028291621327529923,
"grad_norm": 0.0924365371465683,
"learning_rate": 0.00019970658011837404,
"loss": 0.3071,
"step": 78
},
{
"epoch": 0.028654334421472614,
"grad_norm": 0.09888923168182373,
"learning_rate": 0.00019969776039941114,
"loss": 0.3004,
"step": 79
},
{
"epoch": 0.029017047515415305,
"grad_norm": 0.09569084644317627,
"learning_rate": 0.00019968881028339363,
"loss": 0.2923,
"step": 80
},
{
"epoch": 0.029379760609358,
"grad_norm": 0.11503931879997253,
"learning_rate": 0.0001996797297820276,
"loss": 0.3117,
"step": 81
},
{
"epoch": 0.02974247370330069,
"grad_norm": 0.09839354455471039,
"learning_rate": 0.00019967051890718964,
"loss": 0.2802,
"step": 82
},
{
"epoch": 0.03010518679724338,
"grad_norm": 0.09043775498867035,
"learning_rate": 0.00019966117767092686,
"loss": 0.2877,
"step": 83
},
{
"epoch": 0.030467899891186073,
"grad_norm": 0.09896934777498245,
"learning_rate": 0.00019965170608545688,
"loss": 0.3094,
"step": 84
},
{
"epoch": 0.030830612985128764,
"grad_norm": 0.09892911463975906,
"learning_rate": 0.00019964210416316787,
"loss": 0.302,
"step": 85
},
{
"epoch": 0.031193326079071455,
"grad_norm": 0.0898653194308281,
"learning_rate": 0.00019963237191661834,
"loss": 0.2982,
"step": 86
},
{
"epoch": 0.031556039173014146,
"grad_norm": 0.10663247853517532,
"learning_rate": 0.00019962250935853736,
"loss": 0.2943,
"step": 87
},
{
"epoch": 0.03191875226695684,
"grad_norm": 0.09792915731668472,
"learning_rate": 0.0001996125165018244,
"loss": 0.2826,
"step": 88
},
{
"epoch": 0.03228146536089953,
"grad_norm": 0.09535045176744461,
"learning_rate": 0.00019960239335954936,
"loss": 0.3026,
"step": 89
},
{
"epoch": 0.03264417845484222,
"grad_norm": 0.08838774263858795,
"learning_rate": 0.0001995921399449525,
"loss": 0.277,
"step": 90
},
{
"epoch": 0.03300689154878491,
"grad_norm": 0.09616609662771225,
"learning_rate": 0.00019958175627144453,
"loss": 0.3015,
"step": 91
},
{
"epoch": 0.033369604642727604,
"grad_norm": 0.0945005938410759,
"learning_rate": 0.00019957124235260652,
"loss": 0.288,
"step": 92
},
{
"epoch": 0.03373231773667029,
"grad_norm": 0.10378480702638626,
"learning_rate": 0.00019956059820218982,
"loss": 0.3361,
"step": 93
},
{
"epoch": 0.034095030830612987,
"grad_norm": 0.09242385625839233,
"learning_rate": 0.0001995498238341162,
"loss": 0.2903,
"step": 94
},
{
"epoch": 0.034457743924555674,
"grad_norm": 0.0919501855969429,
"learning_rate": 0.00019953891926247774,
"loss": 0.3025,
"step": 95
},
{
"epoch": 0.03482045701849837,
"grad_norm": 0.09978862851858139,
"learning_rate": 0.00019952788450153675,
"loss": 0.3335,
"step": 96
},
{
"epoch": 0.035183170112441056,
"grad_norm": 0.10097439587116241,
"learning_rate": 0.00019951671956572583,
"loss": 0.3137,
"step": 97
},
{
"epoch": 0.03554588320638375,
"grad_norm": 0.1043080985546112,
"learning_rate": 0.00019950542446964793,
"loss": 0.2896,
"step": 98
},
{
"epoch": 0.035908596300326445,
"grad_norm": 0.09220679104328156,
"learning_rate": 0.00019949399922807612,
"loss": 0.3031,
"step": 99
},
{
"epoch": 0.03627130939426913,
"grad_norm": 0.10692602396011353,
"learning_rate": 0.00019948244385595374,
"loss": 0.3057,
"step": 100
},
{
"epoch": 0.03663402248821183,
"grad_norm": 0.10609027743339539,
"learning_rate": 0.00019947075836839438,
"loss": 0.3082,
"step": 101
},
{
"epoch": 0.036996735582154515,
"grad_norm": 0.16867391765117645,
"learning_rate": 0.00019945894278068172,
"loss": 0.302,
"step": 102
},
{
"epoch": 0.03735944867609721,
"grad_norm": 0.09805990755558014,
"learning_rate": 0.00019944699710826966,
"loss": 0.3218,
"step": 103
},
{
"epoch": 0.0377221617700399,
"grad_norm": 0.09552697837352753,
"learning_rate": 0.00019943492136678223,
"loss": 0.2576,
"step": 104
},
{
"epoch": 0.03808487486398259,
"grad_norm": 0.09718494862318039,
"learning_rate": 0.0001994227155720136,
"loss": 0.2882,
"step": 105
},
{
"epoch": 0.03844758795792528,
"grad_norm": 0.0933772400021553,
"learning_rate": 0.000199410379739928,
"loss": 0.3069,
"step": 106
},
{
"epoch": 0.038810301051867974,
"grad_norm": 0.09682098776102066,
"learning_rate": 0.00019939791388665974,
"loss": 0.3013,
"step": 107
},
{
"epoch": 0.03917301414581066,
"grad_norm": 0.1064608246088028,
"learning_rate": 0.0001993853180285132,
"loss": 0.3307,
"step": 108
},
{
"epoch": 0.039535727239753356,
"grad_norm": 0.09508496522903442,
"learning_rate": 0.00019937259218196282,
"loss": 0.2968,
"step": 109
},
{
"epoch": 0.03989844033369604,
"grad_norm": 0.10839469730854034,
"learning_rate": 0.00019935973636365305,
"loss": 0.3017,
"step": 110
},
{
"epoch": 0.04026115342763874,
"grad_norm": 0.10720638930797577,
"learning_rate": 0.00019934675059039828,
"loss": 0.2817,
"step": 111
},
{
"epoch": 0.04062386652158143,
"grad_norm": 0.10672647505998611,
"learning_rate": 0.00019933363487918294,
"loss": 0.2876,
"step": 112
},
{
"epoch": 0.04098657961552412,
"grad_norm": 0.10290908813476562,
"learning_rate": 0.00019932038924716134,
"loss": 0.2906,
"step": 113
},
{
"epoch": 0.041349292709466814,
"grad_norm": 0.11226241290569305,
"learning_rate": 0.0001993070137116578,
"loss": 0.2816,
"step": 114
},
{
"epoch": 0.0417120058034095,
"grad_norm": 0.09558378159999847,
"learning_rate": 0.00019929350829016648,
"loss": 0.3115,
"step": 115
},
{
"epoch": 0.0420747188973522,
"grad_norm": 0.10267224162817001,
"learning_rate": 0.00019927987300035147,
"loss": 0.3035,
"step": 116
},
{
"epoch": 0.042437431991294884,
"grad_norm": 0.09401127696037292,
"learning_rate": 0.00019926610786004663,
"loss": 0.2995,
"step": 117
},
{
"epoch": 0.04280014508523758,
"grad_norm": 0.10615453869104385,
"learning_rate": 0.00019925221288725573,
"loss": 0.3062,
"step": 118
},
{
"epoch": 0.043162858179180266,
"grad_norm": 0.11928743124008179,
"learning_rate": 0.00019923818810015236,
"loss": 0.317,
"step": 119
},
{
"epoch": 0.04352557127312296,
"grad_norm": 0.10731657594442368,
"learning_rate": 0.00019922403351707983,
"loss": 0.3261,
"step": 120
},
{
"epoch": 0.04388828436706565,
"grad_norm": 0.10545065253973007,
"learning_rate": 0.0001992097491565513,
"loss": 0.3125,
"step": 121
},
{
"epoch": 0.04425099746100834,
"grad_norm": 0.1098426803946495,
"learning_rate": 0.0001991953350372496,
"loss": 0.2928,
"step": 122
},
{
"epoch": 0.04461371055495103,
"grad_norm": 0.09736689925193787,
"learning_rate": 0.00019918079117802725,
"loss": 0.2736,
"step": 123
},
{
"epoch": 0.044976423648893725,
"grad_norm": 0.11810169368982315,
"learning_rate": 0.0001991661175979066,
"loss": 0.2806,
"step": 124
},
{
"epoch": 0.04533913674283642,
"grad_norm": 0.11560354381799698,
"learning_rate": 0.00019915131431607952,
"loss": 0.317,
"step": 125
},
{
"epoch": 0.04570184983677911,
"grad_norm": 0.11197232455015182,
"learning_rate": 0.00019913638135190756,
"loss": 0.3382,
"step": 126
},
{
"epoch": 0.0460645629307218,
"grad_norm": 0.1027117446064949,
"learning_rate": 0.0001991213187249219,
"loss": 0.2684,
"step": 127
},
{
"epoch": 0.04642727602466449,
"grad_norm": 0.10549558699131012,
"learning_rate": 0.00019910612645482334,
"loss": 0.2939,
"step": 128
},
{
"epoch": 0.046789989118607184,
"grad_norm": 0.09976191818714142,
"learning_rate": 0.00019909080456148218,
"loss": 0.2878,
"step": 129
},
{
"epoch": 0.04715270221254987,
"grad_norm": 0.10141481459140778,
"learning_rate": 0.0001990753530649383,
"loss": 0.2959,
"step": 130
},
{
"epoch": 0.047515415306492566,
"grad_norm": 0.10536810010671616,
"learning_rate": 0.00019905977198540105,
"loss": 0.283,
"step": 131
},
{
"epoch": 0.04787812840043525,
"grad_norm": 0.1081426814198494,
"learning_rate": 0.00019904406134324933,
"loss": 0.2982,
"step": 132
},
{
"epoch": 0.04824084149437795,
"grad_norm": 0.10106177628040314,
"learning_rate": 0.00019902822115903143,
"loss": 0.3301,
"step": 133
},
{
"epoch": 0.048603554588320635,
"grad_norm": 0.09809243679046631,
"learning_rate": 0.0001990122514534651,
"loss": 0.2868,
"step": 134
},
{
"epoch": 0.04896626768226333,
"grad_norm": 0.10104624181985855,
"learning_rate": 0.00019899615224743753,
"loss": 0.3035,
"step": 135
},
{
"epoch": 0.049328980776206025,
"grad_norm": 0.09421058744192123,
"learning_rate": 0.0001989799235620052,
"loss": 0.2982,
"step": 136
},
{
"epoch": 0.04969169387014871,
"grad_norm": 0.09937946498394012,
"learning_rate": 0.00019896356541839404,
"loss": 0.2988,
"step": 137
},
{
"epoch": 0.05005440696409141,
"grad_norm": 0.10086655616760254,
"learning_rate": 0.00019894707783799925,
"loss": 0.2849,
"step": 138
},
{
"epoch": 0.050417120058034094,
"grad_norm": 0.09309150278568268,
"learning_rate": 0.0001989304608423853,
"loss": 0.2792,
"step": 139
},
{
"epoch": 0.05077983315197679,
"grad_norm": 0.15080593526363373,
"learning_rate": 0.00019891371445328592,
"loss": 0.2993,
"step": 140
},
{
"epoch": 0.051142546245919476,
"grad_norm": 0.09852839261293411,
"learning_rate": 0.0001988968386926042,
"loss": 0.2887,
"step": 141
},
{
"epoch": 0.05150525933986217,
"grad_norm": 0.13169077038764954,
"learning_rate": 0.00019887983358241225,
"loss": 0.2889,
"step": 142
},
{
"epoch": 0.05186797243380486,
"grad_norm": 0.203284353017807,
"learning_rate": 0.0001988626991449515,
"loss": 0.2762,
"step": 143
},
{
"epoch": 0.05223068552774755,
"grad_norm": 0.09370779246091843,
"learning_rate": 0.00019884543540263247,
"loss": 0.2717,
"step": 144
},
{
"epoch": 0.05259339862169024,
"grad_norm": 0.10462846606969833,
"learning_rate": 0.00019882804237803488,
"loss": 0.2923,
"step": 145
},
{
"epoch": 0.052956111715632935,
"grad_norm": 0.11297117918729782,
"learning_rate": 0.00019881052009390737,
"loss": 0.3037,
"step": 146
},
{
"epoch": 0.05331882480957562,
"grad_norm": 0.11037133634090424,
"learning_rate": 0.00019879286857316783,
"loss": 0.2883,
"step": 147
},
{
"epoch": 0.05368153790351832,
"grad_norm": 0.10279864072799683,
"learning_rate": 0.00019877508783890306,
"loss": 0.2847,
"step": 148
},
{
"epoch": 0.05404425099746101,
"grad_norm": 0.09439583867788315,
"learning_rate": 0.00019875717791436896,
"loss": 0.2779,
"step": 149
},
{
"epoch": 0.0544069640914037,
"grad_norm": 0.10622645914554596,
"learning_rate": 0.00019873913882299026,
"loss": 0.3099,
"step": 150
},
{
"epoch": 0.054769677185346394,
"grad_norm": 0.10882750153541565,
"learning_rate": 0.00019872097058836076,
"loss": 0.2659,
"step": 151
},
{
"epoch": 0.05513239027928908,
"grad_norm": 0.09320899844169617,
"learning_rate": 0.00019870267323424313,
"loss": 0.268,
"step": 152
},
{
"epoch": 0.055495103373231776,
"grad_norm": 0.09685231000185013,
"learning_rate": 0.00019868424678456888,
"loss": 0.2745,
"step": 153
},
{
"epoch": 0.05585781646717446,
"grad_norm": 0.10234569013118744,
"learning_rate": 0.00019866569126343844,
"loss": 0.2948,
"step": 154
},
{
"epoch": 0.05622052956111716,
"grad_norm": 0.09876774251461029,
"learning_rate": 0.00019864700669512098,
"loss": 0.2808,
"step": 155
},
{
"epoch": 0.056583242655059846,
"grad_norm": 0.10879123955965042,
"learning_rate": 0.00019862819310405449,
"loss": 0.2745,
"step": 156
},
{
"epoch": 0.05694595574900254,
"grad_norm": 0.10035258531570435,
"learning_rate": 0.00019860925051484572,
"loss": 0.3027,
"step": 157
},
{
"epoch": 0.05730866884294523,
"grad_norm": 0.098017618060112,
"learning_rate": 0.00019859017895227014,
"loss": 0.2844,
"step": 158
},
{
"epoch": 0.05767138193688792,
"grad_norm": 0.09496638178825378,
"learning_rate": 0.00019857097844127187,
"loss": 0.2852,
"step": 159
},
{
"epoch": 0.05803409503083061,
"grad_norm": 0.10773288458585739,
"learning_rate": 0.00019855164900696375,
"loss": 0.3112,
"step": 160
},
{
"epoch": 0.058396808124773304,
"grad_norm": 0.09997101873159409,
"learning_rate": 0.00019853219067462717,
"loss": 0.2913,
"step": 161
},
{
"epoch": 0.058759521218716,
"grad_norm": 0.09856441617012024,
"learning_rate": 0.00019851260346971214,
"loss": 0.2753,
"step": 162
},
{
"epoch": 0.059122234312658686,
"grad_norm": 0.10671742260456085,
"learning_rate": 0.00019849288741783728,
"loss": 0.2958,
"step": 163
},
{
"epoch": 0.05948494740660138,
"grad_norm": 0.10415424406528473,
"learning_rate": 0.0001984730425447896,
"loss": 0.284,
"step": 164
},
{
"epoch": 0.05984766050054407,
"grad_norm": 0.10045934468507767,
"learning_rate": 0.00019845306887652476,
"loss": 0.281,
"step": 165
},
{
"epoch": 0.06021037359448676,
"grad_norm": 0.10365572571754456,
"learning_rate": 0.0001984329664391667,
"loss": 0.3186,
"step": 166
},
{
"epoch": 0.06057308668842945,
"grad_norm": 0.10675114393234253,
"learning_rate": 0.00019841273525900794,
"loss": 0.2774,
"step": 167
},
{
"epoch": 0.060935799782372145,
"grad_norm": 0.100840725004673,
"learning_rate": 0.0001983923753625093,
"loss": 0.2723,
"step": 168
},
{
"epoch": 0.06129851287631483,
"grad_norm": 0.09524688124656677,
"learning_rate": 0.0001983718867763,
"loss": 0.2679,
"step": 169
},
{
"epoch": 0.06166122597025753,
"grad_norm": 0.10454592853784561,
"learning_rate": 0.0001983512695271775,
"loss": 0.2779,
"step": 170
},
{
"epoch": 0.062023939064200215,
"grad_norm": 0.11385498940944672,
"learning_rate": 0.00019833052364210757,
"loss": 0.2892,
"step": 171
},
{
"epoch": 0.06238665215814291,
"grad_norm": 0.10297231376171112,
"learning_rate": 0.00019830964914822433,
"loss": 0.2885,
"step": 172
},
{
"epoch": 0.0627493652520856,
"grad_norm": 0.10694777965545654,
"learning_rate": 0.00019828864607282994,
"loss": 0.2951,
"step": 173
},
{
"epoch": 0.06311207834602829,
"grad_norm": 0.10187729448080063,
"learning_rate": 0.00019826751444339483,
"loss": 0.267,
"step": 174
},
{
"epoch": 0.06347479143997098,
"grad_norm": 0.10256768018007278,
"learning_rate": 0.0001982462542875576,
"loss": 0.2812,
"step": 175
},
{
"epoch": 0.06383750453391368,
"grad_norm": 0.106157086789608,
"learning_rate": 0.0001982248656331249,
"loss": 0.2617,
"step": 176
},
{
"epoch": 0.06420021762785637,
"grad_norm": 0.10591990500688553,
"learning_rate": 0.00019820334850807143,
"loss": 0.2792,
"step": 177
},
{
"epoch": 0.06456293072179906,
"grad_norm": 0.10539959371089935,
"learning_rate": 0.00019818170294053994,
"loss": 0.2817,
"step": 178
},
{
"epoch": 0.06492564381574174,
"grad_norm": 0.10033068805932999,
"learning_rate": 0.00019815992895884122,
"loss": 0.2917,
"step": 179
},
{
"epoch": 0.06528835690968444,
"grad_norm": 0.11100872606039047,
"learning_rate": 0.00019813802659145394,
"loss": 0.276,
"step": 180
},
{
"epoch": 0.06565107000362713,
"grad_norm": 0.10445630550384521,
"learning_rate": 0.0001981159958670247,
"loss": 0.3308,
"step": 181
},
{
"epoch": 0.06601378309756982,
"grad_norm": 0.09888961911201477,
"learning_rate": 0.00019809383681436809,
"loss": 0.2651,
"step": 182
},
{
"epoch": 0.06637649619151251,
"grad_norm": 0.10630346089601517,
"learning_rate": 0.00019807154946246635,
"loss": 0.2674,
"step": 183
},
{
"epoch": 0.06673920928545521,
"grad_norm": 0.09556199610233307,
"learning_rate": 0.00019804913384046974,
"loss": 0.2988,
"step": 184
},
{
"epoch": 0.0671019223793979,
"grad_norm": 0.10325701534748077,
"learning_rate": 0.0001980265899776961,
"loss": 0.2821,
"step": 185
},
{
"epoch": 0.06746463547334058,
"grad_norm": 0.09466871619224548,
"learning_rate": 0.00019800391790363112,
"loss": 0.2632,
"step": 186
},
{
"epoch": 0.06782734856728329,
"grad_norm": 0.09646070003509521,
"learning_rate": 0.00019798111764792814,
"loss": 0.2888,
"step": 187
},
{
"epoch": 0.06819006166122597,
"grad_norm": 0.09636171907186508,
"learning_rate": 0.00019795818924040815,
"loss": 0.2766,
"step": 188
},
{
"epoch": 0.06855277475516866,
"grad_norm": 0.10880020260810852,
"learning_rate": 0.00019793513271105975,
"loss": 0.3053,
"step": 189
},
{
"epoch": 0.06891548784911135,
"grad_norm": 0.11933793127536774,
"learning_rate": 0.0001979119480900391,
"loss": 0.2903,
"step": 190
},
{
"epoch": 0.06927820094305405,
"grad_norm": 0.1342136114835739,
"learning_rate": 0.00019788863540766996,
"loss": 0.2912,
"step": 191
},
{
"epoch": 0.06964091403699674,
"grad_norm": 0.1037123054265976,
"learning_rate": 0.0001978651946944435,
"loss": 0.3044,
"step": 192
},
{
"epoch": 0.07000362713093942,
"grad_norm": 0.11920095235109329,
"learning_rate": 0.00019784162598101838,
"loss": 0.2859,
"step": 193
},
{
"epoch": 0.07036634022488211,
"grad_norm": 0.11973892152309418,
"learning_rate": 0.00019781792929822068,
"loss": 0.2959,
"step": 194
},
{
"epoch": 0.07072905331882481,
"grad_norm": 0.11078456044197083,
"learning_rate": 0.00019779410467704389,
"loss": 0.2769,
"step": 195
},
{
"epoch": 0.0710917664127675,
"grad_norm": 0.11091899126768112,
"learning_rate": 0.00019777015214864877,
"loss": 0.2832,
"step": 196
},
{
"epoch": 0.07145447950671019,
"grad_norm": 0.09678234905004501,
"learning_rate": 0.00019774607174436338,
"loss": 0.2455,
"step": 197
},
{
"epoch": 0.07181719260065289,
"grad_norm": 0.11300257593393326,
"learning_rate": 0.00019772186349568304,
"loss": 0.3242,
"step": 198
},
{
"epoch": 0.07217990569459558,
"grad_norm": 0.1536862999200821,
"learning_rate": 0.00019769752743427032,
"loss": 0.2901,
"step": 199
},
{
"epoch": 0.07254261878853827,
"grad_norm": 0.10081265866756439,
"learning_rate": 0.00019767306359195493,
"loss": 0.3059,
"step": 200
},
{
"epoch": 0.07290533188248095,
"grad_norm": 0.10079798847436905,
"learning_rate": 0.0001976484720007337,
"loss": 0.2871,
"step": 201
},
{
"epoch": 0.07326804497642365,
"grad_norm": 0.09981225430965424,
"learning_rate": 0.00019762375269277054,
"loss": 0.2713,
"step": 202
},
{
"epoch": 0.07363075807036634,
"grad_norm": 0.10104259103536606,
"learning_rate": 0.00019759890570039644,
"loss": 0.3178,
"step": 203
},
{
"epoch": 0.07399347116430903,
"grad_norm": 0.10694817453622818,
"learning_rate": 0.00019757393105610934,
"loss": 0.2725,
"step": 204
},
{
"epoch": 0.07435618425825172,
"grad_norm": 0.10432042181491852,
"learning_rate": 0.0001975488287925742,
"loss": 0.2798,
"step": 205
},
{
"epoch": 0.07471889735219442,
"grad_norm": 0.11903175711631775,
"learning_rate": 0.00019752359894262283,
"loss": 0.3138,
"step": 206
},
{
"epoch": 0.0750816104461371,
"grad_norm": 0.10495443642139435,
"learning_rate": 0.00019749824153925396,
"loss": 0.2764,
"step": 207
},
{
"epoch": 0.0754443235400798,
"grad_norm": 0.10551683604717255,
"learning_rate": 0.00019747275661563312,
"loss": 0.2884,
"step": 208
},
{
"epoch": 0.07580703663402248,
"grad_norm": 0.12931138277053833,
"learning_rate": 0.00019744714420509273,
"loss": 0.2843,
"step": 209
},
{
"epoch": 0.07616974972796518,
"grad_norm": 0.10500820726156235,
"learning_rate": 0.0001974214043411317,
"loss": 0.298,
"step": 210
},
{
"epoch": 0.07653246282190787,
"grad_norm": 0.10469575226306915,
"learning_rate": 0.000197395537057416,
"loss": 0.2775,
"step": 211
},
{
"epoch": 0.07689517591585056,
"grad_norm": 0.11616349220275879,
"learning_rate": 0.00019736954238777792,
"loss": 0.2868,
"step": 212
},
{
"epoch": 0.07725788900979326,
"grad_norm": 0.10852184146642685,
"learning_rate": 0.00019734342036621652,
"loss": 0.2634,
"step": 213
},
{
"epoch": 0.07762060210373595,
"grad_norm": 0.11353151500225067,
"learning_rate": 0.00019731717102689747,
"loss": 0.2988,
"step": 214
},
{
"epoch": 0.07798331519767863,
"grad_norm": 0.10728183388710022,
"learning_rate": 0.00019729079440415287,
"loss": 0.273,
"step": 215
},
{
"epoch": 0.07834602829162132,
"grad_norm": 0.11151303350925446,
"learning_rate": 0.0001972642905324813,
"loss": 0.282,
"step": 216
},
{
"epoch": 0.07870874138556402,
"grad_norm": 0.1237482950091362,
"learning_rate": 0.00019723765944654783,
"loss": 0.2744,
"step": 217
},
{
"epoch": 0.07907145447950671,
"grad_norm": 0.10815929621458054,
"learning_rate": 0.0001972109011811839,
"loss": 0.2893,
"step": 218
},
{
"epoch": 0.0794341675734494,
"grad_norm": 0.1144891083240509,
"learning_rate": 0.00019718401577138725,
"loss": 0.3018,
"step": 219
},
{
"epoch": 0.07979688066739209,
"grad_norm": 0.1146797463297844,
"learning_rate": 0.00019715700325232194,
"loss": 0.2759,
"step": 220
},
{
"epoch": 0.08015959376133479,
"grad_norm": 0.1100744977593422,
"learning_rate": 0.00019712986365931826,
"loss": 0.2824,
"step": 221
},
{
"epoch": 0.08052230685527748,
"grad_norm": 0.12042435258626938,
"learning_rate": 0.0001971025970278728,
"loss": 0.2683,
"step": 222
},
{
"epoch": 0.08088501994922016,
"grad_norm": 0.11394108831882477,
"learning_rate": 0.00019707520339364818,
"loss": 0.312,
"step": 223
},
{
"epoch": 0.08124773304316286,
"grad_norm": 0.10353437066078186,
"learning_rate": 0.00019704768279247317,
"loss": 0.2673,
"step": 224
},
{
"epoch": 0.08161044613710555,
"grad_norm": 0.0966782197356224,
"learning_rate": 0.00019702003526034264,
"loss": 0.2995,
"step": 225
},
{
"epoch": 0.08197315923104824,
"grad_norm": 0.11248703300952911,
"learning_rate": 0.00019699226083341742,
"loss": 0.2588,
"step": 226
},
{
"epoch": 0.08233587232499093,
"grad_norm": 0.10794703662395477,
"learning_rate": 0.00019696435954802438,
"loss": 0.2594,
"step": 227
},
{
"epoch": 0.08269858541893363,
"grad_norm": 0.1097991019487381,
"learning_rate": 0.0001969363314406562,
"loss": 0.2691,
"step": 228
},
{
"epoch": 0.08306129851287632,
"grad_norm": 0.10738769918680191,
"learning_rate": 0.00019690817654797161,
"loss": 0.2811,
"step": 229
},
{
"epoch": 0.083424011606819,
"grad_norm": 0.10677637159824371,
"learning_rate": 0.00019687989490679503,
"loss": 0.2864,
"step": 230
},
{
"epoch": 0.08378672470076169,
"grad_norm": 0.11440913379192352,
"learning_rate": 0.00019685148655411658,
"loss": 0.2961,
"step": 231
},
{
"epoch": 0.0841494377947044,
"grad_norm": 0.10899066925048828,
"learning_rate": 0.00019682295152709234,
"loss": 0.2852,
"step": 232
},
{
"epoch": 0.08451215088864708,
"grad_norm": 0.10460548102855682,
"learning_rate": 0.00019679428986304386,
"loss": 0.2954,
"step": 233
},
{
"epoch": 0.08487486398258977,
"grad_norm": 0.12301474809646606,
"learning_rate": 0.00019676550159945845,
"loss": 0.263,
"step": 234
},
{
"epoch": 0.08523757707653247,
"grad_norm": 0.11282453685998917,
"learning_rate": 0.000196736586773989,
"loss": 0.3135,
"step": 235
},
{
"epoch": 0.08560029017047516,
"grad_norm": 0.11679442226886749,
"learning_rate": 0.0001967075454244538,
"loss": 0.287,
"step": 236
},
{
"epoch": 0.08596300326441784,
"grad_norm": 0.11096673458814621,
"learning_rate": 0.0001966783775888368,
"loss": 0.295,
"step": 237
},
{
"epoch": 0.08632571635836053,
"grad_norm": 0.1101219430565834,
"learning_rate": 0.00019664908330528725,
"loss": 0.2694,
"step": 238
},
{
"epoch": 0.08668842945230323,
"grad_norm": 0.10985169559717178,
"learning_rate": 0.00019661966261211983,
"loss": 0.2734,
"step": 239
},
{
"epoch": 0.08705114254624592,
"grad_norm": 0.11106691509485245,
"learning_rate": 0.0001965901155478146,
"loss": 0.2781,
"step": 240
},
{
"epoch": 0.08741385564018861,
"grad_norm": 0.1100887879729271,
"learning_rate": 0.00019656044215101684,
"loss": 0.3105,
"step": 241
},
{
"epoch": 0.0877765687341313,
"grad_norm": 0.11487387865781784,
"learning_rate": 0.00019653064246053707,
"loss": 0.2824,
"step": 242
},
{
"epoch": 0.088139281828074,
"grad_norm": 0.10977080464363098,
"learning_rate": 0.00019650071651535104,
"loss": 0.3309,
"step": 243
},
{
"epoch": 0.08850199492201669,
"grad_norm": 0.11280547827482224,
"learning_rate": 0.0001964706643545996,
"loss": 0.2698,
"step": 244
},
{
"epoch": 0.08886470801595937,
"grad_norm": 0.10025591403245926,
"learning_rate": 0.00019644048601758865,
"loss": 0.2623,
"step": 245
},
{
"epoch": 0.08922742110990206,
"grad_norm": 0.10023844242095947,
"learning_rate": 0.0001964101815437892,
"loss": 0.2711,
"step": 246
},
{
"epoch": 0.08959013420384476,
"grad_norm": 0.1235634833574295,
"learning_rate": 0.0001963797509728371,
"loss": 0.2884,
"step": 247
},
{
"epoch": 0.08995284729778745,
"grad_norm": 0.10354435443878174,
"learning_rate": 0.0001963491943445333,
"loss": 0.2601,
"step": 248
},
{
"epoch": 0.09031556039173014,
"grad_norm": 0.10399331152439117,
"learning_rate": 0.00019631851169884352,
"loss": 0.2817,
"step": 249
},
{
"epoch": 0.09067827348567284,
"grad_norm": 0.11649379879236221,
"learning_rate": 0.00019628770307589827,
"loss": 0.3344,
"step": 250
},
{
"epoch": 0.09104098657961553,
"grad_norm": 0.1313096284866333,
"learning_rate": 0.00019625676851599288,
"loss": 0.326,
"step": 251
},
{
"epoch": 0.09140369967355821,
"grad_norm": 0.11555227637290955,
"learning_rate": 0.00019622570805958746,
"loss": 0.2687,
"step": 252
},
{
"epoch": 0.0917664127675009,
"grad_norm": 0.1436738669872284,
"learning_rate": 0.00019619452174730667,
"loss": 0.2748,
"step": 253
},
{
"epoch": 0.0921291258614436,
"grad_norm": 0.11013220995664597,
"learning_rate": 0.0001961632096199398,
"loss": 0.2556,
"step": 254
},
{
"epoch": 0.09249183895538629,
"grad_norm": 0.11054322123527527,
"learning_rate": 0.00019613177171844075,
"loss": 0.2813,
"step": 255
},
{
"epoch": 0.09285455204932898,
"grad_norm": 0.10872920602560043,
"learning_rate": 0.00019610020808392788,
"loss": 0.3022,
"step": 256
},
{
"epoch": 0.09321726514327167,
"grad_norm": 0.12032327055931091,
"learning_rate": 0.000196068518757684,
"loss": 0.2836,
"step": 257
},
{
"epoch": 0.09357997823721437,
"grad_norm": 0.10551446676254272,
"learning_rate": 0.0001960367037811564,
"loss": 0.281,
"step": 258
},
{
"epoch": 0.09394269133115705,
"grad_norm": 0.11461377888917923,
"learning_rate": 0.00019600476319595658,
"loss": 0.2841,
"step": 259
},
{
"epoch": 0.09430540442509974,
"grad_norm": 0.11937367916107178,
"learning_rate": 0.00019597269704386036,
"loss": 0.2695,
"step": 260
},
{
"epoch": 0.09466811751904244,
"grad_norm": 0.109502412378788,
"learning_rate": 0.0001959405053668079,
"loss": 0.2796,
"step": 261
},
{
"epoch": 0.09503083061298513,
"grad_norm": 0.12356701493263245,
"learning_rate": 0.00019590818820690336,
"loss": 0.2963,
"step": 262
},
{
"epoch": 0.09539354370692782,
"grad_norm": 0.1127593144774437,
"learning_rate": 0.00019587574560641518,
"loss": 0.2646,
"step": 263
},
{
"epoch": 0.0957562568008705,
"grad_norm": 0.13234767317771912,
"learning_rate": 0.00019584317760777578,
"loss": 0.2816,
"step": 264
},
{
"epoch": 0.09611896989481321,
"grad_norm": 0.10984192788600922,
"learning_rate": 0.00019581048425358158,
"loss": 0.3069,
"step": 265
},
{
"epoch": 0.0964816829887559,
"grad_norm": 0.1149398684501648,
"learning_rate": 0.00019577766558659306,
"loss": 0.2574,
"step": 266
},
{
"epoch": 0.09684439608269858,
"grad_norm": 0.10994721949100494,
"learning_rate": 0.00019574472164973452,
"loss": 0.2705,
"step": 267
},
{
"epoch": 0.09720710917664127,
"grad_norm": 0.10396052896976471,
"learning_rate": 0.00019571165248609407,
"loss": 0.2343,
"step": 268
},
{
"epoch": 0.09756982227058397,
"grad_norm": 0.1382754147052765,
"learning_rate": 0.00019567845813892368,
"loss": 0.2586,
"step": 269
},
{
"epoch": 0.09793253536452666,
"grad_norm": 0.10811847448348999,
"learning_rate": 0.000195645138651639,
"loss": 0.2599,
"step": 270
},
{
"epoch": 0.09829524845846935,
"grad_norm": 0.12254346907138824,
"learning_rate": 0.00019561169406781938,
"loss": 0.2543,
"step": 271
},
{
"epoch": 0.09865796155241205,
"grad_norm": 0.10719288885593414,
"learning_rate": 0.00019557812443120779,
"loss": 0.2788,
"step": 272
},
{
"epoch": 0.09902067464635474,
"grad_norm": 0.11490897834300995,
"learning_rate": 0.00019554442978571076,
"loss": 0.3076,
"step": 273
},
{
"epoch": 0.09938338774029742,
"grad_norm": 0.11272160708904266,
"learning_rate": 0.00019551061017539828,
"loss": 0.2719,
"step": 274
},
{
"epoch": 0.09974610083424011,
"grad_norm": 0.11950589716434479,
"learning_rate": 0.00019547666564450383,
"loss": 0.2424,
"step": 275
},
{
"epoch": 0.10010881392818281,
"grad_norm": 0.10737808048725128,
"learning_rate": 0.00019544259623742428,
"loss": 0.2628,
"step": 276
},
{
"epoch": 0.1004715270221255,
"grad_norm": 0.10422177612781525,
"learning_rate": 0.00019540840199871982,
"loss": 0.2515,
"step": 277
},
{
"epoch": 0.10083424011606819,
"grad_norm": 0.12654827535152435,
"learning_rate": 0.00019537408297311384,
"loss": 0.3258,
"step": 278
},
{
"epoch": 0.10119695321001088,
"grad_norm": 0.10753121972084045,
"learning_rate": 0.00019533963920549306,
"loss": 0.2633,
"step": 279
},
{
"epoch": 0.10155966630395358,
"grad_norm": 0.1134246215224266,
"learning_rate": 0.0001953050707409073,
"loss": 0.2777,
"step": 280
},
{
"epoch": 0.10192237939789626,
"grad_norm": 0.11118260025978088,
"learning_rate": 0.00019527037762456944,
"loss": 0.2684,
"step": 281
},
{
"epoch": 0.10228509249183895,
"grad_norm": 0.12425535172224045,
"learning_rate": 0.0001952355599018554,
"loss": 0.28,
"step": 282
},
{
"epoch": 0.10264780558578164,
"grad_norm": 0.12097672373056412,
"learning_rate": 0.00019520061761830424,
"loss": 0.2589,
"step": 283
},
{
"epoch": 0.10301051867972434,
"grad_norm": 0.11388805508613586,
"learning_rate": 0.00019516555081961764,
"loss": 0.2864,
"step": 284
},
{
"epoch": 0.10337323177366703,
"grad_norm": 0.10794699192047119,
"learning_rate": 0.00019513035955166035,
"loss": 0.2754,
"step": 285
},
{
"epoch": 0.10373594486760972,
"grad_norm": 0.10783129185438156,
"learning_rate": 0.00019509504386045986,
"loss": 0.252,
"step": 286
},
{
"epoch": 0.10409865796155242,
"grad_norm": 0.12570741772651672,
"learning_rate": 0.0001950596037922064,
"loss": 0.2563,
"step": 287
},
{
"epoch": 0.1044613710554951,
"grad_norm": 0.12100599706172943,
"learning_rate": 0.0001950240393932529,
"loss": 0.2811,
"step": 288
},
{
"epoch": 0.1048240841494378,
"grad_norm": 0.09901045262813568,
"learning_rate": 0.0001949883507101148,
"loss": 0.2724,
"step": 289
},
{
"epoch": 0.10518679724338048,
"grad_norm": 0.10405360162258148,
"learning_rate": 0.00019495253778947026,
"loss": 0.274,
"step": 290
},
{
"epoch": 0.10554951033732318,
"grad_norm": 0.11303572356700897,
"learning_rate": 0.0001949166006781598,
"loss": 0.2669,
"step": 291
},
{
"epoch": 0.10591222343126587,
"grad_norm": 0.1083337813615799,
"learning_rate": 0.0001948805394231864,
"loss": 0.2865,
"step": 292
},
{
"epoch": 0.10627493652520856,
"grad_norm": 0.10910173505544662,
"learning_rate": 0.00019484435407171545,
"loss": 0.2651,
"step": 293
},
{
"epoch": 0.10663764961915125,
"grad_norm": 0.10337372124195099,
"learning_rate": 0.00019480804467107463,
"loss": 0.2509,
"step": 294
},
{
"epoch": 0.10700036271309395,
"grad_norm": 0.1112721636891365,
"learning_rate": 0.00019477161126875387,
"loss": 0.2666,
"step": 295
},
{
"epoch": 0.10736307580703663,
"grad_norm": 0.11390243470668793,
"learning_rate": 0.00019473505391240522,
"loss": 0.278,
"step": 296
},
{
"epoch": 0.10772578890097932,
"grad_norm": 0.11081282794475555,
"learning_rate": 0.000194698372649843,
"loss": 0.2725,
"step": 297
},
{
"epoch": 0.10808850199492202,
"grad_norm": 0.12400209158658981,
"learning_rate": 0.00019466156752904343,
"loss": 0.2812,
"step": 298
},
{
"epoch": 0.10845121508886471,
"grad_norm": 0.11567061394453049,
"learning_rate": 0.0001946246385981448,
"loss": 0.2907,
"step": 299
},
{
"epoch": 0.1088139281828074,
"grad_norm": 0.11256127059459686,
"learning_rate": 0.0001945875859054474,
"loss": 0.2537,
"step": 300
},
{
"epoch": 0.10917664127675009,
"grad_norm": 0.12261880189180374,
"learning_rate": 0.0001945504094994132,
"loss": 0.2726,
"step": 301
},
{
"epoch": 0.10953935437069279,
"grad_norm": 0.10978831350803375,
"learning_rate": 0.00019451310942866621,
"loss": 0.2578,
"step": 302
},
{
"epoch": 0.10990206746463548,
"grad_norm": 0.12203028053045273,
"learning_rate": 0.00019447568574199202,
"loss": 0.2685,
"step": 303
},
{
"epoch": 0.11026478055857816,
"grad_norm": 0.11995328217744827,
"learning_rate": 0.000194438138488338,
"loss": 0.2914,
"step": 304
},
{
"epoch": 0.11062749365252085,
"grad_norm": 0.1177087351679802,
"learning_rate": 0.000194400467716813,
"loss": 0.2576,
"step": 305
},
{
"epoch": 0.11099020674646355,
"grad_norm": 0.11549436300992966,
"learning_rate": 0.00019436267347668757,
"loss": 0.2789,
"step": 306
},
{
"epoch": 0.11135291984040624,
"grad_norm": 0.12319694459438324,
"learning_rate": 0.0001943247558173937,
"loss": 0.2676,
"step": 307
},
{
"epoch": 0.11171563293434893,
"grad_norm": 0.13126415014266968,
"learning_rate": 0.00019428671478852479,
"loss": 0.2612,
"step": 308
},
{
"epoch": 0.11207834602829161,
"grad_norm": 0.11185677349567413,
"learning_rate": 0.00019424855043983556,
"loss": 0.2607,
"step": 309
},
{
"epoch": 0.11244105912223432,
"grad_norm": 0.1092672273516655,
"learning_rate": 0.00019421026282124212,
"loss": 0.2521,
"step": 310
},
{
"epoch": 0.112803772216177,
"grad_norm": 0.12753579020500183,
"learning_rate": 0.00019417185198282168,
"loss": 0.2876,
"step": 311
},
{
"epoch": 0.11316648531011969,
"grad_norm": 0.11622543632984161,
"learning_rate": 0.00019413331797481277,
"loss": 0.2656,
"step": 312
},
{
"epoch": 0.11352919840406239,
"grad_norm": 0.11567405611276627,
"learning_rate": 0.00019409466084761485,
"loss": 0.2836,
"step": 313
},
{
"epoch": 0.11389191149800508,
"grad_norm": 0.11441784352064133,
"learning_rate": 0.00019405588065178852,
"loss": 0.2523,
"step": 314
},
{
"epoch": 0.11425462459194777,
"grad_norm": 0.11300231516361237,
"learning_rate": 0.0001940169774380553,
"loss": 0.2804,
"step": 315
},
{
"epoch": 0.11461733768589046,
"grad_norm": 0.12194045633077621,
"learning_rate": 0.00019397795125729767,
"loss": 0.2867,
"step": 316
},
{
"epoch": 0.11498005077983316,
"grad_norm": 0.12124588340520859,
"learning_rate": 0.00019393880216055887,
"loss": 0.2859,
"step": 317
},
{
"epoch": 0.11534276387377584,
"grad_norm": 0.11623072624206543,
"learning_rate": 0.00019389953019904285,
"loss": 0.288,
"step": 318
},
{
"epoch": 0.11570547696771853,
"grad_norm": 0.11297620832920074,
"learning_rate": 0.00019386013542411449,
"loss": 0.2896,
"step": 319
},
{
"epoch": 0.11606819006166122,
"grad_norm": 0.11987963318824768,
"learning_rate": 0.00019382061788729898,
"loss": 0.3479,
"step": 320
},
{
"epoch": 0.11643090315560392,
"grad_norm": 0.14857983589172363,
"learning_rate": 0.00019378097764028235,
"loss": 0.2519,
"step": 321
},
{
"epoch": 0.11679361624954661,
"grad_norm": 0.10684715956449509,
"learning_rate": 0.00019374121473491096,
"loss": 0.3014,
"step": 322
},
{
"epoch": 0.1171563293434893,
"grad_norm": 0.11060940474271774,
"learning_rate": 0.0001937013292231917,
"loss": 0.2522,
"step": 323
},
{
"epoch": 0.117519042437432,
"grad_norm": 0.10806398838758469,
"learning_rate": 0.00019366132115729173,
"loss": 0.2695,
"step": 324
},
{
"epoch": 0.11788175553137469,
"grad_norm": 0.11272536218166351,
"learning_rate": 0.0001936211905895386,
"loss": 0.2666,
"step": 325
},
{
"epoch": 0.11824446862531737,
"grad_norm": 0.11766637116670609,
"learning_rate": 0.00019358093757241996,
"loss": 0.3007,
"step": 326
},
{
"epoch": 0.11860718171926006,
"grad_norm": 0.1170196607708931,
"learning_rate": 0.0001935405621585837,
"loss": 0.2678,
"step": 327
},
{
"epoch": 0.11896989481320276,
"grad_norm": 0.12220901250839233,
"learning_rate": 0.0001935000644008378,
"loss": 0.2519,
"step": 328
},
{
"epoch": 0.11933260790714545,
"grad_norm": 0.1201847493648529,
"learning_rate": 0.00019345944435215023,
"loss": 0.267,
"step": 329
},
{
"epoch": 0.11969532100108814,
"grad_norm": 0.11570829898118973,
"learning_rate": 0.00019341870206564886,
"loss": 0.2515,
"step": 330
},
{
"epoch": 0.12005803409503082,
"grad_norm": 0.12002036720514297,
"learning_rate": 0.0001933778375946216,
"loss": 0.2767,
"step": 331
},
{
"epoch": 0.12042074718897353,
"grad_norm": 0.12402871996164322,
"learning_rate": 0.00019333685099251594,
"loss": 0.2508,
"step": 332
},
{
"epoch": 0.12078346028291621,
"grad_norm": 0.11982254683971405,
"learning_rate": 0.00019329574231293926,
"loss": 0.2802,
"step": 333
},
{
"epoch": 0.1211461733768589,
"grad_norm": 0.11482241749763489,
"learning_rate": 0.0001932545116096586,
"loss": 0.2774,
"step": 334
},
{
"epoch": 0.1215088864708016,
"grad_norm": 0.1279384046792984,
"learning_rate": 0.00019321315893660056,
"loss": 0.2718,
"step": 335
},
{
"epoch": 0.12187159956474429,
"grad_norm": 0.11594551056623459,
"learning_rate": 0.00019317168434785127,
"loss": 0.2771,
"step": 336
},
{
"epoch": 0.12223431265868698,
"grad_norm": 0.1129961609840393,
"learning_rate": 0.0001931300878976563,
"loss": 0.2602,
"step": 337
},
{
"epoch": 0.12259702575262967,
"grad_norm": 0.11392521858215332,
"learning_rate": 0.0001930883696404207,
"loss": 0.2595,
"step": 338
},
{
"epoch": 0.12295973884657237,
"grad_norm": 0.10742796212434769,
"learning_rate": 0.0001930465296307087,
"loss": 0.2473,
"step": 339
},
{
"epoch": 0.12332245194051505,
"grad_norm": 0.11807534843683243,
"learning_rate": 0.00019300456792324382,
"loss": 0.2374,
"step": 340
},
{
"epoch": 0.12368516503445774,
"grad_norm": 0.13207505643367767,
"learning_rate": 0.00019296248457290882,
"loss": 0.2732,
"step": 341
},
{
"epoch": 0.12404787812840043,
"grad_norm": 0.13366468250751495,
"learning_rate": 0.00019292027963474547,
"loss": 0.2702,
"step": 342
},
{
"epoch": 0.12441059122234313,
"grad_norm": 0.1288871318101883,
"learning_rate": 0.00019287795316395468,
"loss": 0.2667,
"step": 343
},
{
"epoch": 0.12477330431628582,
"grad_norm": 0.11883368343114853,
"learning_rate": 0.00019283550521589614,
"loss": 0.2666,
"step": 344
},
{
"epoch": 0.1251360174102285,
"grad_norm": 0.1264144480228424,
"learning_rate": 0.00019279293584608856,
"loss": 0.2795,
"step": 345
},
{
"epoch": 0.1254987305041712,
"grad_norm": 0.12721741199493408,
"learning_rate": 0.0001927502451102095,
"loss": 0.2516,
"step": 346
},
{
"epoch": 0.12586144359811388,
"grad_norm": 0.1189354807138443,
"learning_rate": 0.00019270743306409505,
"loss": 0.2489,
"step": 347
},
{
"epoch": 0.12622415669205658,
"grad_norm": 0.12466361373662949,
"learning_rate": 0.00019266449976374018,
"loss": 0.2856,
"step": 348
},
{
"epoch": 0.12658686978599928,
"grad_norm": 0.13144852221012115,
"learning_rate": 0.00019262144526529832,
"loss": 0.2612,
"step": 349
},
{
"epoch": 0.12694958287994196,
"grad_norm": 0.10754833370447159,
"learning_rate": 0.0001925782696250815,
"loss": 0.2523,
"step": 350
},
{
"epoch": 0.12731229597388466,
"grad_norm": 0.1237715408205986,
"learning_rate": 0.0001925349728995602,
"loss": 0.2526,
"step": 351
},
{
"epoch": 0.12767500906782736,
"grad_norm": 0.1193939596414566,
"learning_rate": 0.00019249155514536312,
"loss": 0.2819,
"step": 352
},
{
"epoch": 0.12803772216177003,
"grad_norm": 0.12648704648017883,
"learning_rate": 0.00019244801641927746,
"loss": 0.2709,
"step": 353
},
{
"epoch": 0.12840043525571274,
"grad_norm": 0.11707579344511032,
"learning_rate": 0.0001924043567782485,
"loss": 0.2853,
"step": 354
},
{
"epoch": 0.1287631483496554,
"grad_norm": 0.12175849080085754,
"learning_rate": 0.00019236057627937975,
"loss": 0.2702,
"step": 355
},
{
"epoch": 0.1291258614435981,
"grad_norm": 0.1120310127735138,
"learning_rate": 0.0001923166749799327,
"loss": 0.2596,
"step": 356
},
{
"epoch": 0.1294885745375408,
"grad_norm": 0.12282121926546097,
"learning_rate": 0.00019227265293732693,
"loss": 0.2581,
"step": 357
},
{
"epoch": 0.1298512876314835,
"grad_norm": 0.13752269744873047,
"learning_rate": 0.00019222851020913995,
"loss": 0.2641,
"step": 358
},
{
"epoch": 0.1302140007254262,
"grad_norm": 0.11744178086519241,
"learning_rate": 0.00019218424685310702,
"loss": 0.2462,
"step": 359
},
{
"epoch": 0.1305767138193689,
"grad_norm": 0.11440069228410721,
"learning_rate": 0.00019213986292712125,
"loss": 0.2495,
"step": 360
},
{
"epoch": 0.13093942691331156,
"grad_norm": 0.11646847426891327,
"learning_rate": 0.00019209535848923343,
"loss": 0.3054,
"step": 361
},
{
"epoch": 0.13130214000725426,
"grad_norm": 0.11386696994304657,
"learning_rate": 0.00019205073359765192,
"loss": 0.2503,
"step": 362
},
{
"epoch": 0.13166485310119697,
"grad_norm": 0.12510043382644653,
"learning_rate": 0.00019200598831074274,
"loss": 0.275,
"step": 363
},
{
"epoch": 0.13202756619513964,
"grad_norm": 0.12363200634717941,
"learning_rate": 0.00019196112268702925,
"loss": 0.2746,
"step": 364
},
{
"epoch": 0.13239027928908234,
"grad_norm": 0.11029732972383499,
"learning_rate": 0.0001919161367851923,
"loss": 0.3095,
"step": 365
},
{
"epoch": 0.13275299238302501,
"grad_norm": 0.12199590355157852,
"learning_rate": 0.00019187103066406998,
"loss": 0.2641,
"step": 366
},
{
"epoch": 0.13311570547696772,
"grad_norm": 0.11692757904529572,
"learning_rate": 0.00019182580438265764,
"loss": 0.2646,
"step": 367
},
{
"epoch": 0.13347841857091042,
"grad_norm": 0.11142277717590332,
"learning_rate": 0.00019178045800010787,
"loss": 0.2495,
"step": 368
},
{
"epoch": 0.1338411316648531,
"grad_norm": 0.11492447555065155,
"learning_rate": 0.00019173499157573023,
"loss": 0.2647,
"step": 369
},
{
"epoch": 0.1342038447587958,
"grad_norm": 0.114183709025383,
"learning_rate": 0.0001916894051689913,
"loss": 0.2499,
"step": 370
},
{
"epoch": 0.1345665578527385,
"grad_norm": 0.11262322962284088,
"learning_rate": 0.00019164369883951468,
"loss": 0.2749,
"step": 371
},
{
"epoch": 0.13492927094668117,
"grad_norm": 0.11667259782552719,
"learning_rate": 0.0001915978726470807,
"loss": 0.269,
"step": 372
},
{
"epoch": 0.13529198404062387,
"grad_norm": 0.1220724880695343,
"learning_rate": 0.00019155192665162656,
"loss": 0.2652,
"step": 373
},
{
"epoch": 0.13565469713456657,
"grad_norm": 0.12185841798782349,
"learning_rate": 0.0001915058609132461,
"loss": 0.2754,
"step": 374
},
{
"epoch": 0.13601741022850924,
"grad_norm": 0.11733336001634598,
"learning_rate": 0.00019145967549218974,
"loss": 0.2685,
"step": 375
},
{
"epoch": 0.13638012332245195,
"grad_norm": 0.12325771152973175,
"learning_rate": 0.00019141337044886457,
"loss": 0.2548,
"step": 376
},
{
"epoch": 0.13674283641639462,
"grad_norm": 0.11737928539514542,
"learning_rate": 0.000191366945843834,
"loss": 0.2875,
"step": 377
},
{
"epoch": 0.13710554951033732,
"grad_norm": 0.11719442158937454,
"learning_rate": 0.00019132040173781788,
"loss": 0.244,
"step": 378
},
{
"epoch": 0.13746826260428002,
"grad_norm": 0.1146400049328804,
"learning_rate": 0.0001912737381916923,
"loss": 0.2595,
"step": 379
},
{
"epoch": 0.1378309756982227,
"grad_norm": 0.11577652394771576,
"learning_rate": 0.00019122695526648968,
"loss": 0.276,
"step": 380
},
{
"epoch": 0.1381936887921654,
"grad_norm": 0.10648276656866074,
"learning_rate": 0.00019118005302339847,
"loss": 0.2444,
"step": 381
},
{
"epoch": 0.1385564018861081,
"grad_norm": 0.10874751210212708,
"learning_rate": 0.00019113303152376324,
"loss": 0.2502,
"step": 382
},
{
"epoch": 0.13891911498005077,
"grad_norm": 0.1190841868519783,
"learning_rate": 0.00019108589082908453,
"loss": 0.2477,
"step": 383
},
{
"epoch": 0.13928182807399347,
"grad_norm": 0.11433839052915573,
"learning_rate": 0.00019103863100101873,
"loss": 0.2651,
"step": 384
},
{
"epoch": 0.13964454116793618,
"grad_norm": 0.1088482066988945,
"learning_rate": 0.00019099125210137813,
"loss": 0.2452,
"step": 385
},
{
"epoch": 0.14000725426187885,
"grad_norm": 0.115386962890625,
"learning_rate": 0.00019094375419213065,
"loss": 0.2579,
"step": 386
},
{
"epoch": 0.14036996735582155,
"grad_norm": 0.1259610801935196,
"learning_rate": 0.0001908961373354,
"loss": 0.2712,
"step": 387
},
{
"epoch": 0.14073268044976422,
"grad_norm": 4882568.5,
"learning_rate": 0.00019084840159346532,
"loss": 0.2385,
"step": 388
},
{
"epoch": 0.14109539354370693,
"grad_norm": 0.12656670808792114,
"learning_rate": 0.0001908005470287614,
"loss": 0.2406,
"step": 389
},
{
"epoch": 0.14145810663764963,
"grad_norm": 0.13908933103084564,
"learning_rate": 0.00019075257370387827,
"loss": 0.2433,
"step": 390
},
{
"epoch": 0.1418208197315923,
"grad_norm": 0.14672155678272247,
"learning_rate": 0.0001907044816815614,
"loss": 0.2544,
"step": 391
},
{
"epoch": 0.142183532825535,
"grad_norm": 0.15031826496124268,
"learning_rate": 0.0001906562710247115,
"loss": 0.2652,
"step": 392
},
{
"epoch": 0.1425462459194777,
"grad_norm": 0.13194704055786133,
"learning_rate": 0.00019060794179638445,
"loss": 0.2603,
"step": 393
},
{
"epoch": 0.14290895901342038,
"grad_norm": 0.13189998269081116,
"learning_rate": 0.0001905594940597911,
"loss": 0.2419,
"step": 394
},
{
"epoch": 0.14327167210736308,
"grad_norm": 0.1245296448469162,
"learning_rate": 0.00019051092787829746,
"loss": 0.2816,
"step": 395
},
{
"epoch": 0.14363438520130578,
"grad_norm": 0.14372986555099487,
"learning_rate": 0.0001904622433154244,
"loss": 0.261,
"step": 396
},
{
"epoch": 0.14399709829524845,
"grad_norm": 0.13385535776615143,
"learning_rate": 0.00019041344043484754,
"loss": 0.2702,
"step": 397
},
{
"epoch": 0.14435981138919116,
"grad_norm": 0.13935022056102753,
"learning_rate": 0.00019036451930039738,
"loss": 0.2907,
"step": 398
},
{
"epoch": 0.14472252448313383,
"grad_norm": 0.11567000299692154,
"learning_rate": 0.00019031547997605902,
"loss": 0.2618,
"step": 399
},
{
"epoch": 0.14508523757707653,
"grad_norm": 0.1412486582994461,
"learning_rate": 0.0001902663225259721,
"loss": 0.3055,
"step": 400
},
{
"epoch": 0.14544795067101923,
"grad_norm": 0.13404829800128937,
"learning_rate": 0.00019021704701443083,
"loss": 0.2565,
"step": 401
},
{
"epoch": 0.1458106637649619,
"grad_norm": 0.15074236690998077,
"learning_rate": 0.00019016765350588389,
"loss": 0.2737,
"step": 402
},
{
"epoch": 0.1461733768589046,
"grad_norm": 0.11905822902917862,
"learning_rate": 0.00019011814206493411,
"loss": 0.2462,
"step": 403
},
{
"epoch": 0.1465360899528473,
"grad_norm": 0.13609488308429718,
"learning_rate": 0.00019006851275633871,
"loss": 0.3008,
"step": 404
},
{
"epoch": 0.14689880304678998,
"grad_norm": 0.13262596726417542,
"learning_rate": 0.00019001876564500909,
"loss": 0.2682,
"step": 405
},
{
"epoch": 0.14726151614073268,
"grad_norm": 0.12421231716871262,
"learning_rate": 0.00018996890079601059,
"loss": 0.2553,
"step": 406
},
{
"epoch": 0.14762422923467536,
"grad_norm": 0.14463739097118378,
"learning_rate": 0.00018991891827456266,
"loss": 0.2483,
"step": 407
},
{
"epoch": 0.14798694232861806,
"grad_norm": 0.12037564069032669,
"learning_rate": 0.00018986881814603862,
"loss": 0.2807,
"step": 408
},
{
"epoch": 0.14834965542256076,
"grad_norm": 0.1340160369873047,
"learning_rate": 0.0001898186004759656,
"loss": 0.248,
"step": 409
},
{
"epoch": 0.14871236851650343,
"grad_norm": 0.13164542615413666,
"learning_rate": 0.0001897682653300245,
"loss": 0.2617,
"step": 410
},
{
"epoch": 0.14907508161044614,
"grad_norm": 0.12125716358423233,
"learning_rate": 0.0001897178127740498,
"loss": 0.249,
"step": 411
},
{
"epoch": 0.14943779470438884,
"grad_norm": 0.13088323175907135,
"learning_rate": 0.00018966724287402964,
"loss": 0.2855,
"step": 412
},
{
"epoch": 0.1498005077983315,
"grad_norm": 0.13843600451946259,
"learning_rate": 0.00018961655569610557,
"loss": 0.2613,
"step": 413
},
{
"epoch": 0.1501632208922742,
"grad_norm": 0.12319327145814896,
"learning_rate": 0.00018956575130657256,
"loss": 0.2675,
"step": 414
},
{
"epoch": 0.15052593398621691,
"grad_norm": 0.12738944590091705,
"learning_rate": 0.0001895148297718788,
"loss": 0.2492,
"step": 415
},
{
"epoch": 0.1508886470801596,
"grad_norm": 0.1370190680027008,
"learning_rate": 0.00018946379115862585,
"loss": 0.2565,
"step": 416
},
{
"epoch": 0.1512513601741023,
"grad_norm": 0.12752386927604675,
"learning_rate": 0.00018941263553356829,
"loss": 0.2752,
"step": 417
},
{
"epoch": 0.15161407326804496,
"grad_norm": 0.12467992305755615,
"learning_rate": 0.00018936136296361373,
"loss": 0.261,
"step": 418
},
{
"epoch": 0.15197678636198766,
"grad_norm": 0.12830005586147308,
"learning_rate": 0.00018930997351582286,
"loss": 0.2579,
"step": 419
},
{
"epoch": 0.15233949945593037,
"grad_norm": 0.1329096108675003,
"learning_rate": 0.00018925846725740907,
"loss": 0.2736,
"step": 420
},
{
"epoch": 0.15270221254987304,
"grad_norm": 0.12870270013809204,
"learning_rate": 0.00018920684425573865,
"loss": 0.2519,
"step": 421
},
{
"epoch": 0.15306492564381574,
"grad_norm": 0.1223597452044487,
"learning_rate": 0.00018915510457833055,
"loss": 0.2462,
"step": 422
},
{
"epoch": 0.15342763873775844,
"grad_norm": 0.13859276473522186,
"learning_rate": 0.0001891032482928563,
"loss": 0.2546,
"step": 423
},
{
"epoch": 0.15379035183170112,
"grad_norm": 0.12266798317432404,
"learning_rate": 0.00018905127546713996,
"loss": 0.2426,
"step": 424
},
{
"epoch": 0.15415306492564382,
"grad_norm": 0.1270112842321396,
"learning_rate": 0.00018899918616915802,
"loss": 0.2719,
"step": 425
},
{
"epoch": 0.15451577801958652,
"grad_norm": 0.12060489505529404,
"learning_rate": 0.0001889469804670393,
"loss": 0.2617,
"step": 426
},
{
"epoch": 0.1548784911135292,
"grad_norm": 0.1132146492600441,
"learning_rate": 0.00018889465842906488,
"loss": 0.2464,
"step": 427
},
{
"epoch": 0.1552412042074719,
"grad_norm": 0.12224707752466202,
"learning_rate": 0.00018884222012366796,
"loss": 0.2963,
"step": 428
},
{
"epoch": 0.15560391730141457,
"grad_norm": 0.11490823328495026,
"learning_rate": 0.00018878966561943386,
"loss": 0.2686,
"step": 429
},
{
"epoch": 0.15596663039535727,
"grad_norm": 0.16463352739810944,
"learning_rate": 0.00018873699498509988,
"loss": 0.2986,
"step": 430
},
{
"epoch": 0.15632934348929997,
"grad_norm": 0.12075062096118927,
"learning_rate": 0.00018868420828955514,
"loss": 0.2968,
"step": 431
},
{
"epoch": 0.15669205658324264,
"grad_norm": 0.1205056831240654,
"learning_rate": 0.00018863130560184063,
"loss": 0.2565,
"step": 432
},
{
"epoch": 0.15705476967718535,
"grad_norm": 0.1396438032388687,
"learning_rate": 0.00018857828699114904,
"loss": 0.2686,
"step": 433
},
{
"epoch": 0.15741748277112805,
"grad_norm": 0.11857564747333527,
"learning_rate": 0.0001885251525268246,
"loss": 0.2453,
"step": 434
},
{
"epoch": 0.15778019586507072,
"grad_norm": 0.12120261788368225,
"learning_rate": 0.0001884719022783632,
"loss": 0.2363,
"step": 435
},
{
"epoch": 0.15814290895901342,
"grad_norm": 0.1222701370716095,
"learning_rate": 0.00018841853631541207,
"loss": 0.2641,
"step": 436
},
{
"epoch": 0.15850562205295612,
"grad_norm": 0.12121476233005524,
"learning_rate": 0.00018836505470776983,
"loss": 0.2542,
"step": 437
},
{
"epoch": 0.1588683351468988,
"grad_norm": 0.12737686932086945,
"learning_rate": 0.0001883114575253863,
"loss": 0.2502,
"step": 438
},
{
"epoch": 0.1592310482408415,
"grad_norm": 0.12551474571228027,
"learning_rate": 0.00018825774483836248,
"loss": 0.2676,
"step": 439
},
{
"epoch": 0.15959376133478417,
"grad_norm": 0.12225164473056793,
"learning_rate": 0.00018820391671695057,
"loss": 0.2695,
"step": 440
},
{
"epoch": 0.15995647442872687,
"grad_norm": 0.12774313986301422,
"learning_rate": 0.00018814997323155357,
"loss": 0.2454,
"step": 441
},
{
"epoch": 0.16031918752266958,
"grad_norm": 0.12761445343494415,
"learning_rate": 0.0001880959144527254,
"loss": 0.2539,
"step": 442
},
{
"epoch": 0.16068190061661225,
"grad_norm": 0.11978595703840256,
"learning_rate": 0.00018804174045117087,
"loss": 0.2301,
"step": 443
},
{
"epoch": 0.16104461371055495,
"grad_norm": 0.12763962149620056,
"learning_rate": 0.00018798745129774543,
"loss": 0.2376,
"step": 444
},
{
"epoch": 0.16140732680449765,
"grad_norm": 0.13063186407089233,
"learning_rate": 0.00018793304706345515,
"loss": 0.2768,
"step": 445
},
{
"epoch": 0.16177003989844033,
"grad_norm": 0.11672946810722351,
"learning_rate": 0.00018787852781945656,
"loss": 0.246,
"step": 446
},
{
"epoch": 0.16213275299238303,
"grad_norm": 0.12725545465946198,
"learning_rate": 0.00018782389363705674,
"loss": 0.262,
"step": 447
},
{
"epoch": 0.16249546608632573,
"grad_norm": 0.1206207126379013,
"learning_rate": 0.00018776914458771296,
"loss": 0.2385,
"step": 448
},
{
"epoch": 0.1628581791802684,
"grad_norm": 0.11878547072410583,
"learning_rate": 0.00018771428074303286,
"loss": 0.2666,
"step": 449
},
{
"epoch": 0.1632208922742111,
"grad_norm": 0.12689107656478882,
"learning_rate": 0.0001876593021747741,
"loss": 0.2828,
"step": 450
},
{
"epoch": 0.16358360536815378,
"grad_norm": 0.11968659609556198,
"learning_rate": 0.00018760420895484446,
"loss": 0.2428,
"step": 451
},
{
"epoch": 0.16394631846209648,
"grad_norm": 0.13296844065189362,
"learning_rate": 0.0001875490011553017,
"loss": 0.2689,
"step": 452
},
{
"epoch": 0.16430903155603918,
"grad_norm": 0.13149085640907288,
"learning_rate": 0.00018749367884835337,
"loss": 0.259,
"step": 453
},
{
"epoch": 0.16467174464998185,
"grad_norm": 0.13679270446300507,
"learning_rate": 0.00018743824210635683,
"loss": 0.2604,
"step": 454
},
{
"epoch": 0.16503445774392456,
"grad_norm": 0.12205653637647629,
"learning_rate": 0.0001873826910018191,
"loss": 0.2557,
"step": 455
},
{
"epoch": 0.16539717083786726,
"grad_norm": 0.11403360217809677,
"learning_rate": 0.00018732702560739678,
"loss": 0.2596,
"step": 456
},
{
"epoch": 0.16575988393180993,
"grad_norm": 0.15047647058963776,
"learning_rate": 0.000187271245995896,
"loss": 0.2571,
"step": 457
},
{
"epoch": 0.16612259702575263,
"grad_norm": 0.12830372154712677,
"learning_rate": 0.00018721535224027212,
"loss": 0.256,
"step": 458
},
{
"epoch": 0.16648531011969533,
"grad_norm": 0.12144992500543594,
"learning_rate": 0.00018715934441363002,
"loss": 0.2488,
"step": 459
},
{
"epoch": 0.166848023213638,
"grad_norm": 0.128736212849617,
"learning_rate": 0.00018710322258922357,
"loss": 0.2541,
"step": 460
},
{
"epoch": 0.1672107363075807,
"grad_norm": 0.1277531534433365,
"learning_rate": 0.0001870469868404559,
"loss": 0.2609,
"step": 461
},
{
"epoch": 0.16757344940152338,
"grad_norm": 0.12313154339790344,
"learning_rate": 0.00018699063724087904,
"loss": 0.2547,
"step": 462
},
{
"epoch": 0.16793616249546608,
"grad_norm": 0.12278270721435547,
"learning_rate": 0.00018693417386419397,
"loss": 0.2509,
"step": 463
},
{
"epoch": 0.1682988755894088,
"grad_norm": 0.12022969871759415,
"learning_rate": 0.00018687759678425044,
"loss": 0.2384,
"step": 464
},
{
"epoch": 0.16866158868335146,
"grad_norm": 0.12230958789587021,
"learning_rate": 0.000186820906075047,
"loss": 0.2535,
"step": 465
},
{
"epoch": 0.16902430177729416,
"grad_norm": 0.13055519759655,
"learning_rate": 0.00018676410181073073,
"loss": 0.244,
"step": 466
},
{
"epoch": 0.16938701487123686,
"grad_norm": 0.12790988385677338,
"learning_rate": 0.0001867071840655973,
"loss": 0.2479,
"step": 467
},
{
"epoch": 0.16974972796517954,
"grad_norm": 0.13046807050704956,
"learning_rate": 0.00018665015291409077,
"loss": 0.2493,
"step": 468
},
{
"epoch": 0.17011244105912224,
"grad_norm": 0.1160719096660614,
"learning_rate": 0.00018659300843080348,
"loss": 0.2274,
"step": 469
},
{
"epoch": 0.17047515415306494,
"grad_norm": 0.1292848438024521,
"learning_rate": 0.00018653575069047608,
"loss": 0.258,
"step": 470
},
{
"epoch": 0.1708378672470076,
"grad_norm": 0.1197739690542221,
"learning_rate": 0.00018647837976799734,
"loss": 0.2276,
"step": 471
},
{
"epoch": 0.17120058034095031,
"grad_norm": 0.11929846554994583,
"learning_rate": 0.00018642089573840402,
"loss": 0.2617,
"step": 472
},
{
"epoch": 0.171563293434893,
"grad_norm": 0.12611514329910278,
"learning_rate": 0.00018636329867688085,
"loss": 0.2525,
"step": 473
},
{
"epoch": 0.1719260065288357,
"grad_norm": 0.1322082132101059,
"learning_rate": 0.0001863055886587604,
"loss": 0.2564,
"step": 474
},
{
"epoch": 0.1722887196227784,
"grad_norm": 0.1298658400774002,
"learning_rate": 0.0001862477657595229,
"loss": 0.2451,
"step": 475
},
{
"epoch": 0.17265143271672106,
"grad_norm": 0.1305808424949646,
"learning_rate": 0.00018618983005479637,
"loss": 0.2546,
"step": 476
},
{
"epoch": 0.17301414581066377,
"grad_norm": 0.1403343826532364,
"learning_rate": 0.00018613178162035624,
"loss": 0.2566,
"step": 477
},
{
"epoch": 0.17337685890460647,
"grad_norm": 0.12340683490037918,
"learning_rate": 0.00018607362053212545,
"loss": 0.2402,
"step": 478
},
{
"epoch": 0.17373957199854914,
"grad_norm": 0.12032376229763031,
"learning_rate": 0.00018601534686617423,
"loss": 0.2524,
"step": 479
},
{
"epoch": 0.17410228509249184,
"grad_norm": 0.14251156151294708,
"learning_rate": 0.00018595696069872013,
"loss": 0.2386,
"step": 480
},
{
"epoch": 0.17446499818643452,
"grad_norm": 0.12001265585422516,
"learning_rate": 0.00018589846210612776,
"loss": 0.2311,
"step": 481
},
{
"epoch": 0.17482771128037722,
"grad_norm": 0.127760112285614,
"learning_rate": 0.00018583985116490877,
"loss": 0.2528,
"step": 482
},
{
"epoch": 0.17519042437431992,
"grad_norm": 0.1348508894443512,
"learning_rate": 0.0001857811279517219,
"loss": 0.2861,
"step": 483
},
{
"epoch": 0.1755531374682626,
"grad_norm": 0.1362610161304474,
"learning_rate": 0.00018572229254337254,
"loss": 0.2606,
"step": 484
},
{
"epoch": 0.1759158505622053,
"grad_norm": 0.12335646897554398,
"learning_rate": 0.00018566334501681294,
"loss": 0.2735,
"step": 485
},
{
"epoch": 0.176278563656148,
"grad_norm": 0.2398405522108078,
"learning_rate": 0.000185604285449142,
"loss": 0.2686,
"step": 486
},
{
"epoch": 0.17664127675009067,
"grad_norm": 0.12291895598173141,
"learning_rate": 0.00018554511391760502,
"loss": 0.251,
"step": 487
},
{
"epoch": 0.17700398984403337,
"grad_norm": 0.1420765072107315,
"learning_rate": 0.00018548583049959394,
"loss": 0.3053,
"step": 488
},
{
"epoch": 0.17736670293797607,
"grad_norm": 0.13731782138347626,
"learning_rate": 0.0001854264352726469,
"loss": 0.2508,
"step": 489
},
{
"epoch": 0.17772941603191875,
"grad_norm": 0.12329670786857605,
"learning_rate": 0.00018536692831444836,
"loss": 0.2544,
"step": 490
},
{
"epoch": 0.17809212912586145,
"grad_norm": 0.13219058513641357,
"learning_rate": 0.0001853073097028288,
"loss": 0.2933,
"step": 491
},
{
"epoch": 0.17845484221980412,
"grad_norm": 0.13322101533412933,
"learning_rate": 0.00018524757951576487,
"loss": 0.2546,
"step": 492
},
{
"epoch": 0.17881755531374682,
"grad_norm": 0.13400037586688995,
"learning_rate": 0.00018518773783137907,
"loss": 0.2538,
"step": 493
},
{
"epoch": 0.17918026840768952,
"grad_norm": 0.1361285001039505,
"learning_rate": 0.0001851277847279398,
"loss": 0.2522,
"step": 494
},
{
"epoch": 0.1795429815016322,
"grad_norm": 0.1310225874185562,
"learning_rate": 0.00018506772028386106,
"loss": 0.2667,
"step": 495
},
{
"epoch": 0.1799056945955749,
"grad_norm": 0.12234266102313995,
"learning_rate": 0.00018500754457770257,
"loss": 0.2392,
"step": 496
},
{
"epoch": 0.1802684076895176,
"grad_norm": 0.1298176795244217,
"learning_rate": 0.00018494725768816958,
"loss": 0.2573,
"step": 497
},
{
"epoch": 0.18063112078346028,
"grad_norm": 0.1306108981370926,
"learning_rate": 0.00018488685969411276,
"loss": 0.2524,
"step": 498
},
{
"epoch": 0.18099383387740298,
"grad_norm": 0.13212443888187408,
"learning_rate": 0.00018482635067452804,
"loss": 0.2577,
"step": 499
},
{
"epoch": 0.18135654697134568,
"grad_norm": 0.12641021609306335,
"learning_rate": 0.0001847657307085566,
"loss": 0.2585,
"step": 500
},
{
"epoch": 0.18171926006528835,
"grad_norm": 0.13970649242401123,
"learning_rate": 0.00018470499987548473,
"loss": 0.2652,
"step": 501
},
{
"epoch": 0.18208197315923105,
"grad_norm": 0.12708009779453278,
"learning_rate": 0.0001846441582547437,
"loss": 0.2675,
"step": 502
},
{
"epoch": 0.18244468625317373,
"grad_norm": 0.1252969652414322,
"learning_rate": 0.00018458320592590975,
"loss": 0.2622,
"step": 503
},
{
"epoch": 0.18280739934711643,
"grad_norm": 0.13454315066337585,
"learning_rate": 0.0001845221429687038,
"loss": 0.2848,
"step": 504
},
{
"epoch": 0.18317011244105913,
"grad_norm": 0.11531683802604675,
"learning_rate": 0.0001844609694629916,
"loss": 0.2335,
"step": 505
},
{
"epoch": 0.1835328255350018,
"grad_norm": 0.12405534833669662,
"learning_rate": 0.00018439968548878338,
"loss": 0.2494,
"step": 506
},
{
"epoch": 0.1838955386289445,
"grad_norm": 0.12868863344192505,
"learning_rate": 0.00018433829112623394,
"loss": 0.2551,
"step": 507
},
{
"epoch": 0.1842582517228872,
"grad_norm": 0.12778586149215698,
"learning_rate": 0.00018427678645564235,
"loss": 0.2519,
"step": 508
},
{
"epoch": 0.18462096481682988,
"grad_norm": 0.12378937751054764,
"learning_rate": 0.00018421517155745208,
"loss": 0.2463,
"step": 509
},
{
"epoch": 0.18498367791077258,
"grad_norm": 0.12006038427352905,
"learning_rate": 0.00018415344651225067,
"loss": 0.2434,
"step": 510
},
{
"epoch": 0.18534639100471528,
"grad_norm": 0.12323882430791855,
"learning_rate": 0.0001840916114007698,
"loss": 0.2495,
"step": 511
},
{
"epoch": 0.18570910409865796,
"grad_norm": 0.12510351836681366,
"learning_rate": 0.00018402966630388505,
"loss": 0.2421,
"step": 512
},
{
"epoch": 0.18607181719260066,
"grad_norm": 0.16430193185806274,
"learning_rate": 0.00018396761130261586,
"loss": 0.261,
"step": 513
},
{
"epoch": 0.18643453028654333,
"grad_norm": 0.13129295408725739,
"learning_rate": 0.0001839054464781255,
"loss": 0.2552,
"step": 514
},
{
"epoch": 0.18679724338048603,
"grad_norm": 0.12675730884075165,
"learning_rate": 0.00018384317191172072,
"loss": 0.2443,
"step": 515
},
{
"epoch": 0.18715995647442873,
"grad_norm": 0.1283879280090332,
"learning_rate": 0.00018378078768485192,
"loss": 0.2453,
"step": 516
},
{
"epoch": 0.1875226695683714,
"grad_norm": 0.12647312879562378,
"learning_rate": 0.00018371829387911292,
"loss": 0.2434,
"step": 517
},
{
"epoch": 0.1878853826623141,
"grad_norm": 0.12233056873083115,
"learning_rate": 0.0001836556905762409,
"loss": 0.283,
"step": 518
},
{
"epoch": 0.1882480957562568,
"grad_norm": 0.13304516673088074,
"learning_rate": 0.00018359297785811612,
"loss": 0.2545,
"step": 519
},
{
"epoch": 0.18861080885019949,
"grad_norm": 0.13864544034004211,
"learning_rate": 0.000183530155806762,
"loss": 0.2571,
"step": 520
},
{
"epoch": 0.1889735219441422,
"grad_norm": 0.1448136270046234,
"learning_rate": 0.00018346722450434508,
"loss": 0.2576,
"step": 521
},
{
"epoch": 0.1893362350380849,
"grad_norm": 0.14094996452331543,
"learning_rate": 0.00018340418403317463,
"loss": 0.2568,
"step": 522
},
{
"epoch": 0.18969894813202756,
"grad_norm": 0.13471728563308716,
"learning_rate": 0.00018334103447570282,
"loss": 0.2271,
"step": 523
},
{
"epoch": 0.19006166122597026,
"grad_norm": 0.12976421415805817,
"learning_rate": 0.00018327777591452436,
"loss": 0.2386,
"step": 524
},
{
"epoch": 0.19042437431991294,
"grad_norm": 0.15379559993743896,
"learning_rate": 0.00018321440843237672,
"loss": 0.2681,
"step": 525
},
{
"epoch": 0.19078708741385564,
"grad_norm": 0.16950151324272156,
"learning_rate": 0.00018315093211213962,
"loss": 0.2526,
"step": 526
},
{
"epoch": 0.19114980050779834,
"grad_norm": 0.13350321352481842,
"learning_rate": 0.00018308734703683535,
"loss": 0.2495,
"step": 527
},
{
"epoch": 0.191512513601741,
"grad_norm": 0.14698749780654907,
"learning_rate": 0.00018302365328962824,
"loss": 0.2381,
"step": 528
},
{
"epoch": 0.19187522669568371,
"grad_norm": 0.12897023558616638,
"learning_rate": 0.0001829598509538249,
"loss": 0.256,
"step": 529
},
{
"epoch": 0.19223793978962642,
"grad_norm": 0.14562232792377472,
"learning_rate": 0.0001828959401128739,
"loss": 0.2607,
"step": 530
},
{
"epoch": 0.1926006528835691,
"grad_norm": 0.13689380884170532,
"learning_rate": 0.0001828319208503657,
"loss": 0.2451,
"step": 531
},
{
"epoch": 0.1929633659775118,
"grad_norm": 0.130660280585289,
"learning_rate": 0.00018276779325003268,
"loss": 0.2554,
"step": 532
},
{
"epoch": 0.1933260790714545,
"grad_norm": 0.12638305127620697,
"learning_rate": 0.00018270355739574877,
"loss": 0.2496,
"step": 533
},
{
"epoch": 0.19368879216539717,
"grad_norm": 0.14226087927818298,
"learning_rate": 0.00018263921337152955,
"loss": 0.2423,
"step": 534
},
{
"epoch": 0.19405150525933987,
"grad_norm": 0.1410246342420578,
"learning_rate": 0.00018257476126153218,
"loss": 0.2721,
"step": 535
},
{
"epoch": 0.19441421835328254,
"grad_norm": 0.1288328468799591,
"learning_rate": 0.00018251020115005504,
"loss": 0.2321,
"step": 536
},
{
"epoch": 0.19477693144722524,
"grad_norm": 0.14098510146141052,
"learning_rate": 0.0001824455331215378,
"loss": 0.2467,
"step": 537
},
{
"epoch": 0.19513964454116794,
"grad_norm": 0.13489827513694763,
"learning_rate": 0.00018238075726056136,
"loss": 0.2491,
"step": 538
},
{
"epoch": 0.19550235763511062,
"grad_norm": 0.13195975124835968,
"learning_rate": 0.00018231587365184754,
"loss": 0.2443,
"step": 539
},
{
"epoch": 0.19586507072905332,
"grad_norm": 0.1283298283815384,
"learning_rate": 0.00018225088238025915,
"loss": 0.2465,
"step": 540
},
{
"epoch": 0.19622778382299602,
"grad_norm": 0.11871767789125443,
"learning_rate": 0.00018218578353079988,
"loss": 0.227,
"step": 541
},
{
"epoch": 0.1965904969169387,
"grad_norm": 0.14271649718284607,
"learning_rate": 0.00018212057718861396,
"loss": 0.2734,
"step": 542
},
{
"epoch": 0.1969532100108814,
"grad_norm": 0.14445483684539795,
"learning_rate": 0.00018205526343898637,
"loss": 0.2417,
"step": 543
},
{
"epoch": 0.1973159231048241,
"grad_norm": 0.13704335689544678,
"learning_rate": 0.00018198984236734246,
"loss": 0.287,
"step": 544
},
{
"epoch": 0.19767863619876677,
"grad_norm": 0.12846963107585907,
"learning_rate": 0.00018192431405924804,
"loss": 0.2448,
"step": 545
},
{
"epoch": 0.19804134929270947,
"grad_norm": 0.14025187492370605,
"learning_rate": 0.00018185867860040907,
"loss": 0.2277,
"step": 546
},
{
"epoch": 0.19840406238665215,
"grad_norm": 0.12117055058479309,
"learning_rate": 0.00018179293607667178,
"loss": 0.2434,
"step": 547
},
{
"epoch": 0.19876677548059485,
"grad_norm": 0.1310604214668274,
"learning_rate": 0.00018172708657402233,
"loss": 0.2414,
"step": 548
},
{
"epoch": 0.19912948857453755,
"grad_norm": 0.15536460280418396,
"learning_rate": 0.00018166113017858683,
"loss": 0.2608,
"step": 549
},
{
"epoch": 0.19949220166848022,
"grad_norm": 0.1420615315437317,
"learning_rate": 0.00018159506697663127,
"loss": 0.269,
"step": 550
},
{
"epoch": 0.19985491476242292,
"grad_norm": 0.13386112451553345,
"learning_rate": 0.00018152889705456117,
"loss": 0.2728,
"step": 551
},
{
"epoch": 0.20021762785636563,
"grad_norm": 0.12435004115104675,
"learning_rate": 0.00018146262049892185,
"loss": 0.251,
"step": 552
},
{
"epoch": 0.2005803409503083,
"grad_norm": 0.13267625868320465,
"learning_rate": 0.00018139623739639788,
"loss": 0.2844,
"step": 553
},
{
"epoch": 0.200943054044251,
"grad_norm": 0.13061115145683289,
"learning_rate": 0.00018132974783381336,
"loss": 0.2287,
"step": 554
},
{
"epoch": 0.20130576713819368,
"grad_norm": 0.13054601848125458,
"learning_rate": 0.0001812631518981315,
"loss": 0.237,
"step": 555
},
{
"epoch": 0.20166848023213638,
"grad_norm": 0.1794627159833908,
"learning_rate": 0.00018119644967645474,
"loss": 0.2752,
"step": 556
},
{
"epoch": 0.20203119332607908,
"grad_norm": 0.13099108636379242,
"learning_rate": 0.00018112964125602447,
"loss": 0.2514,
"step": 557
},
{
"epoch": 0.20239390642002175,
"grad_norm": 0.13102415204048157,
"learning_rate": 0.000181062726724221,
"loss": 0.2428,
"step": 558
},
{
"epoch": 0.20275661951396445,
"grad_norm": 0.13251091539859772,
"learning_rate": 0.00018099570616856344,
"loss": 0.2452,
"step": 559
},
{
"epoch": 0.20311933260790715,
"grad_norm": 0.12863093614578247,
"learning_rate": 0.00018092857967670956,
"loss": 0.256,
"step": 560
},
{
"epoch": 0.20348204570184983,
"grad_norm": 0.13334688544273376,
"learning_rate": 0.00018086134733645565,
"loss": 0.2608,
"step": 561
},
{
"epoch": 0.20384475879579253,
"grad_norm": 0.15378229320049286,
"learning_rate": 0.00018079400923573652,
"loss": 0.2416,
"step": 562
},
{
"epoch": 0.20420747188973523,
"grad_norm": 0.1594190150499344,
"learning_rate": 0.00018072656546262524,
"loss": 0.2526,
"step": 563
},
{
"epoch": 0.2045701849836779,
"grad_norm": 0.13872471451759338,
"learning_rate": 0.00018065901610533306,
"loss": 0.2379,
"step": 564
},
{
"epoch": 0.2049328980776206,
"grad_norm": 0.1253708302974701,
"learning_rate": 0.0001805913612522095,
"loss": 0.2352,
"step": 565
},
{
"epoch": 0.20529561117156328,
"grad_norm": 0.13366468250751495,
"learning_rate": 0.00018052360099174184,
"loss": 0.2448,
"step": 566
},
{
"epoch": 0.20565832426550598,
"grad_norm": 605528.9375,
"learning_rate": 0.00018045573541255534,
"loss": 0.2251,
"step": 567
},
{
"epoch": 0.20602103735944868,
"grad_norm": 0.18479633331298828,
"learning_rate": 0.00018038776460341303,
"loss": 0.254,
"step": 568
},
{
"epoch": 0.20638375045339136,
"grad_norm": 0.20463520288467407,
"learning_rate": 0.0001803196886532155,
"loss": 0.2328,
"step": 569
},
{
"epoch": 0.20674646354733406,
"grad_norm": 0.1946071833372116,
"learning_rate": 0.00018025150765100094,
"loss": 0.2622,
"step": 570
},
{
"epoch": 0.20710917664127676,
"grad_norm": 0.16838648915290833,
"learning_rate": 0.00018018322168594485,
"loss": 0.2712,
"step": 571
},
{
"epoch": 0.20747188973521943,
"grad_norm": 0.20080481469631195,
"learning_rate": 0.00018011483084736006,
"loss": 0.2465,
"step": 572
},
{
"epoch": 0.20783460282916214,
"grad_norm": 0.19547294080257416,
"learning_rate": 0.00018004633522469656,
"loss": 0.2829,
"step": 573
},
{
"epoch": 0.20819731592310484,
"grad_norm": 0.14593558013439178,
"learning_rate": 0.00017997773490754137,
"loss": 0.2532,
"step": 574
},
{
"epoch": 0.2085600290170475,
"grad_norm": 0.1449822634458542,
"learning_rate": 0.00017990902998561855,
"loss": 0.2528,
"step": 575
},
{
"epoch": 0.2089227421109902,
"grad_norm": 0.14969614148139954,
"learning_rate": 0.0001798402205487888,
"loss": 0.2389,
"step": 576
},
{
"epoch": 0.20928545520493289,
"grad_norm": 0.14283829927444458,
"learning_rate": 0.00017977130668704965,
"loss": 0.2337,
"step": 577
},
{
"epoch": 0.2096481682988756,
"grad_norm": 0.1496269404888153,
"learning_rate": 0.00017970228849053515,
"loss": 0.259,
"step": 578
},
{
"epoch": 0.2100108813928183,
"grad_norm": 0.13835981488227844,
"learning_rate": 0.00017963316604951586,
"loss": 0.2628,
"step": 579
},
{
"epoch": 0.21037359448676096,
"grad_norm": 0.14784668385982513,
"learning_rate": 0.0001795639394543986,
"loss": 0.2488,
"step": 580
},
{
"epoch": 0.21073630758070366,
"grad_norm": 0.13575692474842072,
"learning_rate": 0.00017949460879572652,
"loss": 0.2403,
"step": 581
},
{
"epoch": 0.21109902067464636,
"grad_norm": 0.14234420657157898,
"learning_rate": 0.00017942517416417878,
"loss": 0.2649,
"step": 582
},
{
"epoch": 0.21146173376858904,
"grad_norm": 0.13922925293445587,
"learning_rate": 0.0001793556356505706,
"loss": 0.2466,
"step": 583
},
{
"epoch": 0.21182444686253174,
"grad_norm": 0.1288311779499054,
"learning_rate": 0.00017928599334585306,
"loss": 0.2314,
"step": 584
},
{
"epoch": 0.21218715995647444,
"grad_norm": 0.12375061213970184,
"learning_rate": 0.00017921624734111292,
"loss": 0.2401,
"step": 585
},
{
"epoch": 0.21254987305041712,
"grad_norm": 0.12890039384365082,
"learning_rate": 0.0001791463977275727,
"loss": 0.2416,
"step": 586
},
{
"epoch": 0.21291258614435982,
"grad_norm": 0.13691289722919464,
"learning_rate": 0.00017907644459659033,
"loss": 0.2473,
"step": 587
},
{
"epoch": 0.2132752992383025,
"grad_norm": 0.3051564693450928,
"learning_rate": 0.0001790063880396591,
"loss": 0.2464,
"step": 588
},
{
"epoch": 0.2136380123322452,
"grad_norm": 0.13991987705230713,
"learning_rate": 0.00017893622814840773,
"loss": 0.2526,
"step": 589
},
{
"epoch": 0.2140007254261879,
"grad_norm": 0.12774237990379333,
"learning_rate": 0.00017886596501459992,
"loss": 0.2375,
"step": 590
},
{
"epoch": 0.21436343852013057,
"grad_norm": 0.13759708404541016,
"learning_rate": 0.00017879559873013452,
"loss": 0.2248,
"step": 591
},
{
"epoch": 0.21472615161407327,
"grad_norm": 0.13571417331695557,
"learning_rate": 0.00017872512938704523,
"loss": 0.2612,
"step": 592
},
{
"epoch": 0.21508886470801597,
"grad_norm": 0.1446496546268463,
"learning_rate": 0.00017865455707750063,
"loss": 0.2466,
"step": 593
},
{
"epoch": 0.21545157780195864,
"grad_norm": 0.12743471562862396,
"learning_rate": 0.00017858388189380387,
"loss": 0.2681,
"step": 594
},
{
"epoch": 0.21581429089590135,
"grad_norm": 0.1251528263092041,
"learning_rate": 0.00017851310392839266,
"loss": 0.246,
"step": 595
},
{
"epoch": 0.21617700398984405,
"grad_norm": 0.12966857850551605,
"learning_rate": 0.0001784422232738392,
"loss": 0.2293,
"step": 596
},
{
"epoch": 0.21653971708378672,
"grad_norm": 0.14909860491752625,
"learning_rate": 0.00017837124002285,
"loss": 0.2577,
"step": 597
},
{
"epoch": 0.21690243017772942,
"grad_norm": 0.136635884642601,
"learning_rate": 0.00017830015426826567,
"loss": 0.262,
"step": 598
},
{
"epoch": 0.2172651432716721,
"grad_norm": 0.13285911083221436,
"learning_rate": 0.000178228966103061,
"loss": 0.2598,
"step": 599
},
{
"epoch": 0.2176278563656148,
"grad_norm": 0.13522981107234955,
"learning_rate": 0.00017815767562034463,
"loss": 0.2469,
"step": 600
},
{
"epoch": 0.2179905694595575,
"grad_norm": 0.13613048195838928,
"learning_rate": 0.00017808628291335912,
"loss": 0.2519,
"step": 601
},
{
"epoch": 0.21835328255350017,
"grad_norm": 0.14597558975219727,
"learning_rate": 0.00017801478807548063,
"loss": 0.2651,
"step": 602
},
{
"epoch": 0.21871599564744287,
"grad_norm": 0.13757093250751495,
"learning_rate": 0.00017794319120021895,
"loss": 0.2593,
"step": 603
},
{
"epoch": 0.21907870874138557,
"grad_norm": 0.13094554841518402,
"learning_rate": 0.00017787149238121733,
"loss": 0.2546,
"step": 604
},
{
"epoch": 0.21944142183532825,
"grad_norm": 125.78084564208984,
"learning_rate": 0.00017779969171225236,
"loss": 0.2456,
"step": 605
},
{
"epoch": 0.21980413492927095,
"grad_norm": 0.15768976509571075,
"learning_rate": 0.00017772778928723383,
"loss": 0.2412,
"step": 606
},
{
"epoch": 0.22016684802321365,
"grad_norm": 0.19074760377407074,
"learning_rate": 0.00017765578520020459,
"loss": 0.2699,
"step": 607
},
{
"epoch": 0.22052956111715633,
"grad_norm": 0.1577846109867096,
"learning_rate": 0.0001775836795453405,
"loss": 0.2737,
"step": 608
},
{
"epoch": 0.22089227421109903,
"grad_norm": 0.153973788022995,
"learning_rate": 0.00017751147241695025,
"loss": 0.2336,
"step": 609
},
{
"epoch": 0.2212549873050417,
"grad_norm": 0.16625823080539703,
"learning_rate": 0.0001774391639094753,
"loss": 0.248,
"step": 610
},
{
"epoch": 0.2216177003989844,
"grad_norm": 0.17917267978191376,
"learning_rate": 0.00017736675411748955,
"loss": 0.2559,
"step": 611
},
{
"epoch": 0.2219804134929271,
"grad_norm": 0.15878534317016602,
"learning_rate": 0.00017729424313569955,
"loss": 0.249,
"step": 612
},
{
"epoch": 0.22234312658686978,
"grad_norm": 0.1509985774755478,
"learning_rate": 0.00017722163105894412,
"loss": 0.2607,
"step": 613
},
{
"epoch": 0.22270583968081248,
"grad_norm": 0.13934160768985748,
"learning_rate": 0.0001771489179821943,
"loss": 0.2377,
"step": 614
},
{
"epoch": 0.22306855277475518,
"grad_norm": 0.1717095524072647,
"learning_rate": 0.00017707610400055323,
"loss": 0.2554,
"step": 615
},
{
"epoch": 0.22343126586869785,
"grad_norm": 0.13818614184856415,
"learning_rate": 0.00017700318920925605,
"loss": 0.2748,
"step": 616
},
{
"epoch": 0.22379397896264056,
"grad_norm": 0.12828463315963745,
"learning_rate": 0.00017693017370366972,
"loss": 0.2398,
"step": 617
},
{
"epoch": 0.22415669205658323,
"grad_norm": 0.13687558472156525,
"learning_rate": 0.00017685705757929294,
"loss": 0.2735,
"step": 618
},
{
"epoch": 0.22451940515052593,
"grad_norm": 0.1353394091129303,
"learning_rate": 0.00017678384093175605,
"loss": 0.2428,
"step": 619
},
{
"epoch": 0.22488211824446863,
"grad_norm": 0.1443159580230713,
"learning_rate": 0.00017671052385682078,
"loss": 0.2566,
"step": 620
},
{
"epoch": 0.2252448313384113,
"grad_norm": 0.14144475758075714,
"learning_rate": 0.00017663710645038035,
"loss": 0.2482,
"step": 621
},
{
"epoch": 0.225607544432354,
"grad_norm": 0.14739158749580383,
"learning_rate": 0.000176563588808459,
"loss": 0.253,
"step": 622
},
{
"epoch": 0.2259702575262967,
"grad_norm": 0.14374294877052307,
"learning_rate": 0.0001764899710272123,
"loss": 0.2394,
"step": 623
},
{
"epoch": 0.22633297062023938,
"grad_norm": 0.14988651871681213,
"learning_rate": 0.00017641625320292663,
"loss": 0.2953,
"step": 624
},
{
"epoch": 0.22669568371418208,
"grad_norm": 0.1295817494392395,
"learning_rate": 0.00017634243543201926,
"loss": 0.2177,
"step": 625
},
{
"epoch": 0.22705839680812478,
"grad_norm": 0.13908831775188446,
"learning_rate": 0.0001762685178110382,
"loss": 0.2348,
"step": 626
},
{
"epoch": 0.22742110990206746,
"grad_norm": 0.12676572799682617,
"learning_rate": 0.0001761945004366621,
"loss": 0.2347,
"step": 627
},
{
"epoch": 0.22778382299601016,
"grad_norm": 0.12473898380994797,
"learning_rate": 0.00017612038340569997,
"loss": 0.2161,
"step": 628
},
{
"epoch": 0.22814653608995283,
"grad_norm": 0.12910184264183044,
"learning_rate": 0.00017604616681509127,
"loss": 0.2476,
"step": 629
},
{
"epoch": 0.22850924918389554,
"grad_norm": 0.1438639611005783,
"learning_rate": 0.0001759718507619056,
"loss": 0.2464,
"step": 630
},
{
"epoch": 0.22887196227783824,
"grad_norm": 0.1412367820739746,
"learning_rate": 0.00017589743534334273,
"loss": 0.2475,
"step": 631
},
{
"epoch": 0.2292346753717809,
"grad_norm": 0.13323849439620972,
"learning_rate": 0.00017582292065673226,
"loss": 0.2352,
"step": 632
},
{
"epoch": 0.2295973884657236,
"grad_norm": 0.13439258933067322,
"learning_rate": 0.0001757483067995338,
"loss": 0.3278,
"step": 633
},
{
"epoch": 0.2299601015596663,
"grad_norm": 0.1343153417110443,
"learning_rate": 0.0001756735938693365,
"loss": 0.2419,
"step": 634
},
{
"epoch": 0.230322814653609,
"grad_norm": 0.12620678544044495,
"learning_rate": 0.0001755987819638592,
"loss": 0.2428,
"step": 635
},
{
"epoch": 0.2306855277475517,
"grad_norm": 0.1367313414812088,
"learning_rate": 0.00017552387118095015,
"loss": 0.2501,
"step": 636
},
{
"epoch": 0.2310482408414944,
"grad_norm": 0.14542607963085175,
"learning_rate": 0.00017544886161858695,
"loss": 0.2838,
"step": 637
},
{
"epoch": 0.23141095393543706,
"grad_norm": 0.13652457296848297,
"learning_rate": 0.0001753737533748763,
"loss": 0.2328,
"step": 638
},
{
"epoch": 0.23177366702937977,
"grad_norm": 0.13839620351791382,
"learning_rate": 0.00017529854654805416,
"loss": 0.2479,
"step": 639
},
{
"epoch": 0.23213638012332244,
"grad_norm": 0.1453743427991867,
"learning_rate": 0.00017522324123648525,
"loss": 0.2267,
"step": 640
},
{
"epoch": 0.23249909321726514,
"grad_norm": 0.1310967206954956,
"learning_rate": 0.0001751478375386632,
"loss": 0.2194,
"step": 641
},
{
"epoch": 0.23286180631120784,
"grad_norm": 0.13854770362377167,
"learning_rate": 0.00017507233555321024,
"loss": 0.2447,
"step": 642
},
{
"epoch": 0.23322451940515052,
"grad_norm": 0.12980610132217407,
"learning_rate": 0.00017499673537887722,
"loss": 0.2391,
"step": 643
},
{
"epoch": 0.23358723249909322,
"grad_norm": 0.12693443894386292,
"learning_rate": 0.0001749210371145434,
"loss": 0.2267,
"step": 644
},
{
"epoch": 0.23394994559303592,
"grad_norm": 0.13409999012947083,
"learning_rate": 0.00017484524085921633,
"loss": 0.2464,
"step": 645
},
{
"epoch": 0.2343126586869786,
"grad_norm": 0.1421654224395752,
"learning_rate": 0.0001747693467120317,
"loss": 0.2544,
"step": 646
},
{
"epoch": 0.2346753717809213,
"grad_norm": 0.13795344531536102,
"learning_rate": 0.00017469335477225326,
"loss": 0.2368,
"step": 647
},
{
"epoch": 0.235038084874864,
"grad_norm": 0.14090494811534882,
"learning_rate": 0.0001746172651392727,
"loss": 0.2414,
"step": 648
},
{
"epoch": 0.23540079796880667,
"grad_norm": 0.13511234521865845,
"learning_rate": 0.0001745410779126094,
"loss": 0.2548,
"step": 649
},
{
"epoch": 0.23576351106274937,
"grad_norm": 0.12285248935222626,
"learning_rate": 0.00017446479319191047,
"loss": 0.2211,
"step": 650
},
{
"epoch": 0.23612622415669204,
"grad_norm": 0.13343022763729095,
"learning_rate": 0.00017438841107695046,
"loss": 0.2848,
"step": 651
},
{
"epoch": 0.23648893725063475,
"grad_norm": 0.1315213143825531,
"learning_rate": 0.00017431193166763138,
"loss": 0.2493,
"step": 652
},
{
"epoch": 0.23685165034457745,
"grad_norm": 0.13958190381526947,
"learning_rate": 0.0001742353550639824,
"loss": 0.3001,
"step": 653
},
{
"epoch": 0.23721436343852012,
"grad_norm": 0.13711069524288177,
"learning_rate": 0.00017415868136615994,
"loss": 0.249,
"step": 654
},
{
"epoch": 0.23757707653246282,
"grad_norm": 0.13686099648475647,
"learning_rate": 0.0001740819106744473,
"loss": 0.2493,
"step": 655
},
{
"epoch": 0.23793978962640552,
"grad_norm": 0.14648962020874023,
"learning_rate": 0.00017400504308925468,
"loss": 0.2368,
"step": 656
},
{
"epoch": 0.2383025027203482,
"grad_norm": 0.13652493059635162,
"learning_rate": 0.000173928078711119,
"loss": 0.2198,
"step": 657
},
{
"epoch": 0.2386652158142909,
"grad_norm": 0.13376450538635254,
"learning_rate": 0.00017385101764070383,
"loss": 0.2388,
"step": 658
},
{
"epoch": 0.2390279289082336,
"grad_norm": 0.13941293954849243,
"learning_rate": 0.00017377385997879911,
"loss": 0.2465,
"step": 659
},
{
"epoch": 0.23939064200217627,
"grad_norm": 0.13455533981323242,
"learning_rate": 0.0001736966058263212,
"loss": 0.2366,
"step": 660
},
{
"epoch": 0.23975335509611898,
"grad_norm": 0.1292707622051239,
"learning_rate": 0.00017361925528431262,
"loss": 0.2234,
"step": 661
},
{
"epoch": 0.24011606819006165,
"grad_norm": 0.14742062985897064,
"learning_rate": 0.00017354180845394196,
"loss": 0.2498,
"step": 662
},
{
"epoch": 0.24047878128400435,
"grad_norm": 0.14243729412555695,
"learning_rate": 0.00017346426543650377,
"loss": 0.249,
"step": 663
},
{
"epoch": 0.24084149437794705,
"grad_norm": 0.12824714183807373,
"learning_rate": 0.00017338662633341844,
"loss": 0.2407,
"step": 664
},
{
"epoch": 0.24120420747188973,
"grad_norm": 0.13394343852996826,
"learning_rate": 0.00017330889124623187,
"loss": 0.2375,
"step": 665
},
{
"epoch": 0.24156692056583243,
"grad_norm": 0.13167209923267365,
"learning_rate": 0.0001732310602766157,
"loss": 0.2201,
"step": 666
},
{
"epoch": 0.24192963365977513,
"grad_norm": 0.14167827367782593,
"learning_rate": 0.0001731531335263669,
"loss": 0.2351,
"step": 667
},
{
"epoch": 0.2422923467537178,
"grad_norm": 0.13489162921905518,
"learning_rate": 0.0001730751110974077,
"loss": 0.2298,
"step": 668
},
{
"epoch": 0.2426550598476605,
"grad_norm": 0.1397753804922104,
"learning_rate": 0.0001729969930917854,
"loss": 0.2408,
"step": 669
},
{
"epoch": 0.2430177729416032,
"grad_norm": 0.1405513882637024,
"learning_rate": 0.00017291877961167251,
"loss": 0.2098,
"step": 670
},
{
"epoch": 0.24338048603554588,
"grad_norm": 0.17330865561962128,
"learning_rate": 0.00017284047075936617,
"loss": 0.2655,
"step": 671
},
{
"epoch": 0.24374319912948858,
"grad_norm": 0.1363557130098343,
"learning_rate": 0.00017276206663728846,
"loss": 0.2611,
"step": 672
},
{
"epoch": 0.24410591222343125,
"grad_norm": 0.1307671070098877,
"learning_rate": 0.00017268356734798595,
"loss": 0.2198,
"step": 673
},
{
"epoch": 0.24446862531737396,
"grad_norm": 0.1409989595413208,
"learning_rate": 0.0001726049729941297,
"loss": 0.2404,
"step": 674
},
{
"epoch": 0.24483133841131666,
"grad_norm": 0.136042058467865,
"learning_rate": 0.00017252628367851513,
"loss": 0.2537,
"step": 675
},
{
"epoch": 0.24519405150525933,
"grad_norm": 0.1308341771364212,
"learning_rate": 0.00017244749950406186,
"loss": 0.2296,
"step": 676
},
{
"epoch": 0.24555676459920203,
"grad_norm": 0.14312215149402618,
"learning_rate": 0.00017236862057381358,
"loss": 0.2414,
"step": 677
},
{
"epoch": 0.24591947769314473,
"grad_norm": 0.14419759809970856,
"learning_rate": 0.0001722896469909379,
"loss": 0.2353,
"step": 678
},
{
"epoch": 0.2462821907870874,
"grad_norm": 0.13765071332454681,
"learning_rate": 0.0001722105788587262,
"loss": 0.2317,
"step": 679
},
{
"epoch": 0.2466449038810301,
"grad_norm": 0.1362527757883072,
"learning_rate": 0.0001721314162805936,
"loss": 0.2201,
"step": 680
},
{
"epoch": 0.2470076169749728,
"grad_norm": 0.13269595801830292,
"learning_rate": 0.0001720521593600787,
"loss": 0.2625,
"step": 681
},
{
"epoch": 0.24737033006891548,
"grad_norm": 0.12634457647800446,
"learning_rate": 0.0001719728082008435,
"loss": 0.223,
"step": 682
},
{
"epoch": 0.24773304316285819,
"grad_norm": 0.1394185721874237,
"learning_rate": 0.00017189336290667325,
"loss": 0.2418,
"step": 683
},
{
"epoch": 0.24809575625680086,
"grad_norm": 0.14138251543045044,
"learning_rate": 0.00017181382358147625,
"loss": 0.2377,
"step": 684
},
{
"epoch": 0.24845846935074356,
"grad_norm": 0.14079631865024567,
"learning_rate": 0.00017173419032928398,
"loss": 0.2207,
"step": 685
},
{
"epoch": 0.24882118244468626,
"grad_norm": 0.1409912407398224,
"learning_rate": 0.00017165446325425064,
"loss": 0.2234,
"step": 686
},
{
"epoch": 0.24918389553862894,
"grad_norm": 0.16069121658802032,
"learning_rate": 0.00017157464246065306,
"loss": 0.2661,
"step": 687
},
{
"epoch": 0.24954660863257164,
"grad_norm": 0.14292632043361664,
"learning_rate": 0.0001714947280528908,
"loss": 0.2316,
"step": 688
},
{
"epoch": 0.24990932172651434,
"grad_norm": 0.13920721411705017,
"learning_rate": 0.0001714147201354858,
"loss": 0.2432,
"step": 689
},
{
"epoch": 0.250272034820457,
"grad_norm": 0.13971884548664093,
"learning_rate": 0.0001713346188130823,
"loss": 0.2281,
"step": 690
},
{
"epoch": 0.2506347479143997,
"grad_norm": 0.15373115241527557,
"learning_rate": 0.0001712544241904467,
"loss": 0.2264,
"step": 691
},
{
"epoch": 0.2509974610083424,
"grad_norm": 0.13534583151340485,
"learning_rate": 0.00017117413637246748,
"loss": 0.2263,
"step": 692
},
{
"epoch": 0.2513601741022851,
"grad_norm": 0.14140291512012482,
"learning_rate": 0.00017109375546415495,
"loss": 0.24,
"step": 693
},
{
"epoch": 0.25172288719622776,
"grad_norm": 0.1363680064678192,
"learning_rate": 0.00017101328157064115,
"loss": 0.2212,
"step": 694
},
{
"epoch": 0.25208560029017046,
"grad_norm": 0.13761445879936218,
"learning_rate": 0.00017093271479717986,
"loss": 0.2368,
"step": 695
},
{
"epoch": 0.25244831338411317,
"grad_norm": 0.13729073107242584,
"learning_rate": 0.0001708520552491462,
"loss": 0.2403,
"step": 696
},
{
"epoch": 0.25281102647805587,
"grad_norm": 0.13290317356586456,
"learning_rate": 0.00017077130303203673,
"loss": 0.2234,
"step": 697
},
{
"epoch": 0.25317373957199857,
"grad_norm": 0.14121422171592712,
"learning_rate": 0.0001706904582514692,
"loss": 0.2289,
"step": 698
},
{
"epoch": 0.2535364526659412,
"grad_norm": 0.1334342509508133,
"learning_rate": 0.0001706095210131824,
"loss": 0.2333,
"step": 699
},
{
"epoch": 0.2538991657598839,
"grad_norm": 0.13697004318237305,
"learning_rate": 0.00017052849142303603,
"loss": 0.2244,
"step": 700
},
{
"epoch": 0.2542618788538266,
"grad_norm": 0.14427930116653442,
"learning_rate": 0.00017044736958701058,
"loss": 0.2731,
"step": 701
},
{
"epoch": 0.2546245919477693,
"grad_norm": 0.14478136599063873,
"learning_rate": 0.00017036615561120727,
"loss": 0.2432,
"step": 702
},
{
"epoch": 0.254987305041712,
"grad_norm": 0.1374034285545349,
"learning_rate": 0.0001702848496018478,
"loss": 0.217,
"step": 703
},
{
"epoch": 0.2553500181356547,
"grad_norm": 0.14599081873893738,
"learning_rate": 0.00017020345166527412,
"loss": 0.241,
"step": 704
},
{
"epoch": 0.25571273122959737,
"grad_norm": 0.13574494421482086,
"learning_rate": 0.00017012196190794858,
"loss": 0.2329,
"step": 705
},
{
"epoch": 0.25607544432354007,
"grad_norm": 0.1376832127571106,
"learning_rate": 0.00017004038043645357,
"loss": 0.252,
"step": 706
},
{
"epoch": 0.25643815741748277,
"grad_norm": 0.13819095492362976,
"learning_rate": 0.00016995870735749138,
"loss": 0.2547,
"step": 707
},
{
"epoch": 0.25680087051142547,
"grad_norm": 0.12175976485013962,
"learning_rate": 0.00016987694277788417,
"loss": 0.2058,
"step": 708
},
{
"epoch": 0.2571635836053682,
"grad_norm": 0.13914383947849274,
"learning_rate": 0.0001697950868045738,
"loss": 0.2311,
"step": 709
},
{
"epoch": 0.2575262966993108,
"grad_norm": 0.1349351704120636,
"learning_rate": 0.00016971313954462156,
"loss": 0.2203,
"step": 710
},
{
"epoch": 0.2578890097932535,
"grad_norm": 0.1311430037021637,
"learning_rate": 0.00016963110110520827,
"loss": 0.242,
"step": 711
},
{
"epoch": 0.2582517228871962,
"grad_norm": 0.13092203438282013,
"learning_rate": 0.0001695489715936339,
"loss": 0.25,
"step": 712
},
{
"epoch": 0.2586144359811389,
"grad_norm": 0.13544927537441254,
"learning_rate": 0.00016946675111731766,
"loss": 0.2263,
"step": 713
},
{
"epoch": 0.2589771490750816,
"grad_norm": 0.13862383365631104,
"learning_rate": 0.00016938443978379753,
"loss": 0.2404,
"step": 714
},
{
"epoch": 0.2593398621690243,
"grad_norm": 0.14725641906261444,
"learning_rate": 0.00016930203770073053,
"loss": 0.2482,
"step": 715
},
{
"epoch": 0.259702575262967,
"grad_norm": 0.13641703128814697,
"learning_rate": 0.00016921954497589226,
"loss": 0.2431,
"step": 716
},
{
"epoch": 0.2600652883569097,
"grad_norm": 0.1381891518831253,
"learning_rate": 0.00016913696171717688,
"loss": 0.2321,
"step": 717
},
{
"epoch": 0.2604280014508524,
"grad_norm": 0.14194577932357788,
"learning_rate": 0.000169054288032597,
"loss": 0.2907,
"step": 718
},
{
"epoch": 0.2607907145447951,
"grad_norm": 0.14137552678585052,
"learning_rate": 0.00016897152403028357,
"loss": 0.2205,
"step": 719
},
{
"epoch": 0.2611534276387378,
"grad_norm": 0.12619373202323914,
"learning_rate": 0.00016888866981848544,
"loss": 0.2097,
"step": 720
},
{
"epoch": 0.2615161407326804,
"grad_norm": 0.15918751060962677,
"learning_rate": 0.0001688057255055697,
"loss": 0.2578,
"step": 721
},
{
"epoch": 0.2618788538266231,
"grad_norm": 0.13455507159233093,
"learning_rate": 0.00016872269120002108,
"loss": 0.2676,
"step": 722
},
{
"epoch": 0.2622415669205658,
"grad_norm": 0.14259149134159088,
"learning_rate": 0.0001686395670104422,
"loss": 0.2176,
"step": 723
},
{
"epoch": 0.26260428001450853,
"grad_norm": 0.13362933695316315,
"learning_rate": 0.0001685563530455531,
"loss": 0.2167,
"step": 724
},
{
"epoch": 0.26296699310845123,
"grad_norm": 0.13542160391807556,
"learning_rate": 0.00016847304941419128,
"loss": 0.2288,
"step": 725
},
{
"epoch": 0.26332970620239393,
"grad_norm": 0.15378214418888092,
"learning_rate": 0.00016838965622531157,
"loss": 0.287,
"step": 726
},
{
"epoch": 0.2636924192963366,
"grad_norm": 0.1565556526184082,
"learning_rate": 0.00016830617358798587,
"loss": 0.2692,
"step": 727
},
{
"epoch": 0.2640551323902793,
"grad_norm": 0.14884917438030243,
"learning_rate": 0.0001682226016114031,
"loss": 0.2368,
"step": 728
},
{
"epoch": 0.264417845484222,
"grad_norm": 0.13870306313037872,
"learning_rate": 0.000168138940404869,
"loss": 0.2356,
"step": 729
},
{
"epoch": 0.2647805585781647,
"grad_norm": 0.15050628781318665,
"learning_rate": 0.00016805519007780602,
"loss": 0.2524,
"step": 730
},
{
"epoch": 0.2651432716721074,
"grad_norm": 0.1477731466293335,
"learning_rate": 0.00016797135073975326,
"loss": 0.2184,
"step": 731
},
{
"epoch": 0.26550598476605003,
"grad_norm": 0.1533484160900116,
"learning_rate": 0.0001678874225003661,
"loss": 0.2301,
"step": 732
},
{
"epoch": 0.26586869785999273,
"grad_norm": 0.14348532259464264,
"learning_rate": 0.0001678034054694163,
"loss": 0.2397,
"step": 733
},
{
"epoch": 0.26623141095393543,
"grad_norm": 0.14960677921772003,
"learning_rate": 0.0001677192997567917,
"loss": 0.2244,
"step": 734
},
{
"epoch": 0.26659412404787813,
"grad_norm": 0.15019361674785614,
"learning_rate": 0.00016763510547249615,
"loss": 0.2466,
"step": 735
},
{
"epoch": 0.26695683714182084,
"grad_norm": 0.14875197410583496,
"learning_rate": 0.00016755082272664937,
"loss": 0.2106,
"step": 736
},
{
"epoch": 0.26731955023576354,
"grad_norm": 0.14142164587974548,
"learning_rate": 0.00016746645162948672,
"loss": 0.2387,
"step": 737
},
{
"epoch": 0.2676822633297062,
"grad_norm": 0.16096633672714233,
"learning_rate": 0.0001673819922913592,
"loss": 0.2346,
"step": 738
},
{
"epoch": 0.2680449764236489,
"grad_norm": 0.15639543533325195,
"learning_rate": 0.0001672974448227331,
"loss": 0.2839,
"step": 739
},
{
"epoch": 0.2684076895175916,
"grad_norm": 0.1443796008825302,
"learning_rate": 0.0001672128093341901,
"loss": 0.2314,
"step": 740
},
{
"epoch": 0.2687704026115343,
"grad_norm": 0.15442712604999542,
"learning_rate": 0.00016712808593642695,
"loss": 0.2299,
"step": 741
},
{
"epoch": 0.269133115705477,
"grad_norm": 0.14457674324512482,
"learning_rate": 0.00016704327474025533,
"loss": 0.2526,
"step": 742
},
{
"epoch": 0.26949582879941963,
"grad_norm": 0.14981432259082794,
"learning_rate": 0.00016695837585660187,
"loss": 0.2288,
"step": 743
},
{
"epoch": 0.26985854189336234,
"grad_norm": 0.1518179178237915,
"learning_rate": 0.00016687338939650782,
"loss": 0.2264,
"step": 744
},
{
"epoch": 0.27022125498730504,
"grad_norm": 0.16115126013755798,
"learning_rate": 0.00016678831547112895,
"loss": 0.2533,
"step": 745
},
{
"epoch": 0.27058396808124774,
"grad_norm": 0.1538068801164627,
"learning_rate": 0.00016670315419173548,
"loss": 0.2429,
"step": 746
},
{
"epoch": 0.27094668117519044,
"grad_norm": 0.1365380436182022,
"learning_rate": 0.00016661790566971181,
"loss": 0.2222,
"step": 747
},
{
"epoch": 0.27130939426913314,
"grad_norm": 0.14484576880931854,
"learning_rate": 0.00016653257001655652,
"loss": 0.2197,
"step": 748
},
{
"epoch": 0.2716721073630758,
"grad_norm": 0.16303595900535583,
"learning_rate": 0.00016644714734388217,
"loss": 0.253,
"step": 749
},
{
"epoch": 0.2720348204570185,
"grad_norm": 0.14876610040664673,
"learning_rate": 0.00016636163776341504,
"loss": 0.2205,
"step": 750
},
{
"epoch": 0.2723975335509612,
"grad_norm": 0.13568569719791412,
"learning_rate": 0.00016627604138699515,
"loss": 0.2251,
"step": 751
},
{
"epoch": 0.2727602466449039,
"grad_norm": 0.14528821408748627,
"learning_rate": 0.00016619035832657602,
"loss": 0.2346,
"step": 752
},
{
"epoch": 0.2731229597388466,
"grad_norm": 0.13951005041599274,
"learning_rate": 0.0001661045886942245,
"loss": 0.2311,
"step": 753
},
{
"epoch": 0.27348567283278924,
"grad_norm": 0.1355544924736023,
"learning_rate": 0.0001660187326021208,
"loss": 0.2235,
"step": 754
},
{
"epoch": 0.27384838592673194,
"grad_norm": 0.14282123744487762,
"learning_rate": 0.00016593279016255806,
"loss": 0.211,
"step": 755
},
{
"epoch": 0.27421109902067464,
"grad_norm": 0.1680796593427658,
"learning_rate": 0.0001658467614879425,
"loss": 0.2518,
"step": 756
},
{
"epoch": 0.27457381211461734,
"grad_norm": 0.15991435945034027,
"learning_rate": 0.00016576064669079297,
"loss": 0.2419,
"step": 757
},
{
"epoch": 0.27493652520856005,
"grad_norm": 0.1730770766735077,
"learning_rate": 0.0001656744458837411,
"loss": 0.257,
"step": 758
},
{
"epoch": 0.27529923830250275,
"grad_norm": 0.1453644037246704,
"learning_rate": 0.00016558815917953095,
"loss": 0.2532,
"step": 759
},
{
"epoch": 0.2756619513964454,
"grad_norm": 0.1334659457206726,
"learning_rate": 0.00016550178669101891,
"loss": 0.2098,
"step": 760
},
{
"epoch": 0.2760246644903881,
"grad_norm": 0.13118910789489746,
"learning_rate": 0.00016541532853117365,
"loss": 0.214,
"step": 761
},
{
"epoch": 0.2763873775843308,
"grad_norm": 0.14156754314899445,
"learning_rate": 0.0001653287848130758,
"loss": 0.2434,
"step": 762
},
{
"epoch": 0.2767500906782735,
"grad_norm": 0.16743269562721252,
"learning_rate": 0.0001652421556499179,
"loss": 0.2692,
"step": 763
},
{
"epoch": 0.2771128037722162,
"grad_norm": 0.16182062029838562,
"learning_rate": 0.0001651554411550044,
"loss": 0.2194,
"step": 764
},
{
"epoch": 0.27747551686615884,
"grad_norm": 0.14829173684120178,
"learning_rate": 0.0001650686414417511,
"loss": 0.2444,
"step": 765
},
{
"epoch": 0.27783822996010155,
"grad_norm": 0.14184747636318207,
"learning_rate": 0.00016498175662368544,
"loss": 0.2275,
"step": 766
},
{
"epoch": 0.27820094305404425,
"grad_norm": 0.14175622165203094,
"learning_rate": 0.00016489478681444615,
"loss": 0.2368,
"step": 767
},
{
"epoch": 0.27856365614798695,
"grad_norm": 0.14495515823364258,
"learning_rate": 0.0001648077321277831,
"loss": 0.2087,
"step": 768
},
{
"epoch": 0.27892636924192965,
"grad_norm": 0.14581428468227386,
"learning_rate": 0.0001647205926775571,
"loss": 0.2339,
"step": 769
},
{
"epoch": 0.27928908233587235,
"grad_norm": 0.16971313953399658,
"learning_rate": 0.00016463336857773996,
"loss": 0.2564,
"step": 770
},
{
"epoch": 0.279651795429815,
"grad_norm": 0.16059347987174988,
"learning_rate": 0.00016454605994241413,
"loss": 0.2495,
"step": 771
},
{
"epoch": 0.2800145085237577,
"grad_norm": 0.13135506212711334,
"learning_rate": 0.00016445866688577268,
"loss": 0.221,
"step": 772
},
{
"epoch": 0.2803772216177004,
"grad_norm": 0.14712165296077728,
"learning_rate": 0.00016437118952211893,
"loss": 0.232,
"step": 773
},
{
"epoch": 0.2807399347116431,
"grad_norm": 0.1340080052614212,
"learning_rate": 0.00016428362796586668,
"loss": 0.2134,
"step": 774
},
{
"epoch": 0.2811026478055858,
"grad_norm": 0.1442837119102478,
"learning_rate": 0.00016419598233153977,
"loss": 0.2507,
"step": 775
},
{
"epoch": 0.28146536089952845,
"grad_norm": 0.1472170352935791,
"learning_rate": 0.00016410825273377192,
"loss": 0.2053,
"step": 776
},
{
"epoch": 0.28182807399347115,
"grad_norm": 0.16951750218868256,
"learning_rate": 0.0001640204392873068,
"loss": 0.2226,
"step": 777
},
{
"epoch": 0.28219078708741385,
"grad_norm": 0.1475476771593094,
"learning_rate": 0.00016393254210699765,
"loss": 0.2255,
"step": 778
},
{
"epoch": 0.28255350018135655,
"grad_norm": 0.1399717628955841,
"learning_rate": 0.00016384456130780732,
"loss": 0.2296,
"step": 779
},
{
"epoch": 0.28291621327529926,
"grad_norm": 0.15422862768173218,
"learning_rate": 0.00016375649700480792,
"loss": 0.2549,
"step": 780
},
{
"epoch": 0.28327892636924196,
"grad_norm": 0.14808495342731476,
"learning_rate": 0.0001636683493131809,
"loss": 0.2125,
"step": 781
},
{
"epoch": 0.2836416394631846,
"grad_norm": 0.13389019668102264,
"learning_rate": 0.00016358011834821662,
"loss": 0.2216,
"step": 782
},
{
"epoch": 0.2840043525571273,
"grad_norm": 0.14201773703098297,
"learning_rate": 0.0001634918042253145,
"loss": 0.2257,
"step": 783
},
{
"epoch": 0.28436706565107,
"grad_norm": 0.16533806920051575,
"learning_rate": 0.00016340340705998265,
"loss": 0.2245,
"step": 784
},
{
"epoch": 0.2847297787450127,
"grad_norm": 0.15893639624118805,
"learning_rate": 0.0001633149269678378,
"loss": 0.2175,
"step": 785
},
{
"epoch": 0.2850924918389554,
"grad_norm": 0.1425047069787979,
"learning_rate": 0.0001632263640646052,
"loss": 0.252,
"step": 786
},
{
"epoch": 0.28545520493289805,
"grad_norm": 0.15391702950000763,
"learning_rate": 0.00016313771846611827,
"loss": 0.2222,
"step": 787
},
{
"epoch": 0.28581791802684076,
"grad_norm": 132164.875,
"learning_rate": 0.00016304899028831874,
"loss": 0.2179,
"step": 788
},
{
"epoch": 0.28618063112078346,
"grad_norm": 0.1637081801891327,
"learning_rate": 0.00016296017964725632,
"loss": 0.2205,
"step": 789
},
{
"epoch": 0.28654334421472616,
"grad_norm": 0.20489241182804108,
"learning_rate": 0.0001628712866590885,
"loss": 0.2479,
"step": 790
},
{
"epoch": 0.28690605730866886,
"grad_norm": 0.17106997966766357,
"learning_rate": 0.00016278231144008053,
"loss": 0.227,
"step": 791
},
{
"epoch": 0.28726877040261156,
"grad_norm": 0.16591399908065796,
"learning_rate": 0.00016269325410660517,
"loss": 0.2001,
"step": 792
},
{
"epoch": 0.2876314834965542,
"grad_norm": 0.17908765375614166,
"learning_rate": 0.00016260411477514265,
"loss": 0.2311,
"step": 793
},
{
"epoch": 0.2879941965904969,
"grad_norm": 0.2103756070137024,
"learning_rate": 0.00016251489356228037,
"loss": 0.251,
"step": 794
},
{
"epoch": 0.2883569096844396,
"grad_norm": 0.1727806031703949,
"learning_rate": 0.00016242559058471292,
"loss": 0.2193,
"step": 795
},
{
"epoch": 0.2887196227783823,
"grad_norm": 0.15671540796756744,
"learning_rate": 0.0001623362059592417,
"loss": 0.2462,
"step": 796
},
{
"epoch": 0.289082335872325,
"grad_norm": 0.14824596047401428,
"learning_rate": 0.00016224673980277503,
"loss": 0.2235,
"step": 797
},
{
"epoch": 0.28944504896626766,
"grad_norm": 0.15403501689434052,
"learning_rate": 0.00016215719223232778,
"loss": 0.2644,
"step": 798
},
{
"epoch": 0.28980776206021036,
"grad_norm": 0.15009653568267822,
"learning_rate": 0.0001620675633650213,
"loss": 0.243,
"step": 799
},
{
"epoch": 0.29017047515415306,
"grad_norm": 0.16066166758537292,
"learning_rate": 0.0001619778533180834,
"loss": 0.2171,
"step": 800
},
{
"epoch": 0.29053318824809576,
"grad_norm": 0.15927597880363464,
"learning_rate": 0.00016188806220884786,
"loss": 0.217,
"step": 801
},
{
"epoch": 0.29089590134203847,
"grad_norm": 0.14611735939979553,
"learning_rate": 0.00016179819015475465,
"loss": 0.2204,
"step": 802
},
{
"epoch": 0.2912586144359811,
"grad_norm": 0.14521051943302155,
"learning_rate": 0.00016170823727334956,
"loss": 0.1962,
"step": 803
},
{
"epoch": 0.2916213275299238,
"grad_norm": 0.1608162224292755,
"learning_rate": 0.00016161820368228402,
"loss": 0.2263,
"step": 804
},
{
"epoch": 0.2919840406238665,
"grad_norm": 0.1577100157737732,
"learning_rate": 0.00016152808949931516,
"loss": 0.2208,
"step": 805
},
{
"epoch": 0.2923467537178092,
"grad_norm": 0.15033476054668427,
"learning_rate": 0.00016143789484230543,
"loss": 0.215,
"step": 806
},
{
"epoch": 0.2927094668117519,
"grad_norm": 0.14740067720413208,
"learning_rate": 0.00016134761982922253,
"loss": 0.2042,
"step": 807
},
{
"epoch": 0.2930721799056946,
"grad_norm": 0.15068073570728302,
"learning_rate": 0.0001612572645781393,
"loss": 0.2221,
"step": 808
},
{
"epoch": 0.29343489299963726,
"grad_norm": 0.17142775654792786,
"learning_rate": 0.00016116682920723352,
"loss": 0.2142,
"step": 809
},
{
"epoch": 0.29379760609357997,
"grad_norm": 0.15067829191684723,
"learning_rate": 0.0001610763138347877,
"loss": 0.2225,
"step": 810
},
{
"epoch": 0.29416031918752267,
"grad_norm": 0.1574852466583252,
"learning_rate": 0.0001609857185791891,
"loss": 0.2106,
"step": 811
},
{
"epoch": 0.29452303228146537,
"grad_norm": 0.17060889303684235,
"learning_rate": 0.00016089504355892931,
"loss": 0.233,
"step": 812
},
{
"epoch": 0.29488574537540807,
"grad_norm": 0.14020898938179016,
"learning_rate": 0.0001608042888926044,
"loss": 0.2162,
"step": 813
},
{
"epoch": 0.2952484584693507,
"grad_norm": 0.1367609053850174,
"learning_rate": 0.0001607134546989145,
"loss": 0.2224,
"step": 814
},
{
"epoch": 0.2956111715632934,
"grad_norm": 0.14028465747833252,
"learning_rate": 0.0001606225410966638,
"loss": 0.2237,
"step": 815
},
{
"epoch": 0.2959738846572361,
"grad_norm": 0.13773570954799652,
"learning_rate": 0.00016053154820476037,
"loss": 0.224,
"step": 816
},
{
"epoch": 0.2963365977511788,
"grad_norm": 0.14603252708911896,
"learning_rate": 0.000160440476142216,
"loss": 0.217,
"step": 817
},
{
"epoch": 0.2966993108451215,
"grad_norm": 0.15531830489635468,
"learning_rate": 0.00016034932502814587,
"loss": 0.2137,
"step": 818
},
{
"epoch": 0.2970620239390642,
"grad_norm": 0.15454085171222687,
"learning_rate": 0.00016025809498176874,
"loss": 0.2244,
"step": 819
},
{
"epoch": 0.29742473703300687,
"grad_norm": 0.1548180729150772,
"learning_rate": 0.0001601667861224066,
"loss": 0.2517,
"step": 820
},
{
"epoch": 0.29778745012694957,
"grad_norm": 0.1498357206583023,
"learning_rate": 0.00016007539856948436,
"loss": 0.2512,
"step": 821
},
{
"epoch": 0.2981501632208923,
"grad_norm": 0.1419772207736969,
"learning_rate": 0.00015998393244253002,
"loss": 0.2067,
"step": 822
},
{
"epoch": 0.298512876314835,
"grad_norm": 0.14814653992652893,
"learning_rate": 0.0001598923878611743,
"loss": 0.2293,
"step": 823
},
{
"epoch": 0.2988755894087777,
"grad_norm": 0.15222403407096863,
"learning_rate": 0.00015980076494515047,
"loss": 0.2247,
"step": 824
},
{
"epoch": 0.2992383025027203,
"grad_norm": 0.1679450124502182,
"learning_rate": 0.0001597090638142943,
"loss": 0.2631,
"step": 825
},
{
"epoch": 0.299601015596663,
"grad_norm": 0.14880560338497162,
"learning_rate": 0.00015961728458854397,
"loss": 0.2069,
"step": 826
},
{
"epoch": 0.2999637286906057,
"grad_norm": 0.14599819481372833,
"learning_rate": 0.00015952542738793956,
"loss": 0.226,
"step": 827
},
{
"epoch": 0.3003264417845484,
"grad_norm": 0.14673501253128052,
"learning_rate": 0.00015943349233262332,
"loss": 0.2131,
"step": 828
},
{
"epoch": 0.3006891548784911,
"grad_norm": 0.1625213623046875,
"learning_rate": 0.00015934147954283932,
"loss": 0.2289,
"step": 829
},
{
"epoch": 0.30105186797243383,
"grad_norm": 0.15041042864322662,
"learning_rate": 0.00015924938913893324,
"loss": 0.2217,
"step": 830
},
{
"epoch": 0.3014145810663765,
"grad_norm": 0.14617730677127838,
"learning_rate": 0.00015915722124135227,
"loss": 0.2396,
"step": 831
},
{
"epoch": 0.3017772941603192,
"grad_norm": 0.15437570214271545,
"learning_rate": 0.00015906497597064495,
"loss": 0.2434,
"step": 832
},
{
"epoch": 0.3021400072542619,
"grad_norm": 0.146324023604393,
"learning_rate": 0.00015897265344746113,
"loss": 0.2621,
"step": 833
},
{
"epoch": 0.3025027203482046,
"grad_norm": 0.15348979830741882,
"learning_rate": 0.00015888025379255156,
"loss": 0.2198,
"step": 834
},
{
"epoch": 0.3028654334421473,
"grad_norm": 0.14553911983966827,
"learning_rate": 0.00015878777712676796,
"loss": 0.2168,
"step": 835
},
{
"epoch": 0.3032281465360899,
"grad_norm": 0.15064238011837006,
"learning_rate": 0.00015869522357106272,
"loss": 0.2381,
"step": 836
},
{
"epoch": 0.30359085963003263,
"grad_norm": 0.1429353505373001,
"learning_rate": 0.00015860259324648886,
"loss": 0.2444,
"step": 837
},
{
"epoch": 0.30395357272397533,
"grad_norm": 0.14742977917194366,
"learning_rate": 0.00015850988627419968,
"loss": 0.2112,
"step": 838
},
{
"epoch": 0.30431628581791803,
"grad_norm": 0.14249765872955322,
"learning_rate": 0.00015841710277544896,
"loss": 0.2287,
"step": 839
},
{
"epoch": 0.30467899891186073,
"grad_norm": 0.14514710009098053,
"learning_rate": 0.00015832424287159027,
"loss": 0.2229,
"step": 840
},
{
"epoch": 0.30504171200580343,
"grad_norm": 0.15762075781822205,
"learning_rate": 0.00015823130668407738,
"loss": 0.212,
"step": 841
},
{
"epoch": 0.3054044250997461,
"grad_norm": 0.16756275296211243,
"learning_rate": 0.00015813829433446367,
"loss": 0.2431,
"step": 842
},
{
"epoch": 0.3057671381936888,
"grad_norm": 0.2156544029712677,
"learning_rate": 0.00015804520594440223,
"loss": 0.2045,
"step": 843
},
{
"epoch": 0.3061298512876315,
"grad_norm": 0.18604739010334015,
"learning_rate": 0.00015795204163564556,
"loss": 0.2644,
"step": 844
},
{
"epoch": 0.3064925643815742,
"grad_norm": 0.14301113784313202,
"learning_rate": 0.0001578588015300454,
"loss": 0.2114,
"step": 845
},
{
"epoch": 0.3068552774755169,
"grad_norm": 0.14301526546478271,
"learning_rate": 0.00015776548574955275,
"loss": 0.2127,
"step": 846
},
{
"epoch": 0.30721799056945953,
"grad_norm": 0.15024398267269135,
"learning_rate": 0.0001576720944162175,
"loss": 0.207,
"step": 847
},
{
"epoch": 0.30758070366340223,
"grad_norm": 0.14672665297985077,
"learning_rate": 0.00015757862765218838,
"loss": 0.2112,
"step": 848
},
{
"epoch": 0.30794341675734493,
"grad_norm": 0.177405446767807,
"learning_rate": 0.00015748508557971276,
"loss": 0.2248,
"step": 849
},
{
"epoch": 0.30830612985128764,
"grad_norm": 0.16310465335845947,
"learning_rate": 0.00015739146832113656,
"loss": 0.2389,
"step": 850
},
{
"epoch": 0.30866884294523034,
"grad_norm": 0.14648981392383575,
"learning_rate": 0.00015729777599890395,
"loss": 0.2159,
"step": 851
},
{
"epoch": 0.30903155603917304,
"grad_norm": 0.1470453441143036,
"learning_rate": 0.0001572040087355574,
"loss": 0.2216,
"step": 852
},
{
"epoch": 0.3093942691331157,
"grad_norm": 0.15409401059150696,
"learning_rate": 0.00015711016665373727,
"loss": 0.2497,
"step": 853
},
{
"epoch": 0.3097569822270584,
"grad_norm": 0.16030748188495636,
"learning_rate": 0.0001570162498761819,
"loss": 0.2108,
"step": 854
},
{
"epoch": 0.3101196953210011,
"grad_norm": 0.16415894031524658,
"learning_rate": 0.00015692225852572715,
"loss": 0.2297,
"step": 855
},
{
"epoch": 0.3104824084149438,
"grad_norm": 0.1503467857837677,
"learning_rate": 0.00015682819272530663,
"loss": 0.1972,
"step": 856
},
{
"epoch": 0.3108451215088865,
"grad_norm": 0.15261000394821167,
"learning_rate": 0.00015673405259795118,
"loss": 0.2296,
"step": 857
},
{
"epoch": 0.31120783460282914,
"grad_norm": 0.15605837106704712,
"learning_rate": 0.00015663983826678888,
"loss": 0.2135,
"step": 858
},
{
"epoch": 0.31157054769677184,
"grad_norm": 0.13954474031925201,
"learning_rate": 0.0001565455498550449,
"loss": 0.2064,
"step": 859
},
{
"epoch": 0.31193326079071454,
"grad_norm": 0.14538753032684326,
"learning_rate": 0.0001564511874860413,
"loss": 0.2279,
"step": 860
},
{
"epoch": 0.31229597388465724,
"grad_norm": 0.1461893618106842,
"learning_rate": 0.00015635675128319683,
"loss": 0.2203,
"step": 861
},
{
"epoch": 0.31265868697859994,
"grad_norm": 0.14321376383304596,
"learning_rate": 0.0001562622413700268,
"loss": 0.2112,
"step": 862
},
{
"epoch": 0.31302140007254264,
"grad_norm": 0.14480461180210114,
"learning_rate": 0.00015616765787014302,
"loss": 0.2182,
"step": 863
},
{
"epoch": 0.3133841131664853,
"grad_norm": 0.16734722256660461,
"learning_rate": 0.00015607300090725342,
"loss": 0.2222,
"step": 864
},
{
"epoch": 0.313746826260428,
"grad_norm": 0.14616838097572327,
"learning_rate": 0.00015597827060516211,
"loss": 0.2075,
"step": 865
},
{
"epoch": 0.3141095393543707,
"grad_norm": 0.16457431018352509,
"learning_rate": 0.00015588346708776904,
"loss": 0.2271,
"step": 866
},
{
"epoch": 0.3144722524483134,
"grad_norm": 0.16780099272727966,
"learning_rate": 0.00015578859047907004,
"loss": 0.2196,
"step": 867
},
{
"epoch": 0.3148349655422561,
"grad_norm": 0.14990176260471344,
"learning_rate": 0.00015569364090315646,
"loss": 0.2162,
"step": 868
},
{
"epoch": 0.31519767863619874,
"grad_norm": 0.1400328129529953,
"learning_rate": 0.00015559861848421505,
"loss": 0.2114,
"step": 869
},
{
"epoch": 0.31556039173014144,
"grad_norm": 0.15837667882442474,
"learning_rate": 0.00015550352334652788,
"loss": 0.2755,
"step": 870
},
{
"epoch": 0.31592310482408414,
"grad_norm": 0.14617806673049927,
"learning_rate": 0.00015540835561447214,
"loss": 0.2029,
"step": 871
},
{
"epoch": 0.31628581791802685,
"grad_norm": 0.1634027361869812,
"learning_rate": 0.00015531311541251995,
"loss": 0.2451,
"step": 872
},
{
"epoch": 0.31664853101196955,
"grad_norm": 0.17340759932994843,
"learning_rate": 0.00015521780286523824,
"loss": 0.2267,
"step": 873
},
{
"epoch": 0.31701124410591225,
"grad_norm": 0.15501338243484497,
"learning_rate": 0.0001551224180972885,
"loss": 0.1988,
"step": 874
},
{
"epoch": 0.3173739571998549,
"grad_norm": 0.15017758309841156,
"learning_rate": 0.00015502696123342676,
"loss": 0.211,
"step": 875
},
{
"epoch": 0.3177366702937976,
"grad_norm": 0.15657378733158112,
"learning_rate": 0.00015493143239850329,
"loss": 0.2092,
"step": 876
},
{
"epoch": 0.3180993833877403,
"grad_norm": 0.15220540761947632,
"learning_rate": 0.00015483583171746248,
"loss": 0.2413,
"step": 877
},
{
"epoch": 0.318462096481683,
"grad_norm": 0.15332242846488953,
"learning_rate": 0.00015474015931534276,
"loss": 0.2333,
"step": 878
},
{
"epoch": 0.3188248095756257,
"grad_norm": 0.14318165183067322,
"learning_rate": 0.00015464441531727632,
"loss": 0.2282,
"step": 879
},
{
"epoch": 0.31918752266956835,
"grad_norm": 0.15234385430812836,
"learning_rate": 0.00015454859984848895,
"loss": 0.2092,
"step": 880
},
{
"epoch": 0.31955023576351105,
"grad_norm": 0.15263251960277557,
"learning_rate": 0.0001544527130343,
"loss": 0.2142,
"step": 881
},
{
"epoch": 0.31991294885745375,
"grad_norm": 0.1610080748796463,
"learning_rate": 0.00015435675500012212,
"loss": 0.2305,
"step": 882
},
{
"epoch": 0.32027566195139645,
"grad_norm": 0.15507538616657257,
"learning_rate": 0.00015426072587146106,
"loss": 0.2316,
"step": 883
},
{
"epoch": 0.32063837504533915,
"grad_norm": 0.16231822967529297,
"learning_rate": 0.00015416462577391558,
"loss": 0.2953,
"step": 884
},
{
"epoch": 0.32100108813928185,
"grad_norm": 0.14619815349578857,
"learning_rate": 0.00015406845483317727,
"loss": 0.2335,
"step": 885
},
{
"epoch": 0.3213638012332245,
"grad_norm": 0.15803977847099304,
"learning_rate": 0.00015397221317503039,
"loss": 0.212,
"step": 886
},
{
"epoch": 0.3217265143271672,
"grad_norm": 0.148417666554451,
"learning_rate": 0.00015387590092535164,
"loss": 0.2063,
"step": 887
},
{
"epoch": 0.3220892274211099,
"grad_norm": 0.1504986435174942,
"learning_rate": 0.00015377951821011015,
"loss": 0.2156,
"step": 888
},
{
"epoch": 0.3224519405150526,
"grad_norm": 0.1552225649356842,
"learning_rate": 0.00015368306515536708,
"loss": 0.209,
"step": 889
},
{
"epoch": 0.3228146536089953,
"grad_norm": 0.1671207845211029,
"learning_rate": 0.00015358654188727568,
"loss": 0.218,
"step": 890
},
{
"epoch": 0.32317736670293795,
"grad_norm": 0.15497446060180664,
"learning_rate": 0.00015348994853208104,
"loss": 0.2239,
"step": 891
},
{
"epoch": 0.32354007979688065,
"grad_norm": 0.16032548248767853,
"learning_rate": 0.00015339328521611983,
"loss": 0.2069,
"step": 892
},
{
"epoch": 0.32390279289082335,
"grad_norm": 0.15629202127456665,
"learning_rate": 0.00015329655206582036,
"loss": 0.2262,
"step": 893
},
{
"epoch": 0.32426550598476606,
"grad_norm": 0.15609470009803772,
"learning_rate": 0.00015319974920770214,
"loss": 0.2444,
"step": 894
},
{
"epoch": 0.32462821907870876,
"grad_norm": 0.16244526207447052,
"learning_rate": 0.00015310287676837593,
"loss": 0.211,
"step": 895
},
{
"epoch": 0.32499093217265146,
"grad_norm": 0.1519642472267151,
"learning_rate": 0.00015300593487454348,
"loss": 0.2091,
"step": 896
},
{
"epoch": 0.3253536452665941,
"grad_norm": 0.1546807587146759,
"learning_rate": 0.0001529089236529974,
"loss": 0.2226,
"step": 897
},
{
"epoch": 0.3257163583605368,
"grad_norm": 0.14414747059345245,
"learning_rate": 0.00015281184323062097,
"loss": 0.2259,
"step": 898
},
{
"epoch": 0.3260790714544795,
"grad_norm": 0.1484064757823944,
"learning_rate": 0.00015271469373438792,
"loss": 0.2353,
"step": 899
},
{
"epoch": 0.3264417845484222,
"grad_norm": 0.15261922776699066,
"learning_rate": 0.00015261747529136236,
"loss": 0.2094,
"step": 900
},
{
"epoch": 0.3268044976423649,
"grad_norm": 0.16096492111682892,
"learning_rate": 0.00015252018802869866,
"loss": 0.2102,
"step": 901
},
{
"epoch": 0.32716721073630756,
"grad_norm": 0.14988648891448975,
"learning_rate": 0.00015242283207364107,
"loss": 0.1933,
"step": 902
},
{
"epoch": 0.32752992383025026,
"grad_norm": 0.16668923199176788,
"learning_rate": 0.00015232540755352373,
"loss": 0.2132,
"step": 903
},
{
"epoch": 0.32789263692419296,
"grad_norm": 0.1562613993883133,
"learning_rate": 0.00015222791459577051,
"loss": 0.2174,
"step": 904
},
{
"epoch": 0.32825535001813566,
"grad_norm": 0.15152856707572937,
"learning_rate": 0.00015213035332789477,
"loss": 0.2223,
"step": 905
},
{
"epoch": 0.32861806311207836,
"grad_norm": 0.15007184445858002,
"learning_rate": 0.00015203272387749915,
"loss": 0.2184,
"step": 906
},
{
"epoch": 0.32898077620602106,
"grad_norm": 0.1500440090894699,
"learning_rate": 0.0001519350263722755,
"loss": 0.2493,
"step": 907
},
{
"epoch": 0.3293434892999637,
"grad_norm": 0.15756063163280487,
"learning_rate": 0.00015183726094000476,
"loss": 0.2112,
"step": 908
},
{
"epoch": 0.3297062023939064,
"grad_norm": 0.15649868547916412,
"learning_rate": 0.00015173942770855655,
"loss": 0.2105,
"step": 909
},
{
"epoch": 0.3300689154878491,
"grad_norm": 0.17396046221256256,
"learning_rate": 0.00015164152680588938,
"loss": 0.2092,
"step": 910
},
{
"epoch": 0.3304316285817918,
"grad_norm": 0.15336064994335175,
"learning_rate": 0.00015154355836005006,
"loss": 0.2168,
"step": 911
},
{
"epoch": 0.3307943416757345,
"grad_norm": 0.1463136523962021,
"learning_rate": 0.00015144552249917386,
"loss": 0.2175,
"step": 912
},
{
"epoch": 0.33115705476967716,
"grad_norm": 0.14064238965511322,
"learning_rate": 0.0001513474193514842,
"loss": 0.2342,
"step": 913
},
{
"epoch": 0.33151976786361986,
"grad_norm": 0.15353120863437653,
"learning_rate": 0.00015124924904529253,
"loss": 0.2269,
"step": 914
},
{
"epoch": 0.33188248095756256,
"grad_norm": 0.1634497493505478,
"learning_rate": 0.00015115101170899806,
"loss": 0.2303,
"step": 915
},
{
"epoch": 0.33224519405150527,
"grad_norm": 0.15802593529224396,
"learning_rate": 0.00015105270747108778,
"loss": 0.2181,
"step": 916
},
{
"epoch": 0.33260790714544797,
"grad_norm": 0.16792048513889313,
"learning_rate": 0.00015095433646013606,
"loss": 0.2042,
"step": 917
},
{
"epoch": 0.33297062023939067,
"grad_norm": 0.14907622337341309,
"learning_rate": 0.0001508558988048047,
"loss": 0.198,
"step": 918
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.15107260644435883,
"learning_rate": 0.00015075739463384267,
"loss": 0.2103,
"step": 919
},
{
"epoch": 0.333696046427276,
"grad_norm": 0.16222083568572998,
"learning_rate": 0.00015065882407608582,
"loss": 0.2267,
"step": 920
},
{
"epoch": 0.3340587595212187,
"grad_norm": 0.14550422132015228,
"learning_rate": 0.00015056018726045697,
"loss": 0.2197,
"step": 921
},
{
"epoch": 0.3344214726151614,
"grad_norm": 0.14713485538959503,
"learning_rate": 0.00015046148431596554,
"loss": 0.2261,
"step": 922
},
{
"epoch": 0.3347841857091041,
"grad_norm": 0.15137678384780884,
"learning_rate": 0.0001503627153717074,
"loss": 0.2196,
"step": 923
},
{
"epoch": 0.33514689880304677,
"grad_norm": 0.15455511212348938,
"learning_rate": 0.00015026388055686485,
"loss": 0.2111,
"step": 924
},
{
"epoch": 0.33550961189698947,
"grad_norm": 0.15710324048995972,
"learning_rate": 0.00015016498000070618,
"loss": 0.2138,
"step": 925
},
{
"epoch": 0.33587232499093217,
"grad_norm": 0.19984076917171478,
"learning_rate": 0.00015006601383258584,
"loss": 0.2264,
"step": 926
},
{
"epoch": 0.33623503808487487,
"grad_norm": 0.15135234594345093,
"learning_rate": 0.000149966982181944,
"loss": 0.2121,
"step": 927
},
{
"epoch": 0.3365977511788176,
"grad_norm": 0.14553037285804749,
"learning_rate": 0.0001498678851783065,
"loss": 0.2095,
"step": 928
},
{
"epoch": 0.3369604642727603,
"grad_norm": 0.1508447229862213,
"learning_rate": 0.00014976872295128463,
"loss": 0.2377,
"step": 929
},
{
"epoch": 0.3373231773667029,
"grad_norm": 0.15726783871650696,
"learning_rate": 0.000149669495630575,
"loss": 0.2453,
"step": 930
},
{
"epoch": 0.3376858904606456,
"grad_norm": 0.1487269550561905,
"learning_rate": 0.0001495702033459594,
"loss": 0.1958,
"step": 931
},
{
"epoch": 0.3380486035545883,
"grad_norm": 0.1618356555700302,
"learning_rate": 0.00014947084622730453,
"loss": 0.2061,
"step": 932
},
{
"epoch": 0.338411316648531,
"grad_norm": 0.178387850522995,
"learning_rate": 0.00014937142440456195,
"loss": 0.2358,
"step": 933
},
{
"epoch": 0.3387740297424737,
"grad_norm": 0.15690761804580688,
"learning_rate": 0.00014927193800776776,
"loss": 0.2077,
"step": 934
},
{
"epoch": 0.33913674283641637,
"grad_norm": 0.15255998075008392,
"learning_rate": 0.00014917238716704258,
"loss": 0.214,
"step": 935
},
{
"epoch": 0.3394994559303591,
"grad_norm": 0.15309607982635498,
"learning_rate": 0.00014907277201259132,
"loss": 0.2326,
"step": 936
},
{
"epoch": 0.3398621690243018,
"grad_norm": 0.15764005482196808,
"learning_rate": 0.00014897309267470295,
"loss": 0.2096,
"step": 937
},
{
"epoch": 0.3402248821182445,
"grad_norm": 0.15512487292289734,
"learning_rate": 0.0001488733492837505,
"loss": 0.2322,
"step": 938
},
{
"epoch": 0.3405875952121872,
"grad_norm": 0.17276284098625183,
"learning_rate": 0.00014877354197019064,
"loss": 0.2217,
"step": 939
},
{
"epoch": 0.3409503083061299,
"grad_norm": 0.16367502510547638,
"learning_rate": 0.00014867367086456373,
"loss": 0.2187,
"step": 940
},
{
"epoch": 0.3413130214000725,
"grad_norm": 0.1784859150648117,
"learning_rate": 0.0001485737360974936,
"loss": 0.2339,
"step": 941
},
{
"epoch": 0.3416757344940152,
"grad_norm": 0.15108786523342133,
"learning_rate": 0.00014847373779968724,
"loss": 0.207,
"step": 942
},
{
"epoch": 0.34203844758795793,
"grad_norm": 0.15686751902103424,
"learning_rate": 0.00014837367610193476,
"loss": 0.2155,
"step": 943
},
{
"epoch": 0.34240116068190063,
"grad_norm": 0.17520059645175934,
"learning_rate": 0.00014827355113510927,
"loss": 0.2185,
"step": 944
},
{
"epoch": 0.34276387377584333,
"grad_norm": 0.14490067958831787,
"learning_rate": 0.0001481733630301666,
"loss": 0.2049,
"step": 945
},
{
"epoch": 0.343126586869786,
"grad_norm": 0.15382413566112518,
"learning_rate": 0.000148073111918145,
"loss": 0.2061,
"step": 946
},
{
"epoch": 0.3434892999637287,
"grad_norm": 0.15271534025669098,
"learning_rate": 0.0001479727979301654,
"loss": 0.2208,
"step": 947
},
{
"epoch": 0.3438520130576714,
"grad_norm": 0.1692724972963333,
"learning_rate": 0.0001478724211974308,
"loss": 0.24,
"step": 948
},
{
"epoch": 0.3442147261516141,
"grad_norm": 0.18430864810943604,
"learning_rate": 0.0001477719818512263,
"loss": 0.2347,
"step": 949
},
{
"epoch": 0.3445774392455568,
"grad_norm": 0.16035676002502441,
"learning_rate": 0.00014767148002291886,
"loss": 0.229,
"step": 950
},
{
"epoch": 0.34494015233949943,
"grad_norm": 0.14710398018360138,
"learning_rate": 0.00014757091584395726,
"loss": 0.2184,
"step": 951
},
{
"epoch": 0.34530286543344213,
"grad_norm": 0.1524522453546524,
"learning_rate": 0.00014747028944587167,
"loss": 0.2067,
"step": 952
},
{
"epoch": 0.34566557852738483,
"grad_norm": 0.1544627547264099,
"learning_rate": 0.00014736960096027385,
"loss": 0.1903,
"step": 953
},
{
"epoch": 0.34602829162132753,
"grad_norm": 0.15999731421470642,
"learning_rate": 0.00014726885051885653,
"loss": 0.1956,
"step": 954
},
{
"epoch": 0.34639100471527023,
"grad_norm": 0.16488391160964966,
"learning_rate": 0.00014716803825339368,
"loss": 0.227,
"step": 955
},
{
"epoch": 0.34675371780921294,
"grad_norm": 0.1626642644405365,
"learning_rate": 0.00014706716429573996,
"loss": 0.2302,
"step": 956
},
{
"epoch": 0.3471164309031556,
"grad_norm": 0.1589454710483551,
"learning_rate": 0.00014696622877783088,
"loss": 0.1998,
"step": 957
},
{
"epoch": 0.3474791439970983,
"grad_norm": 0.17863640189170837,
"learning_rate": 0.00014686523183168236,
"loss": 0.2244,
"step": 958
},
{
"epoch": 0.347841857091041,
"grad_norm": 0.15809310972690582,
"learning_rate": 0.00014676417358939063,
"loss": 0.2156,
"step": 959
},
{
"epoch": 0.3482045701849837,
"grad_norm": 0.14684627950191498,
"learning_rate": 0.00014666305418313224,
"loss": 0.2037,
"step": 960
},
{
"epoch": 0.3485672832789264,
"grad_norm": 0.14479795098304749,
"learning_rate": 0.00014656187374516365,
"loss": 0.1991,
"step": 961
},
{
"epoch": 0.34892999637286903,
"grad_norm": 0.17033007740974426,
"learning_rate": 0.00014646063240782105,
"loss": 0.1991,
"step": 962
},
{
"epoch": 0.34929270946681173,
"grad_norm": 0.1695454865694046,
"learning_rate": 0.00014635933030352044,
"loss": 0.2039,
"step": 963
},
{
"epoch": 0.34965542256075444,
"grad_norm": 0.16838496923446655,
"learning_rate": 0.00014625796756475724,
"loss": 0.2111,
"step": 964
},
{
"epoch": 0.35001813565469714,
"grad_norm": 0.16217052936553955,
"learning_rate": 0.00014615654432410612,
"loss": 0.2091,
"step": 965
},
{
"epoch": 0.35038084874863984,
"grad_norm": 0.15333756804466248,
"learning_rate": 0.00014605506071422103,
"loss": 0.2225,
"step": 966
},
{
"epoch": 0.35074356184258254,
"grad_norm": 0.15081751346588135,
"learning_rate": 0.00014595351686783465,
"loss": 0.2138,
"step": 967
},
{
"epoch": 0.3511062749365252,
"grad_norm": 0.16661369800567627,
"learning_rate": 0.00014585191291775868,
"loss": 0.211,
"step": 968
},
{
"epoch": 0.3514689880304679,
"grad_norm": 0.15592342615127563,
"learning_rate": 0.00014575024899688324,
"loss": 0.2069,
"step": 969
},
{
"epoch": 0.3518317011244106,
"grad_norm": 0.15869508683681488,
"learning_rate": 0.00014564852523817705,
"loss": 0.1961,
"step": 970
},
{
"epoch": 0.3521944142183533,
"grad_norm": 0.18337900936603546,
"learning_rate": 0.00014554674177468695,
"loss": 0.2039,
"step": 971
},
{
"epoch": 0.352557127312296,
"grad_norm": 0.20202304422855377,
"learning_rate": 0.00014544489873953803,
"loss": 0.2344,
"step": 972
},
{
"epoch": 0.35291984040623864,
"grad_norm": 0.1616135686635971,
"learning_rate": 0.0001453429962659331,
"loss": 0.2117,
"step": 973
},
{
"epoch": 0.35328255350018134,
"grad_norm": 0.15346059203147888,
"learning_rate": 0.00014524103448715283,
"loss": 0.2235,
"step": 974
},
{
"epoch": 0.35364526659412404,
"grad_norm": 0.148000568151474,
"learning_rate": 0.00014513901353655547,
"loss": 0.1944,
"step": 975
},
{
"epoch": 0.35400797968806674,
"grad_norm": 0.15789712965488434,
"learning_rate": 0.00014503693354757667,
"loss": 0.2139,
"step": 976
},
{
"epoch": 0.35437069278200944,
"grad_norm": 0.16983194649219513,
"learning_rate": 0.00014493479465372912,
"loss": 0.2122,
"step": 977
},
{
"epoch": 0.35473340587595215,
"grad_norm": 0.19161252677440643,
"learning_rate": 0.0001448325969886028,
"loss": 0.2799,
"step": 978
},
{
"epoch": 0.3550961189698948,
"grad_norm": 0.16653449833393097,
"learning_rate": 0.00014473034068586445,
"loss": 0.2166,
"step": 979
},
{
"epoch": 0.3554588320638375,
"grad_norm": 0.1566229611635208,
"learning_rate": 0.00014462802587925742,
"loss": 0.2104,
"step": 980
},
{
"epoch": 0.3558215451577802,
"grad_norm": 0.15640553832054138,
"learning_rate": 0.00014452565270260177,
"loss": 0.1979,
"step": 981
},
{
"epoch": 0.3561842582517229,
"grad_norm": 0.15835930407047272,
"learning_rate": 0.00014442322128979372,
"loss": 0.2412,
"step": 982
},
{
"epoch": 0.3565469713456656,
"grad_norm": 0.172097310423851,
"learning_rate": 0.00014432073177480576,
"loss": 0.2146,
"step": 983
},
{
"epoch": 0.35690968443960824,
"grad_norm": 0.1693415641784668,
"learning_rate": 0.00014421818429168634,
"loss": 0.2408,
"step": 984
},
{
"epoch": 0.35727239753355094,
"grad_norm": 0.15985938906669617,
"learning_rate": 0.00014411557897455973,
"loss": 0.2167,
"step": 985
},
{
"epoch": 0.35763511062749365,
"grad_norm": 0.16702041029930115,
"learning_rate": 0.00014401291595762586,
"loss": 0.2062,
"step": 986
},
{
"epoch": 0.35799782372143635,
"grad_norm": 0.16588671505451202,
"learning_rate": 0.00014391019537516006,
"loss": 0.2023,
"step": 987
},
{
"epoch": 0.35836053681537905,
"grad_norm": 0.15971873700618744,
"learning_rate": 0.0001438074173615131,
"loss": 0.2162,
"step": 988
},
{
"epoch": 0.35872324990932175,
"grad_norm": 0.1973976194858551,
"learning_rate": 0.0001437045820511107,
"loss": 0.2135,
"step": 989
},
{
"epoch": 0.3590859630032644,
"grad_norm": 0.1852118968963623,
"learning_rate": 0.00014360168957845362,
"loss": 0.2161,
"step": 990
},
{
"epoch": 0.3594486760972071,
"grad_norm": 0.15597601234912872,
"learning_rate": 0.00014349874007811735,
"loss": 0.2053,
"step": 991
},
{
"epoch": 0.3598113891911498,
"grad_norm": 0.16251103579998016,
"learning_rate": 0.00014339573368475197,
"loss": 0.2122,
"step": 992
},
{
"epoch": 0.3601741022850925,
"grad_norm": 0.1582382768392563,
"learning_rate": 0.00014329267053308194,
"loss": 0.2175,
"step": 993
},
{
"epoch": 0.3605368153790352,
"grad_norm": 0.15138986706733704,
"learning_rate": 0.00014318955075790605,
"loss": 0.201,
"step": 994
},
{
"epoch": 0.36089952847297785,
"grad_norm": 0.16074247658252716,
"learning_rate": 0.00014308637449409706,
"loss": 0.2281,
"step": 995
},
{
"epoch": 0.36126224156692055,
"grad_norm": 0.153158500790596,
"learning_rate": 0.00014298314187660162,
"loss": 0.1925,
"step": 996
},
{
"epoch": 0.36162495466086325,
"grad_norm": 0.17264969646930695,
"learning_rate": 0.00014287985304044015,
"loss": 0.2069,
"step": 997
},
{
"epoch": 0.36198766775480595,
"grad_norm": 0.18429549038410187,
"learning_rate": 0.0001427765081207065,
"loss": 0.2185,
"step": 998
},
{
"epoch": 0.36235038084874865,
"grad_norm": 0.1758868545293808,
"learning_rate": 0.000142673107252568,
"loss": 0.2432,
"step": 999
},
{
"epoch": 0.36271309394269136,
"grad_norm": 0.15705294907093048,
"learning_rate": 0.00014256965057126504,
"loss": 0.1986,
"step": 1000
},
{
"epoch": 0.363075807036634,
"grad_norm": 0.1507769376039505,
"learning_rate": 0.00014246613821211108,
"loss": 0.1876,
"step": 1001
},
{
"epoch": 0.3634385201305767,
"grad_norm": 0.17133677005767822,
"learning_rate": 0.00014236257031049232,
"loss": 0.209,
"step": 1002
},
{
"epoch": 0.3638012332245194,
"grad_norm": 0.15936224162578583,
"learning_rate": 0.00014225894700186774,
"loss": 0.1974,
"step": 1003
},
{
"epoch": 0.3641639463184621,
"grad_norm": 0.19848595559597015,
"learning_rate": 0.00014215526842176868,
"loss": 0.2218,
"step": 1004
},
{
"epoch": 0.3645266594124048,
"grad_norm": 0.17126554250717163,
"learning_rate": 0.00014205153470579882,
"loss": 0.2229,
"step": 1005
},
{
"epoch": 0.36488937250634745,
"grad_norm": 0.15903635323047638,
"learning_rate": 0.0001419477459896339,
"loss": 0.2127,
"step": 1006
},
{
"epoch": 0.36525208560029016,
"grad_norm": 0.16994720697402954,
"learning_rate": 0.00014184390240902167,
"loss": 0.2289,
"step": 1007
},
{
"epoch": 0.36561479869423286,
"grad_norm": 0.17226669192314148,
"learning_rate": 0.00014174000409978156,
"loss": 0.2147,
"step": 1008
},
{
"epoch": 0.36597751178817556,
"grad_norm": 0.1492406278848648,
"learning_rate": 0.00014163605119780467,
"loss": 0.2087,
"step": 1009
},
{
"epoch": 0.36634022488211826,
"grad_norm": 0.16116073727607727,
"learning_rate": 0.00014153204383905344,
"loss": 0.2176,
"step": 1010
},
{
"epoch": 0.36670293797606096,
"grad_norm": 0.16366463899612427,
"learning_rate": 0.00014142798215956148,
"loss": 0.1925,
"step": 1011
},
{
"epoch": 0.3670656510700036,
"grad_norm": 0.15476755797863007,
"learning_rate": 0.00014132386629543364,
"loss": 0.1994,
"step": 1012
},
{
"epoch": 0.3674283641639463,
"grad_norm": 0.16290143132209778,
"learning_rate": 0.00014121969638284542,
"loss": 0.2131,
"step": 1013
},
{
"epoch": 0.367791077257889,
"grad_norm": 0.15869063138961792,
"learning_rate": 0.00014111547255804316,
"loss": 0.1889,
"step": 1014
},
{
"epoch": 0.3681537903518317,
"grad_norm": 0.1735077053308487,
"learning_rate": 0.00014101119495734364,
"loss": 0.2261,
"step": 1015
},
{
"epoch": 0.3685165034457744,
"grad_norm": 0.16333554685115814,
"learning_rate": 0.00014090686371713402,
"loss": 0.2247,
"step": 1016
},
{
"epoch": 0.36887921653971706,
"grad_norm": 0.18004798889160156,
"learning_rate": 0.00014080247897387156,
"loss": 0.2334,
"step": 1017
},
{
"epoch": 0.36924192963365976,
"grad_norm": 0.16508519649505615,
"learning_rate": 0.0001406980408640835,
"loss": 0.1995,
"step": 1018
},
{
"epoch": 0.36960464272760246,
"grad_norm": 0.1622190773487091,
"learning_rate": 0.00014059354952436698,
"loss": 0.2003,
"step": 1019
},
{
"epoch": 0.36996735582154516,
"grad_norm": 0.16706664860248566,
"learning_rate": 0.00014048900509138867,
"loss": 0.219,
"step": 1020
},
{
"epoch": 0.37033006891548786,
"grad_norm": 0.1640990823507309,
"learning_rate": 0.00014038440770188467,
"loss": 0.2018,
"step": 1021
},
{
"epoch": 0.37069278200943057,
"grad_norm": 0.17155148088932037,
"learning_rate": 0.0001402797574926604,
"loss": 0.2234,
"step": 1022
},
{
"epoch": 0.3710554951033732,
"grad_norm": 0.1780928671360016,
"learning_rate": 0.00014017505460059036,
"loss": 0.2346,
"step": 1023
},
{
"epoch": 0.3714182081973159,
"grad_norm": 0.1557503193616867,
"learning_rate": 0.0001400702991626179,
"loss": 0.1969,
"step": 1024
},
{
"epoch": 0.3717809212912586,
"grad_norm": 0.14212948083877563,
"learning_rate": 0.00013996549131575515,
"loss": 0.1883,
"step": 1025
},
{
"epoch": 0.3721436343852013,
"grad_norm": 0.16952791810035706,
"learning_rate": 0.00013986063119708275,
"loss": 0.2157,
"step": 1026
},
{
"epoch": 0.372506347479144,
"grad_norm": 0.16988742351531982,
"learning_rate": 0.00013975571894374973,
"loss": 0.2103,
"step": 1027
},
{
"epoch": 0.37286906057308666,
"grad_norm": 0.16801071166992188,
"learning_rate": 0.00013965075469297332,
"loss": 0.2094,
"step": 1028
},
{
"epoch": 0.37323177366702937,
"grad_norm": 0.19034814834594727,
"learning_rate": 0.00013954573858203874,
"loss": 0.2444,
"step": 1029
},
{
"epoch": 0.37359448676097207,
"grad_norm": 0.15771815180778503,
"learning_rate": 0.000139440670748299,
"loss": 0.1987,
"step": 1030
},
{
"epoch": 0.37395719985491477,
"grad_norm": 0.1528027504682541,
"learning_rate": 0.00013933555132917487,
"loss": 0.2138,
"step": 1031
},
{
"epoch": 0.37431991294885747,
"grad_norm": 0.16030389070510864,
"learning_rate": 0.00013923038046215446,
"loss": 0.2057,
"step": 1032
},
{
"epoch": 0.37468262604280017,
"grad_norm": 0.1645725667476654,
"learning_rate": 0.0001391251582847932,
"loss": 0.1957,
"step": 1033
},
{
"epoch": 0.3750453391367428,
"grad_norm": 0.17184780538082123,
"learning_rate": 0.0001390198849347138,
"loss": 0.2244,
"step": 1034
},
{
"epoch": 0.3754080522306855,
"grad_norm": 0.16507604718208313,
"learning_rate": 0.00013891456054960564,
"loss": 0.2126,
"step": 1035
},
{
"epoch": 0.3757707653246282,
"grad_norm": 0.15355214476585388,
"learning_rate": 0.00013880918526722497,
"loss": 0.1853,
"step": 1036
},
{
"epoch": 0.3761334784185709,
"grad_norm": 0.1596059501171112,
"learning_rate": 0.00013870375922539466,
"loss": 0.229,
"step": 1037
},
{
"epoch": 0.3764961915125136,
"grad_norm": 0.16307580471038818,
"learning_rate": 0.00013859828256200394,
"loss": 0.2149,
"step": 1038
},
{
"epoch": 0.37685890460645627,
"grad_norm": 0.15789788961410522,
"learning_rate": 0.00013849275541500812,
"loss": 0.2351,
"step": 1039
},
{
"epoch": 0.37722161770039897,
"grad_norm": 0.1589316725730896,
"learning_rate": 0.00013838717792242876,
"loss": 0.2164,
"step": 1040
},
{
"epoch": 0.37758433079434167,
"grad_norm": 0.15134315192699432,
"learning_rate": 0.00013828155022235308,
"loss": 0.1925,
"step": 1041
},
{
"epoch": 0.3779470438882844,
"grad_norm": 0.14640171825885773,
"learning_rate": 0.00013817587245293407,
"loss": 0.2138,
"step": 1042
},
{
"epoch": 0.3783097569822271,
"grad_norm": 0.1695149838924408,
"learning_rate": 0.0001380701447523902,
"loss": 0.2139,
"step": 1043
},
{
"epoch": 0.3786724700761698,
"grad_norm": 0.1683790236711502,
"learning_rate": 0.0001379643672590052,
"loss": 0.1954,
"step": 1044
},
{
"epoch": 0.3790351831701124,
"grad_norm": 0.17694401741027832,
"learning_rate": 0.00013785854011112798,
"loss": 0.2022,
"step": 1045
},
{
"epoch": 0.3793978962640551,
"grad_norm": 0.17428404092788696,
"learning_rate": 0.00013775266344717233,
"loss": 0.1832,
"step": 1046
},
{
"epoch": 0.3797606093579978,
"grad_norm": 0.1612454652786255,
"learning_rate": 0.00013764673740561685,
"loss": 0.1917,
"step": 1047
},
{
"epoch": 0.3801233224519405,
"grad_norm": 0.16686902940273285,
"learning_rate": 0.0001375407621250047,
"loss": 0.1989,
"step": 1048
},
{
"epoch": 0.3804860355458832,
"grad_norm": 0.14911605417728424,
"learning_rate": 0.00013743473774394346,
"loss": 0.2004,
"step": 1049
},
{
"epoch": 0.3808487486398259,
"grad_norm": 0.15896974503993988,
"learning_rate": 0.00013732866440110497,
"loss": 0.2466,
"step": 1050
},
{
"epoch": 0.3812114617337686,
"grad_norm": 0.16059251129627228,
"learning_rate": 0.000137222542235225,
"loss": 0.2042,
"step": 1051
},
{
"epoch": 0.3815741748277113,
"grad_norm": 0.16174575686454773,
"learning_rate": 0.0001371163713851032,
"loss": 0.1979,
"step": 1052
},
{
"epoch": 0.381936887921654,
"grad_norm": 0.1577538102865219,
"learning_rate": 0.00013701015198960302,
"loss": 0.213,
"step": 1053
},
{
"epoch": 0.3822996010155967,
"grad_norm": 0.1710449755191803,
"learning_rate": 0.0001369038841876513,
"loss": 0.223,
"step": 1054
},
{
"epoch": 0.3826623141095394,
"grad_norm": 0.17627973854541779,
"learning_rate": 0.00013679756811823813,
"loss": 0.2397,
"step": 1055
},
{
"epoch": 0.383025027203482,
"grad_norm": 0.15820728242397308,
"learning_rate": 0.0001366912039204169,
"loss": 0.1959,
"step": 1056
},
{
"epoch": 0.38338774029742473,
"grad_norm": 0.15889425575733185,
"learning_rate": 0.00013658479173330384,
"loss": 0.1805,
"step": 1057
},
{
"epoch": 0.38375045339136743,
"grad_norm": 0.18348795175552368,
"learning_rate": 0.00013647833169607788,
"loss": 0.2061,
"step": 1058
},
{
"epoch": 0.38411316648531013,
"grad_norm": 0.16327665746212006,
"learning_rate": 0.0001363718239479807,
"loss": 0.1899,
"step": 1059
},
{
"epoch": 0.38447587957925283,
"grad_norm": 0.15636590123176575,
"learning_rate": 0.00013626526862831628,
"loss": 0.2161,
"step": 1060
},
{
"epoch": 0.3848385926731955,
"grad_norm": 0.158644899725914,
"learning_rate": 0.00013615866587645084,
"loss": 0.1991,
"step": 1061
},
{
"epoch": 0.3852013057671382,
"grad_norm": 0.16064795851707458,
"learning_rate": 0.0001360520158318126,
"loss": 0.2009,
"step": 1062
},
{
"epoch": 0.3855640188610809,
"grad_norm": 0.18209217488765717,
"learning_rate": 0.00013594531863389173,
"loss": 0.2538,
"step": 1063
},
{
"epoch": 0.3859267319550236,
"grad_norm": 0.16186301410198212,
"learning_rate": 0.00013583857442223994,
"loss": 0.2249,
"step": 1064
},
{
"epoch": 0.3862894450489663,
"grad_norm": 0.16660407185554504,
"learning_rate": 0.00013573178333647058,
"loss": 0.2116,
"step": 1065
},
{
"epoch": 0.386652158142909,
"grad_norm": 0.16199025511741638,
"learning_rate": 0.0001356249455162582,
"loss": 0.2156,
"step": 1066
},
{
"epoch": 0.38701487123685163,
"grad_norm": 0.1578529328107834,
"learning_rate": 0.0001355180611013385,
"loss": 0.2066,
"step": 1067
},
{
"epoch": 0.38737758433079433,
"grad_norm": 0.17841364443302155,
"learning_rate": 0.00013541113023150816,
"loss": 0.205,
"step": 1068
},
{
"epoch": 0.38774029742473703,
"grad_norm": 0.1555965095758438,
"learning_rate": 0.00013530415304662457,
"loss": 0.2027,
"step": 1069
},
{
"epoch": 0.38810301051867974,
"grad_norm": 0.15105211734771729,
"learning_rate": 0.00013519712968660568,
"loss": 0.1963,
"step": 1070
},
{
"epoch": 0.38846572361262244,
"grad_norm": 0.16452065110206604,
"learning_rate": 0.0001350900602914299,
"loss": 0.2129,
"step": 1071
},
{
"epoch": 0.3888284367065651,
"grad_norm": 0.16760526597499847,
"learning_rate": 0.00013498294500113585,
"loss": 0.2418,
"step": 1072
},
{
"epoch": 0.3891911498005078,
"grad_norm": 0.16931942105293274,
"learning_rate": 0.00013487578395582206,
"loss": 0.1914,
"step": 1073
},
{
"epoch": 0.3895538628944505,
"grad_norm": 0.1739332228899002,
"learning_rate": 0.0001347685772956471,
"loss": 0.2107,
"step": 1074
},
{
"epoch": 0.3899165759883932,
"grad_norm": 0.1568581908941269,
"learning_rate": 0.00013466132516082907,
"loss": 0.1835,
"step": 1075
},
{
"epoch": 0.3902792890823359,
"grad_norm": 0.16916148364543915,
"learning_rate": 0.0001345540276916455,
"loss": 0.2041,
"step": 1076
},
{
"epoch": 0.3906420021762786,
"grad_norm": 0.16345995664596558,
"learning_rate": 0.0001344466850284333,
"loss": 0.1789,
"step": 1077
},
{
"epoch": 0.39100471527022124,
"grad_norm": 0.16848930716514587,
"learning_rate": 0.00013433929731158852,
"loss": 0.1961,
"step": 1078
},
{
"epoch": 0.39136742836416394,
"grad_norm": 0.1991538405418396,
"learning_rate": 0.00013423186468156608,
"loss": 0.2544,
"step": 1079
},
{
"epoch": 0.39173014145810664,
"grad_norm": 0.17732208967208862,
"learning_rate": 0.0001341243872788796,
"loss": 0.258,
"step": 1080
},
{
"epoch": 0.39209285455204934,
"grad_norm": 0.16117359697818756,
"learning_rate": 0.0001340168652441014,
"loss": 0.2389,
"step": 1081
},
{
"epoch": 0.39245556764599204,
"grad_norm": 0.1693982034921646,
"learning_rate": 0.00013390929871786203,
"loss": 0.2022,
"step": 1082
},
{
"epoch": 0.3928182807399347,
"grad_norm": 0.1722104698419571,
"learning_rate": 0.00013380168784085027,
"loss": 0.1977,
"step": 1083
},
{
"epoch": 0.3931809938338774,
"grad_norm": 0.1871337741613388,
"learning_rate": 0.000133694032753813,
"loss": 0.2249,
"step": 1084
},
{
"epoch": 0.3935437069278201,
"grad_norm": 0.17777620255947113,
"learning_rate": 0.0001335863335975548,
"loss": 0.1949,
"step": 1085
},
{
"epoch": 0.3939064200217628,
"grad_norm": 0.18331852555274963,
"learning_rate": 0.00013347859051293792,
"loss": 0.1969,
"step": 1086
},
{
"epoch": 0.3942691331157055,
"grad_norm": 0.158721461892128,
"learning_rate": 0.0001333708036408821,
"loss": 0.1919,
"step": 1087
},
{
"epoch": 0.3946318462096482,
"grad_norm": 0.16589364409446716,
"learning_rate": 0.00013326297312236439,
"loss": 0.2044,
"step": 1088
},
{
"epoch": 0.39499455930359084,
"grad_norm": 0.15952499210834503,
"learning_rate": 0.0001331550990984188,
"loss": 0.2005,
"step": 1089
},
{
"epoch": 0.39535727239753354,
"grad_norm": 0.15588688850402832,
"learning_rate": 0.00013304718171013632,
"loss": 0.2234,
"step": 1090
},
{
"epoch": 0.39571998549147624,
"grad_norm": 0.17283542454242706,
"learning_rate": 0.0001329392210986647,
"loss": 0.2001,
"step": 1091
},
{
"epoch": 0.39608269858541895,
"grad_norm": 0.15617555379867554,
"learning_rate": 0.00013283121740520812,
"loss": 0.1982,
"step": 1092
},
{
"epoch": 0.39644541167936165,
"grad_norm": 0.18503715097904205,
"learning_rate": 0.0001327231707710272,
"loss": 0.2315,
"step": 1093
},
{
"epoch": 0.3968081247733043,
"grad_norm": 0.16704030334949493,
"learning_rate": 0.00013261508133743865,
"loss": 0.2,
"step": 1094
},
{
"epoch": 0.397170837867247,
"grad_norm": 0.17188745737075806,
"learning_rate": 0.0001325069492458152,
"loss": 0.258,
"step": 1095
},
{
"epoch": 0.3975335509611897,
"grad_norm": 0.1544748693704605,
"learning_rate": 0.00013239877463758537,
"loss": 0.19,
"step": 1096
},
{
"epoch": 0.3978962640551324,
"grad_norm": 0.1821664571762085,
"learning_rate": 0.0001322905576542333,
"loss": 0.2071,
"step": 1097
},
{
"epoch": 0.3982589771490751,
"grad_norm": 0.15686167776584625,
"learning_rate": 0.00013218229843729856,
"loss": 0.1807,
"step": 1098
},
{
"epoch": 0.39862169024301775,
"grad_norm": 0.1645747721195221,
"learning_rate": 0.00013207399712837582,
"loss": 0.1941,
"step": 1099
},
{
"epoch": 0.39898440333696045,
"grad_norm": 0.15510335564613342,
"learning_rate": 0.00013196565386911505,
"loss": 0.1982,
"step": 1100
},
{
"epoch": 0.39934711643090315,
"grad_norm": 0.17434607446193695,
"learning_rate": 0.0001318572688012209,
"loss": 0.2012,
"step": 1101
},
{
"epoch": 0.39970982952484585,
"grad_norm": 0.1454346626996994,
"learning_rate": 0.00013174884206645278,
"loss": 0.1887,
"step": 1102
},
{
"epoch": 0.40007254261878855,
"grad_norm": 0.16709522902965546,
"learning_rate": 0.00013164037380662452,
"loss": 0.1914,
"step": 1103
},
{
"epoch": 0.40043525571273125,
"grad_norm": 0.17922160029411316,
"learning_rate": 0.0001315318641636044,
"loss": 0.2002,
"step": 1104
},
{
"epoch": 0.4007979688066739,
"grad_norm": 0.1769881397485733,
"learning_rate": 0.00013142331327931469,
"loss": 0.1993,
"step": 1105
},
{
"epoch": 0.4011606819006166,
"grad_norm": 0.1627112329006195,
"learning_rate": 0.00013131472129573166,
"loss": 0.2096,
"step": 1106
},
{
"epoch": 0.4015233949945593,
"grad_norm": 0.1649940013885498,
"learning_rate": 0.00013120608835488532,
"loss": 0.2032,
"step": 1107
},
{
"epoch": 0.401886108088502,
"grad_norm": 0.18944235146045685,
"learning_rate": 0.00013109741459885928,
"loss": 0.2163,
"step": 1108
},
{
"epoch": 0.4022488211824447,
"grad_norm": 0.16329450905323029,
"learning_rate": 0.00013098870016979051,
"loss": 0.1833,
"step": 1109
},
{
"epoch": 0.40261153427638735,
"grad_norm": 0.20053814351558685,
"learning_rate": 0.00013087994520986923,
"loss": 0.2166,
"step": 1110
},
{
"epoch": 0.40297424737033005,
"grad_norm": 0.19225598871707916,
"learning_rate": 0.00013077114986133847,
"loss": 0.2544,
"step": 1111
},
{
"epoch": 0.40333696046427275,
"grad_norm": 0.17340917885303497,
"learning_rate": 0.00013066231426649437,
"loss": 0.2005,
"step": 1112
},
{
"epoch": 0.40369967355821545,
"grad_norm": 0.1653253436088562,
"learning_rate": 0.00013055343856768555,
"loss": 0.2119,
"step": 1113
},
{
"epoch": 0.40406238665215816,
"grad_norm": 0.16865472495555878,
"learning_rate": 0.00013044452290731306,
"loss": 0.1748,
"step": 1114
},
{
"epoch": 0.40442509974610086,
"grad_norm": 0.17820391058921814,
"learning_rate": 0.0001303355674278303,
"loss": 0.2094,
"step": 1115
},
{
"epoch": 0.4047878128400435,
"grad_norm": 0.17825227975845337,
"learning_rate": 0.0001302265722717427,
"loss": 0.2174,
"step": 1116
},
{
"epoch": 0.4051505259339862,
"grad_norm": 0.6229606866836548,
"learning_rate": 0.0001301175375816076,
"loss": 0.2072,
"step": 1117
},
{
"epoch": 0.4055132390279289,
"grad_norm": 0.21105241775512695,
"learning_rate": 0.0001300084635000341,
"loss": 0.2041,
"step": 1118
},
{
"epoch": 0.4058759521218716,
"grad_norm": 0.20768609642982483,
"learning_rate": 0.00012989935016968266,
"loss": 0.2091,
"step": 1119
},
{
"epoch": 0.4062386652158143,
"grad_norm": 0.1655317097902298,
"learning_rate": 0.00012979019773326524,
"loss": 0.2095,
"step": 1120
},
{
"epoch": 0.40660137830975696,
"grad_norm": 0.1594689041376114,
"learning_rate": 0.00012968100633354492,
"loss": 0.1922,
"step": 1121
},
{
"epoch": 0.40696409140369966,
"grad_norm": 0.1779058277606964,
"learning_rate": 0.00012957177611333566,
"loss": 0.1948,
"step": 1122
},
{
"epoch": 0.40732680449764236,
"grad_norm": 0.18424735963344574,
"learning_rate": 0.00012946250721550224,
"loss": 0.2174,
"step": 1123
},
{
"epoch": 0.40768951759158506,
"grad_norm": 0.19321289658546448,
"learning_rate": 0.00012935319978296008,
"loss": 0.2032,
"step": 1124
},
{
"epoch": 0.40805223068552776,
"grad_norm": 0.1741238832473755,
"learning_rate": 0.00012924385395867493,
"loss": 0.1928,
"step": 1125
},
{
"epoch": 0.40841494377947046,
"grad_norm": 0.16779236495494843,
"learning_rate": 0.00012913446988566273,
"loss": 0.2021,
"step": 1126
},
{
"epoch": 0.4087776568734131,
"grad_norm": 0.16747735440731049,
"learning_rate": 0.00012902504770698954,
"loss": 0.1993,
"step": 1127
},
{
"epoch": 0.4091403699673558,
"grad_norm": 0.18401382863521576,
"learning_rate": 0.00012891558756577122,
"loss": 0.2151,
"step": 1128
},
{
"epoch": 0.4095030830612985,
"grad_norm": 0.15898433327674866,
"learning_rate": 0.00012880608960517322,
"loss": 0.187,
"step": 1129
},
{
"epoch": 0.4098657961552412,
"grad_norm": 0.1666088104248047,
"learning_rate": 0.0001286965539684106,
"loss": 0.1849,
"step": 1130
},
{
"epoch": 0.4102285092491839,
"grad_norm": 0.17613482475280762,
"learning_rate": 0.00012858698079874748,
"loss": 0.1993,
"step": 1131
},
{
"epoch": 0.41059122234312656,
"grad_norm": 0.17263801395893097,
"learning_rate": 0.0001284773702394973,
"loss": 0.1947,
"step": 1132
},
{
"epoch": 0.41095393543706926,
"grad_norm": 0.1618073433637619,
"learning_rate": 0.00012836772243402224,
"loss": 0.1869,
"step": 1133
},
{
"epoch": 0.41131664853101196,
"grad_norm": 0.1828174889087677,
"learning_rate": 0.00012825803752573327,
"loss": 0.2207,
"step": 1134
},
{
"epoch": 0.41167936162495467,
"grad_norm": 0.17469796538352966,
"learning_rate": 0.00012814831565808986,
"loss": 0.2008,
"step": 1135
},
{
"epoch": 0.41204207471889737,
"grad_norm": 0.17154814302921295,
"learning_rate": 0.00012803855697459987,
"loss": 0.2098,
"step": 1136
},
{
"epoch": 0.41240478781284007,
"grad_norm": 0.1646650731563568,
"learning_rate": 0.00012792876161881925,
"loss": 0.2103,
"step": 1137
},
{
"epoch": 0.4127675009067827,
"grad_norm": 0.17539532482624054,
"learning_rate": 0.00012781892973435195,
"loss": 0.1966,
"step": 1138
},
{
"epoch": 0.4131302140007254,
"grad_norm": 0.17781807482242584,
"learning_rate": 0.00012770906146484964,
"loss": 0.206,
"step": 1139
},
{
"epoch": 0.4134929270946681,
"grad_norm": 0.1847347617149353,
"learning_rate": 0.0001275991569540117,
"loss": 0.2026,
"step": 1140
},
{
"epoch": 0.4138556401886108,
"grad_norm": 0.17020414769649506,
"learning_rate": 0.00012748921634558473,
"loss": 0.1958,
"step": 1141
},
{
"epoch": 0.4142183532825535,
"grad_norm": 0.18093371391296387,
"learning_rate": 0.00012737923978336274,
"loss": 0.2062,
"step": 1142
},
{
"epoch": 0.41458106637649617,
"grad_norm": 0.1588636189699173,
"learning_rate": 0.00012726922741118662,
"loss": 0.1892,
"step": 1143
},
{
"epoch": 0.41494377947043887,
"grad_norm": 0.19953924417495728,
"learning_rate": 0.00012715917937294418,
"loss": 0.2188,
"step": 1144
},
{
"epoch": 0.41530649256438157,
"grad_norm": 0.16585423052310944,
"learning_rate": 0.00012704909581256986,
"loss": 0.2231,
"step": 1145
},
{
"epoch": 0.41566920565832427,
"grad_norm": 0.17226840555667877,
"learning_rate": 0.0001269389768740445,
"loss": 0.1895,
"step": 1146
},
{
"epoch": 0.41603191875226697,
"grad_norm": 0.2125304490327835,
"learning_rate": 0.00012682882270139526,
"loss": 0.2122,
"step": 1147
},
{
"epoch": 0.4163946318462097,
"grad_norm": 0.19522660970687866,
"learning_rate": 0.00012671863343869543,
"loss": 0.2055,
"step": 1148
},
{
"epoch": 0.4167573449401523,
"grad_norm": 0.19831117987632751,
"learning_rate": 0.00012660840923006412,
"loss": 0.189,
"step": 1149
},
{
"epoch": 0.417120058034095,
"grad_norm": 0.16252368688583374,
"learning_rate": 0.0001264981502196662,
"loss": 0.2051,
"step": 1150
},
{
"epoch": 0.4174827711280377,
"grad_norm": 0.17360906302928925,
"learning_rate": 0.00012638785655171196,
"loss": 0.1957,
"step": 1151
},
{
"epoch": 0.4178454842219804,
"grad_norm": 0.1837020069360733,
"learning_rate": 0.0001262775283704572,
"loss": 0.2131,
"step": 1152
},
{
"epoch": 0.4182081973159231,
"grad_norm": 0.1726016104221344,
"learning_rate": 0.00012616716582020265,
"loss": 0.1897,
"step": 1153
},
{
"epoch": 0.41857091040986577,
"grad_norm": 0.16881223022937775,
"learning_rate": 0.00012605676904529415,
"loss": 0.1905,
"step": 1154
},
{
"epoch": 0.41893362350380847,
"grad_norm": 0.2182941734790802,
"learning_rate": 0.00012594633819012225,
"loss": 0.2176,
"step": 1155
},
{
"epoch": 0.4192963365977512,
"grad_norm": 0.1766914576292038,
"learning_rate": 0.00012583587339912207,
"loss": 0.2067,
"step": 1156
},
{
"epoch": 0.4196590496916939,
"grad_norm": 0.16632500290870667,
"learning_rate": 0.00012572537481677308,
"loss": 0.1902,
"step": 1157
},
{
"epoch": 0.4200217627856366,
"grad_norm": 0.16559042036533356,
"learning_rate": 0.00012561484258759905,
"loss": 0.1848,
"step": 1158
},
{
"epoch": 0.4203844758795793,
"grad_norm": 0.16212663054466248,
"learning_rate": 0.00012550427685616765,
"loss": 0.2009,
"step": 1159
},
{
"epoch": 0.4207471889735219,
"grad_norm": 0.16951881349086761,
"learning_rate": 0.0001253936777670904,
"loss": 0.1896,
"step": 1160
},
{
"epoch": 0.4211099020674646,
"grad_norm": 0.19102217257022858,
"learning_rate": 0.0001252830454650225,
"loss": 0.2012,
"step": 1161
},
{
"epoch": 0.4214726151614073,
"grad_norm": 0.1638030707836151,
"learning_rate": 0.00012517238009466253,
"loss": 0.1731,
"step": 1162
},
{
"epoch": 0.42183532825535003,
"grad_norm": 0.1885092556476593,
"learning_rate": 0.00012506168180075232,
"loss": 0.212,
"step": 1163
},
{
"epoch": 0.42219804134929273,
"grad_norm": 0.19661776721477509,
"learning_rate": 0.00012495095072807678,
"loss": 0.1969,
"step": 1164
},
{
"epoch": 0.4225607544432354,
"grad_norm": 0.1665484458208084,
"learning_rate": 0.00012484018702146375,
"loss": 0.1886,
"step": 1165
},
{
"epoch": 0.4229234675371781,
"grad_norm": 0.16225306689739227,
"learning_rate": 0.00012472939082578365,
"loss": 0.1869,
"step": 1166
},
{
"epoch": 0.4232861806311208,
"grad_norm": 0.16616645455360413,
"learning_rate": 0.00012461856228594947,
"loss": 0.1778,
"step": 1167
},
{
"epoch": 0.4236488937250635,
"grad_norm": 0.15914376080036163,
"learning_rate": 0.00012450770154691642,
"loss": 0.1809,
"step": 1168
},
{
"epoch": 0.4240116068190062,
"grad_norm": 0.18165045976638794,
"learning_rate": 0.00012439680875368192,
"loss": 0.1981,
"step": 1169
},
{
"epoch": 0.4243743199129489,
"grad_norm": 0.17815563082695007,
"learning_rate": 0.00012428588405128527,
"loss": 0.2462,
"step": 1170
},
{
"epoch": 0.42473703300689153,
"grad_norm": 0.1577123999595642,
"learning_rate": 0.0001241749275848075,
"loss": 0.1848,
"step": 1171
},
{
"epoch": 0.42509974610083423,
"grad_norm": 0.16714733839035034,
"learning_rate": 0.0001240639394993712,
"loss": 0.1878,
"step": 1172
},
{
"epoch": 0.42546245919477693,
"grad_norm": 0.18040674924850464,
"learning_rate": 0.0001239529199401403,
"loss": 0.2087,
"step": 1173
},
{
"epoch": 0.42582517228871963,
"grad_norm": 0.17369875311851501,
"learning_rate": 0.0001238418690523199,
"loss": 0.2198,
"step": 1174
},
{
"epoch": 0.42618788538266233,
"grad_norm": 0.17522990703582764,
"learning_rate": 0.0001237307869811561,
"loss": 0.1898,
"step": 1175
},
{
"epoch": 0.426550598476605,
"grad_norm": 0.1890110820531845,
"learning_rate": 0.0001236196738719357,
"loss": 0.1946,
"step": 1176
},
{
"epoch": 0.4269133115705477,
"grad_norm": 0.19072000682353973,
"learning_rate": 0.00012350852986998628,
"loss": 0.1782,
"step": 1177
},
{
"epoch": 0.4272760246644904,
"grad_norm": 0.16412675380706787,
"learning_rate": 0.00012339735512067557,
"loss": 0.1957,
"step": 1178
},
{
"epoch": 0.4276387377584331,
"grad_norm": 0.16497628390789032,
"learning_rate": 0.0001232861497694117,
"loss": 0.1914,
"step": 1179
},
{
"epoch": 0.4280014508523758,
"grad_norm": 0.1696443408727646,
"learning_rate": 0.00012317491396164281,
"loss": 0.2205,
"step": 1180
},
{
"epoch": 0.4283641639463185,
"grad_norm": 0.1990218162536621,
"learning_rate": 0.00012306364784285683,
"loss": 0.221,
"step": 1181
},
{
"epoch": 0.42872687704026113,
"grad_norm": 0.15306927263736725,
"learning_rate": 0.00012295235155858128,
"loss": 0.1894,
"step": 1182
},
{
"epoch": 0.42908959013420384,
"grad_norm": 0.16716569662094116,
"learning_rate": 0.00012284102525438327,
"loss": 0.2124,
"step": 1183
},
{
"epoch": 0.42945230322814654,
"grad_norm": 0.16371683776378632,
"learning_rate": 0.00012272966907586906,
"loss": 0.1952,
"step": 1184
},
{
"epoch": 0.42981501632208924,
"grad_norm": 0.19524066150188446,
"learning_rate": 0.00012261828316868404,
"loss": 0.1967,
"step": 1185
},
{
"epoch": 0.43017772941603194,
"grad_norm": 0.1753699630498886,
"learning_rate": 0.0001225068676785125,
"loss": 0.2057,
"step": 1186
},
{
"epoch": 0.4305404425099746,
"grad_norm": 0.15853376686573029,
"learning_rate": 0.00012239542275107733,
"loss": 0.1852,
"step": 1187
},
{
"epoch": 0.4309031556039173,
"grad_norm": 0.1545594483613968,
"learning_rate": 0.00012228394853214,
"loss": 0.1827,
"step": 1188
},
{
"epoch": 0.43126586869786,
"grad_norm": 0.1596081703901291,
"learning_rate": 0.0001221724451675003,
"loss": 0.2032,
"step": 1189
},
{
"epoch": 0.4316285817918027,
"grad_norm": 0.17133690416812897,
"learning_rate": 0.00012206091280299608,
"loss": 0.201,
"step": 1190
},
{
"epoch": 0.4319912948857454,
"grad_norm": 0.18594324588775635,
"learning_rate": 0.00012194935158450318,
"loss": 0.1999,
"step": 1191
},
{
"epoch": 0.4323540079796881,
"grad_norm": 0.1757342368364334,
"learning_rate": 0.0001218377616579351,
"loss": 0.2048,
"step": 1192
},
{
"epoch": 0.43271672107363074,
"grad_norm": 0.15969473123550415,
"learning_rate": 0.00012172614316924303,
"loss": 0.1896,
"step": 1193
},
{
"epoch": 0.43307943416757344,
"grad_norm": 0.1708168387413025,
"learning_rate": 0.00012161449626441535,
"loss": 0.1871,
"step": 1194
},
{
"epoch": 0.43344214726151614,
"grad_norm": 0.16224978864192963,
"learning_rate": 0.0001215028210894777,
"loss": 0.1995,
"step": 1195
},
{
"epoch": 0.43380486035545884,
"grad_norm": 0.17344152927398682,
"learning_rate": 0.00012139111779049272,
"loss": 0.2102,
"step": 1196
},
{
"epoch": 0.43416757344940154,
"grad_norm": 0.1607237160205841,
"learning_rate": 0.00012127938651355973,
"loss": 0.198,
"step": 1197
},
{
"epoch": 0.4345302865433442,
"grad_norm": 0.19598302245140076,
"learning_rate": 0.00012116762740481473,
"loss": 0.2048,
"step": 1198
},
{
"epoch": 0.4348929996372869,
"grad_norm": 0.17380495369434357,
"learning_rate": 0.00012105584061043011,
"loss": 0.1998,
"step": 1199
},
{
"epoch": 0.4352557127312296,
"grad_norm": 0.16845153272151947,
"learning_rate": 0.00012094402627661447,
"loss": 0.1944,
"step": 1200
},
{
"epoch": 0.4356184258251723,
"grad_norm": 0.17525669932365417,
"learning_rate": 0.00012083218454961237,
"loss": 0.2262,
"step": 1201
},
{
"epoch": 0.435981138919115,
"grad_norm": 0.182146355509758,
"learning_rate": 0.00012072031557570425,
"loss": 0.1899,
"step": 1202
},
{
"epoch": 0.4363438520130577,
"grad_norm": 0.1767880618572235,
"learning_rate": 0.00012060841950120623,
"loss": 0.1853,
"step": 1203
},
{
"epoch": 0.43670656510700034,
"grad_norm": 0.1868688315153122,
"learning_rate": 0.00012049649647246976,
"loss": 0.1884,
"step": 1204
},
{
"epoch": 0.43706927820094305,
"grad_norm": 0.16299636662006378,
"learning_rate": 0.0001203845466358817,
"loss": 0.1903,
"step": 1205
},
{
"epoch": 0.43743199129488575,
"grad_norm": 0.1743989884853363,
"learning_rate": 0.00012027257013786382,
"loss": 0.1741,
"step": 1206
},
{
"epoch": 0.43779470438882845,
"grad_norm": 0.16983556747436523,
"learning_rate": 0.00012016056712487281,
"loss": 0.1756,
"step": 1207
},
{
"epoch": 0.43815741748277115,
"grad_norm": 0.16869889199733734,
"learning_rate": 0.0001200485377434001,
"loss": 0.2091,
"step": 1208
},
{
"epoch": 0.4385201305767138,
"grad_norm": 0.18009315431118011,
"learning_rate": 0.00011993648213997155,
"loss": 0.1876,
"step": 1209
},
{
"epoch": 0.4388828436706565,
"grad_norm": 0.17261937260627747,
"learning_rate": 0.00011982440046114734,
"loss": 0.1888,
"step": 1210
},
{
"epoch": 0.4392455567645992,
"grad_norm": 0.1700652837753296,
"learning_rate": 0.00011971229285352173,
"loss": 0.1929,
"step": 1211
},
{
"epoch": 0.4396082698585419,
"grad_norm": 0.1701359897851944,
"learning_rate": 0.0001196001594637229,
"loss": 0.196,
"step": 1212
},
{
"epoch": 0.4399709829524846,
"grad_norm": 0.17813630402088165,
"learning_rate": 0.00011948800043841275,
"loss": 0.2116,
"step": 1213
},
{
"epoch": 0.4403336960464273,
"grad_norm": 0.1756308227777481,
"learning_rate": 0.00011937581592428677,
"loss": 0.2036,
"step": 1214
},
{
"epoch": 0.44069640914036995,
"grad_norm": 0.17653414607048035,
"learning_rate": 0.00011926360606807367,
"loss": 0.186,
"step": 1215
},
{
"epoch": 0.44105912223431265,
"grad_norm": 0.16713349521160126,
"learning_rate": 0.00011915137101653539,
"loss": 0.2161,
"step": 1216
},
{
"epoch": 0.44142183532825535,
"grad_norm": 0.17466074228286743,
"learning_rate": 0.00011903911091646684,
"loss": 0.2025,
"step": 1217
},
{
"epoch": 0.44178454842219805,
"grad_norm": 0.17018508911132812,
"learning_rate": 0.00011892682591469562,
"loss": 0.1901,
"step": 1218
},
{
"epoch": 0.44214726151614075,
"grad_norm": 0.18613681197166443,
"learning_rate": 0.00011881451615808192,
"loss": 0.1994,
"step": 1219
},
{
"epoch": 0.4425099746100834,
"grad_norm": 0.17624922096729279,
"learning_rate": 0.00011870218179351838,
"loss": 0.1909,
"step": 1220
},
{
"epoch": 0.4428726877040261,
"grad_norm": 0.16530555486679077,
"learning_rate": 0.00011858982296792971,
"loss": 0.1925,
"step": 1221
},
{
"epoch": 0.4432354007979688,
"grad_norm": 0.17213410139083862,
"learning_rate": 0.00011847743982827269,
"loss": 0.188,
"step": 1222
},
{
"epoch": 0.4435981138919115,
"grad_norm": 0.17941850423812866,
"learning_rate": 0.00011836503252153588,
"loss": 0.1836,
"step": 1223
},
{
"epoch": 0.4439608269858542,
"grad_norm": 0.211356058716774,
"learning_rate": 0.00011825260119473946,
"loss": 0.1958,
"step": 1224
},
{
"epoch": 0.4443235400797969,
"grad_norm": 0.1753711849451065,
"learning_rate": 0.00011814014599493502,
"loss": 0.1784,
"step": 1225
},
{
"epoch": 0.44468625317373955,
"grad_norm": 0.17775994539260864,
"learning_rate": 0.00011802766706920533,
"loss": 0.1984,
"step": 1226
},
{
"epoch": 0.44504896626768226,
"grad_norm": 0.15988726913928986,
"learning_rate": 0.00011791516456466429,
"loss": 0.196,
"step": 1227
},
{
"epoch": 0.44541167936162496,
"grad_norm": 0.17853982746601105,
"learning_rate": 0.00011780263862845655,
"loss": 0.193,
"step": 1228
},
{
"epoch": 0.44577439245556766,
"grad_norm": 0.1804809272289276,
"learning_rate": 0.00011769008940775744,
"loss": 0.1995,
"step": 1229
},
{
"epoch": 0.44613710554951036,
"grad_norm": 0.18296337127685547,
"learning_rate": 0.00011757751704977275,
"loss": 0.1907,
"step": 1230
},
{
"epoch": 0.446499818643453,
"grad_norm": 0.15713930130004883,
"learning_rate": 0.00011746492170173853,
"loss": 0.1945,
"step": 1231
},
{
"epoch": 0.4468625317373957,
"grad_norm": 0.18204668164253235,
"learning_rate": 0.00011735230351092087,
"loss": 0.2187,
"step": 1232
},
{
"epoch": 0.4472252448313384,
"grad_norm": 0.16009126603603363,
"learning_rate": 0.00011723966262461579,
"loss": 0.1786,
"step": 1233
},
{
"epoch": 0.4475879579252811,
"grad_norm": 0.20128843188285828,
"learning_rate": 0.00011712699919014896,
"loss": 0.1941,
"step": 1234
},
{
"epoch": 0.4479506710192238,
"grad_norm": 0.17296966910362244,
"learning_rate": 0.0001170143133548755,
"loss": 0.1843,
"step": 1235
},
{
"epoch": 0.44831338411316646,
"grad_norm": 0.18363478779792786,
"learning_rate": 0.00011690160526617995,
"loss": 0.197,
"step": 1236
},
{
"epoch": 0.44867609720710916,
"grad_norm": 0.17751774191856384,
"learning_rate": 0.00011678887507147582,
"loss": 0.1756,
"step": 1237
},
{
"epoch": 0.44903881030105186,
"grad_norm": 0.1821131557226181,
"learning_rate": 0.00011667612291820562,
"loss": 0.1911,
"step": 1238
},
{
"epoch": 0.44940152339499456,
"grad_norm": 0.16961705684661865,
"learning_rate": 0.00011656334895384053,
"loss": 0.1782,
"step": 1239
},
{
"epoch": 0.44976423648893726,
"grad_norm": 0.1650359183549881,
"learning_rate": 0.00011645055332588032,
"loss": 0.1849,
"step": 1240
},
{
"epoch": 0.45012694958287996,
"grad_norm": 0.1794784963130951,
"learning_rate": 0.00011633773618185302,
"loss": 0.2059,
"step": 1241
},
{
"epoch": 0.4504896626768226,
"grad_norm": 0.17137840390205383,
"learning_rate": 0.00011622489766931488,
"loss": 0.206,
"step": 1242
},
{
"epoch": 0.4508523757707653,
"grad_norm": 0.1728799045085907,
"learning_rate": 0.00011611203793584999,
"loss": 0.1812,
"step": 1243
},
{
"epoch": 0.451215088864708,
"grad_norm": 0.17596741020679474,
"learning_rate": 0.0001159991571290703,
"loss": 0.1935,
"step": 1244
},
{
"epoch": 0.4515778019586507,
"grad_norm": 0.18633347749710083,
"learning_rate": 0.00011588625539661528,
"loss": 0.1908,
"step": 1245
},
{
"epoch": 0.4519405150525934,
"grad_norm": 0.15337157249450684,
"learning_rate": 0.00011577333288615175,
"loss": 0.1779,
"step": 1246
},
{
"epoch": 0.45230322814653606,
"grad_norm": 0.18902058899402618,
"learning_rate": 0.00011566038974537374,
"loss": 0.2063,
"step": 1247
},
{
"epoch": 0.45266594124047876,
"grad_norm": 0.17245811223983765,
"learning_rate": 0.00011554742612200229,
"loss": 0.1827,
"step": 1248
},
{
"epoch": 0.45302865433442147,
"grad_norm": 0.17236045002937317,
"learning_rate": 0.00011543444216378517,
"loss": 0.1944,
"step": 1249
},
{
"epoch": 0.45339136742836417,
"grad_norm": 0.1754477322101593,
"learning_rate": 0.00011532143801849668,
"loss": 0.1933,
"step": 1250
},
{
"epoch": 0.45375408052230687,
"grad_norm": 0.16361160576343536,
"learning_rate": 0.00011520841383393774,
"loss": 0.193,
"step": 1251
},
{
"epoch": 0.45411679361624957,
"grad_norm": 0.17561082541942596,
"learning_rate": 0.00011509536975793527,
"loss": 0.2062,
"step": 1252
},
{
"epoch": 0.4544795067101922,
"grad_norm": 0.1636163592338562,
"learning_rate": 0.00011498230593834229,
"loss": 0.1839,
"step": 1253
},
{
"epoch": 0.4548422198041349,
"grad_norm": 0.16940078139305115,
"learning_rate": 0.00011486922252303769,
"loss": 0.18,
"step": 1254
},
{
"epoch": 0.4552049328980776,
"grad_norm": 0.1866592913866043,
"learning_rate": 0.0001147561196599259,
"loss": 0.1789,
"step": 1255
},
{
"epoch": 0.4555676459920203,
"grad_norm": 0.1689455509185791,
"learning_rate": 0.00011464299749693679,
"loss": 0.1775,
"step": 1256
},
{
"epoch": 0.455930359085963,
"grad_norm": 0.17223703861236572,
"learning_rate": 0.00011452985618202559,
"loss": 0.1813,
"step": 1257
},
{
"epoch": 0.45629307217990567,
"grad_norm": 0.16031506657600403,
"learning_rate": 0.00011441669586317243,
"loss": 0.1867,
"step": 1258
},
{
"epoch": 0.45665578527384837,
"grad_norm": 0.17869757115840912,
"learning_rate": 0.00011430351668838237,
"loss": 0.1678,
"step": 1259
},
{
"epoch": 0.45701849836779107,
"grad_norm": 0.18296487629413605,
"learning_rate": 0.00011419031880568518,
"loss": 0.1848,
"step": 1260
},
{
"epoch": 0.45738121146173377,
"grad_norm": 0.19954228401184082,
"learning_rate": 0.00011407710236313498,
"loss": 0.1961,
"step": 1261
},
{
"epoch": 0.4577439245556765,
"grad_norm": 0.16006030142307281,
"learning_rate": 0.00011396386750881025,
"loss": 0.1738,
"step": 1262
},
{
"epoch": 0.4581066376496192,
"grad_norm": 0.17467838525772095,
"learning_rate": 0.00011385061439081355,
"loss": 0.2,
"step": 1263
},
{
"epoch": 0.4584693507435618,
"grad_norm": 0.1634225696325302,
"learning_rate": 0.00011373734315727125,
"loss": 0.1593,
"step": 1264
},
{
"epoch": 0.4588320638375045,
"grad_norm": 0.1675540953874588,
"learning_rate": 0.00011362405395633355,
"loss": 0.1761,
"step": 1265
},
{
"epoch": 0.4591947769314472,
"grad_norm": 0.2374797910451889,
"learning_rate": 0.00011351074693617398,
"loss": 0.2401,
"step": 1266
},
{
"epoch": 0.4595574900253899,
"grad_norm": 0.16424275934696198,
"learning_rate": 0.00011339742224498957,
"loss": 0.1822,
"step": 1267
},
{
"epoch": 0.4599202031193326,
"grad_norm": 0.1777309626340866,
"learning_rate": 0.00011328408003100031,
"loss": 0.199,
"step": 1268
},
{
"epoch": 0.4602829162132753,
"grad_norm": 0.17055995762348175,
"learning_rate": 0.0001131707204424491,
"loss": 0.1743,
"step": 1269
},
{
"epoch": 0.460645629307218,
"grad_norm": 0.17005477845668793,
"learning_rate": 0.0001130573436276017,
"loss": 0.1767,
"step": 1270
},
{
"epoch": 0.4610083424011607,
"grad_norm": 0.18844565749168396,
"learning_rate": 0.00011294394973474631,
"loss": 0.1836,
"step": 1271
},
{
"epoch": 0.4613710554951034,
"grad_norm": 0.17676351964473724,
"learning_rate": 0.00011283053891219344,
"loss": 0.1806,
"step": 1272
},
{
"epoch": 0.4617337685890461,
"grad_norm": 0.1949535459280014,
"learning_rate": 0.00011271711130827584,
"loss": 0.2162,
"step": 1273
},
{
"epoch": 0.4620964816829888,
"grad_norm": 0.16555753350257874,
"learning_rate": 0.0001126036670713481,
"loss": 0.2051,
"step": 1274
},
{
"epoch": 0.4624591947769314,
"grad_norm": 0.16618479788303375,
"learning_rate": 0.00011249020634978664,
"loss": 0.1686,
"step": 1275
},
{
"epoch": 0.4628219078708741,
"grad_norm": 0.16579975187778473,
"learning_rate": 0.00011237672929198944,
"loss": 0.1887,
"step": 1276
},
{
"epoch": 0.46318462096481683,
"grad_norm": 0.1672372817993164,
"learning_rate": 0.00011226323604637577,
"loss": 0.1801,
"step": 1277
},
{
"epoch": 0.46354733405875953,
"grad_norm": 0.18061618506908417,
"learning_rate": 0.00011214972676138612,
"loss": 0.2006,
"step": 1278
},
{
"epoch": 0.46391004715270223,
"grad_norm": 0.19238020479679108,
"learning_rate": 0.00011203620158548205,
"loss": 0.1693,
"step": 1279
},
{
"epoch": 0.4642727602466449,
"grad_norm": 0.18483294546604156,
"learning_rate": 0.00011192266066714576,
"loss": 0.181,
"step": 1280
},
{
"epoch": 0.4646354733405876,
"grad_norm": 0.1617163121700287,
"learning_rate": 0.00011180910415488006,
"loss": 0.1812,
"step": 1281
},
{
"epoch": 0.4649981864345303,
"grad_norm": 0.18640659749507904,
"learning_rate": 0.00011169553219720828,
"loss": 0.1877,
"step": 1282
},
{
"epoch": 0.465360899528473,
"grad_norm": 0.1695108264684677,
"learning_rate": 0.00011158194494267375,
"loss": 0.1848,
"step": 1283
},
{
"epoch": 0.4657236126224157,
"grad_norm": 0.1813160479068756,
"learning_rate": 0.00011146834253984006,
"loss": 0.1897,
"step": 1284
},
{
"epoch": 0.4660863257163584,
"grad_norm": 0.19932959973812103,
"learning_rate": 0.00011135472513729037,
"loss": 0.1924,
"step": 1285
},
{
"epoch": 0.46644903881030103,
"grad_norm": 0.18082661926746368,
"learning_rate": 0.0001112410928836276,
"loss": 0.1856,
"step": 1286
},
{
"epoch": 0.46681175190424373,
"grad_norm": 0.18553735315799713,
"learning_rate": 0.00011112744592747406,
"loss": 0.215,
"step": 1287
},
{
"epoch": 0.46717446499818643,
"grad_norm": 0.1664389669895172,
"learning_rate": 0.0001110137844174713,
"loss": 0.181,
"step": 1288
},
{
"epoch": 0.46753717809212914,
"grad_norm": 0.16226251423358917,
"learning_rate": 0.00011090010850227987,
"loss": 0.1818,
"step": 1289
},
{
"epoch": 0.46789989118607184,
"grad_norm": 0.17768961191177368,
"learning_rate": 0.00011078641833057917,
"loss": 0.2087,
"step": 1290
},
{
"epoch": 0.4682626042800145,
"grad_norm": 0.16539828479290009,
"learning_rate": 0.0001106727140510673,
"loss": 0.1882,
"step": 1291
},
{
"epoch": 0.4686253173739572,
"grad_norm": 0.17121171951293945,
"learning_rate": 0.00011055899581246074,
"loss": 0.1664,
"step": 1292
},
{
"epoch": 0.4689880304678999,
"grad_norm": 0.19726701080799103,
"learning_rate": 0.00011044526376349427,
"loss": 0.1924,
"step": 1293
},
{
"epoch": 0.4693507435618426,
"grad_norm": 0.16600336134433746,
"learning_rate": 0.0001103315180529207,
"loss": 0.1775,
"step": 1294
},
{
"epoch": 0.4697134566557853,
"grad_norm": 0.1898517608642578,
"learning_rate": 0.00011021775882951078,
"loss": 0.248,
"step": 1295
},
{
"epoch": 0.470076169749728,
"grad_norm": 0.16445770859718323,
"learning_rate": 0.00011010398624205285,
"loss": 0.1828,
"step": 1296
},
{
"epoch": 0.47043888284367064,
"grad_norm": 0.17627963423728943,
"learning_rate": 0.00010999020043935275,
"loss": 0.1736,
"step": 1297
},
{
"epoch": 0.47080159593761334,
"grad_norm": 0.19058868288993835,
"learning_rate": 0.00010987640157023367,
"loss": 0.2618,
"step": 1298
},
{
"epoch": 0.47116430903155604,
"grad_norm": 0.1651872992515564,
"learning_rate": 0.00010976258978353577,
"loss": 0.1975,
"step": 1299
},
{
"epoch": 0.47152702212549874,
"grad_norm": 0.20072801411151886,
"learning_rate": 0.00010964876522811623,
"loss": 0.1723,
"step": 1300
},
{
"epoch": 0.47188973521944144,
"grad_norm": 0.19362793862819672,
"learning_rate": 0.00010953492805284882,
"loss": 0.1682,
"step": 1301
},
{
"epoch": 0.4722524483133841,
"grad_norm": 0.16751596331596375,
"learning_rate": 0.0001094210784066239,
"loss": 0.1792,
"step": 1302
},
{
"epoch": 0.4726151614073268,
"grad_norm": 0.16240975260734558,
"learning_rate": 0.00010930721643834811,
"loss": 0.1805,
"step": 1303
},
{
"epoch": 0.4729778745012695,
"grad_norm": 0.1741744726896286,
"learning_rate": 0.00010919334229694424,
"loss": 0.1823,
"step": 1304
},
{
"epoch": 0.4733405875952122,
"grad_norm": 0.17905928194522858,
"learning_rate": 0.00010907945613135093,
"loss": 0.1873,
"step": 1305
},
{
"epoch": 0.4737033006891549,
"grad_norm": 0.16759923100471497,
"learning_rate": 0.00010896555809052255,
"loss": 0.1805,
"step": 1306
},
{
"epoch": 0.4740660137830976,
"grad_norm": 0.1604134738445282,
"learning_rate": 0.00010885164832342911,
"loss": 0.1817,
"step": 1307
},
{
"epoch": 0.47442872687704024,
"grad_norm": 0.22676977515220642,
"learning_rate": 0.00010873772697905584,
"loss": 0.1901,
"step": 1308
},
{
"epoch": 0.47479143997098294,
"grad_norm": 0.21346516907215118,
"learning_rate": 0.00010862379420640316,
"loss": 0.2146,
"step": 1309
},
{
"epoch": 0.47515415306492564,
"grad_norm": 0.18681135773658752,
"learning_rate": 0.00010850985015448644,
"loss": 0.1992,
"step": 1310
},
{
"epoch": 0.47551686615886835,
"grad_norm": 0.18223214149475098,
"learning_rate": 0.00010839589497233579,
"loss": 0.1937,
"step": 1311
},
{
"epoch": 0.47587957925281105,
"grad_norm": 0.16445523500442505,
"learning_rate": 0.00010828192880899588,
"loss": 0.201,
"step": 1312
},
{
"epoch": 0.4762422923467537,
"grad_norm": 0.20072023570537567,
"learning_rate": 0.00010816795181352576,
"loss": 0.221,
"step": 1313
},
{
"epoch": 0.4766050054406964,
"grad_norm": 0.1709073781967163,
"learning_rate": 0.00010805396413499865,
"loss": 0.1726,
"step": 1314
},
{
"epoch": 0.4769677185346391,
"grad_norm": 0.20039378106594086,
"learning_rate": 0.00010793996592250166,
"loss": 0.2096,
"step": 1315
},
{
"epoch": 0.4773304316285818,
"grad_norm": 0.17664781212806702,
"learning_rate": 0.00010782595732513581,
"loss": 0.1763,
"step": 1316
},
{
"epoch": 0.4776931447225245,
"grad_norm": 0.19013923406600952,
"learning_rate": 0.00010771193849201561,
"loss": 0.1977,
"step": 1317
},
{
"epoch": 0.4780558578164672,
"grad_norm": 0.18075336515903473,
"learning_rate": 0.00010759790957226896,
"loss": 0.2101,
"step": 1318
},
{
"epoch": 0.47841857091040985,
"grad_norm": 0.16578859090805054,
"learning_rate": 0.00010748387071503703,
"loss": 0.1724,
"step": 1319
},
{
"epoch": 0.47878128400435255,
"grad_norm": 0.1706560105085373,
"learning_rate": 0.00010736982206947388,
"loss": 0.1826,
"step": 1320
},
{
"epoch": 0.47914399709829525,
"grad_norm": 0.1749362051486969,
"learning_rate": 0.00010725576378474647,
"loss": 0.1957,
"step": 1321
},
{
"epoch": 0.47950671019223795,
"grad_norm": 0.16315925121307373,
"learning_rate": 0.0001071416960100343,
"loss": 0.1675,
"step": 1322
},
{
"epoch": 0.47986942328618065,
"grad_norm": 0.18400579690933228,
"learning_rate": 0.0001070276188945293,
"loss": 0.1896,
"step": 1323
},
{
"epoch": 0.4802321363801233,
"grad_norm": 0.15948963165283203,
"learning_rate": 0.00010691353258743566,
"loss": 0.1816,
"step": 1324
},
{
"epoch": 0.480594849474066,
"grad_norm": 0.17198865115642548,
"learning_rate": 0.00010679943723796948,
"loss": 0.176,
"step": 1325
},
{
"epoch": 0.4809575625680087,
"grad_norm": 0.15731912851333618,
"learning_rate": 0.00010668533299535885,
"loss": 0.1717,
"step": 1326
},
{
"epoch": 0.4813202756619514,
"grad_norm": 0.19525468349456787,
"learning_rate": 0.00010657122000884334,
"loss": 0.2036,
"step": 1327
},
{
"epoch": 0.4816829887558941,
"grad_norm": 0.18892909586429596,
"learning_rate": 0.00010645709842767404,
"loss": 0.1886,
"step": 1328
},
{
"epoch": 0.4820457018498368,
"grad_norm": 0.1732751876115799,
"learning_rate": 0.00010634296840111328,
"loss": 0.1654,
"step": 1329
},
{
"epoch": 0.48240841494377945,
"grad_norm": 0.17465728521347046,
"learning_rate": 0.00010622883007843439,
"loss": 0.2119,
"step": 1330
},
{
"epoch": 0.48277112803772215,
"grad_norm": 0.17398551106452942,
"learning_rate": 0.00010611468360892157,
"loss": 0.1833,
"step": 1331
},
{
"epoch": 0.48313384113166485,
"grad_norm": 0.16920240223407745,
"learning_rate": 0.00010600052914186971,
"loss": 0.182,
"step": 1332
},
{
"epoch": 0.48349655422560756,
"grad_norm": 0.1846507489681244,
"learning_rate": 0.0001058863668265841,
"loss": 0.2106,
"step": 1333
},
{
"epoch": 0.48385926731955026,
"grad_norm": 0.2055717557668686,
"learning_rate": 0.00010577219681238035,
"loss": 0.1992,
"step": 1334
},
{
"epoch": 0.4842219804134929,
"grad_norm": 0.18416710197925568,
"learning_rate": 0.00010565801924858411,
"loss": 0.1997,
"step": 1335
},
{
"epoch": 0.4845846935074356,
"grad_norm": 0.1609608680009842,
"learning_rate": 0.00010554383428453093,
"loss": 0.2147,
"step": 1336
},
{
"epoch": 0.4849474066013783,
"grad_norm": 0.1770334094762802,
"learning_rate": 0.00010542964206956601,
"loss": 0.1796,
"step": 1337
},
{
"epoch": 0.485310119695321,
"grad_norm": 0.24041593074798584,
"learning_rate": 0.00010531544275304403,
"loss": 0.1828,
"step": 1338
},
{
"epoch": 0.4856728327892637,
"grad_norm": 0.18589763343334198,
"learning_rate": 0.00010520123648432896,
"loss": 0.1894,
"step": 1339
},
{
"epoch": 0.4860355458832064,
"grad_norm": 0.19816087186336517,
"learning_rate": 0.00010508702341279391,
"loss": 0.1849,
"step": 1340
},
{
"epoch": 0.48639825897714906,
"grad_norm": 0.20071928203105927,
"learning_rate": 0.00010497280368782083,
"loss": 0.1871,
"step": 1341
},
{
"epoch": 0.48676097207109176,
"grad_norm": 0.19222816824913025,
"learning_rate": 0.00010485857745880043,
"loss": 0.2114,
"step": 1342
},
{
"epoch": 0.48712368516503446,
"grad_norm": 0.17220762372016907,
"learning_rate": 0.00010474434487513183,
"loss": 0.186,
"step": 1343
},
{
"epoch": 0.48748639825897716,
"grad_norm": 0.1726873815059662,
"learning_rate": 0.00010463010608622259,
"loss": 0.1945,
"step": 1344
},
{
"epoch": 0.48784911135291986,
"grad_norm": 0.1876380741596222,
"learning_rate": 0.0001045158612414883,
"loss": 0.214,
"step": 1345
},
{
"epoch": 0.4882118244468625,
"grad_norm": 0.16988040506839752,
"learning_rate": 0.00010440161049035242,
"loss": 0.1901,
"step": 1346
},
{
"epoch": 0.4885745375408052,
"grad_norm": 0.15666206181049347,
"learning_rate": 0.00010428735398224629,
"loss": 0.1788,
"step": 1347
},
{
"epoch": 0.4889372506347479,
"grad_norm": 0.16927142441272736,
"learning_rate": 0.00010417309186660869,
"loss": 0.1847,
"step": 1348
},
{
"epoch": 0.4892999637286906,
"grad_norm": 0.17525021731853485,
"learning_rate": 0.00010405882429288567,
"loss": 0.1826,
"step": 1349
},
{
"epoch": 0.4896626768226333,
"grad_norm": 0.19699347019195557,
"learning_rate": 0.00010394455141053056,
"loss": 0.2124,
"step": 1350
},
{
"epoch": 0.490025389916576,
"grad_norm": 0.17007745802402496,
"learning_rate": 0.00010383027336900355,
"loss": 0.1936,
"step": 1351
},
{
"epoch": 0.49038810301051866,
"grad_norm": 0.19556905329227448,
"learning_rate": 0.00010371599031777155,
"loss": 0.197,
"step": 1352
},
{
"epoch": 0.49075081610446136,
"grad_norm": 0.16477836668491364,
"learning_rate": 0.00010360170240630808,
"loss": 0.1625,
"step": 1353
},
{
"epoch": 0.49111352919840406,
"grad_norm": 0.1732366383075714,
"learning_rate": 0.00010348740978409302,
"loss": 0.1971,
"step": 1354
},
{
"epoch": 0.49147624229234677,
"grad_norm": 0.16834014654159546,
"learning_rate": 0.00010337311260061233,
"loss": 0.1766,
"step": 1355
},
{
"epoch": 0.49183895538628947,
"grad_norm": 0.16868503391742706,
"learning_rate": 0.00010325881100535806,
"loss": 0.1708,
"step": 1356
},
{
"epoch": 0.4922016684802321,
"grad_norm": 0.1709543913602829,
"learning_rate": 0.00010314450514782792,
"loss": 0.1639,
"step": 1357
},
{
"epoch": 0.4925643815741748,
"grad_norm": 0.19506582617759705,
"learning_rate": 0.0001030301951775253,
"loss": 0.2164,
"step": 1358
},
{
"epoch": 0.4929270946681175,
"grad_norm": 0.1822308748960495,
"learning_rate": 0.00010291588124395881,
"loss": 0.1743,
"step": 1359
},
{
"epoch": 0.4932898077620602,
"grad_norm": 0.16552822291851044,
"learning_rate": 0.00010280156349664245,
"loss": 0.1826,
"step": 1360
},
{
"epoch": 0.4936525208560029,
"grad_norm": 0.1888803243637085,
"learning_rate": 0.00010268724208509504,
"loss": 0.2016,
"step": 1361
},
{
"epoch": 0.4940152339499456,
"grad_norm": 0.15892508625984192,
"learning_rate": 0.00010257291715884023,
"loss": 0.1666,
"step": 1362
},
{
"epoch": 0.49437794704388827,
"grad_norm": 0.18861308693885803,
"learning_rate": 0.00010245858886740636,
"loss": 0.1969,
"step": 1363
},
{
"epoch": 0.49474066013783097,
"grad_norm": 0.1808152198791504,
"learning_rate": 0.00010234425736032607,
"loss": 0.181,
"step": 1364
},
{
"epoch": 0.49510337323177367,
"grad_norm": 0.18545283377170563,
"learning_rate": 0.00010222992278713619,
"loss": 0.1757,
"step": 1365
},
{
"epoch": 0.49546608632571637,
"grad_norm": 0.16214706003665924,
"learning_rate": 0.00010211558529737768,
"loss": 0.1809,
"step": 1366
},
{
"epoch": 0.49582879941965907,
"grad_norm": 0.16413475573062897,
"learning_rate": 0.00010200124504059522,
"loss": 0.1765,
"step": 1367
},
{
"epoch": 0.4961915125136017,
"grad_norm": 0.17465685307979584,
"learning_rate": 0.0001018869021663371,
"loss": 0.1786,
"step": 1368
},
{
"epoch": 0.4965542256075444,
"grad_norm": 0.16205236315727234,
"learning_rate": 0.00010177255682415512,
"loss": 0.1778,
"step": 1369
},
{
"epoch": 0.4969169387014871,
"grad_norm": 0.17154665291309357,
"learning_rate": 0.0001016582091636042,
"loss": 0.1848,
"step": 1370
},
{
"epoch": 0.4972796517954298,
"grad_norm": 0.19808538258075714,
"learning_rate": 0.00010154385933424236,
"loss": 0.1872,
"step": 1371
},
{
"epoch": 0.4976423648893725,
"grad_norm": 0.17381629347801208,
"learning_rate": 0.00010142950748563047,
"loss": 0.1706,
"step": 1372
},
{
"epoch": 0.4980050779833152,
"grad_norm": 0.18413105607032776,
"learning_rate": 0.00010131515376733199,
"loss": 0.2041,
"step": 1373
},
{
"epoch": 0.49836779107725787,
"grad_norm": 0.16707438230514526,
"learning_rate": 0.0001012007983289128,
"loss": 0.1824,
"step": 1374
},
{
"epoch": 0.4987305041712006,
"grad_norm": 0.18369126319885254,
"learning_rate": 0.00010108644131994118,
"loss": 0.1838,
"step": 1375
},
{
"epoch": 0.4990932172651433,
"grad_norm": 0.17866884171962738,
"learning_rate": 0.00010097208288998727,
"loss": 0.18,
"step": 1376
},
{
"epoch": 0.499455930359086,
"grad_norm": 0.17458714544773102,
"learning_rate": 0.0001008577231886232,
"loss": 0.1863,
"step": 1377
},
{
"epoch": 0.4998186434530287,
"grad_norm": 0.16435278952121735,
"learning_rate": 0.00010074336236542275,
"loss": 0.1691,
"step": 1378
},
{
"epoch": 0.5001813565469714,
"grad_norm": 0.18374355137348175,
"learning_rate": 0.00010062900056996111,
"loss": 0.2016,
"step": 1379
},
{
"epoch": 0.500544069640914,
"grad_norm": 0.1715199500322342,
"learning_rate": 0.00010051463795181475,
"loss": 0.1775,
"step": 1380
},
{
"epoch": 0.5009067827348568,
"grad_norm": 0.17471933364868164,
"learning_rate": 0.00010040027466056128,
"loss": 0.1784,
"step": 1381
},
{
"epoch": 0.5012694958287994,
"grad_norm": 0.182729572057724,
"learning_rate": 0.00010028591084577914,
"loss": 0.1848,
"step": 1382
},
{
"epoch": 0.5016322089227421,
"grad_norm": 0.1831514835357666,
"learning_rate": 0.00010017154665704742,
"loss": 0.1782,
"step": 1383
},
{
"epoch": 0.5019949220166848,
"grad_norm": 0.18920493125915527,
"learning_rate": 0.00010005718224394583,
"loss": 0.1983,
"step": 1384
},
{
"epoch": 0.5023576351106275,
"grad_norm": 0.18116223812103271,
"learning_rate": 9.994281775605417e-05,
"loss": 0.1703,
"step": 1385
},
{
"epoch": 0.5027203482045702,
"grad_norm": 0.16635280847549438,
"learning_rate": 9.982845334295257e-05,
"loss": 0.1826,
"step": 1386
},
{
"epoch": 0.5030830612985129,
"grad_norm": 0.1902262419462204,
"learning_rate": 9.971408915422089e-05,
"loss": 0.1821,
"step": 1387
},
{
"epoch": 0.5034457743924555,
"grad_norm": 0.1716509759426117,
"learning_rate": 9.959972533943873e-05,
"loss": 0.1774,
"step": 1388
},
{
"epoch": 0.5038084874863983,
"grad_norm": 0.1831229031085968,
"learning_rate": 9.948536204818527e-05,
"loss": 0.1859,
"step": 1389
},
{
"epoch": 0.5041712005803409,
"grad_norm": 0.17858019471168518,
"learning_rate": 9.937099943003894e-05,
"loss": 0.1763,
"step": 1390
},
{
"epoch": 0.5045339136742837,
"grad_norm": 0.20118439197540283,
"learning_rate": 9.925663763457726e-05,
"loss": 0.2055,
"step": 1391
},
{
"epoch": 0.5048966267682263,
"grad_norm": 0.1790417581796646,
"learning_rate": 9.91422768113768e-05,
"loss": 0.1886,
"step": 1392
},
{
"epoch": 0.505259339862169,
"grad_norm": 0.18328474462032318,
"learning_rate": 9.902791711001274e-05,
"loss": 0.2024,
"step": 1393
},
{
"epoch": 0.5056220529561117,
"grad_norm": 0.17188413441181183,
"learning_rate": 9.891355868005885e-05,
"loss": 0.1822,
"step": 1394
},
{
"epoch": 0.5059847660500544,
"grad_norm": 0.20239926874637604,
"learning_rate": 9.879920167108722e-05,
"loss": 0.1811,
"step": 1395
},
{
"epoch": 0.5063474791439971,
"grad_norm": 0.1758367419242859,
"learning_rate": 9.868484623266807e-05,
"loss": 0.2074,
"step": 1396
},
{
"epoch": 0.5067101922379398,
"grad_norm": 0.16482442617416382,
"learning_rate": 9.857049251436957e-05,
"loss": 0.1748,
"step": 1397
},
{
"epoch": 0.5070729053318824,
"grad_norm": 0.19277919828891754,
"learning_rate": 9.845614066575764e-05,
"loss": 0.2831,
"step": 1398
},
{
"epoch": 0.5074356184258252,
"grad_norm": 0.19243241846561432,
"learning_rate": 9.834179083639581e-05,
"loss": 0.1817,
"step": 1399
},
{
"epoch": 0.5077983315197678,
"grad_norm": 0.19496306777000427,
"learning_rate": 9.822744317584492e-05,
"loss": 0.1614,
"step": 1400
},
{
"epoch": 0.5081610446137106,
"grad_norm": 0.1891697645187378,
"learning_rate": 9.811309783366291e-05,
"loss": 0.1952,
"step": 1401
},
{
"epoch": 0.5085237577076532,
"grad_norm": 0.19444064795970917,
"learning_rate": 9.799875495940481e-05,
"loss": 0.1942,
"step": 1402
},
{
"epoch": 0.508886470801596,
"grad_norm": 0.18112586438655853,
"learning_rate": 9.788441470262235e-05,
"loss": 0.1963,
"step": 1403
},
{
"epoch": 0.5092491838955386,
"grad_norm": 0.17287184298038483,
"learning_rate": 9.777007721286381e-05,
"loss": 0.1733,
"step": 1404
},
{
"epoch": 0.5096118969894813,
"grad_norm": 0.18775591254234314,
"learning_rate": 9.765574263967396e-05,
"loss": 0.1965,
"step": 1405
},
{
"epoch": 0.509974610083424,
"grad_norm": 0.17914709448814392,
"learning_rate": 9.754141113259366e-05,
"loss": 0.1748,
"step": 1406
},
{
"epoch": 0.5103373231773667,
"grad_norm": 0.19423453509807587,
"learning_rate": 9.74270828411598e-05,
"loss": 0.1833,
"step": 1407
},
{
"epoch": 0.5107000362713094,
"grad_norm": 0.18104608356952667,
"learning_rate": 9.731275791490501e-05,
"loss": 0.177,
"step": 1408
},
{
"epoch": 0.5110627493652521,
"grad_norm": 0.17595738172531128,
"learning_rate": 9.719843650335758e-05,
"loss": 0.1839,
"step": 1409
},
{
"epoch": 0.5114254624591947,
"grad_norm": 0.18560685217380524,
"learning_rate": 9.708411875604118e-05,
"loss": 0.1995,
"step": 1410
},
{
"epoch": 0.5117881755531375,
"grad_norm": 0.18210434913635254,
"learning_rate": 9.696980482247474e-05,
"loss": 0.1963,
"step": 1411
},
{
"epoch": 0.5121508886470801,
"grad_norm": 0.16906267404556274,
"learning_rate": 9.685549485217209e-05,
"loss": 0.1636,
"step": 1412
},
{
"epoch": 0.5125136017410229,
"grad_norm": 0.19701135158538818,
"learning_rate": 9.674118899464195e-05,
"loss": 0.2184,
"step": 1413
},
{
"epoch": 0.5128763148349655,
"grad_norm": 0.18875081837177277,
"learning_rate": 9.662688739938769e-05,
"loss": 0.2142,
"step": 1414
},
{
"epoch": 0.5132390279289082,
"grad_norm": 0.20290465652942657,
"learning_rate": 9.651259021590703e-05,
"loss": 0.2041,
"step": 1415
},
{
"epoch": 0.5136017410228509,
"grad_norm": 0.1915699690580368,
"learning_rate": 9.639829759369191e-05,
"loss": 0.1741,
"step": 1416
},
{
"epoch": 0.5139644541167936,
"grad_norm": 0.1645934134721756,
"learning_rate": 9.628400968222846e-05,
"loss": 0.179,
"step": 1417
},
{
"epoch": 0.5143271672107363,
"grad_norm": 0.18472225964069366,
"learning_rate": 9.616972663099647e-05,
"loss": 0.1876,
"step": 1418
},
{
"epoch": 0.514689880304679,
"grad_norm": 0.17435920238494873,
"learning_rate": 9.605544858946945e-05,
"loss": 0.175,
"step": 1419
},
{
"epoch": 0.5150525933986216,
"grad_norm": 0.1865229606628418,
"learning_rate": 9.594117570711434e-05,
"loss": 0.2141,
"step": 1420
},
{
"epoch": 0.5154153064925644,
"grad_norm": 0.18339309096336365,
"learning_rate": 9.582690813339136e-05,
"loss": 0.1794,
"step": 1421
},
{
"epoch": 0.515778019586507,
"grad_norm": 0.1994606852531433,
"learning_rate": 9.571264601775369e-05,
"loss": 0.1835,
"step": 1422
},
{
"epoch": 0.5161407326804498,
"grad_norm": 0.16973696649074554,
"learning_rate": 9.559838950964757e-05,
"loss": 0.1587,
"step": 1423
},
{
"epoch": 0.5165034457743924,
"grad_norm": 0.17294169962406158,
"learning_rate": 9.548413875851174e-05,
"loss": 0.1748,
"step": 1424
},
{
"epoch": 0.5168661588683352,
"grad_norm": 0.19328264892101288,
"learning_rate": 9.536989391377743e-05,
"loss": 0.2012,
"step": 1425
},
{
"epoch": 0.5172288719622778,
"grad_norm": 0.18262383341789246,
"learning_rate": 9.52556551248682e-05,
"loss": 0.1806,
"step": 1426
},
{
"epoch": 0.5175915850562205,
"grad_norm": 0.16941824555397034,
"learning_rate": 9.514142254119962e-05,
"loss": 0.1739,
"step": 1427
},
{
"epoch": 0.5179542981501633,
"grad_norm": 0.1808822602033615,
"learning_rate": 9.502719631217917e-05,
"loss": 0.1685,
"step": 1428
},
{
"epoch": 0.5183170112441059,
"grad_norm": 0.213886559009552,
"learning_rate": 9.49129765872061e-05,
"loss": 0.1851,
"step": 1429
},
{
"epoch": 0.5186797243380487,
"grad_norm": 0.1952863335609436,
"learning_rate": 9.479876351567107e-05,
"loss": 0.1691,
"step": 1430
},
{
"epoch": 0.5190424374319913,
"grad_norm": 0.1745711863040924,
"learning_rate": 9.4684557246956e-05,
"loss": 0.1883,
"step": 1431
},
{
"epoch": 0.519405150525934,
"grad_norm": 0.19590620696544647,
"learning_rate": 9.457035793043401e-05,
"loss": 0.1822,
"step": 1432
},
{
"epoch": 0.5197678636198767,
"grad_norm": 0.17998209595680237,
"learning_rate": 9.445616571546909e-05,
"loss": 0.172,
"step": 1433
},
{
"epoch": 0.5201305767138193,
"grad_norm": 0.1765129566192627,
"learning_rate": 9.434198075141591e-05,
"loss": 0.1748,
"step": 1434
},
{
"epoch": 0.5204932898077621,
"grad_norm": 0.19922930002212524,
"learning_rate": 9.422780318761965e-05,
"loss": 0.1941,
"step": 1435
},
{
"epoch": 0.5208560029017048,
"grad_norm": 0.1994534283876419,
"learning_rate": 9.411363317341592e-05,
"loss": 0.1838,
"step": 1436
},
{
"epoch": 0.5212187159956474,
"grad_norm": 0.19850608706474304,
"learning_rate": 9.399947085813032e-05,
"loss": 0.1768,
"step": 1437
},
{
"epoch": 0.5215814290895902,
"grad_norm": 0.16051959991455078,
"learning_rate": 9.388531639107846e-05,
"loss": 0.1781,
"step": 1438
},
{
"epoch": 0.5219441421835328,
"grad_norm": 0.18641552329063416,
"learning_rate": 9.377116992156566e-05,
"loss": 0.1884,
"step": 1439
},
{
"epoch": 0.5223068552774756,
"grad_norm": 0.16958610713481903,
"learning_rate": 9.365703159888677e-05,
"loss": 0.1768,
"step": 1440
},
{
"epoch": 0.5226695683714182,
"grad_norm": 0.16557306051254272,
"learning_rate": 9.354290157232596e-05,
"loss": 0.1648,
"step": 1441
},
{
"epoch": 0.5230322814653608,
"grad_norm": 0.18799157440662384,
"learning_rate": 9.342877999115667e-05,
"loss": 0.1711,
"step": 1442
},
{
"epoch": 0.5233949945593036,
"grad_norm": 0.19848479330539703,
"learning_rate": 9.331466700464117e-05,
"loss": 0.1807,
"step": 1443
},
{
"epoch": 0.5237577076532463,
"grad_norm": 0.17750594019889832,
"learning_rate": 9.320056276203054e-05,
"loss": 0.1907,
"step": 1444
},
{
"epoch": 0.524120420747189,
"grad_norm": 0.16206082701683044,
"learning_rate": 9.308646741256439e-05,
"loss": 0.1808,
"step": 1445
},
{
"epoch": 0.5244831338411317,
"grad_norm": 0.1657271534204483,
"learning_rate": 9.297238110547074e-05,
"loss": 0.177,
"step": 1446
},
{
"epoch": 0.5248458469350744,
"grad_norm": 0.19123826920986176,
"learning_rate": 9.285830398996571e-05,
"loss": 0.1817,
"step": 1447
},
{
"epoch": 0.5252085600290171,
"grad_norm": 0.16904449462890625,
"learning_rate": 9.274423621525354e-05,
"loss": 0.1837,
"step": 1448
},
{
"epoch": 0.5255712731229597,
"grad_norm": 0.19816622138023376,
"learning_rate": 9.263017793052615e-05,
"loss": 0.1954,
"step": 1449
},
{
"epoch": 0.5259339862169025,
"grad_norm": 0.18440890312194824,
"learning_rate": 9.251612928496298e-05,
"loss": 0.1708,
"step": 1450
},
{
"epoch": 0.5262966993108451,
"grad_norm": 0.18821316957473755,
"learning_rate": 9.240209042773105e-05,
"loss": 0.1929,
"step": 1451
},
{
"epoch": 0.5266594124047879,
"grad_norm": 0.18499478697776794,
"learning_rate": 9.228806150798442e-05,
"loss": 0.1774,
"step": 1452
},
{
"epoch": 0.5270221254987305,
"grad_norm": 0.21519748866558075,
"learning_rate": 9.21740426748642e-05,
"loss": 0.1915,
"step": 1453
},
{
"epoch": 0.5273848385926732,
"grad_norm": 0.18411661684513092,
"learning_rate": 9.206003407749833e-05,
"loss": 0.2101,
"step": 1454
},
{
"epoch": 0.5277475516866159,
"grad_norm": 0.17182524502277374,
"learning_rate": 9.194603586500136e-05,
"loss": 0.1672,
"step": 1455
},
{
"epoch": 0.5281102647805586,
"grad_norm": 0.18551282584667206,
"learning_rate": 9.183204818647424e-05,
"loss": 0.1924,
"step": 1456
},
{
"epoch": 0.5284729778745013,
"grad_norm": 0.18289272487163544,
"learning_rate": 9.171807119100413e-05,
"loss": 0.1781,
"step": 1457
},
{
"epoch": 0.528835690968444,
"grad_norm": 0.169638991355896,
"learning_rate": 9.160410502766424e-05,
"loss": 0.1704,
"step": 1458
},
{
"epoch": 0.5291984040623866,
"grad_norm": 0.17855599522590637,
"learning_rate": 9.149014984551357e-05,
"loss": 0.1761,
"step": 1459
},
{
"epoch": 0.5295611171563294,
"grad_norm": 0.21452195942401886,
"learning_rate": 9.137620579359685e-05,
"loss": 0.1778,
"step": 1460
},
{
"epoch": 0.529923830250272,
"grad_norm": 0.20922896265983582,
"learning_rate": 9.126227302094417e-05,
"loss": 0.2186,
"step": 1461
},
{
"epoch": 0.5302865433442148,
"grad_norm": 0.15859532356262207,
"learning_rate": 9.114835167657091e-05,
"loss": 0.1829,
"step": 1462
},
{
"epoch": 0.5306492564381574,
"grad_norm": 0.17610323429107666,
"learning_rate": 9.103444190947746e-05,
"loss": 0.2027,
"step": 1463
},
{
"epoch": 0.5310119695321001,
"grad_norm": 0.17557282745838165,
"learning_rate": 9.092054386864912e-05,
"loss": 0.1747,
"step": 1464
},
{
"epoch": 0.5313746826260428,
"grad_norm": 0.19372673332691193,
"learning_rate": 9.080665770305578e-05,
"loss": 0.1644,
"step": 1465
},
{
"epoch": 0.5317373957199855,
"grad_norm": 0.20970730483531952,
"learning_rate": 9.069278356165187e-05,
"loss": 0.2032,
"step": 1466
},
{
"epoch": 0.5321001088139282,
"grad_norm": 0.2470318228006363,
"learning_rate": 9.057892159337612e-05,
"loss": 0.2121,
"step": 1467
},
{
"epoch": 0.5324628219078709,
"grad_norm": 0.1663379967212677,
"learning_rate": 9.046507194715121e-05,
"loss": 0.1741,
"step": 1468
},
{
"epoch": 0.5328255350018135,
"grad_norm": 0.1842135637998581,
"learning_rate": 9.035123477188381e-05,
"loss": 0.1793,
"step": 1469
},
{
"epoch": 0.5331882480957563,
"grad_norm": 0.19390299916267395,
"learning_rate": 9.023741021646427e-05,
"loss": 0.2071,
"step": 1470
},
{
"epoch": 0.5335509611896989,
"grad_norm": 0.17016194760799408,
"learning_rate": 9.012359842976638e-05,
"loss": 0.1718,
"step": 1471
},
{
"epoch": 0.5339136742836417,
"grad_norm": 0.19337502121925354,
"learning_rate": 9.000979956064725e-05,
"loss": 0.2095,
"step": 1472
},
{
"epoch": 0.5342763873775843,
"grad_norm": 0.21092645823955536,
"learning_rate": 8.989601375794717e-05,
"loss": 0.1854,
"step": 1473
},
{
"epoch": 0.5346391004715271,
"grad_norm": 0.18103566765785217,
"learning_rate": 8.978224117048925e-05,
"loss": 0.1829,
"step": 1474
},
{
"epoch": 0.5350018135654697,
"grad_norm": 0.17190292477607727,
"learning_rate": 8.966848194707931e-05,
"loss": 0.1733,
"step": 1475
},
{
"epoch": 0.5353645266594124,
"grad_norm": 0.18108366429805756,
"learning_rate": 8.955473623650578e-05,
"loss": 0.2058,
"step": 1476
},
{
"epoch": 0.5357272397533551,
"grad_norm": 0.16649720072746277,
"learning_rate": 8.944100418753931e-05,
"loss": 0.1744,
"step": 1477
},
{
"epoch": 0.5360899528472978,
"grad_norm": 0.15770559012889862,
"learning_rate": 8.932728594893271e-05,
"loss": 0.1775,
"step": 1478
},
{
"epoch": 0.5364526659412405,
"grad_norm": 0.1907668113708496,
"learning_rate": 8.921358166942084e-05,
"loss": 0.1766,
"step": 1479
},
{
"epoch": 0.5368153790351832,
"grad_norm": 0.18284808099269867,
"learning_rate": 8.909989149772015e-05,
"loss": 0.1708,
"step": 1480
},
{
"epoch": 0.5371780921291258,
"grad_norm": 0.20297999680042267,
"learning_rate": 8.898621558252874e-05,
"loss": 0.165,
"step": 1481
},
{
"epoch": 0.5375408052230686,
"grad_norm": 0.22023969888687134,
"learning_rate": 8.887255407252596e-05,
"loss": 0.1668,
"step": 1482
},
{
"epoch": 0.5379035183170112,
"grad_norm": 0.17669132351875305,
"learning_rate": 8.875890711637243e-05,
"loss": 0.2046,
"step": 1483
},
{
"epoch": 0.538266231410954,
"grad_norm": 0.17783772945404053,
"learning_rate": 8.864527486270964e-05,
"loss": 0.1648,
"step": 1484
},
{
"epoch": 0.5386289445048966,
"grad_norm": 0.171718031167984,
"learning_rate": 8.853165746015997e-05,
"loss": 0.1897,
"step": 1485
},
{
"epoch": 0.5389916575988393,
"grad_norm": 0.16997992992401123,
"learning_rate": 8.841805505732626e-05,
"loss": 0.167,
"step": 1486
},
{
"epoch": 0.539354370692782,
"grad_norm": 0.1764468550682068,
"learning_rate": 8.830446780279176e-05,
"loss": 0.1659,
"step": 1487
},
{
"epoch": 0.5397170837867247,
"grad_norm": 0.18435722589492798,
"learning_rate": 8.819089584511996e-05,
"loss": 0.1754,
"step": 1488
},
{
"epoch": 0.5400797968806674,
"grad_norm": 0.19305875897407532,
"learning_rate": 8.807733933285429e-05,
"loss": 0.1918,
"step": 1489
},
{
"epoch": 0.5404425099746101,
"grad_norm": 0.1882489174604416,
"learning_rate": 8.796379841451796e-05,
"loss": 0.1906,
"step": 1490
},
{
"epoch": 0.5408052230685527,
"grad_norm": 0.14983880519866943,
"learning_rate": 8.785027323861386e-05,
"loss": 0.1552,
"step": 1491
},
{
"epoch": 0.5411679361624955,
"grad_norm": 0.16522106528282166,
"learning_rate": 8.773676395362425e-05,
"loss": 0.1761,
"step": 1492
},
{
"epoch": 0.5415306492564381,
"grad_norm": 0.17727860808372498,
"learning_rate": 8.76232707080106e-05,
"loss": 0.1631,
"step": 1493
},
{
"epoch": 0.5418933623503809,
"grad_norm": 0.1912899911403656,
"learning_rate": 8.750979365021338e-05,
"loss": 0.1804,
"step": 1494
},
{
"epoch": 0.5422560754443235,
"grad_norm": 0.185381218791008,
"learning_rate": 8.739633292865192e-05,
"loss": 0.1831,
"step": 1495
},
{
"epoch": 0.5426187885382663,
"grad_norm": 0.18887324631214142,
"learning_rate": 8.728288869172421e-05,
"loss": 0.178,
"step": 1496
},
{
"epoch": 0.5429815016322089,
"grad_norm": 0.1737644374370575,
"learning_rate": 8.716946108780655e-05,
"loss": 0.1769,
"step": 1497
},
{
"epoch": 0.5433442147261516,
"grad_norm": 0.18002916872501373,
"learning_rate": 8.705605026525371e-05,
"loss": 0.1599,
"step": 1498
},
{
"epoch": 0.5437069278200943,
"grad_norm": 0.18868666887283325,
"learning_rate": 8.694265637239831e-05,
"loss": 0.1661,
"step": 1499
},
{
"epoch": 0.544069640914037,
"grad_norm": 0.20771367847919464,
"learning_rate": 8.682927955755093e-05,
"loss": 0.1839,
"step": 1500
},
{
"epoch": 0.5444323540079797,
"grad_norm": 0.1799492985010147,
"learning_rate": 8.671591996899974e-05,
"loss": 0.1782,
"step": 1501
},
{
"epoch": 0.5447950671019224,
"grad_norm": 0.17485234141349792,
"learning_rate": 8.660257775501045e-05,
"loss": 0.1698,
"step": 1502
},
{
"epoch": 0.545157780195865,
"grad_norm": 0.17470629513263702,
"learning_rate": 8.6489253063826e-05,
"loss": 0.1695,
"step": 1503
},
{
"epoch": 0.5455204932898078,
"grad_norm": 0.17630697786808014,
"learning_rate": 8.637594604366647e-05,
"loss": 0.175,
"step": 1504
},
{
"epoch": 0.5458832063837504,
"grad_norm": 0.19793953001499176,
"learning_rate": 8.626265684272876e-05,
"loss": 0.1798,
"step": 1505
},
{
"epoch": 0.5462459194776932,
"grad_norm": 0.19965516030788422,
"learning_rate": 8.614938560918649e-05,
"loss": 0.2011,
"step": 1506
},
{
"epoch": 0.5466086325716358,
"grad_norm": 0.18119129538536072,
"learning_rate": 8.603613249118977e-05,
"loss": 0.1624,
"step": 1507
},
{
"epoch": 0.5469713456655785,
"grad_norm": 0.19433656334877014,
"learning_rate": 8.592289763686505e-05,
"loss": 0.1842,
"step": 1508
},
{
"epoch": 0.5473340587595212,
"grad_norm": 0.17872895300388336,
"learning_rate": 8.580968119431483e-05,
"loss": 0.1628,
"step": 1509
},
{
"epoch": 0.5476967718534639,
"grad_norm": 0.18134737014770508,
"learning_rate": 8.569648331161762e-05,
"loss": 0.1649,
"step": 1510
},
{
"epoch": 0.5480594849474066,
"grad_norm": 0.19080941379070282,
"learning_rate": 8.558330413682759e-05,
"loss": 0.1856,
"step": 1511
},
{
"epoch": 0.5484221980413493,
"grad_norm": 0.20772339403629303,
"learning_rate": 8.547014381797445e-05,
"loss": 0.1904,
"step": 1512
},
{
"epoch": 0.5487849111352919,
"grad_norm": 0.1807977259159088,
"learning_rate": 8.535700250306322e-05,
"loss": 0.1719,
"step": 1513
},
{
"epoch": 0.5491476242292347,
"grad_norm": 0.18353581428527832,
"learning_rate": 8.524388034007415e-05,
"loss": 0.1758,
"step": 1514
},
{
"epoch": 0.5495103373231773,
"grad_norm": 0.22524112462997437,
"learning_rate": 8.51307774769623e-05,
"loss": 0.1821,
"step": 1515
},
{
"epoch": 0.5498730504171201,
"grad_norm": 0.17495766282081604,
"learning_rate": 8.501769406165769e-05,
"loss": 0.2193,
"step": 1516
},
{
"epoch": 0.5502357635110627,
"grad_norm": 0.17903603613376617,
"learning_rate": 8.490463024206474e-05,
"loss": 0.1687,
"step": 1517
},
{
"epoch": 0.5505984766050055,
"grad_norm": 0.1783863753080368,
"learning_rate": 8.479158616606228e-05,
"loss": 0.1699,
"step": 1518
},
{
"epoch": 0.5509611896989481,
"grad_norm": 0.17774266004562378,
"learning_rate": 8.467856198150333e-05,
"loss": 0.1946,
"step": 1519
},
{
"epoch": 0.5513239027928908,
"grad_norm": 0.20432449877262115,
"learning_rate": 8.45655578362149e-05,
"loss": 0.193,
"step": 1520
},
{
"epoch": 0.5516866158868335,
"grad_norm": 0.1733636111021042,
"learning_rate": 8.44525738779977e-05,
"loss": 0.1712,
"step": 1521
},
{
"epoch": 0.5520493289807762,
"grad_norm": 0.19748555123806,
"learning_rate": 8.433961025462624e-05,
"loss": 0.1969,
"step": 1522
},
{
"epoch": 0.552412042074719,
"grad_norm": 0.18513956665992737,
"learning_rate": 8.422666711384827e-05,
"loss": 0.1735,
"step": 1523
},
{
"epoch": 0.5527747551686616,
"grad_norm": 0.22357869148254395,
"learning_rate": 8.411374460338474e-05,
"loss": 0.1725,
"step": 1524
},
{
"epoch": 0.5531374682626042,
"grad_norm": 0.18229088187217712,
"learning_rate": 8.400084287092973e-05,
"loss": 0.1724,
"step": 1525
},
{
"epoch": 0.553500181356547,
"grad_norm": 0.15753042697906494,
"learning_rate": 8.388796206415004e-05,
"loss": 0.1762,
"step": 1526
},
{
"epoch": 0.5538628944504896,
"grad_norm": 0.18276041746139526,
"learning_rate": 8.377510233068518e-05,
"loss": 0.1862,
"step": 1527
},
{
"epoch": 0.5542256075444324,
"grad_norm": 0.2091018706560135,
"learning_rate": 8.366226381814697e-05,
"loss": 0.1722,
"step": 1528
},
{
"epoch": 0.554588320638375,
"grad_norm": 0.1851229965686798,
"learning_rate": 8.354944667411968e-05,
"loss": 0.174,
"step": 1529
},
{
"epoch": 0.5549510337323177,
"grad_norm": 0.18812698125839233,
"learning_rate": 8.343665104615948e-05,
"loss": 0.192,
"step": 1530
},
{
"epoch": 0.5553137468262604,
"grad_norm": 0.18323373794555664,
"learning_rate": 8.332387708179441e-05,
"loss": 0.185,
"step": 1531
},
{
"epoch": 0.5556764599202031,
"grad_norm": 0.187171071767807,
"learning_rate": 8.321112492852422e-05,
"loss": 0.18,
"step": 1532
},
{
"epoch": 0.5560391730141458,
"grad_norm": 0.18064919114112854,
"learning_rate": 8.30983947338201e-05,
"loss": 0.1739,
"step": 1533
},
{
"epoch": 0.5564018861080885,
"grad_norm": 0.1815587282180786,
"learning_rate": 8.29856866451245e-05,
"loss": 0.1818,
"step": 1534
},
{
"epoch": 0.5567645992020311,
"grad_norm": 0.19945740699768066,
"learning_rate": 8.287300080985106e-05,
"loss": 0.2014,
"step": 1535
},
{
"epoch": 0.5571273122959739,
"grad_norm": 0.1874108761548996,
"learning_rate": 8.276033737538424e-05,
"loss": 0.1719,
"step": 1536
},
{
"epoch": 0.5574900253899165,
"grad_norm": 0.173946350812912,
"learning_rate": 8.264769648907915e-05,
"loss": 0.1616,
"step": 1537
},
{
"epoch": 0.5578527384838593,
"grad_norm": 0.20264151692390442,
"learning_rate": 8.25350782982615e-05,
"loss": 0.1815,
"step": 1538
},
{
"epoch": 0.558215451577802,
"grad_norm": 0.17723354697227478,
"learning_rate": 8.242248295022727e-05,
"loss": 0.1869,
"step": 1539
},
{
"epoch": 0.5585781646717447,
"grad_norm": 0.16882532835006714,
"learning_rate": 8.230991059224257e-05,
"loss": 0.1593,
"step": 1540
},
{
"epoch": 0.5589408777656873,
"grad_norm": 0.17361445724964142,
"learning_rate": 8.219736137154347e-05,
"loss": 0.1696,
"step": 1541
},
{
"epoch": 0.55930359085963,
"grad_norm": 0.1865490823984146,
"learning_rate": 8.208483543533573e-05,
"loss": 0.2033,
"step": 1542
},
{
"epoch": 0.5596663039535728,
"grad_norm": 0.17689920961856842,
"learning_rate": 8.197233293079468e-05,
"loss": 0.1679,
"step": 1543
},
{
"epoch": 0.5600290170475154,
"grad_norm": 0.18286365270614624,
"learning_rate": 8.185985400506502e-05,
"loss": 0.1654,
"step": 1544
},
{
"epoch": 0.5603917301414582,
"grad_norm": 0.18033449351787567,
"learning_rate": 8.174739880526057e-05,
"loss": 0.1814,
"step": 1545
},
{
"epoch": 0.5607544432354008,
"grad_norm": 0.17507143318653107,
"learning_rate": 8.163496747846411e-05,
"loss": 0.1669,
"step": 1546
},
{
"epoch": 0.5611171563293434,
"grad_norm": 0.16485197842121124,
"learning_rate": 8.152256017172732e-05,
"loss": 0.1666,
"step": 1547
},
{
"epoch": 0.5614798694232862,
"grad_norm": 0.18058069050312042,
"learning_rate": 8.14101770320703e-05,
"loss": 0.1626,
"step": 1548
},
{
"epoch": 0.5618425825172289,
"grad_norm": 0.17364412546157837,
"learning_rate": 8.129781820648164e-05,
"loss": 0.1913,
"step": 1549
},
{
"epoch": 0.5622052956111716,
"grad_norm": 0.18617358803749084,
"learning_rate": 8.118548384191809e-05,
"loss": 0.1844,
"step": 1550
},
{
"epoch": 0.5625680087051143,
"grad_norm": 0.17379792034626007,
"learning_rate": 8.107317408530441e-05,
"loss": 0.1657,
"step": 1551
},
{
"epoch": 0.5629307217990569,
"grad_norm": 0.1696668565273285,
"learning_rate": 8.096088908353315e-05,
"loss": 0.1663,
"step": 1552
},
{
"epoch": 0.5632934348929997,
"grad_norm": 0.16332849860191345,
"learning_rate": 8.084862898346459e-05,
"loss": 0.1707,
"step": 1553
},
{
"epoch": 0.5636561479869423,
"grad_norm": 0.20836418867111206,
"learning_rate": 8.073639393192634e-05,
"loss": 0.1849,
"step": 1554
},
{
"epoch": 0.5640188610808851,
"grad_norm": 0.1766640543937683,
"learning_rate": 8.062418407571326e-05,
"loss": 0.1593,
"step": 1555
},
{
"epoch": 0.5643815741748277,
"grad_norm": 0.1723148226737976,
"learning_rate": 8.051199956158727e-05,
"loss": 0.1753,
"step": 1556
},
{
"epoch": 0.5647442872687704,
"grad_norm": 0.17197547852993011,
"learning_rate": 8.039984053627714e-05,
"loss": 0.1664,
"step": 1557
},
{
"epoch": 0.5651070003627131,
"grad_norm": 0.17370520532131195,
"learning_rate": 8.02877071464783e-05,
"loss": 0.1767,
"step": 1558
},
{
"epoch": 0.5654697134566558,
"grad_norm": 0.18708960711956024,
"learning_rate": 8.017559953885267e-05,
"loss": 0.1951,
"step": 1559
},
{
"epoch": 0.5658324265505985,
"grad_norm": 0.21225912868976593,
"learning_rate": 8.006351786002846e-05,
"loss": 0.1752,
"step": 1560
},
{
"epoch": 0.5661951396445412,
"grad_norm": 0.17883837223052979,
"learning_rate": 7.995146225659994e-05,
"loss": 0.1665,
"step": 1561
},
{
"epoch": 0.5665578527384839,
"grad_norm": 0.16992917656898499,
"learning_rate": 7.98394328751272e-05,
"loss": 0.1691,
"step": 1562
},
{
"epoch": 0.5669205658324266,
"grad_norm": 0.18541240692138672,
"learning_rate": 7.972742986213623e-05,
"loss": 0.1818,
"step": 1563
},
{
"epoch": 0.5672832789263692,
"grad_norm": 0.17470984160900116,
"learning_rate": 7.961545336411836e-05,
"loss": 0.1715,
"step": 1564
},
{
"epoch": 0.567645992020312,
"grad_norm": 0.21040913462638855,
"learning_rate": 7.950350352753023e-05,
"loss": 0.1873,
"step": 1565
},
{
"epoch": 0.5680087051142546,
"grad_norm": 0.17107225954532623,
"learning_rate": 7.93915804987938e-05,
"loss": 0.1559,
"step": 1566
},
{
"epoch": 0.5683714182081974,
"grad_norm": 0.16713112592697144,
"learning_rate": 7.927968442429576e-05,
"loss": 0.1734,
"step": 1567
},
{
"epoch": 0.56873413130214,
"grad_norm": 0.18837302923202515,
"learning_rate": 7.916781545038767e-05,
"loss": 0.167,
"step": 1568
},
{
"epoch": 0.5690968443960827,
"grad_norm": 0.17015686631202698,
"learning_rate": 7.905597372338558e-05,
"loss": 0.1703,
"step": 1569
},
{
"epoch": 0.5694595574900254,
"grad_norm": 0.17552775144577026,
"learning_rate": 7.894415938956991e-05,
"loss": 0.1623,
"step": 1570
},
{
"epoch": 0.5698222705839681,
"grad_norm": 0.1910295933485031,
"learning_rate": 7.883237259518526e-05,
"loss": 0.1642,
"step": 1571
},
{
"epoch": 0.5701849836779108,
"grad_norm": 0.19286568462848663,
"learning_rate": 7.872061348644028e-05,
"loss": 0.1776,
"step": 1572
},
{
"epoch": 0.5705476967718535,
"grad_norm": 0.17776118218898773,
"learning_rate": 7.86088822095073e-05,
"loss": 0.167,
"step": 1573
},
{
"epoch": 0.5709104098657961,
"grad_norm": 0.1805812269449234,
"learning_rate": 7.84971789105223e-05,
"loss": 0.1666,
"step": 1574
},
{
"epoch": 0.5712731229597389,
"grad_norm": 0.3048454523086548,
"learning_rate": 7.838550373558469e-05,
"loss": 0.2252,
"step": 1575
},
{
"epoch": 0.5716358360536815,
"grad_norm": 0.18575210869312286,
"learning_rate": 7.827385683075701e-05,
"loss": 0.1673,
"step": 1576
},
{
"epoch": 0.5719985491476243,
"grad_norm": 0.19140534102916718,
"learning_rate": 7.816223834206489e-05,
"loss": 0.1651,
"step": 1577
},
{
"epoch": 0.5723612622415669,
"grad_norm": 0.15774936974048615,
"learning_rate": 7.805064841549685e-05,
"loss": 0.1579,
"step": 1578
},
{
"epoch": 0.5727239753355096,
"grad_norm": 0.16118699312210083,
"learning_rate": 7.793908719700396e-05,
"loss": 0.1656,
"step": 1579
},
{
"epoch": 0.5730866884294523,
"grad_norm": 0.19020985066890717,
"learning_rate": 7.782755483249973e-05,
"loss": 0.1775,
"step": 1580
},
{
"epoch": 0.573449401523395,
"grad_norm": 0.1851213425397873,
"learning_rate": 7.771605146786003e-05,
"loss": 0.1876,
"step": 1581
},
{
"epoch": 0.5738121146173377,
"grad_norm": 0.17101642489433289,
"learning_rate": 7.760457724892272e-05,
"loss": 0.1714,
"step": 1582
},
{
"epoch": 0.5741748277112804,
"grad_norm": 0.17683084309101105,
"learning_rate": 7.749313232148753e-05,
"loss": 0.166,
"step": 1583
},
{
"epoch": 0.5745375408052231,
"grad_norm": 0.1966182291507721,
"learning_rate": 7.738171683131594e-05,
"loss": 0.1763,
"step": 1584
},
{
"epoch": 0.5749002538991658,
"grad_norm": 0.1787012666463852,
"learning_rate": 7.727033092413094e-05,
"loss": 0.1621,
"step": 1585
},
{
"epoch": 0.5752629669931084,
"grad_norm": 0.18337036669254303,
"learning_rate": 7.715897474561675e-05,
"loss": 0.1568,
"step": 1586
},
{
"epoch": 0.5756256800870512,
"grad_norm": 0.18342240154743195,
"learning_rate": 7.704764844141873e-05,
"loss": 0.1722,
"step": 1587
},
{
"epoch": 0.5759883931809938,
"grad_norm": 0.17828598618507385,
"learning_rate": 7.693635215714322e-05,
"loss": 0.1562,
"step": 1588
},
{
"epoch": 0.5763511062749366,
"grad_norm": 0.19080400466918945,
"learning_rate": 7.682508603835722e-05,
"loss": 0.1783,
"step": 1589
},
{
"epoch": 0.5767138193688792,
"grad_norm": 0.16964450478553772,
"learning_rate": 7.67138502305883e-05,
"loss": 0.1726,
"step": 1590
},
{
"epoch": 0.5770765324628219,
"grad_norm": 0.19029711186885834,
"learning_rate": 7.660264487932444e-05,
"loss": 0.1574,
"step": 1591
},
{
"epoch": 0.5774392455567646,
"grad_norm": 0.21546104550361633,
"learning_rate": 7.649147013001376e-05,
"loss": 0.1691,
"step": 1592
},
{
"epoch": 0.5778019586507073,
"grad_norm": 0.17420600354671478,
"learning_rate": 7.63803261280643e-05,
"loss": 0.1612,
"step": 1593
},
{
"epoch": 0.57816467174465,
"grad_norm": 0.18015912175178528,
"learning_rate": 7.626921301884395e-05,
"loss": 0.1622,
"step": 1594
},
{
"epoch": 0.5785273848385927,
"grad_norm": 0.16851022839546204,
"learning_rate": 7.615813094768012e-05,
"loss": 0.1642,
"step": 1595
},
{
"epoch": 0.5788900979325353,
"grad_norm": 0.1783701479434967,
"learning_rate": 7.604708005985971e-05,
"loss": 0.1726,
"step": 1596
},
{
"epoch": 0.5792528110264781,
"grad_norm": 0.16931217908859253,
"learning_rate": 7.593606050062881e-05,
"loss": 0.1608,
"step": 1597
},
{
"epoch": 0.5796155241204207,
"grad_norm": 0.16568873822689056,
"learning_rate": 7.582507241519252e-05,
"loss": 0.163,
"step": 1598
},
{
"epoch": 0.5799782372143635,
"grad_norm": 0.16731184720993042,
"learning_rate": 7.571411594871474e-05,
"loss": 0.2004,
"step": 1599
},
{
"epoch": 0.5803409503083061,
"grad_norm": 0.2044878900051117,
"learning_rate": 7.56031912463181e-05,
"loss": 0.1608,
"step": 1600
},
{
"epoch": 0.5807036634022488,
"grad_norm": 0.2527421712875366,
"learning_rate": 7.549229845308362e-05,
"loss": 0.1948,
"step": 1601
},
{
"epoch": 0.5810663764961915,
"grad_norm": 0.16458679735660553,
"learning_rate": 7.538143771405055e-05,
"loss": 0.1495,
"step": 1602
},
{
"epoch": 0.5814290895901342,
"grad_norm": 0.16658927500247955,
"learning_rate": 7.527060917421635e-05,
"loss": 0.1555,
"step": 1603
},
{
"epoch": 0.5817918026840769,
"grad_norm": 0.17401687800884247,
"learning_rate": 7.515981297853626e-05,
"loss": 0.1702,
"step": 1604
},
{
"epoch": 0.5821545157780196,
"grad_norm": 0.17915883660316467,
"learning_rate": 7.504904927192322e-05,
"loss": 0.175,
"step": 1605
},
{
"epoch": 0.5825172288719622,
"grad_norm": 0.18019749224185944,
"learning_rate": 7.493831819924772e-05,
"loss": 0.1703,
"step": 1606
},
{
"epoch": 0.582879941965905,
"grad_norm": 0.18500368297100067,
"learning_rate": 7.482761990533752e-05,
"loss": 0.1741,
"step": 1607
},
{
"epoch": 0.5832426550598476,
"grad_norm": 0.18486149609088898,
"learning_rate": 7.47169545349775e-05,
"loss": 0.1508,
"step": 1608
},
{
"epoch": 0.5836053681537904,
"grad_norm": 0.2042957842350006,
"learning_rate": 7.46063222329096e-05,
"loss": 0.173,
"step": 1609
},
{
"epoch": 0.583968081247733,
"grad_norm": 0.19605065882205963,
"learning_rate": 7.449572314383237e-05,
"loss": 0.1865,
"step": 1610
},
{
"epoch": 0.5843307943416758,
"grad_norm": 0.18277035653591156,
"learning_rate": 7.438515741240097e-05,
"loss": 0.195,
"step": 1611
},
{
"epoch": 0.5846935074356184,
"grad_norm": 0.18347297608852386,
"learning_rate": 7.427462518322693e-05,
"loss": 0.1579,
"step": 1612
},
{
"epoch": 0.5850562205295611,
"grad_norm": 0.1746947020292282,
"learning_rate": 7.416412660087796e-05,
"loss": 0.1951,
"step": 1613
},
{
"epoch": 0.5854189336235038,
"grad_norm": 0.1751972883939743,
"learning_rate": 7.405366180987775e-05,
"loss": 0.1633,
"step": 1614
},
{
"epoch": 0.5857816467174465,
"grad_norm": 0.17814141511917114,
"learning_rate": 7.394323095470586e-05,
"loss": 0.1845,
"step": 1615
},
{
"epoch": 0.5861443598113892,
"grad_norm": 0.1747366487979889,
"learning_rate": 7.383283417979739e-05,
"loss": 0.1777,
"step": 1616
},
{
"epoch": 0.5865070729053319,
"grad_norm": 0.177615687251091,
"learning_rate": 7.372247162954282e-05,
"loss": 0.1691,
"step": 1617
},
{
"epoch": 0.5868697859992745,
"grad_norm": 0.1927955448627472,
"learning_rate": 7.361214344828805e-05,
"loss": 0.164,
"step": 1618
},
{
"epoch": 0.5872324990932173,
"grad_norm": 0.17188555002212524,
"learning_rate": 7.350184978033386e-05,
"loss": 0.1704,
"step": 1619
},
{
"epoch": 0.5875952121871599,
"grad_norm": 0.25001007318496704,
"learning_rate": 7.339159076993592e-05,
"loss": 0.2025,
"step": 1620
},
{
"epoch": 0.5879579252811027,
"grad_norm": 0.18958470225334167,
"learning_rate": 7.328136656130458e-05,
"loss": 0.1793,
"step": 1621
},
{
"epoch": 0.5883206383750453,
"grad_norm": 0.18085351586341858,
"learning_rate": 7.317117729860475e-05,
"loss": 0.1669,
"step": 1622
},
{
"epoch": 0.588683351468988,
"grad_norm": 0.18232987821102142,
"learning_rate": 7.306102312595553e-05,
"loss": 0.1649,
"step": 1623
},
{
"epoch": 0.5890460645629307,
"grad_norm": 0.17970141768455505,
"learning_rate": 7.295090418743018e-05,
"loss": 0.1757,
"step": 1624
},
{
"epoch": 0.5894087776568734,
"grad_norm": 0.1799871325492859,
"learning_rate": 7.284082062705584e-05,
"loss": 0.1716,
"step": 1625
},
{
"epoch": 0.5897714907508161,
"grad_norm": 0.1792754977941513,
"learning_rate": 7.273077258881342e-05,
"loss": 0.1825,
"step": 1626
},
{
"epoch": 0.5901342038447588,
"grad_norm": 0.17742280662059784,
"learning_rate": 7.262076021663727e-05,
"loss": 0.159,
"step": 1627
},
{
"epoch": 0.5904969169387014,
"grad_norm": 0.20353969931602478,
"learning_rate": 7.251078365441528e-05,
"loss": 0.1597,
"step": 1628
},
{
"epoch": 0.5908596300326442,
"grad_norm": 0.18415038287639618,
"learning_rate": 7.240084304598835e-05,
"loss": 0.1774,
"step": 1629
},
{
"epoch": 0.5912223431265868,
"grad_norm": 0.18927162885665894,
"learning_rate": 7.229093853515038e-05,
"loss": 0.1628,
"step": 1630
},
{
"epoch": 0.5915850562205296,
"grad_norm": 0.1826174110174179,
"learning_rate": 7.21810702656481e-05,
"loss": 0.1604,
"step": 1631
},
{
"epoch": 0.5919477693144722,
"grad_norm": 0.17375624179840088,
"learning_rate": 7.207123838118077e-05,
"loss": 0.1647,
"step": 1632
},
{
"epoch": 0.592310482408415,
"grad_norm": 0.1889926791191101,
"learning_rate": 7.196144302540014e-05,
"loss": 0.1882,
"step": 1633
},
{
"epoch": 0.5926731955023576,
"grad_norm": 0.17155472934246063,
"learning_rate": 7.185168434191014e-05,
"loss": 0.1552,
"step": 1634
},
{
"epoch": 0.5930359085963003,
"grad_norm": 0.18929725885391235,
"learning_rate": 7.174196247426677e-05,
"loss": 0.163,
"step": 1635
},
{
"epoch": 0.593398621690243,
"grad_norm": 0.18491095304489136,
"learning_rate": 7.163227756597779e-05,
"loss": 0.172,
"step": 1636
},
{
"epoch": 0.5937613347841857,
"grad_norm": 0.19160285592079163,
"learning_rate": 7.152262976050275e-05,
"loss": 0.1642,
"step": 1637
},
{
"epoch": 0.5941240478781284,
"grad_norm": 0.18393130600452423,
"learning_rate": 7.141301920125256e-05,
"loss": 0.1504,
"step": 1638
},
{
"epoch": 0.5944867609720711,
"grad_norm": 0.1797264665365219,
"learning_rate": 7.130344603158942e-05,
"loss": 0.1607,
"step": 1639
},
{
"epoch": 0.5948494740660137,
"grad_norm": 0.16639918088912964,
"learning_rate": 7.119391039482677e-05,
"loss": 0.1637,
"step": 1640
},
{
"epoch": 0.5952121871599565,
"grad_norm": 0.17723850905895233,
"learning_rate": 7.10844124342288e-05,
"loss": 0.1695,
"step": 1641
},
{
"epoch": 0.5955749002538991,
"grad_norm": 0.1672993302345276,
"learning_rate": 7.097495229301048e-05,
"loss": 0.1596,
"step": 1642
},
{
"epoch": 0.5959376133478419,
"grad_norm": 0.18969713151454926,
"learning_rate": 7.08655301143373e-05,
"loss": 0.1658,
"step": 1643
},
{
"epoch": 0.5963003264417845,
"grad_norm": 0.18681742250919342,
"learning_rate": 7.075614604132512e-05,
"loss": 0.1822,
"step": 1644
},
{
"epoch": 0.5966630395357272,
"grad_norm": 0.22509360313415527,
"learning_rate": 7.064680021703992e-05,
"loss": 0.1951,
"step": 1645
},
{
"epoch": 0.59702575262967,
"grad_norm": 0.1588478535413742,
"learning_rate": 7.053749278449774e-05,
"loss": 0.1643,
"step": 1646
},
{
"epoch": 0.5973884657236126,
"grad_norm": 0.1908983290195465,
"learning_rate": 7.042822388666436e-05,
"loss": 0.1674,
"step": 1647
},
{
"epoch": 0.5977511788175554,
"grad_norm": 0.19821012020111084,
"learning_rate": 7.031899366645511e-05,
"loss": 0.1817,
"step": 1648
},
{
"epoch": 0.598113891911498,
"grad_norm": 0.18674594163894653,
"learning_rate": 7.020980226673477e-05,
"loss": 0.1547,
"step": 1649
},
{
"epoch": 0.5984766050054406,
"grad_norm": 0.2012438029050827,
"learning_rate": 7.010064983031737e-05,
"loss": 0.1793,
"step": 1650
},
{
"epoch": 0.5988393180993834,
"grad_norm": 0.18832942843437195,
"learning_rate": 6.999153649996595e-05,
"loss": 0.1797,
"step": 1651
},
{
"epoch": 0.599202031193326,
"grad_norm": 0.20757931470870972,
"learning_rate": 6.98824624183924e-05,
"loss": 0.174,
"step": 1652
},
{
"epoch": 0.5995647442872688,
"grad_norm": 0.1787773221731186,
"learning_rate": 6.977342772825732e-05,
"loss": 0.1577,
"step": 1653
},
{
"epoch": 0.5999274573812114,
"grad_norm": 0.18228726089000702,
"learning_rate": 6.966443257216971e-05,
"loss": 0.1834,
"step": 1654
},
{
"epoch": 0.6002901704751542,
"grad_norm": 10869.5341796875,
"learning_rate": 6.955547709268697e-05,
"loss": 0.1647,
"step": 1655
},
{
"epoch": 0.6006528835690969,
"grad_norm": 0.18677209317684174,
"learning_rate": 6.94465614323145e-05,
"loss": 0.1921,
"step": 1656
},
{
"epoch": 0.6010155966630395,
"grad_norm": 0.21163515746593475,
"learning_rate": 6.933768573350567e-05,
"loss": 0.171,
"step": 1657
},
{
"epoch": 0.6013783097569823,
"grad_norm": 0.1897449642419815,
"learning_rate": 6.922885013866153e-05,
"loss": 0.1877,
"step": 1658
},
{
"epoch": 0.6017410228509249,
"grad_norm": 0.20126648247241974,
"learning_rate": 6.912005479013082e-05,
"loss": 0.2154,
"step": 1659
},
{
"epoch": 0.6021037359448677,
"grad_norm": 0.21092937886714935,
"learning_rate": 6.901129983020948e-05,
"loss": 0.1868,
"step": 1660
},
{
"epoch": 0.6024664490388103,
"grad_norm": 0.23496972024440765,
"learning_rate": 6.890258540114074e-05,
"loss": 0.1784,
"step": 1661
},
{
"epoch": 0.602829162132753,
"grad_norm": 0.21016502380371094,
"learning_rate": 6.879391164511471e-05,
"loss": 0.1728,
"step": 1662
},
{
"epoch": 0.6031918752266957,
"grad_norm": 0.2230292558670044,
"learning_rate": 6.86852787042684e-05,
"loss": 0.1849,
"step": 1663
},
{
"epoch": 0.6035545883206384,
"grad_norm": 0.19853949546813965,
"learning_rate": 6.857668672068534e-05,
"loss": 0.1782,
"step": 1664
},
{
"epoch": 0.6039173014145811,
"grad_norm": 0.1775451898574829,
"learning_rate": 6.846813583639562e-05,
"loss": 0.1497,
"step": 1665
},
{
"epoch": 0.6042800145085238,
"grad_norm": 0.1857757419347763,
"learning_rate": 6.835962619337549e-05,
"loss": 0.1836,
"step": 1666
},
{
"epoch": 0.6046427276024664,
"grad_norm": 0.1867503970861435,
"learning_rate": 6.825115793354726e-05,
"loss": 0.1556,
"step": 1667
},
{
"epoch": 0.6050054406964092,
"grad_norm": 0.18607592582702637,
"learning_rate": 6.814273119877912e-05,
"loss": 0.2011,
"step": 1668
},
{
"epoch": 0.6053681537903518,
"grad_norm": 0.18926583230495453,
"learning_rate": 6.803434613088497e-05,
"loss": 0.1661,
"step": 1669
},
{
"epoch": 0.6057308668842946,
"grad_norm": 0.18735969066619873,
"learning_rate": 6.792600287162416e-05,
"loss": 0.1591,
"step": 1670
},
{
"epoch": 0.6060935799782372,
"grad_norm": 0.23324711620807648,
"learning_rate": 6.781770156270149e-05,
"loss": 0.1656,
"step": 1671
},
{
"epoch": 0.6064562930721799,
"grad_norm": 0.1974279284477234,
"learning_rate": 6.77094423457667e-05,
"loss": 0.1585,
"step": 1672
},
{
"epoch": 0.6068190061661226,
"grad_norm": 0.20500749349594116,
"learning_rate": 6.760122536241462e-05,
"loss": 0.164,
"step": 1673
},
{
"epoch": 0.6071817192600653,
"grad_norm": 0.16157761216163635,
"learning_rate": 6.749305075418482e-05,
"loss": 0.171,
"step": 1674
},
{
"epoch": 0.607544432354008,
"grad_norm": 0.19271859526634216,
"learning_rate": 6.738491866256138e-05,
"loss": 0.1777,
"step": 1675
},
{
"epoch": 0.6079071454479507,
"grad_norm": 0.18441638350486755,
"learning_rate": 6.727682922897282e-05,
"loss": 0.1683,
"step": 1676
},
{
"epoch": 0.6082698585418934,
"grad_norm": 0.17519617080688477,
"learning_rate": 6.716878259479189e-05,
"loss": 0.1739,
"step": 1677
},
{
"epoch": 0.6086325716358361,
"grad_norm": 0.18938271701335907,
"learning_rate": 6.706077890133531e-05,
"loss": 0.1606,
"step": 1678
},
{
"epoch": 0.6089952847297787,
"grad_norm": 0.20264668762683868,
"learning_rate": 6.695281828986369e-05,
"loss": 0.174,
"step": 1679
},
{
"epoch": 0.6093579978237215,
"grad_norm": 0.22438956797122955,
"learning_rate": 6.684490090158124e-05,
"loss": 0.1594,
"step": 1680
},
{
"epoch": 0.6097207109176641,
"grad_norm": 0.19163423776626587,
"learning_rate": 6.673702687763565e-05,
"loss": 0.1594,
"step": 1681
},
{
"epoch": 0.6100834240116069,
"grad_norm": 0.1845075786113739,
"learning_rate": 6.662919635911793e-05,
"loss": 0.173,
"step": 1682
},
{
"epoch": 0.6104461371055495,
"grad_norm": 0.18868669867515564,
"learning_rate": 6.652140948706209e-05,
"loss": 0.1786,
"step": 1683
},
{
"epoch": 0.6108088501994922,
"grad_norm": 0.22319957613945007,
"learning_rate": 6.641366640244525e-05,
"loss": 0.2068,
"step": 1684
},
{
"epoch": 0.6111715632934349,
"grad_norm": 0.18685069680213928,
"learning_rate": 6.630596724618703e-05,
"loss": 0.1751,
"step": 1685
},
{
"epoch": 0.6115342763873776,
"grad_norm": 0.18427863717079163,
"learning_rate": 6.619831215914974e-05,
"loss": 0.1707,
"step": 1686
},
{
"epoch": 0.6118969894813203,
"grad_norm": 0.19461330771446228,
"learning_rate": 6.609070128213802e-05,
"loss": 0.178,
"step": 1687
},
{
"epoch": 0.612259702575263,
"grad_norm": 0.21272696554660797,
"learning_rate": 6.598313475589863e-05,
"loss": 0.1789,
"step": 1688
},
{
"epoch": 0.6126224156692056,
"grad_norm": 0.20163173973560333,
"learning_rate": 6.58756127211204e-05,
"loss": 0.2014,
"step": 1689
},
{
"epoch": 0.6129851287631484,
"grad_norm": 0.1940133273601532,
"learning_rate": 6.576813531843396e-05,
"loss": 0.1703,
"step": 1690
},
{
"epoch": 0.613347841857091,
"grad_norm": 0.17384611070156097,
"learning_rate": 6.566070268841152e-05,
"loss": 0.1556,
"step": 1691
},
{
"epoch": 0.6137105549510338,
"grad_norm": 0.1869945228099823,
"learning_rate": 6.555331497156672e-05,
"loss": 0.1548,
"step": 1692
},
{
"epoch": 0.6140732680449764,
"grad_norm": 0.18520064651966095,
"learning_rate": 6.544597230835454e-05,
"loss": 0.1807,
"step": 1693
},
{
"epoch": 0.6144359811389191,
"grad_norm": 0.17966820299625397,
"learning_rate": 6.533867483917098e-05,
"loss": 0.1516,
"step": 1694
},
{
"epoch": 0.6147986942328618,
"grad_norm": 0.1705074906349182,
"learning_rate": 6.523142270435288e-05,
"loss": 0.1518,
"step": 1695
},
{
"epoch": 0.6151614073268045,
"grad_norm": 0.24414807558059692,
"learning_rate": 6.512421604417792e-05,
"loss": 0.2026,
"step": 1696
},
{
"epoch": 0.6155241204207472,
"grad_norm": 0.16796554625034332,
"learning_rate": 6.501705499886418e-05,
"loss": 0.1554,
"step": 1697
},
{
"epoch": 0.6158868335146899,
"grad_norm": 0.19749103486537933,
"learning_rate": 6.490993970857011e-05,
"loss": 0.1807,
"step": 1698
},
{
"epoch": 0.6162495466086326,
"grad_norm": 0.16789931058883667,
"learning_rate": 6.480287031339436e-05,
"loss": 0.1617,
"step": 1699
},
{
"epoch": 0.6166122597025753,
"grad_norm": 0.1916869580745697,
"learning_rate": 6.469584695337548e-05,
"loss": 0.188,
"step": 1700
},
{
"epoch": 0.6169749727965179,
"grad_norm": 0.19540345668792725,
"learning_rate": 6.458886976849183e-05,
"loss": 0.1743,
"step": 1701
},
{
"epoch": 0.6173376858904607,
"grad_norm": 0.17193295061588287,
"learning_rate": 6.448193889866149e-05,
"loss": 0.1763,
"step": 1702
},
{
"epoch": 0.6177003989844033,
"grad_norm": 0.17156308889389038,
"learning_rate": 6.43750544837418e-05,
"loss": 0.158,
"step": 1703
},
{
"epoch": 0.6180631120783461,
"grad_norm": 0.1796158254146576,
"learning_rate": 6.426821666352942e-05,
"loss": 0.1656,
"step": 1704
},
{
"epoch": 0.6184258251722887,
"grad_norm": 0.18700680136680603,
"learning_rate": 6.416142557776006e-05,
"loss": 0.174,
"step": 1705
},
{
"epoch": 0.6187885382662314,
"grad_norm": 0.16723744571208954,
"learning_rate": 6.405468136610832e-05,
"loss": 0.1619,
"step": 1706
},
{
"epoch": 0.6191512513601741,
"grad_norm": 0.17422862350940704,
"learning_rate": 6.394798416818739e-05,
"loss": 0.1609,
"step": 1707
},
{
"epoch": 0.6195139644541168,
"grad_norm": 0.20079629123210907,
"learning_rate": 6.384133412354918e-05,
"loss": 0.1652,
"step": 1708
},
{
"epoch": 0.6198766775480595,
"grad_norm": 0.2474866658449173,
"learning_rate": 6.373473137168373e-05,
"loss": 0.1663,
"step": 1709
},
{
"epoch": 0.6202393906420022,
"grad_norm": 0.1707204282283783,
"learning_rate": 6.36281760520193e-05,
"loss": 0.1592,
"step": 1710
},
{
"epoch": 0.6206021037359448,
"grad_norm": 0.17606933414936066,
"learning_rate": 6.352166830392213e-05,
"loss": 0.1662,
"step": 1711
},
{
"epoch": 0.6209648168298876,
"grad_norm": 0.17025688290596008,
"learning_rate": 6.341520826669621e-05,
"loss": 0.1592,
"step": 1712
},
{
"epoch": 0.6213275299238302,
"grad_norm": 0.18838566541671753,
"learning_rate": 6.330879607958314e-05,
"loss": 0.1816,
"step": 1713
},
{
"epoch": 0.621690243017773,
"grad_norm": 0.2592281103134155,
"learning_rate": 6.320243188176185e-05,
"loss": 0.2014,
"step": 1714
},
{
"epoch": 0.6220529561117156,
"grad_norm": 0.16398011147975922,
"learning_rate": 6.309611581234872e-05,
"loss": 0.1585,
"step": 1715
},
{
"epoch": 0.6224156692056583,
"grad_norm": 0.1793876439332962,
"learning_rate": 6.298984801039697e-05,
"loss": 0.1532,
"step": 1716
},
{
"epoch": 0.622778382299601,
"grad_norm": 0.1910189986228943,
"learning_rate": 6.28836286148968e-05,
"loss": 0.1666,
"step": 1717
},
{
"epoch": 0.6231410953935437,
"grad_norm": 0.20349231362342834,
"learning_rate": 6.277745776477506e-05,
"loss": 0.2075,
"step": 1718
},
{
"epoch": 0.6235038084874864,
"grad_norm": 0.19140169024467468,
"learning_rate": 6.267133559889509e-05,
"loss": 0.1574,
"step": 1719
},
{
"epoch": 0.6238665215814291,
"grad_norm": 0.18104875087738037,
"learning_rate": 6.256526225605652e-05,
"loss": 0.1594,
"step": 1720
},
{
"epoch": 0.6242292346753718,
"grad_norm": 0.18763144314289093,
"learning_rate": 6.245923787499532e-05,
"loss": 0.1613,
"step": 1721
},
{
"epoch": 0.6245919477693145,
"grad_norm": 0.16338056325912476,
"learning_rate": 6.235326259438317e-05,
"loss": 0.1823,
"step": 1722
},
{
"epoch": 0.6249546608632571,
"grad_norm": 0.1663455367088318,
"learning_rate": 6.224733655282771e-05,
"loss": 0.167,
"step": 1723
},
{
"epoch": 0.6253173739571999,
"grad_norm": 0.17179372906684875,
"learning_rate": 6.214145988887206e-05,
"loss": 0.1645,
"step": 1724
},
{
"epoch": 0.6256800870511425,
"grad_norm": 0.16161875426769257,
"learning_rate": 6.203563274099481e-05,
"loss": 0.1402,
"step": 1725
},
{
"epoch": 0.6260428001450853,
"grad_norm": 0.2017858475446701,
"learning_rate": 6.19298552476098e-05,
"loss": 0.1667,
"step": 1726
},
{
"epoch": 0.6264055132390279,
"grad_norm": 0.22198174893856049,
"learning_rate": 6.182412754706594e-05,
"loss": 0.1902,
"step": 1727
},
{
"epoch": 0.6267682263329706,
"grad_norm": 0.1705772578716278,
"learning_rate": 6.171844977764695e-05,
"loss": 0.1588,
"step": 1728
},
{
"epoch": 0.6271309394269133,
"grad_norm": 0.17019295692443848,
"learning_rate": 6.161282207757126e-05,
"loss": 0.1609,
"step": 1729
},
{
"epoch": 0.627493652520856,
"grad_norm": 0.1743742674589157,
"learning_rate": 6.15072445849919e-05,
"loss": 0.179,
"step": 1730
},
{
"epoch": 0.6278563656147987,
"grad_norm": 0.16775129735469818,
"learning_rate": 6.140171743799611e-05,
"loss": 0.1807,
"step": 1731
},
{
"epoch": 0.6282190787087414,
"grad_norm": 0.18963152170181274,
"learning_rate": 6.129624077460532e-05,
"loss": 0.2007,
"step": 1732
},
{
"epoch": 0.628581791802684,
"grad_norm": 0.182524174451828,
"learning_rate": 6.119081473277501e-05,
"loss": 0.1738,
"step": 1733
},
{
"epoch": 0.6289445048966268,
"grad_norm": 0.18262414634227753,
"learning_rate": 6.108543945039438e-05,
"loss": 0.1897,
"step": 1734
},
{
"epoch": 0.6293072179905694,
"grad_norm": 0.1729535162448883,
"learning_rate": 6.098011506528623e-05,
"loss": 0.1586,
"step": 1735
},
{
"epoch": 0.6296699310845122,
"grad_norm": 0.1677355319261551,
"learning_rate": 6.0874841715206785e-05,
"loss": 0.1871,
"step": 1736
},
{
"epoch": 0.6300326441784548,
"grad_norm": 0.17900875210762024,
"learning_rate": 6.076961953784559e-05,
"loss": 0.1595,
"step": 1737
},
{
"epoch": 0.6303953572723975,
"grad_norm": 0.18250757455825806,
"learning_rate": 6.066444867082515e-05,
"loss": 0.1842,
"step": 1738
},
{
"epoch": 0.6307580703663402,
"grad_norm": 0.17696964740753174,
"learning_rate": 6.0559329251701005e-05,
"loss": 0.1709,
"step": 1739
},
{
"epoch": 0.6311207834602829,
"grad_norm": 0.1764724850654602,
"learning_rate": 6.045426141796128e-05,
"loss": 0.161,
"step": 1740
},
{
"epoch": 0.6314834965542256,
"grad_norm": 0.17228443920612335,
"learning_rate": 6.03492453070267e-05,
"loss": 0.1579,
"step": 1741
},
{
"epoch": 0.6318462096481683,
"grad_norm": 0.17399545013904572,
"learning_rate": 6.024428105625028e-05,
"loss": 0.1555,
"step": 1742
},
{
"epoch": 0.6322089227421109,
"grad_norm": 0.1953967958688736,
"learning_rate": 6.0139368802917284e-05,
"loss": 0.2569,
"step": 1743
},
{
"epoch": 0.6325716358360537,
"grad_norm": 0.17359597980976105,
"learning_rate": 6.0034508684244875e-05,
"loss": 0.1783,
"step": 1744
},
{
"epoch": 0.6329343489299963,
"grad_norm": 0.1505521535873413,
"learning_rate": 5.992970083738212e-05,
"loss": 0.1567,
"step": 1745
},
{
"epoch": 0.6332970620239391,
"grad_norm": 0.18801428377628326,
"learning_rate": 5.982494539940966e-05,
"loss": 0.2076,
"step": 1746
},
{
"epoch": 0.6336597751178817,
"grad_norm": 0.16666316986083984,
"learning_rate": 5.97202425073396e-05,
"loss": 0.1617,
"step": 1747
},
{
"epoch": 0.6340224882118245,
"grad_norm": 0.174256831407547,
"learning_rate": 5.961559229811535e-05,
"loss": 0.167,
"step": 1748
},
{
"epoch": 0.6343852013057671,
"grad_norm": 0.16997861862182617,
"learning_rate": 5.951099490861136e-05,
"loss": 0.191,
"step": 1749
},
{
"epoch": 0.6347479143997098,
"grad_norm": 0.18059667944908142,
"learning_rate": 5.940645047563306e-05,
"loss": 0.1769,
"step": 1750
},
{
"epoch": 0.6351106274936525,
"grad_norm": 0.17815832793712616,
"learning_rate": 5.9301959135916496e-05,
"loss": 0.1406,
"step": 1751
},
{
"epoch": 0.6354733405875952,
"grad_norm": 0.1702101081609726,
"learning_rate": 5.919752102612848e-05,
"loss": 0.1471,
"step": 1752
},
{
"epoch": 0.635836053681538,
"grad_norm": 0.1625283807516098,
"learning_rate": 5.909313628286601e-05,
"loss": 0.1446,
"step": 1753
},
{
"epoch": 0.6361987667754806,
"grad_norm": 0.16857244074344635,
"learning_rate": 5.898880504265638e-05,
"loss": 0.1561,
"step": 1754
},
{
"epoch": 0.6365614798694232,
"grad_norm": 0.18340398371219635,
"learning_rate": 5.888452744195687e-05,
"loss": 0.1862,
"step": 1755
},
{
"epoch": 0.636924192963366,
"grad_norm": 0.20158030092716217,
"learning_rate": 5.878030361715461e-05,
"loss": 0.1571,
"step": 1756
},
{
"epoch": 0.6372869060573086,
"grad_norm": 0.17433685064315796,
"learning_rate": 5.867613370456636e-05,
"loss": 0.1629,
"step": 1757
},
{
"epoch": 0.6376496191512514,
"grad_norm": 0.16959048807621002,
"learning_rate": 5.857201784043851e-05,
"loss": 0.1742,
"step": 1758
},
{
"epoch": 0.638012332245194,
"grad_norm": 0.17399851977825165,
"learning_rate": 5.8467956160946604e-05,
"loss": 0.1605,
"step": 1759
},
{
"epoch": 0.6383750453391367,
"grad_norm": 0.1925593912601471,
"learning_rate": 5.8363948802195356e-05,
"loss": 0.2142,
"step": 1760
},
{
"epoch": 0.6387377584330794,
"grad_norm": 0.1870613396167755,
"learning_rate": 5.8259995900218465e-05,
"loss": 0.1619,
"step": 1761
},
{
"epoch": 0.6391004715270221,
"grad_norm": 0.18008996546268463,
"learning_rate": 5.815609759097837e-05,
"loss": 0.1594,
"step": 1762
},
{
"epoch": 0.6394631846209649,
"grad_norm": 0.1749439388513565,
"learning_rate": 5.8052254010366105e-05,
"loss": 0.1543,
"step": 1763
},
{
"epoch": 0.6398258977149075,
"grad_norm": 0.17792417109012604,
"learning_rate": 5.7948465294201194e-05,
"loss": 0.1679,
"step": 1764
},
{
"epoch": 0.6401886108088501,
"grad_norm": 0.18781551718711853,
"learning_rate": 5.7844731578231334e-05,
"loss": 0.1634,
"step": 1765
},
{
"epoch": 0.6405513239027929,
"grad_norm": 0.17064349353313446,
"learning_rate": 5.7741052998132285e-05,
"loss": 0.1547,
"step": 1766
},
{
"epoch": 0.6409140369967355,
"grad_norm": 0.15985310077667236,
"learning_rate": 5.7637429689507713e-05,
"loss": 0.1446,
"step": 1767
},
{
"epoch": 0.6412767500906783,
"grad_norm": 0.18584533035755157,
"learning_rate": 5.7533861787888995e-05,
"loss": 0.1692,
"step": 1768
},
{
"epoch": 0.641639463184621,
"grad_norm": 0.18340182304382324,
"learning_rate": 5.7430349428734995e-05,
"loss": 0.1698,
"step": 1769
},
{
"epoch": 0.6420021762785637,
"grad_norm": 0.15710604190826416,
"learning_rate": 5.732689274743204e-05,
"loss": 0.1465,
"step": 1770
},
{
"epoch": 0.6423648893725064,
"grad_norm": 0.17073456943035126,
"learning_rate": 5.7223491879293526e-05,
"loss": 0.1531,
"step": 1771
},
{
"epoch": 0.642727602466449,
"grad_norm": 0.17552490532398224,
"learning_rate": 5.712014695955991e-05,
"loss": 0.1519,
"step": 1772
},
{
"epoch": 0.6430903155603918,
"grad_norm": 0.20075669884681702,
"learning_rate": 5.7016858123398434e-05,
"loss": 0.167,
"step": 1773
},
{
"epoch": 0.6434530286543344,
"grad_norm": 0.20733250677585602,
"learning_rate": 5.691362550590297e-05,
"loss": 0.1745,
"step": 1774
},
{
"epoch": 0.6438157417482772,
"grad_norm": 0.16159029304981232,
"learning_rate": 5.681044924209398e-05,
"loss": 0.15,
"step": 1775
},
{
"epoch": 0.6441784548422198,
"grad_norm": 0.184630885720253,
"learning_rate": 5.670732946691808e-05,
"loss": 0.1756,
"step": 1776
},
{
"epoch": 0.6445411679361625,
"grad_norm": 0.16852855682373047,
"learning_rate": 5.6604266315248034e-05,
"loss": 0.1642,
"step": 1777
},
{
"epoch": 0.6449038810301052,
"grad_norm": 0.16728003323078156,
"learning_rate": 5.6501259921882655e-05,
"loss": 0.1612,
"step": 1778
},
{
"epoch": 0.6452665941240479,
"grad_norm": 0.17908404767513275,
"learning_rate": 5.6398310421546376e-05,
"loss": 0.1759,
"step": 1779
},
{
"epoch": 0.6456293072179906,
"grad_norm": 0.16568151116371155,
"learning_rate": 5.6295417948889306e-05,
"loss": 0.1514,
"step": 1780
},
{
"epoch": 0.6459920203119333,
"grad_norm": 0.2028510570526123,
"learning_rate": 5.619258263848692e-05,
"loss": 0.1626,
"step": 1781
},
{
"epoch": 0.6463547334058759,
"grad_norm": 0.19075465202331543,
"learning_rate": 5.608980462483991e-05,
"loss": 0.1809,
"step": 1782
},
{
"epoch": 0.6467174464998187,
"grad_norm": 0.18601737916469574,
"learning_rate": 5.598708404237416e-05,
"loss": 0.1606,
"step": 1783
},
{
"epoch": 0.6470801595937613,
"grad_norm": 0.18421201407909393,
"learning_rate": 5.588442102544029e-05,
"loss": 0.1527,
"step": 1784
},
{
"epoch": 0.6474428726877041,
"grad_norm": 0.20656828582286835,
"learning_rate": 5.578181570831369e-05,
"loss": 0.1726,
"step": 1785
},
{
"epoch": 0.6478055857816467,
"grad_norm": 0.1901615560054779,
"learning_rate": 5.567926822519427e-05,
"loss": 0.1865,
"step": 1786
},
{
"epoch": 0.6481682988755894,
"grad_norm": 0.17387042939662933,
"learning_rate": 5.55767787102063e-05,
"loss": 0.1643,
"step": 1787
},
{
"epoch": 0.6485310119695321,
"grad_norm": 0.16012033820152283,
"learning_rate": 5.547434729739822e-05,
"loss": 0.162,
"step": 1788
},
{
"epoch": 0.6488937250634748,
"grad_norm": 0.17737270891666412,
"learning_rate": 5.537197412074257e-05,
"loss": 0.1563,
"step": 1789
},
{
"epoch": 0.6492564381574175,
"grad_norm": 0.17308826744556427,
"learning_rate": 5.526965931413557e-05,
"loss": 0.1596,
"step": 1790
},
{
"epoch": 0.6496191512513602,
"grad_norm": 0.20024463534355164,
"learning_rate": 5.516740301139721e-05,
"loss": 0.1763,
"step": 1791
},
{
"epoch": 0.6499818643453029,
"grad_norm": 0.17333653569221497,
"learning_rate": 5.506520534627091e-05,
"loss": 0.1666,
"step": 1792
},
{
"epoch": 0.6503445774392456,
"grad_norm": 0.17827224731445312,
"learning_rate": 5.496306645242339e-05,
"loss": 0.1718,
"step": 1793
},
{
"epoch": 0.6507072905331882,
"grad_norm": 0.19950279593467712,
"learning_rate": 5.4860986463444506e-05,
"loss": 0.2117,
"step": 1794
},
{
"epoch": 0.651070003627131,
"grad_norm": 0.17631955444812775,
"learning_rate": 5.475896551284716e-05,
"loss": 0.1784,
"step": 1795
},
{
"epoch": 0.6514327167210736,
"grad_norm": 0.18082845211029053,
"learning_rate": 5.4657003734066925e-05,
"loss": 0.2068,
"step": 1796
},
{
"epoch": 0.6517954298150164,
"grad_norm": 0.17366324365139008,
"learning_rate": 5.455510126046199e-05,
"loss": 0.1443,
"step": 1797
},
{
"epoch": 0.652158142908959,
"grad_norm": 0.17154483497142792,
"learning_rate": 5.445325822531304e-05,
"loss": 0.17,
"step": 1798
},
{
"epoch": 0.6525208560029017,
"grad_norm": 0.18583987653255463,
"learning_rate": 5.435147476182298e-05,
"loss": 0.1609,
"step": 1799
},
{
"epoch": 0.6528835690968444,
"grad_norm": 0.16991505026817322,
"learning_rate": 5.424975100311676e-05,
"loss": 0.1537,
"step": 1800
},
{
"epoch": 0.6532462821907871,
"grad_norm": 0.1840389221906662,
"learning_rate": 5.414808708224135e-05,
"loss": 0.1628,
"step": 1801
},
{
"epoch": 0.6536089952847298,
"grad_norm": 0.197292760014534,
"learning_rate": 5.404648313216538e-05,
"loss": 0.1722,
"step": 1802
},
{
"epoch": 0.6539717083786725,
"grad_norm": 0.1785934418439865,
"learning_rate": 5.394493928577903e-05,
"loss": 0.1629,
"step": 1803
},
{
"epoch": 0.6543344214726151,
"grad_norm": 0.17052417993545532,
"learning_rate": 5.384345567589391e-05,
"loss": 0.1639,
"step": 1804
},
{
"epoch": 0.6546971345665579,
"grad_norm": 0.1716339886188507,
"learning_rate": 5.374203243524283e-05,
"loss": 0.1628,
"step": 1805
},
{
"epoch": 0.6550598476605005,
"grad_norm": 0.16768915951251984,
"learning_rate": 5.364066969647963e-05,
"loss": 0.1426,
"step": 1806
},
{
"epoch": 0.6554225607544433,
"grad_norm": 0.1639591008424759,
"learning_rate": 5.353936759217899e-05,
"loss": 0.1604,
"step": 1807
},
{
"epoch": 0.6557852738483859,
"grad_norm": 0.1945423036813736,
"learning_rate": 5.343812625483642e-05,
"loss": 0.1562,
"step": 1808
},
{
"epoch": 0.6561479869423286,
"grad_norm": 0.1996852457523346,
"learning_rate": 5.333694581686779e-05,
"loss": 0.1712,
"step": 1809
},
{
"epoch": 0.6565107000362713,
"grad_norm": 0.18032366037368774,
"learning_rate": 5.32358264106094e-05,
"loss": 0.196,
"step": 1810
},
{
"epoch": 0.656873413130214,
"grad_norm": 0.16884812712669373,
"learning_rate": 5.313476816831768e-05,
"loss": 0.1558,
"step": 1811
},
{
"epoch": 0.6572361262241567,
"grad_norm": 0.1865408569574356,
"learning_rate": 5.303377122216915e-05,
"loss": 0.184,
"step": 1812
},
{
"epoch": 0.6575988393180994,
"grad_norm": 0.18371020257472992,
"learning_rate": 5.293283570426007e-05,
"loss": 0.1672,
"step": 1813
},
{
"epoch": 0.6579615524120421,
"grad_norm": 0.1799343377351761,
"learning_rate": 5.283196174660633e-05,
"loss": 0.1544,
"step": 1814
},
{
"epoch": 0.6583242655059848,
"grad_norm": 0.17262513935565948,
"learning_rate": 5.273114948114346e-05,
"loss": 0.1582,
"step": 1815
},
{
"epoch": 0.6586869785999274,
"grad_norm": 0.19773328304290771,
"learning_rate": 5.263039903972618e-05,
"loss": 0.1649,
"step": 1816
},
{
"epoch": 0.6590496916938702,
"grad_norm": 0.18928907811641693,
"learning_rate": 5.252971055412832e-05,
"loss": 0.1853,
"step": 1817
},
{
"epoch": 0.6594124047878128,
"grad_norm": 0.17779038846492767,
"learning_rate": 5.242908415604277e-05,
"loss": 0.1643,
"step": 1818
},
{
"epoch": 0.6597751178817556,
"grad_norm": 0.2303963601589203,
"learning_rate": 5.2328519977081105e-05,
"loss": 0.1926,
"step": 1819
},
{
"epoch": 0.6601378309756982,
"grad_norm": 0.16455812752246857,
"learning_rate": 5.222801814877369e-05,
"loss": 0.1582,
"step": 1820
},
{
"epoch": 0.6605005440696409,
"grad_norm": 0.16079877316951752,
"learning_rate": 5.21275788025692e-05,
"loss": 0.149,
"step": 1821
},
{
"epoch": 0.6608632571635836,
"grad_norm": 0.1705598533153534,
"learning_rate": 5.20272020698346e-05,
"loss": 0.1624,
"step": 1822
},
{
"epoch": 0.6612259702575263,
"grad_norm": 0.16610048711299896,
"learning_rate": 5.192688808185502e-05,
"loss": 0.1527,
"step": 1823
},
{
"epoch": 0.661588683351469,
"grad_norm": 0.19774171710014343,
"learning_rate": 5.1826636969833475e-05,
"loss": 0.1631,
"step": 1824
},
{
"epoch": 0.6619513964454117,
"grad_norm": 0.17446525394916534,
"learning_rate": 5.172644886489073e-05,
"loss": 0.1621,
"step": 1825
},
{
"epoch": 0.6623141095393543,
"grad_norm": 0.20300233364105225,
"learning_rate": 5.162632389806523e-05,
"loss": 0.1907,
"step": 1826
},
{
"epoch": 0.6626768226332971,
"grad_norm": 0.204659104347229,
"learning_rate": 5.152626220031278e-05,
"loss": 0.1596,
"step": 1827
},
{
"epoch": 0.6630395357272397,
"grad_norm": 0.1757912039756775,
"learning_rate": 5.1426263902506414e-05,
"loss": 0.1535,
"step": 1828
},
{
"epoch": 0.6634022488211825,
"grad_norm": 0.19932380318641663,
"learning_rate": 5.132632913543627e-05,
"loss": 0.1705,
"step": 1829
},
{
"epoch": 0.6637649619151251,
"grad_norm": 0.18215243518352509,
"learning_rate": 5.1226458029809387e-05,
"loss": 0.1636,
"step": 1830
},
{
"epoch": 0.6641276750090678,
"grad_norm": 0.1725538820028305,
"learning_rate": 5.112665071624951e-05,
"loss": 0.1397,
"step": 1831
},
{
"epoch": 0.6644903881030105,
"grad_norm": 0.18406741321086884,
"learning_rate": 5.1026907325297044e-05,
"loss": 0.1639,
"step": 1832
},
{
"epoch": 0.6648531011969532,
"grad_norm": 0.17330917716026306,
"learning_rate": 5.092722798740871e-05,
"loss": 0.1588,
"step": 1833
},
{
"epoch": 0.6652158142908959,
"grad_norm": 0.16775713860988617,
"learning_rate": 5.082761283295745e-05,
"loss": 0.1407,
"step": 1834
},
{
"epoch": 0.6655785273848386,
"grad_norm": 0.17397847771644592,
"learning_rate": 5.072806199223228e-05,
"loss": 0.1767,
"step": 1835
},
{
"epoch": 0.6659412404787813,
"grad_norm": 0.17217876017093658,
"learning_rate": 5.062857559543809e-05,
"loss": 0.1644,
"step": 1836
},
{
"epoch": 0.666303953572724,
"grad_norm": 0.1916993409395218,
"learning_rate": 5.0529153772695495e-05,
"loss": 0.1631,
"step": 1837
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.19008702039718628,
"learning_rate": 5.0429796654040595e-05,
"loss": 0.1677,
"step": 1838
},
{
"epoch": 0.6670293797606094,
"grad_norm": 0.18073846399784088,
"learning_rate": 5.033050436942501e-05,
"loss": 0.1644,
"step": 1839
},
{
"epoch": 0.667392092854552,
"grad_norm": 0.1769622266292572,
"learning_rate": 5.023127704871541e-05,
"loss": 0.1764,
"step": 1840
},
{
"epoch": 0.6677548059484948,
"grad_norm": 0.17394478619098663,
"learning_rate": 5.013211482169354e-05,
"loss": 0.1652,
"step": 1841
},
{
"epoch": 0.6681175190424374,
"grad_norm": 0.18357783555984497,
"learning_rate": 5.003301781805604e-05,
"loss": 0.1799,
"step": 1842
},
{
"epoch": 0.6684802321363801,
"grad_norm": 0.18445712327957153,
"learning_rate": 4.993398616741421e-05,
"loss": 0.1731,
"step": 1843
},
{
"epoch": 0.6688429452303228,
"grad_norm": 0.17181545495986938,
"learning_rate": 4.983501999929384e-05,
"loss": 0.1647,
"step": 1844
},
{
"epoch": 0.6692056583242655,
"grad_norm": 0.1643923968076706,
"learning_rate": 4.97361194431352e-05,
"loss": 0.1522,
"step": 1845
},
{
"epoch": 0.6695683714182082,
"grad_norm": 0.178927481174469,
"learning_rate": 4.963728462829262e-05,
"loss": 0.1693,
"step": 1846
},
{
"epoch": 0.6699310845121509,
"grad_norm": 0.16977953910827637,
"learning_rate": 4.95385156840345e-05,
"loss": 0.1634,
"step": 1847
},
{
"epoch": 0.6702937976060935,
"grad_norm": 0.19453585147857666,
"learning_rate": 4.943981273954302e-05,
"loss": 0.161,
"step": 1848
},
{
"epoch": 0.6706565107000363,
"grad_norm": 0.15591104328632355,
"learning_rate": 4.9341175923914184e-05,
"loss": 0.1336,
"step": 1849
},
{
"epoch": 0.6710192237939789,
"grad_norm": 0.19056206941604614,
"learning_rate": 4.9242605366157356e-05,
"loss": 0.1647,
"step": 1850
},
{
"epoch": 0.6713819368879217,
"grad_norm": 0.18081988394260406,
"learning_rate": 4.914410119519528e-05,
"loss": 0.1679,
"step": 1851
},
{
"epoch": 0.6717446499818643,
"grad_norm": 0.1665160208940506,
"learning_rate": 4.904566353986394e-05,
"loss": 0.1585,
"step": 1852
},
{
"epoch": 0.672107363075807,
"grad_norm": 0.18015241622924805,
"learning_rate": 4.894729252891224e-05,
"loss": 0.1687,
"step": 1853
},
{
"epoch": 0.6724700761697497,
"grad_norm": 0.16529425978660583,
"learning_rate": 4.884898829100194e-05,
"loss": 0.1569,
"step": 1854
},
{
"epoch": 0.6728327892636924,
"grad_norm": 0.17505323886871338,
"learning_rate": 4.87507509547075e-05,
"loss": 0.1651,
"step": 1855
},
{
"epoch": 0.6731955023576351,
"grad_norm": 0.18190018832683563,
"learning_rate": 4.865258064851579e-05,
"loss": 0.1706,
"step": 1856
},
{
"epoch": 0.6735582154515778,
"grad_norm": 0.1668224334716797,
"learning_rate": 4.855447750082615e-05,
"loss": 0.1639,
"step": 1857
},
{
"epoch": 0.6739209285455205,
"grad_norm": 0.18514534831047058,
"learning_rate": 4.845644163994996e-05,
"loss": 0.1912,
"step": 1858
},
{
"epoch": 0.6742836416394632,
"grad_norm": 0.19206570088863373,
"learning_rate": 4.835847319411065e-05,
"loss": 0.1595,
"step": 1859
},
{
"epoch": 0.6746463547334058,
"grad_norm": 0.19193512201309204,
"learning_rate": 4.8260572291443465e-05,
"loss": 0.1586,
"step": 1860
},
{
"epoch": 0.6750090678273486,
"grad_norm": 0.1866559088230133,
"learning_rate": 4.816273905999529e-05,
"loss": 0.1841,
"step": 1861
},
{
"epoch": 0.6753717809212912,
"grad_norm": 0.1677185595035553,
"learning_rate": 4.80649736277245e-05,
"loss": 0.1672,
"step": 1862
},
{
"epoch": 0.675734494015234,
"grad_norm": 0.16328024864196777,
"learning_rate": 4.796727612250087e-05,
"loss": 0.1556,
"step": 1863
},
{
"epoch": 0.6760972071091766,
"grad_norm": 0.1733468621969223,
"learning_rate": 4.7869646672105254e-05,
"loss": 0.1572,
"step": 1864
},
{
"epoch": 0.6764599202031193,
"grad_norm": 0.19276085495948792,
"learning_rate": 4.7772085404229495e-05,
"loss": 0.1681,
"step": 1865
},
{
"epoch": 0.676822633297062,
"grad_norm": 0.2415236532688141,
"learning_rate": 4.767459244647629e-05,
"loss": 0.2347,
"step": 1866
},
{
"epoch": 0.6771853463910047,
"grad_norm": 0.21599356830120087,
"learning_rate": 4.757716792635898e-05,
"loss": 0.1985,
"step": 1867
},
{
"epoch": 0.6775480594849475,
"grad_norm": 0.17399145662784576,
"learning_rate": 4.747981197130139e-05,
"loss": 0.1662,
"step": 1868
},
{
"epoch": 0.6779107725788901,
"grad_norm": 0.1672641634941101,
"learning_rate": 4.738252470863763e-05,
"loss": 0.1482,
"step": 1869
},
{
"epoch": 0.6782734856728327,
"grad_norm": 0.17770545184612274,
"learning_rate": 4.7285306265612106e-05,
"loss": 0.157,
"step": 1870
},
{
"epoch": 0.6786361987667755,
"grad_norm": 0.18158309161663055,
"learning_rate": 4.7188156769379063e-05,
"loss": 0.1569,
"step": 1871
},
{
"epoch": 0.6789989118607181,
"grad_norm": 0.2067386507987976,
"learning_rate": 4.7091076347002613e-05,
"loss": 0.1686,
"step": 1872
},
{
"epoch": 0.6793616249546609,
"grad_norm": 0.16841058433055878,
"learning_rate": 4.6994065125456546e-05,
"loss": 0.1564,
"step": 1873
},
{
"epoch": 0.6797243380486035,
"grad_norm": 0.19121627509593964,
"learning_rate": 4.6897123231624105e-05,
"loss": 0.1794,
"step": 1874
},
{
"epoch": 0.6800870511425462,
"grad_norm": 0.16333697736263275,
"learning_rate": 4.6800250792297885e-05,
"loss": 0.1497,
"step": 1875
},
{
"epoch": 0.680449764236489,
"grad_norm": 0.16970248520374298,
"learning_rate": 4.670344793417967e-05,
"loss": 0.1672,
"step": 1876
},
{
"epoch": 0.6808124773304316,
"grad_norm": 0.1738625019788742,
"learning_rate": 4.660671478388019e-05,
"loss": 0.1689,
"step": 1877
},
{
"epoch": 0.6811751904243744,
"grad_norm": 0.167289599776268,
"learning_rate": 4.651005146791901e-05,
"loss": 0.1446,
"step": 1878
},
{
"epoch": 0.681537903518317,
"grad_norm": 0.1755392998456955,
"learning_rate": 4.641345811272436e-05,
"loss": 0.1509,
"step": 1879
},
{
"epoch": 0.6819006166122598,
"grad_norm": 0.1769733875989914,
"learning_rate": 4.631693484463299e-05,
"loss": 0.1688,
"step": 1880
},
{
"epoch": 0.6822633297062024,
"grad_norm": 0.17857052385807037,
"learning_rate": 4.622048178988989e-05,
"loss": 0.1799,
"step": 1881
},
{
"epoch": 0.682626042800145,
"grad_norm": 0.17262940108776093,
"learning_rate": 4.6124099074648375e-05,
"loss": 0.1613,
"step": 1882
},
{
"epoch": 0.6829887558940878,
"grad_norm": 0.17255136370658875,
"learning_rate": 4.602778682496965e-05,
"loss": 0.1647,
"step": 1883
},
{
"epoch": 0.6833514689880305,
"grad_norm": 0.19975058734416962,
"learning_rate": 4.593154516682276e-05,
"loss": 0.1705,
"step": 1884
},
{
"epoch": 0.6837141820819732,
"grad_norm": 0.19348910450935364,
"learning_rate": 4.5835374226084424e-05,
"loss": 0.1635,
"step": 1885
},
{
"epoch": 0.6840768951759159,
"grad_norm": 0.17311705648899078,
"learning_rate": 4.573927412853896e-05,
"loss": 0.1764,
"step": 1886
},
{
"epoch": 0.6844396082698585,
"grad_norm": 0.17351648211479187,
"learning_rate": 4.56432449998779e-05,
"loss": 0.1466,
"step": 1887
},
{
"epoch": 0.6848023213638013,
"grad_norm": 0.16917894780635834,
"learning_rate": 4.554728696570001e-05,
"loss": 0.1565,
"step": 1888
},
{
"epoch": 0.6851650344577439,
"grad_norm": 0.17121654748916626,
"learning_rate": 4.545140015151105e-05,
"loss": 0.1638,
"step": 1889
},
{
"epoch": 0.6855277475516867,
"grad_norm": 0.18969422578811646,
"learning_rate": 4.535558468272371e-05,
"loss": 0.1533,
"step": 1890
},
{
"epoch": 0.6858904606456293,
"grad_norm": 0.17447051405906677,
"learning_rate": 4.525984068465725e-05,
"loss": 0.1624,
"step": 1891
},
{
"epoch": 0.686253173739572,
"grad_norm": 0.1624990999698639,
"learning_rate": 4.5164168282537546e-05,
"loss": 0.1482,
"step": 1892
},
{
"epoch": 0.6866158868335147,
"grad_norm": 0.17492160201072693,
"learning_rate": 4.506856760149671e-05,
"loss": 0.1733,
"step": 1893
},
{
"epoch": 0.6869785999274574,
"grad_norm": 0.16198083758354187,
"learning_rate": 4.497303876657324e-05,
"loss": 0.1433,
"step": 1894
},
{
"epoch": 0.6873413130214001,
"grad_norm": 0.1759859323501587,
"learning_rate": 4.48775819027115e-05,
"loss": 0.146,
"step": 1895
},
{
"epoch": 0.6877040261153428,
"grad_norm": 0.16649121046066284,
"learning_rate": 4.478219713476178e-05,
"loss": 0.1652,
"step": 1896
},
{
"epoch": 0.6880667392092854,
"grad_norm": 0.1907196342945099,
"learning_rate": 4.468688458748006e-05,
"loss": 0.1731,
"step": 1897
},
{
"epoch": 0.6884294523032282,
"grad_norm": 0.1932022124528885,
"learning_rate": 4.459164438552789e-05,
"loss": 0.1693,
"step": 1898
},
{
"epoch": 0.6887921653971708,
"grad_norm": 0.1829594522714615,
"learning_rate": 4.449647665347216e-05,
"loss": 0.1957,
"step": 1899
},
{
"epoch": 0.6891548784911136,
"grad_norm": 0.17210708558559418,
"learning_rate": 4.4401381515784965e-05,
"loss": 0.1596,
"step": 1900
},
{
"epoch": 0.6895175915850562,
"grad_norm": 0.16382241249084473,
"learning_rate": 4.430635909684356e-05,
"loss": 0.1417,
"step": 1901
},
{
"epoch": 0.6898803046789989,
"grad_norm": 0.16617849469184875,
"learning_rate": 4.421140952092997e-05,
"loss": 0.1459,
"step": 1902
},
{
"epoch": 0.6902430177729416,
"grad_norm": 0.16519035398960114,
"learning_rate": 4.411653291223097e-05,
"loss": 0.1616,
"step": 1903
},
{
"epoch": 0.6906057308668843,
"grad_norm": 0.17537926137447357,
"learning_rate": 4.402172939483794e-05,
"loss": 0.1637,
"step": 1904
},
{
"epoch": 0.690968443960827,
"grad_norm": 0.18427397310733795,
"learning_rate": 4.392699909274664e-05,
"loss": 0.1876,
"step": 1905
},
{
"epoch": 0.6913311570547697,
"grad_norm": 0.1629849672317505,
"learning_rate": 4.383234212985701e-05,
"loss": 0.1436,
"step": 1906
},
{
"epoch": 0.6916938701487124,
"grad_norm": 0.1907191276550293,
"learning_rate": 4.3737758629973204e-05,
"loss": 0.1723,
"step": 1907
},
{
"epoch": 0.6920565832426551,
"grad_norm": 0.18214593827724457,
"learning_rate": 4.3643248716803184e-05,
"loss": 0.1683,
"step": 1908
},
{
"epoch": 0.6924192963365977,
"grad_norm": 0.18101546168327332,
"learning_rate": 4.354881251395871e-05,
"loss": 0.1554,
"step": 1909
},
{
"epoch": 0.6927820094305405,
"grad_norm": 0.18527980148792267,
"learning_rate": 4.3454450144955105e-05,
"loss": 0.1832,
"step": 1910
},
{
"epoch": 0.6931447225244831,
"grad_norm": 0.16371949017047882,
"learning_rate": 4.3360161733211145e-05,
"loss": 0.1528,
"step": 1911
},
{
"epoch": 0.6935074356184259,
"grad_norm": 0.172775536775589,
"learning_rate": 4.3265947402048834e-05,
"loss": 0.1564,
"step": 1912
},
{
"epoch": 0.6938701487123685,
"grad_norm": 0.17069590091705322,
"learning_rate": 4.3171807274693386e-05,
"loss": 0.1555,
"step": 1913
},
{
"epoch": 0.6942328618063112,
"grad_norm": 0.1884002387523651,
"learning_rate": 4.307774147427287e-05,
"loss": 0.1611,
"step": 1914
},
{
"epoch": 0.6945955749002539,
"grad_norm": 0.17518699169158936,
"learning_rate": 4.2983750123818155e-05,
"loss": 0.1651,
"step": 1915
},
{
"epoch": 0.6949582879941966,
"grad_norm": 0.17112936079502106,
"learning_rate": 4.288983334626275e-05,
"loss": 0.1472,
"step": 1916
},
{
"epoch": 0.6953210010881393,
"grad_norm": 0.1765616238117218,
"learning_rate": 4.279599126444264e-05,
"loss": 0.1552,
"step": 1917
},
{
"epoch": 0.695683714182082,
"grad_norm": 0.18281279504299164,
"learning_rate": 4.2702224001096045e-05,
"loss": 0.1758,
"step": 1918
},
{
"epoch": 0.6960464272760246,
"grad_norm": 0.1792001724243164,
"learning_rate": 4.2608531678863475e-05,
"loss": 0.1643,
"step": 1919
},
{
"epoch": 0.6964091403699674,
"grad_norm": 0.1666647344827652,
"learning_rate": 4.2514914420287266e-05,
"loss": 0.146,
"step": 1920
},
{
"epoch": 0.69677185346391,
"grad_norm": 0.2033475637435913,
"learning_rate": 4.242137234781166e-05,
"loss": 0.1841,
"step": 1921
},
{
"epoch": 0.6971345665578528,
"grad_norm": 0.17656663060188293,
"learning_rate": 4.23279055837825e-05,
"loss": 0.1614,
"step": 1922
},
{
"epoch": 0.6974972796517954,
"grad_norm": 0.1725003868341446,
"learning_rate": 4.2234514250447255e-05,
"loss": 0.155,
"step": 1923
},
{
"epoch": 0.6978599927457381,
"grad_norm": 0.17976543307304382,
"learning_rate": 4.214119846995461e-05,
"loss": 0.1646,
"step": 1924
},
{
"epoch": 0.6982227058396808,
"grad_norm": 0.16774506866931915,
"learning_rate": 4.204795836435448e-05,
"loss": 0.1672,
"step": 1925
},
{
"epoch": 0.6985854189336235,
"grad_norm": 0.18107999861240387,
"learning_rate": 4.1954794055597756e-05,
"loss": 0.1769,
"step": 1926
},
{
"epoch": 0.6989481320275662,
"grad_norm": 0.19499120116233826,
"learning_rate": 4.1861705665536324e-05,
"loss": 0.1737,
"step": 1927
},
{
"epoch": 0.6993108451215089,
"grad_norm": 0.18403582274913788,
"learning_rate": 4.1768693315922635e-05,
"loss": 0.1671,
"step": 1928
},
{
"epoch": 0.6996735582154516,
"grad_norm": 0.18355792760849,
"learning_rate": 4.167575712840974e-05,
"loss": 0.1587,
"step": 1929
},
{
"epoch": 0.7000362713093943,
"grad_norm": 0.20113395154476166,
"learning_rate": 4.15828972245511e-05,
"loss": 0.1667,
"step": 1930
},
{
"epoch": 0.7003989844033369,
"grad_norm": 0.1907624453306198,
"learning_rate": 4.149011372580029e-05,
"loss": 0.1708,
"step": 1931
},
{
"epoch": 0.7007616974972797,
"grad_norm": 0.16733594238758087,
"learning_rate": 4.139740675351116e-05,
"loss": 0.1629,
"step": 1932
},
{
"epoch": 0.7011244105912223,
"grad_norm": 0.15931111574172974,
"learning_rate": 4.130477642893729e-05,
"loss": 0.1534,
"step": 1933
},
{
"epoch": 0.7014871236851651,
"grad_norm": 0.19512903690338135,
"learning_rate": 4.1212222873232054e-05,
"loss": 0.1814,
"step": 1934
},
{
"epoch": 0.7018498367791077,
"grad_norm": 0.18595078587532043,
"learning_rate": 4.111974620744845e-05,
"loss": 0.1632,
"step": 1935
},
{
"epoch": 0.7022125498730504,
"grad_norm": 0.17419064044952393,
"learning_rate": 4.10273465525389e-05,
"loss": 0.1448,
"step": 1936
},
{
"epoch": 0.7025752629669931,
"grad_norm": 0.178279310464859,
"learning_rate": 4.093502402935504e-05,
"loss": 0.1578,
"step": 1937
},
{
"epoch": 0.7029379760609358,
"grad_norm": 0.18063177168369293,
"learning_rate": 4.084277875864776e-05,
"loss": 0.1502,
"step": 1938
},
{
"epoch": 0.7033006891548785,
"grad_norm": 0.20529168844223022,
"learning_rate": 4.075061086106678e-05,
"loss": 0.1748,
"step": 1939
},
{
"epoch": 0.7036634022488212,
"grad_norm": 0.1844182014465332,
"learning_rate": 4.065852045716069e-05,
"loss": 0.1543,
"step": 1940
},
{
"epoch": 0.7040261153427638,
"grad_norm": 0.1840999871492386,
"learning_rate": 4.056650766737669e-05,
"loss": 0.189,
"step": 1941
},
{
"epoch": 0.7043888284367066,
"grad_norm": 0.1571437418460846,
"learning_rate": 4.047457261206047e-05,
"loss": 0.1546,
"step": 1942
},
{
"epoch": 0.7047515415306492,
"grad_norm": 0.17258736491203308,
"learning_rate": 4.038271541145604e-05,
"loss": 0.1531,
"step": 1943
},
{
"epoch": 0.705114254624592,
"grad_norm": 0.16301092505455017,
"learning_rate": 4.0290936185705674e-05,
"loss": 0.1554,
"step": 1944
},
{
"epoch": 0.7054769677185346,
"grad_norm": 0.1766006052494049,
"learning_rate": 4.0199235054849546e-05,
"loss": 0.1484,
"step": 1945
},
{
"epoch": 0.7058396808124773,
"grad_norm": 0.18022476136684418,
"learning_rate": 4.010761213882572e-05,
"loss": 0.1519,
"step": 1946
},
{
"epoch": 0.70620239390642,
"grad_norm": 0.16101764142513275,
"learning_rate": 4.001606755746999e-05,
"loss": 0.1564,
"step": 1947
},
{
"epoch": 0.7065651070003627,
"grad_norm": 0.18494002521038055,
"learning_rate": 3.992460143051566e-05,
"loss": 0.1549,
"step": 1948
},
{
"epoch": 0.7069278200943054,
"grad_norm": 0.18700887262821198,
"learning_rate": 3.983321387759342e-05,
"loss": 0.1656,
"step": 1949
},
{
"epoch": 0.7072905331882481,
"grad_norm": 0.18422120809555054,
"learning_rate": 3.974190501823126e-05,
"loss": 0.1646,
"step": 1950
},
{
"epoch": 0.7076532462821908,
"grad_norm": 0.17262974381446838,
"learning_rate": 3.965067497185416e-05,
"loss": 0.1553,
"step": 1951
},
{
"epoch": 0.7080159593761335,
"grad_norm": 0.16152386367321014,
"learning_rate": 3.955952385778406e-05,
"loss": 0.1341,
"step": 1952
},
{
"epoch": 0.7083786724700761,
"grad_norm": 0.16990354657173157,
"learning_rate": 3.946845179523965e-05,
"loss": 0.1727,
"step": 1953
},
{
"epoch": 0.7087413855640189,
"grad_norm": 0.1854991912841797,
"learning_rate": 3.937745890333623e-05,
"loss": 0.1548,
"step": 1954
},
{
"epoch": 0.7091040986579615,
"grad_norm": 0.1773202270269394,
"learning_rate": 3.928654530108552e-05,
"loss": 0.1723,
"step": 1955
},
{
"epoch": 0.7094668117519043,
"grad_norm": 0.18670934438705444,
"learning_rate": 3.9195711107395624e-05,
"loss": 0.1688,
"step": 1956
},
{
"epoch": 0.7098295248458469,
"grad_norm": 0.17176151275634766,
"learning_rate": 3.9104956441070715e-05,
"loss": 0.1524,
"step": 1957
},
{
"epoch": 0.7101922379397896,
"grad_norm": 0.17264217138290405,
"learning_rate": 3.901428142081095e-05,
"loss": 0.1568,
"step": 1958
},
{
"epoch": 0.7105549510337323,
"grad_norm": 0.16863767802715302,
"learning_rate": 3.892368616521229e-05,
"loss": 0.1514,
"step": 1959
},
{
"epoch": 0.710917664127675,
"grad_norm": 0.1810598075389862,
"learning_rate": 3.883317079276649e-05,
"loss": 0.1494,
"step": 1960
},
{
"epoch": 0.7112803772216177,
"grad_norm": 0.18499146401882172,
"learning_rate": 3.87427354218607e-05,
"loss": 0.155,
"step": 1961
},
{
"epoch": 0.7116430903155604,
"grad_norm": 0.16301509737968445,
"learning_rate": 3.865238017077748e-05,
"loss": 0.1505,
"step": 1962
},
{
"epoch": 0.712005803409503,
"grad_norm": 0.18313588201999664,
"learning_rate": 3.856210515769456e-05,
"loss": 0.1696,
"step": 1963
},
{
"epoch": 0.7123685165034458,
"grad_norm": 0.18576788902282715,
"learning_rate": 3.847191050068483e-05,
"loss": 0.1584,
"step": 1964
},
{
"epoch": 0.7127312295973884,
"grad_norm": 0.16800563037395477,
"learning_rate": 3.838179631771598e-05,
"loss": 0.1409,
"step": 1965
},
{
"epoch": 0.7130939426913312,
"grad_norm": 0.15716706216335297,
"learning_rate": 3.829176272665047e-05,
"loss": 0.1647,
"step": 1966
},
{
"epoch": 0.7134566557852738,
"grad_norm": 0.19974446296691895,
"learning_rate": 3.8201809845245364e-05,
"loss": 0.2084,
"step": 1967
},
{
"epoch": 0.7138193688792165,
"grad_norm": 0.18544046580791473,
"learning_rate": 3.811193779115213e-05,
"loss": 0.1579,
"step": 1968
},
{
"epoch": 0.7141820819731592,
"grad_norm": 0.17015773057937622,
"learning_rate": 3.80221466819166e-05,
"loss": 0.1663,
"step": 1969
},
{
"epoch": 0.7145447950671019,
"grad_norm": 0.1646818220615387,
"learning_rate": 3.7932436634978684e-05,
"loss": 0.1582,
"step": 1970
},
{
"epoch": 0.7149075081610446,
"grad_norm": 0.16714130342006683,
"learning_rate": 3.784280776767224e-05,
"loss": 0.137,
"step": 1971
},
{
"epoch": 0.7152702212549873,
"grad_norm": 0.17864611744880676,
"learning_rate": 3.7753260197224995e-05,
"loss": 0.1496,
"step": 1972
},
{
"epoch": 0.71563293434893,
"grad_norm": 0.18264222145080566,
"learning_rate": 3.766379404075832e-05,
"loss": 0.1583,
"step": 1973
},
{
"epoch": 0.7159956474428727,
"grad_norm": 0.1730545610189438,
"learning_rate": 3.757440941528708e-05,
"loss": 0.1616,
"step": 1974
},
{
"epoch": 0.7163583605368153,
"grad_norm": 0.1775929182767868,
"learning_rate": 3.748510643771962e-05,
"loss": 0.1514,
"step": 1975
},
{
"epoch": 0.7167210736307581,
"grad_norm": 0.1856832504272461,
"learning_rate": 3.739588522485736e-05,
"loss": 0.1558,
"step": 1976
},
{
"epoch": 0.7170837867247007,
"grad_norm": 0.19256243109703064,
"learning_rate": 3.7306745893394845e-05,
"loss": 0.1966,
"step": 1977
},
{
"epoch": 0.7174464998186435,
"grad_norm": 0.15902438759803772,
"learning_rate": 3.72176885599195e-05,
"loss": 0.1493,
"step": 1978
},
{
"epoch": 0.7178092129125861,
"grad_norm": 0.16954579949378967,
"learning_rate": 3.7128713340911535e-05,
"loss": 0.1692,
"step": 1979
},
{
"epoch": 0.7181719260065288,
"grad_norm": 0.17363213002681732,
"learning_rate": 3.7039820352743685e-05,
"loss": 0.1491,
"step": 1980
},
{
"epoch": 0.7185346391004716,
"grad_norm": 0.18617630004882812,
"learning_rate": 3.6951009711681253e-05,
"loss": 0.1762,
"step": 1981
},
{
"epoch": 0.7188973521944142,
"grad_norm": 0.15999780595302582,
"learning_rate": 3.6862281533881745e-05,
"loss": 0.1488,
"step": 1982
},
{
"epoch": 0.719260065288357,
"grad_norm": 0.16866905987262726,
"learning_rate": 3.677363593539485e-05,
"loss": 0.1467,
"step": 1983
},
{
"epoch": 0.7196227783822996,
"grad_norm": 0.1777690201997757,
"learning_rate": 3.668507303216223e-05,
"loss": 0.1525,
"step": 1984
},
{
"epoch": 0.7199854914762422,
"grad_norm": 0.19426722824573517,
"learning_rate": 3.659659294001739e-05,
"loss": 0.2006,
"step": 1985
},
{
"epoch": 0.720348204570185,
"grad_norm": 0.17638282477855682,
"learning_rate": 3.6508195774685515e-05,
"loss": 0.1548,
"step": 1986
},
{
"epoch": 0.7207109176641276,
"grad_norm": 0.16942881047725677,
"learning_rate": 3.641988165178339e-05,
"loss": 0.1646,
"step": 1987
},
{
"epoch": 0.7210736307580704,
"grad_norm": 0.17678217589855194,
"learning_rate": 3.633165068681914e-05,
"loss": 0.1342,
"step": 1988
},
{
"epoch": 0.721436343852013,
"grad_norm": 0.15457268059253693,
"learning_rate": 3.624350299519209e-05,
"loss": 0.1489,
"step": 1989
},
{
"epoch": 0.7217990569459557,
"grad_norm": 0.17524264752864838,
"learning_rate": 3.615543869219271e-05,
"loss": 0.1565,
"step": 1990
},
{
"epoch": 0.7221617700398985,
"grad_norm": 0.16811302304267883,
"learning_rate": 3.6067457893002376e-05,
"loss": 0.1518,
"step": 1991
},
{
"epoch": 0.7225244831338411,
"grad_norm": 0.18975135684013367,
"learning_rate": 3.597956071269326e-05,
"loss": 0.1605,
"step": 1992
},
{
"epoch": 0.7228871962277839,
"grad_norm": 0.17413167655467987,
"learning_rate": 3.58917472662281e-05,
"loss": 0.1782,
"step": 1993
},
{
"epoch": 0.7232499093217265,
"grad_norm": 0.17248669266700745,
"learning_rate": 3.580401766846028e-05,
"loss": 0.1499,
"step": 1994
},
{
"epoch": 0.7236126224156693,
"grad_norm": 0.16712360084056854,
"learning_rate": 3.571637203413334e-05,
"loss": 0.1561,
"step": 1995
},
{
"epoch": 0.7239753355096119,
"grad_norm": 0.17022311687469482,
"learning_rate": 3.56288104778811e-05,
"loss": 0.152,
"step": 1996
},
{
"epoch": 0.7243380486035546,
"grad_norm": 0.17325520515441895,
"learning_rate": 3.554133311422735e-05,
"loss": 0.1554,
"step": 1997
},
{
"epoch": 0.7247007616974973,
"grad_norm": 0.17560617625713348,
"learning_rate": 3.5453940057585866e-05,
"loss": 0.1869,
"step": 1998
},
{
"epoch": 0.72506347479144,
"grad_norm": 0.19136746227741241,
"learning_rate": 3.5366631422260045e-05,
"loss": 0.1761,
"step": 1999
},
{
"epoch": 0.7254261878853827,
"grad_norm": 0.1808745115995407,
"learning_rate": 3.527940732244289e-05,
"loss": 0.1558,
"step": 2000
},
{
"epoch": 0.7257889009793254,
"grad_norm": 0.16616669297218323,
"learning_rate": 3.519226787221692e-05,
"loss": 0.1465,
"step": 2001
},
{
"epoch": 0.726151614073268,
"grad_norm": 0.1782522052526474,
"learning_rate": 3.5105213185553856e-05,
"loss": 0.1546,
"step": 2002
},
{
"epoch": 0.7265143271672108,
"grad_norm": 0.1684170663356781,
"learning_rate": 3.5018243376314574e-05,
"loss": 0.1625,
"step": 2003
},
{
"epoch": 0.7268770402611534,
"grad_norm": 0.16710427403450012,
"learning_rate": 3.493135855824894e-05,
"loss": 0.155,
"step": 2004
},
{
"epoch": 0.7272397533550962,
"grad_norm": 4411.4638671875,
"learning_rate": 3.484455884499561e-05,
"loss": 0.1437,
"step": 2005
},
{
"epoch": 0.7276024664490388,
"grad_norm": 0.1757262647151947,
"learning_rate": 3.475784435008208e-05,
"loss": 0.1531,
"step": 2006
},
{
"epoch": 0.7279651795429815,
"grad_norm": 0.1928826868534088,
"learning_rate": 3.467121518692422e-05,
"loss": 0.1655,
"step": 2007
},
{
"epoch": 0.7283278926369242,
"grad_norm": 0.19880840182304382,
"learning_rate": 3.458467146882637e-05,
"loss": 0.1579,
"step": 2008
},
{
"epoch": 0.7286906057308669,
"grad_norm": 0.23102417588233948,
"learning_rate": 3.4498213308981095e-05,
"loss": 0.1581,
"step": 2009
},
{
"epoch": 0.7290533188248096,
"grad_norm": 0.1807643175125122,
"learning_rate": 3.441184082046908e-05,
"loss": 0.1462,
"step": 2010
},
{
"epoch": 0.7294160319187523,
"grad_norm": 0.18923969566822052,
"learning_rate": 3.4325554116258894e-05,
"loss": 0.1507,
"step": 2011
},
{
"epoch": 0.7297787450126949,
"grad_norm": 0.22489802539348602,
"learning_rate": 3.423935330920702e-05,
"loss": 0.1803,
"step": 2012
},
{
"epoch": 0.7301414581066377,
"grad_norm": 0.23475851118564606,
"learning_rate": 3.415323851205752e-05,
"loss": 0.1649,
"step": 2013
},
{
"epoch": 0.7305041712005803,
"grad_norm": 0.2082839459180832,
"learning_rate": 3.406720983744193e-05,
"loss": 0.182,
"step": 2014
},
{
"epoch": 0.7308668842945231,
"grad_norm": 0.19769790768623352,
"learning_rate": 3.3981267397879215e-05,
"loss": 0.1543,
"step": 2015
},
{
"epoch": 0.7312295973884657,
"grad_norm": 0.1755545437335968,
"learning_rate": 3.38954113057755e-05,
"loss": 0.1469,
"step": 2016
},
{
"epoch": 0.7315923104824085,
"grad_norm": 0.18786299228668213,
"learning_rate": 3.3809641673423985e-05,
"loss": 0.1778,
"step": 2017
},
{
"epoch": 0.7319550235763511,
"grad_norm": 0.17806515097618103,
"learning_rate": 3.3723958613004855e-05,
"loss": 0.1567,
"step": 2018
},
{
"epoch": 0.7323177366702938,
"grad_norm": 0.17538048326969147,
"learning_rate": 3.3638362236584965e-05,
"loss": 0.1573,
"step": 2019
},
{
"epoch": 0.7326804497642365,
"grad_norm": 0.17543213069438934,
"learning_rate": 3.355285265611784e-05,
"loss": 0.1651,
"step": 2020
},
{
"epoch": 0.7330431628581792,
"grad_norm": 0.1797361820936203,
"learning_rate": 3.346742998344348e-05,
"loss": 0.1696,
"step": 2021
},
{
"epoch": 0.7334058759521219,
"grad_norm": 0.20315411686897278,
"learning_rate": 3.3382094330288216e-05,
"loss": 0.1682,
"step": 2022
},
{
"epoch": 0.7337685890460646,
"grad_norm": 0.17584829032421112,
"learning_rate": 3.3296845808264574e-05,
"loss": 0.1734,
"step": 2023
},
{
"epoch": 0.7341313021400072,
"grad_norm": 0.192337304353714,
"learning_rate": 3.321168452887106e-05,
"loss": 0.185,
"step": 2024
},
{
"epoch": 0.73449401523395,
"grad_norm": 0.1659361571073532,
"learning_rate": 3.3126610603492194e-05,
"loss": 0.1556,
"step": 2025
},
{
"epoch": 0.7348567283278926,
"grad_norm": 0.16753138601779938,
"learning_rate": 3.304162414339814e-05,
"loss": 0.1467,
"step": 2026
},
{
"epoch": 0.7352194414218354,
"grad_norm": 0.18743427097797394,
"learning_rate": 3.295672525974469e-05,
"loss": 0.1653,
"step": 2027
},
{
"epoch": 0.735582154515778,
"grad_norm": 0.16860130429267883,
"learning_rate": 3.287191406357311e-05,
"loss": 0.1563,
"step": 2028
},
{
"epoch": 0.7359448676097207,
"grad_norm": 0.16440363228321075,
"learning_rate": 3.278719066580995e-05,
"loss": 0.1493,
"step": 2029
},
{
"epoch": 0.7363075807036634,
"grad_norm": 0.1813763827085495,
"learning_rate": 3.270255517726691e-05,
"loss": 0.1621,
"step": 2030
},
{
"epoch": 0.7366702937976061,
"grad_norm": 0.16494570672512054,
"learning_rate": 3.261800770864083e-05,
"loss": 0.1381,
"step": 2031
},
{
"epoch": 0.7370330068915488,
"grad_norm": 0.1700211763381958,
"learning_rate": 3.2533548370513286e-05,
"loss": 0.1508,
"step": 2032
},
{
"epoch": 0.7373957199854915,
"grad_norm": 0.19019465148448944,
"learning_rate": 3.244917727335066e-05,
"loss": 0.1596,
"step": 2033
},
{
"epoch": 0.7377584330794341,
"grad_norm": 0.1853635013103485,
"learning_rate": 3.236489452750385e-05,
"loss": 0.1433,
"step": 2034
},
{
"epoch": 0.7381211461733769,
"grad_norm": 0.19163811206817627,
"learning_rate": 3.228070024320833e-05,
"loss": 0.1605,
"step": 2035
},
{
"epoch": 0.7384838592673195,
"grad_norm": 0.2122446596622467,
"learning_rate": 3.2196594530583735e-05,
"loss": 0.1792,
"step": 2036
},
{
"epoch": 0.7388465723612623,
"grad_norm": 0.18100525438785553,
"learning_rate": 3.211257749963391e-05,
"loss": 0.1703,
"step": 2037
},
{
"epoch": 0.7392092854552049,
"grad_norm": 0.15972734987735748,
"learning_rate": 3.2028649260246754e-05,
"loss": 0.1691,
"step": 2038
},
{
"epoch": 0.7395719985491476,
"grad_norm": 0.17128963768482208,
"learning_rate": 3.1944809922193986e-05,
"loss": 0.1611,
"step": 2039
},
{
"epoch": 0.7399347116430903,
"grad_norm": 0.18161478638648987,
"learning_rate": 3.186105959513103e-05,
"loss": 0.1457,
"step": 2040
},
{
"epoch": 0.740297424737033,
"grad_norm": 0.1911374032497406,
"learning_rate": 3.177739838859694e-05,
"loss": 0.1655,
"step": 2041
},
{
"epoch": 0.7406601378309757,
"grad_norm": 0.16643930971622467,
"learning_rate": 3.1693826412014114e-05,
"loss": 0.1744,
"step": 2042
},
{
"epoch": 0.7410228509249184,
"grad_norm": 0.17060095071792603,
"learning_rate": 3.1610343774688414e-05,
"loss": 0.1469,
"step": 2043
},
{
"epoch": 0.7413855640188611,
"grad_norm": 0.1795426309108734,
"learning_rate": 3.152695058580871e-05,
"loss": 0.1487,
"step": 2044
},
{
"epoch": 0.7417482771128038,
"grad_norm": 0.1854647696018219,
"learning_rate": 3.1443646954446914e-05,
"loss": 0.17,
"step": 2045
},
{
"epoch": 0.7421109902067464,
"grad_norm": 0.1683138608932495,
"learning_rate": 3.136043298955782e-05,
"loss": 0.1584,
"step": 2046
},
{
"epoch": 0.7424737033006892,
"grad_norm": 0.18557599186897278,
"learning_rate": 3.127730879997895e-05,
"loss": 0.1507,
"step": 2047
},
{
"epoch": 0.7428364163946318,
"grad_norm": 0.17158469557762146,
"learning_rate": 3.119427449443032e-05,
"loss": 0.1512,
"step": 2048
},
{
"epoch": 0.7431991294885746,
"grad_norm": 0.1670829951763153,
"learning_rate": 3.111133018151456e-05,
"loss": 0.167,
"step": 2049
},
{
"epoch": 0.7435618425825172,
"grad_norm": 0.1642339676618576,
"learning_rate": 3.102847596971646e-05,
"loss": 0.144,
"step": 2050
},
{
"epoch": 0.7439245556764599,
"grad_norm": 0.16173475980758667,
"learning_rate": 3.094571196740299e-05,
"loss": 0.1412,
"step": 2051
},
{
"epoch": 0.7442872687704026,
"grad_norm": 0.16731561720371246,
"learning_rate": 3.086303828282315e-05,
"loss": 0.1586,
"step": 2052
},
{
"epoch": 0.7446499818643453,
"grad_norm": 0.19204100966453552,
"learning_rate": 3.078045502410779e-05,
"loss": 0.2226,
"step": 2053
},
{
"epoch": 0.745012694958288,
"grad_norm": 0.17547018826007843,
"learning_rate": 3.069796229926952e-05,
"loss": 0.1509,
"step": 2054
},
{
"epoch": 0.7453754080522307,
"grad_norm": 0.1662409007549286,
"learning_rate": 3.0615560216202486e-05,
"loss": 0.1554,
"step": 2055
},
{
"epoch": 0.7457381211461733,
"grad_norm": 0.18224076926708221,
"learning_rate": 3.0533248882682374e-05,
"loss": 0.1608,
"step": 2056
},
{
"epoch": 0.7461008342401161,
"grad_norm": 0.2161344736814499,
"learning_rate": 3.045102840636609e-05,
"loss": 0.1661,
"step": 2057
},
{
"epoch": 0.7464635473340587,
"grad_norm": 0.16624325513839722,
"learning_rate": 3.0368898894791753e-05,
"loss": 0.1558,
"step": 2058
},
{
"epoch": 0.7468262604280015,
"grad_norm": 0.15912269055843353,
"learning_rate": 3.0286860455378462e-05,
"loss": 0.1536,
"step": 2059
},
{
"epoch": 0.7471889735219441,
"grad_norm": 0.1618340164422989,
"learning_rate": 3.0204913195426254e-05,
"loss": 0.1436,
"step": 2060
},
{
"epoch": 0.7475516866158868,
"grad_norm": 0.16747722029685974,
"learning_rate": 3.0123057222115836e-05,
"loss": 0.149,
"step": 2061
},
{
"epoch": 0.7479143997098295,
"grad_norm": 0.1707213968038559,
"learning_rate": 3.0041292642508644e-05,
"loss": 0.1522,
"step": 2062
},
{
"epoch": 0.7482771128037722,
"grad_norm": 0.17695897817611694,
"learning_rate": 2.995961956354646e-05,
"loss": 0.1573,
"step": 2063
},
{
"epoch": 0.7486398258977149,
"grad_norm": 0.18760527670383453,
"learning_rate": 2.9878038092051443e-05,
"loss": 0.1551,
"step": 2064
},
{
"epoch": 0.7490025389916576,
"grad_norm": 0.1940336525440216,
"learning_rate": 2.9796548334725916e-05,
"loss": 0.1531,
"step": 2065
},
{
"epoch": 0.7493652520856003,
"grad_norm": 0.16656464338302612,
"learning_rate": 2.9715150398152268e-05,
"loss": 0.1474,
"step": 2066
},
{
"epoch": 0.749727965179543,
"grad_norm": 0.16804639995098114,
"learning_rate": 2.9633844388792732e-05,
"loss": 0.1651,
"step": 2067
},
{
"epoch": 0.7500906782734856,
"grad_norm": 0.16543330252170563,
"learning_rate": 2.9552630412989434e-05,
"loss": 0.1433,
"step": 2068
},
{
"epoch": 0.7504533913674284,
"grad_norm": 0.17684879899024963,
"learning_rate": 2.9471508576964023e-05,
"loss": 0.1533,
"step": 2069
},
{
"epoch": 0.750816104461371,
"grad_norm": 0.16878783702850342,
"learning_rate": 2.939047898681765e-05,
"loss": 0.1509,
"step": 2070
},
{
"epoch": 0.7511788175553138,
"grad_norm": 0.16449496150016785,
"learning_rate": 2.93095417485308e-05,
"loss": 0.1628,
"step": 2071
},
{
"epoch": 0.7515415306492564,
"grad_norm": 0.20348592102527618,
"learning_rate": 2.9228696967963275e-05,
"loss": 0.1695,
"step": 2072
},
{
"epoch": 0.7519042437431991,
"grad_norm": 0.1528720259666443,
"learning_rate": 2.9147944750853816e-05,
"loss": 0.1396,
"step": 2073
},
{
"epoch": 0.7522669568371418,
"grad_norm": 0.17836391925811768,
"learning_rate": 2.906728520282015e-05,
"loss": 0.1538,
"step": 2074
},
{
"epoch": 0.7526296699310845,
"grad_norm": 0.16207584738731384,
"learning_rate": 2.898671842935885e-05,
"loss": 0.1457,
"step": 2075
},
{
"epoch": 0.7529923830250272,
"grad_norm": 0.17391245067119598,
"learning_rate": 2.8906244535845072e-05,
"loss": 0.1813,
"step": 2076
},
{
"epoch": 0.7533550961189699,
"grad_norm": 0.1827738881111145,
"learning_rate": 2.8825863627532524e-05,
"loss": 0.1712,
"step": 2077
},
{
"epoch": 0.7537178092129125,
"grad_norm": 0.16939976811408997,
"learning_rate": 2.8745575809553294e-05,
"loss": 0.1599,
"step": 2078
},
{
"epoch": 0.7540805223068553,
"grad_norm": 0.15600422024726868,
"learning_rate": 2.8665381186917718e-05,
"loss": 0.1469,
"step": 2079
},
{
"epoch": 0.7544432354007979,
"grad_norm": 0.2160848081111908,
"learning_rate": 2.858527986451419e-05,
"loss": 0.1748,
"step": 2080
},
{
"epoch": 0.7548059484947407,
"grad_norm": 0.16352678835391998,
"learning_rate": 2.8505271947109203e-05,
"loss": 0.1486,
"step": 2081
},
{
"epoch": 0.7551686615886833,
"grad_norm": 0.16789479553699493,
"learning_rate": 2.842535753934695e-05,
"loss": 0.1765,
"step": 2082
},
{
"epoch": 0.755531374682626,
"grad_norm": 0.16260650753974915,
"learning_rate": 2.8345536745749403e-05,
"loss": 0.1374,
"step": 2083
},
{
"epoch": 0.7558940877765687,
"grad_norm": 0.16362746059894562,
"learning_rate": 2.8265809670716027e-05,
"loss": 0.1528,
"step": 2084
},
{
"epoch": 0.7562568008705114,
"grad_norm": 0.1730203479528427,
"learning_rate": 2.818617641852376e-05,
"loss": 0.16,
"step": 2085
},
{
"epoch": 0.7566195139644541,
"grad_norm": 0.1941351443529129,
"learning_rate": 2.8106637093326782e-05,
"loss": 0.1578,
"step": 2086
},
{
"epoch": 0.7569822270583968,
"grad_norm": 0.17957964539527893,
"learning_rate": 2.8027191799156514e-05,
"loss": 0.1497,
"step": 2087
},
{
"epoch": 0.7573449401523396,
"grad_norm": 0.1569589227437973,
"learning_rate": 2.794784063992131e-05,
"loss": 0.1377,
"step": 2088
},
{
"epoch": 0.7577076532462822,
"grad_norm": 0.16305673122406006,
"learning_rate": 2.7868583719406403e-05,
"loss": 0.1471,
"step": 2089
},
{
"epoch": 0.7580703663402248,
"grad_norm": 0.171325221657753,
"learning_rate": 2.778942114127382e-05,
"loss": 0.1501,
"step": 2090
},
{
"epoch": 0.7584330794341676,
"grad_norm": 0.1620980203151703,
"learning_rate": 2.771035300906215e-05,
"loss": 0.1461,
"step": 2091
},
{
"epoch": 0.7587957925281102,
"grad_norm": 0.16900931298732758,
"learning_rate": 2.7631379426186434e-05,
"loss": 0.143,
"step": 2092
},
{
"epoch": 0.759158505622053,
"grad_norm": 0.1761879175901413,
"learning_rate": 2.755250049593816e-05,
"loss": 0.1541,
"step": 2093
},
{
"epoch": 0.7595212187159956,
"grad_norm": 0.18240278959274292,
"learning_rate": 2.74737163214849e-05,
"loss": 0.1931,
"step": 2094
},
{
"epoch": 0.7598839318099383,
"grad_norm": 0.15427257120609283,
"learning_rate": 2.7395027005870343e-05,
"loss": 0.1453,
"step": 2095
},
{
"epoch": 0.760246644903881,
"grad_norm": 0.18148113787174225,
"learning_rate": 2.73164326520141e-05,
"loss": 0.1733,
"step": 2096
},
{
"epoch": 0.7606093579978237,
"grad_norm": 0.1736038774251938,
"learning_rate": 2.7237933362711576e-05,
"loss": 0.1532,
"step": 2097
},
{
"epoch": 0.7609720710917665,
"grad_norm": 0.18636751174926758,
"learning_rate": 2.715952924063383e-05,
"loss": 0.1627,
"step": 2098
},
{
"epoch": 0.7613347841857091,
"grad_norm": 0.18383683264255524,
"learning_rate": 2.7081220388327522e-05,
"loss": 0.1625,
"step": 2099
},
{
"epoch": 0.7616974972796517,
"grad_norm": 0.16700130701065063,
"learning_rate": 2.70030069082146e-05,
"loss": 0.1536,
"step": 2100
},
{
"epoch": 0.7620602103735945,
"grad_norm": 0.178177148103714,
"learning_rate": 2.692488890259235e-05,
"loss": 0.1593,
"step": 2101
},
{
"epoch": 0.7624229234675372,
"grad_norm": 0.16141119599342346,
"learning_rate": 2.6846866473633125e-05,
"loss": 0.1476,
"step": 2102
},
{
"epoch": 0.7627856365614799,
"grad_norm": 0.16690880060195923,
"learning_rate": 2.676893972338432e-05,
"loss": 0.1606,
"step": 2103
},
{
"epoch": 0.7631483496554226,
"grad_norm": 0.18088023364543915,
"learning_rate": 2.6691108753768146e-05,
"loss": 0.1799,
"step": 2104
},
{
"epoch": 0.7635110627493652,
"grad_norm": 0.16774174571037292,
"learning_rate": 2.661337366658161e-05,
"loss": 0.1534,
"step": 2105
},
{
"epoch": 0.763873775843308,
"grad_norm": 0.1739625185728073,
"learning_rate": 2.653573456349624e-05,
"loss": 0.1752,
"step": 2106
},
{
"epoch": 0.7642364889372506,
"grad_norm": 0.1661982536315918,
"learning_rate": 2.6458191546058064e-05,
"loss": 0.1554,
"step": 2107
},
{
"epoch": 0.7645992020311934,
"grad_norm": 0.15863363444805145,
"learning_rate": 2.638074471568739e-05,
"loss": 0.1563,
"step": 2108
},
{
"epoch": 0.764961915125136,
"grad_norm": 0.1664765626192093,
"learning_rate": 2.630339417367882e-05,
"loss": 0.1613,
"step": 2109
},
{
"epoch": 0.7653246282190788,
"grad_norm": 0.17983406782150269,
"learning_rate": 2.622614002120091e-05,
"loss": 0.1354,
"step": 2110
},
{
"epoch": 0.7656873413130214,
"grad_norm": 0.18512356281280518,
"learning_rate": 2.6148982359296205e-05,
"loss": 0.1548,
"step": 2111
},
{
"epoch": 0.766050054406964,
"grad_norm": 0.16237185895442963,
"learning_rate": 2.6071921288880984e-05,
"loss": 0.151,
"step": 2112
},
{
"epoch": 0.7664127675009068,
"grad_norm": 0.16601556539535522,
"learning_rate": 2.5994956910745326e-05,
"loss": 0.1616,
"step": 2113
},
{
"epoch": 0.7667754805948495,
"grad_norm": 0.163995161652565,
"learning_rate": 2.5918089325552707e-05,
"loss": 0.1485,
"step": 2114
},
{
"epoch": 0.7671381936887922,
"grad_norm": 0.18575289845466614,
"learning_rate": 2.5841318633840072e-05,
"loss": 0.1577,
"step": 2115
},
{
"epoch": 0.7675009067827349,
"grad_norm": 0.19277150928974152,
"learning_rate": 2.576464493601761e-05,
"loss": 0.155,
"step": 2116
},
{
"epoch": 0.7678636198766775,
"grad_norm": 0.1656551957130432,
"learning_rate": 2.5688068332368632e-05,
"loss": 0.1486,
"step": 2117
},
{
"epoch": 0.7682263329706203,
"grad_norm": 0.15799161791801453,
"learning_rate": 2.5611588923049544e-05,
"loss": 0.1369,
"step": 2118
},
{
"epoch": 0.7685890460645629,
"grad_norm": 0.17702096700668335,
"learning_rate": 2.5535206808089553e-05,
"loss": 0.1789,
"step": 2119
},
{
"epoch": 0.7689517591585057,
"grad_norm": 2096.28515625,
"learning_rate": 2.5458922087390613e-05,
"loss": 0.1436,
"step": 2120
},
{
"epoch": 0.7693144722524483,
"grad_norm": 0.17093558609485626,
"learning_rate": 2.5382734860727332e-05,
"loss": 0.1518,
"step": 2121
},
{
"epoch": 0.769677185346391,
"grad_norm": 0.1638222485780716,
"learning_rate": 2.5306645227746762e-05,
"loss": 0.1473,
"step": 2122
},
{
"epoch": 0.7700398984403337,
"grad_norm": 0.1996994912624359,
"learning_rate": 2.523065328796831e-05,
"loss": 0.1809,
"step": 2123
},
{
"epoch": 0.7704026115342764,
"grad_norm": 0.1753552258014679,
"learning_rate": 2.515475914078369e-05,
"loss": 0.1811,
"step": 2124
},
{
"epoch": 0.7707653246282191,
"grad_norm": 0.19755405187606812,
"learning_rate": 2.5078962885456612e-05,
"loss": 0.1783,
"step": 2125
},
{
"epoch": 0.7711280377221618,
"grad_norm": 0.18720857799053192,
"learning_rate": 2.5003264621122802e-05,
"loss": 0.1519,
"step": 2126
},
{
"epoch": 0.7714907508161044,
"grad_norm": 0.1806974709033966,
"learning_rate": 2.4927664446789788e-05,
"loss": 0.1594,
"step": 2127
},
{
"epoch": 0.7718534639100472,
"grad_norm": 0.18246807157993317,
"learning_rate": 2.4852162461336835e-05,
"loss": 0.1395,
"step": 2128
},
{
"epoch": 0.7722161770039898,
"grad_norm": 0.18061847984790802,
"learning_rate": 2.477675876351475e-05,
"loss": 0.1709,
"step": 2129
},
{
"epoch": 0.7725788900979326,
"grad_norm": 0.1823715716600418,
"learning_rate": 2.4701453451945846e-05,
"loss": 0.1488,
"step": 2130
},
{
"epoch": 0.7729416031918752,
"grad_norm": 0.16946843266487122,
"learning_rate": 2.4626246625123706e-05,
"loss": 0.1498,
"step": 2131
},
{
"epoch": 0.773304316285818,
"grad_norm": 0.17811253666877747,
"learning_rate": 2.455113838141311e-05,
"loss": 0.1649,
"step": 2132
},
{
"epoch": 0.7736670293797606,
"grad_norm": 0.16584321856498718,
"learning_rate": 2.4476128819049893e-05,
"loss": 0.1814,
"step": 2133
},
{
"epoch": 0.7740297424737033,
"grad_norm": 0.15835148096084595,
"learning_rate": 2.4401218036140848e-05,
"loss": 0.1453,
"step": 2134
},
{
"epoch": 0.774392455567646,
"grad_norm": 0.17442336678504944,
"learning_rate": 2.4326406130663527e-05,
"loss": 0.1457,
"step": 2135
},
{
"epoch": 0.7747551686615887,
"grad_norm": 0.18500109016895294,
"learning_rate": 2.4251693200466242e-05,
"loss": 0.1673,
"step": 2136
},
{
"epoch": 0.7751178817555314,
"grad_norm": 0.17963416874408722,
"learning_rate": 2.417707934326775e-05,
"loss": 0.1522,
"step": 2137
},
{
"epoch": 0.7754805948494741,
"grad_norm": 0.17526273429393768,
"learning_rate": 2.4102564656657312e-05,
"loss": 0.1485,
"step": 2138
},
{
"epoch": 0.7758433079434167,
"grad_norm": 0.15860708057880402,
"learning_rate": 2.402814923809442e-05,
"loss": 0.1446,
"step": 2139
},
{
"epoch": 0.7762060210373595,
"grad_norm": 0.1740608960390091,
"learning_rate": 2.3953833184908757e-05,
"loss": 0.1521,
"step": 2140
},
{
"epoch": 0.7765687341313021,
"grad_norm": 0.1701829582452774,
"learning_rate": 2.387961659430007e-05,
"loss": 0.1386,
"step": 2141
},
{
"epoch": 0.7769314472252449,
"grad_norm": 0.17111440002918243,
"learning_rate": 2.380549956333793e-05,
"loss": 0.1452,
"step": 2142
},
{
"epoch": 0.7772941603191875,
"grad_norm": 0.17982304096221924,
"learning_rate": 2.3731482188961818e-05,
"loss": 0.163,
"step": 2143
},
{
"epoch": 0.7776568734131302,
"grad_norm": 0.1801091730594635,
"learning_rate": 2.3657564567980782e-05,
"loss": 0.1423,
"step": 2144
},
{
"epoch": 0.7780195865070729,
"grad_norm": 0.15309491753578186,
"learning_rate": 2.358374679707339e-05,
"loss": 0.1393,
"step": 2145
},
{
"epoch": 0.7783822996010156,
"grad_norm": 0.15650945901870728,
"learning_rate": 2.351002897278771e-05,
"loss": 0.1894,
"step": 2146
},
{
"epoch": 0.7787450126949583,
"grad_norm": 0.17866793274879456,
"learning_rate": 2.343641119154101e-05,
"loss": 0.1549,
"step": 2147
},
{
"epoch": 0.779107725788901,
"grad_norm": 0.17232728004455566,
"learning_rate": 2.336289354961969e-05,
"loss": 0.1802,
"step": 2148
},
{
"epoch": 0.7794704388828436,
"grad_norm": 0.18021385371685028,
"learning_rate": 2.3289476143179202e-05,
"loss": 0.143,
"step": 2149
},
{
"epoch": 0.7798331519767864,
"grad_norm": 0.18300630152225494,
"learning_rate": 2.3216159068243958e-05,
"loss": 0.1739,
"step": 2150
},
{
"epoch": 0.780195865070729,
"grad_norm": 0.18222151696681976,
"learning_rate": 2.314294242070706e-05,
"loss": 0.1653,
"step": 2151
},
{
"epoch": 0.7805585781646718,
"grad_norm": 0.16753800213336945,
"learning_rate": 2.30698262963303e-05,
"loss": 0.1766,
"step": 2152
},
{
"epoch": 0.7809212912586144,
"grad_norm": 0.16288548707962036,
"learning_rate": 2.2996810790743983e-05,
"loss": 0.1417,
"step": 2153
},
{
"epoch": 0.7812840043525572,
"grad_norm": 0.14791814982891083,
"learning_rate": 2.2923895999446764e-05,
"loss": 0.1452,
"step": 2154
},
{
"epoch": 0.7816467174464998,
"grad_norm": 0.17105069756507874,
"learning_rate": 2.2851082017805703e-05,
"loss": 0.1641,
"step": 2155
},
{
"epoch": 0.7820094305404425,
"grad_norm": 0.17432281374931335,
"learning_rate": 2.2778368941055882e-05,
"loss": 0.1774,
"step": 2156
},
{
"epoch": 0.7823721436343852,
"grad_norm": 0.19430530071258545,
"learning_rate": 2.2705756864300454e-05,
"loss": 0.167,
"step": 2157
},
{
"epoch": 0.7827348567283279,
"grad_norm": 0.16627925634384155,
"learning_rate": 2.2633245882510457e-05,
"loss": 0.1328,
"step": 2158
},
{
"epoch": 0.7830975698222706,
"grad_norm": 0.1691751331090927,
"learning_rate": 2.256083609052474e-05,
"loss": 0.1504,
"step": 2159
},
{
"epoch": 0.7834602829162133,
"grad_norm": 0.17866089940071106,
"learning_rate": 2.2488527583049736e-05,
"loss": 0.1503,
"step": 2160
},
{
"epoch": 0.7838229960101559,
"grad_norm": 0.19467145204544067,
"learning_rate": 2.2416320454659512e-05,
"loss": 0.1611,
"step": 2161
},
{
"epoch": 0.7841857091040987,
"grad_norm": 0.17603172361850739,
"learning_rate": 2.2344214799795438e-05,
"loss": 0.1519,
"step": 2162
},
{
"epoch": 0.7845484221980413,
"grad_norm": 0.18451876938343048,
"learning_rate": 2.2272210712766205e-05,
"loss": 0.1675,
"step": 2163
},
{
"epoch": 0.7849111352919841,
"grad_norm": 0.17610016465187073,
"learning_rate": 2.2200308287747673e-05,
"loss": 0.1597,
"step": 2164
},
{
"epoch": 0.7852738483859267,
"grad_norm": 0.1533452421426773,
"learning_rate": 2.21285076187827e-05,
"loss": 0.1381,
"step": 2165
},
{
"epoch": 0.7856365614798694,
"grad_norm": 0.16271378099918365,
"learning_rate": 2.205680879978107e-05,
"loss": 0.1435,
"step": 2166
},
{
"epoch": 0.7859992745738121,
"grad_norm": 0.15660040080547333,
"learning_rate": 2.19852119245194e-05,
"loss": 0.1441,
"step": 2167
},
{
"epoch": 0.7863619876677548,
"grad_norm": 0.16608907282352448,
"learning_rate": 2.1913717086640906e-05,
"loss": 0.1603,
"step": 2168
},
{
"epoch": 0.7867247007616975,
"grad_norm": 0.19811011850833893,
"learning_rate": 2.1842324379655378e-05,
"loss": 0.1729,
"step": 2169
},
{
"epoch": 0.7870874138556402,
"grad_norm": 0.16923308372497559,
"learning_rate": 2.177103389693903e-05,
"loss": 0.1572,
"step": 2170
},
{
"epoch": 0.7874501269495828,
"grad_norm": 0.16869297623634338,
"learning_rate": 2.169984573173436e-05,
"loss": 0.1523,
"step": 2171
},
{
"epoch": 0.7878128400435256,
"grad_norm": 0.16741646826267242,
"learning_rate": 2.162875997715005e-05,
"loss": 0.1336,
"step": 2172
},
{
"epoch": 0.7881755531374682,
"grad_norm": 0.17434288561344147,
"learning_rate": 2.1557776726160807e-05,
"loss": 0.1615,
"step": 2173
},
{
"epoch": 0.788538266231411,
"grad_norm": 0.19176846742630005,
"learning_rate": 2.1486896071607364e-05,
"loss": 0.158,
"step": 2174
},
{
"epoch": 0.7889009793253536,
"grad_norm": 0.19300417602062225,
"learning_rate": 2.141611810619617e-05,
"loss": 0.1618,
"step": 2175
},
{
"epoch": 0.7892636924192964,
"grad_norm": 0.18857765197753906,
"learning_rate": 2.1345442922499394e-05,
"loss": 0.1552,
"step": 2176
},
{
"epoch": 0.789626405513239,
"grad_norm": 0.16958756744861603,
"learning_rate": 2.127487061295478e-05,
"loss": 0.1498,
"step": 2177
},
{
"epoch": 0.7899891186071817,
"grad_norm": 0.1617862582206726,
"learning_rate": 2.1204401269865526e-05,
"loss": 0.1468,
"step": 2178
},
{
"epoch": 0.7903518317011244,
"grad_norm": 0.17696796357631683,
"learning_rate": 2.113403498540011e-05,
"loss": 0.158,
"step": 2179
},
{
"epoch": 0.7907145447950671,
"grad_norm": 0.18679635226726532,
"learning_rate": 2.1063771851592316e-05,
"loss": 0.1725,
"step": 2180
},
{
"epoch": 0.7910772578890098,
"grad_norm": 0.16767951846122742,
"learning_rate": 2.099361196034093e-05,
"loss": 0.1541,
"step": 2181
},
{
"epoch": 0.7914399709829525,
"grad_norm": 0.17078953981399536,
"learning_rate": 2.09235554034097e-05,
"loss": 0.1517,
"step": 2182
},
{
"epoch": 0.7918026840768951,
"grad_norm": 0.18054896593093872,
"learning_rate": 2.085360227242731e-05,
"loss": 0.1668,
"step": 2183
},
{
"epoch": 0.7921653971708379,
"grad_norm": 0.17167535424232483,
"learning_rate": 2.0783752658887066e-05,
"loss": 0.1486,
"step": 2184
},
{
"epoch": 0.7925281102647805,
"grad_norm": 0.18194803595542908,
"learning_rate": 2.0714006654146955e-05,
"loss": 0.1705,
"step": 2185
},
{
"epoch": 0.7928908233587233,
"grad_norm": 0.15957947075366974,
"learning_rate": 2.0644364349429378e-05,
"loss": 0.1393,
"step": 2186
},
{
"epoch": 0.7932535364526659,
"grad_norm": 0.17193473875522614,
"learning_rate": 2.057482583582122e-05,
"loss": 0.1549,
"step": 2187
},
{
"epoch": 0.7936162495466086,
"grad_norm": 0.16619963943958282,
"learning_rate": 2.0505391204273495e-05,
"loss": 0.1526,
"step": 2188
},
{
"epoch": 0.7939789626405513,
"grad_norm": 0.15132339298725128,
"learning_rate": 2.043606054560141e-05,
"loss": 0.1602,
"step": 2189
},
{
"epoch": 0.794341675734494,
"grad_norm": 0.17620229721069336,
"learning_rate": 2.0366833950484164e-05,
"loss": 0.1505,
"step": 2190
},
{
"epoch": 0.7947043888284367,
"grad_norm": 0.16328759491443634,
"learning_rate": 2.0297711509464833e-05,
"loss": 0.1407,
"step": 2191
},
{
"epoch": 0.7950671019223794,
"grad_norm": 0.16912280023097992,
"learning_rate": 2.0228693312950352e-05,
"loss": 0.1571,
"step": 2192
},
{
"epoch": 0.795429815016322,
"grad_norm": 0.16919687390327454,
"learning_rate": 2.0159779451211204e-05,
"loss": 0.1484,
"step": 2193
},
{
"epoch": 0.7957925281102648,
"grad_norm": 0.17652738094329834,
"learning_rate": 2.009097001438147e-05,
"loss": 0.1388,
"step": 2194
},
{
"epoch": 0.7961552412042074,
"grad_norm": 0.17439448833465576,
"learning_rate": 2.0022265092458638e-05,
"loss": 0.162,
"step": 2195
},
{
"epoch": 0.7965179542981502,
"grad_norm": 0.16315314173698425,
"learning_rate": 1.9953664775303483e-05,
"loss": 0.1463,
"step": 2196
},
{
"epoch": 0.7968806673920928,
"grad_norm": 0.15268266201019287,
"learning_rate": 1.988516915263996e-05,
"loss": 0.1421,
"step": 2197
},
{
"epoch": 0.7972433804860355,
"grad_norm": 0.16543833911418915,
"learning_rate": 1.981677831405516e-05,
"loss": 0.1495,
"step": 2198
},
{
"epoch": 0.7976060935799782,
"grad_norm": 0.1608053743839264,
"learning_rate": 1.974849234899907e-05,
"loss": 0.1383,
"step": 2199
},
{
"epoch": 0.7979688066739209,
"grad_norm": 0.1577446609735489,
"learning_rate": 1.9680311346784496e-05,
"loss": 0.1418,
"step": 2200
}
],
"logging_steps": 1,
"max_steps": 2757,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.5288134512966107e+19,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}