{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 2934, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006816632583503749, "grad_norm": 1240.0, "learning_rate": 0.0, "loss": 50.93749237060547, "step": 1 }, { "epoch": 0.0013633265167007499, "grad_norm": 1296.0, "learning_rate": 2.247191011235955e-07, "loss": 53.13850021362305, "step": 2 }, { "epoch": 0.002044989775051125, "grad_norm": 1288.0, "learning_rate": 4.49438202247191e-07, "loss": 51.01094436645508, "step": 3 }, { "epoch": 0.0027266530334014998, "grad_norm": 1216.0, "learning_rate": 6.741573033707865e-07, "loss": 49.02351379394531, "step": 4 }, { "epoch": 0.0034083162917518746, "grad_norm": 1192.0, "learning_rate": 8.98876404494382e-07, "loss": 47.19402313232422, "step": 5 }, { "epoch": 0.00408997955010225, "grad_norm": 1304.0, "learning_rate": 1.1235955056179777e-06, "loss": 52.479461669921875, "step": 6 }, { "epoch": 0.004771642808452625, "grad_norm": 1192.0, "learning_rate": 1.348314606741573e-06, "loss": 50.72967529296875, "step": 7 }, { "epoch": 0.0054533060668029995, "grad_norm": 1240.0, "learning_rate": 1.5730337078651686e-06, "loss": 48.787574768066406, "step": 8 }, { "epoch": 0.006134969325153374, "grad_norm": 1144.0, "learning_rate": 1.797752808988764e-06, "loss": 47.966796875, "step": 9 }, { "epoch": 0.006816632583503749, "grad_norm": 1128.0, "learning_rate": 2.02247191011236e-06, "loss": 46.31734085083008, "step": 10 }, { "epoch": 0.007498295841854124, "grad_norm": 1064.0, "learning_rate": 2.2471910112359554e-06, "loss": 45.56753158569336, "step": 11 }, { "epoch": 0.0081799591002045, "grad_norm": 964.0, "learning_rate": 2.4719101123595505e-06, "loss": 44.19970703125, "step": 12 }, { "epoch": 0.008861622358554875, "grad_norm": 900.0, "learning_rate": 2.696629213483146e-06, "loss": 40.641353607177734, "step": 13 }, { "epoch": 0.00954328561690525, "grad_norm": 716.0, "learning_rate": 2.9213483146067416e-06, "loss": 40.364559173583984, "step": 14 }, { "epoch": 0.010224948875255624, "grad_norm": 780.0, "learning_rate": 3.146067415730337e-06, "loss": 39.04439926147461, "step": 15 }, { "epoch": 0.010906612133605999, "grad_norm": 572.0, "learning_rate": 3.3707865168539327e-06, "loss": 36.065673828125, "step": 16 }, { "epoch": 0.011588275391956374, "grad_norm": 496.0, "learning_rate": 3.595505617977528e-06, "loss": 35.184654235839844, "step": 17 }, { "epoch": 0.012269938650306749, "grad_norm": 414.0, "learning_rate": 3.820224719101124e-06, "loss": 33.84925842285156, "step": 18 }, { "epoch": 0.012951601908657124, "grad_norm": 374.0, "learning_rate": 4.04494382022472e-06, "loss": 34.170814514160156, "step": 19 }, { "epoch": 0.013633265167007498, "grad_norm": 338.0, "learning_rate": 4.269662921348315e-06, "loss": 31.390647888183594, "step": 20 }, { "epoch": 0.014314928425357873, "grad_norm": 264.0, "learning_rate": 4.494382022471911e-06, "loss": 30.589218139648438, "step": 21 }, { "epoch": 0.014996591683708248, "grad_norm": 231.0, "learning_rate": 4.719101123595506e-06, "loss": 31.37618064880371, "step": 22 }, { "epoch": 0.015678254942058625, "grad_norm": 196.0, "learning_rate": 4.943820224719101e-06, "loss": 29.921207427978516, "step": 23 }, { "epoch": 0.016359918200409, "grad_norm": 204.0, "learning_rate": 5.168539325842698e-06, "loss": 28.391132354736328, "step": 24 }, { "epoch": 0.017041581458759374, "grad_norm": 160.0, "learning_rate": 5.393258426966292e-06, "loss": 25.221527099609375, "step": 25 }, { "epoch": 0.01772324471710975, "grad_norm": 168.0, "learning_rate": 5.617977528089889e-06, "loss": 25.57047462463379, "step": 26 }, { "epoch": 0.018404907975460124, "grad_norm": 131.0, "learning_rate": 5.842696629213483e-06, "loss": 25.24676513671875, "step": 27 }, { "epoch": 0.0190865712338105, "grad_norm": 123.0, "learning_rate": 6.06741573033708e-06, "loss": 25.444564819335938, "step": 28 }, { "epoch": 0.019768234492160874, "grad_norm": 116.5, "learning_rate": 6.292134831460674e-06, "loss": 23.184051513671875, "step": 29 }, { "epoch": 0.02044989775051125, "grad_norm": 99.0, "learning_rate": 6.51685393258427e-06, "loss": 23.858680725097656, "step": 30 }, { "epoch": 0.021131561008861623, "grad_norm": 88.0, "learning_rate": 6.741573033707865e-06, "loss": 22.229976654052734, "step": 31 }, { "epoch": 0.021813224267211998, "grad_norm": 80.0, "learning_rate": 6.966292134831461e-06, "loss": 23.788650512695312, "step": 32 }, { "epoch": 0.022494887525562373, "grad_norm": 75.0, "learning_rate": 7.191011235955056e-06, "loss": 23.62090492248535, "step": 33 }, { "epoch": 0.023176550783912748, "grad_norm": 163.0, "learning_rate": 7.415730337078652e-06, "loss": 20.09526824951172, "step": 34 }, { "epoch": 0.023858214042263123, "grad_norm": 104.0, "learning_rate": 7.640449438202247e-06, "loss": 19.6470890045166, "step": 35 }, { "epoch": 0.024539877300613498, "grad_norm": 65.5, "learning_rate": 7.865168539325843e-06, "loss": 23.076629638671875, "step": 36 }, { "epoch": 0.025221540558963872, "grad_norm": 67.5, "learning_rate": 8.08988764044944e-06, "loss": 22.928770065307617, "step": 37 }, { "epoch": 0.025903203817314247, "grad_norm": 55.25, "learning_rate": 8.314606741573035e-06, "loss": 21.056434631347656, "step": 38 }, { "epoch": 0.026584867075664622, "grad_norm": 102.0, "learning_rate": 8.53932584269663e-06, "loss": 22.40350341796875, "step": 39 }, { "epoch": 0.027266530334014997, "grad_norm": 57.0, "learning_rate": 8.764044943820226e-06, "loss": 17.662132263183594, "step": 40 }, { "epoch": 0.02794819359236537, "grad_norm": 64.0, "learning_rate": 8.988764044943822e-06, "loss": 16.361438751220703, "step": 41 }, { "epoch": 0.028629856850715747, "grad_norm": 51.75, "learning_rate": 9.213483146067417e-06, "loss": 18.1719970703125, "step": 42 }, { "epoch": 0.02931152010906612, "grad_norm": 46.0, "learning_rate": 9.438202247191012e-06, "loss": 17.828445434570312, "step": 43 }, { "epoch": 0.029993183367416496, "grad_norm": 55.0, "learning_rate": 9.662921348314608e-06, "loss": 19.732280731201172, "step": 44 }, { "epoch": 0.03067484662576687, "grad_norm": 45.75, "learning_rate": 9.887640449438202e-06, "loss": 15.047309875488281, "step": 45 }, { "epoch": 0.03135650988411725, "grad_norm": 43.0, "learning_rate": 1.01123595505618e-05, "loss": 17.854541778564453, "step": 46 }, { "epoch": 0.032038173142467624, "grad_norm": 43.25, "learning_rate": 1.0337078651685396e-05, "loss": 14.515093803405762, "step": 47 }, { "epoch": 0.032719836400818, "grad_norm": 36.75, "learning_rate": 1.0561797752808988e-05, "loss": 16.042842864990234, "step": 48 }, { "epoch": 0.033401499659168374, "grad_norm": 42.5, "learning_rate": 1.0786516853932584e-05, "loss": 17.069700241088867, "step": 49 }, { "epoch": 0.03408316291751875, "grad_norm": 72.5, "learning_rate": 1.101123595505618e-05, "loss": 20.273284912109375, "step": 50 }, { "epoch": 0.034764826175869123, "grad_norm": 41.0, "learning_rate": 1.1235955056179778e-05, "loss": 16.16796875, "step": 51 }, { "epoch": 0.0354464894342195, "grad_norm": 41.75, "learning_rate": 1.146067415730337e-05, "loss": 14.313140869140625, "step": 52 }, { "epoch": 0.03612815269256987, "grad_norm": 49.0, "learning_rate": 1.1685393258426966e-05, "loss": 13.049314498901367, "step": 53 }, { "epoch": 0.03680981595092025, "grad_norm": 34.0, "learning_rate": 1.1910112359550562e-05, "loss": 14.502646446228027, "step": 54 }, { "epoch": 0.03749147920927062, "grad_norm": 41.5, "learning_rate": 1.213483146067416e-05, "loss": 16.009258270263672, "step": 55 }, { "epoch": 0.038173142467621, "grad_norm": 46.75, "learning_rate": 1.2359550561797752e-05, "loss": 12.448726654052734, "step": 56 }, { "epoch": 0.03885480572597137, "grad_norm": 47.0, "learning_rate": 1.2584269662921348e-05, "loss": 16.326663970947266, "step": 57 }, { "epoch": 0.03953646898432175, "grad_norm": 32.0, "learning_rate": 1.2808988764044944e-05, "loss": 12.50612735748291, "step": 58 }, { "epoch": 0.04021813224267212, "grad_norm": 36.5, "learning_rate": 1.303370786516854e-05, "loss": 12.726358413696289, "step": 59 }, { "epoch": 0.0408997955010225, "grad_norm": 39.0, "learning_rate": 1.3258426966292135e-05, "loss": 13.529668807983398, "step": 60 }, { "epoch": 0.04158145875937287, "grad_norm": 46.5, "learning_rate": 1.348314606741573e-05, "loss": 11.741790771484375, "step": 61 }, { "epoch": 0.04226312201772325, "grad_norm": 47.5, "learning_rate": 1.3707865168539327e-05, "loss": 14.300347328186035, "step": 62 }, { "epoch": 0.04294478527607362, "grad_norm": 41.0, "learning_rate": 1.3932584269662923e-05, "loss": 11.639049530029297, "step": 63 }, { "epoch": 0.043626448534423996, "grad_norm": 39.0, "learning_rate": 1.4157303370786517e-05, "loss": 10.297684669494629, "step": 64 }, { "epoch": 0.04430811179277437, "grad_norm": 38.5, "learning_rate": 1.4382022471910113e-05, "loss": 9.187118530273438, "step": 65 }, { "epoch": 0.044989775051124746, "grad_norm": 35.5, "learning_rate": 1.4606741573033709e-05, "loss": 12.91425609588623, "step": 66 }, { "epoch": 0.04567143830947512, "grad_norm": 29.75, "learning_rate": 1.4831460674157305e-05, "loss": 11.797382354736328, "step": 67 }, { "epoch": 0.046353101567825496, "grad_norm": 30.625, "learning_rate": 1.5056179775280899e-05, "loss": 8.548418998718262, "step": 68 }, { "epoch": 0.04703476482617587, "grad_norm": 43.0, "learning_rate": 1.5280898876404495e-05, "loss": 12.24549388885498, "step": 69 }, { "epoch": 0.047716428084526245, "grad_norm": 37.25, "learning_rate": 1.5505617977528093e-05, "loss": 10.88212776184082, "step": 70 }, { "epoch": 0.04839809134287662, "grad_norm": 37.25, "learning_rate": 1.5730337078651687e-05, "loss": 11.247610092163086, "step": 71 }, { "epoch": 0.049079754601226995, "grad_norm": 31.375, "learning_rate": 1.595505617977528e-05, "loss": 11.10583209991455, "step": 72 }, { "epoch": 0.04976141785957737, "grad_norm": 29.5, "learning_rate": 1.617977528089888e-05, "loss": 9.709287643432617, "step": 73 }, { "epoch": 0.050443081117927745, "grad_norm": 32.5, "learning_rate": 1.6404494382022473e-05, "loss": 12.9605712890625, "step": 74 }, { "epoch": 0.05112474437627812, "grad_norm": 49.0, "learning_rate": 1.662921348314607e-05, "loss": 15.444433212280273, "step": 75 }, { "epoch": 0.051806407634628494, "grad_norm": 55.5, "learning_rate": 1.6853932584269665e-05, "loss": 8.601103782653809, "step": 76 }, { "epoch": 0.05248807089297887, "grad_norm": 41.25, "learning_rate": 1.707865168539326e-05, "loss": 10.595943450927734, "step": 77 }, { "epoch": 0.053169734151329244, "grad_norm": 38.75, "learning_rate": 1.7303370786516857e-05, "loss": 8.791882514953613, "step": 78 }, { "epoch": 0.05385139740967962, "grad_norm": 61.5, "learning_rate": 1.752808988764045e-05, "loss": 11.594084739685059, "step": 79 }, { "epoch": 0.054533060668029994, "grad_norm": 44.75, "learning_rate": 1.7752808988764045e-05, "loss": 11.820429801940918, "step": 80 }, { "epoch": 0.05521472392638037, "grad_norm": 39.5, "learning_rate": 1.7977528089887643e-05, "loss": 11.794942855834961, "step": 81 }, { "epoch": 0.05589638718473074, "grad_norm": 33.25, "learning_rate": 1.8202247191011237e-05, "loss": 10.080327987670898, "step": 82 }, { "epoch": 0.05657805044308112, "grad_norm": 31.0, "learning_rate": 1.8426966292134835e-05, "loss": 9.8519287109375, "step": 83 }, { "epoch": 0.05725971370143149, "grad_norm": 72.0, "learning_rate": 1.8651685393258426e-05, "loss": 8.134903907775879, "step": 84 }, { "epoch": 0.05794137695978187, "grad_norm": 58.5, "learning_rate": 1.8876404494382024e-05, "loss": 10.02702522277832, "step": 85 }, { "epoch": 0.05862304021813224, "grad_norm": 41.5, "learning_rate": 1.910112359550562e-05, "loss": 8.772936820983887, "step": 86 }, { "epoch": 0.05930470347648262, "grad_norm": 27.625, "learning_rate": 1.9325842696629215e-05, "loss": 9.463577270507812, "step": 87 }, { "epoch": 0.05998636673483299, "grad_norm": 29.75, "learning_rate": 1.955056179775281e-05, "loss": 7.669590950012207, "step": 88 }, { "epoch": 0.06066802999318337, "grad_norm": 40.5, "learning_rate": 1.9775280898876404e-05, "loss": 9.30628776550293, "step": 89 }, { "epoch": 0.06134969325153374, "grad_norm": 27.375, "learning_rate": 2e-05, "loss": 7.885855197906494, "step": 90 }, { "epoch": 0.06203135650988412, "grad_norm": 38.75, "learning_rate": 1.9999993903154863e-05, "loss": 8.965539932250977, "step": 91 }, { "epoch": 0.0627130197682345, "grad_norm": 51.75, "learning_rate": 1.9999975612626872e-05, "loss": 10.038616180419922, "step": 92 }, { "epoch": 0.06339468302658487, "grad_norm": 59.75, "learning_rate": 1.9999945128438338e-05, "loss": 13.050436019897461, "step": 93 }, { "epoch": 0.06407634628493525, "grad_norm": 27.0, "learning_rate": 1.9999902450626434e-05, "loss": 9.7062406539917, "step": 94 }, { "epoch": 0.06475800954328562, "grad_norm": 23.5, "learning_rate": 1.9999847579243196e-05, "loss": 9.722532272338867, "step": 95 }, { "epoch": 0.065439672801636, "grad_norm": 25.5, "learning_rate": 1.9999780514355533e-05, "loss": 7.783807754516602, "step": 96 }, { "epoch": 0.06612133605998637, "grad_norm": 50.5, "learning_rate": 1.9999701256045223e-05, "loss": 8.890963554382324, "step": 97 }, { "epoch": 0.06680299931833675, "grad_norm": 54.75, "learning_rate": 1.999960980440891e-05, "loss": 6.290140151977539, "step": 98 }, { "epoch": 0.06748466257668712, "grad_norm": 42.75, "learning_rate": 1.9999506159558107e-05, "loss": 7.389662742614746, "step": 99 }, { "epoch": 0.0681663258350375, "grad_norm": 29.875, "learning_rate": 1.9999390321619196e-05, "loss": 8.746122360229492, "step": 100 }, { "epoch": 0.06884798909338787, "grad_norm": 38.5, "learning_rate": 1.9999262290733427e-05, "loss": 9.873523712158203, "step": 101 }, { "epoch": 0.06952965235173825, "grad_norm": 33.0, "learning_rate": 1.9999122067056915e-05, "loss": 7.091464519500732, "step": 102 }, { "epoch": 0.07021131561008861, "grad_norm": 32.5, "learning_rate": 1.9998969650760646e-05, "loss": 9.792913436889648, "step": 103 }, { "epoch": 0.070892978868439, "grad_norm": 30.625, "learning_rate": 1.9998805042030472e-05, "loss": 9.923949241638184, "step": 104 }, { "epoch": 0.07157464212678936, "grad_norm": 47.75, "learning_rate": 1.9998628241067113e-05, "loss": 7.707469940185547, "step": 105 }, { "epoch": 0.07225630538513975, "grad_norm": 28.75, "learning_rate": 1.999843924808615e-05, "loss": 9.238975524902344, "step": 106 }, { "epoch": 0.07293796864349011, "grad_norm": 41.25, "learning_rate": 1.999823806331804e-05, "loss": 9.532022476196289, "step": 107 }, { "epoch": 0.0736196319018405, "grad_norm": 24.875, "learning_rate": 1.9998024687008098e-05, "loss": 9.782421112060547, "step": 108 }, { "epoch": 0.07430129516019086, "grad_norm": 23.75, "learning_rate": 1.9997799119416508e-05, "loss": 8.547179222106934, "step": 109 }, { "epoch": 0.07498295841854125, "grad_norm": 41.25, "learning_rate": 1.9997561360818322e-05, "loss": 11.423990249633789, "step": 110 }, { "epoch": 0.07566462167689161, "grad_norm": 35.5, "learning_rate": 1.999731141150346e-05, "loss": 10.873977661132812, "step": 111 }, { "epoch": 0.076346284935242, "grad_norm": 26.0, "learning_rate": 1.9997049271776693e-05, "loss": 9.771560668945312, "step": 112 }, { "epoch": 0.07702794819359236, "grad_norm": 28.25, "learning_rate": 1.9996774941957673e-05, "loss": 8.348849296569824, "step": 113 }, { "epoch": 0.07770961145194274, "grad_norm": 27.75, "learning_rate": 1.999648842238091e-05, "loss": 8.133264541625977, "step": 114 }, { "epoch": 0.07839127471029311, "grad_norm": 26.75, "learning_rate": 1.999618971339577e-05, "loss": 9.438216209411621, "step": 115 }, { "epoch": 0.0790729379686435, "grad_norm": 44.0, "learning_rate": 1.9995878815366498e-05, "loss": 9.7490234375, "step": 116 }, { "epoch": 0.07975460122699386, "grad_norm": 30.25, "learning_rate": 1.999555572867218e-05, "loss": 10.59623908996582, "step": 117 }, { "epoch": 0.08043626448534424, "grad_norm": 23.5, "learning_rate": 1.9995220453706797e-05, "loss": 8.41657829284668, "step": 118 }, { "epoch": 0.08111792774369461, "grad_norm": 26.25, "learning_rate": 1.9994872990879163e-05, "loss": 6.801867485046387, "step": 119 }, { "epoch": 0.081799591002045, "grad_norm": 30.875, "learning_rate": 1.9994513340612957e-05, "loss": 10.117204666137695, "step": 120 }, { "epoch": 0.08248125426039536, "grad_norm": 29.25, "learning_rate": 1.9994141503346735e-05, "loss": 8.73253059387207, "step": 121 }, { "epoch": 0.08316291751874574, "grad_norm": 50.25, "learning_rate": 1.99937574795339e-05, "loss": 12.908260345458984, "step": 122 }, { "epoch": 0.08384458077709611, "grad_norm": 28.125, "learning_rate": 1.999336126964272e-05, "loss": 8.462295532226562, "step": 123 }, { "epoch": 0.0845262440354465, "grad_norm": 26.75, "learning_rate": 1.9992952874156323e-05, "loss": 9.694671630859375, "step": 124 }, { "epoch": 0.08520790729379686, "grad_norm": 46.0, "learning_rate": 1.9992532293572688e-05, "loss": 5.866450309753418, "step": 125 }, { "epoch": 0.08588957055214724, "grad_norm": 25.625, "learning_rate": 1.9992099528404664e-05, "loss": 10.496939659118652, "step": 126 }, { "epoch": 0.08657123381049761, "grad_norm": 24.75, "learning_rate": 1.9991654579179948e-05, "loss": 7.338177680969238, "step": 127 }, { "epoch": 0.08725289706884799, "grad_norm": 27.5, "learning_rate": 1.9991197446441096e-05, "loss": 9.462447166442871, "step": 128 }, { "epoch": 0.08793456032719836, "grad_norm": 19.75, "learning_rate": 1.9990728130745524e-05, "loss": 7.890719413757324, "step": 129 }, { "epoch": 0.08861622358554874, "grad_norm": 50.75, "learning_rate": 1.9990246632665503e-05, "loss": 7.281942367553711, "step": 130 }, { "epoch": 0.08929788684389911, "grad_norm": 24.125, "learning_rate": 1.998975295278815e-05, "loss": 6.036952972412109, "step": 131 }, { "epoch": 0.08997955010224949, "grad_norm": 39.5, "learning_rate": 1.9989247091715454e-05, "loss": 10.337360382080078, "step": 132 }, { "epoch": 0.09066121336059986, "grad_norm": 38.5, "learning_rate": 1.998872905006423e-05, "loss": 10.778207778930664, "step": 133 }, { "epoch": 0.09134287661895024, "grad_norm": 23.625, "learning_rate": 1.9988198828466182e-05, "loss": 7.6067399978637695, "step": 134 }, { "epoch": 0.09202453987730061, "grad_norm": 28.0, "learning_rate": 1.998765642756783e-05, "loss": 6.079522132873535, "step": 135 }, { "epoch": 0.09270620313565099, "grad_norm": 49.75, "learning_rate": 1.9987101848030566e-05, "loss": 9.402310371398926, "step": 136 }, { "epoch": 0.09338786639400136, "grad_norm": 28.875, "learning_rate": 1.998653509053063e-05, "loss": 10.946863174438477, "step": 137 }, { "epoch": 0.09406952965235174, "grad_norm": 27.25, "learning_rate": 1.9985956155759104e-05, "loss": 7.430831432342529, "step": 138 }, { "epoch": 0.09475119291070211, "grad_norm": 34.5, "learning_rate": 1.9985365044421926e-05, "loss": 12.701254844665527, "step": 139 }, { "epoch": 0.09543285616905249, "grad_norm": 41.5, "learning_rate": 1.9984761757239878e-05, "loss": 6.6523284912109375, "step": 140 }, { "epoch": 0.09611451942740286, "grad_norm": 30.25, "learning_rate": 1.9984146294948585e-05, "loss": 9.092896461486816, "step": 141 }, { "epoch": 0.09679618268575324, "grad_norm": 24.375, "learning_rate": 1.998351865829853e-05, "loss": 5.745997428894043, "step": 142 }, { "epoch": 0.09747784594410361, "grad_norm": 23.875, "learning_rate": 1.9982878848055036e-05, "loss": 7.442193508148193, "step": 143 }, { "epoch": 0.09815950920245399, "grad_norm": 28.125, "learning_rate": 1.9982226864998256e-05, "loss": 6.941527843475342, "step": 144 }, { "epoch": 0.09884117246080436, "grad_norm": 20.875, "learning_rate": 1.998156270992321e-05, "loss": 6.432262420654297, "step": 145 }, { "epoch": 0.09952283571915474, "grad_norm": 42.75, "learning_rate": 1.998088638363974e-05, "loss": 9.149693489074707, "step": 146 }, { "epoch": 0.10020449897750511, "grad_norm": 31.0, "learning_rate": 1.998019788697254e-05, "loss": 9.498212814331055, "step": 147 }, { "epoch": 0.10088616223585549, "grad_norm": 25.5, "learning_rate": 1.9979497220761142e-05, "loss": 7.568300247192383, "step": 148 }, { "epoch": 0.10156782549420586, "grad_norm": 24.5, "learning_rate": 1.997878438585992e-05, "loss": 7.560571670532227, "step": 149 }, { "epoch": 0.10224948875255624, "grad_norm": 39.0, "learning_rate": 1.9978059383138073e-05, "loss": 6.12653112411499, "step": 150 }, { "epoch": 0.1029311520109066, "grad_norm": 33.75, "learning_rate": 1.9977322213479655e-05, "loss": 8.7139892578125, "step": 151 }, { "epoch": 0.10361281526925699, "grad_norm": 48.0, "learning_rate": 1.9976572877783548e-05, "loss": 5.482341289520264, "step": 152 }, { "epoch": 0.10429447852760736, "grad_norm": 36.0, "learning_rate": 1.9975811376963464e-05, "loss": 5.802707195281982, "step": 153 }, { "epoch": 0.10497614178595774, "grad_norm": 27.25, "learning_rate": 1.997503771194796e-05, "loss": 6.929409980773926, "step": 154 }, { "epoch": 0.1056578050443081, "grad_norm": 28.875, "learning_rate": 1.997425188368041e-05, "loss": 8.93964958190918, "step": 155 }, { "epoch": 0.10633946830265849, "grad_norm": 25.125, "learning_rate": 1.9973453893119033e-05, "loss": 4.790992736816406, "step": 156 }, { "epoch": 0.10702113156100886, "grad_norm": 23.25, "learning_rate": 1.9972643741236882e-05, "loss": 6.582825183868408, "step": 157 }, { "epoch": 0.10770279481935924, "grad_norm": 20.125, "learning_rate": 1.9971821429021817e-05, "loss": 5.917995452880859, "step": 158 }, { "epoch": 0.1083844580777096, "grad_norm": 17.875, "learning_rate": 1.997098695747655e-05, "loss": 5.241603374481201, "step": 159 }, { "epoch": 0.10906612133605999, "grad_norm": 45.0, "learning_rate": 1.9970140327618612e-05, "loss": 9.344328880310059, "step": 160 }, { "epoch": 0.10974778459441036, "grad_norm": 25.75, "learning_rate": 1.9969281540480346e-05, "loss": 7.2963409423828125, "step": 161 }, { "epoch": 0.11042944785276074, "grad_norm": 21.625, "learning_rate": 1.9968410597108935e-05, "loss": 7.538847923278809, "step": 162 }, { "epoch": 0.1111111111111111, "grad_norm": 45.0, "learning_rate": 1.9967527498566387e-05, "loss": 10.164068222045898, "step": 163 }, { "epoch": 0.11179277436946149, "grad_norm": 40.75, "learning_rate": 1.9966632245929515e-05, "loss": 9.811113357543945, "step": 164 }, { "epoch": 0.11247443762781185, "grad_norm": 34.75, "learning_rate": 1.9965724840289972e-05, "loss": 5.59637975692749, "step": 165 }, { "epoch": 0.11315610088616224, "grad_norm": 33.25, "learning_rate": 1.996480528275421e-05, "loss": 5.69235897064209, "step": 166 }, { "epoch": 0.1138377641445126, "grad_norm": 22.625, "learning_rate": 1.996387357444352e-05, "loss": 7.57390022277832, "step": 167 }, { "epoch": 0.11451942740286299, "grad_norm": 21.875, "learning_rate": 1.9962929716493987e-05, "loss": 7.872608184814453, "step": 168 }, { "epoch": 0.11520109066121335, "grad_norm": 25.0, "learning_rate": 1.9961973710056535e-05, "loss": 3.9859418869018555, "step": 169 }, { "epoch": 0.11588275391956374, "grad_norm": 29.375, "learning_rate": 1.9961005556296875e-05, "loss": 7.044215202331543, "step": 170 }, { "epoch": 0.1165644171779141, "grad_norm": 25.375, "learning_rate": 1.9960025256395556e-05, "loss": 5.625599384307861, "step": 171 }, { "epoch": 0.11724608043626449, "grad_norm": 46.75, "learning_rate": 1.9959032811547912e-05, "loss": 8.253169059753418, "step": 172 }, { "epoch": 0.11792774369461487, "grad_norm": 37.75, "learning_rate": 1.9958028222964114e-05, "loss": 7.638906478881836, "step": 173 }, { "epoch": 0.11860940695296524, "grad_norm": 58.25, "learning_rate": 1.9957011491869118e-05, "loss": 11.625969886779785, "step": 174 }, { "epoch": 0.11929107021131562, "grad_norm": 24.5, "learning_rate": 1.9955982619502693e-05, "loss": 5.61654806137085, "step": 175 }, { "epoch": 0.11997273346966598, "grad_norm": 21.0, "learning_rate": 1.995494160711942e-05, "loss": 5.503238677978516, "step": 176 }, { "epoch": 0.12065439672801637, "grad_norm": 26.0, "learning_rate": 1.9953888455988674e-05, "loss": 7.334506988525391, "step": 177 }, { "epoch": 0.12133605998636673, "grad_norm": 17.375, "learning_rate": 1.995282316739463e-05, "loss": 6.27517032623291, "step": 178 }, { "epoch": 0.12201772324471712, "grad_norm": 21.75, "learning_rate": 1.995174574263628e-05, "loss": 8.78685474395752, "step": 179 }, { "epoch": 0.12269938650306748, "grad_norm": 28.5, "learning_rate": 1.9950656183027392e-05, "loss": 5.968265056610107, "step": 180 }, { "epoch": 0.12338104976141787, "grad_norm": 29.125, "learning_rate": 1.9949554489896542e-05, "loss": 9.19890022277832, "step": 181 }, { "epoch": 0.12406271301976823, "grad_norm": 41.5, "learning_rate": 1.994844066458711e-05, "loss": 11.692063331604004, "step": 182 }, { "epoch": 0.12474437627811862, "grad_norm": 32.5, "learning_rate": 1.9947314708457245e-05, "loss": 5.405261516571045, "step": 183 }, { "epoch": 0.125426039536469, "grad_norm": 20.75, "learning_rate": 1.9946176622879915e-05, "loss": 8.772290229797363, "step": 184 }, { "epoch": 0.12610770279481937, "grad_norm": 29.0, "learning_rate": 1.994502640924286e-05, "loss": 7.680125713348389, "step": 185 }, { "epoch": 0.12678936605316973, "grad_norm": 34.0, "learning_rate": 1.994386406894862e-05, "loss": 7.944326400756836, "step": 186 }, { "epoch": 0.1274710293115201, "grad_norm": 53.0, "learning_rate": 1.9942689603414513e-05, "loss": 8.110595703125, "step": 187 }, { "epoch": 0.1281526925698705, "grad_norm": 24.875, "learning_rate": 1.9941503014072646e-05, "loss": 9.813580513000488, "step": 188 }, { "epoch": 0.12883435582822086, "grad_norm": 25.625, "learning_rate": 1.9940304302369912e-05, "loss": 5.48173189163208, "step": 189 }, { "epoch": 0.12951601908657123, "grad_norm": 35.0, "learning_rate": 1.993909346976798e-05, "loss": 4.518528938293457, "step": 190 }, { "epoch": 0.1301976823449216, "grad_norm": 20.875, "learning_rate": 1.9937870517743304e-05, "loss": 4.664849281311035, "step": 191 }, { "epoch": 0.130879345603272, "grad_norm": 27.5, "learning_rate": 1.9936635447787112e-05, "loss": 4.781461715698242, "step": 192 }, { "epoch": 0.13156100886162236, "grad_norm": 25.625, "learning_rate": 1.9935388261405412e-05, "loss": 6.724184036254883, "step": 193 }, { "epoch": 0.13224267211997273, "grad_norm": 24.25, "learning_rate": 1.993412896011898e-05, "loss": 4.6410298347473145, "step": 194 }, { "epoch": 0.1329243353783231, "grad_norm": 35.25, "learning_rate": 1.993285754546338e-05, "loss": 6.395364761352539, "step": 195 }, { "epoch": 0.1336059986366735, "grad_norm": 39.5, "learning_rate": 1.9931574018988916e-05, "loss": 8.52147388458252, "step": 196 }, { "epoch": 0.13428766189502386, "grad_norm": 26.0, "learning_rate": 1.99302783822607e-05, "loss": 5.561795234680176, "step": 197 }, { "epoch": 0.13496932515337423, "grad_norm": 21.875, "learning_rate": 1.9928970636858584e-05, "loss": 6.095944404602051, "step": 198 }, { "epoch": 0.1356509884117246, "grad_norm": 27.75, "learning_rate": 1.992765078437719e-05, "loss": 8.251648902893066, "step": 199 }, { "epoch": 0.136332651670075, "grad_norm": 14.875, "learning_rate": 1.9926318826425905e-05, "loss": 5.511835098266602, "step": 200 }, { "epoch": 0.13701431492842536, "grad_norm": 24.125, "learning_rate": 1.992497476462888e-05, "loss": 4.936651706695557, "step": 201 }, { "epoch": 0.13769597818677573, "grad_norm": 17.875, "learning_rate": 1.9923618600625025e-05, "loss": 5.025204658508301, "step": 202 }, { "epoch": 0.1383776414451261, "grad_norm": 29.75, "learning_rate": 1.9922250336068e-05, "loss": 4.812963485717773, "step": 203 }, { "epoch": 0.1390593047034765, "grad_norm": 22.25, "learning_rate": 1.9920869972626225e-05, "loss": 5.645591735839844, "step": 204 }, { "epoch": 0.13974096796182686, "grad_norm": 22.625, "learning_rate": 1.9919477511982873e-05, "loss": 6.975612640380859, "step": 205 }, { "epoch": 0.14042263122017723, "grad_norm": 23.375, "learning_rate": 1.991807295583587e-05, "loss": 8.177459716796875, "step": 206 }, { "epoch": 0.1411042944785276, "grad_norm": 15.375, "learning_rate": 1.991665630589788e-05, "loss": 5.366596221923828, "step": 207 }, { "epoch": 0.141785957736878, "grad_norm": 21.125, "learning_rate": 1.9915227563896327e-05, "loss": 5.395918846130371, "step": 208 }, { "epoch": 0.14246762099522836, "grad_norm": 18.375, "learning_rate": 1.9913786731573382e-05, "loss": 3.3276357650756836, "step": 209 }, { "epoch": 0.14314928425357873, "grad_norm": 15.4375, "learning_rate": 1.991233381068594e-05, "loss": 5.434110641479492, "step": 210 }, { "epoch": 0.1438309475119291, "grad_norm": 21.375, "learning_rate": 1.991086880300565e-05, "loss": 4.756390571594238, "step": 211 }, { "epoch": 0.1445126107702795, "grad_norm": 25.5, "learning_rate": 1.9909391710318907e-05, "loss": 6.9003143310546875, "step": 212 }, { "epoch": 0.14519427402862986, "grad_norm": 14.8125, "learning_rate": 1.990790253442682e-05, "loss": 4.234390735626221, "step": 213 }, { "epoch": 0.14587593728698023, "grad_norm": 41.0, "learning_rate": 1.990640127714525e-05, "loss": 10.397843360900879, "step": 214 }, { "epoch": 0.1465576005453306, "grad_norm": 26.75, "learning_rate": 1.990488794030478e-05, "loss": 3.9121079444885254, "step": 215 }, { "epoch": 0.147239263803681, "grad_norm": 21.75, "learning_rate": 1.990336252575073e-05, "loss": 3.8865060806274414, "step": 216 }, { "epoch": 0.14792092706203136, "grad_norm": 17.25, "learning_rate": 1.990182503534314e-05, "loss": 3.9200165271759033, "step": 217 }, { "epoch": 0.14860259032038173, "grad_norm": 29.375, "learning_rate": 1.9900275470956778e-05, "loss": 3.5212175846099854, "step": 218 }, { "epoch": 0.1492842535787321, "grad_norm": 24.125, "learning_rate": 1.9898713834481137e-05, "loss": 7.625463485717773, "step": 219 }, { "epoch": 0.1499659168370825, "grad_norm": 19.5, "learning_rate": 1.9897140127820432e-05, "loss": 3.7833309173583984, "step": 220 }, { "epoch": 0.15064758009543286, "grad_norm": 36.25, "learning_rate": 1.9895554352893584e-05, "loss": 8.871758460998535, "step": 221 }, { "epoch": 0.15132924335378323, "grad_norm": 42.0, "learning_rate": 1.9893956511634242e-05, "loss": 8.002093315124512, "step": 222 }, { "epoch": 0.1520109066121336, "grad_norm": 27.5, "learning_rate": 1.9892346605990764e-05, "loss": 5.860401153564453, "step": 223 }, { "epoch": 0.152692569870484, "grad_norm": 25.625, "learning_rate": 1.989072463792622e-05, "loss": 6.242304801940918, "step": 224 }, { "epoch": 0.15337423312883436, "grad_norm": 27.5, "learning_rate": 1.9889090609418384e-05, "loss": 8.299890518188477, "step": 225 }, { "epoch": 0.15405589638718473, "grad_norm": 26.125, "learning_rate": 1.9887444522459743e-05, "loss": 4.64989709854126, "step": 226 }, { "epoch": 0.1547375596455351, "grad_norm": 24.125, "learning_rate": 1.9885786379057487e-05, "loss": 7.86977481842041, "step": 227 }, { "epoch": 0.1554192229038855, "grad_norm": 26.375, "learning_rate": 1.9884116181233496e-05, "loss": 6.246367931365967, "step": 228 }, { "epoch": 0.15610088616223586, "grad_norm": 61.0, "learning_rate": 1.9882433931024367e-05, "loss": 4.3841142654418945, "step": 229 }, { "epoch": 0.15678254942058623, "grad_norm": 15.25, "learning_rate": 1.9880739630481376e-05, "loss": 6.021090984344482, "step": 230 }, { "epoch": 0.1574642126789366, "grad_norm": 25.75, "learning_rate": 1.9879033281670508e-05, "loss": 6.868640422821045, "step": 231 }, { "epoch": 0.158145875937287, "grad_norm": 28.5, "learning_rate": 1.9877314886672424e-05, "loss": 7.8858747482299805, "step": 232 }, { "epoch": 0.15882753919563736, "grad_norm": 17.25, "learning_rate": 1.987558444758249e-05, "loss": 5.481084823608398, "step": 233 }, { "epoch": 0.15950920245398773, "grad_norm": 29.375, "learning_rate": 1.9873841966510744e-05, "loss": 4.549655914306641, "step": 234 }, { "epoch": 0.1601908657123381, "grad_norm": 27.0, "learning_rate": 1.9872087445581912e-05, "loss": 5.833483695983887, "step": 235 }, { "epoch": 0.1608725289706885, "grad_norm": 33.25, "learning_rate": 1.987032088693541e-05, "loss": 7.040109634399414, "step": 236 }, { "epoch": 0.16155419222903886, "grad_norm": 30.5, "learning_rate": 1.9868542292725316e-05, "loss": 6.993575096130371, "step": 237 }, { "epoch": 0.16223585548738922, "grad_norm": 18.0, "learning_rate": 1.9866751665120398e-05, "loss": 4.666120529174805, "step": 238 }, { "epoch": 0.1629175187457396, "grad_norm": 16.375, "learning_rate": 1.9864949006304094e-05, "loss": 3.779996395111084, "step": 239 }, { "epoch": 0.16359918200409, "grad_norm": 24.5, "learning_rate": 1.9863134318474504e-05, "loss": 6.483820915222168, "step": 240 }, { "epoch": 0.16428084526244036, "grad_norm": 27.75, "learning_rate": 1.986130760384441e-05, "loss": 7.43174934387207, "step": 241 }, { "epoch": 0.16496250852079072, "grad_norm": 18.25, "learning_rate": 1.9859468864641242e-05, "loss": 5.192183494567871, "step": 242 }, { "epoch": 0.1656441717791411, "grad_norm": 21.75, "learning_rate": 1.985761810310711e-05, "loss": 5.670318603515625, "step": 243 }, { "epoch": 0.1663258350374915, "grad_norm": 35.5, "learning_rate": 1.985575532149877e-05, "loss": 9.05514144897461, "step": 244 }, { "epoch": 0.16700749829584186, "grad_norm": 27.625, "learning_rate": 1.985388052208764e-05, "loss": 7.165792942047119, "step": 245 }, { "epoch": 0.16768916155419222, "grad_norm": 36.75, "learning_rate": 1.9851993707159794e-05, "loss": 5.000845909118652, "step": 246 }, { "epoch": 0.1683708248125426, "grad_norm": 29.875, "learning_rate": 1.985009487901596e-05, "loss": 7.4508256912231445, "step": 247 }, { "epoch": 0.169052488070893, "grad_norm": 52.5, "learning_rate": 1.9848184039971502e-05, "loss": 5.9550065994262695, "step": 248 }, { "epoch": 0.16973415132924335, "grad_norm": 27.125, "learning_rate": 1.9846261192356442e-05, "loss": 6.4095048904418945, "step": 249 }, { "epoch": 0.17041581458759372, "grad_norm": 35.75, "learning_rate": 1.9844326338515444e-05, "loss": 4.5142340660095215, "step": 250 }, { "epoch": 0.1710974778459441, "grad_norm": 23.125, "learning_rate": 1.9842379480807804e-05, "loss": 4.987588882446289, "step": 251 }, { "epoch": 0.17177914110429449, "grad_norm": 20.25, "learning_rate": 1.984042062160746e-05, "loss": 3.48603892326355, "step": 252 }, { "epoch": 0.17246080436264485, "grad_norm": 42.25, "learning_rate": 1.983844976330299e-05, "loss": 8.386517524719238, "step": 253 }, { "epoch": 0.17314246762099522, "grad_norm": 33.25, "learning_rate": 1.983646690829759e-05, "loss": 7.611467361450195, "step": 254 }, { "epoch": 0.1738241308793456, "grad_norm": 41.0, "learning_rate": 1.9834472059009097e-05, "loss": 6.919773101806641, "step": 255 }, { "epoch": 0.17450579413769599, "grad_norm": 25.5, "learning_rate": 1.9832465217869964e-05, "loss": 4.884511947631836, "step": 256 }, { "epoch": 0.17518745739604635, "grad_norm": 77.0, "learning_rate": 1.9830446387327277e-05, "loss": 12.053509712219238, "step": 257 }, { "epoch": 0.17586912065439672, "grad_norm": 72.0, "learning_rate": 1.9828415569842732e-05, "loss": 11.680207252502441, "step": 258 }, { "epoch": 0.17655078391274712, "grad_norm": 24.125, "learning_rate": 1.9826372767892644e-05, "loss": 3.387017011642456, "step": 259 }, { "epoch": 0.17723244717109748, "grad_norm": 41.75, "learning_rate": 1.982431798396794e-05, "loss": 9.364209175109863, "step": 260 }, { "epoch": 0.17791411042944785, "grad_norm": 35.25, "learning_rate": 1.982225122057417e-05, "loss": 5.37290620803833, "step": 261 }, { "epoch": 0.17859577368779822, "grad_norm": 37.75, "learning_rate": 1.9820172480231476e-05, "loss": 6.597467422485352, "step": 262 }, { "epoch": 0.17927743694614862, "grad_norm": 24.75, "learning_rate": 1.9818081765474602e-05, "loss": 8.09438705444336, "step": 263 }, { "epoch": 0.17995910020449898, "grad_norm": 23.5, "learning_rate": 1.981597907885291e-05, "loss": 7.103013038635254, "step": 264 }, { "epoch": 0.18064076346284935, "grad_norm": 30.125, "learning_rate": 1.9813864422930345e-05, "loss": 8.737072944641113, "step": 265 }, { "epoch": 0.18132242672119972, "grad_norm": 28.375, "learning_rate": 1.981173780028546e-05, "loss": 9.382484436035156, "step": 266 }, { "epoch": 0.18200408997955012, "grad_norm": 46.25, "learning_rate": 1.980959921351139e-05, "loss": 7.1068010330200195, "step": 267 }, { "epoch": 0.18268575323790048, "grad_norm": 30.5, "learning_rate": 1.980744866521586e-05, "loss": 4.572725296020508, "step": 268 }, { "epoch": 0.18336741649625085, "grad_norm": 22.625, "learning_rate": 1.980528615802118e-05, "loss": 5.960131645202637, "step": 269 }, { "epoch": 0.18404907975460122, "grad_norm": 42.25, "learning_rate": 1.9803111694564246e-05, "loss": 4.680814743041992, "step": 270 }, { "epoch": 0.18473074301295161, "grad_norm": 21.0, "learning_rate": 1.9800925277496532e-05, "loss": 4.171243667602539, "step": 271 }, { "epoch": 0.18541240627130198, "grad_norm": 19.125, "learning_rate": 1.979872690948409e-05, "loss": 5.246752738952637, "step": 272 }, { "epoch": 0.18609406952965235, "grad_norm": 22.5, "learning_rate": 1.9796516593207537e-05, "loss": 4.766591548919678, "step": 273 }, { "epoch": 0.18677573278800272, "grad_norm": 59.75, "learning_rate": 1.979429433136207e-05, "loss": 12.800636291503906, "step": 274 }, { "epoch": 0.18745739604635311, "grad_norm": 31.625, "learning_rate": 1.9792060126657437e-05, "loss": 5.936284065246582, "step": 275 }, { "epoch": 0.18813905930470348, "grad_norm": 53.75, "learning_rate": 1.9789813981817963e-05, "loss": 10.339212417602539, "step": 276 }, { "epoch": 0.18882072256305385, "grad_norm": 27.375, "learning_rate": 1.9787555899582533e-05, "loss": 6.968416213989258, "step": 277 }, { "epoch": 0.18950238582140422, "grad_norm": 22.625, "learning_rate": 1.978528588270458e-05, "loss": 4.656405925750732, "step": 278 }, { "epoch": 0.1901840490797546, "grad_norm": 24.875, "learning_rate": 1.9783003933952082e-05, "loss": 6.244778156280518, "step": 279 }, { "epoch": 0.19086571233810498, "grad_norm": 27.0, "learning_rate": 1.9780710056107587e-05, "loss": 8.363030433654785, "step": 280 }, { "epoch": 0.19154737559645535, "grad_norm": 17.125, "learning_rate": 1.9778404251968176e-05, "loss": 6.711296081542969, "step": 281 }, { "epoch": 0.19222903885480572, "grad_norm": 21.25, "learning_rate": 1.977608652434548e-05, "loss": 8.933284759521484, "step": 282 }, { "epoch": 0.1929107021131561, "grad_norm": 21.875, "learning_rate": 1.9773756876065655e-05, "loss": 8.752829551696777, "step": 283 }, { "epoch": 0.19359236537150648, "grad_norm": 16.875, "learning_rate": 1.9771415309969406e-05, "loss": 6.1796369552612305, "step": 284 }, { "epoch": 0.19427402862985685, "grad_norm": 56.75, "learning_rate": 1.976906182891197e-05, "loss": 6.736907958984375, "step": 285 }, { "epoch": 0.19495569188820722, "grad_norm": 46.25, "learning_rate": 1.97666964357631e-05, "loss": 5.108454704284668, "step": 286 }, { "epoch": 0.1956373551465576, "grad_norm": 24.125, "learning_rate": 1.9764319133407095e-05, "loss": 11.29523754119873, "step": 287 }, { "epoch": 0.19631901840490798, "grad_norm": 30.25, "learning_rate": 1.9761929924742756e-05, "loss": 6.913311004638672, "step": 288 }, { "epoch": 0.19700068166325835, "grad_norm": 25.875, "learning_rate": 1.975952881268341e-05, "loss": 8.411927223205566, "step": 289 }, { "epoch": 0.19768234492160872, "grad_norm": 20.25, "learning_rate": 1.97571158001569e-05, "loss": 9.889822959899902, "step": 290 }, { "epoch": 0.1983640081799591, "grad_norm": 23.875, "learning_rate": 1.975469089010558e-05, "loss": 6.81682014465332, "step": 291 }, { "epoch": 0.19904567143830948, "grad_norm": 25.625, "learning_rate": 1.975225408548631e-05, "loss": 4.505208969116211, "step": 292 }, { "epoch": 0.19972733469665985, "grad_norm": 27.875, "learning_rate": 1.9749805389270453e-05, "loss": 9.784196853637695, "step": 293 }, { "epoch": 0.20040899795501022, "grad_norm": 19.0, "learning_rate": 1.9747344804443873e-05, "loss": 6.655136585235596, "step": 294 }, { "epoch": 0.2010906612133606, "grad_norm": 23.75, "learning_rate": 1.9744872334006936e-05, "loss": 6.012855529785156, "step": 295 }, { "epoch": 0.20177232447171098, "grad_norm": 21.0, "learning_rate": 1.9742387980974484e-05, "loss": 6.321096897125244, "step": 296 }, { "epoch": 0.20245398773006135, "grad_norm": 17.25, "learning_rate": 1.973989174837587e-05, "loss": 5.344976425170898, "step": 297 }, { "epoch": 0.20313565098841171, "grad_norm": 17.875, "learning_rate": 1.9737383639254924e-05, "loss": 6.940766334533691, "step": 298 }, { "epoch": 0.2038173142467621, "grad_norm": 18.375, "learning_rate": 1.9734863656669948e-05, "loss": 5.863371849060059, "step": 299 }, { "epoch": 0.20449897750511248, "grad_norm": 28.125, "learning_rate": 1.973233180369374e-05, "loss": 7.586820602416992, "step": 300 }, { "epoch": 0.20518064076346285, "grad_norm": 20.375, "learning_rate": 1.9729788083413558e-05, "loss": 8.29321575164795, "step": 301 }, { "epoch": 0.2058623040218132, "grad_norm": 22.5, "learning_rate": 1.9727232498931135e-05, "loss": 6.42503023147583, "step": 302 }, { "epoch": 0.2065439672801636, "grad_norm": 18.75, "learning_rate": 1.972466505336267e-05, "loss": 5.483799934387207, "step": 303 }, { "epoch": 0.20722563053851398, "grad_norm": 19.5, "learning_rate": 1.9722085749838835e-05, "loss": 6.4698967933654785, "step": 304 }, { "epoch": 0.20790729379686435, "grad_norm": 29.0, "learning_rate": 1.9719494591504747e-05, "loss": 6.794079780578613, "step": 305 }, { "epoch": 0.2085889570552147, "grad_norm": 29.25, "learning_rate": 1.9716891581519983e-05, "loss": 8.430429458618164, "step": 306 }, { "epoch": 0.2092706203135651, "grad_norm": 24.625, "learning_rate": 1.9714276723058576e-05, "loss": 7.812586307525635, "step": 307 }, { "epoch": 0.20995228357191548, "grad_norm": 30.375, "learning_rate": 1.9711650019309e-05, "loss": 10.433080673217773, "step": 308 }, { "epoch": 0.21063394683026584, "grad_norm": 39.25, "learning_rate": 1.970901147347418e-05, "loss": 6.7389116287231445, "step": 309 }, { "epoch": 0.2113156100886162, "grad_norm": 19.125, "learning_rate": 1.9706361088771474e-05, "loss": 7.359713554382324, "step": 310 }, { "epoch": 0.2119972733469666, "grad_norm": 22.75, "learning_rate": 1.9703698868432676e-05, "loss": 8.524877548217773, "step": 311 }, { "epoch": 0.21267893660531698, "grad_norm": 23.625, "learning_rate": 1.9701024815704023e-05, "loss": 8.210470199584961, "step": 312 }, { "epoch": 0.21336059986366734, "grad_norm": 21.0, "learning_rate": 1.9698338933846172e-05, "loss": 7.218918323516846, "step": 313 }, { "epoch": 0.2140422631220177, "grad_norm": 43.75, "learning_rate": 1.9695641226134196e-05, "loss": 4.425941467285156, "step": 314 }, { "epoch": 0.2147239263803681, "grad_norm": 33.0, "learning_rate": 1.96929316958576e-05, "loss": 6.4954376220703125, "step": 315 }, { "epoch": 0.21540558963871848, "grad_norm": 16.5, "learning_rate": 1.9690210346320304e-05, "loss": 3.473909854888916, "step": 316 }, { "epoch": 0.21608725289706884, "grad_norm": 19.75, "learning_rate": 1.9687477180840634e-05, "loss": 9.46400260925293, "step": 317 }, { "epoch": 0.2167689161554192, "grad_norm": 16.875, "learning_rate": 1.9684732202751328e-05, "loss": 4.723077297210693, "step": 318 }, { "epoch": 0.2174505794137696, "grad_norm": 17.625, "learning_rate": 1.968197541539953e-05, "loss": 6.001959800720215, "step": 319 }, { "epoch": 0.21813224267211997, "grad_norm": 54.25, "learning_rate": 1.9679206822146776e-05, "loss": 5.832144260406494, "step": 320 }, { "epoch": 0.21881390593047034, "grad_norm": 21.25, "learning_rate": 1.967642642636901e-05, "loss": 4.5882463455200195, "step": 321 }, { "epoch": 0.2194955691888207, "grad_norm": 22.25, "learning_rate": 1.9673634231456554e-05, "loss": 3.8746566772460938, "step": 322 }, { "epoch": 0.2201772324471711, "grad_norm": 33.0, "learning_rate": 1.9670830240814127e-05, "loss": 6.420742511749268, "step": 323 }, { "epoch": 0.22085889570552147, "grad_norm": 14.625, "learning_rate": 1.9668014457860828e-05, "loss": 4.0230512619018555, "step": 324 }, { "epoch": 0.22154055896387184, "grad_norm": 32.5, "learning_rate": 1.9665186886030135e-05, "loss": 5.967896461486816, "step": 325 }, { "epoch": 0.2222222222222222, "grad_norm": 22.0, "learning_rate": 1.96623475287699e-05, "loss": 4.741808891296387, "step": 326 }, { "epoch": 0.2229038854805726, "grad_norm": 20.75, "learning_rate": 1.965949638954235e-05, "loss": 5.939693450927734, "step": 327 }, { "epoch": 0.22358554873892297, "grad_norm": 23.125, "learning_rate": 1.9656633471824075e-05, "loss": 6.899787902832031, "step": 328 }, { "epoch": 0.22426721199727334, "grad_norm": 21.375, "learning_rate": 1.9653758779106028e-05, "loss": 5.684700965881348, "step": 329 }, { "epoch": 0.2249488752556237, "grad_norm": 29.75, "learning_rate": 1.9650872314893523e-05, "loss": 3.586632251739502, "step": 330 }, { "epoch": 0.2256305385139741, "grad_norm": 18.625, "learning_rate": 1.964797408270622e-05, "loss": 3.505176305770874, "step": 331 }, { "epoch": 0.22631220177232447, "grad_norm": 24.625, "learning_rate": 1.9645064086078135e-05, "loss": 3.1256818771362305, "step": 332 }, { "epoch": 0.22699386503067484, "grad_norm": 19.25, "learning_rate": 1.964214232855763e-05, "loss": 3.896146297454834, "step": 333 }, { "epoch": 0.2276755282890252, "grad_norm": 40.75, "learning_rate": 1.9639208813707407e-05, "loss": 8.692360877990723, "step": 334 }, { "epoch": 0.2283571915473756, "grad_norm": 17.375, "learning_rate": 1.9636263545104498e-05, "loss": 3.2091927528381348, "step": 335 }, { "epoch": 0.22903885480572597, "grad_norm": 19.75, "learning_rate": 1.9633306526340273e-05, "loss": 5.988053321838379, "step": 336 }, { "epoch": 0.22972051806407634, "grad_norm": 18.0, "learning_rate": 1.9630337761020436e-05, "loss": 4.111014366149902, "step": 337 }, { "epoch": 0.2304021813224267, "grad_norm": 40.0, "learning_rate": 1.9627357252765e-05, "loss": 9.038805961608887, "step": 338 }, { "epoch": 0.2310838445807771, "grad_norm": 34.5, "learning_rate": 1.9624365005208303e-05, "loss": 8.002530097961426, "step": 339 }, { "epoch": 0.23176550783912747, "grad_norm": 42.25, "learning_rate": 1.962136102199901e-05, "loss": 8.740842819213867, "step": 340 }, { "epoch": 0.23244717109747784, "grad_norm": 24.0, "learning_rate": 1.961834530680007e-05, "loss": 5.005092620849609, "step": 341 }, { "epoch": 0.2331288343558282, "grad_norm": 12.875, "learning_rate": 1.9615317863288765e-05, "loss": 3.427917957305908, "step": 342 }, { "epoch": 0.2338104976141786, "grad_norm": 41.75, "learning_rate": 1.9612278695156662e-05, "loss": 6.940898895263672, "step": 343 }, { "epoch": 0.23449216087252897, "grad_norm": 61.75, "learning_rate": 1.9609227806109627e-05, "loss": 5.176095962524414, "step": 344 }, { "epoch": 0.23517382413087934, "grad_norm": 25.125, "learning_rate": 1.9606165199867822e-05, "loss": 9.740903854370117, "step": 345 }, { "epoch": 0.23585548738922973, "grad_norm": 29.75, "learning_rate": 1.960309088016569e-05, "loss": 8.038175582885742, "step": 346 }, { "epoch": 0.2365371506475801, "grad_norm": 37.75, "learning_rate": 1.9600004850751967e-05, "loss": 4.647739887237549, "step": 347 }, { "epoch": 0.23721881390593047, "grad_norm": 24.5, "learning_rate": 1.9596907115389656e-05, "loss": 5.158146381378174, "step": 348 }, { "epoch": 0.23790047716428084, "grad_norm": 25.875, "learning_rate": 1.9593797677856043e-05, "loss": 6.1644439697265625, "step": 349 }, { "epoch": 0.23858214042263123, "grad_norm": 21.5, "learning_rate": 1.959067654194268e-05, "loss": 6.613912582397461, "step": 350 }, { "epoch": 0.2392638036809816, "grad_norm": 20.625, "learning_rate": 1.9587543711455383e-05, "loss": 4.524042129516602, "step": 351 }, { "epoch": 0.23994546693933197, "grad_norm": 17.25, "learning_rate": 1.958439919021423e-05, "loss": 5.289886474609375, "step": 352 }, { "epoch": 0.24062713019768234, "grad_norm": 13.375, "learning_rate": 1.9581242982053546e-05, "loss": 2.5090675354003906, "step": 353 }, { "epoch": 0.24130879345603273, "grad_norm": 26.875, "learning_rate": 1.957807509082192e-05, "loss": 6.08259391784668, "step": 354 }, { "epoch": 0.2419904567143831, "grad_norm": 29.75, "learning_rate": 1.9574895520382183e-05, "loss": 7.746129035949707, "step": 355 }, { "epoch": 0.24267211997273347, "grad_norm": 25.25, "learning_rate": 1.9571704274611397e-05, "loss": 6.677036285400391, "step": 356 }, { "epoch": 0.24335378323108384, "grad_norm": 27.75, "learning_rate": 1.956850135740087e-05, "loss": 6.980639457702637, "step": 357 }, { "epoch": 0.24403544648943423, "grad_norm": 24.625, "learning_rate": 1.9565286772656145e-05, "loss": 6.017238616943359, "step": 358 }, { "epoch": 0.2447171097477846, "grad_norm": 24.5, "learning_rate": 1.9562060524296983e-05, "loss": 4.629112243652344, "step": 359 }, { "epoch": 0.24539877300613497, "grad_norm": 21.875, "learning_rate": 1.955882261625737e-05, "loss": 4.669083118438721, "step": 360 }, { "epoch": 0.24608043626448534, "grad_norm": 24.875, "learning_rate": 1.9555573052485518e-05, "loss": 9.551931381225586, "step": 361 }, { "epoch": 0.24676209952283573, "grad_norm": 24.25, "learning_rate": 1.9552311836943832e-05, "loss": 6.547706604003906, "step": 362 }, { "epoch": 0.2474437627811861, "grad_norm": 32.0, "learning_rate": 1.9549038973608952e-05, "loss": 5.336575508117676, "step": 363 }, { "epoch": 0.24812542603953647, "grad_norm": 37.25, "learning_rate": 1.9545754466471696e-05, "loss": 9.606460571289062, "step": 364 }, { "epoch": 0.24880708929788684, "grad_norm": 21.5, "learning_rate": 1.9542458319537094e-05, "loss": 6.098098278045654, "step": 365 }, { "epoch": 0.24948875255623723, "grad_norm": 24.0, "learning_rate": 1.9539150536824363e-05, "loss": 4.937188148498535, "step": 366 }, { "epoch": 0.2501704158145876, "grad_norm": 31.375, "learning_rate": 1.953583112236691e-05, "loss": 3.4440088272094727, "step": 367 }, { "epoch": 0.250852079072938, "grad_norm": 18.5, "learning_rate": 1.9532500080212333e-05, "loss": 6.272159576416016, "step": 368 }, { "epoch": 0.25153374233128833, "grad_norm": 13.625, "learning_rate": 1.9529157414422398e-05, "loss": 4.610774040222168, "step": 369 }, { "epoch": 0.25221540558963873, "grad_norm": 22.5, "learning_rate": 1.9525803129073046e-05, "loss": 6.640964984893799, "step": 370 }, { "epoch": 0.25289706884798907, "grad_norm": 25.0, "learning_rate": 1.9522437228254386e-05, "loss": 8.267536163330078, "step": 371 }, { "epoch": 0.25357873210633947, "grad_norm": 17.375, "learning_rate": 1.9519059716070702e-05, "loss": 4.9635162353515625, "step": 372 }, { "epoch": 0.25426039536468986, "grad_norm": 25.75, "learning_rate": 1.951567059664042e-05, "loss": 7.538964748382568, "step": 373 }, { "epoch": 0.2549420586230402, "grad_norm": 31.125, "learning_rate": 1.9512269874096132e-05, "loss": 10.431976318359375, "step": 374 }, { "epoch": 0.2556237218813906, "grad_norm": 31.375, "learning_rate": 1.9508857552584574e-05, "loss": 6.4673004150390625, "step": 375 }, { "epoch": 0.256305385139741, "grad_norm": 21.375, "learning_rate": 1.9505433636266618e-05, "loss": 7.063249588012695, "step": 376 }, { "epoch": 0.25698704839809133, "grad_norm": 19.0, "learning_rate": 1.9501998129317288e-05, "loss": 8.241815567016602, "step": 377 }, { "epoch": 0.25766871165644173, "grad_norm": 21.375, "learning_rate": 1.9498551035925736e-05, "loss": 7.799825191497803, "step": 378 }, { "epoch": 0.25835037491479207, "grad_norm": 31.375, "learning_rate": 1.9495092360295236e-05, "loss": 3.994025230407715, "step": 379 }, { "epoch": 0.25903203817314246, "grad_norm": 23.75, "learning_rate": 1.9491622106643195e-05, "loss": 6.96708869934082, "step": 380 }, { "epoch": 0.25971370143149286, "grad_norm": 23.5, "learning_rate": 1.9488140279201128e-05, "loss": 8.4287748336792, "step": 381 }, { "epoch": 0.2603953646898432, "grad_norm": 19.875, "learning_rate": 1.948464688221467e-05, "loss": 7.34901762008667, "step": 382 }, { "epoch": 0.2610770279481936, "grad_norm": 49.0, "learning_rate": 1.948114191994356e-05, "loss": 11.35413932800293, "step": 383 }, { "epoch": 0.261758691206544, "grad_norm": 20.625, "learning_rate": 1.9477625396661643e-05, "loss": 6.597695350646973, "step": 384 }, { "epoch": 0.26244035446489433, "grad_norm": 16.375, "learning_rate": 1.9474097316656856e-05, "loss": 6.020539283752441, "step": 385 }, { "epoch": 0.2631220177232447, "grad_norm": 22.25, "learning_rate": 1.947055768423123e-05, "loss": 6.662626266479492, "step": 386 }, { "epoch": 0.26380368098159507, "grad_norm": 18.375, "learning_rate": 1.9467006503700886e-05, "loss": 5.945688247680664, "step": 387 }, { "epoch": 0.26448534423994546, "grad_norm": 41.25, "learning_rate": 1.946344377939602e-05, "loss": 5.684548377990723, "step": 388 }, { "epoch": 0.26516700749829586, "grad_norm": 24.75, "learning_rate": 1.945986951566091e-05, "loss": 4.648403644561768, "step": 389 }, { "epoch": 0.2658486707566462, "grad_norm": 25.875, "learning_rate": 1.9456283716853906e-05, "loss": 3.4672482013702393, "step": 390 }, { "epoch": 0.2665303340149966, "grad_norm": 27.0, "learning_rate": 1.9452686387347414e-05, "loss": 8.25106430053711, "step": 391 }, { "epoch": 0.267211997273347, "grad_norm": 15.75, "learning_rate": 1.9449077531527906e-05, "loss": 5.596707344055176, "step": 392 }, { "epoch": 0.26789366053169733, "grad_norm": 13.1875, "learning_rate": 1.9445457153795912e-05, "loss": 2.294459819793701, "step": 393 }, { "epoch": 0.2685753237900477, "grad_norm": 20.375, "learning_rate": 1.944182525856601e-05, "loss": 5.581919193267822, "step": 394 }, { "epoch": 0.26925698704839807, "grad_norm": 37.25, "learning_rate": 1.9438181850266815e-05, "loss": 8.416301727294922, "step": 395 }, { "epoch": 0.26993865030674846, "grad_norm": 51.5, "learning_rate": 1.9434526933340993e-05, "loss": 9.655067443847656, "step": 396 }, { "epoch": 0.27062031356509886, "grad_norm": 37.5, "learning_rate": 1.9430860512245233e-05, "loss": 9.413631439208984, "step": 397 }, { "epoch": 0.2713019768234492, "grad_norm": 23.125, "learning_rate": 1.9427182591450252e-05, "loss": 7.430983543395996, "step": 398 }, { "epoch": 0.2719836400817996, "grad_norm": 48.25, "learning_rate": 1.9423493175440797e-05, "loss": 11.370874404907227, "step": 399 }, { "epoch": 0.27266530334015, "grad_norm": 22.375, "learning_rate": 1.941979226871563e-05, "loss": 7.413367748260498, "step": 400 }, { "epoch": 0.27334696659850033, "grad_norm": 23.0, "learning_rate": 1.9416079875787518e-05, "loss": 7.3853654861450195, "step": 401 }, { "epoch": 0.2740286298568507, "grad_norm": 22.25, "learning_rate": 1.9412356001183234e-05, "loss": 7.70382833480835, "step": 402 }, { "epoch": 0.27471029311520107, "grad_norm": 32.25, "learning_rate": 1.9408620649443563e-05, "loss": 4.789129257202148, "step": 403 }, { "epoch": 0.27539195637355146, "grad_norm": 29.125, "learning_rate": 1.940487382512328e-05, "loss": 8.876415252685547, "step": 404 }, { "epoch": 0.27607361963190186, "grad_norm": 34.75, "learning_rate": 1.9401115532791134e-05, "loss": 6.307229042053223, "step": 405 }, { "epoch": 0.2767552828902522, "grad_norm": 21.25, "learning_rate": 1.9397345777029877e-05, "loss": 7.086931228637695, "step": 406 }, { "epoch": 0.2774369461486026, "grad_norm": 31.0, "learning_rate": 1.9393564562436235e-05, "loss": 7.520183563232422, "step": 407 }, { "epoch": 0.278118609406953, "grad_norm": 45.25, "learning_rate": 1.93897718936209e-05, "loss": 11.563923835754395, "step": 408 }, { "epoch": 0.27880027266530333, "grad_norm": 27.25, "learning_rate": 1.9385967775208538e-05, "loss": 4.420769691467285, "step": 409 }, { "epoch": 0.2794819359236537, "grad_norm": 22.75, "learning_rate": 1.938215221183777e-05, "loss": 3.9098446369171143, "step": 410 }, { "epoch": 0.28016359918200406, "grad_norm": 49.75, "learning_rate": 1.9378325208161178e-05, "loss": 12.570737838745117, "step": 411 }, { "epoch": 0.28084526244035446, "grad_norm": 27.375, "learning_rate": 1.937448676884529e-05, "loss": 9.46922492980957, "step": 412 }, { "epoch": 0.28152692569870486, "grad_norm": 17.875, "learning_rate": 1.9370636898570585e-05, "loss": 3.898407220840454, "step": 413 }, { "epoch": 0.2822085889570552, "grad_norm": 18.125, "learning_rate": 1.9366775602031466e-05, "loss": 8.05340576171875, "step": 414 }, { "epoch": 0.2828902522154056, "grad_norm": 20.875, "learning_rate": 1.936290288393629e-05, "loss": 3.873743772506714, "step": 415 }, { "epoch": 0.283571915473756, "grad_norm": 20.75, "learning_rate": 1.935901874900732e-05, "loss": 8.753393173217773, "step": 416 }, { "epoch": 0.2842535787321063, "grad_norm": 29.875, "learning_rate": 1.9355123201980756e-05, "loss": 5.692137241363525, "step": 417 }, { "epoch": 0.2849352419904567, "grad_norm": 16.625, "learning_rate": 1.93512162476067e-05, "loss": 6.259197235107422, "step": 418 }, { "epoch": 0.28561690524880706, "grad_norm": 31.375, "learning_rate": 1.934729789064918e-05, "loss": 3.5019619464874268, "step": 419 }, { "epoch": 0.28629856850715746, "grad_norm": 17.125, "learning_rate": 1.9343368135886112e-05, "loss": 4.883201599121094, "step": 420 }, { "epoch": 0.28698023176550785, "grad_norm": 26.125, "learning_rate": 1.9339426988109325e-05, "loss": 3.723773241043091, "step": 421 }, { "epoch": 0.2876618950238582, "grad_norm": 15.625, "learning_rate": 1.9335474452124524e-05, "loss": 4.35776948928833, "step": 422 }, { "epoch": 0.2883435582822086, "grad_norm": 23.5, "learning_rate": 1.9331510532751313e-05, "loss": 5.772010803222656, "step": 423 }, { "epoch": 0.289025221540559, "grad_norm": 22.625, "learning_rate": 1.9327535234823174e-05, "loss": 5.794553756713867, "step": 424 }, { "epoch": 0.2897068847989093, "grad_norm": 13.9375, "learning_rate": 1.932354856318746e-05, "loss": 4.1197428703308105, "step": 425 }, { "epoch": 0.2903885480572597, "grad_norm": 36.75, "learning_rate": 1.9319550522705394e-05, "loss": 7.345468521118164, "step": 426 }, { "epoch": 0.29107021131561006, "grad_norm": 15.125, "learning_rate": 1.9315541118252068e-05, "loss": 4.2566914558410645, "step": 427 }, { "epoch": 0.29175187457396046, "grad_norm": 23.75, "learning_rate": 1.9311520354716417e-05, "loss": 7.043590068817139, "step": 428 }, { "epoch": 0.29243353783231085, "grad_norm": 21.125, "learning_rate": 1.9307488237001247e-05, "loss": 5.469351291656494, "step": 429 }, { "epoch": 0.2931152010906612, "grad_norm": 14.9375, "learning_rate": 1.9303444770023184e-05, "loss": 4.852531909942627, "step": 430 }, { "epoch": 0.2937968643490116, "grad_norm": 25.375, "learning_rate": 1.9299389958712717e-05, "loss": 5.304583549499512, "step": 431 }, { "epoch": 0.294478527607362, "grad_norm": 17.5, "learning_rate": 1.9295323808014152e-05, "loss": 5.128393173217773, "step": 432 }, { "epoch": 0.2951601908657123, "grad_norm": 19.875, "learning_rate": 1.9291246322885627e-05, "loss": 6.013811111450195, "step": 433 }, { "epoch": 0.2958418541240627, "grad_norm": 20.875, "learning_rate": 1.9287157508299104e-05, "loss": 5.2212724685668945, "step": 434 }, { "epoch": 0.2965235173824131, "grad_norm": 20.125, "learning_rate": 1.9283057369240358e-05, "loss": 7.086505889892578, "step": 435 }, { "epoch": 0.29720518064076346, "grad_norm": 14.5625, "learning_rate": 1.9278945910708967e-05, "loss": 6.398612976074219, "step": 436 }, { "epoch": 0.29788684389911385, "grad_norm": 15.8125, "learning_rate": 1.927482313771832e-05, "loss": 4.507778644561768, "step": 437 }, { "epoch": 0.2985685071574642, "grad_norm": 17.125, "learning_rate": 1.9270689055295596e-05, "loss": 5.7465410232543945, "step": 438 }, { "epoch": 0.2992501704158146, "grad_norm": 20.375, "learning_rate": 1.926654366848177e-05, "loss": 6.041643142700195, "step": 439 }, { "epoch": 0.299931833674165, "grad_norm": 17.125, "learning_rate": 1.9262386982331596e-05, "loss": 4.610371112823486, "step": 440 }, { "epoch": 0.3006134969325153, "grad_norm": 12.8125, "learning_rate": 1.9258219001913607e-05, "loss": 3.903378486633301, "step": 441 }, { "epoch": 0.3012951601908657, "grad_norm": 19.625, "learning_rate": 1.9254039732310113e-05, "loss": 6.163326263427734, "step": 442 }, { "epoch": 0.3019768234492161, "grad_norm": 49.0, "learning_rate": 1.9249849178617182e-05, "loss": 10.710020065307617, "step": 443 }, { "epoch": 0.30265848670756645, "grad_norm": 20.75, "learning_rate": 1.9245647345944647e-05, "loss": 8.06418514251709, "step": 444 }, { "epoch": 0.30334014996591685, "grad_norm": 14.3125, "learning_rate": 1.9241434239416093e-05, "loss": 5.243063449859619, "step": 445 }, { "epoch": 0.3040218132242672, "grad_norm": 26.125, "learning_rate": 1.9237209864168855e-05, "loss": 8.081219673156738, "step": 446 }, { "epoch": 0.3047034764826176, "grad_norm": 42.5, "learning_rate": 1.9232974225354e-05, "loss": 6.125290870666504, "step": 447 }, { "epoch": 0.305385139740968, "grad_norm": 21.75, "learning_rate": 1.9228727328136337e-05, "loss": 4.336400032043457, "step": 448 }, { "epoch": 0.3060668029993183, "grad_norm": 16.75, "learning_rate": 1.92244691776944e-05, "loss": 5.98284387588501, "step": 449 }, { "epoch": 0.3067484662576687, "grad_norm": 22.375, "learning_rate": 1.922019977922045e-05, "loss": 3.993062734603882, "step": 450 }, { "epoch": 0.3074301295160191, "grad_norm": 16.0, "learning_rate": 1.9215919137920452e-05, "loss": 4.256191253662109, "step": 451 }, { "epoch": 0.30811179277436945, "grad_norm": 19.375, "learning_rate": 1.921162725901409e-05, "loss": 2.4848380088806152, "step": 452 }, { "epoch": 0.30879345603271985, "grad_norm": 27.125, "learning_rate": 1.920732414773475e-05, "loss": 9.637655258178711, "step": 453 }, { "epoch": 0.3094751192910702, "grad_norm": 25.875, "learning_rate": 1.9203009809329515e-05, "loss": 8.284427642822266, "step": 454 }, { "epoch": 0.3101567825494206, "grad_norm": 14.375, "learning_rate": 1.919868424905915e-05, "loss": 4.16232967376709, "step": 455 }, { "epoch": 0.310838445807771, "grad_norm": 12.5, "learning_rate": 1.9194347472198112e-05, "loss": 2.500793218612671, "step": 456 }, { "epoch": 0.3115201090661213, "grad_norm": 19.0, "learning_rate": 1.9189999484034533e-05, "loss": 4.537483215332031, "step": 457 }, { "epoch": 0.3122017723244717, "grad_norm": 20.375, "learning_rate": 1.9185640289870213e-05, "loss": 3.932478666305542, "step": 458 }, { "epoch": 0.3128834355828221, "grad_norm": 27.5, "learning_rate": 1.9181269895020624e-05, "loss": 6.70701789855957, "step": 459 }, { "epoch": 0.31356509884117245, "grad_norm": 12.625, "learning_rate": 1.9176888304814882e-05, "loss": 4.547783851623535, "step": 460 }, { "epoch": 0.31424676209952285, "grad_norm": 27.125, "learning_rate": 1.9172495524595764e-05, "loss": 8.351564407348633, "step": 461 }, { "epoch": 0.3149284253578732, "grad_norm": 29.25, "learning_rate": 1.9168091559719696e-05, "loss": 8.839858055114746, "step": 462 }, { "epoch": 0.3156100886162236, "grad_norm": 19.125, "learning_rate": 1.9163676415556734e-05, "loss": 3.882465362548828, "step": 463 }, { "epoch": 0.316291751874574, "grad_norm": 32.75, "learning_rate": 1.9159250097490563e-05, "loss": 9.649469375610352, "step": 464 }, { "epoch": 0.3169734151329243, "grad_norm": 21.625, "learning_rate": 1.9154812610918503e-05, "loss": 6.384946823120117, "step": 465 }, { "epoch": 0.3176550783912747, "grad_norm": 27.125, "learning_rate": 1.9150363961251485e-05, "loss": 3.5908772945404053, "step": 466 }, { "epoch": 0.3183367416496251, "grad_norm": 22.125, "learning_rate": 1.914590415391406e-05, "loss": 5.738977432250977, "step": 467 }, { "epoch": 0.31901840490797545, "grad_norm": 21.875, "learning_rate": 1.9141433194344374e-05, "loss": 7.829689979553223, "step": 468 }, { "epoch": 0.31970006816632585, "grad_norm": 26.125, "learning_rate": 1.9136951087994176e-05, "loss": 9.198844909667969, "step": 469 }, { "epoch": 0.3203817314246762, "grad_norm": 16.125, "learning_rate": 1.913245784032881e-05, "loss": 5.452543258666992, "step": 470 }, { "epoch": 0.3210633946830266, "grad_norm": 17.375, "learning_rate": 1.9127953456827205e-05, "loss": 4.050671100616455, "step": 471 }, { "epoch": 0.321745057941377, "grad_norm": 26.375, "learning_rate": 1.912343794298186e-05, "loss": 5.260730743408203, "step": 472 }, { "epoch": 0.3224267211997273, "grad_norm": 14.625, "learning_rate": 1.911891130429886e-05, "loss": 4.213287353515625, "step": 473 }, { "epoch": 0.3231083844580777, "grad_norm": 25.0, "learning_rate": 1.9114373546297844e-05, "loss": 5.373583793640137, "step": 474 }, { "epoch": 0.3237900477164281, "grad_norm": 56.0, "learning_rate": 1.9109824674512014e-05, "loss": 5.187254428863525, "step": 475 }, { "epoch": 0.32447171097477845, "grad_norm": 22.75, "learning_rate": 1.9105264694488124e-05, "loss": 6.163130283355713, "step": 476 }, { "epoch": 0.32515337423312884, "grad_norm": 22.875, "learning_rate": 1.9100693611786472e-05, "loss": 3.4174985885620117, "step": 477 }, { "epoch": 0.3258350374914792, "grad_norm": 12.875, "learning_rate": 1.9096111431980896e-05, "loss": 4.013187408447266, "step": 478 }, { "epoch": 0.3265167007498296, "grad_norm": 17.875, "learning_rate": 1.9091518160658763e-05, "loss": 6.716484069824219, "step": 479 }, { "epoch": 0.32719836400818, "grad_norm": 13.5, "learning_rate": 1.9086913803420966e-05, "loss": 5.263323783874512, "step": 480 }, { "epoch": 0.3278800272665303, "grad_norm": 31.125, "learning_rate": 1.9082298365881916e-05, "loss": 8.015993118286133, "step": 481 }, { "epoch": 0.3285616905248807, "grad_norm": 16.75, "learning_rate": 1.907767185366953e-05, "loss": 5.892767906188965, "step": 482 }, { "epoch": 0.3292433537832311, "grad_norm": 12.1875, "learning_rate": 1.9073034272425245e-05, "loss": 3.78084659576416, "step": 483 }, { "epoch": 0.32992501704158145, "grad_norm": 15.3125, "learning_rate": 1.9068385627803972e-05, "loss": 5.061692714691162, "step": 484 }, { "epoch": 0.33060668029993184, "grad_norm": 21.625, "learning_rate": 1.906372592547413e-05, "loss": 4.281647682189941, "step": 485 }, { "epoch": 0.3312883435582822, "grad_norm": 27.75, "learning_rate": 1.905905517111761e-05, "loss": 6.108635902404785, "step": 486 }, { "epoch": 0.3319700068166326, "grad_norm": 16.5, "learning_rate": 1.90543733704298e-05, "loss": 6.028197288513184, "step": 487 }, { "epoch": 0.332651670074983, "grad_norm": 18.0, "learning_rate": 1.9049680529119524e-05, "loss": 6.547325134277344, "step": 488 }, { "epoch": 0.3333333333333333, "grad_norm": 24.875, "learning_rate": 1.9044976652909102e-05, "loss": 8.781012535095215, "step": 489 }, { "epoch": 0.3340149965916837, "grad_norm": 22.25, "learning_rate": 1.9040261747534282e-05, "loss": 3.229706048965454, "step": 490 }, { "epoch": 0.3346966598500341, "grad_norm": 19.75, "learning_rate": 1.9035535818744286e-05, "loss": 5.425136566162109, "step": 491 }, { "epoch": 0.33537832310838445, "grad_norm": 20.125, "learning_rate": 1.9030798872301758e-05, "loss": 6.044832706451416, "step": 492 }, { "epoch": 0.33605998636673484, "grad_norm": 20.875, "learning_rate": 1.9026050913982788e-05, "loss": 3.32551908493042, "step": 493 }, { "epoch": 0.3367416496250852, "grad_norm": 22.75, "learning_rate": 1.9021291949576883e-05, "loss": 5.301616191864014, "step": 494 }, { "epoch": 0.3374233128834356, "grad_norm": 20.5, "learning_rate": 1.9016521984886984e-05, "loss": 5.331033229827881, "step": 495 }, { "epoch": 0.338104976141786, "grad_norm": 22.0, "learning_rate": 1.901174102572943e-05, "loss": 4.166750907897949, "step": 496 }, { "epoch": 0.3387866394001363, "grad_norm": 29.5, "learning_rate": 1.9006949077933984e-05, "loss": 5.450303554534912, "step": 497 }, { "epoch": 0.3394683026584867, "grad_norm": 19.5, "learning_rate": 1.900214614734379e-05, "loss": 5.960881233215332, "step": 498 }, { "epoch": 0.3401499659168371, "grad_norm": 29.0, "learning_rate": 1.8997332239815403e-05, "loss": 6.340490341186523, "step": 499 }, { "epoch": 0.34083162917518744, "grad_norm": 22.125, "learning_rate": 1.8992507361218743e-05, "loss": 6.3714375495910645, "step": 500 }, { "epoch": 0.34151329243353784, "grad_norm": 37.75, "learning_rate": 1.8987671517437122e-05, "loss": 7.923834800720215, "step": 501 }, { "epoch": 0.3421949556918882, "grad_norm": 18.875, "learning_rate": 1.8982824714367214e-05, "loss": 5.059648513793945, "step": 502 }, { "epoch": 0.3428766189502386, "grad_norm": 27.75, "learning_rate": 1.8977966957919068e-05, "loss": 2.817004680633545, "step": 503 }, { "epoch": 0.34355828220858897, "grad_norm": 15.375, "learning_rate": 1.8973098254016074e-05, "loss": 6.107322692871094, "step": 504 }, { "epoch": 0.3442399454669393, "grad_norm": 14.6875, "learning_rate": 1.8968218608594987e-05, "loss": 5.037627696990967, "step": 505 }, { "epoch": 0.3449216087252897, "grad_norm": 30.375, "learning_rate": 1.8963328027605886e-05, "loss": 3.2527809143066406, "step": 506 }, { "epoch": 0.3456032719836401, "grad_norm": 23.0, "learning_rate": 1.8958426517012203e-05, "loss": 3.3513503074645996, "step": 507 }, { "epoch": 0.34628493524199044, "grad_norm": 19.5, "learning_rate": 1.8953514082790683e-05, "loss": 6.678170680999756, "step": 508 }, { "epoch": 0.34696659850034084, "grad_norm": 17.0, "learning_rate": 1.8948590730931394e-05, "loss": 4.593085289001465, "step": 509 }, { "epoch": 0.3476482617586912, "grad_norm": 13.4375, "learning_rate": 1.8943656467437726e-05, "loss": 3.942119598388672, "step": 510 }, { "epoch": 0.3483299250170416, "grad_norm": 41.0, "learning_rate": 1.893871129832636e-05, "loss": 9.717809677124023, "step": 511 }, { "epoch": 0.34901158827539197, "grad_norm": 15.375, "learning_rate": 1.893375522962729e-05, "loss": 3.350231647491455, "step": 512 }, { "epoch": 0.3496932515337423, "grad_norm": 19.75, "learning_rate": 1.8928788267383783e-05, "loss": 5.786828994750977, "step": 513 }, { "epoch": 0.3503749147920927, "grad_norm": 17.625, "learning_rate": 1.8923810417652404e-05, "loss": 6.077963352203369, "step": 514 }, { "epoch": 0.3510565780504431, "grad_norm": 41.75, "learning_rate": 1.8918821686502992e-05, "loss": 4.499693870544434, "step": 515 }, { "epoch": 0.35173824130879344, "grad_norm": 21.25, "learning_rate": 1.8913822080018645e-05, "loss": 6.8729987144470215, "step": 516 }, { "epoch": 0.35241990456714384, "grad_norm": 45.0, "learning_rate": 1.8908811604295728e-05, "loss": 3.93133544921875, "step": 517 }, { "epoch": 0.35310156782549423, "grad_norm": 13.9375, "learning_rate": 1.8903790265443865e-05, "loss": 3.7073123455047607, "step": 518 }, { "epoch": 0.3537832310838446, "grad_norm": 29.375, "learning_rate": 1.8898758069585923e-05, "loss": 5.071776390075684, "step": 519 }, { "epoch": 0.35446489434219497, "grad_norm": 18.125, "learning_rate": 1.8893715022858e-05, "loss": 7.739048957824707, "step": 520 }, { "epoch": 0.3551465576005453, "grad_norm": 19.875, "learning_rate": 1.888866113140943e-05, "loss": 8.77826976776123, "step": 521 }, { "epoch": 0.3558282208588957, "grad_norm": 26.125, "learning_rate": 1.8883596401402777e-05, "loss": 8.45267105102539, "step": 522 }, { "epoch": 0.3565098841172461, "grad_norm": 13.125, "learning_rate": 1.8878520839013812e-05, "loss": 4.881437301635742, "step": 523 }, { "epoch": 0.35719154737559644, "grad_norm": 31.25, "learning_rate": 1.8873434450431522e-05, "loss": 5.730535507202148, "step": 524 }, { "epoch": 0.35787321063394684, "grad_norm": 29.25, "learning_rate": 1.886833724185809e-05, "loss": 3.56628680229187, "step": 525 }, { "epoch": 0.35855487389229723, "grad_norm": 41.75, "learning_rate": 1.8863229219508892e-05, "loss": 10.166557312011719, "step": 526 }, { "epoch": 0.3592365371506476, "grad_norm": 20.0, "learning_rate": 1.8858110389612495e-05, "loss": 8.018098831176758, "step": 527 }, { "epoch": 0.35991820040899797, "grad_norm": 24.875, "learning_rate": 1.885298075841064e-05, "loss": 5.056179046630859, "step": 528 }, { "epoch": 0.3605998636673483, "grad_norm": 20.25, "learning_rate": 1.8847840332158243e-05, "loss": 7.5738019943237305, "step": 529 }, { "epoch": 0.3612815269256987, "grad_norm": 36.0, "learning_rate": 1.8842689117123377e-05, "loss": 3.63010311126709, "step": 530 }, { "epoch": 0.3619631901840491, "grad_norm": 26.0, "learning_rate": 1.8837527119587277e-05, "loss": 8.94369888305664, "step": 531 }, { "epoch": 0.36264485344239944, "grad_norm": 16.625, "learning_rate": 1.883235434584432e-05, "loss": 4.412526607513428, "step": 532 }, { "epoch": 0.36332651670074984, "grad_norm": 16.375, "learning_rate": 1.8827170802202027e-05, "loss": 5.926525115966797, "step": 533 }, { "epoch": 0.36400817995910023, "grad_norm": 31.0, "learning_rate": 1.8821976494981055e-05, "loss": 7.854779243469238, "step": 534 }, { "epoch": 0.36468984321745057, "grad_norm": 22.375, "learning_rate": 1.8816771430515178e-05, "loss": 4.220308303833008, "step": 535 }, { "epoch": 0.36537150647580097, "grad_norm": 22.25, "learning_rate": 1.8811555615151286e-05, "loss": 6.673711776733398, "step": 536 }, { "epoch": 0.3660531697341513, "grad_norm": 28.625, "learning_rate": 1.880632905524939e-05, "loss": 3.9553794860839844, "step": 537 }, { "epoch": 0.3667348329925017, "grad_norm": 15.25, "learning_rate": 1.8801091757182593e-05, "loss": 4.298336029052734, "step": 538 }, { "epoch": 0.3674164962508521, "grad_norm": 27.0, "learning_rate": 1.879584372733709e-05, "loss": 7.987879753112793, "step": 539 }, { "epoch": 0.36809815950920244, "grad_norm": 16.625, "learning_rate": 1.8790584972112174e-05, "loss": 4.97304630279541, "step": 540 }, { "epoch": 0.36877982276755283, "grad_norm": 20.375, "learning_rate": 1.87853154979202e-05, "loss": 5.153449535369873, "step": 541 }, { "epoch": 0.36946148602590323, "grad_norm": 10.9375, "learning_rate": 1.8780035311186605e-05, "loss": 4.027165412902832, "step": 542 }, { "epoch": 0.37014314928425357, "grad_norm": 48.0, "learning_rate": 1.8774744418349886e-05, "loss": 9.187662124633789, "step": 543 }, { "epoch": 0.37082481254260397, "grad_norm": 23.25, "learning_rate": 1.8769442825861594e-05, "loss": 5.085555553436279, "step": 544 }, { "epoch": 0.3715064758009543, "grad_norm": 23.0, "learning_rate": 1.876413054018633e-05, "loss": 6.713476181030273, "step": 545 }, { "epoch": 0.3721881390593047, "grad_norm": 15.875, "learning_rate": 1.875880756780172e-05, "loss": 5.342873573303223, "step": 546 }, { "epoch": 0.3728698023176551, "grad_norm": 18.375, "learning_rate": 1.8753473915198437e-05, "loss": 6.536588668823242, "step": 547 }, { "epoch": 0.37355146557600544, "grad_norm": 22.375, "learning_rate": 1.874812958888018e-05, "loss": 4.117295742034912, "step": 548 }, { "epoch": 0.37423312883435583, "grad_norm": 19.0, "learning_rate": 1.874277459536364e-05, "loss": 4.402835369110107, "step": 549 }, { "epoch": 0.37491479209270623, "grad_norm": 37.75, "learning_rate": 1.873740894117854e-05, "loss": 9.3967924118042, "step": 550 }, { "epoch": 0.37559645535105657, "grad_norm": 16.25, "learning_rate": 1.8732032632867592e-05, "loss": 8.169157028198242, "step": 551 }, { "epoch": 0.37627811860940696, "grad_norm": 17.5, "learning_rate": 1.8726645676986503e-05, "loss": 5.718846321105957, "step": 552 }, { "epoch": 0.3769597818677573, "grad_norm": 14.25, "learning_rate": 1.872124808010395e-05, "loss": 5.480005741119385, "step": 553 }, { "epoch": 0.3776414451261077, "grad_norm": 25.625, "learning_rate": 1.8715839848801604e-05, "loss": 4.090268611907959, "step": 554 }, { "epoch": 0.3783231083844581, "grad_norm": 15.3125, "learning_rate": 1.8710420989674093e-05, "loss": 5.952251434326172, "step": 555 }, { "epoch": 0.37900477164280844, "grad_norm": 36.25, "learning_rate": 1.8704991509329002e-05, "loss": 7.658427715301514, "step": 556 }, { "epoch": 0.37968643490115883, "grad_norm": 27.875, "learning_rate": 1.8699551414386877e-05, "loss": 7.369929790496826, "step": 557 }, { "epoch": 0.3803680981595092, "grad_norm": 17.0, "learning_rate": 1.8694100711481195e-05, "loss": 2.7062854766845703, "step": 558 }, { "epoch": 0.38104976141785957, "grad_norm": 26.25, "learning_rate": 1.868863940725838e-05, "loss": 5.130803108215332, "step": 559 }, { "epoch": 0.38173142467620996, "grad_norm": 22.75, "learning_rate": 1.8683167508377775e-05, "loss": 3.8656163215637207, "step": 560 }, { "epoch": 0.3824130879345603, "grad_norm": 24.375, "learning_rate": 1.8677685021511643e-05, "loss": 5.59077262878418, "step": 561 }, { "epoch": 0.3830947511929107, "grad_norm": 21.5, "learning_rate": 1.8672191953345156e-05, "loss": 6.256784915924072, "step": 562 }, { "epoch": 0.3837764144512611, "grad_norm": 23.875, "learning_rate": 1.86666883105764e-05, "loss": 6.591250419616699, "step": 563 }, { "epoch": 0.38445807770961143, "grad_norm": 34.0, "learning_rate": 1.866117409991634e-05, "loss": 8.354074478149414, "step": 564 }, { "epoch": 0.38513974096796183, "grad_norm": 43.25, "learning_rate": 1.8655649328088836e-05, "loss": 5.082333564758301, "step": 565 }, { "epoch": 0.3858214042263122, "grad_norm": 14.4375, "learning_rate": 1.865011400183062e-05, "loss": 3.7426371574401855, "step": 566 }, { "epoch": 0.38650306748466257, "grad_norm": 44.5, "learning_rate": 1.8644568127891303e-05, "loss": 8.140022277832031, "step": 567 }, { "epoch": 0.38718473074301296, "grad_norm": 22.375, "learning_rate": 1.8639011713033347e-05, "loss": 2.890700578689575, "step": 568 }, { "epoch": 0.3878663940013633, "grad_norm": 16.625, "learning_rate": 1.8633444764032074e-05, "loss": 5.505722999572754, "step": 569 }, { "epoch": 0.3885480572597137, "grad_norm": 20.125, "learning_rate": 1.862786728767565e-05, "loss": 5.544048309326172, "step": 570 }, { "epoch": 0.3892297205180641, "grad_norm": 17.875, "learning_rate": 1.8622279290765078e-05, "loss": 2.674530267715454, "step": 571 }, { "epoch": 0.38991138377641443, "grad_norm": 26.5, "learning_rate": 1.8616680780114183e-05, "loss": 5.432843208312988, "step": 572 }, { "epoch": 0.39059304703476483, "grad_norm": 13.3125, "learning_rate": 1.8611071762549623e-05, "loss": 4.6200642585754395, "step": 573 }, { "epoch": 0.3912747102931152, "grad_norm": 14.5625, "learning_rate": 1.860545224491085e-05, "loss": 4.122474670410156, "step": 574 }, { "epoch": 0.39195637355146556, "grad_norm": 14.4375, "learning_rate": 1.8599822234050143e-05, "loss": 4.684253692626953, "step": 575 }, { "epoch": 0.39263803680981596, "grad_norm": 13.5625, "learning_rate": 1.859418173683255e-05, "loss": 4.763998031616211, "step": 576 }, { "epoch": 0.3933197000681663, "grad_norm": 14.0, "learning_rate": 1.858853076013593e-05, "loss": 5.254890441894531, "step": 577 }, { "epoch": 0.3940013633265167, "grad_norm": 37.5, "learning_rate": 1.8582869310850903e-05, "loss": 3.421285629272461, "step": 578 }, { "epoch": 0.3946830265848671, "grad_norm": 19.75, "learning_rate": 1.8577197395880866e-05, "loss": 3.683777093887329, "step": 579 }, { "epoch": 0.39536468984321743, "grad_norm": 14.75, "learning_rate": 1.8571515022141974e-05, "loss": 4.105260372161865, "step": 580 }, { "epoch": 0.3960463531015678, "grad_norm": 13.3125, "learning_rate": 1.856582219656314e-05, "loss": 4.068833351135254, "step": 581 }, { "epoch": 0.3967280163599182, "grad_norm": 15.0625, "learning_rate": 1.856011892608602e-05, "loss": 3.3460545539855957, "step": 582 }, { "epoch": 0.39740967961826856, "grad_norm": 25.625, "learning_rate": 1.8554405217665004e-05, "loss": 4.70405387878418, "step": 583 }, { "epoch": 0.39809134287661896, "grad_norm": 30.625, "learning_rate": 1.854868107826721e-05, "loss": 6.5129218101501465, "step": 584 }, { "epoch": 0.3987730061349693, "grad_norm": 54.75, "learning_rate": 1.8542946514872478e-05, "loss": 9.24893856048584, "step": 585 }, { "epoch": 0.3994546693933197, "grad_norm": 47.5, "learning_rate": 1.8537201534473353e-05, "loss": 8.785318374633789, "step": 586 }, { "epoch": 0.4001363326516701, "grad_norm": 17.25, "learning_rate": 1.8531446144075093e-05, "loss": 3.9578466415405273, "step": 587 }, { "epoch": 0.40081799591002043, "grad_norm": 27.375, "learning_rate": 1.852568035069564e-05, "loss": 6.765663146972656, "step": 588 }, { "epoch": 0.4014996591683708, "grad_norm": 17.125, "learning_rate": 1.8519904161365624e-05, "loss": 6.958662986755371, "step": 589 }, { "epoch": 0.4021813224267212, "grad_norm": 29.5, "learning_rate": 1.851411758312835e-05, "loss": 6.789897441864014, "step": 590 }, { "epoch": 0.40286298568507156, "grad_norm": 15.625, "learning_rate": 1.8508320623039792e-05, "loss": 4.470952987670898, "step": 591 }, { "epoch": 0.40354464894342196, "grad_norm": 15.75, "learning_rate": 1.8502513288168584e-05, "loss": 6.339482307434082, "step": 592 }, { "epoch": 0.4042263122017723, "grad_norm": 17.0, "learning_rate": 1.8496695585596013e-05, "loss": 5.749475479125977, "step": 593 }, { "epoch": 0.4049079754601227, "grad_norm": 15.5, "learning_rate": 1.8490867522416e-05, "loss": 7.821369171142578, "step": 594 }, { "epoch": 0.4055896387184731, "grad_norm": 25.125, "learning_rate": 1.8485029105735112e-05, "loss": 5.460648536682129, "step": 595 }, { "epoch": 0.40627130197682343, "grad_norm": 25.25, "learning_rate": 1.8479180342672525e-05, "loss": 4.564432144165039, "step": 596 }, { "epoch": 0.4069529652351738, "grad_norm": 37.25, "learning_rate": 1.8473321240360048e-05, "loss": 6.620779037475586, "step": 597 }, { "epoch": 0.4076346284935242, "grad_norm": 28.375, "learning_rate": 1.846745180594208e-05, "loss": 7.144023895263672, "step": 598 }, { "epoch": 0.40831629175187456, "grad_norm": 17.75, "learning_rate": 1.8461572046575638e-05, "loss": 5.085351943969727, "step": 599 }, { "epoch": 0.40899795501022496, "grad_norm": 48.0, "learning_rate": 1.8455681969430307e-05, "loss": 9.1131591796875, "step": 600 }, { "epoch": 0.4096796182685753, "grad_norm": 48.25, "learning_rate": 1.8449781581688274e-05, "loss": 12.038650512695312, "step": 601 }, { "epoch": 0.4103612815269257, "grad_norm": 14.6875, "learning_rate": 1.8443870890544287e-05, "loss": 3.9854350090026855, "step": 602 }, { "epoch": 0.4110429447852761, "grad_norm": 15.125, "learning_rate": 1.8437949903205657e-05, "loss": 3.0895256996154785, "step": 603 }, { "epoch": 0.4117246080436264, "grad_norm": 20.0, "learning_rate": 1.843201862689225e-05, "loss": 2.5332789421081543, "step": 604 }, { "epoch": 0.4124062713019768, "grad_norm": 23.5, "learning_rate": 1.8426077068836487e-05, "loss": 7.050914764404297, "step": 605 }, { "epoch": 0.4130879345603272, "grad_norm": 18.0, "learning_rate": 1.842012523628332e-05, "loss": 4.123310089111328, "step": 606 }, { "epoch": 0.41376959781867756, "grad_norm": 17.375, "learning_rate": 1.8414163136490224e-05, "loss": 5.691148281097412, "step": 607 }, { "epoch": 0.41445126107702795, "grad_norm": 51.5, "learning_rate": 1.84081907767272e-05, "loss": 11.009285926818848, "step": 608 }, { "epoch": 0.41513292433537835, "grad_norm": 17.125, "learning_rate": 1.840220816427676e-05, "loss": 2.6631100177764893, "step": 609 }, { "epoch": 0.4158145875937287, "grad_norm": 15.625, "learning_rate": 1.839621530643392e-05, "loss": 4.5691938400268555, "step": 610 }, { "epoch": 0.4164962508520791, "grad_norm": 32.25, "learning_rate": 1.839021221050618e-05, "loss": 6.180819034576416, "step": 611 }, { "epoch": 0.4171779141104294, "grad_norm": 17.375, "learning_rate": 1.838419888381353e-05, "loss": 4.582550048828125, "step": 612 }, { "epoch": 0.4178595773687798, "grad_norm": 14.5, "learning_rate": 1.8378175333688438e-05, "loss": 2.185682773590088, "step": 613 }, { "epoch": 0.4185412406271302, "grad_norm": 44.5, "learning_rate": 1.837214156747583e-05, "loss": 6.069904804229736, "step": 614 }, { "epoch": 0.41922290388548056, "grad_norm": 21.5, "learning_rate": 1.8366097592533095e-05, "loss": 2.7126808166503906, "step": 615 }, { "epoch": 0.41990456714383095, "grad_norm": 29.0, "learning_rate": 1.8360043416230067e-05, "loss": 7.966791152954102, "step": 616 }, { "epoch": 0.42058623040218135, "grad_norm": 12.9375, "learning_rate": 1.8353979045949023e-05, "loss": 6.072749137878418, "step": 617 }, { "epoch": 0.4212678936605317, "grad_norm": 26.75, "learning_rate": 1.834790448908467e-05, "loss": 3.055478096008301, "step": 618 }, { "epoch": 0.4219495569188821, "grad_norm": 15.5625, "learning_rate": 1.8341819753044135e-05, "loss": 4.82210636138916, "step": 619 }, { "epoch": 0.4226312201772324, "grad_norm": 19.375, "learning_rate": 1.8335724845246948e-05, "loss": 2.661076784133911, "step": 620 }, { "epoch": 0.4233128834355828, "grad_norm": 23.5, "learning_rate": 1.8329619773125064e-05, "loss": 5.013340473175049, "step": 621 }, { "epoch": 0.4239945466939332, "grad_norm": 27.25, "learning_rate": 1.832350454412281e-05, "loss": 7.342447757720947, "step": 622 }, { "epoch": 0.42467620995228356, "grad_norm": 16.75, "learning_rate": 1.8317379165696908e-05, "loss": 5.252130508422852, "step": 623 }, { "epoch": 0.42535787321063395, "grad_norm": 31.875, "learning_rate": 1.8311243645316458e-05, "loss": 3.1734910011291504, "step": 624 }, { "epoch": 0.42603953646898435, "grad_norm": 12.125, "learning_rate": 1.830509799046292e-05, "loss": 3.2355284690856934, "step": 625 }, { "epoch": 0.4267211997273347, "grad_norm": 24.0, "learning_rate": 1.829894220863012e-05, "loss": 3.564464807510376, "step": 626 }, { "epoch": 0.4274028629856851, "grad_norm": 25.375, "learning_rate": 1.8292776307324217e-05, "loss": 5.972105026245117, "step": 627 }, { "epoch": 0.4280845262440354, "grad_norm": 17.5, "learning_rate": 1.8286600294063732e-05, "loss": 3.12518048286438, "step": 628 }, { "epoch": 0.4287661895023858, "grad_norm": 44.0, "learning_rate": 1.82804141763795e-05, "loss": 8.79373550415039, "step": 629 }, { "epoch": 0.4294478527607362, "grad_norm": 39.5, "learning_rate": 1.8274217961814682e-05, "loss": 7.679512023925781, "step": 630 }, { "epoch": 0.43012951601908656, "grad_norm": 19.0, "learning_rate": 1.8268011657924746e-05, "loss": 4.789336204528809, "step": 631 }, { "epoch": 0.43081117927743695, "grad_norm": 22.5, "learning_rate": 1.8261795272277472e-05, "loss": 4.606503486633301, "step": 632 }, { "epoch": 0.43149284253578735, "grad_norm": 21.75, "learning_rate": 1.8255568812452923e-05, "loss": 5.3832106590271, "step": 633 }, { "epoch": 0.4321745057941377, "grad_norm": 24.5, "learning_rate": 1.8249332286043456e-05, "loss": 6.201325416564941, "step": 634 }, { "epoch": 0.4328561690524881, "grad_norm": 15.8125, "learning_rate": 1.8243085700653698e-05, "loss": 6.22796630859375, "step": 635 }, { "epoch": 0.4335378323108384, "grad_norm": 15.25, "learning_rate": 1.8236829063900535e-05, "loss": 5.875482559204102, "step": 636 }, { "epoch": 0.4342194955691888, "grad_norm": 23.25, "learning_rate": 1.8230562383413127e-05, "loss": 6.583766460418701, "step": 637 }, { "epoch": 0.4349011588275392, "grad_norm": 28.625, "learning_rate": 1.822428566683286e-05, "loss": 7.891244888305664, "step": 638 }, { "epoch": 0.43558282208588955, "grad_norm": 22.75, "learning_rate": 1.8217998921813375e-05, "loss": 3.5405707359313965, "step": 639 }, { "epoch": 0.43626448534423995, "grad_norm": 15.875, "learning_rate": 1.821170215602053e-05, "loss": 7.14054012298584, "step": 640 }, { "epoch": 0.43694614860259035, "grad_norm": 52.0, "learning_rate": 1.8205395377132407e-05, "loss": 4.029902458190918, "step": 641 }, { "epoch": 0.4376278118609407, "grad_norm": 16.875, "learning_rate": 1.81990785928393e-05, "loss": 5.0225911140441895, "step": 642 }, { "epoch": 0.4383094751192911, "grad_norm": 19.375, "learning_rate": 1.8192751810843697e-05, "loss": 4.948035717010498, "step": 643 }, { "epoch": 0.4389911383776414, "grad_norm": 18.125, "learning_rate": 1.8186415038860276e-05, "loss": 2.465547561645508, "step": 644 }, { "epoch": 0.4396728016359918, "grad_norm": 23.25, "learning_rate": 1.818006828461591e-05, "loss": 5.726883411407471, "step": 645 }, { "epoch": 0.4403544648943422, "grad_norm": 49.5, "learning_rate": 1.8173711555849626e-05, "loss": 5.978935241699219, "step": 646 }, { "epoch": 0.44103612815269255, "grad_norm": 28.25, "learning_rate": 1.8167344860312627e-05, "loss": 5.928651809692383, "step": 647 }, { "epoch": 0.44171779141104295, "grad_norm": 13.375, "learning_rate": 1.8160968205768264e-05, "loss": 3.609272003173828, "step": 648 }, { "epoch": 0.44239945466939334, "grad_norm": 15.625, "learning_rate": 1.815458159999203e-05, "loss": 4.6759233474731445, "step": 649 }, { "epoch": 0.4430811179277437, "grad_norm": 35.25, "learning_rate": 1.8148185050771554e-05, "loss": 5.418087959289551, "step": 650 }, { "epoch": 0.4437627811860941, "grad_norm": 50.75, "learning_rate": 1.8141778565906594e-05, "loss": 7.174725532531738, "step": 651 }, { "epoch": 0.4444444444444444, "grad_norm": 14.4375, "learning_rate": 1.8135362153209014e-05, "loss": 4.941481113433838, "step": 652 }, { "epoch": 0.4451261077027948, "grad_norm": 38.5, "learning_rate": 1.8128935820502792e-05, "loss": 5.892004013061523, "step": 653 }, { "epoch": 0.4458077709611452, "grad_norm": 46.0, "learning_rate": 1.8122499575624e-05, "loss": 10.376693725585938, "step": 654 }, { "epoch": 0.44648943421949555, "grad_norm": 18.375, "learning_rate": 1.8116053426420793e-05, "loss": 4.42366886138916, "step": 655 }, { "epoch": 0.44717109747784595, "grad_norm": 21.0, "learning_rate": 1.8109597380753404e-05, "loss": 4.374699592590332, "step": 656 }, { "epoch": 0.44785276073619634, "grad_norm": 14.6875, "learning_rate": 1.8103131446494144e-05, "loss": 5.070104598999023, "step": 657 }, { "epoch": 0.4485344239945467, "grad_norm": 16.375, "learning_rate": 1.8096655631527365e-05, "loss": 5.8413896560668945, "step": 658 }, { "epoch": 0.4492160872528971, "grad_norm": 36.25, "learning_rate": 1.8090169943749477e-05, "loss": 4.752076148986816, "step": 659 }, { "epoch": 0.4498977505112474, "grad_norm": 15.0, "learning_rate": 1.8083674391068925e-05, "loss": 4.847862720489502, "step": 660 }, { "epoch": 0.4505794137695978, "grad_norm": 18.75, "learning_rate": 1.807716898140619e-05, "loss": 3.9472098350524902, "step": 661 }, { "epoch": 0.4512610770279482, "grad_norm": 20.125, "learning_rate": 1.807065372269376e-05, "loss": 3.202831506729126, "step": 662 }, { "epoch": 0.45194274028629855, "grad_norm": 17.25, "learning_rate": 1.8064128622876146e-05, "loss": 3.687476634979248, "step": 663 }, { "epoch": 0.45262440354464895, "grad_norm": 12.25, "learning_rate": 1.805759368990985e-05, "loss": 2.744915723800659, "step": 664 }, { "epoch": 0.45330606680299934, "grad_norm": 14.4375, "learning_rate": 1.8051048931763366e-05, "loss": 4.3892316818237305, "step": 665 }, { "epoch": 0.4539877300613497, "grad_norm": 43.5, "learning_rate": 1.804449435641717e-05, "loss": 7.007484436035156, "step": 666 }, { "epoch": 0.4546693933197001, "grad_norm": 28.125, "learning_rate": 1.803792997186371e-05, "loss": 6.370609283447266, "step": 667 }, { "epoch": 0.4553510565780504, "grad_norm": 16.5, "learning_rate": 1.803135578610739e-05, "loss": 5.463666915893555, "step": 668 }, { "epoch": 0.4560327198364008, "grad_norm": 19.625, "learning_rate": 1.802477180716457e-05, "loss": 4.562535285949707, "step": 669 }, { "epoch": 0.4567143830947512, "grad_norm": 23.25, "learning_rate": 1.8018178043063554e-05, "loss": 5.7054362297058105, "step": 670 }, { "epoch": 0.45739604635310155, "grad_norm": 17.75, "learning_rate": 1.801157450184457e-05, "loss": 5.214230537414551, "step": 671 }, { "epoch": 0.45807770961145194, "grad_norm": 21.5, "learning_rate": 1.8004961191559765e-05, "loss": 5.113531112670898, "step": 672 }, { "epoch": 0.45875937286980234, "grad_norm": 20.0, "learning_rate": 1.7998338120273218e-05, "loss": 2.4789419174194336, "step": 673 }, { "epoch": 0.4594410361281527, "grad_norm": 15.5625, "learning_rate": 1.7991705296060888e-05, "loss": 5.595855712890625, "step": 674 }, { "epoch": 0.4601226993865031, "grad_norm": 30.875, "learning_rate": 1.798506272701064e-05, "loss": 8.296469688415527, "step": 675 }, { "epoch": 0.4608043626448534, "grad_norm": 18.5, "learning_rate": 1.797841042122221e-05, "loss": 6.031268119812012, "step": 676 }, { "epoch": 0.4614860259032038, "grad_norm": 15.875, "learning_rate": 1.797174838680722e-05, "loss": 6.904556751251221, "step": 677 }, { "epoch": 0.4621676891615542, "grad_norm": 10.75, "learning_rate": 1.7965076631889146e-05, "loss": 4.079852104187012, "step": 678 }, { "epoch": 0.46284935241990455, "grad_norm": 17.75, "learning_rate": 1.7958395164603323e-05, "loss": 4.1085004806518555, "step": 679 }, { "epoch": 0.46353101567825494, "grad_norm": 15.625, "learning_rate": 1.795170399309692e-05, "loss": 5.562870025634766, "step": 680 }, { "epoch": 0.46421267893660534, "grad_norm": 16.0, "learning_rate": 1.794500312552895e-05, "loss": 5.687665939331055, "step": 681 }, { "epoch": 0.4648943421949557, "grad_norm": 20.875, "learning_rate": 1.7938292570070238e-05, "loss": 5.714770793914795, "step": 682 }, { "epoch": 0.4655760054533061, "grad_norm": 20.375, "learning_rate": 1.7931572334903427e-05, "loss": 2.397054672241211, "step": 683 }, { "epoch": 0.4662576687116564, "grad_norm": 18.5, "learning_rate": 1.792484242822297e-05, "loss": 7.0671000480651855, "step": 684 }, { "epoch": 0.4669393319700068, "grad_norm": 19.0, "learning_rate": 1.7918102858235103e-05, "loss": 2.467313528060913, "step": 685 }, { "epoch": 0.4676209952283572, "grad_norm": 33.75, "learning_rate": 1.7911353633157844e-05, "loss": 3.12894606590271, "step": 686 }, { "epoch": 0.46830265848670755, "grad_norm": 13.1875, "learning_rate": 1.7904594761221e-05, "loss": 3.7749271392822266, "step": 687 }, { "epoch": 0.46898432174505794, "grad_norm": 16.5, "learning_rate": 1.789782625066612e-05, "loss": 5.2524285316467285, "step": 688 }, { "epoch": 0.46966598500340834, "grad_norm": 23.25, "learning_rate": 1.7891048109746522e-05, "loss": 5.896417617797852, "step": 689 }, { "epoch": 0.4703476482617587, "grad_norm": 33.25, "learning_rate": 1.7884260346727257e-05, "loss": 7.6412739753723145, "step": 690 }, { "epoch": 0.4710293115201091, "grad_norm": 33.75, "learning_rate": 1.7877462969885114e-05, "loss": 8.404247283935547, "step": 691 }, { "epoch": 0.47171097477845947, "grad_norm": 10.0, "learning_rate": 1.7870655987508613e-05, "loss": 2.7146921157836914, "step": 692 }, { "epoch": 0.4723926380368098, "grad_norm": 20.75, "learning_rate": 1.7863839407897962e-05, "loss": 6.647920608520508, "step": 693 }, { "epoch": 0.4730743012951602, "grad_norm": 24.375, "learning_rate": 1.7857013239365098e-05, "loss": 4.06779146194458, "step": 694 }, { "epoch": 0.47375596455351054, "grad_norm": 16.875, "learning_rate": 1.7850177490233635e-05, "loss": 4.289577484130859, "step": 695 }, { "epoch": 0.47443762781186094, "grad_norm": 34.5, "learning_rate": 1.784333216883887e-05, "loss": 8.982884407043457, "step": 696 }, { "epoch": 0.47511929107021134, "grad_norm": 14.375, "learning_rate": 1.7836477283527787e-05, "loss": 3.7980613708496094, "step": 697 }, { "epoch": 0.4758009543285617, "grad_norm": 26.0, "learning_rate": 1.782961284265901e-05, "loss": 4.184602737426758, "step": 698 }, { "epoch": 0.47648261758691207, "grad_norm": 16.0, "learning_rate": 1.7822738854602835e-05, "loss": 3.991168260574341, "step": 699 }, { "epoch": 0.47716428084526247, "grad_norm": 17.125, "learning_rate": 1.7815855327741185e-05, "loss": 6.1737542152404785, "step": 700 }, { "epoch": 0.4778459441036128, "grad_norm": 20.75, "learning_rate": 1.780896227046762e-05, "loss": 6.414396286010742, "step": 701 }, { "epoch": 0.4785276073619632, "grad_norm": 19.875, "learning_rate": 1.7802059691187316e-05, "loss": 6.349745273590088, "step": 702 }, { "epoch": 0.47920927062031354, "grad_norm": 18.0, "learning_rate": 1.7795147598317067e-05, "loss": 6.329185962677002, "step": 703 }, { "epoch": 0.47989093387866394, "grad_norm": 17.125, "learning_rate": 1.7788226000285272e-05, "loss": 6.260217666625977, "step": 704 }, { "epoch": 0.48057259713701433, "grad_norm": 45.75, "learning_rate": 1.7781294905531908e-05, "loss": 7.356969833374023, "step": 705 }, { "epoch": 0.4812542603953647, "grad_norm": 23.75, "learning_rate": 1.7774354322508535e-05, "loss": 5.053040504455566, "step": 706 }, { "epoch": 0.48193592365371507, "grad_norm": 29.125, "learning_rate": 1.776740425967829e-05, "loss": 5.1830902099609375, "step": 707 }, { "epoch": 0.48261758691206547, "grad_norm": 20.875, "learning_rate": 1.7760444725515856e-05, "loss": 4.527225494384766, "step": 708 }, { "epoch": 0.4832992501704158, "grad_norm": 17.75, "learning_rate": 1.775347572850748e-05, "loss": 2.998427391052246, "step": 709 }, { "epoch": 0.4839809134287662, "grad_norm": 26.25, "learning_rate": 1.774649727715094e-05, "loss": 3.73393177986145, "step": 710 }, { "epoch": 0.48466257668711654, "grad_norm": 20.875, "learning_rate": 1.7739509379955548e-05, "loss": 6.284903526306152, "step": 711 }, { "epoch": 0.48534423994546694, "grad_norm": 67.5, "learning_rate": 1.7732512045442125e-05, "loss": 8.570135116577148, "step": 712 }, { "epoch": 0.48602590320381733, "grad_norm": 45.5, "learning_rate": 1.7725505282142997e-05, "loss": 8.17967414855957, "step": 713 }, { "epoch": 0.4867075664621677, "grad_norm": 17.75, "learning_rate": 1.771848909860201e-05, "loss": 5.644268035888672, "step": 714 }, { "epoch": 0.48738922972051807, "grad_norm": 49.5, "learning_rate": 1.7711463503374466e-05, "loss": 6.208334445953369, "step": 715 }, { "epoch": 0.48807089297886846, "grad_norm": 21.375, "learning_rate": 1.7704428505027165e-05, "loss": 7.114458084106445, "step": 716 }, { "epoch": 0.4887525562372188, "grad_norm": 17.625, "learning_rate": 1.7697384112138367e-05, "loss": 6.610820293426514, "step": 717 }, { "epoch": 0.4894342194955692, "grad_norm": 16.375, "learning_rate": 1.769033033329778e-05, "loss": 4.9485554695129395, "step": 718 }, { "epoch": 0.49011588275391954, "grad_norm": 30.5, "learning_rate": 1.7683267177106573e-05, "loss": 3.1811065673828125, "step": 719 }, { "epoch": 0.49079754601226994, "grad_norm": 25.875, "learning_rate": 1.7676194652177333e-05, "loss": 7.05119514465332, "step": 720 }, { "epoch": 0.49147920927062033, "grad_norm": 23.0, "learning_rate": 1.7669112767134084e-05, "loss": 3.027287006378174, "step": 721 }, { "epoch": 0.49216087252897067, "grad_norm": 20.125, "learning_rate": 1.766202153061225e-05, "loss": 4.605448246002197, "step": 722 }, { "epoch": 0.49284253578732107, "grad_norm": 26.375, "learning_rate": 1.7654920951258668e-05, "loss": 4.7265520095825195, "step": 723 }, { "epoch": 0.49352419904567146, "grad_norm": 22.5, "learning_rate": 1.7647811037731565e-05, "loss": 6.136214256286621, "step": 724 }, { "epoch": 0.4942058623040218, "grad_norm": 11.75, "learning_rate": 1.764069179870055e-05, "loss": 4.145930767059326, "step": 725 }, { "epoch": 0.4948875255623722, "grad_norm": 20.75, "learning_rate": 1.76335632428466e-05, "loss": 6.457059860229492, "step": 726 }, { "epoch": 0.49556918882072254, "grad_norm": 36.0, "learning_rate": 1.762642537886206e-05, "loss": 7.282492637634277, "step": 727 }, { "epoch": 0.49625085207907293, "grad_norm": 12.875, "learning_rate": 1.7619278215450615e-05, "loss": 2.4843266010284424, "step": 728 }, { "epoch": 0.49693251533742333, "grad_norm": 21.5, "learning_rate": 1.76121217613273e-05, "loss": 3.3024020195007324, "step": 729 }, { "epoch": 0.49761417859577367, "grad_norm": 22.625, "learning_rate": 1.760495602521847e-05, "loss": 5.873968601226807, "step": 730 }, { "epoch": 0.49829584185412407, "grad_norm": 22.875, "learning_rate": 1.7597781015861797e-05, "loss": 2.7352471351623535, "step": 731 }, { "epoch": 0.49897750511247446, "grad_norm": 16.5, "learning_rate": 1.7590596742006276e-05, "loss": 4.390603065490723, "step": 732 }, { "epoch": 0.4996591683708248, "grad_norm": 32.75, "learning_rate": 1.7583403212412183e-05, "loss": 7.019407272338867, "step": 733 }, { "epoch": 0.5003408316291752, "grad_norm": 29.125, "learning_rate": 1.7576200435851082e-05, "loss": 6.769026756286621, "step": 734 }, { "epoch": 0.5010224948875256, "grad_norm": 52.5, "learning_rate": 1.756898842110582e-05, "loss": 9.766204833984375, "step": 735 }, { "epoch": 0.501704158145876, "grad_norm": 14.8125, "learning_rate": 1.75617671769705e-05, "loss": 4.672554969787598, "step": 736 }, { "epoch": 0.5023858214042263, "grad_norm": 13.75, "learning_rate": 1.7554536712250488e-05, "loss": 3.5992767810821533, "step": 737 }, { "epoch": 0.5030674846625767, "grad_norm": 24.0, "learning_rate": 1.7547297035762387e-05, "loss": 6.065346717834473, "step": 738 }, { "epoch": 0.5037491479209271, "grad_norm": 28.5, "learning_rate": 1.7540048156334035e-05, "loss": 3.5336179733276367, "step": 739 }, { "epoch": 0.5044308111792775, "grad_norm": 19.5, "learning_rate": 1.753279008280449e-05, "loss": 7.444052696228027, "step": 740 }, { "epoch": 0.5051124744376279, "grad_norm": 18.375, "learning_rate": 1.7525522824024023e-05, "loss": 3.4653196334838867, "step": 741 }, { "epoch": 0.5057941376959781, "grad_norm": 24.75, "learning_rate": 1.75182463888541e-05, "loss": 7.252952575683594, "step": 742 }, { "epoch": 0.5064758009543285, "grad_norm": 19.75, "learning_rate": 1.751096078616739e-05, "loss": 7.798409461975098, "step": 743 }, { "epoch": 0.5071574642126789, "grad_norm": 17.0, "learning_rate": 1.7503666024847722e-05, "loss": 5.68978214263916, "step": 744 }, { "epoch": 0.5078391274710293, "grad_norm": 26.75, "learning_rate": 1.74963621137901e-05, "loss": 3.6793763637542725, "step": 745 }, { "epoch": 0.5085207907293797, "grad_norm": 26.25, "learning_rate": 1.7489049061900702e-05, "loss": 3.8085954189300537, "step": 746 }, { "epoch": 0.50920245398773, "grad_norm": 23.375, "learning_rate": 1.7481726878096824e-05, "loss": 3.978182792663574, "step": 747 }, { "epoch": 0.5098841172460804, "grad_norm": 20.0, "learning_rate": 1.7474395571306914e-05, "loss": 3.5641160011291504, "step": 748 }, { "epoch": 0.5105657805044308, "grad_norm": 29.125, "learning_rate": 1.746705515047054e-05, "loss": 3.0876471996307373, "step": 749 }, { "epoch": 0.5112474437627812, "grad_norm": 22.625, "learning_rate": 1.7459705624538383e-05, "loss": 6.352581977844238, "step": 750 }, { "epoch": 0.5119291070211316, "grad_norm": 13.6875, "learning_rate": 1.745234700247223e-05, "loss": 4.559296607971191, "step": 751 }, { "epoch": 0.512610770279482, "grad_norm": 14.6875, "learning_rate": 1.7444979293244953e-05, "loss": 4.898072242736816, "step": 752 }, { "epoch": 0.5132924335378323, "grad_norm": 12.8125, "learning_rate": 1.7437602505840513e-05, "loss": 4.366255760192871, "step": 753 }, { "epoch": 0.5139740967961827, "grad_norm": 12.5625, "learning_rate": 1.7430216649253934e-05, "loss": 3.435013771057129, "step": 754 }, { "epoch": 0.5146557600545331, "grad_norm": 20.125, "learning_rate": 1.7422821732491297e-05, "loss": 3.4886722564697266, "step": 755 }, { "epoch": 0.5153374233128835, "grad_norm": 43.25, "learning_rate": 1.741541776456974e-05, "loss": 4.574648857116699, "step": 756 }, { "epoch": 0.5160190865712339, "grad_norm": 29.75, "learning_rate": 1.7408004754517428e-05, "loss": 3.6834537982940674, "step": 757 }, { "epoch": 0.5167007498295841, "grad_norm": 13.5, "learning_rate": 1.7400582711373558e-05, "loss": 3.394076347351074, "step": 758 }, { "epoch": 0.5173824130879345, "grad_norm": 26.5, "learning_rate": 1.739315164418834e-05, "loss": 6.86533784866333, "step": 759 }, { "epoch": 0.5180640763462849, "grad_norm": 22.875, "learning_rate": 1.7385711562022988e-05, "loss": 6.113768577575684, "step": 760 }, { "epoch": 0.5187457396046353, "grad_norm": 47.25, "learning_rate": 1.7378262473949705e-05, "loss": 8.696137428283691, "step": 761 }, { "epoch": 0.5194274028629857, "grad_norm": 18.375, "learning_rate": 1.737080438905168e-05, "loss": 4.360647201538086, "step": 762 }, { "epoch": 0.520109066121336, "grad_norm": 18.125, "learning_rate": 1.736333731642307e-05, "loss": 7.205753326416016, "step": 763 }, { "epoch": 0.5207907293796864, "grad_norm": 17.5, "learning_rate": 1.735586126516899e-05, "loss": 8.769655227661133, "step": 764 }, { "epoch": 0.5214723926380368, "grad_norm": 38.0, "learning_rate": 1.734837624440551e-05, "loss": 6.432826995849609, "step": 765 }, { "epoch": 0.5221540558963872, "grad_norm": 42.25, "learning_rate": 1.734088226325963e-05, "loss": 4.797942638397217, "step": 766 }, { "epoch": 0.5228357191547376, "grad_norm": 25.125, "learning_rate": 1.733337933086928e-05, "loss": 5.637678146362305, "step": 767 }, { "epoch": 0.523517382413088, "grad_norm": 21.0, "learning_rate": 1.7325867456383303e-05, "loss": 6.412743091583252, "step": 768 }, { "epoch": 0.5241990456714383, "grad_norm": 15.125, "learning_rate": 1.7318346648961444e-05, "loss": 5.598393440246582, "step": 769 }, { "epoch": 0.5248807089297887, "grad_norm": 19.0, "learning_rate": 1.731081691777434e-05, "loss": 8.09205150604248, "step": 770 }, { "epoch": 0.5255623721881391, "grad_norm": 43.0, "learning_rate": 1.7303278272003524e-05, "loss": 5.0000762939453125, "step": 771 }, { "epoch": 0.5262440354464895, "grad_norm": 27.25, "learning_rate": 1.7295730720841372e-05, "loss": 3.178325891494751, "step": 772 }, { "epoch": 0.5269256987048399, "grad_norm": 22.125, "learning_rate": 1.7288174273491144e-05, "loss": 4.753299713134766, "step": 773 }, { "epoch": 0.5276073619631901, "grad_norm": 26.125, "learning_rate": 1.7280608939166937e-05, "loss": 2.4609792232513428, "step": 774 }, { "epoch": 0.5282890252215405, "grad_norm": 36.5, "learning_rate": 1.7273034727093677e-05, "loss": 5.353132247924805, "step": 775 }, { "epoch": 0.5289706884798909, "grad_norm": 30.75, "learning_rate": 1.726545164650714e-05, "loss": 6.388213634490967, "step": 776 }, { "epoch": 0.5296523517382413, "grad_norm": 17.0, "learning_rate": 1.725785970665388e-05, "loss": 4.9844865798950195, "step": 777 }, { "epoch": 0.5303340149965917, "grad_norm": 14.6875, "learning_rate": 1.7250258916791286e-05, "loss": 4.63857889175415, "step": 778 }, { "epoch": 0.5310156782549421, "grad_norm": 27.875, "learning_rate": 1.7242649286187524e-05, "loss": 4.909311294555664, "step": 779 }, { "epoch": 0.5316973415132924, "grad_norm": 21.25, "learning_rate": 1.7235030824121542e-05, "loss": 5.426059722900391, "step": 780 }, { "epoch": 0.5323790047716428, "grad_norm": 47.5, "learning_rate": 1.722740353988305e-05, "loss": 9.43151569366455, "step": 781 }, { "epoch": 0.5330606680299932, "grad_norm": 31.875, "learning_rate": 1.721976744277253e-05, "loss": 5.190795421600342, "step": 782 }, { "epoch": 0.5337423312883436, "grad_norm": 16.75, "learning_rate": 1.7212122542101202e-05, "loss": 2.316226005554199, "step": 783 }, { "epoch": 0.534423994546694, "grad_norm": 12.5, "learning_rate": 1.7204468847191017e-05, "loss": 3.7189102172851562, "step": 784 }, { "epoch": 0.5351056578050443, "grad_norm": 26.875, "learning_rate": 1.7196806367374656e-05, "loss": 5.842459201812744, "step": 785 }, { "epoch": 0.5357873210633947, "grad_norm": 13.5625, "learning_rate": 1.718913511199551e-05, "loss": 2.479865789413452, "step": 786 }, { "epoch": 0.5364689843217451, "grad_norm": 40.5, "learning_rate": 1.7181455090407667e-05, "loss": 8.691658020019531, "step": 787 }, { "epoch": 0.5371506475800955, "grad_norm": 18.375, "learning_rate": 1.717376631197591e-05, "loss": 4.961927890777588, "step": 788 }, { "epoch": 0.5378323108384458, "grad_norm": 15.875, "learning_rate": 1.7166068786075697e-05, "loss": 3.161909580230713, "step": 789 }, { "epoch": 0.5385139740967961, "grad_norm": 11.9375, "learning_rate": 1.7158362522093153e-05, "loss": 3.577162981033325, "step": 790 }, { "epoch": 0.5391956373551465, "grad_norm": 44.0, "learning_rate": 1.715064752942506e-05, "loss": 9.465940475463867, "step": 791 }, { "epoch": 0.5398773006134969, "grad_norm": 21.5, "learning_rate": 1.714292381747883e-05, "loss": 7.599254608154297, "step": 792 }, { "epoch": 0.5405589638718473, "grad_norm": 30.75, "learning_rate": 1.7135191395672532e-05, "loss": 3.613727331161499, "step": 793 }, { "epoch": 0.5412406271301977, "grad_norm": 12.5, "learning_rate": 1.7127450273434837e-05, "loss": 4.984393119812012, "step": 794 }, { "epoch": 0.5419222903885481, "grad_norm": 14.4375, "learning_rate": 1.7119700460205026e-05, "loss": 4.632012367248535, "step": 795 }, { "epoch": 0.5426039536468984, "grad_norm": 25.625, "learning_rate": 1.7111941965432985e-05, "loss": 7.500833511352539, "step": 796 }, { "epoch": 0.5432856169052488, "grad_norm": 20.0, "learning_rate": 1.710417479857918e-05, "loss": 5.490220546722412, "step": 797 }, { "epoch": 0.5439672801635992, "grad_norm": 16.625, "learning_rate": 1.709639896911466e-05, "loss": 4.7602386474609375, "step": 798 }, { "epoch": 0.5446489434219496, "grad_norm": 51.0, "learning_rate": 1.708861448652102e-05, "loss": 5.899531364440918, "step": 799 }, { "epoch": 0.5453306066803, "grad_norm": 18.875, "learning_rate": 1.7080821360290426e-05, "loss": 4.8148579597473145, "step": 800 }, { "epoch": 0.5460122699386503, "grad_norm": 29.5, "learning_rate": 1.707301959992557e-05, "loss": 3.1351547241210938, "step": 801 }, { "epoch": 0.5466939331970007, "grad_norm": 43.0, "learning_rate": 1.7065209214939677e-05, "loss": 9.405404090881348, "step": 802 }, { "epoch": 0.547375596455351, "grad_norm": 38.75, "learning_rate": 1.7057390214856493e-05, "loss": 7.529723644256592, "step": 803 }, { "epoch": 0.5480572597137015, "grad_norm": 15.1875, "learning_rate": 1.704956260921026e-05, "loss": 3.425607204437256, "step": 804 }, { "epoch": 0.5487389229720518, "grad_norm": 53.25, "learning_rate": 1.7041726407545716e-05, "loss": 10.491121292114258, "step": 805 }, { "epoch": 0.5494205862304021, "grad_norm": 30.0, "learning_rate": 1.703388161941809e-05, "loss": 6.606597900390625, "step": 806 }, { "epoch": 0.5501022494887525, "grad_norm": 25.125, "learning_rate": 1.7026028254393067e-05, "loss": 2.927739143371582, "step": 807 }, { "epoch": 0.5507839127471029, "grad_norm": 12.875, "learning_rate": 1.7018166322046798e-05, "loss": 4.3564558029174805, "step": 808 }, { "epoch": 0.5514655760054533, "grad_norm": 29.0, "learning_rate": 1.7010295831965886e-05, "loss": 6.850395202636719, "step": 809 }, { "epoch": 0.5521472392638037, "grad_norm": 14.9375, "learning_rate": 1.7002416793747354e-05, "loss": 4.009963035583496, "step": 810 }, { "epoch": 0.5528289025221541, "grad_norm": 17.5, "learning_rate": 1.6994529216998664e-05, "loss": 3.866522789001465, "step": 811 }, { "epoch": 0.5535105657805044, "grad_norm": 20.375, "learning_rate": 1.698663311133768e-05, "loss": 5.946323394775391, "step": 812 }, { "epoch": 0.5541922290388548, "grad_norm": 21.125, "learning_rate": 1.697872848639267e-05, "loss": 4.221358299255371, "step": 813 }, { "epoch": 0.5548738922972052, "grad_norm": 20.125, "learning_rate": 1.6970815351802285e-05, "loss": 5.555936336517334, "step": 814 }, { "epoch": 0.5555555555555556, "grad_norm": 17.375, "learning_rate": 1.696289371721556e-05, "loss": 4.56041955947876, "step": 815 }, { "epoch": 0.556237218813906, "grad_norm": 17.5, "learning_rate": 1.695496359229189e-05, "loss": 6.984447479248047, "step": 816 }, { "epoch": 0.5569188820722563, "grad_norm": 26.625, "learning_rate": 1.694702498670102e-05, "loss": 6.216546058654785, "step": 817 }, { "epoch": 0.5576005453306067, "grad_norm": 26.5, "learning_rate": 1.693907791012305e-05, "loss": 7.583439826965332, "step": 818 }, { "epoch": 0.558282208588957, "grad_norm": 25.0, "learning_rate": 1.6931122372248386e-05, "loss": 9.43929672241211, "step": 819 }, { "epoch": 0.5589638718473074, "grad_norm": 19.375, "learning_rate": 1.692315838277778e-05, "loss": 8.433021545410156, "step": 820 }, { "epoch": 0.5596455351056578, "grad_norm": 18.625, "learning_rate": 1.6915185951422256e-05, "loss": 2.7877023220062256, "step": 821 }, { "epoch": 0.5603271983640081, "grad_norm": 15.25, "learning_rate": 1.690720508790316e-05, "loss": 5.281449317932129, "step": 822 }, { "epoch": 0.5610088616223585, "grad_norm": 26.75, "learning_rate": 1.6899215801952112e-05, "loss": 4.110177516937256, "step": 823 }, { "epoch": 0.5616905248807089, "grad_norm": 23.125, "learning_rate": 1.6891218103310994e-05, "loss": 8.110624313354492, "step": 824 }, { "epoch": 0.5623721881390593, "grad_norm": 18.125, "learning_rate": 1.6883212001731956e-05, "loss": 4.416251182556152, "step": 825 }, { "epoch": 0.5630538513974097, "grad_norm": 16.875, "learning_rate": 1.6875197506977387e-05, "loss": 4.327664852142334, "step": 826 }, { "epoch": 0.5637355146557601, "grad_norm": 21.75, "learning_rate": 1.686717462881992e-05, "loss": 4.46920108795166, "step": 827 }, { "epoch": 0.5644171779141104, "grad_norm": 31.875, "learning_rate": 1.685914337704239e-05, "loss": 8.769279479980469, "step": 828 }, { "epoch": 0.5650988411724608, "grad_norm": 12.625, "learning_rate": 1.6851103761437876e-05, "loss": 5.457320690155029, "step": 829 }, { "epoch": 0.5657805044308112, "grad_norm": 17.875, "learning_rate": 1.6843055791809623e-05, "loss": 5.706052780151367, "step": 830 }, { "epoch": 0.5664621676891616, "grad_norm": 11.375, "learning_rate": 1.6834999477971078e-05, "loss": 4.3603081703186035, "step": 831 }, { "epoch": 0.567143830947512, "grad_norm": 16.25, "learning_rate": 1.6826934829745868e-05, "loss": 4.580974578857422, "step": 832 }, { "epoch": 0.5678254942058623, "grad_norm": 19.25, "learning_rate": 1.6818861856967762e-05, "loss": 6.1858110427856445, "step": 833 }, { "epoch": 0.5685071574642127, "grad_norm": 19.125, "learning_rate": 1.68107805694807e-05, "loss": 6.189134120941162, "step": 834 }, { "epoch": 0.569188820722563, "grad_norm": 20.0, "learning_rate": 1.680269097713876e-05, "loss": 6.221613883972168, "step": 835 }, { "epoch": 0.5698704839809134, "grad_norm": 13.1875, "learning_rate": 1.6794593089806134e-05, "loss": 4.74809455871582, "step": 836 }, { "epoch": 0.5705521472392638, "grad_norm": 19.125, "learning_rate": 1.678648691735713e-05, "loss": 6.200054168701172, "step": 837 }, { "epoch": 0.5712338104976141, "grad_norm": 31.25, "learning_rate": 1.6778372469676173e-05, "loss": 5.635201454162598, "step": 838 }, { "epoch": 0.5719154737559645, "grad_norm": 17.5, "learning_rate": 1.6770249756657762e-05, "loss": 5.680493354797363, "step": 839 }, { "epoch": 0.5725971370143149, "grad_norm": 24.75, "learning_rate": 1.6762118788206488e-05, "loss": 5.830935478210449, "step": 840 }, { "epoch": 0.5732788002726653, "grad_norm": 24.75, "learning_rate": 1.6753979574236996e-05, "loss": 2.997720718383789, "step": 841 }, { "epoch": 0.5739604635310157, "grad_norm": 23.875, "learning_rate": 1.6745832124673996e-05, "loss": 2.9766693115234375, "step": 842 }, { "epoch": 0.5746421267893661, "grad_norm": 21.875, "learning_rate": 1.6737676449452235e-05, "loss": 6.096424102783203, "step": 843 }, { "epoch": 0.5753237900477164, "grad_norm": 23.0, "learning_rate": 1.672951255851649e-05, "loss": 5.801201343536377, "step": 844 }, { "epoch": 0.5760054533060668, "grad_norm": 19.75, "learning_rate": 1.6721340461821555e-05, "loss": 5.334002494812012, "step": 845 }, { "epoch": 0.5766871165644172, "grad_norm": 11.5625, "learning_rate": 1.671316016933223e-05, "loss": 3.296034812927246, "step": 846 }, { "epoch": 0.5773687798227676, "grad_norm": 13.375, "learning_rate": 1.6704971691023316e-05, "loss": 4.413066864013672, "step": 847 }, { "epoch": 0.578050443081118, "grad_norm": 62.25, "learning_rate": 1.6696775036879588e-05, "loss": 8.878671646118164, "step": 848 }, { "epoch": 0.5787321063394683, "grad_norm": 27.5, "learning_rate": 1.6688570216895793e-05, "loss": 6.95365571975708, "step": 849 }, { "epoch": 0.5794137695978187, "grad_norm": 11.0, "learning_rate": 1.6680357241076632e-05, "loss": 2.980342388153076, "step": 850 }, { "epoch": 0.580095432856169, "grad_norm": 19.25, "learning_rate": 1.667213611943675e-05, "loss": 4.892740249633789, "step": 851 }, { "epoch": 0.5807770961145194, "grad_norm": 13.625, "learning_rate": 1.6663906862000736e-05, "loss": 4.322038650512695, "step": 852 }, { "epoch": 0.5814587593728698, "grad_norm": 36.25, "learning_rate": 1.6655669478803086e-05, "loss": 4.278210639953613, "step": 853 }, { "epoch": 0.5821404226312201, "grad_norm": 15.125, "learning_rate": 1.6647423979888214e-05, "loss": 5.745888710021973, "step": 854 }, { "epoch": 0.5828220858895705, "grad_norm": 15.75, "learning_rate": 1.6639170375310422e-05, "loss": 5.346442222595215, "step": 855 }, { "epoch": 0.5835037491479209, "grad_norm": 25.75, "learning_rate": 1.6630908675133905e-05, "loss": 8.261942863464355, "step": 856 }, { "epoch": 0.5841854124062713, "grad_norm": 28.625, "learning_rate": 1.6622638889432716e-05, "loss": 7.7204437255859375, "step": 857 }, { "epoch": 0.5848670756646217, "grad_norm": 18.75, "learning_rate": 1.6614361028290783e-05, "loss": 4.518536567687988, "step": 858 }, { "epoch": 0.5855487389229721, "grad_norm": 16.125, "learning_rate": 1.660607510180187e-05, "loss": 6.365080833435059, "step": 859 }, { "epoch": 0.5862304021813224, "grad_norm": 14.75, "learning_rate": 1.6597781120069584e-05, "loss": 6.839221954345703, "step": 860 }, { "epoch": 0.5869120654396728, "grad_norm": 24.5, "learning_rate": 1.658947909320734e-05, "loss": 3.103800058364868, "step": 861 }, { "epoch": 0.5875937286980232, "grad_norm": 21.25, "learning_rate": 1.658116903133838e-05, "loss": 9.818506240844727, "step": 862 }, { "epoch": 0.5882753919563736, "grad_norm": 15.0, "learning_rate": 1.6572850944595735e-05, "loss": 5.974897861480713, "step": 863 }, { "epoch": 0.588957055214724, "grad_norm": 35.5, "learning_rate": 1.6564524843122223e-05, "loss": 8.5772123336792, "step": 864 }, { "epoch": 0.5896387184730743, "grad_norm": 21.375, "learning_rate": 1.655619073707043e-05, "loss": 3.4890878200531006, "step": 865 }, { "epoch": 0.5903203817314246, "grad_norm": 19.75, "learning_rate": 1.6547848636602708e-05, "loss": 6.76170539855957, "step": 866 }, { "epoch": 0.591002044989775, "grad_norm": 20.75, "learning_rate": 1.653949855189116e-05, "loss": 7.491352081298828, "step": 867 }, { "epoch": 0.5916837082481254, "grad_norm": 17.75, "learning_rate": 1.653114049311762e-05, "loss": 8.280683517456055, "step": 868 }, { "epoch": 0.5923653715064758, "grad_norm": 13.375, "learning_rate": 1.6522774470473642e-05, "loss": 7.435303211212158, "step": 869 }, { "epoch": 0.5930470347648262, "grad_norm": 13.0625, "learning_rate": 1.6514400494160498e-05, "loss": 4.654882907867432, "step": 870 }, { "epoch": 0.5937286980231765, "grad_norm": 13.4375, "learning_rate": 1.6506018574389152e-05, "loss": 4.3531999588012695, "step": 871 }, { "epoch": 0.5944103612815269, "grad_norm": 23.25, "learning_rate": 1.6497628721380257e-05, "loss": 6.253190040588379, "step": 872 }, { "epoch": 0.5950920245398773, "grad_norm": 20.5, "learning_rate": 1.6489230945364148e-05, "loss": 3.7390856742858887, "step": 873 }, { "epoch": 0.5957736877982277, "grad_norm": 15.3125, "learning_rate": 1.648082525658081e-05, "loss": 3.9133782386779785, "step": 874 }, { "epoch": 0.5964553510565781, "grad_norm": 19.0, "learning_rate": 1.6472411665279872e-05, "loss": 2.8957972526550293, "step": 875 }, { "epoch": 0.5971370143149284, "grad_norm": 14.625, "learning_rate": 1.646399018172061e-05, "loss": 5.105293273925781, "step": 876 }, { "epoch": 0.5978186775732788, "grad_norm": 21.375, "learning_rate": 1.6455560816171928e-05, "loss": 2.1098718643188477, "step": 877 }, { "epoch": 0.5985003408316292, "grad_norm": 19.125, "learning_rate": 1.644712357891232e-05, "loss": 5.385929584503174, "step": 878 }, { "epoch": 0.5991820040899796, "grad_norm": 12.0625, "learning_rate": 1.643867848022991e-05, "loss": 4.1626386642456055, "step": 879 }, { "epoch": 0.59986366734833, "grad_norm": 24.25, "learning_rate": 1.643022553042237e-05, "loss": 5.523311138153076, "step": 880 }, { "epoch": 0.6005453306066802, "grad_norm": 47.5, "learning_rate": 1.6421764739796974e-05, "loss": 7.681699752807617, "step": 881 }, { "epoch": 0.6012269938650306, "grad_norm": 25.75, "learning_rate": 1.6413296118670553e-05, "loss": 4.2484588623046875, "step": 882 }, { "epoch": 0.601908657123381, "grad_norm": 24.75, "learning_rate": 1.6404819677369474e-05, "loss": 6.761958122253418, "step": 883 }, { "epoch": 0.6025903203817314, "grad_norm": 13.8125, "learning_rate": 1.639633542622965e-05, "loss": 3.772169828414917, "step": 884 }, { "epoch": 0.6032719836400818, "grad_norm": 22.5, "learning_rate": 1.6387843375596513e-05, "loss": 7.125896453857422, "step": 885 }, { "epoch": 0.6039536468984322, "grad_norm": 19.625, "learning_rate": 1.6379343535825004e-05, "loss": 3.3655197620391846, "step": 886 }, { "epoch": 0.6046353101567825, "grad_norm": 16.625, "learning_rate": 1.6370835917279573e-05, "loss": 3.6069765090942383, "step": 887 }, { "epoch": 0.6053169734151329, "grad_norm": 36.0, "learning_rate": 1.636232053033414e-05, "loss": 8.15231704711914, "step": 888 }, { "epoch": 0.6059986366734833, "grad_norm": 16.375, "learning_rate": 1.63537973853721e-05, "loss": 3.788557767868042, "step": 889 }, { "epoch": 0.6066802999318337, "grad_norm": 18.0, "learning_rate": 1.634526649278632e-05, "loss": 5.229141712188721, "step": 890 }, { "epoch": 0.6073619631901841, "grad_norm": 29.25, "learning_rate": 1.6336727862979108e-05, "loss": 3.939873456954956, "step": 891 }, { "epoch": 0.6080436264485344, "grad_norm": 14.3125, "learning_rate": 1.6328181506362193e-05, "loss": 4.4055256843566895, "step": 892 }, { "epoch": 0.6087252897068848, "grad_norm": 15.5625, "learning_rate": 1.631962743335675e-05, "loss": 4.996438026428223, "step": 893 }, { "epoch": 0.6094069529652352, "grad_norm": 20.125, "learning_rate": 1.631106565439334e-05, "loss": 4.564064979553223, "step": 894 }, { "epoch": 0.6100886162235856, "grad_norm": 15.5625, "learning_rate": 1.630249617991194e-05, "loss": 3.768395185470581, "step": 895 }, { "epoch": 0.610770279481936, "grad_norm": 23.25, "learning_rate": 1.6293919020361895e-05, "loss": 5.623241424560547, "step": 896 }, { "epoch": 0.6114519427402862, "grad_norm": 41.5, "learning_rate": 1.6285334186201933e-05, "loss": 9.943592071533203, "step": 897 }, { "epoch": 0.6121336059986366, "grad_norm": 11.8125, "learning_rate": 1.6276741687900134e-05, "loss": 2.9091217517852783, "step": 898 }, { "epoch": 0.612815269256987, "grad_norm": 23.5, "learning_rate": 1.626814153593392e-05, "loss": 5.738786697387695, "step": 899 }, { "epoch": 0.6134969325153374, "grad_norm": 11.5, "learning_rate": 1.6259533740790055e-05, "loss": 4.156549453735352, "step": 900 }, { "epoch": 0.6141785957736878, "grad_norm": 16.0, "learning_rate": 1.6250918312964613e-05, "loss": 4.519524097442627, "step": 901 }, { "epoch": 0.6148602590320382, "grad_norm": 15.6875, "learning_rate": 1.6242295262962983e-05, "loss": 4.07269287109375, "step": 902 }, { "epoch": 0.6155419222903885, "grad_norm": 34.25, "learning_rate": 1.6233664601299848e-05, "loss": 5.758724212646484, "step": 903 }, { "epoch": 0.6162235855487389, "grad_norm": 14.25, "learning_rate": 1.6225026338499166e-05, "loss": 3.3662121295928955, "step": 904 }, { "epoch": 0.6169052488070893, "grad_norm": 17.125, "learning_rate": 1.6216380485094164e-05, "loss": 4.307858467102051, "step": 905 }, { "epoch": 0.6175869120654397, "grad_norm": 29.75, "learning_rate": 1.6207727051627334e-05, "loss": 6.085043907165527, "step": 906 }, { "epoch": 0.6182685753237901, "grad_norm": 19.875, "learning_rate": 1.61990660486504e-05, "loss": 5.409701347351074, "step": 907 }, { "epoch": 0.6189502385821404, "grad_norm": 12.0, "learning_rate": 1.6190397486724324e-05, "loss": 3.53297758102417, "step": 908 }, { "epoch": 0.6196319018404908, "grad_norm": 17.25, "learning_rate": 1.6181721376419282e-05, "loss": 4.4189252853393555, "step": 909 }, { "epoch": 0.6203135650988412, "grad_norm": 21.25, "learning_rate": 1.617303772831465e-05, "loss": 7.012677192687988, "step": 910 }, { "epoch": 0.6209952283571916, "grad_norm": 31.75, "learning_rate": 1.6164346552999e-05, "loss": 8.130770683288574, "step": 911 }, { "epoch": 0.621676891615542, "grad_norm": 33.5, "learning_rate": 1.615564786107009e-05, "loss": 8.663565635681152, "step": 912 }, { "epoch": 0.6223585548738922, "grad_norm": 17.375, "learning_rate": 1.614694166313483e-05, "loss": 4.179257392883301, "step": 913 }, { "epoch": 0.6230402181322426, "grad_norm": 15.5625, "learning_rate": 1.6138227969809283e-05, "loss": 6.245866298675537, "step": 914 }, { "epoch": 0.623721881390593, "grad_norm": 27.125, "learning_rate": 1.6129506791718665e-05, "loss": 7.57796573638916, "step": 915 }, { "epoch": 0.6244035446489434, "grad_norm": 22.125, "learning_rate": 1.6120778139497307e-05, "loss": 5.888785362243652, "step": 916 }, { "epoch": 0.6250852079072938, "grad_norm": 32.5, "learning_rate": 1.611204202378866e-05, "loss": 9.636720657348633, "step": 917 }, { "epoch": 0.6257668711656442, "grad_norm": 28.5, "learning_rate": 1.6103298455245267e-05, "loss": 10.562395095825195, "step": 918 }, { "epoch": 0.6264485344239945, "grad_norm": 18.25, "learning_rate": 1.6094547444528767e-05, "loss": 4.589843273162842, "step": 919 }, { "epoch": 0.6271301976823449, "grad_norm": 13.6875, "learning_rate": 1.6085789002309873e-05, "loss": 5.723838806152344, "step": 920 }, { "epoch": 0.6278118609406953, "grad_norm": 17.625, "learning_rate": 1.607702313926836e-05, "loss": 6.712417125701904, "step": 921 }, { "epoch": 0.6284935241990457, "grad_norm": 32.25, "learning_rate": 1.6068249866093046e-05, "loss": 4.781304359436035, "step": 922 }, { "epoch": 0.6291751874573961, "grad_norm": 21.0, "learning_rate": 1.605946919348179e-05, "loss": 3.957465887069702, "step": 923 }, { "epoch": 0.6298568507157464, "grad_norm": 54.0, "learning_rate": 1.6050681132141473e-05, "loss": 11.913284301757812, "step": 924 }, { "epoch": 0.6305385139740968, "grad_norm": 21.75, "learning_rate": 1.6041885692787985e-05, "loss": 2.1874420642852783, "step": 925 }, { "epoch": 0.6312201772324472, "grad_norm": 22.5, "learning_rate": 1.603308288614621e-05, "loss": 4.355494499206543, "step": 926 }, { "epoch": 0.6319018404907976, "grad_norm": 18.5, "learning_rate": 1.602427272295002e-05, "loss": 4.3051252365112305, "step": 927 }, { "epoch": 0.632583503749148, "grad_norm": 18.375, "learning_rate": 1.6015455213942253e-05, "loss": 5.1242852210998535, "step": 928 }, { "epoch": 0.6332651670074982, "grad_norm": 17.625, "learning_rate": 1.600663036987471e-05, "loss": 4.971452713012695, "step": 929 }, { "epoch": 0.6339468302658486, "grad_norm": 18.0, "learning_rate": 1.599779820150813e-05, "loss": 3.9015190601348877, "step": 930 }, { "epoch": 0.634628493524199, "grad_norm": 34.5, "learning_rate": 1.5988958719612182e-05, "loss": 8.000478744506836, "step": 931 }, { "epoch": 0.6353101567825494, "grad_norm": 20.0, "learning_rate": 1.5980111934965467e-05, "loss": 5.082400321960449, "step": 932 }, { "epoch": 0.6359918200408998, "grad_norm": 23.25, "learning_rate": 1.5971257858355467e-05, "loss": 5.638777732849121, "step": 933 }, { "epoch": 0.6366734832992502, "grad_norm": 49.25, "learning_rate": 1.596239650057858e-05, "loss": 8.658170700073242, "step": 934 }, { "epoch": 0.6373551465576005, "grad_norm": 39.0, "learning_rate": 1.5953527872440063e-05, "loss": 6.13491678237915, "step": 935 }, { "epoch": 0.6380368098159509, "grad_norm": 28.375, "learning_rate": 1.5944651984754053e-05, "loss": 3.5162131786346436, "step": 936 }, { "epoch": 0.6387184730743013, "grad_norm": 26.875, "learning_rate": 1.5935768848343527e-05, "loss": 8.209856033325195, "step": 937 }, { "epoch": 0.6394001363326517, "grad_norm": 24.375, "learning_rate": 1.5926878474040313e-05, "loss": 7.896737575531006, "step": 938 }, { "epoch": 0.6400817995910021, "grad_norm": 31.5, "learning_rate": 1.591798087268505e-05, "loss": 2.9893224239349365, "step": 939 }, { "epoch": 0.6407634628493524, "grad_norm": 28.5, "learning_rate": 1.5909076055127202e-05, "loss": 9.902832984924316, "step": 940 }, { "epoch": 0.6414451261077028, "grad_norm": 38.0, "learning_rate": 1.590016403222503e-05, "loss": 6.824080467224121, "step": 941 }, { "epoch": 0.6421267893660532, "grad_norm": 16.5, "learning_rate": 1.5891244814845575e-05, "loss": 5.282330513000488, "step": 942 }, { "epoch": 0.6428084526244036, "grad_norm": 21.125, "learning_rate": 1.5882318413864653e-05, "loss": 5.467375755310059, "step": 943 }, { "epoch": 0.643490115882754, "grad_norm": 19.625, "learning_rate": 1.5873384840166846e-05, "loss": 7.627417087554932, "step": 944 }, { "epoch": 0.6441717791411042, "grad_norm": 23.0, "learning_rate": 1.5864444104645473e-05, "loss": 5.842252254486084, "step": 945 }, { "epoch": 0.6448534423994546, "grad_norm": 18.875, "learning_rate": 1.5855496218202592e-05, "loss": 5.883267402648926, "step": 946 }, { "epoch": 0.645535105657805, "grad_norm": 14.8125, "learning_rate": 1.5846541191748978e-05, "loss": 4.811272621154785, "step": 947 }, { "epoch": 0.6462167689161554, "grad_norm": 13.0, "learning_rate": 1.5837579036204114e-05, "loss": 5.287847518920898, "step": 948 }, { "epoch": 0.6468984321745058, "grad_norm": 20.875, "learning_rate": 1.582860976249617e-05, "loss": 3.5283546447753906, "step": 949 }, { "epoch": 0.6475800954328562, "grad_norm": 36.25, "learning_rate": 1.581963338156201e-05, "loss": 7.442572593688965, "step": 950 }, { "epoch": 0.6482617586912065, "grad_norm": 17.25, "learning_rate": 1.5810649904347145e-05, "loss": 4.404822826385498, "step": 951 }, { "epoch": 0.6489434219495569, "grad_norm": 15.0, "learning_rate": 1.5801659341805752e-05, "loss": 4.723443031311035, "step": 952 }, { "epoch": 0.6496250852079073, "grad_norm": 22.5, "learning_rate": 1.5792661704900648e-05, "loss": 7.175860404968262, "step": 953 }, { "epoch": 0.6503067484662577, "grad_norm": 35.75, "learning_rate": 1.5783657004603273e-05, "loss": 9.856776237487793, "step": 954 }, { "epoch": 0.6509884117246081, "grad_norm": 20.0, "learning_rate": 1.5774645251893673e-05, "loss": 6.082765579223633, "step": 955 }, { "epoch": 0.6516700749829584, "grad_norm": 17.625, "learning_rate": 1.5765626457760506e-05, "loss": 3.8805317878723145, "step": 956 }, { "epoch": 0.6523517382413088, "grad_norm": 32.5, "learning_rate": 1.575660063320101e-05, "loss": 7.641180038452148, "step": 957 }, { "epoch": 0.6530334014996592, "grad_norm": 17.5, "learning_rate": 1.574756778922099e-05, "loss": 5.246222496032715, "step": 958 }, { "epoch": 0.6537150647580096, "grad_norm": 17.25, "learning_rate": 1.5738527936834824e-05, "loss": 3.750765323638916, "step": 959 }, { "epoch": 0.65439672801636, "grad_norm": 13.25, "learning_rate": 1.5729481087065423e-05, "loss": 4.505781650543213, "step": 960 }, { "epoch": 0.6550783912747103, "grad_norm": 44.25, "learning_rate": 1.5720427250944237e-05, "loss": 10.043703079223633, "step": 961 }, { "epoch": 0.6557600545330606, "grad_norm": 16.625, "learning_rate": 1.5711366439511234e-05, "loss": 3.813462257385254, "step": 962 }, { "epoch": 0.656441717791411, "grad_norm": 23.75, "learning_rate": 1.5702298663814884e-05, "loss": 4.2245330810546875, "step": 963 }, { "epoch": 0.6571233810497614, "grad_norm": 18.875, "learning_rate": 1.569322393491216e-05, "loss": 5.409110069274902, "step": 964 }, { "epoch": 0.6578050443081118, "grad_norm": 21.125, "learning_rate": 1.5684142263868493e-05, "loss": 7.5253400802612305, "step": 965 }, { "epoch": 0.6584867075664622, "grad_norm": 21.875, "learning_rate": 1.5675053661757802e-05, "loss": 2.632591724395752, "step": 966 }, { "epoch": 0.6591683708248125, "grad_norm": 29.875, "learning_rate": 1.566595813966244e-05, "loss": 3.4430923461914062, "step": 967 }, { "epoch": 0.6598500340831629, "grad_norm": 12.4375, "learning_rate": 1.5656855708673208e-05, "loss": 4.857151031494141, "step": 968 }, { "epoch": 0.6605316973415133, "grad_norm": 11.25, "learning_rate": 1.564774637988933e-05, "loss": 3.3688602447509766, "step": 969 }, { "epoch": 0.6612133605998637, "grad_norm": 18.375, "learning_rate": 1.5638630164418435e-05, "loss": 3.5627150535583496, "step": 970 }, { "epoch": 0.6618950238582141, "grad_norm": 27.125, "learning_rate": 1.5629507073376556e-05, "loss": 5.497921943664551, "step": 971 }, { "epoch": 0.6625766871165644, "grad_norm": 42.75, "learning_rate": 1.562037711788811e-05, "loss": 5.4203081130981445, "step": 972 }, { "epoch": 0.6632583503749148, "grad_norm": 14.375, "learning_rate": 1.5611240309085877e-05, "loss": 4.977199554443359, "step": 973 }, { "epoch": 0.6639400136332652, "grad_norm": 10.3125, "learning_rate": 1.5602096658111003e-05, "loss": 3.1947827339172363, "step": 974 }, { "epoch": 0.6646216768916156, "grad_norm": 20.375, "learning_rate": 1.5592946176112973e-05, "loss": 4.727209091186523, "step": 975 }, { "epoch": 0.665303340149966, "grad_norm": 27.375, "learning_rate": 1.55837888742496e-05, "loss": 6.3235392570495605, "step": 976 }, { "epoch": 0.6659850034083163, "grad_norm": 14.8125, "learning_rate": 1.5574624763687006e-05, "loss": 4.340359687805176, "step": 977 }, { "epoch": 0.6666666666666666, "grad_norm": 14.6875, "learning_rate": 1.556545385559964e-05, "loss": 2.8050012588500977, "step": 978 }, { "epoch": 0.667348329925017, "grad_norm": 20.125, "learning_rate": 1.5556276161170214e-05, "loss": 4.717611312866211, "step": 979 }, { "epoch": 0.6680299931833674, "grad_norm": 14.375, "learning_rate": 1.554709169158972e-05, "loss": 4.422959327697754, "step": 980 }, { "epoch": 0.6687116564417178, "grad_norm": 17.875, "learning_rate": 1.5537900458057426e-05, "loss": 5.352914810180664, "step": 981 }, { "epoch": 0.6693933197000682, "grad_norm": 20.75, "learning_rate": 1.5528702471780832e-05, "loss": 5.555569171905518, "step": 982 }, { "epoch": 0.6700749829584185, "grad_norm": 25.25, "learning_rate": 1.5519497743975676e-05, "loss": 6.7705183029174805, "step": 983 }, { "epoch": 0.6707566462167689, "grad_norm": 12.9375, "learning_rate": 1.551028628586592e-05, "loss": 3.375976085662842, "step": 984 }, { "epoch": 0.6714383094751193, "grad_norm": 23.5, "learning_rate": 1.550106810868373e-05, "loss": 7.389703273773193, "step": 985 }, { "epoch": 0.6721199727334697, "grad_norm": 43.75, "learning_rate": 1.549184322366947e-05, "loss": 6.736047267913818, "step": 986 }, { "epoch": 0.6728016359918201, "grad_norm": 11.8125, "learning_rate": 1.5482611642071672e-05, "loss": 3.954582691192627, "step": 987 }, { "epoch": 0.6734832992501704, "grad_norm": 27.375, "learning_rate": 1.5473373375147046e-05, "loss": 8.18304443359375, "step": 988 }, { "epoch": 0.6741649625085208, "grad_norm": 33.25, "learning_rate": 1.546412843416045e-05, "loss": 6.267477035522461, "step": 989 }, { "epoch": 0.6748466257668712, "grad_norm": 17.375, "learning_rate": 1.5454876830384868e-05, "loss": 4.164224147796631, "step": 990 }, { "epoch": 0.6755282890252216, "grad_norm": 18.875, "learning_rate": 1.544561857510143e-05, "loss": 7.214755058288574, "step": 991 }, { "epoch": 0.676209952283572, "grad_norm": 15.9375, "learning_rate": 1.5436353679599363e-05, "loss": 6.3685150146484375, "step": 992 }, { "epoch": 0.6768916155419223, "grad_norm": 13.75, "learning_rate": 1.5427082155175993e-05, "loss": 3.7820606231689453, "step": 993 }, { "epoch": 0.6775732788002726, "grad_norm": 35.5, "learning_rate": 1.541780401313673e-05, "loss": 3.8012566566467285, "step": 994 }, { "epoch": 0.678254942058623, "grad_norm": 13.8125, "learning_rate": 1.540851926479505e-05, "loss": 5.9949235916137695, "step": 995 }, { "epoch": 0.6789366053169734, "grad_norm": 23.5, "learning_rate": 1.5399227921472493e-05, "loss": 6.8023247718811035, "step": 996 }, { "epoch": 0.6796182685753238, "grad_norm": 19.125, "learning_rate": 1.5389929994498635e-05, "loss": 2.1573333740234375, "step": 997 }, { "epoch": 0.6802999318336742, "grad_norm": 27.375, "learning_rate": 1.5380625495211072e-05, "loss": 5.905636787414551, "step": 998 }, { "epoch": 0.6809815950920245, "grad_norm": 46.75, "learning_rate": 1.537131443495543e-05, "loss": 5.439169883728027, "step": 999 }, { "epoch": 0.6816632583503749, "grad_norm": 37.0, "learning_rate": 1.536199682508533e-05, "loss": 7.3878960609436035, "step": 1000 }, { "epoch": 0.6823449216087253, "grad_norm": 21.25, "learning_rate": 1.5352672676962365e-05, "loss": 4.30549955368042, "step": 1001 }, { "epoch": 0.6830265848670757, "grad_norm": 32.25, "learning_rate": 1.5343342001956125e-05, "loss": 5.103906631469727, "step": 1002 }, { "epoch": 0.6837082481254261, "grad_norm": 11.1875, "learning_rate": 1.533400481144414e-05, "loss": 3.6511011123657227, "step": 1003 }, { "epoch": 0.6843899113837764, "grad_norm": 53.25, "learning_rate": 1.5324661116811887e-05, "loss": 9.983938217163086, "step": 1004 }, { "epoch": 0.6850715746421268, "grad_norm": 103.5, "learning_rate": 1.531531092945279e-05, "loss": 7.256073951721191, "step": 1005 }, { "epoch": 0.6857532379004772, "grad_norm": 14.5625, "learning_rate": 1.5305954260768166e-05, "loss": 1.815278172492981, "step": 1006 }, { "epoch": 0.6864349011588275, "grad_norm": 13.3125, "learning_rate": 1.5296591122167254e-05, "loss": 2.315953254699707, "step": 1007 }, { "epoch": 0.6871165644171779, "grad_norm": 13.4375, "learning_rate": 1.5287221525067168e-05, "loss": 3.0559139251708984, "step": 1008 }, { "epoch": 0.6877982276755283, "grad_norm": 34.75, "learning_rate": 1.5277845480892914e-05, "loss": 6.707594871520996, "step": 1009 }, { "epoch": 0.6884798909338786, "grad_norm": 19.75, "learning_rate": 1.526846300107734e-05, "loss": 6.714608669281006, "step": 1010 }, { "epoch": 0.689161554192229, "grad_norm": 28.625, "learning_rate": 1.5259074097061166e-05, "loss": 5.949961185455322, "step": 1011 }, { "epoch": 0.6898432174505794, "grad_norm": 14.5625, "learning_rate": 1.5249678780292913e-05, "loss": 3.5882210731506348, "step": 1012 }, { "epoch": 0.6905248807089298, "grad_norm": 21.125, "learning_rate": 1.5240277062228952e-05, "loss": 2.7397894859313965, "step": 1013 }, { "epoch": 0.6912065439672802, "grad_norm": 15.9375, "learning_rate": 1.523086895433344e-05, "loss": 7.816567420959473, "step": 1014 }, { "epoch": 0.6918882072256305, "grad_norm": 30.0, "learning_rate": 1.5221454468078336e-05, "loss": 6.0686750411987305, "step": 1015 }, { "epoch": 0.6925698704839809, "grad_norm": 49.75, "learning_rate": 1.5212033614943371e-05, "loss": 11.321406364440918, "step": 1016 }, { "epoch": 0.6932515337423313, "grad_norm": 19.75, "learning_rate": 1.5202606406416043e-05, "loss": 5.013477802276611, "step": 1017 }, { "epoch": 0.6939331970006817, "grad_norm": 27.125, "learning_rate": 1.5193172853991596e-05, "loss": 4.771260738372803, "step": 1018 }, { "epoch": 0.6946148602590321, "grad_norm": 19.875, "learning_rate": 1.518373296917301e-05, "loss": 5.589818954467773, "step": 1019 }, { "epoch": 0.6952965235173824, "grad_norm": 27.5, "learning_rate": 1.5174286763470995e-05, "loss": 2.712704658508301, "step": 1020 }, { "epoch": 0.6959781867757328, "grad_norm": 12.75, "learning_rate": 1.5164834248403959e-05, "loss": 2.91904354095459, "step": 1021 }, { "epoch": 0.6966598500340832, "grad_norm": 24.75, "learning_rate": 1.5155375435498001e-05, "loss": 3.64868426322937, "step": 1022 }, { "epoch": 0.6973415132924335, "grad_norm": 13.125, "learning_rate": 1.5145910336286912e-05, "loss": 4.806176662445068, "step": 1023 }, { "epoch": 0.6980231765507839, "grad_norm": 28.25, "learning_rate": 1.5136438962312134e-05, "loss": 4.9293317794799805, "step": 1024 }, { "epoch": 0.6987048398091343, "grad_norm": 38.5, "learning_rate": 1.5126961325122773e-05, "loss": 9.516109466552734, "step": 1025 }, { "epoch": 0.6993865030674846, "grad_norm": 14.3125, "learning_rate": 1.511747743627556e-05, "loss": 3.548491954803467, "step": 1026 }, { "epoch": 0.700068166325835, "grad_norm": 16.5, "learning_rate": 1.5107987307334864e-05, "loss": 4.943625450134277, "step": 1027 }, { "epoch": 0.7007498295841854, "grad_norm": 17.5, "learning_rate": 1.5098490949872648e-05, "loss": 7.025869369506836, "step": 1028 }, { "epoch": 0.7014314928425358, "grad_norm": 26.25, "learning_rate": 1.5088988375468473e-05, "loss": 6.330288887023926, "step": 1029 }, { "epoch": 0.7021131561008862, "grad_norm": 19.125, "learning_rate": 1.5079479595709493e-05, "loss": 5.6317925453186035, "step": 1030 }, { "epoch": 0.7027948193592365, "grad_norm": 15.6875, "learning_rate": 1.5069964622190409e-05, "loss": 2.1470820903778076, "step": 1031 }, { "epoch": 0.7034764826175869, "grad_norm": 15.5625, "learning_rate": 1.5060443466513497e-05, "loss": 7.9737467765808105, "step": 1032 }, { "epoch": 0.7041581458759373, "grad_norm": 34.0, "learning_rate": 1.5050916140288552e-05, "loss": 6.454843044281006, "step": 1033 }, { "epoch": 0.7048398091342877, "grad_norm": 16.0, "learning_rate": 1.5041382655132899e-05, "loss": 6.19007682800293, "step": 1034 }, { "epoch": 0.7055214723926381, "grad_norm": 21.25, "learning_rate": 1.5031843022671377e-05, "loss": 5.20697021484375, "step": 1035 }, { "epoch": 0.7062031356509885, "grad_norm": 17.25, "learning_rate": 1.5022297254536321e-05, "loss": 4.3360137939453125, "step": 1036 }, { "epoch": 0.7068847989093388, "grad_norm": 25.5, "learning_rate": 1.5012745362367543e-05, "loss": 6.870034217834473, "step": 1037 }, { "epoch": 0.7075664621676891, "grad_norm": 24.125, "learning_rate": 1.5003187357812323e-05, "loss": 8.151287078857422, "step": 1038 }, { "epoch": 0.7082481254260395, "grad_norm": 22.25, "learning_rate": 1.4993623252525398e-05, "loss": 5.630760669708252, "step": 1039 }, { "epoch": 0.7089297886843899, "grad_norm": 13.8125, "learning_rate": 1.4984053058168936e-05, "loss": 5.863424301147461, "step": 1040 }, { "epoch": 0.7096114519427403, "grad_norm": 21.5, "learning_rate": 1.4974476786412542e-05, "loss": 5.9935760498046875, "step": 1041 }, { "epoch": 0.7102931152010906, "grad_norm": 24.125, "learning_rate": 1.4964894448933227e-05, "loss": 4.425015926361084, "step": 1042 }, { "epoch": 0.710974778459441, "grad_norm": 13.25, "learning_rate": 1.4955306057415388e-05, "loss": 4.522419452667236, "step": 1043 }, { "epoch": 0.7116564417177914, "grad_norm": 14.25, "learning_rate": 1.4945711623550822e-05, "loss": 5.37777853012085, "step": 1044 }, { "epoch": 0.7123381049761418, "grad_norm": 13.9375, "learning_rate": 1.4936111159038677e-05, "loss": 6.694005966186523, "step": 1045 }, { "epoch": 0.7130197682344922, "grad_norm": 12.5625, "learning_rate": 1.4926504675585467e-05, "loss": 4.051295280456543, "step": 1046 }, { "epoch": 0.7137014314928425, "grad_norm": 11.125, "learning_rate": 1.4916892184905037e-05, "loss": 3.187448024749756, "step": 1047 }, { "epoch": 0.7143830947511929, "grad_norm": 22.875, "learning_rate": 1.4907273698718562e-05, "loss": 5.389264106750488, "step": 1048 }, { "epoch": 0.7150647580095433, "grad_norm": 25.0, "learning_rate": 1.4897649228754527e-05, "loss": 6.706471920013428, "step": 1049 }, { "epoch": 0.7157464212678937, "grad_norm": 23.625, "learning_rate": 1.4888018786748713e-05, "loss": 5.037300109863281, "step": 1050 }, { "epoch": 0.7164280845262441, "grad_norm": 33.75, "learning_rate": 1.487838238444418e-05, "loss": 8.008472442626953, "step": 1051 }, { "epoch": 0.7171097477845945, "grad_norm": 29.5, "learning_rate": 1.4868740033591258e-05, "loss": 6.434911251068115, "step": 1052 }, { "epoch": 0.7177914110429447, "grad_norm": 44.75, "learning_rate": 1.485909174594753e-05, "loss": 13.166252136230469, "step": 1053 }, { "epoch": 0.7184730743012951, "grad_norm": 17.625, "learning_rate": 1.484943753327783e-05, "loss": 4.397521018981934, "step": 1054 }, { "epoch": 0.7191547375596455, "grad_norm": 14.6875, "learning_rate": 1.4839777407354194e-05, "loss": 3.467169761657715, "step": 1055 }, { "epoch": 0.7198364008179959, "grad_norm": 19.0, "learning_rate": 1.4830111379955886e-05, "loss": 8.48218059539795, "step": 1056 }, { "epoch": 0.7205180640763463, "grad_norm": 16.5, "learning_rate": 1.4820439462869353e-05, "loss": 6.280445575714111, "step": 1057 }, { "epoch": 0.7211997273346966, "grad_norm": 26.875, "learning_rate": 1.481076166788824e-05, "loss": 6.832099437713623, "step": 1058 }, { "epoch": 0.721881390593047, "grad_norm": 26.625, "learning_rate": 1.480107800681335e-05, "loss": 6.164374828338623, "step": 1059 }, { "epoch": 0.7225630538513974, "grad_norm": 15.3125, "learning_rate": 1.4791388491452637e-05, "loss": 7.399207592010498, "step": 1060 }, { "epoch": 0.7232447171097478, "grad_norm": 29.125, "learning_rate": 1.4781693133621191e-05, "loss": 3.451977252960205, "step": 1061 }, { "epoch": 0.7239263803680982, "grad_norm": 47.0, "learning_rate": 1.4771991945141237e-05, "loss": 4.0687971115112305, "step": 1062 }, { "epoch": 0.7246080436264485, "grad_norm": 28.875, "learning_rate": 1.4762284937842103e-05, "loss": 9.413057327270508, "step": 1063 }, { "epoch": 0.7252897068847989, "grad_norm": 14.625, "learning_rate": 1.4752572123560216e-05, "loss": 5.68951416015625, "step": 1064 }, { "epoch": 0.7259713701431493, "grad_norm": 35.0, "learning_rate": 1.4742853514139076e-05, "loss": 3.4193339347839355, "step": 1065 }, { "epoch": 0.7266530334014997, "grad_norm": 24.875, "learning_rate": 1.4733129121429253e-05, "loss": 8.097602844238281, "step": 1066 }, { "epoch": 0.7273346966598501, "grad_norm": 29.0, "learning_rate": 1.4723398957288373e-05, "loss": 8.344840049743652, "step": 1067 }, { "epoch": 0.7280163599182005, "grad_norm": 13.5625, "learning_rate": 1.4713663033581099e-05, "loss": 6.769367218017578, "step": 1068 }, { "epoch": 0.7286980231765507, "grad_norm": 15.25, "learning_rate": 1.470392136217911e-05, "loss": 4.463737964630127, "step": 1069 }, { "epoch": 0.7293796864349011, "grad_norm": 28.375, "learning_rate": 1.4694173954961105e-05, "loss": 5.34139347076416, "step": 1070 }, { "epoch": 0.7300613496932515, "grad_norm": 12.875, "learning_rate": 1.4684420823812763e-05, "loss": 4.078634738922119, "step": 1071 }, { "epoch": 0.7307430129516019, "grad_norm": 20.25, "learning_rate": 1.4674661980626754e-05, "loss": 4.04760217666626, "step": 1072 }, { "epoch": 0.7314246762099523, "grad_norm": 14.125, "learning_rate": 1.466489743730271e-05, "loss": 5.105189323425293, "step": 1073 }, { "epoch": 0.7321063394683026, "grad_norm": 22.375, "learning_rate": 1.4655127205747208e-05, "loss": 5.912664413452148, "step": 1074 }, { "epoch": 0.732788002726653, "grad_norm": 16.25, "learning_rate": 1.4645351297873774e-05, "loss": 4.827907085418701, "step": 1075 }, { "epoch": 0.7334696659850034, "grad_norm": 14.875, "learning_rate": 1.463556972560284e-05, "loss": 5.005539417266846, "step": 1076 }, { "epoch": 0.7341513292433538, "grad_norm": 19.25, "learning_rate": 1.4625782500861756e-05, "loss": 4.34989595413208, "step": 1077 }, { "epoch": 0.7348329925017042, "grad_norm": 25.875, "learning_rate": 1.4615989635584757e-05, "loss": 4.03895378112793, "step": 1078 }, { "epoch": 0.7355146557600545, "grad_norm": 19.25, "learning_rate": 1.4606191141712964e-05, "loss": 4.6824541091918945, "step": 1079 }, { "epoch": 0.7361963190184049, "grad_norm": 27.0, "learning_rate": 1.4596387031194354e-05, "loss": 4.807040214538574, "step": 1080 }, { "epoch": 0.7368779822767553, "grad_norm": 16.875, "learning_rate": 1.458657731598376e-05, "loss": 2.367739200592041, "step": 1081 }, { "epoch": 0.7375596455351057, "grad_norm": 17.625, "learning_rate": 1.4576762008042837e-05, "loss": 5.025629043579102, "step": 1082 }, { "epoch": 0.7382413087934561, "grad_norm": 31.0, "learning_rate": 1.4566941119340074e-05, "loss": 6.282010555267334, "step": 1083 }, { "epoch": 0.7389229720518065, "grad_norm": 13.75, "learning_rate": 1.4557114661850755e-05, "loss": 4.528744220733643, "step": 1084 }, { "epoch": 0.7396046353101567, "grad_norm": 29.375, "learning_rate": 1.4547282647556964e-05, "loss": 7.079162120819092, "step": 1085 }, { "epoch": 0.7402862985685071, "grad_norm": 16.25, "learning_rate": 1.4537445088447547e-05, "loss": 3.883497714996338, "step": 1086 }, { "epoch": 0.7409679618268575, "grad_norm": 16.125, "learning_rate": 1.4527601996518122e-05, "loss": 3.0125694274902344, "step": 1087 }, { "epoch": 0.7416496250852079, "grad_norm": 12.5625, "learning_rate": 1.4517753383771052e-05, "loss": 3.6458115577697754, "step": 1088 }, { "epoch": 0.7423312883435583, "grad_norm": 19.0, "learning_rate": 1.4507899262215426e-05, "loss": 4.018141746520996, "step": 1089 }, { "epoch": 0.7430129516019086, "grad_norm": 31.375, "learning_rate": 1.449803964386706e-05, "loss": 7.084048271179199, "step": 1090 }, { "epoch": 0.743694614860259, "grad_norm": 11.875, "learning_rate": 1.4488174540748463e-05, "loss": 4.328667163848877, "step": 1091 }, { "epoch": 0.7443762781186094, "grad_norm": 12.75, "learning_rate": 1.4478303964888842e-05, "loss": 2.538975238800049, "step": 1092 }, { "epoch": 0.7450579413769598, "grad_norm": 24.5, "learning_rate": 1.4468427928324065e-05, "loss": 7.125279426574707, "step": 1093 }, { "epoch": 0.7457396046353102, "grad_norm": 12.1875, "learning_rate": 1.4458546443096663e-05, "loss": 3.825270414352417, "step": 1094 }, { "epoch": 0.7464212678936605, "grad_norm": 14.8125, "learning_rate": 1.4448659521255823e-05, "loss": 5.0135393142700195, "step": 1095 }, { "epoch": 0.7471029311520109, "grad_norm": 22.875, "learning_rate": 1.4438767174857346e-05, "loss": 6.899580955505371, "step": 1096 }, { "epoch": 0.7477845944103613, "grad_norm": 17.25, "learning_rate": 1.442886941596365e-05, "loss": 4.847146034240723, "step": 1097 }, { "epoch": 0.7484662576687117, "grad_norm": 12.3125, "learning_rate": 1.4418966256643762e-05, "loss": 4.565229892730713, "step": 1098 }, { "epoch": 0.7491479209270621, "grad_norm": 25.375, "learning_rate": 1.4409057708973282e-05, "loss": 3.3522911071777344, "step": 1099 }, { "epoch": 0.7498295841854125, "grad_norm": 15.6875, "learning_rate": 1.4399143785034388e-05, "loss": 5.931720733642578, "step": 1100 }, { "epoch": 0.7505112474437627, "grad_norm": 13.125, "learning_rate": 1.4389224496915814e-05, "loss": 4.803996562957764, "step": 1101 }, { "epoch": 0.7511929107021131, "grad_norm": 49.5, "learning_rate": 1.4379299856712827e-05, "loss": 2.3364109992980957, "step": 1102 }, { "epoch": 0.7518745739604635, "grad_norm": 23.375, "learning_rate": 1.4369369876527234e-05, "loss": 5.744244575500488, "step": 1103 }, { "epoch": 0.7525562372188139, "grad_norm": 12.0, "learning_rate": 1.4359434568467341e-05, "loss": 3.7399048805236816, "step": 1104 }, { "epoch": 0.7532379004771643, "grad_norm": 26.75, "learning_rate": 1.4349493944647953e-05, "loss": 7.513420104980469, "step": 1105 }, { "epoch": 0.7539195637355146, "grad_norm": 24.375, "learning_rate": 1.4339548017190356e-05, "loss": 2.525172710418701, "step": 1106 }, { "epoch": 0.754601226993865, "grad_norm": 14.4375, "learning_rate": 1.4329596798222318e-05, "loss": 3.128390073776245, "step": 1107 }, { "epoch": 0.7552828902522154, "grad_norm": 30.25, "learning_rate": 1.4319640299878038e-05, "loss": 7.995621681213379, "step": 1108 }, { "epoch": 0.7559645535105658, "grad_norm": 19.0, "learning_rate": 1.4309678534298164e-05, "loss": 4.388433456420898, "step": 1109 }, { "epoch": 0.7566462167689162, "grad_norm": 22.875, "learning_rate": 1.4299711513629759e-05, "loss": 7.085526466369629, "step": 1110 }, { "epoch": 0.7573278800272665, "grad_norm": 15.75, "learning_rate": 1.428973925002631e-05, "loss": 5.139338493347168, "step": 1111 }, { "epoch": 0.7580095432856169, "grad_norm": 11.375, "learning_rate": 1.4279761755647679e-05, "loss": 3.8428213596343994, "step": 1112 }, { "epoch": 0.7586912065439673, "grad_norm": 13.4375, "learning_rate": 1.4269779042660112e-05, "loss": 4.075303077697754, "step": 1113 }, { "epoch": 0.7593728698023177, "grad_norm": 16.125, "learning_rate": 1.4259791123236227e-05, "loss": 4.091595649719238, "step": 1114 }, { "epoch": 0.7600545330606681, "grad_norm": 12.5625, "learning_rate": 1.4249798009554979e-05, "loss": 3.4051740169525146, "step": 1115 }, { "epoch": 0.7607361963190185, "grad_norm": 13.375, "learning_rate": 1.4239799713801662e-05, "loss": 4.733155727386475, "step": 1116 }, { "epoch": 0.7614178595773687, "grad_norm": 14.3125, "learning_rate": 1.4229796248167888e-05, "loss": 4.09591007232666, "step": 1117 }, { "epoch": 0.7620995228357191, "grad_norm": 21.25, "learning_rate": 1.421978762485157e-05, "loss": 5.481997489929199, "step": 1118 }, { "epoch": 0.7627811860940695, "grad_norm": 26.875, "learning_rate": 1.4209773856056925e-05, "loss": 8.344427108764648, "step": 1119 }, { "epoch": 0.7634628493524199, "grad_norm": 38.0, "learning_rate": 1.419975495399442e-05, "loss": 9.937443733215332, "step": 1120 }, { "epoch": 0.7641445126107703, "grad_norm": 12.4375, "learning_rate": 1.4189730930880799e-05, "loss": 5.654269218444824, "step": 1121 }, { "epoch": 0.7648261758691206, "grad_norm": 30.375, "learning_rate": 1.417970179893904e-05, "loss": 4.144343852996826, "step": 1122 }, { "epoch": 0.765507839127471, "grad_norm": 16.125, "learning_rate": 1.4169667570398367e-05, "loss": 6.5023298263549805, "step": 1123 }, { "epoch": 0.7661895023858214, "grad_norm": 29.5, "learning_rate": 1.4159628257494195e-05, "loss": 3.3415448665618896, "step": 1124 }, { "epoch": 0.7668711656441718, "grad_norm": 26.0, "learning_rate": 1.4149583872468165e-05, "loss": 7.788743019104004, "step": 1125 }, { "epoch": 0.7675528289025222, "grad_norm": 13.6875, "learning_rate": 1.4139534427568073e-05, "loss": 4.870860576629639, "step": 1126 }, { "epoch": 0.7682344921608726, "grad_norm": 18.75, "learning_rate": 1.4129479935047914e-05, "loss": 2.6639316082000732, "step": 1127 }, { "epoch": 0.7689161554192229, "grad_norm": 22.375, "learning_rate": 1.4119420407167817e-05, "loss": 4.136035919189453, "step": 1128 }, { "epoch": 0.7695978186775733, "grad_norm": 26.75, "learning_rate": 1.4109355856194062e-05, "loss": 2.6696815490722656, "step": 1129 }, { "epoch": 0.7702794819359237, "grad_norm": 17.125, "learning_rate": 1.4099286294399051e-05, "loss": 5.261463165283203, "step": 1130 }, { "epoch": 0.7709611451942741, "grad_norm": 26.0, "learning_rate": 1.4089211734061294e-05, "loss": 3.368682861328125, "step": 1131 }, { "epoch": 0.7716428084526245, "grad_norm": 11.5625, "learning_rate": 1.4079132187465403e-05, "loss": 3.3555760383605957, "step": 1132 }, { "epoch": 0.7723244717109747, "grad_norm": 14.25, "learning_rate": 1.4069047666902056e-05, "loss": 4.572790145874023, "step": 1133 }, { "epoch": 0.7730061349693251, "grad_norm": 69.0, "learning_rate": 1.405895818466801e-05, "loss": 9.763680458068848, "step": 1134 }, { "epoch": 0.7736877982276755, "grad_norm": 41.5, "learning_rate": 1.404886375306607e-05, "loss": 5.384492874145508, "step": 1135 }, { "epoch": 0.7743694614860259, "grad_norm": 53.0, "learning_rate": 1.403876438440507e-05, "loss": 8.14116382598877, "step": 1136 }, { "epoch": 0.7750511247443763, "grad_norm": 10.375, "learning_rate": 1.4028660090999866e-05, "loss": 2.8655292987823486, "step": 1137 }, { "epoch": 0.7757327880027266, "grad_norm": 14.625, "learning_rate": 1.4018550885171322e-05, "loss": 4.552759170532227, "step": 1138 }, { "epoch": 0.776414451261077, "grad_norm": 7.71875, "learning_rate": 1.4008436779246288e-05, "loss": 1.6025056838989258, "step": 1139 }, { "epoch": 0.7770961145194274, "grad_norm": 39.5, "learning_rate": 1.3998317785557597e-05, "loss": 7.000317096710205, "step": 1140 }, { "epoch": 0.7777777777777778, "grad_norm": 28.125, "learning_rate": 1.3988193916444036e-05, "loss": 6.331212043762207, "step": 1141 }, { "epoch": 0.7784594410361282, "grad_norm": 16.0, "learning_rate": 1.3978065184250334e-05, "loss": 3.51653790473938, "step": 1142 }, { "epoch": 0.7791411042944786, "grad_norm": 30.625, "learning_rate": 1.396793160132715e-05, "loss": 7.661231994628906, "step": 1143 }, { "epoch": 0.7798227675528289, "grad_norm": 13.1875, "learning_rate": 1.3957793180031067e-05, "loss": 4.045989990234375, "step": 1144 }, { "epoch": 0.7805044308111793, "grad_norm": 36.25, "learning_rate": 1.3947649932724563e-05, "loss": 6.961798191070557, "step": 1145 }, { "epoch": 0.7811860940695297, "grad_norm": 22.125, "learning_rate": 1.3937501871775995e-05, "loss": 6.917481422424316, "step": 1146 }, { "epoch": 0.78186775732788, "grad_norm": 18.0, "learning_rate": 1.3927349009559597e-05, "loss": 2.015615463256836, "step": 1147 }, { "epoch": 0.7825494205862304, "grad_norm": 38.25, "learning_rate": 1.3917191358455453e-05, "loss": 10.73542594909668, "step": 1148 }, { "epoch": 0.7832310838445807, "grad_norm": 23.125, "learning_rate": 1.3907028930849489e-05, "loss": 3.485299587249756, "step": 1149 }, { "epoch": 0.7839127471029311, "grad_norm": 24.75, "learning_rate": 1.3896861739133456e-05, "loss": 9.266298294067383, "step": 1150 }, { "epoch": 0.7845944103612815, "grad_norm": 15.1875, "learning_rate": 1.388668979570491e-05, "loss": 3.483701705932617, "step": 1151 }, { "epoch": 0.7852760736196319, "grad_norm": 26.75, "learning_rate": 1.3876513112967208e-05, "loss": 4.22698974609375, "step": 1152 }, { "epoch": 0.7859577368779823, "grad_norm": 25.75, "learning_rate": 1.3866331703329477e-05, "loss": 4.917081832885742, "step": 1153 }, { "epoch": 0.7866394001363326, "grad_norm": 37.0, "learning_rate": 1.3856145579206612e-05, "loss": 7.172286033630371, "step": 1154 }, { "epoch": 0.787321063394683, "grad_norm": 28.25, "learning_rate": 1.384595475301926e-05, "loss": 6.103662490844727, "step": 1155 }, { "epoch": 0.7880027266530334, "grad_norm": 12.125, "learning_rate": 1.38357592371938e-05, "loss": 3.801004648208618, "step": 1156 }, { "epoch": 0.7886843899113838, "grad_norm": 20.125, "learning_rate": 1.3825559044162327e-05, "loss": 5.549659729003906, "step": 1157 }, { "epoch": 0.7893660531697342, "grad_norm": 22.875, "learning_rate": 1.381535418636264e-05, "loss": 5.782815456390381, "step": 1158 }, { "epoch": 0.7900477164280846, "grad_norm": 26.875, "learning_rate": 1.3805144676238225e-05, "loss": 8.853187561035156, "step": 1159 }, { "epoch": 0.7907293796864349, "grad_norm": 17.75, "learning_rate": 1.3794930526238246e-05, "loss": 2.4447765350341797, "step": 1160 }, { "epoch": 0.7914110429447853, "grad_norm": 17.5, "learning_rate": 1.3784711748817519e-05, "loss": 4.261455535888672, "step": 1161 }, { "epoch": 0.7920927062031357, "grad_norm": 23.625, "learning_rate": 1.3774488356436505e-05, "loss": 7.590598106384277, "step": 1162 }, { "epoch": 0.792774369461486, "grad_norm": 31.375, "learning_rate": 1.376426036156129e-05, "loss": 6.976542949676514, "step": 1163 }, { "epoch": 0.7934560327198364, "grad_norm": 18.75, "learning_rate": 1.3754027776663579e-05, "loss": 5.37154483795166, "step": 1164 }, { "epoch": 0.7941376959781867, "grad_norm": 13.1875, "learning_rate": 1.3743790614220664e-05, "loss": 4.106621265411377, "step": 1165 }, { "epoch": 0.7948193592365371, "grad_norm": 18.5, "learning_rate": 1.3733548886715427e-05, "loss": 4.762651443481445, "step": 1166 }, { "epoch": 0.7955010224948875, "grad_norm": 18.875, "learning_rate": 1.3723302606636311e-05, "loss": 3.7421579360961914, "step": 1167 }, { "epoch": 0.7961826857532379, "grad_norm": 21.375, "learning_rate": 1.3713051786477319e-05, "loss": 3.669285774230957, "step": 1168 }, { "epoch": 0.7968643490115883, "grad_norm": 57.75, "learning_rate": 1.3702796438737974e-05, "loss": 11.038129806518555, "step": 1169 }, { "epoch": 0.7975460122699386, "grad_norm": 29.25, "learning_rate": 1.3692536575923334e-05, "loss": 4.191030025482178, "step": 1170 }, { "epoch": 0.798227675528289, "grad_norm": 63.25, "learning_rate": 1.3682272210543959e-05, "loss": 11.665654182434082, "step": 1171 }, { "epoch": 0.7989093387866394, "grad_norm": 22.0, "learning_rate": 1.3672003355115897e-05, "loss": 8.293638229370117, "step": 1172 }, { "epoch": 0.7995910020449898, "grad_norm": 15.25, "learning_rate": 1.3661730022160673e-05, "loss": 3.1005396842956543, "step": 1173 }, { "epoch": 0.8002726653033402, "grad_norm": 20.25, "learning_rate": 1.365145222420527e-05, "loss": 2.6996078491210938, "step": 1174 }, { "epoch": 0.8009543285616906, "grad_norm": 44.0, "learning_rate": 1.3641169973782117e-05, "loss": 8.127614974975586, "step": 1175 }, { "epoch": 0.8016359918200409, "grad_norm": 28.25, "learning_rate": 1.3630883283429071e-05, "loss": 6.690086364746094, "step": 1176 }, { "epoch": 0.8023176550783913, "grad_norm": 19.25, "learning_rate": 1.3620592165689405e-05, "loss": 5.876935958862305, "step": 1177 }, { "epoch": 0.8029993183367417, "grad_norm": 18.375, "learning_rate": 1.3610296633111788e-05, "loss": 3.039498805999756, "step": 1178 }, { "epoch": 0.803680981595092, "grad_norm": 21.25, "learning_rate": 1.3599996698250274e-05, "loss": 5.677700042724609, "step": 1179 }, { "epoch": 0.8043626448534424, "grad_norm": 20.875, "learning_rate": 1.3589692373664288e-05, "loss": 2.158928155899048, "step": 1180 }, { "epoch": 0.8050443081117927, "grad_norm": 12.0, "learning_rate": 1.3579383671918598e-05, "loss": 4.26624059677124, "step": 1181 }, { "epoch": 0.8057259713701431, "grad_norm": 30.375, "learning_rate": 1.3569070605583319e-05, "loss": 8.67898941040039, "step": 1182 }, { "epoch": 0.8064076346284935, "grad_norm": 23.5, "learning_rate": 1.3558753187233881e-05, "loss": 9.83696174621582, "step": 1183 }, { "epoch": 0.8070892978868439, "grad_norm": 18.75, "learning_rate": 1.3548431429451032e-05, "loss": 5.879307270050049, "step": 1184 }, { "epoch": 0.8077709611451943, "grad_norm": 28.625, "learning_rate": 1.3538105344820798e-05, "loss": 4.928169250488281, "step": 1185 }, { "epoch": 0.8084526244035446, "grad_norm": 25.5, "learning_rate": 1.352777494593449e-05, "loss": 6.740233421325684, "step": 1186 }, { "epoch": 0.809134287661895, "grad_norm": 14.0, "learning_rate": 1.3517440245388672e-05, "loss": 4.599939823150635, "step": 1187 }, { "epoch": 0.8098159509202454, "grad_norm": 39.75, "learning_rate": 1.350710125578516e-05, "loss": 7.246450901031494, "step": 1188 }, { "epoch": 0.8104976141785958, "grad_norm": 30.375, "learning_rate": 1.3496757989730997e-05, "loss": 5.334835052490234, "step": 1189 }, { "epoch": 0.8111792774369462, "grad_norm": 18.875, "learning_rate": 1.3486410459838448e-05, "loss": 3.8925724029541016, "step": 1190 }, { "epoch": 0.8118609406952966, "grad_norm": 25.75, "learning_rate": 1.347605867872496e-05, "loss": 5.1975250244140625, "step": 1191 }, { "epoch": 0.8125426039536469, "grad_norm": 19.625, "learning_rate": 1.346570265901318e-05, "loss": 6.299593925476074, "step": 1192 }, { "epoch": 0.8132242672119973, "grad_norm": 20.125, "learning_rate": 1.3455342413330916e-05, "loss": 6.832985877990723, "step": 1193 }, { "epoch": 0.8139059304703476, "grad_norm": 14.0, "learning_rate": 1.3444977954311133e-05, "loss": 5.218026161193848, "step": 1194 }, { "epoch": 0.814587593728698, "grad_norm": 24.75, "learning_rate": 1.343460929459193e-05, "loss": 6.642695426940918, "step": 1195 }, { "epoch": 0.8152692569870484, "grad_norm": 11.6875, "learning_rate": 1.3424236446816528e-05, "loss": 4.2042741775512695, "step": 1196 }, { "epoch": 0.8159509202453987, "grad_norm": 34.25, "learning_rate": 1.3413859423633259e-05, "loss": 6.871602535247803, "step": 1197 }, { "epoch": 0.8166325835037491, "grad_norm": 32.25, "learning_rate": 1.3403478237695542e-05, "loss": 6.623035907745361, "step": 1198 }, { "epoch": 0.8173142467620995, "grad_norm": 40.25, "learning_rate": 1.3393092901661873e-05, "loss": 6.140131950378418, "step": 1199 }, { "epoch": 0.8179959100204499, "grad_norm": 13.375, "learning_rate": 1.3382703428195812e-05, "loss": 4.9660325050354, "step": 1200 }, { "epoch": 0.8186775732788003, "grad_norm": 56.0, "learning_rate": 1.3372309829965957e-05, "loss": 11.116511344909668, "step": 1201 }, { "epoch": 0.8193592365371506, "grad_norm": 50.5, "learning_rate": 1.3361912119645943e-05, "loss": 2.2782092094421387, "step": 1202 }, { "epoch": 0.820040899795501, "grad_norm": 20.25, "learning_rate": 1.3351510309914415e-05, "loss": 3.274704933166504, "step": 1203 }, { "epoch": 0.8207225630538514, "grad_norm": 19.625, "learning_rate": 1.3341104413455014e-05, "loss": 4.136429309844971, "step": 1204 }, { "epoch": 0.8214042263122018, "grad_norm": 15.5, "learning_rate": 1.3330694442956376e-05, "loss": 6.5382466316223145, "step": 1205 }, { "epoch": 0.8220858895705522, "grad_norm": 18.75, "learning_rate": 1.3320280411112092e-05, "loss": 5.344361782073975, "step": 1206 }, { "epoch": 0.8227675528289026, "grad_norm": 20.375, "learning_rate": 1.3309862330620709e-05, "loss": 3.5186572074890137, "step": 1207 }, { "epoch": 0.8234492160872529, "grad_norm": 21.125, "learning_rate": 1.3299440214185707e-05, "loss": 5.724998474121094, "step": 1208 }, { "epoch": 0.8241308793456033, "grad_norm": 21.375, "learning_rate": 1.3289014074515505e-05, "loss": 3.3172106742858887, "step": 1209 }, { "epoch": 0.8248125426039536, "grad_norm": 102.0, "learning_rate": 1.3278583924323405e-05, "loss": 10.222071647644043, "step": 1210 }, { "epoch": 0.825494205862304, "grad_norm": 17.75, "learning_rate": 1.326814977632761e-05, "loss": 5.318989276885986, "step": 1211 }, { "epoch": 0.8261758691206544, "grad_norm": 26.375, "learning_rate": 1.3257711643251201e-05, "loss": 2.3570308685302734, "step": 1212 }, { "epoch": 0.8268575323790047, "grad_norm": 16.875, "learning_rate": 1.3247269537822109e-05, "loss": 3.3095345497131348, "step": 1213 }, { "epoch": 0.8275391956373551, "grad_norm": 15.75, "learning_rate": 1.3236823472773116e-05, "loss": 2.8666939735412598, "step": 1214 }, { "epoch": 0.8282208588957055, "grad_norm": 40.75, "learning_rate": 1.3226373460841835e-05, "loss": 7.855008125305176, "step": 1215 }, { "epoch": 0.8289025221540559, "grad_norm": 33.0, "learning_rate": 1.3215919514770676e-05, "loss": 4.343752861022949, "step": 1216 }, { "epoch": 0.8295841854124063, "grad_norm": 48.25, "learning_rate": 1.3205461647306872e-05, "loss": 6.0765838623046875, "step": 1217 }, { "epoch": 0.8302658486707567, "grad_norm": 11.6875, "learning_rate": 1.3194999871202408e-05, "loss": 3.0836353302001953, "step": 1218 }, { "epoch": 0.830947511929107, "grad_norm": 11.625, "learning_rate": 1.3184534199214059e-05, "loss": 3.713568925857544, "step": 1219 }, { "epoch": 0.8316291751874574, "grad_norm": 34.5, "learning_rate": 1.3174064644103334e-05, "loss": 5.693996906280518, "step": 1220 }, { "epoch": 0.8323108384458078, "grad_norm": 30.25, "learning_rate": 1.3163591218636494e-05, "loss": 7.392035007476807, "step": 1221 }, { "epoch": 0.8329925017041582, "grad_norm": 44.75, "learning_rate": 1.31531139355845e-05, "loss": 7.585793495178223, "step": 1222 }, { "epoch": 0.8336741649625086, "grad_norm": 19.625, "learning_rate": 1.3142632807723035e-05, "loss": 4.717523574829102, "step": 1223 }, { "epoch": 0.8343558282208589, "grad_norm": 23.125, "learning_rate": 1.3132147847832453e-05, "loss": 5.721275806427002, "step": 1224 }, { "epoch": 0.8350374914792092, "grad_norm": 25.875, "learning_rate": 1.3121659068697797e-05, "loss": 4.5124125480651855, "step": 1225 }, { "epoch": 0.8357191547375596, "grad_norm": 17.25, "learning_rate": 1.3111166483108753e-05, "loss": 6.463704586029053, "step": 1226 }, { "epoch": 0.83640081799591, "grad_norm": 16.5, "learning_rate": 1.310067010385966e-05, "loss": 4.600020408630371, "step": 1227 }, { "epoch": 0.8370824812542604, "grad_norm": 30.25, "learning_rate": 1.3090169943749475e-05, "loss": 2.360793113708496, "step": 1228 }, { "epoch": 0.8377641445126107, "grad_norm": 24.25, "learning_rate": 1.307966601558177e-05, "loss": 5.200765132904053, "step": 1229 }, { "epoch": 0.8384458077709611, "grad_norm": 19.875, "learning_rate": 1.306915833216471e-05, "loss": 5.129266738891602, "step": 1230 }, { "epoch": 0.8391274710293115, "grad_norm": 41.75, "learning_rate": 1.3058646906311032e-05, "loss": 3.0531563758850098, "step": 1231 }, { "epoch": 0.8398091342876619, "grad_norm": 17.375, "learning_rate": 1.304813175083805e-05, "loss": 2.3894104957580566, "step": 1232 }, { "epoch": 0.8404907975460123, "grad_norm": 31.25, "learning_rate": 1.3037612878567623e-05, "loss": 3.7720444202423096, "step": 1233 }, { "epoch": 0.8411724608043627, "grad_norm": 34.5, "learning_rate": 1.3027090302326127e-05, "loss": 8.493297576904297, "step": 1234 }, { "epoch": 0.841854124062713, "grad_norm": 24.625, "learning_rate": 1.3016564034944473e-05, "loss": 9.874228477478027, "step": 1235 }, { "epoch": 0.8425357873210634, "grad_norm": 16.375, "learning_rate": 1.3006034089258059e-05, "loss": 4.853800296783447, "step": 1236 }, { "epoch": 0.8432174505794138, "grad_norm": 36.25, "learning_rate": 1.2995500478106781e-05, "loss": 8.264150619506836, "step": 1237 }, { "epoch": 0.8438991138377642, "grad_norm": 29.875, "learning_rate": 1.2984963214335e-05, "loss": 5.295577049255371, "step": 1238 }, { "epoch": 0.8445807770961146, "grad_norm": 17.375, "learning_rate": 1.2974422310791524e-05, "loss": 4.1205620765686035, "step": 1239 }, { "epoch": 0.8452624403544649, "grad_norm": 44.75, "learning_rate": 1.29638777803296e-05, "loss": 10.425886154174805, "step": 1240 }, { "epoch": 0.8459441036128152, "grad_norm": 17.25, "learning_rate": 1.2953329635806914e-05, "loss": 3.9294514656066895, "step": 1241 }, { "epoch": 0.8466257668711656, "grad_norm": 45.0, "learning_rate": 1.2942777890085538e-05, "loss": 8.385122299194336, "step": 1242 }, { "epoch": 0.847307430129516, "grad_norm": 47.0, "learning_rate": 1.2932222556031946e-05, "loss": 9.211775779724121, "step": 1243 }, { "epoch": 0.8479890933878664, "grad_norm": 21.875, "learning_rate": 1.2921663646516985e-05, "loss": 5.504023551940918, "step": 1244 }, { "epoch": 0.8486707566462167, "grad_norm": 14.375, "learning_rate": 1.2911101174415861e-05, "loss": 4.817590713500977, "step": 1245 }, { "epoch": 0.8493524199045671, "grad_norm": 19.875, "learning_rate": 1.290053515260813e-05, "loss": 2.9874162673950195, "step": 1246 }, { "epoch": 0.8500340831629175, "grad_norm": 30.0, "learning_rate": 1.288996559397767e-05, "loss": 4.567483425140381, "step": 1247 }, { "epoch": 0.8507157464212679, "grad_norm": 22.625, "learning_rate": 1.2879392511412668e-05, "loss": 7.513790607452393, "step": 1248 }, { "epoch": 0.8513974096796183, "grad_norm": 11.9375, "learning_rate": 1.2868815917805619e-05, "loss": 2.597170352935791, "step": 1249 }, { "epoch": 0.8520790729379687, "grad_norm": 17.375, "learning_rate": 1.2858235826053294e-05, "loss": 3.574446678161621, "step": 1250 }, { "epoch": 0.852760736196319, "grad_norm": 16.375, "learning_rate": 1.2847652249056726e-05, "loss": 6.071230888366699, "step": 1251 }, { "epoch": 0.8534423994546694, "grad_norm": 22.625, "learning_rate": 1.2837065199721204e-05, "loss": 3.101438045501709, "step": 1252 }, { "epoch": 0.8541240627130198, "grad_norm": 25.375, "learning_rate": 1.2826474690956243e-05, "loss": 5.300721168518066, "step": 1253 }, { "epoch": 0.8548057259713702, "grad_norm": 15.9375, "learning_rate": 1.2815880735675588e-05, "loss": 2.849289655685425, "step": 1254 }, { "epoch": 0.8554873892297206, "grad_norm": 17.875, "learning_rate": 1.2805283346797179e-05, "loss": 6.522305488586426, "step": 1255 }, { "epoch": 0.8561690524880708, "grad_norm": 48.25, "learning_rate": 1.279468253724314e-05, "loss": 3.8448407649993896, "step": 1256 }, { "epoch": 0.8568507157464212, "grad_norm": 30.125, "learning_rate": 1.2784078319939769e-05, "loss": 5.57237434387207, "step": 1257 }, { "epoch": 0.8575323790047716, "grad_norm": 16.0, "learning_rate": 1.2773470707817524e-05, "loss": 5.539696216583252, "step": 1258 }, { "epoch": 0.858214042263122, "grad_norm": 20.0, "learning_rate": 1.2762859713810998e-05, "loss": 5.318199157714844, "step": 1259 }, { "epoch": 0.8588957055214724, "grad_norm": 34.5, "learning_rate": 1.2752245350858905e-05, "loss": 5.128523826599121, "step": 1260 }, { "epoch": 0.8595773687798227, "grad_norm": 24.625, "learning_rate": 1.2741627631904077e-05, "loss": 7.015297889709473, "step": 1261 }, { "epoch": 0.8602590320381731, "grad_norm": 17.875, "learning_rate": 1.2731006569893427e-05, "loss": 5.861207485198975, "step": 1262 }, { "epoch": 0.8609406952965235, "grad_norm": 25.375, "learning_rate": 1.272038217777795e-05, "loss": 6.271626949310303, "step": 1263 }, { "epoch": 0.8616223585548739, "grad_norm": 14.8125, "learning_rate": 1.27097544685127e-05, "loss": 3.9759652614593506, "step": 1264 }, { "epoch": 0.8623040218132243, "grad_norm": 20.5, "learning_rate": 1.2699123455056777e-05, "loss": 6.3593621253967285, "step": 1265 }, { "epoch": 0.8629856850715747, "grad_norm": 18.0, "learning_rate": 1.268848915037331e-05, "loss": 3.928056001663208, "step": 1266 }, { "epoch": 0.863667348329925, "grad_norm": 15.0625, "learning_rate": 1.2677851567429442e-05, "loss": 3.404360294342041, "step": 1267 }, { "epoch": 0.8643490115882754, "grad_norm": 16.875, "learning_rate": 1.2667210719196308e-05, "loss": 7.654482841491699, "step": 1268 }, { "epoch": 0.8650306748466258, "grad_norm": 20.25, "learning_rate": 1.2656566618649031e-05, "loss": 5.685360908508301, "step": 1269 }, { "epoch": 0.8657123381049762, "grad_norm": 29.25, "learning_rate": 1.26459192787667e-05, "loss": 3.593329429626465, "step": 1270 }, { "epoch": 0.8663940013633266, "grad_norm": 28.625, "learning_rate": 1.263526871253235e-05, "loss": 4.685408592224121, "step": 1271 }, { "epoch": 0.8670756646216768, "grad_norm": 16.625, "learning_rate": 1.2624614932932953e-05, "loss": 4.676250457763672, "step": 1272 }, { "epoch": 0.8677573278800272, "grad_norm": 15.5625, "learning_rate": 1.261395795295939e-05, "loss": 5.195029258728027, "step": 1273 }, { "epoch": 0.8684389911383776, "grad_norm": 25.5, "learning_rate": 1.260329778560646e-05, "loss": 3.545381546020508, "step": 1274 }, { "epoch": 0.869120654396728, "grad_norm": 19.5, "learning_rate": 1.2592634443872842e-05, "loss": 2.4088144302368164, "step": 1275 }, { "epoch": 0.8698023176550784, "grad_norm": 15.375, "learning_rate": 1.2581967940761079e-05, "loss": 4.246018886566162, "step": 1276 }, { "epoch": 0.8704839809134287, "grad_norm": 38.75, "learning_rate": 1.257129828927758e-05, "loss": 8.94006633758545, "step": 1277 }, { "epoch": 0.8711656441717791, "grad_norm": 11.0625, "learning_rate": 1.2560625502432581e-05, "loss": 3.0126559734344482, "step": 1278 }, { "epoch": 0.8718473074301295, "grad_norm": 22.125, "learning_rate": 1.2549949593240156e-05, "loss": 6.636211395263672, "step": 1279 }, { "epoch": 0.8725289706884799, "grad_norm": 17.25, "learning_rate": 1.2539270574718172e-05, "loss": 4.550020217895508, "step": 1280 }, { "epoch": 0.8732106339468303, "grad_norm": 46.25, "learning_rate": 1.2528588459888291e-05, "loss": 5.349539756774902, "step": 1281 }, { "epoch": 0.8738922972051807, "grad_norm": 54.25, "learning_rate": 1.2517903261775963e-05, "loss": 8.857925415039062, "step": 1282 }, { "epoch": 0.874573960463531, "grad_norm": 14.6875, "learning_rate": 1.2507214993410382e-05, "loss": 3.52671480178833, "step": 1283 }, { "epoch": 0.8752556237218814, "grad_norm": 15.625, "learning_rate": 1.2496523667824487e-05, "loss": 3.8740973472595215, "step": 1284 }, { "epoch": 0.8759372869802318, "grad_norm": 40.25, "learning_rate": 1.2485829298054952e-05, "loss": 9.282148361206055, "step": 1285 }, { "epoch": 0.8766189502385822, "grad_norm": 28.375, "learning_rate": 1.2475131897142165e-05, "loss": 5.062959671020508, "step": 1286 }, { "epoch": 0.8773006134969326, "grad_norm": 20.875, "learning_rate": 1.2464431478130204e-05, "loss": 5.759718894958496, "step": 1287 }, { "epoch": 0.8779822767552828, "grad_norm": 37.25, "learning_rate": 1.2453728054066825e-05, "loss": 6.958524703979492, "step": 1288 }, { "epoch": 0.8786639400136332, "grad_norm": 20.625, "learning_rate": 1.244302163800345e-05, "loss": 5.623591899871826, "step": 1289 }, { "epoch": 0.8793456032719836, "grad_norm": 23.625, "learning_rate": 1.2432312242995158e-05, "loss": 5.134998321533203, "step": 1290 }, { "epoch": 0.880027266530334, "grad_norm": 25.25, "learning_rate": 1.2421599882100647e-05, "loss": 7.522536277770996, "step": 1291 }, { "epoch": 0.8807089297886844, "grad_norm": 26.375, "learning_rate": 1.2410884568382245e-05, "loss": 3.3977978229522705, "step": 1292 }, { "epoch": 0.8813905930470347, "grad_norm": 55.25, "learning_rate": 1.2400166314905868e-05, "loss": 4.127241134643555, "step": 1293 }, { "epoch": 0.8820722563053851, "grad_norm": 14.5625, "learning_rate": 1.2389445134741022e-05, "loss": 4.355645656585693, "step": 1294 }, { "epoch": 0.8827539195637355, "grad_norm": 30.0, "learning_rate": 1.2378721040960788e-05, "loss": 5.3913702964782715, "step": 1295 }, { "epoch": 0.8834355828220859, "grad_norm": 12.9375, "learning_rate": 1.2367994046641787e-05, "loss": 6.28423547744751, "step": 1296 }, { "epoch": 0.8841172460804363, "grad_norm": 16.5, "learning_rate": 1.2357264164864186e-05, "loss": 4.339078426361084, "step": 1297 }, { "epoch": 0.8847989093387867, "grad_norm": 29.75, "learning_rate": 1.2346531408711675e-05, "loss": 4.6371612548828125, "step": 1298 }, { "epoch": 0.885480572597137, "grad_norm": 24.75, "learning_rate": 1.233579579127144e-05, "loss": 9.0311918258667, "step": 1299 }, { "epoch": 0.8861622358554874, "grad_norm": 18.0, "learning_rate": 1.232505732563416e-05, "loss": 4.752367973327637, "step": 1300 }, { "epoch": 0.8868438991138378, "grad_norm": 14.4375, "learning_rate": 1.2314316024893987e-05, "loss": 2.7875378131866455, "step": 1301 }, { "epoch": 0.8875255623721882, "grad_norm": 24.375, "learning_rate": 1.2303571902148532e-05, "loss": 5.434711456298828, "step": 1302 }, { "epoch": 0.8882072256305386, "grad_norm": 18.875, "learning_rate": 1.2292824970498847e-05, "loss": 4.585599899291992, "step": 1303 }, { "epoch": 0.8888888888888888, "grad_norm": 15.625, "learning_rate": 1.2282075243049408e-05, "loss": 6.090699195861816, "step": 1304 }, { "epoch": 0.8895705521472392, "grad_norm": 25.5, "learning_rate": 1.2271322732908091e-05, "loss": 4.09541130065918, "step": 1305 }, { "epoch": 0.8902522154055896, "grad_norm": 20.25, "learning_rate": 1.2260567453186185e-05, "loss": 2.9923157691955566, "step": 1306 }, { "epoch": 0.89093387866394, "grad_norm": 17.375, "learning_rate": 1.2249809416998339e-05, "loss": 5.023626327514648, "step": 1307 }, { "epoch": 0.8916155419222904, "grad_norm": 17.5, "learning_rate": 1.2239048637462572e-05, "loss": 2.0889580249786377, "step": 1308 }, { "epoch": 0.8922972051806408, "grad_norm": 80.5, "learning_rate": 1.2228285127700244e-05, "loss": 8.162361145019531, "step": 1309 }, { "epoch": 0.8929788684389911, "grad_norm": 16.75, "learning_rate": 1.2217518900836045e-05, "loss": 3.2970166206359863, "step": 1310 }, { "epoch": 0.8936605316973415, "grad_norm": 58.25, "learning_rate": 1.2206749969997979e-05, "loss": 6.195333957672119, "step": 1311 }, { "epoch": 0.8943421949556919, "grad_norm": 21.25, "learning_rate": 1.2195978348317347e-05, "loss": 2.991267681121826, "step": 1312 }, { "epoch": 0.8950238582140423, "grad_norm": 23.5, "learning_rate": 1.2185204048928729e-05, "loss": 5.32099723815918, "step": 1313 }, { "epoch": 0.8957055214723927, "grad_norm": 20.625, "learning_rate": 1.2174427084969973e-05, "loss": 5.420284271240234, "step": 1314 }, { "epoch": 0.896387184730743, "grad_norm": 78.0, "learning_rate": 1.2163647469582181e-05, "loss": 10.285449981689453, "step": 1315 }, { "epoch": 0.8970688479890934, "grad_norm": 37.5, "learning_rate": 1.2152865215909673e-05, "loss": 7.664188861846924, "step": 1316 }, { "epoch": 0.8977505112474438, "grad_norm": 48.0, "learning_rate": 1.2142080337099998e-05, "loss": 7.155360698699951, "step": 1317 }, { "epoch": 0.8984321745057942, "grad_norm": 53.75, "learning_rate": 1.2131292846303901e-05, "loss": 9.11170768737793, "step": 1318 }, { "epoch": 0.8991138377641446, "grad_norm": 19.0, "learning_rate": 1.2120502756675324e-05, "loss": 3.897080898284912, "step": 1319 }, { "epoch": 0.8997955010224948, "grad_norm": 12.375, "learning_rate": 1.210971008137136e-05, "loss": 3.713414192199707, "step": 1320 }, { "epoch": 0.9004771642808452, "grad_norm": 22.375, "learning_rate": 1.2098914833552262e-05, "loss": 4.646474361419678, "step": 1321 }, { "epoch": 0.9011588275391956, "grad_norm": 20.0, "learning_rate": 1.2088117026381422e-05, "loss": 7.059642791748047, "step": 1322 }, { "epoch": 0.901840490797546, "grad_norm": 20.0, "learning_rate": 1.2077316673025354e-05, "loss": 3.1722850799560547, "step": 1323 }, { "epoch": 0.9025221540558964, "grad_norm": 18.25, "learning_rate": 1.2066513786653675e-05, "loss": 5.5353546142578125, "step": 1324 }, { "epoch": 0.9032038173142468, "grad_norm": 16.875, "learning_rate": 1.2055708380439089e-05, "loss": 6.185270309448242, "step": 1325 }, { "epoch": 0.9038854805725971, "grad_norm": 27.125, "learning_rate": 1.204490046755737e-05, "loss": 3.8724958896636963, "step": 1326 }, { "epoch": 0.9045671438309475, "grad_norm": 16.375, "learning_rate": 1.2034090061187358e-05, "loss": 7.000111103057861, "step": 1327 }, { "epoch": 0.9052488070892979, "grad_norm": 20.25, "learning_rate": 1.2023277174510923e-05, "loss": 4.5064239501953125, "step": 1328 }, { "epoch": 0.9059304703476483, "grad_norm": 22.125, "learning_rate": 1.2012461820712966e-05, "loss": 7.617931365966797, "step": 1329 }, { "epoch": 0.9066121336059987, "grad_norm": 25.0, "learning_rate": 1.2001644012981392e-05, "loss": 5.939640998840332, "step": 1330 }, { "epoch": 0.907293796864349, "grad_norm": 16.375, "learning_rate": 1.1990823764507108e-05, "loss": 4.3405351638793945, "step": 1331 }, { "epoch": 0.9079754601226994, "grad_norm": 29.25, "learning_rate": 1.1980001088483986e-05, "loss": 6.204938888549805, "step": 1332 }, { "epoch": 0.9086571233810498, "grad_norm": 15.3125, "learning_rate": 1.1969175998108857e-05, "loss": 5.289066314697266, "step": 1333 }, { "epoch": 0.9093387866394002, "grad_norm": 46.75, "learning_rate": 1.1958348506581503e-05, "loss": 9.477140426635742, "step": 1334 }, { "epoch": 0.9100204498977505, "grad_norm": 20.5, "learning_rate": 1.1947518627104637e-05, "loss": 3.151419162750244, "step": 1335 }, { "epoch": 0.9107021131561008, "grad_norm": 15.3125, "learning_rate": 1.1936686372883877e-05, "loss": 4.251214981079102, "step": 1336 }, { "epoch": 0.9113837764144512, "grad_norm": 14.875, "learning_rate": 1.1925851757127735e-05, "loss": 6.146907806396484, "step": 1337 }, { "epoch": 0.9120654396728016, "grad_norm": 29.25, "learning_rate": 1.1915014793047606e-05, "loss": 2.7248806953430176, "step": 1338 }, { "epoch": 0.912747102931152, "grad_norm": 24.375, "learning_rate": 1.190417549385775e-05, "loss": 7.106926918029785, "step": 1339 }, { "epoch": 0.9134287661895024, "grad_norm": 13.5, "learning_rate": 1.1893333872775275e-05, "loss": 3.7511017322540283, "step": 1340 }, { "epoch": 0.9141104294478528, "grad_norm": 92.0, "learning_rate": 1.1882489943020115e-05, "loss": 6.660236835479736, "step": 1341 }, { "epoch": 0.9147920927062031, "grad_norm": 26.375, "learning_rate": 1.187164371781502e-05, "loss": 2.321042537689209, "step": 1342 }, { "epoch": 0.9154737559645535, "grad_norm": 20.125, "learning_rate": 1.1860795210385547e-05, "loss": 8.861368179321289, "step": 1343 }, { "epoch": 0.9161554192229039, "grad_norm": 17.0, "learning_rate": 1.1849944433960026e-05, "loss": 3.8385515213012695, "step": 1344 }, { "epoch": 0.9168370824812543, "grad_norm": 28.125, "learning_rate": 1.1839091401769559e-05, "loss": 8.717733383178711, "step": 1345 }, { "epoch": 0.9175187457396047, "grad_norm": 41.0, "learning_rate": 1.1828236127047991e-05, "loss": 6.037054538726807, "step": 1346 }, { "epoch": 0.918200408997955, "grad_norm": 20.0, "learning_rate": 1.1817378623031921e-05, "loss": 5.172226428985596, "step": 1347 }, { "epoch": 0.9188820722563054, "grad_norm": 13.1875, "learning_rate": 1.1806518902960643e-05, "loss": 3.504687547683716, "step": 1348 }, { "epoch": 0.9195637355146558, "grad_norm": 19.625, "learning_rate": 1.1795656980076164e-05, "loss": 5.575456619262695, "step": 1349 }, { "epoch": 0.9202453987730062, "grad_norm": 53.0, "learning_rate": 1.1784792867623179e-05, "loss": 8.186395645141602, "step": 1350 }, { "epoch": 0.9209270620313565, "grad_norm": 15.125, "learning_rate": 1.1773926578849049e-05, "loss": 3.5194382667541504, "step": 1351 }, { "epoch": 0.9216087252897068, "grad_norm": 37.25, "learning_rate": 1.1763058127003793e-05, "loss": 7.524107933044434, "step": 1352 }, { "epoch": 0.9222903885480572, "grad_norm": 16.75, "learning_rate": 1.1752187525340061e-05, "loss": 2.2473201751708984, "step": 1353 }, { "epoch": 0.9229720518064076, "grad_norm": 28.25, "learning_rate": 1.1741314787113129e-05, "loss": 5.831595420837402, "step": 1354 }, { "epoch": 0.923653715064758, "grad_norm": 25.25, "learning_rate": 1.1730439925580876e-05, "loss": 7.502413749694824, "step": 1355 }, { "epoch": 0.9243353783231084, "grad_norm": 12.6875, "learning_rate": 1.1719562954003774e-05, "loss": 3.6224398612976074, "step": 1356 }, { "epoch": 0.9250170415814588, "grad_norm": 30.125, "learning_rate": 1.1708683885644865e-05, "loss": 7.81357479095459, "step": 1357 }, { "epoch": 0.9256987048398091, "grad_norm": 20.25, "learning_rate": 1.1697802733769745e-05, "loss": 6.633660316467285, "step": 1358 }, { "epoch": 0.9263803680981595, "grad_norm": 31.25, "learning_rate": 1.1686919511646557e-05, "loss": 7.668174743652344, "step": 1359 }, { "epoch": 0.9270620313565099, "grad_norm": 14.1875, "learning_rate": 1.1676034232545963e-05, "loss": 4.6107354164123535, "step": 1360 }, { "epoch": 0.9277436946148603, "grad_norm": 17.25, "learning_rate": 1.1665146909741134e-05, "loss": 4.731639862060547, "step": 1361 }, { "epoch": 0.9284253578732107, "grad_norm": 31.625, "learning_rate": 1.1654257556507735e-05, "loss": 3.275231122970581, "step": 1362 }, { "epoch": 0.929107021131561, "grad_norm": 16.375, "learning_rate": 1.1643366186123913e-05, "loss": 3.0904626846313477, "step": 1363 }, { "epoch": 0.9297886843899114, "grad_norm": 38.0, "learning_rate": 1.163247281187026e-05, "loss": 4.619803428649902, "step": 1364 }, { "epoch": 0.9304703476482618, "grad_norm": 25.625, "learning_rate": 1.1621577447029816e-05, "loss": 3.692537784576416, "step": 1365 }, { "epoch": 0.9311520109066121, "grad_norm": 29.75, "learning_rate": 1.1610680104888057e-05, "loss": 7.823019027709961, "step": 1366 }, { "epoch": 0.9318336741649625, "grad_norm": 11.6875, "learning_rate": 1.1599780798732868e-05, "loss": 2.705531358718872, "step": 1367 }, { "epoch": 0.9325153374233128, "grad_norm": 22.125, "learning_rate": 1.158887954185452e-05, "loss": 4.368618011474609, "step": 1368 }, { "epoch": 0.9331970006816632, "grad_norm": 27.5, "learning_rate": 1.157797634754567e-05, "loss": 2.9533731937408447, "step": 1369 }, { "epoch": 0.9338786639400136, "grad_norm": 10.4375, "learning_rate": 1.1567071229101332e-05, "loss": 3.7323696613311768, "step": 1370 }, { "epoch": 0.934560327198364, "grad_norm": 16.75, "learning_rate": 1.1556164199818871e-05, "loss": 4.672563552856445, "step": 1371 }, { "epoch": 0.9352419904567144, "grad_norm": 37.25, "learning_rate": 1.1545255272997983e-05, "loss": 8.265373229980469, "step": 1372 }, { "epoch": 0.9359236537150648, "grad_norm": 14.375, "learning_rate": 1.153434446194068e-05, "loss": 4.911079406738281, "step": 1373 }, { "epoch": 0.9366053169734151, "grad_norm": 28.0, "learning_rate": 1.1523431779951255e-05, "loss": 6.810037136077881, "step": 1374 }, { "epoch": 0.9372869802317655, "grad_norm": 18.25, "learning_rate": 1.1512517240336304e-05, "loss": 5.774360656738281, "step": 1375 }, { "epoch": 0.9379686434901159, "grad_norm": 67.0, "learning_rate": 1.1501600856404676e-05, "loss": 10.140185356140137, "step": 1376 }, { "epoch": 0.9386503067484663, "grad_norm": 19.0, "learning_rate": 1.149068264146747e-05, "loss": 5.116090297698975, "step": 1377 }, { "epoch": 0.9393319700068167, "grad_norm": 57.25, "learning_rate": 1.1479762608838018e-05, "loss": 8.842608451843262, "step": 1378 }, { "epoch": 0.940013633265167, "grad_norm": 12.5625, "learning_rate": 1.1468840771831874e-05, "loss": 2.29817533493042, "step": 1379 }, { "epoch": 0.9406952965235174, "grad_norm": 12.125, "learning_rate": 1.1457917143766786e-05, "loss": 2.869518756866455, "step": 1380 }, { "epoch": 0.9413769597818678, "grad_norm": 17.75, "learning_rate": 1.1446991737962688e-05, "loss": 4.41871452331543, "step": 1381 }, { "epoch": 0.9420586230402181, "grad_norm": 16.75, "learning_rate": 1.1436064567741679e-05, "loss": 4.453996658325195, "step": 1382 }, { "epoch": 0.9427402862985685, "grad_norm": 41.0, "learning_rate": 1.1425135646428011e-05, "loss": 9.71518611907959, "step": 1383 }, { "epoch": 0.9434219495569189, "grad_norm": 35.0, "learning_rate": 1.141420498734808e-05, "loss": 4.147461414337158, "step": 1384 }, { "epoch": 0.9441036128152692, "grad_norm": 15.5625, "learning_rate": 1.1403272603830384e-05, "loss": 4.635113716125488, "step": 1385 }, { "epoch": 0.9447852760736196, "grad_norm": 49.75, "learning_rate": 1.139233850920554e-05, "loss": 11.979005813598633, "step": 1386 }, { "epoch": 0.94546693933197, "grad_norm": 21.875, "learning_rate": 1.1381402716806237e-05, "loss": 3.7526919841766357, "step": 1387 }, { "epoch": 0.9461486025903204, "grad_norm": 12.75, "learning_rate": 1.137046523996725e-05, "loss": 4.084906578063965, "step": 1388 }, { "epoch": 0.9468302658486708, "grad_norm": 21.875, "learning_rate": 1.1359526092025395e-05, "loss": 2.3988165855407715, "step": 1389 }, { "epoch": 0.9475119291070211, "grad_norm": 23.0, "learning_rate": 1.1348585286319529e-05, "loss": 3.1027121543884277, "step": 1390 }, { "epoch": 0.9481935923653715, "grad_norm": 19.5, "learning_rate": 1.1337642836190532e-05, "loss": 5.420268535614014, "step": 1391 }, { "epoch": 0.9488752556237219, "grad_norm": 16.5, "learning_rate": 1.1326698754981292e-05, "loss": 3.2984988689422607, "step": 1392 }, { "epoch": 0.9495569188820723, "grad_norm": 16.0, "learning_rate": 1.131575305603668e-05, "loss": 4.6400346755981445, "step": 1393 }, { "epoch": 0.9502385821404227, "grad_norm": 10.1875, "learning_rate": 1.130480575270354e-05, "loss": 2.9564967155456543, "step": 1394 }, { "epoch": 0.950920245398773, "grad_norm": 39.0, "learning_rate": 1.1293856858330678e-05, "loss": 6.731145858764648, "step": 1395 }, { "epoch": 0.9516019086571234, "grad_norm": 18.25, "learning_rate": 1.1282906386268842e-05, "loss": 6.028124809265137, "step": 1396 }, { "epoch": 0.9522835719154737, "grad_norm": 13.125, "learning_rate": 1.1271954349870686e-05, "loss": 4.406980514526367, "step": 1397 }, { "epoch": 0.9529652351738241, "grad_norm": 15.25, "learning_rate": 1.1261000762490793e-05, "loss": 4.640754222869873, "step": 1398 }, { "epoch": 0.9536468984321745, "grad_norm": 17.75, "learning_rate": 1.1250045637485624e-05, "loss": 2.006152868270874, "step": 1399 }, { "epoch": 0.9543285616905249, "grad_norm": 21.0, "learning_rate": 1.1239088988213522e-05, "loss": 5.326655387878418, "step": 1400 }, { "epoch": 0.9550102249488752, "grad_norm": 21.875, "learning_rate": 1.1228130828034685e-05, "loss": 3.4637537002563477, "step": 1401 }, { "epoch": 0.9556918882072256, "grad_norm": 20.125, "learning_rate": 1.1217171170311157e-05, "loss": 6.046266555786133, "step": 1402 }, { "epoch": 0.956373551465576, "grad_norm": 16.5, "learning_rate": 1.1206210028406797e-05, "loss": 6.5497589111328125, "step": 1403 }, { "epoch": 0.9570552147239264, "grad_norm": 11.8125, "learning_rate": 1.1195247415687286e-05, "loss": 3.3028626441955566, "step": 1404 }, { "epoch": 0.9577368779822768, "grad_norm": 10.75, "learning_rate": 1.11842833455201e-05, "loss": 3.402437925338745, "step": 1405 }, { "epoch": 0.9584185412406271, "grad_norm": 21.0, "learning_rate": 1.1173317831274479e-05, "loss": 4.895727157592773, "step": 1406 }, { "epoch": 0.9591002044989775, "grad_norm": 25.25, "learning_rate": 1.1162350886321435e-05, "loss": 3.9754281044006348, "step": 1407 }, { "epoch": 0.9597818677573279, "grad_norm": 19.25, "learning_rate": 1.115138252403372e-05, "loss": 5.007200241088867, "step": 1408 }, { "epoch": 0.9604635310156783, "grad_norm": 312.0, "learning_rate": 1.1140412757785818e-05, "loss": 3.0526342391967773, "step": 1409 }, { "epoch": 0.9611451942740287, "grad_norm": 14.75, "learning_rate": 1.1129441600953916e-05, "loss": 5.1781325340271, "step": 1410 }, { "epoch": 0.961826857532379, "grad_norm": 14.8125, "learning_rate": 1.1118469066915907e-05, "loss": 5.803821086883545, "step": 1411 }, { "epoch": 0.9625085207907293, "grad_norm": 33.75, "learning_rate": 1.1107495169051364e-05, "loss": 6.760996341705322, "step": 1412 }, { "epoch": 0.9631901840490797, "grad_norm": 16.625, "learning_rate": 1.1096519920741509e-05, "loss": 4.804296016693115, "step": 1413 }, { "epoch": 0.9638718473074301, "grad_norm": 12.625, "learning_rate": 1.1085543335369224e-05, "loss": 2.6042098999023438, "step": 1414 }, { "epoch": 0.9645535105657805, "grad_norm": 33.0, "learning_rate": 1.1074565426319014e-05, "loss": 6.68544864654541, "step": 1415 }, { "epoch": 0.9652351738241309, "grad_norm": 28.0, "learning_rate": 1.1063586206977009e-05, "loss": 4.805915355682373, "step": 1416 }, { "epoch": 0.9659168370824812, "grad_norm": 11.25, "learning_rate": 1.1052605690730922e-05, "loss": 3.0436081886291504, "step": 1417 }, { "epoch": 0.9665985003408316, "grad_norm": 14.3125, "learning_rate": 1.1041623890970061e-05, "loss": 4.011970520019531, "step": 1418 }, { "epoch": 0.967280163599182, "grad_norm": 23.75, "learning_rate": 1.1030640821085284e-05, "loss": 5.415000915527344, "step": 1419 }, { "epoch": 0.9679618268575324, "grad_norm": 28.25, "learning_rate": 1.101965649446901e-05, "loss": 8.550607681274414, "step": 1420 }, { "epoch": 0.9686434901158828, "grad_norm": 26.25, "learning_rate": 1.1008670924515191e-05, "loss": 5.092818260192871, "step": 1421 }, { "epoch": 0.9693251533742331, "grad_norm": 19.125, "learning_rate": 1.0997684124619286e-05, "loss": 6.35060453414917, "step": 1422 }, { "epoch": 0.9700068166325835, "grad_norm": 18.25, "learning_rate": 1.0986696108178259e-05, "loss": 4.128617286682129, "step": 1423 }, { "epoch": 0.9706884798909339, "grad_norm": 16.125, "learning_rate": 1.0975706888590556e-05, "loss": 3.153402328491211, "step": 1424 }, { "epoch": 0.9713701431492843, "grad_norm": 19.125, "learning_rate": 1.0964716479256094e-05, "loss": 6.361213207244873, "step": 1425 }, { "epoch": 0.9720518064076347, "grad_norm": 14.625, "learning_rate": 1.0953724893576236e-05, "loss": 5.631996154785156, "step": 1426 }, { "epoch": 0.972733469665985, "grad_norm": 12.875, "learning_rate": 1.0942732144953782e-05, "loss": 5.759578704833984, "step": 1427 }, { "epoch": 0.9734151329243353, "grad_norm": 32.75, "learning_rate": 1.0931738246792947e-05, "loss": 4.888126850128174, "step": 1428 }, { "epoch": 0.9740967961826857, "grad_norm": 30.875, "learning_rate": 1.0920743212499355e-05, "loss": 6.081423759460449, "step": 1429 }, { "epoch": 0.9747784594410361, "grad_norm": 13.6875, "learning_rate": 1.0909747055480004e-05, "loss": 2.6258888244628906, "step": 1430 }, { "epoch": 0.9754601226993865, "grad_norm": 37.0, "learning_rate": 1.089874978914327e-05, "loss": 7.570301055908203, "step": 1431 }, { "epoch": 0.9761417859577369, "grad_norm": 26.625, "learning_rate": 1.0887751426898878e-05, "loss": 6.980555057525635, "step": 1432 }, { "epoch": 0.9768234492160872, "grad_norm": 48.75, "learning_rate": 1.0876751982157892e-05, "loss": 9.058960914611816, "step": 1433 }, { "epoch": 0.9775051124744376, "grad_norm": 15.3125, "learning_rate": 1.0865751468332695e-05, "loss": 4.9931440353393555, "step": 1434 }, { "epoch": 0.978186775732788, "grad_norm": 53.5, "learning_rate": 1.0854749898836974e-05, "loss": 6.3406877517700195, "step": 1435 }, { "epoch": 0.9788684389911384, "grad_norm": 34.0, "learning_rate": 1.0843747287085693e-05, "loss": 3.3779234886169434, "step": 1436 }, { "epoch": 0.9795501022494888, "grad_norm": 20.0, "learning_rate": 1.0832743646495105e-05, "loss": 2.471741199493408, "step": 1437 }, { "epoch": 0.9802317655078391, "grad_norm": 21.625, "learning_rate": 1.0821738990482709e-05, "loss": 8.409602165222168, "step": 1438 }, { "epoch": 0.9809134287661895, "grad_norm": 27.625, "learning_rate": 1.0810733332467235e-05, "loss": 7.618063926696777, "step": 1439 }, { "epoch": 0.9815950920245399, "grad_norm": 17.0, "learning_rate": 1.0799726685868648e-05, "loss": 2.692469596862793, "step": 1440 }, { "epoch": 0.9822767552828903, "grad_norm": 11.375, "learning_rate": 1.0788719064108108e-05, "loss": 3.107055425643921, "step": 1441 }, { "epoch": 0.9829584185412407, "grad_norm": 43.5, "learning_rate": 1.077771048060797e-05, "loss": 6.260942459106445, "step": 1442 }, { "epoch": 0.983640081799591, "grad_norm": 31.625, "learning_rate": 1.076670094879176e-05, "loss": 6.384950637817383, "step": 1443 }, { "epoch": 0.9843217450579413, "grad_norm": 38.75, "learning_rate": 1.0755690482084154e-05, "loss": 8.36608600616455, "step": 1444 }, { "epoch": 0.9850034083162917, "grad_norm": 13.875, "learning_rate": 1.0744679093910987e-05, "loss": 4.242187976837158, "step": 1445 }, { "epoch": 0.9856850715746421, "grad_norm": 19.0, "learning_rate": 1.0733666797699191e-05, "loss": 5.080479621887207, "step": 1446 }, { "epoch": 0.9863667348329925, "grad_norm": 14.875, "learning_rate": 1.0722653606876828e-05, "loss": 5.573699951171875, "step": 1447 }, { "epoch": 0.9870483980913429, "grad_norm": 10.6875, "learning_rate": 1.0711639534873035e-05, "loss": 2.2629261016845703, "step": 1448 }, { "epoch": 0.9877300613496932, "grad_norm": 13.0625, "learning_rate": 1.0700624595118037e-05, "loss": 4.294123649597168, "step": 1449 }, { "epoch": 0.9884117246080436, "grad_norm": 24.375, "learning_rate": 1.0689608801043107e-05, "loss": 7.163780212402344, "step": 1450 }, { "epoch": 0.989093387866394, "grad_norm": 37.5, "learning_rate": 1.0678592166080565e-05, "loss": 5.2584686279296875, "step": 1451 }, { "epoch": 0.9897750511247444, "grad_norm": 14.0, "learning_rate": 1.066757470366375e-05, "loss": 5.09457540512085, "step": 1452 }, { "epoch": 0.9904567143830948, "grad_norm": 21.75, "learning_rate": 1.0656556427227019e-05, "loss": 6.028077125549316, "step": 1453 }, { "epoch": 0.9911383776414451, "grad_norm": 13.9375, "learning_rate": 1.0645537350205714e-05, "loss": 3.743544578552246, "step": 1454 }, { "epoch": 0.9918200408997955, "grad_norm": 35.25, "learning_rate": 1.063451748603616e-05, "loss": 7.1067681312561035, "step": 1455 }, { "epoch": 0.9925017041581459, "grad_norm": 21.125, "learning_rate": 1.0623496848155635e-05, "loss": 5.95827579498291, "step": 1456 }, { "epoch": 0.9931833674164963, "grad_norm": 34.25, "learning_rate": 1.0612475450002363e-05, "loss": 6.740974426269531, "step": 1457 }, { "epoch": 0.9938650306748467, "grad_norm": 35.5, "learning_rate": 1.0601453305015497e-05, "loss": 7.936580657958984, "step": 1458 }, { "epoch": 0.994546693933197, "grad_norm": 24.125, "learning_rate": 1.0590430426635098e-05, "loss": 3.005598545074463, "step": 1459 }, { "epoch": 0.9952283571915473, "grad_norm": 20.75, "learning_rate": 1.0579406828302124e-05, "loss": 4.811061859130859, "step": 1460 }, { "epoch": 0.9959100204498977, "grad_norm": 24.0, "learning_rate": 1.0568382523458412e-05, "loss": 3.8753485679626465, "step": 1461 }, { "epoch": 0.9965916837082481, "grad_norm": 15.5625, "learning_rate": 1.0557357525546651e-05, "loss": 5.5971527099609375, "step": 1462 }, { "epoch": 0.9972733469665985, "grad_norm": 24.875, "learning_rate": 1.054633184801039e-05, "loss": 5.780820846557617, "step": 1463 }, { "epoch": 0.9979550102249489, "grad_norm": 19.25, "learning_rate": 1.0535305504293988e-05, "loss": 8.252350807189941, "step": 1464 }, { "epoch": 0.9986366734832992, "grad_norm": 18.0, "learning_rate": 1.0524278507842637e-05, "loss": 5.768834114074707, "step": 1465 }, { "epoch": 0.9993183367416496, "grad_norm": 16.25, "learning_rate": 1.0513250872102312e-05, "loss": 5.380429267883301, "step": 1466 }, { "epoch": 1.0, "grad_norm": 34.25, "learning_rate": 1.0502222610519772e-05, "loss": 5.940509796142578, "step": 1467 }, { "epoch": 1.0006816632583504, "grad_norm": 15.5, "learning_rate": 1.049119373654253e-05, "loss": 6.070116996765137, "step": 1468 }, { "epoch": 1.0013633265167008, "grad_norm": 36.25, "learning_rate": 1.0480164263618862e-05, "loss": 4.629971504211426, "step": 1469 }, { "epoch": 1.0020449897750512, "grad_norm": 23.875, "learning_rate": 1.0469134205197762e-05, "loss": 5.355800628662109, "step": 1470 }, { "epoch": 1.0027266530334016, "grad_norm": 17.625, "learning_rate": 1.0458103574728942e-05, "loss": 6.330981254577637, "step": 1471 }, { "epoch": 1.003408316291752, "grad_norm": 33.0, "learning_rate": 1.044707238566281e-05, "loss": 4.590171813964844, "step": 1472 }, { "epoch": 1.0040899795501022, "grad_norm": 25.5, "learning_rate": 1.043604065145046e-05, "loss": 5.543246269226074, "step": 1473 }, { "epoch": 1.0047716428084525, "grad_norm": 13.5625, "learning_rate": 1.0425008385543644e-05, "loss": 5.521811485290527, "step": 1474 }, { "epoch": 1.005453306066803, "grad_norm": 20.125, "learning_rate": 1.0413975601394765e-05, "loss": 6.667549133300781, "step": 1475 }, { "epoch": 1.0061349693251533, "grad_norm": 25.5, "learning_rate": 1.040294231245686e-05, "loss": 4.270026206970215, "step": 1476 }, { "epoch": 1.0068166325835037, "grad_norm": 21.25, "learning_rate": 1.039190853218358e-05, "loss": 4.980504035949707, "step": 1477 }, { "epoch": 1.0074982958418541, "grad_norm": 16.75, "learning_rate": 1.0380874274029173e-05, "loss": 1.950175166130066, "step": 1478 }, { "epoch": 1.0081799591002045, "grad_norm": 23.75, "learning_rate": 1.0369839551448473e-05, "loss": 5.0768208503723145, "step": 1479 }, { "epoch": 1.008861622358555, "grad_norm": 25.875, "learning_rate": 1.0358804377896876e-05, "loss": 6.594276428222656, "step": 1480 }, { "epoch": 1.0095432856169053, "grad_norm": 17.625, "learning_rate": 1.0347768766830335e-05, "loss": 2.1603755950927734, "step": 1481 }, { "epoch": 1.0102249488752557, "grad_norm": 29.0, "learning_rate": 1.0336732731705332e-05, "loss": 2.6884114742279053, "step": 1482 }, { "epoch": 1.010906612133606, "grad_norm": 18.375, "learning_rate": 1.0325696285978868e-05, "loss": 3.715885639190674, "step": 1483 }, { "epoch": 1.0115882753919563, "grad_norm": 33.0, "learning_rate": 1.0314659443108436e-05, "loss": 7.264423847198486, "step": 1484 }, { "epoch": 1.0122699386503067, "grad_norm": 18.0, "learning_rate": 1.0303622216552022e-05, "loss": 2.1196303367614746, "step": 1485 }, { "epoch": 1.012951601908657, "grad_norm": 16.625, "learning_rate": 1.0292584619768087e-05, "loss": 4.562056541442871, "step": 1486 }, { "epoch": 1.0136332651670075, "grad_norm": 16.25, "learning_rate": 1.0281546666215525e-05, "loss": 4.161388874053955, "step": 1487 }, { "epoch": 1.0143149284253579, "grad_norm": 11.5625, "learning_rate": 1.027050836935368e-05, "loss": 3.902395725250244, "step": 1488 }, { "epoch": 1.0149965916837083, "grad_norm": 12.375, "learning_rate": 1.0259469742642305e-05, "loss": 4.617828369140625, "step": 1489 }, { "epoch": 1.0156782549420587, "grad_norm": 18.25, "learning_rate": 1.0248430799541564e-05, "loss": 5.8154497146606445, "step": 1490 }, { "epoch": 1.016359918200409, "grad_norm": 30.875, "learning_rate": 1.0237391553512002e-05, "loss": 7.08201789855957, "step": 1491 }, { "epoch": 1.0170415814587594, "grad_norm": 20.625, "learning_rate": 1.0226352018014529e-05, "loss": 5.759778022766113, "step": 1492 }, { "epoch": 1.0177232447171098, "grad_norm": 15.0625, "learning_rate": 1.0215312206510415e-05, "loss": 4.860376358032227, "step": 1493 }, { "epoch": 1.01840490797546, "grad_norm": 17.0, "learning_rate": 1.0204272132461269e-05, "loss": 4.146553993225098, "step": 1494 }, { "epoch": 1.0190865712338104, "grad_norm": 12.75, "learning_rate": 1.019323180932901e-05, "loss": 3.8040313720703125, "step": 1495 }, { "epoch": 1.0197682344921608, "grad_norm": 22.875, "learning_rate": 1.0182191250575865e-05, "loss": 3.992687702178955, "step": 1496 }, { "epoch": 1.0204498977505112, "grad_norm": 12.1875, "learning_rate": 1.0171150469664353e-05, "loss": 3.4415478706359863, "step": 1497 }, { "epoch": 1.0211315610088616, "grad_norm": 13.75, "learning_rate": 1.016010948005726e-05, "loss": 3.23018217086792, "step": 1498 }, { "epoch": 1.021813224267212, "grad_norm": 25.75, "learning_rate": 1.0149068295217626e-05, "loss": 4.723155975341797, "step": 1499 }, { "epoch": 1.0224948875255624, "grad_norm": 17.375, "learning_rate": 1.013802692860873e-05, "loss": 2.7304582595825195, "step": 1500 }, { "epoch": 1.0231765507839128, "grad_norm": 15.125, "learning_rate": 1.012698539369407e-05, "loss": 2.855530023574829, "step": 1501 }, { "epoch": 1.0238582140422632, "grad_norm": 15.6875, "learning_rate": 1.0115943703937356e-05, "loss": 4.481204509735107, "step": 1502 }, { "epoch": 1.0245398773006136, "grad_norm": 20.0, "learning_rate": 1.010490187280248e-05, "loss": 6.6839494705200195, "step": 1503 }, { "epoch": 1.025221540558964, "grad_norm": 53.0, "learning_rate": 1.009385991375351e-05, "loss": 9.708818435668945, "step": 1504 }, { "epoch": 1.0259032038173141, "grad_norm": 17.75, "learning_rate": 1.0082817840254667e-05, "loss": 4.18892765045166, "step": 1505 }, { "epoch": 1.0265848670756645, "grad_norm": 13.125, "learning_rate": 1.0071775665770316e-05, "loss": 2.7887582778930664, "step": 1506 }, { "epoch": 1.027266530334015, "grad_norm": 18.375, "learning_rate": 1.006073340376494e-05, "loss": 5.502171993255615, "step": 1507 }, { "epoch": 1.0279481935923653, "grad_norm": 51.0, "learning_rate": 1.0049691067703133e-05, "loss": 5.553772449493408, "step": 1508 }, { "epoch": 1.0286298568507157, "grad_norm": 13.625, "learning_rate": 1.0038648671049574e-05, "loss": 3.496236801147461, "step": 1509 }, { "epoch": 1.0293115201090661, "grad_norm": 19.25, "learning_rate": 1.0027606227269026e-05, "loss": 7.103146553039551, "step": 1510 }, { "epoch": 1.0299931833674165, "grad_norm": 31.625, "learning_rate": 1.00165637498263e-05, "loss": 4.931897163391113, "step": 1511 }, { "epoch": 1.030674846625767, "grad_norm": 16.625, "learning_rate": 1.000552125218625e-05, "loss": 6.140021324157715, "step": 1512 }, { "epoch": 1.0313565098841173, "grad_norm": 33.5, "learning_rate": 9.994478747813755e-06, "loss": 7.345560073852539, "step": 1513 }, { "epoch": 1.0320381731424677, "grad_norm": 29.75, "learning_rate": 9.983436250173702e-06, "loss": 6.332590103149414, "step": 1514 }, { "epoch": 1.032719836400818, "grad_norm": 17.625, "learning_rate": 9.972393772730975e-06, "loss": 4.907554626464844, "step": 1515 }, { "epoch": 1.0334014996591683, "grad_norm": 11.1875, "learning_rate": 9.961351328950429e-06, "loss": 3.7796061038970947, "step": 1516 }, { "epoch": 1.0340831629175187, "grad_norm": 23.625, "learning_rate": 9.95030893229687e-06, "loss": 3.3562541007995605, "step": 1517 }, { "epoch": 1.034764826175869, "grad_norm": 13.8125, "learning_rate": 9.939266596235065e-06, "loss": 3.6736466884613037, "step": 1518 }, { "epoch": 1.0354464894342195, "grad_norm": 25.0, "learning_rate": 9.928224334229689e-06, "loss": 2.3857522010803223, "step": 1519 }, { "epoch": 1.0361281526925699, "grad_norm": 24.125, "learning_rate": 9.917182159745335e-06, "loss": 3.413331985473633, "step": 1520 }, { "epoch": 1.0368098159509203, "grad_norm": 18.875, "learning_rate": 9.906140086246493e-06, "loss": 7.507312297821045, "step": 1521 }, { "epoch": 1.0374914792092707, "grad_norm": 34.5, "learning_rate": 9.895098127197522e-06, "loss": 7.80535888671875, "step": 1522 }, { "epoch": 1.038173142467621, "grad_norm": 19.875, "learning_rate": 9.884056296062644e-06, "loss": 2.4190077781677246, "step": 1523 }, { "epoch": 1.0388548057259714, "grad_norm": 27.625, "learning_rate": 9.873014606305931e-06, "loss": 6.159796714782715, "step": 1524 }, { "epoch": 1.0395364689843218, "grad_norm": 13.6875, "learning_rate": 9.861973071391272e-06, "loss": 4.731330394744873, "step": 1525 }, { "epoch": 1.0402181322426722, "grad_norm": 35.75, "learning_rate": 9.850931704782377e-06, "loss": 8.998680114746094, "step": 1526 }, { "epoch": 1.0408997955010224, "grad_norm": 15.0625, "learning_rate": 9.839890519942743e-06, "loss": 5.075788497924805, "step": 1527 }, { "epoch": 1.0415814587593728, "grad_norm": 38.25, "learning_rate": 9.828849530335648e-06, "loss": 7.8539557456970215, "step": 1528 }, { "epoch": 1.0422631220177232, "grad_norm": 16.0, "learning_rate": 9.817808749424138e-06, "loss": 5.145135402679443, "step": 1529 }, { "epoch": 1.0429447852760736, "grad_norm": 32.0, "learning_rate": 9.806768190670994e-06, "loss": 2.9653429985046387, "step": 1530 }, { "epoch": 1.043626448534424, "grad_norm": 10.375, "learning_rate": 9.795727867538733e-06, "loss": 2.9928014278411865, "step": 1531 }, { "epoch": 1.0443081117927744, "grad_norm": 27.75, "learning_rate": 9.784687793489588e-06, "loss": 2.4595863819122314, "step": 1532 }, { "epoch": 1.0449897750511248, "grad_norm": 14.8125, "learning_rate": 9.773647981985473e-06, "loss": 3.2585206031799316, "step": 1533 }, { "epoch": 1.0456714383094752, "grad_norm": 28.5, "learning_rate": 9.762608446488004e-06, "loss": 6.584098815917969, "step": 1534 }, { "epoch": 1.0463531015678256, "grad_norm": 31.5, "learning_rate": 9.751569200458438e-06, "loss": 5.333770751953125, "step": 1535 }, { "epoch": 1.047034764826176, "grad_norm": 21.5, "learning_rate": 9.740530257357696e-06, "loss": 3.8779568672180176, "step": 1536 }, { "epoch": 1.0477164280845261, "grad_norm": 33.25, "learning_rate": 9.729491630646324e-06, "loss": 6.860392093658447, "step": 1537 }, { "epoch": 1.0483980913428765, "grad_norm": 14.375, "learning_rate": 9.718453333784478e-06, "loss": 4.505067825317383, "step": 1538 }, { "epoch": 1.049079754601227, "grad_norm": 23.625, "learning_rate": 9.707415380231915e-06, "loss": 5.660163879394531, "step": 1539 }, { "epoch": 1.0497614178595773, "grad_norm": 12.5625, "learning_rate": 9.69637778344798e-06, "loss": 4.572573184967041, "step": 1540 }, { "epoch": 1.0504430811179277, "grad_norm": 26.625, "learning_rate": 9.685340556891567e-06, "loss": 4.181941509246826, "step": 1541 }, { "epoch": 1.0511247443762781, "grad_norm": 31.75, "learning_rate": 9.674303714021139e-06, "loss": 6.419864654541016, "step": 1542 }, { "epoch": 1.0518064076346285, "grad_norm": 16.5, "learning_rate": 9.66326726829467e-06, "loss": 5.917838096618652, "step": 1543 }, { "epoch": 1.052488070892979, "grad_norm": 46.5, "learning_rate": 9.652231233169665e-06, "loss": 7.8823089599609375, "step": 1544 }, { "epoch": 1.0531697341513293, "grad_norm": 24.0, "learning_rate": 9.641195622103126e-06, "loss": 7.610499382019043, "step": 1545 }, { "epoch": 1.0538513974096797, "grad_norm": 48.0, "learning_rate": 9.63016044855153e-06, "loss": 5.871123790740967, "step": 1546 }, { "epoch": 1.05453306066803, "grad_norm": 68.5, "learning_rate": 9.619125725970832e-06, "loss": 8.58255386352539, "step": 1547 }, { "epoch": 1.0552147239263803, "grad_norm": 23.5, "learning_rate": 9.608091467816423e-06, "loss": 4.753182411193848, "step": 1548 }, { "epoch": 1.0558963871847307, "grad_norm": 16.375, "learning_rate": 9.59705768754314e-06, "loss": 1.885911464691162, "step": 1549 }, { "epoch": 1.056578050443081, "grad_norm": 15.875, "learning_rate": 9.586024398605238e-06, "loss": 2.010641098022461, "step": 1550 }, { "epoch": 1.0572597137014315, "grad_norm": 13.9375, "learning_rate": 9.574991614456358e-06, "loss": 3.827972173690796, "step": 1551 }, { "epoch": 1.0579413769597819, "grad_norm": 46.5, "learning_rate": 9.56395934854954e-06, "loss": 2.9487409591674805, "step": 1552 }, { "epoch": 1.0586230402181322, "grad_norm": 22.25, "learning_rate": 9.552927614337191e-06, "loss": 7.9868268966674805, "step": 1553 }, { "epoch": 1.0593047034764826, "grad_norm": 22.375, "learning_rate": 9.541896425271062e-06, "loss": 3.7796406745910645, "step": 1554 }, { "epoch": 1.059986366734833, "grad_norm": 26.75, "learning_rate": 9.530865794802243e-06, "loss": 6.916559219360352, "step": 1555 }, { "epoch": 1.0606680299931834, "grad_norm": 18.375, "learning_rate": 9.519835736381141e-06, "loss": 5.111781120300293, "step": 1556 }, { "epoch": 1.0613496932515338, "grad_norm": 22.75, "learning_rate": 9.508806263457471e-06, "loss": 6.114238739013672, "step": 1557 }, { "epoch": 1.062031356509884, "grad_norm": 11.5625, "learning_rate": 9.497777389480235e-06, "loss": 2.74649715423584, "step": 1558 }, { "epoch": 1.0627130197682344, "grad_norm": 34.5, "learning_rate": 9.486749127897692e-06, "loss": 8.41982650756836, "step": 1559 }, { "epoch": 1.0633946830265848, "grad_norm": 14.3125, "learning_rate": 9.475721492157365e-06, "loss": 4.563595294952393, "step": 1560 }, { "epoch": 1.0640763462849352, "grad_norm": 36.5, "learning_rate": 9.464694495706015e-06, "loss": 7.372790336608887, "step": 1561 }, { "epoch": 1.0647580095432856, "grad_norm": 46.75, "learning_rate": 9.453668151989615e-06, "loss": 7.876200199127197, "step": 1562 }, { "epoch": 1.065439672801636, "grad_norm": 10.0, "learning_rate": 9.442642474453352e-06, "loss": 3.360135078430176, "step": 1563 }, { "epoch": 1.0661213360599864, "grad_norm": 14.3125, "learning_rate": 9.431617476541591e-06, "loss": 3.7369699478149414, "step": 1564 }, { "epoch": 1.0668029993183368, "grad_norm": 14.875, "learning_rate": 9.420593171697876e-06, "loss": 3.5562360286712646, "step": 1565 }, { "epoch": 1.0674846625766872, "grad_norm": 44.25, "learning_rate": 9.409569573364905e-06, "loss": 7.1143999099731445, "step": 1566 }, { "epoch": 1.0681663258350376, "grad_norm": 34.0, "learning_rate": 9.398546694984506e-06, "loss": 5.729254245758057, "step": 1567 }, { "epoch": 1.068847989093388, "grad_norm": 20.625, "learning_rate": 9.387524549997637e-06, "loss": 5.56757116317749, "step": 1568 }, { "epoch": 1.0695296523517381, "grad_norm": 28.875, "learning_rate": 9.37650315184437e-06, "loss": 2.34714412689209, "step": 1569 }, { "epoch": 1.0702113156100885, "grad_norm": 31.75, "learning_rate": 9.365482513963844e-06, "loss": 4.84844446182251, "step": 1570 }, { "epoch": 1.070892978868439, "grad_norm": 13.5, "learning_rate": 9.354462649794291e-06, "loss": 2.0638632774353027, "step": 1571 }, { "epoch": 1.0715746421267893, "grad_norm": 16.5, "learning_rate": 9.343443572772985e-06, "loss": 5.325016975402832, "step": 1572 }, { "epoch": 1.0722563053851397, "grad_norm": 14.875, "learning_rate": 9.332425296336251e-06, "loss": 2.9386680126190186, "step": 1573 }, { "epoch": 1.0729379686434901, "grad_norm": 14.5625, "learning_rate": 9.321407833919438e-06, "loss": 3.6779747009277344, "step": 1574 }, { "epoch": 1.0736196319018405, "grad_norm": 11.6875, "learning_rate": 9.310391198956896e-06, "loss": 2.979320526123047, "step": 1575 }, { "epoch": 1.074301295160191, "grad_norm": 24.625, "learning_rate": 9.299375404881963e-06, "loss": 3.2142701148986816, "step": 1576 }, { "epoch": 1.0749829584185413, "grad_norm": 17.375, "learning_rate": 9.288360465126968e-06, "loss": 5.862717151641846, "step": 1577 }, { "epoch": 1.0756646216768917, "grad_norm": 18.5, "learning_rate": 9.277346393123174e-06, "loss": 5.267236709594727, "step": 1578 }, { "epoch": 1.076346284935242, "grad_norm": 23.125, "learning_rate": 9.266333202300814e-06, "loss": 2.733450174331665, "step": 1579 }, { "epoch": 1.0770279481935923, "grad_norm": 14.0, "learning_rate": 9.255320906089017e-06, "loss": 3.84560489654541, "step": 1580 }, { "epoch": 1.0777096114519427, "grad_norm": 30.75, "learning_rate": 9.244309517915846e-06, "loss": 6.094271183013916, "step": 1581 }, { "epoch": 1.078391274710293, "grad_norm": 24.125, "learning_rate": 9.233299051208247e-06, "loss": 4.732877731323242, "step": 1582 }, { "epoch": 1.0790729379686435, "grad_norm": 34.75, "learning_rate": 9.222289519392033e-06, "loss": 2.6296279430389404, "step": 1583 }, { "epoch": 1.0797546012269938, "grad_norm": 14.0, "learning_rate": 9.211280935891892e-06, "loss": 3.667299747467041, "step": 1584 }, { "epoch": 1.0804362644853442, "grad_norm": 15.6875, "learning_rate": 9.200273314131356e-06, "loss": 3.075784921646118, "step": 1585 }, { "epoch": 1.0811179277436946, "grad_norm": 10.375, "learning_rate": 9.189266667532767e-06, "loss": 2.4934003353118896, "step": 1586 }, { "epoch": 1.081799591002045, "grad_norm": 34.75, "learning_rate": 9.178261009517296e-06, "loss": 8.606280326843262, "step": 1587 }, { "epoch": 1.0824812542603954, "grad_norm": 30.25, "learning_rate": 9.167256353504897e-06, "loss": 7.966087341308594, "step": 1588 }, { "epoch": 1.0831629175187458, "grad_norm": 10.6875, "learning_rate": 9.156252712914307e-06, "loss": 1.9561617374420166, "step": 1589 }, { "epoch": 1.0838445807770962, "grad_norm": 11.625, "learning_rate": 9.145250101163032e-06, "loss": 3.3932318687438965, "step": 1590 }, { "epoch": 1.0845262440354464, "grad_norm": 52.5, "learning_rate": 9.134248531667308e-06, "loss": 7.631193161010742, "step": 1591 }, { "epoch": 1.0852079072937968, "grad_norm": 27.375, "learning_rate": 9.123248017842108e-06, "loss": 2.5352320671081543, "step": 1592 }, { "epoch": 1.0858895705521472, "grad_norm": 19.375, "learning_rate": 9.112248573101125e-06, "loss": 5.525008201599121, "step": 1593 }, { "epoch": 1.0865712338104976, "grad_norm": 17.25, "learning_rate": 9.101250210856733e-06, "loss": 5.6474761962890625, "step": 1594 }, { "epoch": 1.087252897068848, "grad_norm": 18.875, "learning_rate": 9.090252944520002e-06, "loss": 4.407737731933594, "step": 1595 }, { "epoch": 1.0879345603271984, "grad_norm": 12.0, "learning_rate": 9.079256787500649e-06, "loss": 4.08145809173584, "step": 1596 }, { "epoch": 1.0886162235855488, "grad_norm": 25.0, "learning_rate": 9.068261753207054e-06, "loss": 4.904573440551758, "step": 1597 }, { "epoch": 1.0892978868438992, "grad_norm": 76.0, "learning_rate": 9.057267855046223e-06, "loss": 11.377581596374512, "step": 1598 }, { "epoch": 1.0899795501022496, "grad_norm": 10.625, "learning_rate": 9.046275106423766e-06, "loss": 3.8048973083496094, "step": 1599 }, { "epoch": 1.0906612133606, "grad_norm": 15.6875, "learning_rate": 9.035283520743911e-06, "loss": 1.9749810695648193, "step": 1600 }, { "epoch": 1.0913428766189504, "grad_norm": 12.125, "learning_rate": 9.024293111409446e-06, "loss": 4.479114055633545, "step": 1601 }, { "epoch": 1.0920245398773005, "grad_norm": 21.125, "learning_rate": 9.013303891821744e-06, "loss": 3.770312786102295, "step": 1602 }, { "epoch": 1.092706203135651, "grad_norm": 24.5, "learning_rate": 9.002315875380719e-06, "loss": 5.725625991821289, "step": 1603 }, { "epoch": 1.0933878663940013, "grad_norm": 18.75, "learning_rate": 8.99132907548481e-06, "loss": 2.5233843326568604, "step": 1604 }, { "epoch": 1.0940695296523517, "grad_norm": 22.125, "learning_rate": 8.980343505530988e-06, "loss": 4.879720687866211, "step": 1605 }, { "epoch": 1.094751192910702, "grad_norm": 22.25, "learning_rate": 8.96935917891472e-06, "loss": 5.501543998718262, "step": 1606 }, { "epoch": 1.0954328561690525, "grad_norm": 13.75, "learning_rate": 8.958376109029942e-06, "loss": 3.166465997695923, "step": 1607 }, { "epoch": 1.096114519427403, "grad_norm": 14.875, "learning_rate": 8.94739430926908e-06, "loss": 4.200061798095703, "step": 1608 }, { "epoch": 1.0967961826857533, "grad_norm": 40.5, "learning_rate": 8.936413793022994e-06, "loss": 8.601780891418457, "step": 1609 }, { "epoch": 1.0974778459441037, "grad_norm": 52.25, "learning_rate": 8.925434573680986e-06, "loss": 12.021560668945312, "step": 1610 }, { "epoch": 1.098159509202454, "grad_norm": 36.0, "learning_rate": 8.914456664630782e-06, "loss": 8.669591903686523, "step": 1611 }, { "epoch": 1.0988411724608043, "grad_norm": 22.125, "learning_rate": 8.903480079258495e-06, "loss": 5.235696315765381, "step": 1612 }, { "epoch": 1.0995228357191547, "grad_norm": 16.25, "learning_rate": 8.892504830948641e-06, "loss": 5.2205891609191895, "step": 1613 }, { "epoch": 1.100204498977505, "grad_norm": 15.4375, "learning_rate": 8.881530933084097e-06, "loss": 4.4382710456848145, "step": 1614 }, { "epoch": 1.1008861622358554, "grad_norm": 30.375, "learning_rate": 8.870558399046086e-06, "loss": 2.6607232093811035, "step": 1615 }, { "epoch": 1.1015678254942058, "grad_norm": 15.875, "learning_rate": 8.859587242214187e-06, "loss": 4.917505264282227, "step": 1616 }, { "epoch": 1.1022494887525562, "grad_norm": 21.25, "learning_rate": 8.848617475966282e-06, "loss": 5.6812920570373535, "step": 1617 }, { "epoch": 1.1029311520109066, "grad_norm": 21.0, "learning_rate": 8.837649113678568e-06, "loss": 6.4308295249938965, "step": 1618 }, { "epoch": 1.103612815269257, "grad_norm": 21.375, "learning_rate": 8.826682168725525e-06, "loss": 2.897919178009033, "step": 1619 }, { "epoch": 1.1042944785276074, "grad_norm": 22.125, "learning_rate": 8.815716654479903e-06, "loss": 3.1289336681365967, "step": 1620 }, { "epoch": 1.1049761417859578, "grad_norm": 14.375, "learning_rate": 8.804752584312713e-06, "loss": 7.240499973297119, "step": 1621 }, { "epoch": 1.105657805044308, "grad_norm": 26.5, "learning_rate": 8.793789971593207e-06, "loss": 3.740675687789917, "step": 1622 }, { "epoch": 1.1063394683026584, "grad_norm": 16.5, "learning_rate": 8.782828829688846e-06, "loss": 4.361794948577881, "step": 1623 }, { "epoch": 1.1070211315610088, "grad_norm": 18.0, "learning_rate": 8.771869171965317e-06, "loss": 4.867619037628174, "step": 1624 }, { "epoch": 1.1077027948193592, "grad_norm": 28.125, "learning_rate": 8.76091101178648e-06, "loss": 7.528362274169922, "step": 1625 }, { "epoch": 1.1083844580777096, "grad_norm": 13.875, "learning_rate": 8.749954362514377e-06, "loss": 4.874473571777344, "step": 1626 }, { "epoch": 1.10906612133606, "grad_norm": 28.375, "learning_rate": 8.738999237509212e-06, "loss": 6.518458366394043, "step": 1627 }, { "epoch": 1.1097477845944104, "grad_norm": 44.5, "learning_rate": 8.728045650129315e-06, "loss": 9.18846321105957, "step": 1628 }, { "epoch": 1.1104294478527608, "grad_norm": 51.75, "learning_rate": 8.717093613731162e-06, "loss": 8.967084884643555, "step": 1629 }, { "epoch": 1.1111111111111112, "grad_norm": 13.5, "learning_rate": 8.706143141669324e-06, "loss": 2.5367472171783447, "step": 1630 }, { "epoch": 1.1117927743694616, "grad_norm": 65.5, "learning_rate": 8.695194247296461e-06, "loss": 8.841804504394531, "step": 1631 }, { "epoch": 1.112474437627812, "grad_norm": 11.875, "learning_rate": 8.684246943963327e-06, "loss": 3.8289425373077393, "step": 1632 }, { "epoch": 1.1131561008861621, "grad_norm": 22.125, "learning_rate": 8.673301245018712e-06, "loss": 5.030890464782715, "step": 1633 }, { "epoch": 1.1138377641445125, "grad_norm": 19.375, "learning_rate": 8.66235716380947e-06, "loss": 5.513038158416748, "step": 1634 }, { "epoch": 1.114519427402863, "grad_norm": 25.75, "learning_rate": 8.651414713680474e-06, "loss": 6.777066230773926, "step": 1635 }, { "epoch": 1.1152010906612133, "grad_norm": 23.25, "learning_rate": 8.640473907974609e-06, "loss": 6.601796627044678, "step": 1636 }, { "epoch": 1.1158827539195637, "grad_norm": 14.5, "learning_rate": 8.629534760032749e-06, "loss": 4.01887321472168, "step": 1637 }, { "epoch": 1.116564417177914, "grad_norm": 45.25, "learning_rate": 8.618597283193764e-06, "loss": 9.005342483520508, "step": 1638 }, { "epoch": 1.1172460804362645, "grad_norm": 26.625, "learning_rate": 8.607661490794461e-06, "loss": 6.292356491088867, "step": 1639 }, { "epoch": 1.117927743694615, "grad_norm": 43.0, "learning_rate": 8.59672739616962e-06, "loss": 8.227553367614746, "step": 1640 }, { "epoch": 1.1186094069529653, "grad_norm": 19.875, "learning_rate": 8.585795012651924e-06, "loss": 4.558838844299316, "step": 1641 }, { "epoch": 1.1192910702113157, "grad_norm": 31.5, "learning_rate": 8.574864353571989e-06, "loss": 8.541998863220215, "step": 1642 }, { "epoch": 1.119972733469666, "grad_norm": 20.625, "learning_rate": 8.563935432258326e-06, "loss": 2.5924596786499023, "step": 1643 }, { "epoch": 1.1206543967280163, "grad_norm": 28.75, "learning_rate": 8.553008262037316e-06, "loss": 6.06859827041626, "step": 1644 }, { "epoch": 1.1213360599863667, "grad_norm": 35.25, "learning_rate": 8.542082856233216e-06, "loss": 2.2794241905212402, "step": 1645 }, { "epoch": 1.122017723244717, "grad_norm": 29.125, "learning_rate": 8.53115922816813e-06, "loss": 5.276708126068115, "step": 1646 }, { "epoch": 1.1226993865030674, "grad_norm": 20.125, "learning_rate": 8.520237391161983e-06, "loss": 6.1056413650512695, "step": 1647 }, { "epoch": 1.1233810497614178, "grad_norm": 17.75, "learning_rate": 8.509317358532536e-06, "loss": 3.9177379608154297, "step": 1648 }, { "epoch": 1.1240627130197682, "grad_norm": 14.1875, "learning_rate": 8.498399143595328e-06, "loss": 4.1641387939453125, "step": 1649 }, { "epoch": 1.1247443762781186, "grad_norm": 14.3125, "learning_rate": 8.487482759663696e-06, "loss": 5.60027551651001, "step": 1650 }, { "epoch": 1.125426039536469, "grad_norm": 11.375, "learning_rate": 8.476568220048748e-06, "loss": 4.396129608154297, "step": 1651 }, { "epoch": 1.1261077027948194, "grad_norm": 25.125, "learning_rate": 8.465655538059326e-06, "loss": 4.973955154418945, "step": 1652 }, { "epoch": 1.1267893660531698, "grad_norm": 21.25, "learning_rate": 8.454744727002015e-06, "loss": 4.542899131774902, "step": 1653 }, { "epoch": 1.1274710293115202, "grad_norm": 27.625, "learning_rate": 8.443835800181132e-06, "loss": 3.604156970977783, "step": 1654 }, { "epoch": 1.1281526925698704, "grad_norm": 160.0, "learning_rate": 8.43292877089867e-06, "loss": 4.056912899017334, "step": 1655 }, { "epoch": 1.1288343558282208, "grad_norm": 39.75, "learning_rate": 8.422023652454336e-06, "loss": 6.675937652587891, "step": 1656 }, { "epoch": 1.1295160190865712, "grad_norm": 19.625, "learning_rate": 8.411120458145484e-06, "loss": 6.758126735687256, "step": 1657 }, { "epoch": 1.1301976823449216, "grad_norm": 22.75, "learning_rate": 8.400219201267134e-06, "loss": 3.329357147216797, "step": 1658 }, { "epoch": 1.130879345603272, "grad_norm": 26.375, "learning_rate": 8.389319895111945e-06, "loss": 6.384528636932373, "step": 1659 }, { "epoch": 1.1315610088616224, "grad_norm": 21.125, "learning_rate": 8.378422552970185e-06, "loss": 4.923349380493164, "step": 1660 }, { "epoch": 1.1322426721199728, "grad_norm": 20.0, "learning_rate": 8.367527188129748e-06, "loss": 4.995100975036621, "step": 1661 }, { "epoch": 1.1329243353783232, "grad_norm": 13.375, "learning_rate": 8.356633813876089e-06, "loss": 5.854995250701904, "step": 1662 }, { "epoch": 1.1336059986366736, "grad_norm": 15.9375, "learning_rate": 8.345742443492264e-06, "loss": 4.7914581298828125, "step": 1663 }, { "epoch": 1.134287661895024, "grad_norm": 21.75, "learning_rate": 8.33485309025887e-06, "loss": 3.5375657081604004, "step": 1664 }, { "epoch": 1.1349693251533743, "grad_norm": 20.125, "learning_rate": 8.32396576745404e-06, "loss": 4.592517852783203, "step": 1665 }, { "epoch": 1.1356509884117245, "grad_norm": 17.75, "learning_rate": 8.313080488353444e-06, "loss": 2.1187541484832764, "step": 1666 }, { "epoch": 1.136332651670075, "grad_norm": 16.625, "learning_rate": 8.302197266230258e-06, "loss": 3.853410243988037, "step": 1667 }, { "epoch": 1.1370143149284253, "grad_norm": 25.5, "learning_rate": 8.291316114355138e-06, "loss": 6.204894065856934, "step": 1668 }, { "epoch": 1.1376959781867757, "grad_norm": 27.0, "learning_rate": 8.280437045996231e-06, "loss": 5.563859939575195, "step": 1669 }, { "epoch": 1.138377641445126, "grad_norm": 13.9375, "learning_rate": 8.269560074419126e-06, "loss": 4.319883823394775, "step": 1670 }, { "epoch": 1.1390593047034765, "grad_norm": 20.5, "learning_rate": 8.258685212886873e-06, "loss": 3.493776798248291, "step": 1671 }, { "epoch": 1.139740967961827, "grad_norm": 20.0, "learning_rate": 8.247812474659942e-06, "loss": 4.198956489562988, "step": 1672 }, { "epoch": 1.1404226312201773, "grad_norm": 36.0, "learning_rate": 8.23694187299621e-06, "loss": 7.795163631439209, "step": 1673 }, { "epoch": 1.1411042944785277, "grad_norm": 17.0, "learning_rate": 8.226073421150951e-06, "loss": 5.760781764984131, "step": 1674 }, { "epoch": 1.141785957736878, "grad_norm": 50.5, "learning_rate": 8.215207132376824e-06, "loss": 6.654126167297363, "step": 1675 }, { "epoch": 1.1424676209952285, "grad_norm": 20.0, "learning_rate": 8.204343019923837e-06, "loss": 4.71498966217041, "step": 1676 }, { "epoch": 1.1431492842535786, "grad_norm": 38.5, "learning_rate": 8.193481097039362e-06, "loss": 5.817451477050781, "step": 1677 }, { "epoch": 1.143830947511929, "grad_norm": 17.0, "learning_rate": 8.182621376968082e-06, "loss": 3.941257953643799, "step": 1678 }, { "epoch": 1.1445126107702794, "grad_norm": 30.0, "learning_rate": 8.17176387295201e-06, "loss": 6.174861907958984, "step": 1679 }, { "epoch": 1.1451942740286298, "grad_norm": 18.0, "learning_rate": 8.160908598230448e-06, "loss": 2.2171311378479004, "step": 1680 }, { "epoch": 1.1458759372869802, "grad_norm": 30.375, "learning_rate": 8.150055566039977e-06, "loss": 7.021890640258789, "step": 1681 }, { "epoch": 1.1465576005453306, "grad_norm": 17.0, "learning_rate": 8.139204789614455e-06, "loss": 4.5863871574401855, "step": 1682 }, { "epoch": 1.147239263803681, "grad_norm": 41.25, "learning_rate": 8.128356282184982e-06, "loss": 5.655610084533691, "step": 1683 }, { "epoch": 1.1479209270620314, "grad_norm": 15.6875, "learning_rate": 8.11751005697989e-06, "loss": 6.0512895584106445, "step": 1684 }, { "epoch": 1.1486025903203818, "grad_norm": 11.875, "learning_rate": 8.10666612722473e-06, "loss": 3.844203233718872, "step": 1685 }, { "epoch": 1.149284253578732, "grad_norm": 16.875, "learning_rate": 8.095824506142251e-06, "loss": 1.9315086603164673, "step": 1686 }, { "epoch": 1.1499659168370826, "grad_norm": 37.75, "learning_rate": 8.084985206952394e-06, "loss": 7.156002998352051, "step": 1687 }, { "epoch": 1.1506475800954328, "grad_norm": 27.625, "learning_rate": 8.07414824287227e-06, "loss": 3.331641674041748, "step": 1688 }, { "epoch": 1.1513292433537832, "grad_norm": 39.25, "learning_rate": 8.063313627116126e-06, "loss": 9.496403694152832, "step": 1689 }, { "epoch": 1.1520109066121336, "grad_norm": 13.75, "learning_rate": 8.052481372895363e-06, "loss": 3.1969692707061768, "step": 1690 }, { "epoch": 1.152692569870484, "grad_norm": 14.625, "learning_rate": 8.041651493418498e-06, "loss": 4.921978950500488, "step": 1691 }, { "epoch": 1.1533742331288344, "grad_norm": 19.625, "learning_rate": 8.030824001891147e-06, "loss": 3.244955062866211, "step": 1692 }, { "epoch": 1.1540558963871848, "grad_norm": 18.125, "learning_rate": 8.019998911516021e-06, "loss": 7.180319786071777, "step": 1693 }, { "epoch": 1.1547375596455351, "grad_norm": 35.25, "learning_rate": 8.009176235492893e-06, "loss": 4.041303634643555, "step": 1694 }, { "epoch": 1.1554192229038855, "grad_norm": 42.0, "learning_rate": 7.998355987018606e-06, "loss": 9.342479705810547, "step": 1695 }, { "epoch": 1.156100886162236, "grad_norm": 13.75, "learning_rate": 7.987538179287039e-06, "loss": 3.897177219390869, "step": 1696 }, { "epoch": 1.1567825494205861, "grad_norm": 20.625, "learning_rate": 7.97672282548908e-06, "loss": 5.072324752807617, "step": 1697 }, { "epoch": 1.1574642126789365, "grad_norm": 29.375, "learning_rate": 7.965909938812644e-06, "loss": 2.797964334487915, "step": 1698 }, { "epoch": 1.158145875937287, "grad_norm": 16.5, "learning_rate": 7.955099532442632e-06, "loss": 2.990330696105957, "step": 1699 }, { "epoch": 1.1588275391956373, "grad_norm": 40.0, "learning_rate": 7.944291619560914e-06, "loss": 6.126614093780518, "step": 1700 }, { "epoch": 1.1595092024539877, "grad_norm": 12.25, "learning_rate": 7.93348621334633e-06, "loss": 3.4021477699279785, "step": 1701 }, { "epoch": 1.160190865712338, "grad_norm": 11.8125, "learning_rate": 7.922683326974648e-06, "loss": 3.1986098289489746, "step": 1702 }, { "epoch": 1.1608725289706885, "grad_norm": 18.5, "learning_rate": 7.91188297361858e-06, "loss": 5.415998458862305, "step": 1703 }, { "epoch": 1.1615541922290389, "grad_norm": 34.75, "learning_rate": 7.901085166447743e-06, "loss": 6.677962303161621, "step": 1704 }, { "epoch": 1.1622358554873893, "grad_norm": 16.0, "learning_rate": 7.890289918628644e-06, "loss": 2.5745205879211426, "step": 1705 }, { "epoch": 1.1629175187457397, "grad_norm": 36.25, "learning_rate": 7.879497243324678e-06, "loss": 6.741413116455078, "step": 1706 }, { "epoch": 1.16359918200409, "grad_norm": 11.8125, "learning_rate": 7.8687071536961e-06, "loss": 3.3370468616485596, "step": 1707 }, { "epoch": 1.1642808452624402, "grad_norm": 17.0, "learning_rate": 7.857919662900006e-06, "loss": 4.694578170776367, "step": 1708 }, { "epoch": 1.1649625085207906, "grad_norm": 13.9375, "learning_rate": 7.847134784090333e-06, "loss": 3.48256778717041, "step": 1709 }, { "epoch": 1.165644171779141, "grad_norm": 21.375, "learning_rate": 7.836352530417824e-06, "loss": 6.844442367553711, "step": 1710 }, { "epoch": 1.1663258350374914, "grad_norm": 18.375, "learning_rate": 7.825572915030027e-06, "loss": 5.645540237426758, "step": 1711 }, { "epoch": 1.1670074982958418, "grad_norm": 33.25, "learning_rate": 7.814795951071275e-06, "loss": 3.0488901138305664, "step": 1712 }, { "epoch": 1.1676891615541922, "grad_norm": 34.0, "learning_rate": 7.804021651682656e-06, "loss": 8.571889877319336, "step": 1713 }, { "epoch": 1.1683708248125426, "grad_norm": 14.0, "learning_rate": 7.793250030002022e-06, "loss": 5.182218551635742, "step": 1714 }, { "epoch": 1.169052488070893, "grad_norm": 32.75, "learning_rate": 7.782481099163958e-06, "loss": 7.455838680267334, "step": 1715 }, { "epoch": 1.1697341513292434, "grad_norm": 23.875, "learning_rate": 7.771714872299758e-06, "loss": 2.3497366905212402, "step": 1716 }, { "epoch": 1.1704158145875938, "grad_norm": 16.25, "learning_rate": 7.760951362537432e-06, "loss": 3.0141539573669434, "step": 1717 }, { "epoch": 1.1710974778459442, "grad_norm": 56.75, "learning_rate": 7.750190583001663e-06, "loss": 10.912010192871094, "step": 1718 }, { "epoch": 1.1717791411042944, "grad_norm": 17.25, "learning_rate": 7.739432546813817e-06, "loss": 4.310739040374756, "step": 1719 }, { "epoch": 1.1724608043626448, "grad_norm": 12.875, "learning_rate": 7.728677267091912e-06, "loss": 1.6862201690673828, "step": 1720 }, { "epoch": 1.1731424676209952, "grad_norm": 16.25, "learning_rate": 7.717924756950597e-06, "loss": 2.1441850662231445, "step": 1721 }, { "epoch": 1.1738241308793456, "grad_norm": 15.5, "learning_rate": 7.707175029501158e-06, "loss": 6.223445415496826, "step": 1722 }, { "epoch": 1.174505794137696, "grad_norm": 25.125, "learning_rate": 7.696428097851472e-06, "loss": 2.6836113929748535, "step": 1723 }, { "epoch": 1.1751874573960464, "grad_norm": 15.625, "learning_rate": 7.685683975106015e-06, "loss": 4.044487953186035, "step": 1724 }, { "epoch": 1.1758691206543967, "grad_norm": 11.375, "learning_rate": 7.674942674365847e-06, "loss": 3.2008249759674072, "step": 1725 }, { "epoch": 1.1765507839127471, "grad_norm": 39.25, "learning_rate": 7.664204208728562e-06, "loss": 6.555390357971191, "step": 1726 }, { "epoch": 1.1772324471710975, "grad_norm": 29.625, "learning_rate": 7.653468591288326e-06, "loss": 6.175301551818848, "step": 1727 }, { "epoch": 1.177914110429448, "grad_norm": 32.25, "learning_rate": 7.642735835135817e-06, "loss": 4.872661590576172, "step": 1728 }, { "epoch": 1.1785957736877983, "grad_norm": 29.25, "learning_rate": 7.632005953358214e-06, "loss": 5.762571334838867, "step": 1729 }, { "epoch": 1.1792774369461485, "grad_norm": 69.0, "learning_rate": 7.621278959039217e-06, "loss": 9.835029602050781, "step": 1730 }, { "epoch": 1.179959100204499, "grad_norm": 11.5, "learning_rate": 7.61055486525898e-06, "loss": 3.37182879447937, "step": 1731 }, { "epoch": 1.1806407634628493, "grad_norm": 26.875, "learning_rate": 7.599833685094136e-06, "loss": 6.601644515991211, "step": 1732 }, { "epoch": 1.1813224267211997, "grad_norm": 12.0625, "learning_rate": 7.5891154316177586e-06, "loss": 4.859679222106934, "step": 1733 }, { "epoch": 1.18200408997955, "grad_norm": 17.125, "learning_rate": 7.578400117899355e-06, "loss": 3.007108211517334, "step": 1734 }, { "epoch": 1.1826857532379005, "grad_norm": 13.0625, "learning_rate": 7.567687757004843e-06, "loss": 2.8198392391204834, "step": 1735 }, { "epoch": 1.1833674164962509, "grad_norm": 23.5, "learning_rate": 7.556978361996553e-06, "loss": 4.469259738922119, "step": 1736 }, { "epoch": 1.1840490797546013, "grad_norm": 44.5, "learning_rate": 7.546271945933178e-06, "loss": 5.9087934494018555, "step": 1737 }, { "epoch": 1.1847307430129517, "grad_norm": 11.5625, "learning_rate": 7.5355685218698e-06, "loss": 2.731790781021118, "step": 1738 }, { "epoch": 1.185412406271302, "grad_norm": 13.6875, "learning_rate": 7.524868102857836e-06, "loss": 4.622488021850586, "step": 1739 }, { "epoch": 1.1860940695296525, "grad_norm": 69.5, "learning_rate": 7.514170701945047e-06, "loss": 4.749290466308594, "step": 1740 }, { "epoch": 1.1867757327880026, "grad_norm": 17.25, "learning_rate": 7.5034763321755174e-06, "loss": 4.04113245010376, "step": 1741 }, { "epoch": 1.187457396046353, "grad_norm": 27.875, "learning_rate": 7.492785006589623e-06, "loss": 4.7369256019592285, "step": 1742 }, { "epoch": 1.1881390593047034, "grad_norm": 30.375, "learning_rate": 7.482096738224038e-06, "loss": 6.970638275146484, "step": 1743 }, { "epoch": 1.1888207225630538, "grad_norm": 18.125, "learning_rate": 7.47141154011171e-06, "loss": 4.261471271514893, "step": 1744 }, { "epoch": 1.1895023858214042, "grad_norm": 12.4375, "learning_rate": 7.460729425281831e-06, "loss": 4.488287448883057, "step": 1745 }, { "epoch": 1.1901840490797546, "grad_norm": 18.25, "learning_rate": 7.450050406759849e-06, "loss": 3.000248432159424, "step": 1746 }, { "epoch": 1.190865712338105, "grad_norm": 13.6875, "learning_rate": 7.43937449756742e-06, "loss": 4.373888969421387, "step": 1747 }, { "epoch": 1.1915473755964554, "grad_norm": 12.1875, "learning_rate": 7.428701710722424e-06, "loss": 4.11643123626709, "step": 1748 }, { "epoch": 1.1922290388548058, "grad_norm": 42.75, "learning_rate": 7.418032059238922e-06, "loss": 9.482246398925781, "step": 1749 }, { "epoch": 1.1929107021131562, "grad_norm": 36.25, "learning_rate": 7.407365556127162e-06, "loss": 5.924765586853027, "step": 1750 }, { "epoch": 1.1935923653715066, "grad_norm": 23.375, "learning_rate": 7.39670221439354e-06, "loss": 4.339920997619629, "step": 1751 }, { "epoch": 1.1942740286298568, "grad_norm": 33.0, "learning_rate": 7.386042047040613e-06, "loss": 5.718446731567383, "step": 1752 }, { "epoch": 1.1949556918882072, "grad_norm": 21.0, "learning_rate": 7.375385067067051e-06, "loss": 5.056180953979492, "step": 1753 }, { "epoch": 1.1956373551465576, "grad_norm": 33.5, "learning_rate": 7.364731287467653e-06, "loss": 4.516639232635498, "step": 1754 }, { "epoch": 1.196319018404908, "grad_norm": 13.25, "learning_rate": 7.354080721233303e-06, "loss": 3.7805404663085938, "step": 1755 }, { "epoch": 1.1970006816632583, "grad_norm": 16.5, "learning_rate": 7.343433381350969e-06, "loss": 6.0509419441223145, "step": 1756 }, { "epoch": 1.1976823449216087, "grad_norm": 19.75, "learning_rate": 7.332789280803696e-06, "loss": 3.4284064769744873, "step": 1757 }, { "epoch": 1.1983640081799591, "grad_norm": 19.0, "learning_rate": 7.3221484325705615e-06, "loss": 3.834421157836914, "step": 1758 }, { "epoch": 1.1990456714383095, "grad_norm": 13.5, "learning_rate": 7.311510849626692e-06, "loss": 2.8783631324768066, "step": 1759 }, { "epoch": 1.19972733469666, "grad_norm": 21.5, "learning_rate": 7.300876544943227e-06, "loss": 3.6869263648986816, "step": 1760 }, { "epoch": 1.20040899795501, "grad_norm": 12.8125, "learning_rate": 7.290245531487303e-06, "loss": 3.540595769882202, "step": 1761 }, { "epoch": 1.2010906612133607, "grad_norm": 13.875, "learning_rate": 7.279617822222056e-06, "loss": 3.5032248497009277, "step": 1762 }, { "epoch": 1.201772324471711, "grad_norm": 30.375, "learning_rate": 7.268993430106576e-06, "loss": 3.871457576751709, "step": 1763 }, { "epoch": 1.2024539877300613, "grad_norm": 41.75, "learning_rate": 7.258372368095923e-06, "loss": 6.6087422370910645, "step": 1764 }, { "epoch": 1.2031356509884117, "grad_norm": 23.875, "learning_rate": 7.247754649141097e-06, "loss": 3.7051963806152344, "step": 1765 }, { "epoch": 1.203817314246762, "grad_norm": 67.0, "learning_rate": 7.2371402861890065e-06, "loss": 5.8413286209106445, "step": 1766 }, { "epoch": 1.2044989775051125, "grad_norm": 31.875, "learning_rate": 7.226529292182478e-06, "loss": 6.392594337463379, "step": 1767 }, { "epoch": 1.2051806407634629, "grad_norm": 12.625, "learning_rate": 7.2159216800602364e-06, "loss": 3.462491512298584, "step": 1768 }, { "epoch": 1.2058623040218133, "grad_norm": 15.1875, "learning_rate": 7.205317462756865e-06, "loss": 3.624847412109375, "step": 1769 }, { "epoch": 1.2065439672801637, "grad_norm": 23.0, "learning_rate": 7.194716653202826e-06, "loss": 4.861111164093018, "step": 1770 }, { "epoch": 1.207225630538514, "grad_norm": 12.9375, "learning_rate": 7.184119264324414e-06, "loss": 3.8111467361450195, "step": 1771 }, { "epoch": 1.2079072937968642, "grad_norm": 19.5, "learning_rate": 7.173525309043757e-06, "loss": 5.20961332321167, "step": 1772 }, { "epoch": 1.2085889570552146, "grad_norm": 37.25, "learning_rate": 7.162934800278801e-06, "loss": 7.974620342254639, "step": 1773 }, { "epoch": 1.209270620313565, "grad_norm": 15.125, "learning_rate": 7.152347750943276e-06, "loss": 2.9720537662506104, "step": 1774 }, { "epoch": 1.2099522835719154, "grad_norm": 54.5, "learning_rate": 7.1417641739467104e-06, "loss": 10.222710609436035, "step": 1775 }, { "epoch": 1.2106339468302658, "grad_norm": 25.75, "learning_rate": 7.131184082194382e-06, "loss": 5.135432720184326, "step": 1776 }, { "epoch": 1.2113156100886162, "grad_norm": 41.75, "learning_rate": 7.1206074885873345e-06, "loss": 7.191382884979248, "step": 1777 }, { "epoch": 1.2119972733469666, "grad_norm": 30.25, "learning_rate": 7.110034406022337e-06, "loss": 6.832047939300537, "step": 1778 }, { "epoch": 1.212678936605317, "grad_norm": 10.625, "learning_rate": 7.099464847391873e-06, "loss": 4.407943248748779, "step": 1779 }, { "epoch": 1.2133605998636674, "grad_norm": 16.625, "learning_rate": 7.088898825584139e-06, "loss": 2.940113067626953, "step": 1780 }, { "epoch": 1.2140422631220178, "grad_norm": 19.375, "learning_rate": 7.0783363534830185e-06, "loss": 5.905073165893555, "step": 1781 }, { "epoch": 1.2147239263803682, "grad_norm": 32.5, "learning_rate": 7.067777443968058e-06, "loss": 6.558566093444824, "step": 1782 }, { "epoch": 1.2154055896387184, "grad_norm": 16.625, "learning_rate": 7.057222109914467e-06, "loss": 4.260597229003906, "step": 1783 }, { "epoch": 1.2160872528970688, "grad_norm": 12.3125, "learning_rate": 7.046670364193089e-06, "loss": 3.796602487564087, "step": 1784 }, { "epoch": 1.2167689161554192, "grad_norm": 30.75, "learning_rate": 7.036122219670398e-06, "loss": 8.20306396484375, "step": 1785 }, { "epoch": 1.2174505794137696, "grad_norm": 17.5, "learning_rate": 7.02557768920848e-06, "loss": 2.4573044776916504, "step": 1786 }, { "epoch": 1.21813224267212, "grad_norm": 10.1875, "learning_rate": 7.0150367856650035e-06, "loss": 4.113483428955078, "step": 1787 }, { "epoch": 1.2188139059304703, "grad_norm": 24.875, "learning_rate": 7.004499521893217e-06, "loss": 2.231403350830078, "step": 1788 }, { "epoch": 1.2194955691888207, "grad_norm": 19.375, "learning_rate": 6.993965910741943e-06, "loss": 3.1322011947631836, "step": 1789 }, { "epoch": 1.2201772324471711, "grad_norm": 13.875, "learning_rate": 6.9834359650555305e-06, "loss": 3.631190299987793, "step": 1790 }, { "epoch": 1.2208588957055215, "grad_norm": 28.125, "learning_rate": 6.972909697673877e-06, "loss": 3.537489175796509, "step": 1791 }, { "epoch": 1.221540558963872, "grad_norm": 14.75, "learning_rate": 6.962387121432381e-06, "loss": 4.738898754119873, "step": 1792 }, { "epoch": 1.2222222222222223, "grad_norm": 18.125, "learning_rate": 6.951868249161951e-06, "loss": 4.667486190795898, "step": 1793 }, { "epoch": 1.2229038854805725, "grad_norm": 26.75, "learning_rate": 6.941353093688972e-06, "loss": 5.780388355255127, "step": 1794 }, { "epoch": 1.223585548738923, "grad_norm": 16.0, "learning_rate": 6.930841667835295e-06, "loss": 2.736281394958496, "step": 1795 }, { "epoch": 1.2242672119972733, "grad_norm": 22.375, "learning_rate": 6.92033398441823e-06, "loss": 4.089028358459473, "step": 1796 }, { "epoch": 1.2249488752556237, "grad_norm": 34.0, "learning_rate": 6.909830056250527e-06, "loss": 5.030251502990723, "step": 1797 }, { "epoch": 1.225630538513974, "grad_norm": 14.9375, "learning_rate": 6.899329896140343e-06, "loss": 4.004383087158203, "step": 1798 }, { "epoch": 1.2263122017723245, "grad_norm": 30.25, "learning_rate": 6.8888335168912515e-06, "loss": 4.663923263549805, "step": 1799 }, { "epoch": 1.2269938650306749, "grad_norm": 26.25, "learning_rate": 6.878340931302208e-06, "loss": 5.761874198913574, "step": 1800 }, { "epoch": 1.2276755282890253, "grad_norm": 25.5, "learning_rate": 6.867852152167548e-06, "loss": 9.38907527923584, "step": 1801 }, { "epoch": 1.2283571915473757, "grad_norm": 16.875, "learning_rate": 6.857367192276971e-06, "loss": 3.2213737964630127, "step": 1802 }, { "epoch": 1.229038854805726, "grad_norm": 22.375, "learning_rate": 6.8468860644155034e-06, "loss": 4.22444486618042, "step": 1803 }, { "epoch": 1.2297205180640765, "grad_norm": 16.625, "learning_rate": 6.836408781363508e-06, "loss": 5.905540943145752, "step": 1804 }, { "epoch": 1.2304021813224266, "grad_norm": 27.25, "learning_rate": 6.825935355896669e-06, "loss": 7.203788757324219, "step": 1805 }, { "epoch": 1.231083844580777, "grad_norm": 13.875, "learning_rate": 6.815465800785946e-06, "loss": 3.0165390968322754, "step": 1806 }, { "epoch": 1.2317655078391274, "grad_norm": 31.0, "learning_rate": 6.805000128797596e-06, "loss": 7.559882640838623, "step": 1807 }, { "epoch": 1.2324471710974778, "grad_norm": 23.25, "learning_rate": 6.794538352693132e-06, "loss": 6.2513885498046875, "step": 1808 }, { "epoch": 1.2331288343558282, "grad_norm": 25.5, "learning_rate": 6.7840804852293225e-06, "loss": 3.650421619415283, "step": 1809 }, { "epoch": 1.2338104976141786, "grad_norm": 18.25, "learning_rate": 6.773626539158171e-06, "loss": 4.4261674880981445, "step": 1810 }, { "epoch": 1.234492160872529, "grad_norm": 24.0, "learning_rate": 6.7631765272268845e-06, "loss": 2.50698184967041, "step": 1811 }, { "epoch": 1.2351738241308794, "grad_norm": 29.75, "learning_rate": 6.752730462177891e-06, "loss": 7.287703514099121, "step": 1812 }, { "epoch": 1.2358554873892298, "grad_norm": 19.125, "learning_rate": 6.742288356748803e-06, "loss": 3.8883864879608154, "step": 1813 }, { "epoch": 1.2365371506475802, "grad_norm": 24.0, "learning_rate": 6.731850223672391e-06, "loss": 6.120924472808838, "step": 1814 }, { "epoch": 1.2372188139059306, "grad_norm": 22.0, "learning_rate": 6.721416075676601e-06, "loss": 4.160590648651123, "step": 1815 }, { "epoch": 1.2379004771642808, "grad_norm": 39.25, "learning_rate": 6.710985925484499e-06, "loss": 8.476997375488281, "step": 1816 }, { "epoch": 1.2385821404226312, "grad_norm": 24.25, "learning_rate": 6.700559785814291e-06, "loss": 6.182928085327148, "step": 1817 }, { "epoch": 1.2392638036809815, "grad_norm": 21.625, "learning_rate": 6.690137669379298e-06, "loss": 7.054372787475586, "step": 1818 }, { "epoch": 1.239945466939332, "grad_norm": 13.8125, "learning_rate": 6.679719588887911e-06, "loss": 3.1224098205566406, "step": 1819 }, { "epoch": 1.2406271301976823, "grad_norm": 20.125, "learning_rate": 6.669305557043626e-06, "loss": 4.956881999969482, "step": 1820 }, { "epoch": 1.2413087934560327, "grad_norm": 38.75, "learning_rate": 6.658895586544989e-06, "loss": 6.483809471130371, "step": 1821 }, { "epoch": 1.2419904567143831, "grad_norm": 21.125, "learning_rate": 6.648489690085589e-06, "loss": 4.749490737915039, "step": 1822 }, { "epoch": 1.2426721199727335, "grad_norm": 22.75, "learning_rate": 6.638087880354062e-06, "loss": 5.02000093460083, "step": 1823 }, { "epoch": 1.243353783231084, "grad_norm": 16.375, "learning_rate": 6.6276901700340454e-06, "loss": 5.580973148345947, "step": 1824 }, { "epoch": 1.2440354464894343, "grad_norm": 20.5, "learning_rate": 6.617296571804191e-06, "loss": 3.6924643516540527, "step": 1825 }, { "epoch": 1.2447171097477847, "grad_norm": 25.875, "learning_rate": 6.606907098338131e-06, "loss": 6.29012393951416, "step": 1826 }, { "epoch": 1.2453987730061349, "grad_norm": 40.5, "learning_rate": 6.596521762304461e-06, "loss": 7.382050514221191, "step": 1827 }, { "epoch": 1.2460804362644853, "grad_norm": 17.125, "learning_rate": 6.586140576366742e-06, "loss": 7.051212310791016, "step": 1828 }, { "epoch": 1.2467620995228357, "grad_norm": 15.4375, "learning_rate": 6.575763553183474e-06, "loss": 7.005589485168457, "step": 1829 }, { "epoch": 1.247443762781186, "grad_norm": 26.125, "learning_rate": 6.565390705408072e-06, "loss": 2.422550916671753, "step": 1830 }, { "epoch": 1.2481254260395365, "grad_norm": 42.25, "learning_rate": 6.555022045688872e-06, "loss": 6.6546220779418945, "step": 1831 }, { "epoch": 1.2488070892978869, "grad_norm": 19.375, "learning_rate": 6.544657586669085e-06, "loss": 4.5506463050842285, "step": 1832 }, { "epoch": 1.2494887525562373, "grad_norm": 13.4375, "learning_rate": 6.534297340986822e-06, "loss": 4.749760627746582, "step": 1833 }, { "epoch": 1.2501704158145877, "grad_norm": 16.5, "learning_rate": 6.523941321275043e-06, "loss": 5.004620552062988, "step": 1834 }, { "epoch": 1.250852079072938, "grad_norm": 16.375, "learning_rate": 6.513589540161556e-06, "loss": 5.782403945922852, "step": 1835 }, { "epoch": 1.2515337423312882, "grad_norm": 21.375, "learning_rate": 6.5032420102690045e-06, "loss": 3.8297951221466064, "step": 1836 }, { "epoch": 1.2522154055896388, "grad_norm": 14.75, "learning_rate": 6.492898744214844e-06, "loss": 2.7680888175964355, "step": 1837 }, { "epoch": 1.252897068847989, "grad_norm": 28.125, "learning_rate": 6.482559754611331e-06, "loss": 6.186370372772217, "step": 1838 }, { "epoch": 1.2535787321063394, "grad_norm": 13.625, "learning_rate": 6.4722250540655155e-06, "loss": 4.572150230407715, "step": 1839 }, { "epoch": 1.2542603953646898, "grad_norm": 25.0, "learning_rate": 6.461894655179204e-06, "loss": 7.642073631286621, "step": 1840 }, { "epoch": 1.2549420586230402, "grad_norm": 13.4375, "learning_rate": 6.451568570548969e-06, "loss": 3.413811683654785, "step": 1841 }, { "epoch": 1.2556237218813906, "grad_norm": 17.5, "learning_rate": 6.441246812766122e-06, "loss": 5.67801570892334, "step": 1842 }, { "epoch": 1.256305385139741, "grad_norm": 27.75, "learning_rate": 6.430929394416685e-06, "loss": 7.326361656188965, "step": 1843 }, { "epoch": 1.2569870483980914, "grad_norm": 40.5, "learning_rate": 6.420616328081408e-06, "loss": 6.810323715209961, "step": 1844 }, { "epoch": 1.2576687116564418, "grad_norm": 14.0625, "learning_rate": 6.410307626335717e-06, "loss": 2.395935535430908, "step": 1845 }, { "epoch": 1.2583503749147922, "grad_norm": 23.625, "learning_rate": 6.400003301749726e-06, "loss": 6.258760452270508, "step": 1846 }, { "epoch": 1.2590320381731424, "grad_norm": 26.625, "learning_rate": 6.389703366888214e-06, "loss": 4.02325439453125, "step": 1847 }, { "epoch": 1.259713701431493, "grad_norm": 26.0, "learning_rate": 6.379407834310598e-06, "loss": 5.849958419799805, "step": 1848 }, { "epoch": 1.2603953646898431, "grad_norm": 19.25, "learning_rate": 6.36911671657093e-06, "loss": 1.906442403793335, "step": 1849 }, { "epoch": 1.2610770279481935, "grad_norm": 18.875, "learning_rate": 6.358830026217887e-06, "loss": 5.465363502502441, "step": 1850 }, { "epoch": 1.261758691206544, "grad_norm": 20.0, "learning_rate": 6.348547775794731e-06, "loss": 3.5529913902282715, "step": 1851 }, { "epoch": 1.2624403544648943, "grad_norm": 15.625, "learning_rate": 6.338269977839331e-06, "loss": 4.483345031738281, "step": 1852 }, { "epoch": 1.2631220177232447, "grad_norm": 14.375, "learning_rate": 6.327996644884107e-06, "loss": 2.5647406578063965, "step": 1853 }, { "epoch": 1.2638036809815951, "grad_norm": 57.25, "learning_rate": 6.317727789456043e-06, "loss": 9.570801734924316, "step": 1854 }, { "epoch": 1.2644853442399455, "grad_norm": 31.0, "learning_rate": 6.30746342407667e-06, "loss": 6.692539215087891, "step": 1855 }, { "epoch": 1.265167007498296, "grad_norm": 34.25, "learning_rate": 6.297203561262029e-06, "loss": 4.826699256896973, "step": 1856 }, { "epoch": 1.2658486707566463, "grad_norm": 26.625, "learning_rate": 6.286948213522686e-06, "loss": 5.210046291351318, "step": 1857 }, { "epoch": 1.2665303340149965, "grad_norm": 18.25, "learning_rate": 6.276697393363692e-06, "loss": 0.9613856077194214, "step": 1858 }, { "epoch": 1.267211997273347, "grad_norm": 20.25, "learning_rate": 6.266451113284575e-06, "loss": 2.4157204627990723, "step": 1859 }, { "epoch": 1.2678936605316973, "grad_norm": 32.75, "learning_rate": 6.256209385779341e-06, "loss": 2.6902403831481934, "step": 1860 }, { "epoch": 1.2685753237900477, "grad_norm": 25.5, "learning_rate": 6.245972223336425e-06, "loss": 5.675390243530273, "step": 1861 }, { "epoch": 1.269256987048398, "grad_norm": 33.0, "learning_rate": 6.2357396384387125e-06, "loss": 6.759422302246094, "step": 1862 }, { "epoch": 1.2699386503067485, "grad_norm": 54.0, "learning_rate": 6.2255116435635e-06, "loss": 6.909953594207764, "step": 1863 }, { "epoch": 1.2706203135650989, "grad_norm": 11.875, "learning_rate": 6.215288251182485e-06, "loss": 2.542846202850342, "step": 1864 }, { "epoch": 1.2713019768234493, "grad_norm": 27.75, "learning_rate": 6.205069473761756e-06, "loss": 3.2881383895874023, "step": 1865 }, { "epoch": 1.2719836400817996, "grad_norm": 46.5, "learning_rate": 6.194855323761778e-06, "loss": 3.6933727264404297, "step": 1866 }, { "epoch": 1.27266530334015, "grad_norm": 73.5, "learning_rate": 6.184645813637362e-06, "loss": 11.547966003417969, "step": 1867 }, { "epoch": 1.2733469665985004, "grad_norm": 54.75, "learning_rate": 6.174440955837678e-06, "loss": 8.598566055297852, "step": 1868 }, { "epoch": 1.2740286298568506, "grad_norm": 30.875, "learning_rate": 6.164240762806204e-06, "loss": 5.365769863128662, "step": 1869 }, { "epoch": 1.274710293115201, "grad_norm": 49.75, "learning_rate": 6.154045246980742e-06, "loss": 6.519659519195557, "step": 1870 }, { "epoch": 1.2753919563735514, "grad_norm": 39.5, "learning_rate": 6.143854420793393e-06, "loss": 9.478382110595703, "step": 1871 }, { "epoch": 1.2760736196319018, "grad_norm": 16.375, "learning_rate": 6.133668296670528e-06, "loss": 3.6917929649353027, "step": 1872 }, { "epoch": 1.2767552828902522, "grad_norm": 28.125, "learning_rate": 6.1234868870327966e-06, "loss": 4.726600646972656, "step": 1873 }, { "epoch": 1.2774369461486026, "grad_norm": 20.875, "learning_rate": 6.113310204295093e-06, "loss": 2.6593258380889893, "step": 1874 }, { "epoch": 1.278118609406953, "grad_norm": 34.5, "learning_rate": 6.1031382608665456e-06, "loss": 5.9605255126953125, "step": 1875 }, { "epoch": 1.2788002726653034, "grad_norm": 19.375, "learning_rate": 6.092971069150515e-06, "loss": 3.3693439960479736, "step": 1876 }, { "epoch": 1.2794819359236538, "grad_norm": 11.8125, "learning_rate": 6.08280864154455e-06, "loss": 2.7040786743164062, "step": 1877 }, { "epoch": 1.280163599182004, "grad_norm": 56.0, "learning_rate": 6.072650990440404e-06, "loss": 8.134035110473633, "step": 1878 }, { "epoch": 1.2808452624403546, "grad_norm": 32.5, "learning_rate": 6.062498128224008e-06, "loss": 6.23111629486084, "step": 1879 }, { "epoch": 1.2815269256987047, "grad_norm": 19.75, "learning_rate": 6.052350067275441e-06, "loss": 5.229652404785156, "step": 1880 }, { "epoch": 1.2822085889570551, "grad_norm": 19.875, "learning_rate": 6.0422068199689335e-06, "loss": 4.894524097442627, "step": 1881 }, { "epoch": 1.2828902522154055, "grad_norm": 24.75, "learning_rate": 6.0320683986728535e-06, "loss": 6.307511329650879, "step": 1882 }, { "epoch": 1.283571915473756, "grad_norm": 14.75, "learning_rate": 6.02193481574967e-06, "loss": 6.348879337310791, "step": 1883 }, { "epoch": 1.2842535787321063, "grad_norm": 14.75, "learning_rate": 6.011806083555969e-06, "loss": 4.131725788116455, "step": 1884 }, { "epoch": 1.2849352419904567, "grad_norm": 11.75, "learning_rate": 6.001682214442406e-06, "loss": 3.9715380668640137, "step": 1885 }, { "epoch": 1.2856169052488071, "grad_norm": 23.875, "learning_rate": 5.991563220753712e-06, "loss": 2.619417667388916, "step": 1886 }, { "epoch": 1.2862985685071575, "grad_norm": 43.75, "learning_rate": 5.981449114828684e-06, "loss": 7.322666168212891, "step": 1887 }, { "epoch": 1.286980231765508, "grad_norm": 35.25, "learning_rate": 5.971339909000138e-06, "loss": 5.3692626953125, "step": 1888 }, { "epoch": 1.287661895023858, "grad_norm": 11.1875, "learning_rate": 5.961235615594934e-06, "loss": 3.5853219032287598, "step": 1889 }, { "epoch": 1.2883435582822087, "grad_norm": 14.375, "learning_rate": 5.951136246933933e-06, "loss": 3.0478310585021973, "step": 1890 }, { "epoch": 1.2890252215405589, "grad_norm": 19.5, "learning_rate": 5.941041815331991e-06, "loss": 3.357750415802002, "step": 1891 }, { "epoch": 1.2897068847989093, "grad_norm": 28.0, "learning_rate": 5.930952333097948e-06, "loss": 3.1347670555114746, "step": 1892 }, { "epoch": 1.2903885480572597, "grad_norm": 23.625, "learning_rate": 5.920867812534601e-06, "loss": 7.283046722412109, "step": 1893 }, { "epoch": 1.29107021131561, "grad_norm": 16.25, "learning_rate": 5.910788265938705e-06, "loss": 3.521819829940796, "step": 1894 }, { "epoch": 1.2917518745739605, "grad_norm": 16.25, "learning_rate": 5.900713705600951e-06, "loss": 5.956548690795898, "step": 1895 }, { "epoch": 1.2924335378323109, "grad_norm": 23.875, "learning_rate": 5.890644143805939e-06, "loss": 6.103580474853516, "step": 1896 }, { "epoch": 1.2931152010906612, "grad_norm": 22.625, "learning_rate": 5.880579592832188e-06, "loss": 3.910811185836792, "step": 1897 }, { "epoch": 1.2937968643490116, "grad_norm": 39.5, "learning_rate": 5.87052006495209e-06, "loss": 4.837799072265625, "step": 1898 }, { "epoch": 1.294478527607362, "grad_norm": 28.5, "learning_rate": 5.860465572431927e-06, "loss": 6.4117584228515625, "step": 1899 }, { "epoch": 1.2951601908657122, "grad_norm": 22.875, "learning_rate": 5.850416127531841e-06, "loss": 4.544978141784668, "step": 1900 }, { "epoch": 1.2958418541240628, "grad_norm": 58.75, "learning_rate": 5.840371742505806e-06, "loss": 7.1843132972717285, "step": 1901 }, { "epoch": 1.296523517382413, "grad_norm": 46.5, "learning_rate": 5.830332429601635e-06, "loss": 9.423023223876953, "step": 1902 }, { "epoch": 1.2972051806407634, "grad_norm": 26.375, "learning_rate": 5.820298201060961e-06, "loss": 4.8081207275390625, "step": 1903 }, { "epoch": 1.2978868438991138, "grad_norm": 14.1875, "learning_rate": 5.810269069119204e-06, "loss": 3.261143445968628, "step": 1904 }, { "epoch": 1.2985685071574642, "grad_norm": 33.75, "learning_rate": 5.800245046005585e-06, "loss": 6.632670879364014, "step": 1905 }, { "epoch": 1.2992501704158146, "grad_norm": 14.875, "learning_rate": 5.7902261439430786e-06, "loss": 2.979003667831421, "step": 1906 }, { "epoch": 1.299931833674165, "grad_norm": 59.5, "learning_rate": 5.780212375148426e-06, "loss": 7.974099636077881, "step": 1907 }, { "epoch": 1.3006134969325154, "grad_norm": 13.875, "learning_rate": 5.770203751832115e-06, "loss": 3.0699782371520996, "step": 1908 }, { "epoch": 1.3012951601908658, "grad_norm": 40.5, "learning_rate": 5.760200286198343e-06, "loss": 7.200620651245117, "step": 1909 }, { "epoch": 1.3019768234492162, "grad_norm": 36.0, "learning_rate": 5.750201990445024e-06, "loss": 5.256871223449707, "step": 1910 }, { "epoch": 1.3026584867075663, "grad_norm": 25.375, "learning_rate": 5.740208876763777e-06, "loss": 5.745491027832031, "step": 1911 }, { "epoch": 1.303340149965917, "grad_norm": 18.5, "learning_rate": 5.730220957339889e-06, "loss": 3.055166482925415, "step": 1912 }, { "epoch": 1.3040218132242671, "grad_norm": 38.5, "learning_rate": 5.720238244352328e-06, "loss": 6.701803207397461, "step": 1913 }, { "epoch": 1.3047034764826175, "grad_norm": 36.5, "learning_rate": 5.710260749973694e-06, "loss": 6.260416030883789, "step": 1914 }, { "epoch": 1.305385139740968, "grad_norm": 31.875, "learning_rate": 5.70028848637024e-06, "loss": 8.499013900756836, "step": 1915 }, { "epoch": 1.3060668029993183, "grad_norm": 10.4375, "learning_rate": 5.690321465701841e-06, "loss": 3.703840732574463, "step": 1916 }, { "epoch": 1.3067484662576687, "grad_norm": 13.875, "learning_rate": 5.680359700121965e-06, "loss": 2.609858274459839, "step": 1917 }, { "epoch": 1.3074301295160191, "grad_norm": 29.375, "learning_rate": 5.670403201777681e-06, "loss": 8.69485855102539, "step": 1918 }, { "epoch": 1.3081117927743695, "grad_norm": 20.25, "learning_rate": 5.660451982809642e-06, "loss": 5.127311706542969, "step": 1919 }, { "epoch": 1.30879345603272, "grad_norm": 14.375, "learning_rate": 5.650506055352052e-06, "loss": 4.3793463706970215, "step": 1920 }, { "epoch": 1.3094751192910703, "grad_norm": 53.0, "learning_rate": 5.640565431532667e-06, "loss": 3.636665105819702, "step": 1921 }, { "epoch": 1.3101567825494205, "grad_norm": 32.75, "learning_rate": 5.63063012347277e-06, "loss": 6.460373401641846, "step": 1922 }, { "epoch": 1.310838445807771, "grad_norm": 19.5, "learning_rate": 5.620700143287173e-06, "loss": 5.2690205574035645, "step": 1923 }, { "epoch": 1.3115201090661213, "grad_norm": 20.125, "learning_rate": 5.61077550308419e-06, "loss": 6.080758571624756, "step": 1924 }, { "epoch": 1.3122017723244717, "grad_norm": 49.0, "learning_rate": 5.600856214965613e-06, "loss": 8.303730010986328, "step": 1925 }, { "epoch": 1.312883435582822, "grad_norm": 13.5625, "learning_rate": 5.5909422910267174e-06, "loss": 4.218391418457031, "step": 1926 }, { "epoch": 1.3135650988411725, "grad_norm": 22.0, "learning_rate": 5.5810337433562414e-06, "loss": 6.800525188446045, "step": 1927 }, { "epoch": 1.3142467620995228, "grad_norm": 20.125, "learning_rate": 5.571130584036349e-06, "loss": 3.6913833618164062, "step": 1928 }, { "epoch": 1.3149284253578732, "grad_norm": 38.0, "learning_rate": 5.5612328251426575e-06, "loss": 8.274604797363281, "step": 1929 }, { "epoch": 1.3156100886162236, "grad_norm": 20.25, "learning_rate": 5.551340478744176e-06, "loss": 5.716685771942139, "step": 1930 }, { "epoch": 1.316291751874574, "grad_norm": 47.5, "learning_rate": 5.541453556903338e-06, "loss": 9.908090591430664, "step": 1931 }, { "epoch": 1.3169734151329244, "grad_norm": 300.0, "learning_rate": 5.531572071675942e-06, "loss": 5.241491317749023, "step": 1932 }, { "epoch": 1.3176550783912746, "grad_norm": 13.625, "learning_rate": 5.521696035111164e-06, "loss": 4.2351884841918945, "step": 1933 }, { "epoch": 1.3183367416496252, "grad_norm": 23.25, "learning_rate": 5.511825459251539e-06, "loss": 7.769440650939941, "step": 1934 }, { "epoch": 1.3190184049079754, "grad_norm": 16.0, "learning_rate": 5.501960356132945e-06, "loss": 2.1897592544555664, "step": 1935 }, { "epoch": 1.3197000681663258, "grad_norm": 20.875, "learning_rate": 5.492100737784576e-06, "loss": 6.106921672821045, "step": 1936 }, { "epoch": 1.3203817314246762, "grad_norm": 19.5, "learning_rate": 5.482246616228954e-06, "loss": 4.005908966064453, "step": 1937 }, { "epoch": 1.3210633946830266, "grad_norm": 29.25, "learning_rate": 5.472398003481881e-06, "loss": 3.3190054893493652, "step": 1938 }, { "epoch": 1.321745057941377, "grad_norm": 15.8125, "learning_rate": 5.462554911552455e-06, "loss": 5.095207214355469, "step": 1939 }, { "epoch": 1.3224267211997274, "grad_norm": 33.0, "learning_rate": 5.4527173524430395e-06, "loss": 8.984001159667969, "step": 1940 }, { "epoch": 1.3231083844580778, "grad_norm": 18.5, "learning_rate": 5.442885338149244e-06, "loss": 5.284385681152344, "step": 1941 }, { "epoch": 1.3237900477164282, "grad_norm": 20.75, "learning_rate": 5.43305888065993e-06, "loss": 4.751410961151123, "step": 1942 }, { "epoch": 1.3244717109747786, "grad_norm": 25.0, "learning_rate": 5.423237991957168e-06, "loss": 8.3447904586792, "step": 1943 }, { "epoch": 1.3251533742331287, "grad_norm": 19.625, "learning_rate": 5.4134226840162455e-06, "loss": 5.630289077758789, "step": 1944 }, { "epoch": 1.3258350374914791, "grad_norm": 55.5, "learning_rate": 5.403612968805649e-06, "loss": 6.30268669128418, "step": 1945 }, { "epoch": 1.3265167007498295, "grad_norm": 11.0, "learning_rate": 5.393808858287039e-06, "loss": 2.9504010677337646, "step": 1946 }, { "epoch": 1.32719836400818, "grad_norm": 17.5, "learning_rate": 5.384010364415243e-06, "loss": 2.909893035888672, "step": 1947 }, { "epoch": 1.3278800272665303, "grad_norm": 16.875, "learning_rate": 5.374217499138248e-06, "loss": 3.995891571044922, "step": 1948 }, { "epoch": 1.3285616905248807, "grad_norm": 17.0, "learning_rate": 5.364430274397161e-06, "loss": 2.193687915802002, "step": 1949 }, { "epoch": 1.329243353783231, "grad_norm": 20.125, "learning_rate": 5.354648702126229e-06, "loss": 6.442833423614502, "step": 1950 }, { "epoch": 1.3299250170415815, "grad_norm": 24.875, "learning_rate": 5.3448727942527915e-06, "loss": 4.923888683319092, "step": 1951 }, { "epoch": 1.330606680299932, "grad_norm": 20.125, "learning_rate": 5.335102562697294e-06, "loss": 3.1915292739868164, "step": 1952 }, { "epoch": 1.331288343558282, "grad_norm": 21.25, "learning_rate": 5.325338019373247e-06, "loss": 6.087789535522461, "step": 1953 }, { "epoch": 1.3319700068166327, "grad_norm": 16.5, "learning_rate": 5.315579176187242e-06, "loss": 3.904956102371216, "step": 1954 }, { "epoch": 1.3326516700749829, "grad_norm": 19.875, "learning_rate": 5.305826045038899e-06, "loss": 6.166762351989746, "step": 1955 }, { "epoch": 1.3333333333333333, "grad_norm": 34.25, "learning_rate": 5.296078637820894e-06, "loss": 6.2608723640441895, "step": 1956 }, { "epoch": 1.3340149965916837, "grad_norm": 14.9375, "learning_rate": 5.286336966418904e-06, "loss": 3.4583752155303955, "step": 1957 }, { "epoch": 1.334696659850034, "grad_norm": 15.0, "learning_rate": 5.276601042711631e-06, "loss": 2.2882261276245117, "step": 1958 }, { "epoch": 1.3353783231083844, "grad_norm": 20.625, "learning_rate": 5.266870878570751e-06, "loss": 3.2587265968322754, "step": 1959 }, { "epoch": 1.3360599863667348, "grad_norm": 21.125, "learning_rate": 5.257146485860927e-06, "loss": 5.731127738952637, "step": 1960 }, { "epoch": 1.3367416496250852, "grad_norm": 14.0, "learning_rate": 5.247427876439788e-06, "loss": 4.605905055999756, "step": 1961 }, { "epoch": 1.3374233128834356, "grad_norm": 22.75, "learning_rate": 5.237715062157895e-06, "loss": 7.605738639831543, "step": 1962 }, { "epoch": 1.338104976141786, "grad_norm": 11.4375, "learning_rate": 5.228008054858764e-06, "loss": 3.747340679168701, "step": 1963 }, { "epoch": 1.3387866394001362, "grad_norm": 13.6875, "learning_rate": 5.218306866378812e-06, "loss": 3.453395128250122, "step": 1964 }, { "epoch": 1.3394683026584868, "grad_norm": 20.5, "learning_rate": 5.208611508547367e-06, "loss": 5.098195552825928, "step": 1965 }, { "epoch": 1.340149965916837, "grad_norm": 24.75, "learning_rate": 5.198921993186654e-06, "loss": 6.830511569976807, "step": 1966 }, { "epoch": 1.3408316291751874, "grad_norm": 30.25, "learning_rate": 5.189238332111761e-06, "loss": 3.7876908779144287, "step": 1967 }, { "epoch": 1.3415132924335378, "grad_norm": 35.25, "learning_rate": 5.179560537130647e-06, "loss": 7.6129150390625, "step": 1968 }, { "epoch": 1.3421949556918882, "grad_norm": 18.875, "learning_rate": 5.16988862004412e-06, "loss": 5.100566387176514, "step": 1969 }, { "epoch": 1.3428766189502386, "grad_norm": 22.625, "learning_rate": 5.160222592645808e-06, "loss": 3.841235637664795, "step": 1970 }, { "epoch": 1.343558282208589, "grad_norm": 16.625, "learning_rate": 5.150562466722171e-06, "loss": 6.593350410461426, "step": 1971 }, { "epoch": 1.3442399454669394, "grad_norm": 28.125, "learning_rate": 5.140908254052469e-06, "loss": 4.396834373474121, "step": 1972 }, { "epoch": 1.3449216087252898, "grad_norm": 20.625, "learning_rate": 5.1312599664087435e-06, "loss": 5.126183032989502, "step": 1973 }, { "epoch": 1.3456032719836402, "grad_norm": 31.875, "learning_rate": 5.121617615555825e-06, "loss": 6.533592224121094, "step": 1974 }, { "epoch": 1.3462849352419903, "grad_norm": 21.0, "learning_rate": 5.111981213251293e-06, "loss": 3.1461637020111084, "step": 1975 }, { "epoch": 1.346966598500341, "grad_norm": 45.5, "learning_rate": 5.102350771245474e-06, "loss": 7.538666248321533, "step": 1976 }, { "epoch": 1.3476482617586911, "grad_norm": 19.0, "learning_rate": 5.092726301281442e-06, "loss": 4.511101245880127, "step": 1977 }, { "epoch": 1.3483299250170415, "grad_norm": 49.0, "learning_rate": 5.083107815094966e-06, "loss": 7.19852352142334, "step": 1978 }, { "epoch": 1.349011588275392, "grad_norm": 21.0, "learning_rate": 5.073495324414535e-06, "loss": 5.103464126586914, "step": 1979 }, { "epoch": 1.3496932515337423, "grad_norm": 18.75, "learning_rate": 5.063888840961325e-06, "loss": 6.963976860046387, "step": 1980 }, { "epoch": 1.3503749147920927, "grad_norm": 11.875, "learning_rate": 5.054288376449179e-06, "loss": 2.3163092136383057, "step": 1981 }, { "epoch": 1.351056578050443, "grad_norm": 20.625, "learning_rate": 5.044693942584613e-06, "loss": 9.088257789611816, "step": 1982 }, { "epoch": 1.3517382413087935, "grad_norm": 30.5, "learning_rate": 5.035105551066775e-06, "loss": 7.261842250823975, "step": 1983 }, { "epoch": 1.352419904567144, "grad_norm": 19.625, "learning_rate": 5.025523213587455e-06, "loss": 8.022004127502441, "step": 1984 }, { "epoch": 1.3531015678254943, "grad_norm": 68.5, "learning_rate": 5.015946941831064e-06, "loss": 10.096906661987305, "step": 1985 }, { "epoch": 1.3537832310838445, "grad_norm": 19.5, "learning_rate": 5.006376747474608e-06, "loss": 4.513362884521484, "step": 1986 }, { "epoch": 1.354464894342195, "grad_norm": 30.875, "learning_rate": 4.99681264218768e-06, "loss": 8.763145446777344, "step": 1987 }, { "epoch": 1.3551465576005453, "grad_norm": 26.5, "learning_rate": 4.987254637632462e-06, "loss": 4.71531867980957, "step": 1988 }, { "epoch": 1.3558282208588956, "grad_norm": 39.75, "learning_rate": 4.977702745463681e-06, "loss": 3.9427366256713867, "step": 1989 }, { "epoch": 1.356509884117246, "grad_norm": 40.5, "learning_rate": 4.968156977328626e-06, "loss": 7.410538673400879, "step": 1990 }, { "epoch": 1.3571915473755964, "grad_norm": 17.875, "learning_rate": 4.958617344867104e-06, "loss": 5.245522975921631, "step": 1991 }, { "epoch": 1.3578732106339468, "grad_norm": 12.3125, "learning_rate": 4.94908385971145e-06, "loss": 1.7231168746948242, "step": 1992 }, { "epoch": 1.3585548738922972, "grad_norm": 19.75, "learning_rate": 4.939556533486506e-06, "loss": 2.3717894554138184, "step": 1993 }, { "epoch": 1.3592365371506476, "grad_norm": 17.75, "learning_rate": 4.93003537780959e-06, "loss": 5.808124542236328, "step": 1994 }, { "epoch": 1.359918200408998, "grad_norm": 13.5, "learning_rate": 4.920520404290512e-06, "loss": 4.844557285308838, "step": 1995 }, { "epoch": 1.3605998636673484, "grad_norm": 22.75, "learning_rate": 4.9110116245315275e-06, "loss": 2.8185718059539795, "step": 1996 }, { "epoch": 1.3612815269256986, "grad_norm": 42.25, "learning_rate": 4.901509050127357e-06, "loss": 6.259099960327148, "step": 1997 }, { "epoch": 1.3619631901840492, "grad_norm": 31.125, "learning_rate": 4.892012692665143e-06, "loss": 2.786968469619751, "step": 1998 }, { "epoch": 1.3626448534423994, "grad_norm": 19.125, "learning_rate": 4.882522563724442e-06, "loss": 4.585051536560059, "step": 1999 }, { "epoch": 1.3633265167007498, "grad_norm": 33.5, "learning_rate": 4.87303867487723e-06, "loss": 6.4635114669799805, "step": 2000 }, { "epoch": 1.3640081799591002, "grad_norm": 22.125, "learning_rate": 4.86356103768787e-06, "loss": 5.398584842681885, "step": 2001 }, { "epoch": 1.3646898432174506, "grad_norm": 34.5, "learning_rate": 4.854089663713091e-06, "loss": 4.975487232208252, "step": 2002 }, { "epoch": 1.365371506475801, "grad_norm": 21.375, "learning_rate": 4.844624564501998e-06, "loss": 3.7063751220703125, "step": 2003 }, { "epoch": 1.3660531697341514, "grad_norm": 26.75, "learning_rate": 4.835165751596045e-06, "loss": 6.296645641326904, "step": 2004 }, { "epoch": 1.3667348329925018, "grad_norm": 26.75, "learning_rate": 4.825713236529005e-06, "loss": 5.66240119934082, "step": 2005 }, { "epoch": 1.3674164962508522, "grad_norm": 19.25, "learning_rate": 4.816267030826991e-06, "loss": 4.260105133056641, "step": 2006 }, { "epoch": 1.3680981595092025, "grad_norm": 22.875, "learning_rate": 4.8068271460084105e-06, "loss": 4.143197059631348, "step": 2007 }, { "epoch": 1.3687798227675527, "grad_norm": 15.625, "learning_rate": 4.797393593583961e-06, "loss": 4.7735371589660645, "step": 2008 }, { "epoch": 1.3694614860259033, "grad_norm": 9.8125, "learning_rate": 4.787966385056635e-06, "loss": 3.205138683319092, "step": 2009 }, { "epoch": 1.3701431492842535, "grad_norm": 13.9375, "learning_rate": 4.778545531921668e-06, "loss": 4.189981460571289, "step": 2010 }, { "epoch": 1.370824812542604, "grad_norm": 20.625, "learning_rate": 4.769131045666564e-06, "loss": 6.8509955406188965, "step": 2011 }, { "epoch": 1.3715064758009543, "grad_norm": 14.5, "learning_rate": 4.759722937771051e-06, "loss": 3.9030776023864746, "step": 2012 }, { "epoch": 1.3721881390593047, "grad_norm": 11.0625, "learning_rate": 4.750321219707087e-06, "loss": 3.229804277420044, "step": 2013 }, { "epoch": 1.372869802317655, "grad_norm": 15.0, "learning_rate": 4.740925902938838e-06, "loss": 3.374544620513916, "step": 2014 }, { "epoch": 1.3735514655760055, "grad_norm": 39.25, "learning_rate": 4.731536998922657e-06, "loss": 6.006149768829346, "step": 2015 }, { "epoch": 1.3742331288343559, "grad_norm": 31.375, "learning_rate": 4.722154519107085e-06, "loss": 4.969033718109131, "step": 2016 }, { "epoch": 1.3749147920927063, "grad_norm": 35.25, "learning_rate": 4.712778474932832e-06, "loss": 3.278733968734741, "step": 2017 }, { "epoch": 1.3755964553510567, "grad_norm": 27.5, "learning_rate": 4.703408877832752e-06, "loss": 6.468416690826416, "step": 2018 }, { "epoch": 1.3762781186094069, "grad_norm": 18.75, "learning_rate": 4.694045739231839e-06, "loss": 6.0921173095703125, "step": 2019 }, { "epoch": 1.3769597818677572, "grad_norm": 28.25, "learning_rate": 4.684689070547216e-06, "loss": 5.6685285568237305, "step": 2020 }, { "epoch": 1.3776414451261076, "grad_norm": 19.0, "learning_rate": 4.675338883188113e-06, "loss": 3.3105342388153076, "step": 2021 }, { "epoch": 1.378323108384458, "grad_norm": 18.5, "learning_rate": 4.665995188555866e-06, "loss": 5.503789901733398, "step": 2022 }, { "epoch": 1.3790047716428084, "grad_norm": 19.5, "learning_rate": 4.656657998043879e-06, "loss": 6.028713226318359, "step": 2023 }, { "epoch": 1.3796864349011588, "grad_norm": 19.625, "learning_rate": 4.647327323037635e-06, "loss": 2.0828442573547363, "step": 2024 }, { "epoch": 1.3803680981595092, "grad_norm": 57.25, "learning_rate": 4.638003174914675e-06, "loss": 10.405936241149902, "step": 2025 }, { "epoch": 1.3810497614178596, "grad_norm": 11.0, "learning_rate": 4.6286855650445695e-06, "loss": 2.805297374725342, "step": 2026 }, { "epoch": 1.38173142467621, "grad_norm": 13.4375, "learning_rate": 4.619374504788931e-06, "loss": 5.538810729980469, "step": 2027 }, { "epoch": 1.3824130879345602, "grad_norm": 23.625, "learning_rate": 4.610070005501368e-06, "loss": 4.736574172973633, "step": 2028 }, { "epoch": 1.3830947511929108, "grad_norm": 37.75, "learning_rate": 4.600772078527509e-06, "loss": 6.040497303009033, "step": 2029 }, { "epoch": 1.383776414451261, "grad_norm": 15.1875, "learning_rate": 4.591480735204953e-06, "loss": 2.9890825748443604, "step": 2030 }, { "epoch": 1.3844580777096114, "grad_norm": 34.25, "learning_rate": 4.5821959868632735e-06, "loss": 4.17214298248291, "step": 2031 }, { "epoch": 1.3851397409679618, "grad_norm": 18.75, "learning_rate": 4.572917844824008e-06, "loss": 5.7804274559021, "step": 2032 }, { "epoch": 1.3858214042263122, "grad_norm": 21.5, "learning_rate": 4.56364632040064e-06, "loss": 7.601726531982422, "step": 2033 }, { "epoch": 1.3865030674846626, "grad_norm": 19.0, "learning_rate": 4.554381424898571e-06, "loss": 5.637332439422607, "step": 2034 }, { "epoch": 1.387184730743013, "grad_norm": 22.625, "learning_rate": 4.545123169615134e-06, "loss": 3.226628303527832, "step": 2035 }, { "epoch": 1.3878663940013634, "grad_norm": 85.5, "learning_rate": 4.535871565839555e-06, "loss": 9.347726821899414, "step": 2036 }, { "epoch": 1.3885480572597138, "grad_norm": 33.5, "learning_rate": 4.526626624852954e-06, "loss": 4.142840385437012, "step": 2037 }, { "epoch": 1.3892297205180641, "grad_norm": 21.125, "learning_rate": 4.517388357928329e-06, "loss": 5.395458221435547, "step": 2038 }, { "epoch": 1.3899113837764143, "grad_norm": 16.0, "learning_rate": 4.5081567763305345e-06, "loss": 4.976834774017334, "step": 2039 }, { "epoch": 1.390593047034765, "grad_norm": 18.0, "learning_rate": 4.49893189131627e-06, "loss": 3.9795753955841064, "step": 2040 }, { "epoch": 1.3912747102931151, "grad_norm": 19.875, "learning_rate": 4.4897137141340844e-06, "loss": 2.32871150970459, "step": 2041 }, { "epoch": 1.3919563735514655, "grad_norm": 18.875, "learning_rate": 4.480502256024327e-06, "loss": 5.6886396408081055, "step": 2042 }, { "epoch": 1.392638036809816, "grad_norm": 24.375, "learning_rate": 4.471297528219174e-06, "loss": 6.039487361907959, "step": 2043 }, { "epoch": 1.3933197000681663, "grad_norm": 12.6875, "learning_rate": 4.4620995419425765e-06, "loss": 3.949005365371704, "step": 2044 }, { "epoch": 1.3940013633265167, "grad_norm": 22.25, "learning_rate": 4.45290830841028e-06, "loss": 4.500894546508789, "step": 2045 }, { "epoch": 1.394683026584867, "grad_norm": 29.625, "learning_rate": 4.443723838829791e-06, "loss": 4.509685516357422, "step": 2046 }, { "epoch": 1.3953646898432175, "grad_norm": 31.5, "learning_rate": 4.434546144400361e-06, "loss": 8.779342651367188, "step": 2047 }, { "epoch": 1.3960463531015679, "grad_norm": 24.125, "learning_rate": 4.425375236312991e-06, "loss": 8.611041069030762, "step": 2048 }, { "epoch": 1.3967280163599183, "grad_norm": 45.5, "learning_rate": 4.416211125750406e-06, "loss": 7.407787799835205, "step": 2049 }, { "epoch": 1.3974096796182685, "grad_norm": 39.5, "learning_rate": 4.407053823887033e-06, "loss": 7.548531532287598, "step": 2050 }, { "epoch": 1.398091342876619, "grad_norm": 13.3125, "learning_rate": 4.397903341889003e-06, "loss": 1.7263919115066528, "step": 2051 }, { "epoch": 1.3987730061349692, "grad_norm": 27.75, "learning_rate": 4.388759690914128e-06, "loss": 4.631210803985596, "step": 2052 }, { "epoch": 1.3994546693933196, "grad_norm": 43.5, "learning_rate": 4.379622882111895e-06, "loss": 5.751603126525879, "step": 2053 }, { "epoch": 1.40013633265167, "grad_norm": 20.25, "learning_rate": 4.37049292662345e-06, "loss": 4.589346885681152, "step": 2054 }, { "epoch": 1.4008179959100204, "grad_norm": 25.625, "learning_rate": 4.361369835581569e-06, "loss": 5.574239253997803, "step": 2055 }, { "epoch": 1.4014996591683708, "grad_norm": 26.0, "learning_rate": 4.352253620110674e-06, "loss": 1.7878053188323975, "step": 2056 }, { "epoch": 1.4021813224267212, "grad_norm": 31.0, "learning_rate": 4.343144291326796e-06, "loss": 4.879244327545166, "step": 2057 }, { "epoch": 1.4028629856850716, "grad_norm": 15.1875, "learning_rate": 4.334041860337562e-06, "loss": 5.51246452331543, "step": 2058 }, { "epoch": 1.403544648943422, "grad_norm": 19.5, "learning_rate": 4.324946338242203e-06, "loss": 3.423245429992676, "step": 2059 }, { "epoch": 1.4042263122017724, "grad_norm": 21.875, "learning_rate": 4.315857736131508e-06, "loss": 4.528190612792969, "step": 2060 }, { "epoch": 1.4049079754601226, "grad_norm": 18.625, "learning_rate": 4.306776065087846e-06, "loss": 4.514669418334961, "step": 2061 }, { "epoch": 1.4055896387184732, "grad_norm": 27.25, "learning_rate": 4.297701336185119e-06, "loss": 8.252084732055664, "step": 2062 }, { "epoch": 1.4062713019768234, "grad_norm": 17.5, "learning_rate": 4.28863356048877e-06, "loss": 5.96700382232666, "step": 2063 }, { "epoch": 1.4069529652351738, "grad_norm": 35.25, "learning_rate": 4.279572749055765e-06, "loss": 2.980764627456665, "step": 2064 }, { "epoch": 1.4076346284935242, "grad_norm": 30.25, "learning_rate": 4.2705189129345814e-06, "loss": 4.206424713134766, "step": 2065 }, { "epoch": 1.4083162917518746, "grad_norm": 16.0, "learning_rate": 4.261472063165179e-06, "loss": 4.689609050750732, "step": 2066 }, { "epoch": 1.408997955010225, "grad_norm": 20.25, "learning_rate": 4.2524322107790135e-06, "loss": 4.255527496337891, "step": 2067 }, { "epoch": 1.4096796182685754, "grad_norm": 18.375, "learning_rate": 4.243399366798994e-06, "loss": 4.320568561553955, "step": 2068 }, { "epoch": 1.4103612815269257, "grad_norm": 33.0, "learning_rate": 4.234373542239495e-06, "loss": 7.334473609924316, "step": 2069 }, { "epoch": 1.4110429447852761, "grad_norm": 16.5, "learning_rate": 4.225354748106328e-06, "loss": 3.613032579421997, "step": 2070 }, { "epoch": 1.4117246080436265, "grad_norm": 44.25, "learning_rate": 4.216342995396728e-06, "loss": 6.872384071350098, "step": 2071 }, { "epoch": 1.4124062713019767, "grad_norm": 14.5625, "learning_rate": 4.207338295099353e-06, "loss": 4.652342319488525, "step": 2072 }, { "epoch": 1.4130879345603273, "grad_norm": 15.375, "learning_rate": 4.198340658194251e-06, "loss": 3.2875070571899414, "step": 2073 }, { "epoch": 1.4137695978186775, "grad_norm": 17.75, "learning_rate": 4.189350095652857e-06, "loss": 3.3227360248565674, "step": 2074 }, { "epoch": 1.414451261077028, "grad_norm": 20.375, "learning_rate": 4.180366618437996e-06, "loss": 3.457723379135132, "step": 2075 }, { "epoch": 1.4151329243353783, "grad_norm": 20.125, "learning_rate": 4.171390237503832e-06, "loss": 5.05181360244751, "step": 2076 }, { "epoch": 1.4158145875937287, "grad_norm": 17.5, "learning_rate": 4.1624209637958875e-06, "loss": 3.1756691932678223, "step": 2077 }, { "epoch": 1.416496250852079, "grad_norm": 18.75, "learning_rate": 4.153458808251026e-06, "loss": 5.87408971786499, "step": 2078 }, { "epoch": 1.4171779141104295, "grad_norm": 53.75, "learning_rate": 4.144503781797409e-06, "loss": 7.327788829803467, "step": 2079 }, { "epoch": 1.4178595773687799, "grad_norm": 18.875, "learning_rate": 4.13555589535453e-06, "loss": 4.677674293518066, "step": 2080 }, { "epoch": 1.4185412406271303, "grad_norm": 16.875, "learning_rate": 4.126615159833156e-06, "loss": 5.880982398986816, "step": 2081 }, { "epoch": 1.4192229038854807, "grad_norm": 34.5, "learning_rate": 4.11768158613535e-06, "loss": 8.551815032958984, "step": 2082 }, { "epoch": 1.4199045671438308, "grad_norm": 29.0, "learning_rate": 4.108755185154428e-06, "loss": 5.5577497482299805, "step": 2083 }, { "epoch": 1.4205862304021815, "grad_norm": 16.75, "learning_rate": 4.099835967774974e-06, "loss": 6.28212833404541, "step": 2084 }, { "epoch": 1.4212678936605316, "grad_norm": 20.125, "learning_rate": 4.0909239448727985e-06, "loss": 4.965356349945068, "step": 2085 }, { "epoch": 1.421949556918882, "grad_norm": 13.6875, "learning_rate": 4.082019127314954e-06, "loss": 2.525484085083008, "step": 2086 }, { "epoch": 1.4226312201772324, "grad_norm": 36.0, "learning_rate": 4.07312152595969e-06, "loss": 7.202946186065674, "step": 2087 }, { "epoch": 1.4233128834355828, "grad_norm": 22.625, "learning_rate": 4.064231151656476e-06, "loss": 6.155373573303223, "step": 2088 }, { "epoch": 1.4239945466939332, "grad_norm": 44.0, "learning_rate": 4.05534801524595e-06, "loss": 8.27874755859375, "step": 2089 }, { "epoch": 1.4246762099522836, "grad_norm": 20.5, "learning_rate": 4.046472127559937e-06, "loss": 6.827492713928223, "step": 2090 }, { "epoch": 1.425357873210634, "grad_norm": 18.625, "learning_rate": 4.037603499421423e-06, "loss": 2.969090461730957, "step": 2091 }, { "epoch": 1.4260395364689844, "grad_norm": 19.75, "learning_rate": 4.028742141644532e-06, "loss": 3.6945269107818604, "step": 2092 }, { "epoch": 1.4267211997273348, "grad_norm": 31.25, "learning_rate": 4.019888065034538e-06, "loss": 8.212724685668945, "step": 2093 }, { "epoch": 1.427402862985685, "grad_norm": 23.125, "learning_rate": 4.011041280387822e-06, "loss": 6.328047752380371, "step": 2094 }, { "epoch": 1.4280845262440354, "grad_norm": 51.0, "learning_rate": 4.002201798491875e-06, "loss": 9.1661376953125, "step": 2095 }, { "epoch": 1.4287661895023858, "grad_norm": 22.875, "learning_rate": 3.9933696301252956e-06, "loss": 4.933104038238525, "step": 2096 }, { "epoch": 1.4294478527607362, "grad_norm": 25.875, "learning_rate": 3.98454478605775e-06, "loss": 7.272467613220215, "step": 2097 }, { "epoch": 1.4301295160190866, "grad_norm": 20.875, "learning_rate": 3.975727277049982e-06, "loss": 6.0666093826293945, "step": 2098 }, { "epoch": 1.430811179277437, "grad_norm": 16.0, "learning_rate": 3.966917113853795e-06, "loss": 2.3959543704986572, "step": 2099 }, { "epoch": 1.4314928425357873, "grad_norm": 18.125, "learning_rate": 3.958114307212018e-06, "loss": 5.880310535430908, "step": 2100 }, { "epoch": 1.4321745057941377, "grad_norm": 34.0, "learning_rate": 3.949318867858527e-06, "loss": 6.554533004760742, "step": 2101 }, { "epoch": 1.4328561690524881, "grad_norm": 29.75, "learning_rate": 3.940530806518211e-06, "loss": 6.45911979675293, "step": 2102 }, { "epoch": 1.4335378323108383, "grad_norm": 19.125, "learning_rate": 3.931750133906954e-06, "loss": 6.430793285369873, "step": 2103 }, { "epoch": 1.434219495569189, "grad_norm": 53.75, "learning_rate": 3.922976860731642e-06, "loss": 8.09473991394043, "step": 2104 }, { "epoch": 1.434901158827539, "grad_norm": 18.0, "learning_rate": 3.91421099769013e-06, "loss": 4.18020486831665, "step": 2105 }, { "epoch": 1.4355828220858895, "grad_norm": 21.0, "learning_rate": 3.905452555471235e-06, "loss": 7.774521350860596, "step": 2106 }, { "epoch": 1.43626448534424, "grad_norm": 15.75, "learning_rate": 3.8967015447547385e-06, "loss": 4.005303382873535, "step": 2107 }, { "epoch": 1.4369461486025903, "grad_norm": 47.75, "learning_rate": 3.887957976211345e-06, "loss": 6.144782066345215, "step": 2108 }, { "epoch": 1.4376278118609407, "grad_norm": 27.875, "learning_rate": 3.879221860502693e-06, "loss": 4.33512020111084, "step": 2109 }, { "epoch": 1.438309475119291, "grad_norm": 18.875, "learning_rate": 3.870493208281337e-06, "loss": 6.663443565368652, "step": 2110 }, { "epoch": 1.4389911383776415, "grad_norm": 32.75, "learning_rate": 3.861772030190717e-06, "loss": 5.346297264099121, "step": 2111 }, { "epoch": 1.4396728016359919, "grad_norm": 22.25, "learning_rate": 3.853058336865174e-06, "loss": 3.926506280899048, "step": 2112 }, { "epoch": 1.4403544648943423, "grad_norm": 23.625, "learning_rate": 3.844352138929911e-06, "loss": 4.05122184753418, "step": 2113 }, { "epoch": 1.4410361281526924, "grad_norm": 27.625, "learning_rate": 3.835653447000997e-06, "loss": 3.5555787086486816, "step": 2114 }, { "epoch": 1.441717791411043, "grad_norm": 23.0, "learning_rate": 3.826962271685351e-06, "loss": 6.6555585861206055, "step": 2115 }, { "epoch": 1.4423994546693932, "grad_norm": 32.75, "learning_rate": 3.818278623580724e-06, "loss": 8.088645935058594, "step": 2116 }, { "epoch": 1.4430811179277436, "grad_norm": 24.5, "learning_rate": 3.8096025132756775e-06, "loss": 3.694427251815796, "step": 2117 }, { "epoch": 1.443762781186094, "grad_norm": 23.625, "learning_rate": 3.8009339513496034e-06, "loss": 8.195084571838379, "step": 2118 }, { "epoch": 1.4444444444444444, "grad_norm": 23.25, "learning_rate": 3.7922729483726685e-06, "loss": 7.882268905639648, "step": 2119 }, { "epoch": 1.4451261077027948, "grad_norm": 42.0, "learning_rate": 3.7836195149058386e-06, "loss": 7.353115081787109, "step": 2120 }, { "epoch": 1.4458077709611452, "grad_norm": 19.25, "learning_rate": 3.774973661500837e-06, "loss": 6.360138893127441, "step": 2121 }, { "epoch": 1.4464894342194956, "grad_norm": 19.5, "learning_rate": 3.7663353987001516e-06, "loss": 5.945451736450195, "step": 2122 }, { "epoch": 1.447171097477846, "grad_norm": 54.0, "learning_rate": 3.7577047370370168e-06, "loss": 4.472362518310547, "step": 2123 }, { "epoch": 1.4478527607361964, "grad_norm": 12.8125, "learning_rate": 3.7490816870353865e-06, "loss": 4.3732709884643555, "step": 2124 }, { "epoch": 1.4485344239945466, "grad_norm": 23.125, "learning_rate": 3.7404662592099483e-06, "loss": 3.714017391204834, "step": 2125 }, { "epoch": 1.4492160872528972, "grad_norm": 13.625, "learning_rate": 3.7318584640660816e-06, "loss": 3.9876418113708496, "step": 2126 }, { "epoch": 1.4498977505112474, "grad_norm": 35.5, "learning_rate": 3.72325831209987e-06, "loss": 7.448818206787109, "step": 2127 }, { "epoch": 1.4505794137695978, "grad_norm": 18.375, "learning_rate": 3.7146658137980716e-06, "loss": 5.174450874328613, "step": 2128 }, { "epoch": 1.4512610770279482, "grad_norm": 14.75, "learning_rate": 3.706080979638107e-06, "loss": 5.639563083648682, "step": 2129 }, { "epoch": 1.4519427402862985, "grad_norm": 21.375, "learning_rate": 3.697503820088063e-06, "loss": 2.3522891998291016, "step": 2130 }, { "epoch": 1.452624403544649, "grad_norm": 48.5, "learning_rate": 3.6889343456066627e-06, "loss": 7.424954891204834, "step": 2131 }, { "epoch": 1.4533060668029993, "grad_norm": 26.625, "learning_rate": 3.6803725666432544e-06, "loss": 5.225255966186523, "step": 2132 }, { "epoch": 1.4539877300613497, "grad_norm": 19.0, "learning_rate": 3.6718184936378097e-06, "loss": 4.173942565917969, "step": 2133 }, { "epoch": 1.4546693933197001, "grad_norm": 33.5, "learning_rate": 3.6632721370208956e-06, "loss": 6.068643093109131, "step": 2134 }, { "epoch": 1.4553510565780505, "grad_norm": 70.5, "learning_rate": 3.654733507213678e-06, "loss": 5.94393253326416, "step": 2135 }, { "epoch": 1.4560327198364007, "grad_norm": 20.875, "learning_rate": 3.6462026146279007e-06, "loss": 9.34310531616211, "step": 2136 }, { "epoch": 1.4567143830947513, "grad_norm": 21.5, "learning_rate": 3.6376794696658656e-06, "loss": 5.906853199005127, "step": 2137 }, { "epoch": 1.4573960463531015, "grad_norm": 16.375, "learning_rate": 3.62916408272043e-06, "loss": 3.5065953731536865, "step": 2138 }, { "epoch": 1.4580777096114519, "grad_norm": 12.1875, "learning_rate": 3.6206564641749974e-06, "loss": 5.16298770904541, "step": 2139 }, { "epoch": 1.4587593728698023, "grad_norm": 25.875, "learning_rate": 3.61215662440349e-06, "loss": 6.929368019104004, "step": 2140 }, { "epoch": 1.4594410361281527, "grad_norm": 36.5, "learning_rate": 3.6036645737703557e-06, "loss": 3.09114670753479, "step": 2141 }, { "epoch": 1.460122699386503, "grad_norm": 29.625, "learning_rate": 3.595180322630529e-06, "loss": 5.234488487243652, "step": 2142 }, { "epoch": 1.4608043626448535, "grad_norm": 50.0, "learning_rate": 3.5867038813294487e-06, "loss": 6.952371597290039, "step": 2143 }, { "epoch": 1.4614860259032039, "grad_norm": 14.0, "learning_rate": 3.578235260203027e-06, "loss": 3.870906352996826, "step": 2144 }, { "epoch": 1.4621676891615543, "grad_norm": 17.625, "learning_rate": 3.5697744695776326e-06, "loss": 3.4291768074035645, "step": 2145 }, { "epoch": 1.4628493524199047, "grad_norm": 12.8125, "learning_rate": 3.5613215197700935e-06, "loss": 4.2776994705200195, "step": 2146 }, { "epoch": 1.4635310156782548, "grad_norm": 36.75, "learning_rate": 3.5528764210876787e-06, "loss": 7.719753265380859, "step": 2147 }, { "epoch": 1.4642126789366054, "grad_norm": 20.875, "learning_rate": 3.5444391838280766e-06, "loss": 2.4932122230529785, "step": 2148 }, { "epoch": 1.4648943421949556, "grad_norm": 33.5, "learning_rate": 3.5360098182793933e-06, "loss": 2.5127413272857666, "step": 2149 }, { "epoch": 1.465576005453306, "grad_norm": 25.625, "learning_rate": 3.5275883347201336e-06, "loss": 4.794689178466797, "step": 2150 }, { "epoch": 1.4662576687116564, "grad_norm": 30.5, "learning_rate": 3.5191747434191946e-06, "loss": 2.205981731414795, "step": 2151 }, { "epoch": 1.4669393319700068, "grad_norm": 47.25, "learning_rate": 3.5107690546358555e-06, "loss": 6.565275192260742, "step": 2152 }, { "epoch": 1.4676209952283572, "grad_norm": 17.25, "learning_rate": 3.502371278619743e-06, "loss": 4.750123023986816, "step": 2153 }, { "epoch": 1.4683026584867076, "grad_norm": 30.875, "learning_rate": 3.4939814256108494e-06, "loss": 8.548187255859375, "step": 2154 }, { "epoch": 1.468984321745058, "grad_norm": 25.875, "learning_rate": 3.4855995058395066e-06, "loss": 5.708867073059082, "step": 2155 }, { "epoch": 1.4696659850034084, "grad_norm": 20.75, "learning_rate": 3.4772255295263604e-06, "loss": 3.516817331314087, "step": 2156 }, { "epoch": 1.4703476482617588, "grad_norm": 34.75, "learning_rate": 3.4688595068823837e-06, "loss": 7.727801322937012, "step": 2157 }, { "epoch": 1.471029311520109, "grad_norm": 18.375, "learning_rate": 3.4605014481088394e-06, "loss": 3.20235538482666, "step": 2158 }, { "epoch": 1.4717109747784596, "grad_norm": 24.375, "learning_rate": 3.4521513633972936e-06, "loss": 6.695145606994629, "step": 2159 }, { "epoch": 1.4723926380368098, "grad_norm": 28.0, "learning_rate": 3.443809262929575e-06, "loss": 3.8071537017822266, "step": 2160 }, { "epoch": 1.4730743012951601, "grad_norm": 15.3125, "learning_rate": 3.435475156877781e-06, "loss": 4.690737247467041, "step": 2161 }, { "epoch": 1.4737559645535105, "grad_norm": 18.5, "learning_rate": 3.4271490554042663e-06, "loss": 7.756416320800781, "step": 2162 }, { "epoch": 1.474437627811861, "grad_norm": 23.75, "learning_rate": 3.418830968661623e-06, "loss": 5.844024658203125, "step": 2163 }, { "epoch": 1.4751192910702113, "grad_norm": 26.0, "learning_rate": 3.4105209067926615e-06, "loss": 4.657769203186035, "step": 2164 }, { "epoch": 1.4758009543285617, "grad_norm": 19.625, "learning_rate": 3.4022188799304214e-06, "loss": 4.406302452087402, "step": 2165 }, { "epoch": 1.4764826175869121, "grad_norm": 41.5, "learning_rate": 3.393924898198131e-06, "loss": 5.365929126739502, "step": 2166 }, { "epoch": 1.4771642808452625, "grad_norm": 17.875, "learning_rate": 3.3856389717092176e-06, "loss": 5.304364204406738, "step": 2167 }, { "epoch": 1.477845944103613, "grad_norm": 17.375, "learning_rate": 3.3773611105672866e-06, "loss": 5.340169906616211, "step": 2168 }, { "epoch": 1.478527607361963, "grad_norm": 15.75, "learning_rate": 3.3690913248660983e-06, "loss": 3.8536152839660645, "step": 2169 }, { "epoch": 1.4792092706203135, "grad_norm": 23.25, "learning_rate": 3.36082962468958e-06, "loss": 6.87495756149292, "step": 2170 }, { "epoch": 1.4798909338786639, "grad_norm": 28.125, "learning_rate": 3.3525760201117907e-06, "loss": 3.293588638305664, "step": 2171 }, { "epoch": 1.4805725971370143, "grad_norm": 24.375, "learning_rate": 3.344330521196917e-06, "loss": 2.8505795001983643, "step": 2172 }, { "epoch": 1.4812542603953647, "grad_norm": 15.375, "learning_rate": 3.336093137999269e-06, "loss": 2.063725471496582, "step": 2173 }, { "epoch": 1.481935923653715, "grad_norm": 17.125, "learning_rate": 3.3278638805632525e-06, "loss": 3.3091883659362793, "step": 2174 }, { "epoch": 1.4826175869120655, "grad_norm": 19.0, "learning_rate": 3.3196427589233725e-06, "loss": 4.678814888000488, "step": 2175 }, { "epoch": 1.4832992501704159, "grad_norm": 17.0, "learning_rate": 3.311429783104212e-06, "loss": 3.813445568084717, "step": 2176 }, { "epoch": 1.4839809134287663, "grad_norm": 14.3125, "learning_rate": 3.303224963120414e-06, "loss": 4.357495307922363, "step": 2177 }, { "epoch": 1.4846625766871164, "grad_norm": 42.75, "learning_rate": 3.295028308976683e-06, "loss": 8.124505996704102, "step": 2178 }, { "epoch": 1.485344239945467, "grad_norm": 16.0, "learning_rate": 3.286839830667772e-06, "loss": 3.7614223957061768, "step": 2179 }, { "epoch": 1.4860259032038172, "grad_norm": 30.375, "learning_rate": 3.2786595381784512e-06, "loss": 5.578490734100342, "step": 2180 }, { "epoch": 1.4867075664621676, "grad_norm": 37.5, "learning_rate": 3.2704874414835143e-06, "loss": 7.994753837585449, "step": 2181 }, { "epoch": 1.487389229720518, "grad_norm": 15.125, "learning_rate": 3.26232355054777e-06, "loss": 4.491529941558838, "step": 2182 }, { "epoch": 1.4880708929788684, "grad_norm": 20.0, "learning_rate": 3.2541678753260066e-06, "loss": 1.8571553230285645, "step": 2183 }, { "epoch": 1.4887525562372188, "grad_norm": 26.5, "learning_rate": 3.2460204257630066e-06, "loss": 4.107489585876465, "step": 2184 }, { "epoch": 1.4894342194955692, "grad_norm": 24.25, "learning_rate": 3.2378812117935154e-06, "loss": 3.528583526611328, "step": 2185 }, { "epoch": 1.4901158827539196, "grad_norm": 14.25, "learning_rate": 3.229750243342241e-06, "loss": 2.496750593185425, "step": 2186 }, { "epoch": 1.49079754601227, "grad_norm": 18.0, "learning_rate": 3.2216275303238308e-06, "loss": 3.835811138153076, "step": 2187 }, { "epoch": 1.4914792092706204, "grad_norm": 38.5, "learning_rate": 3.2135130826428705e-06, "loss": 8.405109405517578, "step": 2188 }, { "epoch": 1.4921608725289706, "grad_norm": 13.8125, "learning_rate": 3.205406910193871e-06, "loss": 5.300267219543457, "step": 2189 }, { "epoch": 1.4928425357873212, "grad_norm": 36.75, "learning_rate": 3.1973090228612404e-06, "loss": 6.186171531677246, "step": 2190 }, { "epoch": 1.4935241990456714, "grad_norm": 15.75, "learning_rate": 3.1892194305192992e-06, "loss": 5.825161933898926, "step": 2191 }, { "epoch": 1.4942058623040217, "grad_norm": 16.625, "learning_rate": 3.1811381430322418e-06, "loss": 4.419179916381836, "step": 2192 }, { "epoch": 1.4948875255623721, "grad_norm": 15.375, "learning_rate": 3.173065170254138e-06, "loss": 3.8430569171905518, "step": 2193 }, { "epoch": 1.4955691888207225, "grad_norm": 41.25, "learning_rate": 3.1650005220289247e-06, "loss": 7.075680732727051, "step": 2194 }, { "epoch": 1.496250852079073, "grad_norm": 25.75, "learning_rate": 3.15694420819038e-06, "loss": 5.739487648010254, "step": 2195 }, { "epoch": 1.4969325153374233, "grad_norm": 12.6875, "learning_rate": 3.148896238562126e-06, "loss": 3.2386536598205566, "step": 2196 }, { "epoch": 1.4976141785957737, "grad_norm": 27.625, "learning_rate": 3.14085662295761e-06, "loss": 4.902383327484131, "step": 2197 }, { "epoch": 1.4982958418541241, "grad_norm": 11.25, "learning_rate": 3.132825371180085e-06, "loss": 4.266819000244141, "step": 2198 }, { "epoch": 1.4989775051124745, "grad_norm": 19.875, "learning_rate": 3.124802493022613e-06, "loss": 5.791576385498047, "step": 2199 }, { "epoch": 1.4996591683708247, "grad_norm": 28.25, "learning_rate": 3.116787998268046e-06, "loss": 5.358956336975098, "step": 2200 }, { "epoch": 1.5003408316291753, "grad_norm": 12.75, "learning_rate": 3.108781896689006e-06, "loss": 4.018437385559082, "step": 2201 }, { "epoch": 1.5010224948875255, "grad_norm": 21.75, "learning_rate": 3.1007841980478905e-06, "loss": 5.192237377166748, "step": 2202 }, { "epoch": 1.501704158145876, "grad_norm": 19.0, "learning_rate": 3.0927949120968426e-06, "loss": 6.431877136230469, "step": 2203 }, { "epoch": 1.5023858214042263, "grad_norm": 15.5625, "learning_rate": 3.0848140485777466e-06, "loss": 5.253396987915039, "step": 2204 }, { "epoch": 1.5030674846625767, "grad_norm": 22.25, "learning_rate": 3.076841617222228e-06, "loss": 3.050384998321533, "step": 2205 }, { "epoch": 1.503749147920927, "grad_norm": 12.6875, "learning_rate": 3.068877627751614e-06, "loss": 3.302441120147705, "step": 2206 }, { "epoch": 1.5044308111792775, "grad_norm": 14.5625, "learning_rate": 3.060922089876952e-06, "loss": 4.430276393890381, "step": 2207 }, { "epoch": 1.5051124744376279, "grad_norm": 13.375, "learning_rate": 3.05297501329898e-06, "loss": 3.651062488555908, "step": 2208 }, { "epoch": 1.505794137695978, "grad_norm": 16.625, "learning_rate": 3.045036407708112e-06, "loss": 4.898688316345215, "step": 2209 }, { "epoch": 1.5064758009543286, "grad_norm": 15.75, "learning_rate": 3.0371062827844434e-06, "loss": 4.009773254394531, "step": 2210 }, { "epoch": 1.5071574642126788, "grad_norm": 16.25, "learning_rate": 3.0291846481977173e-06, "loss": 3.5872960090637207, "step": 2211 }, { "epoch": 1.5078391274710294, "grad_norm": 14.5625, "learning_rate": 3.0212715136073325e-06, "loss": 4.186932563781738, "step": 2212 }, { "epoch": 1.5085207907293796, "grad_norm": 22.25, "learning_rate": 3.0133668886623226e-06, "loss": 3.3877928256988525, "step": 2213 }, { "epoch": 1.50920245398773, "grad_norm": 31.5, "learning_rate": 3.00547078300134e-06, "loss": 8.482763290405273, "step": 2214 }, { "epoch": 1.5098841172460804, "grad_norm": 23.125, "learning_rate": 2.997583206252647e-06, "loss": 4.092954158782959, "step": 2215 }, { "epoch": 1.5105657805044308, "grad_norm": 50.5, "learning_rate": 2.9897041680341187e-06, "loss": 11.02216911315918, "step": 2216 }, { "epoch": 1.5112474437627812, "grad_norm": 20.5, "learning_rate": 2.981833677953203e-06, "loss": 3.2554116249084473, "step": 2217 }, { "epoch": 1.5119291070211316, "grad_norm": 33.0, "learning_rate": 2.973971745606938e-06, "loss": 7.751379489898682, "step": 2218 }, { "epoch": 1.512610770279482, "grad_norm": 20.75, "learning_rate": 2.966118380581914e-06, "loss": 3.6639833450317383, "step": 2219 }, { "epoch": 1.5132924335378322, "grad_norm": 15.1875, "learning_rate": 2.958273592454285e-06, "loss": 3.392359733581543, "step": 2220 }, { "epoch": 1.5139740967961828, "grad_norm": 11.0625, "learning_rate": 2.9504373907897456e-06, "loss": 3.871924877166748, "step": 2221 }, { "epoch": 1.514655760054533, "grad_norm": 24.875, "learning_rate": 2.9426097851435096e-06, "loss": 4.082754611968994, "step": 2222 }, { "epoch": 1.5153374233128836, "grad_norm": 44.25, "learning_rate": 2.934790785060325e-06, "loss": 6.972023963928223, "step": 2223 }, { "epoch": 1.5160190865712337, "grad_norm": 14.375, "learning_rate": 2.926980400074432e-06, "loss": 3.6981444358825684, "step": 2224 }, { "epoch": 1.5167007498295841, "grad_norm": 21.75, "learning_rate": 2.9191786397095778e-06, "loss": 4.682798385620117, "step": 2225 }, { "epoch": 1.5173824130879345, "grad_norm": 16.5, "learning_rate": 2.9113855134789836e-06, "loss": 3.74405837059021, "step": 2226 }, { "epoch": 1.518064076346285, "grad_norm": 27.25, "learning_rate": 2.9036010308853444e-06, "loss": 2.607114791870117, "step": 2227 }, { "epoch": 1.5187457396046353, "grad_norm": 41.75, "learning_rate": 2.89582520142082e-06, "loss": 4.414623260498047, "step": 2228 }, { "epoch": 1.5194274028629857, "grad_norm": 13.8125, "learning_rate": 2.8880580345670174e-06, "loss": 1.2222378253936768, "step": 2229 }, { "epoch": 1.5201090661213361, "grad_norm": 15.25, "learning_rate": 2.880299539794975e-06, "loss": 5.577733993530273, "step": 2230 }, { "epoch": 1.5207907293796863, "grad_norm": 17.5, "learning_rate": 2.8725497265651647e-06, "loss": 2.9192323684692383, "step": 2231 }, { "epoch": 1.521472392638037, "grad_norm": 31.25, "learning_rate": 2.8648086043274692e-06, "loss": 5.168089389801025, "step": 2232 }, { "epoch": 1.522154055896387, "grad_norm": 10.4375, "learning_rate": 2.857076182521169e-06, "loss": 2.6286838054656982, "step": 2233 }, { "epoch": 1.5228357191547377, "grad_norm": 12.1875, "learning_rate": 2.8493524705749464e-06, "loss": 3.412076473236084, "step": 2234 }, { "epoch": 1.5235173824130879, "grad_norm": 16.5, "learning_rate": 2.841637477906851e-06, "loss": 4.652256011962891, "step": 2235 }, { "epoch": 1.5241990456714383, "grad_norm": 23.25, "learning_rate": 2.8339312139243058e-06, "loss": 4.673931121826172, "step": 2236 }, { "epoch": 1.5248807089297887, "grad_norm": 14.6875, "learning_rate": 2.8262336880240947e-06, "loss": 3.7349014282226562, "step": 2237 }, { "epoch": 1.525562372188139, "grad_norm": 20.375, "learning_rate": 2.8185449095923356e-06, "loss": 4.008641242980957, "step": 2238 }, { "epoch": 1.5262440354464895, "grad_norm": 16.0, "learning_rate": 2.8108648880044954e-06, "loss": 1.822892427444458, "step": 2239 }, { "epoch": 1.5269256987048399, "grad_norm": 19.0, "learning_rate": 2.803193632625346e-06, "loss": 4.190914154052734, "step": 2240 }, { "epoch": 1.5276073619631902, "grad_norm": 12.0, "learning_rate": 2.7955311528089835e-06, "loss": 4.438333511352539, "step": 2241 }, { "epoch": 1.5282890252215404, "grad_norm": 12.3125, "learning_rate": 2.7878774578988e-06, "loss": 3.2085747718811035, "step": 2242 }, { "epoch": 1.528970688479891, "grad_norm": 14.625, "learning_rate": 2.780232557227469e-06, "loss": 2.9508683681488037, "step": 2243 }, { "epoch": 1.5296523517382412, "grad_norm": 18.0, "learning_rate": 2.7725964601169475e-06, "loss": 6.79923677444458, "step": 2244 }, { "epoch": 1.5303340149965918, "grad_norm": 31.0, "learning_rate": 2.7649691758784603e-06, "loss": 3.5884528160095215, "step": 2245 }, { "epoch": 1.531015678254942, "grad_norm": 18.625, "learning_rate": 2.7573507138124777e-06, "loss": 6.3188066482543945, "step": 2246 }, { "epoch": 1.5316973415132924, "grad_norm": 12.6875, "learning_rate": 2.7497410832087167e-06, "loss": 4.223536014556885, "step": 2247 }, { "epoch": 1.5323790047716428, "grad_norm": 23.625, "learning_rate": 2.7421402933461226e-06, "loss": 5.774472236633301, "step": 2248 }, { "epoch": 1.5330606680299932, "grad_norm": 57.5, "learning_rate": 2.7345483534928654e-06, "loss": 7.487559795379639, "step": 2249 }, { "epoch": 1.5337423312883436, "grad_norm": 14.5, "learning_rate": 2.7269652729063233e-06, "loss": 4.32108211517334, "step": 2250 }, { "epoch": 1.534423994546694, "grad_norm": 12.5625, "learning_rate": 2.7193910608330666e-06, "loss": 3.387157917022705, "step": 2251 }, { "epoch": 1.5351056578050444, "grad_norm": 23.25, "learning_rate": 2.7118257265088554e-06, "loss": 5.037228584289551, "step": 2252 }, { "epoch": 1.5357873210633946, "grad_norm": 11.0, "learning_rate": 2.704269279158629e-06, "loss": 4.52142333984375, "step": 2253 }, { "epoch": 1.5364689843217452, "grad_norm": 11.9375, "learning_rate": 2.6967217279964776e-06, "loss": 1.8358433246612549, "step": 2254 }, { "epoch": 1.5371506475800953, "grad_norm": 45.25, "learning_rate": 2.689183082225659e-06, "loss": 9.48846435546875, "step": 2255 }, { "epoch": 1.537832310838446, "grad_norm": 34.5, "learning_rate": 2.681653351038557e-06, "loss": 5.890583038330078, "step": 2256 }, { "epoch": 1.5385139740967961, "grad_norm": 22.125, "learning_rate": 2.6741325436166996e-06, "loss": 4.193048477172852, "step": 2257 }, { "epoch": 1.5391956373551465, "grad_norm": 15.0, "learning_rate": 2.666620669130723e-06, "loss": 3.479093313217163, "step": 2258 }, { "epoch": 1.539877300613497, "grad_norm": 11.4375, "learning_rate": 2.659117736740371e-06, "loss": 2.619184970855713, "step": 2259 }, { "epoch": 1.5405589638718473, "grad_norm": 22.5, "learning_rate": 2.65162375559449e-06, "loss": 3.562098264694214, "step": 2260 }, { "epoch": 1.5412406271301977, "grad_norm": 32.5, "learning_rate": 2.644138734831011e-06, "loss": 6.374241352081299, "step": 2261 }, { "epoch": 1.5419222903885481, "grad_norm": 28.875, "learning_rate": 2.6366626835769327e-06, "loss": 6.187280654907227, "step": 2262 }, { "epoch": 1.5426039536468985, "grad_norm": 21.125, "learning_rate": 2.6291956109483242e-06, "loss": 2.791496753692627, "step": 2263 }, { "epoch": 1.5432856169052487, "grad_norm": 40.25, "learning_rate": 2.6217375260502974e-06, "loss": 8.624463081359863, "step": 2264 }, { "epoch": 1.5439672801635993, "grad_norm": 14.1875, "learning_rate": 2.614288437977014e-06, "loss": 4.0031609535217285, "step": 2265 }, { "epoch": 1.5446489434219495, "grad_norm": 13.5625, "learning_rate": 2.606848355811662e-06, "loss": 2.4029014110565186, "step": 2266 }, { "epoch": 1.5453306066803, "grad_norm": 18.375, "learning_rate": 2.5994172886264436e-06, "loss": 5.623987197875977, "step": 2267 }, { "epoch": 1.5460122699386503, "grad_norm": 11.75, "learning_rate": 2.5919952454825758e-06, "loss": 3.1451828479766846, "step": 2268 }, { "epoch": 1.5466939331970007, "grad_norm": 23.125, "learning_rate": 2.5845822354302663e-06, "loss": 3.401416063308716, "step": 2269 }, { "epoch": 1.547375596455351, "grad_norm": 22.625, "learning_rate": 2.5771782675087078e-06, "loss": 2.7605183124542236, "step": 2270 }, { "epoch": 1.5480572597137015, "grad_norm": 16.5, "learning_rate": 2.5697833507460733e-06, "loss": 3.207418918609619, "step": 2271 }, { "epoch": 1.5487389229720518, "grad_norm": 16.0, "learning_rate": 2.562397494159491e-06, "loss": 3.825110912322998, "step": 2272 }, { "epoch": 1.549420586230402, "grad_norm": 20.5, "learning_rate": 2.555020706755048e-06, "loss": 5.863574504852295, "step": 2273 }, { "epoch": 1.5501022494887526, "grad_norm": 12.6875, "learning_rate": 2.547652997527773e-06, "loss": 3.1473796367645264, "step": 2274 }, { "epoch": 1.5507839127471028, "grad_norm": 38.75, "learning_rate": 2.5402943754616182e-06, "loss": 5.00313663482666, "step": 2275 }, { "epoch": 1.5514655760054534, "grad_norm": 15.375, "learning_rate": 2.5329448495294607e-06, "loss": 5.417040824890137, "step": 2276 }, { "epoch": 1.5521472392638036, "grad_norm": 22.625, "learning_rate": 2.525604428693088e-06, "loss": 6.070529937744141, "step": 2277 }, { "epoch": 1.5528289025221542, "grad_norm": 33.5, "learning_rate": 2.5182731219031784e-06, "loss": 4.1323652267456055, "step": 2278 }, { "epoch": 1.5535105657805044, "grad_norm": 38.75, "learning_rate": 2.5109509380992992e-06, "loss": 7.636715888977051, "step": 2279 }, { "epoch": 1.5541922290388548, "grad_norm": 11.8125, "learning_rate": 2.5036378862099e-06, "loss": 3.719938278198242, "step": 2280 }, { "epoch": 1.5548738922972052, "grad_norm": 24.625, "learning_rate": 2.496333975152282e-06, "loss": 5.40027379989624, "step": 2281 }, { "epoch": 1.5555555555555556, "grad_norm": 81.5, "learning_rate": 2.4890392138326157e-06, "loss": 12.607975959777832, "step": 2282 }, { "epoch": 1.556237218813906, "grad_norm": 9.875, "learning_rate": 2.481753611145902e-06, "loss": 2.5586671829223633, "step": 2283 }, { "epoch": 1.5569188820722561, "grad_norm": 31.125, "learning_rate": 2.47447717597598e-06, "loss": 4.705672264099121, "step": 2284 }, { "epoch": 1.5576005453306068, "grad_norm": 19.375, "learning_rate": 2.467209917195513e-06, "loss": 3.1570982933044434, "step": 2285 }, { "epoch": 1.558282208588957, "grad_norm": 14.5625, "learning_rate": 2.4599518436659666e-06, "loss": 3.0715770721435547, "step": 2286 }, { "epoch": 1.5589638718473076, "grad_norm": 80.5, "learning_rate": 2.4527029642376156e-06, "loss": 5.599830627441406, "step": 2287 }, { "epoch": 1.5596455351056577, "grad_norm": 28.125, "learning_rate": 2.4454632877495132e-06, "loss": 4.243928909301758, "step": 2288 }, { "epoch": 1.5603271983640081, "grad_norm": 13.9375, "learning_rate": 2.4382328230295025e-06, "loss": 3.956549644470215, "step": 2289 }, { "epoch": 1.5610088616223585, "grad_norm": 28.5, "learning_rate": 2.4310115788941855e-06, "loss": 4.519423484802246, "step": 2290 }, { "epoch": 1.561690524880709, "grad_norm": 25.5, "learning_rate": 2.423799564148922e-06, "loss": 5.553188323974609, "step": 2291 }, { "epoch": 1.5623721881390593, "grad_norm": 28.25, "learning_rate": 2.4165967875878203e-06, "loss": 5.329846382141113, "step": 2292 }, { "epoch": 1.5630538513974097, "grad_norm": 11.5625, "learning_rate": 2.4094032579937264e-06, "loss": 3.48065185546875, "step": 2293 }, { "epoch": 1.56373551465576, "grad_norm": 40.5, "learning_rate": 2.402218984138204e-06, "loss": 6.155162811279297, "step": 2294 }, { "epoch": 1.5644171779141103, "grad_norm": 31.375, "learning_rate": 2.3950439747815357e-06, "loss": 5.613640308380127, "step": 2295 }, { "epoch": 1.565098841172461, "grad_norm": 38.25, "learning_rate": 2.387878238672704e-06, "loss": 5.384792327880859, "step": 2296 }, { "epoch": 1.565780504430811, "grad_norm": 11.25, "learning_rate": 2.3807217845493857e-06, "loss": 4.698662281036377, "step": 2297 }, { "epoch": 1.5664621676891617, "grad_norm": 14.1875, "learning_rate": 2.373574621137943e-06, "loss": 4.595712661743164, "step": 2298 }, { "epoch": 1.5671438309475119, "grad_norm": 18.875, "learning_rate": 2.3664367571534008e-06, "loss": 4.426677227020264, "step": 2299 }, { "epoch": 1.5678254942058623, "grad_norm": 26.75, "learning_rate": 2.359308201299454e-06, "loss": 5.919748306274414, "step": 2300 }, { "epoch": 1.5685071574642127, "grad_norm": 22.375, "learning_rate": 2.35218896226844e-06, "loss": 2.2974636554718018, "step": 2301 }, { "epoch": 1.569188820722563, "grad_norm": 15.0625, "learning_rate": 2.3450790487413355e-06, "loss": 3.3982105255126953, "step": 2302 }, { "epoch": 1.5698704839809134, "grad_norm": 22.75, "learning_rate": 2.337978469387756e-06, "loss": 6.239871501922607, "step": 2303 }, { "epoch": 1.5705521472392638, "grad_norm": 56.25, "learning_rate": 2.3308872328659204e-06, "loss": 9.415977478027344, "step": 2304 }, { "epoch": 1.5712338104976142, "grad_norm": 12.25, "learning_rate": 2.3238053478226665e-06, "loss": 2.974834680557251, "step": 2305 }, { "epoch": 1.5719154737559644, "grad_norm": 57.0, "learning_rate": 2.3167328228934292e-06, "loss": 9.53856086730957, "step": 2306 }, { "epoch": 1.572597137014315, "grad_norm": 21.75, "learning_rate": 2.309669666702219e-06, "loss": 3.841061592102051, "step": 2307 }, { "epoch": 1.5732788002726652, "grad_norm": 41.25, "learning_rate": 2.3026158878616366e-06, "loss": 7.2212066650390625, "step": 2308 }, { "epoch": 1.5739604635310158, "grad_norm": 19.375, "learning_rate": 2.295571494972836e-06, "loss": 5.402873516082764, "step": 2309 }, { "epoch": 1.574642126789366, "grad_norm": 22.25, "learning_rate": 2.2885364966255372e-06, "loss": 3.0711703300476074, "step": 2310 }, { "epoch": 1.5753237900477164, "grad_norm": 29.5, "learning_rate": 2.281510901397993e-06, "loss": 3.2769062519073486, "step": 2311 }, { "epoch": 1.5760054533060668, "grad_norm": 30.625, "learning_rate": 2.274494717857003e-06, "loss": 4.809566497802734, "step": 2312 }, { "epoch": 1.5766871165644172, "grad_norm": 15.6875, "learning_rate": 2.2674879545578775e-06, "loss": 4.225295066833496, "step": 2313 }, { "epoch": 1.5773687798227676, "grad_norm": 45.25, "learning_rate": 2.2604906200444545e-06, "loss": 8.472102165222168, "step": 2314 }, { "epoch": 1.578050443081118, "grad_norm": 15.4375, "learning_rate": 2.2535027228490582e-06, "loss": 2.2797021865844727, "step": 2315 }, { "epoch": 1.5787321063394684, "grad_norm": 24.5, "learning_rate": 2.2465242714925218e-06, "loss": 5.215639114379883, "step": 2316 }, { "epoch": 1.5794137695978185, "grad_norm": 33.75, "learning_rate": 2.2395552744841465e-06, "loss": 6.451731204986572, "step": 2317 }, { "epoch": 1.5800954328561692, "grad_norm": 19.75, "learning_rate": 2.2325957403217148e-06, "loss": 6.3792853355407715, "step": 2318 }, { "epoch": 1.5807770961145193, "grad_norm": 27.875, "learning_rate": 2.225645677491468e-06, "loss": 2.5255379676818848, "step": 2319 }, { "epoch": 1.58145875937287, "grad_norm": 50.25, "learning_rate": 2.2187050944680942e-06, "loss": 7.820954322814941, "step": 2320 }, { "epoch": 1.5821404226312201, "grad_norm": 19.0, "learning_rate": 2.21177399971473e-06, "loss": 6.359643936157227, "step": 2321 }, { "epoch": 1.5828220858895705, "grad_norm": 17.875, "learning_rate": 2.2048524016829354e-06, "loss": 6.449435234069824, "step": 2322 }, { "epoch": 1.583503749147921, "grad_norm": 24.875, "learning_rate": 2.197940308812688e-06, "loss": 5.217724800109863, "step": 2323 }, { "epoch": 1.5841854124062713, "grad_norm": 25.0, "learning_rate": 2.191037729532388e-06, "loss": 5.58925724029541, "step": 2324 }, { "epoch": 1.5848670756646217, "grad_norm": 44.5, "learning_rate": 2.18414467225882e-06, "loss": 10.185806274414062, "step": 2325 }, { "epoch": 1.585548738922972, "grad_norm": 16.375, "learning_rate": 2.177261145397166e-06, "loss": 4.192946434020996, "step": 2326 }, { "epoch": 1.5862304021813225, "grad_norm": 15.5625, "learning_rate": 2.170387157340991e-06, "loss": 4.035940170288086, "step": 2327 }, { "epoch": 1.5869120654396727, "grad_norm": 43.0, "learning_rate": 2.163522716472215e-06, "loss": 7.732216835021973, "step": 2328 }, { "epoch": 1.5875937286980233, "grad_norm": 17.0, "learning_rate": 2.156667831161129e-06, "loss": 4.65104341506958, "step": 2329 }, { "epoch": 1.5882753919563735, "grad_norm": 17.75, "learning_rate": 2.1498225097663695e-06, "loss": 4.7881269454956055, "step": 2330 }, { "epoch": 1.588957055214724, "grad_norm": 21.125, "learning_rate": 2.1429867606349053e-06, "loss": 5.47606897354126, "step": 2331 }, { "epoch": 1.5896387184730743, "grad_norm": 47.0, "learning_rate": 2.1361605921020414e-06, "loss": 8.967060089111328, "step": 2332 }, { "epoch": 1.5903203817314246, "grad_norm": 14.1875, "learning_rate": 2.1293440124913935e-06, "loss": 5.703973770141602, "step": 2333 }, { "epoch": 1.591002044989775, "grad_norm": 20.375, "learning_rate": 2.1225370301148863e-06, "loss": 3.6702351570129395, "step": 2334 }, { "epoch": 1.5916837082481254, "grad_norm": 25.875, "learning_rate": 2.115739653272747e-06, "loss": 6.452264785766602, "step": 2335 }, { "epoch": 1.5923653715064758, "grad_norm": 24.75, "learning_rate": 2.108951890253482e-06, "loss": 5.111787796020508, "step": 2336 }, { "epoch": 1.5930470347648262, "grad_norm": 17.875, "learning_rate": 2.102173749333882e-06, "loss": 3.7926650047302246, "step": 2337 }, { "epoch": 1.5937286980231766, "grad_norm": 26.125, "learning_rate": 2.095405238779005e-06, "loss": 7.547939777374268, "step": 2338 }, { "epoch": 1.5944103612815268, "grad_norm": 14.875, "learning_rate": 2.0886463668421562e-06, "loss": 4.4434380531311035, "step": 2339 }, { "epoch": 1.5950920245398774, "grad_norm": 16.125, "learning_rate": 2.0818971417649013e-06, "loss": 2.324268102645874, "step": 2340 }, { "epoch": 1.5957736877982276, "grad_norm": 45.5, "learning_rate": 2.0751575717770324e-06, "loss": 4.112920761108398, "step": 2341 }, { "epoch": 1.5964553510565782, "grad_norm": 22.375, "learning_rate": 2.0684276650965718e-06, "loss": 5.287014007568359, "step": 2342 }, { "epoch": 1.5971370143149284, "grad_norm": 19.5, "learning_rate": 2.0617074299297646e-06, "loss": 3.369626522064209, "step": 2343 }, { "epoch": 1.5978186775732788, "grad_norm": 21.375, "learning_rate": 2.0549968744710546e-06, "loss": 3.606440782546997, "step": 2344 }, { "epoch": 1.5985003408316292, "grad_norm": 18.25, "learning_rate": 2.048296006903081e-06, "loss": 4.143680095672607, "step": 2345 }, { "epoch": 1.5991820040899796, "grad_norm": 19.0, "learning_rate": 2.04160483539668e-06, "loss": 3.456432342529297, "step": 2346 }, { "epoch": 1.59986366734833, "grad_norm": 19.75, "learning_rate": 2.034923368110855e-06, "loss": 2.644862174987793, "step": 2347 }, { "epoch": 1.6005453306066801, "grad_norm": 15.875, "learning_rate": 2.0282516131927833e-06, "loss": 2.5071957111358643, "step": 2348 }, { "epoch": 1.6012269938650308, "grad_norm": 19.125, "learning_rate": 2.021589578777793e-06, "loss": 3.447601318359375, "step": 2349 }, { "epoch": 1.601908657123381, "grad_norm": 33.0, "learning_rate": 2.0149372729893646e-06, "loss": 4.610872268676758, "step": 2350 }, { "epoch": 1.6025903203817315, "grad_norm": 16.875, "learning_rate": 2.0082947039391154e-06, "loss": 7.563458442687988, "step": 2351 }, { "epoch": 1.6032719836400817, "grad_norm": 19.0, "learning_rate": 2.001661879726784e-06, "loss": 2.9356789588928223, "step": 2352 }, { "epoch": 1.6039536468984323, "grad_norm": 33.0, "learning_rate": 1.995038808440236e-06, "loss": 6.6725993156433105, "step": 2353 }, { "epoch": 1.6046353101567825, "grad_norm": 33.75, "learning_rate": 1.988425498155434e-06, "loss": 4.167017936706543, "step": 2354 }, { "epoch": 1.605316973415133, "grad_norm": 19.125, "learning_rate": 1.981821956936448e-06, "loss": 5.152316093444824, "step": 2355 }, { "epoch": 1.6059986366734833, "grad_norm": 14.125, "learning_rate": 1.9752281928354323e-06, "loss": 3.89868426322937, "step": 2356 }, { "epoch": 1.6066802999318337, "grad_norm": 18.125, "learning_rate": 1.968644213892612e-06, "loss": 3.894108772277832, "step": 2357 }, { "epoch": 1.607361963190184, "grad_norm": 17.125, "learning_rate": 1.9620700281362913e-06, "loss": 3.215193748474121, "step": 2358 }, { "epoch": 1.6080436264485343, "grad_norm": 12.5625, "learning_rate": 1.9555056435828323e-06, "loss": 1.8850473165512085, "step": 2359 }, { "epoch": 1.6087252897068849, "grad_norm": 18.5, "learning_rate": 1.9489510682366363e-06, "loss": 5.760750770568848, "step": 2360 }, { "epoch": 1.609406952965235, "grad_norm": 29.75, "learning_rate": 1.942406310090155e-06, "loss": 6.471381187438965, "step": 2361 }, { "epoch": 1.6100886162235857, "grad_norm": 17.5, "learning_rate": 1.9358713771238568e-06, "loss": 3.389429807662964, "step": 2362 }, { "epoch": 1.6107702794819359, "grad_norm": 11.375, "learning_rate": 1.929346277306241e-06, "loss": 3.0777933597564697, "step": 2363 }, { "epoch": 1.6114519427402862, "grad_norm": 67.0, "learning_rate": 1.922831018593814e-06, "loss": 5.997224807739258, "step": 2364 }, { "epoch": 1.6121336059986366, "grad_norm": 17.75, "learning_rate": 1.916325608931079e-06, "loss": 4.526200771331787, "step": 2365 }, { "epoch": 1.612815269256987, "grad_norm": 59.0, "learning_rate": 1.9098300562505266e-06, "loss": 8.842668533325195, "step": 2366 }, { "epoch": 1.6134969325153374, "grad_norm": 22.875, "learning_rate": 1.9033443684726394e-06, "loss": 3.6325230598449707, "step": 2367 }, { "epoch": 1.6141785957736878, "grad_norm": 12.5, "learning_rate": 1.8968685535058584e-06, "loss": 4.183434963226318, "step": 2368 }, { "epoch": 1.6148602590320382, "grad_norm": 33.25, "learning_rate": 1.8904026192465963e-06, "loss": 7.589760780334473, "step": 2369 }, { "epoch": 1.6155419222903884, "grad_norm": 15.625, "learning_rate": 1.8839465735792095e-06, "loss": 4.729938983917236, "step": 2370 }, { "epoch": 1.616223585548739, "grad_norm": 18.875, "learning_rate": 1.877500424376002e-06, "loss": 5.8737969398498535, "step": 2371 }, { "epoch": 1.6169052488070892, "grad_norm": 16.0, "learning_rate": 1.8710641794972106e-06, "loss": 3.770240068435669, "step": 2372 }, { "epoch": 1.6175869120654398, "grad_norm": 44.25, "learning_rate": 1.8646378467909875e-06, "loss": 7.635149002075195, "step": 2373 }, { "epoch": 1.61826857532379, "grad_norm": 17.75, "learning_rate": 1.8582214340934069e-06, "loss": 5.192507266998291, "step": 2374 }, { "epoch": 1.6189502385821404, "grad_norm": 21.875, "learning_rate": 1.8518149492284477e-06, "loss": 3.442523956298828, "step": 2375 }, { "epoch": 1.6196319018404908, "grad_norm": 24.75, "learning_rate": 1.8454184000079733e-06, "loss": 4.952095985412598, "step": 2376 }, { "epoch": 1.6203135650988412, "grad_norm": 26.875, "learning_rate": 1.8390317942317404e-06, "loss": 6.047942161560059, "step": 2377 }, { "epoch": 1.6209952283571916, "grad_norm": 33.25, "learning_rate": 1.832655139687376e-06, "loss": 7.149279594421387, "step": 2378 }, { "epoch": 1.621676891615542, "grad_norm": 125.0, "learning_rate": 1.8262884441503748e-06, "loss": 5.361629962921143, "step": 2379 }, { "epoch": 1.6223585548738924, "grad_norm": 14.625, "learning_rate": 1.8199317153840933e-06, "loss": 3.1054604053497314, "step": 2380 }, { "epoch": 1.6230402181322425, "grad_norm": 15.0, "learning_rate": 1.8135849611397239e-06, "loss": 3.3508167266845703, "step": 2381 }, { "epoch": 1.6237218813905931, "grad_norm": 15.125, "learning_rate": 1.8072481891563054e-06, "loss": 4.355668067932129, "step": 2382 }, { "epoch": 1.6244035446489433, "grad_norm": 41.5, "learning_rate": 1.8009214071607017e-06, "loss": 6.819050312042236, "step": 2383 }, { "epoch": 1.625085207907294, "grad_norm": 21.875, "learning_rate": 1.7946046228675917e-06, "loss": 2.5701711177825928, "step": 2384 }, { "epoch": 1.6257668711656441, "grad_norm": 21.125, "learning_rate": 1.7882978439794708e-06, "loss": 2.860868453979492, "step": 2385 }, { "epoch": 1.6264485344239945, "grad_norm": 11.875, "learning_rate": 1.7820010781866248e-06, "loss": 3.9742960929870605, "step": 2386 }, { "epoch": 1.627130197682345, "grad_norm": 18.375, "learning_rate": 1.77571433316714e-06, "loss": 3.3456368446350098, "step": 2387 }, { "epoch": 1.6278118609406953, "grad_norm": 10.5625, "learning_rate": 1.7694376165868765e-06, "loss": 3.104473114013672, "step": 2388 }, { "epoch": 1.6284935241990457, "grad_norm": 16.5, "learning_rate": 1.7631709360994653e-06, "loss": 4.505502700805664, "step": 2389 }, { "epoch": 1.629175187457396, "grad_norm": 16.0, "learning_rate": 1.756914299346304e-06, "loss": 4.794858455657959, "step": 2390 }, { "epoch": 1.6298568507157465, "grad_norm": 12.9375, "learning_rate": 1.7506677139565464e-06, "loss": 1.373042345046997, "step": 2391 }, { "epoch": 1.6305385139740967, "grad_norm": 19.625, "learning_rate": 1.7444311875470788e-06, "loss": 6.325782299041748, "step": 2392 }, { "epoch": 1.6312201772324473, "grad_norm": 25.875, "learning_rate": 1.7382047277225323e-06, "loss": 4.337449073791504, "step": 2393 }, { "epoch": 1.6319018404907975, "grad_norm": 37.0, "learning_rate": 1.7319883420752559e-06, "loss": 7.569191932678223, "step": 2394 }, { "epoch": 1.632583503749148, "grad_norm": 27.0, "learning_rate": 1.7257820381853197e-06, "loss": 5.7404465675354, "step": 2395 }, { "epoch": 1.6332651670074982, "grad_norm": 12.3125, "learning_rate": 1.719585823620502e-06, "loss": 3.6626298427581787, "step": 2396 }, { "epoch": 1.6339468302658486, "grad_norm": 22.25, "learning_rate": 1.7133997059362673e-06, "loss": 6.917304039001465, "step": 2397 }, { "epoch": 1.634628493524199, "grad_norm": 23.625, "learning_rate": 1.7072236926757835e-06, "loss": 4.318532943725586, "step": 2398 }, { "epoch": 1.6353101567825494, "grad_norm": 35.0, "learning_rate": 1.7010577913698855e-06, "loss": 6.008736610412598, "step": 2399 }, { "epoch": 1.6359918200408998, "grad_norm": 15.8125, "learning_rate": 1.6949020095370816e-06, "loss": 1.6837490797042847, "step": 2400 }, { "epoch": 1.6366734832992502, "grad_norm": 13.875, "learning_rate": 1.688756354683545e-06, "loss": 3.300551176071167, "step": 2401 }, { "epoch": 1.6373551465576006, "grad_norm": 25.875, "learning_rate": 1.6826208343030925e-06, "loss": 5.813998699188232, "step": 2402 }, { "epoch": 1.6380368098159508, "grad_norm": 33.75, "learning_rate": 1.6764954558771906e-06, "loss": 6.242608547210693, "step": 2403 }, { "epoch": 1.6387184730743014, "grad_norm": 23.625, "learning_rate": 1.670380226874938e-06, "loss": 3.955885887145996, "step": 2404 }, { "epoch": 1.6394001363326516, "grad_norm": 15.8125, "learning_rate": 1.6642751547530512e-06, "loss": 3.980687141418457, "step": 2405 }, { "epoch": 1.6400817995910022, "grad_norm": 23.875, "learning_rate": 1.6581802469558671e-06, "loss": 4.787891387939453, "step": 2406 }, { "epoch": 1.6407634628493524, "grad_norm": 19.0, "learning_rate": 1.6520955109153314e-06, "loss": 6.024620056152344, "step": 2407 }, { "epoch": 1.6414451261077028, "grad_norm": 27.875, "learning_rate": 1.6460209540509797e-06, "loss": 6.981907844543457, "step": 2408 }, { "epoch": 1.6421267893660532, "grad_norm": 18.125, "learning_rate": 1.6399565837699361e-06, "loss": 6.1150665283203125, "step": 2409 }, { "epoch": 1.6428084526244036, "grad_norm": 12.25, "learning_rate": 1.6339024074669107e-06, "loss": 3.3893346786499023, "step": 2410 }, { "epoch": 1.643490115882754, "grad_norm": 23.625, "learning_rate": 1.627858432524173e-06, "loss": 5.643283367156982, "step": 2411 }, { "epoch": 1.6441717791411041, "grad_norm": 20.25, "learning_rate": 1.6218246663115656e-06, "loss": 5.2174882888793945, "step": 2412 }, { "epoch": 1.6448534423994547, "grad_norm": 20.0, "learning_rate": 1.6158011161864706e-06, "loss": 4.127598285675049, "step": 2413 }, { "epoch": 1.645535105657805, "grad_norm": 28.75, "learning_rate": 1.6097877894938218e-06, "loss": 5.73557186126709, "step": 2414 }, { "epoch": 1.6462167689161555, "grad_norm": 14.375, "learning_rate": 1.6037846935660807e-06, "loss": 3.933081865310669, "step": 2415 }, { "epoch": 1.6468984321745057, "grad_norm": 31.25, "learning_rate": 1.5977918357232369e-06, "loss": 6.264403343200684, "step": 2416 }, { "epoch": 1.6475800954328563, "grad_norm": 37.5, "learning_rate": 1.5918092232728e-06, "loss": 8.454083442687988, "step": 2417 }, { "epoch": 1.6482617586912065, "grad_norm": 14.375, "learning_rate": 1.5858368635097776e-06, "loss": 2.058438777923584, "step": 2418 }, { "epoch": 1.648943421949557, "grad_norm": 16.375, "learning_rate": 1.5798747637166834e-06, "loss": 4.563935279846191, "step": 2419 }, { "epoch": 1.6496250852079073, "grad_norm": 27.875, "learning_rate": 1.5739229311635152e-06, "loss": 5.799880027770996, "step": 2420 }, { "epoch": 1.6503067484662577, "grad_norm": 28.375, "learning_rate": 1.5679813731077508e-06, "loss": 3.271304130554199, "step": 2421 }, { "epoch": 1.650988411724608, "grad_norm": 18.75, "learning_rate": 1.5620500967943486e-06, "loss": 4.995147705078125, "step": 2422 }, { "epoch": 1.6516700749829583, "grad_norm": 29.25, "learning_rate": 1.5561291094557173e-06, "loss": 7.5146331787109375, "step": 2423 }, { "epoch": 1.6523517382413089, "grad_norm": 25.625, "learning_rate": 1.5502184183117264e-06, "loss": 3.1783881187438965, "step": 2424 }, { "epoch": 1.653033401499659, "grad_norm": 16.875, "learning_rate": 1.5443180305696948e-06, "loss": 6.744013786315918, "step": 2425 }, { "epoch": 1.6537150647580097, "grad_norm": 15.6875, "learning_rate": 1.5384279534243652e-06, "loss": 3.819431781768799, "step": 2426 }, { "epoch": 1.6543967280163598, "grad_norm": 15.4375, "learning_rate": 1.532548194057919e-06, "loss": 6.014884948730469, "step": 2427 }, { "epoch": 1.6550783912747105, "grad_norm": 10.375, "learning_rate": 1.5266787596399557e-06, "loss": 2.7468860149383545, "step": 2428 }, { "epoch": 1.6557600545330606, "grad_norm": 12.25, "learning_rate": 1.5208196573274758e-06, "loss": 3.238698959350586, "step": 2429 }, { "epoch": 1.656441717791411, "grad_norm": 19.25, "learning_rate": 1.5149708942648922e-06, "loss": 4.875136375427246, "step": 2430 }, { "epoch": 1.6571233810497614, "grad_norm": 39.0, "learning_rate": 1.5091324775840032e-06, "loss": 7.954928398132324, "step": 2431 }, { "epoch": 1.6578050443081118, "grad_norm": 16.5, "learning_rate": 1.503304414403991e-06, "loss": 5.388803005218506, "step": 2432 }, { "epoch": 1.6584867075664622, "grad_norm": 37.75, "learning_rate": 1.4974867118314196e-06, "loss": 3.423356533050537, "step": 2433 }, { "epoch": 1.6591683708248124, "grad_norm": 18.125, "learning_rate": 1.4916793769602111e-06, "loss": 3.0680129528045654, "step": 2434 }, { "epoch": 1.659850034083163, "grad_norm": 29.25, "learning_rate": 1.4858824168716524e-06, "loss": 6.3440937995910645, "step": 2435 }, { "epoch": 1.6605316973415132, "grad_norm": 23.5, "learning_rate": 1.480095838634379e-06, "loss": 6.717624664306641, "step": 2436 }, { "epoch": 1.6612133605998638, "grad_norm": 15.5625, "learning_rate": 1.474319649304361e-06, "loss": 4.230828285217285, "step": 2437 }, { "epoch": 1.661895023858214, "grad_norm": 12.6875, "learning_rate": 1.468553855924908e-06, "loss": 4.754615306854248, "step": 2438 }, { "epoch": 1.6625766871165644, "grad_norm": 13.0625, "learning_rate": 1.462798465526647e-06, "loss": 4.478590965270996, "step": 2439 }, { "epoch": 1.6632583503749148, "grad_norm": 13.0, "learning_rate": 1.4570534851275241e-06, "loss": 3.077754497528076, "step": 2440 }, { "epoch": 1.6639400136332652, "grad_norm": 24.125, "learning_rate": 1.4513189217327927e-06, "loss": 2.435742139816284, "step": 2441 }, { "epoch": 1.6646216768916156, "grad_norm": 31.25, "learning_rate": 1.4455947823350004e-06, "loss": 7.738450050354004, "step": 2442 }, { "epoch": 1.665303340149966, "grad_norm": 64.5, "learning_rate": 1.4398810739139822e-06, "loss": 11.616652488708496, "step": 2443 }, { "epoch": 1.6659850034083163, "grad_norm": 49.75, "learning_rate": 1.4341778034368626e-06, "loss": 8.787760734558105, "step": 2444 }, { "epoch": 1.6666666666666665, "grad_norm": 21.25, "learning_rate": 1.4284849778580279e-06, "loss": 7.255436420440674, "step": 2445 }, { "epoch": 1.6673483299250171, "grad_norm": 16.875, "learning_rate": 1.4228026041191368e-06, "loss": 4.83613395690918, "step": 2446 }, { "epoch": 1.6680299931833673, "grad_norm": 34.75, "learning_rate": 1.4171306891490977e-06, "loss": 7.075747013092041, "step": 2447 }, { "epoch": 1.668711656441718, "grad_norm": 17.375, "learning_rate": 1.4114692398640683e-06, "loss": 4.1329450607299805, "step": 2448 }, { "epoch": 1.669393319700068, "grad_norm": 29.375, "learning_rate": 1.405818263167449e-06, "loss": 5.341249942779541, "step": 2449 }, { "epoch": 1.6700749829584185, "grad_norm": 31.0, "learning_rate": 1.4001777659498584e-06, "loss": 5.111704349517822, "step": 2450 }, { "epoch": 1.670756646216769, "grad_norm": 39.75, "learning_rate": 1.3945477550891494e-06, "loss": 5.270699501037598, "step": 2451 }, { "epoch": 1.6714383094751193, "grad_norm": 22.125, "learning_rate": 1.38892823745038e-06, "loss": 2.0102319717407227, "step": 2452 }, { "epoch": 1.6721199727334697, "grad_norm": 41.5, "learning_rate": 1.3833192198858191e-06, "loss": 2.191265821456909, "step": 2453 }, { "epoch": 1.67280163599182, "grad_norm": 13.875, "learning_rate": 1.3777207092349276e-06, "loss": 3.8734095096588135, "step": 2454 }, { "epoch": 1.6734832992501705, "grad_norm": 26.375, "learning_rate": 1.3721327123243533e-06, "loss": 5.771875381469727, "step": 2455 }, { "epoch": 1.6741649625085206, "grad_norm": 16.375, "learning_rate": 1.366555235967928e-06, "loss": 5.732770919799805, "step": 2456 }, { "epoch": 1.6748466257668713, "grad_norm": 26.25, "learning_rate": 1.360988286966657e-06, "loss": 4.732114315032959, "step": 2457 }, { "epoch": 1.6755282890252214, "grad_norm": 20.375, "learning_rate": 1.3554318721087001e-06, "loss": 7.017389297485352, "step": 2458 }, { "epoch": 1.676209952283572, "grad_norm": 10.9375, "learning_rate": 1.3498859981693801e-06, "loss": 2.8822546005249023, "step": 2459 }, { "epoch": 1.6768916155419222, "grad_norm": 22.5, "learning_rate": 1.3443506719111666e-06, "loss": 5.071442604064941, "step": 2460 }, { "epoch": 1.6775732788002726, "grad_norm": 13.9375, "learning_rate": 1.33882590008366e-06, "loss": 3.4610564708709717, "step": 2461 }, { "epoch": 1.678254942058623, "grad_norm": 15.875, "learning_rate": 1.333311689423601e-06, "loss": 5.61279296875, "step": 2462 }, { "epoch": 1.6789366053169734, "grad_norm": 8.9375, "learning_rate": 1.327808046654845e-06, "loss": 3.229604721069336, "step": 2463 }, { "epoch": 1.6796182685753238, "grad_norm": 18.0, "learning_rate": 1.3223149784883604e-06, "loss": 3.7883195877075195, "step": 2464 }, { "epoch": 1.6802999318336742, "grad_norm": 13.4375, "learning_rate": 1.3168324916222296e-06, "loss": 1.744385004043579, "step": 2465 }, { "epoch": 1.6809815950920246, "grad_norm": 29.5, "learning_rate": 1.311360592741623e-06, "loss": 6.706332206726074, "step": 2466 }, { "epoch": 1.6816632583503748, "grad_norm": 33.25, "learning_rate": 1.305899288518806e-06, "loss": 7.140202045440674, "step": 2467 }, { "epoch": 1.6823449216087254, "grad_norm": 23.0, "learning_rate": 1.3004485856131265e-06, "loss": 1.897514820098877, "step": 2468 }, { "epoch": 1.6830265848670756, "grad_norm": 68.0, "learning_rate": 1.2950084906709991e-06, "loss": 8.627399444580078, "step": 2469 }, { "epoch": 1.6837082481254262, "grad_norm": 18.875, "learning_rate": 1.28957901032591e-06, "loss": 2.939840078353882, "step": 2470 }, { "epoch": 1.6843899113837764, "grad_norm": 21.0, "learning_rate": 1.284160151198396e-06, "loss": 6.158937454223633, "step": 2471 }, { "epoch": 1.6850715746421268, "grad_norm": 34.0, "learning_rate": 1.2787519198960485e-06, "loss": 6.949977874755859, "step": 2472 }, { "epoch": 1.6857532379004772, "grad_norm": 15.0625, "learning_rate": 1.2733543230134993e-06, "loss": 4.210999488830566, "step": 2473 }, { "epoch": 1.6864349011588275, "grad_norm": 13.0, "learning_rate": 1.2679673671324078e-06, "loss": 2.939734935760498, "step": 2474 }, { "epoch": 1.687116564417178, "grad_norm": 19.0, "learning_rate": 1.2625910588214608e-06, "loss": 4.273207664489746, "step": 2475 }, { "epoch": 1.6877982276755283, "grad_norm": 22.25, "learning_rate": 1.257225404636362e-06, "loss": 6.3214850425720215, "step": 2476 }, { "epoch": 1.6884798909338787, "grad_norm": 24.0, "learning_rate": 1.2518704111198255e-06, "loss": 4.936516284942627, "step": 2477 }, { "epoch": 1.689161554192229, "grad_norm": 14.625, "learning_rate": 1.246526084801566e-06, "loss": 4.715229511260986, "step": 2478 }, { "epoch": 1.6898432174505795, "grad_norm": 14.125, "learning_rate": 1.2411924321982848e-06, "loss": 4.383710861206055, "step": 2479 }, { "epoch": 1.6905248807089297, "grad_norm": 12.625, "learning_rate": 1.2358694598136755e-06, "loss": 4.343920707702637, "step": 2480 }, { "epoch": 1.6912065439672803, "grad_norm": 49.25, "learning_rate": 1.2305571741384081e-06, "loss": 6.711365222930908, "step": 2481 }, { "epoch": 1.6918882072256305, "grad_norm": 16.625, "learning_rate": 1.2252555816501144e-06, "loss": 4.804305076599121, "step": 2482 }, { "epoch": 1.6925698704839809, "grad_norm": 25.5, "learning_rate": 1.2199646888133964e-06, "loss": 5.4693498611450195, "step": 2483 }, { "epoch": 1.6932515337423313, "grad_norm": 18.5, "learning_rate": 1.2146845020798014e-06, "loss": 4.323767185211182, "step": 2484 }, { "epoch": 1.6939331970006817, "grad_norm": 29.625, "learning_rate": 1.2094150278878303e-06, "loss": 3.537645101547241, "step": 2485 }, { "epoch": 1.694614860259032, "grad_norm": 25.625, "learning_rate": 1.204156272662912e-06, "loss": 4.328585147857666, "step": 2486 }, { "epoch": 1.6952965235173822, "grad_norm": 12.625, "learning_rate": 1.1989082428174104e-06, "loss": 3.780259132385254, "step": 2487 }, { "epoch": 1.6959781867757329, "grad_norm": 19.125, "learning_rate": 1.1936709447506111e-06, "loss": 4.642852783203125, "step": 2488 }, { "epoch": 1.696659850034083, "grad_norm": 26.125, "learning_rate": 1.188444384848716e-06, "loss": 2.7880802154541016, "step": 2489 }, { "epoch": 1.6973415132924337, "grad_norm": 22.625, "learning_rate": 1.1832285694848255e-06, "loss": 5.548046112060547, "step": 2490 }, { "epoch": 1.6980231765507838, "grad_norm": 14.375, "learning_rate": 1.1780235050189481e-06, "loss": 3.447953224182129, "step": 2491 }, { "epoch": 1.6987048398091344, "grad_norm": 25.375, "learning_rate": 1.1728291977979734e-06, "loss": 6.289928913116455, "step": 2492 }, { "epoch": 1.6993865030674846, "grad_norm": 13.9375, "learning_rate": 1.1676456541556813e-06, "loss": 5.209049701690674, "step": 2493 }, { "epoch": 1.700068166325835, "grad_norm": 41.75, "learning_rate": 1.1624728804127272e-06, "loss": 7.078976154327393, "step": 2494 }, { "epoch": 1.7007498295841854, "grad_norm": 21.625, "learning_rate": 1.1573108828766255e-06, "loss": 2.291191577911377, "step": 2495 }, { "epoch": 1.7014314928425358, "grad_norm": 69.5, "learning_rate": 1.1521596678417613e-06, "loss": 9.008760452270508, "step": 2496 }, { "epoch": 1.7021131561008862, "grad_norm": 11.0625, "learning_rate": 1.1470192415893645e-06, "loss": 2.2970376014709473, "step": 2497 }, { "epoch": 1.7027948193592364, "grad_norm": 20.625, "learning_rate": 1.1418896103875087e-06, "loss": 2.3484582901000977, "step": 2498 }, { "epoch": 1.703476482617587, "grad_norm": 21.25, "learning_rate": 1.1367707804911122e-06, "loss": 3.385910987854004, "step": 2499 }, { "epoch": 1.7041581458759372, "grad_norm": 12.5, "learning_rate": 1.1316627581419137e-06, "loss": 3.016845226287842, "step": 2500 }, { "epoch": 1.7048398091342878, "grad_norm": 21.125, "learning_rate": 1.126565549568479e-06, "loss": 7.029099941253662, "step": 2501 }, { "epoch": 1.705521472392638, "grad_norm": 32.75, "learning_rate": 1.1214791609861886e-06, "loss": 6.723758697509766, "step": 2502 }, { "epoch": 1.7062031356509886, "grad_norm": 20.875, "learning_rate": 1.1164035985972244e-06, "loss": 6.2729291915893555, "step": 2503 }, { "epoch": 1.7068847989093388, "grad_norm": 28.0, "learning_rate": 1.111338868590569e-06, "loss": 7.224662780761719, "step": 2504 }, { "epoch": 1.7075664621676891, "grad_norm": 25.75, "learning_rate": 1.1062849771420025e-06, "loss": 8.379149436950684, "step": 2505 }, { "epoch": 1.7082481254260395, "grad_norm": 15.625, "learning_rate": 1.1012419304140799e-06, "loss": 2.9837546348571777, "step": 2506 }, { "epoch": 1.70892978868439, "grad_norm": 12.4375, "learning_rate": 1.096209734556134e-06, "loss": 4.092199802398682, "step": 2507 }, { "epoch": 1.7096114519427403, "grad_norm": 21.25, "learning_rate": 1.0911883957042736e-06, "loss": 4.330066680908203, "step": 2508 }, { "epoch": 1.7102931152010905, "grad_norm": 16.0, "learning_rate": 1.0861779199813583e-06, "loss": 4.410202980041504, "step": 2509 }, { "epoch": 1.7109747784594411, "grad_norm": 15.4375, "learning_rate": 1.0811783134970132e-06, "loss": 3.693362236022949, "step": 2510 }, { "epoch": 1.7116564417177913, "grad_norm": 17.25, "learning_rate": 1.0761895823475976e-06, "loss": 3.049572467803955, "step": 2511 }, { "epoch": 1.712338104976142, "grad_norm": 14.875, "learning_rate": 1.0712117326162186e-06, "loss": 2.7321925163269043, "step": 2512 }, { "epoch": 1.713019768234492, "grad_norm": 22.625, "learning_rate": 1.066244770372714e-06, "loss": 3.4936656951904297, "step": 2513 }, { "epoch": 1.7137014314928425, "grad_norm": 18.5, "learning_rate": 1.0612887016736395e-06, "loss": 4.723230361938477, "step": 2514 }, { "epoch": 1.7143830947511929, "grad_norm": 13.0625, "learning_rate": 1.0563435325622762e-06, "loss": 1.601760745048523, "step": 2515 }, { "epoch": 1.7150647580095433, "grad_norm": 25.625, "learning_rate": 1.0514092690686061e-06, "loss": 6.2751359939575195, "step": 2516 }, { "epoch": 1.7157464212678937, "grad_norm": 15.0625, "learning_rate": 1.04648591720932e-06, "loss": 3.6499874591827393, "step": 2517 }, { "epoch": 1.716428084526244, "grad_norm": 22.0, "learning_rate": 1.041573482987801e-06, "loss": 5.1577582359313965, "step": 2518 }, { "epoch": 1.7171097477845945, "grad_norm": 29.375, "learning_rate": 1.0366719723941144e-06, "loss": 6.34061861038208, "step": 2519 }, { "epoch": 1.7177914110429446, "grad_norm": 12.625, "learning_rate": 1.0317813914050157e-06, "loss": 4.436561584472656, "step": 2520 }, { "epoch": 1.7184730743012953, "grad_norm": 16.5, "learning_rate": 1.026901745983927e-06, "loss": 4.275240421295166, "step": 2521 }, { "epoch": 1.7191547375596454, "grad_norm": 11.6875, "learning_rate": 1.0220330420809343e-06, "loss": 2.733059883117676, "step": 2522 }, { "epoch": 1.719836400817996, "grad_norm": 25.25, "learning_rate": 1.0171752856327876e-06, "loss": 5.59696102142334, "step": 2523 }, { "epoch": 1.7205180640763462, "grad_norm": 26.25, "learning_rate": 1.012328482562881e-06, "loss": 4.574085235595703, "step": 2524 }, { "epoch": 1.7211997273346966, "grad_norm": 30.625, "learning_rate": 1.007492638781259e-06, "loss": 4.774003505706787, "step": 2525 }, { "epoch": 1.721881390593047, "grad_norm": 27.25, "learning_rate": 1.0026677601846001e-06, "loss": 4.77939510345459, "step": 2526 }, { "epoch": 1.7225630538513974, "grad_norm": 11.75, "learning_rate": 9.978538526562088e-07, "loss": 2.778803586959839, "step": 2527 }, { "epoch": 1.7232447171097478, "grad_norm": 35.0, "learning_rate": 9.930509220660179e-07, "loss": 7.975351810455322, "step": 2528 }, { "epoch": 1.7239263803680982, "grad_norm": 51.25, "learning_rate": 9.882589742705717e-07, "loss": 7.223303318023682, "step": 2529 }, { "epoch": 1.7246080436264486, "grad_norm": 26.875, "learning_rate": 9.834780151130196e-07, "loss": 4.857811450958252, "step": 2530 }, { "epoch": 1.7252897068847988, "grad_norm": 12.75, "learning_rate": 9.787080504231195e-07, "loss": 4.9471330642700195, "step": 2531 }, { "epoch": 1.7259713701431494, "grad_norm": 19.375, "learning_rate": 9.739490860172152e-07, "loss": 4.663339614868164, "step": 2532 }, { "epoch": 1.7266530334014996, "grad_norm": 30.5, "learning_rate": 9.692011276982426e-07, "loss": 5.173912525177002, "step": 2533 }, { "epoch": 1.7273346966598502, "grad_norm": 14.8125, "learning_rate": 9.644641812557155e-07, "loss": 2.089712381362915, "step": 2534 }, { "epoch": 1.7280163599182004, "grad_norm": 17.0, "learning_rate": 9.597382524657173e-07, "loss": 2.005657434463501, "step": 2535 }, { "epoch": 1.7286980231765507, "grad_norm": 24.25, "learning_rate": 9.550233470909021e-07, "loss": 5.664228439331055, "step": 2536 }, { "epoch": 1.7293796864349011, "grad_norm": 16.125, "learning_rate": 9.503194708804764e-07, "loss": 3.4060301780700684, "step": 2537 }, { "epoch": 1.7300613496932515, "grad_norm": 38.75, "learning_rate": 9.456266295702022e-07, "loss": 9.183801651000977, "step": 2538 }, { "epoch": 1.730743012951602, "grad_norm": 43.25, "learning_rate": 9.409448288823875e-07, "loss": 7.9283552169799805, "step": 2539 }, { "epoch": 1.7314246762099523, "grad_norm": 25.25, "learning_rate": 9.362740745258736e-07, "loss": 5.627583980560303, "step": 2540 }, { "epoch": 1.7321063394683027, "grad_norm": 13.8125, "learning_rate": 9.316143721960303e-07, "loss": 4.395862579345703, "step": 2541 }, { "epoch": 1.732788002726653, "grad_norm": 23.875, "learning_rate": 9.26965727574759e-07, "loss": 5.8715901374816895, "step": 2542 }, { "epoch": 1.7334696659850035, "grad_norm": 15.0625, "learning_rate": 9.223281463304689e-07, "loss": 2.50063419342041, "step": 2543 }, { "epoch": 1.7341513292433537, "grad_norm": 29.875, "learning_rate": 9.177016341180867e-07, "loss": 4.187967300415039, "step": 2544 }, { "epoch": 1.7348329925017043, "grad_norm": 21.875, "learning_rate": 9.13086196579035e-07, "loss": 3.641655445098877, "step": 2545 }, { "epoch": 1.7355146557600545, "grad_norm": 22.125, "learning_rate": 9.084818393412375e-07, "loss": 5.616839408874512, "step": 2546 }, { "epoch": 1.7361963190184049, "grad_norm": 23.5, "learning_rate": 9.038885680191045e-07, "loss": 2.9898290634155273, "step": 2547 }, { "epoch": 1.7368779822767553, "grad_norm": 29.5, "learning_rate": 8.993063882135278e-07, "loss": 7.581299781799316, "step": 2548 }, { "epoch": 1.7375596455351057, "grad_norm": 20.0, "learning_rate": 8.947353055118779e-07, "loss": 1.9632058143615723, "step": 2549 }, { "epoch": 1.738241308793456, "grad_norm": 17.0, "learning_rate": 8.901753254879885e-07, "loss": 4.940116882324219, "step": 2550 }, { "epoch": 1.7389229720518065, "grad_norm": 18.75, "learning_rate": 8.856264537021586e-07, "loss": 7.297369956970215, "step": 2551 }, { "epoch": 1.7396046353101569, "grad_norm": 31.875, "learning_rate": 8.810886957011433e-07, "loss": 4.474785804748535, "step": 2552 }, { "epoch": 1.740286298568507, "grad_norm": 19.375, "learning_rate": 8.765620570181421e-07, "loss": 3.2772042751312256, "step": 2553 }, { "epoch": 1.7409679618268576, "grad_norm": 14.3125, "learning_rate": 8.720465431727976e-07, "loss": 4.292938232421875, "step": 2554 }, { "epoch": 1.7416496250852078, "grad_norm": 19.375, "learning_rate": 8.67542159671192e-07, "loss": 5.125296592712402, "step": 2555 }, { "epoch": 1.7423312883435584, "grad_norm": 18.375, "learning_rate": 8.630489120058261e-07, "loss": 2.8080897331237793, "step": 2556 }, { "epoch": 1.7430129516019086, "grad_norm": 27.125, "learning_rate": 8.585668056556273e-07, "loss": 5.4397687911987305, "step": 2557 }, { "epoch": 1.743694614860259, "grad_norm": 35.5, "learning_rate": 8.540958460859416e-07, "loss": 7.852839469909668, "step": 2558 }, { "epoch": 1.7443762781186094, "grad_norm": 35.5, "learning_rate": 8.496360387485147e-07, "loss": 9.204946517944336, "step": 2559 }, { "epoch": 1.7450579413769598, "grad_norm": 15.25, "learning_rate": 8.451873890814988e-07, "loss": 5.350673198699951, "step": 2560 }, { "epoch": 1.7457396046353102, "grad_norm": 13.375, "learning_rate": 8.407499025094402e-07, "loss": 2.3319175243377686, "step": 2561 }, { "epoch": 1.7464212678936604, "grad_norm": 10.0625, "learning_rate": 8.363235844432705e-07, "loss": 3.4711170196533203, "step": 2562 }, { "epoch": 1.747102931152011, "grad_norm": 36.5, "learning_rate": 8.319084402803069e-07, "loss": 4.882756233215332, "step": 2563 }, { "epoch": 1.7477845944103612, "grad_norm": 24.875, "learning_rate": 8.275044754042372e-07, "loss": 4.554181098937988, "step": 2564 }, { "epoch": 1.7484662576687118, "grad_norm": 19.625, "learning_rate": 8.231116951851204e-07, "loss": 6.9863433837890625, "step": 2565 }, { "epoch": 1.749147920927062, "grad_norm": 18.75, "learning_rate": 8.187301049793794e-07, "loss": 3.3308303356170654, "step": 2566 }, { "epoch": 1.7498295841854126, "grad_norm": 20.625, "learning_rate": 8.143597101297873e-07, "loss": 6.916524887084961, "step": 2567 }, { "epoch": 1.7505112474437627, "grad_norm": 22.25, "learning_rate": 8.10000515965469e-07, "loss": 4.927857398986816, "step": 2568 }, { "epoch": 1.7511929107021131, "grad_norm": 33.25, "learning_rate": 8.05652527801889e-07, "loss": 7.4017133712768555, "step": 2569 }, { "epoch": 1.7518745739604635, "grad_norm": 37.25, "learning_rate": 8.013157509408509e-07, "loss": 5.628267288208008, "step": 2570 }, { "epoch": 1.752556237218814, "grad_norm": 15.3125, "learning_rate": 7.969901906704869e-07, "loss": 5.104277610778809, "step": 2571 }, { "epoch": 1.7532379004771643, "grad_norm": 45.75, "learning_rate": 7.926758522652511e-07, "loss": 11.461145401000977, "step": 2572 }, { "epoch": 1.7539195637355145, "grad_norm": 28.5, "learning_rate": 7.88372740985911e-07, "loss": 7.820512294769287, "step": 2573 }, { "epoch": 1.7546012269938651, "grad_norm": 53.25, "learning_rate": 7.840808620795526e-07, "loss": 4.888619899749756, "step": 2574 }, { "epoch": 1.7552828902522153, "grad_norm": 172.0, "learning_rate": 7.79800220779554e-07, "loss": 3.872779130935669, "step": 2575 }, { "epoch": 1.755964553510566, "grad_norm": 17.375, "learning_rate": 7.755308223056024e-07, "loss": 3.7961502075195312, "step": 2576 }, { "epoch": 1.756646216768916, "grad_norm": 14.3125, "learning_rate": 7.712726718636643e-07, "loss": 2.3039584159851074, "step": 2577 }, { "epoch": 1.7573278800272665, "grad_norm": 22.375, "learning_rate": 7.670257746460008e-07, "loss": 7.840152740478516, "step": 2578 }, { "epoch": 1.7580095432856169, "grad_norm": 38.75, "learning_rate": 7.627901358311462e-07, "loss": 7.904692649841309, "step": 2579 }, { "epoch": 1.7586912065439673, "grad_norm": 20.375, "learning_rate": 7.585657605839059e-07, "loss": 2.275923252105713, "step": 2580 }, { "epoch": 1.7593728698023177, "grad_norm": 12.5, "learning_rate": 7.543526540553536e-07, "loss": 4.737390041351318, "step": 2581 }, { "epoch": 1.760054533060668, "grad_norm": 21.875, "learning_rate": 7.501508213828202e-07, "loss": 5.971288681030273, "step": 2582 }, { "epoch": 1.7607361963190185, "grad_norm": 21.125, "learning_rate": 7.459602676898902e-07, "loss": 5.007809162139893, "step": 2583 }, { "epoch": 1.7614178595773686, "grad_norm": 18.5, "learning_rate": 7.417809980863965e-07, "loss": 3.799466609954834, "step": 2584 }, { "epoch": 1.7620995228357192, "grad_norm": 21.625, "learning_rate": 7.376130176684082e-07, "loss": 3.977330446243286, "step": 2585 }, { "epoch": 1.7627811860940694, "grad_norm": 18.0, "learning_rate": 7.334563315182319e-07, "loss": 4.677376747131348, "step": 2586 }, { "epoch": 1.76346284935242, "grad_norm": 14.875, "learning_rate": 7.293109447044056e-07, "loss": 5.174595355987549, "step": 2587 }, { "epoch": 1.7641445126107702, "grad_norm": 35.5, "learning_rate": 7.251768622816813e-07, "loss": 6.248071670532227, "step": 2588 }, { "epoch": 1.7648261758691206, "grad_norm": 31.625, "learning_rate": 7.210540892910345e-07, "loss": 6.143766403198242, "step": 2589 }, { "epoch": 1.765507839127471, "grad_norm": 11.1875, "learning_rate": 7.169426307596428e-07, "loss": 3.949766159057617, "step": 2590 }, { "epoch": 1.7661895023858214, "grad_norm": 15.4375, "learning_rate": 7.128424917008947e-07, "loss": 3.083034038543701, "step": 2591 }, { "epoch": 1.7668711656441718, "grad_norm": 20.25, "learning_rate": 7.087536771143733e-07, "loss": 5.273445129394531, "step": 2592 }, { "epoch": 1.7675528289025222, "grad_norm": 110.5, "learning_rate": 7.046761919858513e-07, "loss": 7.24980354309082, "step": 2593 }, { "epoch": 1.7682344921608726, "grad_norm": 14.0625, "learning_rate": 7.006100412872863e-07, "loss": 2.5691983699798584, "step": 2594 }, { "epoch": 1.7689161554192228, "grad_norm": 15.3125, "learning_rate": 6.965552299768186e-07, "loss": 3.533280611038208, "step": 2595 }, { "epoch": 1.7695978186775734, "grad_norm": 34.5, "learning_rate": 6.925117629987577e-07, "loss": 6.230571269989014, "step": 2596 }, { "epoch": 1.7702794819359235, "grad_norm": 53.0, "learning_rate": 6.88479645283584e-07, "loss": 7.727921485900879, "step": 2597 }, { "epoch": 1.7709611451942742, "grad_norm": 26.375, "learning_rate": 6.844588817479347e-07, "loss": 5.164918422698975, "step": 2598 }, { "epoch": 1.7716428084526243, "grad_norm": 15.0625, "learning_rate": 6.804494772946058e-07, "loss": 4.261207580566406, "step": 2599 }, { "epoch": 1.7723244717109747, "grad_norm": 11.1875, "learning_rate": 6.764514368125419e-07, "loss": 4.238258361816406, "step": 2600 }, { "epoch": 1.7730061349693251, "grad_norm": 29.0, "learning_rate": 6.724647651768268e-07, "loss": 2.0636894702911377, "step": 2601 }, { "epoch": 1.7736877982276755, "grad_norm": 45.25, "learning_rate": 6.684894672486874e-07, "loss": 7.505370616912842, "step": 2602 }, { "epoch": 1.774369461486026, "grad_norm": 29.75, "learning_rate": 6.645255478754776e-07, "loss": 5.309700012207031, "step": 2603 }, { "epoch": 1.7750511247443763, "grad_norm": 17.125, "learning_rate": 6.605730118906795e-07, "loss": 5.343388557434082, "step": 2604 }, { "epoch": 1.7757327880027267, "grad_norm": 19.375, "learning_rate": 6.566318641138902e-07, "loss": 3.8368473052978516, "step": 2605 }, { "epoch": 1.7764144512610769, "grad_norm": 31.5, "learning_rate": 6.527021093508234e-07, "loss": 5.417485237121582, "step": 2606 }, { "epoch": 1.7770961145194275, "grad_norm": 23.0, "learning_rate": 6.487837523933016e-07, "loss": 6.173131465911865, "step": 2607 }, { "epoch": 1.7777777777777777, "grad_norm": 17.0, "learning_rate": 6.448767980192494e-07, "loss": 2.694535732269287, "step": 2608 }, { "epoch": 1.7784594410361283, "grad_norm": 25.0, "learning_rate": 6.409812509926827e-07, "loss": 3.625383138656616, "step": 2609 }, { "epoch": 1.7791411042944785, "grad_norm": 14.3125, "learning_rate": 6.370971160637129e-07, "loss": 3.303194761276245, "step": 2610 }, { "epoch": 1.7798227675528289, "grad_norm": 39.75, "learning_rate": 6.332243979685349e-07, "loss": 6.268808364868164, "step": 2611 }, { "epoch": 1.7805044308111793, "grad_norm": 14.125, "learning_rate": 6.293631014294177e-07, "loss": 2.663141965866089, "step": 2612 }, { "epoch": 1.7811860940695297, "grad_norm": 19.625, "learning_rate": 6.255132311547108e-07, "loss": 1.8527110815048218, "step": 2613 }, { "epoch": 1.78186775732788, "grad_norm": 22.5, "learning_rate": 6.216747918388233e-07, "loss": 3.6704061031341553, "step": 2614 }, { "epoch": 1.7825494205862304, "grad_norm": 36.0, "learning_rate": 6.178477881622325e-07, "loss": 2.190366744995117, "step": 2615 }, { "epoch": 1.7832310838445808, "grad_norm": 39.5, "learning_rate": 6.140322247914654e-07, "loss": 5.869146347045898, "step": 2616 }, { "epoch": 1.783912747102931, "grad_norm": 78.0, "learning_rate": 6.102281063791027e-07, "loss": 7.153844356536865, "step": 2617 }, { "epoch": 1.7845944103612816, "grad_norm": 24.75, "learning_rate": 6.064354375637671e-07, "loss": 5.323446273803711, "step": 2618 }, { "epoch": 1.7852760736196318, "grad_norm": 27.25, "learning_rate": 6.026542229701249e-07, "loss": 5.7921528816223145, "step": 2619 }, { "epoch": 1.7859577368779824, "grad_norm": 19.125, "learning_rate": 5.98884467208869e-07, "loss": 4.545669078826904, "step": 2620 }, { "epoch": 1.7866394001363326, "grad_norm": 20.875, "learning_rate": 5.951261748767257e-07, "loss": 7.257363796234131, "step": 2621 }, { "epoch": 1.787321063394683, "grad_norm": 24.875, "learning_rate": 5.913793505564369e-07, "loss": 5.067315101623535, "step": 2622 }, { "epoch": 1.7880027266530334, "grad_norm": 27.75, "learning_rate": 5.87643998816766e-07, "loss": 6.6837687492370605, "step": 2623 }, { "epoch": 1.7886843899113838, "grad_norm": 22.5, "learning_rate": 5.839201242124859e-07, "loss": 4.6905646324157715, "step": 2624 }, { "epoch": 1.7893660531697342, "grad_norm": 21.375, "learning_rate": 5.802077312843723e-07, "loss": 5.372653007507324, "step": 2625 }, { "epoch": 1.7900477164280846, "grad_norm": 18.625, "learning_rate": 5.765068245592032e-07, "loss": 5.458276748657227, "step": 2626 }, { "epoch": 1.790729379686435, "grad_norm": 27.75, "learning_rate": 5.728174085497506e-07, "loss": 3.87081241607666, "step": 2627 }, { "epoch": 1.7914110429447851, "grad_norm": 25.25, "learning_rate": 5.691394877547707e-07, "loss": 5.970423698425293, "step": 2628 }, { "epoch": 1.7920927062031358, "grad_norm": 52.0, "learning_rate": 5.654730666590102e-07, "loss": 8.748456954956055, "step": 2629 }, { "epoch": 1.792774369461486, "grad_norm": 31.625, "learning_rate": 5.618181497331865e-07, "loss": 5.418834209442139, "step": 2630 }, { "epoch": 1.7934560327198366, "grad_norm": 15.6875, "learning_rate": 5.581747414339922e-07, "loss": 5.0698418617248535, "step": 2631 }, { "epoch": 1.7941376959781867, "grad_norm": 13.875, "learning_rate": 5.545428462040903e-07, "loss": 3.730250597000122, "step": 2632 }, { "epoch": 1.7948193592365371, "grad_norm": 29.125, "learning_rate": 5.50922468472096e-07, "loss": 6.655462741851807, "step": 2633 }, { "epoch": 1.7955010224948875, "grad_norm": 9.875, "learning_rate": 5.473136126525891e-07, "loss": 3.2077364921569824, "step": 2634 }, { "epoch": 1.796182685753238, "grad_norm": 13.75, "learning_rate": 5.437162831460962e-07, "loss": 3.4958295822143555, "step": 2635 }, { "epoch": 1.7968643490115883, "grad_norm": 27.75, "learning_rate": 5.401304843390909e-07, "loss": 4.891316890716553, "step": 2636 }, { "epoch": 1.7975460122699385, "grad_norm": 27.0, "learning_rate": 5.36556220603981e-07, "loss": 5.241407871246338, "step": 2637 }, { "epoch": 1.798227675528289, "grad_norm": 28.75, "learning_rate": 5.329934962991168e-07, "loss": 3.4233436584472656, "step": 2638 }, { "epoch": 1.7989093387866393, "grad_norm": 16.125, "learning_rate": 5.294423157687712e-07, "loss": 3.296311378479004, "step": 2639 }, { "epoch": 1.79959100204499, "grad_norm": 24.0, "learning_rate": 5.259026833431468e-07, "loss": 4.560527801513672, "step": 2640 }, { "epoch": 1.80027266530334, "grad_norm": 44.25, "learning_rate": 5.223746033383592e-07, "loss": 5.271116256713867, "step": 2641 }, { "epoch": 1.8009543285616907, "grad_norm": 26.75, "learning_rate": 5.188580800564402e-07, "loss": 5.477992057800293, "step": 2642 }, { "epoch": 1.8016359918200409, "grad_norm": 24.0, "learning_rate": 5.153531177853322e-07, "loss": 6.435674667358398, "step": 2643 }, { "epoch": 1.8023176550783913, "grad_norm": 24.5, "learning_rate": 5.118597207988741e-07, "loss": 5.504081726074219, "step": 2644 }, { "epoch": 1.8029993183367417, "grad_norm": 29.625, "learning_rate": 5.083778933568073e-07, "loss": 4.739253520965576, "step": 2645 }, { "epoch": 1.803680981595092, "grad_norm": 19.875, "learning_rate": 5.049076397047648e-07, "loss": 5.690196514129639, "step": 2646 }, { "epoch": 1.8043626448534424, "grad_norm": 33.75, "learning_rate": 5.014489640742659e-07, "loss": 2.7385306358337402, "step": 2647 }, { "epoch": 1.8050443081117926, "grad_norm": 24.375, "learning_rate": 4.980018706827139e-07, "loss": 5.733936786651611, "step": 2648 }, { "epoch": 1.8057259713701432, "grad_norm": 84.0, "learning_rate": 4.945663637333842e-07, "loss": 4.803215503692627, "step": 2649 }, { "epoch": 1.8064076346284934, "grad_norm": 39.75, "learning_rate": 4.911424474154314e-07, "loss": 7.233545303344727, "step": 2650 }, { "epoch": 1.807089297886844, "grad_norm": 40.75, "learning_rate": 4.877301259038703e-07, "loss": 5.63930606842041, "step": 2651 }, { "epoch": 1.8077709611451942, "grad_norm": 42.0, "learning_rate": 4.843294033595814e-07, "loss": 6.040931701660156, "step": 2652 }, { "epoch": 1.8084526244035446, "grad_norm": 76.5, "learning_rate": 4.80940283929301e-07, "loss": 12.345968246459961, "step": 2653 }, { "epoch": 1.809134287661895, "grad_norm": 18.875, "learning_rate": 4.775627717456143e-07, "loss": 6.9803876876831055, "step": 2654 }, { "epoch": 1.8098159509202454, "grad_norm": 21.875, "learning_rate": 4.741968709269573e-07, "loss": 4.297351837158203, "step": 2655 }, { "epoch": 1.8104976141785958, "grad_norm": 33.0, "learning_rate": 4.708425855776044e-07, "loss": 4.877196311950684, "step": 2656 }, { "epoch": 1.8111792774369462, "grad_norm": 12.0, "learning_rate": 4.674999197876673e-07, "loss": 4.531613826751709, "step": 2657 }, { "epoch": 1.8118609406952966, "grad_norm": 23.125, "learning_rate": 4.6416887763308885e-07, "loss": 4.8553667068481445, "step": 2658 }, { "epoch": 1.8125426039536467, "grad_norm": 31.75, "learning_rate": 4.6084946317563997e-07, "loss": 5.929840564727783, "step": 2659 }, { "epoch": 1.8132242672119974, "grad_norm": 42.0, "learning_rate": 4.575416804629085e-07, "loss": 4.316781997680664, "step": 2660 }, { "epoch": 1.8139059304703475, "grad_norm": 15.3125, "learning_rate": 4.542455335283069e-07, "loss": 3.6964828968048096, "step": 2661 }, { "epoch": 1.8145875937286982, "grad_norm": 18.375, "learning_rate": 4.509610263910502e-07, "loss": 3.9290173053741455, "step": 2662 }, { "epoch": 1.8152692569870483, "grad_norm": 15.4375, "learning_rate": 4.476881630561658e-07, "loss": 5.255082130432129, "step": 2663 }, { "epoch": 1.8159509202453987, "grad_norm": 16.875, "learning_rate": 4.444269475144858e-07, "loss": 1.9505901336669922, "step": 2664 }, { "epoch": 1.8166325835037491, "grad_norm": 24.75, "learning_rate": 4.411773837426303e-07, "loss": 6.44141149520874, "step": 2665 }, { "epoch": 1.8173142467620995, "grad_norm": 29.0, "learning_rate": 4.3793947570301973e-07, "loss": 6.585089683532715, "step": 2666 }, { "epoch": 1.81799591002045, "grad_norm": 16.875, "learning_rate": 4.34713227343857e-07, "loss": 4.919661998748779, "step": 2667 }, { "epoch": 1.8186775732788003, "grad_norm": 30.0, "learning_rate": 4.3149864259912966e-07, "loss": 6.584992408752441, "step": 2668 }, { "epoch": 1.8193592365371507, "grad_norm": 17.875, "learning_rate": 4.2829572538860553e-07, "loss": 4.208704948425293, "step": 2669 }, { "epoch": 1.8200408997955009, "grad_norm": 14.6875, "learning_rate": 4.2510447961782055e-07, "loss": 4.099368572235107, "step": 2670 }, { "epoch": 1.8207225630538515, "grad_norm": 19.5, "learning_rate": 4.219249091780797e-07, "loss": 3.5398669242858887, "step": 2671 }, { "epoch": 1.8214042263122017, "grad_norm": 14.4375, "learning_rate": 4.187570179464551e-07, "loss": 3.636699676513672, "step": 2672 }, { "epoch": 1.8220858895705523, "grad_norm": 13.875, "learning_rate": 4.156008097857733e-07, "loss": 3.6577672958374023, "step": 2673 }, { "epoch": 1.8227675528289025, "grad_norm": 39.0, "learning_rate": 4.1245628854461816e-07, "loss": 7.221151351928711, "step": 2674 }, { "epoch": 1.8234492160872529, "grad_norm": 19.125, "learning_rate": 4.093234580573202e-07, "loss": 4.730220794677734, "step": 2675 }, { "epoch": 1.8241308793456033, "grad_norm": 20.25, "learning_rate": 4.0620232214395704e-07, "loss": 4.317601680755615, "step": 2676 }, { "epoch": 1.8248125426039536, "grad_norm": 13.375, "learning_rate": 4.0309288461034544e-07, "loss": 5.52736759185791, "step": 2677 }, { "epoch": 1.825494205862304, "grad_norm": 14.3125, "learning_rate": 3.999951492480358e-07, "loss": 4.273284912109375, "step": 2678 }, { "epoch": 1.8261758691206544, "grad_norm": 10.125, "learning_rate": 3.9690911983431226e-07, "loss": 2.7213094234466553, "step": 2679 }, { "epoch": 1.8268575323790048, "grad_norm": 26.875, "learning_rate": 3.938348001321812e-07, "loss": 6.298955917358398, "step": 2680 }, { "epoch": 1.827539195637355, "grad_norm": 39.5, "learning_rate": 3.9077219389037526e-07, "loss": 7.968618392944336, "step": 2681 }, { "epoch": 1.8282208588957056, "grad_norm": 18.125, "learning_rate": 3.8772130484334147e-07, "loss": 4.659139633178711, "step": 2682 }, { "epoch": 1.8289025221540558, "grad_norm": 10.1875, "learning_rate": 3.8468213671123747e-07, "loss": 3.6286134719848633, "step": 2683 }, { "epoch": 1.8295841854124064, "grad_norm": 13.5625, "learning_rate": 3.8165469319993097e-07, "loss": 4.660744667053223, "step": 2684 }, { "epoch": 1.8302658486707566, "grad_norm": 25.625, "learning_rate": 3.786389780009958e-07, "loss": 3.1908411979675293, "step": 2685 }, { "epoch": 1.830947511929107, "grad_norm": 17.5, "learning_rate": 3.756349947916982e-07, "loss": 3.6129298210144043, "step": 2686 }, { "epoch": 1.8316291751874574, "grad_norm": 13.0, "learning_rate": 3.726427472350036e-07, "loss": 2.4261605739593506, "step": 2687 }, { "epoch": 1.8323108384458078, "grad_norm": 11.6875, "learning_rate": 3.696622389795679e-07, "loss": 3.655959129333496, "step": 2688 }, { "epoch": 1.8329925017041582, "grad_norm": 20.5, "learning_rate": 3.666934736597272e-07, "loss": 4.898262977600098, "step": 2689 }, { "epoch": 1.8336741649625086, "grad_norm": 27.375, "learning_rate": 3.637364548955047e-07, "loss": 4.590310096740723, "step": 2690 }, { "epoch": 1.834355828220859, "grad_norm": 16.625, "learning_rate": 3.6079118629259615e-07, "loss": 4.274911403656006, "step": 2691 }, { "epoch": 1.8350374914792091, "grad_norm": 51.5, "learning_rate": 3.578576714423698e-07, "loss": 6.587590217590332, "step": 2692 }, { "epoch": 1.8357191547375598, "grad_norm": 13.0625, "learning_rate": 3.5493591392186553e-07, "loss": 3.4166672229766846, "step": 2693 }, { "epoch": 1.83640081799591, "grad_norm": 17.375, "learning_rate": 3.520259172937812e-07, "loss": 2.901803970336914, "step": 2694 }, { "epoch": 1.8370824812542605, "grad_norm": 12.5625, "learning_rate": 3.491276851064784e-07, "loss": 2.2928152084350586, "step": 2695 }, { "epoch": 1.8377641445126107, "grad_norm": 35.75, "learning_rate": 3.4624122089397137e-07, "loss": 4.913736343383789, "step": 2696 }, { "epoch": 1.8384458077709611, "grad_norm": 20.75, "learning_rate": 3.4336652817592466e-07, "loss": 4.894477844238281, "step": 2697 }, { "epoch": 1.8391274710293115, "grad_norm": 15.8125, "learning_rate": 3.405036104576509e-07, "loss": 3.2707018852233887, "step": 2698 }, { "epoch": 1.839809134287662, "grad_norm": 48.25, "learning_rate": 3.37652471230101e-07, "loss": 7.460787773132324, "step": 2699 }, { "epoch": 1.8404907975460123, "grad_norm": 27.0, "learning_rate": 3.3481311396986626e-07, "loss": 7.362221717834473, "step": 2700 }, { "epoch": 1.8411724608043627, "grad_norm": 20.0, "learning_rate": 3.319855421391738e-07, "loss": 3.8700199127197266, "step": 2701 }, { "epoch": 1.841854124062713, "grad_norm": 63.25, "learning_rate": 3.2916975918587445e-07, "loss": 7.850796699523926, "step": 2702 }, { "epoch": 1.8425357873210633, "grad_norm": 16.5, "learning_rate": 3.263657685434485e-07, "loss": 4.616790771484375, "step": 2703 }, { "epoch": 1.8432174505794139, "grad_norm": 14.875, "learning_rate": 3.23573573630992e-07, "loss": 3.1093297004699707, "step": 2704 }, { "epoch": 1.843899113837764, "grad_norm": 35.25, "learning_rate": 3.2079317785322363e-07, "loss": 4.5120463371276855, "step": 2705 }, { "epoch": 1.8445807770961147, "grad_norm": 17.5, "learning_rate": 3.180245846004726e-07, "loss": 6.021507263183594, "step": 2706 }, { "epoch": 1.8452624403544649, "grad_norm": 20.25, "learning_rate": 3.152677972486728e-07, "loss": 6.066402435302734, "step": 2707 }, { "epoch": 1.8459441036128152, "grad_norm": 42.0, "learning_rate": 3.1252281915936766e-07, "loss": 7.412055492401123, "step": 2708 }, { "epoch": 1.8466257668711656, "grad_norm": 22.875, "learning_rate": 3.097896536796985e-07, "loss": 5.253879547119141, "step": 2709 }, { "epoch": 1.847307430129516, "grad_norm": 23.5, "learning_rate": 3.0706830414240164e-07, "loss": 6.557915687561035, "step": 2710 }, { "epoch": 1.8479890933878664, "grad_norm": 21.375, "learning_rate": 3.0435877386580717e-07, "loss": 5.4784064292907715, "step": 2711 }, { "epoch": 1.8486707566462166, "grad_norm": 27.625, "learning_rate": 3.0166106615383216e-07, "loss": 5.2541656494140625, "step": 2712 }, { "epoch": 1.8493524199045672, "grad_norm": 39.5, "learning_rate": 2.989751842959776e-07, "loss": 6.372235298156738, "step": 2713 }, { "epoch": 1.8500340831629174, "grad_norm": 19.25, "learning_rate": 2.963011315673259e-07, "loss": 6.7360382080078125, "step": 2714 }, { "epoch": 1.850715746421268, "grad_norm": 24.0, "learning_rate": 2.9363891122853097e-07, "loss": 4.99141788482666, "step": 2715 }, { "epoch": 1.8513974096796182, "grad_norm": 25.75, "learning_rate": 2.9098852652582385e-07, "loss": 5.689350128173828, "step": 2716 }, { "epoch": 1.8520790729379688, "grad_norm": 21.75, "learning_rate": 2.8834998069100285e-07, "loss": 5.647188663482666, "step": 2717 }, { "epoch": 1.852760736196319, "grad_norm": 21.125, "learning_rate": 2.8572327694142754e-07, "loss": 2.4793665409088135, "step": 2718 }, { "epoch": 1.8534423994546694, "grad_norm": 13.1875, "learning_rate": 2.8310841848001927e-07, "loss": 3.5758378505706787, "step": 2719 }, { "epoch": 1.8541240627130198, "grad_norm": 23.0, "learning_rate": 2.805054084952552e-07, "loss": 5.691527366638184, "step": 2720 }, { "epoch": 1.8548057259713702, "grad_norm": 28.875, "learning_rate": 2.7791425016116537e-07, "loss": 5.572688579559326, "step": 2721 }, { "epoch": 1.8554873892297206, "grad_norm": 15.5625, "learning_rate": 2.7533494663732894e-07, "loss": 5.957261085510254, "step": 2722 }, { "epoch": 1.8561690524880707, "grad_norm": 16.75, "learning_rate": 2.7276750106886686e-07, "loss": 3.551358222961426, "step": 2723 }, { "epoch": 1.8568507157464214, "grad_norm": 28.5, "learning_rate": 2.7021191658644474e-07, "loss": 6.168783664703369, "step": 2724 }, { "epoch": 1.8575323790047715, "grad_norm": 17.625, "learning_rate": 2.6766819630626216e-07, "loss": 5.443592548370361, "step": 2725 }, { "epoch": 1.8582140422631221, "grad_norm": 16.25, "learning_rate": 2.6513634333005134e-07, "loss": 4.129920959472656, "step": 2726 }, { "epoch": 1.8588957055214723, "grad_norm": 14.625, "learning_rate": 2.6261636074507933e-07, "loss": 4.466020584106445, "step": 2727 }, { "epoch": 1.8595773687798227, "grad_norm": 28.625, "learning_rate": 2.6010825162413043e-07, "loss": 7.447360038757324, "step": 2728 }, { "epoch": 1.860259032038173, "grad_norm": 46.75, "learning_rate": 2.5761201902551714e-07, "loss": 7.256712436676025, "step": 2729 }, { "epoch": 1.8609406952965235, "grad_norm": 29.125, "learning_rate": 2.5512766599306903e-07, "loss": 7.634082794189453, "step": 2730 }, { "epoch": 1.861622358554874, "grad_norm": 48.0, "learning_rate": 2.526551955561274e-07, "loss": 5.994986534118652, "step": 2731 }, { "epoch": 1.8623040218132243, "grad_norm": 18.125, "learning_rate": 2.501946107295472e-07, "loss": 2.852592945098877, "step": 2732 }, { "epoch": 1.8629856850715747, "grad_norm": 23.0, "learning_rate": 2.477459145136907e-07, "loss": 5.904362678527832, "step": 2733 }, { "epoch": 1.8636673483299249, "grad_norm": 21.625, "learning_rate": 2.453091098944205e-07, "loss": 6.216617107391357, "step": 2734 }, { "epoch": 1.8643490115882755, "grad_norm": 22.625, "learning_rate": 2.4288419984310086e-07, "loss": 4.357490062713623, "step": 2735 }, { "epoch": 1.8650306748466257, "grad_norm": 31.375, "learning_rate": 2.404711873165921e-07, "loss": 2.488081455230713, "step": 2736 }, { "epoch": 1.8657123381049763, "grad_norm": 36.0, "learning_rate": 2.3807007525724623e-07, "loss": 7.458122730255127, "step": 2737 }, { "epoch": 1.8663940013633264, "grad_norm": 15.8125, "learning_rate": 2.3568086659290778e-07, "loss": 4.308403968811035, "step": 2738 }, { "epoch": 1.8670756646216768, "grad_norm": 17.125, "learning_rate": 2.333035642368997e-07, "loss": 5.6159772872924805, "step": 2739 }, { "epoch": 1.8677573278800272, "grad_norm": 14.9375, "learning_rate": 2.3093817108803318e-07, "loss": 4.081464767456055, "step": 2740 }, { "epoch": 1.8684389911383776, "grad_norm": 13.75, "learning_rate": 2.2858469003059546e-07, "loss": 2.9483556747436523, "step": 2741 }, { "epoch": 1.869120654396728, "grad_norm": 30.375, "learning_rate": 2.2624312393434766e-07, "loss": 5.659716606140137, "step": 2742 }, { "epoch": 1.8698023176550784, "grad_norm": 15.375, "learning_rate": 2.239134756545247e-07, "loss": 4.347964286804199, "step": 2743 }, { "epoch": 1.8704839809134288, "grad_norm": 32.5, "learning_rate": 2.2159574803182425e-07, "loss": 7.8822526931762695, "step": 2744 }, { "epoch": 1.871165644171779, "grad_norm": 29.5, "learning_rate": 2.1928994389241454e-07, "loss": 5.665243148803711, "step": 2745 }, { "epoch": 1.8718473074301296, "grad_norm": 15.625, "learning_rate": 2.16996066047922e-07, "loss": 2.13620924949646, "step": 2746 }, { "epoch": 1.8725289706884798, "grad_norm": 42.5, "learning_rate": 2.1471411729542701e-07, "loss": 9.216123580932617, "step": 2747 }, { "epoch": 1.8732106339468304, "grad_norm": 49.0, "learning_rate": 2.1244410041746933e-07, "loss": 8.209264755249023, "step": 2748 }, { "epoch": 1.8738922972051806, "grad_norm": 20.375, "learning_rate": 2.1018601818203809e-07, "loss": 4.271251678466797, "step": 2749 }, { "epoch": 1.874573960463531, "grad_norm": 21.5, "learning_rate": 2.0793987334256637e-07, "loss": 5.463613986968994, "step": 2750 }, { "epoch": 1.8752556237218814, "grad_norm": 15.0, "learning_rate": 2.057056686379366e-07, "loss": 3.6207642555236816, "step": 2751 }, { "epoch": 1.8759372869802318, "grad_norm": 21.25, "learning_rate": 2.0348340679246625e-07, "loss": 5.525351524353027, "step": 2752 }, { "epoch": 1.8766189502385822, "grad_norm": 24.125, "learning_rate": 2.0127309051591325e-07, "loss": 4.146646022796631, "step": 2753 }, { "epoch": 1.8773006134969326, "grad_norm": 13.3125, "learning_rate": 1.9907472250346947e-07, "loss": 1.9141697883605957, "step": 2754 }, { "epoch": 1.877982276755283, "grad_norm": 13.625, "learning_rate": 1.968883054357562e-07, "loss": 3.906130313873291, "step": 2755 }, { "epoch": 1.8786639400136331, "grad_norm": 18.25, "learning_rate": 1.9471384197882414e-07, "loss": 3.3061985969543457, "step": 2756 }, { "epoch": 1.8793456032719837, "grad_norm": 13.1875, "learning_rate": 1.9255133478414457e-07, "loss": 3.8614180088043213, "step": 2757 }, { "epoch": 1.880027266530334, "grad_norm": 38.75, "learning_rate": 1.904007864886126e-07, "loss": 8.076139450073242, "step": 2758 }, { "epoch": 1.8807089297886845, "grad_norm": 14.3125, "learning_rate": 1.8826219971454064e-07, "loss": 4.6324567794799805, "step": 2759 }, { "epoch": 1.8813905930470347, "grad_norm": 19.5, "learning_rate": 1.861355770696549e-07, "loss": 3.999713659286499, "step": 2760 }, { "epoch": 1.882072256305385, "grad_norm": 12.9375, "learning_rate": 1.8402092114709226e-07, "loss": 4.662599563598633, "step": 2761 }, { "epoch": 1.8827539195637355, "grad_norm": 52.0, "learning_rate": 1.8191823452540115e-07, "loss": 8.818958282470703, "step": 2762 }, { "epoch": 1.883435582822086, "grad_norm": 14.4375, "learning_rate": 1.7982751976852841e-07, "loss": 3.6795005798339844, "step": 2763 }, { "epoch": 1.8841172460804363, "grad_norm": 17.5, "learning_rate": 1.7774877942583147e-07, "loss": 3.689544677734375, "step": 2764 }, { "epoch": 1.8847989093387867, "grad_norm": 19.0, "learning_rate": 1.7568201603205827e-07, "loss": 5.567490577697754, "step": 2765 }, { "epoch": 1.885480572597137, "grad_norm": 18.25, "learning_rate": 1.7362723210735843e-07, "loss": 2.926252841949463, "step": 2766 }, { "epoch": 1.8861622358554873, "grad_norm": 29.875, "learning_rate": 1.715844301572711e-07, "loss": 8.754290580749512, "step": 2767 }, { "epoch": 1.8868438991138379, "grad_norm": 22.25, "learning_rate": 1.6955361267272596e-07, "loss": 3.3489232063293457, "step": 2768 }, { "epoch": 1.887525562372188, "grad_norm": 26.625, "learning_rate": 1.6753478213003772e-07, "loss": 6.306264400482178, "step": 2769 }, { "epoch": 1.8882072256305387, "grad_norm": 19.625, "learning_rate": 1.6552794099090718e-07, "loss": 3.534698963165283, "step": 2770 }, { "epoch": 1.8888888888888888, "grad_norm": 43.75, "learning_rate": 1.635330917024125e-07, "loss": 9.404044151306152, "step": 2771 }, { "epoch": 1.8895705521472392, "grad_norm": 19.5, "learning_rate": 1.6155023669701454e-07, "loss": 5.367260932922363, "step": 2772 }, { "epoch": 1.8902522154055896, "grad_norm": 46.5, "learning_rate": 1.5957937839254146e-07, "loss": 7.046936988830566, "step": 2773 }, { "epoch": 1.89093387866394, "grad_norm": 26.25, "learning_rate": 1.5762051919219867e-07, "loss": 6.138800144195557, "step": 2774 }, { "epoch": 1.8916155419222904, "grad_norm": 22.625, "learning_rate": 1.5567366148455887e-07, "loss": 2.1423940658569336, "step": 2775 }, { "epoch": 1.8922972051806408, "grad_norm": 15.9375, "learning_rate": 1.5373880764355865e-07, "loss": 4.274113655090332, "step": 2776 }, { "epoch": 1.8929788684389912, "grad_norm": 13.0625, "learning_rate": 1.5181596002850075e-07, "loss": 4.809713363647461, "step": 2777 }, { "epoch": 1.8936605316973414, "grad_norm": 14.125, "learning_rate": 1.4990512098404296e-07, "loss": 2.7128477096557617, "step": 2778 }, { "epoch": 1.894342194955692, "grad_norm": 19.25, "learning_rate": 1.4800629284020706e-07, "loss": 6.156774520874023, "step": 2779 }, { "epoch": 1.8950238582140422, "grad_norm": 15.3125, "learning_rate": 1.4611947791236314e-07, "loss": 3.874891519546509, "step": 2780 }, { "epoch": 1.8957055214723928, "grad_norm": 28.25, "learning_rate": 1.4424467850123413e-07, "loss": 5.038738250732422, "step": 2781 }, { "epoch": 1.896387184730743, "grad_norm": 19.375, "learning_rate": 1.4238189689289362e-07, "loss": 5.0992536544799805, "step": 2782 }, { "epoch": 1.8970688479890934, "grad_norm": 16.375, "learning_rate": 1.4053113535876019e-07, "loss": 3.9093780517578125, "step": 2783 }, { "epoch": 1.8977505112474438, "grad_norm": 18.0, "learning_rate": 1.3869239615559303e-07, "loss": 5.364569664001465, "step": 2784 }, { "epoch": 1.8984321745057942, "grad_norm": 20.625, "learning_rate": 1.3686568152549539e-07, "loss": 3.2042596340179443, "step": 2785 }, { "epoch": 1.8991138377641446, "grad_norm": 17.625, "learning_rate": 1.350509936959077e-07, "loss": 5.5806803703308105, "step": 2786 }, { "epoch": 1.8997955010224947, "grad_norm": 14.75, "learning_rate": 1.332483348796021e-07, "loss": 3.663651943206787, "step": 2787 }, { "epoch": 1.9004771642808453, "grad_norm": 33.0, "learning_rate": 1.3145770727468588e-07, "loss": 3.014803647994995, "step": 2788 }, { "epoch": 1.9011588275391955, "grad_norm": 12.1875, "learning_rate": 1.296791130645947e-07, "loss": 4.328468322753906, "step": 2789 }, { "epoch": 1.9018404907975461, "grad_norm": 28.25, "learning_rate": 1.2791255441809037e-07, "loss": 4.1046905517578125, "step": 2790 }, { "epoch": 1.9025221540558963, "grad_norm": 13.25, "learning_rate": 1.2615803348926092e-07, "loss": 3.75811767578125, "step": 2791 }, { "epoch": 1.903203817314247, "grad_norm": 37.5, "learning_rate": 1.244155524175139e-07, "loss": 7.976456165313721, "step": 2792 }, { "epoch": 1.903885480572597, "grad_norm": 58.0, "learning_rate": 1.2268511332757747e-07, "loss": 7.428311824798584, "step": 2793 }, { "epoch": 1.9045671438309475, "grad_norm": 24.125, "learning_rate": 1.20966718329496e-07, "loss": 8.219427108764648, "step": 2794 }, { "epoch": 1.905248807089298, "grad_norm": 31.0, "learning_rate": 1.1926036951862563e-07, "loss": 8.255804061889648, "step": 2795 }, { "epoch": 1.9059304703476483, "grad_norm": 23.375, "learning_rate": 1.1756606897563639e-07, "loss": 5.16880989074707, "step": 2796 }, { "epoch": 1.9066121336059987, "grad_norm": 21.625, "learning_rate": 1.1588381876650568e-07, "loss": 3.8614726066589355, "step": 2797 }, { "epoch": 1.9072937968643489, "grad_norm": 11.75, "learning_rate": 1.1421362094251598e-07, "loss": 3.3713297843933105, "step": 2798 }, { "epoch": 1.9079754601226995, "grad_norm": 21.0, "learning_rate": 1.1255547754025708e-07, "loss": 2.560875415802002, "step": 2799 }, { "epoch": 1.9086571233810496, "grad_norm": 34.0, "learning_rate": 1.109093905816172e-07, "loss": 6.241981506347656, "step": 2800 }, { "epoch": 1.9093387866394003, "grad_norm": 15.5, "learning_rate": 1.0927536207378187e-07, "loss": 4.090888977050781, "step": 2801 }, { "epoch": 1.9100204498977504, "grad_norm": 17.875, "learning_rate": 1.0765339400923724e-07, "loss": 3.7561488151550293, "step": 2802 }, { "epoch": 1.9107021131561008, "grad_norm": 35.0, "learning_rate": 1.0604348836575906e-07, "loss": 8.605913162231445, "step": 2803 }, { "epoch": 1.9113837764144512, "grad_norm": 25.25, "learning_rate": 1.0444564710641702e-07, "loss": 2.8657777309417725, "step": 2804 }, { "epoch": 1.9120654396728016, "grad_norm": 14.1875, "learning_rate": 1.0285987217957038e-07, "loss": 4.091822624206543, "step": 2805 }, { "epoch": 1.912747102931152, "grad_norm": 15.6875, "learning_rate": 1.0128616551886128e-07, "loss": 5.448248863220215, "step": 2806 }, { "epoch": 1.9134287661895024, "grad_norm": 20.75, "learning_rate": 9.972452904322249e-08, "loss": 5.602548599243164, "step": 2807 }, { "epoch": 1.9141104294478528, "grad_norm": 26.0, "learning_rate": 9.817496465686193e-08, "loss": 4.781885623931885, "step": 2808 }, { "epoch": 1.914792092706203, "grad_norm": 19.125, "learning_rate": 9.663747424927262e-08, "loss": 2.518782138824463, "step": 2809 }, { "epoch": 1.9154737559645536, "grad_norm": 17.625, "learning_rate": 9.511205969522263e-08, "loss": 2.5301921367645264, "step": 2810 }, { "epoch": 1.9161554192229038, "grad_norm": 41.25, "learning_rate": 9.359872285475302e-08, "loss": 7.156878471374512, "step": 2811 }, { "epoch": 1.9168370824812544, "grad_norm": 19.5, "learning_rate": 9.209746557318211e-08, "loss": 2.3349649906158447, "step": 2812 }, { "epoch": 1.9175187457396046, "grad_norm": 29.625, "learning_rate": 9.060828968109558e-08, "loss": 6.6991705894470215, "step": 2813 }, { "epoch": 1.918200408997955, "grad_norm": 49.5, "learning_rate": 8.91311969943487e-08, "loss": 5.041890621185303, "step": 2814 }, { "epoch": 1.9188820722563054, "grad_norm": 14.8125, "learning_rate": 8.76661893140629e-08, "loss": 2.7401363849639893, "step": 2815 }, { "epoch": 1.9195637355146558, "grad_norm": 21.75, "learning_rate": 8.621326842662148e-08, "loss": 3.6127824783325195, "step": 2816 }, { "epoch": 1.9202453987730062, "grad_norm": 23.5, "learning_rate": 8.47724361036728e-08, "loss": 4.600052833557129, "step": 2817 }, { "epoch": 1.9209270620313565, "grad_norm": 18.75, "learning_rate": 8.334369410212262e-08, "loss": 3.7339627742767334, "step": 2818 }, { "epoch": 1.921608725289707, "grad_norm": 9.75, "learning_rate": 8.192704416413511e-08, "loss": 2.3137359619140625, "step": 2819 }, { "epoch": 1.9222903885480571, "grad_norm": 27.875, "learning_rate": 8.052248801712958e-08, "loss": 3.7732396125793457, "step": 2820 }, { "epoch": 1.9229720518064077, "grad_norm": 48.75, "learning_rate": 7.913002737377718e-08, "loss": 6.443537712097168, "step": 2821 }, { "epoch": 1.923653715064758, "grad_norm": 22.875, "learning_rate": 7.774966393200189e-08, "loss": 5.898337364196777, "step": 2822 }, { "epoch": 1.9243353783231085, "grad_norm": 48.5, "learning_rate": 7.638139937497624e-08, "loss": 9.554438591003418, "step": 2823 }, { "epoch": 1.9250170415814587, "grad_norm": 13.8125, "learning_rate": 7.502523537111894e-08, "loss": 3.576861619949341, "step": 2824 }, { "epoch": 1.925698704839809, "grad_norm": 19.125, "learning_rate": 7.36811735740961e-08, "loss": 4.202127933502197, "step": 2825 }, { "epoch": 1.9263803680981595, "grad_norm": 17.875, "learning_rate": 7.234921562281227e-08, "loss": 4.0487518310546875, "step": 2826 }, { "epoch": 1.9270620313565099, "grad_norm": 19.625, "learning_rate": 7.102936314141828e-08, "loss": 4.949954032897949, "step": 2827 }, { "epoch": 1.9277436946148603, "grad_norm": 18.625, "learning_rate": 6.972161773930119e-08, "loss": 4.587427616119385, "step": 2828 }, { "epoch": 1.9284253578732107, "grad_norm": 22.5, "learning_rate": 6.842598101108322e-08, "loss": 6.401041030883789, "step": 2829 }, { "epoch": 1.929107021131561, "grad_norm": 15.0, "learning_rate": 6.714245453662504e-08, "loss": 4.744366645812988, "step": 2830 }, { "epoch": 1.9297886843899112, "grad_norm": 14.4375, "learning_rate": 6.587103988102028e-08, "loss": 4.25332498550415, "step": 2831 }, { "epoch": 1.9304703476482619, "grad_norm": 37.75, "learning_rate": 6.461173859458992e-08, "loss": 6.374660491943359, "step": 2832 }, { "epoch": 1.931152010906612, "grad_norm": 24.5, "learning_rate": 6.336455221288895e-08, "loss": 4.139296531677246, "step": 2833 }, { "epoch": 1.9318336741649627, "grad_norm": 27.125, "learning_rate": 6.212948225669757e-08, "loss": 6.4993414878845215, "step": 2834 }, { "epoch": 1.9325153374233128, "grad_norm": 13.1875, "learning_rate": 6.090653023201997e-08, "loss": 3.9064764976501465, "step": 2835 }, { "epoch": 1.9331970006816632, "grad_norm": 28.125, "learning_rate": 5.969569763008886e-08, "loss": 5.976467132568359, "step": 2836 }, { "epoch": 1.9338786639400136, "grad_norm": 36.5, "learning_rate": 5.849698592735431e-08, "loss": 7.72698974609375, "step": 2837 }, { "epoch": 1.934560327198364, "grad_norm": 17.5, "learning_rate": 5.731039658548821e-08, "loss": 8.171910285949707, "step": 2838 }, { "epoch": 1.9352419904567144, "grad_norm": 15.5, "learning_rate": 5.6135931051380934e-08, "loss": 2.981773614883423, "step": 2839 }, { "epoch": 1.9359236537150648, "grad_norm": 11.9375, "learning_rate": 5.497359075714026e-08, "loss": 4.446168899536133, "step": 2840 }, { "epoch": 1.9366053169734152, "grad_norm": 17.375, "learning_rate": 5.3823377120086894e-08, "loss": 4.201350688934326, "step": 2841 }, { "epoch": 1.9372869802317654, "grad_norm": 11.5, "learning_rate": 5.26852915427567e-08, "loss": 2.751448392868042, "step": 2842 }, { "epoch": 1.937968643490116, "grad_norm": 28.625, "learning_rate": 5.155933541289515e-08, "loss": 6.7884297370910645, "step": 2843 }, { "epoch": 1.9386503067484662, "grad_norm": 13.5625, "learning_rate": 5.044551010345844e-08, "loss": 3.018629789352417, "step": 2844 }, { "epoch": 1.9393319700068168, "grad_norm": 26.5, "learning_rate": 4.934381697261015e-08, "loss": 3.2539620399475098, "step": 2845 }, { "epoch": 1.940013633265167, "grad_norm": 11.25, "learning_rate": 4.8254257363722356e-08, "loss": 3.2074108123779297, "step": 2846 }, { "epoch": 1.9406952965235174, "grad_norm": 34.0, "learning_rate": 4.717683260536898e-08, "loss": 6.667632102966309, "step": 2847 }, { "epoch": 1.9413769597818678, "grad_norm": 13.5, "learning_rate": 4.6111544011329115e-08, "loss": 3.165349006652832, "step": 2848 }, { "epoch": 1.9420586230402181, "grad_norm": 16.75, "learning_rate": 4.5058392880581445e-08, "loss": 3.306952714920044, "step": 2849 }, { "epoch": 1.9427402862985685, "grad_norm": 11.0, "learning_rate": 4.401738049730653e-08, "loss": 3.981013298034668, "step": 2850 }, { "epoch": 1.943421949556919, "grad_norm": 35.5, "learning_rate": 4.29885081308834e-08, "loss": 10.325587272644043, "step": 2851 }, { "epoch": 1.9441036128152693, "grad_norm": 17.125, "learning_rate": 4.1971777035887395e-08, "loss": 4.540951251983643, "step": 2852 }, { "epoch": 1.9447852760736195, "grad_norm": 12.1875, "learning_rate": 4.09671884520868e-08, "loss": 3.9824318885803223, "step": 2853 }, { "epoch": 1.9454669393319701, "grad_norm": 13.8125, "learning_rate": 3.997474360444731e-08, "loss": 4.659494876861572, "step": 2854 }, { "epoch": 1.9461486025903203, "grad_norm": 21.0, "learning_rate": 3.899444370312533e-08, "loss": 4.34985876083374, "step": 2855 }, { "epoch": 1.946830265848671, "grad_norm": 39.75, "learning_rate": 3.802628994346802e-08, "loss": 7.219938278198242, "step": 2856 }, { "epoch": 1.947511929107021, "grad_norm": 19.625, "learning_rate": 3.707028350601327e-08, "loss": 3.730142593383789, "step": 2857 }, { "epoch": 1.9481935923653715, "grad_norm": 19.75, "learning_rate": 3.6126425556483046e-08, "loss": 5.669454097747803, "step": 2858 }, { "epoch": 1.9488752556237219, "grad_norm": 24.125, "learning_rate": 3.5194717245790046e-08, "loss": 5.918878555297852, "step": 2859 }, { "epoch": 1.9495569188820723, "grad_norm": 20.375, "learning_rate": 3.4275159710032146e-08, "loss": 7.635791778564453, "step": 2860 }, { "epoch": 1.9502385821404227, "grad_norm": 13.1875, "learning_rate": 3.336775407048576e-08, "loss": 4.050065040588379, "step": 2861 }, { "epoch": 1.9509202453987728, "grad_norm": 20.625, "learning_rate": 3.24725014336158e-08, "loss": 5.742995262145996, "step": 2862 }, { "epoch": 1.9516019086571235, "grad_norm": 22.375, "learning_rate": 3.158940289106571e-08, "loss": 4.090199947357178, "step": 2863 }, { "epoch": 1.9522835719154736, "grad_norm": 10.75, "learning_rate": 3.071845951965746e-08, "loss": 3.724001407623291, "step": 2864 }, { "epoch": 1.9529652351738243, "grad_norm": 16.125, "learning_rate": 2.9859672381392644e-08, "loss": 2.6587259769439697, "step": 2865 }, { "epoch": 1.9536468984321744, "grad_norm": 26.375, "learning_rate": 2.9013042523450274e-08, "loss": 7.501119136810303, "step": 2866 }, { "epoch": 1.954328561690525, "grad_norm": 69.5, "learning_rate": 2.8178570978183438e-08, "loss": 6.178948402404785, "step": 2867 }, { "epoch": 1.9550102249488752, "grad_norm": 21.5, "learning_rate": 2.7356258763121535e-08, "loss": 2.5224039554595947, "step": 2868 }, { "epoch": 1.9556918882072256, "grad_norm": 37.75, "learning_rate": 2.6546106880966926e-08, "loss": 7.612993240356445, "step": 2869 }, { "epoch": 1.956373551465576, "grad_norm": 28.5, "learning_rate": 2.574811631959273e-08, "loss": 4.4909749031066895, "step": 2870 }, { "epoch": 1.9570552147239264, "grad_norm": 13.3125, "learning_rate": 2.4962288052045037e-08, "loss": 2.6139304637908936, "step": 2871 }, { "epoch": 1.9577368779822768, "grad_norm": 25.625, "learning_rate": 2.418862303653846e-08, "loss": 6.43367862701416, "step": 2872 }, { "epoch": 1.958418541240627, "grad_norm": 13.8125, "learning_rate": 2.3427122216456154e-08, "loss": 4.173886775970459, "step": 2873 }, { "epoch": 1.9591002044989776, "grad_norm": 17.0, "learning_rate": 2.267778652034647e-08, "loss": 4.3889055252075195, "step": 2874 }, { "epoch": 1.9597818677573278, "grad_norm": 17.5, "learning_rate": 2.1940616861929608e-08, "loss": 2.702101945877075, "step": 2875 }, { "epoch": 1.9604635310156784, "grad_norm": 34.75, "learning_rate": 2.1215614140085438e-08, "loss": 5.340231418609619, "step": 2876 }, { "epoch": 1.9611451942740286, "grad_norm": 26.875, "learning_rate": 2.0502779238860125e-08, "loss": 8.3497896194458, "step": 2877 }, { "epoch": 1.961826857532379, "grad_norm": 26.625, "learning_rate": 1.9802113027461712e-08, "loss": 5.967607021331787, "step": 2878 }, { "epoch": 1.9625085207907293, "grad_norm": 22.25, "learning_rate": 1.911361636026232e-08, "loss": 4.917688369750977, "step": 2879 }, { "epoch": 1.9631901840490797, "grad_norm": 27.375, "learning_rate": 1.8437290076792624e-08, "loss": 5.173532485961914, "step": 2880 }, { "epoch": 1.9638718473074301, "grad_norm": 16.625, "learning_rate": 1.7773135001744047e-08, "loss": 3.6276044845581055, "step": 2881 }, { "epoch": 1.9645535105657805, "grad_norm": 32.25, "learning_rate": 1.7121151944966553e-08, "loss": 6.414065361022949, "step": 2882 }, { "epoch": 1.965235173824131, "grad_norm": 23.875, "learning_rate": 1.6481341701468644e-08, "loss": 5.12346076965332, "step": 2883 }, { "epoch": 1.965916837082481, "grad_norm": 16.0, "learning_rate": 1.5853705051415148e-08, "loss": 4.672459602355957, "step": 2884 }, { "epoch": 1.9665985003408317, "grad_norm": 38.0, "learning_rate": 1.5238242760126088e-08, "loss": 8.251648902893066, "step": 2885 }, { "epoch": 1.967280163599182, "grad_norm": 29.625, "learning_rate": 1.4634955578076704e-08, "loss": 8.433072090148926, "step": 2886 }, { "epoch": 1.9679618268575325, "grad_norm": 23.75, "learning_rate": 1.4043844240898552e-08, "loss": 7.470628261566162, "step": 2887 }, { "epoch": 1.9686434901158827, "grad_norm": 15.9375, "learning_rate": 1.3464909469372845e-08, "loss": 4.404353141784668, "step": 2888 }, { "epoch": 1.969325153374233, "grad_norm": 26.625, "learning_rate": 1.2898151969434892e-08, "loss": 4.148073196411133, "step": 2889 }, { "epoch": 1.9700068166325835, "grad_norm": 44.0, "learning_rate": 1.234357243217188e-08, "loss": 8.738536834716797, "step": 2890 }, { "epoch": 1.9706884798909339, "grad_norm": 46.75, "learning_rate": 1.1801171533820655e-08, "loss": 8.13895034790039, "step": 2891 }, { "epoch": 1.9713701431492843, "grad_norm": 17.625, "learning_rate": 1.1270949935767716e-08, "loss": 5.891839981079102, "step": 2892 }, { "epoch": 1.9720518064076347, "grad_norm": 20.125, "learning_rate": 1.0752908284549223e-08, "loss": 5.195493698120117, "step": 2893 }, { "epoch": 1.972733469665985, "grad_norm": 16.5, "learning_rate": 1.0247047211849881e-08, "loss": 1.8649721145629883, "step": 2894 }, { "epoch": 1.9734151329243352, "grad_norm": 13.0, "learning_rate": 9.753367334499608e-09, "loss": 2.133979082107544, "step": 2895 }, { "epoch": 1.9740967961826859, "grad_norm": 46.0, "learning_rate": 9.271869254476873e-09, "loss": 6.589439392089844, "step": 2896 }, { "epoch": 1.974778459441036, "grad_norm": 28.25, "learning_rate": 8.802553558905357e-09, "loss": 5.365048885345459, "step": 2897 }, { "epoch": 1.9754601226993866, "grad_norm": 11.8125, "learning_rate": 8.345420820055073e-09, "loss": 4.62324333190918, "step": 2898 }, { "epoch": 1.9761417859577368, "grad_norm": 12.5625, "learning_rate": 7.90047159533791e-09, "loss": 2.7803709506988525, "step": 2899 }, { "epoch": 1.9768234492160872, "grad_norm": 14.4375, "learning_rate": 7.467706427312093e-09, "loss": 4.8446269035339355, "step": 2900 }, { "epoch": 1.9775051124744376, "grad_norm": 26.625, "learning_rate": 7.047125843678837e-09, "loss": 6.1476731300354, "step": 2901 }, { "epoch": 1.978186775732788, "grad_norm": 23.5, "learning_rate": 6.6387303572790215e-09, "loss": 3.5197243690490723, "step": 2902 }, { "epoch": 1.9788684389911384, "grad_norm": 29.375, "learning_rate": 6.242520466099855e-09, "loss": 5.298270225524902, "step": 2903 }, { "epoch": 1.9795501022494888, "grad_norm": 24.75, "learning_rate": 5.858496653265988e-09, "loss": 7.091706275939941, "step": 2904 }, { "epoch": 1.9802317655078392, "grad_norm": 16.125, "learning_rate": 5.486659387043958e-09, "loss": 5.049643039703369, "step": 2905 }, { "epoch": 1.9809134287661894, "grad_norm": 20.625, "learning_rate": 5.127009120841076e-09, "loss": 4.505095481872559, "step": 2906 }, { "epoch": 1.98159509202454, "grad_norm": 52.75, "learning_rate": 4.779546293204318e-09, "loss": 7.745931148529053, "step": 2907 }, { "epoch": 1.9822767552828902, "grad_norm": 26.0, "learning_rate": 4.444271327818106e-09, "loss": 4.939314365386963, "step": 2908 }, { "epoch": 1.9829584185412408, "grad_norm": 30.625, "learning_rate": 4.121184633506525e-09, "loss": 6.2001633644104, "step": 2909 }, { "epoch": 1.983640081799591, "grad_norm": 32.0, "learning_rate": 3.810286604232216e-09, "loss": 2.6781578063964844, "step": 2910 }, { "epoch": 1.9843217450579413, "grad_norm": 51.5, "learning_rate": 3.511577619093043e-09, "loss": 2.698533058166504, "step": 2911 }, { "epoch": 1.9850034083162917, "grad_norm": 25.875, "learning_rate": 3.2250580423276445e-09, "loss": 3.5193562507629395, "step": 2912 }, { "epoch": 1.9856850715746421, "grad_norm": 16.625, "learning_rate": 2.950728223307664e-09, "loss": 4.639314651489258, "step": 2913 }, { "epoch": 1.9863667348329925, "grad_norm": 15.375, "learning_rate": 2.6885884965421882e-09, "loss": 4.844074249267578, "step": 2914 }, { "epoch": 1.987048398091343, "grad_norm": 14.25, "learning_rate": 2.4386391816777488e-09, "loss": 4.526384353637695, "step": 2915 }, { "epoch": 1.9877300613496933, "grad_norm": 16.125, "learning_rate": 2.2008805834938807e-09, "loss": 4.126609802246094, "step": 2916 }, { "epoch": 1.9884117246080435, "grad_norm": 23.375, "learning_rate": 1.9753129919053425e-09, "loss": 3.363898754119873, "step": 2917 }, { "epoch": 1.9890933878663941, "grad_norm": 14.625, "learning_rate": 1.7619366819632277e-09, "loss": 3.8608059883117676, "step": 2918 }, { "epoch": 1.9897750511247443, "grad_norm": 13.25, "learning_rate": 1.5607519138516326e-09, "loss": 3.8306150436401367, "step": 2919 }, { "epoch": 1.990456714383095, "grad_norm": 29.5, "learning_rate": 1.3717589328898773e-09, "loss": 6.497473239898682, "step": 2920 }, { "epoch": 1.991138377641445, "grad_norm": 16.375, "learning_rate": 1.1949579695280656e-09, "loss": 3.9633736610412598, "step": 2921 }, { "epoch": 1.9918200408997955, "grad_norm": 25.875, "learning_rate": 1.0303492393548554e-09, "loss": 5.307573318481445, "step": 2922 }, { "epoch": 1.9925017041581459, "grad_norm": 16.75, "learning_rate": 8.77932943086357e-10, "loss": 3.291430950164795, "step": 2923 }, { "epoch": 1.9931833674164963, "grad_norm": 13.3125, "learning_rate": 7.377092665750152e-10, "loss": 1.8634185791015625, "step": 2924 }, { "epoch": 1.9938650306748467, "grad_norm": 43.75, "learning_rate": 6.096783808062778e-10, "loss": 8.025413513183594, "step": 2925 }, { "epoch": 1.9945466939331968, "grad_norm": 13.0625, "learning_rate": 4.938404418952658e-10, "loss": 4.106935501098633, "step": 2926 }, { "epoch": 1.9952283571915475, "grad_norm": 12.0, "learning_rate": 3.901955910934341e-10, "loss": 4.732489585876465, "step": 2927 }, { "epoch": 1.9959100204498976, "grad_norm": 12.3125, "learning_rate": 2.9874395477969e-10, "loss": 4.103945255279541, "step": 2928 }, { "epoch": 1.9965916837082482, "grad_norm": 18.0, "learning_rate": 2.19485644469275e-10, "loss": 5.395977973937988, "step": 2929 }, { "epoch": 1.9972733469665984, "grad_norm": 12.875, "learning_rate": 1.524207568059932e-10, "loss": 4.617153644561768, "step": 2930 }, { "epoch": 1.997955010224949, "grad_norm": 16.25, "learning_rate": 9.75493735677624e-11, "loss": 4.5145263671875, "step": 2931 }, { "epoch": 1.9986366734832992, "grad_norm": 25.25, "learning_rate": 5.4871561662173115e-11, "loss": 2.723302125930786, "step": 2932 }, { "epoch": 1.9993183367416496, "grad_norm": 13.4375, "learning_rate": 2.43873731287092e-11, "loss": 3.8983383178710938, "step": 2933 }, { "epoch": 2.0, "grad_norm": 60.5, "learning_rate": 6.0968451409682e-12, "loss": 7.608074188232422, "step": 2934 } ], "logging_steps": 1, "max_steps": 2934, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2995500862037033e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }