| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.9990982867448152, |
| "eval_steps": 500, |
| "global_step": 554, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0018034265103697023, |
| "grad_norm": 6.674532137525891, |
| "learning_rate": 0.0, |
| "loss": 1.2076, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.0036068530207394047, |
| "grad_norm": 6.154259029326631, |
| "learning_rate": 2.2522522522522524e-08, |
| "loss": 1.1491, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.005410279531109108, |
| "grad_norm": 6.771653034612783, |
| "learning_rate": 4.504504504504505e-08, |
| "loss": 1.0534, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.007213706041478809, |
| "grad_norm": 6.67369914164215, |
| "learning_rate": 6.756756756756757e-08, |
| "loss": 1.1268, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.009017132551848512, |
| "grad_norm": 6.948175490186477, |
| "learning_rate": 9.00900900900901e-08, |
| "loss": 1.0109, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.010820559062218215, |
| "grad_norm": 6.285358425270715, |
| "learning_rate": 1.1261261261261262e-07, |
| "loss": 1.1015, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.012623985572587917, |
| "grad_norm": 7.480900019360095, |
| "learning_rate": 1.3513513513513515e-07, |
| "loss": 1.1338, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.014427412082957619, |
| "grad_norm": 5.985730068048292, |
| "learning_rate": 1.5765765765765766e-07, |
| "loss": 1.0889, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.016230838593327322, |
| "grad_norm": 6.74360726738279, |
| "learning_rate": 1.801801801801802e-07, |
| "loss": 1.0402, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.018034265103697024, |
| "grad_norm": 6.603771834682011, |
| "learning_rate": 2.0270270270270273e-07, |
| "loss": 1.1394, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.019837691614066726, |
| "grad_norm": 6.070271548824436, |
| "learning_rate": 2.2522522522522524e-07, |
| "loss": 1.1439, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.02164111812443643, |
| "grad_norm": 6.814219599011481, |
| "learning_rate": 2.477477477477478e-07, |
| "loss": 1.0881, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.023444544634806132, |
| "grad_norm": 6.428167151788823, |
| "learning_rate": 2.702702702702703e-07, |
| "loss": 1.1163, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.025247971145175834, |
| "grad_norm": 7.047010957583219, |
| "learning_rate": 2.927927927927928e-07, |
| "loss": 1.0923, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.027051397655545536, |
| "grad_norm": 5.53060795190606, |
| "learning_rate": 3.153153153153153e-07, |
| "loss": 1.0118, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.028854824165915238, |
| "grad_norm": 6.682099184154056, |
| "learning_rate": 3.378378378378379e-07, |
| "loss": 1.1213, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.030658250676284943, |
| "grad_norm": 5.6679773328714615, |
| "learning_rate": 3.603603603603604e-07, |
| "loss": 1.2125, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.032461677186654644, |
| "grad_norm": 5.547383052372404, |
| "learning_rate": 3.828828828828829e-07, |
| "loss": 1.0804, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.034265103697024346, |
| "grad_norm": 7.200179452941777, |
| "learning_rate": 4.0540540540540546e-07, |
| "loss": 1.0663, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.03606853020739405, |
| "grad_norm": 5.325648740758234, |
| "learning_rate": 4.27927927927928e-07, |
| "loss": 1.2159, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.03787195671776375, |
| "grad_norm": 5.0110630414633475, |
| "learning_rate": 4.504504504504505e-07, |
| "loss": 0.9838, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.03967538322813345, |
| "grad_norm": 5.657518684511818, |
| "learning_rate": 4.7297297297297305e-07, |
| "loss": 1.0797, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.04147880973850315, |
| "grad_norm": 4.5143785306650575, |
| "learning_rate": 4.954954954954956e-07, |
| "loss": 1.1578, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.04328223624887286, |
| "grad_norm": 5.287373714805066, |
| "learning_rate": 5.180180180180181e-07, |
| "loss": 1.1259, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.04508566275924256, |
| "grad_norm": 4.812185404720589, |
| "learning_rate": 5.405405405405406e-07, |
| "loss": 1.0263, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.046889089269612265, |
| "grad_norm": 4.813260658431154, |
| "learning_rate": 5.630630630630631e-07, |
| "loss": 1.0294, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.04869251577998197, |
| "grad_norm": 5.38161770029296, |
| "learning_rate": 5.855855855855856e-07, |
| "loss": 1.0928, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.05049594229035167, |
| "grad_norm": 4.1250087598910365, |
| "learning_rate": 6.081081081081082e-07, |
| "loss": 1.0323, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.05229936880072137, |
| "grad_norm": 3.4592927519497496, |
| "learning_rate": 6.306306306306306e-07, |
| "loss": 1.0056, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.05410279531109107, |
| "grad_norm": 3.904048770191826, |
| "learning_rate": 6.531531531531532e-07, |
| "loss": 1.0514, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.05590622182146077, |
| "grad_norm": 3.648999951860501, |
| "learning_rate": 6.756756756756758e-07, |
| "loss": 1.0958, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.057709648331830475, |
| "grad_norm": 3.466281275771792, |
| "learning_rate": 6.981981981981982e-07, |
| "loss": 1.0257, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.059513074842200184, |
| "grad_norm": 3.392213870901326, |
| "learning_rate": 7.207207207207208e-07, |
| "loss": 0.9459, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.061316501352569885, |
| "grad_norm": 3.5029298346782385, |
| "learning_rate": 7.432432432432434e-07, |
| "loss": 0.9415, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.06311992786293959, |
| "grad_norm": 3.0634308438792632, |
| "learning_rate": 7.657657657657658e-07, |
| "loss": 1.0153, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.06492335437330929, |
| "grad_norm": 2.9484149128045, |
| "learning_rate": 7.882882882882883e-07, |
| "loss": 0.9878, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.06672678088367899, |
| "grad_norm": 3.0610426789398195, |
| "learning_rate": 8.108108108108109e-07, |
| "loss": 0.9123, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.06853020739404869, |
| "grad_norm": 3.32199769361744, |
| "learning_rate": 8.333333333333333e-07, |
| "loss": 1.0099, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.0703336339044184, |
| "grad_norm": 3.0709465427851046, |
| "learning_rate": 8.55855855855856e-07, |
| "loss": 1.0385, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.0721370604147881, |
| "grad_norm": 3.0428201478943575, |
| "learning_rate": 8.783783783783785e-07, |
| "loss": 0.9887, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.0739404869251578, |
| "grad_norm": 3.11038611613558, |
| "learning_rate": 9.00900900900901e-07, |
| "loss": 0.8805, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.0757439134355275, |
| "grad_norm": 3.5117708849754283, |
| "learning_rate": 9.234234234234235e-07, |
| "loss": 0.9708, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.0775473399458972, |
| "grad_norm": 3.436408499708477, |
| "learning_rate": 9.459459459459461e-07, |
| "loss": 0.991, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.0793507664562669, |
| "grad_norm": 2.707066762216591, |
| "learning_rate": 9.684684684684686e-07, |
| "loss": 0.8664, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.0811541929666366, |
| "grad_norm": 2.9154636312948647, |
| "learning_rate": 9.909909909909911e-07, |
| "loss": 0.9008, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.0829576194770063, |
| "grad_norm": 2.9028667627025726, |
| "learning_rate": 1.0135135135135136e-06, |
| "loss": 1.0705, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.08476104598737602, |
| "grad_norm": 2.6634992062941736, |
| "learning_rate": 1.0360360360360361e-06, |
| "loss": 0.891, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.08656447249774572, |
| "grad_norm": 2.738023098685531, |
| "learning_rate": 1.0585585585585587e-06, |
| "loss": 0.9246, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.08836789900811542, |
| "grad_norm": 2.5938725151636435, |
| "learning_rate": 1.0810810810810812e-06, |
| "loss": 0.9308, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.09017132551848513, |
| "grad_norm": 2.732422906982916, |
| "learning_rate": 1.1036036036036037e-06, |
| "loss": 1.0283, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.09197475202885483, |
| "grad_norm": 2.5138095481285814, |
| "learning_rate": 1.1261261261261262e-06, |
| "loss": 1.0285, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.09377817853922453, |
| "grad_norm": 2.5550555806196265, |
| "learning_rate": 1.148648648648649e-06, |
| "loss": 0.9065, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.09558160504959423, |
| "grad_norm": 2.3645335521201702, |
| "learning_rate": 1.1711711711711712e-06, |
| "loss": 0.8516, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.09738503155996393, |
| "grad_norm": 2.409700298550962, |
| "learning_rate": 1.1936936936936937e-06, |
| "loss": 0.8294, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.09918845807033363, |
| "grad_norm": 2.3183367981378145, |
| "learning_rate": 1.2162162162162164e-06, |
| "loss": 0.9365, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.10099188458070334, |
| "grad_norm": 2.1828402934512776, |
| "learning_rate": 1.2387387387387387e-06, |
| "loss": 0.8918, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.10279531109107304, |
| "grad_norm": 2.3691895978895094, |
| "learning_rate": 1.2612612612612613e-06, |
| "loss": 0.9768, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.10459873760144274, |
| "grad_norm": 2.3204193779879208, |
| "learning_rate": 1.2837837837837838e-06, |
| "loss": 0.7925, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.10640216411181244, |
| "grad_norm": 2.334168434235552, |
| "learning_rate": 1.3063063063063065e-06, |
| "loss": 0.855, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.10820559062218214, |
| "grad_norm": 2.256611178722444, |
| "learning_rate": 1.328828828828829e-06, |
| "loss": 0.8408, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.11000901713255185, |
| "grad_norm": 2.4778146158109924, |
| "learning_rate": 1.3513513513513515e-06, |
| "loss": 0.8964, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.11181244364292155, |
| "grad_norm": 2.4357880480005756, |
| "learning_rate": 1.373873873873874e-06, |
| "loss": 0.9621, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.11361587015329125, |
| "grad_norm": 2.2150394871764294, |
| "learning_rate": 1.3963963963963963e-06, |
| "loss": 0.8501, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.11541929666366095, |
| "grad_norm": 2.118545319018784, |
| "learning_rate": 1.418918918918919e-06, |
| "loss": 0.9133, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.11722272317403065, |
| "grad_norm": 2.1649234330413587, |
| "learning_rate": 1.4414414414414416e-06, |
| "loss": 0.7883, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.11902614968440037, |
| "grad_norm": 2.260747898334313, |
| "learning_rate": 1.463963963963964e-06, |
| "loss": 1.0857, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.12082957619477007, |
| "grad_norm": 2.3422569755909257, |
| "learning_rate": 1.4864864864864868e-06, |
| "loss": 0.9595, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.12263300270513977, |
| "grad_norm": 2.1879879443879067, |
| "learning_rate": 1.5090090090090093e-06, |
| "loss": 0.9373, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.12443642921550947, |
| "grad_norm": 2.1212636698318565, |
| "learning_rate": 1.5315315315315316e-06, |
| "loss": 0.8465, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.12623985572587917, |
| "grad_norm": 2.093523831224821, |
| "learning_rate": 1.5540540540540541e-06, |
| "loss": 0.8851, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.12804328223624886, |
| "grad_norm": 2.117245792873491, |
| "learning_rate": 1.5765765765765766e-06, |
| "loss": 0.8836, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.12984670874661858, |
| "grad_norm": 2.297950888317582, |
| "learning_rate": 1.5990990990990993e-06, |
| "loss": 0.8671, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.13165013525698827, |
| "grad_norm": 2.136681162174477, |
| "learning_rate": 1.6216216216216219e-06, |
| "loss": 0.9114, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.13345356176735798, |
| "grad_norm": 2.377418938286004, |
| "learning_rate": 1.6441441441441444e-06, |
| "loss": 0.9153, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.13525698827772767, |
| "grad_norm": 2.14684216322763, |
| "learning_rate": 1.6666666666666667e-06, |
| "loss": 0.895, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.13706041478809738, |
| "grad_norm": 2.0578144463585395, |
| "learning_rate": 1.6891891891891894e-06, |
| "loss": 0.7646, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.1388638412984671, |
| "grad_norm": 2.1370659943028256, |
| "learning_rate": 1.711711711711712e-06, |
| "loss": 0.963, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.1406672678088368, |
| "grad_norm": 2.1407789023578805, |
| "learning_rate": 1.7342342342342344e-06, |
| "loss": 0.8181, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.1424706943192065, |
| "grad_norm": 2.224908436029519, |
| "learning_rate": 1.756756756756757e-06, |
| "loss": 0.7726, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.1442741208295762, |
| "grad_norm": 2.4321949851329627, |
| "learning_rate": 1.7792792792792792e-06, |
| "loss": 0.857, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.1460775473399459, |
| "grad_norm": 2.144226568602669, |
| "learning_rate": 1.801801801801802e-06, |
| "loss": 0.8378, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.1478809738503156, |
| "grad_norm": 1.9826711249103168, |
| "learning_rate": 1.8243243243243245e-06, |
| "loss": 0.7902, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.1496844003606853, |
| "grad_norm": 2.291008678375686, |
| "learning_rate": 1.846846846846847e-06, |
| "loss": 0.8824, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.151487826871055, |
| "grad_norm": 2.145067975437641, |
| "learning_rate": 1.8693693693693697e-06, |
| "loss": 0.7856, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.1532912533814247, |
| "grad_norm": 2.123969288662637, |
| "learning_rate": 1.8918918918918922e-06, |
| "loss": 0.7462, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.1550946798917944, |
| "grad_norm": 2.2295818146317776, |
| "learning_rate": 1.9144144144144145e-06, |
| "loss": 0.8886, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.15689810640216412, |
| "grad_norm": 2.140597774817277, |
| "learning_rate": 1.9369369369369372e-06, |
| "loss": 0.8319, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.1587015329125338, |
| "grad_norm": 2.344451235287885, |
| "learning_rate": 1.9594594594594595e-06, |
| "loss": 0.8575, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.16050495942290352, |
| "grad_norm": 1.961958906755019, |
| "learning_rate": 1.9819819819819822e-06, |
| "loss": 0.8305, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.1623083859332732, |
| "grad_norm": 2.082591039262878, |
| "learning_rate": 2.0045045045045045e-06, |
| "loss": 0.8032, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.16411181244364292, |
| "grad_norm": 1.8951491866286936, |
| "learning_rate": 2.0270270270270273e-06, |
| "loss": 0.9459, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.1659152389540126, |
| "grad_norm": 2.1428859812782344, |
| "learning_rate": 2.0495495495495496e-06, |
| "loss": 0.8839, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.16771866546438233, |
| "grad_norm": 2.2152288593264173, |
| "learning_rate": 2.0720720720720723e-06, |
| "loss": 0.8265, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.16952209197475204, |
| "grad_norm": 2.0213826178716254, |
| "learning_rate": 2.0945945945945946e-06, |
| "loss": 0.8098, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.17132551848512173, |
| "grad_norm": 2.0901306331246374, |
| "learning_rate": 2.1171171171171173e-06, |
| "loss": 0.8728, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.17312894499549145, |
| "grad_norm": 2.164683159815703, |
| "learning_rate": 2.13963963963964e-06, |
| "loss": 0.7473, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.17493237150586113, |
| "grad_norm": 2.128063710011363, |
| "learning_rate": 2.1621621621621623e-06, |
| "loss": 0.7631, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.17673579801623085, |
| "grad_norm": 2.2572109322776446, |
| "learning_rate": 2.1846846846846846e-06, |
| "loss": 0.933, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.17853922452660054, |
| "grad_norm": 2.1363963838074325, |
| "learning_rate": 2.2072072072072073e-06, |
| "loss": 0.7977, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.18034265103697025, |
| "grad_norm": 2.146510752101339, |
| "learning_rate": 2.22972972972973e-06, |
| "loss": 0.8328, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.18214607754733994, |
| "grad_norm": 2.128787213407692, |
| "learning_rate": 2.2522522522522524e-06, |
| "loss": 0.769, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.18394950405770966, |
| "grad_norm": 2.1474901480116384, |
| "learning_rate": 2.274774774774775e-06, |
| "loss": 0.8078, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.18575293056807934, |
| "grad_norm": 2.2077628269004306, |
| "learning_rate": 2.297297297297298e-06, |
| "loss": 0.869, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.18755635707844906, |
| "grad_norm": 2.1596373889839353, |
| "learning_rate": 2.31981981981982e-06, |
| "loss": 0.9066, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.18935978358881875, |
| "grad_norm": 2.227258779710617, |
| "learning_rate": 2.3423423423423424e-06, |
| "loss": 0.8736, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.19116321009918846, |
| "grad_norm": 2.0265448039731604, |
| "learning_rate": 2.364864864864865e-06, |
| "loss": 0.8269, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.19296663660955815, |
| "grad_norm": 2.090824078885953, |
| "learning_rate": 2.3873873873873874e-06, |
| "loss": 0.8771, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.19477006311992787, |
| "grad_norm": 1.9761919651706363, |
| "learning_rate": 2.40990990990991e-06, |
| "loss": 0.8337, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.19657348963029755, |
| "grad_norm": 2.1515967867262455, |
| "learning_rate": 2.432432432432433e-06, |
| "loss": 0.8635, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.19837691614066727, |
| "grad_norm": 2.0366179273737943, |
| "learning_rate": 2.454954954954955e-06, |
| "loss": 0.7951, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.20018034265103696, |
| "grad_norm": 2.2337095952708568, |
| "learning_rate": 2.4774774774774775e-06, |
| "loss": 0.8296, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.20198376916140667, |
| "grad_norm": 2.314845611994883, |
| "learning_rate": 2.5e-06, |
| "loss": 0.8512, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.2037871956717764, |
| "grad_norm": 2.1397074134865623, |
| "learning_rate": 2.5225225225225225e-06, |
| "loss": 0.7915, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.20559062218214608, |
| "grad_norm": 2.2454332127701644, |
| "learning_rate": 2.5450450450450452e-06, |
| "loss": 0.7976, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.2073940486925158, |
| "grad_norm": 2.184763023914372, |
| "learning_rate": 2.5675675675675675e-06, |
| "loss": 0.9853, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.20919747520288548, |
| "grad_norm": 2.1965096069781653, |
| "learning_rate": 2.5900900900900907e-06, |
| "loss": 0.8754, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.2110009017132552, |
| "grad_norm": 2.1197328540143405, |
| "learning_rate": 2.612612612612613e-06, |
| "loss": 0.873, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.21280432822362488, |
| "grad_norm": 2.1479255682477656, |
| "learning_rate": 2.6351351351351353e-06, |
| "loss": 0.811, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.2146077547339946, |
| "grad_norm": 1.907529342641289, |
| "learning_rate": 2.657657657657658e-06, |
| "loss": 0.8094, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.2164111812443643, |
| "grad_norm": 2.0692795464883162, |
| "learning_rate": 2.6801801801801803e-06, |
| "loss": 0.7645, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.218214607754734, |
| "grad_norm": 2.525383768634248, |
| "learning_rate": 2.702702702702703e-06, |
| "loss": 0.7909, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.2200180342651037, |
| "grad_norm": 2.193135508499143, |
| "learning_rate": 2.7252252252252253e-06, |
| "loss": 0.8109, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.2218214607754734, |
| "grad_norm": 2.2119521513341263, |
| "learning_rate": 2.747747747747748e-06, |
| "loss": 0.864, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.2236248872858431, |
| "grad_norm": 2.2411527155988966, |
| "learning_rate": 2.7702702702702703e-06, |
| "loss": 0.7952, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.2254283137962128, |
| "grad_norm": 2.0883350487153693, |
| "learning_rate": 2.7927927927927926e-06, |
| "loss": 1.0036, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.2272317403065825, |
| "grad_norm": 2.109829568194192, |
| "learning_rate": 2.8153153153153158e-06, |
| "loss": 0.7232, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.2290351668169522, |
| "grad_norm": 1.993689884083202, |
| "learning_rate": 2.837837837837838e-06, |
| "loss": 0.7923, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.2308385933273219, |
| "grad_norm": 1.8907292753085065, |
| "learning_rate": 2.860360360360361e-06, |
| "loss": 0.7996, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.23264201983769162, |
| "grad_norm": 2.0881747024356367, |
| "learning_rate": 2.882882882882883e-06, |
| "loss": 0.8358, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.2344454463480613, |
| "grad_norm": 2.2364232617934583, |
| "learning_rate": 2.9054054054054054e-06, |
| "loss": 0.855, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.23624887285843102, |
| "grad_norm": 2.1751800704208017, |
| "learning_rate": 2.927927927927928e-06, |
| "loss": 0.988, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.23805229936880073, |
| "grad_norm": 2.5339818982016244, |
| "learning_rate": 2.9504504504504504e-06, |
| "loss": 0.9803, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.23985572587917042, |
| "grad_norm": 2.1208235838666276, |
| "learning_rate": 2.9729729729729736e-06, |
| "loss": 0.8555, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.24165915238954014, |
| "grad_norm": 2.2141319147659404, |
| "learning_rate": 2.995495495495496e-06, |
| "loss": 0.8157, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.24346257889990983, |
| "grad_norm": 2.2533639584780403, |
| "learning_rate": 3.0180180180180186e-06, |
| "loss": 0.7266, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.24526600541027954, |
| "grad_norm": 1.9934582570878943, |
| "learning_rate": 3.040540540540541e-06, |
| "loss": 0.6843, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.24706943192064923, |
| "grad_norm": 2.3507505242464286, |
| "learning_rate": 3.063063063063063e-06, |
| "loss": 0.7358, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.24887285843101895, |
| "grad_norm": 2.333362017875557, |
| "learning_rate": 3.085585585585586e-06, |
| "loss": 0.7389, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.25067628494138866, |
| "grad_norm": 2.2614223566969707, |
| "learning_rate": 3.1081081081081082e-06, |
| "loss": 0.7867, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.25247971145175835, |
| "grad_norm": 1.9874595550084149, |
| "learning_rate": 3.130630630630631e-06, |
| "loss": 0.8633, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.25428313796212804, |
| "grad_norm": 2.078601890935306, |
| "learning_rate": 3.1531531531531532e-06, |
| "loss": 0.7232, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.2560865644724977, |
| "grad_norm": 2.059190081358862, |
| "learning_rate": 3.1756756756756755e-06, |
| "loss": 0.716, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.25788999098286747, |
| "grad_norm": 2.2762951186816474, |
| "learning_rate": 3.1981981981981987e-06, |
| "loss": 0.8357, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.25969341749323716, |
| "grad_norm": 2.2293473455945882, |
| "learning_rate": 3.220720720720721e-06, |
| "loss": 0.9155, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.26149684400360684, |
| "grad_norm": 2.138416887954227, |
| "learning_rate": 3.2432432432432437e-06, |
| "loss": 0.8325, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.26330027051397653, |
| "grad_norm": 2.124046946880288, |
| "learning_rate": 3.265765765765766e-06, |
| "loss": 0.8651, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.2651036970243463, |
| "grad_norm": 2.1097037863696015, |
| "learning_rate": 3.2882882882882887e-06, |
| "loss": 0.748, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.26690712353471596, |
| "grad_norm": 1.9680662328568495, |
| "learning_rate": 3.310810810810811e-06, |
| "loss": 0.7221, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.26871055004508565, |
| "grad_norm": 2.046832628017909, |
| "learning_rate": 3.3333333333333333e-06, |
| "loss": 0.76, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.27051397655545534, |
| "grad_norm": 2.318638863913297, |
| "learning_rate": 3.3558558558558565e-06, |
| "loss": 0.8807, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.2723174030658251, |
| "grad_norm": 2.012178712794308, |
| "learning_rate": 3.3783783783783788e-06, |
| "loss": 0.8326, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.27412082957619477, |
| "grad_norm": 2.066509449866673, |
| "learning_rate": 3.4009009009009015e-06, |
| "loss": 0.8075, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.27592425608656446, |
| "grad_norm": 2.1828393174475735, |
| "learning_rate": 3.423423423423424e-06, |
| "loss": 0.8072, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.2777276825969342, |
| "grad_norm": 2.3767082165962368, |
| "learning_rate": 3.445945945945946e-06, |
| "loss": 0.97, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.2795311091073039, |
| "grad_norm": 2.1245505290732853, |
| "learning_rate": 3.468468468468469e-06, |
| "loss": 0.8154, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.2813345356176736, |
| "grad_norm": 2.0827910518003523, |
| "learning_rate": 3.490990990990991e-06, |
| "loss": 0.7921, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.28313796212804326, |
| "grad_norm": 2.0364316538096863, |
| "learning_rate": 3.513513513513514e-06, |
| "loss": 0.9461, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.284941388638413, |
| "grad_norm": 2.3511021810109383, |
| "learning_rate": 3.536036036036036e-06, |
| "loss": 0.7942, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.2867448151487827, |
| "grad_norm": 1.9833910918431235, |
| "learning_rate": 3.5585585585585584e-06, |
| "loss": 0.8312, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.2885482416591524, |
| "grad_norm": 1.9342863774694277, |
| "learning_rate": 3.5810810810810816e-06, |
| "loss": 0.8015, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.29035166816952207, |
| "grad_norm": 2.033383216145857, |
| "learning_rate": 3.603603603603604e-06, |
| "loss": 0.7695, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.2921550946798918, |
| "grad_norm": 2.374348511862132, |
| "learning_rate": 3.6261261261261266e-06, |
| "loss": 0.762, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.2939585211902615, |
| "grad_norm": 2.114360094597133, |
| "learning_rate": 3.648648648648649e-06, |
| "loss": 0.8444, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.2957619477006312, |
| "grad_norm": 1.9931929796238907, |
| "learning_rate": 3.6711711711711716e-06, |
| "loss": 0.7882, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.2975653742110009, |
| "grad_norm": 2.0730938718533145, |
| "learning_rate": 3.693693693693694e-06, |
| "loss": 0.7745, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.2993688007213706, |
| "grad_norm": 1.8554364231513298, |
| "learning_rate": 3.7162162162162162e-06, |
| "loss": 0.7253, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.3011722272317403, |
| "grad_norm": 2.149623516434781, |
| "learning_rate": 3.7387387387387394e-06, |
| "loss": 0.8954, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.30297565374211, |
| "grad_norm": 2.4856316208076503, |
| "learning_rate": 3.7612612612612612e-06, |
| "loss": 0.8402, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.3047790802524797, |
| "grad_norm": 2.1406112105466035, |
| "learning_rate": 3.7837837837837844e-06, |
| "loss": 0.8636, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.3065825067628494, |
| "grad_norm": 2.2289790923203205, |
| "learning_rate": 3.8063063063063067e-06, |
| "loss": 0.7428, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.3083859332732191, |
| "grad_norm": 2.004209812667466, |
| "learning_rate": 3.828828828828829e-06, |
| "loss": 0.7797, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.3101893597835888, |
| "grad_norm": 2.006314497006802, |
| "learning_rate": 3.851351351351352e-06, |
| "loss": 0.7593, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.31199278629395855, |
| "grad_norm": 2.282382563008822, |
| "learning_rate": 3.8738738738738744e-06, |
| "loss": 0.9733, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.31379621280432823, |
| "grad_norm": 2.0355833568890946, |
| "learning_rate": 3.896396396396397e-06, |
| "loss": 0.8561, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.3155996393146979, |
| "grad_norm": 2.259718701083019, |
| "learning_rate": 3.918918918918919e-06, |
| "loss": 0.797, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.3174030658250676, |
| "grad_norm": 2.1729996844233455, |
| "learning_rate": 3.941441441441442e-06, |
| "loss": 0.7527, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.31920649233543735, |
| "grad_norm": 2.60117835410255, |
| "learning_rate": 3.9639639639639645e-06, |
| "loss": 1.0225, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.32100991884580704, |
| "grad_norm": 2.2528379596704604, |
| "learning_rate": 3.986486486486487e-06, |
| "loss": 0.7965, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.32281334535617673, |
| "grad_norm": 2.3132904967648082, |
| "learning_rate": 4.009009009009009e-06, |
| "loss": 0.8112, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.3246167718665464, |
| "grad_norm": 2.5263030575643564, |
| "learning_rate": 4.031531531531531e-06, |
| "loss": 0.8432, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.32642019837691616, |
| "grad_norm": 2.2940008917196817, |
| "learning_rate": 4.0540540540540545e-06, |
| "loss": 0.7679, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.32822362488728585, |
| "grad_norm": 2.1976286649954355, |
| "learning_rate": 4.076576576576577e-06, |
| "loss": 0.853, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.33002705139765554, |
| "grad_norm": 2.287412594205084, |
| "learning_rate": 4.099099099099099e-06, |
| "loss": 0.8528, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.3318304779080252, |
| "grad_norm": 2.265413975048022, |
| "learning_rate": 4.121621621621622e-06, |
| "loss": 0.8891, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.33363390441839497, |
| "grad_norm": 2.1347188948409626, |
| "learning_rate": 4.1441441441441446e-06, |
| "loss": 0.7172, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.33543733092876465, |
| "grad_norm": 1.9036460590607482, |
| "learning_rate": 4.166666666666667e-06, |
| "loss": 0.8139, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.33724075743913434, |
| "grad_norm": 2.1982101741220723, |
| "learning_rate": 4.189189189189189e-06, |
| "loss": 0.7872, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.3390441839495041, |
| "grad_norm": 1.9974084871264948, |
| "learning_rate": 4.2117117117117115e-06, |
| "loss": 0.7211, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.3408476104598738, |
| "grad_norm": 2.343692275926038, |
| "learning_rate": 4.234234234234235e-06, |
| "loss": 0.8724, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.34265103697024346, |
| "grad_norm": 2.4031986369572103, |
| "learning_rate": 4.256756756756757e-06, |
| "loss": 0.8742, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.34445446348061315, |
| "grad_norm": 2.077375945110708, |
| "learning_rate": 4.27927927927928e-06, |
| "loss": 0.7802, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.3462578899909829, |
| "grad_norm": 2.027445526646734, |
| "learning_rate": 4.301801801801802e-06, |
| "loss": 0.8748, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.3480613165013526, |
| "grad_norm": 2.4821342969963522, |
| "learning_rate": 4.324324324324325e-06, |
| "loss": 0.7775, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.34986474301172227, |
| "grad_norm": 2.356163875058872, |
| "learning_rate": 4.346846846846847e-06, |
| "loss": 0.7257, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.35166816952209196, |
| "grad_norm": 2.295664379900927, |
| "learning_rate": 4.369369369369369e-06, |
| "loss": 0.7341, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.3534715960324617, |
| "grad_norm": 2.3623157091199882, |
| "learning_rate": 4.391891891891892e-06, |
| "loss": 0.8198, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.3552750225428314, |
| "grad_norm": 2.186111001087259, |
| "learning_rate": 4.414414414414415e-06, |
| "loss": 0.8505, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.3570784490532011, |
| "grad_norm": 2.2466120322229504, |
| "learning_rate": 4.436936936936938e-06, |
| "loss": 0.8352, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.35888187556357076, |
| "grad_norm": 2.2300706402504837, |
| "learning_rate": 4.45945945945946e-06, |
| "loss": 0.9199, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.3606853020739405, |
| "grad_norm": 2.175470606319692, |
| "learning_rate": 4.4819819819819824e-06, |
| "loss": 0.6704, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.3624887285843102, |
| "grad_norm": 2.0680624661556397, |
| "learning_rate": 4.504504504504505e-06, |
| "loss": 0.8349, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.3642921550946799, |
| "grad_norm": 2.1728800893786895, |
| "learning_rate": 4.527027027027027e-06, |
| "loss": 0.8035, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.36609558160504957, |
| "grad_norm": 2.360603677367448, |
| "learning_rate": 4.54954954954955e-06, |
| "loss": 0.7997, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.3678990081154193, |
| "grad_norm": 2.0528022407082425, |
| "learning_rate": 4.5720720720720725e-06, |
| "loss": 0.7377, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.369702434625789, |
| "grad_norm": 2.145107444388918, |
| "learning_rate": 4.594594594594596e-06, |
| "loss": 0.7246, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.3715058611361587, |
| "grad_norm": 2.142567114305303, |
| "learning_rate": 4.617117117117118e-06, |
| "loss": 0.767, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.37330928764652843, |
| "grad_norm": 2.250353037529415, |
| "learning_rate": 4.63963963963964e-06, |
| "loss": 0.744, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.3751127141568981, |
| "grad_norm": 2.4107500279982577, |
| "learning_rate": 4.6621621621621625e-06, |
| "loss": 0.9702, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.3769161406672678, |
| "grad_norm": 1.83721607411279, |
| "learning_rate": 4.684684684684685e-06, |
| "loss": 0.7841, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.3787195671776375, |
| "grad_norm": 2.187844750445605, |
| "learning_rate": 4.707207207207208e-06, |
| "loss": 0.7828, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.38052299368800724, |
| "grad_norm": 2.6155119945345913, |
| "learning_rate": 4.72972972972973e-06, |
| "loss": 0.7754, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.3823264201983769, |
| "grad_norm": 2.087136361991544, |
| "learning_rate": 4.7522522522522526e-06, |
| "loss": 0.7961, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.3841298467087466, |
| "grad_norm": 2.052469543045352, |
| "learning_rate": 4.774774774774775e-06, |
| "loss": 0.74, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.3859332732191163, |
| "grad_norm": 2.326611456516733, |
| "learning_rate": 4.797297297297297e-06, |
| "loss": 0.8593, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.38773669972948605, |
| "grad_norm": 2.161826276327704, |
| "learning_rate": 4.81981981981982e-06, |
| "loss": 0.8495, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.38954012623985573, |
| "grad_norm": 2.2046675857827416, |
| "learning_rate": 4.842342342342343e-06, |
| "loss": 0.7909, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.3913435527502254, |
| "grad_norm": 1.9987211123805613, |
| "learning_rate": 4.864864864864866e-06, |
| "loss": 0.7073, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.3931469792605951, |
| "grad_norm": 2.060842447295872, |
| "learning_rate": 4.887387387387388e-06, |
| "loss": 0.7356, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.39495040577096485, |
| "grad_norm": 2.167011002864499, |
| "learning_rate": 4.90990990990991e-06, |
| "loss": 0.9376, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.39675383228133454, |
| "grad_norm": 2.120387815326822, |
| "learning_rate": 4.932432432432433e-06, |
| "loss": 0.923, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.3985572587917042, |
| "grad_norm": 2.0360713757641675, |
| "learning_rate": 4.954954954954955e-06, |
| "loss": 0.7894, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.4003606853020739, |
| "grad_norm": 1.9555130956875506, |
| "learning_rate": 4.977477477477478e-06, |
| "loss": 0.7949, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.40216411181244366, |
| "grad_norm": 2.1707142856979553, |
| "learning_rate": 5e-06, |
| "loss": 0.8106, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.40396753832281335, |
| "grad_norm": 2.0454935807171566, |
| "learning_rate": 5.022522522522523e-06, |
| "loss": 0.7753, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.40577096483318303, |
| "grad_norm": 1.9818753837111296, |
| "learning_rate": 5.045045045045045e-06, |
| "loss": 0.7425, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.4075743913435528, |
| "grad_norm": 2.089730216802721, |
| "learning_rate": 5.067567567567568e-06, |
| "loss": 0.7953, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.40937781785392247, |
| "grad_norm": 2.1221432507716593, |
| "learning_rate": 5.0900900900900905e-06, |
| "loss": 0.7228, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.41118124436429215, |
| "grad_norm": 2.302405327887679, |
| "learning_rate": 5.112612612612613e-06, |
| "loss": 0.8949, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.41298467087466184, |
| "grad_norm": 2.1247546433849203, |
| "learning_rate": 5.135135135135135e-06, |
| "loss": 0.7007, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.4147880973850316, |
| "grad_norm": 2.043647163631905, |
| "learning_rate": 5.157657657657657e-06, |
| "loss": 0.7338, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.4165915238954013, |
| "grad_norm": 2.270245087265323, |
| "learning_rate": 5.180180180180181e-06, |
| "loss": 0.8341, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.41839495040577096, |
| "grad_norm": 2.183816726602172, |
| "learning_rate": 5.202702702702704e-06, |
| "loss": 0.8531, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.42019837691614065, |
| "grad_norm": 2.1547309877335903, |
| "learning_rate": 5.225225225225226e-06, |
| "loss": 0.7347, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.4220018034265104, |
| "grad_norm": 2.206709828896525, |
| "learning_rate": 5.247747747747748e-06, |
| "loss": 0.8665, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.4238052299368801, |
| "grad_norm": 2.0559406646994987, |
| "learning_rate": 5.2702702702702705e-06, |
| "loss": 0.6986, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.42560865644724977, |
| "grad_norm": 2.018292507433674, |
| "learning_rate": 5.292792792792794e-06, |
| "loss": 0.6952, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.42741208295761945, |
| "grad_norm": 2.0090659177061805, |
| "learning_rate": 5.315315315315316e-06, |
| "loss": 0.8291, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.4292155094679892, |
| "grad_norm": 2.301997115792778, |
| "learning_rate": 5.337837837837838e-06, |
| "loss": 0.7145, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.4310189359783589, |
| "grad_norm": 2.067738383833435, |
| "learning_rate": 5.360360360360361e-06, |
| "loss": 0.7995, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.4328223624887286, |
| "grad_norm": 2.2635903833648245, |
| "learning_rate": 5.382882882882884e-06, |
| "loss": 0.7224, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.43462578899909826, |
| "grad_norm": 2.275286557637392, |
| "learning_rate": 5.405405405405406e-06, |
| "loss": 0.8183, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.436429215509468, |
| "grad_norm": 2.1179111948903513, |
| "learning_rate": 5.427927927927928e-06, |
| "loss": 0.772, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.4382326420198377, |
| "grad_norm": 2.164539734222491, |
| "learning_rate": 5.450450450450451e-06, |
| "loss": 0.7319, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.4400360685302074, |
| "grad_norm": 2.1650273688319515, |
| "learning_rate": 5.472972972972973e-06, |
| "loss": 0.8215, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.4418394950405771, |
| "grad_norm": 2.075382076411821, |
| "learning_rate": 5.495495495495496e-06, |
| "loss": 0.7398, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.4436429215509468, |
| "grad_norm": 1.8310224669393116, |
| "learning_rate": 5.518018018018018e-06, |
| "loss": 0.7462, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.4454463480613165, |
| "grad_norm": 2.1020377635825955, |
| "learning_rate": 5.540540540540541e-06, |
| "loss": 0.7952, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.4472497745716862, |
| "grad_norm": 2.20596373228597, |
| "learning_rate": 5.563063063063063e-06, |
| "loss": 0.7155, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.44905320108205593, |
| "grad_norm": 2.157295752855383, |
| "learning_rate": 5.585585585585585e-06, |
| "loss": 0.7448, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.4508566275924256, |
| "grad_norm": 2.0756794403814767, |
| "learning_rate": 5.608108108108109e-06, |
| "loss": 0.9216, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.4526600541027953, |
| "grad_norm": 2.3192109805255123, |
| "learning_rate": 5.6306306306306316e-06, |
| "loss": 0.7884, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.454463480613165, |
| "grad_norm": 2.0220751392261467, |
| "learning_rate": 5.653153153153154e-06, |
| "loss": 0.7508, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.45626690712353474, |
| "grad_norm": 2.13660207924998, |
| "learning_rate": 5.675675675675676e-06, |
| "loss": 0.7559, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.4580703336339044, |
| "grad_norm": 2.114868597507177, |
| "learning_rate": 5.6981981981981985e-06, |
| "loss": 0.8145, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.4598737601442741, |
| "grad_norm": 2.1123935906108313, |
| "learning_rate": 5.720720720720722e-06, |
| "loss": 0.8049, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.4616771866546438, |
| "grad_norm": 2.4676890144062957, |
| "learning_rate": 5.743243243243244e-06, |
| "loss": 0.7957, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.46348061316501354, |
| "grad_norm": 2.168073489314107, |
| "learning_rate": 5.765765765765766e-06, |
| "loss": 0.7666, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.46528403967538323, |
| "grad_norm": 2.072398238803128, |
| "learning_rate": 5.7882882882882885e-06, |
| "loss": 0.8192, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.4670874661857529, |
| "grad_norm": 2.563194398373102, |
| "learning_rate": 5.810810810810811e-06, |
| "loss": 0.8409, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.4688908926961226, |
| "grad_norm": 2.304094514669754, |
| "learning_rate": 5.833333333333334e-06, |
| "loss": 0.8541, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.47069431920649235, |
| "grad_norm": 2.198147090270184, |
| "learning_rate": 5.855855855855856e-06, |
| "loss": 0.6985, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.47249774571686204, |
| "grad_norm": 2.1753059174557112, |
| "learning_rate": 5.8783783783783786e-06, |
| "loss": 0.8182, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.4743011722272317, |
| "grad_norm": 2.2589974392486356, |
| "learning_rate": 5.900900900900901e-06, |
| "loss": 0.755, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.47610459873760147, |
| "grad_norm": 2.1007335416554507, |
| "learning_rate": 5.923423423423423e-06, |
| "loss": 0.748, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.47790802524797116, |
| "grad_norm": 1.9793246514216147, |
| "learning_rate": 5.945945945945947e-06, |
| "loss": 0.6935, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.47971145175834085, |
| "grad_norm": 2.310796934006057, |
| "learning_rate": 5.9684684684684694e-06, |
| "loss": 0.7839, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.48151487826871053, |
| "grad_norm": 2.1733242596190374, |
| "learning_rate": 5.990990990990992e-06, |
| "loss": 0.7748, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.4833183047790803, |
| "grad_norm": 2.134235761560956, |
| "learning_rate": 6.013513513513514e-06, |
| "loss": 0.7545, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.48512173128944996, |
| "grad_norm": 2.279296657289438, |
| "learning_rate": 6.036036036036037e-06, |
| "loss": 0.7399, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.48692515779981965, |
| "grad_norm": 2.2785713614109566, |
| "learning_rate": 6.0585585585585595e-06, |
| "loss": 0.9031, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.48872858431018934, |
| "grad_norm": 2.17376130509456, |
| "learning_rate": 6.081081081081082e-06, |
| "loss": 0.8876, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.4905320108205591, |
| "grad_norm": 2.2837346570989694, |
| "learning_rate": 6.103603603603604e-06, |
| "loss": 0.852, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.49233543733092877, |
| "grad_norm": 2.308367261822732, |
| "learning_rate": 6.126126126126126e-06, |
| "loss": 0.7471, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.49413886384129846, |
| "grad_norm": 2.353472342894518, |
| "learning_rate": 6.1486486486486495e-06, |
| "loss": 0.865, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.49594229035166815, |
| "grad_norm": 2.2188631595778077, |
| "learning_rate": 6.171171171171172e-06, |
| "loss": 0.8253, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.4977457168620379, |
| "grad_norm": 2.4928969764456212, |
| "learning_rate": 6.193693693693694e-06, |
| "loss": 0.9809, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.4995491433724076, |
| "grad_norm": 2.429996582097567, |
| "learning_rate": 6.2162162162162164e-06, |
| "loss": 0.9529, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.5013525698827773, |
| "grad_norm": 2.156174833500389, |
| "learning_rate": 6.238738738738739e-06, |
| "loss": 0.7549, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.503155996393147, |
| "grad_norm": 2.1145480790559916, |
| "learning_rate": 6.261261261261262e-06, |
| "loss": 0.7325, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.5049594229035167, |
| "grad_norm": 2.3827039996906887, |
| "learning_rate": 6.283783783783784e-06, |
| "loss": 0.8234, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.5067628494138864, |
| "grad_norm": 2.2520674713452635, |
| "learning_rate": 6.3063063063063065e-06, |
| "loss": 0.805, |
| "step": 281 |
| }, |
| { |
| "epoch": 0.5085662759242561, |
| "grad_norm": 2.2751328483189344, |
| "learning_rate": 6.328828828828829e-06, |
| "loss": 0.7916, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.5103697024346258, |
| "grad_norm": 2.105893153039127, |
| "learning_rate": 6.351351351351351e-06, |
| "loss": 0.7339, |
| "step": 283 |
| }, |
| { |
| "epoch": 0.5121731289449954, |
| "grad_norm": 2.3088480635629853, |
| "learning_rate": 6.373873873873875e-06, |
| "loss": 0.7908, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.5139765554553652, |
| "grad_norm": 2.2019643640954567, |
| "learning_rate": 6.396396396396397e-06, |
| "loss": 0.8165, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.5157799819657349, |
| "grad_norm": 2.2224375489982195, |
| "learning_rate": 6.41891891891892e-06, |
| "loss": 0.7884, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.5175834084761046, |
| "grad_norm": 2.2123927819948257, |
| "learning_rate": 6.441441441441442e-06, |
| "loss": 0.8261, |
| "step": 287 |
| }, |
| { |
| "epoch": 0.5193868349864743, |
| "grad_norm": 2.2449799654503093, |
| "learning_rate": 6.463963963963964e-06, |
| "loss": 0.8287, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.521190261496844, |
| "grad_norm": 2.244952248633715, |
| "learning_rate": 6.486486486486487e-06, |
| "loss": 0.823, |
| "step": 289 |
| }, |
| { |
| "epoch": 0.5229936880072137, |
| "grad_norm": 2.146102311067904, |
| "learning_rate": 6.50900900900901e-06, |
| "loss": 0.8201, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.5247971145175834, |
| "grad_norm": 2.2107121939036642, |
| "learning_rate": 6.531531531531532e-06, |
| "loss": 0.7452, |
| "step": 291 |
| }, |
| { |
| "epoch": 0.5266005410279531, |
| "grad_norm": 2.274570701724603, |
| "learning_rate": 6.554054054054054e-06, |
| "loss": 0.7995, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.5284039675383229, |
| "grad_norm": 2.3901970457801323, |
| "learning_rate": 6.5765765765765775e-06, |
| "loss": 0.8293, |
| "step": 293 |
| }, |
| { |
| "epoch": 0.5302073940486925, |
| "grad_norm": 2.2046674887614617, |
| "learning_rate": 6.5990990990991e-06, |
| "loss": 0.7711, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.5320108205590622, |
| "grad_norm": 2.181130141644271, |
| "learning_rate": 6.621621621621622e-06, |
| "loss": 0.7467, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.5338142470694319, |
| "grad_norm": 2.1545307052885434, |
| "learning_rate": 6.644144144144144e-06, |
| "loss": 0.7591, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.5356176735798016, |
| "grad_norm": 2.000955616731471, |
| "learning_rate": 6.666666666666667e-06, |
| "loss": 0.7405, |
| "step": 297 |
| }, |
| { |
| "epoch": 0.5374211000901713, |
| "grad_norm": 2.270342762369627, |
| "learning_rate": 6.689189189189191e-06, |
| "loss": 0.8734, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.539224526600541, |
| "grad_norm": 1.9914609602909024, |
| "learning_rate": 6.711711711711713e-06, |
| "loss": 0.9029, |
| "step": 299 |
| }, |
| { |
| "epoch": 0.5410279531109107, |
| "grad_norm": 2.1862011960901238, |
| "learning_rate": 6.734234234234235e-06, |
| "loss": 0.7847, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.5428313796212805, |
| "grad_norm": 2.162455670849857, |
| "learning_rate": 6.7567567567567575e-06, |
| "loss": 0.796, |
| "step": 301 |
| }, |
| { |
| "epoch": 0.5446348061316502, |
| "grad_norm": 2.190782292923182, |
| "learning_rate": 6.77927927927928e-06, |
| "loss": 0.8361, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.5464382326420198, |
| "grad_norm": 2.343114673195786, |
| "learning_rate": 6.801801801801803e-06, |
| "loss": 0.9578, |
| "step": 303 |
| }, |
| { |
| "epoch": 0.5482416591523895, |
| "grad_norm": 2.137122549596483, |
| "learning_rate": 6.824324324324325e-06, |
| "loss": 0.8094, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.5500450856627592, |
| "grad_norm": 2.106947969785909, |
| "learning_rate": 6.846846846846848e-06, |
| "loss": 0.7836, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.5518485121731289, |
| "grad_norm": 2.146424998051745, |
| "learning_rate": 6.86936936936937e-06, |
| "loss": 0.7997, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.5536519386834986, |
| "grad_norm": 1.8042726808487144, |
| "learning_rate": 6.891891891891892e-06, |
| "loss": 0.7629, |
| "step": 307 |
| }, |
| { |
| "epoch": 0.5554553651938684, |
| "grad_norm": 2.142256130584483, |
| "learning_rate": 6.914414414414415e-06, |
| "loss": 0.8131, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.5572587917042381, |
| "grad_norm": 2.0191516225293116, |
| "learning_rate": 6.936936936936938e-06, |
| "loss": 0.8088, |
| "step": 309 |
| }, |
| { |
| "epoch": 0.5590622182146078, |
| "grad_norm": 2.2871792098661015, |
| "learning_rate": 6.95945945945946e-06, |
| "loss": 0.7785, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.5608656447249775, |
| "grad_norm": 2.2023691318993905, |
| "learning_rate": 6.981981981981982e-06, |
| "loss": 0.828, |
| "step": 311 |
| }, |
| { |
| "epoch": 0.5626690712353472, |
| "grad_norm": 2.234138281725447, |
| "learning_rate": 7.0045045045045045e-06, |
| "loss": 0.7213, |
| "step": 312 |
| }, |
| { |
| "epoch": 0.5644724977457168, |
| "grad_norm": 2.2818031085235932, |
| "learning_rate": 7.027027027027028e-06, |
| "loss": 0.6795, |
| "step": 313 |
| }, |
| { |
| "epoch": 0.5662759242560865, |
| "grad_norm": 2.0842673839335846, |
| "learning_rate": 7.04954954954955e-06, |
| "loss": 0.8452, |
| "step": 314 |
| }, |
| { |
| "epoch": 0.5680793507664562, |
| "grad_norm": 2.002642371369536, |
| "learning_rate": 7.072072072072072e-06, |
| "loss": 0.89, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.569882777276826, |
| "grad_norm": 2.083265325972135, |
| "learning_rate": 7.0945945945945946e-06, |
| "loss": 0.7551, |
| "step": 316 |
| }, |
| { |
| "epoch": 0.5716862037871957, |
| "grad_norm": 2.0768528412350586, |
| "learning_rate": 7.117117117117117e-06, |
| "loss": 0.7289, |
| "step": 317 |
| }, |
| { |
| "epoch": 0.5734896302975654, |
| "grad_norm": 1.9695169006653106, |
| "learning_rate": 7.139639639639641e-06, |
| "loss": 0.7184, |
| "step": 318 |
| }, |
| { |
| "epoch": 0.5752930568079351, |
| "grad_norm": 2.084828562576803, |
| "learning_rate": 7.162162162162163e-06, |
| "loss": 0.7489, |
| "step": 319 |
| }, |
| { |
| "epoch": 0.5770964833183048, |
| "grad_norm": 2.209350364597537, |
| "learning_rate": 7.1846846846846855e-06, |
| "loss": 0.7564, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.5788999098286745, |
| "grad_norm": 2.2105476735413054, |
| "learning_rate": 7.207207207207208e-06, |
| "loss": 0.84, |
| "step": 321 |
| }, |
| { |
| "epoch": 0.5807033363390441, |
| "grad_norm": 2.5857780263589616, |
| "learning_rate": 7.229729729729731e-06, |
| "loss": 0.8624, |
| "step": 322 |
| }, |
| { |
| "epoch": 0.5825067628494139, |
| "grad_norm": 2.19851358126889, |
| "learning_rate": 7.252252252252253e-06, |
| "loss": 0.6745, |
| "step": 323 |
| }, |
| { |
| "epoch": 0.5843101893597836, |
| "grad_norm": 2.144886414168463, |
| "learning_rate": 7.2747747747747755e-06, |
| "loss": 0.8314, |
| "step": 324 |
| }, |
| { |
| "epoch": 0.5861136158701533, |
| "grad_norm": 2.11225650931259, |
| "learning_rate": 7.297297297297298e-06, |
| "loss": 0.9043, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.587917042380523, |
| "grad_norm": 2.301105964642942, |
| "learning_rate": 7.31981981981982e-06, |
| "loss": 0.7197, |
| "step": 326 |
| }, |
| { |
| "epoch": 0.5897204688908927, |
| "grad_norm": 2.3975525696500806, |
| "learning_rate": 7.342342342342343e-06, |
| "loss": 0.9116, |
| "step": 327 |
| }, |
| { |
| "epoch": 0.5915238954012624, |
| "grad_norm": 2.1452666571693255, |
| "learning_rate": 7.3648648648648655e-06, |
| "loss": 0.7906, |
| "step": 328 |
| }, |
| { |
| "epoch": 0.5933273219116321, |
| "grad_norm": 2.4409882639138134, |
| "learning_rate": 7.387387387387388e-06, |
| "loss": 0.8716, |
| "step": 329 |
| }, |
| { |
| "epoch": 0.5951307484220018, |
| "grad_norm": 2.171908727845542, |
| "learning_rate": 7.40990990990991e-06, |
| "loss": 0.7304, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.5969341749323716, |
| "grad_norm": 2.247976351955023, |
| "learning_rate": 7.4324324324324324e-06, |
| "loss": 0.7739, |
| "step": 331 |
| }, |
| { |
| "epoch": 0.5987376014427412, |
| "grad_norm": 2.238977362911319, |
| "learning_rate": 7.4549549549549564e-06, |
| "loss": 0.6977, |
| "step": 332 |
| }, |
| { |
| "epoch": 0.6005410279531109, |
| "grad_norm": 1.9416451363003897, |
| "learning_rate": 7.477477477477479e-06, |
| "loss": 0.7653, |
| "step": 333 |
| }, |
| { |
| "epoch": 0.6023444544634806, |
| "grad_norm": 2.184729395722401, |
| "learning_rate": 7.500000000000001e-06, |
| "loss": 0.7434, |
| "step": 334 |
| }, |
| { |
| "epoch": 0.6041478809738503, |
| "grad_norm": 2.1317078889032173, |
| "learning_rate": 7.5225225225225225e-06, |
| "loss": 0.7047, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.60595130748422, |
| "grad_norm": 1.9907910766284589, |
| "learning_rate": 7.545045045045045e-06, |
| "loss": 0.7499, |
| "step": 336 |
| }, |
| { |
| "epoch": 0.6077547339945897, |
| "grad_norm": 2.1264240696103487, |
| "learning_rate": 7.567567567567569e-06, |
| "loss": 0.8601, |
| "step": 337 |
| }, |
| { |
| "epoch": 0.6095581605049594, |
| "grad_norm": 2.2046257254141035, |
| "learning_rate": 7.590090090090091e-06, |
| "loss": 0.8269, |
| "step": 338 |
| }, |
| { |
| "epoch": 0.6113615870153292, |
| "grad_norm": 2.1215302945356695, |
| "learning_rate": 7.612612612612613e-06, |
| "loss": 0.7938, |
| "step": 339 |
| }, |
| { |
| "epoch": 0.6131650135256989, |
| "grad_norm": 2.229799150273438, |
| "learning_rate": 7.635135135135135e-06, |
| "loss": 0.7993, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.6149684400360685, |
| "grad_norm": 2.129796061875063, |
| "learning_rate": 7.657657657657658e-06, |
| "loss": 0.8045, |
| "step": 341 |
| }, |
| { |
| "epoch": 0.6167718665464382, |
| "grad_norm": 2.0716978158297685, |
| "learning_rate": 7.680180180180181e-06, |
| "loss": 0.7925, |
| "step": 342 |
| }, |
| { |
| "epoch": 0.6185752930568079, |
| "grad_norm": 2.130760036578581, |
| "learning_rate": 7.702702702702704e-06, |
| "loss": 0.8987, |
| "step": 343 |
| }, |
| { |
| "epoch": 0.6203787195671776, |
| "grad_norm": 1.9658518773305242, |
| "learning_rate": 7.725225225225226e-06, |
| "loss": 0.7385, |
| "step": 344 |
| }, |
| { |
| "epoch": 0.6221821460775473, |
| "grad_norm": 2.2057377247232557, |
| "learning_rate": 7.747747747747749e-06, |
| "loss": 0.8462, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.6239855725879171, |
| "grad_norm": 1.998255103995078, |
| "learning_rate": 7.77027027027027e-06, |
| "loss": 0.726, |
| "step": 346 |
| }, |
| { |
| "epoch": 0.6257889990982868, |
| "grad_norm": 2.1024774999508384, |
| "learning_rate": 7.792792792792793e-06, |
| "loss": 0.7351, |
| "step": 347 |
| }, |
| { |
| "epoch": 0.6275924256086565, |
| "grad_norm": 2.045648302941062, |
| "learning_rate": 7.815315315315317e-06, |
| "loss": 0.7605, |
| "step": 348 |
| }, |
| { |
| "epoch": 0.6293958521190262, |
| "grad_norm": 2.2083257683921373, |
| "learning_rate": 7.837837837837838e-06, |
| "loss": 0.7318, |
| "step": 349 |
| }, |
| { |
| "epoch": 0.6311992786293958, |
| "grad_norm": 2.187495516104006, |
| "learning_rate": 7.860360360360361e-06, |
| "loss": 0.8159, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.6330027051397655, |
| "grad_norm": 2.0804434740408007, |
| "learning_rate": 7.882882882882884e-06, |
| "loss": 0.7396, |
| "step": 351 |
| }, |
| { |
| "epoch": 0.6348061316501352, |
| "grad_norm": 2.037209239741434, |
| "learning_rate": 7.905405405405406e-06, |
| "loss": 0.7953, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.6366095581605049, |
| "grad_norm": 2.0556472196259055, |
| "learning_rate": 7.927927927927929e-06, |
| "loss": 0.6783, |
| "step": 353 |
| }, |
| { |
| "epoch": 0.6384129846708747, |
| "grad_norm": 2.0562270484236898, |
| "learning_rate": 7.95045045045045e-06, |
| "loss": 0.7568, |
| "step": 354 |
| }, |
| { |
| "epoch": 0.6402164111812444, |
| "grad_norm": 1.9938898178619702, |
| "learning_rate": 7.972972972972974e-06, |
| "loss": 0.8204, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.6420198376916141, |
| "grad_norm": 2.072139406380031, |
| "learning_rate": 7.995495495495497e-06, |
| "loss": 0.8035, |
| "step": 356 |
| }, |
| { |
| "epoch": 0.6438232642019838, |
| "grad_norm": 2.0697820853812674, |
| "learning_rate": 8.018018018018018e-06, |
| "loss": 0.7801, |
| "step": 357 |
| }, |
| { |
| "epoch": 0.6456266907123535, |
| "grad_norm": 2.0412202940720623, |
| "learning_rate": 8.040540540540541e-06, |
| "loss": 0.7014, |
| "step": 358 |
| }, |
| { |
| "epoch": 0.6474301172227231, |
| "grad_norm": 1.9405663633560892, |
| "learning_rate": 8.063063063063063e-06, |
| "loss": 0.6928, |
| "step": 359 |
| }, |
| { |
| "epoch": 0.6492335437330928, |
| "grad_norm": 2.1239135328030234, |
| "learning_rate": 8.085585585585586e-06, |
| "loss": 0.887, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.6510369702434626, |
| "grad_norm": 2.436623349264573, |
| "learning_rate": 8.108108108108109e-06, |
| "loss": 0.8074, |
| "step": 361 |
| }, |
| { |
| "epoch": 0.6528403967538323, |
| "grad_norm": 2.0833195573627195, |
| "learning_rate": 8.130630630630632e-06, |
| "loss": 0.7037, |
| "step": 362 |
| }, |
| { |
| "epoch": 0.654643823264202, |
| "grad_norm": 1.9625571637730252, |
| "learning_rate": 8.153153153153154e-06, |
| "loss": 0.7199, |
| "step": 363 |
| }, |
| { |
| "epoch": 0.6564472497745717, |
| "grad_norm": 1.980886318284568, |
| "learning_rate": 8.175675675675677e-06, |
| "loss": 0.749, |
| "step": 364 |
| }, |
| { |
| "epoch": 0.6582506762849414, |
| "grad_norm": 2.3686023326598593, |
| "learning_rate": 8.198198198198198e-06, |
| "loss": 1.0184, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.6600541027953111, |
| "grad_norm": 2.2671248043012264, |
| "learning_rate": 8.220720720720721e-06, |
| "loss": 0.7957, |
| "step": 366 |
| }, |
| { |
| "epoch": 0.6618575293056808, |
| "grad_norm": 2.2527130831079027, |
| "learning_rate": 8.243243243243245e-06, |
| "loss": 0.8575, |
| "step": 367 |
| }, |
| { |
| "epoch": 0.6636609558160504, |
| "grad_norm": 2.5533424372075446, |
| "learning_rate": 8.265765765765766e-06, |
| "loss": 0.7706, |
| "step": 368 |
| }, |
| { |
| "epoch": 0.6654643823264202, |
| "grad_norm": 2.467984541574478, |
| "learning_rate": 8.288288288288289e-06, |
| "loss": 0.7647, |
| "step": 369 |
| }, |
| { |
| "epoch": 0.6672678088367899, |
| "grad_norm": 2.183449489939818, |
| "learning_rate": 8.31081081081081e-06, |
| "loss": 0.8173, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.6690712353471596, |
| "grad_norm": 2.2867069876523582, |
| "learning_rate": 8.333333333333334e-06, |
| "loss": 0.8269, |
| "step": 371 |
| }, |
| { |
| "epoch": 0.6708746618575293, |
| "grad_norm": 2.2573156909093957, |
| "learning_rate": 8.355855855855857e-06, |
| "loss": 0.7816, |
| "step": 372 |
| }, |
| { |
| "epoch": 0.672678088367899, |
| "grad_norm": 1.9766214188381033, |
| "learning_rate": 8.378378378378378e-06, |
| "loss": 0.7515, |
| "step": 373 |
| }, |
| { |
| "epoch": 0.6744815148782687, |
| "grad_norm": 2.1750484801074057, |
| "learning_rate": 8.400900900900901e-06, |
| "loss": 0.8656, |
| "step": 374 |
| }, |
| { |
| "epoch": 0.6762849413886384, |
| "grad_norm": 2.230627699222089, |
| "learning_rate": 8.423423423423423e-06, |
| "loss": 0.773, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.6780883678990082, |
| "grad_norm": 2.0314632652565763, |
| "learning_rate": 8.445945945945948e-06, |
| "loss": 0.7375, |
| "step": 376 |
| }, |
| { |
| "epoch": 0.6798917944093779, |
| "grad_norm": 2.005821931578866, |
| "learning_rate": 8.46846846846847e-06, |
| "loss": 0.7262, |
| "step": 377 |
| }, |
| { |
| "epoch": 0.6816952209197475, |
| "grad_norm": 2.1743917104398647, |
| "learning_rate": 8.490990990990992e-06, |
| "loss": 0.7824, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.6834986474301172, |
| "grad_norm": 2.0955934020895066, |
| "learning_rate": 8.513513513513514e-06, |
| "loss": 0.785, |
| "step": 379 |
| }, |
| { |
| "epoch": 0.6853020739404869, |
| "grad_norm": 2.000365871785507, |
| "learning_rate": 8.536036036036037e-06, |
| "loss": 0.648, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.6871055004508566, |
| "grad_norm": 2.1478172637074744, |
| "learning_rate": 8.55855855855856e-06, |
| "loss": 0.8075, |
| "step": 381 |
| }, |
| { |
| "epoch": 0.6889089269612263, |
| "grad_norm": 2.134460577230095, |
| "learning_rate": 8.581081081081082e-06, |
| "loss": 0.9026, |
| "step": 382 |
| }, |
| { |
| "epoch": 0.690712353471596, |
| "grad_norm": 2.14542331689987, |
| "learning_rate": 8.603603603603605e-06, |
| "loss": 0.8901, |
| "step": 383 |
| }, |
| { |
| "epoch": 0.6925157799819658, |
| "grad_norm": 2.135300301139234, |
| "learning_rate": 8.626126126126126e-06, |
| "loss": 0.7259, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.6943192064923355, |
| "grad_norm": 2.474623212671607, |
| "learning_rate": 8.64864864864865e-06, |
| "loss": 0.8629, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.6961226330027052, |
| "grad_norm": 1.975908066289463, |
| "learning_rate": 8.671171171171172e-06, |
| "loss": 0.7249, |
| "step": 386 |
| }, |
| { |
| "epoch": 0.6979260595130748, |
| "grad_norm": 2.1653693128183016, |
| "learning_rate": 8.693693693693694e-06, |
| "loss": 0.8081, |
| "step": 387 |
| }, |
| { |
| "epoch": 0.6997294860234445, |
| "grad_norm": 1.8567902438166204, |
| "learning_rate": 8.716216216216217e-06, |
| "loss": 0.7579, |
| "step": 388 |
| }, |
| { |
| "epoch": 0.7015329125338142, |
| "grad_norm": 2.2215481111685484, |
| "learning_rate": 8.738738738738739e-06, |
| "loss": 0.9716, |
| "step": 389 |
| }, |
| { |
| "epoch": 0.7033363390441839, |
| "grad_norm": 2.3046170296242, |
| "learning_rate": 8.761261261261262e-06, |
| "loss": 0.7795, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.7051397655545536, |
| "grad_norm": 2.131248198058394, |
| "learning_rate": 8.783783783783785e-06, |
| "loss": 0.9155, |
| "step": 391 |
| }, |
| { |
| "epoch": 0.7069431920649234, |
| "grad_norm": 1.924396723021384, |
| "learning_rate": 8.806306306306306e-06, |
| "loss": 0.7556, |
| "step": 392 |
| }, |
| { |
| "epoch": 0.7087466185752931, |
| "grad_norm": 2.1117533927836996, |
| "learning_rate": 8.82882882882883e-06, |
| "loss": 0.8406, |
| "step": 393 |
| }, |
| { |
| "epoch": 0.7105500450856628, |
| "grad_norm": 2.075709429966764, |
| "learning_rate": 8.851351351351351e-06, |
| "loss": 0.745, |
| "step": 394 |
| }, |
| { |
| "epoch": 0.7123534715960325, |
| "grad_norm": 2.16985925608763, |
| "learning_rate": 8.873873873873876e-06, |
| "loss": 0.6691, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.7141568981064021, |
| "grad_norm": 2.0797926880074846, |
| "learning_rate": 8.896396396396397e-06, |
| "loss": 0.8238, |
| "step": 396 |
| }, |
| { |
| "epoch": 0.7159603246167718, |
| "grad_norm": 2.0186248223482997, |
| "learning_rate": 8.91891891891892e-06, |
| "loss": 0.7898, |
| "step": 397 |
| }, |
| { |
| "epoch": 0.7177637511271415, |
| "grad_norm": 2.185076250626117, |
| "learning_rate": 8.941441441441442e-06, |
| "loss": 0.7142, |
| "step": 398 |
| }, |
| { |
| "epoch": 0.7195671776375113, |
| "grad_norm": 2.1357619098512384, |
| "learning_rate": 8.963963963963965e-06, |
| "loss": 0.723, |
| "step": 399 |
| }, |
| { |
| "epoch": 0.721370604147881, |
| "grad_norm": 2.117174241205152, |
| "learning_rate": 8.986486486486488e-06, |
| "loss": 0.7863, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.7231740306582507, |
| "grad_norm": 2.0651532522605214, |
| "learning_rate": 9.00900900900901e-06, |
| "loss": 0.7213, |
| "step": 401 |
| }, |
| { |
| "epoch": 0.7249774571686204, |
| "grad_norm": 1.9757749532276578, |
| "learning_rate": 9.031531531531533e-06, |
| "loss": 0.7742, |
| "step": 402 |
| }, |
| { |
| "epoch": 0.7267808836789901, |
| "grad_norm": 2.3393951402241755, |
| "learning_rate": 9.054054054054054e-06, |
| "loss": 0.7721, |
| "step": 403 |
| }, |
| { |
| "epoch": 0.7285843101893598, |
| "grad_norm": 2.2574331805115064, |
| "learning_rate": 9.076576576576577e-06, |
| "loss": 0.9634, |
| "step": 404 |
| }, |
| { |
| "epoch": 0.7303877366997295, |
| "grad_norm": 2.0915118092689524, |
| "learning_rate": 9.0990990990991e-06, |
| "loss": 0.8077, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.7321911632100991, |
| "grad_norm": 2.1653381189020524, |
| "learning_rate": 9.121621621621622e-06, |
| "loss": 0.7777, |
| "step": 406 |
| }, |
| { |
| "epoch": 0.7339945897204689, |
| "grad_norm": 2.1328560039152458, |
| "learning_rate": 9.144144144144145e-06, |
| "loss": 0.7882, |
| "step": 407 |
| }, |
| { |
| "epoch": 0.7357980162308386, |
| "grad_norm": 2.201274651740219, |
| "learning_rate": 9.166666666666666e-06, |
| "loss": 0.7608, |
| "step": 408 |
| }, |
| { |
| "epoch": 0.7376014427412083, |
| "grad_norm": 2.2359271481989587, |
| "learning_rate": 9.189189189189191e-06, |
| "loss": 0.8347, |
| "step": 409 |
| }, |
| { |
| "epoch": 0.739404869251578, |
| "grad_norm": 2.1161842611073034, |
| "learning_rate": 9.211711711711713e-06, |
| "loss": 0.7557, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.7412082957619477, |
| "grad_norm": 2.028791176838769, |
| "learning_rate": 9.234234234234236e-06, |
| "loss": 0.7183, |
| "step": 411 |
| }, |
| { |
| "epoch": 0.7430117222723174, |
| "grad_norm": 2.0656955597804503, |
| "learning_rate": 9.256756756756757e-06, |
| "loss": 0.7699, |
| "step": 412 |
| }, |
| { |
| "epoch": 0.7448151487826871, |
| "grad_norm": 2.087727535028477, |
| "learning_rate": 9.27927927927928e-06, |
| "loss": 0.7679, |
| "step": 413 |
| }, |
| { |
| "epoch": 0.7466185752930569, |
| "grad_norm": 2.49764840147548, |
| "learning_rate": 9.301801801801804e-06, |
| "loss": 0.8287, |
| "step": 414 |
| }, |
| { |
| "epoch": 0.7484220018034266, |
| "grad_norm": 1.9741285413389515, |
| "learning_rate": 9.324324324324325e-06, |
| "loss": 0.8161, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.7502254283137962, |
| "grad_norm": 1.9406848284047182, |
| "learning_rate": 9.346846846846848e-06, |
| "loss": 0.768, |
| "step": 416 |
| }, |
| { |
| "epoch": 0.7520288548241659, |
| "grad_norm": 2.244408508249851, |
| "learning_rate": 9.36936936936937e-06, |
| "loss": 0.8611, |
| "step": 417 |
| }, |
| { |
| "epoch": 0.7538322813345356, |
| "grad_norm": 2.158384016489991, |
| "learning_rate": 9.391891891891893e-06, |
| "loss": 0.8361, |
| "step": 418 |
| }, |
| { |
| "epoch": 0.7556357078449053, |
| "grad_norm": 2.066053768199076, |
| "learning_rate": 9.414414414414416e-06, |
| "loss": 0.819, |
| "step": 419 |
| }, |
| { |
| "epoch": 0.757439134355275, |
| "grad_norm": 2.1350022569990603, |
| "learning_rate": 9.436936936936937e-06, |
| "loss": 0.7075, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.7592425608656447, |
| "grad_norm": 2.0051316619920745, |
| "learning_rate": 9.45945945945946e-06, |
| "loss": 0.8319, |
| "step": 421 |
| }, |
| { |
| "epoch": 0.7610459873760145, |
| "grad_norm": 2.225160013360467, |
| "learning_rate": 9.481981981981982e-06, |
| "loss": 0.7308, |
| "step": 422 |
| }, |
| { |
| "epoch": 0.7628494138863842, |
| "grad_norm": 2.276993744188313, |
| "learning_rate": 9.504504504504505e-06, |
| "loss": 0.8014, |
| "step": 423 |
| }, |
| { |
| "epoch": 0.7646528403967539, |
| "grad_norm": 1.8858762886928577, |
| "learning_rate": 9.527027027027028e-06, |
| "loss": 0.7923, |
| "step": 424 |
| }, |
| { |
| "epoch": 0.7664562669071235, |
| "grad_norm": 2.0252801103636195, |
| "learning_rate": 9.54954954954955e-06, |
| "loss": 0.76, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.7682596934174932, |
| "grad_norm": 2.0418148296691503, |
| "learning_rate": 9.572072072072073e-06, |
| "loss": 0.7714, |
| "step": 426 |
| }, |
| { |
| "epoch": 0.7700631199278629, |
| "grad_norm": 2.133301976541229, |
| "learning_rate": 9.594594594594594e-06, |
| "loss": 0.918, |
| "step": 427 |
| }, |
| { |
| "epoch": 0.7718665464382326, |
| "grad_norm": 1.8904070256803192, |
| "learning_rate": 9.617117117117117e-06, |
| "loss": 0.8019, |
| "step": 428 |
| }, |
| { |
| "epoch": 0.7736699729486023, |
| "grad_norm": 2.1802687710567445, |
| "learning_rate": 9.63963963963964e-06, |
| "loss": 0.7124, |
| "step": 429 |
| }, |
| { |
| "epoch": 0.7754733994589721, |
| "grad_norm": 2.0041551174293883, |
| "learning_rate": 9.662162162162164e-06, |
| "loss": 0.7317, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.7772768259693418, |
| "grad_norm": 2.1685058642707085, |
| "learning_rate": 9.684684684684685e-06, |
| "loss": 0.8182, |
| "step": 431 |
| }, |
| { |
| "epoch": 0.7790802524797115, |
| "grad_norm": 2.0821300887019394, |
| "learning_rate": 9.707207207207208e-06, |
| "loss": 0.7483, |
| "step": 432 |
| }, |
| { |
| "epoch": 0.7808836789900812, |
| "grad_norm": 2.017254787966865, |
| "learning_rate": 9.729729729729732e-06, |
| "loss": 0.7439, |
| "step": 433 |
| }, |
| { |
| "epoch": 0.7826871055004508, |
| "grad_norm": 2.0938808603633965, |
| "learning_rate": 9.752252252252253e-06, |
| "loss": 0.727, |
| "step": 434 |
| }, |
| { |
| "epoch": 0.7844905320108205, |
| "grad_norm": 2.0745407772671784, |
| "learning_rate": 9.774774774774776e-06, |
| "loss": 0.8423, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.7862939585211902, |
| "grad_norm": 2.080200509416444, |
| "learning_rate": 9.797297297297298e-06, |
| "loss": 0.7642, |
| "step": 436 |
| }, |
| { |
| "epoch": 0.78809738503156, |
| "grad_norm": 2.0994306655505537, |
| "learning_rate": 9.81981981981982e-06, |
| "loss": 0.839, |
| "step": 437 |
| }, |
| { |
| "epoch": 0.7899008115419297, |
| "grad_norm": 2.2979389003423423, |
| "learning_rate": 9.842342342342344e-06, |
| "loss": 1.0683, |
| "step": 438 |
| }, |
| { |
| "epoch": 0.7917042380522994, |
| "grad_norm": 2.0022713778993046, |
| "learning_rate": 9.864864864864865e-06, |
| "loss": 0.8482, |
| "step": 439 |
| }, |
| { |
| "epoch": 0.7935076645626691, |
| "grad_norm": 2.079766390913082, |
| "learning_rate": 9.887387387387388e-06, |
| "loss": 0.8196, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.7953110910730388, |
| "grad_norm": 2.2693808506736555, |
| "learning_rate": 9.90990990990991e-06, |
| "loss": 0.8566, |
| "step": 441 |
| }, |
| { |
| "epoch": 0.7971145175834085, |
| "grad_norm": 2.0568829176639767, |
| "learning_rate": 9.932432432432433e-06, |
| "loss": 0.6902, |
| "step": 442 |
| }, |
| { |
| "epoch": 0.7989179440937781, |
| "grad_norm": 2.0605295508114687, |
| "learning_rate": 9.954954954954956e-06, |
| "loss": 0.7233, |
| "step": 443 |
| }, |
| { |
| "epoch": 0.8007213706041478, |
| "grad_norm": 2.0444950860289532, |
| "learning_rate": 9.97747747747748e-06, |
| "loss": 0.7836, |
| "step": 444 |
| }, |
| { |
| "epoch": 0.8025247971145176, |
| "grad_norm": 2.110829091240123, |
| "learning_rate": 1e-05, |
| "loss": 0.8386, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.8043282236248873, |
| "grad_norm": 1.9884942336111662, |
| "learning_rate": 9.999998454785508e-06, |
| "loss": 0.7082, |
| "step": 446 |
| }, |
| { |
| "epoch": 0.806131650135257, |
| "grad_norm": 2.259740074055523, |
| "learning_rate": 9.999993819142988e-06, |
| "loss": 0.7289, |
| "step": 447 |
| }, |
| { |
| "epoch": 0.8079350766456267, |
| "grad_norm": 1.9397862708682205, |
| "learning_rate": 9.999986093075303e-06, |
| "loss": 0.8564, |
| "step": 448 |
| }, |
| { |
| "epoch": 0.8097385031559964, |
| "grad_norm": 2.024093634732536, |
| "learning_rate": 9.99997527658723e-06, |
| "loss": 0.8771, |
| "step": 449 |
| }, |
| { |
| "epoch": 0.8115419296663661, |
| "grad_norm": 2.1014829667276866, |
| "learning_rate": 9.999961369685454e-06, |
| "loss": 0.7321, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.8133453561767358, |
| "grad_norm": 2.048448841813617, |
| "learning_rate": 9.999944372378571e-06, |
| "loss": 0.7546, |
| "step": 451 |
| }, |
| { |
| "epoch": 0.8151487826871056, |
| "grad_norm": 2.1737934515944346, |
| "learning_rate": 9.999924284677087e-06, |
| "loss": 0.7508, |
| "step": 452 |
| }, |
| { |
| "epoch": 0.8169522091974752, |
| "grad_norm": 2.1001584800928037, |
| "learning_rate": 9.999901106593418e-06, |
| "loss": 0.7644, |
| "step": 453 |
| }, |
| { |
| "epoch": 0.8187556357078449, |
| "grad_norm": 1.8767527278907021, |
| "learning_rate": 9.999874838141888e-06, |
| "loss": 0.7667, |
| "step": 454 |
| }, |
| { |
| "epoch": 0.8205590622182146, |
| "grad_norm": 2.04063723684597, |
| "learning_rate": 9.999845479338735e-06, |
| "loss": 0.8819, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.8223624887285843, |
| "grad_norm": 2.0555425814497728, |
| "learning_rate": 9.999813030202106e-06, |
| "loss": 0.7877, |
| "step": 456 |
| }, |
| { |
| "epoch": 0.824165915238954, |
| "grad_norm": 1.967379340855907, |
| "learning_rate": 9.999777490752056e-06, |
| "loss": 0.8801, |
| "step": 457 |
| }, |
| { |
| "epoch": 0.8259693417493237, |
| "grad_norm": 2.2507392839674556, |
| "learning_rate": 9.99973886101055e-06, |
| "loss": 0.7568, |
| "step": 458 |
| }, |
| { |
| "epoch": 0.8277727682596934, |
| "grad_norm": 2.5630927142344935, |
| "learning_rate": 9.99969714100147e-06, |
| "loss": 0.9138, |
| "step": 459 |
| }, |
| { |
| "epoch": 0.8295761947700632, |
| "grad_norm": 2.173332163298101, |
| "learning_rate": 9.999652330750595e-06, |
| "loss": 0.8281, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.8313796212804329, |
| "grad_norm": 1.916637925944768, |
| "learning_rate": 9.999604430285628e-06, |
| "loss": 0.7754, |
| "step": 461 |
| }, |
| { |
| "epoch": 0.8331830477908025, |
| "grad_norm": 2.1152249416855384, |
| "learning_rate": 9.999553439636171e-06, |
| "loss": 0.8997, |
| "step": 462 |
| }, |
| { |
| "epoch": 0.8349864743011722, |
| "grad_norm": 2.0689074493196955, |
| "learning_rate": 9.999499358833745e-06, |
| "loss": 0.7964, |
| "step": 463 |
| }, |
| { |
| "epoch": 0.8367899008115419, |
| "grad_norm": 2.0154413915192064, |
| "learning_rate": 9.999442187911774e-06, |
| "loss": 0.7699, |
| "step": 464 |
| }, |
| { |
| "epoch": 0.8385933273219116, |
| "grad_norm": 1.9151923147532852, |
| "learning_rate": 9.999381926905592e-06, |
| "loss": 0.6932, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.8403967538322813, |
| "grad_norm": 2.1665705435782336, |
| "learning_rate": 9.999318575852451e-06, |
| "loss": 1.0093, |
| "step": 466 |
| }, |
| { |
| "epoch": 0.842200180342651, |
| "grad_norm": 1.8047363755961323, |
| "learning_rate": 9.999252134791504e-06, |
| "loss": 0.6659, |
| "step": 467 |
| }, |
| { |
| "epoch": 0.8440036068530208, |
| "grad_norm": 2.151615247814168, |
| "learning_rate": 9.999182603763816e-06, |
| "loss": 0.7546, |
| "step": 468 |
| }, |
| { |
| "epoch": 0.8458070333633905, |
| "grad_norm": 2.4423751661740503, |
| "learning_rate": 9.999109982812368e-06, |
| "loss": 0.9198, |
| "step": 469 |
| }, |
| { |
| "epoch": 0.8476104598737602, |
| "grad_norm": 1.9717826223860573, |
| "learning_rate": 9.99903427198204e-06, |
| "loss": 0.7544, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.8494138863841298, |
| "grad_norm": 2.0312411419769005, |
| "learning_rate": 9.99895547131963e-06, |
| "loss": 0.8107, |
| "step": 471 |
| }, |
| { |
| "epoch": 0.8512173128944995, |
| "grad_norm": 2.157429067167941, |
| "learning_rate": 9.998873580873848e-06, |
| "loss": 0.6818, |
| "step": 472 |
| }, |
| { |
| "epoch": 0.8530207394048692, |
| "grad_norm": 2.165363885628237, |
| "learning_rate": 9.998788600695304e-06, |
| "loss": 0.7382, |
| "step": 473 |
| }, |
| { |
| "epoch": 0.8548241659152389, |
| "grad_norm": 2.147934133999578, |
| "learning_rate": 9.998700530836525e-06, |
| "loss": 0.8056, |
| "step": 474 |
| }, |
| { |
| "epoch": 0.8566275924256087, |
| "grad_norm": 1.9152522216267172, |
| "learning_rate": 9.998609371351944e-06, |
| "loss": 0.8791, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.8584310189359784, |
| "grad_norm": 2.0560094856153976, |
| "learning_rate": 9.998515122297909e-06, |
| "loss": 0.8172, |
| "step": 476 |
| }, |
| { |
| "epoch": 0.8602344454463481, |
| "grad_norm": 2.123184949091883, |
| "learning_rate": 9.99841778373267e-06, |
| "loss": 0.8243, |
| "step": 477 |
| }, |
| { |
| "epoch": 0.8620378719567178, |
| "grad_norm": 2.178911188445628, |
| "learning_rate": 9.998317355716393e-06, |
| "loss": 0.9132, |
| "step": 478 |
| }, |
| { |
| "epoch": 0.8638412984670875, |
| "grad_norm": 2.2802080985811766, |
| "learning_rate": 9.99821383831115e-06, |
| "loss": 0.8641, |
| "step": 479 |
| }, |
| { |
| "epoch": 0.8656447249774571, |
| "grad_norm": 2.0900308677818287, |
| "learning_rate": 9.998107231580925e-06, |
| "loss": 0.7905, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.8674481514878268, |
| "grad_norm": 1.9731645490377003, |
| "learning_rate": 9.99799753559161e-06, |
| "loss": 0.639, |
| "step": 481 |
| }, |
| { |
| "epoch": 0.8692515779981965, |
| "grad_norm": 2.253348194223949, |
| "learning_rate": 9.997884750411004e-06, |
| "loss": 0.7037, |
| "step": 482 |
| }, |
| { |
| "epoch": 0.8710550045085663, |
| "grad_norm": 2.052696164388112, |
| "learning_rate": 9.99776887610882e-06, |
| "loss": 0.7247, |
| "step": 483 |
| }, |
| { |
| "epoch": 0.872858431018936, |
| "grad_norm": 2.0018489890660196, |
| "learning_rate": 9.997649912756678e-06, |
| "loss": 0.8574, |
| "step": 484 |
| }, |
| { |
| "epoch": 0.8746618575293057, |
| "grad_norm": 2.168583333160758, |
| "learning_rate": 9.997527860428108e-06, |
| "loss": 0.7786, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.8764652840396754, |
| "grad_norm": 1.943218814932057, |
| "learning_rate": 9.99740271919855e-06, |
| "loss": 0.8582, |
| "step": 486 |
| }, |
| { |
| "epoch": 0.8782687105500451, |
| "grad_norm": 2.3671546883192964, |
| "learning_rate": 9.997274489145348e-06, |
| "loss": 0.8454, |
| "step": 487 |
| }, |
| { |
| "epoch": 0.8800721370604148, |
| "grad_norm": 1.970853464509309, |
| "learning_rate": 9.997143170347762e-06, |
| "loss": 0.8135, |
| "step": 488 |
| }, |
| { |
| "epoch": 0.8818755635707844, |
| "grad_norm": 2.0194288814353505, |
| "learning_rate": 9.997008762886957e-06, |
| "loss": 0.8322, |
| "step": 489 |
| }, |
| { |
| "epoch": 0.8836789900811542, |
| "grad_norm": 1.7905144760024025, |
| "learning_rate": 9.99687126684601e-06, |
| "loss": 0.6747, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.8854824165915239, |
| "grad_norm": 1.8598765372123462, |
| "learning_rate": 9.996730682309905e-06, |
| "loss": 0.7077, |
| "step": 491 |
| }, |
| { |
| "epoch": 0.8872858431018936, |
| "grad_norm": 2.0349723629280194, |
| "learning_rate": 9.996587009365534e-06, |
| "loss": 1.0192, |
| "step": 492 |
| }, |
| { |
| "epoch": 0.8890892696122633, |
| "grad_norm": 1.8670659449601439, |
| "learning_rate": 9.9964402481017e-06, |
| "loss": 0.7877, |
| "step": 493 |
| }, |
| { |
| "epoch": 0.890892696122633, |
| "grad_norm": 1.9920058723596443, |
| "learning_rate": 9.996290398609115e-06, |
| "loss": 0.7732, |
| "step": 494 |
| }, |
| { |
| "epoch": 0.8926961226330027, |
| "grad_norm": 2.115518704658833, |
| "learning_rate": 9.996137460980397e-06, |
| "loss": 0.9214, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.8944995491433724, |
| "grad_norm": 1.8785646679648142, |
| "learning_rate": 9.995981435310078e-06, |
| "loss": 0.7817, |
| "step": 496 |
| }, |
| { |
| "epoch": 0.8963029756537421, |
| "grad_norm": 2.055980396750251, |
| "learning_rate": 9.99582232169459e-06, |
| "loss": 0.7329, |
| "step": 497 |
| }, |
| { |
| "epoch": 0.8981064021641119, |
| "grad_norm": 2.3245341827701016, |
| "learning_rate": 9.995660120232282e-06, |
| "loss": 0.7507, |
| "step": 498 |
| }, |
| { |
| "epoch": 0.8999098286744815, |
| "grad_norm": 1.9959708222467396, |
| "learning_rate": 9.99549483102341e-06, |
| "loss": 0.8384, |
| "step": 499 |
| }, |
| { |
| "epoch": 0.9017132551848512, |
| "grad_norm": 1.8680585762600073, |
| "learning_rate": 9.995326454170132e-06, |
| "loss": 0.7024, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.9035166816952209, |
| "grad_norm": 2.0117030514545378, |
| "learning_rate": 9.995154989776523e-06, |
| "loss": 0.7802, |
| "step": 501 |
| }, |
| { |
| "epoch": 0.9053201082055906, |
| "grad_norm": 2.137679667033616, |
| "learning_rate": 9.994980437948563e-06, |
| "loss": 0.8063, |
| "step": 502 |
| }, |
| { |
| "epoch": 0.9071235347159603, |
| "grad_norm": 2.1841726174477225, |
| "learning_rate": 9.994802798794138e-06, |
| "loss": 0.8739, |
| "step": 503 |
| }, |
| { |
| "epoch": 0.90892696122633, |
| "grad_norm": 2.0156304452286316, |
| "learning_rate": 9.994622072423046e-06, |
| "loss": 0.8506, |
| "step": 504 |
| }, |
| { |
| "epoch": 0.9107303877366997, |
| "grad_norm": 1.9906410530703365, |
| "learning_rate": 9.99443825894699e-06, |
| "loss": 0.756, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.9125338142470695, |
| "grad_norm": 2.112608845026693, |
| "learning_rate": 9.994251358479583e-06, |
| "loss": 0.8051, |
| "step": 506 |
| }, |
| { |
| "epoch": 0.9143372407574392, |
| "grad_norm": 2.043964300441665, |
| "learning_rate": 9.994061371136347e-06, |
| "loss": 0.7568, |
| "step": 507 |
| }, |
| { |
| "epoch": 0.9161406672678089, |
| "grad_norm": 1.938078803529045, |
| "learning_rate": 9.993868297034709e-06, |
| "loss": 0.6958, |
| "step": 508 |
| }, |
| { |
| "epoch": 0.9179440937781785, |
| "grad_norm": 2.136141218098241, |
| "learning_rate": 9.993672136294004e-06, |
| "loss": 0.8964, |
| "step": 509 |
| }, |
| { |
| "epoch": 0.9197475202885482, |
| "grad_norm": 2.159727291798854, |
| "learning_rate": 9.993472889035478e-06, |
| "loss": 0.7743, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.9215509467989179, |
| "grad_norm": 2.067924171823381, |
| "learning_rate": 9.993270555382283e-06, |
| "loss": 0.7229, |
| "step": 511 |
| }, |
| { |
| "epoch": 0.9233543733092876, |
| "grad_norm": 1.9675288285614405, |
| "learning_rate": 9.99306513545948e-06, |
| "loss": 0.7454, |
| "step": 512 |
| }, |
| { |
| "epoch": 0.9251577998196574, |
| "grad_norm": 2.022409969863871, |
| "learning_rate": 9.99285662939403e-06, |
| "loss": 0.8741, |
| "step": 513 |
| }, |
| { |
| "epoch": 0.9269612263300271, |
| "grad_norm": 2.1801414103496084, |
| "learning_rate": 9.992645037314815e-06, |
| "loss": 0.9204, |
| "step": 514 |
| }, |
| { |
| "epoch": 0.9287646528403968, |
| "grad_norm": 2.243366786461632, |
| "learning_rate": 9.992430359352613e-06, |
| "loss": 0.7942, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.9305680793507665, |
| "grad_norm": 2.035046137492824, |
| "learning_rate": 9.992212595640115e-06, |
| "loss": 0.7946, |
| "step": 516 |
| }, |
| { |
| "epoch": 0.9323715058611362, |
| "grad_norm": 1.9398861696717984, |
| "learning_rate": 9.991991746311916e-06, |
| "loss": 0.8198, |
| "step": 517 |
| }, |
| { |
| "epoch": 0.9341749323715058, |
| "grad_norm": 2.069297097067726, |
| "learning_rate": 9.991767811504522e-06, |
| "loss": 0.7359, |
| "step": 518 |
| }, |
| { |
| "epoch": 0.9359783588818755, |
| "grad_norm": 2.0501351425908516, |
| "learning_rate": 9.991540791356342e-06, |
| "loss": 0.7781, |
| "step": 519 |
| }, |
| { |
| "epoch": 0.9377817853922452, |
| "grad_norm": 2.0883364441479713, |
| "learning_rate": 9.991310686007694e-06, |
| "loss": 0.7445, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.939585211902615, |
| "grad_norm": 2.000915313932848, |
| "learning_rate": 9.991077495600806e-06, |
| "loss": 0.877, |
| "step": 521 |
| }, |
| { |
| "epoch": 0.9413886384129847, |
| "grad_norm": 2.0041196859921637, |
| "learning_rate": 9.990841220279805e-06, |
| "loss": 0.7847, |
| "step": 522 |
| }, |
| { |
| "epoch": 0.9431920649233544, |
| "grad_norm": 2.0817088464090614, |
| "learning_rate": 9.990601860190732e-06, |
| "loss": 0.769, |
| "step": 523 |
| }, |
| { |
| "epoch": 0.9449954914337241, |
| "grad_norm": 1.8694308179676222, |
| "learning_rate": 9.990359415481532e-06, |
| "loss": 0.7341, |
| "step": 524 |
| }, |
| { |
| "epoch": 0.9467989179440938, |
| "grad_norm": 1.929784513064164, |
| "learning_rate": 9.990113886302057e-06, |
| "loss": 0.9216, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.9486023444544635, |
| "grad_norm": 2.0430562116596698, |
| "learning_rate": 9.989865272804064e-06, |
| "loss": 0.9328, |
| "step": 526 |
| }, |
| { |
| "epoch": 0.9504057709648331, |
| "grad_norm": 2.0821273642223366, |
| "learning_rate": 9.989613575141216e-06, |
| "loss": 0.7013, |
| "step": 527 |
| }, |
| { |
| "epoch": 0.9522091974752029, |
| "grad_norm": 2.107240374281522, |
| "learning_rate": 9.989358793469089e-06, |
| "loss": 0.8061, |
| "step": 528 |
| }, |
| { |
| "epoch": 0.9540126239855726, |
| "grad_norm": 1.9040690842756598, |
| "learning_rate": 9.989100927945155e-06, |
| "loss": 0.6969, |
| "step": 529 |
| }, |
| { |
| "epoch": 0.9558160504959423, |
| "grad_norm": 1.9025952752616262, |
| "learning_rate": 9.988839978728798e-06, |
| "loss": 0.8238, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.957619477006312, |
| "grad_norm": 2.26685993097902, |
| "learning_rate": 9.988575945981308e-06, |
| "loss": 0.72, |
| "step": 531 |
| }, |
| { |
| "epoch": 0.9594229035166817, |
| "grad_norm": 1.9202409462283874, |
| "learning_rate": 9.98830882986588e-06, |
| "loss": 0.6961, |
| "step": 532 |
| }, |
| { |
| "epoch": 0.9612263300270514, |
| "grad_norm": 1.9580497289053733, |
| "learning_rate": 9.988038630547613e-06, |
| "loss": 0.7772, |
| "step": 533 |
| }, |
| { |
| "epoch": 0.9630297565374211, |
| "grad_norm": 2.1460745134447143, |
| "learning_rate": 9.987765348193517e-06, |
| "loss": 0.7882, |
| "step": 534 |
| }, |
| { |
| "epoch": 0.9648331830477908, |
| "grad_norm": 2.232600000911527, |
| "learning_rate": 9.9874889829725e-06, |
| "loss": 0.7665, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.9666366095581606, |
| "grad_norm": 2.169430275754146, |
| "learning_rate": 9.98720953505538e-06, |
| "loss": 0.8654, |
| "step": 536 |
| }, |
| { |
| "epoch": 0.9684400360685302, |
| "grad_norm": 2.1645017061078096, |
| "learning_rate": 9.986927004614881e-06, |
| "loss": 0.7641, |
| "step": 537 |
| }, |
| { |
| "epoch": 0.9702434625788999, |
| "grad_norm": 1.964406868016901, |
| "learning_rate": 9.986641391825633e-06, |
| "loss": 0.74, |
| "step": 538 |
| }, |
| { |
| "epoch": 0.9720468890892696, |
| "grad_norm": 2.024095725628653, |
| "learning_rate": 9.986352696864165e-06, |
| "loss": 0.7718, |
| "step": 539 |
| }, |
| { |
| "epoch": 0.9738503155996393, |
| "grad_norm": 2.256779549272981, |
| "learning_rate": 9.986060919908917e-06, |
| "loss": 0.8262, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.975653742110009, |
| "grad_norm": 1.9660904201916358, |
| "learning_rate": 9.985766061140233e-06, |
| "loss": 0.6275, |
| "step": 541 |
| }, |
| { |
| "epoch": 0.9774571686203787, |
| "grad_norm": 2.1280476336617378, |
| "learning_rate": 9.985468120740361e-06, |
| "loss": 0.768, |
| "step": 542 |
| }, |
| { |
| "epoch": 0.9792605951307484, |
| "grad_norm": 1.938265598688848, |
| "learning_rate": 9.985167098893452e-06, |
| "loss": 0.9998, |
| "step": 543 |
| }, |
| { |
| "epoch": 0.9810640216411182, |
| "grad_norm": 2.1379975513910487, |
| "learning_rate": 9.984862995785564e-06, |
| "loss": 0.8202, |
| "step": 544 |
| }, |
| { |
| "epoch": 0.9828674481514879, |
| "grad_norm": 2.0638541854893884, |
| "learning_rate": 9.984555811604662e-06, |
| "loss": 0.7729, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.9846708746618575, |
| "grad_norm": 2.34875633165973, |
| "learning_rate": 9.984245546540606e-06, |
| "loss": 0.8073, |
| "step": 546 |
| }, |
| { |
| "epoch": 0.9864743011722272, |
| "grad_norm": 2.1131918995078056, |
| "learning_rate": 9.983932200785173e-06, |
| "loss": 0.7262, |
| "step": 547 |
| }, |
| { |
| "epoch": 0.9882777276825969, |
| "grad_norm": 1.995224926283804, |
| "learning_rate": 9.983615774532031e-06, |
| "loss": 0.8007, |
| "step": 548 |
| }, |
| { |
| "epoch": 0.9900811541929666, |
| "grad_norm": 1.8849007313400998, |
| "learning_rate": 9.983296267976766e-06, |
| "loss": 0.6879, |
| "step": 549 |
| }, |
| { |
| "epoch": 0.9918845807033363, |
| "grad_norm": 2.051745716721103, |
| "learning_rate": 9.982973681316854e-06, |
| "loss": 0.7265, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.9936880072137061, |
| "grad_norm": 2.0528171651989693, |
| "learning_rate": 9.982648014751685e-06, |
| "loss": 0.7505, |
| "step": 551 |
| }, |
| { |
| "epoch": 0.9954914337240758, |
| "grad_norm": 2.2911439260287336, |
| "learning_rate": 9.982319268482547e-06, |
| "loss": 0.8454, |
| "step": 552 |
| }, |
| { |
| "epoch": 0.9972948602344455, |
| "grad_norm": 2.2047269517513257, |
| "learning_rate": 9.981987442712634e-06, |
| "loss": 0.8355, |
| "step": 553 |
| }, |
| { |
| "epoch": 0.9990982867448152, |
| "grad_norm": 1.7988349615615926, |
| "learning_rate": 9.981652537647041e-06, |
| "loss": 0.6762, |
| "step": 554 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 4440, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 8, |
| "save_steps": 277, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 260908552552448.0, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|