{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 231, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004329004329004329, "grad_norm": 6.30618953704834, "learning_rate": 0.0, "loss": 0.4668, "step": 1 }, { "epoch": 0.008658008658008658, "grad_norm": 3.96525502204895, "learning_rate": 1.4285714285714285e-05, "loss": 0.3887, "step": 2 }, { "epoch": 0.012987012987012988, "grad_norm": 4.628964424133301, "learning_rate": 2.857142857142857e-05, "loss": 0.4199, "step": 3 }, { "epoch": 0.017316017316017316, "grad_norm": 4.340306282043457, "learning_rate": 4.2857142857142856e-05, "loss": 0.3379, "step": 4 }, { "epoch": 0.021645021645021644, "grad_norm": 10.413896560668945, "learning_rate": 5.714285714285714e-05, "loss": 0.4785, "step": 5 }, { "epoch": 0.025974025974025976, "grad_norm": 3.5524611473083496, "learning_rate": 7.142857142857143e-05, "loss": 0.3965, "step": 6 }, { "epoch": 0.030303030303030304, "grad_norm": 3.9681403636932373, "learning_rate": 8.571428571428571e-05, "loss": 0.334, "step": 7 }, { "epoch": 0.03463203463203463, "grad_norm": 6.837435722351074, "learning_rate": 0.0001, "loss": 0.4609, "step": 8 }, { "epoch": 0.03896103896103896, "grad_norm": 6.86609411239624, "learning_rate": 9.999508258797877e-05, "loss": 0.4297, "step": 9 }, { "epoch": 0.04329004329004329, "grad_norm": 5.7420973777771, "learning_rate": 9.998033131915266e-05, "loss": 0.3691, "step": 10 }, { "epoch": 0.047619047619047616, "grad_norm": 5.456467151641846, "learning_rate": 9.995574909504435e-05, "loss": 0.4023, "step": 11 }, { "epoch": 0.05194805194805195, "grad_norm": 6.083012104034424, "learning_rate": 9.992134075089084e-05, "loss": 0.498, "step": 12 }, { "epoch": 0.05627705627705628, "grad_norm": 24.94365692138672, "learning_rate": 9.987711305469231e-05, "loss": 0.4551, "step": 13 }, { "epoch": 0.06060606060606061, "grad_norm": 9.905355453491211, "learning_rate": 9.982307470588098e-05, "loss": 0.4453, "step": 14 }, { "epoch": 0.06493506493506493, "grad_norm": 6.808415412902832, "learning_rate": 9.975923633360985e-05, "loss": 0.4434, "step": 15 }, { "epoch": 0.06926406926406926, "grad_norm": 8.532634735107422, "learning_rate": 9.968561049466214e-05, "loss": 0.4043, "step": 16 }, { "epoch": 0.0735930735930736, "grad_norm": 7.474434852600098, "learning_rate": 9.960221167098124e-05, "loss": 0.4473, "step": 17 }, { "epoch": 0.07792207792207792, "grad_norm": 31.124326705932617, "learning_rate": 9.950905626682228e-05, "loss": 0.3809, "step": 18 }, { "epoch": 0.08225108225108226, "grad_norm": 7.993480205535889, "learning_rate": 9.940616260552544e-05, "loss": 0.4863, "step": 19 }, { "epoch": 0.08658008658008658, "grad_norm": 6.659813404083252, "learning_rate": 9.92935509259118e-05, "loss": 0.3477, "step": 20 }, { "epoch": 0.09090909090909091, "grad_norm": 8.14820671081543, "learning_rate": 9.917124337830243e-05, "loss": 0.3809, "step": 21 }, { "epoch": 0.09523809523809523, "grad_norm": 6.370754718780518, "learning_rate": 9.903926402016153e-05, "loss": 0.4727, "step": 22 }, { "epoch": 0.09956709956709957, "grad_norm": 10.538797378540039, "learning_rate": 9.889763881136439e-05, "loss": 0.5156, "step": 23 }, { "epoch": 0.1038961038961039, "grad_norm": 8.41638469696045, "learning_rate": 9.874639560909117e-05, "loss": 0.4727, "step": 24 }, { "epoch": 0.10822510822510822, "grad_norm": 11.677297592163086, "learning_rate": 9.858556416234755e-05, "loss": 0.457, "step": 25 }, { "epoch": 0.11255411255411256, "grad_norm": 14.953546524047852, "learning_rate": 9.841517610611309e-05, "loss": 0.4336, "step": 26 }, { "epoch": 0.11688311688311688, "grad_norm": 7.516232013702393, "learning_rate": 9.82352649551188e-05, "loss": 0.3516, "step": 27 }, { "epoch": 0.12121212121212122, "grad_norm": 7.574983596801758, "learning_rate": 9.804586609725499e-05, "loss": 0.3926, "step": 28 }, { "epoch": 0.12554112554112554, "grad_norm": 17.778465270996094, "learning_rate": 9.784701678661045e-05, "loss": 0.4082, "step": 29 }, { "epoch": 0.12987012987012986, "grad_norm": 22.582393646240234, "learning_rate": 9.763875613614482e-05, "loss": 0.4688, "step": 30 }, { "epoch": 0.1341991341991342, "grad_norm": 156.63107299804688, "learning_rate": 9.742112510999515e-05, "loss": 0.4902, "step": 31 }, { "epoch": 0.13852813852813853, "grad_norm": 22.96047592163086, "learning_rate": 9.719416651541839e-05, "loss": 0.4336, "step": 32 }, { "epoch": 0.14285714285714285, "grad_norm": 10.698134422302246, "learning_rate": 9.69579249943714e-05, "loss": 0.4414, "step": 33 }, { "epoch": 0.1471861471861472, "grad_norm": 7.732542514801025, "learning_rate": 9.671244701472999e-05, "loss": 0.3789, "step": 34 }, { "epoch": 0.15151515151515152, "grad_norm": 20.447450637817383, "learning_rate": 9.645778086114892e-05, "loss": 0.4551, "step": 35 }, { "epoch": 0.15584415584415584, "grad_norm": 7.715920925140381, "learning_rate": 9.619397662556435e-05, "loss": 0.4863, "step": 36 }, { "epoch": 0.16017316017316016, "grad_norm": 11.215670585632324, "learning_rate": 9.592108619734106e-05, "loss": 0.4746, "step": 37 }, { "epoch": 0.1645021645021645, "grad_norm": 14.483991622924805, "learning_rate": 9.563916325306594e-05, "loss": 0.4062, "step": 38 }, { "epoch": 0.16883116883116883, "grad_norm": 9.34101390838623, "learning_rate": 9.534826324599003e-05, "loss": 0.4941, "step": 39 }, { "epoch": 0.17316017316017315, "grad_norm": 9.180691719055176, "learning_rate": 9.504844339512095e-05, "loss": 0.4707, "step": 40 }, { "epoch": 0.1774891774891775, "grad_norm": 5.87382173538208, "learning_rate": 9.473976267396831e-05, "loss": 0.5078, "step": 41 }, { "epoch": 0.18181818181818182, "grad_norm": 7.854617118835449, "learning_rate": 9.442228179894362e-05, "loss": 0.4746, "step": 42 }, { "epoch": 0.18614718614718614, "grad_norm": 5.667612075805664, "learning_rate": 9.409606321741775e-05, "loss": 0.5156, "step": 43 }, { "epoch": 0.19047619047619047, "grad_norm": 6.341343402862549, "learning_rate": 9.376117109543769e-05, "loss": 0.498, "step": 44 }, { "epoch": 0.19480519480519481, "grad_norm": 6.829994201660156, "learning_rate": 9.341767130510528e-05, "loss": 0.3711, "step": 45 }, { "epoch": 0.19913419913419914, "grad_norm": 11.078237533569336, "learning_rate": 9.306563141162046e-05, "loss": 0.4434, "step": 46 }, { "epoch": 0.20346320346320346, "grad_norm": 9.45587158203125, "learning_rate": 9.270512065999137e-05, "loss": 0.5234, "step": 47 }, { "epoch": 0.2077922077922078, "grad_norm": 6.764471530914307, "learning_rate": 9.233620996141421e-05, "loss": 0.4219, "step": 48 }, { "epoch": 0.21212121212121213, "grad_norm": 23.972339630126953, "learning_rate": 9.195897187932512e-05, "loss": 0.3965, "step": 49 }, { "epoch": 0.21645021645021645, "grad_norm": 5.304466724395752, "learning_rate": 9.157348061512727e-05, "loss": 0.418, "step": 50 }, { "epoch": 0.22077922077922077, "grad_norm": 5.903518199920654, "learning_rate": 9.117981199359574e-05, "loss": 0.2969, "step": 51 }, { "epoch": 0.22510822510822512, "grad_norm": 6.200258731842041, "learning_rate": 9.077804344796302e-05, "loss": 0.4551, "step": 52 }, { "epoch": 0.22943722943722944, "grad_norm": 7.089105606079102, "learning_rate": 9.036825400468812e-05, "loss": 0.457, "step": 53 }, { "epoch": 0.23376623376623376, "grad_norm": 34.66611862182617, "learning_rate": 8.995052426791247e-05, "loss": 0.418, "step": 54 }, { "epoch": 0.23809523809523808, "grad_norm": 5.267077922821045, "learning_rate": 8.952493640360517e-05, "loss": 0.4082, "step": 55 }, { "epoch": 0.24242424242424243, "grad_norm": 6.587023735046387, "learning_rate": 8.90915741234015e-05, "loss": 0.4512, "step": 56 }, { "epoch": 0.24675324675324675, "grad_norm": 7.729217529296875, "learning_rate": 8.865052266813685e-05, "loss": 0.4258, "step": 57 }, { "epoch": 0.2510822510822511, "grad_norm": 6.812266826629639, "learning_rate": 8.820186879108038e-05, "loss": 0.4512, "step": 58 }, { "epoch": 0.2554112554112554, "grad_norm": 11.491703033447266, "learning_rate": 8.77457007408708e-05, "loss": 0.2832, "step": 59 }, { "epoch": 0.2597402597402597, "grad_norm": 5.020540714263916, "learning_rate": 8.728210824415827e-05, "loss": 0.3496, "step": 60 }, { "epoch": 0.26406926406926406, "grad_norm": 5.021598815917969, "learning_rate": 8.681118248795547e-05, "loss": 0.5, "step": 61 }, { "epoch": 0.2683982683982684, "grad_norm": 8.810498237609863, "learning_rate": 8.633301610170135e-05, "loss": 0.4551, "step": 62 }, { "epoch": 0.2727272727272727, "grad_norm": 7.0777411460876465, "learning_rate": 8.584770313904137e-05, "loss": 0.3926, "step": 63 }, { "epoch": 0.27705627705627706, "grad_norm": 5.617433547973633, "learning_rate": 8.535533905932738e-05, "loss": 0.3223, "step": 64 }, { "epoch": 0.2813852813852814, "grad_norm": 5.157428741455078, "learning_rate": 8.485602070884117e-05, "loss": 0.373, "step": 65 }, { "epoch": 0.2857142857142857, "grad_norm": 6.726741790771484, "learning_rate": 8.434984630174509e-05, "loss": 0.4219, "step": 66 }, { "epoch": 0.29004329004329005, "grad_norm": 4.791531085968018, "learning_rate": 8.383691540076371e-05, "loss": 0.4199, "step": 67 }, { "epoch": 0.2943722943722944, "grad_norm": 5.5883026123046875, "learning_rate": 8.33173288976002e-05, "loss": 0.459, "step": 68 }, { "epoch": 0.2987012987012987, "grad_norm": 7.0759663581848145, "learning_rate": 8.279118899309122e-05, "loss": 0.4629, "step": 69 }, { "epoch": 0.30303030303030304, "grad_norm": 7.941741466522217, "learning_rate": 8.225859917710439e-05, "loss": 0.4316, "step": 70 }, { "epoch": 0.30735930735930733, "grad_norm": 4.9386396408081055, "learning_rate": 8.171966420818228e-05, "loss": 0.3359, "step": 71 }, { "epoch": 0.3116883116883117, "grad_norm": 4.561506748199463, "learning_rate": 8.117449009293668e-05, "loss": 0.373, "step": 72 }, { "epoch": 0.31601731601731603, "grad_norm": 6.921361446380615, "learning_rate": 8.062318406519751e-05, "loss": 0.3809, "step": 73 }, { "epoch": 0.3203463203463203, "grad_norm": 6.0779876708984375, "learning_rate": 8.006585456492029e-05, "loss": 0.416, "step": 74 }, { "epoch": 0.3246753246753247, "grad_norm": 14.405391693115234, "learning_rate": 7.950261121685641e-05, "loss": 0.3711, "step": 75 }, { "epoch": 0.329004329004329, "grad_norm": 12.624444961547852, "learning_rate": 7.89335648089903e-05, "loss": 0.3691, "step": 76 }, { "epoch": 0.3333333333333333, "grad_norm": 8.761910438537598, "learning_rate": 7.835882727074779e-05, "loss": 0.4707, "step": 77 }, { "epoch": 0.33766233766233766, "grad_norm": 6.742737770080566, "learning_rate": 7.777851165098012e-05, "loss": 0.2949, "step": 78 }, { "epoch": 0.341991341991342, "grad_norm": 13.422913551330566, "learning_rate": 7.719273209572744e-05, "loss": 0.377, "step": 79 }, { "epoch": 0.3463203463203463, "grad_norm": 8.455282211303711, "learning_rate": 7.660160382576683e-05, "loss": 0.4707, "step": 80 }, { "epoch": 0.35064935064935066, "grad_norm": 7.551178455352783, "learning_rate": 7.600524311394873e-05, "loss": 0.4688, "step": 81 }, { "epoch": 0.354978354978355, "grad_norm": 5.217031002044678, "learning_rate": 7.540376726232648e-05, "loss": 0.4492, "step": 82 }, { "epoch": 0.3593073593073593, "grad_norm": 7.432611465454102, "learning_rate": 7.47972945790834e-05, "loss": 0.4121, "step": 83 }, { "epoch": 0.36363636363636365, "grad_norm": 10.472357749938965, "learning_rate": 7.4185944355262e-05, "loss": 0.4199, "step": 84 }, { "epoch": 0.36796536796536794, "grad_norm": 8.361776351928711, "learning_rate": 7.35698368412999e-05, "loss": 0.3496, "step": 85 }, { "epoch": 0.3722943722943723, "grad_norm": 8.236878395080566, "learning_rate": 7.294909322337689e-05, "loss": 0.3906, "step": 86 }, { "epoch": 0.37662337662337664, "grad_norm": 6.491207122802734, "learning_rate": 7.232383559957814e-05, "loss": 0.4277, "step": 87 }, { "epoch": 0.38095238095238093, "grad_norm": 9.313511848449707, "learning_rate": 7.169418695587791e-05, "loss": 0.3867, "step": 88 }, { "epoch": 0.3852813852813853, "grad_norm": 13.22208309173584, "learning_rate": 7.106027114194855e-05, "loss": 0.4824, "step": 89 }, { "epoch": 0.38961038961038963, "grad_norm": 7.9978718757629395, "learning_rate": 7.042221284679982e-05, "loss": 0.4238, "step": 90 }, { "epoch": 0.3939393939393939, "grad_norm": 14.92418384552002, "learning_rate": 6.978013757425295e-05, "loss": 0.4941, "step": 91 }, { "epoch": 0.39826839826839827, "grad_norm": 8.810619354248047, "learning_rate": 6.91341716182545e-05, "loss": 0.3691, "step": 92 }, { "epoch": 0.4025974025974026, "grad_norm": 7.104898452758789, "learning_rate": 6.848444203803476e-05, "loss": 0.4609, "step": 93 }, { "epoch": 0.4069264069264069, "grad_norm": 30.174938201904297, "learning_rate": 6.783107663311565e-05, "loss": 0.375, "step": 94 }, { "epoch": 0.41125541125541126, "grad_norm": 6.395003318786621, "learning_rate": 6.717420391817306e-05, "loss": 0.4043, "step": 95 }, { "epoch": 0.4155844155844156, "grad_norm": 17.24394416809082, "learning_rate": 6.651395309775837e-05, "loss": 0.4434, "step": 96 }, { "epoch": 0.4199134199134199, "grad_norm": 6.928684711456299, "learning_rate": 6.585045404088441e-05, "loss": 0.4141, "step": 97 }, { "epoch": 0.42424242424242425, "grad_norm": 14.082657814025879, "learning_rate": 6.518383725548074e-05, "loss": 0.4414, "step": 98 }, { "epoch": 0.42857142857142855, "grad_norm": 21.20639991760254, "learning_rate": 6.451423386272312e-05, "loss": 0.4121, "step": 99 }, { "epoch": 0.4329004329004329, "grad_norm": 12.427319526672363, "learning_rate": 6.384177557124247e-05, "loss": 0.4258, "step": 100 }, { "epoch": 0.43722943722943725, "grad_norm": 10.986278533935547, "learning_rate": 6.316659465121824e-05, "loss": 0.3965, "step": 101 }, { "epoch": 0.44155844155844154, "grad_norm": 12.830971717834473, "learning_rate": 6.248882390836135e-05, "loss": 0.4375, "step": 102 }, { "epoch": 0.4458874458874459, "grad_norm": 14.790221214294434, "learning_rate": 6.180859665779172e-05, "loss": 0.4492, "step": 103 }, { "epoch": 0.45021645021645024, "grad_norm": 44.620018005371094, "learning_rate": 6.112604669781572e-05, "loss": 0.5703, "step": 104 }, { "epoch": 0.45454545454545453, "grad_norm": 13.396041870117188, "learning_rate": 6.04413082836085e-05, "loss": 0.4805, "step": 105 }, { "epoch": 0.4588744588744589, "grad_norm": 13.93975830078125, "learning_rate": 5.9754516100806423e-05, "loss": 0.3809, "step": 106 }, { "epoch": 0.46320346320346323, "grad_norm": 18.1014404296875, "learning_rate": 5.9065805239014923e-05, "loss": 0.4707, "step": 107 }, { "epoch": 0.4675324675324675, "grad_norm": 7.800663948059082, "learning_rate": 5.837531116523682e-05, "loss": 0.4121, "step": 108 }, { "epoch": 0.47186147186147187, "grad_norm": 9.331329345703125, "learning_rate": 5.76831696972265e-05, "loss": 0.3926, "step": 109 }, { "epoch": 0.47619047619047616, "grad_norm": 8.944900512695312, "learning_rate": 5.698951697677498e-05, "loss": 0.3262, "step": 110 }, { "epoch": 0.4805194805194805, "grad_norm": 20.515888214111328, "learning_rate": 5.629448944293127e-05, "loss": 0.4648, "step": 111 }, { "epoch": 0.48484848484848486, "grad_norm": 9.60425853729248, "learning_rate": 5.559822380516539e-05, "loss": 0.4023, "step": 112 }, { "epoch": 0.48917748917748916, "grad_norm": 9.976903915405273, "learning_rate": 5.490085701647805e-05, "loss": 0.3438, "step": 113 }, { "epoch": 0.4935064935064935, "grad_norm": 19.132675170898438, "learning_rate": 5.420252624646238e-05, "loss": 0.3965, "step": 114 }, { "epoch": 0.49783549783549785, "grad_norm": 12.469053268432617, "learning_rate": 5.3503368854323366e-05, "loss": 0.3848, "step": 115 }, { "epoch": 0.5021645021645021, "grad_norm": 14.417160034179688, "learning_rate": 5.2803522361859594e-05, "loss": 0.459, "step": 116 }, { "epoch": 0.5064935064935064, "grad_norm": 8.167932510375977, "learning_rate": 5.2103124426413264e-05, "loss": 0.3711, "step": 117 }, { "epoch": 0.5108225108225108, "grad_norm": 9.628777503967285, "learning_rate": 5.140231281379345e-05, "loss": 0.2695, "step": 118 }, { "epoch": 0.5151515151515151, "grad_norm": 13.074538230895996, "learning_rate": 5.070122537117812e-05, "loss": 0.3633, "step": 119 }, { "epoch": 0.5194805194805194, "grad_norm": 8.415633201599121, "learning_rate": 5e-05, "loss": 0.4395, "step": 120 }, { "epoch": 0.5238095238095238, "grad_norm": 16.27584457397461, "learning_rate": 4.929877462882189e-05, "loss": 0.416, "step": 121 }, { "epoch": 0.5281385281385281, "grad_norm": 5.963536262512207, "learning_rate": 4.859768718620656e-05, "loss": 0.3633, "step": 122 }, { "epoch": 0.5324675324675324, "grad_norm": 17.088741302490234, "learning_rate": 4.7896875573586755e-05, "loss": 0.3672, "step": 123 }, { "epoch": 0.5367965367965368, "grad_norm": 16.568267822265625, "learning_rate": 4.7196477638140404e-05, "loss": 0.4121, "step": 124 }, { "epoch": 0.5411255411255411, "grad_norm": 15.208264350891113, "learning_rate": 4.649663114567663e-05, "loss": 0.4082, "step": 125 }, { "epoch": 0.5454545454545454, "grad_norm": 11.11156177520752, "learning_rate": 4.579747375353763e-05, "loss": 0.375, "step": 126 }, { "epoch": 0.5497835497835498, "grad_norm": 15.04819107055664, "learning_rate": 4.509914298352197e-05, "loss": 0.3594, "step": 127 }, { "epoch": 0.5541125541125541, "grad_norm": 6.2651286125183105, "learning_rate": 4.4401776194834613e-05, "loss": 0.2793, "step": 128 }, { "epoch": 0.5584415584415584, "grad_norm": 9.082189559936523, "learning_rate": 4.370551055706874e-05, "loss": 0.3906, "step": 129 }, { "epoch": 0.5627705627705628, "grad_norm": 7.258723258972168, "learning_rate": 4.3010483023225045e-05, "loss": 0.3086, "step": 130 }, { "epoch": 0.5670995670995671, "grad_norm": 9.980514526367188, "learning_rate": 4.231683030277349e-05, "loss": 0.4043, "step": 131 }, { "epoch": 0.5714285714285714, "grad_norm": 13.367783546447754, "learning_rate": 4.162468883476319e-05, "loss": 0.3223, "step": 132 }, { "epoch": 0.5757575757575758, "grad_norm": 5.13516902923584, "learning_rate": 4.093419476098509e-05, "loss": 0.3027, "step": 133 }, { "epoch": 0.5800865800865801, "grad_norm": 6.814506530761719, "learning_rate": 4.0245483899193595e-05, "loss": 0.4395, "step": 134 }, { "epoch": 0.5844155844155844, "grad_norm": 9.574424743652344, "learning_rate": 3.955869171639152e-05, "loss": 0.3027, "step": 135 }, { "epoch": 0.5887445887445888, "grad_norm": 6.566686630249023, "learning_rate": 3.887395330218429e-05, "loss": 0.3262, "step": 136 }, { "epoch": 0.5930735930735931, "grad_norm": 14.991905212402344, "learning_rate": 3.81914033422083e-05, "loss": 0.3926, "step": 137 }, { "epoch": 0.5974025974025974, "grad_norm": 6.989403247833252, "learning_rate": 3.7511176091638653e-05, "loss": 0.416, "step": 138 }, { "epoch": 0.6017316017316018, "grad_norm": 99.71111297607422, "learning_rate": 3.683340534878176e-05, "loss": 0.3555, "step": 139 }, { "epoch": 0.6060606060606061, "grad_norm": 7.185469150543213, "learning_rate": 3.6158224428757535e-05, "loss": 0.3555, "step": 140 }, { "epoch": 0.6103896103896104, "grad_norm": 48.649051666259766, "learning_rate": 3.5485766137276894e-05, "loss": 0.3906, "step": 141 }, { "epoch": 0.6147186147186147, "grad_norm": 8.177955627441406, "learning_rate": 3.4816162744519263e-05, "loss": 0.4043, "step": 142 }, { "epoch": 0.6190476190476191, "grad_norm": 7.434495449066162, "learning_rate": 3.4149545959115605e-05, "loss": 0.3867, "step": 143 }, { "epoch": 0.6233766233766234, "grad_norm": 10.178008079528809, "learning_rate": 3.3486046902241664e-05, "loss": 0.4141, "step": 144 }, { "epoch": 0.6277056277056277, "grad_norm": 6.520373821258545, "learning_rate": 3.282579608182694e-05, "loss": 0.3301, "step": 145 }, { "epoch": 0.6320346320346321, "grad_norm": 15.099567413330078, "learning_rate": 3.216892336688435e-05, "loss": 0.3496, "step": 146 }, { "epoch": 0.6363636363636364, "grad_norm": 14.876191139221191, "learning_rate": 3.151555796196525e-05, "loss": 0.3691, "step": 147 }, { "epoch": 0.6406926406926406, "grad_norm": 10.065027236938477, "learning_rate": 3.086582838174551e-05, "loss": 0.3379, "step": 148 }, { "epoch": 0.645021645021645, "grad_norm": 7.905645370483398, "learning_rate": 3.021986242574707e-05, "loss": 0.4004, "step": 149 }, { "epoch": 0.6493506493506493, "grad_norm": 10.962891578674316, "learning_rate": 2.9577787153200197e-05, "loss": 0.4141, "step": 150 }, { "epoch": 0.6536796536796536, "grad_norm": 6.561282157897949, "learning_rate": 2.893972885805148e-05, "loss": 0.4844, "step": 151 }, { "epoch": 0.658008658008658, "grad_norm": 15.016473770141602, "learning_rate": 2.8305813044122097e-05, "loss": 0.3418, "step": 152 }, { "epoch": 0.6623376623376623, "grad_norm": 8.82343864440918, "learning_rate": 2.7676164400421862e-05, "loss": 0.3438, "step": 153 }, { "epoch": 0.6666666666666666, "grad_norm": 6.84242582321167, "learning_rate": 2.705090677662311e-05, "loss": 0.3438, "step": 154 }, { "epoch": 0.670995670995671, "grad_norm": 7.3810930252075195, "learning_rate": 2.6430163158700115e-05, "loss": 0.3906, "step": 155 }, { "epoch": 0.6753246753246753, "grad_norm": 8.569120407104492, "learning_rate": 2.581405564473801e-05, "loss": 0.4004, "step": 156 }, { "epoch": 0.6796536796536796, "grad_norm": 13.025605201721191, "learning_rate": 2.5202705420916627e-05, "loss": 0.373, "step": 157 }, { "epoch": 0.683982683982684, "grad_norm": 8.868095397949219, "learning_rate": 2.459623273767354e-05, "loss": 0.4199, "step": 158 }, { "epoch": 0.6883116883116883, "grad_norm": 6.385418891906738, "learning_rate": 2.3994756886051268e-05, "loss": 0.3203, "step": 159 }, { "epoch": 0.6926406926406926, "grad_norm": 8.916970252990723, "learning_rate": 2.3398396174233178e-05, "loss": 0.2949, "step": 160 }, { "epoch": 0.696969696969697, "grad_norm": 9.499614715576172, "learning_rate": 2.280726790427258e-05, "loss": 0.2832, "step": 161 }, { "epoch": 0.7012987012987013, "grad_norm": 14.341413497924805, "learning_rate": 2.2221488349019903e-05, "loss": 0.3125, "step": 162 }, { "epoch": 0.7056277056277056, "grad_norm": 6.781824111938477, "learning_rate": 2.164117272925221e-05, "loss": 0.332, "step": 163 }, { "epoch": 0.70995670995671, "grad_norm": 44.00325393676758, "learning_rate": 2.1066435191009715e-05, "loss": 0.4141, "step": 164 }, { "epoch": 0.7142857142857143, "grad_norm": 13.864439010620117, "learning_rate": 2.0497388783143602e-05, "loss": 0.4199, "step": 165 }, { "epoch": 0.7186147186147186, "grad_norm": 8.958086967468262, "learning_rate": 1.9934145435079702e-05, "loss": 0.3535, "step": 166 }, { "epoch": 0.7229437229437229, "grad_norm": 4.822065830230713, "learning_rate": 1.9376815934802496e-05, "loss": 0.3828, "step": 167 }, { "epoch": 0.7272727272727273, "grad_norm": 8.322967529296875, "learning_rate": 1.8825509907063327e-05, "loss": 0.4258, "step": 168 }, { "epoch": 0.7316017316017316, "grad_norm": 13.969364166259766, "learning_rate": 1.8280335791817733e-05, "loss": 0.3672, "step": 169 }, { "epoch": 0.7359307359307359, "grad_norm": 6.8569722175598145, "learning_rate": 1.774140082289563e-05, "loss": 0.3789, "step": 170 }, { "epoch": 0.7402597402597403, "grad_norm": 6.253945827484131, "learning_rate": 1.7208811006908798e-05, "loss": 0.3379, "step": 171 }, { "epoch": 0.7445887445887446, "grad_norm": 5.97335958480835, "learning_rate": 1.6682671102399805e-05, "loss": 0.3672, "step": 172 }, { "epoch": 0.7489177489177489, "grad_norm": 21.618064880371094, "learning_rate": 1.6163084599236278e-05, "loss": 0.4297, "step": 173 }, { "epoch": 0.7532467532467533, "grad_norm": 6.355106353759766, "learning_rate": 1.5650153698254916e-05, "loss": 0.2969, "step": 174 }, { "epoch": 0.7575757575757576, "grad_norm": 11.884471893310547, "learning_rate": 1.5143979291158838e-05, "loss": 0.4531, "step": 175 }, { "epoch": 0.7619047619047619, "grad_norm": 8.2210054397583, "learning_rate": 1.4644660940672627e-05, "loss": 0.3535, "step": 176 }, { "epoch": 0.7662337662337663, "grad_norm": 5.43289852142334, "learning_rate": 1.4152296860958642e-05, "loss": 0.3281, "step": 177 }, { "epoch": 0.7705627705627706, "grad_norm": 13.447378158569336, "learning_rate": 1.3666983898298657e-05, "loss": 0.4336, "step": 178 }, { "epoch": 0.7748917748917749, "grad_norm": 9.235896110534668, "learning_rate": 1.3188817512044544e-05, "loss": 0.3516, "step": 179 }, { "epoch": 0.7792207792207793, "grad_norm": 6.7336602210998535, "learning_rate": 1.2717891755841722e-05, "loss": 0.3223, "step": 180 }, { "epoch": 0.7835497835497836, "grad_norm": 6.973309516906738, "learning_rate": 1.225429925912921e-05, "loss": 0.3301, "step": 181 }, { "epoch": 0.7878787878787878, "grad_norm": 30.55255699157715, "learning_rate": 1.1798131208919627e-05, "loss": 0.3027, "step": 182 }, { "epoch": 0.7922077922077922, "grad_norm": 10.4804048538208, "learning_rate": 1.134947733186315e-05, "loss": 0.3848, "step": 183 }, { "epoch": 0.7965367965367965, "grad_norm": 5.046676158905029, "learning_rate": 1.090842587659851e-05, "loss": 0.4805, "step": 184 }, { "epoch": 0.8008658008658008, "grad_norm": 8.961450576782227, "learning_rate": 1.047506359639483e-05, "loss": 0.3398, "step": 185 }, { "epoch": 0.8051948051948052, "grad_norm": 10.69762134552002, "learning_rate": 1.004947573208756e-05, "loss": 0.3945, "step": 186 }, { "epoch": 0.8095238095238095, "grad_norm": 9.194847106933594, "learning_rate": 9.63174599531188e-06, "loss": 0.3379, "step": 187 }, { "epoch": 0.8138528138528138, "grad_norm": 7.735859394073486, "learning_rate": 9.221956552036992e-06, "loss": 0.373, "step": 188 }, { "epoch": 0.8181818181818182, "grad_norm": 7.634536266326904, "learning_rate": 8.820188006404268e-06, "loss": 0.3418, "step": 189 }, { "epoch": 0.8225108225108225, "grad_norm": 12.410811424255371, "learning_rate": 8.426519384872733e-06, "loss": 0.3125, "step": 190 }, { "epoch": 0.8268398268398268, "grad_norm": 44.61256408691406, "learning_rate": 8.041028120674893e-06, "loss": 0.3418, "step": 191 }, { "epoch": 0.8311688311688312, "grad_norm": 8.871509552001953, "learning_rate": 7.663790038585793e-06, "loss": 0.4141, "step": 192 }, { "epoch": 0.8354978354978355, "grad_norm": 7.508159160614014, "learning_rate": 7.2948793400086315e-06, "loss": 0.4004, "step": 193 }, { "epoch": 0.8398268398268398, "grad_norm": 33.30258560180664, "learning_rate": 6.934368588379553e-06, "loss": 0.3574, "step": 194 }, { "epoch": 0.8441558441558441, "grad_norm": 21.46922492980957, "learning_rate": 6.582328694894729e-06, "loss": 0.4023, "step": 195 }, { "epoch": 0.8484848484848485, "grad_norm": 31.504789352416992, "learning_rate": 6.238828904562316e-06, "loss": 0.377, "step": 196 }, { "epoch": 0.8528138528138528, "grad_norm": 9.979594230651855, "learning_rate": 5.903936782582253e-06, "loss": 0.4668, "step": 197 }, { "epoch": 0.8571428571428571, "grad_norm": 7.933041095733643, "learning_rate": 5.577718201056392e-06, "loss": 0.4102, "step": 198 }, { "epoch": 0.8614718614718615, "grad_norm": 5.786583423614502, "learning_rate": 5.260237326031697e-06, "loss": 0.4414, "step": 199 }, { "epoch": 0.8658008658008658, "grad_norm": 5.894907474517822, "learning_rate": 4.951556604879048e-06, "loss": 0.3848, "step": 200 }, { "epoch": 0.8701298701298701, "grad_norm": 32.59992599487305, "learning_rate": 4.651736754009972e-06, "loss": 0.3672, "step": 201 }, { "epoch": 0.8744588744588745, "grad_norm": 10.215508460998535, "learning_rate": 4.360836746934055e-06, "loss": 0.3496, "step": 202 }, { "epoch": 0.8787878787878788, "grad_norm": 10.185182571411133, "learning_rate": 4.078913802658946e-06, "loss": 0.4062, "step": 203 }, { "epoch": 0.8831168831168831, "grad_norm": 5.9914021492004395, "learning_rate": 3.8060233744356633e-06, "loss": 0.3633, "step": 204 }, { "epoch": 0.8874458874458875, "grad_norm": 18.541730880737305, "learning_rate": 3.542219138851094e-06, "loss": 0.3691, "step": 205 }, { "epoch": 0.8917748917748918, "grad_norm": 5.432623863220215, "learning_rate": 3.2875529852700147e-06, "loss": 0.4297, "step": 206 }, { "epoch": 0.8961038961038961, "grad_norm": 15.46761417388916, "learning_rate": 3.0420750056286195e-06, "loss": 0.3379, "step": 207 }, { "epoch": 0.9004329004329005, "grad_norm": 6.300500392913818, "learning_rate": 2.8058334845816213e-06, "loss": 0.2754, "step": 208 }, { "epoch": 0.9047619047619048, "grad_norm": 8.409278869628906, "learning_rate": 2.5788748900048676e-06, "loss": 0.293, "step": 209 }, { "epoch": 0.9090909090909091, "grad_norm": 12.198765754699707, "learning_rate": 2.361243863855184e-06, "loss": 0.3789, "step": 210 }, { "epoch": 0.9134199134199135, "grad_norm": 6.002485275268555, "learning_rate": 2.152983213389559e-06, "loss": 0.3652, "step": 211 }, { "epoch": 0.9177489177489178, "grad_norm": 5.1724090576171875, "learning_rate": 1.9541339027450256e-06, "loss": 0.3125, "step": 212 }, { "epoch": 0.922077922077922, "grad_norm": 19.717180252075195, "learning_rate": 1.7647350448812106e-06, "loss": 0.3516, "step": 213 }, { "epoch": 0.9264069264069265, "grad_norm": 17.20496368408203, "learning_rate": 1.584823893886933e-06, "loss": 0.3789, "step": 214 }, { "epoch": 0.9307359307359307, "grad_norm": 6.270045280456543, "learning_rate": 1.4144358376524503e-06, "loss": 0.3672, "step": 215 }, { "epoch": 0.935064935064935, "grad_norm": 10.227100372314453, "learning_rate": 1.2536043909088191e-06, "loss": 0.4395, "step": 216 }, { "epoch": 0.9393939393939394, "grad_norm": 8.13603401184082, "learning_rate": 1.10236118863562e-06, "loss": 0.3418, "step": 217 }, { "epoch": 0.9437229437229437, "grad_norm": 7.209720611572266, "learning_rate": 9.607359798384785e-07, "loss": 0.3945, "step": 218 }, { "epoch": 0.948051948051948, "grad_norm": 8.675386428833008, "learning_rate": 8.287566216975795e-07, "loss": 0.3242, "step": 219 }, { "epoch": 0.9523809523809523, "grad_norm": 13.124395370483398, "learning_rate": 7.064490740882057e-07, "loss": 0.3398, "step": 220 }, { "epoch": 0.9567099567099567, "grad_norm": 5.071322917938232, "learning_rate": 5.938373944745612e-07, "loss": 0.2812, "step": 221 }, { "epoch": 0.961038961038961, "grad_norm": 6.736144542694092, "learning_rate": 4.909437331777179e-07, "loss": 0.3906, "step": 222 }, { "epoch": 0.9653679653679653, "grad_norm": 6.963472366333008, "learning_rate": 3.9778832901876675e-07, "loss": 0.3945, "step": 223 }, { "epoch": 0.9696969696969697, "grad_norm": 11.631999015808105, "learning_rate": 3.143895053378698e-07, "loss": 0.418, "step": 224 }, { "epoch": 0.974025974025974, "grad_norm": 6.837718963623047, "learning_rate": 2.407636663901591e-07, "loss": 0.334, "step": 225 }, { "epoch": 0.9783549783549783, "grad_norm": 71.07975006103516, "learning_rate": 1.7692529411904578e-07, "loss": 0.3926, "step": 226 }, { "epoch": 0.9826839826839827, "grad_norm": 9.980896949768066, "learning_rate": 1.228869453076986e-07, "loss": 0.3398, "step": 227 }, { "epoch": 0.987012987012987, "grad_norm": 34.9681282043457, "learning_rate": 7.865924910916977e-08, "loss": 0.4414, "step": 228 }, { "epoch": 0.9913419913419913, "grad_norm": 23.342512130737305, "learning_rate": 4.4250904955656095e-08, "loss": 0.4336, "step": 229 }, { "epoch": 0.9956709956709957, "grad_norm": 15.746844291687012, "learning_rate": 1.9668680847356735e-08, "loss": 0.375, "step": 230 }, { "epoch": 1.0, "grad_norm": 19.062803268432617, "learning_rate": 4.917412021249179e-09, "loss": 0.3711, "step": 231 } ], "logging_steps": 1.0, "max_steps": 231, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 116, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.922842866902368e+18, "train_batch_size": 64, "trial_name": null, "trial_params": null }