{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.19515368352577656, "eval_steps": 500, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003252561392096276, "grad_norm": 7.644000053405762, "learning_rate": 5e-05, "loss": 3.7348, "step": 1 }, { "epoch": 0.0006505122784192552, "grad_norm": 6.572476387023926, "learning_rate": 5e-05, "loss": 3.5202, "step": 2 }, { "epoch": 0.0009757684176288828, "grad_norm": 5.915395736694336, "learning_rate": 5e-05, "loss": 3.2423, "step": 3 }, { "epoch": 0.0013010245568385104, "grad_norm": 5.39382266998291, "learning_rate": 5e-05, "loss": 3.2308, "step": 4 }, { "epoch": 0.0016262806960481379, "grad_norm": 5.474543571472168, "learning_rate": 5e-05, "loss": 3.3753, "step": 5 }, { "epoch": 0.0019515368352577655, "grad_norm": 5.402736663818359, "learning_rate": 5e-05, "loss": 3.3042, "step": 6 }, { "epoch": 0.002276792974467393, "grad_norm": 5.727195739746094, "learning_rate": 5e-05, "loss": 3.2722, "step": 7 }, { "epoch": 0.002602049113677021, "grad_norm": 5.997256755828857, "learning_rate": 5e-05, "loss": 3.2708, "step": 8 }, { "epoch": 0.002927305252886648, "grad_norm": 8.144789695739746, "learning_rate": 5e-05, "loss": 3.4531, "step": 9 }, { "epoch": 0.0032525613920962758, "grad_norm": 11.264220237731934, "learning_rate": 5e-05, "loss": 3.3755, "step": 10 }, { "epoch": 0.0035778175313059034, "grad_norm": 10.535292625427246, "learning_rate": 5e-05, "loss": 3.5809, "step": 11 }, { "epoch": 0.003903073670515531, "grad_norm": 8.04301929473877, "learning_rate": 5e-05, "loss": 3.3787, "step": 12 }, { "epoch": 0.004228329809725159, "grad_norm": 9.441449165344238, "learning_rate": 5e-05, "loss": 3.4919, "step": 13 }, { "epoch": 0.004553585948934786, "grad_norm": 9.322367668151855, "learning_rate": 5e-05, "loss": 3.6405, "step": 14 }, { "epoch": 0.004878842088144414, "grad_norm": 10.059698104858398, "learning_rate": 5e-05, "loss": 3.425, "step": 15 }, { "epoch": 0.005204098227354042, "grad_norm": 11.285538673400879, "learning_rate": 5e-05, "loss": 3.6643, "step": 16 }, { "epoch": 0.0055293543665636685, "grad_norm": 10.433178901672363, "learning_rate": 5e-05, "loss": 3.4485, "step": 17 }, { "epoch": 0.005854610505773296, "grad_norm": 11.724845886230469, "learning_rate": 5e-05, "loss": 3.8379, "step": 18 }, { "epoch": 0.006179866644982924, "grad_norm": 11.558403968811035, "learning_rate": 5e-05, "loss": 3.8067, "step": 19 }, { "epoch": 0.0065051227841925515, "grad_norm": 9.274937629699707, "learning_rate": 5e-05, "loss": 3.5213, "step": 20 }, { "epoch": 0.006830378923402179, "grad_norm": 11.487302780151367, "learning_rate": 5e-05, "loss": 3.8464, "step": 21 }, { "epoch": 0.007155635062611807, "grad_norm": 11.910959243774414, "learning_rate": 5e-05, "loss": 3.7159, "step": 22 }, { "epoch": 0.0074808912018214345, "grad_norm": 10.441876411437988, "learning_rate": 5e-05, "loss": 4.3052, "step": 23 }, { "epoch": 0.007806147341031062, "grad_norm": 11.492648124694824, "learning_rate": 5e-05, "loss": 4.0298, "step": 24 }, { "epoch": 0.00813140348024069, "grad_norm": 11.218420028686523, "learning_rate": 5e-05, "loss": 3.9014, "step": 25 }, { "epoch": 0.008456659619450317, "grad_norm": 9.615971565246582, "learning_rate": 5e-05, "loss": 3.8972, "step": 26 }, { "epoch": 0.008781915758659945, "grad_norm": 9.325116157531738, "learning_rate": 5e-05, "loss": 4.0339, "step": 27 }, { "epoch": 0.009107171897869573, "grad_norm": 10.537034034729004, "learning_rate": 5e-05, "loss": 4.3475, "step": 28 }, { "epoch": 0.0094324280370792, "grad_norm": 9.163812637329102, "learning_rate": 5e-05, "loss": 4.4702, "step": 29 }, { "epoch": 0.009757684176288828, "grad_norm": 8.232535362243652, "learning_rate": 5e-05, "loss": 4.5234, "step": 30 }, { "epoch": 0.010082940315498456, "grad_norm": 7.85621976852417, "learning_rate": 5e-05, "loss": 4.665, "step": 31 }, { "epoch": 0.010408196454708083, "grad_norm": 7.693171501159668, "learning_rate": 5e-05, "loss": 4.5524, "step": 32 }, { "epoch": 0.01073345259391771, "grad_norm": 6.418099403381348, "learning_rate": 5e-05, "loss": 4.3471, "step": 33 }, { "epoch": 0.011058708733127337, "grad_norm": 9.231449127197266, "learning_rate": 5e-05, "loss": 4.5491, "step": 34 }, { "epoch": 0.011383964872336965, "grad_norm": 8.351770401000977, "learning_rate": 5e-05, "loss": 4.6851, "step": 35 }, { "epoch": 0.011709221011546592, "grad_norm": 8.412186622619629, "learning_rate": 5e-05, "loss": 4.4453, "step": 36 }, { "epoch": 0.01203447715075622, "grad_norm": 7.4988932609558105, "learning_rate": 5e-05, "loss": 4.2595, "step": 37 }, { "epoch": 0.012359733289965848, "grad_norm": 6.0955424308776855, "learning_rate": 5e-05, "loss": 4.1402, "step": 38 }, { "epoch": 0.012684989429175475, "grad_norm": 5.853550434112549, "learning_rate": 5e-05, "loss": 4.2133, "step": 39 }, { "epoch": 0.013010245568385103, "grad_norm": 6.071702480316162, "learning_rate": 5e-05, "loss": 3.9406, "step": 40 }, { "epoch": 0.01333550170759473, "grad_norm": 5.181666374206543, "learning_rate": 5e-05, "loss": 3.9559, "step": 41 }, { "epoch": 0.013660757846804358, "grad_norm": 5.067506313323975, "learning_rate": 5e-05, "loss": 4.0526, "step": 42 }, { "epoch": 0.013986013986013986, "grad_norm": 4.593716621398926, "learning_rate": 5e-05, "loss": 4.0814, "step": 43 }, { "epoch": 0.014311270125223614, "grad_norm": 5.379826545715332, "learning_rate": 5e-05, "loss": 4.13, "step": 44 }, { "epoch": 0.014636526264433241, "grad_norm": 6.113279819488525, "learning_rate": 5e-05, "loss": 4.0842, "step": 45 }, { "epoch": 0.014961782403642869, "grad_norm": 4.885502815246582, "learning_rate": 5e-05, "loss": 4.147, "step": 46 }, { "epoch": 0.015287038542852497, "grad_norm": 5.065276622772217, "learning_rate": 5e-05, "loss": 4.0597, "step": 47 }, { "epoch": 0.015612294682062124, "grad_norm": 5.306569576263428, "learning_rate": 5e-05, "loss": 4.4288, "step": 48 }, { "epoch": 0.01593755082127175, "grad_norm": 6.738716125488281, "learning_rate": 5e-05, "loss": 4.2153, "step": 49 }, { "epoch": 0.01626280696048138, "grad_norm": 7.264622211456299, "learning_rate": 5e-05, "loss": 4.3567, "step": 50 }, { "epoch": 0.016588063099691006, "grad_norm": 4.799393177032471, "learning_rate": 5e-05, "loss": 4.1491, "step": 51 }, { "epoch": 0.016913319238900635, "grad_norm": 4.5021071434021, "learning_rate": 5e-05, "loss": 4.0725, "step": 52 }, { "epoch": 0.01723857537811026, "grad_norm": 5.524833679199219, "learning_rate": 5e-05, "loss": 4.405, "step": 53 }, { "epoch": 0.01756383151731989, "grad_norm": 4.327210426330566, "learning_rate": 5e-05, "loss": 4.198, "step": 54 }, { "epoch": 0.017889087656529516, "grad_norm": 4.141977787017822, "learning_rate": 5e-05, "loss": 4.0445, "step": 55 }, { "epoch": 0.018214343795739146, "grad_norm": 4.746036529541016, "learning_rate": 5e-05, "loss": 4.5174, "step": 56 }, { "epoch": 0.01853959993494877, "grad_norm": 5.5715837478637695, "learning_rate": 5e-05, "loss": 4.2292, "step": 57 }, { "epoch": 0.0188648560741584, "grad_norm": 5.887129306793213, "learning_rate": 5e-05, "loss": 4.0795, "step": 58 }, { "epoch": 0.019190112213368027, "grad_norm": 4.8050150871276855, "learning_rate": 5e-05, "loss": 4.1816, "step": 59 }, { "epoch": 0.019515368352577656, "grad_norm": 4.179840564727783, "learning_rate": 5e-05, "loss": 4.337, "step": 60 }, { "epoch": 0.019840624491787282, "grad_norm": 5.042575359344482, "learning_rate": 5e-05, "loss": 4.1466, "step": 61 }, { "epoch": 0.02016588063099691, "grad_norm": 4.339786052703857, "learning_rate": 5e-05, "loss": 3.7189, "step": 62 }, { "epoch": 0.020491136770206538, "grad_norm": 8.27956771850586, "learning_rate": 5e-05, "loss": 4.1767, "step": 63 }, { "epoch": 0.020816392909416167, "grad_norm": 6.273040294647217, "learning_rate": 5e-05, "loss": 4.363, "step": 64 }, { "epoch": 0.021141649048625793, "grad_norm": 7.342176914215088, "learning_rate": 5e-05, "loss": 4.3939, "step": 65 }, { "epoch": 0.02146690518783542, "grad_norm": 6.060370445251465, "learning_rate": 5e-05, "loss": 4.1033, "step": 66 }, { "epoch": 0.021792161327045048, "grad_norm": 6.683494567871094, "learning_rate": 5e-05, "loss": 3.9883, "step": 67 }, { "epoch": 0.022117417466254674, "grad_norm": 6.418432712554932, "learning_rate": 5e-05, "loss": 4.1472, "step": 68 }, { "epoch": 0.022442673605464303, "grad_norm": 6.342174053192139, "learning_rate": 5e-05, "loss": 4.414, "step": 69 }, { "epoch": 0.02276792974467393, "grad_norm": 10.055042266845703, "learning_rate": 5e-05, "loss": 4.1925, "step": 70 }, { "epoch": 0.02309318588388356, "grad_norm": 6.6624932289123535, "learning_rate": 5e-05, "loss": 4.1111, "step": 71 }, { "epoch": 0.023418442023093185, "grad_norm": 8.38736343383789, "learning_rate": 5e-05, "loss": 4.5028, "step": 72 }, { "epoch": 0.023743698162302814, "grad_norm": 8.479351997375488, "learning_rate": 5e-05, "loss": 4.2832, "step": 73 }, { "epoch": 0.02406895430151244, "grad_norm": 8.613444328308105, "learning_rate": 5e-05, "loss": 4.5189, "step": 74 }, { "epoch": 0.02439421044072207, "grad_norm": 6.932406425476074, "learning_rate": 5e-05, "loss": 4.1228, "step": 75 }, { "epoch": 0.024719466579931695, "grad_norm": 5.989908695220947, "learning_rate": 5e-05, "loss": 4.0289, "step": 76 }, { "epoch": 0.025044722719141325, "grad_norm": 5.118892192840576, "learning_rate": 5e-05, "loss": 3.8781, "step": 77 }, { "epoch": 0.02536997885835095, "grad_norm": 5.232855796813965, "learning_rate": 5e-05, "loss": 4.2731, "step": 78 }, { "epoch": 0.02569523499756058, "grad_norm": 4.9437103271484375, "learning_rate": 5e-05, "loss": 3.1985, "step": 79 }, { "epoch": 0.026020491136770206, "grad_norm": 7.929747581481934, "learning_rate": 5e-05, "loss": 4.1925, "step": 80 }, { "epoch": 0.026345747275979835, "grad_norm": 9.323763847351074, "learning_rate": 5e-05, "loss": 4.2925, "step": 81 }, { "epoch": 0.02667100341518946, "grad_norm": 6.18411111831665, "learning_rate": 5e-05, "loss": 4.0956, "step": 82 }, { "epoch": 0.02699625955439909, "grad_norm": 4.424221038818359, "learning_rate": 5e-05, "loss": 4.1803, "step": 83 }, { "epoch": 0.027321515693608717, "grad_norm": 6.044656276702881, "learning_rate": 5e-05, "loss": 4.0551, "step": 84 }, { "epoch": 0.027646771832818346, "grad_norm": 9.380243301391602, "learning_rate": 5e-05, "loss": 4.2753, "step": 85 }, { "epoch": 0.027972027972027972, "grad_norm": 10.203062057495117, "learning_rate": 5e-05, "loss": 4.5481, "step": 86 }, { "epoch": 0.028297284111237598, "grad_norm": 12.21309757232666, "learning_rate": 5e-05, "loss": 4.2921, "step": 87 }, { "epoch": 0.028622540250447227, "grad_norm": 7.666494369506836, "learning_rate": 5e-05, "loss": 4.2344, "step": 88 }, { "epoch": 0.028947796389656853, "grad_norm": 5.388767719268799, "learning_rate": 5e-05, "loss": 4.3914, "step": 89 }, { "epoch": 0.029273052528866483, "grad_norm": 7.598433971405029, "learning_rate": 5e-05, "loss": 4.242, "step": 90 }, { "epoch": 0.02959830866807611, "grad_norm": 11.387866020202637, "learning_rate": 5e-05, "loss": 4.0014, "step": 91 }, { "epoch": 0.029923564807285738, "grad_norm": 10.232786178588867, "learning_rate": 5e-05, "loss": 4.025, "step": 92 }, { "epoch": 0.030248820946495364, "grad_norm": 9.431289672851562, "learning_rate": 5e-05, "loss": 4.2743, "step": 93 }, { "epoch": 0.030574077085704993, "grad_norm": 6.723931789398193, "learning_rate": 5e-05, "loss": 4.111, "step": 94 }, { "epoch": 0.03089933322491462, "grad_norm": 5.171010971069336, "learning_rate": 5e-05, "loss": 3.8832, "step": 95 }, { "epoch": 0.03122458936412425, "grad_norm": 7.667996883392334, "learning_rate": 5e-05, "loss": 4.1839, "step": 96 }, { "epoch": 0.031549845503333875, "grad_norm": 9.853692054748535, "learning_rate": 5e-05, "loss": 4.2596, "step": 97 }, { "epoch": 0.0318751016425435, "grad_norm": 9.22080135345459, "learning_rate": 5e-05, "loss": 3.7551, "step": 98 }, { "epoch": 0.03220035778175313, "grad_norm": 8.230822563171387, "learning_rate": 5e-05, "loss": 4.1223, "step": 99 }, { "epoch": 0.03252561392096276, "grad_norm": 4.844912052154541, "learning_rate": 5e-05, "loss": 3.9264, "step": 100 }, { "epoch": 0.032850870060172385, "grad_norm": 5.341676712036133, "learning_rate": 5e-05, "loss": 4.2083, "step": 101 }, { "epoch": 0.03317612619938201, "grad_norm": 10.346305847167969, "learning_rate": 5e-05, "loss": 4.6745, "step": 102 }, { "epoch": 0.033501382338591644, "grad_norm": 5.822200298309326, "learning_rate": 5e-05, "loss": 3.9329, "step": 103 }, { "epoch": 0.03382663847780127, "grad_norm": 4.412308692932129, "learning_rate": 5e-05, "loss": 4.2581, "step": 104 }, { "epoch": 0.034151894617010896, "grad_norm": 6.643288612365723, "learning_rate": 5e-05, "loss": 4.0638, "step": 105 }, { "epoch": 0.03447715075622052, "grad_norm": 4.771259784698486, "learning_rate": 5e-05, "loss": 3.9147, "step": 106 }, { "epoch": 0.034802406895430155, "grad_norm": 4.471869945526123, "learning_rate": 5e-05, "loss": 3.8011, "step": 107 }, { "epoch": 0.03512766303463978, "grad_norm": 6.949775218963623, "learning_rate": 5e-05, "loss": 4.3229, "step": 108 }, { "epoch": 0.035452919173849406, "grad_norm": 5.095446586608887, "learning_rate": 5e-05, "loss": 3.616, "step": 109 }, { "epoch": 0.03577817531305903, "grad_norm": 6.592041015625, "learning_rate": 5e-05, "loss": 3.7697, "step": 110 }, { "epoch": 0.03610343145226866, "grad_norm": 7.455766677856445, "learning_rate": 5e-05, "loss": 4.0375, "step": 111 }, { "epoch": 0.03642868759147829, "grad_norm": 4.540219306945801, "learning_rate": 5e-05, "loss": 3.7951, "step": 112 }, { "epoch": 0.03675394373068792, "grad_norm": 5.230220794677734, "learning_rate": 5e-05, "loss": 4.0304, "step": 113 }, { "epoch": 0.03707919986989754, "grad_norm": 5.179874420166016, "learning_rate": 5e-05, "loss": 4.0191, "step": 114 }, { "epoch": 0.03740445600910717, "grad_norm": 6.374222755432129, "learning_rate": 5e-05, "loss": 4.4934, "step": 115 }, { "epoch": 0.0377297121483168, "grad_norm": 6.96058988571167, "learning_rate": 5e-05, "loss": 4.1446, "step": 116 }, { "epoch": 0.03805496828752643, "grad_norm": 6.299279689788818, "learning_rate": 5e-05, "loss": 3.9077, "step": 117 }, { "epoch": 0.038380224426736054, "grad_norm": 6.169437408447266, "learning_rate": 5e-05, "loss": 4.1457, "step": 118 }, { "epoch": 0.03870548056594568, "grad_norm": 5.159611701965332, "learning_rate": 5e-05, "loss": 3.7067, "step": 119 }, { "epoch": 0.03903073670515531, "grad_norm": 6.676630973815918, "learning_rate": 5e-05, "loss": 4.0493, "step": 120 }, { "epoch": 0.03935599284436494, "grad_norm": 6.488524436950684, "learning_rate": 5e-05, "loss": 4.109, "step": 121 }, { "epoch": 0.039681248983574564, "grad_norm": 6.670077800750732, "learning_rate": 5e-05, "loss": 3.6645, "step": 122 }, { "epoch": 0.04000650512278419, "grad_norm": 6.173693656921387, "learning_rate": 5e-05, "loss": 4.4163, "step": 123 }, { "epoch": 0.04033176126199382, "grad_norm": 6.306183815002441, "learning_rate": 5e-05, "loss": 3.9382, "step": 124 }, { "epoch": 0.04065701740120345, "grad_norm": 6.007297039031982, "learning_rate": 5e-05, "loss": 3.956, "step": 125 }, { "epoch": 0.040982273540413075, "grad_norm": 6.0243730545043945, "learning_rate": 5e-05, "loss": 3.9613, "step": 126 }, { "epoch": 0.0413075296796227, "grad_norm": 5.6909871101379395, "learning_rate": 5e-05, "loss": 3.9213, "step": 127 }, { "epoch": 0.041632785818832334, "grad_norm": 5.5652265548706055, "learning_rate": 5e-05, "loss": 3.9325, "step": 128 }, { "epoch": 0.04195804195804196, "grad_norm": 7.6173272132873535, "learning_rate": 5e-05, "loss": 4.0422, "step": 129 }, { "epoch": 0.042283298097251586, "grad_norm": 10.900376319885254, "learning_rate": 5e-05, "loss": 3.9173, "step": 130 }, { "epoch": 0.04260855423646121, "grad_norm": 12.899847984313965, "learning_rate": 5e-05, "loss": 4.0328, "step": 131 }, { "epoch": 0.04293381037567084, "grad_norm": 11.928502082824707, "learning_rate": 5e-05, "loss": 3.9763, "step": 132 }, { "epoch": 0.04325906651488047, "grad_norm": 8.4597749710083, "learning_rate": 5e-05, "loss": 4.0895, "step": 133 }, { "epoch": 0.043584322654090096, "grad_norm": 5.162694931030273, "learning_rate": 5e-05, "loss": 4.1564, "step": 134 }, { "epoch": 0.04390957879329972, "grad_norm": 13.066299438476562, "learning_rate": 5e-05, "loss": 4.1774, "step": 135 }, { "epoch": 0.04423483493250935, "grad_norm": 14.013510704040527, "learning_rate": 5e-05, "loss": 3.5203, "step": 136 }, { "epoch": 0.04456009107171898, "grad_norm": 15.885542869567871, "learning_rate": 5e-05, "loss": 4.1405, "step": 137 }, { "epoch": 0.04488534721092861, "grad_norm": 7.15226411819458, "learning_rate": 5e-05, "loss": 4.043, "step": 138 }, { "epoch": 0.04521060335013823, "grad_norm": 5.1085686683654785, "learning_rate": 5e-05, "loss": 3.7764, "step": 139 }, { "epoch": 0.04553585948934786, "grad_norm": 6.9343390464782715, "learning_rate": 5e-05, "loss": 4.2056, "step": 140 }, { "epoch": 0.04586111562855749, "grad_norm": 11.624869346618652, "learning_rate": 5e-05, "loss": 3.7097, "step": 141 }, { "epoch": 0.04618637176776712, "grad_norm": 12.546487808227539, "learning_rate": 5e-05, "loss": 4.2067, "step": 142 }, { "epoch": 0.046511627906976744, "grad_norm": 5.573001861572266, "learning_rate": 5e-05, "loss": 3.6916, "step": 143 }, { "epoch": 0.04683688404618637, "grad_norm": 6.0869140625, "learning_rate": 5e-05, "loss": 4.1239, "step": 144 }, { "epoch": 0.047162140185396, "grad_norm": 7.625162124633789, "learning_rate": 5e-05, "loss": 3.9154, "step": 145 }, { "epoch": 0.04748739632460563, "grad_norm": 5.8476762771606445, "learning_rate": 5e-05, "loss": 3.9673, "step": 146 }, { "epoch": 0.047812652463815254, "grad_norm": 6.097865104675293, "learning_rate": 5e-05, "loss": 3.994, "step": 147 }, { "epoch": 0.04813790860302488, "grad_norm": 5.200497150421143, "learning_rate": 5e-05, "loss": 4.1332, "step": 148 }, { "epoch": 0.04846316474223451, "grad_norm": 8.92606258392334, "learning_rate": 5e-05, "loss": 4.1005, "step": 149 }, { "epoch": 0.04878842088144414, "grad_norm": 5.433960437774658, "learning_rate": 5e-05, "loss": 4.099, "step": 150 }, { "epoch": 0.049113677020653765, "grad_norm": 4.350966453552246, "learning_rate": 5e-05, "loss": 3.7492, "step": 151 }, { "epoch": 0.04943893315986339, "grad_norm": 8.3677978515625, "learning_rate": 5e-05, "loss": 3.8048, "step": 152 }, { "epoch": 0.04976418929907302, "grad_norm": 10.74728012084961, "learning_rate": 5e-05, "loss": 4.3122, "step": 153 }, { "epoch": 0.05008944543828265, "grad_norm": 8.89576530456543, "learning_rate": 5e-05, "loss": 4.432, "step": 154 }, { "epoch": 0.050414701577492275, "grad_norm": 6.710874080657959, "learning_rate": 5e-05, "loss": 3.9353, "step": 155 }, { "epoch": 0.0507399577167019, "grad_norm": 10.725092887878418, "learning_rate": 5e-05, "loss": 4.2776, "step": 156 }, { "epoch": 0.05106521385591153, "grad_norm": 11.6733980178833, "learning_rate": 5e-05, "loss": 4.1354, "step": 157 }, { "epoch": 0.05139046999512116, "grad_norm": 9.846784591674805, "learning_rate": 5e-05, "loss": 3.9646, "step": 158 }, { "epoch": 0.051715726134330786, "grad_norm": 8.646893501281738, "learning_rate": 5e-05, "loss": 3.9413, "step": 159 }, { "epoch": 0.05204098227354041, "grad_norm": 7.0959062576293945, "learning_rate": 5e-05, "loss": 4.0745, "step": 160 }, { "epoch": 0.05236623841275004, "grad_norm": 7.378256797790527, "learning_rate": 5e-05, "loss": 4.1712, "step": 161 }, { "epoch": 0.05269149455195967, "grad_norm": 7.3625640869140625, "learning_rate": 5e-05, "loss": 3.9466, "step": 162 }, { "epoch": 0.0530167506911693, "grad_norm": 4.918821811676025, "learning_rate": 5e-05, "loss": 3.8243, "step": 163 }, { "epoch": 0.05334200683037892, "grad_norm": 6.653145790100098, "learning_rate": 5e-05, "loss": 3.8895, "step": 164 }, { "epoch": 0.05366726296958855, "grad_norm": 8.699049949645996, "learning_rate": 5e-05, "loss": 4.1705, "step": 165 }, { "epoch": 0.05399251910879818, "grad_norm": 7.855594158172607, "learning_rate": 5e-05, "loss": 3.6343, "step": 166 }, { "epoch": 0.05431777524800781, "grad_norm": 5.597055435180664, "learning_rate": 5e-05, "loss": 3.7043, "step": 167 }, { "epoch": 0.05464303138721743, "grad_norm": 9.524121284484863, "learning_rate": 5e-05, "loss": 3.8942, "step": 168 }, { "epoch": 0.05496828752642706, "grad_norm": 5.389048099517822, "learning_rate": 5e-05, "loss": 3.4256, "step": 169 }, { "epoch": 0.05529354366563669, "grad_norm": 5.568866729736328, "learning_rate": 5e-05, "loss": 3.9784, "step": 170 }, { "epoch": 0.05561879980484632, "grad_norm": 12.199308395385742, "learning_rate": 5e-05, "loss": 3.7894, "step": 171 }, { "epoch": 0.055944055944055944, "grad_norm": 15.115793228149414, "learning_rate": 5e-05, "loss": 3.6162, "step": 172 }, { "epoch": 0.05626931208326557, "grad_norm": 5.6520538330078125, "learning_rate": 5e-05, "loss": 3.8633, "step": 173 }, { "epoch": 0.056594568222475196, "grad_norm": 5.969440937042236, "learning_rate": 5e-05, "loss": 4.0392, "step": 174 }, { "epoch": 0.05691982436168483, "grad_norm": 8.82860279083252, "learning_rate": 5e-05, "loss": 4.1596, "step": 175 }, { "epoch": 0.057245080500894455, "grad_norm": 8.133511543273926, "learning_rate": 5e-05, "loss": 4.3075, "step": 176 }, { "epoch": 0.05757033664010408, "grad_norm": 5.794802665710449, "learning_rate": 5e-05, "loss": 3.7082, "step": 177 }, { "epoch": 0.057895592779313707, "grad_norm": 6.0018744468688965, "learning_rate": 5e-05, "loss": 3.67, "step": 178 }, { "epoch": 0.05822084891852334, "grad_norm": 9.123400688171387, "learning_rate": 5e-05, "loss": 3.6293, "step": 179 }, { "epoch": 0.058546105057732965, "grad_norm": 12.262410163879395, "learning_rate": 5e-05, "loss": 4.0337, "step": 180 }, { "epoch": 0.05887136119694259, "grad_norm": 5.367374897003174, "learning_rate": 5e-05, "loss": 3.7243, "step": 181 }, { "epoch": 0.05919661733615222, "grad_norm": 5.942975997924805, "learning_rate": 5e-05, "loss": 3.691, "step": 182 }, { "epoch": 0.05952187347536185, "grad_norm": 5.772192001342773, "learning_rate": 5e-05, "loss": 3.8556, "step": 183 }, { "epoch": 0.059847129614571476, "grad_norm": 6.091885566711426, "learning_rate": 5e-05, "loss": 3.8396, "step": 184 }, { "epoch": 0.0601723857537811, "grad_norm": 6.4458231925964355, "learning_rate": 5e-05, "loss": 4.3947, "step": 185 }, { "epoch": 0.06049764189299073, "grad_norm": 6.378884315490723, "learning_rate": 5e-05, "loss": 4.375, "step": 186 }, { "epoch": 0.06082289803220036, "grad_norm": 7.179290294647217, "learning_rate": 5e-05, "loss": 3.6237, "step": 187 }, { "epoch": 0.06114815417140999, "grad_norm": 5.786200046539307, "learning_rate": 5e-05, "loss": 4.2066, "step": 188 }, { "epoch": 0.06147341031061961, "grad_norm": 5.186939239501953, "learning_rate": 5e-05, "loss": 3.9935, "step": 189 }, { "epoch": 0.06179866644982924, "grad_norm": 4.02333402633667, "learning_rate": 5e-05, "loss": 3.6273, "step": 190 }, { "epoch": 0.06212392258903887, "grad_norm": 6.622715473175049, "learning_rate": 5e-05, "loss": 4.2028, "step": 191 }, { "epoch": 0.0624491787282485, "grad_norm": 9.464071273803711, "learning_rate": 5e-05, "loss": 4.0506, "step": 192 }, { "epoch": 0.06277443486745812, "grad_norm": 6.995242595672607, "learning_rate": 5e-05, "loss": 4.1047, "step": 193 }, { "epoch": 0.06309969100666775, "grad_norm": 9.435445785522461, "learning_rate": 5e-05, "loss": 3.8718, "step": 194 }, { "epoch": 0.06342494714587738, "grad_norm": 7.273919582366943, "learning_rate": 5e-05, "loss": 3.9176, "step": 195 }, { "epoch": 0.063750203285087, "grad_norm": 6.896090030670166, "learning_rate": 5e-05, "loss": 3.8408, "step": 196 }, { "epoch": 0.06407545942429663, "grad_norm": 7.282253265380859, "learning_rate": 5e-05, "loss": 3.6903, "step": 197 }, { "epoch": 0.06440071556350627, "grad_norm": 9.39031982421875, "learning_rate": 5e-05, "loss": 3.9139, "step": 198 }, { "epoch": 0.06472597170271589, "grad_norm": 7.485379695892334, "learning_rate": 5e-05, "loss": 3.8522, "step": 199 }, { "epoch": 0.06505122784192552, "grad_norm": 7.848803997039795, "learning_rate": 5e-05, "loss": 3.879, "step": 200 }, { "epoch": 0.06537648398113514, "grad_norm": 7.829058647155762, "learning_rate": 5e-05, "loss": 4.0283, "step": 201 }, { "epoch": 0.06570174012034477, "grad_norm": 8.984028816223145, "learning_rate": 5e-05, "loss": 3.8663, "step": 202 }, { "epoch": 0.0660269962595544, "grad_norm": 7.604732513427734, "learning_rate": 5e-05, "loss": 3.7871, "step": 203 }, { "epoch": 0.06635225239876402, "grad_norm": 6.779748916625977, "learning_rate": 5e-05, "loss": 3.9423, "step": 204 }, { "epoch": 0.06667750853797365, "grad_norm": 8.93659782409668, "learning_rate": 5e-05, "loss": 4.0208, "step": 205 }, { "epoch": 0.06700276467718329, "grad_norm": 6.093626022338867, "learning_rate": 5e-05, "loss": 3.8828, "step": 206 }, { "epoch": 0.06732802081639291, "grad_norm": 6.105995178222656, "learning_rate": 5e-05, "loss": 4.0983, "step": 207 }, { "epoch": 0.06765327695560254, "grad_norm": 7.379316329956055, "learning_rate": 5e-05, "loss": 4.0493, "step": 208 }, { "epoch": 0.06797853309481217, "grad_norm": 6.404873847961426, "learning_rate": 5e-05, "loss": 3.9271, "step": 209 }, { "epoch": 0.06830378923402179, "grad_norm": 7.560967445373535, "learning_rate": 5e-05, "loss": 3.8633, "step": 210 }, { "epoch": 0.06862904537323142, "grad_norm": 6.042522430419922, "learning_rate": 5e-05, "loss": 3.7389, "step": 211 }, { "epoch": 0.06895430151244104, "grad_norm": 6.4881367683410645, "learning_rate": 5e-05, "loss": 3.8196, "step": 212 }, { "epoch": 0.06927955765165067, "grad_norm": 6.52613639831543, "learning_rate": 5e-05, "loss": 3.7737, "step": 213 }, { "epoch": 0.06960481379086031, "grad_norm": 4.999444007873535, "learning_rate": 5e-05, "loss": 3.8722, "step": 214 }, { "epoch": 0.06993006993006994, "grad_norm": 7.060845851898193, "learning_rate": 5e-05, "loss": 3.774, "step": 215 }, { "epoch": 0.07025532606927956, "grad_norm": 8.545415878295898, "learning_rate": 5e-05, "loss": 3.8479, "step": 216 }, { "epoch": 0.07058058220848919, "grad_norm": 7.625663757324219, "learning_rate": 5e-05, "loss": 3.5068, "step": 217 }, { "epoch": 0.07090583834769881, "grad_norm": 7.191437244415283, "learning_rate": 5e-05, "loss": 3.7885, "step": 218 }, { "epoch": 0.07123109448690844, "grad_norm": 7.386499881744385, "learning_rate": 5e-05, "loss": 3.9494, "step": 219 }, { "epoch": 0.07155635062611806, "grad_norm": 5.616601943969727, "learning_rate": 5e-05, "loss": 3.8093, "step": 220 }, { "epoch": 0.07188160676532769, "grad_norm": 6.17822265625, "learning_rate": 5e-05, "loss": 3.8981, "step": 221 }, { "epoch": 0.07220686290453732, "grad_norm": 6.138426303863525, "learning_rate": 5e-05, "loss": 4.0091, "step": 222 }, { "epoch": 0.07253211904374696, "grad_norm": 6.6297831535339355, "learning_rate": 5e-05, "loss": 3.9291, "step": 223 }, { "epoch": 0.07285737518295658, "grad_norm": 6.557385444641113, "learning_rate": 5e-05, "loss": 3.8338, "step": 224 }, { "epoch": 0.07318263132216621, "grad_norm": 6.9579291343688965, "learning_rate": 5e-05, "loss": 3.8158, "step": 225 }, { "epoch": 0.07350788746137583, "grad_norm": 7.129207611083984, "learning_rate": 5e-05, "loss": 4.1731, "step": 226 }, { "epoch": 0.07383314360058546, "grad_norm": 6.645360946655273, "learning_rate": 5e-05, "loss": 3.6926, "step": 227 }, { "epoch": 0.07415839973979509, "grad_norm": 8.101895332336426, "learning_rate": 5e-05, "loss": 4.3064, "step": 228 }, { "epoch": 0.07448365587900471, "grad_norm": 7.812802791595459, "learning_rate": 5e-05, "loss": 3.7983, "step": 229 }, { "epoch": 0.07480891201821434, "grad_norm": 7.278988838195801, "learning_rate": 5e-05, "loss": 3.9539, "step": 230 }, { "epoch": 0.07513416815742398, "grad_norm": 7.909803867340088, "learning_rate": 5e-05, "loss": 3.5888, "step": 231 }, { "epoch": 0.0754594242966336, "grad_norm": 5.668457984924316, "learning_rate": 5e-05, "loss": 3.8219, "step": 232 }, { "epoch": 0.07578468043584323, "grad_norm": 6.159639358520508, "learning_rate": 5e-05, "loss": 4.0184, "step": 233 }, { "epoch": 0.07610993657505286, "grad_norm": 6.18869161605835, "learning_rate": 5e-05, "loss": 3.7595, "step": 234 }, { "epoch": 0.07643519271426248, "grad_norm": 5.471868991851807, "learning_rate": 5e-05, "loss": 3.9409, "step": 235 }, { "epoch": 0.07676044885347211, "grad_norm": 7.921130180358887, "learning_rate": 5e-05, "loss": 4.1452, "step": 236 }, { "epoch": 0.07708570499268173, "grad_norm": 6.49941349029541, "learning_rate": 5e-05, "loss": 3.8261, "step": 237 }, { "epoch": 0.07741096113189136, "grad_norm": 10.190372467041016, "learning_rate": 5e-05, "loss": 3.6134, "step": 238 }, { "epoch": 0.07773621727110099, "grad_norm": 5.995229244232178, "learning_rate": 5e-05, "loss": 3.7563, "step": 239 }, { "epoch": 0.07806147341031063, "grad_norm": 8.94497299194336, "learning_rate": 5e-05, "loss": 3.5379, "step": 240 }, { "epoch": 0.07838672954952025, "grad_norm": 10.990089416503906, "learning_rate": 5e-05, "loss": 3.9176, "step": 241 }, { "epoch": 0.07871198568872988, "grad_norm": 7.899653434753418, "learning_rate": 5e-05, "loss": 3.7961, "step": 242 }, { "epoch": 0.0790372418279395, "grad_norm": 7.264082908630371, "learning_rate": 5e-05, "loss": 3.5957, "step": 243 }, { "epoch": 0.07936249796714913, "grad_norm": 5.855433940887451, "learning_rate": 5e-05, "loss": 3.5913, "step": 244 }, { "epoch": 0.07968775410635875, "grad_norm": 6.854794979095459, "learning_rate": 5e-05, "loss": 3.6167, "step": 245 }, { "epoch": 0.08001301024556838, "grad_norm": 7.06243896484375, "learning_rate": 5e-05, "loss": 3.7909, "step": 246 }, { "epoch": 0.080338266384778, "grad_norm": 8.033863067626953, "learning_rate": 5e-05, "loss": 4.046, "step": 247 }, { "epoch": 0.08066352252398765, "grad_norm": 6.078402519226074, "learning_rate": 5e-05, "loss": 3.3269, "step": 248 }, { "epoch": 0.08098877866319727, "grad_norm": 9.511942863464355, "learning_rate": 5e-05, "loss": 3.5749, "step": 249 }, { "epoch": 0.0813140348024069, "grad_norm": 9.74225902557373, "learning_rate": 5e-05, "loss": 3.9162, "step": 250 }, { "epoch": 0.08163929094161652, "grad_norm": 6.432509422302246, "learning_rate": 5e-05, "loss": 3.8754, "step": 251 }, { "epoch": 0.08196454708082615, "grad_norm": 7.885379314422607, "learning_rate": 5e-05, "loss": 3.7214, "step": 252 }, { "epoch": 0.08228980322003578, "grad_norm": 11.552560806274414, "learning_rate": 5e-05, "loss": 3.7367, "step": 253 }, { "epoch": 0.0826150593592454, "grad_norm": 9.054500579833984, "learning_rate": 5e-05, "loss": 3.5476, "step": 254 }, { "epoch": 0.08294031549845503, "grad_norm": 5.916128635406494, "learning_rate": 5e-05, "loss": 3.695, "step": 255 }, { "epoch": 0.08326557163766467, "grad_norm": 10.65311050415039, "learning_rate": 5e-05, "loss": 4.1597, "step": 256 }, { "epoch": 0.0835908277768743, "grad_norm": 13.63244342803955, "learning_rate": 5e-05, "loss": 3.5796, "step": 257 }, { "epoch": 0.08391608391608392, "grad_norm": 10.830595970153809, "learning_rate": 5e-05, "loss": 3.4825, "step": 258 }, { "epoch": 0.08424134005529355, "grad_norm": 5.9953718185424805, "learning_rate": 5e-05, "loss": 3.792, "step": 259 }, { "epoch": 0.08456659619450317, "grad_norm": 12.76282787322998, "learning_rate": 5e-05, "loss": 3.5532, "step": 260 }, { "epoch": 0.0848918523337128, "grad_norm": 18.605255126953125, "learning_rate": 5e-05, "loss": 3.7788, "step": 261 }, { "epoch": 0.08521710847292242, "grad_norm": 12.753776550292969, "learning_rate": 5e-05, "loss": 3.7542, "step": 262 }, { "epoch": 0.08554236461213205, "grad_norm": 7.8098673820495605, "learning_rate": 5e-05, "loss": 4.4529, "step": 263 }, { "epoch": 0.08586762075134168, "grad_norm": 9.642732620239258, "learning_rate": 5e-05, "loss": 3.8515, "step": 264 }, { "epoch": 0.08619287689055131, "grad_norm": 5.820125102996826, "learning_rate": 5e-05, "loss": 4.3286, "step": 265 }, { "epoch": 0.08651813302976094, "grad_norm": 9.613585472106934, "learning_rate": 5e-05, "loss": 3.9569, "step": 266 }, { "epoch": 0.08684338916897057, "grad_norm": 9.211997985839844, "learning_rate": 5e-05, "loss": 3.9244, "step": 267 }, { "epoch": 0.08716864530818019, "grad_norm": 6.351746559143066, "learning_rate": 5e-05, "loss": 3.6584, "step": 268 }, { "epoch": 0.08749390144738982, "grad_norm": 6.802426815032959, "learning_rate": 5e-05, "loss": 3.8361, "step": 269 }, { "epoch": 0.08781915758659944, "grad_norm": 7.498976707458496, "learning_rate": 5e-05, "loss": 3.6589, "step": 270 }, { "epoch": 0.08814441372580907, "grad_norm": 6.058091163635254, "learning_rate": 5e-05, "loss": 3.6699, "step": 271 }, { "epoch": 0.0884696698650187, "grad_norm": 5.347617149353027, "learning_rate": 5e-05, "loss": 3.586, "step": 272 }, { "epoch": 0.08879492600422834, "grad_norm": 6.720355033874512, "learning_rate": 5e-05, "loss": 3.9508, "step": 273 }, { "epoch": 0.08912018214343796, "grad_norm": 5.8187174797058105, "learning_rate": 5e-05, "loss": 4.1922, "step": 274 }, { "epoch": 0.08944543828264759, "grad_norm": 4.981271743774414, "learning_rate": 5e-05, "loss": 3.5997, "step": 275 }, { "epoch": 0.08977069442185721, "grad_norm": 6.238826751708984, "learning_rate": 5e-05, "loss": 3.6929, "step": 276 }, { "epoch": 0.09009595056106684, "grad_norm": 7.073620796203613, "learning_rate": 5e-05, "loss": 3.7813, "step": 277 }, { "epoch": 0.09042120670027647, "grad_norm": 7.445998668670654, "learning_rate": 5e-05, "loss": 3.5475, "step": 278 }, { "epoch": 0.09074646283948609, "grad_norm": 6.120103359222412, "learning_rate": 5e-05, "loss": 3.7499, "step": 279 }, { "epoch": 0.09107171897869572, "grad_norm": 8.138337135314941, "learning_rate": 5e-05, "loss": 3.8753, "step": 280 }, { "epoch": 0.09139697511790534, "grad_norm": 7.516664981842041, "learning_rate": 5e-05, "loss": 3.6093, "step": 281 }, { "epoch": 0.09172223125711498, "grad_norm": 5.630866050720215, "learning_rate": 5e-05, "loss": 3.7269, "step": 282 }, { "epoch": 0.09204748739632461, "grad_norm": 5.958463668823242, "learning_rate": 5e-05, "loss": 3.7448, "step": 283 }, { "epoch": 0.09237274353553424, "grad_norm": 6.990318298339844, "learning_rate": 5e-05, "loss": 3.6994, "step": 284 }, { "epoch": 0.09269799967474386, "grad_norm": 6.364505767822266, "learning_rate": 5e-05, "loss": 3.5316, "step": 285 }, { "epoch": 0.09302325581395349, "grad_norm": 6.308237552642822, "learning_rate": 5e-05, "loss": 4.1899, "step": 286 }, { "epoch": 0.09334851195316311, "grad_norm": 8.585831642150879, "learning_rate": 5e-05, "loss": 3.7078, "step": 287 }, { "epoch": 0.09367376809237274, "grad_norm": 6.02251672744751, "learning_rate": 5e-05, "loss": 3.7081, "step": 288 }, { "epoch": 0.09399902423158236, "grad_norm": 6.891519546508789, "learning_rate": 5e-05, "loss": 3.9432, "step": 289 }, { "epoch": 0.094324280370792, "grad_norm": 8.65449047088623, "learning_rate": 5e-05, "loss": 3.4704, "step": 290 }, { "epoch": 0.09464953651000163, "grad_norm": 6.133912563323975, "learning_rate": 5e-05, "loss": 3.8847, "step": 291 }, { "epoch": 0.09497479264921126, "grad_norm": 6.619656085968018, "learning_rate": 5e-05, "loss": 3.5818, "step": 292 }, { "epoch": 0.09530004878842088, "grad_norm": 9.706931114196777, "learning_rate": 5e-05, "loss": 3.6839, "step": 293 }, { "epoch": 0.09562530492763051, "grad_norm": 6.43947172164917, "learning_rate": 5e-05, "loss": 4.0691, "step": 294 }, { "epoch": 0.09595056106684013, "grad_norm": 7.45628547668457, "learning_rate": 5e-05, "loss": 3.9546, "step": 295 }, { "epoch": 0.09627581720604976, "grad_norm": 9.464739799499512, "learning_rate": 5e-05, "loss": 3.8893, "step": 296 }, { "epoch": 0.09660107334525939, "grad_norm": 9.263232231140137, "learning_rate": 5e-05, "loss": 4.3718, "step": 297 }, { "epoch": 0.09692632948446903, "grad_norm": 6.793147087097168, "learning_rate": 5e-05, "loss": 3.7036, "step": 298 }, { "epoch": 0.09725158562367865, "grad_norm": 12.16869831085205, "learning_rate": 5e-05, "loss": 4.2924, "step": 299 }, { "epoch": 0.09757684176288828, "grad_norm": 10.058348655700684, "learning_rate": 5e-05, "loss": 3.852, "step": 300 }, { "epoch": 0.0979020979020979, "grad_norm": 5.966858386993408, "learning_rate": 5e-05, "loss": 3.7514, "step": 301 }, { "epoch": 0.09822735404130753, "grad_norm": 9.075318336486816, "learning_rate": 5e-05, "loss": 3.6808, "step": 302 }, { "epoch": 0.09855261018051716, "grad_norm": 11.372644424438477, "learning_rate": 5e-05, "loss": 4.0671, "step": 303 }, { "epoch": 0.09887786631972678, "grad_norm": 12.080697059631348, "learning_rate": 5e-05, "loss": 3.2312, "step": 304 }, { "epoch": 0.09920312245893641, "grad_norm": 5.336820125579834, "learning_rate": 5e-05, "loss": 3.5947, "step": 305 }, { "epoch": 0.09952837859814603, "grad_norm": 7.266972064971924, "learning_rate": 5e-05, "loss": 3.1227, "step": 306 }, { "epoch": 0.09985363473735567, "grad_norm": 6.334729194641113, "learning_rate": 5e-05, "loss": 3.6156, "step": 307 }, { "epoch": 0.1001788908765653, "grad_norm": 5.643017768859863, "learning_rate": 5e-05, "loss": 3.6258, "step": 308 }, { "epoch": 0.10050414701577493, "grad_norm": 5.640905380249023, "learning_rate": 5e-05, "loss": 3.9051, "step": 309 }, { "epoch": 0.10082940315498455, "grad_norm": 6.985749244689941, "learning_rate": 5e-05, "loss": 3.6725, "step": 310 }, { "epoch": 0.10115465929419418, "grad_norm": 6.893199443817139, "learning_rate": 5e-05, "loss": 3.4541, "step": 311 }, { "epoch": 0.1014799154334038, "grad_norm": 6.803256511688232, "learning_rate": 5e-05, "loss": 3.0762, "step": 312 }, { "epoch": 0.10180517157261343, "grad_norm": 8.491405487060547, "learning_rate": 5e-05, "loss": 3.719, "step": 313 }, { "epoch": 0.10213042771182305, "grad_norm": 5.912895202636719, "learning_rate": 5e-05, "loss": 3.6613, "step": 314 }, { "epoch": 0.1024556838510327, "grad_norm": 6.211380958557129, "learning_rate": 5e-05, "loss": 3.632, "step": 315 }, { "epoch": 0.10278093999024232, "grad_norm": 8.083343505859375, "learning_rate": 5e-05, "loss": 3.5327, "step": 316 }, { "epoch": 0.10310619612945195, "grad_norm": 8.091614723205566, "learning_rate": 5e-05, "loss": 3.8166, "step": 317 }, { "epoch": 0.10343145226866157, "grad_norm": 5.631373405456543, "learning_rate": 5e-05, "loss": 3.6482, "step": 318 }, { "epoch": 0.1037567084078712, "grad_norm": 12.532264709472656, "learning_rate": 5e-05, "loss": 3.5283, "step": 319 }, { "epoch": 0.10408196454708082, "grad_norm": 5.990050315856934, "learning_rate": 5e-05, "loss": 3.9679, "step": 320 }, { "epoch": 0.10440722068629045, "grad_norm": 6.2988667488098145, "learning_rate": 5e-05, "loss": 3.8285, "step": 321 }, { "epoch": 0.10473247682550008, "grad_norm": 8.320550918579102, "learning_rate": 5e-05, "loss": 4.0295, "step": 322 }, { "epoch": 0.10505773296470972, "grad_norm": 6.640725612640381, "learning_rate": 5e-05, "loss": 3.5094, "step": 323 }, { "epoch": 0.10538298910391934, "grad_norm": 6.340143203735352, "learning_rate": 5e-05, "loss": 3.6144, "step": 324 }, { "epoch": 0.10570824524312897, "grad_norm": 7.403520584106445, "learning_rate": 5e-05, "loss": 3.9676, "step": 325 }, { "epoch": 0.1060335013823386, "grad_norm": 7.462515354156494, "learning_rate": 5e-05, "loss": 3.7215, "step": 326 }, { "epoch": 0.10635875752154822, "grad_norm": 5.612910747528076, "learning_rate": 5e-05, "loss": 3.5211, "step": 327 }, { "epoch": 0.10668401366075785, "grad_norm": 7.502828598022461, "learning_rate": 5e-05, "loss": 4.0133, "step": 328 }, { "epoch": 0.10700926979996747, "grad_norm": 8.6078462600708, "learning_rate": 5e-05, "loss": 4.1303, "step": 329 }, { "epoch": 0.1073345259391771, "grad_norm": 9.176727294921875, "learning_rate": 5e-05, "loss": 3.8978, "step": 330 }, { "epoch": 0.10765978207838672, "grad_norm": 10.041065216064453, "learning_rate": 5e-05, "loss": 4.0287, "step": 331 }, { "epoch": 0.10798503821759636, "grad_norm": 9.741332054138184, "learning_rate": 5e-05, "loss": 3.5463, "step": 332 }, { "epoch": 0.10831029435680599, "grad_norm": 7.8142499923706055, "learning_rate": 5e-05, "loss": 3.7777, "step": 333 }, { "epoch": 0.10863555049601561, "grad_norm": 8.65985107421875, "learning_rate": 5e-05, "loss": 3.7169, "step": 334 }, { "epoch": 0.10896080663522524, "grad_norm": 9.955862998962402, "learning_rate": 5e-05, "loss": 3.4762, "step": 335 }, { "epoch": 0.10928606277443487, "grad_norm": 8.422538757324219, "learning_rate": 5e-05, "loss": 3.8524, "step": 336 }, { "epoch": 0.10961131891364449, "grad_norm": 6.849399089813232, "learning_rate": 5e-05, "loss": 3.5617, "step": 337 }, { "epoch": 0.10993657505285412, "grad_norm": 10.709142684936523, "learning_rate": 5e-05, "loss": 3.8691, "step": 338 }, { "epoch": 0.11026183119206374, "grad_norm": 6.636946678161621, "learning_rate": 5e-05, "loss": 3.9301, "step": 339 }, { "epoch": 0.11058708733127338, "grad_norm": 7.364269256591797, "learning_rate": 5e-05, "loss": 3.485, "step": 340 }, { "epoch": 0.11091234347048301, "grad_norm": 12.705086708068848, "learning_rate": 5e-05, "loss": 3.8413, "step": 341 }, { "epoch": 0.11123759960969264, "grad_norm": 6.6930928230285645, "learning_rate": 5e-05, "loss": 3.8912, "step": 342 }, { "epoch": 0.11156285574890226, "grad_norm": 6.823209285736084, "learning_rate": 5e-05, "loss": 4.2082, "step": 343 }, { "epoch": 0.11188811188811189, "grad_norm": 9.133801460266113, "learning_rate": 5e-05, "loss": 4.0229, "step": 344 }, { "epoch": 0.11221336802732151, "grad_norm": 7.471242904663086, "learning_rate": 5e-05, "loss": 3.6206, "step": 345 }, { "epoch": 0.11253862416653114, "grad_norm": 7.450990676879883, "learning_rate": 5e-05, "loss": 3.7254, "step": 346 }, { "epoch": 0.11286388030574077, "grad_norm": 7.53968620300293, "learning_rate": 5e-05, "loss": 4.0496, "step": 347 }, { "epoch": 0.11318913644495039, "grad_norm": 12.383916854858398, "learning_rate": 5e-05, "loss": 3.6815, "step": 348 }, { "epoch": 0.11351439258416003, "grad_norm": 8.754898071289062, "learning_rate": 5e-05, "loss": 3.5597, "step": 349 }, { "epoch": 0.11383964872336966, "grad_norm": 7.65074348449707, "learning_rate": 5e-05, "loss": 4.0558, "step": 350 }, { "epoch": 0.11416490486257928, "grad_norm": 6.735880374908447, "learning_rate": 5e-05, "loss": 3.5702, "step": 351 }, { "epoch": 0.11449016100178891, "grad_norm": 6.371280670166016, "learning_rate": 5e-05, "loss": 3.7325, "step": 352 }, { "epoch": 0.11481541714099854, "grad_norm": 6.2961745262146, "learning_rate": 5e-05, "loss": 3.8299, "step": 353 }, { "epoch": 0.11514067328020816, "grad_norm": 8.073019027709961, "learning_rate": 5e-05, "loss": 3.77, "step": 354 }, { "epoch": 0.11546592941941779, "grad_norm": 7.552728176116943, "learning_rate": 5e-05, "loss": 3.6974, "step": 355 }, { "epoch": 0.11579118555862741, "grad_norm": 6.595133304595947, "learning_rate": 5e-05, "loss": 3.6737, "step": 356 }, { "epoch": 0.11611644169783705, "grad_norm": 7.287491321563721, "learning_rate": 5e-05, "loss": 3.5135, "step": 357 }, { "epoch": 0.11644169783704668, "grad_norm": 8.068704605102539, "learning_rate": 5e-05, "loss": 3.9036, "step": 358 }, { "epoch": 0.1167669539762563, "grad_norm": 6.8040618896484375, "learning_rate": 5e-05, "loss": 3.8034, "step": 359 }, { "epoch": 0.11709221011546593, "grad_norm": 9.113652229309082, "learning_rate": 5e-05, "loss": 3.4709, "step": 360 }, { "epoch": 0.11741746625467556, "grad_norm": 8.15011978149414, "learning_rate": 5e-05, "loss": 4.111, "step": 361 }, { "epoch": 0.11774272239388518, "grad_norm": 6.63869047164917, "learning_rate": 5e-05, "loss": 3.8713, "step": 362 }, { "epoch": 0.11806797853309481, "grad_norm": 6.785707473754883, "learning_rate": 5e-05, "loss": 3.1996, "step": 363 }, { "epoch": 0.11839323467230443, "grad_norm": 7.099983215332031, "learning_rate": 5e-05, "loss": 3.4236, "step": 364 }, { "epoch": 0.11871849081151407, "grad_norm": 7.014822006225586, "learning_rate": 5e-05, "loss": 3.7598, "step": 365 }, { "epoch": 0.1190437469507237, "grad_norm": 7.138816833496094, "learning_rate": 5e-05, "loss": 3.6681, "step": 366 }, { "epoch": 0.11936900308993333, "grad_norm": 6.563411235809326, "learning_rate": 5e-05, "loss": 3.82, "step": 367 }, { "epoch": 0.11969425922914295, "grad_norm": 6.389061450958252, "learning_rate": 5e-05, "loss": 3.7305, "step": 368 }, { "epoch": 0.12001951536835258, "grad_norm": 8.009288787841797, "learning_rate": 5e-05, "loss": 3.9673, "step": 369 }, { "epoch": 0.1203447715075622, "grad_norm": 6.436244964599609, "learning_rate": 5e-05, "loss": 3.6934, "step": 370 }, { "epoch": 0.12067002764677183, "grad_norm": 7.818999767303467, "learning_rate": 5e-05, "loss": 3.8564, "step": 371 }, { "epoch": 0.12099528378598146, "grad_norm": 7.891193866729736, "learning_rate": 5e-05, "loss": 3.9254, "step": 372 }, { "epoch": 0.12132053992519108, "grad_norm": 8.381048202514648, "learning_rate": 5e-05, "loss": 3.8966, "step": 373 }, { "epoch": 0.12164579606440072, "grad_norm": 7.328115940093994, "learning_rate": 5e-05, "loss": 3.4804, "step": 374 }, { "epoch": 0.12197105220361035, "grad_norm": 7.0525922775268555, "learning_rate": 5e-05, "loss": 4.0177, "step": 375 }, { "epoch": 0.12229630834281997, "grad_norm": 7.212350845336914, "learning_rate": 5e-05, "loss": 3.4131, "step": 376 }, { "epoch": 0.1226215644820296, "grad_norm": 6.808897972106934, "learning_rate": 5e-05, "loss": 3.5174, "step": 377 }, { "epoch": 0.12294682062123923, "grad_norm": 7.094473838806152, "learning_rate": 5e-05, "loss": 3.7852, "step": 378 }, { "epoch": 0.12327207676044885, "grad_norm": 6.998628616333008, "learning_rate": 5e-05, "loss": 3.8094, "step": 379 }, { "epoch": 0.12359733289965848, "grad_norm": 7.043560028076172, "learning_rate": 5e-05, "loss": 3.6862, "step": 380 }, { "epoch": 0.1239225890388681, "grad_norm": 6.198429107666016, "learning_rate": 5e-05, "loss": 3.5462, "step": 381 }, { "epoch": 0.12424784517807774, "grad_norm": 6.5926513671875, "learning_rate": 5e-05, "loss": 3.9851, "step": 382 }, { "epoch": 0.12457310131728737, "grad_norm": 5.893482208251953, "learning_rate": 5e-05, "loss": 3.6685, "step": 383 }, { "epoch": 0.124898357456497, "grad_norm": 5.886164665222168, "learning_rate": 5e-05, "loss": 3.7367, "step": 384 }, { "epoch": 0.1252236135957066, "grad_norm": 7.275190353393555, "learning_rate": 5e-05, "loss": 3.4105, "step": 385 }, { "epoch": 0.12554886973491625, "grad_norm": 8.864086151123047, "learning_rate": 5e-05, "loss": 4.0736, "step": 386 }, { "epoch": 0.1258741258741259, "grad_norm": 9.517216682434082, "learning_rate": 5e-05, "loss": 3.6961, "step": 387 }, { "epoch": 0.1261993820133355, "grad_norm": 8.437984466552734, "learning_rate": 5e-05, "loss": 4.1894, "step": 388 }, { "epoch": 0.12652463815254514, "grad_norm": 9.38377571105957, "learning_rate": 5e-05, "loss": 3.9029, "step": 389 }, { "epoch": 0.12684989429175475, "grad_norm": 8.621910095214844, "learning_rate": 5e-05, "loss": 3.9334, "step": 390 }, { "epoch": 0.1271751504309644, "grad_norm": 7.772785186767578, "learning_rate": 5e-05, "loss": 3.4678, "step": 391 }, { "epoch": 0.127500406570174, "grad_norm": 8.752019882202148, "learning_rate": 5e-05, "loss": 3.62, "step": 392 }, { "epoch": 0.12782566270938364, "grad_norm": 7.4593706130981445, "learning_rate": 5e-05, "loss": 3.907, "step": 393 }, { "epoch": 0.12815091884859325, "grad_norm": 7.014523983001709, "learning_rate": 5e-05, "loss": 3.7242, "step": 394 }, { "epoch": 0.1284761749878029, "grad_norm": 7.254335403442383, "learning_rate": 5e-05, "loss": 3.8927, "step": 395 }, { "epoch": 0.12880143112701253, "grad_norm": 7.555474281311035, "learning_rate": 5e-05, "loss": 4.3168, "step": 396 }, { "epoch": 0.12912668726622215, "grad_norm": 11.899949073791504, "learning_rate": 5e-05, "loss": 3.6753, "step": 397 }, { "epoch": 0.12945194340543179, "grad_norm": 11.901144027709961, "learning_rate": 5e-05, "loss": 3.7524, "step": 398 }, { "epoch": 0.1297771995446414, "grad_norm": 9.584845542907715, "learning_rate": 5e-05, "loss": 3.4015, "step": 399 }, { "epoch": 0.13010245568385104, "grad_norm": 12.348978042602539, "learning_rate": 5e-05, "loss": 3.6966, "step": 400 }, { "epoch": 0.13042771182306065, "grad_norm": 12.886831283569336, "learning_rate": 5e-05, "loss": 4.5981, "step": 401 }, { "epoch": 0.1307529679622703, "grad_norm": 7.066255569458008, "learning_rate": 5e-05, "loss": 3.6408, "step": 402 }, { "epoch": 0.13107822410147993, "grad_norm": 7.1310014724731445, "learning_rate": 5e-05, "loss": 3.7652, "step": 403 }, { "epoch": 0.13140348024068954, "grad_norm": 7.658654689788818, "learning_rate": 5e-05, "loss": 3.9908, "step": 404 }, { "epoch": 0.13172873637989918, "grad_norm": 9.139669418334961, "learning_rate": 5e-05, "loss": 3.7055, "step": 405 }, { "epoch": 0.1320539925191088, "grad_norm": 7.406591892242432, "learning_rate": 5e-05, "loss": 3.9435, "step": 406 }, { "epoch": 0.13237924865831843, "grad_norm": 7.888886451721191, "learning_rate": 5e-05, "loss": 3.7047, "step": 407 }, { "epoch": 0.13270450479752804, "grad_norm": 6.58457088470459, "learning_rate": 5e-05, "loss": 3.5882, "step": 408 }, { "epoch": 0.13302976093673768, "grad_norm": 6.361485958099365, "learning_rate": 5e-05, "loss": 3.6571, "step": 409 }, { "epoch": 0.1333550170759473, "grad_norm": 10.977415084838867, "learning_rate": 5e-05, "loss": 3.9192, "step": 410 }, { "epoch": 0.13368027321515694, "grad_norm": 8.509581565856934, "learning_rate": 5e-05, "loss": 3.7566, "step": 411 }, { "epoch": 0.13400552935436658, "grad_norm": 7.781307220458984, "learning_rate": 5e-05, "loss": 4.1816, "step": 412 }, { "epoch": 0.1343307854935762, "grad_norm": 7.275979518890381, "learning_rate": 5e-05, "loss": 3.2031, "step": 413 }, { "epoch": 0.13465604163278583, "grad_norm": 7.543152332305908, "learning_rate": 5e-05, "loss": 3.9525, "step": 414 }, { "epoch": 0.13498129777199544, "grad_norm": 9.093851089477539, "learning_rate": 5e-05, "loss": 3.9126, "step": 415 }, { "epoch": 0.13530655391120508, "grad_norm": 12.760071754455566, "learning_rate": 5e-05, "loss": 3.7383, "step": 416 }, { "epoch": 0.1356318100504147, "grad_norm": 8.025795936584473, "learning_rate": 5e-05, "loss": 3.5959, "step": 417 }, { "epoch": 0.13595706618962433, "grad_norm": 7.363426685333252, "learning_rate": 5e-05, "loss": 3.7238, "step": 418 }, { "epoch": 0.13628232232883394, "grad_norm": 8.554017066955566, "learning_rate": 5e-05, "loss": 3.3142, "step": 419 }, { "epoch": 0.13660757846804358, "grad_norm": 7.679929256439209, "learning_rate": 5e-05, "loss": 3.6632, "step": 420 }, { "epoch": 0.13693283460725322, "grad_norm": 7.977594375610352, "learning_rate": 5e-05, "loss": 3.3469, "step": 421 }, { "epoch": 0.13725809074646284, "grad_norm": 7.612384796142578, "learning_rate": 5e-05, "loss": 3.6385, "step": 422 }, { "epoch": 0.13758334688567248, "grad_norm": 9.067974090576172, "learning_rate": 5e-05, "loss": 3.7935, "step": 423 }, { "epoch": 0.1379086030248821, "grad_norm": 9.513571739196777, "learning_rate": 5e-05, "loss": 4.0498, "step": 424 }, { "epoch": 0.13823385916409173, "grad_norm": 10.234723091125488, "learning_rate": 5e-05, "loss": 3.9196, "step": 425 }, { "epoch": 0.13855911530330134, "grad_norm": 8.635189056396484, "learning_rate": 5e-05, "loss": 3.8482, "step": 426 }, { "epoch": 0.13888437144251098, "grad_norm": 10.970733642578125, "learning_rate": 5e-05, "loss": 3.765, "step": 427 }, { "epoch": 0.13920962758172062, "grad_norm": 9.9661226272583, "learning_rate": 5e-05, "loss": 3.939, "step": 428 }, { "epoch": 0.13953488372093023, "grad_norm": 9.381548881530762, "learning_rate": 5e-05, "loss": 3.749, "step": 429 }, { "epoch": 0.13986013986013987, "grad_norm": 8.287264823913574, "learning_rate": 5e-05, "loss": 3.7818, "step": 430 }, { "epoch": 0.14018539599934948, "grad_norm": 8.97390365600586, "learning_rate": 5e-05, "loss": 4.0317, "step": 431 }, { "epoch": 0.14051065213855912, "grad_norm": 7.877195835113525, "learning_rate": 5e-05, "loss": 3.5584, "step": 432 }, { "epoch": 0.14083590827776873, "grad_norm": 9.581697463989258, "learning_rate": 5e-05, "loss": 3.4095, "step": 433 }, { "epoch": 0.14116116441697837, "grad_norm": 17.695627212524414, "learning_rate": 5e-05, "loss": 4.3531, "step": 434 }, { "epoch": 0.141486420556188, "grad_norm": 8.333785057067871, "learning_rate": 5e-05, "loss": 3.3827, "step": 435 }, { "epoch": 0.14181167669539763, "grad_norm": 7.970407009124756, "learning_rate": 5e-05, "loss": 3.5483, "step": 436 }, { "epoch": 0.14213693283460727, "grad_norm": 9.061053276062012, "learning_rate": 5e-05, "loss": 3.7439, "step": 437 }, { "epoch": 0.14246218897381688, "grad_norm": 6.50039005279541, "learning_rate": 5e-05, "loss": 3.9181, "step": 438 }, { "epoch": 0.14278744511302652, "grad_norm": 9.928549766540527, "learning_rate": 5e-05, "loss": 3.5832, "step": 439 }, { "epoch": 0.14311270125223613, "grad_norm": 12.250447273254395, "learning_rate": 5e-05, "loss": 3.4639, "step": 440 }, { "epoch": 0.14343795739144577, "grad_norm": 8.427464485168457, "learning_rate": 5e-05, "loss": 3.4289, "step": 441 }, { "epoch": 0.14376321353065538, "grad_norm": 12.150249481201172, "learning_rate": 5e-05, "loss": 3.8382, "step": 442 }, { "epoch": 0.14408846966986502, "grad_norm": 7.406175136566162, "learning_rate": 5e-05, "loss": 3.9814, "step": 443 }, { "epoch": 0.14441372580907463, "grad_norm": 6.988471031188965, "learning_rate": 5e-05, "loss": 4.0547, "step": 444 }, { "epoch": 0.14473898194828427, "grad_norm": 12.318148612976074, "learning_rate": 5e-05, "loss": 3.9433, "step": 445 }, { "epoch": 0.1450642380874939, "grad_norm": 10.347890853881836, "learning_rate": 5e-05, "loss": 3.7052, "step": 446 }, { "epoch": 0.14538949422670353, "grad_norm": 9.213937759399414, "learning_rate": 5e-05, "loss": 3.9906, "step": 447 }, { "epoch": 0.14571475036591316, "grad_norm": 10.614618301391602, "learning_rate": 5e-05, "loss": 3.276, "step": 448 }, { "epoch": 0.14604000650512278, "grad_norm": 11.268338203430176, "learning_rate": 5e-05, "loss": 3.5563, "step": 449 }, { "epoch": 0.14636526264433242, "grad_norm": 8.418384552001953, "learning_rate": 5e-05, "loss": 3.4222, "step": 450 }, { "epoch": 0.14669051878354203, "grad_norm": 12.904454231262207, "learning_rate": 5e-05, "loss": 3.7737, "step": 451 }, { "epoch": 0.14701577492275167, "grad_norm": 9.767146110534668, "learning_rate": 5e-05, "loss": 4.0424, "step": 452 }, { "epoch": 0.1473410310619613, "grad_norm": 8.333344459533691, "learning_rate": 5e-05, "loss": 3.7574, "step": 453 }, { "epoch": 0.14766628720117092, "grad_norm": 11.907800674438477, "learning_rate": 5e-05, "loss": 3.7266, "step": 454 }, { "epoch": 0.14799154334038056, "grad_norm": 9.306441307067871, "learning_rate": 5e-05, "loss": 3.8885, "step": 455 }, { "epoch": 0.14831679947959017, "grad_norm": 10.486589431762695, "learning_rate": 5e-05, "loss": 3.7777, "step": 456 }, { "epoch": 0.1486420556187998, "grad_norm": 12.291946411132812, "learning_rate": 5e-05, "loss": 3.6718, "step": 457 }, { "epoch": 0.14896731175800942, "grad_norm": 12.743392944335938, "learning_rate": 5e-05, "loss": 3.6533, "step": 458 }, { "epoch": 0.14929256789721906, "grad_norm": 9.82790470123291, "learning_rate": 5e-05, "loss": 3.8719, "step": 459 }, { "epoch": 0.14961782403642868, "grad_norm": 11.749984741210938, "learning_rate": 5e-05, "loss": 3.495, "step": 460 }, { "epoch": 0.14994308017563832, "grad_norm": 11.35322380065918, "learning_rate": 5e-05, "loss": 4.0119, "step": 461 }, { "epoch": 0.15026833631484796, "grad_norm": 7.1181159019470215, "learning_rate": 5e-05, "loss": 3.6675, "step": 462 }, { "epoch": 0.15059359245405757, "grad_norm": 11.3037109375, "learning_rate": 5e-05, "loss": 3.5125, "step": 463 }, { "epoch": 0.1509188485932672, "grad_norm": 10.003439903259277, "learning_rate": 5e-05, "loss": 3.2109, "step": 464 }, { "epoch": 0.15124410473247682, "grad_norm": 7.561882019042969, "learning_rate": 5e-05, "loss": 3.566, "step": 465 }, { "epoch": 0.15156936087168646, "grad_norm": 7.8044586181640625, "learning_rate": 5e-05, "loss": 3.5393, "step": 466 }, { "epoch": 0.15189461701089607, "grad_norm": 11.766450881958008, "learning_rate": 5e-05, "loss": 3.7168, "step": 467 }, { "epoch": 0.1522198731501057, "grad_norm": 7.827965259552002, "learning_rate": 5e-05, "loss": 3.7292, "step": 468 }, { "epoch": 0.15254512928931532, "grad_norm": 9.4542236328125, "learning_rate": 5e-05, "loss": 3.4709, "step": 469 }, { "epoch": 0.15287038542852496, "grad_norm": 8.595137596130371, "learning_rate": 5e-05, "loss": 4.3444, "step": 470 }, { "epoch": 0.1531956415677346, "grad_norm": 8.952139854431152, "learning_rate": 5e-05, "loss": 4.3924, "step": 471 }, { "epoch": 0.15352089770694421, "grad_norm": 9.41843318939209, "learning_rate": 5e-05, "loss": 3.7794, "step": 472 }, { "epoch": 0.15384615384615385, "grad_norm": 7.201237201690674, "learning_rate": 5e-05, "loss": 3.3527, "step": 473 }, { "epoch": 0.15417140998536347, "grad_norm": 9.496088981628418, "learning_rate": 5e-05, "loss": 3.3554, "step": 474 }, { "epoch": 0.1544966661245731, "grad_norm": 8.370413780212402, "learning_rate": 5e-05, "loss": 3.8943, "step": 475 }, { "epoch": 0.15482192226378272, "grad_norm": 8.910099029541016, "learning_rate": 5e-05, "loss": 3.5867, "step": 476 }, { "epoch": 0.15514717840299236, "grad_norm": 7.085155963897705, "learning_rate": 5e-05, "loss": 3.3267, "step": 477 }, { "epoch": 0.15547243454220197, "grad_norm": 8.49357795715332, "learning_rate": 5e-05, "loss": 3.5925, "step": 478 }, { "epoch": 0.1557976906814116, "grad_norm": 7.811654567718506, "learning_rate": 5e-05, "loss": 3.9464, "step": 479 }, { "epoch": 0.15612294682062125, "grad_norm": 9.111326217651367, "learning_rate": 5e-05, "loss": 3.5547, "step": 480 }, { "epoch": 0.15644820295983086, "grad_norm": 7.44144344329834, "learning_rate": 5e-05, "loss": 3.9226, "step": 481 }, { "epoch": 0.1567734590990405, "grad_norm": 9.64633846282959, "learning_rate": 5e-05, "loss": 3.887, "step": 482 }, { "epoch": 0.15709871523825011, "grad_norm": 11.60362720489502, "learning_rate": 5e-05, "loss": 3.7125, "step": 483 }, { "epoch": 0.15742397137745975, "grad_norm": 7.746142864227295, "learning_rate": 5e-05, "loss": 3.9722, "step": 484 }, { "epoch": 0.15774922751666937, "grad_norm": 7.080160140991211, "learning_rate": 5e-05, "loss": 3.1951, "step": 485 }, { "epoch": 0.158074483655879, "grad_norm": 17.081741333007812, "learning_rate": 5e-05, "loss": 3.7901, "step": 486 }, { "epoch": 0.15839973979508865, "grad_norm": 7.917695999145508, "learning_rate": 5e-05, "loss": 3.5357, "step": 487 }, { "epoch": 0.15872499593429826, "grad_norm": 8.854647636413574, "learning_rate": 5e-05, "loss": 3.7515, "step": 488 }, { "epoch": 0.1590502520735079, "grad_norm": 9.028191566467285, "learning_rate": 5e-05, "loss": 3.5516, "step": 489 }, { "epoch": 0.1593755082127175, "grad_norm": 10.008574485778809, "learning_rate": 5e-05, "loss": 3.6189, "step": 490 }, { "epoch": 0.15970076435192715, "grad_norm": 10.596325874328613, "learning_rate": 5e-05, "loss": 3.4306, "step": 491 }, { "epoch": 0.16002602049113676, "grad_norm": 12.258661270141602, "learning_rate": 5e-05, "loss": 3.5577, "step": 492 }, { "epoch": 0.1603512766303464, "grad_norm": 11.506673812866211, "learning_rate": 5e-05, "loss": 3.9079, "step": 493 }, { "epoch": 0.160676532769556, "grad_norm": 7.967488765716553, "learning_rate": 5e-05, "loss": 3.6066, "step": 494 }, { "epoch": 0.16100178890876565, "grad_norm": 11.729570388793945, "learning_rate": 5e-05, "loss": 3.9043, "step": 495 }, { "epoch": 0.1613270450479753, "grad_norm": 8.846243858337402, "learning_rate": 5e-05, "loss": 3.8228, "step": 496 }, { "epoch": 0.1616523011871849, "grad_norm": 12.016153335571289, "learning_rate": 5e-05, "loss": 3.9159, "step": 497 }, { "epoch": 0.16197755732639454, "grad_norm": 8.154533386230469, "learning_rate": 5e-05, "loss": 3.7158, "step": 498 }, { "epoch": 0.16230281346560416, "grad_norm": 11.766820907592773, "learning_rate": 5e-05, "loss": 3.5189, "step": 499 }, { "epoch": 0.1626280696048138, "grad_norm": 11.467671394348145, "learning_rate": 5e-05, "loss": 3.8887, "step": 500 }, { "epoch": 0.1629533257440234, "grad_norm": 7.477685451507568, "learning_rate": 5e-05, "loss": 4.0159, "step": 501 }, { "epoch": 0.16327858188323305, "grad_norm": 6.95028018951416, "learning_rate": 5e-05, "loss": 3.7446, "step": 502 }, { "epoch": 0.16360383802244266, "grad_norm": 11.298054695129395, "learning_rate": 5e-05, "loss": 3.692, "step": 503 }, { "epoch": 0.1639290941616523, "grad_norm": 7.807789325714111, "learning_rate": 5e-05, "loss": 3.6819, "step": 504 }, { "epoch": 0.16425435030086194, "grad_norm": 7.94120454788208, "learning_rate": 5e-05, "loss": 3.6853, "step": 505 }, { "epoch": 0.16457960644007155, "grad_norm": 8.74360466003418, "learning_rate": 5e-05, "loss": 3.5674, "step": 506 }, { "epoch": 0.1649048625792812, "grad_norm": 8.877306938171387, "learning_rate": 5e-05, "loss": 3.5393, "step": 507 }, { "epoch": 0.1652301187184908, "grad_norm": 7.502864837646484, "learning_rate": 5e-05, "loss": 3.6376, "step": 508 }, { "epoch": 0.16555537485770044, "grad_norm": 11.325250625610352, "learning_rate": 5e-05, "loss": 3.7114, "step": 509 }, { "epoch": 0.16588063099691006, "grad_norm": 9.918580055236816, "learning_rate": 5e-05, "loss": 3.783, "step": 510 }, { "epoch": 0.1662058871361197, "grad_norm": 8.940899848937988, "learning_rate": 5e-05, "loss": 3.707, "step": 511 }, { "epoch": 0.16653114327532934, "grad_norm": 17.020418167114258, "learning_rate": 5e-05, "loss": 4.319, "step": 512 }, { "epoch": 0.16685639941453895, "grad_norm": 10.722935676574707, "learning_rate": 5e-05, "loss": 3.4215, "step": 513 }, { "epoch": 0.1671816555537486, "grad_norm": 9.579489707946777, "learning_rate": 5e-05, "loss": 2.835, "step": 514 }, { "epoch": 0.1675069116929582, "grad_norm": 8.158295631408691, "learning_rate": 5e-05, "loss": 2.78, "step": 515 }, { "epoch": 0.16783216783216784, "grad_norm": 8.238765716552734, "learning_rate": 5e-05, "loss": 3.4628, "step": 516 }, { "epoch": 0.16815742397137745, "grad_norm": 8.580034255981445, "learning_rate": 5e-05, "loss": 3.9686, "step": 517 }, { "epoch": 0.1684826801105871, "grad_norm": 8.966435432434082, "learning_rate": 5e-05, "loss": 3.4936, "step": 518 }, { "epoch": 0.1688079362497967, "grad_norm": 6.970677375793457, "learning_rate": 5e-05, "loss": 3.4287, "step": 519 }, { "epoch": 0.16913319238900634, "grad_norm": 7.750168323516846, "learning_rate": 5e-05, "loss": 3.3396, "step": 520 }, { "epoch": 0.16945844852821598, "grad_norm": 7.394572734832764, "learning_rate": 5e-05, "loss": 3.6622, "step": 521 }, { "epoch": 0.1697837046674256, "grad_norm": 6.781547546386719, "learning_rate": 5e-05, "loss": 3.5704, "step": 522 }, { "epoch": 0.17010896080663523, "grad_norm": 11.887150764465332, "learning_rate": 5e-05, "loss": 3.4968, "step": 523 }, { "epoch": 0.17043421694584485, "grad_norm": 7.665548324584961, "learning_rate": 5e-05, "loss": 3.6475, "step": 524 }, { "epoch": 0.1707594730850545, "grad_norm": 8.977829933166504, "learning_rate": 5e-05, "loss": 2.6397, "step": 525 }, { "epoch": 0.1710847292242641, "grad_norm": 9.76028060913086, "learning_rate": 5e-05, "loss": 3.6625, "step": 526 }, { "epoch": 0.17140998536347374, "grad_norm": 9.248437881469727, "learning_rate": 5e-05, "loss": 3.5199, "step": 527 }, { "epoch": 0.17173524150268335, "grad_norm": 9.724522590637207, "learning_rate": 5e-05, "loss": 3.7095, "step": 528 }, { "epoch": 0.172060497641893, "grad_norm": 8.21666431427002, "learning_rate": 5e-05, "loss": 3.6802, "step": 529 }, { "epoch": 0.17238575378110263, "grad_norm": 7.362802505493164, "learning_rate": 5e-05, "loss": 3.6511, "step": 530 }, { "epoch": 0.17271100992031224, "grad_norm": 7.2599334716796875, "learning_rate": 5e-05, "loss": 3.1097, "step": 531 }, { "epoch": 0.17303626605952188, "grad_norm": 9.250597953796387, "learning_rate": 5e-05, "loss": 3.608, "step": 532 }, { "epoch": 0.1733615221987315, "grad_norm": 8.88681411743164, "learning_rate": 5e-05, "loss": 3.8212, "step": 533 }, { "epoch": 0.17368677833794113, "grad_norm": 11.290247917175293, "learning_rate": 5e-05, "loss": 3.3286, "step": 534 }, { "epoch": 0.17401203447715075, "grad_norm": 10.88404369354248, "learning_rate": 5e-05, "loss": 3.5241, "step": 535 }, { "epoch": 0.17433729061636039, "grad_norm": 9.20648193359375, "learning_rate": 5e-05, "loss": 3.6005, "step": 536 }, { "epoch": 0.17466254675557003, "grad_norm": 8.10119342803955, "learning_rate": 5e-05, "loss": 3.5739, "step": 537 }, { "epoch": 0.17498780289477964, "grad_norm": 10.661712646484375, "learning_rate": 5e-05, "loss": 3.4837, "step": 538 }, { "epoch": 0.17531305903398928, "grad_norm": 10.739490509033203, "learning_rate": 5e-05, "loss": 3.6787, "step": 539 }, { "epoch": 0.1756383151731989, "grad_norm": 9.183284759521484, "learning_rate": 5e-05, "loss": 3.6174, "step": 540 }, { "epoch": 0.17596357131240853, "grad_norm": 8.37816047668457, "learning_rate": 5e-05, "loss": 3.5964, "step": 541 }, { "epoch": 0.17628882745161814, "grad_norm": 10.69176959991455, "learning_rate": 5e-05, "loss": 3.8804, "step": 542 }, { "epoch": 0.17661408359082778, "grad_norm": 8.007463455200195, "learning_rate": 5e-05, "loss": 3.8485, "step": 543 }, { "epoch": 0.1769393397300374, "grad_norm": 8.839487075805664, "learning_rate": 5e-05, "loss": 3.6356, "step": 544 }, { "epoch": 0.17726459586924703, "grad_norm": 11.773893356323242, "learning_rate": 5e-05, "loss": 3.8846, "step": 545 }, { "epoch": 0.17758985200845667, "grad_norm": 11.021324157714844, "learning_rate": 5e-05, "loss": 3.8369, "step": 546 }, { "epoch": 0.17791510814766628, "grad_norm": 10.33259391784668, "learning_rate": 5e-05, "loss": 3.392, "step": 547 }, { "epoch": 0.17824036428687592, "grad_norm": 8.52408504486084, "learning_rate": 5e-05, "loss": 3.8102, "step": 548 }, { "epoch": 0.17856562042608554, "grad_norm": 9.792526245117188, "learning_rate": 5e-05, "loss": 3.9087, "step": 549 }, { "epoch": 0.17889087656529518, "grad_norm": 9.989365577697754, "learning_rate": 5e-05, "loss": 3.8268, "step": 550 }, { "epoch": 0.1792161327045048, "grad_norm": 9.42458438873291, "learning_rate": 5e-05, "loss": 3.7863, "step": 551 }, { "epoch": 0.17954138884371443, "grad_norm": 8.195592880249023, "learning_rate": 5e-05, "loss": 3.7507, "step": 552 }, { "epoch": 0.17986664498292404, "grad_norm": 9.999628067016602, "learning_rate": 5e-05, "loss": 3.742, "step": 553 }, { "epoch": 0.18019190112213368, "grad_norm": 8.54041576385498, "learning_rate": 5e-05, "loss": 4.4209, "step": 554 }, { "epoch": 0.18051715726134332, "grad_norm": 8.870122909545898, "learning_rate": 5e-05, "loss": 3.2489, "step": 555 }, { "epoch": 0.18084241340055293, "grad_norm": 10.103729248046875, "learning_rate": 5e-05, "loss": 3.5182, "step": 556 }, { "epoch": 0.18116766953976257, "grad_norm": 9.698139190673828, "learning_rate": 5e-05, "loss": 3.7934, "step": 557 }, { "epoch": 0.18149292567897218, "grad_norm": 9.132758140563965, "learning_rate": 5e-05, "loss": 3.3561, "step": 558 }, { "epoch": 0.18181818181818182, "grad_norm": 8.661752700805664, "learning_rate": 5e-05, "loss": 3.5775, "step": 559 }, { "epoch": 0.18214343795739144, "grad_norm": 9.332806587219238, "learning_rate": 5e-05, "loss": 3.0548, "step": 560 }, { "epoch": 0.18246869409660108, "grad_norm": 9.24280071258545, "learning_rate": 5e-05, "loss": 4.0208, "step": 561 }, { "epoch": 0.1827939502358107, "grad_norm": 9.924572944641113, "learning_rate": 5e-05, "loss": 3.6914, "step": 562 }, { "epoch": 0.18311920637502033, "grad_norm": 9.362936973571777, "learning_rate": 5e-05, "loss": 3.9219, "step": 563 }, { "epoch": 0.18344446251422997, "grad_norm": 12.603287696838379, "learning_rate": 5e-05, "loss": 3.4585, "step": 564 }, { "epoch": 0.18376971865343958, "grad_norm": 10.904036521911621, "learning_rate": 5e-05, "loss": 3.3266, "step": 565 }, { "epoch": 0.18409497479264922, "grad_norm": 9.598895072937012, "learning_rate": 5e-05, "loss": 3.6607, "step": 566 }, { "epoch": 0.18442023093185883, "grad_norm": 8.842279434204102, "learning_rate": 5e-05, "loss": 3.5674, "step": 567 }, { "epoch": 0.18474548707106847, "grad_norm": 9.379899024963379, "learning_rate": 5e-05, "loss": 3.6743, "step": 568 }, { "epoch": 0.18507074321027808, "grad_norm": 9.745834350585938, "learning_rate": 5e-05, "loss": 4.0652, "step": 569 }, { "epoch": 0.18539599934948772, "grad_norm": 8.990086555480957, "learning_rate": 5e-05, "loss": 3.637, "step": 570 }, { "epoch": 0.18572125548869736, "grad_norm": 8.382301330566406, "learning_rate": 5e-05, "loss": 3.967, "step": 571 }, { "epoch": 0.18604651162790697, "grad_norm": 8.533965110778809, "learning_rate": 5e-05, "loss": 3.4941, "step": 572 }, { "epoch": 0.18637176776711661, "grad_norm": 9.823786735534668, "learning_rate": 5e-05, "loss": 3.7345, "step": 573 }, { "epoch": 0.18669702390632623, "grad_norm": 7.766260147094727, "learning_rate": 5e-05, "loss": 3.6831, "step": 574 }, { "epoch": 0.18702228004553587, "grad_norm": 8.095032691955566, "learning_rate": 5e-05, "loss": 3.4701, "step": 575 }, { "epoch": 0.18734753618474548, "grad_norm": 11.641885757446289, "learning_rate": 5e-05, "loss": 4.0934, "step": 576 }, { "epoch": 0.18767279232395512, "grad_norm": 9.155062675476074, "learning_rate": 5e-05, "loss": 3.5356, "step": 577 }, { "epoch": 0.18799804846316473, "grad_norm": 8.703105926513672, "learning_rate": 5e-05, "loss": 3.8144, "step": 578 }, { "epoch": 0.18832330460237437, "grad_norm": 9.528350830078125, "learning_rate": 5e-05, "loss": 3.2073, "step": 579 }, { "epoch": 0.188648560741584, "grad_norm": 9.156220436096191, "learning_rate": 5e-05, "loss": 3.777, "step": 580 }, { "epoch": 0.18897381688079362, "grad_norm": 8.443305015563965, "learning_rate": 5e-05, "loss": 3.7239, "step": 581 }, { "epoch": 0.18929907302000326, "grad_norm": 7.838225841522217, "learning_rate": 5e-05, "loss": 3.4467, "step": 582 }, { "epoch": 0.18962432915921287, "grad_norm": 7.3834757804870605, "learning_rate": 5e-05, "loss": 3.5151, "step": 583 }, { "epoch": 0.1899495852984225, "grad_norm": 9.460673332214355, "learning_rate": 5e-05, "loss": 3.4287, "step": 584 }, { "epoch": 0.19027484143763213, "grad_norm": 8.232035636901855, "learning_rate": 5e-05, "loss": 4.2533, "step": 585 }, { "epoch": 0.19060009757684176, "grad_norm": 12.586129188537598, "learning_rate": 5e-05, "loss": 3.7747, "step": 586 }, { "epoch": 0.19092535371605138, "grad_norm": 8.150300979614258, "learning_rate": 5e-05, "loss": 3.9724, "step": 587 }, { "epoch": 0.19125060985526102, "grad_norm": 8.529426574707031, "learning_rate": 5e-05, "loss": 4.6377, "step": 588 }, { "epoch": 0.19157586599447066, "grad_norm": 7.794090747833252, "learning_rate": 5e-05, "loss": 3.2597, "step": 589 }, { "epoch": 0.19190112213368027, "grad_norm": 8.038799285888672, "learning_rate": 5e-05, "loss": 3.9826, "step": 590 }, { "epoch": 0.1922263782728899, "grad_norm": 7.855754852294922, "learning_rate": 5e-05, "loss": 3.7777, "step": 591 }, { "epoch": 0.19255163441209952, "grad_norm": 8.140380859375, "learning_rate": 5e-05, "loss": 3.443, "step": 592 }, { "epoch": 0.19287689055130916, "grad_norm": 9.79352855682373, "learning_rate": 5e-05, "loss": 3.7066, "step": 593 }, { "epoch": 0.19320214669051877, "grad_norm": 13.119344711303711, "learning_rate": 5e-05, "loss": 3.9336, "step": 594 }, { "epoch": 0.1935274028297284, "grad_norm": 8.955309867858887, "learning_rate": 5e-05, "loss": 3.3643, "step": 595 }, { "epoch": 0.19385265896893805, "grad_norm": 8.016314506530762, "learning_rate": 5e-05, "loss": 3.7039, "step": 596 }, { "epoch": 0.19417791510814766, "grad_norm": 8.888792991638184, "learning_rate": 5e-05, "loss": 3.6356, "step": 597 }, { "epoch": 0.1945031712473573, "grad_norm": 9.017245292663574, "learning_rate": 5e-05, "loss": 3.6173, "step": 598 }, { "epoch": 0.19482842738656692, "grad_norm": 9.05187702178955, "learning_rate": 5e-05, "loss": 3.7156, "step": 599 }, { "epoch": 0.19515368352577656, "grad_norm": 8.980157852172852, "learning_rate": 5e-05, "loss": 3.5519, "step": 600 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2792420391095808.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }