diff --git "a/checkpoint-2115/trainer_state.json" "b/checkpoint-2115/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-2115/trainer_state.json" @@ -0,0 +1,14838 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 2115, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00047281323877068556, + "grad_norm": 5.163570880889893, + "learning_rate": 5.0000000000000004e-08, + "loss": 1.4628, + "step": 1 + }, + { + "epoch": 0.0009456264775413711, + "grad_norm": 6.298020839691162, + "learning_rate": 1.0000000000000001e-07, + "loss": 1.5003, + "step": 2 + }, + { + "epoch": 0.0014184397163120568, + "grad_norm": 5.853623390197754, + "learning_rate": 1.5000000000000002e-07, + "loss": 1.4495, + "step": 3 + }, + { + "epoch": 0.0018912529550827422, + "grad_norm": 5.456025123596191, + "learning_rate": 2.0000000000000002e-07, + "loss": 1.3798, + "step": 4 + }, + { + "epoch": 0.002364066193853428, + "grad_norm": 5.757407188415527, + "learning_rate": 2.5000000000000004e-07, + "loss": 1.4515, + "step": 5 + }, + { + "epoch": 0.0028368794326241137, + "grad_norm": 5.872277736663818, + "learning_rate": 3.0000000000000004e-07, + "loss": 1.4424, + "step": 6 + }, + { + "epoch": 0.003309692671394799, + "grad_norm": 6.7816009521484375, + "learning_rate": 3.5000000000000004e-07, + "loss": 1.4004, + "step": 7 + }, + { + "epoch": 0.0037825059101654845, + "grad_norm": 6.229667663574219, + "learning_rate": 4.0000000000000003e-07, + "loss": 1.4494, + "step": 8 + }, + { + "epoch": 0.00425531914893617, + "grad_norm": 5.336202621459961, + "learning_rate": 4.5000000000000003e-07, + "loss": 1.3916, + "step": 9 + }, + { + "epoch": 0.004728132387706856, + "grad_norm": 5.589445114135742, + "learning_rate": 5.000000000000001e-07, + "loss": 1.2318, + "step": 10 + }, + { + "epoch": 0.005200945626477541, + "grad_norm": 5.720539569854736, + "learning_rate": 5.5e-07, + "loss": 1.4367, + "step": 11 + }, + { + "epoch": 0.005673758865248227, + "grad_norm": 5.913913726806641, + "learning_rate": 6.000000000000001e-07, + "loss": 1.342, + "step": 12 + }, + { + "epoch": 0.006146572104018913, + "grad_norm": 5.899744987487793, + "learning_rate": 6.5e-07, + "loss": 1.4307, + "step": 13 + }, + { + "epoch": 0.006619385342789598, + "grad_norm": 5.571037292480469, + "learning_rate": 7.000000000000001e-07, + "loss": 1.3372, + "step": 14 + }, + { + "epoch": 0.0070921985815602835, + "grad_norm": 5.480010509490967, + "learning_rate": 7.5e-07, + "loss": 1.3923, + "step": 15 + }, + { + "epoch": 0.007565011820330969, + "grad_norm": 5.254702091217041, + "learning_rate": 8.000000000000001e-07, + "loss": 1.2928, + "step": 16 + }, + { + "epoch": 0.008037825059101654, + "grad_norm": 6.090312480926514, + "learning_rate": 8.500000000000001e-07, + "loss": 1.4984, + "step": 17 + }, + { + "epoch": 0.00851063829787234, + "grad_norm": 5.689319610595703, + "learning_rate": 9.000000000000001e-07, + "loss": 1.4108, + "step": 18 + }, + { + "epoch": 0.008983451536643027, + "grad_norm": 5.386685848236084, + "learning_rate": 9.500000000000001e-07, + "loss": 1.425, + "step": 19 + }, + { + "epoch": 0.009456264775413711, + "grad_norm": 6.451584815979004, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.5507, + "step": 20 + }, + { + "epoch": 0.009929078014184398, + "grad_norm": 5.37647008895874, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.4109, + "step": 21 + }, + { + "epoch": 0.010401891252955082, + "grad_norm": 4.716553211212158, + "learning_rate": 1.1e-06, + "loss": 1.2028, + "step": 22 + }, + { + "epoch": 0.010874704491725768, + "grad_norm": 4.950989723205566, + "learning_rate": 1.1500000000000002e-06, + "loss": 1.3043, + "step": 23 + }, + { + "epoch": 0.011347517730496455, + "grad_norm": 4.688975811004639, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.2708, + "step": 24 + }, + { + "epoch": 0.01182033096926714, + "grad_norm": 4.905868053436279, + "learning_rate": 1.25e-06, + "loss": 1.3268, + "step": 25 + }, + { + "epoch": 0.012293144208037825, + "grad_norm": 4.503395080566406, + "learning_rate": 1.3e-06, + "loss": 1.1799, + "step": 26 + }, + { + "epoch": 0.01276595744680851, + "grad_norm": 4.77382230758667, + "learning_rate": 1.3500000000000002e-06, + "loss": 1.3882, + "step": 27 + }, + { + "epoch": 0.013238770685579196, + "grad_norm": 4.734329700469971, + "learning_rate": 1.4000000000000001e-06, + "loss": 1.3476, + "step": 28 + }, + { + "epoch": 0.013711583924349883, + "grad_norm": 4.775066375732422, + "learning_rate": 1.45e-06, + "loss": 1.2429, + "step": 29 + }, + { + "epoch": 0.014184397163120567, + "grad_norm": 4.978334426879883, + "learning_rate": 1.5e-06, + "loss": 1.2119, + "step": 30 + }, + { + "epoch": 0.014657210401891253, + "grad_norm": 4.506785869598389, + "learning_rate": 1.5500000000000002e-06, + "loss": 1.3157, + "step": 31 + }, + { + "epoch": 0.015130023640661938, + "grad_norm": 4.007757186889648, + "learning_rate": 1.6000000000000001e-06, + "loss": 1.1451, + "step": 32 + }, + { + "epoch": 0.015602836879432624, + "grad_norm": 3.6621618270874023, + "learning_rate": 1.6500000000000003e-06, + "loss": 1.093, + "step": 33 + }, + { + "epoch": 0.01607565011820331, + "grad_norm": 3.8733766078948975, + "learning_rate": 1.7000000000000002e-06, + "loss": 1.2289, + "step": 34 + }, + { + "epoch": 0.016548463356973995, + "grad_norm": 4.3391900062561035, + "learning_rate": 1.75e-06, + "loss": 1.1453, + "step": 35 + }, + { + "epoch": 0.01702127659574468, + "grad_norm": 3.287623643875122, + "learning_rate": 1.8000000000000001e-06, + "loss": 1.0257, + "step": 36 + }, + { + "epoch": 0.017494089834515367, + "grad_norm": 3.591721773147583, + "learning_rate": 1.85e-06, + "loss": 0.9976, + "step": 37 + }, + { + "epoch": 0.017966903073286054, + "grad_norm": 4.028271675109863, + "learning_rate": 1.9000000000000002e-06, + "loss": 1.0773, + "step": 38 + }, + { + "epoch": 0.018439716312056736, + "grad_norm": 3.3543951511383057, + "learning_rate": 1.9500000000000004e-06, + "loss": 1.1677, + "step": 39 + }, + { + "epoch": 0.018912529550827423, + "grad_norm": 3.807624340057373, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.1232, + "step": 40 + }, + { + "epoch": 0.01938534278959811, + "grad_norm": 4.242797374725342, + "learning_rate": 2.05e-06, + "loss": 1.1819, + "step": 41 + }, + { + "epoch": 0.019858156028368795, + "grad_norm": 3.4574992656707764, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.9878, + "step": 42 + }, + { + "epoch": 0.02033096926713948, + "grad_norm": 3.906695604324341, + "learning_rate": 2.15e-06, + "loss": 1.0592, + "step": 43 + }, + { + "epoch": 0.020803782505910164, + "grad_norm": 3.7543163299560547, + "learning_rate": 2.2e-06, + "loss": 1.0309, + "step": 44 + }, + { + "epoch": 0.02127659574468085, + "grad_norm": 3.3777148723602295, + "learning_rate": 2.25e-06, + "loss": 1.0664, + "step": 45 + }, + { + "epoch": 0.021749408983451537, + "grad_norm": 3.6003634929656982, + "learning_rate": 2.3000000000000004e-06, + "loss": 1.0482, + "step": 46 + }, + { + "epoch": 0.022222222222222223, + "grad_norm": 3.3961377143859863, + "learning_rate": 2.35e-06, + "loss": 1.0252, + "step": 47 + }, + { + "epoch": 0.02269503546099291, + "grad_norm": 3.1601035594940186, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.0435, + "step": 48 + }, + { + "epoch": 0.023167848699763592, + "grad_norm": 3.4192967414855957, + "learning_rate": 2.4500000000000003e-06, + "loss": 1.0935, + "step": 49 + }, + { + "epoch": 0.02364066193853428, + "grad_norm": 3.1225922107696533, + "learning_rate": 2.5e-06, + "loss": 0.8988, + "step": 50 + }, + { + "epoch": 0.024113475177304965, + "grad_norm": 3.1423380374908447, + "learning_rate": 2.55e-06, + "loss": 1.0159, + "step": 51 + }, + { + "epoch": 0.02458628841607565, + "grad_norm": 3.4782402515411377, + "learning_rate": 2.6e-06, + "loss": 1.0231, + "step": 52 + }, + { + "epoch": 0.025059101654846337, + "grad_norm": 3.8362693786621094, + "learning_rate": 2.6500000000000005e-06, + "loss": 1.0725, + "step": 53 + }, + { + "epoch": 0.02553191489361702, + "grad_norm": 3.033294916152954, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.9377, + "step": 54 + }, + { + "epoch": 0.026004728132387706, + "grad_norm": 3.849741220474243, + "learning_rate": 2.7500000000000004e-06, + "loss": 1.0046, + "step": 55 + }, + { + "epoch": 0.026477541371158392, + "grad_norm": 3.141876220703125, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.9226, + "step": 56 + }, + { + "epoch": 0.02695035460992908, + "grad_norm": 2.773594856262207, + "learning_rate": 2.85e-06, + "loss": 0.8662, + "step": 57 + }, + { + "epoch": 0.027423167848699765, + "grad_norm": 3.1460225582122803, + "learning_rate": 2.9e-06, + "loss": 0.9304, + "step": 58 + }, + { + "epoch": 0.027895981087470448, + "grad_norm": 3.293583631515503, + "learning_rate": 2.95e-06, + "loss": 1.0374, + "step": 59 + }, + { + "epoch": 0.028368794326241134, + "grad_norm": 3.8190863132476807, + "learning_rate": 3e-06, + "loss": 0.971, + "step": 60 + }, + { + "epoch": 0.02884160756501182, + "grad_norm": 3.4566776752471924, + "learning_rate": 3.05e-06, + "loss": 0.9631, + "step": 61 + }, + { + "epoch": 0.029314420803782507, + "grad_norm": 3.355741500854492, + "learning_rate": 3.1000000000000004e-06, + "loss": 1.0097, + "step": 62 + }, + { + "epoch": 0.029787234042553193, + "grad_norm": 3.29746675491333, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.9459, + "step": 63 + }, + { + "epoch": 0.030260047281323876, + "grad_norm": 3.3122968673706055, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.8594, + "step": 64 + }, + { + "epoch": 0.030732860520094562, + "grad_norm": 3.477701187133789, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.9197, + "step": 65 + }, + { + "epoch": 0.031205673758865248, + "grad_norm": 3.3363406658172607, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.9478, + "step": 66 + }, + { + "epoch": 0.03167848699763593, + "grad_norm": 4.143295764923096, + "learning_rate": 3.3500000000000005e-06, + "loss": 1.0534, + "step": 67 + }, + { + "epoch": 0.03215130023640662, + "grad_norm": 3.2363274097442627, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.9454, + "step": 68 + }, + { + "epoch": 0.032624113475177303, + "grad_norm": 3.198746681213379, + "learning_rate": 3.45e-06, + "loss": 0.9388, + "step": 69 + }, + { + "epoch": 0.03309692671394799, + "grad_norm": 3.5751023292541504, + "learning_rate": 3.5e-06, + "loss": 0.9444, + "step": 70 + }, + { + "epoch": 0.033569739952718676, + "grad_norm": 3.1745729446411133, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.8683, + "step": 71 + }, + { + "epoch": 0.03404255319148936, + "grad_norm": 3.3210883140563965, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.8811, + "step": 72 + }, + { + "epoch": 0.03451536643026005, + "grad_norm": 3.2502429485321045, + "learning_rate": 3.65e-06, + "loss": 1.0012, + "step": 73 + }, + { + "epoch": 0.034988179669030735, + "grad_norm": 3.44598126411438, + "learning_rate": 3.7e-06, + "loss": 0.9217, + "step": 74 + }, + { + "epoch": 0.03546099290780142, + "grad_norm": 3.439117431640625, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.8976, + "step": 75 + }, + { + "epoch": 0.03593380614657211, + "grad_norm": 3.523627758026123, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.8996, + "step": 76 + }, + { + "epoch": 0.03640661938534279, + "grad_norm": 3.3716015815734863, + "learning_rate": 3.85e-06, + "loss": 0.9061, + "step": 77 + }, + { + "epoch": 0.03687943262411347, + "grad_norm": 3.33518385887146, + "learning_rate": 3.900000000000001e-06, + "loss": 0.9371, + "step": 78 + }, + { + "epoch": 0.03735224586288416, + "grad_norm": 3.833829879760742, + "learning_rate": 3.95e-06, + "loss": 0.9669, + "step": 79 + }, + { + "epoch": 0.037825059101654845, + "grad_norm": 3.260446786880493, + "learning_rate": 4.000000000000001e-06, + "loss": 0.9449, + "step": 80 + }, + { + "epoch": 0.03829787234042553, + "grad_norm": 3.532451629638672, + "learning_rate": 4.05e-06, + "loss": 0.897, + "step": 81 + }, + { + "epoch": 0.03877068557919622, + "grad_norm": 3.1156492233276367, + "learning_rate": 4.1e-06, + "loss": 0.8463, + "step": 82 + }, + { + "epoch": 0.039243498817966904, + "grad_norm": 2.8801751136779785, + "learning_rate": 4.15e-06, + "loss": 0.8616, + "step": 83 + }, + { + "epoch": 0.03971631205673759, + "grad_norm": 3.072476863861084, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.8387, + "step": 84 + }, + { + "epoch": 0.04018912529550828, + "grad_norm": 2.9601376056671143, + "learning_rate": 4.25e-06, + "loss": 0.8538, + "step": 85 + }, + { + "epoch": 0.04066193853427896, + "grad_norm": 3.521664619445801, + "learning_rate": 4.3e-06, + "loss": 0.8894, + "step": 86 + }, + { + "epoch": 0.04113475177304964, + "grad_norm": 3.2670981884002686, + "learning_rate": 4.350000000000001e-06, + "loss": 0.8387, + "step": 87 + }, + { + "epoch": 0.04160756501182033, + "grad_norm": 3.422089099884033, + "learning_rate": 4.4e-06, + "loss": 0.7728, + "step": 88 + }, + { + "epoch": 0.042080378250591015, + "grad_norm": 3.414034128189087, + "learning_rate": 4.450000000000001e-06, + "loss": 0.7968, + "step": 89 + }, + { + "epoch": 0.0425531914893617, + "grad_norm": 4.234285354614258, + "learning_rate": 4.5e-06, + "loss": 0.8502, + "step": 90 + }, + { + "epoch": 0.04302600472813239, + "grad_norm": 3.1446919441223145, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.8236, + "step": 91 + }, + { + "epoch": 0.043498817966903074, + "grad_norm": 3.683443307876587, + "learning_rate": 4.600000000000001e-06, + "loss": 0.9792, + "step": 92 + }, + { + "epoch": 0.04397163120567376, + "grad_norm": 3.664219617843628, + "learning_rate": 4.65e-06, + "loss": 0.8743, + "step": 93 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 3.369479179382324, + "learning_rate": 4.7e-06, + "loss": 0.8741, + "step": 94 + }, + { + "epoch": 0.04491725768321513, + "grad_norm": 3.694949150085449, + "learning_rate": 4.75e-06, + "loss": 0.7574, + "step": 95 + }, + { + "epoch": 0.04539007092198582, + "grad_norm": 3.5144498348236084, + "learning_rate": 4.800000000000001e-06, + "loss": 0.9934, + "step": 96 + }, + { + "epoch": 0.0458628841607565, + "grad_norm": 3.164451837539673, + "learning_rate": 4.85e-06, + "loss": 0.7463, + "step": 97 + }, + { + "epoch": 0.046335697399527184, + "grad_norm": 3.222785472869873, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.7698, + "step": 98 + }, + { + "epoch": 0.04680851063829787, + "grad_norm": 2.9129555225372314, + "learning_rate": 4.95e-06, + "loss": 0.7856, + "step": 99 + }, + { + "epoch": 0.04728132387706856, + "grad_norm": 3.5061235427856445, + "learning_rate": 5e-06, + "loss": 0.8588, + "step": 100 + }, + { + "epoch": 0.04775413711583924, + "grad_norm": 3.2805044651031494, + "learning_rate": 4.999999922167982e-06, + "loss": 0.7643, + "step": 101 + }, + { + "epoch": 0.04822695035460993, + "grad_norm": 3.5461678504943848, + "learning_rate": 4.999999688671929e-06, + "loss": 0.8253, + "step": 102 + }, + { + "epoch": 0.048699763593380616, + "grad_norm": 3.2238264083862305, + "learning_rate": 4.99999929951186e-06, + "loss": 0.7622, + "step": 103 + }, + { + "epoch": 0.0491725768321513, + "grad_norm": 3.818955898284912, + "learning_rate": 4.999998754687795e-06, + "loss": 0.8471, + "step": 104 + }, + { + "epoch": 0.04964539007092199, + "grad_norm": 3.1252424716949463, + "learning_rate": 4.99999805419977e-06, + "loss": 0.8409, + "step": 105 + }, + { + "epoch": 0.050118203309692674, + "grad_norm": 3.604283571243286, + "learning_rate": 4.999997198047828e-06, + "loss": 0.9027, + "step": 106 + }, + { + "epoch": 0.050591016548463354, + "grad_norm": 3.6752424240112305, + "learning_rate": 4.999996186232023e-06, + "loss": 0.9336, + "step": 107 + }, + { + "epoch": 0.05106382978723404, + "grad_norm": 3.517557144165039, + "learning_rate": 4.9999950187524184e-06, + "loss": 0.8351, + "step": 108 + }, + { + "epoch": 0.051536643026004726, + "grad_norm": 3.427285671234131, + "learning_rate": 4.999993695609085e-06, + "loss": 0.8457, + "step": 109 + }, + { + "epoch": 0.05200945626477541, + "grad_norm": 3.2792510986328125, + "learning_rate": 4.999992216802107e-06, + "loss": 0.8391, + "step": 110 + }, + { + "epoch": 0.0524822695035461, + "grad_norm": 3.581094741821289, + "learning_rate": 4.999990582331576e-06, + "loss": 0.7533, + "step": 111 + }, + { + "epoch": 0.052955082742316785, + "grad_norm": 3.1667377948760986, + "learning_rate": 4.999988792197593e-06, + "loss": 0.9562, + "step": 112 + }, + { + "epoch": 0.05342789598108747, + "grad_norm": 3.3609890937805176, + "learning_rate": 4.99998684640027e-06, + "loss": 0.8181, + "step": 113 + }, + { + "epoch": 0.05390070921985816, + "grad_norm": 3.260627269744873, + "learning_rate": 4.999984744939729e-06, + "loss": 0.8012, + "step": 114 + }, + { + "epoch": 0.054373522458628844, + "grad_norm": 3.4535653591156006, + "learning_rate": 4.9999824878160985e-06, + "loss": 0.919, + "step": 115 + }, + { + "epoch": 0.05484633569739953, + "grad_norm": 3.4880740642547607, + "learning_rate": 4.999980075029522e-06, + "loss": 0.8114, + "step": 116 + }, + { + "epoch": 0.05531914893617021, + "grad_norm": 3.2546932697296143, + "learning_rate": 4.999977506580147e-06, + "loss": 0.8274, + "step": 117 + }, + { + "epoch": 0.055791962174940896, + "grad_norm": 3.2762744426727295, + "learning_rate": 4.999974782468136e-06, + "loss": 0.9018, + "step": 118 + }, + { + "epoch": 0.05626477541371158, + "grad_norm": 3.42825984954834, + "learning_rate": 4.999971902693657e-06, + "loss": 0.8262, + "step": 119 + }, + { + "epoch": 0.05673758865248227, + "grad_norm": 3.082496404647827, + "learning_rate": 4.99996886725689e-06, + "loss": 0.8181, + "step": 120 + }, + { + "epoch": 0.057210401891252954, + "grad_norm": 3.322869300842285, + "learning_rate": 4.9999656761580225e-06, + "loss": 0.8382, + "step": 121 + }, + { + "epoch": 0.05768321513002364, + "grad_norm": 3.6365339756011963, + "learning_rate": 4.9999623293972555e-06, + "loss": 0.7489, + "step": 122 + }, + { + "epoch": 0.05815602836879433, + "grad_norm": 3.376352548599243, + "learning_rate": 4.999958826974796e-06, + "loss": 0.9012, + "step": 123 + }, + { + "epoch": 0.05862884160756501, + "grad_norm": 3.49088716506958, + "learning_rate": 4.999955168890862e-06, + "loss": 0.8999, + "step": 124 + }, + { + "epoch": 0.0591016548463357, + "grad_norm": 3.3265068531036377, + "learning_rate": 4.999951355145682e-06, + "loss": 0.8161, + "step": 125 + }, + { + "epoch": 0.059574468085106386, + "grad_norm": 3.697282314300537, + "learning_rate": 4.999947385739493e-06, + "loss": 0.9623, + "step": 126 + }, + { + "epoch": 0.06004728132387707, + "grad_norm": 2.7901928424835205, + "learning_rate": 4.999943260672542e-06, + "loss": 0.7371, + "step": 127 + }, + { + "epoch": 0.06052009456264775, + "grad_norm": 3.110319137573242, + "learning_rate": 4.999938979945086e-06, + "loss": 0.715, + "step": 128 + }, + { + "epoch": 0.06099290780141844, + "grad_norm": 3.2211520671844482, + "learning_rate": 4.999934543557392e-06, + "loss": 0.8888, + "step": 129 + }, + { + "epoch": 0.061465721040189124, + "grad_norm": 3.2466187477111816, + "learning_rate": 4.999929951509735e-06, + "loss": 0.9389, + "step": 130 + }, + { + "epoch": 0.06193853427895981, + "grad_norm": 3.3574399948120117, + "learning_rate": 4.999925203802403e-06, + "loss": 0.8263, + "step": 131 + }, + { + "epoch": 0.062411347517730496, + "grad_norm": 3.275601625442505, + "learning_rate": 4.99992030043569e-06, + "loss": 0.8338, + "step": 132 + }, + { + "epoch": 0.06288416075650118, + "grad_norm": 3.6011312007904053, + "learning_rate": 4.999915241409902e-06, + "loss": 0.8351, + "step": 133 + }, + { + "epoch": 0.06335697399527186, + "grad_norm": 2.969011068344116, + "learning_rate": 4.999910026725352e-06, + "loss": 0.79, + "step": 134 + }, + { + "epoch": 0.06382978723404255, + "grad_norm": 3.690784454345703, + "learning_rate": 4.999904656382369e-06, + "loss": 0.8209, + "step": 135 + }, + { + "epoch": 0.06430260047281323, + "grad_norm": 3.3363115787506104, + "learning_rate": 4.999899130381283e-06, + "loss": 0.858, + "step": 136 + }, + { + "epoch": 0.06477541371158392, + "grad_norm": 3.206881523132324, + "learning_rate": 4.9998934487224405e-06, + "loss": 0.834, + "step": 137 + }, + { + "epoch": 0.06524822695035461, + "grad_norm": 2.773146152496338, + "learning_rate": 4.999887611406195e-06, + "loss": 0.7576, + "step": 138 + }, + { + "epoch": 0.0657210401891253, + "grad_norm": 3.307725667953491, + "learning_rate": 4.999881618432908e-06, + "loss": 0.7487, + "step": 139 + }, + { + "epoch": 0.06619385342789598, + "grad_norm": 4.273657321929932, + "learning_rate": 4.999875469802956e-06, + "loss": 0.8176, + "step": 140 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 3.0898005962371826, + "learning_rate": 4.999869165516719e-06, + "loss": 0.7578, + "step": 141 + }, + { + "epoch": 0.06713947990543735, + "grad_norm": 3.25150990486145, + "learning_rate": 4.9998627055745915e-06, + "loss": 0.7873, + "step": 142 + }, + { + "epoch": 0.06761229314420804, + "grad_norm": 2.9705755710601807, + "learning_rate": 4.999856089976974e-06, + "loss": 0.6473, + "step": 143 + }, + { + "epoch": 0.06808510638297872, + "grad_norm": 3.5658507347106934, + "learning_rate": 4.9998493187242804e-06, + "loss": 0.855, + "step": 144 + }, + { + "epoch": 0.06855791962174941, + "grad_norm": 3.3994076251983643, + "learning_rate": 4.99984239181693e-06, + "loss": 0.7926, + "step": 145 + }, + { + "epoch": 0.0690307328605201, + "grad_norm": 2.8266260623931885, + "learning_rate": 4.999835309255357e-06, + "loss": 0.7564, + "step": 146 + }, + { + "epoch": 0.06950354609929078, + "grad_norm": 3.1143875122070312, + "learning_rate": 4.999828071039999e-06, + "loss": 0.8398, + "step": 147 + }, + { + "epoch": 0.06997635933806147, + "grad_norm": 2.9364278316497803, + "learning_rate": 4.99982067717131e-06, + "loss": 0.7381, + "step": 148 + }, + { + "epoch": 0.07044917257683216, + "grad_norm": 3.4155616760253906, + "learning_rate": 4.999813127649748e-06, + "loss": 0.7933, + "step": 149 + }, + { + "epoch": 0.07092198581560284, + "grad_norm": 4.371236324310303, + "learning_rate": 4.999805422475784e-06, + "loss": 0.8292, + "step": 150 + }, + { + "epoch": 0.07139479905437353, + "grad_norm": 3.3967185020446777, + "learning_rate": 4.999797561649897e-06, + "loss": 0.8712, + "step": 151 + }, + { + "epoch": 0.07186761229314421, + "grad_norm": 3.343303680419922, + "learning_rate": 4.999789545172578e-06, + "loss": 0.8177, + "step": 152 + }, + { + "epoch": 0.07234042553191489, + "grad_norm": 3.040235757827759, + "learning_rate": 4.999781373044325e-06, + "loss": 0.7379, + "step": 153 + }, + { + "epoch": 0.07281323877068557, + "grad_norm": 3.4069204330444336, + "learning_rate": 4.999773045265647e-06, + "loss": 0.7939, + "step": 154 + }, + { + "epoch": 0.07328605200945626, + "grad_norm": 3.1939475536346436, + "learning_rate": 4.999764561837063e-06, + "loss": 0.8037, + "step": 155 + }, + { + "epoch": 0.07375886524822695, + "grad_norm": 4.452004909515381, + "learning_rate": 4.999755922759101e-06, + "loss": 0.8421, + "step": 156 + }, + { + "epoch": 0.07423167848699763, + "grad_norm": 3.2031240463256836, + "learning_rate": 4.999747128032298e-06, + "loss": 0.794, + "step": 157 + }, + { + "epoch": 0.07470449172576832, + "grad_norm": 3.175920009613037, + "learning_rate": 4.999738177657203e-06, + "loss": 0.759, + "step": 158 + }, + { + "epoch": 0.075177304964539, + "grad_norm": 3.7679688930511475, + "learning_rate": 4.9997290716343725e-06, + "loss": 0.8174, + "step": 159 + }, + { + "epoch": 0.07565011820330969, + "grad_norm": 3.7020037174224854, + "learning_rate": 4.999719809964373e-06, + "loss": 0.7116, + "step": 160 + }, + { + "epoch": 0.07612293144208038, + "grad_norm": 4.357471942901611, + "learning_rate": 4.999710392647783e-06, + "loss": 0.7649, + "step": 161 + }, + { + "epoch": 0.07659574468085106, + "grad_norm": 3.3439087867736816, + "learning_rate": 4.999700819685187e-06, + "loss": 0.7907, + "step": 162 + }, + { + "epoch": 0.07706855791962175, + "grad_norm": 3.210815191268921, + "learning_rate": 4.999691091077182e-06, + "loss": 0.8446, + "step": 163 + }, + { + "epoch": 0.07754137115839244, + "grad_norm": 3.1029553413391113, + "learning_rate": 4.9996812068243735e-06, + "loss": 0.7232, + "step": 164 + }, + { + "epoch": 0.07801418439716312, + "grad_norm": 2.9389400482177734, + "learning_rate": 4.999671166927378e-06, + "loss": 0.7413, + "step": 165 + }, + { + "epoch": 0.07848699763593381, + "grad_norm": 3.7062697410583496, + "learning_rate": 4.9996609713868185e-06, + "loss": 0.8773, + "step": 166 + }, + { + "epoch": 0.0789598108747045, + "grad_norm": 3.2768924236297607, + "learning_rate": 4.999650620203332e-06, + "loss": 0.8046, + "step": 167 + }, + { + "epoch": 0.07943262411347518, + "grad_norm": 3.380373001098633, + "learning_rate": 4.999640113377561e-06, + "loss": 0.7529, + "step": 168 + }, + { + "epoch": 0.07990543735224587, + "grad_norm": 3.520022392272949, + "learning_rate": 4.999629450910162e-06, + "loss": 0.7352, + "step": 169 + }, + { + "epoch": 0.08037825059101655, + "grad_norm": 3.43269419670105, + "learning_rate": 4.999618632801796e-06, + "loss": 0.9371, + "step": 170 + }, + { + "epoch": 0.08085106382978724, + "grad_norm": 3.555877923965454, + "learning_rate": 4.99960765905314e-06, + "loss": 0.8276, + "step": 171 + }, + { + "epoch": 0.08132387706855793, + "grad_norm": 3.597050189971924, + "learning_rate": 4.999596529664874e-06, + "loss": 0.8164, + "step": 172 + }, + { + "epoch": 0.0817966903073286, + "grad_norm": 3.2002956867218018, + "learning_rate": 4.999585244637693e-06, + "loss": 0.7824, + "step": 173 + }, + { + "epoch": 0.08226950354609928, + "grad_norm": 3.527275562286377, + "learning_rate": 4.999573803972299e-06, + "loss": 0.8033, + "step": 174 + }, + { + "epoch": 0.08274231678486997, + "grad_norm": 3.5184452533721924, + "learning_rate": 4.999562207669405e-06, + "loss": 0.724, + "step": 175 + }, + { + "epoch": 0.08321513002364066, + "grad_norm": 3.6635067462921143, + "learning_rate": 4.999550455729732e-06, + "loss": 0.819, + "step": 176 + }, + { + "epoch": 0.08368794326241134, + "grad_norm": 3.192399740219116, + "learning_rate": 4.999538548154012e-06, + "loss": 0.7999, + "step": 177 + }, + { + "epoch": 0.08416075650118203, + "grad_norm": 3.0946953296661377, + "learning_rate": 4.999526484942988e-06, + "loss": 0.7367, + "step": 178 + }, + { + "epoch": 0.08463356973995272, + "grad_norm": 2.847198009490967, + "learning_rate": 4.99951426609741e-06, + "loss": 0.7536, + "step": 179 + }, + { + "epoch": 0.0851063829787234, + "grad_norm": 2.7674827575683594, + "learning_rate": 4.999501891618037e-06, + "loss": 0.701, + "step": 180 + }, + { + "epoch": 0.08557919621749409, + "grad_norm": 3.357933521270752, + "learning_rate": 4.999489361505643e-06, + "loss": 0.8331, + "step": 181 + }, + { + "epoch": 0.08605200945626477, + "grad_norm": 3.1464426517486572, + "learning_rate": 4.999476675761004e-06, + "loss": 0.7931, + "step": 182 + }, + { + "epoch": 0.08652482269503546, + "grad_norm": 3.310697078704834, + "learning_rate": 4.999463834384915e-06, + "loss": 0.753, + "step": 183 + }, + { + "epoch": 0.08699763593380615, + "grad_norm": 2.9794881343841553, + "learning_rate": 4.999450837378171e-06, + "loss": 0.7091, + "step": 184 + }, + { + "epoch": 0.08747044917257683, + "grad_norm": 3.0776889324188232, + "learning_rate": 4.999437684741584e-06, + "loss": 0.7226, + "step": 185 + }, + { + "epoch": 0.08794326241134752, + "grad_norm": 3.6657519340515137, + "learning_rate": 4.999424376475972e-06, + "loss": 0.845, + "step": 186 + }, + { + "epoch": 0.0884160756501182, + "grad_norm": 3.872718572616577, + "learning_rate": 4.999410912582164e-06, + "loss": 0.812, + "step": 187 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 2.9184508323669434, + "learning_rate": 4.9993972930609976e-06, + "loss": 0.6823, + "step": 188 + }, + { + "epoch": 0.08936170212765958, + "grad_norm": 3.5567142963409424, + "learning_rate": 4.999383517913321e-06, + "loss": 0.7614, + "step": 189 + }, + { + "epoch": 0.08983451536643026, + "grad_norm": 3.3688533306121826, + "learning_rate": 4.999369587139992e-06, + "loss": 0.858, + "step": 190 + }, + { + "epoch": 0.09030732860520095, + "grad_norm": 2.893223524093628, + "learning_rate": 4.99935550074188e-06, + "loss": 0.6761, + "step": 191 + }, + { + "epoch": 0.09078014184397164, + "grad_norm": 3.400225877761841, + "learning_rate": 4.999341258719859e-06, + "loss": 0.7531, + "step": 192 + }, + { + "epoch": 0.09125295508274232, + "grad_norm": 3.6167714595794678, + "learning_rate": 4.999326861074817e-06, + "loss": 0.8164, + "step": 193 + }, + { + "epoch": 0.091725768321513, + "grad_norm": 4.325016498565674, + "learning_rate": 4.9993123078076506e-06, + "loss": 0.7069, + "step": 194 + }, + { + "epoch": 0.09219858156028368, + "grad_norm": 3.195317029953003, + "learning_rate": 4.999297598919266e-06, + "loss": 0.726, + "step": 195 + }, + { + "epoch": 0.09267139479905437, + "grad_norm": 3.146530866622925, + "learning_rate": 4.999282734410579e-06, + "loss": 0.7888, + "step": 196 + }, + { + "epoch": 0.09314420803782505, + "grad_norm": 3.5166752338409424, + "learning_rate": 4.999267714282515e-06, + "loss": 0.8473, + "step": 197 + }, + { + "epoch": 0.09361702127659574, + "grad_norm": 3.3140196800231934, + "learning_rate": 4.99925253853601e-06, + "loss": 0.7233, + "step": 198 + }, + { + "epoch": 0.09408983451536643, + "grad_norm": 3.0318164825439453, + "learning_rate": 4.999237207172008e-06, + "loss": 0.7543, + "step": 199 + }, + { + "epoch": 0.09456264775413711, + "grad_norm": 3.662214756011963, + "learning_rate": 4.999221720191464e-06, + "loss": 0.7783, + "step": 200 + }, + { + "epoch": 0.0950354609929078, + "grad_norm": 3.452078104019165, + "learning_rate": 4.9992060775953425e-06, + "loss": 0.7868, + "step": 201 + }, + { + "epoch": 0.09550827423167849, + "grad_norm": 3.4051287174224854, + "learning_rate": 4.999190279384617e-06, + "loss": 0.7849, + "step": 202 + }, + { + "epoch": 0.09598108747044917, + "grad_norm": 3.1377196311950684, + "learning_rate": 4.999174325560271e-06, + "loss": 0.8364, + "step": 203 + }, + { + "epoch": 0.09645390070921986, + "grad_norm": 3.129473924636841, + "learning_rate": 4.999158216123299e-06, + "loss": 0.7458, + "step": 204 + }, + { + "epoch": 0.09692671394799054, + "grad_norm": 3.169548749923706, + "learning_rate": 4.999141951074703e-06, + "loss": 0.7256, + "step": 205 + }, + { + "epoch": 0.09739952718676123, + "grad_norm": 3.186009168624878, + "learning_rate": 4.999125530415495e-06, + "loss": 0.783, + "step": 206 + }, + { + "epoch": 0.09787234042553192, + "grad_norm": 3.0995123386383057, + "learning_rate": 4.9991089541467e-06, + "loss": 0.7519, + "step": 207 + }, + { + "epoch": 0.0983451536643026, + "grad_norm": 3.1854088306427, + "learning_rate": 4.999092222269348e-06, + "loss": 0.7444, + "step": 208 + }, + { + "epoch": 0.09881796690307329, + "grad_norm": 3.1512246131896973, + "learning_rate": 4.999075334784482e-06, + "loss": 0.7882, + "step": 209 + }, + { + "epoch": 0.09929078014184398, + "grad_norm": 3.6199698448181152, + "learning_rate": 4.999058291693153e-06, + "loss": 0.8048, + "step": 210 + }, + { + "epoch": 0.09976359338061466, + "grad_norm": 2.956907272338867, + "learning_rate": 4.999041092996422e-06, + "loss": 0.7663, + "step": 211 + }, + { + "epoch": 0.10023640661938535, + "grad_norm": 3.3493971824645996, + "learning_rate": 4.99902373869536e-06, + "loss": 0.7639, + "step": 212 + }, + { + "epoch": 0.10070921985815603, + "grad_norm": 3.144812822341919, + "learning_rate": 4.9990062287910475e-06, + "loss": 0.7953, + "step": 213 + }, + { + "epoch": 0.10118203309692671, + "grad_norm": 3.5986971855163574, + "learning_rate": 4.998988563284576e-06, + "loss": 0.8297, + "step": 214 + }, + { + "epoch": 0.1016548463356974, + "grad_norm": 3.447584867477417, + "learning_rate": 4.998970742177044e-06, + "loss": 0.808, + "step": 215 + }, + { + "epoch": 0.10212765957446808, + "grad_norm": 3.791353940963745, + "learning_rate": 4.998952765469562e-06, + "loss": 0.8005, + "step": 216 + }, + { + "epoch": 0.10260047281323877, + "grad_norm": 3.4490807056427, + "learning_rate": 4.998934633163247e-06, + "loss": 0.8135, + "step": 217 + }, + { + "epoch": 0.10307328605200945, + "grad_norm": 3.1053314208984375, + "learning_rate": 4.998916345259232e-06, + "loss": 0.7888, + "step": 218 + }, + { + "epoch": 0.10354609929078014, + "grad_norm": 3.407862663269043, + "learning_rate": 4.9988979017586514e-06, + "loss": 0.7099, + "step": 219 + }, + { + "epoch": 0.10401891252955082, + "grad_norm": 3.116656541824341, + "learning_rate": 4.998879302662658e-06, + "loss": 0.8344, + "step": 220 + }, + { + "epoch": 0.10449172576832151, + "grad_norm": 3.339264154434204, + "learning_rate": 4.998860547972406e-06, + "loss": 0.8496, + "step": 221 + }, + { + "epoch": 0.1049645390070922, + "grad_norm": 3.251892566680908, + "learning_rate": 4.998841637689066e-06, + "loss": 0.7455, + "step": 222 + }, + { + "epoch": 0.10543735224586288, + "grad_norm": 4.098135471343994, + "learning_rate": 4.998822571813814e-06, + "loss": 0.7772, + "step": 223 + }, + { + "epoch": 0.10591016548463357, + "grad_norm": 3.9871134757995605, + "learning_rate": 4.998803350347837e-06, + "loss": 0.8261, + "step": 224 + }, + { + "epoch": 0.10638297872340426, + "grad_norm": 3.2822303771972656, + "learning_rate": 4.998783973292333e-06, + "loss": 0.8623, + "step": 225 + }, + { + "epoch": 0.10685579196217494, + "grad_norm": 3.0356857776641846, + "learning_rate": 4.998764440648507e-06, + "loss": 0.7426, + "step": 226 + }, + { + "epoch": 0.10732860520094563, + "grad_norm": 2.8932785987854004, + "learning_rate": 4.998744752417576e-06, + "loss": 0.6741, + "step": 227 + }, + { + "epoch": 0.10780141843971631, + "grad_norm": 3.085820436477661, + "learning_rate": 4.998724908600767e-06, + "loss": 0.6549, + "step": 228 + }, + { + "epoch": 0.108274231678487, + "grad_norm": 3.135829210281372, + "learning_rate": 4.998704909199314e-06, + "loss": 0.6702, + "step": 229 + }, + { + "epoch": 0.10874704491725769, + "grad_norm": 5.016134262084961, + "learning_rate": 4.9986847542144625e-06, + "loss": 0.7852, + "step": 230 + }, + { + "epoch": 0.10921985815602837, + "grad_norm": 3.9056200981140137, + "learning_rate": 4.998664443647468e-06, + "loss": 0.9654, + "step": 231 + }, + { + "epoch": 0.10969267139479906, + "grad_norm": 3.0880749225616455, + "learning_rate": 4.998643977499595e-06, + "loss": 0.7579, + "step": 232 + }, + { + "epoch": 0.11016548463356975, + "grad_norm": 3.6893601417541504, + "learning_rate": 4.998623355772118e-06, + "loss": 0.713, + "step": 233 + }, + { + "epoch": 0.11063829787234042, + "grad_norm": 4.181536674499512, + "learning_rate": 4.998602578466319e-06, + "loss": 0.7331, + "step": 234 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 3.036386728286743, + "learning_rate": 4.998581645583496e-06, + "loss": 0.7115, + "step": 235 + }, + { + "epoch": 0.11158392434988179, + "grad_norm": 3.6333255767822266, + "learning_rate": 4.998560557124948e-06, + "loss": 0.7544, + "step": 236 + }, + { + "epoch": 0.11205673758865248, + "grad_norm": 2.926417827606201, + "learning_rate": 4.9985393130919915e-06, + "loss": 0.715, + "step": 237 + }, + { + "epoch": 0.11252955082742316, + "grad_norm": 2.969158172607422, + "learning_rate": 4.998517913485946e-06, + "loss": 0.7304, + "step": 238 + }, + { + "epoch": 0.11300236406619385, + "grad_norm": 3.5254971981048584, + "learning_rate": 4.9984963583081466e-06, + "loss": 0.7725, + "step": 239 + }, + { + "epoch": 0.11347517730496454, + "grad_norm": 3.7840335369110107, + "learning_rate": 4.998474647559936e-06, + "loss": 0.8685, + "step": 240 + }, + { + "epoch": 0.11394799054373522, + "grad_norm": 3.0333125591278076, + "learning_rate": 4.9984527812426625e-06, + "loss": 0.7793, + "step": 241 + }, + { + "epoch": 0.11442080378250591, + "grad_norm": 3.290159225463867, + "learning_rate": 4.99843075935769e-06, + "loss": 0.7158, + "step": 242 + }, + { + "epoch": 0.1148936170212766, + "grad_norm": 3.3935494422912598, + "learning_rate": 4.99840858190639e-06, + "loss": 0.7643, + "step": 243 + }, + { + "epoch": 0.11536643026004728, + "grad_norm": 3.333965539932251, + "learning_rate": 4.998386248890142e-06, + "loss": 0.7255, + "step": 244 + }, + { + "epoch": 0.11583924349881797, + "grad_norm": 2.8129613399505615, + "learning_rate": 4.998363760310339e-06, + "loss": 0.768, + "step": 245 + }, + { + "epoch": 0.11631205673758865, + "grad_norm": 2.8678107261657715, + "learning_rate": 4.998341116168378e-06, + "loss": 0.7403, + "step": 246 + }, + { + "epoch": 0.11678486997635934, + "grad_norm": 2.8898239135742188, + "learning_rate": 4.998318316465672e-06, + "loss": 0.6844, + "step": 247 + }, + { + "epoch": 0.11725768321513003, + "grad_norm": 3.139777898788452, + "learning_rate": 4.998295361203637e-06, + "loss": 0.7936, + "step": 248 + }, + { + "epoch": 0.11773049645390071, + "grad_norm": 3.393721103668213, + "learning_rate": 4.998272250383707e-06, + "loss": 0.8173, + "step": 249 + }, + { + "epoch": 0.1182033096926714, + "grad_norm": 3.240973949432373, + "learning_rate": 4.998248984007318e-06, + "loss": 0.8252, + "step": 250 + }, + { + "epoch": 0.11867612293144209, + "grad_norm": 3.384855031967163, + "learning_rate": 4.998225562075918e-06, + "loss": 0.7244, + "step": 251 + }, + { + "epoch": 0.11914893617021277, + "grad_norm": 3.1881816387176514, + "learning_rate": 4.9982019845909675e-06, + "loss": 0.6818, + "step": 252 + }, + { + "epoch": 0.11962174940898346, + "grad_norm": 2.888364553451538, + "learning_rate": 4.998178251553934e-06, + "loss": 0.6753, + "step": 253 + }, + { + "epoch": 0.12009456264775414, + "grad_norm": 3.630093812942505, + "learning_rate": 4.9981543629662944e-06, + "loss": 0.7995, + "step": 254 + }, + { + "epoch": 0.12056737588652482, + "grad_norm": 2.9820947647094727, + "learning_rate": 4.998130318829537e-06, + "loss": 0.7478, + "step": 255 + }, + { + "epoch": 0.1210401891252955, + "grad_norm": 2.7094738483428955, + "learning_rate": 4.998106119145159e-06, + "loss": 0.7237, + "step": 256 + }, + { + "epoch": 0.12151300236406619, + "grad_norm": 3.1808104515075684, + "learning_rate": 4.9980817639146665e-06, + "loss": 0.7915, + "step": 257 + }, + { + "epoch": 0.12198581560283688, + "grad_norm": 3.1661291122436523, + "learning_rate": 4.998057253139575e-06, + "loss": 0.8053, + "step": 258 + }, + { + "epoch": 0.12245862884160756, + "grad_norm": 3.528749942779541, + "learning_rate": 4.998032586821413e-06, + "loss": 0.7946, + "step": 259 + }, + { + "epoch": 0.12293144208037825, + "grad_norm": 3.125964879989624, + "learning_rate": 4.998007764961716e-06, + "loss": 0.7569, + "step": 260 + }, + { + "epoch": 0.12340425531914893, + "grad_norm": 3.0778942108154297, + "learning_rate": 4.997982787562029e-06, + "loss": 0.7184, + "step": 261 + }, + { + "epoch": 0.12387706855791962, + "grad_norm": 3.3531930446624756, + "learning_rate": 4.997957654623906e-06, + "loss": 0.7586, + "step": 262 + }, + { + "epoch": 0.1243498817966903, + "grad_norm": 3.229278564453125, + "learning_rate": 4.997932366148913e-06, + "loss": 0.6092, + "step": 263 + }, + { + "epoch": 0.12482269503546099, + "grad_norm": 3.7286155223846436, + "learning_rate": 4.997906922138626e-06, + "loss": 0.7965, + "step": 264 + }, + { + "epoch": 0.12529550827423167, + "grad_norm": 3.300311803817749, + "learning_rate": 4.997881322594628e-06, + "loss": 0.7665, + "step": 265 + }, + { + "epoch": 0.12576832151300235, + "grad_norm": 3.411482572555542, + "learning_rate": 4.9978555675185115e-06, + "loss": 0.7253, + "step": 266 + }, + { + "epoch": 0.12624113475177304, + "grad_norm": 3.0884511470794678, + "learning_rate": 4.9978296569118825e-06, + "loss": 0.659, + "step": 267 + }, + { + "epoch": 0.12671394799054372, + "grad_norm": 3.0652925968170166, + "learning_rate": 4.9978035907763535e-06, + "loss": 0.6739, + "step": 268 + }, + { + "epoch": 0.1271867612293144, + "grad_norm": 3.280555009841919, + "learning_rate": 4.997777369113547e-06, + "loss": 0.8003, + "step": 269 + }, + { + "epoch": 0.1276595744680851, + "grad_norm": 2.980860948562622, + "learning_rate": 4.997750991925096e-06, + "loss": 0.7097, + "step": 270 + }, + { + "epoch": 0.12813238770685578, + "grad_norm": 3.301760673522949, + "learning_rate": 4.997724459212644e-06, + "loss": 0.7894, + "step": 271 + }, + { + "epoch": 0.12860520094562647, + "grad_norm": 2.9584903717041016, + "learning_rate": 4.997697770977841e-06, + "loss": 0.733, + "step": 272 + }, + { + "epoch": 0.12907801418439716, + "grad_norm": 3.5632214546203613, + "learning_rate": 4.99767092722235e-06, + "loss": 0.7228, + "step": 273 + }, + { + "epoch": 0.12955082742316784, + "grad_norm": 3.5900983810424805, + "learning_rate": 4.997643927947843e-06, + "loss": 0.7634, + "step": 274 + }, + { + "epoch": 0.13002364066193853, + "grad_norm": 3.332650661468506, + "learning_rate": 4.997616773156e-06, + "loss": 0.797, + "step": 275 + }, + { + "epoch": 0.13049645390070921, + "grad_norm": 3.1094167232513428, + "learning_rate": 4.997589462848512e-06, + "loss": 0.7849, + "step": 276 + }, + { + "epoch": 0.1309692671394799, + "grad_norm": 3.5359463691711426, + "learning_rate": 4.99756199702708e-06, + "loss": 0.6871, + "step": 277 + }, + { + "epoch": 0.1314420803782506, + "grad_norm": 3.190441846847534, + "learning_rate": 4.997534375693414e-06, + "loss": 0.6883, + "step": 278 + }, + { + "epoch": 0.13191489361702127, + "grad_norm": 3.063518762588501, + "learning_rate": 4.997506598849234e-06, + "loss": 0.7586, + "step": 279 + }, + { + "epoch": 0.13238770685579196, + "grad_norm": 3.4112050533294678, + "learning_rate": 4.997478666496269e-06, + "loss": 0.796, + "step": 280 + }, + { + "epoch": 0.13286052009456265, + "grad_norm": 3.231886386871338, + "learning_rate": 4.997450578636259e-06, + "loss": 0.7714, + "step": 281 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 3.279425621032715, + "learning_rate": 4.9974223352709515e-06, + "loss": 0.7793, + "step": 282 + }, + { + "epoch": 0.13380614657210402, + "grad_norm": 3.2154316902160645, + "learning_rate": 4.9973939364021075e-06, + "loss": 0.791, + "step": 283 + }, + { + "epoch": 0.1342789598108747, + "grad_norm": 3.2090768814086914, + "learning_rate": 4.9973653820314925e-06, + "loss": 0.6433, + "step": 284 + }, + { + "epoch": 0.1347517730496454, + "grad_norm": 3.1712026596069336, + "learning_rate": 4.997336672160886e-06, + "loss": 0.8128, + "step": 285 + }, + { + "epoch": 0.13522458628841608, + "grad_norm": 2.929229497909546, + "learning_rate": 4.997307806792076e-06, + "loss": 0.7594, + "step": 286 + }, + { + "epoch": 0.13569739952718676, + "grad_norm": 3.0363314151763916, + "learning_rate": 4.997278785926859e-06, + "loss": 0.7336, + "step": 287 + }, + { + "epoch": 0.13617021276595745, + "grad_norm": 3.1352357864379883, + "learning_rate": 4.997249609567042e-06, + "loss": 0.7225, + "step": 288 + }, + { + "epoch": 0.13664302600472814, + "grad_norm": 3.3171157836914062, + "learning_rate": 4.997220277714442e-06, + "loss": 0.7777, + "step": 289 + }, + { + "epoch": 0.13711583924349882, + "grad_norm": 3.050717353820801, + "learning_rate": 4.997190790370885e-06, + "loss": 0.6836, + "step": 290 + }, + { + "epoch": 0.1375886524822695, + "grad_norm": 3.0297694206237793, + "learning_rate": 4.997161147538208e-06, + "loss": 0.6883, + "step": 291 + }, + { + "epoch": 0.1380614657210402, + "grad_norm": 3.0566554069519043, + "learning_rate": 4.997131349218256e-06, + "loss": 0.6674, + "step": 292 + }, + { + "epoch": 0.13853427895981088, + "grad_norm": 3.799111843109131, + "learning_rate": 4.997101395412885e-06, + "loss": 0.8256, + "step": 293 + }, + { + "epoch": 0.13900709219858157, + "grad_norm": 3.1394248008728027, + "learning_rate": 4.9970712861239576e-06, + "loss": 0.7306, + "step": 294 + }, + { + "epoch": 0.13947990543735225, + "grad_norm": 3.0605666637420654, + "learning_rate": 4.997041021353352e-06, + "loss": 0.7212, + "step": 295 + }, + { + "epoch": 0.13995271867612294, + "grad_norm": 3.8813397884368896, + "learning_rate": 4.997010601102951e-06, + "loss": 0.769, + "step": 296 + }, + { + "epoch": 0.14042553191489363, + "grad_norm": 3.0514819622039795, + "learning_rate": 4.996980025374649e-06, + "loss": 0.7422, + "step": 297 + }, + { + "epoch": 0.1408983451536643, + "grad_norm": 2.9544146060943604, + "learning_rate": 4.99694929417035e-06, + "loss": 0.6912, + "step": 298 + }, + { + "epoch": 0.141371158392435, + "grad_norm": 3.2635602951049805, + "learning_rate": 4.996918407491966e-06, + "loss": 0.7395, + "step": 299 + }, + { + "epoch": 0.14184397163120568, + "grad_norm": 3.373882532119751, + "learning_rate": 4.996887365341423e-06, + "loss": 0.7799, + "step": 300 + }, + { + "epoch": 0.14231678486997637, + "grad_norm": 3.001128673553467, + "learning_rate": 4.996856167720652e-06, + "loss": 0.7168, + "step": 301 + }, + { + "epoch": 0.14278959810874706, + "grad_norm": 3.1026835441589355, + "learning_rate": 4.996824814631595e-06, + "loss": 0.7492, + "step": 302 + }, + { + "epoch": 0.14326241134751774, + "grad_norm": 3.41947603225708, + "learning_rate": 4.996793306076205e-06, + "loss": 0.6659, + "step": 303 + }, + { + "epoch": 0.14373522458628843, + "grad_norm": 3.2272400856018066, + "learning_rate": 4.996761642056444e-06, + "loss": 0.7184, + "step": 304 + }, + { + "epoch": 0.14420803782505912, + "grad_norm": 2.9488935470581055, + "learning_rate": 4.996729822574284e-06, + "loss": 0.7451, + "step": 305 + }, + { + "epoch": 0.14468085106382977, + "grad_norm": 3.268231153488159, + "learning_rate": 4.9966978476317065e-06, + "loss": 0.7798, + "step": 306 + }, + { + "epoch": 0.14515366430260046, + "grad_norm": 3.9086556434631348, + "learning_rate": 4.996665717230701e-06, + "loss": 0.7871, + "step": 307 + }, + { + "epoch": 0.14562647754137115, + "grad_norm": 3.3483879566192627, + "learning_rate": 4.996633431373269e-06, + "loss": 0.7415, + "step": 308 + }, + { + "epoch": 0.14609929078014183, + "grad_norm": 2.839400053024292, + "learning_rate": 4.99660099006142e-06, + "loss": 0.7192, + "step": 309 + }, + { + "epoch": 0.14657210401891252, + "grad_norm": 3.177302598953247, + "learning_rate": 4.996568393297175e-06, + "loss": 0.755, + "step": 310 + }, + { + "epoch": 0.1470449172576832, + "grad_norm": 3.5477044582366943, + "learning_rate": 4.996535641082563e-06, + "loss": 0.7531, + "step": 311 + }, + { + "epoch": 0.1475177304964539, + "grad_norm": 3.418576717376709, + "learning_rate": 4.996502733419624e-06, + "loss": 0.8009, + "step": 312 + }, + { + "epoch": 0.14799054373522458, + "grad_norm": 3.711341619491577, + "learning_rate": 4.996469670310407e-06, + "loss": 0.7362, + "step": 313 + }, + { + "epoch": 0.14846335697399526, + "grad_norm": 3.2419373989105225, + "learning_rate": 4.99643645175697e-06, + "loss": 0.7761, + "step": 314 + }, + { + "epoch": 0.14893617021276595, + "grad_norm": 3.121858835220337, + "learning_rate": 4.996403077761381e-06, + "loss": 0.6495, + "step": 315 + }, + { + "epoch": 0.14940898345153664, + "grad_norm": 3.123054265975952, + "learning_rate": 4.996369548325719e-06, + "loss": 0.7444, + "step": 316 + }, + { + "epoch": 0.14988179669030732, + "grad_norm": 2.780880928039551, + "learning_rate": 4.996335863452072e-06, + "loss": 0.672, + "step": 317 + }, + { + "epoch": 0.150354609929078, + "grad_norm": 3.3738629817962646, + "learning_rate": 4.996302023142536e-06, + "loss": 0.7972, + "step": 318 + }, + { + "epoch": 0.1508274231678487, + "grad_norm": 3.4874777793884277, + "learning_rate": 4.99626802739922e-06, + "loss": 0.8252, + "step": 319 + }, + { + "epoch": 0.15130023640661938, + "grad_norm": 3.7074787616729736, + "learning_rate": 4.9962338762242395e-06, + "loss": 0.8216, + "step": 320 + }, + { + "epoch": 0.15177304964539007, + "grad_norm": 3.281912326812744, + "learning_rate": 4.996199569619721e-06, + "loss": 0.8175, + "step": 321 + }, + { + "epoch": 0.15224586288416075, + "grad_norm": 2.9485340118408203, + "learning_rate": 4.996165107587801e-06, + "loss": 0.707, + "step": 322 + }, + { + "epoch": 0.15271867612293144, + "grad_norm": 3.3757646083831787, + "learning_rate": 4.996130490130625e-06, + "loss": 0.7955, + "step": 323 + }, + { + "epoch": 0.15319148936170213, + "grad_norm": 2.962181568145752, + "learning_rate": 4.996095717250349e-06, + "loss": 0.7067, + "step": 324 + }, + { + "epoch": 0.1536643026004728, + "grad_norm": 3.114272356033325, + "learning_rate": 4.996060788949136e-06, + "loss": 0.7486, + "step": 325 + }, + { + "epoch": 0.1541371158392435, + "grad_norm": 3.0621590614318848, + "learning_rate": 4.996025705229165e-06, + "loss": 0.6547, + "step": 326 + }, + { + "epoch": 0.15460992907801419, + "grad_norm": 2.8745882511138916, + "learning_rate": 4.995990466092616e-06, + "loss": 0.6435, + "step": 327 + }, + { + "epoch": 0.15508274231678487, + "grad_norm": 2.90841007232666, + "learning_rate": 4.995955071541686e-06, + "loss": 0.7331, + "step": 328 + }, + { + "epoch": 0.15555555555555556, + "grad_norm": 2.694580316543579, + "learning_rate": 4.9959195215785784e-06, + "loss": 0.6731, + "step": 329 + }, + { + "epoch": 0.15602836879432624, + "grad_norm": 3.158083438873291, + "learning_rate": 4.995883816205507e-06, + "loss": 0.7257, + "step": 330 + }, + { + "epoch": 0.15650118203309693, + "grad_norm": 3.3234715461730957, + "learning_rate": 4.995847955424694e-06, + "loss": 0.7389, + "step": 331 + }, + { + "epoch": 0.15697399527186762, + "grad_norm": 2.9406495094299316, + "learning_rate": 4.995811939238373e-06, + "loss": 0.643, + "step": 332 + }, + { + "epoch": 0.1574468085106383, + "grad_norm": 3.3191726207733154, + "learning_rate": 4.995775767648785e-06, + "loss": 0.7879, + "step": 333 + }, + { + "epoch": 0.157919621749409, + "grad_norm": 3.711925745010376, + "learning_rate": 4.995739440658185e-06, + "loss": 0.7586, + "step": 334 + }, + { + "epoch": 0.15839243498817968, + "grad_norm": 9.573421478271484, + "learning_rate": 4.995702958268833e-06, + "loss": 0.7842, + "step": 335 + }, + { + "epoch": 0.15886524822695036, + "grad_norm": 3.4154508113861084, + "learning_rate": 4.995666320483001e-06, + "loss": 0.6735, + "step": 336 + }, + { + "epoch": 0.15933806146572105, + "grad_norm": 3.4169859886169434, + "learning_rate": 4.995629527302971e-06, + "loss": 0.741, + "step": 337 + }, + { + "epoch": 0.15981087470449173, + "grad_norm": 3.287503242492676, + "learning_rate": 4.9955925787310335e-06, + "loss": 0.7139, + "step": 338 + }, + { + "epoch": 0.16028368794326242, + "grad_norm": 3.288409471511841, + "learning_rate": 4.995555474769488e-06, + "loss": 0.7636, + "step": 339 + }, + { + "epoch": 0.1607565011820331, + "grad_norm": 2.8021693229675293, + "learning_rate": 4.995518215420646e-06, + "loss": 0.5883, + "step": 340 + }, + { + "epoch": 0.1612293144208038, + "grad_norm": 2.7038564682006836, + "learning_rate": 4.995480800686827e-06, + "loss": 0.657, + "step": 341 + }, + { + "epoch": 0.16170212765957448, + "grad_norm": 3.2370235919952393, + "learning_rate": 4.9954432305703615e-06, + "loss": 0.6999, + "step": 342 + }, + { + "epoch": 0.16217494089834517, + "grad_norm": 2.8666412830352783, + "learning_rate": 4.995405505073588e-06, + "loss": 0.7199, + "step": 343 + }, + { + "epoch": 0.16264775413711585, + "grad_norm": 3.6467232704162598, + "learning_rate": 4.995367624198856e-06, + "loss": 0.7317, + "step": 344 + }, + { + "epoch": 0.16312056737588654, + "grad_norm": 2.7576327323913574, + "learning_rate": 4.9953295879485246e-06, + "loss": 0.647, + "step": 345 + }, + { + "epoch": 0.1635933806146572, + "grad_norm": 2.922232151031494, + "learning_rate": 4.995291396324959e-06, + "loss": 0.6686, + "step": 346 + }, + { + "epoch": 0.16406619385342788, + "grad_norm": 2.8693501949310303, + "learning_rate": 4.995253049330542e-06, + "loss": 0.6756, + "step": 347 + }, + { + "epoch": 0.16453900709219857, + "grad_norm": 3.671865701675415, + "learning_rate": 4.995214546967658e-06, + "loss": 0.7347, + "step": 348 + }, + { + "epoch": 0.16501182033096926, + "grad_norm": 3.024219274520874, + "learning_rate": 4.995175889238706e-06, + "loss": 0.7547, + "step": 349 + }, + { + "epoch": 0.16548463356973994, + "grad_norm": 2.8470778465270996, + "learning_rate": 4.995137076146091e-06, + "loss": 0.6764, + "step": 350 + }, + { + "epoch": 0.16595744680851063, + "grad_norm": 2.905057907104492, + "learning_rate": 4.9950981076922324e-06, + "loss": 0.6814, + "step": 351 + }, + { + "epoch": 0.16643026004728131, + "grad_norm": 3.504377841949463, + "learning_rate": 4.995058983879555e-06, + "loss": 0.7145, + "step": 352 + }, + { + "epoch": 0.166903073286052, + "grad_norm": 3.0029661655426025, + "learning_rate": 4.995019704710495e-06, + "loss": 0.7114, + "step": 353 + }, + { + "epoch": 0.1673758865248227, + "grad_norm": 2.8666274547576904, + "learning_rate": 4.994980270187499e-06, + "loss": 0.7416, + "step": 354 + }, + { + "epoch": 0.16784869976359337, + "grad_norm": 3.1644718647003174, + "learning_rate": 4.994940680313021e-06, + "loss": 0.661, + "step": 355 + }, + { + "epoch": 0.16832151300236406, + "grad_norm": 3.050391674041748, + "learning_rate": 4.994900935089527e-06, + "loss": 0.7243, + "step": 356 + }, + { + "epoch": 0.16879432624113475, + "grad_norm": 2.985466480255127, + "learning_rate": 4.994861034519491e-06, + "loss": 0.6917, + "step": 357 + }, + { + "epoch": 0.16926713947990543, + "grad_norm": 2.909342050552368, + "learning_rate": 4.9948209786053995e-06, + "loss": 0.6636, + "step": 358 + }, + { + "epoch": 0.16973995271867612, + "grad_norm": 3.2214784622192383, + "learning_rate": 4.9947807673497435e-06, + "loss": 0.7903, + "step": 359 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 2.5654983520507812, + "learning_rate": 4.994740400755029e-06, + "loss": 0.6129, + "step": 360 + }, + { + "epoch": 0.1706855791962175, + "grad_norm": 3.775646448135376, + "learning_rate": 4.99469987882377e-06, + "loss": 0.7145, + "step": 361 + }, + { + "epoch": 0.17115839243498818, + "grad_norm": 2.8965413570404053, + "learning_rate": 4.994659201558487e-06, + "loss": 0.7177, + "step": 362 + }, + { + "epoch": 0.17163120567375886, + "grad_norm": 3.485597848892212, + "learning_rate": 4.9946183689617146e-06, + "loss": 0.8107, + "step": 363 + }, + { + "epoch": 0.17210401891252955, + "grad_norm": 3.277839183807373, + "learning_rate": 4.994577381035995e-06, + "loss": 0.691, + "step": 364 + }, + { + "epoch": 0.17257683215130024, + "grad_norm": 2.8807685375213623, + "learning_rate": 4.99453623778388e-06, + "loss": 0.7627, + "step": 365 + }, + { + "epoch": 0.17304964539007092, + "grad_norm": 3.0659940242767334, + "learning_rate": 4.994494939207932e-06, + "loss": 0.6858, + "step": 366 + }, + { + "epoch": 0.1735224586288416, + "grad_norm": 3.0881855487823486, + "learning_rate": 4.994453485310723e-06, + "loss": 0.8212, + "step": 367 + }, + { + "epoch": 0.1739952718676123, + "grad_norm": 2.7199201583862305, + "learning_rate": 4.994411876094832e-06, + "loss": 0.6516, + "step": 368 + }, + { + "epoch": 0.17446808510638298, + "grad_norm": 2.955889940261841, + "learning_rate": 4.994370111562851e-06, + "loss": 0.6579, + "step": 369 + }, + { + "epoch": 0.17494089834515367, + "grad_norm": 3.1321663856506348, + "learning_rate": 4.994328191717382e-06, + "loss": 0.6891, + "step": 370 + }, + { + "epoch": 0.17541371158392435, + "grad_norm": 3.0560388565063477, + "learning_rate": 4.994286116561034e-06, + "loss": 0.7243, + "step": 371 + }, + { + "epoch": 0.17588652482269504, + "grad_norm": 3.1560704708099365, + "learning_rate": 4.994243886096425e-06, + "loss": 0.7262, + "step": 372 + }, + { + "epoch": 0.17635933806146573, + "grad_norm": 2.913541316986084, + "learning_rate": 4.994201500326187e-06, + "loss": 0.7318, + "step": 373 + }, + { + "epoch": 0.1768321513002364, + "grad_norm": 3.098376512527466, + "learning_rate": 4.994158959252958e-06, + "loss": 0.6419, + "step": 374 + }, + { + "epoch": 0.1773049645390071, + "grad_norm": 2.977508544921875, + "learning_rate": 4.994116262879387e-06, + "loss": 0.6709, + "step": 375 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 3.168186902999878, + "learning_rate": 4.994073411208133e-06, + "loss": 0.6608, + "step": 376 + }, + { + "epoch": 0.17825059101654847, + "grad_norm": 3.436844825744629, + "learning_rate": 4.994030404241864e-06, + "loss": 0.7227, + "step": 377 + }, + { + "epoch": 0.17872340425531916, + "grad_norm": 2.8998289108276367, + "learning_rate": 4.993987241983258e-06, + "loss": 0.6512, + "step": 378 + }, + { + "epoch": 0.17919621749408984, + "grad_norm": 3.407191514968872, + "learning_rate": 4.993943924435002e-06, + "loss": 0.616, + "step": 379 + }, + { + "epoch": 0.17966903073286053, + "grad_norm": 3.744858741760254, + "learning_rate": 4.993900451599793e-06, + "loss": 0.8599, + "step": 380 + }, + { + "epoch": 0.18014184397163122, + "grad_norm": 3.486283779144287, + "learning_rate": 4.993856823480338e-06, + "loss": 0.6634, + "step": 381 + }, + { + "epoch": 0.1806146572104019, + "grad_norm": 2.895719051361084, + "learning_rate": 4.993813040079355e-06, + "loss": 0.6972, + "step": 382 + }, + { + "epoch": 0.1810874704491726, + "grad_norm": 2.814133882522583, + "learning_rate": 4.993769101399569e-06, + "loss": 0.6271, + "step": 383 + }, + { + "epoch": 0.18156028368794327, + "grad_norm": 2.8609800338745117, + "learning_rate": 4.993725007443715e-06, + "loss": 0.6481, + "step": 384 + }, + { + "epoch": 0.18203309692671396, + "grad_norm": 3.2829644680023193, + "learning_rate": 4.99368075821454e-06, + "loss": 0.7999, + "step": 385 + }, + { + "epoch": 0.18250591016548465, + "grad_norm": 3.1417458057403564, + "learning_rate": 4.993636353714798e-06, + "loss": 0.6972, + "step": 386 + }, + { + "epoch": 0.1829787234042553, + "grad_norm": 3.0679385662078857, + "learning_rate": 4.993591793947256e-06, + "loss": 0.667, + "step": 387 + }, + { + "epoch": 0.183451536643026, + "grad_norm": 3.1387410163879395, + "learning_rate": 4.993547078914686e-06, + "loss": 0.7618, + "step": 388 + }, + { + "epoch": 0.18392434988179668, + "grad_norm": 2.9181406497955322, + "learning_rate": 4.993502208619872e-06, + "loss": 0.7391, + "step": 389 + }, + { + "epoch": 0.18439716312056736, + "grad_norm": 2.8952157497406006, + "learning_rate": 4.993457183065611e-06, + "loss": 0.6988, + "step": 390 + }, + { + "epoch": 0.18486997635933805, + "grad_norm": 3.2274813652038574, + "learning_rate": 4.993412002254704e-06, + "loss": 0.688, + "step": 391 + }, + { + "epoch": 0.18534278959810874, + "grad_norm": 3.4693779945373535, + "learning_rate": 4.993366666189965e-06, + "loss": 0.6634, + "step": 392 + }, + { + "epoch": 0.18581560283687942, + "grad_norm": 3.5358526706695557, + "learning_rate": 4.993321174874217e-06, + "loss": 0.7343, + "step": 393 + }, + { + "epoch": 0.1862884160756501, + "grad_norm": 3.013338088989258, + "learning_rate": 4.993275528310292e-06, + "loss": 0.7579, + "step": 394 + }, + { + "epoch": 0.1867612293144208, + "grad_norm": 2.694772720336914, + "learning_rate": 4.993229726501033e-06, + "loss": 0.718, + "step": 395 + }, + { + "epoch": 0.18723404255319148, + "grad_norm": 3.070612907409668, + "learning_rate": 4.9931837694492915e-06, + "loss": 0.6438, + "step": 396 + }, + { + "epoch": 0.18770685579196217, + "grad_norm": 2.9193027019500732, + "learning_rate": 4.993137657157928e-06, + "loss": 0.6788, + "step": 397 + }, + { + "epoch": 0.18817966903073285, + "grad_norm": 3.047682046890259, + "learning_rate": 4.993091389629816e-06, + "loss": 0.6826, + "step": 398 + }, + { + "epoch": 0.18865248226950354, + "grad_norm": 2.9629905223846436, + "learning_rate": 4.993044966867834e-06, + "loss": 0.7196, + "step": 399 + }, + { + "epoch": 0.18912529550827423, + "grad_norm": 3.0692050457000732, + "learning_rate": 4.992998388874874e-06, + "loss": 0.7015, + "step": 400 + }, + { + "epoch": 0.1895981087470449, + "grad_norm": 3.5427212715148926, + "learning_rate": 4.992951655653836e-06, + "loss": 0.8292, + "step": 401 + }, + { + "epoch": 0.1900709219858156, + "grad_norm": 2.643526554107666, + "learning_rate": 4.992904767207629e-06, + "loss": 0.624, + "step": 402 + }, + { + "epoch": 0.19054373522458629, + "grad_norm": 3.1185996532440186, + "learning_rate": 4.992857723539173e-06, + "loss": 0.7354, + "step": 403 + }, + { + "epoch": 0.19101654846335697, + "grad_norm": 3.006856679916382, + "learning_rate": 4.992810524651398e-06, + "loss": 0.7752, + "step": 404 + }, + { + "epoch": 0.19148936170212766, + "grad_norm": 2.9913275241851807, + "learning_rate": 4.9927631705472425e-06, + "loss": 0.7306, + "step": 405 + }, + { + "epoch": 0.19196217494089834, + "grad_norm": 2.6794071197509766, + "learning_rate": 4.992715661229655e-06, + "loss": 0.6136, + "step": 406 + }, + { + "epoch": 0.19243498817966903, + "grad_norm": 3.5933966636657715, + "learning_rate": 4.992667996701593e-06, + "loss": 0.7024, + "step": 407 + }, + { + "epoch": 0.19290780141843972, + "grad_norm": 2.862187623977661, + "learning_rate": 4.992620176966025e-06, + "loss": 0.692, + "step": 408 + }, + { + "epoch": 0.1933806146572104, + "grad_norm": 3.076845407485962, + "learning_rate": 4.9925722020259286e-06, + "loss": 0.7475, + "step": 409 + }, + { + "epoch": 0.1938534278959811, + "grad_norm": 3.372919797897339, + "learning_rate": 4.9925240718842895e-06, + "loss": 0.6886, + "step": 410 + }, + { + "epoch": 0.19432624113475178, + "grad_norm": 2.922977924346924, + "learning_rate": 4.992475786544108e-06, + "loss": 0.7049, + "step": 411 + }, + { + "epoch": 0.19479905437352246, + "grad_norm": 2.908034324645996, + "learning_rate": 4.992427346008387e-06, + "loss": 0.6498, + "step": 412 + }, + { + "epoch": 0.19527186761229315, + "grad_norm": 3.096723794937134, + "learning_rate": 4.992378750280144e-06, + "loss": 0.7151, + "step": 413 + }, + { + "epoch": 0.19574468085106383, + "grad_norm": 2.895237684249878, + "learning_rate": 4.992329999362405e-06, + "loss": 0.7277, + "step": 414 + }, + { + "epoch": 0.19621749408983452, + "grad_norm": 2.718230724334717, + "learning_rate": 4.9922810932582065e-06, + "loss": 0.6375, + "step": 415 + }, + { + "epoch": 0.1966903073286052, + "grad_norm": 3.187743663787842, + "learning_rate": 4.992232031970592e-06, + "loss": 0.6528, + "step": 416 + }, + { + "epoch": 0.1971631205673759, + "grad_norm": 2.996406316757202, + "learning_rate": 4.992182815502616e-06, + "loss": 0.6552, + "step": 417 + }, + { + "epoch": 0.19763593380614658, + "grad_norm": 3.301084041595459, + "learning_rate": 4.992133443857345e-06, + "loss": 0.7061, + "step": 418 + }, + { + "epoch": 0.19810874704491727, + "grad_norm": 3.7874677181243896, + "learning_rate": 4.992083917037853e-06, + "loss": 0.7859, + "step": 419 + }, + { + "epoch": 0.19858156028368795, + "grad_norm": 3.124253511428833, + "learning_rate": 4.992034235047222e-06, + "loss": 0.7615, + "step": 420 + }, + { + "epoch": 0.19905437352245864, + "grad_norm": 3.0488970279693604, + "learning_rate": 4.991984397888546e-06, + "loss": 0.6916, + "step": 421 + }, + { + "epoch": 0.19952718676122932, + "grad_norm": 3.1241321563720703, + "learning_rate": 4.991934405564929e-06, + "loss": 0.7055, + "step": 422 + }, + { + "epoch": 0.2, + "grad_norm": 3.396632432937622, + "learning_rate": 4.991884258079484e-06, + "loss": 0.7675, + "step": 423 + }, + { + "epoch": 0.2004728132387707, + "grad_norm": 3.7776873111724854, + "learning_rate": 4.9918339554353316e-06, + "loss": 0.7371, + "step": 424 + }, + { + "epoch": 0.20094562647754138, + "grad_norm": 3.3356032371520996, + "learning_rate": 4.991783497635606e-06, + "loss": 0.6778, + "step": 425 + }, + { + "epoch": 0.20141843971631207, + "grad_norm": 2.988856792449951, + "learning_rate": 4.9917328846834474e-06, + "loss": 0.6795, + "step": 426 + }, + { + "epoch": 0.20189125295508276, + "grad_norm": 3.264183282852173, + "learning_rate": 4.99168211658201e-06, + "loss": 0.7707, + "step": 427 + }, + { + "epoch": 0.20236406619385341, + "grad_norm": 3.878068208694458, + "learning_rate": 4.991631193334451e-06, + "loss": 0.857, + "step": 428 + }, + { + "epoch": 0.2028368794326241, + "grad_norm": 3.6377553939819336, + "learning_rate": 4.991580114943943e-06, + "loss": 0.8033, + "step": 429 + }, + { + "epoch": 0.2033096926713948, + "grad_norm": 2.95393967628479, + "learning_rate": 4.991528881413667e-06, + "loss": 0.6809, + "step": 430 + }, + { + "epoch": 0.20378250591016547, + "grad_norm": 3.058704376220703, + "learning_rate": 4.9914774927468125e-06, + "loss": 0.6664, + "step": 431 + }, + { + "epoch": 0.20425531914893616, + "grad_norm": 2.7783217430114746, + "learning_rate": 4.9914259489465795e-06, + "loss": 0.6478, + "step": 432 + }, + { + "epoch": 0.20472813238770685, + "grad_norm": 2.4825217723846436, + "learning_rate": 4.991374250016177e-06, + "loss": 0.6598, + "step": 433 + }, + { + "epoch": 0.20520094562647753, + "grad_norm": 2.8753600120544434, + "learning_rate": 4.991322395958824e-06, + "loss": 0.6947, + "step": 434 + }, + { + "epoch": 0.20567375886524822, + "grad_norm": 3.2339367866516113, + "learning_rate": 4.99127038677775e-06, + "loss": 0.8201, + "step": 435 + }, + { + "epoch": 0.2061465721040189, + "grad_norm": 2.9065537452697754, + "learning_rate": 4.991218222476193e-06, + "loss": 0.6679, + "step": 436 + }, + { + "epoch": 0.2066193853427896, + "grad_norm": 3.283228874206543, + "learning_rate": 4.991165903057401e-06, + "loss": 0.8039, + "step": 437 + }, + { + "epoch": 0.20709219858156028, + "grad_norm": 3.429872751235962, + "learning_rate": 4.991113428524631e-06, + "loss": 0.7392, + "step": 438 + }, + { + "epoch": 0.20756501182033096, + "grad_norm": 3.118943452835083, + "learning_rate": 4.991060798881152e-06, + "loss": 0.6794, + "step": 439 + }, + { + "epoch": 0.20803782505910165, + "grad_norm": 3.395970106124878, + "learning_rate": 4.99100801413024e-06, + "loss": 0.6862, + "step": 440 + }, + { + "epoch": 0.20851063829787234, + "grad_norm": 2.869191884994507, + "learning_rate": 4.99095507427518e-06, + "loss": 0.6076, + "step": 441 + }, + { + "epoch": 0.20898345153664302, + "grad_norm": 3.1934661865234375, + "learning_rate": 4.990901979319272e-06, + "loss": 0.6927, + "step": 442 + }, + { + "epoch": 0.2094562647754137, + "grad_norm": 2.9068603515625, + "learning_rate": 4.990848729265819e-06, + "loss": 0.6864, + "step": 443 + }, + { + "epoch": 0.2099290780141844, + "grad_norm": 3.0535948276519775, + "learning_rate": 4.9907953241181375e-06, + "loss": 0.6396, + "step": 444 + }, + { + "epoch": 0.21040189125295508, + "grad_norm": 2.871511459350586, + "learning_rate": 4.990741763879554e-06, + "loss": 0.6743, + "step": 445 + }, + { + "epoch": 0.21087470449172577, + "grad_norm": 2.9184393882751465, + "learning_rate": 4.9906880485534015e-06, + "loss": 0.6786, + "step": 446 + }, + { + "epoch": 0.21134751773049645, + "grad_norm": 3.0628271102905273, + "learning_rate": 4.990634178143026e-06, + "loss": 0.6326, + "step": 447 + }, + { + "epoch": 0.21182033096926714, + "grad_norm": 3.7878305912017822, + "learning_rate": 4.990580152651782e-06, + "loss": 0.7944, + "step": 448 + }, + { + "epoch": 0.21229314420803783, + "grad_norm": 2.8577189445495605, + "learning_rate": 4.990525972083031e-06, + "loss": 0.71, + "step": 449 + }, + { + "epoch": 0.2127659574468085, + "grad_norm": 3.307769775390625, + "learning_rate": 4.99047163644015e-06, + "loss": 0.6893, + "step": 450 + }, + { + "epoch": 0.2132387706855792, + "grad_norm": 2.7391717433929443, + "learning_rate": 4.990417145726519e-06, + "loss": 0.712, + "step": 451 + }, + { + "epoch": 0.21371158392434988, + "grad_norm": 2.938044786453247, + "learning_rate": 4.990362499945534e-06, + "loss": 0.7516, + "step": 452 + }, + { + "epoch": 0.21418439716312057, + "grad_norm": 2.7831056118011475, + "learning_rate": 4.990307699100595e-06, + "loss": 0.6168, + "step": 453 + }, + { + "epoch": 0.21465721040189126, + "grad_norm": 2.907977342605591, + "learning_rate": 4.990252743195116e-06, + "loss": 0.6706, + "step": 454 + }, + { + "epoch": 0.21513002364066194, + "grad_norm": 3.7882161140441895, + "learning_rate": 4.990197632232517e-06, + "loss": 0.6847, + "step": 455 + }, + { + "epoch": 0.21560283687943263, + "grad_norm": 2.899716854095459, + "learning_rate": 4.990142366216232e-06, + "loss": 0.6699, + "step": 456 + }, + { + "epoch": 0.21607565011820332, + "grad_norm": 2.907003879547119, + "learning_rate": 4.990086945149701e-06, + "loss": 0.6864, + "step": 457 + }, + { + "epoch": 0.216548463356974, + "grad_norm": 3.2407333850860596, + "learning_rate": 4.9900313690363736e-06, + "loss": 0.692, + "step": 458 + }, + { + "epoch": 0.2170212765957447, + "grad_norm": 2.9055583477020264, + "learning_rate": 4.989975637879712e-06, + "loss": 0.7113, + "step": 459 + }, + { + "epoch": 0.21749408983451538, + "grad_norm": 2.9836206436157227, + "learning_rate": 4.989919751683184e-06, + "loss": 0.6673, + "step": 460 + }, + { + "epoch": 0.21796690307328606, + "grad_norm": 3.371035575866699, + "learning_rate": 4.989863710450273e-06, + "loss": 0.7181, + "step": 461 + }, + { + "epoch": 0.21843971631205675, + "grad_norm": 2.9636635780334473, + "learning_rate": 4.989807514184465e-06, + "loss": 0.6082, + "step": 462 + }, + { + "epoch": 0.21891252955082743, + "grad_norm": 2.9634664058685303, + "learning_rate": 4.9897511628892615e-06, + "loss": 0.7086, + "step": 463 + }, + { + "epoch": 0.21938534278959812, + "grad_norm": 3.154763698577881, + "learning_rate": 4.98969465656817e-06, + "loss": 0.7027, + "step": 464 + }, + { + "epoch": 0.2198581560283688, + "grad_norm": 2.9959890842437744, + "learning_rate": 4.98963799522471e-06, + "loss": 0.6498, + "step": 465 + }, + { + "epoch": 0.2203309692671395, + "grad_norm": 3.5470590591430664, + "learning_rate": 4.989581178862408e-06, + "loss": 0.7199, + "step": 466 + }, + { + "epoch": 0.22080378250591018, + "grad_norm": 7.1873369216918945, + "learning_rate": 4.989524207484802e-06, + "loss": 0.6676, + "step": 467 + }, + { + "epoch": 0.22127659574468084, + "grad_norm": 3.1099541187286377, + "learning_rate": 4.98946708109544e-06, + "loss": 0.6785, + "step": 468 + }, + { + "epoch": 0.22174940898345152, + "grad_norm": 2.830991506576538, + "learning_rate": 4.9894097996978795e-06, + "loss": 0.6456, + "step": 469 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 3.0212316513061523, + "learning_rate": 4.989352363295687e-06, + "loss": 0.6048, + "step": 470 + }, + { + "epoch": 0.2226950354609929, + "grad_norm": 3.18776798248291, + "learning_rate": 4.989294771892437e-06, + "loss": 0.7078, + "step": 471 + }, + { + "epoch": 0.22316784869976358, + "grad_norm": 2.9972598552703857, + "learning_rate": 4.989237025491717e-06, + "loss": 0.7082, + "step": 472 + }, + { + "epoch": 0.22364066193853427, + "grad_norm": 3.4935688972473145, + "learning_rate": 4.989179124097123e-06, + "loss": 0.8199, + "step": 473 + }, + { + "epoch": 0.22411347517730495, + "grad_norm": 2.6485543251037598, + "learning_rate": 4.9891210677122595e-06, + "loss": 0.6371, + "step": 474 + }, + { + "epoch": 0.22458628841607564, + "grad_norm": 2.969233512878418, + "learning_rate": 4.989062856340742e-06, + "loss": 0.6879, + "step": 475 + }, + { + "epoch": 0.22505910165484633, + "grad_norm": 2.881875514984131, + "learning_rate": 4.989004489986194e-06, + "loss": 0.7415, + "step": 476 + }, + { + "epoch": 0.225531914893617, + "grad_norm": 2.624540090560913, + "learning_rate": 4.98894596865225e-06, + "loss": 0.6522, + "step": 477 + }, + { + "epoch": 0.2260047281323877, + "grad_norm": 3.61075496673584, + "learning_rate": 4.988887292342555e-06, + "loss": 0.7109, + "step": 478 + }, + { + "epoch": 0.2264775413711584, + "grad_norm": 2.9368972778320312, + "learning_rate": 4.988828461060762e-06, + "loss": 0.6843, + "step": 479 + }, + { + "epoch": 0.22695035460992907, + "grad_norm": 3.0670197010040283, + "learning_rate": 4.988769474810533e-06, + "loss": 0.6807, + "step": 480 + }, + { + "epoch": 0.22742316784869976, + "grad_norm": 2.9662792682647705, + "learning_rate": 4.988710333595542e-06, + "loss": 0.6796, + "step": 481 + }, + { + "epoch": 0.22789598108747045, + "grad_norm": 2.971235752105713, + "learning_rate": 4.988651037419472e-06, + "loss": 0.696, + "step": 482 + }, + { + "epoch": 0.22836879432624113, + "grad_norm": 2.931884527206421, + "learning_rate": 4.988591586286013e-06, + "loss": 0.7323, + "step": 483 + }, + { + "epoch": 0.22884160756501182, + "grad_norm": 2.8114213943481445, + "learning_rate": 4.988531980198868e-06, + "loss": 0.6584, + "step": 484 + }, + { + "epoch": 0.2293144208037825, + "grad_norm": 3.2785916328430176, + "learning_rate": 4.98847221916175e-06, + "loss": 0.7514, + "step": 485 + }, + { + "epoch": 0.2297872340425532, + "grad_norm": 3.0520215034484863, + "learning_rate": 4.988412303178377e-06, + "loss": 0.7564, + "step": 486 + }, + { + "epoch": 0.23026004728132388, + "grad_norm": 3.181002616882324, + "learning_rate": 4.988352232252483e-06, + "loss": 0.6768, + "step": 487 + }, + { + "epoch": 0.23073286052009456, + "grad_norm": 3.4953625202178955, + "learning_rate": 4.988292006387805e-06, + "loss": 0.7143, + "step": 488 + }, + { + "epoch": 0.23120567375886525, + "grad_norm": 3.326571226119995, + "learning_rate": 4.988231625588096e-06, + "loss": 0.7318, + "step": 489 + }, + { + "epoch": 0.23167848699763594, + "grad_norm": 3.09614634513855, + "learning_rate": 4.988171089857113e-06, + "loss": 0.6574, + "step": 490 + }, + { + "epoch": 0.23215130023640662, + "grad_norm": 2.7439446449279785, + "learning_rate": 4.9881103991986265e-06, + "loss": 0.6637, + "step": 491 + }, + { + "epoch": 0.2326241134751773, + "grad_norm": 3.0681190490722656, + "learning_rate": 4.988049553616416e-06, + "loss": 0.6326, + "step": 492 + }, + { + "epoch": 0.233096926713948, + "grad_norm": 3.0757341384887695, + "learning_rate": 4.98798855311427e-06, + "loss": 0.695, + "step": 493 + }, + { + "epoch": 0.23356973995271868, + "grad_norm": 2.8637635707855225, + "learning_rate": 4.987927397695985e-06, + "loss": 0.6598, + "step": 494 + }, + { + "epoch": 0.23404255319148937, + "grad_norm": 3.3641068935394287, + "learning_rate": 4.9878660873653715e-06, + "loss": 0.7435, + "step": 495 + }, + { + "epoch": 0.23451536643026005, + "grad_norm": 3.5025596618652344, + "learning_rate": 4.987804622126245e-06, + "loss": 0.735, + "step": 496 + }, + { + "epoch": 0.23498817966903074, + "grad_norm": 2.9298837184906006, + "learning_rate": 4.987743001982434e-06, + "loss": 0.7063, + "step": 497 + }, + { + "epoch": 0.23546099290780143, + "grad_norm": 2.70358943939209, + "learning_rate": 4.987681226937774e-06, + "loss": 0.6799, + "step": 498 + }, + { + "epoch": 0.2359338061465721, + "grad_norm": 3.027871608734131, + "learning_rate": 4.9876192969961125e-06, + "loss": 0.6881, + "step": 499 + }, + { + "epoch": 0.2364066193853428, + "grad_norm": 3.362306594848633, + "learning_rate": 4.987557212161304e-06, + "loss": 0.7906, + "step": 500 + }, + { + "epoch": 0.23687943262411348, + "grad_norm": 3.3136050701141357, + "learning_rate": 4.987494972437217e-06, + "loss": 0.6878, + "step": 501 + }, + { + "epoch": 0.23735224586288417, + "grad_norm": 3.017089605331421, + "learning_rate": 4.9874325778277255e-06, + "loss": 0.7279, + "step": 502 + }, + { + "epoch": 0.23782505910165486, + "grad_norm": 2.8300516605377197, + "learning_rate": 4.987370028336714e-06, + "loss": 0.6864, + "step": 503 + }, + { + "epoch": 0.23829787234042554, + "grad_norm": 3.201860189437866, + "learning_rate": 4.987307323968077e-06, + "loss": 0.7531, + "step": 504 + }, + { + "epoch": 0.23877068557919623, + "grad_norm": 2.685396194458008, + "learning_rate": 4.987244464725721e-06, + "loss": 0.5849, + "step": 505 + }, + { + "epoch": 0.23924349881796692, + "grad_norm": 2.8715312480926514, + "learning_rate": 4.987181450613557e-06, + "loss": 0.675, + "step": 506 + }, + { + "epoch": 0.2397163120567376, + "grad_norm": 2.813908815383911, + "learning_rate": 4.987118281635511e-06, + "loss": 0.6841, + "step": 507 + }, + { + "epoch": 0.2401891252955083, + "grad_norm": 3.2738473415374756, + "learning_rate": 4.987054957795514e-06, + "loss": 0.7158, + "step": 508 + }, + { + "epoch": 0.24066193853427895, + "grad_norm": 2.896134376525879, + "learning_rate": 4.986991479097511e-06, + "loss": 0.7542, + "step": 509 + }, + { + "epoch": 0.24113475177304963, + "grad_norm": 3.0390403270721436, + "learning_rate": 4.986927845545454e-06, + "loss": 0.6733, + "step": 510 + }, + { + "epoch": 0.24160756501182032, + "grad_norm": 3.0300254821777344, + "learning_rate": 4.9868640571433044e-06, + "loss": 0.722, + "step": 511 + }, + { + "epoch": 0.242080378250591, + "grad_norm": 3.3037352561950684, + "learning_rate": 4.986800113895035e-06, + "loss": 0.6811, + "step": 512 + }, + { + "epoch": 0.2425531914893617, + "grad_norm": 3.0358474254608154, + "learning_rate": 4.986736015804627e-06, + "loss": 0.7348, + "step": 513 + }, + { + "epoch": 0.24302600472813238, + "grad_norm": 3.108792304992676, + "learning_rate": 4.986671762876071e-06, + "loss": 0.6096, + "step": 514 + }, + { + "epoch": 0.24349881796690306, + "grad_norm": 3.1316237449645996, + "learning_rate": 4.986607355113367e-06, + "loss": 0.6357, + "step": 515 + }, + { + "epoch": 0.24397163120567375, + "grad_norm": 3.3095219135284424, + "learning_rate": 4.986542792520528e-06, + "loss": 0.7515, + "step": 516 + }, + { + "epoch": 0.24444444444444444, + "grad_norm": 3.4775984287261963, + "learning_rate": 4.986478075101572e-06, + "loss": 0.7104, + "step": 517 + }, + { + "epoch": 0.24491725768321512, + "grad_norm": 3.341708183288574, + "learning_rate": 4.986413202860528e-06, + "loss": 0.7339, + "step": 518 + }, + { + "epoch": 0.2453900709219858, + "grad_norm": 2.9646966457366943, + "learning_rate": 4.986348175801438e-06, + "loss": 0.6032, + "step": 519 + }, + { + "epoch": 0.2458628841607565, + "grad_norm": 3.1853902339935303, + "learning_rate": 4.986282993928349e-06, + "loss": 0.6925, + "step": 520 + }, + { + "epoch": 0.24633569739952718, + "grad_norm": 3.286909818649292, + "learning_rate": 4.98621765724532e-06, + "loss": 0.7447, + "step": 521 + }, + { + "epoch": 0.24680851063829787, + "grad_norm": 3.2255051136016846, + "learning_rate": 4.986152165756419e-06, + "loss": 0.7747, + "step": 522 + }, + { + "epoch": 0.24728132387706855, + "grad_norm": 3.002352237701416, + "learning_rate": 4.986086519465724e-06, + "loss": 0.6472, + "step": 523 + }, + { + "epoch": 0.24775413711583924, + "grad_norm": 3.4738974571228027, + "learning_rate": 4.986020718377322e-06, + "loss": 0.7381, + "step": 524 + }, + { + "epoch": 0.24822695035460993, + "grad_norm": 3.4470200538635254, + "learning_rate": 4.985954762495312e-06, + "loss": 0.6878, + "step": 525 + }, + { + "epoch": 0.2486997635933806, + "grad_norm": 2.9219350814819336, + "learning_rate": 4.985888651823799e-06, + "loss": 0.6317, + "step": 526 + }, + { + "epoch": 0.2491725768321513, + "grad_norm": 3.061767101287842, + "learning_rate": 4.985822386366899e-06, + "loss": 0.6842, + "step": 527 + }, + { + "epoch": 0.24964539007092199, + "grad_norm": 3.0291247367858887, + "learning_rate": 4.985755966128742e-06, + "loss": 0.6852, + "step": 528 + }, + { + "epoch": 0.25011820330969264, + "grad_norm": 2.964280843734741, + "learning_rate": 4.985689391113457e-06, + "loss": 0.7738, + "step": 529 + }, + { + "epoch": 0.25059101654846333, + "grad_norm": 3.058302164077759, + "learning_rate": 4.9856226613251955e-06, + "loss": 0.6677, + "step": 530 + }, + { + "epoch": 0.251063829787234, + "grad_norm": 3.345141649246216, + "learning_rate": 4.985555776768109e-06, + "loss": 0.7837, + "step": 531 + }, + { + "epoch": 0.2515366430260047, + "grad_norm": 3.565031051635742, + "learning_rate": 4.9854887374463636e-06, + "loss": 0.7231, + "step": 532 + }, + { + "epoch": 0.2520094562647754, + "grad_norm": 2.7953789234161377, + "learning_rate": 4.985421543364132e-06, + "loss": 0.6102, + "step": 533 + }, + { + "epoch": 0.2524822695035461, + "grad_norm": 2.887606620788574, + "learning_rate": 4.9853541945256e-06, + "loss": 0.6289, + "step": 534 + }, + { + "epoch": 0.25295508274231676, + "grad_norm": 3.1480495929718018, + "learning_rate": 4.985286690934961e-06, + "loss": 0.6348, + "step": 535 + }, + { + "epoch": 0.25342789598108745, + "grad_norm": 2.8912761211395264, + "learning_rate": 4.985219032596416e-06, + "loss": 0.595, + "step": 536 + }, + { + "epoch": 0.25390070921985813, + "grad_norm": 2.947936534881592, + "learning_rate": 4.98515121951418e-06, + "loss": 0.6196, + "step": 537 + }, + { + "epoch": 0.2543735224586288, + "grad_norm": 3.1085827350616455, + "learning_rate": 4.985083251692474e-06, + "loss": 0.6387, + "step": 538 + }, + { + "epoch": 0.2548463356973995, + "grad_norm": 3.1688334941864014, + "learning_rate": 4.985015129135531e-06, + "loss": 0.7055, + "step": 539 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 3.075042963027954, + "learning_rate": 4.984946851847593e-06, + "loss": 0.7515, + "step": 540 + }, + { + "epoch": 0.2557919621749409, + "grad_norm": 3.1933093070983887, + "learning_rate": 4.98487841983291e-06, + "loss": 0.7054, + "step": 541 + }, + { + "epoch": 0.25626477541371157, + "grad_norm": 3.043473958969116, + "learning_rate": 4.984809833095744e-06, + "loss": 0.6281, + "step": 542 + }, + { + "epoch": 0.25673758865248225, + "grad_norm": 3.0532584190368652, + "learning_rate": 4.9847410916403645e-06, + "loss": 0.6155, + "step": 543 + }, + { + "epoch": 0.25721040189125294, + "grad_norm": 3.608480215072632, + "learning_rate": 4.984672195471053e-06, + "loss": 0.7363, + "step": 544 + }, + { + "epoch": 0.2576832151300236, + "grad_norm": 2.7491862773895264, + "learning_rate": 4.9846031445921e-06, + "loss": 0.6594, + "step": 545 + }, + { + "epoch": 0.2581560283687943, + "grad_norm": 2.8602418899536133, + "learning_rate": 4.984533939007802e-06, + "loss": 0.6742, + "step": 546 + }, + { + "epoch": 0.258628841607565, + "grad_norm": 3.1782007217407227, + "learning_rate": 4.98446457872247e-06, + "loss": 0.731, + "step": 547 + }, + { + "epoch": 0.2591016548463357, + "grad_norm": 2.796147584915161, + "learning_rate": 4.984395063740423e-06, + "loss": 0.6617, + "step": 548 + }, + { + "epoch": 0.25957446808510637, + "grad_norm": 2.8392202854156494, + "learning_rate": 4.984325394065991e-06, + "loss": 0.6753, + "step": 549 + }, + { + "epoch": 0.26004728132387706, + "grad_norm": 3.134672164916992, + "learning_rate": 4.984255569703508e-06, + "loss": 0.7222, + "step": 550 + }, + { + "epoch": 0.26052009456264774, + "grad_norm": 2.734330177307129, + "learning_rate": 4.984185590657325e-06, + "loss": 0.6098, + "step": 551 + }, + { + "epoch": 0.26099290780141843, + "grad_norm": 3.739010810852051, + "learning_rate": 4.984115456931798e-06, + "loss": 0.7457, + "step": 552 + }, + { + "epoch": 0.2614657210401891, + "grad_norm": 2.8412528038024902, + "learning_rate": 4.9840451685312925e-06, + "loss": 0.6972, + "step": 553 + }, + { + "epoch": 0.2619385342789598, + "grad_norm": 3.017395496368408, + "learning_rate": 4.983974725460188e-06, + "loss": 0.6887, + "step": 554 + }, + { + "epoch": 0.2624113475177305, + "grad_norm": 3.2746949195861816, + "learning_rate": 4.98390412772287e-06, + "loss": 0.7047, + "step": 555 + }, + { + "epoch": 0.2628841607565012, + "grad_norm": 3.1561965942382812, + "learning_rate": 4.983833375323732e-06, + "loss": 0.7726, + "step": 556 + }, + { + "epoch": 0.26335697399527186, + "grad_norm": 3.2367217540740967, + "learning_rate": 4.9837624682671816e-06, + "loss": 0.6348, + "step": 557 + }, + { + "epoch": 0.26382978723404255, + "grad_norm": 2.8195858001708984, + "learning_rate": 4.983691406557633e-06, + "loss": 0.6387, + "step": 558 + }, + { + "epoch": 0.26430260047281323, + "grad_norm": 3.349820852279663, + "learning_rate": 4.983620190199511e-06, + "loss": 0.6776, + "step": 559 + }, + { + "epoch": 0.2647754137115839, + "grad_norm": 2.8025588989257812, + "learning_rate": 4.98354881919725e-06, + "loss": 0.6512, + "step": 560 + }, + { + "epoch": 0.2652482269503546, + "grad_norm": 2.9125499725341797, + "learning_rate": 4.983477293555295e-06, + "loss": 0.7024, + "step": 561 + }, + { + "epoch": 0.2657210401891253, + "grad_norm": 3.3479275703430176, + "learning_rate": 4.983405613278098e-06, + "loss": 0.688, + "step": 562 + }, + { + "epoch": 0.266193853427896, + "grad_norm": 3.123971462249756, + "learning_rate": 4.983333778370123e-06, + "loss": 0.6743, + "step": 563 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 2.891625165939331, + "learning_rate": 4.983261788835843e-06, + "loss": 0.5971, + "step": 564 + }, + { + "epoch": 0.26713947990543735, + "grad_norm": 3.5066864490509033, + "learning_rate": 4.98318964467974e-06, + "loss": 0.6958, + "step": 565 + }, + { + "epoch": 0.26761229314420804, + "grad_norm": 2.570547342300415, + "learning_rate": 4.983117345906306e-06, + "loss": 0.609, + "step": 566 + }, + { + "epoch": 0.2680851063829787, + "grad_norm": 3.005106210708618, + "learning_rate": 4.983044892520044e-06, + "loss": 0.6791, + "step": 567 + }, + { + "epoch": 0.2685579196217494, + "grad_norm": 3.429675340652466, + "learning_rate": 4.982972284525463e-06, + "loss": 0.6625, + "step": 568 + }, + { + "epoch": 0.2690307328605201, + "grad_norm": 3.825657367706299, + "learning_rate": 4.982899521927086e-06, + "loss": 0.6368, + "step": 569 + }, + { + "epoch": 0.2695035460992908, + "grad_norm": 2.8699095249176025, + "learning_rate": 4.982826604729443e-06, + "loss": 0.6425, + "step": 570 + }, + { + "epoch": 0.26997635933806147, + "grad_norm": 3.1688714027404785, + "learning_rate": 4.982753532937074e-06, + "loss": 0.6904, + "step": 571 + }, + { + "epoch": 0.27044917257683215, + "grad_norm": 3.3889992237091064, + "learning_rate": 4.98268030655453e-06, + "loss": 0.7575, + "step": 572 + }, + { + "epoch": 0.27092198581560284, + "grad_norm": 3.108315944671631, + "learning_rate": 4.982606925586367e-06, + "loss": 0.6648, + "step": 573 + }, + { + "epoch": 0.2713947990543735, + "grad_norm": 3.209831953048706, + "learning_rate": 4.982533390037159e-06, + "loss": 0.657, + "step": 574 + }, + { + "epoch": 0.2718676122931442, + "grad_norm": 3.1740927696228027, + "learning_rate": 4.982459699911482e-06, + "loss": 0.7262, + "step": 575 + }, + { + "epoch": 0.2723404255319149, + "grad_norm": 3.0190417766571045, + "learning_rate": 4.982385855213924e-06, + "loss": 0.6368, + "step": 576 + }, + { + "epoch": 0.2728132387706856, + "grad_norm": 3.05049467086792, + "learning_rate": 4.982311855949084e-06, + "loss": 0.72, + "step": 577 + }, + { + "epoch": 0.27328605200945627, + "grad_norm": 2.984816551208496, + "learning_rate": 4.98223770212157e-06, + "loss": 0.6856, + "step": 578 + }, + { + "epoch": 0.27375886524822696, + "grad_norm": 2.744969606399536, + "learning_rate": 4.982163393735998e-06, + "loss": 0.6023, + "step": 579 + }, + { + "epoch": 0.27423167848699764, + "grad_norm": 3.170564889907837, + "learning_rate": 4.982088930796996e-06, + "loss": 0.6678, + "step": 580 + }, + { + "epoch": 0.27470449172576833, + "grad_norm": 2.8686118125915527, + "learning_rate": 4.982014313309199e-06, + "loss": 0.6157, + "step": 581 + }, + { + "epoch": 0.275177304964539, + "grad_norm": 2.8768694400787354, + "learning_rate": 4.981939541277254e-06, + "loss": 0.6566, + "step": 582 + }, + { + "epoch": 0.2756501182033097, + "grad_norm": 2.621481418609619, + "learning_rate": 4.981864614705818e-06, + "loss": 0.7372, + "step": 583 + }, + { + "epoch": 0.2761229314420804, + "grad_norm": 3.527374267578125, + "learning_rate": 4.981789533599554e-06, + "loss": 0.6485, + "step": 584 + }, + { + "epoch": 0.2765957446808511, + "grad_norm": 3.3141074180603027, + "learning_rate": 4.981714297963138e-06, + "loss": 0.6816, + "step": 585 + }, + { + "epoch": 0.27706855791962176, + "grad_norm": 2.9247069358825684, + "learning_rate": 4.981638907801255e-06, + "loss": 0.7217, + "step": 586 + }, + { + "epoch": 0.27754137115839245, + "grad_norm": 2.875236749649048, + "learning_rate": 4.981563363118599e-06, + "loss": 0.6662, + "step": 587 + }, + { + "epoch": 0.27801418439716313, + "grad_norm": 2.9540364742279053, + "learning_rate": 4.981487663919874e-06, + "loss": 0.7225, + "step": 588 + }, + { + "epoch": 0.2784869976359338, + "grad_norm": 2.90889310836792, + "learning_rate": 4.981411810209793e-06, + "loss": 0.6054, + "step": 589 + }, + { + "epoch": 0.2789598108747045, + "grad_norm": 2.8541409969329834, + "learning_rate": 4.981335801993078e-06, + "loss": 0.6539, + "step": 590 + }, + { + "epoch": 0.2794326241134752, + "grad_norm": 3.1600730419158936, + "learning_rate": 4.981259639274465e-06, + "loss": 0.6415, + "step": 591 + }, + { + "epoch": 0.2799054373522459, + "grad_norm": 3.569376230239868, + "learning_rate": 4.981183322058693e-06, + "loss": 0.6944, + "step": 592 + }, + { + "epoch": 0.28037825059101656, + "grad_norm": 3.067667007446289, + "learning_rate": 4.981106850350515e-06, + "loss": 0.7378, + "step": 593 + }, + { + "epoch": 0.28085106382978725, + "grad_norm": 3.082073450088501, + "learning_rate": 4.981030224154693e-06, + "loss": 0.693, + "step": 594 + }, + { + "epoch": 0.28132387706855794, + "grad_norm": 2.902932643890381, + "learning_rate": 4.980953443475998e-06, + "loss": 0.6549, + "step": 595 + }, + { + "epoch": 0.2817966903073286, + "grad_norm": 2.6821181774139404, + "learning_rate": 4.980876508319211e-06, + "loss": 0.6231, + "step": 596 + }, + { + "epoch": 0.2822695035460993, + "grad_norm": 3.1747355461120605, + "learning_rate": 4.9807994186891215e-06, + "loss": 0.6826, + "step": 597 + }, + { + "epoch": 0.28274231678487, + "grad_norm": 2.6975860595703125, + "learning_rate": 4.980722174590531e-06, + "loss": 0.6669, + "step": 598 + }, + { + "epoch": 0.2832151300236407, + "grad_norm": 2.924285650253296, + "learning_rate": 4.9806447760282486e-06, + "loss": 0.689, + "step": 599 + }, + { + "epoch": 0.28368794326241137, + "grad_norm": 2.941417694091797, + "learning_rate": 4.980567223007093e-06, + "loss": 0.6672, + "step": 600 + }, + { + "epoch": 0.28416075650118205, + "grad_norm": 2.8582186698913574, + "learning_rate": 4.980489515531892e-06, + "loss": 0.6229, + "step": 601 + }, + { + "epoch": 0.28463356973995274, + "grad_norm": 2.6462013721466064, + "learning_rate": 4.9804116536074865e-06, + "loss": 0.606, + "step": 602 + }, + { + "epoch": 0.2851063829787234, + "grad_norm": 2.9029998779296875, + "learning_rate": 4.980333637238723e-06, + "loss": 0.5915, + "step": 603 + }, + { + "epoch": 0.2855791962174941, + "grad_norm": 3.9359042644500732, + "learning_rate": 4.980255466430462e-06, + "loss": 0.7035, + "step": 604 + }, + { + "epoch": 0.2860520094562648, + "grad_norm": 3.200524091720581, + "learning_rate": 4.980177141187566e-06, + "loss": 0.7156, + "step": 605 + }, + { + "epoch": 0.2865248226950355, + "grad_norm": 3.1708686351776123, + "learning_rate": 4.980098661514916e-06, + "loss": 0.746, + "step": 606 + }, + { + "epoch": 0.28699763593380617, + "grad_norm": 2.8926830291748047, + "learning_rate": 4.980020027417397e-06, + "loss": 0.6282, + "step": 607 + }, + { + "epoch": 0.28747044917257686, + "grad_norm": 3.0526294708251953, + "learning_rate": 4.979941238899906e-06, + "loss": 0.6594, + "step": 608 + }, + { + "epoch": 0.28794326241134754, + "grad_norm": 2.9869306087493896, + "learning_rate": 4.9798622959673486e-06, + "loss": 0.7771, + "step": 609 + }, + { + "epoch": 0.28841607565011823, + "grad_norm": 2.7894513607025146, + "learning_rate": 4.979783198624638e-06, + "loss": 0.6819, + "step": 610 + }, + { + "epoch": 0.28888888888888886, + "grad_norm": 2.958575963973999, + "learning_rate": 4.9797039468767025e-06, + "loss": 0.6474, + "step": 611 + }, + { + "epoch": 0.28936170212765955, + "grad_norm": 3.423748016357422, + "learning_rate": 4.979624540728475e-06, + "loss": 0.7389, + "step": 612 + }, + { + "epoch": 0.28983451536643023, + "grad_norm": 2.9641635417938232, + "learning_rate": 4.9795449801849e-06, + "loss": 0.6005, + "step": 613 + }, + { + "epoch": 0.2903073286052009, + "grad_norm": 3.02274227142334, + "learning_rate": 4.979465265250933e-06, + "loss": 0.6358, + "step": 614 + }, + { + "epoch": 0.2907801418439716, + "grad_norm": 3.0562758445739746, + "learning_rate": 4.979385395931534e-06, + "loss": 0.6313, + "step": 615 + }, + { + "epoch": 0.2912529550827423, + "grad_norm": 3.301816701889038, + "learning_rate": 4.97930537223168e-06, + "loss": 0.7264, + "step": 616 + }, + { + "epoch": 0.291725768321513, + "grad_norm": 2.975360870361328, + "learning_rate": 4.979225194156351e-06, + "loss": 0.613, + "step": 617 + }, + { + "epoch": 0.29219858156028367, + "grad_norm": 2.9245030879974365, + "learning_rate": 4.97914486171054e-06, + "loss": 0.6646, + "step": 618 + }, + { + "epoch": 0.29267139479905435, + "grad_norm": 3.1336188316345215, + "learning_rate": 4.979064374899249e-06, + "loss": 0.6421, + "step": 619 + }, + { + "epoch": 0.29314420803782504, + "grad_norm": 3.6298763751983643, + "learning_rate": 4.978983733727491e-06, + "loss": 0.6433, + "step": 620 + }, + { + "epoch": 0.2936170212765957, + "grad_norm": 2.919597625732422, + "learning_rate": 4.9789029382002845e-06, + "loss": 0.6288, + "step": 621 + }, + { + "epoch": 0.2940898345153664, + "grad_norm": 3.2206127643585205, + "learning_rate": 4.978821988322662e-06, + "loss": 0.7102, + "step": 622 + }, + { + "epoch": 0.2945626477541371, + "grad_norm": 3.1767101287841797, + "learning_rate": 4.978740884099664e-06, + "loss": 0.6722, + "step": 623 + }, + { + "epoch": 0.2950354609929078, + "grad_norm": 3.3425452709198, + "learning_rate": 4.97865962553634e-06, + "loss": 0.6492, + "step": 624 + }, + { + "epoch": 0.29550827423167847, + "grad_norm": 3.0408358573913574, + "learning_rate": 4.97857821263775e-06, + "loss": 0.6522, + "step": 625 + }, + { + "epoch": 0.29598108747044916, + "grad_norm": 2.8144783973693848, + "learning_rate": 4.978496645408963e-06, + "loss": 0.7237, + "step": 626 + }, + { + "epoch": 0.29645390070921984, + "grad_norm": 3.7010560035705566, + "learning_rate": 4.978414923855057e-06, + "loss": 0.7509, + "step": 627 + }, + { + "epoch": 0.29692671394799053, + "grad_norm": 2.9438371658325195, + "learning_rate": 4.978333047981122e-06, + "loss": 0.6244, + "step": 628 + }, + { + "epoch": 0.2973995271867612, + "grad_norm": 3.285982370376587, + "learning_rate": 4.978251017792255e-06, + "loss": 0.7553, + "step": 629 + }, + { + "epoch": 0.2978723404255319, + "grad_norm": 3.7021138668060303, + "learning_rate": 4.978168833293564e-06, + "loss": 0.7859, + "step": 630 + }, + { + "epoch": 0.2983451536643026, + "grad_norm": 3.481858730316162, + "learning_rate": 4.9780864944901654e-06, + "loss": 0.7146, + "step": 631 + }, + { + "epoch": 0.2988179669030733, + "grad_norm": 3.693824529647827, + "learning_rate": 4.978004001387188e-06, + "loss": 0.6608, + "step": 632 + }, + { + "epoch": 0.29929078014184396, + "grad_norm": 3.0069146156311035, + "learning_rate": 4.9779213539897665e-06, + "loss": 0.6506, + "step": 633 + }, + { + "epoch": 0.29976359338061465, + "grad_norm": 3.037644147872925, + "learning_rate": 4.977838552303048e-06, + "loss": 0.6487, + "step": 634 + }, + { + "epoch": 0.30023640661938533, + "grad_norm": 3.018554449081421, + "learning_rate": 4.977755596332188e-06, + "loss": 0.6128, + "step": 635 + }, + { + "epoch": 0.300709219858156, + "grad_norm": 3.000312089920044, + "learning_rate": 4.977672486082351e-06, + "loss": 0.6431, + "step": 636 + }, + { + "epoch": 0.3011820330969267, + "grad_norm": 2.836803913116455, + "learning_rate": 4.977589221558713e-06, + "loss": 0.5914, + "step": 637 + }, + { + "epoch": 0.3016548463356974, + "grad_norm": 3.080469846725464, + "learning_rate": 4.977505802766457e-06, + "loss": 0.7265, + "step": 638 + }, + { + "epoch": 0.3021276595744681, + "grad_norm": 3.2245471477508545, + "learning_rate": 4.97742222971078e-06, + "loss": 0.6895, + "step": 639 + }, + { + "epoch": 0.30260047281323876, + "grad_norm": 3.559006452560425, + "learning_rate": 4.977338502396882e-06, + "loss": 0.7439, + "step": 640 + }, + { + "epoch": 0.30307328605200945, + "grad_norm": 2.9116289615631104, + "learning_rate": 4.9772546208299795e-06, + "loss": 0.6907, + "step": 641 + }, + { + "epoch": 0.30354609929078014, + "grad_norm": 3.3645524978637695, + "learning_rate": 4.977170585015295e-06, + "loss": 0.6983, + "step": 642 + }, + { + "epoch": 0.3040189125295508, + "grad_norm": 3.080148458480835, + "learning_rate": 4.977086394958058e-06, + "loss": 0.7016, + "step": 643 + }, + { + "epoch": 0.3044917257683215, + "grad_norm": 2.9276750087738037, + "learning_rate": 4.977002050663515e-06, + "loss": 0.6509, + "step": 644 + }, + { + "epoch": 0.3049645390070922, + "grad_norm": 3.183609962463379, + "learning_rate": 4.976917552136914e-06, + "loss": 0.6814, + "step": 645 + }, + { + "epoch": 0.3054373522458629, + "grad_norm": 3.0980000495910645, + "learning_rate": 4.976832899383519e-06, + "loss": 0.6319, + "step": 646 + }, + { + "epoch": 0.30591016548463357, + "grad_norm": 3.211376190185547, + "learning_rate": 4.9767480924086e-06, + "loss": 0.6365, + "step": 647 + }, + { + "epoch": 0.30638297872340425, + "grad_norm": 3.214430093765259, + "learning_rate": 4.976663131217437e-06, + "loss": 0.6006, + "step": 648 + }, + { + "epoch": 0.30685579196217494, + "grad_norm": 3.0914318561553955, + "learning_rate": 4.976578015815321e-06, + "loss": 0.7162, + "step": 649 + }, + { + "epoch": 0.3073286052009456, + "grad_norm": 2.7644500732421875, + "learning_rate": 4.976492746207551e-06, + "loss": 0.6045, + "step": 650 + }, + { + "epoch": 0.3078014184397163, + "grad_norm": 3.1913280487060547, + "learning_rate": 4.9764073223994374e-06, + "loss": 0.6796, + "step": 651 + }, + { + "epoch": 0.308274231678487, + "grad_norm": 2.8919692039489746, + "learning_rate": 4.976321744396299e-06, + "loss": 0.6683, + "step": 652 + }, + { + "epoch": 0.3087470449172577, + "grad_norm": 2.862234115600586, + "learning_rate": 4.976236012203463e-06, + "loss": 0.6631, + "step": 653 + }, + { + "epoch": 0.30921985815602837, + "grad_norm": 2.9708092212677, + "learning_rate": 4.976150125826268e-06, + "loss": 0.6326, + "step": 654 + }, + { + "epoch": 0.30969267139479906, + "grad_norm": 2.892465353012085, + "learning_rate": 4.976064085270063e-06, + "loss": 0.6574, + "step": 655 + }, + { + "epoch": 0.31016548463356974, + "grad_norm": 3.9215126037597656, + "learning_rate": 4.975977890540205e-06, + "loss": 0.7351, + "step": 656 + }, + { + "epoch": 0.31063829787234043, + "grad_norm": 2.9544081687927246, + "learning_rate": 4.975891541642059e-06, + "loss": 0.7264, + "step": 657 + }, + { + "epoch": 0.3111111111111111, + "grad_norm": 2.995035409927368, + "learning_rate": 4.975805038581005e-06, + "loss": 0.7405, + "step": 658 + }, + { + "epoch": 0.3115839243498818, + "grad_norm": 2.9653120040893555, + "learning_rate": 4.975718381362427e-06, + "loss": 0.679, + "step": 659 + }, + { + "epoch": 0.3120567375886525, + "grad_norm": 2.93976092338562, + "learning_rate": 4.9756315699917205e-06, + "loss": 0.627, + "step": 660 + }, + { + "epoch": 0.3125295508274232, + "grad_norm": 3.106522560119629, + "learning_rate": 4.9755446044742915e-06, + "loss": 0.6329, + "step": 661 + }, + { + "epoch": 0.31300236406619386, + "grad_norm": 3.0238280296325684, + "learning_rate": 4.975457484815554e-06, + "loss": 0.6643, + "step": 662 + }, + { + "epoch": 0.31347517730496455, + "grad_norm": 2.943528175354004, + "learning_rate": 4.9753702110209356e-06, + "loss": 0.668, + "step": 663 + }, + { + "epoch": 0.31394799054373523, + "grad_norm": 2.6840121746063232, + "learning_rate": 4.9752827830958676e-06, + "loss": 0.5482, + "step": 664 + }, + { + "epoch": 0.3144208037825059, + "grad_norm": 2.823875904083252, + "learning_rate": 4.975195201045794e-06, + "loss": 0.7017, + "step": 665 + }, + { + "epoch": 0.3148936170212766, + "grad_norm": 3.148181200027466, + "learning_rate": 4.975107464876168e-06, + "loss": 0.747, + "step": 666 + }, + { + "epoch": 0.3153664302600473, + "grad_norm": 2.630584478378296, + "learning_rate": 4.9750195745924545e-06, + "loss": 0.5987, + "step": 667 + }, + { + "epoch": 0.315839243498818, + "grad_norm": 3.075866460800171, + "learning_rate": 4.974931530200124e-06, + "loss": 0.664, + "step": 668 + }, + { + "epoch": 0.31631205673758866, + "grad_norm": 2.947197914123535, + "learning_rate": 4.974843331704659e-06, + "loss": 0.631, + "step": 669 + }, + { + "epoch": 0.31678486997635935, + "grad_norm": 3.519646644592285, + "learning_rate": 4.974754979111552e-06, + "loss": 0.7154, + "step": 670 + }, + { + "epoch": 0.31725768321513004, + "grad_norm": 2.8687186241149902, + "learning_rate": 4.974666472426305e-06, + "loss": 0.6366, + "step": 671 + }, + { + "epoch": 0.3177304964539007, + "grad_norm": 2.6966612339019775, + "learning_rate": 4.974577811654426e-06, + "loss": 0.7112, + "step": 672 + }, + { + "epoch": 0.3182033096926714, + "grad_norm": 3.1390228271484375, + "learning_rate": 4.974488996801439e-06, + "loss": 0.6882, + "step": 673 + }, + { + "epoch": 0.3186761229314421, + "grad_norm": 3.4667599201202393, + "learning_rate": 4.974400027872871e-06, + "loss": 0.7153, + "step": 674 + }, + { + "epoch": 0.3191489361702128, + "grad_norm": 2.9632184505462646, + "learning_rate": 4.974310904874265e-06, + "loss": 0.7081, + "step": 675 + }, + { + "epoch": 0.31962174940898347, + "grad_norm": 3.46150279045105, + "learning_rate": 4.9742216278111666e-06, + "loss": 0.6242, + "step": 676 + }, + { + "epoch": 0.32009456264775416, + "grad_norm": 3.380403757095337, + "learning_rate": 4.974132196689137e-06, + "loss": 0.6863, + "step": 677 + }, + { + "epoch": 0.32056737588652484, + "grad_norm": 3.4279606342315674, + "learning_rate": 4.974042611513746e-06, + "loss": 0.6388, + "step": 678 + }, + { + "epoch": 0.3210401891252955, + "grad_norm": 2.634523391723633, + "learning_rate": 4.973952872290568e-06, + "loss": 0.6038, + "step": 679 + }, + { + "epoch": 0.3215130023640662, + "grad_norm": 3.19693922996521, + "learning_rate": 4.973862979025194e-06, + "loss": 0.6383, + "step": 680 + }, + { + "epoch": 0.3219858156028369, + "grad_norm": 3.437692165374756, + "learning_rate": 4.973772931723218e-06, + "loss": 0.7288, + "step": 681 + }, + { + "epoch": 0.3224586288416076, + "grad_norm": 2.506301164627075, + "learning_rate": 4.97368273039025e-06, + "loss": 0.5707, + "step": 682 + }, + { + "epoch": 0.3229314420803783, + "grad_norm": 3.0942845344543457, + "learning_rate": 4.9735923750319044e-06, + "loss": 0.6348, + "step": 683 + }, + { + "epoch": 0.32340425531914896, + "grad_norm": 3.0889835357666016, + "learning_rate": 4.973501865653809e-06, + "loss": 0.6697, + "step": 684 + }, + { + "epoch": 0.32387706855791965, + "grad_norm": 3.0391931533813477, + "learning_rate": 4.973411202261598e-06, + "loss": 0.7091, + "step": 685 + }, + { + "epoch": 0.32434988179669033, + "grad_norm": 3.0333497524261475, + "learning_rate": 4.973320384860917e-06, + "loss": 0.6403, + "step": 686 + }, + { + "epoch": 0.324822695035461, + "grad_norm": 2.9714622497558594, + "learning_rate": 4.973229413457421e-06, + "loss": 0.6977, + "step": 687 + }, + { + "epoch": 0.3252955082742317, + "grad_norm": 3.057558298110962, + "learning_rate": 4.973138288056774e-06, + "loss": 0.7236, + "step": 688 + }, + { + "epoch": 0.3257683215130024, + "grad_norm": 2.921093463897705, + "learning_rate": 4.97304700866465e-06, + "loss": 0.576, + "step": 689 + }, + { + "epoch": 0.3262411347517731, + "grad_norm": 3.0287256240844727, + "learning_rate": 4.972955575286732e-06, + "loss": 0.7077, + "step": 690 + }, + { + "epoch": 0.32671394799054376, + "grad_norm": 2.8621346950531006, + "learning_rate": 4.972863987928716e-06, + "loss": 0.6952, + "step": 691 + }, + { + "epoch": 0.3271867612293144, + "grad_norm": 2.631359100341797, + "learning_rate": 4.9727722465963006e-06, + "loss": 0.6931, + "step": 692 + }, + { + "epoch": 0.3276595744680851, + "grad_norm": 2.8484320640563965, + "learning_rate": 4.972680351295201e-06, + "loss": 0.6292, + "step": 693 + }, + { + "epoch": 0.32813238770685577, + "grad_norm": 2.593001365661621, + "learning_rate": 4.972588302031138e-06, + "loss": 0.5942, + "step": 694 + }, + { + "epoch": 0.32860520094562645, + "grad_norm": 2.6321065425872803, + "learning_rate": 4.972496098809844e-06, + "loss": 0.65, + "step": 695 + }, + { + "epoch": 0.32907801418439714, + "grad_norm": 3.2516732215881348, + "learning_rate": 4.972403741637059e-06, + "loss": 0.7385, + "step": 696 + }, + { + "epoch": 0.3295508274231678, + "grad_norm": 3.180854320526123, + "learning_rate": 4.972311230518535e-06, + "loss": 0.6569, + "step": 697 + }, + { + "epoch": 0.3300236406619385, + "grad_norm": 4.161016941070557, + "learning_rate": 4.972218565460031e-06, + "loss": 0.6416, + "step": 698 + }, + { + "epoch": 0.3304964539007092, + "grad_norm": 3.153897762298584, + "learning_rate": 4.972125746467317e-06, + "loss": 0.7196, + "step": 699 + }, + { + "epoch": 0.3309692671394799, + "grad_norm": 2.9595556259155273, + "learning_rate": 4.972032773546173e-06, + "loss": 0.7093, + "step": 700 + }, + { + "epoch": 0.33144208037825057, + "grad_norm": 3.1086833477020264, + "learning_rate": 4.9719396467023875e-06, + "loss": 0.6963, + "step": 701 + }, + { + "epoch": 0.33191489361702126, + "grad_norm": 2.958921432495117, + "learning_rate": 4.971846365941759e-06, + "loss": 0.6518, + "step": 702 + }, + { + "epoch": 0.33238770685579194, + "grad_norm": 2.8745479583740234, + "learning_rate": 4.971752931270096e-06, + "loss": 0.696, + "step": 703 + }, + { + "epoch": 0.33286052009456263, + "grad_norm": 3.224358558654785, + "learning_rate": 4.971659342693217e-06, + "loss": 0.6769, + "step": 704 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 2.696319580078125, + "learning_rate": 4.9715656002169486e-06, + "loss": 0.6833, + "step": 705 + }, + { + "epoch": 0.333806146572104, + "grad_norm": 2.9283502101898193, + "learning_rate": 4.971471703847127e-06, + "loss": 0.6784, + "step": 706 + }, + { + "epoch": 0.3342789598108747, + "grad_norm": 2.654914140701294, + "learning_rate": 4.9713776535896e-06, + "loss": 0.6337, + "step": 707 + }, + { + "epoch": 0.3347517730496454, + "grad_norm": 3.041555643081665, + "learning_rate": 4.971283449450224e-06, + "loss": 0.6227, + "step": 708 + }, + { + "epoch": 0.33522458628841606, + "grad_norm": 2.893008232116699, + "learning_rate": 4.971189091434863e-06, + "loss": 0.655, + "step": 709 + }, + { + "epoch": 0.33569739952718675, + "grad_norm": 2.8806653022766113, + "learning_rate": 4.971094579549393e-06, + "loss": 0.7077, + "step": 710 + }, + { + "epoch": 0.33617021276595743, + "grad_norm": 3.4830048084259033, + "learning_rate": 4.9709999137996986e-06, + "loss": 0.7461, + "step": 711 + }, + { + "epoch": 0.3366430260047281, + "grad_norm": 3.155444860458374, + "learning_rate": 4.970905094191674e-06, + "loss": 0.652, + "step": 712 + }, + { + "epoch": 0.3371158392434988, + "grad_norm": 2.7608706951141357, + "learning_rate": 4.970810120731225e-06, + "loss": 0.684, + "step": 713 + }, + { + "epoch": 0.3375886524822695, + "grad_norm": 2.8209474086761475, + "learning_rate": 4.970714993424265e-06, + "loss": 0.6009, + "step": 714 + }, + { + "epoch": 0.3380614657210402, + "grad_norm": 3.6532654762268066, + "learning_rate": 4.9706197122767145e-06, + "loss": 0.702, + "step": 715 + }, + { + "epoch": 0.33853427895981086, + "grad_norm": 2.6276566982269287, + "learning_rate": 4.970524277294508e-06, + "loss": 0.6338, + "step": 716 + }, + { + "epoch": 0.33900709219858155, + "grad_norm": 3.509871482849121, + "learning_rate": 4.970428688483589e-06, + "loss": 0.6853, + "step": 717 + }, + { + "epoch": 0.33947990543735224, + "grad_norm": 5.332682132720947, + "learning_rate": 4.970332945849906e-06, + "loss": 0.6684, + "step": 718 + }, + { + "epoch": 0.3399527186761229, + "grad_norm": 2.718801975250244, + "learning_rate": 4.970237049399424e-06, + "loss": 0.6676, + "step": 719 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 3.891003131866455, + "learning_rate": 4.970140999138112e-06, + "loss": 0.7043, + "step": 720 + }, + { + "epoch": 0.3408983451536643, + "grad_norm": 2.8863155841827393, + "learning_rate": 4.970044795071951e-06, + "loss": 0.6563, + "step": 721 + }, + { + "epoch": 0.341371158392435, + "grad_norm": 3.2527518272399902, + "learning_rate": 4.969948437206932e-06, + "loss": 0.7244, + "step": 722 + }, + { + "epoch": 0.34184397163120567, + "grad_norm": 2.9726758003234863, + "learning_rate": 4.969851925549054e-06, + "loss": 0.6548, + "step": 723 + }, + { + "epoch": 0.34231678486997635, + "grad_norm": 3.118309497833252, + "learning_rate": 4.969755260104327e-06, + "loss": 0.7293, + "step": 724 + }, + { + "epoch": 0.34278959810874704, + "grad_norm": 3.373068332672119, + "learning_rate": 4.969658440878769e-06, + "loss": 0.6444, + "step": 725 + }, + { + "epoch": 0.3432624113475177, + "grad_norm": 2.7157437801361084, + "learning_rate": 4.969561467878409e-06, + "loss": 0.642, + "step": 726 + }, + { + "epoch": 0.3437352245862884, + "grad_norm": 2.58929705619812, + "learning_rate": 4.969464341109285e-06, + "loss": 0.6165, + "step": 727 + }, + { + "epoch": 0.3442080378250591, + "grad_norm": 2.8811306953430176, + "learning_rate": 4.969367060577445e-06, + "loss": 0.7127, + "step": 728 + }, + { + "epoch": 0.3446808510638298, + "grad_norm": 3.494358539581299, + "learning_rate": 4.969269626288946e-06, + "loss": 0.7103, + "step": 729 + }, + { + "epoch": 0.34515366430260047, + "grad_norm": 2.9753928184509277, + "learning_rate": 4.969172038249855e-06, + "loss": 0.6911, + "step": 730 + }, + { + "epoch": 0.34562647754137116, + "grad_norm": 3.2885913848876953, + "learning_rate": 4.969074296466247e-06, + "loss": 0.6968, + "step": 731 + }, + { + "epoch": 0.34609929078014184, + "grad_norm": 2.7564568519592285, + "learning_rate": 4.968976400944211e-06, + "loss": 0.6843, + "step": 732 + }, + { + "epoch": 0.34657210401891253, + "grad_norm": 2.9255006313323975, + "learning_rate": 4.96887835168984e-06, + "loss": 0.6024, + "step": 733 + }, + { + "epoch": 0.3470449172576832, + "grad_norm": 3.1808290481567383, + "learning_rate": 4.968780148709239e-06, + "loss": 0.7377, + "step": 734 + }, + { + "epoch": 0.3475177304964539, + "grad_norm": 2.956666946411133, + "learning_rate": 4.968681792008523e-06, + "loss": 0.65, + "step": 735 + }, + { + "epoch": 0.3479905437352246, + "grad_norm": 2.9631855487823486, + "learning_rate": 4.9685832815938175e-06, + "loss": 0.677, + "step": 736 + }, + { + "epoch": 0.3484633569739953, + "grad_norm": 2.501917600631714, + "learning_rate": 4.968484617471256e-06, + "loss": 0.6282, + "step": 737 + }, + { + "epoch": 0.34893617021276596, + "grad_norm": 2.750779628753662, + "learning_rate": 4.968385799646981e-06, + "loss": 0.6507, + "step": 738 + }, + { + "epoch": 0.34940898345153665, + "grad_norm": 2.872300624847412, + "learning_rate": 4.968286828127146e-06, + "loss": 0.5949, + "step": 739 + }, + { + "epoch": 0.34988179669030733, + "grad_norm": 2.6316142082214355, + "learning_rate": 4.9681877029179124e-06, + "loss": 0.6328, + "step": 740 + }, + { + "epoch": 0.350354609929078, + "grad_norm": 3.244364023208618, + "learning_rate": 4.968088424025454e-06, + "loss": 0.7393, + "step": 741 + }, + { + "epoch": 0.3508274231678487, + "grad_norm": 2.620465040206909, + "learning_rate": 4.967988991455951e-06, + "loss": 0.6797, + "step": 742 + }, + { + "epoch": 0.3513002364066194, + "grad_norm": 2.854513645172119, + "learning_rate": 4.967889405215596e-06, + "loss": 0.6368, + "step": 743 + }, + { + "epoch": 0.3517730496453901, + "grad_norm": 2.579854726791382, + "learning_rate": 4.9677896653105886e-06, + "loss": 0.6489, + "step": 744 + }, + { + "epoch": 0.35224586288416077, + "grad_norm": 3.0697381496429443, + "learning_rate": 4.96768977174714e-06, + "loss": 0.6313, + "step": 745 + }, + { + "epoch": 0.35271867612293145, + "grad_norm": 3.369338035583496, + "learning_rate": 4.96758972453147e-06, + "loss": 0.7416, + "step": 746 + }, + { + "epoch": 0.35319148936170214, + "grad_norm": 2.836221933364868, + "learning_rate": 4.967489523669807e-06, + "loss": 0.6422, + "step": 747 + }, + { + "epoch": 0.3536643026004728, + "grad_norm": 2.929579496383667, + "learning_rate": 4.967389169168392e-06, + "loss": 0.6482, + "step": 748 + }, + { + "epoch": 0.3541371158392435, + "grad_norm": 2.9243831634521484, + "learning_rate": 4.967288661033472e-06, + "loss": 0.5813, + "step": 749 + }, + { + "epoch": 0.3546099290780142, + "grad_norm": 3.7555336952209473, + "learning_rate": 4.967187999271306e-06, + "loss": 0.6501, + "step": 750 + }, + { + "epoch": 0.3550827423167849, + "grad_norm": 3.4279143810272217, + "learning_rate": 4.9670871838881615e-06, + "loss": 0.6326, + "step": 751 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 2.875066041946411, + "learning_rate": 4.9669862148903166e-06, + "loss": 0.664, + "step": 752 + }, + { + "epoch": 0.35602836879432626, + "grad_norm": 3.130394697189331, + "learning_rate": 4.966885092284057e-06, + "loss": 0.706, + "step": 753 + }, + { + "epoch": 0.35650118203309694, + "grad_norm": 2.9606287479400635, + "learning_rate": 4.96678381607568e-06, + "loss": 0.693, + "step": 754 + }, + { + "epoch": 0.35697399527186763, + "grad_norm": 3.0584909915924072, + "learning_rate": 4.966682386271491e-06, + "loss": 0.6034, + "step": 755 + }, + { + "epoch": 0.3574468085106383, + "grad_norm": 2.8215200901031494, + "learning_rate": 4.966580802877805e-06, + "loss": 0.6217, + "step": 756 + }, + { + "epoch": 0.357919621749409, + "grad_norm": 2.7348055839538574, + "learning_rate": 4.966479065900949e-06, + "loss": 0.6194, + "step": 757 + }, + { + "epoch": 0.3583924349881797, + "grad_norm": 3.2347466945648193, + "learning_rate": 4.966377175347257e-06, + "loss": 0.6377, + "step": 758 + }, + { + "epoch": 0.3588652482269504, + "grad_norm": 3.311845302581787, + "learning_rate": 4.966275131223072e-06, + "loss": 0.6234, + "step": 759 + }, + { + "epoch": 0.35933806146572106, + "grad_norm": 3.0384368896484375, + "learning_rate": 4.96617293353475e-06, + "loss": 0.609, + "step": 760 + }, + { + "epoch": 0.35981087470449175, + "grad_norm": 3.516854763031006, + "learning_rate": 4.966070582288653e-06, + "loss": 0.6627, + "step": 761 + }, + { + "epoch": 0.36028368794326243, + "grad_norm": 3.2425215244293213, + "learning_rate": 4.9659680774911534e-06, + "loss": 0.7355, + "step": 762 + }, + { + "epoch": 0.3607565011820331, + "grad_norm": 3.2665750980377197, + "learning_rate": 4.965865419148636e-06, + "loss": 0.6787, + "step": 763 + }, + { + "epoch": 0.3612293144208038, + "grad_norm": 2.729428291320801, + "learning_rate": 4.96576260726749e-06, + "loss": 0.6272, + "step": 764 + }, + { + "epoch": 0.3617021276595745, + "grad_norm": 3.299969434738159, + "learning_rate": 4.965659641854119e-06, + "loss": 0.6552, + "step": 765 + }, + { + "epoch": 0.3621749408983452, + "grad_norm": 2.7090916633605957, + "learning_rate": 4.965556522914934e-06, + "loss": 0.6661, + "step": 766 + }, + { + "epoch": 0.36264775413711586, + "grad_norm": 2.488846778869629, + "learning_rate": 4.965453250456355e-06, + "loss": 0.5821, + "step": 767 + }, + { + "epoch": 0.36312056737588655, + "grad_norm": 2.5267233848571777, + "learning_rate": 4.965349824484813e-06, + "loss": 0.5593, + "step": 768 + }, + { + "epoch": 0.36359338061465724, + "grad_norm": 3.0646679401397705, + "learning_rate": 4.965246245006748e-06, + "loss": 0.6341, + "step": 769 + }, + { + "epoch": 0.3640661938534279, + "grad_norm": 2.9877712726593018, + "learning_rate": 4.965142512028609e-06, + "loss": 0.7202, + "step": 770 + }, + { + "epoch": 0.3645390070921986, + "grad_norm": 3.7494113445281982, + "learning_rate": 4.965038625556854e-06, + "loss": 0.7643, + "step": 771 + }, + { + "epoch": 0.3650118203309693, + "grad_norm": 2.8382890224456787, + "learning_rate": 4.964934585597954e-06, + "loss": 0.6522, + "step": 772 + }, + { + "epoch": 0.3654846335697399, + "grad_norm": 3.091655731201172, + "learning_rate": 4.9648303921583854e-06, + "loss": 0.7117, + "step": 773 + }, + { + "epoch": 0.3659574468085106, + "grad_norm": 3.0608325004577637, + "learning_rate": 4.964726045244635e-06, + "loss": 0.6538, + "step": 774 + }, + { + "epoch": 0.3664302600472813, + "grad_norm": 2.8492867946624756, + "learning_rate": 4.964621544863203e-06, + "loss": 0.6079, + "step": 775 + }, + { + "epoch": 0.366903073286052, + "grad_norm": 3.0669894218444824, + "learning_rate": 4.964516891020594e-06, + "loss": 0.6223, + "step": 776 + }, + { + "epoch": 0.36737588652482267, + "grad_norm": 3.089984893798828, + "learning_rate": 4.964412083723325e-06, + "loss": 0.671, + "step": 777 + }, + { + "epoch": 0.36784869976359336, + "grad_norm": 2.905242443084717, + "learning_rate": 4.964307122977921e-06, + "loss": 0.62, + "step": 778 + }, + { + "epoch": 0.36832151300236404, + "grad_norm": 3.954436779022217, + "learning_rate": 4.964202008790918e-06, + "loss": 0.6535, + "step": 779 + }, + { + "epoch": 0.36879432624113473, + "grad_norm": 2.6026058197021484, + "learning_rate": 4.9640967411688615e-06, + "loss": 0.5865, + "step": 780 + }, + { + "epoch": 0.3692671394799054, + "grad_norm": 2.9876346588134766, + "learning_rate": 4.963991320118306e-06, + "loss": 0.6698, + "step": 781 + }, + { + "epoch": 0.3697399527186761, + "grad_norm": 2.9411263465881348, + "learning_rate": 4.963885745645815e-06, + "loss": 0.6173, + "step": 782 + }, + { + "epoch": 0.3702127659574468, + "grad_norm": 2.5679805278778076, + "learning_rate": 4.963780017757962e-06, + "loss": 0.6285, + "step": 783 + }, + { + "epoch": 0.3706855791962175, + "grad_norm": 3.3100640773773193, + "learning_rate": 4.963674136461332e-06, + "loss": 0.5968, + "step": 784 + }, + { + "epoch": 0.37115839243498816, + "grad_norm": 3.1293699741363525, + "learning_rate": 4.963568101762515e-06, + "loss": 0.697, + "step": 785 + }, + { + "epoch": 0.37163120567375885, + "grad_norm": 3.043853759765625, + "learning_rate": 4.963461913668115e-06, + "loss": 0.5881, + "step": 786 + }, + { + "epoch": 0.37210401891252953, + "grad_norm": 3.07351016998291, + "learning_rate": 4.963355572184744e-06, + "loss": 0.6307, + "step": 787 + }, + { + "epoch": 0.3725768321513002, + "grad_norm": 2.7381317615509033, + "learning_rate": 4.9632490773190225e-06, + "loss": 0.716, + "step": 788 + }, + { + "epoch": 0.3730496453900709, + "grad_norm": 2.892221450805664, + "learning_rate": 4.963142429077582e-06, + "loss": 0.6867, + "step": 789 + }, + { + "epoch": 0.3735224586288416, + "grad_norm": 3.133122205734253, + "learning_rate": 4.963035627467064e-06, + "loss": 0.659, + "step": 790 + }, + { + "epoch": 0.3739952718676123, + "grad_norm": 3.032599925994873, + "learning_rate": 4.962928672494116e-06, + "loss": 0.6848, + "step": 791 + }, + { + "epoch": 0.37446808510638296, + "grad_norm": 3.0076355934143066, + "learning_rate": 4.9628215641654e-06, + "loss": 0.6549, + "step": 792 + }, + { + "epoch": 0.37494089834515365, + "grad_norm": 2.8904454708099365, + "learning_rate": 4.962714302487585e-06, + "loss": 0.6484, + "step": 793 + }, + { + "epoch": 0.37541371158392434, + "grad_norm": 2.881364107131958, + "learning_rate": 4.9626068874673486e-06, + "loss": 0.721, + "step": 794 + }, + { + "epoch": 0.375886524822695, + "grad_norm": 3.11668062210083, + "learning_rate": 4.962499319111379e-06, + "loss": 0.7824, + "step": 795 + }, + { + "epoch": 0.3763593380614657, + "grad_norm": 2.9201436042785645, + "learning_rate": 4.962391597426374e-06, + "loss": 0.6911, + "step": 796 + }, + { + "epoch": 0.3768321513002364, + "grad_norm": 2.926598072052002, + "learning_rate": 4.962283722419043e-06, + "loss": 0.6715, + "step": 797 + }, + { + "epoch": 0.3773049645390071, + "grad_norm": 2.7267675399780273, + "learning_rate": 4.962175694096101e-06, + "loss": 0.6111, + "step": 798 + }, + { + "epoch": 0.37777777777777777, + "grad_norm": 3.194031000137329, + "learning_rate": 4.962067512464275e-06, + "loss": 0.6558, + "step": 799 + }, + { + "epoch": 0.37825059101654845, + "grad_norm": 2.6249136924743652, + "learning_rate": 4.9619591775303e-06, + "loss": 0.6166, + "step": 800 + }, + { + "epoch": 0.37872340425531914, + "grad_norm": 2.6356167793273926, + "learning_rate": 4.961850689300923e-06, + "loss": 0.6112, + "step": 801 + }, + { + "epoch": 0.3791962174940898, + "grad_norm": 3.030724287033081, + "learning_rate": 4.961742047782898e-06, + "loss": 0.6511, + "step": 802 + }, + { + "epoch": 0.3796690307328605, + "grad_norm": 3.4987757205963135, + "learning_rate": 4.96163325298299e-06, + "loss": 0.5888, + "step": 803 + }, + { + "epoch": 0.3801418439716312, + "grad_norm": 3.0371780395507812, + "learning_rate": 4.961524304907974e-06, + "loss": 0.6385, + "step": 804 + }, + { + "epoch": 0.3806146572104019, + "grad_norm": 3.302570104598999, + "learning_rate": 4.961415203564632e-06, + "loss": 0.6515, + "step": 805 + }, + { + "epoch": 0.38108747044917257, + "grad_norm": 2.7597038745880127, + "learning_rate": 4.961305948959759e-06, + "loss": 0.6126, + "step": 806 + }, + { + "epoch": 0.38156028368794326, + "grad_norm": 2.789811849594116, + "learning_rate": 4.9611965411001575e-06, + "loss": 0.6601, + "step": 807 + }, + { + "epoch": 0.38203309692671394, + "grad_norm": 3.0403921604156494, + "learning_rate": 4.961086979992639e-06, + "loss": 0.6947, + "step": 808 + }, + { + "epoch": 0.38250591016548463, + "grad_norm": 3.2139980792999268, + "learning_rate": 4.960977265644026e-06, + "loss": 0.6876, + "step": 809 + }, + { + "epoch": 0.3829787234042553, + "grad_norm": 2.918515205383301, + "learning_rate": 4.960867398061149e-06, + "loss": 0.5997, + "step": 810 + }, + { + "epoch": 0.383451536643026, + "grad_norm": 3.197636604309082, + "learning_rate": 4.9607573772508495e-06, + "loss": 0.5754, + "step": 811 + }, + { + "epoch": 0.3839243498817967, + "grad_norm": 2.8848466873168945, + "learning_rate": 4.960647203219979e-06, + "loss": 0.6424, + "step": 812 + }, + { + "epoch": 0.3843971631205674, + "grad_norm": 3.4810187816619873, + "learning_rate": 4.960536875975397e-06, + "loss": 0.6851, + "step": 813 + }, + { + "epoch": 0.38486997635933806, + "grad_norm": 3.713934898376465, + "learning_rate": 4.960426395523972e-06, + "loss": 0.6122, + "step": 814 + }, + { + "epoch": 0.38534278959810875, + "grad_norm": 2.862600803375244, + "learning_rate": 4.960315761872585e-06, + "loss": 0.6493, + "step": 815 + }, + { + "epoch": 0.38581560283687943, + "grad_norm": 3.133882522583008, + "learning_rate": 4.960204975028123e-06, + "loss": 0.7535, + "step": 816 + }, + { + "epoch": 0.3862884160756501, + "grad_norm": 3.1526732444763184, + "learning_rate": 4.960094034997485e-06, + "loss": 0.6512, + "step": 817 + }, + { + "epoch": 0.3867612293144208, + "grad_norm": 2.7213544845581055, + "learning_rate": 4.959982941787579e-06, + "loss": 0.6121, + "step": 818 + }, + { + "epoch": 0.3872340425531915, + "grad_norm": 3.4935851097106934, + "learning_rate": 4.9598716954053214e-06, + "loss": 0.7852, + "step": 819 + }, + { + "epoch": 0.3877068557919622, + "grad_norm": 2.691016435623169, + "learning_rate": 4.9597602958576395e-06, + "loss": 0.6861, + "step": 820 + }, + { + "epoch": 0.38817966903073287, + "grad_norm": 2.8621015548706055, + "learning_rate": 4.959648743151469e-06, + "loss": 0.6262, + "step": 821 + }, + { + "epoch": 0.38865248226950355, + "grad_norm": 3.3887462615966797, + "learning_rate": 4.959537037293758e-06, + "loss": 0.7103, + "step": 822 + }, + { + "epoch": 0.38912529550827424, + "grad_norm": 2.7565438747406006, + "learning_rate": 4.95942517829146e-06, + "loss": 0.6471, + "step": 823 + }, + { + "epoch": 0.3895981087470449, + "grad_norm": 2.7920358180999756, + "learning_rate": 4.959313166151541e-06, + "loss": 0.6239, + "step": 824 + }, + { + "epoch": 0.3900709219858156, + "grad_norm": 3.18904185295105, + "learning_rate": 4.959201000880973e-06, + "loss": 0.7461, + "step": 825 + }, + { + "epoch": 0.3905437352245863, + "grad_norm": 2.727872371673584, + "learning_rate": 4.959088682486743e-06, + "loss": 0.6333, + "step": 826 + }, + { + "epoch": 0.391016548463357, + "grad_norm": 2.906378746032715, + "learning_rate": 4.958976210975844e-06, + "loss": 0.7547, + "step": 827 + }, + { + "epoch": 0.39148936170212767, + "grad_norm": 2.96482515335083, + "learning_rate": 4.958863586355278e-06, + "loss": 0.6312, + "step": 828 + }, + { + "epoch": 0.39196217494089836, + "grad_norm": 3.2890889644622803, + "learning_rate": 4.958750808632059e-06, + "loss": 0.6943, + "step": 829 + }, + { + "epoch": 0.39243498817966904, + "grad_norm": 2.7004311084747314, + "learning_rate": 4.958637877813207e-06, + "loss": 0.5918, + "step": 830 + }, + { + "epoch": 0.39290780141843973, + "grad_norm": 2.7487950325012207, + "learning_rate": 4.9585247939057566e-06, + "loss": 0.6201, + "step": 831 + }, + { + "epoch": 0.3933806146572104, + "grad_norm": 2.7873897552490234, + "learning_rate": 4.958411556916747e-06, + "loss": 0.6268, + "step": 832 + }, + { + "epoch": 0.3938534278959811, + "grad_norm": 2.8501343727111816, + "learning_rate": 4.958298166853229e-06, + "loss": 0.7119, + "step": 833 + }, + { + "epoch": 0.3943262411347518, + "grad_norm": 3.0391547679901123, + "learning_rate": 4.958184623722265e-06, + "loss": 0.6375, + "step": 834 + }, + { + "epoch": 0.3947990543735225, + "grad_norm": 2.850520133972168, + "learning_rate": 4.958070927530922e-06, + "loss": 0.5962, + "step": 835 + }, + { + "epoch": 0.39527186761229316, + "grad_norm": 3.351914644241333, + "learning_rate": 4.957957078286281e-06, + "loss": 0.7247, + "step": 836 + }, + { + "epoch": 0.39574468085106385, + "grad_norm": 2.9559543132781982, + "learning_rate": 4.957843075995431e-06, + "loss": 0.6571, + "step": 837 + }, + { + "epoch": 0.39621749408983453, + "grad_norm": 3.225785255432129, + "learning_rate": 4.95772892066547e-06, + "loss": 0.7074, + "step": 838 + }, + { + "epoch": 0.3966903073286052, + "grad_norm": 2.7842373847961426, + "learning_rate": 4.957614612303505e-06, + "loss": 0.6469, + "step": 839 + }, + { + "epoch": 0.3971631205673759, + "grad_norm": 4.249724864959717, + "learning_rate": 4.957500150916655e-06, + "loss": 0.741, + "step": 840 + }, + { + "epoch": 0.3976359338061466, + "grad_norm": 3.138221263885498, + "learning_rate": 4.957385536512046e-06, + "loss": 0.6676, + "step": 841 + }, + { + "epoch": 0.3981087470449173, + "grad_norm": 3.456423759460449, + "learning_rate": 4.957270769096816e-06, + "loss": 0.6877, + "step": 842 + }, + { + "epoch": 0.39858156028368796, + "grad_norm": 2.8676278591156006, + "learning_rate": 4.957155848678109e-06, + "loss": 0.5986, + "step": 843 + }, + { + "epoch": 0.39905437352245865, + "grad_norm": 2.705324411392212, + "learning_rate": 4.957040775263082e-06, + "loss": 0.6356, + "step": 844 + }, + { + "epoch": 0.39952718676122934, + "grad_norm": 3.0767486095428467, + "learning_rate": 4.9569255488589e-06, + "loss": 0.6844, + "step": 845 + }, + { + "epoch": 0.4, + "grad_norm": 2.7787704467773438, + "learning_rate": 4.956810169472736e-06, + "loss": 0.6641, + "step": 846 + }, + { + "epoch": 0.4004728132387707, + "grad_norm": 2.584277868270874, + "learning_rate": 4.956694637111777e-06, + "loss": 0.6256, + "step": 847 + }, + { + "epoch": 0.4009456264775414, + "grad_norm": 2.751641273498535, + "learning_rate": 4.956578951783215e-06, + "loss": 0.5954, + "step": 848 + }, + { + "epoch": 0.4014184397163121, + "grad_norm": 3.0181658267974854, + "learning_rate": 4.956463113494253e-06, + "loss": 0.6569, + "step": 849 + }, + { + "epoch": 0.40189125295508277, + "grad_norm": 3.0933220386505127, + "learning_rate": 4.956347122252104e-06, + "loss": 0.6248, + "step": 850 + }, + { + "epoch": 0.40236406619385345, + "grad_norm": 3.3767428398132324, + "learning_rate": 4.956230978063991e-06, + "loss": 0.719, + "step": 851 + }, + { + "epoch": 0.40283687943262414, + "grad_norm": 3.7666573524475098, + "learning_rate": 4.956114680937145e-06, + "loss": 0.6467, + "step": 852 + }, + { + "epoch": 0.4033096926713948, + "grad_norm": 2.9836843013763428, + "learning_rate": 4.955998230878808e-06, + "loss": 0.6993, + "step": 853 + }, + { + "epoch": 0.4037825059101655, + "grad_norm": 2.981497049331665, + "learning_rate": 4.955881627896229e-06, + "loss": 0.6578, + "step": 854 + }, + { + "epoch": 0.40425531914893614, + "grad_norm": 3.1369056701660156, + "learning_rate": 4.955764871996672e-06, + "loss": 0.6763, + "step": 855 + }, + { + "epoch": 0.40472813238770683, + "grad_norm": 2.7675817012786865, + "learning_rate": 4.9556479631874036e-06, + "loss": 0.6488, + "step": 856 + }, + { + "epoch": 0.4052009456264775, + "grad_norm": 3.035334825515747, + "learning_rate": 4.9555309014757034e-06, + "loss": 0.7076, + "step": 857 + }, + { + "epoch": 0.4056737588652482, + "grad_norm": 3.493704319000244, + "learning_rate": 4.955413686868862e-06, + "loss": 0.6773, + "step": 858 + }, + { + "epoch": 0.4061465721040189, + "grad_norm": 3.245487928390503, + "learning_rate": 4.9552963193741765e-06, + "loss": 0.6915, + "step": 859 + }, + { + "epoch": 0.4066193853427896, + "grad_norm": 3.189969539642334, + "learning_rate": 4.955178798998956e-06, + "loss": 0.7318, + "step": 860 + }, + { + "epoch": 0.40709219858156026, + "grad_norm": 2.7987146377563477, + "learning_rate": 4.955061125750517e-06, + "loss": 0.6162, + "step": 861 + }, + { + "epoch": 0.40756501182033095, + "grad_norm": 3.020118474960327, + "learning_rate": 4.954943299636187e-06, + "loss": 0.6678, + "step": 862 + }, + { + "epoch": 0.40803782505910163, + "grad_norm": 2.715463876724243, + "learning_rate": 4.954825320663302e-06, + "loss": 0.668, + "step": 863 + }, + { + "epoch": 0.4085106382978723, + "grad_norm": 2.595050096511841, + "learning_rate": 4.9547071888392085e-06, + "loss": 0.6557, + "step": 864 + }, + { + "epoch": 0.408983451536643, + "grad_norm": 3.131596088409424, + "learning_rate": 4.954588904171261e-06, + "loss": 0.6548, + "step": 865 + }, + { + "epoch": 0.4094562647754137, + "grad_norm": 2.5742313861846924, + "learning_rate": 4.954470466666827e-06, + "loss": 0.6592, + "step": 866 + }, + { + "epoch": 0.4099290780141844, + "grad_norm": 2.8612802028656006, + "learning_rate": 4.9543518763332785e-06, + "loss": 0.5391, + "step": 867 + }, + { + "epoch": 0.41040189125295506, + "grad_norm": 2.8973186016082764, + "learning_rate": 4.954233133178001e-06, + "loss": 0.6649, + "step": 868 + }, + { + "epoch": 0.41087470449172575, + "grad_norm": 2.802525043487549, + "learning_rate": 4.954114237208388e-06, + "loss": 0.6212, + "step": 869 + }, + { + "epoch": 0.41134751773049644, + "grad_norm": 2.5919506549835205, + "learning_rate": 4.953995188431843e-06, + "loss": 0.6596, + "step": 870 + }, + { + "epoch": 0.4118203309692671, + "grad_norm": 3.139169454574585, + "learning_rate": 4.953875986855777e-06, + "loss": 0.6799, + "step": 871 + }, + { + "epoch": 0.4122931442080378, + "grad_norm": 3.99727725982666, + "learning_rate": 4.953756632487614e-06, + "loss": 0.6519, + "step": 872 + }, + { + "epoch": 0.4127659574468085, + "grad_norm": 3.238706350326538, + "learning_rate": 4.953637125334784e-06, + "loss": 0.7361, + "step": 873 + }, + { + "epoch": 0.4132387706855792, + "grad_norm": 2.780019998550415, + "learning_rate": 4.9535174654047295e-06, + "loss": 0.6406, + "step": 874 + }, + { + "epoch": 0.41371158392434987, + "grad_norm": 2.7629551887512207, + "learning_rate": 4.953397652704901e-06, + "loss": 0.6131, + "step": 875 + }, + { + "epoch": 0.41418439716312055, + "grad_norm": 2.8008246421813965, + "learning_rate": 4.9532776872427585e-06, + "loss": 0.6464, + "step": 876 + }, + { + "epoch": 0.41465721040189124, + "grad_norm": 3.0970115661621094, + "learning_rate": 4.953157569025772e-06, + "loss": 0.7066, + "step": 877 + }, + { + "epoch": 0.4151300236406619, + "grad_norm": 2.8375589847564697, + "learning_rate": 4.9530372980614195e-06, + "loss": 0.6551, + "step": 878 + }, + { + "epoch": 0.4156028368794326, + "grad_norm": 2.718843936920166, + "learning_rate": 4.952916874357191e-06, + "loss": 0.5947, + "step": 879 + }, + { + "epoch": 0.4160756501182033, + "grad_norm": 2.7104697227478027, + "learning_rate": 4.952796297920585e-06, + "loss": 0.6708, + "step": 880 + }, + { + "epoch": 0.416548463356974, + "grad_norm": 2.8223445415496826, + "learning_rate": 4.952675568759108e-06, + "loss": 0.6214, + "step": 881 + }, + { + "epoch": 0.41702127659574467, + "grad_norm": 2.6598153114318848, + "learning_rate": 4.952554686880279e-06, + "loss": 0.6116, + "step": 882 + }, + { + "epoch": 0.41749408983451536, + "grad_norm": 2.8639824390411377, + "learning_rate": 4.952433652291623e-06, + "loss": 0.5971, + "step": 883 + }, + { + "epoch": 0.41796690307328604, + "grad_norm": 2.9578304290771484, + "learning_rate": 4.952312465000677e-06, + "loss": 0.6785, + "step": 884 + }, + { + "epoch": 0.41843971631205673, + "grad_norm": 2.872144937515259, + "learning_rate": 4.952191125014987e-06, + "loss": 0.6772, + "step": 885 + }, + { + "epoch": 0.4189125295508274, + "grad_norm": 2.7513675689697266, + "learning_rate": 4.952069632342108e-06, + "loss": 0.702, + "step": 886 + }, + { + "epoch": 0.4193853427895981, + "grad_norm": 2.9275078773498535, + "learning_rate": 4.951947986989606e-06, + "loss": 0.589, + "step": 887 + }, + { + "epoch": 0.4198581560283688, + "grad_norm": 2.740549325942993, + "learning_rate": 4.951826188965053e-06, + "loss": 0.5942, + "step": 888 + }, + { + "epoch": 0.4203309692671395, + "grad_norm": 2.92452073097229, + "learning_rate": 4.951704238276035e-06, + "loss": 0.6819, + "step": 889 + }, + { + "epoch": 0.42080378250591016, + "grad_norm": 2.842491865158081, + "learning_rate": 4.951582134930144e-06, + "loss": 0.6304, + "step": 890 + }, + { + "epoch": 0.42127659574468085, + "grad_norm": 2.613478422164917, + "learning_rate": 4.951459878934983e-06, + "loss": 0.6912, + "step": 891 + }, + { + "epoch": 0.42174940898345153, + "grad_norm": 3.2408607006073, + "learning_rate": 4.951337470298165e-06, + "loss": 0.6755, + "step": 892 + }, + { + "epoch": 0.4222222222222222, + "grad_norm": 3.1022439002990723, + "learning_rate": 4.9512149090273125e-06, + "loss": 0.6138, + "step": 893 + }, + { + "epoch": 0.4226950354609929, + "grad_norm": 2.6418895721435547, + "learning_rate": 4.951092195130055e-06, + "loss": 0.639, + "step": 894 + }, + { + "epoch": 0.4231678486997636, + "grad_norm": 3.010744333267212, + "learning_rate": 4.950969328614035e-06, + "loss": 0.7102, + "step": 895 + }, + { + "epoch": 0.4236406619385343, + "grad_norm": 2.673292636871338, + "learning_rate": 4.950846309486901e-06, + "loss": 0.5676, + "step": 896 + }, + { + "epoch": 0.42411347517730497, + "grad_norm": 3.6974737644195557, + "learning_rate": 4.950723137756314e-06, + "loss": 0.5722, + "step": 897 + }, + { + "epoch": 0.42458628841607565, + "grad_norm": 3.69028902053833, + "learning_rate": 4.9505998134299435e-06, + "loss": 0.6337, + "step": 898 + }, + { + "epoch": 0.42505910165484634, + "grad_norm": 3.2136125564575195, + "learning_rate": 4.950476336515469e-06, + "loss": 0.6469, + "step": 899 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 2.7396016120910645, + "learning_rate": 4.950352707020577e-06, + "loss": 0.6656, + "step": 900 + }, + { + "epoch": 0.4260047281323877, + "grad_norm": 2.825416088104248, + "learning_rate": 4.950228924952967e-06, + "loss": 0.6298, + "step": 901 + }, + { + "epoch": 0.4264775413711584, + "grad_norm": 3.401658535003662, + "learning_rate": 4.950104990320345e-06, + "loss": 0.778, + "step": 902 + }, + { + "epoch": 0.4269503546099291, + "grad_norm": 2.7002272605895996, + "learning_rate": 4.9499809031304294e-06, + "loss": 0.6536, + "step": 903 + }, + { + "epoch": 0.42742316784869977, + "grad_norm": 2.62386417388916, + "learning_rate": 4.949856663390945e-06, + "loss": 0.6629, + "step": 904 + }, + { + "epoch": 0.42789598108747046, + "grad_norm": 2.584247589111328, + "learning_rate": 4.94973227110963e-06, + "loss": 0.5813, + "step": 905 + }, + { + "epoch": 0.42836879432624114, + "grad_norm": 3.4365768432617188, + "learning_rate": 4.9496077262942265e-06, + "loss": 0.7648, + "step": 906 + }, + { + "epoch": 0.42884160756501183, + "grad_norm": 2.8993639945983887, + "learning_rate": 4.949483028952492e-06, + "loss": 0.6696, + "step": 907 + }, + { + "epoch": 0.4293144208037825, + "grad_norm": 2.922809362411499, + "learning_rate": 4.94935817909219e-06, + "loss": 0.6892, + "step": 908 + }, + { + "epoch": 0.4297872340425532, + "grad_norm": 2.85478138923645, + "learning_rate": 4.9492331767210944e-06, + "loss": 0.536, + "step": 909 + }, + { + "epoch": 0.4302600472813239, + "grad_norm": 2.8639259338378906, + "learning_rate": 4.949108021846988e-06, + "loss": 0.634, + "step": 910 + }, + { + "epoch": 0.4307328605200946, + "grad_norm": 3.0533697605133057, + "learning_rate": 4.948982714477664e-06, + "loss": 0.6318, + "step": 911 + }, + { + "epoch": 0.43120567375886526, + "grad_norm": 2.331674814224243, + "learning_rate": 4.9488572546209255e-06, + "loss": 0.6562, + "step": 912 + }, + { + "epoch": 0.43167848699763595, + "grad_norm": 3.0154623985290527, + "learning_rate": 4.9487316422845835e-06, + "loss": 0.6675, + "step": 913 + }, + { + "epoch": 0.43215130023640663, + "grad_norm": 2.7354514598846436, + "learning_rate": 4.948605877476459e-06, + "loss": 0.6012, + "step": 914 + }, + { + "epoch": 0.4326241134751773, + "grad_norm": 2.863736629486084, + "learning_rate": 4.948479960204383e-06, + "loss": 0.6062, + "step": 915 + }, + { + "epoch": 0.433096926713948, + "grad_norm": 3.01998233795166, + "learning_rate": 4.948353890476197e-06, + "loss": 0.6749, + "step": 916 + }, + { + "epoch": 0.4335697399527187, + "grad_norm": 2.7550456523895264, + "learning_rate": 4.94822766829975e-06, + "loss": 0.6507, + "step": 917 + }, + { + "epoch": 0.4340425531914894, + "grad_norm": 3.370572805404663, + "learning_rate": 4.948101293682901e-06, + "loss": 0.714, + "step": 918 + }, + { + "epoch": 0.43451536643026006, + "grad_norm": 2.9736790657043457, + "learning_rate": 4.947974766633519e-06, + "loss": 0.729, + "step": 919 + }, + { + "epoch": 0.43498817966903075, + "grad_norm": 3.1036548614501953, + "learning_rate": 4.947848087159483e-06, + "loss": 0.7547, + "step": 920 + }, + { + "epoch": 0.43546099290780144, + "grad_norm": 2.895094871520996, + "learning_rate": 4.947721255268679e-06, + "loss": 0.6089, + "step": 921 + }, + { + "epoch": 0.4359338061465721, + "grad_norm": 2.798476219177246, + "learning_rate": 4.947594270969005e-06, + "loss": 0.5432, + "step": 922 + }, + { + "epoch": 0.4364066193853428, + "grad_norm": 2.7675702571868896, + "learning_rate": 4.94746713426837e-06, + "loss": 0.5693, + "step": 923 + }, + { + "epoch": 0.4368794326241135, + "grad_norm": 2.6851553916931152, + "learning_rate": 4.947339845174687e-06, + "loss": 0.6503, + "step": 924 + }, + { + "epoch": 0.4373522458628842, + "grad_norm": 2.909635543823242, + "learning_rate": 4.947212403695883e-06, + "loss": 0.6494, + "step": 925 + }, + { + "epoch": 0.43782505910165487, + "grad_norm": 2.604526996612549, + "learning_rate": 4.947084809839894e-06, + "loss": 0.6349, + "step": 926 + }, + { + "epoch": 0.43829787234042555, + "grad_norm": 3.118149518966675, + "learning_rate": 4.946957063614664e-06, + "loss": 0.6219, + "step": 927 + }, + { + "epoch": 0.43877068557919624, + "grad_norm": 2.7452616691589355, + "learning_rate": 4.9468291650281465e-06, + "loss": 0.6096, + "step": 928 + }, + { + "epoch": 0.4392434988179669, + "grad_norm": 3.30098819732666, + "learning_rate": 4.946701114088307e-06, + "loss": 0.6277, + "step": 929 + }, + { + "epoch": 0.4397163120567376, + "grad_norm": 2.789482593536377, + "learning_rate": 4.946572910803116e-06, + "loss": 0.7, + "step": 930 + }, + { + "epoch": 0.4401891252955083, + "grad_norm": 2.7283935546875, + "learning_rate": 4.946444555180559e-06, + "loss": 0.5375, + "step": 931 + }, + { + "epoch": 0.440661938534279, + "grad_norm": 3.101304054260254, + "learning_rate": 4.946316047228627e-06, + "loss": 0.6131, + "step": 932 + }, + { + "epoch": 0.44113475177304967, + "grad_norm": 3.573908805847168, + "learning_rate": 4.946187386955321e-06, + "loss": 0.7073, + "step": 933 + }, + { + "epoch": 0.44160756501182036, + "grad_norm": 3.214979648590088, + "learning_rate": 4.946058574368653e-06, + "loss": 0.6508, + "step": 934 + }, + { + "epoch": 0.44208037825059104, + "grad_norm": 3.145082712173462, + "learning_rate": 4.945929609476643e-06, + "loss": 0.64, + "step": 935 + }, + { + "epoch": 0.4425531914893617, + "grad_norm": 2.991780996322632, + "learning_rate": 4.945800492287321e-06, + "loss": 0.6315, + "step": 936 + }, + { + "epoch": 0.44302600472813236, + "grad_norm": 3.2441139221191406, + "learning_rate": 4.945671222808727e-06, + "loss": 0.7144, + "step": 937 + }, + { + "epoch": 0.44349881796690305, + "grad_norm": 2.9397029876708984, + "learning_rate": 4.94554180104891e-06, + "loss": 0.6818, + "step": 938 + }, + { + "epoch": 0.44397163120567373, + "grad_norm": 3.2471461296081543, + "learning_rate": 4.945412227015929e-06, + "loss": 0.6921, + "step": 939 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 3.0882487297058105, + "learning_rate": 4.945282500717851e-06, + "loss": 0.718, + "step": 940 + }, + { + "epoch": 0.4449172576832151, + "grad_norm": 2.6035783290863037, + "learning_rate": 4.945152622162753e-06, + "loss": 0.621, + "step": 941 + }, + { + "epoch": 0.4453900709219858, + "grad_norm": 2.83659029006958, + "learning_rate": 4.945022591358724e-06, + "loss": 0.6403, + "step": 942 + }, + { + "epoch": 0.4458628841607565, + "grad_norm": 2.824463129043579, + "learning_rate": 4.944892408313859e-06, + "loss": 0.6594, + "step": 943 + }, + { + "epoch": 0.44633569739952716, + "grad_norm": 2.753735065460205, + "learning_rate": 4.9447620730362645e-06, + "loss": 0.6116, + "step": 944 + }, + { + "epoch": 0.44680851063829785, + "grad_norm": 3.0659725666046143, + "learning_rate": 4.944631585534056e-06, + "loss": 0.5983, + "step": 945 + }, + { + "epoch": 0.44728132387706854, + "grad_norm": 2.969113349914551, + "learning_rate": 4.944500945815357e-06, + "loss": 0.6859, + "step": 946 + }, + { + "epoch": 0.4477541371158392, + "grad_norm": 2.810303211212158, + "learning_rate": 4.944370153888303e-06, + "loss": 0.7025, + "step": 947 + }, + { + "epoch": 0.4482269503546099, + "grad_norm": 3.027721643447876, + "learning_rate": 4.944239209761038e-06, + "loss": 0.7268, + "step": 948 + }, + { + "epoch": 0.4486997635933806, + "grad_norm": 2.661503314971924, + "learning_rate": 4.944108113441716e-06, + "loss": 0.6702, + "step": 949 + }, + { + "epoch": 0.4491725768321513, + "grad_norm": 2.738591432571411, + "learning_rate": 4.943976864938498e-06, + "loss": 0.6728, + "step": 950 + }, + { + "epoch": 0.44964539007092197, + "grad_norm": 3.447505474090576, + "learning_rate": 4.943845464259557e-06, + "loss": 0.6586, + "step": 951 + }, + { + "epoch": 0.45011820330969265, + "grad_norm": 3.0968854427337646, + "learning_rate": 4.943713911413075e-06, + "loss": 0.7666, + "step": 952 + }, + { + "epoch": 0.45059101654846334, + "grad_norm": 2.4113779067993164, + "learning_rate": 4.943582206407244e-06, + "loss": 0.6173, + "step": 953 + }, + { + "epoch": 0.451063829787234, + "grad_norm": 2.6357979774475098, + "learning_rate": 4.943450349250263e-06, + "loss": 0.5589, + "step": 954 + }, + { + "epoch": 0.4515366430260047, + "grad_norm": 2.9182233810424805, + "learning_rate": 4.9433183399503425e-06, + "loss": 0.6252, + "step": 955 + }, + { + "epoch": 0.4520094562647754, + "grad_norm": 2.832740306854248, + "learning_rate": 4.943186178515703e-06, + "loss": 0.6882, + "step": 956 + }, + { + "epoch": 0.4524822695035461, + "grad_norm": 2.9508981704711914, + "learning_rate": 4.943053864954574e-06, + "loss": 0.5722, + "step": 957 + }, + { + "epoch": 0.4529550827423168, + "grad_norm": 3.044729471206665, + "learning_rate": 4.9429213992751925e-06, + "loss": 0.6772, + "step": 958 + }, + { + "epoch": 0.45342789598108746, + "grad_norm": 2.606003522872925, + "learning_rate": 4.9427887814858075e-06, + "loss": 0.6445, + "step": 959 + }, + { + "epoch": 0.45390070921985815, + "grad_norm": 2.4634225368499756, + "learning_rate": 4.942656011594676e-06, + "loss": 0.6151, + "step": 960 + }, + { + "epoch": 0.45437352245862883, + "grad_norm": 2.8872334957122803, + "learning_rate": 4.942523089610066e-06, + "loss": 0.6255, + "step": 961 + }, + { + "epoch": 0.4548463356973995, + "grad_norm": 2.870605707168579, + "learning_rate": 4.942390015540253e-06, + "loss": 0.7481, + "step": 962 + }, + { + "epoch": 0.4553191489361702, + "grad_norm": 2.952680826187134, + "learning_rate": 4.942256789393524e-06, + "loss": 0.5556, + "step": 963 + }, + { + "epoch": 0.4557919621749409, + "grad_norm": 2.623680353164673, + "learning_rate": 4.9421234111781725e-06, + "loss": 0.6115, + "step": 964 + }, + { + "epoch": 0.4562647754137116, + "grad_norm": 2.6933600902557373, + "learning_rate": 4.941989880902505e-06, + "loss": 0.6102, + "step": 965 + }, + { + "epoch": 0.45673758865248226, + "grad_norm": 2.6047189235687256, + "learning_rate": 4.941856198574836e-06, + "loss": 0.612, + "step": 966 + }, + { + "epoch": 0.45721040189125295, + "grad_norm": 2.779186725616455, + "learning_rate": 4.9417223642034885e-06, + "loss": 0.5424, + "step": 967 + }, + { + "epoch": 0.45768321513002364, + "grad_norm": 2.6177165508270264, + "learning_rate": 4.941588377796795e-06, + "loss": 0.4661, + "step": 968 + }, + { + "epoch": 0.4581560283687943, + "grad_norm": 2.959676742553711, + "learning_rate": 4.941454239363101e-06, + "loss": 0.6966, + "step": 969 + }, + { + "epoch": 0.458628841607565, + "grad_norm": 2.9788379669189453, + "learning_rate": 4.941319948910756e-06, + "loss": 0.6181, + "step": 970 + }, + { + "epoch": 0.4591016548463357, + "grad_norm": 4.642750263214111, + "learning_rate": 4.941185506448122e-06, + "loss": 0.5602, + "step": 971 + }, + { + "epoch": 0.4595744680851064, + "grad_norm": 2.793002128601074, + "learning_rate": 4.941050911983572e-06, + "loss": 0.602, + "step": 972 + }, + { + "epoch": 0.46004728132387707, + "grad_norm": 2.6833035945892334, + "learning_rate": 4.9409161655254845e-06, + "loss": 0.5549, + "step": 973 + }, + { + "epoch": 0.46052009456264775, + "grad_norm": 3.905032157897949, + "learning_rate": 4.94078126708225e-06, + "loss": 0.6335, + "step": 974 + }, + { + "epoch": 0.46099290780141844, + "grad_norm": 2.922609329223633, + "learning_rate": 4.94064621666227e-06, + "loss": 0.5839, + "step": 975 + }, + { + "epoch": 0.4614657210401891, + "grad_norm": 2.8277416229248047, + "learning_rate": 4.940511014273952e-06, + "loss": 0.629, + "step": 976 + }, + { + "epoch": 0.4619385342789598, + "grad_norm": 3.07511043548584, + "learning_rate": 4.940375659925714e-06, + "loss": 0.7058, + "step": 977 + }, + { + "epoch": 0.4624113475177305, + "grad_norm": 3.65043044090271, + "learning_rate": 4.940240153625984e-06, + "loss": 0.7174, + "step": 978 + }, + { + "epoch": 0.4628841607565012, + "grad_norm": 2.755167245864868, + "learning_rate": 4.9401044953832e-06, + "loss": 0.6548, + "step": 979 + }, + { + "epoch": 0.46335697399527187, + "grad_norm": 2.9881057739257812, + "learning_rate": 4.939968685205808e-06, + "loss": 0.6245, + "step": 980 + }, + { + "epoch": 0.46382978723404256, + "grad_norm": 2.9484212398529053, + "learning_rate": 4.939832723102266e-06, + "loss": 0.655, + "step": 981 + }, + { + "epoch": 0.46430260047281324, + "grad_norm": 2.898918628692627, + "learning_rate": 4.939696609081038e-06, + "loss": 0.6178, + "step": 982 + }, + { + "epoch": 0.46477541371158393, + "grad_norm": 2.7052435874938965, + "learning_rate": 4.9395603431506e-06, + "loss": 0.6393, + "step": 983 + }, + { + "epoch": 0.4652482269503546, + "grad_norm": 2.5610013008117676, + "learning_rate": 4.939423925319436e-06, + "loss": 0.4847, + "step": 984 + }, + { + "epoch": 0.4657210401891253, + "grad_norm": 3.229083299636841, + "learning_rate": 4.939287355596042e-06, + "loss": 0.6473, + "step": 985 + }, + { + "epoch": 0.466193853427896, + "grad_norm": 2.907097816467285, + "learning_rate": 4.9391506339889195e-06, + "loss": 0.652, + "step": 986 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 2.6929478645324707, + "learning_rate": 4.939013760506582e-06, + "loss": 0.6175, + "step": 987 + }, + { + "epoch": 0.46713947990543736, + "grad_norm": 3.414813280105591, + "learning_rate": 4.938876735157554e-06, + "loss": 0.7597, + "step": 988 + }, + { + "epoch": 0.46761229314420805, + "grad_norm": 3.297360420227051, + "learning_rate": 4.938739557950365e-06, + "loss": 0.6824, + "step": 989 + }, + { + "epoch": 0.46808510638297873, + "grad_norm": 3.083155393600464, + "learning_rate": 4.938602228893557e-06, + "loss": 0.6505, + "step": 990 + }, + { + "epoch": 0.4685579196217494, + "grad_norm": 2.9781153202056885, + "learning_rate": 4.938464747995681e-06, + "loss": 0.666, + "step": 991 + }, + { + "epoch": 0.4690307328605201, + "grad_norm": 3.1494534015655518, + "learning_rate": 4.9383271152652975e-06, + "loss": 0.6422, + "step": 992 + }, + { + "epoch": 0.4695035460992908, + "grad_norm": 2.547868490219116, + "learning_rate": 4.938189330710976e-06, + "loss": 0.5766, + "step": 993 + }, + { + "epoch": 0.4699763593380615, + "grad_norm": 2.684736967086792, + "learning_rate": 4.938051394341297e-06, + "loss": 0.6407, + "step": 994 + }, + { + "epoch": 0.47044917257683216, + "grad_norm": 2.9619693756103516, + "learning_rate": 4.937913306164847e-06, + "loss": 0.6936, + "step": 995 + }, + { + "epoch": 0.47092198581560285, + "grad_norm": 2.9698498249053955, + "learning_rate": 4.937775066190227e-06, + "loss": 0.6464, + "step": 996 + }, + { + "epoch": 0.47139479905437354, + "grad_norm": 3.121049642562866, + "learning_rate": 4.937636674426042e-06, + "loss": 0.6383, + "step": 997 + }, + { + "epoch": 0.4718676122931442, + "grad_norm": 3.113672971725464, + "learning_rate": 4.93749813088091e-06, + "loss": 0.6892, + "step": 998 + }, + { + "epoch": 0.4723404255319149, + "grad_norm": 3.126113176345825, + "learning_rate": 4.937359435563458e-06, + "loss": 0.6728, + "step": 999 + }, + { + "epoch": 0.4728132387706856, + "grad_norm": 3.353966236114502, + "learning_rate": 4.937220588482321e-06, + "loss": 0.6041, + "step": 1000 + }, + { + "epoch": 0.4732860520094563, + "grad_norm": 2.8860628604888916, + "learning_rate": 4.937081589646144e-06, + "loss": 0.6798, + "step": 1001 + }, + { + "epoch": 0.47375886524822697, + "grad_norm": 3.0510590076446533, + "learning_rate": 4.936942439063584e-06, + "loss": 0.5841, + "step": 1002 + }, + { + "epoch": 0.47423167848699765, + "grad_norm": 2.6998369693756104, + "learning_rate": 4.936803136743303e-06, + "loss": 0.6403, + "step": 1003 + }, + { + "epoch": 0.47470449172576834, + "grad_norm": 2.875347137451172, + "learning_rate": 4.9366636826939765e-06, + "loss": 0.5811, + "step": 1004 + }, + { + "epoch": 0.475177304964539, + "grad_norm": 2.9122262001037598, + "learning_rate": 4.936524076924287e-06, + "loss": 0.6852, + "step": 1005 + }, + { + "epoch": 0.4756501182033097, + "grad_norm": 2.5167057514190674, + "learning_rate": 4.9363843194429265e-06, + "loss": 0.5367, + "step": 1006 + }, + { + "epoch": 0.4761229314420804, + "grad_norm": 2.5745551586151123, + "learning_rate": 4.9362444102585985e-06, + "loss": 0.6241, + "step": 1007 + }, + { + "epoch": 0.4765957446808511, + "grad_norm": 2.5024216175079346, + "learning_rate": 4.9361043493800125e-06, + "loss": 0.6133, + "step": 1008 + }, + { + "epoch": 0.47706855791962177, + "grad_norm": 2.7281384468078613, + "learning_rate": 4.935964136815892e-06, + "loss": 0.6834, + "step": 1009 + }, + { + "epoch": 0.47754137115839246, + "grad_norm": 3.0118913650512695, + "learning_rate": 4.935823772574965e-06, + "loss": 0.6922, + "step": 1010 + }, + { + "epoch": 0.47801418439716314, + "grad_norm": 3.016216993331909, + "learning_rate": 4.935683256665973e-06, + "loss": 0.6653, + "step": 1011 + }, + { + "epoch": 0.47848699763593383, + "grad_norm": 2.9526784420013428, + "learning_rate": 4.9355425890976636e-06, + "loss": 0.6423, + "step": 1012 + }, + { + "epoch": 0.4789598108747045, + "grad_norm": 6.222797393798828, + "learning_rate": 4.9354017698787985e-06, + "loss": 0.5884, + "step": 1013 + }, + { + "epoch": 0.4794326241134752, + "grad_norm": 2.6553597450256348, + "learning_rate": 4.935260799018143e-06, + "loss": 0.6624, + "step": 1014 + }, + { + "epoch": 0.4799054373522459, + "grad_norm": 3.0942065715789795, + "learning_rate": 4.935119676524475e-06, + "loss": 0.6623, + "step": 1015 + }, + { + "epoch": 0.4803782505910166, + "grad_norm": 2.626359224319458, + "learning_rate": 4.934978402406585e-06, + "loss": 0.6195, + "step": 1016 + }, + { + "epoch": 0.4808510638297872, + "grad_norm": 2.7954699993133545, + "learning_rate": 4.934836976673265e-06, + "loss": 0.5545, + "step": 1017 + }, + { + "epoch": 0.4813238770685579, + "grad_norm": 2.913557291030884, + "learning_rate": 4.934695399333324e-06, + "loss": 0.6288, + "step": 1018 + }, + { + "epoch": 0.4817966903073286, + "grad_norm": 3.1043739318847656, + "learning_rate": 4.9345536703955746e-06, + "loss": 0.6771, + "step": 1019 + }, + { + "epoch": 0.48226950354609927, + "grad_norm": 2.789357900619507, + "learning_rate": 4.934411789868845e-06, + "loss": 0.6227, + "step": 1020 + }, + { + "epoch": 0.48274231678486995, + "grad_norm": 2.480609655380249, + "learning_rate": 4.934269757761967e-06, + "loss": 0.5779, + "step": 1021 + }, + { + "epoch": 0.48321513002364064, + "grad_norm": 2.7946252822875977, + "learning_rate": 4.934127574083785e-06, + "loss": 0.6166, + "step": 1022 + }, + { + "epoch": 0.4836879432624113, + "grad_norm": 3.0670509338378906, + "learning_rate": 4.933985238843153e-06, + "loss": 0.7766, + "step": 1023 + }, + { + "epoch": 0.484160756501182, + "grad_norm": 2.8567559719085693, + "learning_rate": 4.933842752048932e-06, + "loss": 0.5088, + "step": 1024 + }, + { + "epoch": 0.4846335697399527, + "grad_norm": 2.5674657821655273, + "learning_rate": 4.933700113709996e-06, + "loss": 0.6036, + "step": 1025 + }, + { + "epoch": 0.4851063829787234, + "grad_norm": 2.782339096069336, + "learning_rate": 4.933557323835224e-06, + "loss": 0.5335, + "step": 1026 + }, + { + "epoch": 0.48557919621749407, + "grad_norm": 2.6334071159362793, + "learning_rate": 4.93341438243351e-06, + "loss": 0.6327, + "step": 1027 + }, + { + "epoch": 0.48605200945626476, + "grad_norm": 3.0853965282440186, + "learning_rate": 4.933271289513751e-06, + "loss": 0.7102, + "step": 1028 + }, + { + "epoch": 0.48652482269503544, + "grad_norm": 2.619997501373291, + "learning_rate": 4.933128045084859e-06, + "loss": 0.6138, + "step": 1029 + }, + { + "epoch": 0.48699763593380613, + "grad_norm": 2.8316116333007812, + "learning_rate": 4.932984649155753e-06, + "loss": 0.6346, + "step": 1030 + }, + { + "epoch": 0.4874704491725768, + "grad_norm": 3.153486490249634, + "learning_rate": 4.932841101735361e-06, + "loss": 0.7626, + "step": 1031 + }, + { + "epoch": 0.4879432624113475, + "grad_norm": 3.1831274032592773, + "learning_rate": 4.9326974028326214e-06, + "loss": 0.6607, + "step": 1032 + }, + { + "epoch": 0.4884160756501182, + "grad_norm": 2.791078567504883, + "learning_rate": 4.932553552456481e-06, + "loss": 0.6141, + "step": 1033 + }, + { + "epoch": 0.4888888888888889, + "grad_norm": 2.627263307571411, + "learning_rate": 4.932409550615898e-06, + "loss": 0.6777, + "step": 1034 + }, + { + "epoch": 0.48936170212765956, + "grad_norm": 2.8550007343292236, + "learning_rate": 4.932265397319838e-06, + "loss": 0.6379, + "step": 1035 + }, + { + "epoch": 0.48983451536643025, + "grad_norm": 4.505824089050293, + "learning_rate": 4.932121092577276e-06, + "loss": 0.5892, + "step": 1036 + }, + { + "epoch": 0.49030732860520093, + "grad_norm": 3.100191116333008, + "learning_rate": 4.931976636397199e-06, + "loss": 0.6443, + "step": 1037 + }, + { + "epoch": 0.4907801418439716, + "grad_norm": 2.921494245529175, + "learning_rate": 4.9318320287886e-06, + "loss": 0.6821, + "step": 1038 + }, + { + "epoch": 0.4912529550827423, + "grad_norm": 4.577807903289795, + "learning_rate": 4.931687269760485e-06, + "loss": 0.5946, + "step": 1039 + }, + { + "epoch": 0.491725768321513, + "grad_norm": 2.7347636222839355, + "learning_rate": 4.931542359321865e-06, + "loss": 0.5689, + "step": 1040 + }, + { + "epoch": 0.4921985815602837, + "grad_norm": 2.5289158821105957, + "learning_rate": 4.931397297481765e-06, + "loss": 0.5632, + "step": 1041 + }, + { + "epoch": 0.49267139479905436, + "grad_norm": 3.3518471717834473, + "learning_rate": 4.9312520842492165e-06, + "loss": 0.6349, + "step": 1042 + }, + { + "epoch": 0.49314420803782505, + "grad_norm": 3.0469748973846436, + "learning_rate": 4.931106719633261e-06, + "loss": 0.5734, + "step": 1043 + }, + { + "epoch": 0.49361702127659574, + "grad_norm": 3.104682445526123, + "learning_rate": 4.930961203642951e-06, + "loss": 0.6101, + "step": 1044 + }, + { + "epoch": 0.4940898345153664, + "grad_norm": 2.776705503463745, + "learning_rate": 4.930815536287346e-06, + "loss": 0.6397, + "step": 1045 + }, + { + "epoch": 0.4945626477541371, + "grad_norm": 2.760380983352661, + "learning_rate": 4.930669717575516e-06, + "loss": 0.668, + "step": 1046 + }, + { + "epoch": 0.4950354609929078, + "grad_norm": 2.70084547996521, + "learning_rate": 4.930523747516541e-06, + "loss": 0.5729, + "step": 1047 + }, + { + "epoch": 0.4955082742316785, + "grad_norm": 2.7319583892822266, + "learning_rate": 4.930377626119511e-06, + "loss": 0.6258, + "step": 1048 + }, + { + "epoch": 0.49598108747044917, + "grad_norm": 3.2515223026275635, + "learning_rate": 4.930231353393521e-06, + "loss": 0.7412, + "step": 1049 + }, + { + "epoch": 0.49645390070921985, + "grad_norm": 3.0646486282348633, + "learning_rate": 4.930084929347682e-06, + "loss": 0.5809, + "step": 1050 + }, + { + "epoch": 0.49692671394799054, + "grad_norm": 3.1621921062469482, + "learning_rate": 4.9299383539911096e-06, + "loss": 0.6282, + "step": 1051 + }, + { + "epoch": 0.4973995271867612, + "grad_norm": 2.864713191986084, + "learning_rate": 4.929791627332931e-06, + "loss": 0.6263, + "step": 1052 + }, + { + "epoch": 0.4978723404255319, + "grad_norm": 3.181016683578491, + "learning_rate": 4.929644749382283e-06, + "loss": 0.5697, + "step": 1053 + }, + { + "epoch": 0.4983451536643026, + "grad_norm": 2.9064836502075195, + "learning_rate": 4.929497720148309e-06, + "loss": 0.6161, + "step": 1054 + }, + { + "epoch": 0.4988179669030733, + "grad_norm": 3.058112859725952, + "learning_rate": 4.9293505396401655e-06, + "loss": 0.6477, + "step": 1055 + }, + { + "epoch": 0.49929078014184397, + "grad_norm": 2.5227596759796143, + "learning_rate": 4.929203207867016e-06, + "loss": 0.5819, + "step": 1056 + }, + { + "epoch": 0.49976359338061466, + "grad_norm": 3.386862277984619, + "learning_rate": 4.929055724838035e-06, + "loss": 0.7342, + "step": 1057 + }, + { + "epoch": 0.5002364066193853, + "grad_norm": 3.368346929550171, + "learning_rate": 4.928908090562404e-06, + "loss": 0.6622, + "step": 1058 + }, + { + "epoch": 0.500709219858156, + "grad_norm": 2.9108314514160156, + "learning_rate": 4.928760305049317e-06, + "loss": 0.6598, + "step": 1059 + }, + { + "epoch": 0.5011820330969267, + "grad_norm": 2.822305917739868, + "learning_rate": 4.928612368307977e-06, + "loss": 0.5841, + "step": 1060 + }, + { + "epoch": 0.5016548463356973, + "grad_norm": 2.689131259918213, + "learning_rate": 4.928464280347592e-06, + "loss": 0.6631, + "step": 1061 + }, + { + "epoch": 0.502127659574468, + "grad_norm": 3.337214946746826, + "learning_rate": 4.9283160411773864e-06, + "loss": 0.6105, + "step": 1062 + }, + { + "epoch": 0.5026004728132387, + "grad_norm": 3.035911798477173, + "learning_rate": 4.928167650806588e-06, + "loss": 0.6981, + "step": 1063 + }, + { + "epoch": 0.5030732860520094, + "grad_norm": 2.8820855617523193, + "learning_rate": 4.9280191092444375e-06, + "loss": 0.6408, + "step": 1064 + }, + { + "epoch": 0.5035460992907801, + "grad_norm": 3.080432415008545, + "learning_rate": 4.927870416500183e-06, + "loss": 0.6398, + "step": 1065 + }, + { + "epoch": 0.5040189125295508, + "grad_norm": 2.761612892150879, + "learning_rate": 4.927721572583084e-06, + "loss": 0.6126, + "step": 1066 + }, + { + "epoch": 0.5044917257683215, + "grad_norm": 2.8561882972717285, + "learning_rate": 4.927572577502408e-06, + "loss": 0.584, + "step": 1067 + }, + { + "epoch": 0.5049645390070922, + "grad_norm": 3.3386311531066895, + "learning_rate": 4.927423431267432e-06, + "loss": 0.6666, + "step": 1068 + }, + { + "epoch": 0.5054373522458628, + "grad_norm": 2.632906675338745, + "learning_rate": 4.927274133887443e-06, + "loss": 0.632, + "step": 1069 + }, + { + "epoch": 0.5059101654846335, + "grad_norm": 2.8737308979034424, + "learning_rate": 4.927124685371737e-06, + "loss": 0.6051, + "step": 1070 + }, + { + "epoch": 0.5063829787234042, + "grad_norm": 3.042222738265991, + "learning_rate": 4.926975085729619e-06, + "loss": 0.6954, + "step": 1071 + }, + { + "epoch": 0.5068557919621749, + "grad_norm": 3.3341481685638428, + "learning_rate": 4.926825334970404e-06, + "loss": 0.7148, + "step": 1072 + }, + { + "epoch": 0.5073286052009456, + "grad_norm": 2.7415387630462646, + "learning_rate": 4.926675433103418e-06, + "loss": 0.5456, + "step": 1073 + }, + { + "epoch": 0.5078014184397163, + "grad_norm": 2.7545325756073, + "learning_rate": 4.926525380137993e-06, + "loss": 0.6213, + "step": 1074 + }, + { + "epoch": 0.508274231678487, + "grad_norm": 2.9153690338134766, + "learning_rate": 4.926375176083472e-06, + "loss": 0.6466, + "step": 1075 + }, + { + "epoch": 0.5087470449172576, + "grad_norm": 4.210638523101807, + "learning_rate": 4.926224820949209e-06, + "loss": 0.6192, + "step": 1076 + }, + { + "epoch": 0.5092198581560283, + "grad_norm": 2.4357898235321045, + "learning_rate": 4.926074314744565e-06, + "loss": 0.594, + "step": 1077 + }, + { + "epoch": 0.509692671394799, + "grad_norm": 2.8004701137542725, + "learning_rate": 4.92592365747891e-06, + "loss": 0.6276, + "step": 1078 + }, + { + "epoch": 0.5101654846335697, + "grad_norm": 2.920675039291382, + "learning_rate": 4.925772849161628e-06, + "loss": 0.6043, + "step": 1079 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 2.791555404663086, + "learning_rate": 4.9256218898021055e-06, + "loss": 0.6837, + "step": 1080 + }, + { + "epoch": 0.5111111111111111, + "grad_norm": 3.1702463626861572, + "learning_rate": 4.925470779409746e-06, + "loss": 0.668, + "step": 1081 + }, + { + "epoch": 0.5115839243498818, + "grad_norm": 2.7149479389190674, + "learning_rate": 4.925319517993955e-06, + "loss": 0.5842, + "step": 1082 + }, + { + "epoch": 0.5120567375886524, + "grad_norm": 2.916311025619507, + "learning_rate": 4.925168105564153e-06, + "loss": 0.6893, + "step": 1083 + }, + { + "epoch": 0.5125295508274231, + "grad_norm": 2.917654514312744, + "learning_rate": 4.925016542129767e-06, + "loss": 0.6513, + "step": 1084 + }, + { + "epoch": 0.5130023640661938, + "grad_norm": 2.5568928718566895, + "learning_rate": 4.924864827700234e-06, + "loss": 0.6177, + "step": 1085 + }, + { + "epoch": 0.5134751773049645, + "grad_norm": 2.816720485687256, + "learning_rate": 4.924712962285001e-06, + "loss": 0.5833, + "step": 1086 + }, + { + "epoch": 0.5139479905437352, + "grad_norm": 2.6989188194274902, + "learning_rate": 4.9245609458935235e-06, + "loss": 0.6332, + "step": 1087 + }, + { + "epoch": 0.5144208037825059, + "grad_norm": 2.959599494934082, + "learning_rate": 4.924408778535268e-06, + "loss": 0.626, + "step": 1088 + }, + { + "epoch": 0.5148936170212766, + "grad_norm": 2.872814416885376, + "learning_rate": 4.924256460219708e-06, + "loss": 0.6407, + "step": 1089 + }, + { + "epoch": 0.5153664302600472, + "grad_norm": 2.6989097595214844, + "learning_rate": 4.924103990956329e-06, + "loss": 0.6391, + "step": 1090 + }, + { + "epoch": 0.5158392434988179, + "grad_norm": 2.986492156982422, + "learning_rate": 4.9239513707546235e-06, + "loss": 0.6911, + "step": 1091 + }, + { + "epoch": 0.5163120567375886, + "grad_norm": 3.069920301437378, + "learning_rate": 4.9237985996240954e-06, + "loss": 0.671, + "step": 1092 + }, + { + "epoch": 0.5167848699763593, + "grad_norm": 2.8214917182922363, + "learning_rate": 4.9236456775742555e-06, + "loss": 0.5885, + "step": 1093 + }, + { + "epoch": 0.51725768321513, + "grad_norm": 2.9416961669921875, + "learning_rate": 4.923492604614627e-06, + "loss": 0.6293, + "step": 1094 + }, + { + "epoch": 0.5177304964539007, + "grad_norm": 2.761780023574829, + "learning_rate": 4.923339380754741e-06, + "loss": 0.649, + "step": 1095 + }, + { + "epoch": 0.5182033096926714, + "grad_norm": 2.7648792266845703, + "learning_rate": 4.923186006004138e-06, + "loss": 0.5906, + "step": 1096 + }, + { + "epoch": 0.518676122931442, + "grad_norm": 3.5535428524017334, + "learning_rate": 4.923032480372367e-06, + "loss": 0.7138, + "step": 1097 + }, + { + "epoch": 0.5191489361702127, + "grad_norm": 2.6252479553222656, + "learning_rate": 4.922878803868988e-06, + "loss": 0.5499, + "step": 1098 + }, + { + "epoch": 0.5196217494089834, + "grad_norm": 2.901002883911133, + "learning_rate": 4.9227249765035715e-06, + "loss": 0.6991, + "step": 1099 + }, + { + "epoch": 0.5200945626477541, + "grad_norm": 2.621877431869507, + "learning_rate": 4.9225709982856925e-06, + "loss": 0.6269, + "step": 1100 + }, + { + "epoch": 0.5205673758865248, + "grad_norm": 2.872483015060425, + "learning_rate": 4.92241686922494e-06, + "loss": 0.6657, + "step": 1101 + }, + { + "epoch": 0.5210401891252955, + "grad_norm": 2.730447769165039, + "learning_rate": 4.922262589330912e-06, + "loss": 0.6061, + "step": 1102 + }, + { + "epoch": 0.5215130023640662, + "grad_norm": 2.646247386932373, + "learning_rate": 4.922108158613213e-06, + "loss": 0.5923, + "step": 1103 + }, + { + "epoch": 0.5219858156028369, + "grad_norm": 2.6488895416259766, + "learning_rate": 4.92195357708146e-06, + "loss": 0.6293, + "step": 1104 + }, + { + "epoch": 0.5224586288416075, + "grad_norm": 2.756338357925415, + "learning_rate": 4.921798844745278e-06, + "loss": 0.6374, + "step": 1105 + }, + { + "epoch": 0.5229314420803782, + "grad_norm": 3.1441280841827393, + "learning_rate": 4.921643961614301e-06, + "loss": 0.6652, + "step": 1106 + }, + { + "epoch": 0.5234042553191489, + "grad_norm": 3.050002098083496, + "learning_rate": 4.921488927698172e-06, + "loss": 0.6809, + "step": 1107 + }, + { + "epoch": 0.5238770685579196, + "grad_norm": 2.71750807762146, + "learning_rate": 4.921333743006547e-06, + "loss": 0.6266, + "step": 1108 + }, + { + "epoch": 0.5243498817966903, + "grad_norm": 2.8439245223999023, + "learning_rate": 4.921178407549086e-06, + "loss": 0.5663, + "step": 1109 + }, + { + "epoch": 0.524822695035461, + "grad_norm": 3.0722241401672363, + "learning_rate": 4.921022921335464e-06, + "loss": 0.6791, + "step": 1110 + }, + { + "epoch": 0.5252955082742317, + "grad_norm": 3.4381656646728516, + "learning_rate": 4.920867284375358e-06, + "loss": 0.6687, + "step": 1111 + }, + { + "epoch": 0.5257683215130023, + "grad_norm": 2.819812774658203, + "learning_rate": 4.920711496678463e-06, + "loss": 0.6299, + "step": 1112 + }, + { + "epoch": 0.526241134751773, + "grad_norm": 3.6587414741516113, + "learning_rate": 4.9205555582544765e-06, + "loss": 0.7392, + "step": 1113 + }, + { + "epoch": 0.5267139479905437, + "grad_norm": 2.774296522140503, + "learning_rate": 4.920399469113109e-06, + "loss": 0.6652, + "step": 1114 + }, + { + "epoch": 0.5271867612293144, + "grad_norm": 2.7480580806732178, + "learning_rate": 4.920243229264081e-06, + "loss": 0.596, + "step": 1115 + }, + { + "epoch": 0.5276595744680851, + "grad_norm": 3.213057518005371, + "learning_rate": 4.920086838717119e-06, + "loss": 0.6986, + "step": 1116 + }, + { + "epoch": 0.5281323877068558, + "grad_norm": 2.940546989440918, + "learning_rate": 4.919930297481962e-06, + "loss": 0.6481, + "step": 1117 + }, + { + "epoch": 0.5286052009456265, + "grad_norm": 2.5970494747161865, + "learning_rate": 4.9197736055683555e-06, + "loss": 0.5658, + "step": 1118 + }, + { + "epoch": 0.5290780141843971, + "grad_norm": 4.49385404586792, + "learning_rate": 4.919616762986057e-06, + "loss": 0.605, + "step": 1119 + }, + { + "epoch": 0.5295508274231678, + "grad_norm": 2.971857786178589, + "learning_rate": 4.919459769744833e-06, + "loss": 0.6539, + "step": 1120 + }, + { + "epoch": 0.5300236406619385, + "grad_norm": 2.6192965507507324, + "learning_rate": 4.919302625854457e-06, + "loss": 0.6226, + "step": 1121 + }, + { + "epoch": 0.5304964539007092, + "grad_norm": 2.665088176727295, + "learning_rate": 4.919145331324716e-06, + "loss": 0.6647, + "step": 1122 + }, + { + "epoch": 0.5309692671394799, + "grad_norm": 2.612126111984253, + "learning_rate": 4.918987886165403e-06, + "loss": 0.6965, + "step": 1123 + }, + { + "epoch": 0.5314420803782506, + "grad_norm": 3.80017352104187, + "learning_rate": 4.9188302903863205e-06, + "loss": 0.7396, + "step": 1124 + }, + { + "epoch": 0.5319148936170213, + "grad_norm": 2.781752824783325, + "learning_rate": 4.918672543997282e-06, + "loss": 0.5985, + "step": 1125 + }, + { + "epoch": 0.532387706855792, + "grad_norm": 2.6067914962768555, + "learning_rate": 4.91851464700811e-06, + "loss": 0.6159, + "step": 1126 + }, + { + "epoch": 0.5328605200945626, + "grad_norm": 2.670807123184204, + "learning_rate": 4.918356599428636e-06, + "loss": 0.5958, + "step": 1127 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 2.608611822128296, + "learning_rate": 4.9181984012687e-06, + "loss": 0.5768, + "step": 1128 + }, + { + "epoch": 0.533806146572104, + "grad_norm": 2.586764097213745, + "learning_rate": 4.918040052538154e-06, + "loss": 0.661, + "step": 1129 + }, + { + "epoch": 0.5342789598108747, + "grad_norm": 3.1317451000213623, + "learning_rate": 4.917881553246856e-06, + "loss": 0.6626, + "step": 1130 + }, + { + "epoch": 0.5347517730496454, + "grad_norm": 2.7135281562805176, + "learning_rate": 4.917722903404676e-06, + "loss": 0.6572, + "step": 1131 + }, + { + "epoch": 0.5352245862884161, + "grad_norm": 3.4546358585357666, + "learning_rate": 4.917564103021493e-06, + "loss": 0.5597, + "step": 1132 + }, + { + "epoch": 0.5356973995271868, + "grad_norm": 3.0943493843078613, + "learning_rate": 4.917405152107193e-06, + "loss": 0.7258, + "step": 1133 + }, + { + "epoch": 0.5361702127659574, + "grad_norm": 2.6069352626800537, + "learning_rate": 4.917246050671674e-06, + "loss": 0.6209, + "step": 1134 + }, + { + "epoch": 0.5366430260047281, + "grad_norm": 2.584883689880371, + "learning_rate": 4.917086798724844e-06, + "loss": 0.658, + "step": 1135 + }, + { + "epoch": 0.5371158392434988, + "grad_norm": 3.001976490020752, + "learning_rate": 4.9169273962766166e-06, + "loss": 0.6306, + "step": 1136 + }, + { + "epoch": 0.5375886524822695, + "grad_norm": 2.5013928413391113, + "learning_rate": 4.916767843336918e-06, + "loss": 0.572, + "step": 1137 + }, + { + "epoch": 0.5380614657210402, + "grad_norm": 2.9114553928375244, + "learning_rate": 4.916608139915684e-06, + "loss": 0.5841, + "step": 1138 + }, + { + "epoch": 0.5385342789598109, + "grad_norm": 2.8878467082977295, + "learning_rate": 4.9164482860228564e-06, + "loss": 0.6654, + "step": 1139 + }, + { + "epoch": 0.5390070921985816, + "grad_norm": 2.9827866554260254, + "learning_rate": 4.91628828166839e-06, + "loss": 0.6674, + "step": 1140 + }, + { + "epoch": 0.5394799054373522, + "grad_norm": 3.8696281909942627, + "learning_rate": 4.916128126862248e-06, + "loss": 0.6241, + "step": 1141 + }, + { + "epoch": 0.5399527186761229, + "grad_norm": 2.9556291103363037, + "learning_rate": 4.915967821614402e-06, + "loss": 0.6478, + "step": 1142 + }, + { + "epoch": 0.5404255319148936, + "grad_norm": 2.392942428588867, + "learning_rate": 4.915807365934834e-06, + "loss": 0.6097, + "step": 1143 + }, + { + "epoch": 0.5408983451536643, + "grad_norm": 3.032235860824585, + "learning_rate": 4.915646759833534e-06, + "loss": 0.7193, + "step": 1144 + }, + { + "epoch": 0.541371158392435, + "grad_norm": 2.840416193008423, + "learning_rate": 4.915486003320501e-06, + "loss": 0.5506, + "step": 1145 + }, + { + "epoch": 0.5418439716312057, + "grad_norm": 2.5438895225524902, + "learning_rate": 4.915325096405747e-06, + "loss": 0.6487, + "step": 1146 + }, + { + "epoch": 0.5423167848699764, + "grad_norm": 2.544334650039673, + "learning_rate": 4.9151640390992905e-06, + "loss": 0.6168, + "step": 1147 + }, + { + "epoch": 0.542789598108747, + "grad_norm": 2.8535678386688232, + "learning_rate": 4.91500283141116e-06, + "loss": 0.678, + "step": 1148 + }, + { + "epoch": 0.5432624113475177, + "grad_norm": 2.8086955547332764, + "learning_rate": 4.9148414733513915e-06, + "loss": 0.6473, + "step": 1149 + }, + { + "epoch": 0.5437352245862884, + "grad_norm": 2.4709885120391846, + "learning_rate": 4.914679964930034e-06, + "loss": 0.6797, + "step": 1150 + }, + { + "epoch": 0.5442080378250591, + "grad_norm": 2.8546934127807617, + "learning_rate": 4.9145183061571435e-06, + "loss": 0.6247, + "step": 1151 + }, + { + "epoch": 0.5446808510638298, + "grad_norm": 2.991184711456299, + "learning_rate": 4.9143564970427844e-06, + "loss": 0.5977, + "step": 1152 + }, + { + "epoch": 0.5451536643026005, + "grad_norm": 3.011216402053833, + "learning_rate": 4.914194537597033e-06, + "loss": 0.7005, + "step": 1153 + }, + { + "epoch": 0.5456264775413712, + "grad_norm": 2.807521343231201, + "learning_rate": 4.9140324278299744e-06, + "loss": 0.5412, + "step": 1154 + }, + { + "epoch": 0.5460992907801419, + "grad_norm": 3.0401229858398438, + "learning_rate": 4.913870167751701e-06, + "loss": 0.6394, + "step": 1155 + }, + { + "epoch": 0.5465721040189125, + "grad_norm": 2.853914976119995, + "learning_rate": 4.913707757372317e-06, + "loss": 0.6745, + "step": 1156 + }, + { + "epoch": 0.5470449172576832, + "grad_norm": 4.505620956420898, + "learning_rate": 4.913545196701935e-06, + "loss": 0.6668, + "step": 1157 + }, + { + "epoch": 0.5475177304964539, + "grad_norm": 3.0505781173706055, + "learning_rate": 4.913382485750676e-06, + "loss": 0.6926, + "step": 1158 + }, + { + "epoch": 0.5479905437352246, + "grad_norm": 2.798435688018799, + "learning_rate": 4.913219624528672e-06, + "loss": 0.605, + "step": 1159 + }, + { + "epoch": 0.5484633569739953, + "grad_norm": 2.7814908027648926, + "learning_rate": 4.913056613046065e-06, + "loss": 0.6678, + "step": 1160 + }, + { + "epoch": 0.548936170212766, + "grad_norm": 3.2089321613311768, + "learning_rate": 4.9128934513130025e-06, + "loss": 0.5995, + "step": 1161 + }, + { + "epoch": 0.5494089834515367, + "grad_norm": 2.7699952125549316, + "learning_rate": 4.9127301393396455e-06, + "loss": 0.7062, + "step": 1162 + }, + { + "epoch": 0.5498817966903073, + "grad_norm": 2.859368324279785, + "learning_rate": 4.912566677136162e-06, + "loss": 0.6063, + "step": 1163 + }, + { + "epoch": 0.550354609929078, + "grad_norm": 2.727334499359131, + "learning_rate": 4.91240306471273e-06, + "loss": 0.6848, + "step": 1164 + }, + { + "epoch": 0.5508274231678487, + "grad_norm": 2.6017510890960693, + "learning_rate": 4.912239302079537e-06, + "loss": 0.5808, + "step": 1165 + }, + { + "epoch": 0.5513002364066194, + "grad_norm": 3.539583206176758, + "learning_rate": 4.912075389246781e-06, + "loss": 0.7053, + "step": 1166 + }, + { + "epoch": 0.5517730496453901, + "grad_norm": 2.918280601501465, + "learning_rate": 4.911911326224666e-06, + "loss": 0.5904, + "step": 1167 + }, + { + "epoch": 0.5522458628841608, + "grad_norm": 3.0067362785339355, + "learning_rate": 4.9117471130234095e-06, + "loss": 0.6392, + "step": 1168 + }, + { + "epoch": 0.5527186761229315, + "grad_norm": 2.4374797344207764, + "learning_rate": 4.911582749653236e-06, + "loss": 0.5793, + "step": 1169 + }, + { + "epoch": 0.5531914893617021, + "grad_norm": 3.121182918548584, + "learning_rate": 4.911418236124378e-06, + "loss": 0.6636, + "step": 1170 + }, + { + "epoch": 0.5536643026004728, + "grad_norm": 3.1289851665496826, + "learning_rate": 4.91125357244708e-06, + "loss": 0.656, + "step": 1171 + }, + { + "epoch": 0.5541371158392435, + "grad_norm": 2.7034592628479004, + "learning_rate": 4.911088758631596e-06, + "loss": 0.6001, + "step": 1172 + }, + { + "epoch": 0.5546099290780142, + "grad_norm": 2.710146188735962, + "learning_rate": 4.910923794688187e-06, + "loss": 0.6007, + "step": 1173 + }, + { + "epoch": 0.5550827423167849, + "grad_norm": 2.5424487590789795, + "learning_rate": 4.910758680627124e-06, + "loss": 0.5193, + "step": 1174 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 2.615893602371216, + "learning_rate": 4.91059341645869e-06, + "loss": 0.5525, + "step": 1175 + }, + { + "epoch": 0.5560283687943263, + "grad_norm": 3.3179728984832764, + "learning_rate": 4.910428002193174e-06, + "loss": 0.7285, + "step": 1176 + }, + { + "epoch": 0.556501182033097, + "grad_norm": 2.7234175205230713, + "learning_rate": 4.910262437840875e-06, + "loss": 0.574, + "step": 1177 + }, + { + "epoch": 0.5569739952718676, + "grad_norm": 3.0416605472564697, + "learning_rate": 4.9100967234121034e-06, + "loss": 0.5623, + "step": 1178 + }, + { + "epoch": 0.5574468085106383, + "grad_norm": 3.067786455154419, + "learning_rate": 4.909930858917177e-06, + "loss": 0.6491, + "step": 1179 + }, + { + "epoch": 0.557919621749409, + "grad_norm": 3.0037379264831543, + "learning_rate": 4.909764844366422e-06, + "loss": 0.5696, + "step": 1180 + }, + { + "epoch": 0.5583924349881797, + "grad_norm": 2.966179609298706, + "learning_rate": 4.909598679770178e-06, + "loss": 0.6042, + "step": 1181 + }, + { + "epoch": 0.5588652482269504, + "grad_norm": 2.6000657081604004, + "learning_rate": 4.909432365138789e-06, + "loss": 0.5883, + "step": 1182 + }, + { + "epoch": 0.5593380614657211, + "grad_norm": 2.6794495582580566, + "learning_rate": 4.909265900482612e-06, + "loss": 0.6809, + "step": 1183 + }, + { + "epoch": 0.5598108747044918, + "grad_norm": 2.6765122413635254, + "learning_rate": 4.9090992858120115e-06, + "loss": 0.6601, + "step": 1184 + }, + { + "epoch": 0.5602836879432624, + "grad_norm": 2.6051928997039795, + "learning_rate": 4.908932521137363e-06, + "loss": 0.5946, + "step": 1185 + }, + { + "epoch": 0.5607565011820331, + "grad_norm": 3.0405542850494385, + "learning_rate": 4.908765606469048e-06, + "loss": 0.6998, + "step": 1186 + }, + { + "epoch": 0.5612293144208038, + "grad_norm": 2.7975668907165527, + "learning_rate": 4.908598541817462e-06, + "loss": 0.6218, + "step": 1187 + }, + { + "epoch": 0.5617021276595745, + "grad_norm": 2.5367627143859863, + "learning_rate": 4.908431327193005e-06, + "loss": 0.6354, + "step": 1188 + }, + { + "epoch": 0.5621749408983452, + "grad_norm": 3.7939631938934326, + "learning_rate": 4.908263962606091e-06, + "loss": 0.6376, + "step": 1189 + }, + { + "epoch": 0.5626477541371159, + "grad_norm": 2.864079475402832, + "learning_rate": 4.908096448067139e-06, + "loss": 0.5485, + "step": 1190 + }, + { + "epoch": 0.5631205673758866, + "grad_norm": 2.7855563163757324, + "learning_rate": 4.9079287835865804e-06, + "loss": 0.6645, + "step": 1191 + }, + { + "epoch": 0.5635933806146572, + "grad_norm": 2.6156625747680664, + "learning_rate": 4.9077609691748556e-06, + "loss": 0.5751, + "step": 1192 + }, + { + "epoch": 0.5640661938534279, + "grad_norm": 3.0475659370422363, + "learning_rate": 4.907593004842412e-06, + "loss": 0.6739, + "step": 1193 + }, + { + "epoch": 0.5645390070921986, + "grad_norm": 2.9176738262176514, + "learning_rate": 4.9074248905997104e-06, + "loss": 0.6493, + "step": 1194 + }, + { + "epoch": 0.5650118203309693, + "grad_norm": 2.6168384552001953, + "learning_rate": 4.907256626457216e-06, + "loss": 0.6154, + "step": 1195 + }, + { + "epoch": 0.56548463356974, + "grad_norm": 2.893980026245117, + "learning_rate": 4.907088212425408e-06, + "loss": 0.5808, + "step": 1196 + }, + { + "epoch": 0.5659574468085107, + "grad_norm": 3.3832836151123047, + "learning_rate": 4.90691964851477e-06, + "loss": 0.7888, + "step": 1197 + }, + { + "epoch": 0.5664302600472814, + "grad_norm": 3.088932752609253, + "learning_rate": 4.906750934735801e-06, + "loss": 0.6516, + "step": 1198 + }, + { + "epoch": 0.566903073286052, + "grad_norm": 2.494471549987793, + "learning_rate": 4.906582071099004e-06, + "loss": 0.6286, + "step": 1199 + }, + { + "epoch": 0.5673758865248227, + "grad_norm": 2.716550588607788, + "learning_rate": 4.906413057614895e-06, + "loss": 0.5939, + "step": 1200 + }, + { + "epoch": 0.5678486997635934, + "grad_norm": 2.5821073055267334, + "learning_rate": 4.906243894293995e-06, + "loss": 0.6668, + "step": 1201 + }, + { + "epoch": 0.5683215130023641, + "grad_norm": 3.651787042617798, + "learning_rate": 4.90607458114684e-06, + "loss": 0.6124, + "step": 1202 + }, + { + "epoch": 0.5687943262411348, + "grad_norm": 2.7567858695983887, + "learning_rate": 4.9059051181839705e-06, + "loss": 0.6656, + "step": 1203 + }, + { + "epoch": 0.5692671394799055, + "grad_norm": 2.8067586421966553, + "learning_rate": 4.90573550541594e-06, + "loss": 0.6306, + "step": 1204 + }, + { + "epoch": 0.5697399527186762, + "grad_norm": 2.6136393547058105, + "learning_rate": 4.905565742853307e-06, + "loss": 0.5992, + "step": 1205 + }, + { + "epoch": 0.5702127659574469, + "grad_norm": 2.899049758911133, + "learning_rate": 4.905395830506644e-06, + "loss": 0.621, + "step": 1206 + }, + { + "epoch": 0.5706855791962175, + "grad_norm": 3.036583185195923, + "learning_rate": 4.9052257683865294e-06, + "loss": 0.652, + "step": 1207 + }, + { + "epoch": 0.5711583924349882, + "grad_norm": 2.7947216033935547, + "learning_rate": 4.905055556503553e-06, + "loss": 0.6636, + "step": 1208 + }, + { + "epoch": 0.5716312056737589, + "grad_norm": 3.1646955013275146, + "learning_rate": 4.9048851948683135e-06, + "loss": 0.6376, + "step": 1209 + }, + { + "epoch": 0.5721040189125296, + "grad_norm": 2.8175766468048096, + "learning_rate": 4.904714683491417e-06, + "loss": 0.5929, + "step": 1210 + }, + { + "epoch": 0.5725768321513003, + "grad_norm": 2.923923969268799, + "learning_rate": 4.904544022383483e-06, + "loss": 0.6633, + "step": 1211 + }, + { + "epoch": 0.573049645390071, + "grad_norm": 2.7471134662628174, + "learning_rate": 4.9043732115551356e-06, + "loss": 0.6551, + "step": 1212 + }, + { + "epoch": 0.5735224586288417, + "grad_norm": 2.8660807609558105, + "learning_rate": 4.90420225101701e-06, + "loss": 0.6423, + "step": 1213 + }, + { + "epoch": 0.5739952718676123, + "grad_norm": 2.769247531890869, + "learning_rate": 4.904031140779754e-06, + "loss": 0.5982, + "step": 1214 + }, + { + "epoch": 0.574468085106383, + "grad_norm": 2.9043145179748535, + "learning_rate": 4.90385988085402e-06, + "loss": 0.5843, + "step": 1215 + }, + { + "epoch": 0.5749408983451537, + "grad_norm": 2.6639609336853027, + "learning_rate": 4.903688471250471e-06, + "loss": 0.5858, + "step": 1216 + }, + { + "epoch": 0.5754137115839244, + "grad_norm": 2.6967573165893555, + "learning_rate": 4.903516911979781e-06, + "loss": 0.5755, + "step": 1217 + }, + { + "epoch": 0.5758865248226951, + "grad_norm": 2.8865857124328613, + "learning_rate": 4.903345203052633e-06, + "loss": 0.6051, + "step": 1218 + }, + { + "epoch": 0.5763593380614658, + "grad_norm": 2.381979465484619, + "learning_rate": 4.903173344479717e-06, + "loss": 0.5727, + "step": 1219 + }, + { + "epoch": 0.5768321513002365, + "grad_norm": 2.7717981338500977, + "learning_rate": 4.903001336271734e-06, + "loss": 0.6406, + "step": 1220 + }, + { + "epoch": 0.577304964539007, + "grad_norm": 2.6431570053100586, + "learning_rate": 4.902829178439395e-06, + "loss": 0.6226, + "step": 1221 + }, + { + "epoch": 0.5777777777777777, + "grad_norm": 2.8090415000915527, + "learning_rate": 4.902656870993419e-06, + "loss": 0.5761, + "step": 1222 + }, + { + "epoch": 0.5782505910165484, + "grad_norm": 2.4769368171691895, + "learning_rate": 4.902484413944535e-06, + "loss": 0.5602, + "step": 1223 + }, + { + "epoch": 0.5787234042553191, + "grad_norm": 2.693316698074341, + "learning_rate": 4.902311807303481e-06, + "loss": 0.5222, + "step": 1224 + }, + { + "epoch": 0.5791962174940898, + "grad_norm": 2.7623913288116455, + "learning_rate": 4.902139051081004e-06, + "loss": 0.6978, + "step": 1225 + }, + { + "epoch": 0.5796690307328605, + "grad_norm": 2.6133766174316406, + "learning_rate": 4.901966145287863e-06, + "loss": 0.5802, + "step": 1226 + }, + { + "epoch": 0.5801418439716312, + "grad_norm": 2.7345972061157227, + "learning_rate": 4.901793089934821e-06, + "loss": 0.6294, + "step": 1227 + }, + { + "epoch": 0.5806146572104018, + "grad_norm": 2.7545835971832275, + "learning_rate": 4.9016198850326555e-06, + "loss": 0.6085, + "step": 1228 + }, + { + "epoch": 0.5810874704491725, + "grad_norm": 2.6947758197784424, + "learning_rate": 4.90144653059215e-06, + "loss": 0.6025, + "step": 1229 + }, + { + "epoch": 0.5815602836879432, + "grad_norm": 2.692967414855957, + "learning_rate": 4.901273026624099e-06, + "loss": 0.5715, + "step": 1230 + }, + { + "epoch": 0.5820330969267139, + "grad_norm": 2.78347110748291, + "learning_rate": 4.901099373139307e-06, + "loss": 0.6063, + "step": 1231 + }, + { + "epoch": 0.5825059101654846, + "grad_norm": 2.346496343612671, + "learning_rate": 4.900925570148585e-06, + "loss": 0.5869, + "step": 1232 + }, + { + "epoch": 0.5829787234042553, + "grad_norm": 2.606639862060547, + "learning_rate": 4.900751617662755e-06, + "loss": 0.6197, + "step": 1233 + }, + { + "epoch": 0.583451536643026, + "grad_norm": 2.5825929641723633, + "learning_rate": 4.900577515692649e-06, + "loss": 0.6721, + "step": 1234 + }, + { + "epoch": 0.5839243498817966, + "grad_norm": 2.731349468231201, + "learning_rate": 4.900403264249107e-06, + "loss": 0.6273, + "step": 1235 + }, + { + "epoch": 0.5843971631205673, + "grad_norm": 3.2133874893188477, + "learning_rate": 4.90022886334298e-06, + "loss": 0.6231, + "step": 1236 + }, + { + "epoch": 0.584869976359338, + "grad_norm": 2.9213852882385254, + "learning_rate": 4.900054312985127e-06, + "loss": 0.6677, + "step": 1237 + }, + { + "epoch": 0.5853427895981087, + "grad_norm": 2.815425157546997, + "learning_rate": 4.899879613186414e-06, + "loss": 0.6405, + "step": 1238 + }, + { + "epoch": 0.5858156028368794, + "grad_norm": 2.730782985687256, + "learning_rate": 4.899704763957721e-06, + "loss": 0.6233, + "step": 1239 + }, + { + "epoch": 0.5862884160756501, + "grad_norm": 2.6432766914367676, + "learning_rate": 4.899529765309936e-06, + "loss": 0.6267, + "step": 1240 + }, + { + "epoch": 0.5867612293144208, + "grad_norm": 2.616215229034424, + "learning_rate": 4.899354617253953e-06, + "loss": 0.6268, + "step": 1241 + }, + { + "epoch": 0.5872340425531914, + "grad_norm": 2.7630255222320557, + "learning_rate": 4.899179319800679e-06, + "loss": 0.6348, + "step": 1242 + }, + { + "epoch": 0.5877068557919621, + "grad_norm": 2.785095453262329, + "learning_rate": 4.899003872961029e-06, + "loss": 0.5839, + "step": 1243 + }, + { + "epoch": 0.5881796690307328, + "grad_norm": 2.9050328731536865, + "learning_rate": 4.898828276745927e-06, + "loss": 0.651, + "step": 1244 + }, + { + "epoch": 0.5886524822695035, + "grad_norm": 2.958092212677002, + "learning_rate": 4.8986525311663065e-06, + "loss": 0.6395, + "step": 1245 + }, + { + "epoch": 0.5891252955082742, + "grad_norm": 2.952310800552368, + "learning_rate": 4.898476636233111e-06, + "loss": 0.6731, + "step": 1246 + }, + { + "epoch": 0.5895981087470449, + "grad_norm": 2.9876346588134766, + "learning_rate": 4.898300591957293e-06, + "loss": 0.7015, + "step": 1247 + }, + { + "epoch": 0.5900709219858156, + "grad_norm": 2.8941752910614014, + "learning_rate": 4.898124398349813e-06, + "loss": 0.6452, + "step": 1248 + }, + { + "epoch": 0.5905437352245863, + "grad_norm": 2.9809536933898926, + "learning_rate": 4.897948055421642e-06, + "loss": 0.5736, + "step": 1249 + }, + { + "epoch": 0.5910165484633569, + "grad_norm": 2.927046775817871, + "learning_rate": 4.897771563183761e-06, + "loss": 0.5918, + "step": 1250 + }, + { + "epoch": 0.5914893617021276, + "grad_norm": 2.865020275115967, + "learning_rate": 4.897594921647158e-06, + "loss": 0.6924, + "step": 1251 + }, + { + "epoch": 0.5919621749408983, + "grad_norm": 2.7406699657440186, + "learning_rate": 4.897418130822832e-06, + "loss": 0.509, + "step": 1252 + }, + { + "epoch": 0.592434988179669, + "grad_norm": 2.781606912612915, + "learning_rate": 4.897241190721791e-06, + "loss": 0.5555, + "step": 1253 + }, + { + "epoch": 0.5929078014184397, + "grad_norm": 2.79209303855896, + "learning_rate": 4.8970641013550535e-06, + "loss": 0.6722, + "step": 1254 + }, + { + "epoch": 0.5933806146572104, + "grad_norm": 3.0672268867492676, + "learning_rate": 4.896886862733645e-06, + "loss": 0.6366, + "step": 1255 + }, + { + "epoch": 0.5938534278959811, + "grad_norm": 2.7456953525543213, + "learning_rate": 4.896709474868602e-06, + "loss": 0.6246, + "step": 1256 + }, + { + "epoch": 0.5943262411347517, + "grad_norm": 3.6731202602386475, + "learning_rate": 4.896531937770968e-06, + "loss": 0.668, + "step": 1257 + }, + { + "epoch": 0.5947990543735224, + "grad_norm": 2.6056087017059326, + "learning_rate": 4.8963542514518e-06, + "loss": 0.5815, + "step": 1258 + }, + { + "epoch": 0.5952718676122931, + "grad_norm": 2.719698905944824, + "learning_rate": 4.89617641592216e-06, + "loss": 0.6058, + "step": 1259 + }, + { + "epoch": 0.5957446808510638, + "grad_norm": 2.625838279724121, + "learning_rate": 4.895998431193121e-06, + "loss": 0.6143, + "step": 1260 + }, + { + "epoch": 0.5962174940898345, + "grad_norm": 2.7166085243225098, + "learning_rate": 4.895820297275767e-06, + "loss": 0.5187, + "step": 1261 + }, + { + "epoch": 0.5966903073286052, + "grad_norm": 2.7544102668762207, + "learning_rate": 4.8956420141811875e-06, + "loss": 0.5928, + "step": 1262 + }, + { + "epoch": 0.5971631205673759, + "grad_norm": 2.6678333282470703, + "learning_rate": 4.895463581920484e-06, + "loss": 0.611, + "step": 1263 + }, + { + "epoch": 0.5976359338061465, + "grad_norm": 2.853384494781494, + "learning_rate": 4.895285000504768e-06, + "loss": 0.642, + "step": 1264 + }, + { + "epoch": 0.5981087470449172, + "grad_norm": 2.637852430343628, + "learning_rate": 4.895106269945158e-06, + "loss": 0.6308, + "step": 1265 + }, + { + "epoch": 0.5985815602836879, + "grad_norm": 2.9880387783050537, + "learning_rate": 4.8949273902527826e-06, + "loss": 0.5781, + "step": 1266 + }, + { + "epoch": 0.5990543735224586, + "grad_norm": 3.5984015464782715, + "learning_rate": 4.89474836143878e-06, + "loss": 0.5865, + "step": 1267 + }, + { + "epoch": 0.5995271867612293, + "grad_norm": 2.719855546951294, + "learning_rate": 4.8945691835142975e-06, + "loss": 0.6393, + "step": 1268 + }, + { + "epoch": 0.6, + "grad_norm": 2.7885141372680664, + "learning_rate": 4.894389856490492e-06, + "loss": 0.66, + "step": 1269 + }, + { + "epoch": 0.6004728132387707, + "grad_norm": 2.698819875717163, + "learning_rate": 4.894210380378529e-06, + "loss": 0.6144, + "step": 1270 + }, + { + "epoch": 0.6009456264775414, + "grad_norm": 2.278045654296875, + "learning_rate": 4.894030755189584e-06, + "loss": 0.5609, + "step": 1271 + }, + { + "epoch": 0.601418439716312, + "grad_norm": 2.8729357719421387, + "learning_rate": 4.893850980934841e-06, + "loss": 0.6715, + "step": 1272 + }, + { + "epoch": 0.6018912529550827, + "grad_norm": 2.8541221618652344, + "learning_rate": 4.893671057625495e-06, + "loss": 0.6787, + "step": 1273 + }, + { + "epoch": 0.6023640661938534, + "grad_norm": 2.4561476707458496, + "learning_rate": 4.893490985272748e-06, + "loss": 0.6331, + "step": 1274 + }, + { + "epoch": 0.6028368794326241, + "grad_norm": 2.565739154815674, + "learning_rate": 4.893310763887812e-06, + "loss": 0.587, + "step": 1275 + }, + { + "epoch": 0.6033096926713948, + "grad_norm": 2.384951591491699, + "learning_rate": 4.8931303934819095e-06, + "loss": 0.5358, + "step": 1276 + }, + { + "epoch": 0.6037825059101655, + "grad_norm": 2.380808115005493, + "learning_rate": 4.89294987406627e-06, + "loss": 0.5402, + "step": 1277 + }, + { + "epoch": 0.6042553191489362, + "grad_norm": 2.764815092086792, + "learning_rate": 4.892769205652136e-06, + "loss": 0.6103, + "step": 1278 + }, + { + "epoch": 0.6047281323877068, + "grad_norm": 2.463350296020508, + "learning_rate": 4.892588388250754e-06, + "loss": 0.5937, + "step": 1279 + }, + { + "epoch": 0.6052009456264775, + "grad_norm": 3.099689245223999, + "learning_rate": 4.8924074218733855e-06, + "loss": 0.6354, + "step": 1280 + }, + { + "epoch": 0.6056737588652482, + "grad_norm": 2.804450035095215, + "learning_rate": 4.892226306531297e-06, + "loss": 0.6595, + "step": 1281 + }, + { + "epoch": 0.6061465721040189, + "grad_norm": 3.1559767723083496, + "learning_rate": 4.892045042235765e-06, + "loss": 0.6664, + "step": 1282 + }, + { + "epoch": 0.6066193853427896, + "grad_norm": 2.844341993331909, + "learning_rate": 4.891863628998079e-06, + "loss": 0.7454, + "step": 1283 + }, + { + "epoch": 0.6070921985815603, + "grad_norm": 2.686602830886841, + "learning_rate": 4.891682066829532e-06, + "loss": 0.6755, + "step": 1284 + }, + { + "epoch": 0.607565011820331, + "grad_norm": 2.736457347869873, + "learning_rate": 4.8915003557414285e-06, + "loss": 0.6305, + "step": 1285 + }, + { + "epoch": 0.6080378250591016, + "grad_norm": 2.661362409591675, + "learning_rate": 4.891318495745086e-06, + "loss": 0.5958, + "step": 1286 + }, + { + "epoch": 0.6085106382978723, + "grad_norm": 2.707348108291626, + "learning_rate": 4.8911364868518255e-06, + "loss": 0.5824, + "step": 1287 + }, + { + "epoch": 0.608983451536643, + "grad_norm": 2.9798858165740967, + "learning_rate": 4.890954329072981e-06, + "loss": 0.5981, + "step": 1288 + }, + { + "epoch": 0.6094562647754137, + "grad_norm": 2.6285455226898193, + "learning_rate": 4.890772022419895e-06, + "loss": 0.6194, + "step": 1289 + }, + { + "epoch": 0.6099290780141844, + "grad_norm": 2.9254322052001953, + "learning_rate": 4.890589566903917e-06, + "loss": 0.6002, + "step": 1290 + }, + { + "epoch": 0.6104018912529551, + "grad_norm": 2.6458325386047363, + "learning_rate": 4.89040696253641e-06, + "loss": 0.5457, + "step": 1291 + }, + { + "epoch": 0.6108747044917258, + "grad_norm": 2.508242607116699, + "learning_rate": 4.890224209328743e-06, + "loss": 0.6168, + "step": 1292 + }, + { + "epoch": 0.6113475177304964, + "grad_norm": 3.034785509109497, + "learning_rate": 4.890041307292296e-06, + "loss": 0.664, + "step": 1293 + }, + { + "epoch": 0.6118203309692671, + "grad_norm": 3.52469539642334, + "learning_rate": 4.889858256438455e-06, + "loss": 0.7301, + "step": 1294 + }, + { + "epoch": 0.6122931442080378, + "grad_norm": 2.9145348072052, + "learning_rate": 4.889675056778622e-06, + "loss": 0.6494, + "step": 1295 + }, + { + "epoch": 0.6127659574468085, + "grad_norm": 2.831829071044922, + "learning_rate": 4.8894917083242e-06, + "loss": 0.6064, + "step": 1296 + }, + { + "epoch": 0.6132387706855792, + "grad_norm": 2.6883130073547363, + "learning_rate": 4.889308211086608e-06, + "loss": 0.5642, + "step": 1297 + }, + { + "epoch": 0.6137115839243499, + "grad_norm": 3.0605485439300537, + "learning_rate": 4.889124565077269e-06, + "loss": 0.6695, + "step": 1298 + }, + { + "epoch": 0.6141843971631206, + "grad_norm": 3.44062876701355, + "learning_rate": 4.88894077030762e-06, + "loss": 0.6415, + "step": 1299 + }, + { + "epoch": 0.6146572104018913, + "grad_norm": 2.5970818996429443, + "learning_rate": 4.888756826789105e-06, + "loss": 0.6518, + "step": 1300 + }, + { + "epoch": 0.6151300236406619, + "grad_norm": 4.2233567237854, + "learning_rate": 4.8885727345331755e-06, + "loss": 0.6555, + "step": 1301 + }, + { + "epoch": 0.6156028368794326, + "grad_norm": 2.645385503768921, + "learning_rate": 4.888388493551297e-06, + "loss": 0.6762, + "step": 1302 + }, + { + "epoch": 0.6160756501182033, + "grad_norm": 2.907954454421997, + "learning_rate": 4.8882041038549385e-06, + "loss": 0.6526, + "step": 1303 + }, + { + "epoch": 0.616548463356974, + "grad_norm": 2.482771873474121, + "learning_rate": 4.888019565455583e-06, + "loss": 0.628, + "step": 1304 + }, + { + "epoch": 0.6170212765957447, + "grad_norm": 2.7165915966033936, + "learning_rate": 4.88783487836472e-06, + "loss": 0.5743, + "step": 1305 + }, + { + "epoch": 0.6174940898345154, + "grad_norm": 3.095627546310425, + "learning_rate": 4.88765004259385e-06, + "loss": 0.627, + "step": 1306 + }, + { + "epoch": 0.6179669030732861, + "grad_norm": 2.5018465518951416, + "learning_rate": 4.8874650581544805e-06, + "loss": 0.5215, + "step": 1307 + }, + { + "epoch": 0.6184397163120567, + "grad_norm": 3.094337224960327, + "learning_rate": 4.8872799250581316e-06, + "loss": 0.6979, + "step": 1308 + }, + { + "epoch": 0.6189125295508274, + "grad_norm": 3.1002209186553955, + "learning_rate": 4.887094643316329e-06, + "loss": 0.6565, + "step": 1309 + }, + { + "epoch": 0.6193853427895981, + "grad_norm": 2.551431894302368, + "learning_rate": 4.88690921294061e-06, + "loss": 0.5748, + "step": 1310 + }, + { + "epoch": 0.6198581560283688, + "grad_norm": 2.8282904624938965, + "learning_rate": 4.886723633942521e-06, + "loss": 0.676, + "step": 1311 + }, + { + "epoch": 0.6203309692671395, + "grad_norm": 2.8887810707092285, + "learning_rate": 4.886537906333617e-06, + "loss": 0.5971, + "step": 1312 + }, + { + "epoch": 0.6208037825059102, + "grad_norm": 2.9989118576049805, + "learning_rate": 4.886352030125462e-06, + "loss": 0.6341, + "step": 1313 + }, + { + "epoch": 0.6212765957446809, + "grad_norm": 2.8042776584625244, + "learning_rate": 4.886166005329629e-06, + "loss": 0.6578, + "step": 1314 + }, + { + "epoch": 0.6217494089834515, + "grad_norm": 2.4980967044830322, + "learning_rate": 4.8859798319577026e-06, + "loss": 0.6711, + "step": 1315 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 2.762369155883789, + "learning_rate": 4.885793510021274e-06, + "loss": 0.5747, + "step": 1316 + }, + { + "epoch": 0.6226950354609929, + "grad_norm": 3.136327028274536, + "learning_rate": 4.885607039531945e-06, + "loss": 0.7544, + "step": 1317 + }, + { + "epoch": 0.6231678486997636, + "grad_norm": 2.8736963272094727, + "learning_rate": 4.885420420501327e-06, + "loss": 0.6603, + "step": 1318 + }, + { + "epoch": 0.6236406619385343, + "grad_norm": 2.766237497329712, + "learning_rate": 4.885233652941039e-06, + "loss": 0.581, + "step": 1319 + }, + { + "epoch": 0.624113475177305, + "grad_norm": 2.4740939140319824, + "learning_rate": 4.88504673686271e-06, + "loss": 0.6335, + "step": 1320 + }, + { + "epoch": 0.6245862884160757, + "grad_norm": 3.324795961380005, + "learning_rate": 4.884859672277978e-06, + "loss": 0.6019, + "step": 1321 + }, + { + "epoch": 0.6250591016548463, + "grad_norm": 3.521327257156372, + "learning_rate": 4.884672459198493e-06, + "loss": 0.6104, + "step": 1322 + }, + { + "epoch": 0.625531914893617, + "grad_norm": 2.7728071212768555, + "learning_rate": 4.884485097635909e-06, + "loss": 0.6714, + "step": 1323 + }, + { + "epoch": 0.6260047281323877, + "grad_norm": 3.0738155841827393, + "learning_rate": 4.884297587601895e-06, + "loss": 0.604, + "step": 1324 + }, + { + "epoch": 0.6264775413711584, + "grad_norm": 2.719240427017212, + "learning_rate": 4.884109929108124e-06, + "loss": 0.6795, + "step": 1325 + }, + { + "epoch": 0.6269503546099291, + "grad_norm": 2.4108200073242188, + "learning_rate": 4.883922122166282e-06, + "loss": 0.5846, + "step": 1326 + }, + { + "epoch": 0.6274231678486998, + "grad_norm": 2.393899917602539, + "learning_rate": 4.883734166788063e-06, + "loss": 0.6188, + "step": 1327 + }, + { + "epoch": 0.6278959810874705, + "grad_norm": 4.555255889892578, + "learning_rate": 4.883546062985169e-06, + "loss": 0.5962, + "step": 1328 + }, + { + "epoch": 0.6283687943262412, + "grad_norm": 2.571075439453125, + "learning_rate": 4.883357810769315e-06, + "loss": 0.6165, + "step": 1329 + }, + { + "epoch": 0.6288416075650118, + "grad_norm": 2.553115129470825, + "learning_rate": 4.8831694101522185e-06, + "loss": 0.6787, + "step": 1330 + }, + { + "epoch": 0.6293144208037825, + "grad_norm": 3.2564642429351807, + "learning_rate": 4.882980861145614e-06, + "loss": 0.659, + "step": 1331 + }, + { + "epoch": 0.6297872340425532, + "grad_norm": 2.535216808319092, + "learning_rate": 4.882792163761241e-06, + "loss": 0.6176, + "step": 1332 + }, + { + "epoch": 0.6302600472813239, + "grad_norm": 3.097921848297119, + "learning_rate": 4.882603318010847e-06, + "loss": 0.6822, + "step": 1333 + }, + { + "epoch": 0.6307328605200946, + "grad_norm": 2.8135175704956055, + "learning_rate": 4.882414323906192e-06, + "loss": 0.6782, + "step": 1334 + }, + { + "epoch": 0.6312056737588653, + "grad_norm": 2.724634885787964, + "learning_rate": 4.882225181459044e-06, + "loss": 0.6545, + "step": 1335 + }, + { + "epoch": 0.631678486997636, + "grad_norm": 2.9585227966308594, + "learning_rate": 4.882035890681179e-06, + "loss": 0.6218, + "step": 1336 + }, + { + "epoch": 0.6321513002364066, + "grad_norm": 2.6952011585235596, + "learning_rate": 4.881846451584385e-06, + "loss": 0.6, + "step": 1337 + }, + { + "epoch": 0.6326241134751773, + "grad_norm": 3.1400704383850098, + "learning_rate": 4.881656864180455e-06, + "loss": 0.6687, + "step": 1338 + }, + { + "epoch": 0.633096926713948, + "grad_norm": 2.8382487297058105, + "learning_rate": 4.881467128481197e-06, + "loss": 0.574, + "step": 1339 + }, + { + "epoch": 0.6335697399527187, + "grad_norm": 2.8520095348358154, + "learning_rate": 4.881277244498422e-06, + "loss": 0.6582, + "step": 1340 + }, + { + "epoch": 0.6340425531914894, + "grad_norm": 2.703498363494873, + "learning_rate": 4.881087212243956e-06, + "loss": 0.7224, + "step": 1341 + }, + { + "epoch": 0.6345153664302601, + "grad_norm": 3.697205066680908, + "learning_rate": 4.880897031729629e-06, + "loss": 0.6582, + "step": 1342 + }, + { + "epoch": 0.6349881796690308, + "grad_norm": 2.7625808715820312, + "learning_rate": 4.880706702967284e-06, + "loss": 0.574, + "step": 1343 + }, + { + "epoch": 0.6354609929078014, + "grad_norm": 2.949984073638916, + "learning_rate": 4.880516225968771e-06, + "loss": 0.66, + "step": 1344 + }, + { + "epoch": 0.6359338061465721, + "grad_norm": 2.548269748687744, + "learning_rate": 4.8803256007459525e-06, + "loss": 0.642, + "step": 1345 + }, + { + "epoch": 0.6364066193853428, + "grad_norm": 2.5102174282073975, + "learning_rate": 4.8801348273106945e-06, + "loss": 0.6238, + "step": 1346 + }, + { + "epoch": 0.6368794326241135, + "grad_norm": 2.9847946166992188, + "learning_rate": 4.8799439056748786e-06, + "loss": 0.5416, + "step": 1347 + }, + { + "epoch": 0.6373522458628842, + "grad_norm": 2.8711049556732178, + "learning_rate": 4.879752835850391e-06, + "loss": 0.6427, + "step": 1348 + }, + { + "epoch": 0.6378250591016549, + "grad_norm": 2.7901716232299805, + "learning_rate": 4.879561617849129e-06, + "loss": 0.6026, + "step": 1349 + }, + { + "epoch": 0.6382978723404256, + "grad_norm": 2.659778356552124, + "learning_rate": 4.879370251682999e-06, + "loss": 0.6623, + "step": 1350 + }, + { + "epoch": 0.6387706855791963, + "grad_norm": 3.224386692047119, + "learning_rate": 4.879178737363917e-06, + "loss": 0.6485, + "step": 1351 + }, + { + "epoch": 0.6392434988179669, + "grad_norm": 2.6385605335235596, + "learning_rate": 4.8789870749038076e-06, + "loss": 0.5866, + "step": 1352 + }, + { + "epoch": 0.6397163120567376, + "grad_norm": 2.807713270187378, + "learning_rate": 4.8787952643146045e-06, + "loss": 0.6537, + "step": 1353 + }, + { + "epoch": 0.6401891252955083, + "grad_norm": 2.5689280033111572, + "learning_rate": 4.878603305608251e-06, + "loss": 0.6216, + "step": 1354 + }, + { + "epoch": 0.640661938534279, + "grad_norm": 2.7347843647003174, + "learning_rate": 4.8784111987967e-06, + "loss": 0.6318, + "step": 1355 + }, + { + "epoch": 0.6411347517730497, + "grad_norm": 2.5210378170013428, + "learning_rate": 4.878218943891911e-06, + "loss": 0.5472, + "step": 1356 + }, + { + "epoch": 0.6416075650118204, + "grad_norm": 2.866785764694214, + "learning_rate": 4.878026540905858e-06, + "loss": 0.7108, + "step": 1357 + }, + { + "epoch": 0.642080378250591, + "grad_norm": 2.923314332962036, + "learning_rate": 4.877833989850519e-06, + "loss": 0.5557, + "step": 1358 + }, + { + "epoch": 0.6425531914893617, + "grad_norm": 2.925463914871216, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.6382, + "step": 1359 + }, + { + "epoch": 0.6430260047281324, + "grad_norm": 2.909644365310669, + "learning_rate": 4.877448443579952e-06, + "loss": 0.5603, + "step": 1360 + }, + { + "epoch": 0.6434988179669031, + "grad_norm": 3.501148223876953, + "learning_rate": 4.8772554483887306e-06, + "loss": 0.6722, + "step": 1361 + }, + { + "epoch": 0.6439716312056738, + "grad_norm": 2.823765516281128, + "learning_rate": 4.877062305176235e-06, + "loss": 0.6408, + "step": 1362 + }, + { + "epoch": 0.6444444444444445, + "grad_norm": 2.9807584285736084, + "learning_rate": 4.8768690139544935e-06, + "loss": 0.5984, + "step": 1363 + }, + { + "epoch": 0.6449172576832152, + "grad_norm": 2.8411378860473633, + "learning_rate": 4.8766755747355405e-06, + "loss": 0.6231, + "step": 1364 + }, + { + "epoch": 0.6453900709219859, + "grad_norm": 3.158952236175537, + "learning_rate": 4.8764819875314215e-06, + "loss": 0.6441, + "step": 1365 + }, + { + "epoch": 0.6458628841607565, + "grad_norm": 2.9614369869232178, + "learning_rate": 4.876288252354189e-06, + "loss": 0.6308, + "step": 1366 + }, + { + "epoch": 0.6463356973995272, + "grad_norm": 3.073805570602417, + "learning_rate": 4.876094369215907e-06, + "loss": 0.6046, + "step": 1367 + }, + { + "epoch": 0.6468085106382979, + "grad_norm": 2.719189405441284, + "learning_rate": 4.875900338128648e-06, + "loss": 0.6082, + "step": 1368 + }, + { + "epoch": 0.6472813238770686, + "grad_norm": 2.676726818084717, + "learning_rate": 4.8757061591044914e-06, + "loss": 0.6344, + "step": 1369 + }, + { + "epoch": 0.6477541371158393, + "grad_norm": 2.955256938934326, + "learning_rate": 4.87551183215553e-06, + "loss": 0.6506, + "step": 1370 + }, + { + "epoch": 0.64822695035461, + "grad_norm": 2.5672218799591064, + "learning_rate": 4.875317357293864e-06, + "loss": 0.5284, + "step": 1371 + }, + { + "epoch": 0.6486997635933807, + "grad_norm": 2.5860238075256348, + "learning_rate": 4.875122734531602e-06, + "loss": 0.667, + "step": 1372 + }, + { + "epoch": 0.6491725768321513, + "grad_norm": 3.1037003993988037, + "learning_rate": 4.8749279638808605e-06, + "loss": 0.6902, + "step": 1373 + }, + { + "epoch": 0.649645390070922, + "grad_norm": 2.7715282440185547, + "learning_rate": 4.874733045353769e-06, + "loss": 0.6291, + "step": 1374 + }, + { + "epoch": 0.6501182033096927, + "grad_norm": 2.527071475982666, + "learning_rate": 4.874537978962463e-06, + "loss": 0.5565, + "step": 1375 + }, + { + "epoch": 0.6505910165484634, + "grad_norm": 2.722092628479004, + "learning_rate": 4.874342764719091e-06, + "loss": 0.5724, + "step": 1376 + }, + { + "epoch": 0.6510638297872341, + "grad_norm": 2.6342411041259766, + "learning_rate": 4.874147402635805e-06, + "loss": 0.6308, + "step": 1377 + }, + { + "epoch": 0.6515366430260048, + "grad_norm": 2.3850719928741455, + "learning_rate": 4.8739518927247695e-06, + "loss": 0.5692, + "step": 1378 + }, + { + "epoch": 0.6520094562647755, + "grad_norm": 2.9787259101867676, + "learning_rate": 4.873756234998161e-06, + "loss": 0.6953, + "step": 1379 + }, + { + "epoch": 0.6524822695035462, + "grad_norm": 2.634141683578491, + "learning_rate": 4.873560429468159e-06, + "loss": 0.6077, + "step": 1380 + }, + { + "epoch": 0.6529550827423168, + "grad_norm": 2.803046941757202, + "learning_rate": 4.873364476146958e-06, + "loss": 0.6657, + "step": 1381 + }, + { + "epoch": 0.6534278959810875, + "grad_norm": 2.762827157974243, + "learning_rate": 4.8731683750467574e-06, + "loss": 0.6061, + "step": 1382 + }, + { + "epoch": 0.6539007092198581, + "grad_norm": 2.6654391288757324, + "learning_rate": 4.872972126179768e-06, + "loss": 0.6387, + "step": 1383 + }, + { + "epoch": 0.6543735224586288, + "grad_norm": 2.4363625049591064, + "learning_rate": 4.872775729558209e-06, + "loss": 0.5623, + "step": 1384 + }, + { + "epoch": 0.6548463356973995, + "grad_norm": 2.528959035873413, + "learning_rate": 4.87257918519431e-06, + "loss": 0.5609, + "step": 1385 + }, + { + "epoch": 0.6553191489361702, + "grad_norm": 2.718383312225342, + "learning_rate": 4.872382493100309e-06, + "loss": 0.5575, + "step": 1386 + }, + { + "epoch": 0.6557919621749408, + "grad_norm": 2.660841226577759, + "learning_rate": 4.872185653288453e-06, + "loss": 0.6106, + "step": 1387 + }, + { + "epoch": 0.6562647754137115, + "grad_norm": 2.508753538131714, + "learning_rate": 4.871988665770997e-06, + "loss": 0.5705, + "step": 1388 + }, + { + "epoch": 0.6567375886524822, + "grad_norm": 2.5134334564208984, + "learning_rate": 4.871791530560208e-06, + "loss": 0.5592, + "step": 1389 + }, + { + "epoch": 0.6572104018912529, + "grad_norm": 2.7475597858428955, + "learning_rate": 4.871594247668361e-06, + "loss": 0.6277, + "step": 1390 + }, + { + "epoch": 0.6576832151300236, + "grad_norm": 2.793616533279419, + "learning_rate": 4.871396817107739e-06, + "loss": 0.595, + "step": 1391 + }, + { + "epoch": 0.6581560283687943, + "grad_norm": 2.8285086154937744, + "learning_rate": 4.871199238890635e-06, + "loss": 0.6094, + "step": 1392 + }, + { + "epoch": 0.658628841607565, + "grad_norm": 2.74124813079834, + "learning_rate": 4.871001513029352e-06, + "loss": 0.6296, + "step": 1393 + }, + { + "epoch": 0.6591016548463356, + "grad_norm": 2.761237621307373, + "learning_rate": 4.870803639536202e-06, + "loss": 0.5702, + "step": 1394 + }, + { + "epoch": 0.6595744680851063, + "grad_norm": 2.761038064956665, + "learning_rate": 4.870605618423504e-06, + "loss": 0.6195, + "step": 1395 + }, + { + "epoch": 0.660047281323877, + "grad_norm": 2.8812482357025146, + "learning_rate": 4.870407449703589e-06, + "loss": 0.616, + "step": 1396 + }, + { + "epoch": 0.6605200945626477, + "grad_norm": 2.9966578483581543, + "learning_rate": 4.870209133388797e-06, + "loss": 0.6547, + "step": 1397 + }, + { + "epoch": 0.6609929078014184, + "grad_norm": 2.7969017028808594, + "learning_rate": 4.870010669491474e-06, + "loss": 0.5762, + "step": 1398 + }, + { + "epoch": 0.6614657210401891, + "grad_norm": 2.557783842086792, + "learning_rate": 4.86981205802398e-06, + "loss": 0.6184, + "step": 1399 + }, + { + "epoch": 0.6619385342789598, + "grad_norm": 2.5393927097320557, + "learning_rate": 4.86961329899868e-06, + "loss": 0.5953, + "step": 1400 + }, + { + "epoch": 0.6624113475177305, + "grad_norm": 2.7745981216430664, + "learning_rate": 4.86941439242795e-06, + "loss": 0.5967, + "step": 1401 + }, + { + "epoch": 0.6628841607565011, + "grad_norm": 2.650381326675415, + "learning_rate": 4.869215338324176e-06, + "loss": 0.5667, + "step": 1402 + }, + { + "epoch": 0.6633569739952718, + "grad_norm": 2.583169937133789, + "learning_rate": 4.869016136699751e-06, + "loss": 0.549, + "step": 1403 + }, + { + "epoch": 0.6638297872340425, + "grad_norm": 2.984978437423706, + "learning_rate": 4.868816787567079e-06, + "loss": 0.5931, + "step": 1404 + }, + { + "epoch": 0.6643026004728132, + "grad_norm": 3.1947181224823, + "learning_rate": 4.868617290938573e-06, + "loss": 0.5473, + "step": 1405 + }, + { + "epoch": 0.6647754137115839, + "grad_norm": 2.562927007675171, + "learning_rate": 4.868417646826654e-06, + "loss": 0.6878, + "step": 1406 + }, + { + "epoch": 0.6652482269503546, + "grad_norm": 2.8741261959075928, + "learning_rate": 4.868217855243754e-06, + "loss": 0.6312, + "step": 1407 + }, + { + "epoch": 0.6657210401891253, + "grad_norm": 2.9834797382354736, + "learning_rate": 4.868017916202312e-06, + "loss": 0.5624, + "step": 1408 + }, + { + "epoch": 0.6661938534278959, + "grad_norm": 2.6935982704162598, + "learning_rate": 4.8678178297147785e-06, + "loss": 0.5857, + "step": 1409 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 2.8200576305389404, + "learning_rate": 4.86761759579361e-06, + "loss": 0.6153, + "step": 1410 + }, + { + "epoch": 0.6671394799054373, + "grad_norm": 2.831425189971924, + "learning_rate": 4.867417214451276e-06, + "loss": 0.6495, + "step": 1411 + }, + { + "epoch": 0.667612293144208, + "grad_norm": 2.733565092086792, + "learning_rate": 4.867216685700253e-06, + "loss": 0.6036, + "step": 1412 + }, + { + "epoch": 0.6680851063829787, + "grad_norm": 3.0609400272369385, + "learning_rate": 4.867016009553027e-06, + "loss": 0.6773, + "step": 1413 + }, + { + "epoch": 0.6685579196217494, + "grad_norm": 2.665452241897583, + "learning_rate": 4.866815186022093e-06, + "loss": 0.6256, + "step": 1414 + }, + { + "epoch": 0.6690307328605201, + "grad_norm": 2.9480721950531006, + "learning_rate": 4.866614215119956e-06, + "loss": 0.535, + "step": 1415 + }, + { + "epoch": 0.6695035460992907, + "grad_norm": 2.5514180660247803, + "learning_rate": 4.866413096859128e-06, + "loss": 0.6588, + "step": 1416 + }, + { + "epoch": 0.6699763593380614, + "grad_norm": 3.3442373275756836, + "learning_rate": 4.866211831252134e-06, + "loss": 0.5754, + "step": 1417 + }, + { + "epoch": 0.6704491725768321, + "grad_norm": 2.521467685699463, + "learning_rate": 4.866010418311504e-06, + "loss": 0.5546, + "step": 1418 + }, + { + "epoch": 0.6709219858156028, + "grad_norm": 2.930706262588501, + "learning_rate": 4.865808858049781e-06, + "loss": 0.589, + "step": 1419 + }, + { + "epoch": 0.6713947990543735, + "grad_norm": 2.6298375129699707, + "learning_rate": 4.865607150479513e-06, + "loss": 0.5915, + "step": 1420 + }, + { + "epoch": 0.6718676122931442, + "grad_norm": 2.9554293155670166, + "learning_rate": 4.8654052956132615e-06, + "loss": 0.6654, + "step": 1421 + }, + { + "epoch": 0.6723404255319149, + "grad_norm": 3.2706902027130127, + "learning_rate": 4.865203293463593e-06, + "loss": 0.7115, + "step": 1422 + }, + { + "epoch": 0.6728132387706856, + "grad_norm": 3.041539430618286, + "learning_rate": 4.865001144043088e-06, + "loss": 0.5818, + "step": 1423 + }, + { + "epoch": 0.6732860520094562, + "grad_norm": 3.1314544677734375, + "learning_rate": 4.864798847364331e-06, + "loss": 0.5822, + "step": 1424 + }, + { + "epoch": 0.6737588652482269, + "grad_norm": 2.5301461219787598, + "learning_rate": 4.86459640343992e-06, + "loss": 0.5525, + "step": 1425 + }, + { + "epoch": 0.6742316784869976, + "grad_norm": 2.809295892715454, + "learning_rate": 4.864393812282458e-06, + "loss": 0.6768, + "step": 1426 + }, + { + "epoch": 0.6747044917257683, + "grad_norm": 2.794664144515991, + "learning_rate": 4.864191073904562e-06, + "loss": 0.5793, + "step": 1427 + }, + { + "epoch": 0.675177304964539, + "grad_norm": 2.7771105766296387, + "learning_rate": 4.863988188318854e-06, + "loss": 0.6453, + "step": 1428 + }, + { + "epoch": 0.6756501182033097, + "grad_norm": 2.6431946754455566, + "learning_rate": 4.863785155537967e-06, + "loss": 0.5877, + "step": 1429 + }, + { + "epoch": 0.6761229314420804, + "grad_norm": 2.951353073120117, + "learning_rate": 4.863581975574544e-06, + "loss": 0.6793, + "step": 1430 + }, + { + "epoch": 0.676595744680851, + "grad_norm": 3.1336071491241455, + "learning_rate": 4.863378648441235e-06, + "loss": 0.6695, + "step": 1431 + }, + { + "epoch": 0.6770685579196217, + "grad_norm": 2.735982656478882, + "learning_rate": 4.8631751741507e-06, + "loss": 0.5239, + "step": 1432 + }, + { + "epoch": 0.6775413711583924, + "grad_norm": 2.7085206508636475, + "learning_rate": 4.862971552715611e-06, + "loss": 0.6837, + "step": 1433 + }, + { + "epoch": 0.6780141843971631, + "grad_norm": 3.136528730392456, + "learning_rate": 4.8627677841486436e-06, + "loss": 0.683, + "step": 1434 + }, + { + "epoch": 0.6784869976359338, + "grad_norm": 2.7879369258880615, + "learning_rate": 4.862563868462486e-06, + "loss": 0.608, + "step": 1435 + }, + { + "epoch": 0.6789598108747045, + "grad_norm": 2.7937729358673096, + "learning_rate": 4.862359805669837e-06, + "loss": 0.6131, + "step": 1436 + }, + { + "epoch": 0.6794326241134752, + "grad_norm": 2.5988364219665527, + "learning_rate": 4.862155595783401e-06, + "loss": 0.6303, + "step": 1437 + }, + { + "epoch": 0.6799054373522458, + "grad_norm": 3.251070499420166, + "learning_rate": 4.861951238815894e-06, + "loss": 0.7246, + "step": 1438 + }, + { + "epoch": 0.6803782505910165, + "grad_norm": 2.646759271621704, + "learning_rate": 4.861746734780039e-06, + "loss": 0.6313, + "step": 1439 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 2.773866891860962, + "learning_rate": 4.861542083688573e-06, + "loss": 0.6463, + "step": 1440 + }, + { + "epoch": 0.6813238770685579, + "grad_norm": 2.759965658187866, + "learning_rate": 4.861337285554235e-06, + "loss": 0.5428, + "step": 1441 + }, + { + "epoch": 0.6817966903073286, + "grad_norm": 3.3250818252563477, + "learning_rate": 4.861132340389779e-06, + "loss": 0.6522, + "step": 1442 + }, + { + "epoch": 0.6822695035460993, + "grad_norm": 2.661797523498535, + "learning_rate": 4.860927248207965e-06, + "loss": 0.5871, + "step": 1443 + }, + { + "epoch": 0.68274231678487, + "grad_norm": 2.706289052963257, + "learning_rate": 4.860722009021563e-06, + "loss": 0.6651, + "step": 1444 + }, + { + "epoch": 0.6832151300236406, + "grad_norm": 2.8459298610687256, + "learning_rate": 4.860516622843354e-06, + "loss": 0.5827, + "step": 1445 + }, + { + "epoch": 0.6836879432624113, + "grad_norm": 3.1041831970214844, + "learning_rate": 4.860311089686125e-06, + "loss": 0.6727, + "step": 1446 + }, + { + "epoch": 0.684160756501182, + "grad_norm": 2.9382801055908203, + "learning_rate": 4.8601054095626746e-06, + "loss": 0.6002, + "step": 1447 + }, + { + "epoch": 0.6846335697399527, + "grad_norm": 2.782475471496582, + "learning_rate": 4.859899582485808e-06, + "loss": 0.6951, + "step": 1448 + }, + { + "epoch": 0.6851063829787234, + "grad_norm": 3.313894510269165, + "learning_rate": 4.859693608468343e-06, + "loss": 0.6363, + "step": 1449 + }, + { + "epoch": 0.6855791962174941, + "grad_norm": 3.1639695167541504, + "learning_rate": 4.8594874875231045e-06, + "loss": 0.7002, + "step": 1450 + }, + { + "epoch": 0.6860520094562648, + "grad_norm": 2.6762218475341797, + "learning_rate": 4.859281219662926e-06, + "loss": 0.6246, + "step": 1451 + }, + { + "epoch": 0.6865248226950355, + "grad_norm": 2.8368663787841797, + "learning_rate": 4.85907480490065e-06, + "loss": 0.5906, + "step": 1452 + }, + { + "epoch": 0.6869976359338061, + "grad_norm": 2.887373208999634, + "learning_rate": 4.858868243249131e-06, + "loss": 0.5931, + "step": 1453 + }, + { + "epoch": 0.6874704491725768, + "grad_norm": 2.8115322589874268, + "learning_rate": 4.858661534721229e-06, + "loss": 0.6337, + "step": 1454 + }, + { + "epoch": 0.6879432624113475, + "grad_norm": 2.8470499515533447, + "learning_rate": 4.8584546793298174e-06, + "loss": 0.632, + "step": 1455 + }, + { + "epoch": 0.6884160756501182, + "grad_norm": 2.8229613304138184, + "learning_rate": 4.8582476770877725e-06, + "loss": 0.6494, + "step": 1456 + }, + { + "epoch": 0.6888888888888889, + "grad_norm": 2.4235479831695557, + "learning_rate": 4.858040528007987e-06, + "loss": 0.5709, + "step": 1457 + }, + { + "epoch": 0.6893617021276596, + "grad_norm": 2.9348199367523193, + "learning_rate": 4.857833232103356e-06, + "loss": 0.5404, + "step": 1458 + }, + { + "epoch": 0.6898345153664303, + "grad_norm": 2.8274219036102295, + "learning_rate": 4.857625789386789e-06, + "loss": 0.701, + "step": 1459 + }, + { + "epoch": 0.6903073286052009, + "grad_norm": 3.136929988861084, + "learning_rate": 4.857418199871203e-06, + "loss": 0.6971, + "step": 1460 + }, + { + "epoch": 0.6907801418439716, + "grad_norm": 2.8987185955047607, + "learning_rate": 4.8572104635695214e-06, + "loss": 0.6613, + "step": 1461 + }, + { + "epoch": 0.6912529550827423, + "grad_norm": 2.5073442459106445, + "learning_rate": 4.857002580494681e-06, + "loss": 0.6032, + "step": 1462 + }, + { + "epoch": 0.691725768321513, + "grad_norm": 2.7019522190093994, + "learning_rate": 4.856794550659625e-06, + "loss": 0.567, + "step": 1463 + }, + { + "epoch": 0.6921985815602837, + "grad_norm": 2.4795594215393066, + "learning_rate": 4.8565863740773054e-06, + "loss": 0.5777, + "step": 1464 + }, + { + "epoch": 0.6926713947990544, + "grad_norm": 3.032506227493286, + "learning_rate": 4.856378050760687e-06, + "loss": 0.607, + "step": 1465 + }, + { + "epoch": 0.6931442080378251, + "grad_norm": 3.052091121673584, + "learning_rate": 4.85616958072274e-06, + "loss": 0.591, + "step": 1466 + }, + { + "epoch": 0.6936170212765957, + "grad_norm": 2.704831838607788, + "learning_rate": 4.855960963976443e-06, + "loss": 0.6528, + "step": 1467 + }, + { + "epoch": 0.6940898345153664, + "grad_norm": 2.680995225906372, + "learning_rate": 4.855752200534788e-06, + "loss": 0.6294, + "step": 1468 + }, + { + "epoch": 0.6945626477541371, + "grad_norm": 2.3948659896850586, + "learning_rate": 4.855543290410774e-06, + "loss": 0.6091, + "step": 1469 + }, + { + "epoch": 0.6950354609929078, + "grad_norm": 2.6407411098480225, + "learning_rate": 4.855334233617407e-06, + "loss": 0.5572, + "step": 1470 + }, + { + "epoch": 0.6955082742316785, + "grad_norm": 2.5526835918426514, + "learning_rate": 4.8551250301677064e-06, + "loss": 0.5432, + "step": 1471 + }, + { + "epoch": 0.6959810874704492, + "grad_norm": 3.1237430572509766, + "learning_rate": 4.8549156800746965e-06, + "loss": 0.5944, + "step": 1472 + }, + { + "epoch": 0.6964539007092199, + "grad_norm": 2.8112540245056152, + "learning_rate": 4.854706183351412e-06, + "loss": 0.604, + "step": 1473 + }, + { + "epoch": 0.6969267139479906, + "grad_norm": 2.664644479751587, + "learning_rate": 4.8544965400109e-06, + "loss": 0.5647, + "step": 1474 + }, + { + "epoch": 0.6973995271867612, + "grad_norm": 3.26310133934021, + "learning_rate": 4.854286750066212e-06, + "loss": 0.6999, + "step": 1475 + }, + { + "epoch": 0.6978723404255319, + "grad_norm": 2.9717442989349365, + "learning_rate": 4.8540768135304115e-06, + "loss": 0.6655, + "step": 1476 + }, + { + "epoch": 0.6983451536643026, + "grad_norm": 2.5302982330322266, + "learning_rate": 4.85386673041657e-06, + "loss": 0.6384, + "step": 1477 + }, + { + "epoch": 0.6988179669030733, + "grad_norm": 2.864877700805664, + "learning_rate": 4.853656500737769e-06, + "loss": 0.6834, + "step": 1478 + }, + { + "epoch": 0.699290780141844, + "grad_norm": 2.5522031784057617, + "learning_rate": 4.853446124507098e-06, + "loss": 0.5929, + "step": 1479 + }, + { + "epoch": 0.6997635933806147, + "grad_norm": 3.096477746963501, + "learning_rate": 4.853235601737656e-06, + "loss": 0.5737, + "step": 1480 + }, + { + "epoch": 0.7002364066193854, + "grad_norm": 2.884779214859009, + "learning_rate": 4.853024932442552e-06, + "loss": 0.6362, + "step": 1481 + }, + { + "epoch": 0.700709219858156, + "grad_norm": 3.368558406829834, + "learning_rate": 4.852814116634903e-06, + "loss": 0.6721, + "step": 1482 + }, + { + "epoch": 0.7011820330969267, + "grad_norm": 2.742414951324463, + "learning_rate": 4.852603154327837e-06, + "loss": 0.6212, + "step": 1483 + }, + { + "epoch": 0.7016548463356974, + "grad_norm": 2.53454852104187, + "learning_rate": 4.8523920455344864e-06, + "loss": 0.6675, + "step": 1484 + }, + { + "epoch": 0.7021276595744681, + "grad_norm": 2.9354238510131836, + "learning_rate": 4.852180790267999e-06, + "loss": 0.6692, + "step": 1485 + }, + { + "epoch": 0.7026004728132388, + "grad_norm": 2.585070848464966, + "learning_rate": 4.8519693885415274e-06, + "loss": 0.6215, + "step": 1486 + }, + { + "epoch": 0.7030732860520095, + "grad_norm": 2.9047999382019043, + "learning_rate": 4.851757840368235e-06, + "loss": 0.6231, + "step": 1487 + }, + { + "epoch": 0.7035460992907802, + "grad_norm": 3.0930933952331543, + "learning_rate": 4.851546145761295e-06, + "loss": 0.7267, + "step": 1488 + }, + { + "epoch": 0.7040189125295508, + "grad_norm": 3.0224719047546387, + "learning_rate": 4.8513343047338875e-06, + "loss": 0.6293, + "step": 1489 + }, + { + "epoch": 0.7044917257683215, + "grad_norm": 2.5758471488952637, + "learning_rate": 4.851122317299203e-06, + "loss": 0.5855, + "step": 1490 + }, + { + "epoch": 0.7049645390070922, + "grad_norm": 2.579272508621216, + "learning_rate": 4.850910183470441e-06, + "loss": 0.582, + "step": 1491 + }, + { + "epoch": 0.7054373522458629, + "grad_norm": 2.8148300647735596, + "learning_rate": 4.85069790326081e-06, + "loss": 0.6396, + "step": 1492 + }, + { + "epoch": 0.7059101654846336, + "grad_norm": 2.6380527019500732, + "learning_rate": 4.850485476683528e-06, + "loss": 0.6114, + "step": 1493 + }, + { + "epoch": 0.7063829787234043, + "grad_norm": 2.7736263275146484, + "learning_rate": 4.850272903751823e-06, + "loss": 0.6683, + "step": 1494 + }, + { + "epoch": 0.706855791962175, + "grad_norm": 3.1958179473876953, + "learning_rate": 4.8500601844789285e-06, + "loss": 0.6265, + "step": 1495 + }, + { + "epoch": 0.7073286052009456, + "grad_norm": 3.783212423324585, + "learning_rate": 4.8498473188780916e-06, + "loss": 0.6078, + "step": 1496 + }, + { + "epoch": 0.7078014184397163, + "grad_norm": 2.6656646728515625, + "learning_rate": 4.849634306962566e-06, + "loss": 0.5756, + "step": 1497 + }, + { + "epoch": 0.708274231678487, + "grad_norm": 2.757141590118408, + "learning_rate": 4.849421148745615e-06, + "loss": 0.5596, + "step": 1498 + }, + { + "epoch": 0.7087470449172577, + "grad_norm": 3.0391886234283447, + "learning_rate": 4.849207844240511e-06, + "loss": 0.5293, + "step": 1499 + }, + { + "epoch": 0.7092198581560284, + "grad_norm": 2.981912851333618, + "learning_rate": 4.848994393460535e-06, + "loss": 0.598, + "step": 1500 + }, + { + "epoch": 0.7096926713947991, + "grad_norm": 2.5470798015594482, + "learning_rate": 4.848780796418978e-06, + "loss": 0.6266, + "step": 1501 + }, + { + "epoch": 0.7101654846335698, + "grad_norm": 2.8394415378570557, + "learning_rate": 4.8485670531291415e-06, + "loss": 0.6844, + "step": 1502 + }, + { + "epoch": 0.7106382978723405, + "grad_norm": 3.2023508548736572, + "learning_rate": 4.848353163604331e-06, + "loss": 0.6134, + "step": 1503 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 2.98245906829834, + "learning_rate": 4.848139127857867e-06, + "loss": 0.7084, + "step": 1504 + }, + { + "epoch": 0.7115839243498818, + "grad_norm": 2.5917441844940186, + "learning_rate": 4.847924945903076e-06, + "loss": 0.5676, + "step": 1505 + }, + { + "epoch": 0.7120567375886525, + "grad_norm": 2.8736681938171387, + "learning_rate": 4.847710617753294e-06, + "loss": 0.6304, + "step": 1506 + }, + { + "epoch": 0.7125295508274232, + "grad_norm": 2.7832682132720947, + "learning_rate": 4.847496143421866e-06, + "loss": 0.5705, + "step": 1507 + }, + { + "epoch": 0.7130023640661939, + "grad_norm": 2.480560779571533, + "learning_rate": 4.847281522922147e-06, + "loss": 0.5595, + "step": 1508 + }, + { + "epoch": 0.7134751773049646, + "grad_norm": 2.357675313949585, + "learning_rate": 4.847066756267499e-06, + "loss": 0.5065, + "step": 1509 + }, + { + "epoch": 0.7139479905437353, + "grad_norm": 2.632669448852539, + "learning_rate": 4.846851843471296e-06, + "loss": 0.6949, + "step": 1510 + }, + { + "epoch": 0.7144208037825059, + "grad_norm": 2.7691073417663574, + "learning_rate": 4.84663678454692e-06, + "loss": 0.6638, + "step": 1511 + }, + { + "epoch": 0.7148936170212766, + "grad_norm": 2.5647685527801514, + "learning_rate": 4.846421579507761e-06, + "loss": 0.6098, + "step": 1512 + }, + { + "epoch": 0.7153664302600473, + "grad_norm": 2.476701021194458, + "learning_rate": 4.846206228367218e-06, + "loss": 0.592, + "step": 1513 + }, + { + "epoch": 0.715839243498818, + "grad_norm": 2.805727958679199, + "learning_rate": 4.845990731138702e-06, + "loss": 0.5466, + "step": 1514 + }, + { + "epoch": 0.7163120567375887, + "grad_norm": 2.551392078399658, + "learning_rate": 4.84577508783563e-06, + "loss": 0.6039, + "step": 1515 + }, + { + "epoch": 0.7167848699763594, + "grad_norm": 2.6861350536346436, + "learning_rate": 4.845559298471429e-06, + "loss": 0.6427, + "step": 1516 + }, + { + "epoch": 0.7172576832151301, + "grad_norm": 3.1908371448516846, + "learning_rate": 4.845343363059535e-06, + "loss": 0.5447, + "step": 1517 + }, + { + "epoch": 0.7177304964539007, + "grad_norm": 2.9021761417388916, + "learning_rate": 4.845127281613394e-06, + "loss": 0.5836, + "step": 1518 + }, + { + "epoch": 0.7182033096926714, + "grad_norm": 2.476670742034912, + "learning_rate": 4.844911054146461e-06, + "loss": 0.5863, + "step": 1519 + }, + { + "epoch": 0.7186761229314421, + "grad_norm": 2.662935495376587, + "learning_rate": 4.844694680672198e-06, + "loss": 0.5678, + "step": 1520 + }, + { + "epoch": 0.7191489361702128, + "grad_norm": 2.677896738052368, + "learning_rate": 4.844478161204079e-06, + "loss": 0.6195, + "step": 1521 + }, + { + "epoch": 0.7196217494089835, + "grad_norm": 2.781921863555908, + "learning_rate": 4.844261495755585e-06, + "loss": 0.643, + "step": 1522 + }, + { + "epoch": 0.7200945626477542, + "grad_norm": 3.0157392024993896, + "learning_rate": 4.844044684340206e-06, + "loss": 0.7559, + "step": 1523 + }, + { + "epoch": 0.7205673758865249, + "grad_norm": 2.8109354972839355, + "learning_rate": 4.843827726971444e-06, + "loss": 0.6264, + "step": 1524 + }, + { + "epoch": 0.7210401891252955, + "grad_norm": 3.0953569412231445, + "learning_rate": 4.8436106236628064e-06, + "loss": 0.6429, + "step": 1525 + }, + { + "epoch": 0.7215130023640662, + "grad_norm": 2.6850643157958984, + "learning_rate": 4.843393374427812e-06, + "loss": 0.6598, + "step": 1526 + }, + { + "epoch": 0.7219858156028369, + "grad_norm": 3.043480634689331, + "learning_rate": 4.8431759792799874e-06, + "loss": 0.6331, + "step": 1527 + }, + { + "epoch": 0.7224586288416076, + "grad_norm": 2.723870038986206, + "learning_rate": 4.842958438232868e-06, + "loss": 0.6259, + "step": 1528 + }, + { + "epoch": 0.7229314420803783, + "grad_norm": 2.822492837905884, + "learning_rate": 4.842740751300002e-06, + "loss": 0.6554, + "step": 1529 + }, + { + "epoch": 0.723404255319149, + "grad_norm": 2.7866315841674805, + "learning_rate": 4.842522918494941e-06, + "loss": 0.6991, + "step": 1530 + }, + { + "epoch": 0.7238770685579197, + "grad_norm": 2.8881826400756836, + "learning_rate": 4.84230493983125e-06, + "loss": 0.5876, + "step": 1531 + }, + { + "epoch": 0.7243498817966904, + "grad_norm": 2.7456939220428467, + "learning_rate": 4.8420868153225e-06, + "loss": 0.6188, + "step": 1532 + }, + { + "epoch": 0.724822695035461, + "grad_norm": 3.0257532596588135, + "learning_rate": 4.841868544982274e-06, + "loss": 0.63, + "step": 1533 + }, + { + "epoch": 0.7252955082742317, + "grad_norm": 3.1581954956054688, + "learning_rate": 4.841650128824164e-06, + "loss": 0.7214, + "step": 1534 + }, + { + "epoch": 0.7257683215130024, + "grad_norm": 2.9174306392669678, + "learning_rate": 4.841431566861767e-06, + "loss": 0.704, + "step": 1535 + }, + { + "epoch": 0.7262411347517731, + "grad_norm": 2.5019054412841797, + "learning_rate": 4.8412128591086935e-06, + "loss": 0.6298, + "step": 1536 + }, + { + "epoch": 0.7267139479905438, + "grad_norm": 2.724285125732422, + "learning_rate": 4.840994005578562e-06, + "loss": 0.6289, + "step": 1537 + }, + { + "epoch": 0.7271867612293145, + "grad_norm": 2.5882341861724854, + "learning_rate": 4.840775006284998e-06, + "loss": 0.6355, + "step": 1538 + }, + { + "epoch": 0.7276595744680852, + "grad_norm": 3.1281991004943848, + "learning_rate": 4.840555861241638e-06, + "loss": 0.5551, + "step": 1539 + }, + { + "epoch": 0.7281323877068558, + "grad_norm": 2.6064817905426025, + "learning_rate": 4.840336570462127e-06, + "loss": 0.5543, + "step": 1540 + }, + { + "epoch": 0.7286052009456265, + "grad_norm": 2.67112398147583, + "learning_rate": 4.840117133960122e-06, + "loss": 0.6044, + "step": 1541 + }, + { + "epoch": 0.7290780141843972, + "grad_norm": 2.838022232055664, + "learning_rate": 4.839897551749282e-06, + "loss": 0.6814, + "step": 1542 + }, + { + "epoch": 0.7295508274231679, + "grad_norm": 2.8897151947021484, + "learning_rate": 4.839677823843283e-06, + "loss": 0.593, + "step": 1543 + }, + { + "epoch": 0.7300236406619386, + "grad_norm": 2.9238014221191406, + "learning_rate": 4.839457950255805e-06, + "loss": 0.5544, + "step": 1544 + }, + { + "epoch": 0.7304964539007093, + "grad_norm": 3.016876459121704, + "learning_rate": 4.839237931000538e-06, + "loss": 0.6099, + "step": 1545 + }, + { + "epoch": 0.7309692671394799, + "grad_norm": 2.9415392875671387, + "learning_rate": 4.839017766091182e-06, + "loss": 0.6413, + "step": 1546 + }, + { + "epoch": 0.7314420803782505, + "grad_norm": 2.658067226409912, + "learning_rate": 4.838797455541446e-06, + "loss": 0.6534, + "step": 1547 + }, + { + "epoch": 0.7319148936170212, + "grad_norm": 2.460358142852783, + "learning_rate": 4.838576999365049e-06, + "loss": 0.5307, + "step": 1548 + }, + { + "epoch": 0.7323877068557919, + "grad_norm": 2.5818674564361572, + "learning_rate": 4.838356397575716e-06, + "loss": 0.6265, + "step": 1549 + }, + { + "epoch": 0.7328605200945626, + "grad_norm": 3.009197473526001, + "learning_rate": 4.838135650187183e-06, + "loss": 0.6957, + "step": 1550 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 2.738543748855591, + "learning_rate": 4.837914757213196e-06, + "loss": 0.646, + "step": 1551 + }, + { + "epoch": 0.733806146572104, + "grad_norm": 2.8208494186401367, + "learning_rate": 4.837693718667508e-06, + "loss": 0.5936, + "step": 1552 + }, + { + "epoch": 0.7342789598108747, + "grad_norm": 3.1574649810791016, + "learning_rate": 4.837472534563883e-06, + "loss": 0.6455, + "step": 1553 + }, + { + "epoch": 0.7347517730496453, + "grad_norm": 2.6737420558929443, + "learning_rate": 4.837251204916093e-06, + "loss": 0.5921, + "step": 1554 + }, + { + "epoch": 0.735224586288416, + "grad_norm": 2.424983024597168, + "learning_rate": 4.837029729737918e-06, + "loss": 0.6346, + "step": 1555 + }, + { + "epoch": 0.7356973995271867, + "grad_norm": 2.5163493156433105, + "learning_rate": 4.836808109043151e-06, + "loss": 0.6061, + "step": 1556 + }, + { + "epoch": 0.7361702127659574, + "grad_norm": 2.8377044200897217, + "learning_rate": 4.836586342845588e-06, + "loss": 0.611, + "step": 1557 + }, + { + "epoch": 0.7366430260047281, + "grad_norm": 2.5929181575775146, + "learning_rate": 4.83636443115904e-06, + "loss": 0.5496, + "step": 1558 + }, + { + "epoch": 0.7371158392434988, + "grad_norm": 2.5017223358154297, + "learning_rate": 4.836142373997323e-06, + "loss": 0.6235, + "step": 1559 + }, + { + "epoch": 0.7375886524822695, + "grad_norm": 2.822500228881836, + "learning_rate": 4.835920171374265e-06, + "loss": 0.6147, + "step": 1560 + }, + { + "epoch": 0.7380614657210401, + "grad_norm": 2.7234230041503906, + "learning_rate": 4.8356978233037e-06, + "loss": 0.6228, + "step": 1561 + }, + { + "epoch": 0.7385342789598108, + "grad_norm": 2.9565515518188477, + "learning_rate": 4.835475329799472e-06, + "loss": 0.5728, + "step": 1562 + }, + { + "epoch": 0.7390070921985815, + "grad_norm": 2.4356038570404053, + "learning_rate": 4.835252690875438e-06, + "loss": 0.6723, + "step": 1563 + }, + { + "epoch": 0.7394799054373522, + "grad_norm": 2.765913248062134, + "learning_rate": 4.835029906545458e-06, + "loss": 0.5805, + "step": 1564 + }, + { + "epoch": 0.7399527186761229, + "grad_norm": 2.4481914043426514, + "learning_rate": 4.834806976823405e-06, + "loss": 0.599, + "step": 1565 + }, + { + "epoch": 0.7404255319148936, + "grad_norm": 2.620779514312744, + "learning_rate": 4.834583901723158e-06, + "loss": 0.63, + "step": 1566 + }, + { + "epoch": 0.7408983451536643, + "grad_norm": 2.654426097869873, + "learning_rate": 4.83436068125861e-06, + "loss": 0.6544, + "step": 1567 + }, + { + "epoch": 0.741371158392435, + "grad_norm": 2.589623212814331, + "learning_rate": 4.834137315443656e-06, + "loss": 0.5596, + "step": 1568 + }, + { + "epoch": 0.7418439716312056, + "grad_norm": 2.572883129119873, + "learning_rate": 4.833913804292209e-06, + "loss": 0.5974, + "step": 1569 + }, + { + "epoch": 0.7423167848699763, + "grad_norm": 2.8744914531707764, + "learning_rate": 4.833690147818181e-06, + "loss": 0.5364, + "step": 1570 + }, + { + "epoch": 0.742789598108747, + "grad_norm": 2.9800851345062256, + "learning_rate": 4.833466346035502e-06, + "loss": 0.6287, + "step": 1571 + }, + { + "epoch": 0.7432624113475177, + "grad_norm": 2.627784490585327, + "learning_rate": 4.833242398958105e-06, + "loss": 0.621, + "step": 1572 + }, + { + "epoch": 0.7437352245862884, + "grad_norm": 2.5187721252441406, + "learning_rate": 4.833018306599933e-06, + "loss": 0.5901, + "step": 1573 + }, + { + "epoch": 0.7442080378250591, + "grad_norm": 2.4843688011169434, + "learning_rate": 4.832794068974944e-06, + "loss": 0.6336, + "step": 1574 + }, + { + "epoch": 0.7446808510638298, + "grad_norm": 2.774911880493164, + "learning_rate": 4.832569686097096e-06, + "loss": 0.6091, + "step": 1575 + }, + { + "epoch": 0.7451536643026004, + "grad_norm": 3.2562527656555176, + "learning_rate": 4.8323451579803615e-06, + "loss": 0.7686, + "step": 1576 + }, + { + "epoch": 0.7456264775413711, + "grad_norm": 2.799570083618164, + "learning_rate": 4.832120484638721e-06, + "loss": 0.6233, + "step": 1577 + }, + { + "epoch": 0.7460992907801418, + "grad_norm": 2.661893367767334, + "learning_rate": 4.831895666086164e-06, + "loss": 0.5841, + "step": 1578 + }, + { + "epoch": 0.7465721040189125, + "grad_norm": 3.0382652282714844, + "learning_rate": 4.831670702336689e-06, + "loss": 0.5769, + "step": 1579 + }, + { + "epoch": 0.7470449172576832, + "grad_norm": 2.676398515701294, + "learning_rate": 4.831445593404304e-06, + "loss": 0.619, + "step": 1580 + }, + { + "epoch": 0.7475177304964539, + "grad_norm": 2.717916965484619, + "learning_rate": 4.831220339303024e-06, + "loss": 0.5787, + "step": 1581 + }, + { + "epoch": 0.7479905437352246, + "grad_norm": 2.3918066024780273, + "learning_rate": 4.830994940046876e-06, + "loss": 0.5108, + "step": 1582 + }, + { + "epoch": 0.7484633569739952, + "grad_norm": 2.709144115447998, + "learning_rate": 4.830769395649895e-06, + "loss": 0.6875, + "step": 1583 + }, + { + "epoch": 0.7489361702127659, + "grad_norm": 2.8711116313934326, + "learning_rate": 4.830543706126123e-06, + "loss": 0.6745, + "step": 1584 + }, + { + "epoch": 0.7494089834515366, + "grad_norm": 2.612339496612549, + "learning_rate": 4.830317871489614e-06, + "loss": 0.5738, + "step": 1585 + }, + { + "epoch": 0.7498817966903073, + "grad_norm": 2.4355857372283936, + "learning_rate": 4.830091891754429e-06, + "loss": 0.5907, + "step": 1586 + }, + { + "epoch": 0.750354609929078, + "grad_norm": 2.676051378250122, + "learning_rate": 4.829865766934638e-06, + "loss": 0.6628, + "step": 1587 + }, + { + "epoch": 0.7508274231678487, + "grad_norm": 2.66489839553833, + "learning_rate": 4.829639497044323e-06, + "loss": 0.5984, + "step": 1588 + }, + { + "epoch": 0.7513002364066194, + "grad_norm": 2.5358035564422607, + "learning_rate": 4.829413082097572e-06, + "loss": 0.5867, + "step": 1589 + }, + { + "epoch": 0.75177304964539, + "grad_norm": 2.6530144214630127, + "learning_rate": 4.8291865221084815e-06, + "loss": 0.5917, + "step": 1590 + }, + { + "epoch": 0.7522458628841607, + "grad_norm": 2.5160958766937256, + "learning_rate": 4.82895981709116e-06, + "loss": 0.6347, + "step": 1591 + }, + { + "epoch": 0.7527186761229314, + "grad_norm": 2.61592698097229, + "learning_rate": 4.8287329670597225e-06, + "loss": 0.5472, + "step": 1592 + }, + { + "epoch": 0.7531914893617021, + "grad_norm": 2.7528622150421143, + "learning_rate": 4.828505972028296e-06, + "loss": 0.5842, + "step": 1593 + }, + { + "epoch": 0.7536643026004728, + "grad_norm": 2.8154072761535645, + "learning_rate": 4.828278832011011e-06, + "loss": 0.5757, + "step": 1594 + }, + { + "epoch": 0.7541371158392435, + "grad_norm": 3.118515729904175, + "learning_rate": 4.828051547022013e-06, + "loss": 0.6472, + "step": 1595 + }, + { + "epoch": 0.7546099290780142, + "grad_norm": 2.452033758163452, + "learning_rate": 4.827824117075453e-06, + "loss": 0.5571, + "step": 1596 + }, + { + "epoch": 0.7550827423167848, + "grad_norm": 2.984388828277588, + "learning_rate": 4.827596542185492e-06, + "loss": 0.6656, + "step": 1597 + }, + { + "epoch": 0.7555555555555555, + "grad_norm": 2.61356782913208, + "learning_rate": 4.8273688223663014e-06, + "loss": 0.6444, + "step": 1598 + }, + { + "epoch": 0.7560283687943262, + "grad_norm": 2.8967196941375732, + "learning_rate": 4.8271409576320595e-06, + "loss": 0.6457, + "step": 1599 + }, + { + "epoch": 0.7565011820330969, + "grad_norm": 2.852367639541626, + "learning_rate": 4.826912947996954e-06, + "loss": 0.5629, + "step": 1600 + }, + { + "epoch": 0.7569739952718676, + "grad_norm": 2.905280590057373, + "learning_rate": 4.826684793475182e-06, + "loss": 0.6245, + "step": 1601 + }, + { + "epoch": 0.7574468085106383, + "grad_norm": 2.6156530380249023, + "learning_rate": 4.826456494080951e-06, + "loss": 0.5869, + "step": 1602 + }, + { + "epoch": 0.757919621749409, + "grad_norm": 2.6490228176116943, + "learning_rate": 4.826228049828475e-06, + "loss": 0.5461, + "step": 1603 + }, + { + "epoch": 0.7583924349881797, + "grad_norm": 2.9626693725585938, + "learning_rate": 4.825999460731978e-06, + "loss": 0.6842, + "step": 1604 + }, + { + "epoch": 0.7588652482269503, + "grad_norm": 2.6866023540496826, + "learning_rate": 4.825770726805695e-06, + "loss": 0.5726, + "step": 1605 + }, + { + "epoch": 0.759338061465721, + "grad_norm": 2.5525858402252197, + "learning_rate": 4.825541848063866e-06, + "loss": 0.6061, + "step": 1606 + }, + { + "epoch": 0.7598108747044917, + "grad_norm": 2.703977584838867, + "learning_rate": 4.825312824520743e-06, + "loss": 0.6726, + "step": 1607 + }, + { + "epoch": 0.7602836879432624, + "grad_norm": 2.856534957885742, + "learning_rate": 4.825083656190588e-06, + "loss": 0.625, + "step": 1608 + }, + { + "epoch": 0.7607565011820331, + "grad_norm": 2.8564887046813965, + "learning_rate": 4.824854343087668e-06, + "loss": 0.7251, + "step": 1609 + }, + { + "epoch": 0.7612293144208038, + "grad_norm": 2.327650308609009, + "learning_rate": 4.824624885226262e-06, + "loss": 0.526, + "step": 1610 + }, + { + "epoch": 0.7617021276595745, + "grad_norm": 3.0025737285614014, + "learning_rate": 4.824395282620659e-06, + "loss": 0.6043, + "step": 1611 + }, + { + "epoch": 0.7621749408983451, + "grad_norm": 2.5441737174987793, + "learning_rate": 4.824165535285152e-06, + "loss": 0.6276, + "step": 1612 + }, + { + "epoch": 0.7626477541371158, + "grad_norm": 2.4177372455596924, + "learning_rate": 4.823935643234049e-06, + "loss": 0.6419, + "step": 1613 + }, + { + "epoch": 0.7631205673758865, + "grad_norm": 2.9210550785064697, + "learning_rate": 4.823705606481664e-06, + "loss": 0.5663, + "step": 1614 + }, + { + "epoch": 0.7635933806146572, + "grad_norm": 2.6353724002838135, + "learning_rate": 4.82347542504232e-06, + "loss": 0.5669, + "step": 1615 + }, + { + "epoch": 0.7640661938534279, + "grad_norm": 2.419081926345825, + "learning_rate": 4.823245098930349e-06, + "loss": 0.5777, + "step": 1616 + }, + { + "epoch": 0.7645390070921986, + "grad_norm": 2.5077571868896484, + "learning_rate": 4.823014628160093e-06, + "loss": 0.5924, + "step": 1617 + }, + { + "epoch": 0.7650118203309693, + "grad_norm": 2.816056251525879, + "learning_rate": 4.822784012745902e-06, + "loss": 0.7273, + "step": 1618 + }, + { + "epoch": 0.76548463356974, + "grad_norm": 2.7163147926330566, + "learning_rate": 4.8225532527021366e-06, + "loss": 0.5545, + "step": 1619 + }, + { + "epoch": 0.7659574468085106, + "grad_norm": 2.4784302711486816, + "learning_rate": 4.822322348043164e-06, + "loss": 0.556, + "step": 1620 + }, + { + "epoch": 0.7664302600472813, + "grad_norm": 2.712467670440674, + "learning_rate": 4.822091298783361e-06, + "loss": 0.6501, + "step": 1621 + }, + { + "epoch": 0.766903073286052, + "grad_norm": 2.7217724323272705, + "learning_rate": 4.821860104937115e-06, + "loss": 0.5989, + "step": 1622 + }, + { + "epoch": 0.7673758865248227, + "grad_norm": 2.5622854232788086, + "learning_rate": 4.821628766518821e-06, + "loss": 0.5263, + "step": 1623 + }, + { + "epoch": 0.7678486997635934, + "grad_norm": 3.230923891067505, + "learning_rate": 4.821397283542884e-06, + "loss": 0.6707, + "step": 1624 + }, + { + "epoch": 0.7683215130023641, + "grad_norm": 2.37929105758667, + "learning_rate": 4.821165656023718e-06, + "loss": 0.6124, + "step": 1625 + }, + { + "epoch": 0.7687943262411348, + "grad_norm": 2.9811325073242188, + "learning_rate": 4.820933883975745e-06, + "loss": 0.6435, + "step": 1626 + }, + { + "epoch": 0.7692671394799054, + "grad_norm": 2.887380838394165, + "learning_rate": 4.820701967413395e-06, + "loss": 0.621, + "step": 1627 + }, + { + "epoch": 0.7697399527186761, + "grad_norm": 2.6762876510620117, + "learning_rate": 4.820469906351109e-06, + "loss": 0.5713, + "step": 1628 + }, + { + "epoch": 0.7702127659574468, + "grad_norm": 2.7347512245178223, + "learning_rate": 4.820237700803337e-06, + "loss": 0.6136, + "step": 1629 + }, + { + "epoch": 0.7706855791962175, + "grad_norm": 2.7244746685028076, + "learning_rate": 4.820005350784539e-06, + "loss": 0.5816, + "step": 1630 + }, + { + "epoch": 0.7711583924349882, + "grad_norm": 2.9293999671936035, + "learning_rate": 4.8197728563091795e-06, + "loss": 0.6649, + "step": 1631 + }, + { + "epoch": 0.7716312056737589, + "grad_norm": 2.4402127265930176, + "learning_rate": 4.819540217391736e-06, + "loss": 0.6481, + "step": 1632 + }, + { + "epoch": 0.7721040189125296, + "grad_norm": 3.083941698074341, + "learning_rate": 4.819307434046694e-06, + "loss": 0.6951, + "step": 1633 + }, + { + "epoch": 0.7725768321513002, + "grad_norm": 2.544952392578125, + "learning_rate": 4.819074506288548e-06, + "loss": 0.539, + "step": 1634 + }, + { + "epoch": 0.7730496453900709, + "grad_norm": 2.7791268825531006, + "learning_rate": 4.818841434131801e-06, + "loss": 0.5827, + "step": 1635 + }, + { + "epoch": 0.7735224586288416, + "grad_norm": 2.7349796295166016, + "learning_rate": 4.818608217590967e-06, + "loss": 0.5584, + "step": 1636 + }, + { + "epoch": 0.7739952718676123, + "grad_norm": 2.637652635574341, + "learning_rate": 4.818374856680565e-06, + "loss": 0.6386, + "step": 1637 + }, + { + "epoch": 0.774468085106383, + "grad_norm": 2.9821584224700928, + "learning_rate": 4.818141351415127e-06, + "loss": 0.6734, + "step": 1638 + }, + { + "epoch": 0.7749408983451537, + "grad_norm": 2.992938995361328, + "learning_rate": 4.817907701809192e-06, + "loss": 0.5899, + "step": 1639 + }, + { + "epoch": 0.7754137115839244, + "grad_norm": 4.35719633102417, + "learning_rate": 4.8176739078773076e-06, + "loss": 0.6281, + "step": 1640 + }, + { + "epoch": 0.775886524822695, + "grad_norm": 2.838146209716797, + "learning_rate": 4.8174399696340315e-06, + "loss": 0.5766, + "step": 1641 + }, + { + "epoch": 0.7763593380614657, + "grad_norm": 3.3116989135742188, + "learning_rate": 4.81720588709393e-06, + "loss": 0.6409, + "step": 1642 + }, + { + "epoch": 0.7768321513002364, + "grad_norm": 2.9843590259552, + "learning_rate": 4.816971660271579e-06, + "loss": 0.6108, + "step": 1643 + }, + { + "epoch": 0.7773049645390071, + "grad_norm": 2.843770742416382, + "learning_rate": 4.816737289181562e-06, + "loss": 0.6053, + "step": 1644 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 2.7608556747436523, + "learning_rate": 4.816502773838473e-06, + "loss": 0.5854, + "step": 1645 + }, + { + "epoch": 0.7782505910165485, + "grad_norm": 3.343682289123535, + "learning_rate": 4.816268114256914e-06, + "loss": 0.6329, + "step": 1646 + }, + { + "epoch": 0.7787234042553192, + "grad_norm": 2.769768476486206, + "learning_rate": 4.816033310451496e-06, + "loss": 0.6242, + "step": 1647 + }, + { + "epoch": 0.7791962174940898, + "grad_norm": 2.989851713180542, + "learning_rate": 4.815798362436838e-06, + "loss": 0.6493, + "step": 1648 + }, + { + "epoch": 0.7796690307328605, + "grad_norm": 3.170736312866211, + "learning_rate": 4.8155632702275716e-06, + "loss": 0.6341, + "step": 1649 + }, + { + "epoch": 0.7801418439716312, + "grad_norm": 2.7372522354125977, + "learning_rate": 4.815328033838334e-06, + "loss": 0.5445, + "step": 1650 + }, + { + "epoch": 0.7806146572104019, + "grad_norm": 2.6947238445281982, + "learning_rate": 4.8150926532837715e-06, + "loss": 0.6437, + "step": 1651 + }, + { + "epoch": 0.7810874704491726, + "grad_norm": 2.472323179244995, + "learning_rate": 4.81485712857854e-06, + "loss": 0.5751, + "step": 1652 + }, + { + "epoch": 0.7815602836879433, + "grad_norm": 2.791114091873169, + "learning_rate": 4.814621459737308e-06, + "loss": 0.5996, + "step": 1653 + }, + { + "epoch": 0.782033096926714, + "grad_norm": 3.1957521438598633, + "learning_rate": 4.814385646774745e-06, + "loss": 0.5803, + "step": 1654 + }, + { + "epoch": 0.7825059101654847, + "grad_norm": 2.4120798110961914, + "learning_rate": 4.8141496897055364e-06, + "loss": 0.5814, + "step": 1655 + }, + { + "epoch": 0.7829787234042553, + "grad_norm": 2.9262423515319824, + "learning_rate": 4.813913588544374e-06, + "loss": 0.6292, + "step": 1656 + }, + { + "epoch": 0.783451536643026, + "grad_norm": 2.8251047134399414, + "learning_rate": 4.813677343305959e-06, + "loss": 0.6787, + "step": 1657 + }, + { + "epoch": 0.7839243498817967, + "grad_norm": 2.931659698486328, + "learning_rate": 4.8134409540050005e-06, + "loss": 0.6163, + "step": 1658 + }, + { + "epoch": 0.7843971631205674, + "grad_norm": 2.7160706520080566, + "learning_rate": 4.813204420656219e-06, + "loss": 0.6831, + "step": 1659 + }, + { + "epoch": 0.7848699763593381, + "grad_norm": 3.2134454250335693, + "learning_rate": 4.81296774327434e-06, + "loss": 0.6002, + "step": 1660 + }, + { + "epoch": 0.7853427895981088, + "grad_norm": 2.4002513885498047, + "learning_rate": 4.812730921874103e-06, + "loss": 0.5488, + "step": 1661 + }, + { + "epoch": 0.7858156028368795, + "grad_norm": 2.5559282302856445, + "learning_rate": 4.812493956470251e-06, + "loss": 0.5802, + "step": 1662 + }, + { + "epoch": 0.7862884160756501, + "grad_norm": 2.57478404045105, + "learning_rate": 4.812256847077541e-06, + "loss": 0.646, + "step": 1663 + }, + { + "epoch": 0.7867612293144208, + "grad_norm": 2.811851978302002, + "learning_rate": 4.812019593710736e-06, + "loss": 0.6245, + "step": 1664 + }, + { + "epoch": 0.7872340425531915, + "grad_norm": 2.5228829383850098, + "learning_rate": 4.811782196384609e-06, + "loss": 0.5949, + "step": 1665 + }, + { + "epoch": 0.7877068557919622, + "grad_norm": 2.744096040725708, + "learning_rate": 4.8115446551139415e-06, + "loss": 0.6006, + "step": 1666 + }, + { + "epoch": 0.7881796690307329, + "grad_norm": 3.129242420196533, + "learning_rate": 4.811306969913524e-06, + "loss": 0.7251, + "step": 1667 + }, + { + "epoch": 0.7886524822695036, + "grad_norm": 2.7855660915374756, + "learning_rate": 4.811069140798156e-06, + "loss": 0.6534, + "step": 1668 + }, + { + "epoch": 0.7891252955082743, + "grad_norm": 2.836603879928589, + "learning_rate": 4.810831167782647e-06, + "loss": 0.6661, + "step": 1669 + }, + { + "epoch": 0.789598108747045, + "grad_norm": 2.5339887142181396, + "learning_rate": 4.810593050881813e-06, + "loss": 0.5354, + "step": 1670 + }, + { + "epoch": 0.7900709219858156, + "grad_norm": 2.9553709030151367, + "learning_rate": 4.810354790110482e-06, + "loss": 0.6001, + "step": 1671 + }, + { + "epoch": 0.7905437352245863, + "grad_norm": 2.6581788063049316, + "learning_rate": 4.8101163854834885e-06, + "loss": 0.6802, + "step": 1672 + }, + { + "epoch": 0.791016548463357, + "grad_norm": 3.2002551555633545, + "learning_rate": 4.809877837015677e-06, + "loss": 0.6641, + "step": 1673 + }, + { + "epoch": 0.7914893617021277, + "grad_norm": 2.918792963027954, + "learning_rate": 4.809639144721902e-06, + "loss": 0.6758, + "step": 1674 + }, + { + "epoch": 0.7919621749408984, + "grad_norm": 2.7993946075439453, + "learning_rate": 4.8094003086170245e-06, + "loss": 0.5889, + "step": 1675 + }, + { + "epoch": 0.7924349881796691, + "grad_norm": 2.3698952198028564, + "learning_rate": 4.809161328715916e-06, + "loss": 0.6244, + "step": 1676 + }, + { + "epoch": 0.7929078014184398, + "grad_norm": 2.8891594409942627, + "learning_rate": 4.808922205033458e-06, + "loss": 0.5835, + "step": 1677 + }, + { + "epoch": 0.7933806146572104, + "grad_norm": 2.838345766067505, + "learning_rate": 4.808682937584537e-06, + "loss": 0.6907, + "step": 1678 + }, + { + "epoch": 0.7938534278959811, + "grad_norm": 2.8443174362182617, + "learning_rate": 4.808443526384053e-06, + "loss": 0.6692, + "step": 1679 + }, + { + "epoch": 0.7943262411347518, + "grad_norm": 2.7355034351348877, + "learning_rate": 4.808203971446913e-06, + "loss": 0.5799, + "step": 1680 + }, + { + "epoch": 0.7947990543735225, + "grad_norm": 2.7108020782470703, + "learning_rate": 4.807964272788033e-06, + "loss": 0.652, + "step": 1681 + }, + { + "epoch": 0.7952718676122932, + "grad_norm": 2.397650957107544, + "learning_rate": 4.807724430422338e-06, + "loss": 0.5418, + "step": 1682 + }, + { + "epoch": 0.7957446808510639, + "grad_norm": 2.4981582164764404, + "learning_rate": 4.807484444364762e-06, + "loss": 0.5731, + "step": 1683 + }, + { + "epoch": 0.7962174940898346, + "grad_norm": 2.7943713665008545, + "learning_rate": 4.8072443146302475e-06, + "loss": 0.5913, + "step": 1684 + }, + { + "epoch": 0.7966903073286052, + "grad_norm": 2.5691423416137695, + "learning_rate": 4.807004041233746e-06, + "loss": 0.6475, + "step": 1685 + }, + { + "epoch": 0.7971631205673759, + "grad_norm": 3.2367498874664307, + "learning_rate": 4.8067636241902195e-06, + "loss": 0.675, + "step": 1686 + }, + { + "epoch": 0.7976359338061466, + "grad_norm": 3.000595808029175, + "learning_rate": 4.806523063514637e-06, + "loss": 0.5481, + "step": 1687 + }, + { + "epoch": 0.7981087470449173, + "grad_norm": 2.702014207839966, + "learning_rate": 4.806282359221976e-06, + "loss": 0.5993, + "step": 1688 + }, + { + "epoch": 0.798581560283688, + "grad_norm": 2.383671998977661, + "learning_rate": 4.806041511327226e-06, + "loss": 0.562, + "step": 1689 + }, + { + "epoch": 0.7990543735224587, + "grad_norm": 2.6965041160583496, + "learning_rate": 4.8058005198453834e-06, + "loss": 0.5955, + "step": 1690 + }, + { + "epoch": 0.7995271867612294, + "grad_norm": 2.5906765460968018, + "learning_rate": 4.805559384791453e-06, + "loss": 0.5151, + "step": 1691 + }, + { + "epoch": 0.8, + "grad_norm": 2.5454652309417725, + "learning_rate": 4.8053181061804475e-06, + "loss": 0.5843, + "step": 1692 + }, + { + "epoch": 0.8004728132387707, + "grad_norm": 2.661343812942505, + "learning_rate": 4.8050766840273935e-06, + "loss": 0.5995, + "step": 1693 + }, + { + "epoch": 0.8009456264775414, + "grad_norm": 2.5635924339294434, + "learning_rate": 4.8048351183473215e-06, + "loss": 0.5676, + "step": 1694 + }, + { + "epoch": 0.8014184397163121, + "grad_norm": 2.5936667919158936, + "learning_rate": 4.804593409155274e-06, + "loss": 0.6291, + "step": 1695 + }, + { + "epoch": 0.8018912529550828, + "grad_norm": 2.6902432441711426, + "learning_rate": 4.804351556466299e-06, + "loss": 0.6114, + "step": 1696 + }, + { + "epoch": 0.8023640661938535, + "grad_norm": 2.7764673233032227, + "learning_rate": 4.804109560295457e-06, + "loss": 0.5768, + "step": 1697 + }, + { + "epoch": 0.8028368794326242, + "grad_norm": 2.9587221145629883, + "learning_rate": 4.803867420657816e-06, + "loss": 0.6048, + "step": 1698 + }, + { + "epoch": 0.8033096926713948, + "grad_norm": 2.9238998889923096, + "learning_rate": 4.803625137568453e-06, + "loss": 0.6329, + "step": 1699 + }, + { + "epoch": 0.8037825059101655, + "grad_norm": 2.70473313331604, + "learning_rate": 4.803382711042455e-06, + "loss": 0.5427, + "step": 1700 + }, + { + "epoch": 0.8042553191489362, + "grad_norm": 3.1604979038238525, + "learning_rate": 4.803140141094914e-06, + "loss": 0.626, + "step": 1701 + }, + { + "epoch": 0.8047281323877069, + "grad_norm": 2.9567699432373047, + "learning_rate": 4.802897427740936e-06, + "loss": 0.5319, + "step": 1702 + }, + { + "epoch": 0.8052009456264776, + "grad_norm": 2.90983247756958, + "learning_rate": 4.802654570995632e-06, + "loss": 0.586, + "step": 1703 + }, + { + "epoch": 0.8056737588652483, + "grad_norm": 2.783480167388916, + "learning_rate": 4.8024115708741255e-06, + "loss": 0.5773, + "step": 1704 + }, + { + "epoch": 0.806146572104019, + "grad_norm": 3.3307793140411377, + "learning_rate": 4.802168427391547e-06, + "loss": 0.6257, + "step": 1705 + }, + { + "epoch": 0.8066193853427897, + "grad_norm": 3.0475001335144043, + "learning_rate": 4.801925140563034e-06, + "loss": 0.6612, + "step": 1706 + }, + { + "epoch": 0.8070921985815603, + "grad_norm": 2.8278894424438477, + "learning_rate": 4.8016817104037375e-06, + "loss": 0.6449, + "step": 1707 + }, + { + "epoch": 0.807565011820331, + "grad_norm": 2.760244369506836, + "learning_rate": 4.801438136928812e-06, + "loss": 0.7007, + "step": 1708 + }, + { + "epoch": 0.8080378250591016, + "grad_norm": 2.827634572982788, + "learning_rate": 4.801194420153427e-06, + "loss": 0.6418, + "step": 1709 + }, + { + "epoch": 0.8085106382978723, + "grad_norm": 2.8655009269714355, + "learning_rate": 4.800950560092754e-06, + "loss": 0.6231, + "step": 1710 + }, + { + "epoch": 0.808983451536643, + "grad_norm": 2.738112688064575, + "learning_rate": 4.800706556761981e-06, + "loss": 0.6463, + "step": 1711 + }, + { + "epoch": 0.8094562647754137, + "grad_norm": 2.4781179428100586, + "learning_rate": 4.800462410176296e-06, + "loss": 0.5365, + "step": 1712 + }, + { + "epoch": 0.8099290780141843, + "grad_norm": 2.6049838066101074, + "learning_rate": 4.800218120350906e-06, + "loss": 0.6035, + "step": 1713 + }, + { + "epoch": 0.810401891252955, + "grad_norm": 2.9089980125427246, + "learning_rate": 4.79997368730102e-06, + "loss": 0.5828, + "step": 1714 + }, + { + "epoch": 0.8108747044917257, + "grad_norm": 2.831871747970581, + "learning_rate": 4.799729111041857e-06, + "loss": 0.5953, + "step": 1715 + }, + { + "epoch": 0.8113475177304964, + "grad_norm": 2.5611300468444824, + "learning_rate": 4.799484391588647e-06, + "loss": 0.6302, + "step": 1716 + }, + { + "epoch": 0.8118203309692671, + "grad_norm": 2.744070053100586, + "learning_rate": 4.799239528956625e-06, + "loss": 0.5561, + "step": 1717 + }, + { + "epoch": 0.8122931442080378, + "grad_norm": 2.7344231605529785, + "learning_rate": 4.798994523161041e-06, + "loss": 0.6317, + "step": 1718 + }, + { + "epoch": 0.8127659574468085, + "grad_norm": 2.3420889377593994, + "learning_rate": 4.798749374217149e-06, + "loss": 0.5415, + "step": 1719 + }, + { + "epoch": 0.8132387706855791, + "grad_norm": 2.57384991645813, + "learning_rate": 4.798504082140212e-06, + "loss": 0.6383, + "step": 1720 + }, + { + "epoch": 0.8137115839243498, + "grad_norm": 2.8819844722747803, + "learning_rate": 4.798258646945505e-06, + "loss": 0.6355, + "step": 1721 + }, + { + "epoch": 0.8141843971631205, + "grad_norm": 2.908123254776001, + "learning_rate": 4.79801306864831e-06, + "loss": 0.701, + "step": 1722 + }, + { + "epoch": 0.8146572104018912, + "grad_norm": 2.6500701904296875, + "learning_rate": 4.797767347263917e-06, + "loss": 0.6152, + "step": 1723 + }, + { + "epoch": 0.8151300236406619, + "grad_norm": 2.5513017177581787, + "learning_rate": 4.797521482807628e-06, + "loss": 0.6241, + "step": 1724 + }, + { + "epoch": 0.8156028368794326, + "grad_norm": 2.6239185333251953, + "learning_rate": 4.7972754752947495e-06, + "loss": 0.6072, + "step": 1725 + }, + { + "epoch": 0.8160756501182033, + "grad_norm": 2.673436403274536, + "learning_rate": 4.797029324740601e-06, + "loss": 0.5802, + "step": 1726 + }, + { + "epoch": 0.816548463356974, + "grad_norm": 2.533831834793091, + "learning_rate": 4.796783031160508e-06, + "loss": 0.5566, + "step": 1727 + }, + { + "epoch": 0.8170212765957446, + "grad_norm": 2.9806582927703857, + "learning_rate": 4.796536594569807e-06, + "loss": 0.6945, + "step": 1728 + }, + { + "epoch": 0.8174940898345153, + "grad_norm": 2.7093560695648193, + "learning_rate": 4.796290014983842e-06, + "loss": 0.7143, + "step": 1729 + }, + { + "epoch": 0.817966903073286, + "grad_norm": 2.814507246017456, + "learning_rate": 4.796043292417967e-06, + "loss": 0.6122, + "step": 1730 + }, + { + "epoch": 0.8184397163120567, + "grad_norm": 2.537156820297241, + "learning_rate": 4.795796426887543e-06, + "loss": 0.6229, + "step": 1731 + }, + { + "epoch": 0.8189125295508274, + "grad_norm": 2.4878013134002686, + "learning_rate": 4.795549418407944e-06, + "loss": 0.5442, + "step": 1732 + }, + { + "epoch": 0.8193853427895981, + "grad_norm": 2.839383363723755, + "learning_rate": 4.795302266994548e-06, + "loss": 0.6717, + "step": 1733 + }, + { + "epoch": 0.8198581560283688, + "grad_norm": 3.1981801986694336, + "learning_rate": 4.795054972662744e-06, + "loss": 0.6596, + "step": 1734 + }, + { + "epoch": 0.8203309692671394, + "grad_norm": 2.781730890274048, + "learning_rate": 4.79480753542793e-06, + "loss": 0.5845, + "step": 1735 + }, + { + "epoch": 0.8208037825059101, + "grad_norm": 2.689948558807373, + "learning_rate": 4.794559955305513e-06, + "loss": 0.5928, + "step": 1736 + }, + { + "epoch": 0.8212765957446808, + "grad_norm": 2.7267637252807617, + "learning_rate": 4.7943122323109105e-06, + "loss": 0.5224, + "step": 1737 + }, + { + "epoch": 0.8217494089834515, + "grad_norm": 2.4346601963043213, + "learning_rate": 4.794064366459544e-06, + "loss": 0.6431, + "step": 1738 + }, + { + "epoch": 0.8222222222222222, + "grad_norm": 2.7440176010131836, + "learning_rate": 4.793816357766849e-06, + "loss": 0.6083, + "step": 1739 + }, + { + "epoch": 0.8226950354609929, + "grad_norm": 2.6558027267456055, + "learning_rate": 4.793568206248268e-06, + "loss": 0.698, + "step": 1740 + }, + { + "epoch": 0.8231678486997636, + "grad_norm": 2.591658353805542, + "learning_rate": 4.793319911919251e-06, + "loss": 0.6601, + "step": 1741 + }, + { + "epoch": 0.8236406619385342, + "grad_norm": 2.5431172847747803, + "learning_rate": 4.79307147479526e-06, + "loss": 0.5917, + "step": 1742 + }, + { + "epoch": 0.8241134751773049, + "grad_norm": 2.7335588932037354, + "learning_rate": 4.792822894891762e-06, + "loss": 0.5925, + "step": 1743 + }, + { + "epoch": 0.8245862884160756, + "grad_norm": 2.2500839233398438, + "learning_rate": 4.792574172224237e-06, + "loss": 0.4984, + "step": 1744 + }, + { + "epoch": 0.8250591016548463, + "grad_norm": 2.691343069076538, + "learning_rate": 4.79232530680817e-06, + "loss": 0.6262, + "step": 1745 + }, + { + "epoch": 0.825531914893617, + "grad_norm": 2.612204074859619, + "learning_rate": 4.792076298659058e-06, + "loss": 0.5822, + "step": 1746 + }, + { + "epoch": 0.8260047281323877, + "grad_norm": 3.0163519382476807, + "learning_rate": 4.791827147792406e-06, + "loss": 0.6263, + "step": 1747 + }, + { + "epoch": 0.8264775413711584, + "grad_norm": 2.742183208465576, + "learning_rate": 4.791577854223727e-06, + "loss": 0.6628, + "step": 1748 + }, + { + "epoch": 0.826950354609929, + "grad_norm": 2.872213840484619, + "learning_rate": 4.791328417968542e-06, + "loss": 0.6332, + "step": 1749 + }, + { + "epoch": 0.8274231678486997, + "grad_norm": 2.725006580352783, + "learning_rate": 4.7910788390423844e-06, + "loss": 0.6266, + "step": 1750 + }, + { + "epoch": 0.8278959810874704, + "grad_norm": 3.0366697311401367, + "learning_rate": 4.790829117460793e-06, + "loss": 0.6403, + "step": 1751 + }, + { + "epoch": 0.8283687943262411, + "grad_norm": 2.594881772994995, + "learning_rate": 4.790579253239318e-06, + "loss": 0.521, + "step": 1752 + }, + { + "epoch": 0.8288416075650118, + "grad_norm": 2.4496347904205322, + "learning_rate": 4.790329246393517e-06, + "loss": 0.54, + "step": 1753 + }, + { + "epoch": 0.8293144208037825, + "grad_norm": 3.102278470993042, + "learning_rate": 4.790079096938956e-06, + "loss": 0.6142, + "step": 1754 + }, + { + "epoch": 0.8297872340425532, + "grad_norm": 2.4645912647247314, + "learning_rate": 4.789828804891212e-06, + "loss": 0.5212, + "step": 1755 + }, + { + "epoch": 0.8302600472813239, + "grad_norm": 2.7482516765594482, + "learning_rate": 4.789578370265868e-06, + "loss": 0.6712, + "step": 1756 + }, + { + "epoch": 0.8307328605200945, + "grad_norm": 2.61360502243042, + "learning_rate": 4.7893277930785195e-06, + "loss": 0.6367, + "step": 1757 + }, + { + "epoch": 0.8312056737588652, + "grad_norm": 2.79028058052063, + "learning_rate": 4.789077073344767e-06, + "loss": 0.5099, + "step": 1758 + }, + { + "epoch": 0.8316784869976359, + "grad_norm": 2.647662401199341, + "learning_rate": 4.788826211080222e-06, + "loss": 0.6698, + "step": 1759 + }, + { + "epoch": 0.8321513002364066, + "grad_norm": 3.0214831829071045, + "learning_rate": 4.7885752063005055e-06, + "loss": 0.6121, + "step": 1760 + }, + { + "epoch": 0.8326241134751773, + "grad_norm": 2.8244032859802246, + "learning_rate": 4.788324059021247e-06, + "loss": 0.6921, + "step": 1761 + }, + { + "epoch": 0.833096926713948, + "grad_norm": 3.1501076221466064, + "learning_rate": 4.788072769258082e-06, + "loss": 0.6872, + "step": 1762 + }, + { + "epoch": 0.8335697399527187, + "grad_norm": 2.6989903450012207, + "learning_rate": 4.7878213370266594e-06, + "loss": 0.5884, + "step": 1763 + }, + { + "epoch": 0.8340425531914893, + "grad_norm": 2.6982665061950684, + "learning_rate": 4.787569762342633e-06, + "loss": 0.6112, + "step": 1764 + }, + { + "epoch": 0.83451536643026, + "grad_norm": 2.6918323040008545, + "learning_rate": 4.7873180452216685e-06, + "loss": 0.5315, + "step": 1765 + }, + { + "epoch": 0.8349881796690307, + "grad_norm": 2.5494401454925537, + "learning_rate": 4.78706618567944e-06, + "loss": 0.5909, + "step": 1766 + }, + { + "epoch": 0.8354609929078014, + "grad_norm": 2.7532095909118652, + "learning_rate": 4.786814183731627e-06, + "loss": 0.5566, + "step": 1767 + }, + { + "epoch": 0.8359338061465721, + "grad_norm": 2.550865888595581, + "learning_rate": 4.786562039393923e-06, + "loss": 0.555, + "step": 1768 + }, + { + "epoch": 0.8364066193853428, + "grad_norm": 2.4477791786193848, + "learning_rate": 4.786309752682028e-06, + "loss": 0.5844, + "step": 1769 + }, + { + "epoch": 0.8368794326241135, + "grad_norm": 2.6982262134552, + "learning_rate": 4.7860573236116485e-06, + "loss": 0.6136, + "step": 1770 + }, + { + "epoch": 0.8373522458628841, + "grad_norm": 2.456263542175293, + "learning_rate": 4.785804752198503e-06, + "loss": 0.5055, + "step": 1771 + }, + { + "epoch": 0.8378250591016548, + "grad_norm": 2.428544521331787, + "learning_rate": 4.78555203845832e-06, + "loss": 0.5859, + "step": 1772 + }, + { + "epoch": 0.8382978723404255, + "grad_norm": 2.1782307624816895, + "learning_rate": 4.785299182406833e-06, + "loss": 0.5325, + "step": 1773 + }, + { + "epoch": 0.8387706855791962, + "grad_norm": 3.137956142425537, + "learning_rate": 4.785046184059786e-06, + "loss": 0.6097, + "step": 1774 + }, + { + "epoch": 0.8392434988179669, + "grad_norm": 2.6269001960754395, + "learning_rate": 4.7847930434329336e-06, + "loss": 0.5972, + "step": 1775 + }, + { + "epoch": 0.8397163120567376, + "grad_norm": 2.732659339904785, + "learning_rate": 4.784539760542037e-06, + "loss": 0.6054, + "step": 1776 + }, + { + "epoch": 0.8401891252955083, + "grad_norm": 2.5346736907958984, + "learning_rate": 4.784286335402866e-06, + "loss": 0.5521, + "step": 1777 + }, + { + "epoch": 0.840661938534279, + "grad_norm": 3.1420228481292725, + "learning_rate": 4.784032768031202e-06, + "loss": 0.6165, + "step": 1778 + }, + { + "epoch": 0.8411347517730496, + "grad_norm": 3.073793411254883, + "learning_rate": 4.783779058442831e-06, + "loss": 0.6414, + "step": 1779 + }, + { + "epoch": 0.8416075650118203, + "grad_norm": 2.6621336936950684, + "learning_rate": 4.783525206653554e-06, + "loss": 0.5836, + "step": 1780 + }, + { + "epoch": 0.842080378250591, + "grad_norm": 2.7029049396514893, + "learning_rate": 4.7832712126791745e-06, + "loss": 0.5897, + "step": 1781 + }, + { + "epoch": 0.8425531914893617, + "grad_norm": 2.4733822345733643, + "learning_rate": 4.783017076535509e-06, + "loss": 0.5913, + "step": 1782 + }, + { + "epoch": 0.8430260047281324, + "grad_norm": 2.8119473457336426, + "learning_rate": 4.782762798238381e-06, + "loss": 0.6105, + "step": 1783 + }, + { + "epoch": 0.8434988179669031, + "grad_norm": 2.5290818214416504, + "learning_rate": 4.782508377803622e-06, + "loss": 0.6119, + "step": 1784 + }, + { + "epoch": 0.8439716312056738, + "grad_norm": 3.193472385406494, + "learning_rate": 4.782253815247076e-06, + "loss": 0.6665, + "step": 1785 + }, + { + "epoch": 0.8444444444444444, + "grad_norm": 3.206759452819824, + "learning_rate": 4.781999110584592e-06, + "loss": 0.6012, + "step": 1786 + }, + { + "epoch": 0.8449172576832151, + "grad_norm": 2.6227457523345947, + "learning_rate": 4.781744263832029e-06, + "loss": 0.5845, + "step": 1787 + }, + { + "epoch": 0.8453900709219858, + "grad_norm": 2.838365316390991, + "learning_rate": 4.781489275005257e-06, + "loss": 0.5695, + "step": 1788 + }, + { + "epoch": 0.8458628841607565, + "grad_norm": 2.8348326683044434, + "learning_rate": 4.78123414412015e-06, + "loss": 0.6136, + "step": 1789 + }, + { + "epoch": 0.8463356973995272, + "grad_norm": 2.5698344707489014, + "learning_rate": 4.780978871192597e-06, + "loss": 0.6576, + "step": 1790 + }, + { + "epoch": 0.8468085106382979, + "grad_norm": 2.5198330879211426, + "learning_rate": 4.780723456238492e-06, + "loss": 0.5521, + "step": 1791 + }, + { + "epoch": 0.8472813238770686, + "grad_norm": 3.001325845718384, + "learning_rate": 4.780467899273737e-06, + "loss": 0.6075, + "step": 1792 + }, + { + "epoch": 0.8477541371158392, + "grad_norm": 2.7732746601104736, + "learning_rate": 4.780212200314247e-06, + "loss": 0.6245, + "step": 1793 + }, + { + "epoch": 0.8482269503546099, + "grad_norm": 2.6950337886810303, + "learning_rate": 4.77995635937594e-06, + "loss": 0.5723, + "step": 1794 + }, + { + "epoch": 0.8486997635933806, + "grad_norm": 2.82051420211792, + "learning_rate": 4.779700376474749e-06, + "loss": 0.6184, + "step": 1795 + }, + { + "epoch": 0.8491725768321513, + "grad_norm": 2.757791757583618, + "learning_rate": 4.779444251626611e-06, + "loss": 0.608, + "step": 1796 + }, + { + "epoch": 0.849645390070922, + "grad_norm": 2.394108533859253, + "learning_rate": 4.779187984847475e-06, + "loss": 0.6174, + "step": 1797 + }, + { + "epoch": 0.8501182033096927, + "grad_norm": 2.427562713623047, + "learning_rate": 4.778931576153296e-06, + "loss": 0.5618, + "step": 1798 + }, + { + "epoch": 0.8505910165484634, + "grad_norm": 2.891268491744995, + "learning_rate": 4.778675025560042e-06, + "loss": 0.6865, + "step": 1799 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 2.665534257888794, + "learning_rate": 4.778418333083685e-06, + "loss": 0.5852, + "step": 1800 + }, + { + "epoch": 0.8515366430260047, + "grad_norm": 2.5492889881134033, + "learning_rate": 4.7781614987402095e-06, + "loss": 0.5161, + "step": 1801 + }, + { + "epoch": 0.8520094562647754, + "grad_norm": 2.400177001953125, + "learning_rate": 4.777904522545607e-06, + "loss": 0.5128, + "step": 1802 + }, + { + "epoch": 0.8524822695035461, + "grad_norm": 2.3949809074401855, + "learning_rate": 4.777647404515878e-06, + "loss": 0.571, + "step": 1803 + }, + { + "epoch": 0.8529550827423168, + "grad_norm": 2.3624472618103027, + "learning_rate": 4.7773901446670325e-06, + "loss": 0.5486, + "step": 1804 + }, + { + "epoch": 0.8534278959810875, + "grad_norm": 2.711366891860962, + "learning_rate": 4.7771327430150885e-06, + "loss": 0.5667, + "step": 1805 + }, + { + "epoch": 0.8539007092198582, + "grad_norm": 2.7681493759155273, + "learning_rate": 4.776875199576073e-06, + "loss": 0.5686, + "step": 1806 + }, + { + "epoch": 0.8543735224586289, + "grad_norm": 3.0369436740875244, + "learning_rate": 4.776617514366023e-06, + "loss": 0.6635, + "step": 1807 + }, + { + "epoch": 0.8548463356973995, + "grad_norm": 2.919649600982666, + "learning_rate": 4.776359687400983e-06, + "loss": 0.5749, + "step": 1808 + }, + { + "epoch": 0.8553191489361702, + "grad_norm": 2.7986185550689697, + "learning_rate": 4.776101718697007e-06, + "loss": 0.559, + "step": 1809 + }, + { + "epoch": 0.8557919621749409, + "grad_norm": 2.5951223373413086, + "learning_rate": 4.775843608270158e-06, + "loss": 0.5654, + "step": 1810 + }, + { + "epoch": 0.8562647754137116, + "grad_norm": 2.674138069152832, + "learning_rate": 4.775585356136505e-06, + "loss": 0.5286, + "step": 1811 + }, + { + "epoch": 0.8567375886524823, + "grad_norm": 3.045437812805176, + "learning_rate": 4.775326962312131e-06, + "loss": 0.6185, + "step": 1812 + }, + { + "epoch": 0.857210401891253, + "grad_norm": 2.6145293712615967, + "learning_rate": 4.775068426813124e-06, + "loss": 0.6075, + "step": 1813 + }, + { + "epoch": 0.8576832151300237, + "grad_norm": 2.6320106983184814, + "learning_rate": 4.7748097496555824e-06, + "loss": 0.561, + "step": 1814 + }, + { + "epoch": 0.8581560283687943, + "grad_norm": 2.5038623809814453, + "learning_rate": 4.774550930855612e-06, + "loss": 0.593, + "step": 1815 + }, + { + "epoch": 0.858628841607565, + "grad_norm": 2.8168089389801025, + "learning_rate": 4.774291970429329e-06, + "loss": 0.5196, + "step": 1816 + }, + { + "epoch": 0.8591016548463357, + "grad_norm": 2.778130292892456, + "learning_rate": 4.774032868392858e-06, + "loss": 0.5984, + "step": 1817 + }, + { + "epoch": 0.8595744680851064, + "grad_norm": 2.536458730697632, + "learning_rate": 4.7737736247623305e-06, + "loss": 0.568, + "step": 1818 + }, + { + "epoch": 0.8600472813238771, + "grad_norm": 2.6669719219207764, + "learning_rate": 4.77351423955389e-06, + "loss": 0.6233, + "step": 1819 + }, + { + "epoch": 0.8605200945626478, + "grad_norm": 2.578242540359497, + "learning_rate": 4.773254712783687e-06, + "loss": 0.579, + "step": 1820 + }, + { + "epoch": 0.8609929078014185, + "grad_norm": 2.816664457321167, + "learning_rate": 4.772995044467881e-06, + "loss": 0.6635, + "step": 1821 + }, + { + "epoch": 0.8614657210401891, + "grad_norm": 3.1111979484558105, + "learning_rate": 4.77273523462264e-06, + "loss": 0.6372, + "step": 1822 + }, + { + "epoch": 0.8619385342789598, + "grad_norm": 2.764552354812622, + "learning_rate": 4.772475283264142e-06, + "loss": 0.6216, + "step": 1823 + }, + { + "epoch": 0.8624113475177305, + "grad_norm": 2.9126830101013184, + "learning_rate": 4.772215190408572e-06, + "loss": 0.6396, + "step": 1824 + }, + { + "epoch": 0.8628841607565012, + "grad_norm": 2.7502307891845703, + "learning_rate": 4.7719549560721264e-06, + "loss": 0.6186, + "step": 1825 + }, + { + "epoch": 0.8633569739952719, + "grad_norm": 2.6279006004333496, + "learning_rate": 4.771694580271007e-06, + "loss": 0.5557, + "step": 1826 + }, + { + "epoch": 0.8638297872340426, + "grad_norm": 2.996563196182251, + "learning_rate": 4.7714340630214276e-06, + "loss": 0.6259, + "step": 1827 + }, + { + "epoch": 0.8643026004728133, + "grad_norm": 3.231323480606079, + "learning_rate": 4.771173404339609e-06, + "loss": 0.5473, + "step": 1828 + }, + { + "epoch": 0.864775413711584, + "grad_norm": 3.143519878387451, + "learning_rate": 4.770912604241781e-06, + "loss": 0.593, + "step": 1829 + }, + { + "epoch": 0.8652482269503546, + "grad_norm": 2.515484094619751, + "learning_rate": 4.770651662744184e-06, + "loss": 0.538, + "step": 1830 + }, + { + "epoch": 0.8657210401891253, + "grad_norm": 2.629058837890625, + "learning_rate": 4.770390579863064e-06, + "loss": 0.5745, + "step": 1831 + }, + { + "epoch": 0.866193853427896, + "grad_norm": 2.5826802253723145, + "learning_rate": 4.770129355614677e-06, + "loss": 0.6397, + "step": 1832 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 2.954623222351074, + "learning_rate": 4.769867990015289e-06, + "loss": 0.6106, + "step": 1833 + }, + { + "epoch": 0.8671394799054374, + "grad_norm": 2.742192268371582, + "learning_rate": 4.769606483081175e-06, + "loss": 0.6902, + "step": 1834 + }, + { + "epoch": 0.8676122931442081, + "grad_norm": 2.2619097232818604, + "learning_rate": 4.769344834828618e-06, + "loss": 0.5414, + "step": 1835 + }, + { + "epoch": 0.8680851063829788, + "grad_norm": 2.7384188175201416, + "learning_rate": 4.769083045273908e-06, + "loss": 0.5787, + "step": 1836 + }, + { + "epoch": 0.8685579196217494, + "grad_norm": 2.6734485626220703, + "learning_rate": 4.768821114433346e-06, + "loss": 0.5923, + "step": 1837 + }, + { + "epoch": 0.8690307328605201, + "grad_norm": 2.286140203475952, + "learning_rate": 4.768559042323243e-06, + "loss": 0.5822, + "step": 1838 + }, + { + "epoch": 0.8695035460992908, + "grad_norm": 3.0243725776672363, + "learning_rate": 4.768296828959915e-06, + "loss": 0.6623, + "step": 1839 + }, + { + "epoch": 0.8699763593380615, + "grad_norm": 2.4026312828063965, + "learning_rate": 4.768034474359689e-06, + "loss": 0.5554, + "step": 1840 + }, + { + "epoch": 0.8704491725768322, + "grad_norm": 2.7469029426574707, + "learning_rate": 4.767771978538903e-06, + "loss": 0.6316, + "step": 1841 + }, + { + "epoch": 0.8709219858156029, + "grad_norm": 2.729659080505371, + "learning_rate": 4.767509341513899e-06, + "loss": 0.5807, + "step": 1842 + }, + { + "epoch": 0.8713947990543736, + "grad_norm": 2.5336945056915283, + "learning_rate": 4.76724656330103e-06, + "loss": 0.6109, + "step": 1843 + }, + { + "epoch": 0.8718676122931442, + "grad_norm": 2.519880533218384, + "learning_rate": 4.76698364391666e-06, + "loss": 0.5313, + "step": 1844 + }, + { + "epoch": 0.8723404255319149, + "grad_norm": 2.698862075805664, + "learning_rate": 4.766720583377159e-06, + "loss": 0.5953, + "step": 1845 + }, + { + "epoch": 0.8728132387706856, + "grad_norm": 3.0195560455322266, + "learning_rate": 4.766457381698907e-06, + "loss": 0.5965, + "step": 1846 + }, + { + "epoch": 0.8732860520094563, + "grad_norm": 2.5972697734832764, + "learning_rate": 4.766194038898291e-06, + "loss": 0.6014, + "step": 1847 + }, + { + "epoch": 0.873758865248227, + "grad_norm": 2.7132294178009033, + "learning_rate": 4.76593055499171e-06, + "loss": 0.5638, + "step": 1848 + }, + { + "epoch": 0.8742316784869977, + "grad_norm": 2.7134575843811035, + "learning_rate": 4.765666929995568e-06, + "loss": 0.52, + "step": 1849 + }, + { + "epoch": 0.8747044917257684, + "grad_norm": 2.3804993629455566, + "learning_rate": 4.765403163926282e-06, + "loss": 0.5435, + "step": 1850 + }, + { + "epoch": 0.875177304964539, + "grad_norm": 2.8782761096954346, + "learning_rate": 4.765139256800274e-06, + "loss": 0.5843, + "step": 1851 + }, + { + "epoch": 0.8756501182033097, + "grad_norm": 2.836209774017334, + "learning_rate": 4.764875208633977e-06, + "loss": 0.6667, + "step": 1852 + }, + { + "epoch": 0.8761229314420804, + "grad_norm": 2.608851194381714, + "learning_rate": 4.764611019443831e-06, + "loss": 0.5436, + "step": 1853 + }, + { + "epoch": 0.8765957446808511, + "grad_norm": 2.788738965988159, + "learning_rate": 4.764346689246288e-06, + "loss": 0.7331, + "step": 1854 + }, + { + "epoch": 0.8770685579196218, + "grad_norm": 2.524277687072754, + "learning_rate": 4.764082218057805e-06, + "loss": 0.5067, + "step": 1855 + }, + { + "epoch": 0.8775413711583925, + "grad_norm": 3.7559316158294678, + "learning_rate": 4.763817605894851e-06, + "loss": 0.6809, + "step": 1856 + }, + { + "epoch": 0.8780141843971632, + "grad_norm": 2.9070613384246826, + "learning_rate": 4.763552852773899e-06, + "loss": 0.5913, + "step": 1857 + }, + { + "epoch": 0.8784869976359339, + "grad_norm": 2.7050609588623047, + "learning_rate": 4.7632879587114386e-06, + "loss": 0.6074, + "step": 1858 + }, + { + "epoch": 0.8789598108747045, + "grad_norm": 2.891134262084961, + "learning_rate": 4.76302292372396e-06, + "loss": 0.5939, + "step": 1859 + }, + { + "epoch": 0.8794326241134752, + "grad_norm": 2.8581702709198, + "learning_rate": 4.762757747827968e-06, + "loss": 0.5972, + "step": 1860 + }, + { + "epoch": 0.8799054373522459, + "grad_norm": 2.8266196250915527, + "learning_rate": 4.762492431039971e-06, + "loss": 0.5993, + "step": 1861 + }, + { + "epoch": 0.8803782505910166, + "grad_norm": 2.4853954315185547, + "learning_rate": 4.762226973376493e-06, + "loss": 0.6388, + "step": 1862 + }, + { + "epoch": 0.8808510638297873, + "grad_norm": 3.2212886810302734, + "learning_rate": 4.761961374854059e-06, + "loss": 0.6698, + "step": 1863 + }, + { + "epoch": 0.881323877068558, + "grad_norm": 3.1254501342773438, + "learning_rate": 4.761695635489211e-06, + "loss": 0.5263, + "step": 1864 + }, + { + "epoch": 0.8817966903073287, + "grad_norm": 2.6891462802886963, + "learning_rate": 4.761429755298491e-06, + "loss": 0.5359, + "step": 1865 + }, + { + "epoch": 0.8822695035460993, + "grad_norm": 2.8557538986206055, + "learning_rate": 4.761163734298457e-06, + "loss": 0.5933, + "step": 1866 + }, + { + "epoch": 0.88274231678487, + "grad_norm": 2.53548264503479, + "learning_rate": 4.7608975725056724e-06, + "loss": 0.6397, + "step": 1867 + }, + { + "epoch": 0.8832151300236407, + "grad_norm": 3.0237956047058105, + "learning_rate": 4.76063126993671e-06, + "loss": 0.6845, + "step": 1868 + }, + { + "epoch": 0.8836879432624114, + "grad_norm": 3.222886800765991, + "learning_rate": 4.76036482660815e-06, + "loss": 0.6055, + "step": 1869 + }, + { + "epoch": 0.8841607565011821, + "grad_norm": 3.1867551803588867, + "learning_rate": 4.760098242536584e-06, + "loss": 0.6592, + "step": 1870 + }, + { + "epoch": 0.8846335697399527, + "grad_norm": 2.782209873199463, + "learning_rate": 4.7598315177386115e-06, + "loss": 0.5833, + "step": 1871 + }, + { + "epoch": 0.8851063829787233, + "grad_norm": 2.899871587753296, + "learning_rate": 4.759564652230838e-06, + "loss": 0.6129, + "step": 1872 + }, + { + "epoch": 0.885579196217494, + "grad_norm": 2.5690579414367676, + "learning_rate": 4.759297646029882e-06, + "loss": 0.5827, + "step": 1873 + }, + { + "epoch": 0.8860520094562647, + "grad_norm": 2.666130304336548, + "learning_rate": 4.759030499152368e-06, + "loss": 0.5272, + "step": 1874 + }, + { + "epoch": 0.8865248226950354, + "grad_norm": 2.7030911445617676, + "learning_rate": 4.758763211614932e-06, + "loss": 0.6415, + "step": 1875 + }, + { + "epoch": 0.8869976359338061, + "grad_norm": 2.717512845993042, + "learning_rate": 4.7584957834342135e-06, + "loss": 0.5827, + "step": 1876 + }, + { + "epoch": 0.8874704491725768, + "grad_norm": 2.665823459625244, + "learning_rate": 4.758228214626867e-06, + "loss": 0.6209, + "step": 1877 + }, + { + "epoch": 0.8879432624113475, + "grad_norm": 2.636653184890747, + "learning_rate": 4.75796050520955e-06, + "loss": 0.6413, + "step": 1878 + }, + { + "epoch": 0.8884160756501182, + "grad_norm": 2.585115671157837, + "learning_rate": 4.7576926551989345e-06, + "loss": 0.5518, + "step": 1879 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 2.808526039123535, + "learning_rate": 4.757424664611697e-06, + "loss": 0.5717, + "step": 1880 + }, + { + "epoch": 0.8893617021276595, + "grad_norm": 3.5957939624786377, + "learning_rate": 4.757156533464524e-06, + "loss": 0.6323, + "step": 1881 + }, + { + "epoch": 0.8898345153664302, + "grad_norm": 2.5003883838653564, + "learning_rate": 4.756888261774111e-06, + "loss": 0.5937, + "step": 1882 + }, + { + "epoch": 0.8903073286052009, + "grad_norm": 2.749061346054077, + "learning_rate": 4.756619849557161e-06, + "loss": 0.6642, + "step": 1883 + }, + { + "epoch": 0.8907801418439716, + "grad_norm": 2.6757891178131104, + "learning_rate": 4.756351296830389e-06, + "loss": 0.5887, + "step": 1884 + }, + { + "epoch": 0.8912529550827423, + "grad_norm": 2.811925172805786, + "learning_rate": 4.756082603610516e-06, + "loss": 0.6571, + "step": 1885 + }, + { + "epoch": 0.891725768321513, + "grad_norm": 2.5054616928100586, + "learning_rate": 4.755813769914271e-06, + "loss": 0.6312, + "step": 1886 + }, + { + "epoch": 0.8921985815602836, + "grad_norm": 2.7518467903137207, + "learning_rate": 4.755544795758395e-06, + "loss": 0.6685, + "step": 1887 + }, + { + "epoch": 0.8926713947990543, + "grad_norm": 2.7527287006378174, + "learning_rate": 4.755275681159634e-06, + "loss": 0.5886, + "step": 1888 + }, + { + "epoch": 0.893144208037825, + "grad_norm": 2.6162452697753906, + "learning_rate": 4.755006426134745e-06, + "loss": 0.546, + "step": 1889 + }, + { + "epoch": 0.8936170212765957, + "grad_norm": 2.4016737937927246, + "learning_rate": 4.754737030700495e-06, + "loss": 0.5726, + "step": 1890 + }, + { + "epoch": 0.8940898345153664, + "grad_norm": 2.528327703475952, + "learning_rate": 4.754467494873656e-06, + "loss": 0.5682, + "step": 1891 + }, + { + "epoch": 0.8945626477541371, + "grad_norm": 2.3139286041259766, + "learning_rate": 4.7541978186710115e-06, + "loss": 0.6108, + "step": 1892 + }, + { + "epoch": 0.8950354609929078, + "grad_norm": 2.7269136905670166, + "learning_rate": 4.753928002109354e-06, + "loss": 0.5875, + "step": 1893 + }, + { + "epoch": 0.8955082742316784, + "grad_norm": 4.425495147705078, + "learning_rate": 4.753658045205482e-06, + "loss": 0.5572, + "step": 1894 + }, + { + "epoch": 0.8959810874704491, + "grad_norm": 2.535409927368164, + "learning_rate": 4.753387947976206e-06, + "loss": 0.5868, + "step": 1895 + }, + { + "epoch": 0.8964539007092198, + "grad_norm": 2.722458600997925, + "learning_rate": 4.753117710438343e-06, + "loss": 0.5935, + "step": 1896 + }, + { + "epoch": 0.8969267139479905, + "grad_norm": 2.743861436843872, + "learning_rate": 4.75284733260872e-06, + "loss": 0.572, + "step": 1897 + }, + { + "epoch": 0.8973995271867612, + "grad_norm": 2.60640549659729, + "learning_rate": 4.752576814504173e-06, + "loss": 0.567, + "step": 1898 + }, + { + "epoch": 0.8978723404255319, + "grad_norm": 2.7486042976379395, + "learning_rate": 4.7523061561415435e-06, + "loss": 0.5768, + "step": 1899 + }, + { + "epoch": 0.8983451536643026, + "grad_norm": 3.8410251140594482, + "learning_rate": 4.752035357537686e-06, + "loss": 0.6034, + "step": 1900 + }, + { + "epoch": 0.8988179669030733, + "grad_norm": 3.0935890674591064, + "learning_rate": 4.751764418709462e-06, + "loss": 0.5644, + "step": 1901 + }, + { + "epoch": 0.8992907801418439, + "grad_norm": 2.7989892959594727, + "learning_rate": 4.751493339673742e-06, + "loss": 0.656, + "step": 1902 + }, + { + "epoch": 0.8997635933806146, + "grad_norm": 3.6940557956695557, + "learning_rate": 4.751222120447403e-06, + "loss": 0.6632, + "step": 1903 + }, + { + "epoch": 0.9002364066193853, + "grad_norm": 2.3428797721862793, + "learning_rate": 4.750950761047335e-06, + "loss": 0.4485, + "step": 1904 + }, + { + "epoch": 0.900709219858156, + "grad_norm": 2.622544050216675, + "learning_rate": 4.750679261490432e-06, + "loss": 0.5857, + "step": 1905 + }, + { + "epoch": 0.9011820330969267, + "grad_norm": 2.4911322593688965, + "learning_rate": 4.750407621793601e-06, + "loss": 0.5618, + "step": 1906 + }, + { + "epoch": 0.9016548463356974, + "grad_norm": 2.6434662342071533, + "learning_rate": 4.750135841973755e-06, + "loss": 0.6057, + "step": 1907 + }, + { + "epoch": 0.902127659574468, + "grad_norm": 3.115443706512451, + "learning_rate": 4.749863922047817e-06, + "loss": 0.6064, + "step": 1908 + }, + { + "epoch": 0.9026004728132387, + "grad_norm": 2.5671091079711914, + "learning_rate": 4.749591862032718e-06, + "loss": 0.5625, + "step": 1909 + }, + { + "epoch": 0.9030732860520094, + "grad_norm": 3.2008655071258545, + "learning_rate": 4.749319661945398e-06, + "loss": 0.5547, + "step": 1910 + }, + { + "epoch": 0.9035460992907801, + "grad_norm": 2.905987024307251, + "learning_rate": 4.749047321802805e-06, + "loss": 0.6033, + "step": 1911 + }, + { + "epoch": 0.9040189125295508, + "grad_norm": 3.1456053256988525, + "learning_rate": 4.748774841621897e-06, + "loss": 0.5651, + "step": 1912 + }, + { + "epoch": 0.9044917257683215, + "grad_norm": 2.8116416931152344, + "learning_rate": 4.748502221419641e-06, + "loss": 0.5853, + "step": 1913 + }, + { + "epoch": 0.9049645390070922, + "grad_norm": 3.123835325241089, + "learning_rate": 4.748229461213011e-06, + "loss": 0.5427, + "step": 1914 + }, + { + "epoch": 0.9054373522458629, + "grad_norm": 2.4750146865844727, + "learning_rate": 4.747956561018989e-06, + "loss": 0.6517, + "step": 1915 + }, + { + "epoch": 0.9059101654846335, + "grad_norm": 2.6174299716949463, + "learning_rate": 4.7476835208545705e-06, + "loss": 0.6119, + "step": 1916 + }, + { + "epoch": 0.9063829787234042, + "grad_norm": 2.7390382289886475, + "learning_rate": 4.747410340736755e-06, + "loss": 0.5664, + "step": 1917 + }, + { + "epoch": 0.9068557919621749, + "grad_norm": 2.7940444946289062, + "learning_rate": 4.747137020682552e-06, + "loss": 0.5628, + "step": 1918 + }, + { + "epoch": 0.9073286052009456, + "grad_norm": 2.477365016937256, + "learning_rate": 4.7468635607089795e-06, + "loss": 0.5261, + "step": 1919 + }, + { + "epoch": 0.9078014184397163, + "grad_norm": 2.7016685009002686, + "learning_rate": 4.746589960833066e-06, + "loss": 0.5576, + "step": 1920 + }, + { + "epoch": 0.908274231678487, + "grad_norm": 2.8806519508361816, + "learning_rate": 4.746316221071846e-06, + "loss": 0.5925, + "step": 1921 + }, + { + "epoch": 0.9087470449172577, + "grad_norm": 3.0315234661102295, + "learning_rate": 4.746042341442365e-06, + "loss": 0.6142, + "step": 1922 + }, + { + "epoch": 0.9092198581560283, + "grad_norm": 4.2446160316467285, + "learning_rate": 4.745768321961676e-06, + "loss": 0.5352, + "step": 1923 + }, + { + "epoch": 0.909692671394799, + "grad_norm": 2.6517012119293213, + "learning_rate": 4.745494162646841e-06, + "loss": 0.6118, + "step": 1924 + }, + { + "epoch": 0.9101654846335697, + "grad_norm": 2.774900197982788, + "learning_rate": 4.7452198635149304e-06, + "loss": 0.572, + "step": 1925 + }, + { + "epoch": 0.9106382978723404, + "grad_norm": 3.0133683681488037, + "learning_rate": 4.744945424583024e-06, + "loss": 0.5897, + "step": 1926 + }, + { + "epoch": 0.9111111111111111, + "grad_norm": 2.7344839572906494, + "learning_rate": 4.744670845868211e-06, + "loss": 0.6207, + "step": 1927 + }, + { + "epoch": 0.9115839243498818, + "grad_norm": 2.636578321456909, + "learning_rate": 4.744396127387586e-06, + "loss": 0.6687, + "step": 1928 + }, + { + "epoch": 0.9120567375886525, + "grad_norm": 2.8663458824157715, + "learning_rate": 4.744121269158255e-06, + "loss": 0.5002, + "step": 1929 + }, + { + "epoch": 0.9125295508274232, + "grad_norm": 2.661079168319702, + "learning_rate": 4.743846271197333e-06, + "loss": 0.5848, + "step": 1930 + }, + { + "epoch": 0.9130023640661938, + "grad_norm": 2.881256341934204, + "learning_rate": 4.743571133521943e-06, + "loss": 0.5911, + "step": 1931 + }, + { + "epoch": 0.9134751773049645, + "grad_norm": 2.5540573596954346, + "learning_rate": 4.743295856149217e-06, + "loss": 0.5647, + "step": 1932 + }, + { + "epoch": 0.9139479905437352, + "grad_norm": 2.7060387134552, + "learning_rate": 4.743020439096293e-06, + "loss": 0.6267, + "step": 1933 + }, + { + "epoch": 0.9144208037825059, + "grad_norm": 2.694481372833252, + "learning_rate": 4.742744882380323e-06, + "loss": 0.6283, + "step": 1934 + }, + { + "epoch": 0.9148936170212766, + "grad_norm": 2.711555242538452, + "learning_rate": 4.7424691860184625e-06, + "loss": 0.5784, + "step": 1935 + }, + { + "epoch": 0.9153664302600473, + "grad_norm": 2.9077224731445312, + "learning_rate": 4.742193350027879e-06, + "loss": 0.5948, + "step": 1936 + }, + { + "epoch": 0.915839243498818, + "grad_norm": 2.9824187755584717, + "learning_rate": 4.7419173744257476e-06, + "loss": 0.6115, + "step": 1937 + }, + { + "epoch": 0.9163120567375886, + "grad_norm": 2.5127830505371094, + "learning_rate": 4.7416412592292515e-06, + "loss": 0.5803, + "step": 1938 + }, + { + "epoch": 0.9167848699763593, + "grad_norm": 3.1307175159454346, + "learning_rate": 4.741365004455583e-06, + "loss": 0.5657, + "step": 1939 + }, + { + "epoch": 0.91725768321513, + "grad_norm": 2.8205273151397705, + "learning_rate": 4.741088610121944e-06, + "loss": 0.6145, + "step": 1940 + }, + { + "epoch": 0.9177304964539007, + "grad_norm": 2.6119720935821533, + "learning_rate": 4.7408120762455444e-06, + "loss": 0.6058, + "step": 1941 + }, + { + "epoch": 0.9182033096926714, + "grad_norm": 2.421276092529297, + "learning_rate": 4.7405354028436025e-06, + "loss": 0.5973, + "step": 1942 + }, + { + "epoch": 0.9186761229314421, + "grad_norm": 2.9846808910369873, + "learning_rate": 4.740258589933346e-06, + "loss": 0.6892, + "step": 1943 + }, + { + "epoch": 0.9191489361702128, + "grad_norm": 2.6899871826171875, + "learning_rate": 4.739981637532009e-06, + "loss": 0.5705, + "step": 1944 + }, + { + "epoch": 0.9196217494089834, + "grad_norm": 2.8636131286621094, + "learning_rate": 4.739704545656839e-06, + "loss": 0.5775, + "step": 1945 + }, + { + "epoch": 0.9200945626477541, + "grad_norm": 2.7659449577331543, + "learning_rate": 4.739427314325087e-06, + "loss": 0.5823, + "step": 1946 + }, + { + "epoch": 0.9205673758865248, + "grad_norm": 4.71295166015625, + "learning_rate": 4.739149943554016e-06, + "loss": 0.5601, + "step": 1947 + }, + { + "epoch": 0.9210401891252955, + "grad_norm": 2.642636775970459, + "learning_rate": 4.738872433360896e-06, + "loss": 0.5278, + "step": 1948 + }, + { + "epoch": 0.9215130023640662, + "grad_norm": 2.4658217430114746, + "learning_rate": 4.7385947837630065e-06, + "loss": 0.6392, + "step": 1949 + }, + { + "epoch": 0.9219858156028369, + "grad_norm": 2.851602792739868, + "learning_rate": 4.738316994777636e-06, + "loss": 0.6164, + "step": 1950 + }, + { + "epoch": 0.9224586288416076, + "grad_norm": 2.394226551055908, + "learning_rate": 4.738039066422081e-06, + "loss": 0.5556, + "step": 1951 + }, + { + "epoch": 0.9229314420803783, + "grad_norm": 2.7985100746154785, + "learning_rate": 4.737760998713647e-06, + "loss": 0.5799, + "step": 1952 + }, + { + "epoch": 0.9234042553191489, + "grad_norm": 2.5974674224853516, + "learning_rate": 4.737482791669648e-06, + "loss": 0.6984, + "step": 1953 + }, + { + "epoch": 0.9238770685579196, + "grad_norm": 2.707636594772339, + "learning_rate": 4.737204445307406e-06, + "loss": 0.5548, + "step": 1954 + }, + { + "epoch": 0.9243498817966903, + "grad_norm": 2.7882707118988037, + "learning_rate": 4.736925959644254e-06, + "loss": 0.6026, + "step": 1955 + }, + { + "epoch": 0.924822695035461, + "grad_norm": 2.474482774734497, + "learning_rate": 4.7366473346975304e-06, + "loss": 0.5832, + "step": 1956 + }, + { + "epoch": 0.9252955082742317, + "grad_norm": 2.6196324825286865, + "learning_rate": 4.736368570484585e-06, + "loss": 0.5861, + "step": 1957 + }, + { + "epoch": 0.9257683215130024, + "grad_norm": 2.826864004135132, + "learning_rate": 4.736089667022775e-06, + "loss": 0.6173, + "step": 1958 + }, + { + "epoch": 0.926241134751773, + "grad_norm": 2.414473056793213, + "learning_rate": 4.735810624329466e-06, + "loss": 0.5753, + "step": 1959 + }, + { + "epoch": 0.9267139479905437, + "grad_norm": 2.8037970066070557, + "learning_rate": 4.7355314424220335e-06, + "loss": 0.6207, + "step": 1960 + }, + { + "epoch": 0.9271867612293144, + "grad_norm": 2.645458698272705, + "learning_rate": 4.735252121317861e-06, + "loss": 0.5959, + "step": 1961 + }, + { + "epoch": 0.9276595744680851, + "grad_norm": 2.7983884811401367, + "learning_rate": 4.734972661034339e-06, + "loss": 0.5696, + "step": 1962 + }, + { + "epoch": 0.9281323877068558, + "grad_norm": 3.0568997859954834, + "learning_rate": 4.73469306158887e-06, + "loss": 0.6194, + "step": 1963 + }, + { + "epoch": 0.9286052009456265, + "grad_norm": 2.7205135822296143, + "learning_rate": 4.734413322998863e-06, + "loss": 0.5292, + "step": 1964 + }, + { + "epoch": 0.9290780141843972, + "grad_norm": 3.3168489933013916, + "learning_rate": 4.734133445281735e-06, + "loss": 0.5654, + "step": 1965 + }, + { + "epoch": 0.9295508274231679, + "grad_norm": 3.0095653533935547, + "learning_rate": 4.733853428454916e-06, + "loss": 0.6508, + "step": 1966 + }, + { + "epoch": 0.9300236406619385, + "grad_norm": 2.7726712226867676, + "learning_rate": 4.733573272535838e-06, + "loss": 0.644, + "step": 1967 + }, + { + "epoch": 0.9304964539007092, + "grad_norm": 2.474397659301758, + "learning_rate": 4.7332929775419456e-06, + "loss": 0.5479, + "step": 1968 + }, + { + "epoch": 0.9309692671394799, + "grad_norm": 2.4518635272979736, + "learning_rate": 4.733012543490693e-06, + "loss": 0.6, + "step": 1969 + }, + { + "epoch": 0.9314420803782506, + "grad_norm": 2.9292192459106445, + "learning_rate": 4.73273197039954e-06, + "loss": 0.6647, + "step": 1970 + }, + { + "epoch": 0.9319148936170213, + "grad_norm": 2.425004720687866, + "learning_rate": 4.732451258285958e-06, + "loss": 0.6338, + "step": 1971 + }, + { + "epoch": 0.932387706855792, + "grad_norm": 2.904479503631592, + "learning_rate": 4.7321704071674255e-06, + "loss": 0.5923, + "step": 1972 + }, + { + "epoch": 0.9328605200945627, + "grad_norm": 2.477085590362549, + "learning_rate": 4.731889417061428e-06, + "loss": 0.5984, + "step": 1973 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 2.585240364074707, + "learning_rate": 4.731608287985465e-06, + "loss": 0.558, + "step": 1974 + }, + { + "epoch": 0.933806146572104, + "grad_norm": 2.658714532852173, + "learning_rate": 4.731327019957039e-06, + "loss": 0.5567, + "step": 1975 + }, + { + "epoch": 0.9342789598108747, + "grad_norm": 2.7593026161193848, + "learning_rate": 4.731045612993662e-06, + "loss": 0.5772, + "step": 1976 + }, + { + "epoch": 0.9347517730496454, + "grad_norm": 2.4386026859283447, + "learning_rate": 4.7307640671128585e-06, + "loss": 0.6199, + "step": 1977 + }, + { + "epoch": 0.9352245862884161, + "grad_norm": 2.681910514831543, + "learning_rate": 4.730482382332158e-06, + "loss": 0.5971, + "step": 1978 + }, + { + "epoch": 0.9356973995271868, + "grad_norm": 3.7593860626220703, + "learning_rate": 4.7302005586691e-06, + "loss": 0.6346, + "step": 1979 + }, + { + "epoch": 0.9361702127659575, + "grad_norm": 2.5789096355438232, + "learning_rate": 4.729918596141232e-06, + "loss": 0.5684, + "step": 1980 + }, + { + "epoch": 0.9366430260047282, + "grad_norm": 3.0607335567474365, + "learning_rate": 4.729636494766111e-06, + "loss": 0.6223, + "step": 1981 + }, + { + "epoch": 0.9371158392434988, + "grad_norm": 2.906643867492676, + "learning_rate": 4.729354254561303e-06, + "loss": 0.6513, + "step": 1982 + }, + { + "epoch": 0.9375886524822695, + "grad_norm": 3.192430019378662, + "learning_rate": 4.7290718755443795e-06, + "loss": 0.5095, + "step": 1983 + }, + { + "epoch": 0.9380614657210402, + "grad_norm": 2.661536931991577, + "learning_rate": 4.7287893577329255e-06, + "loss": 0.5525, + "step": 1984 + }, + { + "epoch": 0.9385342789598109, + "grad_norm": 2.8436734676361084, + "learning_rate": 4.728506701144531e-06, + "loss": 0.6323, + "step": 1985 + }, + { + "epoch": 0.9390070921985816, + "grad_norm": 2.75544810295105, + "learning_rate": 4.728223905796796e-06, + "loss": 0.6018, + "step": 1986 + }, + { + "epoch": 0.9394799054373523, + "grad_norm": 3.0652759075164795, + "learning_rate": 4.727940971707329e-06, + "loss": 0.62, + "step": 1987 + }, + { + "epoch": 0.939952718676123, + "grad_norm": 2.802567720413208, + "learning_rate": 4.727657898893747e-06, + "loss": 0.5809, + "step": 1988 + }, + { + "epoch": 0.9404255319148936, + "grad_norm": 2.6208512783050537, + "learning_rate": 4.7273746873736745e-06, + "loss": 0.5762, + "step": 1989 + }, + { + "epoch": 0.9408983451536643, + "grad_norm": 2.5901873111724854, + "learning_rate": 4.727091337164748e-06, + "loss": 0.6111, + "step": 1990 + }, + { + "epoch": 0.941371158392435, + "grad_norm": 3.002347707748413, + "learning_rate": 4.726807848284609e-06, + "loss": 0.6419, + "step": 1991 + }, + { + "epoch": 0.9418439716312057, + "grad_norm": 2.522151470184326, + "learning_rate": 4.72652422075091e-06, + "loss": 0.642, + "step": 1992 + }, + { + "epoch": 0.9423167848699764, + "grad_norm": 2.5571532249450684, + "learning_rate": 4.726240454581311e-06, + "loss": 0.5729, + "step": 1993 + }, + { + "epoch": 0.9427895981087471, + "grad_norm": 2.7704918384552, + "learning_rate": 4.72595654979348e-06, + "loss": 0.6816, + "step": 1994 + }, + { + "epoch": 0.9432624113475178, + "grad_norm": 2.517040491104126, + "learning_rate": 4.7256725064050955e-06, + "loss": 0.5782, + "step": 1995 + }, + { + "epoch": 0.9437352245862884, + "grad_norm": 2.613955020904541, + "learning_rate": 4.725388324433843e-06, + "loss": 0.6291, + "step": 1996 + }, + { + "epoch": 0.9442080378250591, + "grad_norm": 2.848891258239746, + "learning_rate": 4.725104003897418e-06, + "loss": 0.6544, + "step": 1997 + }, + { + "epoch": 0.9446808510638298, + "grad_norm": 3.0162429809570312, + "learning_rate": 4.724819544813523e-06, + "loss": 0.6301, + "step": 1998 + }, + { + "epoch": 0.9451536643026005, + "grad_norm": 2.613614559173584, + "learning_rate": 4.72453494719987e-06, + "loss": 0.5829, + "step": 1999 + }, + { + "epoch": 0.9456264775413712, + "grad_norm": 2.4838767051696777, + "learning_rate": 4.724250211074182e-06, + "loss": 0.6042, + "step": 2000 + }, + { + "epoch": 0.9460992907801419, + "grad_norm": 2.526470899581909, + "learning_rate": 4.723965336454185e-06, + "loss": 0.6167, + "step": 2001 + }, + { + "epoch": 0.9465721040189126, + "grad_norm": 2.504506826400757, + "learning_rate": 4.723680323357618e-06, + "loss": 0.6061, + "step": 2002 + }, + { + "epoch": 0.9470449172576832, + "grad_norm": 3.0547544956207275, + "learning_rate": 4.723395171802228e-06, + "loss": 0.6619, + "step": 2003 + }, + { + "epoch": 0.9475177304964539, + "grad_norm": 2.8692407608032227, + "learning_rate": 4.723109881805771e-06, + "loss": 0.5985, + "step": 2004 + }, + { + "epoch": 0.9479905437352246, + "grad_norm": 2.7929654121398926, + "learning_rate": 4.7228244533860094e-06, + "loss": 0.5869, + "step": 2005 + }, + { + "epoch": 0.9484633569739953, + "grad_norm": 2.764869451522827, + "learning_rate": 4.7225388865607146e-06, + "loss": 0.6288, + "step": 2006 + }, + { + "epoch": 0.948936170212766, + "grad_norm": 2.7656404972076416, + "learning_rate": 4.722253181347671e-06, + "loss": 0.5831, + "step": 2007 + }, + { + "epoch": 0.9494089834515367, + "grad_norm": 2.6698336601257324, + "learning_rate": 4.7219673377646635e-06, + "loss": 0.6087, + "step": 2008 + }, + { + "epoch": 0.9498817966903074, + "grad_norm": 2.524935722351074, + "learning_rate": 4.7216813558294946e-06, + "loss": 0.5675, + "step": 2009 + }, + { + "epoch": 0.950354609929078, + "grad_norm": 2.5998785495758057, + "learning_rate": 4.721395235559969e-06, + "loss": 0.5667, + "step": 2010 + }, + { + "epoch": 0.9508274231678487, + "grad_norm": 2.758021354675293, + "learning_rate": 4.721108976973902e-06, + "loss": 0.4931, + "step": 2011 + }, + { + "epoch": 0.9513002364066194, + "grad_norm": 2.767695903778076, + "learning_rate": 4.72082258008912e-06, + "loss": 0.5778, + "step": 2012 + }, + { + "epoch": 0.9517730496453901, + "grad_norm": 2.982314348220825, + "learning_rate": 4.720536044923453e-06, + "loss": 0.6096, + "step": 2013 + }, + { + "epoch": 0.9522458628841608, + "grad_norm": 2.7608799934387207, + "learning_rate": 4.720249371494743e-06, + "loss": 0.6242, + "step": 2014 + }, + { + "epoch": 0.9527186761229315, + "grad_norm": 2.60054349899292, + "learning_rate": 4.71996255982084e-06, + "loss": 0.6249, + "step": 2015 + }, + { + "epoch": 0.9531914893617022, + "grad_norm": 2.654355764389038, + "learning_rate": 4.719675609919603e-06, + "loss": 0.6327, + "step": 2016 + }, + { + "epoch": 0.9536643026004729, + "grad_norm": 2.589404582977295, + "learning_rate": 4.719388521808899e-06, + "loss": 0.6357, + "step": 2017 + }, + { + "epoch": 0.9541371158392435, + "grad_norm": 2.8016581535339355, + "learning_rate": 4.719101295506603e-06, + "loss": 0.5901, + "step": 2018 + }, + { + "epoch": 0.9546099290780142, + "grad_norm": 3.1408045291900635, + "learning_rate": 4.7188139310306e-06, + "loss": 0.598, + "step": 2019 + }, + { + "epoch": 0.9550827423167849, + "grad_norm": 2.7432665824890137, + "learning_rate": 4.718526428398783e-06, + "loss": 0.5508, + "step": 2020 + }, + { + "epoch": 0.9555555555555556, + "grad_norm": 2.947800874710083, + "learning_rate": 4.718238787629053e-06, + "loss": 0.6439, + "step": 2021 + }, + { + "epoch": 0.9560283687943263, + "grad_norm": 2.50828218460083, + "learning_rate": 4.71795100873932e-06, + "loss": 0.5441, + "step": 2022 + }, + { + "epoch": 0.956501182033097, + "grad_norm": 2.8558974266052246, + "learning_rate": 4.717663091747503e-06, + "loss": 0.5416, + "step": 2023 + }, + { + "epoch": 0.9569739952718677, + "grad_norm": 2.4803316593170166, + "learning_rate": 4.71737503667153e-06, + "loss": 0.5317, + "step": 2024 + }, + { + "epoch": 0.9574468085106383, + "grad_norm": 4.36754035949707, + "learning_rate": 4.717086843529336e-06, + "loss": 0.5808, + "step": 2025 + }, + { + "epoch": 0.957919621749409, + "grad_norm": 2.730185031890869, + "learning_rate": 4.7167985123388665e-06, + "loss": 0.5257, + "step": 2026 + }, + { + "epoch": 0.9583924349881797, + "grad_norm": 2.8136069774627686, + "learning_rate": 4.716510043118074e-06, + "loss": 0.5836, + "step": 2027 + }, + { + "epoch": 0.9588652482269504, + "grad_norm": 2.793975353240967, + "learning_rate": 4.71622143588492e-06, + "loss": 0.5706, + "step": 2028 + }, + { + "epoch": 0.9593380614657211, + "grad_norm": 2.3883821964263916, + "learning_rate": 4.7159326906573745e-06, + "loss": 0.5291, + "step": 2029 + }, + { + "epoch": 0.9598108747044918, + "grad_norm": 2.6135976314544678, + "learning_rate": 4.715643807453417e-06, + "loss": 0.6199, + "step": 2030 + }, + { + "epoch": 0.9602836879432625, + "grad_norm": 2.6245670318603516, + "learning_rate": 4.715354786291035e-06, + "loss": 0.5585, + "step": 2031 + }, + { + "epoch": 0.9607565011820332, + "grad_norm": 2.7870967388153076, + "learning_rate": 4.715065627188225e-06, + "loss": 0.6196, + "step": 2032 + }, + { + "epoch": 0.9612293144208038, + "grad_norm": 2.6983911991119385, + "learning_rate": 4.714776330162991e-06, + "loss": 0.6424, + "step": 2033 + }, + { + "epoch": 0.9617021276595744, + "grad_norm": 2.3221919536590576, + "learning_rate": 4.7144868952333465e-06, + "loss": 0.568, + "step": 2034 + }, + { + "epoch": 0.9621749408983451, + "grad_norm": 2.9408178329467773, + "learning_rate": 4.714197322417314e-06, + "loss": 0.6175, + "step": 2035 + }, + { + "epoch": 0.9626477541371158, + "grad_norm": 2.404057264328003, + "learning_rate": 4.713907611732921e-06, + "loss": 0.4943, + "step": 2036 + }, + { + "epoch": 0.9631205673758865, + "grad_norm": 3.547607660293579, + "learning_rate": 4.71361776319821e-06, + "loss": 0.5488, + "step": 2037 + }, + { + "epoch": 0.9635933806146572, + "grad_norm": 2.679614543914795, + "learning_rate": 4.713327776831227e-06, + "loss": 0.6234, + "step": 2038 + }, + { + "epoch": 0.9640661938534278, + "grad_norm": 2.526914119720459, + "learning_rate": 4.7130376526500286e-06, + "loss": 0.5891, + "step": 2039 + }, + { + "epoch": 0.9645390070921985, + "grad_norm": 2.6953470706939697, + "learning_rate": 4.71274739067268e-06, + "loss": 0.69, + "step": 2040 + }, + { + "epoch": 0.9650118203309692, + "grad_norm": 2.546660900115967, + "learning_rate": 4.712456990917254e-06, + "loss": 0.6185, + "step": 2041 + }, + { + "epoch": 0.9654846335697399, + "grad_norm": 3.3920490741729736, + "learning_rate": 4.712166453401832e-06, + "loss": 0.587, + "step": 2042 + }, + { + "epoch": 0.9659574468085106, + "grad_norm": 2.5961573123931885, + "learning_rate": 4.711875778144504e-06, + "loss": 0.6105, + "step": 2043 + }, + { + "epoch": 0.9664302600472813, + "grad_norm": 2.5111498832702637, + "learning_rate": 4.711584965163372e-06, + "loss": 0.5533, + "step": 2044 + }, + { + "epoch": 0.966903073286052, + "grad_norm": 2.4878132343292236, + "learning_rate": 4.7112940144765405e-06, + "loss": 0.5604, + "step": 2045 + }, + { + "epoch": 0.9673758865248226, + "grad_norm": 2.5714077949523926, + "learning_rate": 4.711002926102128e-06, + "loss": 0.5794, + "step": 2046 + }, + { + "epoch": 0.9678486997635933, + "grad_norm": 2.7069091796875, + "learning_rate": 4.710711700058257e-06, + "loss": 0.594, + "step": 2047 + }, + { + "epoch": 0.968321513002364, + "grad_norm": 2.8104631900787354, + "learning_rate": 4.710420336363063e-06, + "loss": 0.6247, + "step": 2048 + }, + { + "epoch": 0.9687943262411347, + "grad_norm": 2.8464386463165283, + "learning_rate": 4.7101288350346865e-06, + "loss": 0.6162, + "step": 2049 + }, + { + "epoch": 0.9692671394799054, + "grad_norm": 2.7187976837158203, + "learning_rate": 4.709837196091279e-06, + "loss": 0.6109, + "step": 2050 + }, + { + "epoch": 0.9697399527186761, + "grad_norm": 2.556734085083008, + "learning_rate": 4.709545419550999e-06, + "loss": 0.6297, + "step": 2051 + }, + { + "epoch": 0.9702127659574468, + "grad_norm": 2.937195062637329, + "learning_rate": 4.709253505432014e-06, + "loss": 0.6862, + "step": 2052 + }, + { + "epoch": 0.9706855791962175, + "grad_norm": 2.792175531387329, + "learning_rate": 4.7089614537525015e-06, + "loss": 0.6105, + "step": 2053 + }, + { + "epoch": 0.9711583924349881, + "grad_norm": 2.625636100769043, + "learning_rate": 4.708669264530644e-06, + "loss": 0.5849, + "step": 2054 + }, + { + "epoch": 0.9716312056737588, + "grad_norm": 2.6752610206604004, + "learning_rate": 4.708376937784637e-06, + "loss": 0.5949, + "step": 2055 + }, + { + "epoch": 0.9721040189125295, + "grad_norm": 2.6072793006896973, + "learning_rate": 4.708084473532681e-06, + "loss": 0.5776, + "step": 2056 + }, + { + "epoch": 0.9725768321513002, + "grad_norm": 2.728632926940918, + "learning_rate": 4.707791871792988e-06, + "loss": 0.6352, + "step": 2057 + }, + { + "epoch": 0.9730496453900709, + "grad_norm": 2.5841758251190186, + "learning_rate": 4.707499132583775e-06, + "loss": 0.5488, + "step": 2058 + }, + { + "epoch": 0.9735224586288416, + "grad_norm": 2.8464293479919434, + "learning_rate": 4.707206255923271e-06, + "loss": 0.7051, + "step": 2059 + }, + { + "epoch": 0.9739952718676123, + "grad_norm": 2.547297239303589, + "learning_rate": 4.706913241829712e-06, + "loss": 0.5937, + "step": 2060 + }, + { + "epoch": 0.9744680851063829, + "grad_norm": 2.6572306156158447, + "learning_rate": 4.706620090321341e-06, + "loss": 0.6041, + "step": 2061 + }, + { + "epoch": 0.9749408983451536, + "grad_norm": 2.3262805938720703, + "learning_rate": 4.706326801416414e-06, + "loss": 0.5144, + "step": 2062 + }, + { + "epoch": 0.9754137115839243, + "grad_norm": 2.9693965911865234, + "learning_rate": 4.706033375133191e-06, + "loss": 0.551, + "step": 2063 + }, + { + "epoch": 0.975886524822695, + "grad_norm": 2.5993731021881104, + "learning_rate": 4.7057398114899435e-06, + "loss": 0.6143, + "step": 2064 + }, + { + "epoch": 0.9763593380614657, + "grad_norm": 2.453336477279663, + "learning_rate": 4.70544611050495e-06, + "loss": 0.6093, + "step": 2065 + }, + { + "epoch": 0.9768321513002364, + "grad_norm": 2.898629665374756, + "learning_rate": 4.705152272196497e-06, + "loss": 0.6007, + "step": 2066 + }, + { + "epoch": 0.9773049645390071, + "grad_norm": 2.7990612983703613, + "learning_rate": 4.7048582965828815e-06, + "loss": 0.6687, + "step": 2067 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 2.635284423828125, + "learning_rate": 4.704564183682408e-06, + "loss": 0.5564, + "step": 2068 + }, + { + "epoch": 0.9782505910165484, + "grad_norm": 3.014547109603882, + "learning_rate": 4.704269933513389e-06, + "loss": 0.6084, + "step": 2069 + }, + { + "epoch": 0.9787234042553191, + "grad_norm": 2.659357786178589, + "learning_rate": 4.703975546094147e-06, + "loss": 0.6031, + "step": 2070 + }, + { + "epoch": 0.9791962174940898, + "grad_norm": 2.326932668685913, + "learning_rate": 4.703681021443013e-06, + "loss": 0.5859, + "step": 2071 + }, + { + "epoch": 0.9796690307328605, + "grad_norm": 2.958803653717041, + "learning_rate": 4.7033863595783235e-06, + "loss": 0.5586, + "step": 2072 + }, + { + "epoch": 0.9801418439716312, + "grad_norm": 2.921386957168579, + "learning_rate": 4.703091560518427e-06, + "loss": 0.6126, + "step": 2073 + }, + { + "epoch": 0.9806146572104019, + "grad_norm": 2.6500775814056396, + "learning_rate": 4.702796624281679e-06, + "loss": 0.5678, + "step": 2074 + }, + { + "epoch": 0.9810874704491725, + "grad_norm": 2.7740228176116943, + "learning_rate": 4.702501550886445e-06, + "loss": 0.6067, + "step": 2075 + }, + { + "epoch": 0.9815602836879432, + "grad_norm": 2.3296213150024414, + "learning_rate": 4.702206340351096e-06, + "loss": 0.5247, + "step": 2076 + }, + { + "epoch": 0.9820330969267139, + "grad_norm": 2.748300790786743, + "learning_rate": 4.701910992694016e-06, + "loss": 0.5197, + "step": 2077 + }, + { + "epoch": 0.9825059101654846, + "grad_norm": 2.250985622406006, + "learning_rate": 4.7016155079335926e-06, + "loss": 0.5214, + "step": 2078 + }, + { + "epoch": 0.9829787234042553, + "grad_norm": 2.389845848083496, + "learning_rate": 4.701319886088226e-06, + "loss": 0.519, + "step": 2079 + }, + { + "epoch": 0.983451536643026, + "grad_norm": 2.818220853805542, + "learning_rate": 4.701024127176322e-06, + "loss": 0.607, + "step": 2080 + }, + { + "epoch": 0.9839243498817967, + "grad_norm": 3.4058034420013428, + "learning_rate": 4.700728231216297e-06, + "loss": 0.5711, + "step": 2081 + }, + { + "epoch": 0.9843971631205674, + "grad_norm": 2.5297787189483643, + "learning_rate": 4.700432198226575e-06, + "loss": 0.5979, + "step": 2082 + }, + { + "epoch": 0.984869976359338, + "grad_norm": 3.0548105239868164, + "learning_rate": 4.7001360282255885e-06, + "loss": 0.6041, + "step": 2083 + }, + { + "epoch": 0.9853427895981087, + "grad_norm": 2.8983733654022217, + "learning_rate": 4.699839721231779e-06, + "loss": 0.5926, + "step": 2084 + }, + { + "epoch": 0.9858156028368794, + "grad_norm": 3.2717764377593994, + "learning_rate": 4.699543277263596e-06, + "loss": 0.6477, + "step": 2085 + }, + { + "epoch": 0.9862884160756501, + "grad_norm": 3.03729248046875, + "learning_rate": 4.699246696339497e-06, + "loss": 0.6786, + "step": 2086 + }, + { + "epoch": 0.9867612293144208, + "grad_norm": 2.852301597595215, + "learning_rate": 4.698949978477951e-06, + "loss": 0.6565, + "step": 2087 + }, + { + "epoch": 0.9872340425531915, + "grad_norm": 2.843485116958618, + "learning_rate": 4.698653123697431e-06, + "loss": 0.6627, + "step": 2088 + }, + { + "epoch": 0.9877068557919622, + "grad_norm": 2.6315064430236816, + "learning_rate": 4.698356132016423e-06, + "loss": 0.6577, + "step": 2089 + }, + { + "epoch": 0.9881796690307328, + "grad_norm": 2.7482151985168457, + "learning_rate": 4.698059003453417e-06, + "loss": 0.5514, + "step": 2090 + }, + { + "epoch": 0.9886524822695035, + "grad_norm": 2.826673746109009, + "learning_rate": 4.6977617380269145e-06, + "loss": 0.565, + "step": 2091 + }, + { + "epoch": 0.9891252955082742, + "grad_norm": 3.0273752212524414, + "learning_rate": 4.697464335755427e-06, + "loss": 0.6331, + "step": 2092 + }, + { + "epoch": 0.9895981087470449, + "grad_norm": 2.7551653385162354, + "learning_rate": 4.6971667966574695e-06, + "loss": 0.6486, + "step": 2093 + }, + { + "epoch": 0.9900709219858156, + "grad_norm": 2.656299114227295, + "learning_rate": 4.696869120751571e-06, + "loss": 0.6562, + "step": 2094 + }, + { + "epoch": 0.9905437352245863, + "grad_norm": 2.785322904586792, + "learning_rate": 4.696571308056265e-06, + "loss": 0.5892, + "step": 2095 + }, + { + "epoch": 0.991016548463357, + "grad_norm": 2.9334635734558105, + "learning_rate": 4.696273358590095e-06, + "loss": 0.6346, + "step": 2096 + }, + { + "epoch": 0.9914893617021276, + "grad_norm": 2.7944300174713135, + "learning_rate": 4.695975272371613e-06, + "loss": 0.5859, + "step": 2097 + }, + { + "epoch": 0.9919621749408983, + "grad_norm": 2.5416972637176514, + "learning_rate": 4.695677049419381e-06, + "loss": 0.5658, + "step": 2098 + }, + { + "epoch": 0.992434988179669, + "grad_norm": 2.4056856632232666, + "learning_rate": 4.695378689751966e-06, + "loss": 0.5121, + "step": 2099 + }, + { + "epoch": 0.9929078014184397, + "grad_norm": 2.614548683166504, + "learning_rate": 4.695080193387948e-06, + "loss": 0.5961, + "step": 2100 + }, + { + "epoch": 0.9933806146572104, + "grad_norm": 2.8966517448425293, + "learning_rate": 4.69478156034591e-06, + "loss": 0.5985, + "step": 2101 + }, + { + "epoch": 0.9938534278959811, + "grad_norm": 2.9514098167419434, + "learning_rate": 4.694482790644448e-06, + "loss": 0.5677, + "step": 2102 + }, + { + "epoch": 0.9943262411347518, + "grad_norm": 2.4326791763305664, + "learning_rate": 4.694183884302165e-06, + "loss": 0.5698, + "step": 2103 + }, + { + "epoch": 0.9947990543735225, + "grad_norm": 2.9242892265319824, + "learning_rate": 4.6938848413376735e-06, + "loss": 0.6245, + "step": 2104 + }, + { + "epoch": 0.9952718676122931, + "grad_norm": 2.9134104251861572, + "learning_rate": 4.693585661769593e-06, + "loss": 0.6164, + "step": 2105 + }, + { + "epoch": 0.9957446808510638, + "grad_norm": 2.472564458847046, + "learning_rate": 4.693286345616551e-06, + "loss": 0.5616, + "step": 2106 + }, + { + "epoch": 0.9962174940898345, + "grad_norm": 3.2456448078155518, + "learning_rate": 4.692986892897186e-06, + "loss": 0.6977, + "step": 2107 + }, + { + "epoch": 0.9966903073286052, + "grad_norm": 3.4032769203186035, + "learning_rate": 4.692687303630143e-06, + "loss": 0.643, + "step": 2108 + }, + { + "epoch": 0.9971631205673759, + "grad_norm": 2.722200870513916, + "learning_rate": 4.692387577834076e-06, + "loss": 0.5873, + "step": 2109 + }, + { + "epoch": 0.9976359338061466, + "grad_norm": 2.687532663345337, + "learning_rate": 4.692087715527648e-06, + "loss": 0.5423, + "step": 2110 + }, + { + "epoch": 0.9981087470449173, + "grad_norm": 2.578613042831421, + "learning_rate": 4.6917877167295305e-06, + "loss": 0.5689, + "step": 2111 + }, + { + "epoch": 0.9985815602836879, + "grad_norm": 3.1806094646453857, + "learning_rate": 4.691487581458402e-06, + "loss": 0.6133, + "step": 2112 + }, + { + "epoch": 0.9990543735224586, + "grad_norm": 2.4449520111083984, + "learning_rate": 4.691187309732952e-06, + "loss": 0.5841, + "step": 2113 + }, + { + "epoch": 0.9995271867612293, + "grad_norm": 2.908749580383301, + "learning_rate": 4.690886901571875e-06, + "loss": 0.534, + "step": 2114 + }, + { + "epoch": 1.0, + "grad_norm": 4.019968032836914, + "learning_rate": 4.6905863569938785e-06, + "loss": 0.596, + "step": 2115 + } + ], + "logging_steps": 1, + "max_steps": 12690, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 2115, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.341936104473887e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}