diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" new file mode 100644--- /dev/null +++ "b/last-checkpoint/trainer_state.json" @@ -0,0 +1,15245 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9456264775413712, + "eval_steps": 20, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0, + "eval_accuracy": 0.7339246119733924, + "eval_f1": 0.24528301886792453, + "eval_loss": 0.6025775074958801, + "eval_precision": 0.6, + "eval_recall": 0.1541501976284585, + "eval_runtime": 48.7601, + "eval_samples_per_second": 5.66, + "eval_steps_per_second": 0.185, + "step": 0 + }, + { + "epoch": 0.00047281323877068556, + "grad_norm": 2.2663159370422363, + "learning_rate": 9.433962264150944e-08, + "loss": 0.6749, + "step": 1 + }, + { + "epoch": 0.0009456264775413711, + "grad_norm": 2.2706336975097656, + "learning_rate": 1.886792452830189e-07, + "loss": 0.6176, + "step": 2 + }, + { + "epoch": 0.0014184397163120568, + "grad_norm": 3.000195026397705, + "learning_rate": 2.8301886792452833e-07, + "loss": 0.5503, + "step": 3 + }, + { + "epoch": 0.0018912529550827422, + "grad_norm": 1.9788570404052734, + "learning_rate": 3.773584905660378e-07, + "loss": 0.5419, + "step": 4 + }, + { + "epoch": 0.002364066193853428, + "grad_norm": 2.908334255218506, + "learning_rate": 4.716981132075472e-07, + "loss": 0.6287, + "step": 5 + }, + { + "epoch": 0.0028368794326241137, + "grad_norm": 2.66474986076355, + "learning_rate": 5.660377358490567e-07, + "loss": 0.6961, + "step": 6 + }, + { + "epoch": 0.003309692671394799, + "grad_norm": 2.1652109622955322, + "learning_rate": 6.603773584905661e-07, + "loss": 0.6128, + "step": 7 + }, + { + "epoch": 0.0037825059101654845, + "grad_norm": 2.8184926509857178, + "learning_rate": 7.547169811320755e-07, + "loss": 0.6289, + "step": 8 + }, + { + "epoch": 0.00425531914893617, + "grad_norm": 2.4191572666168213, + "learning_rate": 8.490566037735849e-07, + "loss": 0.6109, + "step": 9 + }, + { + "epoch": 0.004728132387706856, + "grad_norm": 2.1350443363189697, + "learning_rate": 9.433962264150944e-07, + "loss": 0.6027, + "step": 10 + }, + { + "epoch": 0.005200945626477541, + "grad_norm": 2.3789725303649902, + "learning_rate": 1.037735849056604e-06, + "loss": 0.6009, + "step": 11 + }, + { + "epoch": 0.005673758865248227, + "grad_norm": 2.0987956523895264, + "learning_rate": 1.1320754716981133e-06, + "loss": 0.5696, + "step": 12 + }, + { + "epoch": 0.006146572104018913, + "grad_norm": 2.1385180950164795, + "learning_rate": 1.2264150943396227e-06, + "loss": 0.6081, + "step": 13 + }, + { + "epoch": 0.006619385342789598, + "grad_norm": 2.550551652908325, + "learning_rate": 1.3207547169811322e-06, + "loss": 0.6091, + "step": 14 + }, + { + "epoch": 0.0070921985815602835, + "grad_norm": 2.2968673706054688, + "learning_rate": 1.4150943396226415e-06, + "loss": 0.6279, + "step": 15 + }, + { + "epoch": 0.007565011820330969, + "grad_norm": 2.0703189373016357, + "learning_rate": 1.509433962264151e-06, + "loss": 0.5747, + "step": 16 + }, + { + "epoch": 0.008037825059101654, + "grad_norm": 3.0899605751037598, + "learning_rate": 1.6037735849056604e-06, + "loss": 0.6645, + "step": 17 + }, + { + "epoch": 0.00851063829787234, + "grad_norm": 1.9014838933944702, + "learning_rate": 1.6981132075471698e-06, + "loss": 0.5237, + "step": 18 + }, + { + "epoch": 0.008983451536643027, + "grad_norm": 2.2065110206604004, + "learning_rate": 1.7924528301886793e-06, + "loss": 0.5515, + "step": 19 + }, + { + "epoch": 0.009456264775413711, + "grad_norm": 1.8808679580688477, + "learning_rate": 1.8867924528301889e-06, + "loss": 0.5256, + "step": 20 + }, + { + "epoch": 0.009456264775413711, + "eval_accuracy": 0.7350332594235033, + "eval_f1": 0.2507836990595611, + "eval_loss": 0.6015310287475586, + "eval_precision": 0.6060606060606061, + "eval_recall": 0.15810276679841898, + "eval_runtime": 49.4054, + "eval_samples_per_second": 5.586, + "eval_steps_per_second": 0.182, + "step": 20 + }, + { + "epoch": 0.009929078014184398, + "grad_norm": 1.7238119840621948, + "learning_rate": 1.981132075471698e-06, + "loss": 0.5206, + "step": 21 + }, + { + "epoch": 0.010401891252955082, + "grad_norm": 2.4662513732910156, + "learning_rate": 2.075471698113208e-06, + "loss": 0.6127, + "step": 22 + }, + { + "epoch": 0.010874704491725768, + "grad_norm": 2.3498942852020264, + "learning_rate": 2.1698113207547173e-06, + "loss": 0.6588, + "step": 23 + }, + { + "epoch": 0.011347517730496455, + "grad_norm": 2.9931042194366455, + "learning_rate": 2.2641509433962266e-06, + "loss": 0.5891, + "step": 24 + }, + { + "epoch": 0.01182033096926714, + "grad_norm": 2.32051944732666, + "learning_rate": 2.358490566037736e-06, + "loss": 0.599, + "step": 25 + }, + { + "epoch": 0.012293144208037825, + "grad_norm": 3.134979486465454, + "learning_rate": 2.4528301886792453e-06, + "loss": 0.6473, + "step": 26 + }, + { + "epoch": 0.01276595744680851, + "grad_norm": 2.371523857116699, + "learning_rate": 2.547169811320755e-06, + "loss": 0.6037, + "step": 27 + }, + { + "epoch": 0.013238770685579196, + "grad_norm": 2.681360960006714, + "learning_rate": 2.6415094339622644e-06, + "loss": 0.5573, + "step": 28 + }, + { + "epoch": 0.013711583924349883, + "grad_norm": 3.202848434448242, + "learning_rate": 2.7358490566037738e-06, + "loss": 0.6697, + "step": 29 + }, + { + "epoch": 0.014184397163120567, + "grad_norm": 2.109222888946533, + "learning_rate": 2.830188679245283e-06, + "loss": 0.6073, + "step": 30 + }, + { + "epoch": 0.014657210401891253, + "grad_norm": 2.031996250152588, + "learning_rate": 2.9245283018867924e-06, + "loss": 0.5569, + "step": 31 + }, + { + "epoch": 0.015130023640661938, + "grad_norm": 2.5829803943634033, + "learning_rate": 3.018867924528302e-06, + "loss": 0.5503, + "step": 32 + }, + { + "epoch": 0.015602836879432624, + "grad_norm": 3.7826120853424072, + "learning_rate": 3.1132075471698115e-06, + "loss": 0.4966, + "step": 33 + }, + { + "epoch": 0.01607565011820331, + "grad_norm": 1.3336539268493652, + "learning_rate": 3.207547169811321e-06, + "loss": 0.4965, + "step": 34 + }, + { + "epoch": 0.016548463356973995, + "grad_norm": 2.4206223487854004, + "learning_rate": 3.30188679245283e-06, + "loss": 0.6583, + "step": 35 + }, + { + "epoch": 0.01702127659574468, + "grad_norm": 2.437887191772461, + "learning_rate": 3.3962264150943395e-06, + "loss": 0.5355, + "step": 36 + }, + { + "epoch": 0.017494089834515367, + "grad_norm": 2.00358510017395, + "learning_rate": 3.4905660377358493e-06, + "loss": 0.5854, + "step": 37 + }, + { + "epoch": 0.017966903073286054, + "grad_norm": 2.635550022125244, + "learning_rate": 3.5849056603773586e-06, + "loss": 0.6226, + "step": 38 + }, + { + "epoch": 0.018439716312056736, + "grad_norm": 1.9979338645935059, + "learning_rate": 3.679245283018868e-06, + "loss": 0.6202, + "step": 39 + }, + { + "epoch": 0.018912529550827423, + "grad_norm": 2.933034658432007, + "learning_rate": 3.7735849056603777e-06, + "loss": 0.6118, + "step": 40 + }, + { + "epoch": 0.018912529550827423, + "eval_accuracy": 0.7361419068736141, + "eval_f1": 0.2608695652173913, + "eval_loss": 0.5987924337387085, + "eval_precision": 0.6086956521739131, + "eval_recall": 0.16600790513833993, + "eval_runtime": 49.5092, + "eval_samples_per_second": 5.575, + "eval_steps_per_second": 0.182, + "step": 40 + }, + { + "epoch": 0.01938534278959811, + "grad_norm": 2.4579005241394043, + "learning_rate": 3.8679245283018875e-06, + "loss": 0.6329, + "step": 41 + }, + { + "epoch": 0.019858156028368795, + "grad_norm": 2.0814871788024902, + "learning_rate": 3.962264150943396e-06, + "loss": 0.5792, + "step": 42 + }, + { + "epoch": 0.02033096926713948, + "grad_norm": 4.046266078948975, + "learning_rate": 4.056603773584906e-06, + "loss": 0.6294, + "step": 43 + }, + { + "epoch": 0.020803782505910164, + "grad_norm": 1.990343451499939, + "learning_rate": 4.150943396226416e-06, + "loss": 0.5279, + "step": 44 + }, + { + "epoch": 0.02127659574468085, + "grad_norm": 2.2721312046051025, + "learning_rate": 4.245283018867925e-06, + "loss": 0.5947, + "step": 45 + }, + { + "epoch": 0.021749408983451537, + "grad_norm": 2.3753161430358887, + "learning_rate": 4.339622641509435e-06, + "loss": 0.6097, + "step": 46 + }, + { + "epoch": 0.022222222222222223, + "grad_norm": 2.2465322017669678, + "learning_rate": 4.4339622641509435e-06, + "loss": 0.5921, + "step": 47 + }, + { + "epoch": 0.02269503546099291, + "grad_norm": 1.9690579175949097, + "learning_rate": 4.528301886792453e-06, + "loss": 0.5191, + "step": 48 + }, + { + "epoch": 0.023167848699763592, + "grad_norm": 2.9993767738342285, + "learning_rate": 4.622641509433963e-06, + "loss": 0.5827, + "step": 49 + }, + { + "epoch": 0.02364066193853428, + "grad_norm": 2.3443307876586914, + "learning_rate": 4.716981132075472e-06, + "loss": 0.5746, + "step": 50 + }, + { + "epoch": 0.024113475177304965, + "grad_norm": 2.446950912475586, + "learning_rate": 4.811320754716982e-06, + "loss": 0.582, + "step": 51 + }, + { + "epoch": 0.02458628841607565, + "grad_norm": 3.164130687713623, + "learning_rate": 4.905660377358491e-06, + "loss": 0.6406, + "step": 52 + }, + { + "epoch": 0.025059101654846337, + "grad_norm": 2.339772939682007, + "learning_rate": 5e-06, + "loss": 0.5627, + "step": 53 + }, + { + "epoch": 0.02553191489361702, + "grad_norm": 2.548523187637329, + "learning_rate": 5.09433962264151e-06, + "loss": 0.5909, + "step": 54 + }, + { + "epoch": 0.026004728132387706, + "grad_norm": 3.006196975708008, + "learning_rate": 5.188679245283019e-06, + "loss": 0.6046, + "step": 55 + }, + { + "epoch": 0.026477541371158392, + "grad_norm": 2.3136887550354004, + "learning_rate": 5.283018867924529e-06, + "loss": 0.5235, + "step": 56 + }, + { + "epoch": 0.02695035460992908, + "grad_norm": 2.072728157043457, + "learning_rate": 5.377358490566038e-06, + "loss": 0.6377, + "step": 57 + }, + { + "epoch": 0.027423167848699765, + "grad_norm": 3.415151357650757, + "learning_rate": 5.4716981132075475e-06, + "loss": 0.6367, + "step": 58 + }, + { + "epoch": 0.027895981087470448, + "grad_norm": 2.400956869125366, + "learning_rate": 5.566037735849057e-06, + "loss": 0.5671, + "step": 59 + }, + { + "epoch": 0.028368794326241134, + "grad_norm": 2.0561230182647705, + "learning_rate": 5.660377358490566e-06, + "loss": 0.5575, + "step": 60 + }, + { + "epoch": 0.028368794326241134, + "eval_accuracy": 0.7450110864745011, + "eval_f1": 0.3072289156626506, + "eval_loss": 0.5848703384399414, + "eval_precision": 0.6455696202531646, + "eval_recall": 0.2015810276679842, + "eval_runtime": 49.0008, + "eval_samples_per_second": 5.633, + "eval_steps_per_second": 0.184, + "step": 60 + }, + { + "epoch": 0.02884160756501182, + "grad_norm": 2.189640522003174, + "learning_rate": 5.754716981132076e-06, + "loss": 0.619, + "step": 61 + }, + { + "epoch": 0.029314420803782507, + "grad_norm": 2.607837677001953, + "learning_rate": 5.849056603773585e-06, + "loss": 0.6095, + "step": 62 + }, + { + "epoch": 0.029787234042553193, + "grad_norm": 2.26078200340271, + "learning_rate": 5.943396226415095e-06, + "loss": 0.5471, + "step": 63 + }, + { + "epoch": 0.030260047281323876, + "grad_norm": 2.622464656829834, + "learning_rate": 6.037735849056604e-06, + "loss": 0.6509, + "step": 64 + }, + { + "epoch": 0.030732860520094562, + "grad_norm": 2.8349571228027344, + "learning_rate": 6.132075471698113e-06, + "loss": 0.6926, + "step": 65 + }, + { + "epoch": 0.031205673758865248, + "grad_norm": 2.139317750930786, + "learning_rate": 6.226415094339623e-06, + "loss": 0.5786, + "step": 66 + }, + { + "epoch": 0.03167848699763593, + "grad_norm": 3.0620882511138916, + "learning_rate": 6.320754716981132e-06, + "loss": 0.5841, + "step": 67 + }, + { + "epoch": 0.03215130023640662, + "grad_norm": 2.194460391998291, + "learning_rate": 6.415094339622642e-06, + "loss": 0.5746, + "step": 68 + }, + { + "epoch": 0.032624113475177303, + "grad_norm": 2.3444063663482666, + "learning_rate": 6.5094339622641515e-06, + "loss": 0.51, + "step": 69 + }, + { + "epoch": 0.03309692671394799, + "grad_norm": 3.622739791870117, + "learning_rate": 6.60377358490566e-06, + "loss": 0.6342, + "step": 70 + }, + { + "epoch": 0.033569739952718676, + "grad_norm": 2.9004671573638916, + "learning_rate": 6.69811320754717e-06, + "loss": 0.641, + "step": 71 + }, + { + "epoch": 0.03404255319148936, + "grad_norm": 2.351501941680908, + "learning_rate": 6.792452830188679e-06, + "loss": 0.5936, + "step": 72 + }, + { + "epoch": 0.03451536643026005, + "grad_norm": 2.6966824531555176, + "learning_rate": 6.886792452830189e-06, + "loss": 0.5755, + "step": 73 + }, + { + "epoch": 0.034988179669030735, + "grad_norm": 2.026407241821289, + "learning_rate": 6.981132075471699e-06, + "loss": 0.4305, + "step": 74 + }, + { + "epoch": 0.03546099290780142, + "grad_norm": 2.9599199295043945, + "learning_rate": 7.0754716981132075e-06, + "loss": 0.494, + "step": 75 + }, + { + "epoch": 0.03593380614657211, + "grad_norm": 2.460238218307495, + "learning_rate": 7.169811320754717e-06, + "loss": 0.5991, + "step": 76 + }, + { + "epoch": 0.03640661938534279, + "grad_norm": 3.174283266067505, + "learning_rate": 7.264150943396226e-06, + "loss": 0.6035, + "step": 77 + }, + { + "epoch": 0.03687943262411347, + "grad_norm": 2.4575035572052, + "learning_rate": 7.358490566037736e-06, + "loss": 0.549, + "step": 78 + }, + { + "epoch": 0.03735224586288416, + "grad_norm": 2.558811664581299, + "learning_rate": 7.452830188679246e-06, + "loss": 0.4979, + "step": 79 + }, + { + "epoch": 0.037825059101654845, + "grad_norm": 2.396045684814453, + "learning_rate": 7.5471698113207555e-06, + "loss": 0.6385, + "step": 80 + }, + { + "epoch": 0.037825059101654845, + "eval_accuracy": 0.746119733924612, + "eval_f1": 0.3989501312335958, + "eval_loss": 0.5647635459899902, + "eval_precision": 0.59375, + "eval_recall": 0.30039525691699603, + "eval_runtime": 49.2336, + "eval_samples_per_second": 5.606, + "eval_steps_per_second": 0.183, + "step": 80 + }, + { + "epoch": 0.03829787234042553, + "grad_norm": 3.1989941596984863, + "learning_rate": 7.641509433962266e-06, + "loss": 0.6269, + "step": 81 + }, + { + "epoch": 0.03877068557919622, + "grad_norm": 2.829859972000122, + "learning_rate": 7.735849056603775e-06, + "loss": 0.5911, + "step": 82 + }, + { + "epoch": 0.039243498817966904, + "grad_norm": 2.1976866722106934, + "learning_rate": 7.830188679245284e-06, + "loss": 0.5403, + "step": 83 + }, + { + "epoch": 0.03971631205673759, + "grad_norm": 2.3576695919036865, + "learning_rate": 7.924528301886793e-06, + "loss": 0.5199, + "step": 84 + }, + { + "epoch": 0.04018912529550828, + "grad_norm": 2.662384033203125, + "learning_rate": 8.018867924528303e-06, + "loss": 0.5898, + "step": 85 + }, + { + "epoch": 0.04066193853427896, + "grad_norm": 4.871118545532227, + "learning_rate": 8.113207547169812e-06, + "loss": 0.5394, + "step": 86 + }, + { + "epoch": 0.04113475177304964, + "grad_norm": 2.710362195968628, + "learning_rate": 8.207547169811321e-06, + "loss": 0.509, + "step": 87 + }, + { + "epoch": 0.04160756501182033, + "grad_norm": 2.387660264968872, + "learning_rate": 8.301886792452832e-06, + "loss": 0.567, + "step": 88 + }, + { + "epoch": 0.042080378250591015, + "grad_norm": 2.4443883895874023, + "learning_rate": 8.39622641509434e-06, + "loss": 0.5319, + "step": 89 + }, + { + "epoch": 0.0425531914893617, + "grad_norm": 2.9741415977478027, + "learning_rate": 8.49056603773585e-06, + "loss": 0.6318, + "step": 90 + }, + { + "epoch": 0.04302600472813239, + "grad_norm": 2.7385494709014893, + "learning_rate": 8.58490566037736e-06, + "loss": 0.6109, + "step": 91 + }, + { + "epoch": 0.043498817966903074, + "grad_norm": 3.7744197845458984, + "learning_rate": 8.67924528301887e-06, + "loss": 0.6275, + "step": 92 + }, + { + "epoch": 0.04397163120567376, + "grad_norm": 3.1745519638061523, + "learning_rate": 8.773584905660378e-06, + "loss": 0.5503, + "step": 93 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 3.254016399383545, + "learning_rate": 8.867924528301887e-06, + "loss": 0.6103, + "step": 94 + }, + { + "epoch": 0.04491725768321513, + "grad_norm": 2.4502315521240234, + "learning_rate": 8.962264150943398e-06, + "loss": 0.4538, + "step": 95 + }, + { + "epoch": 0.04539007092198582, + "grad_norm": 2.24422025680542, + "learning_rate": 9.056603773584907e-06, + "loss": 0.5061, + "step": 96 + }, + { + "epoch": 0.0458628841607565, + "grad_norm": 3.284022092819214, + "learning_rate": 9.150943396226416e-06, + "loss": 0.579, + "step": 97 + }, + { + "epoch": 0.046335697399527184, + "grad_norm": 2.722243309020996, + "learning_rate": 9.245283018867926e-06, + "loss": 0.5395, + "step": 98 + }, + { + "epoch": 0.04680851063829787, + "grad_norm": 2.641986131668091, + "learning_rate": 9.339622641509435e-06, + "loss": 0.5201, + "step": 99 + }, + { + "epoch": 0.04728132387706856, + "grad_norm": 2.5733094215393066, + "learning_rate": 9.433962264150944e-06, + "loss": 0.4791, + "step": 100 + }, + { + "epoch": 0.04728132387706856, + "eval_accuracy": 0.7660753880266076, + "eval_f1": 0.4403183023872679, + "eval_loss": 0.539648711681366, + "eval_precision": 0.6693548387096774, + "eval_recall": 0.32806324110671936, + "eval_runtime": 50.2993, + "eval_samples_per_second": 5.487, + "eval_steps_per_second": 0.179, + "step": 100 + }, + { + "epoch": 0.04775413711583924, + "grad_norm": 2.3666300773620605, + "learning_rate": 9.528301886792455e-06, + "loss": 0.4835, + "step": 101 + }, + { + "epoch": 0.04822695035460993, + "grad_norm": 3.929025888442993, + "learning_rate": 9.622641509433963e-06, + "loss": 0.48, + "step": 102 + }, + { + "epoch": 0.048699763593380616, + "grad_norm": 2.604964017868042, + "learning_rate": 9.716981132075472e-06, + "loss": 0.4988, + "step": 103 + }, + { + "epoch": 0.0491725768321513, + "grad_norm": 2.985452890396118, + "learning_rate": 9.811320754716981e-06, + "loss": 0.4611, + "step": 104 + }, + { + "epoch": 0.04964539007092199, + "grad_norm": 3.0728108882904053, + "learning_rate": 9.905660377358492e-06, + "loss": 0.4563, + "step": 105 + }, + { + "epoch": 0.050118203309692674, + "grad_norm": 2.5450596809387207, + "learning_rate": 1e-05, + "loss": 0.399, + "step": 106 + }, + { + "epoch": 0.050591016548463354, + "grad_norm": 4.241573810577393, + "learning_rate": 1.0094339622641511e-05, + "loss": 0.659, + "step": 107 + }, + { + "epoch": 0.05106382978723404, + "grad_norm": 2.582282781600952, + "learning_rate": 1.018867924528302e-05, + "loss": 0.4278, + "step": 108 + }, + { + "epoch": 0.051536643026004726, + "grad_norm": 3.337094306945801, + "learning_rate": 1.0283018867924531e-05, + "loss": 0.4334, + "step": 109 + }, + { + "epoch": 0.05200945626477541, + "grad_norm": 2.199113368988037, + "learning_rate": 1.0377358490566038e-05, + "loss": 0.3644, + "step": 110 + }, + { + "epoch": 0.0524822695035461, + "grad_norm": 3.2351200580596924, + "learning_rate": 1.0471698113207549e-05, + "loss": 0.506, + "step": 111 + }, + { + "epoch": 0.052955082742316785, + "grad_norm": 3.9023163318634033, + "learning_rate": 1.0566037735849058e-05, + "loss": 0.5263, + "step": 112 + }, + { + "epoch": 0.05342789598108747, + "grad_norm": 2.8746888637542725, + "learning_rate": 1.0660377358490568e-05, + "loss": 0.4335, + "step": 113 + }, + { + "epoch": 0.05390070921985816, + "grad_norm": 3.934784173965454, + "learning_rate": 1.0754716981132076e-05, + "loss": 0.4804, + "step": 114 + }, + { + "epoch": 0.054373522458628844, + "grad_norm": 3.4959094524383545, + "learning_rate": 1.0849056603773586e-05, + "loss": 0.4114, + "step": 115 + }, + { + "epoch": 0.05484633569739953, + "grad_norm": 3.6590819358825684, + "learning_rate": 1.0943396226415095e-05, + "loss": 0.4618, + "step": 116 + }, + { + "epoch": 0.05531914893617021, + "grad_norm": 3.4006142616271973, + "learning_rate": 1.1037735849056606e-05, + "loss": 0.4839, + "step": 117 + }, + { + "epoch": 0.055791962174940896, + "grad_norm": 3.9192683696746826, + "learning_rate": 1.1132075471698115e-05, + "loss": 0.484, + "step": 118 + }, + { + "epoch": 0.05626477541371158, + "grad_norm": 3.4559454917907715, + "learning_rate": 1.1226415094339625e-05, + "loss": 0.5247, + "step": 119 + }, + { + "epoch": 0.05673758865248227, + "grad_norm": 4.34246301651001, + "learning_rate": 1.1320754716981132e-05, + "loss": 0.3593, + "step": 120 + }, + { + "epoch": 0.05673758865248227, + "eval_accuracy": 0.779379157427938, + "eval_f1": 0.4776902887139108, + "eval_loss": 0.5030393600463867, + "eval_precision": 0.7109375, + "eval_recall": 0.35968379446640314, + "eval_runtime": 47.339, + "eval_samples_per_second": 5.83, + "eval_steps_per_second": 0.19, + "step": 120 + }, + { + "epoch": 0.057210401891252954, + "grad_norm": 4.863562107086182, + "learning_rate": 1.1415094339622643e-05, + "loss": 0.4712, + "step": 121 + }, + { + "epoch": 0.05768321513002364, + "grad_norm": 3.856417179107666, + "learning_rate": 1.1509433962264152e-05, + "loss": 0.4069, + "step": 122 + }, + { + "epoch": 0.05815602836879433, + "grad_norm": 3.3835558891296387, + "learning_rate": 1.1603773584905663e-05, + "loss": 0.4641, + "step": 123 + }, + { + "epoch": 0.05862884160756501, + "grad_norm": 4.175307750701904, + "learning_rate": 1.169811320754717e-05, + "loss": 0.4452, + "step": 124 + }, + { + "epoch": 0.0591016548463357, + "grad_norm": 5.4297356605529785, + "learning_rate": 1.179245283018868e-05, + "loss": 0.3464, + "step": 125 + }, + { + "epoch": 0.059574468085106386, + "grad_norm": 3.767544746398926, + "learning_rate": 1.188679245283019e-05, + "loss": 0.4649, + "step": 126 + }, + { + "epoch": 0.06004728132387707, + "grad_norm": 3.859020233154297, + "learning_rate": 1.19811320754717e-05, + "loss": 0.4034, + "step": 127 + }, + { + "epoch": 0.06052009456264775, + "grad_norm": 5.159704685211182, + "learning_rate": 1.2075471698113209e-05, + "loss": 0.5041, + "step": 128 + }, + { + "epoch": 0.06099290780141844, + "grad_norm": 4.291565418243408, + "learning_rate": 1.216981132075472e-05, + "loss": 0.4173, + "step": 129 + }, + { + "epoch": 0.061465721040189124, + "grad_norm": 4.175761699676514, + "learning_rate": 1.2264150943396227e-05, + "loss": 0.391, + "step": 130 + }, + { + "epoch": 0.06193853427895981, + "grad_norm": 5.98757266998291, + "learning_rate": 1.2358490566037737e-05, + "loss": 0.4565, + "step": 131 + }, + { + "epoch": 0.062411347517730496, + "grad_norm": 6.7860307693481445, + "learning_rate": 1.2452830188679246e-05, + "loss": 0.4116, + "step": 132 + }, + { + "epoch": 0.06288416075650118, + "grad_norm": 7.493508338928223, + "learning_rate": 1.2547169811320757e-05, + "loss": 0.4762, + "step": 133 + }, + { + "epoch": 0.06335697399527186, + "grad_norm": 4.3719964027404785, + "learning_rate": 1.2641509433962264e-05, + "loss": 0.4301, + "step": 134 + }, + { + "epoch": 0.06382978723404255, + "grad_norm": 5.992913246154785, + "learning_rate": 1.2735849056603775e-05, + "loss": 0.4892, + "step": 135 + }, + { + "epoch": 0.06430260047281323, + "grad_norm": 6.05405330657959, + "learning_rate": 1.2830188679245283e-05, + "loss": 0.4004, + "step": 136 + }, + { + "epoch": 0.06477541371158392, + "grad_norm": 6.542272090911865, + "learning_rate": 1.2924528301886794e-05, + "loss": 0.5386, + "step": 137 + }, + { + "epoch": 0.06524822695035461, + "grad_norm": 5.304028511047363, + "learning_rate": 1.3018867924528303e-05, + "loss": 0.3309, + "step": 138 + }, + { + "epoch": 0.0657210401891253, + "grad_norm": 4.330917835235596, + "learning_rate": 1.3113207547169814e-05, + "loss": 0.3509, + "step": 139 + }, + { + "epoch": 0.06619385342789598, + "grad_norm": 6.812550067901611, + "learning_rate": 1.320754716981132e-05, + "loss": 0.4435, + "step": 140 + }, + { + "epoch": 0.06619385342789598, + "eval_accuracy": 0.779379157427938, + "eval_f1": 0.5446224256292906, + "eval_loss": 0.4715713858604431, + "eval_precision": 0.6467391304347826, + "eval_recall": 0.47035573122529645, + "eval_runtime": 48.4446, + "eval_samples_per_second": 5.697, + "eval_steps_per_second": 0.186, + "step": 140 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 7.422946453094482, + "learning_rate": 1.3301886792452831e-05, + "loss": 0.406, + "step": 141 + }, + { + "epoch": 0.06713947990543735, + "grad_norm": 7.327658653259277, + "learning_rate": 1.339622641509434e-05, + "loss": 0.4366, + "step": 142 + }, + { + "epoch": 0.06761229314420804, + "grad_norm": 6.69068717956543, + "learning_rate": 1.3490566037735851e-05, + "loss": 0.4263, + "step": 143 + }, + { + "epoch": 0.06808510638297872, + "grad_norm": 4.780946254730225, + "learning_rate": 1.3584905660377358e-05, + "loss": 0.325, + "step": 144 + }, + { + "epoch": 0.06855791962174941, + "grad_norm": 6.016948699951172, + "learning_rate": 1.3679245283018869e-05, + "loss": 0.4426, + "step": 145 + }, + { + "epoch": 0.0690307328605201, + "grad_norm": 5.669694900512695, + "learning_rate": 1.3773584905660378e-05, + "loss": 0.3957, + "step": 146 + }, + { + "epoch": 0.06950354609929078, + "grad_norm": 6.2454609870910645, + "learning_rate": 1.3867924528301888e-05, + "loss": 0.3033, + "step": 147 + }, + { + "epoch": 0.06997635933806147, + "grad_norm": 5.8120808601379395, + "learning_rate": 1.3962264150943397e-05, + "loss": 0.4472, + "step": 148 + }, + { + "epoch": 0.07044917257683216, + "grad_norm": 6.466278553009033, + "learning_rate": 1.4056603773584908e-05, + "loss": 0.3469, + "step": 149 + }, + { + "epoch": 0.07092198581560284, + "grad_norm": 8.212775230407715, + "learning_rate": 1.4150943396226415e-05, + "loss": 0.4478, + "step": 150 + }, + { + "epoch": 0.07139479905437353, + "grad_norm": 7.582151889801025, + "learning_rate": 1.4245283018867926e-05, + "loss": 0.4312, + "step": 151 + }, + { + "epoch": 0.07186761229314421, + "grad_norm": 5.214906215667725, + "learning_rate": 1.4339622641509435e-05, + "loss": 0.3446, + "step": 152 + }, + { + "epoch": 0.07234042553191489, + "grad_norm": 4.743616580963135, + "learning_rate": 1.4433962264150945e-05, + "loss": 0.2665, + "step": 153 + }, + { + "epoch": 0.07281323877068557, + "grad_norm": 5.460316181182861, + "learning_rate": 1.4528301886792452e-05, + "loss": 0.4369, + "step": 154 + }, + { + "epoch": 0.07328605200945626, + "grad_norm": 7.11004638671875, + "learning_rate": 1.4622641509433963e-05, + "loss": 0.3813, + "step": 155 + }, + { + "epoch": 0.07375886524822695, + "grad_norm": 6.461905479431152, + "learning_rate": 1.4716981132075472e-05, + "loss": 0.3413, + "step": 156 + }, + { + "epoch": 0.07423167848699763, + "grad_norm": 6.668741226196289, + "learning_rate": 1.4811320754716983e-05, + "loss": 0.425, + "step": 157 + }, + { + "epoch": 0.07470449172576832, + "grad_norm": 7.922025203704834, + "learning_rate": 1.4905660377358491e-05, + "loss": 0.3927, + "step": 158 + }, + { + "epoch": 0.075177304964539, + "grad_norm": 5.079823017120361, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.34, + "step": 159 + }, + { + "epoch": 0.07565011820330969, + "grad_norm": 5.172731876373291, + "learning_rate": 1.5094339622641511e-05, + "loss": 0.3899, + "step": 160 + }, + { + "epoch": 0.07565011820330969, + "eval_accuracy": 0.7937915742793792, + "eval_f1": 0.5181347150259067, + "eval_loss": 0.4403148889541626, + "eval_precision": 0.7518796992481203, + "eval_recall": 0.3952569169960474, + "eval_runtime": 48.1121, + "eval_samples_per_second": 5.737, + "eval_steps_per_second": 0.187, + "step": 160 + }, + { + "epoch": 0.07612293144208038, + "grad_norm": 6.415732383728027, + "learning_rate": 1.518867924528302e-05, + "loss": 0.3399, + "step": 161 + }, + { + "epoch": 0.07659574468085106, + "grad_norm": 6.380404472351074, + "learning_rate": 1.5283018867924532e-05, + "loss": 0.4384, + "step": 162 + }, + { + "epoch": 0.07706855791962175, + "grad_norm": 4.902369976043701, + "learning_rate": 1.5377358490566038e-05, + "loss": 0.3413, + "step": 163 + }, + { + "epoch": 0.07754137115839244, + "grad_norm": 5.686254024505615, + "learning_rate": 1.547169811320755e-05, + "loss": 0.4252, + "step": 164 + }, + { + "epoch": 0.07801418439716312, + "grad_norm": 6.227957725524902, + "learning_rate": 1.556603773584906e-05, + "loss": 0.3132, + "step": 165 + }, + { + "epoch": 0.07848699763593381, + "grad_norm": 8.092106819152832, + "learning_rate": 1.5660377358490568e-05, + "loss": 0.5402, + "step": 166 + }, + { + "epoch": 0.0789598108747045, + "grad_norm": 6.633399486541748, + "learning_rate": 1.5754716981132077e-05, + "loss": 0.3574, + "step": 167 + }, + { + "epoch": 0.07943262411347518, + "grad_norm": 7.712852954864502, + "learning_rate": 1.5849056603773586e-05, + "loss": 0.435, + "step": 168 + }, + { + "epoch": 0.07990543735224587, + "grad_norm": 4.21342134475708, + "learning_rate": 1.5943396226415095e-05, + "loss": 0.3245, + "step": 169 + }, + { + "epoch": 0.08037825059101655, + "grad_norm": 4.879771709442139, + "learning_rate": 1.6037735849056607e-05, + "loss": 0.2535, + "step": 170 + }, + { + "epoch": 0.08085106382978724, + "grad_norm": 7.206470966339111, + "learning_rate": 1.6132075471698116e-05, + "loss": 0.3628, + "step": 171 + }, + { + "epoch": 0.08132387706855793, + "grad_norm": 4.479485034942627, + "learning_rate": 1.6226415094339625e-05, + "loss": 0.3318, + "step": 172 + }, + { + "epoch": 0.0817966903073286, + "grad_norm": 6.472604751586914, + "learning_rate": 1.6320754716981134e-05, + "loss": 0.4404, + "step": 173 + }, + { + "epoch": 0.08226950354609928, + "grad_norm": 4.094892501831055, + "learning_rate": 1.6415094339622643e-05, + "loss": 0.2986, + "step": 174 + }, + { + "epoch": 0.08274231678486997, + "grad_norm": 5.433969974517822, + "learning_rate": 1.650943396226415e-05, + "loss": 0.3356, + "step": 175 + }, + { + "epoch": 0.08321513002364066, + "grad_norm": 5.57079553604126, + "learning_rate": 1.6603773584905664e-05, + "loss": 0.3288, + "step": 176 + }, + { + "epoch": 0.08368794326241134, + "grad_norm": 5.4054460525512695, + "learning_rate": 1.669811320754717e-05, + "loss": 0.3688, + "step": 177 + }, + { + "epoch": 0.08416075650118203, + "grad_norm": 6.414549350738525, + "learning_rate": 1.679245283018868e-05, + "loss": 0.3791, + "step": 178 + }, + { + "epoch": 0.08463356973995272, + "grad_norm": 6.032560348510742, + "learning_rate": 1.688679245283019e-05, + "loss": 0.3142, + "step": 179 + }, + { + "epoch": 0.0851063829787234, + "grad_norm": 6.080160617828369, + "learning_rate": 1.69811320754717e-05, + "loss": 0.3429, + "step": 180 + }, + { + "epoch": 0.0851063829787234, + "eval_accuracy": 0.8159645232815964, + "eval_f1": 0.5951219512195122, + "eval_loss": 0.4055093824863434, + "eval_precision": 0.7770700636942676, + "eval_recall": 0.48221343873517786, + "eval_runtime": 47.3649, + "eval_samples_per_second": 5.827, + "eval_steps_per_second": 0.19, + "step": 180 + }, + { + "epoch": 0.08557919621749409, + "grad_norm": 5.409966468811035, + "learning_rate": 1.707547169811321e-05, + "loss": 0.2881, + "step": 181 + }, + { + "epoch": 0.08605200945626477, + "grad_norm": 7.760888576507568, + "learning_rate": 1.716981132075472e-05, + "loss": 0.4003, + "step": 182 + }, + { + "epoch": 0.08652482269503546, + "grad_norm": 6.271183013916016, + "learning_rate": 1.7264150943396226e-05, + "loss": 0.3343, + "step": 183 + }, + { + "epoch": 0.08699763593380615, + "grad_norm": 7.139448165893555, + "learning_rate": 1.735849056603774e-05, + "loss": 0.3246, + "step": 184 + }, + { + "epoch": 0.08747044917257683, + "grad_norm": 5.225427627563477, + "learning_rate": 1.7452830188679247e-05, + "loss": 0.3042, + "step": 185 + }, + { + "epoch": 0.08794326241134752, + "grad_norm": 7.858066558837891, + "learning_rate": 1.7547169811320756e-05, + "loss": 0.3843, + "step": 186 + }, + { + "epoch": 0.0884160756501182, + "grad_norm": 7.103234767913818, + "learning_rate": 1.7641509433962265e-05, + "loss": 0.3878, + "step": 187 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 8.041577339172363, + "learning_rate": 1.7735849056603774e-05, + "loss": 0.4482, + "step": 188 + }, + { + "epoch": 0.08936170212765958, + "grad_norm": 6.207291126251221, + "learning_rate": 1.7830188679245283e-05, + "loss": 0.3709, + "step": 189 + }, + { + "epoch": 0.08983451536643026, + "grad_norm": 5.278400897979736, + "learning_rate": 1.7924528301886795e-05, + "loss": 0.2548, + "step": 190 + }, + { + "epoch": 0.09030732860520095, + "grad_norm": 6.8568854331970215, + "learning_rate": 1.8018867924528304e-05, + "loss": 0.3724, + "step": 191 + }, + { + "epoch": 0.09078014184397164, + "grad_norm": 6.631660461425781, + "learning_rate": 1.8113207547169813e-05, + "loss": 0.3654, + "step": 192 + }, + { + "epoch": 0.09125295508274232, + "grad_norm": 7.872669696807861, + "learning_rate": 1.8207547169811322e-05, + "loss": 0.3598, + "step": 193 + }, + { + "epoch": 0.091725768321513, + "grad_norm": 5.30977725982666, + "learning_rate": 1.830188679245283e-05, + "loss": 0.316, + "step": 194 + }, + { + "epoch": 0.09219858156028368, + "grad_norm": 6.427607536315918, + "learning_rate": 1.839622641509434e-05, + "loss": 0.3215, + "step": 195 + }, + { + "epoch": 0.09267139479905437, + "grad_norm": 6.099403381347656, + "learning_rate": 1.8490566037735852e-05, + "loss": 0.3482, + "step": 196 + }, + { + "epoch": 0.09314420803782505, + "grad_norm": 5.679231643676758, + "learning_rate": 1.8584905660377358e-05, + "loss": 0.3189, + "step": 197 + }, + { + "epoch": 0.09361702127659574, + "grad_norm": 7.309570789337158, + "learning_rate": 1.867924528301887e-05, + "loss": 0.4746, + "step": 198 + }, + { + "epoch": 0.09408983451536643, + "grad_norm": 8.899137496948242, + "learning_rate": 1.877358490566038e-05, + "loss": 0.5097, + "step": 199 + }, + { + "epoch": 0.09456264775413711, + "grad_norm": 5.904891014099121, + "learning_rate": 1.8867924528301888e-05, + "loss": 0.3529, + "step": 200 + }, + { + "epoch": 0.09456264775413711, + "eval_accuracy": 0.8181818181818182, + "eval_f1": 0.6255707762557078, + "eval_loss": 0.38472801446914673, + "eval_precision": 0.7405405405405405, + "eval_recall": 0.541501976284585, + "eval_runtime": 48.2579, + "eval_samples_per_second": 5.719, + "eval_steps_per_second": 0.186, + "step": 200 + }, + { + "epoch": 0.0950354609929078, + "grad_norm": 5.29753303527832, + "learning_rate": 1.8962264150943397e-05, + "loss": 0.2815, + "step": 201 + }, + { + "epoch": 0.09550827423167849, + "grad_norm": 8.292261123657227, + "learning_rate": 1.905660377358491e-05, + "loss": 0.3791, + "step": 202 + }, + { + "epoch": 0.09598108747044917, + "grad_norm": 5.616471290588379, + "learning_rate": 1.9150943396226415e-05, + "loss": 0.3434, + "step": 203 + }, + { + "epoch": 0.09645390070921986, + "grad_norm": 4.834171772003174, + "learning_rate": 1.9245283018867927e-05, + "loss": 0.3383, + "step": 204 + }, + { + "epoch": 0.09692671394799054, + "grad_norm": 6.36716890335083, + "learning_rate": 1.9339622641509436e-05, + "loss": 0.3548, + "step": 205 + }, + { + "epoch": 0.09739952718676123, + "grad_norm": 5.7878899574279785, + "learning_rate": 1.9433962264150945e-05, + "loss": 0.3652, + "step": 206 + }, + { + "epoch": 0.09787234042553192, + "grad_norm": 5.697458267211914, + "learning_rate": 1.9528301886792454e-05, + "loss": 0.2581, + "step": 207 + }, + { + "epoch": 0.0983451536643026, + "grad_norm": 4.944214344024658, + "learning_rate": 1.9622641509433963e-05, + "loss": 0.3725, + "step": 208 + }, + { + "epoch": 0.09881796690307329, + "grad_norm": 5.800679683685303, + "learning_rate": 1.971698113207547e-05, + "loss": 0.2957, + "step": 209 + }, + { + "epoch": 0.09929078014184398, + "grad_norm": 5.455956935882568, + "learning_rate": 1.9811320754716984e-05, + "loss": 0.3826, + "step": 210 + }, + { + "epoch": 0.09976359338061466, + "grad_norm": 4.2240519523620605, + "learning_rate": 1.9905660377358493e-05, + "loss": 0.276, + "step": 211 + }, + { + "epoch": 0.10023640661938535, + "grad_norm": 4.200746059417725, + "learning_rate": 2e-05, + "loss": 0.2807, + "step": 212 + }, + { + "epoch": 0.10070921985815603, + "grad_norm": 5.269329071044922, + "learning_rate": 1.999998637325671e-05, + "loss": 0.3351, + "step": 213 + }, + { + "epoch": 0.10118203309692671, + "grad_norm": 4.950570583343506, + "learning_rate": 1.999994549306397e-05, + "loss": 0.3145, + "step": 214 + }, + { + "epoch": 0.1016548463356974, + "grad_norm": 6.465134143829346, + "learning_rate": 1.9999877359533202e-05, + "loss": 0.351, + "step": 215 + }, + { + "epoch": 0.10212765957446808, + "grad_norm": 6.148433685302734, + "learning_rate": 1.9999781972850082e-05, + "loss": 0.3563, + "step": 216 + }, + { + "epoch": 0.10260047281323877, + "grad_norm": 4.79353666305542, + "learning_rate": 1.9999659333274582e-05, + "loss": 0.2827, + "step": 217 + }, + { + "epoch": 0.10307328605200945, + "grad_norm": 5.91294002532959, + "learning_rate": 1.9999509441140934e-05, + "loss": 0.3741, + "step": 218 + }, + { + "epoch": 0.10354609929078014, + "grad_norm": 6.508899688720703, + "learning_rate": 1.9999332296857642e-05, + "loss": 0.3454, + "step": 219 + }, + { + "epoch": 0.10401891252955082, + "grad_norm": 6.511887073516846, + "learning_rate": 1.9999127900907496e-05, + "loss": 0.36, + "step": 220 + }, + { + "epoch": 0.10401891252955082, + "eval_accuracy": 0.8181818181818182, + "eval_f1": 0.6076555023923444, + "eval_loss": 0.38239341974258423, + "eval_precision": 0.7696969696969697, + "eval_recall": 0.5019762845849802, + "eval_runtime": 49.0986, + "eval_samples_per_second": 5.621, + "eval_steps_per_second": 0.183, + "step": 220 + }, + { + "epoch": 0.10449172576832151, + "grad_norm": 5.294708728790283, + "learning_rate": 1.9998896253847536e-05, + "loss": 0.345, + "step": 221 + }, + { + "epoch": 0.1049645390070922, + "grad_norm": 6.348694324493408, + "learning_rate": 1.9998637356309088e-05, + "loss": 0.2954, + "step": 222 + }, + { + "epoch": 0.10543735224586288, + "grad_norm": 5.638726234436035, + "learning_rate": 1.9998351208997734e-05, + "loss": 0.3151, + "step": 223 + }, + { + "epoch": 0.10591016548463357, + "grad_norm": 6.741607666015625, + "learning_rate": 1.999803781269333e-05, + "loss": 0.376, + "step": 224 + }, + { + "epoch": 0.10638297872340426, + "grad_norm": 5.410939693450928, + "learning_rate": 1.999769716824998e-05, + "loss": 0.2694, + "step": 225 + }, + { + "epoch": 0.10685579196217494, + "grad_norm": 6.066391944885254, + "learning_rate": 1.9997329276596073e-05, + "loss": 0.3443, + "step": 226 + }, + { + "epoch": 0.10732860520094563, + "grad_norm": 6.088653087615967, + "learning_rate": 1.999693413873423e-05, + "loss": 0.3617, + "step": 227 + }, + { + "epoch": 0.10780141843971631, + "grad_norm": 6.114930152893066, + "learning_rate": 1.9996511755741346e-05, + "loss": 0.2752, + "step": 228 + }, + { + "epoch": 0.108274231678487, + "grad_norm": 5.094395160675049, + "learning_rate": 1.999606212876856e-05, + "loss": 0.2251, + "step": 229 + }, + { + "epoch": 0.10874704491725769, + "grad_norm": 6.448328495025635, + "learning_rate": 1.999558525904126e-05, + "loss": 0.3729, + "step": 230 + }, + { + "epoch": 0.10921985815602837, + "grad_norm": 5.548649787902832, + "learning_rate": 1.9995081147859087e-05, + "loss": 0.3591, + "step": 231 + }, + { + "epoch": 0.10969267139479906, + "grad_norm": 8.929473876953125, + "learning_rate": 1.9994549796595913e-05, + "loss": 0.3535, + "step": 232 + }, + { + "epoch": 0.11016548463356975, + "grad_norm": 4.452419757843018, + "learning_rate": 1.9993991206699865e-05, + "loss": 0.2647, + "step": 233 + }, + { + "epoch": 0.11063829787234042, + "grad_norm": 4.958791732788086, + "learning_rate": 1.999340537969329e-05, + "loss": 0.3027, + "step": 234 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 6.243503093719482, + "learning_rate": 1.9992792317172765e-05, + "loss": 0.3199, + "step": 235 + }, + { + "epoch": 0.11158392434988179, + "grad_norm": 5.826799392700195, + "learning_rate": 1.9992152020809113e-05, + "loss": 0.3239, + "step": 236 + }, + { + "epoch": 0.11205673758865248, + "grad_norm": 8.090536117553711, + "learning_rate": 1.9991484492347353e-05, + "loss": 0.4326, + "step": 237 + }, + { + "epoch": 0.11252955082742316, + "grad_norm": 7.066166877746582, + "learning_rate": 1.9990789733606733e-05, + "loss": 0.2672, + "step": 238 + }, + { + "epoch": 0.11300236406619385, + "grad_norm": 5.99971342086792, + "learning_rate": 1.999006774648072e-05, + "loss": 0.2984, + "step": 239 + }, + { + "epoch": 0.11347517730496454, + "grad_norm": 5.290529727935791, + "learning_rate": 1.998931853293698e-05, + "loss": 0.2875, + "step": 240 + }, + { + "epoch": 0.11347517730496454, + "eval_accuracy": 0.8226164079822617, + "eval_f1": 0.6428571428571429, + "eval_loss": 0.3577645719051361, + "eval_precision": 0.7384615384615385, + "eval_recall": 0.5691699604743083, + "eval_runtime": 48.6337, + "eval_samples_per_second": 5.675, + "eval_steps_per_second": 0.185, + "step": 240 + }, + { + "epoch": 0.11394799054373522, + "grad_norm": 6.742930889129639, + "learning_rate": 1.9988542095017373e-05, + "loss": 0.3273, + "step": 241 + }, + { + "epoch": 0.11442080378250591, + "grad_norm": 6.42619514465332, + "learning_rate": 1.9987738434837973e-05, + "loss": 0.3949, + "step": 242 + }, + { + "epoch": 0.1148936170212766, + "grad_norm": 5.628868103027344, + "learning_rate": 1.9986907554589024e-05, + "loss": 0.2947, + "step": 243 + }, + { + "epoch": 0.11536643026004728, + "grad_norm": 6.224806308746338, + "learning_rate": 1.9986049456534972e-05, + "loss": 0.4323, + "step": 244 + }, + { + "epoch": 0.11583924349881797, + "grad_norm": 8.325421333312988, + "learning_rate": 1.9985164143014433e-05, + "loss": 0.2801, + "step": 245 + }, + { + "epoch": 0.11631205673758865, + "grad_norm": 6.2002129554748535, + "learning_rate": 1.9984251616440195e-05, + "loss": 0.2003, + "step": 246 + }, + { + "epoch": 0.11678486997635934, + "grad_norm": 6.008997917175293, + "learning_rate": 1.9983311879299203e-05, + "loss": 0.3376, + "step": 247 + }, + { + "epoch": 0.11725768321513003, + "grad_norm": 5.332534313201904, + "learning_rate": 1.9982344934152577e-05, + "loss": 0.3856, + "step": 248 + }, + { + "epoch": 0.11773049645390071, + "grad_norm": 3.9309067726135254, + "learning_rate": 1.9981350783635582e-05, + "loss": 0.2775, + "step": 249 + }, + { + "epoch": 0.1182033096926714, + "grad_norm": 6.058276653289795, + "learning_rate": 1.9980329430457616e-05, + "loss": 0.2785, + "step": 250 + }, + { + "epoch": 0.11867612293144209, + "grad_norm": 4.346603870391846, + "learning_rate": 1.997928087740222e-05, + "loss": 0.2102, + "step": 251 + }, + { + "epoch": 0.11914893617021277, + "grad_norm": 5.8436408042907715, + "learning_rate": 1.9978205127327085e-05, + "loss": 0.321, + "step": 252 + }, + { + "epoch": 0.11962174940898346, + "grad_norm": 8.181411743164062, + "learning_rate": 1.9977102183163984e-05, + "loss": 0.3408, + "step": 253 + }, + { + "epoch": 0.12009456264775414, + "grad_norm": 6.579717636108398, + "learning_rate": 1.997597204791884e-05, + "loss": 0.3031, + "step": 254 + }, + { + "epoch": 0.12056737588652482, + "grad_norm": 6.751533031463623, + "learning_rate": 1.9974814724671658e-05, + "loss": 0.3331, + "step": 255 + }, + { + "epoch": 0.1210401891252955, + "grad_norm": 5.326685905456543, + "learning_rate": 1.9973630216576547e-05, + "loss": 0.2865, + "step": 256 + }, + { + "epoch": 0.12151300236406619, + "grad_norm": 6.129054546356201, + "learning_rate": 1.9972418526861704e-05, + "loss": 0.243, + "step": 257 + }, + { + "epoch": 0.12198581560283688, + "grad_norm": 4.441340446472168, + "learning_rate": 1.997117965882941e-05, + "loss": 0.1915, + "step": 258 + }, + { + "epoch": 0.12245862884160756, + "grad_norm": 6.2238569259643555, + "learning_rate": 1.9969913615856015e-05, + "loss": 0.3069, + "step": 259 + }, + { + "epoch": 0.12293144208037825, + "grad_norm": 6.094357967376709, + "learning_rate": 1.9968620401391917e-05, + "loss": 0.3237, + "step": 260 + }, + { + "epoch": 0.12293144208037825, + "eval_accuracy": 0.8425720620842572, + "eval_f1": 0.710204081632653, + "eval_loss": 0.34573158621788025, + "eval_precision": 0.7341772151898734, + "eval_recall": 0.6877470355731226, + "eval_runtime": 49.3073, + "eval_samples_per_second": 5.598, + "eval_steps_per_second": 0.183, + "step": 260 + }, + { + "epoch": 0.12340425531914893, + "grad_norm": 7.97859001159668, + "learning_rate": 1.9967300018961582e-05, + "loss": 0.235, + "step": 261 + }, + { + "epoch": 0.12387706855791962, + "grad_norm": 8.9214448928833, + "learning_rate": 1.9965952472163517e-05, + "loss": 0.3719, + "step": 262 + }, + { + "epoch": 0.1243498817966903, + "grad_norm": 9.066556930541992, + "learning_rate": 1.996457776467025e-05, + "loss": 0.3064, + "step": 263 + }, + { + "epoch": 0.12482269503546099, + "grad_norm": 5.2799177169799805, + "learning_rate": 1.996317590022834e-05, + "loss": 0.3362, + "step": 264 + }, + { + "epoch": 0.12529550827423167, + "grad_norm": 7.641961574554443, + "learning_rate": 1.996174688265836e-05, + "loss": 0.326, + "step": 265 + }, + { + "epoch": 0.12576832151300235, + "grad_norm": 7.163477420806885, + "learning_rate": 1.9960290715854874e-05, + "loss": 0.2446, + "step": 266 + }, + { + "epoch": 0.12624113475177304, + "grad_norm": 5.137381553649902, + "learning_rate": 1.9958807403786452e-05, + "loss": 0.2447, + "step": 267 + }, + { + "epoch": 0.12671394799054372, + "grad_norm": 8.010651588439941, + "learning_rate": 1.995729695049563e-05, + "loss": 0.3647, + "step": 268 + }, + { + "epoch": 0.1271867612293144, + "grad_norm": 5.50241231918335, + "learning_rate": 1.995575936009893e-05, + "loss": 0.3076, + "step": 269 + }, + { + "epoch": 0.1276595744680851, + "grad_norm": 5.5639567375183105, + "learning_rate": 1.995419463678681e-05, + "loss": 0.3547, + "step": 270 + }, + { + "epoch": 0.12813238770685578, + "grad_norm": 5.540798187255859, + "learning_rate": 1.9952602784823688e-05, + "loss": 0.385, + "step": 271 + }, + { + "epoch": 0.12860520094562647, + "grad_norm": 8.423896789550781, + "learning_rate": 1.9950983808547923e-05, + "loss": 0.2973, + "step": 272 + }, + { + "epoch": 0.12907801418439716, + "grad_norm": 7.910554885864258, + "learning_rate": 1.994933771237179e-05, + "loss": 0.232, + "step": 273 + }, + { + "epoch": 0.12955082742316784, + "grad_norm": 5.810388565063477, + "learning_rate": 1.9947664500781464e-05, + "loss": 0.3371, + "step": 274 + }, + { + "epoch": 0.13002364066193853, + "grad_norm": 4.976069450378418, + "learning_rate": 1.9945964178337037e-05, + "loss": 0.2469, + "step": 275 + }, + { + "epoch": 0.13049645390070921, + "grad_norm": 5.738739013671875, + "learning_rate": 1.9944236749672483e-05, + "loss": 0.3208, + "step": 276 + }, + { + "epoch": 0.1309692671394799, + "grad_norm": 6.816530704498291, + "learning_rate": 1.9942482219495644e-05, + "loss": 0.327, + "step": 277 + }, + { + "epoch": 0.1314420803782506, + "grad_norm": 4.750877857208252, + "learning_rate": 1.9940700592588228e-05, + "loss": 0.2176, + "step": 278 + }, + { + "epoch": 0.13191489361702127, + "grad_norm": 4.42209529876709, + "learning_rate": 1.9938891873805787e-05, + "loss": 0.3138, + "step": 279 + }, + { + "epoch": 0.13238770685579196, + "grad_norm": 3.6869099140167236, + "learning_rate": 1.993705606807771e-05, + "loss": 0.2309, + "step": 280 + }, + { + "epoch": 0.13238770685579196, + "eval_accuracy": 0.8203991130820399, + "eval_f1": 0.5759162303664922, + "eval_loss": 0.36263027787208557, + "eval_precision": 0.8527131782945736, + "eval_recall": 0.43478260869565216, + "eval_runtime": 49.2617, + "eval_samples_per_second": 5.603, + "eval_steps_per_second": 0.183, + "step": 280 + }, + { + "epoch": 0.13286052009456265, + "grad_norm": 8.313398361206055, + "learning_rate": 1.9935193180407216e-05, + "loss": 0.3767, + "step": 281 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 3.1473255157470703, + "learning_rate": 1.9933303215871313e-05, + "loss": 0.21, + "step": 282 + }, + { + "epoch": 0.13380614657210402, + "grad_norm": 4.254230499267578, + "learning_rate": 1.9931386179620816e-05, + "loss": 0.2737, + "step": 283 + }, + { + "epoch": 0.1342789598108747, + "grad_norm": 7.221229553222656, + "learning_rate": 1.9929442076880323e-05, + "loss": 0.2579, + "step": 284 + }, + { + "epoch": 0.1347517730496454, + "grad_norm": 5.343625068664551, + "learning_rate": 1.9927470912948184e-05, + "loss": 0.3152, + "step": 285 + }, + { + "epoch": 0.13522458628841608, + "grad_norm": 6.4857916831970215, + "learning_rate": 1.992547269319651e-05, + "loss": 0.3524, + "step": 286 + }, + { + "epoch": 0.13569739952718676, + "grad_norm": 6.073742389678955, + "learning_rate": 1.9923447423071153e-05, + "loss": 0.3117, + "step": 287 + }, + { + "epoch": 0.13617021276595745, + "grad_norm": 6.168652057647705, + "learning_rate": 1.992139510809167e-05, + "loss": 0.343, + "step": 288 + }, + { + "epoch": 0.13664302600472814, + "grad_norm": 4.182333469390869, + "learning_rate": 1.9919315753851343e-05, + "loss": 0.2652, + "step": 289 + }, + { + "epoch": 0.13711583924349882, + "grad_norm": 5.490057945251465, + "learning_rate": 1.9917209366017134e-05, + "loss": 0.3377, + "step": 290 + }, + { + "epoch": 0.1375886524822695, + "grad_norm": 4.765297889709473, + "learning_rate": 1.9915075950329683e-05, + "loss": 0.2798, + "step": 291 + }, + { + "epoch": 0.1380614657210402, + "grad_norm": 6.1175923347473145, + "learning_rate": 1.9912915512603294e-05, + "loss": 0.3759, + "step": 292 + }, + { + "epoch": 0.13853427895981088, + "grad_norm": 7.154444217681885, + "learning_rate": 1.991072805872591e-05, + "loss": 0.3252, + "step": 293 + }, + { + "epoch": 0.13900709219858157, + "grad_norm": 6.086852550506592, + "learning_rate": 1.990851359465911e-05, + "loss": 0.3158, + "step": 294 + }, + { + "epoch": 0.13947990543735225, + "grad_norm": 6.656464576721191, + "learning_rate": 1.990627212643808e-05, + "loss": 0.3349, + "step": 295 + }, + { + "epoch": 0.13995271867612294, + "grad_norm": 5.9059062004089355, + "learning_rate": 1.9904003660171597e-05, + "loss": 0.2919, + "step": 296 + }, + { + "epoch": 0.14042553191489363, + "grad_norm": 7.576990604400635, + "learning_rate": 1.990170820204203e-05, + "loss": 0.2865, + "step": 297 + }, + { + "epoch": 0.1408983451536643, + "grad_norm": 5.305886268615723, + "learning_rate": 1.9899385758305298e-05, + "loss": 0.3499, + "step": 298 + }, + { + "epoch": 0.141371158392435, + "grad_norm": 4.519371032714844, + "learning_rate": 1.9897036335290868e-05, + "loss": 0.2928, + "step": 299 + }, + { + "epoch": 0.14184397163120568, + "grad_norm": 4.735250949859619, + "learning_rate": 1.989465993940174e-05, + "loss": 0.2843, + "step": 300 + }, + { + "epoch": 0.14184397163120568, + "eval_accuracy": 0.8325942350332595, + "eval_f1": 0.6215538847117794, + "eval_loss": 0.35112443566322327, + "eval_precision": 0.8493150684931506, + "eval_recall": 0.4901185770750988, + "eval_runtime": 49.5436, + "eval_samples_per_second": 5.571, + "eval_steps_per_second": 0.182, + "step": 300 + }, + { + "epoch": 0.14231678486997637, + "grad_norm": 6.546813488006592, + "learning_rate": 1.9892256577114422e-05, + "loss": 0.2892, + "step": 301 + }, + { + "epoch": 0.14278959810874706, + "grad_norm": 10.950983047485352, + "learning_rate": 1.9889826254978915e-05, + "loss": 0.4064, + "step": 302 + }, + { + "epoch": 0.14326241134751774, + "grad_norm": 3.705230474472046, + "learning_rate": 1.988736897961869e-05, + "loss": 0.2045, + "step": 303 + }, + { + "epoch": 0.14373522458628843, + "grad_norm": 8.16879940032959, + "learning_rate": 1.9884884757730683e-05, + "loss": 0.4112, + "step": 304 + }, + { + "epoch": 0.14420803782505912, + "grad_norm": 3.9046270847320557, + "learning_rate": 1.988237359608526e-05, + "loss": 0.234, + "step": 305 + }, + { + "epoch": 0.14468085106382977, + "grad_norm": 3.2188122272491455, + "learning_rate": 1.987983550152622e-05, + "loss": 0.16, + "step": 306 + }, + { + "epoch": 0.14515366430260046, + "grad_norm": 4.718387603759766, + "learning_rate": 1.987727048097075e-05, + "loss": 0.2409, + "step": 307 + }, + { + "epoch": 0.14562647754137115, + "grad_norm": 4.721340656280518, + "learning_rate": 1.9874678541409427e-05, + "loss": 0.3075, + "step": 308 + }, + { + "epoch": 0.14609929078014183, + "grad_norm": 8.97851848602295, + "learning_rate": 1.9872059689906188e-05, + "loss": 0.3628, + "step": 309 + }, + { + "epoch": 0.14657210401891252, + "grad_norm": 5.3872809410095215, + "learning_rate": 1.9869413933598317e-05, + "loss": 0.2832, + "step": 310 + }, + { + "epoch": 0.1470449172576832, + "grad_norm": 6.58519172668457, + "learning_rate": 1.986674127969642e-05, + "loss": 0.2423, + "step": 311 + }, + { + "epoch": 0.1475177304964539, + "grad_norm": 6.278075218200684, + "learning_rate": 1.9864041735484417e-05, + "loss": 0.3341, + "step": 312 + }, + { + "epoch": 0.14799054373522458, + "grad_norm": 6.8102288246154785, + "learning_rate": 1.986131530831951e-05, + "loss": 0.2887, + "step": 313 + }, + { + "epoch": 0.14846335697399526, + "grad_norm": 6.963206768035889, + "learning_rate": 1.985856200563215e-05, + "loss": 0.2883, + "step": 314 + }, + { + "epoch": 0.14893617021276595, + "grad_norm": 5.686670303344727, + "learning_rate": 1.9855781834926057e-05, + "loss": 0.3673, + "step": 315 + }, + { + "epoch": 0.14940898345153664, + "grad_norm": 5.6203227043151855, + "learning_rate": 1.985297480377816e-05, + "loss": 0.2601, + "step": 316 + }, + { + "epoch": 0.14988179669030732, + "grad_norm": 5.668765068054199, + "learning_rate": 1.98501409198386e-05, + "loss": 0.2868, + "step": 317 + }, + { + "epoch": 0.150354609929078, + "grad_norm": 4.719535827636719, + "learning_rate": 1.9847280190830706e-05, + "loss": 0.2295, + "step": 318 + }, + { + "epoch": 0.1508274231678487, + "grad_norm": 6.404664039611816, + "learning_rate": 1.9844392624550952e-05, + "loss": 0.3816, + "step": 319 + }, + { + "epoch": 0.15130023640661938, + "grad_norm": 4.635862827301025, + "learning_rate": 1.9841478228868966e-05, + "loss": 0.2694, + "step": 320 + }, + { + "epoch": 0.15130023640661938, + "eval_accuracy": 0.8337028824833703, + "eval_f1": 0.6287128712871287, + "eval_loss": 0.34866657853126526, + "eval_precision": 0.8410596026490066, + "eval_recall": 0.5019762845849802, + "eval_runtime": 49.2171, + "eval_samples_per_second": 5.608, + "eval_steps_per_second": 0.183, + "step": 320 + }, + { + "epoch": 0.15177304964539007, + "grad_norm": 6.7525787353515625, + "learning_rate": 1.983853701172749e-05, + "loss": 0.3208, + "step": 321 + }, + { + "epoch": 0.15224586288416075, + "grad_norm": 4.8178019523620605, + "learning_rate": 1.9835568981142376e-05, + "loss": 0.2365, + "step": 322 + }, + { + "epoch": 0.15271867612293144, + "grad_norm": 6.369754791259766, + "learning_rate": 1.9832574145202524e-05, + "loss": 0.3079, + "step": 323 + }, + { + "epoch": 0.15319148936170213, + "grad_norm": 6.613752365112305, + "learning_rate": 1.982955251206993e-05, + "loss": 0.3133, + "step": 324 + }, + { + "epoch": 0.1536643026004728, + "grad_norm": 7.213798999786377, + "learning_rate": 1.9826504089979573e-05, + "loss": 0.3551, + "step": 325 + }, + { + "epoch": 0.1541371158392435, + "grad_norm": 5.0920515060424805, + "learning_rate": 1.9823428887239484e-05, + "loss": 0.2295, + "step": 326 + }, + { + "epoch": 0.15460992907801419, + "grad_norm": 6.177611827850342, + "learning_rate": 1.9820326912230654e-05, + "loss": 0.3048, + "step": 327 + }, + { + "epoch": 0.15508274231678487, + "grad_norm": 6.256964683532715, + "learning_rate": 1.981719817340705e-05, + "loss": 0.3382, + "step": 328 + }, + { + "epoch": 0.15555555555555556, + "grad_norm": 7.9319915771484375, + "learning_rate": 1.9814042679295574e-05, + "loss": 0.4387, + "step": 329 + }, + { + "epoch": 0.15602836879432624, + "grad_norm": 5.281505107879639, + "learning_rate": 1.981086043849605e-05, + "loss": 0.2951, + "step": 330 + }, + { + "epoch": 0.15650118203309693, + "grad_norm": 6.895681858062744, + "learning_rate": 1.9807651459681195e-05, + "loss": 0.399, + "step": 331 + }, + { + "epoch": 0.15697399527186762, + "grad_norm": 6.184955596923828, + "learning_rate": 1.9804415751596587e-05, + "loss": 0.2605, + "step": 332 + }, + { + "epoch": 0.1574468085106383, + "grad_norm": 6.7411699295043945, + "learning_rate": 1.9801153323060667e-05, + "loss": 0.3157, + "step": 333 + }, + { + "epoch": 0.157919621749409, + "grad_norm": 3.7483937740325928, + "learning_rate": 1.9797864182964687e-05, + "loss": 0.2806, + "step": 334 + }, + { + "epoch": 0.15839243498817968, + "grad_norm": 7.106858253479004, + "learning_rate": 1.97945483402727e-05, + "loss": 0.3, + "step": 335 + }, + { + "epoch": 0.15886524822695036, + "grad_norm": 6.808032512664795, + "learning_rate": 1.9791205804021537e-05, + "loss": 0.3269, + "step": 336 + }, + { + "epoch": 0.15933806146572105, + "grad_norm": 5.708781719207764, + "learning_rate": 1.978783658332077e-05, + "loss": 0.248, + "step": 337 + }, + { + "epoch": 0.15981087470449173, + "grad_norm": 4.7683587074279785, + "learning_rate": 1.9784440687352708e-05, + "loss": 0.2693, + "step": 338 + }, + { + "epoch": 0.16028368794326242, + "grad_norm": 4.79544734954834, + "learning_rate": 1.9781018125372337e-05, + "loss": 0.2603, + "step": 339 + }, + { + "epoch": 0.1607565011820331, + "grad_norm": 8.155563354492188, + "learning_rate": 1.9777568906707344e-05, + "loss": 0.3854, + "step": 340 + }, + { + "epoch": 0.1607565011820331, + "eval_accuracy": 0.8192904656319291, + "eval_f1": 0.5788113695090439, + "eval_loss": 0.35731109976768494, + "eval_precision": 0.835820895522388, + "eval_recall": 0.4426877470355731, + "eval_runtime": 49.5821, + "eval_samples_per_second": 5.567, + "eval_steps_per_second": 0.182, + "step": 340 + }, + { + "epoch": 0.1612293144208038, + "grad_norm": 4.6710286140441895, + "learning_rate": 1.977409304075805e-05, + "loss": 0.2523, + "step": 341 + }, + { + "epoch": 0.16170212765957448, + "grad_norm": 5.3869781494140625, + "learning_rate": 1.97705905369974e-05, + "loss": 0.3004, + "step": 342 + }, + { + "epoch": 0.16217494089834517, + "grad_norm": 4.742314338684082, + "learning_rate": 1.976706140497094e-05, + "loss": 0.2293, + "step": 343 + }, + { + "epoch": 0.16264775413711585, + "grad_norm": 7.420506477355957, + "learning_rate": 1.9763505654296782e-05, + "loss": 0.2997, + "step": 344 + }, + { + "epoch": 0.16312056737588654, + "grad_norm": 6.123251438140869, + "learning_rate": 1.9759923294665588e-05, + "loss": 0.2884, + "step": 345 + }, + { + "epoch": 0.1635933806146572, + "grad_norm": 8.684674263000488, + "learning_rate": 1.9756314335840535e-05, + "loss": 0.3789, + "step": 346 + }, + { + "epoch": 0.16406619385342788, + "grad_norm": 6.067385673522949, + "learning_rate": 1.97526787876573e-05, + "loss": 0.3224, + "step": 347 + }, + { + "epoch": 0.16453900709219857, + "grad_norm": 5.475379467010498, + "learning_rate": 1.9749016660024014e-05, + "loss": 0.2177, + "step": 348 + }, + { + "epoch": 0.16501182033096926, + "grad_norm": 5.6096391677856445, + "learning_rate": 1.9745327962921253e-05, + "loss": 0.2141, + "step": 349 + }, + { + "epoch": 0.16548463356973994, + "grad_norm": 6.197654724121094, + "learning_rate": 1.9741612706402002e-05, + "loss": 0.3054, + "step": 350 + }, + { + "epoch": 0.16595744680851063, + "grad_norm": 5.453296184539795, + "learning_rate": 1.973787090059163e-05, + "loss": 0.2404, + "step": 351 + }, + { + "epoch": 0.16643026004728131, + "grad_norm": 4.969869613647461, + "learning_rate": 1.9734102555687868e-05, + "loss": 0.2441, + "step": 352 + }, + { + "epoch": 0.166903073286052, + "grad_norm": 5.547883987426758, + "learning_rate": 1.9730307681960763e-05, + "loss": 0.255, + "step": 353 + }, + { + "epoch": 0.1673758865248227, + "grad_norm": 8.373857498168945, + "learning_rate": 1.972648628975267e-05, + "loss": 0.323, + "step": 354 + }, + { + "epoch": 0.16784869976359337, + "grad_norm": 9.678048133850098, + "learning_rate": 1.9722638389478218e-05, + "loss": 0.3685, + "step": 355 + }, + { + "epoch": 0.16832151300236406, + "grad_norm": 5.831118106842041, + "learning_rate": 1.9718763991624277e-05, + "loss": 0.2394, + "step": 356 + }, + { + "epoch": 0.16879432624113475, + "grad_norm": 5.348067760467529, + "learning_rate": 1.9714863106749928e-05, + "loss": 0.2312, + "step": 357 + }, + { + "epoch": 0.16926713947990543, + "grad_norm": 5.5273518562316895, + "learning_rate": 1.9710935745486447e-05, + "loss": 0.2442, + "step": 358 + }, + { + "epoch": 0.16973995271867612, + "grad_norm": 4.848127365112305, + "learning_rate": 1.9706981918537257e-05, + "loss": 0.2208, + "step": 359 + }, + { + "epoch": 0.1702127659574468, + "grad_norm": 5.912291526794434, + "learning_rate": 1.970300163667792e-05, + "loss": 0.3062, + "step": 360 + }, + { + "epoch": 0.1702127659574468, + "eval_accuracy": 0.8470066518847007, + "eval_f1": 0.7, + "eval_loss": 0.326249897480011, + "eval_precision": 0.7777777777777778, + "eval_recall": 0.6363636363636364, + "eval_runtime": 49.3056, + "eval_samples_per_second": 5.598, + "eval_steps_per_second": 0.183, + "step": 360 + }, + { + "epoch": 0.1706855791962175, + "grad_norm": 7.104170322418213, + "learning_rate": 1.9698994910756092e-05, + "loss": 0.2781, + "step": 361 + }, + { + "epoch": 0.17115839243498818, + "grad_norm": 6.990221977233887, + "learning_rate": 1.969496175169149e-05, + "loss": 0.2876, + "step": 362 + }, + { + "epoch": 0.17163120567375886, + "grad_norm": 6.5197014808654785, + "learning_rate": 1.9690902170475894e-05, + "loss": 0.279, + "step": 363 + }, + { + "epoch": 0.17210401891252955, + "grad_norm": 4.127284526824951, + "learning_rate": 1.9686816178173065e-05, + "loss": 0.2238, + "step": 364 + }, + { + "epoch": 0.17257683215130024, + "grad_norm": 7.10957145690918, + "learning_rate": 1.968270378591876e-05, + "loss": 0.3302, + "step": 365 + }, + { + "epoch": 0.17304964539007092, + "grad_norm": 5.990342617034912, + "learning_rate": 1.967856500492068e-05, + "loss": 0.2445, + "step": 366 + }, + { + "epoch": 0.1735224586288416, + "grad_norm": 8.701051712036133, + "learning_rate": 1.9674399846458455e-05, + "loss": 0.3321, + "step": 367 + }, + { + "epoch": 0.1739952718676123, + "grad_norm": 6.448107719421387, + "learning_rate": 1.9670208321883588e-05, + "loss": 0.3304, + "step": 368 + }, + { + "epoch": 0.17446808510638298, + "grad_norm": 6.382139682769775, + "learning_rate": 1.966599044261944e-05, + "loss": 0.2925, + "step": 369 + }, + { + "epoch": 0.17494089834515367, + "grad_norm": 5.474351406097412, + "learning_rate": 1.9661746220161208e-05, + "loss": 0.3041, + "step": 370 + }, + { + "epoch": 0.17541371158392435, + "grad_norm": 6.983358383178711, + "learning_rate": 1.965747566607588e-05, + "loss": 0.3657, + "step": 371 + }, + { + "epoch": 0.17588652482269504, + "grad_norm": 6.531889915466309, + "learning_rate": 1.9653178792002203e-05, + "loss": 0.2679, + "step": 372 + }, + { + "epoch": 0.17635933806146573, + "grad_norm": 4.009150505065918, + "learning_rate": 1.964885560965065e-05, + "loss": 0.2429, + "step": 373 + }, + { + "epoch": 0.1768321513002364, + "grad_norm": 4.717473983764648, + "learning_rate": 1.964450613080341e-05, + "loss": 0.2378, + "step": 374 + }, + { + "epoch": 0.1773049645390071, + "grad_norm": 5.107661724090576, + "learning_rate": 1.9640130367314327e-05, + "loss": 0.3191, + "step": 375 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 6.810859203338623, + "learning_rate": 1.963572833110888e-05, + "loss": 0.3077, + "step": 376 + }, + { + "epoch": 0.17825059101654847, + "grad_norm": 4.793877124786377, + "learning_rate": 1.9631300034184155e-05, + "loss": 0.2426, + "step": 377 + }, + { + "epoch": 0.17872340425531916, + "grad_norm": 5.632551193237305, + "learning_rate": 1.96268454886088e-05, + "loss": 0.2767, + "step": 378 + }, + { + "epoch": 0.17919621749408984, + "grad_norm": 4.865971565246582, + "learning_rate": 1.962236470652301e-05, + "loss": 0.2907, + "step": 379 + }, + { + "epoch": 0.17966903073286053, + "grad_norm": 6.953917503356934, + "learning_rate": 1.9617857700138477e-05, + "loss": 0.2861, + "step": 380 + }, + { + "epoch": 0.17966903073286053, + "eval_accuracy": 0.8458980044345898, + "eval_f1": 0.6774941995359629, + "eval_loss": 0.3308302164077759, + "eval_precision": 0.8202247191011236, + "eval_recall": 0.5770750988142292, + "eval_runtime": 49.0662, + "eval_samples_per_second": 5.625, + "eval_steps_per_second": 0.183, + "step": 380 + }, + { + "epoch": 0.18014184397163122, + "grad_norm": 8.24282455444336, + "learning_rate": 1.9613324481738364e-05, + "loss": 0.3452, + "step": 381 + }, + { + "epoch": 0.1806146572104019, + "grad_norm": 6.349436283111572, + "learning_rate": 1.9608765063677272e-05, + "loss": 0.3436, + "step": 382 + }, + { + "epoch": 0.1810874704491726, + "grad_norm": 4.89585018157959, + "learning_rate": 1.9604179458381204e-05, + "loss": 0.2596, + "step": 383 + }, + { + "epoch": 0.18156028368794327, + "grad_norm": 5.193378925323486, + "learning_rate": 1.9599567678347536e-05, + "loss": 0.1857, + "step": 384 + }, + { + "epoch": 0.18203309692671396, + "grad_norm": 4.266732692718506, + "learning_rate": 1.9594929736144978e-05, + "loss": 0.1993, + "step": 385 + }, + { + "epoch": 0.18250591016548465, + "grad_norm": 4.616336822509766, + "learning_rate": 1.959026564441353e-05, + "loss": 0.2655, + "step": 386 + }, + { + "epoch": 0.1829787234042553, + "grad_norm": 5.701202392578125, + "learning_rate": 1.958557541586448e-05, + "loss": 0.2377, + "step": 387 + }, + { + "epoch": 0.183451536643026, + "grad_norm": 4.910188674926758, + "learning_rate": 1.9580859063280326e-05, + "loss": 0.2346, + "step": 388 + }, + { + "epoch": 0.18392434988179668, + "grad_norm": 5.084827899932861, + "learning_rate": 1.957611659951478e-05, + "loss": 0.2473, + "step": 389 + }, + { + "epoch": 0.18439716312056736, + "grad_norm": 5.31158971786499, + "learning_rate": 1.9571348037492705e-05, + "loss": 0.2524, + "step": 390 + }, + { + "epoch": 0.18486997635933805, + "grad_norm": 5.35557746887207, + "learning_rate": 1.9566553390210103e-05, + "loss": 0.22, + "step": 391 + }, + { + "epoch": 0.18534278959810874, + "grad_norm": 7.644250392913818, + "learning_rate": 1.9561732670734048e-05, + "loss": 0.3009, + "step": 392 + }, + { + "epoch": 0.18581560283687942, + "grad_norm": 9.144373893737793, + "learning_rate": 1.9556885892202685e-05, + "loss": 0.4346, + "step": 393 + }, + { + "epoch": 0.1862884160756501, + "grad_norm": 6.692631721496582, + "learning_rate": 1.9552013067825185e-05, + "loss": 0.3075, + "step": 394 + }, + { + "epoch": 0.1867612293144208, + "grad_norm": 7.828726768493652, + "learning_rate": 1.9547114210881683e-05, + "loss": 0.3187, + "step": 395 + }, + { + "epoch": 0.18723404255319148, + "grad_norm": 7.344780445098877, + "learning_rate": 1.954218933472327e-05, + "loss": 0.3382, + "step": 396 + }, + { + "epoch": 0.18770685579196217, + "grad_norm": 5.836700439453125, + "learning_rate": 1.9537238452771962e-05, + "loss": 0.2509, + "step": 397 + }, + { + "epoch": 0.18817966903073285, + "grad_norm": 6.650071144104004, + "learning_rate": 1.953226157852063e-05, + "loss": 0.214, + "step": 398 + }, + { + "epoch": 0.18865248226950354, + "grad_norm": 6.298871040344238, + "learning_rate": 1.952725872553299e-05, + "loss": 0.2362, + "step": 399 + }, + { + "epoch": 0.18912529550827423, + "grad_norm": 5.426384449005127, + "learning_rate": 1.952222990744357e-05, + "loss": 0.2808, + "step": 400 + }, + { + "epoch": 0.18912529550827423, + "eval_accuracy": 0.8337028824833703, + "eval_f1": 0.609375, + "eval_loss": 0.35840529203414917, + "eval_precision": 0.8931297709923665, + "eval_recall": 0.4624505928853755, + "eval_runtime": 49.5299, + "eval_samples_per_second": 5.572, + "eval_steps_per_second": 0.182, + "step": 400 + }, + { + "epoch": 0.1895981087470449, + "grad_norm": 4.219785213470459, + "learning_rate": 1.9517175137957647e-05, + "loss": 0.2007, + "step": 401 + }, + { + "epoch": 0.1900709219858156, + "grad_norm": 4.726499080657959, + "learning_rate": 1.9512094430851226e-05, + "loss": 0.2333, + "step": 402 + }, + { + "epoch": 0.19054373522458629, + "grad_norm": 7.44296407699585, + "learning_rate": 1.9506987799971013e-05, + "loss": 0.2563, + "step": 403 + }, + { + "epoch": 0.19101654846335697, + "grad_norm": 7.742011547088623, + "learning_rate": 1.9501855259234353e-05, + "loss": 0.313, + "step": 404 + }, + { + "epoch": 0.19148936170212766, + "grad_norm": 6.5203657150268555, + "learning_rate": 1.9496696822629208e-05, + "loss": 0.3372, + "step": 405 + }, + { + "epoch": 0.19196217494089834, + "grad_norm": 6.323021411895752, + "learning_rate": 1.9491512504214123e-05, + "loss": 0.2561, + "step": 406 + }, + { + "epoch": 0.19243498817966903, + "grad_norm": 5.560845851898193, + "learning_rate": 1.9486302318118164e-05, + "loss": 0.2822, + "step": 407 + }, + { + "epoch": 0.19290780141843972, + "grad_norm": 5.433687686920166, + "learning_rate": 1.9481066278540912e-05, + "loss": 0.1501, + "step": 408 + }, + { + "epoch": 0.1933806146572104, + "grad_norm": 5.543313026428223, + "learning_rate": 1.9475804399752397e-05, + "loss": 0.2399, + "step": 409 + }, + { + "epoch": 0.1938534278959811, + "grad_norm": 3.8356525897979736, + "learning_rate": 1.9470516696093075e-05, + "loss": 0.1458, + "step": 410 + }, + { + "epoch": 0.19432624113475178, + "grad_norm": 5.613055229187012, + "learning_rate": 1.946520318197378e-05, + "loss": 0.2665, + "step": 411 + }, + { + "epoch": 0.19479905437352246, + "grad_norm": 7.673485279083252, + "learning_rate": 1.9459863871875694e-05, + "loss": 0.3718, + "step": 412 + }, + { + "epoch": 0.19527186761229315, + "grad_norm": 5.367708683013916, + "learning_rate": 1.945449878035029e-05, + "loss": 0.1897, + "step": 413 + }, + { + "epoch": 0.19574468085106383, + "grad_norm": 7.302910804748535, + "learning_rate": 1.9449107922019326e-05, + "loss": 0.2457, + "step": 414 + }, + { + "epoch": 0.19621749408983452, + "grad_norm": 6.045085906982422, + "learning_rate": 1.944369131157476e-05, + "loss": 0.2974, + "step": 415 + }, + { + "epoch": 0.1966903073286052, + "grad_norm": 7.004913806915283, + "learning_rate": 1.9438248963778754e-05, + "loss": 0.2723, + "step": 416 + }, + { + "epoch": 0.1971631205673759, + "grad_norm": 6.820647716522217, + "learning_rate": 1.9432780893463594e-05, + "loss": 0.3367, + "step": 417 + }, + { + "epoch": 0.19763593380614658, + "grad_norm": 6.083995819091797, + "learning_rate": 1.942728711553168e-05, + "loss": 0.265, + "step": 418 + }, + { + "epoch": 0.19810874704491727, + "grad_norm": 4.812845706939697, + "learning_rate": 1.942176764495547e-05, + "loss": 0.2567, + "step": 419 + }, + { + "epoch": 0.19858156028368795, + "grad_norm": 5.881125450134277, + "learning_rate": 1.9416222496777453e-05, + "loss": 0.2716, + "step": 420 + }, + { + "epoch": 0.19858156028368795, + "eval_accuracy": 0.852549889135255, + "eval_f1": 0.6825775656324582, + "eval_loss": 0.33116355538368225, + "eval_precision": 0.8614457831325302, + "eval_recall": 0.5652173913043478, + "eval_runtime": 47.258, + "eval_samples_per_second": 5.84, + "eval_steps_per_second": 0.19, + "step": 420 + }, + { + "epoch": 0.19905437352245864, + "grad_norm": 3.609654426574707, + "learning_rate": 1.941065168611009e-05, + "loss": 0.2053, + "step": 421 + }, + { + "epoch": 0.19952718676122932, + "grad_norm": 5.121413230895996, + "learning_rate": 1.9405055228135777e-05, + "loss": 0.2572, + "step": 422 + }, + { + "epoch": 0.2, + "grad_norm": 6.144900321960449, + "learning_rate": 1.9399433138106814e-05, + "loss": 0.2689, + "step": 423 + }, + { + "epoch": 0.2004728132387707, + "grad_norm": 5.300688743591309, + "learning_rate": 1.939378543134536e-05, + "loss": 0.3146, + "step": 424 + }, + { + "epoch": 0.20094562647754138, + "grad_norm": 8.221715927124023, + "learning_rate": 1.9388112123243386e-05, + "loss": 0.2843, + "step": 425 + }, + { + "epoch": 0.20141843971631207, + "grad_norm": 3.9929358959198, + "learning_rate": 1.938241322926263e-05, + "loss": 0.2603, + "step": 426 + }, + { + "epoch": 0.20189125295508276, + "grad_norm": 4.981365203857422, + "learning_rate": 1.937668876493457e-05, + "loss": 0.3303, + "step": 427 + }, + { + "epoch": 0.20236406619385341, + "grad_norm": 5.623127460479736, + "learning_rate": 1.9370938745860362e-05, + "loss": 0.2564, + "step": 428 + }, + { + "epoch": 0.2028368794326241, + "grad_norm": 5.664045810699463, + "learning_rate": 1.9365163187710817e-05, + "loss": 0.2749, + "step": 429 + }, + { + "epoch": 0.2033096926713948, + "grad_norm": 5.24921178817749, + "learning_rate": 1.935936210622634e-05, + "loss": 0.2835, + "step": 430 + }, + { + "epoch": 0.20378250591016547, + "grad_norm": 8.841286659240723, + "learning_rate": 1.9353535517216908e-05, + "loss": 0.3114, + "step": 431 + }, + { + "epoch": 0.20425531914893616, + "grad_norm": 4.470973491668701, + "learning_rate": 1.9347683436562e-05, + "loss": 0.2417, + "step": 432 + }, + { + "epoch": 0.20472813238770685, + "grad_norm": 7.2563276290893555, + "learning_rate": 1.934180588021058e-05, + "loss": 0.2727, + "step": 433 + }, + { + "epoch": 0.20520094562647753, + "grad_norm": 6.107417106628418, + "learning_rate": 1.933590286418104e-05, + "loss": 0.2461, + "step": 434 + }, + { + "epoch": 0.20567375886524822, + "grad_norm": 10.675925254821777, + "learning_rate": 1.932997440456115e-05, + "loss": 0.3362, + "step": 435 + }, + { + "epoch": 0.2061465721040189, + "grad_norm": 6.831496715545654, + "learning_rate": 1.932402051750803e-05, + "loss": 0.326, + "step": 436 + }, + { + "epoch": 0.2066193853427896, + "grad_norm": 6.8831095695495605, + "learning_rate": 1.9318041219248108e-05, + "loss": 0.3411, + "step": 437 + }, + { + "epoch": 0.20709219858156028, + "grad_norm": 4.773165225982666, + "learning_rate": 1.9312036526077055e-05, + "loss": 0.2587, + "step": 438 + }, + { + "epoch": 0.20756501182033096, + "grad_norm": 7.510995864868164, + "learning_rate": 1.930600645435974e-05, + "loss": 0.2906, + "step": 439 + }, + { + "epoch": 0.20803782505910165, + "grad_norm": 6.88019323348999, + "learning_rate": 1.9299951020530226e-05, + "loss": 0.3696, + "step": 440 + }, + { + "epoch": 0.20803782505910165, + "eval_accuracy": 0.8547671840354767, + "eval_f1": 0.7120879120879121, + "eval_loss": 0.31960517168045044, + "eval_precision": 0.801980198019802, + "eval_recall": 0.6403162055335968, + "eval_runtime": 47.3456, + "eval_samples_per_second": 5.829, + "eval_steps_per_second": 0.19, + "step": 440 + }, + { + "epoch": 0.20851063829787234, + "grad_norm": 4.385389804840088, + "learning_rate": 1.929387024109167e-05, + "loss": 0.2451, + "step": 441 + }, + { + "epoch": 0.20898345153664302, + "grad_norm": 5.8849077224731445, + "learning_rate": 1.9287764132616323e-05, + "loss": 0.2734, + "step": 442 + }, + { + "epoch": 0.2094562647754137, + "grad_norm": 8.523555755615234, + "learning_rate": 1.928163271174546e-05, + "loss": 0.3602, + "step": 443 + }, + { + "epoch": 0.2099290780141844, + "grad_norm": 8.392167091369629, + "learning_rate": 1.927547599518934e-05, + "loss": 0.286, + "step": 444 + }, + { + "epoch": 0.21040189125295508, + "grad_norm": 6.212944030761719, + "learning_rate": 1.9269293999727156e-05, + "loss": 0.2083, + "step": 445 + }, + { + "epoch": 0.21087470449172577, + "grad_norm": 6.23652982711792, + "learning_rate": 1.926308674220701e-05, + "loss": 0.259, + "step": 446 + }, + { + "epoch": 0.21134751773049645, + "grad_norm": 6.646223545074463, + "learning_rate": 1.9256854239545833e-05, + "loss": 0.325, + "step": 447 + }, + { + "epoch": 0.21182033096926714, + "grad_norm": 10.316300392150879, + "learning_rate": 1.925059650872938e-05, + "loss": 0.4394, + "step": 448 + }, + { + "epoch": 0.21229314420803783, + "grad_norm": 4.443530559539795, + "learning_rate": 1.9244313566812138e-05, + "loss": 0.2843, + "step": 449 + }, + { + "epoch": 0.2127659574468085, + "grad_norm": 5.295090198516846, + "learning_rate": 1.923800543091732e-05, + "loss": 0.2672, + "step": 450 + }, + { + "epoch": 0.2132387706855792, + "grad_norm": 4.4014482498168945, + "learning_rate": 1.9231672118236798e-05, + "loss": 0.2578, + "step": 451 + }, + { + "epoch": 0.21371158392434988, + "grad_norm": 5.532175540924072, + "learning_rate": 1.922531364603105e-05, + "loss": 0.2718, + "step": 452 + }, + { + "epoch": 0.21418439716312057, + "grad_norm": 5.192704200744629, + "learning_rate": 1.9218930031629134e-05, + "loss": 0.2279, + "step": 453 + }, + { + "epoch": 0.21465721040189126, + "grad_norm": 4.8312458992004395, + "learning_rate": 1.921252129242863e-05, + "loss": 0.3158, + "step": 454 + }, + { + "epoch": 0.21513002364066194, + "grad_norm": 4.629521369934082, + "learning_rate": 1.9206087445895572e-05, + "loss": 0.187, + "step": 455 + }, + { + "epoch": 0.21560283687943263, + "grad_norm": 5.664729118347168, + "learning_rate": 1.9199628509564455e-05, + "loss": 0.2869, + "step": 456 + }, + { + "epoch": 0.21607565011820332, + "grad_norm": 6.123085021972656, + "learning_rate": 1.9193144501038116e-05, + "loss": 0.2455, + "step": 457 + }, + { + "epoch": 0.216548463356974, + "grad_norm": 5.794577121734619, + "learning_rate": 1.9186635437987746e-05, + "loss": 0.2984, + "step": 458 + }, + { + "epoch": 0.2170212765957447, + "grad_norm": 5.055876731872559, + "learning_rate": 1.9180101338152807e-05, + "loss": 0.1974, + "step": 459 + }, + { + "epoch": 0.21749408983451538, + "grad_norm": 5.301477432250977, + "learning_rate": 1.9173542219341005e-05, + "loss": 0.1911, + "step": 460 + }, + { + "epoch": 0.21749408983451538, + "eval_accuracy": 0.8425720620842572, + "eval_f1": 0.6467661691542289, + "eval_loss": 0.34363728761672974, + "eval_precision": 0.87248322147651, + "eval_recall": 0.5138339920948617, + "eval_runtime": 47.279, + "eval_samples_per_second": 5.838, + "eval_steps_per_second": 0.19, + "step": 460 + }, + { + "epoch": 0.21796690307328606, + "grad_norm": 3.5169882774353027, + "learning_rate": 1.9166958099428227e-05, + "loss": 0.2012, + "step": 461 + }, + { + "epoch": 0.21843971631205675, + "grad_norm": 7.913601875305176, + "learning_rate": 1.9160348996358484e-05, + "loss": 0.3436, + "step": 462 + }, + { + "epoch": 0.21891252955082743, + "grad_norm": 6.095242500305176, + "learning_rate": 1.9153714928143898e-05, + "loss": 0.2419, + "step": 463 + }, + { + "epoch": 0.21938534278959812, + "grad_norm": 6.3486104011535645, + "learning_rate": 1.914705591286461e-05, + "loss": 0.2504, + "step": 464 + }, + { + "epoch": 0.2198581560283688, + "grad_norm": 4.352773189544678, + "learning_rate": 1.9140371968668767e-05, + "loss": 0.211, + "step": 465 + }, + { + "epoch": 0.2203309692671395, + "grad_norm": 7.6094889640808105, + "learning_rate": 1.9133663113772437e-05, + "loss": 0.2995, + "step": 466 + }, + { + "epoch": 0.22080378250591018, + "grad_norm": 5.553167819976807, + "learning_rate": 1.9126929366459596e-05, + "loss": 0.1836, + "step": 467 + }, + { + "epoch": 0.22127659574468084, + "grad_norm": 6.153863906860352, + "learning_rate": 1.912017074508205e-05, + "loss": 0.2946, + "step": 468 + }, + { + "epoch": 0.22174940898345152, + "grad_norm": 5.882277011871338, + "learning_rate": 1.9113387268059402e-05, + "loss": 0.1988, + "step": 469 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 8.288911819458008, + "learning_rate": 1.910657895387899e-05, + "loss": 0.2778, + "step": 470 + }, + { + "epoch": 0.2226950354609929, + "grad_norm": 8.619799613952637, + "learning_rate": 1.9099745821095842e-05, + "loss": 0.2995, + "step": 471 + }, + { + "epoch": 0.22316784869976358, + "grad_norm": 4.235837936401367, + "learning_rate": 1.909288788833263e-05, + "loss": 0.1523, + "step": 472 + }, + { + "epoch": 0.22364066193853427, + "grad_norm": 4.533273220062256, + "learning_rate": 1.908600517427961e-05, + "loss": 0.188, + "step": 473 + }, + { + "epoch": 0.22411347517730495, + "grad_norm": 6.656558513641357, + "learning_rate": 1.9079097697694578e-05, + "loss": 0.2706, + "step": 474 + }, + { + "epoch": 0.22458628841607564, + "grad_norm": 7.253176212310791, + "learning_rate": 1.9072165477402813e-05, + "loss": 0.2533, + "step": 475 + }, + { + "epoch": 0.22505910165484633, + "grad_norm": 7.984339237213135, + "learning_rate": 1.9065208532297043e-05, + "loss": 0.2768, + "step": 476 + }, + { + "epoch": 0.225531914893617, + "grad_norm": 5.5995259284973145, + "learning_rate": 1.9058226881337356e-05, + "loss": 0.2397, + "step": 477 + }, + { + "epoch": 0.2260047281323877, + "grad_norm": 8.392828941345215, + "learning_rate": 1.9051220543551193e-05, + "loss": 0.235, + "step": 478 + }, + { + "epoch": 0.2264775413711584, + "grad_norm": 5.6677422523498535, + "learning_rate": 1.9044189538033264e-05, + "loss": 0.2468, + "step": 479 + }, + { + "epoch": 0.22695035460992907, + "grad_norm": 5.317750930786133, + "learning_rate": 1.903713388394551e-05, + "loss": 0.2548, + "step": 480 + }, + { + "epoch": 0.22695035460992907, + "eval_accuracy": 0.852549889135255, + "eval_f1": 0.6795180722891566, + "eval_loss": 0.33114132285118103, + "eval_precision": 0.8703703703703703, + "eval_recall": 0.5573122529644269, + "eval_runtime": 48.0304, + "eval_samples_per_second": 5.746, + "eval_steps_per_second": 0.187, + "step": 480 + }, + { + "epoch": 0.22742316784869976, + "grad_norm": 6.943238258361816, + "learning_rate": 1.9030053600517053e-05, + "loss": 0.2657, + "step": 481 + }, + { + "epoch": 0.22789598108747045, + "grad_norm": 5.950002193450928, + "learning_rate": 1.902294870704413e-05, + "loss": 0.2934, + "step": 482 + }, + { + "epoch": 0.22836879432624113, + "grad_norm": 4.295022964477539, + "learning_rate": 1.901581922289005e-05, + "loss": 0.2139, + "step": 483 + }, + { + "epoch": 0.22884160756501182, + "grad_norm": 5.326821804046631, + "learning_rate": 1.9008665167485154e-05, + "loss": 0.2407, + "step": 484 + }, + { + "epoch": 0.2293144208037825, + "grad_norm": 6.2555036544799805, + "learning_rate": 1.9001486560326724e-05, + "loss": 0.2723, + "step": 485 + }, + { + "epoch": 0.2297872340425532, + "grad_norm": 6.824589729309082, + "learning_rate": 1.8994283420978975e-05, + "loss": 0.3014, + "step": 486 + }, + { + "epoch": 0.23026004728132388, + "grad_norm": 5.3086628913879395, + "learning_rate": 1.8987055769072973e-05, + "loss": 0.245, + "step": 487 + }, + { + "epoch": 0.23073286052009456, + "grad_norm": 5.174909591674805, + "learning_rate": 1.8979803624306585e-05, + "loss": 0.3507, + "step": 488 + }, + { + "epoch": 0.23120567375886525, + "grad_norm": 4.394033908843994, + "learning_rate": 1.897252700644444e-05, + "loss": 0.2182, + "step": 489 + }, + { + "epoch": 0.23167848699763594, + "grad_norm": 5.750499248504639, + "learning_rate": 1.8965225935317854e-05, + "loss": 0.2635, + "step": 490 + }, + { + "epoch": 0.23215130023640662, + "grad_norm": 5.786622047424316, + "learning_rate": 1.8957900430824793e-05, + "loss": 0.2483, + "step": 491 + }, + { + "epoch": 0.2326241134751773, + "grad_norm": 8.868030548095703, + "learning_rate": 1.895055051292981e-05, + "loss": 0.3858, + "step": 492 + }, + { + "epoch": 0.233096926713948, + "grad_norm": 5.2425408363342285, + "learning_rate": 1.8943176201664e-05, + "loss": 0.2107, + "step": 493 + }, + { + "epoch": 0.23356973995271868, + "grad_norm": 4.864797115325928, + "learning_rate": 1.8935777517124923e-05, + "loss": 0.3114, + "step": 494 + }, + { + "epoch": 0.23404255319148937, + "grad_norm": 6.035416603088379, + "learning_rate": 1.8928354479476577e-05, + "loss": 0.2315, + "step": 495 + }, + { + "epoch": 0.23451536643026005, + "grad_norm": 5.7510857582092285, + "learning_rate": 1.8920907108949335e-05, + "loss": 0.2423, + "step": 496 + }, + { + "epoch": 0.23498817966903074, + "grad_norm": 4.192838668823242, + "learning_rate": 1.8913435425839865e-05, + "loss": 0.2874, + "step": 497 + }, + { + "epoch": 0.23546099290780143, + "grad_norm": 5.289735317230225, + "learning_rate": 1.8905939450511117e-05, + "loss": 0.2984, + "step": 498 + }, + { + "epoch": 0.2359338061465721, + "grad_norm": 7.9628400802612305, + "learning_rate": 1.889841920339224e-05, + "loss": 0.3416, + "step": 499 + }, + { + "epoch": 0.2364066193853428, + "grad_norm": 4.809371471405029, + "learning_rate": 1.889087470497852e-05, + "loss": 0.2501, + "step": 500 + }, + { + "epoch": 0.2364066193853428, + "eval_accuracy": 0.8481152993348116, + "eval_f1": 0.6666666666666666, + "eval_loss": 0.3237150013446808, + "eval_precision": 0.8670886075949367, + "eval_recall": 0.541501976284585, + "eval_runtime": 47.4742, + "eval_samples_per_second": 5.814, + "eval_steps_per_second": 0.19, + "step": 500 + }, + { + "epoch": 0.23687943262411348, + "grad_norm": 4.674736499786377, + "learning_rate": 1.8883305975831357e-05, + "loss": 0.1875, + "step": 501 + }, + { + "epoch": 0.23735224586288417, + "grad_norm": 3.8804233074188232, + "learning_rate": 1.8875713036578168e-05, + "loss": 0.184, + "step": 502 + }, + { + "epoch": 0.23782505910165486, + "grad_norm": 6.960356712341309, + "learning_rate": 1.886809590791236e-05, + "loss": 0.2407, + "step": 503 + }, + { + "epoch": 0.23829787234042554, + "grad_norm": 6.6463823318481445, + "learning_rate": 1.886045461059327e-05, + "loss": 0.2633, + "step": 504 + }, + { + "epoch": 0.23877068557919623, + "grad_norm": 4.191521167755127, + "learning_rate": 1.8852789165446094e-05, + "loss": 0.218, + "step": 505 + }, + { + "epoch": 0.23924349881796692, + "grad_norm": 5.596932411193848, + "learning_rate": 1.8845099593361844e-05, + "loss": 0.2609, + "step": 506 + }, + { + "epoch": 0.2397163120567376, + "grad_norm": 4.571609973907471, + "learning_rate": 1.883738591529728e-05, + "loss": 0.1815, + "step": 507 + }, + { + "epoch": 0.2401891252955083, + "grad_norm": 5.630218505859375, + "learning_rate": 1.8829648152274872e-05, + "loss": 0.2727, + "step": 508 + }, + { + "epoch": 0.24066193853427895, + "grad_norm": 8.154533386230469, + "learning_rate": 1.8821886325382718e-05, + "loss": 0.3277, + "step": 509 + }, + { + "epoch": 0.24113475177304963, + "grad_norm": 5.654308795928955, + "learning_rate": 1.8814100455774504e-05, + "loss": 0.2938, + "step": 510 + }, + { + "epoch": 0.24160756501182032, + "grad_norm": 6.1714677810668945, + "learning_rate": 1.8806290564669435e-05, + "loss": 0.2314, + "step": 511 + }, + { + "epoch": 0.242080378250591, + "grad_norm": 5.192854404449463, + "learning_rate": 1.879845667335219e-05, + "loss": 0.2489, + "step": 512 + }, + { + "epoch": 0.2425531914893617, + "grad_norm": 7.861010551452637, + "learning_rate": 1.8790598803172857e-05, + "loss": 0.2815, + "step": 513 + }, + { + "epoch": 0.24302600472813238, + "grad_norm": 6.233393669128418, + "learning_rate": 1.878271697554687e-05, + "loss": 0.2584, + "step": 514 + }, + { + "epoch": 0.24349881796690306, + "grad_norm": 6.324631690979004, + "learning_rate": 1.8774811211954954e-05, + "loss": 0.3333, + "step": 515 + }, + { + "epoch": 0.24397163120567375, + "grad_norm": 7.769336700439453, + "learning_rate": 1.8766881533943074e-05, + "loss": 0.2968, + "step": 516 + }, + { + "epoch": 0.24444444444444444, + "grad_norm": 6.459347724914551, + "learning_rate": 1.875892796312237e-05, + "loss": 0.2193, + "step": 517 + }, + { + "epoch": 0.24491725768321512, + "grad_norm": 4.615235805511475, + "learning_rate": 1.875095052116909e-05, + "loss": 0.2396, + "step": 518 + }, + { + "epoch": 0.2453900709219858, + "grad_norm": 4.699162483215332, + "learning_rate": 1.874294922982455e-05, + "loss": 0.2726, + "step": 519 + }, + { + "epoch": 0.2458628841607565, + "grad_norm": 6.707888126373291, + "learning_rate": 1.8734924110895056e-05, + "loss": 0.2936, + "step": 520 + }, + { + "epoch": 0.2458628841607565, + "eval_accuracy": 0.835920177383592, + "eval_f1": 0.6205128205128205, + "eval_loss": 0.3496428430080414, + "eval_precision": 0.8832116788321168, + "eval_recall": 0.4782608695652174, + "eval_runtime": 49.5579, + "eval_samples_per_second": 5.569, + "eval_steps_per_second": 0.182, + "step": 520 + }, + { + "epoch": 0.24633569739952718, + "grad_norm": 6.290736198425293, + "learning_rate": 1.8726875186251856e-05, + "loss": 0.2605, + "step": 521 + }, + { + "epoch": 0.24680851063829787, + "grad_norm": 5.97813606262207, + "learning_rate": 1.8718802477831072e-05, + "loss": 0.257, + "step": 522 + }, + { + "epoch": 0.24728132387706855, + "grad_norm": 5.485353469848633, + "learning_rate": 1.8710706007633654e-05, + "loss": 0.2114, + "step": 523 + }, + { + "epoch": 0.24775413711583924, + "grad_norm": 4.747553825378418, + "learning_rate": 1.8702585797725308e-05, + "loss": 0.2579, + "step": 524 + }, + { + "epoch": 0.24822695035460993, + "grad_norm": 5.752557754516602, + "learning_rate": 1.869444187023643e-05, + "loss": 0.2706, + "step": 525 + }, + { + "epoch": 0.2486997635933806, + "grad_norm": 5.672857761383057, + "learning_rate": 1.8686274247362067e-05, + "loss": 0.23, + "step": 526 + }, + { + "epoch": 0.2491725768321513, + "grad_norm": 5.739321708679199, + "learning_rate": 1.8678082951361837e-05, + "loss": 0.2274, + "step": 527 + }, + { + "epoch": 0.24964539007092199, + "grad_norm": 8.147102355957031, + "learning_rate": 1.8669868004559878e-05, + "loss": 0.2682, + "step": 528 + }, + { + "epoch": 0.25011820330969264, + "grad_norm": 4.3418169021606445, + "learning_rate": 1.8661629429344782e-05, + "loss": 0.2552, + "step": 529 + }, + { + "epoch": 0.25059101654846333, + "grad_norm": 5.724131107330322, + "learning_rate": 1.8653367248169547e-05, + "loss": 0.2912, + "step": 530 + }, + { + "epoch": 0.251063829787234, + "grad_norm": 5.735341548919678, + "learning_rate": 1.864508148355149e-05, + "loss": 0.2843, + "step": 531 + }, + { + "epoch": 0.2515366430260047, + "grad_norm": 4.442595481872559, + "learning_rate": 1.863677215807221e-05, + "loss": 0.2334, + "step": 532 + }, + { + "epoch": 0.2520094562647754, + "grad_norm": 6.313594341278076, + "learning_rate": 1.862843929437751e-05, + "loss": 0.3263, + "step": 533 + }, + { + "epoch": 0.2524822695035461, + "grad_norm": 9.929341316223145, + "learning_rate": 1.8620082915177363e-05, + "loss": 0.2992, + "step": 534 + }, + { + "epoch": 0.25295508274231676, + "grad_norm": 4.474004745483398, + "learning_rate": 1.8611703043245807e-05, + "loss": 0.2582, + "step": 535 + }, + { + "epoch": 0.25342789598108745, + "grad_norm": 6.404626369476318, + "learning_rate": 1.8603299701420915e-05, + "loss": 0.2724, + "step": 536 + }, + { + "epoch": 0.25390070921985813, + "grad_norm": 5.970371723175049, + "learning_rate": 1.8594872912604723e-05, + "loss": 0.3189, + "step": 537 + }, + { + "epoch": 0.2543735224586288, + "grad_norm": 6.132597923278809, + "learning_rate": 1.858642269976317e-05, + "loss": 0.2448, + "step": 538 + }, + { + "epoch": 0.2548463356973995, + "grad_norm": 4.322042942047119, + "learning_rate": 1.8577949085926032e-05, + "loss": 0.1853, + "step": 539 + }, + { + "epoch": 0.2553191489361702, + "grad_norm": 4.78247594833374, + "learning_rate": 1.8569452094186863e-05, + "loss": 0.2012, + "step": 540 + }, + { + "epoch": 0.2553191489361702, + "eval_accuracy": 0.8403547671840355, + "eval_f1": 0.6470588235294118, + "eval_loss": 0.3362201750278473, + "eval_precision": 0.8516129032258064, + "eval_recall": 0.5217391304347826, + "eval_runtime": 48.1646, + "eval_samples_per_second": 5.73, + "eval_steps_per_second": 0.187, + "step": 540 + }, + { + "epoch": 0.2557919621749409, + "grad_norm": 5.21437406539917, + "learning_rate": 1.8560931747702924e-05, + "loss": 0.2784, + "step": 541 + }, + { + "epoch": 0.25626477541371157, + "grad_norm": 6.169255256652832, + "learning_rate": 1.855238806969513e-05, + "loss": 0.3128, + "step": 542 + }, + { + "epoch": 0.25673758865248225, + "grad_norm": 5.7340874671936035, + "learning_rate": 1.854382108344799e-05, + "loss": 0.2696, + "step": 543 + }, + { + "epoch": 0.25721040189125294, + "grad_norm": 7.384251594543457, + "learning_rate": 1.853523081230952e-05, + "loss": 0.2896, + "step": 544 + }, + { + "epoch": 0.2576832151300236, + "grad_norm": 7.609536170959473, + "learning_rate": 1.8526617279691207e-05, + "loss": 0.276, + "step": 545 + }, + { + "epoch": 0.2581560283687943, + "grad_norm": 5.231773853302002, + "learning_rate": 1.8517980509067926e-05, + "loss": 0.2765, + "step": 546 + }, + { + "epoch": 0.258628841607565, + "grad_norm": 4.057672023773193, + "learning_rate": 1.8509320523977895e-05, + "loss": 0.1932, + "step": 547 + }, + { + "epoch": 0.2591016548463357, + "grad_norm": 6.7243452072143555, + "learning_rate": 1.8500637348022594e-05, + "loss": 0.1775, + "step": 548 + }, + { + "epoch": 0.25957446808510637, + "grad_norm": 5.530993461608887, + "learning_rate": 1.84919310048667e-05, + "loss": 0.2299, + "step": 549 + }, + { + "epoch": 0.26004728132387706, + "grad_norm": 4.74931001663208, + "learning_rate": 1.8483201518238032e-05, + "loss": 0.1902, + "step": 550 + }, + { + "epoch": 0.26052009456264774, + "grad_norm": 6.366207122802734, + "learning_rate": 1.847444891192749e-05, + "loss": 0.3109, + "step": 551 + }, + { + "epoch": 0.26099290780141843, + "grad_norm": 8.735250473022461, + "learning_rate": 1.8465673209788975e-05, + "loss": 0.32, + "step": 552 + }, + { + "epoch": 0.2614657210401891, + "grad_norm": 5.171599864959717, + "learning_rate": 1.8456874435739337e-05, + "loss": 0.2245, + "step": 553 + }, + { + "epoch": 0.2619385342789598, + "grad_norm": 6.582614898681641, + "learning_rate": 1.8448052613758297e-05, + "loss": 0.2419, + "step": 554 + }, + { + "epoch": 0.2624113475177305, + "grad_norm": 5.810616970062256, + "learning_rate": 1.84392077678884e-05, + "loss": 0.2402, + "step": 555 + }, + { + "epoch": 0.2628841607565012, + "grad_norm": 6.7433271408081055, + "learning_rate": 1.843033992223494e-05, + "loss": 0.2887, + "step": 556 + }, + { + "epoch": 0.26335697399527186, + "grad_norm": 3.9062905311584473, + "learning_rate": 1.8421449100965884e-05, + "loss": 0.1842, + "step": 557 + }, + { + "epoch": 0.26382978723404255, + "grad_norm": 5.2100749015808105, + "learning_rate": 1.8412535328311813e-05, + "loss": 0.2191, + "step": 558 + }, + { + "epoch": 0.26430260047281323, + "grad_norm": 4.192863941192627, + "learning_rate": 1.8403598628565876e-05, + "loss": 0.1958, + "step": 559 + }, + { + "epoch": 0.2647754137115839, + "grad_norm": 8.32767105102539, + "learning_rate": 1.839463902608369e-05, + "loss": 0.3295, + "step": 560 + }, + { + "epoch": 0.2647754137115839, + "eval_accuracy": 0.8492239467849224, + "eval_f1": 0.6777251184834123, + "eval_loss": 0.3414818048477173, + "eval_precision": 0.8461538461538461, + "eval_recall": 0.5652173913043478, + "eval_runtime": 47.9942, + "eval_samples_per_second": 5.751, + "eval_steps_per_second": 0.188, + "step": 560 + }, + { + "epoch": 0.2652482269503546, + "grad_norm": 7.091763019561768, + "learning_rate": 1.8385656545283296e-05, + "loss": 0.3177, + "step": 561 + }, + { + "epoch": 0.2657210401891253, + "grad_norm": 4.874910354614258, + "learning_rate": 1.8376651210645085e-05, + "loss": 0.255, + "step": 562 + }, + { + "epoch": 0.266193853427896, + "grad_norm": 5.0358710289001465, + "learning_rate": 1.836762304671174e-05, + "loss": 0.2031, + "step": 563 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 6.5243330001831055, + "learning_rate": 1.8358572078088144e-05, + "loss": 0.2583, + "step": 564 + }, + { + "epoch": 0.26713947990543735, + "grad_norm": 5.2152862548828125, + "learning_rate": 1.8349498329441355e-05, + "loss": 0.191, + "step": 565 + }, + { + "epoch": 0.26761229314420804, + "grad_norm": 4.507115840911865, + "learning_rate": 1.8340401825500496e-05, + "loss": 0.1649, + "step": 566 + }, + { + "epoch": 0.2680851063829787, + "grad_norm": 4.836716651916504, + "learning_rate": 1.833128259105671e-05, + "loss": 0.2041, + "step": 567 + }, + { + "epoch": 0.2685579196217494, + "grad_norm": 5.212822437286377, + "learning_rate": 1.832214065096309e-05, + "loss": 0.2043, + "step": 568 + }, + { + "epoch": 0.2690307328605201, + "grad_norm": 9.472026824951172, + "learning_rate": 1.8312976030134613e-05, + "loss": 0.377, + "step": 569 + }, + { + "epoch": 0.2695035460992908, + "grad_norm": 7.958829879760742, + "learning_rate": 1.8303788753548065e-05, + "loss": 0.2281, + "step": 570 + }, + { + "epoch": 0.26997635933806147, + "grad_norm": 6.508796215057373, + "learning_rate": 1.829457884624198e-05, + "loss": 0.1819, + "step": 571 + }, + { + "epoch": 0.27044917257683215, + "grad_norm": 7.919535160064697, + "learning_rate": 1.8285346333316564e-05, + "loss": 0.2852, + "step": 572 + }, + { + "epoch": 0.27092198581560284, + "grad_norm": 5.031895637512207, + "learning_rate": 1.8276091239933634e-05, + "loss": 0.1962, + "step": 573 + }, + { + "epoch": 0.2713947990543735, + "grad_norm": 5.484118461608887, + "learning_rate": 1.8266813591316548e-05, + "loss": 0.2812, + "step": 574 + }, + { + "epoch": 0.2718676122931442, + "grad_norm": 6.0553460121154785, + "learning_rate": 1.825751341275013e-05, + "loss": 0.2536, + "step": 575 + }, + { + "epoch": 0.2723404255319149, + "grad_norm": 6.331978797912598, + "learning_rate": 1.8248190729580613e-05, + "loss": 0.2043, + "step": 576 + }, + { + "epoch": 0.2728132387706856, + "grad_norm": 5.793010711669922, + "learning_rate": 1.8238845567215554e-05, + "loss": 0.2921, + "step": 577 + }, + { + "epoch": 0.27328605200945627, + "grad_norm": 8.16348648071289, + "learning_rate": 1.8229477951123785e-05, + "loss": 0.3131, + "step": 578 + }, + { + "epoch": 0.27375886524822696, + "grad_norm": 4.672780513763428, + "learning_rate": 1.822008790683532e-05, + "loss": 0.2961, + "step": 579 + }, + { + "epoch": 0.27423167848699764, + "grad_norm": 5.705632209777832, + "learning_rate": 1.8210675459941306e-05, + "loss": 0.2859, + "step": 580 + }, + { + "epoch": 0.27423167848699764, + "eval_accuracy": 0.843680709534368, + "eval_f1": 0.6501240694789082, + "eval_loss": 0.336950421333313, + "eval_precision": 0.8733333333333333, + "eval_recall": 0.5177865612648221, + "eval_runtime": 48.0605, + "eval_samples_per_second": 5.743, + "eval_steps_per_second": 0.187, + "step": 580 + }, + { + "epoch": 0.27470449172576833, + "grad_norm": 6.904634952545166, + "learning_rate": 1.8201240636093948e-05, + "loss": 0.2677, + "step": 581 + }, + { + "epoch": 0.275177304964539, + "grad_norm": 6.0656328201293945, + "learning_rate": 1.819178346100642e-05, + "loss": 0.3325, + "step": 582 + }, + { + "epoch": 0.2756501182033097, + "grad_norm": 4.885197162628174, + "learning_rate": 1.8182303960452826e-05, + "loss": 0.2458, + "step": 583 + }, + { + "epoch": 0.2761229314420804, + "grad_norm": 5.368473052978516, + "learning_rate": 1.8172802160268116e-05, + "loss": 0.2929, + "step": 584 + }, + { + "epoch": 0.2765957446808511, + "grad_norm": 4.78055477142334, + "learning_rate": 1.8163278086347998e-05, + "loss": 0.2534, + "step": 585 + }, + { + "epoch": 0.27706855791962176, + "grad_norm": 8.146903038024902, + "learning_rate": 1.8153731764648907e-05, + "loss": 0.2733, + "step": 586 + }, + { + "epoch": 0.27754137115839245, + "grad_norm": 3.793304681777954, + "learning_rate": 1.8144163221187882e-05, + "loss": 0.2232, + "step": 587 + }, + { + "epoch": 0.27801418439716313, + "grad_norm": 4.330966949462891, + "learning_rate": 1.8134572482042555e-05, + "loss": 0.2709, + "step": 588 + }, + { + "epoch": 0.2784869976359338, + "grad_norm": 3.585026264190674, + "learning_rate": 1.8124959573351023e-05, + "loss": 0.1779, + "step": 589 + }, + { + "epoch": 0.2789598108747045, + "grad_norm": 5.1665754318237305, + "learning_rate": 1.8115324521311823e-05, + "loss": 0.2599, + "step": 590 + }, + { + "epoch": 0.2794326241134752, + "grad_norm": 4.275265216827393, + "learning_rate": 1.8105667352183823e-05, + "loss": 0.1805, + "step": 591 + }, + { + "epoch": 0.2799054373522459, + "grad_norm": 5.761347770690918, + "learning_rate": 1.809598809228618e-05, + "loss": 0.2043, + "step": 592 + }, + { + "epoch": 0.28037825059101656, + "grad_norm": 4.9206647872924805, + "learning_rate": 1.8086286767998253e-05, + "loss": 0.2351, + "step": 593 + }, + { + "epoch": 0.28085106382978725, + "grad_norm": 5.019208908081055, + "learning_rate": 1.807656340575953e-05, + "loss": 0.234, + "step": 594 + }, + { + "epoch": 0.28132387706855794, + "grad_norm": 6.9271626472473145, + "learning_rate": 1.8066818032069566e-05, + "loss": 0.3302, + "step": 595 + }, + { + "epoch": 0.2817966903073286, + "grad_norm": 6.962562084197998, + "learning_rate": 1.80570506734879e-05, + "loss": 0.265, + "step": 596 + }, + { + "epoch": 0.2822695035460993, + "grad_norm": 6.923299789428711, + "learning_rate": 1.804726135663399e-05, + "loss": 0.27, + "step": 597 + }, + { + "epoch": 0.28274231678487, + "grad_norm": 7.9528422355651855, + "learning_rate": 1.803745010818714e-05, + "loss": 0.2644, + "step": 598 + }, + { + "epoch": 0.2832151300236407, + "grad_norm": 6.256555557250977, + "learning_rate": 1.802761695488642e-05, + "loss": 0.296, + "step": 599 + }, + { + "epoch": 0.28368794326241137, + "grad_norm": 8.758502006530762, + "learning_rate": 1.8017761923530602e-05, + "loss": 0.2655, + "step": 600 + }, + { + "epoch": 0.28368794326241137, + "eval_accuracy": 0.8492239467849224, + "eval_f1": 0.6822429906542056, + "eval_loss": 0.32478421926498413, + "eval_precision": 0.8342857142857143, + "eval_recall": 0.5770750988142292, + "eval_runtime": 48.6007, + "eval_samples_per_second": 5.679, + "eval_steps_per_second": 0.185, + "step": 600 + }, + { + "epoch": 0.28416075650118205, + "grad_norm": 6.9206929206848145, + "learning_rate": 1.8007885040978078e-05, + "loss": 0.2534, + "step": 601 + }, + { + "epoch": 0.28463356973995274, + "grad_norm": 4.878746509552002, + "learning_rate": 1.7997986334146808e-05, + "loss": 0.2592, + "step": 602 + }, + { + "epoch": 0.2851063829787234, + "grad_norm": 6.951952934265137, + "learning_rate": 1.798806583001421e-05, + "loss": 0.3073, + "step": 603 + }, + { + "epoch": 0.2855791962174941, + "grad_norm": 6.235677242279053, + "learning_rate": 1.7978123555617116e-05, + "loss": 0.2217, + "step": 604 + }, + { + "epoch": 0.2860520094562648, + "grad_norm": 5.96851110458374, + "learning_rate": 1.7968159538051703e-05, + "loss": 0.3361, + "step": 605 + }, + { + "epoch": 0.2865248226950355, + "grad_norm": 7.12925386428833, + "learning_rate": 1.7958173804473373e-05, + "loss": 0.2454, + "step": 606 + }, + { + "epoch": 0.28699763593380617, + "grad_norm": 5.562047481536865, + "learning_rate": 1.7948166382096744e-05, + "loss": 0.2518, + "step": 607 + }, + { + "epoch": 0.28747044917257686, + "grad_norm": 5.448677062988281, + "learning_rate": 1.793813729819553e-05, + "loss": 0.241, + "step": 608 + }, + { + "epoch": 0.28794326241134754, + "grad_norm": 6.051489353179932, + "learning_rate": 1.7928086580102485e-05, + "loss": 0.2748, + "step": 609 + }, + { + "epoch": 0.28841607565011823, + "grad_norm": 5.5659661293029785, + "learning_rate": 1.791801425520931e-05, + "loss": 0.2435, + "step": 610 + }, + { + "epoch": 0.28888888888888886, + "grad_norm": 6.0021071434021, + "learning_rate": 1.790792035096661e-05, + "loss": 0.2785, + "step": 611 + }, + { + "epoch": 0.28936170212765955, + "grad_norm": 6.288322925567627, + "learning_rate": 1.789780489488379e-05, + "loss": 0.2959, + "step": 612 + }, + { + "epoch": 0.28983451536643023, + "grad_norm": 5.30917501449585, + "learning_rate": 1.7887667914528996e-05, + "loss": 0.1903, + "step": 613 + }, + { + "epoch": 0.2903073286052009, + "grad_norm": 5.338979721069336, + "learning_rate": 1.7877509437529032e-05, + "loss": 0.2522, + "step": 614 + }, + { + "epoch": 0.2907801418439716, + "grad_norm": 9.307381629943848, + "learning_rate": 1.7867329491569293e-05, + "loss": 0.3809, + "step": 615 + }, + { + "epoch": 0.2912529550827423, + "grad_norm": 8.209502220153809, + "learning_rate": 1.785712810439368e-05, + "loss": 0.3395, + "step": 616 + }, + { + "epoch": 0.291725768321513, + "grad_norm": 4.827156066894531, + "learning_rate": 1.7846905303804525e-05, + "loss": 0.2298, + "step": 617 + }, + { + "epoch": 0.29219858156028367, + "grad_norm": 3.754410743713379, + "learning_rate": 1.783666111766253e-05, + "loss": 0.149, + "step": 618 + }, + { + "epoch": 0.29267139479905435, + "grad_norm": 5.818631649017334, + "learning_rate": 1.782639557388667e-05, + "loss": 0.3478, + "step": 619 + }, + { + "epoch": 0.29314420803782504, + "grad_norm": 4.670815467834473, + "learning_rate": 1.781610870045414e-05, + "loss": 0.2646, + "step": 620 + }, + { + "epoch": 0.29314420803782504, + "eval_accuracy": 0.8481152993348116, + "eval_f1": 0.6682808716707022, + "eval_loss": 0.3289998471736908, + "eval_precision": 0.8625, + "eval_recall": 0.5454545454545454, + "eval_runtime": 47.6704, + "eval_samples_per_second": 5.79, + "eval_steps_per_second": 0.189, + "step": 620 + }, + { + "epoch": 0.2936170212765957, + "grad_norm": 4.68503475189209, + "learning_rate": 1.780580052540024e-05, + "loss": 0.2216, + "step": 621 + }, + { + "epoch": 0.2940898345153664, + "grad_norm": 8.284623146057129, + "learning_rate": 1.7795471076818356e-05, + "loss": 0.3633, + "step": 622 + }, + { + "epoch": 0.2945626477541371, + "grad_norm": 4.425292015075684, + "learning_rate": 1.7785120382859832e-05, + "loss": 0.2103, + "step": 623 + }, + { + "epoch": 0.2950354609929078, + "grad_norm": 5.457613468170166, + "learning_rate": 1.7774748471733915e-05, + "loss": 0.2615, + "step": 624 + }, + { + "epoch": 0.29550827423167847, + "grad_norm": 7.534078598022461, + "learning_rate": 1.776435537170768e-05, + "loss": 0.2994, + "step": 625 + }, + { + "epoch": 0.29598108747044916, + "grad_norm": 5.263175010681152, + "learning_rate": 1.7753941111105954e-05, + "loss": 0.221, + "step": 626 + }, + { + "epoch": 0.29645390070921984, + "grad_norm": 6.045238971710205, + "learning_rate": 1.7743505718311218e-05, + "loss": 0.2957, + "step": 627 + }, + { + "epoch": 0.29692671394799053, + "grad_norm": 6.148013591766357, + "learning_rate": 1.7733049221763565e-05, + "loss": 0.2611, + "step": 628 + }, + { + "epoch": 0.2973995271867612, + "grad_norm": 5.272988796234131, + "learning_rate": 1.772257164996059e-05, + "loss": 0.2432, + "step": 629 + }, + { + "epoch": 0.2978723404255319, + "grad_norm": 4.126927375793457, + "learning_rate": 1.7712073031457332e-05, + "loss": 0.2488, + "step": 630 + }, + { + "epoch": 0.2983451536643026, + "grad_norm": 5.052836894989014, + "learning_rate": 1.770155339486618e-05, + "loss": 0.3185, + "step": 631 + }, + { + "epoch": 0.2988179669030733, + "grad_norm": 6.2589240074157715, + "learning_rate": 1.7691012768856817e-05, + "loss": 0.2974, + "step": 632 + }, + { + "epoch": 0.29929078014184396, + "grad_norm": 9.090741157531738, + "learning_rate": 1.7680451182156123e-05, + "loss": 0.3296, + "step": 633 + }, + { + "epoch": 0.29976359338061465, + "grad_norm": 4.453991889953613, + "learning_rate": 1.7669868663548105e-05, + "loss": 0.2429, + "step": 634 + }, + { + "epoch": 0.30023640661938533, + "grad_norm": 4.265166282653809, + "learning_rate": 1.7659265241873815e-05, + "loss": 0.2038, + "step": 635 + }, + { + "epoch": 0.300709219858156, + "grad_norm": 6.1728901863098145, + "learning_rate": 1.7648640946031273e-05, + "loss": 0.2277, + "step": 636 + }, + { + "epoch": 0.3011820330969267, + "grad_norm": 3.938297986984253, + "learning_rate": 1.7637995804975392e-05, + "loss": 0.2425, + "step": 637 + }, + { + "epoch": 0.3016548463356974, + "grad_norm": 5.709317684173584, + "learning_rate": 1.7627329847717888e-05, + "loss": 0.2656, + "step": 638 + }, + { + "epoch": 0.3021276595744681, + "grad_norm": 5.384950160980225, + "learning_rate": 1.761664310332722e-05, + "loss": 0.3077, + "step": 639 + }, + { + "epoch": 0.30260047281323876, + "grad_norm": 5.811992645263672, + "learning_rate": 1.7605935600928486e-05, + "loss": 0.2706, + "step": 640 + }, + { + "epoch": 0.30260047281323876, + "eval_accuracy": 0.8481152993348116, + "eval_f1": 0.6836027713625866, + "eval_loss": 0.3192698657512665, + "eval_precision": 0.8222222222222222, + "eval_recall": 0.5849802371541502, + "eval_runtime": 48.4619, + "eval_samples_per_second": 5.695, + "eval_steps_per_second": 0.186, + "step": 640 + }, + { + "epoch": 0.30307328605200945, + "grad_norm": 7.0490241050720215, + "learning_rate": 1.759520736970337e-05, + "loss": 0.3842, + "step": 641 + }, + { + "epoch": 0.30354609929078014, + "grad_norm": 7.162063121795654, + "learning_rate": 1.7584458438890036e-05, + "loss": 0.3018, + "step": 642 + }, + { + "epoch": 0.3040189125295508, + "grad_norm": 7.344574928283691, + "learning_rate": 1.757368883778307e-05, + "loss": 0.281, + "step": 643 + }, + { + "epoch": 0.3044917257683215, + "grad_norm": 5.31951379776001, + "learning_rate": 1.7562898595733395e-05, + "loss": 0.2809, + "step": 644 + }, + { + "epoch": 0.3049645390070922, + "grad_norm": 6.758824348449707, + "learning_rate": 1.7552087742148176e-05, + "loss": 0.2333, + "step": 645 + }, + { + "epoch": 0.3054373522458629, + "grad_norm": 5.954471588134766, + "learning_rate": 1.754125630649076e-05, + "loss": 0.31, + "step": 646 + }, + { + "epoch": 0.30591016548463357, + "grad_norm": 5.111174583435059, + "learning_rate": 1.753040431828059e-05, + "loss": 0.2112, + "step": 647 + }, + { + "epoch": 0.30638297872340425, + "grad_norm": 3.9539942741394043, + "learning_rate": 1.751953180709311e-05, + "loss": 0.1611, + "step": 648 + }, + { + "epoch": 0.30685579196217494, + "grad_norm": 4.952718257904053, + "learning_rate": 1.750863880255971e-05, + "loss": 0.2869, + "step": 649 + }, + { + "epoch": 0.3073286052009456, + "grad_norm": 5.200640678405762, + "learning_rate": 1.7497725334367627e-05, + "loss": 0.2983, + "step": 650 + }, + { + "epoch": 0.3078014184397163, + "grad_norm": 9.411211013793945, + "learning_rate": 1.7486791432259858e-05, + "loss": 0.2823, + "step": 651 + }, + { + "epoch": 0.308274231678487, + "grad_norm": 5.560668468475342, + "learning_rate": 1.7475837126035105e-05, + "loss": 0.2646, + "step": 652 + }, + { + "epoch": 0.3087470449172577, + "grad_norm": 4.861005783081055, + "learning_rate": 1.746486244554767e-05, + "loss": 0.3096, + "step": 653 + }, + { + "epoch": 0.30921985815602837, + "grad_norm": 2.945842742919922, + "learning_rate": 1.7453867420707386e-05, + "loss": 0.1699, + "step": 654 + }, + { + "epoch": 0.30969267139479906, + "grad_norm": 5.4764933586120605, + "learning_rate": 1.7442852081479525e-05, + "loss": 0.2001, + "step": 655 + }, + { + "epoch": 0.31016548463356974, + "grad_norm": 5.168087005615234, + "learning_rate": 1.743181645788473e-05, + "loss": 0.207, + "step": 656 + }, + { + "epoch": 0.31063829787234043, + "grad_norm": 6.098107814788818, + "learning_rate": 1.742076057999892e-05, + "loss": 0.2383, + "step": 657 + }, + { + "epoch": 0.3111111111111111, + "grad_norm": 6.835310935974121, + "learning_rate": 1.7409684477953224e-05, + "loss": 0.2723, + "step": 658 + }, + { + "epoch": 0.3115839243498818, + "grad_norm": 8.645564079284668, + "learning_rate": 1.739858818193387e-05, + "loss": 0.3614, + "step": 659 + }, + { + "epoch": 0.3120567375886525, + "grad_norm": 5.342036724090576, + "learning_rate": 1.738747172218215e-05, + "loss": 0.2074, + "step": 660 + }, + { + "epoch": 0.3120567375886525, + "eval_accuracy": 0.8470066518847007, + "eval_f1": 0.655, + "eval_loss": 0.3505603075027466, + "eval_precision": 0.891156462585034, + "eval_recall": 0.5177865612648221, + "eval_runtime": 48.6815, + "eval_samples_per_second": 5.67, + "eval_steps_per_second": 0.185, + "step": 660 + }, + { + "epoch": 0.3125295508274232, + "grad_norm": 6.081332683563232, + "learning_rate": 1.7376335128994276e-05, + "loss": 0.3087, + "step": 661 + }, + { + "epoch": 0.31300236406619386, + "grad_norm": 5.119427680969238, + "learning_rate": 1.7365178432721358e-05, + "loss": 0.2799, + "step": 662 + }, + { + "epoch": 0.31347517730496455, + "grad_norm": 7.370607376098633, + "learning_rate": 1.7354001663769278e-05, + "loss": 0.2989, + "step": 663 + }, + { + "epoch": 0.31394799054373523, + "grad_norm": 4.7025885581970215, + "learning_rate": 1.734280485259863e-05, + "loss": 0.2622, + "step": 664 + }, + { + "epoch": 0.3144208037825059, + "grad_norm": 7.617417812347412, + "learning_rate": 1.7331588029724628e-05, + "loss": 0.3428, + "step": 665 + }, + { + "epoch": 0.3148936170212766, + "grad_norm": 4.964621543884277, + "learning_rate": 1.7320351225717025e-05, + "loss": 0.2216, + "step": 666 + }, + { + "epoch": 0.3153664302600473, + "grad_norm": 6.546290397644043, + "learning_rate": 1.730909447120003e-05, + "loss": 0.2092, + "step": 667 + }, + { + "epoch": 0.315839243498818, + "grad_norm": 6.265383243560791, + "learning_rate": 1.7297817796852227e-05, + "loss": 0.2734, + "step": 668 + }, + { + "epoch": 0.31631205673758866, + "grad_norm": 5.259603500366211, + "learning_rate": 1.728652123340648e-05, + "loss": 0.2409, + "step": 669 + }, + { + "epoch": 0.31678486997635935, + "grad_norm": 6.892948627471924, + "learning_rate": 1.7275204811649865e-05, + "loss": 0.311, + "step": 670 + }, + { + "epoch": 0.31725768321513004, + "grad_norm": 4.608394145965576, + "learning_rate": 1.7263868562423577e-05, + "loss": 0.2553, + "step": 671 + }, + { + "epoch": 0.3177304964539007, + "grad_norm": 6.246408462524414, + "learning_rate": 1.725251251662285e-05, + "loss": 0.2753, + "step": 672 + }, + { + "epoch": 0.3182033096926714, + "grad_norm": 5.615715026855469, + "learning_rate": 1.7241136705196865e-05, + "loss": 0.1744, + "step": 673 + }, + { + "epoch": 0.3186761229314421, + "grad_norm": 4.1983642578125, + "learning_rate": 1.7229741159148676e-05, + "loss": 0.2054, + "step": 674 + }, + { + "epoch": 0.3191489361702128, + "grad_norm": 7.475837230682373, + "learning_rate": 1.7218325909535118e-05, + "loss": 0.2695, + "step": 675 + }, + { + "epoch": 0.31962174940898347, + "grad_norm": 6.710148334503174, + "learning_rate": 1.7206890987466726e-05, + "loss": 0.2597, + "step": 676 + }, + { + "epoch": 0.32009456264775416, + "grad_norm": 5.378614902496338, + "learning_rate": 1.7195436424107648e-05, + "loss": 0.2669, + "step": 677 + }, + { + "epoch": 0.32056737588652484, + "grad_norm": 6.92887020111084, + "learning_rate": 1.7183962250675568e-05, + "loss": 0.3035, + "step": 678 + }, + { + "epoch": 0.3210401891252955, + "grad_norm": 5.467234134674072, + "learning_rate": 1.7172468498441604e-05, + "loss": 0.2622, + "step": 679 + }, + { + "epoch": 0.3215130023640662, + "grad_norm": 5.692148685455322, + "learning_rate": 1.7160955198730244e-05, + "loss": 0.2825, + "step": 680 + }, + { + "epoch": 0.3215130023640662, + "eval_accuracy": 0.8281596452328159, + "eval_f1": 0.5931758530183727, + "eval_loss": 0.35231587290763855, + "eval_precision": 0.8828125, + "eval_recall": 0.44664031620553357, + "eval_runtime": 47.4158, + "eval_samples_per_second": 5.821, + "eval_steps_per_second": 0.19, + "step": 680 + }, + { + "epoch": 0.3219858156028369, + "grad_norm": 5.253277778625488, + "learning_rate": 1.7149422382919237e-05, + "loss": 0.2007, + "step": 681 + }, + { + "epoch": 0.3224586288416076, + "grad_norm": 5.658674716949463, + "learning_rate": 1.7137870082439533e-05, + "loss": 0.2242, + "step": 682 + }, + { + "epoch": 0.3229314420803783, + "grad_norm": 6.746735095977783, + "learning_rate": 1.7126298328775175e-05, + "loss": 0.3869, + "step": 683 + }, + { + "epoch": 0.32340425531914896, + "grad_norm": 4.4457173347473145, + "learning_rate": 1.711470715346323e-05, + "loss": 0.207, + "step": 684 + }, + { + "epoch": 0.32387706855791965, + "grad_norm": 4.7827935218811035, + "learning_rate": 1.7103096588093686e-05, + "loss": 0.1964, + "step": 685 + }, + { + "epoch": 0.32434988179669033, + "grad_norm": 5.443333148956299, + "learning_rate": 1.7091466664309385e-05, + "loss": 0.2212, + "step": 686 + }, + { + "epoch": 0.324822695035461, + "grad_norm": 7.0208539962768555, + "learning_rate": 1.7079817413805927e-05, + "loss": 0.38, + "step": 687 + }, + { + "epoch": 0.3252955082742317, + "grad_norm": 4.507380485534668, + "learning_rate": 1.706814886833158e-05, + "loss": 0.2782, + "step": 688 + }, + { + "epoch": 0.3257683215130024, + "grad_norm": 5.8691301345825195, + "learning_rate": 1.7056461059687195e-05, + "loss": 0.2178, + "step": 689 + }, + { + "epoch": 0.3262411347517731, + "grad_norm": 7.219882011413574, + "learning_rate": 1.7044754019726127e-05, + "loss": 0.2707, + "step": 690 + }, + { + "epoch": 0.32671394799054376, + "grad_norm": 5.678999900817871, + "learning_rate": 1.703302778035415e-05, + "loss": 0.258, + "step": 691 + }, + { + "epoch": 0.3271867612293144, + "grad_norm": 6.334179878234863, + "learning_rate": 1.702128237352934e-05, + "loss": 0.2489, + "step": 692 + }, + { + "epoch": 0.3276595744680851, + "grad_norm": 7.485446453094482, + "learning_rate": 1.7009517831262034e-05, + "loss": 0.3043, + "step": 693 + }, + { + "epoch": 0.32813238770685577, + "grad_norm": 5.8358354568481445, + "learning_rate": 1.6997734185614712e-05, + "loss": 0.2401, + "step": 694 + }, + { + "epoch": 0.32860520094562645, + "grad_norm": 5.5207319259643555, + "learning_rate": 1.6985931468701915e-05, + "loss": 0.2512, + "step": 695 + }, + { + "epoch": 0.32907801418439714, + "grad_norm": 5.306708335876465, + "learning_rate": 1.6974109712690163e-05, + "loss": 0.2479, + "step": 696 + }, + { + "epoch": 0.3295508274231678, + "grad_norm": 5.970691204071045, + "learning_rate": 1.6962268949797862e-05, + "loss": 0.2745, + "step": 697 + }, + { + "epoch": 0.3300236406619385, + "grad_norm": 4.881795883178711, + "learning_rate": 1.695040921229522e-05, + "loss": 0.1999, + "step": 698 + }, + { + "epoch": 0.3304964539007092, + "grad_norm": 3.6859960556030273, + "learning_rate": 1.6938530532504155e-05, + "loss": 0.1434, + "step": 699 + }, + { + "epoch": 0.3309692671394799, + "grad_norm": 6.081749439239502, + "learning_rate": 1.692663294279821e-05, + "loss": 0.2718, + "step": 700 + }, + { + "epoch": 0.3309692671394799, + "eval_accuracy": 0.8270509977827051, + "eval_f1": 0.5828877005347594, + "eval_loss": 0.3708072304725647, + "eval_precision": 0.9008264462809917, + "eval_recall": 0.4308300395256917, + "eval_runtime": 49.3785, + "eval_samples_per_second": 5.589, + "eval_steps_per_second": 0.182, + "step": 700 + }, + { + "epoch": 0.33144208037825057, + "grad_norm": 4.934723377227783, + "learning_rate": 1.6914716475602474e-05, + "loss": 0.1914, + "step": 701 + }, + { + "epoch": 0.33191489361702126, + "grad_norm": 6.057024955749512, + "learning_rate": 1.690278116339346e-05, + "loss": 0.2151, + "step": 702 + }, + { + "epoch": 0.33238770685579194, + "grad_norm": 5.484874248504639, + "learning_rate": 1.689082703869907e-05, + "loss": 0.2675, + "step": 703 + }, + { + "epoch": 0.33286052009456263, + "grad_norm": 7.429450511932373, + "learning_rate": 1.687885413409845e-05, + "loss": 0.3249, + "step": 704 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 7.455477714538574, + "learning_rate": 1.6866862482221948e-05, + "loss": 0.3455, + "step": 705 + }, + { + "epoch": 0.333806146572104, + "grad_norm": 6.469564437866211, + "learning_rate": 1.685485211575099e-05, + "loss": 0.2674, + "step": 706 + }, + { + "epoch": 0.3342789598108747, + "grad_norm": 4.929532527923584, + "learning_rate": 1.684282306741802e-05, + "loss": 0.2082, + "step": 707 + }, + { + "epoch": 0.3347517730496454, + "grad_norm": 5.234260082244873, + "learning_rate": 1.6830775370006377e-05, + "loss": 0.1776, + "step": 708 + }, + { + "epoch": 0.33522458628841606, + "grad_norm": 7.192393779754639, + "learning_rate": 1.681870905635025e-05, + "loss": 0.2546, + "step": 709 + }, + { + "epoch": 0.33569739952718675, + "grad_norm": 5.962497711181641, + "learning_rate": 1.680662415933454e-05, + "loss": 0.2344, + "step": 710 + }, + { + "epoch": 0.33617021276595743, + "grad_norm": 6.120792865753174, + "learning_rate": 1.679452071189481e-05, + "loss": 0.2552, + "step": 711 + }, + { + "epoch": 0.3366430260047281, + "grad_norm": 8.747769355773926, + "learning_rate": 1.6782398747017176e-05, + "loss": 0.3015, + "step": 712 + }, + { + "epoch": 0.3371158392434988, + "grad_norm": 4.518637657165527, + "learning_rate": 1.6770258297738213e-05, + "loss": 0.2825, + "step": 713 + }, + { + "epoch": 0.3375886524822695, + "grad_norm": 6.613455295562744, + "learning_rate": 1.6758099397144884e-05, + "loss": 0.3259, + "step": 714 + }, + { + "epoch": 0.3380614657210402, + "grad_norm": 6.923367023468018, + "learning_rate": 1.674592207837443e-05, + "loss": 0.2302, + "step": 715 + }, + { + "epoch": 0.33853427895981086, + "grad_norm": 7.601401329040527, + "learning_rate": 1.6733726374614287e-05, + "loss": 0.2771, + "step": 716 + }, + { + "epoch": 0.33900709219858155, + "grad_norm": 4.864050388336182, + "learning_rate": 1.6721512319102006e-05, + "loss": 0.2364, + "step": 717 + }, + { + "epoch": 0.33947990543735224, + "grad_norm": 4.241363048553467, + "learning_rate": 1.670927994512514e-05, + "loss": 0.2275, + "step": 718 + }, + { + "epoch": 0.3399527186761229, + "grad_norm": 9.842682838439941, + "learning_rate": 1.6697029286021182e-05, + "loss": 0.3548, + "step": 719 + }, + { + "epoch": 0.3404255319148936, + "grad_norm": 7.171640396118164, + "learning_rate": 1.6684760375177442e-05, + "loss": 0.2172, + "step": 720 + }, + { + "epoch": 0.3404255319148936, + "eval_accuracy": 0.8237250554323725, + "eval_f1": 0.5667574931880109, + "eval_loss": 0.3734827935695648, + "eval_precision": 0.9122807017543859, + "eval_recall": 0.41106719367588934, + "eval_runtime": 47.8437, + "eval_samples_per_second": 5.769, + "eval_steps_per_second": 0.188, + "step": 720 + }, + { + "epoch": 0.3408983451536643, + "grad_norm": 5.395704746246338, + "learning_rate": 1.667247324603098e-05, + "loss": 0.1952, + "step": 721 + }, + { + "epoch": 0.341371158392435, + "grad_norm": 6.559274196624756, + "learning_rate": 1.666016793206851e-05, + "loss": 0.2231, + "step": 722 + }, + { + "epoch": 0.34184397163120567, + "grad_norm": 5.171023368835449, + "learning_rate": 1.6647844466826302e-05, + "loss": 0.251, + "step": 723 + }, + { + "epoch": 0.34231678486997635, + "grad_norm": 6.1227898597717285, + "learning_rate": 1.6635502883890098e-05, + "loss": 0.2674, + "step": 724 + }, + { + "epoch": 0.34278959810874704, + "grad_norm": 4.9917802810668945, + "learning_rate": 1.6623143216895008e-05, + "loss": 0.2228, + "step": 725 + }, + { + "epoch": 0.3432624113475177, + "grad_norm": 4.765135765075684, + "learning_rate": 1.661076549952544e-05, + "loss": 0.1833, + "step": 726 + }, + { + "epoch": 0.3437352245862884, + "grad_norm": 5.079737186431885, + "learning_rate": 1.6598369765514986e-05, + "loss": 0.2315, + "step": 727 + }, + { + "epoch": 0.3442080378250591, + "grad_norm": 8.060894012451172, + "learning_rate": 1.6585956048646345e-05, + "loss": 0.3144, + "step": 728 + }, + { + "epoch": 0.3446808510638298, + "grad_norm": 5.104706287384033, + "learning_rate": 1.657352438275122e-05, + "loss": 0.2241, + "step": 729 + }, + { + "epoch": 0.34515366430260047, + "grad_norm": 7.560702323913574, + "learning_rate": 1.656107480171024e-05, + "loss": 0.2651, + "step": 730 + }, + { + "epoch": 0.34562647754137116, + "grad_norm": 4.7865190505981445, + "learning_rate": 1.6548607339452853e-05, + "loss": 0.1864, + "step": 731 + }, + { + "epoch": 0.34609929078014184, + "grad_norm": 7.309717178344727, + "learning_rate": 1.6536122029957237e-05, + "loss": 0.2793, + "step": 732 + }, + { + "epoch": 0.34657210401891253, + "grad_norm": 5.886257171630859, + "learning_rate": 1.6523618907250215e-05, + "loss": 0.283, + "step": 733 + }, + { + "epoch": 0.3470449172576832, + "grad_norm": 7.503266334533691, + "learning_rate": 1.6511098005407157e-05, + "loss": 0.2675, + "step": 734 + }, + { + "epoch": 0.3475177304964539, + "grad_norm": 6.831967830657959, + "learning_rate": 1.6498559358551885e-05, + "loss": 0.2302, + "step": 735 + }, + { + "epoch": 0.3479905437352246, + "grad_norm": 5.59326696395874, + "learning_rate": 1.6486003000856587e-05, + "loss": 0.2629, + "step": 736 + }, + { + "epoch": 0.3484633569739953, + "grad_norm": 6.322920799255371, + "learning_rate": 1.647342896654171e-05, + "loss": 0.3043, + "step": 737 + }, + { + "epoch": 0.34893617021276596, + "grad_norm": 7.298335552215576, + "learning_rate": 1.6460837289875886e-05, + "loss": 0.2891, + "step": 738 + }, + { + "epoch": 0.34940898345153665, + "grad_norm": 5.698408126831055, + "learning_rate": 1.6448228005175818e-05, + "loss": 0.2265, + "step": 739 + }, + { + "epoch": 0.34988179669030733, + "grad_norm": 4.371240139007568, + "learning_rate": 1.643560114680621e-05, + "loss": 0.1876, + "step": 740 + }, + { + "epoch": 0.34988179669030733, + "eval_accuracy": 0.8392461197339246, + "eval_f1": 0.6214099216710183, + "eval_loss": 0.3518848717212677, + "eval_precision": 0.9153846153846154, + "eval_recall": 0.47035573122529645, + "eval_runtime": 47.5745, + "eval_samples_per_second": 5.801, + "eval_steps_per_second": 0.189, + "step": 740 + }, + { + "epoch": 0.350354609929078, + "grad_norm": 5.097601413726807, + "learning_rate": 1.642295674917965e-05, + "loss": 0.2459, + "step": 741 + }, + { + "epoch": 0.3508274231678487, + "grad_norm": 6.104417324066162, + "learning_rate": 1.641029484675653e-05, + "loss": 0.1901, + "step": 742 + }, + { + "epoch": 0.3513002364066194, + "grad_norm": 6.226688385009766, + "learning_rate": 1.639761547404495e-05, + "loss": 0.2534, + "step": 743 + }, + { + "epoch": 0.3517730496453901, + "grad_norm": 6.888615608215332, + "learning_rate": 1.6384918665600623e-05, + "loss": 0.2798, + "step": 744 + }, + { + "epoch": 0.35224586288416077, + "grad_norm": 3.7885279655456543, + "learning_rate": 1.6372204456026774e-05, + "loss": 0.177, + "step": 745 + }, + { + "epoch": 0.35271867612293145, + "grad_norm": 7.243451118469238, + "learning_rate": 1.6359472879974064e-05, + "loss": 0.2581, + "step": 746 + }, + { + "epoch": 0.35319148936170214, + "grad_norm": 5.321907043457031, + "learning_rate": 1.634672397214047e-05, + "loss": 0.2978, + "step": 747 + }, + { + "epoch": 0.3536643026004728, + "grad_norm": 4.163849830627441, + "learning_rate": 1.633395776727121e-05, + "loss": 0.1865, + "step": 748 + }, + { + "epoch": 0.3541371158392435, + "grad_norm": 5.830822467803955, + "learning_rate": 1.632117430015865e-05, + "loss": 0.2759, + "step": 749 + }, + { + "epoch": 0.3546099290780142, + "grad_norm": 6.779140949249268, + "learning_rate": 1.6308373605642192e-05, + "loss": 0.2363, + "step": 750 + }, + { + "epoch": 0.3550827423167849, + "grad_norm": 5.8843770027160645, + "learning_rate": 1.629555571860819e-05, + "loss": 0.2933, + "step": 751 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 4.655647277832031, + "learning_rate": 1.628272067398986e-05, + "loss": 0.1564, + "step": 752 + }, + { + "epoch": 0.35602836879432626, + "grad_norm": 5.365359306335449, + "learning_rate": 1.626986850676717e-05, + "loss": 0.1929, + "step": 753 + }, + { + "epoch": 0.35650118203309694, + "grad_norm": 4.647514820098877, + "learning_rate": 1.625699925196675e-05, + "loss": 0.1701, + "step": 754 + }, + { + "epoch": 0.35697399527186763, + "grad_norm": 5.949582099914551, + "learning_rate": 1.624411294466182e-05, + "loss": 0.1973, + "step": 755 + }, + { + "epoch": 0.3574468085106383, + "grad_norm": 6.162478446960449, + "learning_rate": 1.623120961997205e-05, + "loss": 0.1898, + "step": 756 + }, + { + "epoch": 0.357919621749409, + "grad_norm": 5.563331604003906, + "learning_rate": 1.6218289313063503e-05, + "loss": 0.228, + "step": 757 + }, + { + "epoch": 0.3583924349881797, + "grad_norm": 4.954248428344727, + "learning_rate": 1.6205352059148522e-05, + "loss": 0.2102, + "step": 758 + }, + { + "epoch": 0.3588652482269504, + "grad_norm": 5.003850936889648, + "learning_rate": 1.619239789348563e-05, + "loss": 0.2235, + "step": 759 + }, + { + "epoch": 0.35933806146572106, + "grad_norm": 7.022822856903076, + "learning_rate": 1.6179426851379443e-05, + "loss": 0.2788, + "step": 760 + }, + { + "epoch": 0.35933806146572106, + "eval_accuracy": 0.8348115299334812, + "eval_f1": 0.6246851385390428, + "eval_loss": 0.3573962152004242, + "eval_precision": 0.8611111111111112, + "eval_recall": 0.4901185770750988, + "eval_runtime": 47.6981, + "eval_samples_per_second": 5.786, + "eval_steps_per_second": 0.189, + "step": 760 + }, + { + "epoch": 0.35981087470449175, + "grad_norm": 5.163010597229004, + "learning_rate": 1.6166438968180582e-05, + "loss": 0.194, + "step": 761 + }, + { + "epoch": 0.36028368794326243, + "grad_norm": 7.249414920806885, + "learning_rate": 1.615343427928555e-05, + "loss": 0.2594, + "step": 762 + }, + { + "epoch": 0.3607565011820331, + "grad_norm": 3.0018277168273926, + "learning_rate": 1.614041282013666e-05, + "loss": 0.1381, + "step": 763 + }, + { + "epoch": 0.3612293144208038, + "grad_norm": 5.234002590179443, + "learning_rate": 1.6127374626221934e-05, + "loss": 0.2252, + "step": 764 + }, + { + "epoch": 0.3617021276595745, + "grad_norm": 5.7721848487854, + "learning_rate": 1.6114319733074986e-05, + "loss": 0.3073, + "step": 765 + }, + { + "epoch": 0.3621749408983452, + "grad_norm": 7.471461296081543, + "learning_rate": 1.6101248176274958e-05, + "loss": 0.2948, + "step": 766 + }, + { + "epoch": 0.36264775413711586, + "grad_norm": 6.112615585327148, + "learning_rate": 1.6088159991446397e-05, + "loss": 0.2433, + "step": 767 + }, + { + "epoch": 0.36312056737588655, + "grad_norm": 5.112131118774414, + "learning_rate": 1.6075055214259174e-05, + "loss": 0.1972, + "step": 768 + }, + { + "epoch": 0.36359338061465724, + "grad_norm": 6.367164611816406, + "learning_rate": 1.606193388042837e-05, + "loss": 0.2283, + "step": 769 + }, + { + "epoch": 0.3640661938534279, + "grad_norm": 6.986507892608643, + "learning_rate": 1.60487960257142e-05, + "loss": 0.233, + "step": 770 + }, + { + "epoch": 0.3645390070921986, + "grad_norm": 4.459200382232666, + "learning_rate": 1.6035641685921895e-05, + "loss": 0.1947, + "step": 771 + }, + { + "epoch": 0.3650118203309693, + "grad_norm": 4.415493965148926, + "learning_rate": 1.602247089690162e-05, + "loss": 0.1612, + "step": 772 + }, + { + "epoch": 0.3654846335697399, + "grad_norm": 6.583262920379639, + "learning_rate": 1.6009283694548365e-05, + "loss": 0.234, + "step": 773 + }, + { + "epoch": 0.3659574468085106, + "grad_norm": 7.73126745223999, + "learning_rate": 1.5996080114801858e-05, + "loss": 0.2687, + "step": 774 + }, + { + "epoch": 0.3664302600472813, + "grad_norm": 6.350796222686768, + "learning_rate": 1.598286019364645e-05, + "loss": 0.2279, + "step": 775 + }, + { + "epoch": 0.366903073286052, + "grad_norm": 4.372172832489014, + "learning_rate": 1.596962396711104e-05, + "loss": 0.1742, + "step": 776 + }, + { + "epoch": 0.36737588652482267, + "grad_norm": 7.295071125030518, + "learning_rate": 1.5956371471268968e-05, + "loss": 0.2632, + "step": 777 + }, + { + "epoch": 0.36784869976359336, + "grad_norm": 7.446830749511719, + "learning_rate": 1.5943102742237894e-05, + "loss": 0.2026, + "step": 778 + }, + { + "epoch": 0.36832151300236404, + "grad_norm": 6.453531742095947, + "learning_rate": 1.5929817816179733e-05, + "loss": 0.3007, + "step": 779 + }, + { + "epoch": 0.36879432624113473, + "grad_norm": 5.014437198638916, + "learning_rate": 1.591651672930054e-05, + "loss": 0.305, + "step": 780 + }, + { + "epoch": 0.36879432624113473, + "eval_accuracy": 0.8580931263858093, + "eval_f1": 0.7037037037037037, + "eval_loss": 0.31539109349250793, + "eval_precision": 0.8491620111731844, + "eval_recall": 0.6007905138339921, + "eval_runtime": 47.4123, + "eval_samples_per_second": 5.821, + "eval_steps_per_second": 0.19, + "step": 780 + }, + { + "epoch": 0.3692671394799054, + "grad_norm": 4.877912521362305, + "learning_rate": 1.5903199517850422e-05, + "loss": 0.2521, + "step": 781 + }, + { + "epoch": 0.3697399527186761, + "grad_norm": 5.180963516235352, + "learning_rate": 1.5889866218123414e-05, + "loss": 0.2296, + "step": 782 + }, + { + "epoch": 0.3702127659574468, + "grad_norm": 6.379220008850098, + "learning_rate": 1.5876516866457412e-05, + "loss": 0.252, + "step": 783 + }, + { + "epoch": 0.3706855791962175, + "grad_norm": 5.103360652923584, + "learning_rate": 1.5863151499234053e-05, + "loss": 0.2349, + "step": 784 + }, + { + "epoch": 0.37115839243498816, + "grad_norm": 5.573483943939209, + "learning_rate": 1.5849770152878622e-05, + "loss": 0.2627, + "step": 785 + }, + { + "epoch": 0.37163120567375885, + "grad_norm": 4.954294681549072, + "learning_rate": 1.583637286385995e-05, + "loss": 0.2475, + "step": 786 + }, + { + "epoch": 0.37210401891252953, + "grad_norm": 8.134040832519531, + "learning_rate": 1.5822959668690325e-05, + "loss": 0.4456, + "step": 787 + }, + { + "epoch": 0.3725768321513002, + "grad_norm": 5.643916130065918, + "learning_rate": 1.5809530603925378e-05, + "loss": 0.3051, + "step": 788 + }, + { + "epoch": 0.3730496453900709, + "grad_norm": 5.253788471221924, + "learning_rate": 1.5796085706163997e-05, + "loss": 0.254, + "step": 789 + }, + { + "epoch": 0.3735224586288416, + "grad_norm": 7.210267543792725, + "learning_rate": 1.5782625012048212e-05, + "loss": 0.2753, + "step": 790 + }, + { + "epoch": 0.3739952718676123, + "grad_norm": 4.508166790008545, + "learning_rate": 1.5769148558263108e-05, + "loss": 0.3106, + "step": 791 + }, + { + "epoch": 0.37446808510638296, + "grad_norm": 4.896869659423828, + "learning_rate": 1.575565638153672e-05, + "loss": 0.2384, + "step": 792 + }, + { + "epoch": 0.37494089834515365, + "grad_norm": 4.60262393951416, + "learning_rate": 1.574214851863993e-05, + "loss": 0.2742, + "step": 793 + }, + { + "epoch": 0.37541371158392434, + "grad_norm": 5.893795490264893, + "learning_rate": 1.572862500638639e-05, + "loss": 0.2526, + "step": 794 + }, + { + "epoch": 0.375886524822695, + "grad_norm": 4.181454181671143, + "learning_rate": 1.5715085881632366e-05, + "loss": 0.2012, + "step": 795 + }, + { + "epoch": 0.3763593380614657, + "grad_norm": 5.785640716552734, + "learning_rate": 1.5701531181276703e-05, + "loss": 0.2868, + "step": 796 + }, + { + "epoch": 0.3768321513002364, + "grad_norm": 4.539717197418213, + "learning_rate": 1.5687960942260687e-05, + "loss": 0.1804, + "step": 797 + }, + { + "epoch": 0.3773049645390071, + "grad_norm": 4.89398717880249, + "learning_rate": 1.5674375201567948e-05, + "loss": 0.1735, + "step": 798 + }, + { + "epoch": 0.37777777777777777, + "grad_norm": 6.449411392211914, + "learning_rate": 1.566077399622436e-05, + "loss": 0.342, + "step": 799 + }, + { + "epoch": 0.37825059101654845, + "grad_norm": 4.286831378936768, + "learning_rate": 1.5647157363297964e-05, + "loss": 0.2726, + "step": 800 + }, + { + "epoch": 0.37825059101654845, + "eval_accuracy": 0.8458980044345898, + "eval_f1": 0.6567901234567901, + "eval_loss": 0.3148706555366516, + "eval_precision": 0.875, + "eval_recall": 0.525691699604743, + "eval_runtime": 47.1619, + "eval_samples_per_second": 5.852, + "eval_steps_per_second": 0.191, + "step": 800 + }, + { + "epoch": 0.37872340425531914, + "grad_norm": 5.2504563331604, + "learning_rate": 1.5633525339898818e-05, + "loss": 0.2679, + "step": 801 + }, + { + "epoch": 0.3791962174940898, + "grad_norm": 5.007554531097412, + "learning_rate": 1.5619877963178952e-05, + "loss": 0.2399, + "step": 802 + }, + { + "epoch": 0.3796690307328605, + "grad_norm": 3.274820566177368, + "learning_rate": 1.5606215270332216e-05, + "loss": 0.1511, + "step": 803 + }, + { + "epoch": 0.3801418439716312, + "grad_norm": 4.302379131317139, + "learning_rate": 1.559253729859421e-05, + "loss": 0.2247, + "step": 804 + }, + { + "epoch": 0.3806146572104019, + "grad_norm": 4.160916805267334, + "learning_rate": 1.5578844085242185e-05, + "loss": 0.2082, + "step": 805 + }, + { + "epoch": 0.38108747044917257, + "grad_norm": 5.578160285949707, + "learning_rate": 1.5565135667594916e-05, + "loss": 0.3049, + "step": 806 + }, + { + "epoch": 0.38156028368794326, + "grad_norm": 7.35500431060791, + "learning_rate": 1.555141208301262e-05, + "loss": 0.2808, + "step": 807 + }, + { + "epoch": 0.38203309692671394, + "grad_norm": 5.54599666595459, + "learning_rate": 1.5537673368896853e-05, + "loss": 0.2069, + "step": 808 + }, + { + "epoch": 0.38250591016548463, + "grad_norm": 4.696985721588135, + "learning_rate": 1.55239195626904e-05, + "loss": 0.2765, + "step": 809 + }, + { + "epoch": 0.3829787234042553, + "grad_norm": 6.143385410308838, + "learning_rate": 1.5510150701877178e-05, + "loss": 0.1958, + "step": 810 + }, + { + "epoch": 0.383451536643026, + "grad_norm": 6.515667915344238, + "learning_rate": 1.549636682398213e-05, + "loss": 0.2543, + "step": 811 + }, + { + "epoch": 0.3839243498817967, + "grad_norm": 4.393880367279053, + "learning_rate": 1.5482567966571136e-05, + "loss": 0.2278, + "step": 812 + }, + { + "epoch": 0.3843971631205674, + "grad_norm": 8.271415710449219, + "learning_rate": 1.546875416725089e-05, + "loss": 0.25, + "step": 813 + }, + { + "epoch": 0.38486997635933806, + "grad_norm": 5.564967155456543, + "learning_rate": 1.5454925463668812e-05, + "loss": 0.2286, + "step": 814 + }, + { + "epoch": 0.38534278959810875, + "grad_norm": 4.746275424957275, + "learning_rate": 1.5441081893512933e-05, + "loss": 0.2164, + "step": 815 + }, + { + "epoch": 0.38581560283687943, + "grad_norm": 7.916270732879639, + "learning_rate": 1.5427223494511824e-05, + "loss": 0.3749, + "step": 816 + }, + { + "epoch": 0.3862884160756501, + "grad_norm": 4.836629867553711, + "learning_rate": 1.541335030443444e-05, + "loss": 0.1946, + "step": 817 + }, + { + "epoch": 0.3867612293144208, + "grad_norm": 5.497342586517334, + "learning_rate": 1.539946236109007e-05, + "loss": 0.2712, + "step": 818 + }, + { + "epoch": 0.3872340425531915, + "grad_norm": 4.717584133148193, + "learning_rate": 1.5385559702328195e-05, + "loss": 0.239, + "step": 819 + }, + { + "epoch": 0.3877068557919622, + "grad_norm": 6.673068046569824, + "learning_rate": 1.5371642366038412e-05, + "loss": 0.2819, + "step": 820 + }, + { + "epoch": 0.3877068557919622, + "eval_accuracy": 0.8580931263858093, + "eval_f1": 0.7276595744680852, + "eval_loss": 0.30154746770858765, + "eval_precision": 0.7880184331797235, + "eval_recall": 0.6758893280632411, + "eval_runtime": 48.4441, + "eval_samples_per_second": 5.697, + "eval_steps_per_second": 0.186, + "step": 820 + }, + { + "epoch": 0.38817966903073287, + "grad_norm": 8.295758247375488, + "learning_rate": 1.5357710390150312e-05, + "loss": 0.2953, + "step": 821 + }, + { + "epoch": 0.38865248226950355, + "grad_norm": 6.9379730224609375, + "learning_rate": 1.5343763812633393e-05, + "loss": 0.2614, + "step": 822 + }, + { + "epoch": 0.38912529550827424, + "grad_norm": 5.640291690826416, + "learning_rate": 1.5329802671496935e-05, + "loss": 0.2978, + "step": 823 + }, + { + "epoch": 0.3895981087470449, + "grad_norm": 5.361009120941162, + "learning_rate": 1.5315827004789918e-05, + "loss": 0.3108, + "step": 824 + }, + { + "epoch": 0.3900709219858156, + "grad_norm": 5.312415599822998, + "learning_rate": 1.53018368506009e-05, + "loss": 0.1958, + "step": 825 + }, + { + "epoch": 0.3905437352245863, + "grad_norm": 4.690582752227783, + "learning_rate": 1.5287832247057936e-05, + "loss": 0.2102, + "step": 826 + }, + { + "epoch": 0.391016548463357, + "grad_norm": 5.4220099449157715, + "learning_rate": 1.527381323232845e-05, + "loss": 0.1965, + "step": 827 + }, + { + "epoch": 0.39148936170212767, + "grad_norm": 6.630805969238281, + "learning_rate": 1.5259779844619152e-05, + "loss": 0.2573, + "step": 828 + }, + { + "epoch": 0.39196217494089836, + "grad_norm": 4.912630081176758, + "learning_rate": 1.524573212217591e-05, + "loss": 0.2715, + "step": 829 + }, + { + "epoch": 0.39243498817966904, + "grad_norm": 5.768490314483643, + "learning_rate": 1.5231670103283665e-05, + "loss": 0.2107, + "step": 830 + }, + { + "epoch": 0.39290780141843973, + "grad_norm": 11.683192253112793, + "learning_rate": 1.521759382626632e-05, + "loss": 0.3559, + "step": 831 + }, + { + "epoch": 0.3933806146572104, + "grad_norm": 6.212742805480957, + "learning_rate": 1.5203503329486649e-05, + "loss": 0.299, + "step": 832 + }, + { + "epoch": 0.3938534278959811, + "grad_norm": 11.167441368103027, + "learning_rate": 1.5189398651346153e-05, + "loss": 0.4404, + "step": 833 + }, + { + "epoch": 0.3943262411347518, + "grad_norm": 4.801130294799805, + "learning_rate": 1.5175279830285006e-05, + "loss": 0.2968, + "step": 834 + }, + { + "epoch": 0.3947990543735225, + "grad_norm": 4.244668006896973, + "learning_rate": 1.5161146904781918e-05, + "loss": 0.2195, + "step": 835 + }, + { + "epoch": 0.39527186761229316, + "grad_norm": 4.198855876922607, + "learning_rate": 1.514699991335404e-05, + "loss": 0.2572, + "step": 836 + }, + { + "epoch": 0.39574468085106385, + "grad_norm": 3.7614452838897705, + "learning_rate": 1.5132838894556848e-05, + "loss": 0.2454, + "step": 837 + }, + { + "epoch": 0.39621749408983453, + "grad_norm": 3.7950305938720703, + "learning_rate": 1.5118663886984065e-05, + "loss": 0.2254, + "step": 838 + }, + { + "epoch": 0.3966903073286052, + "grad_norm": 7.833040714263916, + "learning_rate": 1.510447492926752e-05, + "loss": 0.3283, + "step": 839 + }, + { + "epoch": 0.3971631205673759, + "grad_norm": 4.6039204597473145, + "learning_rate": 1.5090272060077081e-05, + "loss": 0.2596, + "step": 840 + }, + { + "epoch": 0.3971631205673759, + "eval_accuracy": 0.8547671840354767, + "eval_f1": 0.7298969072164948, + "eval_loss": 0.3099477291107178, + "eval_precision": 0.7629310344827587, + "eval_recall": 0.6996047430830039, + "eval_runtime": 48.4738, + "eval_samples_per_second": 5.694, + "eval_steps_per_second": 0.186, + "step": 840 + }, + { + "epoch": 0.3976359338061466, + "grad_norm": 6.1639485359191895, + "learning_rate": 1.5076055318120508e-05, + "loss": 0.2616, + "step": 841 + }, + { + "epoch": 0.3981087470449173, + "grad_norm": 5.882129192352295, + "learning_rate": 1.5061824742143388e-05, + "loss": 0.2296, + "step": 842 + }, + { + "epoch": 0.39858156028368796, + "grad_norm": 6.4986772537231445, + "learning_rate": 1.5047580370928994e-05, + "loss": 0.3221, + "step": 843 + }, + { + "epoch": 0.39905437352245865, + "grad_norm": 4.624194622039795, + "learning_rate": 1.5033322243298209e-05, + "loss": 0.2522, + "step": 844 + }, + { + "epoch": 0.39952718676122934, + "grad_norm": 4.8714280128479, + "learning_rate": 1.50190503981094e-05, + "loss": 0.2503, + "step": 845 + }, + { + "epoch": 0.4, + "grad_norm": 6.177154541015625, + "learning_rate": 1.5004764874258327e-05, + "loss": 0.283, + "step": 846 + }, + { + "epoch": 0.4004728132387707, + "grad_norm": 6.643271446228027, + "learning_rate": 1.4990465710678015e-05, + "loss": 0.3263, + "step": 847 + }, + { + "epoch": 0.4009456264775414, + "grad_norm": 5.074257850646973, + "learning_rate": 1.4976152946338673e-05, + "loss": 0.2613, + "step": 848 + }, + { + "epoch": 0.4014184397163121, + "grad_norm": 4.194014072418213, + "learning_rate": 1.4961826620247574e-05, + "loss": 0.221, + "step": 849 + }, + { + "epoch": 0.40189125295508277, + "grad_norm": 4.170263767242432, + "learning_rate": 1.4947486771448955e-05, + "loss": 0.2559, + "step": 850 + }, + { + "epoch": 0.40236406619385345, + "grad_norm": 5.984470844268799, + "learning_rate": 1.4933133439023903e-05, + "loss": 0.3017, + "step": 851 + }, + { + "epoch": 0.40283687943262414, + "grad_norm": 3.760006904602051, + "learning_rate": 1.4918766662090248e-05, + "loss": 0.197, + "step": 852 + }, + { + "epoch": 0.4033096926713948, + "grad_norm": 4.673705101013184, + "learning_rate": 1.4904386479802471e-05, + "loss": 0.2784, + "step": 853 + }, + { + "epoch": 0.4037825059101655, + "grad_norm": 8.06790828704834, + "learning_rate": 1.4889992931351578e-05, + "loss": 0.2297, + "step": 854 + }, + { + "epoch": 0.40425531914893614, + "grad_norm": 3.7845892906188965, + "learning_rate": 1.4875586055965014e-05, + "loss": 0.2101, + "step": 855 + }, + { + "epoch": 0.40472813238770683, + "grad_norm": 3.8769285678863525, + "learning_rate": 1.4861165892906532e-05, + "loss": 0.1641, + "step": 856 + }, + { + "epoch": 0.4052009456264775, + "grad_norm": 3.3952763080596924, + "learning_rate": 1.4846732481476105e-05, + "loss": 0.1768, + "step": 857 + }, + { + "epoch": 0.4056737588652482, + "grad_norm": 6.208580493927002, + "learning_rate": 1.4832285861009812e-05, + "loss": 0.1995, + "step": 858 + }, + { + "epoch": 0.4061465721040189, + "grad_norm": 6.829061508178711, + "learning_rate": 1.4817826070879732e-05, + "loss": 0.3429, + "step": 859 + }, + { + "epoch": 0.4066193853427896, + "grad_norm": 4.501508712768555, + "learning_rate": 1.4803353150493834e-05, + "loss": 0.185, + "step": 860 + }, + { + "epoch": 0.4066193853427896, + "eval_accuracy": 0.8614190687361419, + "eval_f1": 0.7203579418344519, + "eval_loss": 0.3079231381416321, + "eval_precision": 0.8298969072164949, + "eval_recall": 0.6363636363636364, + "eval_runtime": 48.0809, + "eval_samples_per_second": 5.74, + "eval_steps_per_second": 0.187, + "step": 860 + }, + { + "epoch": 0.40709219858156026, + "grad_norm": 7.904217720031738, + "learning_rate": 1.478886713929587e-05, + "loss": 0.2896, + "step": 861 + }, + { + "epoch": 0.40756501182033095, + "grad_norm": 5.54583740234375, + "learning_rate": 1.4774368076765272e-05, + "loss": 0.2334, + "step": 862 + }, + { + "epoch": 0.40803782505910163, + "grad_norm": 6.930192470550537, + "learning_rate": 1.4759856002417046e-05, + "loss": 0.233, + "step": 863 + }, + { + "epoch": 0.4085106382978723, + "grad_norm": 7.0124335289001465, + "learning_rate": 1.4745330955801644e-05, + "loss": 0.2996, + "step": 864 + }, + { + "epoch": 0.408983451536643, + "grad_norm": 7.793242454528809, + "learning_rate": 1.4730792976504892e-05, + "loss": 0.1966, + "step": 865 + }, + { + "epoch": 0.4094562647754137, + "grad_norm": 6.164129734039307, + "learning_rate": 1.4716242104147849e-05, + "loss": 0.2556, + "step": 866 + }, + { + "epoch": 0.4099290780141844, + "grad_norm": 5.059127330780029, + "learning_rate": 1.470167837838671e-05, + "loss": 0.1843, + "step": 867 + }, + { + "epoch": 0.41040189125295506, + "grad_norm": 7.891740798950195, + "learning_rate": 1.4687101838912713e-05, + "loss": 0.2942, + "step": 868 + }, + { + "epoch": 0.41087470449172575, + "grad_norm": 8.02418327331543, + "learning_rate": 1.467251252545201e-05, + "loss": 0.2544, + "step": 869 + }, + { + "epoch": 0.41134751773049644, + "grad_norm": 7.103123188018799, + "learning_rate": 1.4657910477765564e-05, + "loss": 0.2167, + "step": 870 + }, + { + "epoch": 0.4118203309692671, + "grad_norm": 6.880304336547852, + "learning_rate": 1.4643295735649044e-05, + "loss": 0.3523, + "step": 871 + }, + { + "epoch": 0.4122931442080378, + "grad_norm": 5.1397576332092285, + "learning_rate": 1.4628668338932721e-05, + "loss": 0.2939, + "step": 872 + }, + { + "epoch": 0.4127659574468085, + "grad_norm": 4.4353346824646, + "learning_rate": 1.461402832748135e-05, + "loss": 0.2673, + "step": 873 + }, + { + "epoch": 0.4132387706855792, + "grad_norm": 4.128648281097412, + "learning_rate": 1.4599375741194069e-05, + "loss": 0.1686, + "step": 874 + }, + { + "epoch": 0.41371158392434987, + "grad_norm": 5.588024616241455, + "learning_rate": 1.4584710620004284e-05, + "loss": 0.2412, + "step": 875 + }, + { + "epoch": 0.41418439716312055, + "grad_norm": 5.182522296905518, + "learning_rate": 1.4570033003879556e-05, + "loss": 0.2453, + "step": 876 + }, + { + "epoch": 0.41465721040189124, + "grad_norm": 4.976614475250244, + "learning_rate": 1.4555342932821517e-05, + "loss": 0.2493, + "step": 877 + }, + { + "epoch": 0.4151300236406619, + "grad_norm": 6.306532859802246, + "learning_rate": 1.4540640446865723e-05, + "loss": 0.2481, + "step": 878 + }, + { + "epoch": 0.4156028368794326, + "grad_norm": 4.86607027053833, + "learning_rate": 1.4525925586081584e-05, + "loss": 0.1933, + "step": 879 + }, + { + "epoch": 0.4160756501182033, + "grad_norm": 4.547597885131836, + "learning_rate": 1.4511198390572219e-05, + "loss": 0.189, + "step": 880 + }, + { + "epoch": 0.4160756501182033, + "eval_accuracy": 0.8503325942350333, + "eval_f1": 0.6666666666666666, + "eval_loss": 0.32481706142425537, + "eval_precision": 0.8881578947368421, + "eval_recall": 0.5335968379446641, + "eval_runtime": 48.5726, + "eval_samples_per_second": 5.682, + "eval_steps_per_second": 0.185, + "step": 880 + }, + { + "epoch": 0.416548463356974, + "grad_norm": 6.4413886070251465, + "learning_rate": 1.4496458900474371e-05, + "loss": 0.2284, + "step": 881 + }, + { + "epoch": 0.41702127659574467, + "grad_norm": 3.7408576011657715, + "learning_rate": 1.4481707155958291e-05, + "loss": 0.1963, + "step": 882 + }, + { + "epoch": 0.41749408983451536, + "grad_norm": 5.2726664543151855, + "learning_rate": 1.446694319722763e-05, + "loss": 0.2463, + "step": 883 + }, + { + "epoch": 0.41796690307328604, + "grad_norm": 4.192355155944824, + "learning_rate": 1.4452167064519316e-05, + "loss": 0.2065, + "step": 884 + }, + { + "epoch": 0.41843971631205673, + "grad_norm": 7.111584663391113, + "learning_rate": 1.4437378798103467e-05, + "loss": 0.3013, + "step": 885 + }, + { + "epoch": 0.4189125295508274, + "grad_norm": 7.128089427947998, + "learning_rate": 1.4422578438283263e-05, + "loss": 0.2477, + "step": 886 + }, + { + "epoch": 0.4193853427895981, + "grad_norm": 6.053483486175537, + "learning_rate": 1.4407766025394847e-05, + "loss": 0.2003, + "step": 887 + }, + { + "epoch": 0.4198581560283688, + "grad_norm": 6.564062118530273, + "learning_rate": 1.4392941599807206e-05, + "loss": 0.2808, + "step": 888 + }, + { + "epoch": 0.4203309692671395, + "grad_norm": 4.815242290496826, + "learning_rate": 1.4378105201922073e-05, + "loss": 0.1874, + "step": 889 + }, + { + "epoch": 0.42080378250591016, + "grad_norm": 6.174993991851807, + "learning_rate": 1.4363256872173801e-05, + "loss": 0.1918, + "step": 890 + }, + { + "epoch": 0.42127659574468085, + "grad_norm": 7.473939418792725, + "learning_rate": 1.4348396651029261e-05, + "loss": 0.2361, + "step": 891 + }, + { + "epoch": 0.42174940898345153, + "grad_norm": 8.417937278747559, + "learning_rate": 1.4333524578987748e-05, + "loss": 0.4323, + "step": 892 + }, + { + "epoch": 0.4222222222222222, + "grad_norm": 5.9651007652282715, + "learning_rate": 1.4318640696580834e-05, + "loss": 0.3207, + "step": 893 + }, + { + "epoch": 0.4226950354609929, + "grad_norm": 4.948203086853027, + "learning_rate": 1.4303745044372293e-05, + "loss": 0.2782, + "step": 894 + }, + { + "epoch": 0.4231678486997636, + "grad_norm": 9.178805351257324, + "learning_rate": 1.4288837662957969e-05, + "loss": 0.334, + "step": 895 + }, + { + "epoch": 0.4236406619385343, + "grad_norm": 7.537435054779053, + "learning_rate": 1.4273918592965674e-05, + "loss": 0.3307, + "step": 896 + }, + { + "epoch": 0.42411347517730497, + "grad_norm": 5.170799732208252, + "learning_rate": 1.4258987875055077e-05, + "loss": 0.2322, + "step": 897 + }, + { + "epoch": 0.42458628841607565, + "grad_norm": 7.296963214874268, + "learning_rate": 1.4244045549917587e-05, + "loss": 0.292, + "step": 898 + }, + { + "epoch": 0.42505910165484634, + "grad_norm": 5.456043720245361, + "learning_rate": 1.422909165827625e-05, + "loss": 0.2374, + "step": 899 + }, + { + "epoch": 0.425531914893617, + "grad_norm": 4.878541946411133, + "learning_rate": 1.421412624088564e-05, + "loss": 0.299, + "step": 900 + }, + { + "epoch": 0.425531914893617, + "eval_accuracy": 0.852549889135255, + "eval_f1": 0.6825775656324582, + "eval_loss": 0.31735506653785706, + "eval_precision": 0.8614457831325302, + "eval_recall": 0.5652173913043478, + "eval_runtime": 48.299, + "eval_samples_per_second": 5.714, + "eval_steps_per_second": 0.186, + "step": 900 + }, + { + "epoch": 0.4260047281323877, + "grad_norm": 5.303489685058594, + "learning_rate": 1.419914933853173e-05, + "loss": 0.2548, + "step": 901 + }, + { + "epoch": 0.4264775413711584, + "grad_norm": 5.416555404663086, + "learning_rate": 1.4184160992031806e-05, + "loss": 0.249, + "step": 902 + }, + { + "epoch": 0.4269503546099291, + "grad_norm": 5.52853536605835, + "learning_rate": 1.4169161242234335e-05, + "loss": 0.2135, + "step": 903 + }, + { + "epoch": 0.42742316784869977, + "grad_norm": 5.232771396636963, + "learning_rate": 1.4154150130018867e-05, + "loss": 0.2314, + "step": 904 + }, + { + "epoch": 0.42789598108747046, + "grad_norm": 5.249035835266113, + "learning_rate": 1.4139127696295913e-05, + "loss": 0.188, + "step": 905 + }, + { + "epoch": 0.42836879432624114, + "grad_norm": 8.240036010742188, + "learning_rate": 1.4124093982006846e-05, + "loss": 0.2678, + "step": 906 + }, + { + "epoch": 0.42884160756501183, + "grad_norm": 5.175498008728027, + "learning_rate": 1.410904902812378e-05, + "loss": 0.2565, + "step": 907 + }, + { + "epoch": 0.4293144208037825, + "grad_norm": 3.9959726333618164, + "learning_rate": 1.4093992875649456e-05, + "loss": 0.2413, + "step": 908 + }, + { + "epoch": 0.4297872340425532, + "grad_norm": 3.8025238513946533, + "learning_rate": 1.407892556561714e-05, + "loss": 0.1705, + "step": 909 + }, + { + "epoch": 0.4302600472813239, + "grad_norm": 5.208123683929443, + "learning_rate": 1.4063847139090507e-05, + "loss": 0.2492, + "step": 910 + }, + { + "epoch": 0.4307328605200946, + "grad_norm": 4.154348850250244, + "learning_rate": 1.4048757637163529e-05, + "loss": 0.2182, + "step": 911 + }, + { + "epoch": 0.43120567375886526, + "grad_norm": 5.2830939292907715, + "learning_rate": 1.4033657100960356e-05, + "loss": 0.2097, + "step": 912 + }, + { + "epoch": 0.43167848699763595, + "grad_norm": 3.8644347190856934, + "learning_rate": 1.4018545571635209e-05, + "loss": 0.214, + "step": 913 + }, + { + "epoch": 0.43215130023640663, + "grad_norm": 4.06352424621582, + "learning_rate": 1.4003423090372286e-05, + "loss": 0.2284, + "step": 914 + }, + { + "epoch": 0.4326241134751773, + "grad_norm": 6.2407355308532715, + "learning_rate": 1.3988289698385608e-05, + "loss": 0.2216, + "step": 915 + }, + { + "epoch": 0.433096926713948, + "grad_norm": 6.083385467529297, + "learning_rate": 1.3973145436918957e-05, + "loss": 0.268, + "step": 916 + }, + { + "epoch": 0.4335697399527187, + "grad_norm": 7.127196311950684, + "learning_rate": 1.3957990347245717e-05, + "loss": 0.3019, + "step": 917 + }, + { + "epoch": 0.4340425531914894, + "grad_norm": 4.245884418487549, + "learning_rate": 1.3942824470668796e-05, + "loss": 0.2615, + "step": 918 + }, + { + "epoch": 0.43451536643026006, + "grad_norm": 6.33418083190918, + "learning_rate": 1.3927647848520493e-05, + "loss": 0.2592, + "step": 919 + }, + { + "epoch": 0.43498817966903075, + "grad_norm": 6.671105861663818, + "learning_rate": 1.3912460522162396e-05, + "loss": 0.199, + "step": 920 + }, + { + "epoch": 0.43498817966903075, + "eval_accuracy": 0.8392461197339246, + "eval_f1": 0.6253229974160207, + "eval_loss": 0.33865952491760254, + "eval_precision": 0.9029850746268657, + "eval_recall": 0.4782608695652174, + "eval_runtime": 48.0155, + "eval_samples_per_second": 5.748, + "eval_steps_per_second": 0.187, + "step": 920 + }, + { + "epoch": 0.43546099290780144, + "grad_norm": 4.163972854614258, + "learning_rate": 1.3897262532985263e-05, + "loss": 0.184, + "step": 921 + }, + { + "epoch": 0.4359338061465721, + "grad_norm": 8.20583438873291, + "learning_rate": 1.3882053922408915e-05, + "loss": 0.288, + "step": 922 + }, + { + "epoch": 0.4364066193853428, + "grad_norm": 5.573141098022461, + "learning_rate": 1.3866834731882117e-05, + "loss": 0.1807, + "step": 923 + }, + { + "epoch": 0.4368794326241135, + "grad_norm": 9.644611358642578, + "learning_rate": 1.3851605002882472e-05, + "loss": 0.4276, + "step": 924 + }, + { + "epoch": 0.4373522458628842, + "grad_norm": 7.489835739135742, + "learning_rate": 1.38363647769163e-05, + "loss": 0.3496, + "step": 925 + }, + { + "epoch": 0.43782505910165487, + "grad_norm": 4.446575164794922, + "learning_rate": 1.3821114095518529e-05, + "loss": 0.1963, + "step": 926 + }, + { + "epoch": 0.43829787234042555, + "grad_norm": 4.232187271118164, + "learning_rate": 1.3805853000252584e-05, + "loss": 0.2081, + "step": 927 + }, + { + "epoch": 0.43877068557919624, + "grad_norm": 5.939121246337891, + "learning_rate": 1.379058153271027e-05, + "loss": 0.2361, + "step": 928 + }, + { + "epoch": 0.4392434988179669, + "grad_norm": 8.863687515258789, + "learning_rate": 1.3775299734511663e-05, + "loss": 0.341, + "step": 929 + }, + { + "epoch": 0.4397163120567376, + "grad_norm": 6.206582546234131, + "learning_rate": 1.3760007647304987e-05, + "loss": 0.23, + "step": 930 + }, + { + "epoch": 0.4401891252955083, + "grad_norm": 7.478794574737549, + "learning_rate": 1.3744705312766517e-05, + "loss": 0.241, + "step": 931 + }, + { + "epoch": 0.440661938534279, + "grad_norm": 9.208320617675781, + "learning_rate": 1.3729392772600445e-05, + "loss": 0.2495, + "step": 932 + }, + { + "epoch": 0.44113475177304967, + "grad_norm": 5.460510730743408, + "learning_rate": 1.3714070068538785e-05, + "loss": 0.1938, + "step": 933 + }, + { + "epoch": 0.44160756501182036, + "grad_norm": 6.056775093078613, + "learning_rate": 1.3698737242341245e-05, + "loss": 0.3128, + "step": 934 + }, + { + "epoch": 0.44208037825059104, + "grad_norm": 6.535298824310303, + "learning_rate": 1.3683394335795126e-05, + "loss": 0.2466, + "step": 935 + }, + { + "epoch": 0.4425531914893617, + "grad_norm": 6.023354530334473, + "learning_rate": 1.3668041390715195e-05, + "loss": 0.2496, + "step": 936 + }, + { + "epoch": 0.44302600472813236, + "grad_norm": 5.573044300079346, + "learning_rate": 1.365267844894358e-05, + "loss": 0.2324, + "step": 937 + }, + { + "epoch": 0.44349881796690305, + "grad_norm": 7.519514560699463, + "learning_rate": 1.3637305552349656e-05, + "loss": 0.3045, + "step": 938 + }, + { + "epoch": 0.44397163120567373, + "grad_norm": 3.165480852127075, + "learning_rate": 1.3621922742829923e-05, + "loss": 0.1767, + "step": 939 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 5.499924659729004, + "learning_rate": 1.3606530062307902e-05, + "loss": 0.2886, + "step": 940 + }, + { + "epoch": 0.4444444444444444, + "eval_accuracy": 0.8381374722838137, + "eval_f1": 0.6294416243654822, + "eval_loss": 0.33129268884658813, + "eval_precision": 0.8794326241134752, + "eval_recall": 0.4901185770750988, + "eval_runtime": 48.7368, + "eval_samples_per_second": 5.663, + "eval_steps_per_second": 0.185, + "step": 940 + }, + { + "epoch": 0.4449172576832151, + "grad_norm": 4.935895919799805, + "learning_rate": 1.3591127552734018e-05, + "loss": 0.1841, + "step": 941 + }, + { + "epoch": 0.4453900709219858, + "grad_norm": 5.519673824310303, + "learning_rate": 1.3575715256085474e-05, + "loss": 0.2634, + "step": 942 + }, + { + "epoch": 0.4458628841607565, + "grad_norm": 5.731892108917236, + "learning_rate": 1.3560293214366152e-05, + "loss": 0.2458, + "step": 943 + }, + { + "epoch": 0.44633569739952716, + "grad_norm": 6.215859413146973, + "learning_rate": 1.3544861469606495e-05, + "loss": 0.3181, + "step": 944 + }, + { + "epoch": 0.44680851063829785, + "grad_norm": 5.27556848526001, + "learning_rate": 1.352942006386339e-05, + "loss": 0.2382, + "step": 945 + }, + { + "epoch": 0.44728132387706854, + "grad_norm": 5.7444963455200195, + "learning_rate": 1.351396903922005e-05, + "loss": 0.218, + "step": 946 + }, + { + "epoch": 0.4477541371158392, + "grad_norm": 6.035450458526611, + "learning_rate": 1.3498508437785897e-05, + "loss": 0.239, + "step": 947 + }, + { + "epoch": 0.4482269503546099, + "grad_norm": 3.9959163665771484, + "learning_rate": 1.3483038301696473e-05, + "loss": 0.1595, + "step": 948 + }, + { + "epoch": 0.4486997635933806, + "grad_norm": 3.841958999633789, + "learning_rate": 1.3467558673113286e-05, + "loss": 0.1573, + "step": 949 + }, + { + "epoch": 0.4491725768321513, + "grad_norm": 4.796578884124756, + "learning_rate": 1.345206959422372e-05, + "loss": 0.2151, + "step": 950 + }, + { + "epoch": 0.44964539007092197, + "grad_norm": 6.64060640335083, + "learning_rate": 1.3436571107240919e-05, + "loss": 0.2468, + "step": 951 + }, + { + "epoch": 0.45011820330969265, + "grad_norm": 4.541578769683838, + "learning_rate": 1.3421063254403657e-05, + "loss": 0.2113, + "step": 952 + }, + { + "epoch": 0.45059101654846334, + "grad_norm": 6.632504940032959, + "learning_rate": 1.3405546077976249e-05, + "loss": 0.2745, + "step": 953 + }, + { + "epoch": 0.451063829787234, + "grad_norm": 4.1523284912109375, + "learning_rate": 1.3390019620248403e-05, + "loss": 0.1837, + "step": 954 + }, + { + "epoch": 0.4515366430260047, + "grad_norm": 5.9142351150512695, + "learning_rate": 1.3374483923535136e-05, + "loss": 0.2732, + "step": 955 + }, + { + "epoch": 0.4520094562647754, + "grad_norm": 4.184595584869385, + "learning_rate": 1.335893903017663e-05, + "loss": 0.2433, + "step": 956 + }, + { + "epoch": 0.4524822695035461, + "grad_norm": 5.808665752410889, + "learning_rate": 1.334338498253815e-05, + "loss": 0.2497, + "step": 957 + }, + { + "epoch": 0.4529550827423168, + "grad_norm": 4.390594959259033, + "learning_rate": 1.332782182300989e-05, + "loss": 0.2517, + "step": 958 + }, + { + "epoch": 0.45342789598108746, + "grad_norm": 4.8440165519714355, + "learning_rate": 1.3312249594006893e-05, + "loss": 0.2194, + "step": 959 + }, + { + "epoch": 0.45390070921985815, + "grad_norm": 7.286986827850342, + "learning_rate": 1.3296668337968904e-05, + "loss": 0.2641, + "step": 960 + }, + { + "epoch": 0.45390070921985815, + "eval_accuracy": 0.8636363636363636, + "eval_f1": 0.7159353348729792, + "eval_loss": 0.3095405697822571, + "eval_precision": 0.8611111111111112, + "eval_recall": 0.6126482213438735, + "eval_runtime": 48.7277, + "eval_samples_per_second": 5.664, + "eval_steps_per_second": 0.185, + "step": 960 + }, + { + "epoch": 0.45437352245862883, + "grad_norm": 5.716742992401123, + "learning_rate": 1.3281078097360287e-05, + "loss": 0.1991, + "step": 961 + }, + { + "epoch": 0.4548463356973995, + "grad_norm": 6.432254791259766, + "learning_rate": 1.3265478914669878e-05, + "loss": 0.2438, + "step": 962 + }, + { + "epoch": 0.4553191489361702, + "grad_norm": 6.0999250411987305, + "learning_rate": 1.3249870832410886e-05, + "loss": 0.2145, + "step": 963 + }, + { + "epoch": 0.4557919621749409, + "grad_norm": 6.828171730041504, + "learning_rate": 1.323425389312079e-05, + "loss": 0.2465, + "step": 964 + }, + { + "epoch": 0.4562647754137116, + "grad_norm": 5.544858455657959, + "learning_rate": 1.3218628139361178e-05, + "loss": 0.1922, + "step": 965 + }, + { + "epoch": 0.45673758865248226, + "grad_norm": 5.448679447174072, + "learning_rate": 1.3202993613717688e-05, + "loss": 0.2383, + "step": 966 + }, + { + "epoch": 0.45721040189125295, + "grad_norm": 4.943000793457031, + "learning_rate": 1.3187350358799846e-05, + "loss": 0.2142, + "step": 967 + }, + { + "epoch": 0.45768321513002364, + "grad_norm": 4.455641746520996, + "learning_rate": 1.3171698417240984e-05, + "loss": 0.2669, + "step": 968 + }, + { + "epoch": 0.4581560283687943, + "grad_norm": 6.673210144042969, + "learning_rate": 1.3156037831698094e-05, + "loss": 0.2913, + "step": 969 + }, + { + "epoch": 0.458628841607565, + "grad_norm": 4.279630661010742, + "learning_rate": 1.3140368644851735e-05, + "loss": 0.1963, + "step": 970 + }, + { + "epoch": 0.4591016548463357, + "grad_norm": 6.872097969055176, + "learning_rate": 1.3124690899405903e-05, + "loss": 0.3115, + "step": 971 + }, + { + "epoch": 0.4595744680851064, + "grad_norm": 8.577292442321777, + "learning_rate": 1.3109004638087919e-05, + "loss": 0.2241, + "step": 972 + }, + { + "epoch": 0.46004728132387707, + "grad_norm": 6.13325834274292, + "learning_rate": 1.3093309903648316e-05, + "loss": 0.2008, + "step": 973 + }, + { + "epoch": 0.46052009456264775, + "grad_norm": 7.323633193969727, + "learning_rate": 1.3077606738860719e-05, + "loss": 0.2284, + "step": 974 + }, + { + "epoch": 0.46099290780141844, + "grad_norm": 5.5575642585754395, + "learning_rate": 1.3061895186521724e-05, + "loss": 0.2657, + "step": 975 + }, + { + "epoch": 0.4614657210401891, + "grad_norm": 6.12337064743042, + "learning_rate": 1.304617528945079e-05, + "loss": 0.2165, + "step": 976 + }, + { + "epoch": 0.4619385342789598, + "grad_norm": 6.96366024017334, + "learning_rate": 1.3030447090490117e-05, + "loss": 0.321, + "step": 977 + }, + { + "epoch": 0.4624113475177305, + "grad_norm": 6.608788967132568, + "learning_rate": 1.3014710632504533e-05, + "loss": 0.2851, + "step": 978 + }, + { + "epoch": 0.4628841607565012, + "grad_norm": 7.216396808624268, + "learning_rate": 1.299896595838137e-05, + "loss": 0.3212, + "step": 979 + }, + { + "epoch": 0.46335697399527187, + "grad_norm": 5.307373523712158, + "learning_rate": 1.2983213111030355e-05, + "loss": 0.2316, + "step": 980 + }, + { + "epoch": 0.46335697399527187, + "eval_accuracy": 0.8603104212860311, + "eval_f1": 0.71875, + "eval_loss": 0.3029595613479614, + "eval_precision": 0.8256410256410256, + "eval_recall": 0.6363636363636364, + "eval_runtime": 48.9893, + "eval_samples_per_second": 5.634, + "eval_steps_per_second": 0.184, + "step": 980 + }, + { + "epoch": 0.46382978723404256, + "grad_norm": 7.238687992095947, + "learning_rate": 1.2967452133383494e-05, + "loss": 0.3382, + "step": 981 + }, + { + "epoch": 0.46430260047281324, + "grad_norm": 5.481350421905518, + "learning_rate": 1.2951683068394941e-05, + "loss": 0.2074, + "step": 982 + }, + { + "epoch": 0.46477541371158393, + "grad_norm": 6.286655426025391, + "learning_rate": 1.2935905959040898e-05, + "loss": 0.2536, + "step": 983 + }, + { + "epoch": 0.4652482269503546, + "grad_norm": 7.73200798034668, + "learning_rate": 1.2920120848319483e-05, + "loss": 0.2815, + "step": 984 + }, + { + "epoch": 0.4657210401891253, + "grad_norm": 5.538710117340088, + "learning_rate": 1.2904327779250638e-05, + "loss": 0.2503, + "step": 985 + }, + { + "epoch": 0.466193853427896, + "grad_norm": 8.157992362976074, + "learning_rate": 1.2888526794875975e-05, + "loss": 0.2675, + "step": 986 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 4.824194431304932, + "learning_rate": 1.2872717938258688e-05, + "loss": 0.2185, + "step": 987 + }, + { + "epoch": 0.46713947990543736, + "grad_norm": 3.831620931625366, + "learning_rate": 1.285690125248342e-05, + "loss": 0.2046, + "step": 988 + }, + { + "epoch": 0.46761229314420805, + "grad_norm": 5.231266498565674, + "learning_rate": 1.2841076780656155e-05, + "loss": 0.2472, + "step": 989 + }, + { + "epoch": 0.46808510638297873, + "grad_norm": 6.9529194831848145, + "learning_rate": 1.28252445659041e-05, + "loss": 0.2855, + "step": 990 + }, + { + "epoch": 0.4685579196217494, + "grad_norm": 6.860682964324951, + "learning_rate": 1.2809404651375554e-05, + "loss": 0.2526, + "step": 991 + }, + { + "epoch": 0.4690307328605201, + "grad_norm": 6.531607627868652, + "learning_rate": 1.2793557080239819e-05, + "loss": 0.266, + "step": 992 + }, + { + "epoch": 0.4695035460992908, + "grad_norm": 4.6222758293151855, + "learning_rate": 1.2777701895687034e-05, + "loss": 0.2346, + "step": 993 + }, + { + "epoch": 0.4699763593380615, + "grad_norm": 5.676296710968018, + "learning_rate": 1.2761839140928119e-05, + "loss": 0.3332, + "step": 994 + }, + { + "epoch": 0.47044917257683216, + "grad_norm": 8.922492027282715, + "learning_rate": 1.2745968859194604e-05, + "loss": 0.2986, + "step": 995 + }, + { + "epoch": 0.47092198581560285, + "grad_norm": 3.270632266998291, + "learning_rate": 1.2730091093738545e-05, + "loss": 0.122, + "step": 996 + }, + { + "epoch": 0.47139479905437354, + "grad_norm": 4.889394283294678, + "learning_rate": 1.2714205887832388e-05, + "loss": 0.2348, + "step": 997 + }, + { + "epoch": 0.4718676122931442, + "grad_norm": 6.802956581115723, + "learning_rate": 1.2698313284768852e-05, + "loss": 0.2074, + "step": 998 + }, + { + "epoch": 0.4723404255319149, + "grad_norm": 5.15386962890625, + "learning_rate": 1.2682413327860827e-05, + "loss": 0.2129, + "step": 999 + }, + { + "epoch": 0.4728132387706856, + "grad_norm": 4.577718257904053, + "learning_rate": 1.2666506060441237e-05, + "loss": 0.2116, + "step": 1000 + }, + { + "epoch": 0.4728132387706856, + "eval_accuracy": 0.8580931263858093, + "eval_f1": 0.7009345794392523, + "eval_loss": 0.3230363726615906, + "eval_precision": 0.8571428571428571, + "eval_recall": 0.5928853754940712, + "eval_runtime": 48.9793, + "eval_samples_per_second": 5.635, + "eval_steps_per_second": 0.184, + "step": 1000 + }, + { + "epoch": 0.4732860520094563, + "grad_norm": 6.338871479034424, + "learning_rate": 1.2650591525862934e-05, + "loss": 0.2665, + "step": 1001 + }, + { + "epoch": 0.47375886524822697, + "grad_norm": 6.019141674041748, + "learning_rate": 1.2634669767498573e-05, + "loss": 0.2079, + "step": 1002 + }, + { + "epoch": 0.47423167848699765, + "grad_norm": 4.347167015075684, + "learning_rate": 1.2618740828740494e-05, + "loss": 0.1908, + "step": 1003 + }, + { + "epoch": 0.47470449172576834, + "grad_norm": 6.210932731628418, + "learning_rate": 1.2602804753000611e-05, + "loss": 0.1847, + "step": 1004 + }, + { + "epoch": 0.475177304964539, + "grad_norm": 5.755384922027588, + "learning_rate": 1.2586861583710289e-05, + "loss": 0.2592, + "step": 1005 + }, + { + "epoch": 0.4756501182033097, + "grad_norm": 7.43326473236084, + "learning_rate": 1.2570911364320218e-05, + "loss": 0.2216, + "step": 1006 + }, + { + "epoch": 0.4761229314420804, + "grad_norm": 6.543978214263916, + "learning_rate": 1.2554954138300307e-05, + "loss": 0.2118, + "step": 1007 + }, + { + "epoch": 0.4765957446808511, + "grad_norm": 4.375254154205322, + "learning_rate": 1.2538989949139567e-05, + "loss": 0.1908, + "step": 1008 + }, + { + "epoch": 0.47706855791962177, + "grad_norm": 6.078047275543213, + "learning_rate": 1.2523018840345972e-05, + "loss": 0.2619, + "step": 1009 + }, + { + "epoch": 0.47754137115839246, + "grad_norm": 4.737030506134033, + "learning_rate": 1.2507040855446371e-05, + "loss": 0.1731, + "step": 1010 + }, + { + "epoch": 0.47801418439716314, + "grad_norm": 5.818294525146484, + "learning_rate": 1.2491056037986334e-05, + "loss": 0.2438, + "step": 1011 + }, + { + "epoch": 0.47848699763593383, + "grad_norm": 6.881172180175781, + "learning_rate": 1.2475064431530066e-05, + "loss": 0.2313, + "step": 1012 + }, + { + "epoch": 0.4789598108747045, + "grad_norm": 5.162444591522217, + "learning_rate": 1.245906607966027e-05, + "loss": 0.2579, + "step": 1013 + }, + { + "epoch": 0.4794326241134752, + "grad_norm": 8.821483612060547, + "learning_rate": 1.2443061025978034e-05, + "loss": 0.3318, + "step": 1014 + }, + { + "epoch": 0.4799054373522459, + "grad_norm": 7.8657684326171875, + "learning_rate": 1.2427049314102708e-05, + "loss": 0.2404, + "step": 1015 + }, + { + "epoch": 0.4803782505910166, + "grad_norm": 5.313066482543945, + "learning_rate": 1.2411030987671791e-05, + "loss": 0.1851, + "step": 1016 + }, + { + "epoch": 0.4808510638297872, + "grad_norm": 6.415999412536621, + "learning_rate": 1.2395006090340804e-05, + "loss": 0.2219, + "step": 1017 + }, + { + "epoch": 0.4813238770685579, + "grad_norm": 6.840671539306641, + "learning_rate": 1.2378974665783184e-05, + "loss": 0.3221, + "step": 1018 + }, + { + "epoch": 0.4817966903073286, + "grad_norm": 7.59630823135376, + "learning_rate": 1.236293675769015e-05, + "loss": 0.3566, + "step": 1019 + }, + { + "epoch": 0.48226950354609927, + "grad_norm": 5.026065349578857, + "learning_rate": 1.2346892409770594e-05, + "loss": 0.2134, + "step": 1020 + }, + { + "epoch": 0.48226950354609927, + "eval_accuracy": 0.8625277161862528, + "eval_f1": 0.7327586206896551, + "eval_loss": 0.3039931058883667, + "eval_precision": 0.8056872037914692, + "eval_recall": 0.6719367588932806, + "eval_runtime": 49.0789, + "eval_samples_per_second": 5.624, + "eval_steps_per_second": 0.183, + "step": 1020 + }, + { + "epoch": 0.48274231678486995, + "grad_norm": 5.331277370452881, + "learning_rate": 1.2330841665750954e-05, + "loss": 0.2262, + "step": 1021 + }, + { + "epoch": 0.48321513002364064, + "grad_norm": 8.84730052947998, + "learning_rate": 1.2314784569375114e-05, + "loss": 0.3046, + "step": 1022 + }, + { + "epoch": 0.4836879432624113, + "grad_norm": 6.711941719055176, + "learning_rate": 1.2298721164404249e-05, + "loss": 0.3397, + "step": 1023 + }, + { + "epoch": 0.484160756501182, + "grad_norm": 5.642043113708496, + "learning_rate": 1.2282651494616742e-05, + "loss": 0.2586, + "step": 1024 + }, + { + "epoch": 0.4846335697399527, + "grad_norm": 5.215063571929932, + "learning_rate": 1.226657560380805e-05, + "loss": 0.2402, + "step": 1025 + }, + { + "epoch": 0.4851063829787234, + "grad_norm": 7.698766231536865, + "learning_rate": 1.2250493535790574e-05, + "loss": 0.304, + "step": 1026 + }, + { + "epoch": 0.48557919621749407, + "grad_norm": 5.472048282623291, + "learning_rate": 1.223440533439356e-05, + "loss": 0.229, + "step": 1027 + }, + { + "epoch": 0.48605200945626476, + "grad_norm": 4.721035480499268, + "learning_rate": 1.2218311043462964e-05, + "loss": 0.1878, + "step": 1028 + }, + { + "epoch": 0.48652482269503544, + "grad_norm": 5.494324207305908, + "learning_rate": 1.2202210706861346e-05, + "loss": 0.2146, + "step": 1029 + }, + { + "epoch": 0.48699763593380613, + "grad_norm": 5.183449745178223, + "learning_rate": 1.218610436846773e-05, + "loss": 0.2175, + "step": 1030 + }, + { + "epoch": 0.4874704491725768, + "grad_norm": 8.610817909240723, + "learning_rate": 1.216999207217751e-05, + "loss": 0.3331, + "step": 1031 + }, + { + "epoch": 0.4879432624113475, + "grad_norm": 6.789135456085205, + "learning_rate": 1.21538738619023e-05, + "loss": 0.3608, + "step": 1032 + }, + { + "epoch": 0.4884160756501182, + "grad_norm": 3.8762876987457275, + "learning_rate": 1.2137749781569857e-05, + "loss": 0.2002, + "step": 1033 + }, + { + "epoch": 0.4888888888888889, + "grad_norm": 5.960103511810303, + "learning_rate": 1.2121619875123914e-05, + "loss": 0.2497, + "step": 1034 + }, + { + "epoch": 0.48936170212765956, + "grad_norm": 7.955074787139893, + "learning_rate": 1.2105484186524088e-05, + "loss": 0.3593, + "step": 1035 + }, + { + "epoch": 0.48983451536643025, + "grad_norm": 4.501315116882324, + "learning_rate": 1.2089342759745761e-05, + "loss": 0.2412, + "step": 1036 + }, + { + "epoch": 0.49030732860520093, + "grad_norm": 4.577963352203369, + "learning_rate": 1.2073195638779944e-05, + "loss": 0.2328, + "step": 1037 + }, + { + "epoch": 0.4907801418439716, + "grad_norm": 18.703994750976562, + "learning_rate": 1.2057042867633178e-05, + "loss": 0.2931, + "step": 1038 + }, + { + "epoch": 0.4912529550827423, + "grad_norm": 4.856638431549072, + "learning_rate": 1.2040884490327391e-05, + "loss": 0.2607, + "step": 1039 + }, + { + "epoch": 0.491725768321513, + "grad_norm": 5.463403701782227, + "learning_rate": 1.2024720550899798e-05, + "loss": 0.2139, + "step": 1040 + }, + { + "epoch": 0.491725768321513, + "eval_accuracy": 0.844789356984479, + "eval_f1": 0.6446700507614214, + "eval_loss": 0.3279857337474823, + "eval_precision": 0.900709219858156, + "eval_recall": 0.5019762845849802, + "eval_runtime": 48.0379, + "eval_samples_per_second": 5.745, + "eval_steps_per_second": 0.187, + "step": 1040 + }, + { + "epoch": 0.4921985815602837, + "grad_norm": 6.326286315917969, + "learning_rate": 1.2008551093402763e-05, + "loss": 0.264, + "step": 1041 + }, + { + "epoch": 0.49267139479905436, + "grad_norm": 5.787569046020508, + "learning_rate": 1.1992376161903705e-05, + "loss": 0.228, + "step": 1042 + }, + { + "epoch": 0.49314420803782505, + "grad_norm": 6.124124526977539, + "learning_rate": 1.1976195800484945e-05, + "loss": 0.1668, + "step": 1043 + }, + { + "epoch": 0.49361702127659574, + "grad_norm": 5.056814670562744, + "learning_rate": 1.1960010053243613e-05, + "loss": 0.1894, + "step": 1044 + }, + { + "epoch": 0.4940898345153664, + "grad_norm": 7.828837871551514, + "learning_rate": 1.194381896429151e-05, + "loss": 0.3602, + "step": 1045 + }, + { + "epoch": 0.4945626477541371, + "grad_norm": 4.001469135284424, + "learning_rate": 1.1927622577755003e-05, + "loss": 0.1379, + "step": 1046 + }, + { + "epoch": 0.4950354609929078, + "grad_norm": 7.635477542877197, + "learning_rate": 1.191142093777489e-05, + "loss": 0.244, + "step": 1047 + }, + { + "epoch": 0.4955082742316785, + "grad_norm": 7.2881364822387695, + "learning_rate": 1.1895214088506284e-05, + "loss": 0.3006, + "step": 1048 + }, + { + "epoch": 0.49598108747044917, + "grad_norm": 5.0428619384765625, + "learning_rate": 1.1879002074118512e-05, + "loss": 0.2994, + "step": 1049 + }, + { + "epoch": 0.49645390070921985, + "grad_norm": 5.892991542816162, + "learning_rate": 1.1862784938794951e-05, + "loss": 0.229, + "step": 1050 + }, + { + "epoch": 0.49692671394799054, + "grad_norm": 6.7257304191589355, + "learning_rate": 1.184656272673296e-05, + "loss": 0.3032, + "step": 1051 + }, + { + "epoch": 0.4973995271867612, + "grad_norm": 6.495220184326172, + "learning_rate": 1.1830335482143718e-05, + "loss": 0.2918, + "step": 1052 + }, + { + "epoch": 0.4978723404255319, + "grad_norm": 4.424355983734131, + "learning_rate": 1.1814103249252124e-05, + "loss": 0.2097, + "step": 1053 + }, + { + "epoch": 0.4983451536643026, + "grad_norm": 4.175996780395508, + "learning_rate": 1.1797866072296676e-05, + "loss": 0.1882, + "step": 1054 + }, + { + "epoch": 0.4988179669030733, + "grad_norm": 5.1931328773498535, + "learning_rate": 1.1781623995529341e-05, + "loss": 0.2526, + "step": 1055 + }, + { + "epoch": 0.49929078014184397, + "grad_norm": 5.245265960693359, + "learning_rate": 1.1765377063215436e-05, + "loss": 0.204, + "step": 1056 + }, + { + "epoch": 0.49976359338061466, + "grad_norm": 4.931206226348877, + "learning_rate": 1.1749125319633523e-05, + "loss": 0.2239, + "step": 1057 + }, + { + "epoch": 0.5002364066193853, + "grad_norm": 4.764687538146973, + "learning_rate": 1.1732868809075266e-05, + "loss": 0.2257, + "step": 1058 + }, + { + "epoch": 0.500709219858156, + "grad_norm": 6.108907222747803, + "learning_rate": 1.1716607575845327e-05, + "loss": 0.271, + "step": 1059 + }, + { + "epoch": 0.5011820330969267, + "grad_norm": 5.150505065917969, + "learning_rate": 1.1700341664261233e-05, + "loss": 0.1949, + "step": 1060 + }, + { + "epoch": 0.5011820330969267, + "eval_accuracy": 0.8625277161862528, + "eval_f1": 0.7089201877934272, + "eval_loss": 0.31160375475883484, + "eval_precision": 0.8728323699421965, + "eval_recall": 0.5968379446640316, + "eval_runtime": 48.5889, + "eval_samples_per_second": 5.68, + "eval_steps_per_second": 0.185, + "step": 1060 + }, + { + "epoch": 0.5016548463356973, + "grad_norm": 5.739040851593018, + "learning_rate": 1.1684071118653262e-05, + "loss": 0.238, + "step": 1061 + }, + { + "epoch": 0.502127659574468, + "grad_norm": 4.2889862060546875, + "learning_rate": 1.1667795983364332e-05, + "loss": 0.1881, + "step": 1062 + }, + { + "epoch": 0.5026004728132387, + "grad_norm": 6.57220458984375, + "learning_rate": 1.1651516302749854e-05, + "loss": 0.3294, + "step": 1063 + }, + { + "epoch": 0.5030732860520094, + "grad_norm": 6.348330497741699, + "learning_rate": 1.1635232121177637e-05, + "loss": 0.2125, + "step": 1064 + }, + { + "epoch": 0.5035460992907801, + "grad_norm": 4.948648452758789, + "learning_rate": 1.1618943483027749e-05, + "loss": 0.2409, + "step": 1065 + }, + { + "epoch": 0.5040189125295508, + "grad_norm": 5.066147804260254, + "learning_rate": 1.1602650432692417e-05, + "loss": 0.317, + "step": 1066 + }, + { + "epoch": 0.5044917257683215, + "grad_norm": 5.450560092926025, + "learning_rate": 1.1586353014575875e-05, + "loss": 0.2262, + "step": 1067 + }, + { + "epoch": 0.5049645390070922, + "grad_norm": 4.708855152130127, + "learning_rate": 1.1570051273094277e-05, + "loss": 0.2362, + "step": 1068 + }, + { + "epoch": 0.5054373522458628, + "grad_norm": 10.365764617919922, + "learning_rate": 1.1553745252675541e-05, + "loss": 0.3124, + "step": 1069 + }, + { + "epoch": 0.5059101654846335, + "grad_norm": 3.386537790298462, + "learning_rate": 1.153743499775927e-05, + "loss": 0.121, + "step": 1070 + }, + { + "epoch": 0.5063829787234042, + "grad_norm": 6.828433990478516, + "learning_rate": 1.152112055279659e-05, + "loss": 0.2726, + "step": 1071 + }, + { + "epoch": 0.5068557919621749, + "grad_norm": 5.744606971740723, + "learning_rate": 1.1504801962250055e-05, + "loss": 0.2195, + "step": 1072 + }, + { + "epoch": 0.5073286052009456, + "grad_norm": 4.951056480407715, + "learning_rate": 1.1488479270593507e-05, + "loss": 0.2528, + "step": 1073 + }, + { + "epoch": 0.5078014184397163, + "grad_norm": 5.981780529022217, + "learning_rate": 1.1472152522311974e-05, + "loss": 0.2478, + "step": 1074 + }, + { + "epoch": 0.508274231678487, + "grad_norm": 3.798823118209839, + "learning_rate": 1.1455821761901544e-05, + "loss": 0.2075, + "step": 1075 + }, + { + "epoch": 0.5087470449172576, + "grad_norm": 6.254341125488281, + "learning_rate": 1.1439487033869226e-05, + "loss": 0.301, + "step": 1076 + }, + { + "epoch": 0.5092198581560283, + "grad_norm": 6.011257648468018, + "learning_rate": 1.1423148382732854e-05, + "loss": 0.257, + "step": 1077 + }, + { + "epoch": 0.509692671394799, + "grad_norm": 5.492880344390869, + "learning_rate": 1.1406805853020944e-05, + "loss": 0.2582, + "step": 1078 + }, + { + "epoch": 0.5101654846335697, + "grad_norm": 4.94982385635376, + "learning_rate": 1.139045948927259e-05, + "loss": 0.2408, + "step": 1079 + }, + { + "epoch": 0.5106382978723404, + "grad_norm": 4.847739219665527, + "learning_rate": 1.1374109336037331e-05, + "loss": 0.2255, + "step": 1080 + }, + { + "epoch": 0.5106382978723404, + "eval_accuracy": 0.8592017738359202, + "eval_f1": 0.7011764705882353, + "eval_loss": 0.31947237253189087, + "eval_precision": 0.8662790697674418, + "eval_recall": 0.5889328063241107, + "eval_runtime": 48.5383, + "eval_samples_per_second": 5.686, + "eval_steps_per_second": 0.185, + "step": 1080 + }, + { + "epoch": 0.5111111111111111, + "grad_norm": 5.068375587463379, + "learning_rate": 1.135775543787504e-05, + "loss": 0.2263, + "step": 1081 + }, + { + "epoch": 0.5115839243498818, + "grad_norm": 4.222960472106934, + "learning_rate": 1.1341397839355786e-05, + "loss": 0.2395, + "step": 1082 + }, + { + "epoch": 0.5120567375886524, + "grad_norm": 4.976379871368408, + "learning_rate": 1.1325036585059732e-05, + "loss": 0.2215, + "step": 1083 + }, + { + "epoch": 0.5125295508274231, + "grad_norm": 5.738669395446777, + "learning_rate": 1.1308671719576997e-05, + "loss": 0.22, + "step": 1084 + }, + { + "epoch": 0.5130023640661938, + "grad_norm": 7.325643539428711, + "learning_rate": 1.1292303287507546e-05, + "loss": 0.2674, + "step": 1085 + }, + { + "epoch": 0.5134751773049645, + "grad_norm": 7.6121320724487305, + "learning_rate": 1.1275931333461065e-05, + "loss": 0.3137, + "step": 1086 + }, + { + "epoch": 0.5139479905437352, + "grad_norm": 4.593527793884277, + "learning_rate": 1.1259555902056838e-05, + "loss": 0.2287, + "step": 1087 + }, + { + "epoch": 0.5144208037825059, + "grad_norm": 5.258584499359131, + "learning_rate": 1.1243177037923623e-05, + "loss": 0.2492, + "step": 1088 + }, + { + "epoch": 0.5148936170212766, + "grad_norm": 6.010392189025879, + "learning_rate": 1.1226794785699531e-05, + "loss": 0.2251, + "step": 1089 + }, + { + "epoch": 0.5153664302600472, + "grad_norm": 7.567986488342285, + "learning_rate": 1.121040919003192e-05, + "loss": 0.3043, + "step": 1090 + }, + { + "epoch": 0.5158392434988179, + "grad_norm": 5.762569427490234, + "learning_rate": 1.1194020295577246e-05, + "loss": 0.2001, + "step": 1091 + }, + { + "epoch": 0.5163120567375886, + "grad_norm": 8.211880683898926, + "learning_rate": 1.1177628147000961e-05, + "loss": 0.2575, + "step": 1092 + }, + { + "epoch": 0.5167848699763593, + "grad_norm": 4.067295074462891, + "learning_rate": 1.1161232788977385e-05, + "loss": 0.1754, + "step": 1093 + }, + { + "epoch": 0.51725768321513, + "grad_norm": 5.805228233337402, + "learning_rate": 1.1144834266189585e-05, + "loss": 0.2111, + "step": 1094 + }, + { + "epoch": 0.5177304964539007, + "grad_norm": 4.621476650238037, + "learning_rate": 1.1128432623329256e-05, + "loss": 0.1363, + "step": 1095 + }, + { + "epoch": 0.5182033096926714, + "grad_norm": 7.750375747680664, + "learning_rate": 1.111202790509659e-05, + "loss": 0.2732, + "step": 1096 + }, + { + "epoch": 0.518676122931442, + "grad_norm": 4.444814682006836, + "learning_rate": 1.1095620156200166e-05, + "loss": 0.2107, + "step": 1097 + }, + { + "epoch": 0.5191489361702127, + "grad_norm": 12.939567565917969, + "learning_rate": 1.1079209421356816e-05, + "loss": 0.2695, + "step": 1098 + }, + { + "epoch": 0.5196217494089834, + "grad_norm": 7.756330966949463, + "learning_rate": 1.1062795745291519e-05, + "loss": 0.2247, + "step": 1099 + }, + { + "epoch": 0.5200945626477541, + "grad_norm": 6.402958393096924, + "learning_rate": 1.1046379172737264e-05, + "loss": 0.2452, + "step": 1100 + }, + { + "epoch": 0.5200945626477541, + "eval_accuracy": 0.8425720620842572, + "eval_f1": 0.6395939086294417, + "eval_loss": 0.34635570645332336, + "eval_precision": 0.8936170212765957, + "eval_recall": 0.4980237154150198, + "eval_runtime": 47.8847, + "eval_samples_per_second": 5.764, + "eval_steps_per_second": 0.188, + "step": 1100 + }, + { + "epoch": 0.5205673758865248, + "grad_norm": 7.110340118408203, + "learning_rate": 1.1029959748434935e-05, + "loss": 0.2357, + "step": 1101 + }, + { + "epoch": 0.5210401891252955, + "grad_norm": 6.949429512023926, + "learning_rate": 1.1013537517133184e-05, + "loss": 0.3259, + "step": 1102 + }, + { + "epoch": 0.5215130023640662, + "grad_norm": 5.027368068695068, + "learning_rate": 1.0997112523588322e-05, + "loss": 0.1423, + "step": 1103 + }, + { + "epoch": 0.5219858156028369, + "grad_norm": 6.545793056488037, + "learning_rate": 1.0980684812564183e-05, + "loss": 0.1863, + "step": 1104 + }, + { + "epoch": 0.5224586288416075, + "grad_norm": 5.906529903411865, + "learning_rate": 1.0964254428832007e-05, + "loss": 0.2981, + "step": 1105 + }, + { + "epoch": 0.5229314420803782, + "grad_norm": 4.966193675994873, + "learning_rate": 1.0947821417170313e-05, + "loss": 0.2378, + "step": 1106 + }, + { + "epoch": 0.5234042553191489, + "grad_norm": 5.323748588562012, + "learning_rate": 1.0931385822364796e-05, + "loss": 0.2183, + "step": 1107 + }, + { + "epoch": 0.5238770685579196, + "grad_norm": 7.892477035522461, + "learning_rate": 1.0914947689208171e-05, + "loss": 0.3732, + "step": 1108 + }, + { + "epoch": 0.5243498817966903, + "grad_norm": 4.786356449127197, + "learning_rate": 1.0898507062500095e-05, + "loss": 0.2391, + "step": 1109 + }, + { + "epoch": 0.524822695035461, + "grad_norm": 6.325803279876709, + "learning_rate": 1.0882063987047e-05, + "loss": 0.2397, + "step": 1110 + }, + { + "epoch": 0.5252955082742317, + "grad_norm": 6.990598201751709, + "learning_rate": 1.0865618507662001e-05, + "loss": 0.2782, + "step": 1111 + }, + { + "epoch": 0.5257683215130023, + "grad_norm": 8.039189338684082, + "learning_rate": 1.0849170669164764e-05, + "loss": 0.262, + "step": 1112 + }, + { + "epoch": 0.526241134751773, + "grad_norm": 6.111503601074219, + "learning_rate": 1.0832720516381382e-05, + "loss": 0.2201, + "step": 1113 + }, + { + "epoch": 0.5267139479905437, + "grad_norm": 6.7883124351501465, + "learning_rate": 1.0816268094144257e-05, + "loss": 0.2615, + "step": 1114 + }, + { + "epoch": 0.5271867612293144, + "grad_norm": 6.257448673248291, + "learning_rate": 1.0799813447291979e-05, + "loss": 0.232, + "step": 1115 + }, + { + "epoch": 0.5276595744680851, + "grad_norm": 8.060059547424316, + "learning_rate": 1.0783356620669195e-05, + "loss": 0.2726, + "step": 1116 + }, + { + "epoch": 0.5281323877068558, + "grad_norm": 4.382721424102783, + "learning_rate": 1.0766897659126491e-05, + "loss": 0.2114, + "step": 1117 + }, + { + "epoch": 0.5286052009456265, + "grad_norm": 5.4973859786987305, + "learning_rate": 1.0750436607520287e-05, + "loss": 0.2706, + "step": 1118 + }, + { + "epoch": 0.5290780141843971, + "grad_norm": 8.051422119140625, + "learning_rate": 1.0733973510712682e-05, + "loss": 0.2354, + "step": 1119 + }, + { + "epoch": 0.5295508274231678, + "grad_norm": 3.799506664276123, + "learning_rate": 1.0717508413571349e-05, + "loss": 0.2038, + "step": 1120 + }, + { + "epoch": 0.5295508274231678, + "eval_accuracy": 0.8569844789356984, + "eval_f1": 0.6921241050119332, + "eval_loss": 0.31673863530158997, + "eval_precision": 0.8734939759036144, + "eval_recall": 0.5731225296442688, + "eval_runtime": 48.009, + "eval_samples_per_second": 5.749, + "eval_steps_per_second": 0.187, + "step": 1120 + }, + { + "epoch": 0.5300236406619385, + "grad_norm": 7.479004383087158, + "learning_rate": 1.0701041360969428e-05, + "loss": 0.2895, + "step": 1121 + }, + { + "epoch": 0.5304964539007092, + "grad_norm": 4.519740104675293, + "learning_rate": 1.068457239778537e-05, + "loss": 0.2641, + "step": 1122 + }, + { + "epoch": 0.5309692671394799, + "grad_norm": 5.979281425476074, + "learning_rate": 1.0668101568902852e-05, + "loss": 0.2297, + "step": 1123 + }, + { + "epoch": 0.5314420803782506, + "grad_norm": 4.343296051025391, + "learning_rate": 1.0651628919210615e-05, + "loss": 0.1811, + "step": 1124 + }, + { + "epoch": 0.5319148936170213, + "grad_norm": 5.795645713806152, + "learning_rate": 1.063515449360238e-05, + "loss": 0.2214, + "step": 1125 + }, + { + "epoch": 0.532387706855792, + "grad_norm": 4.395986080169678, + "learning_rate": 1.0618678336976695e-05, + "loss": 0.2373, + "step": 1126 + }, + { + "epoch": 0.5328605200945626, + "grad_norm": 4.035050392150879, + "learning_rate": 1.0602200494236837e-05, + "loss": 0.185, + "step": 1127 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 5.2432780265808105, + "learning_rate": 1.0585721010290668e-05, + "loss": 0.201, + "step": 1128 + }, + { + "epoch": 0.533806146572104, + "grad_norm": 6.4242777824401855, + "learning_rate": 1.0569239930050532e-05, + "loss": 0.2681, + "step": 1129 + }, + { + "epoch": 0.5342789598108747, + "grad_norm": 6.040828227996826, + "learning_rate": 1.0552757298433113e-05, + "loss": 0.1799, + "step": 1130 + }, + { + "epoch": 0.5347517730496454, + "grad_norm": 8.409934043884277, + "learning_rate": 1.0536273160359335e-05, + "loss": 0.3153, + "step": 1131 + }, + { + "epoch": 0.5352245862884161, + "grad_norm": 5.49470853805542, + "learning_rate": 1.0519787560754215e-05, + "loss": 0.2344, + "step": 1132 + }, + { + "epoch": 0.5356973995271868, + "grad_norm": 4.8799967765808105, + "learning_rate": 1.050330054454677e-05, + "loss": 0.1693, + "step": 1133 + }, + { + "epoch": 0.5361702127659574, + "grad_norm": 7.70962381362915, + "learning_rate": 1.0486812156669859e-05, + "loss": 0.1999, + "step": 1134 + }, + { + "epoch": 0.5366430260047281, + "grad_norm": 6.684405326843262, + "learning_rate": 1.0470322442060089e-05, + "loss": 0.1878, + "step": 1135 + }, + { + "epoch": 0.5371158392434988, + "grad_norm": 7.799801826477051, + "learning_rate": 1.045383144565768e-05, + "loss": 0.2943, + "step": 1136 + }, + { + "epoch": 0.5375886524822695, + "grad_norm": 9.22608470916748, + "learning_rate": 1.043733921240635e-05, + "loss": 0.3668, + "step": 1137 + }, + { + "epoch": 0.5380614657210402, + "grad_norm": 5.817656517028809, + "learning_rate": 1.0420845787253189e-05, + "loss": 0.2449, + "step": 1138 + }, + { + "epoch": 0.5385342789598109, + "grad_norm": 9.814664840698242, + "learning_rate": 1.0404351215148523e-05, + "loss": 0.3372, + "step": 1139 + }, + { + "epoch": 0.5390070921985816, + "grad_norm": 4.843449592590332, + "learning_rate": 1.0387855541045815e-05, + "loss": 0.2496, + "step": 1140 + }, + { + "epoch": 0.5390070921985816, + "eval_accuracy": 0.8592017738359202, + "eval_f1": 0.6968973747016707, + "eval_loss": 0.31810781359672546, + "eval_precision": 0.8795180722891566, + "eval_recall": 0.5770750988142292, + "eval_runtime": 49.0428, + "eval_samples_per_second": 5.628, + "eval_steps_per_second": 0.184, + "step": 1140 + }, + { + "epoch": 0.5394799054373522, + "grad_norm": 6.5451765060424805, + "learning_rate": 1.0371358809901529e-05, + "loss": 0.266, + "step": 1141 + }, + { + "epoch": 0.5399527186761229, + "grad_norm": 4.096044540405273, + "learning_rate": 1.0354861066675008e-05, + "loss": 0.1938, + "step": 1142 + }, + { + "epoch": 0.5404255319148936, + "grad_norm": 5.981978416442871, + "learning_rate": 1.0338362356328355e-05, + "loss": 0.2721, + "step": 1143 + }, + { + "epoch": 0.5408983451536643, + "grad_norm": 4.459275245666504, + "learning_rate": 1.0321862723826311e-05, + "loss": 0.2085, + "step": 1144 + }, + { + "epoch": 0.541371158392435, + "grad_norm": 4.302639484405518, + "learning_rate": 1.0305362214136122e-05, + "loss": 0.2267, + "step": 1145 + }, + { + "epoch": 0.5418439716312057, + "grad_norm": 8.027523040771484, + "learning_rate": 1.028886087222743e-05, + "loss": 0.3361, + "step": 1146 + }, + { + "epoch": 0.5423167848699764, + "grad_norm": 6.380166530609131, + "learning_rate": 1.0272358743072152e-05, + "loss": 0.2274, + "step": 1147 + }, + { + "epoch": 0.542789598108747, + "grad_norm": 7.479015827178955, + "learning_rate": 1.0255855871644338e-05, + "loss": 0.3562, + "step": 1148 + }, + { + "epoch": 0.5432624113475177, + "grad_norm": 4.3820295333862305, + "learning_rate": 1.0239352302920067e-05, + "loss": 0.1709, + "step": 1149 + }, + { + "epoch": 0.5437352245862884, + "grad_norm": 6.630291938781738, + "learning_rate": 1.0222848081877316e-05, + "loss": 0.2615, + "step": 1150 + }, + { + "epoch": 0.5442080378250591, + "grad_norm": 5.88150691986084, + "learning_rate": 1.0206343253495848e-05, + "loss": 0.2611, + "step": 1151 + }, + { + "epoch": 0.5446808510638298, + "grad_norm": 6.246159553527832, + "learning_rate": 1.0189837862757068e-05, + "loss": 0.2713, + "step": 1152 + }, + { + "epoch": 0.5451536643026005, + "grad_norm": 6.391038417816162, + "learning_rate": 1.0173331954643926e-05, + "loss": 0.1998, + "step": 1153 + }, + { + "epoch": 0.5456264775413712, + "grad_norm": 5.693717002868652, + "learning_rate": 1.0156825574140769e-05, + "loss": 0.2219, + "step": 1154 + }, + { + "epoch": 0.5460992907801419, + "grad_norm": 4.549108982086182, + "learning_rate": 1.0140318766233247e-05, + "loss": 0.239, + "step": 1155 + }, + { + "epoch": 0.5465721040189125, + "grad_norm": 7.300600528717041, + "learning_rate": 1.0123811575908166e-05, + "loss": 0.3028, + "step": 1156 + }, + { + "epoch": 0.5470449172576832, + "grad_norm": 4.2444071769714355, + "learning_rate": 1.0107304048153372e-05, + "loss": 0.1432, + "step": 1157 + }, + { + "epoch": 0.5475177304964539, + "grad_norm": 5.09889030456543, + "learning_rate": 1.0090796227957633e-05, + "loss": 0.2697, + "step": 1158 + }, + { + "epoch": 0.5479905437352246, + "grad_norm": 5.837294101715088, + "learning_rate": 1.0074288160310514e-05, + "loss": 0.2371, + "step": 1159 + }, + { + "epoch": 0.5484633569739953, + "grad_norm": 6.027414798736572, + "learning_rate": 1.0057779890202259e-05, + "loss": 0.2864, + "step": 1160 + }, + { + "epoch": 0.5484633569739953, + "eval_accuracy": 0.8514412416851441, + "eval_f1": 0.6731707317073171, + "eval_loss": 0.32011911273002625, + "eval_precision": 0.8789808917197452, + "eval_recall": 0.5454545454545454, + "eval_runtime": 48.0201, + "eval_samples_per_second": 5.748, + "eval_steps_per_second": 0.187, + "step": 1160 + }, + { + "epoch": 0.548936170212766, + "grad_norm": 3.9856438636779785, + "learning_rate": 1.0041271462623658e-05, + "loss": 0.2113, + "step": 1161 + }, + { + "epoch": 0.5494089834515367, + "grad_norm": 4.562050819396973, + "learning_rate": 1.0024762922565933e-05, + "loss": 0.2173, + "step": 1162 + }, + { + "epoch": 0.5498817966903073, + "grad_norm": 4.3589558601379395, + "learning_rate": 1.0008254315020607e-05, + "loss": 0.185, + "step": 1163 + }, + { + "epoch": 0.550354609929078, + "grad_norm": 5.3740620613098145, + "learning_rate": 9.991745684979394e-06, + "loss": 0.2472, + "step": 1164 + }, + { + "epoch": 0.5508274231678487, + "grad_norm": 5.081512451171875, + "learning_rate": 9.97523707743407e-06, + "loss": 0.219, + "step": 1165 + }, + { + "epoch": 0.5513002364066194, + "grad_norm": 4.1024346351623535, + "learning_rate": 9.958728537376345e-06, + "loss": 0.1668, + "step": 1166 + }, + { + "epoch": 0.5517730496453901, + "grad_norm": 3.816474199295044, + "learning_rate": 9.942220109797746e-06, + "loss": 0.2022, + "step": 1167 + }, + { + "epoch": 0.5522458628841608, + "grad_norm": 5.035168647766113, + "learning_rate": 9.925711839689487e-06, + "loss": 0.2188, + "step": 1168 + }, + { + "epoch": 0.5527186761229315, + "grad_norm": 5.621501922607422, + "learning_rate": 9.909203772042369e-06, + "loss": 0.2612, + "step": 1169 + }, + { + "epoch": 0.5531914893617021, + "grad_norm": 3.7916884422302246, + "learning_rate": 9.892695951846631e-06, + "loss": 0.1537, + "step": 1170 + }, + { + "epoch": 0.5536643026004728, + "grad_norm": 6.460813045501709, + "learning_rate": 9.876188424091837e-06, + "loss": 0.2258, + "step": 1171 + }, + { + "epoch": 0.5541371158392435, + "grad_norm": 5.5038604736328125, + "learning_rate": 9.859681233766756e-06, + "loss": 0.1853, + "step": 1172 + }, + { + "epoch": 0.5546099290780142, + "grad_norm": 9.788790702819824, + "learning_rate": 9.843174425859231e-06, + "loss": 0.3384, + "step": 1173 + }, + { + "epoch": 0.5550827423167849, + "grad_norm": 8.492478370666504, + "learning_rate": 9.826668045356078e-06, + "loss": 0.2906, + "step": 1174 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 5.153669357299805, + "learning_rate": 9.810162137242935e-06, + "loss": 0.215, + "step": 1175 + }, + { + "epoch": 0.5560283687943263, + "grad_norm": 6.732087135314941, + "learning_rate": 9.793656746504155e-06, + "loss": 0.2921, + "step": 1176 + }, + { + "epoch": 0.556501182033097, + "grad_norm": 4.027410507202148, + "learning_rate": 9.777151918122684e-06, + "loss": 0.1983, + "step": 1177 + }, + { + "epoch": 0.5569739952718676, + "grad_norm": 5.1011061668396, + "learning_rate": 9.760647697079936e-06, + "loss": 0.2095, + "step": 1178 + }, + { + "epoch": 0.5574468085106383, + "grad_norm": 5.0621538162231445, + "learning_rate": 9.744144128355665e-06, + "loss": 0.1637, + "step": 1179 + }, + { + "epoch": 0.557919621749409, + "grad_norm": 6.033471584320068, + "learning_rate": 9.72764125692785e-06, + "loss": 0.2342, + "step": 1180 + }, + { + "epoch": 0.557919621749409, + "eval_accuracy": 0.8647450110864745, + "eval_f1": 0.7252252252252253, + "eval_loss": 0.3139636218547821, + "eval_precision": 0.8429319371727748, + "eval_recall": 0.6363636363636364, + "eval_runtime": 48.235, + "eval_samples_per_second": 5.722, + "eval_steps_per_second": 0.187, + "step": 1180 + }, + { + "epoch": 0.5583924349881797, + "grad_norm": 6.157944679260254, + "learning_rate": 9.711139127772568e-06, + "loss": 0.2866, + "step": 1181 + }, + { + "epoch": 0.5588652482269504, + "grad_norm": 5.42804479598999, + "learning_rate": 9.69463778586388e-06, + "loss": 0.1551, + "step": 1182 + }, + { + "epoch": 0.5593380614657211, + "grad_norm": 5.639116287231445, + "learning_rate": 9.678137276173692e-06, + "loss": 0.1956, + "step": 1183 + }, + { + "epoch": 0.5598108747044918, + "grad_norm": 6.913265705108643, + "learning_rate": 9.661637643671647e-06, + "loss": 0.2281, + "step": 1184 + }, + { + "epoch": 0.5602836879432624, + "grad_norm": 5.190281867980957, + "learning_rate": 9.645138933324994e-06, + "loss": 0.1959, + "step": 1185 + }, + { + "epoch": 0.5607565011820331, + "grad_norm": 4.637751579284668, + "learning_rate": 9.628641190098473e-06, + "loss": 0.2019, + "step": 1186 + }, + { + "epoch": 0.5612293144208038, + "grad_norm": 4.302716255187988, + "learning_rate": 9.612144458954189e-06, + "loss": 0.1699, + "step": 1187 + }, + { + "epoch": 0.5617021276595745, + "grad_norm": 6.4986395835876465, + "learning_rate": 9.59564878485148e-06, + "loss": 0.2111, + "step": 1188 + }, + { + "epoch": 0.5621749408983452, + "grad_norm": 6.432104587554932, + "learning_rate": 9.579154212746815e-06, + "loss": 0.199, + "step": 1189 + }, + { + "epoch": 0.5626477541371159, + "grad_norm": 5.450148105621338, + "learning_rate": 9.56266078759365e-06, + "loss": 0.2335, + "step": 1190 + }, + { + "epoch": 0.5631205673758866, + "grad_norm": 5.353931427001953, + "learning_rate": 9.546168554342323e-06, + "loss": 0.1919, + "step": 1191 + }, + { + "epoch": 0.5635933806146572, + "grad_norm": 5.608835220336914, + "learning_rate": 9.529677557939916e-06, + "loss": 0.2217, + "step": 1192 + }, + { + "epoch": 0.5640661938534279, + "grad_norm": 7.61819314956665, + "learning_rate": 9.513187843330146e-06, + "loss": 0.2864, + "step": 1193 + }, + { + "epoch": 0.5645390070921986, + "grad_norm": 7.839981555938721, + "learning_rate": 9.496699455453232e-06, + "loss": 0.2923, + "step": 1194 + }, + { + "epoch": 0.5650118203309693, + "grad_norm": 4.617547035217285, + "learning_rate": 9.480212439245785e-06, + "loss": 0.1815, + "step": 1195 + }, + { + "epoch": 0.56548463356974, + "grad_norm": 5.598609924316406, + "learning_rate": 9.463726839640667e-06, + "loss": 0.238, + "step": 1196 + }, + { + "epoch": 0.5659574468085107, + "grad_norm": 5.537100791931152, + "learning_rate": 9.44724270156689e-06, + "loss": 0.1757, + "step": 1197 + }, + { + "epoch": 0.5664302600472814, + "grad_norm": 4.509025573730469, + "learning_rate": 9.430760069949473e-06, + "loss": 0.2335, + "step": 1198 + }, + { + "epoch": 0.566903073286052, + "grad_norm": 6.317657470703125, + "learning_rate": 9.414278989709334e-06, + "loss": 0.1729, + "step": 1199 + }, + { + "epoch": 0.5673758865248227, + "grad_norm": 4.740533351898193, + "learning_rate": 9.397799505763167e-06, + "loss": 0.1366, + "step": 1200 + }, + { + "epoch": 0.5673758865248227, + "eval_accuracy": 0.8680709534368071, + "eval_f1": 0.7361419068736141, + "eval_loss": 0.30103373527526855, + "eval_precision": 0.8383838383838383, + "eval_recall": 0.6561264822134387, + "eval_runtime": 47.2489, + "eval_samples_per_second": 5.841, + "eval_steps_per_second": 0.19, + "step": 1200 + }, + { + "epoch": 0.5678486997635934, + "grad_norm": 6.263066291809082, + "learning_rate": 9.381321663023308e-06, + "loss": 0.2202, + "step": 1201 + }, + { + "epoch": 0.5683215130023641, + "grad_norm": 6.9543070793151855, + "learning_rate": 9.364845506397625e-06, + "loss": 0.1869, + "step": 1202 + }, + { + "epoch": 0.5687943262411348, + "grad_norm": 4.8995513916015625, + "learning_rate": 9.348371080789387e-06, + "loss": 0.2227, + "step": 1203 + }, + { + "epoch": 0.5692671394799055, + "grad_norm": 3.913970470428467, + "learning_rate": 9.331898431097153e-06, + "loss": 0.1941, + "step": 1204 + }, + { + "epoch": 0.5697399527186762, + "grad_norm": 4.263607025146484, + "learning_rate": 9.315427602214631e-06, + "loss": 0.2026, + "step": 1205 + }, + { + "epoch": 0.5702127659574469, + "grad_norm": 4.99878454208374, + "learning_rate": 9.298958639030577e-06, + "loss": 0.1717, + "step": 1206 + }, + { + "epoch": 0.5706855791962175, + "grad_norm": 4.628468036651611, + "learning_rate": 9.282491586428655e-06, + "loss": 0.1845, + "step": 1207 + }, + { + "epoch": 0.5711583924349882, + "grad_norm": 6.5551533699035645, + "learning_rate": 9.266026489287323e-06, + "loss": 0.2557, + "step": 1208 + }, + { + "epoch": 0.5716312056737589, + "grad_norm": 5.44743013381958, + "learning_rate": 9.249563392479715e-06, + "loss": 0.2666, + "step": 1209 + }, + { + "epoch": 0.5721040189125296, + "grad_norm": 5.58568000793457, + "learning_rate": 9.23310234087351e-06, + "loss": 0.2257, + "step": 1210 + }, + { + "epoch": 0.5725768321513003, + "grad_norm": 7.3130574226379395, + "learning_rate": 9.21664337933081e-06, + "loss": 0.3227, + "step": 1211 + }, + { + "epoch": 0.573049645390071, + "grad_norm": 6.498375415802002, + "learning_rate": 9.200186552708023e-06, + "loss": 0.2122, + "step": 1212 + }, + { + "epoch": 0.5735224586288417, + "grad_norm": 5.47324275970459, + "learning_rate": 9.183731905855746e-06, + "loss": 0.243, + "step": 1213 + }, + { + "epoch": 0.5739952718676123, + "grad_norm": 5.8507866859436035, + "learning_rate": 9.167279483618623e-06, + "loss": 0.1633, + "step": 1214 + }, + { + "epoch": 0.574468085106383, + "grad_norm": 4.788534641265869, + "learning_rate": 9.150829330835241e-06, + "loss": 0.182, + "step": 1215 + }, + { + "epoch": 0.5749408983451537, + "grad_norm": 7.402541160583496, + "learning_rate": 9.134381492338e-06, + "loss": 0.3063, + "step": 1216 + }, + { + "epoch": 0.5754137115839244, + "grad_norm": 4.93443489074707, + "learning_rate": 9.117936012953002e-06, + "loss": 0.208, + "step": 1217 + }, + { + "epoch": 0.5758865248226951, + "grad_norm": 8.154093742370605, + "learning_rate": 9.101492937499909e-06, + "loss": 0.2389, + "step": 1218 + }, + { + "epoch": 0.5763593380614658, + "grad_norm": 7.1925368309021, + "learning_rate": 9.08505231079183e-06, + "loss": 0.3203, + "step": 1219 + }, + { + "epoch": 0.5768321513002365, + "grad_norm": 7.500906467437744, + "learning_rate": 9.068614177635211e-06, + "loss": 0.2301, + "step": 1220 + }, + { + "epoch": 0.5768321513002365, + "eval_accuracy": 0.8625277161862528, + "eval_f1": 0.7142857142857143, + "eval_loss": 0.30109164118766785, + "eval_precision": 0.856353591160221, + "eval_recall": 0.6126482213438735, + "eval_runtime": 47.0544, + "eval_samples_per_second": 5.866, + "eval_steps_per_second": 0.191, + "step": 1220 + }, + { + "epoch": 0.577304964539007, + "grad_norm": 5.455466270446777, + "learning_rate": 9.052178582829687e-06, + "loss": 0.2111, + "step": 1221 + }, + { + "epoch": 0.5777777777777777, + "grad_norm": 4.1028618812561035, + "learning_rate": 9.035745571167996e-06, + "loss": 0.206, + "step": 1222 + }, + { + "epoch": 0.5782505910165484, + "grad_norm": 4.987546443939209, + "learning_rate": 9.01931518743582e-06, + "loss": 0.2396, + "step": 1223 + }, + { + "epoch": 0.5787234042553191, + "grad_norm": 9.012516975402832, + "learning_rate": 9.002887476411681e-06, + "loss": 0.3507, + "step": 1224 + }, + { + "epoch": 0.5791962174940898, + "grad_norm": 6.798236846923828, + "learning_rate": 8.986462482866817e-06, + "loss": 0.2712, + "step": 1225 + }, + { + "epoch": 0.5796690307328605, + "grad_norm": 5.508780479431152, + "learning_rate": 8.970040251565068e-06, + "loss": 0.2785, + "step": 1226 + }, + { + "epoch": 0.5801418439716312, + "grad_norm": 3.585559606552124, + "learning_rate": 8.953620827262739e-06, + "loss": 0.1821, + "step": 1227 + }, + { + "epoch": 0.5806146572104018, + "grad_norm": 4.184317588806152, + "learning_rate": 8.937204254708486e-06, + "loss": 0.2308, + "step": 1228 + }, + { + "epoch": 0.5810874704491725, + "grad_norm": 4.5350518226623535, + "learning_rate": 8.920790578643186e-06, + "loss": 0.2078, + "step": 1229 + }, + { + "epoch": 0.5815602836879432, + "grad_norm": 5.0740742683410645, + "learning_rate": 8.904379843799838e-06, + "loss": 0.2313, + "step": 1230 + }, + { + "epoch": 0.5820330969267139, + "grad_norm": 6.0607147216796875, + "learning_rate": 8.887972094903412e-06, + "loss": 0.2334, + "step": 1231 + }, + { + "epoch": 0.5825059101654846, + "grad_norm": 5.125598430633545, + "learning_rate": 8.871567376670747e-06, + "loss": 0.2739, + "step": 1232 + }, + { + "epoch": 0.5829787234042553, + "grad_norm": 7.169873237609863, + "learning_rate": 8.85516573381042e-06, + "loss": 0.2739, + "step": 1233 + }, + { + "epoch": 0.583451536643026, + "grad_norm": 6.202165603637695, + "learning_rate": 8.838767211022616e-06, + "loss": 0.3156, + "step": 1234 + }, + { + "epoch": 0.5839243498817966, + "grad_norm": 5.986494541168213, + "learning_rate": 8.82237185299904e-06, + "loss": 0.1882, + "step": 1235 + }, + { + "epoch": 0.5843971631205673, + "grad_norm": 4.3089470863342285, + "learning_rate": 8.805979704422758e-06, + "loss": 0.1905, + "step": 1236 + }, + { + "epoch": 0.584869976359338, + "grad_norm": 4.750925540924072, + "learning_rate": 8.789590809968082e-06, + "loss": 0.2272, + "step": 1237 + }, + { + "epoch": 0.5853427895981087, + "grad_norm": 4.529053688049316, + "learning_rate": 8.773205214300469e-06, + "loss": 0.2583, + "step": 1238 + }, + { + "epoch": 0.5858156028368794, + "grad_norm": 5.315147876739502, + "learning_rate": 8.756822962076382e-06, + "loss": 0.2463, + "step": 1239 + }, + { + "epoch": 0.5862884160756501, + "grad_norm": 5.5175909996032715, + "learning_rate": 8.740444097943166e-06, + "loss": 0.2873, + "step": 1240 + }, + { + "epoch": 0.5862884160756501, + "eval_accuracy": 0.8625277161862528, + "eval_f1": 0.7142857142857143, + "eval_loss": 0.30490389466285706, + "eval_precision": 0.856353591160221, + "eval_recall": 0.6126482213438735, + "eval_runtime": 47.7126, + "eval_samples_per_second": 5.785, + "eval_steps_per_second": 0.189, + "step": 1240 + }, + { + "epoch": 0.5867612293144208, + "grad_norm": 5.6907572746276855, + "learning_rate": 8.724068666538938e-06, + "loss": 0.2456, + "step": 1241 + }, + { + "epoch": 0.5872340425531914, + "grad_norm": 5.550398826599121, + "learning_rate": 8.707696712492455e-06, + "loss": 0.2122, + "step": 1242 + }, + { + "epoch": 0.5877068557919621, + "grad_norm": 5.554051876068115, + "learning_rate": 8.691328280423004e-06, + "loss": 0.1672, + "step": 1243 + }, + { + "epoch": 0.5881796690307328, + "grad_norm": 5.504934787750244, + "learning_rate": 8.674963414940271e-06, + "loss": 0.1918, + "step": 1244 + }, + { + "epoch": 0.5886524822695035, + "grad_norm": 6.041418075561523, + "learning_rate": 8.658602160644216e-06, + "loss": 0.2718, + "step": 1245 + }, + { + "epoch": 0.5891252955082742, + "grad_norm": 6.632382392883301, + "learning_rate": 8.642244562124962e-06, + "loss": 0.316, + "step": 1246 + }, + { + "epoch": 0.5895981087470449, + "grad_norm": 4.766592502593994, + "learning_rate": 8.625890663962669e-06, + "loss": 0.2298, + "step": 1247 + }, + { + "epoch": 0.5900709219858156, + "grad_norm": 5.895883560180664, + "learning_rate": 8.609540510727412e-06, + "loss": 0.2365, + "step": 1248 + }, + { + "epoch": 0.5905437352245863, + "grad_norm": 5.390053749084473, + "learning_rate": 8.593194146979059e-06, + "loss": 0.1977, + "step": 1249 + }, + { + "epoch": 0.5910165484633569, + "grad_norm": 7.53000020980835, + "learning_rate": 8.576851617267151e-06, + "loss": 0.2733, + "step": 1250 + }, + { + "epoch": 0.5914893617021276, + "grad_norm": 5.143542766571045, + "learning_rate": 8.560512966130775e-06, + "loss": 0.2405, + "step": 1251 + }, + { + "epoch": 0.5919621749408983, + "grad_norm": 5.577294826507568, + "learning_rate": 8.544178238098458e-06, + "loss": 0.2378, + "step": 1252 + }, + { + "epoch": 0.592434988179669, + "grad_norm": 4.410736083984375, + "learning_rate": 8.527847477688027e-06, + "loss": 0.2437, + "step": 1253 + }, + { + "epoch": 0.5929078014184397, + "grad_norm": 6.536932945251465, + "learning_rate": 8.511520729406498e-06, + "loss": 0.2503, + "step": 1254 + }, + { + "epoch": 0.5933806146572104, + "grad_norm": 5.56400728225708, + "learning_rate": 8.49519803774995e-06, + "loss": 0.2324, + "step": 1255 + }, + { + "epoch": 0.5938534278959811, + "grad_norm": 4.7766923904418945, + "learning_rate": 8.478879447203411e-06, + "loss": 0.1441, + "step": 1256 + }, + { + "epoch": 0.5943262411347517, + "grad_norm": 5.121423721313477, + "learning_rate": 8.462565002240733e-06, + "loss": 0.2649, + "step": 1257 + }, + { + "epoch": 0.5947990543735224, + "grad_norm": 4.3107404708862305, + "learning_rate": 8.446254747324462e-06, + "loss": 0.1711, + "step": 1258 + }, + { + "epoch": 0.5952718676122931, + "grad_norm": 5.047919273376465, + "learning_rate": 8.42994872690573e-06, + "loss": 0.2281, + "step": 1259 + }, + { + "epoch": 0.5957446808510638, + "grad_norm": 6.451530933380127, + "learning_rate": 8.413646985424127e-06, + "loss": 0.2467, + "step": 1260 + }, + { + "epoch": 0.5957446808510638, + "eval_accuracy": 0.8625277161862528, + "eval_f1": 0.7102803738317757, + "eval_loss": 0.3106723129749298, + "eval_precision": 0.8685714285714285, + "eval_recall": 0.6007905138339921, + "eval_runtime": 48.2804, + "eval_samples_per_second": 5.717, + "eval_steps_per_second": 0.186, + "step": 1260 + }, + { + "epoch": 0.5962174940898345, + "grad_norm": 5.580362796783447, + "learning_rate": 8.397349567307586e-06, + "loss": 0.2108, + "step": 1261 + }, + { + "epoch": 0.5966903073286052, + "grad_norm": 4.948899745941162, + "learning_rate": 8.381056516972253e-06, + "loss": 0.2347, + "step": 1262 + }, + { + "epoch": 0.5971631205673759, + "grad_norm": 3.7529051303863525, + "learning_rate": 8.364767878822368e-06, + "loss": 0.1665, + "step": 1263 + }, + { + "epoch": 0.5976359338061465, + "grad_norm": 7.008377552032471, + "learning_rate": 8.34848369725015e-06, + "loss": 0.2132, + "step": 1264 + }, + { + "epoch": 0.5981087470449172, + "grad_norm": 5.252836227416992, + "learning_rate": 8.332204016635672e-06, + "loss": 0.1941, + "step": 1265 + }, + { + "epoch": 0.5985815602836879, + "grad_norm": 6.382559776306152, + "learning_rate": 8.31592888134674e-06, + "loss": 0.2376, + "step": 1266 + }, + { + "epoch": 0.5990543735224586, + "grad_norm": 6.636437892913818, + "learning_rate": 8.299658335738772e-06, + "loss": 0.3327, + "step": 1267 + }, + { + "epoch": 0.5995271867612293, + "grad_norm": 5.242986679077148, + "learning_rate": 8.28339242415468e-06, + "loss": 0.2062, + "step": 1268 + }, + { + "epoch": 0.6, + "grad_norm": 5.419365882873535, + "learning_rate": 8.267131190924737e-06, + "loss": 0.2488, + "step": 1269 + }, + { + "epoch": 0.6004728132387707, + "grad_norm": 6.189310550689697, + "learning_rate": 8.25087468036648e-06, + "loss": 0.2743, + "step": 1270 + }, + { + "epoch": 0.6009456264775414, + "grad_norm": 5.147764205932617, + "learning_rate": 8.234622936784566e-06, + "loss": 0.1907, + "step": 1271 + }, + { + "epoch": 0.601418439716312, + "grad_norm": 5.64257287979126, + "learning_rate": 8.218376004470665e-06, + "loss": 0.2655, + "step": 1272 + }, + { + "epoch": 0.6018912529550827, + "grad_norm": 3.5520944595336914, + "learning_rate": 8.202133927703324e-06, + "loss": 0.1818, + "step": 1273 + }, + { + "epoch": 0.6023640661938534, + "grad_norm": 5.096825122833252, + "learning_rate": 8.185896750747878e-06, + "loss": 0.1918, + "step": 1274 + }, + { + "epoch": 0.6028368794326241, + "grad_norm": 5.38516092300415, + "learning_rate": 8.169664517856287e-06, + "loss": 0.2708, + "step": 1275 + }, + { + "epoch": 0.6033096926713948, + "grad_norm": 5.871916770935059, + "learning_rate": 8.153437273267045e-06, + "loss": 0.1947, + "step": 1276 + }, + { + "epoch": 0.6037825059101655, + "grad_norm": 4.89730167388916, + "learning_rate": 8.137215061205049e-06, + "loss": 0.2103, + "step": 1277 + }, + { + "epoch": 0.6042553191489362, + "grad_norm": 5.5777587890625, + "learning_rate": 8.120997925881492e-06, + "loss": 0.2599, + "step": 1278 + }, + { + "epoch": 0.6047281323877068, + "grad_norm": 4.445948600769043, + "learning_rate": 8.10478591149372e-06, + "loss": 0.2191, + "step": 1279 + }, + { + "epoch": 0.6052009456264775, + "grad_norm": 7.9579267501831055, + "learning_rate": 8.088579062225116e-06, + "loss": 0.3175, + "step": 1280 + }, + { + "epoch": 0.6052009456264775, + "eval_accuracy": 0.8580931263858093, + "eval_f1": 0.69377990430622, + "eval_loss": 0.312032550573349, + "eval_precision": 0.8787878787878788, + "eval_recall": 0.5731225296442688, + "eval_runtime": 50.2837, + "eval_samples_per_second": 5.489, + "eval_steps_per_second": 0.179, + "step": 1280 + }, + { + "epoch": 0.6056737588652482, + "grad_norm": 3.942072629928589, + "learning_rate": 8.072377422245002e-06, + "loss": 0.1949, + "step": 1281 + }, + { + "epoch": 0.6061465721040189, + "grad_norm": 4.524231910705566, + "learning_rate": 8.05618103570849e-06, + "loss": 0.1867, + "step": 1282 + }, + { + "epoch": 0.6066193853427896, + "grad_norm": 5.806196689605713, + "learning_rate": 8.039989946756388e-06, + "loss": 0.2334, + "step": 1283 + }, + { + "epoch": 0.6070921985815603, + "grad_norm": 5.2232489585876465, + "learning_rate": 8.02380419951506e-06, + "loss": 0.2788, + "step": 1284 + }, + { + "epoch": 0.607565011820331, + "grad_norm": 6.341989517211914, + "learning_rate": 8.0076238380963e-06, + "loss": 0.2481, + "step": 1285 + }, + { + "epoch": 0.6080378250591016, + "grad_norm": 5.141717433929443, + "learning_rate": 7.991448906597237e-06, + "loss": 0.2083, + "step": 1286 + }, + { + "epoch": 0.6085106382978723, + "grad_norm": 5.809133052825928, + "learning_rate": 7.975279449100207e-06, + "loss": 0.2377, + "step": 1287 + }, + { + "epoch": 0.608983451536643, + "grad_norm": 4.600372314453125, + "learning_rate": 7.959115509672612e-06, + "loss": 0.2026, + "step": 1288 + }, + { + "epoch": 0.6094562647754137, + "grad_norm": 7.412517547607422, + "learning_rate": 7.942957132366827e-06, + "loss": 0.3106, + "step": 1289 + }, + { + "epoch": 0.6099290780141844, + "grad_norm": 10.773149490356445, + "learning_rate": 7.926804361220056e-06, + "loss": 0.2309, + "step": 1290 + }, + { + "epoch": 0.6104018912529551, + "grad_norm": 4.44931173324585, + "learning_rate": 7.910657240254242e-06, + "loss": 0.2072, + "step": 1291 + }, + { + "epoch": 0.6108747044917258, + "grad_norm": 6.045795917510986, + "learning_rate": 7.894515813475914e-06, + "loss": 0.2879, + "step": 1292 + }, + { + "epoch": 0.6113475177304964, + "grad_norm": 4.986977577209473, + "learning_rate": 7.87838012487609e-06, + "loss": 0.1959, + "step": 1293 + }, + { + "epoch": 0.6118203309692671, + "grad_norm": 6.099925518035889, + "learning_rate": 7.862250218430147e-06, + "loss": 0.2966, + "step": 1294 + }, + { + "epoch": 0.6122931442080378, + "grad_norm": 5.837856292724609, + "learning_rate": 7.846126138097698e-06, + "loss": 0.2563, + "step": 1295 + }, + { + "epoch": 0.6127659574468085, + "grad_norm": 6.82401704788208, + "learning_rate": 7.830007927822494e-06, + "loss": 0.1892, + "step": 1296 + }, + { + "epoch": 0.6132387706855792, + "grad_norm": 6.041834354400635, + "learning_rate": 7.813895631532271e-06, + "loss": 0.1974, + "step": 1297 + }, + { + "epoch": 0.6137115839243499, + "grad_norm": 5.327773094177246, + "learning_rate": 7.797789293138657e-06, + "loss": 0.2551, + "step": 1298 + }, + { + "epoch": 0.6141843971631206, + "grad_norm": 3.487072467803955, + "learning_rate": 7.781688956537034e-06, + "loss": 0.1987, + "step": 1299 + }, + { + "epoch": 0.6146572104018913, + "grad_norm": 4.819819450378418, + "learning_rate": 7.765594665606441e-06, + "loss": 0.1988, + "step": 1300 + }, + { + "epoch": 0.6146572104018913, + "eval_accuracy": 0.8636363636363636, + "eval_f1": 0.7146171693735499, + "eval_loss": 0.3020324409008026, + "eval_precision": 0.8651685393258427, + "eval_recall": 0.6086956521739131, + "eval_runtime": 48.6185, + "eval_samples_per_second": 5.677, + "eval_steps_per_second": 0.185, + "step": 1300 + }, + { + "epoch": 0.6151300236406619, + "grad_norm": 5.6234612464904785, + "learning_rate": 7.749506464209428e-06, + "loss": 0.2889, + "step": 1301 + }, + { + "epoch": 0.6156028368794326, + "grad_norm": 5.582950592041016, + "learning_rate": 7.733424396191955e-06, + "loss": 0.2902, + "step": 1302 + }, + { + "epoch": 0.6160756501182033, + "grad_norm": 5.391469955444336, + "learning_rate": 7.71734850538326e-06, + "loss": 0.213, + "step": 1303 + }, + { + "epoch": 0.616548463356974, + "grad_norm": 4.6382060050964355, + "learning_rate": 7.701278835595753e-06, + "loss": 0.1684, + "step": 1304 + }, + { + "epoch": 0.6170212765957447, + "grad_norm": 8.098640441894531, + "learning_rate": 7.685215430624891e-06, + "loss": 0.4206, + "step": 1305 + }, + { + "epoch": 0.6174940898345154, + "grad_norm": 4.473232746124268, + "learning_rate": 7.669158334249048e-06, + "loss": 0.243, + "step": 1306 + }, + { + "epoch": 0.6179669030732861, + "grad_norm": 5.509943008422852, + "learning_rate": 7.65310759022941e-06, + "loss": 0.1861, + "step": 1307 + }, + { + "epoch": 0.6184397163120567, + "grad_norm": 6.489039421081543, + "learning_rate": 7.637063242309852e-06, + "loss": 0.2912, + "step": 1308 + }, + { + "epoch": 0.6189125295508274, + "grad_norm": 4.711176872253418, + "learning_rate": 7.621025334216819e-06, + "loss": 0.243, + "step": 1309 + }, + { + "epoch": 0.6193853427895981, + "grad_norm": 5.736166000366211, + "learning_rate": 7.604993909659198e-06, + "loss": 0.2759, + "step": 1310 + }, + { + "epoch": 0.6198581560283688, + "grad_norm": 7.324904441833496, + "learning_rate": 7.588969012328214e-06, + "loss": 0.2655, + "step": 1311 + }, + { + "epoch": 0.6203309692671395, + "grad_norm": 5.770148754119873, + "learning_rate": 7.572950685897295e-06, + "loss": 0.2062, + "step": 1312 + }, + { + "epoch": 0.6208037825059102, + "grad_norm": 5.873038291931152, + "learning_rate": 7.556938974021969e-06, + "loss": 0.2604, + "step": 1313 + }, + { + "epoch": 0.6212765957446809, + "grad_norm": 5.717566013336182, + "learning_rate": 7.540933920339733e-06, + "loss": 0.1932, + "step": 1314 + }, + { + "epoch": 0.6217494089834515, + "grad_norm": 5.060842514038086, + "learning_rate": 7.524935568469939e-06, + "loss": 0.2813, + "step": 1315 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 5.583745002746582, + "learning_rate": 7.50894396201367e-06, + "loss": 0.23, + "step": 1316 + }, + { + "epoch": 0.6226950354609929, + "grad_norm": 10.26961898803711, + "learning_rate": 7.4929591445536336e-06, + "loss": 0.2003, + "step": 1317 + }, + { + "epoch": 0.6231678486997636, + "grad_norm": 4.721004009246826, + "learning_rate": 7.4769811596540285e-06, + "loss": 0.1755, + "step": 1318 + }, + { + "epoch": 0.6236406619385343, + "grad_norm": 4.784334659576416, + "learning_rate": 7.461010050860438e-06, + "loss": 0.2046, + "step": 1319 + }, + { + "epoch": 0.624113475177305, + "grad_norm": 7.074143886566162, + "learning_rate": 7.445045861699696e-06, + "loss": 0.2081, + "step": 1320 + }, + { + "epoch": 0.624113475177305, + "eval_accuracy": 0.8558758314855875, + "eval_f1": 0.6859903381642513, + "eval_loss": 0.31748807430267334, + "eval_precision": 0.8819875776397516, + "eval_recall": 0.5612648221343873, + "eval_runtime": 47.9172, + "eval_samples_per_second": 5.76, + "eval_steps_per_second": 0.188, + "step": 1320 + }, + { + "epoch": 0.6245862884160757, + "grad_norm": 6.400256156921387, + "learning_rate": 7.429088635679786e-06, + "loss": 0.2797, + "step": 1321 + }, + { + "epoch": 0.6250591016548463, + "grad_norm": 7.948604583740234, + "learning_rate": 7.413138416289716e-06, + "loss": 0.2883, + "step": 1322 + }, + { + "epoch": 0.625531914893617, + "grad_norm": 5.226047039031982, + "learning_rate": 7.397195246999391e-06, + "loss": 0.2944, + "step": 1323 + }, + { + "epoch": 0.6260047281323877, + "grad_norm": 4.652298450469971, + "learning_rate": 7.381259171259509e-06, + "loss": 0.2375, + "step": 1324 + }, + { + "epoch": 0.6264775413711584, + "grad_norm": 6.352631568908691, + "learning_rate": 7.365330232501427e-06, + "loss": 0.2923, + "step": 1325 + }, + { + "epoch": 0.6269503546099291, + "grad_norm": 5.204030513763428, + "learning_rate": 7.349408474137067e-06, + "loss": 0.2485, + "step": 1326 + }, + { + "epoch": 0.6274231678486998, + "grad_norm": 6.4170026779174805, + "learning_rate": 7.333493939558764e-06, + "loss": 0.3025, + "step": 1327 + }, + { + "epoch": 0.6278959810874705, + "grad_norm": 5.421019077301025, + "learning_rate": 7.317586672139177e-06, + "loss": 0.2311, + "step": 1328 + }, + { + "epoch": 0.6283687943262412, + "grad_norm": 6.363109111785889, + "learning_rate": 7.301686715231149e-06, + "loss": 0.244, + "step": 1329 + }, + { + "epoch": 0.6288416075650118, + "grad_norm": 4.805910587310791, + "learning_rate": 7.285794112167615e-06, + "loss": 0.2314, + "step": 1330 + }, + { + "epoch": 0.6293144208037825, + "grad_norm": 4.570178508758545, + "learning_rate": 7.269908906261458e-06, + "loss": 0.2186, + "step": 1331 + }, + { + "epoch": 0.6297872340425532, + "grad_norm": 4.513207912445068, + "learning_rate": 7.254031140805399e-06, + "loss": 0.2176, + "step": 1332 + }, + { + "epoch": 0.6302600472813239, + "grad_norm": 5.025672435760498, + "learning_rate": 7.238160859071885e-06, + "loss": 0.275, + "step": 1333 + }, + { + "epoch": 0.6307328605200946, + "grad_norm": 5.059742450714111, + "learning_rate": 7.222298104312966e-06, + "loss": 0.2367, + "step": 1334 + }, + { + "epoch": 0.6312056737588653, + "grad_norm": 5.431969165802002, + "learning_rate": 7.206442919760186e-06, + "loss": 0.24, + "step": 1335 + }, + { + "epoch": 0.631678486997636, + "grad_norm": 11.056987762451172, + "learning_rate": 7.190595348624447e-06, + "loss": 0.3124, + "step": 1336 + }, + { + "epoch": 0.6321513002364066, + "grad_norm": 6.1571197509765625, + "learning_rate": 7.1747554340959055e-06, + "loss": 0.2398, + "step": 1337 + }, + { + "epoch": 0.6326241134751773, + "grad_norm": 5.703886032104492, + "learning_rate": 7.158923219343845e-06, + "loss": 0.2612, + "step": 1338 + }, + { + "epoch": 0.633096926713948, + "grad_norm": 5.536457061767578, + "learning_rate": 7.1430987475165834e-06, + "loss": 0.2558, + "step": 1339 + }, + { + "epoch": 0.6335697399527187, + "grad_norm": 4.489446640014648, + "learning_rate": 7.127282061741316e-06, + "loss": 0.1784, + "step": 1340 + }, + { + "epoch": 0.6335697399527187, + "eval_accuracy": 0.8647450110864745, + "eval_f1": 0.7324561403508771, + "eval_loss": 0.2959369122982025, + "eval_precision": 0.8226600985221675, + "eval_recall": 0.6600790513833992, + "eval_runtime": 49.1184, + "eval_samples_per_second": 5.619, + "eval_steps_per_second": 0.183, + "step": 1340 + }, + { + "epoch": 0.6340425531914894, + "grad_norm": 4.977541923522949, + "learning_rate": 7.11147320512403e-06, + "loss": 0.2467, + "step": 1341 + }, + { + "epoch": 0.6345153664302601, + "grad_norm": 4.624886989593506, + "learning_rate": 7.095672220749367e-06, + "loss": 0.258, + "step": 1342 + }, + { + "epoch": 0.6349881796690308, + "grad_norm": 4.496685028076172, + "learning_rate": 7.079879151680516e-06, + "loss": 0.206, + "step": 1343 + }, + { + "epoch": 0.6354609929078014, + "grad_norm": 5.0084919929504395, + "learning_rate": 7.064094040959107e-06, + "loss": 0.1829, + "step": 1344 + }, + { + "epoch": 0.6359338061465721, + "grad_norm": 5.6750168800354, + "learning_rate": 7.048316931605062e-06, + "loss": 0.2466, + "step": 1345 + }, + { + "epoch": 0.6364066193853428, + "grad_norm": 6.028310298919678, + "learning_rate": 7.032547866616512e-06, + "loss": 0.2048, + "step": 1346 + }, + { + "epoch": 0.6368794326241135, + "grad_norm": 4.741923809051514, + "learning_rate": 7.0167868889696445e-06, + "loss": 0.1621, + "step": 1347 + }, + { + "epoch": 0.6373522458628842, + "grad_norm": 5.013643264770508, + "learning_rate": 7.001034041618632e-06, + "loss": 0.3119, + "step": 1348 + }, + { + "epoch": 0.6378250591016549, + "grad_norm": 7.612677097320557, + "learning_rate": 6.985289367495469e-06, + "loss": 0.3243, + "step": 1349 + }, + { + "epoch": 0.6382978723404256, + "grad_norm": 6.442877769470215, + "learning_rate": 6.969552909509885e-06, + "loss": 0.2928, + "step": 1350 + }, + { + "epoch": 0.6387706855791963, + "grad_norm": 4.449213981628418, + "learning_rate": 6.953824710549212e-06, + "loss": 0.1977, + "step": 1351 + }, + { + "epoch": 0.6392434988179669, + "grad_norm": 5.301755905151367, + "learning_rate": 6.938104813478279e-06, + "loss": 0.2666, + "step": 1352 + }, + { + "epoch": 0.6397163120567376, + "grad_norm": 4.733539581298828, + "learning_rate": 6.922393261139284e-06, + "loss": 0.1967, + "step": 1353 + }, + { + "epoch": 0.6401891252955083, + "grad_norm": 5.527211666107178, + "learning_rate": 6.9066900963516855e-06, + "loss": 0.2261, + "step": 1354 + }, + { + "epoch": 0.640661938534279, + "grad_norm": 7.788763999938965, + "learning_rate": 6.8909953619120836e-06, + "loss": 0.2244, + "step": 1355 + }, + { + "epoch": 0.6411347517730497, + "grad_norm": 4.974414825439453, + "learning_rate": 6.875309100594098e-06, + "loss": 0.2021, + "step": 1356 + }, + { + "epoch": 0.6416075650118204, + "grad_norm": 4.9365739822387695, + "learning_rate": 6.859631355148266e-06, + "loss": 0.1671, + "step": 1357 + }, + { + "epoch": 0.642080378250591, + "grad_norm": 5.185995578765869, + "learning_rate": 6.843962168301907e-06, + "loss": 0.2056, + "step": 1358 + }, + { + "epoch": 0.6425531914893617, + "grad_norm": 4.460386276245117, + "learning_rate": 6.828301582759018e-06, + "loss": 0.1665, + "step": 1359 + }, + { + "epoch": 0.6430260047281324, + "grad_norm": 6.994537353515625, + "learning_rate": 6.8126496412001545e-06, + "loss": 0.2712, + "step": 1360 + }, + { + "epoch": 0.6430260047281324, + "eval_accuracy": 0.8592017738359202, + "eval_f1": 0.7066974595842956, + "eval_loss": 0.31329602003097534, + "eval_precision": 0.85, + "eval_recall": 0.6047430830039525, + "eval_runtime": 47.7506, + "eval_samples_per_second": 5.78, + "eval_steps_per_second": 0.188, + "step": 1360 + }, + { + "epoch": 0.6434988179669031, + "grad_norm": 4.288215160369873, + "learning_rate": 6.797006386282316e-06, + "loss": 0.1407, + "step": 1361 + }, + { + "epoch": 0.6439716312056738, + "grad_norm": 7.29041862487793, + "learning_rate": 6.7813718606388255e-06, + "loss": 0.2459, + "step": 1362 + }, + { + "epoch": 0.6444444444444445, + "grad_norm": 4.9920268058776855, + "learning_rate": 6.7657461068792164e-06, + "loss": 0.2132, + "step": 1363 + }, + { + "epoch": 0.6449172576832152, + "grad_norm": 5.254515171051025, + "learning_rate": 6.750129167589113e-06, + "loss": 0.2016, + "step": 1364 + }, + { + "epoch": 0.6453900709219859, + "grad_norm": 6.032263278961182, + "learning_rate": 6.734521085330126e-06, + "loss": 0.1932, + "step": 1365 + }, + { + "epoch": 0.6458628841607565, + "grad_norm": 4.635222434997559, + "learning_rate": 6.718921902639717e-06, + "loss": 0.202, + "step": 1366 + }, + { + "epoch": 0.6463356973995272, + "grad_norm": 5.365309715270996, + "learning_rate": 6.7033316620310985e-06, + "loss": 0.2137, + "step": 1367 + }, + { + "epoch": 0.6468085106382979, + "grad_norm": 4.981945991516113, + "learning_rate": 6.687750405993113e-06, + "loss": 0.2489, + "step": 1368 + }, + { + "epoch": 0.6472813238770686, + "grad_norm": 6.213076591491699, + "learning_rate": 6.672178176990112e-06, + "loss": 0.2583, + "step": 1369 + }, + { + "epoch": 0.6477541371158393, + "grad_norm": 8.723681449890137, + "learning_rate": 6.656615017461854e-06, + "loss": 0.2961, + "step": 1370 + }, + { + "epoch": 0.64822695035461, + "grad_norm": 3.6889824867248535, + "learning_rate": 6.641060969823372e-06, + "loss": 0.1616, + "step": 1371 + }, + { + "epoch": 0.6486997635933807, + "grad_norm": 5.324930667877197, + "learning_rate": 6.625516076464871e-06, + "loss": 0.2571, + "step": 1372 + }, + { + "epoch": 0.6491725768321513, + "grad_norm": 7.705888748168945, + "learning_rate": 6.6099803797516e-06, + "loss": 0.3487, + "step": 1373 + }, + { + "epoch": 0.649645390070922, + "grad_norm": 7.570559024810791, + "learning_rate": 6.5944539220237555e-06, + "loss": 0.2652, + "step": 1374 + }, + { + "epoch": 0.6501182033096927, + "grad_norm": 5.115143299102783, + "learning_rate": 6.578936745596346e-06, + "loss": 0.1846, + "step": 1375 + }, + { + "epoch": 0.6505910165484634, + "grad_norm": 5.2409162521362305, + "learning_rate": 6.563428892759087e-06, + "loss": 0.1869, + "step": 1376 + }, + { + "epoch": 0.6510638297872341, + "grad_norm": 7.305501937866211, + "learning_rate": 6.547930405776282e-06, + "loss": 0.2298, + "step": 1377 + }, + { + "epoch": 0.6515366430260048, + "grad_norm": 5.585699081420898, + "learning_rate": 6.532441326886716e-06, + "loss": 0.1531, + "step": 1378 + }, + { + "epoch": 0.6520094562647755, + "grad_norm": 5.718395709991455, + "learning_rate": 6.5169616983035285e-06, + "loss": 0.2375, + "step": 1379 + }, + { + "epoch": 0.6524822695035462, + "grad_norm": 7.011470317840576, + "learning_rate": 6.501491562214104e-06, + "loss": 0.2463, + "step": 1380 + }, + { + "epoch": 0.6524822695035462, + "eval_accuracy": 0.8547671840354767, + "eval_f1": 0.6960556844547564, + "eval_loss": 0.3179844319820404, + "eval_precision": 0.8426966292134831, + "eval_recall": 0.5928853754940712, + "eval_runtime": 48.0507, + "eval_samples_per_second": 5.744, + "eval_steps_per_second": 0.187, + "step": 1380 + }, + { + "epoch": 0.6529550827423168, + "grad_norm": 6.096155643463135, + "learning_rate": 6.486030960779956e-06, + "loss": 0.2143, + "step": 1381 + }, + { + "epoch": 0.6534278959810875, + "grad_norm": 6.085330486297607, + "learning_rate": 6.470579936136612e-06, + "loss": 0.2922, + "step": 1382 + }, + { + "epoch": 0.6539007092198581, + "grad_norm": 4.465743541717529, + "learning_rate": 6.455138530393508e-06, + "loss": 0.2069, + "step": 1383 + }, + { + "epoch": 0.6543735224586288, + "grad_norm": 8.364693641662598, + "learning_rate": 6.4397067856338524e-06, + "loss": 0.2768, + "step": 1384 + }, + { + "epoch": 0.6548463356973995, + "grad_norm": 4.662283897399902, + "learning_rate": 6.424284743914532e-06, + "loss": 0.2401, + "step": 1385 + }, + { + "epoch": 0.6553191489361702, + "grad_norm": 6.041397571563721, + "learning_rate": 6.408872447265984e-06, + "loss": 0.2707, + "step": 1386 + }, + { + "epoch": 0.6557919621749408, + "grad_norm": 5.31654167175293, + "learning_rate": 6.393469937692101e-06, + "loss": 0.2028, + "step": 1387 + }, + { + "epoch": 0.6562647754137115, + "grad_norm": 5.916751384735107, + "learning_rate": 6.378077257170081e-06, + "loss": 0.2362, + "step": 1388 + }, + { + "epoch": 0.6567375886524822, + "grad_norm": 5.025276184082031, + "learning_rate": 6.3626944476503485e-06, + "loss": 0.2574, + "step": 1389 + }, + { + "epoch": 0.6572104018912529, + "grad_norm": 9.537090301513672, + "learning_rate": 6.34732155105642e-06, + "loss": 0.3416, + "step": 1390 + }, + { + "epoch": 0.6576832151300236, + "grad_norm": 4.489987850189209, + "learning_rate": 6.331958609284806e-06, + "loss": 0.2113, + "step": 1391 + }, + { + "epoch": 0.6581560283687943, + "grad_norm": 6.175182819366455, + "learning_rate": 6.316605664204878e-06, + "loss": 0.2733, + "step": 1392 + }, + { + "epoch": 0.658628841607565, + "grad_norm": 8.544486999511719, + "learning_rate": 6.301262757658758e-06, + "loss": 0.254, + "step": 1393 + }, + { + "epoch": 0.6591016548463356, + "grad_norm": 7.164076328277588, + "learning_rate": 6.285929931461218e-06, + "loss": 0.2604, + "step": 1394 + }, + { + "epoch": 0.6595744680851063, + "grad_norm": 7.023167133331299, + "learning_rate": 6.2706072273995546e-06, + "loss": 0.2135, + "step": 1395 + }, + { + "epoch": 0.660047281323877, + "grad_norm": 3.9141695499420166, + "learning_rate": 6.255294687233484e-06, + "loss": 0.1969, + "step": 1396 + }, + { + "epoch": 0.6605200945626477, + "grad_norm": 9.266695976257324, + "learning_rate": 6.239992352695016e-06, + "loss": 0.2915, + "step": 1397 + }, + { + "epoch": 0.6609929078014184, + "grad_norm": 5.782726764678955, + "learning_rate": 6.224700265488343e-06, + "loss": 0.2525, + "step": 1398 + }, + { + "epoch": 0.6614657210401891, + "grad_norm": 5.57002067565918, + "learning_rate": 6.209418467289731e-06, + "loss": 0.2601, + "step": 1399 + }, + { + "epoch": 0.6619385342789598, + "grad_norm": 8.637619018554688, + "learning_rate": 6.194146999747419e-06, + "loss": 0.3991, + "step": 1400 + }, + { + "epoch": 0.6619385342789598, + "eval_accuracy": 0.8625277161862528, + "eval_f1": 0.7061611374407583, + "eval_loss": 0.3167264759540558, + "eval_precision": 0.8816568047337278, + "eval_recall": 0.5889328063241107, + "eval_runtime": 49.4666, + "eval_samples_per_second": 5.58, + "eval_steps_per_second": 0.182, + "step": 1400 + }, + { + "epoch": 0.6624113475177305, + "grad_norm": 8.265732765197754, + "learning_rate": 6.1788859044814755e-06, + "loss": 0.369, + "step": 1401 + }, + { + "epoch": 0.6628841607565011, + "grad_norm": 6.062003135681152, + "learning_rate": 6.163635223083706e-06, + "loss": 0.2058, + "step": 1402 + }, + { + "epoch": 0.6633569739952718, + "grad_norm": 3.8213107585906982, + "learning_rate": 6.148394997117532e-06, + "loss": 0.1434, + "step": 1403 + }, + { + "epoch": 0.6638297872340425, + "grad_norm": 4.005511283874512, + "learning_rate": 6.133165268117885e-06, + "loss": 0.2036, + "step": 1404 + }, + { + "epoch": 0.6643026004728132, + "grad_norm": 5.579016208648682, + "learning_rate": 6.117946077591087e-06, + "loss": 0.2527, + "step": 1405 + }, + { + "epoch": 0.6647754137115839, + "grad_norm": 8.702812194824219, + "learning_rate": 6.102737467014739e-06, + "loss": 0.2678, + "step": 1406 + }, + { + "epoch": 0.6652482269503546, + "grad_norm": 5.449542999267578, + "learning_rate": 6.087539477837609e-06, + "loss": 0.2133, + "step": 1407 + }, + { + "epoch": 0.6657210401891253, + "grad_norm": 5.406524181365967, + "learning_rate": 6.072352151479508e-06, + "loss": 0.2225, + "step": 1408 + }, + { + "epoch": 0.6661938534278959, + "grad_norm": 7.414266586303711, + "learning_rate": 6.057175529331205e-06, + "loss": 0.2549, + "step": 1409 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 6.921921730041504, + "learning_rate": 6.0420096527542835e-06, + "loss": 0.2658, + "step": 1410 + }, + { + "epoch": 0.6671394799054373, + "grad_norm": 6.305230617523193, + "learning_rate": 6.026854563081046e-06, + "loss": 0.2819, + "step": 1411 + }, + { + "epoch": 0.667612293144208, + "grad_norm": 5.2894511222839355, + "learning_rate": 6.0117103016143915e-06, + "loss": 0.1989, + "step": 1412 + }, + { + "epoch": 0.6680851063829787, + "grad_norm": 7.54205846786499, + "learning_rate": 5.996576909627718e-06, + "loss": 0.2917, + "step": 1413 + }, + { + "epoch": 0.6685579196217494, + "grad_norm": 3.777785539627075, + "learning_rate": 5.981454428364792e-06, + "loss": 0.1905, + "step": 1414 + }, + { + "epoch": 0.6690307328605201, + "grad_norm": 4.611357688903809, + "learning_rate": 5.96634289903965e-06, + "loss": 0.2231, + "step": 1415 + }, + { + "epoch": 0.6695035460992907, + "grad_norm": 9.586370468139648, + "learning_rate": 5.951242362836475e-06, + "loss": 0.2447, + "step": 1416 + }, + { + "epoch": 0.6699763593380614, + "grad_norm": 5.429812431335449, + "learning_rate": 5.936152860909492e-06, + "loss": 0.2668, + "step": 1417 + }, + { + "epoch": 0.6704491725768321, + "grad_norm": 4.418676376342773, + "learning_rate": 5.921074434382861e-06, + "loss": 0.216, + "step": 1418 + }, + { + "epoch": 0.6709219858156028, + "grad_norm": 4.1734089851379395, + "learning_rate": 5.906007124350547e-06, + "loss": 0.1834, + "step": 1419 + }, + { + "epoch": 0.6713947990543735, + "grad_norm": 5.029831886291504, + "learning_rate": 5.8909509718762235e-06, + "loss": 0.154, + "step": 1420 + }, + { + "epoch": 0.6713947990543735, + "eval_accuracy": 0.8636363636363636, + "eval_f1": 0.7146171693735499, + "eval_loss": 0.30272066593170166, + "eval_precision": 0.8651685393258427, + "eval_recall": 0.6086956521739131, + "eval_runtime": 47.8176, + "eval_samples_per_second": 5.772, + "eval_steps_per_second": 0.188, + "step": 1420 + }, + { + "epoch": 0.6718676122931442, + "grad_norm": 4.48655366897583, + "learning_rate": 5.875906017993156e-06, + "loss": 0.1879, + "step": 1421 + }, + { + "epoch": 0.6723404255319149, + "grad_norm": 5.680935382843018, + "learning_rate": 5.8608723037040894e-06, + "loss": 0.2809, + "step": 1422 + }, + { + "epoch": 0.6728132387706856, + "grad_norm": 6.1803083419799805, + "learning_rate": 5.845849869981137e-06, + "loss": 0.195, + "step": 1423 + }, + { + "epoch": 0.6732860520094562, + "grad_norm": 9.49524974822998, + "learning_rate": 5.830838757765671e-06, + "loss": 0.3723, + "step": 1424 + }, + { + "epoch": 0.6737588652482269, + "grad_norm": 4.156935214996338, + "learning_rate": 5.815839007968196e-06, + "loss": 0.2042, + "step": 1425 + }, + { + "epoch": 0.6742316784869976, + "grad_norm": 5.995987892150879, + "learning_rate": 5.8008506614682714e-06, + "loss": 0.2007, + "step": 1426 + }, + { + "epoch": 0.6747044917257683, + "grad_norm": 4.0844645500183105, + "learning_rate": 5.785873759114364e-06, + "loss": 0.1592, + "step": 1427 + }, + { + "epoch": 0.675177304964539, + "grad_norm": 7.70731782913208, + "learning_rate": 5.770908341723752e-06, + "loss": 0.2633, + "step": 1428 + }, + { + "epoch": 0.6756501182033097, + "grad_norm": 4.959256649017334, + "learning_rate": 5.755954450082417e-06, + "loss": 0.2326, + "step": 1429 + }, + { + "epoch": 0.6761229314420804, + "grad_norm": 7.28727912902832, + "learning_rate": 5.741012124944925e-06, + "loss": 0.3043, + "step": 1430 + }, + { + "epoch": 0.676595744680851, + "grad_norm": 6.117763519287109, + "learning_rate": 5.726081407034327e-06, + "loss": 0.1876, + "step": 1431 + }, + { + "epoch": 0.6770685579196217, + "grad_norm": 4.031482696533203, + "learning_rate": 5.711162337042033e-06, + "loss": 0.2204, + "step": 1432 + }, + { + "epoch": 0.6775413711583924, + "grad_norm": 4.445287227630615, + "learning_rate": 5.6962549556277134e-06, + "loss": 0.1462, + "step": 1433 + }, + { + "epoch": 0.6780141843971631, + "grad_norm": 5.196186542510986, + "learning_rate": 5.681359303419169e-06, + "loss": 0.2448, + "step": 1434 + }, + { + "epoch": 0.6784869976359338, + "grad_norm": 5.29311990737915, + "learning_rate": 5.666475421012256e-06, + "loss": 0.1858, + "step": 1435 + }, + { + "epoch": 0.6789598108747045, + "grad_norm": 4.104085445404053, + "learning_rate": 5.651603348970741e-06, + "loss": 0.1939, + "step": 1436 + }, + { + "epoch": 0.6794326241134752, + "grad_norm": 6.132163047790527, + "learning_rate": 5.636743127826205e-06, + "loss": 0.2087, + "step": 1437 + }, + { + "epoch": 0.6799054373522458, + "grad_norm": 5.092171669006348, + "learning_rate": 5.621894798077928e-06, + "loss": 0.1947, + "step": 1438 + }, + { + "epoch": 0.6803782505910165, + "grad_norm": 5.59557580947876, + "learning_rate": 5.607058400192793e-06, + "loss": 0.219, + "step": 1439 + }, + { + "epoch": 0.6808510638297872, + "grad_norm": 6.623174667358398, + "learning_rate": 5.592233974605154e-06, + "loss": 0.1944, + "step": 1440 + }, + { + "epoch": 0.6808510638297872, + "eval_accuracy": 0.8625277161862528, + "eval_f1": 0.7075471698113207, + "eval_loss": 0.3171689808368683, + "eval_precision": 0.8771929824561403, + "eval_recall": 0.5928853754940712, + "eval_runtime": 46.6382, + "eval_samples_per_second": 5.918, + "eval_steps_per_second": 0.193, + "step": 1440 + }, + { + "epoch": 0.6813238770685579, + "grad_norm": 5.490980625152588, + "learning_rate": 5.577421561716738e-06, + "loss": 0.2387, + "step": 1441 + }, + { + "epoch": 0.6817966903073286, + "grad_norm": 6.905361652374268, + "learning_rate": 5.5626212018965344e-06, + "loss": 0.214, + "step": 1442 + }, + { + "epoch": 0.6822695035460993, + "grad_norm": 4.289844036102295, + "learning_rate": 5.547832935480686e-06, + "loss": 0.1257, + "step": 1443 + }, + { + "epoch": 0.68274231678487, + "grad_norm": 5.823220729827881, + "learning_rate": 5.533056802772374e-06, + "loss": 0.256, + "step": 1444 + }, + { + "epoch": 0.6832151300236406, + "grad_norm": 5.129421234130859, + "learning_rate": 5.518292844041711e-06, + "loss": 0.2614, + "step": 1445 + }, + { + "epoch": 0.6836879432624113, + "grad_norm": 4.678886890411377, + "learning_rate": 5.503541099525633e-06, + "loss": 0.1629, + "step": 1446 + }, + { + "epoch": 0.684160756501182, + "grad_norm": 8.273910522460938, + "learning_rate": 5.488801609427783e-06, + "loss": 0.2119, + "step": 1447 + }, + { + "epoch": 0.6846335697399527, + "grad_norm": 4.722292900085449, + "learning_rate": 5.474074413918418e-06, + "loss": 0.1892, + "step": 1448 + }, + { + "epoch": 0.6851063829787234, + "grad_norm": 6.923647880554199, + "learning_rate": 5.459359553134278e-06, + "loss": 0.2873, + "step": 1449 + }, + { + "epoch": 0.6855791962174941, + "grad_norm": 4.769101619720459, + "learning_rate": 5.444657067178487e-06, + "loss": 0.163, + "step": 1450 + }, + { + "epoch": 0.6860520094562648, + "grad_norm": 5.0238518714904785, + "learning_rate": 5.429966996120446e-06, + "loss": 0.1423, + "step": 1451 + }, + { + "epoch": 0.6865248226950355, + "grad_norm": 6.839998245239258, + "learning_rate": 5.415289379995723e-06, + "loss": 0.2498, + "step": 1452 + }, + { + "epoch": 0.6869976359338061, + "grad_norm": 5.654500484466553, + "learning_rate": 5.400624258805935e-06, + "loss": 0.1813, + "step": 1453 + }, + { + "epoch": 0.6874704491725768, + "grad_norm": 6.542251110076904, + "learning_rate": 5.385971672518653e-06, + "loss": 0.1936, + "step": 1454 + }, + { + "epoch": 0.6879432624113475, + "grad_norm": 6.039957046508789, + "learning_rate": 5.371331661067284e-06, + "loss": 0.1988, + "step": 1455 + }, + { + "epoch": 0.6884160756501182, + "grad_norm": 6.784928321838379, + "learning_rate": 5.356704264350958e-06, + "loss": 0.244, + "step": 1456 + }, + { + "epoch": 0.6888888888888889, + "grad_norm": 6.722204685211182, + "learning_rate": 5.342089522234439e-06, + "loss": 0.2621, + "step": 1457 + }, + { + "epoch": 0.6893617021276596, + "grad_norm": 4.733044147491455, + "learning_rate": 5.327487474547992e-06, + "loss": 0.2154, + "step": 1458 + }, + { + "epoch": 0.6898345153664303, + "grad_norm": 6.589846134185791, + "learning_rate": 5.312898161087288e-06, + "loss": 0.2647, + "step": 1459 + }, + { + "epoch": 0.6903073286052009, + "grad_norm": 5.961693286895752, + "learning_rate": 5.298321621613292e-06, + "loss": 0.2434, + "step": 1460 + }, + { + "epoch": 0.6903073286052009, + "eval_accuracy": 0.8691796008869179, + "eval_f1": 0.7412280701754386, + "eval_loss": 0.303468257188797, + "eval_precision": 0.8325123152709359, + "eval_recall": 0.6679841897233202, + "eval_runtime": 47.3008, + "eval_samples_per_second": 5.835, + "eval_steps_per_second": 0.19, + "step": 1460 + }, + { + "epoch": 0.6907801418439716, + "grad_norm": 7.759005069732666, + "learning_rate": 5.283757895852156e-06, + "loss": 0.2391, + "step": 1461 + }, + { + "epoch": 0.6912529550827423, + "grad_norm": 7.777105331420898, + "learning_rate": 5.269207023495112e-06, + "loss": 0.2959, + "step": 1462 + }, + { + "epoch": 0.691725768321513, + "grad_norm": 5.419680595397949, + "learning_rate": 5.25466904419836e-06, + "loss": 0.2324, + "step": 1463 + }, + { + "epoch": 0.6921985815602837, + "grad_norm": 8.358044624328613, + "learning_rate": 5.240143997582956e-06, + "loss": 0.242, + "step": 1464 + }, + { + "epoch": 0.6926713947990544, + "grad_norm": 7.01899528503418, + "learning_rate": 5.2256319232347275e-06, + "loss": 0.2361, + "step": 1465 + }, + { + "epoch": 0.6931442080378251, + "grad_norm": 9.41346549987793, + "learning_rate": 5.211132860704131e-06, + "loss": 0.2523, + "step": 1466 + }, + { + "epoch": 0.6936170212765957, + "grad_norm": 5.518009185791016, + "learning_rate": 5.196646849506169e-06, + "loss": 0.271, + "step": 1467 + }, + { + "epoch": 0.6940898345153664, + "grad_norm": 8.015327453613281, + "learning_rate": 5.18217392912027e-06, + "loss": 0.1744, + "step": 1468 + }, + { + "epoch": 0.6945626477541371, + "grad_norm": 6.272970199584961, + "learning_rate": 5.16771413899019e-06, + "loss": 0.2644, + "step": 1469 + }, + { + "epoch": 0.6950354609929078, + "grad_norm": 5.460439682006836, + "learning_rate": 5.153267518523899e-06, + "loss": 0.1546, + "step": 1470 + }, + { + "epoch": 0.6955082742316785, + "grad_norm": 4.355556488037109, + "learning_rate": 5.1388341070934735e-06, + "loss": 0.1737, + "step": 1471 + }, + { + "epoch": 0.6959810874704492, + "grad_norm": 5.721870422363281, + "learning_rate": 5.124413944034992e-06, + "loss": 0.2474, + "step": 1472 + }, + { + "epoch": 0.6964539007092199, + "grad_norm": 6.737970352172852, + "learning_rate": 5.110007068648422e-06, + "loss": 0.2093, + "step": 1473 + }, + { + "epoch": 0.6969267139479906, + "grad_norm": 5.622700214385986, + "learning_rate": 5.095613520197533e-06, + "loss": 0.2962, + "step": 1474 + }, + { + "epoch": 0.6973995271867612, + "grad_norm": 7.287775993347168, + "learning_rate": 5.081233337909756e-06, + "loss": 0.237, + "step": 1475 + }, + { + "epoch": 0.6978723404255319, + "grad_norm": 5.515003204345703, + "learning_rate": 5.066866560976102e-06, + "loss": 0.2765, + "step": 1476 + }, + { + "epoch": 0.6983451536643026, + "grad_norm": 6.383912563323975, + "learning_rate": 5.052513228551048e-06, + "loss": 0.234, + "step": 1477 + }, + { + "epoch": 0.6988179669030733, + "grad_norm": 6.042670726776123, + "learning_rate": 5.038173379752425e-06, + "loss": 0.3074, + "step": 1478 + }, + { + "epoch": 0.699290780141844, + "grad_norm": 7.7684831619262695, + "learning_rate": 5.0238470536613315e-06, + "loss": 0.2179, + "step": 1479 + }, + { + "epoch": 0.6997635933806147, + "grad_norm": 6.858969211578369, + "learning_rate": 5.009534289321991e-06, + "loss": 0.2346, + "step": 1480 + }, + { + "epoch": 0.6997635933806147, + "eval_accuracy": 0.8625277161862528, + "eval_f1": 0.7089201877934272, + "eval_loss": 0.3162979483604431, + "eval_precision": 0.8728323699421965, + "eval_recall": 0.5968379446640316, + "eval_runtime": 47.1019, + "eval_samples_per_second": 5.86, + "eval_steps_per_second": 0.191, + "step": 1480 + }, + { + "epoch": 0.7002364066193854, + "grad_norm": 4.629432201385498, + "learning_rate": 4.99523512574168e-06, + "loss": 0.1548, + "step": 1481 + }, + { + "epoch": 0.700709219858156, + "grad_norm": 5.3954548835754395, + "learning_rate": 4.9809496018906e-06, + "loss": 0.2797, + "step": 1482 + }, + { + "epoch": 0.7011820330969267, + "grad_norm": 5.510435104370117, + "learning_rate": 4.9666777567017935e-06, + "loss": 0.2455, + "step": 1483 + }, + { + "epoch": 0.7016548463356974, + "grad_norm": 5.090349197387695, + "learning_rate": 4.9524196290710095e-06, + "loss": 0.2056, + "step": 1484 + }, + { + "epoch": 0.7021276595744681, + "grad_norm": 3.495950698852539, + "learning_rate": 4.938175257856618e-06, + "loss": 0.1664, + "step": 1485 + }, + { + "epoch": 0.7026004728132388, + "grad_norm": 5.755441188812256, + "learning_rate": 4.9239446818794914e-06, + "loss": 0.247, + "step": 1486 + }, + { + "epoch": 0.7030732860520095, + "grad_norm": 6.540046215057373, + "learning_rate": 4.90972793992292e-06, + "loss": 0.2858, + "step": 1487 + }, + { + "epoch": 0.7035460992907802, + "grad_norm": 6.471051216125488, + "learning_rate": 4.89552507073248e-06, + "loss": 0.1972, + "step": 1488 + }, + { + "epoch": 0.7040189125295508, + "grad_norm": 4.746458530426025, + "learning_rate": 4.881336113015939e-06, + "loss": 0.2121, + "step": 1489 + }, + { + "epoch": 0.7044917257683215, + "grad_norm": 4.542480945587158, + "learning_rate": 4.867161105443158e-06, + "loss": 0.2013, + "step": 1490 + }, + { + "epoch": 0.7049645390070922, + "grad_norm": 5.85392951965332, + "learning_rate": 4.853000086645965e-06, + "loss": 0.2253, + "step": 1491 + }, + { + "epoch": 0.7054373522458629, + "grad_norm": 6.674161911010742, + "learning_rate": 4.838853095218085e-06, + "loss": 0.2491, + "step": 1492 + }, + { + "epoch": 0.7059101654846336, + "grad_norm": 4.372111797332764, + "learning_rate": 4.824720169714997e-06, + "loss": 0.1928, + "step": 1493 + }, + { + "epoch": 0.7063829787234043, + "grad_norm": 6.305669784545898, + "learning_rate": 4.8106013486538505e-06, + "loss": 0.2462, + "step": 1494 + }, + { + "epoch": 0.706855791962175, + "grad_norm": 5.79583215713501, + "learning_rate": 4.796496670513354e-06, + "loss": 0.2655, + "step": 1495 + }, + { + "epoch": 0.7073286052009456, + "grad_norm": 6.358242511749268, + "learning_rate": 4.782406173733678e-06, + "loss": 0.1943, + "step": 1496 + }, + { + "epoch": 0.7078014184397163, + "grad_norm": 6.863159656524658, + "learning_rate": 4.768329896716337e-06, + "loss": 0.2634, + "step": 1497 + }, + { + "epoch": 0.708274231678487, + "grad_norm": 6.125742435455322, + "learning_rate": 4.7542678778240925e-06, + "loss": 0.2209, + "step": 1498 + }, + { + "epoch": 0.7087470449172577, + "grad_norm": 5.767421722412109, + "learning_rate": 4.74022015538085e-06, + "loss": 0.2381, + "step": 1499 + }, + { + "epoch": 0.7092198581560284, + "grad_norm": 5.935783863067627, + "learning_rate": 4.72618676767155e-06, + "loss": 0.2532, + "step": 1500 + }, + { + "epoch": 0.7092198581560284, + "eval_accuracy": 0.8658536585365854, + "eval_f1": 0.7328918322295805, + "eval_loss": 0.2938072085380554, + "eval_precision": 0.83, + "eval_recall": 0.6561264822134387, + "eval_runtime": 47.9899, + "eval_samples_per_second": 5.751, + "eval_steps_per_second": 0.188, + "step": 1500 + }, + { + "epoch": 0.7096926713947991, + "grad_norm": 6.156116008758545, + "learning_rate": 4.712167752942067e-06, + "loss": 0.2343, + "step": 1501 + }, + { + "epoch": 0.7101654846335698, + "grad_norm": 7.0279412269592285, + "learning_rate": 4.698163149399104e-06, + "loss": 0.2633, + "step": 1502 + }, + { + "epoch": 0.7106382978723405, + "grad_norm": 5.053689002990723, + "learning_rate": 4.68417299521009e-06, + "loss": 0.2181, + "step": 1503 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 5.44726037979126, + "learning_rate": 4.670197328503067e-06, + "loss": 0.2635, + "step": 1504 + }, + { + "epoch": 0.7115839243498818, + "grad_norm": 6.307510852813721, + "learning_rate": 4.656236187366607e-06, + "loss": 0.2775, + "step": 1505 + }, + { + "epoch": 0.7120567375886525, + "grad_norm": 7.774320602416992, + "learning_rate": 4.642289609849686e-06, + "loss": 0.2855, + "step": 1506 + }, + { + "epoch": 0.7125295508274232, + "grad_norm": 7.836673736572266, + "learning_rate": 4.628357633961589e-06, + "loss": 0.3376, + "step": 1507 + }, + { + "epoch": 0.7130023640661939, + "grad_norm": 6.827236175537109, + "learning_rate": 4.614440297671806e-06, + "loss": 0.2518, + "step": 1508 + }, + { + "epoch": 0.7134751773049646, + "grad_norm": 4.354206085205078, + "learning_rate": 4.600537638909933e-06, + "loss": 0.2073, + "step": 1509 + }, + { + "epoch": 0.7139479905437353, + "grad_norm": 5.115018367767334, + "learning_rate": 4.586649695565563e-06, + "loss": 0.2146, + "step": 1510 + }, + { + "epoch": 0.7144208037825059, + "grad_norm": 6.804549694061279, + "learning_rate": 4.572776505488181e-06, + "loss": 0.2646, + "step": 1511 + }, + { + "epoch": 0.7148936170212766, + "grad_norm": 5.3641581535339355, + "learning_rate": 4.558918106487065e-06, + "loss": 0.2636, + "step": 1512 + }, + { + "epoch": 0.7153664302600473, + "grad_norm": 7.315088272094727, + "learning_rate": 4.545074536331191e-06, + "loss": 0.2897, + "step": 1513 + }, + { + "epoch": 0.715839243498818, + "grad_norm": 5.816035747528076, + "learning_rate": 4.531245832749112e-06, + "loss": 0.2956, + "step": 1514 + }, + { + "epoch": 0.7163120567375887, + "grad_norm": 4.486478328704834, + "learning_rate": 4.517432033428864e-06, + "loss": 0.2543, + "step": 1515 + }, + { + "epoch": 0.7167848699763594, + "grad_norm": 4.364136219024658, + "learning_rate": 4.5036331760178695e-06, + "loss": 0.1811, + "step": 1516 + }, + { + "epoch": 0.7172576832151301, + "grad_norm": 8.54796314239502, + "learning_rate": 4.4898492981228245e-06, + "loss": 0.237, + "step": 1517 + }, + { + "epoch": 0.7177304964539007, + "grad_norm": 6.10654354095459, + "learning_rate": 4.4760804373096036e-06, + "loss": 0.2353, + "step": 1518 + }, + { + "epoch": 0.7182033096926714, + "grad_norm": 4.113856792449951, + "learning_rate": 4.46232663110315e-06, + "loss": 0.1487, + "step": 1519 + }, + { + "epoch": 0.7186761229314421, + "grad_norm": 3.579453468322754, + "learning_rate": 4.448587916987384e-06, + "loss": 0.1815, + "step": 1520 + }, + { + "epoch": 0.7186761229314421, + "eval_accuracy": 0.8569844789356984, + "eval_f1": 0.6861313868613139, + "eval_loss": 0.3156262934207916, + "eval_precision": 0.8924050632911392, + "eval_recall": 0.5573122529644269, + "eval_runtime": 47.8319, + "eval_samples_per_second": 5.77, + "eval_steps_per_second": 0.188, + "step": 1520 + }, + { + "epoch": 0.7191489361702128, + "grad_norm": 3.734877824783325, + "learning_rate": 4.434864332405085e-06, + "loss": 0.1694, + "step": 1521 + }, + { + "epoch": 0.7196217494089835, + "grad_norm": 5.21559476852417, + "learning_rate": 4.421155914757817e-06, + "loss": 0.2566, + "step": 1522 + }, + { + "epoch": 0.7200945626477542, + "grad_norm": 5.495852470397949, + "learning_rate": 4.407462701405791e-06, + "loss": 0.2993, + "step": 1523 + }, + { + "epoch": 0.7205673758865249, + "grad_norm": 4.870457649230957, + "learning_rate": 4.393784729667788e-06, + "loss": 0.2035, + "step": 1524 + }, + { + "epoch": 0.7210401891252955, + "grad_norm": 10.591519355773926, + "learning_rate": 4.380122036821048e-06, + "loss": 0.4052, + "step": 1525 + }, + { + "epoch": 0.7215130023640662, + "grad_norm": 5.435426712036133, + "learning_rate": 4.366474660101183e-06, + "loss": 0.2258, + "step": 1526 + }, + { + "epoch": 0.7219858156028369, + "grad_norm": 4.526400089263916, + "learning_rate": 4.3528426367020405e-06, + "loss": 0.1775, + "step": 1527 + }, + { + "epoch": 0.7224586288416076, + "grad_norm": 3.7634830474853516, + "learning_rate": 4.339226003775642e-06, + "loss": 0.2018, + "step": 1528 + }, + { + "epoch": 0.7229314420803783, + "grad_norm": 5.57973051071167, + "learning_rate": 4.325624798432059e-06, + "loss": 0.2942, + "step": 1529 + }, + { + "epoch": 0.723404255319149, + "grad_norm": 4.420356750488281, + "learning_rate": 4.312039057739316e-06, + "loss": 0.217, + "step": 1530 + }, + { + "epoch": 0.7238770685579197, + "grad_norm": 6.249464511871338, + "learning_rate": 4.298468818723298e-06, + "loss": 0.2769, + "step": 1531 + }, + { + "epoch": 0.7243498817966904, + "grad_norm": 4.786191463470459, + "learning_rate": 4.284914118367637e-06, + "loss": 0.2363, + "step": 1532 + }, + { + "epoch": 0.724822695035461, + "grad_norm": 3.865898847579956, + "learning_rate": 4.271374993613615e-06, + "loss": 0.1605, + "step": 1533 + }, + { + "epoch": 0.7252955082742317, + "grad_norm": 4.794460296630859, + "learning_rate": 4.257851481360066e-06, + "loss": 0.1329, + "step": 1534 + }, + { + "epoch": 0.7257683215130024, + "grad_norm": 4.994689464569092, + "learning_rate": 4.244343618463281e-06, + "loss": 0.2508, + "step": 1535 + }, + { + "epoch": 0.7262411347517731, + "grad_norm": 5.947475910186768, + "learning_rate": 4.2308514417368974e-06, + "loss": 0.1934, + "step": 1536 + }, + { + "epoch": 0.7267139479905438, + "grad_norm": 5.171843528747559, + "learning_rate": 4.2173749879517945e-06, + "loss": 0.2216, + "step": 1537 + }, + { + "epoch": 0.7271867612293145, + "grad_norm": 3.876723527908325, + "learning_rate": 4.2039142938360086e-06, + "loss": 0.1718, + "step": 1538 + }, + { + "epoch": 0.7276595744680852, + "grad_norm": 4.720749855041504, + "learning_rate": 4.190469396074622e-06, + "loss": 0.1817, + "step": 1539 + }, + { + "epoch": 0.7281323877068558, + "grad_norm": 4.733708381652832, + "learning_rate": 4.177040331309678e-06, + "loss": 0.1989, + "step": 1540 + }, + { + "epoch": 0.7281323877068558, + "eval_accuracy": 0.8614190687361419, + "eval_f1": 0.7016706443914081, + "eval_loss": 0.3186676502227783, + "eval_precision": 0.8855421686746988, + "eval_recall": 0.5810276679841897, + "eval_runtime": 48.0319, + "eval_samples_per_second": 5.746, + "eval_steps_per_second": 0.187, + "step": 1540 + }, + { + "epoch": 0.7286052009456265, + "grad_norm": 4.963865756988525, + "learning_rate": 4.163627136140054e-06, + "loss": 0.1798, + "step": 1541 + }, + { + "epoch": 0.7290780141843972, + "grad_norm": 5.173526763916016, + "learning_rate": 4.150229847121384e-06, + "loss": 0.2075, + "step": 1542 + }, + { + "epoch": 0.7295508274231679, + "grad_norm": 4.9639387130737305, + "learning_rate": 4.136848500765948e-06, + "loss": 0.2293, + "step": 1543 + }, + { + "epoch": 0.7300236406619386, + "grad_norm": 5.996505260467529, + "learning_rate": 4.123483133542588e-06, + "loss": 0.2557, + "step": 1544 + }, + { + "epoch": 0.7304964539007093, + "grad_norm": 3.7886509895324707, + "learning_rate": 4.110133781876587e-06, + "loss": 0.1741, + "step": 1545 + }, + { + "epoch": 0.7309692671394799, + "grad_norm": 9.337357521057129, + "learning_rate": 4.0968004821495845e-06, + "loss": 0.2775, + "step": 1546 + }, + { + "epoch": 0.7314420803782505, + "grad_norm": 7.194756984710693, + "learning_rate": 4.083483270699461e-06, + "loss": 0.2572, + "step": 1547 + }, + { + "epoch": 0.7319148936170212, + "grad_norm": 5.332988739013672, + "learning_rate": 4.070182183820272e-06, + "loss": 0.1859, + "step": 1548 + }, + { + "epoch": 0.7323877068557919, + "grad_norm": 7.105772972106934, + "learning_rate": 4.056897257762111e-06, + "loss": 0.2279, + "step": 1549 + }, + { + "epoch": 0.7328605200945626, + "grad_norm": 3.7316532135009766, + "learning_rate": 4.043628528731036e-06, + "loss": 0.1744, + "step": 1550 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 5.632213592529297, + "learning_rate": 4.030376032888959e-06, + "loss": 0.1418, + "step": 1551 + }, + { + "epoch": 0.733806146572104, + "grad_norm": 11.740776062011719, + "learning_rate": 4.01713980635355e-06, + "loss": 0.4084, + "step": 1552 + }, + { + "epoch": 0.7342789598108747, + "grad_norm": 5.153548717498779, + "learning_rate": 4.003919885198145e-06, + "loss": 0.1908, + "step": 1553 + }, + { + "epoch": 0.7347517730496453, + "grad_norm": 7.000768184661865, + "learning_rate": 3.990716305451636e-06, + "loss": 0.158, + "step": 1554 + }, + { + "epoch": 0.735224586288416, + "grad_norm": 7.075311183929443, + "learning_rate": 3.977529103098382e-06, + "loss": 0.3135, + "step": 1555 + }, + { + "epoch": 0.7356973995271867, + "grad_norm": 4.545357704162598, + "learning_rate": 3.964358314078107e-06, + "loss": 0.1733, + "step": 1556 + }, + { + "epoch": 0.7361702127659574, + "grad_norm": 7.8234357833862305, + "learning_rate": 3.951203974285805e-06, + "loss": 0.2067, + "step": 1557 + }, + { + "epoch": 0.7366430260047281, + "grad_norm": 7.197999000549316, + "learning_rate": 3.938066119571634e-06, + "loss": 0.1974, + "step": 1558 + }, + { + "epoch": 0.7371158392434988, + "grad_norm": 4.149715900421143, + "learning_rate": 3.9249447857408316e-06, + "loss": 0.1895, + "step": 1559 + }, + { + "epoch": 0.7375886524822695, + "grad_norm": 6.287510395050049, + "learning_rate": 3.911840008553604e-06, + "loss": 0.1749, + "step": 1560 + }, + { + "epoch": 0.7375886524822695, + "eval_accuracy": 0.8647450110864745, + "eval_f1": 0.7136150234741784, + "eval_loss": 0.3169473111629486, + "eval_precision": 0.8786127167630058, + "eval_recall": 0.6007905138339921, + "eval_runtime": 49.2192, + "eval_samples_per_second": 5.608, + "eval_steps_per_second": 0.183, + "step": 1560 + }, + { + "epoch": 0.7380614657210401, + "grad_norm": 6.008514881134033, + "learning_rate": 3.898751823725044e-06, + "loss": 0.2882, + "step": 1561 + }, + { + "epoch": 0.7385342789598108, + "grad_norm": 4.948297023773193, + "learning_rate": 3.885680266925016e-06, + "loss": 0.2198, + "step": 1562 + }, + { + "epoch": 0.7390070921985815, + "grad_norm": 8.02566146850586, + "learning_rate": 3.87262537377807e-06, + "loss": 0.2223, + "step": 1563 + }, + { + "epoch": 0.7394799054373522, + "grad_norm": 5.575436592102051, + "learning_rate": 3.85958717986334e-06, + "loss": 0.2097, + "step": 1564 + }, + { + "epoch": 0.7399527186761229, + "grad_norm": 6.199014663696289, + "learning_rate": 3.846565720714451e-06, + "loss": 0.2203, + "step": 1565 + }, + { + "epoch": 0.7404255319148936, + "grad_norm": 4.431324481964111, + "learning_rate": 3.83356103181942e-06, + "loss": 0.196, + "step": 1566 + }, + { + "epoch": 0.7408983451536643, + "grad_norm": 8.445691108703613, + "learning_rate": 3.820573148620559e-06, + "loss": 0.278, + "step": 1567 + }, + { + "epoch": 0.741371158392435, + "grad_norm": 7.028994083404541, + "learning_rate": 3.807602106514375e-06, + "loss": 0.2681, + "step": 1568 + }, + { + "epoch": 0.7418439716312056, + "grad_norm": 4.2099995613098145, + "learning_rate": 3.79464794085148e-06, + "loss": 0.2066, + "step": 1569 + }, + { + "epoch": 0.7423167848699763, + "grad_norm": 4.710866928100586, + "learning_rate": 3.781710686936497e-06, + "loss": 0.183, + "step": 1570 + }, + { + "epoch": 0.742789598108747, + "grad_norm": 5.328328609466553, + "learning_rate": 3.7687903800279513e-06, + "loss": 0.1954, + "step": 1571 + }, + { + "epoch": 0.7432624113475177, + "grad_norm": 7.036726474761963, + "learning_rate": 3.755887055338183e-06, + "loss": 0.2555, + "step": 1572 + }, + { + "epoch": 0.7437352245862884, + "grad_norm": 6.250298500061035, + "learning_rate": 3.743000748033252e-06, + "loss": 0.2065, + "step": 1573 + }, + { + "epoch": 0.7442080378250591, + "grad_norm": 6.400665760040283, + "learning_rate": 3.730131493232837e-06, + "loss": 0.2693, + "step": 1574 + }, + { + "epoch": 0.7446808510638298, + "grad_norm": 5.254453659057617, + "learning_rate": 3.7172793260101446e-06, + "loss": 0.1433, + "step": 1575 + }, + { + "epoch": 0.7451536643026004, + "grad_norm": 7.966073989868164, + "learning_rate": 3.7044442813918125e-06, + "loss": 0.2912, + "step": 1576 + }, + { + "epoch": 0.7456264775413711, + "grad_norm": 7.94743537902832, + "learning_rate": 3.6916263943578123e-06, + "loss": 0.1966, + "step": 1577 + }, + { + "epoch": 0.7460992907801418, + "grad_norm": 6.912722110748291, + "learning_rate": 3.6788256998413506e-06, + "loss": 0.1794, + "step": 1578 + }, + { + "epoch": 0.7465721040189125, + "grad_norm": 6.084993839263916, + "learning_rate": 3.6660422327287914e-06, + "loss": 0.1606, + "step": 1579 + }, + { + "epoch": 0.7470449172576832, + "grad_norm": 5.905043125152588, + "learning_rate": 3.6532760278595345e-06, + "loss": 0.2141, + "step": 1580 + }, + { + "epoch": 0.7470449172576832, + "eval_accuracy": 0.8669623059866962, + "eval_f1": 0.7272727272727273, + "eval_loss": 0.3045748174190521, + "eval_precision": 0.8556149732620321, + "eval_recall": 0.6324110671936759, + "eval_runtime": 46.807, + "eval_samples_per_second": 5.897, + "eval_steps_per_second": 0.192, + "step": 1580 + }, + { + "epoch": 0.7475177304964539, + "grad_norm": 6.382122039794922, + "learning_rate": 3.6405271200259406e-06, + "loss": 0.1741, + "step": 1581 + }, + { + "epoch": 0.7479905437352246, + "grad_norm": 7.32953405380249, + "learning_rate": 3.627795543973228e-06, + "loss": 0.3185, + "step": 1582 + }, + { + "epoch": 0.7484633569739952, + "grad_norm": 5.959390163421631, + "learning_rate": 3.6150813343993817e-06, + "loss": 0.244, + "step": 1583 + }, + { + "epoch": 0.7489361702127659, + "grad_norm": 7.206792831420898, + "learning_rate": 3.6023845259550526e-06, + "loss": 0.2935, + "step": 1584 + }, + { + "epoch": 0.7494089834515366, + "grad_norm": 6.780135154724121, + "learning_rate": 3.5897051532434746e-06, + "loss": 0.2841, + "step": 1585 + }, + { + "epoch": 0.7498817966903073, + "grad_norm": 5.990156173706055, + "learning_rate": 3.5770432508203525e-06, + "loss": 0.2146, + "step": 1586 + }, + { + "epoch": 0.750354609929078, + "grad_norm": 6.680501937866211, + "learning_rate": 3.5643988531937923e-06, + "loss": 0.2593, + "step": 1587 + }, + { + "epoch": 0.7508274231678487, + "grad_norm": 6.1346282958984375, + "learning_rate": 3.5517719948241837e-06, + "loss": 0.2365, + "step": 1588 + }, + { + "epoch": 0.7513002364066194, + "grad_norm": 6.420486927032471, + "learning_rate": 3.5391627101241187e-06, + "loss": 0.2646, + "step": 1589 + }, + { + "epoch": 0.75177304964539, + "grad_norm": 5.62802791595459, + "learning_rate": 3.5265710334582924e-06, + "loss": 0.2584, + "step": 1590 + }, + { + "epoch": 0.7522458628841607, + "grad_norm": 5.959242820739746, + "learning_rate": 3.5139969991434132e-06, + "loss": 0.1629, + "step": 1591 + }, + { + "epoch": 0.7527186761229314, + "grad_norm": 7.4486517906188965, + "learning_rate": 3.5014406414481173e-06, + "loss": 0.3043, + "step": 1592 + }, + { + "epoch": 0.7531914893617021, + "grad_norm": 11.137663841247559, + "learning_rate": 3.488901994592846e-06, + "loss": 0.2216, + "step": 1593 + }, + { + "epoch": 0.7536643026004728, + "grad_norm": 7.19482946395874, + "learning_rate": 3.476381092749789e-06, + "loss": 0.1895, + "step": 1594 + }, + { + "epoch": 0.7541371158392435, + "grad_norm": 5.478968620300293, + "learning_rate": 3.463877970042765e-06, + "loss": 0.2568, + "step": 1595 + }, + { + "epoch": 0.7546099290780142, + "grad_norm": 5.759098052978516, + "learning_rate": 3.4513926605471504e-06, + "loss": 0.2078, + "step": 1596 + }, + { + "epoch": 0.7550827423167848, + "grad_norm": 5.912849426269531, + "learning_rate": 3.438925198289762e-06, + "loss": 0.2184, + "step": 1597 + }, + { + "epoch": 0.7555555555555555, + "grad_norm": 7.000945091247559, + "learning_rate": 3.4264756172487813e-06, + "loss": 0.2958, + "step": 1598 + }, + { + "epoch": 0.7560283687943262, + "grad_norm": 6.138962745666504, + "learning_rate": 3.414043951353656e-06, + "loss": 0.3196, + "step": 1599 + }, + { + "epoch": 0.7565011820330969, + "grad_norm": 6.125826835632324, + "learning_rate": 3.401630234485014e-06, + "loss": 0.2638, + "step": 1600 + }, + { + "epoch": 0.7565011820330969, + "eval_accuracy": 0.8669623059866962, + "eval_f1": 0.7309417040358744, + "eval_loss": 0.29756960272789, + "eval_precision": 0.844559585492228, + "eval_recall": 0.6442687747035574, + "eval_runtime": 47.3187, + "eval_samples_per_second": 5.833, + "eval_steps_per_second": 0.19, + "step": 1600 + }, + { + "epoch": 0.7569739952718676, + "grad_norm": 6.0788445472717285, + "learning_rate": 3.3892345004745607e-06, + "loss": 0.1994, + "step": 1601 + }, + { + "epoch": 0.7574468085106383, + "grad_norm": 6.959369659423828, + "learning_rate": 3.376856783104996e-06, + "loss": 0.3052, + "step": 1602 + }, + { + "epoch": 0.757919621749409, + "grad_norm": 4.413602352142334, + "learning_rate": 3.3644971161099083e-06, + "loss": 0.1861, + "step": 1603 + }, + { + "epoch": 0.7583924349881797, + "grad_norm": 4.487978935241699, + "learning_rate": 3.3521555331736987e-06, + "loss": 0.2593, + "step": 1604 + }, + { + "epoch": 0.7588652482269503, + "grad_norm": 5.322134017944336, + "learning_rate": 3.339832067931491e-06, + "loss": 0.2151, + "step": 1605 + }, + { + "epoch": 0.759338061465721, + "grad_norm": 5.43377685546875, + "learning_rate": 3.3275267539690225e-06, + "loss": 0.2738, + "step": 1606 + }, + { + "epoch": 0.7598108747044917, + "grad_norm": 5.074190616607666, + "learning_rate": 3.315239624822563e-06, + "loss": 0.1439, + "step": 1607 + }, + { + "epoch": 0.7602836879432624, + "grad_norm": 5.0913920402526855, + "learning_rate": 3.30297071397882e-06, + "loss": 0.2314, + "step": 1608 + }, + { + "epoch": 0.7607565011820331, + "grad_norm": 6.6666107177734375, + "learning_rate": 3.29072005487486e-06, + "loss": 0.3486, + "step": 1609 + }, + { + "epoch": 0.7612293144208038, + "grad_norm": 5.526213645935059, + "learning_rate": 3.278487680897997e-06, + "loss": 0.2517, + "step": 1610 + }, + { + "epoch": 0.7617021276595745, + "grad_norm": 5.9422736167907715, + "learning_rate": 3.2662736253857154e-06, + "loss": 0.219, + "step": 1611 + }, + { + "epoch": 0.7621749408983451, + "grad_norm": 5.5532426834106445, + "learning_rate": 3.254077921625578e-06, + "loss": 0.2077, + "step": 1612 + }, + { + "epoch": 0.7626477541371158, + "grad_norm": 4.330904960632324, + "learning_rate": 3.2419006028551205e-06, + "loss": 0.1412, + "step": 1613 + }, + { + "epoch": 0.7631205673758865, + "grad_norm": 8.491339683532715, + "learning_rate": 3.2297417022617904e-06, + "loss": 0.3303, + "step": 1614 + }, + { + "epoch": 0.7635933806146572, + "grad_norm": 6.322214603424072, + "learning_rate": 3.2176012529828295e-06, + "loss": 0.2718, + "step": 1615 + }, + { + "epoch": 0.7640661938534279, + "grad_norm": 6.424625873565674, + "learning_rate": 3.2054792881051933e-06, + "loss": 0.2817, + "step": 1616 + }, + { + "epoch": 0.7645390070921986, + "grad_norm": 5.700141429901123, + "learning_rate": 3.1933758406654615e-06, + "loss": 0.273, + "step": 1617 + }, + { + "epoch": 0.7650118203309693, + "grad_norm": 4.833881378173828, + "learning_rate": 3.181290943649753e-06, + "loss": 0.2003, + "step": 1618 + }, + { + "epoch": 0.76548463356974, + "grad_norm": 5.849541187286377, + "learning_rate": 3.1692246299936234e-06, + "loss": 0.2389, + "step": 1619 + }, + { + "epoch": 0.7659574468085106, + "grad_norm": 4.621654510498047, + "learning_rate": 3.1571769325819834e-06, + "loss": 0.2215, + "step": 1620 + }, + { + "epoch": 0.7659574468085106, + "eval_accuracy": 0.8647450110864745, + "eval_f1": 0.7276785714285714, + "eval_loss": 0.2926580309867859, + "eval_precision": 0.8358974358974359, + "eval_recall": 0.6442687747035574, + "eval_runtime": 48.4804, + "eval_samples_per_second": 5.693, + "eval_steps_per_second": 0.186, + "step": 1620 + }, + { + "epoch": 0.7664302600472813, + "grad_norm": 4.861423492431641, + "learning_rate": 3.1451478842490114e-06, + "loss": 0.2547, + "step": 1621 + }, + { + "epoch": 0.766903073286052, + "grad_norm": 5.893204212188721, + "learning_rate": 3.133137517778054e-06, + "loss": 0.1872, + "step": 1622 + }, + { + "epoch": 0.7673758865248227, + "grad_norm": 5.206727504730225, + "learning_rate": 3.1211458659015513e-06, + "loss": 0.198, + "step": 1623 + }, + { + "epoch": 0.7678486997635934, + "grad_norm": 5.422547340393066, + "learning_rate": 3.1091729613009346e-06, + "loss": 0.25, + "step": 1624 + }, + { + "epoch": 0.7683215130023641, + "grad_norm": 6.1905999183654785, + "learning_rate": 3.0972188366065424e-06, + "loss": 0.2626, + "step": 1625 + }, + { + "epoch": 0.7687943262411348, + "grad_norm": 4.419033050537109, + "learning_rate": 3.08528352439753e-06, + "loss": 0.2188, + "step": 1626 + }, + { + "epoch": 0.7692671394799054, + "grad_norm": 4.382445335388184, + "learning_rate": 3.0733670572017894e-06, + "loss": 0.1589, + "step": 1627 + }, + { + "epoch": 0.7697399527186761, + "grad_norm": 4.622806549072266, + "learning_rate": 3.0614694674958477e-06, + "loss": 0.2515, + "step": 1628 + }, + { + "epoch": 0.7702127659574468, + "grad_norm": 6.51718807220459, + "learning_rate": 3.0495907877047836e-06, + "loss": 0.2507, + "step": 1629 + }, + { + "epoch": 0.7706855791962175, + "grad_norm": 3.415192127227783, + "learning_rate": 3.0377310502021405e-06, + "loss": 0.1726, + "step": 1630 + }, + { + "epoch": 0.7711583924349882, + "grad_norm": 4.835279941558838, + "learning_rate": 3.0258902873098406e-06, + "loss": 0.1817, + "step": 1631 + }, + { + "epoch": 0.7716312056737589, + "grad_norm": 6.565097808837891, + "learning_rate": 3.014068531298089e-06, + "loss": 0.2459, + "step": 1632 + }, + { + "epoch": 0.7721040189125296, + "grad_norm": 6.267763614654541, + "learning_rate": 3.0022658143852923e-06, + "loss": 0.2536, + "step": 1633 + }, + { + "epoch": 0.7725768321513002, + "grad_norm": 4.700669765472412, + "learning_rate": 2.990482168737967e-06, + "loss": 0.161, + "step": 1634 + }, + { + "epoch": 0.7730496453900709, + "grad_norm": 6.653107166290283, + "learning_rate": 2.978717626470663e-06, + "loss": 0.2972, + "step": 1635 + }, + { + "epoch": 0.7735224586288416, + "grad_norm": 5.026117324829102, + "learning_rate": 2.966972219645855e-06, + "loss": 0.237, + "step": 1636 + }, + { + "epoch": 0.7739952718676123, + "grad_norm": 6.812114238739014, + "learning_rate": 2.9552459802738733e-06, + "loss": 0.1928, + "step": 1637 + }, + { + "epoch": 0.774468085106383, + "grad_norm": 5.485900402069092, + "learning_rate": 2.943538940312807e-06, + "loss": 0.1795, + "step": 1638 + }, + { + "epoch": 0.7749408983451537, + "grad_norm": 4.413486003875732, + "learning_rate": 2.931851131668423e-06, + "loss": 0.2056, + "step": 1639 + }, + { + "epoch": 0.7754137115839244, + "grad_norm": 6.196752071380615, + "learning_rate": 2.920182586194075e-06, + "loss": 0.2587, + "step": 1640 + }, + { + "epoch": 0.7754137115839244, + "eval_accuracy": 0.8647450110864745, + "eval_f1": 0.7067307692307693, + "eval_loss": 0.31787875294685364, + "eval_precision": 0.901840490797546, + "eval_recall": 0.5810276679841897, + "eval_runtime": 48.6346, + "eval_samples_per_second": 5.675, + "eval_steps_per_second": 0.185, + "step": 1640 + }, + { + "epoch": 0.775886524822695, + "grad_norm": 4.845483303070068, + "learning_rate": 2.9085333356906165e-06, + "loss": 0.1588, + "step": 1641 + }, + { + "epoch": 0.7763593380614657, + "grad_norm": 5.5834479331970215, + "learning_rate": 2.8969034119063176e-06, + "loss": 0.2241, + "step": 1642 + }, + { + "epoch": 0.7768321513002364, + "grad_norm": 5.414841175079346, + "learning_rate": 2.8852928465367726e-06, + "loss": 0.2914, + "step": 1643 + }, + { + "epoch": 0.7773049645390071, + "grad_norm": 6.023651599884033, + "learning_rate": 2.8737016712248258e-06, + "loss": 0.2307, + "step": 1644 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 4.881513595581055, + "learning_rate": 2.862129917560469e-06, + "loss": 0.1618, + "step": 1645 + }, + { + "epoch": 0.7782505910165485, + "grad_norm": 5.7092814445495605, + "learning_rate": 2.850577617080764e-06, + "loss": 0.2415, + "step": 1646 + }, + { + "epoch": 0.7787234042553192, + "grad_norm": 6.310904026031494, + "learning_rate": 2.839044801269756e-06, + "loss": 0.2487, + "step": 1647 + }, + { + "epoch": 0.7791962174940898, + "grad_norm": 8.262921333312988, + "learning_rate": 2.827531501558395e-06, + "loss": 0.2799, + "step": 1648 + }, + { + "epoch": 0.7796690307328605, + "grad_norm": 6.071582317352295, + "learning_rate": 2.8160377493244363e-06, + "loss": 0.2469, + "step": 1649 + }, + { + "epoch": 0.7801418439716312, + "grad_norm": 4.781665802001953, + "learning_rate": 2.8045635758923563e-06, + "loss": 0.169, + "step": 1650 + }, + { + "epoch": 0.7806146572104019, + "grad_norm": 4.784432411193848, + "learning_rate": 2.7931090125332806e-06, + "loss": 0.2056, + "step": 1651 + }, + { + "epoch": 0.7810874704491726, + "grad_norm": 5.871290683746338, + "learning_rate": 2.7816740904648866e-06, + "loss": 0.2034, + "step": 1652 + }, + { + "epoch": 0.7815602836879433, + "grad_norm": 7.995057106018066, + "learning_rate": 2.7702588408513276e-06, + "loss": 0.3481, + "step": 1653 + }, + { + "epoch": 0.782033096926714, + "grad_norm": 4.378397464752197, + "learning_rate": 2.758863294803138e-06, + "loss": 0.182, + "step": 1654 + }, + { + "epoch": 0.7825059101654847, + "grad_norm": 6.422306060791016, + "learning_rate": 2.7474874833771524e-06, + "loss": 0.2954, + "step": 1655 + }, + { + "epoch": 0.7829787234042553, + "grad_norm": 4.291572570800781, + "learning_rate": 2.7361314375764215e-06, + "loss": 0.1982, + "step": 1656 + }, + { + "epoch": 0.783451536643026, + "grad_norm": 4.588647365570068, + "learning_rate": 2.7247951883501343e-06, + "loss": 0.1613, + "step": 1657 + }, + { + "epoch": 0.7839243498817967, + "grad_norm": 5.927759647369385, + "learning_rate": 2.7134787665935213e-06, + "loss": 0.3002, + "step": 1658 + }, + { + "epoch": 0.7843971631205674, + "grad_norm": 6.183173656463623, + "learning_rate": 2.7021822031477773e-06, + "loss": 0.2178, + "step": 1659 + }, + { + "epoch": 0.7848699763593381, + "grad_norm": 6.231297492980957, + "learning_rate": 2.6909055287999698e-06, + "loss": 0.2216, + "step": 1660 + }, + { + "epoch": 0.7848699763593381, + "eval_accuracy": 0.8713968957871396, + "eval_f1": 0.7289719626168224, + "eval_loss": 0.3045533299446106, + "eval_precision": 0.8914285714285715, + "eval_recall": 0.616600790513834, + "eval_runtime": 49.0889, + "eval_samples_per_second": 5.622, + "eval_steps_per_second": 0.183, + "step": 1660 + }, + { + "epoch": 0.7853427895981088, + "grad_norm": 6.300743103027344, + "learning_rate": 2.6796487742829758e-06, + "loss": 0.2452, + "step": 1661 + }, + { + "epoch": 0.7858156028368795, + "grad_norm": 5.218397617340088, + "learning_rate": 2.668411970275374e-06, + "loss": 0.2879, + "step": 1662 + }, + { + "epoch": 0.7862884160756501, + "grad_norm": 5.584065914154053, + "learning_rate": 2.6571951474013734e-06, + "loss": 0.2256, + "step": 1663 + }, + { + "epoch": 0.7867612293144208, + "grad_norm": 6.801513671875, + "learning_rate": 2.6459983362307263e-06, + "loss": 0.2637, + "step": 1664 + }, + { + "epoch": 0.7872340425531915, + "grad_norm": 3.2993884086608887, + "learning_rate": 2.6348215672786435e-06, + "loss": 0.1469, + "step": 1665 + }, + { + "epoch": 0.7877068557919622, + "grad_norm": 5.288236141204834, + "learning_rate": 2.6236648710057244e-06, + "loss": 0.1577, + "step": 1666 + }, + { + "epoch": 0.7881796690307329, + "grad_norm": 4.468508243560791, + "learning_rate": 2.612528277817853e-06, + "loss": 0.188, + "step": 1667 + }, + { + "epoch": 0.7886524822695036, + "grad_norm": 5.90925407409668, + "learning_rate": 2.6014118180661284e-06, + "loss": 0.229, + "step": 1668 + }, + { + "epoch": 0.7891252955082743, + "grad_norm": 5.6658124923706055, + "learning_rate": 2.590315522046779e-06, + "loss": 0.2122, + "step": 1669 + }, + { + "epoch": 0.789598108747045, + "grad_norm": 6.5475921630859375, + "learning_rate": 2.5792394200010805e-06, + "loss": 0.2117, + "step": 1670 + }, + { + "epoch": 0.7900709219858156, + "grad_norm": 5.4423112869262695, + "learning_rate": 2.5681835421152736e-06, + "loss": 0.2646, + "step": 1671 + }, + { + "epoch": 0.7905437352245863, + "grad_norm": 4.928060531616211, + "learning_rate": 2.5571479185204785e-06, + "loss": 0.2074, + "step": 1672 + }, + { + "epoch": 0.791016548463357, + "grad_norm": 5.3929243087768555, + "learning_rate": 2.546132579292616e-06, + "loss": 0.2094, + "step": 1673 + }, + { + "epoch": 0.7914893617021277, + "grad_norm": 6.514987468719482, + "learning_rate": 2.5351375544523306e-06, + "loss": 0.2043, + "step": 1674 + }, + { + "epoch": 0.7919621749408984, + "grad_norm": 4.381026268005371, + "learning_rate": 2.524162873964896e-06, + "loss": 0.2086, + "step": 1675 + }, + { + "epoch": 0.7924349881796691, + "grad_norm": 5.139743328094482, + "learning_rate": 2.513208567740144e-06, + "loss": 0.1823, + "step": 1676 + }, + { + "epoch": 0.7929078014184398, + "grad_norm": 4.477554798126221, + "learning_rate": 2.502274665632377e-06, + "loss": 0.1828, + "step": 1677 + }, + { + "epoch": 0.7933806146572104, + "grad_norm": 7.049522399902344, + "learning_rate": 2.491361197440291e-06, + "loss": 0.2235, + "step": 1678 + }, + { + "epoch": 0.7938534278959811, + "grad_norm": 5.63670539855957, + "learning_rate": 2.4804681929068907e-06, + "loss": 0.2404, + "step": 1679 + }, + { + "epoch": 0.7943262411347518, + "grad_norm": 5.916224956512451, + "learning_rate": 2.4695956817194134e-06, + "loss": 0.2357, + "step": 1680 + }, + { + "epoch": 0.7943262411347518, + "eval_accuracy": 0.8669623059866962, + "eval_f1": 0.7272727272727273, + "eval_loss": 0.2966913878917694, + "eval_precision": 0.8556149732620321, + "eval_recall": 0.6324110671936759, + "eval_runtime": 48.6495, + "eval_samples_per_second": 5.673, + "eval_steps_per_second": 0.185, + "step": 1680 + }, + { + "epoch": 0.7947990543735225, + "grad_norm": 6.325137138366699, + "learning_rate": 2.4587436935092424e-06, + "loss": 0.2087, + "step": 1681 + }, + { + "epoch": 0.7952718676122932, + "grad_norm": 4.733521461486816, + "learning_rate": 2.4479122578518257e-06, + "loss": 0.2256, + "step": 1682 + }, + { + "epoch": 0.7957446808510639, + "grad_norm": 4.68524169921875, + "learning_rate": 2.4371014042666074e-06, + "loss": 0.2188, + "step": 1683 + }, + { + "epoch": 0.7962174940898346, + "grad_norm": 6.98213529586792, + "learning_rate": 2.42631116221693e-06, + "loss": 0.2244, + "step": 1684 + }, + { + "epoch": 0.7966903073286052, + "grad_norm": 6.548198223114014, + "learning_rate": 2.4155415611099664e-06, + "loss": 0.2656, + "step": 1685 + }, + { + "epoch": 0.7971631205673759, + "grad_norm": 5.059558391571045, + "learning_rate": 2.404792630296633e-06, + "loss": 0.1769, + "step": 1686 + }, + { + "epoch": 0.7976359338061466, + "grad_norm": 6.935822010040283, + "learning_rate": 2.394064399071515e-06, + "loss": 0.2474, + "step": 1687 + }, + { + "epoch": 0.7981087470449173, + "grad_norm": 6.197619438171387, + "learning_rate": 2.3833568966727837e-06, + "loss": 0.2132, + "step": 1688 + }, + { + "epoch": 0.798581560283688, + "grad_norm": 6.507517337799072, + "learning_rate": 2.372670152282114e-06, + "loss": 0.2625, + "step": 1689 + }, + { + "epoch": 0.7990543735224587, + "grad_norm": 8.216846466064453, + "learning_rate": 2.362004195024613e-06, + "loss": 0.2823, + "step": 1690 + }, + { + "epoch": 0.7995271867612294, + "grad_norm": 5.132957935333252, + "learning_rate": 2.351359053968728e-06, + "loss": 0.1989, + "step": 1691 + }, + { + "epoch": 0.8, + "grad_norm": 6.100037574768066, + "learning_rate": 2.3407347581261863e-06, + "loss": 0.2593, + "step": 1692 + }, + { + "epoch": 0.8004728132387707, + "grad_norm": 8.391918182373047, + "learning_rate": 2.3301313364518964e-06, + "loss": 0.3208, + "step": 1693 + }, + { + "epoch": 0.8009456264775414, + "grad_norm": 4.401480674743652, + "learning_rate": 2.3195488178438785e-06, + "loss": 0.1518, + "step": 1694 + }, + { + "epoch": 0.8014184397163121, + "grad_norm": 6.447848796844482, + "learning_rate": 2.308987231143186e-06, + "loss": 0.2173, + "step": 1695 + }, + { + "epoch": 0.8018912529550828, + "grad_norm": 6.483435153961182, + "learning_rate": 2.298446605133824e-06, + "loss": 0.1744, + "step": 1696 + }, + { + "epoch": 0.8023640661938535, + "grad_norm": 5.87816858291626, + "learning_rate": 2.2879269685426742e-06, + "loss": 0.1838, + "step": 1697 + }, + { + "epoch": 0.8028368794326242, + "grad_norm": 8.297409057617188, + "learning_rate": 2.2774283500394134e-06, + "loss": 0.2732, + "step": 1698 + }, + { + "epoch": 0.8033096926713948, + "grad_norm": 9.192248344421387, + "learning_rate": 2.2669507782364387e-06, + "loss": 0.3547, + "step": 1699 + }, + { + "epoch": 0.8037825059101655, + "grad_norm": 5.559706211090088, + "learning_rate": 2.2564942816887837e-06, + "loss": 0.1972, + "step": 1700 + }, + { + "epoch": 0.8037825059101655, + "eval_accuracy": 0.8658536585365854, + "eval_f1": 0.7218390804597701, + "eval_loss": 0.30020540952682495, + "eval_precision": 0.8626373626373627, + "eval_recall": 0.6205533596837944, + "eval_runtime": 48.6466, + "eval_samples_per_second": 5.674, + "eval_steps_per_second": 0.185, + "step": 1700 + }, + { + "epoch": 0.8042553191489362, + "grad_norm": 5.664395332336426, + "learning_rate": 2.2460588888940504e-06, + "loss": 0.2274, + "step": 1701 + }, + { + "epoch": 0.8047281323877069, + "grad_norm": 5.178661346435547, + "learning_rate": 2.235644628292323e-06, + "loss": 0.2305, + "step": 1702 + }, + { + "epoch": 0.8052009456264776, + "grad_norm": 6.627544403076172, + "learning_rate": 2.225251528266089e-06, + "loss": 0.2816, + "step": 1703 + }, + { + "epoch": 0.8056737588652483, + "grad_norm": 5.081453323364258, + "learning_rate": 2.214879617140171e-06, + "loss": 0.1905, + "step": 1704 + }, + { + "epoch": 0.806146572104019, + "grad_norm": 6.601840496063232, + "learning_rate": 2.204528923181648e-06, + "loss": 0.2067, + "step": 1705 + }, + { + "epoch": 0.8066193853427897, + "grad_norm": 5.457771301269531, + "learning_rate": 2.194199474599763e-06, + "loss": 0.2434, + "step": 1706 + }, + { + "epoch": 0.8070921985815603, + "grad_norm": 6.326608657836914, + "learning_rate": 2.1838912995458673e-06, + "loss": 0.2722, + "step": 1707 + }, + { + "epoch": 0.807565011820331, + "grad_norm": 5.285124778747559, + "learning_rate": 2.1736044261133305e-06, + "loss": 0.2349, + "step": 1708 + }, + { + "epoch": 0.8080378250591016, + "grad_norm": 7.561131477355957, + "learning_rate": 2.1633388823374722e-06, + "loss": 0.2804, + "step": 1709 + }, + { + "epoch": 0.8085106382978723, + "grad_norm": 5.1685309410095215, + "learning_rate": 2.153094696195478e-06, + "loss": 0.179, + "step": 1710 + }, + { + "epoch": 0.808983451536643, + "grad_norm": 5.682158470153809, + "learning_rate": 2.1428718956063253e-06, + "loss": 0.2478, + "step": 1711 + }, + { + "epoch": 0.8094562647754137, + "grad_norm": 6.980831146240234, + "learning_rate": 2.132670508430711e-06, + "loss": 0.1889, + "step": 1712 + }, + { + "epoch": 0.8099290780141843, + "grad_norm": 4.640564441680908, + "learning_rate": 2.1224905624709692e-06, + "loss": 0.1338, + "step": 1713 + }, + { + "epoch": 0.810401891252955, + "grad_norm": 5.731657981872559, + "learning_rate": 2.112332085471006e-06, + "loss": 0.2535, + "step": 1714 + }, + { + "epoch": 0.8108747044917257, + "grad_norm": 5.084789276123047, + "learning_rate": 2.102195105116215e-06, + "loss": 0.271, + "step": 1715 + }, + { + "epoch": 0.8113475177304964, + "grad_norm": 5.400364875793457, + "learning_rate": 2.092079649033395e-06, + "loss": 0.1942, + "step": 1716 + }, + { + "epoch": 0.8118203309692671, + "grad_norm": 4.337155818939209, + "learning_rate": 2.081985744790691e-06, + "loss": 0.1472, + "step": 1717 + }, + { + "epoch": 0.8122931442080378, + "grad_norm": 4.337014675140381, + "learning_rate": 2.0719134198975187e-06, + "loss": 0.1988, + "step": 1718 + }, + { + "epoch": 0.8127659574468085, + "grad_norm": 4.499454498291016, + "learning_rate": 2.06186270180447e-06, + "loss": 0.1955, + "step": 1719 + }, + { + "epoch": 0.8132387706855791, + "grad_norm": 6.413626670837402, + "learning_rate": 2.051833617903257e-06, + "loss": 0.2602, + "step": 1720 + }, + { + "epoch": 0.8132387706855791, + "eval_accuracy": 0.8669623059866962, + "eval_f1": 0.7209302325581395, + "eval_loss": 0.30013346672058105, + "eval_precision": 0.8757062146892656, + "eval_recall": 0.6126482213438735, + "eval_runtime": 48.9633, + "eval_samples_per_second": 5.637, + "eval_steps_per_second": 0.184, + "step": 1720 + }, + { + "epoch": 0.8137115839243498, + "grad_norm": 6.098957538604736, + "learning_rate": 2.041826195526627e-06, + "loss": 0.3015, + "step": 1721 + }, + { + "epoch": 0.8141843971631205, + "grad_norm": 6.596921443939209, + "learning_rate": 2.031840461948301e-06, + "loss": 0.3017, + "step": 1722 + }, + { + "epoch": 0.8146572104018912, + "grad_norm": 6.857213497161865, + "learning_rate": 2.021876444382882e-06, + "loss": 0.2186, + "step": 1723 + }, + { + "epoch": 0.8151300236406619, + "grad_norm": 5.775735378265381, + "learning_rate": 2.011934169985792e-06, + "loss": 0.2594, + "step": 1724 + }, + { + "epoch": 0.8156028368794326, + "grad_norm": 6.679286956787109, + "learning_rate": 2.0020136658531964e-06, + "loss": 0.3236, + "step": 1725 + }, + { + "epoch": 0.8160756501182033, + "grad_norm": 5.931440353393555, + "learning_rate": 1.9921149590219213e-06, + "loss": 0.2564, + "step": 1726 + }, + { + "epoch": 0.816548463356974, + "grad_norm": 4.7152581214904785, + "learning_rate": 1.9822380764694027e-06, + "loss": 0.1825, + "step": 1727 + }, + { + "epoch": 0.8170212765957446, + "grad_norm": 5.601851940155029, + "learning_rate": 1.972383045113585e-06, + "loss": 0.2052, + "step": 1728 + }, + { + "epoch": 0.8174940898345153, + "grad_norm": 5.7181501388549805, + "learning_rate": 1.962549891812865e-06, + "loss": 0.2467, + "step": 1729 + }, + { + "epoch": 0.817966903073286, + "grad_norm": 11.021007537841797, + "learning_rate": 1.952738643366011e-06, + "loss": 0.3405, + "step": 1730 + }, + { + "epoch": 0.8184397163120567, + "grad_norm": 5.372162342071533, + "learning_rate": 1.9429493265121026e-06, + "loss": 0.2504, + "step": 1731 + }, + { + "epoch": 0.8189125295508274, + "grad_norm": 4.431525707244873, + "learning_rate": 1.9331819679304376e-06, + "loss": 0.2204, + "step": 1732 + }, + { + "epoch": 0.8193853427895981, + "grad_norm": 4.9457268714904785, + "learning_rate": 1.923436594240473e-06, + "loss": 0.1216, + "step": 1733 + }, + { + "epoch": 0.8198581560283688, + "grad_norm": 6.442474365234375, + "learning_rate": 1.9137132320017505e-06, + "loss": 0.2644, + "step": 1734 + }, + { + "epoch": 0.8203309692671394, + "grad_norm": 4.034286975860596, + "learning_rate": 1.904011907713823e-06, + "loss": 0.1679, + "step": 1735 + }, + { + "epoch": 0.8208037825059101, + "grad_norm": 5.20193338394165, + "learning_rate": 1.8943326478161806e-06, + "loss": 0.1667, + "step": 1736 + }, + { + "epoch": 0.8212765957446808, + "grad_norm": 5.608093738555908, + "learning_rate": 1.8846754786881816e-06, + "loss": 0.2191, + "step": 1737 + }, + { + "epoch": 0.8217494089834515, + "grad_norm": 4.049744606018066, + "learning_rate": 1.8750404266489796e-06, + "loss": 0.186, + "step": 1738 + }, + { + "epoch": 0.8222222222222222, + "grad_norm": 5.3404436111450195, + "learning_rate": 1.8654275179574477e-06, + "loss": 0.2918, + "step": 1739 + }, + { + "epoch": 0.8226950354609929, + "grad_norm": 6.187091827392578, + "learning_rate": 1.855836778812118e-06, + "loss": 0.1873, + "step": 1740 + }, + { + "epoch": 0.8226950354609929, + "eval_accuracy": 0.8647450110864745, + "eval_f1": 0.7136150234741784, + "eval_loss": 0.30463746190071106, + "eval_precision": 0.8786127167630058, + "eval_recall": 0.6007905138339921, + "eval_runtime": 49.7049, + "eval_samples_per_second": 5.553, + "eval_steps_per_second": 0.181, + "step": 1740 + }, + { + "epoch": 0.8231678486997636, + "grad_norm": 5.610990524291992, + "learning_rate": 1.8462682353510974e-06, + "loss": 0.258, + "step": 1741 + }, + { + "epoch": 0.8236406619385342, + "grad_norm": 7.3444318771362305, + "learning_rate": 1.836721913652002e-06, + "loss": 0.2804, + "step": 1742 + }, + { + "epoch": 0.8241134751773049, + "grad_norm": 5.178664684295654, + "learning_rate": 1.8271978397318868e-06, + "loss": 0.2232, + "step": 1743 + }, + { + "epoch": 0.8245862884160756, + "grad_norm": 7.083937168121338, + "learning_rate": 1.8176960395471754e-06, + "loss": 0.306, + "step": 1744 + }, + { + "epoch": 0.8250591016548463, + "grad_norm": 5.501087188720703, + "learning_rate": 1.8082165389935836e-06, + "loss": 0.2164, + "step": 1745 + }, + { + "epoch": 0.825531914893617, + "grad_norm": 6.170442581176758, + "learning_rate": 1.7987593639060586e-06, + "loss": 0.2403, + "step": 1746 + }, + { + "epoch": 0.8260047281323877, + "grad_norm": 6.09285306930542, + "learning_rate": 1.7893245400586967e-06, + "loss": 0.2852, + "step": 1747 + }, + { + "epoch": 0.8264775413711584, + "grad_norm": 5.350312232971191, + "learning_rate": 1.7799120931646819e-06, + "loss": 0.251, + "step": 1748 + }, + { + "epoch": 0.826950354609929, + "grad_norm": 5.884124755859375, + "learning_rate": 1.7705220488762187e-06, + "loss": 0.2269, + "step": 1749 + }, + { + "epoch": 0.8274231678486997, + "grad_norm": 5.945254325866699, + "learning_rate": 1.7611544327844487e-06, + "loss": 0.206, + "step": 1750 + }, + { + "epoch": 0.8278959810874704, + "grad_norm": 4.885594844818115, + "learning_rate": 1.7518092704193913e-06, + "loss": 0.2674, + "step": 1751 + }, + { + "epoch": 0.8283687943262411, + "grad_norm": 5.776851654052734, + "learning_rate": 1.742486587249873e-06, + "loss": 0.2314, + "step": 1752 + }, + { + "epoch": 0.8288416075650118, + "grad_norm": 4.074375629425049, + "learning_rate": 1.733186408683456e-06, + "loss": 0.1666, + "step": 1753 + }, + { + "epoch": 0.8293144208037825, + "grad_norm": 3.9429306983947754, + "learning_rate": 1.7239087600663684e-06, + "loss": 0.2021, + "step": 1754 + }, + { + "epoch": 0.8297872340425532, + "grad_norm": 4.927017688751221, + "learning_rate": 1.714653666683439e-06, + "loss": 0.2131, + "step": 1755 + }, + { + "epoch": 0.8302600472813239, + "grad_norm": 4.356184959411621, + "learning_rate": 1.7054211537580201e-06, + "loss": 0.1633, + "step": 1756 + }, + { + "epoch": 0.8307328605200945, + "grad_norm": 6.772974967956543, + "learning_rate": 1.6962112464519343e-06, + "loss": 0.2083, + "step": 1757 + }, + { + "epoch": 0.8312056737588652, + "grad_norm": 5.196053504943848, + "learning_rate": 1.6870239698653879e-06, + "loss": 0.2203, + "step": 1758 + }, + { + "epoch": 0.8316784869976359, + "grad_norm": 5.340625762939453, + "learning_rate": 1.677859349036911e-06, + "loss": 0.2047, + "step": 1759 + }, + { + "epoch": 0.8321513002364066, + "grad_norm": 3.65738582611084, + "learning_rate": 1.6687174089432934e-06, + "loss": 0.1663, + "step": 1760 + }, + { + "epoch": 0.8321513002364066, + "eval_accuracy": 0.8658536585365854, + "eval_f1": 0.717948717948718, + "eval_loss": 0.2977656424045563, + "eval_precision": 0.875, + "eval_recall": 0.6086956521739131, + "eval_runtime": 47.9133, + "eval_samples_per_second": 5.76, + "eval_steps_per_second": 0.188, + "step": 1760 + }, + { + "epoch": 0.8326241134751773, + "grad_norm": 5.490943431854248, + "learning_rate": 1.659598174499505e-06, + "loss": 0.2443, + "step": 1761 + }, + { + "epoch": 0.833096926713948, + "grad_norm": 5.882487773895264, + "learning_rate": 1.6505016705586475e-06, + "loss": 0.2925, + "step": 1762 + }, + { + "epoch": 0.8335697399527187, + "grad_norm": 4.92294979095459, + "learning_rate": 1.6414279219118568e-06, + "loss": 0.1559, + "step": 1763 + }, + { + "epoch": 0.8340425531914893, + "grad_norm": 8.455697059631348, + "learning_rate": 1.632376953288265e-06, + "loss": 0.3205, + "step": 1764 + }, + { + "epoch": 0.83451536643026, + "grad_norm": 5.372105598449707, + "learning_rate": 1.623348789354916e-06, + "loss": 0.2579, + "step": 1765 + }, + { + "epoch": 0.8349881796690307, + "grad_norm": 6.4145307540893555, + "learning_rate": 1.614343454716707e-06, + "loss": 0.2467, + "step": 1766 + }, + { + "epoch": 0.8354609929078014, + "grad_norm": 5.563887119293213, + "learning_rate": 1.6053609739163134e-06, + "loss": 0.192, + "step": 1767 + }, + { + "epoch": 0.8359338061465721, + "grad_norm": 4.483576774597168, + "learning_rate": 1.5964013714341275e-06, + "loss": 0.1964, + "step": 1768 + }, + { + "epoch": 0.8364066193853428, + "grad_norm": 6.770689010620117, + "learning_rate": 1.587464671688187e-06, + "loss": 0.2926, + "step": 1769 + }, + { + "epoch": 0.8368794326241135, + "grad_norm": 3.7878456115722656, + "learning_rate": 1.5785508990341192e-06, + "loss": 0.1907, + "step": 1770 + }, + { + "epoch": 0.8373522458628841, + "grad_norm": 7.301677227020264, + "learning_rate": 1.5696600777650606e-06, + "loss": 0.2305, + "step": 1771 + }, + { + "epoch": 0.8378250591016548, + "grad_norm": 3.8965413570404053, + "learning_rate": 1.560792232111601e-06, + "loss": 0.1244, + "step": 1772 + }, + { + "epoch": 0.8382978723404255, + "grad_norm": 5.381835460662842, + "learning_rate": 1.551947386241708e-06, + "loss": 0.2294, + "step": 1773 + }, + { + "epoch": 0.8387706855791962, + "grad_norm": 3.8923234939575195, + "learning_rate": 1.543125564260668e-06, + "loss": 0.1775, + "step": 1774 + }, + { + "epoch": 0.8392434988179669, + "grad_norm": 4.11511754989624, + "learning_rate": 1.5343267902110282e-06, + "loss": 0.1614, + "step": 1775 + }, + { + "epoch": 0.8397163120567376, + "grad_norm": 4.022883892059326, + "learning_rate": 1.5255510880725133e-06, + "loss": 0.2149, + "step": 1776 + }, + { + "epoch": 0.8401891252955083, + "grad_norm": 5.9938836097717285, + "learning_rate": 1.5167984817619709e-06, + "loss": 0.2138, + "step": 1777 + }, + { + "epoch": 0.840661938534279, + "grad_norm": 6.684109210968018, + "learning_rate": 1.5080689951333017e-06, + "loss": 0.2798, + "step": 1778 + }, + { + "epoch": 0.8411347517730496, + "grad_norm": 5.152201175689697, + "learning_rate": 1.4993626519774073e-06, + "loss": 0.2239, + "step": 1779 + }, + { + "epoch": 0.8416075650118203, + "grad_norm": 8.338132858276367, + "learning_rate": 1.4906794760221032e-06, + "loss": 0.363, + "step": 1780 + }, + { + "epoch": 0.8416075650118203, + "eval_accuracy": 0.8647450110864745, + "eval_f1": 0.7149532710280374, + "eval_loss": 0.2977047264575958, + "eval_precision": 0.8742857142857143, + "eval_recall": 0.6047430830039525, + "eval_runtime": 48.5608, + "eval_samples_per_second": 5.684, + "eval_steps_per_second": 0.185, + "step": 1780 + }, + { + "epoch": 0.842080378250591, + "grad_norm": 5.853981971740723, + "learning_rate": 1.482019490932074e-06, + "loss": 0.2162, + "step": 1781 + }, + { + "epoch": 0.8425531914893617, + "grad_norm": 5.779383182525635, + "learning_rate": 1.473382720308797e-06, + "loss": 0.2139, + "step": 1782 + }, + { + "epoch": 0.8430260047281324, + "grad_norm": 5.416299343109131, + "learning_rate": 1.4647691876904835e-06, + "loss": 0.1742, + "step": 1783 + }, + { + "epoch": 0.8434988179669031, + "grad_norm": 6.843648433685303, + "learning_rate": 1.4561789165520136e-06, + "loss": 0.3138, + "step": 1784 + }, + { + "epoch": 0.8439716312056738, + "grad_norm": 6.401846885681152, + "learning_rate": 1.4476119303048709e-06, + "loss": 0.259, + "step": 1785 + }, + { + "epoch": 0.8444444444444444, + "grad_norm": 6.052865982055664, + "learning_rate": 1.43906825229708e-06, + "loss": 0.2637, + "step": 1786 + }, + { + "epoch": 0.8449172576832151, + "grad_norm": 5.836279392242432, + "learning_rate": 1.4305479058131389e-06, + "loss": 0.2327, + "step": 1787 + }, + { + "epoch": 0.8453900709219858, + "grad_norm": 6.56742525100708, + "learning_rate": 1.4220509140739692e-06, + "loss": 0.2571, + "step": 1788 + }, + { + "epoch": 0.8458628841607565, + "grad_norm": 5.522575378417969, + "learning_rate": 1.4135773002368314e-06, + "loss": 0.1913, + "step": 1789 + }, + { + "epoch": 0.8463356973995272, + "grad_norm": 6.318970203399658, + "learning_rate": 1.4051270873952794e-06, + "loss": 0.2334, + "step": 1790 + }, + { + "epoch": 0.8468085106382979, + "grad_norm": 5.222054958343506, + "learning_rate": 1.3967002985790878e-06, + "loss": 0.2156, + "step": 1791 + }, + { + "epoch": 0.8472813238770686, + "grad_norm": 4.232369422912598, + "learning_rate": 1.3882969567541959e-06, + "loss": 0.2233, + "step": 1792 + }, + { + "epoch": 0.8477541371158392, + "grad_norm": 3.875591993331909, + "learning_rate": 1.3799170848226395e-06, + "loss": 0.1502, + "step": 1793 + }, + { + "epoch": 0.8482269503546099, + "grad_norm": 7.354650020599365, + "learning_rate": 1.37156070562249e-06, + "loss": 0.249, + "step": 1794 + }, + { + "epoch": 0.8486997635933806, + "grad_norm": 5.512684345245361, + "learning_rate": 1.3632278419277933e-06, + "loss": 0.2428, + "step": 1795 + }, + { + "epoch": 0.8491725768321513, + "grad_norm": 8.732033729553223, + "learning_rate": 1.3549185164485135e-06, + "loss": 0.3614, + "step": 1796 + }, + { + "epoch": 0.849645390070922, + "grad_norm": 7.074683666229248, + "learning_rate": 1.3466327518304555e-06, + "loss": 0.3366, + "step": 1797 + }, + { + "epoch": 0.8501182033096927, + "grad_norm": 4.561595916748047, + "learning_rate": 1.3383705706552174e-06, + "loss": 0.2038, + "step": 1798 + }, + { + "epoch": 0.8505910165484634, + "grad_norm": 4.308145523071289, + "learning_rate": 1.3301319954401248e-06, + "loss": 0.2003, + "step": 1799 + }, + { + "epoch": 0.851063829787234, + "grad_norm": 4.1011552810668945, + "learning_rate": 1.3219170486381671e-06, + "loss": 0.1727, + "step": 1800 + }, + { + "epoch": 0.851063829787234, + "eval_accuracy": 0.8658536585365854, + "eval_f1": 0.717948717948718, + "eval_loss": 0.29891377687454224, + "eval_precision": 0.875, + "eval_recall": 0.6086956521739131, + "eval_runtime": 49.9353, + "eval_samples_per_second": 5.527, + "eval_steps_per_second": 0.18, + "step": 1800 + }, + { + "epoch": 0.8515366430260047, + "grad_norm": 4.582373142242432, + "learning_rate": 1.3137257526379366e-06, + "loss": 0.1734, + "step": 1801 + }, + { + "epoch": 0.8520094562647754, + "grad_norm": 5.546594619750977, + "learning_rate": 1.3055581297635734e-06, + "loss": 0.1714, + "step": 1802 + }, + { + "epoch": 0.8524822695035461, + "grad_norm": 6.5857696533203125, + "learning_rate": 1.2974142022746971e-06, + "loss": 0.2197, + "step": 1803 + }, + { + "epoch": 0.8529550827423168, + "grad_norm": 3.709681987762451, + "learning_rate": 1.289293992366346e-06, + "loss": 0.1239, + "step": 1804 + }, + { + "epoch": 0.8534278959810875, + "grad_norm": 3.4839696884155273, + "learning_rate": 1.2811975221689289e-06, + "loss": 0.1857, + "step": 1805 + }, + { + "epoch": 0.8539007092198582, + "grad_norm": 4.332188606262207, + "learning_rate": 1.2731248137481468e-06, + "loss": 0.2506, + "step": 1806 + }, + { + "epoch": 0.8543735224586289, + "grad_norm": 5.268299579620361, + "learning_rate": 1.2650758891049464e-06, + "loss": 0.2326, + "step": 1807 + }, + { + "epoch": 0.8548463356973995, + "grad_norm": 4.915356636047363, + "learning_rate": 1.257050770175452e-06, + "loss": 0.1439, + "step": 1808 + }, + { + "epoch": 0.8553191489361702, + "grad_norm": 5.468433856964111, + "learning_rate": 1.2490494788309115e-06, + "loss": 0.2292, + "step": 1809 + }, + { + "epoch": 0.8557919621749409, + "grad_norm": 8.169693946838379, + "learning_rate": 1.241072036877633e-06, + "loss": 0.2483, + "step": 1810 + }, + { + "epoch": 0.8562647754137116, + "grad_norm": 4.5563063621521, + "learning_rate": 1.2331184660569284e-06, + "loss": 0.213, + "step": 1811 + }, + { + "epoch": 0.8567375886524823, + "grad_norm": 5.421559810638428, + "learning_rate": 1.2251887880450498e-06, + "loss": 0.2602, + "step": 1812 + }, + { + "epoch": 0.857210401891253, + "grad_norm": 5.384074687957764, + "learning_rate": 1.217283024453133e-06, + "loss": 0.2221, + "step": 1813 + }, + { + "epoch": 0.8576832151300237, + "grad_norm": 4.89252233505249, + "learning_rate": 1.2094011968271447e-06, + "loss": 0.1907, + "step": 1814 + }, + { + "epoch": 0.8581560283687943, + "grad_norm": 6.57568359375, + "learning_rate": 1.2015433266478105e-06, + "loss": 0.2879, + "step": 1815 + }, + { + "epoch": 0.858628841607565, + "grad_norm": 7.750422954559326, + "learning_rate": 1.1937094353305679e-06, + "loss": 0.2787, + "step": 1816 + }, + { + "epoch": 0.8591016548463357, + "grad_norm": 5.562004089355469, + "learning_rate": 1.1858995442254984e-06, + "loss": 0.2327, + "step": 1817 + }, + { + "epoch": 0.8595744680851064, + "grad_norm": 5.795264720916748, + "learning_rate": 1.178113674617285e-06, + "loss": 0.2329, + "step": 1818 + }, + { + "epoch": 0.8600472813238771, + "grad_norm": 4.586887836456299, + "learning_rate": 1.1703518477251296e-06, + "loss": 0.2449, + "step": 1819 + }, + { + "epoch": 0.8605200945626478, + "grad_norm": 5.4445648193359375, + "learning_rate": 1.1626140847027211e-06, + "loss": 0.1995, + "step": 1820 + }, + { + "epoch": 0.8605200945626478, + "eval_accuracy": 0.8625277161862528, + "eval_f1": 0.7089201877934272, + "eval_loss": 0.30064335465431213, + "eval_precision": 0.8728323699421965, + "eval_recall": 0.5968379446640316, + "eval_runtime": 47.2996, + "eval_samples_per_second": 5.835, + "eval_steps_per_second": 0.19, + "step": 1820 + }, + { + "epoch": 0.8609929078014185, + "grad_norm": 5.280003070831299, + "learning_rate": 1.154900406638161e-06, + "loss": 0.2273, + "step": 1821 + }, + { + "epoch": 0.8614657210401891, + "grad_norm": 5.873126029968262, + "learning_rate": 1.147210834553908e-06, + "loss": 0.3027, + "step": 1822 + }, + { + "epoch": 0.8619385342789598, + "grad_norm": 5.633482456207275, + "learning_rate": 1.1395453894067322e-06, + "loss": 0.2282, + "step": 1823 + }, + { + "epoch": 0.8624113475177305, + "grad_norm": 7.417043209075928, + "learning_rate": 1.1319040920876412e-06, + "loss": 0.261, + "step": 1824 + }, + { + "epoch": 0.8628841607565012, + "grad_norm": 4.741674900054932, + "learning_rate": 1.1242869634218355e-06, + "loss": 0.2136, + "step": 1825 + }, + { + "epoch": 0.8633569739952719, + "grad_norm": 6.367619037628174, + "learning_rate": 1.1166940241686453e-06, + "loss": 0.2331, + "step": 1826 + }, + { + "epoch": 0.8638297872340426, + "grad_norm": 7.427839756011963, + "learning_rate": 1.1091252950214793e-06, + "loss": 0.2836, + "step": 1827 + }, + { + "epoch": 0.8643026004728133, + "grad_norm": 4.886536598205566, + "learning_rate": 1.1015807966077641e-06, + "loss": 0.2326, + "step": 1828 + }, + { + "epoch": 0.864775413711584, + "grad_norm": 6.093448162078857, + "learning_rate": 1.0940605494888856e-06, + "loss": 0.1806, + "step": 1829 + }, + { + "epoch": 0.8652482269503546, + "grad_norm": 5.902658939361572, + "learning_rate": 1.0865645741601372e-06, + "loss": 0.2035, + "step": 1830 + }, + { + "epoch": 0.8657210401891253, + "grad_norm": 6.096370697021484, + "learning_rate": 1.0790928910506705e-06, + "loss": 0.1924, + "step": 1831 + }, + { + "epoch": 0.866193853427896, + "grad_norm": 6.3981499671936035, + "learning_rate": 1.0716455205234244e-06, + "loss": 0.2536, + "step": 1832 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 5.565893650054932, + "learning_rate": 1.0642224828750803e-06, + "loss": 0.2512, + "step": 1833 + }, + { + "epoch": 0.8671394799054374, + "grad_norm": 6.9906206130981445, + "learning_rate": 1.0568237983360041e-06, + "loss": 0.2001, + "step": 1834 + }, + { + "epoch": 0.8676122931442081, + "grad_norm": 5.311846733093262, + "learning_rate": 1.0494494870701889e-06, + "loss": 0.2169, + "step": 1835 + }, + { + "epoch": 0.8680851063829788, + "grad_norm": 4.316741466522217, + "learning_rate": 1.0420995691752079e-06, + "loss": 0.2258, + "step": 1836 + }, + { + "epoch": 0.8685579196217494, + "grad_norm": 4.4594316482543945, + "learning_rate": 1.034774064682148e-06, + "loss": 0.1806, + "step": 1837 + }, + { + "epoch": 0.8690307328605201, + "grad_norm": 4.841372013092041, + "learning_rate": 1.027472993555565e-06, + "loss": 0.1777, + "step": 1838 + }, + { + "epoch": 0.8695035460992908, + "grad_norm": 4.721182346343994, + "learning_rate": 1.0201963756934164e-06, + "loss": 0.1937, + "step": 1839 + }, + { + "epoch": 0.8699763593380615, + "grad_norm": 5.975325584411621, + "learning_rate": 1.012944230927031e-06, + "loss": 0.154, + "step": 1840 + }, + { + "epoch": 0.8699763593380615, + "eval_accuracy": 0.8680709534368071, + "eval_f1": 0.7264367816091954, + "eval_loss": 0.29660460352897644, + "eval_precision": 0.8681318681318682, + "eval_recall": 0.6245059288537549, + "eval_runtime": 47.2449, + "eval_samples_per_second": 5.842, + "eval_steps_per_second": 0.19, + "step": 1840 + }, + { + "epoch": 0.8704491725768322, + "grad_norm": 4.6500020027160645, + "learning_rate": 1.0057165790210277e-06, + "loss": 0.1928, + "step": 1841 + }, + { + "epoch": 0.8709219858156029, + "grad_norm": 5.256702423095703, + "learning_rate": 9.985134396732798e-07, + "loss": 0.2108, + "step": 1842 + }, + { + "epoch": 0.8713947990543736, + "grad_norm": 4.254281997680664, + "learning_rate": 9.913348325148498e-07, + "loss": 0.2064, + "step": 1843 + }, + { + "epoch": 0.8718676122931442, + "grad_norm": 4.128483772277832, + "learning_rate": 9.841807771099498e-07, + "loss": 0.1908, + "step": 1844 + }, + { + "epoch": 0.8723404255319149, + "grad_norm": 6.125643730163574, + "learning_rate": 9.77051292955873e-07, + "loss": 0.2637, + "step": 1845 + }, + { + "epoch": 0.8728132387706856, + "grad_norm": 5.453957557678223, + "learning_rate": 9.699463994829495e-07, + "loss": 0.2566, + "step": 1846 + }, + { + "epoch": 0.8732860520094563, + "grad_norm": 15.336091041564941, + "learning_rate": 9.628661160544905e-07, + "loss": 0.2678, + "step": 1847 + }, + { + "epoch": 0.873758865248227, + "grad_norm": 7.426636695861816, + "learning_rate": 9.558104619667386e-07, + "loss": 0.1946, + "step": 1848 + }, + { + "epoch": 0.8742316784869977, + "grad_norm": 6.527373313903809, + "learning_rate": 9.487794564488106e-07, + "loss": 0.2772, + "step": 1849 + }, + { + "epoch": 0.8747044917257684, + "grad_norm": 5.806751728057861, + "learning_rate": 9.417731186626466e-07, + "loss": 0.1703, + "step": 1850 + }, + { + "epoch": 0.875177304964539, + "grad_norm": 5.468467712402344, + "learning_rate": 9.347914677029624e-07, + "loss": 0.2873, + "step": 1851 + }, + { + "epoch": 0.8756501182033097, + "grad_norm": 7.120368957519531, + "learning_rate": 9.278345225971863e-07, + "loss": 0.296, + "step": 1852 + }, + { + "epoch": 0.8761229314420804, + "grad_norm": 6.212596893310547, + "learning_rate": 9.209023023054253e-07, + "loss": 0.2348, + "step": 1853 + }, + { + "epoch": 0.8765957446808511, + "grad_norm": 3.766883373260498, + "learning_rate": 9.139948257203934e-07, + "loss": 0.1481, + "step": 1854 + }, + { + "epoch": 0.8770685579196218, + "grad_norm": 3.6534929275512695, + "learning_rate": 9.071121116673731e-07, + "loss": 0.1831, + "step": 1855 + }, + { + "epoch": 0.8775413711583925, + "grad_norm": 6.801371097564697, + "learning_rate": 9.002541789041608e-07, + "loss": 0.257, + "step": 1856 + }, + { + "epoch": 0.8780141843971632, + "grad_norm": 6.545820713043213, + "learning_rate": 8.934210461210136e-07, + "loss": 0.2464, + "step": 1857 + }, + { + "epoch": 0.8784869976359339, + "grad_norm": 7.8918914794921875, + "learning_rate": 8.866127319406004e-07, + "loss": 0.2951, + "step": 1858 + }, + { + "epoch": 0.8789598108747045, + "grad_norm": 7.128468036651611, + "learning_rate": 8.79829254917951e-07, + "loss": 0.3351, + "step": 1859 + }, + { + "epoch": 0.8794326241134752, + "grad_norm": 7.129080772399902, + "learning_rate": 8.73070633540406e-07, + "loss": 0.1821, + "step": 1860 + }, + { + "epoch": 0.8794326241134752, + "eval_accuracy": 0.8669623059866962, + "eval_f1": 0.7235023041474654, + "eval_loss": 0.29677459597587585, + "eval_precision": 0.8674033149171271, + "eval_recall": 0.6205533596837944, + "eval_runtime": 48.3135, + "eval_samples_per_second": 5.713, + "eval_steps_per_second": 0.186, + "step": 1860 + }, + { + "epoch": 0.8799054373522459, + "grad_norm": 5.813145637512207, + "learning_rate": 8.663368862275634e-07, + "loss": 0.2184, + "step": 1861 + }, + { + "epoch": 0.8803782505910166, + "grad_norm": 4.450648307800293, + "learning_rate": 8.596280313312355e-07, + "loss": 0.2037, + "step": 1862 + }, + { + "epoch": 0.8808510638297873, + "grad_norm": 4.639596939086914, + "learning_rate": 8.5294408713539e-07, + "loss": 0.2164, + "step": 1863 + }, + { + "epoch": 0.881323877068558, + "grad_norm": 5.317780017852783, + "learning_rate": 8.462850718561045e-07, + "loss": 0.2591, + "step": 1864 + }, + { + "epoch": 0.8817966903073287, + "grad_norm": 5.928182125091553, + "learning_rate": 8.396510036415173e-07, + "loss": 0.2807, + "step": 1865 + }, + { + "epoch": 0.8822695035460993, + "grad_norm": 8.71645736694336, + "learning_rate": 8.330419005717782e-07, + "loss": 0.3168, + "step": 1866 + }, + { + "epoch": 0.88274231678487, + "grad_norm": 5.529267311096191, + "learning_rate": 8.264577806589968e-07, + "loss": 0.2113, + "step": 1867 + }, + { + "epoch": 0.8832151300236407, + "grad_norm": 4.838929176330566, + "learning_rate": 8.198986618471949e-07, + "loss": 0.1428, + "step": 1868 + }, + { + "epoch": 0.8836879432624114, + "grad_norm": 5.252522945404053, + "learning_rate": 8.133645620122566e-07, + "loss": 0.2061, + "step": 1869 + }, + { + "epoch": 0.8841607565011821, + "grad_norm": 5.35953950881958, + "learning_rate": 8.068554989618871e-07, + "loss": 0.1998, + "step": 1870 + }, + { + "epoch": 0.8846335697399527, + "grad_norm": 5.610535621643066, + "learning_rate": 8.003714904355486e-07, + "loss": 0.1787, + "step": 1871 + }, + { + "epoch": 0.8851063829787233, + "grad_norm": 7.672272205352783, + "learning_rate": 7.939125541044268e-07, + "loss": 0.147, + "step": 1872 + }, + { + "epoch": 0.885579196217494, + "grad_norm": 6.3421711921691895, + "learning_rate": 7.874787075713742e-07, + "loss": 0.2605, + "step": 1873 + }, + { + "epoch": 0.8860520094562647, + "grad_norm": 6.709553241729736, + "learning_rate": 7.810699683708644e-07, + "loss": 0.2765, + "step": 1874 + }, + { + "epoch": 0.8865248226950354, + "grad_norm": 7.121283531188965, + "learning_rate": 7.74686353968952e-07, + "loss": 0.2537, + "step": 1875 + }, + { + "epoch": 0.8869976359338061, + "grad_norm": 7.508021831512451, + "learning_rate": 7.683278817632056e-07, + "loss": 0.2712, + "step": 1876 + }, + { + "epoch": 0.8874704491725768, + "grad_norm": 6.003512859344482, + "learning_rate": 7.619945690826824e-07, + "loss": 0.2222, + "step": 1877 + }, + { + "epoch": 0.8879432624113475, + "grad_norm": 6.198127746582031, + "learning_rate": 7.556864331878633e-07, + "loss": 0.2216, + "step": 1878 + }, + { + "epoch": 0.8884160756501182, + "grad_norm": 4.947995185852051, + "learning_rate": 7.494034912706227e-07, + "loss": 0.1685, + "step": 1879 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 7.408123016357422, + "learning_rate": 7.43145760454167e-07, + "loss": 0.2354, + "step": 1880 + }, + { + "epoch": 0.8888888888888888, + "eval_accuracy": 0.8669623059866962, + "eval_f1": 0.726027397260274, + "eval_loss": 0.2952026128768921, + "eval_precision": 0.8594594594594595, + "eval_recall": 0.6284584980237155, + "eval_runtime": 47.4827, + "eval_samples_per_second": 5.813, + "eval_steps_per_second": 0.19, + "step": 1880 + }, + { + "epoch": 0.8893617021276595, + "grad_norm": 5.347765922546387, + "learning_rate": 7.369132577929938e-07, + "loss": 0.2025, + "step": 1881 + }, + { + "epoch": 0.8898345153664302, + "grad_norm": 5.331334590911865, + "learning_rate": 7.307060002728462e-07, + "loss": 0.2239, + "step": 1882 + }, + { + "epoch": 0.8903073286052009, + "grad_norm": 4.435246467590332, + "learning_rate": 7.245240048106628e-07, + "loss": 0.1737, + "step": 1883 + }, + { + "epoch": 0.8907801418439716, + "grad_norm": 7.154726028442383, + "learning_rate": 7.183672882545401e-07, + "loss": 0.2582, + "step": 1884 + }, + { + "epoch": 0.8912529550827423, + "grad_norm": 4.464818000793457, + "learning_rate": 7.122358673836782e-07, + "loss": 0.1574, + "step": 1885 + }, + { + "epoch": 0.891725768321513, + "grad_norm": 6.102884769439697, + "learning_rate": 7.061297589083327e-07, + "loss": 0.2082, + "step": 1886 + }, + { + "epoch": 0.8921985815602836, + "grad_norm": 5.337555408477783, + "learning_rate": 7.000489794697774e-07, + "loss": 0.237, + "step": 1887 + }, + { + "epoch": 0.8926713947990543, + "grad_norm": 6.383353233337402, + "learning_rate": 6.939935456402613e-07, + "loss": 0.2242, + "step": 1888 + }, + { + "epoch": 0.893144208037825, + "grad_norm": 5.135204792022705, + "learning_rate": 6.879634739229502e-07, + "loss": 0.2586, + "step": 1889 + }, + { + "epoch": 0.8936170212765957, + "grad_norm": 13.136929512023926, + "learning_rate": 6.819587807518924e-07, + "loss": 0.3131, + "step": 1890 + }, + { + "epoch": 0.8940898345153664, + "grad_norm": 5.313321590423584, + "learning_rate": 6.759794824919686e-07, + "loss": 0.2519, + "step": 1891 + }, + { + "epoch": 0.8945626477541371, + "grad_norm": 5.850648403167725, + "learning_rate": 6.700255954388535e-07, + "loss": 0.2373, + "step": 1892 + }, + { + "epoch": 0.8950354609929078, + "grad_norm": 5.615677356719971, + "learning_rate": 6.640971358189651e-07, + "loss": 0.1992, + "step": 1893 + }, + { + "epoch": 0.8955082742316784, + "grad_norm": 6.700225830078125, + "learning_rate": 6.581941197894226e-07, + "loss": 0.2574, + "step": 1894 + }, + { + "epoch": 0.8959810874704491, + "grad_norm": 9.15202808380127, + "learning_rate": 6.523165634380047e-07, + "loss": 0.3287, + "step": 1895 + }, + { + "epoch": 0.8964539007092198, + "grad_norm": 4.325416564941406, + "learning_rate": 6.464644827830945e-07, + "loss": 0.1514, + "step": 1896 + }, + { + "epoch": 0.8969267139479905, + "grad_norm": 5.760486602783203, + "learning_rate": 6.406378937736602e-07, + "loss": 0.2557, + "step": 1897 + }, + { + "epoch": 0.8973995271867612, + "grad_norm": 5.745640754699707, + "learning_rate": 6.348368122891857e-07, + "loss": 0.1752, + "step": 1898 + }, + { + "epoch": 0.8978723404255319, + "grad_norm": 6.136902809143066, + "learning_rate": 6.29061254139639e-07, + "loss": 0.2628, + "step": 1899 + }, + { + "epoch": 0.8983451536643026, + "grad_norm": 10.711871147155762, + "learning_rate": 6.233112350654302e-07, + "loss": 0.3563, + "step": 1900 + }, + { + "epoch": 0.8983451536643026, + "eval_accuracy": 0.8669623059866962, + "eval_f1": 0.7272727272727273, + "eval_loss": 0.2932513654232025, + "eval_precision": 0.8556149732620321, + "eval_recall": 0.6324110671936759, + "eval_runtime": 47.8382, + "eval_samples_per_second": 5.769, + "eval_steps_per_second": 0.188, + "step": 1900 + }, + { + "epoch": 0.8988179669030733, + "grad_norm": 6.118460178375244, + "learning_rate": 6.175867707373695e-07, + "loss": 0.2678, + "step": 1901 + }, + { + "epoch": 0.8992907801418439, + "grad_norm": 5.572527885437012, + "learning_rate": 6.118878767566139e-07, + "loss": 0.2428, + "step": 1902 + }, + { + "epoch": 0.8997635933806146, + "grad_norm": 6.919821262359619, + "learning_rate": 6.062145686546383e-07, + "loss": 0.1785, + "step": 1903 + }, + { + "epoch": 0.9002364066193853, + "grad_norm": 5.680126667022705, + "learning_rate": 6.00566861893186e-07, + "loss": 0.2201, + "step": 1904 + }, + { + "epoch": 0.900709219858156, + "grad_norm": 5.649215221405029, + "learning_rate": 5.949447718642254e-07, + "loss": 0.169, + "step": 1905 + }, + { + "epoch": 0.9011820330969267, + "grad_norm": 6.076656341552734, + "learning_rate": 5.893483138899125e-07, + "loss": 0.219, + "step": 1906 + }, + { + "epoch": 0.9016548463356974, + "grad_norm": 5.83716344833374, + "learning_rate": 5.837775032225479e-07, + "loss": 0.2754, + "step": 1907 + }, + { + "epoch": 0.902127659574468, + "grad_norm": 4.6485819816589355, + "learning_rate": 5.782323550445313e-07, + "loss": 0.2558, + "step": 1908 + }, + { + "epoch": 0.9026004728132387, + "grad_norm": 5.645073890686035, + "learning_rate": 5.727128844683227e-07, + "loss": 0.214, + "step": 1909 + }, + { + "epoch": 0.9030732860520094, + "grad_norm": 7.4083476066589355, + "learning_rate": 5.672191065364097e-07, + "loss": 0.2417, + "step": 1910 + }, + { + "epoch": 0.9035460992907801, + "grad_norm": 6.812260150909424, + "learning_rate": 5.617510362212486e-07, + "loss": 0.3103, + "step": 1911 + }, + { + "epoch": 0.9040189125295508, + "grad_norm": 5.349275588989258, + "learning_rate": 5.563086884252389e-07, + "loss": 0.206, + "step": 1912 + }, + { + "epoch": 0.9044917257683215, + "grad_norm": 5.61432409286499, + "learning_rate": 5.508920779806748e-07, + "loss": 0.2645, + "step": 1913 + }, + { + "epoch": 0.9049645390070922, + "grad_norm": 8.072186470031738, + "learning_rate": 5.455012196497089e-07, + "loss": 0.2231, + "step": 1914 + }, + { + "epoch": 0.9054373522458629, + "grad_norm": 6.193761348724365, + "learning_rate": 5.4013612812431e-07, + "loss": 0.2029, + "step": 1915 + }, + { + "epoch": 0.9059101654846335, + "grad_norm": 3.4695332050323486, + "learning_rate": 5.34796818026222e-07, + "loss": 0.194, + "step": 1916 + }, + { + "epoch": 0.9063829787234042, + "grad_norm": 4.863160133361816, + "learning_rate": 5.294833039069269e-07, + "loss": 0.1776, + "step": 1917 + }, + { + "epoch": 0.9068557919621749, + "grad_norm": 5.608933448791504, + "learning_rate": 5.241956002476045e-07, + "loss": 0.2093, + "step": 1918 + }, + { + "epoch": 0.9073286052009456, + "grad_norm": 4.8589959144592285, + "learning_rate": 5.189337214590895e-07, + "loss": 0.1433, + "step": 1919 + }, + { + "epoch": 0.9078014184397163, + "grad_norm": 5.196472644805908, + "learning_rate": 5.136976818818373e-07, + "loss": 0.2716, + "step": 1920 + }, + { + "epoch": 0.9078014184397163, + "eval_accuracy": 0.8647450110864745, + "eval_f1": 0.7188940092165899, + "eval_loss": 0.296786367893219, + "eval_precision": 0.861878453038674, + "eval_recall": 0.616600790513834, + "eval_runtime": 48.2695, + "eval_samples_per_second": 5.718, + "eval_steps_per_second": 0.186, + "step": 1920 + }, + { + "epoch": 0.908274231678487, + "grad_norm": 5.688756942749023, + "learning_rate": 5.08487495785881e-07, + "loss": 0.1862, + "step": 1921 + }, + { + "epoch": 0.9087470449172577, + "grad_norm": 4.335201263427734, + "learning_rate": 5.03303177370793e-07, + "loss": 0.2051, + "step": 1922 + }, + { + "epoch": 0.9092198581560283, + "grad_norm": 5.4330735206604, + "learning_rate": 4.981447407656504e-07, + "loss": 0.2108, + "step": 1923 + }, + { + "epoch": 0.909692671394799, + "grad_norm": 7.466004848480225, + "learning_rate": 4.930122000289905e-07, + "loss": 0.2334, + "step": 1924 + }, + { + "epoch": 0.9101654846335697, + "grad_norm": 8.651698112487793, + "learning_rate": 4.879055691487767e-07, + "loss": 0.2628, + "step": 1925 + }, + { + "epoch": 0.9106382978723404, + "grad_norm": 6.537608623504639, + "learning_rate": 4.828248620423559e-07, + "loss": 0.2477, + "step": 1926 + }, + { + "epoch": 0.9111111111111111, + "grad_norm": 5.294820308685303, + "learning_rate": 4.77770092556431e-07, + "loss": 0.2558, + "step": 1927 + }, + { + "epoch": 0.9115839243498818, + "grad_norm": 6.457219123840332, + "learning_rate": 4.72741274467009e-07, + "loss": 0.2544, + "step": 1928 + }, + { + "epoch": 0.9120567375886525, + "grad_norm": 6.060578346252441, + "learning_rate": 4.6773842147937234e-07, + "loss": 0.2504, + "step": 1929 + }, + { + "epoch": 0.9125295508274232, + "grad_norm": 6.253387928009033, + "learning_rate": 4.627615472280389e-07, + "loss": 0.2758, + "step": 1930 + }, + { + "epoch": 0.9130023640661938, + "grad_norm": 5.318558692932129, + "learning_rate": 4.5781066527673003e-07, + "loss": 0.1307, + "step": 1931 + }, + { + "epoch": 0.9134751773049645, + "grad_norm": 6.5254316329956055, + "learning_rate": 4.528857891183214e-07, + "loss": 0.2367, + "step": 1932 + }, + { + "epoch": 0.9139479905437352, + "grad_norm": 4.4486165046691895, + "learning_rate": 4.479869321748187e-07, + "loss": 0.1974, + "step": 1933 + }, + { + "epoch": 0.9144208037825059, + "grad_norm": 5.705449104309082, + "learning_rate": 4.431141077973156e-07, + "loss": 0.1546, + "step": 1934 + }, + { + "epoch": 0.9148936170212766, + "grad_norm": 6.80421781539917, + "learning_rate": 4.382673292659545e-07, + "loss": 0.2338, + "step": 1935 + }, + { + "epoch": 0.9153664302600473, + "grad_norm": 6.027945518493652, + "learning_rate": 4.334466097899015e-07, + "loss": 0.2387, + "step": 1936 + }, + { + "epoch": 0.915839243498818, + "grad_norm": 7.638448715209961, + "learning_rate": 4.28651962507296e-07, + "loss": 0.3043, + "step": 1937 + }, + { + "epoch": 0.9163120567375886, + "grad_norm": 5.784573078155518, + "learning_rate": 4.2388340048522325e-07, + "loss": 0.1626, + "step": 1938 + }, + { + "epoch": 0.9167848699763593, + "grad_norm": 7.274070739746094, + "learning_rate": 4.191409367196753e-07, + "loss": 0.3126, + "step": 1939 + }, + { + "epoch": 0.91725768321513, + "grad_norm": 4.2528533935546875, + "learning_rate": 4.1442458413552324e-07, + "loss": 0.1428, + "step": 1940 + }, + { + "epoch": 0.91725768321513, + "eval_accuracy": 0.8636363636363636, + "eval_f1": 0.7146171693735499, + "eval_loss": 0.2969609200954437, + "eval_precision": 0.8651685393258427, + "eval_recall": 0.6086956521739131, + "eval_runtime": 49.0031, + "eval_samples_per_second": 5.632, + "eval_steps_per_second": 0.184, + "step": 1940 + }, + { + "epoch": 0.9177304964539007, + "grad_norm": 6.376473426818848, + "learning_rate": 4.097343555864719e-07, + "loss": 0.3121, + "step": 1941 + }, + { + "epoch": 0.9182033096926714, + "grad_norm": 4.471124172210693, + "learning_rate": 4.0507026385502747e-07, + "loss": 0.2247, + "step": 1942 + }, + { + "epoch": 0.9186761229314421, + "grad_norm": 4.96635103225708, + "learning_rate": 4.0043232165246413e-07, + "loss": 0.1916, + "step": 1943 + }, + { + "epoch": 0.9191489361702128, + "grad_norm": 4.768991947174072, + "learning_rate": 3.958205416187966e-07, + "loss": 0.1832, + "step": 1944 + }, + { + "epoch": 0.9196217494089834, + "grad_norm": 3.4908788204193115, + "learning_rate": 3.9123493632272967e-07, + "loss": 0.1689, + "step": 1945 + }, + { + "epoch": 0.9200945626477541, + "grad_norm": 7.56951379776001, + "learning_rate": 3.8667551826163774e-07, + "loss": 0.2176, + "step": 1946 + }, + { + "epoch": 0.9205673758865248, + "grad_norm": 6.852828502655029, + "learning_rate": 3.821422998615254e-07, + "loss": 0.2735, + "step": 1947 + }, + { + "epoch": 0.9210401891252955, + "grad_norm": 5.238857269287109, + "learning_rate": 3.776352934769911e-07, + "loss": 0.2495, + "step": 1948 + }, + { + "epoch": 0.9215130023640662, + "grad_norm": 6.270791530609131, + "learning_rate": 3.731545113912005e-07, + "loss": 0.2455, + "step": 1949 + }, + { + "epoch": 0.9219858156028369, + "grad_norm": 6.1830034255981445, + "learning_rate": 3.6869996581584746e-07, + "loss": 0.252, + "step": 1950 + }, + { + "epoch": 0.9224586288416076, + "grad_norm": 6.186679840087891, + "learning_rate": 3.6427166889112184e-07, + "loss": 0.2653, + "step": 1951 + }, + { + "epoch": 0.9229314420803783, + "grad_norm": 4.7130126953125, + "learning_rate": 3.5986963268567433e-07, + "loss": 0.1775, + "step": 1952 + }, + { + "epoch": 0.9234042553191489, + "grad_norm": 4.696549892425537, + "learning_rate": 3.5549386919659033e-07, + "loss": 0.2533, + "step": 1953 + }, + { + "epoch": 0.9238770685579196, + "grad_norm": 4.767563819885254, + "learning_rate": 3.5114439034935053e-07, + "loss": 0.2097, + "step": 1954 + }, + { + "epoch": 0.9243498817966903, + "grad_norm": 3.9315848350524902, + "learning_rate": 3.468212079978017e-07, + "loss": 0.1625, + "step": 1955 + }, + { + "epoch": 0.924822695035461, + "grad_norm": 3.269307851791382, + "learning_rate": 3.4252433392412244e-07, + "loss": 0.1028, + "step": 1956 + }, + { + "epoch": 0.9252955082742317, + "grad_norm": 6.641714096069336, + "learning_rate": 3.3825377983879195e-07, + "loss": 0.2642, + "step": 1957 + }, + { + "epoch": 0.9257683215130024, + "grad_norm": 6.65203332901001, + "learning_rate": 3.340095573805613e-07, + "loss": 0.2346, + "step": 1958 + }, + { + "epoch": 0.926241134751773, + "grad_norm": 6.2382025718688965, + "learning_rate": 3.2979167811641567e-07, + "loss": 0.2514, + "step": 1959 + }, + { + "epoch": 0.9267139479905437, + "grad_norm": 4.330326557159424, + "learning_rate": 3.256001535415465e-07, + "loss": 0.2108, + "step": 1960 + }, + { + "epoch": 0.9267139479905437, + "eval_accuracy": 0.8658536585365854, + "eval_f1": 0.7205542725173211, + "eval_loss": 0.29785633087158203, + "eval_precision": 0.8666666666666667, + "eval_recall": 0.616600790513834, + "eval_runtime": 47.846, + "eval_samples_per_second": 5.769, + "eval_steps_per_second": 0.188, + "step": 1960 + }, + { + "epoch": 0.9271867612293144, + "grad_norm": 4.008530139923096, + "learning_rate": 3.214349950793183e-07, + "loss": 0.1599, + "step": 1961 + }, + { + "epoch": 0.9276595744680851, + "grad_norm": 6.942195415496826, + "learning_rate": 3.172962140812419e-07, + "loss": 0.3592, + "step": 1962 + }, + { + "epoch": 0.9281323877068558, + "grad_norm": 4.6299567222595215, + "learning_rate": 3.1318382182693894e-07, + "loss": 0.2181, + "step": 1963 + }, + { + "epoch": 0.9286052009456265, + "grad_norm": 5.631269454956055, + "learning_rate": 3.0909782952410984e-07, + "loss": 0.269, + "step": 1964 + }, + { + "epoch": 0.9290780141843972, + "grad_norm": 4.7120232582092285, + "learning_rate": 3.05038248308509e-07, + "loss": 0.1236, + "step": 1965 + }, + { + "epoch": 0.9295508274231679, + "grad_norm": 7.05232048034668, + "learning_rate": 3.010050892439109e-07, + "loss": 0.2494, + "step": 1966 + }, + { + "epoch": 0.9300236406619385, + "grad_norm": 4.27794885635376, + "learning_rate": 2.9699836332208186e-07, + "loss": 0.1902, + "step": 1967 + }, + { + "epoch": 0.9304964539007092, + "grad_norm": 5.519193172454834, + "learning_rate": 2.930180814627448e-07, + "loss": 0.2123, + "step": 1968 + }, + { + "epoch": 0.9309692671394799, + "grad_norm": 5.249775409698486, + "learning_rate": 2.890642545135569e-07, + "loss": 0.2105, + "step": 1969 + }, + { + "epoch": 0.9314420803782506, + "grad_norm": 6.687892436981201, + "learning_rate": 2.851368932500742e-07, + "loss": 0.2725, + "step": 1970 + }, + { + "epoch": 0.9319148936170213, + "grad_norm": 5.885591506958008, + "learning_rate": 2.8123600837572594e-07, + "loss": 0.261, + "step": 1971 + }, + { + "epoch": 0.932387706855792, + "grad_norm": 4.598552227020264, + "learning_rate": 2.773616105217836e-07, + "loss": 0.1995, + "step": 1972 + }, + { + "epoch": 0.9328605200945627, + "grad_norm": 6.907764434814453, + "learning_rate": 2.7351371024733174e-07, + "loss": 0.2393, + "step": 1973 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 5.189178466796875, + "learning_rate": 2.6969231803923856e-07, + "loss": 0.1963, + "step": 1974 + }, + { + "epoch": 0.933806146572104, + "grad_norm": 5.675337791442871, + "learning_rate": 2.6589744431213313e-07, + "loss": 0.2482, + "step": 1975 + }, + { + "epoch": 0.9342789598108747, + "grad_norm": 4.176632881164551, + "learning_rate": 2.621290994083692e-07, + "loss": 0.1704, + "step": 1976 + }, + { + "epoch": 0.9347517730496454, + "grad_norm": 4.455401420593262, + "learning_rate": 2.5838729359799917e-07, + "loss": 0.2635, + "step": 1977 + }, + { + "epoch": 0.9352245862884161, + "grad_norm": 5.684086799621582, + "learning_rate": 2.546720370787492e-07, + "loss": 0.2496, + "step": 1978 + }, + { + "epoch": 0.9356973995271868, + "grad_norm": 7.903246879577637, + "learning_rate": 2.5098333997598755e-07, + "loss": 0.3008, + "step": 1979 + }, + { + "epoch": 0.9361702127659575, + "grad_norm": 4.9532270431518555, + "learning_rate": 2.4732121234270156e-07, + "loss": 0.1501, + "step": 1980 + }, + { + "epoch": 0.9361702127659575, + "eval_accuracy": 0.8669623059866962, + "eval_f1": 0.7222222222222222, + "eval_loss": 0.2986098527908325, + "eval_precision": 0.8715083798882681, + "eval_recall": 0.616600790513834, + "eval_runtime": 49.3058, + "eval_samples_per_second": 5.598, + "eval_steps_per_second": 0.183, + "step": 1980 + }, + { + "epoch": 0.9366430260047282, + "grad_norm": 4.474625110626221, + "learning_rate": 2.4368566415946536e-07, + "loss": 0.1952, + "step": 1981 + }, + { + "epoch": 0.9371158392434988, + "grad_norm": 7.129388809204102, + "learning_rate": 2.400767053344144e-07, + "loss": 0.2342, + "step": 1982 + }, + { + "epoch": 0.9375886524822695, + "grad_norm": 7.6979780197143555, + "learning_rate": 2.3649434570321984e-07, + "loss": 0.2414, + "step": 1983 + }, + { + "epoch": 0.9380614657210402, + "grad_norm": 5.29350471496582, + "learning_rate": 2.3293859502906192e-07, + "loss": 0.241, + "step": 1984 + }, + { + "epoch": 0.9385342789598109, + "grad_norm": 5.01874303817749, + "learning_rate": 2.2940946300260113e-07, + "loss": 0.2131, + "step": 1985 + }, + { + "epoch": 0.9390070921985816, + "grad_norm": 5.676163673400879, + "learning_rate": 2.2590695924195048e-07, + "loss": 0.3109, + "step": 1986 + }, + { + "epoch": 0.9394799054373523, + "grad_norm": 4.1814045906066895, + "learning_rate": 2.2243109329265545e-07, + "loss": 0.1398, + "step": 1987 + }, + { + "epoch": 0.939952718676123, + "grad_norm": 5.860604763031006, + "learning_rate": 2.1898187462766395e-07, + "loss": 0.2024, + "step": 1988 + }, + { + "epoch": 0.9404255319148936, + "grad_norm": 5.536343574523926, + "learning_rate": 2.1555931264729657e-07, + "loss": 0.2877, + "step": 1989 + }, + { + "epoch": 0.9408983451536643, + "grad_norm": 4.574560642242432, + "learning_rate": 2.121634166792308e-07, + "loss": 0.226, + "step": 1990 + }, + { + "epoch": 0.941371158392435, + "grad_norm": 6.119741439819336, + "learning_rate": 2.087941959784634e-07, + "loss": 0.213, + "step": 1991 + }, + { + "epoch": 0.9418439716312057, + "grad_norm": 5.73854398727417, + "learning_rate": 2.054516597272993e-07, + "loss": 0.2295, + "step": 1992 + }, + { + "epoch": 0.9423167848699764, + "grad_norm": 6.395056247711182, + "learning_rate": 2.021358170353138e-07, + "loss": 0.2884, + "step": 1993 + }, + { + "epoch": 0.9427895981087471, + "grad_norm": 6.370244026184082, + "learning_rate": 1.988466769393349e-07, + "loss": 0.2622, + "step": 1994 + }, + { + "epoch": 0.9432624113475178, + "grad_norm": 5.031834125518799, + "learning_rate": 1.9558424840341428e-07, + "loss": 0.2347, + "step": 1995 + }, + { + "epoch": 0.9437352245862884, + "grad_norm": 4.863191604614258, + "learning_rate": 1.9234854031880856e-07, + "loss": 0.2221, + "step": 1996 + }, + { + "epoch": 0.9442080378250591, + "grad_norm": 7.025779724121094, + "learning_rate": 1.891395615039504e-07, + "loss": 0.2246, + "step": 1997 + }, + { + "epoch": 0.9446808510638298, + "grad_norm": 6.635202407836914, + "learning_rate": 1.859573207044274e-07, + "loss": 0.2719, + "step": 1998 + }, + { + "epoch": 0.9451536643026005, + "grad_norm": 7.9684014320373535, + "learning_rate": 1.8280182659295321e-07, + "loss": 0.3291, + "step": 1999 + }, + { + "epoch": 0.9456264775413712, + "grad_norm": 5.251444339752197, + "learning_rate": 1.7967308776934755e-07, + "loss": 0.2162, + "step": 2000 + }, + { + "epoch": 0.9456264775413712, + "eval_accuracy": 0.8625277161862528, + "eval_f1": 0.7116279069767442, + "eval_loss": 0.29841360449790955, + "eval_precision": 0.864406779661017, + "eval_recall": 0.6047430830039525, + "eval_runtime": 48.9233, + "eval_samples_per_second": 5.641, + "eval_steps_per_second": 0.184, + "step": 2000 + } + ], + "logging_steps": 1, + "max_steps": 2115, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.1185465136093594e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}