diff --git "a/checkpoint-1040/trainer_state.json" "b/checkpoint-1040/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1040/trainer_state.json" @@ -0,0 +1,7450 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 65, + "global_step": 1040, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0019230769230769232, + "grad_norm": 3.5625, + "learning_rate": 0.0, + "loss": 1.1154, + "step": 1 + }, + { + "epoch": 0.0019230769230769232, + "eval_loss": 1.137184739112854, + "eval_runtime": 34.305, + "eval_samples_per_second": 68.299, + "eval_steps_per_second": 17.082, + "step": 1 + }, + { + "epoch": 0.0038461538461538464, + "grad_norm": 3.5, + "learning_rate": 1.9230769230769234e-07, + "loss": 1.1559, + "step": 2 + }, + { + "epoch": 0.0057692307692307696, + "grad_norm": 3.515625, + "learning_rate": 3.846153846153847e-07, + "loss": 1.1031, + "step": 3 + }, + { + "epoch": 0.007692307692307693, + "grad_norm": 3.6875, + "learning_rate": 5.76923076923077e-07, + "loss": 1.1168, + "step": 4 + }, + { + "epoch": 0.009615384615384616, + "grad_norm": 3.78125, + "learning_rate": 7.692307692307694e-07, + "loss": 1.0839, + "step": 5 + }, + { + "epoch": 0.011538461538461539, + "grad_norm": 3.546875, + "learning_rate": 9.615384615384617e-07, + "loss": 1.0938, + "step": 6 + }, + { + "epoch": 0.013461538461538462, + "grad_norm": 3.71875, + "learning_rate": 1.153846153846154e-06, + "loss": 1.1459, + "step": 7 + }, + { + "epoch": 0.015384615384615385, + "grad_norm": 3.671875, + "learning_rate": 1.3461538461538462e-06, + "loss": 1.0944, + "step": 8 + }, + { + "epoch": 0.01730769230769231, + "grad_norm": 3.65625, + "learning_rate": 1.5384615384615387e-06, + "loss": 1.1185, + "step": 9 + }, + { + "epoch": 0.019230769230769232, + "grad_norm": 3.734375, + "learning_rate": 1.7307692307692308e-06, + "loss": 1.0774, + "step": 10 + }, + { + "epoch": 0.021153846153846155, + "grad_norm": 3.46875, + "learning_rate": 1.9230769230769234e-06, + "loss": 1.1568, + "step": 11 + }, + { + "epoch": 0.023076923076923078, + "grad_norm": 3.515625, + "learning_rate": 2.1153846153846155e-06, + "loss": 1.086, + "step": 12 + }, + { + "epoch": 0.025, + "grad_norm": 3.3125, + "learning_rate": 2.307692307692308e-06, + "loss": 1.0978, + "step": 13 + }, + { + "epoch": 0.026923076923076925, + "grad_norm": 3.359375, + "learning_rate": 2.5e-06, + "loss": 1.109, + "step": 14 + }, + { + "epoch": 0.028846153846153848, + "grad_norm": 2.953125, + "learning_rate": 2.6923076923076923e-06, + "loss": 1.0631, + "step": 15 + }, + { + "epoch": 0.03076923076923077, + "grad_norm": 2.890625, + "learning_rate": 2.8846153846153845e-06, + "loss": 1.1241, + "step": 16 + }, + { + "epoch": 0.032692307692307694, + "grad_norm": 2.6875, + "learning_rate": 3.0769230769230774e-06, + "loss": 1.0644, + "step": 17 + }, + { + "epoch": 0.03461538461538462, + "grad_norm": 2.640625, + "learning_rate": 3.2692307692307696e-06, + "loss": 1.0823, + "step": 18 + }, + { + "epoch": 0.03653846153846154, + "grad_norm": 2.703125, + "learning_rate": 3.4615384615384617e-06, + "loss": 1.0494, + "step": 19 + }, + { + "epoch": 0.038461538461538464, + "grad_norm": 2.375, + "learning_rate": 3.653846153846154e-06, + "loss": 1.0778, + "step": 20 + }, + { + "epoch": 0.04038461538461539, + "grad_norm": 2.359375, + "learning_rate": 3.846153846153847e-06, + "loss": 0.9911, + "step": 21 + }, + { + "epoch": 0.04230769230769231, + "grad_norm": 1.9296875, + "learning_rate": 4.0384615384615385e-06, + "loss": 1.1214, + "step": 22 + }, + { + "epoch": 0.04423076923076923, + "grad_norm": 1.859375, + "learning_rate": 4.230769230769231e-06, + "loss": 1.0294, + "step": 23 + }, + { + "epoch": 0.046153846153846156, + "grad_norm": 1.6796875, + "learning_rate": 4.423076923076924e-06, + "loss": 1.0473, + "step": 24 + }, + { + "epoch": 0.04807692307692308, + "grad_norm": 1.4765625, + "learning_rate": 4.615384615384616e-06, + "loss": 1.0562, + "step": 25 + }, + { + "epoch": 0.05, + "grad_norm": 1.4140625, + "learning_rate": 4.807692307692308e-06, + "loss": 1.0273, + "step": 26 + }, + { + "epoch": 0.051923076923076926, + "grad_norm": 1.2890625, + "learning_rate": 5e-06, + "loss": 1.0713, + "step": 27 + }, + { + "epoch": 0.05384615384615385, + "grad_norm": 1.2578125, + "learning_rate": 5.192307692307693e-06, + "loss": 1.0218, + "step": 28 + }, + { + "epoch": 0.05576923076923077, + "grad_norm": 1.2265625, + "learning_rate": 5.384615384615385e-06, + "loss": 1.0322, + "step": 29 + }, + { + "epoch": 0.057692307692307696, + "grad_norm": 1.125, + "learning_rate": 5.576923076923077e-06, + "loss": 0.9993, + "step": 30 + }, + { + "epoch": 0.05961538461538462, + "grad_norm": 1.1796875, + "learning_rate": 5.769230769230769e-06, + "loss": 0.9909, + "step": 31 + }, + { + "epoch": 0.06153846153846154, + "grad_norm": 1.078125, + "learning_rate": 5.961538461538462e-06, + "loss": 1.0497, + "step": 32 + }, + { + "epoch": 0.06346153846153846, + "grad_norm": 1.078125, + "learning_rate": 6.153846153846155e-06, + "loss": 1.0699, + "step": 33 + }, + { + "epoch": 0.06538461538461539, + "grad_norm": 1.1171875, + "learning_rate": 6.3461538461538466e-06, + "loss": 1.0065, + "step": 34 + }, + { + "epoch": 0.0673076923076923, + "grad_norm": 1.03125, + "learning_rate": 6.538461538461539e-06, + "loss": 0.9839, + "step": 35 + }, + { + "epoch": 0.06923076923076923, + "grad_norm": 1.421875, + "learning_rate": 6.730769230769232e-06, + "loss": 1.0069, + "step": 36 + }, + { + "epoch": 0.07115384615384615, + "grad_norm": 1.0546875, + "learning_rate": 6.923076923076923e-06, + "loss": 1.0441, + "step": 37 + }, + { + "epoch": 0.07307692307692308, + "grad_norm": 1.015625, + "learning_rate": 7.115384615384616e-06, + "loss": 0.9343, + "step": 38 + }, + { + "epoch": 0.075, + "grad_norm": 1.0234375, + "learning_rate": 7.307692307692308e-06, + "loss": 0.9853, + "step": 39 + }, + { + "epoch": 0.07692307692307693, + "grad_norm": 0.98046875, + "learning_rate": 7.500000000000001e-06, + "loss": 0.9502, + "step": 40 + }, + { + "epoch": 0.07884615384615384, + "grad_norm": 0.984375, + "learning_rate": 7.692307692307694e-06, + "loss": 0.987, + "step": 41 + }, + { + "epoch": 0.08076923076923077, + "grad_norm": 1.0234375, + "learning_rate": 7.884615384615384e-06, + "loss": 1.009, + "step": 42 + }, + { + "epoch": 0.08269230769230769, + "grad_norm": 0.9765625, + "learning_rate": 8.076923076923077e-06, + "loss": 1.0174, + "step": 43 + }, + { + "epoch": 0.08461538461538462, + "grad_norm": 0.99609375, + "learning_rate": 8.26923076923077e-06, + "loss": 1.0515, + "step": 44 + }, + { + "epoch": 0.08653846153846154, + "grad_norm": 1.0234375, + "learning_rate": 8.461538461538462e-06, + "loss": 1.0196, + "step": 45 + }, + { + "epoch": 0.08846153846153847, + "grad_norm": 0.94921875, + "learning_rate": 8.653846153846155e-06, + "loss": 0.9957, + "step": 46 + }, + { + "epoch": 0.09038461538461538, + "grad_norm": 1.0078125, + "learning_rate": 8.846153846153847e-06, + "loss": 0.9422, + "step": 47 + }, + { + "epoch": 0.09230769230769231, + "grad_norm": 0.97265625, + "learning_rate": 9.03846153846154e-06, + "loss": 0.9527, + "step": 48 + }, + { + "epoch": 0.09423076923076923, + "grad_norm": 1.015625, + "learning_rate": 9.230769230769232e-06, + "loss": 0.9944, + "step": 49 + }, + { + "epoch": 0.09615384615384616, + "grad_norm": 1.0078125, + "learning_rate": 9.423076923076923e-06, + "loss": 0.9794, + "step": 50 + }, + { + "epoch": 0.09807692307692308, + "grad_norm": 1.03125, + "learning_rate": 9.615384615384616e-06, + "loss": 1.0226, + "step": 51 + }, + { + "epoch": 0.1, + "grad_norm": 1.0, + "learning_rate": 9.807692307692308e-06, + "loss": 1.0325, + "step": 52 + }, + { + "epoch": 0.10192307692307692, + "grad_norm": 1.0234375, + "learning_rate": 1e-05, + "loss": 0.9893, + "step": 53 + }, + { + "epoch": 0.10384615384615385, + "grad_norm": 0.98046875, + "learning_rate": 9.999974723001716e-06, + "loss": 1.0543, + "step": 54 + }, + { + "epoch": 0.10576923076923077, + "grad_norm": 1.03125, + "learning_rate": 9.999898892262433e-06, + "loss": 1.0227, + "step": 55 + }, + { + "epoch": 0.1076923076923077, + "grad_norm": 1.046875, + "learning_rate": 9.999772508548863e-06, + "loss": 1.067, + "step": 56 + }, + { + "epoch": 0.10961538461538461, + "grad_norm": 1.015625, + "learning_rate": 9.999595573138845e-06, + "loss": 0.9794, + "step": 57 + }, + { + "epoch": 0.11153846153846154, + "grad_norm": 1.0625, + "learning_rate": 9.999368087821337e-06, + "loss": 1.0166, + "step": 58 + }, + { + "epoch": 0.11346153846153846, + "grad_norm": 1.0546875, + "learning_rate": 9.999090054896397e-06, + "loss": 1.0092, + "step": 59 + }, + { + "epoch": 0.11538461538461539, + "grad_norm": 1.015625, + "learning_rate": 9.99876147717516e-06, + "loss": 0.9518, + "step": 60 + }, + { + "epoch": 0.11730769230769231, + "grad_norm": 1.0390625, + "learning_rate": 9.99838235797981e-06, + "loss": 0.9558, + "step": 61 + }, + { + "epoch": 0.11923076923076924, + "grad_norm": 1.078125, + "learning_rate": 9.997952701143547e-06, + "loss": 1.0134, + "step": 62 + }, + { + "epoch": 0.12115384615384615, + "grad_norm": 1.03125, + "learning_rate": 9.997472511010543e-06, + "loss": 0.8941, + "step": 63 + }, + { + "epoch": 0.12307692307692308, + "grad_norm": 1.046875, + "learning_rate": 9.996941792435903e-06, + "loss": 0.9207, + "step": 64 + }, + { + "epoch": 0.125, + "grad_norm": 1.0625, + "learning_rate": 9.996360550785619e-06, + "loss": 0.9351, + "step": 65 + }, + { + "epoch": 0.125, + "eval_loss": 1.0073611736297607, + "eval_runtime": 34.5277, + "eval_samples_per_second": 67.859, + "eval_steps_per_second": 16.972, + "step": 65 + }, + { + "epoch": 0.12692307692307692, + "grad_norm": 1.046875, + "learning_rate": 9.995728791936505e-06, + "loss": 1.0024, + "step": 66 + }, + { + "epoch": 0.12884615384615383, + "grad_norm": 1.109375, + "learning_rate": 9.995046522276152e-06, + "loss": 1.0015, + "step": 67 + }, + { + "epoch": 0.13076923076923078, + "grad_norm": 1.3359375, + "learning_rate": 9.994313748702848e-06, + "loss": 0.9595, + "step": 68 + }, + { + "epoch": 0.1326923076923077, + "grad_norm": 1.0625, + "learning_rate": 9.993530478625524e-06, + "loss": 0.9331, + "step": 69 + }, + { + "epoch": 0.1346153846153846, + "grad_norm": 1.0390625, + "learning_rate": 9.992696719963662e-06, + "loss": 0.9174, + "step": 70 + }, + { + "epoch": 0.13653846153846153, + "grad_norm": 1.0546875, + "learning_rate": 9.99181248114723e-06, + "loss": 1.009, + "step": 71 + }, + { + "epoch": 0.13846153846153847, + "grad_norm": 1.0390625, + "learning_rate": 9.990877771116588e-06, + "loss": 0.9661, + "step": 72 + }, + { + "epoch": 0.14038461538461539, + "grad_norm": 1.078125, + "learning_rate": 9.989892599322404e-06, + "loss": 0.9398, + "step": 73 + }, + { + "epoch": 0.1423076923076923, + "grad_norm": 1.078125, + "learning_rate": 9.988856975725551e-06, + "loss": 0.9973, + "step": 74 + }, + { + "epoch": 0.14423076923076922, + "grad_norm": 1.046875, + "learning_rate": 9.987770910797014e-06, + "loss": 0.8935, + "step": 75 + }, + { + "epoch": 0.14615384615384616, + "grad_norm": 1.09375, + "learning_rate": 9.986634415517774e-06, + "loss": 0.958, + "step": 76 + }, + { + "epoch": 0.14807692307692308, + "grad_norm": 1.0390625, + "learning_rate": 9.985447501378706e-06, + "loss": 0.9349, + "step": 77 + }, + { + "epoch": 0.15, + "grad_norm": 1.078125, + "learning_rate": 9.984210180380464e-06, + "loss": 0.9474, + "step": 78 + }, + { + "epoch": 0.1519230769230769, + "grad_norm": 1.0234375, + "learning_rate": 9.98292246503335e-06, + "loss": 0.9704, + "step": 79 + }, + { + "epoch": 0.15384615384615385, + "grad_norm": 1.0625, + "learning_rate": 9.981584368357198e-06, + "loss": 0.9745, + "step": 80 + }, + { + "epoch": 0.15576923076923077, + "grad_norm": 1.0859375, + "learning_rate": 9.980195903881231e-06, + "loss": 0.9527, + "step": 81 + }, + { + "epoch": 0.1576923076923077, + "grad_norm": 1.0859375, + "learning_rate": 9.978757085643937e-06, + "loss": 0.9732, + "step": 82 + }, + { + "epoch": 0.1596153846153846, + "grad_norm": 1.09375, + "learning_rate": 9.97726792819292e-06, + "loss": 0.9304, + "step": 83 + }, + { + "epoch": 0.16153846153846155, + "grad_norm": 1.03125, + "learning_rate": 9.975728446584748e-06, + "loss": 0.9999, + "step": 84 + }, + { + "epoch": 0.16346153846153846, + "grad_norm": 1.0625, + "learning_rate": 9.974138656384815e-06, + "loss": 0.9477, + "step": 85 + }, + { + "epoch": 0.16538461538461538, + "grad_norm": 1.0234375, + "learning_rate": 9.97249857366717e-06, + "loss": 0.9661, + "step": 86 + }, + { + "epoch": 0.1673076923076923, + "grad_norm": 1.078125, + "learning_rate": 9.970808215014357e-06, + "loss": 0.9763, + "step": 87 + }, + { + "epoch": 0.16923076923076924, + "grad_norm": 1.0703125, + "learning_rate": 9.969067597517255e-06, + "loss": 0.9292, + "step": 88 + }, + { + "epoch": 0.17115384615384616, + "grad_norm": 1.0625, + "learning_rate": 9.967276738774897e-06, + "loss": 0.9083, + "step": 89 + }, + { + "epoch": 0.17307692307692307, + "grad_norm": 1.1328125, + "learning_rate": 9.9654356568943e-06, + "loss": 0.9764, + "step": 90 + }, + { + "epoch": 0.175, + "grad_norm": 1.109375, + "learning_rate": 9.96354437049027e-06, + "loss": 0.9924, + "step": 91 + }, + { + "epoch": 0.17692307692307693, + "grad_norm": 1.09375, + "learning_rate": 9.961602898685225e-06, + "loss": 0.9551, + "step": 92 + }, + { + "epoch": 0.17884615384615385, + "grad_norm": 1.109375, + "learning_rate": 9.959611261108999e-06, + "loss": 1.0074, + "step": 93 + }, + { + "epoch": 0.18076923076923077, + "grad_norm": 1.09375, + "learning_rate": 9.957569477898636e-06, + "loss": 1.0348, + "step": 94 + }, + { + "epoch": 0.18269230769230768, + "grad_norm": 1.0859375, + "learning_rate": 9.955477569698197e-06, + "loss": 0.9745, + "step": 95 + }, + { + "epoch": 0.18461538461538463, + "grad_norm": 1.0625, + "learning_rate": 9.95333555765855e-06, + "loss": 0.9278, + "step": 96 + }, + { + "epoch": 0.18653846153846154, + "grad_norm": 1.0859375, + "learning_rate": 9.951143463437145e-06, + "loss": 0.9497, + "step": 97 + }, + { + "epoch": 0.18846153846153846, + "grad_norm": 1.109375, + "learning_rate": 9.948901309197807e-06, + "loss": 1.0283, + "step": 98 + }, + { + "epoch": 0.19038461538461537, + "grad_norm": 1.078125, + "learning_rate": 9.946609117610508e-06, + "loss": 0.9384, + "step": 99 + }, + { + "epoch": 0.19230769230769232, + "grad_norm": 1.0546875, + "learning_rate": 9.94426691185114e-06, + "loss": 1.0316, + "step": 100 + }, + { + "epoch": 0.19423076923076923, + "grad_norm": 1.1328125, + "learning_rate": 9.94187471560127e-06, + "loss": 0.9401, + "step": 101 + }, + { + "epoch": 0.19615384615384615, + "grad_norm": 1.0703125, + "learning_rate": 9.939432553047919e-06, + "loss": 0.9112, + "step": 102 + }, + { + "epoch": 0.19807692307692307, + "grad_norm": 1.0859375, + "learning_rate": 9.936940448883299e-06, + "loss": 0.9085, + "step": 103 + }, + { + "epoch": 0.2, + "grad_norm": 1.09375, + "learning_rate": 9.934398428304577e-06, + "loss": 0.9583, + "step": 104 + }, + { + "epoch": 0.20192307692307693, + "grad_norm": 1.1171875, + "learning_rate": 9.931806517013612e-06, + "loss": 0.9923, + "step": 105 + }, + { + "epoch": 0.20384615384615384, + "grad_norm": 1.09375, + "learning_rate": 9.929164741216702e-06, + "loss": 0.9281, + "step": 106 + }, + { + "epoch": 0.20576923076923076, + "grad_norm": 2.03125, + "learning_rate": 9.926473127624306e-06, + "loss": 0.9767, + "step": 107 + }, + { + "epoch": 0.2076923076923077, + "grad_norm": 1.0625, + "learning_rate": 9.923731703450794e-06, + "loss": 0.9255, + "step": 108 + }, + { + "epoch": 0.20961538461538462, + "grad_norm": 1.109375, + "learning_rate": 9.920940496414153e-06, + "loss": 0.9394, + "step": 109 + }, + { + "epoch": 0.21153846153846154, + "grad_norm": 1.0703125, + "learning_rate": 9.91809953473572e-06, + "loss": 0.8881, + "step": 110 + }, + { + "epoch": 0.21346153846153845, + "grad_norm": 1.0703125, + "learning_rate": 9.915208847139883e-06, + "loss": 0.9327, + "step": 111 + }, + { + "epoch": 0.2153846153846154, + "grad_norm": 1.09375, + "learning_rate": 9.912268462853811e-06, + "loss": 0.9637, + "step": 112 + }, + { + "epoch": 0.2173076923076923, + "grad_norm": 1.09375, + "learning_rate": 9.909278411607134e-06, + "loss": 0.904, + "step": 113 + }, + { + "epoch": 0.21923076923076923, + "grad_norm": 1.0859375, + "learning_rate": 9.906238723631662e-06, + "loss": 1.0109, + "step": 114 + }, + { + "epoch": 0.22115384615384615, + "grad_norm": 1.109375, + "learning_rate": 9.903149429661072e-06, + "loss": 0.9612, + "step": 115 + }, + { + "epoch": 0.2230769230769231, + "grad_norm": 1.140625, + "learning_rate": 9.90001056093059e-06, + "loss": 0.966, + "step": 116 + }, + { + "epoch": 0.225, + "grad_norm": 1.0859375, + "learning_rate": 9.896822149176695e-06, + "loss": 0.9076, + "step": 117 + }, + { + "epoch": 0.22692307692307692, + "grad_norm": 1.1328125, + "learning_rate": 9.893584226636773e-06, + "loss": 0.9435, + "step": 118 + }, + { + "epoch": 0.22884615384615384, + "grad_norm": 1.109375, + "learning_rate": 9.89029682604881e-06, + "loss": 0.9738, + "step": 119 + }, + { + "epoch": 0.23076923076923078, + "grad_norm": 1.09375, + "learning_rate": 9.886959980651056e-06, + "loss": 1.0009, + "step": 120 + }, + { + "epoch": 0.2326923076923077, + "grad_norm": 1.046875, + "learning_rate": 9.883573724181683e-06, + "loss": 0.9864, + "step": 121 + }, + { + "epoch": 0.23461538461538461, + "grad_norm": 1.125, + "learning_rate": 9.880138090878452e-06, + "loss": 0.9537, + "step": 122 + }, + { + "epoch": 0.23653846153846153, + "grad_norm": 1.09375, + "learning_rate": 9.87665311547836e-06, + "loss": 1.0204, + "step": 123 + }, + { + "epoch": 0.23846153846153847, + "grad_norm": 1.1328125, + "learning_rate": 9.873118833217294e-06, + "loss": 0.9623, + "step": 124 + }, + { + "epoch": 0.2403846153846154, + "grad_norm": 1.078125, + "learning_rate": 9.869535279829674e-06, + "loss": 0.9458, + "step": 125 + }, + { + "epoch": 0.2423076923076923, + "grad_norm": 1.0703125, + "learning_rate": 9.86590249154809e-06, + "loss": 0.9223, + "step": 126 + }, + { + "epoch": 0.24423076923076922, + "grad_norm": 1.078125, + "learning_rate": 9.862220505102933e-06, + "loss": 0.9847, + "step": 127 + }, + { + "epoch": 0.24615384615384617, + "grad_norm": 1.1015625, + "learning_rate": 9.858489357722028e-06, + "loss": 0.9633, + "step": 128 + }, + { + "epoch": 0.24807692307692308, + "grad_norm": 1.1328125, + "learning_rate": 9.854709087130261e-06, + "loss": 0.896, + "step": 129 + }, + { + "epoch": 0.25, + "grad_norm": 1.078125, + "learning_rate": 9.850879731549188e-06, + "loss": 0.8884, + "step": 130 + }, + { + "epoch": 0.25, + "eval_loss": 0.9758404493331909, + "eval_runtime": 34.4304, + "eval_samples_per_second": 68.05, + "eval_steps_per_second": 17.02, + "step": 130 + }, + { + "epoch": 0.2519230769230769, + "grad_norm": 1.078125, + "learning_rate": 9.847001329696653e-06, + "loss": 0.9047, + "step": 131 + }, + { + "epoch": 0.25384615384615383, + "grad_norm": 1.1015625, + "learning_rate": 9.843073920786402e-06, + "loss": 0.979, + "step": 132 + }, + { + "epoch": 0.25576923076923075, + "grad_norm": 1.09375, + "learning_rate": 9.839097544527674e-06, + "loss": 0.9224, + "step": 133 + }, + { + "epoch": 0.25769230769230766, + "grad_norm": 1.1328125, + "learning_rate": 9.835072241124815e-06, + "loss": 0.8739, + "step": 134 + }, + { + "epoch": 0.25961538461538464, + "grad_norm": 1.125, + "learning_rate": 9.830998051276858e-06, + "loss": 0.9326, + "step": 135 + }, + { + "epoch": 0.26153846153846155, + "grad_norm": 1.0859375, + "learning_rate": 9.82687501617712e-06, + "loss": 0.9303, + "step": 136 + }, + { + "epoch": 0.26346153846153847, + "grad_norm": 1.1171875, + "learning_rate": 9.822703177512783e-06, + "loss": 1.0002, + "step": 137 + }, + { + "epoch": 0.2653846153846154, + "grad_norm": 1.125, + "learning_rate": 9.818482577464466e-06, + "loss": 0.9562, + "step": 138 + }, + { + "epoch": 0.2673076923076923, + "grad_norm": 1.140625, + "learning_rate": 9.814213258705813e-06, + "loss": 0.9212, + "step": 139 + }, + { + "epoch": 0.2692307692307692, + "grad_norm": 1.125, + "learning_rate": 9.809895264403046e-06, + "loss": 0.9679, + "step": 140 + }, + { + "epoch": 0.27115384615384613, + "grad_norm": 1.1015625, + "learning_rate": 9.805528638214543e-06, + "loss": 0.9903, + "step": 141 + }, + { + "epoch": 0.27307692307692305, + "grad_norm": 1.1171875, + "learning_rate": 9.801113424290381e-06, + "loss": 0.9754, + "step": 142 + }, + { + "epoch": 0.275, + "grad_norm": 1.125, + "learning_rate": 9.796649667271905e-06, + "loss": 0.8977, + "step": 143 + }, + { + "epoch": 0.27692307692307694, + "grad_norm": 1.109375, + "learning_rate": 9.792137412291265e-06, + "loss": 0.8798, + "step": 144 + }, + { + "epoch": 0.27884615384615385, + "grad_norm": 1.15625, + "learning_rate": 9.787576704970965e-06, + "loss": 0.9382, + "step": 145 + }, + { + "epoch": 0.28076923076923077, + "grad_norm": 1.1796875, + "learning_rate": 9.7829675914234e-06, + "loss": 0.9692, + "step": 146 + }, + { + "epoch": 0.2826923076923077, + "grad_norm": 1.125, + "learning_rate": 9.778310118250397e-06, + "loss": 0.8939, + "step": 147 + }, + { + "epoch": 0.2846153846153846, + "grad_norm": 1.1640625, + "learning_rate": 9.77360433254273e-06, + "loss": 1.0047, + "step": 148 + }, + { + "epoch": 0.2865384615384615, + "grad_norm": 1.1328125, + "learning_rate": 9.768850281879651e-06, + "loss": 0.9484, + "step": 149 + }, + { + "epoch": 0.28846153846153844, + "grad_norm": 1.1171875, + "learning_rate": 9.764048014328417e-06, + "loss": 0.9004, + "step": 150 + }, + { + "epoch": 0.2903846153846154, + "grad_norm": 1.1171875, + "learning_rate": 9.759197578443787e-06, + "loss": 0.9281, + "step": 151 + }, + { + "epoch": 0.2923076923076923, + "grad_norm": 1.2109375, + "learning_rate": 9.754299023267548e-06, + "loss": 0.9638, + "step": 152 + }, + { + "epoch": 0.29423076923076924, + "grad_norm": 1.1640625, + "learning_rate": 9.74935239832801e-06, + "loss": 0.9366, + "step": 153 + }, + { + "epoch": 0.29615384615384616, + "grad_norm": 1.1796875, + "learning_rate": 9.7443577536395e-06, + "loss": 0.9125, + "step": 154 + }, + { + "epoch": 0.2980769230769231, + "grad_norm": 1.125, + "learning_rate": 9.739315139701868e-06, + "loss": 0.9226, + "step": 155 + }, + { + "epoch": 0.3, + "grad_norm": 1.1484375, + "learning_rate": 9.734224607499978e-06, + "loss": 0.9384, + "step": 156 + }, + { + "epoch": 0.3019230769230769, + "grad_norm": 1.2890625, + "learning_rate": 9.729086208503174e-06, + "loss": 0.8788, + "step": 157 + }, + { + "epoch": 0.3038461538461538, + "grad_norm": 1.15625, + "learning_rate": 9.723899994664779e-06, + "loss": 0.9672, + "step": 158 + }, + { + "epoch": 0.3057692307692308, + "grad_norm": 1.125, + "learning_rate": 9.71866601842156e-06, + "loss": 0.8889, + "step": 159 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 1.15625, + "learning_rate": 9.713384332693199e-06, + "loss": 0.8975, + "step": 160 + }, + { + "epoch": 0.3096153846153846, + "grad_norm": 1.1015625, + "learning_rate": 9.708054990881763e-06, + "loss": 0.9614, + "step": 161 + }, + { + "epoch": 0.31153846153846154, + "grad_norm": 1.125, + "learning_rate": 9.702678046871157e-06, + "loss": 0.9061, + "step": 162 + }, + { + "epoch": 0.31346153846153846, + "grad_norm": 1.1484375, + "learning_rate": 9.69725355502658e-06, + "loss": 0.9322, + "step": 163 + }, + { + "epoch": 0.3153846153846154, + "grad_norm": 1.15625, + "learning_rate": 9.691781570193983e-06, + "loss": 0.9512, + "step": 164 + }, + { + "epoch": 0.3173076923076923, + "grad_norm": 1.1328125, + "learning_rate": 9.686262147699507e-06, + "loss": 1.0271, + "step": 165 + }, + { + "epoch": 0.3192307692307692, + "grad_norm": 1.125, + "learning_rate": 9.680695343348923e-06, + "loss": 0.9529, + "step": 166 + }, + { + "epoch": 0.3211538461538462, + "grad_norm": 1.203125, + "learning_rate": 9.675081213427076e-06, + "loss": 0.9636, + "step": 167 + }, + { + "epoch": 0.3230769230769231, + "grad_norm": 1.109375, + "learning_rate": 9.669419814697303e-06, + "loss": 0.8879, + "step": 168 + }, + { + "epoch": 0.325, + "grad_norm": 1.125, + "learning_rate": 9.663711204400872e-06, + "loss": 0.8889, + "step": 169 + }, + { + "epoch": 0.3269230769230769, + "grad_norm": 1.0546875, + "learning_rate": 9.657955440256396e-06, + "loss": 0.9735, + "step": 170 + }, + { + "epoch": 0.32884615384615384, + "grad_norm": 1.1015625, + "learning_rate": 9.65215258045925e-06, + "loss": 1.0088, + "step": 171 + }, + { + "epoch": 0.33076923076923076, + "grad_norm": 1.1015625, + "learning_rate": 9.64630268368099e-06, + "loss": 0.9318, + "step": 172 + }, + { + "epoch": 0.3326923076923077, + "grad_norm": 1.09375, + "learning_rate": 9.640405809068743e-06, + "loss": 0.9902, + "step": 173 + }, + { + "epoch": 0.3346153846153846, + "grad_norm": 1.109375, + "learning_rate": 9.634462016244625e-06, + "loss": 0.9315, + "step": 174 + }, + { + "epoch": 0.33653846153846156, + "grad_norm": 1.0546875, + "learning_rate": 9.628471365305134e-06, + "loss": 0.931, + "step": 175 + }, + { + "epoch": 0.3384615384615385, + "grad_norm": 1.1015625, + "learning_rate": 9.622433916820539e-06, + "loss": 0.9167, + "step": 176 + }, + { + "epoch": 0.3403846153846154, + "grad_norm": 1.109375, + "learning_rate": 9.616349731834271e-06, + "loss": 0.9214, + "step": 177 + }, + { + "epoch": 0.3423076923076923, + "grad_norm": 1.1171875, + "learning_rate": 9.610218871862303e-06, + "loss": 0.945, + "step": 178 + }, + { + "epoch": 0.34423076923076923, + "grad_norm": 1.1171875, + "learning_rate": 9.604041398892528e-06, + "loss": 0.9445, + "step": 179 + }, + { + "epoch": 0.34615384615384615, + "grad_norm": 1.140625, + "learning_rate": 9.597817375384138e-06, + "loss": 1.0135, + "step": 180 + }, + { + "epoch": 0.34807692307692306, + "grad_norm": 1.3203125, + "learning_rate": 9.591546864266983e-06, + "loss": 0.9393, + "step": 181 + }, + { + "epoch": 0.35, + "grad_norm": 1.1171875, + "learning_rate": 9.585229928940944e-06, + "loss": 0.9273, + "step": 182 + }, + { + "epoch": 0.35192307692307695, + "grad_norm": 1.1171875, + "learning_rate": 9.578866633275289e-06, + "loss": 1.0091, + "step": 183 + }, + { + "epoch": 0.35384615384615387, + "grad_norm": 1.0859375, + "learning_rate": 9.572457041608018e-06, + "loss": 0.9301, + "step": 184 + }, + { + "epoch": 0.3557692307692308, + "grad_norm": 1.046875, + "learning_rate": 9.56600121874523e-06, + "loss": 0.991, + "step": 185 + }, + { + "epoch": 0.3576923076923077, + "grad_norm": 1.125, + "learning_rate": 9.55949922996045e-06, + "loss": 0.8897, + "step": 186 + }, + { + "epoch": 0.3596153846153846, + "grad_norm": 1.109375, + "learning_rate": 9.55295114099399e-06, + "loss": 0.8844, + "step": 187 + }, + { + "epoch": 0.36153846153846153, + "grad_norm": 1.0703125, + "learning_rate": 9.546357018052254e-06, + "loss": 0.9226, + "step": 188 + }, + { + "epoch": 0.36346153846153845, + "grad_norm": 1.078125, + "learning_rate": 9.539716927807102e-06, + "loss": 0.9273, + "step": 189 + }, + { + "epoch": 0.36538461538461536, + "grad_norm": 1.109375, + "learning_rate": 9.533030937395151e-06, + "loss": 0.9116, + "step": 190 + }, + { + "epoch": 0.36730769230769234, + "grad_norm": 1.0703125, + "learning_rate": 9.526299114417108e-06, + "loss": 0.9732, + "step": 191 + }, + { + "epoch": 0.36923076923076925, + "grad_norm": 1.078125, + "learning_rate": 9.519521526937087e-06, + "loss": 0.918, + "step": 192 + }, + { + "epoch": 0.37115384615384617, + "grad_norm": 1.03125, + "learning_rate": 9.512698243481914e-06, + "loss": 0.9045, + "step": 193 + }, + { + "epoch": 0.3730769230769231, + "grad_norm": 1.078125, + "learning_rate": 9.505829333040437e-06, + "loss": 0.9189, + "step": 194 + }, + { + "epoch": 0.375, + "grad_norm": 1.1015625, + "learning_rate": 9.498914865062831e-06, + "loss": 0.9853, + "step": 195 + }, + { + "epoch": 0.375, + "eval_loss": 0.9608431458473206, + "eval_runtime": 34.5143, + "eval_samples_per_second": 67.885, + "eval_steps_per_second": 16.978, + "step": 195 + }, + { + "epoch": 0.3769230769230769, + "grad_norm": 1.046875, + "learning_rate": 9.491954909459895e-06, + "loss": 0.9078, + "step": 196 + }, + { + "epoch": 0.37884615384615383, + "grad_norm": 1.0703125, + "learning_rate": 9.484949536602343e-06, + "loss": 0.8859, + "step": 197 + }, + { + "epoch": 0.38076923076923075, + "grad_norm": 1.0390625, + "learning_rate": 9.477898817320094e-06, + "loss": 0.9183, + "step": 198 + }, + { + "epoch": 0.38269230769230766, + "grad_norm": 1.0390625, + "learning_rate": 9.470802822901558e-06, + "loss": 0.918, + "step": 199 + }, + { + "epoch": 0.38461538461538464, + "grad_norm": 1.0625, + "learning_rate": 9.463661625092907e-06, + "loss": 0.9444, + "step": 200 + }, + { + "epoch": 0.38653846153846155, + "grad_norm": 1.046875, + "learning_rate": 9.45647529609736e-06, + "loss": 0.8958, + "step": 201 + }, + { + "epoch": 0.38846153846153847, + "grad_norm": 1.046875, + "learning_rate": 9.44924390857445e-06, + "loss": 0.9071, + "step": 202 + }, + { + "epoch": 0.3903846153846154, + "grad_norm": 1.09375, + "learning_rate": 9.44196753563928e-06, + "loss": 0.962, + "step": 203 + }, + { + "epoch": 0.3923076923076923, + "grad_norm": 1.0546875, + "learning_rate": 9.434646250861801e-06, + "loss": 0.9814, + "step": 204 + }, + { + "epoch": 0.3942307692307692, + "grad_norm": 1.09375, + "learning_rate": 9.427280128266049e-06, + "loss": 0.9873, + "step": 205 + }, + { + "epoch": 0.39615384615384613, + "grad_norm": 1.15625, + "learning_rate": 9.419869242329417e-06, + "loss": 0.9034, + "step": 206 + }, + { + "epoch": 0.39807692307692305, + "grad_norm": 1.0703125, + "learning_rate": 9.412413667981884e-06, + "loss": 0.8953, + "step": 207 + }, + { + "epoch": 0.4, + "grad_norm": 1.1015625, + "learning_rate": 9.404913480605264e-06, + "loss": 1.0005, + "step": 208 + }, + { + "epoch": 0.40192307692307694, + "grad_norm": 1.0625, + "learning_rate": 9.397368756032445e-06, + "loss": 0.9455, + "step": 209 + }, + { + "epoch": 0.40384615384615385, + "grad_norm": 1.0234375, + "learning_rate": 9.389779570546628e-06, + "loss": 0.9336, + "step": 210 + }, + { + "epoch": 0.40576923076923077, + "grad_norm": 1.1953125, + "learning_rate": 9.38214600088054e-06, + "loss": 0.8907, + "step": 211 + }, + { + "epoch": 0.4076923076923077, + "grad_norm": 1.078125, + "learning_rate": 9.374468124215676e-06, + "loss": 0.8735, + "step": 212 + }, + { + "epoch": 0.4096153846153846, + "grad_norm": 1.0625, + "learning_rate": 9.366746018181503e-06, + "loss": 0.9682, + "step": 213 + }, + { + "epoch": 0.4115384615384615, + "grad_norm": 1.0078125, + "learning_rate": 9.358979760854686e-06, + "loss": 0.9069, + "step": 214 + }, + { + "epoch": 0.41346153846153844, + "grad_norm": 0.99609375, + "learning_rate": 9.351169430758293e-06, + "loss": 0.9371, + "step": 215 + }, + { + "epoch": 0.4153846153846154, + "grad_norm": 1.0703125, + "learning_rate": 9.343315106861008e-06, + "loss": 0.9691, + "step": 216 + }, + { + "epoch": 0.4173076923076923, + "grad_norm": 1.0234375, + "learning_rate": 9.33541686857632e-06, + "loss": 0.9521, + "step": 217 + }, + { + "epoch": 0.41923076923076924, + "grad_norm": 1.0078125, + "learning_rate": 9.327474795761734e-06, + "loss": 0.9503, + "step": 218 + }, + { + "epoch": 0.42115384615384616, + "grad_norm": 1.015625, + "learning_rate": 9.31948896871795e-06, + "loss": 0.9311, + "step": 219 + }, + { + "epoch": 0.4230769230769231, + "grad_norm": 1.046875, + "learning_rate": 9.311459468188066e-06, + "loss": 0.8998, + "step": 220 + }, + { + "epoch": 0.425, + "grad_norm": 1.0859375, + "learning_rate": 9.303386375356752e-06, + "loss": 0.8776, + "step": 221 + }, + { + "epoch": 0.4269230769230769, + "grad_norm": 1.0703125, + "learning_rate": 9.295269771849426e-06, + "loss": 0.9003, + "step": 222 + }, + { + "epoch": 0.4288461538461538, + "grad_norm": 0.98046875, + "learning_rate": 9.28710973973144e-06, + "loss": 0.9597, + "step": 223 + }, + { + "epoch": 0.4307692307692308, + "grad_norm": 0.96875, + "learning_rate": 9.278906361507238e-06, + "loss": 0.8842, + "step": 224 + }, + { + "epoch": 0.4326923076923077, + "grad_norm": 1.0078125, + "learning_rate": 9.270659720119533e-06, + "loss": 0.9524, + "step": 225 + }, + { + "epoch": 0.4346153846153846, + "grad_norm": 1.0625, + "learning_rate": 9.262369898948462e-06, + "loss": 0.9271, + "step": 226 + }, + { + "epoch": 0.43653846153846154, + "grad_norm": 1.078125, + "learning_rate": 9.254036981810741e-06, + "loss": 0.9627, + "step": 227 + }, + { + "epoch": 0.43846153846153846, + "grad_norm": 1.015625, + "learning_rate": 9.245661052958823e-06, + "loss": 0.9305, + "step": 228 + }, + { + "epoch": 0.4403846153846154, + "grad_norm": 1.03125, + "learning_rate": 9.237242197080045e-06, + "loss": 0.9855, + "step": 229 + }, + { + "epoch": 0.4423076923076923, + "grad_norm": 1.0390625, + "learning_rate": 9.22878049929577e-06, + "loss": 0.9237, + "step": 230 + }, + { + "epoch": 0.4442307692307692, + "grad_norm": 1.0078125, + "learning_rate": 9.220276045160524e-06, + "loss": 0.9733, + "step": 231 + }, + { + "epoch": 0.4461538461538462, + "grad_norm": 0.9765625, + "learning_rate": 9.211728920661136e-06, + "loss": 0.9613, + "step": 232 + }, + { + "epoch": 0.4480769230769231, + "grad_norm": 0.9609375, + "learning_rate": 9.203139212215868e-06, + "loss": 0.9317, + "step": 233 + }, + { + "epoch": 0.45, + "grad_norm": 0.99609375, + "learning_rate": 9.19450700667354e-06, + "loss": 0.9067, + "step": 234 + }, + { + "epoch": 0.4519230769230769, + "grad_norm": 0.9296875, + "learning_rate": 9.185832391312644e-06, + "loss": 0.9088, + "step": 235 + }, + { + "epoch": 0.45384615384615384, + "grad_norm": 0.953125, + "learning_rate": 9.17711545384048e-06, + "loss": 0.9359, + "step": 236 + }, + { + "epoch": 0.45576923076923076, + "grad_norm": 1.015625, + "learning_rate": 9.168356282392253e-06, + "loss": 0.96, + "step": 237 + }, + { + "epoch": 0.4576923076923077, + "grad_norm": 0.90234375, + "learning_rate": 9.159554965530184e-06, + "loss": 0.9193, + "step": 238 + }, + { + "epoch": 0.4596153846153846, + "grad_norm": 0.93359375, + "learning_rate": 9.150711592242627e-06, + "loss": 0.9439, + "step": 239 + }, + { + "epoch": 0.46153846153846156, + "grad_norm": 1.0078125, + "learning_rate": 9.14182625194315e-06, + "loss": 0.8804, + "step": 240 + }, + { + "epoch": 0.4634615384615385, + "grad_norm": 0.9375, + "learning_rate": 9.132899034469648e-06, + "loss": 0.916, + "step": 241 + }, + { + "epoch": 0.4653846153846154, + "grad_norm": 0.98828125, + "learning_rate": 9.123930030083425e-06, + "loss": 0.8793, + "step": 242 + }, + { + "epoch": 0.4673076923076923, + "grad_norm": 0.93359375, + "learning_rate": 9.114919329468283e-06, + "loss": 0.9162, + "step": 243 + }, + { + "epoch": 0.46923076923076923, + "grad_norm": 0.98046875, + "learning_rate": 9.10586702372961e-06, + "loss": 0.9098, + "step": 244 + }, + { + "epoch": 0.47115384615384615, + "grad_norm": 0.91015625, + "learning_rate": 9.09677320439345e-06, + "loss": 0.9219, + "step": 245 + }, + { + "epoch": 0.47307692307692306, + "grad_norm": 0.921875, + "learning_rate": 9.087637963405586e-06, + "loss": 0.9554, + "step": 246 + }, + { + "epoch": 0.475, + "grad_norm": 0.9140625, + "learning_rate": 9.07846139313061e-06, + "loss": 0.9994, + "step": 247 + }, + { + "epoch": 0.47692307692307695, + "grad_norm": 0.96875, + "learning_rate": 9.069243586350976e-06, + "loss": 0.9052, + "step": 248 + }, + { + "epoch": 0.47884615384615387, + "grad_norm": 0.94921875, + "learning_rate": 9.059984636266082e-06, + "loss": 0.9232, + "step": 249 + }, + { + "epoch": 0.4807692307692308, + "grad_norm": 0.97265625, + "learning_rate": 9.050684636491317e-06, + "loss": 0.8964, + "step": 250 + }, + { + "epoch": 0.4826923076923077, + "grad_norm": 0.91015625, + "learning_rate": 9.041343681057106e-06, + "loss": 0.9386, + "step": 251 + }, + { + "epoch": 0.4846153846153846, + "grad_norm": 0.8984375, + "learning_rate": 9.03196186440798e-06, + "loss": 0.9078, + "step": 252 + }, + { + "epoch": 0.48653846153846153, + "grad_norm": 0.96875, + "learning_rate": 9.022539281401601e-06, + "loss": 0.9056, + "step": 253 + }, + { + "epoch": 0.48846153846153845, + "grad_norm": 0.8671875, + "learning_rate": 9.013076027307817e-06, + "loss": 0.8973, + "step": 254 + }, + { + "epoch": 0.49038461538461536, + "grad_norm": 0.921875, + "learning_rate": 9.00357219780769e-06, + "loss": 0.9663, + "step": 255 + }, + { + "epoch": 0.49230769230769234, + "grad_norm": 0.90234375, + "learning_rate": 8.994027888992533e-06, + "loss": 0.8857, + "step": 256 + }, + { + "epoch": 0.49423076923076925, + "grad_norm": 0.9921875, + "learning_rate": 8.984443197362938e-06, + "loss": 0.9815, + "step": 257 + }, + { + "epoch": 0.49615384615384617, + "grad_norm": 0.91015625, + "learning_rate": 8.974818219827796e-06, + "loss": 0.9693, + "step": 258 + }, + { + "epoch": 0.4980769230769231, + "grad_norm": 1.2265625, + "learning_rate": 8.965153053703325e-06, + "loss": 0.8971, + "step": 259 + }, + { + "epoch": 0.5, + "grad_norm": 0.94140625, + "learning_rate": 8.955447796712083e-06, + "loss": 0.8998, + "step": 260 + }, + { + "epoch": 0.5, + "eval_loss": 0.9489682912826538, + "eval_runtime": 34.8214, + "eval_samples_per_second": 67.286, + "eval_steps_per_second": 16.829, + "step": 260 + }, + { + "epoch": 0.5019230769230769, + "grad_norm": 0.9609375, + "learning_rate": 8.94570254698197e-06, + "loss": 0.9153, + "step": 261 + }, + { + "epoch": 0.5038461538461538, + "grad_norm": 0.94140625, + "learning_rate": 8.935917403045251e-06, + "loss": 0.9449, + "step": 262 + }, + { + "epoch": 0.5057692307692307, + "grad_norm": 0.91796875, + "learning_rate": 8.926092463837557e-06, + "loss": 0.9087, + "step": 263 + }, + { + "epoch": 0.5076923076923077, + "grad_norm": 0.890625, + "learning_rate": 8.916227828696873e-06, + "loss": 0.8946, + "step": 264 + }, + { + "epoch": 0.5096153846153846, + "grad_norm": 0.8671875, + "learning_rate": 8.906323597362547e-06, + "loss": 0.9261, + "step": 265 + }, + { + "epoch": 0.5115384615384615, + "grad_norm": 0.8828125, + "learning_rate": 8.896379869974273e-06, + "loss": 0.8826, + "step": 266 + }, + { + "epoch": 0.5134615384615384, + "grad_norm": 0.9609375, + "learning_rate": 8.886396747071085e-06, + "loss": 0.9224, + "step": 267 + }, + { + "epoch": 0.5153846153846153, + "grad_norm": 0.84765625, + "learning_rate": 8.876374329590331e-06, + "loss": 0.8849, + "step": 268 + }, + { + "epoch": 0.5173076923076924, + "grad_norm": 0.890625, + "learning_rate": 8.866312718866669e-06, + "loss": 0.9516, + "step": 269 + }, + { + "epoch": 0.5192307692307693, + "grad_norm": 0.921875, + "learning_rate": 8.85621201663102e-06, + "loss": 0.9049, + "step": 270 + }, + { + "epoch": 0.5211538461538462, + "grad_norm": 0.8125, + "learning_rate": 8.846072325009562e-06, + "loss": 0.8953, + "step": 271 + }, + { + "epoch": 0.5230769230769231, + "grad_norm": 0.8046875, + "learning_rate": 8.83589374652268e-06, + "loss": 0.8507, + "step": 272 + }, + { + "epoch": 0.525, + "grad_norm": 0.8359375, + "learning_rate": 8.825676384083936e-06, + "loss": 0.8904, + "step": 273 + }, + { + "epoch": 0.5269230769230769, + "grad_norm": 0.83203125, + "learning_rate": 8.815420340999034e-06, + "loss": 0.9469, + "step": 274 + }, + { + "epoch": 0.5288461538461539, + "grad_norm": 0.90234375, + "learning_rate": 8.805125720964766e-06, + "loss": 0.9144, + "step": 275 + }, + { + "epoch": 0.5307692307692308, + "grad_norm": 0.828125, + "learning_rate": 8.79479262806797e-06, + "loss": 0.8757, + "step": 276 + }, + { + "epoch": 0.5326923076923077, + "grad_norm": 0.83203125, + "learning_rate": 8.784421166784476e-06, + "loss": 0.8781, + "step": 277 + }, + { + "epoch": 0.5346153846153846, + "grad_norm": 0.89453125, + "learning_rate": 8.774011441978046e-06, + "loss": 0.9348, + "step": 278 + }, + { + "epoch": 0.5365384615384615, + "grad_norm": 0.9296875, + "learning_rate": 8.763563558899317e-06, + "loss": 0.9949, + "step": 279 + }, + { + "epoch": 0.5384615384615384, + "grad_norm": 0.8359375, + "learning_rate": 8.75307762318474e-06, + "loss": 0.9171, + "step": 280 + }, + { + "epoch": 0.5403846153846154, + "grad_norm": 0.890625, + "learning_rate": 8.742553740855507e-06, + "loss": 1.0024, + "step": 281 + }, + { + "epoch": 0.5423076923076923, + "grad_norm": 0.82421875, + "learning_rate": 8.731992018316478e-06, + "loss": 0.8619, + "step": 282 + }, + { + "epoch": 0.5442307692307692, + "grad_norm": 0.7890625, + "learning_rate": 8.721392562355113e-06, + "loss": 0.955, + "step": 283 + }, + { + "epoch": 0.5461538461538461, + "grad_norm": 1.046875, + "learning_rate": 8.71075548014038e-06, + "loss": 0.9189, + "step": 284 + }, + { + "epoch": 0.5480769230769231, + "grad_norm": 0.9140625, + "learning_rate": 8.700080879221689e-06, + "loss": 0.9118, + "step": 285 + }, + { + "epoch": 0.55, + "grad_norm": 0.8203125, + "learning_rate": 8.689368867527781e-06, + "loss": 0.8916, + "step": 286 + }, + { + "epoch": 0.551923076923077, + "grad_norm": 0.86328125, + "learning_rate": 8.67861955336566e-06, + "loss": 0.8934, + "step": 287 + }, + { + "epoch": 0.5538461538461539, + "grad_norm": 0.875, + "learning_rate": 8.667833045419483e-06, + "loss": 0.8921, + "step": 288 + }, + { + "epoch": 0.5557692307692308, + "grad_norm": 0.87890625, + "learning_rate": 8.657009452749466e-06, + "loss": 1.0005, + "step": 289 + }, + { + "epoch": 0.5576923076923077, + "grad_norm": 0.85546875, + "learning_rate": 8.646148884790786e-06, + "loss": 0.8828, + "step": 290 + }, + { + "epoch": 0.5596153846153846, + "grad_norm": 0.82421875, + "learning_rate": 8.635251451352463e-06, + "loss": 0.8704, + "step": 291 + }, + { + "epoch": 0.5615384615384615, + "grad_norm": 0.7890625, + "learning_rate": 8.624317262616261e-06, + "loss": 0.9182, + "step": 292 + }, + { + "epoch": 0.5634615384615385, + "grad_norm": 0.86328125, + "learning_rate": 8.613346429135567e-06, + "loss": 0.9128, + "step": 293 + }, + { + "epoch": 0.5653846153846154, + "grad_norm": 0.83984375, + "learning_rate": 8.602339061834278e-06, + "loss": 0.9893, + "step": 294 + }, + { + "epoch": 0.5673076923076923, + "grad_norm": 0.84375, + "learning_rate": 8.591295272005674e-06, + "loss": 0.9299, + "step": 295 + }, + { + "epoch": 0.5692307692307692, + "grad_norm": 0.77734375, + "learning_rate": 8.5802151713113e-06, + "loss": 0.894, + "step": 296 + }, + { + "epoch": 0.5711538461538461, + "grad_norm": 0.8359375, + "learning_rate": 8.569098871779828e-06, + "loss": 0.9472, + "step": 297 + }, + { + "epoch": 0.573076923076923, + "grad_norm": 0.828125, + "learning_rate": 8.557946485805932e-06, + "loss": 0.919, + "step": 298 + }, + { + "epoch": 0.575, + "grad_norm": 0.8046875, + "learning_rate": 8.546758126149148e-06, + "loss": 0.882, + "step": 299 + }, + { + "epoch": 0.5769230769230769, + "grad_norm": 0.88671875, + "learning_rate": 8.535533905932739e-06, + "loss": 0.8685, + "step": 300 + }, + { + "epoch": 0.5788461538461539, + "grad_norm": 0.85546875, + "learning_rate": 8.524273938642539e-06, + "loss": 0.8966, + "step": 301 + }, + { + "epoch": 0.5807692307692308, + "grad_norm": 0.796875, + "learning_rate": 8.512978338125818e-06, + "loss": 0.9205, + "step": 302 + }, + { + "epoch": 0.5826923076923077, + "grad_norm": 0.83984375, + "learning_rate": 8.501647218590127e-06, + "loss": 0.9815, + "step": 303 + }, + { + "epoch": 0.5846153846153846, + "grad_norm": 0.80859375, + "learning_rate": 8.490280694602142e-06, + "loss": 0.9317, + "step": 304 + }, + { + "epoch": 0.5865384615384616, + "grad_norm": 0.7890625, + "learning_rate": 8.478878881086505e-06, + "loss": 0.9498, + "step": 305 + }, + { + "epoch": 0.5884615384615385, + "grad_norm": 0.7578125, + "learning_rate": 8.467441893324667e-06, + "loss": 0.9088, + "step": 306 + }, + { + "epoch": 0.5903846153846154, + "grad_norm": 0.81640625, + "learning_rate": 8.455969846953711e-06, + "loss": 0.8728, + "step": 307 + }, + { + "epoch": 0.5923076923076923, + "grad_norm": 0.8046875, + "learning_rate": 8.444462857965198e-06, + "loss": 0.9345, + "step": 308 + }, + { + "epoch": 0.5942307692307692, + "grad_norm": 0.8046875, + "learning_rate": 8.432921042703985e-06, + "loss": 0.8951, + "step": 309 + }, + { + "epoch": 0.5961538461538461, + "grad_norm": 0.8046875, + "learning_rate": 8.42134451786705e-06, + "loss": 1.005, + "step": 310 + }, + { + "epoch": 0.5980769230769231, + "grad_norm": 0.7734375, + "learning_rate": 8.409733400502311e-06, + "loss": 0.919, + "step": 311 + }, + { + "epoch": 0.6, + "grad_norm": 0.81640625, + "learning_rate": 8.398087808007447e-06, + "loss": 0.882, + "step": 312 + }, + { + "epoch": 0.6019230769230769, + "grad_norm": 0.84375, + "learning_rate": 8.386407858128707e-06, + "loss": 0.9254, + "step": 313 + }, + { + "epoch": 0.6038461538461538, + "grad_norm": 0.77734375, + "learning_rate": 8.374693668959717e-06, + "loss": 0.9312, + "step": 314 + }, + { + "epoch": 0.6057692307692307, + "grad_norm": 0.77734375, + "learning_rate": 8.362945358940295e-06, + "loss": 0.9124, + "step": 315 + }, + { + "epoch": 0.6076923076923076, + "grad_norm": 0.765625, + "learning_rate": 8.351163046855246e-06, + "loss": 0.9181, + "step": 316 + }, + { + "epoch": 0.6096153846153847, + "grad_norm": 0.77734375, + "learning_rate": 8.339346851833163e-06, + "loss": 0.9124, + "step": 317 + }, + { + "epoch": 0.6115384615384616, + "grad_norm": 0.78515625, + "learning_rate": 8.327496893345223e-06, + "loss": 0.9282, + "step": 318 + }, + { + "epoch": 0.6134615384615385, + "grad_norm": 0.7421875, + "learning_rate": 8.315613291203977e-06, + "loss": 0.9125, + "step": 319 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.7734375, + "learning_rate": 8.303696165562141e-06, + "loss": 0.9366, + "step": 320 + }, + { + "epoch": 0.6173076923076923, + "grad_norm": 0.78515625, + "learning_rate": 8.291745636911382e-06, + "loss": 0.9808, + "step": 321 + }, + { + "epoch": 0.6192307692307693, + "grad_norm": 0.74609375, + "learning_rate": 8.279761826081096e-06, + "loss": 0.9105, + "step": 322 + }, + { + "epoch": 0.6211538461538462, + "grad_norm": 0.79296875, + "learning_rate": 8.26774485423719e-06, + "loss": 0.9444, + "step": 323 + }, + { + "epoch": 0.6230769230769231, + "grad_norm": 0.79296875, + "learning_rate": 8.255694842880854e-06, + "loss": 0.8981, + "step": 324 + }, + { + "epoch": 0.625, + "grad_norm": 0.76171875, + "learning_rate": 8.243611913847337e-06, + "loss": 0.8919, + "step": 325 + }, + { + "epoch": 0.625, + "eval_loss": 0.9419646263122559, + "eval_runtime": 34.6043, + "eval_samples_per_second": 67.708, + "eval_steps_per_second": 16.934, + "step": 325 + }, + { + "epoch": 0.6269230769230769, + "grad_norm": 0.76171875, + "learning_rate": 8.231496189304704e-06, + "loss": 0.8434, + "step": 326 + }, + { + "epoch": 0.6288461538461538, + "grad_norm": 0.8671875, + "learning_rate": 8.21934779175262e-06, + "loss": 0.912, + "step": 327 + }, + { + "epoch": 0.6307692307692307, + "grad_norm": 0.7890625, + "learning_rate": 8.207166844021093e-06, + "loss": 0.9085, + "step": 328 + }, + { + "epoch": 0.6326923076923077, + "grad_norm": 0.77734375, + "learning_rate": 8.19495346926924e-06, + "loss": 0.9301, + "step": 329 + }, + { + "epoch": 0.6346153846153846, + "grad_norm": 0.75, + "learning_rate": 8.182707790984043e-06, + "loss": 0.9023, + "step": 330 + }, + { + "epoch": 0.6365384615384615, + "grad_norm": 0.76953125, + "learning_rate": 8.170429932979097e-06, + "loss": 0.9363, + "step": 331 + }, + { + "epoch": 0.6384615384615384, + "grad_norm": 0.76953125, + "learning_rate": 8.15812001939336e-06, + "loss": 0.927, + "step": 332 + }, + { + "epoch": 0.6403846153846153, + "grad_norm": 0.8671875, + "learning_rate": 8.145778174689897e-06, + "loss": 0.9826, + "step": 333 + }, + { + "epoch": 0.6423076923076924, + "grad_norm": 0.8125, + "learning_rate": 8.133404523654626e-06, + "loss": 0.922, + "step": 334 + }, + { + "epoch": 0.6442307692307693, + "grad_norm": 0.7578125, + "learning_rate": 8.120999191395048e-06, + "loss": 0.9405, + "step": 335 + }, + { + "epoch": 0.6461538461538462, + "grad_norm": 0.78515625, + "learning_rate": 8.108562303338987e-06, + "loss": 0.8947, + "step": 336 + }, + { + "epoch": 0.6480769230769231, + "grad_norm": 0.74609375, + "learning_rate": 8.096093985233323e-06, + "loss": 0.9109, + "step": 337 + }, + { + "epoch": 0.65, + "grad_norm": 0.8125, + "learning_rate": 8.083594363142717e-06, + "loss": 0.9111, + "step": 338 + }, + { + "epoch": 0.6519230769230769, + "grad_norm": 0.78515625, + "learning_rate": 8.071063563448341e-06, + "loss": 0.8957, + "step": 339 + }, + { + "epoch": 0.6538461538461539, + "grad_norm": 0.74609375, + "learning_rate": 8.058501712846594e-06, + "loss": 0.9003, + "step": 340 + }, + { + "epoch": 0.6557692307692308, + "grad_norm": 0.7890625, + "learning_rate": 8.045908938347828e-06, + "loss": 0.9372, + "step": 341 + }, + { + "epoch": 0.6576923076923077, + "grad_norm": 0.75390625, + "learning_rate": 8.03328536727506e-06, + "loss": 0.8854, + "step": 342 + }, + { + "epoch": 0.6596153846153846, + "grad_norm": 0.77734375, + "learning_rate": 8.020631127262681e-06, + "loss": 0.9505, + "step": 343 + }, + { + "epoch": 0.6615384615384615, + "grad_norm": 0.765625, + "learning_rate": 8.007946346255176e-06, + "loss": 0.9581, + "step": 344 + }, + { + "epoch": 0.6634615384615384, + "grad_norm": 0.83984375, + "learning_rate": 7.995231152505815e-06, + "loss": 0.9068, + "step": 345 + }, + { + "epoch": 0.6653846153846154, + "grad_norm": 0.78125, + "learning_rate": 7.982485674575373e-06, + "loss": 0.9159, + "step": 346 + }, + { + "epoch": 0.6673076923076923, + "grad_norm": 0.765625, + "learning_rate": 7.96971004133082e-06, + "loss": 0.9015, + "step": 347 + }, + { + "epoch": 0.6692307692307692, + "grad_norm": 0.796875, + "learning_rate": 7.95690438194402e-06, + "loss": 0.9531, + "step": 348 + }, + { + "epoch": 0.6711538461538461, + "grad_norm": 0.8046875, + "learning_rate": 7.944068825890424e-06, + "loss": 0.8971, + "step": 349 + }, + { + "epoch": 0.6730769230769231, + "grad_norm": 0.83203125, + "learning_rate": 7.931203502947762e-06, + "loss": 0.868, + "step": 350 + }, + { + "epoch": 0.675, + "grad_norm": 0.734375, + "learning_rate": 7.918308543194735e-06, + "loss": 0.9151, + "step": 351 + }, + { + "epoch": 0.676923076923077, + "grad_norm": 0.77734375, + "learning_rate": 7.905384077009693e-06, + "loss": 0.9949, + "step": 352 + }, + { + "epoch": 0.6788461538461539, + "grad_norm": 0.77734375, + "learning_rate": 7.892430235069317e-06, + "loss": 0.9025, + "step": 353 + }, + { + "epoch": 0.6807692307692308, + "grad_norm": 0.76171875, + "learning_rate": 7.879447148347307e-06, + "loss": 0.8969, + "step": 354 + }, + { + "epoch": 0.6826923076923077, + "grad_norm": 0.8828125, + "learning_rate": 7.866434948113046e-06, + "loss": 0.9086, + "step": 355 + }, + { + "epoch": 0.6846153846153846, + "grad_norm": 0.78125, + "learning_rate": 7.853393765930279e-06, + "loss": 0.865, + "step": 356 + }, + { + "epoch": 0.6865384615384615, + "grad_norm": 0.78515625, + "learning_rate": 7.84032373365578e-06, + "loss": 0.9514, + "step": 357 + }, + { + "epoch": 0.6884615384615385, + "grad_norm": 0.7421875, + "learning_rate": 7.827224983438024e-06, + "loss": 0.8866, + "step": 358 + }, + { + "epoch": 0.6903846153846154, + "grad_norm": 0.8671875, + "learning_rate": 7.814097647715848e-06, + "loss": 0.8856, + "step": 359 + }, + { + "epoch": 0.6923076923076923, + "grad_norm": 0.78125, + "learning_rate": 7.800941859217103e-06, + "loss": 0.8864, + "step": 360 + }, + { + "epoch": 0.6942307692307692, + "grad_norm": 0.75390625, + "learning_rate": 7.787757750957335e-06, + "loss": 0.9212, + "step": 361 + }, + { + "epoch": 0.6961538461538461, + "grad_norm": 0.796875, + "learning_rate": 7.77454545623841e-06, + "loss": 0.9006, + "step": 362 + }, + { + "epoch": 0.698076923076923, + "grad_norm": 0.7578125, + "learning_rate": 7.761305108647188e-06, + "loss": 0.9427, + "step": 363 + }, + { + "epoch": 0.7, + "grad_norm": 0.78125, + "learning_rate": 7.74803684205417e-06, + "loss": 0.9583, + "step": 364 + }, + { + "epoch": 0.7019230769230769, + "grad_norm": 0.80078125, + "learning_rate": 7.734740790612137e-06, + "loss": 0.9303, + "step": 365 + }, + { + "epoch": 0.7038461538461539, + "grad_norm": 0.796875, + "learning_rate": 7.72141708875479e-06, + "loss": 0.9017, + "step": 366 + }, + { + "epoch": 0.7057692307692308, + "grad_norm": 0.7421875, + "learning_rate": 7.708065871195413e-06, + "loss": 0.9247, + "step": 367 + }, + { + "epoch": 0.7076923076923077, + "grad_norm": 0.7109375, + "learning_rate": 7.694687272925487e-06, + "loss": 0.8598, + "step": 368 + }, + { + "epoch": 0.7096153846153846, + "grad_norm": 0.73046875, + "learning_rate": 7.681281429213328e-06, + "loss": 0.9719, + "step": 369 + }, + { + "epoch": 0.7115384615384616, + "grad_norm": 0.76171875, + "learning_rate": 7.667848475602735e-06, + "loss": 0.9588, + "step": 370 + }, + { + "epoch": 0.7134615384615385, + "grad_norm": 0.73046875, + "learning_rate": 7.654388547911605e-06, + "loss": 0.8185, + "step": 371 + }, + { + "epoch": 0.7153846153846154, + "grad_norm": 0.73046875, + "learning_rate": 7.640901782230567e-06, + "loss": 0.93, + "step": 372 + }, + { + "epoch": 0.7173076923076923, + "grad_norm": 0.734375, + "learning_rate": 7.627388314921602e-06, + "loss": 0.9846, + "step": 373 + }, + { + "epoch": 0.7192307692307692, + "grad_norm": 0.74609375, + "learning_rate": 7.613848282616665e-06, + "loss": 0.9807, + "step": 374 + }, + { + "epoch": 0.7211538461538461, + "grad_norm": 0.75390625, + "learning_rate": 7.600281822216307e-06, + "loss": 0.9011, + "step": 375 + }, + { + "epoch": 0.7230769230769231, + "grad_norm": 0.74609375, + "learning_rate": 7.586689070888284e-06, + "loss": 0.8961, + "step": 376 + }, + { + "epoch": 0.725, + "grad_norm": 0.7890625, + "learning_rate": 7.5730701660661795e-06, + "loss": 0.9279, + "step": 377 + }, + { + "epoch": 0.7269230769230769, + "grad_norm": 0.74609375, + "learning_rate": 7.559425245448006e-06, + "loss": 0.9177, + "step": 378 + }, + { + "epoch": 0.7288461538461538, + "grad_norm": 0.734375, + "learning_rate": 7.5457544469948164e-06, + "loss": 0.9309, + "step": 379 + }, + { + "epoch": 0.7307692307692307, + "grad_norm": 0.75, + "learning_rate": 7.532057908929311e-06, + "loss": 0.8937, + "step": 380 + }, + { + "epoch": 0.7326923076923076, + "grad_norm": 0.78515625, + "learning_rate": 7.5183357697344395e-06, + "loss": 0.895, + "step": 381 + }, + { + "epoch": 0.7346153846153847, + "grad_norm": 0.7421875, + "learning_rate": 7.504588168151994e-06, + "loss": 0.9167, + "step": 382 + }, + { + "epoch": 0.7365384615384616, + "grad_norm": 0.7421875, + "learning_rate": 7.4908152431812175e-06, + "loss": 0.921, + "step": 383 + }, + { + "epoch": 0.7384615384615385, + "grad_norm": 0.75390625, + "learning_rate": 7.477017134077389e-06, + "loss": 0.8987, + "step": 384 + }, + { + "epoch": 0.7403846153846154, + "grad_norm": 0.7578125, + "learning_rate": 7.4631939803504215e-06, + "loss": 0.8866, + "step": 385 + }, + { + "epoch": 0.7423076923076923, + "grad_norm": 0.73828125, + "learning_rate": 7.449345921763449e-06, + "loss": 0.8745, + "step": 386 + }, + { + "epoch": 0.7442307692307693, + "grad_norm": 0.75390625, + "learning_rate": 7.435473098331411e-06, + "loss": 0.865, + "step": 387 + }, + { + "epoch": 0.7461538461538462, + "grad_norm": 0.78125, + "learning_rate": 7.421575650319641e-06, + "loss": 0.8841, + "step": 388 + }, + { + "epoch": 0.7480769230769231, + "grad_norm": 0.765625, + "learning_rate": 7.407653718242449e-06, + "loss": 0.9637, + "step": 389 + }, + { + "epoch": 0.75, + "grad_norm": 0.70703125, + "learning_rate": 7.393707442861693e-06, + "loss": 0.914, + "step": 390 + }, + { + "epoch": 0.75, + "eval_loss": 0.9376137256622314, + "eval_runtime": 34.5412, + "eval_samples_per_second": 67.832, + "eval_steps_per_second": 16.965, + "step": 390 + }, + { + "epoch": 0.7519230769230769, + "grad_norm": 0.74609375, + "learning_rate": 7.379736965185369e-06, + "loss": 0.9394, + "step": 391 + }, + { + "epoch": 0.7538461538461538, + "grad_norm": 0.77734375, + "learning_rate": 7.365742426466169e-06, + "loss": 0.9122, + "step": 392 + }, + { + "epoch": 0.7557692307692307, + "grad_norm": 0.73046875, + "learning_rate": 7.3517239682000675e-06, + "loss": 0.9033, + "step": 393 + }, + { + "epoch": 0.7576923076923077, + "grad_norm": 0.97265625, + "learning_rate": 7.337681732124882e-06, + "loss": 0.8908, + "step": 394 + }, + { + "epoch": 0.7596153846153846, + "grad_norm": 0.734375, + "learning_rate": 7.323615860218844e-06, + "loss": 0.8938, + "step": 395 + }, + { + "epoch": 0.7615384615384615, + "grad_norm": 0.75390625, + "learning_rate": 7.30952649469916e-06, + "loss": 0.9013, + "step": 396 + }, + { + "epoch": 0.7634615384615384, + "grad_norm": 0.75390625, + "learning_rate": 7.295413778020579e-06, + "loss": 0.9203, + "step": 397 + }, + { + "epoch": 0.7653846153846153, + "grad_norm": 0.76171875, + "learning_rate": 7.281277852873947e-06, + "loss": 0.9713, + "step": 398 + }, + { + "epoch": 0.7673076923076924, + "grad_norm": 0.7890625, + "learning_rate": 7.267118862184767e-06, + "loss": 0.9376, + "step": 399 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 0.765625, + "learning_rate": 7.252936949111749e-06, + "loss": 0.9329, + "step": 400 + }, + { + "epoch": 0.7711538461538462, + "grad_norm": 0.734375, + "learning_rate": 7.2387322570453724e-06, + "loss": 0.8421, + "step": 401 + }, + { + "epoch": 0.7730769230769231, + "grad_norm": 0.75, + "learning_rate": 7.224504929606429e-06, + "loss": 0.8929, + "step": 402 + }, + { + "epoch": 0.775, + "grad_norm": 0.75390625, + "learning_rate": 7.210255110644569e-06, + "loss": 0.9063, + "step": 403 + }, + { + "epoch": 0.7769230769230769, + "grad_norm": 0.7265625, + "learning_rate": 7.195982944236853e-06, + "loss": 0.9735, + "step": 404 + }, + { + "epoch": 0.7788461538461539, + "grad_norm": 0.7109375, + "learning_rate": 7.181688574686292e-06, + "loss": 0.8794, + "step": 405 + }, + { + "epoch": 0.7807692307692308, + "grad_norm": 0.75, + "learning_rate": 7.167372146520386e-06, + "loss": 0.8891, + "step": 406 + }, + { + "epoch": 0.7826923076923077, + "grad_norm": 0.71484375, + "learning_rate": 7.15303380448967e-06, + "loss": 0.8951, + "step": 407 + }, + { + "epoch": 0.7846153846153846, + "grad_norm": 0.78125, + "learning_rate": 7.138673693566241e-06, + "loss": 0.897, + "step": 408 + }, + { + "epoch": 0.7865384615384615, + "grad_norm": 0.796875, + "learning_rate": 7.1242919589422974e-06, + "loss": 0.9431, + "step": 409 + }, + { + "epoch": 0.7884615384615384, + "grad_norm": 0.74609375, + "learning_rate": 7.1098887460286745e-06, + "loss": 0.8704, + "step": 410 + }, + { + "epoch": 0.7903846153846154, + "grad_norm": 0.765625, + "learning_rate": 7.095464200453366e-06, + "loss": 0.9813, + "step": 411 + }, + { + "epoch": 0.7923076923076923, + "grad_norm": 0.734375, + "learning_rate": 7.081018468060057e-06, + "loss": 0.8657, + "step": 412 + }, + { + "epoch": 0.7942307692307692, + "grad_norm": 0.7578125, + "learning_rate": 7.066551694906651e-06, + "loss": 0.9216, + "step": 413 + }, + { + "epoch": 0.7961538461538461, + "grad_norm": 0.734375, + "learning_rate": 7.052064027263785e-06, + "loss": 0.9203, + "step": 414 + }, + { + "epoch": 0.7980769230769231, + "grad_norm": 0.73828125, + "learning_rate": 7.0375556116133605e-06, + "loss": 0.9002, + "step": 415 + }, + { + "epoch": 0.8, + "grad_norm": 0.73046875, + "learning_rate": 7.023026594647057e-06, + "loss": 0.9279, + "step": 416 + }, + { + "epoch": 0.801923076923077, + "grad_norm": 0.71875, + "learning_rate": 7.008477123264849e-06, + "loss": 0.8851, + "step": 417 + }, + { + "epoch": 0.8038461538461539, + "grad_norm": 0.72265625, + "learning_rate": 6.9939073445735205e-06, + "loss": 0.8718, + "step": 418 + }, + { + "epoch": 0.8057692307692308, + "grad_norm": 0.703125, + "learning_rate": 6.9793174058851805e-06, + "loss": 0.8874, + "step": 419 + }, + { + "epoch": 0.8076923076923077, + "grad_norm": 0.73828125, + "learning_rate": 6.964707454715772e-06, + "loss": 0.8747, + "step": 420 + }, + { + "epoch": 0.8096153846153846, + "grad_norm": 0.7578125, + "learning_rate": 6.9500776387835785e-06, + "loss": 0.9146, + "step": 421 + }, + { + "epoch": 0.8115384615384615, + "grad_norm": 0.78515625, + "learning_rate": 6.935428106007734e-06, + "loss": 0.9598, + "step": 422 + }, + { + "epoch": 0.8134615384615385, + "grad_norm": 0.75390625, + "learning_rate": 6.920759004506723e-06, + "loss": 0.873, + "step": 423 + }, + { + "epoch": 0.8153846153846154, + "grad_norm": 0.80859375, + "learning_rate": 6.906070482596887e-06, + "loss": 0.9395, + "step": 424 + }, + { + "epoch": 0.8173076923076923, + "grad_norm": 0.71484375, + "learning_rate": 6.891362688790925e-06, + "loss": 0.8713, + "step": 425 + }, + { + "epoch": 0.8192307692307692, + "grad_norm": 0.7109375, + "learning_rate": 6.876635771796386e-06, + "loss": 0.8427, + "step": 426 + }, + { + "epoch": 0.8211538461538461, + "grad_norm": 0.75, + "learning_rate": 6.8618898805141744e-06, + "loss": 0.9148, + "step": 427 + }, + { + "epoch": 0.823076923076923, + "grad_norm": 0.74609375, + "learning_rate": 6.847125164037036e-06, + "loss": 0.8788, + "step": 428 + }, + { + "epoch": 0.825, + "grad_norm": 0.72265625, + "learning_rate": 6.832341771648057e-06, + "loss": 0.8523, + "step": 429 + }, + { + "epoch": 0.8269230769230769, + "grad_norm": 0.7265625, + "learning_rate": 6.817539852819149e-06, + "loss": 0.869, + "step": 430 + }, + { + "epoch": 0.8288461538461539, + "grad_norm": 0.6953125, + "learning_rate": 6.802719557209547e-06, + "loss": 0.8934, + "step": 431 + }, + { + "epoch": 0.8307692307692308, + "grad_norm": 0.7265625, + "learning_rate": 6.787881034664283e-06, + "loss": 0.9127, + "step": 432 + }, + { + "epoch": 0.8326923076923077, + "grad_norm": 0.7421875, + "learning_rate": 6.773024435212678e-06, + "loss": 0.9617, + "step": 433 + }, + { + "epoch": 0.8346153846153846, + "grad_norm": 0.7421875, + "learning_rate": 6.758149909066832e-06, + "loss": 0.8918, + "step": 434 + }, + { + "epoch": 0.8365384615384616, + "grad_norm": 0.7734375, + "learning_rate": 6.743257606620094e-06, + "loss": 0.9721, + "step": 435 + }, + { + "epoch": 0.8384615384615385, + "grad_norm": 0.734375, + "learning_rate": 6.728347678445539e-06, + "loss": 0.9183, + "step": 436 + }, + { + "epoch": 0.8403846153846154, + "grad_norm": 0.7265625, + "learning_rate": 6.713420275294467e-06, + "loss": 0.8995, + "step": 437 + }, + { + "epoch": 0.8423076923076923, + "grad_norm": 0.75, + "learning_rate": 6.69847554809485e-06, + "loss": 0.879, + "step": 438 + }, + { + "epoch": 0.8442307692307692, + "grad_norm": 0.7421875, + "learning_rate": 6.683513647949826e-06, + "loss": 0.927, + "step": 439 + }, + { + "epoch": 0.8461538461538461, + "grad_norm": 0.75390625, + "learning_rate": 6.668534726136166e-06, + "loss": 0.9, + "step": 440 + }, + { + "epoch": 0.8480769230769231, + "grad_norm": 0.74609375, + "learning_rate": 6.653538934102743e-06, + "loss": 0.8526, + "step": 441 + }, + { + "epoch": 0.85, + "grad_norm": 0.75, + "learning_rate": 6.638526423468999e-06, + "loss": 0.8354, + "step": 442 + }, + { + "epoch": 0.8519230769230769, + "grad_norm": 0.75, + "learning_rate": 6.6234973460234184e-06, + "loss": 0.8852, + "step": 443 + }, + { + "epoch": 0.8538461538461538, + "grad_norm": 0.75, + "learning_rate": 6.608451853721985e-06, + "loss": 0.9275, + "step": 444 + }, + { + "epoch": 0.8557692307692307, + "grad_norm": 0.72265625, + "learning_rate": 6.593390098686653e-06, + "loss": 0.9023, + "step": 445 + }, + { + "epoch": 0.8576923076923076, + "grad_norm": 0.75, + "learning_rate": 6.578312233203804e-06, + "loss": 0.8804, + "step": 446 + }, + { + "epoch": 0.8596153846153847, + "grad_norm": 0.7109375, + "learning_rate": 6.563218409722712e-06, + "loss": 0.9229, + "step": 447 + }, + { + "epoch": 0.8615384615384616, + "grad_norm": 0.71484375, + "learning_rate": 6.548108780853995e-06, + "loss": 0.8995, + "step": 448 + }, + { + "epoch": 0.8634615384615385, + "grad_norm": 0.703125, + "learning_rate": 6.532983499368078e-06, + "loss": 0.8847, + "step": 449 + }, + { + "epoch": 0.8653846153846154, + "grad_norm": 0.71484375, + "learning_rate": 6.5178427181936485e-06, + "loss": 0.923, + "step": 450 + }, + { + "epoch": 0.8673076923076923, + "grad_norm": 0.73828125, + "learning_rate": 6.502686590416105e-06, + "loss": 0.8987, + "step": 451 + }, + { + "epoch": 0.8692307692307693, + "grad_norm": 0.7109375, + "learning_rate": 6.487515269276015e-06, + "loss": 0.9345, + "step": 452 + }, + { + "epoch": 0.8711538461538462, + "grad_norm": 0.74609375, + "learning_rate": 6.472328908167562e-06, + "loss": 0.8575, + "step": 453 + }, + { + "epoch": 0.8730769230769231, + "grad_norm": 0.734375, + "learning_rate": 6.457127660636994e-06, + "loss": 0.9209, + "step": 454 + }, + { + "epoch": 0.875, + "grad_norm": 0.74609375, + "learning_rate": 6.441911680381074e-06, + "loss": 0.8873, + "step": 455 + }, + { + "epoch": 0.875, + "eval_loss": 0.9346491098403931, + "eval_runtime": 34.5947, + "eval_samples_per_second": 67.727, + "eval_steps_per_second": 16.939, + "step": 455 + }, + { + "epoch": 0.8769230769230769, + "grad_norm": 0.73046875, + "learning_rate": 6.426681121245527e-06, + "loss": 0.9187, + "step": 456 + }, + { + "epoch": 0.8788461538461538, + "grad_norm": 0.77734375, + "learning_rate": 6.411436137223479e-06, + "loss": 0.9509, + "step": 457 + }, + { + "epoch": 0.8807692307692307, + "grad_norm": 0.75, + "learning_rate": 6.396176882453902e-06, + "loss": 0.9401, + "step": 458 + }, + { + "epoch": 0.8826923076923077, + "grad_norm": 0.73828125, + "learning_rate": 6.38090351122006e-06, + "loss": 0.8767, + "step": 459 + }, + { + "epoch": 0.8846153846153846, + "grad_norm": 0.72265625, + "learning_rate": 6.365616177947945e-06, + "loss": 0.8637, + "step": 460 + }, + { + "epoch": 0.8865384615384615, + "grad_norm": 0.74609375, + "learning_rate": 6.350315037204714e-06, + "loss": 0.9081, + "step": 461 + }, + { + "epoch": 0.8884615384615384, + "grad_norm": 0.73828125, + "learning_rate": 6.335000243697134e-06, + "loss": 0.9054, + "step": 462 + }, + { + "epoch": 0.8903846153846153, + "grad_norm": 0.73046875, + "learning_rate": 6.319671952270004e-06, + "loss": 0.9045, + "step": 463 + }, + { + "epoch": 0.8923076923076924, + "grad_norm": 0.71484375, + "learning_rate": 6.304330317904605e-06, + "loss": 0.9227, + "step": 464 + }, + { + "epoch": 0.8942307692307693, + "grad_norm": 0.69921875, + "learning_rate": 6.288975495717124e-06, + "loss": 0.8882, + "step": 465 + }, + { + "epoch": 0.8961538461538462, + "grad_norm": 0.76171875, + "learning_rate": 6.273607640957085e-06, + "loss": 0.9967, + "step": 466 + }, + { + "epoch": 0.8980769230769231, + "grad_norm": 0.73046875, + "learning_rate": 6.258226909005783e-06, + "loss": 0.9474, + "step": 467 + }, + { + "epoch": 0.9, + "grad_norm": 0.7578125, + "learning_rate": 6.2428334553747135e-06, + "loss": 0.912, + "step": 468 + }, + { + "epoch": 0.9019230769230769, + "grad_norm": 0.74609375, + "learning_rate": 6.227427435703997e-06, + "loss": 0.9355, + "step": 469 + }, + { + "epoch": 0.9038461538461539, + "grad_norm": 0.75, + "learning_rate": 6.212009005760805e-06, + "loss": 0.9328, + "step": 470 + }, + { + "epoch": 0.9057692307692308, + "grad_norm": 0.74609375, + "learning_rate": 6.1965783214377895e-06, + "loss": 0.9323, + "step": 471 + }, + { + "epoch": 0.9076923076923077, + "grad_norm": 0.71484375, + "learning_rate": 6.181135538751504e-06, + "loss": 0.8865, + "step": 472 + }, + { + "epoch": 0.9096153846153846, + "grad_norm": 0.73046875, + "learning_rate": 6.165680813840822e-06, + "loss": 0.9123, + "step": 473 + }, + { + "epoch": 0.9115384615384615, + "grad_norm": 0.73828125, + "learning_rate": 6.150214302965368e-06, + "loss": 0.9209, + "step": 474 + }, + { + "epoch": 0.9134615384615384, + "grad_norm": 0.7109375, + "learning_rate": 6.134736162503929e-06, + "loss": 0.9377, + "step": 475 + }, + { + "epoch": 0.9153846153846154, + "grad_norm": 0.734375, + "learning_rate": 6.119246548952877e-06, + "loss": 0.9317, + "step": 476 + }, + { + "epoch": 0.9173076923076923, + "grad_norm": 0.71484375, + "learning_rate": 6.103745618924587e-06, + "loss": 0.8839, + "step": 477 + }, + { + "epoch": 0.9192307692307692, + "grad_norm": 0.71484375, + "learning_rate": 6.088233529145849e-06, + "loss": 0.8823, + "step": 478 + }, + { + "epoch": 0.9211538461538461, + "grad_norm": 0.7421875, + "learning_rate": 6.072710436456293e-06, + "loss": 0.9031, + "step": 479 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 0.7109375, + "learning_rate": 6.057176497806791e-06, + "loss": 0.9132, + "step": 480 + }, + { + "epoch": 0.925, + "grad_norm": 0.7734375, + "learning_rate": 6.041631870257882e-06, + "loss": 0.8772, + "step": 481 + }, + { + "epoch": 0.926923076923077, + "grad_norm": 0.73046875, + "learning_rate": 6.026076710978172e-06, + "loss": 0.901, + "step": 482 + }, + { + "epoch": 0.9288461538461539, + "grad_norm": 0.74609375, + "learning_rate": 6.010511177242757e-06, + "loss": 0.9196, + "step": 483 + }, + { + "epoch": 0.9307692307692308, + "grad_norm": 0.73046875, + "learning_rate": 5.994935426431627e-06, + "loss": 0.9718, + "step": 484 + }, + { + "epoch": 0.9326923076923077, + "grad_norm": 0.76953125, + "learning_rate": 5.979349616028067e-06, + "loss": 0.963, + "step": 485 + }, + { + "epoch": 0.9346153846153846, + "grad_norm": 0.70703125, + "learning_rate": 5.963753903617084e-06, + "loss": 0.9048, + "step": 486 + }, + { + "epoch": 0.9365384615384615, + "grad_norm": 0.7578125, + "learning_rate": 5.948148446883794e-06, + "loss": 0.9705, + "step": 487 + }, + { + "epoch": 0.9384615384615385, + "grad_norm": 0.71484375, + "learning_rate": 5.932533403611835e-06, + "loss": 0.8878, + "step": 488 + }, + { + "epoch": 0.9403846153846154, + "grad_norm": 0.73046875, + "learning_rate": 5.916908931681781e-06, + "loss": 0.9049, + "step": 489 + }, + { + "epoch": 0.9423076923076923, + "grad_norm": 0.703125, + "learning_rate": 5.90127518906953e-06, + "loss": 0.8733, + "step": 490 + }, + { + "epoch": 0.9442307692307692, + "grad_norm": 0.69921875, + "learning_rate": 5.885632333844714e-06, + "loss": 0.8896, + "step": 491 + }, + { + "epoch": 0.9461538461538461, + "grad_norm": 0.7109375, + "learning_rate": 5.8699805241691065e-06, + "loss": 0.9191, + "step": 492 + }, + { + "epoch": 0.948076923076923, + "grad_norm": 0.69921875, + "learning_rate": 5.854319918295012e-06, + "loss": 0.8721, + "step": 493 + }, + { + "epoch": 0.95, + "grad_norm": 0.73828125, + "learning_rate": 5.838650674563674e-06, + "loss": 0.8746, + "step": 494 + }, + { + "epoch": 0.9519230769230769, + "grad_norm": 0.74609375, + "learning_rate": 5.82297295140367e-06, + "loss": 0.9072, + "step": 495 + }, + { + "epoch": 0.9538461538461539, + "grad_norm": 0.74609375, + "learning_rate": 5.807286907329315e-06, + "loss": 0.8981, + "step": 496 + }, + { + "epoch": 0.9557692307692308, + "grad_norm": 0.734375, + "learning_rate": 5.79159270093905e-06, + "loss": 0.9358, + "step": 497 + }, + { + "epoch": 0.9576923076923077, + "grad_norm": 0.7109375, + "learning_rate": 5.7758904909138495e-06, + "loss": 0.915, + "step": 498 + }, + { + "epoch": 0.9596153846153846, + "grad_norm": 0.72265625, + "learning_rate": 5.760180436015604e-06, + "loss": 0.8652, + "step": 499 + }, + { + "epoch": 0.9615384615384616, + "grad_norm": 0.7265625, + "learning_rate": 5.74446269508553e-06, + "loss": 0.8926, + "step": 500 + }, + { + "epoch": 0.9634615384615385, + "grad_norm": 0.71484375, + "learning_rate": 5.7287374270425475e-06, + "loss": 0.8889, + "step": 501 + }, + { + "epoch": 0.9653846153846154, + "grad_norm": 0.703125, + "learning_rate": 5.7130047908816884e-06, + "loss": 0.9027, + "step": 502 + }, + { + "epoch": 0.9673076923076923, + "grad_norm": 0.7421875, + "learning_rate": 5.69726494567248e-06, + "loss": 0.9419, + "step": 503 + }, + { + "epoch": 0.9692307692307692, + "grad_norm": 0.73046875, + "learning_rate": 5.681518050557336e-06, + "loss": 0.9396, + "step": 504 + }, + { + "epoch": 0.9711538461538461, + "grad_norm": 0.71875, + "learning_rate": 5.6657642647499545e-06, + "loss": 0.8828, + "step": 505 + }, + { + "epoch": 0.9730769230769231, + "grad_norm": 0.765625, + "learning_rate": 5.650003747533701e-06, + "loss": 0.944, + "step": 506 + }, + { + "epoch": 0.975, + "grad_norm": 0.73046875, + "learning_rate": 5.6342366582600035e-06, + "loss": 0.9072, + "step": 507 + }, + { + "epoch": 0.9769230769230769, + "grad_norm": 0.7265625, + "learning_rate": 5.61846315634674e-06, + "loss": 0.9015, + "step": 508 + }, + { + "epoch": 0.9788461538461538, + "grad_norm": 0.7265625, + "learning_rate": 5.6026834012766155e-06, + "loss": 0.9117, + "step": 509 + }, + { + "epoch": 0.9807692307692307, + "grad_norm": 0.75, + "learning_rate": 5.586897552595573e-06, + "loss": 0.971, + "step": 510 + }, + { + "epoch": 0.9826923076923076, + "grad_norm": 0.73828125, + "learning_rate": 5.571105769911159e-06, + "loss": 0.8729, + "step": 511 + }, + { + "epoch": 0.9846153846153847, + "grad_norm": 0.72265625, + "learning_rate": 5.555308212890917e-06, + "loss": 0.9132, + "step": 512 + }, + { + "epoch": 0.9865384615384616, + "grad_norm": 0.72265625, + "learning_rate": 5.539505041260779e-06, + "loss": 0.8606, + "step": 513 + }, + { + "epoch": 0.9884615384615385, + "grad_norm": 0.703125, + "learning_rate": 5.523696414803438e-06, + "loss": 0.8937, + "step": 514 + }, + { + "epoch": 0.9903846153846154, + "grad_norm": 0.73046875, + "learning_rate": 5.507882493356745e-06, + "loss": 0.913, + "step": 515 + }, + { + "epoch": 0.9923076923076923, + "grad_norm": 4.5625, + "learning_rate": 5.49206343681209e-06, + "loss": 0.862, + "step": 516 + }, + { + "epoch": 0.9942307692307693, + "grad_norm": 0.703125, + "learning_rate": 5.476239405112775e-06, + "loss": 0.8662, + "step": 517 + }, + { + "epoch": 0.9961538461538462, + "grad_norm": 0.73046875, + "learning_rate": 5.460410558252408e-06, + "loss": 0.8443, + "step": 518 + }, + { + "epoch": 0.9980769230769231, + "grad_norm": 0.7578125, + "learning_rate": 5.444577056273284e-06, + "loss": 0.9569, + "step": 519 + }, + { + "epoch": 1.0, + "grad_norm": 0.73828125, + "learning_rate": 5.428739059264767e-06, + "loss": 0.8854, + "step": 520 + }, + { + "epoch": 1.0, + "eval_loss": 0.9325999617576599, + "eval_runtime": 34.5297, + "eval_samples_per_second": 67.855, + "eval_steps_per_second": 16.971, + "step": 520 + }, + { + "epoch": 1.001923076923077, + "grad_norm": 0.7265625, + "learning_rate": 5.412896727361663e-06, + "loss": 0.9419, + "step": 521 + }, + { + "epoch": 1.0038461538461538, + "grad_norm": 0.7265625, + "learning_rate": 5.39705022074261e-06, + "loss": 0.843, + "step": 522 + }, + { + "epoch": 1.0057692307692307, + "grad_norm": 0.73046875, + "learning_rate": 5.381199699628459e-06, + "loss": 0.8494, + "step": 523 + }, + { + "epoch": 1.0076923076923077, + "grad_norm": 0.7265625, + "learning_rate": 5.365345324280646e-06, + "loss": 0.8623, + "step": 524 + }, + { + "epoch": 1.0096153846153846, + "grad_norm": 0.7421875, + "learning_rate": 5.349487254999579e-06, + "loss": 0.8716, + "step": 525 + }, + { + "epoch": 1.0115384615384615, + "grad_norm": 0.68359375, + "learning_rate": 5.333625652123014e-06, + "loss": 0.9326, + "step": 526 + }, + { + "epoch": 1.0134615384615384, + "grad_norm": 0.75, + "learning_rate": 5.317760676024436e-06, + "loss": 0.8741, + "step": 527 + }, + { + "epoch": 1.0153846153846153, + "grad_norm": 0.71875, + "learning_rate": 5.301892487111431e-06, + "loss": 0.8816, + "step": 528 + }, + { + "epoch": 1.0173076923076922, + "grad_norm": 0.71484375, + "learning_rate": 5.286021245824075e-06, + "loss": 0.8976, + "step": 529 + }, + { + "epoch": 1.0192307692307692, + "grad_norm": 0.703125, + "learning_rate": 5.270147112633304e-06, + "loss": 0.8997, + "step": 530 + }, + { + "epoch": 1.021153846153846, + "grad_norm": 0.74609375, + "learning_rate": 5.254270248039291e-06, + "loss": 0.9766, + "step": 531 + }, + { + "epoch": 1.023076923076923, + "grad_norm": 0.71484375, + "learning_rate": 5.238390812569828e-06, + "loss": 0.8946, + "step": 532 + }, + { + "epoch": 1.025, + "grad_norm": 0.70703125, + "learning_rate": 5.222508966778702e-06, + "loss": 0.909, + "step": 533 + }, + { + "epoch": 1.0269230769230768, + "grad_norm": 0.69140625, + "learning_rate": 5.206624871244066e-06, + "loss": 0.8777, + "step": 534 + }, + { + "epoch": 1.0288461538461537, + "grad_norm": 0.7421875, + "learning_rate": 5.190738686566826e-06, + "loss": 0.8775, + "step": 535 + }, + { + "epoch": 1.0307692307692307, + "grad_norm": 0.7265625, + "learning_rate": 5.1748505733690035e-06, + "loss": 0.8591, + "step": 536 + }, + { + "epoch": 1.0326923076923078, + "grad_norm": 0.76171875, + "learning_rate": 5.158960692292122e-06, + "loss": 0.942, + "step": 537 + }, + { + "epoch": 1.0346153846153847, + "grad_norm": 0.734375, + "learning_rate": 5.143069203995586e-06, + "loss": 0.8728, + "step": 538 + }, + { + "epoch": 1.0365384615384616, + "grad_norm": 0.74609375, + "learning_rate": 5.1271762691550375e-06, + "loss": 0.8437, + "step": 539 + }, + { + "epoch": 1.0384615384615385, + "grad_norm": 0.69921875, + "learning_rate": 5.111282048460753e-06, + "loss": 0.8885, + "step": 540 + }, + { + "epoch": 1.0403846153846155, + "grad_norm": 0.69140625, + "learning_rate": 5.095386702616012e-06, + "loss": 0.9171, + "step": 541 + }, + { + "epoch": 1.0423076923076924, + "grad_norm": 0.70703125, + "learning_rate": 5.079490392335463e-06, + "loss": 0.8742, + "step": 542 + }, + { + "epoch": 1.0442307692307693, + "grad_norm": 0.73046875, + "learning_rate": 5.06359327834351e-06, + "loss": 0.9521, + "step": 543 + }, + { + "epoch": 1.0461538461538462, + "grad_norm": 0.70703125, + "learning_rate": 5.047695521372681e-06, + "loss": 0.913, + "step": 544 + }, + { + "epoch": 1.0480769230769231, + "grad_norm": 0.75390625, + "learning_rate": 5.031797282162007e-06, + "loss": 0.9162, + "step": 545 + }, + { + "epoch": 1.05, + "grad_norm": 0.734375, + "learning_rate": 5.015898721455394e-06, + "loss": 0.9098, + "step": 546 + }, + { + "epoch": 1.051923076923077, + "grad_norm": 0.7109375, + "learning_rate": 5e-06, + "loss": 0.8851, + "step": 547 + }, + { + "epoch": 1.0538461538461539, + "grad_norm": 0.7734375, + "learning_rate": 4.984101278544607e-06, + "loss": 0.9895, + "step": 548 + }, + { + "epoch": 1.0557692307692308, + "grad_norm": 0.7265625, + "learning_rate": 4.968202717837996e-06, + "loss": 0.8406, + "step": 549 + }, + { + "epoch": 1.0576923076923077, + "grad_norm": 0.72265625, + "learning_rate": 4.9523044786273214e-06, + "loss": 0.9372, + "step": 550 + }, + { + "epoch": 1.0596153846153846, + "grad_norm": 0.69921875, + "learning_rate": 4.936406721656492e-06, + "loss": 0.8837, + "step": 551 + }, + { + "epoch": 1.0615384615384615, + "grad_norm": 0.70703125, + "learning_rate": 4.92050960766454e-06, + "loss": 0.9099, + "step": 552 + }, + { + "epoch": 1.0634615384615385, + "grad_norm": 0.7265625, + "learning_rate": 4.9046132973839895e-06, + "loss": 0.8802, + "step": 553 + }, + { + "epoch": 1.0653846153846154, + "grad_norm": 0.73828125, + "learning_rate": 4.8887179515392465e-06, + "loss": 0.8892, + "step": 554 + }, + { + "epoch": 1.0673076923076923, + "grad_norm": 0.73046875, + "learning_rate": 4.872823730844966e-06, + "loss": 0.9367, + "step": 555 + }, + { + "epoch": 1.0692307692307692, + "grad_norm": 0.765625, + "learning_rate": 4.856930796004417e-06, + "loss": 0.999, + "step": 556 + }, + { + "epoch": 1.0711538461538461, + "grad_norm": 0.671875, + "learning_rate": 4.841039307707878e-06, + "loss": 0.8129, + "step": 557 + }, + { + "epoch": 1.073076923076923, + "grad_norm": 0.78515625, + "learning_rate": 4.825149426630999e-06, + "loss": 0.8832, + "step": 558 + }, + { + "epoch": 1.075, + "grad_norm": 0.6875, + "learning_rate": 4.809261313433176e-06, + "loss": 0.9148, + "step": 559 + }, + { + "epoch": 1.0769230769230769, + "grad_norm": 0.734375, + "learning_rate": 4.793375128755934e-06, + "loss": 0.9256, + "step": 560 + }, + { + "epoch": 1.0788461538461538, + "grad_norm": 0.7265625, + "learning_rate": 4.7774910332213005e-06, + "loss": 0.9012, + "step": 561 + }, + { + "epoch": 1.0807692307692307, + "grad_norm": 0.7265625, + "learning_rate": 4.761609187430174e-06, + "loss": 0.85, + "step": 562 + }, + { + "epoch": 1.0826923076923076, + "grad_norm": 1.1015625, + "learning_rate": 4.74572975196071e-06, + "loss": 0.9216, + "step": 563 + }, + { + "epoch": 1.0846153846153845, + "grad_norm": 0.703125, + "learning_rate": 4.7298528873666985e-06, + "loss": 0.8746, + "step": 564 + }, + { + "epoch": 1.0865384615384615, + "grad_norm": 0.75, + "learning_rate": 4.713978754175926e-06, + "loss": 0.8979, + "step": 565 + }, + { + "epoch": 1.0884615384615384, + "grad_norm": 0.7265625, + "learning_rate": 4.69810751288857e-06, + "loss": 0.888, + "step": 566 + }, + { + "epoch": 1.0903846153846153, + "grad_norm": 0.734375, + "learning_rate": 4.682239323975566e-06, + "loss": 0.9075, + "step": 567 + }, + { + "epoch": 1.0923076923076924, + "grad_norm": 0.7421875, + "learning_rate": 4.666374347876987e-06, + "loss": 0.8873, + "step": 568 + }, + { + "epoch": 1.0942307692307693, + "grad_norm": 0.71484375, + "learning_rate": 4.6505127450004216e-06, + "loss": 0.8379, + "step": 569 + }, + { + "epoch": 1.0961538461538463, + "grad_norm": 0.71875, + "learning_rate": 4.634654675719355e-06, + "loss": 0.9033, + "step": 570 + }, + { + "epoch": 1.0980769230769232, + "grad_norm": 0.71875, + "learning_rate": 4.618800300371543e-06, + "loss": 0.896, + "step": 571 + }, + { + "epoch": 1.1, + "grad_norm": 0.734375, + "learning_rate": 4.60294977925739e-06, + "loss": 0.8901, + "step": 572 + }, + { + "epoch": 1.101923076923077, + "grad_norm": 0.7421875, + "learning_rate": 4.587103272638339e-06, + "loss": 0.9841, + "step": 573 + }, + { + "epoch": 1.103846153846154, + "grad_norm": 0.72265625, + "learning_rate": 4.571260940735235e-06, + "loss": 0.8862, + "step": 574 + }, + { + "epoch": 1.1057692307692308, + "grad_norm": 0.71875, + "learning_rate": 4.555422943726715e-06, + "loss": 0.9193, + "step": 575 + }, + { + "epoch": 1.1076923076923078, + "grad_norm": 0.765625, + "learning_rate": 4.539589441747595e-06, + "loss": 0.9534, + "step": 576 + }, + { + "epoch": 1.1096153846153847, + "grad_norm": 0.7578125, + "learning_rate": 4.523760594887228e-06, + "loss": 0.9352, + "step": 577 + }, + { + "epoch": 1.1115384615384616, + "grad_norm": 0.69921875, + "learning_rate": 4.507936563187911e-06, + "loss": 0.8709, + "step": 578 + }, + { + "epoch": 1.1134615384615385, + "grad_norm": 0.76171875, + "learning_rate": 4.492117506643256e-06, + "loss": 0.9662, + "step": 579 + }, + { + "epoch": 1.1153846153846154, + "grad_norm": 0.671875, + "learning_rate": 4.476303585196563e-06, + "loss": 0.8556, + "step": 580 + }, + { + "epoch": 1.1173076923076923, + "grad_norm": 0.73046875, + "learning_rate": 4.460494958739223e-06, + "loss": 0.9715, + "step": 581 + }, + { + "epoch": 1.1192307692307693, + "grad_norm": 0.72265625, + "learning_rate": 4.444691787109085e-06, + "loss": 0.8285, + "step": 582 + }, + { + "epoch": 1.1211538461538462, + "grad_norm": 0.72265625, + "learning_rate": 4.428894230088842e-06, + "loss": 0.908, + "step": 583 + }, + { + "epoch": 1.123076923076923, + "grad_norm": 0.734375, + "learning_rate": 4.413102447404428e-06, + "loss": 0.946, + "step": 584 + }, + { + "epoch": 1.125, + "grad_norm": 0.7265625, + "learning_rate": 4.397316598723385e-06, + "loss": 0.9365, + "step": 585 + }, + { + "epoch": 1.125, + "eval_loss": 0.9315983653068542, + "eval_runtime": 34.5056, + "eval_samples_per_second": 67.902, + "eval_steps_per_second": 16.983, + "step": 585 + }, + { + "epoch": 1.126923076923077, + "grad_norm": 0.75390625, + "learning_rate": 4.381536843653262e-06, + "loss": 0.9313, + "step": 586 + }, + { + "epoch": 1.1288461538461538, + "grad_norm": 0.73046875, + "learning_rate": 4.365763341739996e-06, + "loss": 0.882, + "step": 587 + }, + { + "epoch": 1.1307692307692307, + "grad_norm": 0.73828125, + "learning_rate": 4.3499962524662995e-06, + "loss": 0.8806, + "step": 588 + }, + { + "epoch": 1.1326923076923077, + "grad_norm": 0.74609375, + "learning_rate": 4.334235735250047e-06, + "loss": 0.8719, + "step": 589 + }, + { + "epoch": 1.1346153846153846, + "grad_norm": 0.72265625, + "learning_rate": 4.318481949442665e-06, + "loss": 0.9148, + "step": 590 + }, + { + "epoch": 1.1365384615384615, + "grad_norm": 0.734375, + "learning_rate": 4.302735054327523e-06, + "loss": 0.8417, + "step": 591 + }, + { + "epoch": 1.1384615384615384, + "grad_norm": 0.734375, + "learning_rate": 4.286995209118313e-06, + "loss": 0.8802, + "step": 592 + }, + { + "epoch": 1.1403846153846153, + "grad_norm": 0.71875, + "learning_rate": 4.271262572957453e-06, + "loss": 0.9018, + "step": 593 + }, + { + "epoch": 1.1423076923076922, + "grad_norm": 0.68359375, + "learning_rate": 4.255537304914472e-06, + "loss": 0.8499, + "step": 594 + }, + { + "epoch": 1.1442307692307692, + "grad_norm": 0.74609375, + "learning_rate": 4.239819563984397e-06, + "loss": 0.948, + "step": 595 + }, + { + "epoch": 1.146153846153846, + "grad_norm": 0.71484375, + "learning_rate": 4.224109509086151e-06, + "loss": 0.9229, + "step": 596 + }, + { + "epoch": 1.148076923076923, + "grad_norm": 0.7109375, + "learning_rate": 4.2084072990609505e-06, + "loss": 0.8326, + "step": 597 + }, + { + "epoch": 1.15, + "grad_norm": 0.703125, + "learning_rate": 4.192713092670687e-06, + "loss": 0.899, + "step": 598 + }, + { + "epoch": 1.1519230769230768, + "grad_norm": 0.7265625, + "learning_rate": 4.17702704859633e-06, + "loss": 0.8957, + "step": 599 + }, + { + "epoch": 1.1538461538461537, + "grad_norm": 0.73828125, + "learning_rate": 4.161349325436328e-06, + "loss": 0.9516, + "step": 600 + }, + { + "epoch": 1.1557692307692307, + "grad_norm": 0.734375, + "learning_rate": 4.145680081704989e-06, + "loss": 0.9012, + "step": 601 + }, + { + "epoch": 1.1576923076923076, + "grad_norm": 0.6875, + "learning_rate": 4.1300194758308935e-06, + "loss": 0.8685, + "step": 602 + }, + { + "epoch": 1.1596153846153845, + "grad_norm": 0.79296875, + "learning_rate": 4.1143676661552876e-06, + "loss": 0.9122, + "step": 603 + }, + { + "epoch": 1.1615384615384616, + "grad_norm": 0.71484375, + "learning_rate": 4.098724810930472e-06, + "loss": 0.9732, + "step": 604 + }, + { + "epoch": 1.1634615384615385, + "grad_norm": 0.71484375, + "learning_rate": 4.08309106831822e-06, + "loss": 0.8518, + "step": 605 + }, + { + "epoch": 1.1653846153846155, + "grad_norm": 0.73046875, + "learning_rate": 4.067466596388166e-06, + "loss": 0.8326, + "step": 606 + }, + { + "epoch": 1.1673076923076924, + "grad_norm": 0.71484375, + "learning_rate": 4.051851553116208e-06, + "loss": 0.8629, + "step": 607 + }, + { + "epoch": 1.1692307692307693, + "grad_norm": 0.71875, + "learning_rate": 4.036246096382916e-06, + "loss": 0.9053, + "step": 608 + }, + { + "epoch": 1.1711538461538462, + "grad_norm": 0.73046875, + "learning_rate": 4.0206503839719335e-06, + "loss": 0.9191, + "step": 609 + }, + { + "epoch": 1.1730769230769231, + "grad_norm": 0.7734375, + "learning_rate": 4.0050645735683745e-06, + "loss": 0.9079, + "step": 610 + }, + { + "epoch": 1.175, + "grad_norm": 0.72265625, + "learning_rate": 3.989488822757244e-06, + "loss": 0.9048, + "step": 611 + }, + { + "epoch": 1.176923076923077, + "grad_norm": 0.71875, + "learning_rate": 3.973923289021829e-06, + "loss": 0.8945, + "step": 612 + }, + { + "epoch": 1.1788461538461539, + "grad_norm": 0.71484375, + "learning_rate": 3.9583681297421194e-06, + "loss": 0.8879, + "step": 613 + }, + { + "epoch": 1.1807692307692308, + "grad_norm": 0.79296875, + "learning_rate": 3.9428235021932104e-06, + "loss": 0.982, + "step": 614 + }, + { + "epoch": 1.1826923076923077, + "grad_norm": 0.73828125, + "learning_rate": 3.927289563543709e-06, + "loss": 0.8807, + "step": 615 + }, + { + "epoch": 1.1846153846153846, + "grad_norm": 0.73828125, + "learning_rate": 3.911766470854152e-06, + "loss": 0.9508, + "step": 616 + }, + { + "epoch": 1.1865384615384615, + "grad_norm": 0.7265625, + "learning_rate": 3.896254381075416e-06, + "loss": 0.8701, + "step": 617 + }, + { + "epoch": 1.1884615384615385, + "grad_norm": 0.71484375, + "learning_rate": 3.880753451047124e-06, + "loss": 0.9148, + "step": 618 + }, + { + "epoch": 1.1903846153846154, + "grad_norm": 0.73828125, + "learning_rate": 3.865263837496072e-06, + "loss": 0.8937, + "step": 619 + }, + { + "epoch": 1.1923076923076923, + "grad_norm": 0.7265625, + "learning_rate": 3.849785697034634e-06, + "loss": 0.855, + "step": 620 + }, + { + "epoch": 1.1942307692307692, + "grad_norm": 0.68359375, + "learning_rate": 3.834319186159179e-06, + "loss": 0.8574, + "step": 621 + }, + { + "epoch": 1.1961538461538461, + "grad_norm": 0.7265625, + "learning_rate": 3.818864461248498e-06, + "loss": 0.9341, + "step": 622 + }, + { + "epoch": 1.198076923076923, + "grad_norm": 0.69921875, + "learning_rate": 3.803421678562213e-06, + "loss": 0.9029, + "step": 623 + }, + { + "epoch": 1.2, + "grad_norm": 0.7265625, + "learning_rate": 3.7879909942391963e-06, + "loss": 0.9188, + "step": 624 + }, + { + "epoch": 1.2019230769230769, + "grad_norm": 0.7109375, + "learning_rate": 3.7725725642960047e-06, + "loss": 0.8459, + "step": 625 + }, + { + "epoch": 1.2038461538461538, + "grad_norm": 0.6953125, + "learning_rate": 3.7571665446252886e-06, + "loss": 0.8931, + "step": 626 + }, + { + "epoch": 1.2057692307692307, + "grad_norm": 0.69921875, + "learning_rate": 3.7417730909942184e-06, + "loss": 0.8719, + "step": 627 + }, + { + "epoch": 1.2076923076923076, + "grad_norm": 0.734375, + "learning_rate": 3.726392359042917e-06, + "loss": 0.922, + "step": 628 + }, + { + "epoch": 1.2096153846153845, + "grad_norm": 0.77734375, + "learning_rate": 3.7110245042828786e-06, + "loss": 0.8538, + "step": 629 + }, + { + "epoch": 1.2115384615384615, + "grad_norm": 0.75, + "learning_rate": 3.695669682095397e-06, + "loss": 0.9149, + "step": 630 + }, + { + "epoch": 1.2134615384615384, + "grad_norm": 0.67578125, + "learning_rate": 3.6803280477299975e-06, + "loss": 0.8769, + "step": 631 + }, + { + "epoch": 1.2153846153846155, + "grad_norm": 0.69140625, + "learning_rate": 3.664999756302869e-06, + "loss": 0.8532, + "step": 632 + }, + { + "epoch": 1.2173076923076924, + "grad_norm": 0.76953125, + "learning_rate": 3.6496849627952875e-06, + "loss": 0.9334, + "step": 633 + }, + { + "epoch": 1.2192307692307693, + "grad_norm": 0.73046875, + "learning_rate": 3.634383822052057e-06, + "loss": 0.9112, + "step": 634 + }, + { + "epoch": 1.2211538461538463, + "grad_norm": 0.69921875, + "learning_rate": 3.6190964887799418e-06, + "loss": 0.8462, + "step": 635 + }, + { + "epoch": 1.2230769230769232, + "grad_norm": 0.76953125, + "learning_rate": 3.6038231175461004e-06, + "loss": 0.915, + "step": 636 + }, + { + "epoch": 1.225, + "grad_norm": 0.7109375, + "learning_rate": 3.5885638627765228e-06, + "loss": 0.9869, + "step": 637 + }, + { + "epoch": 1.226923076923077, + "grad_norm": 0.6875, + "learning_rate": 3.573318878754475e-06, + "loss": 0.8105, + "step": 638 + }, + { + "epoch": 1.228846153846154, + "grad_norm": 0.7421875, + "learning_rate": 3.5580883196189265e-06, + "loss": 0.929, + "step": 639 + }, + { + "epoch": 1.2307692307692308, + "grad_norm": 0.70703125, + "learning_rate": 3.5428723393630067e-06, + "loss": 0.9044, + "step": 640 + }, + { + "epoch": 1.2326923076923078, + "grad_norm": 0.71875, + "learning_rate": 3.52767109183244e-06, + "loss": 0.8723, + "step": 641 + }, + { + "epoch": 1.2346153846153847, + "grad_norm": 0.921875, + "learning_rate": 3.5124847307239863e-06, + "loss": 0.9235, + "step": 642 + }, + { + "epoch": 1.2365384615384616, + "grad_norm": 0.73046875, + "learning_rate": 3.4973134095838943e-06, + "loss": 0.9271, + "step": 643 + }, + { + "epoch": 1.2384615384615385, + "grad_norm": 0.68359375, + "learning_rate": 3.4821572818063544e-06, + "loss": 0.8456, + "step": 644 + }, + { + "epoch": 1.2403846153846154, + "grad_norm": 0.69921875, + "learning_rate": 3.4670165006319236e-06, + "loss": 0.9204, + "step": 645 + }, + { + "epoch": 1.2423076923076923, + "grad_norm": 0.73828125, + "learning_rate": 3.4518912191460073e-06, + "loss": 0.908, + "step": 646 + }, + { + "epoch": 1.2442307692307693, + "grad_norm": 0.6953125, + "learning_rate": 3.4367815902772917e-06, + "loss": 0.8835, + "step": 647 + }, + { + "epoch": 1.2461538461538462, + "grad_norm": 0.734375, + "learning_rate": 3.4216877667961975e-06, + "loss": 0.8741, + "step": 648 + }, + { + "epoch": 1.248076923076923, + "grad_norm": 0.6953125, + "learning_rate": 3.406609901313349e-06, + "loss": 0.9994, + "step": 649 + }, + { + "epoch": 1.25, + "grad_norm": 0.703125, + "learning_rate": 3.3915481462780174e-06, + "loss": 0.8865, + "step": 650 + }, + { + "epoch": 1.25, + "eval_loss": 0.9308112859725952, + "eval_runtime": 34.3336, + "eval_samples_per_second": 68.242, + "eval_steps_per_second": 17.068, + "step": 650 + }, + { + "epoch": 1.251923076923077, + "grad_norm": 0.671875, + "learning_rate": 3.3765026539765832e-06, + "loss": 0.8766, + "step": 651 + }, + { + "epoch": 1.2538461538461538, + "grad_norm": 0.7109375, + "learning_rate": 3.3614735765310013e-06, + "loss": 0.8693, + "step": 652 + }, + { + "epoch": 1.2557692307692307, + "grad_norm": 0.7109375, + "learning_rate": 3.3464610658972584e-06, + "loss": 0.858, + "step": 653 + }, + { + "epoch": 1.2576923076923077, + "grad_norm": 0.72265625, + "learning_rate": 3.331465273863834e-06, + "loss": 0.8857, + "step": 654 + }, + { + "epoch": 1.2596153846153846, + "grad_norm": 0.73828125, + "learning_rate": 3.3164863520501744e-06, + "loss": 0.8839, + "step": 655 + }, + { + "epoch": 1.2615384615384615, + "grad_norm": 0.67578125, + "learning_rate": 3.3015244519051525e-06, + "loss": 0.8891, + "step": 656 + }, + { + "epoch": 1.2634615384615384, + "grad_norm": 0.7265625, + "learning_rate": 3.2865797247055354e-06, + "loss": 0.8891, + "step": 657 + }, + { + "epoch": 1.2653846153846153, + "grad_norm": 0.71875, + "learning_rate": 3.2716523215544602e-06, + "loss": 0.8977, + "step": 658 + }, + { + "epoch": 1.2673076923076922, + "grad_norm": 0.71484375, + "learning_rate": 3.256742393379909e-06, + "loss": 0.9019, + "step": 659 + }, + { + "epoch": 1.2692307692307692, + "grad_norm": 0.71875, + "learning_rate": 3.2418500909331684e-06, + "loss": 0.8878, + "step": 660 + }, + { + "epoch": 1.271153846153846, + "grad_norm": 0.71484375, + "learning_rate": 3.226975564787322e-06, + "loss": 0.8501, + "step": 661 + }, + { + "epoch": 1.273076923076923, + "grad_norm": 0.7265625, + "learning_rate": 3.21211896533572e-06, + "loss": 0.8878, + "step": 662 + }, + { + "epoch": 1.275, + "grad_norm": 0.7421875, + "learning_rate": 3.197280442790455e-06, + "loss": 0.9186, + "step": 663 + }, + { + "epoch": 1.2769230769230768, + "grad_norm": 0.6953125, + "learning_rate": 3.1824601471808504e-06, + "loss": 0.8288, + "step": 664 + }, + { + "epoch": 1.2788461538461537, + "grad_norm": 0.71484375, + "learning_rate": 3.1676582283519454e-06, + "loss": 0.9064, + "step": 665 + }, + { + "epoch": 1.2807692307692307, + "grad_norm": 0.69921875, + "learning_rate": 3.1528748359629657e-06, + "loss": 0.8728, + "step": 666 + }, + { + "epoch": 1.2826923076923076, + "grad_norm": 0.7109375, + "learning_rate": 3.1381101194858264e-06, + "loss": 0.8885, + "step": 667 + }, + { + "epoch": 1.2846153846153845, + "grad_norm": 0.69921875, + "learning_rate": 3.1233642282036147e-06, + "loss": 0.9146, + "step": 668 + }, + { + "epoch": 1.2865384615384614, + "grad_norm": 0.74609375, + "learning_rate": 3.1086373112090762e-06, + "loss": 0.897, + "step": 669 + }, + { + "epoch": 1.2884615384615383, + "grad_norm": 0.73046875, + "learning_rate": 3.0939295174031127e-06, + "loss": 0.8649, + "step": 670 + }, + { + "epoch": 1.2903846153846155, + "grad_norm": 0.69140625, + "learning_rate": 3.079240995493279e-06, + "loss": 0.8316, + "step": 671 + }, + { + "epoch": 1.2923076923076924, + "grad_norm": 0.71484375, + "learning_rate": 3.0645718939922668e-06, + "loss": 0.8968, + "step": 672 + }, + { + "epoch": 1.2942307692307693, + "grad_norm": 0.703125, + "learning_rate": 3.049922361216422e-06, + "loss": 0.9268, + "step": 673 + }, + { + "epoch": 1.2961538461538462, + "grad_norm": 0.71875, + "learning_rate": 3.03529254528423e-06, + "loss": 0.8527, + "step": 674 + }, + { + "epoch": 1.2980769230769231, + "grad_norm": 0.75390625, + "learning_rate": 3.0206825941148203e-06, + "loss": 0.8926, + "step": 675 + }, + { + "epoch": 1.3, + "grad_norm": 0.77734375, + "learning_rate": 3.006092655426481e-06, + "loss": 0.8785, + "step": 676 + }, + { + "epoch": 1.301923076923077, + "grad_norm": 0.73046875, + "learning_rate": 2.991522876735154e-06, + "loss": 0.9372, + "step": 677 + }, + { + "epoch": 1.3038461538461539, + "grad_norm": 0.74609375, + "learning_rate": 2.9769734053529443e-06, + "loss": 0.876, + "step": 678 + }, + { + "epoch": 1.3057692307692308, + "grad_norm": 0.77734375, + "learning_rate": 2.9624443883866403e-06, + "loss": 0.9531, + "step": 679 + }, + { + "epoch": 1.3076923076923077, + "grad_norm": 0.71875, + "learning_rate": 2.947935972736217e-06, + "loss": 0.881, + "step": 680 + }, + { + "epoch": 1.3096153846153846, + "grad_norm": 0.75, + "learning_rate": 2.9334483050933506e-06, + "loss": 0.9362, + "step": 681 + }, + { + "epoch": 1.3115384615384615, + "grad_norm": 0.72265625, + "learning_rate": 2.9189815319399422e-06, + "loss": 0.9015, + "step": 682 + }, + { + "epoch": 1.3134615384615385, + "grad_norm": 0.7109375, + "learning_rate": 2.904535799546636e-06, + "loss": 0.9278, + "step": 683 + }, + { + "epoch": 1.3153846153846154, + "grad_norm": 0.73046875, + "learning_rate": 2.890111253971327e-06, + "loss": 0.8761, + "step": 684 + }, + { + "epoch": 1.3173076923076923, + "grad_norm": 0.69921875, + "learning_rate": 2.8757080410577042e-06, + "loss": 0.8785, + "step": 685 + }, + { + "epoch": 1.3192307692307692, + "grad_norm": 0.70703125, + "learning_rate": 2.8613263064337617e-06, + "loss": 0.8794, + "step": 686 + }, + { + "epoch": 1.3211538461538461, + "grad_norm": 0.7109375, + "learning_rate": 2.846966195510332e-06, + "loss": 0.891, + "step": 687 + }, + { + "epoch": 1.323076923076923, + "grad_norm": 0.72265625, + "learning_rate": 2.8326278534796154e-06, + "loss": 0.903, + "step": 688 + }, + { + "epoch": 1.325, + "grad_norm": 0.71484375, + "learning_rate": 2.81831142531371e-06, + "loss": 0.9126, + "step": 689 + }, + { + "epoch": 1.3269230769230769, + "grad_norm": 0.69921875, + "learning_rate": 2.804017055763149e-06, + "loss": 0.8889, + "step": 690 + }, + { + "epoch": 1.3288461538461538, + "grad_norm": 0.703125, + "learning_rate": 2.7897448893554335e-06, + "loss": 0.8606, + "step": 691 + }, + { + "epoch": 1.3307692307692307, + "grad_norm": 0.72265625, + "learning_rate": 2.7754950703935735e-06, + "loss": 0.8373, + "step": 692 + }, + { + "epoch": 1.3326923076923076, + "grad_norm": 0.73046875, + "learning_rate": 2.761267742954629e-06, + "loss": 0.9246, + "step": 693 + }, + { + "epoch": 1.3346153846153845, + "grad_norm": 0.72265625, + "learning_rate": 2.7470630508882525e-06, + "loss": 0.9381, + "step": 694 + }, + { + "epoch": 1.3365384615384617, + "grad_norm": 0.73828125, + "learning_rate": 2.7328811378152355e-06, + "loss": 0.8936, + "step": 695 + }, + { + "epoch": 1.3384615384615386, + "grad_norm": 0.7734375, + "learning_rate": 2.718722147126054e-06, + "loss": 0.9314, + "step": 696 + }, + { + "epoch": 1.3403846153846155, + "grad_norm": 0.69921875, + "learning_rate": 2.704586221979422e-06, + "loss": 0.8957, + "step": 697 + }, + { + "epoch": 1.3423076923076924, + "grad_norm": 0.70703125, + "learning_rate": 2.6904735053008405e-06, + "loss": 0.8703, + "step": 698 + }, + { + "epoch": 1.3442307692307693, + "grad_norm": 0.7421875, + "learning_rate": 2.6763841397811576e-06, + "loss": 0.9244, + "step": 699 + }, + { + "epoch": 1.3461538461538463, + "grad_norm": 0.71875, + "learning_rate": 2.662318267875119e-06, + "loss": 0.8768, + "step": 700 + }, + { + "epoch": 1.3480769230769232, + "grad_norm": 0.71875, + "learning_rate": 2.6482760317999338e-06, + "loss": 0.9445, + "step": 701 + }, + { + "epoch": 1.35, + "grad_norm": 0.74609375, + "learning_rate": 2.634257573533833e-06, + "loss": 0.9024, + "step": 702 + }, + { + "epoch": 1.351923076923077, + "grad_norm": 0.73046875, + "learning_rate": 2.6202630348146323e-06, + "loss": 0.8766, + "step": 703 + }, + { + "epoch": 1.353846153846154, + "grad_norm": 0.70703125, + "learning_rate": 2.606292557138307e-06, + "loss": 0.889, + "step": 704 + }, + { + "epoch": 1.3557692307692308, + "grad_norm": 0.72265625, + "learning_rate": 2.592346281757552e-06, + "loss": 0.9081, + "step": 705 + }, + { + "epoch": 1.3576923076923078, + "grad_norm": 0.7109375, + "learning_rate": 2.5784243496803596e-06, + "loss": 0.8632, + "step": 706 + }, + { + "epoch": 1.3596153846153847, + "grad_norm": 0.6953125, + "learning_rate": 2.5645269016685905e-06, + "loss": 0.8161, + "step": 707 + }, + { + "epoch": 1.3615384615384616, + "grad_norm": 0.71484375, + "learning_rate": 2.550654078236552e-06, + "loss": 0.8689, + "step": 708 + }, + { + "epoch": 1.3634615384615385, + "grad_norm": 0.7109375, + "learning_rate": 2.5368060196495785e-06, + "loss": 0.9514, + "step": 709 + }, + { + "epoch": 1.3653846153846154, + "grad_norm": 0.703125, + "learning_rate": 2.5229828659226114e-06, + "loss": 0.8675, + "step": 710 + }, + { + "epoch": 1.3673076923076923, + "grad_norm": 0.72265625, + "learning_rate": 2.5091847568187834e-06, + "loss": 0.9018, + "step": 711 + }, + { + "epoch": 1.3692307692307693, + "grad_norm": 0.73828125, + "learning_rate": 2.4954118318480063e-06, + "loss": 0.9049, + "step": 712 + }, + { + "epoch": 1.3711538461538462, + "grad_norm": 0.703125, + "learning_rate": 2.4816642302655634e-06, + "loss": 0.8864, + "step": 713 + }, + { + "epoch": 1.373076923076923, + "grad_norm": 0.74609375, + "learning_rate": 2.4679420910706887e-06, + "loss": 0.9208, + "step": 714 + }, + { + "epoch": 1.375, + "grad_norm": 0.72265625, + "learning_rate": 2.454245553005184e-06, + "loss": 0.9696, + "step": 715 + }, + { + "epoch": 1.375, + "eval_loss": 0.9303730130195618, + "eval_runtime": 34.4007, + "eval_samples_per_second": 68.109, + "eval_steps_per_second": 17.035, + "step": 715 + }, + { + "epoch": 1.376923076923077, + "grad_norm": 0.703125, + "learning_rate": 2.4405747545519966e-06, + "loss": 0.8986, + "step": 716 + }, + { + "epoch": 1.3788461538461538, + "grad_norm": 0.73046875, + "learning_rate": 2.4269298339338205e-06, + "loss": 0.9069, + "step": 717 + }, + { + "epoch": 1.3807692307692307, + "grad_norm": 0.70703125, + "learning_rate": 2.4133109291117156e-06, + "loss": 0.8608, + "step": 718 + }, + { + "epoch": 1.3826923076923077, + "grad_norm": 0.73046875, + "learning_rate": 2.3997181777836955e-06, + "loss": 0.9137, + "step": 719 + }, + { + "epoch": 1.3846153846153846, + "grad_norm": 0.73046875, + "learning_rate": 2.3861517173833347e-06, + "loss": 0.8828, + "step": 720 + }, + { + "epoch": 1.3865384615384615, + "grad_norm": 0.76953125, + "learning_rate": 2.3726116850783987e-06, + "loss": 0.9207, + "step": 721 + }, + { + "epoch": 1.3884615384615384, + "grad_norm": 0.69140625, + "learning_rate": 2.3590982177694348e-06, + "loss": 0.8221, + "step": 722 + }, + { + "epoch": 1.3903846153846153, + "grad_norm": 0.7265625, + "learning_rate": 2.3456114520883956e-06, + "loss": 0.8922, + "step": 723 + }, + { + "epoch": 1.3923076923076922, + "grad_norm": 0.72265625, + "learning_rate": 2.3321515243972663e-06, + "loss": 0.8462, + "step": 724 + }, + { + "epoch": 1.3942307692307692, + "grad_norm": 0.69921875, + "learning_rate": 2.318718570786675e-06, + "loss": 0.8804, + "step": 725 + }, + { + "epoch": 1.396153846153846, + "grad_norm": 0.69921875, + "learning_rate": 2.3053127270745163e-06, + "loss": 0.8969, + "step": 726 + }, + { + "epoch": 1.398076923076923, + "grad_norm": 0.7265625, + "learning_rate": 2.2919341288045853e-06, + "loss": 0.9326, + "step": 727 + }, + { + "epoch": 1.4, + "grad_norm": 0.69921875, + "learning_rate": 2.27858291124521e-06, + "loss": 0.9407, + "step": 728 + }, + { + "epoch": 1.4019230769230768, + "grad_norm": 0.6953125, + "learning_rate": 2.265259209387867e-06, + "loss": 0.8988, + "step": 729 + }, + { + "epoch": 1.4038461538461537, + "grad_norm": 0.7109375, + "learning_rate": 2.25196315794583e-06, + "loss": 0.8821, + "step": 730 + }, + { + "epoch": 1.4057692307692307, + "grad_norm": 0.69921875, + "learning_rate": 2.238694891352814e-06, + "loss": 0.8893, + "step": 731 + }, + { + "epoch": 1.4076923076923076, + "grad_norm": 0.6953125, + "learning_rate": 2.2254545437615932e-06, + "loss": 0.9305, + "step": 732 + }, + { + "epoch": 1.4096153846153845, + "grad_norm": 0.71875, + "learning_rate": 2.2122422490426676e-06, + "loss": 0.9039, + "step": 733 + }, + { + "epoch": 1.4115384615384614, + "grad_norm": 0.7109375, + "learning_rate": 2.199058140782897e-06, + "loss": 0.8692, + "step": 734 + }, + { + "epoch": 1.4134615384615383, + "grad_norm": 0.72265625, + "learning_rate": 2.1859023522841543e-06, + "loss": 0.9083, + "step": 735 + }, + { + "epoch": 1.4153846153846155, + "grad_norm": 0.6875, + "learning_rate": 2.172775016561977e-06, + "loss": 0.8858, + "step": 736 + }, + { + "epoch": 1.4173076923076924, + "grad_norm": 0.73046875, + "learning_rate": 2.159676266344222e-06, + "loss": 0.8998, + "step": 737 + }, + { + "epoch": 1.4192307692307693, + "grad_norm": 0.7265625, + "learning_rate": 2.1466062340697234e-06, + "loss": 0.8965, + "step": 738 + }, + { + "epoch": 1.4211538461538462, + "grad_norm": 0.6953125, + "learning_rate": 2.1335650518869555e-06, + "loss": 0.8807, + "step": 739 + }, + { + "epoch": 1.4230769230769231, + "grad_norm": 0.7109375, + "learning_rate": 2.120552851652694e-06, + "loss": 0.9295, + "step": 740 + }, + { + "epoch": 1.425, + "grad_norm": 0.70703125, + "learning_rate": 2.1075697649306838e-06, + "loss": 0.8947, + "step": 741 + }, + { + "epoch": 1.426923076923077, + "grad_norm": 0.72265625, + "learning_rate": 2.094615922990309e-06, + "loss": 0.9056, + "step": 742 + }, + { + "epoch": 1.4288461538461539, + "grad_norm": 0.76171875, + "learning_rate": 2.0816914568052664e-06, + "loss": 0.9257, + "step": 743 + }, + { + "epoch": 1.4307692307692308, + "grad_norm": 0.70703125, + "learning_rate": 2.0687964970522394e-06, + "loss": 0.9331, + "step": 744 + }, + { + "epoch": 1.4326923076923077, + "grad_norm": 0.703125, + "learning_rate": 2.055931174109579e-06, + "loss": 0.9563, + "step": 745 + }, + { + "epoch": 1.4346153846153846, + "grad_norm": 0.73046875, + "learning_rate": 2.043095618055982e-06, + "loss": 0.8613, + "step": 746 + }, + { + "epoch": 1.4365384615384615, + "grad_norm": 0.7109375, + "learning_rate": 2.030289958669181e-06, + "loss": 0.8563, + "step": 747 + }, + { + "epoch": 1.4384615384615385, + "grad_norm": 0.6953125, + "learning_rate": 2.0175143254246277e-06, + "loss": 0.8817, + "step": 748 + }, + { + "epoch": 1.4403846153846154, + "grad_norm": 0.72265625, + "learning_rate": 2.004768847494186e-06, + "loss": 0.8463, + "step": 749 + }, + { + "epoch": 1.4423076923076923, + "grad_norm": 0.71484375, + "learning_rate": 1.992053653744826e-06, + "loss": 0.9228, + "step": 750 + }, + { + "epoch": 1.4442307692307692, + "grad_norm": 0.71484375, + "learning_rate": 1.979368872737319e-06, + "loss": 0.8673, + "step": 751 + }, + { + "epoch": 1.4461538461538461, + "grad_norm": 0.703125, + "learning_rate": 1.966714632724941e-06, + "loss": 0.8408, + "step": 752 + }, + { + "epoch": 1.448076923076923, + "grad_norm": 0.72265625, + "learning_rate": 1.954091061652172e-06, + "loss": 0.9419, + "step": 753 + }, + { + "epoch": 1.45, + "grad_norm": 0.71484375, + "learning_rate": 1.941498287153409e-06, + "loss": 0.9032, + "step": 754 + }, + { + "epoch": 1.4519230769230769, + "grad_norm": 0.7421875, + "learning_rate": 1.928936436551661e-06, + "loss": 0.9398, + "step": 755 + }, + { + "epoch": 1.4538461538461538, + "grad_norm": 0.70703125, + "learning_rate": 1.9164056368572847e-06, + "loss": 0.8877, + "step": 756 + }, + { + "epoch": 1.4557692307692307, + "grad_norm": 0.7109375, + "learning_rate": 1.903906014766681e-06, + "loss": 0.9077, + "step": 757 + }, + { + "epoch": 1.4576923076923076, + "grad_norm": 0.71484375, + "learning_rate": 1.891437696661015e-06, + "loss": 0.9012, + "step": 758 + }, + { + "epoch": 1.4596153846153845, + "grad_norm": 0.703125, + "learning_rate": 1.8790008086049534e-06, + "loss": 0.8972, + "step": 759 + }, + { + "epoch": 1.4615384615384617, + "grad_norm": 0.71484375, + "learning_rate": 1.8665954763453764e-06, + "loss": 0.885, + "step": 760 + }, + { + "epoch": 1.4634615384615386, + "grad_norm": 0.75390625, + "learning_rate": 1.854221825310103e-06, + "loss": 0.9224, + "step": 761 + }, + { + "epoch": 1.4653846153846155, + "grad_norm": 0.6796875, + "learning_rate": 1.8418799806066413e-06, + "loss": 0.8654, + "step": 762 + }, + { + "epoch": 1.4673076923076924, + "grad_norm": 0.7109375, + "learning_rate": 1.829570067020906e-06, + "loss": 0.9112, + "step": 763 + }, + { + "epoch": 1.4692307692307693, + "grad_norm": 0.7109375, + "learning_rate": 1.8172922090159578e-06, + "loss": 0.8584, + "step": 764 + }, + { + "epoch": 1.4711538461538463, + "grad_norm": 0.70703125, + "learning_rate": 1.8050465307307602e-06, + "loss": 0.8674, + "step": 765 + }, + { + "epoch": 1.4730769230769232, + "grad_norm": 0.73046875, + "learning_rate": 1.7928331559789087e-06, + "loss": 0.8901, + "step": 766 + }, + { + "epoch": 1.475, + "grad_norm": 0.69921875, + "learning_rate": 1.7806522082473809e-06, + "loss": 0.9509, + "step": 767 + }, + { + "epoch": 1.476923076923077, + "grad_norm": 0.734375, + "learning_rate": 1.7685038106952952e-06, + "loss": 0.8837, + "step": 768 + }, + { + "epoch": 1.478846153846154, + "grad_norm": 0.69921875, + "learning_rate": 1.7563880861526656e-06, + "loss": 0.9192, + "step": 769 + }, + { + "epoch": 1.4807692307692308, + "grad_norm": 0.71484375, + "learning_rate": 1.7443051571191472e-06, + "loss": 0.8741, + "step": 770 + }, + { + "epoch": 1.4826923076923078, + "grad_norm": 0.69140625, + "learning_rate": 1.73225514576281e-06, + "loss": 0.9072, + "step": 771 + }, + { + "epoch": 1.4846153846153847, + "grad_norm": 0.70703125, + "learning_rate": 1.7202381739189055e-06, + "loss": 0.851, + "step": 772 + }, + { + "epoch": 1.4865384615384616, + "grad_norm": 0.75, + "learning_rate": 1.70825436308862e-06, + "loss": 0.9552, + "step": 773 + }, + { + "epoch": 1.4884615384615385, + "grad_norm": 0.73046875, + "learning_rate": 1.696303834437859e-06, + "loss": 0.8915, + "step": 774 + }, + { + "epoch": 1.4903846153846154, + "grad_norm": 0.734375, + "learning_rate": 1.6843867087960252e-06, + "loss": 0.9092, + "step": 775 + }, + { + "epoch": 1.4923076923076923, + "grad_norm": 0.71484375, + "learning_rate": 1.6725031066547786e-06, + "loss": 0.8589, + "step": 776 + }, + { + "epoch": 1.4942307692307693, + "grad_norm": 0.7265625, + "learning_rate": 1.6606531481668364e-06, + "loss": 0.8642, + "step": 777 + }, + { + "epoch": 1.4961538461538462, + "grad_norm": 0.73046875, + "learning_rate": 1.648836953144755e-06, + "loss": 0.9987, + "step": 778 + }, + { + "epoch": 1.498076923076923, + "grad_norm": 0.72265625, + "learning_rate": 1.6370546410597066e-06, + "loss": 0.8802, + "step": 779 + }, + { + "epoch": 1.5, + "grad_norm": 0.69140625, + "learning_rate": 1.6253063310402833e-06, + "loss": 0.9119, + "step": 780 + }, + { + "epoch": 1.5, + "eval_loss": 0.9302220344543457, + "eval_runtime": 34.2902, + "eval_samples_per_second": 68.329, + "eval_steps_per_second": 17.089, + "step": 780 + }, + { + "epoch": 1.501923076923077, + "grad_norm": 0.72265625, + "learning_rate": 1.6135921418712959e-06, + "loss": 0.8816, + "step": 781 + }, + { + "epoch": 1.5038461538461538, + "grad_norm": 0.73046875, + "learning_rate": 1.601912191992554e-06, + "loss": 0.9097, + "step": 782 + }, + { + "epoch": 1.5057692307692307, + "grad_norm": 1.1484375, + "learning_rate": 1.5902665994976896e-06, + "loss": 0.9058, + "step": 783 + }, + { + "epoch": 1.5076923076923077, + "grad_norm": 0.7109375, + "learning_rate": 1.5786554821329515e-06, + "loss": 0.8729, + "step": 784 + }, + { + "epoch": 1.5096153846153846, + "grad_norm": 0.6953125, + "learning_rate": 1.567078957296016e-06, + "loss": 0.8892, + "step": 785 + }, + { + "epoch": 1.5115384615384615, + "grad_norm": 0.71484375, + "learning_rate": 1.5555371420348031e-06, + "loss": 0.8654, + "step": 786 + }, + { + "epoch": 1.5134615384615384, + "grad_norm": 0.75, + "learning_rate": 1.544030153046291e-06, + "loss": 0.8445, + "step": 787 + }, + { + "epoch": 1.5153846153846153, + "grad_norm": 0.69140625, + "learning_rate": 1.5325581066753354e-06, + "loss": 0.8402, + "step": 788 + }, + { + "epoch": 1.5173076923076922, + "grad_norm": 0.69140625, + "learning_rate": 1.5211211189134955e-06, + "loss": 0.8546, + "step": 789 + }, + { + "epoch": 1.5192307692307692, + "grad_norm": 0.72265625, + "learning_rate": 1.5097193053978587e-06, + "loss": 0.9212, + "step": 790 + }, + { + "epoch": 1.521153846153846, + "grad_norm": 0.66796875, + "learning_rate": 1.4983527814098736e-06, + "loss": 0.8601, + "step": 791 + }, + { + "epoch": 1.523076923076923, + "grad_norm": 0.73046875, + "learning_rate": 1.4870216618741833e-06, + "loss": 0.8572, + "step": 792 + }, + { + "epoch": 1.525, + "grad_norm": 0.90234375, + "learning_rate": 1.475726061357463e-06, + "loss": 0.9179, + "step": 793 + }, + { + "epoch": 1.5269230769230768, + "grad_norm": 0.73828125, + "learning_rate": 1.4644660940672628e-06, + "loss": 0.9296, + "step": 794 + }, + { + "epoch": 1.5288461538461537, + "grad_norm": 0.6953125, + "learning_rate": 1.4532418738508525e-06, + "loss": 0.8652, + "step": 795 + }, + { + "epoch": 1.5307692307692307, + "grad_norm": 1.1640625, + "learning_rate": 1.44205351419407e-06, + "loss": 0.8567, + "step": 796 + }, + { + "epoch": 1.5326923076923076, + "grad_norm": 0.7109375, + "learning_rate": 1.430901128220174e-06, + "loss": 0.9234, + "step": 797 + }, + { + "epoch": 1.5346153846153845, + "grad_norm": 0.6953125, + "learning_rate": 1.4197848286887017e-06, + "loss": 0.8839, + "step": 798 + }, + { + "epoch": 1.5365384615384614, + "grad_norm": 0.7109375, + "learning_rate": 1.4087047279943267e-06, + "loss": 0.9021, + "step": 799 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 0.71875, + "learning_rate": 1.397660938165723e-06, + "loss": 0.9017, + "step": 800 + }, + { + "epoch": 1.5403846153846152, + "grad_norm": 0.703125, + "learning_rate": 1.3866535708644335e-06, + "loss": 0.9315, + "step": 801 + }, + { + "epoch": 1.5423076923076922, + "grad_norm": 1.046875, + "learning_rate": 1.3756827373837396e-06, + "loss": 0.8562, + "step": 802 + }, + { + "epoch": 1.544230769230769, + "grad_norm": 0.71484375, + "learning_rate": 1.3647485486475376e-06, + "loss": 0.9092, + "step": 803 + }, + { + "epoch": 1.546153846153846, + "grad_norm": 0.6953125, + "learning_rate": 1.353851115209215e-06, + "loss": 0.9142, + "step": 804 + }, + { + "epoch": 1.5480769230769231, + "grad_norm": 0.7109375, + "learning_rate": 1.3429905472505344e-06, + "loss": 0.8867, + "step": 805 + }, + { + "epoch": 1.55, + "grad_norm": 0.73046875, + "learning_rate": 1.3321669545805188e-06, + "loss": 0.9228, + "step": 806 + }, + { + "epoch": 1.551923076923077, + "grad_norm": 0.6875, + "learning_rate": 1.321380446634342e-06, + "loss": 0.8635, + "step": 807 + }, + { + "epoch": 1.5538461538461539, + "grad_norm": 0.6796875, + "learning_rate": 1.310631132472222e-06, + "loss": 0.8625, + "step": 808 + }, + { + "epoch": 1.5557692307692308, + "grad_norm": 0.6875, + "learning_rate": 1.2999191207783129e-06, + "loss": 0.8367, + "step": 809 + }, + { + "epoch": 1.5576923076923077, + "grad_norm": 0.69921875, + "learning_rate": 1.2892445198596198e-06, + "loss": 0.9199, + "step": 810 + }, + { + "epoch": 1.5596153846153846, + "grad_norm": 0.6796875, + "learning_rate": 1.27860743764489e-06, + "loss": 0.8648, + "step": 811 + }, + { + "epoch": 1.5615384615384615, + "grad_norm": 0.71875, + "learning_rate": 1.2680079816835228e-06, + "loss": 0.9395, + "step": 812 + }, + { + "epoch": 1.5634615384615385, + "grad_norm": 0.6796875, + "learning_rate": 1.257446259144494e-06, + "loss": 0.8859, + "step": 813 + }, + { + "epoch": 1.5653846153846154, + "grad_norm": 0.6875, + "learning_rate": 1.2469223768152622e-06, + "loss": 0.884, + "step": 814 + }, + { + "epoch": 1.5673076923076923, + "grad_norm": 0.72265625, + "learning_rate": 1.2364364411006841e-06, + "loss": 0.8908, + "step": 815 + }, + { + "epoch": 1.5692307692307692, + "grad_norm": 0.6875, + "learning_rate": 1.2259885580219555e-06, + "loss": 0.871, + "step": 816 + }, + { + "epoch": 1.5711538461538461, + "grad_norm": 0.6953125, + "learning_rate": 1.215578833215526e-06, + "loss": 0.9022, + "step": 817 + }, + { + "epoch": 1.573076923076923, + "grad_norm": 0.6875, + "learning_rate": 1.2052073719320296e-06, + "loss": 0.9199, + "step": 818 + }, + { + "epoch": 1.575, + "grad_norm": 0.71484375, + "learning_rate": 1.1948742790352342e-06, + "loss": 0.9211, + "step": 819 + }, + { + "epoch": 1.5769230769230769, + "grad_norm": 0.703125, + "learning_rate": 1.1845796590009684e-06, + "loss": 0.9067, + "step": 820 + }, + { + "epoch": 1.578846153846154, + "grad_norm": 0.703125, + "learning_rate": 1.1743236159160654e-06, + "loss": 0.9342, + "step": 821 + }, + { + "epoch": 1.580769230769231, + "grad_norm": 0.75390625, + "learning_rate": 1.1641062534773218e-06, + "loss": 0.915, + "step": 822 + }, + { + "epoch": 1.5826923076923078, + "grad_norm": 0.72265625, + "learning_rate": 1.15392767499044e-06, + "loss": 0.8886, + "step": 823 + }, + { + "epoch": 1.5846153846153848, + "grad_norm": 0.7109375, + "learning_rate": 1.1437879833689808e-06, + "loss": 0.9112, + "step": 824 + }, + { + "epoch": 1.5865384615384617, + "grad_norm": 0.734375, + "learning_rate": 1.133687281133331e-06, + "loss": 0.9056, + "step": 825 + }, + { + "epoch": 1.5884615384615386, + "grad_norm": 0.734375, + "learning_rate": 1.1236256704096693e-06, + "loss": 0.91, + "step": 826 + }, + { + "epoch": 1.5903846153846155, + "grad_norm": 0.73046875, + "learning_rate": 1.113603252928917e-06, + "loss": 0.8725, + "step": 827 + }, + { + "epoch": 1.5923076923076924, + "grad_norm": 0.72265625, + "learning_rate": 1.1036201300257266e-06, + "loss": 0.9173, + "step": 828 + }, + { + "epoch": 1.5942307692307693, + "grad_norm": 0.7109375, + "learning_rate": 1.0936764026374547e-06, + "loss": 0.8928, + "step": 829 + }, + { + "epoch": 1.5961538461538463, + "grad_norm": 0.69921875, + "learning_rate": 1.083772171303128e-06, + "loss": 0.8897, + "step": 830 + }, + { + "epoch": 1.5980769230769232, + "grad_norm": 0.72265625, + "learning_rate": 1.073907536162443e-06, + "loss": 0.9053, + "step": 831 + }, + { + "epoch": 1.6, + "grad_norm": 0.6953125, + "learning_rate": 1.0640825969547498e-06, + "loss": 0.8655, + "step": 832 + }, + { + "epoch": 1.601923076923077, + "grad_norm": 0.703125, + "learning_rate": 1.0542974530180327e-06, + "loss": 0.8698, + "step": 833 + }, + { + "epoch": 1.603846153846154, + "grad_norm": 0.6953125, + "learning_rate": 1.0445522032879184e-06, + "loss": 0.8369, + "step": 834 + }, + { + "epoch": 1.6057692307692308, + "grad_norm": 0.6953125, + "learning_rate": 1.0348469462966753e-06, + "loss": 0.8818, + "step": 835 + }, + { + "epoch": 1.6076923076923078, + "grad_norm": 0.703125, + "learning_rate": 1.0251817801722047e-06, + "loss": 0.837, + "step": 836 + }, + { + "epoch": 1.6096153846153847, + "grad_norm": 0.69921875, + "learning_rate": 1.0155568026370637e-06, + "loss": 0.8907, + "step": 837 + }, + { + "epoch": 1.6115384615384616, + "grad_norm": 0.73046875, + "learning_rate": 1.0059721110074678e-06, + "loss": 0.9129, + "step": 838 + }, + { + "epoch": 1.6134615384615385, + "grad_norm": 0.73828125, + "learning_rate": 9.964278021923107e-07, + "loss": 0.8743, + "step": 839 + }, + { + "epoch": 1.6153846153846154, + "grad_norm": 1.1015625, + "learning_rate": 9.869239726921843e-07, + "loss": 0.8883, + "step": 840 + }, + { + "epoch": 1.6173076923076923, + "grad_norm": 0.6953125, + "learning_rate": 9.774607185984004e-07, + "loss": 0.904, + "step": 841 + }, + { + "epoch": 1.6192307692307693, + "grad_norm": 0.70703125, + "learning_rate": 9.68038135592022e-07, + "loss": 0.9311, + "step": 842 + }, + { + "epoch": 1.6211538461538462, + "grad_norm": 0.69140625, + "learning_rate": 9.586563189428954e-07, + "loss": 0.8621, + "step": 843 + }, + { + "epoch": 1.623076923076923, + "grad_norm": 0.69921875, + "learning_rate": 9.493153635086855e-07, + "loss": 0.8783, + "step": 844 + }, + { + "epoch": 1.625, + "grad_norm": 0.69921875, + "learning_rate": 9.400153637339182e-07, + "loss": 0.8793, + "step": 845 + }, + { + "epoch": 1.625, + "eval_loss": 0.9300703406333923, + "eval_runtime": 34.3126, + "eval_samples_per_second": 68.284, + "eval_steps_per_second": 17.078, + "step": 845 + }, + { + "epoch": 1.626923076923077, + "grad_norm": 0.69140625, + "learning_rate": 9.307564136490255e-07, + "loss": 0.8341, + "step": 846 + }, + { + "epoch": 1.6288461538461538, + "grad_norm": 0.71484375, + "learning_rate": 9.215386068693927e-07, + "loss": 0.8605, + "step": 847 + }, + { + "epoch": 1.6307692307692307, + "grad_norm": 0.72265625, + "learning_rate": 9.123620365944147e-07, + "loss": 0.8513, + "step": 848 + }, + { + "epoch": 1.6326923076923077, + "grad_norm": 0.72265625, + "learning_rate": 9.032267956065516e-07, + "loss": 0.9004, + "step": 849 + }, + { + "epoch": 1.6346153846153846, + "grad_norm": 0.75, + "learning_rate": 8.941329762703921e-07, + "loss": 0.9814, + "step": 850 + }, + { + "epoch": 1.6365384615384615, + "grad_norm": 0.73828125, + "learning_rate": 8.850806705317183e-07, + "loss": 0.9074, + "step": 851 + }, + { + "epoch": 1.6384615384615384, + "grad_norm": 0.6953125, + "learning_rate": 8.76069969916577e-07, + "loss": 0.9301, + "step": 852 + }, + { + "epoch": 1.6403846153846153, + "grad_norm": 0.7109375, + "learning_rate": 8.671009655303531e-07, + "loss": 0.9227, + "step": 853 + }, + { + "epoch": 1.6423076923076922, + "grad_norm": 0.69921875, + "learning_rate": 8.581737480568514e-07, + "loss": 0.8816, + "step": 854 + }, + { + "epoch": 1.6442307692307692, + "grad_norm": 0.69140625, + "learning_rate": 8.492884077573749e-07, + "loss": 0.8869, + "step": 855 + }, + { + "epoch": 1.646153846153846, + "grad_norm": 0.72265625, + "learning_rate": 8.404450344698167e-07, + "loss": 0.9363, + "step": 856 + }, + { + "epoch": 1.648076923076923, + "grad_norm": 0.6875, + "learning_rate": 8.316437176077491e-07, + "loss": 0.8935, + "step": 857 + }, + { + "epoch": 1.65, + "grad_norm": 0.71875, + "learning_rate": 8.228845461595225e-07, + "loss": 0.8805, + "step": 858 + }, + { + "epoch": 1.6519230769230768, + "grad_norm": 0.765625, + "learning_rate": 8.141676086873574e-07, + "loss": 0.8784, + "step": 859 + }, + { + "epoch": 1.6538461538461537, + "grad_norm": 0.70703125, + "learning_rate": 8.054929933264626e-07, + "loss": 0.9377, + "step": 860 + }, + { + "epoch": 1.6557692307692307, + "grad_norm": 0.734375, + "learning_rate": 7.968607877841333e-07, + "loss": 0.8825, + "step": 861 + }, + { + "epoch": 1.6576923076923076, + "grad_norm": 0.70703125, + "learning_rate": 7.882710793388643e-07, + "loss": 0.915, + "step": 862 + }, + { + "epoch": 1.6596153846153845, + "grad_norm": 0.70703125, + "learning_rate": 7.79723954839477e-07, + "loss": 0.9108, + "step": 863 + }, + { + "epoch": 1.6615384615384614, + "grad_norm": 0.71875, + "learning_rate": 7.712195007042322e-07, + "loss": 0.8689, + "step": 864 + }, + { + "epoch": 1.6634615384615383, + "grad_norm": 0.7109375, + "learning_rate": 7.627578029199562e-07, + "loss": 0.8841, + "step": 865 + }, + { + "epoch": 1.6653846153846152, + "grad_norm": 0.71875, + "learning_rate": 7.543389470411772e-07, + "loss": 0.9736, + "step": 866 + }, + { + "epoch": 1.6673076923076922, + "grad_norm": 0.6953125, + "learning_rate": 7.459630181892608e-07, + "loss": 0.9249, + "step": 867 + }, + { + "epoch": 1.669230769230769, + "grad_norm": 0.7421875, + "learning_rate": 7.376301010515397e-07, + "loss": 0.9258, + "step": 868 + }, + { + "epoch": 1.671153846153846, + "grad_norm": 0.7265625, + "learning_rate": 7.293402798804667e-07, + "loss": 0.909, + "step": 869 + }, + { + "epoch": 1.6730769230769231, + "grad_norm": 0.69921875, + "learning_rate": 7.210936384927631e-07, + "loss": 0.888, + "step": 870 + }, + { + "epoch": 1.675, + "grad_norm": 0.72265625, + "learning_rate": 7.128902602685617e-07, + "loss": 0.9196, + "step": 871 + }, + { + "epoch": 1.676923076923077, + "grad_norm": 0.703125, + "learning_rate": 7.047302281505735e-07, + "loss": 0.943, + "step": 872 + }, + { + "epoch": 1.6788461538461539, + "grad_norm": 0.74609375, + "learning_rate": 6.966136246432492e-07, + "loss": 0.9123, + "step": 873 + }, + { + "epoch": 1.6807692307692308, + "grad_norm": 0.6796875, + "learning_rate": 6.885405318119342e-07, + "loss": 0.8793, + "step": 874 + }, + { + "epoch": 1.6826923076923077, + "grad_norm": 0.71484375, + "learning_rate": 6.805110312820501e-07, + "loss": 0.9478, + "step": 875 + }, + { + "epoch": 1.6846153846153846, + "grad_norm": 0.69921875, + "learning_rate": 6.725252042382691e-07, + "loss": 0.8718, + "step": 876 + }, + { + "epoch": 1.6865384615384615, + "grad_norm": 0.7265625, + "learning_rate": 6.645831314236817e-07, + "loss": 0.8476, + "step": 877 + }, + { + "epoch": 1.6884615384615385, + "grad_norm": 0.75390625, + "learning_rate": 6.566848931389935e-07, + "loss": 0.9334, + "step": 878 + }, + { + "epoch": 1.6903846153846154, + "grad_norm": 0.71875, + "learning_rate": 6.488305692417074e-07, + "loss": 0.8902, + "step": 879 + }, + { + "epoch": 1.6923076923076923, + "grad_norm": 0.6875, + "learning_rate": 6.410202391453157e-07, + "loss": 0.8728, + "step": 880 + }, + { + "epoch": 1.6942307692307692, + "grad_norm": 0.7265625, + "learning_rate": 6.332539818184985e-07, + "loss": 0.8788, + "step": 881 + }, + { + "epoch": 1.6961538461538461, + "grad_norm": 0.76171875, + "learning_rate": 6.255318757843249e-07, + "loss": 0.8917, + "step": 882 + }, + { + "epoch": 1.698076923076923, + "grad_norm": 0.72265625, + "learning_rate": 6.178539991194599e-07, + "loss": 0.8898, + "step": 883 + }, + { + "epoch": 1.7, + "grad_norm": 0.6875, + "learning_rate": 6.102204294533731e-07, + "loss": 0.894, + "step": 884 + }, + { + "epoch": 1.7019230769230769, + "grad_norm": 0.703125, + "learning_rate": 6.026312439675553e-07, + "loss": 0.882, + "step": 885 + }, + { + "epoch": 1.703846153846154, + "grad_norm": 0.7109375, + "learning_rate": 5.95086519394738e-07, + "loss": 0.917, + "step": 886 + }, + { + "epoch": 1.705769230769231, + "grad_norm": 0.73046875, + "learning_rate": 5.875863320181175e-07, + "loss": 0.8931, + "step": 887 + }, + { + "epoch": 1.7076923076923078, + "grad_norm": 0.703125, + "learning_rate": 5.801307576705833e-07, + "loss": 0.8623, + "step": 888 + }, + { + "epoch": 1.7096153846153848, + "grad_norm": 0.6953125, + "learning_rate": 5.727198717339511e-07, + "loss": 0.8815, + "step": 889 + }, + { + "epoch": 1.7115384615384617, + "grad_norm": 0.70703125, + "learning_rate": 5.653537491382011e-07, + "loss": 0.9418, + "step": 890 + }, + { + "epoch": 1.7134615384615386, + "grad_norm": 0.73828125, + "learning_rate": 5.58032464360721e-07, + "loss": 0.8381, + "step": 891 + }, + { + "epoch": 1.7153846153846155, + "grad_norm": 0.73046875, + "learning_rate": 5.507560914255516e-07, + "loss": 0.9239, + "step": 892 + }, + { + "epoch": 1.7173076923076924, + "grad_norm": 0.69921875, + "learning_rate": 5.435247039026398e-07, + "loss": 0.9262, + "step": 893 + }, + { + "epoch": 1.7192307692307693, + "grad_norm": 0.734375, + "learning_rate": 5.363383749070939e-07, + "loss": 0.844, + "step": 894 + }, + { + "epoch": 1.7211538461538463, + "grad_norm": 0.7265625, + "learning_rate": 5.291971770984428e-07, + "loss": 0.9041, + "step": 895 + }, + { + "epoch": 1.7230769230769232, + "grad_norm": 0.7109375, + "learning_rate": 5.221011826799055e-07, + "loss": 0.8342, + "step": 896 + }, + { + "epoch": 1.725, + "grad_norm": 0.6796875, + "learning_rate": 5.150504633976572e-07, + "loss": 0.8119, + "step": 897 + }, + { + "epoch": 1.726923076923077, + "grad_norm": 0.73828125, + "learning_rate": 5.080450905401057e-07, + "loss": 0.9174, + "step": 898 + }, + { + "epoch": 1.728846153846154, + "grad_norm": 0.72265625, + "learning_rate": 5.010851349371704e-07, + "loss": 0.8972, + "step": 899 + }, + { + "epoch": 1.7307692307692308, + "grad_norm": 0.71875, + "learning_rate": 4.941706669595647e-07, + "loss": 0.9052, + "step": 900 + }, + { + "epoch": 1.7326923076923078, + "grad_norm": 0.7109375, + "learning_rate": 4.873017565180871e-07, + "loss": 0.8823, + "step": 901 + }, + { + "epoch": 1.7346153846153847, + "grad_norm": 0.69140625, + "learning_rate": 4.804784730629131e-07, + "loss": 0.8687, + "step": 902 + }, + { + "epoch": 1.7365384615384616, + "grad_norm": 0.7265625, + "learning_rate": 4.7370088558289175e-07, + "loss": 0.8865, + "step": 903 + }, + { + "epoch": 1.7384615384615385, + "grad_norm": 0.69921875, + "learning_rate": 4.6696906260485007e-07, + "loss": 0.8775, + "step": 904 + }, + { + "epoch": 1.7403846153846154, + "grad_norm": 0.71875, + "learning_rate": 4.602830721928997e-07, + "loss": 0.8814, + "step": 905 + }, + { + "epoch": 1.7423076923076923, + "grad_norm": 0.73828125, + "learning_rate": 4.536429819477478e-07, + "loss": 0.9094, + "step": 906 + }, + { + "epoch": 1.7442307692307693, + "grad_norm": 0.72265625, + "learning_rate": 4.4704885900601236e-07, + "loss": 0.9433, + "step": 907 + }, + { + "epoch": 1.7461538461538462, + "grad_norm": 0.6796875, + "learning_rate": 4.405007700395497e-07, + "loss": 0.8625, + "step": 908 + }, + { + "epoch": 1.748076923076923, + "grad_norm": 0.72265625, + "learning_rate": 4.33998781254773e-07, + "loss": 0.8546, + "step": 909 + }, + { + "epoch": 1.75, + "grad_norm": 0.7265625, + "learning_rate": 4.2754295839198325e-07, + "loss": 0.9265, + "step": 910 + }, + { + "epoch": 1.75, + "eval_loss": 0.9300666451454163, + "eval_runtime": 34.3132, + "eval_samples_per_second": 68.283, + "eval_steps_per_second": 17.078, + "step": 910 + }, + { + "epoch": 1.751923076923077, + "grad_norm": 0.69140625, + "learning_rate": 4.211333667247125e-07, + "loss": 0.8753, + "step": 911 + }, + { + "epoch": 1.7538461538461538, + "grad_norm": 0.67578125, + "learning_rate": 4.147700710590563e-07, + "loss": 0.8862, + "step": 912 + }, + { + "epoch": 1.7557692307692307, + "grad_norm": 0.7109375, + "learning_rate": 4.0845313573301736e-07, + "loss": 0.9027, + "step": 913 + }, + { + "epoch": 1.7576923076923077, + "grad_norm": 0.72265625, + "learning_rate": 4.021826246158628e-07, + "loss": 0.9113, + "step": 914 + }, + { + "epoch": 1.7596153846153846, + "grad_norm": 0.71484375, + "learning_rate": 3.959586011074729e-07, + "loss": 0.8939, + "step": 915 + }, + { + "epoch": 1.7615384615384615, + "grad_norm": 0.7421875, + "learning_rate": 3.8978112813769786e-07, + "loss": 0.9124, + "step": 916 + }, + { + "epoch": 1.7634615384615384, + "grad_norm": 0.73046875, + "learning_rate": 3.836502681657289e-07, + "loss": 0.9074, + "step": 917 + }, + { + "epoch": 1.7653846153846153, + "grad_norm": 0.6953125, + "learning_rate": 3.7756608317946144e-07, + "loss": 0.8721, + "step": 918 + }, + { + "epoch": 1.7673076923076922, + "grad_norm": 0.71875, + "learning_rate": 3.715286346948671e-07, + "loss": 0.9188, + "step": 919 + }, + { + "epoch": 1.7692307692307692, + "grad_norm": 0.7109375, + "learning_rate": 3.6553798375537574e-07, + "loss": 0.8737, + "step": 920 + }, + { + "epoch": 1.771153846153846, + "grad_norm": 0.69140625, + "learning_rate": 3.595941909312595e-07, + "loss": 0.9252, + "step": 921 + }, + { + "epoch": 1.773076923076923, + "grad_norm": 0.78125, + "learning_rate": 3.5369731631901214e-07, + "loss": 0.921, + "step": 922 + }, + { + "epoch": 1.775, + "grad_norm": 0.7421875, + "learning_rate": 3.4784741954074884e-07, + "loss": 0.8851, + "step": 923 + }, + { + "epoch": 1.7769230769230768, + "grad_norm": 0.69140625, + "learning_rate": 3.420445597436056e-07, + "loss": 0.8651, + "step": 924 + }, + { + "epoch": 1.7788461538461537, + "grad_norm": 0.734375, + "learning_rate": 3.362887955991301e-07, + "loss": 0.902, + "step": 925 + }, + { + "epoch": 1.7807692307692307, + "grad_norm": 0.6796875, + "learning_rate": 3.305801853026985e-07, + "loss": 0.8853, + "step": 926 + }, + { + "epoch": 1.7826923076923076, + "grad_norm": 0.671875, + "learning_rate": 3.2491878657292643e-07, + "loss": 0.8499, + "step": 927 + }, + { + "epoch": 1.7846153846153845, + "grad_norm": 0.734375, + "learning_rate": 3.193046566510777e-07, + "loss": 0.8928, + "step": 928 + }, + { + "epoch": 1.7865384615384614, + "grad_norm": 0.71875, + "learning_rate": 3.1373785230049356e-07, + "loss": 0.8892, + "step": 929 + }, + { + "epoch": 1.7884615384615383, + "grad_norm": 0.703125, + "learning_rate": 3.0821842980601756e-07, + "loss": 0.8887, + "step": 930 + }, + { + "epoch": 1.7903846153846152, + "grad_norm": 0.7421875, + "learning_rate": 3.0274644497342133e-07, + "loss": 0.9069, + "step": 931 + }, + { + "epoch": 1.7923076923076922, + "grad_norm": 0.72265625, + "learning_rate": 2.9732195312884515e-07, + "loss": 0.9282, + "step": 932 + }, + { + "epoch": 1.794230769230769, + "grad_norm": 0.71484375, + "learning_rate": 2.91945009118238e-07, + "loss": 0.8736, + "step": 933 + }, + { + "epoch": 1.796153846153846, + "grad_norm": 0.73046875, + "learning_rate": 2.866156673068016e-07, + "loss": 0.9549, + "step": 934 + }, + { + "epoch": 1.7980769230769231, + "grad_norm": 0.69921875, + "learning_rate": 2.813339815784416e-07, + "loss": 0.8407, + "step": 935 + }, + { + "epoch": 1.8, + "grad_norm": 0.703125, + "learning_rate": 2.76100005335222e-07, + "loss": 0.9366, + "step": 936 + }, + { + "epoch": 1.801923076923077, + "grad_norm": 0.7734375, + "learning_rate": 2.7091379149682683e-07, + "loss": 0.8663, + "step": 937 + }, + { + "epoch": 1.8038461538461539, + "grad_norm": 0.73828125, + "learning_rate": 2.657753925000228e-07, + "loss": 0.8676, + "step": 938 + }, + { + "epoch": 1.8057692307692308, + "grad_norm": 0.6953125, + "learning_rate": 2.6068486029813154e-07, + "loss": 0.8491, + "step": 939 + }, + { + "epoch": 1.8076923076923077, + "grad_norm": 0.703125, + "learning_rate": 2.556422463605024e-07, + "loss": 0.9075, + "step": 940 + }, + { + "epoch": 1.8096153846153846, + "grad_norm": 0.6796875, + "learning_rate": 2.506476016719922e-07, + "loss": 0.8678, + "step": 941 + }, + { + "epoch": 1.8115384615384615, + "grad_norm": 0.7265625, + "learning_rate": 2.4570097673245197e-07, + "loss": 0.921, + "step": 942 + }, + { + "epoch": 1.8134615384615385, + "grad_norm": 0.75, + "learning_rate": 2.4080242155621327e-07, + "loss": 0.9795, + "step": 943 + }, + { + "epoch": 1.8153846153846154, + "grad_norm": 0.7421875, + "learning_rate": 2.3595198567158473e-07, + "loss": 0.9078, + "step": 944 + }, + { + "epoch": 1.8173076923076923, + "grad_norm": 0.71875, + "learning_rate": 2.3114971812034981e-07, + "loss": 0.8915, + "step": 945 + }, + { + "epoch": 1.8192307692307692, + "grad_norm": 0.70703125, + "learning_rate": 2.2639566745727203e-07, + "loss": 0.8709, + "step": 946 + }, + { + "epoch": 1.8211538461538461, + "grad_norm": 0.7109375, + "learning_rate": 2.2168988174960382e-07, + "loss": 0.9404, + "step": 947 + }, + { + "epoch": 1.823076923076923, + "grad_norm": 0.71875, + "learning_rate": 2.1703240857659958e-07, + "loss": 0.8965, + "step": 948 + }, + { + "epoch": 1.825, + "grad_norm": 0.70703125, + "learning_rate": 2.124232950290367e-07, + "loss": 0.8497, + "step": 949 + }, + { + "epoch": 1.8269230769230769, + "grad_norm": 0.72265625, + "learning_rate": 2.0786258770873647e-07, + "loss": 0.9354, + "step": 950 + }, + { + "epoch": 1.828846153846154, + "grad_norm": 0.73046875, + "learning_rate": 2.0335033272809612e-07, + "loss": 0.8639, + "step": 951 + }, + { + "epoch": 1.830769230769231, + "grad_norm": 0.70703125, + "learning_rate": 1.9888657570961924e-07, + "loss": 0.8492, + "step": 952 + }, + { + "epoch": 1.8326923076923078, + "grad_norm": 0.75390625, + "learning_rate": 1.9447136178545766e-07, + "loss": 0.929, + "step": 953 + }, + { + "epoch": 1.8346153846153848, + "grad_norm": 0.69140625, + "learning_rate": 1.9010473559695376e-07, + "loss": 0.8714, + "step": 954 + }, + { + "epoch": 1.8365384615384617, + "grad_norm": 0.70703125, + "learning_rate": 1.857867412941883e-07, + "loss": 0.9262, + "step": 955 + }, + { + "epoch": 1.8384615384615386, + "grad_norm": 0.7109375, + "learning_rate": 1.8151742253553483e-07, + "loss": 0.8796, + "step": 956 + }, + { + "epoch": 1.8403846153846155, + "grad_norm": 0.7421875, + "learning_rate": 1.7729682248721848e-07, + "loss": 0.8677, + "step": 957 + }, + { + "epoch": 1.8423076923076924, + "grad_norm": 0.68359375, + "learning_rate": 1.731249838228799e-07, + "loss": 0.8699, + "step": 958 + }, + { + "epoch": 1.8442307692307693, + "grad_norm": 0.6875, + "learning_rate": 1.69001948723142e-07, + "loss": 0.8765, + "step": 959 + }, + { + "epoch": 1.8461538461538463, + "grad_norm": 0.72265625, + "learning_rate": 1.649277588751863e-07, + "loss": 0.9203, + "step": 960 + }, + { + "epoch": 1.8480769230769232, + "grad_norm": 0.73828125, + "learning_rate": 1.6090245547232707e-07, + "loss": 0.9009, + "step": 961 + }, + { + "epoch": 1.85, + "grad_norm": 1.4140625, + "learning_rate": 1.5692607921360014e-07, + "loss": 0.8951, + "step": 962 + }, + { + "epoch": 1.851923076923077, + "grad_norm": 0.7109375, + "learning_rate": 1.5299867030334815e-07, + "loss": 0.938, + "step": 963 + }, + { + "epoch": 1.853846153846154, + "grad_norm": 0.71875, + "learning_rate": 1.491202684508136e-07, + "loss": 0.9381, + "step": 964 + }, + { + "epoch": 1.8557692307692308, + "grad_norm": 0.72265625, + "learning_rate": 1.4529091286973994e-07, + "loss": 0.9324, + "step": 965 + }, + { + "epoch": 1.8576923076923078, + "grad_norm": 0.7109375, + "learning_rate": 1.415106422779733e-07, + "loss": 0.881, + "step": 966 + }, + { + "epoch": 1.8596153846153847, + "grad_norm": 0.7734375, + "learning_rate": 1.3777949489706898e-07, + "loss": 0.8983, + "step": 967 + }, + { + "epoch": 1.8615384615384616, + "grad_norm": 0.7109375, + "learning_rate": 1.3409750845191138e-07, + "loss": 0.9162, + "step": 968 + }, + { + "epoch": 1.8634615384615385, + "grad_norm": 0.70703125, + "learning_rate": 1.3046472017032685e-07, + "loss": 0.8893, + "step": 969 + }, + { + "epoch": 1.8653846153846154, + "grad_norm": 0.68359375, + "learning_rate": 1.2688116678270636e-07, + "loss": 0.9097, + "step": 970 + }, + { + "epoch": 1.8673076923076923, + "grad_norm": 0.71484375, + "learning_rate": 1.2334688452164122e-07, + "loss": 0.9282, + "step": 971 + }, + { + "epoch": 1.8692307692307693, + "grad_norm": 0.69921875, + "learning_rate": 1.198619091215497e-07, + "loss": 0.8906, + "step": 972 + }, + { + "epoch": 1.8711538461538462, + "grad_norm": 0.796875, + "learning_rate": 1.1642627581831767e-07, + "loss": 0.9165, + "step": 973 + }, + { + "epoch": 1.873076923076923, + "grad_norm": 0.73828125, + "learning_rate": 1.1304001934894393e-07, + "loss": 0.9424, + "step": 974 + }, + { + "epoch": 1.875, + "grad_norm": 0.71875, + "learning_rate": 1.0970317395119001e-07, + "loss": 0.9375, + "step": 975 + }, + { + "epoch": 1.875, + "eval_loss": 0.9301103949546814, + "eval_runtime": 34.343, + "eval_samples_per_second": 68.223, + "eval_steps_per_second": 17.063, + "step": 975 + }, + { + "epoch": 1.876923076923077, + "grad_norm": 0.69921875, + "learning_rate": 1.0641577336322761e-07, + "loss": 0.8467, + "step": 976 + }, + { + "epoch": 1.8788461538461538, + "grad_norm": 0.6875, + "learning_rate": 1.0317785082330555e-07, + "loss": 0.9042, + "step": 977 + }, + { + "epoch": 1.8807692307692307, + "grad_norm": 0.71484375, + "learning_rate": 9.998943906941005e-08, + "loss": 0.8585, + "step": 978 + }, + { + "epoch": 1.8826923076923077, + "grad_norm": 0.703125, + "learning_rate": 9.685057033892998e-08, + "loss": 0.9187, + "step": 979 + }, + { + "epoch": 1.8846153846153846, + "grad_norm": 0.72265625, + "learning_rate": 9.376127636833876e-08, + "loss": 0.8607, + "step": 980 + }, + { + "epoch": 1.8865384615384615, + "grad_norm": 0.7265625, + "learning_rate": 9.072158839286748e-08, + "loss": 0.9251, + "step": 981 + }, + { + "epoch": 1.8884615384615384, + "grad_norm": 0.7265625, + "learning_rate": 8.773153714619064e-08, + "loss": 0.9172, + "step": 982 + }, + { + "epoch": 1.8903846153846153, + "grad_norm": 0.734375, + "learning_rate": 8.479115286011752e-08, + "loss": 0.9146, + "step": 983 + }, + { + "epoch": 1.8923076923076922, + "grad_norm": 0.69921875, + "learning_rate": 8.190046526428241e-08, + "loss": 0.8388, + "step": 984 + }, + { + "epoch": 1.8942307692307692, + "grad_norm": 0.6953125, + "learning_rate": 7.905950358584768e-08, + "loss": 0.8973, + "step": 985 + }, + { + "epoch": 1.896153846153846, + "grad_norm": 0.73046875, + "learning_rate": 7.626829654920732e-08, + "loss": 0.9986, + "step": 986 + }, + { + "epoch": 1.898076923076923, + "grad_norm": 0.7109375, + "learning_rate": 7.352687237569489e-08, + "loss": 0.9084, + "step": 987 + }, + { + "epoch": 1.9, + "grad_norm": 0.70703125, + "learning_rate": 7.08352587833e-08, + "loss": 0.8867, + "step": 988 + }, + { + "epoch": 1.9019230769230768, + "grad_norm": 0.6796875, + "learning_rate": 6.819348298638839e-08, + "loss": 0.9154, + "step": 989 + }, + { + "epoch": 1.9038461538461537, + "grad_norm": 0.671875, + "learning_rate": 6.560157169542391e-08, + "loss": 0.8247, + "step": 990 + }, + { + "epoch": 1.9057692307692307, + "grad_norm": 0.703125, + "learning_rate": 6.305955111670204e-08, + "loss": 0.9041, + "step": 991 + }, + { + "epoch": 1.9076923076923076, + "grad_norm": 0.6875, + "learning_rate": 6.056744695208283e-08, + "loss": 0.9043, + "step": 992 + }, + { + "epoch": 1.9096153846153845, + "grad_norm": 0.71875, + "learning_rate": 5.8125284398730666e-08, + "loss": 0.8587, + "step": 993 + }, + { + "epoch": 1.9115384615384614, + "grad_norm": 0.73828125, + "learning_rate": 5.573308814886158e-08, + "loss": 0.9092, + "step": 994 + }, + { + "epoch": 1.9134615384615383, + "grad_norm": 0.69921875, + "learning_rate": 5.339088238949186e-08, + "loss": 0.8529, + "step": 995 + }, + { + "epoch": 1.9153846153846152, + "grad_norm": 0.71484375, + "learning_rate": 5.109869080219376e-08, + "loss": 0.9078, + "step": 996 + }, + { + "epoch": 1.9173076923076922, + "grad_norm": 0.671875, + "learning_rate": 4.885653656285627e-08, + "loss": 0.8423, + "step": 997 + }, + { + "epoch": 1.919230769230769, + "grad_norm": 0.75390625, + "learning_rate": 4.666444234145084e-08, + "loss": 0.8814, + "step": 998 + }, + { + "epoch": 1.921153846153846, + "grad_norm": 0.734375, + "learning_rate": 4.45224303018027e-08, + "loss": 0.8811, + "step": 999 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 0.71875, + "learning_rate": 4.2430522101364894e-08, + "loss": 0.9339, + "step": 1000 + }, + { + "epoch": 1.925, + "grad_norm": 0.70703125, + "learning_rate": 4.038873889100237e-08, + "loss": 0.8606, + "step": 1001 + }, + { + "epoch": 1.926923076923077, + "grad_norm": 0.6953125, + "learning_rate": 3.839710131477492e-08, + "loss": 0.9095, + "step": 1002 + }, + { + "epoch": 1.9288461538461539, + "grad_norm": 0.73828125, + "learning_rate": 3.645562950973014e-08, + "loss": 0.9305, + "step": 1003 + }, + { + "epoch": 1.9307692307692308, + "grad_norm": 0.72265625, + "learning_rate": 3.456434310570023e-08, + "loss": 0.9045, + "step": 1004 + }, + { + "epoch": 1.9326923076923077, + "grad_norm": 0.74609375, + "learning_rate": 3.2723261225102164e-08, + "loss": 0.8942, + "step": 1005 + }, + { + "epoch": 1.9346153846153846, + "grad_norm": 0.6953125, + "learning_rate": 3.093240248274565e-08, + "loss": 0.8703, + "step": 1006 + }, + { + "epoch": 1.9365384615384615, + "grad_norm": 0.72265625, + "learning_rate": 2.9191784985644345e-08, + "loss": 0.9141, + "step": 1007 + }, + { + "epoch": 1.9384615384615385, + "grad_norm": 0.75, + "learning_rate": 2.7501426332831594e-08, + "loss": 0.8912, + "step": 1008 + }, + { + "epoch": 1.9403846153846154, + "grad_norm": 0.7109375, + "learning_rate": 2.5861343615184997e-08, + "loss": 0.8769, + "step": 1009 + }, + { + "epoch": 1.9423076923076923, + "grad_norm": 0.6953125, + "learning_rate": 2.427155341525156e-08, + "loss": 0.8312, + "step": 1010 + }, + { + "epoch": 1.9442307692307692, + "grad_norm": 0.72265625, + "learning_rate": 2.2732071807081147e-08, + "loss": 0.8963, + "step": 1011 + }, + { + "epoch": 1.9461538461538461, + "grad_norm": 0.6796875, + "learning_rate": 2.1242914356063292e-08, + "loss": 0.9095, + "step": 1012 + }, + { + "epoch": 1.948076923076923, + "grad_norm": 0.71484375, + "learning_rate": 1.98040961187701e-08, + "loss": 0.9588, + "step": 1013 + }, + { + "epoch": 1.95, + "grad_norm": 0.78125, + "learning_rate": 1.841563164280413e-08, + "loss": 0.8922, + "step": 1014 + }, + { + "epoch": 1.9519230769230769, + "grad_norm": 0.8046875, + "learning_rate": 1.7077534966650767e-08, + "loss": 0.8975, + "step": 1015 + }, + { + "epoch": 1.953846153846154, + "grad_norm": 0.74609375, + "learning_rate": 1.5789819619537182e-08, + "loss": 0.8766, + "step": 1016 + }, + { + "epoch": 1.955769230769231, + "grad_norm": 0.7109375, + "learning_rate": 1.4552498621295264e-08, + "loss": 0.9691, + "step": 1017 + }, + { + "epoch": 1.9576923076923078, + "grad_norm": 0.71484375, + "learning_rate": 1.3365584482228356e-08, + "loss": 0.8297, + "step": 1018 + }, + { + "epoch": 1.9596153846153848, + "grad_norm": 0.70703125, + "learning_rate": 1.2229089202987487e-08, + "loss": 0.9212, + "step": 1019 + }, + { + "epoch": 1.9615384615384617, + "grad_norm": 0.7265625, + "learning_rate": 1.1143024274448689e-08, + "loss": 0.9787, + "step": 1020 + }, + { + "epoch": 1.9634615384615386, + "grad_norm": 0.703125, + "learning_rate": 1.0107400677596413e-08, + "loss": 0.9338, + "step": 1021 + }, + { + "epoch": 1.9653846153846155, + "grad_norm": 0.75, + "learning_rate": 9.12222888341252e-09, + "loss": 0.901, + "step": 1022 + }, + { + "epoch": 1.9673076923076924, + "grad_norm": 0.67578125, + "learning_rate": 8.187518852771914e-09, + "loss": 0.8544, + "step": 1023 + }, + { + "epoch": 1.9692307692307693, + "grad_norm": 0.67578125, + "learning_rate": 7.3032800363398435e-09, + "loss": 0.892, + "step": 1024 + }, + { + "epoch": 1.9711538461538463, + "grad_norm": 0.71484375, + "learning_rate": 6.469521374477539e-09, + "loss": 0.8719, + "step": 1025 + }, + { + "epoch": 1.9730769230769232, + "grad_norm": 0.73046875, + "learning_rate": 5.686251297151724e-09, + "loss": 0.9165, + "step": 1026 + }, + { + "epoch": 1.975, + "grad_norm": 0.73828125, + "learning_rate": 4.9534777238485764e-09, + "loss": 0.9102, + "step": 1027 + }, + { + "epoch": 1.976923076923077, + "grad_norm": 0.734375, + "learning_rate": 4.2712080634949024e-09, + "loss": 0.9305, + "step": 1028 + }, + { + "epoch": 1.978846153846154, + "grad_norm": 0.6953125, + "learning_rate": 3.6394492143820847e-09, + "loss": 0.9779, + "step": 1029 + }, + { + "epoch": 1.9807692307692308, + "grad_norm": 0.73046875, + "learning_rate": 3.0582075640972487e-09, + "loss": 0.904, + "step": 1030 + }, + { + "epoch": 1.9826923076923078, + "grad_norm": 0.6953125, + "learning_rate": 2.5274889894583156e-09, + "loss": 0.9295, + "step": 1031 + }, + { + "epoch": 1.9846153846153847, + "grad_norm": 0.6640625, + "learning_rate": 2.0472988564540496e-09, + "loss": 0.8869, + "step": 1032 + }, + { + "epoch": 1.9865384615384616, + "grad_norm": 0.73046875, + "learning_rate": 1.6176420201902132e-09, + "loss": 0.9327, + "step": 1033 + }, + { + "epoch": 1.9884615384615385, + "grad_norm": 0.73046875, + "learning_rate": 1.2385228248407155e-09, + "loss": 0.9076, + "step": 1034 + }, + { + "epoch": 1.9903846153846154, + "grad_norm": 0.71875, + "learning_rate": 9.099451036048701e-10, + "loss": 0.9368, + "step": 1035 + }, + { + "epoch": 1.9923076923076923, + "grad_norm": 0.73828125, + "learning_rate": 6.319121786646509e-10, + "loss": 0.8626, + "step": 1036 + }, + { + "epoch": 1.9942307692307693, + "grad_norm": 0.7421875, + "learning_rate": 4.0442686115582665e-10, + "loss": 0.9733, + "step": 1037 + }, + { + "epoch": 1.9961538461538462, + "grad_norm": 0.71875, + "learning_rate": 2.274914511374293e-10, + "loss": 0.824, + "step": 1038 + }, + { + "epoch": 1.998076923076923, + "grad_norm": 0.7265625, + "learning_rate": 1.011077375662195e-10, + "loss": 0.8492, + "step": 1039 + }, + { + "epoch": 2.0, + "grad_norm": 0.7265625, + "learning_rate": 2.5276998284473608e-11, + "loss": 0.8473, + "step": 1040 + }, + { + "epoch": 2.0, + "eval_loss": 0.930040180683136, + "eval_runtime": 34.5175, + "eval_samples_per_second": 67.879, + "eval_steps_per_second": 16.977, + "step": 1040 + } + ], + "logging_steps": 1, + "max_steps": 1040, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 520, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.943620643240018e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}