diff --git "a/checkpoint-10248/trainer_state.json" "b/checkpoint-10248/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-10248/trainer_state.json" @@ -0,0 +1,71769 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 10248, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002927400468384075, + "grad_norm": 2.3541512489318848, + "learning_rate": 5.0000000000000004e-08, + "loss": 1.1378, + "step": 1 + }, + { + "epoch": 0.000585480093676815, + "grad_norm": 2.106961488723755, + "learning_rate": 1.0000000000000001e-07, + "loss": 1.0751, + "step": 2 + }, + { + "epoch": 0.0008782201405152225, + "grad_norm": 1.988677978515625, + "learning_rate": 1.5000000000000002e-07, + "loss": 1.0645, + "step": 3 + }, + { + "epoch": 0.00117096018735363, + "grad_norm": 1.8958356380462646, + "learning_rate": 2.0000000000000002e-07, + "loss": 1.0391, + "step": 4 + }, + { + "epoch": 0.0014637002341920376, + "grad_norm": 2.0591158866882324, + "learning_rate": 2.5000000000000004e-07, + "loss": 1.0987, + "step": 5 + }, + { + "epoch": 0.001756440281030445, + "grad_norm": 2.1340889930725098, + "learning_rate": 3.0000000000000004e-07, + "loss": 1.151, + "step": 6 + }, + { + "epoch": 0.0020491803278688526, + "grad_norm": 2.2017033100128174, + "learning_rate": 3.5000000000000004e-07, + "loss": 1.1097, + "step": 7 + }, + { + "epoch": 0.00234192037470726, + "grad_norm": 2.4071333408355713, + "learning_rate": 4.0000000000000003e-07, + "loss": 1.0969, + "step": 8 + }, + { + "epoch": 0.0026346604215456673, + "grad_norm": 2.45190167427063, + "learning_rate": 4.5000000000000003e-07, + "loss": 1.1309, + "step": 9 + }, + { + "epoch": 0.002927400468384075, + "grad_norm": 2.1820831298828125, + "learning_rate": 5.000000000000001e-07, + "loss": 1.0835, + "step": 10 + }, + { + "epoch": 0.0032201405152224825, + "grad_norm": 3.201382637023926, + "learning_rate": 5.5e-07, + "loss": 1.0574, + "step": 11 + }, + { + "epoch": 0.00351288056206089, + "grad_norm": 2.133012533187866, + "learning_rate": 6.000000000000001e-07, + "loss": 1.1454, + "step": 12 + }, + { + "epoch": 0.0038056206088992973, + "grad_norm": 1.9446614980697632, + "learning_rate": 6.5e-07, + "loss": 1.078, + "step": 13 + }, + { + "epoch": 0.004098360655737705, + "grad_norm": 1.9025191068649292, + "learning_rate": 7.000000000000001e-07, + "loss": 1.0084, + "step": 14 + }, + { + "epoch": 0.004391100702576112, + "grad_norm": 2.0782299041748047, + "learning_rate": 7.5e-07, + "loss": 1.1299, + "step": 15 + }, + { + "epoch": 0.00468384074941452, + "grad_norm": 1.9437332153320312, + "learning_rate": 8.000000000000001e-07, + "loss": 1.1095, + "step": 16 + }, + { + "epoch": 0.004976580796252928, + "grad_norm": 1.8862369060516357, + "learning_rate": 8.500000000000001e-07, + "loss": 1.0881, + "step": 17 + }, + { + "epoch": 0.005269320843091335, + "grad_norm": 1.8146216869354248, + "learning_rate": 9.000000000000001e-07, + "loss": 1.0693, + "step": 18 + }, + { + "epoch": 0.0055620608899297425, + "grad_norm": 1.820211410522461, + "learning_rate": 9.500000000000001e-07, + "loss": 1.0797, + "step": 19 + }, + { + "epoch": 0.00585480093676815, + "grad_norm": 1.963200569152832, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.1173, + "step": 20 + }, + { + "epoch": 0.006147540983606557, + "grad_norm": 1.6739530563354492, + "learning_rate": 1.0500000000000001e-06, + "loss": 1.0382, + "step": 21 + }, + { + "epoch": 0.006440281030444965, + "grad_norm": 1.7953011989593506, + "learning_rate": 1.1e-06, + "loss": 1.0755, + "step": 22 + }, + { + "epoch": 0.006733021077283372, + "grad_norm": 1.607630968093872, + "learning_rate": 1.1500000000000002e-06, + "loss": 1.113, + "step": 23 + }, + { + "epoch": 0.00702576112412178, + "grad_norm": 1.5690702199935913, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.0766, + "step": 24 + }, + { + "epoch": 0.007318501170960188, + "grad_norm": 1.556094765663147, + "learning_rate": 1.25e-06, + "loss": 1.082, + "step": 25 + }, + { + "epoch": 0.007611241217798595, + "grad_norm": 1.5100266933441162, + "learning_rate": 1.3e-06, + "loss": 1.0392, + "step": 26 + }, + { + "epoch": 0.007903981264637002, + "grad_norm": 1.4244755506515503, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.963, + "step": 27 + }, + { + "epoch": 0.00819672131147541, + "grad_norm": 1.3507155179977417, + "learning_rate": 1.4000000000000001e-06, + "loss": 1.0068, + "step": 28 + }, + { + "epoch": 0.008489461358313818, + "grad_norm": 1.2662217617034912, + "learning_rate": 1.45e-06, + "loss": 0.983, + "step": 29 + }, + { + "epoch": 0.008782201405152224, + "grad_norm": 1.302406907081604, + "learning_rate": 1.5e-06, + "loss": 1.0428, + "step": 30 + }, + { + "epoch": 0.009074941451990632, + "grad_norm": 1.2275854349136353, + "learning_rate": 1.5500000000000002e-06, + "loss": 1.0395, + "step": 31 + }, + { + "epoch": 0.00936768149882904, + "grad_norm": 1.1993814706802368, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.9881, + "step": 32 + }, + { + "epoch": 0.009660421545667448, + "grad_norm": 1.1544785499572754, + "learning_rate": 1.6500000000000003e-06, + "loss": 1.0023, + "step": 33 + }, + { + "epoch": 0.009953161592505855, + "grad_norm": 1.1538089513778687, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.9822, + "step": 34 + }, + { + "epoch": 0.010245901639344262, + "grad_norm": 1.0913233757019043, + "learning_rate": 1.75e-06, + "loss": 1.0014, + "step": 35 + }, + { + "epoch": 0.01053864168618267, + "grad_norm": 1.0799410343170166, + "learning_rate": 1.8000000000000001e-06, + "loss": 1.0135, + "step": 36 + }, + { + "epoch": 0.010831381733021077, + "grad_norm": 1.1881572008132935, + "learning_rate": 1.85e-06, + "loss": 1.0422, + "step": 37 + }, + { + "epoch": 0.011124121779859485, + "grad_norm": 1.0558487176895142, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.9245, + "step": 38 + }, + { + "epoch": 0.011416861826697893, + "grad_norm": 1.1681361198425293, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.9446, + "step": 39 + }, + { + "epoch": 0.0117096018735363, + "grad_norm": 1.0581327676773071, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.947, + "step": 40 + }, + { + "epoch": 0.012002341920374707, + "grad_norm": 1.1554250717163086, + "learning_rate": 2.05e-06, + "loss": 0.9738, + "step": 41 + }, + { + "epoch": 0.012295081967213115, + "grad_norm": 1.0748344659805298, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.9108, + "step": 42 + }, + { + "epoch": 0.012587822014051522, + "grad_norm": 1.0412883758544922, + "learning_rate": 2.15e-06, + "loss": 0.9769, + "step": 43 + }, + { + "epoch": 0.01288056206088993, + "grad_norm": 1.0339866876602173, + "learning_rate": 2.2e-06, + "loss": 0.9144, + "step": 44 + }, + { + "epoch": 0.013173302107728338, + "grad_norm": 1.0747230052947998, + "learning_rate": 2.25e-06, + "loss": 0.962, + "step": 45 + }, + { + "epoch": 0.013466042154566744, + "grad_norm": 1.0155915021896362, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.9488, + "step": 46 + }, + { + "epoch": 0.013758782201405152, + "grad_norm": 0.9912748336791992, + "learning_rate": 2.35e-06, + "loss": 0.9221, + "step": 47 + }, + { + "epoch": 0.01405152224824356, + "grad_norm": 1.0419483184814453, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.9144, + "step": 48 + }, + { + "epoch": 0.014344262295081968, + "grad_norm": 1.0045194625854492, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.8505, + "step": 49 + }, + { + "epoch": 0.014637002341920375, + "grad_norm": 1.0407166481018066, + "learning_rate": 2.5e-06, + "loss": 0.978, + "step": 50 + }, + { + "epoch": 0.014929742388758781, + "grad_norm": 1.0109283924102783, + "learning_rate": 2.55e-06, + "loss": 0.9948, + "step": 51 + }, + { + "epoch": 0.01522248243559719, + "grad_norm": 0.9554976224899292, + "learning_rate": 2.6e-06, + "loss": 0.9181, + "step": 52 + }, + { + "epoch": 0.015515222482435597, + "grad_norm": 1.027953863143921, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.8891, + "step": 53 + }, + { + "epoch": 0.015807962529274005, + "grad_norm": 1.0142865180969238, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.9203, + "step": 54 + }, + { + "epoch": 0.016100702576112413, + "grad_norm": 0.9523277282714844, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.8914, + "step": 55 + }, + { + "epoch": 0.01639344262295082, + "grad_norm": 0.9507057666778564, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.8881, + "step": 56 + }, + { + "epoch": 0.01668618266978923, + "grad_norm": 0.9237038493156433, + "learning_rate": 2.85e-06, + "loss": 0.8434, + "step": 57 + }, + { + "epoch": 0.016978922716627636, + "grad_norm": 1.0065115690231323, + "learning_rate": 2.9e-06, + "loss": 0.9173, + "step": 58 + }, + { + "epoch": 0.01727166276346604, + "grad_norm": 0.933765172958374, + "learning_rate": 2.95e-06, + "loss": 0.8705, + "step": 59 + }, + { + "epoch": 0.01756440281030445, + "grad_norm": 0.9817979335784912, + "learning_rate": 3e-06, + "loss": 0.8726, + "step": 60 + }, + { + "epoch": 0.017857142857142856, + "grad_norm": 0.9656811952590942, + "learning_rate": 3.05e-06, + "loss": 0.9444, + "step": 61 + }, + { + "epoch": 0.018149882903981264, + "grad_norm": 1.0471440553665161, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.882, + "step": 62 + }, + { + "epoch": 0.018442622950819672, + "grad_norm": 0.9930428862571716, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.8667, + "step": 63 + }, + { + "epoch": 0.01873536299765808, + "grad_norm": 0.9586749076843262, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.8508, + "step": 64 + }, + { + "epoch": 0.019028103044496487, + "grad_norm": 0.9275703430175781, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.8905, + "step": 65 + }, + { + "epoch": 0.019320843091334895, + "grad_norm": 0.9886481761932373, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.9159, + "step": 66 + }, + { + "epoch": 0.019613583138173303, + "grad_norm": 1.029747486114502, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.8999, + "step": 67 + }, + { + "epoch": 0.01990632318501171, + "grad_norm": 0.9590982794761658, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.8817, + "step": 68 + }, + { + "epoch": 0.02019906323185012, + "grad_norm": 0.9610603451728821, + "learning_rate": 3.45e-06, + "loss": 0.9117, + "step": 69 + }, + { + "epoch": 0.020491803278688523, + "grad_norm": 1.1659636497497559, + "learning_rate": 3.5e-06, + "loss": 0.8588, + "step": 70 + }, + { + "epoch": 0.02078454332552693, + "grad_norm": 1.0044513940811157, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.935, + "step": 71 + }, + { + "epoch": 0.02107728337236534, + "grad_norm": 1.046877145767212, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.9429, + "step": 72 + }, + { + "epoch": 0.021370023419203747, + "grad_norm": 0.9645074009895325, + "learning_rate": 3.65e-06, + "loss": 0.9361, + "step": 73 + }, + { + "epoch": 0.021662763466042154, + "grad_norm": 0.9304291009902954, + "learning_rate": 3.7e-06, + "loss": 0.9011, + "step": 74 + }, + { + "epoch": 0.021955503512880562, + "grad_norm": 0.9350640177726746, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.8775, + "step": 75 + }, + { + "epoch": 0.02224824355971897, + "grad_norm": 0.9947004318237305, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.8904, + "step": 76 + }, + { + "epoch": 0.022540983606557378, + "grad_norm": 0.925485372543335, + "learning_rate": 3.85e-06, + "loss": 0.8465, + "step": 77 + }, + { + "epoch": 0.022833723653395786, + "grad_norm": 0.9247211217880249, + "learning_rate": 3.900000000000001e-06, + "loss": 0.8451, + "step": 78 + }, + { + "epoch": 0.023126463700234193, + "grad_norm": 0.9751530885696411, + "learning_rate": 3.95e-06, + "loss": 0.8758, + "step": 79 + }, + { + "epoch": 0.0234192037470726, + "grad_norm": 0.9623793959617615, + "learning_rate": 4.000000000000001e-06, + "loss": 0.8635, + "step": 80 + }, + { + "epoch": 0.023711943793911006, + "grad_norm": 0.9478814601898193, + "learning_rate": 4.05e-06, + "loss": 0.8847, + "step": 81 + }, + { + "epoch": 0.024004683840749413, + "grad_norm": 1.0501374006271362, + "learning_rate": 4.1e-06, + "loss": 0.8471, + "step": 82 + }, + { + "epoch": 0.02429742388758782, + "grad_norm": 1.2229230403900146, + "learning_rate": 4.15e-06, + "loss": 0.8656, + "step": 83 + }, + { + "epoch": 0.02459016393442623, + "grad_norm": 0.9581073522567749, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.8674, + "step": 84 + }, + { + "epoch": 0.024882903981264637, + "grad_norm": 0.9169310927391052, + "learning_rate": 4.25e-06, + "loss": 0.8435, + "step": 85 + }, + { + "epoch": 0.025175644028103045, + "grad_norm": 0.9522615671157837, + "learning_rate": 4.3e-06, + "loss": 0.8712, + "step": 86 + }, + { + "epoch": 0.025468384074941453, + "grad_norm": 0.947637140750885, + "learning_rate": 4.350000000000001e-06, + "loss": 0.8942, + "step": 87 + }, + { + "epoch": 0.02576112412177986, + "grad_norm": 0.9472607970237732, + "learning_rate": 4.4e-06, + "loss": 0.8041, + "step": 88 + }, + { + "epoch": 0.026053864168618268, + "grad_norm": 1.0139119625091553, + "learning_rate": 4.450000000000001e-06, + "loss": 0.8999, + "step": 89 + }, + { + "epoch": 0.026346604215456676, + "grad_norm": 0.9121288657188416, + "learning_rate": 4.5e-06, + "loss": 0.7585, + "step": 90 + }, + { + "epoch": 0.02663934426229508, + "grad_norm": 0.9676820039749146, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.8882, + "step": 91 + }, + { + "epoch": 0.026932084309133488, + "grad_norm": 0.9133880734443665, + "learning_rate": 4.600000000000001e-06, + "loss": 0.8727, + "step": 92 + }, + { + "epoch": 0.027224824355971896, + "grad_norm": 0.9575766324996948, + "learning_rate": 4.65e-06, + "loss": 0.8681, + "step": 93 + }, + { + "epoch": 0.027517564402810304, + "grad_norm": 0.9919242858886719, + "learning_rate": 4.7e-06, + "loss": 0.8407, + "step": 94 + }, + { + "epoch": 0.02781030444964871, + "grad_norm": 0.9768396019935608, + "learning_rate": 4.75e-06, + "loss": 0.8725, + "step": 95 + }, + { + "epoch": 0.02810304449648712, + "grad_norm": 0.9673435091972351, + "learning_rate": 4.800000000000001e-06, + "loss": 0.8056, + "step": 96 + }, + { + "epoch": 0.028395784543325527, + "grad_norm": 0.9342538118362427, + "learning_rate": 4.85e-06, + "loss": 0.8125, + "step": 97 + }, + { + "epoch": 0.028688524590163935, + "grad_norm": 1.0368341207504272, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.8705, + "step": 98 + }, + { + "epoch": 0.028981264637002343, + "grad_norm": 0.9556247591972351, + "learning_rate": 4.95e-06, + "loss": 0.8375, + "step": 99 + }, + { + "epoch": 0.02927400468384075, + "grad_norm": 0.9754555225372314, + "learning_rate": 5e-06, + "loss": 0.8384, + "step": 100 + }, + { + "epoch": 0.02956674473067916, + "grad_norm": 0.9527928233146667, + "learning_rate": 4.99999997034351e-06, + "loss": 0.8102, + "step": 101 + }, + { + "epoch": 0.029859484777517563, + "grad_norm": 1.0043853521347046, + "learning_rate": 4.99999988137404e-06, + "loss": 0.8473, + "step": 102 + }, + { + "epoch": 0.03015222482435597, + "grad_norm": 0.9378682971000671, + "learning_rate": 4.999999733091593e-06, + "loss": 0.8444, + "step": 103 + }, + { + "epoch": 0.03044496487119438, + "grad_norm": 0.9688631296157837, + "learning_rate": 4.999999525496172e-06, + "loss": 0.8674, + "step": 104 + }, + { + "epoch": 0.030737704918032786, + "grad_norm": 0.9299169778823853, + "learning_rate": 4.9999992585877825e-06, + "loss": 0.8722, + "step": 105 + }, + { + "epoch": 0.031030444964871194, + "grad_norm": 1.0180927515029907, + "learning_rate": 4.9999989323664285e-06, + "loss": 0.8352, + "step": 106 + }, + { + "epoch": 0.0313231850117096, + "grad_norm": 0.9455348253250122, + "learning_rate": 4.999998546832121e-06, + "loss": 0.8462, + "step": 107 + }, + { + "epoch": 0.03161592505854801, + "grad_norm": 0.9903197288513184, + "learning_rate": 4.999998101984867e-06, + "loss": 0.8884, + "step": 108 + }, + { + "epoch": 0.03190866510538642, + "grad_norm": 0.9969741702079773, + "learning_rate": 4.999997597824678e-06, + "loss": 0.8133, + "step": 109 + }, + { + "epoch": 0.032201405152224825, + "grad_norm": 0.9899001717567444, + "learning_rate": 4.999997034351566e-06, + "loss": 0.8648, + "step": 110 + }, + { + "epoch": 0.03249414519906323, + "grad_norm": 0.9291925430297852, + "learning_rate": 4.999996411565543e-06, + "loss": 0.7833, + "step": 111 + }, + { + "epoch": 0.03278688524590164, + "grad_norm": 0.9759557843208313, + "learning_rate": 4.999995729466626e-06, + "loss": 0.8471, + "step": 112 + }, + { + "epoch": 0.03307962529274005, + "grad_norm": 1.013403296470642, + "learning_rate": 4.999994988054829e-06, + "loss": 0.8198, + "step": 113 + }, + { + "epoch": 0.03337236533957846, + "grad_norm": 1.1213629245758057, + "learning_rate": 4.9999941873301715e-06, + "loss": 0.8104, + "step": 114 + }, + { + "epoch": 0.033665105386416865, + "grad_norm": 0.9904029965400696, + "learning_rate": 4.9999933272926715e-06, + "loss": 0.7855, + "step": 115 + }, + { + "epoch": 0.03395784543325527, + "grad_norm": 0.9461360573768616, + "learning_rate": 4.999992407942349e-06, + "loss": 0.835, + "step": 116 + }, + { + "epoch": 0.03425058548009368, + "grad_norm": 0.9951664209365845, + "learning_rate": 4.999991429279227e-06, + "loss": 0.8285, + "step": 117 + }, + { + "epoch": 0.03454332552693208, + "grad_norm": 1.0482087135314941, + "learning_rate": 4.9999903913033275e-06, + "loss": 0.833, + "step": 118 + }, + { + "epoch": 0.03483606557377049, + "grad_norm": 0.9666406512260437, + "learning_rate": 4.999989294014676e-06, + "loss": 0.8413, + "step": 119 + }, + { + "epoch": 0.0351288056206089, + "grad_norm": 0.98187655210495, + "learning_rate": 4.999988137413297e-06, + "loss": 0.8815, + "step": 120 + }, + { + "epoch": 0.035421545667447304, + "grad_norm": 0.9637346863746643, + "learning_rate": 4.99998692149922e-06, + "loss": 0.8431, + "step": 121 + }, + { + "epoch": 0.03571428571428571, + "grad_norm": 0.9825169444084167, + "learning_rate": 4.999985646272474e-06, + "loss": 0.8683, + "step": 122 + }, + { + "epoch": 0.03600702576112412, + "grad_norm": 0.9963875412940979, + "learning_rate": 4.9999843117330875e-06, + "loss": 0.8453, + "step": 123 + }, + { + "epoch": 0.03629976580796253, + "grad_norm": 1.0644196271896362, + "learning_rate": 4.999982917881092e-06, + "loss": 0.8121, + "step": 124 + }, + { + "epoch": 0.036592505854800936, + "grad_norm": 1.0068858861923218, + "learning_rate": 4.999981464716522e-06, + "loss": 0.8445, + "step": 125 + }, + { + "epoch": 0.036885245901639344, + "grad_norm": 1.1024317741394043, + "learning_rate": 4.999979952239412e-06, + "loss": 0.8344, + "step": 126 + }, + { + "epoch": 0.03717798594847775, + "grad_norm": 1.029248833656311, + "learning_rate": 4.999978380449797e-06, + "loss": 0.804, + "step": 127 + }, + { + "epoch": 0.03747072599531616, + "grad_norm": 1.0084072351455688, + "learning_rate": 4.9999767493477145e-06, + "loss": 0.8576, + "step": 128 + }, + { + "epoch": 0.03776346604215457, + "grad_norm": 0.9764242768287659, + "learning_rate": 4.9999750589332045e-06, + "loss": 0.8302, + "step": 129 + }, + { + "epoch": 0.038056206088992975, + "grad_norm": 0.9733086228370667, + "learning_rate": 4.999973309206304e-06, + "loss": 0.8322, + "step": 130 + }, + { + "epoch": 0.03834894613583138, + "grad_norm": 0.9909419417381287, + "learning_rate": 4.999971500167058e-06, + "loss": 0.8474, + "step": 131 + }, + { + "epoch": 0.03864168618266979, + "grad_norm": 1.0278211832046509, + "learning_rate": 4.999969631815506e-06, + "loss": 0.8133, + "step": 132 + }, + { + "epoch": 0.0389344262295082, + "grad_norm": 1.0617951154708862, + "learning_rate": 4.999967704151696e-06, + "loss": 0.8637, + "step": 133 + }, + { + "epoch": 0.039227166276346606, + "grad_norm": 1.0767017602920532, + "learning_rate": 4.999965717175671e-06, + "loss": 0.8205, + "step": 134 + }, + { + "epoch": 0.039519906323185014, + "grad_norm": 1.005443811416626, + "learning_rate": 4.99996367088748e-06, + "loss": 0.8251, + "step": 135 + }, + { + "epoch": 0.03981264637002342, + "grad_norm": 1.0080013275146484, + "learning_rate": 4.999961565287169e-06, + "loss": 0.8261, + "step": 136 + }, + { + "epoch": 0.04010538641686183, + "grad_norm": 1.0612362623214722, + "learning_rate": 4.9999594003747906e-06, + "loss": 0.8115, + "step": 137 + }, + { + "epoch": 0.04039812646370024, + "grad_norm": 1.105657935142517, + "learning_rate": 4.999957176150396e-06, + "loss": 0.8486, + "step": 138 + }, + { + "epoch": 0.04069086651053864, + "grad_norm": 1.0058767795562744, + "learning_rate": 4.999954892614036e-06, + "loss": 0.8068, + "step": 139 + }, + { + "epoch": 0.040983606557377046, + "grad_norm": 0.9760586023330688, + "learning_rate": 4.999952549765766e-06, + "loss": 0.8347, + "step": 140 + }, + { + "epoch": 0.041276346604215454, + "grad_norm": 0.977874755859375, + "learning_rate": 4.999950147605641e-06, + "loss": 0.8366, + "step": 141 + }, + { + "epoch": 0.04156908665105386, + "grad_norm": 0.9850569367408752, + "learning_rate": 4.999947686133719e-06, + "loss": 0.8515, + "step": 142 + }, + { + "epoch": 0.04186182669789227, + "grad_norm": 0.9677111506462097, + "learning_rate": 4.999945165350058e-06, + "loss": 0.8093, + "step": 143 + }, + { + "epoch": 0.04215456674473068, + "grad_norm": 0.9514144659042358, + "learning_rate": 4.999942585254718e-06, + "loss": 0.8297, + "step": 144 + }, + { + "epoch": 0.042447306791569085, + "grad_norm": 0.9613053798675537, + "learning_rate": 4.9999399458477586e-06, + "loss": 0.8565, + "step": 145 + }, + { + "epoch": 0.04274004683840749, + "grad_norm": 1.1038466691970825, + "learning_rate": 4.999937247129245e-06, + "loss": 0.8504, + "step": 146 + }, + { + "epoch": 0.0430327868852459, + "grad_norm": 0.9453856945037842, + "learning_rate": 4.9999344890992395e-06, + "loss": 0.8582, + "step": 147 + }, + { + "epoch": 0.04332552693208431, + "grad_norm": 0.9916960000991821, + "learning_rate": 4.999931671757807e-06, + "loss": 0.7936, + "step": 148 + }, + { + "epoch": 0.043618266978922716, + "grad_norm": 1.005887508392334, + "learning_rate": 4.9999287951050166e-06, + "loss": 0.8671, + "step": 149 + }, + { + "epoch": 0.043911007025761124, + "grad_norm": 1.4730418920516968, + "learning_rate": 4.9999258591409345e-06, + "loss": 0.812, + "step": 150 + }, + { + "epoch": 0.04420374707259953, + "grad_norm": 1.0366698503494263, + "learning_rate": 4.999922863865632e-06, + "loss": 0.8502, + "step": 151 + }, + { + "epoch": 0.04449648711943794, + "grad_norm": 0.9877177476882935, + "learning_rate": 4.999919809279178e-06, + "loss": 0.8203, + "step": 152 + }, + { + "epoch": 0.04478922716627635, + "grad_norm": 0.9640616178512573, + "learning_rate": 4.999916695381647e-06, + "loss": 0.7793, + "step": 153 + }, + { + "epoch": 0.045081967213114756, + "grad_norm": 1.0057071447372437, + "learning_rate": 4.999913522173112e-06, + "loss": 0.8032, + "step": 154 + }, + { + "epoch": 0.04537470725995316, + "grad_norm": 0.9726261496543884, + "learning_rate": 4.999910289653647e-06, + "loss": 0.8131, + "step": 155 + }, + { + "epoch": 0.04566744730679157, + "grad_norm": 1.0055077075958252, + "learning_rate": 4.999906997823333e-06, + "loss": 0.8522, + "step": 156 + }, + { + "epoch": 0.04596018735362998, + "grad_norm": 1.0414237976074219, + "learning_rate": 4.999903646682243e-06, + "loss": 0.8367, + "step": 157 + }, + { + "epoch": 0.04625292740046839, + "grad_norm": 1.0839874744415283, + "learning_rate": 4.99990023623046e-06, + "loss": 0.8204, + "step": 158 + }, + { + "epoch": 0.046545667447306795, + "grad_norm": 0.9151723980903625, + "learning_rate": 4.9998967664680625e-06, + "loss": 0.7725, + "step": 159 + }, + { + "epoch": 0.0468384074941452, + "grad_norm": 1.0398532152175903, + "learning_rate": 4.999893237395134e-06, + "loss": 0.8114, + "step": 160 + }, + { + "epoch": 0.0471311475409836, + "grad_norm": 1.0006062984466553, + "learning_rate": 4.999889649011759e-06, + "loss": 0.8125, + "step": 161 + }, + { + "epoch": 0.04742388758782201, + "grad_norm": 0.9722331762313843, + "learning_rate": 4.999886001318021e-06, + "loss": 0.7795, + "step": 162 + }, + { + "epoch": 0.04771662763466042, + "grad_norm": 1.0387248992919922, + "learning_rate": 4.999882294314008e-06, + "loss": 0.839, + "step": 163 + }, + { + "epoch": 0.04800936768149883, + "grad_norm": 0.96611088514328, + "learning_rate": 4.999878527999807e-06, + "loss": 0.8362, + "step": 164 + }, + { + "epoch": 0.048302107728337235, + "grad_norm": 0.9805483818054199, + "learning_rate": 4.999874702375508e-06, + "loss": 0.7881, + "step": 165 + }, + { + "epoch": 0.04859484777517564, + "grad_norm": 0.9570621848106384, + "learning_rate": 4.9998708174412005e-06, + "loss": 0.7942, + "step": 166 + }, + { + "epoch": 0.04888758782201405, + "grad_norm": 0.9646132588386536, + "learning_rate": 4.999866873196978e-06, + "loss": 0.7973, + "step": 167 + }, + { + "epoch": 0.04918032786885246, + "grad_norm": 1.0000524520874023, + "learning_rate": 4.9998628696429345e-06, + "loss": 0.8147, + "step": 168 + }, + { + "epoch": 0.049473067915690866, + "grad_norm": 0.9705430865287781, + "learning_rate": 4.999858806779163e-06, + "loss": 0.8051, + "step": 169 + }, + { + "epoch": 0.049765807962529274, + "grad_norm": 0.9634801149368286, + "learning_rate": 4.999854684605761e-06, + "loss": 0.8041, + "step": 170 + }, + { + "epoch": 0.05005854800936768, + "grad_norm": 1.008618712425232, + "learning_rate": 4.9998505031228264e-06, + "loss": 0.8009, + "step": 171 + }, + { + "epoch": 0.05035128805620609, + "grad_norm": 0.9893620014190674, + "learning_rate": 4.999846262330459e-06, + "loss": 0.8132, + "step": 172 + }, + { + "epoch": 0.0506440281030445, + "grad_norm": 0.9727342128753662, + "learning_rate": 4.9998419622287585e-06, + "loss": 0.8509, + "step": 173 + }, + { + "epoch": 0.050936768149882905, + "grad_norm": 0.9113779664039612, + "learning_rate": 4.999837602817827e-06, + "loss": 0.7499, + "step": 174 + }, + { + "epoch": 0.05122950819672131, + "grad_norm": 0.9678839445114136, + "learning_rate": 4.999833184097768e-06, + "loss": 0.7905, + "step": 175 + }, + { + "epoch": 0.05152224824355972, + "grad_norm": 0.957160234451294, + "learning_rate": 4.9998287060686865e-06, + "loss": 0.8111, + "step": 176 + }, + { + "epoch": 0.05181498829039813, + "grad_norm": 1.021247386932373, + "learning_rate": 4.999824168730689e-06, + "loss": 0.8559, + "step": 177 + }, + { + "epoch": 0.052107728337236536, + "grad_norm": 1.0306686162948608, + "learning_rate": 4.999819572083883e-06, + "loss": 0.8303, + "step": 178 + }, + { + "epoch": 0.052400468384074944, + "grad_norm": 0.9939149618148804, + "learning_rate": 4.9998149161283775e-06, + "loss": 0.8043, + "step": 179 + }, + { + "epoch": 0.05269320843091335, + "grad_norm": 1.0016019344329834, + "learning_rate": 4.999810200864282e-06, + "loss": 0.8253, + "step": 180 + }, + { + "epoch": 0.05298594847775176, + "grad_norm": 1.0452425479888916, + "learning_rate": 4.99980542629171e-06, + "loss": 0.8207, + "step": 181 + }, + { + "epoch": 0.05327868852459016, + "grad_norm": 1.0133237838745117, + "learning_rate": 4.999800592410773e-06, + "loss": 0.7941, + "step": 182 + }, + { + "epoch": 0.05357142857142857, + "grad_norm": 0.9688971638679504, + "learning_rate": 4.999795699221587e-06, + "loss": 0.7764, + "step": 183 + }, + { + "epoch": 0.053864168618266976, + "grad_norm": 0.9863933324813843, + "learning_rate": 4.999790746724268e-06, + "loss": 0.8348, + "step": 184 + }, + { + "epoch": 0.054156908665105384, + "grad_norm": 1.0190049409866333, + "learning_rate": 4.9997857349189334e-06, + "loss": 0.7965, + "step": 185 + }, + { + "epoch": 0.05444964871194379, + "grad_norm": 0.9718985557556152, + "learning_rate": 4.999780663805703e-06, + "loss": 0.7478, + "step": 186 + }, + { + "epoch": 0.0547423887587822, + "grad_norm": 1.119864583015442, + "learning_rate": 4.9997755333846945e-06, + "loss": 0.7794, + "step": 187 + }, + { + "epoch": 0.05503512880562061, + "grad_norm": 1.0553618669509888, + "learning_rate": 4.999770343656031e-06, + "loss": 0.8108, + "step": 188 + }, + { + "epoch": 0.055327868852459015, + "grad_norm": 1.0268009901046753, + "learning_rate": 4.9997650946198365e-06, + "loss": 0.8125, + "step": 189 + }, + { + "epoch": 0.05562060889929742, + "grad_norm": 1.043461561203003, + "learning_rate": 4.999759786276235e-06, + "loss": 0.7844, + "step": 190 + }, + { + "epoch": 0.05591334894613583, + "grad_norm": 0.9761081337928772, + "learning_rate": 4.999754418625352e-06, + "loss": 0.7893, + "step": 191 + }, + { + "epoch": 0.05620608899297424, + "grad_norm": 0.9455216526985168, + "learning_rate": 4.9997489916673155e-06, + "loss": 0.7669, + "step": 192 + }, + { + "epoch": 0.05649882903981265, + "grad_norm": 1.0277810096740723, + "learning_rate": 4.999743505402254e-06, + "loss": 0.7727, + "step": 193 + }, + { + "epoch": 0.056791569086651054, + "grad_norm": 0.9814777970314026, + "learning_rate": 4.999737959830297e-06, + "loss": 0.8041, + "step": 194 + }, + { + "epoch": 0.05708430913348946, + "grad_norm": 0.9988226890563965, + "learning_rate": 4.999732354951577e-06, + "loss": 0.784, + "step": 195 + }, + { + "epoch": 0.05737704918032787, + "grad_norm": 0.9985037446022034, + "learning_rate": 4.999726690766227e-06, + "loss": 0.8722, + "step": 196 + }, + { + "epoch": 0.05766978922716628, + "grad_norm": 0.9539951086044312, + "learning_rate": 4.99972096727438e-06, + "loss": 0.8242, + "step": 197 + }, + { + "epoch": 0.057962529274004686, + "grad_norm": 1.207901120185852, + "learning_rate": 4.999715184476174e-06, + "loss": 0.8091, + "step": 198 + }, + { + "epoch": 0.058255269320843094, + "grad_norm": 0.956853449344635, + "learning_rate": 4.999709342371745e-06, + "loss": 0.77, + "step": 199 + }, + { + "epoch": 0.0585480093676815, + "grad_norm": 1.0085495710372925, + "learning_rate": 4.99970344096123e-06, + "loss": 0.7753, + "step": 200 + }, + { + "epoch": 0.05884074941451991, + "grad_norm": 0.9614437818527222, + "learning_rate": 4.999697480244772e-06, + "loss": 0.8225, + "step": 201 + }, + { + "epoch": 0.05913348946135832, + "grad_norm": 0.9549258947372437, + "learning_rate": 4.999691460222511e-06, + "loss": 0.7736, + "step": 202 + }, + { + "epoch": 0.05942622950819672, + "grad_norm": 1.0398695468902588, + "learning_rate": 4.999685380894589e-06, + "loss": 0.8338, + "step": 203 + }, + { + "epoch": 0.059718969555035126, + "grad_norm": 1.0222532749176025, + "learning_rate": 4.999679242261152e-06, + "loss": 0.8443, + "step": 204 + }, + { + "epoch": 0.060011709601873534, + "grad_norm": 1.0239646434783936, + "learning_rate": 4.999673044322344e-06, + "loss": 0.8181, + "step": 205 + }, + { + "epoch": 0.06030444964871194, + "grad_norm": 1.017911434173584, + "learning_rate": 4.999666787078313e-06, + "loss": 0.8338, + "step": 206 + }, + { + "epoch": 0.06059718969555035, + "grad_norm": 1.0163347721099854, + "learning_rate": 4.999660470529208e-06, + "loss": 0.7727, + "step": 207 + }, + { + "epoch": 0.06088992974238876, + "grad_norm": 0.9736925959587097, + "learning_rate": 4.999654094675177e-06, + "loss": 0.7785, + "step": 208 + }, + { + "epoch": 0.061182669789227165, + "grad_norm": 0.9376518726348877, + "learning_rate": 4.999647659516373e-06, + "loss": 0.7646, + "step": 209 + }, + { + "epoch": 0.06147540983606557, + "grad_norm": 1.0660001039505005, + "learning_rate": 4.999641165052948e-06, + "loss": 0.818, + "step": 210 + }, + { + "epoch": 0.06176814988290398, + "grad_norm": 0.9679873585700989, + "learning_rate": 4.999634611285056e-06, + "loss": 0.7519, + "step": 211 + }, + { + "epoch": 0.06206088992974239, + "grad_norm": 0.9630188941955566, + "learning_rate": 4.999627998212852e-06, + "loss": 0.8285, + "step": 212 + }, + { + "epoch": 0.062353629976580796, + "grad_norm": 0.9847297668457031, + "learning_rate": 4.999621325836495e-06, + "loss": 0.8002, + "step": 213 + }, + { + "epoch": 0.0626463700234192, + "grad_norm": 0.9583022594451904, + "learning_rate": 4.999614594156141e-06, + "loss": 0.7852, + "step": 214 + }, + { + "epoch": 0.06293911007025761, + "grad_norm": 0.905168890953064, + "learning_rate": 4.999607803171951e-06, + "loss": 0.7291, + "step": 215 + }, + { + "epoch": 0.06323185011709602, + "grad_norm": 0.9930005073547363, + "learning_rate": 4.999600952884085e-06, + "loss": 0.753, + "step": 216 + }, + { + "epoch": 0.06352459016393443, + "grad_norm": 1.0106189250946045, + "learning_rate": 4.999594043292706e-06, + "loss": 0.7705, + "step": 217 + }, + { + "epoch": 0.06381733021077284, + "grad_norm": 1.035780906677246, + "learning_rate": 4.999587074397978e-06, + "loss": 0.8465, + "step": 218 + }, + { + "epoch": 0.06411007025761124, + "grad_norm": 1.0787569284439087, + "learning_rate": 4.999580046200067e-06, + "loss": 0.8252, + "step": 219 + }, + { + "epoch": 0.06440281030444965, + "grad_norm": 0.9901817440986633, + "learning_rate": 4.999572958699139e-06, + "loss": 0.7793, + "step": 220 + }, + { + "epoch": 0.06469555035128806, + "grad_norm": 1.041366696357727, + "learning_rate": 4.999565811895363e-06, + "loss": 0.8276, + "step": 221 + }, + { + "epoch": 0.06498829039812647, + "grad_norm": 1.0493861436843872, + "learning_rate": 4.999558605788907e-06, + "loss": 0.824, + "step": 222 + }, + { + "epoch": 0.06528103044496487, + "grad_norm": 1.0313262939453125, + "learning_rate": 4.999551340379943e-06, + "loss": 0.8447, + "step": 223 + }, + { + "epoch": 0.06557377049180328, + "grad_norm": 0.980102002620697, + "learning_rate": 4.999544015668643e-06, + "loss": 0.8079, + "step": 224 + }, + { + "epoch": 0.06586651053864169, + "grad_norm": 1.0124592781066895, + "learning_rate": 4.999536631655182e-06, + "loss": 0.7983, + "step": 225 + }, + { + "epoch": 0.0661592505854801, + "grad_norm": 1.0240685939788818, + "learning_rate": 4.999529188339734e-06, + "loss": 0.7682, + "step": 226 + }, + { + "epoch": 0.0664519906323185, + "grad_norm": 1.049780249595642, + "learning_rate": 4.999521685722474e-06, + "loss": 0.7995, + "step": 227 + }, + { + "epoch": 0.06674473067915691, + "grad_norm": 1.0490747690200806, + "learning_rate": 4.999514123803584e-06, + "loss": 0.7986, + "step": 228 + }, + { + "epoch": 0.06703747072599532, + "grad_norm": 1.0661537647247314, + "learning_rate": 4.99950650258324e-06, + "loss": 0.7973, + "step": 229 + }, + { + "epoch": 0.06733021077283373, + "grad_norm": 1.0204161405563354, + "learning_rate": 4.9994988220616245e-06, + "loss": 0.7806, + "step": 230 + }, + { + "epoch": 0.06762295081967214, + "grad_norm": 1.0347875356674194, + "learning_rate": 4.999491082238919e-06, + "loss": 0.8165, + "step": 231 + }, + { + "epoch": 0.06791569086651054, + "grad_norm": 0.9846603274345398, + "learning_rate": 4.999483283115307e-06, + "loss": 0.7873, + "step": 232 + }, + { + "epoch": 0.06820843091334895, + "grad_norm": 1.0054099559783936, + "learning_rate": 4.999475424690975e-06, + "loss": 0.8007, + "step": 233 + }, + { + "epoch": 0.06850117096018736, + "grad_norm": 1.0528240203857422, + "learning_rate": 4.999467506966107e-06, + "loss": 0.7668, + "step": 234 + }, + { + "epoch": 0.06879391100702575, + "grad_norm": 0.9989666938781738, + "learning_rate": 4.999459529940893e-06, + "loss": 0.8245, + "step": 235 + }, + { + "epoch": 0.06908665105386416, + "grad_norm": 1.0626130104064941, + "learning_rate": 4.999451493615521e-06, + "loss": 0.8018, + "step": 236 + }, + { + "epoch": 0.06937939110070257, + "grad_norm": 1.0165215730667114, + "learning_rate": 4.999443397990183e-06, + "loss": 0.7509, + "step": 237 + }, + { + "epoch": 0.06967213114754098, + "grad_norm": 0.9973688721656799, + "learning_rate": 4.999435243065069e-06, + "loss": 0.7309, + "step": 238 + }, + { + "epoch": 0.06996487119437939, + "grad_norm": 1.084015965461731, + "learning_rate": 4.999427028840375e-06, + "loss": 0.813, + "step": 239 + }, + { + "epoch": 0.0702576112412178, + "grad_norm": 1.01414954662323, + "learning_rate": 4.9994187553162934e-06, + "loss": 0.8277, + "step": 240 + }, + { + "epoch": 0.0705503512880562, + "grad_norm": 1.0722118616104126, + "learning_rate": 4.999410422493023e-06, + "loss": 0.8321, + "step": 241 + }, + { + "epoch": 0.07084309133489461, + "grad_norm": 1.015113115310669, + "learning_rate": 4.999402030370759e-06, + "loss": 0.7555, + "step": 242 + }, + { + "epoch": 0.07113583138173302, + "grad_norm": 1.1035938262939453, + "learning_rate": 4.999393578949702e-06, + "loss": 0.7716, + "step": 243 + }, + { + "epoch": 0.07142857142857142, + "grad_norm": 1.0007226467132568, + "learning_rate": 4.999385068230051e-06, + "loss": 0.8493, + "step": 244 + }, + { + "epoch": 0.07172131147540983, + "grad_norm": 1.012778639793396, + "learning_rate": 4.999376498212011e-06, + "loss": 0.7618, + "step": 245 + }, + { + "epoch": 0.07201405152224824, + "grad_norm": 1.0418421030044556, + "learning_rate": 4.999367868895783e-06, + "loss": 0.787, + "step": 246 + }, + { + "epoch": 0.07230679156908665, + "grad_norm": 1.0634076595306396, + "learning_rate": 4.999359180281571e-06, + "loss": 0.8068, + "step": 247 + }, + { + "epoch": 0.07259953161592506, + "grad_norm": 1.0441018342971802, + "learning_rate": 4.999350432369583e-06, + "loss": 0.8063, + "step": 248 + }, + { + "epoch": 0.07289227166276346, + "grad_norm": 0.92852383852005, + "learning_rate": 4.999341625160026e-06, + "loss": 0.675, + "step": 249 + }, + { + "epoch": 0.07318501170960187, + "grad_norm": 1.0548518896102905, + "learning_rate": 4.999332758653108e-06, + "loss": 0.7759, + "step": 250 + }, + { + "epoch": 0.07347775175644028, + "grad_norm": 0.9742603898048401, + "learning_rate": 4.999323832849041e-06, + "loss": 0.7668, + "step": 251 + }, + { + "epoch": 0.07377049180327869, + "grad_norm": 1.0125787258148193, + "learning_rate": 4.999314847748035e-06, + "loss": 0.7909, + "step": 252 + }, + { + "epoch": 0.0740632318501171, + "grad_norm": 0.9325947165489197, + "learning_rate": 4.9993058033503046e-06, + "loss": 0.7664, + "step": 253 + }, + { + "epoch": 0.0743559718969555, + "grad_norm": 0.9980131983757019, + "learning_rate": 4.999296699656064e-06, + "loss": 0.7791, + "step": 254 + }, + { + "epoch": 0.07464871194379391, + "grad_norm": 1.1078598499298096, + "learning_rate": 4.9992875366655284e-06, + "loss": 0.8182, + "step": 255 + }, + { + "epoch": 0.07494145199063232, + "grad_norm": 0.9955490827560425, + "learning_rate": 4.999278314378916e-06, + "loss": 0.8153, + "step": 256 + }, + { + "epoch": 0.07523419203747073, + "grad_norm": 0.9883580803871155, + "learning_rate": 4.999269032796446e-06, + "loss": 0.7857, + "step": 257 + }, + { + "epoch": 0.07552693208430913, + "grad_norm": 1.061065912246704, + "learning_rate": 4.999259691918338e-06, + "loss": 0.8316, + "step": 258 + }, + { + "epoch": 0.07581967213114754, + "grad_norm": 1.0175617933273315, + "learning_rate": 4.999250291744813e-06, + "loss": 0.8109, + "step": 259 + }, + { + "epoch": 0.07611241217798595, + "grad_norm": 1.0206730365753174, + "learning_rate": 4.999240832276096e-06, + "loss": 0.8514, + "step": 260 + }, + { + "epoch": 0.07640515222482436, + "grad_norm": 1.001150369644165, + "learning_rate": 4.999231313512408e-06, + "loss": 0.7343, + "step": 261 + }, + { + "epoch": 0.07669789227166277, + "grad_norm": 0.9983119368553162, + "learning_rate": 4.999221735453978e-06, + "loss": 0.8514, + "step": 262 + }, + { + "epoch": 0.07699063231850117, + "grad_norm": 1.0750926733016968, + "learning_rate": 4.999212098101033e-06, + "loss": 0.7615, + "step": 263 + }, + { + "epoch": 0.07728337236533958, + "grad_norm": 0.9846647381782532, + "learning_rate": 4.9992024014538e-06, + "loss": 0.7991, + "step": 264 + }, + { + "epoch": 0.07757611241217799, + "grad_norm": 0.9803087711334229, + "learning_rate": 4.99919264551251e-06, + "loss": 0.7789, + "step": 265 + }, + { + "epoch": 0.0778688524590164, + "grad_norm": 1.0144833326339722, + "learning_rate": 4.999182830277394e-06, + "loss": 0.7744, + "step": 266 + }, + { + "epoch": 0.0781615925058548, + "grad_norm": 1.0123480558395386, + "learning_rate": 4.999172955748685e-06, + "loss": 0.7871, + "step": 267 + }, + { + "epoch": 0.07845433255269321, + "grad_norm": 1.0320500135421753, + "learning_rate": 4.999163021926618e-06, + "loss": 0.839, + "step": 268 + }, + { + "epoch": 0.07874707259953162, + "grad_norm": 1.0359399318695068, + "learning_rate": 4.9991530288114286e-06, + "loss": 0.7578, + "step": 269 + }, + { + "epoch": 0.07903981264637003, + "grad_norm": 1.075359582901001, + "learning_rate": 4.999142976403353e-06, + "loss": 0.8212, + "step": 270 + }, + { + "epoch": 0.07933255269320844, + "grad_norm": 1.141492486000061, + "learning_rate": 4.999132864702629e-06, + "loss": 0.8085, + "step": 271 + }, + { + "epoch": 0.07962529274004684, + "grad_norm": 1.0821517705917358, + "learning_rate": 4.9991226937094995e-06, + "loss": 0.7765, + "step": 272 + }, + { + "epoch": 0.07991803278688525, + "grad_norm": 1.0253477096557617, + "learning_rate": 4.9991124634242035e-06, + "loss": 0.7217, + "step": 273 + }, + { + "epoch": 0.08021077283372366, + "grad_norm": 1.0165424346923828, + "learning_rate": 4.999102173846983e-06, + "loss": 0.812, + "step": 274 + }, + { + "epoch": 0.08050351288056207, + "grad_norm": 1.034346580505371, + "learning_rate": 4.999091824978085e-06, + "loss": 0.7849, + "step": 275 + }, + { + "epoch": 0.08079625292740047, + "grad_norm": 1.1602669954299927, + "learning_rate": 4.999081416817752e-06, + "loss": 0.7893, + "step": 276 + }, + { + "epoch": 0.08108899297423888, + "grad_norm": 1.0330182313919067, + "learning_rate": 4.999070949366233e-06, + "loss": 0.7971, + "step": 277 + }, + { + "epoch": 0.08138173302107728, + "grad_norm": 1.0489083528518677, + "learning_rate": 4.999060422623775e-06, + "loss": 0.7646, + "step": 278 + }, + { + "epoch": 0.08167447306791568, + "grad_norm": 0.983022153377533, + "learning_rate": 4.999049836590629e-06, + "loss": 0.8289, + "step": 279 + }, + { + "epoch": 0.08196721311475409, + "grad_norm": 1.0879958868026733, + "learning_rate": 4.999039191267045e-06, + "loss": 0.8174, + "step": 280 + }, + { + "epoch": 0.0822599531615925, + "grad_norm": 0.9744462966918945, + "learning_rate": 4.999028486653277e-06, + "loss": 0.7831, + "step": 281 + }, + { + "epoch": 0.08255269320843091, + "grad_norm": 1.0008559226989746, + "learning_rate": 4.999017722749576e-06, + "loss": 0.7615, + "step": 282 + }, + { + "epoch": 0.08284543325526932, + "grad_norm": 0.9998801350593567, + "learning_rate": 4.999006899556201e-06, + "loss": 0.7793, + "step": 283 + }, + { + "epoch": 0.08313817330210772, + "grad_norm": 1.0264472961425781, + "learning_rate": 4.9989960170734074e-06, + "loss": 0.8055, + "step": 284 + }, + { + "epoch": 0.08343091334894613, + "grad_norm": 1.0080816745758057, + "learning_rate": 4.998985075301452e-06, + "loss": 0.7354, + "step": 285 + }, + { + "epoch": 0.08372365339578454, + "grad_norm": 1.021681308746338, + "learning_rate": 4.998974074240597e-06, + "loss": 0.8095, + "step": 286 + }, + { + "epoch": 0.08401639344262295, + "grad_norm": 1.019035816192627, + "learning_rate": 4.998963013891102e-06, + "loss": 0.7612, + "step": 287 + }, + { + "epoch": 0.08430913348946135, + "grad_norm": 1.0714670419692993, + "learning_rate": 4.998951894253227e-06, + "loss": 0.7994, + "step": 288 + }, + { + "epoch": 0.08460187353629976, + "grad_norm": 1.1203774213790894, + "learning_rate": 4.99894071532724e-06, + "loss": 0.7604, + "step": 289 + }, + { + "epoch": 0.08489461358313817, + "grad_norm": 1.0937491655349731, + "learning_rate": 4.998929477113405e-06, + "loss": 0.8053, + "step": 290 + }, + { + "epoch": 0.08518735362997658, + "grad_norm": 1.026435375213623, + "learning_rate": 4.998918179611988e-06, + "loss": 0.7889, + "step": 291 + }, + { + "epoch": 0.08548009367681499, + "grad_norm": 1.0503054857254028, + "learning_rate": 4.998906822823256e-06, + "loss": 0.7906, + "step": 292 + }, + { + "epoch": 0.0857728337236534, + "grad_norm": 1.0537586212158203, + "learning_rate": 4.99889540674748e-06, + "loss": 0.811, + "step": 293 + }, + { + "epoch": 0.0860655737704918, + "grad_norm": 0.9758758544921875, + "learning_rate": 4.99888393138493e-06, + "loss": 0.7999, + "step": 294 + }, + { + "epoch": 0.08635831381733021, + "grad_norm": 1.0707707405090332, + "learning_rate": 4.9988723967358784e-06, + "loss": 0.7787, + "step": 295 + }, + { + "epoch": 0.08665105386416862, + "grad_norm": 1.0078744888305664, + "learning_rate": 4.9988608028006e-06, + "loss": 0.8135, + "step": 296 + }, + { + "epoch": 0.08694379391100703, + "grad_norm": 1.0277955532073975, + "learning_rate": 4.998849149579369e-06, + "loss": 0.7952, + "step": 297 + }, + { + "epoch": 0.08723653395784543, + "grad_norm": 1.0249435901641846, + "learning_rate": 4.998837437072462e-06, + "loss": 0.7398, + "step": 298 + }, + { + "epoch": 0.08752927400468384, + "grad_norm": 0.9656965732574463, + "learning_rate": 4.998825665280156e-06, + "loss": 0.7301, + "step": 299 + }, + { + "epoch": 0.08782201405152225, + "grad_norm": 1.0629814863204956, + "learning_rate": 4.998813834202732e-06, + "loss": 0.7843, + "step": 300 + }, + { + "epoch": 0.08811475409836066, + "grad_norm": 1.0062425136566162, + "learning_rate": 4.998801943840468e-06, + "loss": 0.7643, + "step": 301 + }, + { + "epoch": 0.08840749414519906, + "grad_norm": 1.05402672290802, + "learning_rate": 4.998789994193649e-06, + "loss": 0.7464, + "step": 302 + }, + { + "epoch": 0.08870023419203747, + "grad_norm": 1.042034387588501, + "learning_rate": 4.998777985262557e-06, + "loss": 0.7249, + "step": 303 + }, + { + "epoch": 0.08899297423887588, + "grad_norm": 1.017446756362915, + "learning_rate": 4.998765917047477e-06, + "loss": 0.7327, + "step": 304 + }, + { + "epoch": 0.08928571428571429, + "grad_norm": 1.0578644275665283, + "learning_rate": 4.9987537895486955e-06, + "loss": 0.725, + "step": 305 + }, + { + "epoch": 0.0895784543325527, + "grad_norm": 1.0969740152359009, + "learning_rate": 4.998741602766501e-06, + "loss": 0.8114, + "step": 306 + }, + { + "epoch": 0.0898711943793911, + "grad_norm": 1.0567536354064941, + "learning_rate": 4.998729356701182e-06, + "loss": 0.7347, + "step": 307 + }, + { + "epoch": 0.09016393442622951, + "grad_norm": 1.1671446561813354, + "learning_rate": 4.998717051353028e-06, + "loss": 0.8037, + "step": 308 + }, + { + "epoch": 0.09045667447306792, + "grad_norm": 1.0147705078125, + "learning_rate": 4.998704686722332e-06, + "loss": 0.7702, + "step": 309 + }, + { + "epoch": 0.09074941451990633, + "grad_norm": 0.9970436692237854, + "learning_rate": 4.9986922628093885e-06, + "loss": 0.7646, + "step": 310 + }, + { + "epoch": 0.09104215456674473, + "grad_norm": 1.0379939079284668, + "learning_rate": 4.99867977961449e-06, + "loss": 0.8068, + "step": 311 + }, + { + "epoch": 0.09133489461358314, + "grad_norm": 1.0548770427703857, + "learning_rate": 4.998667237137934e-06, + "loss": 0.7661, + "step": 312 + }, + { + "epoch": 0.09162763466042155, + "grad_norm": 1.045809268951416, + "learning_rate": 4.998654635380017e-06, + "loss": 0.7874, + "step": 313 + }, + { + "epoch": 0.09192037470725996, + "grad_norm": 1.073453664779663, + "learning_rate": 4.99864197434104e-06, + "loss": 0.7788, + "step": 314 + }, + { + "epoch": 0.09221311475409837, + "grad_norm": 1.035721778869629, + "learning_rate": 4.998629254021301e-06, + "loss": 0.8063, + "step": 315 + }, + { + "epoch": 0.09250585480093677, + "grad_norm": 1.0297883749008179, + "learning_rate": 4.9986164744211026e-06, + "loss": 0.7636, + "step": 316 + }, + { + "epoch": 0.09279859484777518, + "grad_norm": 0.9612944722175598, + "learning_rate": 4.99860363554075e-06, + "loss": 0.7493, + "step": 317 + }, + { + "epoch": 0.09309133489461359, + "grad_norm": 1.0770940780639648, + "learning_rate": 4.998590737380545e-06, + "loss": 0.8048, + "step": 318 + }, + { + "epoch": 0.093384074941452, + "grad_norm": 0.9489647746086121, + "learning_rate": 4.998577779940795e-06, + "loss": 0.7466, + "step": 319 + }, + { + "epoch": 0.0936768149882904, + "grad_norm": 0.974545419216156, + "learning_rate": 4.998564763221807e-06, + "loss": 0.8098, + "step": 320 + }, + { + "epoch": 0.0939695550351288, + "grad_norm": 1.0344306230545044, + "learning_rate": 4.99855168722389e-06, + "loss": 0.7536, + "step": 321 + }, + { + "epoch": 0.0942622950819672, + "grad_norm": 1.03992760181427, + "learning_rate": 4.998538551947354e-06, + "loss": 0.7937, + "step": 322 + }, + { + "epoch": 0.09455503512880561, + "grad_norm": 1.059653401374817, + "learning_rate": 4.998525357392511e-06, + "loss": 0.7583, + "step": 323 + }, + { + "epoch": 0.09484777517564402, + "grad_norm": 1.0235531330108643, + "learning_rate": 4.998512103559674e-06, + "loss": 0.7867, + "step": 324 + }, + { + "epoch": 0.09514051522248243, + "grad_norm": 1.038150429725647, + "learning_rate": 4.9984987904491576e-06, + "loss": 0.7417, + "step": 325 + }, + { + "epoch": 0.09543325526932084, + "grad_norm": 1.0136059522628784, + "learning_rate": 4.998485418061276e-06, + "loss": 0.7493, + "step": 326 + }, + { + "epoch": 0.09572599531615925, + "grad_norm": 1.0565600395202637, + "learning_rate": 4.998471986396349e-06, + "loss": 0.7931, + "step": 327 + }, + { + "epoch": 0.09601873536299765, + "grad_norm": 1.038159966468811, + "learning_rate": 4.9984584954546945e-06, + "loss": 0.785, + "step": 328 + }, + { + "epoch": 0.09631147540983606, + "grad_norm": 0.9892928600311279, + "learning_rate": 4.9984449452366315e-06, + "loss": 0.7506, + "step": 329 + }, + { + "epoch": 0.09660421545667447, + "grad_norm": 1.0567528009414673, + "learning_rate": 4.998431335742482e-06, + "loss": 0.7596, + "step": 330 + }, + { + "epoch": 0.09689695550351288, + "grad_norm": 1.0513629913330078, + "learning_rate": 4.998417666972569e-06, + "loss": 0.8142, + "step": 331 + }, + { + "epoch": 0.09718969555035128, + "grad_norm": 1.043482780456543, + "learning_rate": 4.998403938927217e-06, + "loss": 0.7861, + "step": 332 + }, + { + "epoch": 0.09748243559718969, + "grad_norm": 1.0302705764770508, + "learning_rate": 4.998390151606752e-06, + "loss": 0.7788, + "step": 333 + }, + { + "epoch": 0.0977751756440281, + "grad_norm": 1.011191487312317, + "learning_rate": 4.998376305011501e-06, + "loss": 0.7671, + "step": 334 + }, + { + "epoch": 0.09806791569086651, + "grad_norm": 1.0430840253829956, + "learning_rate": 4.9983623991417916e-06, + "loss": 0.7184, + "step": 335 + }, + { + "epoch": 0.09836065573770492, + "grad_norm": 1.0830774307250977, + "learning_rate": 4.998348433997955e-06, + "loss": 0.7716, + "step": 336 + }, + { + "epoch": 0.09865339578454332, + "grad_norm": 1.0723916292190552, + "learning_rate": 4.998334409580321e-06, + "loss": 0.8038, + "step": 337 + }, + { + "epoch": 0.09894613583138173, + "grad_norm": 1.003633737564087, + "learning_rate": 4.998320325889223e-06, + "loss": 0.7601, + "step": 338 + }, + { + "epoch": 0.09923887587822014, + "grad_norm": 1.0196579694747925, + "learning_rate": 4.998306182924995e-06, + "loss": 0.7044, + "step": 339 + }, + { + "epoch": 0.09953161592505855, + "grad_norm": 0.9865079522132874, + "learning_rate": 4.998291980687974e-06, + "loss": 0.7916, + "step": 340 + }, + { + "epoch": 0.09982435597189696, + "grad_norm": 1.0892618894577026, + "learning_rate": 4.998277719178495e-06, + "loss": 0.7902, + "step": 341 + }, + { + "epoch": 0.10011709601873536, + "grad_norm": 1.0515190362930298, + "learning_rate": 4.998263398396897e-06, + "loss": 0.7948, + "step": 342 + }, + { + "epoch": 0.10040983606557377, + "grad_norm": 1.0524024963378906, + "learning_rate": 4.9982490183435206e-06, + "loss": 0.7663, + "step": 343 + }, + { + "epoch": 0.10070257611241218, + "grad_norm": 1.0424766540527344, + "learning_rate": 4.998234579018706e-06, + "loss": 0.7813, + "step": 344 + }, + { + "epoch": 0.10099531615925059, + "grad_norm": 1.0016549825668335, + "learning_rate": 4.9982200804227955e-06, + "loss": 0.7027, + "step": 345 + }, + { + "epoch": 0.101288056206089, + "grad_norm": 0.986585795879364, + "learning_rate": 4.998205522556134e-06, + "loss": 0.7122, + "step": 346 + }, + { + "epoch": 0.1015807962529274, + "grad_norm": 1.0090174674987793, + "learning_rate": 4.998190905419067e-06, + "loss": 0.7382, + "step": 347 + }, + { + "epoch": 0.10187353629976581, + "grad_norm": 1.0027350187301636, + "learning_rate": 4.99817622901194e-06, + "loss": 0.7731, + "step": 348 + }, + { + "epoch": 0.10216627634660422, + "grad_norm": 1.0439881086349487, + "learning_rate": 4.998161493335102e-06, + "loss": 0.8305, + "step": 349 + }, + { + "epoch": 0.10245901639344263, + "grad_norm": 0.9699923396110535, + "learning_rate": 4.9981466983889036e-06, + "loss": 0.7496, + "step": 350 + }, + { + "epoch": 0.10275175644028103, + "grad_norm": 1.0649313926696777, + "learning_rate": 4.998131844173694e-06, + "loss": 0.775, + "step": 351 + }, + { + "epoch": 0.10304449648711944, + "grad_norm": 1.0893328189849854, + "learning_rate": 4.998116930689827e-06, + "loss": 0.779, + "step": 352 + }, + { + "epoch": 0.10333723653395785, + "grad_norm": 0.9803676009178162, + "learning_rate": 4.998101957937655e-06, + "loss": 0.7771, + "step": 353 + }, + { + "epoch": 0.10362997658079626, + "grad_norm": 1.0448029041290283, + "learning_rate": 4.998086925917534e-06, + "loss": 0.7402, + "step": 354 + }, + { + "epoch": 0.10392271662763466, + "grad_norm": 0.9680082201957703, + "learning_rate": 4.998071834629822e-06, + "loss": 0.6843, + "step": 355 + }, + { + "epoch": 0.10421545667447307, + "grad_norm": 1.0383589267730713, + "learning_rate": 4.998056684074876e-06, + "loss": 0.7439, + "step": 356 + }, + { + "epoch": 0.10450819672131148, + "grad_norm": 0.9745027422904968, + "learning_rate": 4.998041474253053e-06, + "loss": 0.7417, + "step": 357 + }, + { + "epoch": 0.10480093676814989, + "grad_norm": 1.1582478284835815, + "learning_rate": 4.998026205164719e-06, + "loss": 0.7661, + "step": 358 + }, + { + "epoch": 0.1050936768149883, + "grad_norm": 1.038679599761963, + "learning_rate": 4.998010876810231e-06, + "loss": 0.7667, + "step": 359 + }, + { + "epoch": 0.1053864168618267, + "grad_norm": 1.1150333881378174, + "learning_rate": 4.997995489189956e-06, + "loss": 0.7819, + "step": 360 + }, + { + "epoch": 0.10567915690866511, + "grad_norm": 1.1317877769470215, + "learning_rate": 4.997980042304258e-06, + "loss": 0.7726, + "step": 361 + }, + { + "epoch": 0.10597189695550352, + "grad_norm": 1.043576717376709, + "learning_rate": 4.997964536153504e-06, + "loss": 0.7673, + "step": 362 + }, + { + "epoch": 0.10626463700234191, + "grad_norm": 1.202532410621643, + "learning_rate": 4.997948970738061e-06, + "loss": 0.7494, + "step": 363 + }, + { + "epoch": 0.10655737704918032, + "grad_norm": 1.0821906328201294, + "learning_rate": 4.997933346058299e-06, + "loss": 0.8013, + "step": 364 + }, + { + "epoch": 0.10685011709601873, + "grad_norm": 1.0681560039520264, + "learning_rate": 4.997917662114588e-06, + "loss": 0.7421, + "step": 365 + }, + { + "epoch": 0.10714285714285714, + "grad_norm": 1.0434759855270386, + "learning_rate": 4.997901918907301e-06, + "loss": 0.7809, + "step": 366 + }, + { + "epoch": 0.10743559718969554, + "grad_norm": 0.9777372479438782, + "learning_rate": 4.997886116436811e-06, + "loss": 0.7307, + "step": 367 + }, + { + "epoch": 0.10772833723653395, + "grad_norm": 0.9907523989677429, + "learning_rate": 4.997870254703493e-06, + "loss": 0.7332, + "step": 368 + }, + { + "epoch": 0.10802107728337236, + "grad_norm": 1.0686542987823486, + "learning_rate": 4.9978543337077234e-06, + "loss": 0.8201, + "step": 369 + }, + { + "epoch": 0.10831381733021077, + "grad_norm": 1.0577365159988403, + "learning_rate": 4.99783835344988e-06, + "loss": 0.7387, + "step": 370 + }, + { + "epoch": 0.10860655737704918, + "grad_norm": 3.08103084564209, + "learning_rate": 4.997822313930341e-06, + "loss": 0.7609, + "step": 371 + }, + { + "epoch": 0.10889929742388758, + "grad_norm": 1.0915437936782837, + "learning_rate": 4.997806215149488e-06, + "loss": 0.7425, + "step": 372 + }, + { + "epoch": 0.10919203747072599, + "grad_norm": 1.0372344255447388, + "learning_rate": 4.997790057107703e-06, + "loss": 0.7853, + "step": 373 + }, + { + "epoch": 0.1094847775175644, + "grad_norm": 1.0187528133392334, + "learning_rate": 4.9977738398053686e-06, + "loss": 0.7653, + "step": 374 + }, + { + "epoch": 0.10977751756440281, + "grad_norm": 1.0650933980941772, + "learning_rate": 4.997757563242871e-06, + "loss": 0.7575, + "step": 375 + }, + { + "epoch": 0.11007025761124122, + "grad_norm": 1.005064606666565, + "learning_rate": 4.9977412274205935e-06, + "loss": 0.7462, + "step": 376 + }, + { + "epoch": 0.11036299765807962, + "grad_norm": 1.071667194366455, + "learning_rate": 4.997724832338927e-06, + "loss": 0.7828, + "step": 377 + }, + { + "epoch": 0.11065573770491803, + "grad_norm": 1.0389713048934937, + "learning_rate": 4.997708377998259e-06, + "loss": 0.7091, + "step": 378 + }, + { + "epoch": 0.11094847775175644, + "grad_norm": 0.9492546319961548, + "learning_rate": 4.997691864398979e-06, + "loss": 0.7403, + "step": 379 + }, + { + "epoch": 0.11124121779859485, + "grad_norm": 0.9911750555038452, + "learning_rate": 4.99767529154148e-06, + "loss": 0.772, + "step": 380 + }, + { + "epoch": 0.11153395784543325, + "grad_norm": 1.0081173181533813, + "learning_rate": 4.997658659426155e-06, + "loss": 0.766, + "step": 381 + }, + { + "epoch": 0.11182669789227166, + "grad_norm": 0.9873563051223755, + "learning_rate": 4.997641968053399e-06, + "loss": 0.7666, + "step": 382 + }, + { + "epoch": 0.11211943793911007, + "grad_norm": 1.0278087854385376, + "learning_rate": 4.997625217423606e-06, + "loss": 0.8036, + "step": 383 + }, + { + "epoch": 0.11241217798594848, + "grad_norm": 0.9905311465263367, + "learning_rate": 4.9976084075371755e-06, + "loss": 0.7134, + "step": 384 + }, + { + "epoch": 0.11270491803278689, + "grad_norm": 1.2551419734954834, + "learning_rate": 4.997591538394506e-06, + "loss": 0.7583, + "step": 385 + }, + { + "epoch": 0.1129976580796253, + "grad_norm": 0.9751659035682678, + "learning_rate": 4.997574609995998e-06, + "loss": 0.7423, + "step": 386 + }, + { + "epoch": 0.1132903981264637, + "grad_norm": 0.9926493167877197, + "learning_rate": 4.997557622342052e-06, + "loss": 0.7652, + "step": 387 + }, + { + "epoch": 0.11358313817330211, + "grad_norm": 1.0476713180541992, + "learning_rate": 4.9975405754330705e-06, + "loss": 0.751, + "step": 388 + }, + { + "epoch": 0.11387587822014052, + "grad_norm": 0.9747985005378723, + "learning_rate": 4.997523469269461e-06, + "loss": 0.7361, + "step": 389 + }, + { + "epoch": 0.11416861826697892, + "grad_norm": 1.097082495689392, + "learning_rate": 4.997506303851626e-06, + "loss": 0.7755, + "step": 390 + }, + { + "epoch": 0.11446135831381733, + "grad_norm": 1.0939267873764038, + "learning_rate": 4.997489079179974e-06, + "loss": 0.7705, + "step": 391 + }, + { + "epoch": 0.11475409836065574, + "grad_norm": 1.0342153310775757, + "learning_rate": 4.997471795254915e-06, + "loss": 0.7815, + "step": 392 + }, + { + "epoch": 0.11504683840749415, + "grad_norm": 1.0410988330841064, + "learning_rate": 4.997454452076857e-06, + "loss": 0.7932, + "step": 393 + }, + { + "epoch": 0.11533957845433256, + "grad_norm": 0.9908266067504883, + "learning_rate": 4.9974370496462114e-06, + "loss": 0.7596, + "step": 394 + }, + { + "epoch": 0.11563231850117096, + "grad_norm": 1.0018764734268188, + "learning_rate": 4.997419587963394e-06, + "loss": 0.7216, + "step": 395 + }, + { + "epoch": 0.11592505854800937, + "grad_norm": 0.9897498488426208, + "learning_rate": 4.997402067028815e-06, + "loss": 0.7676, + "step": 396 + }, + { + "epoch": 0.11621779859484778, + "grad_norm": 1.0064325332641602, + "learning_rate": 4.997384486842893e-06, + "loss": 0.7575, + "step": 397 + }, + { + "epoch": 0.11651053864168619, + "grad_norm": 0.9785544276237488, + "learning_rate": 4.997366847406044e-06, + "loss": 0.7569, + "step": 398 + }, + { + "epoch": 0.1168032786885246, + "grad_norm": 0.9922221899032593, + "learning_rate": 4.9973491487186875e-06, + "loss": 0.7631, + "step": 399 + }, + { + "epoch": 0.117096018735363, + "grad_norm": 0.9898287653923035, + "learning_rate": 4.997331390781242e-06, + "loss": 0.7303, + "step": 400 + }, + { + "epoch": 0.11738875878220141, + "grad_norm": 1.0244249105453491, + "learning_rate": 4.9973135735941305e-06, + "loss": 0.7516, + "step": 401 + }, + { + "epoch": 0.11768149882903982, + "grad_norm": 0.9801017642021179, + "learning_rate": 4.997295697157774e-06, + "loss": 0.7327, + "step": 402 + }, + { + "epoch": 0.11797423887587823, + "grad_norm": 1.0143450498580933, + "learning_rate": 4.997277761472597e-06, + "loss": 0.7204, + "step": 403 + }, + { + "epoch": 0.11826697892271663, + "grad_norm": 1.0454155206680298, + "learning_rate": 4.997259766539025e-06, + "loss": 0.7986, + "step": 404 + }, + { + "epoch": 0.11855971896955504, + "grad_norm": 1.0337510108947754, + "learning_rate": 4.997241712357487e-06, + "loss": 0.7832, + "step": 405 + }, + { + "epoch": 0.11885245901639344, + "grad_norm": 1.0107650756835938, + "learning_rate": 4.997223598928408e-06, + "loss": 0.7116, + "step": 406 + }, + { + "epoch": 0.11914519906323184, + "grad_norm": 1.0875338315963745, + "learning_rate": 4.9972054262522195e-06, + "loss": 0.6885, + "step": 407 + }, + { + "epoch": 0.11943793911007025, + "grad_norm": 1.0200167894363403, + "learning_rate": 4.997187194329353e-06, + "loss": 0.7606, + "step": 408 + }, + { + "epoch": 0.11973067915690866, + "grad_norm": 0.9776178002357483, + "learning_rate": 4.99716890316024e-06, + "loss": 0.6965, + "step": 409 + }, + { + "epoch": 0.12002341920374707, + "grad_norm": 1.0252526998519897, + "learning_rate": 4.997150552745316e-06, + "loss": 0.73, + "step": 410 + }, + { + "epoch": 0.12031615925058547, + "grad_norm": 1.0157012939453125, + "learning_rate": 4.9971321430850135e-06, + "loss": 0.7564, + "step": 411 + }, + { + "epoch": 0.12060889929742388, + "grad_norm": 1.0667160749435425, + "learning_rate": 4.997113674179773e-06, + "loss": 0.6872, + "step": 412 + }, + { + "epoch": 0.12090163934426229, + "grad_norm": 1.0117136240005493, + "learning_rate": 4.99709514603003e-06, + "loss": 0.7402, + "step": 413 + }, + { + "epoch": 0.1211943793911007, + "grad_norm": 1.03264582157135, + "learning_rate": 4.997076558636226e-06, + "loss": 0.7702, + "step": 414 + }, + { + "epoch": 0.1214871194379391, + "grad_norm": 1.0241092443466187, + "learning_rate": 4.9970579119987995e-06, + "loss": 0.7602, + "step": 415 + }, + { + "epoch": 0.12177985948477751, + "grad_norm": 1.0487524271011353, + "learning_rate": 4.997039206118195e-06, + "loss": 0.8145, + "step": 416 + }, + { + "epoch": 0.12207259953161592, + "grad_norm": 1.0222060680389404, + "learning_rate": 4.997020440994856e-06, + "loss": 0.8304, + "step": 417 + }, + { + "epoch": 0.12236533957845433, + "grad_norm": 1.0419267416000366, + "learning_rate": 4.997001616629226e-06, + "loss": 0.7009, + "step": 418 + }, + { + "epoch": 0.12265807962529274, + "grad_norm": 1.068539023399353, + "learning_rate": 4.996982733021753e-06, + "loss": 0.7093, + "step": 419 + }, + { + "epoch": 0.12295081967213115, + "grad_norm": 1.057252049446106, + "learning_rate": 4.996963790172886e-06, + "loss": 0.7027, + "step": 420 + }, + { + "epoch": 0.12324355971896955, + "grad_norm": 1.0557957887649536, + "learning_rate": 4.9969447880830726e-06, + "loss": 0.7918, + "step": 421 + }, + { + "epoch": 0.12353629976580796, + "grad_norm": 1.0614861249923706, + "learning_rate": 4.996925726752765e-06, + "loss": 0.7445, + "step": 422 + }, + { + "epoch": 0.12382903981264637, + "grad_norm": 1.0664589405059814, + "learning_rate": 4.9969066061824145e-06, + "loss": 0.7959, + "step": 423 + }, + { + "epoch": 0.12412177985948478, + "grad_norm": 1.0395475625991821, + "learning_rate": 4.996887426372475e-06, + "loss": 0.732, + "step": 424 + }, + { + "epoch": 0.12441451990632318, + "grad_norm": 1.0503530502319336, + "learning_rate": 4.996868187323402e-06, + "loss": 0.7699, + "step": 425 + }, + { + "epoch": 0.12470725995316159, + "grad_norm": 1.0329161882400513, + "learning_rate": 4.9968488890356516e-06, + "loss": 0.7903, + "step": 426 + }, + { + "epoch": 0.125, + "grad_norm": 1.0941752195358276, + "learning_rate": 4.996829531509681e-06, + "loss": 0.7688, + "step": 427 + }, + { + "epoch": 0.1252927400468384, + "grad_norm": 1.018449068069458, + "learning_rate": 4.99681011474595e-06, + "loss": 0.7507, + "step": 428 + }, + { + "epoch": 0.12558548009367682, + "grad_norm": 1.0863702297210693, + "learning_rate": 4.996790638744919e-06, + "loss": 0.757, + "step": 429 + }, + { + "epoch": 0.12587822014051522, + "grad_norm": 1.0654449462890625, + "learning_rate": 4.996771103507052e-06, + "loss": 0.7632, + "step": 430 + }, + { + "epoch": 0.12617096018735363, + "grad_norm": 1.1279804706573486, + "learning_rate": 4.99675150903281e-06, + "loss": 0.7799, + "step": 431 + }, + { + "epoch": 0.12646370023419204, + "grad_norm": 1.0296435356140137, + "learning_rate": 4.9967318553226586e-06, + "loss": 0.7856, + "step": 432 + }, + { + "epoch": 0.12675644028103045, + "grad_norm": 0.9789742231369019, + "learning_rate": 4.9967121423770645e-06, + "loss": 0.6972, + "step": 433 + }, + { + "epoch": 0.12704918032786885, + "grad_norm": 1.1222054958343506, + "learning_rate": 4.996692370196495e-06, + "loss": 0.7538, + "step": 434 + }, + { + "epoch": 0.12734192037470726, + "grad_norm": 1.033806562423706, + "learning_rate": 4.996672538781419e-06, + "loss": 0.7728, + "step": 435 + }, + { + "epoch": 0.12763466042154567, + "grad_norm": 1.0473988056182861, + "learning_rate": 4.996652648132308e-06, + "loss": 0.7799, + "step": 436 + }, + { + "epoch": 0.12792740046838408, + "grad_norm": 1.0086933374404907, + "learning_rate": 4.996632698249633e-06, + "loss": 0.7928, + "step": 437 + }, + { + "epoch": 0.12822014051522249, + "grad_norm": 1.0298932790756226, + "learning_rate": 4.996612689133867e-06, + "loss": 0.7541, + "step": 438 + }, + { + "epoch": 0.1285128805620609, + "grad_norm": 1.1842492818832397, + "learning_rate": 4.996592620785486e-06, + "loss": 0.7923, + "step": 439 + }, + { + "epoch": 0.1288056206088993, + "grad_norm": 1.063135027885437, + "learning_rate": 4.996572493204966e-06, + "loss": 0.8084, + "step": 440 + }, + { + "epoch": 0.1290983606557377, + "grad_norm": 1.0525684356689453, + "learning_rate": 4.996552306392783e-06, + "loss": 0.771, + "step": 441 + }, + { + "epoch": 0.12939110070257612, + "grad_norm": 1.039347529411316, + "learning_rate": 4.996532060349417e-06, + "loss": 0.7435, + "step": 442 + }, + { + "epoch": 0.12968384074941453, + "grad_norm": 1.0141247510910034, + "learning_rate": 4.9965117550753486e-06, + "loss": 0.7372, + "step": 443 + }, + { + "epoch": 0.12997658079625293, + "grad_norm": 1.0450791120529175, + "learning_rate": 4.9964913905710584e-06, + "loss": 0.7093, + "step": 444 + }, + { + "epoch": 0.13026932084309134, + "grad_norm": 1.0006966590881348, + "learning_rate": 4.9964709668370315e-06, + "loss": 0.7332, + "step": 445 + }, + { + "epoch": 0.13056206088992975, + "grad_norm": 1.0312389135360718, + "learning_rate": 4.9964504838737505e-06, + "loss": 0.7563, + "step": 446 + }, + { + "epoch": 0.13085480093676816, + "grad_norm": 1.1221442222595215, + "learning_rate": 4.996429941681702e-06, + "loss": 0.7458, + "step": 447 + }, + { + "epoch": 0.13114754098360656, + "grad_norm": 1.0053136348724365, + "learning_rate": 4.996409340261374e-06, + "loss": 0.7889, + "step": 448 + }, + { + "epoch": 0.13144028103044497, + "grad_norm": 1.1341326236724854, + "learning_rate": 4.996388679613255e-06, + "loss": 0.7892, + "step": 449 + }, + { + "epoch": 0.13173302107728338, + "grad_norm": 1.0794131755828857, + "learning_rate": 4.996367959737834e-06, + "loss": 0.7701, + "step": 450 + }, + { + "epoch": 0.1320257611241218, + "grad_norm": 1.0199047327041626, + "learning_rate": 4.996347180635605e-06, + "loss": 0.7538, + "step": 451 + }, + { + "epoch": 0.1323185011709602, + "grad_norm": 1.1228561401367188, + "learning_rate": 4.996326342307058e-06, + "loss": 0.765, + "step": 452 + }, + { + "epoch": 0.1326112412177986, + "grad_norm": 0.9764695167541504, + "learning_rate": 4.996305444752689e-06, + "loss": 0.7448, + "step": 453 + }, + { + "epoch": 0.132903981264637, + "grad_norm": 1.038224220275879, + "learning_rate": 4.9962844879729944e-06, + "loss": 0.7941, + "step": 454 + }, + { + "epoch": 0.13319672131147542, + "grad_norm": 1.0253431797027588, + "learning_rate": 4.996263471968471e-06, + "loss": 0.7814, + "step": 455 + }, + { + "epoch": 0.13348946135831383, + "grad_norm": 1.0342546701431274, + "learning_rate": 4.996242396739617e-06, + "loss": 0.7197, + "step": 456 + }, + { + "epoch": 0.13378220140515223, + "grad_norm": 1.040342092514038, + "learning_rate": 4.996221262286932e-06, + "loss": 0.7541, + "step": 457 + }, + { + "epoch": 0.13407494145199064, + "grad_norm": 1.0446733236312866, + "learning_rate": 4.996200068610919e-06, + "loss": 0.7709, + "step": 458 + }, + { + "epoch": 0.13436768149882905, + "grad_norm": 1.0265744924545288, + "learning_rate": 4.996178815712078e-06, + "loss": 0.7538, + "step": 459 + }, + { + "epoch": 0.13466042154566746, + "grad_norm": 1.1642494201660156, + "learning_rate": 4.9961575035909175e-06, + "loss": 0.774, + "step": 460 + }, + { + "epoch": 0.13495316159250587, + "grad_norm": 1.0753215551376343, + "learning_rate": 4.996136132247938e-06, + "loss": 0.7713, + "step": 461 + }, + { + "epoch": 0.13524590163934427, + "grad_norm": 1.0548503398895264, + "learning_rate": 4.996114701683651e-06, + "loss": 0.778, + "step": 462 + }, + { + "epoch": 0.13553864168618268, + "grad_norm": 1.0346354246139526, + "learning_rate": 4.9960932118985626e-06, + "loss": 0.7298, + "step": 463 + }, + { + "epoch": 0.1358313817330211, + "grad_norm": 1.0867948532104492, + "learning_rate": 4.996071662893183e-06, + "loss": 0.7699, + "step": 464 + }, + { + "epoch": 0.1361241217798595, + "grad_norm": 1.0301053524017334, + "learning_rate": 4.996050054668025e-06, + "loss": 0.7423, + "step": 465 + }, + { + "epoch": 0.1364168618266979, + "grad_norm": 1.0423439741134644, + "learning_rate": 4.996028387223598e-06, + "loss": 0.7839, + "step": 466 + }, + { + "epoch": 0.1367096018735363, + "grad_norm": 1.1208505630493164, + "learning_rate": 4.996006660560418e-06, + "loss": 0.7195, + "step": 467 + }, + { + "epoch": 0.13700234192037472, + "grad_norm": 1.0195363759994507, + "learning_rate": 4.995984874679002e-06, + "loss": 0.7692, + "step": 468 + }, + { + "epoch": 0.13729508196721313, + "grad_norm": 1.025005578994751, + "learning_rate": 4.995963029579864e-06, + "loss": 0.7319, + "step": 469 + }, + { + "epoch": 0.1375878220140515, + "grad_norm": 1.0431208610534668, + "learning_rate": 4.9959411252635245e-06, + "loss": 0.787, + "step": 470 + }, + { + "epoch": 0.13788056206088992, + "grad_norm": 1.0112665891647339, + "learning_rate": 4.995919161730502e-06, + "loss": 0.743, + "step": 471 + }, + { + "epoch": 0.13817330210772832, + "grad_norm": 1.0570484399795532, + "learning_rate": 4.9958971389813185e-06, + "loss": 0.7852, + "step": 472 + }, + { + "epoch": 0.13846604215456673, + "grad_norm": 1.0454916954040527, + "learning_rate": 4.995875057016495e-06, + "loss": 0.7908, + "step": 473 + }, + { + "epoch": 0.13875878220140514, + "grad_norm": 1.1131798028945923, + "learning_rate": 4.995852915836556e-06, + "loss": 0.7338, + "step": 474 + }, + { + "epoch": 0.13905152224824355, + "grad_norm": 0.9980742335319519, + "learning_rate": 4.995830715442027e-06, + "loss": 0.7869, + "step": 475 + }, + { + "epoch": 0.13934426229508196, + "grad_norm": 1.3487155437469482, + "learning_rate": 4.995808455833435e-06, + "loss": 0.7756, + "step": 476 + }, + { + "epoch": 0.13963700234192036, + "grad_norm": 1.0363023281097412, + "learning_rate": 4.995786137011308e-06, + "loss": 0.7482, + "step": 477 + }, + { + "epoch": 0.13992974238875877, + "grad_norm": 1.0383214950561523, + "learning_rate": 4.995763758976176e-06, + "loss": 0.7563, + "step": 478 + }, + { + "epoch": 0.14022248243559718, + "grad_norm": 1.1910260915756226, + "learning_rate": 4.995741321728569e-06, + "loss": 0.788, + "step": 479 + }, + { + "epoch": 0.1405152224824356, + "grad_norm": 1.0117086172103882, + "learning_rate": 4.995718825269019e-06, + "loss": 0.7505, + "step": 480 + }, + { + "epoch": 0.140807962529274, + "grad_norm": 1.0669265985488892, + "learning_rate": 4.995696269598061e-06, + "loss": 0.8012, + "step": 481 + }, + { + "epoch": 0.1411007025761124, + "grad_norm": 1.0423585176467896, + "learning_rate": 4.99567365471623e-06, + "loss": 0.7426, + "step": 482 + }, + { + "epoch": 0.1413934426229508, + "grad_norm": 0.971229076385498, + "learning_rate": 4.995650980624062e-06, + "loss": 0.7662, + "step": 483 + }, + { + "epoch": 0.14168618266978922, + "grad_norm": 1.0862553119659424, + "learning_rate": 4.995628247322094e-06, + "loss": 0.7489, + "step": 484 + }, + { + "epoch": 0.14197892271662763, + "grad_norm": 1.0066474676132202, + "learning_rate": 4.995605454810867e-06, + "loss": 0.7759, + "step": 485 + }, + { + "epoch": 0.14227166276346603, + "grad_norm": 1.2388725280761719, + "learning_rate": 4.99558260309092e-06, + "loss": 0.7796, + "step": 486 + }, + { + "epoch": 0.14256440281030444, + "grad_norm": 1.0122560262680054, + "learning_rate": 4.995559692162798e-06, + "loss": 0.7087, + "step": 487 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 1.0272035598754883, + "learning_rate": 4.995536722027042e-06, + "loss": 0.7509, + "step": 488 + }, + { + "epoch": 0.14314988290398126, + "grad_norm": 1.0606050491333008, + "learning_rate": 4.9955136926841985e-06, + "loss": 0.7408, + "step": 489 + }, + { + "epoch": 0.14344262295081966, + "grad_norm": 0.9947139024734497, + "learning_rate": 4.995490604134812e-06, + "loss": 0.7486, + "step": 490 + }, + { + "epoch": 0.14373536299765807, + "grad_norm": 1.0536909103393555, + "learning_rate": 4.995467456379432e-06, + "loss": 0.7392, + "step": 491 + }, + { + "epoch": 0.14402810304449648, + "grad_norm": 1.031711459159851, + "learning_rate": 4.9954442494186075e-06, + "loss": 0.7698, + "step": 492 + }, + { + "epoch": 0.1443208430913349, + "grad_norm": 1.0472348928451538, + "learning_rate": 4.9954209832528885e-06, + "loss": 0.7433, + "step": 493 + }, + { + "epoch": 0.1446135831381733, + "grad_norm": 1.0442010164260864, + "learning_rate": 4.995397657882828e-06, + "loss": 0.7071, + "step": 494 + }, + { + "epoch": 0.1449063231850117, + "grad_norm": 1.0676288604736328, + "learning_rate": 4.995374273308978e-06, + "loss": 0.7463, + "step": 495 + }, + { + "epoch": 0.1451990632318501, + "grad_norm": 1.0971819162368774, + "learning_rate": 4.995350829531893e-06, + "loss": 0.7335, + "step": 496 + }, + { + "epoch": 0.14549180327868852, + "grad_norm": 1.0309195518493652, + "learning_rate": 4.995327326552132e-06, + "loss": 0.7594, + "step": 497 + }, + { + "epoch": 0.14578454332552693, + "grad_norm": 1.3642863035202026, + "learning_rate": 4.99530376437025e-06, + "loss": 0.752, + "step": 498 + }, + { + "epoch": 0.14607728337236534, + "grad_norm": 0.9873098731040955, + "learning_rate": 4.995280142986806e-06, + "loss": 0.7178, + "step": 499 + }, + { + "epoch": 0.14637002341920374, + "grad_norm": 1.1110937595367432, + "learning_rate": 4.995256462402362e-06, + "loss": 0.7166, + "step": 500 + }, + { + "epoch": 0.14666276346604215, + "grad_norm": 1.1457798480987549, + "learning_rate": 4.995232722617479e-06, + "loss": 0.7476, + "step": 501 + }, + { + "epoch": 0.14695550351288056, + "grad_norm": 1.085321307182312, + "learning_rate": 4.99520892363272e-06, + "loss": 0.7609, + "step": 502 + }, + { + "epoch": 0.14724824355971897, + "grad_norm": 1.0328150987625122, + "learning_rate": 4.99518506544865e-06, + "loss": 0.7551, + "step": 503 + }, + { + "epoch": 0.14754098360655737, + "grad_norm": 1.0304176807403564, + "learning_rate": 4.995161148065835e-06, + "loss": 0.7607, + "step": 504 + }, + { + "epoch": 0.14783372365339578, + "grad_norm": 1.002192497253418, + "learning_rate": 4.995137171484843e-06, + "loss": 0.7232, + "step": 505 + }, + { + "epoch": 0.1481264637002342, + "grad_norm": 1.0392717123031616, + "learning_rate": 4.995113135706241e-06, + "loss": 0.7541, + "step": 506 + }, + { + "epoch": 0.1484192037470726, + "grad_norm": 1.0434260368347168, + "learning_rate": 4.9950890407306005e-06, + "loss": 0.7707, + "step": 507 + }, + { + "epoch": 0.148711943793911, + "grad_norm": 1.0789471864700317, + "learning_rate": 4.995064886558494e-06, + "loss": 0.7469, + "step": 508 + }, + { + "epoch": 0.1490046838407494, + "grad_norm": 1.0245229005813599, + "learning_rate": 4.9950406731904936e-06, + "loss": 0.7322, + "step": 509 + }, + { + "epoch": 0.14929742388758782, + "grad_norm": 1.0533961057662964, + "learning_rate": 4.995016400627174e-06, + "loss": 0.7267, + "step": 510 + }, + { + "epoch": 0.14959016393442623, + "grad_norm": 1.0870709419250488, + "learning_rate": 4.994992068869111e-06, + "loss": 0.7637, + "step": 511 + }, + { + "epoch": 0.14988290398126464, + "grad_norm": 1.0598188638687134, + "learning_rate": 4.994967677916881e-06, + "loss": 0.7331, + "step": 512 + }, + { + "epoch": 0.15017564402810304, + "grad_norm": 1.1002193689346313, + "learning_rate": 4.994943227771064e-06, + "loss": 0.7418, + "step": 513 + }, + { + "epoch": 0.15046838407494145, + "grad_norm": 1.1156649589538574, + "learning_rate": 4.99491871843224e-06, + "loss": 0.7538, + "step": 514 + }, + { + "epoch": 0.15076112412177986, + "grad_norm": 1.0885365009307861, + "learning_rate": 4.99489414990099e-06, + "loss": 0.796, + "step": 515 + }, + { + "epoch": 0.15105386416861827, + "grad_norm": 1.0566763877868652, + "learning_rate": 4.994869522177896e-06, + "loss": 0.7816, + "step": 516 + }, + { + "epoch": 0.15134660421545668, + "grad_norm": 1.0240143537521362, + "learning_rate": 4.994844835263543e-06, + "loss": 0.7537, + "step": 517 + }, + { + "epoch": 0.15163934426229508, + "grad_norm": 0.9999751448631287, + "learning_rate": 4.994820089158518e-06, + "loss": 0.6944, + "step": 518 + }, + { + "epoch": 0.1519320843091335, + "grad_norm": 0.9684767723083496, + "learning_rate": 4.994795283863408e-06, + "loss": 0.7479, + "step": 519 + }, + { + "epoch": 0.1522248243559719, + "grad_norm": 1.0401848554611206, + "learning_rate": 4.9947704193787995e-06, + "loss": 0.7948, + "step": 520 + }, + { + "epoch": 0.1525175644028103, + "grad_norm": 1.0069934129714966, + "learning_rate": 4.994745495705283e-06, + "loss": 0.7059, + "step": 521 + }, + { + "epoch": 0.15281030444964872, + "grad_norm": 1.032149314880371, + "learning_rate": 4.994720512843451e-06, + "loss": 0.7369, + "step": 522 + }, + { + "epoch": 0.15310304449648712, + "grad_norm": 0.9935375452041626, + "learning_rate": 4.994695470793896e-06, + "loss": 0.6944, + "step": 523 + }, + { + "epoch": 0.15339578454332553, + "grad_norm": 1.0790419578552246, + "learning_rate": 4.994670369557211e-06, + "loss": 0.7344, + "step": 524 + }, + { + "epoch": 0.15368852459016394, + "grad_norm": 1.056556224822998, + "learning_rate": 4.994645209133992e-06, + "loss": 0.7735, + "step": 525 + }, + { + "epoch": 0.15398126463700235, + "grad_norm": 1.2543163299560547, + "learning_rate": 4.994619989524837e-06, + "loss": 0.7312, + "step": 526 + }, + { + "epoch": 0.15427400468384075, + "grad_norm": 1.0421758890151978, + "learning_rate": 4.994594710730342e-06, + "loss": 0.7431, + "step": 527 + }, + { + "epoch": 0.15456674473067916, + "grad_norm": 1.0740159749984741, + "learning_rate": 4.99456937275111e-06, + "loss": 0.7573, + "step": 528 + }, + { + "epoch": 0.15485948477751757, + "grad_norm": 1.0140674114227295, + "learning_rate": 4.9945439755877385e-06, + "loss": 0.7298, + "step": 529 + }, + { + "epoch": 0.15515222482435598, + "grad_norm": 0.9783845543861389, + "learning_rate": 4.994518519240834e-06, + "loss": 0.7569, + "step": 530 + }, + { + "epoch": 0.15544496487119439, + "grad_norm": 1.038848876953125, + "learning_rate": 4.994493003710996e-06, + "loss": 0.7635, + "step": 531 + }, + { + "epoch": 0.1557377049180328, + "grad_norm": 1.0091180801391602, + "learning_rate": 4.994467428998834e-06, + "loss": 0.7405, + "step": 532 + }, + { + "epoch": 0.1560304449648712, + "grad_norm": 1.042149543762207, + "learning_rate": 4.9944417951049526e-06, + "loss": 0.7272, + "step": 533 + }, + { + "epoch": 0.1563231850117096, + "grad_norm": 0.9998611211776733, + "learning_rate": 4.9944161020299606e-06, + "loss": 0.7295, + "step": 534 + }, + { + "epoch": 0.15661592505854802, + "grad_norm": 1.0675480365753174, + "learning_rate": 4.994390349774467e-06, + "loss": 0.755, + "step": 535 + }, + { + "epoch": 0.15690866510538642, + "grad_norm": 1.1135398149490356, + "learning_rate": 4.994364538339083e-06, + "loss": 0.7474, + "step": 536 + }, + { + "epoch": 0.15720140515222483, + "grad_norm": 1.08311128616333, + "learning_rate": 4.994338667724422e-06, + "loss": 0.7757, + "step": 537 + }, + { + "epoch": 0.15749414519906324, + "grad_norm": 1.0662428140640259, + "learning_rate": 4.994312737931096e-06, + "loss": 0.7376, + "step": 538 + }, + { + "epoch": 0.15778688524590165, + "grad_norm": 0.9838939309120178, + "learning_rate": 4.994286748959721e-06, + "loss": 0.7657, + "step": 539 + }, + { + "epoch": 0.15807962529274006, + "grad_norm": 1.0284777879714966, + "learning_rate": 4.994260700810914e-06, + "loss": 0.6962, + "step": 540 + }, + { + "epoch": 0.15837236533957846, + "grad_norm": 1.0588003396987915, + "learning_rate": 4.994234593485292e-06, + "loss": 0.7161, + "step": 541 + }, + { + "epoch": 0.15866510538641687, + "grad_norm": 0.9858947396278381, + "learning_rate": 4.994208426983476e-06, + "loss": 0.7297, + "step": 542 + }, + { + "epoch": 0.15895784543325528, + "grad_norm": 1.0785642862319946, + "learning_rate": 4.994182201306086e-06, + "loss": 0.7684, + "step": 543 + }, + { + "epoch": 0.1592505854800937, + "grad_norm": 1.1240425109863281, + "learning_rate": 4.994155916453744e-06, + "loss": 0.6967, + "step": 544 + }, + { + "epoch": 0.1595433255269321, + "grad_norm": 0.9456214308738708, + "learning_rate": 4.994129572427073e-06, + "loss": 0.7292, + "step": 545 + }, + { + "epoch": 0.1598360655737705, + "grad_norm": 1.0441051721572876, + "learning_rate": 4.994103169226699e-06, + "loss": 0.7947, + "step": 546 + }, + { + "epoch": 0.1601288056206089, + "grad_norm": 0.9871823191642761, + "learning_rate": 4.994076706853248e-06, + "loss": 0.7533, + "step": 547 + }, + { + "epoch": 0.16042154566744732, + "grad_norm": 0.9885470271110535, + "learning_rate": 4.994050185307349e-06, + "loss": 0.7214, + "step": 548 + }, + { + "epoch": 0.16071428571428573, + "grad_norm": 1.0480362176895142, + "learning_rate": 4.9940236045896286e-06, + "loss": 0.7431, + "step": 549 + }, + { + "epoch": 0.16100702576112413, + "grad_norm": 1.0542758703231812, + "learning_rate": 4.99399696470072e-06, + "loss": 0.7394, + "step": 550 + }, + { + "epoch": 0.16129976580796254, + "grad_norm": 1.0107929706573486, + "learning_rate": 4.993970265641254e-06, + "loss": 0.7058, + "step": 551 + }, + { + "epoch": 0.16159250585480095, + "grad_norm": 0.9981992244720459, + "learning_rate": 4.993943507411865e-06, + "loss": 0.7324, + "step": 552 + }, + { + "epoch": 0.16188524590163936, + "grad_norm": 1.1358853578567505, + "learning_rate": 4.993916690013186e-06, + "loss": 0.7329, + "step": 553 + }, + { + "epoch": 0.16217798594847777, + "grad_norm": 1.0473805665969849, + "learning_rate": 4.993889813445854e-06, + "loss": 0.7511, + "step": 554 + }, + { + "epoch": 0.16247072599531617, + "grad_norm": 1.062420129776001, + "learning_rate": 4.993862877710509e-06, + "loss": 0.7236, + "step": 555 + }, + { + "epoch": 0.16276346604215455, + "grad_norm": 1.0818626880645752, + "learning_rate": 4.993835882807787e-06, + "loss": 0.7101, + "step": 556 + }, + { + "epoch": 0.16305620608899296, + "grad_norm": 1.0834968090057373, + "learning_rate": 4.993808828738329e-06, + "loss": 0.7292, + "step": 557 + }, + { + "epoch": 0.16334894613583137, + "grad_norm": 1.0610898733139038, + "learning_rate": 4.993781715502779e-06, + "loss": 0.7685, + "step": 558 + }, + { + "epoch": 0.16364168618266978, + "grad_norm": 1.0930734872817993, + "learning_rate": 4.993754543101777e-06, + "loss": 0.7436, + "step": 559 + }, + { + "epoch": 0.16393442622950818, + "grad_norm": 1.0046436786651611, + "learning_rate": 4.99372731153597e-06, + "loss": 0.7132, + "step": 560 + }, + { + "epoch": 0.1642271662763466, + "grad_norm": 1.0625430345535278, + "learning_rate": 4.993700020806004e-06, + "loss": 0.704, + "step": 561 + }, + { + "epoch": 0.164519906323185, + "grad_norm": 1.0220093727111816, + "learning_rate": 4.993672670912525e-06, + "loss": 0.7157, + "step": 562 + }, + { + "epoch": 0.1648126463700234, + "grad_norm": 1.0230109691619873, + "learning_rate": 4.993645261856184e-06, + "loss": 0.7607, + "step": 563 + }, + { + "epoch": 0.16510538641686182, + "grad_norm": 1.0819051265716553, + "learning_rate": 4.993617793637628e-06, + "loss": 0.7727, + "step": 564 + }, + { + "epoch": 0.16539812646370022, + "grad_norm": 1.0024514198303223, + "learning_rate": 4.993590266257513e-06, + "loss": 0.7464, + "step": 565 + }, + { + "epoch": 0.16569086651053863, + "grad_norm": 1.0258533954620361, + "learning_rate": 4.993562679716489e-06, + "loss": 0.7632, + "step": 566 + }, + { + "epoch": 0.16598360655737704, + "grad_norm": 1.0132462978363037, + "learning_rate": 4.993535034015211e-06, + "loss": 0.7272, + "step": 567 + }, + { + "epoch": 0.16627634660421545, + "grad_norm": 1.070153832435608, + "learning_rate": 4.993507329154336e-06, + "loss": 0.7659, + "step": 568 + }, + { + "epoch": 0.16656908665105385, + "grad_norm": 0.9886132478713989, + "learning_rate": 4.99347956513452e-06, + "loss": 0.6861, + "step": 569 + }, + { + "epoch": 0.16686182669789226, + "grad_norm": 1.0627933740615845, + "learning_rate": 4.993451741956422e-06, + "loss": 0.7372, + "step": 570 + }, + { + "epoch": 0.16715456674473067, + "grad_norm": 1.05949866771698, + "learning_rate": 4.993423859620703e-06, + "loss": 0.7197, + "step": 571 + }, + { + "epoch": 0.16744730679156908, + "grad_norm": 1.0306446552276611, + "learning_rate": 4.993395918128023e-06, + "loss": 0.7265, + "step": 572 + }, + { + "epoch": 0.16774004683840749, + "grad_norm": 1.0499868392944336, + "learning_rate": 4.993367917479047e-06, + "loss": 0.7529, + "step": 573 + }, + { + "epoch": 0.1680327868852459, + "grad_norm": 1.0573797225952148, + "learning_rate": 4.9933398576744375e-06, + "loss": 0.7723, + "step": 574 + }, + { + "epoch": 0.1683255269320843, + "grad_norm": 1.0257741212844849, + "learning_rate": 4.993311738714861e-06, + "loss": 0.752, + "step": 575 + }, + { + "epoch": 0.1686182669789227, + "grad_norm": 1.0326147079467773, + "learning_rate": 4.993283560600984e-06, + "loss": 0.7342, + "step": 576 + }, + { + "epoch": 0.16891100702576112, + "grad_norm": 1.032239556312561, + "learning_rate": 4.993255323333476e-06, + "loss": 0.7491, + "step": 577 + }, + { + "epoch": 0.16920374707259953, + "grad_norm": 1.0691924095153809, + "learning_rate": 4.993227026913007e-06, + "loss": 0.7194, + "step": 578 + }, + { + "epoch": 0.16949648711943793, + "grad_norm": 1.0019234418869019, + "learning_rate": 4.993198671340247e-06, + "loss": 0.7388, + "step": 579 + }, + { + "epoch": 0.16978922716627634, + "grad_norm": 1.0918360948562622, + "learning_rate": 4.993170256615869e-06, + "loss": 0.775, + "step": 580 + }, + { + "epoch": 0.17008196721311475, + "grad_norm": 1.0384430885314941, + "learning_rate": 4.993141782740549e-06, + "loss": 0.7726, + "step": 581 + }, + { + "epoch": 0.17037470725995316, + "grad_norm": 1.037977695465088, + "learning_rate": 4.99311324971496e-06, + "loss": 0.7699, + "step": 582 + }, + { + "epoch": 0.17066744730679156, + "grad_norm": 0.9696611762046814, + "learning_rate": 4.9930846575397805e-06, + "loss": 0.7664, + "step": 583 + }, + { + "epoch": 0.17096018735362997, + "grad_norm": 1.041130781173706, + "learning_rate": 4.99305600621569e-06, + "loss": 0.7313, + "step": 584 + }, + { + "epoch": 0.17125292740046838, + "grad_norm": 1.022875189781189, + "learning_rate": 4.9930272957433645e-06, + "loss": 0.7342, + "step": 585 + }, + { + "epoch": 0.1715456674473068, + "grad_norm": 0.9844250679016113, + "learning_rate": 4.992998526123489e-06, + "loss": 0.7605, + "step": 586 + }, + { + "epoch": 0.1718384074941452, + "grad_norm": 1.057983160018921, + "learning_rate": 4.992969697356744e-06, + "loss": 0.7347, + "step": 587 + }, + { + "epoch": 0.1721311475409836, + "grad_norm": 1.1272175312042236, + "learning_rate": 4.992940809443813e-06, + "loss": 0.7696, + "step": 588 + }, + { + "epoch": 0.172423887587822, + "grad_norm": 1.012060523033142, + "learning_rate": 4.992911862385385e-06, + "loss": 0.7488, + "step": 589 + }, + { + "epoch": 0.17271662763466042, + "grad_norm": 1.0462814569473267, + "learning_rate": 4.9928828561821415e-06, + "loss": 0.7914, + "step": 590 + }, + { + "epoch": 0.17300936768149883, + "grad_norm": 1.0901904106140137, + "learning_rate": 4.992853790834774e-06, + "loss": 0.7631, + "step": 591 + }, + { + "epoch": 0.17330210772833723, + "grad_norm": 1.0424400568008423, + "learning_rate": 4.992824666343972e-06, + "loss": 0.775, + "step": 592 + }, + { + "epoch": 0.17359484777517564, + "grad_norm": 1.1213066577911377, + "learning_rate": 4.992795482710425e-06, + "loss": 0.7918, + "step": 593 + }, + { + "epoch": 0.17388758782201405, + "grad_norm": 1.048017978668213, + "learning_rate": 4.992766239934828e-06, + "loss": 0.7223, + "step": 594 + }, + { + "epoch": 0.17418032786885246, + "grad_norm": 1.1663038730621338, + "learning_rate": 4.992736938017871e-06, + "loss": 0.731, + "step": 595 + }, + { + "epoch": 0.17447306791569087, + "grad_norm": 1.0565427541732788, + "learning_rate": 4.992707576960252e-06, + "loss": 0.745, + "step": 596 + }, + { + "epoch": 0.17476580796252927, + "grad_norm": 1.0555992126464844, + "learning_rate": 4.992678156762667e-06, + "loss": 0.7554, + "step": 597 + }, + { + "epoch": 0.17505854800936768, + "grad_norm": 1.1321685314178467, + "learning_rate": 4.992648677425813e-06, + "loss": 0.7173, + "step": 598 + }, + { + "epoch": 0.1753512880562061, + "grad_norm": 1.0394710302352905, + "learning_rate": 4.992619138950391e-06, + "loss": 0.7188, + "step": 599 + }, + { + "epoch": 0.1756440281030445, + "grad_norm": 1.076233148574829, + "learning_rate": 4.992589541337102e-06, + "loss": 0.6983, + "step": 600 + }, + { + "epoch": 0.1759367681498829, + "grad_norm": 1.0988874435424805, + "learning_rate": 4.992559884586645e-06, + "loss": 0.7141, + "step": 601 + }, + { + "epoch": 0.1762295081967213, + "grad_norm": 1.0630314350128174, + "learning_rate": 4.992530168699727e-06, + "loss": 0.7431, + "step": 602 + }, + { + "epoch": 0.17652224824355972, + "grad_norm": 1.0518795251846313, + "learning_rate": 4.992500393677053e-06, + "loss": 0.7322, + "step": 603 + }, + { + "epoch": 0.17681498829039813, + "grad_norm": 1.3076356649398804, + "learning_rate": 4.992470559519327e-06, + "loss": 0.7056, + "step": 604 + }, + { + "epoch": 0.17710772833723654, + "grad_norm": 1.0358792543411255, + "learning_rate": 4.992440666227259e-06, + "loss": 0.7595, + "step": 605 + }, + { + "epoch": 0.17740046838407494, + "grad_norm": 1.0961109399795532, + "learning_rate": 4.992410713801556e-06, + "loss": 0.731, + "step": 606 + }, + { + "epoch": 0.17769320843091335, + "grad_norm": 0.9455900192260742, + "learning_rate": 4.99238070224293e-06, + "loss": 0.7055, + "step": 607 + }, + { + "epoch": 0.17798594847775176, + "grad_norm": 1.004599690437317, + "learning_rate": 4.992350631552094e-06, + "loss": 0.6539, + "step": 608 + }, + { + "epoch": 0.17827868852459017, + "grad_norm": 1.0386807918548584, + "learning_rate": 4.992320501729761e-06, + "loss": 0.7828, + "step": 609 + }, + { + "epoch": 0.17857142857142858, + "grad_norm": 0.9567112326622009, + "learning_rate": 4.9922903127766446e-06, + "loss": 0.6893, + "step": 610 + }, + { + "epoch": 0.17886416861826698, + "grad_norm": 1.0716243982315063, + "learning_rate": 4.99226006469346e-06, + "loss": 0.8081, + "step": 611 + }, + { + "epoch": 0.1791569086651054, + "grad_norm": 1.034654140472412, + "learning_rate": 4.992229757480929e-06, + "loss": 0.7397, + "step": 612 + }, + { + "epoch": 0.1794496487119438, + "grad_norm": 1.036073923110962, + "learning_rate": 4.992199391139767e-06, + "loss": 0.7409, + "step": 613 + }, + { + "epoch": 0.1797423887587822, + "grad_norm": 0.9878498315811157, + "learning_rate": 4.992168965670696e-06, + "loss": 0.7429, + "step": 614 + }, + { + "epoch": 0.18003512880562061, + "grad_norm": 1.0201154947280884, + "learning_rate": 4.9921384810744374e-06, + "loss": 0.7248, + "step": 615 + }, + { + "epoch": 0.18032786885245902, + "grad_norm": 0.9892057180404663, + "learning_rate": 4.992107937351715e-06, + "loss": 0.6674, + "step": 616 + }, + { + "epoch": 0.18062060889929743, + "grad_norm": 1.0774754285812378, + "learning_rate": 4.992077334503252e-06, + "loss": 0.7626, + "step": 617 + }, + { + "epoch": 0.18091334894613584, + "grad_norm": 1.0042815208435059, + "learning_rate": 4.992046672529776e-06, + "loss": 0.7292, + "step": 618 + }, + { + "epoch": 0.18120608899297425, + "grad_norm": 1.0517489910125732, + "learning_rate": 4.992015951432014e-06, + "loss": 0.7475, + "step": 619 + }, + { + "epoch": 0.18149882903981265, + "grad_norm": 0.9919170141220093, + "learning_rate": 4.991985171210694e-06, + "loss": 0.7152, + "step": 620 + }, + { + "epoch": 0.18179156908665106, + "grad_norm": 1.055580735206604, + "learning_rate": 4.991954331866548e-06, + "loss": 0.6799, + "step": 621 + }, + { + "epoch": 0.18208430913348947, + "grad_norm": 0.9550749659538269, + "learning_rate": 4.991923433400306e-06, + "loss": 0.6717, + "step": 622 + }, + { + "epoch": 0.18237704918032788, + "grad_norm": 1.00528883934021, + "learning_rate": 4.991892475812703e-06, + "loss": 0.7613, + "step": 623 + }, + { + "epoch": 0.18266978922716628, + "grad_norm": 1.029396653175354, + "learning_rate": 4.991861459104471e-06, + "loss": 0.7431, + "step": 624 + }, + { + "epoch": 0.1829625292740047, + "grad_norm": 1.054984450340271, + "learning_rate": 4.991830383276348e-06, + "loss": 0.7881, + "step": 625 + }, + { + "epoch": 0.1832552693208431, + "grad_norm": 1.050837516784668, + "learning_rate": 4.99179924832907e-06, + "loss": 0.6818, + "step": 626 + }, + { + "epoch": 0.1835480093676815, + "grad_norm": 1.1065423488616943, + "learning_rate": 4.991768054263376e-06, + "loss": 0.7572, + "step": 627 + }, + { + "epoch": 0.18384074941451992, + "grad_norm": 1.0425485372543335, + "learning_rate": 4.991736801080006e-06, + "loss": 0.7783, + "step": 628 + }, + { + "epoch": 0.18413348946135832, + "grad_norm": 0.9795385003089905, + "learning_rate": 4.991705488779702e-06, + "loss": 0.7641, + "step": 629 + }, + { + "epoch": 0.18442622950819673, + "grad_norm": 1.03068208694458, + "learning_rate": 4.991674117363207e-06, + "loss": 0.7462, + "step": 630 + }, + { + "epoch": 0.18471896955503514, + "grad_norm": 1.044397234916687, + "learning_rate": 4.991642686831264e-06, + "loss": 0.7527, + "step": 631 + }, + { + "epoch": 0.18501170960187355, + "grad_norm": 1.1305725574493408, + "learning_rate": 4.991611197184621e-06, + "loss": 0.717, + "step": 632 + }, + { + "epoch": 0.18530444964871196, + "grad_norm": 1.0267339944839478, + "learning_rate": 4.991579648424022e-06, + "loss": 0.7744, + "step": 633 + }, + { + "epoch": 0.18559718969555036, + "grad_norm": 1.0448241233825684, + "learning_rate": 4.991548040550218e-06, + "loss": 0.7286, + "step": 634 + }, + { + "epoch": 0.18588992974238877, + "grad_norm": 1.0771070718765259, + "learning_rate": 4.9915163735639575e-06, + "loss": 0.7194, + "step": 635 + }, + { + "epoch": 0.18618266978922718, + "grad_norm": 1.0556049346923828, + "learning_rate": 4.991484647465993e-06, + "loss": 0.7097, + "step": 636 + }, + { + "epoch": 0.1864754098360656, + "grad_norm": 1.012529730796814, + "learning_rate": 4.991452862257076e-06, + "loss": 0.7459, + "step": 637 + }, + { + "epoch": 0.186768149882904, + "grad_norm": 1.0305569171905518, + "learning_rate": 4.991421017937962e-06, + "loss": 0.7334, + "step": 638 + }, + { + "epoch": 0.1870608899297424, + "grad_norm": 1.072114109992981, + "learning_rate": 4.991389114509405e-06, + "loss": 0.6838, + "step": 639 + }, + { + "epoch": 0.1873536299765808, + "grad_norm": 1.0516080856323242, + "learning_rate": 4.991357151972163e-06, + "loss": 0.7104, + "step": 640 + }, + { + "epoch": 0.1876463700234192, + "grad_norm": 1.0710749626159668, + "learning_rate": 4.991325130326994e-06, + "loss": 0.769, + "step": 641 + }, + { + "epoch": 0.1879391100702576, + "grad_norm": 1.0682337284088135, + "learning_rate": 4.991293049574657e-06, + "loss": 0.7394, + "step": 642 + }, + { + "epoch": 0.188231850117096, + "grad_norm": 1.014511227607727, + "learning_rate": 4.9912609097159135e-06, + "loss": 0.7047, + "step": 643 + }, + { + "epoch": 0.1885245901639344, + "grad_norm": 1.0116115808486938, + "learning_rate": 4.991228710751528e-06, + "loss": 0.743, + "step": 644 + }, + { + "epoch": 0.18881733021077282, + "grad_norm": 1.078175663948059, + "learning_rate": 4.991196452682261e-06, + "loss": 0.7383, + "step": 645 + }, + { + "epoch": 0.18911007025761123, + "grad_norm": 1.0496474504470825, + "learning_rate": 4.99116413550888e-06, + "loss": 0.7341, + "step": 646 + }, + { + "epoch": 0.18940281030444964, + "grad_norm": 1.069902777671814, + "learning_rate": 4.9911317592321515e-06, + "loss": 0.8108, + "step": 647 + }, + { + "epoch": 0.18969555035128804, + "grad_norm": 0.9741089344024658, + "learning_rate": 4.991099323852844e-06, + "loss": 0.6899, + "step": 648 + }, + { + "epoch": 0.18998829039812645, + "grad_norm": 1.019347906112671, + "learning_rate": 4.991066829371726e-06, + "loss": 0.7815, + "step": 649 + }, + { + "epoch": 0.19028103044496486, + "grad_norm": 1.089674711227417, + "learning_rate": 4.991034275789568e-06, + "loss": 0.7448, + "step": 650 + }, + { + "epoch": 0.19057377049180327, + "grad_norm": 0.9528079032897949, + "learning_rate": 4.991001663107144e-06, + "loss": 0.6516, + "step": 651 + }, + { + "epoch": 0.19086651053864168, + "grad_norm": 1.0877478122711182, + "learning_rate": 4.990968991325227e-06, + "loss": 0.7761, + "step": 652 + }, + { + "epoch": 0.19115925058548008, + "grad_norm": 1.0191349983215332, + "learning_rate": 4.990936260444592e-06, + "loss": 0.7132, + "step": 653 + }, + { + "epoch": 0.1914519906323185, + "grad_norm": 1.062837839126587, + "learning_rate": 4.9909034704660155e-06, + "loss": 0.7336, + "step": 654 + }, + { + "epoch": 0.1917447306791569, + "grad_norm": 1.5472934246063232, + "learning_rate": 4.9908706213902755e-06, + "loss": 0.762, + "step": 655 + }, + { + "epoch": 0.1920374707259953, + "grad_norm": 1.0537971258163452, + "learning_rate": 4.990837713218152e-06, + "loss": 0.7087, + "step": 656 + }, + { + "epoch": 0.19233021077283372, + "grad_norm": 1.0655015707015991, + "learning_rate": 4.990804745950425e-06, + "loss": 0.7707, + "step": 657 + }, + { + "epoch": 0.19262295081967212, + "grad_norm": 1.0465137958526611, + "learning_rate": 4.990771719587877e-06, + "loss": 0.7234, + "step": 658 + }, + { + "epoch": 0.19291569086651053, + "grad_norm": 0.9924777746200562, + "learning_rate": 4.9907386341312915e-06, + "loss": 0.7311, + "step": 659 + }, + { + "epoch": 0.19320843091334894, + "grad_norm": 1.0217084884643555, + "learning_rate": 4.990705489581453e-06, + "loss": 0.7069, + "step": 660 + }, + { + "epoch": 0.19350117096018735, + "grad_norm": 1.0155528783798218, + "learning_rate": 4.990672285939148e-06, + "loss": 0.7529, + "step": 661 + }, + { + "epoch": 0.19379391100702575, + "grad_norm": 0.9800963401794434, + "learning_rate": 4.990639023205165e-06, + "loss": 0.7303, + "step": 662 + }, + { + "epoch": 0.19408665105386416, + "grad_norm": 0.9955663084983826, + "learning_rate": 4.990605701380292e-06, + "loss": 0.7008, + "step": 663 + }, + { + "epoch": 0.19437939110070257, + "grad_norm": 1.0005638599395752, + "learning_rate": 4.9905723204653215e-06, + "loss": 0.692, + "step": 664 + }, + { + "epoch": 0.19467213114754098, + "grad_norm": 1.0096423625946045, + "learning_rate": 4.990538880461043e-06, + "loss": 0.6988, + "step": 665 + }, + { + "epoch": 0.19496487119437939, + "grad_norm": 1.0063362121582031, + "learning_rate": 4.990505381368252e-06, + "loss": 0.6886, + "step": 666 + }, + { + "epoch": 0.1952576112412178, + "grad_norm": 1.090458631515503, + "learning_rate": 4.990471823187741e-06, + "loss": 0.739, + "step": 667 + }, + { + "epoch": 0.1955503512880562, + "grad_norm": 1.1056127548217773, + "learning_rate": 4.990438205920309e-06, + "loss": 0.7304, + "step": 668 + }, + { + "epoch": 0.1958430913348946, + "grad_norm": 1.0049651861190796, + "learning_rate": 4.990404529566752e-06, + "loss": 0.7476, + "step": 669 + }, + { + "epoch": 0.19613583138173302, + "grad_norm": 1.0209503173828125, + "learning_rate": 4.9903707941278675e-06, + "loss": 0.715, + "step": 670 + }, + { + "epoch": 0.19642857142857142, + "grad_norm": 1.0002988576889038, + "learning_rate": 4.990336999604459e-06, + "loss": 0.7535, + "step": 671 + }, + { + "epoch": 0.19672131147540983, + "grad_norm": 1.036946415901184, + "learning_rate": 4.990303145997326e-06, + "loss": 0.7515, + "step": 672 + }, + { + "epoch": 0.19701405152224824, + "grad_norm": 1.0492767095565796, + "learning_rate": 4.990269233307273e-06, + "loss": 0.7383, + "step": 673 + }, + { + "epoch": 0.19730679156908665, + "grad_norm": 0.9920244812965393, + "learning_rate": 4.990235261535104e-06, + "loss": 0.746, + "step": 674 + }, + { + "epoch": 0.19759953161592506, + "grad_norm": 1.022533893585205, + "learning_rate": 4.990201230681625e-06, + "loss": 0.7251, + "step": 675 + }, + { + "epoch": 0.19789227166276346, + "grad_norm": 0.9507768750190735, + "learning_rate": 4.990167140747643e-06, + "loss": 0.698, + "step": 676 + }, + { + "epoch": 0.19818501170960187, + "grad_norm": 1.0477930307388306, + "learning_rate": 4.9901329917339666e-06, + "loss": 0.7465, + "step": 677 + }, + { + "epoch": 0.19847775175644028, + "grad_norm": 1.0809611082077026, + "learning_rate": 4.9900987836414075e-06, + "loss": 0.7572, + "step": 678 + }, + { + "epoch": 0.1987704918032787, + "grad_norm": 1.0472795963287354, + "learning_rate": 4.990064516470776e-06, + "loss": 0.7056, + "step": 679 + }, + { + "epoch": 0.1990632318501171, + "grad_norm": 1.046653151512146, + "learning_rate": 4.990030190222885e-06, + "loss": 0.7472, + "step": 680 + }, + { + "epoch": 0.1993559718969555, + "grad_norm": 1.0082098245620728, + "learning_rate": 4.9899958048985504e-06, + "loss": 0.7325, + "step": 681 + }, + { + "epoch": 0.1996487119437939, + "grad_norm": 1.0049536228179932, + "learning_rate": 4.9899613604985866e-06, + "loss": 0.6967, + "step": 682 + }, + { + "epoch": 0.19994145199063232, + "grad_norm": 1.0320920944213867, + "learning_rate": 4.9899268570238116e-06, + "loss": 0.6733, + "step": 683 + }, + { + "epoch": 0.20023419203747073, + "grad_norm": 0.995404839515686, + "learning_rate": 4.989892294475042e-06, + "loss": 0.7483, + "step": 684 + }, + { + "epoch": 0.20052693208430913, + "grad_norm": 1.0578256845474243, + "learning_rate": 4.9898576728531e-06, + "loss": 0.7346, + "step": 685 + }, + { + "epoch": 0.20081967213114754, + "grad_norm": 1.0098872184753418, + "learning_rate": 4.989822992158807e-06, + "loss": 0.7473, + "step": 686 + }, + { + "epoch": 0.20111241217798595, + "grad_norm": 1.03389573097229, + "learning_rate": 4.989788252392985e-06, + "loss": 0.7665, + "step": 687 + }, + { + "epoch": 0.20140515222482436, + "grad_norm": 1.0242549180984497, + "learning_rate": 4.989753453556458e-06, + "loss": 0.7377, + "step": 688 + }, + { + "epoch": 0.20169789227166277, + "grad_norm": 1.0716460943222046, + "learning_rate": 4.9897185956500525e-06, + "loss": 0.7383, + "step": 689 + }, + { + "epoch": 0.20199063231850117, + "grad_norm": 0.96657794713974, + "learning_rate": 4.989683678674594e-06, + "loss": 0.6823, + "step": 690 + }, + { + "epoch": 0.20228337236533958, + "grad_norm": 1.0209015607833862, + "learning_rate": 4.989648702630913e-06, + "loss": 0.7133, + "step": 691 + }, + { + "epoch": 0.202576112412178, + "grad_norm": 0.9681962132453918, + "learning_rate": 4.989613667519838e-06, + "loss": 0.7443, + "step": 692 + }, + { + "epoch": 0.2028688524590164, + "grad_norm": 1.0192793607711792, + "learning_rate": 4.989578573342199e-06, + "loss": 0.7012, + "step": 693 + }, + { + "epoch": 0.2031615925058548, + "grad_norm": 0.997841477394104, + "learning_rate": 4.989543420098832e-06, + "loss": 0.7458, + "step": 694 + }, + { + "epoch": 0.2034543325526932, + "grad_norm": 1.057230830192566, + "learning_rate": 4.989508207790568e-06, + "loss": 0.7134, + "step": 695 + }, + { + "epoch": 0.20374707259953162, + "grad_norm": 1.0058214664459229, + "learning_rate": 4.989472936418244e-06, + "loss": 0.752, + "step": 696 + }, + { + "epoch": 0.20403981264637003, + "grad_norm": 0.9776172637939453, + "learning_rate": 4.989437605982696e-06, + "loss": 0.7152, + "step": 697 + }, + { + "epoch": 0.20433255269320844, + "grad_norm": 1.04109525680542, + "learning_rate": 4.9894022164847625e-06, + "loss": 0.7156, + "step": 698 + }, + { + "epoch": 0.20462529274004684, + "grad_norm": 1.0102745294570923, + "learning_rate": 4.989366767925283e-06, + "loss": 0.7505, + "step": 699 + }, + { + "epoch": 0.20491803278688525, + "grad_norm": 1.0139981508255005, + "learning_rate": 4.989331260305099e-06, + "loss": 0.7049, + "step": 700 + }, + { + "epoch": 0.20521077283372366, + "grad_norm": 0.9724369645118713, + "learning_rate": 4.989295693625053e-06, + "loss": 0.7623, + "step": 701 + }, + { + "epoch": 0.20550351288056207, + "grad_norm": 1.0498576164245605, + "learning_rate": 4.989260067885988e-06, + "loss": 0.7545, + "step": 702 + }, + { + "epoch": 0.20579625292740047, + "grad_norm": 0.9969208836555481, + "learning_rate": 4.989224383088749e-06, + "loss": 0.6927, + "step": 703 + }, + { + "epoch": 0.20608899297423888, + "grad_norm": 1.0377227067947388, + "learning_rate": 4.989188639234184e-06, + "loss": 0.7211, + "step": 704 + }, + { + "epoch": 0.2063817330210773, + "grad_norm": 1.0484113693237305, + "learning_rate": 4.98915283632314e-06, + "loss": 0.7454, + "step": 705 + }, + { + "epoch": 0.2066744730679157, + "grad_norm": 1.0581308603286743, + "learning_rate": 4.989116974356467e-06, + "loss": 0.7502, + "step": 706 + }, + { + "epoch": 0.2069672131147541, + "grad_norm": 1.0490704774856567, + "learning_rate": 4.989081053335015e-06, + "loss": 0.7294, + "step": 707 + }, + { + "epoch": 0.20725995316159251, + "grad_norm": 1.0793706178665161, + "learning_rate": 4.989045073259637e-06, + "loss": 0.72, + "step": 708 + }, + { + "epoch": 0.20755269320843092, + "grad_norm": 1.0298837423324585, + "learning_rate": 4.989009034131187e-06, + "loss": 0.7473, + "step": 709 + }, + { + "epoch": 0.20784543325526933, + "grad_norm": 1.0298395156860352, + "learning_rate": 4.98897293595052e-06, + "loss": 0.7423, + "step": 710 + }, + { + "epoch": 0.20813817330210774, + "grad_norm": 1.1512911319732666, + "learning_rate": 4.988936778718491e-06, + "loss": 0.7906, + "step": 711 + }, + { + "epoch": 0.20843091334894615, + "grad_norm": 1.0049829483032227, + "learning_rate": 4.9889005624359585e-06, + "loss": 0.755, + "step": 712 + }, + { + "epoch": 0.20872365339578455, + "grad_norm": 0.9980104565620422, + "learning_rate": 4.988864287103782e-06, + "loss": 0.6925, + "step": 713 + }, + { + "epoch": 0.20901639344262296, + "grad_norm": 1.016703724861145, + "learning_rate": 4.9888279527228225e-06, + "loss": 0.7083, + "step": 714 + }, + { + "epoch": 0.20930913348946137, + "grad_norm": 1.0506963729858398, + "learning_rate": 4.988791559293942e-06, + "loss": 0.6975, + "step": 715 + }, + { + "epoch": 0.20960187353629978, + "grad_norm": 1.0891588926315308, + "learning_rate": 4.9887551068180026e-06, + "loss": 0.7222, + "step": 716 + }, + { + "epoch": 0.20989461358313818, + "grad_norm": 0.971768319606781, + "learning_rate": 4.988718595295871e-06, + "loss": 0.6933, + "step": 717 + }, + { + "epoch": 0.2101873536299766, + "grad_norm": 1.1047747135162354, + "learning_rate": 4.988682024728412e-06, + "loss": 0.7064, + "step": 718 + }, + { + "epoch": 0.210480093676815, + "grad_norm": 1.0571386814117432, + "learning_rate": 4.9886453951164946e-06, + "loss": 0.7354, + "step": 719 + }, + { + "epoch": 0.2107728337236534, + "grad_norm": 1.0627344846725464, + "learning_rate": 4.988608706460987e-06, + "loss": 0.7036, + "step": 720 + }, + { + "epoch": 0.21106557377049182, + "grad_norm": 1.069915771484375, + "learning_rate": 4.988571958762759e-06, + "loss": 0.7381, + "step": 721 + }, + { + "epoch": 0.21135831381733022, + "grad_norm": 1.0299769639968872, + "learning_rate": 4.988535152022683e-06, + "loss": 0.6999, + "step": 722 + }, + { + "epoch": 0.21165105386416863, + "grad_norm": 1.0895439386367798, + "learning_rate": 4.988498286241633e-06, + "loss": 0.7267, + "step": 723 + }, + { + "epoch": 0.21194379391100704, + "grad_norm": 1.0283035039901733, + "learning_rate": 4.988461361420484e-06, + "loss": 0.7462, + "step": 724 + }, + { + "epoch": 0.21223653395784545, + "grad_norm": 0.9983345866203308, + "learning_rate": 4.988424377560109e-06, + "loss": 0.7068, + "step": 725 + }, + { + "epoch": 0.21252927400468383, + "grad_norm": 1.0561195611953735, + "learning_rate": 4.988387334661389e-06, + "loss": 0.7585, + "step": 726 + }, + { + "epoch": 0.21282201405152223, + "grad_norm": 1.0522255897521973, + "learning_rate": 4.988350232725201e-06, + "loss": 0.7142, + "step": 727 + }, + { + "epoch": 0.21311475409836064, + "grad_norm": 1.061959981918335, + "learning_rate": 4.988313071752427e-06, + "loss": 0.7608, + "step": 728 + }, + { + "epoch": 0.21340749414519905, + "grad_norm": 1.0319968461990356, + "learning_rate": 4.9882758517439456e-06, + "loss": 0.7476, + "step": 729 + }, + { + "epoch": 0.21370023419203746, + "grad_norm": 1.0711992979049683, + "learning_rate": 4.9882385727006425e-06, + "loss": 0.7347, + "step": 730 + }, + { + "epoch": 0.21399297423887587, + "grad_norm": 1.0234369039535522, + "learning_rate": 4.9882012346234e-06, + "loss": 0.7186, + "step": 731 + }, + { + "epoch": 0.21428571428571427, + "grad_norm": 1.0170878171920776, + "learning_rate": 4.988163837513106e-06, + "loss": 0.7214, + "step": 732 + }, + { + "epoch": 0.21457845433255268, + "grad_norm": 1.023919701576233, + "learning_rate": 4.988126381370647e-06, + "loss": 0.7349, + "step": 733 + }, + { + "epoch": 0.2148711943793911, + "grad_norm": 1.0037362575531006, + "learning_rate": 4.988088866196911e-06, + "loss": 0.7318, + "step": 734 + }, + { + "epoch": 0.2151639344262295, + "grad_norm": 0.9858283400535583, + "learning_rate": 4.988051291992789e-06, + "loss": 0.7118, + "step": 735 + }, + { + "epoch": 0.2154566744730679, + "grad_norm": 1.1656187772750854, + "learning_rate": 4.9880136587591725e-06, + "loss": 0.7634, + "step": 736 + }, + { + "epoch": 0.2157494145199063, + "grad_norm": 1.0331271886825562, + "learning_rate": 4.987975966496954e-06, + "loss": 0.754, + "step": 737 + }, + { + "epoch": 0.21604215456674472, + "grad_norm": 1.1061828136444092, + "learning_rate": 4.987938215207027e-06, + "loss": 0.7548, + "step": 738 + }, + { + "epoch": 0.21633489461358313, + "grad_norm": 1.0138111114501953, + "learning_rate": 4.987900404890288e-06, + "loss": 0.6928, + "step": 739 + }, + { + "epoch": 0.21662763466042154, + "grad_norm": 1.0242236852645874, + "learning_rate": 4.987862535547634e-06, + "loss": 0.7434, + "step": 740 + }, + { + "epoch": 0.21692037470725994, + "grad_norm": 1.0436114072799683, + "learning_rate": 4.987824607179964e-06, + "loss": 0.777, + "step": 741 + }, + { + "epoch": 0.21721311475409835, + "grad_norm": 1.0113728046417236, + "learning_rate": 4.987786619788177e-06, + "loss": 0.7301, + "step": 742 + }, + { + "epoch": 0.21750585480093676, + "grad_norm": 1.039515495300293, + "learning_rate": 4.9877485733731735e-06, + "loss": 0.7516, + "step": 743 + }, + { + "epoch": 0.21779859484777517, + "grad_norm": 1.016727089881897, + "learning_rate": 4.9877104679358576e-06, + "loss": 0.6741, + "step": 744 + }, + { + "epoch": 0.21809133489461358, + "grad_norm": 0.9795830249786377, + "learning_rate": 4.987672303477133e-06, + "loss": 0.7435, + "step": 745 + }, + { + "epoch": 0.21838407494145198, + "grad_norm": 1.1297931671142578, + "learning_rate": 4.9876340799979056e-06, + "loss": 0.732, + "step": 746 + }, + { + "epoch": 0.2186768149882904, + "grad_norm": 1.1076101064682007, + "learning_rate": 4.987595797499081e-06, + "loss": 0.7207, + "step": 747 + }, + { + "epoch": 0.2189695550351288, + "grad_norm": 1.089637041091919, + "learning_rate": 4.987557455981569e-06, + "loss": 0.7257, + "step": 748 + }, + { + "epoch": 0.2192622950819672, + "grad_norm": 1.0447978973388672, + "learning_rate": 4.987519055446277e-06, + "loss": 0.7953, + "step": 749 + }, + { + "epoch": 0.21955503512880561, + "grad_norm": 0.9963814616203308, + "learning_rate": 4.987480595894119e-06, + "loss": 0.7376, + "step": 750 + }, + { + "epoch": 0.21984777517564402, + "grad_norm": 1.072474718093872, + "learning_rate": 4.987442077326006e-06, + "loss": 0.7082, + "step": 751 + }, + { + "epoch": 0.22014051522248243, + "grad_norm": 1.0968236923217773, + "learning_rate": 4.987403499742851e-06, + "loss": 0.7443, + "step": 752 + }, + { + "epoch": 0.22043325526932084, + "grad_norm": 0.9804752469062805, + "learning_rate": 4.98736486314557e-06, + "loss": 0.753, + "step": 753 + }, + { + "epoch": 0.22072599531615925, + "grad_norm": 0.9773231744766235, + "learning_rate": 4.98732616753508e-06, + "loss": 0.7039, + "step": 754 + }, + { + "epoch": 0.22101873536299765, + "grad_norm": 0.9917612671852112, + "learning_rate": 4.9872874129122995e-06, + "loss": 0.6856, + "step": 755 + }, + { + "epoch": 0.22131147540983606, + "grad_norm": 0.9919469952583313, + "learning_rate": 4.987248599278146e-06, + "loss": 0.7066, + "step": 756 + }, + { + "epoch": 0.22160421545667447, + "grad_norm": 1.027259349822998, + "learning_rate": 4.987209726633543e-06, + "loss": 0.7223, + "step": 757 + }, + { + "epoch": 0.22189695550351288, + "grad_norm": 1.00771963596344, + "learning_rate": 4.98717079497941e-06, + "loss": 0.7616, + "step": 758 + }, + { + "epoch": 0.22218969555035128, + "grad_norm": 1.0364511013031006, + "learning_rate": 4.987131804316672e-06, + "loss": 0.7384, + "step": 759 + }, + { + "epoch": 0.2224824355971897, + "grad_norm": 1.0453704595565796, + "learning_rate": 4.9870927546462555e-06, + "loss": 0.7283, + "step": 760 + }, + { + "epoch": 0.2227751756440281, + "grad_norm": 1.0185705423355103, + "learning_rate": 4.987053645969086e-06, + "loss": 0.7316, + "step": 761 + }, + { + "epoch": 0.2230679156908665, + "grad_norm": 1.0153050422668457, + "learning_rate": 4.987014478286089e-06, + "loss": 0.7582, + "step": 762 + }, + { + "epoch": 0.22336065573770492, + "grad_norm": 1.2217068672180176, + "learning_rate": 4.986975251598196e-06, + "loss": 0.7389, + "step": 763 + }, + { + "epoch": 0.22365339578454332, + "grad_norm": 1.0225616693496704, + "learning_rate": 4.986935965906338e-06, + "loss": 0.7285, + "step": 764 + }, + { + "epoch": 0.22394613583138173, + "grad_norm": 1.0044368505477905, + "learning_rate": 4.9868966212114465e-06, + "loss": 0.6903, + "step": 765 + }, + { + "epoch": 0.22423887587822014, + "grad_norm": 1.0100784301757812, + "learning_rate": 4.9868572175144545e-06, + "loss": 0.7342, + "step": 766 + }, + { + "epoch": 0.22453161592505855, + "grad_norm": 1.0012294054031372, + "learning_rate": 4.986817754816298e-06, + "loss": 0.6779, + "step": 767 + }, + { + "epoch": 0.22482435597189696, + "grad_norm": 1.068459153175354, + "learning_rate": 4.986778233117912e-06, + "loss": 0.7211, + "step": 768 + }, + { + "epoch": 0.22511709601873536, + "grad_norm": 1.0475529432296753, + "learning_rate": 4.986738652420234e-06, + "loss": 0.7536, + "step": 769 + }, + { + "epoch": 0.22540983606557377, + "grad_norm": 1.0845870971679688, + "learning_rate": 4.986699012724204e-06, + "loss": 0.7509, + "step": 770 + }, + { + "epoch": 0.22570257611241218, + "grad_norm": 0.9745630025863647, + "learning_rate": 4.986659314030763e-06, + "loss": 0.7238, + "step": 771 + }, + { + "epoch": 0.2259953161592506, + "grad_norm": 1.0196101665496826, + "learning_rate": 4.986619556340851e-06, + "loss": 0.7884, + "step": 772 + }, + { + "epoch": 0.226288056206089, + "grad_norm": 1.003771185874939, + "learning_rate": 4.986579739655413e-06, + "loss": 0.7307, + "step": 773 + }, + { + "epoch": 0.2265807962529274, + "grad_norm": 1.0868394374847412, + "learning_rate": 4.986539863975392e-06, + "loss": 0.7303, + "step": 774 + }, + { + "epoch": 0.2268735362997658, + "grad_norm": 1.0192327499389648, + "learning_rate": 4.9864999293017365e-06, + "loss": 0.7483, + "step": 775 + }, + { + "epoch": 0.22716627634660422, + "grad_norm": 0.9710105061531067, + "learning_rate": 4.986459935635391e-06, + "loss": 0.743, + "step": 776 + }, + { + "epoch": 0.22745901639344263, + "grad_norm": 0.986169695854187, + "learning_rate": 4.986419882977307e-06, + "loss": 0.7338, + "step": 777 + }, + { + "epoch": 0.22775175644028103, + "grad_norm": 1.0134469270706177, + "learning_rate": 4.9863797713284334e-06, + "loss": 0.7552, + "step": 778 + }, + { + "epoch": 0.22804449648711944, + "grad_norm": 0.9807611703872681, + "learning_rate": 4.986339600689722e-06, + "loss": 0.711, + "step": 779 + }, + { + "epoch": 0.22833723653395785, + "grad_norm": 1.0080280303955078, + "learning_rate": 4.986299371062125e-06, + "loss": 0.7223, + "step": 780 + }, + { + "epoch": 0.22862997658079626, + "grad_norm": 1.006966471672058, + "learning_rate": 4.9862590824465986e-06, + "loss": 0.749, + "step": 781 + }, + { + "epoch": 0.22892271662763466, + "grad_norm": 1.083571195602417, + "learning_rate": 4.986218734844097e-06, + "loss": 0.704, + "step": 782 + }, + { + "epoch": 0.22921545667447307, + "grad_norm": 1.1143665313720703, + "learning_rate": 4.9861783282555794e-06, + "loss": 0.7433, + "step": 783 + }, + { + "epoch": 0.22950819672131148, + "grad_norm": 0.9748536348342896, + "learning_rate": 4.986137862682003e-06, + "loss": 0.6776, + "step": 784 + }, + { + "epoch": 0.2298009367681499, + "grad_norm": 1.033311367034912, + "learning_rate": 4.986097338124327e-06, + "loss": 0.6898, + "step": 785 + }, + { + "epoch": 0.2300936768149883, + "grad_norm": 1.2170376777648926, + "learning_rate": 4.986056754583516e-06, + "loss": 0.7495, + "step": 786 + }, + { + "epoch": 0.2303864168618267, + "grad_norm": 1.1159372329711914, + "learning_rate": 4.986016112060529e-06, + "loss": 0.7446, + "step": 787 + }, + { + "epoch": 0.2306791569086651, + "grad_norm": 1.0482879877090454, + "learning_rate": 4.985975410556333e-06, + "loss": 0.7172, + "step": 788 + }, + { + "epoch": 0.23097189695550352, + "grad_norm": 0.9813310503959656, + "learning_rate": 4.985934650071893e-06, + "loss": 0.706, + "step": 789 + }, + { + "epoch": 0.23126463700234193, + "grad_norm": 0.9958032369613647, + "learning_rate": 4.985893830608175e-06, + "loss": 0.7519, + "step": 790 + }, + { + "epoch": 0.23155737704918034, + "grad_norm": 0.98252934217453, + "learning_rate": 4.985852952166149e-06, + "loss": 0.6922, + "step": 791 + }, + { + "epoch": 0.23185011709601874, + "grad_norm": 0.9770300388336182, + "learning_rate": 4.985812014746784e-06, + "loss": 0.7065, + "step": 792 + }, + { + "epoch": 0.23214285714285715, + "grad_norm": 0.9632241129875183, + "learning_rate": 4.985771018351051e-06, + "loss": 0.6928, + "step": 793 + }, + { + "epoch": 0.23243559718969556, + "grad_norm": 0.9974339604377747, + "learning_rate": 4.9857299629799235e-06, + "loss": 0.724, + "step": 794 + }, + { + "epoch": 0.23272833723653397, + "grad_norm": 0.9990900754928589, + "learning_rate": 4.985688848634376e-06, + "loss": 0.6915, + "step": 795 + }, + { + "epoch": 0.23302107728337237, + "grad_norm": 1.0349503755569458, + "learning_rate": 4.985647675315381e-06, + "loss": 0.6716, + "step": 796 + }, + { + "epoch": 0.23331381733021078, + "grad_norm": 1.2694240808486938, + "learning_rate": 4.9856064430239185e-06, + "loss": 0.6918, + "step": 797 + }, + { + "epoch": 0.2336065573770492, + "grad_norm": 1.0616333484649658, + "learning_rate": 4.985565151760966e-06, + "loss": 0.734, + "step": 798 + }, + { + "epoch": 0.2338992974238876, + "grad_norm": 0.992032527923584, + "learning_rate": 4.985523801527502e-06, + "loss": 0.7341, + "step": 799 + }, + { + "epoch": 0.234192037470726, + "grad_norm": 1.1033542156219482, + "learning_rate": 4.985482392324508e-06, + "loss": 0.6914, + "step": 800 + }, + { + "epoch": 0.2344847775175644, + "grad_norm": 1.0479737520217896, + "learning_rate": 4.985440924152967e-06, + "loss": 0.7697, + "step": 801 + }, + { + "epoch": 0.23477751756440282, + "grad_norm": 1.0023902654647827, + "learning_rate": 4.9853993970138625e-06, + "loss": 0.6947, + "step": 802 + }, + { + "epoch": 0.23507025761124123, + "grad_norm": 1.083855152130127, + "learning_rate": 4.98535781090818e-06, + "loss": 0.7565, + "step": 803 + }, + { + "epoch": 0.23536299765807964, + "grad_norm": 1.090889811515808, + "learning_rate": 4.985316165836906e-06, + "loss": 0.716, + "step": 804 + }, + { + "epoch": 0.23565573770491804, + "grad_norm": 1.0342276096343994, + "learning_rate": 4.985274461801029e-06, + "loss": 0.7195, + "step": 805 + }, + { + "epoch": 0.23594847775175645, + "grad_norm": 1.0038288831710815, + "learning_rate": 4.985232698801536e-06, + "loss": 0.731, + "step": 806 + }, + { + "epoch": 0.23624121779859486, + "grad_norm": 1.014167308807373, + "learning_rate": 4.985190876839422e-06, + "loss": 0.7725, + "step": 807 + }, + { + "epoch": 0.23653395784543327, + "grad_norm": 1.0489351749420166, + "learning_rate": 4.985148995915676e-06, + "loss": 0.7329, + "step": 808 + }, + { + "epoch": 0.23682669789227168, + "grad_norm": 1.057259440422058, + "learning_rate": 4.985107056031293e-06, + "loss": 0.6733, + "step": 809 + }, + { + "epoch": 0.23711943793911008, + "grad_norm": 0.9972809553146362, + "learning_rate": 4.985065057187267e-06, + "loss": 0.761, + "step": 810 + }, + { + "epoch": 0.2374121779859485, + "grad_norm": 1.0193086862564087, + "learning_rate": 4.985022999384596e-06, + "loss": 0.7755, + "step": 811 + }, + { + "epoch": 0.23770491803278687, + "grad_norm": 1.0520013570785522, + "learning_rate": 4.9849808826242764e-06, + "loss": 0.7786, + "step": 812 + }, + { + "epoch": 0.23799765807962528, + "grad_norm": 1.0040655136108398, + "learning_rate": 4.984938706907308e-06, + "loss": 0.7294, + "step": 813 + }, + { + "epoch": 0.2382903981264637, + "grad_norm": 1.0017849206924438, + "learning_rate": 4.98489647223469e-06, + "loss": 0.7155, + "step": 814 + }, + { + "epoch": 0.2385831381733021, + "grad_norm": 1.0524656772613525, + "learning_rate": 4.9848541786074276e-06, + "loss": 0.6945, + "step": 815 + }, + { + "epoch": 0.2388758782201405, + "grad_norm": 0.9755221605300903, + "learning_rate": 4.984811826026522e-06, + "loss": 0.7219, + "step": 816 + }, + { + "epoch": 0.2391686182669789, + "grad_norm": 1.0630000829696655, + "learning_rate": 4.984769414492978e-06, + "loss": 0.7246, + "step": 817 + }, + { + "epoch": 0.23946135831381732, + "grad_norm": 1.0156902074813843, + "learning_rate": 4.984726944007803e-06, + "loss": 0.7429, + "step": 818 + }, + { + "epoch": 0.23975409836065573, + "grad_norm": 0.9594588875770569, + "learning_rate": 4.984684414572003e-06, + "loss": 0.6765, + "step": 819 + }, + { + "epoch": 0.24004683840749413, + "grad_norm": 1.0005369186401367, + "learning_rate": 4.984641826186589e-06, + "loss": 0.7114, + "step": 820 + }, + { + "epoch": 0.24033957845433254, + "grad_norm": 1.0562324523925781, + "learning_rate": 4.984599178852569e-06, + "loss": 0.7532, + "step": 821 + }, + { + "epoch": 0.24063231850117095, + "grad_norm": 1.0028666257858276, + "learning_rate": 4.984556472570957e-06, + "loss": 0.7141, + "step": 822 + }, + { + "epoch": 0.24092505854800936, + "grad_norm": 1.0038100481033325, + "learning_rate": 4.984513707342766e-06, + "loss": 0.7125, + "step": 823 + }, + { + "epoch": 0.24121779859484777, + "grad_norm": 0.9941343069076538, + "learning_rate": 4.984470883169008e-06, + "loss": 0.698, + "step": 824 + }, + { + "epoch": 0.24151053864168617, + "grad_norm": 1.015010952949524, + "learning_rate": 4.9844280000507026e-06, + "loss": 0.7098, + "step": 825 + }, + { + "epoch": 0.24180327868852458, + "grad_norm": 1.0542465448379517, + "learning_rate": 4.984385057988866e-06, + "loss": 0.7166, + "step": 826 + }, + { + "epoch": 0.242096018735363, + "grad_norm": 1.1602016687393188, + "learning_rate": 4.984342056984515e-06, + "loss": 0.7058, + "step": 827 + }, + { + "epoch": 0.2423887587822014, + "grad_norm": 1.0496536493301392, + "learning_rate": 4.984298997038672e-06, + "loss": 0.7266, + "step": 828 + }, + { + "epoch": 0.2426814988290398, + "grad_norm": 1.053364634513855, + "learning_rate": 4.9842558781523585e-06, + "loss": 0.7352, + "step": 829 + }, + { + "epoch": 0.2429742388758782, + "grad_norm": 1.0354281663894653, + "learning_rate": 4.984212700326596e-06, + "loss": 0.7118, + "step": 830 + }, + { + "epoch": 0.24326697892271662, + "grad_norm": 1.0467826128005981, + "learning_rate": 4.984169463562411e-06, + "loss": 0.7609, + "step": 831 + }, + { + "epoch": 0.24355971896955503, + "grad_norm": 1.017685890197754, + "learning_rate": 4.984126167860828e-06, + "loss": 0.6992, + "step": 832 + }, + { + "epoch": 0.24385245901639344, + "grad_norm": 1.0735857486724854, + "learning_rate": 4.9840828132228735e-06, + "loss": 0.7755, + "step": 833 + }, + { + "epoch": 0.24414519906323184, + "grad_norm": 1.048542857170105, + "learning_rate": 4.984039399649578e-06, + "loss": 0.7168, + "step": 834 + }, + { + "epoch": 0.24443793911007025, + "grad_norm": 1.0000165700912476, + "learning_rate": 4.9839959271419706e-06, + "loss": 0.7377, + "step": 835 + }, + { + "epoch": 0.24473067915690866, + "grad_norm": 1.1005804538726807, + "learning_rate": 4.983952395701082e-06, + "loss": 0.714, + "step": 836 + }, + { + "epoch": 0.24502341920374707, + "grad_norm": 1.0941208600997925, + "learning_rate": 4.983908805327945e-06, + "loss": 0.7416, + "step": 837 + }, + { + "epoch": 0.24531615925058547, + "grad_norm": 1.0661135911941528, + "learning_rate": 4.983865156023595e-06, + "loss": 0.7632, + "step": 838 + }, + { + "epoch": 0.24560889929742388, + "grad_norm": 1.0207784175872803, + "learning_rate": 4.983821447789068e-06, + "loss": 0.6823, + "step": 839 + }, + { + "epoch": 0.2459016393442623, + "grad_norm": 1.0421547889709473, + "learning_rate": 4.983777680625398e-06, + "loss": 0.7704, + "step": 840 + }, + { + "epoch": 0.2461943793911007, + "grad_norm": 0.9706763625144958, + "learning_rate": 4.983733854533627e-06, + "loss": 0.7042, + "step": 841 + }, + { + "epoch": 0.2464871194379391, + "grad_norm": 1.0740224123001099, + "learning_rate": 4.9836899695147925e-06, + "loss": 0.6995, + "step": 842 + }, + { + "epoch": 0.24677985948477751, + "grad_norm": 1.2029122114181519, + "learning_rate": 4.983646025569937e-06, + "loss": 0.7557, + "step": 843 + }, + { + "epoch": 0.24707259953161592, + "grad_norm": 1.018723964691162, + "learning_rate": 4.9836020227001015e-06, + "loss": 0.6809, + "step": 844 + }, + { + "epoch": 0.24736533957845433, + "grad_norm": 1.098447561264038, + "learning_rate": 4.98355796090633e-06, + "loss": 0.722, + "step": 845 + }, + { + "epoch": 0.24765807962529274, + "grad_norm": 1.1119989156723022, + "learning_rate": 4.98351384018967e-06, + "loss": 0.6858, + "step": 846 + }, + { + "epoch": 0.24795081967213115, + "grad_norm": 1.0163623094558716, + "learning_rate": 4.983469660551167e-06, + "loss": 0.6833, + "step": 847 + }, + { + "epoch": 0.24824355971896955, + "grad_norm": 1.0496872663497925, + "learning_rate": 4.983425421991869e-06, + "loss": 0.705, + "step": 848 + }, + { + "epoch": 0.24853629976580796, + "grad_norm": 0.9982650876045227, + "learning_rate": 4.983381124512826e-06, + "loss": 0.7351, + "step": 849 + }, + { + "epoch": 0.24882903981264637, + "grad_norm": 1.0254333019256592, + "learning_rate": 4.983336768115089e-06, + "loss": 0.7901, + "step": 850 + }, + { + "epoch": 0.24912177985948478, + "grad_norm": 1.013558268547058, + "learning_rate": 4.983292352799709e-06, + "loss": 0.6876, + "step": 851 + }, + { + "epoch": 0.24941451990632318, + "grad_norm": 0.906428873538971, + "learning_rate": 4.983247878567741e-06, + "loss": 0.6675, + "step": 852 + }, + { + "epoch": 0.2497072599531616, + "grad_norm": 0.9919475317001343, + "learning_rate": 4.983203345420241e-06, + "loss": 0.7548, + "step": 853 + }, + { + "epoch": 0.25, + "grad_norm": 1.0635743141174316, + "learning_rate": 4.983158753358263e-06, + "loss": 0.7, + "step": 854 + }, + { + "epoch": 0.2502927400468384, + "grad_norm": 0.9723818898200989, + "learning_rate": 4.983114102382868e-06, + "loss": 0.7486, + "step": 855 + }, + { + "epoch": 0.2505854800936768, + "grad_norm": 0.9967241883277893, + "learning_rate": 4.983069392495113e-06, + "loss": 0.6945, + "step": 856 + }, + { + "epoch": 0.2508782201405152, + "grad_norm": 0.9693993330001831, + "learning_rate": 4.98302462369606e-06, + "loss": 0.6931, + "step": 857 + }, + { + "epoch": 0.25117096018735363, + "grad_norm": 1.1494483947753906, + "learning_rate": 4.9829797959867706e-06, + "loss": 0.7382, + "step": 858 + }, + { + "epoch": 0.251463700234192, + "grad_norm": 1.039512276649475, + "learning_rate": 4.982934909368309e-06, + "loss": 0.6979, + "step": 859 + }, + { + "epoch": 0.25175644028103045, + "grad_norm": 1.0488133430480957, + "learning_rate": 4.9828899638417384e-06, + "loss": 0.7356, + "step": 860 + }, + { + "epoch": 0.2520491803278688, + "grad_norm": 1.05567467212677, + "learning_rate": 4.982844959408128e-06, + "loss": 0.7307, + "step": 861 + }, + { + "epoch": 0.25234192037470726, + "grad_norm": 0.9795578718185425, + "learning_rate": 4.982799896068543e-06, + "loss": 0.7089, + "step": 862 + }, + { + "epoch": 0.25263466042154564, + "grad_norm": 1.027922511100769, + "learning_rate": 4.982754773824053e-06, + "loss": 0.7196, + "step": 863 + }, + { + "epoch": 0.2529274004683841, + "grad_norm": 1.0246983766555786, + "learning_rate": 4.98270959267573e-06, + "loss": 0.7599, + "step": 864 + }, + { + "epoch": 0.25322014051522246, + "grad_norm": 1.0974235534667969, + "learning_rate": 4.9826643526246445e-06, + "loss": 0.7686, + "step": 865 + }, + { + "epoch": 0.2535128805620609, + "grad_norm": 1.0227991342544556, + "learning_rate": 4.98261905367187e-06, + "loss": 0.6583, + "step": 866 + }, + { + "epoch": 0.2538056206088993, + "grad_norm": 1.012539029121399, + "learning_rate": 4.982573695818482e-06, + "loss": 0.7158, + "step": 867 + }, + { + "epoch": 0.2540983606557377, + "grad_norm": 1.0143994092941284, + "learning_rate": 4.982528279065556e-06, + "loss": 0.7361, + "step": 868 + }, + { + "epoch": 0.2543911007025761, + "grad_norm": 1.0902328491210938, + "learning_rate": 4.982482803414169e-06, + "loss": 0.7439, + "step": 869 + }, + { + "epoch": 0.2546838407494145, + "grad_norm": 1.038634181022644, + "learning_rate": 4.982437268865401e-06, + "loss": 0.7495, + "step": 870 + }, + { + "epoch": 0.2549765807962529, + "grad_norm": 1.0043364763259888, + "learning_rate": 4.982391675420332e-06, + "loss": 0.6854, + "step": 871 + }, + { + "epoch": 0.25526932084309134, + "grad_norm": 1.0429935455322266, + "learning_rate": 4.982346023080043e-06, + "loss": 0.7595, + "step": 872 + }, + { + "epoch": 0.2555620608899297, + "grad_norm": 1.0881071090698242, + "learning_rate": 4.982300311845619e-06, + "loss": 0.7074, + "step": 873 + }, + { + "epoch": 0.25585480093676816, + "grad_norm": 1.051313042640686, + "learning_rate": 4.982254541718142e-06, + "loss": 0.7352, + "step": 874 + }, + { + "epoch": 0.25614754098360654, + "grad_norm": 1.0183985233306885, + "learning_rate": 4.9822087126986994e-06, + "loss": 0.7188, + "step": 875 + }, + { + "epoch": 0.25644028103044497, + "grad_norm": 1.0575062036514282, + "learning_rate": 4.9821628247883795e-06, + "loss": 0.7595, + "step": 876 + }, + { + "epoch": 0.25673302107728335, + "grad_norm": 1.075734257698059, + "learning_rate": 4.9821168779882675e-06, + "loss": 0.6909, + "step": 877 + }, + { + "epoch": 0.2570257611241218, + "grad_norm": 1.0135852098464966, + "learning_rate": 4.982070872299457e-06, + "loss": 0.7022, + "step": 878 + }, + { + "epoch": 0.25731850117096017, + "grad_norm": 0.9765479564666748, + "learning_rate": 4.982024807723038e-06, + "loss": 0.7231, + "step": 879 + }, + { + "epoch": 0.2576112412177986, + "grad_norm": 1.0298668146133423, + "learning_rate": 4.9819786842601036e-06, + "loss": 0.6952, + "step": 880 + }, + { + "epoch": 0.257903981264637, + "grad_norm": 1.0270487070083618, + "learning_rate": 4.981932501911748e-06, + "loss": 0.7429, + "step": 881 + }, + { + "epoch": 0.2581967213114754, + "grad_norm": 1.0679138898849487, + "learning_rate": 4.9818862606790666e-06, + "loss": 0.7345, + "step": 882 + }, + { + "epoch": 0.2584894613583138, + "grad_norm": 1.1373153924942017, + "learning_rate": 4.981839960563157e-06, + "loss": 0.7296, + "step": 883 + }, + { + "epoch": 0.25878220140515223, + "grad_norm": 1.0389937162399292, + "learning_rate": 4.981793601565118e-06, + "loss": 0.713, + "step": 884 + }, + { + "epoch": 0.2590749414519906, + "grad_norm": 1.0050194263458252, + "learning_rate": 4.981747183686049e-06, + "loss": 0.7256, + "step": 885 + }, + { + "epoch": 0.25936768149882905, + "grad_norm": 1.0876712799072266, + "learning_rate": 4.98170070692705e-06, + "loss": 0.7461, + "step": 886 + }, + { + "epoch": 0.25966042154566743, + "grad_norm": 1.065070629119873, + "learning_rate": 4.981654171289226e-06, + "loss": 0.6936, + "step": 887 + }, + { + "epoch": 0.25995316159250587, + "grad_norm": 1.021506428718567, + "learning_rate": 4.98160757677368e-06, + "loss": 0.6321, + "step": 888 + }, + { + "epoch": 0.26024590163934425, + "grad_norm": 1.0057860612869263, + "learning_rate": 4.981560923381518e-06, + "loss": 0.678, + "step": 889 + }, + { + "epoch": 0.2605386416861827, + "grad_norm": 1.014624834060669, + "learning_rate": 4.981514211113845e-06, + "loss": 0.694, + "step": 890 + }, + { + "epoch": 0.26083138173302106, + "grad_norm": 1.0676417350769043, + "learning_rate": 4.981467439971772e-06, + "loss": 0.7176, + "step": 891 + }, + { + "epoch": 0.2611241217798595, + "grad_norm": 1.005433440208435, + "learning_rate": 4.981420609956406e-06, + "loss": 0.6949, + "step": 892 + }, + { + "epoch": 0.2614168618266979, + "grad_norm": 1.079632043838501, + "learning_rate": 4.98137372106886e-06, + "loss": 0.7474, + "step": 893 + }, + { + "epoch": 0.2617096018735363, + "grad_norm": 0.9780515432357788, + "learning_rate": 4.981326773310246e-06, + "loss": 0.7155, + "step": 894 + }, + { + "epoch": 0.2620023419203747, + "grad_norm": 1.026510238647461, + "learning_rate": 4.981279766681677e-06, + "loss": 0.7636, + "step": 895 + }, + { + "epoch": 0.26229508196721313, + "grad_norm": 0.9955527782440186, + "learning_rate": 4.981232701184268e-06, + "loss": 0.7327, + "step": 896 + }, + { + "epoch": 0.2625878220140515, + "grad_norm": 1.0813243389129639, + "learning_rate": 4.981185576819138e-06, + "loss": 0.7183, + "step": 897 + }, + { + "epoch": 0.26288056206088994, + "grad_norm": 1.039019227027893, + "learning_rate": 4.981138393587403e-06, + "loss": 0.7326, + "step": 898 + }, + { + "epoch": 0.2631733021077283, + "grad_norm": 1.0069769620895386, + "learning_rate": 4.9810911514901824e-06, + "loss": 0.6536, + "step": 899 + }, + { + "epoch": 0.26346604215456676, + "grad_norm": 1.0454398393630981, + "learning_rate": 4.981043850528598e-06, + "loss": 0.7214, + "step": 900 + }, + { + "epoch": 0.26375878220140514, + "grad_norm": 1.0102931261062622, + "learning_rate": 4.980996490703773e-06, + "loss": 0.7274, + "step": 901 + }, + { + "epoch": 0.2640515222482436, + "grad_norm": 1.0385116338729858, + "learning_rate": 4.980949072016827e-06, + "loss": 0.7492, + "step": 902 + }, + { + "epoch": 0.26434426229508196, + "grad_norm": 1.087937831878662, + "learning_rate": 4.980901594468889e-06, + "loss": 0.7226, + "step": 903 + }, + { + "epoch": 0.2646370023419204, + "grad_norm": 1.043847918510437, + "learning_rate": 4.980854058061085e-06, + "loss": 0.735, + "step": 904 + }, + { + "epoch": 0.26492974238875877, + "grad_norm": 1.032575011253357, + "learning_rate": 4.9808064627945395e-06, + "loss": 0.6968, + "step": 905 + }, + { + "epoch": 0.2652224824355972, + "grad_norm": 0.993313729763031, + "learning_rate": 4.980758808670385e-06, + "loss": 0.7098, + "step": 906 + }, + { + "epoch": 0.2655152224824356, + "grad_norm": 0.9864101409912109, + "learning_rate": 4.980711095689752e-06, + "loss": 0.6805, + "step": 907 + }, + { + "epoch": 0.265807962529274, + "grad_norm": 1.1427743434906006, + "learning_rate": 4.980663323853771e-06, + "loss": 0.6956, + "step": 908 + }, + { + "epoch": 0.2661007025761124, + "grad_norm": 1.0470517873764038, + "learning_rate": 4.980615493163576e-06, + "loss": 0.7271, + "step": 909 + }, + { + "epoch": 0.26639344262295084, + "grad_norm": 0.9880208373069763, + "learning_rate": 4.980567603620302e-06, + "loss": 0.7213, + "step": 910 + }, + { + "epoch": 0.2666861826697892, + "grad_norm": 1.0385977029800415, + "learning_rate": 4.980519655225085e-06, + "loss": 0.7401, + "step": 911 + }, + { + "epoch": 0.26697892271662765, + "grad_norm": 1.1759341955184937, + "learning_rate": 4.980471647979063e-06, + "loss": 0.7636, + "step": 912 + }, + { + "epoch": 0.26727166276346603, + "grad_norm": 1.0396438837051392, + "learning_rate": 4.9804235818833746e-06, + "loss": 0.7346, + "step": 913 + }, + { + "epoch": 0.26756440281030447, + "grad_norm": 1.0569101572036743, + "learning_rate": 4.980375456939159e-06, + "loss": 0.7309, + "step": 914 + }, + { + "epoch": 0.26785714285714285, + "grad_norm": 1.0514681339263916, + "learning_rate": 4.9803272731475605e-06, + "loss": 0.7262, + "step": 915 + }, + { + "epoch": 0.2681498829039813, + "grad_norm": 1.011803388595581, + "learning_rate": 4.980279030509721e-06, + "loss": 0.7011, + "step": 916 + }, + { + "epoch": 0.26844262295081966, + "grad_norm": 1.0539807081222534, + "learning_rate": 4.980230729026783e-06, + "loss": 0.7355, + "step": 917 + }, + { + "epoch": 0.2687353629976581, + "grad_norm": 1.0268906354904175, + "learning_rate": 4.980182368699896e-06, + "loss": 0.7284, + "step": 918 + }, + { + "epoch": 0.2690281030444965, + "grad_norm": 1.0481562614440918, + "learning_rate": 4.9801339495302056e-06, + "loss": 0.7172, + "step": 919 + }, + { + "epoch": 0.2693208430913349, + "grad_norm": 1.131388783454895, + "learning_rate": 4.980085471518861e-06, + "loss": 0.7237, + "step": 920 + }, + { + "epoch": 0.2696135831381733, + "grad_norm": 1.0480324029922485, + "learning_rate": 4.980036934667012e-06, + "loss": 0.6961, + "step": 921 + }, + { + "epoch": 0.26990632318501173, + "grad_norm": 1.1077014207839966, + "learning_rate": 4.979988338975811e-06, + "loss": 0.703, + "step": 922 + }, + { + "epoch": 0.2701990632318501, + "grad_norm": 1.0433381795883179, + "learning_rate": 4.979939684446409e-06, + "loss": 0.7133, + "step": 923 + }, + { + "epoch": 0.27049180327868855, + "grad_norm": 0.9954860806465149, + "learning_rate": 4.9798909710799615e-06, + "loss": 0.6694, + "step": 924 + }, + { + "epoch": 0.2707845433255269, + "grad_norm": 0.9865260720252991, + "learning_rate": 4.9798421988776245e-06, + "loss": 0.7231, + "step": 925 + }, + { + "epoch": 0.27107728337236536, + "grad_norm": 1.0011913776397705, + "learning_rate": 4.979793367840555e-06, + "loss": 0.7306, + "step": 926 + }, + { + "epoch": 0.27137002341920374, + "grad_norm": 1.0542563199996948, + "learning_rate": 4.979744477969912e-06, + "loss": 0.7706, + "step": 927 + }, + { + "epoch": 0.2716627634660422, + "grad_norm": 1.0811022520065308, + "learning_rate": 4.9796955292668545e-06, + "loss": 0.6533, + "step": 928 + }, + { + "epoch": 0.27195550351288056, + "grad_norm": 1.0996493101119995, + "learning_rate": 4.979646521732544e-06, + "loss": 0.7033, + "step": 929 + }, + { + "epoch": 0.272248243559719, + "grad_norm": 1.0107835531234741, + "learning_rate": 4.979597455368143e-06, + "loss": 0.7349, + "step": 930 + }, + { + "epoch": 0.2725409836065574, + "grad_norm": 1.1094043254852295, + "learning_rate": 4.979548330174817e-06, + "loss": 0.7038, + "step": 931 + }, + { + "epoch": 0.2728337236533958, + "grad_norm": 1.1478793621063232, + "learning_rate": 4.979499146153729e-06, + "loss": 0.7356, + "step": 932 + }, + { + "epoch": 0.2731264637002342, + "grad_norm": 0.972846508026123, + "learning_rate": 4.979449903306049e-06, + "loss": 0.6838, + "step": 933 + }, + { + "epoch": 0.2734192037470726, + "grad_norm": 1.1048834323883057, + "learning_rate": 4.979400601632943e-06, + "loss": 0.7293, + "step": 934 + }, + { + "epoch": 0.273711943793911, + "grad_norm": 1.1530427932739258, + "learning_rate": 4.979351241135581e-06, + "loss": 0.7029, + "step": 935 + }, + { + "epoch": 0.27400468384074944, + "grad_norm": 1.0549235343933105, + "learning_rate": 4.979301821815134e-06, + "loss": 0.6772, + "step": 936 + }, + { + "epoch": 0.2742974238875878, + "grad_norm": 0.9755483269691467, + "learning_rate": 4.979252343672777e-06, + "loss": 0.6743, + "step": 937 + }, + { + "epoch": 0.27459016393442626, + "grad_norm": 1.056493878364563, + "learning_rate": 4.97920280670968e-06, + "loss": 0.7825, + "step": 938 + }, + { + "epoch": 0.27488290398126464, + "grad_norm": 1.0004432201385498, + "learning_rate": 4.979153210927021e-06, + "loss": 0.6638, + "step": 939 + }, + { + "epoch": 0.275175644028103, + "grad_norm": 1.0140432119369507, + "learning_rate": 4.979103556325975e-06, + "loss": 0.7366, + "step": 940 + }, + { + "epoch": 0.27546838407494145, + "grad_norm": 1.0502870082855225, + "learning_rate": 4.9790538429077215e-06, + "loss": 0.7111, + "step": 941 + }, + { + "epoch": 0.27576112412177983, + "grad_norm": 1.0574276447296143, + "learning_rate": 4.9790040706734385e-06, + "loss": 0.7437, + "step": 942 + }, + { + "epoch": 0.27605386416861827, + "grad_norm": 1.0560981035232544, + "learning_rate": 4.978954239624309e-06, + "loss": 0.7813, + "step": 943 + }, + { + "epoch": 0.27634660421545665, + "grad_norm": 1.2403614521026611, + "learning_rate": 4.978904349761513e-06, + "loss": 0.7495, + "step": 944 + }, + { + "epoch": 0.2766393442622951, + "grad_norm": 1.099726915359497, + "learning_rate": 4.978854401086235e-06, + "loss": 0.7797, + "step": 945 + }, + { + "epoch": 0.27693208430913346, + "grad_norm": 1.1164369583129883, + "learning_rate": 4.97880439359966e-06, + "loss": 0.6974, + "step": 946 + }, + { + "epoch": 0.2772248243559719, + "grad_norm": 1.095984697341919, + "learning_rate": 4.978754327302975e-06, + "loss": 0.7291, + "step": 947 + }, + { + "epoch": 0.2775175644028103, + "grad_norm": 1.1129194498062134, + "learning_rate": 4.978704202197367e-06, + "loss": 0.6915, + "step": 948 + }, + { + "epoch": 0.2778103044496487, + "grad_norm": 1.022239327430725, + "learning_rate": 4.9786540182840255e-06, + "loss": 0.7411, + "step": 949 + }, + { + "epoch": 0.2781030444964871, + "grad_norm": 1.0473504066467285, + "learning_rate": 4.978603775564142e-06, + "loss": 0.7362, + "step": 950 + }, + { + "epoch": 0.27839578454332553, + "grad_norm": 1.0407932996749878, + "learning_rate": 4.978553474038907e-06, + "loss": 0.7639, + "step": 951 + }, + { + "epoch": 0.2786885245901639, + "grad_norm": 1.160336971282959, + "learning_rate": 4.978503113709515e-06, + "loss": 0.7421, + "step": 952 + }, + { + "epoch": 0.27898126463700235, + "grad_norm": 1.047182559967041, + "learning_rate": 4.97845269457716e-06, + "loss": 0.7252, + "step": 953 + }, + { + "epoch": 0.2792740046838407, + "grad_norm": 1.0067484378814697, + "learning_rate": 4.978402216643039e-06, + "loss": 0.6724, + "step": 954 + }, + { + "epoch": 0.27956674473067916, + "grad_norm": 1.0446885824203491, + "learning_rate": 4.97835167990835e-06, + "loss": 0.7381, + "step": 955 + }, + { + "epoch": 0.27985948477751754, + "grad_norm": 1.043499231338501, + "learning_rate": 4.97830108437429e-06, + "loss": 0.733, + "step": 956 + }, + { + "epoch": 0.280152224824356, + "grad_norm": 1.0767308473587036, + "learning_rate": 4.9782504300420605e-06, + "loss": 0.7568, + "step": 957 + }, + { + "epoch": 0.28044496487119436, + "grad_norm": 1.0511515140533447, + "learning_rate": 4.978199716912865e-06, + "loss": 0.7244, + "step": 958 + }, + { + "epoch": 0.2807377049180328, + "grad_norm": 1.0743741989135742, + "learning_rate": 4.978148944987904e-06, + "loss": 0.7392, + "step": 959 + }, + { + "epoch": 0.2810304449648712, + "grad_norm": 1.0330288410186768, + "learning_rate": 4.978098114268383e-06, + "loss": 0.758, + "step": 960 + }, + { + "epoch": 0.2813231850117096, + "grad_norm": 0.9855220317840576, + "learning_rate": 4.978047224755508e-06, + "loss": 0.7061, + "step": 961 + }, + { + "epoch": 0.281615925058548, + "grad_norm": 1.0263782739639282, + "learning_rate": 4.977996276450488e-06, + "loss": 0.7105, + "step": 962 + }, + { + "epoch": 0.2819086651053864, + "grad_norm": 1.039804220199585, + "learning_rate": 4.977945269354528e-06, + "loss": 0.7111, + "step": 963 + }, + { + "epoch": 0.2822014051522248, + "grad_norm": 1.0168216228485107, + "learning_rate": 4.977894203468842e-06, + "loss": 0.7501, + "step": 964 + }, + { + "epoch": 0.28249414519906324, + "grad_norm": 1.0083826780319214, + "learning_rate": 4.9778430787946395e-06, + "loss": 0.6918, + "step": 965 + }, + { + "epoch": 0.2827868852459016, + "grad_norm": 0.9668732285499573, + "learning_rate": 4.977791895333135e-06, + "loss": 0.7028, + "step": 966 + }, + { + "epoch": 0.28307962529274006, + "grad_norm": 1.0255467891693115, + "learning_rate": 4.97774065308554e-06, + "loss": 0.7054, + "step": 967 + }, + { + "epoch": 0.28337236533957844, + "grad_norm": 1.0059702396392822, + "learning_rate": 4.977689352053073e-06, + "loss": 0.6948, + "step": 968 + }, + { + "epoch": 0.28366510538641687, + "grad_norm": 1.0699231624603271, + "learning_rate": 4.97763799223695e-06, + "loss": 0.7043, + "step": 969 + }, + { + "epoch": 0.28395784543325525, + "grad_norm": 1.0688396692276, + "learning_rate": 4.977586573638389e-06, + "loss": 0.724, + "step": 970 + }, + { + "epoch": 0.2842505854800937, + "grad_norm": 1.0575860738754272, + "learning_rate": 4.9775350962586115e-06, + "loss": 0.7481, + "step": 971 + }, + { + "epoch": 0.28454332552693207, + "grad_norm": 1.0306289196014404, + "learning_rate": 4.977483560098837e-06, + "loss": 0.7322, + "step": 972 + }, + { + "epoch": 0.2848360655737705, + "grad_norm": 1.076809048652649, + "learning_rate": 4.977431965160289e-06, + "loss": 0.7454, + "step": 973 + }, + { + "epoch": 0.2851288056206089, + "grad_norm": 1.0291054248809814, + "learning_rate": 4.977380311444192e-06, + "loss": 0.7159, + "step": 974 + }, + { + "epoch": 0.2854215456674473, + "grad_norm": 1.033155083656311, + "learning_rate": 4.9773285989517715e-06, + "loss": 0.7309, + "step": 975 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 1.001937985420227, + "learning_rate": 4.977276827684254e-06, + "loss": 0.6916, + "step": 976 + }, + { + "epoch": 0.28600702576112413, + "grad_norm": 1.123834490776062, + "learning_rate": 4.977224997642868e-06, + "loss": 0.7194, + "step": 977 + }, + { + "epoch": 0.2862997658079625, + "grad_norm": 1.058316946029663, + "learning_rate": 4.977173108828842e-06, + "loss": 0.7396, + "step": 978 + }, + { + "epoch": 0.28659250585480095, + "grad_norm": 1.0336657762527466, + "learning_rate": 4.977121161243409e-06, + "loss": 0.712, + "step": 979 + }, + { + "epoch": 0.28688524590163933, + "grad_norm": 1.045052409172058, + "learning_rate": 4.9770691548878005e-06, + "loss": 0.739, + "step": 980 + }, + { + "epoch": 0.28717798594847777, + "grad_norm": 1.0575480461120605, + "learning_rate": 4.97701708976325e-06, + "loss": 0.7425, + "step": 981 + }, + { + "epoch": 0.28747072599531615, + "grad_norm": 1.022293210029602, + "learning_rate": 4.976964965870993e-06, + "loss": 0.6967, + "step": 982 + }, + { + "epoch": 0.2877634660421546, + "grad_norm": 1.0912870168685913, + "learning_rate": 4.9769127832122665e-06, + "loss": 0.7224, + "step": 983 + }, + { + "epoch": 0.28805620608899296, + "grad_norm": 1.0017168521881104, + "learning_rate": 4.976860541788309e-06, + "loss": 0.7161, + "step": 984 + }, + { + "epoch": 0.2883489461358314, + "grad_norm": 1.0304137468338013, + "learning_rate": 4.976808241600358e-06, + "loss": 0.7102, + "step": 985 + }, + { + "epoch": 0.2886416861826698, + "grad_norm": 1.1085901260375977, + "learning_rate": 4.976755882649657e-06, + "loss": 0.7754, + "step": 986 + }, + { + "epoch": 0.2889344262295082, + "grad_norm": 1.0049340724945068, + "learning_rate": 4.976703464937446e-06, + "loss": 0.6751, + "step": 987 + }, + { + "epoch": 0.2892271662763466, + "grad_norm": 1.0335228443145752, + "learning_rate": 4.976650988464969e-06, + "loss": 0.7086, + "step": 988 + }, + { + "epoch": 0.28951990632318503, + "grad_norm": 1.0137214660644531, + "learning_rate": 4.976598453233472e-06, + "loss": 0.7156, + "step": 989 + }, + { + "epoch": 0.2898126463700234, + "grad_norm": 1.2428139448165894, + "learning_rate": 4.9765458592442014e-06, + "loss": 0.7531, + "step": 990 + }, + { + "epoch": 0.29010538641686184, + "grad_norm": 1.0587496757507324, + "learning_rate": 4.976493206498404e-06, + "loss": 0.7471, + "step": 991 + }, + { + "epoch": 0.2903981264637002, + "grad_norm": 1.0717357397079468, + "learning_rate": 4.97644049499733e-06, + "loss": 0.7085, + "step": 992 + }, + { + "epoch": 0.29069086651053866, + "grad_norm": 0.9393372535705566, + "learning_rate": 4.976387724742229e-06, + "loss": 0.6549, + "step": 993 + }, + { + "epoch": 0.29098360655737704, + "grad_norm": 1.0752443075180054, + "learning_rate": 4.976334895734353e-06, + "loss": 0.7035, + "step": 994 + }, + { + "epoch": 0.2912763466042155, + "grad_norm": 1.0273995399475098, + "learning_rate": 4.976282007974957e-06, + "loss": 0.7341, + "step": 995 + }, + { + "epoch": 0.29156908665105385, + "grad_norm": 1.073352575302124, + "learning_rate": 4.976229061465294e-06, + "loss": 0.7331, + "step": 996 + }, + { + "epoch": 0.2918618266978923, + "grad_norm": 1.2274487018585205, + "learning_rate": 4.97617605620662e-06, + "loss": 0.7241, + "step": 997 + }, + { + "epoch": 0.29215456674473067, + "grad_norm": 1.0392765998840332, + "learning_rate": 4.976122992200194e-06, + "loss": 0.6903, + "step": 998 + }, + { + "epoch": 0.2924473067915691, + "grad_norm": 1.1075812578201294, + "learning_rate": 4.9760698694472744e-06, + "loss": 0.7416, + "step": 999 + }, + { + "epoch": 0.2927400468384075, + "grad_norm": 1.050911784172058, + "learning_rate": 4.976016687949122e-06, + "loss": 0.7749, + "step": 1000 + }, + { + "epoch": 0.2930327868852459, + "grad_norm": 1.0740138292312622, + "learning_rate": 4.975963447706998e-06, + "loss": 0.7277, + "step": 1001 + }, + { + "epoch": 0.2933255269320843, + "grad_norm": 1.028006911277771, + "learning_rate": 4.975910148722164e-06, + "loss": 0.6976, + "step": 1002 + }, + { + "epoch": 0.29361826697892274, + "grad_norm": 1.027548909187317, + "learning_rate": 4.975856790995887e-06, + "loss": 0.7318, + "step": 1003 + }, + { + "epoch": 0.2939110070257611, + "grad_norm": 1.032153844833374, + "learning_rate": 4.9758033745294325e-06, + "loss": 0.6991, + "step": 1004 + }, + { + "epoch": 0.29420374707259955, + "grad_norm": 1.0472958087921143, + "learning_rate": 4.975749899324067e-06, + "loss": 0.7403, + "step": 1005 + }, + { + "epoch": 0.29449648711943793, + "grad_norm": 1.0511680841445923, + "learning_rate": 4.975696365381059e-06, + "loss": 0.706, + "step": 1006 + }, + { + "epoch": 0.29478922716627637, + "grad_norm": 1.037423014640808, + "learning_rate": 4.975642772701679e-06, + "loss": 0.7245, + "step": 1007 + }, + { + "epoch": 0.29508196721311475, + "grad_norm": 1.0041903257369995, + "learning_rate": 4.9755891212871985e-06, + "loss": 0.7036, + "step": 1008 + }, + { + "epoch": 0.2953747072599532, + "grad_norm": 1.0746440887451172, + "learning_rate": 4.975535411138891e-06, + "loss": 0.7477, + "step": 1009 + }, + { + "epoch": 0.29566744730679156, + "grad_norm": 1.0498912334442139, + "learning_rate": 4.97548164225803e-06, + "loss": 0.6923, + "step": 1010 + }, + { + "epoch": 0.29596018735363, + "grad_norm": 1.0496796369552612, + "learning_rate": 4.975427814645892e-06, + "loss": 0.7084, + "step": 1011 + }, + { + "epoch": 0.2962529274004684, + "grad_norm": 1.068353295326233, + "learning_rate": 4.975373928303753e-06, + "loss": 0.7028, + "step": 1012 + }, + { + "epoch": 0.2965456674473068, + "grad_norm": 1.037412405014038, + "learning_rate": 4.975319983232892e-06, + "loss": 0.6943, + "step": 1013 + }, + { + "epoch": 0.2968384074941452, + "grad_norm": 1.1391676664352417, + "learning_rate": 4.975265979434589e-06, + "loss": 0.7385, + "step": 1014 + }, + { + "epoch": 0.29713114754098363, + "grad_norm": 1.061945915222168, + "learning_rate": 4.975211916910126e-06, + "loss": 0.6929, + "step": 1015 + }, + { + "epoch": 0.297423887587822, + "grad_norm": 1.017655849456787, + "learning_rate": 4.975157795660784e-06, + "loss": 0.7242, + "step": 1016 + }, + { + "epoch": 0.29771662763466045, + "grad_norm": 1.091619610786438, + "learning_rate": 4.975103615687846e-06, + "loss": 0.7531, + "step": 1017 + }, + { + "epoch": 0.2980093676814988, + "grad_norm": 1.023508071899414, + "learning_rate": 4.975049376992601e-06, + "loss": 0.738, + "step": 1018 + }, + { + "epoch": 0.29830210772833726, + "grad_norm": 1.0277678966522217, + "learning_rate": 4.974995079576334e-06, + "loss": 0.7356, + "step": 1019 + }, + { + "epoch": 0.29859484777517564, + "grad_norm": 1.0197066068649292, + "learning_rate": 4.9749407234403335e-06, + "loss": 0.7166, + "step": 1020 + }, + { + "epoch": 0.2988875878220141, + "grad_norm": 1.0432167053222656, + "learning_rate": 4.974886308585888e-06, + "loss": 0.7181, + "step": 1021 + }, + { + "epoch": 0.29918032786885246, + "grad_norm": 1.0206791162490845, + "learning_rate": 4.97483183501429e-06, + "loss": 0.6374, + "step": 1022 + }, + { + "epoch": 0.2994730679156909, + "grad_norm": 0.9854914546012878, + "learning_rate": 4.974777302726831e-06, + "loss": 0.6801, + "step": 1023 + }, + { + "epoch": 0.2997658079625293, + "grad_norm": 1.0654867887496948, + "learning_rate": 4.974722711724804e-06, + "loss": 0.7482, + "step": 1024 + }, + { + "epoch": 0.30005854800936765, + "grad_norm": 1.104385495185852, + "learning_rate": 4.974668062009507e-06, + "loss": 0.7498, + "step": 1025 + }, + { + "epoch": 0.3003512880562061, + "grad_norm": 1.1250284910202026, + "learning_rate": 4.974613353582234e-06, + "loss": 0.7496, + "step": 1026 + }, + { + "epoch": 0.30064402810304447, + "grad_norm": 1.0134528875350952, + "learning_rate": 4.974558586444283e-06, + "loss": 0.7292, + "step": 1027 + }, + { + "epoch": 0.3009367681498829, + "grad_norm": 1.0368529558181763, + "learning_rate": 4.974503760596955e-06, + "loss": 0.6943, + "step": 1028 + }, + { + "epoch": 0.3012295081967213, + "grad_norm": 1.0062355995178223, + "learning_rate": 4.9744488760415495e-06, + "loss": 0.6777, + "step": 1029 + }, + { + "epoch": 0.3015222482435597, + "grad_norm": 1.0464539527893066, + "learning_rate": 4.97439393277937e-06, + "loss": 0.6597, + "step": 1030 + }, + { + "epoch": 0.3018149882903981, + "grad_norm": 1.045858383178711, + "learning_rate": 4.974338930811718e-06, + "loss": 0.7207, + "step": 1031 + }, + { + "epoch": 0.30210772833723654, + "grad_norm": 0.9803716540336609, + "learning_rate": 4.9742838701399e-06, + "loss": 0.7026, + "step": 1032 + }, + { + "epoch": 0.3024004683840749, + "grad_norm": 1.0374640226364136, + "learning_rate": 4.9742287507652215e-06, + "loss": 0.7163, + "step": 1033 + }, + { + "epoch": 0.30269320843091335, + "grad_norm": 0.9662953019142151, + "learning_rate": 4.974173572688991e-06, + "loss": 0.7175, + "step": 1034 + }, + { + "epoch": 0.30298594847775173, + "grad_norm": 1.0078158378601074, + "learning_rate": 4.974118335912517e-06, + "loss": 0.7518, + "step": 1035 + }, + { + "epoch": 0.30327868852459017, + "grad_norm": 1.0312137603759766, + "learning_rate": 4.974063040437109e-06, + "loss": 0.7005, + "step": 1036 + }, + { + "epoch": 0.30357142857142855, + "grad_norm": 1.0535322427749634, + "learning_rate": 4.9740076862640814e-06, + "loss": 0.7612, + "step": 1037 + }, + { + "epoch": 0.303864168618267, + "grad_norm": 1.9454222917556763, + "learning_rate": 4.9739522733947454e-06, + "loss": 0.6624, + "step": 1038 + }, + { + "epoch": 0.30415690866510536, + "grad_norm": 1.0316725969314575, + "learning_rate": 4.973896801830417e-06, + "loss": 0.7043, + "step": 1039 + }, + { + "epoch": 0.3044496487119438, + "grad_norm": 0.9848288297653198, + "learning_rate": 4.973841271572411e-06, + "loss": 0.6667, + "step": 1040 + }, + { + "epoch": 0.3047423887587822, + "grad_norm": 1.0137226581573486, + "learning_rate": 4.973785682622047e-06, + "loss": 0.6857, + "step": 1041 + }, + { + "epoch": 0.3050351288056206, + "grad_norm": 1.0183775424957275, + "learning_rate": 4.97373003498064e-06, + "loss": 0.7205, + "step": 1042 + }, + { + "epoch": 0.305327868852459, + "grad_norm": 0.996317982673645, + "learning_rate": 4.9736743286495146e-06, + "loss": 0.681, + "step": 1043 + }, + { + "epoch": 0.30562060889929743, + "grad_norm": 1.0144741535186768, + "learning_rate": 4.97361856362999e-06, + "loss": 0.6946, + "step": 1044 + }, + { + "epoch": 0.3059133489461358, + "grad_norm": 1.0034986734390259, + "learning_rate": 4.97356273992339e-06, + "loss": 0.7051, + "step": 1045 + }, + { + "epoch": 0.30620608899297425, + "grad_norm": 1.012244462966919, + "learning_rate": 4.9735068575310385e-06, + "loss": 0.7098, + "step": 1046 + }, + { + "epoch": 0.3064988290398126, + "grad_norm": 1.0550434589385986, + "learning_rate": 4.973450916454261e-06, + "loss": 0.6582, + "step": 1047 + }, + { + "epoch": 0.30679156908665106, + "grad_norm": 1.0554959774017334, + "learning_rate": 4.973394916694386e-06, + "loss": 0.6862, + "step": 1048 + }, + { + "epoch": 0.30708430913348944, + "grad_norm": 0.9799898266792297, + "learning_rate": 4.9733388582527415e-06, + "loss": 0.6722, + "step": 1049 + }, + { + "epoch": 0.3073770491803279, + "grad_norm": 1.0683598518371582, + "learning_rate": 4.973282741130658e-06, + "loss": 0.747, + "step": 1050 + }, + { + "epoch": 0.30766978922716626, + "grad_norm": 1.0579555034637451, + "learning_rate": 4.973226565329464e-06, + "loss": 0.7053, + "step": 1051 + }, + { + "epoch": 0.3079625292740047, + "grad_norm": 1.0550786256790161, + "learning_rate": 4.973170330850496e-06, + "loss": 0.7072, + "step": 1052 + }, + { + "epoch": 0.3082552693208431, + "grad_norm": 1.0130152702331543, + "learning_rate": 4.973114037695087e-06, + "loss": 0.6757, + "step": 1053 + }, + { + "epoch": 0.3085480093676815, + "grad_norm": 0.9678844809532166, + "learning_rate": 4.973057685864572e-06, + "loss": 0.6979, + "step": 1054 + }, + { + "epoch": 0.3088407494145199, + "grad_norm": 1.1098878383636475, + "learning_rate": 4.973001275360288e-06, + "loss": 0.7288, + "step": 1055 + }, + { + "epoch": 0.3091334894613583, + "grad_norm": 1.0402486324310303, + "learning_rate": 4.972944806183575e-06, + "loss": 0.724, + "step": 1056 + }, + { + "epoch": 0.3094262295081967, + "grad_norm": 0.9939144253730774, + "learning_rate": 4.97288827833577e-06, + "loss": 0.7157, + "step": 1057 + }, + { + "epoch": 0.30971896955503514, + "grad_norm": 1.062434434890747, + "learning_rate": 4.972831691818216e-06, + "loss": 0.6965, + "step": 1058 + }, + { + "epoch": 0.3100117096018735, + "grad_norm": 1.023071527481079, + "learning_rate": 4.972775046632255e-06, + "loss": 0.6836, + "step": 1059 + }, + { + "epoch": 0.31030444964871196, + "grad_norm": 1.1212248802185059, + "learning_rate": 4.972718342779231e-06, + "loss": 0.7281, + "step": 1060 + }, + { + "epoch": 0.31059718969555034, + "grad_norm": 1.007436752319336, + "learning_rate": 4.9726615802604895e-06, + "loss": 0.6848, + "step": 1061 + }, + { + "epoch": 0.31088992974238877, + "grad_norm": 1.0576808452606201, + "learning_rate": 4.972604759077376e-06, + "loss": 0.725, + "step": 1062 + }, + { + "epoch": 0.31118266978922715, + "grad_norm": 1.0203003883361816, + "learning_rate": 4.97254787923124e-06, + "loss": 0.6842, + "step": 1063 + }, + { + "epoch": 0.3114754098360656, + "grad_norm": 1.0188393592834473, + "learning_rate": 4.972490940723431e-06, + "loss": 0.7049, + "step": 1064 + }, + { + "epoch": 0.31176814988290397, + "grad_norm": 1.2319813966751099, + "learning_rate": 4.9724339435552996e-06, + "loss": 0.6696, + "step": 1065 + }, + { + "epoch": 0.3120608899297424, + "grad_norm": 1.0225120782852173, + "learning_rate": 4.9723768877281965e-06, + "loss": 0.7006, + "step": 1066 + }, + { + "epoch": 0.3123536299765808, + "grad_norm": 1.0106565952301025, + "learning_rate": 4.972319773243478e-06, + "loss": 0.7116, + "step": 1067 + }, + { + "epoch": 0.3126463700234192, + "grad_norm": 1.049256682395935, + "learning_rate": 4.9722626001024974e-06, + "loss": 0.6776, + "step": 1068 + }, + { + "epoch": 0.3129391100702576, + "grad_norm": 1.0381031036376953, + "learning_rate": 4.972205368306612e-06, + "loss": 0.6928, + "step": 1069 + }, + { + "epoch": 0.31323185011709603, + "grad_norm": 1.0893114805221558, + "learning_rate": 4.972148077857179e-06, + "loss": 0.7011, + "step": 1070 + }, + { + "epoch": 0.3135245901639344, + "grad_norm": 1.0672881603240967, + "learning_rate": 4.972090728755557e-06, + "loss": 0.7256, + "step": 1071 + }, + { + "epoch": 0.31381733021077285, + "grad_norm": 1.058821439743042, + "learning_rate": 4.972033321003109e-06, + "loss": 0.7263, + "step": 1072 + }, + { + "epoch": 0.31411007025761123, + "grad_norm": 1.0239142179489136, + "learning_rate": 4.971975854601195e-06, + "loss": 0.7014, + "step": 1073 + }, + { + "epoch": 0.31440281030444966, + "grad_norm": 1.0223500728607178, + "learning_rate": 4.971918329551178e-06, + "loss": 0.7144, + "step": 1074 + }, + { + "epoch": 0.31469555035128804, + "grad_norm": 1.0584158897399902, + "learning_rate": 4.971860745854426e-06, + "loss": 0.7434, + "step": 1075 + }, + { + "epoch": 0.3149882903981265, + "grad_norm": 1.0237953662872314, + "learning_rate": 4.971803103512301e-06, + "loss": 0.6847, + "step": 1076 + }, + { + "epoch": 0.31528103044496486, + "grad_norm": 1.045485258102417, + "learning_rate": 4.971745402526172e-06, + "loss": 0.716, + "step": 1077 + }, + { + "epoch": 0.3155737704918033, + "grad_norm": 1.0279474258422852, + "learning_rate": 4.97168764289741e-06, + "loss": 0.745, + "step": 1078 + }, + { + "epoch": 0.3158665105386417, + "grad_norm": 1.0576756000518799, + "learning_rate": 4.9716298246273835e-06, + "loss": 0.7563, + "step": 1079 + }, + { + "epoch": 0.3161592505854801, + "grad_norm": 1.0098720788955688, + "learning_rate": 4.971571947717464e-06, + "loss": 0.6922, + "step": 1080 + }, + { + "epoch": 0.3164519906323185, + "grad_norm": 1.0396603345870972, + "learning_rate": 4.971514012169026e-06, + "loss": 0.691, + "step": 1081 + }, + { + "epoch": 0.3167447306791569, + "grad_norm": 1.0001777410507202, + "learning_rate": 4.971456017983442e-06, + "loss": 0.6549, + "step": 1082 + }, + { + "epoch": 0.3170374707259953, + "grad_norm": 1.064826488494873, + "learning_rate": 4.9713979651620904e-06, + "loss": 0.6828, + "step": 1083 + }, + { + "epoch": 0.31733021077283374, + "grad_norm": 1.0144538879394531, + "learning_rate": 4.9713398537063465e-06, + "loss": 0.7232, + "step": 1084 + }, + { + "epoch": 0.3176229508196721, + "grad_norm": 0.995833158493042, + "learning_rate": 4.97128168361759e-06, + "loss": 0.7255, + "step": 1085 + }, + { + "epoch": 0.31791569086651056, + "grad_norm": 1.0060347318649292, + "learning_rate": 4.971223454897201e-06, + "loss": 0.6838, + "step": 1086 + }, + { + "epoch": 0.31820843091334894, + "grad_norm": 1.0364265441894531, + "learning_rate": 4.97116516754656e-06, + "loss": 0.7237, + "step": 1087 + }, + { + "epoch": 0.3185011709601874, + "grad_norm": 1.0013868808746338, + "learning_rate": 4.971106821567052e-06, + "loss": 0.7079, + "step": 1088 + }, + { + "epoch": 0.31879391100702575, + "grad_norm": 1.0149331092834473, + "learning_rate": 4.971048416960059e-06, + "loss": 0.7051, + "step": 1089 + }, + { + "epoch": 0.3190866510538642, + "grad_norm": 1.0322049856185913, + "learning_rate": 4.9709899537269675e-06, + "loss": 0.7521, + "step": 1090 + }, + { + "epoch": 0.31937939110070257, + "grad_norm": 1.0698907375335693, + "learning_rate": 4.9709314318691645e-06, + "loss": 0.7546, + "step": 1091 + }, + { + "epoch": 0.319672131147541, + "grad_norm": 1.0014560222625732, + "learning_rate": 4.9708728513880385e-06, + "loss": 0.7026, + "step": 1092 + }, + { + "epoch": 0.3199648711943794, + "grad_norm": 1.031192421913147, + "learning_rate": 4.97081421228498e-06, + "loss": 0.7039, + "step": 1093 + }, + { + "epoch": 0.3202576112412178, + "grad_norm": 1.0081617832183838, + "learning_rate": 4.97075551456138e-06, + "loss": 0.7258, + "step": 1094 + }, + { + "epoch": 0.3205503512880562, + "grad_norm": 1.0310925245285034, + "learning_rate": 4.970696758218629e-06, + "loss": 0.7092, + "step": 1095 + }, + { + "epoch": 0.32084309133489464, + "grad_norm": 1.0640363693237305, + "learning_rate": 4.970637943258124e-06, + "loss": 0.7669, + "step": 1096 + }, + { + "epoch": 0.321135831381733, + "grad_norm": 1.0451228618621826, + "learning_rate": 4.97057906968126e-06, + "loss": 0.7205, + "step": 1097 + }, + { + "epoch": 0.32142857142857145, + "grad_norm": 0.982742190361023, + "learning_rate": 4.9705201374894315e-06, + "loss": 0.6872, + "step": 1098 + }, + { + "epoch": 0.32172131147540983, + "grad_norm": 1.023891568183899, + "learning_rate": 4.970461146684038e-06, + "loss": 0.7556, + "step": 1099 + }, + { + "epoch": 0.32201405152224827, + "grad_norm": 1.079974889755249, + "learning_rate": 4.97040209726648e-06, + "loss": 0.7315, + "step": 1100 + }, + { + "epoch": 0.32230679156908665, + "grad_norm": 1.052176833152771, + "learning_rate": 4.970342989238157e-06, + "loss": 0.7019, + "step": 1101 + }, + { + "epoch": 0.3225995316159251, + "grad_norm": 1.0219899415969849, + "learning_rate": 4.970283822600472e-06, + "loss": 0.7117, + "step": 1102 + }, + { + "epoch": 0.32289227166276346, + "grad_norm": 0.9987968802452087, + "learning_rate": 4.970224597354829e-06, + "loss": 0.6914, + "step": 1103 + }, + { + "epoch": 0.3231850117096019, + "grad_norm": 0.9937111735343933, + "learning_rate": 4.970165313502633e-06, + "loss": 0.6945, + "step": 1104 + }, + { + "epoch": 0.3234777517564403, + "grad_norm": 1.0368874073028564, + "learning_rate": 4.970105971045291e-06, + "loss": 0.6811, + "step": 1105 + }, + { + "epoch": 0.3237704918032787, + "grad_norm": 1.1431665420532227, + "learning_rate": 4.970046569984209e-06, + "loss": 0.7069, + "step": 1106 + }, + { + "epoch": 0.3240632318501171, + "grad_norm": 1.0213297605514526, + "learning_rate": 4.9699871103207975e-06, + "loss": 0.7315, + "step": 1107 + }, + { + "epoch": 0.32435597189695553, + "grad_norm": 1.0133731365203857, + "learning_rate": 4.969927592056468e-06, + "loss": 0.7047, + "step": 1108 + }, + { + "epoch": 0.3246487119437939, + "grad_norm": 1.0339889526367188, + "learning_rate": 4.9698680151926316e-06, + "loss": 0.7658, + "step": 1109 + }, + { + "epoch": 0.32494145199063235, + "grad_norm": 0.9564478397369385, + "learning_rate": 4.969808379730702e-06, + "loss": 0.6925, + "step": 1110 + }, + { + "epoch": 0.3252341920374707, + "grad_norm": 1.0242143869400024, + "learning_rate": 4.969748685672095e-06, + "loss": 0.6522, + "step": 1111 + }, + { + "epoch": 0.3255269320843091, + "grad_norm": 1.0414994955062866, + "learning_rate": 4.969688933018224e-06, + "loss": 0.6902, + "step": 1112 + }, + { + "epoch": 0.32581967213114754, + "grad_norm": 1.1019970178604126, + "learning_rate": 4.96962912177051e-06, + "loss": 0.6931, + "step": 1113 + }, + { + "epoch": 0.3261124121779859, + "grad_norm": 1.0153347253799438, + "learning_rate": 4.96956925193037e-06, + "loss": 0.6836, + "step": 1114 + }, + { + "epoch": 0.32640515222482436, + "grad_norm": 1.0053660869598389, + "learning_rate": 4.969509323499226e-06, + "loss": 0.6855, + "step": 1115 + }, + { + "epoch": 0.32669789227166274, + "grad_norm": 1.0784345865249634, + "learning_rate": 4.969449336478498e-06, + "loss": 0.6707, + "step": 1116 + }, + { + "epoch": 0.3269906323185012, + "grad_norm": 0.9679749608039856, + "learning_rate": 4.96938929086961e-06, + "loss": 0.6731, + "step": 1117 + }, + { + "epoch": 0.32728337236533955, + "grad_norm": 0.9827604293823242, + "learning_rate": 4.9693291866739864e-06, + "loss": 0.7133, + "step": 1118 + }, + { + "epoch": 0.327576112412178, + "grad_norm": 0.9664946794509888, + "learning_rate": 4.969269023893054e-06, + "loss": 0.7056, + "step": 1119 + }, + { + "epoch": 0.32786885245901637, + "grad_norm": 1.0434165000915527, + "learning_rate": 4.9692088025282396e-06, + "loss": 0.6864, + "step": 1120 + }, + { + "epoch": 0.3281615925058548, + "grad_norm": 1.0321365594863892, + "learning_rate": 4.969148522580972e-06, + "loss": 0.7188, + "step": 1121 + }, + { + "epoch": 0.3284543325526932, + "grad_norm": 1.0573608875274658, + "learning_rate": 4.969088184052682e-06, + "loss": 0.7309, + "step": 1122 + }, + { + "epoch": 0.3287470725995316, + "grad_norm": 1.0472930669784546, + "learning_rate": 4.969027786944799e-06, + "loss": 0.6817, + "step": 1123 + }, + { + "epoch": 0.32903981264637, + "grad_norm": 1.0469905138015747, + "learning_rate": 4.968967331258758e-06, + "loss": 0.7091, + "step": 1124 + }, + { + "epoch": 0.32933255269320844, + "grad_norm": 1.0964922904968262, + "learning_rate": 4.968906816995993e-06, + "loss": 0.6835, + "step": 1125 + }, + { + "epoch": 0.3296252927400468, + "grad_norm": 1.0030417442321777, + "learning_rate": 4.968846244157939e-06, + "loss": 0.6684, + "step": 1126 + }, + { + "epoch": 0.32991803278688525, + "grad_norm": 1.1076903343200684, + "learning_rate": 4.968785612746034e-06, + "loss": 0.6194, + "step": 1127 + }, + { + "epoch": 0.33021077283372363, + "grad_norm": 1.009840726852417, + "learning_rate": 4.9687249227617155e-06, + "loss": 0.7427, + "step": 1128 + }, + { + "epoch": 0.33050351288056207, + "grad_norm": 1.0135443210601807, + "learning_rate": 4.968664174206425e-06, + "loss": 0.6941, + "step": 1129 + }, + { + "epoch": 0.33079625292740045, + "grad_norm": 0.9731440544128418, + "learning_rate": 4.968603367081602e-06, + "loss": 0.7041, + "step": 1130 + }, + { + "epoch": 0.3310889929742389, + "grad_norm": 1.068191409111023, + "learning_rate": 4.968542501388689e-06, + "loss": 0.7209, + "step": 1131 + }, + { + "epoch": 0.33138173302107726, + "grad_norm": 1.0051522254943848, + "learning_rate": 4.968481577129131e-06, + "loss": 0.6845, + "step": 1132 + }, + { + "epoch": 0.3316744730679157, + "grad_norm": 0.9665180444717407, + "learning_rate": 4.968420594304375e-06, + "loss": 0.6604, + "step": 1133 + }, + { + "epoch": 0.3319672131147541, + "grad_norm": 0.991442084312439, + "learning_rate": 4.968359552915865e-06, + "loss": 0.7035, + "step": 1134 + }, + { + "epoch": 0.3322599531615925, + "grad_norm": 1.0032553672790527, + "learning_rate": 4.96829845296505e-06, + "loss": 0.72, + "step": 1135 + }, + { + "epoch": 0.3325526932084309, + "grad_norm": 1.0426082611083984, + "learning_rate": 4.96823729445338e-06, + "loss": 0.669, + "step": 1136 + }, + { + "epoch": 0.33284543325526933, + "grad_norm": 1.0504231452941895, + "learning_rate": 4.968176077382307e-06, + "loss": 0.737, + "step": 1137 + }, + { + "epoch": 0.3331381733021077, + "grad_norm": 1.048828125, + "learning_rate": 4.968114801753282e-06, + "loss": 0.7532, + "step": 1138 + }, + { + "epoch": 0.33343091334894615, + "grad_norm": 0.9790355563163757, + "learning_rate": 4.968053467567759e-06, + "loss": 0.7273, + "step": 1139 + }, + { + "epoch": 0.3337236533957845, + "grad_norm": 1.018373727798462, + "learning_rate": 4.967992074827194e-06, + "loss": 0.7133, + "step": 1140 + }, + { + "epoch": 0.33401639344262296, + "grad_norm": 0.9761000871658325, + "learning_rate": 4.967930623533041e-06, + "loss": 0.6988, + "step": 1141 + }, + { + "epoch": 0.33430913348946134, + "grad_norm": 1.0016419887542725, + "learning_rate": 4.967869113686762e-06, + "loss": 0.6957, + "step": 1142 + }, + { + "epoch": 0.3346018735362998, + "grad_norm": 1.0518378019332886, + "learning_rate": 4.967807545289813e-06, + "loss": 0.6967, + "step": 1143 + }, + { + "epoch": 0.33489461358313816, + "grad_norm": 1.0596566200256348, + "learning_rate": 4.967745918343656e-06, + "loss": 0.6986, + "step": 1144 + }, + { + "epoch": 0.3351873536299766, + "grad_norm": 1.0855120420455933, + "learning_rate": 4.967684232849753e-06, + "loss": 0.7243, + "step": 1145 + }, + { + "epoch": 0.33548009367681497, + "grad_norm": 1.0887712240219116, + "learning_rate": 4.967622488809568e-06, + "loss": 0.6922, + "step": 1146 + }, + { + "epoch": 0.3357728337236534, + "grad_norm": 1.023648738861084, + "learning_rate": 4.967560686224565e-06, + "loss": 0.7232, + "step": 1147 + }, + { + "epoch": 0.3360655737704918, + "grad_norm": 1.016161322593689, + "learning_rate": 4.96749882509621e-06, + "loss": 0.6959, + "step": 1148 + }, + { + "epoch": 0.3363583138173302, + "grad_norm": 1.0799683332443237, + "learning_rate": 4.967436905425972e-06, + "loss": 0.7434, + "step": 1149 + }, + { + "epoch": 0.3366510538641686, + "grad_norm": 0.9874275326728821, + "learning_rate": 4.9673749272153195e-06, + "loss": 0.6653, + "step": 1150 + }, + { + "epoch": 0.33694379391100704, + "grad_norm": 1.0633972883224487, + "learning_rate": 4.967312890465722e-06, + "loss": 0.711, + "step": 1151 + }, + { + "epoch": 0.3372365339578454, + "grad_norm": 0.9656113386154175, + "learning_rate": 4.967250795178653e-06, + "loss": 0.7196, + "step": 1152 + }, + { + "epoch": 0.33752927400468385, + "grad_norm": 1.0109549760818481, + "learning_rate": 4.967188641355585e-06, + "loss": 0.7224, + "step": 1153 + }, + { + "epoch": 0.33782201405152223, + "grad_norm": 0.9847148656845093, + "learning_rate": 4.967126428997991e-06, + "loss": 0.708, + "step": 1154 + }, + { + "epoch": 0.33811475409836067, + "grad_norm": 1.0011004209518433, + "learning_rate": 4.96706415810735e-06, + "loss": 0.7285, + "step": 1155 + }, + { + "epoch": 0.33840749414519905, + "grad_norm": 1.0158133506774902, + "learning_rate": 4.967001828685137e-06, + "loss": 0.6927, + "step": 1156 + }, + { + "epoch": 0.3387002341920375, + "grad_norm": 0.9973741173744202, + "learning_rate": 4.966939440732832e-06, + "loss": 0.6768, + "step": 1157 + }, + { + "epoch": 0.33899297423887587, + "grad_norm": 1.0361130237579346, + "learning_rate": 4.966876994251915e-06, + "loss": 0.6968, + "step": 1158 + }, + { + "epoch": 0.3392857142857143, + "grad_norm": 1.1205775737762451, + "learning_rate": 4.9668144892438675e-06, + "loss": 0.7171, + "step": 1159 + }, + { + "epoch": 0.3395784543325527, + "grad_norm": 1.0197944641113281, + "learning_rate": 4.966751925710172e-06, + "loss": 0.7453, + "step": 1160 + }, + { + "epoch": 0.3398711943793911, + "grad_norm": 1.0246455669403076, + "learning_rate": 4.966689303652314e-06, + "loss": 0.7147, + "step": 1161 + }, + { + "epoch": 0.3401639344262295, + "grad_norm": 1.0265711545944214, + "learning_rate": 4.966626623071777e-06, + "loss": 0.6891, + "step": 1162 + }, + { + "epoch": 0.34045667447306793, + "grad_norm": 0.980503261089325, + "learning_rate": 4.96656388397005e-06, + "loss": 0.6709, + "step": 1163 + }, + { + "epoch": 0.3407494145199063, + "grad_norm": 0.9595857262611389, + "learning_rate": 4.966501086348621e-06, + "loss": 0.6527, + "step": 1164 + }, + { + "epoch": 0.34104215456674475, + "grad_norm": 1.0557111501693726, + "learning_rate": 4.9664382302089795e-06, + "loss": 0.7481, + "step": 1165 + }, + { + "epoch": 0.34133489461358313, + "grad_norm": 1.0760278701782227, + "learning_rate": 4.966375315552617e-06, + "loss": 0.7373, + "step": 1166 + }, + { + "epoch": 0.34162763466042156, + "grad_norm": 1.0610170364379883, + "learning_rate": 4.966312342381028e-06, + "loss": 0.695, + "step": 1167 + }, + { + "epoch": 0.34192037470725994, + "grad_norm": 0.9980792999267578, + "learning_rate": 4.966249310695703e-06, + "loss": 0.6808, + "step": 1168 + }, + { + "epoch": 0.3422131147540984, + "grad_norm": 1.0101970434188843, + "learning_rate": 4.9661862204981405e-06, + "loss": 0.6968, + "step": 1169 + }, + { + "epoch": 0.34250585480093676, + "grad_norm": 1.0732946395874023, + "learning_rate": 4.9661230717898355e-06, + "loss": 0.7068, + "step": 1170 + }, + { + "epoch": 0.3427985948477752, + "grad_norm": 1.039677381515503, + "learning_rate": 4.966059864572287e-06, + "loss": 0.7295, + "step": 1171 + }, + { + "epoch": 0.3430913348946136, + "grad_norm": 1.0853806734085083, + "learning_rate": 4.965996598846994e-06, + "loss": 0.7097, + "step": 1172 + }, + { + "epoch": 0.343384074941452, + "grad_norm": 1.0101193189620972, + "learning_rate": 4.965933274615458e-06, + "loss": 0.717, + "step": 1173 + }, + { + "epoch": 0.3436768149882904, + "grad_norm": 1.0385756492614746, + "learning_rate": 4.9658698918791816e-06, + "loss": 0.6816, + "step": 1174 + }, + { + "epoch": 0.3439695550351288, + "grad_norm": 1.035588264465332, + "learning_rate": 4.965806450639668e-06, + "loss": 0.7263, + "step": 1175 + }, + { + "epoch": 0.3442622950819672, + "grad_norm": 1.011927604675293, + "learning_rate": 4.965742950898422e-06, + "loss": 0.6922, + "step": 1176 + }, + { + "epoch": 0.34455503512880564, + "grad_norm": 1.0300769805908203, + "learning_rate": 4.965679392656952e-06, + "loss": 0.6967, + "step": 1177 + }, + { + "epoch": 0.344847775175644, + "grad_norm": 1.0566831827163696, + "learning_rate": 4.965615775916765e-06, + "loss": 0.7266, + "step": 1178 + }, + { + "epoch": 0.34514051522248246, + "grad_norm": 1.0803710222244263, + "learning_rate": 4.9655521006793685e-06, + "loss": 0.7237, + "step": 1179 + }, + { + "epoch": 0.34543325526932084, + "grad_norm": 1.085924744606018, + "learning_rate": 4.965488366946276e-06, + "loss": 0.7441, + "step": 1180 + }, + { + "epoch": 0.3457259953161593, + "grad_norm": 1.0466762781143188, + "learning_rate": 4.9654245747189975e-06, + "loss": 0.7038, + "step": 1181 + }, + { + "epoch": 0.34601873536299765, + "grad_norm": 1.0625278949737549, + "learning_rate": 4.965360723999047e-06, + "loss": 0.6876, + "step": 1182 + }, + { + "epoch": 0.3463114754098361, + "grad_norm": 1.0566527843475342, + "learning_rate": 4.965296814787941e-06, + "loss": 0.6927, + "step": 1183 + }, + { + "epoch": 0.34660421545667447, + "grad_norm": 1.0296764373779297, + "learning_rate": 4.965232847087193e-06, + "loss": 0.707, + "step": 1184 + }, + { + "epoch": 0.3468969555035129, + "grad_norm": 1.04856276512146, + "learning_rate": 4.965168820898324e-06, + "loss": 0.691, + "step": 1185 + }, + { + "epoch": 0.3471896955503513, + "grad_norm": 1.070234775543213, + "learning_rate": 4.96510473622285e-06, + "loss": 0.6793, + "step": 1186 + }, + { + "epoch": 0.3474824355971897, + "grad_norm": 1.0888280868530273, + "learning_rate": 4.965040593062292e-06, + "loss": 0.7117, + "step": 1187 + }, + { + "epoch": 0.3477751756440281, + "grad_norm": 1.0750700235366821, + "learning_rate": 4.964976391418173e-06, + "loss": 0.7065, + "step": 1188 + }, + { + "epoch": 0.34806791569086654, + "grad_norm": 1.030056118965149, + "learning_rate": 4.964912131292015e-06, + "loss": 0.7352, + "step": 1189 + }, + { + "epoch": 0.3483606557377049, + "grad_norm": 1.100020408630371, + "learning_rate": 4.964847812685344e-06, + "loss": 0.7063, + "step": 1190 + }, + { + "epoch": 0.34865339578454335, + "grad_norm": 1.0429716110229492, + "learning_rate": 4.964783435599684e-06, + "loss": 0.6783, + "step": 1191 + }, + { + "epoch": 0.34894613583138173, + "grad_norm": 1.11379075050354, + "learning_rate": 4.964719000036564e-06, + "loss": 0.7447, + "step": 1192 + }, + { + "epoch": 0.34923887587822017, + "grad_norm": 1.1355020999908447, + "learning_rate": 4.964654505997512e-06, + "loss": 0.7341, + "step": 1193 + }, + { + "epoch": 0.34953161592505855, + "grad_norm": 1.0674324035644531, + "learning_rate": 4.964589953484059e-06, + "loss": 0.7264, + "step": 1194 + }, + { + "epoch": 0.349824355971897, + "grad_norm": 1.0597835779190063, + "learning_rate": 4.9645253424977355e-06, + "loss": 0.7508, + "step": 1195 + }, + { + "epoch": 0.35011709601873536, + "grad_norm": 0.9869487881660461, + "learning_rate": 4.9644606730400744e-06, + "loss": 0.7056, + "step": 1196 + }, + { + "epoch": 0.35040983606557374, + "grad_norm": 1.0544071197509766, + "learning_rate": 4.964395945112611e-06, + "loss": 0.7386, + "step": 1197 + }, + { + "epoch": 0.3507025761124122, + "grad_norm": 1.061193585395813, + "learning_rate": 4.964331158716879e-06, + "loss": 0.716, + "step": 1198 + }, + { + "epoch": 0.35099531615925056, + "grad_norm": 0.9744677543640137, + "learning_rate": 4.964266313854418e-06, + "loss": 0.6472, + "step": 1199 + }, + { + "epoch": 0.351288056206089, + "grad_norm": 1.050890564918518, + "learning_rate": 4.964201410526765e-06, + "loss": 0.6997, + "step": 1200 + }, + { + "epoch": 0.3515807962529274, + "grad_norm": 1.0311603546142578, + "learning_rate": 4.964136448735461e-06, + "loss": 0.6998, + "step": 1201 + }, + { + "epoch": 0.3518735362997658, + "grad_norm": 1.0737648010253906, + "learning_rate": 4.9640714284820445e-06, + "loss": 0.6869, + "step": 1202 + }, + { + "epoch": 0.3521662763466042, + "grad_norm": 1.0879855155944824, + "learning_rate": 4.964006349768061e-06, + "loss": 0.688, + "step": 1203 + }, + { + "epoch": 0.3524590163934426, + "grad_norm": 1.0379098653793335, + "learning_rate": 4.963941212595053e-06, + "loss": 0.7033, + "step": 1204 + }, + { + "epoch": 0.352751756440281, + "grad_norm": 1.0627745389938354, + "learning_rate": 4.9638760169645675e-06, + "loss": 0.7053, + "step": 1205 + }, + { + "epoch": 0.35304449648711944, + "grad_norm": 1.0886688232421875, + "learning_rate": 4.963810762878149e-06, + "loss": 0.7414, + "step": 1206 + }, + { + "epoch": 0.3533372365339578, + "grad_norm": 1.1560651063919067, + "learning_rate": 4.963745450337347e-06, + "loss": 0.7145, + "step": 1207 + }, + { + "epoch": 0.35362997658079626, + "grad_norm": 1.0343557596206665, + "learning_rate": 4.963680079343711e-06, + "loss": 0.7135, + "step": 1208 + }, + { + "epoch": 0.35392271662763464, + "grad_norm": 1.0340228080749512, + "learning_rate": 4.963614649898793e-06, + "loss": 0.6936, + "step": 1209 + }, + { + "epoch": 0.3542154566744731, + "grad_norm": 1.0474520921707153, + "learning_rate": 4.963549162004142e-06, + "loss": 0.7604, + "step": 1210 + }, + { + "epoch": 0.35450819672131145, + "grad_norm": 1.0741896629333496, + "learning_rate": 4.9634836156613154e-06, + "loss": 0.7738, + "step": 1211 + }, + { + "epoch": 0.3548009367681499, + "grad_norm": 1.0240626335144043, + "learning_rate": 4.963418010871865e-06, + "loss": 0.6676, + "step": 1212 + }, + { + "epoch": 0.35509367681498827, + "grad_norm": 1.042197585105896, + "learning_rate": 4.963352347637351e-06, + "loss": 0.6917, + "step": 1213 + }, + { + "epoch": 0.3553864168618267, + "grad_norm": 1.07012939453125, + "learning_rate": 4.963286625959328e-06, + "loss": 0.6921, + "step": 1214 + }, + { + "epoch": 0.3556791569086651, + "grad_norm": 1.0504122972488403, + "learning_rate": 4.9632208458393574e-06, + "loss": 0.6244, + "step": 1215 + }, + { + "epoch": 0.3559718969555035, + "grad_norm": 1.0806747674942017, + "learning_rate": 4.963155007278998e-06, + "loss": 0.7162, + "step": 1216 + }, + { + "epoch": 0.3562646370023419, + "grad_norm": 1.0814069509506226, + "learning_rate": 4.963089110279814e-06, + "loss": 0.7183, + "step": 1217 + }, + { + "epoch": 0.35655737704918034, + "grad_norm": 1.008934736251831, + "learning_rate": 4.9630231548433675e-06, + "loss": 0.6689, + "step": 1218 + }, + { + "epoch": 0.3568501170960187, + "grad_norm": 1.0099372863769531, + "learning_rate": 4.962957140971222e-06, + "loss": 0.7326, + "step": 1219 + }, + { + "epoch": 0.35714285714285715, + "grad_norm": 1.1072124242782593, + "learning_rate": 4.962891068664947e-06, + "loss": 0.6705, + "step": 1220 + }, + { + "epoch": 0.35743559718969553, + "grad_norm": 1.0657756328582764, + "learning_rate": 4.962824937926107e-06, + "loss": 0.7275, + "step": 1221 + }, + { + "epoch": 0.35772833723653397, + "grad_norm": 1.0491372346878052, + "learning_rate": 4.9627587487562725e-06, + "loss": 0.6978, + "step": 1222 + }, + { + "epoch": 0.35802107728337235, + "grad_norm": 0.9573419690132141, + "learning_rate": 4.962692501157013e-06, + "loss": 0.6906, + "step": 1223 + }, + { + "epoch": 0.3583138173302108, + "grad_norm": 1.0145756006240845, + "learning_rate": 4.962626195129902e-06, + "loss": 0.6944, + "step": 1224 + }, + { + "epoch": 0.35860655737704916, + "grad_norm": 1.133514404296875, + "learning_rate": 4.96255983067651e-06, + "loss": 0.7598, + "step": 1225 + }, + { + "epoch": 0.3588992974238876, + "grad_norm": 1.001069188117981, + "learning_rate": 4.962493407798414e-06, + "loss": 0.6905, + "step": 1226 + }, + { + "epoch": 0.359192037470726, + "grad_norm": 0.9836168885231018, + "learning_rate": 4.962426926497188e-06, + "loss": 0.7108, + "step": 1227 + }, + { + "epoch": 0.3594847775175644, + "grad_norm": 1.0206648111343384, + "learning_rate": 4.962360386774411e-06, + "loss": 0.709, + "step": 1228 + }, + { + "epoch": 0.3597775175644028, + "grad_norm": 1.0740604400634766, + "learning_rate": 4.96229378863166e-06, + "loss": 0.6905, + "step": 1229 + }, + { + "epoch": 0.36007025761124123, + "grad_norm": 1.0138808488845825, + "learning_rate": 4.962227132070517e-06, + "loss": 0.6992, + "step": 1230 + }, + { + "epoch": 0.3603629976580796, + "grad_norm": 1.0309278964996338, + "learning_rate": 4.962160417092561e-06, + "loss": 0.6855, + "step": 1231 + }, + { + "epoch": 0.36065573770491804, + "grad_norm": 1.0864710807800293, + "learning_rate": 4.962093643699376e-06, + "loss": 0.6554, + "step": 1232 + }, + { + "epoch": 0.3609484777517564, + "grad_norm": 0.9379793405532837, + "learning_rate": 4.962026811892547e-06, + "loss": 0.678, + "step": 1233 + }, + { + "epoch": 0.36124121779859486, + "grad_norm": 1.061845302581787, + "learning_rate": 4.961959921673658e-06, + "loss": 0.7034, + "step": 1234 + }, + { + "epoch": 0.36153395784543324, + "grad_norm": 1.0586259365081787, + "learning_rate": 4.961892973044298e-06, + "loss": 0.7221, + "step": 1235 + }, + { + "epoch": 0.3618266978922717, + "grad_norm": 0.9793519377708435, + "learning_rate": 4.961825966006053e-06, + "loss": 0.6671, + "step": 1236 + }, + { + "epoch": 0.36211943793911006, + "grad_norm": 1.0115970373153687, + "learning_rate": 4.961758900560515e-06, + "loss": 0.6882, + "step": 1237 + }, + { + "epoch": 0.3624121779859485, + "grad_norm": 1.0464587211608887, + "learning_rate": 4.961691776709273e-06, + "loss": 0.7172, + "step": 1238 + }, + { + "epoch": 0.36270491803278687, + "grad_norm": 0.9737903475761414, + "learning_rate": 4.961624594453922e-06, + "loss": 0.7235, + "step": 1239 + }, + { + "epoch": 0.3629976580796253, + "grad_norm": 1.0289822816848755, + "learning_rate": 4.961557353796054e-06, + "loss": 0.6584, + "step": 1240 + }, + { + "epoch": 0.3632903981264637, + "grad_norm": 1.0380135774612427, + "learning_rate": 4.961490054737265e-06, + "loss": 0.6984, + "step": 1241 + }, + { + "epoch": 0.3635831381733021, + "grad_norm": 0.9886779189109802, + "learning_rate": 4.961422697279152e-06, + "loss": 0.6675, + "step": 1242 + }, + { + "epoch": 0.3638758782201405, + "grad_norm": 1.0046916007995605, + "learning_rate": 4.961355281423311e-06, + "loss": 0.7116, + "step": 1243 + }, + { + "epoch": 0.36416861826697894, + "grad_norm": 0.9905543327331543, + "learning_rate": 4.961287807171344e-06, + "loss": 0.6942, + "step": 1244 + }, + { + "epoch": 0.3644613583138173, + "grad_norm": 0.9793192148208618, + "learning_rate": 4.961220274524852e-06, + "loss": 0.7224, + "step": 1245 + }, + { + "epoch": 0.36475409836065575, + "grad_norm": 1.0226945877075195, + "learning_rate": 4.961152683485434e-06, + "loss": 0.7003, + "step": 1246 + }, + { + "epoch": 0.36504683840749413, + "grad_norm": 1.0217171907424927, + "learning_rate": 4.9610850340546975e-06, + "loss": 0.7016, + "step": 1247 + }, + { + "epoch": 0.36533957845433257, + "grad_norm": 1.042677402496338, + "learning_rate": 4.961017326234244e-06, + "loss": 0.7529, + "step": 1248 + }, + { + "epoch": 0.36563231850117095, + "grad_norm": 1.022925853729248, + "learning_rate": 4.960949560025683e-06, + "loss": 0.722, + "step": 1249 + }, + { + "epoch": 0.3659250585480094, + "grad_norm": 1.0514870882034302, + "learning_rate": 4.960881735430621e-06, + "loss": 0.7492, + "step": 1250 + }, + { + "epoch": 0.36621779859484777, + "grad_norm": 0.9801886081695557, + "learning_rate": 4.960813852450666e-06, + "loss": 0.6569, + "step": 1251 + }, + { + "epoch": 0.3665105386416862, + "grad_norm": 0.9909241199493408, + "learning_rate": 4.960745911087432e-06, + "loss": 0.7081, + "step": 1252 + }, + { + "epoch": 0.3668032786885246, + "grad_norm": 1.014387845993042, + "learning_rate": 4.9606779113425255e-06, + "loss": 0.692, + "step": 1253 + }, + { + "epoch": 0.367096018735363, + "grad_norm": 0.9996505379676819, + "learning_rate": 4.960609853217564e-06, + "loss": 0.7, + "step": 1254 + }, + { + "epoch": 0.3673887587822014, + "grad_norm": 1.0026066303253174, + "learning_rate": 4.960541736714161e-06, + "loss": 0.6533, + "step": 1255 + }, + { + "epoch": 0.36768149882903983, + "grad_norm": 1.052744746208191, + "learning_rate": 4.960473561833933e-06, + "loss": 0.6961, + "step": 1256 + }, + { + "epoch": 0.3679742388758782, + "grad_norm": 1.069267749786377, + "learning_rate": 4.960405328578496e-06, + "loss": 0.6804, + "step": 1257 + }, + { + "epoch": 0.36826697892271665, + "grad_norm": 1.0313178300857544, + "learning_rate": 4.96033703694947e-06, + "loss": 0.6908, + "step": 1258 + }, + { + "epoch": 0.36855971896955503, + "grad_norm": 0.9840500950813293, + "learning_rate": 4.960268686948476e-06, + "loss": 0.7292, + "step": 1259 + }, + { + "epoch": 0.36885245901639346, + "grad_norm": 0.9792658686637878, + "learning_rate": 4.960200278577135e-06, + "loss": 0.6834, + "step": 1260 + }, + { + "epoch": 0.36914519906323184, + "grad_norm": 1.0019034147262573, + "learning_rate": 4.960131811837069e-06, + "loss": 0.7167, + "step": 1261 + }, + { + "epoch": 0.3694379391100703, + "grad_norm": 1.0718308687210083, + "learning_rate": 4.960063286729903e-06, + "loss": 0.7328, + "step": 1262 + }, + { + "epoch": 0.36973067915690866, + "grad_norm": 0.9618103504180908, + "learning_rate": 4.959994703257263e-06, + "loss": 0.6821, + "step": 1263 + }, + { + "epoch": 0.3700234192037471, + "grad_norm": 1.0692803859710693, + "learning_rate": 4.9599260614207765e-06, + "loss": 0.7401, + "step": 1264 + }, + { + "epoch": 0.3703161592505855, + "grad_norm": 0.9662203192710876, + "learning_rate": 4.959857361222071e-06, + "loss": 0.6279, + "step": 1265 + }, + { + "epoch": 0.3706088992974239, + "grad_norm": 0.9953547716140747, + "learning_rate": 4.959788602662777e-06, + "loss": 0.6733, + "step": 1266 + }, + { + "epoch": 0.3709016393442623, + "grad_norm": 1.1290725469589233, + "learning_rate": 4.959719785744527e-06, + "loss": 0.6685, + "step": 1267 + }, + { + "epoch": 0.3711943793911007, + "grad_norm": 1.0216116905212402, + "learning_rate": 4.9596509104689515e-06, + "loss": 0.7032, + "step": 1268 + }, + { + "epoch": 0.3714871194379391, + "grad_norm": 1.0615943670272827, + "learning_rate": 4.959581976837685e-06, + "loss": 0.7347, + "step": 1269 + }, + { + "epoch": 0.37177985948477754, + "grad_norm": 1.05218505859375, + "learning_rate": 4.959512984852364e-06, + "loss": 0.652, + "step": 1270 + }, + { + "epoch": 0.3720725995316159, + "grad_norm": 1.0318681001663208, + "learning_rate": 4.959443934514626e-06, + "loss": 0.6877, + "step": 1271 + }, + { + "epoch": 0.37236533957845436, + "grad_norm": 1.0723981857299805, + "learning_rate": 4.959374825826106e-06, + "loss": 0.696, + "step": 1272 + }, + { + "epoch": 0.37265807962529274, + "grad_norm": 0.9716984033584595, + "learning_rate": 4.959305658788448e-06, + "loss": 0.6812, + "step": 1273 + }, + { + "epoch": 0.3729508196721312, + "grad_norm": 0.9960514903068542, + "learning_rate": 4.9592364334032896e-06, + "loss": 0.6804, + "step": 1274 + }, + { + "epoch": 0.37324355971896955, + "grad_norm": 1.0041236877441406, + "learning_rate": 4.959167149672275e-06, + "loss": 0.6505, + "step": 1275 + }, + { + "epoch": 0.373536299765808, + "grad_norm": 1.03640878200531, + "learning_rate": 4.959097807597046e-06, + "loss": 0.6947, + "step": 1276 + }, + { + "epoch": 0.37382903981264637, + "grad_norm": 1.0688682794570923, + "learning_rate": 4.9590284071792505e-06, + "loss": 0.6686, + "step": 1277 + }, + { + "epoch": 0.3741217798594848, + "grad_norm": 1.0356793403625488, + "learning_rate": 4.958958948420533e-06, + "loss": 0.7204, + "step": 1278 + }, + { + "epoch": 0.3744145199063232, + "grad_norm": 0.9998685717582703, + "learning_rate": 4.958889431322542e-06, + "loss": 0.6832, + "step": 1279 + }, + { + "epoch": 0.3747072599531616, + "grad_norm": 1.024307131767273, + "learning_rate": 4.958819855886927e-06, + "loss": 0.7082, + "step": 1280 + }, + { + "epoch": 0.375, + "grad_norm": 0.987759530544281, + "learning_rate": 4.958750222115339e-06, + "loss": 0.6976, + "step": 1281 + }, + { + "epoch": 0.3752927400468384, + "grad_norm": 0.9874844551086426, + "learning_rate": 4.958680530009429e-06, + "loss": 0.6867, + "step": 1282 + }, + { + "epoch": 0.3755854800936768, + "grad_norm": 0.9834856390953064, + "learning_rate": 4.958610779570851e-06, + "loss": 0.716, + "step": 1283 + }, + { + "epoch": 0.3758782201405152, + "grad_norm": 1.0532090663909912, + "learning_rate": 4.95854097080126e-06, + "loss": 0.7149, + "step": 1284 + }, + { + "epoch": 0.37617096018735363, + "grad_norm": 0.950659990310669, + "learning_rate": 4.958471103702311e-06, + "loss": 0.673, + "step": 1285 + }, + { + "epoch": 0.376463700234192, + "grad_norm": 0.992885172367096, + "learning_rate": 4.958401178275665e-06, + "loss": 0.7069, + "step": 1286 + }, + { + "epoch": 0.37675644028103045, + "grad_norm": 1.004529595375061, + "learning_rate": 4.9583311945229775e-06, + "loss": 0.6801, + "step": 1287 + }, + { + "epoch": 0.3770491803278688, + "grad_norm": 1.0065760612487793, + "learning_rate": 4.9582611524459105e-06, + "loss": 0.7434, + "step": 1288 + }, + { + "epoch": 0.37734192037470726, + "grad_norm": 0.9879124164581299, + "learning_rate": 4.958191052046124e-06, + "loss": 0.6935, + "step": 1289 + }, + { + "epoch": 0.37763466042154564, + "grad_norm": 0.9850251078605652, + "learning_rate": 4.958120893325285e-06, + "loss": 0.6816, + "step": 1290 + }, + { + "epoch": 0.3779274004683841, + "grad_norm": 1.1849783658981323, + "learning_rate": 4.958050676285053e-06, + "loss": 0.7155, + "step": 1291 + }, + { + "epoch": 0.37822014051522246, + "grad_norm": 1.0009607076644897, + "learning_rate": 4.957980400927099e-06, + "loss": 0.6795, + "step": 1292 + }, + { + "epoch": 0.3785128805620609, + "grad_norm": 1.020949363708496, + "learning_rate": 4.957910067253086e-06, + "loss": 0.7215, + "step": 1293 + }, + { + "epoch": 0.3788056206088993, + "grad_norm": 1.0087758302688599, + "learning_rate": 4.957839675264684e-06, + "loss": 0.6944, + "step": 1294 + }, + { + "epoch": 0.3790983606557377, + "grad_norm": 1.0029518604278564, + "learning_rate": 4.957769224963565e-06, + "loss": 0.646, + "step": 1295 + }, + { + "epoch": 0.3793911007025761, + "grad_norm": 0.9950380921363831, + "learning_rate": 4.9576987163513975e-06, + "loss": 0.6999, + "step": 1296 + }, + { + "epoch": 0.3796838407494145, + "grad_norm": 1.0247745513916016, + "learning_rate": 4.957628149429857e-06, + "loss": 0.6832, + "step": 1297 + }, + { + "epoch": 0.3799765807962529, + "grad_norm": 1.0316559076309204, + "learning_rate": 4.957557524200616e-06, + "loss": 0.7206, + "step": 1298 + }, + { + "epoch": 0.38026932084309134, + "grad_norm": 1.0172219276428223, + "learning_rate": 4.95748684066535e-06, + "loss": 0.668, + "step": 1299 + }, + { + "epoch": 0.3805620608899297, + "grad_norm": 0.9712539911270142, + "learning_rate": 4.957416098825738e-06, + "loss": 0.654, + "step": 1300 + }, + { + "epoch": 0.38085480093676816, + "grad_norm": 1.1325297355651855, + "learning_rate": 4.957345298683456e-06, + "loss": 0.7098, + "step": 1301 + }, + { + "epoch": 0.38114754098360654, + "grad_norm": 0.9968823194503784, + "learning_rate": 4.957274440240184e-06, + "loss": 0.6419, + "step": 1302 + }, + { + "epoch": 0.38144028103044497, + "grad_norm": 0.9789109230041504, + "learning_rate": 4.957203523497605e-06, + "loss": 0.6842, + "step": 1303 + }, + { + "epoch": 0.38173302107728335, + "grad_norm": 1.0404905080795288, + "learning_rate": 4.9571325484574e-06, + "loss": 0.7011, + "step": 1304 + }, + { + "epoch": 0.3820257611241218, + "grad_norm": 0.981388509273529, + "learning_rate": 4.957061515121253e-06, + "loss": 0.6912, + "step": 1305 + }, + { + "epoch": 0.38231850117096017, + "grad_norm": 1.0965113639831543, + "learning_rate": 4.9569904234908495e-06, + "loss": 0.7217, + "step": 1306 + }, + { + "epoch": 0.3826112412177986, + "grad_norm": 0.9995091557502747, + "learning_rate": 4.956919273567876e-06, + "loss": 0.6558, + "step": 1307 + }, + { + "epoch": 0.382903981264637, + "grad_norm": 1.069390892982483, + "learning_rate": 4.9568480653540215e-06, + "loss": 0.7349, + "step": 1308 + }, + { + "epoch": 0.3831967213114754, + "grad_norm": 0.971950113773346, + "learning_rate": 4.956776798850974e-06, + "loss": 0.6974, + "step": 1309 + }, + { + "epoch": 0.3834894613583138, + "grad_norm": 1.0412019491195679, + "learning_rate": 4.9567054740604244e-06, + "loss": 0.6746, + "step": 1310 + }, + { + "epoch": 0.38378220140515223, + "grad_norm": 1.0848901271820068, + "learning_rate": 4.9566340909840675e-06, + "loss": 0.6835, + "step": 1311 + }, + { + "epoch": 0.3840749414519906, + "grad_norm": 1.0173029899597168, + "learning_rate": 4.956562649623593e-06, + "loss": 0.6906, + "step": 1312 + }, + { + "epoch": 0.38436768149882905, + "grad_norm": 1.0035003423690796, + "learning_rate": 4.956491149980698e-06, + "loss": 0.6397, + "step": 1313 + }, + { + "epoch": 0.38466042154566743, + "grad_norm": 1.0627820491790771, + "learning_rate": 4.956419592057079e-06, + "loss": 0.7466, + "step": 1314 + }, + { + "epoch": 0.38495316159250587, + "grad_norm": 1.0435189008712769, + "learning_rate": 4.956347975854432e-06, + "loss": 0.714, + "step": 1315 + }, + { + "epoch": 0.38524590163934425, + "grad_norm": 1.0844056606292725, + "learning_rate": 4.956276301374459e-06, + "loss": 0.6761, + "step": 1316 + }, + { + "epoch": 0.3855386416861827, + "grad_norm": 1.0055752992630005, + "learning_rate": 4.956204568618857e-06, + "loss": 0.6616, + "step": 1317 + }, + { + "epoch": 0.38583138173302106, + "grad_norm": 1.029035210609436, + "learning_rate": 4.956132777589332e-06, + "loss": 0.7414, + "step": 1318 + }, + { + "epoch": 0.3861241217798595, + "grad_norm": 0.9952924251556396, + "learning_rate": 4.956060928287583e-06, + "loss": 0.698, + "step": 1319 + }, + { + "epoch": 0.3864168618266979, + "grad_norm": 0.9879747033119202, + "learning_rate": 4.9559890207153185e-06, + "loss": 0.6549, + "step": 1320 + }, + { + "epoch": 0.3867096018735363, + "grad_norm": 0.9698799252510071, + "learning_rate": 4.955917054874241e-06, + "loss": 0.677, + "step": 1321 + }, + { + "epoch": 0.3870023419203747, + "grad_norm": 1.0010823011398315, + "learning_rate": 4.95584503076606e-06, + "loss": 0.686, + "step": 1322 + }, + { + "epoch": 0.38729508196721313, + "grad_norm": 1.0007860660552979, + "learning_rate": 4.955772948392484e-06, + "loss": 0.6841, + "step": 1323 + }, + { + "epoch": 0.3875878220140515, + "grad_norm": 1.1146610975265503, + "learning_rate": 4.955700807755223e-06, + "loss": 0.7599, + "step": 1324 + }, + { + "epoch": 0.38788056206088994, + "grad_norm": 0.9992092847824097, + "learning_rate": 4.955628608855989e-06, + "loss": 0.691, + "step": 1325 + }, + { + "epoch": 0.3881733021077283, + "grad_norm": 1.0268770456314087, + "learning_rate": 4.955556351696493e-06, + "loss": 0.6953, + "step": 1326 + }, + { + "epoch": 0.38846604215456676, + "grad_norm": 1.0649701356887817, + "learning_rate": 4.9554840362784514e-06, + "loss": 0.7249, + "step": 1327 + }, + { + "epoch": 0.38875878220140514, + "grad_norm": 1.0714694261550903, + "learning_rate": 4.955411662603581e-06, + "loss": 0.7341, + "step": 1328 + }, + { + "epoch": 0.3890515222482436, + "grad_norm": 1.0157561302185059, + "learning_rate": 4.9553392306735945e-06, + "loss": 0.6954, + "step": 1329 + }, + { + "epoch": 0.38934426229508196, + "grad_norm": 1.028476595878601, + "learning_rate": 4.955266740490214e-06, + "loss": 0.7187, + "step": 1330 + }, + { + "epoch": 0.3896370023419204, + "grad_norm": 1.0999784469604492, + "learning_rate": 4.955194192055159e-06, + "loss": 0.7469, + "step": 1331 + }, + { + "epoch": 0.38992974238875877, + "grad_norm": 1.022863507270813, + "learning_rate": 4.95512158537015e-06, + "loss": 0.6764, + "step": 1332 + }, + { + "epoch": 0.3902224824355972, + "grad_norm": 1.0356369018554688, + "learning_rate": 4.955048920436909e-06, + "loss": 0.6805, + "step": 1333 + }, + { + "epoch": 0.3905152224824356, + "grad_norm": 1.0175625085830688, + "learning_rate": 4.954976197257162e-06, + "loss": 0.6897, + "step": 1334 + }, + { + "epoch": 0.390807962529274, + "grad_norm": 1.178212285041809, + "learning_rate": 4.954903415832632e-06, + "loss": 0.6841, + "step": 1335 + }, + { + "epoch": 0.3911007025761124, + "grad_norm": 1.0278284549713135, + "learning_rate": 4.954830576165047e-06, + "loss": 0.7146, + "step": 1336 + }, + { + "epoch": 0.39139344262295084, + "grad_norm": 0.9879711866378784, + "learning_rate": 4.9547576782561355e-06, + "loss": 0.6996, + "step": 1337 + }, + { + "epoch": 0.3916861826697892, + "grad_norm": 1.0813645124435425, + "learning_rate": 4.954684722107625e-06, + "loss": 0.735, + "step": 1338 + }, + { + "epoch": 0.39197892271662765, + "grad_norm": 1.0459017753601074, + "learning_rate": 4.95461170772125e-06, + "loss": 0.7359, + "step": 1339 + }, + { + "epoch": 0.39227166276346603, + "grad_norm": 1.0373607873916626, + "learning_rate": 4.954538635098739e-06, + "loss": 0.7155, + "step": 1340 + }, + { + "epoch": 0.39256440281030447, + "grad_norm": 1.0348507165908813, + "learning_rate": 4.954465504241828e-06, + "loss": 0.6659, + "step": 1341 + }, + { + "epoch": 0.39285714285714285, + "grad_norm": 1.0404537916183472, + "learning_rate": 4.954392315152251e-06, + "loss": 0.6577, + "step": 1342 + }, + { + "epoch": 0.3931498829039813, + "grad_norm": 1.1089251041412354, + "learning_rate": 4.954319067831745e-06, + "loss": 0.7286, + "step": 1343 + }, + { + "epoch": 0.39344262295081966, + "grad_norm": 1.0182774066925049, + "learning_rate": 4.954245762282047e-06, + "loss": 0.6605, + "step": 1344 + }, + { + "epoch": 0.3937353629976581, + "grad_norm": 1.0312526226043701, + "learning_rate": 4.954172398504898e-06, + "loss": 0.6895, + "step": 1345 + }, + { + "epoch": 0.3940281030444965, + "grad_norm": 1.0343512296676636, + "learning_rate": 4.954098976502037e-06, + "loss": 0.6708, + "step": 1346 + }, + { + "epoch": 0.3943208430913349, + "grad_norm": 1.032578945159912, + "learning_rate": 4.954025496275206e-06, + "loss": 0.6731, + "step": 1347 + }, + { + "epoch": 0.3946135831381733, + "grad_norm": 1.022489070892334, + "learning_rate": 4.953951957826148e-06, + "loss": 0.6376, + "step": 1348 + }, + { + "epoch": 0.39490632318501173, + "grad_norm": 1.0826354026794434, + "learning_rate": 4.953878361156609e-06, + "loss": 0.7313, + "step": 1349 + }, + { + "epoch": 0.3951990632318501, + "grad_norm": 1.075745940208435, + "learning_rate": 4.953804706268334e-06, + "loss": 0.712, + "step": 1350 + }, + { + "epoch": 0.39549180327868855, + "grad_norm": 0.980183482170105, + "learning_rate": 4.953730993163071e-06, + "loss": 0.6708, + "step": 1351 + }, + { + "epoch": 0.3957845433255269, + "grad_norm": 0.9741186499595642, + "learning_rate": 4.953657221842569e-06, + "loss": 0.686, + "step": 1352 + }, + { + "epoch": 0.39607728337236536, + "grad_norm": 1.009213924407959, + "learning_rate": 4.9535833923085785e-06, + "loss": 0.6486, + "step": 1353 + }, + { + "epoch": 0.39637002341920374, + "grad_norm": 1.0246530771255493, + "learning_rate": 4.95350950456285e-06, + "loss": 0.7376, + "step": 1354 + }, + { + "epoch": 0.3966627634660422, + "grad_norm": 1.0157349109649658, + "learning_rate": 4.9534355586071374e-06, + "loss": 0.7295, + "step": 1355 + }, + { + "epoch": 0.39695550351288056, + "grad_norm": 1.0243022441864014, + "learning_rate": 4.953361554443195e-06, + "loss": 0.6965, + "step": 1356 + }, + { + "epoch": 0.397248243559719, + "grad_norm": 1.0439339876174927, + "learning_rate": 4.953287492072778e-06, + "loss": 0.7183, + "step": 1357 + }, + { + "epoch": 0.3975409836065574, + "grad_norm": 0.9906116127967834, + "learning_rate": 4.9532133714976435e-06, + "loss": 0.7079, + "step": 1358 + }, + { + "epoch": 0.3978337236533958, + "grad_norm": 1.0113129615783691, + "learning_rate": 4.953139192719552e-06, + "loss": 0.726, + "step": 1359 + }, + { + "epoch": 0.3981264637002342, + "grad_norm": 1.0061874389648438, + "learning_rate": 4.95306495574026e-06, + "loss": 0.7184, + "step": 1360 + }, + { + "epoch": 0.3984192037470726, + "grad_norm": 1.040701985359192, + "learning_rate": 4.952990660561532e-06, + "loss": 0.7627, + "step": 1361 + }, + { + "epoch": 0.398711943793911, + "grad_norm": 1.0932954549789429, + "learning_rate": 4.952916307185129e-06, + "loss": 0.7051, + "step": 1362 + }, + { + "epoch": 0.39900468384074944, + "grad_norm": 1.0392587184906006, + "learning_rate": 4.952841895612815e-06, + "loss": 0.7144, + "step": 1363 + }, + { + "epoch": 0.3992974238875878, + "grad_norm": 1.0034656524658203, + "learning_rate": 4.9527674258463555e-06, + "loss": 0.6664, + "step": 1364 + }, + { + "epoch": 0.39959016393442626, + "grad_norm": 1.0798996686935425, + "learning_rate": 4.9526928978875186e-06, + "loss": 0.7064, + "step": 1365 + }, + { + "epoch": 0.39988290398126464, + "grad_norm": 1.009746789932251, + "learning_rate": 4.9526183117380705e-06, + "loss": 0.7141, + "step": 1366 + }, + { + "epoch": 0.400175644028103, + "grad_norm": 1.0383254289627075, + "learning_rate": 4.952543667399782e-06, + "loss": 0.7318, + "step": 1367 + }, + { + "epoch": 0.40046838407494145, + "grad_norm": 0.9931860566139221, + "learning_rate": 4.952468964874424e-06, + "loss": 0.6788, + "step": 1368 + }, + { + "epoch": 0.40076112412177983, + "grad_norm": 1.063928484916687, + "learning_rate": 4.952394204163769e-06, + "loss": 0.7162, + "step": 1369 + }, + { + "epoch": 0.40105386416861827, + "grad_norm": 1.1055129766464233, + "learning_rate": 4.95231938526959e-06, + "loss": 0.7, + "step": 1370 + }, + { + "epoch": 0.40134660421545665, + "grad_norm": 1.0497889518737793, + "learning_rate": 4.952244508193662e-06, + "loss": 0.6859, + "step": 1371 + }, + { + "epoch": 0.4016393442622951, + "grad_norm": 1.0535171031951904, + "learning_rate": 4.952169572937763e-06, + "loss": 0.6845, + "step": 1372 + }, + { + "epoch": 0.40193208430913346, + "grad_norm": 1.0177236795425415, + "learning_rate": 4.952094579503668e-06, + "loss": 0.7083, + "step": 1373 + }, + { + "epoch": 0.4022248243559719, + "grad_norm": 1.0435822010040283, + "learning_rate": 4.952019527893159e-06, + "loss": 0.6757, + "step": 1374 + }, + { + "epoch": 0.4025175644028103, + "grad_norm": 1.0963902473449707, + "learning_rate": 4.951944418108016e-06, + "loss": 0.7037, + "step": 1375 + }, + { + "epoch": 0.4028103044496487, + "grad_norm": 1.0197752714157104, + "learning_rate": 4.9518692501500206e-06, + "loss": 0.7015, + "step": 1376 + }, + { + "epoch": 0.4031030444964871, + "grad_norm": 1.0328961610794067, + "learning_rate": 4.951794024020956e-06, + "loss": 0.6644, + "step": 1377 + }, + { + "epoch": 0.40339578454332553, + "grad_norm": 1.0127259492874146, + "learning_rate": 4.951718739722606e-06, + "loss": 0.7258, + "step": 1378 + }, + { + "epoch": 0.4036885245901639, + "grad_norm": 1.0141851902008057, + "learning_rate": 4.951643397256759e-06, + "loss": 0.6731, + "step": 1379 + }, + { + "epoch": 0.40398126463700235, + "grad_norm": 1.0118669271469116, + "learning_rate": 4.9515679966252015e-06, + "loss": 0.7284, + "step": 1380 + }, + { + "epoch": 0.4042740046838407, + "grad_norm": 1.0363401174545288, + "learning_rate": 4.9514925378297224e-06, + "loss": 0.7248, + "step": 1381 + }, + { + "epoch": 0.40456674473067916, + "grad_norm": 1.0223392248153687, + "learning_rate": 4.951417020872111e-06, + "loss": 0.7098, + "step": 1382 + }, + { + "epoch": 0.40485948477751754, + "grad_norm": 1.0494436025619507, + "learning_rate": 4.951341445754161e-06, + "loss": 0.6567, + "step": 1383 + }, + { + "epoch": 0.405152224824356, + "grad_norm": 1.0072554349899292, + "learning_rate": 4.9512658124776635e-06, + "loss": 0.6253, + "step": 1384 + }, + { + "epoch": 0.40544496487119436, + "grad_norm": 1.0160539150238037, + "learning_rate": 4.9511901210444135e-06, + "loss": 0.7245, + "step": 1385 + }, + { + "epoch": 0.4057377049180328, + "grad_norm": 1.0147802829742432, + "learning_rate": 4.951114371456207e-06, + "loss": 0.7121, + "step": 1386 + }, + { + "epoch": 0.4060304449648712, + "grad_norm": 1.038943886756897, + "learning_rate": 4.9510385637148415e-06, + "loss": 0.6795, + "step": 1387 + }, + { + "epoch": 0.4063231850117096, + "grad_norm": 1.0161031484603882, + "learning_rate": 4.950962697822115e-06, + "loss": 0.711, + "step": 1388 + }, + { + "epoch": 0.406615925058548, + "grad_norm": 1.0262399911880493, + "learning_rate": 4.950886773779827e-06, + "loss": 0.7139, + "step": 1389 + }, + { + "epoch": 0.4069086651053864, + "grad_norm": 1.0611735582351685, + "learning_rate": 4.9508107915897805e-06, + "loss": 0.7688, + "step": 1390 + }, + { + "epoch": 0.4072014051522248, + "grad_norm": 1.0697635412216187, + "learning_rate": 4.950734751253776e-06, + "loss": 0.7293, + "step": 1391 + }, + { + "epoch": 0.40749414519906324, + "grad_norm": 1.0215036869049072, + "learning_rate": 4.95065865277362e-06, + "loss": 0.72, + "step": 1392 + }, + { + "epoch": 0.4077868852459016, + "grad_norm": 1.0687167644500732, + "learning_rate": 4.950582496151116e-06, + "loss": 0.6702, + "step": 1393 + }, + { + "epoch": 0.40807962529274006, + "grad_norm": 0.9893348813056946, + "learning_rate": 4.950506281388071e-06, + "loss": 0.6887, + "step": 1394 + }, + { + "epoch": 0.40837236533957844, + "grad_norm": 0.99796062707901, + "learning_rate": 4.950430008486295e-06, + "loss": 0.6817, + "step": 1395 + }, + { + "epoch": 0.40866510538641687, + "grad_norm": 1.0028623342514038, + "learning_rate": 4.950353677447595e-06, + "loss": 0.6776, + "step": 1396 + }, + { + "epoch": 0.40895784543325525, + "grad_norm": 1.0369467735290527, + "learning_rate": 4.9502772882737835e-06, + "loss": 0.7628, + "step": 1397 + }, + { + "epoch": 0.4092505854800937, + "grad_norm": 0.9765019416809082, + "learning_rate": 4.950200840966672e-06, + "loss": 0.6757, + "step": 1398 + }, + { + "epoch": 0.40954332552693207, + "grad_norm": 1.0292922258377075, + "learning_rate": 4.950124335528076e-06, + "loss": 0.7018, + "step": 1399 + }, + { + "epoch": 0.4098360655737705, + "grad_norm": 0.9495410323143005, + "learning_rate": 4.950047771959809e-06, + "loss": 0.6597, + "step": 1400 + }, + { + "epoch": 0.4101288056206089, + "grad_norm": 1.014998435974121, + "learning_rate": 4.949971150263688e-06, + "loss": 0.6792, + "step": 1401 + }, + { + "epoch": 0.4104215456674473, + "grad_norm": 1.04826021194458, + "learning_rate": 4.949894470441531e-06, + "loss": 0.7274, + "step": 1402 + }, + { + "epoch": 0.4107142857142857, + "grad_norm": 1.0338375568389893, + "learning_rate": 4.949817732495156e-06, + "loss": 0.6926, + "step": 1403 + }, + { + "epoch": 0.41100702576112413, + "grad_norm": 1.0668671131134033, + "learning_rate": 4.949740936426385e-06, + "loss": 0.6865, + "step": 1404 + }, + { + "epoch": 0.4112997658079625, + "grad_norm": 1.1016768217086792, + "learning_rate": 4.94966408223704e-06, + "loss": 0.7121, + "step": 1405 + }, + { + "epoch": 0.41159250585480095, + "grad_norm": 1.0366102457046509, + "learning_rate": 4.949587169928944e-06, + "loss": 0.7239, + "step": 1406 + }, + { + "epoch": 0.41188524590163933, + "grad_norm": 0.9636387825012207, + "learning_rate": 4.949510199503922e-06, + "loss": 0.6574, + "step": 1407 + }, + { + "epoch": 0.41217798594847777, + "grad_norm": 1.1051698923110962, + "learning_rate": 4.949433170963799e-06, + "loss": 0.6978, + "step": 1408 + }, + { + "epoch": 0.41247072599531615, + "grad_norm": 1.0142911672592163, + "learning_rate": 4.949356084310404e-06, + "loss": 0.68, + "step": 1409 + }, + { + "epoch": 0.4127634660421546, + "grad_norm": 1.0869536399841309, + "learning_rate": 4.949278939545566e-06, + "loss": 0.7377, + "step": 1410 + }, + { + "epoch": 0.41305620608899296, + "grad_norm": 1.0136380195617676, + "learning_rate": 4.949201736671113e-06, + "loss": 0.7388, + "step": 1411 + }, + { + "epoch": 0.4133489461358314, + "grad_norm": 1.0259995460510254, + "learning_rate": 4.949124475688879e-06, + "loss": 0.7132, + "step": 1412 + }, + { + "epoch": 0.4136416861826698, + "grad_norm": 1.0438123941421509, + "learning_rate": 4.949047156600697e-06, + "loss": 0.7509, + "step": 1413 + }, + { + "epoch": 0.4139344262295082, + "grad_norm": 1.0052317380905151, + "learning_rate": 4.9489697794084005e-06, + "loss": 0.6932, + "step": 1414 + }, + { + "epoch": 0.4142271662763466, + "grad_norm": 1.061414361000061, + "learning_rate": 4.948892344113825e-06, + "loss": 0.7038, + "step": 1415 + }, + { + "epoch": 0.41451990632318503, + "grad_norm": 1.043354868888855, + "learning_rate": 4.9488148507188085e-06, + "loss": 0.7367, + "step": 1416 + }, + { + "epoch": 0.4148126463700234, + "grad_norm": 0.995106041431427, + "learning_rate": 4.948737299225188e-06, + "loss": 0.7389, + "step": 1417 + }, + { + "epoch": 0.41510538641686184, + "grad_norm": 1.0106801986694336, + "learning_rate": 4.948659689634806e-06, + "loss": 0.6288, + "step": 1418 + }, + { + "epoch": 0.4153981264637002, + "grad_norm": 1.0475387573242188, + "learning_rate": 4.948582021949502e-06, + "loss": 0.7395, + "step": 1419 + }, + { + "epoch": 0.41569086651053866, + "grad_norm": 1.0015791654586792, + "learning_rate": 4.94850429617112e-06, + "loss": 0.6739, + "step": 1420 + }, + { + "epoch": 0.41598360655737704, + "grad_norm": 1.0564920902252197, + "learning_rate": 4.948426512301502e-06, + "loss": 0.7497, + "step": 1421 + }, + { + "epoch": 0.4162763466042155, + "grad_norm": 1.0028114318847656, + "learning_rate": 4.948348670342495e-06, + "loss": 0.6477, + "step": 1422 + }, + { + "epoch": 0.41656908665105385, + "grad_norm": 1.0562057495117188, + "learning_rate": 4.948270770295944e-06, + "loss": 0.6554, + "step": 1423 + }, + { + "epoch": 0.4168618266978923, + "grad_norm": 0.9578866958618164, + "learning_rate": 4.9481928121637e-06, + "loss": 0.6557, + "step": 1424 + }, + { + "epoch": 0.41715456674473067, + "grad_norm": 0.9793127179145813, + "learning_rate": 4.948114795947612e-06, + "loss": 0.6725, + "step": 1425 + }, + { + "epoch": 0.4174473067915691, + "grad_norm": 1.1191742420196533, + "learning_rate": 4.948036721649529e-06, + "loss": 0.6455, + "step": 1426 + }, + { + "epoch": 0.4177400468384075, + "grad_norm": 0.993094265460968, + "learning_rate": 4.947958589271304e-06, + "loss": 0.6804, + "step": 1427 + }, + { + "epoch": 0.4180327868852459, + "grad_norm": 1.003902554512024, + "learning_rate": 4.947880398814793e-06, + "loss": 0.7189, + "step": 1428 + }, + { + "epoch": 0.4183255269320843, + "grad_norm": 1.0005580186843872, + "learning_rate": 4.9478021502818475e-06, + "loss": 0.6798, + "step": 1429 + }, + { + "epoch": 0.41861826697892274, + "grad_norm": 1.0002140998840332, + "learning_rate": 4.947723843674327e-06, + "loss": 0.7166, + "step": 1430 + }, + { + "epoch": 0.4189110070257611, + "grad_norm": 1.0051710605621338, + "learning_rate": 4.947645478994086e-06, + "loss": 0.6841, + "step": 1431 + }, + { + "epoch": 0.41920374707259955, + "grad_norm": 1.0240815877914429, + "learning_rate": 4.947567056242988e-06, + "loss": 0.6935, + "step": 1432 + }, + { + "epoch": 0.41949648711943793, + "grad_norm": 0.96775221824646, + "learning_rate": 4.94748857542289e-06, + "loss": 0.6506, + "step": 1433 + }, + { + "epoch": 0.41978922716627637, + "grad_norm": 1.0350052118301392, + "learning_rate": 4.947410036535656e-06, + "loss": 0.6424, + "step": 1434 + }, + { + "epoch": 0.42008196721311475, + "grad_norm": 1.0454398393630981, + "learning_rate": 4.947331439583149e-06, + "loss": 0.6765, + "step": 1435 + }, + { + "epoch": 0.4203747072599532, + "grad_norm": 0.9981266260147095, + "learning_rate": 4.9472527845672324e-06, + "loss": 0.7264, + "step": 1436 + }, + { + "epoch": 0.42066744730679156, + "grad_norm": 1.036076307296753, + "learning_rate": 4.947174071489774e-06, + "loss": 0.7088, + "step": 1437 + }, + { + "epoch": 0.42096018735363, + "grad_norm": 1.0715161561965942, + "learning_rate": 4.94709530035264e-06, + "loss": 0.7214, + "step": 1438 + }, + { + "epoch": 0.4212529274004684, + "grad_norm": 0.9655546545982361, + "learning_rate": 4.947016471157701e-06, + "loss": 0.6764, + "step": 1439 + }, + { + "epoch": 0.4215456674473068, + "grad_norm": 1.0161850452423096, + "learning_rate": 4.946937583906825e-06, + "loss": 0.7004, + "step": 1440 + }, + { + "epoch": 0.4218384074941452, + "grad_norm": 0.9599172472953796, + "learning_rate": 4.946858638601885e-06, + "loss": 0.6223, + "step": 1441 + }, + { + "epoch": 0.42213114754098363, + "grad_norm": 0.9867199659347534, + "learning_rate": 4.946779635244754e-06, + "loss": 0.6853, + "step": 1442 + }, + { + "epoch": 0.422423887587822, + "grad_norm": 1.1154359579086304, + "learning_rate": 4.9467005738373055e-06, + "loss": 0.7425, + "step": 1443 + }, + { + "epoch": 0.42271662763466045, + "grad_norm": 0.9692285656929016, + "learning_rate": 4.946621454381416e-06, + "loss": 0.7145, + "step": 1444 + }, + { + "epoch": 0.4230093676814988, + "grad_norm": 1.0348433256149292, + "learning_rate": 4.946542276878963e-06, + "loss": 0.6698, + "step": 1445 + }, + { + "epoch": 0.42330210772833726, + "grad_norm": 0.9604342579841614, + "learning_rate": 4.9464630413318235e-06, + "loss": 0.6662, + "step": 1446 + }, + { + "epoch": 0.42359484777517564, + "grad_norm": 1.0711063146591187, + "learning_rate": 4.94638374774188e-06, + "loss": 0.7108, + "step": 1447 + }, + { + "epoch": 0.4238875878220141, + "grad_norm": 1.019213080406189, + "learning_rate": 4.94630439611101e-06, + "loss": 0.7202, + "step": 1448 + }, + { + "epoch": 0.42418032786885246, + "grad_norm": 1.0555568933486938, + "learning_rate": 4.946224986441099e-06, + "loss": 0.6581, + "step": 1449 + }, + { + "epoch": 0.4244730679156909, + "grad_norm": 1.0802351236343384, + "learning_rate": 4.946145518734031e-06, + "loss": 0.6773, + "step": 1450 + }, + { + "epoch": 0.4247658079625293, + "grad_norm": 0.9957408308982849, + "learning_rate": 4.9460659929916895e-06, + "loss": 0.6942, + "step": 1451 + }, + { + "epoch": 0.42505854800936765, + "grad_norm": 0.9928397536277771, + "learning_rate": 4.945986409215964e-06, + "loss": 0.6574, + "step": 1452 + }, + { + "epoch": 0.4253512880562061, + "grad_norm": 1.0508562326431274, + "learning_rate": 4.945906767408739e-06, + "loss": 0.7326, + "step": 1453 + }, + { + "epoch": 0.42564402810304447, + "grad_norm": 0.993266761302948, + "learning_rate": 4.9458270675719075e-06, + "loss": 0.6911, + "step": 1454 + }, + { + "epoch": 0.4259367681498829, + "grad_norm": 1.174735188484192, + "learning_rate": 4.945747309707358e-06, + "loss": 0.699, + "step": 1455 + }, + { + "epoch": 0.4262295081967213, + "grad_norm": 0.9985058903694153, + "learning_rate": 4.945667493816985e-06, + "loss": 0.7154, + "step": 1456 + }, + { + "epoch": 0.4265222482435597, + "grad_norm": 0.9808061718940735, + "learning_rate": 4.945587619902679e-06, + "loss": 0.66, + "step": 1457 + }, + { + "epoch": 0.4268149882903981, + "grad_norm": 1.0333631038665771, + "learning_rate": 4.9455076879663375e-06, + "loss": 0.7051, + "step": 1458 + }, + { + "epoch": 0.42710772833723654, + "grad_norm": 1.2876067161560059, + "learning_rate": 4.945427698009857e-06, + "loss": 0.7409, + "step": 1459 + }, + { + "epoch": 0.4274004683840749, + "grad_norm": 0.9942845106124878, + "learning_rate": 4.945347650035134e-06, + "loss": 0.7025, + "step": 1460 + }, + { + "epoch": 0.42769320843091335, + "grad_norm": 1.0109423398971558, + "learning_rate": 4.945267544044069e-06, + "loss": 0.7448, + "step": 1461 + }, + { + "epoch": 0.42798594847775173, + "grad_norm": 0.9859521985054016, + "learning_rate": 4.94518738003856e-06, + "loss": 0.6809, + "step": 1462 + }, + { + "epoch": 0.42827868852459017, + "grad_norm": 1.0371966361999512, + "learning_rate": 4.945107158020512e-06, + "loss": 0.7004, + "step": 1463 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 1.0799295902252197, + "learning_rate": 4.9450268779918265e-06, + "loss": 0.6589, + "step": 1464 + }, + { + "epoch": 0.428864168618267, + "grad_norm": 0.9889400601387024, + "learning_rate": 4.944946539954409e-06, + "loss": 0.6972, + "step": 1465 + }, + { + "epoch": 0.42915690866510536, + "grad_norm": 1.0021352767944336, + "learning_rate": 4.944866143910164e-06, + "loss": 0.707, + "step": 1466 + }, + { + "epoch": 0.4294496487119438, + "grad_norm": 1.0689431428909302, + "learning_rate": 4.944785689861001e-06, + "loss": 0.692, + "step": 1467 + }, + { + "epoch": 0.4297423887587822, + "grad_norm": 1.060151219367981, + "learning_rate": 4.944705177808828e-06, + "loss": 0.6468, + "step": 1468 + }, + { + "epoch": 0.4300351288056206, + "grad_norm": 1.09408700466156, + "learning_rate": 4.944624607755555e-06, + "loss": 0.7137, + "step": 1469 + }, + { + "epoch": 0.430327868852459, + "grad_norm": 1.0325082540512085, + "learning_rate": 4.944543979703093e-06, + "loss": 0.7092, + "step": 1470 + }, + { + "epoch": 0.43062060889929743, + "grad_norm": 1.0044955015182495, + "learning_rate": 4.944463293653355e-06, + "loss": 0.6633, + "step": 1471 + }, + { + "epoch": 0.4309133489461358, + "grad_norm": 1.07355797290802, + "learning_rate": 4.944382549608257e-06, + "loss": 0.7277, + "step": 1472 + }, + { + "epoch": 0.43120608899297425, + "grad_norm": 1.1203532218933105, + "learning_rate": 4.944301747569712e-06, + "loss": 0.6393, + "step": 1473 + }, + { + "epoch": 0.4314988290398126, + "grad_norm": 0.9986314177513123, + "learning_rate": 4.944220887539639e-06, + "loss": 0.6919, + "step": 1474 + }, + { + "epoch": 0.43179156908665106, + "grad_norm": 1.0235910415649414, + "learning_rate": 4.9441399695199565e-06, + "loss": 0.7389, + "step": 1475 + }, + { + "epoch": 0.43208430913348944, + "grad_norm": 0.9902118444442749, + "learning_rate": 4.9440589935125824e-06, + "loss": 0.7389, + "step": 1476 + }, + { + "epoch": 0.4323770491803279, + "grad_norm": 1.072177767753601, + "learning_rate": 4.94397795951944e-06, + "loss": 0.7258, + "step": 1477 + }, + { + "epoch": 0.43266978922716626, + "grad_norm": 1.043449878692627, + "learning_rate": 4.943896867542452e-06, + "loss": 0.662, + "step": 1478 + }, + { + "epoch": 0.4329625292740047, + "grad_norm": 1.0642788410186768, + "learning_rate": 4.943815717583539e-06, + "loss": 0.6698, + "step": 1479 + }, + { + "epoch": 0.4332552693208431, + "grad_norm": 1.0054750442504883, + "learning_rate": 4.94373450964463e-06, + "loss": 0.659, + "step": 1480 + }, + { + "epoch": 0.4335480093676815, + "grad_norm": 1.0410748720169067, + "learning_rate": 4.943653243727649e-06, + "loss": 0.7246, + "step": 1481 + }, + { + "epoch": 0.4338407494145199, + "grad_norm": 1.0093923807144165, + "learning_rate": 4.943571919834526e-06, + "loss": 0.7177, + "step": 1482 + }, + { + "epoch": 0.4341334894613583, + "grad_norm": 1.0321439504623413, + "learning_rate": 4.943490537967191e-06, + "loss": 0.6826, + "step": 1483 + }, + { + "epoch": 0.4344262295081967, + "grad_norm": 1.0558891296386719, + "learning_rate": 4.943409098127573e-06, + "loss": 0.6982, + "step": 1484 + }, + { + "epoch": 0.43471896955503514, + "grad_norm": 1.001528263092041, + "learning_rate": 4.943327600317603e-06, + "loss": 0.7024, + "step": 1485 + }, + { + "epoch": 0.4350117096018735, + "grad_norm": 1.0202277898788452, + "learning_rate": 4.943246044539218e-06, + "loss": 0.7466, + "step": 1486 + }, + { + "epoch": 0.43530444964871196, + "grad_norm": 0.9845041632652283, + "learning_rate": 4.943164430794351e-06, + "loss": 0.7212, + "step": 1487 + }, + { + "epoch": 0.43559718969555034, + "grad_norm": 0.9704400300979614, + "learning_rate": 4.943082759084939e-06, + "loss": 0.6285, + "step": 1488 + }, + { + "epoch": 0.43588992974238877, + "grad_norm": 0.9858938455581665, + "learning_rate": 4.943001029412917e-06, + "loss": 0.6915, + "step": 1489 + }, + { + "epoch": 0.43618266978922715, + "grad_norm": 1.0566909313201904, + "learning_rate": 4.942919241780228e-06, + "loss": 0.7102, + "step": 1490 + }, + { + "epoch": 0.4364754098360656, + "grad_norm": 1.0778976678848267, + "learning_rate": 4.9428373961888105e-06, + "loss": 0.737, + "step": 1491 + }, + { + "epoch": 0.43676814988290397, + "grad_norm": 1.0600908994674683, + "learning_rate": 4.942755492640606e-06, + "loss": 0.6803, + "step": 1492 + }, + { + "epoch": 0.4370608899297424, + "grad_norm": 1.0293182134628296, + "learning_rate": 4.942673531137558e-06, + "loss": 0.6706, + "step": 1493 + }, + { + "epoch": 0.4373536299765808, + "grad_norm": 1.0256255865097046, + "learning_rate": 4.942591511681612e-06, + "loss": 0.6957, + "step": 1494 + }, + { + "epoch": 0.4376463700234192, + "grad_norm": 1.0381542444229126, + "learning_rate": 4.942509434274713e-06, + "loss": 0.6955, + "step": 1495 + }, + { + "epoch": 0.4379391100702576, + "grad_norm": 1.0341978073120117, + "learning_rate": 4.942427298918807e-06, + "loss": 0.722, + "step": 1496 + }, + { + "epoch": 0.43823185011709603, + "grad_norm": 1.0584276914596558, + "learning_rate": 4.942345105615845e-06, + "loss": 0.7058, + "step": 1497 + }, + { + "epoch": 0.4385245901639344, + "grad_norm": 1.0048742294311523, + "learning_rate": 4.942262854367777e-06, + "loss": 0.6308, + "step": 1498 + }, + { + "epoch": 0.43881733021077285, + "grad_norm": 1.0451935529708862, + "learning_rate": 4.942180545176553e-06, + "loss": 0.7004, + "step": 1499 + }, + { + "epoch": 0.43911007025761123, + "grad_norm": 1.0284167528152466, + "learning_rate": 4.942098178044126e-06, + "loss": 0.6527, + "step": 1500 + }, + { + "epoch": 0.43940281030444966, + "grad_norm": 1.0171562433242798, + "learning_rate": 4.942015752972451e-06, + "loss": 0.7325, + "step": 1501 + }, + { + "epoch": 0.43969555035128804, + "grad_norm": 0.9874509572982788, + "learning_rate": 4.9419332699634826e-06, + "loss": 0.7224, + "step": 1502 + }, + { + "epoch": 0.4399882903981265, + "grad_norm": 1.0335524082183838, + "learning_rate": 4.941850729019179e-06, + "loss": 0.7154, + "step": 1503 + }, + { + "epoch": 0.44028103044496486, + "grad_norm": 1.09645414352417, + "learning_rate": 4.941768130141497e-06, + "loss": 0.7345, + "step": 1504 + }, + { + "epoch": 0.4405737704918033, + "grad_norm": 0.9917603135108948, + "learning_rate": 4.9416854733323975e-06, + "loss": 0.6733, + "step": 1505 + }, + { + "epoch": 0.4408665105386417, + "grad_norm": 0.9950408339500427, + "learning_rate": 4.94160275859384e-06, + "loss": 0.7393, + "step": 1506 + }, + { + "epoch": 0.4411592505854801, + "grad_norm": 0.9532929062843323, + "learning_rate": 4.941519985927788e-06, + "loss": 0.6747, + "step": 1507 + }, + { + "epoch": 0.4414519906323185, + "grad_norm": 1.0053805112838745, + "learning_rate": 4.941437155336207e-06, + "loss": 0.6917, + "step": 1508 + }, + { + "epoch": 0.4417447306791569, + "grad_norm": 0.9708220362663269, + "learning_rate": 4.941354266821059e-06, + "loss": 0.6499, + "step": 1509 + }, + { + "epoch": 0.4420374707259953, + "grad_norm": 1.0017833709716797, + "learning_rate": 4.941271320384312e-06, + "loss": 0.6885, + "step": 1510 + }, + { + "epoch": 0.44233021077283374, + "grad_norm": 0.9822532534599304, + "learning_rate": 4.941188316027935e-06, + "loss": 0.6727, + "step": 1511 + }, + { + "epoch": 0.4426229508196721, + "grad_norm": 0.9832000732421875, + "learning_rate": 4.941105253753895e-06, + "loss": 0.6675, + "step": 1512 + }, + { + "epoch": 0.44291569086651056, + "grad_norm": 0.9766156673431396, + "learning_rate": 4.941022133564164e-06, + "loss": 0.6923, + "step": 1513 + }, + { + "epoch": 0.44320843091334894, + "grad_norm": 1.0529567003250122, + "learning_rate": 4.940938955460714e-06, + "loss": 0.6886, + "step": 1514 + }, + { + "epoch": 0.4435011709601874, + "grad_norm": 1.105082392692566, + "learning_rate": 4.940855719445519e-06, + "loss": 0.6949, + "step": 1515 + }, + { + "epoch": 0.44379391100702575, + "grad_norm": 1.1094459295272827, + "learning_rate": 4.940772425520553e-06, + "loss": 0.6984, + "step": 1516 + }, + { + "epoch": 0.4440866510538642, + "grad_norm": 1.0342319011688232, + "learning_rate": 4.940689073687792e-06, + "loss": 0.6747, + "step": 1517 + }, + { + "epoch": 0.44437939110070257, + "grad_norm": 0.9971280694007874, + "learning_rate": 4.9406056639492136e-06, + "loss": 0.6342, + "step": 1518 + }, + { + "epoch": 0.444672131147541, + "grad_norm": 1.0366559028625488, + "learning_rate": 4.940522196306797e-06, + "loss": 0.6996, + "step": 1519 + }, + { + "epoch": 0.4449648711943794, + "grad_norm": 0.9942964315414429, + "learning_rate": 4.940438670762523e-06, + "loss": 0.7047, + "step": 1520 + }, + { + "epoch": 0.4452576112412178, + "grad_norm": 1.0267871618270874, + "learning_rate": 4.940355087318372e-06, + "loss": 0.717, + "step": 1521 + }, + { + "epoch": 0.4455503512880562, + "grad_norm": 1.0134491920471191, + "learning_rate": 4.940271445976328e-06, + "loss": 0.6971, + "step": 1522 + }, + { + "epoch": 0.44584309133489464, + "grad_norm": 1.0228811502456665, + "learning_rate": 4.9401877467383755e-06, + "loss": 0.7092, + "step": 1523 + }, + { + "epoch": 0.446135831381733, + "grad_norm": 1.0299031734466553, + "learning_rate": 4.940103989606499e-06, + "loss": 0.7158, + "step": 1524 + }, + { + "epoch": 0.44642857142857145, + "grad_norm": 1.068807601928711, + "learning_rate": 4.940020174582687e-06, + "loss": 0.6604, + "step": 1525 + }, + { + "epoch": 0.44672131147540983, + "grad_norm": 1.0102218389511108, + "learning_rate": 4.9399363016689275e-06, + "loss": 0.6776, + "step": 1526 + }, + { + "epoch": 0.44701405152224827, + "grad_norm": 0.9894137978553772, + "learning_rate": 4.939852370867211e-06, + "loss": 0.7066, + "step": 1527 + }, + { + "epoch": 0.44730679156908665, + "grad_norm": 1.017255187034607, + "learning_rate": 4.939768382179527e-06, + "loss": 0.6474, + "step": 1528 + }, + { + "epoch": 0.4475995316159251, + "grad_norm": 0.9957756400108337, + "learning_rate": 4.939684335607871e-06, + "loss": 0.7156, + "step": 1529 + }, + { + "epoch": 0.44789227166276346, + "grad_norm": 0.9981983304023743, + "learning_rate": 4.939600231154234e-06, + "loss": 0.7114, + "step": 1530 + }, + { + "epoch": 0.4481850117096019, + "grad_norm": 1.0113753080368042, + "learning_rate": 4.939516068820613e-06, + "loss": 0.6567, + "step": 1531 + }, + { + "epoch": 0.4484777517564403, + "grad_norm": 1.0299586057662964, + "learning_rate": 4.939431848609006e-06, + "loss": 0.6623, + "step": 1532 + }, + { + "epoch": 0.4487704918032787, + "grad_norm": 0.9243226647377014, + "learning_rate": 4.939347570521408e-06, + "loss": 0.623, + "step": 1533 + }, + { + "epoch": 0.4490632318501171, + "grad_norm": 0.975844144821167, + "learning_rate": 4.9392632345598216e-06, + "loss": 0.7191, + "step": 1534 + }, + { + "epoch": 0.44935597189695553, + "grad_norm": 1.0820962190628052, + "learning_rate": 4.939178840726246e-06, + "loss": 0.7063, + "step": 1535 + }, + { + "epoch": 0.4496487119437939, + "grad_norm": 0.9873591661453247, + "learning_rate": 4.939094389022682e-06, + "loss": 0.7075, + "step": 1536 + }, + { + "epoch": 0.44994145199063235, + "grad_norm": 1.012648582458496, + "learning_rate": 4.939009879451138e-06, + "loss": 0.7244, + "step": 1537 + }, + { + "epoch": 0.4502341920374707, + "grad_norm": 0.9938149452209473, + "learning_rate": 4.938925312013614e-06, + "loss": 0.6854, + "step": 1538 + }, + { + "epoch": 0.4505269320843091, + "grad_norm": 1.010703444480896, + "learning_rate": 4.938840686712119e-06, + "loss": 0.72, + "step": 1539 + }, + { + "epoch": 0.45081967213114754, + "grad_norm": 0.9947099089622498, + "learning_rate": 4.9387560035486595e-06, + "loss": 0.6394, + "step": 1540 + }, + { + "epoch": 0.4511124121779859, + "grad_norm": 0.9868845343589783, + "learning_rate": 4.938671262525247e-06, + "loss": 0.6618, + "step": 1541 + }, + { + "epoch": 0.45140515222482436, + "grad_norm": 0.967301070690155, + "learning_rate": 4.938586463643888e-06, + "loss": 0.699, + "step": 1542 + }, + { + "epoch": 0.45169789227166274, + "grad_norm": 0.9632762670516968, + "learning_rate": 4.938501606906598e-06, + "loss": 0.6162, + "step": 1543 + }, + { + "epoch": 0.4519906323185012, + "grad_norm": 1.108443021774292, + "learning_rate": 4.9384166923153885e-06, + "loss": 0.723, + "step": 1544 + }, + { + "epoch": 0.45228337236533955, + "grad_norm": 0.9912594556808472, + "learning_rate": 4.938331719872275e-06, + "loss": 0.6465, + "step": 1545 + }, + { + "epoch": 0.452576112412178, + "grad_norm": 1.0258008241653442, + "learning_rate": 4.938246689579272e-06, + "loss": 0.674, + "step": 1546 + }, + { + "epoch": 0.45286885245901637, + "grad_norm": 0.9504169821739197, + "learning_rate": 4.938161601438399e-06, + "loss": 0.6878, + "step": 1547 + }, + { + "epoch": 0.4531615925058548, + "grad_norm": 0.9538093209266663, + "learning_rate": 4.9380764554516736e-06, + "loss": 0.6295, + "step": 1548 + }, + { + "epoch": 0.4534543325526932, + "grad_norm": 0.9537244439125061, + "learning_rate": 4.9379912516211146e-06, + "loss": 0.7371, + "step": 1549 + }, + { + "epoch": 0.4537470725995316, + "grad_norm": 0.9889068603515625, + "learning_rate": 4.937905989948746e-06, + "loss": 0.7392, + "step": 1550 + }, + { + "epoch": 0.45403981264637, + "grad_norm": 1.0168226957321167, + "learning_rate": 4.937820670436589e-06, + "loss": 0.6978, + "step": 1551 + }, + { + "epoch": 0.45433255269320844, + "grad_norm": 0.9521581530570984, + "learning_rate": 4.937735293086669e-06, + "loss": 0.7418, + "step": 1552 + }, + { + "epoch": 0.4546252927400468, + "grad_norm": 1.117502212524414, + "learning_rate": 4.937649857901009e-06, + "loss": 0.6407, + "step": 1553 + }, + { + "epoch": 0.45491803278688525, + "grad_norm": 0.9840188026428223, + "learning_rate": 4.937564364881639e-06, + "loss": 0.6532, + "step": 1554 + }, + { + "epoch": 0.45521077283372363, + "grad_norm": 1.0659668445587158, + "learning_rate": 4.9374788140305865e-06, + "loss": 0.6928, + "step": 1555 + }, + { + "epoch": 0.45550351288056207, + "grad_norm": 0.9986117482185364, + "learning_rate": 4.93739320534988e-06, + "loss": 0.6389, + "step": 1556 + }, + { + "epoch": 0.45579625292740045, + "grad_norm": 1.009304404258728, + "learning_rate": 4.937307538841551e-06, + "loss": 0.705, + "step": 1557 + }, + { + "epoch": 0.4560889929742389, + "grad_norm": 0.9829684495925903, + "learning_rate": 4.937221814507633e-06, + "loss": 0.6828, + "step": 1558 + }, + { + "epoch": 0.45638173302107726, + "grad_norm": 1.0256600379943848, + "learning_rate": 4.93713603235016e-06, + "loss": 0.6828, + "step": 1559 + }, + { + "epoch": 0.4566744730679157, + "grad_norm": 1.0268185138702393, + "learning_rate": 4.937050192371166e-06, + "loss": 0.7122, + "step": 1560 + }, + { + "epoch": 0.4569672131147541, + "grad_norm": 1.00479257106781, + "learning_rate": 4.936964294572687e-06, + "loss": 0.6827, + "step": 1561 + }, + { + "epoch": 0.4572599531615925, + "grad_norm": 1.0128870010375977, + "learning_rate": 4.936878338956763e-06, + "loss": 0.6955, + "step": 1562 + }, + { + "epoch": 0.4575526932084309, + "grad_norm": 1.011623501777649, + "learning_rate": 4.936792325525432e-06, + "loss": 0.7405, + "step": 1563 + }, + { + "epoch": 0.45784543325526933, + "grad_norm": 0.9963316321372986, + "learning_rate": 4.936706254280734e-06, + "loss": 0.6783, + "step": 1564 + }, + { + "epoch": 0.4581381733021077, + "grad_norm": 1.06641685962677, + "learning_rate": 4.936620125224713e-06, + "loss": 0.7268, + "step": 1565 + }, + { + "epoch": 0.45843091334894615, + "grad_norm": 1.0047250986099243, + "learning_rate": 4.9365339383594115e-06, + "loss": 0.7285, + "step": 1566 + }, + { + "epoch": 0.4587236533957845, + "grad_norm": 1.0601940155029297, + "learning_rate": 4.936447693686874e-06, + "loss": 0.6707, + "step": 1567 + }, + { + "epoch": 0.45901639344262296, + "grad_norm": 0.9685571789741516, + "learning_rate": 4.936361391209147e-06, + "loss": 0.6814, + "step": 1568 + }, + { + "epoch": 0.45930913348946134, + "grad_norm": 1.0203208923339844, + "learning_rate": 4.936275030928277e-06, + "loss": 0.7152, + "step": 1569 + }, + { + "epoch": 0.4596018735362998, + "grad_norm": 0.9771785736083984, + "learning_rate": 4.936188612846315e-06, + "loss": 0.7001, + "step": 1570 + }, + { + "epoch": 0.45989461358313816, + "grad_norm": 1.0437418222427368, + "learning_rate": 4.936102136965309e-06, + "loss": 0.6417, + "step": 1571 + }, + { + "epoch": 0.4601873536299766, + "grad_norm": 1.0066179037094116, + "learning_rate": 4.9360156032873125e-06, + "loss": 0.7274, + "step": 1572 + }, + { + "epoch": 0.46048009367681497, + "grad_norm": 0.9605157375335693, + "learning_rate": 4.935929011814378e-06, + "loss": 0.6424, + "step": 1573 + }, + { + "epoch": 0.4607728337236534, + "grad_norm": 0.9791153073310852, + "learning_rate": 4.935842362548558e-06, + "loss": 0.679, + "step": 1574 + }, + { + "epoch": 0.4610655737704918, + "grad_norm": 0.9500385522842407, + "learning_rate": 4.935755655491912e-06, + "loss": 0.7243, + "step": 1575 + }, + { + "epoch": 0.4613583138173302, + "grad_norm": 1.0103706121444702, + "learning_rate": 4.935668890646494e-06, + "loss": 0.706, + "step": 1576 + }, + { + "epoch": 0.4616510538641686, + "grad_norm": 1.038698673248291, + "learning_rate": 4.935582068014365e-06, + "loss": 0.7149, + "step": 1577 + }, + { + "epoch": 0.46194379391100704, + "grad_norm": 0.9846019148826599, + "learning_rate": 4.935495187597582e-06, + "loss": 0.6862, + "step": 1578 + }, + { + "epoch": 0.4622365339578454, + "grad_norm": 0.9870989918708801, + "learning_rate": 4.935408249398209e-06, + "loss": 0.6818, + "step": 1579 + }, + { + "epoch": 0.46252927400468385, + "grad_norm": 1.0507659912109375, + "learning_rate": 4.935321253418306e-06, + "loss": 0.6859, + "step": 1580 + }, + { + "epoch": 0.46282201405152223, + "grad_norm": 1.0288645029067993, + "learning_rate": 4.935234199659939e-06, + "loss": 0.6964, + "step": 1581 + }, + { + "epoch": 0.46311475409836067, + "grad_norm": 1.084138035774231, + "learning_rate": 4.935147088125174e-06, + "loss": 0.673, + "step": 1582 + }, + { + "epoch": 0.46340749414519905, + "grad_norm": 1.0091365575790405, + "learning_rate": 4.935059918816075e-06, + "loss": 0.7065, + "step": 1583 + }, + { + "epoch": 0.4637002341920375, + "grad_norm": 1.0290796756744385, + "learning_rate": 4.934972691734712e-06, + "loss": 0.6773, + "step": 1584 + }, + { + "epoch": 0.46399297423887587, + "grad_norm": 0.9908772706985474, + "learning_rate": 4.9348854068831536e-06, + "loss": 0.7121, + "step": 1585 + }, + { + "epoch": 0.4642857142857143, + "grad_norm": 1.0257079601287842, + "learning_rate": 4.934798064263472e-06, + "loss": 0.6834, + "step": 1586 + }, + { + "epoch": 0.4645784543325527, + "grad_norm": 0.9826936721801758, + "learning_rate": 4.934710663877738e-06, + "loss": 0.7262, + "step": 1587 + }, + { + "epoch": 0.4648711943793911, + "grad_norm": 0.9932706356048584, + "learning_rate": 4.934623205728025e-06, + "loss": 0.6159, + "step": 1588 + }, + { + "epoch": 0.4651639344262295, + "grad_norm": 1.0249824523925781, + "learning_rate": 4.9345356898164094e-06, + "loss": 0.7298, + "step": 1589 + }, + { + "epoch": 0.46545667447306793, + "grad_norm": 1.0163424015045166, + "learning_rate": 4.934448116144967e-06, + "loss": 0.6975, + "step": 1590 + }, + { + "epoch": 0.4657494145199063, + "grad_norm": 1.0417466163635254, + "learning_rate": 4.934360484715776e-06, + "loss": 0.7085, + "step": 1591 + }, + { + "epoch": 0.46604215456674475, + "grad_norm": 1.0184879302978516, + "learning_rate": 4.9342727955309135e-06, + "loss": 0.701, + "step": 1592 + }, + { + "epoch": 0.46633489461358313, + "grad_norm": 0.9675158858299255, + "learning_rate": 4.934185048592462e-06, + "loss": 0.6323, + "step": 1593 + }, + { + "epoch": 0.46662763466042156, + "grad_norm": 0.9923155307769775, + "learning_rate": 4.934097243902503e-06, + "loss": 0.7286, + "step": 1594 + }, + { + "epoch": 0.46692037470725994, + "grad_norm": 1.0460234880447388, + "learning_rate": 4.934009381463118e-06, + "loss": 0.6809, + "step": 1595 + }, + { + "epoch": 0.4672131147540984, + "grad_norm": 1.018006682395935, + "learning_rate": 4.933921461276394e-06, + "loss": 0.7167, + "step": 1596 + }, + { + "epoch": 0.46750585480093676, + "grad_norm": 1.0285162925720215, + "learning_rate": 4.933833483344415e-06, + "loss": 0.6995, + "step": 1597 + }, + { + "epoch": 0.4677985948477752, + "grad_norm": 1.023771047592163, + "learning_rate": 4.93374544766927e-06, + "loss": 0.7042, + "step": 1598 + }, + { + "epoch": 0.4680913348946136, + "grad_norm": 0.9525641798973083, + "learning_rate": 4.933657354253045e-06, + "loss": 0.6765, + "step": 1599 + }, + { + "epoch": 0.468384074941452, + "grad_norm": 0.962576687335968, + "learning_rate": 4.933569203097833e-06, + "loss": 0.6912, + "step": 1600 + }, + { + "epoch": 0.4686768149882904, + "grad_norm": 0.9500767588615417, + "learning_rate": 4.933480994205724e-06, + "loss": 0.6614, + "step": 1601 + }, + { + "epoch": 0.4689695550351288, + "grad_norm": 1.0513293743133545, + "learning_rate": 4.933392727578809e-06, + "loss": 0.6993, + "step": 1602 + }, + { + "epoch": 0.4692622950819672, + "grad_norm": 1.023654580116272, + "learning_rate": 4.9333044032191865e-06, + "loss": 0.7276, + "step": 1603 + }, + { + "epoch": 0.46955503512880564, + "grad_norm": 1.016361117362976, + "learning_rate": 4.933216021128949e-06, + "loss": 0.6795, + "step": 1604 + }, + { + "epoch": 0.469847775175644, + "grad_norm": 0.9931377172470093, + "learning_rate": 4.933127581310193e-06, + "loss": 0.7203, + "step": 1605 + }, + { + "epoch": 0.47014051522248246, + "grad_norm": 1.0300989151000977, + "learning_rate": 4.933039083765019e-06, + "loss": 0.7062, + "step": 1606 + }, + { + "epoch": 0.47043325526932084, + "grad_norm": 0.9942249655723572, + "learning_rate": 4.932950528495525e-06, + "loss": 0.6828, + "step": 1607 + }, + { + "epoch": 0.4707259953161593, + "grad_norm": 1.0218541622161865, + "learning_rate": 4.932861915503811e-06, + "loss": 0.684, + "step": 1608 + }, + { + "epoch": 0.47101873536299765, + "grad_norm": 0.9693766832351685, + "learning_rate": 4.932773244791982e-06, + "loss": 0.6536, + "step": 1609 + }, + { + "epoch": 0.4713114754098361, + "grad_norm": 1.0432535409927368, + "learning_rate": 4.932684516362139e-06, + "loss": 0.7293, + "step": 1610 + }, + { + "epoch": 0.47160421545667447, + "grad_norm": 0.9919100403785706, + "learning_rate": 4.932595730216389e-06, + "loss": 0.6512, + "step": 1611 + }, + { + "epoch": 0.4718969555035129, + "grad_norm": 0.9999998807907104, + "learning_rate": 4.932506886356839e-06, + "loss": 0.7141, + "step": 1612 + }, + { + "epoch": 0.4721896955503513, + "grad_norm": 1.0512914657592773, + "learning_rate": 4.9324179847855955e-06, + "loss": 0.7062, + "step": 1613 + }, + { + "epoch": 0.4724824355971897, + "grad_norm": 1.0060722827911377, + "learning_rate": 4.932329025504767e-06, + "loss": 0.7019, + "step": 1614 + }, + { + "epoch": 0.4727751756440281, + "grad_norm": 0.9997569918632507, + "learning_rate": 4.932240008516466e-06, + "loss": 0.7534, + "step": 1615 + }, + { + "epoch": 0.47306791569086654, + "grad_norm": 1.0036380290985107, + "learning_rate": 4.932150933822803e-06, + "loss": 0.6412, + "step": 1616 + }, + { + "epoch": 0.4733606557377049, + "grad_norm": 1.025288462638855, + "learning_rate": 4.932061801425893e-06, + "loss": 0.7121, + "step": 1617 + }, + { + "epoch": 0.47365339578454335, + "grad_norm": 1.089234709739685, + "learning_rate": 4.931972611327848e-06, + "loss": 0.7615, + "step": 1618 + }, + { + "epoch": 0.47394613583138173, + "grad_norm": 0.9486044645309448, + "learning_rate": 4.9318833635307865e-06, + "loss": 0.6576, + "step": 1619 + }, + { + "epoch": 0.47423887587822017, + "grad_norm": 1.0110373497009277, + "learning_rate": 4.931794058036824e-06, + "loss": 0.7031, + "step": 1620 + }, + { + "epoch": 0.47453161592505855, + "grad_norm": 1.0531044006347656, + "learning_rate": 4.931704694848081e-06, + "loss": 0.7018, + "step": 1621 + }, + { + "epoch": 0.474824355971897, + "grad_norm": 0.9968502521514893, + "learning_rate": 4.9316152739666765e-06, + "loss": 0.6782, + "step": 1622 + }, + { + "epoch": 0.47511709601873536, + "grad_norm": 1.0105788707733154, + "learning_rate": 4.931525795394732e-06, + "loss": 0.6673, + "step": 1623 + }, + { + "epoch": 0.47540983606557374, + "grad_norm": 1.0336084365844727, + "learning_rate": 4.931436259134372e-06, + "loss": 0.7226, + "step": 1624 + }, + { + "epoch": 0.4757025761124122, + "grad_norm": 1.0822569131851196, + "learning_rate": 4.931346665187719e-06, + "loss": 0.7338, + "step": 1625 + }, + { + "epoch": 0.47599531615925056, + "grad_norm": 1.0120885372161865, + "learning_rate": 4.931257013556899e-06, + "loss": 0.6278, + "step": 1626 + }, + { + "epoch": 0.476288056206089, + "grad_norm": 1.0359200239181519, + "learning_rate": 4.931167304244041e-06, + "loss": 0.6998, + "step": 1627 + }, + { + "epoch": 0.4765807962529274, + "grad_norm": 0.9902055859565735, + "learning_rate": 4.93107753725127e-06, + "loss": 0.6633, + "step": 1628 + }, + { + "epoch": 0.4768735362997658, + "grad_norm": 1.0071651935577393, + "learning_rate": 4.930987712580717e-06, + "loss": 0.662, + "step": 1629 + }, + { + "epoch": 0.4771662763466042, + "grad_norm": 0.9890103340148926, + "learning_rate": 4.930897830234514e-06, + "loss": 0.6526, + "step": 1630 + }, + { + "epoch": 0.4774590163934426, + "grad_norm": 1.0118926763534546, + "learning_rate": 4.930807890214794e-06, + "loss": 0.6808, + "step": 1631 + }, + { + "epoch": 0.477751756440281, + "grad_norm": 1.013959288597107, + "learning_rate": 4.9307178925236885e-06, + "loss": 0.6936, + "step": 1632 + }, + { + "epoch": 0.47804449648711944, + "grad_norm": 0.9849639534950256, + "learning_rate": 4.930627837163334e-06, + "loss": 0.7063, + "step": 1633 + }, + { + "epoch": 0.4783372365339578, + "grad_norm": 1.0201255083084106, + "learning_rate": 4.930537724135868e-06, + "loss": 0.6919, + "step": 1634 + }, + { + "epoch": 0.47862997658079626, + "grad_norm": 1.0201516151428223, + "learning_rate": 4.930447553443427e-06, + "loss": 0.693, + "step": 1635 + }, + { + "epoch": 0.47892271662763464, + "grad_norm": 1.0273560285568237, + "learning_rate": 4.930357325088151e-06, + "loss": 0.68, + "step": 1636 + }, + { + "epoch": 0.4792154566744731, + "grad_norm": 1.0146743059158325, + "learning_rate": 4.93026703907218e-06, + "loss": 0.6972, + "step": 1637 + }, + { + "epoch": 0.47950819672131145, + "grad_norm": 1.0288480520248413, + "learning_rate": 4.930176695397657e-06, + "loss": 0.7044, + "step": 1638 + }, + { + "epoch": 0.4798009367681499, + "grad_norm": 0.9934495091438293, + "learning_rate": 4.930086294066725e-06, + "loss": 0.666, + "step": 1639 + }, + { + "epoch": 0.48009367681498827, + "grad_norm": 1.0443718433380127, + "learning_rate": 4.929995835081529e-06, + "loss": 0.7192, + "step": 1640 + }, + { + "epoch": 0.4803864168618267, + "grad_norm": 0.9611266255378723, + "learning_rate": 4.929905318444215e-06, + "loss": 0.6768, + "step": 1641 + }, + { + "epoch": 0.4806791569086651, + "grad_norm": 1.0527710914611816, + "learning_rate": 4.92981474415693e-06, + "loss": 0.7184, + "step": 1642 + }, + { + "epoch": 0.4809718969555035, + "grad_norm": 1.0840237140655518, + "learning_rate": 4.929724112221824e-06, + "loss": 0.6903, + "step": 1643 + }, + { + "epoch": 0.4812646370023419, + "grad_norm": 1.013275384902954, + "learning_rate": 4.929633422641046e-06, + "loss": 0.664, + "step": 1644 + }, + { + "epoch": 0.48155737704918034, + "grad_norm": 1.0002176761627197, + "learning_rate": 4.9295426754167486e-06, + "loss": 0.655, + "step": 1645 + }, + { + "epoch": 0.4818501170960187, + "grad_norm": 0.9724012613296509, + "learning_rate": 4.929451870551085e-06, + "loss": 0.6801, + "step": 1646 + }, + { + "epoch": 0.48214285714285715, + "grad_norm": 1.0075947046279907, + "learning_rate": 4.929361008046209e-06, + "loss": 0.6716, + "step": 1647 + }, + { + "epoch": 0.48243559718969553, + "grad_norm": 1.043563961982727, + "learning_rate": 4.9292700879042746e-06, + "loss": 0.6741, + "step": 1648 + }, + { + "epoch": 0.48272833723653397, + "grad_norm": 0.9482227563858032, + "learning_rate": 4.929179110127442e-06, + "loss": 0.6291, + "step": 1649 + }, + { + "epoch": 0.48302107728337235, + "grad_norm": 0.9976707100868225, + "learning_rate": 4.9290880747178686e-06, + "loss": 0.7038, + "step": 1650 + }, + { + "epoch": 0.4833138173302108, + "grad_norm": 1.053668737411499, + "learning_rate": 4.9289969816777124e-06, + "loss": 0.7106, + "step": 1651 + }, + { + "epoch": 0.48360655737704916, + "grad_norm": 1.0003395080566406, + "learning_rate": 4.928905831009138e-06, + "loss": 0.6957, + "step": 1652 + }, + { + "epoch": 0.4838992974238876, + "grad_norm": 1.0083470344543457, + "learning_rate": 4.9288146227143055e-06, + "loss": 0.7282, + "step": 1653 + }, + { + "epoch": 0.484192037470726, + "grad_norm": 0.9875685572624207, + "learning_rate": 4.928723356795379e-06, + "loss": 0.7212, + "step": 1654 + }, + { + "epoch": 0.4844847775175644, + "grad_norm": 1.0869332551956177, + "learning_rate": 4.928632033254524e-06, + "loss": 0.6998, + "step": 1655 + }, + { + "epoch": 0.4847775175644028, + "grad_norm": 0.9819524884223938, + "learning_rate": 4.928540652093908e-06, + "loss": 0.6727, + "step": 1656 + }, + { + "epoch": 0.48507025761124123, + "grad_norm": 1.03109610080719, + "learning_rate": 4.928449213315699e-06, + "loss": 0.6979, + "step": 1657 + }, + { + "epoch": 0.4853629976580796, + "grad_norm": 1.0214091539382935, + "learning_rate": 4.928357716922065e-06, + "loss": 0.6445, + "step": 1658 + }, + { + "epoch": 0.48565573770491804, + "grad_norm": 1.0559478998184204, + "learning_rate": 4.928266162915178e-06, + "loss": 0.7151, + "step": 1659 + }, + { + "epoch": 0.4859484777517564, + "grad_norm": 1.0191131830215454, + "learning_rate": 4.9281745512972105e-06, + "loss": 0.6625, + "step": 1660 + }, + { + "epoch": 0.48624121779859486, + "grad_norm": 1.0035191774368286, + "learning_rate": 4.928082882070334e-06, + "loss": 0.7033, + "step": 1661 + }, + { + "epoch": 0.48653395784543324, + "grad_norm": 1.0474588871002197, + "learning_rate": 4.927991155236725e-06, + "loss": 0.669, + "step": 1662 + }, + { + "epoch": 0.4868266978922717, + "grad_norm": 1.0723110437393188, + "learning_rate": 4.927899370798559e-06, + "loss": 0.7226, + "step": 1663 + }, + { + "epoch": 0.48711943793911006, + "grad_norm": 1.0155754089355469, + "learning_rate": 4.927807528758015e-06, + "loss": 0.7036, + "step": 1664 + }, + { + "epoch": 0.4874121779859485, + "grad_norm": 1.0127094984054565, + "learning_rate": 4.92771562911727e-06, + "loss": 0.6902, + "step": 1665 + }, + { + "epoch": 0.48770491803278687, + "grad_norm": 0.9810916185379028, + "learning_rate": 4.927623671878505e-06, + "loss": 0.6644, + "step": 1666 + }, + { + "epoch": 0.4879976580796253, + "grad_norm": 1.0194673538208008, + "learning_rate": 4.927531657043903e-06, + "loss": 0.6904, + "step": 1667 + }, + { + "epoch": 0.4882903981264637, + "grad_norm": 0.9658402800559998, + "learning_rate": 4.927439584615646e-06, + "loss": 0.6686, + "step": 1668 + }, + { + "epoch": 0.4885831381733021, + "grad_norm": 1.003627896308899, + "learning_rate": 4.927347454595918e-06, + "loss": 0.7066, + "step": 1669 + }, + { + "epoch": 0.4888758782201405, + "grad_norm": 0.9580634832382202, + "learning_rate": 4.927255266986906e-06, + "loss": 0.6446, + "step": 1670 + }, + { + "epoch": 0.48916861826697894, + "grad_norm": 1.0631824731826782, + "learning_rate": 4.9271630217907955e-06, + "loss": 0.6837, + "step": 1671 + }, + { + "epoch": 0.4894613583138173, + "grad_norm": 1.0464458465576172, + "learning_rate": 4.9270707190097764e-06, + "loss": 0.693, + "step": 1672 + }, + { + "epoch": 0.48975409836065575, + "grad_norm": 1.1209322214126587, + "learning_rate": 4.926978358646039e-06, + "loss": 0.7327, + "step": 1673 + }, + { + "epoch": 0.49004683840749413, + "grad_norm": 0.9891908168792725, + "learning_rate": 4.926885940701772e-06, + "loss": 0.7137, + "step": 1674 + }, + { + "epoch": 0.49033957845433257, + "grad_norm": 0.9350649118423462, + "learning_rate": 4.926793465179171e-06, + "loss": 0.6224, + "step": 1675 + }, + { + "epoch": 0.49063231850117095, + "grad_norm": 1.076486587524414, + "learning_rate": 4.9267009320804285e-06, + "loss": 0.6643, + "step": 1676 + }, + { + "epoch": 0.4909250585480094, + "grad_norm": 0.9870668649673462, + "learning_rate": 4.9266083414077405e-06, + "loss": 0.6655, + "step": 1677 + }, + { + "epoch": 0.49121779859484777, + "grad_norm": 0.9744249582290649, + "learning_rate": 4.926515693163303e-06, + "loss": 0.6633, + "step": 1678 + }, + { + "epoch": 0.4915105386416862, + "grad_norm": 1.0023761987686157, + "learning_rate": 4.926422987349314e-06, + "loss": 0.7136, + "step": 1679 + }, + { + "epoch": 0.4918032786885246, + "grad_norm": 0.9782978892326355, + "learning_rate": 4.926330223967975e-06, + "loss": 0.6698, + "step": 1680 + }, + { + "epoch": 0.492096018735363, + "grad_norm": 0.9790882468223572, + "learning_rate": 4.926237403021484e-06, + "loss": 0.675, + "step": 1681 + }, + { + "epoch": 0.4923887587822014, + "grad_norm": 1.0051923990249634, + "learning_rate": 4.926144524512045e-06, + "loss": 0.7175, + "step": 1682 + }, + { + "epoch": 0.49268149882903983, + "grad_norm": 0.9809061288833618, + "learning_rate": 4.926051588441861e-06, + "loss": 0.6965, + "step": 1683 + }, + { + "epoch": 0.4929742388758782, + "grad_norm": 0.9999088048934937, + "learning_rate": 4.9259585948131375e-06, + "loss": 0.6505, + "step": 1684 + }, + { + "epoch": 0.49326697892271665, + "grad_norm": 1.0039489269256592, + "learning_rate": 4.925865543628079e-06, + "loss": 0.6647, + "step": 1685 + }, + { + "epoch": 0.49355971896955503, + "grad_norm": 1.0456119775772095, + "learning_rate": 4.925772434888895e-06, + "loss": 0.6591, + "step": 1686 + }, + { + "epoch": 0.49385245901639346, + "grad_norm": 1.0395344495773315, + "learning_rate": 4.925679268597794e-06, + "loss": 0.7231, + "step": 1687 + }, + { + "epoch": 0.49414519906323184, + "grad_norm": 1.0328888893127441, + "learning_rate": 4.925586044756986e-06, + "loss": 0.6512, + "step": 1688 + }, + { + "epoch": 0.4944379391100703, + "grad_norm": 1.1648180484771729, + "learning_rate": 4.925492763368684e-06, + "loss": 0.6801, + "step": 1689 + }, + { + "epoch": 0.49473067915690866, + "grad_norm": 1.1069332361221313, + "learning_rate": 4.9253994244350996e-06, + "loss": 0.6708, + "step": 1690 + }, + { + "epoch": 0.4950234192037471, + "grad_norm": 1.0336140394210815, + "learning_rate": 4.9253060279584475e-06, + "loss": 0.6959, + "step": 1691 + }, + { + "epoch": 0.4953161592505855, + "grad_norm": 0.992352306842804, + "learning_rate": 4.925212573940945e-06, + "loss": 0.6433, + "step": 1692 + }, + { + "epoch": 0.4956088992974239, + "grad_norm": 1.0218180418014526, + "learning_rate": 4.9251190623848075e-06, + "loss": 0.6867, + "step": 1693 + }, + { + "epoch": 0.4959016393442623, + "grad_norm": 1.0174202919006348, + "learning_rate": 4.925025493292254e-06, + "loss": 0.6743, + "step": 1694 + }, + { + "epoch": 0.4961943793911007, + "grad_norm": 1.0067107677459717, + "learning_rate": 4.9249318666655056e-06, + "loss": 0.6928, + "step": 1695 + }, + { + "epoch": 0.4964871194379391, + "grad_norm": 1.0289318561553955, + "learning_rate": 4.924838182506781e-06, + "loss": 0.6673, + "step": 1696 + }, + { + "epoch": 0.49677985948477754, + "grad_norm": 1.014535665512085, + "learning_rate": 4.9247444408183066e-06, + "loss": 0.7082, + "step": 1697 + }, + { + "epoch": 0.4970725995316159, + "grad_norm": 1.056251049041748, + "learning_rate": 4.924650641602304e-06, + "loss": 0.6728, + "step": 1698 + }, + { + "epoch": 0.49736533957845436, + "grad_norm": 1.009797215461731, + "learning_rate": 4.924556784860999e-06, + "loss": 0.7011, + "step": 1699 + }, + { + "epoch": 0.49765807962529274, + "grad_norm": 0.9793830513954163, + "learning_rate": 4.924462870596619e-06, + "loss": 0.6915, + "step": 1700 + }, + { + "epoch": 0.4979508196721312, + "grad_norm": 1.0169779062271118, + "learning_rate": 4.924368898811391e-06, + "loss": 0.7406, + "step": 1701 + }, + { + "epoch": 0.49824355971896955, + "grad_norm": 1.031825304031372, + "learning_rate": 4.9242748695075445e-06, + "loss": 0.6579, + "step": 1702 + }, + { + "epoch": 0.498536299765808, + "grad_norm": 0.9976780414581299, + "learning_rate": 4.924180782687312e-06, + "loss": 0.7096, + "step": 1703 + }, + { + "epoch": 0.49882903981264637, + "grad_norm": 1.075486421585083, + "learning_rate": 4.924086638352924e-06, + "loss": 0.7291, + "step": 1704 + }, + { + "epoch": 0.4991217798594848, + "grad_norm": 0.9969227910041809, + "learning_rate": 4.923992436506615e-06, + "loss": 0.6645, + "step": 1705 + }, + { + "epoch": 0.4994145199063232, + "grad_norm": 0.9652342796325684, + "learning_rate": 4.92389817715062e-06, + "loss": 0.6845, + "step": 1706 + }, + { + "epoch": 0.4997072599531616, + "grad_norm": 1.073412299156189, + "learning_rate": 4.923803860287175e-06, + "loss": 0.7136, + "step": 1707 + }, + { + "epoch": 0.5, + "grad_norm": 1.0346430540084839, + "learning_rate": 4.923709485918518e-06, + "loss": 0.666, + "step": 1708 + }, + { + "epoch": 0.5002927400468384, + "grad_norm": 1.0767618417739868, + "learning_rate": 4.923615054046887e-06, + "loss": 0.698, + "step": 1709 + }, + { + "epoch": 0.5005854800936768, + "grad_norm": 0.9610265493392944, + "learning_rate": 4.9235205646745245e-06, + "loss": 0.6629, + "step": 1710 + }, + { + "epoch": 0.5008782201405152, + "grad_norm": 1.0610243082046509, + "learning_rate": 4.92342601780367e-06, + "loss": 0.7016, + "step": 1711 + }, + { + "epoch": 0.5011709601873536, + "grad_norm": 1.0242431163787842, + "learning_rate": 4.923331413436567e-06, + "loss": 0.7082, + "step": 1712 + }, + { + "epoch": 0.5014637002341921, + "grad_norm": 1.0031076669692993, + "learning_rate": 4.923236751575462e-06, + "loss": 0.6782, + "step": 1713 + }, + { + "epoch": 0.5017564402810304, + "grad_norm": 0.9722186326980591, + "learning_rate": 4.923142032222599e-06, + "loss": 0.682, + "step": 1714 + }, + { + "epoch": 0.5020491803278688, + "grad_norm": 1.0470008850097656, + "learning_rate": 4.923047255380226e-06, + "loss": 0.7084, + "step": 1715 + }, + { + "epoch": 0.5023419203747073, + "grad_norm": 1.0557764768600464, + "learning_rate": 4.922952421050591e-06, + "loss": 0.6563, + "step": 1716 + }, + { + "epoch": 0.5026346604215457, + "grad_norm": 1.0708643198013306, + "learning_rate": 4.922857529235944e-06, + "loss": 0.6943, + "step": 1717 + }, + { + "epoch": 0.502927400468384, + "grad_norm": 0.9730634689331055, + "learning_rate": 4.922762579938537e-06, + "loss": 0.685, + "step": 1718 + }, + { + "epoch": 0.5032201405152225, + "grad_norm": 1.02862548828125, + "learning_rate": 4.922667573160622e-06, + "loss": 0.7384, + "step": 1719 + }, + { + "epoch": 0.5035128805620609, + "grad_norm": 1.1486345529556274, + "learning_rate": 4.922572508904454e-06, + "loss": 0.7244, + "step": 1720 + }, + { + "epoch": 0.5038056206088993, + "grad_norm": 1.0733104944229126, + "learning_rate": 4.9224773871722875e-06, + "loss": 0.7161, + "step": 1721 + }, + { + "epoch": 0.5040983606557377, + "grad_norm": 1.0065982341766357, + "learning_rate": 4.92238220796638e-06, + "loss": 0.6838, + "step": 1722 + }, + { + "epoch": 0.5043911007025761, + "grad_norm": 0.9616788625717163, + "learning_rate": 4.922286971288989e-06, + "loss": 0.6944, + "step": 1723 + }, + { + "epoch": 0.5046838407494145, + "grad_norm": 0.99040287733078, + "learning_rate": 4.922191677142374e-06, + "loss": 0.6832, + "step": 1724 + }, + { + "epoch": 0.504976580796253, + "grad_norm": 0.9533112645149231, + "learning_rate": 4.922096325528796e-06, + "loss": 0.6501, + "step": 1725 + }, + { + "epoch": 0.5052693208430913, + "grad_norm": 1.0018318891525269, + "learning_rate": 4.922000916450517e-06, + "loss": 0.7089, + "step": 1726 + }, + { + "epoch": 0.5055620608899297, + "grad_norm": 1.004063606262207, + "learning_rate": 4.9219054499098015e-06, + "loss": 0.6921, + "step": 1727 + }, + { + "epoch": 0.5058548009367682, + "grad_norm": 1.0972379446029663, + "learning_rate": 4.921809925908914e-06, + "loss": 0.6784, + "step": 1728 + }, + { + "epoch": 0.5061475409836066, + "grad_norm": 1.0045503377914429, + "learning_rate": 4.92171434445012e-06, + "loss": 0.7007, + "step": 1729 + }, + { + "epoch": 0.5064402810304449, + "grad_norm": 1.0523390769958496, + "learning_rate": 4.921618705535689e-06, + "loss": 0.6593, + "step": 1730 + }, + { + "epoch": 0.5067330210772834, + "grad_norm": 1.0464568138122559, + "learning_rate": 4.921523009167888e-06, + "loss": 0.7007, + "step": 1731 + }, + { + "epoch": 0.5070257611241218, + "grad_norm": 0.9746640920639038, + "learning_rate": 4.92142725534899e-06, + "loss": 0.695, + "step": 1732 + }, + { + "epoch": 0.5073185011709602, + "grad_norm": 1.0180875062942505, + "learning_rate": 4.921331444081263e-06, + "loss": 0.7061, + "step": 1733 + }, + { + "epoch": 0.5076112412177985, + "grad_norm": 0.992331326007843, + "learning_rate": 4.921235575366984e-06, + "loss": 0.6797, + "step": 1734 + }, + { + "epoch": 0.507903981264637, + "grad_norm": 0.9861599206924438, + "learning_rate": 4.921139649208425e-06, + "loss": 0.664, + "step": 1735 + }, + { + "epoch": 0.5081967213114754, + "grad_norm": 0.9805043935775757, + "learning_rate": 4.9210436656078625e-06, + "loss": 0.7107, + "step": 1736 + }, + { + "epoch": 0.5084894613583139, + "grad_norm": 0.9678063988685608, + "learning_rate": 4.9209476245675745e-06, + "loss": 0.6903, + "step": 1737 + }, + { + "epoch": 0.5087822014051522, + "grad_norm": 1.0074952840805054, + "learning_rate": 4.920851526089838e-06, + "loss": 0.7109, + "step": 1738 + }, + { + "epoch": 0.5090749414519906, + "grad_norm": 1.069000244140625, + "learning_rate": 4.920755370176935e-06, + "loss": 0.7103, + "step": 1739 + }, + { + "epoch": 0.509367681498829, + "grad_norm": 1.006404161453247, + "learning_rate": 4.920659156831145e-06, + "loss": 0.6645, + "step": 1740 + }, + { + "epoch": 0.5096604215456675, + "grad_norm": 1.0848084688186646, + "learning_rate": 4.9205628860547516e-06, + "loss": 0.7231, + "step": 1741 + }, + { + "epoch": 0.5099531615925058, + "grad_norm": 0.9718915820121765, + "learning_rate": 4.9204665578500385e-06, + "loss": 0.6528, + "step": 1742 + }, + { + "epoch": 0.5102459016393442, + "grad_norm": 1.0646721124649048, + "learning_rate": 4.920370172219292e-06, + "loss": 0.6622, + "step": 1743 + }, + { + "epoch": 0.5105386416861827, + "grad_norm": 1.0056434869766235, + "learning_rate": 4.920273729164799e-06, + "loss": 0.7249, + "step": 1744 + }, + { + "epoch": 0.5108313817330211, + "grad_norm": 1.0041409730911255, + "learning_rate": 4.920177228688845e-06, + "loss": 0.6841, + "step": 1745 + }, + { + "epoch": 0.5111241217798594, + "grad_norm": 1.0198118686676025, + "learning_rate": 4.920080670793722e-06, + "loss": 0.6907, + "step": 1746 + }, + { + "epoch": 0.5114168618266979, + "grad_norm": 1.060612440109253, + "learning_rate": 4.919984055481719e-06, + "loss": 0.6535, + "step": 1747 + }, + { + "epoch": 0.5117096018735363, + "grad_norm": 1.0267560482025146, + "learning_rate": 4.919887382755131e-06, + "loss": 0.6807, + "step": 1748 + }, + { + "epoch": 0.5120023419203747, + "grad_norm": 1.0354574918746948, + "learning_rate": 4.91979065261625e-06, + "loss": 0.6795, + "step": 1749 + }, + { + "epoch": 0.5122950819672131, + "grad_norm": 0.979286789894104, + "learning_rate": 4.919693865067369e-06, + "loss": 0.6696, + "step": 1750 + }, + { + "epoch": 0.5125878220140515, + "grad_norm": 1.006359338760376, + "learning_rate": 4.919597020110788e-06, + "loss": 0.7092, + "step": 1751 + }, + { + "epoch": 0.5128805620608899, + "grad_norm": 1.0431020259857178, + "learning_rate": 4.919500117748802e-06, + "loss": 0.6859, + "step": 1752 + }, + { + "epoch": 0.5131733021077284, + "grad_norm": 0.9722080826759338, + "learning_rate": 4.919403157983711e-06, + "loss": 0.6917, + "step": 1753 + }, + { + "epoch": 0.5134660421545667, + "grad_norm": 0.9672486782073975, + "learning_rate": 4.919306140817816e-06, + "loss": 0.6604, + "step": 1754 + }, + { + "epoch": 0.5137587822014051, + "grad_norm": 0.9700562357902527, + "learning_rate": 4.919209066253416e-06, + "loss": 0.663, + "step": 1755 + }, + { + "epoch": 0.5140515222482436, + "grad_norm": 1.0520321130752563, + "learning_rate": 4.919111934292819e-06, + "loss": 0.7093, + "step": 1756 + }, + { + "epoch": 0.514344262295082, + "grad_norm": 0.9865478277206421, + "learning_rate": 4.919014744938324e-06, + "loss": 0.6748, + "step": 1757 + }, + { + "epoch": 0.5146370023419203, + "grad_norm": 0.9970810413360596, + "learning_rate": 4.91891749819224e-06, + "loss": 0.6765, + "step": 1758 + }, + { + "epoch": 0.5149297423887588, + "grad_norm": 1.0557605028152466, + "learning_rate": 4.9188201940568745e-06, + "loss": 0.6891, + "step": 1759 + }, + { + "epoch": 0.5152224824355972, + "grad_norm": 0.986372172832489, + "learning_rate": 4.918722832534535e-06, + "loss": 0.6827, + "step": 1760 + }, + { + "epoch": 0.5155152224824356, + "grad_norm": 1.0314772129058838, + "learning_rate": 4.9186254136275315e-06, + "loss": 0.6674, + "step": 1761 + }, + { + "epoch": 0.515807962529274, + "grad_norm": 1.0132333040237427, + "learning_rate": 4.918527937338176e-06, + "loss": 0.6403, + "step": 1762 + }, + { + "epoch": 0.5161007025761124, + "grad_norm": 1.0450232028961182, + "learning_rate": 4.918430403668779e-06, + "loss": 0.6958, + "step": 1763 + }, + { + "epoch": 0.5163934426229508, + "grad_norm": 1.0083640813827515, + "learning_rate": 4.918332812621657e-06, + "loss": 0.6801, + "step": 1764 + }, + { + "epoch": 0.5166861826697893, + "grad_norm": 0.9617442488670349, + "learning_rate": 4.918235164199125e-06, + "loss": 0.6885, + "step": 1765 + }, + { + "epoch": 0.5169789227166276, + "grad_norm": 0.9502189755439758, + "learning_rate": 4.918137458403498e-06, + "loss": 0.6544, + "step": 1766 + }, + { + "epoch": 0.517271662763466, + "grad_norm": 0.9865788817405701, + "learning_rate": 4.918039695237095e-06, + "loss": 0.6757, + "step": 1767 + }, + { + "epoch": 0.5175644028103045, + "grad_norm": 1.0278438329696655, + "learning_rate": 4.917941874702237e-06, + "loss": 0.6889, + "step": 1768 + }, + { + "epoch": 0.5178571428571429, + "grad_norm": 0.9685450196266174, + "learning_rate": 4.917843996801243e-06, + "loss": 0.6365, + "step": 1769 + }, + { + "epoch": 0.5181498829039812, + "grad_norm": 1.0558158159255981, + "learning_rate": 4.917746061536435e-06, + "loss": 0.6855, + "step": 1770 + }, + { + "epoch": 0.5184426229508197, + "grad_norm": 1.02521812915802, + "learning_rate": 4.917648068910138e-06, + "loss": 0.7365, + "step": 1771 + }, + { + "epoch": 0.5187353629976581, + "grad_norm": 1.002528190612793, + "learning_rate": 4.917550018924675e-06, + "loss": 0.6718, + "step": 1772 + }, + { + "epoch": 0.5190281030444965, + "grad_norm": 1.0132935047149658, + "learning_rate": 4.917451911582375e-06, + "loss": 0.7154, + "step": 1773 + }, + { + "epoch": 0.5193208430913349, + "grad_norm": 1.0852420330047607, + "learning_rate": 4.917353746885562e-06, + "loss": 0.7273, + "step": 1774 + }, + { + "epoch": 0.5196135831381733, + "grad_norm": 0.9978970289230347, + "learning_rate": 4.917255524836568e-06, + "loss": 0.6943, + "step": 1775 + }, + { + "epoch": 0.5199063231850117, + "grad_norm": 0.9833730459213257, + "learning_rate": 4.917157245437723e-06, + "loss": 0.6727, + "step": 1776 + }, + { + "epoch": 0.5201990632318502, + "grad_norm": 1.030129075050354, + "learning_rate": 4.917058908691357e-06, + "loss": 0.6926, + "step": 1777 + }, + { + "epoch": 0.5204918032786885, + "grad_norm": 0.9920133948326111, + "learning_rate": 4.916960514599804e-06, + "loss": 0.6587, + "step": 1778 + }, + { + "epoch": 0.5207845433255269, + "grad_norm": 1.062203049659729, + "learning_rate": 4.916862063165399e-06, + "loss": 0.6935, + "step": 1779 + }, + { + "epoch": 0.5210772833723654, + "grad_norm": 0.9840894937515259, + "learning_rate": 4.916763554390477e-06, + "loss": 0.6879, + "step": 1780 + }, + { + "epoch": 0.5213700234192038, + "grad_norm": 0.9604233503341675, + "learning_rate": 4.916664988277375e-06, + "loss": 0.6651, + "step": 1781 + }, + { + "epoch": 0.5216627634660421, + "grad_norm": 1.002029299736023, + "learning_rate": 4.916566364828432e-06, + "loss": 0.6714, + "step": 1782 + }, + { + "epoch": 0.5219555035128806, + "grad_norm": 0.9953715205192566, + "learning_rate": 4.916467684045988e-06, + "loss": 0.7152, + "step": 1783 + }, + { + "epoch": 0.522248243559719, + "grad_norm": 1.0449846982955933, + "learning_rate": 4.916368945932383e-06, + "loss": 0.6471, + "step": 1784 + }, + { + "epoch": 0.5225409836065574, + "grad_norm": 0.9990989565849304, + "learning_rate": 4.916270150489961e-06, + "loss": 0.6591, + "step": 1785 + }, + { + "epoch": 0.5228337236533958, + "grad_norm": 0.9939989447593689, + "learning_rate": 4.9161712977210664e-06, + "loss": 0.67, + "step": 1786 + }, + { + "epoch": 0.5231264637002342, + "grad_norm": 1.0171363353729248, + "learning_rate": 4.916072387628043e-06, + "loss": 0.7096, + "step": 1787 + }, + { + "epoch": 0.5234192037470726, + "grad_norm": 1.0060858726501465, + "learning_rate": 4.915973420213238e-06, + "loss": 0.6981, + "step": 1788 + }, + { + "epoch": 0.5237119437939111, + "grad_norm": 0.9748288989067078, + "learning_rate": 4.915874395478999e-06, + "loss": 0.6707, + "step": 1789 + }, + { + "epoch": 0.5240046838407494, + "grad_norm": 1.4380316734313965, + "learning_rate": 4.915775313427676e-06, + "loss": 0.7098, + "step": 1790 + }, + { + "epoch": 0.5242974238875878, + "grad_norm": 0.9562284350395203, + "learning_rate": 4.9156761740616195e-06, + "loss": 0.6989, + "step": 1791 + }, + { + "epoch": 0.5245901639344263, + "grad_norm": 0.9703788161277771, + "learning_rate": 4.915576977383182e-06, + "loss": 0.6761, + "step": 1792 + }, + { + "epoch": 0.5248829039812647, + "grad_norm": 1.073676586151123, + "learning_rate": 4.915477723394717e-06, + "loss": 0.6949, + "step": 1793 + }, + { + "epoch": 0.525175644028103, + "grad_norm": 0.9993125796318054, + "learning_rate": 4.915378412098578e-06, + "loss": 0.7321, + "step": 1794 + }, + { + "epoch": 0.5254683840749415, + "grad_norm": 1.0560977458953857, + "learning_rate": 4.915279043497123e-06, + "loss": 0.7065, + "step": 1795 + }, + { + "epoch": 0.5257611241217799, + "grad_norm": 0.9361927509307861, + "learning_rate": 4.915179617592708e-06, + "loss": 0.6725, + "step": 1796 + }, + { + "epoch": 0.5260538641686182, + "grad_norm": 1.0178052186965942, + "learning_rate": 4.915080134387693e-06, + "loss": 0.6878, + "step": 1797 + }, + { + "epoch": 0.5263466042154566, + "grad_norm": 0.9978435635566711, + "learning_rate": 4.914980593884438e-06, + "loss": 0.6189, + "step": 1798 + }, + { + "epoch": 0.5266393442622951, + "grad_norm": 1.0239073038101196, + "learning_rate": 4.914880996085304e-06, + "loss": 0.7172, + "step": 1799 + }, + { + "epoch": 0.5269320843091335, + "grad_norm": 0.9885980486869812, + "learning_rate": 4.9147813409926544e-06, + "loss": 0.6692, + "step": 1800 + }, + { + "epoch": 0.5272248243559718, + "grad_norm": 0.983012318611145, + "learning_rate": 4.914681628608854e-06, + "loss": 0.7027, + "step": 1801 + }, + { + "epoch": 0.5275175644028103, + "grad_norm": 1.0915889739990234, + "learning_rate": 4.914581858936267e-06, + "loss": 0.688, + "step": 1802 + }, + { + "epoch": 0.5278103044496487, + "grad_norm": 1.0113648176193237, + "learning_rate": 4.914482031977262e-06, + "loss": 0.6831, + "step": 1803 + }, + { + "epoch": 0.5281030444964872, + "grad_norm": 1.045946717262268, + "learning_rate": 4.914382147734208e-06, + "loss": 0.6919, + "step": 1804 + }, + { + "epoch": 0.5283957845433255, + "grad_norm": 1.0499942302703857, + "learning_rate": 4.914282206209473e-06, + "loss": 0.6872, + "step": 1805 + }, + { + "epoch": 0.5286885245901639, + "grad_norm": 0.9130474925041199, + "learning_rate": 4.914182207405428e-06, + "loss": 0.652, + "step": 1806 + }, + { + "epoch": 0.5289812646370023, + "grad_norm": 0.9873315095901489, + "learning_rate": 4.914082151324448e-06, + "loss": 0.6903, + "step": 1807 + }, + { + "epoch": 0.5292740046838408, + "grad_norm": 1.003938913345337, + "learning_rate": 4.913982037968903e-06, + "loss": 0.7127, + "step": 1808 + }, + { + "epoch": 0.5295667447306791, + "grad_norm": 0.9686459302902222, + "learning_rate": 4.913881867341171e-06, + "loss": 0.6829, + "step": 1809 + }, + { + "epoch": 0.5298594847775175, + "grad_norm": 1.0091627836227417, + "learning_rate": 4.9137816394436296e-06, + "loss": 0.6799, + "step": 1810 + }, + { + "epoch": 0.530152224824356, + "grad_norm": 0.922448456287384, + "learning_rate": 4.9136813542786535e-06, + "loss": 0.6636, + "step": 1811 + }, + { + "epoch": 0.5304449648711944, + "grad_norm": 1.011178970336914, + "learning_rate": 4.913581011848624e-06, + "loss": 0.6965, + "step": 1812 + }, + { + "epoch": 0.5307377049180327, + "grad_norm": 1.0156731605529785, + "learning_rate": 4.913480612155921e-06, + "loss": 0.7153, + "step": 1813 + }, + { + "epoch": 0.5310304449648712, + "grad_norm": 1.0201653242111206, + "learning_rate": 4.913380155202927e-06, + "loss": 0.6861, + "step": 1814 + }, + { + "epoch": 0.5313231850117096, + "grad_norm": 1.1089726686477661, + "learning_rate": 4.913279640992026e-06, + "loss": 0.7027, + "step": 1815 + }, + { + "epoch": 0.531615925058548, + "grad_norm": 1.0089256763458252, + "learning_rate": 4.913179069525601e-06, + "loss": 0.6515, + "step": 1816 + }, + { + "epoch": 0.5319086651053864, + "grad_norm": 0.9480703473091125, + "learning_rate": 4.913078440806038e-06, + "loss": 0.6503, + "step": 1817 + }, + { + "epoch": 0.5322014051522248, + "grad_norm": 0.967658281326294, + "learning_rate": 4.912977754835727e-06, + "loss": 0.6746, + "step": 1818 + }, + { + "epoch": 0.5324941451990632, + "grad_norm": 1.0081883668899536, + "learning_rate": 4.912877011617054e-06, + "loss": 0.7338, + "step": 1819 + }, + { + "epoch": 0.5327868852459017, + "grad_norm": 1.0400853157043457, + "learning_rate": 4.912776211152412e-06, + "loss": 0.6916, + "step": 1820 + }, + { + "epoch": 0.53307962529274, + "grad_norm": 1.0715502500534058, + "learning_rate": 4.9126753534441894e-06, + "loss": 0.675, + "step": 1821 + }, + { + "epoch": 0.5333723653395784, + "grad_norm": 0.9807583093643188, + "learning_rate": 4.912574438494781e-06, + "loss": 0.6823, + "step": 1822 + }, + { + "epoch": 0.5336651053864169, + "grad_norm": 1.0356254577636719, + "learning_rate": 4.91247346630658e-06, + "loss": 0.629, + "step": 1823 + }, + { + "epoch": 0.5339578454332553, + "grad_norm": 0.9810326099395752, + "learning_rate": 4.912372436881983e-06, + "loss": 0.6854, + "step": 1824 + }, + { + "epoch": 0.5342505854800936, + "grad_norm": 1.0205388069152832, + "learning_rate": 4.912271350223386e-06, + "loss": 0.6924, + "step": 1825 + }, + { + "epoch": 0.5345433255269321, + "grad_norm": 1.2321008443832397, + "learning_rate": 4.912170206333189e-06, + "loss": 0.711, + "step": 1826 + }, + { + "epoch": 0.5348360655737705, + "grad_norm": 1.0692940950393677, + "learning_rate": 4.9120690052137895e-06, + "loss": 0.6658, + "step": 1827 + }, + { + "epoch": 0.5351288056206089, + "grad_norm": 1.0512220859527588, + "learning_rate": 4.91196774686759e-06, + "loss": 0.6587, + "step": 1828 + }, + { + "epoch": 0.5354215456674473, + "grad_norm": 0.9877349138259888, + "learning_rate": 4.911866431296991e-06, + "loss": 0.6744, + "step": 1829 + }, + { + "epoch": 0.5357142857142857, + "grad_norm": 1.0272552967071533, + "learning_rate": 4.911765058504398e-06, + "loss": 0.6456, + "step": 1830 + }, + { + "epoch": 0.5360070257611241, + "grad_norm": 0.9859814047813416, + "learning_rate": 4.9116636284922165e-06, + "loss": 0.6915, + "step": 1831 + }, + { + "epoch": 0.5362997658079626, + "grad_norm": 0.9781588315963745, + "learning_rate": 4.911562141262852e-06, + "loss": 0.6732, + "step": 1832 + }, + { + "epoch": 0.5365925058548009, + "grad_norm": 0.9875129461288452, + "learning_rate": 4.911460596818712e-06, + "loss": 0.6831, + "step": 1833 + }, + { + "epoch": 0.5368852459016393, + "grad_norm": 1.0459705591201782, + "learning_rate": 4.911358995162206e-06, + "loss": 0.6782, + "step": 1834 + }, + { + "epoch": 0.5371779859484778, + "grad_norm": 1.0303936004638672, + "learning_rate": 4.911257336295744e-06, + "loss": 0.6614, + "step": 1835 + }, + { + "epoch": 0.5374707259953162, + "grad_norm": 1.001491665840149, + "learning_rate": 4.911155620221739e-06, + "loss": 0.7005, + "step": 1836 + }, + { + "epoch": 0.5377634660421545, + "grad_norm": 1.0013641119003296, + "learning_rate": 4.911053846942604e-06, + "loss": 0.6923, + "step": 1837 + }, + { + "epoch": 0.538056206088993, + "grad_norm": 1.1147783994674683, + "learning_rate": 4.910952016460753e-06, + "loss": 0.7239, + "step": 1838 + }, + { + "epoch": 0.5383489461358314, + "grad_norm": 1.086534023284912, + "learning_rate": 4.910850128778602e-06, + "loss": 0.6288, + "step": 1839 + }, + { + "epoch": 0.5386416861826698, + "grad_norm": 1.024134635925293, + "learning_rate": 4.910748183898569e-06, + "loss": 0.6764, + "step": 1840 + }, + { + "epoch": 0.5389344262295082, + "grad_norm": 1.1205769777297974, + "learning_rate": 4.910646181823072e-06, + "loss": 0.6916, + "step": 1841 + }, + { + "epoch": 0.5392271662763466, + "grad_norm": 1.0269639492034912, + "learning_rate": 4.910544122554532e-06, + "loss": 0.6734, + "step": 1842 + }, + { + "epoch": 0.539519906323185, + "grad_norm": 1.0153809785842896, + "learning_rate": 4.910442006095368e-06, + "loss": 0.702, + "step": 1843 + }, + { + "epoch": 0.5398126463700235, + "grad_norm": 1.0802863836288452, + "learning_rate": 4.910339832448004e-06, + "loss": 0.6833, + "step": 1844 + }, + { + "epoch": 0.5401053864168618, + "grad_norm": 1.0401196479797363, + "learning_rate": 4.910237601614866e-06, + "loss": 0.6977, + "step": 1845 + }, + { + "epoch": 0.5403981264637002, + "grad_norm": 1.059260606765747, + "learning_rate": 4.910135313598377e-06, + "loss": 0.726, + "step": 1846 + }, + { + "epoch": 0.5406908665105387, + "grad_norm": 0.9089834690093994, + "learning_rate": 4.910032968400965e-06, + "loss": 0.6897, + "step": 1847 + }, + { + "epoch": 0.5409836065573771, + "grad_norm": 0.9867436289787292, + "learning_rate": 4.909930566025058e-06, + "loss": 0.7007, + "step": 1848 + }, + { + "epoch": 0.5412763466042154, + "grad_norm": 0.9832221865653992, + "learning_rate": 4.909828106473085e-06, + "loss": 0.7259, + "step": 1849 + }, + { + "epoch": 0.5415690866510539, + "grad_norm": 0.978451669216156, + "learning_rate": 4.909725589747477e-06, + "loss": 0.7095, + "step": 1850 + }, + { + "epoch": 0.5418618266978923, + "grad_norm": 1.0657687187194824, + "learning_rate": 4.909623015850667e-06, + "loss": 0.7089, + "step": 1851 + }, + { + "epoch": 0.5421545667447307, + "grad_norm": 1.0185163021087646, + "learning_rate": 4.909520384785087e-06, + "loss": 0.6731, + "step": 1852 + }, + { + "epoch": 0.542447306791569, + "grad_norm": 1.023008108139038, + "learning_rate": 4.909417696553173e-06, + "loss": 0.6965, + "step": 1853 + }, + { + "epoch": 0.5427400468384075, + "grad_norm": 1.0388774871826172, + "learning_rate": 4.909314951157363e-06, + "loss": 0.6551, + "step": 1854 + }, + { + "epoch": 0.5430327868852459, + "grad_norm": 0.9903032183647156, + "learning_rate": 4.9092121486000915e-06, + "loss": 0.6717, + "step": 1855 + }, + { + "epoch": 0.5433255269320844, + "grad_norm": 1.245619297027588, + "learning_rate": 4.909109288883799e-06, + "loss": 0.6547, + "step": 1856 + }, + { + "epoch": 0.5436182669789227, + "grad_norm": 0.9980534315109253, + "learning_rate": 4.909006372010926e-06, + "loss": 0.726, + "step": 1857 + }, + { + "epoch": 0.5439110070257611, + "grad_norm": 1.0520246028900146, + "learning_rate": 4.908903397983914e-06, + "loss": 0.7232, + "step": 1858 + }, + { + "epoch": 0.5442037470725996, + "grad_norm": 0.9966354370117188, + "learning_rate": 4.908800366805207e-06, + "loss": 0.6864, + "step": 1859 + }, + { + "epoch": 0.544496487119438, + "grad_norm": 0.9819132089614868, + "learning_rate": 4.908697278477247e-06, + "loss": 0.6401, + "step": 1860 + }, + { + "epoch": 0.5447892271662763, + "grad_norm": 0.9660316109657288, + "learning_rate": 4.9085941330024815e-06, + "loss": 0.6492, + "step": 1861 + }, + { + "epoch": 0.5450819672131147, + "grad_norm": 0.9418030381202698, + "learning_rate": 4.908490930383357e-06, + "loss": 0.653, + "step": 1862 + }, + { + "epoch": 0.5453747072599532, + "grad_norm": 0.9727563858032227, + "learning_rate": 4.908387670622323e-06, + "loss": 0.6838, + "step": 1863 + }, + { + "epoch": 0.5456674473067916, + "grad_norm": 1.0098140239715576, + "learning_rate": 4.90828435372183e-06, + "loss": 0.6182, + "step": 1864 + }, + { + "epoch": 0.5459601873536299, + "grad_norm": 1.07843017578125, + "learning_rate": 4.908180979684327e-06, + "loss": 0.7013, + "step": 1865 + }, + { + "epoch": 0.5462529274004684, + "grad_norm": 1.042439579963684, + "learning_rate": 4.908077548512268e-06, + "loss": 0.6935, + "step": 1866 + }, + { + "epoch": 0.5465456674473068, + "grad_norm": 0.9925409555435181, + "learning_rate": 4.9079740602081065e-06, + "loss": 0.6612, + "step": 1867 + }, + { + "epoch": 0.5468384074941453, + "grad_norm": 0.95917809009552, + "learning_rate": 4.907870514774297e-06, + "loss": 0.6813, + "step": 1868 + }, + { + "epoch": 0.5471311475409836, + "grad_norm": 1.0, + "learning_rate": 4.907766912213297e-06, + "loss": 0.6962, + "step": 1869 + }, + { + "epoch": 0.547423887587822, + "grad_norm": 1.042900562286377, + "learning_rate": 4.907663252527565e-06, + "loss": 0.7072, + "step": 1870 + }, + { + "epoch": 0.5477166276346604, + "grad_norm": 1.0482467412948608, + "learning_rate": 4.907559535719559e-06, + "loss": 0.6785, + "step": 1871 + }, + { + "epoch": 0.5480093676814989, + "grad_norm": 0.9643098711967468, + "learning_rate": 4.907455761791742e-06, + "loss": 0.6477, + "step": 1872 + }, + { + "epoch": 0.5483021077283372, + "grad_norm": 0.9439372420310974, + "learning_rate": 4.907351930746574e-06, + "loss": 0.6777, + "step": 1873 + }, + { + "epoch": 0.5485948477751756, + "grad_norm": 1.038141131401062, + "learning_rate": 4.907248042586518e-06, + "loss": 0.6978, + "step": 1874 + }, + { + "epoch": 0.5488875878220141, + "grad_norm": 0.9963442087173462, + "learning_rate": 4.90714409731404e-06, + "loss": 0.6685, + "step": 1875 + }, + { + "epoch": 0.5491803278688525, + "grad_norm": 0.9889292120933533, + "learning_rate": 4.907040094931606e-06, + "loss": 0.6731, + "step": 1876 + }, + { + "epoch": 0.5494730679156908, + "grad_norm": 0.9725640416145325, + "learning_rate": 4.9069360354416836e-06, + "loss": 0.732, + "step": 1877 + }, + { + "epoch": 0.5497658079625293, + "grad_norm": 1.035670518875122, + "learning_rate": 4.906831918846741e-06, + "loss": 0.6883, + "step": 1878 + }, + { + "epoch": 0.5500585480093677, + "grad_norm": 1.0466508865356445, + "learning_rate": 4.906727745149249e-06, + "loss": 0.6616, + "step": 1879 + }, + { + "epoch": 0.550351288056206, + "grad_norm": 1.0342254638671875, + "learning_rate": 4.906623514351678e-06, + "loss": 0.6864, + "step": 1880 + }, + { + "epoch": 0.5506440281030445, + "grad_norm": 0.9948923587799072, + "learning_rate": 4.906519226456503e-06, + "loss": 0.7015, + "step": 1881 + }, + { + "epoch": 0.5509367681498829, + "grad_norm": 1.0734190940856934, + "learning_rate": 4.906414881466197e-06, + "loss": 0.6505, + "step": 1882 + }, + { + "epoch": 0.5512295081967213, + "grad_norm": 1.1260706186294556, + "learning_rate": 4.9063104793832345e-06, + "loss": 0.6756, + "step": 1883 + }, + { + "epoch": 0.5515222482435597, + "grad_norm": 0.999087929725647, + "learning_rate": 4.906206020210094e-06, + "loss": 0.6706, + "step": 1884 + }, + { + "epoch": 0.5518149882903981, + "grad_norm": 1.0405933856964111, + "learning_rate": 4.906101503949254e-06, + "loss": 0.6634, + "step": 1885 + }, + { + "epoch": 0.5521077283372365, + "grad_norm": 1.0203733444213867, + "learning_rate": 4.905996930603193e-06, + "loss": 0.7341, + "step": 1886 + }, + { + "epoch": 0.552400468384075, + "grad_norm": 1.035671353340149, + "learning_rate": 4.9058923001743926e-06, + "loss": 0.7057, + "step": 1887 + }, + { + "epoch": 0.5526932084309133, + "grad_norm": 0.9837383031845093, + "learning_rate": 4.905787612665335e-06, + "loss": 0.6534, + "step": 1888 + }, + { + "epoch": 0.5529859484777517, + "grad_norm": 1.0643848180770874, + "learning_rate": 4.905682868078504e-06, + "loss": 0.7086, + "step": 1889 + }, + { + "epoch": 0.5532786885245902, + "grad_norm": 0.9756625294685364, + "learning_rate": 4.905578066416385e-06, + "loss": 0.6732, + "step": 1890 + }, + { + "epoch": 0.5535714285714286, + "grad_norm": 1.0735028982162476, + "learning_rate": 4.905473207681465e-06, + "loss": 0.7031, + "step": 1891 + }, + { + "epoch": 0.5538641686182669, + "grad_norm": 1.0387545824050903, + "learning_rate": 4.90536829187623e-06, + "loss": 0.7043, + "step": 1892 + }, + { + "epoch": 0.5541569086651054, + "grad_norm": 1.0250980854034424, + "learning_rate": 4.90526331900317e-06, + "loss": 0.6841, + "step": 1893 + }, + { + "epoch": 0.5544496487119438, + "grad_norm": 0.9438363313674927, + "learning_rate": 4.905158289064777e-06, + "loss": 0.6089, + "step": 1894 + }, + { + "epoch": 0.5547423887587822, + "grad_norm": 1.0602233409881592, + "learning_rate": 4.90505320206354e-06, + "loss": 0.7115, + "step": 1895 + }, + { + "epoch": 0.5550351288056206, + "grad_norm": 1.006448745727539, + "learning_rate": 4.904948058001954e-06, + "loss": 0.6986, + "step": 1896 + }, + { + "epoch": 0.555327868852459, + "grad_norm": 1.043142318725586, + "learning_rate": 4.904842856882514e-06, + "loss": 0.6723, + "step": 1897 + }, + { + "epoch": 0.5556206088992974, + "grad_norm": 0.9812002778053284, + "learning_rate": 4.904737598707715e-06, + "loss": 0.6715, + "step": 1898 + }, + { + "epoch": 0.5559133489461359, + "grad_norm": 1.0515623092651367, + "learning_rate": 4.904632283480054e-06, + "loss": 0.6802, + "step": 1899 + }, + { + "epoch": 0.5562060889929742, + "grad_norm": 1.026755452156067, + "learning_rate": 4.90452691120203e-06, + "loss": 0.6938, + "step": 1900 + }, + { + "epoch": 0.5564988290398126, + "grad_norm": 0.9763220548629761, + "learning_rate": 4.904421481876144e-06, + "loss": 0.6644, + "step": 1901 + }, + { + "epoch": 0.5567915690866511, + "grad_norm": 1.0166175365447998, + "learning_rate": 4.9043159955048955e-06, + "loss": 0.7084, + "step": 1902 + }, + { + "epoch": 0.5570843091334895, + "grad_norm": 1.0289771556854248, + "learning_rate": 4.904210452090788e-06, + "loss": 0.6617, + "step": 1903 + }, + { + "epoch": 0.5573770491803278, + "grad_norm": 1.0107277631759644, + "learning_rate": 4.904104851636327e-06, + "loss": 0.6938, + "step": 1904 + }, + { + "epoch": 0.5576697892271663, + "grad_norm": 1.0075970888137817, + "learning_rate": 4.903999194144015e-06, + "loss": 0.6813, + "step": 1905 + }, + { + "epoch": 0.5579625292740047, + "grad_norm": 1.1215237379074097, + "learning_rate": 4.903893479616361e-06, + "loss": 0.6986, + "step": 1906 + }, + { + "epoch": 0.5582552693208431, + "grad_norm": 1.0908256769180298, + "learning_rate": 4.903787708055872e-06, + "loss": 0.7127, + "step": 1907 + }, + { + "epoch": 0.5585480093676815, + "grad_norm": 0.9693386554718018, + "learning_rate": 4.9036818794650585e-06, + "loss": 0.6744, + "step": 1908 + }, + { + "epoch": 0.5588407494145199, + "grad_norm": 1.0212316513061523, + "learning_rate": 4.90357599384643e-06, + "loss": 0.6659, + "step": 1909 + }, + { + "epoch": 0.5591334894613583, + "grad_norm": 1.0410579442977905, + "learning_rate": 4.903470051202499e-06, + "loss": 0.7014, + "step": 1910 + }, + { + "epoch": 0.5594262295081968, + "grad_norm": 0.9789318442344666, + "learning_rate": 4.903364051535779e-06, + "loss": 0.6842, + "step": 1911 + }, + { + "epoch": 0.5597189695550351, + "grad_norm": 0.9771793484687805, + "learning_rate": 4.903257994848787e-06, + "loss": 0.7107, + "step": 1912 + }, + { + "epoch": 0.5600117096018735, + "grad_norm": 1.0605063438415527, + "learning_rate": 4.903151881144036e-06, + "loss": 0.6953, + "step": 1913 + }, + { + "epoch": 0.560304449648712, + "grad_norm": 1.020437479019165, + "learning_rate": 4.9030457104240445e-06, + "loss": 0.6845, + "step": 1914 + }, + { + "epoch": 0.5605971896955504, + "grad_norm": 1.0253185033798218, + "learning_rate": 4.902939482691332e-06, + "loss": 0.7333, + "step": 1915 + }, + { + "epoch": 0.5608899297423887, + "grad_norm": 0.9801590442657471, + "learning_rate": 4.902833197948419e-06, + "loss": 0.6711, + "step": 1916 + }, + { + "epoch": 0.5611826697892272, + "grad_norm": 1.0052248239517212, + "learning_rate": 4.902726856197827e-06, + "loss": 0.6621, + "step": 1917 + }, + { + "epoch": 0.5614754098360656, + "grad_norm": 0.9798829555511475, + "learning_rate": 4.902620457442079e-06, + "loss": 0.6756, + "step": 1918 + }, + { + "epoch": 0.561768149882904, + "grad_norm": 1.0128339529037476, + "learning_rate": 4.902514001683698e-06, + "loss": 0.6925, + "step": 1919 + }, + { + "epoch": 0.5620608899297423, + "grad_norm": 0.993097186088562, + "learning_rate": 4.9024074889252115e-06, + "loss": 0.7123, + "step": 1920 + }, + { + "epoch": 0.5623536299765808, + "grad_norm": 1.0093265771865845, + "learning_rate": 4.902300919169146e-06, + "loss": 0.6716, + "step": 1921 + }, + { + "epoch": 0.5626463700234192, + "grad_norm": 0.9501147866249084, + "learning_rate": 4.902194292418029e-06, + "loss": 0.6747, + "step": 1922 + }, + { + "epoch": 0.5629391100702577, + "grad_norm": 0.9980944395065308, + "learning_rate": 4.902087608674391e-06, + "loss": 0.7381, + "step": 1923 + }, + { + "epoch": 0.563231850117096, + "grad_norm": 0.9505720138549805, + "learning_rate": 4.901980867940763e-06, + "loss": 0.6803, + "step": 1924 + }, + { + "epoch": 0.5635245901639344, + "grad_norm": 1.0228817462921143, + "learning_rate": 4.901874070219678e-06, + "loss": 0.7002, + "step": 1925 + }, + { + "epoch": 0.5638173302107728, + "grad_norm": 1.0854846239089966, + "learning_rate": 4.901767215513668e-06, + "loss": 0.6624, + "step": 1926 + }, + { + "epoch": 0.5641100702576113, + "grad_norm": 1.0195280313491821, + "learning_rate": 4.901660303825271e-06, + "loss": 0.7144, + "step": 1927 + }, + { + "epoch": 0.5644028103044496, + "grad_norm": 1.022262454032898, + "learning_rate": 4.90155333515702e-06, + "loss": 0.6695, + "step": 1928 + }, + { + "epoch": 0.564695550351288, + "grad_norm": 1.0303404331207275, + "learning_rate": 4.901446309511456e-06, + "loss": 0.6743, + "step": 1929 + }, + { + "epoch": 0.5649882903981265, + "grad_norm": 0.9545363187789917, + "learning_rate": 4.9013392268911165e-06, + "loss": 0.6932, + "step": 1930 + }, + { + "epoch": 0.5652810304449649, + "grad_norm": 1.0168342590332031, + "learning_rate": 4.901232087298543e-06, + "loss": 0.6565, + "step": 1931 + }, + { + "epoch": 0.5655737704918032, + "grad_norm": 1.0357860326766968, + "learning_rate": 4.9011248907362764e-06, + "loss": 0.6621, + "step": 1932 + }, + { + "epoch": 0.5658665105386417, + "grad_norm": 0.9940609931945801, + "learning_rate": 4.90101763720686e-06, + "loss": 0.6482, + "step": 1933 + }, + { + "epoch": 0.5661592505854801, + "grad_norm": 1.0219368934631348, + "learning_rate": 4.9009103267128396e-06, + "loss": 0.6716, + "step": 1934 + }, + { + "epoch": 0.5664519906323185, + "grad_norm": 0.9630075097084045, + "learning_rate": 4.900802959256761e-06, + "loss": 0.7087, + "step": 1935 + }, + { + "epoch": 0.5667447306791569, + "grad_norm": 1.0083407163619995, + "learning_rate": 4.90069553484117e-06, + "loss": 0.693, + "step": 1936 + }, + { + "epoch": 0.5670374707259953, + "grad_norm": 1.0997103452682495, + "learning_rate": 4.900588053468617e-06, + "loss": 0.7317, + "step": 1937 + }, + { + "epoch": 0.5673302107728337, + "grad_norm": 0.9609708189964294, + "learning_rate": 4.90048051514165e-06, + "loss": 0.7049, + "step": 1938 + }, + { + "epoch": 0.5676229508196722, + "grad_norm": 1.0154017210006714, + "learning_rate": 4.900372919862823e-06, + "loss": 0.659, + "step": 1939 + }, + { + "epoch": 0.5679156908665105, + "grad_norm": 0.9805722832679749, + "learning_rate": 4.900265267634686e-06, + "loss": 0.6874, + "step": 1940 + }, + { + "epoch": 0.5682084309133489, + "grad_norm": 0.9850307703018188, + "learning_rate": 4.9001575584597946e-06, + "loss": 0.6467, + "step": 1941 + }, + { + "epoch": 0.5685011709601874, + "grad_norm": 1.0495502948760986, + "learning_rate": 4.900049792340705e-06, + "loss": 0.6619, + "step": 1942 + }, + { + "epoch": 0.5687939110070258, + "grad_norm": 1.1207689046859741, + "learning_rate": 4.899941969279972e-06, + "loss": 0.6886, + "step": 1943 + }, + { + "epoch": 0.5690866510538641, + "grad_norm": 1.0259872674942017, + "learning_rate": 4.899834089280156e-06, + "loss": 0.6796, + "step": 1944 + }, + { + "epoch": 0.5693793911007026, + "grad_norm": 1.0038162469863892, + "learning_rate": 4.899726152343814e-06, + "loss": 0.6452, + "step": 1945 + }, + { + "epoch": 0.569672131147541, + "grad_norm": 1.052854061126709, + "learning_rate": 4.899618158473508e-06, + "loss": 0.7164, + "step": 1946 + }, + { + "epoch": 0.5699648711943794, + "grad_norm": 0.9901336431503296, + "learning_rate": 4.8995101076718e-06, + "loss": 0.6903, + "step": 1947 + }, + { + "epoch": 0.5702576112412178, + "grad_norm": 1.0555438995361328, + "learning_rate": 4.8994019999412546e-06, + "loss": 0.7291, + "step": 1948 + }, + { + "epoch": 0.5705503512880562, + "grad_norm": 1.0068293809890747, + "learning_rate": 4.899293835284436e-06, + "loss": 0.6786, + "step": 1949 + }, + { + "epoch": 0.5708430913348946, + "grad_norm": 0.9520289301872253, + "learning_rate": 4.899185613703909e-06, + "loss": 0.6851, + "step": 1950 + }, + { + "epoch": 0.5711358313817331, + "grad_norm": 1.0301380157470703, + "learning_rate": 4.8990773352022425e-06, + "loss": 0.6677, + "step": 1951 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 1.0185621976852417, + "learning_rate": 4.898968999782005e-06, + "loss": 0.6876, + "step": 1952 + }, + { + "epoch": 0.5717213114754098, + "grad_norm": 1.0196335315704346, + "learning_rate": 4.898860607445769e-06, + "loss": 0.7031, + "step": 1953 + }, + { + "epoch": 0.5720140515222483, + "grad_norm": 1.2047932147979736, + "learning_rate": 4.898752158196102e-06, + "loss": 0.6823, + "step": 1954 + }, + { + "epoch": 0.5723067915690867, + "grad_norm": 1.0050805807113647, + "learning_rate": 4.898643652035581e-06, + "loss": 0.7088, + "step": 1955 + }, + { + "epoch": 0.572599531615925, + "grad_norm": 0.9705953598022461, + "learning_rate": 4.898535088966777e-06, + "loss": 0.6863, + "step": 1956 + }, + { + "epoch": 0.5728922716627635, + "grad_norm": 1.0476975440979004, + "learning_rate": 4.898426468992268e-06, + "loss": 0.698, + "step": 1957 + }, + { + "epoch": 0.5731850117096019, + "grad_norm": 0.9642202258110046, + "learning_rate": 4.898317792114631e-06, + "loss": 0.6495, + "step": 1958 + }, + { + "epoch": 0.5734777517564403, + "grad_norm": 1.0621931552886963, + "learning_rate": 4.898209058336442e-06, + "loss": 0.6993, + "step": 1959 + }, + { + "epoch": 0.5737704918032787, + "grad_norm": 1.0257246494293213, + "learning_rate": 4.898100267660283e-06, + "loss": 0.6921, + "step": 1960 + }, + { + "epoch": 0.5740632318501171, + "grad_norm": 1.0001678466796875, + "learning_rate": 4.8979914200887345e-06, + "loss": 0.6823, + "step": 1961 + }, + { + "epoch": 0.5743559718969555, + "grad_norm": 0.9933276772499084, + "learning_rate": 4.897882515624379e-06, + "loss": 0.6541, + "step": 1962 + }, + { + "epoch": 0.574648711943794, + "grad_norm": 1.0018783807754517, + "learning_rate": 4.8977735542698e-06, + "loss": 0.67, + "step": 1963 + }, + { + "epoch": 0.5749414519906323, + "grad_norm": 1.009622573852539, + "learning_rate": 4.8976645360275834e-06, + "loss": 0.6845, + "step": 1964 + }, + { + "epoch": 0.5752341920374707, + "grad_norm": 0.9902573227882385, + "learning_rate": 4.897555460900314e-06, + "loss": 0.654, + "step": 1965 + }, + { + "epoch": 0.5755269320843092, + "grad_norm": 0.9849370121955872, + "learning_rate": 4.897446328890581e-06, + "loss": 0.6564, + "step": 1966 + }, + { + "epoch": 0.5758196721311475, + "grad_norm": 0.9845024347305298, + "learning_rate": 4.897337140000973e-06, + "loss": 0.635, + "step": 1967 + }, + { + "epoch": 0.5761124121779859, + "grad_norm": 1.0259181261062622, + "learning_rate": 4.897227894234081e-06, + "loss": 0.7023, + "step": 1968 + }, + { + "epoch": 0.5764051522248244, + "grad_norm": 1.104998230934143, + "learning_rate": 4.8971185915924965e-06, + "loss": 0.7438, + "step": 1969 + }, + { + "epoch": 0.5766978922716628, + "grad_norm": 0.9704550504684448, + "learning_rate": 4.897009232078813e-06, + "loss": 0.7169, + "step": 1970 + }, + { + "epoch": 0.5769906323185011, + "grad_norm": 1.037165880203247, + "learning_rate": 4.896899815695626e-06, + "loss": 0.7045, + "step": 1971 + }, + { + "epoch": 0.5772833723653396, + "grad_norm": 0.930844247341156, + "learning_rate": 4.896790342445529e-06, + "loss": 0.6642, + "step": 1972 + }, + { + "epoch": 0.577576112412178, + "grad_norm": 1.0120642185211182, + "learning_rate": 4.896680812331122e-06, + "loss": 0.6718, + "step": 1973 + }, + { + "epoch": 0.5778688524590164, + "grad_norm": 0.9561020135879517, + "learning_rate": 4.896571225355001e-06, + "loss": 0.6446, + "step": 1974 + }, + { + "epoch": 0.5781615925058547, + "grad_norm": 1.1918706893920898, + "learning_rate": 4.896461581519767e-06, + "loss": 0.6616, + "step": 1975 + }, + { + "epoch": 0.5784543325526932, + "grad_norm": 0.988662838935852, + "learning_rate": 4.896351880828022e-06, + "loss": 0.6528, + "step": 1976 + }, + { + "epoch": 0.5787470725995316, + "grad_norm": 0.9943236112594604, + "learning_rate": 4.896242123282369e-06, + "loss": 0.7018, + "step": 1977 + }, + { + "epoch": 0.5790398126463701, + "grad_norm": 1.0132771730422974, + "learning_rate": 4.896132308885411e-06, + "loss": 0.6862, + "step": 1978 + }, + { + "epoch": 0.5793325526932084, + "grad_norm": 1.0089607238769531, + "learning_rate": 4.896022437639753e-06, + "loss": 0.6706, + "step": 1979 + }, + { + "epoch": 0.5796252927400468, + "grad_norm": 1.0281457901000977, + "learning_rate": 4.895912509548003e-06, + "loss": 0.685, + "step": 1980 + }, + { + "epoch": 0.5799180327868853, + "grad_norm": 0.9946911334991455, + "learning_rate": 4.895802524612767e-06, + "loss": 0.7131, + "step": 1981 + }, + { + "epoch": 0.5802107728337237, + "grad_norm": 0.9313334822654724, + "learning_rate": 4.895692482836658e-06, + "loss": 0.6251, + "step": 1982 + }, + { + "epoch": 0.580503512880562, + "grad_norm": 1.0116848945617676, + "learning_rate": 4.8955823842222824e-06, + "loss": 0.7064, + "step": 1983 + }, + { + "epoch": 0.5807962529274004, + "grad_norm": 1.0355197191238403, + "learning_rate": 4.8954722287722565e-06, + "loss": 0.6893, + "step": 1984 + }, + { + "epoch": 0.5810889929742389, + "grad_norm": 1.0446765422821045, + "learning_rate": 4.89536201648919e-06, + "loss": 0.6829, + "step": 1985 + }, + { + "epoch": 0.5813817330210773, + "grad_norm": 0.9807254672050476, + "learning_rate": 4.8952517473757006e-06, + "loss": 0.6781, + "step": 1986 + }, + { + "epoch": 0.5816744730679156, + "grad_norm": 1.0295993089675903, + "learning_rate": 4.895141421434403e-06, + "loss": 0.6696, + "step": 1987 + }, + { + "epoch": 0.5819672131147541, + "grad_norm": 0.9719418883323669, + "learning_rate": 4.895031038667914e-06, + "loss": 0.6631, + "step": 1988 + }, + { + "epoch": 0.5822599531615925, + "grad_norm": 0.9616822600364685, + "learning_rate": 4.894920599078855e-06, + "loss": 0.663, + "step": 1989 + }, + { + "epoch": 0.582552693208431, + "grad_norm": 1.0723292827606201, + "learning_rate": 4.894810102669844e-06, + "loss": 0.6573, + "step": 1990 + }, + { + "epoch": 0.5828454332552693, + "grad_norm": 1.0210849046707153, + "learning_rate": 4.894699549443504e-06, + "loss": 0.7135, + "step": 1991 + }, + { + "epoch": 0.5831381733021077, + "grad_norm": 0.9706221222877502, + "learning_rate": 4.894588939402456e-06, + "loss": 0.6357, + "step": 1992 + }, + { + "epoch": 0.5834309133489461, + "grad_norm": 1.0033648014068604, + "learning_rate": 4.894478272549326e-06, + "loss": 0.6949, + "step": 1993 + }, + { + "epoch": 0.5837236533957846, + "grad_norm": 0.991218090057373, + "learning_rate": 4.894367548886739e-06, + "loss": 0.6663, + "step": 1994 + }, + { + "epoch": 0.5840163934426229, + "grad_norm": 1.0086917877197266, + "learning_rate": 4.894256768417321e-06, + "loss": 0.6789, + "step": 1995 + }, + { + "epoch": 0.5843091334894613, + "grad_norm": 0.9716125726699829, + "learning_rate": 4.894145931143703e-06, + "loss": 0.6887, + "step": 1996 + }, + { + "epoch": 0.5846018735362998, + "grad_norm": 0.9442796111106873, + "learning_rate": 4.8940350370685106e-06, + "loss": 0.6367, + "step": 1997 + }, + { + "epoch": 0.5848946135831382, + "grad_norm": 1.0405166149139404, + "learning_rate": 4.893924086194378e-06, + "loss": 0.715, + "step": 1998 + }, + { + "epoch": 0.5851873536299765, + "grad_norm": 1.00184965133667, + "learning_rate": 4.893813078523937e-06, + "loss": 0.6915, + "step": 1999 + }, + { + "epoch": 0.585480093676815, + "grad_norm": 0.9621688723564148, + "learning_rate": 4.89370201405982e-06, + "loss": 0.6803, + "step": 2000 + }, + { + "epoch": 0.5857728337236534, + "grad_norm": 0.9902613162994385, + "learning_rate": 4.893590892804664e-06, + "loss": 0.6709, + "step": 2001 + }, + { + "epoch": 0.5860655737704918, + "grad_norm": 0.9320769309997559, + "learning_rate": 4.893479714761103e-06, + "loss": 0.6606, + "step": 2002 + }, + { + "epoch": 0.5863583138173302, + "grad_norm": 0.9924476146697998, + "learning_rate": 4.893368479931776e-06, + "loss": 0.6516, + "step": 2003 + }, + { + "epoch": 0.5866510538641686, + "grad_norm": 0.9528056979179382, + "learning_rate": 4.8932571883193225e-06, + "loss": 0.6466, + "step": 2004 + }, + { + "epoch": 0.586943793911007, + "grad_norm": 0.9802749156951904, + "learning_rate": 4.893145839926382e-06, + "loss": 0.6923, + "step": 2005 + }, + { + "epoch": 0.5872365339578455, + "grad_norm": 1.0453370809555054, + "learning_rate": 4.893034434755597e-06, + "loss": 0.7246, + "step": 2006 + }, + { + "epoch": 0.5875292740046838, + "grad_norm": 1.1529754400253296, + "learning_rate": 4.89292297280961e-06, + "loss": 0.6715, + "step": 2007 + }, + { + "epoch": 0.5878220140515222, + "grad_norm": 1.0329431295394897, + "learning_rate": 4.8928114540910664e-06, + "loss": 0.716, + "step": 2008 + }, + { + "epoch": 0.5881147540983607, + "grad_norm": 0.9866818785667419, + "learning_rate": 4.892699878602611e-06, + "loss": 0.6929, + "step": 2009 + }, + { + "epoch": 0.5884074941451991, + "grad_norm": 1.0163047313690186, + "learning_rate": 4.892588246346891e-06, + "loss": 0.683, + "step": 2010 + }, + { + "epoch": 0.5887002341920374, + "grad_norm": 0.9908021688461304, + "learning_rate": 4.8924765573265555e-06, + "loss": 0.6597, + "step": 2011 + }, + { + "epoch": 0.5889929742388759, + "grad_norm": 1.0197739601135254, + "learning_rate": 4.892364811544254e-06, + "loss": 0.6668, + "step": 2012 + }, + { + "epoch": 0.5892857142857143, + "grad_norm": 0.9864645600318909, + "learning_rate": 4.8922530090026375e-06, + "loss": 0.6634, + "step": 2013 + }, + { + "epoch": 0.5895784543325527, + "grad_norm": 0.9930760264396667, + "learning_rate": 4.892141149704359e-06, + "loss": 0.7226, + "step": 2014 + }, + { + "epoch": 0.5898711943793911, + "grad_norm": 1.0164624452590942, + "learning_rate": 4.892029233652072e-06, + "loss": 0.6919, + "step": 2015 + }, + { + "epoch": 0.5901639344262295, + "grad_norm": 1.017778992652893, + "learning_rate": 4.891917260848432e-06, + "loss": 0.7159, + "step": 2016 + }, + { + "epoch": 0.5904566744730679, + "grad_norm": 1.041526198387146, + "learning_rate": 4.891805231296095e-06, + "loss": 0.6566, + "step": 2017 + }, + { + "epoch": 0.5907494145199064, + "grad_norm": 1.0427558422088623, + "learning_rate": 4.89169314499772e-06, + "loss": 0.6095, + "step": 2018 + }, + { + "epoch": 0.5910421545667447, + "grad_norm": 0.9903550744056702, + "learning_rate": 4.891581001955965e-06, + "loss": 0.6566, + "step": 2019 + }, + { + "epoch": 0.5913348946135831, + "grad_norm": 0.9671207666397095, + "learning_rate": 4.891468802173492e-06, + "loss": 0.6791, + "step": 2020 + }, + { + "epoch": 0.5916276346604216, + "grad_norm": 0.9732134938240051, + "learning_rate": 4.891356545652962e-06, + "loss": 0.6896, + "step": 2021 + }, + { + "epoch": 0.59192037470726, + "grad_norm": 1.0326652526855469, + "learning_rate": 4.891244232397038e-06, + "loss": 0.6884, + "step": 2022 + }, + { + "epoch": 0.5922131147540983, + "grad_norm": 0.9917556643486023, + "learning_rate": 4.891131862408386e-06, + "loss": 0.7071, + "step": 2023 + }, + { + "epoch": 0.5925058548009368, + "grad_norm": 1.0305869579315186, + "learning_rate": 4.89101943568967e-06, + "loss": 0.7259, + "step": 2024 + }, + { + "epoch": 0.5927985948477752, + "grad_norm": 1.0207642316818237, + "learning_rate": 4.890906952243559e-06, + "loss": 0.696, + "step": 2025 + }, + { + "epoch": 0.5930913348946136, + "grad_norm": 1.0522650480270386, + "learning_rate": 4.8907944120727215e-06, + "loss": 0.6958, + "step": 2026 + }, + { + "epoch": 0.593384074941452, + "grad_norm": 0.9618912935256958, + "learning_rate": 4.890681815179827e-06, + "loss": 0.6817, + "step": 2027 + }, + { + "epoch": 0.5936768149882904, + "grad_norm": 1.042024850845337, + "learning_rate": 4.890569161567547e-06, + "loss": 0.6948, + "step": 2028 + }, + { + "epoch": 0.5939695550351288, + "grad_norm": 0.9516180157661438, + "learning_rate": 4.8904564512385555e-06, + "loss": 0.6525, + "step": 2029 + }, + { + "epoch": 0.5942622950819673, + "grad_norm": 1.0165082216262817, + "learning_rate": 4.890343684195525e-06, + "loss": 0.699, + "step": 2030 + }, + { + "epoch": 0.5945550351288056, + "grad_norm": 1.0071173906326294, + "learning_rate": 4.8902308604411304e-06, + "loss": 0.6747, + "step": 2031 + }, + { + "epoch": 0.594847775175644, + "grad_norm": 0.9805462956428528, + "learning_rate": 4.89011797997805e-06, + "loss": 0.6892, + "step": 2032 + }, + { + "epoch": 0.5951405152224825, + "grad_norm": 0.9578855037689209, + "learning_rate": 4.890005042808961e-06, + "loss": 0.6833, + "step": 2033 + }, + { + "epoch": 0.5954332552693209, + "grad_norm": 0.9208221435546875, + "learning_rate": 4.889892048936544e-06, + "loss": 0.6602, + "step": 2034 + }, + { + "epoch": 0.5957259953161592, + "grad_norm": 0.9665013551712036, + "learning_rate": 4.889778998363478e-06, + "loss": 0.6219, + "step": 2035 + }, + { + "epoch": 0.5960187353629977, + "grad_norm": 1.0092709064483643, + "learning_rate": 4.889665891092447e-06, + "loss": 0.69, + "step": 2036 + }, + { + "epoch": 0.5963114754098361, + "grad_norm": 0.9781215190887451, + "learning_rate": 4.889552727126133e-06, + "loss": 0.6752, + "step": 2037 + }, + { + "epoch": 0.5966042154566745, + "grad_norm": 0.9660111665725708, + "learning_rate": 4.889439506467223e-06, + "loss": 0.6819, + "step": 2038 + }, + { + "epoch": 0.5968969555035128, + "grad_norm": 0.9841863512992859, + "learning_rate": 4.889326229118399e-06, + "loss": 0.6775, + "step": 2039 + }, + { + "epoch": 0.5971896955503513, + "grad_norm": 0.9742935299873352, + "learning_rate": 4.889212895082354e-06, + "loss": 0.6845, + "step": 2040 + }, + { + "epoch": 0.5974824355971897, + "grad_norm": 0.9452866315841675, + "learning_rate": 4.889099504361771e-06, + "loss": 0.6186, + "step": 2041 + }, + { + "epoch": 0.5977751756440282, + "grad_norm": 1.0834399461746216, + "learning_rate": 4.888986056959346e-06, + "loss": 0.6481, + "step": 2042 + }, + { + "epoch": 0.5980679156908665, + "grad_norm": 1.1003448963165283, + "learning_rate": 4.888872552877766e-06, + "loss": 0.7148, + "step": 2043 + }, + { + "epoch": 0.5983606557377049, + "grad_norm": 0.9434350728988647, + "learning_rate": 4.888758992119726e-06, + "loss": 0.6889, + "step": 2044 + }, + { + "epoch": 0.5986533957845434, + "grad_norm": 0.9506006240844727, + "learning_rate": 4.888645374687922e-06, + "loss": 0.6729, + "step": 2045 + }, + { + "epoch": 0.5989461358313818, + "grad_norm": 1.0023024082183838, + "learning_rate": 4.888531700585045e-06, + "loss": 0.6941, + "step": 2046 + }, + { + "epoch": 0.5992388758782201, + "grad_norm": 0.9950646162033081, + "learning_rate": 4.888417969813795e-06, + "loss": 0.7009, + "step": 2047 + }, + { + "epoch": 0.5995316159250585, + "grad_norm": 1.0402599573135376, + "learning_rate": 4.888304182376871e-06, + "loss": 0.6899, + "step": 2048 + }, + { + "epoch": 0.599824355971897, + "grad_norm": 1.0464391708374023, + "learning_rate": 4.88819033827697e-06, + "loss": 0.692, + "step": 2049 + }, + { + "epoch": 0.6001170960187353, + "grad_norm": 0.9718529582023621, + "learning_rate": 4.888076437516796e-06, + "loss": 0.7239, + "step": 2050 + }, + { + "epoch": 0.6004098360655737, + "grad_norm": 0.9968354105949402, + "learning_rate": 4.8879624800990485e-06, + "loss": 0.6153, + "step": 2051 + }, + { + "epoch": 0.6007025761124122, + "grad_norm": 1.0289498567581177, + "learning_rate": 4.887848466026433e-06, + "loss": 0.6813, + "step": 2052 + }, + { + "epoch": 0.6009953161592506, + "grad_norm": 1.0324561595916748, + "learning_rate": 4.887734395301653e-06, + "loss": 0.7104, + "step": 2053 + }, + { + "epoch": 0.6012880562060889, + "grad_norm": 1.089612603187561, + "learning_rate": 4.887620267927417e-06, + "loss": 0.7258, + "step": 2054 + }, + { + "epoch": 0.6015807962529274, + "grad_norm": 0.9292313456535339, + "learning_rate": 4.887506083906431e-06, + "loss": 0.6337, + "step": 2055 + }, + { + "epoch": 0.6018735362997658, + "grad_norm": 0.963144063949585, + "learning_rate": 4.887391843241404e-06, + "loss": 0.6496, + "step": 2056 + }, + { + "epoch": 0.6021662763466042, + "grad_norm": 1.0018730163574219, + "learning_rate": 4.887277545935046e-06, + "loss": 0.6794, + "step": 2057 + }, + { + "epoch": 0.6024590163934426, + "grad_norm": 0.9901618957519531, + "learning_rate": 4.887163191990072e-06, + "loss": 0.6694, + "step": 2058 + }, + { + "epoch": 0.602751756440281, + "grad_norm": 0.9424851536750793, + "learning_rate": 4.88704878140919e-06, + "loss": 0.641, + "step": 2059 + }, + { + "epoch": 0.6030444964871194, + "grad_norm": 1.0016788244247437, + "learning_rate": 4.886934314195119e-06, + "loss": 0.6938, + "step": 2060 + }, + { + "epoch": 0.6033372365339579, + "grad_norm": 1.0132185220718384, + "learning_rate": 4.886819790350572e-06, + "loss": 0.7016, + "step": 2061 + }, + { + "epoch": 0.6036299765807962, + "grad_norm": 1.0027257204055786, + "learning_rate": 4.886705209878267e-06, + "loss": 0.6602, + "step": 2062 + }, + { + "epoch": 0.6039227166276346, + "grad_norm": 1.0321905612945557, + "learning_rate": 4.886590572780923e-06, + "loss": 0.6792, + "step": 2063 + }, + { + "epoch": 0.6042154566744731, + "grad_norm": 1.003283143043518, + "learning_rate": 4.886475879061258e-06, + "loss": 0.6697, + "step": 2064 + }, + { + "epoch": 0.6045081967213115, + "grad_norm": 0.950910210609436, + "learning_rate": 4.886361128721995e-06, + "loss": 0.6665, + "step": 2065 + }, + { + "epoch": 0.6048009367681498, + "grad_norm": 0.9663719534873962, + "learning_rate": 4.8862463217658555e-06, + "loss": 0.6731, + "step": 2066 + }, + { + "epoch": 0.6050936768149883, + "grad_norm": 0.9885073304176331, + "learning_rate": 4.886131458195564e-06, + "loss": 0.6479, + "step": 2067 + }, + { + "epoch": 0.6053864168618267, + "grad_norm": 0.9063718914985657, + "learning_rate": 4.886016538013845e-06, + "loss": 0.6068, + "step": 2068 + }, + { + "epoch": 0.6056791569086651, + "grad_norm": 1.0064513683319092, + "learning_rate": 4.885901561223425e-06, + "loss": 0.7074, + "step": 2069 + }, + { + "epoch": 0.6059718969555035, + "grad_norm": 1.0011066198349, + "learning_rate": 4.885786527827033e-06, + "loss": 0.6669, + "step": 2070 + }, + { + "epoch": 0.6062646370023419, + "grad_norm": 1.0139529705047607, + "learning_rate": 4.8856714378273955e-06, + "loss": 0.6779, + "step": 2071 + }, + { + "epoch": 0.6065573770491803, + "grad_norm": 0.9494311809539795, + "learning_rate": 4.885556291227246e-06, + "loss": 0.6832, + "step": 2072 + }, + { + "epoch": 0.6068501170960188, + "grad_norm": 0.9486610889434814, + "learning_rate": 4.885441088029315e-06, + "loss": 0.6617, + "step": 2073 + }, + { + "epoch": 0.6071428571428571, + "grad_norm": 0.9999540448188782, + "learning_rate": 4.8853258282363355e-06, + "loss": 0.6785, + "step": 2074 + }, + { + "epoch": 0.6074355971896955, + "grad_norm": 0.9828449487686157, + "learning_rate": 4.885210511851043e-06, + "loss": 0.6993, + "step": 2075 + }, + { + "epoch": 0.607728337236534, + "grad_norm": 1.0199376344680786, + "learning_rate": 4.8850951388761725e-06, + "loss": 0.7027, + "step": 2076 + }, + { + "epoch": 0.6080210772833724, + "grad_norm": 1.035446047782898, + "learning_rate": 4.884979709314462e-06, + "loss": 0.7011, + "step": 2077 + }, + { + "epoch": 0.6083138173302107, + "grad_norm": 1.0009357929229736, + "learning_rate": 4.884864223168649e-06, + "loss": 0.7091, + "step": 2078 + }, + { + "epoch": 0.6086065573770492, + "grad_norm": 1.0007990598678589, + "learning_rate": 4.884748680441475e-06, + "loss": 0.7109, + "step": 2079 + }, + { + "epoch": 0.6088992974238876, + "grad_norm": 0.9805217385292053, + "learning_rate": 4.884633081135679e-06, + "loss": 0.661, + "step": 2080 + }, + { + "epoch": 0.609192037470726, + "grad_norm": 1.0142450332641602, + "learning_rate": 4.884517425254007e-06, + "loss": 0.6851, + "step": 2081 + }, + { + "epoch": 0.6094847775175644, + "grad_norm": 1.0245341062545776, + "learning_rate": 4.8844017127992e-06, + "loss": 0.7226, + "step": 2082 + }, + { + "epoch": 0.6097775175644028, + "grad_norm": 0.9885476231575012, + "learning_rate": 4.884285943774005e-06, + "loss": 0.6733, + "step": 2083 + }, + { + "epoch": 0.6100702576112412, + "grad_norm": 1.0242372751235962, + "learning_rate": 4.884170118181166e-06, + "loss": 0.7316, + "step": 2084 + }, + { + "epoch": 0.6103629976580797, + "grad_norm": 1.0566155910491943, + "learning_rate": 4.884054236023434e-06, + "loss": 0.6349, + "step": 2085 + }, + { + "epoch": 0.610655737704918, + "grad_norm": 0.9293136596679688, + "learning_rate": 4.883938297303557e-06, + "loss": 0.6427, + "step": 2086 + }, + { + "epoch": 0.6109484777517564, + "grad_norm": 1.0023229122161865, + "learning_rate": 4.8838223020242865e-06, + "loss": 0.6467, + "step": 2087 + }, + { + "epoch": 0.6112412177985949, + "grad_norm": 0.9831147193908691, + "learning_rate": 4.883706250188373e-06, + "loss": 0.664, + "step": 2088 + }, + { + "epoch": 0.6115339578454333, + "grad_norm": 1.0530221462249756, + "learning_rate": 4.8835901417985715e-06, + "loss": 0.6581, + "step": 2089 + }, + { + "epoch": 0.6118266978922716, + "grad_norm": 0.9839169383049011, + "learning_rate": 4.883473976857636e-06, + "loss": 0.6555, + "step": 2090 + }, + { + "epoch": 0.6121194379391101, + "grad_norm": 1.0624605417251587, + "learning_rate": 4.883357755368322e-06, + "loss": 0.6743, + "step": 2091 + }, + { + "epoch": 0.6124121779859485, + "grad_norm": 1.0112485885620117, + "learning_rate": 4.883241477333388e-06, + "loss": 0.6601, + "step": 2092 + }, + { + "epoch": 0.6127049180327869, + "grad_norm": 0.9963438510894775, + "learning_rate": 4.883125142755591e-06, + "loss": 0.6714, + "step": 2093 + }, + { + "epoch": 0.6129976580796253, + "grad_norm": 0.9917556643486023, + "learning_rate": 4.883008751637694e-06, + "loss": 0.7211, + "step": 2094 + }, + { + "epoch": 0.6132903981264637, + "grad_norm": 0.999608039855957, + "learning_rate": 4.882892303982454e-06, + "loss": 0.6781, + "step": 2095 + }, + { + "epoch": 0.6135831381733021, + "grad_norm": 1.0016415119171143, + "learning_rate": 4.882775799792638e-06, + "loss": 0.6595, + "step": 2096 + }, + { + "epoch": 0.6138758782201406, + "grad_norm": 0.9366666674613953, + "learning_rate": 4.882659239071008e-06, + "loss": 0.6809, + "step": 2097 + }, + { + "epoch": 0.6141686182669789, + "grad_norm": 0.9500702023506165, + "learning_rate": 4.88254262182033e-06, + "loss": 0.6715, + "step": 2098 + }, + { + "epoch": 0.6144613583138173, + "grad_norm": 0.9814761877059937, + "learning_rate": 4.88242594804337e-06, + "loss": 0.708, + "step": 2099 + }, + { + "epoch": 0.6147540983606558, + "grad_norm": 0.995089054107666, + "learning_rate": 4.882309217742897e-06, + "loss": 0.6964, + "step": 2100 + }, + { + "epoch": 0.6150468384074942, + "grad_norm": 1.038294792175293, + "learning_rate": 4.88219243092168e-06, + "loss": 0.7181, + "step": 2101 + }, + { + "epoch": 0.6153395784543325, + "grad_norm": 0.9497870206832886, + "learning_rate": 4.88207558758249e-06, + "loss": 0.7141, + "step": 2102 + }, + { + "epoch": 0.615632318501171, + "grad_norm": 0.9202408194541931, + "learning_rate": 4.881958687728099e-06, + "loss": 0.6425, + "step": 2103 + }, + { + "epoch": 0.6159250585480094, + "grad_norm": 1.0408422946929932, + "learning_rate": 4.881841731361281e-06, + "loss": 0.633, + "step": 2104 + }, + { + "epoch": 0.6162177985948478, + "grad_norm": 1.1203434467315674, + "learning_rate": 4.881724718484809e-06, + "loss": 0.7069, + "step": 2105 + }, + { + "epoch": 0.6165105386416861, + "grad_norm": 0.9299518465995789, + "learning_rate": 4.881607649101462e-06, + "loss": 0.6263, + "step": 2106 + }, + { + "epoch": 0.6168032786885246, + "grad_norm": 0.9951931834220886, + "learning_rate": 4.881490523214015e-06, + "loss": 0.6902, + "step": 2107 + }, + { + "epoch": 0.617096018735363, + "grad_norm": 0.9375085830688477, + "learning_rate": 4.881373340825249e-06, + "loss": 0.6217, + "step": 2108 + }, + { + "epoch": 0.6173887587822015, + "grad_norm": 1.0653786659240723, + "learning_rate": 4.881256101937941e-06, + "loss": 0.6981, + "step": 2109 + }, + { + "epoch": 0.6176814988290398, + "grad_norm": 1.0366475582122803, + "learning_rate": 4.881138806554876e-06, + "loss": 0.677, + "step": 2110 + }, + { + "epoch": 0.6179742388758782, + "grad_norm": 0.9610787630081177, + "learning_rate": 4.881021454678835e-06, + "loss": 0.6563, + "step": 2111 + }, + { + "epoch": 0.6182669789227166, + "grad_norm": 1.1080557107925415, + "learning_rate": 4.880904046312602e-06, + "loss": 0.6485, + "step": 2112 + }, + { + "epoch": 0.6185597189695551, + "grad_norm": 0.9901196360588074, + "learning_rate": 4.880786581458964e-06, + "loss": 0.7126, + "step": 2113 + }, + { + "epoch": 0.6188524590163934, + "grad_norm": 1.0501980781555176, + "learning_rate": 4.880669060120706e-06, + "loss": 0.65, + "step": 2114 + }, + { + "epoch": 0.6191451990632318, + "grad_norm": 0.9888202548027039, + "learning_rate": 4.880551482300618e-06, + "loss": 0.6822, + "step": 2115 + }, + { + "epoch": 0.6194379391100703, + "grad_norm": 1.0122631788253784, + "learning_rate": 4.880433848001488e-06, + "loss": 0.6657, + "step": 2116 + }, + { + "epoch": 0.6197306791569087, + "grad_norm": 1.0534694194793701, + "learning_rate": 4.880316157226108e-06, + "loss": 0.7, + "step": 2117 + }, + { + "epoch": 0.620023419203747, + "grad_norm": 1.0107978582382202, + "learning_rate": 4.88019840997727e-06, + "loss": 0.6558, + "step": 2118 + }, + { + "epoch": 0.6203161592505855, + "grad_norm": 1.0370997190475464, + "learning_rate": 4.880080606257768e-06, + "loss": 0.6236, + "step": 2119 + }, + { + "epoch": 0.6206088992974239, + "grad_norm": 0.9080067873001099, + "learning_rate": 4.879962746070396e-06, + "loss": 0.6507, + "step": 2120 + }, + { + "epoch": 0.6209016393442623, + "grad_norm": 0.9555186033248901, + "learning_rate": 4.87984482941795e-06, + "loss": 0.6748, + "step": 2121 + }, + { + "epoch": 0.6211943793911007, + "grad_norm": 0.994879424571991, + "learning_rate": 4.8797268563032285e-06, + "loss": 0.6225, + "step": 2122 + }, + { + "epoch": 0.6214871194379391, + "grad_norm": 0.9535747170448303, + "learning_rate": 4.87960882672903e-06, + "loss": 0.6684, + "step": 2123 + }, + { + "epoch": 0.6217798594847775, + "grad_norm": 1.0143144130706787, + "learning_rate": 4.879490740698155e-06, + "loss": 0.714, + "step": 2124 + }, + { + "epoch": 0.622072599531616, + "grad_norm": 0.961336612701416, + "learning_rate": 4.879372598213405e-06, + "loss": 0.6857, + "step": 2125 + }, + { + "epoch": 0.6223653395784543, + "grad_norm": 0.9656534790992737, + "learning_rate": 4.879254399277583e-06, + "loss": 0.6806, + "step": 2126 + }, + { + "epoch": 0.6226580796252927, + "grad_norm": 0.9691649079322815, + "learning_rate": 4.8791361438934935e-06, + "loss": 0.6605, + "step": 2127 + }, + { + "epoch": 0.6229508196721312, + "grad_norm": 1.003970742225647, + "learning_rate": 4.879017832063941e-06, + "loss": 0.6929, + "step": 2128 + }, + { + "epoch": 0.6232435597189696, + "grad_norm": 1.0408508777618408, + "learning_rate": 4.878899463791734e-06, + "loss": 0.7076, + "step": 2129 + }, + { + "epoch": 0.6235362997658079, + "grad_norm": 0.9481172561645508, + "learning_rate": 4.8787810390796805e-06, + "loss": 0.687, + "step": 2130 + }, + { + "epoch": 0.6238290398126464, + "grad_norm": 1.0362364053726196, + "learning_rate": 4.87866255793059e-06, + "loss": 0.6802, + "step": 2131 + }, + { + "epoch": 0.6241217798594848, + "grad_norm": 0.9424051642417908, + "learning_rate": 4.8785440203472725e-06, + "loss": 0.6343, + "step": 2132 + }, + { + "epoch": 0.6244145199063232, + "grad_norm": 0.9239621758460999, + "learning_rate": 4.878425426332541e-06, + "loss": 0.6393, + "step": 2133 + }, + { + "epoch": 0.6247072599531616, + "grad_norm": 0.9799872636795044, + "learning_rate": 4.878306775889211e-06, + "loss": 0.6707, + "step": 2134 + }, + { + "epoch": 0.625, + "grad_norm": 1.0349359512329102, + "learning_rate": 4.878188069020094e-06, + "loss": 0.7204, + "step": 2135 + }, + { + "epoch": 0.6252927400468384, + "grad_norm": 0.9892054200172424, + "learning_rate": 4.8780693057280094e-06, + "loss": 0.6808, + "step": 2136 + }, + { + "epoch": 0.6255854800936768, + "grad_norm": 1.0138297080993652, + "learning_rate": 4.877950486015773e-06, + "loss": 0.7244, + "step": 2137 + }, + { + "epoch": 0.6258782201405152, + "grad_norm": 0.9441587924957275, + "learning_rate": 4.877831609886205e-06, + "loss": 0.6537, + "step": 2138 + }, + { + "epoch": 0.6261709601873536, + "grad_norm": 1.0227981805801392, + "learning_rate": 4.877712677342124e-06, + "loss": 0.7211, + "step": 2139 + }, + { + "epoch": 0.6264637002341921, + "grad_norm": 1.0049419403076172, + "learning_rate": 4.877593688386354e-06, + "loss": 0.6588, + "step": 2140 + }, + { + "epoch": 0.6267564402810304, + "grad_norm": 0.976902961730957, + "learning_rate": 4.877474643021717e-06, + "loss": 0.7019, + "step": 2141 + }, + { + "epoch": 0.6270491803278688, + "grad_norm": 0.9608343839645386, + "learning_rate": 4.8773555412510365e-06, + "loss": 0.6918, + "step": 2142 + }, + { + "epoch": 0.6273419203747073, + "grad_norm": 0.9718950390815735, + "learning_rate": 4.877236383077139e-06, + "loss": 0.653, + "step": 2143 + }, + { + "epoch": 0.6276346604215457, + "grad_norm": 0.9829786419868469, + "learning_rate": 4.877117168502853e-06, + "loss": 0.7059, + "step": 2144 + }, + { + "epoch": 0.627927400468384, + "grad_norm": 1.028332233428955, + "learning_rate": 4.8769978975310045e-06, + "loss": 0.6849, + "step": 2145 + }, + { + "epoch": 0.6282201405152225, + "grad_norm": 1.0205504894256592, + "learning_rate": 4.876878570164424e-06, + "loss": 0.6238, + "step": 2146 + }, + { + "epoch": 0.6285128805620609, + "grad_norm": 0.9781897068023682, + "learning_rate": 4.8767591864059435e-06, + "loss": 0.6657, + "step": 2147 + }, + { + "epoch": 0.6288056206088993, + "grad_norm": 0.9602506160736084, + "learning_rate": 4.876639746258394e-06, + "loss": 0.6875, + "step": 2148 + }, + { + "epoch": 0.6290983606557377, + "grad_norm": 0.9693266749382019, + "learning_rate": 4.87652024972461e-06, + "loss": 0.7217, + "step": 2149 + }, + { + "epoch": 0.6293911007025761, + "grad_norm": 0.9976664781570435, + "learning_rate": 4.876400696807427e-06, + "loss": 0.6854, + "step": 2150 + }, + { + "epoch": 0.6296838407494145, + "grad_norm": 1.0221617221832275, + "learning_rate": 4.876281087509681e-06, + "loss": 0.6608, + "step": 2151 + }, + { + "epoch": 0.629976580796253, + "grad_norm": 0.9949924945831299, + "learning_rate": 4.8761614218342094e-06, + "loss": 0.6813, + "step": 2152 + }, + { + "epoch": 0.6302693208430913, + "grad_norm": 0.9861562252044678, + "learning_rate": 4.876041699783851e-06, + "loss": 0.6194, + "step": 2153 + }, + { + "epoch": 0.6305620608899297, + "grad_norm": 1.060700535774231, + "learning_rate": 4.875921921361447e-06, + "loss": 0.6622, + "step": 2154 + }, + { + "epoch": 0.6308548009367682, + "grad_norm": 0.9431871175765991, + "learning_rate": 4.875802086569839e-06, + "loss": 0.6552, + "step": 2155 + }, + { + "epoch": 0.6311475409836066, + "grad_norm": 0.9646035432815552, + "learning_rate": 4.87568219541187e-06, + "loss": 0.6778, + "step": 2156 + }, + { + "epoch": 0.6314402810304449, + "grad_norm": 0.9613351225852966, + "learning_rate": 4.875562247890385e-06, + "loss": 0.6965, + "step": 2157 + }, + { + "epoch": 0.6317330210772834, + "grad_norm": 0.9897869825363159, + "learning_rate": 4.875442244008229e-06, + "loss": 0.6357, + "step": 2158 + }, + { + "epoch": 0.6320257611241218, + "grad_norm": 0.9672220349311829, + "learning_rate": 4.87532218376825e-06, + "loss": 0.707, + "step": 2159 + }, + { + "epoch": 0.6323185011709602, + "grad_norm": 0.9711172580718994, + "learning_rate": 4.875202067173295e-06, + "loss": 0.6668, + "step": 2160 + }, + { + "epoch": 0.6326112412177985, + "grad_norm": 0.9315459132194519, + "learning_rate": 4.8750818942262144e-06, + "loss": 0.6548, + "step": 2161 + }, + { + "epoch": 0.632903981264637, + "grad_norm": 0.9807552695274353, + "learning_rate": 4.874961664929861e-06, + "loss": 0.6624, + "step": 2162 + }, + { + "epoch": 0.6331967213114754, + "grad_norm": 0.9559620022773743, + "learning_rate": 4.874841379287084e-06, + "loss": 0.6804, + "step": 2163 + }, + { + "epoch": 0.6334894613583139, + "grad_norm": 0.9798761606216431, + "learning_rate": 4.874721037300739e-06, + "loss": 0.6988, + "step": 2164 + }, + { + "epoch": 0.6337822014051522, + "grad_norm": 0.959778368473053, + "learning_rate": 4.874600638973682e-06, + "loss": 0.6436, + "step": 2165 + }, + { + "epoch": 0.6340749414519906, + "grad_norm": 0.9838948845863342, + "learning_rate": 4.8744801843087695e-06, + "loss": 0.6722, + "step": 2166 + }, + { + "epoch": 0.634367681498829, + "grad_norm": 0.9522321224212646, + "learning_rate": 4.874359673308857e-06, + "loss": 0.6293, + "step": 2167 + }, + { + "epoch": 0.6346604215456675, + "grad_norm": 0.9615171551704407, + "learning_rate": 4.874239105976806e-06, + "loss": 0.684, + "step": 2168 + }, + { + "epoch": 0.6349531615925058, + "grad_norm": 0.9827127456665039, + "learning_rate": 4.874118482315475e-06, + "loss": 0.6255, + "step": 2169 + }, + { + "epoch": 0.6352459016393442, + "grad_norm": 1.018966794013977, + "learning_rate": 4.873997802327728e-06, + "loss": 0.6758, + "step": 2170 + }, + { + "epoch": 0.6355386416861827, + "grad_norm": 0.9487247467041016, + "learning_rate": 4.873877066016427e-06, + "loss": 0.6321, + "step": 2171 + }, + { + "epoch": 0.6358313817330211, + "grad_norm": 0.9861055016517639, + "learning_rate": 4.873756273384436e-06, + "loss": 0.6601, + "step": 2172 + }, + { + "epoch": 0.6361241217798594, + "grad_norm": 0.9591404795646667, + "learning_rate": 4.873635424434621e-06, + "loss": 0.667, + "step": 2173 + }, + { + "epoch": 0.6364168618266979, + "grad_norm": 0.9631296992301941, + "learning_rate": 4.87351451916985e-06, + "loss": 0.6701, + "step": 2174 + }, + { + "epoch": 0.6367096018735363, + "grad_norm": 0.9893993139266968, + "learning_rate": 4.873393557592992e-06, + "loss": 0.6794, + "step": 2175 + }, + { + "epoch": 0.6370023419203747, + "grad_norm": 0.9654977917671204, + "learning_rate": 4.873272539706915e-06, + "loss": 0.6097, + "step": 2176 + }, + { + "epoch": 0.6372950819672131, + "grad_norm": 1.0840871334075928, + "learning_rate": 4.873151465514491e-06, + "loss": 0.6815, + "step": 2177 + }, + { + "epoch": 0.6375878220140515, + "grad_norm": 1.0037420988082886, + "learning_rate": 4.873030335018592e-06, + "loss": 0.6884, + "step": 2178 + }, + { + "epoch": 0.6378805620608899, + "grad_norm": 1.021381139755249, + "learning_rate": 4.872909148222092e-06, + "loss": 0.7051, + "step": 2179 + }, + { + "epoch": 0.6381733021077284, + "grad_norm": 0.9798935651779175, + "learning_rate": 4.872787905127868e-06, + "loss": 0.6704, + "step": 2180 + }, + { + "epoch": 0.6384660421545667, + "grad_norm": 0.9250304698944092, + "learning_rate": 4.872666605738795e-06, + "loss": 0.6707, + "step": 2181 + }, + { + "epoch": 0.6387587822014051, + "grad_norm": 0.9538632035255432, + "learning_rate": 4.872545250057751e-06, + "loss": 0.6845, + "step": 2182 + }, + { + "epoch": 0.6390515222482436, + "grad_norm": 1.008870005607605, + "learning_rate": 4.872423838087614e-06, + "loss": 0.713, + "step": 2183 + }, + { + "epoch": 0.639344262295082, + "grad_norm": 1.0287574529647827, + "learning_rate": 4.872302369831267e-06, + "loss": 0.7279, + "step": 2184 + }, + { + "epoch": 0.6396370023419203, + "grad_norm": 1.0046038627624512, + "learning_rate": 4.872180845291589e-06, + "loss": 0.6563, + "step": 2185 + }, + { + "epoch": 0.6399297423887588, + "grad_norm": 1.0080809593200684, + "learning_rate": 4.872059264471465e-06, + "loss": 0.6992, + "step": 2186 + }, + { + "epoch": 0.6402224824355972, + "grad_norm": 0.9572263360023499, + "learning_rate": 4.87193762737378e-06, + "loss": 0.6771, + "step": 2187 + }, + { + "epoch": 0.6405152224824356, + "grad_norm": 0.9719826579093933, + "learning_rate": 4.871815934001419e-06, + "loss": 0.6836, + "step": 2188 + }, + { + "epoch": 0.640807962529274, + "grad_norm": 0.9442581534385681, + "learning_rate": 4.871694184357269e-06, + "loss": 0.6568, + "step": 2189 + }, + { + "epoch": 0.6411007025761124, + "grad_norm": 0.9904913902282715, + "learning_rate": 4.871572378444219e-06, + "loss": 0.6497, + "step": 2190 + }, + { + "epoch": 0.6413934426229508, + "grad_norm": 0.9930439591407776, + "learning_rate": 4.871450516265158e-06, + "loss": 0.713, + "step": 2191 + }, + { + "epoch": 0.6416861826697893, + "grad_norm": 1.054931879043579, + "learning_rate": 4.8713285978229795e-06, + "loss": 0.6727, + "step": 2192 + }, + { + "epoch": 0.6419789227166276, + "grad_norm": 1.5138787031173706, + "learning_rate": 4.871206623120573e-06, + "loss": 0.6885, + "step": 2193 + }, + { + "epoch": 0.642271662763466, + "grad_norm": 1.0081034898757935, + "learning_rate": 4.8710845921608345e-06, + "loss": 0.6954, + "step": 2194 + }, + { + "epoch": 0.6425644028103045, + "grad_norm": 0.9758413434028625, + "learning_rate": 4.870962504946658e-06, + "loss": 0.6651, + "step": 2195 + }, + { + "epoch": 0.6428571428571429, + "grad_norm": 0.9284654855728149, + "learning_rate": 4.8708403614809404e-06, + "loss": 0.656, + "step": 2196 + }, + { + "epoch": 0.6431498829039812, + "grad_norm": 0.984208345413208, + "learning_rate": 4.870718161766581e-06, + "loss": 0.7108, + "step": 2197 + }, + { + "epoch": 0.6434426229508197, + "grad_norm": 1.0133427381515503, + "learning_rate": 4.8705959058064765e-06, + "loss": 0.7036, + "step": 2198 + }, + { + "epoch": 0.6437353629976581, + "grad_norm": 1.0210460424423218, + "learning_rate": 4.870473593603529e-06, + "loss": 0.6851, + "step": 2199 + }, + { + "epoch": 0.6440281030444965, + "grad_norm": 0.9794594049453735, + "learning_rate": 4.87035122516064e-06, + "loss": 0.6831, + "step": 2200 + }, + { + "epoch": 0.6443208430913349, + "grad_norm": 1.0074502229690552, + "learning_rate": 4.870228800480713e-06, + "loss": 0.705, + "step": 2201 + }, + { + "epoch": 0.6446135831381733, + "grad_norm": 1.0488473176956177, + "learning_rate": 4.870106319566652e-06, + "loss": 0.672, + "step": 2202 + }, + { + "epoch": 0.6449063231850117, + "grad_norm": 1.1514956951141357, + "learning_rate": 4.869983782421364e-06, + "loss": 0.6948, + "step": 2203 + }, + { + "epoch": 0.6451990632318502, + "grad_norm": 0.980114221572876, + "learning_rate": 4.869861189047755e-06, + "loss": 0.6462, + "step": 2204 + }, + { + "epoch": 0.6454918032786885, + "grad_norm": 0.9970903396606445, + "learning_rate": 4.869738539448734e-06, + "loss": 0.7167, + "step": 2205 + }, + { + "epoch": 0.6457845433255269, + "grad_norm": 0.9744213223457336, + "learning_rate": 4.86961583362721e-06, + "loss": 0.656, + "step": 2206 + }, + { + "epoch": 0.6460772833723654, + "grad_norm": 0.9443262815475464, + "learning_rate": 4.8694930715860965e-06, + "loss": 0.6475, + "step": 2207 + }, + { + "epoch": 0.6463700234192038, + "grad_norm": 0.9450652599334717, + "learning_rate": 4.869370253328304e-06, + "loss": 0.6521, + "step": 2208 + }, + { + "epoch": 0.6466627634660421, + "grad_norm": 0.9884177446365356, + "learning_rate": 4.869247378856747e-06, + "loss": 0.6815, + "step": 2209 + }, + { + "epoch": 0.6469555035128806, + "grad_norm": 0.9839397072792053, + "learning_rate": 4.869124448174341e-06, + "loss": 0.6493, + "step": 2210 + }, + { + "epoch": 0.647248243559719, + "grad_norm": 0.9930598139762878, + "learning_rate": 4.869001461284002e-06, + "loss": 0.6565, + "step": 2211 + }, + { + "epoch": 0.6475409836065574, + "grad_norm": 0.9557778835296631, + "learning_rate": 4.868878418188648e-06, + "loss": 0.6963, + "step": 2212 + }, + { + "epoch": 0.6478337236533958, + "grad_norm": 0.9431025981903076, + "learning_rate": 4.868755318891199e-06, + "loss": 0.7084, + "step": 2213 + }, + { + "epoch": 0.6481264637002342, + "grad_norm": 0.9298965334892273, + "learning_rate": 4.868632163394575e-06, + "loss": 0.6394, + "step": 2214 + }, + { + "epoch": 0.6484192037470726, + "grad_norm": 1.0517911911010742, + "learning_rate": 4.868508951701697e-06, + "loss": 0.6471, + "step": 2215 + }, + { + "epoch": 0.6487119437939111, + "grad_norm": 1.0303895473480225, + "learning_rate": 4.86838568381549e-06, + "loss": 0.6463, + "step": 2216 + }, + { + "epoch": 0.6490046838407494, + "grad_norm": 0.9581923484802246, + "learning_rate": 4.868262359738877e-06, + "loss": 0.7036, + "step": 2217 + }, + { + "epoch": 0.6492974238875878, + "grad_norm": 0.9840261936187744, + "learning_rate": 4.868138979474785e-06, + "loss": 0.6631, + "step": 2218 + }, + { + "epoch": 0.6495901639344263, + "grad_norm": 0.9934372305870056, + "learning_rate": 4.86801554302614e-06, + "loss": 0.6234, + "step": 2219 + }, + { + "epoch": 0.6498829039812647, + "grad_norm": 1.014316439628601, + "learning_rate": 4.867892050395873e-06, + "loss": 0.6621, + "step": 2220 + }, + { + "epoch": 0.650175644028103, + "grad_norm": 1.006337285041809, + "learning_rate": 4.86776850158691e-06, + "loss": 0.6698, + "step": 2221 + }, + { + "epoch": 0.6504683840749415, + "grad_norm": 1.0906541347503662, + "learning_rate": 4.867644896602186e-06, + "loss": 0.6495, + "step": 2222 + }, + { + "epoch": 0.6507611241217799, + "grad_norm": 1.0189708471298218, + "learning_rate": 4.867521235444631e-06, + "loss": 0.6853, + "step": 2223 + }, + { + "epoch": 0.6510538641686182, + "grad_norm": 0.974472165107727, + "learning_rate": 4.867397518117181e-06, + "loss": 0.6911, + "step": 2224 + }, + { + "epoch": 0.6513466042154566, + "grad_norm": 0.970893919467926, + "learning_rate": 4.8672737446227696e-06, + "loss": 0.627, + "step": 2225 + }, + { + "epoch": 0.6516393442622951, + "grad_norm": 1.0178016424179077, + "learning_rate": 4.8671499149643326e-06, + "loss": 0.7087, + "step": 2226 + }, + { + "epoch": 0.6519320843091335, + "grad_norm": 0.9988121390342712, + "learning_rate": 4.86702602914481e-06, + "loss": 0.7071, + "step": 2227 + }, + { + "epoch": 0.6522248243559718, + "grad_norm": 1.032410740852356, + "learning_rate": 4.86690208716714e-06, + "loss": 0.6893, + "step": 2228 + }, + { + "epoch": 0.6525175644028103, + "grad_norm": 0.9565680623054504, + "learning_rate": 4.866778089034263e-06, + "loss": 0.6349, + "step": 2229 + }, + { + "epoch": 0.6528103044496487, + "grad_norm": 1.0000181198120117, + "learning_rate": 4.866654034749122e-06, + "loss": 0.6814, + "step": 2230 + }, + { + "epoch": 0.6531030444964872, + "grad_norm": 0.9869099855422974, + "learning_rate": 4.866529924314659e-06, + "loss": 0.6862, + "step": 2231 + }, + { + "epoch": 0.6533957845433255, + "grad_norm": 0.9464926719665527, + "learning_rate": 4.866405757733818e-06, + "loss": 0.6767, + "step": 2232 + }, + { + "epoch": 0.6536885245901639, + "grad_norm": 1.03895103931427, + "learning_rate": 4.866281535009547e-06, + "loss": 0.6504, + "step": 2233 + }, + { + "epoch": 0.6539812646370023, + "grad_norm": 1.0008567571640015, + "learning_rate": 4.866157256144791e-06, + "loss": 0.6642, + "step": 2234 + }, + { + "epoch": 0.6542740046838408, + "grad_norm": 0.9529739022254944, + "learning_rate": 4.8660329211425e-06, + "loss": 0.6861, + "step": 2235 + }, + { + "epoch": 0.6545667447306791, + "grad_norm": 1.0182169675827026, + "learning_rate": 4.865908530005623e-06, + "loss": 0.7249, + "step": 2236 + }, + { + "epoch": 0.6548594847775175, + "grad_norm": 0.962832510471344, + "learning_rate": 4.8657840827371115e-06, + "loss": 0.6529, + "step": 2237 + }, + { + "epoch": 0.655152224824356, + "grad_norm": 0.9577229022979736, + "learning_rate": 4.865659579339918e-06, + "loss": 0.6552, + "step": 2238 + }, + { + "epoch": 0.6554449648711944, + "grad_norm": 0.9374836683273315, + "learning_rate": 4.865535019816998e-06, + "loss": 0.6506, + "step": 2239 + }, + { + "epoch": 0.6557377049180327, + "grad_norm": 0.9382537603378296, + "learning_rate": 4.865410404171304e-06, + "loss": 0.6769, + "step": 2240 + }, + { + "epoch": 0.6560304449648712, + "grad_norm": 0.9262009859085083, + "learning_rate": 4.865285732405794e-06, + "loss": 0.6853, + "step": 2241 + }, + { + "epoch": 0.6563231850117096, + "grad_norm": 0.9381985664367676, + "learning_rate": 4.865161004523426e-06, + "loss": 0.6851, + "step": 2242 + }, + { + "epoch": 0.656615925058548, + "grad_norm": 0.9729037284851074, + "learning_rate": 4.865036220527157e-06, + "loss": 0.6699, + "step": 2243 + }, + { + "epoch": 0.6569086651053864, + "grad_norm": 0.9193257093429565, + "learning_rate": 4.864911380419951e-06, + "loss": 0.6119, + "step": 2244 + }, + { + "epoch": 0.6572014051522248, + "grad_norm": 0.9635428786277771, + "learning_rate": 4.864786484204768e-06, + "loss": 0.6319, + "step": 2245 + }, + { + "epoch": 0.6574941451990632, + "grad_norm": 0.9806466698646545, + "learning_rate": 4.8646615318845706e-06, + "loss": 0.6721, + "step": 2246 + }, + { + "epoch": 0.6577868852459017, + "grad_norm": 0.9827562570571899, + "learning_rate": 4.864536523462324e-06, + "loss": 0.676, + "step": 2247 + }, + { + "epoch": 0.65807962529274, + "grad_norm": 0.9984734654426575, + "learning_rate": 4.864411458940995e-06, + "loss": 0.7028, + "step": 2248 + }, + { + "epoch": 0.6583723653395784, + "grad_norm": 1.0233004093170166, + "learning_rate": 4.864286338323549e-06, + "loss": 0.689, + "step": 2249 + }, + { + "epoch": 0.6586651053864169, + "grad_norm": 0.9839504361152649, + "learning_rate": 4.864161161612956e-06, + "loss": 0.6533, + "step": 2250 + }, + { + "epoch": 0.6589578454332553, + "grad_norm": 0.961230456829071, + "learning_rate": 4.864035928812185e-06, + "loss": 0.662, + "step": 2251 + }, + { + "epoch": 0.6592505854800936, + "grad_norm": 1.0296125411987305, + "learning_rate": 4.863910639924207e-06, + "loss": 0.6766, + "step": 2252 + }, + { + "epoch": 0.6595433255269321, + "grad_norm": 1.0380611419677734, + "learning_rate": 4.863785294951996e-06, + "loss": 0.6932, + "step": 2253 + }, + { + "epoch": 0.6598360655737705, + "grad_norm": 0.9439920783042908, + "learning_rate": 4.863659893898525e-06, + "loss": 0.6366, + "step": 2254 + }, + { + "epoch": 0.6601288056206089, + "grad_norm": 0.971835732460022, + "learning_rate": 4.863534436766767e-06, + "loss": 0.7047, + "step": 2255 + }, + { + "epoch": 0.6604215456674473, + "grad_norm": 0.9497107863426208, + "learning_rate": 4.863408923559702e-06, + "loss": 0.6557, + "step": 2256 + }, + { + "epoch": 0.6607142857142857, + "grad_norm": 0.9785372614860535, + "learning_rate": 4.863283354280305e-06, + "loss": 0.7141, + "step": 2257 + }, + { + "epoch": 0.6610070257611241, + "grad_norm": 0.9827923774719238, + "learning_rate": 4.863157728931557e-06, + "loss": 0.6384, + "step": 2258 + }, + { + "epoch": 0.6612997658079626, + "grad_norm": 0.9709166884422302, + "learning_rate": 4.863032047516438e-06, + "loss": 0.6662, + "step": 2259 + }, + { + "epoch": 0.6615925058548009, + "grad_norm": 0.9870762228965759, + "learning_rate": 4.86290631003793e-06, + "loss": 0.6825, + "step": 2260 + }, + { + "epoch": 0.6618852459016393, + "grad_norm": 1.0452382564544678, + "learning_rate": 4.8627805164990145e-06, + "loss": 0.7097, + "step": 2261 + }, + { + "epoch": 0.6621779859484778, + "grad_norm": 1.0510674715042114, + "learning_rate": 4.8626546669026785e-06, + "loss": 0.6624, + "step": 2262 + }, + { + "epoch": 0.6624707259953162, + "grad_norm": 0.9352555871009827, + "learning_rate": 4.862528761251907e-06, + "loss": 0.6604, + "step": 2263 + }, + { + "epoch": 0.6627634660421545, + "grad_norm": 1.0250780582427979, + "learning_rate": 4.862402799549686e-06, + "loss": 0.6761, + "step": 2264 + }, + { + "epoch": 0.663056206088993, + "grad_norm": 1.0143426656723022, + "learning_rate": 4.862276781799005e-06, + "loss": 0.6994, + "step": 2265 + }, + { + "epoch": 0.6633489461358314, + "grad_norm": 1.0225505828857422, + "learning_rate": 4.862150708002853e-06, + "loss": 0.6447, + "step": 2266 + }, + { + "epoch": 0.6636416861826698, + "grad_norm": 1.011782169342041, + "learning_rate": 4.862024578164221e-06, + "loss": 0.6857, + "step": 2267 + }, + { + "epoch": 0.6639344262295082, + "grad_norm": 0.9960460066795349, + "learning_rate": 4.861898392286104e-06, + "loss": 0.6729, + "step": 2268 + }, + { + "epoch": 0.6642271662763466, + "grad_norm": 1.0483076572418213, + "learning_rate": 4.861772150371492e-06, + "loss": 0.6317, + "step": 2269 + }, + { + "epoch": 0.664519906323185, + "grad_norm": 0.9818496108055115, + "learning_rate": 4.8616458524233836e-06, + "loss": 0.6858, + "step": 2270 + }, + { + "epoch": 0.6648126463700235, + "grad_norm": 1.0765678882598877, + "learning_rate": 4.861519498444773e-06, + "loss": 0.6648, + "step": 2271 + }, + { + "epoch": 0.6651053864168618, + "grad_norm": 0.9760961532592773, + "learning_rate": 4.861393088438658e-06, + "loss": 0.6634, + "step": 2272 + }, + { + "epoch": 0.6653981264637002, + "grad_norm": 0.9972482323646545, + "learning_rate": 4.861266622408039e-06, + "loss": 0.639, + "step": 2273 + }, + { + "epoch": 0.6656908665105387, + "grad_norm": 0.9686852097511292, + "learning_rate": 4.861140100355916e-06, + "loss": 0.7176, + "step": 2274 + }, + { + "epoch": 0.6659836065573771, + "grad_norm": 0.9180808067321777, + "learning_rate": 4.8610135222852894e-06, + "loss": 0.6136, + "step": 2275 + }, + { + "epoch": 0.6662763466042154, + "grad_norm": 0.9921360015869141, + "learning_rate": 4.860886888199165e-06, + "loss": 0.7051, + "step": 2276 + }, + { + "epoch": 0.6665690866510539, + "grad_norm": 0.9707745909690857, + "learning_rate": 4.860760198100544e-06, + "loss": 0.6887, + "step": 2277 + }, + { + "epoch": 0.6668618266978923, + "grad_norm": 0.9729372262954712, + "learning_rate": 4.860633451992434e-06, + "loss": 0.6889, + "step": 2278 + }, + { + "epoch": 0.6671545667447307, + "grad_norm": 0.9879921674728394, + "learning_rate": 4.860506649877842e-06, + "loss": 0.6461, + "step": 2279 + }, + { + "epoch": 0.667447306791569, + "grad_norm": 0.9935986995697021, + "learning_rate": 4.860379791759777e-06, + "loss": 0.6863, + "step": 2280 + }, + { + "epoch": 0.6677400468384075, + "grad_norm": 1.0422405004501343, + "learning_rate": 4.860252877641247e-06, + "loss": 0.6915, + "step": 2281 + }, + { + "epoch": 0.6680327868852459, + "grad_norm": 0.9735476970672607, + "learning_rate": 4.860125907525264e-06, + "loss": 0.7, + "step": 2282 + }, + { + "epoch": 0.6683255269320844, + "grad_norm": 0.9881382584571838, + "learning_rate": 4.859998881414842e-06, + "loss": 0.7098, + "step": 2283 + }, + { + "epoch": 0.6686182669789227, + "grad_norm": 0.9659385085105896, + "learning_rate": 4.859871799312991e-06, + "loss": 0.662, + "step": 2284 + }, + { + "epoch": 0.6689110070257611, + "grad_norm": 1.0426748991012573, + "learning_rate": 4.859744661222729e-06, + "loss": 0.6762, + "step": 2285 + }, + { + "epoch": 0.6692037470725996, + "grad_norm": 0.9830307960510254, + "learning_rate": 4.859617467147072e-06, + "loss": 0.6762, + "step": 2286 + }, + { + "epoch": 0.669496487119438, + "grad_norm": 1.0117669105529785, + "learning_rate": 4.859490217089037e-06, + "loss": 0.7002, + "step": 2287 + }, + { + "epoch": 0.6697892271662763, + "grad_norm": 0.9837619066238403, + "learning_rate": 4.859362911051643e-06, + "loss": 0.7054, + "step": 2288 + }, + { + "epoch": 0.6700819672131147, + "grad_norm": 0.8958171606063843, + "learning_rate": 4.859235549037911e-06, + "loss": 0.6476, + "step": 2289 + }, + { + "epoch": 0.6703747072599532, + "grad_norm": 0.9326554536819458, + "learning_rate": 4.859108131050862e-06, + "loss": 0.6553, + "step": 2290 + }, + { + "epoch": 0.6706674473067916, + "grad_norm": 1.0134243965148926, + "learning_rate": 4.858980657093519e-06, + "loss": 0.7022, + "step": 2291 + }, + { + "epoch": 0.6709601873536299, + "grad_norm": 0.9534375071525574, + "learning_rate": 4.858853127168908e-06, + "loss": 0.6223, + "step": 2292 + }, + { + "epoch": 0.6712529274004684, + "grad_norm": 0.9608964323997498, + "learning_rate": 4.858725541280053e-06, + "loss": 0.6761, + "step": 2293 + }, + { + "epoch": 0.6715456674473068, + "grad_norm": 0.9657753705978394, + "learning_rate": 4.858597899429981e-06, + "loss": 0.6358, + "step": 2294 + }, + { + "epoch": 0.6718384074941453, + "grad_norm": 1.022303819656372, + "learning_rate": 4.85847020162172e-06, + "loss": 0.6778, + "step": 2295 + }, + { + "epoch": 0.6721311475409836, + "grad_norm": 1.0351331233978271, + "learning_rate": 4.858342447858301e-06, + "loss": 0.6659, + "step": 2296 + }, + { + "epoch": 0.672423887587822, + "grad_norm": 1.0471330881118774, + "learning_rate": 4.858214638142755e-06, + "loss": 0.701, + "step": 2297 + }, + { + "epoch": 0.6727166276346604, + "grad_norm": 0.9618033766746521, + "learning_rate": 4.858086772478113e-06, + "loss": 0.6722, + "step": 2298 + }, + { + "epoch": 0.6730093676814989, + "grad_norm": 1.0328878164291382, + "learning_rate": 4.8579588508674095e-06, + "loss": 0.7286, + "step": 2299 + }, + { + "epoch": 0.6733021077283372, + "grad_norm": 1.0499165058135986, + "learning_rate": 4.857830873313678e-06, + "loss": 0.6629, + "step": 2300 + }, + { + "epoch": 0.6735948477751756, + "grad_norm": 0.9725488424301147, + "learning_rate": 4.8577028398199574e-06, + "loss": 0.6777, + "step": 2301 + }, + { + "epoch": 0.6738875878220141, + "grad_norm": 0.9825259447097778, + "learning_rate": 4.857574750389284e-06, + "loss": 0.7017, + "step": 2302 + }, + { + "epoch": 0.6741803278688525, + "grad_norm": 0.9700861573219299, + "learning_rate": 4.857446605024696e-06, + "loss": 0.655, + "step": 2303 + }, + { + "epoch": 0.6744730679156908, + "grad_norm": 0.9580789804458618, + "learning_rate": 4.857318403729235e-06, + "loss": 0.6718, + "step": 2304 + }, + { + "epoch": 0.6747658079625293, + "grad_norm": 0.9606214761734009, + "learning_rate": 4.8571901465059416e-06, + "loss": 0.6842, + "step": 2305 + }, + { + "epoch": 0.6750585480093677, + "grad_norm": 1.0215611457824707, + "learning_rate": 4.857061833357859e-06, + "loss": 0.7097, + "step": 2306 + }, + { + "epoch": 0.675351288056206, + "grad_norm": 0.9423355460166931, + "learning_rate": 4.856933464288032e-06, + "loss": 0.6818, + "step": 2307 + }, + { + "epoch": 0.6756440281030445, + "grad_norm": 1.0826376676559448, + "learning_rate": 4.856805039299506e-06, + "loss": 0.6631, + "step": 2308 + }, + { + "epoch": 0.6759367681498829, + "grad_norm": 0.9997490048408508, + "learning_rate": 4.856676558395327e-06, + "loss": 0.6787, + "step": 2309 + }, + { + "epoch": 0.6762295081967213, + "grad_norm": 0.9853737354278564, + "learning_rate": 4.856548021578544e-06, + "loss": 0.6742, + "step": 2310 + }, + { + "epoch": 0.6765222482435597, + "grad_norm": 0.948182225227356, + "learning_rate": 4.856419428852206e-06, + "loss": 0.7083, + "step": 2311 + }, + { + "epoch": 0.6768149882903981, + "grad_norm": 0.972477376461029, + "learning_rate": 4.856290780219365e-06, + "loss": 0.664, + "step": 2312 + }, + { + "epoch": 0.6771077283372365, + "grad_norm": 0.9875838756561279, + "learning_rate": 4.856162075683073e-06, + "loss": 0.6776, + "step": 2313 + }, + { + "epoch": 0.677400468384075, + "grad_norm": 1.0504279136657715, + "learning_rate": 4.856033315246382e-06, + "loss": 0.6288, + "step": 2314 + }, + { + "epoch": 0.6776932084309133, + "grad_norm": 0.9698414206504822, + "learning_rate": 4.855904498912348e-06, + "loss": 0.7324, + "step": 2315 + }, + { + "epoch": 0.6779859484777517, + "grad_norm": 0.9557287096977234, + "learning_rate": 4.855775626684029e-06, + "loss": 0.6946, + "step": 2316 + }, + { + "epoch": 0.6782786885245902, + "grad_norm": 0.9538636207580566, + "learning_rate": 4.855646698564478e-06, + "loss": 0.6627, + "step": 2317 + }, + { + "epoch": 0.6785714285714286, + "grad_norm": 0.9657625555992126, + "learning_rate": 4.855517714556758e-06, + "loss": 0.5852, + "step": 2318 + }, + { + "epoch": 0.6788641686182669, + "grad_norm": 1.0196975469589233, + "learning_rate": 4.855388674663928e-06, + "loss": 0.6708, + "step": 2319 + }, + { + "epoch": 0.6791569086651054, + "grad_norm": 1.041404128074646, + "learning_rate": 4.855259578889048e-06, + "loss": 0.6794, + "step": 2320 + }, + { + "epoch": 0.6794496487119438, + "grad_norm": 1.0312410593032837, + "learning_rate": 4.855130427235183e-06, + "loss": 0.6769, + "step": 2321 + }, + { + "epoch": 0.6797423887587822, + "grad_norm": 0.9925411343574524, + "learning_rate": 4.855001219705395e-06, + "loss": 0.6788, + "step": 2322 + }, + { + "epoch": 0.6800351288056206, + "grad_norm": 0.9812830686569214, + "learning_rate": 4.854871956302752e-06, + "loss": 0.7003, + "step": 2323 + }, + { + "epoch": 0.680327868852459, + "grad_norm": 0.9777898192405701, + "learning_rate": 4.8547426370303185e-06, + "loss": 0.6601, + "step": 2324 + }, + { + "epoch": 0.6806206088992974, + "grad_norm": 0.930770218372345, + "learning_rate": 4.854613261891163e-06, + "loss": 0.6566, + "step": 2325 + }, + { + "epoch": 0.6809133489461359, + "grad_norm": 0.9695926308631897, + "learning_rate": 4.854483830888357e-06, + "loss": 0.6891, + "step": 2326 + }, + { + "epoch": 0.6812060889929742, + "grad_norm": 1.073023796081543, + "learning_rate": 4.854354344024969e-06, + "loss": 0.6299, + "step": 2327 + }, + { + "epoch": 0.6814988290398126, + "grad_norm": 0.9883136749267578, + "learning_rate": 4.854224801304072e-06, + "loss": 0.6352, + "step": 2328 + }, + { + "epoch": 0.6817915690866511, + "grad_norm": 0.9968993067741394, + "learning_rate": 4.854095202728739e-06, + "loss": 0.6289, + "step": 2329 + }, + { + "epoch": 0.6820843091334895, + "grad_norm": 1.0005717277526855, + "learning_rate": 4.853965548302046e-06, + "loss": 0.657, + "step": 2330 + }, + { + "epoch": 0.6823770491803278, + "grad_norm": 1.0581246614456177, + "learning_rate": 4.853835838027067e-06, + "loss": 0.6718, + "step": 2331 + }, + { + "epoch": 0.6826697892271663, + "grad_norm": 1.0190953016281128, + "learning_rate": 4.85370607190688e-06, + "loss": 0.6791, + "step": 2332 + }, + { + "epoch": 0.6829625292740047, + "grad_norm": 1.011945128440857, + "learning_rate": 4.853576249944566e-06, + "loss": 0.6549, + "step": 2333 + }, + { + "epoch": 0.6832552693208431, + "grad_norm": 0.9880363941192627, + "learning_rate": 4.853446372143202e-06, + "loss": 0.683, + "step": 2334 + }, + { + "epoch": 0.6835480093676815, + "grad_norm": 1.0014530420303345, + "learning_rate": 4.853316438505872e-06, + "loss": 0.6683, + "step": 2335 + }, + { + "epoch": 0.6838407494145199, + "grad_norm": 0.9648080468177795, + "learning_rate": 4.8531864490356565e-06, + "loss": 0.6217, + "step": 2336 + }, + { + "epoch": 0.6841334894613583, + "grad_norm": 0.9740931391716003, + "learning_rate": 4.85305640373564e-06, + "loss": 0.6922, + "step": 2337 + }, + { + "epoch": 0.6844262295081968, + "grad_norm": 0.9722051024436951, + "learning_rate": 4.852926302608909e-06, + "loss": 0.6721, + "step": 2338 + }, + { + "epoch": 0.6847189695550351, + "grad_norm": 0.9793356657028198, + "learning_rate": 4.852796145658548e-06, + "loss": 0.6554, + "step": 2339 + }, + { + "epoch": 0.6850117096018735, + "grad_norm": 0.9567468166351318, + "learning_rate": 4.852665932887648e-06, + "loss": 0.6337, + "step": 2340 + }, + { + "epoch": 0.685304449648712, + "grad_norm": 0.9817701578140259, + "learning_rate": 4.852535664299296e-06, + "loss": 0.638, + "step": 2341 + }, + { + "epoch": 0.6855971896955504, + "grad_norm": 0.9484159350395203, + "learning_rate": 4.852405339896584e-06, + "loss": 0.691, + "step": 2342 + }, + { + "epoch": 0.6858899297423887, + "grad_norm": 0.9498723149299622, + "learning_rate": 4.852274959682602e-06, + "loss": 0.6437, + "step": 2343 + }, + { + "epoch": 0.6861826697892272, + "grad_norm": 1.158899188041687, + "learning_rate": 4.852144523660446e-06, + "loss": 0.635, + "step": 2344 + }, + { + "epoch": 0.6864754098360656, + "grad_norm": 0.9741037487983704, + "learning_rate": 4.8520140318332094e-06, + "loss": 0.6844, + "step": 2345 + }, + { + "epoch": 0.686768149882904, + "grad_norm": 1.0533289909362793, + "learning_rate": 4.8518834842039874e-06, + "loss": 0.6874, + "step": 2346 + }, + { + "epoch": 0.6870608899297423, + "grad_norm": 1.0226118564605713, + "learning_rate": 4.851752880775878e-06, + "loss": 0.6726, + "step": 2347 + }, + { + "epoch": 0.6873536299765808, + "grad_norm": 0.9971701502799988, + "learning_rate": 4.8516222215519795e-06, + "loss": 0.6622, + "step": 2348 + }, + { + "epoch": 0.6876463700234192, + "grad_norm": 0.9916727542877197, + "learning_rate": 4.851491506535392e-06, + "loss": 0.7023, + "step": 2349 + }, + { + "epoch": 0.6879391100702577, + "grad_norm": 0.9929489493370056, + "learning_rate": 4.851360735729217e-06, + "loss": 0.7073, + "step": 2350 + }, + { + "epoch": 0.688231850117096, + "grad_norm": 0.9462578296661377, + "learning_rate": 4.851229909136556e-06, + "loss": 0.7024, + "step": 2351 + }, + { + "epoch": 0.6885245901639344, + "grad_norm": 0.9755676984786987, + "learning_rate": 4.851099026760515e-06, + "loss": 0.6435, + "step": 2352 + }, + { + "epoch": 0.6888173302107728, + "grad_norm": 0.937428891658783, + "learning_rate": 4.850968088604198e-06, + "loss": 0.632, + "step": 2353 + }, + { + "epoch": 0.6891100702576113, + "grad_norm": 1.0315933227539062, + "learning_rate": 4.850837094670711e-06, + "loss": 0.6411, + "step": 2354 + }, + { + "epoch": 0.6894028103044496, + "grad_norm": 0.9859442710876465, + "learning_rate": 4.8507060449631615e-06, + "loss": 0.6679, + "step": 2355 + }, + { + "epoch": 0.689695550351288, + "grad_norm": 0.9646093845367432, + "learning_rate": 4.85057493948466e-06, + "loss": 0.6339, + "step": 2356 + }, + { + "epoch": 0.6899882903981265, + "grad_norm": 1.0038787126541138, + "learning_rate": 4.8504437782383165e-06, + "loss": 0.6506, + "step": 2357 + }, + { + "epoch": 0.6902810304449649, + "grad_norm": 1.0366098880767822, + "learning_rate": 4.850312561227243e-06, + "loss": 0.6734, + "step": 2358 + }, + { + "epoch": 0.6905737704918032, + "grad_norm": 1.016201138496399, + "learning_rate": 4.850181288454552e-06, + "loss": 0.6731, + "step": 2359 + }, + { + "epoch": 0.6908665105386417, + "grad_norm": 1.038008451461792, + "learning_rate": 4.850049959923359e-06, + "loss": 0.6861, + "step": 2360 + }, + { + "epoch": 0.6911592505854801, + "grad_norm": 0.9589744806289673, + "learning_rate": 4.849918575636778e-06, + "loss": 0.6484, + "step": 2361 + }, + { + "epoch": 0.6914519906323185, + "grad_norm": 1.0763590335845947, + "learning_rate": 4.849787135597928e-06, + "loss": 0.6905, + "step": 2362 + }, + { + "epoch": 0.6917447306791569, + "grad_norm": 0.944715142250061, + "learning_rate": 4.849655639809927e-06, + "loss": 0.684, + "step": 2363 + }, + { + "epoch": 0.6920374707259953, + "grad_norm": 1.0398012399673462, + "learning_rate": 4.849524088275894e-06, + "loss": 0.7139, + "step": 2364 + }, + { + "epoch": 0.6923302107728337, + "grad_norm": 1.0029194355010986, + "learning_rate": 4.849392480998951e-06, + "loss": 0.6784, + "step": 2365 + }, + { + "epoch": 0.6926229508196722, + "grad_norm": 0.9972625970840454, + "learning_rate": 4.84926081798222e-06, + "loss": 0.7112, + "step": 2366 + }, + { + "epoch": 0.6929156908665105, + "grad_norm": 0.9656035900115967, + "learning_rate": 4.849129099228826e-06, + "loss": 0.7192, + "step": 2367 + }, + { + "epoch": 0.6932084309133489, + "grad_norm": 1.023010492324829, + "learning_rate": 4.848997324741891e-06, + "loss": 0.6727, + "step": 2368 + }, + { + "epoch": 0.6935011709601874, + "grad_norm": 1.0618606805801392, + "learning_rate": 4.848865494524544e-06, + "loss": 0.6719, + "step": 2369 + }, + { + "epoch": 0.6937939110070258, + "grad_norm": 0.9596459865570068, + "learning_rate": 4.848733608579911e-06, + "loss": 0.6424, + "step": 2370 + }, + { + "epoch": 0.6940866510538641, + "grad_norm": 0.9909616708755493, + "learning_rate": 4.848601666911122e-06, + "loss": 0.6832, + "step": 2371 + }, + { + "epoch": 0.6943793911007026, + "grad_norm": 0.9851824641227722, + "learning_rate": 4.848469669521308e-06, + "loss": 0.6648, + "step": 2372 + }, + { + "epoch": 0.694672131147541, + "grad_norm": 1.0121028423309326, + "learning_rate": 4.8483376164136e-06, + "loss": 0.7109, + "step": 2373 + }, + { + "epoch": 0.6949648711943794, + "grad_norm": 0.9791396260261536, + "learning_rate": 4.848205507591131e-06, + "loss": 0.711, + "step": 2374 + }, + { + "epoch": 0.6952576112412178, + "grad_norm": 0.9722687602043152, + "learning_rate": 4.848073343057034e-06, + "loss": 0.6838, + "step": 2375 + }, + { + "epoch": 0.6955503512880562, + "grad_norm": 1.036615252494812, + "learning_rate": 4.847941122814447e-06, + "loss": 0.6909, + "step": 2376 + }, + { + "epoch": 0.6958430913348946, + "grad_norm": 0.9484121799468994, + "learning_rate": 4.847808846866505e-06, + "loss": 0.627, + "step": 2377 + }, + { + "epoch": 0.6961358313817331, + "grad_norm": 0.9779052138328552, + "learning_rate": 4.847676515216347e-06, + "loss": 0.6869, + "step": 2378 + }, + { + "epoch": 0.6964285714285714, + "grad_norm": 0.9575409293174744, + "learning_rate": 4.847544127867114e-06, + "loss": 0.7047, + "step": 2379 + }, + { + "epoch": 0.6967213114754098, + "grad_norm": 0.9823215007781982, + "learning_rate": 4.847411684821945e-06, + "loss": 0.6514, + "step": 2380 + }, + { + "epoch": 0.6970140515222483, + "grad_norm": 0.9680964350700378, + "learning_rate": 4.8472791860839815e-06, + "loss": 0.6542, + "step": 2381 + }, + { + "epoch": 0.6973067915690867, + "grad_norm": 0.9820600748062134, + "learning_rate": 4.847146631656369e-06, + "loss": 0.6916, + "step": 2382 + }, + { + "epoch": 0.697599531615925, + "grad_norm": 0.9653842449188232, + "learning_rate": 4.847014021542252e-06, + "loss": 0.7072, + "step": 2383 + }, + { + "epoch": 0.6978922716627635, + "grad_norm": 1.0180569887161255, + "learning_rate": 4.846881355744776e-06, + "loss": 0.7063, + "step": 2384 + }, + { + "epoch": 0.6981850117096019, + "grad_norm": 0.9701268672943115, + "learning_rate": 4.84674863426709e-06, + "loss": 0.6792, + "step": 2385 + }, + { + "epoch": 0.6984777517564403, + "grad_norm": 0.9979000687599182, + "learning_rate": 4.846615857112341e-06, + "loss": 0.6552, + "step": 2386 + }, + { + "epoch": 0.6987704918032787, + "grad_norm": 1.0502222776412964, + "learning_rate": 4.84648302428368e-06, + "loss": 0.6419, + "step": 2387 + }, + { + "epoch": 0.6990632318501171, + "grad_norm": 1.041737675666809, + "learning_rate": 4.846350135784258e-06, + "loss": 0.6968, + "step": 2388 + }, + { + "epoch": 0.6993559718969555, + "grad_norm": 0.9730232954025269, + "learning_rate": 4.846217191617229e-06, + "loss": 0.6607, + "step": 2389 + }, + { + "epoch": 0.699648711943794, + "grad_norm": 0.9833289980888367, + "learning_rate": 4.846084191785746e-06, + "loss": 0.7028, + "step": 2390 + }, + { + "epoch": 0.6999414519906323, + "grad_norm": 0.9861181974411011, + "learning_rate": 4.8459511362929655e-06, + "loss": 0.7032, + "step": 2391 + }, + { + "epoch": 0.7002341920374707, + "grad_norm": 1.0536359548568726, + "learning_rate": 4.845818025142043e-06, + "loss": 0.6887, + "step": 2392 + }, + { + "epoch": 0.7005269320843092, + "grad_norm": 0.9685488939285278, + "learning_rate": 4.845684858336137e-06, + "loss": 0.6583, + "step": 2393 + }, + { + "epoch": 0.7008196721311475, + "grad_norm": 1.0184929370880127, + "learning_rate": 4.845551635878407e-06, + "loss": 0.6977, + "step": 2394 + }, + { + "epoch": 0.7011124121779859, + "grad_norm": 0.9738637804985046, + "learning_rate": 4.845418357772014e-06, + "loss": 0.702, + "step": 2395 + }, + { + "epoch": 0.7014051522248244, + "grad_norm": 0.9446795582771301, + "learning_rate": 4.84528502402012e-06, + "loss": 0.637, + "step": 2396 + }, + { + "epoch": 0.7016978922716628, + "grad_norm": 0.9779831767082214, + "learning_rate": 4.845151634625888e-06, + "loss": 0.6584, + "step": 2397 + }, + { + "epoch": 0.7019906323185011, + "grad_norm": 0.9341223239898682, + "learning_rate": 4.8450181895924835e-06, + "loss": 0.6614, + "step": 2398 + }, + { + "epoch": 0.7022833723653396, + "grad_norm": 0.9423546195030212, + "learning_rate": 4.8448846889230714e-06, + "loss": 0.6935, + "step": 2399 + }, + { + "epoch": 0.702576112412178, + "grad_norm": 0.9827507138252258, + "learning_rate": 4.84475113262082e-06, + "loss": 0.662, + "step": 2400 + }, + { + "epoch": 0.7028688524590164, + "grad_norm": 1.0144261121749878, + "learning_rate": 4.844617520688896e-06, + "loss": 0.675, + "step": 2401 + }, + { + "epoch": 0.7031615925058547, + "grad_norm": 0.9611207246780396, + "learning_rate": 4.8444838531304724e-06, + "loss": 0.6565, + "step": 2402 + }, + { + "epoch": 0.7034543325526932, + "grad_norm": 1.0325442552566528, + "learning_rate": 4.844350129948718e-06, + "loss": 0.6895, + "step": 2403 + }, + { + "epoch": 0.7037470725995316, + "grad_norm": 1.0549720525741577, + "learning_rate": 4.844216351146808e-06, + "loss": 0.6288, + "step": 2404 + }, + { + "epoch": 0.7040398126463701, + "grad_norm": 1.0408657789230347, + "learning_rate": 4.844082516727913e-06, + "loss": 0.6875, + "step": 2405 + }, + { + "epoch": 0.7043325526932084, + "grad_norm": 0.9755939245223999, + "learning_rate": 4.843948626695211e-06, + "loss": 0.6169, + "step": 2406 + }, + { + "epoch": 0.7046252927400468, + "grad_norm": 1.0582365989685059, + "learning_rate": 4.843814681051877e-06, + "loss": 0.6671, + "step": 2407 + }, + { + "epoch": 0.7049180327868853, + "grad_norm": 0.957874059677124, + "learning_rate": 4.84368067980109e-06, + "loss": 0.6311, + "step": 2408 + }, + { + "epoch": 0.7052107728337237, + "grad_norm": 1.0082972049713135, + "learning_rate": 4.843546622946027e-06, + "loss": 0.6417, + "step": 2409 + }, + { + "epoch": 0.705503512880562, + "grad_norm": 0.9449263215065002, + "learning_rate": 4.843412510489871e-06, + "loss": 0.6594, + "step": 2410 + }, + { + "epoch": 0.7057962529274004, + "grad_norm": 0.9553565979003906, + "learning_rate": 4.843278342435803e-06, + "loss": 0.6697, + "step": 2411 + }, + { + "epoch": 0.7060889929742389, + "grad_norm": 1.005255103111267, + "learning_rate": 4.843144118787007e-06, + "loss": 0.6814, + "step": 2412 + }, + { + "epoch": 0.7063817330210773, + "grad_norm": 0.9755287170410156, + "learning_rate": 4.843009839546666e-06, + "loss": 0.6986, + "step": 2413 + }, + { + "epoch": 0.7066744730679156, + "grad_norm": 0.9470412731170654, + "learning_rate": 4.842875504717966e-06, + "loss": 0.6747, + "step": 2414 + }, + { + "epoch": 0.7069672131147541, + "grad_norm": 0.9818392395973206, + "learning_rate": 4.842741114304095e-06, + "loss": 0.697, + "step": 2415 + }, + { + "epoch": 0.7072599531615925, + "grad_norm": 0.9864434003829956, + "learning_rate": 4.8426066683082404e-06, + "loss": 0.6973, + "step": 2416 + }, + { + "epoch": 0.707552693208431, + "grad_norm": 0.9778057932853699, + "learning_rate": 4.842472166733593e-06, + "loss": 0.6938, + "step": 2417 + }, + { + "epoch": 0.7078454332552693, + "grad_norm": 1.0301120281219482, + "learning_rate": 4.842337609583344e-06, + "loss": 0.7145, + "step": 2418 + }, + { + "epoch": 0.7081381733021077, + "grad_norm": 0.9531358480453491, + "learning_rate": 4.842202996860684e-06, + "loss": 0.7061, + "step": 2419 + }, + { + "epoch": 0.7084309133489461, + "grad_norm": 0.9214529991149902, + "learning_rate": 4.842068328568809e-06, + "loss": 0.5967, + "step": 2420 + }, + { + "epoch": 0.7087236533957846, + "grad_norm": 0.988349974155426, + "learning_rate": 4.841933604710912e-06, + "loss": 0.6612, + "step": 2421 + }, + { + "epoch": 0.7090163934426229, + "grad_norm": 0.9575024843215942, + "learning_rate": 4.84179882529019e-06, + "loss": 0.6141, + "step": 2422 + }, + { + "epoch": 0.7093091334894613, + "grad_norm": 0.9821819067001343, + "learning_rate": 4.841663990309842e-06, + "loss": 0.6303, + "step": 2423 + }, + { + "epoch": 0.7096018735362998, + "grad_norm": 1.0042649507522583, + "learning_rate": 4.841529099773066e-06, + "loss": 0.7237, + "step": 2424 + }, + { + "epoch": 0.7098946135831382, + "grad_norm": 0.9812665581703186, + "learning_rate": 4.841394153683062e-06, + "loss": 0.6888, + "step": 2425 + }, + { + "epoch": 0.7101873536299765, + "grad_norm": 0.9299144148826599, + "learning_rate": 4.841259152043032e-06, + "loss": 0.6456, + "step": 2426 + }, + { + "epoch": 0.710480093676815, + "grad_norm": 0.9687488675117493, + "learning_rate": 4.841124094856178e-06, + "loss": 0.6655, + "step": 2427 + }, + { + "epoch": 0.7107728337236534, + "grad_norm": 1.0022339820861816, + "learning_rate": 4.8409889821257056e-06, + "loss": 0.6992, + "step": 2428 + }, + { + "epoch": 0.7110655737704918, + "grad_norm": 1.0187747478485107, + "learning_rate": 4.84085381385482e-06, + "loss": 0.7478, + "step": 2429 + }, + { + "epoch": 0.7113583138173302, + "grad_norm": 1.013043761253357, + "learning_rate": 4.840718590046728e-06, + "loss": 0.633, + "step": 2430 + }, + { + "epoch": 0.7116510538641686, + "grad_norm": 0.9653459787368774, + "learning_rate": 4.840583310704637e-06, + "loss": 0.6271, + "step": 2431 + }, + { + "epoch": 0.711943793911007, + "grad_norm": 0.9658300876617432, + "learning_rate": 4.840447975831758e-06, + "loss": 0.6489, + "step": 2432 + }, + { + "epoch": 0.7122365339578455, + "grad_norm": 1.0490610599517822, + "learning_rate": 4.8403125854313e-06, + "loss": 0.7064, + "step": 2433 + }, + { + "epoch": 0.7125292740046838, + "grad_norm": 1.1162289381027222, + "learning_rate": 4.8401771395064774e-06, + "loss": 0.6903, + "step": 2434 + }, + { + "epoch": 0.7128220140515222, + "grad_norm": 0.9666079878807068, + "learning_rate": 4.8400416380605034e-06, + "loss": 0.6231, + "step": 2435 + }, + { + "epoch": 0.7131147540983607, + "grad_norm": 1.0029510259628296, + "learning_rate": 4.83990608109659e-06, + "loss": 0.6608, + "step": 2436 + }, + { + "epoch": 0.7134074941451991, + "grad_norm": 1.0456960201263428, + "learning_rate": 4.839770468617957e-06, + "loss": 0.6573, + "step": 2437 + }, + { + "epoch": 0.7137002341920374, + "grad_norm": 0.9788967967033386, + "learning_rate": 4.839634800627819e-06, + "loss": 0.6508, + "step": 2438 + }, + { + "epoch": 0.7139929742388759, + "grad_norm": 0.8969278335571289, + "learning_rate": 4.839499077129396e-06, + "loss": 0.6061, + "step": 2439 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 1.0467467308044434, + "learning_rate": 4.839363298125908e-06, + "loss": 0.696, + "step": 2440 + }, + { + "epoch": 0.7145784543325527, + "grad_norm": 0.9350816607475281, + "learning_rate": 4.839227463620577e-06, + "loss": 0.6769, + "step": 2441 + }, + { + "epoch": 0.7148711943793911, + "grad_norm": 1.012078881263733, + "learning_rate": 4.8390915736166245e-06, + "loss": 0.6741, + "step": 2442 + }, + { + "epoch": 0.7151639344262295, + "grad_norm": 0.9689279198646545, + "learning_rate": 4.838955628117274e-06, + "loss": 0.7123, + "step": 2443 + }, + { + "epoch": 0.7154566744730679, + "grad_norm": 0.9676612019538879, + "learning_rate": 4.838819627125753e-06, + "loss": 0.65, + "step": 2444 + }, + { + "epoch": 0.7157494145199064, + "grad_norm": 0.9975646138191223, + "learning_rate": 4.838683570645287e-06, + "loss": 0.6396, + "step": 2445 + }, + { + "epoch": 0.7160421545667447, + "grad_norm": 1.0627868175506592, + "learning_rate": 4.838547458679104e-06, + "loss": 0.6432, + "step": 2446 + }, + { + "epoch": 0.7163348946135831, + "grad_norm": 0.9935399889945984, + "learning_rate": 4.838411291230434e-06, + "loss": 0.6647, + "step": 2447 + }, + { + "epoch": 0.7166276346604216, + "grad_norm": 0.9724365472793579, + "learning_rate": 4.838275068302506e-06, + "loss": 0.6563, + "step": 2448 + }, + { + "epoch": 0.71692037470726, + "grad_norm": 0.9875682592391968, + "learning_rate": 4.838138789898552e-06, + "loss": 0.6948, + "step": 2449 + }, + { + "epoch": 0.7172131147540983, + "grad_norm": 0.9783352613449097, + "learning_rate": 4.8380024560218066e-06, + "loss": 0.6414, + "step": 2450 + }, + { + "epoch": 0.7175058548009368, + "grad_norm": 1.0243470668792725, + "learning_rate": 4.837866066675504e-06, + "loss": 0.7084, + "step": 2451 + }, + { + "epoch": 0.7177985948477752, + "grad_norm": 0.9512138962745667, + "learning_rate": 4.83772962186288e-06, + "loss": 0.6411, + "step": 2452 + }, + { + "epoch": 0.7180913348946136, + "grad_norm": 0.9899474382400513, + "learning_rate": 4.837593121587171e-06, + "loss": 0.6895, + "step": 2453 + }, + { + "epoch": 0.718384074941452, + "grad_norm": 0.9944790005683899, + "learning_rate": 4.837456565851616e-06, + "loss": 0.7021, + "step": 2454 + }, + { + "epoch": 0.7186768149882904, + "grad_norm": 0.9481464624404907, + "learning_rate": 4.837319954659455e-06, + "loss": 0.6725, + "step": 2455 + }, + { + "epoch": 0.7189695550351288, + "grad_norm": 1.0203945636749268, + "learning_rate": 4.83718328801393e-06, + "loss": 0.6954, + "step": 2456 + }, + { + "epoch": 0.7192622950819673, + "grad_norm": 0.983006477355957, + "learning_rate": 4.837046565918281e-06, + "loss": 0.654, + "step": 2457 + }, + { + "epoch": 0.7195550351288056, + "grad_norm": 0.9685888290405273, + "learning_rate": 4.8369097883757546e-06, + "loss": 0.6775, + "step": 2458 + }, + { + "epoch": 0.719847775175644, + "grad_norm": 1.003172516822815, + "learning_rate": 4.836772955389594e-06, + "loss": 0.6576, + "step": 2459 + }, + { + "epoch": 0.7201405152224825, + "grad_norm": 0.9728193879127502, + "learning_rate": 4.836636066963045e-06, + "loss": 0.6857, + "step": 2460 + }, + { + "epoch": 0.7204332552693209, + "grad_norm": 0.9627071022987366, + "learning_rate": 4.836499123099358e-06, + "loss": 0.6651, + "step": 2461 + }, + { + "epoch": 0.7207259953161592, + "grad_norm": 1.04771888256073, + "learning_rate": 4.83636212380178e-06, + "loss": 0.7111, + "step": 2462 + }, + { + "epoch": 0.7210187353629977, + "grad_norm": 1.006294846534729, + "learning_rate": 4.836225069073561e-06, + "loss": 0.7294, + "step": 2463 + }, + { + "epoch": 0.7213114754098361, + "grad_norm": 0.9928861260414124, + "learning_rate": 4.836087958917954e-06, + "loss": 0.7142, + "step": 2464 + }, + { + "epoch": 0.7216042154566745, + "grad_norm": 0.9771029353141785, + "learning_rate": 4.8359507933382114e-06, + "loss": 0.7014, + "step": 2465 + }, + { + "epoch": 0.7218969555035128, + "grad_norm": 1.035372257232666, + "learning_rate": 4.8358135723375876e-06, + "loss": 0.6994, + "step": 2466 + }, + { + "epoch": 0.7221896955503513, + "grad_norm": 0.9644604325294495, + "learning_rate": 4.835676295919337e-06, + "loss": 0.6677, + "step": 2467 + }, + { + "epoch": 0.7224824355971897, + "grad_norm": 0.9849503636360168, + "learning_rate": 4.835538964086719e-06, + "loss": 0.6184, + "step": 2468 + }, + { + "epoch": 0.7227751756440282, + "grad_norm": 0.9733806252479553, + "learning_rate": 4.835401576842989e-06, + "loss": 0.6471, + "step": 2469 + }, + { + "epoch": 0.7230679156908665, + "grad_norm": 0.984883725643158, + "learning_rate": 4.835264134191409e-06, + "loss": 0.6695, + "step": 2470 + }, + { + "epoch": 0.7233606557377049, + "grad_norm": 1.0393331050872803, + "learning_rate": 4.835126636135238e-06, + "loss": 0.6629, + "step": 2471 + }, + { + "epoch": 0.7236533957845434, + "grad_norm": 0.9599575400352478, + "learning_rate": 4.834989082677739e-06, + "loss": 0.6882, + "step": 2472 + }, + { + "epoch": 0.7239461358313818, + "grad_norm": 0.932843029499054, + "learning_rate": 4.8348514738221765e-06, + "loss": 0.6189, + "step": 2473 + }, + { + "epoch": 0.7242388758782201, + "grad_norm": 0.9779561758041382, + "learning_rate": 4.8347138095718125e-06, + "loss": 0.698, + "step": 2474 + }, + { + "epoch": 0.7245316159250585, + "grad_norm": 1.0204392671585083, + "learning_rate": 4.834576089929916e-06, + "loss": 0.6701, + "step": 2475 + }, + { + "epoch": 0.724824355971897, + "grad_norm": 0.9641032814979553, + "learning_rate": 4.834438314899753e-06, + "loss": 0.6106, + "step": 2476 + }, + { + "epoch": 0.7251170960187353, + "grad_norm": 1.0309169292449951, + "learning_rate": 4.834300484484593e-06, + "loss": 0.6524, + "step": 2477 + }, + { + "epoch": 0.7254098360655737, + "grad_norm": 1.0148911476135254, + "learning_rate": 4.834162598687705e-06, + "loss": 0.6735, + "step": 2478 + }, + { + "epoch": 0.7257025761124122, + "grad_norm": 1.0678564310073853, + "learning_rate": 4.834024657512361e-06, + "loss": 0.7386, + "step": 2479 + }, + { + "epoch": 0.7259953161592506, + "grad_norm": 0.9634899497032166, + "learning_rate": 4.833886660961833e-06, + "loss": 0.7065, + "step": 2480 + }, + { + "epoch": 0.7262880562060889, + "grad_norm": 1.023107647895813, + "learning_rate": 4.833748609039396e-06, + "loss": 0.6485, + "step": 2481 + }, + { + "epoch": 0.7265807962529274, + "grad_norm": 0.9681041836738586, + "learning_rate": 4.8336105017483244e-06, + "loss": 0.6695, + "step": 2482 + }, + { + "epoch": 0.7268735362997658, + "grad_norm": 1.0183650255203247, + "learning_rate": 4.8334723390918956e-06, + "loss": 0.6831, + "step": 2483 + }, + { + "epoch": 0.7271662763466042, + "grad_norm": 0.9609016180038452, + "learning_rate": 4.833334121073388e-06, + "loss": 0.7188, + "step": 2484 + }, + { + "epoch": 0.7274590163934426, + "grad_norm": 0.9649230241775513, + "learning_rate": 4.833195847696079e-06, + "loss": 0.6304, + "step": 2485 + }, + { + "epoch": 0.727751756440281, + "grad_norm": 0.9974673986434937, + "learning_rate": 4.833057518963251e-06, + "loss": 0.684, + "step": 2486 + }, + { + "epoch": 0.7280444964871194, + "grad_norm": 0.9631463885307312, + "learning_rate": 4.832919134878186e-06, + "loss": 0.6573, + "step": 2487 + }, + { + "epoch": 0.7283372365339579, + "grad_norm": 0.9887321591377258, + "learning_rate": 4.832780695444165e-06, + "loss": 0.6606, + "step": 2488 + }, + { + "epoch": 0.7286299765807962, + "grad_norm": 1.07668137550354, + "learning_rate": 4.8326422006644735e-06, + "loss": 0.6825, + "step": 2489 + }, + { + "epoch": 0.7289227166276346, + "grad_norm": 0.9894252419471741, + "learning_rate": 4.832503650542399e-06, + "loss": 0.6988, + "step": 2490 + }, + { + "epoch": 0.7292154566744731, + "grad_norm": 0.9692526459693909, + "learning_rate": 4.832365045081226e-06, + "loss": 0.6602, + "step": 2491 + }, + { + "epoch": 0.7295081967213115, + "grad_norm": 0.958368718624115, + "learning_rate": 4.832226384284245e-06, + "loss": 0.6709, + "step": 2492 + }, + { + "epoch": 0.7298009367681498, + "grad_norm": 1.0324345827102661, + "learning_rate": 4.832087668154745e-06, + "loss": 0.6669, + "step": 2493 + }, + { + "epoch": 0.7300936768149883, + "grad_norm": 1.0060834884643555, + "learning_rate": 4.831948896696016e-06, + "loss": 0.7138, + "step": 2494 + }, + { + "epoch": 0.7303864168618267, + "grad_norm": 0.965546190738678, + "learning_rate": 4.8318100699113524e-06, + "loss": 0.6607, + "step": 2495 + }, + { + "epoch": 0.7306791569086651, + "grad_norm": 1.0053272247314453, + "learning_rate": 4.8316711878040465e-06, + "loss": 0.697, + "step": 2496 + }, + { + "epoch": 0.7309718969555035, + "grad_norm": 0.9662278294563293, + "learning_rate": 4.831532250377394e-06, + "loss": 0.6261, + "step": 2497 + }, + { + "epoch": 0.7312646370023419, + "grad_norm": 1.0150270462036133, + "learning_rate": 4.83139325763469e-06, + "loss": 0.6768, + "step": 2498 + }, + { + "epoch": 0.7315573770491803, + "grad_norm": 0.9365479946136475, + "learning_rate": 4.831254209579234e-06, + "loss": 0.63, + "step": 2499 + }, + { + "epoch": 0.7318501170960188, + "grad_norm": 1.0476243495941162, + "learning_rate": 4.831115106214324e-06, + "loss": 0.6477, + "step": 2500 + }, + { + "epoch": 0.7321428571428571, + "grad_norm": 1.0576472282409668, + "learning_rate": 4.83097594754326e-06, + "loss": 0.6687, + "step": 2501 + }, + { + "epoch": 0.7324355971896955, + "grad_norm": 1.0254930257797241, + "learning_rate": 4.830836733569344e-06, + "loss": 0.619, + "step": 2502 + }, + { + "epoch": 0.732728337236534, + "grad_norm": 1.0141335725784302, + "learning_rate": 4.830697464295878e-06, + "loss": 0.6958, + "step": 2503 + }, + { + "epoch": 0.7330210772833724, + "grad_norm": 0.9595030546188354, + "learning_rate": 4.830558139726168e-06, + "loss": 0.6612, + "step": 2504 + }, + { + "epoch": 0.7333138173302107, + "grad_norm": 1.0249606370925903, + "learning_rate": 4.830418759863517e-06, + "loss": 0.6721, + "step": 2505 + }, + { + "epoch": 0.7336065573770492, + "grad_norm": 0.9795429110527039, + "learning_rate": 4.830279324711234e-06, + "loss": 0.6815, + "step": 2506 + }, + { + "epoch": 0.7338992974238876, + "grad_norm": 0.9553292393684387, + "learning_rate": 4.830139834272627e-06, + "loss": 0.6827, + "step": 2507 + }, + { + "epoch": 0.734192037470726, + "grad_norm": 0.9590965509414673, + "learning_rate": 4.830000288551004e-06, + "loss": 0.6606, + "step": 2508 + }, + { + "epoch": 0.7344847775175644, + "grad_norm": 0.9535275101661682, + "learning_rate": 4.8298606875496764e-06, + "loss": 0.6452, + "step": 2509 + }, + { + "epoch": 0.7347775175644028, + "grad_norm": 1.0127876996994019, + "learning_rate": 4.829721031271956e-06, + "loss": 0.6921, + "step": 2510 + }, + { + "epoch": 0.7350702576112412, + "grad_norm": 0.9966358542442322, + "learning_rate": 4.829581319721158e-06, + "loss": 0.6667, + "step": 2511 + }, + { + "epoch": 0.7353629976580797, + "grad_norm": 0.9647680521011353, + "learning_rate": 4.829441552900594e-06, + "loss": 0.6531, + "step": 2512 + }, + { + "epoch": 0.735655737704918, + "grad_norm": 0.9660115838050842, + "learning_rate": 4.8293017308135826e-06, + "loss": 0.6368, + "step": 2513 + }, + { + "epoch": 0.7359484777517564, + "grad_norm": 0.9291300773620605, + "learning_rate": 4.8291618534634395e-06, + "loss": 0.6114, + "step": 2514 + }, + { + "epoch": 0.7362412177985949, + "grad_norm": 0.9691485166549683, + "learning_rate": 4.829021920853485e-06, + "loss": 0.6516, + "step": 2515 + }, + { + "epoch": 0.7365339578454333, + "grad_norm": 1.016615629196167, + "learning_rate": 4.828881932987037e-06, + "loss": 0.6556, + "step": 2516 + }, + { + "epoch": 0.7368266978922716, + "grad_norm": 1.0639957189559937, + "learning_rate": 4.828741889867418e-06, + "loss": 0.693, + "step": 2517 + }, + { + "epoch": 0.7371194379391101, + "grad_norm": 0.9987112283706665, + "learning_rate": 4.82860179149795e-06, + "loss": 0.6719, + "step": 2518 + }, + { + "epoch": 0.7374121779859485, + "grad_norm": 0.976995587348938, + "learning_rate": 4.828461637881958e-06, + "loss": 0.6912, + "step": 2519 + }, + { + "epoch": 0.7377049180327869, + "grad_norm": 0.9532166719436646, + "learning_rate": 4.828321429022765e-06, + "loss": 0.6385, + "step": 2520 + }, + { + "epoch": 0.7379976580796253, + "grad_norm": 0.9926960468292236, + "learning_rate": 4.8281811649237e-06, + "loss": 0.6677, + "step": 2521 + }, + { + "epoch": 0.7382903981264637, + "grad_norm": 0.9855061173439026, + "learning_rate": 4.828040845588089e-06, + "loss": 0.6669, + "step": 2522 + }, + { + "epoch": 0.7385831381733021, + "grad_norm": 1.0022751092910767, + "learning_rate": 4.827900471019262e-06, + "loss": 0.6795, + "step": 2523 + }, + { + "epoch": 0.7388758782201406, + "grad_norm": 0.9891449213027954, + "learning_rate": 4.827760041220549e-06, + "loss": 0.6796, + "step": 2524 + }, + { + "epoch": 0.7391686182669789, + "grad_norm": 0.9840843677520752, + "learning_rate": 4.827619556195281e-06, + "loss": 0.6753, + "step": 2525 + }, + { + "epoch": 0.7394613583138173, + "grad_norm": 0.9507187604904175, + "learning_rate": 4.827479015946792e-06, + "loss": 0.6824, + "step": 2526 + }, + { + "epoch": 0.7397540983606558, + "grad_norm": 0.9588443040847778, + "learning_rate": 4.8273384204784164e-06, + "loss": 0.6435, + "step": 2527 + }, + { + "epoch": 0.7400468384074942, + "grad_norm": 1.0382976531982422, + "learning_rate": 4.827197769793491e-06, + "loss": 0.645, + "step": 2528 + }, + { + "epoch": 0.7403395784543325, + "grad_norm": 0.9771413207054138, + "learning_rate": 4.82705706389535e-06, + "loss": 0.6664, + "step": 2529 + }, + { + "epoch": 0.740632318501171, + "grad_norm": 0.951867401599884, + "learning_rate": 4.826916302787334e-06, + "loss": 0.6863, + "step": 2530 + }, + { + "epoch": 0.7409250585480094, + "grad_norm": 0.9556918144226074, + "learning_rate": 4.82677548647278e-06, + "loss": 0.6883, + "step": 2531 + }, + { + "epoch": 0.7412177985948478, + "grad_norm": 0.9507229328155518, + "learning_rate": 4.826634614955032e-06, + "loss": 0.683, + "step": 2532 + }, + { + "epoch": 0.7415105386416861, + "grad_norm": 0.9709544777870178, + "learning_rate": 4.82649368823743e-06, + "loss": 0.7092, + "step": 2533 + }, + { + "epoch": 0.7418032786885246, + "grad_norm": 0.9684427380561829, + "learning_rate": 4.826352706323319e-06, + "loss": 0.6725, + "step": 2534 + }, + { + "epoch": 0.742096018735363, + "grad_norm": 0.9732899069786072, + "learning_rate": 4.826211669216042e-06, + "loss": 0.7109, + "step": 2535 + }, + { + "epoch": 0.7423887587822015, + "grad_norm": 9.817760467529297, + "learning_rate": 4.826070576918947e-06, + "loss": 0.6887, + "step": 2536 + }, + { + "epoch": 0.7426814988290398, + "grad_norm": 0.9687430262565613, + "learning_rate": 4.825929429435381e-06, + "loss": 0.6464, + "step": 2537 + }, + { + "epoch": 0.7429742388758782, + "grad_norm": 1.0252794027328491, + "learning_rate": 4.825788226768692e-06, + "loss": 0.6904, + "step": 2538 + }, + { + "epoch": 0.7432669789227166, + "grad_norm": 0.9394489526748657, + "learning_rate": 4.825646968922231e-06, + "loss": 0.628, + "step": 2539 + }, + { + "epoch": 0.7435597189695551, + "grad_norm": 1.0099481344223022, + "learning_rate": 4.825505655899348e-06, + "loss": 0.7127, + "step": 2540 + }, + { + "epoch": 0.7438524590163934, + "grad_norm": 0.9775800108909607, + "learning_rate": 4.825364287703397e-06, + "loss": 0.6793, + "step": 2541 + }, + { + "epoch": 0.7441451990632318, + "grad_norm": 0.8965847492218018, + "learning_rate": 4.8252228643377314e-06, + "loss": 0.6561, + "step": 2542 + }, + { + "epoch": 0.7444379391100703, + "grad_norm": 0.9663348197937012, + "learning_rate": 4.8250813858057065e-06, + "loss": 0.6498, + "step": 2543 + }, + { + "epoch": 0.7447306791569087, + "grad_norm": 0.9738327264785767, + "learning_rate": 4.8249398521106796e-06, + "loss": 0.6258, + "step": 2544 + }, + { + "epoch": 0.745023419203747, + "grad_norm": 0.9585781097412109, + "learning_rate": 4.824798263256007e-06, + "loss": 0.678, + "step": 2545 + }, + { + "epoch": 0.7453161592505855, + "grad_norm": 1.0163590908050537, + "learning_rate": 4.82465661924505e-06, + "loss": 0.6501, + "step": 2546 + }, + { + "epoch": 0.7456088992974239, + "grad_norm": 0.9767870903015137, + "learning_rate": 4.824514920081168e-06, + "loss": 0.6212, + "step": 2547 + }, + { + "epoch": 0.7459016393442623, + "grad_norm": 1.0304607152938843, + "learning_rate": 4.824373165767723e-06, + "loss": 0.7282, + "step": 2548 + }, + { + "epoch": 0.7461943793911007, + "grad_norm": 0.979469895362854, + "learning_rate": 4.824231356308077e-06, + "loss": 0.6309, + "step": 2549 + }, + { + "epoch": 0.7464871194379391, + "grad_norm": 0.9572466611862183, + "learning_rate": 4.824089491705596e-06, + "loss": 0.672, + "step": 2550 + }, + { + "epoch": 0.7467798594847775, + "grad_norm": 0.9259224534034729, + "learning_rate": 4.823947571963646e-06, + "loss": 0.6024, + "step": 2551 + }, + { + "epoch": 0.747072599531616, + "grad_norm": 1.0284051895141602, + "learning_rate": 4.823805597085592e-06, + "loss": 0.7029, + "step": 2552 + }, + { + "epoch": 0.7473653395784543, + "grad_norm": 0.9627203345298767, + "learning_rate": 4.8236635670748055e-06, + "loss": 0.6515, + "step": 2553 + }, + { + "epoch": 0.7476580796252927, + "grad_norm": 0.9794974327087402, + "learning_rate": 4.823521481934653e-06, + "loss": 0.7146, + "step": 2554 + }, + { + "epoch": 0.7479508196721312, + "grad_norm": 0.9793955087661743, + "learning_rate": 4.823379341668508e-06, + "loss": 0.6819, + "step": 2555 + }, + { + "epoch": 0.7482435597189696, + "grad_norm": 0.930491030216217, + "learning_rate": 4.823237146279741e-06, + "loss": 0.6356, + "step": 2556 + }, + { + "epoch": 0.7485362997658079, + "grad_norm": 0.9460793733596802, + "learning_rate": 4.823094895771727e-06, + "loss": 0.6653, + "step": 2557 + }, + { + "epoch": 0.7488290398126464, + "grad_norm": 0.9503335356712341, + "learning_rate": 4.82295259014784e-06, + "loss": 0.6562, + "step": 2558 + }, + { + "epoch": 0.7491217798594848, + "grad_norm": 1.000777244567871, + "learning_rate": 4.822810229411457e-06, + "loss": 0.6626, + "step": 2559 + }, + { + "epoch": 0.7494145199063232, + "grad_norm": 0.913009762763977, + "learning_rate": 4.822667813565954e-06, + "loss": 0.626, + "step": 2560 + }, + { + "epoch": 0.7497072599531616, + "grad_norm": 1.0696160793304443, + "learning_rate": 4.822525342614711e-06, + "loss": 0.6807, + "step": 2561 + }, + { + "epoch": 0.75, + "grad_norm": 1.0577964782714844, + "learning_rate": 4.822382816561109e-06, + "loss": 0.6952, + "step": 2562 + }, + { + "epoch": 0.7502927400468384, + "grad_norm": 1.025119662284851, + "learning_rate": 4.822240235408527e-06, + "loss": 0.6732, + "step": 2563 + }, + { + "epoch": 0.7505854800936768, + "grad_norm": 0.9332608580589294, + "learning_rate": 4.82209759916035e-06, + "loss": 0.6626, + "step": 2564 + }, + { + "epoch": 0.7508782201405152, + "grad_norm": 0.9525098204612732, + "learning_rate": 4.821954907819962e-06, + "loss": 0.6447, + "step": 2565 + }, + { + "epoch": 0.7511709601873536, + "grad_norm": 0.9595814347267151, + "learning_rate": 4.821812161390748e-06, + "loss": 0.569, + "step": 2566 + }, + { + "epoch": 0.7514637002341921, + "grad_norm": 0.984329879283905, + "learning_rate": 4.821669359876093e-06, + "loss": 0.6687, + "step": 2567 + }, + { + "epoch": 0.7517564402810304, + "grad_norm": 1.0439749956130981, + "learning_rate": 4.821526503279387e-06, + "loss": 0.6549, + "step": 2568 + }, + { + "epoch": 0.7520491803278688, + "grad_norm": 1.037962555885315, + "learning_rate": 4.821383591604019e-06, + "loss": 0.6751, + "step": 2569 + }, + { + "epoch": 0.7523419203747073, + "grad_norm": 0.9589645862579346, + "learning_rate": 4.821240624853379e-06, + "loss": 0.6841, + "step": 2570 + }, + { + "epoch": 0.7526346604215457, + "grad_norm": 1.047347068786621, + "learning_rate": 4.8210976030308584e-06, + "loss": 0.6443, + "step": 2571 + }, + { + "epoch": 0.752927400468384, + "grad_norm": 0.9291172623634338, + "learning_rate": 4.8209545261398524e-06, + "loss": 0.6249, + "step": 2572 + }, + { + "epoch": 0.7532201405152225, + "grad_norm": 0.962733268737793, + "learning_rate": 4.820811394183755e-06, + "loss": 0.6835, + "step": 2573 + }, + { + "epoch": 0.7535128805620609, + "grad_norm": 0.9880397915840149, + "learning_rate": 4.820668207165959e-06, + "loss": 0.6619, + "step": 2574 + }, + { + "epoch": 0.7538056206088993, + "grad_norm": 0.9141432046890259, + "learning_rate": 4.8205249650898665e-06, + "loss": 0.6094, + "step": 2575 + }, + { + "epoch": 0.7540983606557377, + "grad_norm": 0.9885067939758301, + "learning_rate": 4.820381667958872e-06, + "loss": 0.632, + "step": 2576 + }, + { + "epoch": 0.7543911007025761, + "grad_norm": 0.9495107531547546, + "learning_rate": 4.820238315776376e-06, + "loss": 0.6649, + "step": 2577 + }, + { + "epoch": 0.7546838407494145, + "grad_norm": 0.9637511968612671, + "learning_rate": 4.820094908545782e-06, + "loss": 0.6632, + "step": 2578 + }, + { + "epoch": 0.754976580796253, + "grad_norm": 0.9608165621757507, + "learning_rate": 4.81995144627049e-06, + "loss": 0.6615, + "step": 2579 + }, + { + "epoch": 0.7552693208430913, + "grad_norm": 0.9591270685195923, + "learning_rate": 4.819807928953904e-06, + "loss": 0.6832, + "step": 2580 + }, + { + "epoch": 0.7555620608899297, + "grad_norm": 1.0299150943756104, + "learning_rate": 4.819664356599428e-06, + "loss": 0.6467, + "step": 2581 + }, + { + "epoch": 0.7558548009367682, + "grad_norm": 0.9504705667495728, + "learning_rate": 4.819520729210471e-06, + "loss": 0.6627, + "step": 2582 + }, + { + "epoch": 0.7561475409836066, + "grad_norm": 0.9839116334915161, + "learning_rate": 4.819377046790438e-06, + "loss": 0.6755, + "step": 2583 + }, + { + "epoch": 0.7564402810304449, + "grad_norm": 0.9940731525421143, + "learning_rate": 4.81923330934274e-06, + "loss": 0.6768, + "step": 2584 + }, + { + "epoch": 0.7567330210772834, + "grad_norm": 0.9934871196746826, + "learning_rate": 4.819089516870785e-06, + "loss": 0.6735, + "step": 2585 + }, + { + "epoch": 0.7570257611241218, + "grad_norm": 1.00322425365448, + "learning_rate": 4.818945669377987e-06, + "loss": 0.6817, + "step": 2586 + }, + { + "epoch": 0.7573185011709602, + "grad_norm": 0.9559628367424011, + "learning_rate": 4.818801766867757e-06, + "loss": 0.6549, + "step": 2587 + }, + { + "epoch": 0.7576112412177985, + "grad_norm": 0.9766403436660767, + "learning_rate": 4.818657809343509e-06, + "loss": 0.6907, + "step": 2588 + }, + { + "epoch": 0.757903981264637, + "grad_norm": 0.9515842795372009, + "learning_rate": 4.818513796808659e-06, + "loss": 0.6658, + "step": 2589 + }, + { + "epoch": 0.7581967213114754, + "grad_norm": 0.9420010447502136, + "learning_rate": 4.818369729266625e-06, + "loss": 0.6704, + "step": 2590 + }, + { + "epoch": 0.7584894613583139, + "grad_norm": 0.9830732941627502, + "learning_rate": 4.818225606720823e-06, + "loss": 0.6622, + "step": 2591 + }, + { + "epoch": 0.7587822014051522, + "grad_norm": 1.0004438161849976, + "learning_rate": 4.8180814291746725e-06, + "loss": 0.6613, + "step": 2592 + }, + { + "epoch": 0.7590749414519906, + "grad_norm": 0.9673280715942383, + "learning_rate": 4.817937196631596e-06, + "loss": 0.7136, + "step": 2593 + }, + { + "epoch": 0.759367681498829, + "grad_norm": 0.956204354763031, + "learning_rate": 4.817792909095013e-06, + "loss": 0.6689, + "step": 2594 + }, + { + "epoch": 0.7596604215456675, + "grad_norm": 0.9618379473686218, + "learning_rate": 4.8176485665683486e-06, + "loss": 0.6826, + "step": 2595 + }, + { + "epoch": 0.7599531615925058, + "grad_norm": 0.9988508224487305, + "learning_rate": 4.817504169055026e-06, + "loss": 0.6714, + "step": 2596 + }, + { + "epoch": 0.7602459016393442, + "grad_norm": 0.973912239074707, + "learning_rate": 4.817359716558473e-06, + "loss": 0.6318, + "step": 2597 + }, + { + "epoch": 0.7605386416861827, + "grad_norm": 0.9733139276504517, + "learning_rate": 4.8172152090821146e-06, + "loss": 0.649, + "step": 2598 + }, + { + "epoch": 0.7608313817330211, + "grad_norm": 0.9257541298866272, + "learning_rate": 4.817070646629381e-06, + "loss": 0.6224, + "step": 2599 + }, + { + "epoch": 0.7611241217798594, + "grad_norm": 0.967129111289978, + "learning_rate": 4.8169260292036994e-06, + "loss": 0.6979, + "step": 2600 + }, + { + "epoch": 0.7614168618266979, + "grad_norm": 0.9047691226005554, + "learning_rate": 4.816781356808504e-06, + "loss": 0.6446, + "step": 2601 + }, + { + "epoch": 0.7617096018735363, + "grad_norm": 0.9726836681365967, + "learning_rate": 4.816636629447225e-06, + "loss": 0.6675, + "step": 2602 + }, + { + "epoch": 0.7620023419203747, + "grad_norm": 0.9235100746154785, + "learning_rate": 4.816491847123298e-06, + "loss": 0.6295, + "step": 2603 + }, + { + "epoch": 0.7622950819672131, + "grad_norm": 0.9881744384765625, + "learning_rate": 4.8163470098401564e-06, + "loss": 0.67, + "step": 2604 + }, + { + "epoch": 0.7625878220140515, + "grad_norm": 0.9657142758369446, + "learning_rate": 4.816202117601236e-06, + "loss": 0.7146, + "step": 2605 + }, + { + "epoch": 0.7628805620608899, + "grad_norm": 0.9153656959533691, + "learning_rate": 4.8160571704099764e-06, + "loss": 0.6293, + "step": 2606 + }, + { + "epoch": 0.7631733021077284, + "grad_norm": 0.9669782519340515, + "learning_rate": 4.815912168269816e-06, + "loss": 0.693, + "step": 2607 + }, + { + "epoch": 0.7634660421545667, + "grad_norm": 0.9433345198631287, + "learning_rate": 4.815767111184193e-06, + "loss": 0.6801, + "step": 2608 + }, + { + "epoch": 0.7637587822014051, + "grad_norm": 0.9660816788673401, + "learning_rate": 4.815621999156551e-06, + "loss": 0.6139, + "step": 2609 + }, + { + "epoch": 0.7640515222482436, + "grad_norm": 0.9465237855911255, + "learning_rate": 4.815476832190333e-06, + "loss": 0.6424, + "step": 2610 + }, + { + "epoch": 0.764344262295082, + "grad_norm": 0.964924693107605, + "learning_rate": 4.815331610288981e-06, + "loss": 0.6555, + "step": 2611 + }, + { + "epoch": 0.7646370023419203, + "grad_norm": 0.9335149526596069, + "learning_rate": 4.815186333455943e-06, + "loss": 0.6813, + "step": 2612 + }, + { + "epoch": 0.7649297423887588, + "grad_norm": 0.9957479238510132, + "learning_rate": 4.815041001694663e-06, + "loss": 0.66, + "step": 2613 + }, + { + "epoch": 0.7652224824355972, + "grad_norm": 1.0134044885635376, + "learning_rate": 4.814895615008592e-06, + "loss": 0.6765, + "step": 2614 + }, + { + "epoch": 0.7655152224824356, + "grad_norm": 0.9757283926010132, + "learning_rate": 4.814750173401177e-06, + "loss": 0.7049, + "step": 2615 + }, + { + "epoch": 0.765807962529274, + "grad_norm": 1.0186116695404053, + "learning_rate": 4.814604676875869e-06, + "loss": 0.6823, + "step": 2616 + }, + { + "epoch": 0.7661007025761124, + "grad_norm": 1.0094447135925293, + "learning_rate": 4.814459125436121e-06, + "loss": 0.6681, + "step": 2617 + }, + { + "epoch": 0.7663934426229508, + "grad_norm": 1.0134992599487305, + "learning_rate": 4.814313519085385e-06, + "loss": 0.7211, + "step": 2618 + }, + { + "epoch": 0.7666861826697893, + "grad_norm": 0.9274501204490662, + "learning_rate": 4.814167857827117e-06, + "loss": 0.6465, + "step": 2619 + }, + { + "epoch": 0.7669789227166276, + "grad_norm": 0.9316897392272949, + "learning_rate": 4.8140221416647715e-06, + "loss": 0.663, + "step": 2620 + }, + { + "epoch": 0.767271662763466, + "grad_norm": 0.9368662238121033, + "learning_rate": 4.813876370601807e-06, + "loss": 0.6296, + "step": 2621 + }, + { + "epoch": 0.7675644028103045, + "grad_norm": 0.9973495006561279, + "learning_rate": 4.813730544641679e-06, + "loss": 0.7148, + "step": 2622 + }, + { + "epoch": 0.7678571428571429, + "grad_norm": 1.0121041536331177, + "learning_rate": 4.8135846637878515e-06, + "loss": 0.7042, + "step": 2623 + }, + { + "epoch": 0.7681498829039812, + "grad_norm": 0.9060382843017578, + "learning_rate": 4.8134387280437825e-06, + "loss": 0.5881, + "step": 2624 + }, + { + "epoch": 0.7684426229508197, + "grad_norm": 0.9425743222236633, + "learning_rate": 4.813292737412935e-06, + "loss": 0.6467, + "step": 2625 + }, + { + "epoch": 0.7687353629976581, + "grad_norm": 0.9829302430152893, + "learning_rate": 4.813146691898772e-06, + "loss": 0.6731, + "step": 2626 + }, + { + "epoch": 0.7690281030444965, + "grad_norm": 0.9451154470443726, + "learning_rate": 4.813000591504761e-06, + "loss": 0.6585, + "step": 2627 + }, + { + "epoch": 0.7693208430913349, + "grad_norm": 1.0212111473083496, + "learning_rate": 4.812854436234365e-06, + "loss": 0.6545, + "step": 2628 + }, + { + "epoch": 0.7696135831381733, + "grad_norm": 1.0763142108917236, + "learning_rate": 4.812708226091055e-06, + "loss": 0.6789, + "step": 2629 + }, + { + "epoch": 0.7699063231850117, + "grad_norm": 1.0033652782440186, + "learning_rate": 4.812561961078297e-06, + "loss": 0.6595, + "step": 2630 + }, + { + "epoch": 0.7701990632318502, + "grad_norm": 1.0097460746765137, + "learning_rate": 4.812415641199561e-06, + "loss": 0.647, + "step": 2631 + }, + { + "epoch": 0.7704918032786885, + "grad_norm": 0.9646894335746765, + "learning_rate": 4.812269266458322e-06, + "loss": 0.678, + "step": 2632 + }, + { + "epoch": 0.7707845433255269, + "grad_norm": 0.9833580255508423, + "learning_rate": 4.812122836858049e-06, + "loss": 0.6714, + "step": 2633 + }, + { + "epoch": 0.7710772833723654, + "grad_norm": 1.0014920234680176, + "learning_rate": 4.8119763524022174e-06, + "loss": 0.6768, + "step": 2634 + }, + { + "epoch": 0.7713700234192038, + "grad_norm": 0.9830964207649231, + "learning_rate": 4.8118298130943034e-06, + "loss": 0.6642, + "step": 2635 + }, + { + "epoch": 0.7716627634660421, + "grad_norm": 0.9852242469787598, + "learning_rate": 4.811683218937782e-06, + "loss": 0.6937, + "step": 2636 + }, + { + "epoch": 0.7719555035128806, + "grad_norm": 0.9605165719985962, + "learning_rate": 4.811536569936134e-06, + "loss": 0.6287, + "step": 2637 + }, + { + "epoch": 0.772248243559719, + "grad_norm": 0.9858463406562805, + "learning_rate": 4.8113898660928345e-06, + "loss": 0.7022, + "step": 2638 + }, + { + "epoch": 0.7725409836065574, + "grad_norm": 0.9905266761779785, + "learning_rate": 4.811243107411368e-06, + "loss": 0.6468, + "step": 2639 + }, + { + "epoch": 0.7728337236533958, + "grad_norm": 1.0148428678512573, + "learning_rate": 4.8110962938952134e-06, + "loss": 0.6877, + "step": 2640 + }, + { + "epoch": 0.7731264637002342, + "grad_norm": 0.9807069301605225, + "learning_rate": 4.810949425547856e-06, + "loss": 0.69, + "step": 2641 + }, + { + "epoch": 0.7734192037470726, + "grad_norm": 0.9429119229316711, + "learning_rate": 4.810802502372779e-06, + "loss": 0.6461, + "step": 2642 + }, + { + "epoch": 0.7737119437939111, + "grad_norm": 0.9756373763084412, + "learning_rate": 4.810655524373469e-06, + "loss": 0.6806, + "step": 2643 + }, + { + "epoch": 0.7740046838407494, + "grad_norm": 0.9984195828437805, + "learning_rate": 4.810508491553413e-06, + "loss": 0.7049, + "step": 2644 + }, + { + "epoch": 0.7742974238875878, + "grad_norm": 1.0115180015563965, + "learning_rate": 4.810361403916099e-06, + "loss": 0.6698, + "step": 2645 + }, + { + "epoch": 0.7745901639344263, + "grad_norm": 0.9678516983985901, + "learning_rate": 4.810214261465017e-06, + "loss": 0.7125, + "step": 2646 + }, + { + "epoch": 0.7748829039812647, + "grad_norm": 1.0609760284423828, + "learning_rate": 4.810067064203657e-06, + "loss": 0.7315, + "step": 2647 + }, + { + "epoch": 0.775175644028103, + "grad_norm": 0.9321461319923401, + "learning_rate": 4.809919812135513e-06, + "loss": 0.6573, + "step": 2648 + }, + { + "epoch": 0.7754683840749415, + "grad_norm": 0.9503993391990662, + "learning_rate": 4.8097725052640775e-06, + "loss": 0.6328, + "step": 2649 + }, + { + "epoch": 0.7757611241217799, + "grad_norm": 1.0529848337173462, + "learning_rate": 4.809625143592845e-06, + "loss": 0.6935, + "step": 2650 + }, + { + "epoch": 0.7760538641686182, + "grad_norm": 0.9462052583694458, + "learning_rate": 4.809477727125313e-06, + "loss": 0.6794, + "step": 2651 + }, + { + "epoch": 0.7763466042154566, + "grad_norm": 0.9663704037666321, + "learning_rate": 4.809330255864978e-06, + "loss": 0.6669, + "step": 2652 + }, + { + "epoch": 0.7766393442622951, + "grad_norm": 0.950873851776123, + "learning_rate": 4.809182729815338e-06, + "loss": 0.6715, + "step": 2653 + }, + { + "epoch": 0.7769320843091335, + "grad_norm": 0.9289637804031372, + "learning_rate": 4.809035148979895e-06, + "loss": 0.6572, + "step": 2654 + }, + { + "epoch": 0.7772248243559718, + "grad_norm": 0.9767483472824097, + "learning_rate": 4.8088875133621495e-06, + "loss": 0.6617, + "step": 2655 + }, + { + "epoch": 0.7775175644028103, + "grad_norm": 0.9646378755569458, + "learning_rate": 4.808739822965605e-06, + "loss": 0.6903, + "step": 2656 + }, + { + "epoch": 0.7778103044496487, + "grad_norm": 0.9915593266487122, + "learning_rate": 4.808592077793763e-06, + "loss": 0.7093, + "step": 2657 + }, + { + "epoch": 0.7781030444964872, + "grad_norm": 1.0065659284591675, + "learning_rate": 4.808444277850131e-06, + "loss": 0.6585, + "step": 2658 + }, + { + "epoch": 0.7783957845433255, + "grad_norm": 0.9971205592155457, + "learning_rate": 4.808296423138216e-06, + "loss": 0.6498, + "step": 2659 + }, + { + "epoch": 0.7786885245901639, + "grad_norm": 0.9649834036827087, + "learning_rate": 4.808148513661523e-06, + "loss": 0.6689, + "step": 2660 + }, + { + "epoch": 0.7789812646370023, + "grad_norm": 0.9203527569770813, + "learning_rate": 4.8080005494235645e-06, + "loss": 0.6207, + "step": 2661 + }, + { + "epoch": 0.7792740046838408, + "grad_norm": 0.9605453014373779, + "learning_rate": 4.807852530427849e-06, + "loss": 0.6258, + "step": 2662 + }, + { + "epoch": 0.7795667447306791, + "grad_norm": 0.932913601398468, + "learning_rate": 4.807704456677889e-06, + "loss": 0.6188, + "step": 2663 + }, + { + "epoch": 0.7798594847775175, + "grad_norm": 0.947376549243927, + "learning_rate": 4.807556328177198e-06, + "loss": 0.6534, + "step": 2664 + }, + { + "epoch": 0.780152224824356, + "grad_norm": 0.9965957403182983, + "learning_rate": 4.807408144929289e-06, + "loss": 0.6817, + "step": 2665 + }, + { + "epoch": 0.7804449648711944, + "grad_norm": 0.9827908873558044, + "learning_rate": 4.807259906937678e-06, + "loss": 0.6682, + "step": 2666 + }, + { + "epoch": 0.7807377049180327, + "grad_norm": 0.9536962509155273, + "learning_rate": 4.807111614205883e-06, + "loss": 0.6954, + "step": 2667 + }, + { + "epoch": 0.7810304449648712, + "grad_norm": 1.0213751792907715, + "learning_rate": 4.806963266737422e-06, + "loss": 0.7032, + "step": 2668 + }, + { + "epoch": 0.7813231850117096, + "grad_norm": 1.0035734176635742, + "learning_rate": 4.8068148645358136e-06, + "loss": 0.7013, + "step": 2669 + }, + { + "epoch": 0.781615925058548, + "grad_norm": 0.9489345550537109, + "learning_rate": 4.80666640760458e-06, + "loss": 0.6398, + "step": 2670 + }, + { + "epoch": 0.7819086651053864, + "grad_norm": 0.9474843740463257, + "learning_rate": 4.806517895947243e-06, + "loss": 0.6593, + "step": 2671 + }, + { + "epoch": 0.7822014051522248, + "grad_norm": 0.9690195322036743, + "learning_rate": 4.806369329567325e-06, + "loss": 0.66, + "step": 2672 + }, + { + "epoch": 0.7824941451990632, + "grad_norm": 0.9828530550003052, + "learning_rate": 4.8062207084683524e-06, + "loss": 0.656, + "step": 2673 + }, + { + "epoch": 0.7827868852459017, + "grad_norm": 0.9190521836280823, + "learning_rate": 4.806072032653849e-06, + "loss": 0.6089, + "step": 2674 + }, + { + "epoch": 0.78307962529274, + "grad_norm": 0.9984466433525085, + "learning_rate": 4.805923302127345e-06, + "loss": 0.7089, + "step": 2675 + }, + { + "epoch": 0.7833723653395784, + "grad_norm": 0.9686915874481201, + "learning_rate": 4.805774516892366e-06, + "loss": 0.6565, + "step": 2676 + }, + { + "epoch": 0.7836651053864169, + "grad_norm": 0.957123875617981, + "learning_rate": 4.805625676952446e-06, + "loss": 0.6883, + "step": 2677 + }, + { + "epoch": 0.7839578454332553, + "grad_norm": 1.006954312324524, + "learning_rate": 4.805476782311112e-06, + "loss": 0.6344, + "step": 2678 + }, + { + "epoch": 0.7842505854800936, + "grad_norm": 0.9505056738853455, + "learning_rate": 4.805327832971899e-06, + "loss": 0.6604, + "step": 2679 + }, + { + "epoch": 0.7845433255269321, + "grad_norm": 1.0098148584365845, + "learning_rate": 4.80517882893834e-06, + "loss": 0.6505, + "step": 2680 + }, + { + "epoch": 0.7848360655737705, + "grad_norm": 0.9593586921691895, + "learning_rate": 4.805029770213971e-06, + "loss": 0.6967, + "step": 2681 + }, + { + "epoch": 0.7851288056206089, + "grad_norm": 1.1332507133483887, + "learning_rate": 4.804880656802327e-06, + "loss": 0.6806, + "step": 2682 + }, + { + "epoch": 0.7854215456674473, + "grad_norm": 0.9509003162384033, + "learning_rate": 4.804731488706947e-06, + "loss": 0.622, + "step": 2683 + }, + { + "epoch": 0.7857142857142857, + "grad_norm": 0.9735237956047058, + "learning_rate": 4.80458226593137e-06, + "loss": 0.6644, + "step": 2684 + }, + { + "epoch": 0.7860070257611241, + "grad_norm": 0.9820671677589417, + "learning_rate": 4.804432988479135e-06, + "loss": 0.709, + "step": 2685 + }, + { + "epoch": 0.7862997658079626, + "grad_norm": 0.9330145120620728, + "learning_rate": 4.804283656353786e-06, + "loss": 0.6574, + "step": 2686 + }, + { + "epoch": 0.7865925058548009, + "grad_norm": 1.0071065425872803, + "learning_rate": 4.804134269558863e-06, + "loss": 0.655, + "step": 2687 + }, + { + "epoch": 0.7868852459016393, + "grad_norm": 0.9456788897514343, + "learning_rate": 4.803984828097913e-06, + "loss": 0.6457, + "step": 2688 + }, + { + "epoch": 0.7871779859484778, + "grad_norm": 1.0038093328475952, + "learning_rate": 4.80383533197448e-06, + "loss": 0.6601, + "step": 2689 + }, + { + "epoch": 0.7874707259953162, + "grad_norm": 1.0284347534179688, + "learning_rate": 4.8036857811921116e-06, + "loss": 0.6322, + "step": 2690 + }, + { + "epoch": 0.7877634660421545, + "grad_norm": 0.9614667296409607, + "learning_rate": 4.803536175754355e-06, + "loss": 0.6894, + "step": 2691 + }, + { + "epoch": 0.788056206088993, + "grad_norm": 0.941280722618103, + "learning_rate": 4.80338651566476e-06, + "loss": 0.6683, + "step": 2692 + }, + { + "epoch": 0.7883489461358314, + "grad_norm": 0.9835332632064819, + "learning_rate": 4.803236800926877e-06, + "loss": 0.7042, + "step": 2693 + }, + { + "epoch": 0.7886416861826698, + "grad_norm": 0.9373319745063782, + "learning_rate": 4.803087031544259e-06, + "loss": 0.6486, + "step": 2694 + }, + { + "epoch": 0.7889344262295082, + "grad_norm": 0.9817278385162354, + "learning_rate": 4.802937207520459e-06, + "loss": 0.6257, + "step": 2695 + }, + { + "epoch": 0.7892271662763466, + "grad_norm": 0.9311197400093079, + "learning_rate": 4.80278732885903e-06, + "loss": 0.6568, + "step": 2696 + }, + { + "epoch": 0.789519906323185, + "grad_norm": 0.9749744534492493, + "learning_rate": 4.80263739556353e-06, + "loss": 0.6478, + "step": 2697 + }, + { + "epoch": 0.7898126463700235, + "grad_norm": 0.9642316102981567, + "learning_rate": 4.802487407637516e-06, + "loss": 0.6123, + "step": 2698 + }, + { + "epoch": 0.7901053864168618, + "grad_norm": 0.9429492950439453, + "learning_rate": 4.802337365084545e-06, + "loss": 0.6873, + "step": 2699 + }, + { + "epoch": 0.7903981264637002, + "grad_norm": 0.9779683351516724, + "learning_rate": 4.802187267908178e-06, + "loss": 0.6706, + "step": 2700 + }, + { + "epoch": 0.7906908665105387, + "grad_norm": 0.9431127905845642, + "learning_rate": 4.8020371161119755e-06, + "loss": 0.6454, + "step": 2701 + }, + { + "epoch": 0.7909836065573771, + "grad_norm": 0.9789735674858093, + "learning_rate": 4.8018869096995005e-06, + "loss": 0.6676, + "step": 2702 + }, + { + "epoch": 0.7912763466042154, + "grad_norm": 0.9891870021820068, + "learning_rate": 4.801736648674316e-06, + "loss": 0.7038, + "step": 2703 + }, + { + "epoch": 0.7915690866510539, + "grad_norm": 0.964250922203064, + "learning_rate": 4.801586333039988e-06, + "loss": 0.6855, + "step": 2704 + }, + { + "epoch": 0.7918618266978923, + "grad_norm": 0.9797296524047852, + "learning_rate": 4.801435962800081e-06, + "loss": 0.7105, + "step": 2705 + }, + { + "epoch": 0.7921545667447307, + "grad_norm": 0.9742047786712646, + "learning_rate": 4.801285537958165e-06, + "loss": 0.6404, + "step": 2706 + }, + { + "epoch": 0.792447306791569, + "grad_norm": 0.9519373178482056, + "learning_rate": 4.801135058517806e-06, + "loss": 0.6551, + "step": 2707 + }, + { + "epoch": 0.7927400468384075, + "grad_norm": 0.9357604384422302, + "learning_rate": 4.800984524482576e-06, + "loss": 0.652, + "step": 2708 + }, + { + "epoch": 0.7930327868852459, + "grad_norm": 0.9809781312942505, + "learning_rate": 4.800833935856047e-06, + "loss": 0.6623, + "step": 2709 + }, + { + "epoch": 0.7933255269320844, + "grad_norm": 0.9646432995796204, + "learning_rate": 4.8006832926417905e-06, + "loss": 0.6386, + "step": 2710 + }, + { + "epoch": 0.7936182669789227, + "grad_norm": 0.9978732466697693, + "learning_rate": 4.80053259484338e-06, + "loss": 0.662, + "step": 2711 + }, + { + "epoch": 0.7939110070257611, + "grad_norm": 0.9653863310813904, + "learning_rate": 4.800381842464392e-06, + "loss": 0.6857, + "step": 2712 + }, + { + "epoch": 0.7942037470725996, + "grad_norm": 0.9932756423950195, + "learning_rate": 4.800231035508404e-06, + "loss": 0.6719, + "step": 2713 + }, + { + "epoch": 0.794496487119438, + "grad_norm": 0.9626267552375793, + "learning_rate": 4.800080173978991e-06, + "loss": 0.652, + "step": 2714 + }, + { + "epoch": 0.7947892271662763, + "grad_norm": 0.956934928894043, + "learning_rate": 4.799929257879735e-06, + "loss": 0.6374, + "step": 2715 + }, + { + "epoch": 0.7950819672131147, + "grad_norm": 0.9747413396835327, + "learning_rate": 4.799778287214216e-06, + "loss": 0.6651, + "step": 2716 + }, + { + "epoch": 0.7953747072599532, + "grad_norm": 0.9721114635467529, + "learning_rate": 4.799627261986014e-06, + "loss": 0.6659, + "step": 2717 + }, + { + "epoch": 0.7956674473067916, + "grad_norm": 0.9759097695350647, + "learning_rate": 4.7994761821987145e-06, + "loss": 0.6977, + "step": 2718 + }, + { + "epoch": 0.7959601873536299, + "grad_norm": 1.0231525897979736, + "learning_rate": 4.7993250478559e-06, + "loss": 0.6881, + "step": 2719 + }, + { + "epoch": 0.7962529274004684, + "grad_norm": 0.9695059061050415, + "learning_rate": 4.799173858961158e-06, + "loss": 0.6605, + "step": 2720 + }, + { + "epoch": 0.7965456674473068, + "grad_norm": 1.0050657987594604, + "learning_rate": 4.799022615518074e-06, + "loss": 0.6742, + "step": 2721 + }, + { + "epoch": 0.7968384074941453, + "grad_norm": 1.0182703733444214, + "learning_rate": 4.798871317530237e-06, + "loss": 0.6485, + "step": 2722 + }, + { + "epoch": 0.7971311475409836, + "grad_norm": 0.9287837147712708, + "learning_rate": 4.798719965001236e-06, + "loss": 0.6478, + "step": 2723 + }, + { + "epoch": 0.797423887587822, + "grad_norm": 1.0035138130187988, + "learning_rate": 4.798568557934662e-06, + "loss": 0.7138, + "step": 2724 + }, + { + "epoch": 0.7977166276346604, + "grad_norm": 1.0174108743667603, + "learning_rate": 4.798417096334107e-06, + "loss": 0.6764, + "step": 2725 + }, + { + "epoch": 0.7980093676814989, + "grad_norm": 0.8886292576789856, + "learning_rate": 4.7982655802031655e-06, + "loss": 0.6001, + "step": 2726 + }, + { + "epoch": 0.7983021077283372, + "grad_norm": 0.9617050886154175, + "learning_rate": 4.798114009545432e-06, + "loss": 0.6377, + "step": 2727 + }, + { + "epoch": 0.7985948477751756, + "grad_norm": 0.9752925038337708, + "learning_rate": 4.797962384364501e-06, + "loss": 0.6729, + "step": 2728 + }, + { + "epoch": 0.7988875878220141, + "grad_norm": 0.9588488936424255, + "learning_rate": 4.797810704663971e-06, + "loss": 0.6914, + "step": 2729 + }, + { + "epoch": 0.7991803278688525, + "grad_norm": 0.984188973903656, + "learning_rate": 4.797658970447441e-06, + "loss": 0.6853, + "step": 2730 + }, + { + "epoch": 0.7994730679156908, + "grad_norm": 0.9679237008094788, + "learning_rate": 4.79750718171851e-06, + "loss": 0.5939, + "step": 2731 + }, + { + "epoch": 0.7997658079625293, + "grad_norm": 0.9575273990631104, + "learning_rate": 4.7973553384807806e-06, + "loss": 0.6584, + "step": 2732 + }, + { + "epoch": 0.8000585480093677, + "grad_norm": 0.9942688345909119, + "learning_rate": 4.797203440737854e-06, + "loss": 0.6448, + "step": 2733 + }, + { + "epoch": 0.800351288056206, + "grad_norm": 0.9634747505187988, + "learning_rate": 4.797051488493335e-06, + "loss": 0.6656, + "step": 2734 + }, + { + "epoch": 0.8006440281030445, + "grad_norm": 0.9566713571548462, + "learning_rate": 4.796899481750828e-06, + "loss": 0.678, + "step": 2735 + }, + { + "epoch": 0.8009367681498829, + "grad_norm": 0.9441654086112976, + "learning_rate": 4.796747420513939e-06, + "loss": 0.6846, + "step": 2736 + }, + { + "epoch": 0.8012295081967213, + "grad_norm": 0.9786010384559631, + "learning_rate": 4.796595304786276e-06, + "loss": 0.6411, + "step": 2737 + }, + { + "epoch": 0.8015222482435597, + "grad_norm": 0.9903592467308044, + "learning_rate": 4.796443134571448e-06, + "loss": 0.6701, + "step": 2738 + }, + { + "epoch": 0.8018149882903981, + "grad_norm": 0.9814341068267822, + "learning_rate": 4.796290909873066e-06, + "loss": 0.6899, + "step": 2739 + }, + { + "epoch": 0.8021077283372365, + "grad_norm": 0.9604585766792297, + "learning_rate": 4.796138630694741e-06, + "loss": 0.6364, + "step": 2740 + }, + { + "epoch": 0.802400468384075, + "grad_norm": 1.0619951486587524, + "learning_rate": 4.795986297040086e-06, + "loss": 0.6929, + "step": 2741 + }, + { + "epoch": 0.8026932084309133, + "grad_norm": 0.967383623123169, + "learning_rate": 4.795833908912714e-06, + "loss": 0.6882, + "step": 2742 + }, + { + "epoch": 0.8029859484777517, + "grad_norm": 1.0610599517822266, + "learning_rate": 4.795681466316242e-06, + "loss": 0.7, + "step": 2743 + }, + { + "epoch": 0.8032786885245902, + "grad_norm": 0.9786458611488342, + "learning_rate": 4.7955289692542865e-06, + "loss": 0.6613, + "step": 2744 + }, + { + "epoch": 0.8035714285714286, + "grad_norm": 0.9825703501701355, + "learning_rate": 4.795376417730465e-06, + "loss": 0.6863, + "step": 2745 + }, + { + "epoch": 0.8038641686182669, + "grad_norm": 0.9486829042434692, + "learning_rate": 4.795223811748397e-06, + "loss": 0.6492, + "step": 2746 + }, + { + "epoch": 0.8041569086651054, + "grad_norm": 0.9282700419425964, + "learning_rate": 4.795071151311703e-06, + "loss": 0.6157, + "step": 2747 + }, + { + "epoch": 0.8044496487119438, + "grad_norm": 0.9706962704658508, + "learning_rate": 4.794918436424005e-06, + "loss": 0.6724, + "step": 2748 + }, + { + "epoch": 0.8047423887587822, + "grad_norm": 0.9484982490539551, + "learning_rate": 4.794765667088926e-06, + "loss": 0.6274, + "step": 2749 + }, + { + "epoch": 0.8050351288056206, + "grad_norm": 0.9901068210601807, + "learning_rate": 4.794612843310091e-06, + "loss": 0.7183, + "step": 2750 + }, + { + "epoch": 0.805327868852459, + "grad_norm": 1.0111565589904785, + "learning_rate": 4.794459965091126e-06, + "loss": 0.6919, + "step": 2751 + }, + { + "epoch": 0.8056206088992974, + "grad_norm": 0.9373451471328735, + "learning_rate": 4.794307032435656e-06, + "loss": 0.6592, + "step": 2752 + }, + { + "epoch": 0.8059133489461359, + "grad_norm": 0.9836905598640442, + "learning_rate": 4.794154045347313e-06, + "loss": 0.6853, + "step": 2753 + }, + { + "epoch": 0.8062060889929742, + "grad_norm": 1.024569034576416, + "learning_rate": 4.7940010038297235e-06, + "loss": 0.6511, + "step": 2754 + }, + { + "epoch": 0.8064988290398126, + "grad_norm": 0.9943025708198547, + "learning_rate": 4.793847907886519e-06, + "loss": 0.6675, + "step": 2755 + }, + { + "epoch": 0.8067915690866511, + "grad_norm": 0.9916179180145264, + "learning_rate": 4.7936947575213326e-06, + "loss": 0.6309, + "step": 2756 + }, + { + "epoch": 0.8070843091334895, + "grad_norm": 1.0292935371398926, + "learning_rate": 4.793541552737797e-06, + "loss": 0.6666, + "step": 2757 + }, + { + "epoch": 0.8073770491803278, + "grad_norm": 0.9629769325256348, + "learning_rate": 4.7933882935395485e-06, + "loss": 0.6069, + "step": 2758 + }, + { + "epoch": 0.8076697892271663, + "grad_norm": 0.9945060014724731, + "learning_rate": 4.79323497993022e-06, + "loss": 0.6691, + "step": 2759 + }, + { + "epoch": 0.8079625292740047, + "grad_norm": 0.9850197434425354, + "learning_rate": 4.793081611913452e-06, + "loss": 0.6972, + "step": 2760 + }, + { + "epoch": 0.8082552693208431, + "grad_norm": 1.0651614665985107, + "learning_rate": 4.792928189492883e-06, + "loss": 0.6879, + "step": 2761 + }, + { + "epoch": 0.8085480093676815, + "grad_norm": 1.0271738767623901, + "learning_rate": 4.792774712672153e-06, + "loss": 0.6852, + "step": 2762 + }, + { + "epoch": 0.8088407494145199, + "grad_norm": 1.0159739255905151, + "learning_rate": 4.792621181454901e-06, + "loss": 0.7249, + "step": 2763 + }, + { + "epoch": 0.8091334894613583, + "grad_norm": 0.9443522691726685, + "learning_rate": 4.792467595844771e-06, + "loss": 0.6629, + "step": 2764 + }, + { + "epoch": 0.8094262295081968, + "grad_norm": 1.0324227809906006, + "learning_rate": 4.792313955845407e-06, + "loss": 0.6783, + "step": 2765 + }, + { + "epoch": 0.8097189695550351, + "grad_norm": 1.0461944341659546, + "learning_rate": 4.792160261460453e-06, + "loss": 0.6611, + "step": 2766 + }, + { + "epoch": 0.8100117096018735, + "grad_norm": 0.9317655563354492, + "learning_rate": 4.792006512693558e-06, + "loss": 0.5912, + "step": 2767 + }, + { + "epoch": 0.810304449648712, + "grad_norm": 0.9601888060569763, + "learning_rate": 4.791852709548368e-06, + "loss": 0.6685, + "step": 2768 + }, + { + "epoch": 0.8105971896955504, + "grad_norm": 1.013680338859558, + "learning_rate": 4.791698852028531e-06, + "loss": 0.6626, + "step": 2769 + }, + { + "epoch": 0.8108899297423887, + "grad_norm": 0.9827961921691895, + "learning_rate": 4.7915449401377e-06, + "loss": 0.6856, + "step": 2770 + }, + { + "epoch": 0.8111826697892272, + "grad_norm": 0.9355833530426025, + "learning_rate": 4.791390973879524e-06, + "loss": 0.5856, + "step": 2771 + }, + { + "epoch": 0.8114754098360656, + "grad_norm": 0.9321961402893066, + "learning_rate": 4.791236953257658e-06, + "loss": 0.608, + "step": 2772 + }, + { + "epoch": 0.811768149882904, + "grad_norm": 0.9307120442390442, + "learning_rate": 4.791082878275754e-06, + "loss": 0.6526, + "step": 2773 + }, + { + "epoch": 0.8120608899297423, + "grad_norm": 0.9692187309265137, + "learning_rate": 4.790928748937468e-06, + "loss": 0.6263, + "step": 2774 + }, + { + "epoch": 0.8123536299765808, + "grad_norm": 0.969426155090332, + "learning_rate": 4.7907745652464585e-06, + "loss": 0.6556, + "step": 2775 + }, + { + "epoch": 0.8126463700234192, + "grad_norm": 0.9912156462669373, + "learning_rate": 4.790620327206381e-06, + "loss": 0.6833, + "step": 2776 + }, + { + "epoch": 0.8129391100702577, + "grad_norm": 0.9600932002067566, + "learning_rate": 4.790466034820897e-06, + "loss": 0.6291, + "step": 2777 + }, + { + "epoch": 0.813231850117096, + "grad_norm": 0.9848114848136902, + "learning_rate": 4.790311688093667e-06, + "loss": 0.6688, + "step": 2778 + }, + { + "epoch": 0.8135245901639344, + "grad_norm": 0.960176944732666, + "learning_rate": 4.790157287028352e-06, + "loss": 0.6489, + "step": 2779 + }, + { + "epoch": 0.8138173302107728, + "grad_norm": 0.9893671870231628, + "learning_rate": 4.790002831628614e-06, + "loss": 0.6287, + "step": 2780 + }, + { + "epoch": 0.8141100702576113, + "grad_norm": 0.9711840748786926, + "learning_rate": 4.789848321898121e-06, + "loss": 0.6809, + "step": 2781 + }, + { + "epoch": 0.8144028103044496, + "grad_norm": 0.9345763921737671, + "learning_rate": 4.789693757840535e-06, + "loss": 0.6423, + "step": 2782 + }, + { + "epoch": 0.814695550351288, + "grad_norm": 0.936966598033905, + "learning_rate": 4.789539139459525e-06, + "loss": 0.6552, + "step": 2783 + }, + { + "epoch": 0.8149882903981265, + "grad_norm": 1.0480449199676514, + "learning_rate": 4.789384466758759e-06, + "loss": 0.7119, + "step": 2784 + }, + { + "epoch": 0.8152810304449649, + "grad_norm": 1.0668872594833374, + "learning_rate": 4.789229739741907e-06, + "loss": 0.7019, + "step": 2785 + }, + { + "epoch": 0.8155737704918032, + "grad_norm": 0.958901584148407, + "learning_rate": 4.789074958412639e-06, + "loss": 0.6785, + "step": 2786 + }, + { + "epoch": 0.8158665105386417, + "grad_norm": 0.9307546019554138, + "learning_rate": 4.7889201227746285e-06, + "loss": 0.6143, + "step": 2787 + }, + { + "epoch": 0.8161592505854801, + "grad_norm": 0.9399756193161011, + "learning_rate": 4.788765232831548e-06, + "loss": 0.6489, + "step": 2788 + }, + { + "epoch": 0.8164519906323185, + "grad_norm": 1.0295403003692627, + "learning_rate": 4.788610288587074e-06, + "loss": 0.6757, + "step": 2789 + }, + { + "epoch": 0.8167447306791569, + "grad_norm": 0.9341875910758972, + "learning_rate": 4.7884552900448786e-06, + "loss": 0.6639, + "step": 2790 + }, + { + "epoch": 0.8170374707259953, + "grad_norm": 1.0031592845916748, + "learning_rate": 4.788300237208644e-06, + "loss": 0.6725, + "step": 2791 + }, + { + "epoch": 0.8173302107728337, + "grad_norm": 0.9818474054336548, + "learning_rate": 4.788145130082045e-06, + "loss": 0.6429, + "step": 2792 + }, + { + "epoch": 0.8176229508196722, + "grad_norm": 0.9538310170173645, + "learning_rate": 4.7879899686687645e-06, + "loss": 0.6793, + "step": 2793 + }, + { + "epoch": 0.8179156908665105, + "grad_norm": 0.9440767168998718, + "learning_rate": 4.787834752972482e-06, + "loss": 0.6687, + "step": 2794 + }, + { + "epoch": 0.8182084309133489, + "grad_norm": 0.9863094091415405, + "learning_rate": 4.78767948299688e-06, + "loss": 0.7103, + "step": 2795 + }, + { + "epoch": 0.8185011709601874, + "grad_norm": 1.0029956102371216, + "learning_rate": 4.787524158745643e-06, + "loss": 0.7116, + "step": 2796 + }, + { + "epoch": 0.8187939110070258, + "grad_norm": 0.9633566737174988, + "learning_rate": 4.787368780222456e-06, + "loss": 0.6393, + "step": 2797 + }, + { + "epoch": 0.8190866510538641, + "grad_norm": 0.9286820292472839, + "learning_rate": 4.787213347431006e-06, + "loss": 0.6572, + "step": 2798 + }, + { + "epoch": 0.8193793911007026, + "grad_norm": 1.0330345630645752, + "learning_rate": 4.787057860374978e-06, + "loss": 0.6763, + "step": 2799 + }, + { + "epoch": 0.819672131147541, + "grad_norm": 1.044358253479004, + "learning_rate": 4.7869023190580644e-06, + "loss": 0.7216, + "step": 2800 + }, + { + "epoch": 0.8199648711943794, + "grad_norm": 0.9373910427093506, + "learning_rate": 4.7867467234839536e-06, + "loss": 0.6478, + "step": 2801 + }, + { + "epoch": 0.8202576112412178, + "grad_norm": 0.9878972768783569, + "learning_rate": 4.786591073656337e-06, + "loss": 0.6877, + "step": 2802 + }, + { + "epoch": 0.8205503512880562, + "grad_norm": 0.9876794815063477, + "learning_rate": 4.786435369578908e-06, + "loss": 0.6091, + "step": 2803 + }, + { + "epoch": 0.8208430913348946, + "grad_norm": 0.914695680141449, + "learning_rate": 4.786279611255361e-06, + "loss": 0.6287, + "step": 2804 + }, + { + "epoch": 0.8211358313817331, + "grad_norm": 0.9658267498016357, + "learning_rate": 4.786123798689391e-06, + "loss": 0.6661, + "step": 2805 + }, + { + "epoch": 0.8214285714285714, + "grad_norm": 0.9400615692138672, + "learning_rate": 4.785967931884694e-06, + "loss": 0.6455, + "step": 2806 + }, + { + "epoch": 0.8217213114754098, + "grad_norm": 0.9206145405769348, + "learning_rate": 4.785812010844969e-06, + "loss": 0.6428, + "step": 2807 + }, + { + "epoch": 0.8220140515222483, + "grad_norm": 0.9424629807472229, + "learning_rate": 4.785656035573915e-06, + "loss": 0.6872, + "step": 2808 + }, + { + "epoch": 0.8223067915690867, + "grad_norm": 0.999724805355072, + "learning_rate": 4.785500006075233e-06, + "loss": 0.6879, + "step": 2809 + }, + { + "epoch": 0.822599531615925, + "grad_norm": 1.0169602632522583, + "learning_rate": 4.785343922352624e-06, + "loss": 0.6566, + "step": 2810 + }, + { + "epoch": 0.8228922716627635, + "grad_norm": 0.9912406206130981, + "learning_rate": 4.78518778440979e-06, + "loss": 0.6796, + "step": 2811 + }, + { + "epoch": 0.8231850117096019, + "grad_norm": 0.9866358041763306, + "learning_rate": 4.785031592250438e-06, + "loss": 0.6377, + "step": 2812 + }, + { + "epoch": 0.8234777517564403, + "grad_norm": 1.2223234176635742, + "learning_rate": 4.784875345878273e-06, + "loss": 0.6821, + "step": 2813 + }, + { + "epoch": 0.8237704918032787, + "grad_norm": 0.9136068224906921, + "learning_rate": 4.784719045297001e-06, + "loss": 0.674, + "step": 2814 + }, + { + "epoch": 0.8240632318501171, + "grad_norm": 0.924764096736908, + "learning_rate": 4.78456269051033e-06, + "loss": 0.6346, + "step": 2815 + }, + { + "epoch": 0.8243559718969555, + "grad_norm": 0.9709203839302063, + "learning_rate": 4.78440628152197e-06, + "loss": 0.6762, + "step": 2816 + }, + { + "epoch": 0.824648711943794, + "grad_norm": 0.9138123989105225, + "learning_rate": 4.784249818335633e-06, + "loss": 0.635, + "step": 2817 + }, + { + "epoch": 0.8249414519906323, + "grad_norm": 0.9559844732284546, + "learning_rate": 4.78409330095503e-06, + "loss": 0.6349, + "step": 2818 + }, + { + "epoch": 0.8252341920374707, + "grad_norm": 0.9166303873062134, + "learning_rate": 4.783936729383874e-06, + "loss": 0.6531, + "step": 2819 + }, + { + "epoch": 0.8255269320843092, + "grad_norm": 0.9708403944969177, + "learning_rate": 4.783780103625881e-06, + "loss": 0.6238, + "step": 2820 + }, + { + "epoch": 0.8258196721311475, + "grad_norm": 0.9540261030197144, + "learning_rate": 4.783623423684767e-06, + "loss": 0.65, + "step": 2821 + }, + { + "epoch": 0.8261124121779859, + "grad_norm": 0.9786800742149353, + "learning_rate": 4.783466689564247e-06, + "loss": 0.6655, + "step": 2822 + }, + { + "epoch": 0.8264051522248244, + "grad_norm": 0.987142026424408, + "learning_rate": 4.7833099012680426e-06, + "loss": 0.6705, + "step": 2823 + }, + { + "epoch": 0.8266978922716628, + "grad_norm": 1.0127766132354736, + "learning_rate": 4.783153058799871e-06, + "loss": 0.671, + "step": 2824 + }, + { + "epoch": 0.8269906323185011, + "grad_norm": 0.9725877642631531, + "learning_rate": 4.7829961621634545e-06, + "loss": 0.7128, + "step": 2825 + }, + { + "epoch": 0.8272833723653396, + "grad_norm": 0.8903459310531616, + "learning_rate": 4.782839211362515e-06, + "loss": 0.6188, + "step": 2826 + }, + { + "epoch": 0.827576112412178, + "grad_norm": 0.9440898299217224, + "learning_rate": 4.782682206400778e-06, + "loss": 0.6647, + "step": 2827 + }, + { + "epoch": 0.8278688524590164, + "grad_norm": 0.9490867257118225, + "learning_rate": 4.7825251472819665e-06, + "loss": 0.6522, + "step": 2828 + }, + { + "epoch": 0.8281615925058547, + "grad_norm": 0.9416446089744568, + "learning_rate": 4.782368034009807e-06, + "loss": 0.6588, + "step": 2829 + }, + { + "epoch": 0.8284543325526932, + "grad_norm": 0.9408036470413208, + "learning_rate": 4.782210866588028e-06, + "loss": 0.6228, + "step": 2830 + }, + { + "epoch": 0.8287470725995316, + "grad_norm": 1.0118452310562134, + "learning_rate": 4.7820536450203565e-06, + "loss": 0.6964, + "step": 2831 + }, + { + "epoch": 0.8290398126463701, + "grad_norm": 1.0136206150054932, + "learning_rate": 4.7818963693105254e-06, + "loss": 0.6624, + "step": 2832 + }, + { + "epoch": 0.8293325526932084, + "grad_norm": 0.9894269704818726, + "learning_rate": 4.781739039462264e-06, + "loss": 0.6464, + "step": 2833 + }, + { + "epoch": 0.8296252927400468, + "grad_norm": 0.9397152066230774, + "learning_rate": 4.781581655479306e-06, + "loss": 0.6671, + "step": 2834 + }, + { + "epoch": 0.8299180327868853, + "grad_norm": 0.9859839081764221, + "learning_rate": 4.7814242173653835e-06, + "loss": 0.6846, + "step": 2835 + }, + { + "epoch": 0.8302107728337237, + "grad_norm": 0.9844891428947449, + "learning_rate": 4.781266725124234e-06, + "loss": 0.6554, + "step": 2836 + }, + { + "epoch": 0.830503512880562, + "grad_norm": 0.9960318803787231, + "learning_rate": 4.781109178759593e-06, + "loss": 0.6626, + "step": 2837 + }, + { + "epoch": 0.8307962529274004, + "grad_norm": 1.0031254291534424, + "learning_rate": 4.780951578275198e-06, + "loss": 0.6699, + "step": 2838 + }, + { + "epoch": 0.8310889929742389, + "grad_norm": 0.9829326272010803, + "learning_rate": 4.780793923674791e-06, + "loss": 0.6956, + "step": 2839 + }, + { + "epoch": 0.8313817330210773, + "grad_norm": 0.9773803353309631, + "learning_rate": 4.7806362149621065e-06, + "loss": 0.6218, + "step": 2840 + }, + { + "epoch": 0.8316744730679156, + "grad_norm": 0.9764754772186279, + "learning_rate": 4.780478452140892e-06, + "loss": 0.6735, + "step": 2841 + }, + { + "epoch": 0.8319672131147541, + "grad_norm": 1.0023903846740723, + "learning_rate": 4.7803206352148865e-06, + "loss": 0.6857, + "step": 2842 + }, + { + "epoch": 0.8322599531615925, + "grad_norm": 0.9837384819984436, + "learning_rate": 4.780162764187838e-06, + "loss": 0.6192, + "step": 2843 + }, + { + "epoch": 0.832552693208431, + "grad_norm": 0.9869927167892456, + "learning_rate": 4.780004839063488e-06, + "loss": 0.6899, + "step": 2844 + }, + { + "epoch": 0.8328454332552693, + "grad_norm": 1.0096521377563477, + "learning_rate": 4.779846859845586e-06, + "loss": 0.6991, + "step": 2845 + }, + { + "epoch": 0.8331381733021077, + "grad_norm": 1.0087250471115112, + "learning_rate": 4.779688826537878e-06, + "loss": 0.658, + "step": 2846 + }, + { + "epoch": 0.8334309133489461, + "grad_norm": 0.948125958442688, + "learning_rate": 4.779530739144116e-06, + "loss": 0.6743, + "step": 2847 + }, + { + "epoch": 0.8337236533957846, + "grad_norm": 0.9743801951408386, + "learning_rate": 4.779372597668048e-06, + "loss": 0.6438, + "step": 2848 + }, + { + "epoch": 0.8340163934426229, + "grad_norm": 0.920917809009552, + "learning_rate": 4.779214402113429e-06, + "loss": 0.6399, + "step": 2849 + }, + { + "epoch": 0.8343091334894613, + "grad_norm": 0.9702663421630859, + "learning_rate": 4.779056152484009e-06, + "loss": 0.6563, + "step": 2850 + }, + { + "epoch": 0.8346018735362998, + "grad_norm": 0.9882484674453735, + "learning_rate": 4.778897848783544e-06, + "loss": 0.6133, + "step": 2851 + }, + { + "epoch": 0.8348946135831382, + "grad_norm": 1.005784273147583, + "learning_rate": 4.77873949101579e-06, + "loss": 0.6801, + "step": 2852 + }, + { + "epoch": 0.8351873536299765, + "grad_norm": 0.9066639542579651, + "learning_rate": 4.778581079184504e-06, + "loss": 0.6488, + "step": 2853 + }, + { + "epoch": 0.835480093676815, + "grad_norm": 0.9285972118377686, + "learning_rate": 4.778422613293444e-06, + "loss": 0.6847, + "step": 2854 + }, + { + "epoch": 0.8357728337236534, + "grad_norm": 0.9481026530265808, + "learning_rate": 4.77826409334637e-06, + "loss": 0.6815, + "step": 2855 + }, + { + "epoch": 0.8360655737704918, + "grad_norm": 1.017000675201416, + "learning_rate": 4.778105519347042e-06, + "loss": 0.6947, + "step": 2856 + }, + { + "epoch": 0.8363583138173302, + "grad_norm": 1.0177254676818848, + "learning_rate": 4.777946891299224e-06, + "loss": 0.6775, + "step": 2857 + }, + { + "epoch": 0.8366510538641686, + "grad_norm": 0.922105073928833, + "learning_rate": 4.777788209206677e-06, + "loss": 0.6457, + "step": 2858 + }, + { + "epoch": 0.836943793911007, + "grad_norm": 1.2273157835006714, + "learning_rate": 4.777629473073169e-06, + "loss": 0.6415, + "step": 2859 + }, + { + "epoch": 0.8372365339578455, + "grad_norm": 0.9534598588943481, + "learning_rate": 4.777470682902462e-06, + "loss": 0.617, + "step": 2860 + }, + { + "epoch": 0.8375292740046838, + "grad_norm": 0.9594615697860718, + "learning_rate": 4.777311838698327e-06, + "loss": 0.6292, + "step": 2861 + }, + { + "epoch": 0.8378220140515222, + "grad_norm": 0.9812807440757751, + "learning_rate": 4.77715294046453e-06, + "loss": 0.6927, + "step": 2862 + }, + { + "epoch": 0.8381147540983607, + "grad_norm": 0.9805405735969543, + "learning_rate": 4.776993988204843e-06, + "loss": 0.6247, + "step": 2863 + }, + { + "epoch": 0.8384074941451991, + "grad_norm": 0.9548155069351196, + "learning_rate": 4.776834981923035e-06, + "loss": 0.6284, + "step": 2864 + }, + { + "epoch": 0.8387002341920374, + "grad_norm": 1.0176641941070557, + "learning_rate": 4.776675921622881e-06, + "loss": 0.6932, + "step": 2865 + }, + { + "epoch": 0.8389929742388759, + "grad_norm": 0.9421233534812927, + "learning_rate": 4.776516807308153e-06, + "loss": 0.6608, + "step": 2866 + }, + { + "epoch": 0.8392857142857143, + "grad_norm": 0.9748751521110535, + "learning_rate": 4.776357638982626e-06, + "loss": 0.6895, + "step": 2867 + }, + { + "epoch": 0.8395784543325527, + "grad_norm": 0.9636788368225098, + "learning_rate": 4.776198416650076e-06, + "loss": 0.6891, + "step": 2868 + }, + { + "epoch": 0.8398711943793911, + "grad_norm": 0.9911706447601318, + "learning_rate": 4.776039140314283e-06, + "loss": 0.666, + "step": 2869 + }, + { + "epoch": 0.8401639344262295, + "grad_norm": 1.023356318473816, + "learning_rate": 4.775879809979023e-06, + "loss": 0.6942, + "step": 2870 + }, + { + "epoch": 0.8404566744730679, + "grad_norm": 0.9393298625946045, + "learning_rate": 4.775720425648077e-06, + "loss": 0.6967, + "step": 2871 + }, + { + "epoch": 0.8407494145199064, + "grad_norm": 0.9929444789886475, + "learning_rate": 4.775560987325227e-06, + "loss": 0.6729, + "step": 2872 + }, + { + "epoch": 0.8410421545667447, + "grad_norm": 1.0561363697052002, + "learning_rate": 4.775401495014256e-06, + "loss": 0.6838, + "step": 2873 + }, + { + "epoch": 0.8413348946135831, + "grad_norm": 1.0335297584533691, + "learning_rate": 4.775241948718947e-06, + "loss": 0.6769, + "step": 2874 + }, + { + "epoch": 0.8416276346604216, + "grad_norm": 0.9206576347351074, + "learning_rate": 4.775082348443086e-06, + "loss": 0.6578, + "step": 2875 + }, + { + "epoch": 0.84192037470726, + "grad_norm": 0.9513888955116272, + "learning_rate": 4.774922694190458e-06, + "loss": 0.6654, + "step": 2876 + }, + { + "epoch": 0.8422131147540983, + "grad_norm": 0.9198861122131348, + "learning_rate": 4.774762985964852e-06, + "loss": 0.6409, + "step": 2877 + }, + { + "epoch": 0.8425058548009368, + "grad_norm": 0.9631726145744324, + "learning_rate": 4.774603223770058e-06, + "loss": 0.636, + "step": 2878 + }, + { + "epoch": 0.8427985948477752, + "grad_norm": 0.9408578276634216, + "learning_rate": 4.774443407609865e-06, + "loss": 0.6219, + "step": 2879 + }, + { + "epoch": 0.8430913348946136, + "grad_norm": 0.9608256816864014, + "learning_rate": 4.774283537488066e-06, + "loss": 0.6188, + "step": 2880 + }, + { + "epoch": 0.843384074941452, + "grad_norm": 0.9678825736045837, + "learning_rate": 4.7741236134084534e-06, + "loss": 0.6329, + "step": 2881 + }, + { + "epoch": 0.8436768149882904, + "grad_norm": 0.9444988965988159, + "learning_rate": 4.7739636353748196e-06, + "loss": 0.6641, + "step": 2882 + }, + { + "epoch": 0.8439695550351288, + "grad_norm": 0.9564294815063477, + "learning_rate": 4.773803603390963e-06, + "loss": 0.6508, + "step": 2883 + }, + { + "epoch": 0.8442622950819673, + "grad_norm": 1.0194957256317139, + "learning_rate": 4.773643517460679e-06, + "loss": 0.6445, + "step": 2884 + }, + { + "epoch": 0.8445550351288056, + "grad_norm": 0.979212760925293, + "learning_rate": 4.773483377587766e-06, + "loss": 0.6663, + "step": 2885 + }, + { + "epoch": 0.844847775175644, + "grad_norm": 0.9966521859169006, + "learning_rate": 4.7733231837760215e-06, + "loss": 0.6364, + "step": 2886 + }, + { + "epoch": 0.8451405152224825, + "grad_norm": 1.0453996658325195, + "learning_rate": 4.773162936029249e-06, + "loss": 0.7121, + "step": 2887 + }, + { + "epoch": 0.8454332552693209, + "grad_norm": 0.9744501709938049, + "learning_rate": 4.773002634351248e-06, + "loss": 0.6639, + "step": 2888 + }, + { + "epoch": 0.8457259953161592, + "grad_norm": 1.012278437614441, + "learning_rate": 4.772842278745824e-06, + "loss": 0.6822, + "step": 2889 + }, + { + "epoch": 0.8460187353629977, + "grad_norm": 0.9245831370353699, + "learning_rate": 4.77268186921678e-06, + "loss": 0.6276, + "step": 2890 + }, + { + "epoch": 0.8463114754098361, + "grad_norm": 0.9582067728042603, + "learning_rate": 4.772521405767922e-06, + "loss": 0.6579, + "step": 2891 + }, + { + "epoch": 0.8466042154566745, + "grad_norm": 1.0078071355819702, + "learning_rate": 4.772360888403056e-06, + "loss": 0.6788, + "step": 2892 + }, + { + "epoch": 0.8468969555035128, + "grad_norm": 1.0347754955291748, + "learning_rate": 4.772200317125992e-06, + "loss": 0.6681, + "step": 2893 + }, + { + "epoch": 0.8471896955503513, + "grad_norm": 0.9510510563850403, + "learning_rate": 4.77203969194054e-06, + "loss": 0.6762, + "step": 2894 + }, + { + "epoch": 0.8474824355971897, + "grad_norm": 0.9556336402893066, + "learning_rate": 4.771879012850509e-06, + "loss": 0.6563, + "step": 2895 + }, + { + "epoch": 0.8477751756440282, + "grad_norm": 0.9943619966506958, + "learning_rate": 4.771718279859712e-06, + "loss": 0.6441, + "step": 2896 + }, + { + "epoch": 0.8480679156908665, + "grad_norm": 1.0159027576446533, + "learning_rate": 4.771557492971962e-06, + "loss": 0.6352, + "step": 2897 + }, + { + "epoch": 0.8483606557377049, + "grad_norm": 0.9739909768104553, + "learning_rate": 4.7713966521910744e-06, + "loss": 0.6549, + "step": 2898 + }, + { + "epoch": 0.8486533957845434, + "grad_norm": 0.9666604399681091, + "learning_rate": 4.7712357575208644e-06, + "loss": 0.6884, + "step": 2899 + }, + { + "epoch": 0.8489461358313818, + "grad_norm": 0.9393750429153442, + "learning_rate": 4.77107480896515e-06, + "loss": 0.6275, + "step": 2900 + }, + { + "epoch": 0.8492388758782201, + "grad_norm": 0.9287746548652649, + "learning_rate": 4.77091380652775e-06, + "loss": 0.6624, + "step": 2901 + }, + { + "epoch": 0.8495316159250585, + "grad_norm": 0.9978668689727783, + "learning_rate": 4.7707527502124825e-06, + "loss": 0.6948, + "step": 2902 + }, + { + "epoch": 0.849824355971897, + "grad_norm": 0.9747798442840576, + "learning_rate": 4.770591640023171e-06, + "loss": 0.67, + "step": 2903 + }, + { + "epoch": 0.8501170960187353, + "grad_norm": 0.9695436954498291, + "learning_rate": 4.770430475963636e-06, + "loss": 0.6335, + "step": 2904 + }, + { + "epoch": 0.8504098360655737, + "grad_norm": 0.9945396780967712, + "learning_rate": 4.770269258037702e-06, + "loss": 0.6253, + "step": 2905 + }, + { + "epoch": 0.8507025761124122, + "grad_norm": 0.9392843246459961, + "learning_rate": 4.770107986249193e-06, + "loss": 0.6514, + "step": 2906 + }, + { + "epoch": 0.8509953161592506, + "grad_norm": 1.0021556615829468, + "learning_rate": 4.7699466606019365e-06, + "loss": 0.6906, + "step": 2907 + }, + { + "epoch": 0.8512880562060889, + "grad_norm": 0.9806783199310303, + "learning_rate": 4.769785281099759e-06, + "loss": 0.6385, + "step": 2908 + }, + { + "epoch": 0.8515807962529274, + "grad_norm": 0.9472449421882629, + "learning_rate": 4.76962384774649e-06, + "loss": 0.6044, + "step": 2909 + }, + { + "epoch": 0.8518735362997658, + "grad_norm": 1.041795253753662, + "learning_rate": 4.769462360545959e-06, + "loss": 0.6331, + "step": 2910 + }, + { + "epoch": 0.8521662763466042, + "grad_norm": 0.9561552405357361, + "learning_rate": 4.769300819501997e-06, + "loss": 0.6559, + "step": 2911 + }, + { + "epoch": 0.8524590163934426, + "grad_norm": 0.9393250942230225, + "learning_rate": 4.769139224618438e-06, + "loss": 0.6182, + "step": 2912 + }, + { + "epoch": 0.852751756440281, + "grad_norm": 1.0099893808364868, + "learning_rate": 4.7689775758991145e-06, + "loss": 0.6343, + "step": 2913 + }, + { + "epoch": 0.8530444964871194, + "grad_norm": 0.9416078925132751, + "learning_rate": 4.768815873347862e-06, + "loss": 0.6681, + "step": 2914 + }, + { + "epoch": 0.8533372365339579, + "grad_norm": 0.907407820224762, + "learning_rate": 4.7686541169685176e-06, + "loss": 0.5823, + "step": 2915 + }, + { + "epoch": 0.8536299765807962, + "grad_norm": 1.0027137994766235, + "learning_rate": 4.768492306764918e-06, + "loss": 0.629, + "step": 2916 + }, + { + "epoch": 0.8539227166276346, + "grad_norm": 1.0042176246643066, + "learning_rate": 4.768330442740903e-06, + "loss": 0.6791, + "step": 2917 + }, + { + "epoch": 0.8542154566744731, + "grad_norm": 0.9610425233840942, + "learning_rate": 4.7681685249003114e-06, + "loss": 0.6526, + "step": 2918 + }, + { + "epoch": 0.8545081967213115, + "grad_norm": 0.9804086089134216, + "learning_rate": 4.768006553246987e-06, + "loss": 0.6782, + "step": 2919 + }, + { + "epoch": 0.8548009367681498, + "grad_norm": 0.9087112545967102, + "learning_rate": 4.76784452778477e-06, + "loss": 0.6317, + "step": 2920 + }, + { + "epoch": 0.8550936768149883, + "grad_norm": 0.9147226214408875, + "learning_rate": 4.767682448517507e-06, + "loss": 0.6224, + "step": 2921 + }, + { + "epoch": 0.8553864168618267, + "grad_norm": 0.9121715426445007, + "learning_rate": 4.767520315449042e-06, + "loss": 0.6476, + "step": 2922 + }, + { + "epoch": 0.8556791569086651, + "grad_norm": 0.9756464958190918, + "learning_rate": 4.767358128583221e-06, + "loss": 0.6265, + "step": 2923 + }, + { + "epoch": 0.8559718969555035, + "grad_norm": 0.9285110235214233, + "learning_rate": 4.767195887923894e-06, + "loss": 0.6107, + "step": 2924 + }, + { + "epoch": 0.8562646370023419, + "grad_norm": 0.9937058687210083, + "learning_rate": 4.767033593474908e-06, + "loss": 0.6324, + "step": 2925 + }, + { + "epoch": 0.8565573770491803, + "grad_norm": 1.0190527439117432, + "learning_rate": 4.766871245240116e-06, + "loss": 0.7076, + "step": 2926 + }, + { + "epoch": 0.8568501170960188, + "grad_norm": 0.9729838371276855, + "learning_rate": 4.7667088432233664e-06, + "loss": 0.7001, + "step": 2927 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.9740895628929138, + "learning_rate": 4.766546387428515e-06, + "loss": 0.6886, + "step": 2928 + }, + { + "epoch": 0.8574355971896955, + "grad_norm": 0.9570560455322266, + "learning_rate": 4.766383877859415e-06, + "loss": 0.6621, + "step": 2929 + }, + { + "epoch": 0.857728337236534, + "grad_norm": 0.8956660032272339, + "learning_rate": 4.766221314519922e-06, + "loss": 0.6207, + "step": 2930 + }, + { + "epoch": 0.8580210772833724, + "grad_norm": 0.9803920388221741, + "learning_rate": 4.766058697413893e-06, + "loss": 0.6848, + "step": 2931 + }, + { + "epoch": 0.8583138173302107, + "grad_norm": 0.9283446073532104, + "learning_rate": 4.765896026545186e-06, + "loss": 0.6414, + "step": 2932 + }, + { + "epoch": 0.8586065573770492, + "grad_norm": 1.0371763706207275, + "learning_rate": 4.76573330191766e-06, + "loss": 0.6755, + "step": 2933 + }, + { + "epoch": 0.8588992974238876, + "grad_norm": 0.9624677896499634, + "learning_rate": 4.765570523535177e-06, + "loss": 0.6532, + "step": 2934 + }, + { + "epoch": 0.859192037470726, + "grad_norm": 0.9550821185112, + "learning_rate": 4.765407691401597e-06, + "loss": 0.6461, + "step": 2935 + }, + { + "epoch": 0.8594847775175644, + "grad_norm": 1.0050146579742432, + "learning_rate": 4.765244805520785e-06, + "loss": 0.6619, + "step": 2936 + }, + { + "epoch": 0.8597775175644028, + "grad_norm": 0.9266591668128967, + "learning_rate": 4.765081865896605e-06, + "loss": 0.6724, + "step": 2937 + }, + { + "epoch": 0.8600702576112412, + "grad_norm": 0.9631842374801636, + "learning_rate": 4.764918872532922e-06, + "loss": 0.6809, + "step": 2938 + }, + { + "epoch": 0.8603629976580797, + "grad_norm": 0.9850926399230957, + "learning_rate": 4.764755825433604e-06, + "loss": 0.6733, + "step": 2939 + }, + { + "epoch": 0.860655737704918, + "grad_norm": 0.9772422313690186, + "learning_rate": 4.764592724602518e-06, + "loss": 0.6429, + "step": 2940 + }, + { + "epoch": 0.8609484777517564, + "grad_norm": 1.0067286491394043, + "learning_rate": 4.764429570043536e-06, + "loss": 0.6679, + "step": 2941 + }, + { + "epoch": 0.8612412177985949, + "grad_norm": 0.9560746550559998, + "learning_rate": 4.764266361760527e-06, + "loss": 0.6554, + "step": 2942 + }, + { + "epoch": 0.8615339578454333, + "grad_norm": 0.9998706579208374, + "learning_rate": 4.764103099757362e-06, + "loss": 0.6843, + "step": 2943 + }, + { + "epoch": 0.8618266978922716, + "grad_norm": 0.9268092513084412, + "learning_rate": 4.763939784037917e-06, + "loss": 0.6365, + "step": 2944 + }, + { + "epoch": 0.8621194379391101, + "grad_norm": 0.9693580865859985, + "learning_rate": 4.763776414606067e-06, + "loss": 0.653, + "step": 2945 + }, + { + "epoch": 0.8624121779859485, + "grad_norm": 0.9945774674415588, + "learning_rate": 4.763612991465685e-06, + "loss": 0.6879, + "step": 2946 + }, + { + "epoch": 0.8627049180327869, + "grad_norm": 0.9391907453536987, + "learning_rate": 4.7634495146206505e-06, + "loss": 0.6631, + "step": 2947 + }, + { + "epoch": 0.8629976580796253, + "grad_norm": 0.9533590078353882, + "learning_rate": 4.763285984074841e-06, + "loss": 0.6836, + "step": 2948 + }, + { + "epoch": 0.8632903981264637, + "grad_norm": 1.0183998346328735, + "learning_rate": 4.763122399832138e-06, + "loss": 0.6849, + "step": 2949 + }, + { + "epoch": 0.8635831381733021, + "grad_norm": 1.0164912939071655, + "learning_rate": 4.76295876189642e-06, + "loss": 0.6498, + "step": 2950 + }, + { + "epoch": 0.8638758782201406, + "grad_norm": 0.9638834595680237, + "learning_rate": 4.762795070271571e-06, + "loss": 0.6413, + "step": 2951 + }, + { + "epoch": 0.8641686182669789, + "grad_norm": 1.0011526346206665, + "learning_rate": 4.7626313249614735e-06, + "loss": 0.6095, + "step": 2952 + }, + { + "epoch": 0.8644613583138173, + "grad_norm": 0.9680542349815369, + "learning_rate": 4.762467525970014e-06, + "loss": 0.6389, + "step": 2953 + }, + { + "epoch": 0.8647540983606558, + "grad_norm": 0.9689949154853821, + "learning_rate": 4.762303673301077e-06, + "loss": 0.6347, + "step": 2954 + }, + { + "epoch": 0.8650468384074942, + "grad_norm": 1.012576937675476, + "learning_rate": 4.762139766958552e-06, + "loss": 0.6639, + "step": 2955 + }, + { + "epoch": 0.8653395784543325, + "grad_norm": 0.9881799817085266, + "learning_rate": 4.761975806946325e-06, + "loss": 0.6639, + "step": 2956 + }, + { + "epoch": 0.865632318501171, + "grad_norm": 0.9205610752105713, + "learning_rate": 4.761811793268288e-06, + "loss": 0.6382, + "step": 2957 + }, + { + "epoch": 0.8659250585480094, + "grad_norm": 1.0037965774536133, + "learning_rate": 4.7616477259283325e-06, + "loss": 0.712, + "step": 2958 + }, + { + "epoch": 0.8662177985948478, + "grad_norm": 0.9483511447906494, + "learning_rate": 4.761483604930349e-06, + "loss": 0.6483, + "step": 2959 + }, + { + "epoch": 0.8665105386416861, + "grad_norm": 0.9027544260025024, + "learning_rate": 4.7613194302782326e-06, + "loss": 0.6349, + "step": 2960 + }, + { + "epoch": 0.8668032786885246, + "grad_norm": 0.9748361706733704, + "learning_rate": 4.761155201975879e-06, + "loss": 0.6832, + "step": 2961 + }, + { + "epoch": 0.867096018735363, + "grad_norm": 0.898650050163269, + "learning_rate": 4.760990920027183e-06, + "loss": 0.6019, + "step": 2962 + }, + { + "epoch": 0.8673887587822015, + "grad_norm": 0.9923785924911499, + "learning_rate": 4.760826584436043e-06, + "loss": 0.6451, + "step": 2963 + }, + { + "epoch": 0.8676814988290398, + "grad_norm": 0.9896733164787292, + "learning_rate": 4.7606621952063594e-06, + "loss": 0.6494, + "step": 2964 + }, + { + "epoch": 0.8679742388758782, + "grad_norm": 0.9166337251663208, + "learning_rate": 4.760497752342029e-06, + "loss": 0.6186, + "step": 2965 + }, + { + "epoch": 0.8682669789227166, + "grad_norm": 0.9520391821861267, + "learning_rate": 4.760333255846956e-06, + "loss": 0.6809, + "step": 2966 + }, + { + "epoch": 0.8685597189695551, + "grad_norm": 0.9983303546905518, + "learning_rate": 4.7601687057250425e-06, + "loss": 0.6854, + "step": 2967 + }, + { + "epoch": 0.8688524590163934, + "grad_norm": 1.0044193267822266, + "learning_rate": 4.760004101980192e-06, + "loss": 0.6778, + "step": 2968 + }, + { + "epoch": 0.8691451990632318, + "grad_norm": 0.9483680725097656, + "learning_rate": 4.75983944461631e-06, + "loss": 0.6807, + "step": 2969 + }, + { + "epoch": 0.8694379391100703, + "grad_norm": 0.9672930240631104, + "learning_rate": 4.759674733637303e-06, + "loss": 0.6754, + "step": 2970 + }, + { + "epoch": 0.8697306791569087, + "grad_norm": 0.9844440817832947, + "learning_rate": 4.759509969047079e-06, + "loss": 0.6291, + "step": 2971 + }, + { + "epoch": 0.870023419203747, + "grad_norm": 0.9720949530601501, + "learning_rate": 4.759345150849547e-06, + "loss": 0.6344, + "step": 2972 + }, + { + "epoch": 0.8703161592505855, + "grad_norm": 0.9705049991607666, + "learning_rate": 4.7591802790486166e-06, + "loss": 0.663, + "step": 2973 + }, + { + "epoch": 0.8706088992974239, + "grad_norm": 1.0124212503433228, + "learning_rate": 4.759015353648199e-06, + "loss": 0.6201, + "step": 2974 + }, + { + "epoch": 0.8709016393442623, + "grad_norm": 0.9603182673454285, + "learning_rate": 4.758850374652211e-06, + "loss": 0.6673, + "step": 2975 + }, + { + "epoch": 0.8711943793911007, + "grad_norm": 0.9500019550323486, + "learning_rate": 4.758685342064562e-06, + "loss": 0.6532, + "step": 2976 + }, + { + "epoch": 0.8714871194379391, + "grad_norm": 0.9351889491081238, + "learning_rate": 4.75852025588917e-06, + "loss": 0.6631, + "step": 2977 + }, + { + "epoch": 0.8717798594847775, + "grad_norm": 0.9901925325393677, + "learning_rate": 4.75835511612995e-06, + "loss": 0.6685, + "step": 2978 + }, + { + "epoch": 0.872072599531616, + "grad_norm": 0.9419016242027283, + "learning_rate": 4.758189922790821e-06, + "loss": 0.6497, + "step": 2979 + }, + { + "epoch": 0.8723653395784543, + "grad_norm": 0.9432163834571838, + "learning_rate": 4.758024675875704e-06, + "loss": 0.6401, + "step": 2980 + }, + { + "epoch": 0.8726580796252927, + "grad_norm": 0.9458270072937012, + "learning_rate": 4.7578593753885166e-06, + "loss": 0.6743, + "step": 2981 + }, + { + "epoch": 0.8729508196721312, + "grad_norm": 0.9606125354766846, + "learning_rate": 4.757694021333182e-06, + "loss": 0.6757, + "step": 2982 + }, + { + "epoch": 0.8732435597189696, + "grad_norm": 0.9494543075561523, + "learning_rate": 4.757528613713622e-06, + "loss": 0.6714, + "step": 2983 + }, + { + "epoch": 0.8735362997658079, + "grad_norm": 0.9120808243751526, + "learning_rate": 4.7573631525337635e-06, + "loss": 0.6268, + "step": 2984 + }, + { + "epoch": 0.8738290398126464, + "grad_norm": 0.9623260498046875, + "learning_rate": 4.7571976377975295e-06, + "loss": 0.6653, + "step": 2985 + }, + { + "epoch": 0.8741217798594848, + "grad_norm": 0.919123649597168, + "learning_rate": 4.757032069508848e-06, + "loss": 0.6548, + "step": 2986 + }, + { + "epoch": 0.8744145199063232, + "grad_norm": 0.9856236577033997, + "learning_rate": 4.756866447671647e-06, + "loss": 0.6308, + "step": 2987 + }, + { + "epoch": 0.8747072599531616, + "grad_norm": 0.9921389818191528, + "learning_rate": 4.756700772289856e-06, + "loss": 0.6411, + "step": 2988 + }, + { + "epoch": 0.875, + "grad_norm": 0.9889643788337708, + "learning_rate": 4.756535043367406e-06, + "loss": 0.6851, + "step": 2989 + }, + { + "epoch": 0.8752927400468384, + "grad_norm": 0.9126710891723633, + "learning_rate": 4.756369260908228e-06, + "loss": 0.6656, + "step": 2990 + }, + { + "epoch": 0.8755854800936768, + "grad_norm": 0.9590594172477722, + "learning_rate": 4.756203424916256e-06, + "loss": 0.6688, + "step": 2991 + }, + { + "epoch": 0.8758782201405152, + "grad_norm": 1.0085372924804688, + "learning_rate": 4.756037535395425e-06, + "loss": 0.6224, + "step": 2992 + }, + { + "epoch": 0.8761709601873536, + "grad_norm": 0.9568994641304016, + "learning_rate": 4.755871592349669e-06, + "loss": 0.6621, + "step": 2993 + }, + { + "epoch": 0.8764637002341921, + "grad_norm": 0.9759738445281982, + "learning_rate": 4.755705595782926e-06, + "loss": 0.6749, + "step": 2994 + }, + { + "epoch": 0.8767564402810304, + "grad_norm": 0.9207966327667236, + "learning_rate": 4.755539545699135e-06, + "loss": 0.6538, + "step": 2995 + }, + { + "epoch": 0.8770491803278688, + "grad_norm": 0.9915757179260254, + "learning_rate": 4.755373442102235e-06, + "loss": 0.6919, + "step": 2996 + }, + { + "epoch": 0.8773419203747073, + "grad_norm": 0.9536476731300354, + "learning_rate": 4.755207284996166e-06, + "loss": 0.6588, + "step": 2997 + }, + { + "epoch": 0.8776346604215457, + "grad_norm": 0.9467412233352661, + "learning_rate": 4.755041074384872e-06, + "loss": 0.6602, + "step": 2998 + }, + { + "epoch": 0.877927400468384, + "grad_norm": 0.9334191083908081, + "learning_rate": 4.754874810272294e-06, + "loss": 0.6285, + "step": 2999 + }, + { + "epoch": 0.8782201405152225, + "grad_norm": 0.9919931292533875, + "learning_rate": 4.754708492662378e-06, + "loss": 0.7222, + "step": 3000 + }, + { + "epoch": 0.8785128805620609, + "grad_norm": 0.903755247592926, + "learning_rate": 4.754542121559071e-06, + "loss": 0.6163, + "step": 3001 + }, + { + "epoch": 0.8788056206088993, + "grad_norm": 1.0459023714065552, + "learning_rate": 4.7543756969663194e-06, + "loss": 0.6551, + "step": 3002 + }, + { + "epoch": 0.8790983606557377, + "grad_norm": 0.9316949248313904, + "learning_rate": 4.7542092188880695e-06, + "loss": 0.6606, + "step": 3003 + }, + { + "epoch": 0.8793911007025761, + "grad_norm": 0.9783416986465454, + "learning_rate": 4.7540426873282745e-06, + "loss": 0.6579, + "step": 3004 + }, + { + "epoch": 0.8796838407494145, + "grad_norm": 0.9951759576797485, + "learning_rate": 4.7538761022908815e-06, + "loss": 0.6551, + "step": 3005 + }, + { + "epoch": 0.879976580796253, + "grad_norm": 0.9870976209640503, + "learning_rate": 4.753709463779847e-06, + "loss": 0.6659, + "step": 3006 + }, + { + "epoch": 0.8802693208430913, + "grad_norm": 0.993213415145874, + "learning_rate": 4.753542771799122e-06, + "loss": 0.632, + "step": 3007 + }, + { + "epoch": 0.8805620608899297, + "grad_norm": 1.0549896955490112, + "learning_rate": 4.7533760263526615e-06, + "loss": 0.6832, + "step": 3008 + }, + { + "epoch": 0.8808548009367682, + "grad_norm": 0.9180334806442261, + "learning_rate": 4.753209227444422e-06, + "loss": 0.6308, + "step": 3009 + }, + { + "epoch": 0.8811475409836066, + "grad_norm": 0.9931708574295044, + "learning_rate": 4.753042375078361e-06, + "loss": 0.682, + "step": 3010 + }, + { + "epoch": 0.8814402810304449, + "grad_norm": 0.9298861026763916, + "learning_rate": 4.752875469258435e-06, + "loss": 0.6827, + "step": 3011 + }, + { + "epoch": 0.8817330210772834, + "grad_norm": 1.0288089513778687, + "learning_rate": 4.752708509988607e-06, + "loss": 0.6572, + "step": 3012 + }, + { + "epoch": 0.8820257611241218, + "grad_norm": 1.0151652097702026, + "learning_rate": 4.752541497272838e-06, + "loss": 0.6681, + "step": 3013 + }, + { + "epoch": 0.8823185011709602, + "grad_norm": 1.057962417602539, + "learning_rate": 4.752374431115088e-06, + "loss": 0.6097, + "step": 3014 + }, + { + "epoch": 0.8826112412177985, + "grad_norm": 0.9697676301002502, + "learning_rate": 4.752207311519321e-06, + "loss": 0.6352, + "step": 3015 + }, + { + "epoch": 0.882903981264637, + "grad_norm": 1.0187233686447144, + "learning_rate": 4.752040138489505e-06, + "loss": 0.626, + "step": 3016 + }, + { + "epoch": 0.8831967213114754, + "grad_norm": 0.9735510349273682, + "learning_rate": 4.751872912029602e-06, + "loss": 0.6381, + "step": 3017 + }, + { + "epoch": 0.8834894613583139, + "grad_norm": 0.9835630059242249, + "learning_rate": 4.751705632143583e-06, + "loss": 0.6662, + "step": 3018 + }, + { + "epoch": 0.8837822014051522, + "grad_norm": 0.9236445426940918, + "learning_rate": 4.751538298835414e-06, + "loss": 0.683, + "step": 3019 + }, + { + "epoch": 0.8840749414519906, + "grad_norm": 0.9389310479164124, + "learning_rate": 4.751370912109067e-06, + "loss": 0.6222, + "step": 3020 + }, + { + "epoch": 0.884367681498829, + "grad_norm": 0.8882930874824524, + "learning_rate": 4.751203471968513e-06, + "loss": 0.5961, + "step": 3021 + }, + { + "epoch": 0.8846604215456675, + "grad_norm": 0.9509779214859009, + "learning_rate": 4.7510359784177225e-06, + "loss": 0.6614, + "step": 3022 + }, + { + "epoch": 0.8849531615925058, + "grad_norm": 0.9945912957191467, + "learning_rate": 4.7508684314606725e-06, + "loss": 0.7119, + "step": 3023 + }, + { + "epoch": 0.8852459016393442, + "grad_norm": 0.9872069358825684, + "learning_rate": 4.750700831101337e-06, + "loss": 0.7194, + "step": 3024 + }, + { + "epoch": 0.8855386416861827, + "grad_norm": 0.9442868232727051, + "learning_rate": 4.7505331773436904e-06, + "loss": 0.6499, + "step": 3025 + }, + { + "epoch": 0.8858313817330211, + "grad_norm": 1.058609127998352, + "learning_rate": 4.750365470191712e-06, + "loss": 0.6772, + "step": 3026 + }, + { + "epoch": 0.8861241217798594, + "grad_norm": 0.9862047433853149, + "learning_rate": 4.750197709649381e-06, + "loss": 0.6426, + "step": 3027 + }, + { + "epoch": 0.8864168618266979, + "grad_norm": 0.9580667614936829, + "learning_rate": 4.750029895720677e-06, + "loss": 0.6381, + "step": 3028 + }, + { + "epoch": 0.8867096018735363, + "grad_norm": 0.9763757586479187, + "learning_rate": 4.749862028409581e-06, + "loss": 0.712, + "step": 3029 + }, + { + "epoch": 0.8870023419203747, + "grad_norm": 0.9438791275024414, + "learning_rate": 4.749694107720077e-06, + "loss": 0.6739, + "step": 3030 + }, + { + "epoch": 0.8872950819672131, + "grad_norm": 0.9761878848075867, + "learning_rate": 4.749526133656147e-06, + "loss": 0.6686, + "step": 3031 + }, + { + "epoch": 0.8875878220140515, + "grad_norm": 0.9547361135482788, + "learning_rate": 4.749358106221778e-06, + "loss": 0.6509, + "step": 3032 + }, + { + "epoch": 0.8878805620608899, + "grad_norm": 0.9560233950614929, + "learning_rate": 4.749190025420956e-06, + "loss": 0.6519, + "step": 3033 + }, + { + "epoch": 0.8881733021077284, + "grad_norm": 0.9248474836349487, + "learning_rate": 4.749021891257668e-06, + "loss": 0.6494, + "step": 3034 + }, + { + "epoch": 0.8884660421545667, + "grad_norm": 0.9890168905258179, + "learning_rate": 4.748853703735904e-06, + "loss": 0.7196, + "step": 3035 + }, + { + "epoch": 0.8887587822014051, + "grad_norm": 1.0614588260650635, + "learning_rate": 4.748685462859653e-06, + "loss": 0.695, + "step": 3036 + }, + { + "epoch": 0.8890515222482436, + "grad_norm": 0.9615668654441833, + "learning_rate": 4.748517168632908e-06, + "loss": 0.6253, + "step": 3037 + }, + { + "epoch": 0.889344262295082, + "grad_norm": 0.9174488186836243, + "learning_rate": 4.748348821059661e-06, + "loss": 0.6303, + "step": 3038 + }, + { + "epoch": 0.8896370023419203, + "grad_norm": 0.9831375479698181, + "learning_rate": 4.748180420143906e-06, + "loss": 0.6635, + "step": 3039 + }, + { + "epoch": 0.8899297423887588, + "grad_norm": 1.034010887145996, + "learning_rate": 4.748011965889639e-06, + "loss": 0.6791, + "step": 3040 + }, + { + "epoch": 0.8902224824355972, + "grad_norm": 1.008805513381958, + "learning_rate": 4.7478434583008555e-06, + "loss": 0.6441, + "step": 3041 + }, + { + "epoch": 0.8905152224824356, + "grad_norm": 0.953762948513031, + "learning_rate": 4.7476748973815555e-06, + "loss": 0.6544, + "step": 3042 + }, + { + "epoch": 0.890807962529274, + "grad_norm": 1.0589303970336914, + "learning_rate": 4.747506283135735e-06, + "loss": 0.6659, + "step": 3043 + }, + { + "epoch": 0.8911007025761124, + "grad_norm": 0.9714640378952026, + "learning_rate": 4.747337615567398e-06, + "loss": 0.667, + "step": 3044 + }, + { + "epoch": 0.8913934426229508, + "grad_norm": 0.9261729717254639, + "learning_rate": 4.747168894680544e-06, + "loss": 0.6441, + "step": 3045 + }, + { + "epoch": 0.8916861826697893, + "grad_norm": 1.0010321140289307, + "learning_rate": 4.747000120479175e-06, + "loss": 0.664, + "step": 3046 + }, + { + "epoch": 0.8919789227166276, + "grad_norm": 1.0112234354019165, + "learning_rate": 4.746831292967297e-06, + "loss": 0.6222, + "step": 3047 + }, + { + "epoch": 0.892271662763466, + "grad_norm": 0.9542308449745178, + "learning_rate": 4.7466624121489146e-06, + "loss": 0.6687, + "step": 3048 + }, + { + "epoch": 0.8925644028103045, + "grad_norm": 1.0058538913726807, + "learning_rate": 4.746493478028036e-06, + "loss": 0.621, + "step": 3049 + }, + { + "epoch": 0.8928571428571429, + "grad_norm": 1.0055328607559204, + "learning_rate": 4.746324490608667e-06, + "loss": 0.6488, + "step": 3050 + }, + { + "epoch": 0.8931498829039812, + "grad_norm": 0.9933416247367859, + "learning_rate": 4.746155449894818e-06, + "loss": 0.6915, + "step": 3051 + }, + { + "epoch": 0.8934426229508197, + "grad_norm": 1.0160892009735107, + "learning_rate": 4.7459863558905e-06, + "loss": 0.6797, + "step": 3052 + }, + { + "epoch": 0.8937353629976581, + "grad_norm": 0.9401261210441589, + "learning_rate": 4.745817208599723e-06, + "loss": 0.6837, + "step": 3053 + }, + { + "epoch": 0.8940281030444965, + "grad_norm": 0.9779773950576782, + "learning_rate": 4.745648008026502e-06, + "loss": 0.6487, + "step": 3054 + }, + { + "epoch": 0.8943208430913349, + "grad_norm": 0.9154078960418701, + "learning_rate": 4.745478754174852e-06, + "loss": 0.6459, + "step": 3055 + }, + { + "epoch": 0.8946135831381733, + "grad_norm": 0.9793970584869385, + "learning_rate": 4.745309447048785e-06, + "loss": 0.6913, + "step": 3056 + }, + { + "epoch": 0.8949063231850117, + "grad_norm": 1.0131943225860596, + "learning_rate": 4.745140086652321e-06, + "loss": 0.6747, + "step": 3057 + }, + { + "epoch": 0.8951990632318502, + "grad_norm": 1.001654863357544, + "learning_rate": 4.744970672989477e-06, + "loss": 0.6389, + "step": 3058 + }, + { + "epoch": 0.8954918032786885, + "grad_norm": 0.9440588355064392, + "learning_rate": 4.744801206064272e-06, + "loss": 0.6896, + "step": 3059 + }, + { + "epoch": 0.8957845433255269, + "grad_norm": 0.9403742551803589, + "learning_rate": 4.744631685880727e-06, + "loss": 0.6775, + "step": 3060 + }, + { + "epoch": 0.8960772833723654, + "grad_norm": 0.9423749446868896, + "learning_rate": 4.744462112442865e-06, + "loss": 0.6588, + "step": 3061 + }, + { + "epoch": 0.8963700234192038, + "grad_norm": 0.9830531477928162, + "learning_rate": 4.744292485754707e-06, + "loss": 0.6934, + "step": 3062 + }, + { + "epoch": 0.8966627634660421, + "grad_norm": 1.0129430294036865, + "learning_rate": 4.744122805820279e-06, + "loss": 0.6954, + "step": 3063 + }, + { + "epoch": 0.8969555035128806, + "grad_norm": 0.9986541867256165, + "learning_rate": 4.743953072643608e-06, + "loss": 0.6558, + "step": 3064 + }, + { + "epoch": 0.897248243559719, + "grad_norm": 0.9479562640190125, + "learning_rate": 4.743783286228717e-06, + "loss": 0.6278, + "step": 3065 + }, + { + "epoch": 0.8975409836065574, + "grad_norm": 0.9460668563842773, + "learning_rate": 4.743613446579637e-06, + "loss": 0.6706, + "step": 3066 + }, + { + "epoch": 0.8978337236533958, + "grad_norm": 1.0195789337158203, + "learning_rate": 4.743443553700397e-06, + "loss": 0.6701, + "step": 3067 + }, + { + "epoch": 0.8981264637002342, + "grad_norm": 0.9673579335212708, + "learning_rate": 4.743273607595028e-06, + "loss": 0.6457, + "step": 3068 + }, + { + "epoch": 0.8984192037470726, + "grad_norm": 0.909854531288147, + "learning_rate": 4.7431036082675615e-06, + "loss": 0.6322, + "step": 3069 + }, + { + "epoch": 0.8987119437939111, + "grad_norm": 0.9568290114402771, + "learning_rate": 4.742933555722031e-06, + "loss": 0.6507, + "step": 3070 + }, + { + "epoch": 0.8990046838407494, + "grad_norm": 0.9677749872207642, + "learning_rate": 4.742763449962471e-06, + "loss": 0.6614, + "step": 3071 + }, + { + "epoch": 0.8992974238875878, + "grad_norm": 1.0225344896316528, + "learning_rate": 4.7425932909929176e-06, + "loss": 0.6851, + "step": 3072 + }, + { + "epoch": 0.8995901639344263, + "grad_norm": 0.9663192629814148, + "learning_rate": 4.742423078817406e-06, + "loss": 0.6436, + "step": 3073 + }, + { + "epoch": 0.8998829039812647, + "grad_norm": 0.9308681488037109, + "learning_rate": 4.742252813439978e-06, + "loss": 0.6325, + "step": 3074 + }, + { + "epoch": 0.900175644028103, + "grad_norm": 0.9528010487556458, + "learning_rate": 4.742082494864671e-06, + "loss": 0.6766, + "step": 3075 + }, + { + "epoch": 0.9004683840749415, + "grad_norm": 1.1074912548065186, + "learning_rate": 4.7419121230955245e-06, + "loss": 0.6591, + "step": 3076 + }, + { + "epoch": 0.9007611241217799, + "grad_norm": 0.9850867986679077, + "learning_rate": 4.741741698136583e-06, + "loss": 0.6689, + "step": 3077 + }, + { + "epoch": 0.9010538641686182, + "grad_norm": 1.0548179149627686, + "learning_rate": 4.7415712199918895e-06, + "loss": 0.6588, + "step": 3078 + }, + { + "epoch": 0.9013466042154566, + "grad_norm": 0.9553893208503723, + "learning_rate": 4.7414006886654875e-06, + "loss": 0.6294, + "step": 3079 + }, + { + "epoch": 0.9016393442622951, + "grad_norm": 0.9677547812461853, + "learning_rate": 4.741230104161424e-06, + "loss": 0.6617, + "step": 3080 + }, + { + "epoch": 0.9019320843091335, + "grad_norm": 0.9914385676383972, + "learning_rate": 4.741059466483745e-06, + "loss": 0.632, + "step": 3081 + }, + { + "epoch": 0.9022248243559718, + "grad_norm": 1.005961298942566, + "learning_rate": 4.740888775636501e-06, + "loss": 0.6632, + "step": 3082 + }, + { + "epoch": 0.9025175644028103, + "grad_norm": 0.9439470767974854, + "learning_rate": 4.740718031623739e-06, + "loss": 0.639, + "step": 3083 + }, + { + "epoch": 0.9028103044496487, + "grad_norm": 1.120239019393921, + "learning_rate": 4.740547234449512e-06, + "loss": 0.6723, + "step": 3084 + }, + { + "epoch": 0.9031030444964872, + "grad_norm": 0.9241703152656555, + "learning_rate": 4.740376384117871e-06, + "loss": 0.6715, + "step": 3085 + }, + { + "epoch": 0.9033957845433255, + "grad_norm": 0.9405798316001892, + "learning_rate": 4.740205480632869e-06, + "loss": 0.6733, + "step": 3086 + }, + { + "epoch": 0.9036885245901639, + "grad_norm": 1.2721672058105469, + "learning_rate": 4.740034523998562e-06, + "loss": 0.6921, + "step": 3087 + }, + { + "epoch": 0.9039812646370023, + "grad_norm": 0.9954063892364502, + "learning_rate": 4.739863514219006e-06, + "loss": 0.6306, + "step": 3088 + }, + { + "epoch": 0.9042740046838408, + "grad_norm": 0.9405252933502197, + "learning_rate": 4.7396924512982585e-06, + "loss": 0.6479, + "step": 3089 + }, + { + "epoch": 0.9045667447306791, + "grad_norm": 0.9865220785140991, + "learning_rate": 4.739521335240377e-06, + "loss": 0.6318, + "step": 3090 + }, + { + "epoch": 0.9048594847775175, + "grad_norm": 0.9995077848434448, + "learning_rate": 4.739350166049421e-06, + "loss": 0.716, + "step": 3091 + }, + { + "epoch": 0.905152224824356, + "grad_norm": 1.0388315916061401, + "learning_rate": 4.739178943729452e-06, + "loss": 0.6391, + "step": 3092 + }, + { + "epoch": 0.9054449648711944, + "grad_norm": 0.9735414981842041, + "learning_rate": 4.739007668284533e-06, + "loss": 0.6338, + "step": 3093 + }, + { + "epoch": 0.9057377049180327, + "grad_norm": 0.933647632598877, + "learning_rate": 4.738836339718726e-06, + "loss": 0.6313, + "step": 3094 + }, + { + "epoch": 0.9060304449648712, + "grad_norm": 0.9979823231697083, + "learning_rate": 4.738664958036097e-06, + "loss": 0.6667, + "step": 3095 + }, + { + "epoch": 0.9063231850117096, + "grad_norm": 0.9699830412864685, + "learning_rate": 4.738493523240712e-06, + "loss": 0.6775, + "step": 3096 + }, + { + "epoch": 0.906615925058548, + "grad_norm": 1.0335662364959717, + "learning_rate": 4.7383220353366384e-06, + "loss": 0.6967, + "step": 3097 + }, + { + "epoch": 0.9069086651053864, + "grad_norm": 0.9655336737632751, + "learning_rate": 4.738150494327944e-06, + "loss": 0.6819, + "step": 3098 + }, + { + "epoch": 0.9072014051522248, + "grad_norm": 1.060995101928711, + "learning_rate": 4.737978900218699e-06, + "loss": 0.695, + "step": 3099 + }, + { + "epoch": 0.9074941451990632, + "grad_norm": 1.025328278541565, + "learning_rate": 4.7378072530129746e-06, + "loss": 0.6867, + "step": 3100 + }, + { + "epoch": 0.9077868852459017, + "grad_norm": 0.9485393166542053, + "learning_rate": 4.7376355527148445e-06, + "loss": 0.6617, + "step": 3101 + }, + { + "epoch": 0.90807962529274, + "grad_norm": 0.8987494111061096, + "learning_rate": 4.73746379932838e-06, + "loss": 0.6237, + "step": 3102 + }, + { + "epoch": 0.9083723653395784, + "grad_norm": 1.0281933546066284, + "learning_rate": 4.7372919928576566e-06, + "loss": 0.6276, + "step": 3103 + }, + { + "epoch": 0.9086651053864169, + "grad_norm": 1.0238590240478516, + "learning_rate": 4.737120133306751e-06, + "loss": 0.6714, + "step": 3104 + }, + { + "epoch": 0.9089578454332553, + "grad_norm": 0.9436424374580383, + "learning_rate": 4.736948220679741e-06, + "loss": 0.6709, + "step": 3105 + }, + { + "epoch": 0.9092505854800936, + "grad_norm": 0.9644876718521118, + "learning_rate": 4.736776254980705e-06, + "loss": 0.6395, + "step": 3106 + }, + { + "epoch": 0.9095433255269321, + "grad_norm": 1.008880615234375, + "learning_rate": 4.736604236213722e-06, + "loss": 0.658, + "step": 3107 + }, + { + "epoch": 0.9098360655737705, + "grad_norm": 8.082836151123047, + "learning_rate": 4.736432164382873e-06, + "loss": 0.6494, + "step": 3108 + }, + { + "epoch": 0.9101288056206089, + "grad_norm": 0.9405232071876526, + "learning_rate": 4.736260039492242e-06, + "loss": 0.6242, + "step": 3109 + }, + { + "epoch": 0.9104215456674473, + "grad_norm": 0.9740483164787292, + "learning_rate": 4.736087861545912e-06, + "loss": 0.642, + "step": 3110 + }, + { + "epoch": 0.9107142857142857, + "grad_norm": 0.9331749081611633, + "learning_rate": 4.7359156305479685e-06, + "loss": 0.64, + "step": 3111 + }, + { + "epoch": 0.9110070257611241, + "grad_norm": 0.9688782095909119, + "learning_rate": 4.735743346502496e-06, + "loss": 0.6892, + "step": 3112 + }, + { + "epoch": 0.9112997658079626, + "grad_norm": 0.9182763695716858, + "learning_rate": 4.7355710094135835e-06, + "loss": 0.6793, + "step": 3113 + }, + { + "epoch": 0.9115925058548009, + "grad_norm": 0.9320412874221802, + "learning_rate": 4.7353986192853194e-06, + "loss": 0.6305, + "step": 3114 + }, + { + "epoch": 0.9118852459016393, + "grad_norm": 0.8969971537590027, + "learning_rate": 4.735226176121793e-06, + "loss": 0.6356, + "step": 3115 + }, + { + "epoch": 0.9121779859484778, + "grad_norm": 0.9366916418075562, + "learning_rate": 4.735053679927097e-06, + "loss": 0.6313, + "step": 3116 + }, + { + "epoch": 0.9124707259953162, + "grad_norm": 0.9600667357444763, + "learning_rate": 4.734881130705323e-06, + "loss": 0.6366, + "step": 3117 + }, + { + "epoch": 0.9127634660421545, + "grad_norm": 1.4765102863311768, + "learning_rate": 4.734708528460564e-06, + "loss": 0.6279, + "step": 3118 + }, + { + "epoch": 0.913056206088993, + "grad_norm": 0.956405520439148, + "learning_rate": 4.734535873196917e-06, + "loss": 0.6483, + "step": 3119 + }, + { + "epoch": 0.9133489461358314, + "grad_norm": 0.9657067656517029, + "learning_rate": 4.7343631649184765e-06, + "loss": 0.6874, + "step": 3120 + }, + { + "epoch": 0.9136416861826698, + "grad_norm": 0.9200806617736816, + "learning_rate": 4.73419040362934e-06, + "loss": 0.6472, + "step": 3121 + }, + { + "epoch": 0.9139344262295082, + "grad_norm": 0.9909254312515259, + "learning_rate": 4.734017589333607e-06, + "loss": 0.6709, + "step": 3122 + }, + { + "epoch": 0.9142271662763466, + "grad_norm": 0.9259937405586243, + "learning_rate": 4.7338447220353786e-06, + "loss": 0.6371, + "step": 3123 + }, + { + "epoch": 0.914519906323185, + "grad_norm": 0.9948121309280396, + "learning_rate": 4.733671801738754e-06, + "loss": 0.637, + "step": 3124 + }, + { + "epoch": 0.9148126463700235, + "grad_norm": 0.9559407234191895, + "learning_rate": 4.733498828447837e-06, + "loss": 0.6492, + "step": 3125 + }, + { + "epoch": 0.9151053864168618, + "grad_norm": 0.9173922538757324, + "learning_rate": 4.733325802166732e-06, + "loss": 0.6621, + "step": 3126 + }, + { + "epoch": 0.9153981264637002, + "grad_norm": 1.0096288919448853, + "learning_rate": 4.733152722899542e-06, + "loss": 0.7129, + "step": 3127 + }, + { + "epoch": 0.9156908665105387, + "grad_norm": 0.9824875593185425, + "learning_rate": 4.732979590650376e-06, + "loss": 0.6797, + "step": 3128 + }, + { + "epoch": 0.9159836065573771, + "grad_norm": 1.0337616205215454, + "learning_rate": 4.7328064054233395e-06, + "loss": 0.6815, + "step": 3129 + }, + { + "epoch": 0.9162763466042154, + "grad_norm": 0.9378529787063599, + "learning_rate": 4.732633167222542e-06, + "loss": 0.6732, + "step": 3130 + }, + { + "epoch": 0.9165690866510539, + "grad_norm": 0.9142439961433411, + "learning_rate": 4.732459876052093e-06, + "loss": 0.5985, + "step": 3131 + }, + { + "epoch": 0.9168618266978923, + "grad_norm": 1.0098590850830078, + "learning_rate": 4.732286531916106e-06, + "loss": 0.6572, + "step": 3132 + }, + { + "epoch": 0.9171545667447307, + "grad_norm": 1.0208171606063843, + "learning_rate": 4.732113134818691e-06, + "loss": 0.6816, + "step": 3133 + }, + { + "epoch": 0.917447306791569, + "grad_norm": 0.9448334574699402, + "learning_rate": 4.7319396847639635e-06, + "loss": 0.6384, + "step": 3134 + }, + { + "epoch": 0.9177400468384075, + "grad_norm": 0.9677916765213013, + "learning_rate": 4.731766181756038e-06, + "loss": 0.6538, + "step": 3135 + }, + { + "epoch": 0.9180327868852459, + "grad_norm": 0.9937874674797058, + "learning_rate": 4.731592625799031e-06, + "loss": 0.6777, + "step": 3136 + }, + { + "epoch": 0.9183255269320844, + "grad_norm": 1.022184133529663, + "learning_rate": 4.731419016897061e-06, + "loss": 0.6812, + "step": 3137 + }, + { + "epoch": 0.9186182669789227, + "grad_norm": 0.9880068302154541, + "learning_rate": 4.731245355054245e-06, + "loss": 0.681, + "step": 3138 + }, + { + "epoch": 0.9189110070257611, + "grad_norm": 0.9302680492401123, + "learning_rate": 4.731071640274705e-06, + "loss": 0.6532, + "step": 3139 + }, + { + "epoch": 0.9192037470725996, + "grad_norm": 1.0321388244628906, + "learning_rate": 4.730897872562561e-06, + "loss": 0.624, + "step": 3140 + }, + { + "epoch": 0.919496487119438, + "grad_norm": 0.9889824390411377, + "learning_rate": 4.7307240519219365e-06, + "loss": 0.6461, + "step": 3141 + }, + { + "epoch": 0.9197892271662763, + "grad_norm": 1.0157816410064697, + "learning_rate": 4.730550178356956e-06, + "loss": 0.6656, + "step": 3142 + }, + { + "epoch": 0.9200819672131147, + "grad_norm": 0.9111616015434265, + "learning_rate": 4.730376251871744e-06, + "loss": 0.626, + "step": 3143 + }, + { + "epoch": 0.9203747072599532, + "grad_norm": 0.9625220894813538, + "learning_rate": 4.730202272470427e-06, + "loss": 0.6467, + "step": 3144 + }, + { + "epoch": 0.9206674473067916, + "grad_norm": 0.9840562343597412, + "learning_rate": 4.730028240157132e-06, + "loss": 0.6451, + "step": 3145 + }, + { + "epoch": 0.9209601873536299, + "grad_norm": 0.9446005821228027, + "learning_rate": 4.729854154935988e-06, + "loss": 0.6655, + "step": 3146 + }, + { + "epoch": 0.9212529274004684, + "grad_norm": 0.9417304396629333, + "learning_rate": 4.7296800168111265e-06, + "loss": 0.6269, + "step": 3147 + }, + { + "epoch": 0.9215456674473068, + "grad_norm": 0.9644907116889954, + "learning_rate": 4.729505825786679e-06, + "loss": 0.6728, + "step": 3148 + }, + { + "epoch": 0.9218384074941453, + "grad_norm": 0.9574456214904785, + "learning_rate": 4.729331581866776e-06, + "loss": 0.6083, + "step": 3149 + }, + { + "epoch": 0.9221311475409836, + "grad_norm": 0.9380435347557068, + "learning_rate": 4.729157285055553e-06, + "loss": 0.6557, + "step": 3150 + }, + { + "epoch": 0.922423887587822, + "grad_norm": 0.976436972618103, + "learning_rate": 4.728982935357147e-06, + "loss": 0.6468, + "step": 3151 + }, + { + "epoch": 0.9227166276346604, + "grad_norm": 1.0154696702957153, + "learning_rate": 4.72880853277569e-06, + "loss": 0.6791, + "step": 3152 + }, + { + "epoch": 0.9230093676814989, + "grad_norm": 1.018214464187622, + "learning_rate": 4.728634077315325e-06, + "loss": 0.655, + "step": 3153 + }, + { + "epoch": 0.9233021077283372, + "grad_norm": 0.9816519021987915, + "learning_rate": 4.728459568980187e-06, + "loss": 0.7222, + "step": 3154 + }, + { + "epoch": 0.9235948477751756, + "grad_norm": 0.9588265419006348, + "learning_rate": 4.728285007774417e-06, + "loss": 0.6409, + "step": 3155 + }, + { + "epoch": 0.9238875878220141, + "grad_norm": 0.9404575228691101, + "learning_rate": 4.728110393702158e-06, + "loss": 0.7047, + "step": 3156 + }, + { + "epoch": 0.9241803278688525, + "grad_norm": 0.9643314480781555, + "learning_rate": 4.727935726767552e-06, + "loss": 0.6381, + "step": 3157 + }, + { + "epoch": 0.9244730679156908, + "grad_norm": 0.9890220761299133, + "learning_rate": 4.727761006974742e-06, + "loss": 0.6616, + "step": 3158 + }, + { + "epoch": 0.9247658079625293, + "grad_norm": 0.9658961892127991, + "learning_rate": 4.727586234327875e-06, + "loss": 0.684, + "step": 3159 + }, + { + "epoch": 0.9250585480093677, + "grad_norm": 0.9528616070747375, + "learning_rate": 4.727411408831096e-06, + "loss": 0.6546, + "step": 3160 + }, + { + "epoch": 0.925351288056206, + "grad_norm": 0.9494866728782654, + "learning_rate": 4.727236530488553e-06, + "loss": 0.6762, + "step": 3161 + }, + { + "epoch": 0.9256440281030445, + "grad_norm": 0.9765480756759644, + "learning_rate": 4.727061599304396e-06, + "loss": 0.6778, + "step": 3162 + }, + { + "epoch": 0.9259367681498829, + "grad_norm": 1.0540434122085571, + "learning_rate": 4.726886615282775e-06, + "loss": 0.6397, + "step": 3163 + }, + { + "epoch": 0.9262295081967213, + "grad_norm": 0.9590700268745422, + "learning_rate": 4.726711578427841e-06, + "loss": 0.6613, + "step": 3164 + }, + { + "epoch": 0.9265222482435597, + "grad_norm": 0.9623047113418579, + "learning_rate": 4.726536488743746e-06, + "loss": 0.6549, + "step": 3165 + }, + { + "epoch": 0.9268149882903981, + "grad_norm": 0.897293746471405, + "learning_rate": 4.726361346234646e-06, + "loss": 0.616, + "step": 3166 + }, + { + "epoch": 0.9271077283372365, + "grad_norm": 0.9448834657669067, + "learning_rate": 4.726186150904696e-06, + "loss": 0.6731, + "step": 3167 + }, + { + "epoch": 0.927400468384075, + "grad_norm": 0.9197112321853638, + "learning_rate": 4.7260109027580505e-06, + "loss": 0.5931, + "step": 3168 + }, + { + "epoch": 0.9276932084309133, + "grad_norm": 0.9696610569953918, + "learning_rate": 4.725835601798869e-06, + "loss": 0.6694, + "step": 3169 + }, + { + "epoch": 0.9279859484777517, + "grad_norm": 0.9763285517692566, + "learning_rate": 4.7256602480313116e-06, + "loss": 0.6804, + "step": 3170 + }, + { + "epoch": 0.9282786885245902, + "grad_norm": 0.9657164216041565, + "learning_rate": 4.725484841459535e-06, + "loss": 0.6408, + "step": 3171 + }, + { + "epoch": 0.9285714285714286, + "grad_norm": 0.9541552066802979, + "learning_rate": 4.725309382087705e-06, + "loss": 0.6528, + "step": 3172 + }, + { + "epoch": 0.9288641686182669, + "grad_norm": 0.9158934950828552, + "learning_rate": 4.725133869919981e-06, + "loss": 0.6191, + "step": 3173 + }, + { + "epoch": 0.9291569086651054, + "grad_norm": 0.9715738892555237, + "learning_rate": 4.7249583049605295e-06, + "loss": 0.6602, + "step": 3174 + }, + { + "epoch": 0.9294496487119438, + "grad_norm": 0.9300965666770935, + "learning_rate": 4.724782687213514e-06, + "loss": 0.6468, + "step": 3175 + }, + { + "epoch": 0.9297423887587822, + "grad_norm": 0.937608003616333, + "learning_rate": 4.724607016683101e-06, + "loss": 0.635, + "step": 3176 + }, + { + "epoch": 0.9300351288056206, + "grad_norm": 0.9596208333969116, + "learning_rate": 4.72443129337346e-06, + "loss": 0.6562, + "step": 3177 + }, + { + "epoch": 0.930327868852459, + "grad_norm": 0.9643343687057495, + "learning_rate": 4.7242555172887604e-06, + "loss": 0.6545, + "step": 3178 + }, + { + "epoch": 0.9306206088992974, + "grad_norm": 0.9530842900276184, + "learning_rate": 4.7240796884331695e-06, + "loss": 0.7031, + "step": 3179 + }, + { + "epoch": 0.9309133489461359, + "grad_norm": 1.0403352975845337, + "learning_rate": 4.723903806810862e-06, + "loss": 0.657, + "step": 3180 + }, + { + "epoch": 0.9312060889929742, + "grad_norm": 0.9755838513374329, + "learning_rate": 4.723727872426009e-06, + "loss": 0.6692, + "step": 3181 + }, + { + "epoch": 0.9314988290398126, + "grad_norm": 0.9995898008346558, + "learning_rate": 4.723551885282784e-06, + "loss": 0.6864, + "step": 3182 + }, + { + "epoch": 0.9317915690866511, + "grad_norm": 0.9498898386955261, + "learning_rate": 4.723375845385365e-06, + "loss": 0.6659, + "step": 3183 + }, + { + "epoch": 0.9320843091334895, + "grad_norm": 0.9248277544975281, + "learning_rate": 4.723199752737926e-06, + "loss": 0.6269, + "step": 3184 + }, + { + "epoch": 0.9323770491803278, + "grad_norm": 0.9481980800628662, + "learning_rate": 4.723023607344646e-06, + "loss": 0.6796, + "step": 3185 + }, + { + "epoch": 0.9326697892271663, + "grad_norm": 0.9890158772468567, + "learning_rate": 4.722847409209704e-06, + "loss": 0.6414, + "step": 3186 + }, + { + "epoch": 0.9329625292740047, + "grad_norm": 0.986864447593689, + "learning_rate": 4.7226711583372795e-06, + "loss": 0.7118, + "step": 3187 + }, + { + "epoch": 0.9332552693208431, + "grad_norm": 0.9633713364601135, + "learning_rate": 4.7224948547315555e-06, + "loss": 0.6776, + "step": 3188 + }, + { + "epoch": 0.9335480093676815, + "grad_norm": 0.9328820705413818, + "learning_rate": 4.722318498396713e-06, + "loss": 0.6534, + "step": 3189 + }, + { + "epoch": 0.9338407494145199, + "grad_norm": 0.9565248489379883, + "learning_rate": 4.722142089336938e-06, + "loss": 0.6683, + "step": 3190 + }, + { + "epoch": 0.9341334894613583, + "grad_norm": 0.9504727721214294, + "learning_rate": 4.721965627556415e-06, + "loss": 0.7058, + "step": 3191 + }, + { + "epoch": 0.9344262295081968, + "grad_norm": 0.9472916722297668, + "learning_rate": 4.7217891130593305e-06, + "loss": 0.6449, + "step": 3192 + }, + { + "epoch": 0.9347189695550351, + "grad_norm": 1.1343458890914917, + "learning_rate": 4.721612545849872e-06, + "loss": 0.61, + "step": 3193 + }, + { + "epoch": 0.9350117096018735, + "grad_norm": 1.0099279880523682, + "learning_rate": 4.721435925932229e-06, + "loss": 0.6601, + "step": 3194 + }, + { + "epoch": 0.935304449648712, + "grad_norm": 1.0765433311462402, + "learning_rate": 4.721259253310593e-06, + "loss": 0.6911, + "step": 3195 + }, + { + "epoch": 0.9355971896955504, + "grad_norm": 0.9910606741905212, + "learning_rate": 4.7210825279891535e-06, + "loss": 0.6443, + "step": 3196 + }, + { + "epoch": 0.9358899297423887, + "grad_norm": 0.9927693605422974, + "learning_rate": 4.7209057499721045e-06, + "loss": 0.6378, + "step": 3197 + }, + { + "epoch": 0.9361826697892272, + "grad_norm": 0.9120052456855774, + "learning_rate": 4.72072891926364e-06, + "loss": 0.6196, + "step": 3198 + }, + { + "epoch": 0.9364754098360656, + "grad_norm": 0.9648561477661133, + "learning_rate": 4.720552035867955e-06, + "loss": 0.6756, + "step": 3199 + }, + { + "epoch": 0.936768149882904, + "grad_norm": 0.9886999130249023, + "learning_rate": 4.720375099789246e-06, + "loss": 0.6541, + "step": 3200 + }, + { + "epoch": 0.9370608899297423, + "grad_norm": 1.0497303009033203, + "learning_rate": 4.720198111031712e-06, + "loss": 0.6898, + "step": 3201 + }, + { + "epoch": 0.9373536299765808, + "grad_norm": 1.011000394821167, + "learning_rate": 4.720021069599551e-06, + "loss": 0.6509, + "step": 3202 + }, + { + "epoch": 0.9376463700234192, + "grad_norm": 0.9642525315284729, + "learning_rate": 4.719843975496964e-06, + "loss": 0.7034, + "step": 3203 + }, + { + "epoch": 0.9379391100702577, + "grad_norm": 0.9955817461013794, + "learning_rate": 4.719666828728151e-06, + "loss": 0.6166, + "step": 3204 + }, + { + "epoch": 0.938231850117096, + "grad_norm": 1.029110312461853, + "learning_rate": 4.719489629297317e-06, + "loss": 0.6642, + "step": 3205 + }, + { + "epoch": 0.9385245901639344, + "grad_norm": 1.0271058082580566, + "learning_rate": 4.719312377208665e-06, + "loss": 0.664, + "step": 3206 + }, + { + "epoch": 0.9388173302107728, + "grad_norm": 0.952768862247467, + "learning_rate": 4.7191350724664005e-06, + "loss": 0.66, + "step": 3207 + }, + { + "epoch": 0.9391100702576113, + "grad_norm": 0.9833314418792725, + "learning_rate": 4.71895771507473e-06, + "loss": 0.682, + "step": 3208 + }, + { + "epoch": 0.9394028103044496, + "grad_norm": 0.9431644082069397, + "learning_rate": 4.718780305037862e-06, + "loss": 0.6531, + "step": 3209 + }, + { + "epoch": 0.939695550351288, + "grad_norm": 0.9850846529006958, + "learning_rate": 4.718602842360004e-06, + "loss": 0.618, + "step": 3210 + }, + { + "epoch": 0.9399882903981265, + "grad_norm": 0.9501405954360962, + "learning_rate": 4.718425327045368e-06, + "loss": 0.6551, + "step": 3211 + }, + { + "epoch": 0.9402810304449649, + "grad_norm": 0.9519366025924683, + "learning_rate": 4.7182477590981645e-06, + "loss": 0.6139, + "step": 3212 + }, + { + "epoch": 0.9405737704918032, + "grad_norm": 0.977500855922699, + "learning_rate": 4.718070138522607e-06, + "loss": 0.6668, + "step": 3213 + }, + { + "epoch": 0.9408665105386417, + "grad_norm": 0.9968308806419373, + "learning_rate": 4.717892465322909e-06, + "loss": 0.6451, + "step": 3214 + }, + { + "epoch": 0.9411592505854801, + "grad_norm": 0.9674070477485657, + "learning_rate": 4.717714739503287e-06, + "loss": 0.6886, + "step": 3215 + }, + { + "epoch": 0.9414519906323185, + "grad_norm": 0.9871894121170044, + "learning_rate": 4.717536961067955e-06, + "loss": 0.6683, + "step": 3216 + }, + { + "epoch": 0.9417447306791569, + "grad_norm": 1.0518490076065063, + "learning_rate": 4.717359130021133e-06, + "loss": 0.6739, + "step": 3217 + }, + { + "epoch": 0.9420374707259953, + "grad_norm": 0.9601038098335266, + "learning_rate": 4.717181246367041e-06, + "loss": 0.6162, + "step": 3218 + }, + { + "epoch": 0.9423302107728337, + "grad_norm": 0.9391522407531738, + "learning_rate": 4.717003310109897e-06, + "loss": 0.6529, + "step": 3219 + }, + { + "epoch": 0.9426229508196722, + "grad_norm": 0.9596079587936401, + "learning_rate": 4.716825321253924e-06, + "loss": 0.6388, + "step": 3220 + }, + { + "epoch": 0.9429156908665105, + "grad_norm": 0.9179622530937195, + "learning_rate": 4.716647279803344e-06, + "loss": 0.611, + "step": 3221 + }, + { + "epoch": 0.9432084309133489, + "grad_norm": 0.9498873353004456, + "learning_rate": 4.716469185762381e-06, + "loss": 0.6603, + "step": 3222 + }, + { + "epoch": 0.9435011709601874, + "grad_norm": 0.9587296843528748, + "learning_rate": 4.716291039135262e-06, + "loss": 0.6731, + "step": 3223 + }, + { + "epoch": 0.9437939110070258, + "grad_norm": 0.9413779973983765, + "learning_rate": 4.716112839926211e-06, + "loss": 0.6443, + "step": 3224 + }, + { + "epoch": 0.9440866510538641, + "grad_norm": 1.0101547241210938, + "learning_rate": 4.715934588139458e-06, + "loss": 0.63, + "step": 3225 + }, + { + "epoch": 0.9443793911007026, + "grad_norm": 0.9732929468154907, + "learning_rate": 4.715756283779231e-06, + "loss": 0.6383, + "step": 3226 + }, + { + "epoch": 0.944672131147541, + "grad_norm": 0.9474318027496338, + "learning_rate": 4.71557792684976e-06, + "loss": 0.6636, + "step": 3227 + }, + { + "epoch": 0.9449648711943794, + "grad_norm": 0.9670058488845825, + "learning_rate": 4.715399517355277e-06, + "loss": 0.6697, + "step": 3228 + }, + { + "epoch": 0.9452576112412178, + "grad_norm": 0.9513845443725586, + "learning_rate": 4.715221055300016e-06, + "loss": 0.611, + "step": 3229 + }, + { + "epoch": 0.9455503512880562, + "grad_norm": 0.9285411834716797, + "learning_rate": 4.715042540688209e-06, + "loss": 0.6376, + "step": 3230 + }, + { + "epoch": 0.9458430913348946, + "grad_norm": 0.961846113204956, + "learning_rate": 4.714863973524092e-06, + "loss": 0.6354, + "step": 3231 + }, + { + "epoch": 0.9461358313817331, + "grad_norm": 1.305545449256897, + "learning_rate": 4.714685353811902e-06, + "loss": 0.6281, + "step": 3232 + }, + { + "epoch": 0.9464285714285714, + "grad_norm": 0.9686965942382812, + "learning_rate": 4.714506681555876e-06, + "loss": 0.6797, + "step": 3233 + }, + { + "epoch": 0.9467213114754098, + "grad_norm": 0.9577288627624512, + "learning_rate": 4.714327956760254e-06, + "loss": 0.648, + "step": 3234 + }, + { + "epoch": 0.9470140515222483, + "grad_norm": 1.0124062299728394, + "learning_rate": 4.7141491794292755e-06, + "loss": 0.5614, + "step": 3235 + }, + { + "epoch": 0.9473067915690867, + "grad_norm": 0.9928358793258667, + "learning_rate": 4.713970349567183e-06, + "loss": 0.6748, + "step": 3236 + }, + { + "epoch": 0.947599531615925, + "grad_norm": 1.0336921215057373, + "learning_rate": 4.7137914671782184e-06, + "loss": 0.6573, + "step": 3237 + }, + { + "epoch": 0.9478922716627635, + "grad_norm": 0.9304029941558838, + "learning_rate": 4.713612532266625e-06, + "loss": 0.6268, + "step": 3238 + }, + { + "epoch": 0.9481850117096019, + "grad_norm": 1.019468903541565, + "learning_rate": 4.713433544836649e-06, + "loss": 0.6315, + "step": 3239 + }, + { + "epoch": 0.9484777517564403, + "grad_norm": 1.0263917446136475, + "learning_rate": 4.7132545048925385e-06, + "loss": 0.6521, + "step": 3240 + }, + { + "epoch": 0.9487704918032787, + "grad_norm": 0.980613648891449, + "learning_rate": 4.7130754124385394e-06, + "loss": 0.6535, + "step": 3241 + }, + { + "epoch": 0.9490632318501171, + "grad_norm": 0.9484977126121521, + "learning_rate": 4.7128962674789e-06, + "loss": 0.6461, + "step": 3242 + }, + { + "epoch": 0.9493559718969555, + "grad_norm": 0.9491426944732666, + "learning_rate": 4.712717070017872e-06, + "loss": 0.6745, + "step": 3243 + }, + { + "epoch": 0.949648711943794, + "grad_norm": 0.9669609665870667, + "learning_rate": 4.712537820059705e-06, + "loss": 0.7064, + "step": 3244 + }, + { + "epoch": 0.9499414519906323, + "grad_norm": 0.9553847312927246, + "learning_rate": 4.712358517608655e-06, + "loss": 0.6492, + "step": 3245 + }, + { + "epoch": 0.9502341920374707, + "grad_norm": 0.8965136408805847, + "learning_rate": 4.712179162668973e-06, + "loss": 0.6197, + "step": 3246 + }, + { + "epoch": 0.9505269320843092, + "grad_norm": 1.013554573059082, + "learning_rate": 4.711999755244916e-06, + "loss": 0.6622, + "step": 3247 + }, + { + "epoch": 0.9508196721311475, + "grad_norm": 0.9148480296134949, + "learning_rate": 4.71182029534074e-06, + "loss": 0.676, + "step": 3248 + }, + { + "epoch": 0.9511124121779859, + "grad_norm": 0.9114254713058472, + "learning_rate": 4.711640782960702e-06, + "loss": 0.6439, + "step": 3249 + }, + { + "epoch": 0.9514051522248244, + "grad_norm": 0.9480571746826172, + "learning_rate": 4.711461218109062e-06, + "loss": 0.6454, + "step": 3250 + }, + { + "epoch": 0.9516978922716628, + "grad_norm": 0.9372909665107727, + "learning_rate": 4.7112816007900796e-06, + "loss": 0.618, + "step": 3251 + }, + { + "epoch": 0.9519906323185011, + "grad_norm": 1.004170298576355, + "learning_rate": 4.711101931008016e-06, + "loss": 0.6764, + "step": 3252 + }, + { + "epoch": 0.9522833723653396, + "grad_norm": 1.0247136354446411, + "learning_rate": 4.710922208767135e-06, + "loss": 0.6971, + "step": 3253 + }, + { + "epoch": 0.952576112412178, + "grad_norm": 1.1611812114715576, + "learning_rate": 4.710742434071699e-06, + "loss": 0.625, + "step": 3254 + }, + { + "epoch": 0.9528688524590164, + "grad_norm": 0.9654311537742615, + "learning_rate": 4.710562606925975e-06, + "loss": 0.6736, + "step": 3255 + }, + { + "epoch": 0.9531615925058547, + "grad_norm": 0.9769260287284851, + "learning_rate": 4.710382727334228e-06, + "loss": 0.7244, + "step": 3256 + }, + { + "epoch": 0.9534543325526932, + "grad_norm": 0.9760617613792419, + "learning_rate": 4.710202795300725e-06, + "loss": 0.6259, + "step": 3257 + }, + { + "epoch": 0.9537470725995316, + "grad_norm": 1.0657532215118408, + "learning_rate": 4.7100228108297375e-06, + "loss": 0.6229, + "step": 3258 + }, + { + "epoch": 0.9540398126463701, + "grad_norm": 0.9618728756904602, + "learning_rate": 4.709842773925534e-06, + "loss": 0.6291, + "step": 3259 + }, + { + "epoch": 0.9543325526932084, + "grad_norm": 1.005456566810608, + "learning_rate": 4.709662684592386e-06, + "loss": 0.6468, + "step": 3260 + }, + { + "epoch": 0.9546252927400468, + "grad_norm": 0.9955158829689026, + "learning_rate": 4.7094825428345654e-06, + "loss": 0.6154, + "step": 3261 + }, + { + "epoch": 0.9549180327868853, + "grad_norm": 1.0223281383514404, + "learning_rate": 4.709302348656348e-06, + "loss": 0.642, + "step": 3262 + }, + { + "epoch": 0.9552107728337237, + "grad_norm": 0.9747985005378723, + "learning_rate": 4.709122102062007e-06, + "loss": 0.6487, + "step": 3263 + }, + { + "epoch": 0.955503512880562, + "grad_norm": 0.9952049255371094, + "learning_rate": 4.708941803055821e-06, + "loss": 0.678, + "step": 3264 + }, + { + "epoch": 0.9557962529274004, + "grad_norm": 0.9502685070037842, + "learning_rate": 4.708761451642065e-06, + "loss": 0.6784, + "step": 3265 + }, + { + "epoch": 0.9560889929742389, + "grad_norm": 1.0327924489974976, + "learning_rate": 4.708581047825021e-06, + "loss": 0.6931, + "step": 3266 + }, + { + "epoch": 0.9563817330210773, + "grad_norm": 0.9007242321968079, + "learning_rate": 4.708400591608966e-06, + "loss": 0.5785, + "step": 3267 + }, + { + "epoch": 0.9566744730679156, + "grad_norm": 0.9613141417503357, + "learning_rate": 4.7082200829981825e-06, + "loss": 0.6916, + "step": 3268 + }, + { + "epoch": 0.9569672131147541, + "grad_norm": 0.987189531326294, + "learning_rate": 4.708039521996954e-06, + "loss": 0.6749, + "step": 3269 + }, + { + "epoch": 0.9572599531615925, + "grad_norm": 0.9153044819831848, + "learning_rate": 4.707858908609563e-06, + "loss": 0.6747, + "step": 3270 + }, + { + "epoch": 0.957552693208431, + "grad_norm": 0.9929498434066772, + "learning_rate": 4.707678242840296e-06, + "loss": 0.6799, + "step": 3271 + }, + { + "epoch": 0.9578454332552693, + "grad_norm": 0.987177848815918, + "learning_rate": 4.707497524693438e-06, + "loss": 0.6704, + "step": 3272 + }, + { + "epoch": 0.9581381733021077, + "grad_norm": 0.9576061964035034, + "learning_rate": 4.707316754173277e-06, + "loss": 0.6101, + "step": 3273 + }, + { + "epoch": 0.9584309133489461, + "grad_norm": 0.9266015887260437, + "learning_rate": 4.707135931284103e-06, + "loss": 0.6635, + "step": 3274 + }, + { + "epoch": 0.9587236533957846, + "grad_norm": 0.9730335474014282, + "learning_rate": 4.706955056030204e-06, + "loss": 0.6502, + "step": 3275 + }, + { + "epoch": 0.9590163934426229, + "grad_norm": 0.9784179329872131, + "learning_rate": 4.706774128415872e-06, + "loss": 0.6492, + "step": 3276 + }, + { + "epoch": 0.9593091334894613, + "grad_norm": 0.9839978218078613, + "learning_rate": 4.7065931484454005e-06, + "loss": 0.6736, + "step": 3277 + }, + { + "epoch": 0.9596018735362998, + "grad_norm": 0.9594902396202087, + "learning_rate": 4.706412116123082e-06, + "loss": 0.6669, + "step": 3278 + }, + { + "epoch": 0.9598946135831382, + "grad_norm": 0.9361562132835388, + "learning_rate": 4.706231031453212e-06, + "loss": 0.6491, + "step": 3279 + }, + { + "epoch": 0.9601873536299765, + "grad_norm": 0.9986446499824524, + "learning_rate": 4.706049894440088e-06, + "loss": 0.6384, + "step": 3280 + }, + { + "epoch": 0.960480093676815, + "grad_norm": 0.987981379032135, + "learning_rate": 4.7058687050880055e-06, + "loss": 0.6684, + "step": 3281 + }, + { + "epoch": 0.9607728337236534, + "grad_norm": 0.9449546933174133, + "learning_rate": 4.705687463401265e-06, + "loss": 0.6067, + "step": 3282 + }, + { + "epoch": 0.9610655737704918, + "grad_norm": 0.9739958047866821, + "learning_rate": 4.705506169384165e-06, + "loss": 0.67, + "step": 3283 + }, + { + "epoch": 0.9613583138173302, + "grad_norm": 1.0053902864456177, + "learning_rate": 4.705324823041008e-06, + "loss": 0.6752, + "step": 3284 + }, + { + "epoch": 0.9616510538641686, + "grad_norm": 0.9739540219306946, + "learning_rate": 4.705143424376095e-06, + "loss": 0.6988, + "step": 3285 + }, + { + "epoch": 0.961943793911007, + "grad_norm": 0.9604346752166748, + "learning_rate": 4.704961973393731e-06, + "loss": 0.6413, + "step": 3286 + }, + { + "epoch": 0.9622365339578455, + "grad_norm": 0.9709678888320923, + "learning_rate": 4.704780470098221e-06, + "loss": 0.6972, + "step": 3287 + }, + { + "epoch": 0.9625292740046838, + "grad_norm": 0.9826647043228149, + "learning_rate": 4.704598914493871e-06, + "loss": 0.6438, + "step": 3288 + }, + { + "epoch": 0.9628220140515222, + "grad_norm": 0.951810359954834, + "learning_rate": 4.704417306584987e-06, + "loss": 0.6065, + "step": 3289 + }, + { + "epoch": 0.9631147540983607, + "grad_norm": 0.9276007413864136, + "learning_rate": 4.70423564637588e-06, + "loss": 0.6494, + "step": 3290 + }, + { + "epoch": 0.9634074941451991, + "grad_norm": 0.9323604106903076, + "learning_rate": 4.704053933870858e-06, + "loss": 0.6264, + "step": 3291 + }, + { + "epoch": 0.9637002341920374, + "grad_norm": 0.9194305539131165, + "learning_rate": 4.703872169074233e-06, + "loss": 0.6618, + "step": 3292 + }, + { + "epoch": 0.9639929742388759, + "grad_norm": 0.9467216730117798, + "learning_rate": 4.703690351990318e-06, + "loss": 0.6059, + "step": 3293 + }, + { + "epoch": 0.9642857142857143, + "grad_norm": 0.9517220854759216, + "learning_rate": 4.7035084826234256e-06, + "loss": 0.693, + "step": 3294 + }, + { + "epoch": 0.9645784543325527, + "grad_norm": 0.9642337560653687, + "learning_rate": 4.703326560977871e-06, + "loss": 0.6968, + "step": 3295 + }, + { + "epoch": 0.9648711943793911, + "grad_norm": 0.9542016983032227, + "learning_rate": 4.703144587057971e-06, + "loss": 0.631, + "step": 3296 + }, + { + "epoch": 0.9651639344262295, + "grad_norm": 1.0257647037506104, + "learning_rate": 4.702962560868042e-06, + "loss": 0.6478, + "step": 3297 + }, + { + "epoch": 0.9654566744730679, + "grad_norm": 0.9696040153503418, + "learning_rate": 4.702780482412404e-06, + "loss": 0.6489, + "step": 3298 + }, + { + "epoch": 0.9657494145199064, + "grad_norm": 0.9276673197746277, + "learning_rate": 4.702598351695374e-06, + "loss": 0.5997, + "step": 3299 + }, + { + "epoch": 0.9660421545667447, + "grad_norm": 1.0463097095489502, + "learning_rate": 4.702416168721276e-06, + "loss": 0.6637, + "step": 3300 + }, + { + "epoch": 0.9663348946135831, + "grad_norm": 1.0116034746170044, + "learning_rate": 4.702233933494431e-06, + "loss": 0.6678, + "step": 3301 + }, + { + "epoch": 0.9666276346604216, + "grad_norm": 0.9873026609420776, + "learning_rate": 4.702051646019164e-06, + "loss": 0.6241, + "step": 3302 + }, + { + "epoch": 0.96692037470726, + "grad_norm": 0.9990526437759399, + "learning_rate": 4.701869306299796e-06, + "loss": 0.6974, + "step": 3303 + }, + { + "epoch": 0.9672131147540983, + "grad_norm": 0.9976506233215332, + "learning_rate": 4.701686914340658e-06, + "loss": 0.6776, + "step": 3304 + }, + { + "epoch": 0.9675058548009368, + "grad_norm": 0.9315078854560852, + "learning_rate": 4.701504470146074e-06, + "loss": 0.6099, + "step": 3305 + }, + { + "epoch": 0.9677985948477752, + "grad_norm": 1.000731110572815, + "learning_rate": 4.701321973720374e-06, + "loss": 0.6575, + "step": 3306 + }, + { + "epoch": 0.9680913348946136, + "grad_norm": 0.9480735063552856, + "learning_rate": 4.701139425067886e-06, + "loss": 0.579, + "step": 3307 + }, + { + "epoch": 0.968384074941452, + "grad_norm": 0.9545005559921265, + "learning_rate": 4.7009568241929436e-06, + "loss": 0.6761, + "step": 3308 + }, + { + "epoch": 0.9686768149882904, + "grad_norm": 0.9412955641746521, + "learning_rate": 4.7007741710998774e-06, + "loss": 0.6323, + "step": 3309 + }, + { + "epoch": 0.9689695550351288, + "grad_norm": 0.9454843401908875, + "learning_rate": 4.700591465793021e-06, + "loss": 0.6546, + "step": 3310 + }, + { + "epoch": 0.9692622950819673, + "grad_norm": 0.9295659065246582, + "learning_rate": 4.700408708276709e-06, + "loss": 0.5857, + "step": 3311 + }, + { + "epoch": 0.9695550351288056, + "grad_norm": 0.9674099683761597, + "learning_rate": 4.700225898555278e-06, + "loss": 0.6593, + "step": 3312 + }, + { + "epoch": 0.969847775175644, + "grad_norm": 0.9598174691200256, + "learning_rate": 4.700043036633065e-06, + "loss": 0.6572, + "step": 3313 + }, + { + "epoch": 0.9701405152224825, + "grad_norm": 0.9429852962493896, + "learning_rate": 4.6998601225144095e-06, + "loss": 0.6308, + "step": 3314 + }, + { + "epoch": 0.9704332552693209, + "grad_norm": 0.96925288438797, + "learning_rate": 4.699677156203648e-06, + "loss": 0.6561, + "step": 3315 + }, + { + "epoch": 0.9707259953161592, + "grad_norm": 0.9553619027137756, + "learning_rate": 4.699494137705124e-06, + "loss": 0.662, + "step": 3316 + }, + { + "epoch": 0.9710187353629977, + "grad_norm": 0.9579572081565857, + "learning_rate": 4.699311067023179e-06, + "loss": 0.6761, + "step": 3317 + }, + { + "epoch": 0.9713114754098361, + "grad_norm": 0.9784052968025208, + "learning_rate": 4.699127944162156e-06, + "loss": 0.6706, + "step": 3318 + }, + { + "epoch": 0.9716042154566745, + "grad_norm": 0.9076412320137024, + "learning_rate": 4.6989447691264e-06, + "loss": 0.6889, + "step": 3319 + }, + { + "epoch": 0.9718969555035128, + "grad_norm": 0.9758455157279968, + "learning_rate": 4.698761541920258e-06, + "loss": 0.6657, + "step": 3320 + }, + { + "epoch": 0.9721896955503513, + "grad_norm": 0.9685584306716919, + "learning_rate": 4.6985782625480756e-06, + "loss": 0.6538, + "step": 3321 + }, + { + "epoch": 0.9724824355971897, + "grad_norm": 0.9250316023826599, + "learning_rate": 4.698394931014201e-06, + "loss": 0.5614, + "step": 3322 + }, + { + "epoch": 0.9727751756440282, + "grad_norm": 0.9688401222229004, + "learning_rate": 4.6982115473229845e-06, + "loss": 0.6335, + "step": 3323 + }, + { + "epoch": 0.9730679156908665, + "grad_norm": 1.0301059484481812, + "learning_rate": 4.698028111478777e-06, + "loss": 0.6616, + "step": 3324 + }, + { + "epoch": 0.9733606557377049, + "grad_norm": 1.0041865110397339, + "learning_rate": 4.69784462348593e-06, + "loss": 0.6558, + "step": 3325 + }, + { + "epoch": 0.9736533957845434, + "grad_norm": 0.8994332551956177, + "learning_rate": 4.697661083348797e-06, + "loss": 0.5757, + "step": 3326 + }, + { + "epoch": 0.9739461358313818, + "grad_norm": 0.9492050409317017, + "learning_rate": 4.697477491071733e-06, + "loss": 0.6513, + "step": 3327 + }, + { + "epoch": 0.9742388758782201, + "grad_norm": 0.9720810651779175, + "learning_rate": 4.697293846659092e-06, + "loss": 0.6956, + "step": 3328 + }, + { + "epoch": 0.9745316159250585, + "grad_norm": 0.971261203289032, + "learning_rate": 4.697110150115234e-06, + "loss": 0.6695, + "step": 3329 + }, + { + "epoch": 0.974824355971897, + "grad_norm": 0.9674195051193237, + "learning_rate": 4.696926401444515e-06, + "loss": 0.6298, + "step": 3330 + }, + { + "epoch": 0.9751170960187353, + "grad_norm": 0.9372227787971497, + "learning_rate": 4.6967426006512944e-06, + "loss": 0.6292, + "step": 3331 + }, + { + "epoch": 0.9754098360655737, + "grad_norm": 0.9633303880691528, + "learning_rate": 4.6965587477399345e-06, + "loss": 0.6366, + "step": 3332 + }, + { + "epoch": 0.9757025761124122, + "grad_norm": 0.9197907447814941, + "learning_rate": 4.696374842714796e-06, + "loss": 0.6178, + "step": 3333 + }, + { + "epoch": 0.9759953161592506, + "grad_norm": 0.9585869908332825, + "learning_rate": 4.696190885580242e-06, + "loss": 0.6437, + "step": 3334 + }, + { + "epoch": 0.9762880562060889, + "grad_norm": 0.9758283495903015, + "learning_rate": 4.696006876340637e-06, + "loss": 0.6588, + "step": 3335 + }, + { + "epoch": 0.9765807962529274, + "grad_norm": 0.9390897750854492, + "learning_rate": 4.695822815000348e-06, + "loss": 0.6436, + "step": 3336 + }, + { + "epoch": 0.9768735362997658, + "grad_norm": 0.9753897786140442, + "learning_rate": 4.69563870156374e-06, + "loss": 0.6103, + "step": 3337 + }, + { + "epoch": 0.9771662763466042, + "grad_norm": 0.955707848072052, + "learning_rate": 4.695454536035183e-06, + "loss": 0.6687, + "step": 3338 + }, + { + "epoch": 0.9774590163934426, + "grad_norm": 0.9084389209747314, + "learning_rate": 4.695270318419045e-06, + "loss": 0.597, + "step": 3339 + }, + { + "epoch": 0.977751756440281, + "grad_norm": 0.9070822596549988, + "learning_rate": 4.695086048719696e-06, + "loss": 0.648, + "step": 3340 + }, + { + "epoch": 0.9780444964871194, + "grad_norm": 0.9622007012367249, + "learning_rate": 4.6949017269415096e-06, + "loss": 0.6099, + "step": 3341 + }, + { + "epoch": 0.9783372365339579, + "grad_norm": 0.9337007999420166, + "learning_rate": 4.694717353088858e-06, + "loss": 0.6917, + "step": 3342 + }, + { + "epoch": 0.9786299765807962, + "grad_norm": 0.9614972472190857, + "learning_rate": 4.694532927166115e-06, + "loss": 0.6922, + "step": 3343 + }, + { + "epoch": 0.9789227166276346, + "grad_norm": 0.951202929019928, + "learning_rate": 4.6943484491776575e-06, + "loss": 0.6532, + "step": 3344 + }, + { + "epoch": 0.9792154566744731, + "grad_norm": 0.9451289772987366, + "learning_rate": 4.694163919127861e-06, + "loss": 0.6249, + "step": 3345 + }, + { + "epoch": 0.9795081967213115, + "grad_norm": 1.000924825668335, + "learning_rate": 4.693979337021104e-06, + "loss": 0.6957, + "step": 3346 + }, + { + "epoch": 0.9798009367681498, + "grad_norm": 0.9557392597198486, + "learning_rate": 4.693794702861766e-06, + "loss": 0.644, + "step": 3347 + }, + { + "epoch": 0.9800936768149883, + "grad_norm": 1.0248613357543945, + "learning_rate": 4.693610016654226e-06, + "loss": 0.6448, + "step": 3348 + }, + { + "epoch": 0.9803864168618267, + "grad_norm": 1.0025197267532349, + "learning_rate": 4.693425278402869e-06, + "loss": 0.6744, + "step": 3349 + }, + { + "epoch": 0.9806791569086651, + "grad_norm": 1.1022255420684814, + "learning_rate": 4.693240488112074e-06, + "loss": 0.6507, + "step": 3350 + }, + { + "epoch": 0.9809718969555035, + "grad_norm": 0.9352915287017822, + "learning_rate": 4.693055645786228e-06, + "loss": 0.6469, + "step": 3351 + }, + { + "epoch": 0.9812646370023419, + "grad_norm": 1.0563913583755493, + "learning_rate": 4.692870751429716e-06, + "loss": 0.6562, + "step": 3352 + }, + { + "epoch": 0.9815573770491803, + "grad_norm": 1.0124040842056274, + "learning_rate": 4.692685805046922e-06, + "loss": 0.6187, + "step": 3353 + }, + { + "epoch": 0.9818501170960188, + "grad_norm": 0.9763144254684448, + "learning_rate": 4.692500806642238e-06, + "loss": 0.6692, + "step": 3354 + }, + { + "epoch": 0.9821428571428571, + "grad_norm": 0.9265322089195251, + "learning_rate": 4.692315756220051e-06, + "loss": 0.6336, + "step": 3355 + }, + { + "epoch": 0.9824355971896955, + "grad_norm": 0.9644172787666321, + "learning_rate": 4.69213065378475e-06, + "loss": 0.6017, + "step": 3356 + }, + { + "epoch": 0.982728337236534, + "grad_norm": 0.955126941204071, + "learning_rate": 4.69194549934073e-06, + "loss": 0.6051, + "step": 3357 + }, + { + "epoch": 0.9830210772833724, + "grad_norm": 1.0247160196304321, + "learning_rate": 4.6917602928923815e-06, + "loss": 0.6621, + "step": 3358 + }, + { + "epoch": 0.9833138173302107, + "grad_norm": 1.039940357208252, + "learning_rate": 4.691575034444098e-06, + "loss": 0.6455, + "step": 3359 + }, + { + "epoch": 0.9836065573770492, + "grad_norm": 0.995488703250885, + "learning_rate": 4.691389724000276e-06, + "loss": 0.6519, + "step": 3360 + }, + { + "epoch": 0.9838992974238876, + "grad_norm": 0.963116466999054, + "learning_rate": 4.691204361565312e-06, + "loss": 0.6101, + "step": 3361 + }, + { + "epoch": 0.984192037470726, + "grad_norm": 0.9988937973976135, + "learning_rate": 4.691018947143603e-06, + "loss": 0.633, + "step": 3362 + }, + { + "epoch": 0.9844847775175644, + "grad_norm": 0.9758363962173462, + "learning_rate": 4.690833480739549e-06, + "loss": 0.656, + "step": 3363 + }, + { + "epoch": 0.9847775175644028, + "grad_norm": 0.9582085609436035, + "learning_rate": 4.690647962357551e-06, + "loss": 0.6855, + "step": 3364 + }, + { + "epoch": 0.9850702576112412, + "grad_norm": 0.9516425132751465, + "learning_rate": 4.690462392002008e-06, + "loss": 0.6374, + "step": 3365 + }, + { + "epoch": 0.9853629976580797, + "grad_norm": 1.0283949375152588, + "learning_rate": 4.690276769677324e-06, + "loss": 0.6628, + "step": 3366 + }, + { + "epoch": 0.985655737704918, + "grad_norm": 1.0134649276733398, + "learning_rate": 4.690091095387903e-06, + "loss": 0.6468, + "step": 3367 + }, + { + "epoch": 0.9859484777517564, + "grad_norm": 0.8922951817512512, + "learning_rate": 4.6899053691381495e-06, + "loss": 0.6373, + "step": 3368 + }, + { + "epoch": 0.9862412177985949, + "grad_norm": 0.9948383569717407, + "learning_rate": 4.689719590932471e-06, + "loss": 0.6431, + "step": 3369 + }, + { + "epoch": 0.9865339578454333, + "grad_norm": 0.9722495079040527, + "learning_rate": 4.689533760775274e-06, + "loss": 0.6748, + "step": 3370 + }, + { + "epoch": 0.9868266978922716, + "grad_norm": 0.9849606156349182, + "learning_rate": 4.689347878670968e-06, + "loss": 0.6503, + "step": 3371 + }, + { + "epoch": 0.9871194379391101, + "grad_norm": 0.9618024230003357, + "learning_rate": 4.6891619446239625e-06, + "loss": 0.6828, + "step": 3372 + }, + { + "epoch": 0.9874121779859485, + "grad_norm": 0.9560632705688477, + "learning_rate": 4.68897595863867e-06, + "loss": 0.6408, + "step": 3373 + }, + { + "epoch": 0.9877049180327869, + "grad_norm": 0.9290902018547058, + "learning_rate": 4.688789920719502e-06, + "loss": 0.636, + "step": 3374 + }, + { + "epoch": 0.9879976580796253, + "grad_norm": 0.9815626740455627, + "learning_rate": 4.688603830870873e-06, + "loss": 0.6724, + "step": 3375 + }, + { + "epoch": 0.9882903981264637, + "grad_norm": 0.9261895418167114, + "learning_rate": 4.688417689097198e-06, + "loss": 0.6279, + "step": 3376 + }, + { + "epoch": 0.9885831381733021, + "grad_norm": 0.9391101598739624, + "learning_rate": 4.688231495402892e-06, + "loss": 0.6197, + "step": 3377 + }, + { + "epoch": 0.9888758782201406, + "grad_norm": 1.0473748445510864, + "learning_rate": 4.688045249792373e-06, + "loss": 0.6357, + "step": 3378 + }, + { + "epoch": 0.9891686182669789, + "grad_norm": 0.9574573040008545, + "learning_rate": 4.687858952270061e-06, + "loss": 0.5956, + "step": 3379 + }, + { + "epoch": 0.9894613583138173, + "grad_norm": 0.9576648473739624, + "learning_rate": 4.687672602840375e-06, + "loss": 0.6234, + "step": 3380 + }, + { + "epoch": 0.9897540983606558, + "grad_norm": 0.9751240611076355, + "learning_rate": 4.687486201507736e-06, + "loss": 0.6515, + "step": 3381 + }, + { + "epoch": 0.9900468384074942, + "grad_norm": 0.9758044481277466, + "learning_rate": 4.687299748276567e-06, + "loss": 0.6527, + "step": 3382 + }, + { + "epoch": 0.9903395784543325, + "grad_norm": 0.9995319247245789, + "learning_rate": 4.6871132431512905e-06, + "loss": 0.6872, + "step": 3383 + }, + { + "epoch": 0.990632318501171, + "grad_norm": 0.9479180574417114, + "learning_rate": 4.686926686136333e-06, + "loss": 0.6815, + "step": 3384 + }, + { + "epoch": 0.9909250585480094, + "grad_norm": 0.9832482933998108, + "learning_rate": 4.686740077236118e-06, + "loss": 0.6793, + "step": 3385 + }, + { + "epoch": 0.9912177985948478, + "grad_norm": 0.9449059367179871, + "learning_rate": 4.686553416455077e-06, + "loss": 0.6887, + "step": 3386 + }, + { + "epoch": 0.9915105386416861, + "grad_norm": 1.0095418691635132, + "learning_rate": 4.686366703797634e-06, + "loss": 0.6838, + "step": 3387 + }, + { + "epoch": 0.9918032786885246, + "grad_norm": 0.9696465730667114, + "learning_rate": 4.686179939268222e-06, + "loss": 0.649, + "step": 3388 + }, + { + "epoch": 0.992096018735363, + "grad_norm": 0.9999851584434509, + "learning_rate": 4.685993122871271e-06, + "loss": 0.6275, + "step": 3389 + }, + { + "epoch": 0.9923887587822015, + "grad_norm": 0.9445211887359619, + "learning_rate": 4.685806254611212e-06, + "loss": 0.6409, + "step": 3390 + }, + { + "epoch": 0.9926814988290398, + "grad_norm": 1.02142333984375, + "learning_rate": 4.685619334492481e-06, + "loss": 0.6843, + "step": 3391 + }, + { + "epoch": 0.9929742388758782, + "grad_norm": 0.970629870891571, + "learning_rate": 4.685432362519511e-06, + "loss": 0.6694, + "step": 3392 + }, + { + "epoch": 0.9932669789227166, + "grad_norm": 0.9886816143989563, + "learning_rate": 4.6852453386967375e-06, + "loss": 0.6529, + "step": 3393 + }, + { + "epoch": 0.9935597189695551, + "grad_norm": 0.9553976655006409, + "learning_rate": 4.685058263028599e-06, + "loss": 0.6609, + "step": 3394 + }, + { + "epoch": 0.9938524590163934, + "grad_norm": 0.9757091403007507, + "learning_rate": 4.684871135519534e-06, + "loss": 0.6658, + "step": 3395 + }, + { + "epoch": 0.9941451990632318, + "grad_norm": 0.9436838030815125, + "learning_rate": 4.684683956173981e-06, + "loss": 0.5868, + "step": 3396 + }, + { + "epoch": 0.9944379391100703, + "grad_norm": 0.9792131185531616, + "learning_rate": 4.684496724996382e-06, + "loss": 0.6527, + "step": 3397 + }, + { + "epoch": 0.9947306791569087, + "grad_norm": 0.9682813286781311, + "learning_rate": 4.6843094419911785e-06, + "loss": 0.6338, + "step": 3398 + }, + { + "epoch": 0.995023419203747, + "grad_norm": 0.9791179895401001, + "learning_rate": 4.684122107162813e-06, + "loss": 0.6192, + "step": 3399 + }, + { + "epoch": 0.9953161592505855, + "grad_norm": 0.9722906351089478, + "learning_rate": 4.683934720515731e-06, + "loss": 0.6573, + "step": 3400 + }, + { + "epoch": 0.9956088992974239, + "grad_norm": 0.946898877620697, + "learning_rate": 4.683747282054379e-06, + "loss": 0.6483, + "step": 3401 + }, + { + "epoch": 0.9959016393442623, + "grad_norm": 0.9850074648857117, + "learning_rate": 4.683559791783202e-06, + "loss": 0.6388, + "step": 3402 + }, + { + "epoch": 0.9961943793911007, + "grad_norm": 0.9315775632858276, + "learning_rate": 4.68337224970665e-06, + "loss": 0.6536, + "step": 3403 + }, + { + "epoch": 0.9964871194379391, + "grad_norm": 0.9767241477966309, + "learning_rate": 4.683184655829173e-06, + "loss": 0.5957, + "step": 3404 + }, + { + "epoch": 0.9967798594847775, + "grad_norm": 0.9348752498626709, + "learning_rate": 4.6829970101552195e-06, + "loss": 0.6357, + "step": 3405 + }, + { + "epoch": 0.997072599531616, + "grad_norm": 0.9333168864250183, + "learning_rate": 4.682809312689243e-06, + "loss": 0.6148, + "step": 3406 + }, + { + "epoch": 0.9973653395784543, + "grad_norm": 0.9422442317008972, + "learning_rate": 4.682621563435695e-06, + "loss": 0.6523, + "step": 3407 + }, + { + "epoch": 0.9976580796252927, + "grad_norm": 0.9584015011787415, + "learning_rate": 4.682433762399033e-06, + "loss": 0.6427, + "step": 3408 + }, + { + "epoch": 0.9979508196721312, + "grad_norm": 0.9711824059486389, + "learning_rate": 4.682245909583709e-06, + "loss": 0.6749, + "step": 3409 + }, + { + "epoch": 0.9982435597189696, + "grad_norm": 1.0185067653656006, + "learning_rate": 4.682058004994182e-06, + "loss": 0.6567, + "step": 3410 + }, + { + "epoch": 0.9985362997658079, + "grad_norm": 0.9264505505561829, + "learning_rate": 4.68187004863491e-06, + "loss": 0.635, + "step": 3411 + }, + { + "epoch": 0.9988290398126464, + "grad_norm": 0.864705502986908, + "learning_rate": 4.681682040510351e-06, + "loss": 0.6196, + "step": 3412 + }, + { + "epoch": 0.9991217798594848, + "grad_norm": 1.014452576637268, + "learning_rate": 4.681493980624968e-06, + "loss": 0.6446, + "step": 3413 + }, + { + "epoch": 0.9994145199063232, + "grad_norm": 0.9638717770576477, + "learning_rate": 4.681305868983221e-06, + "loss": 0.6555, + "step": 3414 + }, + { + "epoch": 0.9997072599531616, + "grad_norm": 0.9754822850227356, + "learning_rate": 4.6811177055895715e-06, + "loss": 0.6386, + "step": 3415 + }, + { + "epoch": 1.0, + "grad_norm": 1.0275006294250488, + "learning_rate": 4.6809294904484866e-06, + "loss": 0.691, + "step": 3416 + }, + { + "epoch": 1.0002927400468384, + "grad_norm": 0.9294849634170532, + "learning_rate": 4.68074122356443e-06, + "loss": 0.5809, + "step": 3417 + }, + { + "epoch": 1.0005854800936769, + "grad_norm": 0.9318827390670776, + "learning_rate": 4.680552904941869e-06, + "loss": 0.6648, + "step": 3418 + }, + { + "epoch": 1.0008782201405153, + "grad_norm": 0.8910863399505615, + "learning_rate": 4.680364534585272e-06, + "loss": 0.6081, + "step": 3419 + }, + { + "epoch": 1.0011709601873535, + "grad_norm": 0.9459846615791321, + "learning_rate": 4.680176112499108e-06, + "loss": 0.6137, + "step": 3420 + }, + { + "epoch": 1.001463700234192, + "grad_norm": 0.9313222765922546, + "learning_rate": 4.679987638687845e-06, + "loss": 0.6114, + "step": 3421 + }, + { + "epoch": 1.0017564402810304, + "grad_norm": 0.9752010703086853, + "learning_rate": 4.6797991131559575e-06, + "loss": 0.6116, + "step": 3422 + }, + { + "epoch": 1.0020491803278688, + "grad_norm": 0.977264940738678, + "learning_rate": 4.6796105359079175e-06, + "loss": 0.5788, + "step": 3423 + }, + { + "epoch": 1.0023419203747073, + "grad_norm": 0.9640780091285706, + "learning_rate": 4.679421906948198e-06, + "loss": 0.5956, + "step": 3424 + }, + { + "epoch": 1.0026346604215457, + "grad_norm": 0.9548678994178772, + "learning_rate": 4.679233226281276e-06, + "loss": 0.6251, + "step": 3425 + }, + { + "epoch": 1.0029274004683841, + "grad_norm": 1.0262401103973389, + "learning_rate": 4.679044493911627e-06, + "loss": 0.6404, + "step": 3426 + }, + { + "epoch": 1.0032201405152226, + "grad_norm": 1.0050238370895386, + "learning_rate": 4.678855709843728e-06, + "loss": 0.6499, + "step": 3427 + }, + { + "epoch": 1.0035128805620608, + "grad_norm": 0.9346453547477722, + "learning_rate": 4.6786668740820585e-06, + "loss": 0.577, + "step": 3428 + }, + { + "epoch": 1.0038056206088992, + "grad_norm": 0.977109968662262, + "learning_rate": 4.678477986631099e-06, + "loss": 0.6187, + "step": 3429 + }, + { + "epoch": 1.0040983606557377, + "grad_norm": 0.9589594006538391, + "learning_rate": 4.678289047495332e-06, + "loss": 0.656, + "step": 3430 + }, + { + "epoch": 1.004391100702576, + "grad_norm": 0.9249182343482971, + "learning_rate": 4.678100056679238e-06, + "loss": 0.6238, + "step": 3431 + }, + { + "epoch": 1.0046838407494145, + "grad_norm": 0.9039417505264282, + "learning_rate": 4.677911014187302e-06, + "loss": 0.5823, + "step": 3432 + }, + { + "epoch": 1.004976580796253, + "grad_norm": 0.9997001886367798, + "learning_rate": 4.6777219200240075e-06, + "loss": 0.6255, + "step": 3433 + }, + { + "epoch": 1.0052693208430914, + "grad_norm": 1.002948522567749, + "learning_rate": 4.677532774193844e-06, + "loss": 0.6005, + "step": 3434 + }, + { + "epoch": 1.0055620608899298, + "grad_norm": 1.0168336629867554, + "learning_rate": 4.677343576701296e-06, + "loss": 0.6072, + "step": 3435 + }, + { + "epoch": 1.005854800936768, + "grad_norm": 0.950864851474762, + "learning_rate": 4.6771543275508525e-06, + "loss": 0.6053, + "step": 3436 + }, + { + "epoch": 1.0061475409836065, + "grad_norm": 0.9901363253593445, + "learning_rate": 4.676965026747005e-06, + "loss": 0.6738, + "step": 3437 + }, + { + "epoch": 1.006440281030445, + "grad_norm": 0.9605069160461426, + "learning_rate": 4.676775674294245e-06, + "loss": 0.5975, + "step": 3438 + }, + { + "epoch": 1.0067330210772834, + "grad_norm": 1.0218255519866943, + "learning_rate": 4.676586270197063e-06, + "loss": 0.6479, + "step": 3439 + }, + { + "epoch": 1.0070257611241218, + "grad_norm": 1.0479927062988281, + "learning_rate": 4.676396814459954e-06, + "loss": 0.6502, + "step": 3440 + }, + { + "epoch": 1.0073185011709602, + "grad_norm": 0.9220239520072937, + "learning_rate": 4.676207307087412e-06, + "loss": 0.5864, + "step": 3441 + }, + { + "epoch": 1.0076112412177987, + "grad_norm": 1.0032938718795776, + "learning_rate": 4.676017748083933e-06, + "loss": 0.6691, + "step": 3442 + }, + { + "epoch": 1.007903981264637, + "grad_norm": 1.0282138586044312, + "learning_rate": 4.675828137454016e-06, + "loss": 0.5974, + "step": 3443 + }, + { + "epoch": 1.0081967213114753, + "grad_norm": 0.983674168586731, + "learning_rate": 4.675638475202158e-06, + "loss": 0.6524, + "step": 3444 + }, + { + "epoch": 1.0084894613583137, + "grad_norm": 0.9158604741096497, + "learning_rate": 4.675448761332859e-06, + "loss": 0.6209, + "step": 3445 + }, + { + "epoch": 1.0087822014051522, + "grad_norm": 0.9210212826728821, + "learning_rate": 4.67525899585062e-06, + "loss": 0.5992, + "step": 3446 + }, + { + "epoch": 1.0090749414519906, + "grad_norm": 0.9971821308135986, + "learning_rate": 4.675069178759944e-06, + "loss": 0.6735, + "step": 3447 + }, + { + "epoch": 1.009367681498829, + "grad_norm": 0.9782065153121948, + "learning_rate": 4.674879310065333e-06, + "loss": 0.6133, + "step": 3448 + }, + { + "epoch": 1.0096604215456675, + "grad_norm": 0.9281617403030396, + "learning_rate": 4.674689389771294e-06, + "loss": 0.5803, + "step": 3449 + }, + { + "epoch": 1.009953161592506, + "grad_norm": 1.042849063873291, + "learning_rate": 4.67449941788233e-06, + "loss": 0.5965, + "step": 3450 + }, + { + "epoch": 1.0102459016393444, + "grad_norm": 0.9914324283599854, + "learning_rate": 4.674309394402951e-06, + "loss": 0.6543, + "step": 3451 + }, + { + "epoch": 1.0105386416861826, + "grad_norm": 0.9548019766807556, + "learning_rate": 4.674119319337662e-06, + "loss": 0.6075, + "step": 3452 + }, + { + "epoch": 1.010831381733021, + "grad_norm": 1.0310802459716797, + "learning_rate": 4.673929192690975e-06, + "loss": 0.6456, + "step": 3453 + }, + { + "epoch": 1.0111241217798594, + "grad_norm": 0.942043125629425, + "learning_rate": 4.673739014467401e-06, + "loss": 0.5701, + "step": 3454 + }, + { + "epoch": 1.0114168618266979, + "grad_norm": 1.0200700759887695, + "learning_rate": 4.673548784671451e-06, + "loss": 0.6708, + "step": 3455 + }, + { + "epoch": 1.0117096018735363, + "grad_norm": 1.0198854207992554, + "learning_rate": 4.673358503307638e-06, + "loss": 0.5951, + "step": 3456 + }, + { + "epoch": 1.0120023419203747, + "grad_norm": 0.9643044471740723, + "learning_rate": 4.673168170380478e-06, + "loss": 0.6582, + "step": 3457 + }, + { + "epoch": 1.0122950819672132, + "grad_norm": 0.8908067941665649, + "learning_rate": 4.672977785894485e-06, + "loss": 0.5536, + "step": 3458 + }, + { + "epoch": 1.0125878220140516, + "grad_norm": 0.915847897529602, + "learning_rate": 4.672787349854176e-06, + "loss": 0.61, + "step": 3459 + }, + { + "epoch": 1.0128805620608898, + "grad_norm": 0.9995766878128052, + "learning_rate": 4.672596862264071e-06, + "loss": 0.6058, + "step": 3460 + }, + { + "epoch": 1.0131733021077283, + "grad_norm": 0.9669291973114014, + "learning_rate": 4.672406323128687e-06, + "loss": 0.6505, + "step": 3461 + }, + { + "epoch": 1.0134660421545667, + "grad_norm": 1.0363949537277222, + "learning_rate": 4.672215732452546e-06, + "loss": 0.6543, + "step": 3462 + }, + { + "epoch": 1.0137587822014051, + "grad_norm": 1.0391266345977783, + "learning_rate": 4.672025090240169e-06, + "loss": 0.597, + "step": 3463 + }, + { + "epoch": 1.0140515222482436, + "grad_norm": 0.9991456866264343, + "learning_rate": 4.67183439649608e-06, + "loss": 0.6416, + "step": 3464 + }, + { + "epoch": 1.014344262295082, + "grad_norm": 0.9692447781562805, + "learning_rate": 4.671643651224802e-06, + "loss": 0.6345, + "step": 3465 + }, + { + "epoch": 1.0146370023419204, + "grad_norm": 1.0430585145950317, + "learning_rate": 4.6714528544308615e-06, + "loss": 0.6658, + "step": 3466 + }, + { + "epoch": 1.0149297423887589, + "grad_norm": 0.9613690376281738, + "learning_rate": 4.6712620061187855e-06, + "loss": 0.6539, + "step": 3467 + }, + { + "epoch": 1.015222482435597, + "grad_norm": 1.0261386632919312, + "learning_rate": 4.6710711062931006e-06, + "loss": 0.6156, + "step": 3468 + }, + { + "epoch": 1.0155152224824355, + "grad_norm": 0.9881714582443237, + "learning_rate": 4.670880154958337e-06, + "loss": 0.6271, + "step": 3469 + }, + { + "epoch": 1.015807962529274, + "grad_norm": 0.9575268626213074, + "learning_rate": 4.670689152119024e-06, + "loss": 0.5851, + "step": 3470 + }, + { + "epoch": 1.0161007025761124, + "grad_norm": 1.0513452291488647, + "learning_rate": 4.6704980977796944e-06, + "loss": 0.657, + "step": 3471 + }, + { + "epoch": 1.0163934426229508, + "grad_norm": 0.9531498551368713, + "learning_rate": 4.67030699194488e-06, + "loss": 0.6181, + "step": 3472 + }, + { + "epoch": 1.0166861826697893, + "grad_norm": 0.9274744391441345, + "learning_rate": 4.670115834619116e-06, + "loss": 0.5771, + "step": 3473 + }, + { + "epoch": 1.0169789227166277, + "grad_norm": 0.9267966747283936, + "learning_rate": 4.669924625806936e-06, + "loss": 0.6608, + "step": 3474 + }, + { + "epoch": 1.0172716627634661, + "grad_norm": 1.0028561353683472, + "learning_rate": 4.669733365512878e-06, + "loss": 0.6316, + "step": 3475 + }, + { + "epoch": 1.0175644028103044, + "grad_norm": 0.9614577293395996, + "learning_rate": 4.669542053741478e-06, + "loss": 0.6338, + "step": 3476 + }, + { + "epoch": 1.0178571428571428, + "grad_norm": 0.9533283114433289, + "learning_rate": 4.669350690497277e-06, + "loss": 0.669, + "step": 3477 + }, + { + "epoch": 1.0181498829039812, + "grad_norm": 0.8903689384460449, + "learning_rate": 4.669159275784814e-06, + "loss": 0.5884, + "step": 3478 + }, + { + "epoch": 1.0184426229508197, + "grad_norm": 0.9621901512145996, + "learning_rate": 4.66896780960863e-06, + "loss": 0.6556, + "step": 3479 + }, + { + "epoch": 1.018735362997658, + "grad_norm": 0.9643514752388, + "learning_rate": 4.6687762919732686e-06, + "loss": 0.626, + "step": 3480 + }, + { + "epoch": 1.0190281030444965, + "grad_norm": 0.9368652701377869, + "learning_rate": 4.668584722883272e-06, + "loss": 0.5715, + "step": 3481 + }, + { + "epoch": 1.019320843091335, + "grad_norm": 1.0116091966629028, + "learning_rate": 4.668393102343187e-06, + "loss": 0.6586, + "step": 3482 + }, + { + "epoch": 1.0196135831381734, + "grad_norm": 0.9675971865653992, + "learning_rate": 4.66820143035756e-06, + "loss": 0.615, + "step": 3483 + }, + { + "epoch": 1.0199063231850116, + "grad_norm": 0.9168434143066406, + "learning_rate": 4.668009706930936e-06, + "loss": 0.6087, + "step": 3484 + }, + { + "epoch": 1.02019906323185, + "grad_norm": 0.9497443437576294, + "learning_rate": 4.667817932067866e-06, + "loss": 0.588, + "step": 3485 + }, + { + "epoch": 1.0204918032786885, + "grad_norm": 1.0043973922729492, + "learning_rate": 4.667626105772898e-06, + "loss": 0.6177, + "step": 3486 + }, + { + "epoch": 1.020784543325527, + "grad_norm": 0.9665338397026062, + "learning_rate": 4.667434228050585e-06, + "loss": 0.6281, + "step": 3487 + }, + { + "epoch": 1.0210772833723654, + "grad_norm": 0.9660483598709106, + "learning_rate": 4.667242298905479e-06, + "loss": 0.6114, + "step": 3488 + }, + { + "epoch": 1.0213700234192038, + "grad_norm": 0.9133903980255127, + "learning_rate": 4.667050318342132e-06, + "loss": 0.5828, + "step": 3489 + }, + { + "epoch": 1.0216627634660422, + "grad_norm": 1.0473170280456543, + "learning_rate": 4.666858286365101e-06, + "loss": 0.6432, + "step": 3490 + }, + { + "epoch": 1.0219555035128807, + "grad_norm": 0.9807840585708618, + "learning_rate": 4.66666620297894e-06, + "loss": 0.6253, + "step": 3491 + }, + { + "epoch": 1.0222482435597189, + "grad_norm": 0.967452347278595, + "learning_rate": 4.666474068188207e-06, + "loss": 0.6668, + "step": 3492 + }, + { + "epoch": 1.0225409836065573, + "grad_norm": 0.9884018898010254, + "learning_rate": 4.666281881997461e-06, + "loss": 0.6104, + "step": 3493 + }, + { + "epoch": 1.0228337236533958, + "grad_norm": 1.0131282806396484, + "learning_rate": 4.666089644411261e-06, + "loss": 0.6015, + "step": 3494 + }, + { + "epoch": 1.0231264637002342, + "grad_norm": 0.9640107750892639, + "learning_rate": 4.665897355434168e-06, + "loss": 0.6556, + "step": 3495 + }, + { + "epoch": 1.0234192037470726, + "grad_norm": 0.9967181086540222, + "learning_rate": 4.6657050150707435e-06, + "loss": 0.6693, + "step": 3496 + }, + { + "epoch": 1.023711943793911, + "grad_norm": 0.9532243013381958, + "learning_rate": 4.665512623325552e-06, + "loss": 0.6358, + "step": 3497 + }, + { + "epoch": 1.0240046838407495, + "grad_norm": 0.9948485493659973, + "learning_rate": 4.665320180203158e-06, + "loss": 0.6482, + "step": 3498 + }, + { + "epoch": 1.024297423887588, + "grad_norm": 0.9626889228820801, + "learning_rate": 4.665127685708126e-06, + "loss": 0.6215, + "step": 3499 + }, + { + "epoch": 1.0245901639344261, + "grad_norm": 0.9809090495109558, + "learning_rate": 4.664935139845025e-06, + "loss": 0.6221, + "step": 3500 + }, + { + "epoch": 1.0248829039812646, + "grad_norm": 1.015590786933899, + "learning_rate": 4.66474254261842e-06, + "loss": 0.6757, + "step": 3501 + }, + { + "epoch": 1.025175644028103, + "grad_norm": 0.9790574908256531, + "learning_rate": 4.6645498940328834e-06, + "loss": 0.6168, + "step": 3502 + }, + { + "epoch": 1.0254683840749415, + "grad_norm": 0.9292106032371521, + "learning_rate": 4.664357194092984e-06, + "loss": 0.6207, + "step": 3503 + }, + { + "epoch": 1.0257611241217799, + "grad_norm": 0.9536728858947754, + "learning_rate": 4.664164442803295e-06, + "loss": 0.5912, + "step": 3504 + }, + { + "epoch": 1.0260538641686183, + "grad_norm": 1.351918339729309, + "learning_rate": 4.663971640168389e-06, + "loss": 0.5845, + "step": 3505 + }, + { + "epoch": 1.0263466042154568, + "grad_norm": 0.9802219867706299, + "learning_rate": 4.66377878619284e-06, + "loss": 0.6677, + "step": 3506 + }, + { + "epoch": 1.026639344262295, + "grad_norm": 1.0099585056304932, + "learning_rate": 4.663585880881223e-06, + "loss": 0.6209, + "step": 3507 + }, + { + "epoch": 1.0269320843091334, + "grad_norm": 0.9582974314689636, + "learning_rate": 4.663392924238116e-06, + "loss": 0.5942, + "step": 3508 + }, + { + "epoch": 1.0272248243559718, + "grad_norm": 0.9955439567565918, + "learning_rate": 4.663199916268095e-06, + "loss": 0.6247, + "step": 3509 + }, + { + "epoch": 1.0275175644028103, + "grad_norm": 0.9615540504455566, + "learning_rate": 4.663006856975742e-06, + "loss": 0.6246, + "step": 3510 + }, + { + "epoch": 1.0278103044496487, + "grad_norm": 1.021417498588562, + "learning_rate": 4.662813746365635e-06, + "loss": 0.6603, + "step": 3511 + }, + { + "epoch": 1.0281030444964872, + "grad_norm": 0.9495982527732849, + "learning_rate": 4.6626205844423555e-06, + "loss": 0.6336, + "step": 3512 + }, + { + "epoch": 1.0283957845433256, + "grad_norm": 1.0012246370315552, + "learning_rate": 4.6624273712104884e-06, + "loss": 0.6613, + "step": 3513 + }, + { + "epoch": 1.028688524590164, + "grad_norm": 0.9803540706634521, + "learning_rate": 4.662234106674616e-06, + "loss": 0.6016, + "step": 3514 + }, + { + "epoch": 1.0289812646370022, + "grad_norm": 0.9454165697097778, + "learning_rate": 4.662040790839324e-06, + "loss": 0.6453, + "step": 3515 + }, + { + "epoch": 1.0292740046838407, + "grad_norm": 0.9704473614692688, + "learning_rate": 4.661847423709198e-06, + "loss": 0.5753, + "step": 3516 + }, + { + "epoch": 1.029566744730679, + "grad_norm": 1.0088720321655273, + "learning_rate": 4.661654005288828e-06, + "loss": 0.6579, + "step": 3517 + }, + { + "epoch": 1.0298594847775175, + "grad_norm": 1.023005485534668, + "learning_rate": 4.661460535582801e-06, + "loss": 0.6169, + "step": 3518 + }, + { + "epoch": 1.030152224824356, + "grad_norm": 1.0418821573257446, + "learning_rate": 4.661267014595707e-06, + "loss": 0.6299, + "step": 3519 + }, + { + "epoch": 1.0304449648711944, + "grad_norm": 0.976065456867218, + "learning_rate": 4.661073442332139e-06, + "loss": 0.6486, + "step": 3520 + }, + { + "epoch": 1.0307377049180328, + "grad_norm": 1.0279289484024048, + "learning_rate": 4.660879818796688e-06, + "loss": 0.6033, + "step": 3521 + }, + { + "epoch": 1.0310304449648713, + "grad_norm": 1.0478489398956299, + "learning_rate": 4.660686143993947e-06, + "loss": 0.6353, + "step": 3522 + }, + { + "epoch": 1.0313231850117095, + "grad_norm": 0.9574486017227173, + "learning_rate": 4.660492417928513e-06, + "loss": 0.6032, + "step": 3523 + }, + { + "epoch": 1.031615925058548, + "grad_norm": 1.0010905265808105, + "learning_rate": 4.660298640604981e-06, + "loss": 0.6232, + "step": 3524 + }, + { + "epoch": 1.0319086651053864, + "grad_norm": 0.9590752124786377, + "learning_rate": 4.660104812027949e-06, + "loss": 0.6313, + "step": 3525 + }, + { + "epoch": 1.0322014051522248, + "grad_norm": 0.9208690524101257, + "learning_rate": 4.659910932202015e-06, + "loss": 0.6172, + "step": 3526 + }, + { + "epoch": 1.0324941451990632, + "grad_norm": 1.020134687423706, + "learning_rate": 4.6597170011317795e-06, + "loss": 0.5749, + "step": 3527 + }, + { + "epoch": 1.0327868852459017, + "grad_norm": 1.0431705713272095, + "learning_rate": 4.659523018821843e-06, + "loss": 0.6141, + "step": 3528 + }, + { + "epoch": 1.0330796252927401, + "grad_norm": 0.986928403377533, + "learning_rate": 4.659328985276809e-06, + "loss": 0.6393, + "step": 3529 + }, + { + "epoch": 1.0333723653395785, + "grad_norm": 1.0540739297866821, + "learning_rate": 4.659134900501278e-06, + "loss": 0.6048, + "step": 3530 + }, + { + "epoch": 1.0336651053864168, + "grad_norm": 0.97709059715271, + "learning_rate": 4.658940764499858e-06, + "loss": 0.6219, + "step": 3531 + }, + { + "epoch": 1.0339578454332552, + "grad_norm": 0.9474360942840576, + "learning_rate": 4.658746577277153e-06, + "loss": 0.5892, + "step": 3532 + }, + { + "epoch": 1.0342505854800936, + "grad_norm": 0.9502942562103271, + "learning_rate": 4.658552338837772e-06, + "loss": 0.6249, + "step": 3533 + }, + { + "epoch": 1.034543325526932, + "grad_norm": 0.9669580459594727, + "learning_rate": 4.658358049186321e-06, + "loss": 0.6538, + "step": 3534 + }, + { + "epoch": 1.0348360655737705, + "grad_norm": 0.9346634745597839, + "learning_rate": 4.65816370832741e-06, + "loss": 0.608, + "step": 3535 + }, + { + "epoch": 1.035128805620609, + "grad_norm": 1.0085246562957764, + "learning_rate": 4.657969316265652e-06, + "loss": 0.6166, + "step": 3536 + }, + { + "epoch": 1.0354215456674474, + "grad_norm": 0.9804354906082153, + "learning_rate": 4.657774873005656e-06, + "loss": 0.69, + "step": 3537 + }, + { + "epoch": 1.0357142857142858, + "grad_norm": 0.9951425194740295, + "learning_rate": 4.657580378552037e-06, + "loss": 0.5644, + "step": 3538 + }, + { + "epoch": 1.036007025761124, + "grad_norm": 0.9397252202033997, + "learning_rate": 4.657385832909409e-06, + "loss": 0.6114, + "step": 3539 + }, + { + "epoch": 1.0362997658079625, + "grad_norm": 0.9702205061912537, + "learning_rate": 4.6571912360823875e-06, + "loss": 0.6201, + "step": 3540 + }, + { + "epoch": 1.036592505854801, + "grad_norm": 1.027286410331726, + "learning_rate": 4.65699658807559e-06, + "loss": 0.5843, + "step": 3541 + }, + { + "epoch": 1.0368852459016393, + "grad_norm": 0.9743898510932922, + "learning_rate": 4.656801888893634e-06, + "loss": 0.612, + "step": 3542 + }, + { + "epoch": 1.0371779859484778, + "grad_norm": 1.0097333192825317, + "learning_rate": 4.656607138541138e-06, + "loss": 0.6406, + "step": 3543 + }, + { + "epoch": 1.0374707259953162, + "grad_norm": 1.2327184677124023, + "learning_rate": 4.656412337022724e-06, + "loss": 0.6519, + "step": 3544 + }, + { + "epoch": 1.0377634660421546, + "grad_norm": 1.0379366874694824, + "learning_rate": 4.656217484343013e-06, + "loss": 0.6631, + "step": 3545 + }, + { + "epoch": 1.038056206088993, + "grad_norm": 1.0135167837142944, + "learning_rate": 4.6560225805066275e-06, + "loss": 0.627, + "step": 3546 + }, + { + "epoch": 1.0383489461358313, + "grad_norm": 0.9697626233100891, + "learning_rate": 4.655827625518192e-06, + "loss": 0.5682, + "step": 3547 + }, + { + "epoch": 1.0386416861826697, + "grad_norm": 1.0123093128204346, + "learning_rate": 4.6556326193823335e-06, + "loss": 0.6623, + "step": 3548 + }, + { + "epoch": 1.0389344262295082, + "grad_norm": 1.0033255815505981, + "learning_rate": 4.655437562103675e-06, + "loss": 0.6686, + "step": 3549 + }, + { + "epoch": 1.0392271662763466, + "grad_norm": 0.9571719169616699, + "learning_rate": 4.655242453686847e-06, + "loss": 0.6282, + "step": 3550 + }, + { + "epoch": 1.039519906323185, + "grad_norm": 0.9800928831100464, + "learning_rate": 4.6550472941364775e-06, + "loss": 0.6103, + "step": 3551 + }, + { + "epoch": 1.0398126463700235, + "grad_norm": 1.047980546951294, + "learning_rate": 4.654852083457198e-06, + "loss": 0.6692, + "step": 3552 + }, + { + "epoch": 1.040105386416862, + "grad_norm": 1.0058144330978394, + "learning_rate": 4.654656821653637e-06, + "loss": 0.6276, + "step": 3553 + }, + { + "epoch": 1.0403981264637003, + "grad_norm": 0.9339274168014526, + "learning_rate": 4.65446150873043e-06, + "loss": 0.5836, + "step": 3554 + }, + { + "epoch": 1.0406908665105385, + "grad_norm": 0.9896802306175232, + "learning_rate": 4.65426614469221e-06, + "loss": 0.7002, + "step": 3555 + }, + { + "epoch": 1.040983606557377, + "grad_norm": 0.9916583299636841, + "learning_rate": 4.654070729543611e-06, + "loss": 0.6377, + "step": 3556 + }, + { + "epoch": 1.0412763466042154, + "grad_norm": 0.9340778589248657, + "learning_rate": 4.653875263289271e-06, + "loss": 0.6146, + "step": 3557 + }, + { + "epoch": 1.0415690866510539, + "grad_norm": 0.9689563512802124, + "learning_rate": 4.6536797459338256e-06, + "loss": 0.5913, + "step": 3558 + }, + { + "epoch": 1.0418618266978923, + "grad_norm": 0.9506902694702148, + "learning_rate": 4.653484177481915e-06, + "loss": 0.6387, + "step": 3559 + }, + { + "epoch": 1.0421545667447307, + "grad_norm": 0.9730144143104553, + "learning_rate": 4.6532885579381795e-06, + "loss": 0.616, + "step": 3560 + }, + { + "epoch": 1.0424473067915692, + "grad_norm": 1.008414626121521, + "learning_rate": 4.6530928873072575e-06, + "loss": 0.6456, + "step": 3561 + }, + { + "epoch": 1.0427400468384076, + "grad_norm": 0.9603182077407837, + "learning_rate": 4.6528971655937935e-06, + "loss": 0.6245, + "step": 3562 + }, + { + "epoch": 1.0430327868852458, + "grad_norm": 0.9842037558555603, + "learning_rate": 4.652701392802432e-06, + "loss": 0.6343, + "step": 3563 + }, + { + "epoch": 1.0433255269320842, + "grad_norm": 0.9258037209510803, + "learning_rate": 4.652505568937815e-06, + "loss": 0.6229, + "step": 3564 + }, + { + "epoch": 1.0436182669789227, + "grad_norm": 0.9779812693595886, + "learning_rate": 4.652309694004591e-06, + "loss": 0.6132, + "step": 3565 + }, + { + "epoch": 1.0439110070257611, + "grad_norm": 0.9854347109794617, + "learning_rate": 4.652113768007405e-06, + "loss": 0.6138, + "step": 3566 + }, + { + "epoch": 1.0442037470725996, + "grad_norm": 1.0477056503295898, + "learning_rate": 4.651917790950906e-06, + "loss": 0.641, + "step": 3567 + }, + { + "epoch": 1.044496487119438, + "grad_norm": 0.9457709789276123, + "learning_rate": 4.651721762839745e-06, + "loss": 0.5464, + "step": 3568 + }, + { + "epoch": 1.0447892271662764, + "grad_norm": 0.9523487091064453, + "learning_rate": 4.651525683678572e-06, + "loss": 0.6265, + "step": 3569 + }, + { + "epoch": 1.0450819672131149, + "grad_norm": 0.9850186109542847, + "learning_rate": 4.651329553472038e-06, + "loss": 0.6357, + "step": 3570 + }, + { + "epoch": 1.045374707259953, + "grad_norm": 0.9517174959182739, + "learning_rate": 4.651133372224797e-06, + "loss": 0.6144, + "step": 3571 + }, + { + "epoch": 1.0456674473067915, + "grad_norm": 0.9593577980995178, + "learning_rate": 4.650937139941503e-06, + "loss": 0.6087, + "step": 3572 + }, + { + "epoch": 1.04596018735363, + "grad_norm": 0.9382795691490173, + "learning_rate": 4.650740856626814e-06, + "loss": 0.6458, + "step": 3573 + }, + { + "epoch": 1.0462529274004684, + "grad_norm": 0.9746546745300293, + "learning_rate": 4.650544522285383e-06, + "loss": 0.6325, + "step": 3574 + }, + { + "epoch": 1.0465456674473068, + "grad_norm": 0.9496377110481262, + "learning_rate": 4.650348136921871e-06, + "loss": 0.6031, + "step": 3575 + }, + { + "epoch": 1.0468384074941453, + "grad_norm": 0.9095340967178345, + "learning_rate": 4.650151700540936e-06, + "loss": 0.5732, + "step": 3576 + }, + { + "epoch": 1.0471311475409837, + "grad_norm": 0.9785987138748169, + "learning_rate": 4.649955213147239e-06, + "loss": 0.6286, + "step": 3577 + }, + { + "epoch": 1.0474238875878221, + "grad_norm": 0.9211812615394592, + "learning_rate": 4.649758674745441e-06, + "loss": 0.5946, + "step": 3578 + }, + { + "epoch": 1.0477166276346603, + "grad_norm": 0.9830271005630493, + "learning_rate": 4.6495620853402065e-06, + "loss": 0.6059, + "step": 3579 + }, + { + "epoch": 1.0480093676814988, + "grad_norm": 0.9868259429931641, + "learning_rate": 4.649365444936198e-06, + "loss": 0.5961, + "step": 3580 + }, + { + "epoch": 1.0483021077283372, + "grad_norm": 0.9503172039985657, + "learning_rate": 4.649168753538081e-06, + "loss": 0.6006, + "step": 3581 + }, + { + "epoch": 1.0485948477751756, + "grad_norm": 0.9922894835472107, + "learning_rate": 4.648972011150523e-06, + "loss": 0.6566, + "step": 3582 + }, + { + "epoch": 1.048887587822014, + "grad_norm": 1.0016425848007202, + "learning_rate": 4.648775217778192e-06, + "loss": 0.6492, + "step": 3583 + }, + { + "epoch": 1.0491803278688525, + "grad_norm": 0.9913505911827087, + "learning_rate": 4.648578373425754e-06, + "loss": 0.6085, + "step": 3584 + }, + { + "epoch": 1.049473067915691, + "grad_norm": 1.0026942491531372, + "learning_rate": 4.648381478097883e-06, + "loss": 0.6461, + "step": 3585 + }, + { + "epoch": 1.0497658079625292, + "grad_norm": 1.034787654876709, + "learning_rate": 4.648184531799248e-06, + "loss": 0.6371, + "step": 3586 + }, + { + "epoch": 1.0500585480093676, + "grad_norm": 0.9811267256736755, + "learning_rate": 4.647987534534522e-06, + "loss": 0.6318, + "step": 3587 + }, + { + "epoch": 1.050351288056206, + "grad_norm": 0.9995855093002319, + "learning_rate": 4.6477904863083795e-06, + "loss": 0.6136, + "step": 3588 + }, + { + "epoch": 1.0506440281030445, + "grad_norm": 1.0386559963226318, + "learning_rate": 4.647593387125496e-06, + "loss": 0.6036, + "step": 3589 + }, + { + "epoch": 1.050936768149883, + "grad_norm": 1.0008927583694458, + "learning_rate": 4.647396236990544e-06, + "loss": 0.664, + "step": 3590 + }, + { + "epoch": 1.0512295081967213, + "grad_norm": 0.9799753427505493, + "learning_rate": 4.647199035908207e-06, + "loss": 0.6357, + "step": 3591 + }, + { + "epoch": 1.0515222482435598, + "grad_norm": 0.9898558855056763, + "learning_rate": 4.647001783883158e-06, + "loss": 0.6445, + "step": 3592 + }, + { + "epoch": 1.0518149882903982, + "grad_norm": 0.9402797222137451, + "learning_rate": 4.6468044809200805e-06, + "loss": 0.6418, + "step": 3593 + }, + { + "epoch": 1.0521077283372366, + "grad_norm": 0.9761261343955994, + "learning_rate": 4.646607127023653e-06, + "loss": 0.667, + "step": 3594 + }, + { + "epoch": 1.0524004683840749, + "grad_norm": 1.1228201389312744, + "learning_rate": 4.64640972219856e-06, + "loss": 0.6605, + "step": 3595 + }, + { + "epoch": 1.0526932084309133, + "grad_norm": 0.9908759593963623, + "learning_rate": 4.646212266449484e-06, + "loss": 0.6297, + "step": 3596 + }, + { + "epoch": 1.0529859484777517, + "grad_norm": 0.9983891844749451, + "learning_rate": 4.646014759781109e-06, + "loss": 0.6182, + "step": 3597 + }, + { + "epoch": 1.0532786885245902, + "grad_norm": 0.9633698463439941, + "learning_rate": 4.645817202198123e-06, + "loss": 0.6334, + "step": 3598 + }, + { + "epoch": 1.0535714285714286, + "grad_norm": 1.0089192390441895, + "learning_rate": 4.645619593705209e-06, + "loss": 0.6261, + "step": 3599 + }, + { + "epoch": 1.053864168618267, + "grad_norm": 0.9526565670967102, + "learning_rate": 4.64542193430706e-06, + "loss": 0.5969, + "step": 3600 + }, + { + "epoch": 1.0541569086651055, + "grad_norm": 1.3935277462005615, + "learning_rate": 4.645224224008363e-06, + "loss": 0.6507, + "step": 3601 + }, + { + "epoch": 1.0544496487119437, + "grad_norm": 0.9948412179946899, + "learning_rate": 4.645026462813809e-06, + "loss": 0.5813, + "step": 3602 + }, + { + "epoch": 1.0547423887587821, + "grad_norm": 0.9550055265426636, + "learning_rate": 4.64482865072809e-06, + "loss": 0.6349, + "step": 3603 + }, + { + "epoch": 1.0550351288056206, + "grad_norm": 0.9885076284408569, + "learning_rate": 4.644630787755899e-06, + "loss": 0.6183, + "step": 3604 + }, + { + "epoch": 1.055327868852459, + "grad_norm": 1.023621916770935, + "learning_rate": 4.644432873901931e-06, + "loss": 0.6505, + "step": 3605 + }, + { + "epoch": 1.0556206088992974, + "grad_norm": 0.9970288276672363, + "learning_rate": 4.644234909170881e-06, + "loss": 0.6173, + "step": 3606 + }, + { + "epoch": 1.0559133489461359, + "grad_norm": 0.9954100847244263, + "learning_rate": 4.644036893567446e-06, + "loss": 0.6719, + "step": 3607 + }, + { + "epoch": 1.0562060889929743, + "grad_norm": 0.9517325162887573, + "learning_rate": 4.643838827096323e-06, + "loss": 0.6139, + "step": 3608 + }, + { + "epoch": 1.0564988290398127, + "grad_norm": 0.9573807120323181, + "learning_rate": 4.643640709762213e-06, + "loss": 0.6423, + "step": 3609 + }, + { + "epoch": 1.056791569086651, + "grad_norm": 1.019157886505127, + "learning_rate": 4.643442541569815e-06, + "loss": 0.6229, + "step": 3610 + }, + { + "epoch": 1.0570843091334894, + "grad_norm": 0.9777049422264099, + "learning_rate": 4.64324432252383e-06, + "loss": 0.618, + "step": 3611 + }, + { + "epoch": 1.0573770491803278, + "grad_norm": 0.9315643310546875, + "learning_rate": 4.643046052628963e-06, + "loss": 0.6203, + "step": 3612 + }, + { + "epoch": 1.0576697892271663, + "grad_norm": 0.891864001750946, + "learning_rate": 4.642847731889916e-06, + "loss": 0.6095, + "step": 3613 + }, + { + "epoch": 1.0579625292740047, + "grad_norm": 1.0089088678359985, + "learning_rate": 4.6426493603113944e-06, + "loss": 0.6236, + "step": 3614 + }, + { + "epoch": 1.0582552693208431, + "grad_norm": 1.1565223932266235, + "learning_rate": 4.642450937898105e-06, + "loss": 0.6376, + "step": 3615 + }, + { + "epoch": 1.0585480093676816, + "grad_norm": 1.0012537240982056, + "learning_rate": 4.642252464654756e-06, + "loss": 0.6365, + "step": 3616 + }, + { + "epoch": 1.05884074941452, + "grad_norm": 1.0090100765228271, + "learning_rate": 4.642053940586056e-06, + "loss": 0.5633, + "step": 3617 + }, + { + "epoch": 1.0591334894613582, + "grad_norm": 0.9495288729667664, + "learning_rate": 4.641855365696715e-06, + "loss": 0.5998, + "step": 3618 + }, + { + "epoch": 1.0594262295081966, + "grad_norm": 0.9364737868309021, + "learning_rate": 4.6416567399914434e-06, + "loss": 0.6341, + "step": 3619 + }, + { + "epoch": 1.059718969555035, + "grad_norm": 0.9764317870140076, + "learning_rate": 4.641458063474954e-06, + "loss": 0.6543, + "step": 3620 + }, + { + "epoch": 1.0600117096018735, + "grad_norm": 1.0098251104354858, + "learning_rate": 4.641259336151961e-06, + "loss": 0.6075, + "step": 3621 + }, + { + "epoch": 1.060304449648712, + "grad_norm": 1.05937922000885, + "learning_rate": 4.6410605580271785e-06, + "loss": 0.6105, + "step": 3622 + }, + { + "epoch": 1.0605971896955504, + "grad_norm": 0.9525614380836487, + "learning_rate": 4.640861729105324e-06, + "loss": 0.5867, + "step": 3623 + }, + { + "epoch": 1.0608899297423888, + "grad_norm": 0.9882469177246094, + "learning_rate": 4.640662849391112e-06, + "loss": 0.6078, + "step": 3624 + }, + { + "epoch": 1.0611826697892273, + "grad_norm": 0.9170294404029846, + "learning_rate": 4.640463918889264e-06, + "loss": 0.5754, + "step": 3625 + }, + { + "epoch": 1.0614754098360655, + "grad_norm": 0.9771245718002319, + "learning_rate": 4.640264937604497e-06, + "loss": 0.622, + "step": 3626 + }, + { + "epoch": 1.061768149882904, + "grad_norm": 0.9760441184043884, + "learning_rate": 4.6400659055415346e-06, + "loss": 0.6258, + "step": 3627 + }, + { + "epoch": 1.0620608899297423, + "grad_norm": 0.977948009967804, + "learning_rate": 4.639866822705096e-06, + "loss": 0.6562, + "step": 3628 + }, + { + "epoch": 1.0623536299765808, + "grad_norm": 0.9983142018318176, + "learning_rate": 4.639667689099907e-06, + "loss": 0.5951, + "step": 3629 + }, + { + "epoch": 1.0626463700234192, + "grad_norm": 0.9536082744598389, + "learning_rate": 4.639468504730691e-06, + "loss": 0.5757, + "step": 3630 + }, + { + "epoch": 1.0629391100702577, + "grad_norm": 0.9514579772949219, + "learning_rate": 4.6392692696021735e-06, + "loss": 0.6352, + "step": 3631 + }, + { + "epoch": 1.063231850117096, + "grad_norm": 0.9705181121826172, + "learning_rate": 4.639069983719081e-06, + "loss": 0.6193, + "step": 3632 + }, + { + "epoch": 1.0635245901639345, + "grad_norm": 0.9926466941833496, + "learning_rate": 4.638870647086142e-06, + "loss": 0.6509, + "step": 3633 + }, + { + "epoch": 1.0638173302107727, + "grad_norm": 1.0027347803115845, + "learning_rate": 4.6386712597080875e-06, + "loss": 0.6224, + "step": 3634 + }, + { + "epoch": 1.0641100702576112, + "grad_norm": 0.9951688647270203, + "learning_rate": 4.638471821589645e-06, + "loss": 0.6292, + "step": 3635 + }, + { + "epoch": 1.0644028103044496, + "grad_norm": 0.9920988082885742, + "learning_rate": 4.638272332735548e-06, + "loss": 0.6497, + "step": 3636 + }, + { + "epoch": 1.064695550351288, + "grad_norm": 0.9866023659706116, + "learning_rate": 4.63807279315053e-06, + "loss": 0.6212, + "step": 3637 + }, + { + "epoch": 1.0649882903981265, + "grad_norm": 1.0158641338348389, + "learning_rate": 4.637873202839323e-06, + "loss": 0.6055, + "step": 3638 + }, + { + "epoch": 1.065281030444965, + "grad_norm": 0.9524785876274109, + "learning_rate": 4.637673561806665e-06, + "loss": 0.6132, + "step": 3639 + }, + { + "epoch": 1.0655737704918034, + "grad_norm": 0.9733337759971619, + "learning_rate": 4.6374738700572894e-06, + "loss": 0.5715, + "step": 3640 + }, + { + "epoch": 1.0658665105386418, + "grad_norm": 0.9813475608825684, + "learning_rate": 4.637274127595936e-06, + "loss": 0.6016, + "step": 3641 + }, + { + "epoch": 1.06615925058548, + "grad_norm": 0.9645315408706665, + "learning_rate": 4.637074334427344e-06, + "loss": 0.6577, + "step": 3642 + }, + { + "epoch": 1.0664519906323184, + "grad_norm": 1.018913745880127, + "learning_rate": 4.6368744905562525e-06, + "loss": 0.5973, + "step": 3643 + }, + { + "epoch": 1.0667447306791569, + "grad_norm": 0.9577640295028687, + "learning_rate": 4.6366745959874035e-06, + "loss": 0.6526, + "step": 3644 + }, + { + "epoch": 1.0670374707259953, + "grad_norm": 0.9651963114738464, + "learning_rate": 4.6364746507255385e-06, + "loss": 0.6483, + "step": 3645 + }, + { + "epoch": 1.0673302107728337, + "grad_norm": 0.9763790965080261, + "learning_rate": 4.6362746547754025e-06, + "loss": 0.6349, + "step": 3646 + }, + { + "epoch": 1.0676229508196722, + "grad_norm": 0.9601399302482605, + "learning_rate": 4.63607460814174e-06, + "loss": 0.6111, + "step": 3647 + }, + { + "epoch": 1.0679156908665106, + "grad_norm": 0.9831777215003967, + "learning_rate": 4.635874510829296e-06, + "loss": 0.6536, + "step": 3648 + }, + { + "epoch": 1.068208430913349, + "grad_norm": 0.9582194685935974, + "learning_rate": 4.635674362842819e-06, + "loss": 0.6371, + "step": 3649 + }, + { + "epoch": 1.0685011709601873, + "grad_norm": 0.9892329573631287, + "learning_rate": 4.6354741641870586e-06, + "loss": 0.6101, + "step": 3650 + }, + { + "epoch": 1.0687939110070257, + "grad_norm": 1.0097441673278809, + "learning_rate": 4.635273914866763e-06, + "loss": 0.6419, + "step": 3651 + }, + { + "epoch": 1.0690866510538641, + "grad_norm": 0.9796767830848694, + "learning_rate": 4.6350736148866835e-06, + "loss": 0.6257, + "step": 3652 + }, + { + "epoch": 1.0693793911007026, + "grad_norm": 0.936348021030426, + "learning_rate": 4.634873264251572e-06, + "loss": 0.6356, + "step": 3653 + }, + { + "epoch": 1.069672131147541, + "grad_norm": 0.9727880954742432, + "learning_rate": 4.6346728629661816e-06, + "loss": 0.6143, + "step": 3654 + }, + { + "epoch": 1.0699648711943794, + "grad_norm": 0.9954716563224792, + "learning_rate": 4.634472411035269e-06, + "loss": 0.6292, + "step": 3655 + }, + { + "epoch": 1.0702576112412179, + "grad_norm": 1.0170587301254272, + "learning_rate": 4.634271908463587e-06, + "loss": 0.6194, + "step": 3656 + }, + { + "epoch": 1.0705503512880563, + "grad_norm": 0.927987813949585, + "learning_rate": 4.634071355255894e-06, + "loss": 0.5954, + "step": 3657 + }, + { + "epoch": 1.0708430913348945, + "grad_norm": 0.9933330416679382, + "learning_rate": 4.633870751416949e-06, + "loss": 0.6162, + "step": 3658 + }, + { + "epoch": 1.071135831381733, + "grad_norm": 0.9714576601982117, + "learning_rate": 4.63367009695151e-06, + "loss": 0.6102, + "step": 3659 + }, + { + "epoch": 1.0714285714285714, + "grad_norm": 1.028070092201233, + "learning_rate": 4.633469391864338e-06, + "loss": 0.6242, + "step": 3660 + }, + { + "epoch": 1.0717213114754098, + "grad_norm": 0.9480123519897461, + "learning_rate": 4.633268636160195e-06, + "loss": 0.6194, + "step": 3661 + }, + { + "epoch": 1.0720140515222483, + "grad_norm": 0.9314332604408264, + "learning_rate": 4.633067829843844e-06, + "loss": 0.5891, + "step": 3662 + }, + { + "epoch": 1.0723067915690867, + "grad_norm": 1.0663743019104004, + "learning_rate": 4.632866972920047e-06, + "loss": 0.6083, + "step": 3663 + }, + { + "epoch": 1.0725995316159251, + "grad_norm": 0.9508763551712036, + "learning_rate": 4.632666065393574e-06, + "loss": 0.6338, + "step": 3664 + }, + { + "epoch": 1.0728922716627634, + "grad_norm": 0.9373596906661987, + "learning_rate": 4.632465107269187e-06, + "loss": 0.6046, + "step": 3665 + }, + { + "epoch": 1.0731850117096018, + "grad_norm": 1.0033024549484253, + "learning_rate": 4.632264098551656e-06, + "loss": 0.5984, + "step": 3666 + }, + { + "epoch": 1.0734777517564402, + "grad_norm": 1.0221749544143677, + "learning_rate": 4.63206303924575e-06, + "loss": 0.651, + "step": 3667 + }, + { + "epoch": 1.0737704918032787, + "grad_norm": 0.9671105146408081, + "learning_rate": 4.631861929356239e-06, + "loss": 0.6496, + "step": 3668 + }, + { + "epoch": 1.074063231850117, + "grad_norm": 0.9746518135070801, + "learning_rate": 4.631660768887893e-06, + "loss": 0.5867, + "step": 3669 + }, + { + "epoch": 1.0743559718969555, + "grad_norm": 1.0058128833770752, + "learning_rate": 4.6314595578454865e-06, + "loss": 0.6111, + "step": 3670 + }, + { + "epoch": 1.074648711943794, + "grad_norm": 0.915306031703949, + "learning_rate": 4.631258296233793e-06, + "loss": 0.6243, + "step": 3671 + }, + { + "epoch": 1.0749414519906324, + "grad_norm": 1.025342583656311, + "learning_rate": 4.631056984057587e-06, + "loss": 0.6301, + "step": 3672 + }, + { + "epoch": 1.0752341920374708, + "grad_norm": 0.9710121154785156, + "learning_rate": 4.630855621321644e-06, + "loss": 0.6237, + "step": 3673 + }, + { + "epoch": 1.075526932084309, + "grad_norm": 0.9877296090126038, + "learning_rate": 4.630654208030742e-06, + "loss": 0.603, + "step": 3674 + }, + { + "epoch": 1.0758196721311475, + "grad_norm": 0.9753234386444092, + "learning_rate": 4.63045274418966e-06, + "loss": 0.6303, + "step": 3675 + }, + { + "epoch": 1.076112412177986, + "grad_norm": 0.9560112953186035, + "learning_rate": 4.6302512298031765e-06, + "loss": 0.6284, + "step": 3676 + }, + { + "epoch": 1.0764051522248244, + "grad_norm": 0.9783791899681091, + "learning_rate": 4.630049664876074e-06, + "loss": 0.6166, + "step": 3677 + }, + { + "epoch": 1.0766978922716628, + "grad_norm": 1.0312237739562988, + "learning_rate": 4.629848049413134e-06, + "loss": 0.62, + "step": 3678 + }, + { + "epoch": 1.0769906323185012, + "grad_norm": 0.9757225513458252, + "learning_rate": 4.629646383419139e-06, + "loss": 0.6286, + "step": 3679 + }, + { + "epoch": 1.0772833723653397, + "grad_norm": 0.996088981628418, + "learning_rate": 4.629444666898875e-06, + "loss": 0.5633, + "step": 3680 + }, + { + "epoch": 1.0775761124121779, + "grad_norm": 0.9898551106452942, + "learning_rate": 4.629242899857127e-06, + "loss": 0.634, + "step": 3681 + }, + { + "epoch": 1.0778688524590163, + "grad_norm": 0.9363374710083008, + "learning_rate": 4.629041082298683e-06, + "loss": 0.5709, + "step": 3682 + }, + { + "epoch": 1.0781615925058547, + "grad_norm": 0.9942444562911987, + "learning_rate": 4.6288392142283285e-06, + "loss": 0.6492, + "step": 3683 + }, + { + "epoch": 1.0784543325526932, + "grad_norm": 0.9601694345474243, + "learning_rate": 4.628637295650856e-06, + "loss": 0.6083, + "step": 3684 + }, + { + "epoch": 1.0787470725995316, + "grad_norm": 0.9769856929779053, + "learning_rate": 4.628435326571054e-06, + "loss": 0.6002, + "step": 3685 + }, + { + "epoch": 1.07903981264637, + "grad_norm": 0.9600805044174194, + "learning_rate": 4.6282333069937165e-06, + "loss": 0.5854, + "step": 3686 + }, + { + "epoch": 1.0793325526932085, + "grad_norm": 0.9408785700798035, + "learning_rate": 4.628031236923633e-06, + "loss": 0.6607, + "step": 3687 + }, + { + "epoch": 1.079625292740047, + "grad_norm": 0.9607127904891968, + "learning_rate": 4.627829116365601e-06, + "loss": 0.6122, + "step": 3688 + }, + { + "epoch": 1.0799180327868854, + "grad_norm": 0.9974823594093323, + "learning_rate": 4.627626945324414e-06, + "loss": 0.6408, + "step": 3689 + }, + { + "epoch": 1.0802107728337236, + "grad_norm": 0.9751729965209961, + "learning_rate": 4.6274247238048684e-06, + "loss": 0.6071, + "step": 3690 + }, + { + "epoch": 1.080503512880562, + "grad_norm": 0.9718919396400452, + "learning_rate": 4.627222451811763e-06, + "loss": 0.6292, + "step": 3691 + }, + { + "epoch": 1.0807962529274004, + "grad_norm": 1.0165643692016602, + "learning_rate": 4.627020129349896e-06, + "loss": 0.6525, + "step": 3692 + }, + { + "epoch": 1.0810889929742389, + "grad_norm": 1.0028557777404785, + "learning_rate": 4.626817756424068e-06, + "loss": 0.6402, + "step": 3693 + }, + { + "epoch": 1.0813817330210773, + "grad_norm": 1.0095573663711548, + "learning_rate": 4.626615333039081e-06, + "loss": 0.6226, + "step": 3694 + }, + { + "epoch": 1.0816744730679158, + "grad_norm": 0.9842044115066528, + "learning_rate": 4.626412859199735e-06, + "loss": 0.6434, + "step": 3695 + }, + { + "epoch": 1.0819672131147542, + "grad_norm": 0.991215169429779, + "learning_rate": 4.626210334910836e-06, + "loss": 0.6368, + "step": 3696 + }, + { + "epoch": 1.0822599531615924, + "grad_norm": 1.065658450126648, + "learning_rate": 4.626007760177188e-06, + "loss": 0.6653, + "step": 3697 + }, + { + "epoch": 1.0825526932084308, + "grad_norm": 1.0165789127349854, + "learning_rate": 4.6258051350035985e-06, + "loss": 0.6022, + "step": 3698 + }, + { + "epoch": 1.0828454332552693, + "grad_norm": 0.9744022488594055, + "learning_rate": 4.625602459394873e-06, + "loss": 0.6357, + "step": 3699 + }, + { + "epoch": 1.0831381733021077, + "grad_norm": 1.0383517742156982, + "learning_rate": 4.625399733355821e-06, + "loss": 0.6408, + "step": 3700 + }, + { + "epoch": 1.0834309133489461, + "grad_norm": 0.9699131846427917, + "learning_rate": 4.625196956891251e-06, + "loss": 0.6002, + "step": 3701 + }, + { + "epoch": 1.0837236533957846, + "grad_norm": 1.9634267091751099, + "learning_rate": 4.624994130005976e-06, + "loss": 0.6428, + "step": 3702 + }, + { + "epoch": 1.084016393442623, + "grad_norm": 0.9975478053092957, + "learning_rate": 4.624791252704807e-06, + "loss": 0.5949, + "step": 3703 + }, + { + "epoch": 1.0843091334894615, + "grad_norm": 1.0228310823440552, + "learning_rate": 4.624588324992557e-06, + "loss": 0.6565, + "step": 3704 + }, + { + "epoch": 1.0846018735362997, + "grad_norm": 0.9990718364715576, + "learning_rate": 4.624385346874041e-06, + "loss": 0.6699, + "step": 3705 + }, + { + "epoch": 1.084894613583138, + "grad_norm": 0.9815998673439026, + "learning_rate": 4.624182318354074e-06, + "loss": 0.6499, + "step": 3706 + }, + { + "epoch": 1.0851873536299765, + "grad_norm": 0.9376361966133118, + "learning_rate": 4.623979239437474e-06, + "loss": 0.6046, + "step": 3707 + }, + { + "epoch": 1.085480093676815, + "grad_norm": 0.9311253428459167, + "learning_rate": 4.623776110129058e-06, + "loss": 0.6495, + "step": 3708 + }, + { + "epoch": 1.0857728337236534, + "grad_norm": 0.9880542755126953, + "learning_rate": 4.623572930433646e-06, + "loss": 0.649, + "step": 3709 + }, + { + "epoch": 1.0860655737704918, + "grad_norm": 1.057134985923767, + "learning_rate": 4.623369700356058e-06, + "loss": 0.62, + "step": 3710 + }, + { + "epoch": 1.0863583138173303, + "grad_norm": 0.9916936159133911, + "learning_rate": 4.623166419901116e-06, + "loss": 0.653, + "step": 3711 + }, + { + "epoch": 1.0866510538641687, + "grad_norm": 0.9830076694488525, + "learning_rate": 4.6229630890736435e-06, + "loss": 0.6481, + "step": 3712 + }, + { + "epoch": 1.086943793911007, + "grad_norm": 0.9793269634246826, + "learning_rate": 4.6227597078784635e-06, + "loss": 0.6354, + "step": 3713 + }, + { + "epoch": 1.0872365339578454, + "grad_norm": 0.9410616159439087, + "learning_rate": 4.622556276320401e-06, + "loss": 0.5961, + "step": 3714 + }, + { + "epoch": 1.0875292740046838, + "grad_norm": 0.9778451919555664, + "learning_rate": 4.622352794404283e-06, + "loss": 0.6226, + "step": 3715 + }, + { + "epoch": 1.0878220140515222, + "grad_norm": 0.9570394158363342, + "learning_rate": 4.622149262134939e-06, + "loss": 0.6416, + "step": 3716 + }, + { + "epoch": 1.0881147540983607, + "grad_norm": 0.9878842830657959, + "learning_rate": 4.621945679517194e-06, + "loss": 0.5962, + "step": 3717 + }, + { + "epoch": 1.088407494145199, + "grad_norm": 0.9686046838760376, + "learning_rate": 4.62174204655588e-06, + "loss": 0.6419, + "step": 3718 + }, + { + "epoch": 1.0887002341920375, + "grad_norm": 0.997977614402771, + "learning_rate": 4.6215383632558294e-06, + "loss": 0.6475, + "step": 3719 + }, + { + "epoch": 1.088992974238876, + "grad_norm": 0.9438425302505493, + "learning_rate": 4.621334629621873e-06, + "loss": 0.6409, + "step": 3720 + }, + { + "epoch": 1.0892857142857142, + "grad_norm": 0.9445755481719971, + "learning_rate": 4.621130845658846e-06, + "loss": 0.6664, + "step": 3721 + }, + { + "epoch": 1.0895784543325526, + "grad_norm": 0.9884910583496094, + "learning_rate": 4.6209270113715806e-06, + "loss": 0.6408, + "step": 3722 + }, + { + "epoch": 1.089871194379391, + "grad_norm": 0.9922361373901367, + "learning_rate": 4.6207231267649155e-06, + "loss": 0.6122, + "step": 3723 + }, + { + "epoch": 1.0901639344262295, + "grad_norm": 0.9444513916969299, + "learning_rate": 4.620519191843686e-06, + "loss": 0.6042, + "step": 3724 + }, + { + "epoch": 1.090456674473068, + "grad_norm": 0.9302930235862732, + "learning_rate": 4.6203152066127326e-06, + "loss": 0.6426, + "step": 3725 + }, + { + "epoch": 1.0907494145199064, + "grad_norm": 0.9809553623199463, + "learning_rate": 4.620111171076893e-06, + "loss": 0.6817, + "step": 3726 + }, + { + "epoch": 1.0910421545667448, + "grad_norm": 0.9996722936630249, + "learning_rate": 4.619907085241008e-06, + "loss": 0.6165, + "step": 3727 + }, + { + "epoch": 1.0913348946135832, + "grad_norm": 0.9456072449684143, + "learning_rate": 4.61970294910992e-06, + "loss": 0.6398, + "step": 3728 + }, + { + "epoch": 1.0916276346604215, + "grad_norm": 0.9711762070655823, + "learning_rate": 4.6194987626884726e-06, + "loss": 0.651, + "step": 3729 + }, + { + "epoch": 1.0919203747072599, + "grad_norm": 0.9726723432540894, + "learning_rate": 4.61929452598151e-06, + "loss": 0.656, + "step": 3730 + }, + { + "epoch": 1.0922131147540983, + "grad_norm": 0.9732535481452942, + "learning_rate": 4.619090238993877e-06, + "loss": 0.5983, + "step": 3731 + }, + { + "epoch": 1.0925058548009368, + "grad_norm": 0.9166037440299988, + "learning_rate": 4.618885901730422e-06, + "loss": 0.6145, + "step": 3732 + }, + { + "epoch": 1.0927985948477752, + "grad_norm": 0.9630424380302429, + "learning_rate": 4.6186815141959916e-06, + "loss": 0.611, + "step": 3733 + }, + { + "epoch": 1.0930913348946136, + "grad_norm": 0.9843548536300659, + "learning_rate": 4.618477076395434e-06, + "loss": 0.6029, + "step": 3734 + }, + { + "epoch": 1.093384074941452, + "grad_norm": 0.9833624958992004, + "learning_rate": 4.618272588333602e-06, + "loss": 0.6603, + "step": 3735 + }, + { + "epoch": 1.0936768149882905, + "grad_norm": 0.9177285432815552, + "learning_rate": 4.618068050015346e-06, + "loss": 0.6052, + "step": 3736 + }, + { + "epoch": 1.0939695550351287, + "grad_norm": 1.0325678586959839, + "learning_rate": 4.617863461445518e-06, + "loss": 0.6098, + "step": 3737 + }, + { + "epoch": 1.0942622950819672, + "grad_norm": 0.976641833782196, + "learning_rate": 4.617658822628973e-06, + "loss": 0.6543, + "step": 3738 + }, + { + "epoch": 1.0945550351288056, + "grad_norm": 0.9717233180999756, + "learning_rate": 4.6174541335705646e-06, + "loss": 0.6311, + "step": 3739 + }, + { + "epoch": 1.094847775175644, + "grad_norm": 0.9587548971176147, + "learning_rate": 4.617249394275151e-06, + "loss": 0.5898, + "step": 3740 + }, + { + "epoch": 1.0951405152224825, + "grad_norm": 0.9837205410003662, + "learning_rate": 4.617044604747588e-06, + "loss": 0.6418, + "step": 3741 + }, + { + "epoch": 1.095433255269321, + "grad_norm": 1.0560575723648071, + "learning_rate": 4.616839764992736e-06, + "loss": 0.6097, + "step": 3742 + }, + { + "epoch": 1.0957259953161593, + "grad_norm": 1.0646638870239258, + "learning_rate": 4.616634875015454e-06, + "loss": 0.6486, + "step": 3743 + }, + { + "epoch": 1.0960187353629975, + "grad_norm": 0.9863326549530029, + "learning_rate": 4.616429934820602e-06, + "loss": 0.612, + "step": 3744 + }, + { + "epoch": 1.096311475409836, + "grad_norm": 0.9639100432395935, + "learning_rate": 4.6162249444130436e-06, + "loss": 0.6265, + "step": 3745 + }, + { + "epoch": 1.0966042154566744, + "grad_norm": 0.9807822704315186, + "learning_rate": 4.616019903797642e-06, + "loss": 0.6513, + "step": 3746 + }, + { + "epoch": 1.0968969555035128, + "grad_norm": 0.9762060642242432, + "learning_rate": 4.6158148129792614e-06, + "loss": 0.6309, + "step": 3747 + }, + { + "epoch": 1.0971896955503513, + "grad_norm": 0.9592571258544922, + "learning_rate": 4.615609671962768e-06, + "loss": 0.5957, + "step": 3748 + }, + { + "epoch": 1.0974824355971897, + "grad_norm": 0.9871405363082886, + "learning_rate": 4.615404480753028e-06, + "loss": 0.6452, + "step": 3749 + }, + { + "epoch": 1.0977751756440282, + "grad_norm": 1.122552514076233, + "learning_rate": 4.615199239354911e-06, + "loss": 0.5918, + "step": 3750 + }, + { + "epoch": 1.0980679156908666, + "grad_norm": 0.9488926529884338, + "learning_rate": 4.614993947773285e-06, + "loss": 0.6065, + "step": 3751 + }, + { + "epoch": 1.098360655737705, + "grad_norm": 1.015687346458435, + "learning_rate": 4.614788606013022e-06, + "loss": 0.6609, + "step": 3752 + }, + { + "epoch": 1.0986533957845432, + "grad_norm": 0.9463213086128235, + "learning_rate": 4.614583214078993e-06, + "loss": 0.6183, + "step": 3753 + }, + { + "epoch": 1.0989461358313817, + "grad_norm": 0.8907645344734192, + "learning_rate": 4.614377771976071e-06, + "loss": 0.5881, + "step": 3754 + }, + { + "epoch": 1.0992388758782201, + "grad_norm": 1.033030390739441, + "learning_rate": 4.614172279709129e-06, + "loss": 0.6347, + "step": 3755 + }, + { + "epoch": 1.0995316159250585, + "grad_norm": 1.1359243392944336, + "learning_rate": 4.613966737283045e-06, + "loss": 0.6756, + "step": 3756 + }, + { + "epoch": 1.099824355971897, + "grad_norm": 0.9763171672821045, + "learning_rate": 4.613761144702693e-06, + "loss": 0.6213, + "step": 3757 + }, + { + "epoch": 1.1001170960187354, + "grad_norm": 0.9832759499549866, + "learning_rate": 4.6135555019729525e-06, + "loss": 0.617, + "step": 3758 + }, + { + "epoch": 1.1004098360655739, + "grad_norm": 1.0300967693328857, + "learning_rate": 4.6133498090987015e-06, + "loss": 0.5995, + "step": 3759 + }, + { + "epoch": 1.100702576112412, + "grad_norm": 0.9871177077293396, + "learning_rate": 4.61314406608482e-06, + "loss": 0.6296, + "step": 3760 + }, + { + "epoch": 1.1009953161592505, + "grad_norm": 1.000026822090149, + "learning_rate": 4.612938272936189e-06, + "loss": 0.6544, + "step": 3761 + }, + { + "epoch": 1.101288056206089, + "grad_norm": 0.9762188196182251, + "learning_rate": 4.612732429657693e-06, + "loss": 0.6397, + "step": 3762 + }, + { + "epoch": 1.1015807962529274, + "grad_norm": 0.9366106390953064, + "learning_rate": 4.612526536254212e-06, + "loss": 0.6076, + "step": 3763 + }, + { + "epoch": 1.1018735362997658, + "grad_norm": 0.9892871379852295, + "learning_rate": 4.612320592730635e-06, + "loss": 0.6352, + "step": 3764 + }, + { + "epoch": 1.1021662763466042, + "grad_norm": 0.9411518573760986, + "learning_rate": 4.612114599091846e-06, + "loss": 0.6237, + "step": 3765 + }, + { + "epoch": 1.1024590163934427, + "grad_norm": 0.9574638605117798, + "learning_rate": 4.611908555342732e-06, + "loss": 0.5847, + "step": 3766 + }, + { + "epoch": 1.1027517564402811, + "grad_norm": 0.9616842269897461, + "learning_rate": 4.611702461488181e-06, + "loss": 0.6348, + "step": 3767 + }, + { + "epoch": 1.1030444964871196, + "grad_norm": 0.989943265914917, + "learning_rate": 4.611496317533084e-06, + "loss": 0.6613, + "step": 3768 + }, + { + "epoch": 1.1033372365339578, + "grad_norm": 0.9428748488426208, + "learning_rate": 4.611290123482331e-06, + "loss": 0.5961, + "step": 3769 + }, + { + "epoch": 1.1036299765807962, + "grad_norm": 0.946242094039917, + "learning_rate": 4.611083879340815e-06, + "loss": 0.6244, + "step": 3770 + }, + { + "epoch": 1.1039227166276346, + "grad_norm": 1.003009557723999, + "learning_rate": 4.610877585113428e-06, + "loss": 0.6879, + "step": 3771 + }, + { + "epoch": 1.104215456674473, + "grad_norm": 0.9323304891586304, + "learning_rate": 4.610671240805065e-06, + "loss": 0.5865, + "step": 3772 + }, + { + "epoch": 1.1045081967213115, + "grad_norm": 0.9923133850097656, + "learning_rate": 4.61046484642062e-06, + "loss": 0.6403, + "step": 3773 + }, + { + "epoch": 1.10480093676815, + "grad_norm": 0.9739687442779541, + "learning_rate": 4.610258401964992e-06, + "loss": 0.6775, + "step": 3774 + }, + { + "epoch": 1.1050936768149884, + "grad_norm": 0.9896920323371887, + "learning_rate": 4.610051907443078e-06, + "loss": 0.6405, + "step": 3775 + }, + { + "epoch": 1.1053864168618266, + "grad_norm": 1.015183687210083, + "learning_rate": 4.609845362859777e-06, + "loss": 0.6329, + "step": 3776 + }, + { + "epoch": 1.105679156908665, + "grad_norm": 0.9785884618759155, + "learning_rate": 4.60963876821999e-06, + "loss": 0.636, + "step": 3777 + }, + { + "epoch": 1.1059718969555035, + "grad_norm": 0.9792786240577698, + "learning_rate": 4.609432123528618e-06, + "loss": 0.5828, + "step": 3778 + }, + { + "epoch": 1.106264637002342, + "grad_norm": 1.0053975582122803, + "learning_rate": 4.609225428790562e-06, + "loss": 0.5929, + "step": 3779 + }, + { + "epoch": 1.1065573770491803, + "grad_norm": 0.9433420896530151, + "learning_rate": 4.609018684010728e-06, + "loss": 0.6009, + "step": 3780 + }, + { + "epoch": 1.1068501170960188, + "grad_norm": 0.9658641815185547, + "learning_rate": 4.6088118891940214e-06, + "loss": 0.6029, + "step": 3781 + }, + { + "epoch": 1.1071428571428572, + "grad_norm": 1.043588638305664, + "learning_rate": 4.6086050443453475e-06, + "loss": 0.6137, + "step": 3782 + }, + { + "epoch": 1.1074355971896956, + "grad_norm": 1.023732304573059, + "learning_rate": 4.6083981494696134e-06, + "loss": 0.7094, + "step": 3783 + }, + { + "epoch": 1.1077283372365339, + "grad_norm": 0.9884748458862305, + "learning_rate": 4.608191204571728e-06, + "loss": 0.6092, + "step": 3784 + }, + { + "epoch": 1.1080210772833723, + "grad_norm": 1.0306814908981323, + "learning_rate": 4.607984209656602e-06, + "loss": 0.6069, + "step": 3785 + }, + { + "epoch": 1.1083138173302107, + "grad_norm": 0.983569860458374, + "learning_rate": 4.6077771647291445e-06, + "loss": 0.6377, + "step": 3786 + }, + { + "epoch": 1.1086065573770492, + "grad_norm": 0.9147101044654846, + "learning_rate": 4.607570069794269e-06, + "loss": 0.63, + "step": 3787 + }, + { + "epoch": 1.1088992974238876, + "grad_norm": 0.987065851688385, + "learning_rate": 4.60736292485689e-06, + "loss": 0.6246, + "step": 3788 + }, + { + "epoch": 1.109192037470726, + "grad_norm": 0.9963900446891785, + "learning_rate": 4.6071557299219195e-06, + "loss": 0.5889, + "step": 3789 + }, + { + "epoch": 1.1094847775175645, + "grad_norm": 0.9876672625541687, + "learning_rate": 4.606948484994274e-06, + "loss": 0.6305, + "step": 3790 + }, + { + "epoch": 1.109777517564403, + "grad_norm": 1.0093497037887573, + "learning_rate": 4.606741190078873e-06, + "loss": 0.615, + "step": 3791 + }, + { + "epoch": 1.1100702576112411, + "grad_norm": 1.0456680059432983, + "learning_rate": 4.60653384518063e-06, + "loss": 0.6566, + "step": 3792 + }, + { + "epoch": 1.1103629976580796, + "grad_norm": 1.0825303792953491, + "learning_rate": 4.606326450304469e-06, + "loss": 0.6744, + "step": 3793 + }, + { + "epoch": 1.110655737704918, + "grad_norm": 0.9784829020500183, + "learning_rate": 4.606119005455307e-06, + "loss": 0.6469, + "step": 3794 + }, + { + "epoch": 1.1109484777517564, + "grad_norm": 0.998207688331604, + "learning_rate": 4.605911510638067e-06, + "loss": 0.5958, + "step": 3795 + }, + { + "epoch": 1.1112412177985949, + "grad_norm": 1.0188827514648438, + "learning_rate": 4.605703965857673e-06, + "loss": 0.5792, + "step": 3796 + }, + { + "epoch": 1.1115339578454333, + "grad_norm": 0.9894059896469116, + "learning_rate": 4.605496371119047e-06, + "loss": 0.6187, + "step": 3797 + }, + { + "epoch": 1.1118266978922717, + "grad_norm": 1.0123966932296753, + "learning_rate": 4.605288726427115e-06, + "loss": 0.6333, + "step": 3798 + }, + { + "epoch": 1.1121194379391102, + "grad_norm": 0.9428403377532959, + "learning_rate": 4.6050810317868035e-06, + "loss": 0.6046, + "step": 3799 + }, + { + "epoch": 1.1124121779859484, + "grad_norm": 0.9778447151184082, + "learning_rate": 4.60487328720304e-06, + "loss": 0.6111, + "step": 3800 + }, + { + "epoch": 1.1127049180327868, + "grad_norm": 0.9123297333717346, + "learning_rate": 4.6046654926807535e-06, + "loss": 0.5824, + "step": 3801 + }, + { + "epoch": 1.1129976580796253, + "grad_norm": 0.9798107147216797, + "learning_rate": 4.604457648224874e-06, + "loss": 0.6233, + "step": 3802 + }, + { + "epoch": 1.1132903981264637, + "grad_norm": 1.0234464406967163, + "learning_rate": 4.6042497538403315e-06, + "loss": 0.6438, + "step": 3803 + }, + { + "epoch": 1.1135831381733021, + "grad_norm": 0.9556613564491272, + "learning_rate": 4.604041809532061e-06, + "loss": 0.65, + "step": 3804 + }, + { + "epoch": 1.1138758782201406, + "grad_norm": 0.968713641166687, + "learning_rate": 4.603833815304993e-06, + "loss": 0.6023, + "step": 3805 + }, + { + "epoch": 1.114168618266979, + "grad_norm": 0.9614652395248413, + "learning_rate": 4.603625771164064e-06, + "loss": 0.6932, + "step": 3806 + }, + { + "epoch": 1.1144613583138174, + "grad_norm": 0.9661239981651306, + "learning_rate": 4.60341767711421e-06, + "loss": 0.6277, + "step": 3807 + }, + { + "epoch": 1.1147540983606556, + "grad_norm": 0.9499217867851257, + "learning_rate": 4.603209533160366e-06, + "loss": 0.6693, + "step": 3808 + }, + { + "epoch": 1.115046838407494, + "grad_norm": 0.9818447828292847, + "learning_rate": 4.6030013393074735e-06, + "loss": 0.6431, + "step": 3809 + }, + { + "epoch": 1.1153395784543325, + "grad_norm": 0.9582764506340027, + "learning_rate": 4.602793095560469e-06, + "loss": 0.6071, + "step": 3810 + }, + { + "epoch": 1.115632318501171, + "grad_norm": 1.0088990926742554, + "learning_rate": 4.602584801924295e-06, + "loss": 0.6302, + "step": 3811 + }, + { + "epoch": 1.1159250585480094, + "grad_norm": 0.9497069716453552, + "learning_rate": 4.6023764584038925e-06, + "loss": 0.5847, + "step": 3812 + }, + { + "epoch": 1.1162177985948478, + "grad_norm": 0.9650553464889526, + "learning_rate": 4.602168065004204e-06, + "loss": 0.5978, + "step": 3813 + }, + { + "epoch": 1.1165105386416863, + "grad_norm": 0.9761606454849243, + "learning_rate": 4.6019596217301755e-06, + "loss": 0.6036, + "step": 3814 + }, + { + "epoch": 1.1168032786885247, + "grad_norm": 1.0079599618911743, + "learning_rate": 4.60175112858675e-06, + "loss": 0.5936, + "step": 3815 + }, + { + "epoch": 1.117096018735363, + "grad_norm": 0.9529397487640381, + "learning_rate": 4.6015425855788756e-06, + "loss": 0.6137, + "step": 3816 + }, + { + "epoch": 1.1173887587822013, + "grad_norm": 0.9475335478782654, + "learning_rate": 4.6013339927115e-06, + "loss": 0.6732, + "step": 3817 + }, + { + "epoch": 1.1176814988290398, + "grad_norm": 1.011076807975769, + "learning_rate": 4.601125349989571e-06, + "loss": 0.6445, + "step": 3818 + }, + { + "epoch": 1.1179742388758782, + "grad_norm": 1.0244370698928833, + "learning_rate": 4.60091665741804e-06, + "loss": 0.6222, + "step": 3819 + }, + { + "epoch": 1.1182669789227166, + "grad_norm": 0.9445985555648804, + "learning_rate": 4.6007079150018566e-06, + "loss": 0.6159, + "step": 3820 + }, + { + "epoch": 1.118559718969555, + "grad_norm": 0.9532619118690491, + "learning_rate": 4.600499122745975e-06, + "loss": 0.5797, + "step": 3821 + }, + { + "epoch": 1.1188524590163935, + "grad_norm": 0.949085533618927, + "learning_rate": 4.600290280655349e-06, + "loss": 0.5525, + "step": 3822 + }, + { + "epoch": 1.1191451990632317, + "grad_norm": 0.942179799079895, + "learning_rate": 4.6000813887349316e-06, + "loss": 0.6046, + "step": 3823 + }, + { + "epoch": 1.1194379391100702, + "grad_norm": 0.948894739151001, + "learning_rate": 4.59987244698968e-06, + "loss": 0.5604, + "step": 3824 + }, + { + "epoch": 1.1197306791569086, + "grad_norm": 1.0277159214019775, + "learning_rate": 4.599663455424551e-06, + "loss": 0.6103, + "step": 3825 + }, + { + "epoch": 1.120023419203747, + "grad_norm": 1.0250256061553955, + "learning_rate": 4.599454414044504e-06, + "loss": 0.6339, + "step": 3826 + }, + { + "epoch": 1.1203161592505855, + "grad_norm": 0.9588828086853027, + "learning_rate": 4.599245322854497e-06, + "loss": 0.6205, + "step": 3827 + }, + { + "epoch": 1.120608899297424, + "grad_norm": 0.9758357405662537, + "learning_rate": 4.5990361818594905e-06, + "loss": 0.637, + "step": 3828 + }, + { + "epoch": 1.1209016393442623, + "grad_norm": 1.009634256362915, + "learning_rate": 4.598826991064448e-06, + "loss": 0.6274, + "step": 3829 + }, + { + "epoch": 1.1211943793911008, + "grad_norm": 0.9956340193748474, + "learning_rate": 4.598617750474332e-06, + "loss": 0.5991, + "step": 3830 + }, + { + "epoch": 1.1214871194379392, + "grad_norm": 0.9886682629585266, + "learning_rate": 4.598408460094107e-06, + "loss": 0.6515, + "step": 3831 + }, + { + "epoch": 1.1217798594847774, + "grad_norm": 1.0117814540863037, + "learning_rate": 4.598199119928738e-06, + "loss": 0.6093, + "step": 3832 + }, + { + "epoch": 1.1220725995316159, + "grad_norm": 0.930108904838562, + "learning_rate": 4.597989729983192e-06, + "loss": 0.606, + "step": 3833 + }, + { + "epoch": 1.1223653395784543, + "grad_norm": 0.9589305520057678, + "learning_rate": 4.597780290262436e-06, + "loss": 0.5935, + "step": 3834 + }, + { + "epoch": 1.1226580796252927, + "grad_norm": 0.982227623462677, + "learning_rate": 4.59757080077144e-06, + "loss": 0.6773, + "step": 3835 + }, + { + "epoch": 1.1229508196721312, + "grad_norm": 1.0214983224868774, + "learning_rate": 4.597361261515173e-06, + "loss": 0.6041, + "step": 3836 + }, + { + "epoch": 1.1232435597189696, + "grad_norm": 1.0058863162994385, + "learning_rate": 4.597151672498608e-06, + "loss": 0.6422, + "step": 3837 + }, + { + "epoch": 1.123536299765808, + "grad_norm": 0.9848122000694275, + "learning_rate": 4.5969420337267165e-06, + "loss": 0.6274, + "step": 3838 + }, + { + "epoch": 1.1238290398126463, + "grad_norm": 0.9939152002334595, + "learning_rate": 4.596732345204472e-06, + "loss": 0.6341, + "step": 3839 + }, + { + "epoch": 1.1241217798594847, + "grad_norm": 0.9924494624137878, + "learning_rate": 4.596522606936851e-06, + "loss": 0.6046, + "step": 3840 + }, + { + "epoch": 1.1244145199063231, + "grad_norm": 1.0381956100463867, + "learning_rate": 4.596312818928828e-06, + "loss": 0.6566, + "step": 3841 + }, + { + "epoch": 1.1247072599531616, + "grad_norm": 1.0262798070907593, + "learning_rate": 4.596102981185379e-06, + "loss": 0.6363, + "step": 3842 + }, + { + "epoch": 1.125, + "grad_norm": 0.9834688305854797, + "learning_rate": 4.595893093711485e-06, + "loss": 0.6119, + "step": 3843 + }, + { + "epoch": 1.1252927400468384, + "grad_norm": 0.9490705132484436, + "learning_rate": 4.595683156512125e-06, + "loss": 0.6136, + "step": 3844 + }, + { + "epoch": 1.1255854800936769, + "grad_norm": 0.9509444832801819, + "learning_rate": 4.595473169592279e-06, + "loss": 0.6135, + "step": 3845 + }, + { + "epoch": 1.1258782201405153, + "grad_norm": 0.9369563460350037, + "learning_rate": 4.59526313295693e-06, + "loss": 0.5768, + "step": 3846 + }, + { + "epoch": 1.1261709601873537, + "grad_norm": 1.0105741024017334, + "learning_rate": 4.59505304661106e-06, + "loss": 0.6222, + "step": 3847 + }, + { + "epoch": 1.126463700234192, + "grad_norm": 1.024318814277649, + "learning_rate": 4.594842910559655e-06, + "loss": 0.6375, + "step": 3848 + }, + { + "epoch": 1.1267564402810304, + "grad_norm": 0.9791163802146912, + "learning_rate": 4.594632724807699e-06, + "loss": 0.6361, + "step": 3849 + }, + { + "epoch": 1.1270491803278688, + "grad_norm": 1.2677937746047974, + "learning_rate": 4.594422489360179e-06, + "loss": 0.6614, + "step": 3850 + }, + { + "epoch": 1.1273419203747073, + "grad_norm": 0.9354791045188904, + "learning_rate": 4.5942122042220825e-06, + "loss": 0.6293, + "step": 3851 + }, + { + "epoch": 1.1276346604215457, + "grad_norm": 1.019478678703308, + "learning_rate": 4.5940018693984e-06, + "loss": 0.6382, + "step": 3852 + }, + { + "epoch": 1.1279274004683841, + "grad_norm": 0.949056088924408, + "learning_rate": 4.59379148489412e-06, + "loss": 0.6304, + "step": 3853 + }, + { + "epoch": 1.1282201405152226, + "grad_norm": 0.9525336027145386, + "learning_rate": 4.593581050714236e-06, + "loss": 0.6378, + "step": 3854 + }, + { + "epoch": 1.1285128805620608, + "grad_norm": 0.9595624804496765, + "learning_rate": 4.593370566863738e-06, + "loss": 0.6359, + "step": 3855 + }, + { + "epoch": 1.1288056206088992, + "grad_norm": 0.9289277195930481, + "learning_rate": 4.593160033347622e-06, + "loss": 0.6144, + "step": 3856 + }, + { + "epoch": 1.1290983606557377, + "grad_norm": 0.9536124467849731, + "learning_rate": 4.592949450170881e-06, + "loss": 0.6261, + "step": 3857 + }, + { + "epoch": 1.129391100702576, + "grad_norm": 1.0071780681610107, + "learning_rate": 4.592738817338514e-06, + "loss": 0.5931, + "step": 3858 + }, + { + "epoch": 1.1296838407494145, + "grad_norm": 1.0019582509994507, + "learning_rate": 4.592528134855515e-06, + "loss": 0.6618, + "step": 3859 + }, + { + "epoch": 1.129976580796253, + "grad_norm": 0.9846954345703125, + "learning_rate": 4.592317402726885e-06, + "loss": 0.6155, + "step": 3860 + }, + { + "epoch": 1.1302693208430914, + "grad_norm": 0.9914869070053101, + "learning_rate": 4.592106620957622e-06, + "loss": 0.6227, + "step": 3861 + }, + { + "epoch": 1.1305620608899298, + "grad_norm": 0.9963089823722839, + "learning_rate": 4.591895789552728e-06, + "loss": 0.6156, + "step": 3862 + }, + { + "epoch": 1.1308548009367683, + "grad_norm": 0.991753339767456, + "learning_rate": 4.591684908517204e-06, + "loss": 0.5667, + "step": 3863 + }, + { + "epoch": 1.1311475409836065, + "grad_norm": 0.9675180315971375, + "learning_rate": 4.591473977856054e-06, + "loss": 0.6146, + "step": 3864 + }, + { + "epoch": 1.131440281030445, + "grad_norm": 0.9653717279434204, + "learning_rate": 4.5912629975742815e-06, + "loss": 0.6317, + "step": 3865 + }, + { + "epoch": 1.1317330210772834, + "grad_norm": 0.9771853089332581, + "learning_rate": 4.591051967676893e-06, + "loss": 0.6582, + "step": 3866 + }, + { + "epoch": 1.1320257611241218, + "grad_norm": 0.9701839685440063, + "learning_rate": 4.590840888168895e-06, + "loss": 0.5908, + "step": 3867 + }, + { + "epoch": 1.1323185011709602, + "grad_norm": 0.9142202138900757, + "learning_rate": 4.590629759055295e-06, + "loss": 0.5823, + "step": 3868 + }, + { + "epoch": 1.1326112412177987, + "grad_norm": 0.9589078426361084, + "learning_rate": 4.590418580341102e-06, + "loss": 0.6131, + "step": 3869 + }, + { + "epoch": 1.132903981264637, + "grad_norm": 0.9612835049629211, + "learning_rate": 4.590207352031328e-06, + "loss": 0.6516, + "step": 3870 + }, + { + "epoch": 1.1331967213114753, + "grad_norm": 0.9390817880630493, + "learning_rate": 4.589996074130981e-06, + "loss": 0.5939, + "step": 3871 + }, + { + "epoch": 1.1334894613583137, + "grad_norm": 1.2184231281280518, + "learning_rate": 4.589784746645077e-06, + "loss": 0.6597, + "step": 3872 + }, + { + "epoch": 1.1337822014051522, + "grad_norm": 0.9500048756599426, + "learning_rate": 4.589573369578627e-06, + "loss": 0.6144, + "step": 3873 + }, + { + "epoch": 1.1340749414519906, + "grad_norm": 0.9752905368804932, + "learning_rate": 4.589361942936649e-06, + "loss": 0.626, + "step": 3874 + }, + { + "epoch": 1.134367681498829, + "grad_norm": 1.0092201232910156, + "learning_rate": 4.589150466724156e-06, + "loss": 0.6585, + "step": 3875 + }, + { + "epoch": 1.1346604215456675, + "grad_norm": 0.9855881929397583, + "learning_rate": 4.588938940946168e-06, + "loss": 0.6244, + "step": 3876 + }, + { + "epoch": 1.134953161592506, + "grad_norm": 1.009594440460205, + "learning_rate": 4.588727365607701e-06, + "loss": 0.6413, + "step": 3877 + }, + { + "epoch": 1.1352459016393444, + "grad_norm": 1.004822015762329, + "learning_rate": 4.588515740713777e-06, + "loss": 0.6353, + "step": 3878 + }, + { + "epoch": 1.1355386416861828, + "grad_norm": 1.0172860622406006, + "learning_rate": 4.5883040662694165e-06, + "loss": 0.6575, + "step": 3879 + }, + { + "epoch": 1.135831381733021, + "grad_norm": 0.9939830303192139, + "learning_rate": 4.58809234227964e-06, + "loss": 0.641, + "step": 3880 + }, + { + "epoch": 1.1361241217798594, + "grad_norm": 0.9681724905967712, + "learning_rate": 4.587880568749471e-06, + "loss": 0.6765, + "step": 3881 + }, + { + "epoch": 1.1364168618266979, + "grad_norm": 0.9489683508872986, + "learning_rate": 4.587668745683935e-06, + "loss": 0.6121, + "step": 3882 + }, + { + "epoch": 1.1367096018735363, + "grad_norm": 1.0474343299865723, + "learning_rate": 4.587456873088056e-06, + "loss": 0.6232, + "step": 3883 + }, + { + "epoch": 1.1370023419203747, + "grad_norm": 0.9991560578346252, + "learning_rate": 4.587244950966862e-06, + "loss": 0.5954, + "step": 3884 + }, + { + "epoch": 1.1372950819672132, + "grad_norm": 0.975243330001831, + "learning_rate": 4.587032979325382e-06, + "loss": 0.6391, + "step": 3885 + }, + { + "epoch": 1.1375878220140514, + "grad_norm": 0.9796773195266724, + "learning_rate": 4.5868209581686424e-06, + "loss": 0.6362, + "step": 3886 + }, + { + "epoch": 1.1378805620608898, + "grad_norm": 1.0083180665969849, + "learning_rate": 4.586608887501675e-06, + "loss": 0.6509, + "step": 3887 + }, + { + "epoch": 1.1381733021077283, + "grad_norm": 0.9666469693183899, + "learning_rate": 4.586396767329511e-06, + "loss": 0.6116, + "step": 3888 + }, + { + "epoch": 1.1384660421545667, + "grad_norm": 0.9897247552871704, + "learning_rate": 4.586184597657182e-06, + "loss": 0.6329, + "step": 3889 + }, + { + "epoch": 1.1387587822014051, + "grad_norm": 1.0091567039489746, + "learning_rate": 4.5859723784897224e-06, + "loss": 0.6432, + "step": 3890 + }, + { + "epoch": 1.1390515222482436, + "grad_norm": 0.9738842844963074, + "learning_rate": 4.585760109832168e-06, + "loss": 0.6871, + "step": 3891 + }, + { + "epoch": 1.139344262295082, + "grad_norm": 0.9840426445007324, + "learning_rate": 4.585547791689555e-06, + "loss": 0.6343, + "step": 3892 + }, + { + "epoch": 1.1396370023419204, + "grad_norm": 1.0046380758285522, + "learning_rate": 4.585335424066919e-06, + "loss": 0.6655, + "step": 3893 + }, + { + "epoch": 1.1399297423887589, + "grad_norm": 0.9656774401664734, + "learning_rate": 4.5851230069693e-06, + "loss": 0.5955, + "step": 3894 + }, + { + "epoch": 1.140222482435597, + "grad_norm": 0.9981351494789124, + "learning_rate": 4.584910540401737e-06, + "loss": 0.5779, + "step": 3895 + }, + { + "epoch": 1.1405152224824355, + "grad_norm": 0.9731106758117676, + "learning_rate": 4.58469802436927e-06, + "loss": 0.6665, + "step": 3896 + }, + { + "epoch": 1.140807962529274, + "grad_norm": 0.9120030403137207, + "learning_rate": 4.584485458876943e-06, + "loss": 0.5656, + "step": 3897 + }, + { + "epoch": 1.1411007025761124, + "grad_norm": 0.9336007237434387, + "learning_rate": 4.5842728439297975e-06, + "loss": 0.6062, + "step": 3898 + }, + { + "epoch": 1.1413934426229508, + "grad_norm": 0.9936284422874451, + "learning_rate": 4.584060179532878e-06, + "loss": 0.6215, + "step": 3899 + }, + { + "epoch": 1.1416861826697893, + "grad_norm": 0.9635063409805298, + "learning_rate": 4.583847465691231e-06, + "loss": 0.6247, + "step": 3900 + }, + { + "epoch": 1.1419789227166277, + "grad_norm": 1.097273826599121, + "learning_rate": 4.5836347024099025e-06, + "loss": 0.6105, + "step": 3901 + }, + { + "epoch": 1.142271662763466, + "grad_norm": 0.9959077835083008, + "learning_rate": 4.5834218896939395e-06, + "loss": 0.6381, + "step": 3902 + }, + { + "epoch": 1.1425644028103044, + "grad_norm": 1.0078036785125732, + "learning_rate": 4.583209027548393e-06, + "loss": 0.6242, + "step": 3903 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.9698099493980408, + "learning_rate": 4.582996115978312e-06, + "loss": 0.6323, + "step": 3904 + }, + { + "epoch": 1.1431498829039812, + "grad_norm": 0.967129647731781, + "learning_rate": 4.582783154988747e-06, + "loss": 0.5858, + "step": 3905 + }, + { + "epoch": 1.1434426229508197, + "grad_norm": 0.9499537348747253, + "learning_rate": 4.582570144584752e-06, + "loss": 0.6474, + "step": 3906 + }, + { + "epoch": 1.143735362997658, + "grad_norm": 1.0069056749343872, + "learning_rate": 4.58235708477138e-06, + "loss": 0.6351, + "step": 3907 + }, + { + "epoch": 1.1440281030444965, + "grad_norm": 0.960563063621521, + "learning_rate": 4.5821439755536865e-06, + "loss": 0.5758, + "step": 3908 + }, + { + "epoch": 1.144320843091335, + "grad_norm": 0.9781309962272644, + "learning_rate": 4.581930816936727e-06, + "loss": 0.6641, + "step": 3909 + }, + { + "epoch": 1.1446135831381734, + "grad_norm": 0.9803583025932312, + "learning_rate": 4.581717608925559e-06, + "loss": 0.6135, + "step": 3910 + }, + { + "epoch": 1.1449063231850116, + "grad_norm": 0.9537407159805298, + "learning_rate": 4.581504351525241e-06, + "loss": 0.636, + "step": 3911 + }, + { + "epoch": 1.14519906323185, + "grad_norm": 0.98774254322052, + "learning_rate": 4.581291044740832e-06, + "loss": 0.6139, + "step": 3912 + }, + { + "epoch": 1.1454918032786885, + "grad_norm": 1.0111796855926514, + "learning_rate": 4.581077688577393e-06, + "loss": 0.6601, + "step": 3913 + }, + { + "epoch": 1.145784543325527, + "grad_norm": 0.9507741332054138, + "learning_rate": 4.5808642830399865e-06, + "loss": 0.6121, + "step": 3914 + }, + { + "epoch": 1.1460772833723654, + "grad_norm": 0.9635425806045532, + "learning_rate": 4.5806508281336745e-06, + "loss": 0.6028, + "step": 3915 + }, + { + "epoch": 1.1463700234192038, + "grad_norm": 0.9402564764022827, + "learning_rate": 4.580437323863522e-06, + "loss": 0.6321, + "step": 3916 + }, + { + "epoch": 1.1466627634660422, + "grad_norm": 0.9581696391105652, + "learning_rate": 4.5802237702345945e-06, + "loss": 0.587, + "step": 3917 + }, + { + "epoch": 1.1469555035128804, + "grad_norm": 0.9676547646522522, + "learning_rate": 4.580010167251958e-06, + "loss": 0.6536, + "step": 3918 + }, + { + "epoch": 1.1472482435597189, + "grad_norm": 0.972605288028717, + "learning_rate": 4.579796514920681e-06, + "loss": 0.6283, + "step": 3919 + }, + { + "epoch": 1.1475409836065573, + "grad_norm": 0.9702804088592529, + "learning_rate": 4.579582813245832e-06, + "loss": 0.6778, + "step": 3920 + }, + { + "epoch": 1.1478337236533958, + "grad_norm": 0.973403811454773, + "learning_rate": 4.579369062232481e-06, + "loss": 0.5954, + "step": 3921 + }, + { + "epoch": 1.1481264637002342, + "grad_norm": 1.0386947393417358, + "learning_rate": 4.5791552618857e-06, + "loss": 0.6456, + "step": 3922 + }, + { + "epoch": 1.1484192037470726, + "grad_norm": 0.9726020693778992, + "learning_rate": 4.578941412210562e-06, + "loss": 0.6236, + "step": 3923 + }, + { + "epoch": 1.148711943793911, + "grad_norm": 0.9990214109420776, + "learning_rate": 4.578727513212138e-06, + "loss": 0.6596, + "step": 3924 + }, + { + "epoch": 1.1490046838407495, + "grad_norm": 0.9884713888168335, + "learning_rate": 4.578513564895505e-06, + "loss": 0.6398, + "step": 3925 + }, + { + "epoch": 1.149297423887588, + "grad_norm": 0.9600544571876526, + "learning_rate": 4.578299567265738e-06, + "loss": 0.6405, + "step": 3926 + }, + { + "epoch": 1.1495901639344261, + "grad_norm": 1.0042673349380493, + "learning_rate": 4.578085520327915e-06, + "loss": 0.6338, + "step": 3927 + }, + { + "epoch": 1.1498829039812646, + "grad_norm": 0.9643961191177368, + "learning_rate": 4.577871424087113e-06, + "loss": 0.5994, + "step": 3928 + }, + { + "epoch": 1.150175644028103, + "grad_norm": 0.9978809356689453, + "learning_rate": 4.577657278548413e-06, + "loss": 0.6356, + "step": 3929 + }, + { + "epoch": 1.1504683840749415, + "grad_norm": 0.9809943437576294, + "learning_rate": 4.577443083716894e-06, + "loss": 0.653, + "step": 3930 + }, + { + "epoch": 1.1507611241217799, + "grad_norm": 1.00352942943573, + "learning_rate": 4.5772288395976394e-06, + "loss": 0.6494, + "step": 3931 + }, + { + "epoch": 1.1510538641686183, + "grad_norm": 1.0136250257492065, + "learning_rate": 4.577014546195732e-06, + "loss": 0.6386, + "step": 3932 + }, + { + "epoch": 1.1513466042154568, + "grad_norm": 0.9780409932136536, + "learning_rate": 4.576800203516254e-06, + "loss": 0.6259, + "step": 3933 + }, + { + "epoch": 1.151639344262295, + "grad_norm": 0.9808200001716614, + "learning_rate": 4.576585811564293e-06, + "loss": 0.6452, + "step": 3934 + }, + { + "epoch": 1.1519320843091334, + "grad_norm": 0.985503077507019, + "learning_rate": 4.576371370344935e-06, + "loss": 0.6231, + "step": 3935 + }, + { + "epoch": 1.1522248243559718, + "grad_norm": 1.009474515914917, + "learning_rate": 4.576156879863267e-06, + "loss": 0.6365, + "step": 3936 + }, + { + "epoch": 1.1525175644028103, + "grad_norm": 0.9525560140609741, + "learning_rate": 4.575942340124378e-06, + "loss": 0.5973, + "step": 3937 + }, + { + "epoch": 1.1528103044496487, + "grad_norm": 0.9012035131454468, + "learning_rate": 4.575727751133358e-06, + "loss": 0.599, + "step": 3938 + }, + { + "epoch": 1.1531030444964872, + "grad_norm": 0.9459276795387268, + "learning_rate": 4.575513112895299e-06, + "loss": 0.6282, + "step": 3939 + }, + { + "epoch": 1.1533957845433256, + "grad_norm": 0.9677209258079529, + "learning_rate": 4.575298425415292e-06, + "loss": 0.6223, + "step": 3940 + }, + { + "epoch": 1.153688524590164, + "grad_norm": 0.9363953471183777, + "learning_rate": 4.575083688698431e-06, + "loss": 0.5741, + "step": 3941 + }, + { + "epoch": 1.1539812646370025, + "grad_norm": 0.9916462302207947, + "learning_rate": 4.5748689027498114e-06, + "loss": 0.624, + "step": 3942 + }, + { + "epoch": 1.1542740046838407, + "grad_norm": 0.9474512338638306, + "learning_rate": 4.574654067574528e-06, + "loss": 0.6654, + "step": 3943 + }, + { + "epoch": 1.154566744730679, + "grad_norm": 1.0035390853881836, + "learning_rate": 4.574439183177679e-06, + "loss": 0.6564, + "step": 3944 + }, + { + "epoch": 1.1548594847775175, + "grad_norm": 0.9790393710136414, + "learning_rate": 4.574224249564361e-06, + "loss": 0.6405, + "step": 3945 + }, + { + "epoch": 1.155152224824356, + "grad_norm": 0.9668492078781128, + "learning_rate": 4.574009266739675e-06, + "loss": 0.6243, + "step": 3946 + }, + { + "epoch": 1.1554449648711944, + "grad_norm": 0.9240148663520813, + "learning_rate": 4.57379423470872e-06, + "loss": 0.5667, + "step": 3947 + }, + { + "epoch": 1.1557377049180328, + "grad_norm": 1.0241847038269043, + "learning_rate": 4.573579153476598e-06, + "loss": 0.6309, + "step": 3948 + }, + { + "epoch": 1.1560304449648713, + "grad_norm": 0.9715156555175781, + "learning_rate": 4.573364023048414e-06, + "loss": 0.6708, + "step": 3949 + }, + { + "epoch": 1.1563231850117095, + "grad_norm": 0.9808785915374756, + "learning_rate": 4.573148843429268e-06, + "loss": 0.6103, + "step": 3950 + }, + { + "epoch": 1.156615925058548, + "grad_norm": 0.9764946699142456, + "learning_rate": 4.572933614624268e-06, + "loss": 0.6737, + "step": 3951 + }, + { + "epoch": 1.1569086651053864, + "grad_norm": 0.947813093662262, + "learning_rate": 4.57271833663852e-06, + "loss": 0.6139, + "step": 3952 + }, + { + "epoch": 1.1572014051522248, + "grad_norm": 0.9865022897720337, + "learning_rate": 4.572503009477132e-06, + "loss": 0.6311, + "step": 3953 + }, + { + "epoch": 1.1574941451990632, + "grad_norm": 0.9688667058944702, + "learning_rate": 4.572287633145212e-06, + "loss": 0.6313, + "step": 3954 + }, + { + "epoch": 1.1577868852459017, + "grad_norm": 1.022560477256775, + "learning_rate": 4.572072207647869e-06, + "loss": 0.6218, + "step": 3955 + }, + { + "epoch": 1.1580796252927401, + "grad_norm": 0.9923453330993652, + "learning_rate": 4.571856732990214e-06, + "loss": 0.592, + "step": 3956 + }, + { + "epoch": 1.1583723653395785, + "grad_norm": 0.9617502093315125, + "learning_rate": 4.571641209177361e-06, + "loss": 0.6305, + "step": 3957 + }, + { + "epoch": 1.158665105386417, + "grad_norm": 0.9740207195281982, + "learning_rate": 4.571425636214422e-06, + "loss": 0.6161, + "step": 3958 + }, + { + "epoch": 1.1589578454332552, + "grad_norm": 1.0763038396835327, + "learning_rate": 4.5712100141065115e-06, + "loss": 0.5672, + "step": 3959 + }, + { + "epoch": 1.1592505854800936, + "grad_norm": 0.927771270275116, + "learning_rate": 4.5709943428587465e-06, + "loss": 0.6219, + "step": 3960 + }, + { + "epoch": 1.159543325526932, + "grad_norm": 0.9783847332000732, + "learning_rate": 4.570778622476243e-06, + "loss": 0.5843, + "step": 3961 + }, + { + "epoch": 1.1598360655737705, + "grad_norm": 0.9909420013427734, + "learning_rate": 4.570562852964118e-06, + "loss": 0.6388, + "step": 3962 + }, + { + "epoch": 1.160128805620609, + "grad_norm": 1.00015389919281, + "learning_rate": 4.570347034327491e-06, + "loss": 0.6459, + "step": 3963 + }, + { + "epoch": 1.1604215456674474, + "grad_norm": 1.0316163301467896, + "learning_rate": 4.570131166571483e-06, + "loss": 0.6727, + "step": 3964 + }, + { + "epoch": 1.1607142857142858, + "grad_norm": 0.9614436030387878, + "learning_rate": 4.569915249701217e-06, + "loss": 0.6495, + "step": 3965 + }, + { + "epoch": 1.161007025761124, + "grad_norm": 0.9333088994026184, + "learning_rate": 4.569699283721812e-06, + "loss": 0.5842, + "step": 3966 + }, + { + "epoch": 1.1612997658079625, + "grad_norm": 1.0172415971755981, + "learning_rate": 4.569483268638395e-06, + "loss": 0.643, + "step": 3967 + }, + { + "epoch": 1.161592505854801, + "grad_norm": 1.0245931148529053, + "learning_rate": 4.56926720445609e-06, + "loss": 0.6099, + "step": 3968 + }, + { + "epoch": 1.1618852459016393, + "grad_norm": 1.0360833406448364, + "learning_rate": 4.569051091180022e-06, + "loss": 0.6258, + "step": 3969 + }, + { + "epoch": 1.1621779859484778, + "grad_norm": 0.9980958700180054, + "learning_rate": 4.56883492881532e-06, + "loss": 0.6527, + "step": 3970 + }, + { + "epoch": 1.1624707259953162, + "grad_norm": 0.9580082297325134, + "learning_rate": 4.5686187173671115e-06, + "loss": 0.6283, + "step": 3971 + }, + { + "epoch": 1.1627634660421546, + "grad_norm": 0.9367914795875549, + "learning_rate": 4.5684024568405275e-06, + "loss": 0.6386, + "step": 3972 + }, + { + "epoch": 1.163056206088993, + "grad_norm": 1.00300931930542, + "learning_rate": 4.568186147240697e-06, + "loss": 0.6161, + "step": 3973 + }, + { + "epoch": 1.1633489461358313, + "grad_norm": 0.94806307554245, + "learning_rate": 4.567969788572753e-06, + "loss": 0.6318, + "step": 3974 + }, + { + "epoch": 1.1636416861826697, + "grad_norm": 0.9666337370872498, + "learning_rate": 4.567753380841828e-06, + "loss": 0.6098, + "step": 3975 + }, + { + "epoch": 1.1639344262295082, + "grad_norm": 1.022963047027588, + "learning_rate": 4.567536924053057e-06, + "loss": 0.6193, + "step": 3976 + }, + { + "epoch": 1.1642271662763466, + "grad_norm": 1.000006914138794, + "learning_rate": 4.567320418211576e-06, + "loss": 0.6231, + "step": 3977 + }, + { + "epoch": 1.164519906323185, + "grad_norm": 1.0284385681152344, + "learning_rate": 4.56710386332252e-06, + "loss": 0.6294, + "step": 3978 + }, + { + "epoch": 1.1648126463700235, + "grad_norm": 0.9648719429969788, + "learning_rate": 4.5668872593910276e-06, + "loss": 0.6478, + "step": 3979 + }, + { + "epoch": 1.165105386416862, + "grad_norm": 0.9762274622917175, + "learning_rate": 4.5666706064222384e-06, + "loss": 0.6131, + "step": 3980 + }, + { + "epoch": 1.1653981264637001, + "grad_norm": 0.9257004857063293, + "learning_rate": 4.566453904421292e-06, + "loss": 0.6069, + "step": 3981 + }, + { + "epoch": 1.1656908665105385, + "grad_norm": 0.9382058382034302, + "learning_rate": 4.56623715339333e-06, + "loss": 0.5646, + "step": 3982 + }, + { + "epoch": 1.165983606557377, + "grad_norm": 0.9773825407028198, + "learning_rate": 4.566020353343494e-06, + "loss": 0.5976, + "step": 3983 + }, + { + "epoch": 1.1662763466042154, + "grad_norm": 1.0003498792648315, + "learning_rate": 4.5658035042769285e-06, + "loss": 0.6266, + "step": 3984 + }, + { + "epoch": 1.1665690866510539, + "grad_norm": 0.9320899844169617, + "learning_rate": 4.565586606198777e-06, + "loss": 0.5509, + "step": 3985 + }, + { + "epoch": 1.1668618266978923, + "grad_norm": 1.006221055984497, + "learning_rate": 4.565369659114188e-06, + "loss": 0.6291, + "step": 3986 + }, + { + "epoch": 1.1671545667447307, + "grad_norm": 0.9852460622787476, + "learning_rate": 4.565152663028306e-06, + "loss": 0.6217, + "step": 3987 + }, + { + "epoch": 1.1674473067915692, + "grad_norm": 0.9820074439048767, + "learning_rate": 4.5649356179462815e-06, + "loss": 0.6274, + "step": 3988 + }, + { + "epoch": 1.1677400468384076, + "grad_norm": 0.9844979047775269, + "learning_rate": 4.564718523873261e-06, + "loss": 0.5931, + "step": 3989 + }, + { + "epoch": 1.1680327868852458, + "grad_norm": 1.0302549600601196, + "learning_rate": 4.564501380814398e-06, + "loss": 0.6555, + "step": 3990 + }, + { + "epoch": 1.1683255269320842, + "grad_norm": 1.1135177612304688, + "learning_rate": 4.564284188774843e-06, + "loss": 0.6346, + "step": 3991 + }, + { + "epoch": 1.1686182669789227, + "grad_norm": 0.9314882755279541, + "learning_rate": 4.5640669477597484e-06, + "loss": 0.5963, + "step": 3992 + }, + { + "epoch": 1.1689110070257611, + "grad_norm": 1.0426868200302124, + "learning_rate": 4.56384965777427e-06, + "loss": 0.6014, + "step": 3993 + }, + { + "epoch": 1.1692037470725996, + "grad_norm": 1.0377687215805054, + "learning_rate": 4.563632318823561e-06, + "loss": 0.6161, + "step": 3994 + }, + { + "epoch": 1.169496487119438, + "grad_norm": 0.988455593585968, + "learning_rate": 4.56341493091278e-06, + "loss": 0.5985, + "step": 3995 + }, + { + "epoch": 1.1697892271662764, + "grad_norm": 1.010964274406433, + "learning_rate": 4.563197494047082e-06, + "loss": 0.5714, + "step": 3996 + }, + { + "epoch": 1.1700819672131146, + "grad_norm": 0.9945647120475769, + "learning_rate": 4.562980008231628e-06, + "loss": 0.637, + "step": 3997 + }, + { + "epoch": 1.170374707259953, + "grad_norm": 1.0499600172042847, + "learning_rate": 4.562762473471576e-06, + "loss": 0.5852, + "step": 3998 + }, + { + "epoch": 1.1706674473067915, + "grad_norm": 0.9455194473266602, + "learning_rate": 4.562544889772088e-06, + "loss": 0.5991, + "step": 3999 + }, + { + "epoch": 1.17096018735363, + "grad_norm": 0.9908710718154907, + "learning_rate": 4.562327257138327e-06, + "loss": 0.6403, + "step": 4000 + }, + { + "epoch": 1.1712529274004684, + "grad_norm": 0.9435238242149353, + "learning_rate": 4.562109575575455e-06, + "loss": 0.6053, + "step": 4001 + }, + { + "epoch": 1.1715456674473068, + "grad_norm": 0.9422709345817566, + "learning_rate": 4.561891845088637e-06, + "loss": 0.6202, + "step": 4002 + }, + { + "epoch": 1.1718384074941453, + "grad_norm": 1.0155086517333984, + "learning_rate": 4.56167406568304e-06, + "loss": 0.6114, + "step": 4003 + }, + { + "epoch": 1.1721311475409837, + "grad_norm": 0.9057068228721619, + "learning_rate": 4.5614562373638284e-06, + "loss": 0.5743, + "step": 4004 + }, + { + "epoch": 1.1724238875878221, + "grad_norm": 0.9245136976242065, + "learning_rate": 4.561238360136172e-06, + "loss": 0.6181, + "step": 4005 + }, + { + "epoch": 1.1727166276346603, + "grad_norm": 0.9634538888931274, + "learning_rate": 4.56102043400524e-06, + "loss": 0.6555, + "step": 4006 + }, + { + "epoch": 1.1730093676814988, + "grad_norm": 0.9635276198387146, + "learning_rate": 4.560802458976202e-06, + "loss": 0.6195, + "step": 4007 + }, + { + "epoch": 1.1733021077283372, + "grad_norm": 0.9827808737754822, + "learning_rate": 4.560584435054229e-06, + "loss": 0.626, + "step": 4008 + }, + { + "epoch": 1.1735948477751756, + "grad_norm": 0.9454050064086914, + "learning_rate": 4.560366362244496e-06, + "loss": 0.6022, + "step": 4009 + }, + { + "epoch": 1.173887587822014, + "grad_norm": 0.9353100061416626, + "learning_rate": 4.560148240552173e-06, + "loss": 0.5974, + "step": 4010 + }, + { + "epoch": 1.1741803278688525, + "grad_norm": 0.9932160377502441, + "learning_rate": 4.5599300699824395e-06, + "loss": 0.6262, + "step": 4011 + }, + { + "epoch": 1.174473067915691, + "grad_norm": 0.9775030016899109, + "learning_rate": 4.559711850540468e-06, + "loss": 0.6048, + "step": 4012 + }, + { + "epoch": 1.1747658079625292, + "grad_norm": 1.0005176067352295, + "learning_rate": 4.559493582231437e-06, + "loss": 0.6026, + "step": 4013 + }, + { + "epoch": 1.1750585480093676, + "grad_norm": 1.003309726715088, + "learning_rate": 4.559275265060526e-06, + "loss": 0.5982, + "step": 4014 + }, + { + "epoch": 1.175351288056206, + "grad_norm": 0.9742490649223328, + "learning_rate": 4.559056899032913e-06, + "loss": 0.6394, + "step": 4015 + }, + { + "epoch": 1.1756440281030445, + "grad_norm": 0.9820358753204346, + "learning_rate": 4.558838484153779e-06, + "loss": 0.5894, + "step": 4016 + }, + { + "epoch": 1.175936768149883, + "grad_norm": 0.969911515712738, + "learning_rate": 4.558620020428307e-06, + "loss": 0.6718, + "step": 4017 + }, + { + "epoch": 1.1762295081967213, + "grad_norm": 0.9907684326171875, + "learning_rate": 4.558401507861681e-06, + "loss": 0.6371, + "step": 4018 + }, + { + "epoch": 1.1765222482435598, + "grad_norm": 1.0242137908935547, + "learning_rate": 4.558182946459081e-06, + "loss": 0.6428, + "step": 4019 + }, + { + "epoch": 1.1768149882903982, + "grad_norm": 1.0736823081970215, + "learning_rate": 4.557964336225697e-06, + "loss": 0.6482, + "step": 4020 + }, + { + "epoch": 1.1771077283372366, + "grad_norm": 0.9303506016731262, + "learning_rate": 4.557745677166713e-06, + "loss": 0.6121, + "step": 4021 + }, + { + "epoch": 1.1774004683840749, + "grad_norm": 0.9367402195930481, + "learning_rate": 4.557526969287318e-06, + "loss": 0.5775, + "step": 4022 + }, + { + "epoch": 1.1776932084309133, + "grad_norm": 0.94473797082901, + "learning_rate": 4.5573082125927e-06, + "loss": 0.6358, + "step": 4023 + }, + { + "epoch": 1.1779859484777517, + "grad_norm": 0.9358528256416321, + "learning_rate": 4.557089407088049e-06, + "loss": 0.6555, + "step": 4024 + }, + { + "epoch": 1.1782786885245902, + "grad_norm": 0.9596480131149292, + "learning_rate": 4.556870552778557e-06, + "loss": 0.6311, + "step": 4025 + }, + { + "epoch": 1.1785714285714286, + "grad_norm": 0.9353258609771729, + "learning_rate": 4.556651649669417e-06, + "loss": 0.6071, + "step": 4026 + }, + { + "epoch": 1.178864168618267, + "grad_norm": 1.0326642990112305, + "learning_rate": 4.55643269776582e-06, + "loss": 0.5924, + "step": 4027 + }, + { + "epoch": 1.1791569086651055, + "grad_norm": 0.960258960723877, + "learning_rate": 4.556213697072963e-06, + "loss": 0.6486, + "step": 4028 + }, + { + "epoch": 1.1794496487119437, + "grad_norm": 1.0255565643310547, + "learning_rate": 4.55599464759604e-06, + "loss": 0.5916, + "step": 4029 + }, + { + "epoch": 1.1797423887587821, + "grad_norm": 1.0123175382614136, + "learning_rate": 4.55577554934025e-06, + "loss": 0.6602, + "step": 4030 + }, + { + "epoch": 1.1800351288056206, + "grad_norm": 1.0103296041488647, + "learning_rate": 4.55555640231079e-06, + "loss": 0.621, + "step": 4031 + }, + { + "epoch": 1.180327868852459, + "grad_norm": 0.920932412147522, + "learning_rate": 4.555337206512859e-06, + "loss": 0.6016, + "step": 4032 + }, + { + "epoch": 1.1806206088992974, + "grad_norm": 0.9788028001785278, + "learning_rate": 4.5551179619516585e-06, + "loss": 0.6429, + "step": 4033 + }, + { + "epoch": 1.1809133489461359, + "grad_norm": 1.024303913116455, + "learning_rate": 4.554898668632389e-06, + "loss": 0.6079, + "step": 4034 + }, + { + "epoch": 1.1812060889929743, + "grad_norm": 0.9523053765296936, + "learning_rate": 4.554679326560254e-06, + "loss": 0.5978, + "step": 4035 + }, + { + "epoch": 1.1814988290398127, + "grad_norm": 0.989281952381134, + "learning_rate": 4.554459935740457e-06, + "loss": 0.6321, + "step": 4036 + }, + { + "epoch": 1.1817915690866512, + "grad_norm": 1.017836570739746, + "learning_rate": 4.554240496178204e-06, + "loss": 0.6535, + "step": 4037 + }, + { + "epoch": 1.1820843091334894, + "grad_norm": 0.9420521259307861, + "learning_rate": 4.5540210078787e-06, + "loss": 0.6155, + "step": 4038 + }, + { + "epoch": 1.1823770491803278, + "grad_norm": 0.971993625164032, + "learning_rate": 4.553801470847153e-06, + "loss": 0.6213, + "step": 4039 + }, + { + "epoch": 1.1826697892271663, + "grad_norm": 0.948943018913269, + "learning_rate": 4.553581885088772e-06, + "loss": 0.6233, + "step": 4040 + }, + { + "epoch": 1.1829625292740047, + "grad_norm": 0.9922240972518921, + "learning_rate": 4.553362250608765e-06, + "loss": 0.6538, + "step": 4041 + }, + { + "epoch": 1.1832552693208431, + "grad_norm": 1.005143642425537, + "learning_rate": 4.553142567412345e-06, + "loss": 0.5627, + "step": 4042 + }, + { + "epoch": 1.1835480093676816, + "grad_norm": 0.9514747858047485, + "learning_rate": 4.552922835504723e-06, + "loss": 0.5915, + "step": 4043 + }, + { + "epoch": 1.18384074941452, + "grad_norm": 0.9258306622505188, + "learning_rate": 4.5527030548911124e-06, + "loss": 0.5915, + "step": 4044 + }, + { + "epoch": 1.1841334894613582, + "grad_norm": 0.980612576007843, + "learning_rate": 4.5524832255767274e-06, + "loss": 0.6354, + "step": 4045 + }, + { + "epoch": 1.1844262295081966, + "grad_norm": 0.9794800281524658, + "learning_rate": 4.552263347566784e-06, + "loss": 0.5919, + "step": 4046 + }, + { + "epoch": 1.184718969555035, + "grad_norm": 1.0117034912109375, + "learning_rate": 4.552043420866498e-06, + "loss": 0.6723, + "step": 4047 + }, + { + "epoch": 1.1850117096018735, + "grad_norm": 1.061004877090454, + "learning_rate": 4.551823445481087e-06, + "loss": 0.6258, + "step": 4048 + }, + { + "epoch": 1.185304449648712, + "grad_norm": 1.0326769351959229, + "learning_rate": 4.551603421415771e-06, + "loss": 0.633, + "step": 4049 + }, + { + "epoch": 1.1855971896955504, + "grad_norm": 1.0024168491363525, + "learning_rate": 4.55138334867577e-06, + "loss": 0.6449, + "step": 4050 + }, + { + "epoch": 1.1858899297423888, + "grad_norm": 0.9838324785232544, + "learning_rate": 4.551163227266305e-06, + "loss": 0.6068, + "step": 4051 + }, + { + "epoch": 1.1861826697892273, + "grad_norm": 0.9822061061859131, + "learning_rate": 4.550943057192598e-06, + "loss": 0.5948, + "step": 4052 + }, + { + "epoch": 1.1864754098360657, + "grad_norm": 0.9225403666496277, + "learning_rate": 4.550722838459873e-06, + "loss": 0.5693, + "step": 4053 + }, + { + "epoch": 1.186768149882904, + "grad_norm": 0.9507278800010681, + "learning_rate": 4.550502571073355e-06, + "loss": 0.5857, + "step": 4054 + }, + { + "epoch": 1.1870608899297423, + "grad_norm": 0.9717975854873657, + "learning_rate": 4.550282255038269e-06, + "loss": 0.6194, + "step": 4055 + }, + { + "epoch": 1.1873536299765808, + "grad_norm": 0.9910897016525269, + "learning_rate": 4.550061890359843e-06, + "loss": 0.6092, + "step": 4056 + }, + { + "epoch": 1.1876463700234192, + "grad_norm": 0.974524736404419, + "learning_rate": 4.549841477043304e-06, + "loss": 0.629, + "step": 4057 + }, + { + "epoch": 1.1879391100702577, + "grad_norm": 0.9437193870544434, + "learning_rate": 4.549621015093883e-06, + "loss": 0.5903, + "step": 4058 + }, + { + "epoch": 1.188231850117096, + "grad_norm": 0.9659669995307922, + "learning_rate": 4.54940050451681e-06, + "loss": 0.608, + "step": 4059 + }, + { + "epoch": 1.1885245901639343, + "grad_norm": 0.9507407546043396, + "learning_rate": 4.549179945317315e-06, + "loss": 0.5786, + "step": 4060 + }, + { + "epoch": 1.1888173302107727, + "grad_norm": 0.9434143304824829, + "learning_rate": 4.548959337500633e-06, + "loss": 0.5933, + "step": 4061 + }, + { + "epoch": 1.1891100702576112, + "grad_norm": 0.9230867028236389, + "learning_rate": 4.548738681071997e-06, + "loss": 0.5831, + "step": 4062 + }, + { + "epoch": 1.1894028103044496, + "grad_norm": 1.015138030052185, + "learning_rate": 4.5485179760366415e-06, + "loss": 0.6325, + "step": 4063 + }, + { + "epoch": 1.189695550351288, + "grad_norm": 0.9354866743087769, + "learning_rate": 4.548297222399804e-06, + "loss": 0.6288, + "step": 4064 + }, + { + "epoch": 1.1899882903981265, + "grad_norm": 0.9615681767463684, + "learning_rate": 4.548076420166722e-06, + "loss": 0.5892, + "step": 4065 + }, + { + "epoch": 1.190281030444965, + "grad_norm": 0.9574183821678162, + "learning_rate": 4.547855569342633e-06, + "loss": 0.6295, + "step": 4066 + }, + { + "epoch": 1.1905737704918034, + "grad_norm": 1.0130500793457031, + "learning_rate": 4.547634669932777e-06, + "loss": 0.617, + "step": 4067 + }, + { + "epoch": 1.1908665105386418, + "grad_norm": 0.9841152429580688, + "learning_rate": 4.5474137219423944e-06, + "loss": 0.6235, + "step": 4068 + }, + { + "epoch": 1.19115925058548, + "grad_norm": 0.9497674107551575, + "learning_rate": 4.547192725376729e-06, + "loss": 0.6247, + "step": 4069 + }, + { + "epoch": 1.1914519906323184, + "grad_norm": 1.0361043214797974, + "learning_rate": 4.546971680241023e-06, + "loss": 0.6614, + "step": 4070 + }, + { + "epoch": 1.1917447306791569, + "grad_norm": 1.0218533277511597, + "learning_rate": 4.546750586540519e-06, + "loss": 0.6442, + "step": 4071 + }, + { + "epoch": 1.1920374707259953, + "grad_norm": 1.0211670398712158, + "learning_rate": 4.546529444280465e-06, + "loss": 0.6312, + "step": 4072 + }, + { + "epoch": 1.1923302107728337, + "grad_norm": 0.9484982490539551, + "learning_rate": 4.546308253466107e-06, + "loss": 0.654, + "step": 4073 + }, + { + "epoch": 1.1926229508196722, + "grad_norm": 0.9695085287094116, + "learning_rate": 4.546087014102693e-06, + "loss": 0.5633, + "step": 4074 + }, + { + "epoch": 1.1929156908665106, + "grad_norm": 1.0321831703186035, + "learning_rate": 4.545865726195471e-06, + "loss": 0.5987, + "step": 4075 + }, + { + "epoch": 1.1932084309133488, + "grad_norm": 0.9172946214675903, + "learning_rate": 4.545644389749691e-06, + "loss": 0.5811, + "step": 4076 + }, + { + "epoch": 1.1935011709601873, + "grad_norm": 0.9542210102081299, + "learning_rate": 4.545423004770606e-06, + "loss": 0.6236, + "step": 4077 + }, + { + "epoch": 1.1937939110070257, + "grad_norm": 1.0377801656723022, + "learning_rate": 4.545201571263467e-06, + "loss": 0.6284, + "step": 4078 + }, + { + "epoch": 1.1940866510538641, + "grad_norm": 0.9620851278305054, + "learning_rate": 4.544980089233528e-06, + "loss": 0.6141, + "step": 4079 + }, + { + "epoch": 1.1943793911007026, + "grad_norm": 0.9288758635520935, + "learning_rate": 4.544758558686042e-06, + "loss": 0.6235, + "step": 4080 + }, + { + "epoch": 1.194672131147541, + "grad_norm": 0.9985141158103943, + "learning_rate": 4.544536979626267e-06, + "loss": 0.621, + "step": 4081 + }, + { + "epoch": 1.1949648711943794, + "grad_norm": 0.9403860569000244, + "learning_rate": 4.54431535205946e-06, + "loss": 0.5784, + "step": 4082 + }, + { + "epoch": 1.1952576112412179, + "grad_norm": 0.94313645362854, + "learning_rate": 4.544093675990878e-06, + "loss": 0.6493, + "step": 4083 + }, + { + "epoch": 1.1955503512880563, + "grad_norm": 0.9905625581741333, + "learning_rate": 4.543871951425781e-06, + "loss": 0.6037, + "step": 4084 + }, + { + "epoch": 1.1958430913348945, + "grad_norm": 0.9249507188796997, + "learning_rate": 4.543650178369429e-06, + "loss": 0.6298, + "step": 4085 + }, + { + "epoch": 1.196135831381733, + "grad_norm": 0.998201847076416, + "learning_rate": 4.543428356827084e-06, + "loss": 0.6783, + "step": 4086 + }, + { + "epoch": 1.1964285714285714, + "grad_norm": 0.9322547316551208, + "learning_rate": 4.54320648680401e-06, + "loss": 0.5969, + "step": 4087 + }, + { + "epoch": 1.1967213114754098, + "grad_norm": 1.01278555393219, + "learning_rate": 4.5429845683054675e-06, + "loss": 0.6654, + "step": 4088 + }, + { + "epoch": 1.1970140515222483, + "grad_norm": 0.9736299514770508, + "learning_rate": 4.5427626013367245e-06, + "loss": 0.6187, + "step": 4089 + }, + { + "epoch": 1.1973067915690867, + "grad_norm": 0.9377394318580627, + "learning_rate": 4.542540585903047e-06, + "loss": 0.6403, + "step": 4090 + }, + { + "epoch": 1.1975995316159251, + "grad_norm": 1.031380295753479, + "learning_rate": 4.542318522009701e-06, + "loss": 0.6245, + "step": 4091 + }, + { + "epoch": 1.1978922716627634, + "grad_norm": 0.9537259340286255, + "learning_rate": 4.5420964096619555e-06, + "loss": 0.6037, + "step": 4092 + }, + { + "epoch": 1.1981850117096018, + "grad_norm": 1.0192688703536987, + "learning_rate": 4.54187424886508e-06, + "loss": 0.6569, + "step": 4093 + }, + { + "epoch": 1.1984777517564402, + "grad_norm": 0.9643672704696655, + "learning_rate": 4.5416520396243466e-06, + "loss": 0.6239, + "step": 4094 + }, + { + "epoch": 1.1987704918032787, + "grad_norm": 0.9718366861343384, + "learning_rate": 4.541429781945026e-06, + "loss": 0.6061, + "step": 4095 + }, + { + "epoch": 1.199063231850117, + "grad_norm": 1.005693793296814, + "learning_rate": 4.541207475832391e-06, + "loss": 0.6268, + "step": 4096 + }, + { + "epoch": 1.1993559718969555, + "grad_norm": 0.960507869720459, + "learning_rate": 4.540985121291717e-06, + "loss": 0.6455, + "step": 4097 + }, + { + "epoch": 1.199648711943794, + "grad_norm": 0.9963826537132263, + "learning_rate": 4.540762718328279e-06, + "loss": 0.6573, + "step": 4098 + }, + { + "epoch": 1.1999414519906324, + "grad_norm": 0.9723401665687561, + "learning_rate": 4.540540266947354e-06, + "loss": 0.6471, + "step": 4099 + }, + { + "epoch": 1.2002341920374708, + "grad_norm": 0.969629168510437, + "learning_rate": 4.540317767154218e-06, + "loss": 0.5806, + "step": 4100 + }, + { + "epoch": 1.200526932084309, + "grad_norm": 0.9757792949676514, + "learning_rate": 4.5400952189541515e-06, + "loss": 0.6118, + "step": 4101 + }, + { + "epoch": 1.2008196721311475, + "grad_norm": 0.9879500865936279, + "learning_rate": 4.539872622352434e-06, + "loss": 0.6461, + "step": 4102 + }, + { + "epoch": 1.201112412177986, + "grad_norm": 1.0126479864120483, + "learning_rate": 4.539649977354346e-06, + "loss": 0.6124, + "step": 4103 + }, + { + "epoch": 1.2014051522248244, + "grad_norm": 0.9360887408256531, + "learning_rate": 4.539427283965171e-06, + "loss": 0.6126, + "step": 4104 + }, + { + "epoch": 1.2016978922716628, + "grad_norm": 0.9882668256759644, + "learning_rate": 4.539204542190192e-06, + "loss": 0.61, + "step": 4105 + }, + { + "epoch": 1.2019906323185012, + "grad_norm": 0.9443275928497314, + "learning_rate": 4.5389817520346936e-06, + "loss": 0.611, + "step": 4106 + }, + { + "epoch": 1.2022833723653397, + "grad_norm": 1.0116037130355835, + "learning_rate": 4.538758913503961e-06, + "loss": 0.6382, + "step": 4107 + }, + { + "epoch": 1.2025761124121779, + "grad_norm": 0.931643545627594, + "learning_rate": 4.538536026603282e-06, + "loss": 0.603, + "step": 4108 + }, + { + "epoch": 1.2028688524590163, + "grad_norm": 1.0020712614059448, + "learning_rate": 4.538313091337943e-06, + "loss": 0.6313, + "step": 4109 + }, + { + "epoch": 1.2031615925058547, + "grad_norm": 0.9967525601387024, + "learning_rate": 4.538090107713235e-06, + "loss": 0.6321, + "step": 4110 + }, + { + "epoch": 1.2034543325526932, + "grad_norm": 0.9356414675712585, + "learning_rate": 4.537867075734448e-06, + "loss": 0.5712, + "step": 4111 + }, + { + "epoch": 1.2037470725995316, + "grad_norm": 0.964428186416626, + "learning_rate": 4.537643995406873e-06, + "loss": 0.6468, + "step": 4112 + }, + { + "epoch": 1.20403981264637, + "grad_norm": 1.0499025583267212, + "learning_rate": 4.537420866735802e-06, + "loss": 0.6426, + "step": 4113 + }, + { + "epoch": 1.2043325526932085, + "grad_norm": 0.944702684879303, + "learning_rate": 4.537197689726531e-06, + "loss": 0.6368, + "step": 4114 + }, + { + "epoch": 1.204625292740047, + "grad_norm": 0.9971206784248352, + "learning_rate": 4.536974464384352e-06, + "loss": 0.6384, + "step": 4115 + }, + { + "epoch": 1.2049180327868854, + "grad_norm": 0.951614260673523, + "learning_rate": 4.536751190714563e-06, + "loss": 0.6005, + "step": 4116 + }, + { + "epoch": 1.2052107728337236, + "grad_norm": 0.9736311435699463, + "learning_rate": 4.53652786872246e-06, + "loss": 0.6274, + "step": 4117 + }, + { + "epoch": 1.205503512880562, + "grad_norm": 0.9730742573738098, + "learning_rate": 4.536304498413343e-06, + "loss": 0.6068, + "step": 4118 + }, + { + "epoch": 1.2057962529274004, + "grad_norm": 0.9695230722427368, + "learning_rate": 4.53608107979251e-06, + "loss": 0.5865, + "step": 4119 + }, + { + "epoch": 1.2060889929742389, + "grad_norm": 0.9465107321739197, + "learning_rate": 4.535857612865262e-06, + "loss": 0.6371, + "step": 4120 + }, + { + "epoch": 1.2063817330210773, + "grad_norm": 0.9944451451301575, + "learning_rate": 4.535634097636902e-06, + "loss": 0.6169, + "step": 4121 + }, + { + "epoch": 1.2066744730679158, + "grad_norm": 0.9682955145835876, + "learning_rate": 4.535410534112731e-06, + "loss": 0.6358, + "step": 4122 + }, + { + "epoch": 1.2069672131147542, + "grad_norm": 0.984351634979248, + "learning_rate": 4.535186922298054e-06, + "loss": 0.6634, + "step": 4123 + }, + { + "epoch": 1.2072599531615924, + "grad_norm": 0.9979748129844666, + "learning_rate": 4.5349632621981775e-06, + "loss": 0.6406, + "step": 4124 + }, + { + "epoch": 1.2075526932084308, + "grad_norm": 0.9636246562004089, + "learning_rate": 4.5347395538184055e-06, + "loss": 0.5905, + "step": 4125 + }, + { + "epoch": 1.2078454332552693, + "grad_norm": 0.9530026316642761, + "learning_rate": 4.5345157971640465e-06, + "loss": 0.635, + "step": 4126 + }, + { + "epoch": 1.2081381733021077, + "grad_norm": 0.9467639327049255, + "learning_rate": 4.5342919922404105e-06, + "loss": 0.6102, + "step": 4127 + }, + { + "epoch": 1.2084309133489461, + "grad_norm": 0.9400654435157776, + "learning_rate": 4.534068139052805e-06, + "loss": 0.6188, + "step": 4128 + }, + { + "epoch": 1.2087236533957846, + "grad_norm": 0.996929943561554, + "learning_rate": 4.5338442376065425e-06, + "loss": 0.605, + "step": 4129 + }, + { + "epoch": 1.209016393442623, + "grad_norm": 1.0149749517440796, + "learning_rate": 4.533620287906935e-06, + "loss": 0.595, + "step": 4130 + }, + { + "epoch": 1.2093091334894615, + "grad_norm": 0.9209686517715454, + "learning_rate": 4.533396289959295e-06, + "loss": 0.6163, + "step": 4131 + }, + { + "epoch": 1.2096018735362999, + "grad_norm": 0.9706611633300781, + "learning_rate": 4.533172243768938e-06, + "loss": 0.6229, + "step": 4132 + }, + { + "epoch": 1.209894613583138, + "grad_norm": 0.9793619513511658, + "learning_rate": 4.532948149341178e-06, + "loss": 0.5855, + "step": 4133 + }, + { + "epoch": 1.2101873536299765, + "grad_norm": 0.9698737859725952, + "learning_rate": 4.532724006681334e-06, + "loss": 0.6376, + "step": 4134 + }, + { + "epoch": 1.210480093676815, + "grad_norm": 1.0066189765930176, + "learning_rate": 4.532499815794721e-06, + "loss": 0.6255, + "step": 4135 + }, + { + "epoch": 1.2107728337236534, + "grad_norm": 0.9690581560134888, + "learning_rate": 4.53227557668666e-06, + "loss": 0.6645, + "step": 4136 + }, + { + "epoch": 1.2110655737704918, + "grad_norm": 0.9575128555297852, + "learning_rate": 4.53205128936247e-06, + "loss": 0.6164, + "step": 4137 + }, + { + "epoch": 1.2113583138173303, + "grad_norm": 0.9775250554084778, + "learning_rate": 4.531826953827474e-06, + "loss": 0.6106, + "step": 4138 + }, + { + "epoch": 1.2116510538641687, + "grad_norm": 0.947291374206543, + "learning_rate": 4.531602570086992e-06, + "loss": 0.6046, + "step": 4139 + }, + { + "epoch": 1.211943793911007, + "grad_norm": 1.0289188623428345, + "learning_rate": 4.531378138146349e-06, + "loss": 0.6094, + "step": 4140 + }, + { + "epoch": 1.2122365339578454, + "grad_norm": 0.9423925280570984, + "learning_rate": 4.531153658010869e-06, + "loss": 0.6084, + "step": 4141 + }, + { + "epoch": 1.2125292740046838, + "grad_norm": 0.9954180717468262, + "learning_rate": 4.530929129685879e-06, + "loss": 0.6148, + "step": 4142 + }, + { + "epoch": 1.2128220140515222, + "grad_norm": 0.9811019897460938, + "learning_rate": 4.530704553176705e-06, + "loss": 0.6315, + "step": 4143 + }, + { + "epoch": 1.2131147540983607, + "grad_norm": 0.9440450668334961, + "learning_rate": 4.530479928488675e-06, + "loss": 0.6394, + "step": 4144 + }, + { + "epoch": 1.213407494145199, + "grad_norm": 0.9319061636924744, + "learning_rate": 4.530255255627119e-06, + "loss": 0.6005, + "step": 4145 + }, + { + "epoch": 1.2137002341920375, + "grad_norm": 0.9678567051887512, + "learning_rate": 4.5300305345973664e-06, + "loss": 0.6308, + "step": 4146 + }, + { + "epoch": 1.213992974238876, + "grad_norm": 0.9711343050003052, + "learning_rate": 4.52980576540475e-06, + "loss": 0.6488, + "step": 4147 + }, + { + "epoch": 1.2142857142857142, + "grad_norm": 0.998629629611969, + "learning_rate": 4.529580948054601e-06, + "loss": 0.6472, + "step": 4148 + }, + { + "epoch": 1.2145784543325526, + "grad_norm": 1.01121985912323, + "learning_rate": 4.529356082552255e-06, + "loss": 0.6553, + "step": 4149 + }, + { + "epoch": 1.214871194379391, + "grad_norm": 0.9619823098182678, + "learning_rate": 4.529131168903046e-06, + "loss": 0.5873, + "step": 4150 + }, + { + "epoch": 1.2151639344262295, + "grad_norm": 0.9870560169219971, + "learning_rate": 4.52890620711231e-06, + "loss": 0.6646, + "step": 4151 + }, + { + "epoch": 1.215456674473068, + "grad_norm": 0.9438629150390625, + "learning_rate": 4.528681197185385e-06, + "loss": 0.5742, + "step": 4152 + }, + { + "epoch": 1.2157494145199064, + "grad_norm": 1.026771068572998, + "learning_rate": 4.528456139127607e-06, + "loss": 0.61, + "step": 4153 + }, + { + "epoch": 1.2160421545667448, + "grad_norm": 0.9999291896820068, + "learning_rate": 4.528231032944319e-06, + "loss": 0.6482, + "step": 4154 + }, + { + "epoch": 1.216334894613583, + "grad_norm": 0.9451204538345337, + "learning_rate": 4.52800587864086e-06, + "loss": 0.5703, + "step": 4155 + }, + { + "epoch": 1.2166276346604215, + "grad_norm": 0.9310161471366882, + "learning_rate": 4.527780676222572e-06, + "loss": 0.6226, + "step": 4156 + }, + { + "epoch": 1.2169203747072599, + "grad_norm": 0.9548562169075012, + "learning_rate": 4.5275554256947975e-06, + "loss": 0.5934, + "step": 4157 + }, + { + "epoch": 1.2172131147540983, + "grad_norm": 0.9953787922859192, + "learning_rate": 4.527330127062881e-06, + "loss": 0.5595, + "step": 4158 + }, + { + "epoch": 1.2175058548009368, + "grad_norm": 1.0067870616912842, + "learning_rate": 4.527104780332168e-06, + "loss": 0.6105, + "step": 4159 + }, + { + "epoch": 1.2177985948477752, + "grad_norm": 0.9481733441352844, + "learning_rate": 4.526879385508004e-06, + "loss": 0.6176, + "step": 4160 + }, + { + "epoch": 1.2180913348946136, + "grad_norm": 1.0315715074539185, + "learning_rate": 4.526653942595737e-06, + "loss": 0.6247, + "step": 4161 + }, + { + "epoch": 1.218384074941452, + "grad_norm": 0.9893214702606201, + "learning_rate": 4.526428451600716e-06, + "loss": 0.5685, + "step": 4162 + }, + { + "epoch": 1.2186768149882905, + "grad_norm": 1.0302801132202148, + "learning_rate": 4.5262029125282914e-06, + "loss": 0.6787, + "step": 4163 + }, + { + "epoch": 1.2189695550351287, + "grad_norm": 0.9514853954315186, + "learning_rate": 4.525977325383813e-06, + "loss": 0.5946, + "step": 4164 + }, + { + "epoch": 1.2192622950819672, + "grad_norm": 0.9605667591094971, + "learning_rate": 4.525751690172634e-06, + "loss": 0.6385, + "step": 4165 + }, + { + "epoch": 1.2195550351288056, + "grad_norm": 0.9864689707756042, + "learning_rate": 4.525526006900105e-06, + "loss": 0.5964, + "step": 4166 + }, + { + "epoch": 1.219847775175644, + "grad_norm": 0.9313239455223083, + "learning_rate": 4.5253002755715846e-06, + "loss": 0.6097, + "step": 4167 + }, + { + "epoch": 1.2201405152224825, + "grad_norm": 0.981905460357666, + "learning_rate": 4.525074496192425e-06, + "loss": 0.649, + "step": 4168 + }, + { + "epoch": 1.220433255269321, + "grad_norm": 0.9818923473358154, + "learning_rate": 4.524848668767984e-06, + "loss": 0.6385, + "step": 4169 + }, + { + "epoch": 1.2207259953161593, + "grad_norm": 0.9845628142356873, + "learning_rate": 4.524622793303619e-06, + "loss": 0.6185, + "step": 4170 + }, + { + "epoch": 1.2210187353629975, + "grad_norm": 1.001230239868164, + "learning_rate": 4.52439686980469e-06, + "loss": 0.6308, + "step": 4171 + }, + { + "epoch": 1.221311475409836, + "grad_norm": 1.0061501264572144, + "learning_rate": 4.524170898276556e-06, + "loss": 0.6834, + "step": 4172 + }, + { + "epoch": 1.2216042154566744, + "grad_norm": 0.979555070400238, + "learning_rate": 4.523944878724579e-06, + "loss": 0.629, + "step": 4173 + }, + { + "epoch": 1.2218969555035128, + "grad_norm": 0.9608957171440125, + "learning_rate": 4.5237188111541195e-06, + "loss": 0.5827, + "step": 4174 + }, + { + "epoch": 1.2221896955503513, + "grad_norm": 0.944301187992096, + "learning_rate": 4.523492695570544e-06, + "loss": 0.6364, + "step": 4175 + }, + { + "epoch": 1.2224824355971897, + "grad_norm": 1.0220330953598022, + "learning_rate": 4.523266531979214e-06, + "loss": 0.6787, + "step": 4176 + }, + { + "epoch": 1.2227751756440282, + "grad_norm": 0.9887394309043884, + "learning_rate": 4.523040320385498e-06, + "loss": 0.5822, + "step": 4177 + }, + { + "epoch": 1.2230679156908666, + "grad_norm": 0.934556245803833, + "learning_rate": 4.522814060794762e-06, + "loss": 0.6601, + "step": 4178 + }, + { + "epoch": 1.223360655737705, + "grad_norm": 0.9557207226753235, + "learning_rate": 4.522587753212373e-06, + "loss": 0.6168, + "step": 4179 + }, + { + "epoch": 1.2236533957845432, + "grad_norm": 0.9571710824966431, + "learning_rate": 4.522361397643701e-06, + "loss": 0.6396, + "step": 4180 + }, + { + "epoch": 1.2239461358313817, + "grad_norm": 0.9376846551895142, + "learning_rate": 4.522134994094116e-06, + "loss": 0.6178, + "step": 4181 + }, + { + "epoch": 1.2242388758782201, + "grad_norm": 0.9712764620780945, + "learning_rate": 4.52190854256899e-06, + "loss": 0.6108, + "step": 4182 + }, + { + "epoch": 1.2245316159250585, + "grad_norm": 0.9775986075401306, + "learning_rate": 4.5216820430736966e-06, + "loss": 0.6196, + "step": 4183 + }, + { + "epoch": 1.224824355971897, + "grad_norm": 1.0264384746551514, + "learning_rate": 4.521455495613607e-06, + "loss": 0.6104, + "step": 4184 + }, + { + "epoch": 1.2251170960187354, + "grad_norm": 1.0234867334365845, + "learning_rate": 4.521228900194097e-06, + "loss": 0.6309, + "step": 4185 + }, + { + "epoch": 1.2254098360655739, + "grad_norm": 0.965651273727417, + "learning_rate": 4.521002256820544e-06, + "loss": 0.5925, + "step": 4186 + }, + { + "epoch": 1.225702576112412, + "grad_norm": 0.9575965404510498, + "learning_rate": 4.520775565498324e-06, + "loss": 0.5945, + "step": 4187 + }, + { + "epoch": 1.2259953161592505, + "grad_norm": 0.9177213311195374, + "learning_rate": 4.5205488262328155e-06, + "loss": 0.5741, + "step": 4188 + }, + { + "epoch": 1.226288056206089, + "grad_norm": 0.9579589366912842, + "learning_rate": 4.520322039029398e-06, + "loss": 0.6071, + "step": 4189 + }, + { + "epoch": 1.2265807962529274, + "grad_norm": 0.9259364008903503, + "learning_rate": 4.520095203893452e-06, + "loss": 0.5797, + "step": 4190 + }, + { + "epoch": 1.2268735362997658, + "grad_norm": 0.9972461462020874, + "learning_rate": 4.519868320830358e-06, + "loss": 0.6352, + "step": 4191 + }, + { + "epoch": 1.2271662763466042, + "grad_norm": 0.9337904453277588, + "learning_rate": 4.5196413898455015e-06, + "loss": 0.6297, + "step": 4192 + }, + { + "epoch": 1.2274590163934427, + "grad_norm": 0.9313480257987976, + "learning_rate": 4.5194144109442646e-06, + "loss": 0.5926, + "step": 4193 + }, + { + "epoch": 1.2277517564402811, + "grad_norm": 0.9813135266304016, + "learning_rate": 4.519187384132032e-06, + "loss": 0.6211, + "step": 4194 + }, + { + "epoch": 1.2280444964871196, + "grad_norm": 0.9691362977027893, + "learning_rate": 4.518960309414192e-06, + "loss": 0.6663, + "step": 4195 + }, + { + "epoch": 1.2283372365339578, + "grad_norm": 1.0789157152175903, + "learning_rate": 4.51873318679613e-06, + "loss": 0.6903, + "step": 4196 + }, + { + "epoch": 1.2286299765807962, + "grad_norm": 1.013328194618225, + "learning_rate": 4.518506016283236e-06, + "loss": 0.6352, + "step": 4197 + }, + { + "epoch": 1.2289227166276346, + "grad_norm": 0.9942218065261841, + "learning_rate": 4.518278797880899e-06, + "loss": 0.6016, + "step": 4198 + }, + { + "epoch": 1.229215456674473, + "grad_norm": 0.929839551448822, + "learning_rate": 4.518051531594508e-06, + "loss": 0.5809, + "step": 4199 + }, + { + "epoch": 1.2295081967213115, + "grad_norm": 0.9454646110534668, + "learning_rate": 4.517824217429459e-06, + "loss": 0.5824, + "step": 4200 + }, + { + "epoch": 1.22980093676815, + "grad_norm": 0.9779424667358398, + "learning_rate": 4.517596855391141e-06, + "loss": 0.6223, + "step": 4201 + }, + { + "epoch": 1.2300936768149884, + "grad_norm": 0.9403318762779236, + "learning_rate": 4.517369445484951e-06, + "loss": 0.6033, + "step": 4202 + }, + { + "epoch": 1.2303864168618266, + "grad_norm": 0.9715566039085388, + "learning_rate": 4.5171419877162835e-06, + "loss": 0.6413, + "step": 4203 + }, + { + "epoch": 1.230679156908665, + "grad_norm": 0.9569689035415649, + "learning_rate": 4.516914482090534e-06, + "loss": 0.5853, + "step": 4204 + }, + { + "epoch": 1.2309718969555035, + "grad_norm": 0.9408531188964844, + "learning_rate": 4.516686928613101e-06, + "loss": 0.6067, + "step": 4205 + }, + { + "epoch": 1.231264637002342, + "grad_norm": 0.9513837695121765, + "learning_rate": 4.516459327289384e-06, + "loss": 0.6227, + "step": 4206 + }, + { + "epoch": 1.2315573770491803, + "grad_norm": 0.9545785188674927, + "learning_rate": 4.5162316781247815e-06, + "loss": 0.636, + "step": 4207 + }, + { + "epoch": 1.2318501170960188, + "grad_norm": 1.1718465089797974, + "learning_rate": 4.516003981124695e-06, + "loss": 0.6272, + "step": 4208 + }, + { + "epoch": 1.2321428571428572, + "grad_norm": 0.9105302095413208, + "learning_rate": 4.515776236294527e-06, + "loss": 0.6272, + "step": 4209 + }, + { + "epoch": 1.2324355971896956, + "grad_norm": 0.9924986362457275, + "learning_rate": 4.51554844363968e-06, + "loss": 0.5943, + "step": 4210 + }, + { + "epoch": 1.232728337236534, + "grad_norm": 0.9671734571456909, + "learning_rate": 4.5153206031655595e-06, + "loss": 0.6063, + "step": 4211 + }, + { + "epoch": 1.2330210772833723, + "grad_norm": 0.9681123495101929, + "learning_rate": 4.51509271487757e-06, + "loss": 0.6179, + "step": 4212 + }, + { + "epoch": 1.2333138173302107, + "grad_norm": 0.978575587272644, + "learning_rate": 4.51486477878112e-06, + "loss": 0.6321, + "step": 4213 + }, + { + "epoch": 1.2336065573770492, + "grad_norm": 1.0054035186767578, + "learning_rate": 4.514636794881614e-06, + "loss": 0.6364, + "step": 4214 + }, + { + "epoch": 1.2338992974238876, + "grad_norm": 1.0013883113861084, + "learning_rate": 4.514408763184464e-06, + "loss": 0.5885, + "step": 4215 + }, + { + "epoch": 1.234192037470726, + "grad_norm": 0.9425501823425293, + "learning_rate": 4.51418068369508e-06, + "loss": 0.6263, + "step": 4216 + }, + { + "epoch": 1.2344847775175645, + "grad_norm": 1.0185424089431763, + "learning_rate": 4.513952556418871e-06, + "loss": 0.6479, + "step": 4217 + }, + { + "epoch": 1.234777517564403, + "grad_norm": 0.9662545919418335, + "learning_rate": 4.513724381361251e-06, + "loss": 0.6214, + "step": 4218 + }, + { + "epoch": 1.2350702576112411, + "grad_norm": 0.9931087493896484, + "learning_rate": 4.513496158527633e-06, + "loss": 0.5879, + "step": 4219 + }, + { + "epoch": 1.2353629976580796, + "grad_norm": 0.9958824515342712, + "learning_rate": 4.513267887923432e-06, + "loss": 0.6423, + "step": 4220 + }, + { + "epoch": 1.235655737704918, + "grad_norm": 0.9831037521362305, + "learning_rate": 4.513039569554064e-06, + "loss": 0.6448, + "step": 4221 + }, + { + "epoch": 1.2359484777517564, + "grad_norm": 0.9948973655700684, + "learning_rate": 4.512811203424945e-06, + "loss": 0.6331, + "step": 4222 + }, + { + "epoch": 1.2362412177985949, + "grad_norm": 0.9281118512153625, + "learning_rate": 4.512582789541494e-06, + "loss": 0.6432, + "step": 4223 + }, + { + "epoch": 1.2365339578454333, + "grad_norm": 0.9925378561019897, + "learning_rate": 4.512354327909128e-06, + "loss": 0.6241, + "step": 4224 + }, + { + "epoch": 1.2368266978922717, + "grad_norm": 0.9772345423698425, + "learning_rate": 4.512125818533271e-06, + "loss": 0.6421, + "step": 4225 + }, + { + "epoch": 1.2371194379391102, + "grad_norm": 0.9973930716514587, + "learning_rate": 4.5118972614193415e-06, + "loss": 0.5939, + "step": 4226 + }, + { + "epoch": 1.2374121779859486, + "grad_norm": 0.9583884477615356, + "learning_rate": 4.511668656572763e-06, + "loss": 0.6253, + "step": 4227 + }, + { + "epoch": 1.2377049180327868, + "grad_norm": 0.9545579552650452, + "learning_rate": 4.511440003998959e-06, + "loss": 0.6295, + "step": 4228 + }, + { + "epoch": 1.2379976580796253, + "grad_norm": 1.0175273418426514, + "learning_rate": 4.511211303703354e-06, + "loss": 0.6326, + "step": 4229 + }, + { + "epoch": 1.2382903981264637, + "grad_norm": 0.9498239159584045, + "learning_rate": 4.510982555691375e-06, + "loss": 0.5974, + "step": 4230 + }, + { + "epoch": 1.2385831381733021, + "grad_norm": 0.9783760905265808, + "learning_rate": 4.510753759968449e-06, + "loss": 0.6123, + "step": 4231 + }, + { + "epoch": 1.2388758782201406, + "grad_norm": 1.0405571460723877, + "learning_rate": 4.510524916540004e-06, + "loss": 0.6236, + "step": 4232 + }, + { + "epoch": 1.239168618266979, + "grad_norm": 0.9569115042686462, + "learning_rate": 4.510296025411468e-06, + "loss": 0.6306, + "step": 4233 + }, + { + "epoch": 1.2394613583138172, + "grad_norm": 0.9781119227409363, + "learning_rate": 4.510067086588273e-06, + "loss": 0.6219, + "step": 4234 + }, + { + "epoch": 1.2397540983606556, + "grad_norm": 0.9764747023582458, + "learning_rate": 4.50983810007585e-06, + "loss": 0.6439, + "step": 4235 + }, + { + "epoch": 1.240046838407494, + "grad_norm": 1.007982850074768, + "learning_rate": 4.509609065879632e-06, + "loss": 0.6255, + "step": 4236 + }, + { + "epoch": 1.2403395784543325, + "grad_norm": 0.986848771572113, + "learning_rate": 4.509379984005053e-06, + "loss": 0.6478, + "step": 4237 + }, + { + "epoch": 1.240632318501171, + "grad_norm": 1.0218474864959717, + "learning_rate": 4.509150854457548e-06, + "loss": 0.6613, + "step": 4238 + }, + { + "epoch": 1.2409250585480094, + "grad_norm": 1.0151807069778442, + "learning_rate": 4.508921677242553e-06, + "loss": 0.5971, + "step": 4239 + }, + { + "epoch": 1.2412177985948478, + "grad_norm": 0.912473201751709, + "learning_rate": 4.508692452365505e-06, + "loss": 0.5667, + "step": 4240 + }, + { + "epoch": 1.2415105386416863, + "grad_norm": 1.0671132802963257, + "learning_rate": 4.508463179831843e-06, + "loss": 0.6305, + "step": 4241 + }, + { + "epoch": 1.2418032786885247, + "grad_norm": 0.9834454655647278, + "learning_rate": 4.508233859647006e-06, + "loss": 0.6327, + "step": 4242 + }, + { + "epoch": 1.242096018735363, + "grad_norm": 0.9428094029426575, + "learning_rate": 4.508004491816433e-06, + "loss": 0.657, + "step": 4243 + }, + { + "epoch": 1.2423887587822013, + "grad_norm": 0.9864641427993774, + "learning_rate": 4.50777507634557e-06, + "loss": 0.6416, + "step": 4244 + }, + { + "epoch": 1.2426814988290398, + "grad_norm": 1.050676703453064, + "learning_rate": 4.507545613239857e-06, + "loss": 0.651, + "step": 4245 + }, + { + "epoch": 1.2429742388758782, + "grad_norm": 0.9137433171272278, + "learning_rate": 4.507316102504739e-06, + "loss": 0.5815, + "step": 4246 + }, + { + "epoch": 1.2432669789227166, + "grad_norm": 0.9307228922843933, + "learning_rate": 4.50708654414566e-06, + "loss": 0.6102, + "step": 4247 + }, + { + "epoch": 1.243559718969555, + "grad_norm": 0.9351855516433716, + "learning_rate": 4.506856938168067e-06, + "loss": 0.5843, + "step": 4248 + }, + { + "epoch": 1.2438524590163935, + "grad_norm": 1.0191911458969116, + "learning_rate": 4.5066272845774084e-06, + "loss": 0.578, + "step": 4249 + }, + { + "epoch": 1.2441451990632317, + "grad_norm": 1.0319695472717285, + "learning_rate": 4.506397583379132e-06, + "loss": 0.6715, + "step": 4250 + }, + { + "epoch": 1.2444379391100702, + "grad_norm": 0.9693368673324585, + "learning_rate": 4.506167834578687e-06, + "loss": 0.6578, + "step": 4251 + }, + { + "epoch": 1.2447306791569086, + "grad_norm": 1.0125025510787964, + "learning_rate": 4.505938038181525e-06, + "loss": 0.6035, + "step": 4252 + }, + { + "epoch": 1.245023419203747, + "grad_norm": 0.9347845911979675, + "learning_rate": 4.505708194193097e-06, + "loss": 0.6223, + "step": 4253 + }, + { + "epoch": 1.2453161592505855, + "grad_norm": 0.9155698418617249, + "learning_rate": 4.505478302618857e-06, + "loss": 0.5878, + "step": 4254 + }, + { + "epoch": 1.245608899297424, + "grad_norm": 0.9481428861618042, + "learning_rate": 4.5052483634642595e-06, + "loss": 0.6448, + "step": 4255 + }, + { + "epoch": 1.2459016393442623, + "grad_norm": 0.9494887590408325, + "learning_rate": 4.5050183767347585e-06, + "loss": 0.614, + "step": 4256 + }, + { + "epoch": 1.2461943793911008, + "grad_norm": 0.9998863935470581, + "learning_rate": 4.504788342435812e-06, + "loss": 0.6317, + "step": 4257 + }, + { + "epoch": 1.2464871194379392, + "grad_norm": 0.9673553109169006, + "learning_rate": 4.504558260572877e-06, + "loss": 0.5835, + "step": 4258 + }, + { + "epoch": 1.2467798594847774, + "grad_norm": 0.9876679182052612, + "learning_rate": 4.504328131151412e-06, + "loss": 0.6406, + "step": 4259 + }, + { + "epoch": 1.2470725995316159, + "grad_norm": 1.0046522617340088, + "learning_rate": 4.504097954176877e-06, + "loss": 0.6122, + "step": 4260 + }, + { + "epoch": 1.2473653395784543, + "grad_norm": 0.9234721660614014, + "learning_rate": 4.503867729654733e-06, + "loss": 0.5923, + "step": 4261 + }, + { + "epoch": 1.2476580796252927, + "grad_norm": 0.9888014793395996, + "learning_rate": 4.5036374575904415e-06, + "loss": 0.6319, + "step": 4262 + }, + { + "epoch": 1.2479508196721312, + "grad_norm": 0.9556519389152527, + "learning_rate": 4.503407137989467e-06, + "loss": 0.6105, + "step": 4263 + }, + { + "epoch": 1.2482435597189696, + "grad_norm": 0.9030968546867371, + "learning_rate": 4.503176770857273e-06, + "loss": 0.5771, + "step": 4264 + }, + { + "epoch": 1.248536299765808, + "grad_norm": 0.9362974166870117, + "learning_rate": 4.502946356199326e-06, + "loss": 0.609, + "step": 4265 + }, + { + "epoch": 1.2488290398126463, + "grad_norm": 0.9929400086402893, + "learning_rate": 4.502715894021091e-06, + "loss": 0.6299, + "step": 4266 + }, + { + "epoch": 1.2491217798594847, + "grad_norm": 0.9870968461036682, + "learning_rate": 4.502485384328037e-06, + "loss": 0.6343, + "step": 4267 + }, + { + "epoch": 1.2494145199063231, + "grad_norm": 0.9476544260978699, + "learning_rate": 4.5022548271256326e-06, + "loss": 0.5725, + "step": 4268 + }, + { + "epoch": 1.2497072599531616, + "grad_norm": 0.9738339781761169, + "learning_rate": 4.502024222419348e-06, + "loss": 0.6034, + "step": 4269 + }, + { + "epoch": 1.25, + "grad_norm": 1.0415047407150269, + "learning_rate": 4.501793570214653e-06, + "loss": 0.6237, + "step": 4270 + }, + { + "epoch": 1.2502927400468384, + "grad_norm": 0.962153434753418, + "learning_rate": 4.501562870517022e-06, + "loss": 0.6303, + "step": 4271 + }, + { + "epoch": 1.2505854800936769, + "grad_norm": 0.9860857725143433, + "learning_rate": 4.501332123331927e-06, + "loss": 0.6115, + "step": 4272 + }, + { + "epoch": 1.2508782201405153, + "grad_norm": 0.9616137146949768, + "learning_rate": 4.501101328664843e-06, + "loss": 0.6323, + "step": 4273 + }, + { + "epoch": 1.2511709601873537, + "grad_norm": 0.9258964657783508, + "learning_rate": 4.500870486521245e-06, + "loss": 0.6338, + "step": 4274 + }, + { + "epoch": 1.251463700234192, + "grad_norm": 0.921231746673584, + "learning_rate": 4.5006395969066105e-06, + "loss": 0.5946, + "step": 4275 + }, + { + "epoch": 1.2517564402810304, + "grad_norm": 0.9378697872161865, + "learning_rate": 4.500408659826417e-06, + "loss": 0.5681, + "step": 4276 + }, + { + "epoch": 1.2520491803278688, + "grad_norm": 0.991297721862793, + "learning_rate": 4.500177675286145e-06, + "loss": 0.6174, + "step": 4277 + }, + { + "epoch": 1.2523419203747073, + "grad_norm": 1.0373573303222656, + "learning_rate": 4.4999466432912716e-06, + "loss": 0.6667, + "step": 4278 + }, + { + "epoch": 1.2526346604215457, + "grad_norm": 1.0143097639083862, + "learning_rate": 4.499715563847282e-06, + "loss": 0.6053, + "step": 4279 + }, + { + "epoch": 1.2529274004683841, + "grad_norm": 1.0136926174163818, + "learning_rate": 4.499484436959655e-06, + "loss": 0.6627, + "step": 4280 + }, + { + "epoch": 1.2532201405152223, + "grad_norm": 0.9508180618286133, + "learning_rate": 4.499253262633876e-06, + "loss": 0.6081, + "step": 4281 + }, + { + "epoch": 1.2535128805620608, + "grad_norm": 0.9574763774871826, + "learning_rate": 4.499022040875429e-06, + "loss": 0.6011, + "step": 4282 + }, + { + "epoch": 1.2538056206088992, + "grad_norm": 0.9364356994628906, + "learning_rate": 4.4987907716898e-06, + "loss": 0.5904, + "step": 4283 + }, + { + "epoch": 1.2540983606557377, + "grad_norm": 1.0441135168075562, + "learning_rate": 4.498559455082476e-06, + "loss": 0.6405, + "step": 4284 + }, + { + "epoch": 1.254391100702576, + "grad_norm": 0.9364659190177917, + "learning_rate": 4.498328091058944e-06, + "loss": 0.6274, + "step": 4285 + }, + { + "epoch": 1.2546838407494145, + "grad_norm": 0.9372391700744629, + "learning_rate": 4.498096679624696e-06, + "loss": 0.6288, + "step": 4286 + }, + { + "epoch": 1.254976580796253, + "grad_norm": 1.042837381362915, + "learning_rate": 4.497865220785219e-06, + "loss": 0.633, + "step": 4287 + }, + { + "epoch": 1.2552693208430914, + "grad_norm": 0.9211299419403076, + "learning_rate": 4.497633714546007e-06, + "loss": 0.6465, + "step": 4288 + }, + { + "epoch": 1.2555620608899298, + "grad_norm": 0.9741918444633484, + "learning_rate": 4.497402160912551e-06, + "loss": 0.5911, + "step": 4289 + }, + { + "epoch": 1.2558548009367683, + "grad_norm": 0.9780747294425964, + "learning_rate": 4.497170559890344e-06, + "loss": 0.6186, + "step": 4290 + }, + { + "epoch": 1.2561475409836065, + "grad_norm": 0.9505590200424194, + "learning_rate": 4.496938911484882e-06, + "loss": 0.6365, + "step": 4291 + }, + { + "epoch": 1.256440281030445, + "grad_norm": 1.0412962436676025, + "learning_rate": 4.496707215701661e-06, + "loss": 0.6234, + "step": 4292 + }, + { + "epoch": 1.2567330210772834, + "grad_norm": 0.9452297687530518, + "learning_rate": 4.496475472546178e-06, + "loss": 0.6337, + "step": 4293 + }, + { + "epoch": 1.2570257611241218, + "grad_norm": 0.9519926905632019, + "learning_rate": 4.4962436820239305e-06, + "loss": 0.6005, + "step": 4294 + }, + { + "epoch": 1.2573185011709602, + "grad_norm": 0.975534200668335, + "learning_rate": 4.496011844140419e-06, + "loss": 0.5852, + "step": 4295 + }, + { + "epoch": 1.2576112412177987, + "grad_norm": 0.9678035974502563, + "learning_rate": 4.495779958901142e-06, + "loss": 0.6245, + "step": 4296 + }, + { + "epoch": 1.2579039812646369, + "grad_norm": 1.0200313329696655, + "learning_rate": 4.495548026311602e-06, + "loss": 0.6289, + "step": 4297 + }, + { + "epoch": 1.2581967213114753, + "grad_norm": 1.0091145038604736, + "learning_rate": 4.495316046377302e-06, + "loss": 0.635, + "step": 4298 + }, + { + "epoch": 1.2584894613583137, + "grad_norm": 0.9672524333000183, + "learning_rate": 4.4950840191037456e-06, + "loss": 0.6572, + "step": 4299 + }, + { + "epoch": 1.2587822014051522, + "grad_norm": 1.0034222602844238, + "learning_rate": 4.494851944496438e-06, + "loss": 0.6646, + "step": 4300 + }, + { + "epoch": 1.2590749414519906, + "grad_norm": 0.9184914827346802, + "learning_rate": 4.4946198225608834e-06, + "loss": 0.608, + "step": 4301 + }, + { + "epoch": 1.259367681498829, + "grad_norm": 0.9711180925369263, + "learning_rate": 4.494387653302591e-06, + "loss": 0.6211, + "step": 4302 + }, + { + "epoch": 1.2596604215456675, + "grad_norm": 0.9918525815010071, + "learning_rate": 4.494155436727068e-06, + "loss": 0.6234, + "step": 4303 + }, + { + "epoch": 1.259953161592506, + "grad_norm": 0.9694542288780212, + "learning_rate": 4.493923172839825e-06, + "loss": 0.5971, + "step": 4304 + }, + { + "epoch": 1.2602459016393444, + "grad_norm": 0.9387860894203186, + "learning_rate": 4.493690861646371e-06, + "loss": 0.608, + "step": 4305 + }, + { + "epoch": 1.2605386416861828, + "grad_norm": 0.9417112469673157, + "learning_rate": 4.493458503152219e-06, + "loss": 0.6031, + "step": 4306 + }, + { + "epoch": 1.260831381733021, + "grad_norm": 1.007652997970581, + "learning_rate": 4.4932260973628805e-06, + "loss": 0.5871, + "step": 4307 + }, + { + "epoch": 1.2611241217798594, + "grad_norm": 1.0002436637878418, + "learning_rate": 4.49299364428387e-06, + "loss": 0.6047, + "step": 4308 + }, + { + "epoch": 1.2614168618266979, + "grad_norm": 0.9782008528709412, + "learning_rate": 4.492761143920702e-06, + "loss": 0.6466, + "step": 4309 + }, + { + "epoch": 1.2617096018735363, + "grad_norm": 0.9591519832611084, + "learning_rate": 4.492528596278893e-06, + "loss": 0.6391, + "step": 4310 + }, + { + "epoch": 1.2620023419203747, + "grad_norm": 0.9903216361999512, + "learning_rate": 4.49229600136396e-06, + "loss": 0.6189, + "step": 4311 + }, + { + "epoch": 1.2622950819672132, + "grad_norm": 0.9313678741455078, + "learning_rate": 4.492063359181423e-06, + "loss": 0.6626, + "step": 4312 + }, + { + "epoch": 1.2625878220140514, + "grad_norm": 0.9859466552734375, + "learning_rate": 4.491830669736799e-06, + "loss": 0.6398, + "step": 4313 + }, + { + "epoch": 1.2628805620608898, + "grad_norm": 1.1138529777526855, + "learning_rate": 4.49159793303561e-06, + "loss": 0.6245, + "step": 4314 + }, + { + "epoch": 1.2631733021077283, + "grad_norm": 0.9532672166824341, + "learning_rate": 4.491365149083377e-06, + "loss": 0.5998, + "step": 4315 + }, + { + "epoch": 1.2634660421545667, + "grad_norm": 0.9283668398857117, + "learning_rate": 4.491132317885623e-06, + "loss": 0.6613, + "step": 4316 + }, + { + "epoch": 1.2637587822014051, + "grad_norm": 0.9697747230529785, + "learning_rate": 4.490899439447874e-06, + "loss": 0.6514, + "step": 4317 + }, + { + "epoch": 1.2640515222482436, + "grad_norm": 1.0781128406524658, + "learning_rate": 4.490666513775652e-06, + "loss": 0.6161, + "step": 4318 + }, + { + "epoch": 1.264344262295082, + "grad_norm": 0.925869345664978, + "learning_rate": 4.490433540874485e-06, + "loss": 0.5997, + "step": 4319 + }, + { + "epoch": 1.2646370023419204, + "grad_norm": 0.9338982701301575, + "learning_rate": 4.490200520749899e-06, + "loss": 0.5958, + "step": 4320 + }, + { + "epoch": 1.2649297423887589, + "grad_norm": 0.9387359619140625, + "learning_rate": 4.489967453407424e-06, + "loss": 0.5708, + "step": 4321 + }, + { + "epoch": 1.2652224824355973, + "grad_norm": 0.9909465312957764, + "learning_rate": 4.489734338852588e-06, + "loss": 0.6153, + "step": 4322 + }, + { + "epoch": 1.2655152224824355, + "grad_norm": 0.9713402390480042, + "learning_rate": 4.489501177090923e-06, + "loss": 0.5998, + "step": 4323 + }, + { + "epoch": 1.265807962529274, + "grad_norm": 1.0086549520492554, + "learning_rate": 4.489267968127961e-06, + "loss": 0.6197, + "step": 4324 + }, + { + "epoch": 1.2661007025761124, + "grad_norm": 0.9664751291275024, + "learning_rate": 4.4890347119692335e-06, + "loss": 0.609, + "step": 4325 + }, + { + "epoch": 1.2663934426229508, + "grad_norm": 0.937025785446167, + "learning_rate": 4.488801408620276e-06, + "loss": 0.6211, + "step": 4326 + }, + { + "epoch": 1.2666861826697893, + "grad_norm": 0.9491571187973022, + "learning_rate": 4.488568058086622e-06, + "loss": 0.5878, + "step": 4327 + }, + { + "epoch": 1.2669789227166277, + "grad_norm": 0.9666664600372314, + "learning_rate": 4.48833466037381e-06, + "loss": 0.6211, + "step": 4328 + }, + { + "epoch": 1.267271662763466, + "grad_norm": 0.9418140649795532, + "learning_rate": 4.488101215487375e-06, + "loss": 0.6069, + "step": 4329 + }, + { + "epoch": 1.2675644028103044, + "grad_norm": 1.045400857925415, + "learning_rate": 4.487867723432857e-06, + "loss": 0.648, + "step": 4330 + }, + { + "epoch": 1.2678571428571428, + "grad_norm": 1.245956540107727, + "learning_rate": 4.487634184215796e-06, + "loss": 0.6307, + "step": 4331 + }, + { + "epoch": 1.2681498829039812, + "grad_norm": 0.9793998003005981, + "learning_rate": 4.487400597841732e-06, + "loss": 0.6093, + "step": 4332 + }, + { + "epoch": 1.2684426229508197, + "grad_norm": 0.9685893058776855, + "learning_rate": 4.487166964316207e-06, + "loss": 0.6329, + "step": 4333 + }, + { + "epoch": 1.268735362997658, + "grad_norm": 1.010116696357727, + "learning_rate": 4.486933283644763e-06, + "loss": 0.6759, + "step": 4334 + }, + { + "epoch": 1.2690281030444965, + "grad_norm": 0.9684317708015442, + "learning_rate": 4.486699555832946e-06, + "loss": 0.5993, + "step": 4335 + }, + { + "epoch": 1.269320843091335, + "grad_norm": 0.9259043335914612, + "learning_rate": 4.4864657808863e-06, + "loss": 0.5888, + "step": 4336 + }, + { + "epoch": 1.2696135831381734, + "grad_norm": 0.9925552606582642, + "learning_rate": 4.486231958810372e-06, + "loss": 0.579, + "step": 4337 + }, + { + "epoch": 1.2699063231850118, + "grad_norm": 0.956619381904602, + "learning_rate": 4.485998089610709e-06, + "loss": 0.6391, + "step": 4338 + }, + { + "epoch": 1.27019906323185, + "grad_norm": 0.9365197420120239, + "learning_rate": 4.48576417329286e-06, + "loss": 0.6002, + "step": 4339 + }, + { + "epoch": 1.2704918032786885, + "grad_norm": 0.9592602849006653, + "learning_rate": 4.485530209862375e-06, + "loss": 0.6292, + "step": 4340 + }, + { + "epoch": 1.270784543325527, + "grad_norm": 1.0274156332015991, + "learning_rate": 4.485296199324802e-06, + "loss": 0.6324, + "step": 4341 + }, + { + "epoch": 1.2710772833723654, + "grad_norm": 0.9497712850570679, + "learning_rate": 4.485062141685698e-06, + "loss": 0.5952, + "step": 4342 + }, + { + "epoch": 1.2713700234192038, + "grad_norm": 0.9642655253410339, + "learning_rate": 4.484828036950612e-06, + "loss": 0.616, + "step": 4343 + }, + { + "epoch": 1.2716627634660422, + "grad_norm": 0.9987855553627014, + "learning_rate": 4.484593885125099e-06, + "loss": 0.6577, + "step": 4344 + }, + { + "epoch": 1.2719555035128804, + "grad_norm": 0.9837020635604858, + "learning_rate": 4.4843596862147155e-06, + "loss": 0.6193, + "step": 4345 + }, + { + "epoch": 1.2722482435597189, + "grad_norm": 0.9498054385185242, + "learning_rate": 4.484125440225018e-06, + "loss": 0.5991, + "step": 4346 + }, + { + "epoch": 1.2725409836065573, + "grad_norm": 0.9409908652305603, + "learning_rate": 4.483891147161562e-06, + "loss": 0.6173, + "step": 4347 + }, + { + "epoch": 1.2728337236533958, + "grad_norm": 0.9765833020210266, + "learning_rate": 4.4836568070299075e-06, + "loss": 0.6437, + "step": 4348 + }, + { + "epoch": 1.2731264637002342, + "grad_norm": 1.0019508600234985, + "learning_rate": 4.483422419835614e-06, + "loss": 0.6573, + "step": 4349 + }, + { + "epoch": 1.2734192037470726, + "grad_norm": 0.9797468781471252, + "learning_rate": 4.483187985584243e-06, + "loss": 0.6642, + "step": 4350 + }, + { + "epoch": 1.273711943793911, + "grad_norm": 0.9837626814842224, + "learning_rate": 4.482953504281356e-06, + "loss": 0.6395, + "step": 4351 + }, + { + "epoch": 1.2740046838407495, + "grad_norm": 0.9540021419525146, + "learning_rate": 4.4827189759325166e-06, + "loss": 0.6187, + "step": 4352 + }, + { + "epoch": 1.274297423887588, + "grad_norm": 0.9681099653244019, + "learning_rate": 4.482484400543288e-06, + "loss": 0.6385, + "step": 4353 + }, + { + "epoch": 1.2745901639344264, + "grad_norm": 0.9208570718765259, + "learning_rate": 4.482249778119237e-06, + "loss": 0.6531, + "step": 4354 + }, + { + "epoch": 1.2748829039812646, + "grad_norm": 0.9509547352790833, + "learning_rate": 4.482015108665928e-06, + "loss": 0.631, + "step": 4355 + }, + { + "epoch": 1.275175644028103, + "grad_norm": 0.9605768322944641, + "learning_rate": 4.48178039218893e-06, + "loss": 0.6023, + "step": 4356 + }, + { + "epoch": 1.2754683840749415, + "grad_norm": 0.998030424118042, + "learning_rate": 4.481545628693812e-06, + "loss": 0.66, + "step": 4357 + }, + { + "epoch": 1.2757611241217799, + "grad_norm": 0.9920161366462708, + "learning_rate": 4.481310818186142e-06, + "loss": 0.6483, + "step": 4358 + }, + { + "epoch": 1.2760538641686183, + "grad_norm": 0.9294471144676208, + "learning_rate": 4.481075960671493e-06, + "loss": 0.6116, + "step": 4359 + }, + { + "epoch": 1.2763466042154565, + "grad_norm": 0.9849762320518494, + "learning_rate": 4.480841056155437e-06, + "loss": 0.6463, + "step": 4360 + }, + { + "epoch": 1.276639344262295, + "grad_norm": 0.9601083397865295, + "learning_rate": 4.480606104643545e-06, + "loss": 0.598, + "step": 4361 + }, + { + "epoch": 1.2769320843091334, + "grad_norm": 0.98780357837677, + "learning_rate": 4.480371106141394e-06, + "loss": 0.6475, + "step": 4362 + }, + { + "epoch": 1.2772248243559718, + "grad_norm": 0.992827296257019, + "learning_rate": 4.480136060654557e-06, + "loss": 0.5877, + "step": 4363 + }, + { + "epoch": 1.2775175644028103, + "grad_norm": 0.9733892679214478, + "learning_rate": 4.479900968188612e-06, + "loss": 0.5631, + "step": 4364 + }, + { + "epoch": 1.2778103044496487, + "grad_norm": 1.0190376043319702, + "learning_rate": 4.479665828749136e-06, + "loss": 0.6321, + "step": 4365 + }, + { + "epoch": 1.2781030444964872, + "grad_norm": 0.9496223330497742, + "learning_rate": 4.479430642341708e-06, + "loss": 0.6288, + "step": 4366 + }, + { + "epoch": 1.2783957845433256, + "grad_norm": 0.955847978591919, + "learning_rate": 4.479195408971907e-06, + "loss": 0.6261, + "step": 4367 + }, + { + "epoch": 1.278688524590164, + "grad_norm": 0.9733151197433472, + "learning_rate": 4.4789601286453155e-06, + "loss": 0.607, + "step": 4368 + }, + { + "epoch": 1.2789812646370025, + "grad_norm": 0.9513235688209534, + "learning_rate": 4.4787248013675145e-06, + "loss": 0.5797, + "step": 4369 + }, + { + "epoch": 1.2792740046838407, + "grad_norm": 0.9496986269950867, + "learning_rate": 4.478489427144087e-06, + "loss": 0.6227, + "step": 4370 + }, + { + "epoch": 1.279566744730679, + "grad_norm": 0.9868835806846619, + "learning_rate": 4.478254005980619e-06, + "loss": 0.7033, + "step": 4371 + }, + { + "epoch": 1.2798594847775175, + "grad_norm": 1.0475271940231323, + "learning_rate": 4.478018537882694e-06, + "loss": 0.6393, + "step": 4372 + }, + { + "epoch": 1.280152224824356, + "grad_norm": 0.9404594302177429, + "learning_rate": 4.477783022855898e-06, + "loss": 0.6211, + "step": 4373 + }, + { + "epoch": 1.2804449648711944, + "grad_norm": 0.9554361701011658, + "learning_rate": 4.477547460905821e-06, + "loss": 0.6183, + "step": 4374 + }, + { + "epoch": 1.2807377049180328, + "grad_norm": 0.9508875012397766, + "learning_rate": 4.477311852038051e-06, + "loss": 0.625, + "step": 4375 + }, + { + "epoch": 1.281030444964871, + "grad_norm": 0.985066294670105, + "learning_rate": 4.477076196258177e-06, + "loss": 0.6279, + "step": 4376 + }, + { + "epoch": 1.2813231850117095, + "grad_norm": 0.9422428011894226, + "learning_rate": 4.47684049357179e-06, + "loss": 0.6119, + "step": 4377 + }, + { + "epoch": 1.281615925058548, + "grad_norm": 0.9585182070732117, + "learning_rate": 4.476604743984483e-06, + "loss": 0.6661, + "step": 4378 + }, + { + "epoch": 1.2819086651053864, + "grad_norm": 0.975171685218811, + "learning_rate": 4.476368947501848e-06, + "loss": 0.6066, + "step": 4379 + }, + { + "epoch": 1.2822014051522248, + "grad_norm": 0.9466639757156372, + "learning_rate": 4.476133104129481e-06, + "loss": 0.6176, + "step": 4380 + }, + { + "epoch": 1.2824941451990632, + "grad_norm": 0.9211204051971436, + "learning_rate": 4.475897213872975e-06, + "loss": 0.5967, + "step": 4381 + }, + { + "epoch": 1.2827868852459017, + "grad_norm": 0.9492511749267578, + "learning_rate": 4.475661276737929e-06, + "loss": 0.589, + "step": 4382 + }, + { + "epoch": 1.2830796252927401, + "grad_norm": 0.9844173789024353, + "learning_rate": 4.475425292729939e-06, + "loss": 0.6549, + "step": 4383 + }, + { + "epoch": 1.2833723653395785, + "grad_norm": 0.9171685576438904, + "learning_rate": 4.475189261854605e-06, + "loss": 0.5951, + "step": 4384 + }, + { + "epoch": 1.283665105386417, + "grad_norm": 0.9144049882888794, + "learning_rate": 4.474953184117527e-06, + "loss": 0.6071, + "step": 4385 + }, + { + "epoch": 1.2839578454332552, + "grad_norm": 0.9527641534805298, + "learning_rate": 4.474717059524305e-06, + "loss": 0.5691, + "step": 4386 + }, + { + "epoch": 1.2842505854800936, + "grad_norm": 1.042531967163086, + "learning_rate": 4.47448088808054e-06, + "loss": 0.6509, + "step": 4387 + }, + { + "epoch": 1.284543325526932, + "grad_norm": 1.0048390626907349, + "learning_rate": 4.474244669791837e-06, + "loss": 0.6369, + "step": 4388 + }, + { + "epoch": 1.2848360655737705, + "grad_norm": 0.9682003855705261, + "learning_rate": 4.474008404663801e-06, + "loss": 0.6606, + "step": 4389 + }, + { + "epoch": 1.285128805620609, + "grad_norm": 0.9530508518218994, + "learning_rate": 4.4737720927020356e-06, + "loss": 0.6083, + "step": 4390 + }, + { + "epoch": 1.2854215456674474, + "grad_norm": 0.9642596244812012, + "learning_rate": 4.473535733912148e-06, + "loss": 0.602, + "step": 4391 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 0.9634547233581543, + "learning_rate": 4.4732993282997464e-06, + "loss": 0.5676, + "step": 4392 + }, + { + "epoch": 1.286007025761124, + "grad_norm": 0.9283317923545837, + "learning_rate": 4.4730628758704395e-06, + "loss": 0.6389, + "step": 4393 + }, + { + "epoch": 1.2862997658079625, + "grad_norm": 0.9618542194366455, + "learning_rate": 4.472826376629836e-06, + "loss": 0.6497, + "step": 4394 + }, + { + "epoch": 1.286592505854801, + "grad_norm": 0.9992674589157104, + "learning_rate": 4.472589830583547e-06, + "loss": 0.6084, + "step": 4395 + }, + { + "epoch": 1.2868852459016393, + "grad_norm": 1.0321189165115356, + "learning_rate": 4.4723532377371866e-06, + "loss": 0.6427, + "step": 4396 + }, + { + "epoch": 1.2871779859484778, + "grad_norm": 0.9192115068435669, + "learning_rate": 4.472116598096366e-06, + "loss": 0.6061, + "step": 4397 + }, + { + "epoch": 1.2874707259953162, + "grad_norm": 1.042612910270691, + "learning_rate": 4.471879911666701e-06, + "loss": 0.6077, + "step": 4398 + }, + { + "epoch": 1.2877634660421546, + "grad_norm": 0.9268545508384705, + "learning_rate": 4.471643178453806e-06, + "loss": 0.6056, + "step": 4399 + }, + { + "epoch": 1.288056206088993, + "grad_norm": 0.9402037262916565, + "learning_rate": 4.471406398463297e-06, + "loss": 0.5766, + "step": 4400 + }, + { + "epoch": 1.2883489461358315, + "grad_norm": 0.9505319595336914, + "learning_rate": 4.471169571700793e-06, + "loss": 0.6228, + "step": 4401 + }, + { + "epoch": 1.2886416861826697, + "grad_norm": 0.9970121383666992, + "learning_rate": 4.470932698171913e-06, + "loss": 0.6161, + "step": 4402 + }, + { + "epoch": 1.2889344262295082, + "grad_norm": 0.9661443829536438, + "learning_rate": 4.470695777882274e-06, + "loss": 0.6351, + "step": 4403 + }, + { + "epoch": 1.2892271662763466, + "grad_norm": 0.9396880269050598, + "learning_rate": 4.470458810837501e-06, + "loss": 0.6001, + "step": 4404 + }, + { + "epoch": 1.289519906323185, + "grad_norm": 0.9560454487800598, + "learning_rate": 4.470221797043213e-06, + "loss": 0.6034, + "step": 4405 + }, + { + "epoch": 1.2898126463700235, + "grad_norm": 0.9110947251319885, + "learning_rate": 4.469984736505035e-06, + "loss": 0.6012, + "step": 4406 + }, + { + "epoch": 1.290105386416862, + "grad_norm": 0.9690170884132385, + "learning_rate": 4.469747629228591e-06, + "loss": 0.6089, + "step": 4407 + }, + { + "epoch": 1.2903981264637001, + "grad_norm": 0.9778798222541809, + "learning_rate": 4.469510475219505e-06, + "loss": 0.5879, + "step": 4408 + }, + { + "epoch": 1.2906908665105385, + "grad_norm": 0.9712124466896057, + "learning_rate": 4.469273274483406e-06, + "loss": 0.6068, + "step": 4409 + }, + { + "epoch": 1.290983606557377, + "grad_norm": 0.9554480910301208, + "learning_rate": 4.469036027025919e-06, + "loss": 0.6414, + "step": 4410 + }, + { + "epoch": 1.2912763466042154, + "grad_norm": 0.9459145069122314, + "learning_rate": 4.468798732852674e-06, + "loss": 0.5822, + "step": 4411 + }, + { + "epoch": 1.2915690866510539, + "grad_norm": 0.9789740443229675, + "learning_rate": 4.468561391969301e-06, + "loss": 0.6498, + "step": 4412 + }, + { + "epoch": 1.2918618266978923, + "grad_norm": 0.9698752164840698, + "learning_rate": 4.468324004381431e-06, + "loss": 0.6396, + "step": 4413 + }, + { + "epoch": 1.2921545667447307, + "grad_norm": 0.9438214898109436, + "learning_rate": 4.4680865700946956e-06, + "loss": 0.6048, + "step": 4414 + }, + { + "epoch": 1.2924473067915692, + "grad_norm": 0.9612036347389221, + "learning_rate": 4.467849089114728e-06, + "loss": 0.6567, + "step": 4415 + }, + { + "epoch": 1.2927400468384076, + "grad_norm": 0.9861551523208618, + "learning_rate": 4.467611561447164e-06, + "loss": 0.6462, + "step": 4416 + }, + { + "epoch": 1.293032786885246, + "grad_norm": 0.9637178182601929, + "learning_rate": 4.467373987097636e-06, + "loss": 0.6136, + "step": 4417 + }, + { + "epoch": 1.2933255269320842, + "grad_norm": 0.8955893516540527, + "learning_rate": 4.4671363660717835e-06, + "loss": 0.5666, + "step": 4418 + }, + { + "epoch": 1.2936182669789227, + "grad_norm": 0.9160864949226379, + "learning_rate": 4.466898698375241e-06, + "loss": 0.6317, + "step": 4419 + }, + { + "epoch": 1.2939110070257611, + "grad_norm": 0.9635897874832153, + "learning_rate": 4.466660984013651e-06, + "loss": 0.6436, + "step": 4420 + }, + { + "epoch": 1.2942037470725996, + "grad_norm": 0.9904659986495972, + "learning_rate": 4.4664232229926505e-06, + "loss": 0.6093, + "step": 4421 + }, + { + "epoch": 1.294496487119438, + "grad_norm": 0.9412827491760254, + "learning_rate": 4.466185415317881e-06, + "loss": 0.5986, + "step": 4422 + }, + { + "epoch": 1.2947892271662764, + "grad_norm": 1.0005842447280884, + "learning_rate": 4.4659475609949855e-06, + "loss": 0.6367, + "step": 4423 + }, + { + "epoch": 1.2950819672131146, + "grad_norm": 0.9479128122329712, + "learning_rate": 4.465709660029606e-06, + "loss": 0.6398, + "step": 4424 + }, + { + "epoch": 1.295374707259953, + "grad_norm": 0.963394284248352, + "learning_rate": 4.465471712427387e-06, + "loss": 0.6463, + "step": 4425 + }, + { + "epoch": 1.2956674473067915, + "grad_norm": 0.9821163415908813, + "learning_rate": 4.4652337181939745e-06, + "loss": 0.6035, + "step": 4426 + }, + { + "epoch": 1.29596018735363, + "grad_norm": 0.971262514591217, + "learning_rate": 4.464995677335013e-06, + "loss": 0.6126, + "step": 4427 + }, + { + "epoch": 1.2962529274004684, + "grad_norm": 0.9748396873474121, + "learning_rate": 4.464757589856153e-06, + "loss": 0.5968, + "step": 4428 + }, + { + "epoch": 1.2965456674473068, + "grad_norm": 0.9415642023086548, + "learning_rate": 4.464519455763041e-06, + "loss": 0.6479, + "step": 4429 + }, + { + "epoch": 1.2968384074941453, + "grad_norm": 0.9383069276809692, + "learning_rate": 4.464281275061328e-06, + "loss": 0.6408, + "step": 4430 + }, + { + "epoch": 1.2971311475409837, + "grad_norm": 0.9142728447914124, + "learning_rate": 4.464043047756665e-06, + "loss": 0.6306, + "step": 4431 + }, + { + "epoch": 1.2974238875878221, + "grad_norm": 0.9810581207275391, + "learning_rate": 4.463804773854702e-06, + "loss": 0.6294, + "step": 4432 + }, + { + "epoch": 1.2977166276346606, + "grad_norm": 0.9295633435249329, + "learning_rate": 4.463566453361094e-06, + "loss": 0.6267, + "step": 4433 + }, + { + "epoch": 1.2980093676814988, + "grad_norm": 0.9444278478622437, + "learning_rate": 4.463328086281495e-06, + "loss": 0.5897, + "step": 4434 + }, + { + "epoch": 1.2983021077283372, + "grad_norm": 0.9984384775161743, + "learning_rate": 4.46308967262156e-06, + "loss": 0.6186, + "step": 4435 + }, + { + "epoch": 1.2985948477751756, + "grad_norm": 0.9666447043418884, + "learning_rate": 4.462851212386945e-06, + "loss": 0.5905, + "step": 4436 + }, + { + "epoch": 1.298887587822014, + "grad_norm": 0.9579741954803467, + "learning_rate": 4.462612705583308e-06, + "loss": 0.6463, + "step": 4437 + }, + { + "epoch": 1.2991803278688525, + "grad_norm": 0.9475614428520203, + "learning_rate": 4.462374152216307e-06, + "loss": 0.6269, + "step": 4438 + }, + { + "epoch": 1.299473067915691, + "grad_norm": 0.9728106260299683, + "learning_rate": 4.462135552291602e-06, + "loss": 0.6316, + "step": 4439 + }, + { + "epoch": 1.2997658079625292, + "grad_norm": 1.0034054517745972, + "learning_rate": 4.461896905814855e-06, + "loss": 0.6623, + "step": 4440 + }, + { + "epoch": 1.3000585480093676, + "grad_norm": 0.9488202333450317, + "learning_rate": 4.461658212791726e-06, + "loss": 0.5813, + "step": 4441 + }, + { + "epoch": 1.300351288056206, + "grad_norm": 1.0155702829360962, + "learning_rate": 4.461419473227881e-06, + "loss": 0.648, + "step": 4442 + }, + { + "epoch": 1.3006440281030445, + "grad_norm": 0.9860164523124695, + "learning_rate": 4.46118068712898e-06, + "loss": 0.6371, + "step": 4443 + }, + { + "epoch": 1.300936768149883, + "grad_norm": 0.9670454859733582, + "learning_rate": 4.460941854500691e-06, + "loss": 0.6461, + "step": 4444 + }, + { + "epoch": 1.3012295081967213, + "grad_norm": 0.9084078073501587, + "learning_rate": 4.46070297534868e-06, + "loss": 0.5868, + "step": 4445 + }, + { + "epoch": 1.3015222482435598, + "grad_norm": 0.9885028600692749, + "learning_rate": 4.460464049678614e-06, + "loss": 0.6236, + "step": 4446 + }, + { + "epoch": 1.3018149882903982, + "grad_norm": 0.9164494276046753, + "learning_rate": 4.460225077496162e-06, + "loss": 0.6161, + "step": 4447 + }, + { + "epoch": 1.3021077283372366, + "grad_norm": 0.9509332776069641, + "learning_rate": 4.459986058806993e-06, + "loss": 0.6057, + "step": 4448 + }, + { + "epoch": 1.3024004683840749, + "grad_norm": 0.9462849497795105, + "learning_rate": 4.459746993616779e-06, + "loss": 0.6408, + "step": 4449 + }, + { + "epoch": 1.3026932084309133, + "grad_norm": 0.9687355756759644, + "learning_rate": 4.459507881931191e-06, + "loss": 0.6121, + "step": 4450 + }, + { + "epoch": 1.3029859484777517, + "grad_norm": 1.0234616994857788, + "learning_rate": 4.4592687237559015e-06, + "loss": 0.64, + "step": 4451 + }, + { + "epoch": 1.3032786885245902, + "grad_norm": 1.0319422483444214, + "learning_rate": 4.4590295190965855e-06, + "loss": 0.5772, + "step": 4452 + }, + { + "epoch": 1.3035714285714286, + "grad_norm": 0.9395558834075928, + "learning_rate": 4.4587902679589175e-06, + "loss": 0.6288, + "step": 4453 + }, + { + "epoch": 1.303864168618267, + "grad_norm": 1.0055198669433594, + "learning_rate": 4.458550970348574e-06, + "loss": 0.6252, + "step": 4454 + }, + { + "epoch": 1.3041569086651053, + "grad_norm": 0.9780582189559937, + "learning_rate": 4.458311626271232e-06, + "loss": 0.6505, + "step": 4455 + }, + { + "epoch": 1.3044496487119437, + "grad_norm": 0.9001513719558716, + "learning_rate": 4.4580722357325724e-06, + "loss": 0.6417, + "step": 4456 + }, + { + "epoch": 1.3047423887587821, + "grad_norm": 0.9663947224617004, + "learning_rate": 4.457832798738271e-06, + "loss": 0.627, + "step": 4457 + }, + { + "epoch": 1.3050351288056206, + "grad_norm": 1.0472923517227173, + "learning_rate": 4.457593315294011e-06, + "loss": 0.6454, + "step": 4458 + }, + { + "epoch": 1.305327868852459, + "grad_norm": 0.9733299016952515, + "learning_rate": 4.457353785405472e-06, + "loss": 0.6222, + "step": 4459 + }, + { + "epoch": 1.3056206088992974, + "grad_norm": 0.9329483509063721, + "learning_rate": 4.457114209078341e-06, + "loss": 0.6075, + "step": 4460 + }, + { + "epoch": 1.3059133489461359, + "grad_norm": 1.0138170719146729, + "learning_rate": 4.456874586318298e-06, + "loss": 0.6087, + "step": 4461 + }, + { + "epoch": 1.3062060889929743, + "grad_norm": 0.9393907189369202, + "learning_rate": 4.456634917131029e-06, + "loss": 0.5716, + "step": 4462 + }, + { + "epoch": 1.3064988290398127, + "grad_norm": 0.9516644477844238, + "learning_rate": 4.456395201522221e-06, + "loss": 0.5872, + "step": 4463 + }, + { + "epoch": 1.3067915690866512, + "grad_norm": 0.9202091097831726, + "learning_rate": 4.456155439497561e-06, + "loss": 0.5676, + "step": 4464 + }, + { + "epoch": 1.3070843091334894, + "grad_norm": 1.00568687915802, + "learning_rate": 4.4559156310627385e-06, + "loss": 0.6411, + "step": 4465 + }, + { + "epoch": 1.3073770491803278, + "grad_norm": 0.9474206566810608, + "learning_rate": 4.45567577622344e-06, + "loss": 0.5687, + "step": 4466 + }, + { + "epoch": 1.3076697892271663, + "grad_norm": 0.9573012590408325, + "learning_rate": 4.45543587498536e-06, + "loss": 0.6041, + "step": 4467 + }, + { + "epoch": 1.3079625292740047, + "grad_norm": 0.953058123588562, + "learning_rate": 4.455195927354187e-06, + "loss": 0.6079, + "step": 4468 + }, + { + "epoch": 1.3082552693208431, + "grad_norm": 0.9918780326843262, + "learning_rate": 4.4549559333356155e-06, + "loss": 0.6236, + "step": 4469 + }, + { + "epoch": 1.3085480093676816, + "grad_norm": 0.9484881162643433, + "learning_rate": 4.454715892935337e-06, + "loss": 0.6298, + "step": 4470 + }, + { + "epoch": 1.3088407494145198, + "grad_norm": 0.9501761198043823, + "learning_rate": 4.454475806159051e-06, + "loss": 0.5974, + "step": 4471 + }, + { + "epoch": 1.3091334894613582, + "grad_norm": 0.9716671705245972, + "learning_rate": 4.4542356730124495e-06, + "loss": 0.5941, + "step": 4472 + }, + { + "epoch": 1.3094262295081966, + "grad_norm": 0.9963477253913879, + "learning_rate": 4.453995493501232e-06, + "loss": 0.6172, + "step": 4473 + }, + { + "epoch": 1.309718969555035, + "grad_norm": 0.9515895843505859, + "learning_rate": 4.4537552676310955e-06, + "loss": 0.6233, + "step": 4474 + }, + { + "epoch": 1.3100117096018735, + "grad_norm": 0.9992842078208923, + "learning_rate": 4.45351499540774e-06, + "loss": 0.6133, + "step": 4475 + }, + { + "epoch": 1.310304449648712, + "grad_norm": 0.966387152671814, + "learning_rate": 4.453274676836865e-06, + "loss": 0.5954, + "step": 4476 + }, + { + "epoch": 1.3105971896955504, + "grad_norm": 0.9972788691520691, + "learning_rate": 4.453034311924175e-06, + "loss": 0.6131, + "step": 4477 + }, + { + "epoch": 1.3108899297423888, + "grad_norm": 0.9951683282852173, + "learning_rate": 4.452793900675369e-06, + "loss": 0.661, + "step": 4478 + }, + { + "epoch": 1.3111826697892273, + "grad_norm": 0.9734586477279663, + "learning_rate": 4.452553443096154e-06, + "loss": 0.599, + "step": 4479 + }, + { + "epoch": 1.3114754098360657, + "grad_norm": 0.9592598676681519, + "learning_rate": 4.452312939192233e-06, + "loss": 0.5864, + "step": 4480 + }, + { + "epoch": 1.311768149882904, + "grad_norm": 1.014585018157959, + "learning_rate": 4.452072388969312e-06, + "loss": 0.638, + "step": 4481 + }, + { + "epoch": 1.3120608899297423, + "grad_norm": 0.9816588163375854, + "learning_rate": 4.451831792433098e-06, + "loss": 0.6559, + "step": 4482 + }, + { + "epoch": 1.3123536299765808, + "grad_norm": 0.9793152809143066, + "learning_rate": 4.4515911495893e-06, + "loss": 0.6253, + "step": 4483 + }, + { + "epoch": 1.3126463700234192, + "grad_norm": 0.9351814985275269, + "learning_rate": 4.451350460443628e-06, + "loss": 0.6223, + "step": 4484 + }, + { + "epoch": 1.3129391100702577, + "grad_norm": 0.9800422191619873, + "learning_rate": 4.451109725001791e-06, + "loss": 0.6241, + "step": 4485 + }, + { + "epoch": 1.313231850117096, + "grad_norm": 0.9878500699996948, + "learning_rate": 4.450868943269501e-06, + "loss": 0.6503, + "step": 4486 + }, + { + "epoch": 1.3135245901639343, + "grad_norm": 0.95245760679245, + "learning_rate": 4.45062811525247e-06, + "loss": 0.639, + "step": 4487 + }, + { + "epoch": 1.3138173302107727, + "grad_norm": 0.9320048689842224, + "learning_rate": 4.450387240956412e-06, + "loss": 0.6467, + "step": 4488 + }, + { + "epoch": 1.3141100702576112, + "grad_norm": 0.924410879611969, + "learning_rate": 4.4501463203870435e-06, + "loss": 0.6077, + "step": 4489 + }, + { + "epoch": 1.3144028103044496, + "grad_norm": 0.9474089741706848, + "learning_rate": 4.4499053535500765e-06, + "loss": 0.6566, + "step": 4490 + }, + { + "epoch": 1.314695550351288, + "grad_norm": 1.0169399976730347, + "learning_rate": 4.449664340451232e-06, + "loss": 0.6367, + "step": 4491 + }, + { + "epoch": 1.3149882903981265, + "grad_norm": 0.9600574970245361, + "learning_rate": 4.4494232810962265e-06, + "loss": 0.6742, + "step": 4492 + }, + { + "epoch": 1.315281030444965, + "grad_norm": 1.0392709970474243, + "learning_rate": 4.449182175490779e-06, + "loss": 0.6181, + "step": 4493 + }, + { + "epoch": 1.3155737704918034, + "grad_norm": 0.9458771347999573, + "learning_rate": 4.4489410236406095e-06, + "loss": 0.6209, + "step": 4494 + }, + { + "epoch": 1.3158665105386418, + "grad_norm": 0.9603034853935242, + "learning_rate": 4.4486998255514404e-06, + "loss": 0.6144, + "step": 4495 + }, + { + "epoch": 1.3161592505854802, + "grad_norm": 0.9623656868934631, + "learning_rate": 4.448458581228993e-06, + "loss": 0.6171, + "step": 4496 + }, + { + "epoch": 1.3164519906323184, + "grad_norm": 0.9210444092750549, + "learning_rate": 4.448217290678992e-06, + "loss": 0.5633, + "step": 4497 + }, + { + "epoch": 1.3167447306791569, + "grad_norm": 1.0143747329711914, + "learning_rate": 4.44797595390716e-06, + "loss": 0.632, + "step": 4498 + }, + { + "epoch": 1.3170374707259953, + "grad_norm": 0.9400420784950256, + "learning_rate": 4.447734570919226e-06, + "loss": 0.5998, + "step": 4499 + }, + { + "epoch": 1.3173302107728337, + "grad_norm": 0.9782544374465942, + "learning_rate": 4.447493141720915e-06, + "loss": 0.6278, + "step": 4500 + }, + { + "epoch": 1.3176229508196722, + "grad_norm": 0.9801684021949768, + "learning_rate": 4.447251666317954e-06, + "loss": 0.6305, + "step": 4501 + }, + { + "epoch": 1.3179156908665106, + "grad_norm": 0.9688162803649902, + "learning_rate": 4.4470101447160735e-06, + "loss": 0.5947, + "step": 4502 + }, + { + "epoch": 1.3182084309133488, + "grad_norm": 0.9612842798233032, + "learning_rate": 4.446768576921003e-06, + "loss": 0.6077, + "step": 4503 + }, + { + "epoch": 1.3185011709601873, + "grad_norm": 0.9718267917633057, + "learning_rate": 4.446526962938474e-06, + "loss": 0.6419, + "step": 4504 + }, + { + "epoch": 1.3187939110070257, + "grad_norm": 1.0112124681472778, + "learning_rate": 4.44628530277422e-06, + "loss": 0.5975, + "step": 4505 + }, + { + "epoch": 1.3190866510538641, + "grad_norm": 0.9121837615966797, + "learning_rate": 4.446043596433972e-06, + "loss": 0.5908, + "step": 4506 + }, + { + "epoch": 1.3193793911007026, + "grad_norm": 1.003007411956787, + "learning_rate": 4.4458018439234665e-06, + "loss": 0.6236, + "step": 4507 + }, + { + "epoch": 1.319672131147541, + "grad_norm": 0.9781185388565063, + "learning_rate": 4.445560045248438e-06, + "loss": 0.6018, + "step": 4508 + }, + { + "epoch": 1.3199648711943794, + "grad_norm": 0.9873374700546265, + "learning_rate": 4.445318200414624e-06, + "loss": 0.6826, + "step": 4509 + }, + { + "epoch": 1.3202576112412179, + "grad_norm": 0.9651336073875427, + "learning_rate": 4.445076309427761e-06, + "loss": 0.6516, + "step": 4510 + }, + { + "epoch": 1.3205503512880563, + "grad_norm": 0.9624611139297485, + "learning_rate": 4.444834372293591e-06, + "loss": 0.6417, + "step": 4511 + }, + { + "epoch": 1.3208430913348947, + "grad_norm": 0.9609744548797607, + "learning_rate": 4.44459238901785e-06, + "loss": 0.6048, + "step": 4512 + }, + { + "epoch": 1.321135831381733, + "grad_norm": 0.9837982654571533, + "learning_rate": 4.444350359606281e-06, + "loss": 0.6562, + "step": 4513 + }, + { + "epoch": 1.3214285714285714, + "grad_norm": 0.9375048279762268, + "learning_rate": 4.444108284064626e-06, + "loss": 0.6188, + "step": 4514 + }, + { + "epoch": 1.3217213114754098, + "grad_norm": 0.9050257205963135, + "learning_rate": 4.4438661623986304e-06, + "loss": 0.5649, + "step": 4515 + }, + { + "epoch": 1.3220140515222483, + "grad_norm": 0.994612991809845, + "learning_rate": 4.443623994614035e-06, + "loss": 0.6486, + "step": 4516 + }, + { + "epoch": 1.3223067915690867, + "grad_norm": 0.9554393887519836, + "learning_rate": 4.4433817807165866e-06, + "loss": 0.6722, + "step": 4517 + }, + { + "epoch": 1.3225995316159251, + "grad_norm": 0.9981602430343628, + "learning_rate": 4.443139520712033e-06, + "loss": 0.619, + "step": 4518 + }, + { + "epoch": 1.3228922716627634, + "grad_norm": 1.0343531370162964, + "learning_rate": 4.442897214606121e-06, + "loss": 0.6497, + "step": 4519 + }, + { + "epoch": 1.3231850117096018, + "grad_norm": 0.9382816553115845, + "learning_rate": 4.442654862404599e-06, + "loss": 0.5646, + "step": 4520 + }, + { + "epoch": 1.3234777517564402, + "grad_norm": 0.9452589750289917, + "learning_rate": 4.442412464113217e-06, + "loss": 0.5986, + "step": 4521 + }, + { + "epoch": 1.3237704918032787, + "grad_norm": 0.9604417681694031, + "learning_rate": 4.442170019737726e-06, + "loss": 0.5835, + "step": 4522 + }, + { + "epoch": 1.324063231850117, + "grad_norm": 0.9044540524482727, + "learning_rate": 4.441927529283879e-06, + "loss": 0.5542, + "step": 4523 + }, + { + "epoch": 1.3243559718969555, + "grad_norm": 0.9772694110870361, + "learning_rate": 4.441684992757427e-06, + "loss": 0.6369, + "step": 4524 + }, + { + "epoch": 1.324648711943794, + "grad_norm": 0.9345768690109253, + "learning_rate": 4.441442410164126e-06, + "loss": 0.5855, + "step": 4525 + }, + { + "epoch": 1.3249414519906324, + "grad_norm": 0.9754636883735657, + "learning_rate": 4.441199781509732e-06, + "loss": 0.6161, + "step": 4526 + }, + { + "epoch": 1.3252341920374708, + "grad_norm": 0.9725382328033447, + "learning_rate": 4.440957106799999e-06, + "loss": 0.6402, + "step": 4527 + }, + { + "epoch": 1.325526932084309, + "grad_norm": 0.9625139832496643, + "learning_rate": 4.4407143860406855e-06, + "loss": 0.5968, + "step": 4528 + }, + { + "epoch": 1.3258196721311475, + "grad_norm": 0.985339879989624, + "learning_rate": 4.440471619237551e-06, + "loss": 0.5998, + "step": 4529 + }, + { + "epoch": 1.326112412177986, + "grad_norm": 1.0334137678146362, + "learning_rate": 4.440228806396355e-06, + "loss": 0.6186, + "step": 4530 + }, + { + "epoch": 1.3264051522248244, + "grad_norm": 0.9479877948760986, + "learning_rate": 4.439985947522857e-06, + "loss": 0.5836, + "step": 4531 + }, + { + "epoch": 1.3266978922716628, + "grad_norm": 0.9091500639915466, + "learning_rate": 4.439743042622819e-06, + "loss": 0.5557, + "step": 4532 + }, + { + "epoch": 1.3269906323185012, + "grad_norm": 0.9471980333328247, + "learning_rate": 4.439500091702006e-06, + "loss": 0.6037, + "step": 4533 + }, + { + "epoch": 1.3272833723653394, + "grad_norm": 0.9463550448417664, + "learning_rate": 4.43925709476618e-06, + "loss": 0.6206, + "step": 4534 + }, + { + "epoch": 1.3275761124121779, + "grad_norm": 0.9707633256912231, + "learning_rate": 4.439014051821107e-06, + "loss": 0.645, + "step": 4535 + }, + { + "epoch": 1.3278688524590163, + "grad_norm": 0.9763364195823669, + "learning_rate": 4.438770962872554e-06, + "loss": 0.6082, + "step": 4536 + }, + { + "epoch": 1.3281615925058547, + "grad_norm": 0.9349299073219299, + "learning_rate": 4.438527827926286e-06, + "loss": 0.6267, + "step": 4537 + }, + { + "epoch": 1.3284543325526932, + "grad_norm": 0.9473118782043457, + "learning_rate": 4.438284646988074e-06, + "loss": 0.6176, + "step": 4538 + }, + { + "epoch": 1.3287470725995316, + "grad_norm": 0.9064692854881287, + "learning_rate": 4.4380414200636855e-06, + "loss": 0.6457, + "step": 4539 + }, + { + "epoch": 1.32903981264637, + "grad_norm": 0.940768837928772, + "learning_rate": 4.437798147158892e-06, + "loss": 0.5846, + "step": 4540 + }, + { + "epoch": 1.3293325526932085, + "grad_norm": 0.9784445762634277, + "learning_rate": 4.4375548282794655e-06, + "loss": 0.6192, + "step": 4541 + }, + { + "epoch": 1.329625292740047, + "grad_norm": 1.5123119354248047, + "learning_rate": 4.437311463431178e-06, + "loss": 0.6501, + "step": 4542 + }, + { + "epoch": 1.3299180327868854, + "grad_norm": 0.9683117270469666, + "learning_rate": 4.4370680526198045e-06, + "loss": 0.6281, + "step": 4543 + }, + { + "epoch": 1.3302107728337236, + "grad_norm": 0.9621789455413818, + "learning_rate": 4.436824595851119e-06, + "loss": 0.5555, + "step": 4544 + }, + { + "epoch": 1.330503512880562, + "grad_norm": 0.9321409463882446, + "learning_rate": 4.436581093130899e-06, + "loss": 0.5915, + "step": 4545 + }, + { + "epoch": 1.3307962529274004, + "grad_norm": 0.9846649169921875, + "learning_rate": 4.436337544464918e-06, + "loss": 0.6223, + "step": 4546 + }, + { + "epoch": 1.3310889929742389, + "grad_norm": 0.9783638119697571, + "learning_rate": 4.436093949858958e-06, + "loss": 0.6003, + "step": 4547 + }, + { + "epoch": 1.3313817330210773, + "grad_norm": 0.9443066716194153, + "learning_rate": 4.435850309318798e-06, + "loss": 0.6248, + "step": 4548 + }, + { + "epoch": 1.3316744730679158, + "grad_norm": 0.9444625973701477, + "learning_rate": 4.435606622850216e-06, + "loss": 0.5634, + "step": 4549 + }, + { + "epoch": 1.331967213114754, + "grad_norm": 0.9622592329978943, + "learning_rate": 4.435362890458996e-06, + "loss": 0.5802, + "step": 4550 + }, + { + "epoch": 1.3322599531615924, + "grad_norm": 0.9524176716804504, + "learning_rate": 4.435119112150919e-06, + "loss": 0.5845, + "step": 4551 + }, + { + "epoch": 1.3325526932084308, + "grad_norm": 1.0157780647277832, + "learning_rate": 4.4348752879317705e-06, + "loss": 0.6021, + "step": 4552 + }, + { + "epoch": 1.3328454332552693, + "grad_norm": 0.953950822353363, + "learning_rate": 4.434631417807332e-06, + "loss": 0.6226, + "step": 4553 + }, + { + "epoch": 1.3331381733021077, + "grad_norm": 1.2489250898361206, + "learning_rate": 4.434387501783393e-06, + "loss": 0.6189, + "step": 4554 + }, + { + "epoch": 1.3334309133489461, + "grad_norm": 0.9716475009918213, + "learning_rate": 4.434143539865738e-06, + "loss": 0.6549, + "step": 4555 + }, + { + "epoch": 1.3337236533957846, + "grad_norm": 0.9657772779464722, + "learning_rate": 4.433899532060157e-06, + "loss": 0.6193, + "step": 4556 + }, + { + "epoch": 1.334016393442623, + "grad_norm": 0.9601584076881409, + "learning_rate": 4.433655478372437e-06, + "loss": 0.5705, + "step": 4557 + }, + { + "epoch": 1.3343091334894615, + "grad_norm": 0.9487001895904541, + "learning_rate": 4.43341137880837e-06, + "loss": 0.6481, + "step": 4558 + }, + { + "epoch": 1.3346018735362999, + "grad_norm": 0.9724152088165283, + "learning_rate": 4.433167233373747e-06, + "loss": 0.6029, + "step": 4559 + }, + { + "epoch": 1.334894613583138, + "grad_norm": 0.9629527926445007, + "learning_rate": 4.432923042074358e-06, + "loss": 0.6429, + "step": 4560 + }, + { + "epoch": 1.3351873536299765, + "grad_norm": 1.0565859079360962, + "learning_rate": 4.432678804916001e-06, + "loss": 0.5788, + "step": 4561 + }, + { + "epoch": 1.335480093676815, + "grad_norm": 0.9706205129623413, + "learning_rate": 4.432434521904466e-06, + "loss": 0.6006, + "step": 4562 + }, + { + "epoch": 1.3357728337236534, + "grad_norm": 0.9389815330505371, + "learning_rate": 4.432190193045552e-06, + "loss": 0.636, + "step": 4563 + }, + { + "epoch": 1.3360655737704918, + "grad_norm": 0.9630430936813354, + "learning_rate": 4.431945818345054e-06, + "loss": 0.5825, + "step": 4564 + }, + { + "epoch": 1.3363583138173303, + "grad_norm": 0.9593695998191833, + "learning_rate": 4.43170139780877e-06, + "loss": 0.6188, + "step": 4565 + }, + { + "epoch": 1.3366510538641685, + "grad_norm": 0.957213282585144, + "learning_rate": 4.4314569314425e-06, + "loss": 0.6112, + "step": 4566 + }, + { + "epoch": 1.336943793911007, + "grad_norm": 0.9588969945907593, + "learning_rate": 4.4312124192520426e-06, + "loss": 0.5896, + "step": 4567 + }, + { + "epoch": 1.3372365339578454, + "grad_norm": 0.9176177978515625, + "learning_rate": 4.430967861243199e-06, + "loss": 0.6146, + "step": 4568 + }, + { + "epoch": 1.3375292740046838, + "grad_norm": 0.9878908395767212, + "learning_rate": 4.430723257421773e-06, + "loss": 0.6114, + "step": 4569 + }, + { + "epoch": 1.3378220140515222, + "grad_norm": 0.9529159665107727, + "learning_rate": 4.430478607793566e-06, + "loss": 0.5973, + "step": 4570 + }, + { + "epoch": 1.3381147540983607, + "grad_norm": 1.012317180633545, + "learning_rate": 4.430233912364383e-06, + "loss": 0.6546, + "step": 4571 + }, + { + "epoch": 1.338407494145199, + "grad_norm": 1.0216116905212402, + "learning_rate": 4.42998917114003e-06, + "loss": 0.6528, + "step": 4572 + }, + { + "epoch": 1.3387002341920375, + "grad_norm": 1.0047228336334229, + "learning_rate": 4.429744384126314e-06, + "loss": 0.6389, + "step": 4573 + }, + { + "epoch": 1.338992974238876, + "grad_norm": 0.9421074390411377, + "learning_rate": 4.42949955132904e-06, + "loss": 0.6058, + "step": 4574 + }, + { + "epoch": 1.3392857142857144, + "grad_norm": 0.9702188968658447, + "learning_rate": 4.429254672754019e-06, + "loss": 0.6272, + "step": 4575 + }, + { + "epoch": 1.3395784543325526, + "grad_norm": 0.9738038182258606, + "learning_rate": 4.429009748407062e-06, + "loss": 0.6369, + "step": 4576 + }, + { + "epoch": 1.339871194379391, + "grad_norm": 0.9247280359268188, + "learning_rate": 4.428764778293976e-06, + "loss": 0.6518, + "step": 4577 + }, + { + "epoch": 1.3401639344262295, + "grad_norm": 0.9112014770507812, + "learning_rate": 4.428519762420576e-06, + "loss": 0.6108, + "step": 4578 + }, + { + "epoch": 1.340456674473068, + "grad_norm": 0.972089946269989, + "learning_rate": 4.428274700792674e-06, + "loss": 0.6203, + "step": 4579 + }, + { + "epoch": 1.3407494145199064, + "grad_norm": 0.9763414263725281, + "learning_rate": 4.4280295934160845e-06, + "loss": 0.6112, + "step": 4580 + }, + { + "epoch": 1.3410421545667448, + "grad_norm": 0.9895251393318176, + "learning_rate": 4.427784440296622e-06, + "loss": 0.616, + "step": 4581 + }, + { + "epoch": 1.341334894613583, + "grad_norm": 0.9294360280036926, + "learning_rate": 4.427539241440103e-06, + "loss": 0.6322, + "step": 4582 + }, + { + "epoch": 1.3416276346604215, + "grad_norm": 0.96377032995224, + "learning_rate": 4.427293996852346e-06, + "loss": 0.597, + "step": 4583 + }, + { + "epoch": 1.3419203747072599, + "grad_norm": 0.9677971005439758, + "learning_rate": 4.427048706539168e-06, + "loss": 0.6391, + "step": 4584 + }, + { + "epoch": 1.3422131147540983, + "grad_norm": 0.9385986328125, + "learning_rate": 4.426803370506389e-06, + "loss": 0.5833, + "step": 4585 + }, + { + "epoch": 1.3425058548009368, + "grad_norm": 0.999334990978241, + "learning_rate": 4.426557988759831e-06, + "loss": 0.619, + "step": 4586 + }, + { + "epoch": 1.3427985948477752, + "grad_norm": 0.9872877597808838, + "learning_rate": 4.4263125613053135e-06, + "loss": 0.6033, + "step": 4587 + }, + { + "epoch": 1.3430913348946136, + "grad_norm": 0.9298118948936462, + "learning_rate": 4.426067088148661e-06, + "loss": 0.6313, + "step": 4588 + }, + { + "epoch": 1.343384074941452, + "grad_norm": 0.9875504374504089, + "learning_rate": 4.4258215692956965e-06, + "loss": 0.6027, + "step": 4589 + }, + { + "epoch": 1.3436768149882905, + "grad_norm": 0.9018908739089966, + "learning_rate": 4.425576004752246e-06, + "loss": 0.5979, + "step": 4590 + }, + { + "epoch": 1.343969555035129, + "grad_norm": 1.0121581554412842, + "learning_rate": 4.4253303945241335e-06, + "loss": 0.6169, + "step": 4591 + }, + { + "epoch": 1.3442622950819672, + "grad_norm": 1.0087553262710571, + "learning_rate": 4.4250847386171885e-06, + "loss": 0.6256, + "step": 4592 + }, + { + "epoch": 1.3445550351288056, + "grad_norm": 0.9850611090660095, + "learning_rate": 4.424839037037238e-06, + "loss": 0.6259, + "step": 4593 + }, + { + "epoch": 1.344847775175644, + "grad_norm": 1.036868691444397, + "learning_rate": 4.424593289790111e-06, + "loss": 0.616, + "step": 4594 + }, + { + "epoch": 1.3451405152224825, + "grad_norm": 0.9354026913642883, + "learning_rate": 4.424347496881639e-06, + "loss": 0.6052, + "step": 4595 + }, + { + "epoch": 1.345433255269321, + "grad_norm": 0.9189494252204895, + "learning_rate": 4.424101658317653e-06, + "loss": 0.618, + "step": 4596 + }, + { + "epoch": 1.3457259953161593, + "grad_norm": 0.9373454451560974, + "learning_rate": 4.423855774103986e-06, + "loss": 0.6313, + "step": 4597 + }, + { + "epoch": 1.3460187353629975, + "grad_norm": 0.9385582804679871, + "learning_rate": 4.4236098442464705e-06, + "loss": 0.6011, + "step": 4598 + }, + { + "epoch": 1.346311475409836, + "grad_norm": 0.934055745601654, + "learning_rate": 4.423363868750942e-06, + "loss": 0.5773, + "step": 4599 + }, + { + "epoch": 1.3466042154566744, + "grad_norm": 0.9715591073036194, + "learning_rate": 4.423117847623236e-06, + "loss": 0.607, + "step": 4600 + }, + { + "epoch": 1.3468969555035128, + "grad_norm": 0.9545853137969971, + "learning_rate": 4.422871780869189e-06, + "loss": 0.5972, + "step": 4601 + }, + { + "epoch": 1.3471896955503513, + "grad_norm": 1.0427451133728027, + "learning_rate": 4.42262566849464e-06, + "loss": 0.6059, + "step": 4602 + }, + { + "epoch": 1.3474824355971897, + "grad_norm": 0.9635732769966125, + "learning_rate": 4.422379510505429e-06, + "loss": 0.6281, + "step": 4603 + }, + { + "epoch": 1.3477751756440282, + "grad_norm": 0.9656251072883606, + "learning_rate": 4.422133306907393e-06, + "loss": 0.6174, + "step": 4604 + }, + { + "epoch": 1.3480679156908666, + "grad_norm": 0.9609788060188293, + "learning_rate": 4.421887057706375e-06, + "loss": 0.63, + "step": 4605 + }, + { + "epoch": 1.348360655737705, + "grad_norm": 0.9655849933624268, + "learning_rate": 4.421640762908219e-06, + "loss": 0.6223, + "step": 4606 + }, + { + "epoch": 1.3486533957845435, + "grad_norm": 1.0012755393981934, + "learning_rate": 4.421394422518765e-06, + "loss": 0.6145, + "step": 4607 + }, + { + "epoch": 1.3489461358313817, + "grad_norm": 0.9523609280586243, + "learning_rate": 4.4211480365438605e-06, + "loss": 0.5966, + "step": 4608 + }, + { + "epoch": 1.3492388758782201, + "grad_norm": 0.9264230728149414, + "learning_rate": 4.420901604989349e-06, + "loss": 0.5919, + "step": 4609 + }, + { + "epoch": 1.3495316159250585, + "grad_norm": 0.9308303594589233, + "learning_rate": 4.420655127861078e-06, + "loss": 0.5704, + "step": 4610 + }, + { + "epoch": 1.349824355971897, + "grad_norm": 0.9672810435295105, + "learning_rate": 4.420408605164895e-06, + "loss": 0.624, + "step": 4611 + }, + { + "epoch": 1.3501170960187354, + "grad_norm": 0.9952184557914734, + "learning_rate": 4.420162036906649e-06, + "loss": 0.6577, + "step": 4612 + }, + { + "epoch": 1.3504098360655736, + "grad_norm": 0.9428781270980835, + "learning_rate": 4.419915423092189e-06, + "loss": 0.6386, + "step": 4613 + }, + { + "epoch": 1.350702576112412, + "grad_norm": 0.9597965478897095, + "learning_rate": 4.419668763727368e-06, + "loss": 0.5805, + "step": 4614 + }, + { + "epoch": 1.3509953161592505, + "grad_norm": 1.094857931137085, + "learning_rate": 4.419422058818036e-06, + "loss": 0.6237, + "step": 4615 + }, + { + "epoch": 1.351288056206089, + "grad_norm": 0.9935159683227539, + "learning_rate": 4.419175308370047e-06, + "loss": 0.6119, + "step": 4616 + }, + { + "epoch": 1.3515807962529274, + "grad_norm": 0.8902120590209961, + "learning_rate": 4.418928512389256e-06, + "loss": 0.5749, + "step": 4617 + }, + { + "epoch": 1.3518735362997658, + "grad_norm": 0.949329674243927, + "learning_rate": 4.418681670881516e-06, + "loss": 0.6191, + "step": 4618 + }, + { + "epoch": 1.3521662763466042, + "grad_norm": 1.0260120630264282, + "learning_rate": 4.418434783852686e-06, + "loss": 0.6153, + "step": 4619 + }, + { + "epoch": 1.3524590163934427, + "grad_norm": 1.0562689304351807, + "learning_rate": 4.418187851308621e-06, + "loss": 0.6262, + "step": 4620 + }, + { + "epoch": 1.3527517564402811, + "grad_norm": 0.9686413407325745, + "learning_rate": 4.417940873255181e-06, + "loss": 0.6327, + "step": 4621 + }, + { + "epoch": 1.3530444964871196, + "grad_norm": 0.9907900094985962, + "learning_rate": 4.4176938496982255e-06, + "loss": 0.621, + "step": 4622 + }, + { + "epoch": 1.3533372365339578, + "grad_norm": 1.0871425867080688, + "learning_rate": 4.417446780643615e-06, + "loss": 0.6505, + "step": 4623 + }, + { + "epoch": 1.3536299765807962, + "grad_norm": 0.9664628505706787, + "learning_rate": 4.4171996660972115e-06, + "loss": 0.6489, + "step": 4624 + }, + { + "epoch": 1.3539227166276346, + "grad_norm": 0.9529922604560852, + "learning_rate": 4.416952506064877e-06, + "loss": 0.601, + "step": 4625 + }, + { + "epoch": 1.354215456674473, + "grad_norm": 1.0007243156433105, + "learning_rate": 4.416705300552477e-06, + "loss": 0.6181, + "step": 4626 + }, + { + "epoch": 1.3545081967213115, + "grad_norm": 0.9293556809425354, + "learning_rate": 4.4164580495658745e-06, + "loss": 0.6054, + "step": 4627 + }, + { + "epoch": 1.35480093676815, + "grad_norm": 0.9955129623413086, + "learning_rate": 4.416210753110937e-06, + "loss": 0.6262, + "step": 4628 + }, + { + "epoch": 1.3550936768149882, + "grad_norm": 0.9009256958961487, + "learning_rate": 4.415963411193531e-06, + "loss": 0.5507, + "step": 4629 + }, + { + "epoch": 1.3553864168618266, + "grad_norm": 0.9257979393005371, + "learning_rate": 4.415716023819525e-06, + "loss": 0.575, + "step": 4630 + }, + { + "epoch": 1.355679156908665, + "grad_norm": 0.981940746307373, + "learning_rate": 4.4154685909947895e-06, + "loss": 0.6606, + "step": 4631 + }, + { + "epoch": 1.3559718969555035, + "grad_norm": 0.9992950558662415, + "learning_rate": 4.415221112725192e-06, + "loss": 0.601, + "step": 4632 + }, + { + "epoch": 1.356264637002342, + "grad_norm": 0.9514440298080444, + "learning_rate": 4.414973589016606e-06, + "loss": 0.6096, + "step": 4633 + }, + { + "epoch": 1.3565573770491803, + "grad_norm": 0.9288879632949829, + "learning_rate": 4.4147260198749045e-06, + "loss": 0.5601, + "step": 4634 + }, + { + "epoch": 1.3568501170960188, + "grad_norm": 0.9893496036529541, + "learning_rate": 4.4144784053059595e-06, + "loss": 0.6234, + "step": 4635 + }, + { + "epoch": 1.3571428571428572, + "grad_norm": 0.9956640601158142, + "learning_rate": 4.414230745315647e-06, + "loss": 0.5874, + "step": 4636 + }, + { + "epoch": 1.3574355971896956, + "grad_norm": 0.9707168936729431, + "learning_rate": 4.413983039909842e-06, + "loss": 0.6024, + "step": 4637 + }, + { + "epoch": 1.357728337236534, + "grad_norm": 0.9384655356407166, + "learning_rate": 4.4137352890944215e-06, + "loss": 0.5709, + "step": 4638 + }, + { + "epoch": 1.3580210772833723, + "grad_norm": 0.9622271656990051, + "learning_rate": 4.413487492875264e-06, + "loss": 0.5831, + "step": 4639 + }, + { + "epoch": 1.3583138173302107, + "grad_norm": 0.9709953665733337, + "learning_rate": 4.413239651258248e-06, + "loss": 0.6087, + "step": 4640 + }, + { + "epoch": 1.3586065573770492, + "grad_norm": 0.9205732345581055, + "learning_rate": 4.412991764249254e-06, + "loss": 0.6033, + "step": 4641 + }, + { + "epoch": 1.3588992974238876, + "grad_norm": 0.9710667133331299, + "learning_rate": 4.412743831854162e-06, + "loss": 0.6023, + "step": 4642 + }, + { + "epoch": 1.359192037470726, + "grad_norm": 0.9597374796867371, + "learning_rate": 4.4124958540788555e-06, + "loss": 0.5931, + "step": 4643 + }, + { + "epoch": 1.3594847775175645, + "grad_norm": 0.967430591583252, + "learning_rate": 4.412247830929218e-06, + "loss": 0.5861, + "step": 4644 + }, + { + "epoch": 1.3597775175644027, + "grad_norm": 1.0502015352249146, + "learning_rate": 4.411999762411133e-06, + "loss": 0.6557, + "step": 4645 + }, + { + "epoch": 1.3600702576112411, + "grad_norm": 1.0420035123825073, + "learning_rate": 4.4117516485304855e-06, + "loss": 0.626, + "step": 4646 + }, + { + "epoch": 1.3603629976580796, + "grad_norm": 0.9895673990249634, + "learning_rate": 4.411503489293164e-06, + "loss": 0.6388, + "step": 4647 + }, + { + "epoch": 1.360655737704918, + "grad_norm": 0.972195029258728, + "learning_rate": 4.4112552847050546e-06, + "loss": 0.6188, + "step": 4648 + }, + { + "epoch": 1.3609484777517564, + "grad_norm": 0.954071581363678, + "learning_rate": 4.411007034772046e-06, + "loss": 0.5713, + "step": 4649 + }, + { + "epoch": 1.3612412177985949, + "grad_norm": 0.9615337252616882, + "learning_rate": 4.410758739500029e-06, + "loss": 0.6211, + "step": 4650 + }, + { + "epoch": 1.3615339578454333, + "grad_norm": 0.924652099609375, + "learning_rate": 4.410510398894893e-06, + "loss": 0.5804, + "step": 4651 + }, + { + "epoch": 1.3618266978922717, + "grad_norm": 0.9262964129447937, + "learning_rate": 4.410262012962532e-06, + "loss": 0.5821, + "step": 4652 + }, + { + "epoch": 1.3621194379391102, + "grad_norm": 0.9559378623962402, + "learning_rate": 4.410013581708836e-06, + "loss": 0.607, + "step": 4653 + }, + { + "epoch": 1.3624121779859486, + "grad_norm": 0.9235690832138062, + "learning_rate": 4.4097651051397025e-06, + "loss": 0.6238, + "step": 4654 + }, + { + "epoch": 1.3627049180327868, + "grad_norm": 1.001281976699829, + "learning_rate": 4.409516583261024e-06, + "loss": 0.6255, + "step": 4655 + }, + { + "epoch": 1.3629976580796253, + "grad_norm": 0.9139559864997864, + "learning_rate": 4.409268016078697e-06, + "loss": 0.5535, + "step": 4656 + }, + { + "epoch": 1.3632903981264637, + "grad_norm": 0.941162645816803, + "learning_rate": 4.409019403598621e-06, + "loss": 0.6211, + "step": 4657 + }, + { + "epoch": 1.3635831381733021, + "grad_norm": 0.9576692581176758, + "learning_rate": 4.408770745826692e-06, + "loss": 0.6427, + "step": 4658 + }, + { + "epoch": 1.3638758782201406, + "grad_norm": 0.9505347609519958, + "learning_rate": 4.4085220427688115e-06, + "loss": 0.6294, + "step": 4659 + }, + { + "epoch": 1.364168618266979, + "grad_norm": 0.9913117289543152, + "learning_rate": 4.408273294430878e-06, + "loss": 0.6069, + "step": 4660 + }, + { + "epoch": 1.3644613583138172, + "grad_norm": 1.0032981634140015, + "learning_rate": 4.408024500818794e-06, + "loss": 0.6415, + "step": 4661 + }, + { + "epoch": 1.3647540983606556, + "grad_norm": 0.9567134380340576, + "learning_rate": 4.407775661938462e-06, + "loss": 0.6285, + "step": 4662 + }, + { + "epoch": 1.365046838407494, + "grad_norm": 0.9764002561569214, + "learning_rate": 4.4075267777957855e-06, + "loss": 0.6186, + "step": 4663 + }, + { + "epoch": 1.3653395784543325, + "grad_norm": 1.0383625030517578, + "learning_rate": 4.407277848396671e-06, + "loss": 0.6338, + "step": 4664 + }, + { + "epoch": 1.365632318501171, + "grad_norm": 0.9420385956764221, + "learning_rate": 4.4070288737470216e-06, + "loss": 0.5736, + "step": 4665 + }, + { + "epoch": 1.3659250585480094, + "grad_norm": 0.9997028112411499, + "learning_rate": 4.4067798538527466e-06, + "loss": 0.6193, + "step": 4666 + }, + { + "epoch": 1.3662177985948478, + "grad_norm": 0.9207908511161804, + "learning_rate": 4.406530788719753e-06, + "loss": 0.6313, + "step": 4667 + }, + { + "epoch": 1.3665105386416863, + "grad_norm": 0.9977696537971497, + "learning_rate": 4.406281678353951e-06, + "loss": 0.6524, + "step": 4668 + }, + { + "epoch": 1.3668032786885247, + "grad_norm": 0.9274045825004578, + "learning_rate": 4.406032522761249e-06, + "loss": 0.5647, + "step": 4669 + }, + { + "epoch": 1.3670960187353631, + "grad_norm": 0.9543116092681885, + "learning_rate": 4.40578332194756e-06, + "loss": 0.6337, + "step": 4670 + }, + { + "epoch": 1.3673887587822013, + "grad_norm": 0.9171501398086548, + "learning_rate": 4.405534075918795e-06, + "loss": 0.606, + "step": 4671 + }, + { + "epoch": 1.3676814988290398, + "grad_norm": 0.9976327419281006, + "learning_rate": 4.405284784680868e-06, + "loss": 0.6291, + "step": 4672 + }, + { + "epoch": 1.3679742388758782, + "grad_norm": 0.960177481174469, + "learning_rate": 4.4050354482396926e-06, + "loss": 0.6055, + "step": 4673 + }, + { + "epoch": 1.3682669789227166, + "grad_norm": 0.9963817000389099, + "learning_rate": 4.4047860666011865e-06, + "loss": 0.6542, + "step": 4674 + }, + { + "epoch": 1.368559718969555, + "grad_norm": 1.0327842235565186, + "learning_rate": 4.404536639771264e-06, + "loss": 0.6376, + "step": 4675 + }, + { + "epoch": 1.3688524590163935, + "grad_norm": 1.0307905673980713, + "learning_rate": 4.404287167755845e-06, + "loss": 0.58, + "step": 4676 + }, + { + "epoch": 1.3691451990632317, + "grad_norm": 0.9583171606063843, + "learning_rate": 4.4040376505608464e-06, + "loss": 0.6368, + "step": 4677 + }, + { + "epoch": 1.3694379391100702, + "grad_norm": 0.9752498865127563, + "learning_rate": 4.403788088192189e-06, + "loss": 0.6173, + "step": 4678 + }, + { + "epoch": 1.3697306791569086, + "grad_norm": 0.9192586541175842, + "learning_rate": 4.403538480655793e-06, + "loss": 0.5435, + "step": 4679 + }, + { + "epoch": 1.370023419203747, + "grad_norm": 0.905661940574646, + "learning_rate": 4.403288827957581e-06, + "loss": 0.5624, + "step": 4680 + }, + { + "epoch": 1.3703161592505855, + "grad_norm": 1.1403782367706299, + "learning_rate": 4.4030391301034755e-06, + "loss": 0.5999, + "step": 4681 + }, + { + "epoch": 1.370608899297424, + "grad_norm": 0.9525012373924255, + "learning_rate": 4.402789387099402e-06, + "loss": 0.6148, + "step": 4682 + }, + { + "epoch": 1.3709016393442623, + "grad_norm": 0.9861671328544617, + "learning_rate": 4.4025395989512845e-06, + "loss": 0.6045, + "step": 4683 + }, + { + "epoch": 1.3711943793911008, + "grad_norm": 0.9707449078559875, + "learning_rate": 4.402289765665049e-06, + "loss": 0.6179, + "step": 4684 + }, + { + "epoch": 1.3714871194379392, + "grad_norm": 0.9763824343681335, + "learning_rate": 4.402039887246623e-06, + "loss": 0.6523, + "step": 4685 + }, + { + "epoch": 1.3717798594847777, + "grad_norm": 0.9688648581504822, + "learning_rate": 4.401789963701936e-06, + "loss": 0.6154, + "step": 4686 + }, + { + "epoch": 1.3720725995316159, + "grad_norm": 0.9731820225715637, + "learning_rate": 4.401539995036917e-06, + "loss": 0.6135, + "step": 4687 + }, + { + "epoch": 1.3723653395784543, + "grad_norm": 0.9105067253112793, + "learning_rate": 4.401289981257495e-06, + "loss": 0.6017, + "step": 4688 + }, + { + "epoch": 1.3726580796252927, + "grad_norm": 0.9874604940414429, + "learning_rate": 4.401039922369604e-06, + "loss": 0.6364, + "step": 4689 + }, + { + "epoch": 1.3729508196721312, + "grad_norm": 0.9922077059745789, + "learning_rate": 4.4007898183791756e-06, + "loss": 0.612, + "step": 4690 + }, + { + "epoch": 1.3732435597189696, + "grad_norm": 0.9345640540122986, + "learning_rate": 4.400539669292143e-06, + "loss": 0.6302, + "step": 4691 + }, + { + "epoch": 1.373536299765808, + "grad_norm": 0.9372754693031311, + "learning_rate": 4.400289475114442e-06, + "loss": 0.6201, + "step": 4692 + }, + { + "epoch": 1.3738290398126463, + "grad_norm": 0.9582284688949585, + "learning_rate": 4.400039235852008e-06, + "loss": 0.6045, + "step": 4693 + }, + { + "epoch": 1.3741217798594847, + "grad_norm": 1.0721197128295898, + "learning_rate": 4.399788951510779e-06, + "loss": 0.6338, + "step": 4694 + }, + { + "epoch": 1.3744145199063231, + "grad_norm": 0.9933443665504456, + "learning_rate": 4.399538622096691e-06, + "loss": 0.6055, + "step": 4695 + }, + { + "epoch": 1.3747072599531616, + "grad_norm": 1.0007972717285156, + "learning_rate": 4.399288247615684e-06, + "loss": 0.6439, + "step": 4696 + }, + { + "epoch": 1.375, + "grad_norm": 1.0110632181167603, + "learning_rate": 4.3990378280737e-06, + "loss": 0.6276, + "step": 4697 + }, + { + "epoch": 1.3752927400468384, + "grad_norm": 1.0146427154541016, + "learning_rate": 4.398787363476677e-06, + "loss": 0.6137, + "step": 4698 + }, + { + "epoch": 1.3755854800936769, + "grad_norm": 1.0046837329864502, + "learning_rate": 4.39853685383056e-06, + "loss": 0.6288, + "step": 4699 + }, + { + "epoch": 1.3758782201405153, + "grad_norm": 1.010384202003479, + "learning_rate": 4.3982862991412904e-06, + "loss": 0.6132, + "step": 4700 + }, + { + "epoch": 1.3761709601873537, + "grad_norm": 0.9436549544334412, + "learning_rate": 4.398035699414814e-06, + "loss": 0.6142, + "step": 4701 + }, + { + "epoch": 1.376463700234192, + "grad_norm": 1.0262689590454102, + "learning_rate": 4.397785054657076e-06, + "loss": 0.647, + "step": 4702 + }, + { + "epoch": 1.3767564402810304, + "grad_norm": 0.9781596660614014, + "learning_rate": 4.397534364874024e-06, + "loss": 0.6444, + "step": 4703 + }, + { + "epoch": 1.3770491803278688, + "grad_norm": 0.9622070789337158, + "learning_rate": 4.3972836300716034e-06, + "loss": 0.6157, + "step": 4704 + }, + { + "epoch": 1.3773419203747073, + "grad_norm": 1.016133189201355, + "learning_rate": 4.397032850255765e-06, + "loss": 0.628, + "step": 4705 + }, + { + "epoch": 1.3776346604215457, + "grad_norm": 0.9932270050048828, + "learning_rate": 4.396782025432458e-06, + "loss": 0.6643, + "step": 4706 + }, + { + "epoch": 1.3779274004683841, + "grad_norm": 0.9935219883918762, + "learning_rate": 4.396531155607632e-06, + "loss": 0.6313, + "step": 4707 + }, + { + "epoch": 1.3782201405152223, + "grad_norm": 0.9480718374252319, + "learning_rate": 4.396280240787241e-06, + "loss": 0.5955, + "step": 4708 + }, + { + "epoch": 1.3785128805620608, + "grad_norm": 0.9614169597625732, + "learning_rate": 4.396029280977236e-06, + "loss": 0.6638, + "step": 4709 + }, + { + "epoch": 1.3788056206088992, + "grad_norm": 1.0066014528274536, + "learning_rate": 4.395778276183572e-06, + "loss": 0.635, + "step": 4710 + }, + { + "epoch": 1.3790983606557377, + "grad_norm": 0.9610979557037354, + "learning_rate": 4.395527226412205e-06, + "loss": 0.6189, + "step": 4711 + }, + { + "epoch": 1.379391100702576, + "grad_norm": 0.9821563959121704, + "learning_rate": 4.395276131669089e-06, + "loss": 0.6275, + "step": 4712 + }, + { + "epoch": 1.3796838407494145, + "grad_norm": 0.9679578542709351, + "learning_rate": 4.395024991960184e-06, + "loss": 0.5855, + "step": 4713 + }, + { + "epoch": 1.379976580796253, + "grad_norm": 1.0333330631256104, + "learning_rate": 4.394773807291447e-06, + "loss": 0.6891, + "step": 4714 + }, + { + "epoch": 1.3802693208430914, + "grad_norm": 0.9757387042045593, + "learning_rate": 4.394522577668837e-06, + "loss": 0.6015, + "step": 4715 + }, + { + "epoch": 1.3805620608899298, + "grad_norm": 0.9984257817268372, + "learning_rate": 4.394271303098315e-06, + "loss": 0.6443, + "step": 4716 + }, + { + "epoch": 1.3808548009367683, + "grad_norm": 0.9625066518783569, + "learning_rate": 4.394019983585842e-06, + "loss": 0.5861, + "step": 4717 + }, + { + "epoch": 1.3811475409836065, + "grad_norm": 0.9561929702758789, + "learning_rate": 4.393768619137382e-06, + "loss": 0.6127, + "step": 4718 + }, + { + "epoch": 1.381440281030445, + "grad_norm": 0.9774311184883118, + "learning_rate": 4.393517209758896e-06, + "loss": 0.6103, + "step": 4719 + }, + { + "epoch": 1.3817330210772834, + "grad_norm": 0.9514095783233643, + "learning_rate": 4.393265755456352e-06, + "loss": 0.6448, + "step": 4720 + }, + { + "epoch": 1.3820257611241218, + "grad_norm": 0.9340492486953735, + "learning_rate": 4.393014256235713e-06, + "loss": 0.629, + "step": 4721 + }, + { + "epoch": 1.3823185011709602, + "grad_norm": 0.93208909034729, + "learning_rate": 4.392762712102948e-06, + "loss": 0.5939, + "step": 4722 + }, + { + "epoch": 1.3826112412177987, + "grad_norm": 0.9848264455795288, + "learning_rate": 4.3925111230640235e-06, + "loss": 0.608, + "step": 4723 + }, + { + "epoch": 1.3829039812646369, + "grad_norm": 0.9552759528160095, + "learning_rate": 4.39225948912491e-06, + "loss": 0.6203, + "step": 4724 + }, + { + "epoch": 1.3831967213114753, + "grad_norm": 0.9659993052482605, + "learning_rate": 4.392007810291576e-06, + "loss": 0.6201, + "step": 4725 + }, + { + "epoch": 1.3834894613583137, + "grad_norm": 0.9734834432601929, + "learning_rate": 4.391756086569992e-06, + "loss": 0.5861, + "step": 4726 + }, + { + "epoch": 1.3837822014051522, + "grad_norm": 0.96776282787323, + "learning_rate": 4.391504317966133e-06, + "loss": 0.6075, + "step": 4727 + }, + { + "epoch": 1.3840749414519906, + "grad_norm": 1.045521855354309, + "learning_rate": 4.39125250448597e-06, + "loss": 0.6141, + "step": 4728 + }, + { + "epoch": 1.384367681498829, + "grad_norm": 0.9809744954109192, + "learning_rate": 4.391000646135478e-06, + "loss": 0.6393, + "step": 4729 + }, + { + "epoch": 1.3846604215456675, + "grad_norm": 0.9617583751678467, + "learning_rate": 4.390748742920632e-06, + "loss": 0.5828, + "step": 4730 + }, + { + "epoch": 1.384953161592506, + "grad_norm": 0.9856263399124146, + "learning_rate": 4.39049679484741e-06, + "loss": 0.6227, + "step": 4731 + }, + { + "epoch": 1.3852459016393444, + "grad_norm": 1.0022894144058228, + "learning_rate": 4.390244801921787e-06, + "loss": 0.649, + "step": 4732 + }, + { + "epoch": 1.3855386416861828, + "grad_norm": 1.0636755228042603, + "learning_rate": 4.3899927641497435e-06, + "loss": 0.6863, + "step": 4733 + }, + { + "epoch": 1.385831381733021, + "grad_norm": 0.9573457837104797, + "learning_rate": 4.389740681537259e-06, + "loss": 0.6396, + "step": 4734 + }, + { + "epoch": 1.3861241217798594, + "grad_norm": 0.976978600025177, + "learning_rate": 4.389488554090313e-06, + "loss": 0.6142, + "step": 4735 + }, + { + "epoch": 1.3864168618266979, + "grad_norm": 0.9687004089355469, + "learning_rate": 4.389236381814888e-06, + "loss": 0.6057, + "step": 4736 + }, + { + "epoch": 1.3867096018735363, + "grad_norm": 0.9508687257766724, + "learning_rate": 4.388984164716967e-06, + "loss": 0.6172, + "step": 4737 + }, + { + "epoch": 1.3870023419203747, + "grad_norm": 0.9625722169876099, + "learning_rate": 4.388731902802534e-06, + "loss": 0.6174, + "step": 4738 + }, + { + "epoch": 1.3872950819672132, + "grad_norm": 0.966942310333252, + "learning_rate": 4.3884795960775735e-06, + "loss": 0.6437, + "step": 4739 + }, + { + "epoch": 1.3875878220140514, + "grad_norm": 1.0115737915039062, + "learning_rate": 4.388227244548073e-06, + "loss": 0.6002, + "step": 4740 + }, + { + "epoch": 1.3878805620608898, + "grad_norm": 0.9780842661857605, + "learning_rate": 4.387974848220017e-06, + "loss": 0.6427, + "step": 4741 + }, + { + "epoch": 1.3881733021077283, + "grad_norm": 0.9752804040908813, + "learning_rate": 4.387722407099395e-06, + "loss": 0.6285, + "step": 4742 + }, + { + "epoch": 1.3884660421545667, + "grad_norm": 0.9401038289070129, + "learning_rate": 4.387469921192196e-06, + "loss": 0.592, + "step": 4743 + }, + { + "epoch": 1.3887587822014051, + "grad_norm": 0.975214958190918, + "learning_rate": 4.387217390504412e-06, + "loss": 0.6119, + "step": 4744 + }, + { + "epoch": 1.3890515222482436, + "grad_norm": 0.9230300188064575, + "learning_rate": 4.386964815042032e-06, + "loss": 0.5782, + "step": 4745 + }, + { + "epoch": 1.389344262295082, + "grad_norm": 1.080700397491455, + "learning_rate": 4.386712194811049e-06, + "loss": 0.5952, + "step": 4746 + }, + { + "epoch": 1.3896370023419204, + "grad_norm": 0.9380935430526733, + "learning_rate": 4.386459529817457e-06, + "loss": 0.5866, + "step": 4747 + }, + { + "epoch": 1.3899297423887589, + "grad_norm": 0.9538049697875977, + "learning_rate": 4.38620682006725e-06, + "loss": 0.6279, + "step": 4748 + }, + { + "epoch": 1.3902224824355973, + "grad_norm": 0.944389283657074, + "learning_rate": 4.385954065566424e-06, + "loss": 0.5843, + "step": 4749 + }, + { + "epoch": 1.3905152224824355, + "grad_norm": 0.9461725950241089, + "learning_rate": 4.385701266320975e-06, + "loss": 0.6257, + "step": 4750 + }, + { + "epoch": 1.390807962529274, + "grad_norm": 0.9906145930290222, + "learning_rate": 4.385448422336902e-06, + "loss": 0.6167, + "step": 4751 + }, + { + "epoch": 1.3911007025761124, + "grad_norm": 1.0211412906646729, + "learning_rate": 4.385195533620202e-06, + "loss": 0.642, + "step": 4752 + }, + { + "epoch": 1.3913934426229508, + "grad_norm": 1.0134562253952026, + "learning_rate": 4.384942600176877e-06, + "loss": 0.6169, + "step": 4753 + }, + { + "epoch": 1.3916861826697893, + "grad_norm": 0.9813461899757385, + "learning_rate": 4.384689622012925e-06, + "loss": 0.6224, + "step": 4754 + }, + { + "epoch": 1.3919789227166277, + "grad_norm": 0.972854733467102, + "learning_rate": 4.3844365991343515e-06, + "loss": 0.5896, + "step": 4755 + }, + { + "epoch": 1.392271662763466, + "grad_norm": 0.9697772860527039, + "learning_rate": 4.384183531547156e-06, + "loss": 0.6164, + "step": 4756 + }, + { + "epoch": 1.3925644028103044, + "grad_norm": 0.985464870929718, + "learning_rate": 4.383930419257345e-06, + "loss": 0.6241, + "step": 4757 + }, + { + "epoch": 1.3928571428571428, + "grad_norm": 0.9683478474617004, + "learning_rate": 4.3836772622709234e-06, + "loss": 0.6411, + "step": 4758 + }, + { + "epoch": 1.3931498829039812, + "grad_norm": 0.9821563363075256, + "learning_rate": 4.383424060593896e-06, + "loss": 0.5814, + "step": 4759 + }, + { + "epoch": 1.3934426229508197, + "grad_norm": 0.9478673934936523, + "learning_rate": 4.383170814232272e-06, + "loss": 0.5926, + "step": 4760 + }, + { + "epoch": 1.393735362997658, + "grad_norm": 1.0096700191497803, + "learning_rate": 4.382917523192056e-06, + "loss": 0.6147, + "step": 4761 + }, + { + "epoch": 1.3940281030444965, + "grad_norm": 0.9714099764823914, + "learning_rate": 4.3826641874792625e-06, + "loss": 0.5941, + "step": 4762 + }, + { + "epoch": 1.394320843091335, + "grad_norm": 0.9664632678031921, + "learning_rate": 4.382410807099899e-06, + "loss": 0.5718, + "step": 4763 + }, + { + "epoch": 1.3946135831381734, + "grad_norm": 0.9877532720565796, + "learning_rate": 4.3821573820599775e-06, + "loss": 0.5711, + "step": 4764 + }, + { + "epoch": 1.3949063231850118, + "grad_norm": 0.9563380479812622, + "learning_rate": 4.381903912365509e-06, + "loss": 0.5926, + "step": 4765 + }, + { + "epoch": 1.39519906323185, + "grad_norm": 0.9857602119445801, + "learning_rate": 4.3816503980225105e-06, + "loss": 0.6174, + "step": 4766 + }, + { + "epoch": 1.3954918032786885, + "grad_norm": 0.9893366694450378, + "learning_rate": 4.381396839036994e-06, + "loss": 0.6192, + "step": 4767 + }, + { + "epoch": 1.395784543325527, + "grad_norm": 1.0581691265106201, + "learning_rate": 4.381143235414976e-06, + "loss": 0.6562, + "step": 4768 + }, + { + "epoch": 1.3960772833723654, + "grad_norm": 0.9746310114860535, + "learning_rate": 4.380889587162474e-06, + "loss": 0.6214, + "step": 4769 + }, + { + "epoch": 1.3963700234192038, + "grad_norm": 0.9975482821464539, + "learning_rate": 4.380635894285504e-06, + "loss": 0.6608, + "step": 4770 + }, + { + "epoch": 1.3966627634660422, + "grad_norm": 0.9543734192848206, + "learning_rate": 4.3803821567900874e-06, + "loss": 0.6427, + "step": 4771 + }, + { + "epoch": 1.3969555035128804, + "grad_norm": 0.9809375405311584, + "learning_rate": 4.380128374682242e-06, + "loss": 0.5945, + "step": 4772 + }, + { + "epoch": 1.3972482435597189, + "grad_norm": 0.9785882234573364, + "learning_rate": 4.37987454796799e-06, + "loss": 0.6415, + "step": 4773 + }, + { + "epoch": 1.3975409836065573, + "grad_norm": 0.9811891317367554, + "learning_rate": 4.379620676653353e-06, + "loss": 0.6295, + "step": 4774 + }, + { + "epoch": 1.3978337236533958, + "grad_norm": 1.0255491733551025, + "learning_rate": 4.379366760744354e-06, + "loss": 0.6302, + "step": 4775 + }, + { + "epoch": 1.3981264637002342, + "grad_norm": 0.9334329962730408, + "learning_rate": 4.379112800247019e-06, + "loss": 0.5938, + "step": 4776 + }, + { + "epoch": 1.3984192037470726, + "grad_norm": 0.9143648147583008, + "learning_rate": 4.378858795167371e-06, + "loss": 0.5903, + "step": 4777 + }, + { + "epoch": 1.398711943793911, + "grad_norm": 0.9580702185630798, + "learning_rate": 4.3786047455114364e-06, + "loss": 0.6033, + "step": 4778 + }, + { + "epoch": 1.3990046838407495, + "grad_norm": 0.9583886861801147, + "learning_rate": 4.3783506512852435e-06, + "loss": 0.6495, + "step": 4779 + }, + { + "epoch": 1.399297423887588, + "grad_norm": 0.9613014459609985, + "learning_rate": 4.378096512494822e-06, + "loss": 0.6035, + "step": 4780 + }, + { + "epoch": 1.3995901639344264, + "grad_norm": 1.0258708000183105, + "learning_rate": 4.377842329146198e-06, + "loss": 0.6814, + "step": 4781 + }, + { + "epoch": 1.3998829039812646, + "grad_norm": 0.9862787127494812, + "learning_rate": 4.377588101245405e-06, + "loss": 0.6467, + "step": 4782 + }, + { + "epoch": 1.400175644028103, + "grad_norm": 1.022156834602356, + "learning_rate": 4.377333828798473e-06, + "loss": 0.6292, + "step": 4783 + }, + { + "epoch": 1.4004683840749415, + "grad_norm": 0.9601110219955444, + "learning_rate": 4.3770795118114354e-06, + "loss": 0.6228, + "step": 4784 + }, + { + "epoch": 1.4007611241217799, + "grad_norm": 0.939481258392334, + "learning_rate": 4.376825150290326e-06, + "loss": 0.6238, + "step": 4785 + }, + { + "epoch": 1.4010538641686183, + "grad_norm": 0.9832646250724792, + "learning_rate": 4.376570744241179e-06, + "loss": 0.5877, + "step": 4786 + }, + { + "epoch": 1.4013466042154565, + "grad_norm": 0.894860565662384, + "learning_rate": 4.376316293670031e-06, + "loss": 0.5971, + "step": 4787 + }, + { + "epoch": 1.401639344262295, + "grad_norm": 0.9703342318534851, + "learning_rate": 4.376061798582918e-06, + "loss": 0.6426, + "step": 4788 + }, + { + "epoch": 1.4019320843091334, + "grad_norm": 0.9586917757987976, + "learning_rate": 4.375807258985879e-06, + "loss": 0.5941, + "step": 4789 + }, + { + "epoch": 1.4022248243559718, + "grad_norm": 0.9642534255981445, + "learning_rate": 4.375552674884952e-06, + "loss": 0.605, + "step": 4790 + }, + { + "epoch": 1.4025175644028103, + "grad_norm": 0.9387715458869934, + "learning_rate": 4.375298046286177e-06, + "loss": 0.6253, + "step": 4791 + }, + { + "epoch": 1.4028103044496487, + "grad_norm": 0.9559980034828186, + "learning_rate": 4.375043373195597e-06, + "loss": 0.6303, + "step": 4792 + }, + { + "epoch": 1.4031030444964872, + "grad_norm": 1.056389570236206, + "learning_rate": 4.374788655619251e-06, + "loss": 0.6138, + "step": 4793 + }, + { + "epoch": 1.4033957845433256, + "grad_norm": 1.059765100479126, + "learning_rate": 4.374533893563185e-06, + "loss": 0.6299, + "step": 4794 + }, + { + "epoch": 1.403688524590164, + "grad_norm": 1.0305051803588867, + "learning_rate": 4.3742790870334415e-06, + "loss": 0.6311, + "step": 4795 + }, + { + "epoch": 1.4039812646370025, + "grad_norm": 1.0226516723632812, + "learning_rate": 4.374024236036068e-06, + "loss": 0.6551, + "step": 4796 + }, + { + "epoch": 1.4042740046838407, + "grad_norm": 0.9635087251663208, + "learning_rate": 4.373769340577108e-06, + "loss": 0.6057, + "step": 4797 + }, + { + "epoch": 1.404566744730679, + "grad_norm": 0.9903846383094788, + "learning_rate": 4.3735144006626104e-06, + "loss": 0.6282, + "step": 4798 + }, + { + "epoch": 1.4048594847775175, + "grad_norm": 0.9964038729667664, + "learning_rate": 4.373259416298624e-06, + "loss": 0.605, + "step": 4799 + }, + { + "epoch": 1.405152224824356, + "grad_norm": 0.9711071252822876, + "learning_rate": 4.373004387491199e-06, + "loss": 0.579, + "step": 4800 + }, + { + "epoch": 1.4054449648711944, + "grad_norm": 0.9616775512695312, + "learning_rate": 4.372749314246384e-06, + "loss": 0.6052, + "step": 4801 + }, + { + "epoch": 1.4057377049180328, + "grad_norm": 0.9605329036712646, + "learning_rate": 4.3724941965702325e-06, + "loss": 0.6277, + "step": 4802 + }, + { + "epoch": 1.406030444964871, + "grad_norm": 0.9648794531822205, + "learning_rate": 4.372239034468796e-06, + "loss": 0.6051, + "step": 4803 + }, + { + "epoch": 1.4063231850117095, + "grad_norm": 0.9871491193771362, + "learning_rate": 4.3719838279481286e-06, + "loss": 0.6089, + "step": 4804 + }, + { + "epoch": 1.406615925058548, + "grad_norm": 0.9717026352882385, + "learning_rate": 4.371728577014285e-06, + "loss": 0.6295, + "step": 4805 + }, + { + "epoch": 1.4069086651053864, + "grad_norm": 0.9603911638259888, + "learning_rate": 4.371473281673322e-06, + "loss": 0.622, + "step": 4806 + }, + { + "epoch": 1.4072014051522248, + "grad_norm": 0.9673033356666565, + "learning_rate": 4.371217941931295e-06, + "loss": 0.6375, + "step": 4807 + }, + { + "epoch": 1.4074941451990632, + "grad_norm": 0.9402182102203369, + "learning_rate": 4.370962557794263e-06, + "loss": 0.6191, + "step": 4808 + }, + { + "epoch": 1.4077868852459017, + "grad_norm": 0.9693348407745361, + "learning_rate": 4.370707129268285e-06, + "loss": 0.597, + "step": 4809 + }, + { + "epoch": 1.4080796252927401, + "grad_norm": 0.9495952129364014, + "learning_rate": 4.370451656359421e-06, + "loss": 0.6169, + "step": 4810 + }, + { + "epoch": 1.4083723653395785, + "grad_norm": 0.9754199385643005, + "learning_rate": 4.370196139073733e-06, + "loss": 0.6282, + "step": 4811 + }, + { + "epoch": 1.408665105386417, + "grad_norm": 0.9755079746246338, + "learning_rate": 4.369940577417282e-06, + "loss": 0.6398, + "step": 4812 + }, + { + "epoch": 1.4089578454332552, + "grad_norm": 0.9719329476356506, + "learning_rate": 4.369684971396131e-06, + "loss": 0.6217, + "step": 4813 + }, + { + "epoch": 1.4092505854800936, + "grad_norm": 1.0751240253448486, + "learning_rate": 4.369429321016344e-06, + "loss": 0.6178, + "step": 4814 + }, + { + "epoch": 1.409543325526932, + "grad_norm": 1.0086684226989746, + "learning_rate": 4.369173626283988e-06, + "loss": 0.6443, + "step": 4815 + }, + { + "epoch": 1.4098360655737705, + "grad_norm": 0.9757355451583862, + "learning_rate": 4.368917887205129e-06, + "loss": 0.5812, + "step": 4816 + }, + { + "epoch": 1.410128805620609, + "grad_norm": 0.9646985530853271, + "learning_rate": 4.368662103785834e-06, + "loss": 0.5465, + "step": 4817 + }, + { + "epoch": 1.4104215456674474, + "grad_norm": 0.9516500234603882, + "learning_rate": 4.368406276032172e-06, + "loss": 0.6466, + "step": 4818 + }, + { + "epoch": 1.4107142857142856, + "grad_norm": 0.9795973896980286, + "learning_rate": 4.368150403950211e-06, + "loss": 0.6295, + "step": 4819 + }, + { + "epoch": 1.411007025761124, + "grad_norm": 0.9289286136627197, + "learning_rate": 4.367894487546023e-06, + "loss": 0.6095, + "step": 4820 + }, + { + "epoch": 1.4112997658079625, + "grad_norm": 0.9682134389877319, + "learning_rate": 4.367638526825681e-06, + "loss": 0.6172, + "step": 4821 + }, + { + "epoch": 1.411592505854801, + "grad_norm": 0.9668240547180176, + "learning_rate": 4.367382521795253e-06, + "loss": 0.6164, + "step": 4822 + }, + { + "epoch": 1.4118852459016393, + "grad_norm": 0.9580768346786499, + "learning_rate": 4.3671264724608185e-06, + "loss": 0.6235, + "step": 4823 + }, + { + "epoch": 1.4121779859484778, + "grad_norm": 0.9920182228088379, + "learning_rate": 4.366870378828449e-06, + "loss": 0.5738, + "step": 4824 + }, + { + "epoch": 1.4124707259953162, + "grad_norm": 0.9186362624168396, + "learning_rate": 4.366614240904221e-06, + "loss": 0.6569, + "step": 4825 + }, + { + "epoch": 1.4127634660421546, + "grad_norm": 0.9379276037216187, + "learning_rate": 4.366358058694211e-06, + "loss": 0.6447, + "step": 4826 + }, + { + "epoch": 1.413056206088993, + "grad_norm": 0.9928176999092102, + "learning_rate": 4.366101832204499e-06, + "loss": 0.6313, + "step": 4827 + }, + { + "epoch": 1.4133489461358315, + "grad_norm": 0.8956465721130371, + "learning_rate": 4.365845561441161e-06, + "loss": 0.5879, + "step": 4828 + }, + { + "epoch": 1.4136416861826697, + "grad_norm": 0.9387831091880798, + "learning_rate": 4.3655892464102785e-06, + "loss": 0.601, + "step": 4829 + }, + { + "epoch": 1.4139344262295082, + "grad_norm": 0.9742505550384521, + "learning_rate": 4.365332887117934e-06, + "loss": 0.6187, + "step": 4830 + }, + { + "epoch": 1.4142271662763466, + "grad_norm": 0.943447470664978, + "learning_rate": 4.3650764835702085e-06, + "loss": 0.6177, + "step": 4831 + }, + { + "epoch": 1.414519906323185, + "grad_norm": 0.9748445153236389, + "learning_rate": 4.3648200357731844e-06, + "loss": 0.6201, + "step": 4832 + }, + { + "epoch": 1.4148126463700235, + "grad_norm": 0.9662564992904663, + "learning_rate": 4.364563543732946e-06, + "loss": 0.636, + "step": 4833 + }, + { + "epoch": 1.415105386416862, + "grad_norm": 0.9702277183532715, + "learning_rate": 4.36430700745558e-06, + "loss": 0.6295, + "step": 4834 + }, + { + "epoch": 1.4153981264637001, + "grad_norm": 0.9613192677497864, + "learning_rate": 4.3640504269471715e-06, + "loss": 0.6542, + "step": 4835 + }, + { + "epoch": 1.4156908665105385, + "grad_norm": 0.9741793870925903, + "learning_rate": 4.3637938022138095e-06, + "loss": 0.6633, + "step": 4836 + }, + { + "epoch": 1.415983606557377, + "grad_norm": 0.9882297515869141, + "learning_rate": 4.3635371332615804e-06, + "loss": 0.629, + "step": 4837 + }, + { + "epoch": 1.4162763466042154, + "grad_norm": 0.9969061017036438, + "learning_rate": 4.363280420096576e-06, + "loss": 0.6262, + "step": 4838 + }, + { + "epoch": 1.4165690866510539, + "grad_norm": 1.0084924697875977, + "learning_rate": 4.363023662724885e-06, + "loss": 0.5903, + "step": 4839 + }, + { + "epoch": 1.4168618266978923, + "grad_norm": 0.9888980984687805, + "learning_rate": 4.3627668611526e-06, + "loss": 0.6228, + "step": 4840 + }, + { + "epoch": 1.4171545667447307, + "grad_norm": 0.8987814784049988, + "learning_rate": 4.362510015385812e-06, + "loss": 0.5775, + "step": 4841 + }, + { + "epoch": 1.4174473067915692, + "grad_norm": 0.9789316058158875, + "learning_rate": 4.362253125430618e-06, + "loss": 0.6224, + "step": 4842 + }, + { + "epoch": 1.4177400468384076, + "grad_norm": 1.055660367012024, + "learning_rate": 4.36199619129311e-06, + "loss": 0.609, + "step": 4843 + }, + { + "epoch": 1.418032786885246, + "grad_norm": 0.9451059699058533, + "learning_rate": 4.361739212979384e-06, + "loss": 0.6483, + "step": 4844 + }, + { + "epoch": 1.4183255269320842, + "grad_norm": 0.9071941375732422, + "learning_rate": 4.361482190495538e-06, + "loss": 0.6058, + "step": 4845 + }, + { + "epoch": 1.4186182669789227, + "grad_norm": 0.9248945116996765, + "learning_rate": 4.3612251238476695e-06, + "loss": 0.6013, + "step": 4846 + }, + { + "epoch": 1.4189110070257611, + "grad_norm": 0.9196217656135559, + "learning_rate": 4.3609680130418775e-06, + "loss": 0.5854, + "step": 4847 + }, + { + "epoch": 1.4192037470725996, + "grad_norm": 0.9479001760482788, + "learning_rate": 4.360710858084262e-06, + "loss": 0.6289, + "step": 4848 + }, + { + "epoch": 1.419496487119438, + "grad_norm": 1.0375173091888428, + "learning_rate": 4.360453658980923e-06, + "loss": 0.6117, + "step": 4849 + }, + { + "epoch": 1.4197892271662764, + "grad_norm": 0.9747163653373718, + "learning_rate": 4.360196415737964e-06, + "loss": 0.5925, + "step": 4850 + }, + { + "epoch": 1.4200819672131146, + "grad_norm": 1.0007786750793457, + "learning_rate": 4.3599391283614874e-06, + "loss": 0.6644, + "step": 4851 + }, + { + "epoch": 1.420374707259953, + "grad_norm": 0.9534451365470886, + "learning_rate": 4.359681796857598e-06, + "loss": 0.6118, + "step": 4852 + }, + { + "epoch": 1.4206674473067915, + "grad_norm": 0.9887961745262146, + "learning_rate": 4.359424421232401e-06, + "loss": 0.6478, + "step": 4853 + }, + { + "epoch": 1.42096018735363, + "grad_norm": 0.9577417373657227, + "learning_rate": 4.359167001492001e-06, + "loss": 0.6153, + "step": 4854 + }, + { + "epoch": 1.4212529274004684, + "grad_norm": 0.9501870274543762, + "learning_rate": 4.3589095376425084e-06, + "loss": 0.5843, + "step": 4855 + }, + { + "epoch": 1.4215456674473068, + "grad_norm": 1.1090806722640991, + "learning_rate": 4.358652029690029e-06, + "loss": 0.628, + "step": 4856 + }, + { + "epoch": 1.4218384074941453, + "grad_norm": 1.0012975931167603, + "learning_rate": 4.3583944776406735e-06, + "loss": 0.6908, + "step": 4857 + }, + { + "epoch": 1.4221311475409837, + "grad_norm": 0.9532277584075928, + "learning_rate": 4.358136881500552e-06, + "loss": 0.6237, + "step": 4858 + }, + { + "epoch": 1.4224238875878221, + "grad_norm": 0.9607595205307007, + "learning_rate": 4.357879241275776e-06, + "loss": 0.673, + "step": 4859 + }, + { + "epoch": 1.4227166276346606, + "grad_norm": 0.9160950779914856, + "learning_rate": 4.357621556972458e-06, + "loss": 0.6201, + "step": 4860 + }, + { + "epoch": 1.4230093676814988, + "grad_norm": 1.0685043334960938, + "learning_rate": 4.3573638285967115e-06, + "loss": 0.6644, + "step": 4861 + }, + { + "epoch": 1.4233021077283372, + "grad_norm": 0.9403588771820068, + "learning_rate": 4.3571060561546506e-06, + "loss": 0.6323, + "step": 4862 + }, + { + "epoch": 1.4235948477751756, + "grad_norm": 1.3006690740585327, + "learning_rate": 4.356848239652393e-06, + "loss": 0.645, + "step": 4863 + }, + { + "epoch": 1.423887587822014, + "grad_norm": 0.9822148680686951, + "learning_rate": 4.356590379096053e-06, + "loss": 0.6621, + "step": 4864 + }, + { + "epoch": 1.4241803278688525, + "grad_norm": 1.0304076671600342, + "learning_rate": 4.356332474491751e-06, + "loss": 0.6123, + "step": 4865 + }, + { + "epoch": 1.424473067915691, + "grad_norm": 0.9576582312583923, + "learning_rate": 4.3560745258456035e-06, + "loss": 0.6212, + "step": 4866 + }, + { + "epoch": 1.4247658079625292, + "grad_norm": 0.9483251571655273, + "learning_rate": 4.355816533163731e-06, + "loss": 0.6005, + "step": 4867 + }, + { + "epoch": 1.4250585480093676, + "grad_norm": 1.0577986240386963, + "learning_rate": 4.355558496452255e-06, + "loss": 0.6381, + "step": 4868 + }, + { + "epoch": 1.425351288056206, + "grad_norm": 1.018384575843811, + "learning_rate": 4.355300415717297e-06, + "loss": 0.6123, + "step": 4869 + }, + { + "epoch": 1.4256440281030445, + "grad_norm": 0.9508116245269775, + "learning_rate": 4.355042290964981e-06, + "loss": 0.6249, + "step": 4870 + }, + { + "epoch": 1.425936768149883, + "grad_norm": 0.9601740837097168, + "learning_rate": 4.3547841222014295e-06, + "loss": 0.6134, + "step": 4871 + }, + { + "epoch": 1.4262295081967213, + "grad_norm": 0.9893966913223267, + "learning_rate": 4.354525909432769e-06, + "loss": 0.6096, + "step": 4872 + }, + { + "epoch": 1.4265222482435598, + "grad_norm": 0.9920058250427246, + "learning_rate": 4.354267652665124e-06, + "loss": 0.6315, + "step": 4873 + }, + { + "epoch": 1.4268149882903982, + "grad_norm": 0.9220346808433533, + "learning_rate": 4.3540093519046234e-06, + "loss": 0.6081, + "step": 4874 + }, + { + "epoch": 1.4271077283372366, + "grad_norm": 0.9091201424598694, + "learning_rate": 4.353751007157395e-06, + "loss": 0.6333, + "step": 4875 + }, + { + "epoch": 1.4274004683840749, + "grad_norm": 0.9617208242416382, + "learning_rate": 4.353492618429567e-06, + "loss": 0.5951, + "step": 4876 + }, + { + "epoch": 1.4276932084309133, + "grad_norm": 0.9553700089454651, + "learning_rate": 4.353234185727272e-06, + "loss": 0.6285, + "step": 4877 + }, + { + "epoch": 1.4279859484777517, + "grad_norm": 0.9520038366317749, + "learning_rate": 4.352975709056638e-06, + "loss": 0.6072, + "step": 4878 + }, + { + "epoch": 1.4282786885245902, + "grad_norm": 0.9540134072303772, + "learning_rate": 4.352717188423801e-06, + "loss": 0.6098, + "step": 4879 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.9400495886802673, + "learning_rate": 4.352458623834891e-06, + "loss": 0.6164, + "step": 4880 + }, + { + "epoch": 1.428864168618267, + "grad_norm": 1.0380412340164185, + "learning_rate": 4.352200015296045e-06, + "loss": 0.6279, + "step": 4881 + }, + { + "epoch": 1.4291569086651053, + "grad_norm": 0.9240803718566895, + "learning_rate": 4.351941362813399e-06, + "loss": 0.6275, + "step": 4882 + }, + { + "epoch": 1.4294496487119437, + "grad_norm": 1.0016541481018066, + "learning_rate": 4.351682666393087e-06, + "loss": 0.6197, + "step": 4883 + }, + { + "epoch": 1.4297423887587821, + "grad_norm": 0.9833483695983887, + "learning_rate": 4.351423926041248e-06, + "loss": 0.6419, + "step": 4884 + }, + { + "epoch": 1.4300351288056206, + "grad_norm": 0.9942355155944824, + "learning_rate": 4.351165141764021e-06, + "loss": 0.633, + "step": 4885 + }, + { + "epoch": 1.430327868852459, + "grad_norm": 0.9601382613182068, + "learning_rate": 4.350906313567546e-06, + "loss": 0.5788, + "step": 4886 + }, + { + "epoch": 1.4306206088992974, + "grad_norm": 0.9403722882270813, + "learning_rate": 4.350647441457961e-06, + "loss": 0.5886, + "step": 4887 + }, + { + "epoch": 1.4309133489461359, + "grad_norm": 0.9692981243133545, + "learning_rate": 4.350388525441412e-06, + "loss": 0.6466, + "step": 4888 + }, + { + "epoch": 1.4312060889929743, + "grad_norm": 0.9540292620658875, + "learning_rate": 4.3501295655240394e-06, + "loss": 0.6181, + "step": 4889 + }, + { + "epoch": 1.4314988290398127, + "grad_norm": 0.9581567049026489, + "learning_rate": 4.349870561711987e-06, + "loss": 0.5963, + "step": 4890 + }, + { + "epoch": 1.4317915690866512, + "grad_norm": 0.9672182202339172, + "learning_rate": 4.3496115140114e-06, + "loss": 0.5919, + "step": 4891 + }, + { + "epoch": 1.4320843091334894, + "grad_norm": 1.0417258739471436, + "learning_rate": 4.349352422428425e-06, + "loss": 0.6351, + "step": 4892 + }, + { + "epoch": 1.4323770491803278, + "grad_norm": 0.9477212429046631, + "learning_rate": 4.349093286969208e-06, + "loss": 0.6076, + "step": 4893 + }, + { + "epoch": 1.4326697892271663, + "grad_norm": 0.9817236065864563, + "learning_rate": 4.348834107639898e-06, + "loss": 0.6267, + "step": 4894 + }, + { + "epoch": 1.4329625292740047, + "grad_norm": 0.9812107682228088, + "learning_rate": 4.348574884446643e-06, + "loss": 0.6359, + "step": 4895 + }, + { + "epoch": 1.4332552693208431, + "grad_norm": 0.9424794316291809, + "learning_rate": 4.348315617395594e-06, + "loss": 0.5599, + "step": 4896 + }, + { + "epoch": 1.4335480093676816, + "grad_norm": 1.0661001205444336, + "learning_rate": 4.348056306492902e-06, + "loss": 0.6395, + "step": 4897 + }, + { + "epoch": 1.4338407494145198, + "grad_norm": 0.9622060656547546, + "learning_rate": 4.347796951744718e-06, + "loss": 0.617, + "step": 4898 + }, + { + "epoch": 1.4341334894613582, + "grad_norm": 0.9900948405265808, + "learning_rate": 4.347537553157199e-06, + "loss": 0.6269, + "step": 4899 + }, + { + "epoch": 1.4344262295081966, + "grad_norm": 0.9889407753944397, + "learning_rate": 4.347278110736494e-06, + "loss": 0.6179, + "step": 4900 + }, + { + "epoch": 1.434718969555035, + "grad_norm": 1.0914338827133179, + "learning_rate": 4.347018624488762e-06, + "loss": 0.6423, + "step": 4901 + }, + { + "epoch": 1.4350117096018735, + "grad_norm": 0.9528440833091736, + "learning_rate": 4.346759094420158e-06, + "loss": 0.6142, + "step": 4902 + }, + { + "epoch": 1.435304449648712, + "grad_norm": 0.9915702939033508, + "learning_rate": 4.346499520536839e-06, + "loss": 0.6131, + "step": 4903 + }, + { + "epoch": 1.4355971896955504, + "grad_norm": 0.925599217414856, + "learning_rate": 4.3462399028449654e-06, + "loss": 0.6214, + "step": 4904 + }, + { + "epoch": 1.4358899297423888, + "grad_norm": 0.8880231380462646, + "learning_rate": 4.345980241350695e-06, + "loss": 0.5727, + "step": 4905 + }, + { + "epoch": 1.4361826697892273, + "grad_norm": 1.0267466306686401, + "learning_rate": 4.345720536060188e-06, + "loss": 0.6506, + "step": 4906 + }, + { + "epoch": 1.4364754098360657, + "grad_norm": 0.9897314310073853, + "learning_rate": 4.345460786979608e-06, + "loss": 0.5994, + "step": 4907 + }, + { + "epoch": 1.436768149882904, + "grad_norm": 0.9277349710464478, + "learning_rate": 4.3452009941151155e-06, + "loss": 0.6192, + "step": 4908 + }, + { + "epoch": 1.4370608899297423, + "grad_norm": 1.3960142135620117, + "learning_rate": 4.344941157472875e-06, + "loss": 0.6094, + "step": 4909 + }, + { + "epoch": 1.4373536299765808, + "grad_norm": 0.9618222117424011, + "learning_rate": 4.344681277059051e-06, + "loss": 0.6325, + "step": 4910 + }, + { + "epoch": 1.4376463700234192, + "grad_norm": 0.9286855459213257, + "learning_rate": 4.344421352879809e-06, + "loss": 0.588, + "step": 4911 + }, + { + "epoch": 1.4379391100702577, + "grad_norm": 0.9685856699943542, + "learning_rate": 4.3441613849413166e-06, + "loss": 0.5764, + "step": 4912 + }, + { + "epoch": 1.438231850117096, + "grad_norm": 0.9659003615379333, + "learning_rate": 4.343901373249741e-06, + "loss": 0.5916, + "step": 4913 + }, + { + "epoch": 1.4385245901639343, + "grad_norm": 0.9806279540061951, + "learning_rate": 4.34364131781125e-06, + "loss": 0.6496, + "step": 4914 + }, + { + "epoch": 1.4388173302107727, + "grad_norm": 0.970337986946106, + "learning_rate": 4.343381218632016e-06, + "loss": 0.6032, + "step": 4915 + }, + { + "epoch": 1.4391100702576112, + "grad_norm": 1.0136358737945557, + "learning_rate": 4.343121075718208e-06, + "loss": 0.6706, + "step": 4916 + }, + { + "epoch": 1.4394028103044496, + "grad_norm": 1.0752170085906982, + "learning_rate": 4.342860889075999e-06, + "loss": 0.6132, + "step": 4917 + }, + { + "epoch": 1.439695550351288, + "grad_norm": 1.007110357284546, + "learning_rate": 4.3426006587115595e-06, + "loss": 0.6724, + "step": 4918 + }, + { + "epoch": 1.4399882903981265, + "grad_norm": 1.0118402242660522, + "learning_rate": 4.3423403846310665e-06, + "loss": 0.6699, + "step": 4919 + }, + { + "epoch": 1.440281030444965, + "grad_norm": 0.9693822264671326, + "learning_rate": 4.342080066840694e-06, + "loss": 0.6332, + "step": 4920 + }, + { + "epoch": 1.4405737704918034, + "grad_norm": 0.9619869589805603, + "learning_rate": 4.341819705346618e-06, + "loss": 0.5525, + "step": 4921 + }, + { + "epoch": 1.4408665105386418, + "grad_norm": 1.0343364477157593, + "learning_rate": 4.341559300155016e-06, + "loss": 0.5717, + "step": 4922 + }, + { + "epoch": 1.4411592505854802, + "grad_norm": 0.9504677653312683, + "learning_rate": 4.341298851272065e-06, + "loss": 0.6236, + "step": 4923 + }, + { + "epoch": 1.4414519906323184, + "grad_norm": 0.97269207239151, + "learning_rate": 4.341038358703945e-06, + "loss": 0.5744, + "step": 4924 + }, + { + "epoch": 1.4417447306791569, + "grad_norm": 0.964253842830658, + "learning_rate": 4.340777822456837e-06, + "loss": 0.6521, + "step": 4925 + }, + { + "epoch": 1.4420374707259953, + "grad_norm": 0.9905925989151001, + "learning_rate": 4.3405172425369205e-06, + "loss": 0.6088, + "step": 4926 + }, + { + "epoch": 1.4423302107728337, + "grad_norm": 0.9668634533882141, + "learning_rate": 4.34025661895038e-06, + "loss": 0.6597, + "step": 4927 + }, + { + "epoch": 1.4426229508196722, + "grad_norm": 0.9926806688308716, + "learning_rate": 4.3399959517033975e-06, + "loss": 0.6126, + "step": 4928 + }, + { + "epoch": 1.4429156908665106, + "grad_norm": 0.9740691184997559, + "learning_rate": 4.339735240802156e-06, + "loss": 0.636, + "step": 4929 + }, + { + "epoch": 1.4432084309133488, + "grad_norm": 0.9447486996650696, + "learning_rate": 4.339474486252844e-06, + "loss": 0.6499, + "step": 4930 + }, + { + "epoch": 1.4435011709601873, + "grad_norm": 1.0007342100143433, + "learning_rate": 4.339213688061647e-06, + "loss": 0.6344, + "step": 4931 + }, + { + "epoch": 1.4437939110070257, + "grad_norm": 0.985499918460846, + "learning_rate": 4.33895284623475e-06, + "loss": 0.5755, + "step": 4932 + }, + { + "epoch": 1.4440866510538641, + "grad_norm": 0.9719542264938354, + "learning_rate": 4.338691960778344e-06, + "loss": 0.6657, + "step": 4933 + }, + { + "epoch": 1.4443793911007026, + "grad_norm": 0.9702024459838867, + "learning_rate": 4.338431031698619e-06, + "loss": 0.6324, + "step": 4934 + }, + { + "epoch": 1.444672131147541, + "grad_norm": 1.0152117013931274, + "learning_rate": 4.338170059001764e-06, + "loss": 0.6274, + "step": 4935 + }, + { + "epoch": 1.4449648711943794, + "grad_norm": 0.981628954410553, + "learning_rate": 4.337909042693971e-06, + "loss": 0.6401, + "step": 4936 + }, + { + "epoch": 1.4452576112412179, + "grad_norm": 0.9614027142524719, + "learning_rate": 4.337647982781432e-06, + "loss": 0.6304, + "step": 4937 + }, + { + "epoch": 1.4455503512880563, + "grad_norm": 0.8774266839027405, + "learning_rate": 4.337386879270343e-06, + "loss": 0.553, + "step": 4938 + }, + { + "epoch": 1.4458430913348947, + "grad_norm": 1.2423622608184814, + "learning_rate": 4.337125732166896e-06, + "loss": 0.6346, + "step": 4939 + }, + { + "epoch": 1.446135831381733, + "grad_norm": 0.9632560014724731, + "learning_rate": 4.3368645414772895e-06, + "loss": 0.6338, + "step": 4940 + }, + { + "epoch": 1.4464285714285714, + "grad_norm": 1.0148054361343384, + "learning_rate": 4.336603307207718e-06, + "loss": 0.6135, + "step": 4941 + }, + { + "epoch": 1.4467213114754098, + "grad_norm": 0.9192243218421936, + "learning_rate": 4.33634202936438e-06, + "loss": 0.6063, + "step": 4942 + }, + { + "epoch": 1.4470140515222483, + "grad_norm": 0.9496555924415588, + "learning_rate": 4.336080707953475e-06, + "loss": 0.6099, + "step": 4943 + }, + { + "epoch": 1.4473067915690867, + "grad_norm": 1.0192512273788452, + "learning_rate": 4.335819342981202e-06, + "loss": 0.6396, + "step": 4944 + }, + { + "epoch": 1.4475995316159251, + "grad_norm": 0.960931658744812, + "learning_rate": 4.335557934453764e-06, + "loss": 0.6417, + "step": 4945 + }, + { + "epoch": 1.4478922716627634, + "grad_norm": 0.9722139239311218, + "learning_rate": 4.3352964823773595e-06, + "loss": 0.5445, + "step": 4946 + }, + { + "epoch": 1.4481850117096018, + "grad_norm": 1.0103775262832642, + "learning_rate": 4.335034986758194e-06, + "loss": 0.6227, + "step": 4947 + }, + { + "epoch": 1.4484777517564402, + "grad_norm": 0.9751127362251282, + "learning_rate": 4.33477344760247e-06, + "loss": 0.5787, + "step": 4948 + }, + { + "epoch": 1.4487704918032787, + "grad_norm": 0.9400356411933899, + "learning_rate": 4.334511864916394e-06, + "loss": 0.5535, + "step": 4949 + }, + { + "epoch": 1.449063231850117, + "grad_norm": 0.9149196743965149, + "learning_rate": 4.3342502387061725e-06, + "loss": 0.5428, + "step": 4950 + }, + { + "epoch": 1.4493559718969555, + "grad_norm": 0.9899505972862244, + "learning_rate": 4.333988568978012e-06, + "loss": 0.6138, + "step": 4951 + }, + { + "epoch": 1.449648711943794, + "grad_norm": 1.0514671802520752, + "learning_rate": 4.333726855738118e-06, + "loss": 0.6179, + "step": 4952 + }, + { + "epoch": 1.4499414519906324, + "grad_norm": 0.9508295059204102, + "learning_rate": 4.333465098992704e-06, + "loss": 0.5964, + "step": 4953 + }, + { + "epoch": 1.4502341920374708, + "grad_norm": 0.9820045828819275, + "learning_rate": 4.333203298747978e-06, + "loss": 0.6069, + "step": 4954 + }, + { + "epoch": 1.450526932084309, + "grad_norm": 0.9871376156806946, + "learning_rate": 4.3329414550101515e-06, + "loss": 0.6353, + "step": 4955 + }, + { + "epoch": 1.4508196721311475, + "grad_norm": 1.0530534982681274, + "learning_rate": 4.332679567785436e-06, + "loss": 0.6307, + "step": 4956 + }, + { + "epoch": 1.451112412177986, + "grad_norm": 0.974890947341919, + "learning_rate": 4.332417637080046e-06, + "loss": 0.5601, + "step": 4957 + }, + { + "epoch": 1.4514051522248244, + "grad_norm": 0.9563071727752686, + "learning_rate": 4.332155662900196e-06, + "loss": 0.591, + "step": 4958 + }, + { + "epoch": 1.4516978922716628, + "grad_norm": 0.970599353313446, + "learning_rate": 4.331893645252101e-06, + "loss": 0.5604, + "step": 4959 + }, + { + "epoch": 1.4519906323185012, + "grad_norm": 1.0054608583450317, + "learning_rate": 4.331631584141977e-06, + "loss": 0.6206, + "step": 4960 + }, + { + "epoch": 1.4522833723653394, + "grad_norm": 0.955136239528656, + "learning_rate": 4.331369479576041e-06, + "loss": 0.6171, + "step": 4961 + }, + { + "epoch": 1.4525761124121779, + "grad_norm": 0.9145777821540833, + "learning_rate": 4.331107331560513e-06, + "loss": 0.6282, + "step": 4962 + }, + { + "epoch": 1.4528688524590163, + "grad_norm": 0.9429389834403992, + "learning_rate": 4.3308451401016126e-06, + "loss": 0.5996, + "step": 4963 + }, + { + "epoch": 1.4531615925058547, + "grad_norm": 0.9315477609634399, + "learning_rate": 4.330582905205558e-06, + "loss": 0.6155, + "step": 4964 + }, + { + "epoch": 1.4534543325526932, + "grad_norm": 0.9134108424186707, + "learning_rate": 4.330320626878574e-06, + "loss": 0.5876, + "step": 4965 + }, + { + "epoch": 1.4537470725995316, + "grad_norm": 0.9757404923439026, + "learning_rate": 4.33005830512688e-06, + "loss": 0.5878, + "step": 4966 + }, + { + "epoch": 1.45403981264637, + "grad_norm": 0.986552357673645, + "learning_rate": 4.329795939956701e-06, + "loss": 0.6013, + "step": 4967 + }, + { + "epoch": 1.4543325526932085, + "grad_norm": 0.9350465536117554, + "learning_rate": 4.329533531374263e-06, + "loss": 0.5925, + "step": 4968 + }, + { + "epoch": 1.454625292740047, + "grad_norm": 0.9844835996627808, + "learning_rate": 4.32927107938579e-06, + "loss": 0.6235, + "step": 4969 + }, + { + "epoch": 1.4549180327868854, + "grad_norm": 1.0002127885818481, + "learning_rate": 4.329008583997508e-06, + "loss": 0.6101, + "step": 4970 + }, + { + "epoch": 1.4552107728337236, + "grad_norm": 0.945455014705658, + "learning_rate": 4.328746045215647e-06, + "loss": 0.5469, + "step": 4971 + }, + { + "epoch": 1.455503512880562, + "grad_norm": 0.9270692467689514, + "learning_rate": 4.328483463046435e-06, + "loss": 0.5764, + "step": 4972 + }, + { + "epoch": 1.4557962529274004, + "grad_norm": 0.9900239706039429, + "learning_rate": 4.328220837496101e-06, + "loss": 0.5981, + "step": 4973 + }, + { + "epoch": 1.4560889929742389, + "grad_norm": 0.988773763179779, + "learning_rate": 4.327958168570877e-06, + "loss": 0.6341, + "step": 4974 + }, + { + "epoch": 1.4563817330210773, + "grad_norm": 0.9777808785438538, + "learning_rate": 4.327695456276994e-06, + "loss": 0.6337, + "step": 4975 + }, + { + "epoch": 1.4566744730679158, + "grad_norm": 0.9844639897346497, + "learning_rate": 4.327432700620685e-06, + "loss": 0.6227, + "step": 4976 + }, + { + "epoch": 1.456967213114754, + "grad_norm": 1.0020935535430908, + "learning_rate": 4.327169901608184e-06, + "loss": 0.6521, + "step": 4977 + }, + { + "epoch": 1.4572599531615924, + "grad_norm": 1.0193535089492798, + "learning_rate": 4.326907059245726e-06, + "loss": 0.6326, + "step": 4978 + }, + { + "epoch": 1.4575526932084308, + "grad_norm": 0.9903521537780762, + "learning_rate": 4.326644173539547e-06, + "loss": 0.5665, + "step": 4979 + }, + { + "epoch": 1.4578454332552693, + "grad_norm": 0.9723738431930542, + "learning_rate": 4.326381244495884e-06, + "loss": 0.625, + "step": 4980 + }, + { + "epoch": 1.4581381733021077, + "grad_norm": 0.9399203062057495, + "learning_rate": 4.326118272120976e-06, + "loss": 0.5952, + "step": 4981 + }, + { + "epoch": 1.4584309133489461, + "grad_norm": 0.9672629237174988, + "learning_rate": 4.325855256421061e-06, + "loss": 0.6117, + "step": 4982 + }, + { + "epoch": 1.4587236533957846, + "grad_norm": 0.9304363131523132, + "learning_rate": 4.325592197402379e-06, + "loss": 0.6176, + "step": 4983 + }, + { + "epoch": 1.459016393442623, + "grad_norm": 1.0144551992416382, + "learning_rate": 4.32532909507117e-06, + "loss": 0.6335, + "step": 4984 + }, + { + "epoch": 1.4593091334894615, + "grad_norm": 1.1552157402038574, + "learning_rate": 4.325065949433679e-06, + "loss": 0.5739, + "step": 4985 + }, + { + "epoch": 1.4596018735362999, + "grad_norm": 0.9243133068084717, + "learning_rate": 4.324802760496147e-06, + "loss": 0.5757, + "step": 4986 + }, + { + "epoch": 1.459894613583138, + "grad_norm": 0.9539486765861511, + "learning_rate": 4.32453952826482e-06, + "loss": 0.6049, + "step": 4987 + }, + { + "epoch": 1.4601873536299765, + "grad_norm": 0.9663440585136414, + "learning_rate": 4.324276252745941e-06, + "loss": 0.6443, + "step": 4988 + }, + { + "epoch": 1.460480093676815, + "grad_norm": 1.0415740013122559, + "learning_rate": 4.324012933945758e-06, + "loss": 0.6233, + "step": 4989 + }, + { + "epoch": 1.4607728337236534, + "grad_norm": 0.991200864315033, + "learning_rate": 4.323749571870517e-06, + "loss": 0.6451, + "step": 4990 + }, + { + "epoch": 1.4610655737704918, + "grad_norm": 0.960555374622345, + "learning_rate": 4.3234861665264684e-06, + "loss": 0.6203, + "step": 4991 + }, + { + "epoch": 1.4613583138173303, + "grad_norm": 0.9430858492851257, + "learning_rate": 4.32322271791986e-06, + "loss": 0.5996, + "step": 4992 + }, + { + "epoch": 1.4616510538641685, + "grad_norm": 1.010777473449707, + "learning_rate": 4.322959226056941e-06, + "loss": 0.6508, + "step": 4993 + }, + { + "epoch": 1.461943793911007, + "grad_norm": 0.9508931040763855, + "learning_rate": 4.322695690943964e-06, + "loss": 0.6362, + "step": 4994 + }, + { + "epoch": 1.4622365339578454, + "grad_norm": 1.007285714149475, + "learning_rate": 4.3224321125871835e-06, + "loss": 0.6719, + "step": 4995 + }, + { + "epoch": 1.4625292740046838, + "grad_norm": 0.9657604694366455, + "learning_rate": 4.322168490992849e-06, + "loss": 0.6331, + "step": 4996 + }, + { + "epoch": 1.4628220140515222, + "grad_norm": 0.9337171316146851, + "learning_rate": 4.321904826167218e-06, + "loss": 0.5835, + "step": 4997 + }, + { + "epoch": 1.4631147540983607, + "grad_norm": 0.9962891340255737, + "learning_rate": 4.3216411181165455e-06, + "loss": 0.6755, + "step": 4998 + }, + { + "epoch": 1.463407494145199, + "grad_norm": 0.9628698229789734, + "learning_rate": 4.321377366847087e-06, + "loss": 0.6504, + "step": 4999 + }, + { + "epoch": 1.4637002341920375, + "grad_norm": 0.8949825763702393, + "learning_rate": 4.3211135723651e-06, + "loss": 0.5744, + "step": 5000 + }, + { + "epoch": 1.463992974238876, + "grad_norm": 1.0676292181015015, + "learning_rate": 4.320849734676844e-06, + "loss": 0.6309, + "step": 5001 + }, + { + "epoch": 1.4642857142857144, + "grad_norm": 1.001513957977295, + "learning_rate": 4.320585853788578e-06, + "loss": 0.6271, + "step": 5002 + }, + { + "epoch": 1.4645784543325526, + "grad_norm": 0.9648604989051819, + "learning_rate": 4.320321929706564e-06, + "loss": 0.6123, + "step": 5003 + }, + { + "epoch": 1.464871194379391, + "grad_norm": 0.9868066310882568, + "learning_rate": 4.320057962437061e-06, + "loss": 0.6189, + "step": 5004 + }, + { + "epoch": 1.4651639344262295, + "grad_norm": 0.921355128288269, + "learning_rate": 4.319793951986334e-06, + "loss": 0.5828, + "step": 5005 + }, + { + "epoch": 1.465456674473068, + "grad_norm": 1.0738803148269653, + "learning_rate": 4.319529898360645e-06, + "loss": 0.5726, + "step": 5006 + }, + { + "epoch": 1.4657494145199064, + "grad_norm": 0.999880850315094, + "learning_rate": 4.31926580156626e-06, + "loss": 0.6334, + "step": 5007 + }, + { + "epoch": 1.4660421545667448, + "grad_norm": 0.9543067812919617, + "learning_rate": 4.319001661609446e-06, + "loss": 0.6229, + "step": 5008 + }, + { + "epoch": 1.466334894613583, + "grad_norm": 0.9460679888725281, + "learning_rate": 4.318737478496466e-06, + "loss": 0.5985, + "step": 5009 + }, + { + "epoch": 1.4666276346604215, + "grad_norm": 1.0148591995239258, + "learning_rate": 4.318473252233591e-06, + "loss": 0.5892, + "step": 5010 + }, + { + "epoch": 1.4669203747072599, + "grad_norm": 0.9935351610183716, + "learning_rate": 4.318208982827088e-06, + "loss": 0.6467, + "step": 5011 + }, + { + "epoch": 1.4672131147540983, + "grad_norm": 0.9850883483886719, + "learning_rate": 4.317944670283227e-06, + "loss": 0.616, + "step": 5012 + }, + { + "epoch": 1.4675058548009368, + "grad_norm": 1.0287790298461914, + "learning_rate": 4.31768031460828e-06, + "loss": 0.6125, + "step": 5013 + }, + { + "epoch": 1.4677985948477752, + "grad_norm": 0.9309239387512207, + "learning_rate": 4.317415915808519e-06, + "loss": 0.6169, + "step": 5014 + }, + { + "epoch": 1.4680913348946136, + "grad_norm": 0.9810964465141296, + "learning_rate": 4.317151473890216e-06, + "loss": 0.6241, + "step": 5015 + }, + { + "epoch": 1.468384074941452, + "grad_norm": 0.9696914553642273, + "learning_rate": 4.316886988859645e-06, + "loss": 0.6292, + "step": 5016 + }, + { + "epoch": 1.4686768149882905, + "grad_norm": 1.001074194908142, + "learning_rate": 4.316622460723081e-06, + "loss": 0.6236, + "step": 5017 + }, + { + "epoch": 1.468969555035129, + "grad_norm": 0.9836937785148621, + "learning_rate": 4.316357889486801e-06, + "loss": 0.6314, + "step": 5018 + }, + { + "epoch": 1.4692622950819672, + "grad_norm": 0.9915736317634583, + "learning_rate": 4.31609327515708e-06, + "loss": 0.6021, + "step": 5019 + }, + { + "epoch": 1.4695550351288056, + "grad_norm": 0.9700872302055359, + "learning_rate": 4.315828617740198e-06, + "loss": 0.6284, + "step": 5020 + }, + { + "epoch": 1.469847775175644, + "grad_norm": 0.9530774354934692, + "learning_rate": 4.315563917242433e-06, + "loss": 0.6435, + "step": 5021 + }, + { + "epoch": 1.4701405152224825, + "grad_norm": 0.9864518046379089, + "learning_rate": 4.3152991736700635e-06, + "loss": 0.651, + "step": 5022 + }, + { + "epoch": 1.470433255269321, + "grad_norm": 1.0165174007415771, + "learning_rate": 4.315034387029374e-06, + "loss": 0.604, + "step": 5023 + }, + { + "epoch": 1.4707259953161593, + "grad_norm": 0.9584099054336548, + "learning_rate": 4.314769557326645e-06, + "loss": 0.5682, + "step": 5024 + }, + { + "epoch": 1.4710187353629975, + "grad_norm": 0.9844294190406799, + "learning_rate": 4.31450468456816e-06, + "loss": 0.642, + "step": 5025 + }, + { + "epoch": 1.471311475409836, + "grad_norm": 0.9797211289405823, + "learning_rate": 4.3142397687602015e-06, + "loss": 0.6274, + "step": 5026 + }, + { + "epoch": 1.4716042154566744, + "grad_norm": 0.9442870616912842, + "learning_rate": 4.313974809909056e-06, + "loss": 0.5715, + "step": 5027 + }, + { + "epoch": 1.4718969555035128, + "grad_norm": 1.0081931352615356, + "learning_rate": 4.3137098080210105e-06, + "loss": 0.6078, + "step": 5028 + }, + { + "epoch": 1.4721896955503513, + "grad_norm": 0.9816237688064575, + "learning_rate": 4.313444763102351e-06, + "loss": 0.6579, + "step": 5029 + }, + { + "epoch": 1.4724824355971897, + "grad_norm": 0.9812033176422119, + "learning_rate": 4.3131796751593656e-06, + "loss": 0.6667, + "step": 5030 + }, + { + "epoch": 1.4727751756440282, + "grad_norm": 1.038355827331543, + "learning_rate": 4.312914544198345e-06, + "loss": 0.623, + "step": 5031 + }, + { + "epoch": 1.4730679156908666, + "grad_norm": 0.9554513692855835, + "learning_rate": 4.312649370225578e-06, + "loss": 0.565, + "step": 5032 + }, + { + "epoch": 1.473360655737705, + "grad_norm": 1.0010956525802612, + "learning_rate": 4.3123841532473565e-06, + "loss": 0.587, + "step": 5033 + }, + { + "epoch": 1.4736533957845435, + "grad_norm": 0.9806815385818481, + "learning_rate": 4.312118893269973e-06, + "loss": 0.612, + "step": 5034 + }, + { + "epoch": 1.4739461358313817, + "grad_norm": 0.9520467519760132, + "learning_rate": 4.311853590299721e-06, + "loss": 0.5942, + "step": 5035 + }, + { + "epoch": 1.4742388758782201, + "grad_norm": 0.95425945520401, + "learning_rate": 4.311588244342893e-06, + "loss": 0.6063, + "step": 5036 + }, + { + "epoch": 1.4745316159250585, + "grad_norm": 0.9245250225067139, + "learning_rate": 4.3113228554057865e-06, + "loss": 0.5928, + "step": 5037 + }, + { + "epoch": 1.474824355971897, + "grad_norm": 0.958236813545227, + "learning_rate": 4.311057423494698e-06, + "loss": 0.5973, + "step": 5038 + }, + { + "epoch": 1.4751170960187354, + "grad_norm": 0.966203510761261, + "learning_rate": 4.310791948615924e-06, + "loss": 0.6082, + "step": 5039 + }, + { + "epoch": 1.4754098360655736, + "grad_norm": 0.968058705329895, + "learning_rate": 4.310526430775762e-06, + "loss": 0.6338, + "step": 5040 + }, + { + "epoch": 1.475702576112412, + "grad_norm": 1.0389138460159302, + "learning_rate": 4.310260869980514e-06, + "loss": 0.6265, + "step": 5041 + }, + { + "epoch": 1.4759953161592505, + "grad_norm": 0.9729427099227905, + "learning_rate": 4.309995266236478e-06, + "loss": 0.5921, + "step": 5042 + }, + { + "epoch": 1.476288056206089, + "grad_norm": 1.017665147781372, + "learning_rate": 4.309729619549957e-06, + "loss": 0.6448, + "step": 5043 + }, + { + "epoch": 1.4765807962529274, + "grad_norm": 0.9443042278289795, + "learning_rate": 4.309463929927254e-06, + "loss": 0.6479, + "step": 5044 + }, + { + "epoch": 1.4768735362997658, + "grad_norm": 0.9576607942581177, + "learning_rate": 4.30919819737467e-06, + "loss": 0.595, + "step": 5045 + }, + { + "epoch": 1.4771662763466042, + "grad_norm": 0.9693806171417236, + "learning_rate": 4.3089324218985115e-06, + "loss": 0.6022, + "step": 5046 + }, + { + "epoch": 1.4774590163934427, + "grad_norm": 0.9182513952255249, + "learning_rate": 4.308666603505084e-06, + "loss": 0.6145, + "step": 5047 + }, + { + "epoch": 1.4777517564402811, + "grad_norm": 0.9269952178001404, + "learning_rate": 4.308400742200695e-06, + "loss": 0.5979, + "step": 5048 + }, + { + "epoch": 1.4780444964871196, + "grad_norm": 0.9620504379272461, + "learning_rate": 4.30813483799165e-06, + "loss": 0.588, + "step": 5049 + }, + { + "epoch": 1.4783372365339578, + "grad_norm": 0.9568591713905334, + "learning_rate": 4.307868890884258e-06, + "loss": 0.6289, + "step": 5050 + }, + { + "epoch": 1.4786299765807962, + "grad_norm": 0.9664583206176758, + "learning_rate": 4.30760290088483e-06, + "loss": 0.6328, + "step": 5051 + }, + { + "epoch": 1.4789227166276346, + "grad_norm": 0.9402685165405273, + "learning_rate": 4.307336867999675e-06, + "loss": 0.6086, + "step": 5052 + }, + { + "epoch": 1.479215456674473, + "grad_norm": 0.992300271987915, + "learning_rate": 4.3070707922351064e-06, + "loss": 0.607, + "step": 5053 + }, + { + "epoch": 1.4795081967213115, + "grad_norm": 1.0159834623336792, + "learning_rate": 4.306804673597436e-06, + "loss": 0.5948, + "step": 5054 + }, + { + "epoch": 1.47980093676815, + "grad_norm": 0.9773232340812683, + "learning_rate": 4.306538512092978e-06, + "loss": 0.6087, + "step": 5055 + }, + { + "epoch": 1.4800936768149882, + "grad_norm": 0.9794203639030457, + "learning_rate": 4.306272307728046e-06, + "loss": 0.5909, + "step": 5056 + }, + { + "epoch": 1.4803864168618266, + "grad_norm": 0.9651490449905396, + "learning_rate": 4.306006060508957e-06, + "loss": 0.5829, + "step": 5057 + }, + { + "epoch": 1.480679156908665, + "grad_norm": 0.9675467014312744, + "learning_rate": 4.305739770442027e-06, + "loss": 0.5811, + "step": 5058 + }, + { + "epoch": 1.4809718969555035, + "grad_norm": 0.9540843367576599, + "learning_rate": 4.305473437533574e-06, + "loss": 0.6225, + "step": 5059 + }, + { + "epoch": 1.481264637002342, + "grad_norm": 0.9866592288017273, + "learning_rate": 4.305207061789917e-06, + "loss": 0.6174, + "step": 5060 + }, + { + "epoch": 1.4815573770491803, + "grad_norm": 0.9919994473457336, + "learning_rate": 4.304940643217375e-06, + "loss": 0.5986, + "step": 5061 + }, + { + "epoch": 1.4818501170960188, + "grad_norm": 0.9838123321533203, + "learning_rate": 4.304674181822271e-06, + "loss": 0.6375, + "step": 5062 + }, + { + "epoch": 1.4821428571428572, + "grad_norm": 0.9155365824699402, + "learning_rate": 4.304407677610923e-06, + "loss": 0.6201, + "step": 5063 + }, + { + "epoch": 1.4824355971896956, + "grad_norm": 0.9443323612213135, + "learning_rate": 4.304141130589658e-06, + "loss": 0.5814, + "step": 5064 + }, + { + "epoch": 1.482728337236534, + "grad_norm": 0.8931990265846252, + "learning_rate": 4.303874540764798e-06, + "loss": 0.5779, + "step": 5065 + }, + { + "epoch": 1.4830210772833723, + "grad_norm": 0.98209148645401, + "learning_rate": 4.303607908142667e-06, + "loss": 0.5643, + "step": 5066 + }, + { + "epoch": 1.4833138173302107, + "grad_norm": 1.018458604812622, + "learning_rate": 4.303341232729592e-06, + "loss": 0.6503, + "step": 5067 + }, + { + "epoch": 1.4836065573770492, + "grad_norm": 0.9427379965782166, + "learning_rate": 4.3030745145319e-06, + "loss": 0.6365, + "step": 5068 + }, + { + "epoch": 1.4838992974238876, + "grad_norm": 0.9472310543060303, + "learning_rate": 4.302807753555919e-06, + "loss": 0.6246, + "step": 5069 + }, + { + "epoch": 1.484192037470726, + "grad_norm": 0.9920113682746887, + "learning_rate": 4.302540949807978e-06, + "loss": 0.6408, + "step": 5070 + }, + { + "epoch": 1.4844847775175645, + "grad_norm": 1.00272798538208, + "learning_rate": 4.302274103294406e-06, + "loss": 0.6546, + "step": 5071 + }, + { + "epoch": 1.4847775175644027, + "grad_norm": 0.938230574131012, + "learning_rate": 4.3020072140215356e-06, + "loss": 0.6017, + "step": 5072 + }, + { + "epoch": 1.4850702576112411, + "grad_norm": 0.9637861251831055, + "learning_rate": 4.301740281995696e-06, + "loss": 0.585, + "step": 5073 + }, + { + "epoch": 1.4853629976580796, + "grad_norm": 0.9575513601303101, + "learning_rate": 4.301473307223224e-06, + "loss": 0.5994, + "step": 5074 + }, + { + "epoch": 1.485655737704918, + "grad_norm": 1.0080647468566895, + "learning_rate": 4.30120628971045e-06, + "loss": 0.6032, + "step": 5075 + }, + { + "epoch": 1.4859484777517564, + "grad_norm": 0.9206872582435608, + "learning_rate": 4.300939229463712e-06, + "loss": 0.5984, + "step": 5076 + }, + { + "epoch": 1.4862412177985949, + "grad_norm": 1.0084919929504395, + "learning_rate": 4.3006721264893446e-06, + "loss": 0.6172, + "step": 5077 + }, + { + "epoch": 1.4865339578454333, + "grad_norm": 0.9552638530731201, + "learning_rate": 4.3004049807936845e-06, + "loss": 0.5962, + "step": 5078 + }, + { + "epoch": 1.4868266978922717, + "grad_norm": 0.9479066729545593, + "learning_rate": 4.30013779238307e-06, + "loss": 0.6081, + "step": 5079 + }, + { + "epoch": 1.4871194379391102, + "grad_norm": 0.9830403923988342, + "learning_rate": 4.299870561263842e-06, + "loss": 0.6139, + "step": 5080 + }, + { + "epoch": 1.4874121779859486, + "grad_norm": 0.9839010238647461, + "learning_rate": 4.299603287442338e-06, + "loss": 0.6455, + "step": 5081 + }, + { + "epoch": 1.4877049180327868, + "grad_norm": 0.9979161620140076, + "learning_rate": 4.2993359709249e-06, + "loss": 0.5878, + "step": 5082 + }, + { + "epoch": 1.4879976580796253, + "grad_norm": 0.9947731494903564, + "learning_rate": 4.299068611717871e-06, + "loss": 0.6382, + "step": 5083 + }, + { + "epoch": 1.4882903981264637, + "grad_norm": 0.960371196269989, + "learning_rate": 4.298801209827594e-06, + "loss": 0.6096, + "step": 5084 + }, + { + "epoch": 1.4885831381733021, + "grad_norm": 1.0146011114120483, + "learning_rate": 4.298533765260412e-06, + "loss": 0.591, + "step": 5085 + }, + { + "epoch": 1.4888758782201406, + "grad_norm": 0.9395280480384827, + "learning_rate": 4.2982662780226715e-06, + "loss": 0.5925, + "step": 5086 + }, + { + "epoch": 1.489168618266979, + "grad_norm": 0.9637488126754761, + "learning_rate": 4.297998748120718e-06, + "loss": 0.6209, + "step": 5087 + }, + { + "epoch": 1.4894613583138172, + "grad_norm": 0.9746048450469971, + "learning_rate": 4.297731175560898e-06, + "loss": 0.5709, + "step": 5088 + }, + { + "epoch": 1.4897540983606556, + "grad_norm": 0.9490916132926941, + "learning_rate": 4.297463560349561e-06, + "loss": 0.6027, + "step": 5089 + }, + { + "epoch": 1.490046838407494, + "grad_norm": 1.0470885038375854, + "learning_rate": 4.297195902493055e-06, + "loss": 0.6232, + "step": 5090 + }, + { + "epoch": 1.4903395784543325, + "grad_norm": 1.006378173828125, + "learning_rate": 4.296928201997732e-06, + "loss": 0.5626, + "step": 5091 + }, + { + "epoch": 1.490632318501171, + "grad_norm": 0.9368309378623962, + "learning_rate": 4.296660458869942e-06, + "loss": 0.6144, + "step": 5092 + }, + { + "epoch": 1.4909250585480094, + "grad_norm": 0.9504420161247253, + "learning_rate": 4.296392673116037e-06, + "loss": 0.6387, + "step": 5093 + }, + { + "epoch": 1.4912177985948478, + "grad_norm": 0.9538701772689819, + "learning_rate": 4.296124844742371e-06, + "loss": 0.6308, + "step": 5094 + }, + { + "epoch": 1.4915105386416863, + "grad_norm": 0.9933557510375977, + "learning_rate": 4.295856973755297e-06, + "loss": 0.6472, + "step": 5095 + }, + { + "epoch": 1.4918032786885247, + "grad_norm": 1.009345531463623, + "learning_rate": 4.295589060161173e-06, + "loss": 0.6024, + "step": 5096 + }, + { + "epoch": 1.4920960187353631, + "grad_norm": 1.0637394189834595, + "learning_rate": 4.295321103966352e-06, + "loss": 0.5965, + "step": 5097 + }, + { + "epoch": 1.4923887587822013, + "grad_norm": 0.9616284370422363, + "learning_rate": 4.295053105177194e-06, + "loss": 0.6902, + "step": 5098 + }, + { + "epoch": 1.4926814988290398, + "grad_norm": 1.020067811012268, + "learning_rate": 4.294785063800055e-06, + "loss": 0.6522, + "step": 5099 + }, + { + "epoch": 1.4929742388758782, + "grad_norm": 1.0209767818450928, + "learning_rate": 4.294516979841297e-06, + "loss": 0.5914, + "step": 5100 + }, + { + "epoch": 1.4932669789227166, + "grad_norm": 0.9494196176528931, + "learning_rate": 4.294248853307278e-06, + "loss": 0.5781, + "step": 5101 + }, + { + "epoch": 1.493559718969555, + "grad_norm": 0.9111950397491455, + "learning_rate": 4.293980684204361e-06, + "loss": 0.6212, + "step": 5102 + }, + { + "epoch": 1.4938524590163935, + "grad_norm": 0.9124439358711243, + "learning_rate": 4.2937124725389065e-06, + "loss": 0.6146, + "step": 5103 + }, + { + "epoch": 1.4941451990632317, + "grad_norm": 0.9613324403762817, + "learning_rate": 4.293444218317279e-06, + "loss": 0.5983, + "step": 5104 + }, + { + "epoch": 1.4944379391100702, + "grad_norm": 0.981351912021637, + "learning_rate": 4.293175921545844e-06, + "loss": 0.6028, + "step": 5105 + }, + { + "epoch": 1.4947306791569086, + "grad_norm": 0.9909009337425232, + "learning_rate": 4.292907582230965e-06, + "loss": 0.612, + "step": 5106 + }, + { + "epoch": 1.495023419203747, + "grad_norm": 0.9442530274391174, + "learning_rate": 4.292639200379009e-06, + "loss": 0.6165, + "step": 5107 + }, + { + "epoch": 1.4953161592505855, + "grad_norm": 0.9502461552619934, + "learning_rate": 4.292370775996345e-06, + "loss": 0.6173, + "step": 5108 + }, + { + "epoch": 1.495608899297424, + "grad_norm": 1.0104641914367676, + "learning_rate": 4.292102309089339e-06, + "loss": 0.6547, + "step": 5109 + }, + { + "epoch": 1.4959016393442623, + "grad_norm": 0.9529364705085754, + "learning_rate": 4.2918337996643614e-06, + "loss": 0.5916, + "step": 5110 + }, + { + "epoch": 1.4961943793911008, + "grad_norm": 0.9701482057571411, + "learning_rate": 4.291565247727782e-06, + "loss": 0.6295, + "step": 5111 + }, + { + "epoch": 1.4964871194379392, + "grad_norm": 0.9347482919692993, + "learning_rate": 4.291296653285973e-06, + "loss": 0.6031, + "step": 5112 + }, + { + "epoch": 1.4967798594847777, + "grad_norm": 0.9098305702209473, + "learning_rate": 4.2910280163453074e-06, + "loss": 0.5962, + "step": 5113 + }, + { + "epoch": 1.4970725995316159, + "grad_norm": 1.0383297204971313, + "learning_rate": 4.290759336912158e-06, + "loss": 0.6406, + "step": 5114 + }, + { + "epoch": 1.4973653395784543, + "grad_norm": 0.9647372364997864, + "learning_rate": 4.2904906149928995e-06, + "loss": 0.6351, + "step": 5115 + }, + { + "epoch": 1.4976580796252927, + "grad_norm": 0.9691990613937378, + "learning_rate": 4.290221850593907e-06, + "loss": 0.6211, + "step": 5116 + }, + { + "epoch": 1.4979508196721312, + "grad_norm": 0.9032235145568848, + "learning_rate": 4.2899530437215565e-06, + "loss": 0.561, + "step": 5117 + }, + { + "epoch": 1.4982435597189696, + "grad_norm": 0.9727394580841064, + "learning_rate": 4.289684194382227e-06, + "loss": 0.6207, + "step": 5118 + }, + { + "epoch": 1.498536299765808, + "grad_norm": 0.9403596520423889, + "learning_rate": 4.289415302582295e-06, + "loss": 0.576, + "step": 5119 + }, + { + "epoch": 1.4988290398126463, + "grad_norm": 0.9884598255157471, + "learning_rate": 4.289146368328142e-06, + "loss": 0.6389, + "step": 5120 + }, + { + "epoch": 1.4991217798594847, + "grad_norm": 0.9983716607093811, + "learning_rate": 4.2888773916261475e-06, + "loss": 0.5945, + "step": 5121 + }, + { + "epoch": 1.4994145199063231, + "grad_norm": 0.9828833937644958, + "learning_rate": 4.288608372482693e-06, + "loss": 0.6337, + "step": 5122 + }, + { + "epoch": 1.4997072599531616, + "grad_norm": 0.9614831209182739, + "learning_rate": 4.288339310904162e-06, + "loss": 0.5964, + "step": 5123 + }, + { + "epoch": 1.5, + "grad_norm": 0.9505947232246399, + "learning_rate": 4.288070206896937e-06, + "loss": 0.6296, + "step": 5124 + }, + { + "epoch": 1.5002927400468384, + "grad_norm": 0.9461503624916077, + "learning_rate": 4.287801060467403e-06, + "loss": 0.5869, + "step": 5125 + }, + { + "epoch": 1.5005854800936769, + "grad_norm": 0.9335198998451233, + "learning_rate": 4.287531871621944e-06, + "loss": 0.5841, + "step": 5126 + }, + { + "epoch": 1.5008782201405153, + "grad_norm": 1.0264045000076294, + "learning_rate": 4.2872626403669495e-06, + "loss": 0.63, + "step": 5127 + }, + { + "epoch": 1.5011709601873537, + "grad_norm": 1.066449761390686, + "learning_rate": 4.2869933667088044e-06, + "loss": 0.6348, + "step": 5128 + }, + { + "epoch": 1.5014637002341922, + "grad_norm": 0.9652385115623474, + "learning_rate": 4.286724050653899e-06, + "loss": 0.6001, + "step": 5129 + }, + { + "epoch": 1.5017564402810304, + "grad_norm": 1.0194919109344482, + "learning_rate": 4.286454692208622e-06, + "loss": 0.6205, + "step": 5130 + }, + { + "epoch": 1.5020491803278688, + "grad_norm": 0.9791635274887085, + "learning_rate": 4.286185291379364e-06, + "loss": 0.6201, + "step": 5131 + }, + { + "epoch": 1.5023419203747073, + "grad_norm": 1.0092837810516357, + "learning_rate": 4.2859158481725175e-06, + "loss": 0.623, + "step": 5132 + }, + { + "epoch": 1.5026346604215457, + "grad_norm": 0.9569693803787231, + "learning_rate": 4.2856463625944735e-06, + "loss": 0.5942, + "step": 5133 + }, + { + "epoch": 1.502927400468384, + "grad_norm": 0.9460557699203491, + "learning_rate": 4.285376834651627e-06, + "loss": 0.6274, + "step": 5134 + }, + { + "epoch": 1.5032201405152223, + "grad_norm": 0.9673527479171753, + "learning_rate": 4.285107264350373e-06, + "loss": 0.5798, + "step": 5135 + }, + { + "epoch": 1.5035128805620608, + "grad_norm": 0.9792625308036804, + "learning_rate": 4.284837651697104e-06, + "loss": 0.6007, + "step": 5136 + }, + { + "epoch": 1.5038056206088992, + "grad_norm": 0.952717661857605, + "learning_rate": 4.28456799669822e-06, + "loss": 0.5808, + "step": 5137 + }, + { + "epoch": 1.5040983606557377, + "grad_norm": 1.0855543613433838, + "learning_rate": 4.284298299360118e-06, + "loss": 0.6159, + "step": 5138 + }, + { + "epoch": 1.504391100702576, + "grad_norm": 0.9018274545669556, + "learning_rate": 4.284028559689195e-06, + "loss": 0.5972, + "step": 5139 + }, + { + "epoch": 1.5046838407494145, + "grad_norm": 0.935826301574707, + "learning_rate": 4.2837587776918525e-06, + "loss": 0.5939, + "step": 5140 + }, + { + "epoch": 1.504976580796253, + "grad_norm": 0.9881680607795715, + "learning_rate": 4.283488953374489e-06, + "loss": 0.6319, + "step": 5141 + }, + { + "epoch": 1.5052693208430914, + "grad_norm": 0.9196656346321106, + "learning_rate": 4.2832190867435085e-06, + "loss": 0.6107, + "step": 5142 + }, + { + "epoch": 1.5055620608899298, + "grad_norm": 0.9721996188163757, + "learning_rate": 4.282949177805313e-06, + "loss": 0.662, + "step": 5143 + }, + { + "epoch": 1.5058548009367683, + "grad_norm": 0.94954913854599, + "learning_rate": 4.282679226566305e-06, + "loss": 0.646, + "step": 5144 + }, + { + "epoch": 1.5061475409836067, + "grad_norm": 1.078478217124939, + "learning_rate": 4.28240923303289e-06, + "loss": 0.5801, + "step": 5145 + }, + { + "epoch": 1.506440281030445, + "grad_norm": 0.9768370389938354, + "learning_rate": 4.2821391972114735e-06, + "loss": 0.6469, + "step": 5146 + }, + { + "epoch": 1.5067330210772834, + "grad_norm": 0.9746650457382202, + "learning_rate": 4.2818691191084625e-06, + "loss": 0.6093, + "step": 5147 + }, + { + "epoch": 1.5070257611241218, + "grad_norm": 0.956028163433075, + "learning_rate": 4.281598998730264e-06, + "loss": 0.6077, + "step": 5148 + }, + { + "epoch": 1.5073185011709602, + "grad_norm": 0.9327326416969299, + "learning_rate": 4.281328836083287e-06, + "loss": 0.5746, + "step": 5149 + }, + { + "epoch": 1.5076112412177984, + "grad_norm": 0.926905632019043, + "learning_rate": 4.281058631173941e-06, + "loss": 0.5784, + "step": 5150 + }, + { + "epoch": 1.5079039812646369, + "grad_norm": 0.9371337890625, + "learning_rate": 4.280788384008637e-06, + "loss": 0.6447, + "step": 5151 + }, + { + "epoch": 1.5081967213114753, + "grad_norm": 0.9836519956588745, + "learning_rate": 4.280518094593786e-06, + "loss": 0.6213, + "step": 5152 + }, + { + "epoch": 1.5084894613583137, + "grad_norm": 1.0511842966079712, + "learning_rate": 4.2802477629358015e-06, + "loss": 0.6192, + "step": 5153 + }, + { + "epoch": 1.5087822014051522, + "grad_norm": 1.015074610710144, + "learning_rate": 4.279977389041097e-06, + "loss": 0.6529, + "step": 5154 + }, + { + "epoch": 1.5090749414519906, + "grad_norm": 0.9941813945770264, + "learning_rate": 4.279706972916087e-06, + "loss": 0.649, + "step": 5155 + }, + { + "epoch": 1.509367681498829, + "grad_norm": 0.9397984743118286, + "learning_rate": 4.279436514567187e-06, + "loss": 0.6061, + "step": 5156 + }, + { + "epoch": 1.5096604215456675, + "grad_norm": 0.9170958399772644, + "learning_rate": 4.279166014000813e-06, + "loss": 0.5991, + "step": 5157 + }, + { + "epoch": 1.509953161592506, + "grad_norm": 1.0051014423370361, + "learning_rate": 4.278895471223385e-06, + "loss": 0.6202, + "step": 5158 + }, + { + "epoch": 1.5102459016393444, + "grad_norm": 1.0599981546401978, + "learning_rate": 4.2786248862413195e-06, + "loss": 0.6179, + "step": 5159 + }, + { + "epoch": 1.5105386416861828, + "grad_norm": 0.9363890886306763, + "learning_rate": 4.278354259061037e-06, + "loss": 0.5687, + "step": 5160 + }, + { + "epoch": 1.5108313817330212, + "grad_norm": 1.0366666316986084, + "learning_rate": 4.278083589688957e-06, + "loss": 0.577, + "step": 5161 + }, + { + "epoch": 1.5111241217798594, + "grad_norm": 0.9964482188224792, + "learning_rate": 4.277812878131503e-06, + "loss": 0.5862, + "step": 5162 + }, + { + "epoch": 1.5114168618266979, + "grad_norm": 0.9842657446861267, + "learning_rate": 4.277542124395098e-06, + "loss": 0.6339, + "step": 5163 + }, + { + "epoch": 1.5117096018735363, + "grad_norm": 0.9587674140930176, + "learning_rate": 4.277271328486163e-06, + "loss": 0.5904, + "step": 5164 + }, + { + "epoch": 1.5120023419203747, + "grad_norm": 1.0178892612457275, + "learning_rate": 4.277000490411125e-06, + "loss": 0.6332, + "step": 5165 + }, + { + "epoch": 1.512295081967213, + "grad_norm": 0.9377586245536804, + "learning_rate": 4.276729610176409e-06, + "loss": 0.6302, + "step": 5166 + }, + { + "epoch": 1.5125878220140514, + "grad_norm": 0.9890187978744507, + "learning_rate": 4.2764586877884405e-06, + "loss": 0.6651, + "step": 5167 + }, + { + "epoch": 1.5128805620608898, + "grad_norm": 0.9816882610321045, + "learning_rate": 4.276187723253649e-06, + "loss": 0.6097, + "step": 5168 + }, + { + "epoch": 1.5131733021077283, + "grad_norm": 0.9388217329978943, + "learning_rate": 4.2759167165784634e-06, + "loss": 0.614, + "step": 5169 + }, + { + "epoch": 1.5134660421545667, + "grad_norm": 0.9853301644325256, + "learning_rate": 4.2756456677693125e-06, + "loss": 0.6164, + "step": 5170 + }, + { + "epoch": 1.5137587822014051, + "grad_norm": 0.9271466732025146, + "learning_rate": 4.275374576832626e-06, + "loss": 0.6077, + "step": 5171 + }, + { + "epoch": 1.5140515222482436, + "grad_norm": 0.9137064814567566, + "learning_rate": 4.275103443774838e-06, + "loss": 0.627, + "step": 5172 + }, + { + "epoch": 1.514344262295082, + "grad_norm": 0.9219373464584351, + "learning_rate": 4.274832268602378e-06, + "loss": 0.6167, + "step": 5173 + }, + { + "epoch": 1.5146370023419204, + "grad_norm": 1.0178911685943604, + "learning_rate": 4.274561051321682e-06, + "loss": 0.6631, + "step": 5174 + }, + { + "epoch": 1.5149297423887589, + "grad_norm": 0.8655738830566406, + "learning_rate": 4.274289791939185e-06, + "loss": 0.5499, + "step": 5175 + }, + { + "epoch": 1.5152224824355973, + "grad_norm": 1.1686251163482666, + "learning_rate": 4.27401849046132e-06, + "loss": 0.6042, + "step": 5176 + }, + { + "epoch": 1.5155152224824358, + "grad_norm": 0.9538536667823792, + "learning_rate": 4.273747146894527e-06, + "loss": 0.5819, + "step": 5177 + }, + { + "epoch": 1.515807962529274, + "grad_norm": 0.920914351940155, + "learning_rate": 4.273475761245242e-06, + "loss": 0.5766, + "step": 5178 + }, + { + "epoch": 1.5161007025761124, + "grad_norm": 0.9759310483932495, + "learning_rate": 4.273204333519903e-06, + "loss": 0.6452, + "step": 5179 + }, + { + "epoch": 1.5163934426229508, + "grad_norm": 0.9807112812995911, + "learning_rate": 4.2729328637249514e-06, + "loss": 0.6023, + "step": 5180 + }, + { + "epoch": 1.5166861826697893, + "grad_norm": 1.0351269245147705, + "learning_rate": 4.272661351866827e-06, + "loss": 0.6573, + "step": 5181 + }, + { + "epoch": 1.5169789227166275, + "grad_norm": 0.9237545728683472, + "learning_rate": 4.272389797951971e-06, + "loss": 0.5791, + "step": 5182 + }, + { + "epoch": 1.517271662763466, + "grad_norm": 1.0170882940292358, + "learning_rate": 4.272118201986827e-06, + "loss": 0.6338, + "step": 5183 + }, + { + "epoch": 1.5175644028103044, + "grad_norm": 1.0138463973999023, + "learning_rate": 4.271846563977838e-06, + "loss": 0.6172, + "step": 5184 + }, + { + "epoch": 1.5178571428571428, + "grad_norm": 1.0087283849716187, + "learning_rate": 4.271574883931449e-06, + "loss": 0.622, + "step": 5185 + }, + { + "epoch": 1.5181498829039812, + "grad_norm": 0.9795041084289551, + "learning_rate": 4.271303161854105e-06, + "loss": 0.6223, + "step": 5186 + }, + { + "epoch": 1.5184426229508197, + "grad_norm": 0.9758836627006531, + "learning_rate": 4.271031397752253e-06, + "loss": 0.6324, + "step": 5187 + }, + { + "epoch": 1.518735362997658, + "grad_norm": 1.0262031555175781, + "learning_rate": 4.270759591632342e-06, + "loss": 0.6253, + "step": 5188 + }, + { + "epoch": 1.5190281030444965, + "grad_norm": 1.0233523845672607, + "learning_rate": 4.2704877435008185e-06, + "loss": 0.6337, + "step": 5189 + }, + { + "epoch": 1.519320843091335, + "grad_norm": 1.0140659809112549, + "learning_rate": 4.2702158533641336e-06, + "loss": 0.6394, + "step": 5190 + }, + { + "epoch": 1.5196135831381734, + "grad_norm": 0.9681879281997681, + "learning_rate": 4.269943921228738e-06, + "loss": 0.5973, + "step": 5191 + }, + { + "epoch": 1.5199063231850118, + "grad_norm": 0.9573580622673035, + "learning_rate": 4.2696719471010815e-06, + "loss": 0.6167, + "step": 5192 + }, + { + "epoch": 1.5201990632318503, + "grad_norm": 0.9402141571044922, + "learning_rate": 4.269399930987619e-06, + "loss": 0.5656, + "step": 5193 + }, + { + "epoch": 1.5204918032786885, + "grad_norm": 0.9478754997253418, + "learning_rate": 4.269127872894802e-06, + "loss": 0.6395, + "step": 5194 + }, + { + "epoch": 1.520784543325527, + "grad_norm": 0.9520384669303894, + "learning_rate": 4.268855772829088e-06, + "loss": 0.5954, + "step": 5195 + }, + { + "epoch": 1.5210772833723654, + "grad_norm": 0.9811244606971741, + "learning_rate": 4.2685836307969295e-06, + "loss": 0.6174, + "step": 5196 + }, + { + "epoch": 1.5213700234192038, + "grad_norm": 1.0151581764221191, + "learning_rate": 4.268311446804785e-06, + "loss": 0.6122, + "step": 5197 + }, + { + "epoch": 1.521662763466042, + "grad_norm": 0.9866485595703125, + "learning_rate": 4.2680392208591125e-06, + "loss": 0.6681, + "step": 5198 + }, + { + "epoch": 1.5219555035128804, + "grad_norm": 0.9998980164527893, + "learning_rate": 4.267766952966369e-06, + "loss": 0.667, + "step": 5199 + }, + { + "epoch": 1.5222482435597189, + "grad_norm": 0.9711611270904541, + "learning_rate": 4.267494643133016e-06, + "loss": 0.5983, + "step": 5200 + }, + { + "epoch": 1.5225409836065573, + "grad_norm": 0.9776273965835571, + "learning_rate": 4.267222291365512e-06, + "loss": 0.5977, + "step": 5201 + }, + { + "epoch": 1.5228337236533958, + "grad_norm": 0.9317762851715088, + "learning_rate": 4.26694989767032e-06, + "loss": 0.6095, + "step": 5202 + }, + { + "epoch": 1.5231264637002342, + "grad_norm": 0.9828393459320068, + "learning_rate": 4.266677462053902e-06, + "loss": 0.6086, + "step": 5203 + }, + { + "epoch": 1.5234192037470726, + "grad_norm": 0.9105871915817261, + "learning_rate": 4.266404984522722e-06, + "loss": 0.5771, + "step": 5204 + }, + { + "epoch": 1.523711943793911, + "grad_norm": 0.990504801273346, + "learning_rate": 4.266132465083244e-06, + "loss": 0.6171, + "step": 5205 + }, + { + "epoch": 1.5240046838407495, + "grad_norm": 0.9457288384437561, + "learning_rate": 4.265859903741933e-06, + "loss": 0.6333, + "step": 5206 + }, + { + "epoch": 1.524297423887588, + "grad_norm": 0.9302508234977722, + "learning_rate": 4.265587300505258e-06, + "loss": 0.6144, + "step": 5207 + }, + { + "epoch": 1.5245901639344264, + "grad_norm": 0.946032702922821, + "learning_rate": 4.265314655379685e-06, + "loss": 0.6541, + "step": 5208 + }, + { + "epoch": 1.5248829039812648, + "grad_norm": 0.9288311004638672, + "learning_rate": 4.265041968371683e-06, + "loss": 0.6136, + "step": 5209 + }, + { + "epoch": 1.525175644028103, + "grad_norm": 0.9309300780296326, + "learning_rate": 4.26476923948772e-06, + "loss": 0.6331, + "step": 5210 + }, + { + "epoch": 1.5254683840749415, + "grad_norm": 0.9640087485313416, + "learning_rate": 4.264496468734268e-06, + "loss": 0.6113, + "step": 5211 + }, + { + "epoch": 1.5257611241217799, + "grad_norm": 1.0006825923919678, + "learning_rate": 4.2642236561177994e-06, + "loss": 0.6526, + "step": 5212 + }, + { + "epoch": 1.526053864168618, + "grad_norm": 0.9420419931411743, + "learning_rate": 4.263950801644784e-06, + "loss": 0.649, + "step": 5213 + }, + { + "epoch": 1.5263466042154565, + "grad_norm": 1.0034172534942627, + "learning_rate": 4.263677905321698e-06, + "loss": 0.6547, + "step": 5214 + }, + { + "epoch": 1.526639344262295, + "grad_norm": 1.0179342031478882, + "learning_rate": 4.263404967155015e-06, + "loss": 0.6047, + "step": 5215 + }, + { + "epoch": 1.5269320843091334, + "grad_norm": 0.9282217025756836, + "learning_rate": 4.26313198715121e-06, + "loss": 0.5909, + "step": 5216 + }, + { + "epoch": 1.5272248243559718, + "grad_norm": 0.976329505443573, + "learning_rate": 4.262858965316759e-06, + "loss": 0.5993, + "step": 5217 + }, + { + "epoch": 1.5275175644028103, + "grad_norm": 1.037953495979309, + "learning_rate": 4.262585901658141e-06, + "loss": 0.6279, + "step": 5218 + }, + { + "epoch": 1.5278103044496487, + "grad_norm": 1.0452667474746704, + "learning_rate": 4.262312796181835e-06, + "loss": 0.6519, + "step": 5219 + }, + { + "epoch": 1.5281030444964872, + "grad_norm": 1.0695490837097168, + "learning_rate": 4.262039648894318e-06, + "loss": 0.6224, + "step": 5220 + }, + { + "epoch": 1.5283957845433256, + "grad_norm": 1.002817153930664, + "learning_rate": 4.2617664598020724e-06, + "loss": 0.6462, + "step": 5221 + }, + { + "epoch": 1.528688524590164, + "grad_norm": 0.955830991268158, + "learning_rate": 4.261493228911579e-06, + "loss": 0.6078, + "step": 5222 + }, + { + "epoch": 1.5289812646370025, + "grad_norm": 1.0113749504089355, + "learning_rate": 4.2612199562293205e-06, + "loss": 0.6328, + "step": 5223 + }, + { + "epoch": 1.529274004683841, + "grad_norm": 0.9632358551025391, + "learning_rate": 4.260946641761779e-06, + "loss": 0.6193, + "step": 5224 + }, + { + "epoch": 1.529566744730679, + "grad_norm": 0.9921161532402039, + "learning_rate": 4.260673285515442e-06, + "loss": 0.6461, + "step": 5225 + }, + { + "epoch": 1.5298594847775175, + "grad_norm": 0.9374629259109497, + "learning_rate": 4.260399887496791e-06, + "loss": 0.6267, + "step": 5226 + }, + { + "epoch": 1.530152224824356, + "grad_norm": 1.0302519798278809, + "learning_rate": 4.260126447712316e-06, + "loss": 0.6584, + "step": 5227 + }, + { + "epoch": 1.5304449648711944, + "grad_norm": 0.9541673064231873, + "learning_rate": 4.259852966168503e-06, + "loss": 0.6173, + "step": 5228 + }, + { + "epoch": 1.5307377049180326, + "grad_norm": 1.1806429624557495, + "learning_rate": 4.259579442871839e-06, + "loss": 0.631, + "step": 5229 + }, + { + "epoch": 1.531030444964871, + "grad_norm": 0.9417897462844849, + "learning_rate": 4.259305877828815e-06, + "loss": 0.615, + "step": 5230 + }, + { + "epoch": 1.5313231850117095, + "grad_norm": 0.9102638959884644, + "learning_rate": 4.259032271045922e-06, + "loss": 0.6085, + "step": 5231 + }, + { + "epoch": 1.531615925058548, + "grad_norm": 1.1131796836853027, + "learning_rate": 4.2587586225296495e-06, + "loss": 0.6352, + "step": 5232 + }, + { + "epoch": 1.5319086651053864, + "grad_norm": 0.9784589409828186, + "learning_rate": 4.258484932286491e-06, + "loss": 0.6038, + "step": 5233 + }, + { + "epoch": 1.5322014051522248, + "grad_norm": 0.9795613288879395, + "learning_rate": 4.25821120032294e-06, + "loss": 0.5958, + "step": 5234 + }, + { + "epoch": 1.5324941451990632, + "grad_norm": 0.9275445938110352, + "learning_rate": 4.2579374266454895e-06, + "loss": 0.5799, + "step": 5235 + }, + { + "epoch": 1.5327868852459017, + "grad_norm": 1.0623422861099243, + "learning_rate": 4.2576636112606364e-06, + "loss": 0.6177, + "step": 5236 + }, + { + "epoch": 1.5330796252927401, + "grad_norm": 0.9314644932746887, + "learning_rate": 4.257389754174877e-06, + "loss": 0.5526, + "step": 5237 + }, + { + "epoch": 1.5333723653395785, + "grad_norm": 0.9679229259490967, + "learning_rate": 4.257115855394708e-06, + "loss": 0.6393, + "step": 5238 + }, + { + "epoch": 1.533665105386417, + "grad_norm": 0.9279500246047974, + "learning_rate": 4.256841914926627e-06, + "loss": 0.5812, + "step": 5239 + }, + { + "epoch": 1.5339578454332554, + "grad_norm": 0.9333216547966003, + "learning_rate": 4.256567932777134e-06, + "loss": 0.59, + "step": 5240 + }, + { + "epoch": 1.5342505854800936, + "grad_norm": 0.9559834003448486, + "learning_rate": 4.25629390895273e-06, + "loss": 0.6042, + "step": 5241 + }, + { + "epoch": 1.534543325526932, + "grad_norm": 0.9410902857780457, + "learning_rate": 4.256019843459916e-06, + "loss": 0.6306, + "step": 5242 + }, + { + "epoch": 1.5348360655737705, + "grad_norm": 0.9632717370986938, + "learning_rate": 4.255745736305192e-06, + "loss": 0.5903, + "step": 5243 + }, + { + "epoch": 1.535128805620609, + "grad_norm": 1.0024524927139282, + "learning_rate": 4.255471587495065e-06, + "loss": 0.584, + "step": 5244 + }, + { + "epoch": 1.5354215456674472, + "grad_norm": 0.9825071692466736, + "learning_rate": 4.255197397036036e-06, + "loss": 0.6052, + "step": 5245 + }, + { + "epoch": 1.5357142857142856, + "grad_norm": 0.9651643633842468, + "learning_rate": 4.254923164934611e-06, + "loss": 0.6367, + "step": 5246 + }, + { + "epoch": 1.536007025761124, + "grad_norm": 0.9669291377067566, + "learning_rate": 4.254648891197299e-06, + "loss": 0.6245, + "step": 5247 + }, + { + "epoch": 1.5362997658079625, + "grad_norm": 0.9276668429374695, + "learning_rate": 4.254374575830604e-06, + "loss": 0.6095, + "step": 5248 + }, + { + "epoch": 1.536592505854801, + "grad_norm": 0.9741091132164001, + "learning_rate": 4.2541002188410345e-06, + "loss": 0.6483, + "step": 5249 + }, + { + "epoch": 1.5368852459016393, + "grad_norm": 0.93849778175354, + "learning_rate": 4.253825820235101e-06, + "loss": 0.6078, + "step": 5250 + }, + { + "epoch": 1.5371779859484778, + "grad_norm": 0.9344547390937805, + "learning_rate": 4.253551380019312e-06, + "loss": 0.6436, + "step": 5251 + }, + { + "epoch": 1.5374707259953162, + "grad_norm": 0.9007522463798523, + "learning_rate": 4.25327689820018e-06, + "loss": 0.5812, + "step": 5252 + }, + { + "epoch": 1.5377634660421546, + "grad_norm": 0.9729875326156616, + "learning_rate": 4.253002374784217e-06, + "loss": 0.6207, + "step": 5253 + }, + { + "epoch": 1.538056206088993, + "grad_norm": 1.0258731842041016, + "learning_rate": 4.252727809777937e-06, + "loss": 0.5985, + "step": 5254 + }, + { + "epoch": 1.5383489461358315, + "grad_norm": 0.9390184879302979, + "learning_rate": 4.252453203187852e-06, + "loss": 0.5906, + "step": 5255 + }, + { + "epoch": 1.53864168618267, + "grad_norm": 0.9423456192016602, + "learning_rate": 4.252178555020478e-06, + "loss": 0.6087, + "step": 5256 + }, + { + "epoch": 1.5389344262295082, + "grad_norm": 0.9854576587677002, + "learning_rate": 4.251903865282331e-06, + "loss": 0.6438, + "step": 5257 + }, + { + "epoch": 1.5392271662763466, + "grad_norm": 0.9261794090270996, + "learning_rate": 4.251629133979928e-06, + "loss": 0.6282, + "step": 5258 + }, + { + "epoch": 1.539519906323185, + "grad_norm": 0.9604361653327942, + "learning_rate": 4.251354361119788e-06, + "loss": 0.648, + "step": 5259 + }, + { + "epoch": 1.5398126463700235, + "grad_norm": 1.0010484457015991, + "learning_rate": 4.251079546708429e-06, + "loss": 0.6632, + "step": 5260 + }, + { + "epoch": 1.5401053864168617, + "grad_norm": 0.99141925573349, + "learning_rate": 4.25080469075237e-06, + "loss": 0.6189, + "step": 5261 + }, + { + "epoch": 1.5403981264637001, + "grad_norm": 0.980726420879364, + "learning_rate": 4.250529793258135e-06, + "loss": 0.6195, + "step": 5262 + }, + { + "epoch": 1.5406908665105385, + "grad_norm": 0.9411247372627258, + "learning_rate": 4.250254854232244e-06, + "loss": 0.6641, + "step": 5263 + }, + { + "epoch": 1.540983606557377, + "grad_norm": 0.9647589325904846, + "learning_rate": 4.2499798736812204e-06, + "loss": 0.5967, + "step": 5264 + }, + { + "epoch": 1.5412763466042154, + "grad_norm": 0.9513521194458008, + "learning_rate": 4.249704851611588e-06, + "loss": 0.6259, + "step": 5265 + }, + { + "epoch": 1.5415690866510539, + "grad_norm": 1.018408179283142, + "learning_rate": 4.249429788029872e-06, + "loss": 0.6059, + "step": 5266 + }, + { + "epoch": 1.5418618266978923, + "grad_norm": 0.9436090588569641, + "learning_rate": 4.249154682942598e-06, + "loss": 0.5947, + "step": 5267 + }, + { + "epoch": 1.5421545667447307, + "grad_norm": 0.9745560884475708, + "learning_rate": 4.248879536356293e-06, + "loss": 0.6134, + "step": 5268 + }, + { + "epoch": 1.5424473067915692, + "grad_norm": 0.9765222668647766, + "learning_rate": 4.248604348277485e-06, + "loss": 0.6082, + "step": 5269 + }, + { + "epoch": 1.5427400468384076, + "grad_norm": 0.9816083908081055, + "learning_rate": 4.248329118712703e-06, + "loss": 0.6095, + "step": 5270 + }, + { + "epoch": 1.543032786885246, + "grad_norm": 0.9448951482772827, + "learning_rate": 4.248053847668476e-06, + "loss": 0.6148, + "step": 5271 + }, + { + "epoch": 1.5433255269320845, + "grad_norm": 0.9896662831306458, + "learning_rate": 4.247778535151337e-06, + "loss": 0.5924, + "step": 5272 + }, + { + "epoch": 1.5436182669789227, + "grad_norm": 0.9819498658180237, + "learning_rate": 4.247503181167816e-06, + "loss": 0.6079, + "step": 5273 + }, + { + "epoch": 1.5439110070257611, + "grad_norm": 0.9590914845466614, + "learning_rate": 4.247227785724446e-06, + "loss": 0.5841, + "step": 5274 + }, + { + "epoch": 1.5442037470725996, + "grad_norm": 0.9911090135574341, + "learning_rate": 4.246952348827761e-06, + "loss": 0.6157, + "step": 5275 + }, + { + "epoch": 1.544496487119438, + "grad_norm": 0.98481684923172, + "learning_rate": 4.246676870484296e-06, + "loss": 0.5999, + "step": 5276 + }, + { + "epoch": 1.5447892271662762, + "grad_norm": 1.015299677848816, + "learning_rate": 4.246401350700587e-06, + "loss": 0.6246, + "step": 5277 + }, + { + "epoch": 1.5450819672131146, + "grad_norm": 0.9455148577690125, + "learning_rate": 4.24612578948317e-06, + "loss": 0.6197, + "step": 5278 + }, + { + "epoch": 1.545374707259953, + "grad_norm": 0.9897401332855225, + "learning_rate": 4.245850186838584e-06, + "loss": 0.5989, + "step": 5279 + }, + { + "epoch": 1.5456674473067915, + "grad_norm": 0.9453819990158081, + "learning_rate": 4.2455745427733655e-06, + "loss": 0.592, + "step": 5280 + }, + { + "epoch": 1.54596018735363, + "grad_norm": 1.0018764734268188, + "learning_rate": 4.245298857294057e-06, + "loss": 0.6342, + "step": 5281 + }, + { + "epoch": 1.5462529274004684, + "grad_norm": 0.9838372468948364, + "learning_rate": 4.2450231304071965e-06, + "loss": 0.6088, + "step": 5282 + }, + { + "epoch": 1.5465456674473068, + "grad_norm": 1.0405369997024536, + "learning_rate": 4.244747362119328e-06, + "loss": 0.6276, + "step": 5283 + }, + { + "epoch": 1.5468384074941453, + "grad_norm": 0.9842690825462341, + "learning_rate": 4.244471552436993e-06, + "loss": 0.608, + "step": 5284 + }, + { + "epoch": 1.5471311475409837, + "grad_norm": 1.001088261604309, + "learning_rate": 4.244195701366734e-06, + "loss": 0.5903, + "step": 5285 + }, + { + "epoch": 1.5474238875878221, + "grad_norm": 0.9596325755119324, + "learning_rate": 4.243919808915099e-06, + "loss": 0.6312, + "step": 5286 + }, + { + "epoch": 1.5477166276346606, + "grad_norm": 1.0045183897018433, + "learning_rate": 4.24364387508863e-06, + "loss": 0.6101, + "step": 5287 + }, + { + "epoch": 1.548009367681499, + "grad_norm": 0.9587579369544983, + "learning_rate": 4.243367899893875e-06, + "loss": 0.627, + "step": 5288 + }, + { + "epoch": 1.5483021077283372, + "grad_norm": 0.946642279624939, + "learning_rate": 4.243091883337382e-06, + "loss": 0.6159, + "step": 5289 + }, + { + "epoch": 1.5485948477751756, + "grad_norm": 0.973638117313385, + "learning_rate": 4.2428158254257005e-06, + "loss": 0.6031, + "step": 5290 + }, + { + "epoch": 1.548887587822014, + "grad_norm": 1.002782940864563, + "learning_rate": 4.242539726165376e-06, + "loss": 0.6356, + "step": 5291 + }, + { + "epoch": 1.5491803278688525, + "grad_norm": 0.9336668252944946, + "learning_rate": 4.242263585562965e-06, + "loss": 0.5722, + "step": 5292 + }, + { + "epoch": 1.5494730679156907, + "grad_norm": 0.9536092877388, + "learning_rate": 4.241987403625013e-06, + "loss": 0.5739, + "step": 5293 + }, + { + "epoch": 1.5497658079625292, + "grad_norm": 0.963001012802124, + "learning_rate": 4.241711180358077e-06, + "loss": 0.581, + "step": 5294 + }, + { + "epoch": 1.5500585480093676, + "grad_norm": 1.0058085918426514, + "learning_rate": 4.241434915768707e-06, + "loss": 0.5863, + "step": 5295 + }, + { + "epoch": 1.550351288056206, + "grad_norm": 1.0144939422607422, + "learning_rate": 4.24115860986346e-06, + "loss": 0.6211, + "step": 5296 + }, + { + "epoch": 1.5506440281030445, + "grad_norm": 0.9948499798774719, + "learning_rate": 4.24088226264889e-06, + "loss": 0.6212, + "step": 5297 + }, + { + "epoch": 1.550936768149883, + "grad_norm": 0.9445933103561401, + "learning_rate": 4.240605874131555e-06, + "loss": 0.6427, + "step": 5298 + }, + { + "epoch": 1.5512295081967213, + "grad_norm": 0.9705854654312134, + "learning_rate": 4.24032944431801e-06, + "loss": 0.6127, + "step": 5299 + }, + { + "epoch": 1.5515222482435598, + "grad_norm": 0.9954332113265991, + "learning_rate": 4.240052973214814e-06, + "loss": 0.622, + "step": 5300 + }, + { + "epoch": 1.5518149882903982, + "grad_norm": 0.9340003132820129, + "learning_rate": 4.239776460828529e-06, + "loss": 0.6828, + "step": 5301 + }, + { + "epoch": 1.5521077283372366, + "grad_norm": 0.9407311677932739, + "learning_rate": 4.239499907165711e-06, + "loss": 0.5997, + "step": 5302 + }, + { + "epoch": 1.552400468384075, + "grad_norm": 0.9419457316398621, + "learning_rate": 4.2392233122329255e-06, + "loss": 0.5685, + "step": 5303 + }, + { + "epoch": 1.5526932084309133, + "grad_norm": 0.9864978194236755, + "learning_rate": 4.238946676036731e-06, + "loss": 0.6207, + "step": 5304 + }, + { + "epoch": 1.5529859484777517, + "grad_norm": 0.9393160939216614, + "learning_rate": 4.238669998583694e-06, + "loss": 0.5968, + "step": 5305 + }, + { + "epoch": 1.5532786885245902, + "grad_norm": 0.9839056134223938, + "learning_rate": 4.238393279880376e-06, + "loss": 0.5654, + "step": 5306 + }, + { + "epoch": 1.5535714285714286, + "grad_norm": 0.9224761128425598, + "learning_rate": 4.238116519933344e-06, + "loss": 0.5971, + "step": 5307 + }, + { + "epoch": 1.5538641686182668, + "grad_norm": 0.9641215205192566, + "learning_rate": 4.237839718749165e-06, + "loss": 0.6183, + "step": 5308 + }, + { + "epoch": 1.5541569086651053, + "grad_norm": 1.033076524734497, + "learning_rate": 4.237562876334403e-06, + "loss": 0.6514, + "step": 5309 + }, + { + "epoch": 1.5544496487119437, + "grad_norm": 0.9467810392379761, + "learning_rate": 4.2372859926956284e-06, + "loss": 0.5961, + "step": 5310 + }, + { + "epoch": 1.5547423887587821, + "grad_norm": 0.9989807605743408, + "learning_rate": 4.23700906783941e-06, + "loss": 0.6163, + "step": 5311 + }, + { + "epoch": 1.5550351288056206, + "grad_norm": 1.0303078889846802, + "learning_rate": 4.236732101772317e-06, + "loss": 0.5946, + "step": 5312 + }, + { + "epoch": 1.555327868852459, + "grad_norm": 0.9644239544868469, + "learning_rate": 4.236455094500923e-06, + "loss": 0.6048, + "step": 5313 + }, + { + "epoch": 1.5556206088992974, + "grad_norm": 0.9410210251808167, + "learning_rate": 4.236178046031797e-06, + "loss": 0.5829, + "step": 5314 + }, + { + "epoch": 1.5559133489461359, + "grad_norm": 0.9227371215820312, + "learning_rate": 4.235900956371513e-06, + "loss": 0.5841, + "step": 5315 + }, + { + "epoch": 1.5562060889929743, + "grad_norm": 0.9917036294937134, + "learning_rate": 4.235623825526645e-06, + "loss": 0.5815, + "step": 5316 + }, + { + "epoch": 1.5564988290398127, + "grad_norm": 0.9707561731338501, + "learning_rate": 4.23534665350377e-06, + "loss": 0.6024, + "step": 5317 + }, + { + "epoch": 1.5567915690866512, + "grad_norm": 0.96047043800354, + "learning_rate": 4.235069440309461e-06, + "loss": 0.6282, + "step": 5318 + }, + { + "epoch": 1.5570843091334896, + "grad_norm": 1.0072687864303589, + "learning_rate": 4.234792185950297e-06, + "loss": 0.6228, + "step": 5319 + }, + { + "epoch": 1.5573770491803278, + "grad_norm": 0.9562558531761169, + "learning_rate": 4.2345148904328545e-06, + "loss": 0.6009, + "step": 5320 + }, + { + "epoch": 1.5576697892271663, + "grad_norm": 0.923087477684021, + "learning_rate": 4.234237553763712e-06, + "loss": 0.5796, + "step": 5321 + }, + { + "epoch": 1.5579625292740047, + "grad_norm": 1.0028661489486694, + "learning_rate": 4.233960175949452e-06, + "loss": 0.6206, + "step": 5322 + }, + { + "epoch": 1.5582552693208431, + "grad_norm": 0.9792345762252808, + "learning_rate": 4.233682756996653e-06, + "loss": 0.6096, + "step": 5323 + }, + { + "epoch": 1.5585480093676813, + "grad_norm": 0.973056972026825, + "learning_rate": 4.233405296911898e-06, + "loss": 0.6212, + "step": 5324 + }, + { + "epoch": 1.5588407494145198, + "grad_norm": 0.9363269209861755, + "learning_rate": 4.233127795701769e-06, + "loss": 0.6409, + "step": 5325 + }, + { + "epoch": 1.5591334894613582, + "grad_norm": 0.9527795314788818, + "learning_rate": 4.23285025337285e-06, + "loss": 0.612, + "step": 5326 + }, + { + "epoch": 1.5594262295081966, + "grad_norm": 0.9766455888748169, + "learning_rate": 4.232572669931726e-06, + "loss": 0.6029, + "step": 5327 + }, + { + "epoch": 1.559718969555035, + "grad_norm": 0.9909049272537231, + "learning_rate": 4.232295045384982e-06, + "loss": 0.6428, + "step": 5328 + }, + { + "epoch": 1.5600117096018735, + "grad_norm": 0.9193269610404968, + "learning_rate": 4.232017379739206e-06, + "loss": 0.6439, + "step": 5329 + }, + { + "epoch": 1.560304449648712, + "grad_norm": 1.0580737590789795, + "learning_rate": 4.231739673000984e-06, + "loss": 0.6349, + "step": 5330 + }, + { + "epoch": 1.5605971896955504, + "grad_norm": 0.9424880146980286, + "learning_rate": 4.231461925176906e-06, + "loss": 0.6069, + "step": 5331 + }, + { + "epoch": 1.5608899297423888, + "grad_norm": 0.9340510368347168, + "learning_rate": 4.2311841362735614e-06, + "loss": 0.5957, + "step": 5332 + }, + { + "epoch": 1.5611826697892273, + "grad_norm": 1.2676753997802734, + "learning_rate": 4.23090630629754e-06, + "loss": 0.5662, + "step": 5333 + }, + { + "epoch": 1.5614754098360657, + "grad_norm": 0.9282070994377136, + "learning_rate": 4.230628435255435e-06, + "loss": 0.5685, + "step": 5334 + }, + { + "epoch": 1.5617681498829041, + "grad_norm": 0.9250478744506836, + "learning_rate": 4.230350523153837e-06, + "loss": 0.5946, + "step": 5335 + }, + { + "epoch": 1.5620608899297423, + "grad_norm": 0.9607656598091125, + "learning_rate": 4.230072569999341e-06, + "loss": 0.6154, + "step": 5336 + }, + { + "epoch": 1.5623536299765808, + "grad_norm": 0.9375677704811096, + "learning_rate": 4.2297945757985395e-06, + "loss": 0.6201, + "step": 5337 + }, + { + "epoch": 1.5626463700234192, + "grad_norm": 1.015354037284851, + "learning_rate": 4.229516540558031e-06, + "loss": 0.6622, + "step": 5338 + }, + { + "epoch": 1.5629391100702577, + "grad_norm": 0.9960196018218994, + "learning_rate": 4.22923846428441e-06, + "loss": 0.6124, + "step": 5339 + }, + { + "epoch": 1.5632318501170959, + "grad_norm": 0.9679796695709229, + "learning_rate": 4.228960346984273e-06, + "loss": 0.5806, + "step": 5340 + }, + { + "epoch": 1.5635245901639343, + "grad_norm": 1.0199458599090576, + "learning_rate": 4.22868218866422e-06, + "loss": 0.6396, + "step": 5341 + }, + { + "epoch": 1.5638173302107727, + "grad_norm": 0.9887532591819763, + "learning_rate": 4.228403989330852e-06, + "loss": 0.559, + "step": 5342 + }, + { + "epoch": 1.5641100702576112, + "grad_norm": 1.0004422664642334, + "learning_rate": 4.228125748990764e-06, + "loss": 0.6141, + "step": 5343 + }, + { + "epoch": 1.5644028103044496, + "grad_norm": 0.9627959132194519, + "learning_rate": 4.2278474676505636e-06, + "loss": 0.5983, + "step": 5344 + }, + { + "epoch": 1.564695550351288, + "grad_norm": 0.9395453333854675, + "learning_rate": 4.227569145316849e-06, + "loss": 0.6012, + "step": 5345 + }, + { + "epoch": 1.5649882903981265, + "grad_norm": 0.9860734939575195, + "learning_rate": 4.227290781996224e-06, + "loss": 0.5965, + "step": 5346 + }, + { + "epoch": 1.565281030444965, + "grad_norm": 0.9596362113952637, + "learning_rate": 4.227012377695293e-06, + "loss": 0.6245, + "step": 5347 + }, + { + "epoch": 1.5655737704918034, + "grad_norm": 0.9570796489715576, + "learning_rate": 4.226733932420662e-06, + "loss": 0.6293, + "step": 5348 + }, + { + "epoch": 1.5658665105386418, + "grad_norm": 0.9334981441497803, + "learning_rate": 4.226455446178938e-06, + "loss": 0.5815, + "step": 5349 + }, + { + "epoch": 1.5661592505854802, + "grad_norm": 0.9210329055786133, + "learning_rate": 4.226176918976725e-06, + "loss": 0.6066, + "step": 5350 + }, + { + "epoch": 1.5664519906323187, + "grad_norm": 0.9818671941757202, + "learning_rate": 4.225898350820634e-06, + "loss": 0.6327, + "step": 5351 + }, + { + "epoch": 1.5667447306791569, + "grad_norm": 0.9989430904388428, + "learning_rate": 4.225619741717273e-06, + "loss": 0.6574, + "step": 5352 + }, + { + "epoch": 1.5670374707259953, + "grad_norm": 0.9814867377281189, + "learning_rate": 4.2253410916732515e-06, + "loss": 0.5983, + "step": 5353 + }, + { + "epoch": 1.5673302107728337, + "grad_norm": 1.0033389329910278, + "learning_rate": 4.2250624006951815e-06, + "loss": 0.6469, + "step": 5354 + }, + { + "epoch": 1.5676229508196722, + "grad_norm": 0.9722893834114075, + "learning_rate": 4.224783668789675e-06, + "loss": 0.5819, + "step": 5355 + }, + { + "epoch": 1.5679156908665104, + "grad_norm": 1.0175293684005737, + "learning_rate": 4.224504895963344e-06, + "loss": 0.6099, + "step": 5356 + }, + { + "epoch": 1.5682084309133488, + "grad_norm": 0.9438850283622742, + "learning_rate": 4.224226082222803e-06, + "loss": 0.5865, + "step": 5357 + }, + { + "epoch": 1.5685011709601873, + "grad_norm": 0.9480686187744141, + "learning_rate": 4.223947227574668e-06, + "loss": 0.6187, + "step": 5358 + }, + { + "epoch": 1.5687939110070257, + "grad_norm": 0.9464259743690491, + "learning_rate": 4.223668332025552e-06, + "loss": 0.6159, + "step": 5359 + }, + { + "epoch": 1.5690866510538641, + "grad_norm": 0.9120861291885376, + "learning_rate": 4.223389395582075e-06, + "loss": 0.5843, + "step": 5360 + }, + { + "epoch": 1.5693793911007026, + "grad_norm": 1.0350393056869507, + "learning_rate": 4.223110418250853e-06, + "loss": 0.6363, + "step": 5361 + }, + { + "epoch": 1.569672131147541, + "grad_norm": 1.0102696418762207, + "learning_rate": 4.222831400038505e-06, + "loss": 0.6005, + "step": 5362 + }, + { + "epoch": 1.5699648711943794, + "grad_norm": 0.971808671951294, + "learning_rate": 4.222552340951652e-06, + "loss": 0.6342, + "step": 5363 + }, + { + "epoch": 1.5702576112412179, + "grad_norm": 1.0247169733047485, + "learning_rate": 4.2222732409969134e-06, + "loss": 0.6037, + "step": 5364 + }, + { + "epoch": 1.5705503512880563, + "grad_norm": 0.9258322715759277, + "learning_rate": 4.221994100180911e-06, + "loss": 0.5641, + "step": 5365 + }, + { + "epoch": 1.5708430913348947, + "grad_norm": 0.9280921816825867, + "learning_rate": 4.221714918510268e-06, + "loss": 0.6128, + "step": 5366 + }, + { + "epoch": 1.5711358313817332, + "grad_norm": 0.9897803664207458, + "learning_rate": 4.2214356959916074e-06, + "loss": 0.6367, + "step": 5367 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 0.9410139918327332, + "learning_rate": 4.221156432631554e-06, + "loss": 0.5365, + "step": 5368 + }, + { + "epoch": 1.5717213114754098, + "grad_norm": 0.9683930277824402, + "learning_rate": 4.220877128436734e-06, + "loss": 0.6036, + "step": 5369 + }, + { + "epoch": 1.5720140515222483, + "grad_norm": 1.0835827589035034, + "learning_rate": 4.220597783413774e-06, + "loss": 0.6068, + "step": 5370 + }, + { + "epoch": 1.5723067915690867, + "grad_norm": 0.9674406051635742, + "learning_rate": 4.220318397569301e-06, + "loss": 0.647, + "step": 5371 + }, + { + "epoch": 1.572599531615925, + "grad_norm": 0.9806514382362366, + "learning_rate": 4.220038970909942e-06, + "loss": 0.5946, + "step": 5372 + }, + { + "epoch": 1.5728922716627634, + "grad_norm": 0.9844345450401306, + "learning_rate": 4.219759503442329e-06, + "loss": 0.6176, + "step": 5373 + }, + { + "epoch": 1.5731850117096018, + "grad_norm": 0.9559934139251709, + "learning_rate": 4.2194799951730916e-06, + "loss": 0.6114, + "step": 5374 + }, + { + "epoch": 1.5734777517564402, + "grad_norm": 0.9572383761405945, + "learning_rate": 4.219200446108861e-06, + "loss": 0.5882, + "step": 5375 + }, + { + "epoch": 1.5737704918032787, + "grad_norm": 0.9758243560791016, + "learning_rate": 4.21892085625627e-06, + "loss": 0.5558, + "step": 5376 + }, + { + "epoch": 1.574063231850117, + "grad_norm": 0.9519952535629272, + "learning_rate": 4.21864122562195e-06, + "loss": 0.5806, + "step": 5377 + }, + { + "epoch": 1.5743559718969555, + "grad_norm": 0.9911897778511047, + "learning_rate": 4.218361554212537e-06, + "loss": 0.6121, + "step": 5378 + }, + { + "epoch": 1.574648711943794, + "grad_norm": 0.9544985890388489, + "learning_rate": 4.218081842034667e-06, + "loss": 0.6452, + "step": 5379 + }, + { + "epoch": 1.5749414519906324, + "grad_norm": 0.9758298993110657, + "learning_rate": 4.217802089094974e-06, + "loss": 0.6213, + "step": 5380 + }, + { + "epoch": 1.5752341920374708, + "grad_norm": 0.9751930832862854, + "learning_rate": 4.217522295400097e-06, + "loss": 0.582, + "step": 5381 + }, + { + "epoch": 1.5755269320843093, + "grad_norm": 0.9171464443206787, + "learning_rate": 4.217242460956674e-06, + "loss": 0.5509, + "step": 5382 + }, + { + "epoch": 1.5758196721311475, + "grad_norm": 0.9153382182121277, + "learning_rate": 4.2169625857713435e-06, + "loss": 0.5937, + "step": 5383 + }, + { + "epoch": 1.576112412177986, + "grad_norm": 0.9963200688362122, + "learning_rate": 4.216682669850746e-06, + "loss": 0.6023, + "step": 5384 + }, + { + "epoch": 1.5764051522248244, + "grad_norm": 0.9425329566001892, + "learning_rate": 4.216402713201522e-06, + "loss": 0.6083, + "step": 5385 + }, + { + "epoch": 1.5766978922716628, + "grad_norm": 0.91167151927948, + "learning_rate": 4.216122715830313e-06, + "loss": 0.5447, + "step": 5386 + }, + { + "epoch": 1.576990632318501, + "grad_norm": 0.9618961215019226, + "learning_rate": 4.215842677743764e-06, + "loss": 0.6178, + "step": 5387 + }, + { + "epoch": 1.5772833723653394, + "grad_norm": 0.9898943901062012, + "learning_rate": 4.215562598948517e-06, + "loss": 0.6393, + "step": 5388 + }, + { + "epoch": 1.5775761124121779, + "grad_norm": 0.9160796403884888, + "learning_rate": 4.2152824794512185e-06, + "loss": 0.5705, + "step": 5389 + }, + { + "epoch": 1.5778688524590163, + "grad_norm": 0.9585755467414856, + "learning_rate": 4.2150023192585135e-06, + "loss": 0.652, + "step": 5390 + }, + { + "epoch": 1.5781615925058547, + "grad_norm": 0.9680289030075073, + "learning_rate": 4.2147221183770496e-06, + "loss": 0.6455, + "step": 5391 + }, + { + "epoch": 1.5784543325526932, + "grad_norm": 1.0055854320526123, + "learning_rate": 4.214441876813472e-06, + "loss": 0.6152, + "step": 5392 + }, + { + "epoch": 1.5787470725995316, + "grad_norm": 0.968205988407135, + "learning_rate": 4.214161594574433e-06, + "loss": 0.6139, + "step": 5393 + }, + { + "epoch": 1.57903981264637, + "grad_norm": 0.9097467064857483, + "learning_rate": 4.213881271666581e-06, + "loss": 0.5941, + "step": 5394 + }, + { + "epoch": 1.5793325526932085, + "grad_norm": 0.981935977935791, + "learning_rate": 4.213600908096566e-06, + "loss": 0.5755, + "step": 5395 + }, + { + "epoch": 1.579625292740047, + "grad_norm": 0.92374587059021, + "learning_rate": 4.213320503871041e-06, + "loss": 0.5869, + "step": 5396 + }, + { + "epoch": 1.5799180327868854, + "grad_norm": 0.9902387857437134, + "learning_rate": 4.213040058996657e-06, + "loss": 0.6086, + "step": 5397 + }, + { + "epoch": 1.5802107728337238, + "grad_norm": 1.0209834575653076, + "learning_rate": 4.212759573480069e-06, + "loss": 0.6412, + "step": 5398 + }, + { + "epoch": 1.580503512880562, + "grad_norm": 0.907634437084198, + "learning_rate": 4.21247904732793e-06, + "loss": 0.5407, + "step": 5399 + }, + { + "epoch": 1.5807962529274004, + "grad_norm": 1.0132944583892822, + "learning_rate": 4.2121984805468975e-06, + "loss": 0.6204, + "step": 5400 + }, + { + "epoch": 1.5810889929742389, + "grad_norm": 0.9881945848464966, + "learning_rate": 4.211917873143627e-06, + "loss": 0.6259, + "step": 5401 + }, + { + "epoch": 1.5813817330210773, + "grad_norm": 0.960358202457428, + "learning_rate": 4.211637225124776e-06, + "loss": 0.6328, + "step": 5402 + }, + { + "epoch": 1.5816744730679155, + "grad_norm": 0.9405820965766907, + "learning_rate": 4.211356536497003e-06, + "loss": 0.6024, + "step": 5403 + }, + { + "epoch": 1.581967213114754, + "grad_norm": 0.9691810011863708, + "learning_rate": 4.211075807266968e-06, + "loss": 0.5732, + "step": 5404 + }, + { + "epoch": 1.5822599531615924, + "grad_norm": 1.0420875549316406, + "learning_rate": 4.21079503744133e-06, + "loss": 0.6399, + "step": 5405 + }, + { + "epoch": 1.5825526932084308, + "grad_norm": 0.9801697134971619, + "learning_rate": 4.210514227026752e-06, + "loss": 0.5916, + "step": 5406 + }, + { + "epoch": 1.5828454332552693, + "grad_norm": 0.9317028522491455, + "learning_rate": 4.210233376029894e-06, + "loss": 0.5974, + "step": 5407 + }, + { + "epoch": 1.5831381733021077, + "grad_norm": 0.9750704765319824, + "learning_rate": 4.209952484457421e-06, + "loss": 0.6275, + "step": 5408 + }, + { + "epoch": 1.5834309133489461, + "grad_norm": 1.0035500526428223, + "learning_rate": 4.209671552315997e-06, + "loss": 0.6399, + "step": 5409 + }, + { + "epoch": 1.5837236533957846, + "grad_norm": 1.0803554058074951, + "learning_rate": 4.209390579612287e-06, + "loss": 0.6428, + "step": 5410 + }, + { + "epoch": 1.584016393442623, + "grad_norm": 0.9712309837341309, + "learning_rate": 4.209109566352957e-06, + "loss": 0.6157, + "step": 5411 + }, + { + "epoch": 1.5843091334894615, + "grad_norm": 0.9199907183647156, + "learning_rate": 4.208828512544674e-06, + "loss": 0.5686, + "step": 5412 + }, + { + "epoch": 1.5846018735362999, + "grad_norm": 0.9107681512832642, + "learning_rate": 4.208547418194106e-06, + "loss": 0.5976, + "step": 5413 + }, + { + "epoch": 1.5848946135831383, + "grad_norm": 0.9464201331138611, + "learning_rate": 4.208266283307923e-06, + "loss": 0.604, + "step": 5414 + }, + { + "epoch": 1.5851873536299765, + "grad_norm": 0.9909436702728271, + "learning_rate": 4.207985107892794e-06, + "loss": 0.607, + "step": 5415 + }, + { + "epoch": 1.585480093676815, + "grad_norm": 1.1185246706008911, + "learning_rate": 4.207703891955391e-06, + "loss": 0.5859, + "step": 5416 + }, + { + "epoch": 1.5857728337236534, + "grad_norm": 0.9789958596229553, + "learning_rate": 4.207422635502383e-06, + "loss": 0.607, + "step": 5417 + }, + { + "epoch": 1.5860655737704918, + "grad_norm": 0.9771956205368042, + "learning_rate": 4.2071413385404466e-06, + "loss": 0.6171, + "step": 5418 + }, + { + "epoch": 1.58635831381733, + "grad_norm": 0.9705901145935059, + "learning_rate": 4.206860001076253e-06, + "loss": 0.5941, + "step": 5419 + }, + { + "epoch": 1.5866510538641685, + "grad_norm": 0.9615542888641357, + "learning_rate": 4.206578623116479e-06, + "loss": 0.5756, + "step": 5420 + }, + { + "epoch": 1.586943793911007, + "grad_norm": 0.945395827293396, + "learning_rate": 4.2062972046677985e-06, + "loss": 0.583, + "step": 5421 + }, + { + "epoch": 1.5872365339578454, + "grad_norm": 0.9191198945045471, + "learning_rate": 4.206015745736889e-06, + "loss": 0.6154, + "step": 5422 + }, + { + "epoch": 1.5875292740046838, + "grad_norm": 0.9568065404891968, + "learning_rate": 4.2057342463304286e-06, + "loss": 0.6203, + "step": 5423 + }, + { + "epoch": 1.5878220140515222, + "grad_norm": 0.9758872985839844, + "learning_rate": 4.205452706455095e-06, + "loss": 0.6236, + "step": 5424 + }, + { + "epoch": 1.5881147540983607, + "grad_norm": 0.9457293748855591, + "learning_rate": 4.205171126117569e-06, + "loss": 0.5834, + "step": 5425 + }, + { + "epoch": 1.588407494145199, + "grad_norm": 0.9616100788116455, + "learning_rate": 4.204889505324529e-06, + "loss": 0.6102, + "step": 5426 + }, + { + "epoch": 1.5887002341920375, + "grad_norm": 0.9559050798416138, + "learning_rate": 4.20460784408266e-06, + "loss": 0.5686, + "step": 5427 + }, + { + "epoch": 1.588992974238876, + "grad_norm": 0.940771758556366, + "learning_rate": 4.204326142398641e-06, + "loss": 0.5663, + "step": 5428 + }, + { + "epoch": 1.5892857142857144, + "grad_norm": 0.9571628570556641, + "learning_rate": 4.204044400279157e-06, + "loss": 0.5794, + "step": 5429 + }, + { + "epoch": 1.5895784543325528, + "grad_norm": 1.0189592838287354, + "learning_rate": 4.2037626177308926e-06, + "loss": 0.6358, + "step": 5430 + }, + { + "epoch": 1.589871194379391, + "grad_norm": 1.0470068454742432, + "learning_rate": 4.203480794760532e-06, + "loss": 0.6292, + "step": 5431 + }, + { + "epoch": 1.5901639344262295, + "grad_norm": 0.9566435813903809, + "learning_rate": 4.203198931374763e-06, + "loss": 0.5933, + "step": 5432 + }, + { + "epoch": 1.590456674473068, + "grad_norm": 1.0195815563201904, + "learning_rate": 4.202917027580272e-06, + "loss": 0.6331, + "step": 5433 + }, + { + "epoch": 1.5907494145199064, + "grad_norm": 0.9945399761199951, + "learning_rate": 4.202635083383748e-06, + "loss": 0.6548, + "step": 5434 + }, + { + "epoch": 1.5910421545667446, + "grad_norm": 0.9916333556175232, + "learning_rate": 4.202353098791878e-06, + "loss": 0.6227, + "step": 5435 + }, + { + "epoch": 1.591334894613583, + "grad_norm": 0.9441604018211365, + "learning_rate": 4.2020710738113555e-06, + "loss": 0.573, + "step": 5436 + }, + { + "epoch": 1.5916276346604215, + "grad_norm": 0.9371253252029419, + "learning_rate": 4.201789008448868e-06, + "loss": 0.5699, + "step": 5437 + }, + { + "epoch": 1.5919203747072599, + "grad_norm": 0.9695107936859131, + "learning_rate": 4.20150690271111e-06, + "loss": 0.5969, + "step": 5438 + }, + { + "epoch": 1.5922131147540983, + "grad_norm": 0.9802107214927673, + "learning_rate": 4.201224756604774e-06, + "loss": 0.6087, + "step": 5439 + }, + { + "epoch": 1.5925058548009368, + "grad_norm": 0.9958006143569946, + "learning_rate": 4.200942570136554e-06, + "loss": 0.6092, + "step": 5440 + }, + { + "epoch": 1.5927985948477752, + "grad_norm": 0.9827014803886414, + "learning_rate": 4.200660343313144e-06, + "loss": 0.6412, + "step": 5441 + }, + { + "epoch": 1.5930913348946136, + "grad_norm": 0.9671086668968201, + "learning_rate": 4.200378076141241e-06, + "loss": 0.5316, + "step": 5442 + }, + { + "epoch": 1.593384074941452, + "grad_norm": 0.9348472952842712, + "learning_rate": 4.200095768627543e-06, + "loss": 0.6344, + "step": 5443 + }, + { + "epoch": 1.5936768149882905, + "grad_norm": 0.9845016598701477, + "learning_rate": 4.199813420778744e-06, + "loss": 0.6057, + "step": 5444 + }, + { + "epoch": 1.593969555035129, + "grad_norm": 0.9565554261207581, + "learning_rate": 4.199531032601546e-06, + "loss": 0.6084, + "step": 5445 + }, + { + "epoch": 1.5942622950819674, + "grad_norm": 0.9643629193305969, + "learning_rate": 4.199248604102648e-06, + "loss": 0.5867, + "step": 5446 + }, + { + "epoch": 1.5945550351288056, + "grad_norm": 0.897308349609375, + "learning_rate": 4.19896613528875e-06, + "loss": 0.5766, + "step": 5447 + }, + { + "epoch": 1.594847775175644, + "grad_norm": 0.9595425128936768, + "learning_rate": 4.198683626166553e-06, + "loss": 0.6078, + "step": 5448 + }, + { + "epoch": 1.5951405152224825, + "grad_norm": 0.9770135283470154, + "learning_rate": 4.198401076742762e-06, + "loss": 0.5805, + "step": 5449 + }, + { + "epoch": 1.595433255269321, + "grad_norm": 0.9650352597236633, + "learning_rate": 4.198118487024079e-06, + "loss": 0.6385, + "step": 5450 + }, + { + "epoch": 1.595725995316159, + "grad_norm": 0.9890455603599548, + "learning_rate": 4.197835857017208e-06, + "loss": 0.6266, + "step": 5451 + }, + { + "epoch": 1.5960187353629975, + "grad_norm": 0.991356611251831, + "learning_rate": 4.197553186728855e-06, + "loss": 0.65, + "step": 5452 + }, + { + "epoch": 1.596311475409836, + "grad_norm": 0.9774539470672607, + "learning_rate": 4.197270476165728e-06, + "loss": 0.6564, + "step": 5453 + }, + { + "epoch": 1.5966042154566744, + "grad_norm": 0.9552963376045227, + "learning_rate": 4.1969877253345315e-06, + "loss": 0.6135, + "step": 5454 + }, + { + "epoch": 1.5968969555035128, + "grad_norm": 0.9501295685768127, + "learning_rate": 4.1967049342419764e-06, + "loss": 0.6124, + "step": 5455 + }, + { + "epoch": 1.5971896955503513, + "grad_norm": 1.019311547279358, + "learning_rate": 4.196422102894769e-06, + "loss": 0.5903, + "step": 5456 + }, + { + "epoch": 1.5974824355971897, + "grad_norm": 0.9628171324729919, + "learning_rate": 4.196139231299623e-06, + "loss": 0.6086, + "step": 5457 + }, + { + "epoch": 1.5977751756440282, + "grad_norm": 0.9768519997596741, + "learning_rate": 4.195856319463247e-06, + "loss": 0.6328, + "step": 5458 + }, + { + "epoch": 1.5980679156908666, + "grad_norm": 0.9703827500343323, + "learning_rate": 4.195573367392354e-06, + "loss": 0.6554, + "step": 5459 + }, + { + "epoch": 1.598360655737705, + "grad_norm": 0.9485707879066467, + "learning_rate": 4.195290375093657e-06, + "loss": 0.5923, + "step": 5460 + }, + { + "epoch": 1.5986533957845435, + "grad_norm": 0.9827724695205688, + "learning_rate": 4.195007342573871e-06, + "loss": 0.6351, + "step": 5461 + }, + { + "epoch": 1.598946135831382, + "grad_norm": 0.9727291464805603, + "learning_rate": 4.194724269839709e-06, + "loss": 0.619, + "step": 5462 + }, + { + "epoch": 1.5992388758782201, + "grad_norm": 0.9622510671615601, + "learning_rate": 4.1944411568978895e-06, + "loss": 0.6274, + "step": 5463 + }, + { + "epoch": 1.5995316159250585, + "grad_norm": 0.9593223929405212, + "learning_rate": 4.194158003755126e-06, + "loss": 0.5783, + "step": 5464 + }, + { + "epoch": 1.599824355971897, + "grad_norm": 0.958459734916687, + "learning_rate": 4.19387481041814e-06, + "loss": 0.5952, + "step": 5465 + }, + { + "epoch": 1.6001170960187352, + "grad_norm": 0.9841025471687317, + "learning_rate": 4.1935915768936485e-06, + "loss": 0.6516, + "step": 5466 + }, + { + "epoch": 1.6004098360655736, + "grad_norm": 0.9457558393478394, + "learning_rate": 4.193308303188371e-06, + "loss": 0.587, + "step": 5467 + }, + { + "epoch": 1.600702576112412, + "grad_norm": 0.8837643265724182, + "learning_rate": 4.193024989309029e-06, + "loss": 0.5659, + "step": 5468 + }, + { + "epoch": 1.6009953161592505, + "grad_norm": 0.9446712136268616, + "learning_rate": 4.192741635262344e-06, + "loss": 0.5918, + "step": 5469 + }, + { + "epoch": 1.601288056206089, + "grad_norm": 0.9736762046813965, + "learning_rate": 4.192458241055038e-06, + "loss": 0.553, + "step": 5470 + }, + { + "epoch": 1.6015807962529274, + "grad_norm": 0.9849094152450562, + "learning_rate": 4.192174806693835e-06, + "loss": 0.6176, + "step": 5471 + }, + { + "epoch": 1.6018735362997658, + "grad_norm": 0.9567934274673462, + "learning_rate": 4.19189133218546e-06, + "loss": 0.6364, + "step": 5472 + }, + { + "epoch": 1.6021662763466042, + "grad_norm": 0.9639703631401062, + "learning_rate": 4.191607817536638e-06, + "loss": 0.6008, + "step": 5473 + }, + { + "epoch": 1.6024590163934427, + "grad_norm": 0.9434102177619934, + "learning_rate": 4.191324262754097e-06, + "loss": 0.6019, + "step": 5474 + }, + { + "epoch": 1.6027517564402811, + "grad_norm": 0.9213517308235168, + "learning_rate": 4.19104066784456e-06, + "loss": 0.6185, + "step": 5475 + }, + { + "epoch": 1.6030444964871196, + "grad_norm": 0.9774340987205505, + "learning_rate": 4.190757032814761e-06, + "loss": 0.5841, + "step": 5476 + }, + { + "epoch": 1.603337236533958, + "grad_norm": 1.0308456420898438, + "learning_rate": 4.190473357671425e-06, + "loss": 0.5989, + "step": 5477 + }, + { + "epoch": 1.6036299765807962, + "grad_norm": 1.0432053804397583, + "learning_rate": 4.190189642421283e-06, + "loss": 0.6301, + "step": 5478 + }, + { + "epoch": 1.6039227166276346, + "grad_norm": 0.9785676598548889, + "learning_rate": 4.189905887071069e-06, + "loss": 0.6405, + "step": 5479 + }, + { + "epoch": 1.604215456674473, + "grad_norm": 1.0098687410354614, + "learning_rate": 4.189622091627512e-06, + "loss": 0.6381, + "step": 5480 + }, + { + "epoch": 1.6045081967213115, + "grad_norm": 0.9711177349090576, + "learning_rate": 4.1893382560973475e-06, + "loss": 0.5901, + "step": 5481 + }, + { + "epoch": 1.6048009367681497, + "grad_norm": 0.9464046359062195, + "learning_rate": 4.189054380487306e-06, + "loss": 0.601, + "step": 5482 + }, + { + "epoch": 1.6050936768149882, + "grad_norm": 0.981517493724823, + "learning_rate": 4.1887704648041265e-06, + "loss": 0.5969, + "step": 5483 + }, + { + "epoch": 1.6053864168618266, + "grad_norm": 0.9890420436859131, + "learning_rate": 4.188486509054544e-06, + "loss": 0.6391, + "step": 5484 + }, + { + "epoch": 1.605679156908665, + "grad_norm": 0.995336651802063, + "learning_rate": 4.188202513245293e-06, + "loss": 0.613, + "step": 5485 + }, + { + "epoch": 1.6059718969555035, + "grad_norm": 1.4209249019622803, + "learning_rate": 4.187918477383114e-06, + "loss": 0.6593, + "step": 5486 + }, + { + "epoch": 1.606264637002342, + "grad_norm": 1.0171973705291748, + "learning_rate": 4.187634401474744e-06, + "loss": 0.5738, + "step": 5487 + }, + { + "epoch": 1.6065573770491803, + "grad_norm": 0.9812471270561218, + "learning_rate": 4.187350285526925e-06, + "loss": 0.6243, + "step": 5488 + }, + { + "epoch": 1.6068501170960188, + "grad_norm": 0.9734159111976624, + "learning_rate": 4.187066129546394e-06, + "loss": 0.5729, + "step": 5489 + }, + { + "epoch": 1.6071428571428572, + "grad_norm": 0.9407654404640198, + "learning_rate": 4.186781933539897e-06, + "loss": 0.599, + "step": 5490 + }, + { + "epoch": 1.6074355971896956, + "grad_norm": 1.0407048463821411, + "learning_rate": 4.1864976975141745e-06, + "loss": 0.6343, + "step": 5491 + }, + { + "epoch": 1.607728337236534, + "grad_norm": 0.9784056544303894, + "learning_rate": 4.186213421475968e-06, + "loss": 0.6302, + "step": 5492 + }, + { + "epoch": 1.6080210772833725, + "grad_norm": 0.9759560823440552, + "learning_rate": 4.185929105432027e-06, + "loss": 0.6281, + "step": 5493 + }, + { + "epoch": 1.6083138173302107, + "grad_norm": 0.9960168600082397, + "learning_rate": 4.185644749389092e-06, + "loss": 0.5942, + "step": 5494 + }, + { + "epoch": 1.6086065573770492, + "grad_norm": 0.9948070049285889, + "learning_rate": 4.185360353353912e-06, + "loss": 0.5948, + "step": 5495 + }, + { + "epoch": 1.6088992974238876, + "grad_norm": 0.9799657464027405, + "learning_rate": 4.185075917333234e-06, + "loss": 0.6666, + "step": 5496 + }, + { + "epoch": 1.609192037470726, + "grad_norm": 0.9440536499023438, + "learning_rate": 4.184791441333806e-06, + "loss": 0.6316, + "step": 5497 + }, + { + "epoch": 1.6094847775175642, + "grad_norm": 0.9391979575157166, + "learning_rate": 4.1845069253623784e-06, + "loss": 0.6045, + "step": 5498 + }, + { + "epoch": 1.6097775175644027, + "grad_norm": 0.9533870816230774, + "learning_rate": 4.184222369425699e-06, + "loss": 0.6108, + "step": 5499 + }, + { + "epoch": 1.6100702576112411, + "grad_norm": 0.9149873852729797, + "learning_rate": 4.183937773530521e-06, + "loss": 0.5646, + "step": 5500 + }, + { + "epoch": 1.6103629976580796, + "grad_norm": 0.9457781314849854, + "learning_rate": 4.183653137683596e-06, + "loss": 0.6047, + "step": 5501 + }, + { + "epoch": 1.610655737704918, + "grad_norm": 0.9001069068908691, + "learning_rate": 4.183368461891676e-06, + "loss": 0.6348, + "step": 5502 + }, + { + "epoch": 1.6109484777517564, + "grad_norm": 0.9149500727653503, + "learning_rate": 4.183083746161516e-06, + "loss": 0.5967, + "step": 5503 + }, + { + "epoch": 1.6112412177985949, + "grad_norm": 0.9549123644828796, + "learning_rate": 4.182798990499871e-06, + "loss": 0.6023, + "step": 5504 + }, + { + "epoch": 1.6115339578454333, + "grad_norm": 0.975989043712616, + "learning_rate": 4.182514194913498e-06, + "loss": 0.6608, + "step": 5505 + }, + { + "epoch": 1.6118266978922717, + "grad_norm": 0.9408684968948364, + "learning_rate": 4.18222935940915e-06, + "loss": 0.5962, + "step": 5506 + }, + { + "epoch": 1.6121194379391102, + "grad_norm": 0.9817995429039001, + "learning_rate": 4.1819444839935885e-06, + "loss": 0.5926, + "step": 5507 + }, + { + "epoch": 1.6124121779859486, + "grad_norm": 1.0109424591064453, + "learning_rate": 4.181659568673571e-06, + "loss": 0.6688, + "step": 5508 + }, + { + "epoch": 1.612704918032787, + "grad_norm": 0.9776192307472229, + "learning_rate": 4.1813746134558575e-06, + "loss": 0.603, + "step": 5509 + }, + { + "epoch": 1.6129976580796253, + "grad_norm": 0.9480839967727661, + "learning_rate": 4.181089618347207e-06, + "loss": 0.6279, + "step": 5510 + }, + { + "epoch": 1.6132903981264637, + "grad_norm": 0.9735599756240845, + "learning_rate": 4.180804583354383e-06, + "loss": 0.6312, + "step": 5511 + }, + { + "epoch": 1.6135831381733021, + "grad_norm": 1.0174293518066406, + "learning_rate": 4.180519508484148e-06, + "loss": 0.5851, + "step": 5512 + }, + { + "epoch": 1.6138758782201406, + "grad_norm": 0.9764302968978882, + "learning_rate": 4.180234393743265e-06, + "loss": 0.6121, + "step": 5513 + }, + { + "epoch": 1.6141686182669788, + "grad_norm": 0.9419327974319458, + "learning_rate": 4.179949239138497e-06, + "loss": 0.6067, + "step": 5514 + }, + { + "epoch": 1.6144613583138172, + "grad_norm": 0.9403430223464966, + "learning_rate": 4.179664044676611e-06, + "loss": 0.6191, + "step": 5515 + }, + { + "epoch": 1.6147540983606556, + "grad_norm": 0.9126012325286865, + "learning_rate": 4.1793788103643726e-06, + "loss": 0.5734, + "step": 5516 + }, + { + "epoch": 1.615046838407494, + "grad_norm": 1.0073903799057007, + "learning_rate": 4.17909353620855e-06, + "loss": 0.6087, + "step": 5517 + }, + { + "epoch": 1.6153395784543325, + "grad_norm": 0.9520065784454346, + "learning_rate": 4.17880822221591e-06, + "loss": 0.612, + "step": 5518 + }, + { + "epoch": 1.615632318501171, + "grad_norm": 0.9539020657539368, + "learning_rate": 4.178522868393223e-06, + "loss": 0.5934, + "step": 5519 + }, + { + "epoch": 1.6159250585480094, + "grad_norm": 0.9922879338264465, + "learning_rate": 4.178237474747258e-06, + "loss": 0.6127, + "step": 5520 + }, + { + "epoch": 1.6162177985948478, + "grad_norm": 1.0130990743637085, + "learning_rate": 4.1779520412847865e-06, + "loss": 0.6078, + "step": 5521 + }, + { + "epoch": 1.6165105386416863, + "grad_norm": 0.9644744992256165, + "learning_rate": 4.177666568012581e-06, + "loss": 0.6429, + "step": 5522 + }, + { + "epoch": 1.6168032786885247, + "grad_norm": 0.9803116321563721, + "learning_rate": 4.177381054937413e-06, + "loss": 0.6216, + "step": 5523 + }, + { + "epoch": 1.6170960187353631, + "grad_norm": 0.9549940824508667, + "learning_rate": 4.177095502066058e-06, + "loss": 0.6252, + "step": 5524 + }, + { + "epoch": 1.6173887587822016, + "grad_norm": 0.9446803331375122, + "learning_rate": 4.176809909405288e-06, + "loss": 0.5973, + "step": 5525 + }, + { + "epoch": 1.6176814988290398, + "grad_norm": 0.9400599598884583, + "learning_rate": 4.176524276961883e-06, + "loss": 0.5967, + "step": 5526 + }, + { + "epoch": 1.6179742388758782, + "grad_norm": 0.9706881046295166, + "learning_rate": 4.176238604742617e-06, + "loss": 0.6323, + "step": 5527 + }, + { + "epoch": 1.6182669789227166, + "grad_norm": 0.9023040533065796, + "learning_rate": 4.175952892754268e-06, + "loss": 0.6216, + "step": 5528 + }, + { + "epoch": 1.618559718969555, + "grad_norm": 0.989031195640564, + "learning_rate": 4.175667141003614e-06, + "loss": 0.5973, + "step": 5529 + }, + { + "epoch": 1.6188524590163933, + "grad_norm": 0.9429071545600891, + "learning_rate": 4.175381349497435e-06, + "loss": 0.6432, + "step": 5530 + }, + { + "epoch": 1.6191451990632317, + "grad_norm": 0.9638463258743286, + "learning_rate": 4.175095518242513e-06, + "loss": 0.5774, + "step": 5531 + }, + { + "epoch": 1.6194379391100702, + "grad_norm": 0.9913589358329773, + "learning_rate": 4.174809647245627e-06, + "loss": 0.6584, + "step": 5532 + }, + { + "epoch": 1.6197306791569086, + "grad_norm": 1.0014218091964722, + "learning_rate": 4.1745237365135605e-06, + "loss": 0.6317, + "step": 5533 + }, + { + "epoch": 1.620023419203747, + "grad_norm": 0.946085512638092, + "learning_rate": 4.1742377860530965e-06, + "loss": 0.5636, + "step": 5534 + }, + { + "epoch": 1.6203161592505855, + "grad_norm": 0.9644864797592163, + "learning_rate": 4.17395179587102e-06, + "loss": 0.6087, + "step": 5535 + }, + { + "epoch": 1.620608899297424, + "grad_norm": 0.9819477200508118, + "learning_rate": 4.173665765974116e-06, + "loss": 0.5929, + "step": 5536 + }, + { + "epoch": 1.6209016393442623, + "grad_norm": 0.9784876108169556, + "learning_rate": 4.173379696369169e-06, + "loss": 0.6303, + "step": 5537 + }, + { + "epoch": 1.6211943793911008, + "grad_norm": 0.8980015516281128, + "learning_rate": 4.173093587062967e-06, + "loss": 0.5569, + "step": 5538 + }, + { + "epoch": 1.6214871194379392, + "grad_norm": 0.9754254221916199, + "learning_rate": 4.172807438062299e-06, + "loss": 0.5336, + "step": 5539 + }, + { + "epoch": 1.6217798594847777, + "grad_norm": 0.9595141410827637, + "learning_rate": 4.172521249373952e-06, + "loss": 0.6158, + "step": 5540 + }, + { + "epoch": 1.622072599531616, + "grad_norm": 0.9874184727668762, + "learning_rate": 4.172235021004718e-06, + "loss": 0.6331, + "step": 5541 + }, + { + "epoch": 1.6223653395784543, + "grad_norm": 0.92271888256073, + "learning_rate": 4.171948752961386e-06, + "loss": 0.5861, + "step": 5542 + }, + { + "epoch": 1.6226580796252927, + "grad_norm": 0.9293798208236694, + "learning_rate": 4.17166244525075e-06, + "loss": 0.6197, + "step": 5543 + }, + { + "epoch": 1.6229508196721312, + "grad_norm": 0.9510549902915955, + "learning_rate": 4.171376097879602e-06, + "loss": 0.6059, + "step": 5544 + }, + { + "epoch": 1.6232435597189696, + "grad_norm": 0.9977638125419617, + "learning_rate": 4.171089710854732e-06, + "loss": 0.5905, + "step": 5545 + }, + { + "epoch": 1.6235362997658078, + "grad_norm": 0.9844597578048706, + "learning_rate": 4.17080328418294e-06, + "loss": 0.6306, + "step": 5546 + }, + { + "epoch": 1.6238290398126463, + "grad_norm": 0.9639396667480469, + "learning_rate": 4.170516817871018e-06, + "loss": 0.6121, + "step": 5547 + }, + { + "epoch": 1.6241217798594847, + "grad_norm": 0.9810764193534851, + "learning_rate": 4.170230311925764e-06, + "loss": 0.6521, + "step": 5548 + }, + { + "epoch": 1.6244145199063231, + "grad_norm": 1.0239354372024536, + "learning_rate": 4.1699437663539745e-06, + "loss": 0.5629, + "step": 5549 + }, + { + "epoch": 1.6247072599531616, + "grad_norm": 0.9851817488670349, + "learning_rate": 4.169657181162449e-06, + "loss": 0.6204, + "step": 5550 + }, + { + "epoch": 1.625, + "grad_norm": 0.961105227470398, + "learning_rate": 4.1693705563579855e-06, + "loss": 0.5821, + "step": 5551 + }, + { + "epoch": 1.6252927400468384, + "grad_norm": 0.9992895722389221, + "learning_rate": 4.169083891947385e-06, + "loss": 0.6354, + "step": 5552 + }, + { + "epoch": 1.6255854800936769, + "grad_norm": 1.0184533596038818, + "learning_rate": 4.168797187937449e-06, + "loss": 0.6004, + "step": 5553 + }, + { + "epoch": 1.6258782201405153, + "grad_norm": 0.9815912246704102, + "learning_rate": 4.168510444334978e-06, + "loss": 0.5964, + "step": 5554 + }, + { + "epoch": 1.6261709601873537, + "grad_norm": 0.9528841376304626, + "learning_rate": 4.168223661146777e-06, + "loss": 0.6035, + "step": 5555 + }, + { + "epoch": 1.6264637002341922, + "grad_norm": 0.9744858145713806, + "learning_rate": 4.167936838379649e-06, + "loss": 0.5996, + "step": 5556 + }, + { + "epoch": 1.6267564402810304, + "grad_norm": 0.919823944568634, + "learning_rate": 4.167649976040399e-06, + "loss": 0.5807, + "step": 5557 + }, + { + "epoch": 1.6270491803278688, + "grad_norm": 0.9804245829582214, + "learning_rate": 4.1673630741358325e-06, + "loss": 0.5846, + "step": 5558 + }, + { + "epoch": 1.6273419203747073, + "grad_norm": 0.9274537563323975, + "learning_rate": 4.167076132672757e-06, + "loss": 0.5798, + "step": 5559 + }, + { + "epoch": 1.6276346604215457, + "grad_norm": 1.0080214738845825, + "learning_rate": 4.166789151657981e-06, + "loss": 0.6028, + "step": 5560 + }, + { + "epoch": 1.627927400468384, + "grad_norm": 0.9956586360931396, + "learning_rate": 4.166502131098311e-06, + "loss": 0.6401, + "step": 5561 + }, + { + "epoch": 1.6282201405152223, + "grad_norm": 0.9595084190368652, + "learning_rate": 4.1662150710005575e-06, + "loss": 0.6307, + "step": 5562 + }, + { + "epoch": 1.6285128805620608, + "grad_norm": 0.9168437719345093, + "learning_rate": 4.1659279713715325e-06, + "loss": 0.6005, + "step": 5563 + }, + { + "epoch": 1.6288056206088992, + "grad_norm": 1.0608112812042236, + "learning_rate": 4.165640832218045e-06, + "loss": 0.6193, + "step": 5564 + }, + { + "epoch": 1.6290983606557377, + "grad_norm": 0.9085535407066345, + "learning_rate": 4.16535365354691e-06, + "loss": 0.605, + "step": 5565 + }, + { + "epoch": 1.629391100702576, + "grad_norm": 0.9595838189125061, + "learning_rate": 4.165066435364939e-06, + "loss": 0.6389, + "step": 5566 + }, + { + "epoch": 1.6296838407494145, + "grad_norm": 0.9232372641563416, + "learning_rate": 4.164779177678947e-06, + "loss": 0.6115, + "step": 5567 + }, + { + "epoch": 1.629976580796253, + "grad_norm": 0.9038360118865967, + "learning_rate": 4.164491880495749e-06, + "loss": 0.5893, + "step": 5568 + }, + { + "epoch": 1.6302693208430914, + "grad_norm": 0.958186149597168, + "learning_rate": 4.1642045438221615e-06, + "loss": 0.6408, + "step": 5569 + }, + { + "epoch": 1.6305620608899298, + "grad_norm": 0.9456562995910645, + "learning_rate": 4.1639171676650015e-06, + "loss": 0.5646, + "step": 5570 + }, + { + "epoch": 1.6308548009367683, + "grad_norm": 0.9266447424888611, + "learning_rate": 4.163629752031087e-06, + "loss": 0.5942, + "step": 5571 + }, + { + "epoch": 1.6311475409836067, + "grad_norm": 0.9216094017028809, + "learning_rate": 4.163342296927237e-06, + "loss": 0.5936, + "step": 5572 + }, + { + "epoch": 1.631440281030445, + "grad_norm": 0.9270051121711731, + "learning_rate": 4.163054802360271e-06, + "loss": 0.6037, + "step": 5573 + }, + { + "epoch": 1.6317330210772834, + "grad_norm": 0.9795669913291931, + "learning_rate": 4.162767268337011e-06, + "loss": 0.6105, + "step": 5574 + }, + { + "epoch": 1.6320257611241218, + "grad_norm": 0.9829419851303101, + "learning_rate": 4.1624796948642775e-06, + "loss": 0.6025, + "step": 5575 + }, + { + "epoch": 1.6323185011709602, + "grad_norm": 0.9689444899559021, + "learning_rate": 4.162192081948895e-06, + "loss": 0.6315, + "step": 5576 + }, + { + "epoch": 1.6326112412177984, + "grad_norm": 0.9818074703216553, + "learning_rate": 4.161904429597685e-06, + "loss": 0.6614, + "step": 5577 + }, + { + "epoch": 1.6329039812646369, + "grad_norm": 0.9803730845451355, + "learning_rate": 4.161616737817473e-06, + "loss": 0.6128, + "step": 5578 + }, + { + "epoch": 1.6331967213114753, + "grad_norm": 0.926995575428009, + "learning_rate": 4.1613290066150855e-06, + "loss": 0.6003, + "step": 5579 + }, + { + "epoch": 1.6334894613583137, + "grad_norm": 0.9598695635795593, + "learning_rate": 4.161041235997348e-06, + "loss": 0.6092, + "step": 5580 + }, + { + "epoch": 1.6337822014051522, + "grad_norm": 0.9513311386108398, + "learning_rate": 4.160753425971087e-06, + "loss": 0.6252, + "step": 5581 + }, + { + "epoch": 1.6340749414519906, + "grad_norm": 0.9781410694122314, + "learning_rate": 4.160465576543133e-06, + "loss": 0.5792, + "step": 5582 + }, + { + "epoch": 1.634367681498829, + "grad_norm": 0.9048669338226318, + "learning_rate": 4.160177687720314e-06, + "loss": 0.5835, + "step": 5583 + }, + { + "epoch": 1.6346604215456675, + "grad_norm": 0.945955216884613, + "learning_rate": 4.15988975950946e-06, + "loss": 0.5775, + "step": 5584 + }, + { + "epoch": 1.634953161592506, + "grad_norm": 1.0514464378356934, + "learning_rate": 4.159601791917403e-06, + "loss": 0.6056, + "step": 5585 + }, + { + "epoch": 1.6352459016393444, + "grad_norm": 0.9486498236656189, + "learning_rate": 4.159313784950973e-06, + "loss": 0.6014, + "step": 5586 + }, + { + "epoch": 1.6355386416861828, + "grad_norm": 0.9549001455307007, + "learning_rate": 4.159025738617007e-06, + "loss": 0.5891, + "step": 5587 + }, + { + "epoch": 1.6358313817330212, + "grad_norm": 0.9791409373283386, + "learning_rate": 4.158737652922335e-06, + "loss": 0.5848, + "step": 5588 + }, + { + "epoch": 1.6361241217798594, + "grad_norm": 0.9955450296401978, + "learning_rate": 4.158449527873795e-06, + "loss": 0.623, + "step": 5589 + }, + { + "epoch": 1.6364168618266979, + "grad_norm": 1.0576279163360596, + "learning_rate": 4.15816136347822e-06, + "loss": 0.6525, + "step": 5590 + }, + { + "epoch": 1.6367096018735363, + "grad_norm": 0.9186882972717285, + "learning_rate": 4.157873159742448e-06, + "loss": 0.5731, + "step": 5591 + }, + { + "epoch": 1.6370023419203747, + "grad_norm": 0.9351302981376648, + "learning_rate": 4.157584916673318e-06, + "loss": 0.5679, + "step": 5592 + }, + { + "epoch": 1.637295081967213, + "grad_norm": 0.9460693001747131, + "learning_rate": 4.157296634277667e-06, + "loss": 0.605, + "step": 5593 + }, + { + "epoch": 1.6375878220140514, + "grad_norm": 0.9452774524688721, + "learning_rate": 4.157008312562335e-06, + "loss": 0.6305, + "step": 5594 + }, + { + "epoch": 1.6378805620608898, + "grad_norm": 0.9235080480575562, + "learning_rate": 4.1567199515341614e-06, + "loss": 0.6171, + "step": 5595 + }, + { + "epoch": 1.6381733021077283, + "grad_norm": 0.9075435996055603, + "learning_rate": 4.15643155119999e-06, + "loss": 0.6326, + "step": 5596 + }, + { + "epoch": 1.6384660421545667, + "grad_norm": 0.9390843510627747, + "learning_rate": 4.156143111566661e-06, + "loss": 0.5886, + "step": 5597 + }, + { + "epoch": 1.6387587822014051, + "grad_norm": 0.9560783505439758, + "learning_rate": 4.1558546326410185e-06, + "loss": 0.6105, + "step": 5598 + }, + { + "epoch": 1.6390515222482436, + "grad_norm": 0.9743385314941406, + "learning_rate": 4.155566114429907e-06, + "loss": 0.5682, + "step": 5599 + }, + { + "epoch": 1.639344262295082, + "grad_norm": 0.9386599063873291, + "learning_rate": 4.155277556940171e-06, + "loss": 0.6231, + "step": 5600 + }, + { + "epoch": 1.6396370023419204, + "grad_norm": 0.9535929560661316, + "learning_rate": 4.154988960178658e-06, + "loss": 0.5791, + "step": 5601 + }, + { + "epoch": 1.6399297423887589, + "grad_norm": 0.918840765953064, + "learning_rate": 4.154700324152213e-06, + "loss": 0.6293, + "step": 5602 + }, + { + "epoch": 1.6402224824355973, + "grad_norm": 1.0103074312210083, + "learning_rate": 4.154411648867685e-06, + "loss": 0.596, + "step": 5603 + }, + { + "epoch": 1.6405152224824358, + "grad_norm": 0.9950798749923706, + "learning_rate": 4.154122934331922e-06, + "loss": 0.5798, + "step": 5604 + }, + { + "epoch": 1.640807962529274, + "grad_norm": 0.9825482964515686, + "learning_rate": 4.153834180551776e-06, + "loss": 0.6446, + "step": 5605 + }, + { + "epoch": 1.6411007025761124, + "grad_norm": 0.9484208226203918, + "learning_rate": 4.1535453875340955e-06, + "loss": 0.6249, + "step": 5606 + }, + { + "epoch": 1.6413934426229508, + "grad_norm": 0.9494272470474243, + "learning_rate": 4.153256555285734e-06, + "loss": 0.6177, + "step": 5607 + }, + { + "epoch": 1.6416861826697893, + "grad_norm": 0.9473714828491211, + "learning_rate": 4.152967683813542e-06, + "loss": 0.6256, + "step": 5608 + }, + { + "epoch": 1.6419789227166275, + "grad_norm": 1.0062862634658813, + "learning_rate": 4.152678773124374e-06, + "loss": 0.6193, + "step": 5609 + }, + { + "epoch": 1.642271662763466, + "grad_norm": 0.965005099773407, + "learning_rate": 4.152389823225086e-06, + "loss": 0.6294, + "step": 5610 + }, + { + "epoch": 1.6425644028103044, + "grad_norm": 0.9214213490486145, + "learning_rate": 4.15210083412253e-06, + "loss": 0.6, + "step": 5611 + }, + { + "epoch": 1.6428571428571428, + "grad_norm": 0.9294766187667847, + "learning_rate": 4.151811805823566e-06, + "loss": 0.6573, + "step": 5612 + }, + { + "epoch": 1.6431498829039812, + "grad_norm": 0.950400710105896, + "learning_rate": 4.151522738335049e-06, + "loss": 0.5827, + "step": 5613 + }, + { + "epoch": 1.6434426229508197, + "grad_norm": 0.9983566403388977, + "learning_rate": 4.151233631663838e-06, + "loss": 0.6328, + "step": 5614 + }, + { + "epoch": 1.643735362997658, + "grad_norm": 1.0399993658065796, + "learning_rate": 4.150944485816791e-06, + "loss": 0.6112, + "step": 5615 + }, + { + "epoch": 1.6440281030444965, + "grad_norm": 0.9211300015449524, + "learning_rate": 4.15065530080077e-06, + "loss": 0.621, + "step": 5616 + }, + { + "epoch": 1.644320843091335, + "grad_norm": 0.9775552153587341, + "learning_rate": 4.150366076622634e-06, + "loss": 0.5594, + "step": 5617 + }, + { + "epoch": 1.6446135831381734, + "grad_norm": 0.9660447835922241, + "learning_rate": 4.1500768132892464e-06, + "loss": 0.6372, + "step": 5618 + }, + { + "epoch": 1.6449063231850118, + "grad_norm": 0.9587098360061646, + "learning_rate": 4.149787510807468e-06, + "loss": 0.6275, + "step": 5619 + }, + { + "epoch": 1.6451990632318503, + "grad_norm": 0.954563319683075, + "learning_rate": 4.1494981691841656e-06, + "loss": 0.6401, + "step": 5620 + }, + { + "epoch": 1.6454918032786885, + "grad_norm": 0.9355548024177551, + "learning_rate": 4.149208788426201e-06, + "loss": 0.6127, + "step": 5621 + }, + { + "epoch": 1.645784543325527, + "grad_norm": 1.1049476861953735, + "learning_rate": 4.148919368540442e-06, + "loss": 0.6803, + "step": 5622 + }, + { + "epoch": 1.6460772833723654, + "grad_norm": 0.9760651588439941, + "learning_rate": 4.148629909533754e-06, + "loss": 0.6213, + "step": 5623 + }, + { + "epoch": 1.6463700234192038, + "grad_norm": 1.0304948091506958, + "learning_rate": 4.148340411413005e-06, + "loss": 0.6175, + "step": 5624 + }, + { + "epoch": 1.646662763466042, + "grad_norm": 0.9774481654167175, + "learning_rate": 4.148050874185062e-06, + "loss": 0.6052, + "step": 5625 + }, + { + "epoch": 1.6469555035128804, + "grad_norm": 0.9879711866378784, + "learning_rate": 4.147761297856796e-06, + "loss": 0.6025, + "step": 5626 + }, + { + "epoch": 1.6472482435597189, + "grad_norm": 1.0427857637405396, + "learning_rate": 4.147471682435077e-06, + "loss": 0.6261, + "step": 5627 + }, + { + "epoch": 1.6475409836065573, + "grad_norm": 0.9450214505195618, + "learning_rate": 4.147182027926775e-06, + "loss": 0.5971, + "step": 5628 + }, + { + "epoch": 1.6478337236533958, + "grad_norm": 1.0081013441085815, + "learning_rate": 4.146892334338764e-06, + "loss": 0.629, + "step": 5629 + }, + { + "epoch": 1.6481264637002342, + "grad_norm": 0.956540584564209, + "learning_rate": 4.146602601677915e-06, + "loss": 0.5874, + "step": 5630 + }, + { + "epoch": 1.6484192037470726, + "grad_norm": 0.9433640241622925, + "learning_rate": 4.146312829951103e-06, + "loss": 0.5766, + "step": 5631 + }, + { + "epoch": 1.648711943793911, + "grad_norm": 0.9366849660873413, + "learning_rate": 4.146023019165203e-06, + "loss": 0.6215, + "step": 5632 + }, + { + "epoch": 1.6490046838407495, + "grad_norm": 0.9948446154594421, + "learning_rate": 4.1457331693270906e-06, + "loss": 0.5983, + "step": 5633 + }, + { + "epoch": 1.649297423887588, + "grad_norm": 0.9225346446037292, + "learning_rate": 4.145443280443643e-06, + "loss": 0.6056, + "step": 5634 + }, + { + "epoch": 1.6495901639344264, + "grad_norm": 0.9749168157577515, + "learning_rate": 4.145153352521737e-06, + "loss": 0.601, + "step": 5635 + }, + { + "epoch": 1.6498829039812648, + "grad_norm": 0.9565554261207581, + "learning_rate": 4.144863385568252e-06, + "loss": 0.6159, + "step": 5636 + }, + { + "epoch": 1.650175644028103, + "grad_norm": 0.9603931307792664, + "learning_rate": 4.144573379590067e-06, + "loss": 0.6123, + "step": 5637 + }, + { + "epoch": 1.6504683840749415, + "grad_norm": 0.9295404553413391, + "learning_rate": 4.1442833345940625e-06, + "loss": 0.6083, + "step": 5638 + }, + { + "epoch": 1.6507611241217799, + "grad_norm": 0.955731213092804, + "learning_rate": 4.1439932505871195e-06, + "loss": 0.6001, + "step": 5639 + }, + { + "epoch": 1.651053864168618, + "grad_norm": 0.938983142375946, + "learning_rate": 4.143703127576121e-06, + "loss": 0.6196, + "step": 5640 + }, + { + "epoch": 1.6513466042154565, + "grad_norm": 0.9470329880714417, + "learning_rate": 4.14341296556795e-06, + "loss": 0.6041, + "step": 5641 + }, + { + "epoch": 1.651639344262295, + "grad_norm": 0.9338995814323425, + "learning_rate": 4.143122764569492e-06, + "loss": 0.6068, + "step": 5642 + }, + { + "epoch": 1.6519320843091334, + "grad_norm": 0.9898737072944641, + "learning_rate": 4.142832524587629e-06, + "loss": 0.6427, + "step": 5643 + }, + { + "epoch": 1.6522248243559718, + "grad_norm": 0.9678419232368469, + "learning_rate": 4.142542245629249e-06, + "loss": 0.5777, + "step": 5644 + }, + { + "epoch": 1.6525175644028103, + "grad_norm": 1.020188570022583, + "learning_rate": 4.14225192770124e-06, + "loss": 0.6409, + "step": 5645 + }, + { + "epoch": 1.6528103044496487, + "grad_norm": 0.957899808883667, + "learning_rate": 4.141961570810487e-06, + "loss": 0.6228, + "step": 5646 + }, + { + "epoch": 1.6531030444964872, + "grad_norm": 0.9598512053489685, + "learning_rate": 4.141671174963881e-06, + "loss": 0.6156, + "step": 5647 + }, + { + "epoch": 1.6533957845433256, + "grad_norm": 0.9737558960914612, + "learning_rate": 4.141380740168311e-06, + "loss": 0.639, + "step": 5648 + }, + { + "epoch": 1.653688524590164, + "grad_norm": 0.9970890879631042, + "learning_rate": 4.141090266430667e-06, + "loss": 0.6492, + "step": 5649 + }, + { + "epoch": 1.6539812646370025, + "grad_norm": 0.9354168176651001, + "learning_rate": 4.140799753757842e-06, + "loss": 0.572, + "step": 5650 + }, + { + "epoch": 1.654274004683841, + "grad_norm": 0.9720839858055115, + "learning_rate": 4.140509202156727e-06, + "loss": 0.655, + "step": 5651 + }, + { + "epoch": 1.654566744730679, + "grad_norm": 0.9360138773918152, + "learning_rate": 4.140218611634216e-06, + "loss": 0.5885, + "step": 5652 + }, + { + "epoch": 1.6548594847775175, + "grad_norm": 0.9284921288490295, + "learning_rate": 4.139927982197204e-06, + "loss": 0.5983, + "step": 5653 + }, + { + "epoch": 1.655152224824356, + "grad_norm": 0.9396961331367493, + "learning_rate": 4.139637313852585e-06, + "loss": 0.6282, + "step": 5654 + }, + { + "epoch": 1.6554449648711944, + "grad_norm": 0.9784972667694092, + "learning_rate": 4.139346606607256e-06, + "loss": 0.6355, + "step": 5655 + }, + { + "epoch": 1.6557377049180326, + "grad_norm": 0.9814757108688354, + "learning_rate": 4.139055860468113e-06, + "loss": 0.6009, + "step": 5656 + }, + { + "epoch": 1.656030444964871, + "grad_norm": 0.9557939767837524, + "learning_rate": 4.138765075442055e-06, + "loss": 0.6124, + "step": 5657 + }, + { + "epoch": 1.6563231850117095, + "grad_norm": 0.9914893507957458, + "learning_rate": 4.138474251535982e-06, + "loss": 0.6279, + "step": 5658 + }, + { + "epoch": 1.656615925058548, + "grad_norm": 0.9607000350952148, + "learning_rate": 4.138183388756792e-06, + "loss": 0.6482, + "step": 5659 + }, + { + "epoch": 1.6569086651053864, + "grad_norm": 0.9822984337806702, + "learning_rate": 4.137892487111386e-06, + "loss": 0.5856, + "step": 5660 + }, + { + "epoch": 1.6572014051522248, + "grad_norm": 0.9564976096153259, + "learning_rate": 4.137601546606666e-06, + "loss": 0.6142, + "step": 5661 + }, + { + "epoch": 1.6574941451990632, + "grad_norm": 0.9332866668701172, + "learning_rate": 4.137310567249534e-06, + "loss": 0.557, + "step": 5662 + }, + { + "epoch": 1.6577868852459017, + "grad_norm": 0.9505416750907898, + "learning_rate": 4.137019549046896e-06, + "loss": 0.5793, + "step": 5663 + }, + { + "epoch": 1.6580796252927401, + "grad_norm": 0.9644001126289368, + "learning_rate": 4.136728492005654e-06, + "loss": 0.6324, + "step": 5664 + }, + { + "epoch": 1.6583723653395785, + "grad_norm": 0.9571787118911743, + "learning_rate": 4.136437396132713e-06, + "loss": 0.5539, + "step": 5665 + }, + { + "epoch": 1.658665105386417, + "grad_norm": 0.9537010788917542, + "learning_rate": 4.136146261434981e-06, + "loss": 0.6293, + "step": 5666 + }, + { + "epoch": 1.6589578454332554, + "grad_norm": 0.9031289219856262, + "learning_rate": 4.135855087919365e-06, + "loss": 0.5145, + "step": 5667 + }, + { + "epoch": 1.6592505854800936, + "grad_norm": 0.9959240555763245, + "learning_rate": 4.135563875592773e-06, + "loss": 0.6176, + "step": 5668 + }, + { + "epoch": 1.659543325526932, + "grad_norm": 0.9716666340827942, + "learning_rate": 4.135272624462113e-06, + "loss": 0.6188, + "step": 5669 + }, + { + "epoch": 1.6598360655737705, + "grad_norm": 0.9562145471572876, + "learning_rate": 4.134981334534296e-06, + "loss": 0.58, + "step": 5670 + }, + { + "epoch": 1.660128805620609, + "grad_norm": 0.9885667562484741, + "learning_rate": 4.134690005816233e-06, + "loss": 0.592, + "step": 5671 + }, + { + "epoch": 1.6604215456674472, + "grad_norm": 0.9552138447761536, + "learning_rate": 4.134398638314836e-06, + "loss": 0.624, + "step": 5672 + }, + { + "epoch": 1.6607142857142856, + "grad_norm": 0.9376431703567505, + "learning_rate": 4.134107232037017e-06, + "loss": 0.605, + "step": 5673 + }, + { + "epoch": 1.661007025761124, + "grad_norm": 0.8901010155677795, + "learning_rate": 4.13381578698969e-06, + "loss": 0.57, + "step": 5674 + }, + { + "epoch": 1.6612997658079625, + "grad_norm": 0.967875063419342, + "learning_rate": 4.133524303179769e-06, + "loss": 0.5657, + "step": 5675 + }, + { + "epoch": 1.661592505854801, + "grad_norm": 1.0120233297348022, + "learning_rate": 4.1332327806141704e-06, + "loss": 0.6695, + "step": 5676 + }, + { + "epoch": 1.6618852459016393, + "grad_norm": 0.955199658870697, + "learning_rate": 4.13294121929981e-06, + "loss": 0.595, + "step": 5677 + }, + { + "epoch": 1.6621779859484778, + "grad_norm": 0.9717886447906494, + "learning_rate": 4.132649619243606e-06, + "loss": 0.622, + "step": 5678 + }, + { + "epoch": 1.6624707259953162, + "grad_norm": 0.9656528830528259, + "learning_rate": 4.132357980452476e-06, + "loss": 0.6393, + "step": 5679 + }, + { + "epoch": 1.6627634660421546, + "grad_norm": 0.9773728847503662, + "learning_rate": 4.132066302933339e-06, + "loss": 0.6215, + "step": 5680 + }, + { + "epoch": 1.663056206088993, + "grad_norm": 0.9761949777603149, + "learning_rate": 4.131774586693116e-06, + "loss": 0.5961, + "step": 5681 + }, + { + "epoch": 1.6633489461358315, + "grad_norm": 0.9775042533874512, + "learning_rate": 4.1314828317387265e-06, + "loss": 0.607, + "step": 5682 + }, + { + "epoch": 1.66364168618267, + "grad_norm": 0.9960785508155823, + "learning_rate": 4.131191038077094e-06, + "loss": 0.6621, + "step": 5683 + }, + { + "epoch": 1.6639344262295082, + "grad_norm": 1.02555251121521, + "learning_rate": 4.130899205715141e-06, + "loss": 0.6411, + "step": 5684 + }, + { + "epoch": 1.6642271662763466, + "grad_norm": 0.975101113319397, + "learning_rate": 4.130607334659791e-06, + "loss": 0.5864, + "step": 5685 + }, + { + "epoch": 1.664519906323185, + "grad_norm": 0.9680256247520447, + "learning_rate": 4.130315424917968e-06, + "loss": 0.6165, + "step": 5686 + }, + { + "epoch": 1.6648126463700235, + "grad_norm": 0.9447482824325562, + "learning_rate": 4.130023476496599e-06, + "loss": 0.6107, + "step": 5687 + }, + { + "epoch": 1.6651053864168617, + "grad_norm": 0.9577952027320862, + "learning_rate": 4.129731489402609e-06, + "loss": 0.635, + "step": 5688 + }, + { + "epoch": 1.6653981264637001, + "grad_norm": 0.9460830092430115, + "learning_rate": 4.129439463642927e-06, + "loss": 0.5906, + "step": 5689 + }, + { + "epoch": 1.6656908665105385, + "grad_norm": 0.9584749937057495, + "learning_rate": 4.12914739922448e-06, + "loss": 0.5677, + "step": 5690 + }, + { + "epoch": 1.665983606557377, + "grad_norm": 0.9165685772895813, + "learning_rate": 4.128855296154198e-06, + "loss": 0.5632, + "step": 5691 + }, + { + "epoch": 1.6662763466042154, + "grad_norm": 0.9663140177726746, + "learning_rate": 4.128563154439013e-06, + "loss": 0.6095, + "step": 5692 + }, + { + "epoch": 1.6665690866510539, + "grad_norm": 0.9949911832809448, + "learning_rate": 4.128270974085853e-06, + "loss": 0.6166, + "step": 5693 + }, + { + "epoch": 1.6668618266978923, + "grad_norm": 0.9077321290969849, + "learning_rate": 4.12797875510165e-06, + "loss": 0.6026, + "step": 5694 + }, + { + "epoch": 1.6671545667447307, + "grad_norm": 0.9369482398033142, + "learning_rate": 4.127686497493341e-06, + "loss": 0.5963, + "step": 5695 + }, + { + "epoch": 1.6674473067915692, + "grad_norm": 0.9686824083328247, + "learning_rate": 4.127394201267855e-06, + "loss": 0.6198, + "step": 5696 + }, + { + "epoch": 1.6677400468384076, + "grad_norm": 0.9782867431640625, + "learning_rate": 4.1271018664321296e-06, + "loss": 0.5904, + "step": 5697 + }, + { + "epoch": 1.668032786885246, + "grad_norm": 0.9494737386703491, + "learning_rate": 4.1268094929931e-06, + "loss": 0.6468, + "step": 5698 + }, + { + "epoch": 1.6683255269320845, + "grad_norm": 1.0013993978500366, + "learning_rate": 4.1265170809577015e-06, + "loss": 0.6252, + "step": 5699 + }, + { + "epoch": 1.6686182669789227, + "grad_norm": 0.9298804998397827, + "learning_rate": 4.126224630332874e-06, + "loss": 0.5703, + "step": 5700 + }, + { + "epoch": 1.6689110070257611, + "grad_norm": 0.9337966442108154, + "learning_rate": 4.125932141125554e-06, + "loss": 0.5586, + "step": 5701 + }, + { + "epoch": 1.6692037470725996, + "grad_norm": 0.9807398319244385, + "learning_rate": 4.125639613342682e-06, + "loss": 0.6436, + "step": 5702 + }, + { + "epoch": 1.669496487119438, + "grad_norm": 0.955716609954834, + "learning_rate": 4.125347046991198e-06, + "loss": 0.6368, + "step": 5703 + }, + { + "epoch": 1.6697892271662762, + "grad_norm": 1.0319017171859741, + "learning_rate": 4.125054442078043e-06, + "loss": 0.6426, + "step": 5704 + }, + { + "epoch": 1.6700819672131146, + "grad_norm": 0.9330281615257263, + "learning_rate": 4.124761798610159e-06, + "loss": 0.5695, + "step": 5705 + }, + { + "epoch": 1.670374707259953, + "grad_norm": 0.970100462436676, + "learning_rate": 4.1244691165944895e-06, + "loss": 0.6359, + "step": 5706 + }, + { + "epoch": 1.6706674473067915, + "grad_norm": 0.9792181849479675, + "learning_rate": 4.124176396037977e-06, + "loss": 0.6208, + "step": 5707 + }, + { + "epoch": 1.67096018735363, + "grad_norm": 1.001984715461731, + "learning_rate": 4.123883636947569e-06, + "loss": 0.6348, + "step": 5708 + }, + { + "epoch": 1.6712529274004684, + "grad_norm": 0.9851574301719666, + "learning_rate": 4.123590839330209e-06, + "loss": 0.5807, + "step": 5709 + }, + { + "epoch": 1.6715456674473068, + "grad_norm": 0.9373533725738525, + "learning_rate": 4.123298003192844e-06, + "loss": 0.6575, + "step": 5710 + }, + { + "epoch": 1.6718384074941453, + "grad_norm": 0.9328571557998657, + "learning_rate": 4.123005128542422e-06, + "loss": 0.5778, + "step": 5711 + }, + { + "epoch": 1.6721311475409837, + "grad_norm": 1.0167261362075806, + "learning_rate": 4.122712215385892e-06, + "loss": 0.5941, + "step": 5712 + }, + { + "epoch": 1.6724238875878221, + "grad_norm": 0.9779118895530701, + "learning_rate": 4.122419263730203e-06, + "loss": 0.6074, + "step": 5713 + }, + { + "epoch": 1.6727166276346606, + "grad_norm": 0.9663853049278259, + "learning_rate": 4.122126273582305e-06, + "loss": 0.6211, + "step": 5714 + }, + { + "epoch": 1.673009367681499, + "grad_norm": 0.9429110884666443, + "learning_rate": 4.12183324494915e-06, + "loss": 0.5564, + "step": 5715 + }, + { + "epoch": 1.6733021077283372, + "grad_norm": 0.9417654871940613, + "learning_rate": 4.12154017783769e-06, + "loss": 0.5812, + "step": 5716 + }, + { + "epoch": 1.6735948477751756, + "grad_norm": 1.0702897310256958, + "learning_rate": 4.121247072254876e-06, + "loss": 0.6219, + "step": 5717 + }, + { + "epoch": 1.673887587822014, + "grad_norm": 0.9932584166526794, + "learning_rate": 4.120953928207665e-06, + "loss": 0.6095, + "step": 5718 + }, + { + "epoch": 1.6741803278688525, + "grad_norm": 1.0237438678741455, + "learning_rate": 4.120660745703011e-06, + "loss": 0.6195, + "step": 5719 + }, + { + "epoch": 1.6744730679156907, + "grad_norm": 0.9635059833526611, + "learning_rate": 4.120367524747868e-06, + "loss": 0.5705, + "step": 5720 + }, + { + "epoch": 1.6747658079625292, + "grad_norm": 0.9656919240951538, + "learning_rate": 4.120074265349195e-06, + "loss": 0.614, + "step": 5721 + }, + { + "epoch": 1.6750585480093676, + "grad_norm": 1.0042802095413208, + "learning_rate": 4.119780967513949e-06, + "loss": 0.6022, + "step": 5722 + }, + { + "epoch": 1.675351288056206, + "grad_norm": 1.0449175834655762, + "learning_rate": 4.1194876312490885e-06, + "loss": 0.6267, + "step": 5723 + }, + { + "epoch": 1.6756440281030445, + "grad_norm": 0.9940829873085022, + "learning_rate": 4.1191942565615725e-06, + "loss": 0.6069, + "step": 5724 + }, + { + "epoch": 1.675936768149883, + "grad_norm": 0.9455961585044861, + "learning_rate": 4.118900843458362e-06, + "loss": 0.5974, + "step": 5725 + }, + { + "epoch": 1.6762295081967213, + "grad_norm": 0.9287218451499939, + "learning_rate": 4.118607391946418e-06, + "loss": 0.5839, + "step": 5726 + }, + { + "epoch": 1.6765222482435598, + "grad_norm": 0.9504856467247009, + "learning_rate": 4.118313902032701e-06, + "loss": 0.6228, + "step": 5727 + }, + { + "epoch": 1.6768149882903982, + "grad_norm": 0.9640789031982422, + "learning_rate": 4.118020373724178e-06, + "loss": 0.6094, + "step": 5728 + }, + { + "epoch": 1.6771077283372366, + "grad_norm": 0.9482638835906982, + "learning_rate": 4.11772680702781e-06, + "loss": 0.5995, + "step": 5729 + }, + { + "epoch": 1.677400468384075, + "grad_norm": 0.9333581328392029, + "learning_rate": 4.117433201950562e-06, + "loss": 0.5999, + "step": 5730 + }, + { + "epoch": 1.6776932084309133, + "grad_norm": 0.9958605170249939, + "learning_rate": 4.117139558499401e-06, + "loss": 0.609, + "step": 5731 + }, + { + "epoch": 1.6779859484777517, + "grad_norm": 0.9242718815803528, + "learning_rate": 4.116845876681293e-06, + "loss": 0.6041, + "step": 5732 + }, + { + "epoch": 1.6782786885245902, + "grad_norm": 0.9227412343025208, + "learning_rate": 4.116552156503206e-06, + "loss": 0.5921, + "step": 5733 + }, + { + "epoch": 1.6785714285714286, + "grad_norm": 0.9635947942733765, + "learning_rate": 4.11625839797211e-06, + "loss": 0.6045, + "step": 5734 + }, + { + "epoch": 1.6788641686182668, + "grad_norm": 0.9551240801811218, + "learning_rate": 4.115964601094972e-06, + "loss": 0.6336, + "step": 5735 + }, + { + "epoch": 1.6791569086651053, + "grad_norm": 1.049165964126587, + "learning_rate": 4.115670765878762e-06, + "loss": 0.5858, + "step": 5736 + }, + { + "epoch": 1.6794496487119437, + "grad_norm": 0.9990831613540649, + "learning_rate": 4.115376892330454e-06, + "loss": 0.6482, + "step": 5737 + }, + { + "epoch": 1.6797423887587821, + "grad_norm": 1.0634711980819702, + "learning_rate": 4.1150829804570195e-06, + "loss": 0.6148, + "step": 5738 + }, + { + "epoch": 1.6800351288056206, + "grad_norm": 1.0123624801635742, + "learning_rate": 4.11478903026543e-06, + "loss": 0.6322, + "step": 5739 + }, + { + "epoch": 1.680327868852459, + "grad_norm": 0.987663745880127, + "learning_rate": 4.11449504176266e-06, + "loss": 0.6195, + "step": 5740 + }, + { + "epoch": 1.6806206088992974, + "grad_norm": 0.9239993095397949, + "learning_rate": 4.114201014955687e-06, + "loss": 0.6026, + "step": 5741 + }, + { + "epoch": 1.6809133489461359, + "grad_norm": 0.9834372997283936, + "learning_rate": 4.113906949851483e-06, + "loss": 0.6052, + "step": 5742 + }, + { + "epoch": 1.6812060889929743, + "grad_norm": 1.027593970298767, + "learning_rate": 4.113612846457027e-06, + "loss": 0.5947, + "step": 5743 + }, + { + "epoch": 1.6814988290398127, + "grad_norm": 0.960038959980011, + "learning_rate": 4.113318704779296e-06, + "loss": 0.6291, + "step": 5744 + }, + { + "epoch": 1.6817915690866512, + "grad_norm": 0.9442440867424011, + "learning_rate": 4.113024524825269e-06, + "loss": 0.605, + "step": 5745 + }, + { + "epoch": 1.6820843091334896, + "grad_norm": 0.9299863576889038, + "learning_rate": 4.112730306601925e-06, + "loss": 0.6063, + "step": 5746 + }, + { + "epoch": 1.6823770491803278, + "grad_norm": 0.9979293942451477, + "learning_rate": 4.112436050116245e-06, + "loss": 0.6179, + "step": 5747 + }, + { + "epoch": 1.6826697892271663, + "grad_norm": 0.9959329962730408, + "learning_rate": 4.11214175537521e-06, + "loss": 0.6398, + "step": 5748 + }, + { + "epoch": 1.6829625292740047, + "grad_norm": 0.9436835646629333, + "learning_rate": 4.111847422385803e-06, + "loss": 0.6131, + "step": 5749 + }, + { + "epoch": 1.6832552693208431, + "grad_norm": 0.9048517346382141, + "learning_rate": 4.111553051155004e-06, + "loss": 0.5888, + "step": 5750 + }, + { + "epoch": 1.6835480093676813, + "grad_norm": 0.9277451634407043, + "learning_rate": 4.111258641689801e-06, + "loss": 0.5939, + "step": 5751 + }, + { + "epoch": 1.6838407494145198, + "grad_norm": 0.9653424620628357, + "learning_rate": 4.1109641939971764e-06, + "loss": 0.6152, + "step": 5752 + }, + { + "epoch": 1.6841334894613582, + "grad_norm": 0.9412299990653992, + "learning_rate": 4.110669708084116e-06, + "loss": 0.5687, + "step": 5753 + }, + { + "epoch": 1.6844262295081966, + "grad_norm": 0.9954588413238525, + "learning_rate": 4.11037518395761e-06, + "loss": 0.6247, + "step": 5754 + }, + { + "epoch": 1.684718969555035, + "grad_norm": 0.9324162602424622, + "learning_rate": 4.110080621624642e-06, + "loss": 0.5972, + "step": 5755 + }, + { + "epoch": 1.6850117096018735, + "grad_norm": 0.9407853484153748, + "learning_rate": 4.1097860210922e-06, + "loss": 0.6423, + "step": 5756 + }, + { + "epoch": 1.685304449648712, + "grad_norm": 0.9676459431648254, + "learning_rate": 4.109491382367277e-06, + "loss": 0.628, + "step": 5757 + }, + { + "epoch": 1.6855971896955504, + "grad_norm": 1.0007212162017822, + "learning_rate": 4.109196705456863e-06, + "loss": 0.67, + "step": 5758 + }, + { + "epoch": 1.6858899297423888, + "grad_norm": 0.9299372434616089, + "learning_rate": 4.1089019903679464e-06, + "loss": 0.5868, + "step": 5759 + }, + { + "epoch": 1.6861826697892273, + "grad_norm": 1.015102744102478, + "learning_rate": 4.108607237107521e-06, + "loss": 0.5705, + "step": 5760 + }, + { + "epoch": 1.6864754098360657, + "grad_norm": 0.9494650959968567, + "learning_rate": 4.108312445682581e-06, + "loss": 0.6067, + "step": 5761 + }, + { + "epoch": 1.6867681498829041, + "grad_norm": 0.9533968567848206, + "learning_rate": 4.1080176161001174e-06, + "loss": 0.6539, + "step": 5762 + }, + { + "epoch": 1.6870608899297423, + "grad_norm": 1.003609299659729, + "learning_rate": 4.107722748367128e-06, + "loss": 0.6365, + "step": 5763 + }, + { + "epoch": 1.6873536299765808, + "grad_norm": 0.9576236605644226, + "learning_rate": 4.107427842490608e-06, + "loss": 0.6418, + "step": 5764 + }, + { + "epoch": 1.6876463700234192, + "grad_norm": 0.9578040838241577, + "learning_rate": 4.107132898477553e-06, + "loss": 0.6081, + "step": 5765 + }, + { + "epoch": 1.6879391100702577, + "grad_norm": 1.0073800086975098, + "learning_rate": 4.106837916334962e-06, + "loss": 0.6236, + "step": 5766 + }, + { + "epoch": 1.6882318501170959, + "grad_norm": 0.935753583908081, + "learning_rate": 4.106542896069832e-06, + "loss": 0.583, + "step": 5767 + }, + { + "epoch": 1.6885245901639343, + "grad_norm": 0.960622251033783, + "learning_rate": 4.1062478376891625e-06, + "loss": 0.5857, + "step": 5768 + }, + { + "epoch": 1.6888173302107727, + "grad_norm": 1.0061814785003662, + "learning_rate": 4.105952741199955e-06, + "loss": 0.6221, + "step": 5769 + }, + { + "epoch": 1.6891100702576112, + "grad_norm": 0.9534651041030884, + "learning_rate": 4.10565760660921e-06, + "loss": 0.6343, + "step": 5770 + }, + { + "epoch": 1.6894028103044496, + "grad_norm": 0.9615468978881836, + "learning_rate": 4.105362433923931e-06, + "loss": 0.5789, + "step": 5771 + }, + { + "epoch": 1.689695550351288, + "grad_norm": 0.966310441493988, + "learning_rate": 4.1050672231511175e-06, + "loss": 0.6157, + "step": 5772 + }, + { + "epoch": 1.6899882903981265, + "grad_norm": 0.9588844180107117, + "learning_rate": 4.104771974297778e-06, + "loss": 0.6044, + "step": 5773 + }, + { + "epoch": 1.690281030444965, + "grad_norm": 0.962428867816925, + "learning_rate": 4.104476687370914e-06, + "loss": 0.5995, + "step": 5774 + }, + { + "epoch": 1.6905737704918034, + "grad_norm": 0.9499881267547607, + "learning_rate": 4.104181362377533e-06, + "loss": 0.631, + "step": 5775 + }, + { + "epoch": 1.6908665105386418, + "grad_norm": 0.9188812375068665, + "learning_rate": 4.10388599932464e-06, + "loss": 0.6024, + "step": 5776 + }, + { + "epoch": 1.6911592505854802, + "grad_norm": 0.9330126643180847, + "learning_rate": 4.103590598219245e-06, + "loss": 0.6137, + "step": 5777 + }, + { + "epoch": 1.6914519906323187, + "grad_norm": 0.9447015523910522, + "learning_rate": 4.103295159068353e-06, + "loss": 0.6001, + "step": 5778 + }, + { + "epoch": 1.6917447306791569, + "grad_norm": 1.0006519556045532, + "learning_rate": 4.1029996818789775e-06, + "loss": 0.6289, + "step": 5779 + }, + { + "epoch": 1.6920374707259953, + "grad_norm": 1.0250465869903564, + "learning_rate": 4.102704166658125e-06, + "loss": 0.6252, + "step": 5780 + }, + { + "epoch": 1.6923302107728337, + "grad_norm": 0.9696431756019592, + "learning_rate": 4.102408613412809e-06, + "loss": 0.6317, + "step": 5781 + }, + { + "epoch": 1.6926229508196722, + "grad_norm": 0.9362086057662964, + "learning_rate": 4.10211302215004e-06, + "loss": 0.5902, + "step": 5782 + }, + { + "epoch": 1.6929156908665104, + "grad_norm": 0.9839134812355042, + "learning_rate": 4.101817392876832e-06, + "loss": 0.5933, + "step": 5783 + }, + { + "epoch": 1.6932084309133488, + "grad_norm": 0.9481446743011475, + "learning_rate": 4.1015217256001995e-06, + "loss": 0.6095, + "step": 5784 + }, + { + "epoch": 1.6935011709601873, + "grad_norm": 0.9762458205223083, + "learning_rate": 4.101226020327156e-06, + "loss": 0.6376, + "step": 5785 + }, + { + "epoch": 1.6937939110070257, + "grad_norm": 0.9966129064559937, + "learning_rate": 4.100930277064717e-06, + "loss": 0.6441, + "step": 5786 + }, + { + "epoch": 1.6940866510538641, + "grad_norm": 0.9905878305435181, + "learning_rate": 4.1006344958199e-06, + "loss": 0.6497, + "step": 5787 + }, + { + "epoch": 1.6943793911007026, + "grad_norm": 0.9795395135879517, + "learning_rate": 4.100338676599722e-06, + "loss": 0.6216, + "step": 5788 + }, + { + "epoch": 1.694672131147541, + "grad_norm": 0.933814525604248, + "learning_rate": 4.100042819411202e-06, + "loss": 0.6097, + "step": 5789 + }, + { + "epoch": 1.6949648711943794, + "grad_norm": 0.9718238711357117, + "learning_rate": 4.099746924261357e-06, + "loss": 0.5971, + "step": 5790 + }, + { + "epoch": 1.6952576112412179, + "grad_norm": 1.0495082139968872, + "learning_rate": 4.09945099115721e-06, + "loss": 0.6459, + "step": 5791 + }, + { + "epoch": 1.6955503512880563, + "grad_norm": 0.9576998353004456, + "learning_rate": 4.099155020105782e-06, + "loss": 0.6425, + "step": 5792 + }, + { + "epoch": 1.6958430913348947, + "grad_norm": 0.9986475706100464, + "learning_rate": 4.098859011114092e-06, + "loss": 0.6454, + "step": 5793 + }, + { + "epoch": 1.6961358313817332, + "grad_norm": 0.9483119249343872, + "learning_rate": 4.098562964189167e-06, + "loss": 0.594, + "step": 5794 + }, + { + "epoch": 1.6964285714285714, + "grad_norm": 1.026802659034729, + "learning_rate": 4.098266879338026e-06, + "loss": 0.6511, + "step": 5795 + }, + { + "epoch": 1.6967213114754098, + "grad_norm": 0.9605022668838501, + "learning_rate": 4.097970756567697e-06, + "loss": 0.6021, + "step": 5796 + }, + { + "epoch": 1.6970140515222483, + "grad_norm": 0.9465847611427307, + "learning_rate": 4.097674595885206e-06, + "loss": 0.6093, + "step": 5797 + }, + { + "epoch": 1.6973067915690867, + "grad_norm": 0.9255883693695068, + "learning_rate": 4.097378397297577e-06, + "loss": 0.6168, + "step": 5798 + }, + { + "epoch": 1.697599531615925, + "grad_norm": 0.9389970302581787, + "learning_rate": 4.097082160811839e-06, + "loss": 0.5987, + "step": 5799 + }, + { + "epoch": 1.6978922716627634, + "grad_norm": 0.977018415927887, + "learning_rate": 4.096785886435021e-06, + "loss": 0.6092, + "step": 5800 + }, + { + "epoch": 1.6981850117096018, + "grad_norm": 0.959189236164093, + "learning_rate": 4.096489574174149e-06, + "loss": 0.5543, + "step": 5801 + }, + { + "epoch": 1.6984777517564402, + "grad_norm": 0.9449817538261414, + "learning_rate": 4.096193224036257e-06, + "loss": 0.6108, + "step": 5802 + }, + { + "epoch": 1.6987704918032787, + "grad_norm": 0.9365825653076172, + "learning_rate": 4.095896836028373e-06, + "loss": 0.59, + "step": 5803 + }, + { + "epoch": 1.699063231850117, + "grad_norm": 0.9396324157714844, + "learning_rate": 4.09560041015753e-06, + "loss": 0.5894, + "step": 5804 + }, + { + "epoch": 1.6993559718969555, + "grad_norm": 0.9673705101013184, + "learning_rate": 4.095303946430761e-06, + "loss": 0.6011, + "step": 5805 + }, + { + "epoch": 1.699648711943794, + "grad_norm": 0.9430652856826782, + "learning_rate": 4.095007444855099e-06, + "loss": 0.6045, + "step": 5806 + }, + { + "epoch": 1.6999414519906324, + "grad_norm": 0.9884112477302551, + "learning_rate": 4.09471090543758e-06, + "loss": 0.624, + "step": 5807 + }, + { + "epoch": 1.7002341920374708, + "grad_norm": 0.9795242547988892, + "learning_rate": 4.094414328185238e-06, + "loss": 0.6098, + "step": 5808 + }, + { + "epoch": 1.7005269320843093, + "grad_norm": 0.9076258540153503, + "learning_rate": 4.094117713105109e-06, + "loss": 0.5904, + "step": 5809 + }, + { + "epoch": 1.7008196721311475, + "grad_norm": 0.9913195967674255, + "learning_rate": 4.093821060204232e-06, + "loss": 0.6356, + "step": 5810 + }, + { + "epoch": 1.701112412177986, + "grad_norm": 0.9492698907852173, + "learning_rate": 4.093524369489643e-06, + "loss": 0.6246, + "step": 5811 + }, + { + "epoch": 1.7014051522248244, + "grad_norm": 0.9564664363861084, + "learning_rate": 4.093227640968383e-06, + "loss": 0.6427, + "step": 5812 + }, + { + "epoch": 1.7016978922716628, + "grad_norm": 0.9716411232948303, + "learning_rate": 4.092930874647491e-06, + "loss": 0.6281, + "step": 5813 + }, + { + "epoch": 1.701990632318501, + "grad_norm": 0.9693247675895691, + "learning_rate": 4.092634070534007e-06, + "loss": 0.638, + "step": 5814 + }, + { + "epoch": 1.7022833723653394, + "grad_norm": 0.9498764872550964, + "learning_rate": 4.0923372286349746e-06, + "loss": 0.6076, + "step": 5815 + }, + { + "epoch": 1.7025761124121779, + "grad_norm": 0.9185755848884583, + "learning_rate": 4.0920403489574354e-06, + "loss": 0.5967, + "step": 5816 + }, + { + "epoch": 1.7028688524590163, + "grad_norm": 0.9584873914718628, + "learning_rate": 4.091743431508434e-06, + "loss": 0.6166, + "step": 5817 + }, + { + "epoch": 1.7031615925058547, + "grad_norm": 0.9376382827758789, + "learning_rate": 4.091446476295012e-06, + "loss": 0.5923, + "step": 5818 + }, + { + "epoch": 1.7034543325526932, + "grad_norm": 0.9827524423599243, + "learning_rate": 4.0911494833242185e-06, + "loss": 0.6168, + "step": 5819 + }, + { + "epoch": 1.7037470725995316, + "grad_norm": 0.9306148886680603, + "learning_rate": 4.090852452603097e-06, + "loss": 0.5995, + "step": 5820 + }, + { + "epoch": 1.70403981264637, + "grad_norm": 1.0094540119171143, + "learning_rate": 4.090555384138695e-06, + "loss": 0.6481, + "step": 5821 + }, + { + "epoch": 1.7043325526932085, + "grad_norm": 0.9609090685844421, + "learning_rate": 4.090258277938061e-06, + "loss": 0.5997, + "step": 5822 + }, + { + "epoch": 1.704625292740047, + "grad_norm": 0.9707977771759033, + "learning_rate": 4.089961134008245e-06, + "loss": 0.6089, + "step": 5823 + }, + { + "epoch": 1.7049180327868854, + "grad_norm": 0.9859792590141296, + "learning_rate": 4.089663952356294e-06, + "loss": 0.5957, + "step": 5824 + }, + { + "epoch": 1.7052107728337238, + "grad_norm": 0.9584439992904663, + "learning_rate": 4.089366732989261e-06, + "loss": 0.5633, + "step": 5825 + }, + { + "epoch": 1.705503512880562, + "grad_norm": 0.9674412608146667, + "learning_rate": 4.089069475914197e-06, + "loss": 0.5987, + "step": 5826 + }, + { + "epoch": 1.7057962529274004, + "grad_norm": 0.9792816042900085, + "learning_rate": 4.088772181138154e-06, + "loss": 0.6469, + "step": 5827 + }, + { + "epoch": 1.7060889929742389, + "grad_norm": 0.9608473777770996, + "learning_rate": 4.088474848668186e-06, + "loss": 0.5811, + "step": 5828 + }, + { + "epoch": 1.7063817330210773, + "grad_norm": 0.9909663200378418, + "learning_rate": 4.088177478511348e-06, + "loss": 0.6368, + "step": 5829 + }, + { + "epoch": 1.7066744730679155, + "grad_norm": 1.0089884996414185, + "learning_rate": 4.087880070674693e-06, + "loss": 0.5985, + "step": 5830 + }, + { + "epoch": 1.706967213114754, + "grad_norm": 0.9717596173286438, + "learning_rate": 4.087582625165279e-06, + "loss": 0.6217, + "step": 5831 + }, + { + "epoch": 1.7072599531615924, + "grad_norm": 0.9805733561515808, + "learning_rate": 4.087285141990161e-06, + "loss": 0.5573, + "step": 5832 + }, + { + "epoch": 1.7075526932084308, + "grad_norm": 0.9778563380241394, + "learning_rate": 4.0869876211563994e-06, + "loss": 0.5917, + "step": 5833 + }, + { + "epoch": 1.7078454332552693, + "grad_norm": 0.9696106910705566, + "learning_rate": 4.086690062671051e-06, + "loss": 0.6194, + "step": 5834 + }, + { + "epoch": 1.7081381733021077, + "grad_norm": 0.9455929398536682, + "learning_rate": 4.086392466541176e-06, + "loss": 0.6324, + "step": 5835 + }, + { + "epoch": 1.7084309133489461, + "grad_norm": 0.9595345854759216, + "learning_rate": 4.0860948327738345e-06, + "loss": 0.6283, + "step": 5836 + }, + { + "epoch": 1.7087236533957846, + "grad_norm": 0.9726764559745789, + "learning_rate": 4.0857971613760895e-06, + "loss": 0.6325, + "step": 5837 + }, + { + "epoch": 1.709016393442623, + "grad_norm": 0.981269121170044, + "learning_rate": 4.085499452355002e-06, + "loss": 0.62, + "step": 5838 + }, + { + "epoch": 1.7093091334894615, + "grad_norm": 1.0362160205841064, + "learning_rate": 4.085201705717634e-06, + "loss": 0.577, + "step": 5839 + }, + { + "epoch": 1.7096018735362999, + "grad_norm": 0.9804062247276306, + "learning_rate": 4.084903921471053e-06, + "loss": 0.5952, + "step": 5840 + }, + { + "epoch": 1.7098946135831383, + "grad_norm": 1.0013686418533325, + "learning_rate": 4.08460609962232e-06, + "loss": 0.595, + "step": 5841 + }, + { + "epoch": 1.7101873536299765, + "grad_norm": 0.9596145153045654, + "learning_rate": 4.084308240178504e-06, + "loss": 0.5934, + "step": 5842 + }, + { + "epoch": 1.710480093676815, + "grad_norm": 0.9663678407669067, + "learning_rate": 4.084010343146671e-06, + "loss": 0.6074, + "step": 5843 + }, + { + "epoch": 1.7107728337236534, + "grad_norm": 0.9575388431549072, + "learning_rate": 4.083712408533888e-06, + "loss": 0.6055, + "step": 5844 + }, + { + "epoch": 1.7110655737704918, + "grad_norm": 1.028008222579956, + "learning_rate": 4.0834144363472225e-06, + "loss": 0.6201, + "step": 5845 + }, + { + "epoch": 1.71135831381733, + "grad_norm": 0.9597690105438232, + "learning_rate": 4.083116426593746e-06, + "loss": 0.621, + "step": 5846 + }, + { + "epoch": 1.7116510538641685, + "grad_norm": 0.9604100584983826, + "learning_rate": 4.082818379280529e-06, + "loss": 0.621, + "step": 5847 + }, + { + "epoch": 1.711943793911007, + "grad_norm": 0.9527300596237183, + "learning_rate": 4.082520294414642e-06, + "loss": 0.5833, + "step": 5848 + }, + { + "epoch": 1.7122365339578454, + "grad_norm": 0.9666553139686584, + "learning_rate": 4.082222172003155e-06, + "loss": 0.6341, + "step": 5849 + }, + { + "epoch": 1.7125292740046838, + "grad_norm": 0.9772213697433472, + "learning_rate": 4.081924012053145e-06, + "loss": 0.6296, + "step": 5850 + }, + { + "epoch": 1.7128220140515222, + "grad_norm": 0.977452278137207, + "learning_rate": 4.081625814571683e-06, + "loss": 0.5795, + "step": 5851 + }, + { + "epoch": 1.7131147540983607, + "grad_norm": 0.9930068850517273, + "learning_rate": 4.081327579565845e-06, + "loss": 0.6174, + "step": 5852 + }, + { + "epoch": 1.713407494145199, + "grad_norm": 0.9263677597045898, + "learning_rate": 4.081029307042706e-06, + "loss": 0.6157, + "step": 5853 + }, + { + "epoch": 1.7137002341920375, + "grad_norm": 0.8874608278274536, + "learning_rate": 4.080730997009343e-06, + "loss": 0.5911, + "step": 5854 + }, + { + "epoch": 1.713992974238876, + "grad_norm": 0.93777996301651, + "learning_rate": 4.080432649472834e-06, + "loss": 0.6037, + "step": 5855 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 1.0078238248825073, + "learning_rate": 4.080134264440256e-06, + "loss": 0.5895, + "step": 5856 + }, + { + "epoch": 1.7145784543325528, + "grad_norm": 0.9566800594329834, + "learning_rate": 4.079835841918689e-06, + "loss": 0.6121, + "step": 5857 + }, + { + "epoch": 1.714871194379391, + "grad_norm": 1.004228949546814, + "learning_rate": 4.079537381915214e-06, + "loss": 0.5458, + "step": 5858 + }, + { + "epoch": 1.7151639344262295, + "grad_norm": 0.9388912320137024, + "learning_rate": 4.0792388844369105e-06, + "loss": 0.5789, + "step": 5859 + }, + { + "epoch": 1.715456674473068, + "grad_norm": 0.9574386477470398, + "learning_rate": 4.0789403494908604e-06, + "loss": 0.6135, + "step": 5860 + }, + { + "epoch": 1.7157494145199064, + "grad_norm": 0.9964042901992798, + "learning_rate": 4.078641777084149e-06, + "loss": 0.6607, + "step": 5861 + }, + { + "epoch": 1.7160421545667446, + "grad_norm": 0.9757380485534668, + "learning_rate": 4.078343167223856e-06, + "loss": 0.5964, + "step": 5862 + }, + { + "epoch": 1.716334894613583, + "grad_norm": 0.9442198276519775, + "learning_rate": 4.07804451991707e-06, + "loss": 0.6026, + "step": 5863 + }, + { + "epoch": 1.7166276346604215, + "grad_norm": 0.9345252513885498, + "learning_rate": 4.077745835170873e-06, + "loss": 0.5267, + "step": 5864 + }, + { + "epoch": 1.7169203747072599, + "grad_norm": 0.9611176252365112, + "learning_rate": 4.077447112992354e-06, + "loss": 0.5659, + "step": 5865 + }, + { + "epoch": 1.7172131147540983, + "grad_norm": 1.0494709014892578, + "learning_rate": 4.077148353388599e-06, + "loss": 0.5971, + "step": 5866 + }, + { + "epoch": 1.7175058548009368, + "grad_norm": 0.9333611130714417, + "learning_rate": 4.076849556366697e-06, + "loss": 0.6067, + "step": 5867 + }, + { + "epoch": 1.7177985948477752, + "grad_norm": 1.009836196899414, + "learning_rate": 4.076550721933735e-06, + "loss": 0.6743, + "step": 5868 + }, + { + "epoch": 1.7180913348946136, + "grad_norm": 0.9538887143135071, + "learning_rate": 4.076251850096804e-06, + "loss": 0.5931, + "step": 5869 + }, + { + "epoch": 1.718384074941452, + "grad_norm": 0.9580501317977905, + "learning_rate": 4.075952940862996e-06, + "loss": 0.5627, + "step": 5870 + }, + { + "epoch": 1.7186768149882905, + "grad_norm": 0.9258502125740051, + "learning_rate": 4.0756539942394016e-06, + "loss": 0.5263, + "step": 5871 + }, + { + "epoch": 1.718969555035129, + "grad_norm": 0.9681611061096191, + "learning_rate": 4.075355010233114e-06, + "loss": 0.6495, + "step": 5872 + }, + { + "epoch": 1.7192622950819674, + "grad_norm": 0.9869275093078613, + "learning_rate": 4.075055988851225e-06, + "loss": 0.6529, + "step": 5873 + }, + { + "epoch": 1.7195550351288056, + "grad_norm": 0.9467607140541077, + "learning_rate": 4.0747569301008306e-06, + "loss": 0.5962, + "step": 5874 + }, + { + "epoch": 1.719847775175644, + "grad_norm": 0.9209995269775391, + "learning_rate": 4.074457833989026e-06, + "loss": 0.6081, + "step": 5875 + }, + { + "epoch": 1.7201405152224825, + "grad_norm": 0.9664035439491272, + "learning_rate": 4.074158700522905e-06, + "loss": 0.6159, + "step": 5876 + }, + { + "epoch": 1.720433255269321, + "grad_norm": 0.9614987373352051, + "learning_rate": 4.073859529709568e-06, + "loss": 0.584, + "step": 5877 + }, + { + "epoch": 1.720725995316159, + "grad_norm": 0.9940431714057922, + "learning_rate": 4.073560321556112e-06, + "loss": 0.624, + "step": 5878 + }, + { + "epoch": 1.7210187353629975, + "grad_norm": 0.9399563670158386, + "learning_rate": 4.073261076069633e-06, + "loss": 0.6217, + "step": 5879 + }, + { + "epoch": 1.721311475409836, + "grad_norm": 0.9224985241889954, + "learning_rate": 4.072961793257234e-06, + "loss": 0.6185, + "step": 5880 + }, + { + "epoch": 1.7216042154566744, + "grad_norm": 0.9497981667518616, + "learning_rate": 4.072662473126014e-06, + "loss": 0.6066, + "step": 5881 + }, + { + "epoch": 1.7218969555035128, + "grad_norm": 0.969290018081665, + "learning_rate": 4.072363115683075e-06, + "loss": 0.6319, + "step": 5882 + }, + { + "epoch": 1.7221896955503513, + "grad_norm": 0.9386500120162964, + "learning_rate": 4.072063720935518e-06, + "loss": 0.5825, + "step": 5883 + }, + { + "epoch": 1.7224824355971897, + "grad_norm": 0.9981831312179565, + "learning_rate": 4.071764288890448e-06, + "loss": 0.6245, + "step": 5884 + }, + { + "epoch": 1.7227751756440282, + "grad_norm": 0.9478777647018433, + "learning_rate": 4.071464819554968e-06, + "loss": 0.6083, + "step": 5885 + }, + { + "epoch": 1.7230679156908666, + "grad_norm": 0.9969513416290283, + "learning_rate": 4.071165312936184e-06, + "loss": 0.6131, + "step": 5886 + }, + { + "epoch": 1.723360655737705, + "grad_norm": 0.9296313524246216, + "learning_rate": 4.0708657690412e-06, + "loss": 0.5679, + "step": 5887 + }, + { + "epoch": 1.7236533957845435, + "grad_norm": 0.9694191217422485, + "learning_rate": 4.070566187877124e-06, + "loss": 0.6091, + "step": 5888 + }, + { + "epoch": 1.723946135831382, + "grad_norm": 0.9397997260093689, + "learning_rate": 4.070266569451064e-06, + "loss": 0.6085, + "step": 5889 + }, + { + "epoch": 1.7242388758782201, + "grad_norm": 1.0086590051651, + "learning_rate": 4.069966913770127e-06, + "loss": 0.6324, + "step": 5890 + }, + { + "epoch": 1.7245316159250585, + "grad_norm": 0.9728129506111145, + "learning_rate": 4.069667220841424e-06, + "loss": 0.6321, + "step": 5891 + }, + { + "epoch": 1.724824355971897, + "grad_norm": 0.9010568261146545, + "learning_rate": 4.069367490672065e-06, + "loss": 0.5691, + "step": 5892 + }, + { + "epoch": 1.7251170960187352, + "grad_norm": 0.9383226633071899, + "learning_rate": 4.0690677232691604e-06, + "loss": 0.5733, + "step": 5893 + }, + { + "epoch": 1.7254098360655736, + "grad_norm": 0.9365511536598206, + "learning_rate": 4.068767918639823e-06, + "loss": 0.5867, + "step": 5894 + }, + { + "epoch": 1.725702576112412, + "grad_norm": 0.9265756607055664, + "learning_rate": 4.068468076791164e-06, + "loss": 0.6218, + "step": 5895 + }, + { + "epoch": 1.7259953161592505, + "grad_norm": 1.0029934644699097, + "learning_rate": 4.0681681977303e-06, + "loss": 0.5804, + "step": 5896 + }, + { + "epoch": 1.726288056206089, + "grad_norm": 1.018873691558838, + "learning_rate": 4.0678682814643435e-06, + "loss": 0.5697, + "step": 5897 + }, + { + "epoch": 1.7265807962529274, + "grad_norm": 0.9754706025123596, + "learning_rate": 4.067568328000411e-06, + "loss": 0.6193, + "step": 5898 + }, + { + "epoch": 1.7268735362997658, + "grad_norm": 0.8951734900474548, + "learning_rate": 4.067268337345619e-06, + "loss": 0.5526, + "step": 5899 + }, + { + "epoch": 1.7271662763466042, + "grad_norm": 0.9363945126533508, + "learning_rate": 4.066968309507084e-06, + "loss": 0.6197, + "step": 5900 + }, + { + "epoch": 1.7274590163934427, + "grad_norm": 0.9500890970230103, + "learning_rate": 4.066668244491926e-06, + "loss": 0.5998, + "step": 5901 + }, + { + "epoch": 1.7277517564402811, + "grad_norm": 0.959025502204895, + "learning_rate": 4.0663681423072625e-06, + "loss": 0.6228, + "step": 5902 + }, + { + "epoch": 1.7280444964871196, + "grad_norm": 0.9856407046318054, + "learning_rate": 4.066068002960213e-06, + "loss": 0.6181, + "step": 5903 + }, + { + "epoch": 1.728337236533958, + "grad_norm": 0.9672083258628845, + "learning_rate": 4.065767826457899e-06, + "loss": 0.6245, + "step": 5904 + }, + { + "epoch": 1.7286299765807962, + "grad_norm": 0.9539538025856018, + "learning_rate": 4.065467612807444e-06, + "loss": 0.612, + "step": 5905 + }, + { + "epoch": 1.7289227166276346, + "grad_norm": 0.99601149559021, + "learning_rate": 4.065167362015968e-06, + "loss": 0.5839, + "step": 5906 + }, + { + "epoch": 1.729215456674473, + "grad_norm": 0.9416484832763672, + "learning_rate": 4.064867074090596e-06, + "loss": 0.5966, + "step": 5907 + }, + { + "epoch": 1.7295081967213115, + "grad_norm": 0.9324053525924683, + "learning_rate": 4.064566749038452e-06, + "loss": 0.5824, + "step": 5908 + }, + { + "epoch": 1.7298009367681497, + "grad_norm": 0.894216001033783, + "learning_rate": 4.064266386866661e-06, + "loss": 0.6019, + "step": 5909 + }, + { + "epoch": 1.7300936768149882, + "grad_norm": 0.9344473481178284, + "learning_rate": 4.06396598758235e-06, + "loss": 0.5984, + "step": 5910 + }, + { + "epoch": 1.7303864168618266, + "grad_norm": 0.9731354713439941, + "learning_rate": 4.063665551192646e-06, + "loss": 0.611, + "step": 5911 + }, + { + "epoch": 1.730679156908665, + "grad_norm": 0.9518488645553589, + "learning_rate": 4.063365077704675e-06, + "loss": 0.599, + "step": 5912 + }, + { + "epoch": 1.7309718969555035, + "grad_norm": 0.9656001925468445, + "learning_rate": 4.063064567125568e-06, + "loss": 0.6104, + "step": 5913 + }, + { + "epoch": 1.731264637002342, + "grad_norm": 0.9809359312057495, + "learning_rate": 4.062764019462453e-06, + "loss": 0.668, + "step": 5914 + }, + { + "epoch": 1.7315573770491803, + "grad_norm": 0.9242512583732605, + "learning_rate": 4.062463434722462e-06, + "loss": 0.5601, + "step": 5915 + }, + { + "epoch": 1.7318501170960188, + "grad_norm": 0.9862105846405029, + "learning_rate": 4.062162812912726e-06, + "loss": 0.6137, + "step": 5916 + }, + { + "epoch": 1.7321428571428572, + "grad_norm": 0.9275670051574707, + "learning_rate": 4.061862154040378e-06, + "loss": 0.5559, + "step": 5917 + }, + { + "epoch": 1.7324355971896956, + "grad_norm": 0.9558575749397278, + "learning_rate": 4.061561458112549e-06, + "loss": 0.5607, + "step": 5918 + }, + { + "epoch": 1.732728337236534, + "grad_norm": 0.9849057793617249, + "learning_rate": 4.061260725136374e-06, + "loss": 0.6129, + "step": 5919 + }, + { + "epoch": 1.7330210772833725, + "grad_norm": 1.0211830139160156, + "learning_rate": 4.060959955118989e-06, + "loss": 0.6322, + "step": 5920 + }, + { + "epoch": 1.7333138173302107, + "grad_norm": 0.9155946969985962, + "learning_rate": 4.06065914806753e-06, + "loss": 0.5699, + "step": 5921 + }, + { + "epoch": 1.7336065573770492, + "grad_norm": 0.9918410778045654, + "learning_rate": 4.060358303989132e-06, + "loss": 0.606, + "step": 5922 + }, + { + "epoch": 1.7338992974238876, + "grad_norm": 0.9158408641815186, + "learning_rate": 4.060057422890933e-06, + "loss": 0.5797, + "step": 5923 + }, + { + "epoch": 1.734192037470726, + "grad_norm": 1.077942132949829, + "learning_rate": 4.059756504780073e-06, + "loss": 0.6063, + "step": 5924 + }, + { + "epoch": 1.7344847775175642, + "grad_norm": 0.9765343070030212, + "learning_rate": 4.05945554966369e-06, + "loss": 0.6215, + "step": 5925 + }, + { + "epoch": 1.7347775175644027, + "grad_norm": 0.9313166737556458, + "learning_rate": 4.059154557548924e-06, + "loss": 0.5882, + "step": 5926 + }, + { + "epoch": 1.7350702576112411, + "grad_norm": 0.9664828777313232, + "learning_rate": 4.058853528442917e-06, + "loss": 0.6229, + "step": 5927 + }, + { + "epoch": 1.7353629976580796, + "grad_norm": 0.8788259625434875, + "learning_rate": 4.0585524623528105e-06, + "loss": 0.568, + "step": 5928 + }, + { + "epoch": 1.735655737704918, + "grad_norm": 0.9907458424568176, + "learning_rate": 4.058251359285747e-06, + "loss": 0.598, + "step": 5929 + }, + { + "epoch": 1.7359484777517564, + "grad_norm": 0.9932053089141846, + "learning_rate": 4.057950219248871e-06, + "loss": 0.6394, + "step": 5930 + }, + { + "epoch": 1.7362412177985949, + "grad_norm": 0.9065213203430176, + "learning_rate": 4.057649042249326e-06, + "loss": 0.5793, + "step": 5931 + }, + { + "epoch": 1.7365339578454333, + "grad_norm": 0.9257835745811462, + "learning_rate": 4.057347828294259e-06, + "loss": 0.6027, + "step": 5932 + }, + { + "epoch": 1.7368266978922717, + "grad_norm": 0.9046727418899536, + "learning_rate": 4.057046577390816e-06, + "loss": 0.6002, + "step": 5933 + }, + { + "epoch": 1.7371194379391102, + "grad_norm": 0.9561399221420288, + "learning_rate": 4.056745289546143e-06, + "loss": 0.5938, + "step": 5934 + }, + { + "epoch": 1.7374121779859486, + "grad_norm": 0.93314129114151, + "learning_rate": 4.0564439647673895e-06, + "loss": 0.6182, + "step": 5935 + }, + { + "epoch": 1.737704918032787, + "grad_norm": 0.986285388469696, + "learning_rate": 4.0561426030617035e-06, + "loss": 0.6062, + "step": 5936 + }, + { + "epoch": 1.7379976580796253, + "grad_norm": 0.9956618547439575, + "learning_rate": 4.055841204436235e-06, + "loss": 0.5597, + "step": 5937 + }, + { + "epoch": 1.7382903981264637, + "grad_norm": 1.027614951133728, + "learning_rate": 4.055539768898136e-06, + "loss": 0.5652, + "step": 5938 + }, + { + "epoch": 1.7385831381733021, + "grad_norm": 0.9554714560508728, + "learning_rate": 4.055238296454556e-06, + "loss": 0.6119, + "step": 5939 + }, + { + "epoch": 1.7388758782201406, + "grad_norm": 0.9727576971054077, + "learning_rate": 4.054936787112649e-06, + "loss": 0.6143, + "step": 5940 + }, + { + "epoch": 1.7391686182669788, + "grad_norm": 0.9541711211204529, + "learning_rate": 4.054635240879568e-06, + "loss": 0.6004, + "step": 5941 + }, + { + "epoch": 1.7394613583138172, + "grad_norm": 1.2627426385879517, + "learning_rate": 4.054333657762467e-06, + "loss": 0.6039, + "step": 5942 + }, + { + "epoch": 1.7397540983606556, + "grad_norm": 1.0305832624435425, + "learning_rate": 4.054032037768502e-06, + "loss": 0.5926, + "step": 5943 + }, + { + "epoch": 1.740046838407494, + "grad_norm": 0.991578221321106, + "learning_rate": 4.053730380904828e-06, + "loss": 0.604, + "step": 5944 + }, + { + "epoch": 1.7403395784543325, + "grad_norm": 0.9846920371055603, + "learning_rate": 4.053428687178602e-06, + "loss": 0.587, + "step": 5945 + }, + { + "epoch": 1.740632318501171, + "grad_norm": 0.9236680269241333, + "learning_rate": 4.053126956596982e-06, + "loss": 0.5852, + "step": 5946 + }, + { + "epoch": 1.7409250585480094, + "grad_norm": 0.9984573125839233, + "learning_rate": 4.052825189167127e-06, + "loss": 0.6158, + "step": 5947 + }, + { + "epoch": 1.7412177985948478, + "grad_norm": 0.9456061720848083, + "learning_rate": 4.0525233848961954e-06, + "loss": 0.5881, + "step": 5948 + }, + { + "epoch": 1.7415105386416863, + "grad_norm": 1.0351451635360718, + "learning_rate": 4.052221543791349e-06, + "loss": 0.6251, + "step": 5949 + }, + { + "epoch": 1.7418032786885247, + "grad_norm": 0.9582288861274719, + "learning_rate": 4.0519196658597475e-06, + "loss": 0.5908, + "step": 5950 + }, + { + "epoch": 1.7420960187353631, + "grad_norm": 0.9817149639129639, + "learning_rate": 4.051617751108554e-06, + "loss": 0.56, + "step": 5951 + }, + { + "epoch": 1.7423887587822016, + "grad_norm": 0.9452801942825317, + "learning_rate": 4.051315799544932e-06, + "loss": 0.5692, + "step": 5952 + }, + { + "epoch": 1.7426814988290398, + "grad_norm": 0.9462278485298157, + "learning_rate": 4.051013811176044e-06, + "loss": 0.6047, + "step": 5953 + }, + { + "epoch": 1.7429742388758782, + "grad_norm": 0.9749769568443298, + "learning_rate": 4.050711786009055e-06, + "loss": 0.6047, + "step": 5954 + }, + { + "epoch": 1.7432669789227166, + "grad_norm": 0.9775906801223755, + "learning_rate": 4.0504097240511315e-06, + "loss": 0.6036, + "step": 5955 + }, + { + "epoch": 1.743559718969555, + "grad_norm": 0.9560443758964539, + "learning_rate": 4.050107625309439e-06, + "loss": 0.6224, + "step": 5956 + }, + { + "epoch": 1.7438524590163933, + "grad_norm": 0.9327688813209534, + "learning_rate": 4.049805489791145e-06, + "loss": 0.6412, + "step": 5957 + }, + { + "epoch": 1.7441451990632317, + "grad_norm": 0.9650632739067078, + "learning_rate": 4.04950331750342e-06, + "loss": 0.6183, + "step": 5958 + }, + { + "epoch": 1.7444379391100702, + "grad_norm": 0.9299687147140503, + "learning_rate": 4.049201108453429e-06, + "loss": 0.6106, + "step": 5959 + }, + { + "epoch": 1.7447306791569086, + "grad_norm": 0.9817838072776794, + "learning_rate": 4.048898862648344e-06, + "loss": 0.6087, + "step": 5960 + }, + { + "epoch": 1.745023419203747, + "grad_norm": 0.9122613072395325, + "learning_rate": 4.048596580095337e-06, + "loss": 0.5519, + "step": 5961 + }, + { + "epoch": 1.7453161592505855, + "grad_norm": 0.988075852394104, + "learning_rate": 4.048294260801578e-06, + "loss": 0.6399, + "step": 5962 + }, + { + "epoch": 1.745608899297424, + "grad_norm": 0.9387520551681519, + "learning_rate": 4.047991904774241e-06, + "loss": 0.6049, + "step": 5963 + }, + { + "epoch": 1.7459016393442623, + "grad_norm": 0.9285863041877747, + "learning_rate": 4.047689512020498e-06, + "loss": 0.5545, + "step": 5964 + }, + { + "epoch": 1.7461943793911008, + "grad_norm": 1.0117660760879517, + "learning_rate": 4.047387082547524e-06, + "loss": 0.6041, + "step": 5965 + }, + { + "epoch": 1.7464871194379392, + "grad_norm": 1.008302092552185, + "learning_rate": 4.0470846163624935e-06, + "loss": 0.6121, + "step": 5966 + }, + { + "epoch": 1.7467798594847777, + "grad_norm": 0.959913432598114, + "learning_rate": 4.046782113472584e-06, + "loss": 0.6223, + "step": 5967 + }, + { + "epoch": 1.747072599531616, + "grad_norm": 0.9112428426742554, + "learning_rate": 4.046479573884971e-06, + "loss": 0.6009, + "step": 5968 + }, + { + "epoch": 1.7473653395784543, + "grad_norm": 0.9787073731422424, + "learning_rate": 4.0461769976068345e-06, + "loss": 0.5622, + "step": 5969 + }, + { + "epoch": 1.7476580796252927, + "grad_norm": 1.006887435913086, + "learning_rate": 4.045874384645351e-06, + "loss": 0.6082, + "step": 5970 + }, + { + "epoch": 1.7479508196721312, + "grad_norm": 0.9383203983306885, + "learning_rate": 4.045571735007701e-06, + "loss": 0.625, + "step": 5971 + }, + { + "epoch": 1.7482435597189696, + "grad_norm": 0.9390286803245544, + "learning_rate": 4.045269048701063e-06, + "loss": 0.5704, + "step": 5972 + }, + { + "epoch": 1.7485362997658078, + "grad_norm": 0.9529642462730408, + "learning_rate": 4.0449663257326225e-06, + "loss": 0.5797, + "step": 5973 + }, + { + "epoch": 1.7488290398126463, + "grad_norm": 0.9304689764976501, + "learning_rate": 4.044663566109557e-06, + "loss": 0.5778, + "step": 5974 + }, + { + "epoch": 1.7491217798594847, + "grad_norm": 0.9729417562484741, + "learning_rate": 4.044360769839052e-06, + "loss": 0.6124, + "step": 5975 + }, + { + "epoch": 1.7494145199063231, + "grad_norm": 1.00701105594635, + "learning_rate": 4.044057936928292e-06, + "loss": 0.5997, + "step": 5976 + }, + { + "epoch": 1.7497072599531616, + "grad_norm": 0.9828822612762451, + "learning_rate": 4.0437550673844605e-06, + "loss": 0.6318, + "step": 5977 + }, + { + "epoch": 1.75, + "grad_norm": 0.9918591380119324, + "learning_rate": 4.043452161214742e-06, + "loss": 0.619, + "step": 5978 + }, + { + "epoch": 1.7502927400468384, + "grad_norm": 0.984855592250824, + "learning_rate": 4.043149218426327e-06, + "loss": 0.61, + "step": 5979 + }, + { + "epoch": 1.7505854800936769, + "grad_norm": 0.9783483743667603, + "learning_rate": 4.042846239026398e-06, + "loss": 0.5624, + "step": 5980 + }, + { + "epoch": 1.7508782201405153, + "grad_norm": 0.967561662197113, + "learning_rate": 4.042543223022146e-06, + "loss": 0.6426, + "step": 5981 + }, + { + "epoch": 1.7511709601873537, + "grad_norm": 0.9446234107017517, + "learning_rate": 4.042240170420762e-06, + "loss": 0.6147, + "step": 5982 + }, + { + "epoch": 1.7514637002341922, + "grad_norm": 0.9216092228889465, + "learning_rate": 4.041937081229431e-06, + "loss": 0.5976, + "step": 5983 + }, + { + "epoch": 1.7517564402810304, + "grad_norm": 0.9835466146469116, + "learning_rate": 4.041633955455347e-06, + "loss": 0.6153, + "step": 5984 + }, + { + "epoch": 1.7520491803278688, + "grad_norm": 0.9777002930641174, + "learning_rate": 4.041330793105701e-06, + "loss": 0.5997, + "step": 5985 + }, + { + "epoch": 1.7523419203747073, + "grad_norm": 0.9583677053451538, + "learning_rate": 4.041027594187686e-06, + "loss": 0.5907, + "step": 5986 + }, + { + "epoch": 1.7526346604215457, + "grad_norm": 0.910620391368866, + "learning_rate": 4.040724358708496e-06, + "loss": 0.5938, + "step": 5987 + }, + { + "epoch": 1.752927400468384, + "grad_norm": 1.0120534896850586, + "learning_rate": 4.040421086675325e-06, + "loss": 0.6188, + "step": 5988 + }, + { + "epoch": 1.7532201405152223, + "grad_norm": 0.9125576615333557, + "learning_rate": 4.040117778095367e-06, + "loss": 0.6181, + "step": 5989 + }, + { + "epoch": 1.7535128805620608, + "grad_norm": 0.9665127992630005, + "learning_rate": 4.039814432975818e-06, + "loss": 0.5872, + "step": 5990 + }, + { + "epoch": 1.7538056206088992, + "grad_norm": 0.9511306285858154, + "learning_rate": 4.039511051323877e-06, + "loss": 0.6053, + "step": 5991 + }, + { + "epoch": 1.7540983606557377, + "grad_norm": 0.9307471513748169, + "learning_rate": 4.03920763314674e-06, + "loss": 0.5726, + "step": 5992 + }, + { + "epoch": 1.754391100702576, + "grad_norm": 0.9147864580154419, + "learning_rate": 4.038904178451607e-06, + "loss": 0.5863, + "step": 5993 + }, + { + "epoch": 1.7546838407494145, + "grad_norm": 0.906467854976654, + "learning_rate": 4.038600687245675e-06, + "loss": 0.5353, + "step": 5994 + }, + { + "epoch": 1.754976580796253, + "grad_norm": 0.9556354880332947, + "learning_rate": 4.038297159536148e-06, + "loss": 0.6265, + "step": 5995 + }, + { + "epoch": 1.7552693208430914, + "grad_norm": 0.9545077085494995, + "learning_rate": 4.037993595330224e-06, + "loss": 0.5655, + "step": 5996 + }, + { + "epoch": 1.7555620608899298, + "grad_norm": 0.9935433864593506, + "learning_rate": 4.037689994635106e-06, + "loss": 0.5917, + "step": 5997 + }, + { + "epoch": 1.7558548009367683, + "grad_norm": 0.9426618814468384, + "learning_rate": 4.037386357458e-06, + "loss": 0.5818, + "step": 5998 + }, + { + "epoch": 1.7561475409836067, + "grad_norm": 0.9689637422561646, + "learning_rate": 4.037082683806105e-06, + "loss": 0.5898, + "step": 5999 + }, + { + "epoch": 1.756440281030445, + "grad_norm": 1.074061632156372, + "learning_rate": 4.036778973686628e-06, + "loss": 0.6254, + "step": 6000 + }, + { + "epoch": 1.7567330210772834, + "grad_norm": 0.9852315187454224, + "learning_rate": 4.0364752271067755e-06, + "loss": 0.5955, + "step": 6001 + }, + { + "epoch": 1.7570257611241218, + "grad_norm": 1.0050528049468994, + "learning_rate": 4.0361714440737534e-06, + "loss": 0.637, + "step": 6002 + }, + { + "epoch": 1.7573185011709602, + "grad_norm": 0.9530670046806335, + "learning_rate": 4.035867624594768e-06, + "loss": 0.6245, + "step": 6003 + }, + { + "epoch": 1.7576112412177984, + "grad_norm": 0.9991280436515808, + "learning_rate": 4.035563768677027e-06, + "loss": 0.6006, + "step": 6004 + }, + { + "epoch": 1.7579039812646369, + "grad_norm": 0.8976741433143616, + "learning_rate": 4.035259876327743e-06, + "loss": 0.5705, + "step": 6005 + }, + { + "epoch": 1.7581967213114753, + "grad_norm": 1.009335994720459, + "learning_rate": 4.034955947554122e-06, + "loss": 0.6057, + "step": 6006 + }, + { + "epoch": 1.7584894613583137, + "grad_norm": 0.9940446019172668, + "learning_rate": 4.0346519823633765e-06, + "loss": 0.5974, + "step": 6007 + }, + { + "epoch": 1.7587822014051522, + "grad_norm": 1.127808690071106, + "learning_rate": 4.034347980762718e-06, + "loss": 0.5978, + "step": 6008 + }, + { + "epoch": 1.7590749414519906, + "grad_norm": 1.0150749683380127, + "learning_rate": 4.0340439427593585e-06, + "loss": 0.6121, + "step": 6009 + }, + { + "epoch": 1.759367681498829, + "grad_norm": 0.9283134341239929, + "learning_rate": 4.033739868360512e-06, + "loss": 0.579, + "step": 6010 + }, + { + "epoch": 1.7596604215456675, + "grad_norm": 0.9335961937904358, + "learning_rate": 4.033435757573392e-06, + "loss": 0.6101, + "step": 6011 + }, + { + "epoch": 1.759953161592506, + "grad_norm": 0.974371612071991, + "learning_rate": 4.033131610405215e-06, + "loss": 0.6663, + "step": 6012 + }, + { + "epoch": 1.7602459016393444, + "grad_norm": 0.9550794959068298, + "learning_rate": 4.032827426863196e-06, + "loss": 0.6018, + "step": 6013 + }, + { + "epoch": 1.7605386416861828, + "grad_norm": 0.9215535521507263, + "learning_rate": 4.032523206954551e-06, + "loss": 0.584, + "step": 6014 + }, + { + "epoch": 1.7608313817330212, + "grad_norm": 0.9561609625816345, + "learning_rate": 4.032218950686499e-06, + "loss": 0.6033, + "step": 6015 + }, + { + "epoch": 1.7611241217798594, + "grad_norm": 0.938247561454773, + "learning_rate": 4.031914658066257e-06, + "loss": 0.5885, + "step": 6016 + }, + { + "epoch": 1.7614168618266979, + "grad_norm": 0.9397051930427551, + "learning_rate": 4.031610329101047e-06, + "loss": 0.5942, + "step": 6017 + }, + { + "epoch": 1.7617096018735363, + "grad_norm": 0.9044942855834961, + "learning_rate": 4.031305963798087e-06, + "loss": 0.6113, + "step": 6018 + }, + { + "epoch": 1.7620023419203747, + "grad_norm": 0.9262259006500244, + "learning_rate": 4.031001562164598e-06, + "loss": 0.6045, + "step": 6019 + }, + { + "epoch": 1.762295081967213, + "grad_norm": 0.9542113542556763, + "learning_rate": 4.030697124207804e-06, + "loss": 0.6239, + "step": 6020 + }, + { + "epoch": 1.7625878220140514, + "grad_norm": 0.8998700976371765, + "learning_rate": 4.030392649934926e-06, + "loss": 0.6096, + "step": 6021 + }, + { + "epoch": 1.7628805620608898, + "grad_norm": 0.9829412698745728, + "learning_rate": 4.030088139353189e-06, + "loss": 0.5707, + "step": 6022 + }, + { + "epoch": 1.7631733021077283, + "grad_norm": 0.9599190354347229, + "learning_rate": 4.029783592469816e-06, + "loss": 0.5866, + "step": 6023 + }, + { + "epoch": 1.7634660421545667, + "grad_norm": 0.9898992776870728, + "learning_rate": 4.029479009292034e-06, + "loss": 0.6243, + "step": 6024 + }, + { + "epoch": 1.7637587822014051, + "grad_norm": 0.9821174740791321, + "learning_rate": 4.029174389827069e-06, + "loss": 0.5889, + "step": 6025 + }, + { + "epoch": 1.7640515222482436, + "grad_norm": 0.9736547470092773, + "learning_rate": 4.028869734082146e-06, + "loss": 0.6182, + "step": 6026 + }, + { + "epoch": 1.764344262295082, + "grad_norm": 0.9569923281669617, + "learning_rate": 4.028565042064497e-06, + "loss": 0.5955, + "step": 6027 + }, + { + "epoch": 1.7646370023419204, + "grad_norm": 0.9715225100517273, + "learning_rate": 4.0282603137813485e-06, + "loss": 0.6493, + "step": 6028 + }, + { + "epoch": 1.7649297423887589, + "grad_norm": 0.9601028561592102, + "learning_rate": 4.0279555492399295e-06, + "loss": 0.5969, + "step": 6029 + }, + { + "epoch": 1.7652224824355973, + "grad_norm": 1.0217933654785156, + "learning_rate": 4.0276507484474715e-06, + "loss": 0.5746, + "step": 6030 + }, + { + "epoch": 1.7655152224824358, + "grad_norm": 0.988827109336853, + "learning_rate": 4.0273459114112054e-06, + "loss": 0.5929, + "step": 6031 + }, + { + "epoch": 1.765807962529274, + "grad_norm": 0.9946609735488892, + "learning_rate": 4.027041038138366e-06, + "loss": 0.634, + "step": 6032 + }, + { + "epoch": 1.7661007025761124, + "grad_norm": 0.9598988890647888, + "learning_rate": 4.026736128636183e-06, + "loss": 0.6241, + "step": 6033 + }, + { + "epoch": 1.7663934426229508, + "grad_norm": 0.9817661046981812, + "learning_rate": 4.026431182911893e-06, + "loss": 0.621, + "step": 6034 + }, + { + "epoch": 1.7666861826697893, + "grad_norm": 0.941991925239563, + "learning_rate": 4.02612620097273e-06, + "loss": 0.6054, + "step": 6035 + }, + { + "epoch": 1.7669789227166275, + "grad_norm": 0.9631813168525696, + "learning_rate": 4.025821182825929e-06, + "loss": 0.6293, + "step": 6036 + }, + { + "epoch": 1.767271662763466, + "grad_norm": 0.9581178426742554, + "learning_rate": 4.025516128478727e-06, + "loss": 0.5827, + "step": 6037 + }, + { + "epoch": 1.7675644028103044, + "grad_norm": 0.9525718688964844, + "learning_rate": 4.025211037938363e-06, + "loss": 0.611, + "step": 6038 + }, + { + "epoch": 1.7678571428571428, + "grad_norm": 0.9380874037742615, + "learning_rate": 4.0249059112120735e-06, + "loss": 0.6119, + "step": 6039 + }, + { + "epoch": 1.7681498829039812, + "grad_norm": 0.933462917804718, + "learning_rate": 4.024600748307098e-06, + "loss": 0.6038, + "step": 6040 + }, + { + "epoch": 1.7684426229508197, + "grad_norm": 0.9468561410903931, + "learning_rate": 4.024295549230677e-06, + "loss": 0.6097, + "step": 6041 + }, + { + "epoch": 1.768735362997658, + "grad_norm": 1.0478402376174927, + "learning_rate": 4.023990313990051e-06, + "loss": 0.6019, + "step": 6042 + }, + { + "epoch": 1.7690281030444965, + "grad_norm": 0.931609570980072, + "learning_rate": 4.023685042592463e-06, + "loss": 0.5751, + "step": 6043 + }, + { + "epoch": 1.769320843091335, + "grad_norm": 0.9286707639694214, + "learning_rate": 4.023379735045154e-06, + "loss": 0.5822, + "step": 6044 + }, + { + "epoch": 1.7696135831381734, + "grad_norm": 0.9798358082771301, + "learning_rate": 4.023074391355367e-06, + "loss": 0.6292, + "step": 6045 + }, + { + "epoch": 1.7699063231850118, + "grad_norm": 0.9720656275749207, + "learning_rate": 4.022769011530349e-06, + "loss": 0.6151, + "step": 6046 + }, + { + "epoch": 1.7701990632318503, + "grad_norm": 0.9610762000083923, + "learning_rate": 4.022463595577343e-06, + "loss": 0.5988, + "step": 6047 + }, + { + "epoch": 1.7704918032786885, + "grad_norm": 0.9606327414512634, + "learning_rate": 4.022158143503596e-06, + "loss": 0.6137, + "step": 6048 + }, + { + "epoch": 1.770784543325527, + "grad_norm": 0.943142831325531, + "learning_rate": 4.021852655316354e-06, + "loss": 0.6345, + "step": 6049 + }, + { + "epoch": 1.7710772833723654, + "grad_norm": 0.9428618550300598, + "learning_rate": 4.021547131022866e-06, + "loss": 0.6155, + "step": 6050 + }, + { + "epoch": 1.7713700234192038, + "grad_norm": 0.9484491944313049, + "learning_rate": 4.02124157063038e-06, + "loss": 0.6008, + "step": 6051 + }, + { + "epoch": 1.771662763466042, + "grad_norm": 0.973746120929718, + "learning_rate": 4.020935974146145e-06, + "loss": 0.6293, + "step": 6052 + }, + { + "epoch": 1.7719555035128804, + "grad_norm": 0.9482306241989136, + "learning_rate": 4.020630341577413e-06, + "loss": 0.5568, + "step": 6053 + }, + { + "epoch": 1.7722482435597189, + "grad_norm": 0.9295998811721802, + "learning_rate": 4.020324672931433e-06, + "loss": 0.6079, + "step": 6054 + }, + { + "epoch": 1.7725409836065573, + "grad_norm": 0.9879336357116699, + "learning_rate": 4.020018968215458e-06, + "loss": 0.6097, + "step": 6055 + }, + { + "epoch": 1.7728337236533958, + "grad_norm": 0.9474701285362244, + "learning_rate": 4.019713227436741e-06, + "loss": 0.5902, + "step": 6056 + }, + { + "epoch": 1.7731264637002342, + "grad_norm": 0.9508968591690063, + "learning_rate": 4.019407450602536e-06, + "loss": 0.6239, + "step": 6057 + }, + { + "epoch": 1.7734192037470726, + "grad_norm": 0.943376898765564, + "learning_rate": 4.019101637720097e-06, + "loss": 0.5798, + "step": 6058 + }, + { + "epoch": 1.773711943793911, + "grad_norm": 0.9992349743843079, + "learning_rate": 4.01879578879668e-06, + "loss": 0.6285, + "step": 6059 + }, + { + "epoch": 1.7740046838407495, + "grad_norm": 0.9065939784049988, + "learning_rate": 4.018489903839541e-06, + "loss": 0.6186, + "step": 6060 + }, + { + "epoch": 1.774297423887588, + "grad_norm": 0.9361585378646851, + "learning_rate": 4.018183982855937e-06, + "loss": 0.6054, + "step": 6061 + }, + { + "epoch": 1.7745901639344264, + "grad_norm": 1.039978265762329, + "learning_rate": 4.017878025853127e-06, + "loss": 0.6221, + "step": 6062 + }, + { + "epoch": 1.7748829039812648, + "grad_norm": 0.9497652053833008, + "learning_rate": 4.017572032838369e-06, + "loss": 0.6083, + "step": 6063 + }, + { + "epoch": 1.775175644028103, + "grad_norm": 1.0205634832382202, + "learning_rate": 4.0172660038189226e-06, + "loss": 0.6375, + "step": 6064 + }, + { + "epoch": 1.7754683840749415, + "grad_norm": 0.9134788513183594, + "learning_rate": 4.016959938802049e-06, + "loss": 0.558, + "step": 6065 + }, + { + "epoch": 1.7757611241217799, + "grad_norm": 0.9064382910728455, + "learning_rate": 4.016653837795009e-06, + "loss": 0.6213, + "step": 6066 + }, + { + "epoch": 1.776053864168618, + "grad_norm": 0.9461508393287659, + "learning_rate": 4.016347700805066e-06, + "loss": 0.6362, + "step": 6067 + }, + { + "epoch": 1.7763466042154565, + "grad_norm": 0.9589064121246338, + "learning_rate": 4.016041527839482e-06, + "loss": 0.5955, + "step": 6068 + }, + { + "epoch": 1.776639344262295, + "grad_norm": 0.9671129584312439, + "learning_rate": 4.015735318905522e-06, + "loss": 0.6043, + "step": 6069 + }, + { + "epoch": 1.7769320843091334, + "grad_norm": 0.9652732014656067, + "learning_rate": 4.01542907401045e-06, + "loss": 0.6255, + "step": 6070 + }, + { + "epoch": 1.7772248243559718, + "grad_norm": 0.9116383790969849, + "learning_rate": 4.0151227931615325e-06, + "loss": 0.605, + "step": 6071 + }, + { + "epoch": 1.7775175644028103, + "grad_norm": 0.942882239818573, + "learning_rate": 4.014816476366034e-06, + "loss": 0.6164, + "step": 6072 + }, + { + "epoch": 1.7778103044496487, + "grad_norm": 0.9950060248374939, + "learning_rate": 4.014510123631226e-06, + "loss": 0.6338, + "step": 6073 + }, + { + "epoch": 1.7781030444964872, + "grad_norm": 0.9250180721282959, + "learning_rate": 4.014203734964372e-06, + "loss": 0.5878, + "step": 6074 + }, + { + "epoch": 1.7783957845433256, + "grad_norm": 0.9560929536819458, + "learning_rate": 4.013897310372745e-06, + "loss": 0.6187, + "step": 6075 + }, + { + "epoch": 1.778688524590164, + "grad_norm": 0.945675790309906, + "learning_rate": 4.013590849863614e-06, + "loss": 0.5965, + "step": 6076 + }, + { + "epoch": 1.7789812646370025, + "grad_norm": 0.9816797971725464, + "learning_rate": 4.013284353444248e-06, + "loss": 0.6068, + "step": 6077 + }, + { + "epoch": 1.779274004683841, + "grad_norm": 0.9764792323112488, + "learning_rate": 4.0129778211219205e-06, + "loss": 0.6175, + "step": 6078 + }, + { + "epoch": 1.779566744730679, + "grad_norm": 0.9367029070854187, + "learning_rate": 4.012671252903904e-06, + "loss": 0.654, + "step": 6079 + }, + { + "epoch": 1.7798594847775175, + "grad_norm": 0.9197306036949158, + "learning_rate": 4.01236464879747e-06, + "loss": 0.5985, + "step": 6080 + }, + { + "epoch": 1.780152224824356, + "grad_norm": 0.9317857623100281, + "learning_rate": 4.012058008809895e-06, + "loss": 0.6132, + "step": 6081 + }, + { + "epoch": 1.7804449648711944, + "grad_norm": 0.9304709434509277, + "learning_rate": 4.011751332948453e-06, + "loss": 0.5882, + "step": 6082 + }, + { + "epoch": 1.7807377049180326, + "grad_norm": 0.9685103297233582, + "learning_rate": 4.01144462122042e-06, + "loss": 0.6013, + "step": 6083 + }, + { + "epoch": 1.781030444964871, + "grad_norm": 0.9585797786712646, + "learning_rate": 4.011137873633074e-06, + "loss": 0.5948, + "step": 6084 + }, + { + "epoch": 1.7813231850117095, + "grad_norm": 0.9168676137924194, + "learning_rate": 4.010831090193691e-06, + "loss": 0.5942, + "step": 6085 + }, + { + "epoch": 1.781615925058548, + "grad_norm": 0.929011344909668, + "learning_rate": 4.01052427090955e-06, + "loss": 0.6219, + "step": 6086 + }, + { + "epoch": 1.7819086651053864, + "grad_norm": 0.9464293718338013, + "learning_rate": 4.010217415787931e-06, + "loss": 0.6244, + "step": 6087 + }, + { + "epoch": 1.7822014051522248, + "grad_norm": 0.9306617379188538, + "learning_rate": 4.009910524836114e-06, + "loss": 0.6213, + "step": 6088 + }, + { + "epoch": 1.7824941451990632, + "grad_norm": 1.0021411180496216, + "learning_rate": 4.009603598061378e-06, + "loss": 0.6098, + "step": 6089 + }, + { + "epoch": 1.7827868852459017, + "grad_norm": 0.9344602227210999, + "learning_rate": 4.0092966354710076e-06, + "loss": 0.5845, + "step": 6090 + }, + { + "epoch": 1.7830796252927401, + "grad_norm": 0.9370611310005188, + "learning_rate": 4.008989637072285e-06, + "loss": 0.5862, + "step": 6091 + }, + { + "epoch": 1.7833723653395785, + "grad_norm": 0.9406285881996155, + "learning_rate": 4.008682602872493e-06, + "loss": 0.6001, + "step": 6092 + }, + { + "epoch": 1.783665105386417, + "grad_norm": 0.990281343460083, + "learning_rate": 4.0083755328789155e-06, + "loss": 0.6151, + "step": 6093 + }, + { + "epoch": 1.7839578454332554, + "grad_norm": 0.9583709836006165, + "learning_rate": 4.00806842709884e-06, + "loss": 0.6056, + "step": 6094 + }, + { + "epoch": 1.7842505854800936, + "grad_norm": 0.9763213396072388, + "learning_rate": 4.007761285539551e-06, + "loss": 0.633, + "step": 6095 + }, + { + "epoch": 1.784543325526932, + "grad_norm": 0.9603798389434814, + "learning_rate": 4.0074541082083355e-06, + "loss": 0.6264, + "step": 6096 + }, + { + "epoch": 1.7848360655737705, + "grad_norm": 0.9900209903717041, + "learning_rate": 4.007146895112481e-06, + "loss": 0.6214, + "step": 6097 + }, + { + "epoch": 1.785128805620609, + "grad_norm": 0.9243252873420715, + "learning_rate": 4.006839646259279e-06, + "loss": 0.5901, + "step": 6098 + }, + { + "epoch": 1.7854215456674472, + "grad_norm": 0.9660059213638306, + "learning_rate": 4.006532361656015e-06, + "loss": 0.6283, + "step": 6099 + }, + { + "epoch": 1.7857142857142856, + "grad_norm": 0.9178096055984497, + "learning_rate": 4.006225041309981e-06, + "loss": 0.559, + "step": 6100 + }, + { + "epoch": 1.786007025761124, + "grad_norm": 0.9871845841407776, + "learning_rate": 4.00591768522847e-06, + "loss": 0.6167, + "step": 6101 + }, + { + "epoch": 1.7862997658079625, + "grad_norm": 0.9499039649963379, + "learning_rate": 4.005610293418772e-06, + "loss": 0.6055, + "step": 6102 + }, + { + "epoch": 1.786592505854801, + "grad_norm": 0.9275027513504028, + "learning_rate": 4.00530286588818e-06, + "loss": 0.6112, + "step": 6103 + }, + { + "epoch": 1.7868852459016393, + "grad_norm": 0.9098911881446838, + "learning_rate": 4.004995402643989e-06, + "loss": 0.603, + "step": 6104 + }, + { + "epoch": 1.7871779859484778, + "grad_norm": 1.034232258796692, + "learning_rate": 4.004687903693493e-06, + "loss": 0.6208, + "step": 6105 + }, + { + "epoch": 1.7874707259953162, + "grad_norm": 0.9450879693031311, + "learning_rate": 4.004380369043987e-06, + "loss": 0.6202, + "step": 6106 + }, + { + "epoch": 1.7877634660421546, + "grad_norm": 0.9966951012611389, + "learning_rate": 4.004072798702767e-06, + "loss": 0.5692, + "step": 6107 + }, + { + "epoch": 1.788056206088993, + "grad_norm": 0.905888020992279, + "learning_rate": 4.003765192677133e-06, + "loss": 0.5286, + "step": 6108 + }, + { + "epoch": 1.7883489461358315, + "grad_norm": 0.95941561460495, + "learning_rate": 4.003457550974379e-06, + "loss": 0.5997, + "step": 6109 + }, + { + "epoch": 1.78864168618267, + "grad_norm": 0.9677042961120605, + "learning_rate": 4.0031498736018066e-06, + "loss": 0.5765, + "step": 6110 + }, + { + "epoch": 1.7889344262295082, + "grad_norm": 0.979491114616394, + "learning_rate": 4.002842160566715e-06, + "loss": 0.6299, + "step": 6111 + }, + { + "epoch": 1.7892271662763466, + "grad_norm": 1.122516393661499, + "learning_rate": 4.002534411876403e-06, + "loss": 0.6141, + "step": 6112 + }, + { + "epoch": 1.789519906323185, + "grad_norm": 1.0056008100509644, + "learning_rate": 4.002226627538175e-06, + "loss": 0.6448, + "step": 6113 + }, + { + "epoch": 1.7898126463700235, + "grad_norm": 0.9609982371330261, + "learning_rate": 4.0019188075593305e-06, + "loss": 0.6218, + "step": 6114 + }, + { + "epoch": 1.7901053864168617, + "grad_norm": 0.9018323421478271, + "learning_rate": 4.001610951947174e-06, + "loss": 0.5611, + "step": 6115 + }, + { + "epoch": 1.7903981264637001, + "grad_norm": 0.9747595191001892, + "learning_rate": 4.0013030607090096e-06, + "loss": 0.6223, + "step": 6116 + }, + { + "epoch": 1.7906908665105385, + "grad_norm": 0.9701371788978577, + "learning_rate": 4.000995133852141e-06, + "loss": 0.5818, + "step": 6117 + }, + { + "epoch": 1.790983606557377, + "grad_norm": 0.9526664614677429, + "learning_rate": 4.000687171383874e-06, + "loss": 0.5757, + "step": 6118 + }, + { + "epoch": 1.7912763466042154, + "grad_norm": 0.9466896653175354, + "learning_rate": 4.000379173311517e-06, + "loss": 0.5782, + "step": 6119 + }, + { + "epoch": 1.7915690866510539, + "grad_norm": 0.9689291715621948, + "learning_rate": 4.000071139642375e-06, + "loss": 0.5812, + "step": 6120 + }, + { + "epoch": 1.7918618266978923, + "grad_norm": 0.9823866486549377, + "learning_rate": 3.999763070383758e-06, + "loss": 0.5749, + "step": 6121 + }, + { + "epoch": 1.7921545667447307, + "grad_norm": 0.96578049659729, + "learning_rate": 3.999454965542972e-06, + "loss": 0.6342, + "step": 6122 + }, + { + "epoch": 1.7924473067915692, + "grad_norm": 1.0306435823440552, + "learning_rate": 3.999146825127331e-06, + "loss": 0.6596, + "step": 6123 + }, + { + "epoch": 1.7927400468384076, + "grad_norm": 0.9257488250732422, + "learning_rate": 3.9988386491441435e-06, + "loss": 0.6182, + "step": 6124 + }, + { + "epoch": 1.793032786885246, + "grad_norm": 1.0173485279083252, + "learning_rate": 3.998530437600719e-06, + "loss": 0.5747, + "step": 6125 + }, + { + "epoch": 1.7933255269320845, + "grad_norm": 0.9561808109283447, + "learning_rate": 3.998222190504374e-06, + "loss": 0.6182, + "step": 6126 + }, + { + "epoch": 1.7936182669789227, + "grad_norm": 0.9795467853546143, + "learning_rate": 3.9979139078624185e-06, + "loss": 0.5943, + "step": 6127 + }, + { + "epoch": 1.7939110070257611, + "grad_norm": 0.9888968467712402, + "learning_rate": 3.9976055896821685e-06, + "loss": 0.6082, + "step": 6128 + }, + { + "epoch": 1.7942037470725996, + "grad_norm": 0.9028729796409607, + "learning_rate": 3.997297235970939e-06, + "loss": 0.5554, + "step": 6129 + }, + { + "epoch": 1.794496487119438, + "grad_norm": 0.916117787361145, + "learning_rate": 3.996988846736043e-06, + "loss": 0.5484, + "step": 6130 + }, + { + "epoch": 1.7947892271662762, + "grad_norm": 0.9745562672615051, + "learning_rate": 3.9966804219848e-06, + "loss": 0.6202, + "step": 6131 + }, + { + "epoch": 1.7950819672131146, + "grad_norm": 0.9267522096633911, + "learning_rate": 3.9963719617245255e-06, + "loss": 0.5567, + "step": 6132 + }, + { + "epoch": 1.795374707259953, + "grad_norm": 0.9436658024787903, + "learning_rate": 3.99606346596254e-06, + "loss": 0.6172, + "step": 6133 + }, + { + "epoch": 1.7956674473067915, + "grad_norm": 0.9546429514884949, + "learning_rate": 3.9957549347061605e-06, + "loss": 0.5805, + "step": 6134 + }, + { + "epoch": 1.79596018735363, + "grad_norm": 0.9475112557411194, + "learning_rate": 3.995446367962709e-06, + "loss": 0.596, + "step": 6135 + }, + { + "epoch": 1.7962529274004684, + "grad_norm": 0.9865009784698486, + "learning_rate": 3.9951377657395036e-06, + "loss": 0.5843, + "step": 6136 + }, + { + "epoch": 1.7965456674473068, + "grad_norm": 0.9340455532073975, + "learning_rate": 3.994829128043868e-06, + "loss": 0.5823, + "step": 6137 + }, + { + "epoch": 1.7968384074941453, + "grad_norm": 0.9634753465652466, + "learning_rate": 3.994520454883124e-06, + "loss": 0.6146, + "step": 6138 + }, + { + "epoch": 1.7971311475409837, + "grad_norm": 0.9932289123535156, + "learning_rate": 3.994211746264596e-06, + "loss": 0.5965, + "step": 6139 + }, + { + "epoch": 1.7974238875878221, + "grad_norm": 0.9944779276847839, + "learning_rate": 3.993903002195606e-06, + "loss": 0.5935, + "step": 6140 + }, + { + "epoch": 1.7977166276346606, + "grad_norm": 0.9583457708358765, + "learning_rate": 3.993594222683481e-06, + "loss": 0.6409, + "step": 6141 + }, + { + "epoch": 1.798009367681499, + "grad_norm": 0.9587737917900085, + "learning_rate": 3.993285407735545e-06, + "loss": 0.6514, + "step": 6142 + }, + { + "epoch": 1.7983021077283372, + "grad_norm": 0.9436275362968445, + "learning_rate": 3.992976557359127e-06, + "loss": 0.5711, + "step": 6143 + }, + { + "epoch": 1.7985948477751756, + "grad_norm": 0.946711540222168, + "learning_rate": 3.992667671561553e-06, + "loss": 0.6149, + "step": 6144 + }, + { + "epoch": 1.798887587822014, + "grad_norm": 0.9398790597915649, + "learning_rate": 3.992358750350153e-06, + "loss": 0.5957, + "step": 6145 + }, + { + "epoch": 1.7991803278688525, + "grad_norm": 0.9125428795814514, + "learning_rate": 3.992049793732253e-06, + "loss": 0.6196, + "step": 6146 + }, + { + "epoch": 1.7994730679156907, + "grad_norm": 0.928864598274231, + "learning_rate": 3.991740801715186e-06, + "loss": 0.612, + "step": 6147 + }, + { + "epoch": 1.7997658079625292, + "grad_norm": 0.9438503980636597, + "learning_rate": 3.991431774306281e-06, + "loss": 0.6189, + "step": 6148 + }, + { + "epoch": 1.8000585480093676, + "grad_norm": 0.9675360321998596, + "learning_rate": 3.991122711512871e-06, + "loss": 0.6029, + "step": 6149 + }, + { + "epoch": 1.800351288056206, + "grad_norm": 0.9941287040710449, + "learning_rate": 3.9908136133422895e-06, + "loss": 0.6191, + "step": 6150 + }, + { + "epoch": 1.8006440281030445, + "grad_norm": 0.9351491928100586, + "learning_rate": 3.9905044798018675e-06, + "loss": 0.5983, + "step": 6151 + }, + { + "epoch": 1.800936768149883, + "grad_norm": 1.0246165990829468, + "learning_rate": 3.99019531089894e-06, + "loss": 0.5721, + "step": 6152 + }, + { + "epoch": 1.8012295081967213, + "grad_norm": 0.9302083253860474, + "learning_rate": 3.989886106640843e-06, + "loss": 0.5867, + "step": 6153 + }, + { + "epoch": 1.8015222482435598, + "grad_norm": 0.9520400762557983, + "learning_rate": 3.989576867034911e-06, + "loss": 0.5922, + "step": 6154 + }, + { + "epoch": 1.8018149882903982, + "grad_norm": 0.9451850652694702, + "learning_rate": 3.989267592088483e-06, + "loss": 0.6081, + "step": 6155 + }, + { + "epoch": 1.8021077283372366, + "grad_norm": 0.9220877289772034, + "learning_rate": 3.9889582818088945e-06, + "loss": 0.5762, + "step": 6156 + }, + { + "epoch": 1.802400468384075, + "grad_norm": 0.9483327269554138, + "learning_rate": 3.988648936203485e-06, + "loss": 0.6403, + "step": 6157 + }, + { + "epoch": 1.8026932084309133, + "grad_norm": 0.9370690584182739, + "learning_rate": 3.988339555279594e-06, + "loss": 0.5835, + "step": 6158 + }, + { + "epoch": 1.8029859484777517, + "grad_norm": 0.9036605954170227, + "learning_rate": 3.98803013904456e-06, + "loss": 0.5771, + "step": 6159 + }, + { + "epoch": 1.8032786885245902, + "grad_norm": 0.9691224694252014, + "learning_rate": 3.987720687505725e-06, + "loss": 0.5901, + "step": 6160 + }, + { + "epoch": 1.8035714285714286, + "grad_norm": 0.9609853029251099, + "learning_rate": 3.987411200670431e-06, + "loss": 0.6178, + "step": 6161 + }, + { + "epoch": 1.8038641686182668, + "grad_norm": 0.9228965044021606, + "learning_rate": 3.987101678546022e-06, + "loss": 0.5677, + "step": 6162 + }, + { + "epoch": 1.8041569086651053, + "grad_norm": 0.9754084944725037, + "learning_rate": 3.986792121139838e-06, + "loss": 0.5591, + "step": 6163 + }, + { + "epoch": 1.8044496487119437, + "grad_norm": 0.9949679374694824, + "learning_rate": 3.9864825284592265e-06, + "loss": 0.6155, + "step": 6164 + }, + { + "epoch": 1.8047423887587821, + "grad_norm": 0.9804270267486572, + "learning_rate": 3.986172900511531e-06, + "loss": 0.6247, + "step": 6165 + }, + { + "epoch": 1.8050351288056206, + "grad_norm": 0.9824610352516174, + "learning_rate": 3.985863237304098e-06, + "loss": 0.6034, + "step": 6166 + }, + { + "epoch": 1.805327868852459, + "grad_norm": 0.9634712338447571, + "learning_rate": 3.985553538844274e-06, + "loss": 0.5826, + "step": 6167 + }, + { + "epoch": 1.8056206088992974, + "grad_norm": 1.021355390548706, + "learning_rate": 3.985243805139407e-06, + "loss": 0.6099, + "step": 6168 + }, + { + "epoch": 1.8059133489461359, + "grad_norm": 0.9866906404495239, + "learning_rate": 3.984934036196846e-06, + "loss": 0.6019, + "step": 6169 + }, + { + "epoch": 1.8062060889929743, + "grad_norm": 0.9687646627426147, + "learning_rate": 3.984624232023939e-06, + "loss": 0.5747, + "step": 6170 + }, + { + "epoch": 1.8064988290398127, + "grad_norm": 0.9559488296508789, + "learning_rate": 3.984314392628037e-06, + "loss": 0.6145, + "step": 6171 + }, + { + "epoch": 1.8067915690866512, + "grad_norm": 0.9717344045639038, + "learning_rate": 3.984004518016491e-06, + "loss": 0.5608, + "step": 6172 + }, + { + "epoch": 1.8070843091334896, + "grad_norm": 0.9941099286079407, + "learning_rate": 3.983694608196652e-06, + "loss": 0.6031, + "step": 6173 + }, + { + "epoch": 1.8073770491803278, + "grad_norm": 0.9983803629875183, + "learning_rate": 3.983384663175874e-06, + "loss": 0.6095, + "step": 6174 + }, + { + "epoch": 1.8076697892271663, + "grad_norm": 0.9416454434394836, + "learning_rate": 3.98307468296151e-06, + "loss": 0.6176, + "step": 6175 + }, + { + "epoch": 1.8079625292740047, + "grad_norm": 0.9413017630577087, + "learning_rate": 3.982764667560915e-06, + "loss": 0.5957, + "step": 6176 + }, + { + "epoch": 1.8082552693208431, + "grad_norm": 1.0100224018096924, + "learning_rate": 3.982454616981442e-06, + "loss": 0.597, + "step": 6177 + }, + { + "epoch": 1.8085480093676813, + "grad_norm": 1.0564130544662476, + "learning_rate": 3.982144531230448e-06, + "loss": 0.6085, + "step": 6178 + }, + { + "epoch": 1.8088407494145198, + "grad_norm": 0.8972460627555847, + "learning_rate": 3.981834410315292e-06, + "loss": 0.578, + "step": 6179 + }, + { + "epoch": 1.8091334894613582, + "grad_norm": 0.9891716837882996, + "learning_rate": 3.981524254243329e-06, + "loss": 0.6049, + "step": 6180 + }, + { + "epoch": 1.8094262295081966, + "grad_norm": 1.0043092966079712, + "learning_rate": 3.981214063021918e-06, + "loss": 0.6086, + "step": 6181 + }, + { + "epoch": 1.809718969555035, + "grad_norm": 0.9536926746368408, + "learning_rate": 3.980903836658418e-06, + "loss": 0.6147, + "step": 6182 + }, + { + "epoch": 1.8100117096018735, + "grad_norm": 0.9666758179664612, + "learning_rate": 3.980593575160192e-06, + "loss": 0.6154, + "step": 6183 + }, + { + "epoch": 1.810304449648712, + "grad_norm": 0.9707380533218384, + "learning_rate": 3.980283278534598e-06, + "loss": 0.6071, + "step": 6184 + }, + { + "epoch": 1.8105971896955504, + "grad_norm": 0.9307348132133484, + "learning_rate": 3.979972946788999e-06, + "loss": 0.6188, + "step": 6185 + }, + { + "epoch": 1.8108899297423888, + "grad_norm": 0.9855927228927612, + "learning_rate": 3.979662579930757e-06, + "loss": 0.6098, + "step": 6186 + }, + { + "epoch": 1.8111826697892273, + "grad_norm": 0.9405468106269836, + "learning_rate": 3.979352177967237e-06, + "loss": 0.6097, + "step": 6187 + }, + { + "epoch": 1.8114754098360657, + "grad_norm": 0.9482427835464478, + "learning_rate": 3.9790417409058015e-06, + "loss": 0.6084, + "step": 6188 + }, + { + "epoch": 1.8117681498829041, + "grad_norm": 0.9493122696876526, + "learning_rate": 3.978731268753818e-06, + "loss": 0.6136, + "step": 6189 + }, + { + "epoch": 1.8120608899297423, + "grad_norm": 1.0094510316848755, + "learning_rate": 3.978420761518649e-06, + "loss": 0.6422, + "step": 6190 + }, + { + "epoch": 1.8123536299765808, + "grad_norm": 0.9593505859375, + "learning_rate": 3.9781102192076645e-06, + "loss": 0.6271, + "step": 6191 + }, + { + "epoch": 1.8126463700234192, + "grad_norm": 0.9122856259346008, + "learning_rate": 3.9777996418282315e-06, + "loss": 0.5436, + "step": 6192 + }, + { + "epoch": 1.8129391100702577, + "grad_norm": 0.9714474081993103, + "learning_rate": 3.977489029387717e-06, + "loss": 0.638, + "step": 6193 + }, + { + "epoch": 1.8132318501170959, + "grad_norm": 0.955452561378479, + "learning_rate": 3.977178381893492e-06, + "loss": 0.6189, + "step": 6194 + }, + { + "epoch": 1.8135245901639343, + "grad_norm": 0.9460418820381165, + "learning_rate": 3.976867699352926e-06, + "loss": 0.5737, + "step": 6195 + }, + { + "epoch": 1.8138173302107727, + "grad_norm": 0.9253585338592529, + "learning_rate": 3.97655698177339e-06, + "loss": 0.6111, + "step": 6196 + }, + { + "epoch": 1.8141100702576112, + "grad_norm": 1.001999020576477, + "learning_rate": 3.976246229162256e-06, + "loss": 0.6318, + "step": 6197 + }, + { + "epoch": 1.8144028103044496, + "grad_norm": 0.9513369202613831, + "learning_rate": 3.975935441526897e-06, + "loss": 0.6163, + "step": 6198 + }, + { + "epoch": 1.814695550351288, + "grad_norm": 1.0022164583206177, + "learning_rate": 3.975624618874685e-06, + "loss": 0.6275, + "step": 6199 + }, + { + "epoch": 1.8149882903981265, + "grad_norm": 1.0417128801345825, + "learning_rate": 3.975313761212997e-06, + "loss": 0.6139, + "step": 6200 + }, + { + "epoch": 1.815281030444965, + "grad_norm": 1.1279088258743286, + "learning_rate": 3.975002868549205e-06, + "loss": 0.6278, + "step": 6201 + }, + { + "epoch": 1.8155737704918034, + "grad_norm": 0.9313648343086243, + "learning_rate": 3.9746919408906876e-06, + "loss": 0.6052, + "step": 6202 + }, + { + "epoch": 1.8158665105386418, + "grad_norm": 0.929446816444397, + "learning_rate": 3.97438097824482e-06, + "loss": 0.5648, + "step": 6203 + }, + { + "epoch": 1.8161592505854802, + "grad_norm": 0.9573808908462524, + "learning_rate": 3.974069980618981e-06, + "loss": 0.6166, + "step": 6204 + }, + { + "epoch": 1.8164519906323187, + "grad_norm": 0.9962764382362366, + "learning_rate": 3.973758948020549e-06, + "loss": 0.5983, + "step": 6205 + }, + { + "epoch": 1.8167447306791569, + "grad_norm": 0.9860694408416748, + "learning_rate": 3.973447880456901e-06, + "loss": 0.6372, + "step": 6206 + }, + { + "epoch": 1.8170374707259953, + "grad_norm": 0.9485680460929871, + "learning_rate": 3.97313677793542e-06, + "loss": 0.5845, + "step": 6207 + }, + { + "epoch": 1.8173302107728337, + "grad_norm": 0.9334065318107605, + "learning_rate": 3.9728256404634856e-06, + "loss": 0.5798, + "step": 6208 + }, + { + "epoch": 1.8176229508196722, + "grad_norm": 0.9175717234611511, + "learning_rate": 3.972514468048481e-06, + "loss": 0.5663, + "step": 6209 + }, + { + "epoch": 1.8179156908665104, + "grad_norm": 0.9495508670806885, + "learning_rate": 3.972203260697787e-06, + "loss": 0.5803, + "step": 6210 + }, + { + "epoch": 1.8182084309133488, + "grad_norm": 0.9386922121047974, + "learning_rate": 3.971892018418787e-06, + "loss": 0.575, + "step": 6211 + }, + { + "epoch": 1.8185011709601873, + "grad_norm": 0.943837583065033, + "learning_rate": 3.971580741218867e-06, + "loss": 0.6139, + "step": 6212 + }, + { + "epoch": 1.8187939110070257, + "grad_norm": 0.9495595693588257, + "learning_rate": 3.971269429105411e-06, + "loss": 0.6419, + "step": 6213 + }, + { + "epoch": 1.8190866510538641, + "grad_norm": 1.015149712562561, + "learning_rate": 3.970958082085805e-06, + "loss": 0.6488, + "step": 6214 + }, + { + "epoch": 1.8193793911007026, + "grad_norm": 0.9319255352020264, + "learning_rate": 3.9706467001674365e-06, + "loss": 0.6147, + "step": 6215 + }, + { + "epoch": 1.819672131147541, + "grad_norm": 0.9948639869689941, + "learning_rate": 3.970335283357692e-06, + "loss": 0.6001, + "step": 6216 + }, + { + "epoch": 1.8199648711943794, + "grad_norm": 0.909539520740509, + "learning_rate": 3.97002383166396e-06, + "loss": 0.5678, + "step": 6217 + }, + { + "epoch": 1.8202576112412179, + "grad_norm": 0.9071897864341736, + "learning_rate": 3.969712345093631e-06, + "loss": 0.5904, + "step": 6218 + }, + { + "epoch": 1.8205503512880563, + "grad_norm": 0.9435209631919861, + "learning_rate": 3.969400823654094e-06, + "loss": 0.6246, + "step": 6219 + }, + { + "epoch": 1.8208430913348947, + "grad_norm": 0.9753648042678833, + "learning_rate": 3.96908926735274e-06, + "loss": 0.5827, + "step": 6220 + }, + { + "epoch": 1.8211358313817332, + "grad_norm": 0.9284666776657104, + "learning_rate": 3.968777676196961e-06, + "loss": 0.5993, + "step": 6221 + }, + { + "epoch": 1.8214285714285714, + "grad_norm": 0.9626908302307129, + "learning_rate": 3.96846605019415e-06, + "loss": 0.6453, + "step": 6222 + }, + { + "epoch": 1.8217213114754098, + "grad_norm": 0.9602272510528564, + "learning_rate": 3.968154389351698e-06, + "loss": 0.5904, + "step": 6223 + }, + { + "epoch": 1.8220140515222483, + "grad_norm": 0.9428016543388367, + "learning_rate": 3.967842693677002e-06, + "loss": 0.6345, + "step": 6224 + }, + { + "epoch": 1.8223067915690867, + "grad_norm": 0.9290540218353271, + "learning_rate": 3.967530963177457e-06, + "loss": 0.5608, + "step": 6225 + }, + { + "epoch": 1.822599531615925, + "grad_norm": 0.9280454516410828, + "learning_rate": 3.9672191978604575e-06, + "loss": 0.6148, + "step": 6226 + }, + { + "epoch": 1.8228922716627634, + "grad_norm": 0.9744189381599426, + "learning_rate": 3.966907397733401e-06, + "loss": 0.61, + "step": 6227 + }, + { + "epoch": 1.8231850117096018, + "grad_norm": 0.9517498016357422, + "learning_rate": 3.9665955628036836e-06, + "loss": 0.6096, + "step": 6228 + }, + { + "epoch": 1.8234777517564402, + "grad_norm": 1.0353553295135498, + "learning_rate": 3.966283693078705e-06, + "loss": 0.6276, + "step": 6229 + }, + { + "epoch": 1.8237704918032787, + "grad_norm": 0.9395636320114136, + "learning_rate": 3.965971788565864e-06, + "loss": 0.5636, + "step": 6230 + }, + { + "epoch": 1.824063231850117, + "grad_norm": 1.0297759771347046, + "learning_rate": 3.965659849272562e-06, + "loss": 0.6743, + "step": 6231 + }, + { + "epoch": 1.8243559718969555, + "grad_norm": 0.9624348282814026, + "learning_rate": 3.965347875206197e-06, + "loss": 0.6002, + "step": 6232 + }, + { + "epoch": 1.824648711943794, + "grad_norm": 0.967278003692627, + "learning_rate": 3.965035866374172e-06, + "loss": 0.6167, + "step": 6233 + }, + { + "epoch": 1.8249414519906324, + "grad_norm": 0.9367884993553162, + "learning_rate": 3.964723822783891e-06, + "loss": 0.6103, + "step": 6234 + }, + { + "epoch": 1.8252341920374708, + "grad_norm": 0.9359004497528076, + "learning_rate": 3.964411744442755e-06, + "loss": 0.5711, + "step": 6235 + }, + { + "epoch": 1.8255269320843093, + "grad_norm": 0.9330704808235168, + "learning_rate": 3.9640996313581695e-06, + "loss": 0.6012, + "step": 6236 + }, + { + "epoch": 1.8258196721311475, + "grad_norm": 0.9193284511566162, + "learning_rate": 3.963787483537538e-06, + "loss": 0.5608, + "step": 6237 + }, + { + "epoch": 1.826112412177986, + "grad_norm": 0.9569916129112244, + "learning_rate": 3.963475300988267e-06, + "loss": 0.6206, + "step": 6238 + }, + { + "epoch": 1.8264051522248244, + "grad_norm": 0.9423056840896606, + "learning_rate": 3.963163083717765e-06, + "loss": 0.5821, + "step": 6239 + }, + { + "epoch": 1.8266978922716628, + "grad_norm": 1.0120781660079956, + "learning_rate": 3.962850831733437e-06, + "loss": 0.6255, + "step": 6240 + }, + { + "epoch": 1.826990632318501, + "grad_norm": 1.0107687711715698, + "learning_rate": 3.9625385450426914e-06, + "loss": 0.6224, + "step": 6241 + }, + { + "epoch": 1.8272833723653394, + "grad_norm": 0.9615622758865356, + "learning_rate": 3.962226223652939e-06, + "loss": 0.6091, + "step": 6242 + }, + { + "epoch": 1.8275761124121779, + "grad_norm": 0.9809932112693787, + "learning_rate": 3.961913867571588e-06, + "loss": 0.5821, + "step": 6243 + }, + { + "epoch": 1.8278688524590163, + "grad_norm": 0.9795226454734802, + "learning_rate": 3.961601476806049e-06, + "loss": 0.5767, + "step": 6244 + }, + { + "epoch": 1.8281615925058547, + "grad_norm": 1.0174455642700195, + "learning_rate": 3.9612890513637344e-06, + "loss": 0.5992, + "step": 6245 + }, + { + "epoch": 1.8284543325526932, + "grad_norm": 0.9216565489768982, + "learning_rate": 3.960976591252056e-06, + "loss": 0.5956, + "step": 6246 + }, + { + "epoch": 1.8287470725995316, + "grad_norm": 0.9803445339202881, + "learning_rate": 3.960664096478428e-06, + "loss": 0.6009, + "step": 6247 + }, + { + "epoch": 1.82903981264637, + "grad_norm": 0.929762601852417, + "learning_rate": 3.960351567050263e-06, + "loss": 0.5928, + "step": 6248 + }, + { + "epoch": 1.8293325526932085, + "grad_norm": 0.9271233081817627, + "learning_rate": 3.960039002974977e-06, + "loss": 0.5716, + "step": 6249 + }, + { + "epoch": 1.829625292740047, + "grad_norm": 0.9604407548904419, + "learning_rate": 3.959726404259986e-06, + "loss": 0.6088, + "step": 6250 + }, + { + "epoch": 1.8299180327868854, + "grad_norm": 0.9468733072280884, + "learning_rate": 3.959413770912705e-06, + "loss": 0.612, + "step": 6251 + }, + { + "epoch": 1.8302107728337238, + "grad_norm": 1.0015348196029663, + "learning_rate": 3.959101102940551e-06, + "loss": 0.6457, + "step": 6252 + }, + { + "epoch": 1.830503512880562, + "grad_norm": 0.9202443957328796, + "learning_rate": 3.958788400350944e-06, + "loss": 0.5598, + "step": 6253 + }, + { + "epoch": 1.8307962529274004, + "grad_norm": 0.9433732628822327, + "learning_rate": 3.958475663151301e-06, + "loss": 0.5522, + "step": 6254 + }, + { + "epoch": 1.8310889929742389, + "grad_norm": 0.9198225736618042, + "learning_rate": 3.9581628913490435e-06, + "loss": 0.6045, + "step": 6255 + }, + { + "epoch": 1.8313817330210773, + "grad_norm": 1.0186489820480347, + "learning_rate": 3.957850084951591e-06, + "loss": 0.6232, + "step": 6256 + }, + { + "epoch": 1.8316744730679155, + "grad_norm": 0.9959121942520142, + "learning_rate": 3.957537243966365e-06, + "loss": 0.5981, + "step": 6257 + }, + { + "epoch": 1.831967213114754, + "grad_norm": 0.9513674378395081, + "learning_rate": 3.957224368400788e-06, + "loss": 0.5725, + "step": 6258 + }, + { + "epoch": 1.8322599531615924, + "grad_norm": 0.8991698622703552, + "learning_rate": 3.956911458262283e-06, + "loss": 0.5585, + "step": 6259 + }, + { + "epoch": 1.8325526932084308, + "grad_norm": 0.9586885571479797, + "learning_rate": 3.956598513558274e-06, + "loss": 0.6081, + "step": 6260 + }, + { + "epoch": 1.8328454332552693, + "grad_norm": 0.9489455819129944, + "learning_rate": 3.9562855342961845e-06, + "loss": 0.6205, + "step": 6261 + }, + { + "epoch": 1.8331381733021077, + "grad_norm": 0.9805490970611572, + "learning_rate": 3.955972520483442e-06, + "loss": 0.5925, + "step": 6262 + }, + { + "epoch": 1.8334309133489461, + "grad_norm": 0.9583761692047119, + "learning_rate": 3.955659472127471e-06, + "loss": 0.6448, + "step": 6263 + }, + { + "epoch": 1.8337236533957846, + "grad_norm": 0.9939512014389038, + "learning_rate": 3.955346389235699e-06, + "loss": 0.642, + "step": 6264 + }, + { + "epoch": 1.834016393442623, + "grad_norm": 0.95876544713974, + "learning_rate": 3.955033271815556e-06, + "loss": 0.6386, + "step": 6265 + }, + { + "epoch": 1.8343091334894615, + "grad_norm": 0.9131155014038086, + "learning_rate": 3.954720119874468e-06, + "loss": 0.5741, + "step": 6266 + }, + { + "epoch": 1.8346018735362999, + "grad_norm": 0.9674673676490784, + "learning_rate": 3.954406933419865e-06, + "loss": 0.6261, + "step": 6267 + }, + { + "epoch": 1.8348946135831383, + "grad_norm": 1.0084198713302612, + "learning_rate": 3.954093712459179e-06, + "loss": 0.5906, + "step": 6268 + }, + { + "epoch": 1.8351873536299765, + "grad_norm": 0.9454860091209412, + "learning_rate": 3.95378045699984e-06, + "loss": 0.5725, + "step": 6269 + }, + { + "epoch": 1.835480093676815, + "grad_norm": 0.9467571377754211, + "learning_rate": 3.953467167049281e-06, + "loss": 0.6149, + "step": 6270 + }, + { + "epoch": 1.8357728337236534, + "grad_norm": 0.940118134021759, + "learning_rate": 3.953153842614933e-06, + "loss": 0.5558, + "step": 6271 + }, + { + "epoch": 1.8360655737704918, + "grad_norm": 0.9025898575782776, + "learning_rate": 3.952840483704232e-06, + "loss": 0.5627, + "step": 6272 + }, + { + "epoch": 1.83635831381733, + "grad_norm": 0.9730240106582642, + "learning_rate": 3.952527090324611e-06, + "loss": 0.6296, + "step": 6273 + }, + { + "epoch": 1.8366510538641685, + "grad_norm": 0.9650467038154602, + "learning_rate": 3.952213662483505e-06, + "loss": 0.5993, + "step": 6274 + }, + { + "epoch": 1.836943793911007, + "grad_norm": 0.9220966696739197, + "learning_rate": 3.9519002001883514e-06, + "loss": 0.6167, + "step": 6275 + }, + { + "epoch": 1.8372365339578454, + "grad_norm": 0.9659632444381714, + "learning_rate": 3.951586703446587e-06, + "loss": 0.5921, + "step": 6276 + }, + { + "epoch": 1.8375292740046838, + "grad_norm": 0.961034893989563, + "learning_rate": 3.951273172265649e-06, + "loss": 0.6264, + "step": 6277 + }, + { + "epoch": 1.8378220140515222, + "grad_norm": 0.9500144720077515, + "learning_rate": 3.950959606652976e-06, + "loss": 0.6268, + "step": 6278 + }, + { + "epoch": 1.8381147540983607, + "grad_norm": 0.9539287686347961, + "learning_rate": 3.950646006616007e-06, + "loss": 0.6086, + "step": 6279 + }, + { + "epoch": 1.838407494145199, + "grad_norm": 0.9569194316864014, + "learning_rate": 3.950332372162184e-06, + "loss": 0.666, + "step": 6280 + }, + { + "epoch": 1.8387002341920375, + "grad_norm": 0.9722229838371277, + "learning_rate": 3.950018703298947e-06, + "loss": 0.6354, + "step": 6281 + }, + { + "epoch": 1.838992974238876, + "grad_norm": 0.9734247326850891, + "learning_rate": 3.9497050000337365e-06, + "loss": 0.6214, + "step": 6282 + }, + { + "epoch": 1.8392857142857144, + "grad_norm": 0.942225456237793, + "learning_rate": 3.949391262373997e-06, + "loss": 0.5939, + "step": 6283 + }, + { + "epoch": 1.8395784543325528, + "grad_norm": 0.9089297652244568, + "learning_rate": 3.94907749032717e-06, + "loss": 0.6112, + "step": 6284 + }, + { + "epoch": 1.839871194379391, + "grad_norm": 0.9795874953269958, + "learning_rate": 3.9487636839007025e-06, + "loss": 0.6131, + "step": 6285 + }, + { + "epoch": 1.8401639344262295, + "grad_norm": 0.9284724593162537, + "learning_rate": 3.948449843102039e-06, + "loss": 0.5753, + "step": 6286 + }, + { + "epoch": 1.840456674473068, + "grad_norm": 0.9399732351303101, + "learning_rate": 3.9481359679386235e-06, + "loss": 0.5639, + "step": 6287 + }, + { + "epoch": 1.8407494145199064, + "grad_norm": 0.9863868951797485, + "learning_rate": 3.947822058417904e-06, + "loss": 0.5835, + "step": 6288 + }, + { + "epoch": 1.8410421545667446, + "grad_norm": 0.9447397589683533, + "learning_rate": 3.9475081145473284e-06, + "loss": 0.6117, + "step": 6289 + }, + { + "epoch": 1.841334894613583, + "grad_norm": 0.9253147840499878, + "learning_rate": 3.947194136334344e-06, + "loss": 0.6184, + "step": 6290 + }, + { + "epoch": 1.8416276346604215, + "grad_norm": 0.9487301707267761, + "learning_rate": 3.946880123786401e-06, + "loss": 0.5898, + "step": 6291 + }, + { + "epoch": 1.8419203747072599, + "grad_norm": 0.9512694478034973, + "learning_rate": 3.94656607691095e-06, + "loss": 0.6471, + "step": 6292 + }, + { + "epoch": 1.8422131147540983, + "grad_norm": 0.9662417769432068, + "learning_rate": 3.94625199571544e-06, + "loss": 0.6048, + "step": 6293 + }, + { + "epoch": 1.8425058548009368, + "grad_norm": 0.9796162247657776, + "learning_rate": 3.945937880207323e-06, + "loss": 0.6337, + "step": 6294 + }, + { + "epoch": 1.8427985948477752, + "grad_norm": 0.9426081776618958, + "learning_rate": 3.945623730394053e-06, + "loss": 0.6202, + "step": 6295 + }, + { + "epoch": 1.8430913348946136, + "grad_norm": 0.981759786605835, + "learning_rate": 3.9453095462830825e-06, + "loss": 0.5887, + "step": 6296 + }, + { + "epoch": 1.843384074941452, + "grad_norm": 0.9426006078720093, + "learning_rate": 3.944995327881865e-06, + "loss": 0.5987, + "step": 6297 + }, + { + "epoch": 1.8436768149882905, + "grad_norm": 0.9663492441177368, + "learning_rate": 3.944681075197855e-06, + "loss": 0.6283, + "step": 6298 + }, + { + "epoch": 1.843969555035129, + "grad_norm": 1.0096343755722046, + "learning_rate": 3.944366788238509e-06, + "loss": 0.6398, + "step": 6299 + }, + { + "epoch": 1.8442622950819674, + "grad_norm": 0.9098829627037048, + "learning_rate": 3.944052467011285e-06, + "loss": 0.5301, + "step": 6300 + }, + { + "epoch": 1.8445550351288056, + "grad_norm": 0.92119300365448, + "learning_rate": 3.943738111523637e-06, + "loss": 0.624, + "step": 6301 + }, + { + "epoch": 1.844847775175644, + "grad_norm": 0.9188681244850159, + "learning_rate": 3.943423721783025e-06, + "loss": 0.6394, + "step": 6302 + }, + { + "epoch": 1.8451405152224825, + "grad_norm": 0.9093318581581116, + "learning_rate": 3.943109297796909e-06, + "loss": 0.6012, + "step": 6303 + }, + { + "epoch": 1.845433255269321, + "grad_norm": 0.9733746647834778, + "learning_rate": 3.942794839572747e-06, + "loss": 0.5908, + "step": 6304 + }, + { + "epoch": 1.845725995316159, + "grad_norm": 0.9191424250602722, + "learning_rate": 3.942480347118001e-06, + "loss": 0.6299, + "step": 6305 + }, + { + "epoch": 1.8460187353629975, + "grad_norm": 0.9265088438987732, + "learning_rate": 3.942165820440131e-06, + "loss": 0.5838, + "step": 6306 + }, + { + "epoch": 1.846311475409836, + "grad_norm": 0.9656901359558105, + "learning_rate": 3.9418512595465995e-06, + "loss": 0.6422, + "step": 6307 + }, + { + "epoch": 1.8466042154566744, + "grad_norm": 0.936866283416748, + "learning_rate": 3.9415366644448705e-06, + "loss": 0.5961, + "step": 6308 + }, + { + "epoch": 1.8468969555035128, + "grad_norm": 0.9322345852851868, + "learning_rate": 3.941222035142407e-06, + "loss": 0.5667, + "step": 6309 + }, + { + "epoch": 1.8471896955503513, + "grad_norm": 0.9122965335845947, + "learning_rate": 3.940907371646674e-06, + "loss": 0.5847, + "step": 6310 + }, + { + "epoch": 1.8474824355971897, + "grad_norm": 0.9650111794471741, + "learning_rate": 3.9405926739651365e-06, + "loss": 0.6069, + "step": 6311 + }, + { + "epoch": 1.8477751756440282, + "grad_norm": 0.9993274211883545, + "learning_rate": 3.940277942105261e-06, + "loss": 0.5899, + "step": 6312 + }, + { + "epoch": 1.8480679156908666, + "grad_norm": 0.9468676447868347, + "learning_rate": 3.939963176074514e-06, + "loss": 0.5894, + "step": 6313 + }, + { + "epoch": 1.848360655737705, + "grad_norm": 0.9805901050567627, + "learning_rate": 3.939648375880365e-06, + "loss": 0.5793, + "step": 6314 + }, + { + "epoch": 1.8486533957845435, + "grad_norm": 0.9569573402404785, + "learning_rate": 3.939333541530282e-06, + "loss": 0.6033, + "step": 6315 + }, + { + "epoch": 1.848946135831382, + "grad_norm": 0.9421535730361938, + "learning_rate": 3.939018673031733e-06, + "loss": 0.6175, + "step": 6316 + }, + { + "epoch": 1.8492388758782201, + "grad_norm": 0.9477778673171997, + "learning_rate": 3.93870377039219e-06, + "loss": 0.5844, + "step": 6317 + }, + { + "epoch": 1.8495316159250585, + "grad_norm": 0.9805952906608582, + "learning_rate": 3.938388833619124e-06, + "loss": 0.6052, + "step": 6318 + }, + { + "epoch": 1.849824355971897, + "grad_norm": 0.9397863745689392, + "learning_rate": 3.938073862720006e-06, + "loss": 0.5643, + "step": 6319 + }, + { + "epoch": 1.8501170960187352, + "grad_norm": 0.9773653149604797, + "learning_rate": 3.93775885770231e-06, + "loss": 0.5982, + "step": 6320 + }, + { + "epoch": 1.8504098360655736, + "grad_norm": 0.9994926452636719, + "learning_rate": 3.937443818573508e-06, + "loss": 0.6006, + "step": 6321 + }, + { + "epoch": 1.850702576112412, + "grad_norm": 0.9782898426055908, + "learning_rate": 3.937128745341076e-06, + "loss": 0.5839, + "step": 6322 + }, + { + "epoch": 1.8509953161592505, + "grad_norm": 0.9390230178833008, + "learning_rate": 3.936813638012488e-06, + "loss": 0.5662, + "step": 6323 + }, + { + "epoch": 1.851288056206089, + "grad_norm": 0.9397550225257874, + "learning_rate": 3.93649849659522e-06, + "loss": 0.6011, + "step": 6324 + }, + { + "epoch": 1.8515807962529274, + "grad_norm": 0.9879733920097351, + "learning_rate": 3.93618332109675e-06, + "loss": 0.6347, + "step": 6325 + }, + { + "epoch": 1.8518735362997658, + "grad_norm": 0.9307846426963806, + "learning_rate": 3.935868111524555e-06, + "loss": 0.6062, + "step": 6326 + }, + { + "epoch": 1.8521662763466042, + "grad_norm": 0.9584196209907532, + "learning_rate": 3.935552867886113e-06, + "loss": 0.5936, + "step": 6327 + }, + { + "epoch": 1.8524590163934427, + "grad_norm": 0.9408822059631348, + "learning_rate": 3.935237590188903e-06, + "loss": 0.588, + "step": 6328 + }, + { + "epoch": 1.8527517564402811, + "grad_norm": 0.9891690015792847, + "learning_rate": 3.934922278440405e-06, + "loss": 0.6189, + "step": 6329 + }, + { + "epoch": 1.8530444964871196, + "grad_norm": 0.9741967916488647, + "learning_rate": 3.934606932648101e-06, + "loss": 0.5791, + "step": 6330 + }, + { + "epoch": 1.853337236533958, + "grad_norm": 0.9387686848640442, + "learning_rate": 3.934291552819473e-06, + "loss": 0.5989, + "step": 6331 + }, + { + "epoch": 1.8536299765807962, + "grad_norm": 0.9357953071594238, + "learning_rate": 3.933976138962e-06, + "loss": 0.5994, + "step": 6332 + }, + { + "epoch": 1.8539227166276346, + "grad_norm": 0.9265718460083008, + "learning_rate": 3.933660691083168e-06, + "loss": 0.5738, + "step": 6333 + }, + { + "epoch": 1.854215456674473, + "grad_norm": 0.9697136282920837, + "learning_rate": 3.933345209190462e-06, + "loss": 0.6163, + "step": 6334 + }, + { + "epoch": 1.8545081967213115, + "grad_norm": 0.9448897838592529, + "learning_rate": 3.933029693291364e-06, + "loss": 0.5989, + "step": 6335 + }, + { + "epoch": 1.8548009367681497, + "grad_norm": 0.9927088618278503, + "learning_rate": 3.932714143393363e-06, + "loss": 0.563, + "step": 6336 + }, + { + "epoch": 1.8550936768149882, + "grad_norm": 0.9463045001029968, + "learning_rate": 3.9323985595039425e-06, + "loss": 0.6432, + "step": 6337 + }, + { + "epoch": 1.8553864168618266, + "grad_norm": 1.000501275062561, + "learning_rate": 3.932082941630591e-06, + "loss": 0.6272, + "step": 6338 + }, + { + "epoch": 1.855679156908665, + "grad_norm": 0.9732459187507629, + "learning_rate": 3.931767289780797e-06, + "loss": 0.6009, + "step": 6339 + }, + { + "epoch": 1.8559718969555035, + "grad_norm": 0.9564077258110046, + "learning_rate": 3.9314516039620484e-06, + "loss": 0.6026, + "step": 6340 + }, + { + "epoch": 1.856264637002342, + "grad_norm": 0.9213919043540955, + "learning_rate": 3.931135884181837e-06, + "loss": 0.5556, + "step": 6341 + }, + { + "epoch": 1.8565573770491803, + "grad_norm": 1.0817264318466187, + "learning_rate": 3.930820130447651e-06, + "loss": 0.5857, + "step": 6342 + }, + { + "epoch": 1.8568501170960188, + "grad_norm": 1.2801885604858398, + "learning_rate": 3.930504342766982e-06, + "loss": 0.6316, + "step": 6343 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 0.9400875568389893, + "learning_rate": 3.930188521147322e-06, + "loss": 0.5461, + "step": 6344 + }, + { + "epoch": 1.8574355971896956, + "grad_norm": 1.000577449798584, + "learning_rate": 3.929872665596166e-06, + "loss": 0.6445, + "step": 6345 + }, + { + "epoch": 1.857728337236534, + "grad_norm": 0.9439194798469543, + "learning_rate": 3.929556776121006e-06, + "loss": 0.6077, + "step": 6346 + }, + { + "epoch": 1.8580210772833725, + "grad_norm": 0.9793219566345215, + "learning_rate": 3.929240852729336e-06, + "loss": 0.6258, + "step": 6347 + }, + { + "epoch": 1.8583138173302107, + "grad_norm": 1.0039453506469727, + "learning_rate": 3.928924895428653e-06, + "loss": 0.5837, + "step": 6348 + }, + { + "epoch": 1.8586065573770492, + "grad_norm": 0.9445581436157227, + "learning_rate": 3.928608904226452e-06, + "loss": 0.6147, + "step": 6349 + }, + { + "epoch": 1.8588992974238876, + "grad_norm": 0.9785468578338623, + "learning_rate": 3.92829287913023e-06, + "loss": 0.6086, + "step": 6350 + }, + { + "epoch": 1.859192037470726, + "grad_norm": 0.9732187986373901, + "learning_rate": 3.927976820147486e-06, + "loss": 0.5884, + "step": 6351 + }, + { + "epoch": 1.8594847775175642, + "grad_norm": 0.9831200838088989, + "learning_rate": 3.927660727285717e-06, + "loss": 0.5847, + "step": 6352 + }, + { + "epoch": 1.8597775175644027, + "grad_norm": 0.9619171619415283, + "learning_rate": 3.927344600552423e-06, + "loss": 0.625, + "step": 6353 + }, + { + "epoch": 1.8600702576112411, + "grad_norm": 0.9679073691368103, + "learning_rate": 3.927028439955104e-06, + "loss": 0.5748, + "step": 6354 + }, + { + "epoch": 1.8603629976580796, + "grad_norm": 0.9809025526046753, + "learning_rate": 3.926712245501261e-06, + "loss": 0.614, + "step": 6355 + }, + { + "epoch": 1.860655737704918, + "grad_norm": 0.9563438892364502, + "learning_rate": 3.926396017198396e-06, + "loss": 0.5987, + "step": 6356 + }, + { + "epoch": 1.8609484777517564, + "grad_norm": 0.9696565866470337, + "learning_rate": 3.926079755054012e-06, + "loss": 0.6027, + "step": 6357 + }, + { + "epoch": 1.8612412177985949, + "grad_norm": 0.9351434707641602, + "learning_rate": 3.925763459075611e-06, + "loss": 0.6014, + "step": 6358 + }, + { + "epoch": 1.8615339578454333, + "grad_norm": 0.9549933075904846, + "learning_rate": 3.925447129270698e-06, + "loss": 0.6076, + "step": 6359 + }, + { + "epoch": 1.8618266978922717, + "grad_norm": 1.0012909173965454, + "learning_rate": 3.925130765646779e-06, + "loss": 0.6252, + "step": 6360 + }, + { + "epoch": 1.8621194379391102, + "grad_norm": 0.9390286803245544, + "learning_rate": 3.924814368211357e-06, + "loss": 0.5935, + "step": 6361 + }, + { + "epoch": 1.8624121779859486, + "grad_norm": 0.9354349970817566, + "learning_rate": 3.924497936971942e-06, + "loss": 0.5966, + "step": 6362 + }, + { + "epoch": 1.862704918032787, + "grad_norm": 0.9521082043647766, + "learning_rate": 3.924181471936039e-06, + "loss": 0.601, + "step": 6363 + }, + { + "epoch": 1.8629976580796253, + "grad_norm": 0.9469088912010193, + "learning_rate": 3.9238649731111575e-06, + "loss": 0.6239, + "step": 6364 + }, + { + "epoch": 1.8632903981264637, + "grad_norm": 0.9218846559524536, + "learning_rate": 3.9235484405048055e-06, + "loss": 0.6244, + "step": 6365 + }, + { + "epoch": 1.8635831381733021, + "grad_norm": 0.9853828549385071, + "learning_rate": 3.923231874124494e-06, + "loss": 0.6355, + "step": 6366 + }, + { + "epoch": 1.8638758782201406, + "grad_norm": 0.9817132353782654, + "learning_rate": 3.922915273977732e-06, + "loss": 0.6482, + "step": 6367 + }, + { + "epoch": 1.8641686182669788, + "grad_norm": 0.9177657961845398, + "learning_rate": 3.922598640072032e-06, + "loss": 0.5561, + "step": 6368 + }, + { + "epoch": 1.8644613583138172, + "grad_norm": 0.9835156798362732, + "learning_rate": 3.922281972414906e-06, + "loss": 0.6245, + "step": 6369 + }, + { + "epoch": 1.8647540983606556, + "grad_norm": 0.9300404191017151, + "learning_rate": 3.9219652710138666e-06, + "loss": 0.6024, + "step": 6370 + }, + { + "epoch": 1.865046838407494, + "grad_norm": 0.9564766883850098, + "learning_rate": 3.921648535876429e-06, + "loss": 0.6345, + "step": 6371 + }, + { + "epoch": 1.8653395784543325, + "grad_norm": 0.9885044097900391, + "learning_rate": 3.921331767010106e-06, + "loss": 0.5919, + "step": 6372 + }, + { + "epoch": 1.865632318501171, + "grad_norm": 0.964855432510376, + "learning_rate": 3.9210149644224135e-06, + "loss": 0.6011, + "step": 6373 + }, + { + "epoch": 1.8659250585480094, + "grad_norm": 0.9629966616630554, + "learning_rate": 3.920698128120869e-06, + "loss": 0.607, + "step": 6374 + }, + { + "epoch": 1.8662177985948478, + "grad_norm": 0.932196319103241, + "learning_rate": 3.920381258112988e-06, + "loss": 0.5882, + "step": 6375 + }, + { + "epoch": 1.8665105386416863, + "grad_norm": 0.9120140075683594, + "learning_rate": 3.920064354406288e-06, + "loss": 0.5791, + "step": 6376 + }, + { + "epoch": 1.8668032786885247, + "grad_norm": 1.0460612773895264, + "learning_rate": 3.9197474170082885e-06, + "loss": 0.6257, + "step": 6377 + }, + { + "epoch": 1.8670960187353631, + "grad_norm": 0.9669110178947449, + "learning_rate": 3.9194304459265095e-06, + "loss": 0.5968, + "step": 6378 + }, + { + "epoch": 1.8673887587822016, + "grad_norm": 0.9562413692474365, + "learning_rate": 3.919113441168469e-06, + "loss": 0.6183, + "step": 6379 + }, + { + "epoch": 1.8676814988290398, + "grad_norm": 0.9371131062507629, + "learning_rate": 3.918796402741692e-06, + "loss": 0.5645, + "step": 6380 + }, + { + "epoch": 1.8679742388758782, + "grad_norm": 0.9507868885993958, + "learning_rate": 3.918479330653696e-06, + "loss": 0.6041, + "step": 6381 + }, + { + "epoch": 1.8682669789227166, + "grad_norm": 0.9623852968215942, + "learning_rate": 3.918162224912006e-06, + "loss": 0.6323, + "step": 6382 + }, + { + "epoch": 1.868559718969555, + "grad_norm": 0.9406799674034119, + "learning_rate": 3.917845085524145e-06, + "loss": 0.5882, + "step": 6383 + }, + { + "epoch": 1.8688524590163933, + "grad_norm": 0.9652610421180725, + "learning_rate": 3.917527912497635e-06, + "loss": 0.6308, + "step": 6384 + }, + { + "epoch": 1.8691451990632317, + "grad_norm": 0.9762948155403137, + "learning_rate": 3.917210705840004e-06, + "loss": 0.6053, + "step": 6385 + }, + { + "epoch": 1.8694379391100702, + "grad_norm": 1.0097278356552124, + "learning_rate": 3.9168934655587775e-06, + "loss": 0.5657, + "step": 6386 + }, + { + "epoch": 1.8697306791569086, + "grad_norm": 0.9710915684700012, + "learning_rate": 3.9165761916614815e-06, + "loss": 0.5918, + "step": 6387 + }, + { + "epoch": 1.870023419203747, + "grad_norm": 0.9512038230895996, + "learning_rate": 3.916258884155643e-06, + "loss": 0.6069, + "step": 6388 + }, + { + "epoch": 1.8703161592505855, + "grad_norm": 0.9690318703651428, + "learning_rate": 3.915941543048789e-06, + "loss": 0.5634, + "step": 6389 + }, + { + "epoch": 1.870608899297424, + "grad_norm": 1.137939453125, + "learning_rate": 3.9156241683484506e-06, + "loss": 0.61, + "step": 6390 + }, + { + "epoch": 1.8709016393442623, + "grad_norm": 0.9879637360572815, + "learning_rate": 3.915306760062157e-06, + "loss": 0.6189, + "step": 6391 + }, + { + "epoch": 1.8711943793911008, + "grad_norm": 0.9382737278938293, + "learning_rate": 3.914989318197439e-06, + "loss": 0.6231, + "step": 6392 + }, + { + "epoch": 1.8714871194379392, + "grad_norm": 0.9198970198631287, + "learning_rate": 3.914671842761828e-06, + "loss": 0.6052, + "step": 6393 + }, + { + "epoch": 1.8717798594847777, + "grad_norm": 0.9563627243041992, + "learning_rate": 3.914354333762854e-06, + "loss": 0.5724, + "step": 6394 + }, + { + "epoch": 1.872072599531616, + "grad_norm": 0.9794101119041443, + "learning_rate": 3.914036791208053e-06, + "loss": 0.5969, + "step": 6395 + }, + { + "epoch": 1.8723653395784543, + "grad_norm": 0.9330150485038757, + "learning_rate": 3.913719215104958e-06, + "loss": 0.5519, + "step": 6396 + }, + { + "epoch": 1.8726580796252927, + "grad_norm": 0.9758045077323914, + "learning_rate": 3.913401605461101e-06, + "loss": 0.6265, + "step": 6397 + }, + { + "epoch": 1.8729508196721312, + "grad_norm": 0.9384021162986755, + "learning_rate": 3.913083962284021e-06, + "loss": 0.5892, + "step": 6398 + }, + { + "epoch": 1.8732435597189696, + "grad_norm": 1.0079048871994019, + "learning_rate": 3.912766285581252e-06, + "loss": 0.5922, + "step": 6399 + }, + { + "epoch": 1.8735362997658078, + "grad_norm": 0.924126386642456, + "learning_rate": 3.912448575360332e-06, + "loss": 0.6246, + "step": 6400 + }, + { + "epoch": 1.8738290398126463, + "grad_norm": 0.9548190832138062, + "learning_rate": 3.912130831628797e-06, + "loss": 0.559, + "step": 6401 + }, + { + "epoch": 1.8741217798594847, + "grad_norm": 0.9244149923324585, + "learning_rate": 3.911813054394188e-06, + "loss": 0.5496, + "step": 6402 + }, + { + "epoch": 1.8744145199063231, + "grad_norm": 0.9647964835166931, + "learning_rate": 3.911495243664042e-06, + "loss": 0.6257, + "step": 6403 + }, + { + "epoch": 1.8747072599531616, + "grad_norm": 0.9812660813331604, + "learning_rate": 3.911177399445901e-06, + "loss": 0.6043, + "step": 6404 + }, + { + "epoch": 1.875, + "grad_norm": 0.9274317026138306, + "learning_rate": 3.910859521747304e-06, + "loss": 0.6009, + "step": 6405 + }, + { + "epoch": 1.8752927400468384, + "grad_norm": 0.9911108016967773, + "learning_rate": 3.910541610575795e-06, + "loss": 0.5994, + "step": 6406 + }, + { + "epoch": 1.8755854800936769, + "grad_norm": 0.9478574395179749, + "learning_rate": 3.910223665938915e-06, + "loss": 0.6055, + "step": 6407 + }, + { + "epoch": 1.8758782201405153, + "grad_norm": 0.9495585560798645, + "learning_rate": 3.909905687844207e-06, + "loss": 0.6524, + "step": 6408 + }, + { + "epoch": 1.8761709601873537, + "grad_norm": 1.013312578201294, + "learning_rate": 3.909587676299216e-06, + "loss": 0.6155, + "step": 6409 + }, + { + "epoch": 1.8764637002341922, + "grad_norm": 1.0219630002975464, + "learning_rate": 3.909269631311487e-06, + "loss": 0.6298, + "step": 6410 + }, + { + "epoch": 1.8767564402810304, + "grad_norm": 0.9293389320373535, + "learning_rate": 3.9089515528885644e-06, + "loss": 0.5853, + "step": 6411 + }, + { + "epoch": 1.8770491803278688, + "grad_norm": 0.962261974811554, + "learning_rate": 3.908633441037996e-06, + "loss": 0.6201, + "step": 6412 + }, + { + "epoch": 1.8773419203747073, + "grad_norm": 1.0310230255126953, + "learning_rate": 3.9083152957673284e-06, + "loss": 0.6018, + "step": 6413 + }, + { + "epoch": 1.8776346604215457, + "grad_norm": 0.9771367907524109, + "learning_rate": 3.9079971170841105e-06, + "loss": 0.6378, + "step": 6414 + }, + { + "epoch": 1.877927400468384, + "grad_norm": 1.070151925086975, + "learning_rate": 3.907678904995889e-06, + "loss": 0.5984, + "step": 6415 + }, + { + "epoch": 1.8782201405152223, + "grad_norm": 0.9560325741767883, + "learning_rate": 3.907360659510217e-06, + "loss": 0.5846, + "step": 6416 + }, + { + "epoch": 1.8785128805620608, + "grad_norm": 0.9462926387786865, + "learning_rate": 3.907042380634641e-06, + "loss": 0.6206, + "step": 6417 + }, + { + "epoch": 1.8788056206088992, + "grad_norm": 1.0063471794128418, + "learning_rate": 3.906724068376716e-06, + "loss": 0.6179, + "step": 6418 + }, + { + "epoch": 1.8790983606557377, + "grad_norm": 0.9490744471549988, + "learning_rate": 3.906405722743991e-06, + "loss": 0.6313, + "step": 6419 + }, + { + "epoch": 1.879391100702576, + "grad_norm": 0.9917464852333069, + "learning_rate": 3.90608734374402e-06, + "loss": 0.5998, + "step": 6420 + }, + { + "epoch": 1.8796838407494145, + "grad_norm": 0.9329643845558167, + "learning_rate": 3.905768931384357e-06, + "loss": 0.5915, + "step": 6421 + }, + { + "epoch": 1.879976580796253, + "grad_norm": 0.9821494221687317, + "learning_rate": 3.905450485672557e-06, + "loss": 0.5786, + "step": 6422 + }, + { + "epoch": 1.8802693208430914, + "grad_norm": 0.9572768807411194, + "learning_rate": 3.905132006616174e-06, + "loss": 0.6127, + "step": 6423 + }, + { + "epoch": 1.8805620608899298, + "grad_norm": 0.9645853638648987, + "learning_rate": 3.904813494222762e-06, + "loss": 0.5964, + "step": 6424 + }, + { + "epoch": 1.8808548009367683, + "grad_norm": 0.9772170782089233, + "learning_rate": 3.904494948499882e-06, + "loss": 0.6565, + "step": 6425 + }, + { + "epoch": 1.8811475409836067, + "grad_norm": 0.944298505783081, + "learning_rate": 3.904176369455089e-06, + "loss": 0.5995, + "step": 6426 + }, + { + "epoch": 1.881440281030445, + "grad_norm": 0.9562715888023376, + "learning_rate": 3.9038577570959426e-06, + "loss": 0.6351, + "step": 6427 + }, + { + "epoch": 1.8817330210772834, + "grad_norm": 0.9854904413223267, + "learning_rate": 3.90353911143e-06, + "loss": 0.5993, + "step": 6428 + }, + { + "epoch": 1.8820257611241218, + "grad_norm": 0.9774163365364075, + "learning_rate": 3.903220432464823e-06, + "loss": 0.62, + "step": 6429 + }, + { + "epoch": 1.8823185011709602, + "grad_norm": 0.9668650031089783, + "learning_rate": 3.90290172020797e-06, + "loss": 0.5892, + "step": 6430 + }, + { + "epoch": 1.8826112412177984, + "grad_norm": 0.9530554413795471, + "learning_rate": 3.902582974667006e-06, + "loss": 0.6464, + "step": 6431 + }, + { + "epoch": 1.8829039812646369, + "grad_norm": 0.9498767256736755, + "learning_rate": 3.902264195849491e-06, + "loss": 0.6004, + "step": 6432 + }, + { + "epoch": 1.8831967213114753, + "grad_norm": 0.9867331385612488, + "learning_rate": 3.901945383762989e-06, + "loss": 0.6344, + "step": 6433 + }, + { + "epoch": 1.8834894613583137, + "grad_norm": 1.016216516494751, + "learning_rate": 3.9016265384150635e-06, + "loss": 0.6402, + "step": 6434 + }, + { + "epoch": 1.8837822014051522, + "grad_norm": 1.0312061309814453, + "learning_rate": 3.901307659813278e-06, + "loss": 0.6257, + "step": 6435 + }, + { + "epoch": 1.8840749414519906, + "grad_norm": 1.0160499811172485, + "learning_rate": 3.9009887479651995e-06, + "loss": 0.6075, + "step": 6436 + }, + { + "epoch": 1.884367681498829, + "grad_norm": 0.9634211659431458, + "learning_rate": 3.900669802878394e-06, + "loss": 0.6109, + "step": 6437 + }, + { + "epoch": 1.8846604215456675, + "grad_norm": 0.9396457672119141, + "learning_rate": 3.900350824560429e-06, + "loss": 0.615, + "step": 6438 + }, + { + "epoch": 1.884953161592506, + "grad_norm": 0.9735025763511658, + "learning_rate": 3.90003181301887e-06, + "loss": 0.6214, + "step": 6439 + }, + { + "epoch": 1.8852459016393444, + "grad_norm": 0.9591636657714844, + "learning_rate": 3.8997127682612875e-06, + "loss": 0.6173, + "step": 6440 + }, + { + "epoch": 1.8855386416861828, + "grad_norm": 0.9519355893135071, + "learning_rate": 3.899393690295252e-06, + "loss": 0.6483, + "step": 6441 + }, + { + "epoch": 1.8858313817330212, + "grad_norm": 0.9473040103912354, + "learning_rate": 3.899074579128331e-06, + "loss": 0.5645, + "step": 6442 + }, + { + "epoch": 1.8861241217798594, + "grad_norm": 0.9422733187675476, + "learning_rate": 3.8987554347680985e-06, + "loss": 0.6038, + "step": 6443 + }, + { + "epoch": 1.8864168618266979, + "grad_norm": 1.008864164352417, + "learning_rate": 3.898436257222124e-06, + "loss": 0.5714, + "step": 6444 + }, + { + "epoch": 1.8867096018735363, + "grad_norm": 0.9442155361175537, + "learning_rate": 3.898117046497979e-06, + "loss": 0.615, + "step": 6445 + }, + { + "epoch": 1.8870023419203747, + "grad_norm": 0.9296687841415405, + "learning_rate": 3.89779780260324e-06, + "loss": 0.5707, + "step": 6446 + }, + { + "epoch": 1.887295081967213, + "grad_norm": 0.9676709175109863, + "learning_rate": 3.897478525545479e-06, + "loss": 0.5882, + "step": 6447 + }, + { + "epoch": 1.8875878220140514, + "grad_norm": 0.9065234065055847, + "learning_rate": 3.897159215332272e-06, + "loss": 0.5847, + "step": 6448 + }, + { + "epoch": 1.8878805620608898, + "grad_norm": 0.9740220904350281, + "learning_rate": 3.8968398719711944e-06, + "loss": 0.6175, + "step": 6449 + }, + { + "epoch": 1.8881733021077283, + "grad_norm": 0.9571624994277954, + "learning_rate": 3.8965204954698214e-06, + "loss": 0.6232, + "step": 6450 + }, + { + "epoch": 1.8884660421545667, + "grad_norm": 0.9606812000274658, + "learning_rate": 3.896201085835732e-06, + "loss": 0.6046, + "step": 6451 + }, + { + "epoch": 1.8887587822014051, + "grad_norm": 0.9436510801315308, + "learning_rate": 3.895881643076503e-06, + "loss": 0.5705, + "step": 6452 + }, + { + "epoch": 1.8890515222482436, + "grad_norm": 0.9180222153663635, + "learning_rate": 3.895562167199716e-06, + "loss": 0.584, + "step": 6453 + }, + { + "epoch": 1.889344262295082, + "grad_norm": 0.9322636723518372, + "learning_rate": 3.895242658212946e-06, + "loss": 0.6013, + "step": 6454 + }, + { + "epoch": 1.8896370023419204, + "grad_norm": 0.9377424120903015, + "learning_rate": 3.894923116123776e-06, + "loss": 0.6071, + "step": 6455 + }, + { + "epoch": 1.8899297423887589, + "grad_norm": 0.9286126494407654, + "learning_rate": 3.894603540939789e-06, + "loss": 0.6102, + "step": 6456 + }, + { + "epoch": 1.8902224824355973, + "grad_norm": 1.0225462913513184, + "learning_rate": 3.894283932668563e-06, + "loss": 0.6146, + "step": 6457 + }, + { + "epoch": 1.8905152224824358, + "grad_norm": 0.9715913534164429, + "learning_rate": 3.8939642913176844e-06, + "loss": 0.6348, + "step": 6458 + }, + { + "epoch": 1.890807962529274, + "grad_norm": 0.9360010027885437, + "learning_rate": 3.893644616894734e-06, + "loss": 0.6052, + "step": 6459 + }, + { + "epoch": 1.8911007025761124, + "grad_norm": 0.9146936535835266, + "learning_rate": 3.893324909407298e-06, + "loss": 0.592, + "step": 6460 + }, + { + "epoch": 1.8913934426229508, + "grad_norm": 0.8675566911697388, + "learning_rate": 3.8930051688629605e-06, + "loss": 0.5635, + "step": 6461 + }, + { + "epoch": 1.8916861826697893, + "grad_norm": 0.9453274011611938, + "learning_rate": 3.892685395269308e-06, + "loss": 0.5881, + "step": 6462 + }, + { + "epoch": 1.8919789227166275, + "grad_norm": 0.9678349494934082, + "learning_rate": 3.8923655886339265e-06, + "loss": 0.6318, + "step": 6463 + }, + { + "epoch": 1.892271662763466, + "grad_norm": 1.04598069190979, + "learning_rate": 3.892045748964405e-06, + "loss": 0.6018, + "step": 6464 + }, + { + "epoch": 1.8925644028103044, + "grad_norm": 0.9342525601387024, + "learning_rate": 3.891725876268329e-06, + "loss": 0.5486, + "step": 6465 + }, + { + "epoch": 1.8928571428571428, + "grad_norm": 0.9606859087944031, + "learning_rate": 3.89140597055329e-06, + "loss": 0.5908, + "step": 6466 + }, + { + "epoch": 1.8931498829039812, + "grad_norm": 0.9611129760742188, + "learning_rate": 3.8910860318268775e-06, + "loss": 0.5951, + "step": 6467 + }, + { + "epoch": 1.8934426229508197, + "grad_norm": 0.9232671856880188, + "learning_rate": 3.890766060096681e-06, + "loss": 0.5572, + "step": 6468 + }, + { + "epoch": 1.893735362997658, + "grad_norm": 0.9686084389686584, + "learning_rate": 3.890446055370293e-06, + "loss": 0.6224, + "step": 6469 + }, + { + "epoch": 1.8940281030444965, + "grad_norm": 0.969631552696228, + "learning_rate": 3.8901260176553046e-06, + "loss": 0.6136, + "step": 6470 + }, + { + "epoch": 1.894320843091335, + "grad_norm": 0.9522787928581238, + "learning_rate": 3.8898059469593095e-06, + "loss": 0.642, + "step": 6471 + }, + { + "epoch": 1.8946135831381734, + "grad_norm": 0.9559014439582825, + "learning_rate": 3.8894858432899015e-06, + "loss": 0.6072, + "step": 6472 + }, + { + "epoch": 1.8949063231850118, + "grad_norm": 0.9170799255371094, + "learning_rate": 3.889165706654676e-06, + "loss": 0.5962, + "step": 6473 + }, + { + "epoch": 1.8951990632318503, + "grad_norm": 0.9805999398231506, + "learning_rate": 3.8888455370612255e-06, + "loss": 0.6138, + "step": 6474 + }, + { + "epoch": 1.8954918032786885, + "grad_norm": 0.9716983437538147, + "learning_rate": 3.888525334517148e-06, + "loss": 0.5817, + "step": 6475 + }, + { + "epoch": 1.895784543325527, + "grad_norm": 0.981335461139679, + "learning_rate": 3.888205099030041e-06, + "loss": 0.6444, + "step": 6476 + }, + { + "epoch": 1.8960772833723654, + "grad_norm": 0.9961607456207275, + "learning_rate": 3.887884830607501e-06, + "loss": 0.6006, + "step": 6477 + }, + { + "epoch": 1.8963700234192038, + "grad_norm": 0.9419275522232056, + "learning_rate": 3.887564529257126e-06, + "loss": 0.5907, + "step": 6478 + }, + { + "epoch": 1.896662763466042, + "grad_norm": 0.9984913468360901, + "learning_rate": 3.887244194986517e-06, + "loss": 0.6046, + "step": 6479 + }, + { + "epoch": 1.8969555035128804, + "grad_norm": 0.9334768056869507, + "learning_rate": 3.886923827803273e-06, + "loss": 0.6394, + "step": 6480 + }, + { + "epoch": 1.8972482435597189, + "grad_norm": 0.9695730209350586, + "learning_rate": 3.886603427714994e-06, + "loss": 0.606, + "step": 6481 + }, + { + "epoch": 1.8975409836065573, + "grad_norm": 0.9474546313285828, + "learning_rate": 3.886282994729282e-06, + "loss": 0.5983, + "step": 6482 + }, + { + "epoch": 1.8978337236533958, + "grad_norm": 0.9911462664604187, + "learning_rate": 3.885962528853742e-06, + "loss": 0.5793, + "step": 6483 + }, + { + "epoch": 1.8981264637002342, + "grad_norm": 1.020256519317627, + "learning_rate": 3.885642030095972e-06, + "loss": 0.549, + "step": 6484 + }, + { + "epoch": 1.8984192037470726, + "grad_norm": 0.9724757671356201, + "learning_rate": 3.88532149846358e-06, + "loss": 0.5864, + "step": 6485 + }, + { + "epoch": 1.898711943793911, + "grad_norm": 0.9624811410903931, + "learning_rate": 3.885000933964168e-06, + "loss": 0.5856, + "step": 6486 + }, + { + "epoch": 1.8990046838407495, + "grad_norm": 0.8908076286315918, + "learning_rate": 3.884680336605344e-06, + "loss": 0.6046, + "step": 6487 + }, + { + "epoch": 1.899297423887588, + "grad_norm": 0.9800412058830261, + "learning_rate": 3.884359706394713e-06, + "loss": 0.6381, + "step": 6488 + }, + { + "epoch": 1.8995901639344264, + "grad_norm": 1.0159860849380493, + "learning_rate": 3.884039043339882e-06, + "loss": 0.5913, + "step": 6489 + }, + { + "epoch": 1.8998829039812648, + "grad_norm": 0.9525268077850342, + "learning_rate": 3.883718347448458e-06, + "loss": 0.5662, + "step": 6490 + }, + { + "epoch": 1.900175644028103, + "grad_norm": 0.9296473860740662, + "learning_rate": 3.883397618728051e-06, + "loss": 0.5611, + "step": 6491 + }, + { + "epoch": 1.9004683840749415, + "grad_norm": 1.0009924173355103, + "learning_rate": 3.883076857186269e-06, + "loss": 0.6232, + "step": 6492 + }, + { + "epoch": 1.9007611241217799, + "grad_norm": 0.9592916369438171, + "learning_rate": 3.882756062830724e-06, + "loss": 0.5889, + "step": 6493 + }, + { + "epoch": 1.901053864168618, + "grad_norm": 0.9552302360534668, + "learning_rate": 3.882435235669025e-06, + "loss": 0.6096, + "step": 6494 + }, + { + "epoch": 1.9013466042154565, + "grad_norm": 0.9626867175102234, + "learning_rate": 3.882114375708784e-06, + "loss": 0.6277, + "step": 6495 + }, + { + "epoch": 1.901639344262295, + "grad_norm": 0.9396390318870544, + "learning_rate": 3.8817934829576145e-06, + "loss": 0.5848, + "step": 6496 + }, + { + "epoch": 1.9019320843091334, + "grad_norm": 0.9043396711349487, + "learning_rate": 3.88147255742313e-06, + "loss": 0.6156, + "step": 6497 + }, + { + "epoch": 1.9022248243559718, + "grad_norm": 0.9871586561203003, + "learning_rate": 3.881151599112942e-06, + "loss": 0.6173, + "step": 6498 + }, + { + "epoch": 1.9025175644028103, + "grad_norm": 0.9636646509170532, + "learning_rate": 3.8808306080346685e-06, + "loss": 0.5769, + "step": 6499 + }, + { + "epoch": 1.9028103044496487, + "grad_norm": 0.9303101301193237, + "learning_rate": 3.880509584195922e-06, + "loss": 0.5641, + "step": 6500 + }, + { + "epoch": 1.9031030444964872, + "grad_norm": 0.9711638689041138, + "learning_rate": 3.880188527604321e-06, + "loss": 0.5955, + "step": 6501 + }, + { + "epoch": 1.9033957845433256, + "grad_norm": 0.9320586919784546, + "learning_rate": 3.8798674382674826e-06, + "loss": 0.5848, + "step": 6502 + }, + { + "epoch": 1.903688524590164, + "grad_norm": 0.9430696368217468, + "learning_rate": 3.879546316193023e-06, + "loss": 0.6383, + "step": 6503 + }, + { + "epoch": 1.9039812646370025, + "grad_norm": 0.9530836343765259, + "learning_rate": 3.879225161388564e-06, + "loss": 0.6299, + "step": 6504 + }, + { + "epoch": 1.904274004683841, + "grad_norm": 0.9883562922477722, + "learning_rate": 3.878903973861721e-06, + "loss": 0.6016, + "step": 6505 + }, + { + "epoch": 1.904566744730679, + "grad_norm": 0.9842495322227478, + "learning_rate": 3.878582753620117e-06, + "loss": 0.6402, + "step": 6506 + }, + { + "epoch": 1.9048594847775175, + "grad_norm": 0.9593751430511475, + "learning_rate": 3.878261500671372e-06, + "loss": 0.6347, + "step": 6507 + }, + { + "epoch": 1.905152224824356, + "grad_norm": 0.9645188450813293, + "learning_rate": 3.877940215023109e-06, + "loss": 0.62, + "step": 6508 + }, + { + "epoch": 1.9054449648711944, + "grad_norm": 0.9836932420730591, + "learning_rate": 3.877618896682949e-06, + "loss": 0.5929, + "step": 6509 + }, + { + "epoch": 1.9057377049180326, + "grad_norm": 0.9614861607551575, + "learning_rate": 3.877297545658516e-06, + "loss": 0.6143, + "step": 6510 + }, + { + "epoch": 1.906030444964871, + "grad_norm": 0.9384861588478088, + "learning_rate": 3.876976161957434e-06, + "loss": 0.545, + "step": 6511 + }, + { + "epoch": 1.9063231850117095, + "grad_norm": 1.0069535970687866, + "learning_rate": 3.8766547455873286e-06, + "loss": 0.5893, + "step": 6512 + }, + { + "epoch": 1.906615925058548, + "grad_norm": 0.9423180818557739, + "learning_rate": 3.876333296555825e-06, + "loss": 0.5401, + "step": 6513 + }, + { + "epoch": 1.9069086651053864, + "grad_norm": 0.9739196300506592, + "learning_rate": 3.876011814870548e-06, + "loss": 0.6077, + "step": 6514 + }, + { + "epoch": 1.9072014051522248, + "grad_norm": 0.9291735291481018, + "learning_rate": 3.875690300539128e-06, + "loss": 0.642, + "step": 6515 + }, + { + "epoch": 1.9074941451990632, + "grad_norm": 0.9280230402946472, + "learning_rate": 3.875368753569191e-06, + "loss": 0.562, + "step": 6516 + }, + { + "epoch": 1.9077868852459017, + "grad_norm": 1.0100058317184448, + "learning_rate": 3.875047173968365e-06, + "loss": 0.6165, + "step": 6517 + }, + { + "epoch": 1.9080796252927401, + "grad_norm": 0.9872006773948669, + "learning_rate": 3.874725561744282e-06, + "loss": 0.6266, + "step": 6518 + }, + { + "epoch": 1.9083723653395785, + "grad_norm": 0.9744424819946289, + "learning_rate": 3.87440391690457e-06, + "loss": 0.6272, + "step": 6519 + }, + { + "epoch": 1.908665105386417, + "grad_norm": 0.9602122902870178, + "learning_rate": 3.874082239456862e-06, + "loss": 0.6293, + "step": 6520 + }, + { + "epoch": 1.9089578454332554, + "grad_norm": 0.9816181063652039, + "learning_rate": 3.873760529408788e-06, + "loss": 0.6378, + "step": 6521 + }, + { + "epoch": 1.9092505854800936, + "grad_norm": 0.9113682508468628, + "learning_rate": 3.873438786767982e-06, + "loss": 0.5444, + "step": 6522 + }, + { + "epoch": 1.909543325526932, + "grad_norm": 1.0002018213272095, + "learning_rate": 3.873117011542076e-06, + "loss": 0.6392, + "step": 6523 + }, + { + "epoch": 1.9098360655737705, + "grad_norm": 0.9235227704048157, + "learning_rate": 3.872795203738706e-06, + "loss": 0.6251, + "step": 6524 + }, + { + "epoch": 1.910128805620609, + "grad_norm": 0.9621080160140991, + "learning_rate": 3.872473363365506e-06, + "loss": 0.6114, + "step": 6525 + }, + { + "epoch": 1.9104215456674472, + "grad_norm": 0.8926423788070679, + "learning_rate": 3.8721514904301114e-06, + "loss": 0.5993, + "step": 6526 + }, + { + "epoch": 1.9107142857142856, + "grad_norm": 0.9422827363014221, + "learning_rate": 3.87182958494016e-06, + "loss": 0.5965, + "step": 6527 + }, + { + "epoch": 1.911007025761124, + "grad_norm": 0.9190835952758789, + "learning_rate": 3.871507646903286e-06, + "loss": 0.5618, + "step": 6528 + }, + { + "epoch": 1.9112997658079625, + "grad_norm": 0.9227039813995361, + "learning_rate": 3.871185676327132e-06, + "loss": 0.6278, + "step": 6529 + }, + { + "epoch": 1.911592505854801, + "grad_norm": 0.9730623960494995, + "learning_rate": 3.870863673219334e-06, + "loss": 0.6392, + "step": 6530 + }, + { + "epoch": 1.9118852459016393, + "grad_norm": 0.9452234506607056, + "learning_rate": 3.870541637587531e-06, + "loss": 0.6213, + "step": 6531 + }, + { + "epoch": 1.9121779859484778, + "grad_norm": 0.9423407912254333, + "learning_rate": 3.870219569439364e-06, + "loss": 0.6288, + "step": 6532 + }, + { + "epoch": 1.9124707259953162, + "grad_norm": 0.9804065823554993, + "learning_rate": 3.869897468782475e-06, + "loss": 0.5734, + "step": 6533 + }, + { + "epoch": 1.9127634660421546, + "grad_norm": 0.9591841101646423, + "learning_rate": 3.8695753356245066e-06, + "loss": 0.5651, + "step": 6534 + }, + { + "epoch": 1.913056206088993, + "grad_norm": 0.9707967042922974, + "learning_rate": 3.869253169973099e-06, + "loss": 0.6122, + "step": 6535 + }, + { + "epoch": 1.9133489461358315, + "grad_norm": 0.9303954243659973, + "learning_rate": 3.868930971835897e-06, + "loss": 0.6137, + "step": 6536 + }, + { + "epoch": 1.91364168618267, + "grad_norm": 0.914708137512207, + "learning_rate": 3.8686087412205445e-06, + "loss": 0.5868, + "step": 6537 + }, + { + "epoch": 1.9139344262295082, + "grad_norm": 0.937812089920044, + "learning_rate": 3.868286478134687e-06, + "loss": 0.5694, + "step": 6538 + }, + { + "epoch": 1.9142271662763466, + "grad_norm": 0.9501574039459229, + "learning_rate": 3.86796418258597e-06, + "loss": 0.5909, + "step": 6539 + }, + { + "epoch": 1.914519906323185, + "grad_norm": 0.9607599377632141, + "learning_rate": 3.867641854582039e-06, + "loss": 0.6402, + "step": 6540 + }, + { + "epoch": 1.9148126463700235, + "grad_norm": 1.0117473602294922, + "learning_rate": 3.867319494130544e-06, + "loss": 0.5971, + "step": 6541 + }, + { + "epoch": 1.9151053864168617, + "grad_norm": 0.9426794052124023, + "learning_rate": 3.86699710123913e-06, + "loss": 0.6211, + "step": 6542 + }, + { + "epoch": 1.9153981264637001, + "grad_norm": 0.9553399682044983, + "learning_rate": 3.866674675915447e-06, + "loss": 0.5945, + "step": 6543 + }, + { + "epoch": 1.9156908665105385, + "grad_norm": 0.9433286190032959, + "learning_rate": 3.866352218167146e-06, + "loss": 0.6346, + "step": 6544 + }, + { + "epoch": 1.915983606557377, + "grad_norm": 0.9531499743461609, + "learning_rate": 3.8660297280018755e-06, + "loss": 0.5843, + "step": 6545 + }, + { + "epoch": 1.9162763466042154, + "grad_norm": 0.9166333675384521, + "learning_rate": 3.865707205427287e-06, + "loss": 0.6039, + "step": 6546 + }, + { + "epoch": 1.9165690866510539, + "grad_norm": 0.9348241090774536, + "learning_rate": 3.865384650451033e-06, + "loss": 0.6381, + "step": 6547 + }, + { + "epoch": 1.9168618266978923, + "grad_norm": 1.0116969347000122, + "learning_rate": 3.865062063080765e-06, + "loss": 0.5919, + "step": 6548 + }, + { + "epoch": 1.9171545667447307, + "grad_norm": 0.9094985127449036, + "learning_rate": 3.864739443324138e-06, + "loss": 0.5844, + "step": 6549 + }, + { + "epoch": 1.9174473067915692, + "grad_norm": 0.9287099838256836, + "learning_rate": 3.864416791188806e-06, + "loss": 0.6232, + "step": 6550 + }, + { + "epoch": 1.9177400468384076, + "grad_norm": 0.9350426197052002, + "learning_rate": 3.864094106682422e-06, + "loss": 0.5767, + "step": 6551 + }, + { + "epoch": 1.918032786885246, + "grad_norm": 0.9584366679191589, + "learning_rate": 3.863771389812645e-06, + "loss": 0.565, + "step": 6552 + }, + { + "epoch": 1.9183255269320845, + "grad_norm": 0.9949343800544739, + "learning_rate": 3.863448640587129e-06, + "loss": 0.6158, + "step": 6553 + }, + { + "epoch": 1.9186182669789227, + "grad_norm": 0.9320353865623474, + "learning_rate": 3.8631258590135316e-06, + "loss": 0.6364, + "step": 6554 + }, + { + "epoch": 1.9189110070257611, + "grad_norm": 0.986731767654419, + "learning_rate": 3.8628030450995116e-06, + "loss": 0.6088, + "step": 6555 + }, + { + "epoch": 1.9192037470725996, + "grad_norm": 0.9401959180831909, + "learning_rate": 3.862480198852728e-06, + "loss": 0.5946, + "step": 6556 + }, + { + "epoch": 1.919496487119438, + "grad_norm": 0.9556440711021423, + "learning_rate": 3.862157320280839e-06, + "loss": 0.6186, + "step": 6557 + }, + { + "epoch": 1.9197892271662762, + "grad_norm": 1.065079927444458, + "learning_rate": 3.861834409391507e-06, + "loss": 0.6378, + "step": 6558 + }, + { + "epoch": 1.9200819672131146, + "grad_norm": 0.9248261451721191, + "learning_rate": 3.861511466192392e-06, + "loss": 0.6314, + "step": 6559 + }, + { + "epoch": 1.920374707259953, + "grad_norm": 0.899074912071228, + "learning_rate": 3.861188490691154e-06, + "loss": 0.6085, + "step": 6560 + }, + { + "epoch": 1.9206674473067915, + "grad_norm": 0.9456394910812378, + "learning_rate": 3.860865482895459e-06, + "loss": 0.6203, + "step": 6561 + }, + { + "epoch": 1.92096018735363, + "grad_norm": 0.9505957365036011, + "learning_rate": 3.860542442812969e-06, + "loss": 0.6455, + "step": 6562 + }, + { + "epoch": 1.9212529274004684, + "grad_norm": 0.9795944094657898, + "learning_rate": 3.860219370451348e-06, + "loss": 0.5886, + "step": 6563 + }, + { + "epoch": 1.9215456674473068, + "grad_norm": 1.062652587890625, + "learning_rate": 3.859896265818261e-06, + "loss": 0.6182, + "step": 6564 + }, + { + "epoch": 1.9218384074941453, + "grad_norm": 0.939649760723114, + "learning_rate": 3.859573128921375e-06, + "loss": 0.6062, + "step": 6565 + }, + { + "epoch": 1.9221311475409837, + "grad_norm": 0.9607481956481934, + "learning_rate": 3.859249959768354e-06, + "loss": 0.6113, + "step": 6566 + }, + { + "epoch": 1.9224238875878221, + "grad_norm": 0.9590779542922974, + "learning_rate": 3.858926758366866e-06, + "loss": 0.6008, + "step": 6567 + }, + { + "epoch": 1.9227166276346606, + "grad_norm": 0.9903432130813599, + "learning_rate": 3.858603524724581e-06, + "loss": 0.6078, + "step": 6568 + }, + { + "epoch": 1.923009367681499, + "grad_norm": 0.9310775995254517, + "learning_rate": 3.858280258849166e-06, + "loss": 0.6012, + "step": 6569 + }, + { + "epoch": 1.9233021077283372, + "grad_norm": 0.966063380241394, + "learning_rate": 3.85795696074829e-06, + "loss": 0.5924, + "step": 6570 + }, + { + "epoch": 1.9235948477751756, + "grad_norm": 0.9571933746337891, + "learning_rate": 3.857633630429626e-06, + "loss": 0.5415, + "step": 6571 + }, + { + "epoch": 1.923887587822014, + "grad_norm": 0.9714875817298889, + "learning_rate": 3.857310267900841e-06, + "loss": 0.6356, + "step": 6572 + }, + { + "epoch": 1.9241803278688525, + "grad_norm": 0.9396136999130249, + "learning_rate": 3.8569868731696105e-06, + "loss": 0.5897, + "step": 6573 + }, + { + "epoch": 1.9244730679156907, + "grad_norm": 0.9546481370925903, + "learning_rate": 3.856663446243606e-06, + "loss": 0.5881, + "step": 6574 + }, + { + "epoch": 1.9247658079625292, + "grad_norm": 0.939493715763092, + "learning_rate": 3.8563399871305e-06, + "loss": 0.5988, + "step": 6575 + }, + { + "epoch": 1.9250585480093676, + "grad_norm": 0.9000023603439331, + "learning_rate": 3.856016495837967e-06, + "loss": 0.5757, + "step": 6576 + }, + { + "epoch": 1.925351288056206, + "grad_norm": 0.9985439777374268, + "learning_rate": 3.855692972373683e-06, + "loss": 0.6476, + "step": 6577 + }, + { + "epoch": 1.9256440281030445, + "grad_norm": 0.9916278719902039, + "learning_rate": 3.855369416745322e-06, + "loss": 0.5901, + "step": 6578 + }, + { + "epoch": 1.925936768149883, + "grad_norm": 0.8916949033737183, + "learning_rate": 3.855045828960561e-06, + "loss": 0.59, + "step": 6579 + }, + { + "epoch": 1.9262295081967213, + "grad_norm": 0.9138490557670593, + "learning_rate": 3.854722209027078e-06, + "loss": 0.6262, + "step": 6580 + }, + { + "epoch": 1.9265222482435598, + "grad_norm": 0.9745141863822937, + "learning_rate": 3.85439855695255e-06, + "loss": 0.5832, + "step": 6581 + }, + { + "epoch": 1.9268149882903982, + "grad_norm": 0.9504146575927734, + "learning_rate": 3.854074872744657e-06, + "loss": 0.6121, + "step": 6582 + }, + { + "epoch": 1.9271077283372366, + "grad_norm": 0.9356542825698853, + "learning_rate": 3.853751156411075e-06, + "loss": 0.5834, + "step": 6583 + }, + { + "epoch": 1.927400468384075, + "grad_norm": 0.9464778304100037, + "learning_rate": 3.8534274079594895e-06, + "loss": 0.615, + "step": 6584 + }, + { + "epoch": 1.9276932084309133, + "grad_norm": 0.9888817071914673, + "learning_rate": 3.853103627397577e-06, + "loss": 0.5762, + "step": 6585 + }, + { + "epoch": 1.9279859484777517, + "grad_norm": 0.9530279040336609, + "learning_rate": 3.852779814733022e-06, + "loss": 0.5606, + "step": 6586 + }, + { + "epoch": 1.9282786885245902, + "grad_norm": 0.9847114682197571, + "learning_rate": 3.852455969973505e-06, + "loss": 0.5932, + "step": 6587 + }, + { + "epoch": 1.9285714285714286, + "grad_norm": 0.9316561222076416, + "learning_rate": 3.852132093126711e-06, + "loss": 0.6066, + "step": 6588 + }, + { + "epoch": 1.9288641686182668, + "grad_norm": 1.0307631492614746, + "learning_rate": 3.8518081842003235e-06, + "loss": 0.6056, + "step": 6589 + }, + { + "epoch": 1.9291569086651053, + "grad_norm": 0.9989437460899353, + "learning_rate": 3.8514842432020275e-06, + "loss": 0.6606, + "step": 6590 + }, + { + "epoch": 1.9294496487119437, + "grad_norm": 1.0019896030426025, + "learning_rate": 3.851160270139507e-06, + "loss": 0.6193, + "step": 6591 + }, + { + "epoch": 1.9297423887587821, + "grad_norm": 0.9755485653877258, + "learning_rate": 3.850836265020449e-06, + "loss": 0.6247, + "step": 6592 + }, + { + "epoch": 1.9300351288056206, + "grad_norm": 0.9618550539016724, + "learning_rate": 3.850512227852542e-06, + "loss": 0.5818, + "step": 6593 + }, + { + "epoch": 1.930327868852459, + "grad_norm": 0.9573217630386353, + "learning_rate": 3.850188158643473e-06, + "loss": 0.6236, + "step": 6594 + }, + { + "epoch": 1.9306206088992974, + "grad_norm": 0.9506344795227051, + "learning_rate": 3.849864057400931e-06, + "loss": 0.6053, + "step": 6595 + }, + { + "epoch": 1.9309133489461359, + "grad_norm": 0.9943659901618958, + "learning_rate": 3.849539924132604e-06, + "loss": 0.6086, + "step": 6596 + }, + { + "epoch": 1.9312060889929743, + "grad_norm": 0.9553675055503845, + "learning_rate": 3.8492157588461835e-06, + "loss": 0.6036, + "step": 6597 + }, + { + "epoch": 1.9314988290398127, + "grad_norm": 0.9806647896766663, + "learning_rate": 3.848891561549361e-06, + "loss": 0.5772, + "step": 6598 + }, + { + "epoch": 1.9317915690866512, + "grad_norm": 0.9607681632041931, + "learning_rate": 3.848567332249825e-06, + "loss": 0.6073, + "step": 6599 + }, + { + "epoch": 1.9320843091334896, + "grad_norm": 0.9954617619514465, + "learning_rate": 3.848243070955271e-06, + "loss": 0.5895, + "step": 6600 + }, + { + "epoch": 1.9323770491803278, + "grad_norm": 0.9364554286003113, + "learning_rate": 3.847918777673391e-06, + "loss": 0.5953, + "step": 6601 + }, + { + "epoch": 1.9326697892271663, + "grad_norm": 0.9361100792884827, + "learning_rate": 3.847594452411879e-06, + "loss": 0.6064, + "step": 6602 + }, + { + "epoch": 1.9329625292740047, + "grad_norm": 0.9751470685005188, + "learning_rate": 3.84727009517843e-06, + "loss": 0.6072, + "step": 6603 + }, + { + "epoch": 1.9332552693208431, + "grad_norm": 0.907489538192749, + "learning_rate": 3.8469457059807384e-06, + "loss": 0.5362, + "step": 6604 + }, + { + "epoch": 1.9335480093676813, + "grad_norm": 0.972011148929596, + "learning_rate": 3.846621284826502e-06, + "loss": 0.654, + "step": 6605 + }, + { + "epoch": 1.9338407494145198, + "grad_norm": 0.9974213242530823, + "learning_rate": 3.846296831723416e-06, + "loss": 0.6141, + "step": 6606 + }, + { + "epoch": 1.9341334894613582, + "grad_norm": 0.8791571259498596, + "learning_rate": 3.84597234667918e-06, + "loss": 0.5336, + "step": 6607 + }, + { + "epoch": 1.9344262295081966, + "grad_norm": 0.9479790329933167, + "learning_rate": 3.845647829701491e-06, + "loss": 0.6127, + "step": 6608 + }, + { + "epoch": 1.934718969555035, + "grad_norm": 0.8975178003311157, + "learning_rate": 3.845323280798048e-06, + "loss": 0.6051, + "step": 6609 + }, + { + "epoch": 1.9350117096018735, + "grad_norm": 0.9648395776748657, + "learning_rate": 3.844998699976553e-06, + "loss": 0.5857, + "step": 6610 + }, + { + "epoch": 1.935304449648712, + "grad_norm": 1.047003149986267, + "learning_rate": 3.844674087244705e-06, + "loss": 0.5475, + "step": 6611 + }, + { + "epoch": 1.9355971896955504, + "grad_norm": 0.9819777607917786, + "learning_rate": 3.844349442610206e-06, + "loss": 0.6125, + "step": 6612 + }, + { + "epoch": 1.9358899297423888, + "grad_norm": 0.9381083846092224, + "learning_rate": 3.844024766080757e-06, + "loss": 0.61, + "step": 6613 + }, + { + "epoch": 1.9361826697892273, + "grad_norm": 0.9669232368469238, + "learning_rate": 3.8437000576640635e-06, + "loss": 0.5835, + "step": 6614 + }, + { + "epoch": 1.9364754098360657, + "grad_norm": 0.9850247502326965, + "learning_rate": 3.843375317367828e-06, + "loss": 0.5897, + "step": 6615 + }, + { + "epoch": 1.9367681498829041, + "grad_norm": 1.0039316415786743, + "learning_rate": 3.843050545199754e-06, + "loss": 0.6205, + "step": 6616 + }, + { + "epoch": 1.9370608899297423, + "grad_norm": 0.9707591533660889, + "learning_rate": 3.842725741167549e-06, + "loss": 0.6338, + "step": 6617 + }, + { + "epoch": 1.9373536299765808, + "grad_norm": 0.9689719080924988, + "learning_rate": 3.842400905278917e-06, + "loss": 0.5966, + "step": 6618 + }, + { + "epoch": 1.9376463700234192, + "grad_norm": 0.9532831907272339, + "learning_rate": 3.842076037541567e-06, + "loss": 0.5978, + "step": 6619 + }, + { + "epoch": 1.9379391100702577, + "grad_norm": 0.9490357637405396, + "learning_rate": 3.841751137963203e-06, + "loss": 0.6358, + "step": 6620 + }, + { + "epoch": 1.9382318501170959, + "grad_norm": 1.0033235549926758, + "learning_rate": 3.841426206551538e-06, + "loss": 0.6009, + "step": 6621 + }, + { + "epoch": 1.9385245901639343, + "grad_norm": 0.9520042538642883, + "learning_rate": 3.841101243314275e-06, + "loss": 0.5892, + "step": 6622 + }, + { + "epoch": 1.9388173302107727, + "grad_norm": 0.9658498167991638, + "learning_rate": 3.84077624825913e-06, + "loss": 0.5732, + "step": 6623 + }, + { + "epoch": 1.9391100702576112, + "grad_norm": 1.0017226934432983, + "learning_rate": 3.840451221393811e-06, + "loss": 0.6244, + "step": 6624 + }, + { + "epoch": 1.9394028103044496, + "grad_norm": 0.9568636417388916, + "learning_rate": 3.840126162726029e-06, + "loss": 0.5931, + "step": 6625 + }, + { + "epoch": 1.939695550351288, + "grad_norm": 0.9724414944648743, + "learning_rate": 3.8398010722634956e-06, + "loss": 0.6142, + "step": 6626 + }, + { + "epoch": 1.9399882903981265, + "grad_norm": 1.031407356262207, + "learning_rate": 3.839475950013925e-06, + "loss": 0.6335, + "step": 6627 + }, + { + "epoch": 1.940281030444965, + "grad_norm": 0.9804384708404541, + "learning_rate": 3.83915079598503e-06, + "loss": 0.6223, + "step": 6628 + }, + { + "epoch": 1.9405737704918034, + "grad_norm": 0.9886770844459534, + "learning_rate": 3.8388256101845235e-06, + "loss": 0.6001, + "step": 6629 + }, + { + "epoch": 1.9408665105386418, + "grad_norm": 0.981016993522644, + "learning_rate": 3.838500392620124e-06, + "loss": 0.594, + "step": 6630 + }, + { + "epoch": 1.9411592505854802, + "grad_norm": 0.9888495802879333, + "learning_rate": 3.838175143299545e-06, + "loss": 0.636, + "step": 6631 + }, + { + "epoch": 1.9414519906323187, + "grad_norm": 0.9251469373703003, + "learning_rate": 3.837849862230504e-06, + "loss": 0.6088, + "step": 6632 + }, + { + "epoch": 1.9417447306791569, + "grad_norm": 0.9504233002662659, + "learning_rate": 3.837524549420717e-06, + "loss": 0.5957, + "step": 6633 + }, + { + "epoch": 1.9420374707259953, + "grad_norm": 0.9549548625946045, + "learning_rate": 3.837199204877905e-06, + "loss": 0.5808, + "step": 6634 + }, + { + "epoch": 1.9423302107728337, + "grad_norm": 0.9707643985748291, + "learning_rate": 3.836873828609783e-06, + "loss": 0.6164, + "step": 6635 + }, + { + "epoch": 1.9426229508196722, + "grad_norm": 0.8690659999847412, + "learning_rate": 3.836548420624075e-06, + "loss": 0.5422, + "step": 6636 + }, + { + "epoch": 1.9429156908665104, + "grad_norm": 0.9372094869613647, + "learning_rate": 3.836222980928496e-06, + "loss": 0.565, + "step": 6637 + }, + { + "epoch": 1.9432084309133488, + "grad_norm": 0.9738519787788391, + "learning_rate": 3.835897509530771e-06, + "loss": 0.5987, + "step": 6638 + }, + { + "epoch": 1.9435011709601873, + "grad_norm": 1.1308281421661377, + "learning_rate": 3.835572006438621e-06, + "loss": 0.6098, + "step": 6639 + }, + { + "epoch": 1.9437939110070257, + "grad_norm": 1.0141764879226685, + "learning_rate": 3.83524647165977e-06, + "loss": 0.5879, + "step": 6640 + }, + { + "epoch": 1.9440866510538641, + "grad_norm": 0.9700433015823364, + "learning_rate": 3.834920905201939e-06, + "loss": 0.6117, + "step": 6641 + }, + { + "epoch": 1.9443793911007026, + "grad_norm": 0.9291895031929016, + "learning_rate": 3.834595307072852e-06, + "loss": 0.5849, + "step": 6642 + }, + { + "epoch": 1.944672131147541, + "grad_norm": 0.9631633758544922, + "learning_rate": 3.834269677280236e-06, + "loss": 0.612, + "step": 6643 + }, + { + "epoch": 1.9449648711943794, + "grad_norm": 0.9543134570121765, + "learning_rate": 3.833944015831815e-06, + "loss": 0.6364, + "step": 6644 + }, + { + "epoch": 1.9452576112412179, + "grad_norm": 0.9577897191047668, + "learning_rate": 3.833618322735317e-06, + "loss": 0.5829, + "step": 6645 + }, + { + "epoch": 1.9455503512880563, + "grad_norm": 1.0061607360839844, + "learning_rate": 3.833292597998467e-06, + "loss": 0.6163, + "step": 6646 + }, + { + "epoch": 1.9458430913348947, + "grad_norm": 0.9101279377937317, + "learning_rate": 3.832966841628995e-06, + "loss": 0.5933, + "step": 6647 + }, + { + "epoch": 1.9461358313817332, + "grad_norm": 0.9289489984512329, + "learning_rate": 3.832641053634627e-06, + "loss": 0.623, + "step": 6648 + }, + { + "epoch": 1.9464285714285714, + "grad_norm": 0.9010082483291626, + "learning_rate": 3.832315234023095e-06, + "loss": 0.5837, + "step": 6649 + }, + { + "epoch": 1.9467213114754098, + "grad_norm": 0.9465002417564392, + "learning_rate": 3.831989382802128e-06, + "loss": 0.6062, + "step": 6650 + }, + { + "epoch": 1.9470140515222483, + "grad_norm": 0.9288009405136108, + "learning_rate": 3.831663499979457e-06, + "loss": 0.5889, + "step": 6651 + }, + { + "epoch": 1.9473067915690867, + "grad_norm": 0.9510943293571472, + "learning_rate": 3.831337585562813e-06, + "loss": 0.5844, + "step": 6652 + }, + { + "epoch": 1.947599531615925, + "grad_norm": 0.9921800494194031, + "learning_rate": 3.831011639559931e-06, + "loss": 0.6308, + "step": 6653 + }, + { + "epoch": 1.9478922716627634, + "grad_norm": 0.9944632053375244, + "learning_rate": 3.830685661978541e-06, + "loss": 0.6049, + "step": 6654 + }, + { + "epoch": 1.9481850117096018, + "grad_norm": 0.9637234210968018, + "learning_rate": 3.830359652826378e-06, + "loss": 0.5927, + "step": 6655 + }, + { + "epoch": 1.9484777517564402, + "grad_norm": 1.0206788778305054, + "learning_rate": 3.830033612111176e-06, + "loss": 0.6585, + "step": 6656 + }, + { + "epoch": 1.9487704918032787, + "grad_norm": 0.9334465861320496, + "learning_rate": 3.829707539840671e-06, + "loss": 0.6039, + "step": 6657 + }, + { + "epoch": 1.949063231850117, + "grad_norm": 0.9660984873771667, + "learning_rate": 3.829381436022599e-06, + "loss": 0.6249, + "step": 6658 + }, + { + "epoch": 1.9493559718969555, + "grad_norm": 0.9616366624832153, + "learning_rate": 3.829055300664698e-06, + "loss": 0.6232, + "step": 6659 + }, + { + "epoch": 1.949648711943794, + "grad_norm": 0.9714418649673462, + "learning_rate": 3.828729133774705e-06, + "loss": 0.6184, + "step": 6660 + }, + { + "epoch": 1.9499414519906324, + "grad_norm": 0.9358531832695007, + "learning_rate": 3.828402935360357e-06, + "loss": 0.6231, + "step": 6661 + }, + { + "epoch": 1.9502341920374708, + "grad_norm": 0.9381201267242432, + "learning_rate": 3.8280767054293935e-06, + "loss": 0.6059, + "step": 6662 + }, + { + "epoch": 1.9505269320843093, + "grad_norm": 0.9805985689163208, + "learning_rate": 3.827750443989556e-06, + "loss": 0.6296, + "step": 6663 + }, + { + "epoch": 1.9508196721311475, + "grad_norm": 0.962887167930603, + "learning_rate": 3.827424151048584e-06, + "loss": 0.6126, + "step": 6664 + }, + { + "epoch": 1.951112412177986, + "grad_norm": 0.9203393459320068, + "learning_rate": 3.82709782661422e-06, + "loss": 0.6081, + "step": 6665 + }, + { + "epoch": 1.9514051522248244, + "grad_norm": 0.953790009021759, + "learning_rate": 3.826771470694204e-06, + "loss": 0.5933, + "step": 6666 + }, + { + "epoch": 1.9516978922716628, + "grad_norm": 0.9543322324752808, + "learning_rate": 3.82644508329628e-06, + "loss": 0.5729, + "step": 6667 + }, + { + "epoch": 1.951990632318501, + "grad_norm": 0.9038568735122681, + "learning_rate": 3.826118664428191e-06, + "loss": 0.5604, + "step": 6668 + }, + { + "epoch": 1.9522833723653394, + "grad_norm": 0.985527753829956, + "learning_rate": 3.825792214097683e-06, + "loss": 0.6166, + "step": 6669 + }, + { + "epoch": 1.9525761124121779, + "grad_norm": 0.9273754358291626, + "learning_rate": 3.825465732312499e-06, + "loss": 0.6081, + "step": 6670 + }, + { + "epoch": 1.9528688524590163, + "grad_norm": 0.99017733335495, + "learning_rate": 3.825139219080387e-06, + "loss": 0.6374, + "step": 6671 + }, + { + "epoch": 1.9531615925058547, + "grad_norm": 0.8830037117004395, + "learning_rate": 3.824812674409092e-06, + "loss": 0.5644, + "step": 6672 + }, + { + "epoch": 1.9534543325526932, + "grad_norm": 0.9574135541915894, + "learning_rate": 3.824486098306361e-06, + "loss": 0.6183, + "step": 6673 + }, + { + "epoch": 1.9537470725995316, + "grad_norm": 0.9841493368148804, + "learning_rate": 3.824159490779944e-06, + "loss": 0.6537, + "step": 6674 + }, + { + "epoch": 1.95403981264637, + "grad_norm": 0.9112286567687988, + "learning_rate": 3.823832851837588e-06, + "loss": 0.5731, + "step": 6675 + }, + { + "epoch": 1.9543325526932085, + "grad_norm": 0.9686096906661987, + "learning_rate": 3.823506181487044e-06, + "loss": 0.6004, + "step": 6676 + }, + { + "epoch": 1.954625292740047, + "grad_norm": 0.954721987247467, + "learning_rate": 3.823179479736061e-06, + "loss": 0.6075, + "step": 6677 + }, + { + "epoch": 1.9549180327868854, + "grad_norm": 0.9458422660827637, + "learning_rate": 3.82285274659239e-06, + "loss": 0.5581, + "step": 6678 + }, + { + "epoch": 1.9552107728337238, + "grad_norm": 0.9576085209846497, + "learning_rate": 3.822525982063784e-06, + "loss": 0.6006, + "step": 6679 + }, + { + "epoch": 1.955503512880562, + "grad_norm": 0.948855996131897, + "learning_rate": 3.822199186157995e-06, + "loss": 0.5898, + "step": 6680 + }, + { + "epoch": 1.9557962529274004, + "grad_norm": 0.9781641364097595, + "learning_rate": 3.8218723588827766e-06, + "loss": 0.59, + "step": 6681 + }, + { + "epoch": 1.9560889929742389, + "grad_norm": 0.9753437042236328, + "learning_rate": 3.821545500245883e-06, + "loss": 0.5787, + "step": 6682 + }, + { + "epoch": 1.9563817330210773, + "grad_norm": 0.9842491745948792, + "learning_rate": 3.821218610255067e-06, + "loss": 0.5655, + "step": 6683 + }, + { + "epoch": 1.9566744730679155, + "grad_norm": 0.9241059422492981, + "learning_rate": 3.820891688918087e-06, + "loss": 0.5894, + "step": 6684 + }, + { + "epoch": 1.956967213114754, + "grad_norm": 0.9701085686683655, + "learning_rate": 3.820564736242696e-06, + "loss": 0.5763, + "step": 6685 + }, + { + "epoch": 1.9572599531615924, + "grad_norm": 0.9580166339874268, + "learning_rate": 3.820237752236655e-06, + "loss": 0.5772, + "step": 6686 + }, + { + "epoch": 1.9575526932084308, + "grad_norm": 0.9328097701072693, + "learning_rate": 3.819910736907719e-06, + "loss": 0.6119, + "step": 6687 + }, + { + "epoch": 1.9578454332552693, + "grad_norm": 0.9461207389831543, + "learning_rate": 3.819583690263647e-06, + "loss": 0.5974, + "step": 6688 + }, + { + "epoch": 1.9581381733021077, + "grad_norm": 0.9783435463905334, + "learning_rate": 3.819256612312199e-06, + "loss": 0.6191, + "step": 6689 + }, + { + "epoch": 1.9584309133489461, + "grad_norm": 0.9300854802131653, + "learning_rate": 3.818929503061134e-06, + "loss": 0.622, + "step": 6690 + }, + { + "epoch": 1.9587236533957846, + "grad_norm": 1.022284746170044, + "learning_rate": 3.818602362518213e-06, + "loss": 0.617, + "step": 6691 + }, + { + "epoch": 1.959016393442623, + "grad_norm": 0.957493245601654, + "learning_rate": 3.818275190691198e-06, + "loss": 0.6014, + "step": 6692 + }, + { + "epoch": 1.9593091334894615, + "grad_norm": 0.9257583022117615, + "learning_rate": 3.81794798758785e-06, + "loss": 0.6073, + "step": 6693 + }, + { + "epoch": 1.9596018735362999, + "grad_norm": 0.9487441182136536, + "learning_rate": 3.817620753215934e-06, + "loss": 0.5837, + "step": 6694 + }, + { + "epoch": 1.9598946135831383, + "grad_norm": 0.9541623592376709, + "learning_rate": 3.8172934875832115e-06, + "loss": 0.6328, + "step": 6695 + }, + { + "epoch": 1.9601873536299765, + "grad_norm": 0.9824578166007996, + "learning_rate": 3.816966190697449e-06, + "loss": 0.6348, + "step": 6696 + }, + { + "epoch": 1.960480093676815, + "grad_norm": 0.9518122673034668, + "learning_rate": 3.816638862566409e-06, + "loss": 0.5996, + "step": 6697 + }, + { + "epoch": 1.9607728337236534, + "grad_norm": 0.9369275569915771, + "learning_rate": 3.81631150319786e-06, + "loss": 0.6194, + "step": 6698 + }, + { + "epoch": 1.9610655737704918, + "grad_norm": 0.9362019896507263, + "learning_rate": 3.8159841125995675e-06, + "loss": 0.6239, + "step": 6699 + }, + { + "epoch": 1.96135831381733, + "grad_norm": 0.9177207946777344, + "learning_rate": 3.815656690779299e-06, + "loss": 0.6128, + "step": 6700 + }, + { + "epoch": 1.9616510538641685, + "grad_norm": 0.9817939400672913, + "learning_rate": 3.815329237744824e-06, + "loss": 0.5955, + "step": 6701 + }, + { + "epoch": 1.961943793911007, + "grad_norm": 0.92729252576828, + "learning_rate": 3.815001753503908e-06, + "loss": 0.6027, + "step": 6702 + }, + { + "epoch": 1.9622365339578454, + "grad_norm": 0.922949492931366, + "learning_rate": 3.814674238064324e-06, + "loss": 0.6255, + "step": 6703 + }, + { + "epoch": 1.9625292740046838, + "grad_norm": 0.9371756315231323, + "learning_rate": 3.814346691433841e-06, + "loss": 0.6184, + "step": 6704 + }, + { + "epoch": 1.9628220140515222, + "grad_norm": 0.9655270576477051, + "learning_rate": 3.8140191136202306e-06, + "loss": 0.632, + "step": 6705 + }, + { + "epoch": 1.9631147540983607, + "grad_norm": 0.933376669883728, + "learning_rate": 3.8136915046312633e-06, + "loss": 0.5942, + "step": 6706 + }, + { + "epoch": 1.963407494145199, + "grad_norm": 0.9637324213981628, + "learning_rate": 3.813363864474714e-06, + "loss": 0.5907, + "step": 6707 + }, + { + "epoch": 1.9637002341920375, + "grad_norm": 0.8963561058044434, + "learning_rate": 3.8130361931583537e-06, + "loss": 0.5666, + "step": 6708 + }, + { + "epoch": 1.963992974238876, + "grad_norm": 0.9373932480812073, + "learning_rate": 3.8127084906899575e-06, + "loss": 0.6014, + "step": 6709 + }, + { + "epoch": 1.9642857142857144, + "grad_norm": 0.9989519119262695, + "learning_rate": 3.8123807570773e-06, + "loss": 0.6357, + "step": 6710 + }, + { + "epoch": 1.9645784543325528, + "grad_norm": 0.963596761226654, + "learning_rate": 3.812052992328158e-06, + "loss": 0.6087, + "step": 6711 + }, + { + "epoch": 1.964871194379391, + "grad_norm": 0.9195610284805298, + "learning_rate": 3.8117251964503054e-06, + "loss": 0.5894, + "step": 6712 + }, + { + "epoch": 1.9651639344262295, + "grad_norm": 0.9827913045883179, + "learning_rate": 3.8113973694515212e-06, + "loss": 0.6286, + "step": 6713 + }, + { + "epoch": 1.965456674473068, + "grad_norm": 0.924126923084259, + "learning_rate": 3.8110695113395826e-06, + "loss": 0.6374, + "step": 6714 + }, + { + "epoch": 1.9657494145199064, + "grad_norm": 1.0424431562423706, + "learning_rate": 3.810741622122267e-06, + "loss": 0.5463, + "step": 6715 + }, + { + "epoch": 1.9660421545667446, + "grad_norm": 1.0025393962860107, + "learning_rate": 3.8104137018073554e-06, + "loss": 0.5802, + "step": 6716 + }, + { + "epoch": 1.966334894613583, + "grad_norm": 0.9391179084777832, + "learning_rate": 3.810085750402627e-06, + "loss": 0.5964, + "step": 6717 + }, + { + "epoch": 1.9666276346604215, + "grad_norm": 1.2963989973068237, + "learning_rate": 3.8097577679158613e-06, + "loss": 0.5971, + "step": 6718 + }, + { + "epoch": 1.9669203747072599, + "grad_norm": 0.9566720724105835, + "learning_rate": 3.809429754354842e-06, + "loss": 0.5616, + "step": 6719 + }, + { + "epoch": 1.9672131147540983, + "grad_norm": 1.0152833461761475, + "learning_rate": 3.8091017097273504e-06, + "loss": 0.6103, + "step": 6720 + }, + { + "epoch": 1.9675058548009368, + "grad_norm": 0.9566547274589539, + "learning_rate": 3.8087736340411698e-06, + "loss": 0.5735, + "step": 6721 + }, + { + "epoch": 1.9677985948477752, + "grad_norm": 0.9423879981040955, + "learning_rate": 3.808445527304082e-06, + "loss": 0.6167, + "step": 6722 + }, + { + "epoch": 1.9680913348946136, + "grad_norm": 0.957163393497467, + "learning_rate": 3.808117389523873e-06, + "loss": 0.5778, + "step": 6723 + }, + { + "epoch": 1.968384074941452, + "grad_norm": 0.9833031296730042, + "learning_rate": 3.807789220708328e-06, + "loss": 0.6084, + "step": 6724 + }, + { + "epoch": 1.9686768149882905, + "grad_norm": 0.9496918320655823, + "learning_rate": 3.807461020865232e-06, + "loss": 0.6006, + "step": 6725 + }, + { + "epoch": 1.968969555035129, + "grad_norm": 0.977262556552887, + "learning_rate": 3.8071327900023726e-06, + "loss": 0.6125, + "step": 6726 + }, + { + "epoch": 1.9692622950819674, + "grad_norm": 0.9527190327644348, + "learning_rate": 3.806804528127536e-06, + "loss": 0.5683, + "step": 6727 + }, + { + "epoch": 1.9695550351288056, + "grad_norm": 1.0398495197296143, + "learning_rate": 3.806476235248511e-06, + "loss": 0.6516, + "step": 6728 + }, + { + "epoch": 1.969847775175644, + "grad_norm": 0.962001621723175, + "learning_rate": 3.8061479113730872e-06, + "loss": 0.6445, + "step": 6729 + }, + { + "epoch": 1.9701405152224825, + "grad_norm": 0.9587441086769104, + "learning_rate": 3.805819556509052e-06, + "loss": 0.6354, + "step": 6730 + }, + { + "epoch": 1.970433255269321, + "grad_norm": 0.9762184023857117, + "learning_rate": 3.8054911706641977e-06, + "loss": 0.6017, + "step": 6731 + }, + { + "epoch": 1.970725995316159, + "grad_norm": 0.9018144011497498, + "learning_rate": 3.805162753846314e-06, + "loss": 0.562, + "step": 6732 + }, + { + "epoch": 1.9710187353629975, + "grad_norm": 1.0031377077102661, + "learning_rate": 3.8048343060631943e-06, + "loss": 0.5853, + "step": 6733 + }, + { + "epoch": 1.971311475409836, + "grad_norm": 0.9138507843017578, + "learning_rate": 3.8045058273226287e-06, + "loss": 0.6234, + "step": 6734 + }, + { + "epoch": 1.9716042154566744, + "grad_norm": 0.8895012140274048, + "learning_rate": 3.8041773176324127e-06, + "loss": 0.5513, + "step": 6735 + }, + { + "epoch": 1.9718969555035128, + "grad_norm": 0.9276679754257202, + "learning_rate": 3.803848777000339e-06, + "loss": 0.6168, + "step": 6736 + }, + { + "epoch": 1.9721896955503513, + "grad_norm": 0.9932347536087036, + "learning_rate": 3.8035202054342022e-06, + "loss": 0.6029, + "step": 6737 + }, + { + "epoch": 1.9724824355971897, + "grad_norm": 0.9526447653770447, + "learning_rate": 3.8031916029417987e-06, + "loss": 0.5935, + "step": 6738 + }, + { + "epoch": 1.9727751756440282, + "grad_norm": 0.9265145659446716, + "learning_rate": 3.802862969530924e-06, + "loss": 0.6007, + "step": 6739 + }, + { + "epoch": 1.9730679156908666, + "grad_norm": 0.9439244866371155, + "learning_rate": 3.8025343052093745e-06, + "loss": 0.588, + "step": 6740 + }, + { + "epoch": 1.973360655737705, + "grad_norm": 0.9499882459640503, + "learning_rate": 3.8022056099849487e-06, + "loss": 0.6298, + "step": 6741 + }, + { + "epoch": 1.9736533957845435, + "grad_norm": 0.9849883913993835, + "learning_rate": 3.801876883865444e-06, + "loss": 0.575, + "step": 6742 + }, + { + "epoch": 1.973946135831382, + "grad_norm": 0.9401459693908691, + "learning_rate": 3.8015481268586617e-06, + "loss": 0.595, + "step": 6743 + }, + { + "epoch": 1.9742388758782201, + "grad_norm": 0.9600568413734436, + "learning_rate": 3.8012193389723982e-06, + "loss": 0.5533, + "step": 6744 + }, + { + "epoch": 1.9745316159250585, + "grad_norm": 0.9949639439582825, + "learning_rate": 3.8008905202144575e-06, + "loss": 0.5969, + "step": 6745 + }, + { + "epoch": 1.974824355971897, + "grad_norm": 0.9895834922790527, + "learning_rate": 3.8005616705926386e-06, + "loss": 0.5949, + "step": 6746 + }, + { + "epoch": 1.9751170960187352, + "grad_norm": 0.9791011214256287, + "learning_rate": 3.800232790114744e-06, + "loss": 0.5865, + "step": 6747 + }, + { + "epoch": 1.9754098360655736, + "grad_norm": 0.9616986513137817, + "learning_rate": 3.7999038787885768e-06, + "loss": 0.5799, + "step": 6748 + }, + { + "epoch": 1.975702576112412, + "grad_norm": 1.0036545991897583, + "learning_rate": 3.7995749366219396e-06, + "loss": 0.6115, + "step": 6749 + }, + { + "epoch": 1.9759953161592505, + "grad_norm": 1.000143051147461, + "learning_rate": 3.799245963622638e-06, + "loss": 0.561, + "step": 6750 + }, + { + "epoch": 1.976288056206089, + "grad_norm": 0.9708810448646545, + "learning_rate": 3.7989169597984763e-06, + "loss": 0.6072, + "step": 6751 + }, + { + "epoch": 1.9765807962529274, + "grad_norm": 0.9343686103820801, + "learning_rate": 3.7985879251572606e-06, + "loss": 0.6074, + "step": 6752 + }, + { + "epoch": 1.9768735362997658, + "grad_norm": 0.9833536148071289, + "learning_rate": 3.798258859706796e-06, + "loss": 0.5896, + "step": 6753 + }, + { + "epoch": 1.9771662763466042, + "grad_norm": 0.9643739461898804, + "learning_rate": 3.797929763454891e-06, + "loss": 0.6036, + "step": 6754 + }, + { + "epoch": 1.9774590163934427, + "grad_norm": 0.9888731241226196, + "learning_rate": 3.7976006364093533e-06, + "loss": 0.5753, + "step": 6755 + }, + { + "epoch": 1.9777517564402811, + "grad_norm": 0.9729430079460144, + "learning_rate": 3.797271478577991e-06, + "loss": 0.5911, + "step": 6756 + }, + { + "epoch": 1.9780444964871196, + "grad_norm": 1.3398776054382324, + "learning_rate": 3.7969422899686135e-06, + "loss": 0.6082, + "step": 6757 + }, + { + "epoch": 1.978337236533958, + "grad_norm": 0.9512901306152344, + "learning_rate": 3.7966130705890313e-06, + "loss": 0.6314, + "step": 6758 + }, + { + "epoch": 1.9786299765807962, + "grad_norm": 0.9795243144035339, + "learning_rate": 3.796283820447055e-06, + "loss": 0.6098, + "step": 6759 + }, + { + "epoch": 1.9789227166276346, + "grad_norm": 1.0182825326919556, + "learning_rate": 3.795954539550495e-06, + "loss": 0.5893, + "step": 6760 + }, + { + "epoch": 1.979215456674473, + "grad_norm": 0.9228180050849915, + "learning_rate": 3.7956252279071654e-06, + "loss": 0.5983, + "step": 6761 + }, + { + "epoch": 1.9795081967213115, + "grad_norm": 0.9588137269020081, + "learning_rate": 3.795295885524879e-06, + "loss": 0.5999, + "step": 6762 + }, + { + "epoch": 1.9798009367681497, + "grad_norm": 0.9527151584625244, + "learning_rate": 3.7949665124114477e-06, + "loss": 0.5819, + "step": 6763 + }, + { + "epoch": 1.9800936768149882, + "grad_norm": 0.9826493263244629, + "learning_rate": 3.794637108574687e-06, + "loss": 0.5903, + "step": 6764 + }, + { + "epoch": 1.9803864168618266, + "grad_norm": 0.9373260140419006, + "learning_rate": 3.794307674022413e-06, + "loss": 0.6048, + "step": 6765 + }, + { + "epoch": 1.980679156908665, + "grad_norm": 0.9488074779510498, + "learning_rate": 3.79397820876244e-06, + "loss": 0.5624, + "step": 6766 + }, + { + "epoch": 1.9809718969555035, + "grad_norm": 0.9970247745513916, + "learning_rate": 3.7936487128025862e-06, + "loss": 0.6114, + "step": 6767 + }, + { + "epoch": 1.981264637002342, + "grad_norm": 1.0159178972244263, + "learning_rate": 3.7933191861506674e-06, + "loss": 0.6452, + "step": 6768 + }, + { + "epoch": 1.9815573770491803, + "grad_norm": 0.9305038452148438, + "learning_rate": 3.792989628814503e-06, + "loss": 0.5837, + "step": 6769 + }, + { + "epoch": 1.9818501170960188, + "grad_norm": 0.9993329644203186, + "learning_rate": 3.792660040801911e-06, + "loss": 0.6111, + "step": 6770 + }, + { + "epoch": 1.9821428571428572, + "grad_norm": 0.9610140323638916, + "learning_rate": 3.792330422120711e-06, + "loss": 0.5824, + "step": 6771 + }, + { + "epoch": 1.9824355971896956, + "grad_norm": 0.9631814360618591, + "learning_rate": 3.7920007727787232e-06, + "loss": 0.5887, + "step": 6772 + }, + { + "epoch": 1.982728337236534, + "grad_norm": 0.9246073365211487, + "learning_rate": 3.7916710927837695e-06, + "loss": 0.5817, + "step": 6773 + }, + { + "epoch": 1.9830210772833725, + "grad_norm": 0.9496446251869202, + "learning_rate": 3.7913413821436708e-06, + "loss": 0.6239, + "step": 6774 + }, + { + "epoch": 1.9833138173302107, + "grad_norm": 0.9343958497047424, + "learning_rate": 3.79101164086625e-06, + "loss": 0.6028, + "step": 6775 + }, + { + "epoch": 1.9836065573770492, + "grad_norm": 1.0104193687438965, + "learning_rate": 3.7906818689593295e-06, + "loss": 0.6254, + "step": 6776 + }, + { + "epoch": 1.9838992974238876, + "grad_norm": 0.9176942110061646, + "learning_rate": 3.7903520664307338e-06, + "loss": 0.5995, + "step": 6777 + }, + { + "epoch": 1.984192037470726, + "grad_norm": 1.00187349319458, + "learning_rate": 3.790022233288288e-06, + "loss": 0.6289, + "step": 6778 + }, + { + "epoch": 1.9844847775175642, + "grad_norm": 0.9554363489151001, + "learning_rate": 3.7896923695398163e-06, + "loss": 0.5661, + "step": 6779 + }, + { + "epoch": 1.9847775175644027, + "grad_norm": 0.9322144985198975, + "learning_rate": 3.789362475193145e-06, + "loss": 0.6146, + "step": 6780 + }, + { + "epoch": 1.9850702576112411, + "grad_norm": 0.9074998497962952, + "learning_rate": 3.789032550256102e-06, + "loss": 0.5805, + "step": 6781 + }, + { + "epoch": 1.9853629976580796, + "grad_norm": 0.9678378105163574, + "learning_rate": 3.7887025947365143e-06, + "loss": 0.6182, + "step": 6782 + }, + { + "epoch": 1.985655737704918, + "grad_norm": 0.9512503147125244, + "learning_rate": 3.788372608642209e-06, + "loss": 0.6278, + "step": 6783 + }, + { + "epoch": 1.9859484777517564, + "grad_norm": 0.9751463532447815, + "learning_rate": 3.7880425919810158e-06, + "loss": 0.594, + "step": 6784 + }, + { + "epoch": 1.9862412177985949, + "grad_norm": 0.9936422109603882, + "learning_rate": 3.7877125447607655e-06, + "loss": 0.6338, + "step": 6785 + }, + { + "epoch": 1.9865339578454333, + "grad_norm": 0.9379580020904541, + "learning_rate": 3.787382466989288e-06, + "loss": 0.5934, + "step": 6786 + }, + { + "epoch": 1.9868266978922717, + "grad_norm": 0.943960964679718, + "learning_rate": 3.7870523586744135e-06, + "loss": 0.5823, + "step": 6787 + }, + { + "epoch": 1.9871194379391102, + "grad_norm": 0.9555195569992065, + "learning_rate": 3.7867222198239745e-06, + "loss": 0.6035, + "step": 6788 + }, + { + "epoch": 1.9874121779859486, + "grad_norm": 0.9233765006065369, + "learning_rate": 3.7863920504458036e-06, + "loss": 0.5694, + "step": 6789 + }, + { + "epoch": 1.987704918032787, + "grad_norm": 0.9160581231117249, + "learning_rate": 3.7860618505477342e-06, + "loss": 0.5768, + "step": 6790 + }, + { + "epoch": 1.9879976580796253, + "grad_norm": 0.9654913544654846, + "learning_rate": 3.7857316201376005e-06, + "loss": 0.5941, + "step": 6791 + }, + { + "epoch": 1.9882903981264637, + "grad_norm": 0.9571722149848938, + "learning_rate": 3.7854013592232374e-06, + "loss": 0.5851, + "step": 6792 + }, + { + "epoch": 1.9885831381733021, + "grad_norm": 0.8924282193183899, + "learning_rate": 3.7850710678124792e-06, + "loss": 0.6113, + "step": 6793 + }, + { + "epoch": 1.9888758782201406, + "grad_norm": 0.920337975025177, + "learning_rate": 3.7847407459131636e-06, + "loss": 0.6021, + "step": 6794 + }, + { + "epoch": 1.9891686182669788, + "grad_norm": 0.9611513018608093, + "learning_rate": 3.7844103935331272e-06, + "loss": 0.5883, + "step": 6795 + }, + { + "epoch": 1.9894613583138172, + "grad_norm": 0.9814337491989136, + "learning_rate": 3.7840800106802073e-06, + "loss": 0.6194, + "step": 6796 + }, + { + "epoch": 1.9897540983606556, + "grad_norm": 0.953478991985321, + "learning_rate": 3.783749597362243e-06, + "loss": 0.6083, + "step": 6797 + }, + { + "epoch": 1.990046838407494, + "grad_norm": 0.9337332248687744, + "learning_rate": 3.7834191535870717e-06, + "loss": 0.6431, + "step": 6798 + }, + { + "epoch": 1.9903395784543325, + "grad_norm": 0.9833278656005859, + "learning_rate": 3.7830886793625347e-06, + "loss": 0.6218, + "step": 6799 + }, + { + "epoch": 1.990632318501171, + "grad_norm": 0.9270894527435303, + "learning_rate": 3.7827581746964725e-06, + "loss": 0.601, + "step": 6800 + }, + { + "epoch": 1.9909250585480094, + "grad_norm": 1.2916861772537231, + "learning_rate": 3.7824276395967262e-06, + "loss": 0.6181, + "step": 6801 + }, + { + "epoch": 1.9912177985948478, + "grad_norm": 1.0015928745269775, + "learning_rate": 3.7820970740711384e-06, + "loss": 0.5769, + "step": 6802 + }, + { + "epoch": 1.9915105386416863, + "grad_norm": 1.0018713474273682, + "learning_rate": 3.7817664781275497e-06, + "loss": 0.5904, + "step": 6803 + }, + { + "epoch": 1.9918032786885247, + "grad_norm": 0.955081582069397, + "learning_rate": 3.7814358517738057e-06, + "loss": 0.5965, + "step": 6804 + }, + { + "epoch": 1.9920960187353631, + "grad_norm": 1.0767595767974854, + "learning_rate": 3.78110519501775e-06, + "loss": 0.6442, + "step": 6805 + }, + { + "epoch": 1.9923887587822016, + "grad_norm": 0.9680997729301453, + "learning_rate": 3.7807745078672277e-06, + "loss": 0.6299, + "step": 6806 + }, + { + "epoch": 1.9926814988290398, + "grad_norm": 0.9886751174926758, + "learning_rate": 3.7804437903300837e-06, + "loss": 0.54, + "step": 6807 + }, + { + "epoch": 1.9929742388758782, + "grad_norm": 0.9846494793891907, + "learning_rate": 3.7801130424141653e-06, + "loss": 0.6204, + "step": 6808 + }, + { + "epoch": 1.9932669789227166, + "grad_norm": 0.9548198580741882, + "learning_rate": 3.7797822641273186e-06, + "loss": 0.6341, + "step": 6809 + }, + { + "epoch": 1.993559718969555, + "grad_norm": 0.9979413151741028, + "learning_rate": 3.779451455477392e-06, + "loss": 0.6118, + "step": 6810 + }, + { + "epoch": 1.9938524590163933, + "grad_norm": 0.9366575479507446, + "learning_rate": 3.7791206164722338e-06, + "loss": 0.5829, + "step": 6811 + }, + { + "epoch": 1.9941451990632317, + "grad_norm": 0.9420893788337708, + "learning_rate": 3.7787897471196933e-06, + "loss": 0.6053, + "step": 6812 + }, + { + "epoch": 1.9944379391100702, + "grad_norm": 0.9438278675079346, + "learning_rate": 3.77845884742762e-06, + "loss": 0.5813, + "step": 6813 + }, + { + "epoch": 1.9947306791569086, + "grad_norm": 0.9224913716316223, + "learning_rate": 3.7781279174038656e-06, + "loss": 0.5777, + "step": 6814 + }, + { + "epoch": 1.995023419203747, + "grad_norm": 0.8963552117347717, + "learning_rate": 3.77779695705628e-06, + "loss": 0.5279, + "step": 6815 + }, + { + "epoch": 1.9953161592505855, + "grad_norm": 0.9609224796295166, + "learning_rate": 3.7774659663927165e-06, + "loss": 0.5988, + "step": 6816 + }, + { + "epoch": 1.995608899297424, + "grad_norm": 0.9208358526229858, + "learning_rate": 3.777134945421028e-06, + "loss": 0.5578, + "step": 6817 + }, + { + "epoch": 1.9959016393442623, + "grad_norm": 0.9520053267478943, + "learning_rate": 3.776803894149067e-06, + "loss": 0.5879, + "step": 6818 + }, + { + "epoch": 1.9961943793911008, + "grad_norm": 1.0078680515289307, + "learning_rate": 3.7764728125846884e-06, + "loss": 0.6203, + "step": 6819 + }, + { + "epoch": 1.9964871194379392, + "grad_norm": 0.9375653266906738, + "learning_rate": 3.776141700735747e-06, + "loss": 0.5971, + "step": 6820 + }, + { + "epoch": 1.9967798594847777, + "grad_norm": 0.9386565089225769, + "learning_rate": 3.775810558610098e-06, + "loss": 0.5788, + "step": 6821 + }, + { + "epoch": 1.997072599531616, + "grad_norm": 0.9895408153533936, + "learning_rate": 3.7754793862155996e-06, + "loss": 0.6134, + "step": 6822 + }, + { + "epoch": 1.9973653395784543, + "grad_norm": 0.9309032559394836, + "learning_rate": 3.7751481835601065e-06, + "loss": 0.5936, + "step": 6823 + }, + { + "epoch": 1.9976580796252927, + "grad_norm": 0.9575101733207703, + "learning_rate": 3.7748169506514786e-06, + "loss": 0.5619, + "step": 6824 + }, + { + "epoch": 1.9979508196721312, + "grad_norm": 0.974194347858429, + "learning_rate": 3.7744856874975733e-06, + "loss": 0.6047, + "step": 6825 + }, + { + "epoch": 1.9982435597189696, + "grad_norm": 0.9501975774765015, + "learning_rate": 3.7741543941062507e-06, + "loss": 0.6071, + "step": 6826 + }, + { + "epoch": 1.9985362997658078, + "grad_norm": 0.9539288878440857, + "learning_rate": 3.77382307048537e-06, + "loss": 0.5929, + "step": 6827 + }, + { + "epoch": 1.9988290398126463, + "grad_norm": 0.9403125047683716, + "learning_rate": 3.773491716642792e-06, + "loss": 0.5342, + "step": 6828 + }, + { + "epoch": 1.9991217798594847, + "grad_norm": 0.9199486970901489, + "learning_rate": 3.7731603325863787e-06, + "loss": 0.6086, + "step": 6829 + }, + { + "epoch": 1.9994145199063231, + "grad_norm": 0.9654060006141663, + "learning_rate": 3.772828918323992e-06, + "loss": 0.6007, + "step": 6830 + }, + { + "epoch": 1.9997072599531616, + "grad_norm": 1.0230623483657837, + "learning_rate": 3.7724974738634945e-06, + "loss": 0.6169, + "step": 6831 + }, + { + "epoch": 2.0, + "grad_norm": 0.9087051153182983, + "learning_rate": 3.772165999212749e-06, + "loss": 0.6054, + "step": 6832 + }, + { + "epoch": 2.0002927400468384, + "grad_norm": 0.9134246706962585, + "learning_rate": 3.7718344943796224e-06, + "loss": 0.5931, + "step": 6833 + }, + { + "epoch": 2.000585480093677, + "grad_norm": 0.9694806337356567, + "learning_rate": 3.771502959371977e-06, + "loss": 0.6182, + "step": 6834 + }, + { + "epoch": 2.0008782201405153, + "grad_norm": 0.9808669686317444, + "learning_rate": 3.7711713941976803e-06, + "loss": 0.5857, + "step": 6835 + }, + { + "epoch": 2.0011709601873537, + "grad_norm": 0.9084770679473877, + "learning_rate": 3.770839798864597e-06, + "loss": 0.5939, + "step": 6836 + }, + { + "epoch": 2.001463700234192, + "grad_norm": 0.9444819092750549, + "learning_rate": 3.770508173380596e-06, + "loss": 0.6011, + "step": 6837 + }, + { + "epoch": 2.0017564402810306, + "grad_norm": 1.018134593963623, + "learning_rate": 3.770176517753545e-06, + "loss": 0.5331, + "step": 6838 + }, + { + "epoch": 2.002049180327869, + "grad_norm": 0.9550958275794983, + "learning_rate": 3.769844831991311e-06, + "loss": 0.5599, + "step": 6839 + }, + { + "epoch": 2.002341920374707, + "grad_norm": 0.9373695254325867, + "learning_rate": 3.769513116101765e-06, + "loss": 0.5503, + "step": 6840 + }, + { + "epoch": 2.0026346604215455, + "grad_norm": 0.9882915616035461, + "learning_rate": 3.7691813700927758e-06, + "loss": 0.5767, + "step": 6841 + }, + { + "epoch": 2.002927400468384, + "grad_norm": 1.0088317394256592, + "learning_rate": 3.768849593972216e-06, + "loss": 0.5831, + "step": 6842 + }, + { + "epoch": 2.0032201405152223, + "grad_norm": 0.9929350018501282, + "learning_rate": 3.768517787747955e-06, + "loss": 0.5395, + "step": 6843 + }, + { + "epoch": 2.003512880562061, + "grad_norm": 0.9130370616912842, + "learning_rate": 3.768185951427866e-06, + "loss": 0.5691, + "step": 6844 + }, + { + "epoch": 2.003805620608899, + "grad_norm": 0.9542420506477356, + "learning_rate": 3.7678540850198216e-06, + "loss": 0.5741, + "step": 6845 + }, + { + "epoch": 2.0040983606557377, + "grad_norm": 0.9467065334320068, + "learning_rate": 3.767522188531695e-06, + "loss": 0.5852, + "step": 6846 + }, + { + "epoch": 2.004391100702576, + "grad_norm": 0.9215226173400879, + "learning_rate": 3.767190261971362e-06, + "loss": 0.5733, + "step": 6847 + }, + { + "epoch": 2.0046838407494145, + "grad_norm": 0.9474203586578369, + "learning_rate": 3.766858305346696e-06, + "loss": 0.5721, + "step": 6848 + }, + { + "epoch": 2.004976580796253, + "grad_norm": 0.969508945941925, + "learning_rate": 3.7665263186655733e-06, + "loss": 0.5841, + "step": 6849 + }, + { + "epoch": 2.0052693208430914, + "grad_norm": 0.9147852659225464, + "learning_rate": 3.7661943019358706e-06, + "loss": 0.5664, + "step": 6850 + }, + { + "epoch": 2.00556206088993, + "grad_norm": 0.9209203124046326, + "learning_rate": 3.765862255165464e-06, + "loss": 0.5735, + "step": 6851 + }, + { + "epoch": 2.0058548009367683, + "grad_norm": 0.9860485792160034, + "learning_rate": 3.765530178362233e-06, + "loss": 0.5631, + "step": 6852 + }, + { + "epoch": 2.0061475409836067, + "grad_norm": 0.9297992587089539, + "learning_rate": 3.7651980715340554e-06, + "loss": 0.5462, + "step": 6853 + }, + { + "epoch": 2.006440281030445, + "grad_norm": 0.9679105877876282, + "learning_rate": 3.7648659346888093e-06, + "loss": 0.5847, + "step": 6854 + }, + { + "epoch": 2.0067330210772836, + "grad_norm": 0.9436119198799133, + "learning_rate": 3.7645337678343773e-06, + "loss": 0.5629, + "step": 6855 + }, + { + "epoch": 2.0070257611241216, + "grad_norm": 0.8944746851921082, + "learning_rate": 3.7642015709786384e-06, + "loss": 0.5698, + "step": 6856 + }, + { + "epoch": 2.00731850117096, + "grad_norm": 0.9644535779953003, + "learning_rate": 3.763869344129474e-06, + "loss": 0.5973, + "step": 6857 + }, + { + "epoch": 2.0076112412177984, + "grad_norm": 0.944068431854248, + "learning_rate": 3.763537087294767e-06, + "loss": 0.5187, + "step": 6858 + }, + { + "epoch": 2.007903981264637, + "grad_norm": 0.9737076759338379, + "learning_rate": 3.7632048004823995e-06, + "loss": 0.625, + "step": 6859 + }, + { + "epoch": 2.0081967213114753, + "grad_norm": 0.9727603197097778, + "learning_rate": 3.762872483700255e-06, + "loss": 0.62, + "step": 6860 + }, + { + "epoch": 2.0084894613583137, + "grad_norm": 0.9505138993263245, + "learning_rate": 3.7625401369562188e-06, + "loss": 0.6058, + "step": 6861 + }, + { + "epoch": 2.008782201405152, + "grad_norm": 0.9952167868614197, + "learning_rate": 3.762207760258175e-06, + "loss": 0.6056, + "step": 6862 + }, + { + "epoch": 2.0090749414519906, + "grad_norm": 0.9370227456092834, + "learning_rate": 3.76187535361401e-06, + "loss": 0.5633, + "step": 6863 + }, + { + "epoch": 2.009367681498829, + "grad_norm": 0.9817368388175964, + "learning_rate": 3.7615429170316096e-06, + "loss": 0.6124, + "step": 6864 + }, + { + "epoch": 2.0096604215456675, + "grad_norm": 0.933426558971405, + "learning_rate": 3.761210450518861e-06, + "loss": 0.5467, + "step": 6865 + }, + { + "epoch": 2.009953161592506, + "grad_norm": 0.987797737121582, + "learning_rate": 3.7608779540836527e-06, + "loss": 0.575, + "step": 6866 + }, + { + "epoch": 2.0102459016393444, + "grad_norm": 0.9536080956459045, + "learning_rate": 3.7605454277338727e-06, + "loss": 0.5384, + "step": 6867 + }, + { + "epoch": 2.010538641686183, + "grad_norm": 0.9606307744979858, + "learning_rate": 3.76021287147741e-06, + "loss": 0.6071, + "step": 6868 + }, + { + "epoch": 2.0108313817330212, + "grad_norm": 1.021140456199646, + "learning_rate": 3.7598802853221543e-06, + "loss": 0.5966, + "step": 6869 + }, + { + "epoch": 2.0111241217798597, + "grad_norm": 1.1283185482025146, + "learning_rate": 3.759547669275997e-06, + "loss": 0.5819, + "step": 6870 + }, + { + "epoch": 2.011416861826698, + "grad_norm": 1.0288567543029785, + "learning_rate": 3.7592150233468294e-06, + "loss": 0.6027, + "step": 6871 + }, + { + "epoch": 2.011709601873536, + "grad_norm": 0.9769961833953857, + "learning_rate": 3.758882347542543e-06, + "loss": 0.591, + "step": 6872 + }, + { + "epoch": 2.0120023419203745, + "grad_norm": 0.9513328671455383, + "learning_rate": 3.758549641871032e-06, + "loss": 0.5818, + "step": 6873 + }, + { + "epoch": 2.012295081967213, + "grad_norm": 1.018471121788025, + "learning_rate": 3.7582169063401888e-06, + "loss": 0.5676, + "step": 6874 + }, + { + "epoch": 2.0125878220140514, + "grad_norm": 0.9668817520141602, + "learning_rate": 3.757884140957907e-06, + "loss": 0.5755, + "step": 6875 + }, + { + "epoch": 2.01288056206089, + "grad_norm": 0.9604187607765198, + "learning_rate": 3.7575513457320824e-06, + "loss": 0.5713, + "step": 6876 + }, + { + "epoch": 2.0131733021077283, + "grad_norm": 0.9606986045837402, + "learning_rate": 3.757218520670611e-06, + "loss": 0.5781, + "step": 6877 + }, + { + "epoch": 2.0134660421545667, + "grad_norm": 0.9811913371086121, + "learning_rate": 3.7568856657813883e-06, + "loss": 0.5938, + "step": 6878 + }, + { + "epoch": 2.013758782201405, + "grad_norm": 0.989899218082428, + "learning_rate": 3.7565527810723123e-06, + "loss": 0.6287, + "step": 6879 + }, + { + "epoch": 2.0140515222482436, + "grad_norm": 0.9697996377944946, + "learning_rate": 3.7562198665512795e-06, + "loss": 0.6175, + "step": 6880 + }, + { + "epoch": 2.014344262295082, + "grad_norm": 0.9785324931144714, + "learning_rate": 3.755886922226189e-06, + "loss": 0.5968, + "step": 6881 + }, + { + "epoch": 2.0146370023419204, + "grad_norm": 0.9778080582618713, + "learning_rate": 3.7555539481049397e-06, + "loss": 0.6166, + "step": 6882 + }, + { + "epoch": 2.014929742388759, + "grad_norm": 1.0788285732269287, + "learning_rate": 3.7552209441954328e-06, + "loss": 0.5699, + "step": 6883 + }, + { + "epoch": 2.0152224824355973, + "grad_norm": 0.9475666880607605, + "learning_rate": 3.7548879105055675e-06, + "loss": 0.5466, + "step": 6884 + }, + { + "epoch": 2.0155152224824358, + "grad_norm": 0.9960866570472717, + "learning_rate": 3.754554847043246e-06, + "loss": 0.5768, + "step": 6885 + }, + { + "epoch": 2.015807962529274, + "grad_norm": 0.9796926975250244, + "learning_rate": 3.7542217538163683e-06, + "loss": 0.5602, + "step": 6886 + }, + { + "epoch": 2.0161007025761126, + "grad_norm": 0.9740990996360779, + "learning_rate": 3.7538886308328394e-06, + "loss": 0.6239, + "step": 6887 + }, + { + "epoch": 2.0163934426229506, + "grad_norm": 0.9189649224281311, + "learning_rate": 3.753555478100562e-06, + "loss": 0.5569, + "step": 6888 + }, + { + "epoch": 2.016686182669789, + "grad_norm": 0.9185100793838501, + "learning_rate": 3.7532222956274397e-06, + "loss": 0.5245, + "step": 6889 + }, + { + "epoch": 2.0169789227166275, + "grad_norm": 0.9804782867431641, + "learning_rate": 3.7528890834213773e-06, + "loss": 0.556, + "step": 6890 + }, + { + "epoch": 2.017271662763466, + "grad_norm": 0.9811258316040039, + "learning_rate": 3.7525558414902805e-06, + "loss": 0.5702, + "step": 6891 + }, + { + "epoch": 2.0175644028103044, + "grad_norm": 1.0108312368392944, + "learning_rate": 3.7522225698420565e-06, + "loss": 0.6059, + "step": 6892 + }, + { + "epoch": 2.017857142857143, + "grad_norm": 0.9941244125366211, + "learning_rate": 3.751889268484612e-06, + "loss": 0.5555, + "step": 6893 + }, + { + "epoch": 2.0181498829039812, + "grad_norm": 1.0163973569869995, + "learning_rate": 3.7515559374258525e-06, + "loss": 0.59, + "step": 6894 + }, + { + "epoch": 2.0184426229508197, + "grad_norm": 0.9124834537506104, + "learning_rate": 3.751222576673689e-06, + "loss": 0.5834, + "step": 6895 + }, + { + "epoch": 2.018735362997658, + "grad_norm": 1.0027844905853271, + "learning_rate": 3.750889186236029e-06, + "loss": 0.5909, + "step": 6896 + }, + { + "epoch": 2.0190281030444965, + "grad_norm": 1.006765604019165, + "learning_rate": 3.750555766120783e-06, + "loss": 0.5763, + "step": 6897 + }, + { + "epoch": 2.019320843091335, + "grad_norm": 0.9339196085929871, + "learning_rate": 3.7502223163358614e-06, + "loss": 0.5347, + "step": 6898 + }, + { + "epoch": 2.0196135831381734, + "grad_norm": 1.0339338779449463, + "learning_rate": 3.749888836889175e-06, + "loss": 0.5941, + "step": 6899 + }, + { + "epoch": 2.019906323185012, + "grad_norm": 0.9711725115776062, + "learning_rate": 3.7495553277886354e-06, + "loss": 0.5767, + "step": 6900 + }, + { + "epoch": 2.0201990632318503, + "grad_norm": 0.9885873198509216, + "learning_rate": 3.7492217890421558e-06, + "loss": 0.5744, + "step": 6901 + }, + { + "epoch": 2.0204918032786887, + "grad_norm": 0.9684771299362183, + "learning_rate": 3.7488882206576498e-06, + "loss": 0.5698, + "step": 6902 + }, + { + "epoch": 2.020784543325527, + "grad_norm": 0.9735007286071777, + "learning_rate": 3.74855462264303e-06, + "loss": 0.5241, + "step": 6903 + }, + { + "epoch": 2.021077283372365, + "grad_norm": 0.9360061883926392, + "learning_rate": 3.748220995006213e-06, + "loss": 0.5389, + "step": 6904 + }, + { + "epoch": 2.0213700234192036, + "grad_norm": 0.9279727935791016, + "learning_rate": 3.7478873377551124e-06, + "loss": 0.5549, + "step": 6905 + }, + { + "epoch": 2.021662763466042, + "grad_norm": 0.9939764142036438, + "learning_rate": 3.7475536508976446e-06, + "loss": 0.5713, + "step": 6906 + }, + { + "epoch": 2.0219555035128804, + "grad_norm": 0.9275122880935669, + "learning_rate": 3.7472199344417272e-06, + "loss": 0.5555, + "step": 6907 + }, + { + "epoch": 2.022248243559719, + "grad_norm": 0.9706093668937683, + "learning_rate": 3.746886188395277e-06, + "loss": 0.5746, + "step": 6908 + }, + { + "epoch": 2.0225409836065573, + "grad_norm": 0.9667859673500061, + "learning_rate": 3.746552412766213e-06, + "loss": 0.5607, + "step": 6909 + }, + { + "epoch": 2.0228337236533958, + "grad_norm": 0.9679692387580872, + "learning_rate": 3.746218607562453e-06, + "loss": 0.5797, + "step": 6910 + }, + { + "epoch": 2.023126463700234, + "grad_norm": 1.008257508277893, + "learning_rate": 3.7458847727919168e-06, + "loss": 0.5739, + "step": 6911 + }, + { + "epoch": 2.0234192037470726, + "grad_norm": 1.0329760313034058, + "learning_rate": 3.7455509084625253e-06, + "loss": 0.6031, + "step": 6912 + }, + { + "epoch": 2.023711943793911, + "grad_norm": 0.9903295040130615, + "learning_rate": 3.745217014582199e-06, + "loss": 0.5637, + "step": 6913 + }, + { + "epoch": 2.0240046838407495, + "grad_norm": 1.02057683467865, + "learning_rate": 3.7448830911588607e-06, + "loss": 0.5634, + "step": 6914 + }, + { + "epoch": 2.024297423887588, + "grad_norm": 0.9808785319328308, + "learning_rate": 3.7445491382004305e-06, + "loss": 0.5757, + "step": 6915 + }, + { + "epoch": 2.0245901639344264, + "grad_norm": 0.9348267912864685, + "learning_rate": 3.7442151557148345e-06, + "loss": 0.5714, + "step": 6916 + }, + { + "epoch": 2.024882903981265, + "grad_norm": 0.9782540202140808, + "learning_rate": 3.7438811437099938e-06, + "loss": 0.5654, + "step": 6917 + }, + { + "epoch": 2.0251756440281032, + "grad_norm": 1.005495309829712, + "learning_rate": 3.743547102193834e-06, + "loss": 0.6101, + "step": 6918 + }, + { + "epoch": 2.0254683840749417, + "grad_norm": 0.9767120480537415, + "learning_rate": 3.743213031174281e-06, + "loss": 0.5914, + "step": 6919 + }, + { + "epoch": 2.0257611241217797, + "grad_norm": 0.9697039127349854, + "learning_rate": 3.7428789306592595e-06, + "loss": 0.5531, + "step": 6920 + }, + { + "epoch": 2.026053864168618, + "grad_norm": 0.93015456199646, + "learning_rate": 3.7425448006566966e-06, + "loss": 0.5056, + "step": 6921 + }, + { + "epoch": 2.0263466042154565, + "grad_norm": 0.9754744172096252, + "learning_rate": 3.7422106411745196e-06, + "loss": 0.5978, + "step": 6922 + }, + { + "epoch": 2.026639344262295, + "grad_norm": 0.9736236929893494, + "learning_rate": 3.741876452220656e-06, + "loss": 0.5923, + "step": 6923 + }, + { + "epoch": 2.0269320843091334, + "grad_norm": 0.9579874873161316, + "learning_rate": 3.7415422338030362e-06, + "loss": 0.5804, + "step": 6924 + }, + { + "epoch": 2.027224824355972, + "grad_norm": 0.9399815797805786, + "learning_rate": 3.7412079859295877e-06, + "loss": 0.5677, + "step": 6925 + }, + { + "epoch": 2.0275175644028103, + "grad_norm": 1.0093932151794434, + "learning_rate": 3.7408737086082408e-06, + "loss": 0.5738, + "step": 6926 + }, + { + "epoch": 2.0278103044496487, + "grad_norm": 1.0867291688919067, + "learning_rate": 3.740539401846927e-06, + "loss": 0.5775, + "step": 6927 + }, + { + "epoch": 2.028103044496487, + "grad_norm": 0.9278172850608826, + "learning_rate": 3.7402050656535777e-06, + "loss": 0.5382, + "step": 6928 + }, + { + "epoch": 2.0283957845433256, + "grad_norm": 0.9914090037345886, + "learning_rate": 3.739870700036125e-06, + "loss": 0.5988, + "step": 6929 + }, + { + "epoch": 2.028688524590164, + "grad_norm": 1.1764954328536987, + "learning_rate": 3.739536305002502e-06, + "loss": 0.5614, + "step": 6930 + }, + { + "epoch": 2.0289812646370025, + "grad_norm": 0.9524335861206055, + "learning_rate": 3.739201880560641e-06, + "loss": 0.5813, + "step": 6931 + }, + { + "epoch": 2.029274004683841, + "grad_norm": 0.9705631136894226, + "learning_rate": 3.7388674267184784e-06, + "loss": 0.5654, + "step": 6932 + }, + { + "epoch": 2.0295667447306793, + "grad_norm": 0.9894316792488098, + "learning_rate": 3.7385329434839472e-06, + "loss": 0.561, + "step": 6933 + }, + { + "epoch": 2.0298594847775178, + "grad_norm": 0.9502295851707458, + "learning_rate": 3.7381984308649855e-06, + "loss": 0.5627, + "step": 6934 + }, + { + "epoch": 2.0301522248243558, + "grad_norm": 1.0363293886184692, + "learning_rate": 3.737863888869527e-06, + "loss": 0.6061, + "step": 6935 + }, + { + "epoch": 2.030444964871194, + "grad_norm": 0.9836623668670654, + "learning_rate": 3.7375293175055094e-06, + "loss": 0.5838, + "step": 6936 + }, + { + "epoch": 2.0307377049180326, + "grad_norm": 0.9744644165039062, + "learning_rate": 3.737194716780872e-06, + "loss": 0.5622, + "step": 6937 + }, + { + "epoch": 2.031030444964871, + "grad_norm": 0.9832055568695068, + "learning_rate": 3.736860086703552e-06, + "loss": 0.6096, + "step": 6938 + }, + { + "epoch": 2.0313231850117095, + "grad_norm": 1.1343607902526855, + "learning_rate": 3.7365254272814887e-06, + "loss": 0.6064, + "step": 6939 + }, + { + "epoch": 2.031615925058548, + "grad_norm": 0.9760472178459167, + "learning_rate": 3.736190738522622e-06, + "loss": 0.5854, + "step": 6940 + }, + { + "epoch": 2.0319086651053864, + "grad_norm": 0.9675528407096863, + "learning_rate": 3.7358560204348928e-06, + "loss": 0.5708, + "step": 6941 + }, + { + "epoch": 2.032201405152225, + "grad_norm": 1.0030699968338013, + "learning_rate": 3.735521273026242e-06, + "loss": 0.587, + "step": 6942 + }, + { + "epoch": 2.0324941451990632, + "grad_norm": 1.016855001449585, + "learning_rate": 3.7351864963046112e-06, + "loss": 0.558, + "step": 6943 + }, + { + "epoch": 2.0327868852459017, + "grad_norm": 1.0314451456069946, + "learning_rate": 3.734851690277944e-06, + "loss": 0.5597, + "step": 6944 + }, + { + "epoch": 2.03307962529274, + "grad_norm": 0.9914858937263489, + "learning_rate": 3.7345168549541825e-06, + "loss": 0.5771, + "step": 6945 + }, + { + "epoch": 2.0333723653395785, + "grad_norm": 0.9970172643661499, + "learning_rate": 3.7341819903412724e-06, + "loss": 0.5717, + "step": 6946 + }, + { + "epoch": 2.033665105386417, + "grad_norm": 0.9530056715011597, + "learning_rate": 3.7338470964471566e-06, + "loss": 0.5849, + "step": 6947 + }, + { + "epoch": 2.0339578454332554, + "grad_norm": 1.0097098350524902, + "learning_rate": 3.7335121732797817e-06, + "loss": 0.5794, + "step": 6948 + }, + { + "epoch": 2.034250585480094, + "grad_norm": 0.9552956819534302, + "learning_rate": 3.733177220847094e-06, + "loss": 0.5925, + "step": 6949 + }, + { + "epoch": 2.0345433255269323, + "grad_norm": 1.1201459169387817, + "learning_rate": 3.7328422391570396e-06, + "loss": 0.544, + "step": 6950 + }, + { + "epoch": 2.0348360655737703, + "grad_norm": 0.9431127905845642, + "learning_rate": 3.7325072282175657e-06, + "loss": 0.5809, + "step": 6951 + }, + { + "epoch": 2.0351288056206087, + "grad_norm": 0.9588595032691956, + "learning_rate": 3.7321721880366214e-06, + "loss": 0.574, + "step": 6952 + }, + { + "epoch": 2.035421545667447, + "grad_norm": 0.9694178104400635, + "learning_rate": 3.7318371186221548e-06, + "loss": 0.6029, + "step": 6953 + }, + { + "epoch": 2.0357142857142856, + "grad_norm": 0.9611911773681641, + "learning_rate": 3.7315020199821165e-06, + "loss": 0.56, + "step": 6954 + }, + { + "epoch": 2.036007025761124, + "grad_norm": 0.9492121338844299, + "learning_rate": 3.731166892124456e-06, + "loss": 0.5489, + "step": 6955 + }, + { + "epoch": 2.0362997658079625, + "grad_norm": 0.9895868897438049, + "learning_rate": 3.7308317350571243e-06, + "loss": 0.5856, + "step": 6956 + }, + { + "epoch": 2.036592505854801, + "grad_norm": 0.995940089225769, + "learning_rate": 3.7304965487880734e-06, + "loss": 0.5887, + "step": 6957 + }, + { + "epoch": 2.0368852459016393, + "grad_norm": 0.9721046090126038, + "learning_rate": 3.7301613333252553e-06, + "loss": 0.5968, + "step": 6958 + }, + { + "epoch": 2.0371779859484778, + "grad_norm": 1.012921690940857, + "learning_rate": 3.7298260886766235e-06, + "loss": 0.5812, + "step": 6959 + }, + { + "epoch": 2.037470725995316, + "grad_norm": 1.024129867553711, + "learning_rate": 3.7294908148501314e-06, + "loss": 0.553, + "step": 6960 + }, + { + "epoch": 2.0377634660421546, + "grad_norm": 0.9876652359962463, + "learning_rate": 3.7291555118537333e-06, + "loss": 0.5637, + "step": 6961 + }, + { + "epoch": 2.038056206088993, + "grad_norm": 0.9630298614501953, + "learning_rate": 3.7288201796953842e-06, + "loss": 0.578, + "step": 6962 + }, + { + "epoch": 2.0383489461358315, + "grad_norm": 1.0605661869049072, + "learning_rate": 3.7284848183830407e-06, + "loss": 0.5676, + "step": 6963 + }, + { + "epoch": 2.03864168618267, + "grad_norm": 1.0161375999450684, + "learning_rate": 3.7281494279246595e-06, + "loss": 0.5863, + "step": 6964 + }, + { + "epoch": 2.0389344262295084, + "grad_norm": 1.0113269090652466, + "learning_rate": 3.727814008328196e-06, + "loss": 0.6103, + "step": 6965 + }, + { + "epoch": 2.039227166276347, + "grad_norm": 0.97161865234375, + "learning_rate": 3.7274785596016095e-06, + "loss": 0.561, + "step": 6966 + }, + { + "epoch": 2.039519906323185, + "grad_norm": 0.9959555268287659, + "learning_rate": 3.7271430817528587e-06, + "loss": 0.593, + "step": 6967 + }, + { + "epoch": 2.0398126463700232, + "grad_norm": 0.9873859286308289, + "learning_rate": 3.7268075747899025e-06, + "loss": 0.5652, + "step": 6968 + }, + { + "epoch": 2.0401053864168617, + "grad_norm": 0.9205760955810547, + "learning_rate": 3.7264720387207008e-06, + "loss": 0.5728, + "step": 6969 + }, + { + "epoch": 2.0403981264637, + "grad_norm": 0.9784654974937439, + "learning_rate": 3.7261364735532144e-06, + "loss": 0.5918, + "step": 6970 + }, + { + "epoch": 2.0406908665105385, + "grad_norm": 0.9773848652839661, + "learning_rate": 3.7258008792954047e-06, + "loss": 0.5913, + "step": 6971 + }, + { + "epoch": 2.040983606557377, + "grad_norm": 1.0142810344696045, + "learning_rate": 3.725465255955233e-06, + "loss": 0.6015, + "step": 6972 + }, + { + "epoch": 2.0412763466042154, + "grad_norm": 0.9720742702484131, + "learning_rate": 3.7251296035406637e-06, + "loss": 0.5782, + "step": 6973 + }, + { + "epoch": 2.041569086651054, + "grad_norm": 0.9432435035705566, + "learning_rate": 3.7247939220596583e-06, + "loss": 0.5413, + "step": 6974 + }, + { + "epoch": 2.0418618266978923, + "grad_norm": 0.9121375679969788, + "learning_rate": 3.7244582115201826e-06, + "loss": 0.5683, + "step": 6975 + }, + { + "epoch": 2.0421545667447307, + "grad_norm": 0.9516785144805908, + "learning_rate": 3.7241224719302005e-06, + "loss": 0.5854, + "step": 6976 + }, + { + "epoch": 2.042447306791569, + "grad_norm": 0.9812279939651489, + "learning_rate": 3.7237867032976766e-06, + "loss": 0.6418, + "step": 6977 + }, + { + "epoch": 2.0427400468384076, + "grad_norm": 0.9844695329666138, + "learning_rate": 3.723450905630579e-06, + "loss": 0.5704, + "step": 6978 + }, + { + "epoch": 2.043032786885246, + "grad_norm": 0.9578074812889099, + "learning_rate": 3.723115078936873e-06, + "loss": 0.5496, + "step": 6979 + }, + { + "epoch": 2.0433255269320845, + "grad_norm": 0.9605997800827026, + "learning_rate": 3.7227792232245276e-06, + "loss": 0.6138, + "step": 6980 + }, + { + "epoch": 2.043618266978923, + "grad_norm": 0.9611008167266846, + "learning_rate": 3.722443338501509e-06, + "loss": 0.5779, + "step": 6981 + }, + { + "epoch": 2.0439110070257613, + "grad_norm": 0.9823662042617798, + "learning_rate": 3.7221074247757883e-06, + "loss": 0.5965, + "step": 6982 + }, + { + "epoch": 2.0442037470725993, + "grad_norm": 0.9346361756324768, + "learning_rate": 3.7217714820553335e-06, + "loss": 0.5569, + "step": 6983 + }, + { + "epoch": 2.0444964871194378, + "grad_norm": 1.0322294235229492, + "learning_rate": 3.721435510348116e-06, + "loss": 0.616, + "step": 6984 + }, + { + "epoch": 2.044789227166276, + "grad_norm": 0.9223121404647827, + "learning_rate": 3.7210995096621062e-06, + "loss": 0.5403, + "step": 6985 + }, + { + "epoch": 2.0450819672131146, + "grad_norm": 0.9677647948265076, + "learning_rate": 3.7207634800052763e-06, + "loss": 0.5703, + "step": 6986 + }, + { + "epoch": 2.045374707259953, + "grad_norm": 0.9458227157592773, + "learning_rate": 3.720427421385597e-06, + "loss": 0.5503, + "step": 6987 + }, + { + "epoch": 2.0456674473067915, + "grad_norm": 0.981399416923523, + "learning_rate": 3.7200913338110438e-06, + "loss": 0.6073, + "step": 6988 + }, + { + "epoch": 2.04596018735363, + "grad_norm": 1.0065025091171265, + "learning_rate": 3.7197552172895896e-06, + "loss": 0.589, + "step": 6989 + }, + { + "epoch": 2.0462529274004684, + "grad_norm": 0.9568655490875244, + "learning_rate": 3.7194190718292077e-06, + "loss": 0.5708, + "step": 6990 + }, + { + "epoch": 2.046545667447307, + "grad_norm": 0.9812189340591431, + "learning_rate": 3.7190828974378747e-06, + "loss": 0.5768, + "step": 6991 + }, + { + "epoch": 2.0468384074941453, + "grad_norm": 0.9877282977104187, + "learning_rate": 3.7187466941235654e-06, + "loss": 0.58, + "step": 6992 + }, + { + "epoch": 2.0471311475409837, + "grad_norm": 1.0388752222061157, + "learning_rate": 3.7184104618942567e-06, + "loss": 0.5953, + "step": 6993 + }, + { + "epoch": 2.047423887587822, + "grad_norm": 0.9710250496864319, + "learning_rate": 3.7180742007579255e-06, + "loss": 0.5575, + "step": 6994 + }, + { + "epoch": 2.0477166276346606, + "grad_norm": 0.9357069134712219, + "learning_rate": 3.717737910722551e-06, + "loss": 0.5554, + "step": 6995 + }, + { + "epoch": 2.048009367681499, + "grad_norm": 1.0036656856536865, + "learning_rate": 3.7174015917961095e-06, + "loss": 0.5653, + "step": 6996 + }, + { + "epoch": 2.0483021077283374, + "grad_norm": 0.9589866399765015, + "learning_rate": 3.7170652439865814e-06, + "loss": 0.5532, + "step": 6997 + }, + { + "epoch": 2.048594847775176, + "grad_norm": 1.0089857578277588, + "learning_rate": 3.716728867301946e-06, + "loss": 0.5632, + "step": 6998 + }, + { + "epoch": 2.048887587822014, + "grad_norm": 1.031272292137146, + "learning_rate": 3.716392461750185e-06, + "loss": 0.608, + "step": 6999 + }, + { + "epoch": 2.0491803278688523, + "grad_norm": 0.9582589864730835, + "learning_rate": 3.71605602733928e-06, + "loss": 0.5148, + "step": 7000 + }, + { + "epoch": 2.0494730679156907, + "grad_norm": 0.931379497051239, + "learning_rate": 3.7157195640772114e-06, + "loss": 0.5471, + "step": 7001 + }, + { + "epoch": 2.049765807962529, + "grad_norm": 1.0553423166275024, + "learning_rate": 3.7153830719719624e-06, + "loss": 0.5717, + "step": 7002 + }, + { + "epoch": 2.0500585480093676, + "grad_norm": 1.0024031400680542, + "learning_rate": 3.7150465510315164e-06, + "loss": 0.5825, + "step": 7003 + }, + { + "epoch": 2.050351288056206, + "grad_norm": 0.9482718706130981, + "learning_rate": 3.7147100012638583e-06, + "loss": 0.5055, + "step": 7004 + }, + { + "epoch": 2.0506440281030445, + "grad_norm": 0.9610719084739685, + "learning_rate": 3.714373422676971e-06, + "loss": 0.5292, + "step": 7005 + }, + { + "epoch": 2.050936768149883, + "grad_norm": 1.0400458574295044, + "learning_rate": 3.714036815278842e-06, + "loss": 0.6017, + "step": 7006 + }, + { + "epoch": 2.0512295081967213, + "grad_norm": 1.004902958869934, + "learning_rate": 3.7137001790774557e-06, + "loss": 0.556, + "step": 7007 + }, + { + "epoch": 2.0515222482435598, + "grad_norm": 0.956906795501709, + "learning_rate": 3.7133635140808e-06, + "loss": 0.5788, + "step": 7008 + }, + { + "epoch": 2.051814988290398, + "grad_norm": 0.9755375981330872, + "learning_rate": 3.713026820296861e-06, + "loss": 0.6059, + "step": 7009 + }, + { + "epoch": 2.0521077283372366, + "grad_norm": 1.000742793083191, + "learning_rate": 3.712690097733629e-06, + "loss": 0.5535, + "step": 7010 + }, + { + "epoch": 2.052400468384075, + "grad_norm": 0.9777922630310059, + "learning_rate": 3.7123533463990904e-06, + "loss": 0.6095, + "step": 7011 + }, + { + "epoch": 2.0526932084309135, + "grad_norm": 0.9458847641944885, + "learning_rate": 3.7120165663012354e-06, + "loss": 0.5781, + "step": 7012 + }, + { + "epoch": 2.052985948477752, + "grad_norm": 0.96258944272995, + "learning_rate": 3.711679757448055e-06, + "loss": 0.5501, + "step": 7013 + }, + { + "epoch": 2.05327868852459, + "grad_norm": 0.9453668594360352, + "learning_rate": 3.7113429198475394e-06, + "loss": 0.5877, + "step": 7014 + }, + { + "epoch": 2.0535714285714284, + "grad_norm": 0.9470553398132324, + "learning_rate": 3.711006053507681e-06, + "loss": 0.5476, + "step": 7015 + }, + { + "epoch": 2.053864168618267, + "grad_norm": 0.9663994312286377, + "learning_rate": 3.7106691584364706e-06, + "loss": 0.5592, + "step": 7016 + }, + { + "epoch": 2.0541569086651053, + "grad_norm": 1.0251109600067139, + "learning_rate": 3.710332234641901e-06, + "loss": 0.5559, + "step": 7017 + }, + { + "epoch": 2.0544496487119437, + "grad_norm": 1.0148346424102783, + "learning_rate": 3.709995282131968e-06, + "loss": 0.5901, + "step": 7018 + }, + { + "epoch": 2.054742388758782, + "grad_norm": 0.9750030636787415, + "learning_rate": 3.709658300914664e-06, + "loss": 0.5551, + "step": 7019 + }, + { + "epoch": 2.0550351288056206, + "grad_norm": 0.9815239906311035, + "learning_rate": 3.709321290997985e-06, + "loss": 0.5927, + "step": 7020 + }, + { + "epoch": 2.055327868852459, + "grad_norm": 1.009939432144165, + "learning_rate": 3.708984252389925e-06, + "loss": 0.6093, + "step": 7021 + }, + { + "epoch": 2.0556206088992974, + "grad_norm": 0.9484739899635315, + "learning_rate": 3.708647185098482e-06, + "loss": 0.5677, + "step": 7022 + }, + { + "epoch": 2.055913348946136, + "grad_norm": 1.048485517501831, + "learning_rate": 3.708310089131652e-06, + "loss": 0.6359, + "step": 7023 + }, + { + "epoch": 2.0562060889929743, + "grad_norm": 0.9534521698951721, + "learning_rate": 3.707972964497433e-06, + "loss": 0.5966, + "step": 7024 + }, + { + "epoch": 2.0564988290398127, + "grad_norm": 0.9536182880401611, + "learning_rate": 3.7076358112038235e-06, + "loss": 0.5226, + "step": 7025 + }, + { + "epoch": 2.056791569086651, + "grad_norm": 0.9874877333641052, + "learning_rate": 3.7072986292588224e-06, + "loss": 0.5571, + "step": 7026 + }, + { + "epoch": 2.0570843091334896, + "grad_norm": 0.9514103531837463, + "learning_rate": 3.7069614186704302e-06, + "loss": 0.5831, + "step": 7027 + }, + { + "epoch": 2.057377049180328, + "grad_norm": 0.9792178273200989, + "learning_rate": 3.706624179446645e-06, + "loss": 0.579, + "step": 7028 + }, + { + "epoch": 2.0576697892271665, + "grad_norm": 1.0190098285675049, + "learning_rate": 3.7062869115954697e-06, + "loss": 0.5817, + "step": 7029 + }, + { + "epoch": 2.0579625292740045, + "grad_norm": 1.0139126777648926, + "learning_rate": 3.7059496151249065e-06, + "loss": 0.5731, + "step": 7030 + }, + { + "epoch": 2.058255269320843, + "grad_norm": 0.9700093865394592, + "learning_rate": 3.705612290042956e-06, + "loss": 0.5835, + "step": 7031 + }, + { + "epoch": 2.0585480093676813, + "grad_norm": 0.967401385307312, + "learning_rate": 3.705274936357623e-06, + "loss": 0.5465, + "step": 7032 + }, + { + "epoch": 2.0588407494145198, + "grad_norm": 0.9610064625740051, + "learning_rate": 3.70493755407691e-06, + "loss": 0.5658, + "step": 7033 + }, + { + "epoch": 2.059133489461358, + "grad_norm": 0.9271472096443176, + "learning_rate": 3.704600143208822e-06, + "loss": 0.5695, + "step": 7034 + }, + { + "epoch": 2.0594262295081966, + "grad_norm": 0.9703616499900818, + "learning_rate": 3.704262703761365e-06, + "loss": 0.5839, + "step": 7035 + }, + { + "epoch": 2.059718969555035, + "grad_norm": 1.005125880241394, + "learning_rate": 3.703925235742543e-06, + "loss": 0.5965, + "step": 7036 + }, + { + "epoch": 2.0600117096018735, + "grad_norm": 0.9809463024139404, + "learning_rate": 3.7035877391603638e-06, + "loss": 0.5374, + "step": 7037 + }, + { + "epoch": 2.060304449648712, + "grad_norm": 0.9447077512741089, + "learning_rate": 3.7032502140228345e-06, + "loss": 0.5433, + "step": 7038 + }, + { + "epoch": 2.0605971896955504, + "grad_norm": 0.9584416747093201, + "learning_rate": 3.7029126603379623e-06, + "loss": 0.5447, + "step": 7039 + }, + { + "epoch": 2.060889929742389, + "grad_norm": 0.9739288091659546, + "learning_rate": 3.702575078113757e-06, + "loss": 0.578, + "step": 7040 + }, + { + "epoch": 2.0611826697892273, + "grad_norm": 0.9688587784767151, + "learning_rate": 3.7022374673582263e-06, + "loss": 0.5716, + "step": 7041 + }, + { + "epoch": 2.0614754098360657, + "grad_norm": 1.0040931701660156, + "learning_rate": 3.7018998280793804e-06, + "loss": 0.5799, + "step": 7042 + }, + { + "epoch": 2.061768149882904, + "grad_norm": 0.9418444633483887, + "learning_rate": 3.70156216028523e-06, + "loss": 0.5493, + "step": 7043 + }, + { + "epoch": 2.0620608899297426, + "grad_norm": 1.0038857460021973, + "learning_rate": 3.701224463983788e-06, + "loss": 0.5603, + "step": 7044 + }, + { + "epoch": 2.062353629976581, + "grad_norm": 0.9705121517181396, + "learning_rate": 3.700886739183063e-06, + "loss": 0.5441, + "step": 7045 + }, + { + "epoch": 2.062646370023419, + "grad_norm": 0.9030169248580933, + "learning_rate": 3.7005489858910707e-06, + "loss": 0.5162, + "step": 7046 + }, + { + "epoch": 2.0629391100702574, + "grad_norm": 0.9436018466949463, + "learning_rate": 3.7002112041158226e-06, + "loss": 0.5738, + "step": 7047 + }, + { + "epoch": 2.063231850117096, + "grad_norm": 0.9696527123451233, + "learning_rate": 3.699873393865333e-06, + "loss": 0.5878, + "step": 7048 + }, + { + "epoch": 2.0635245901639343, + "grad_norm": 0.9626752138137817, + "learning_rate": 3.6995355551476165e-06, + "loss": 0.5871, + "step": 7049 + }, + { + "epoch": 2.0638173302107727, + "grad_norm": 1.0073622465133667, + "learning_rate": 3.699197687970689e-06, + "loss": 0.5675, + "step": 7050 + }, + { + "epoch": 2.064110070257611, + "grad_norm": 0.9751494526863098, + "learning_rate": 3.6988597923425656e-06, + "loss": 0.6052, + "step": 7051 + }, + { + "epoch": 2.0644028103044496, + "grad_norm": 0.9960125684738159, + "learning_rate": 3.698521868271263e-06, + "loss": 0.608, + "step": 7052 + }, + { + "epoch": 2.064695550351288, + "grad_norm": 0.9444469213485718, + "learning_rate": 3.698183915764799e-06, + "loss": 0.5774, + "step": 7053 + }, + { + "epoch": 2.0649882903981265, + "grad_norm": 0.9719519019126892, + "learning_rate": 3.6978459348311917e-06, + "loss": 0.5634, + "step": 7054 + }, + { + "epoch": 2.065281030444965, + "grad_norm": 1.0255886316299438, + "learning_rate": 3.6975079254784597e-06, + "loss": 0.6051, + "step": 7055 + }, + { + "epoch": 2.0655737704918034, + "grad_norm": 0.9901722073554993, + "learning_rate": 3.697169887714622e-06, + "loss": 0.5742, + "step": 7056 + }, + { + "epoch": 2.065866510538642, + "grad_norm": 0.9785324931144714, + "learning_rate": 3.6968318215476983e-06, + "loss": 0.5841, + "step": 7057 + }, + { + "epoch": 2.0661592505854802, + "grad_norm": 1.0149766206741333, + "learning_rate": 3.696493726985709e-06, + "loss": 0.5344, + "step": 7058 + }, + { + "epoch": 2.0664519906323187, + "grad_norm": 0.9715554118156433, + "learning_rate": 3.6961556040366776e-06, + "loss": 0.6074, + "step": 7059 + }, + { + "epoch": 2.066744730679157, + "grad_norm": 0.9445981383323669, + "learning_rate": 3.6958174527086244e-06, + "loss": 0.6178, + "step": 7060 + }, + { + "epoch": 2.0670374707259955, + "grad_norm": 0.9995592832565308, + "learning_rate": 3.6954792730095716e-06, + "loss": 0.5916, + "step": 7061 + }, + { + "epoch": 2.0673302107728335, + "grad_norm": 0.9269167184829712, + "learning_rate": 3.695141064947544e-06, + "loss": 0.5787, + "step": 7062 + }, + { + "epoch": 2.067622950819672, + "grad_norm": 1.0200037956237793, + "learning_rate": 3.6948028285305648e-06, + "loss": 0.6039, + "step": 7063 + }, + { + "epoch": 2.0679156908665104, + "grad_norm": 1.0113259553909302, + "learning_rate": 3.694464563766659e-06, + "loss": 0.5689, + "step": 7064 + }, + { + "epoch": 2.068208430913349, + "grad_norm": 1.0465774536132812, + "learning_rate": 3.6941262706638525e-06, + "loss": 0.5722, + "step": 7065 + }, + { + "epoch": 2.0685011709601873, + "grad_norm": 0.9789510369300842, + "learning_rate": 3.6937879492301697e-06, + "loss": 0.5847, + "step": 7066 + }, + { + "epoch": 2.0687939110070257, + "grad_norm": 0.9578008651733398, + "learning_rate": 3.6934495994736393e-06, + "loss": 0.5686, + "step": 7067 + }, + { + "epoch": 2.069086651053864, + "grad_norm": 0.9820907711982727, + "learning_rate": 3.6931112214022875e-06, + "loss": 0.5849, + "step": 7068 + }, + { + "epoch": 2.0693793911007026, + "grad_norm": 1.0059622526168823, + "learning_rate": 3.6927728150241428e-06, + "loss": 0.5869, + "step": 7069 + }, + { + "epoch": 2.069672131147541, + "grad_norm": 0.9823980927467346, + "learning_rate": 3.692434380347234e-06, + "loss": 0.577, + "step": 7070 + }, + { + "epoch": 2.0699648711943794, + "grad_norm": 1.0228737592697144, + "learning_rate": 3.69209591737959e-06, + "loss": 0.5288, + "step": 7071 + }, + { + "epoch": 2.070257611241218, + "grad_norm": 0.9677423238754272, + "learning_rate": 3.6917574261292423e-06, + "loss": 0.5214, + "step": 7072 + }, + { + "epoch": 2.0705503512880563, + "grad_norm": 1.104542851448059, + "learning_rate": 3.6914189066042196e-06, + "loss": 0.589, + "step": 7073 + }, + { + "epoch": 2.0708430913348947, + "grad_norm": 0.9795127511024475, + "learning_rate": 3.6910803588125547e-06, + "loss": 0.5554, + "step": 7074 + }, + { + "epoch": 2.071135831381733, + "grad_norm": 1.0126339197158813, + "learning_rate": 3.69074178276228e-06, + "loss": 0.5586, + "step": 7075 + }, + { + "epoch": 2.0714285714285716, + "grad_norm": 0.9905315041542053, + "learning_rate": 3.6904031784614273e-06, + "loss": 0.5906, + "step": 7076 + }, + { + "epoch": 2.07172131147541, + "grad_norm": 0.9907563924789429, + "learning_rate": 3.6900645459180305e-06, + "loss": 0.5978, + "step": 7077 + }, + { + "epoch": 2.072014051522248, + "grad_norm": 0.9677945971488953, + "learning_rate": 3.689725885140124e-06, + "loss": 0.5484, + "step": 7078 + }, + { + "epoch": 2.0723067915690865, + "grad_norm": 0.9562986493110657, + "learning_rate": 3.6893871961357425e-06, + "loss": 0.5904, + "step": 7079 + }, + { + "epoch": 2.072599531615925, + "grad_norm": 0.963220477104187, + "learning_rate": 3.689048478912921e-06, + "loss": 0.5703, + "step": 7080 + }, + { + "epoch": 2.0728922716627634, + "grad_norm": 0.9445935487747192, + "learning_rate": 3.688709733479695e-06, + "loss": 0.5461, + "step": 7081 + }, + { + "epoch": 2.073185011709602, + "grad_norm": 1.0203557014465332, + "learning_rate": 3.688370959844104e-06, + "loss": 0.5776, + "step": 7082 + }, + { + "epoch": 2.0734777517564402, + "grad_norm": 0.9908386468887329, + "learning_rate": 3.688032158014182e-06, + "loss": 0.571, + "step": 7083 + }, + { + "epoch": 2.0737704918032787, + "grad_norm": 0.9608181118965149, + "learning_rate": 3.68769332799797e-06, + "loss": 0.5648, + "step": 7084 + }, + { + "epoch": 2.074063231850117, + "grad_norm": 1.007238507270813, + "learning_rate": 3.6873544698035047e-06, + "loss": 0.591, + "step": 7085 + }, + { + "epoch": 2.0743559718969555, + "grad_norm": 0.9721879363059998, + "learning_rate": 3.6870155834388273e-06, + "loss": 0.6029, + "step": 7086 + }, + { + "epoch": 2.074648711943794, + "grad_norm": 0.9803626537322998, + "learning_rate": 3.6866766689119765e-06, + "loss": 0.6012, + "step": 7087 + }, + { + "epoch": 2.0749414519906324, + "grad_norm": 0.9725275635719299, + "learning_rate": 3.686337726230994e-06, + "loss": 0.5839, + "step": 7088 + }, + { + "epoch": 2.075234192037471, + "grad_norm": 0.9523600935935974, + "learning_rate": 3.6859987554039206e-06, + "loss": 0.5955, + "step": 7089 + }, + { + "epoch": 2.0755269320843093, + "grad_norm": 1.0097798109054565, + "learning_rate": 3.6856597564387993e-06, + "loss": 0.5685, + "step": 7090 + }, + { + "epoch": 2.0758196721311477, + "grad_norm": 0.9877152442932129, + "learning_rate": 3.685320729343673e-06, + "loss": 0.6027, + "step": 7091 + }, + { + "epoch": 2.076112412177986, + "grad_norm": 0.9744873046875, + "learning_rate": 3.6849816741265837e-06, + "loss": 0.5749, + "step": 7092 + }, + { + "epoch": 2.076405152224824, + "grad_norm": 0.9989364743232727, + "learning_rate": 3.684642590795576e-06, + "loss": 0.5545, + "step": 7093 + }, + { + "epoch": 2.0766978922716626, + "grad_norm": 0.9675361514091492, + "learning_rate": 3.684303479358697e-06, + "loss": 0.5647, + "step": 7094 + }, + { + "epoch": 2.076990632318501, + "grad_norm": 0.9704360961914062, + "learning_rate": 3.6839643398239887e-06, + "loss": 0.5571, + "step": 7095 + }, + { + "epoch": 2.0772833723653394, + "grad_norm": 0.9803658723831177, + "learning_rate": 3.683625172199501e-06, + "loss": 0.5995, + "step": 7096 + }, + { + "epoch": 2.077576112412178, + "grad_norm": 0.9381604790687561, + "learning_rate": 3.683285976493277e-06, + "loss": 0.5548, + "step": 7097 + }, + { + "epoch": 2.0778688524590163, + "grad_norm": 0.9396962523460388, + "learning_rate": 3.6829467527133666e-06, + "loss": 0.5721, + "step": 7098 + }, + { + "epoch": 2.0781615925058547, + "grad_norm": 0.9718005657196045, + "learning_rate": 3.682607500867817e-06, + "loss": 0.5276, + "step": 7099 + }, + { + "epoch": 2.078454332552693, + "grad_norm": 1.00523042678833, + "learning_rate": 3.682268220964677e-06, + "loss": 0.5444, + "step": 7100 + }, + { + "epoch": 2.0787470725995316, + "grad_norm": 0.9875917434692383, + "learning_rate": 3.6819289130119974e-06, + "loss": 0.5934, + "step": 7101 + }, + { + "epoch": 2.07903981264637, + "grad_norm": 0.9608770608901978, + "learning_rate": 3.681589577017826e-06, + "loss": 0.5868, + "step": 7102 + }, + { + "epoch": 2.0793325526932085, + "grad_norm": 0.9782458543777466, + "learning_rate": 3.6812502129902154e-06, + "loss": 0.5818, + "step": 7103 + }, + { + "epoch": 2.079625292740047, + "grad_norm": 1.0143951177597046, + "learning_rate": 3.680910820937216e-06, + "loss": 0.5758, + "step": 7104 + }, + { + "epoch": 2.0799180327868854, + "grad_norm": 0.9300653338432312, + "learning_rate": 3.6805714008668814e-06, + "loss": 0.5666, + "step": 7105 + }, + { + "epoch": 2.080210772833724, + "grad_norm": 1.0527687072753906, + "learning_rate": 3.680231952787263e-06, + "loss": 0.6331, + "step": 7106 + }, + { + "epoch": 2.0805035128805622, + "grad_norm": 0.988070547580719, + "learning_rate": 3.679892476706415e-06, + "loss": 0.5985, + "step": 7107 + }, + { + "epoch": 2.0807962529274007, + "grad_norm": 0.9630813598632812, + "learning_rate": 3.679552972632391e-06, + "loss": 0.5538, + "step": 7108 + }, + { + "epoch": 2.081088992974239, + "grad_norm": 0.9866492748260498, + "learning_rate": 3.679213440573246e-06, + "loss": 0.5819, + "step": 7109 + }, + { + "epoch": 2.081381733021077, + "grad_norm": 0.9820101261138916, + "learning_rate": 3.678873880537036e-06, + "loss": 0.6044, + "step": 7110 + }, + { + "epoch": 2.0816744730679155, + "grad_norm": 0.9667420983314514, + "learning_rate": 3.6785342925318167e-06, + "loss": 0.5889, + "step": 7111 + }, + { + "epoch": 2.081967213114754, + "grad_norm": 0.9898485541343689, + "learning_rate": 3.6781946765656447e-06, + "loss": 0.5601, + "step": 7112 + }, + { + "epoch": 2.0822599531615924, + "grad_norm": 0.9663949012756348, + "learning_rate": 3.6778550326465774e-06, + "loss": 0.5991, + "step": 7113 + }, + { + "epoch": 2.082552693208431, + "grad_norm": 0.9773242473602295, + "learning_rate": 3.6775153607826734e-06, + "loss": 0.6062, + "step": 7114 + }, + { + "epoch": 2.0828454332552693, + "grad_norm": 0.9805868268013, + "learning_rate": 3.6771756609819915e-06, + "loss": 0.5819, + "step": 7115 + }, + { + "epoch": 2.0831381733021077, + "grad_norm": 0.9971834421157837, + "learning_rate": 3.6768359332525906e-06, + "loss": 0.5849, + "step": 7116 + }, + { + "epoch": 2.083430913348946, + "grad_norm": 0.9514036178588867, + "learning_rate": 3.6764961776025317e-06, + "loss": 0.5648, + "step": 7117 + }, + { + "epoch": 2.0837236533957846, + "grad_norm": 1.0614371299743652, + "learning_rate": 3.676156394039874e-06, + "loss": 0.6001, + "step": 7118 + }, + { + "epoch": 2.084016393442623, + "grad_norm": 1.02883780002594, + "learning_rate": 3.67581658257268e-06, + "loss": 0.5735, + "step": 7119 + }, + { + "epoch": 2.0843091334894615, + "grad_norm": 0.9776039719581604, + "learning_rate": 3.675476743209012e-06, + "loss": 0.5882, + "step": 7120 + }, + { + "epoch": 2.0846018735363, + "grad_norm": 0.9568858742713928, + "learning_rate": 3.6751368759569327e-06, + "loss": 0.5886, + "step": 7121 + }, + { + "epoch": 2.0848946135831383, + "grad_norm": 1.0087634325027466, + "learning_rate": 3.674796980824505e-06, + "loss": 0.5842, + "step": 7122 + }, + { + "epoch": 2.0851873536299768, + "grad_norm": 0.9749506711959839, + "learning_rate": 3.674457057819793e-06, + "loss": 0.5561, + "step": 7123 + }, + { + "epoch": 2.085480093676815, + "grad_norm": 0.9845228791236877, + "learning_rate": 3.674117106950862e-06, + "loss": 0.5699, + "step": 7124 + }, + { + "epoch": 2.085772833723653, + "grad_norm": 1.003798246383667, + "learning_rate": 3.673777128225776e-06, + "loss": 0.617, + "step": 7125 + }, + { + "epoch": 2.0860655737704916, + "grad_norm": 0.9647496938705444, + "learning_rate": 3.673437121652604e-06, + "loss": 0.5889, + "step": 7126 + }, + { + "epoch": 2.08635831381733, + "grad_norm": 1.004215121269226, + "learning_rate": 3.6730970872394094e-06, + "loss": 0.541, + "step": 7127 + }, + { + "epoch": 2.0866510538641685, + "grad_norm": 1.0636128187179565, + "learning_rate": 3.6727570249942614e-06, + "loss": 0.5714, + "step": 7128 + }, + { + "epoch": 2.086943793911007, + "grad_norm": 0.9656686186790466, + "learning_rate": 3.6724169349252276e-06, + "loss": 0.5387, + "step": 7129 + }, + { + "epoch": 2.0872365339578454, + "grad_norm": 0.9199902415275574, + "learning_rate": 3.672076817040377e-06, + "loss": 0.5414, + "step": 7130 + }, + { + "epoch": 2.087529274004684, + "grad_norm": 0.9637983441352844, + "learning_rate": 3.6717366713477787e-06, + "loss": 0.5697, + "step": 7131 + }, + { + "epoch": 2.0878220140515222, + "grad_norm": 0.9597964882850647, + "learning_rate": 3.671396497855503e-06, + "loss": 0.5466, + "step": 7132 + }, + { + "epoch": 2.0881147540983607, + "grad_norm": 0.9947544932365417, + "learning_rate": 3.67105629657162e-06, + "loss": 0.5594, + "step": 7133 + }, + { + "epoch": 2.088407494145199, + "grad_norm": 1.0314284563064575, + "learning_rate": 3.6707160675042012e-06, + "loss": 0.5877, + "step": 7134 + }, + { + "epoch": 2.0887002341920375, + "grad_norm": 0.9395897388458252, + "learning_rate": 3.670375810661319e-06, + "loss": 0.5264, + "step": 7135 + }, + { + "epoch": 2.088992974238876, + "grad_norm": 0.9866520762443542, + "learning_rate": 3.6700355260510468e-06, + "loss": 0.5714, + "step": 7136 + }, + { + "epoch": 2.0892857142857144, + "grad_norm": 0.9603757858276367, + "learning_rate": 3.6696952136814555e-06, + "loss": 0.5873, + "step": 7137 + }, + { + "epoch": 2.089578454332553, + "grad_norm": 0.9860079288482666, + "learning_rate": 3.669354873560621e-06, + "loss": 0.6018, + "step": 7138 + }, + { + "epoch": 2.0898711943793913, + "grad_norm": 1.031542420387268, + "learning_rate": 3.6690145056966176e-06, + "loss": 0.6164, + "step": 7139 + }, + { + "epoch": 2.0901639344262297, + "grad_norm": 0.9444729685783386, + "learning_rate": 3.6686741100975198e-06, + "loss": 0.5561, + "step": 7140 + }, + { + "epoch": 2.0904566744730677, + "grad_norm": 0.9752890467643738, + "learning_rate": 3.6683336867714053e-06, + "loss": 0.6003, + "step": 7141 + }, + { + "epoch": 2.090749414519906, + "grad_norm": 0.9266863465309143, + "learning_rate": 3.6679932357263494e-06, + "loss": 0.5607, + "step": 7142 + }, + { + "epoch": 2.0910421545667446, + "grad_norm": 0.9711446762084961, + "learning_rate": 3.667652756970429e-06, + "loss": 0.5477, + "step": 7143 + }, + { + "epoch": 2.091334894613583, + "grad_norm": 0.9714831113815308, + "learning_rate": 3.6673122505117225e-06, + "loss": 0.5695, + "step": 7144 + }, + { + "epoch": 2.0916276346604215, + "grad_norm": 0.9745062589645386, + "learning_rate": 3.666971716358309e-06, + "loss": 0.5874, + "step": 7145 + }, + { + "epoch": 2.09192037470726, + "grad_norm": 1.0507619380950928, + "learning_rate": 3.6666311545182676e-06, + "loss": 0.5625, + "step": 7146 + }, + { + "epoch": 2.0922131147540983, + "grad_norm": 0.9451062679290771, + "learning_rate": 3.666290564999677e-06, + "loss": 0.5851, + "step": 7147 + }, + { + "epoch": 2.0925058548009368, + "grad_norm": 0.9316858649253845, + "learning_rate": 3.66594994781062e-06, + "loss": 0.5513, + "step": 7148 + }, + { + "epoch": 2.092798594847775, + "grad_norm": 0.9722456932067871, + "learning_rate": 3.6656093029591756e-06, + "loss": 0.5791, + "step": 7149 + }, + { + "epoch": 2.0930913348946136, + "grad_norm": 1.0296374559402466, + "learning_rate": 3.6652686304534263e-06, + "loss": 0.5893, + "step": 7150 + }, + { + "epoch": 2.093384074941452, + "grad_norm": 0.9850749969482422, + "learning_rate": 3.6649279303014558e-06, + "loss": 0.5707, + "step": 7151 + }, + { + "epoch": 2.0936768149882905, + "grad_norm": 1.0057984590530396, + "learning_rate": 3.664587202511346e-06, + "loss": 0.5841, + "step": 7152 + }, + { + "epoch": 2.093969555035129, + "grad_norm": 0.9286089539527893, + "learning_rate": 3.664246447091181e-06, + "loss": 0.5534, + "step": 7153 + }, + { + "epoch": 2.0942622950819674, + "grad_norm": 0.9839389324188232, + "learning_rate": 3.663905664049045e-06, + "loss": 0.5711, + "step": 7154 + }, + { + "epoch": 2.094555035128806, + "grad_norm": 1.090880274772644, + "learning_rate": 3.663564853393024e-06, + "loss": 0.5551, + "step": 7155 + }, + { + "epoch": 2.0948477751756442, + "grad_norm": 1.0464270114898682, + "learning_rate": 3.6632240151312028e-06, + "loss": 0.5575, + "step": 7156 + }, + { + "epoch": 2.0951405152224822, + "grad_norm": 0.9899533987045288, + "learning_rate": 3.6628831492716687e-06, + "loss": 0.5437, + "step": 7157 + }, + { + "epoch": 2.0954332552693207, + "grad_norm": 0.9791646599769592, + "learning_rate": 3.662542255822509e-06, + "loss": 0.6054, + "step": 7158 + }, + { + "epoch": 2.095725995316159, + "grad_norm": 1.076444387435913, + "learning_rate": 3.66220133479181e-06, + "loss": 0.5942, + "step": 7159 + }, + { + "epoch": 2.0960187353629975, + "grad_norm": 0.9959341287612915, + "learning_rate": 3.661860386187661e-06, + "loss": 0.5816, + "step": 7160 + }, + { + "epoch": 2.096311475409836, + "grad_norm": 1.040284276008606, + "learning_rate": 3.661519410018152e-06, + "loss": 0.5641, + "step": 7161 + }, + { + "epoch": 2.0966042154566744, + "grad_norm": 0.9532347321510315, + "learning_rate": 3.6611784062913713e-06, + "loss": 0.5639, + "step": 7162 + }, + { + "epoch": 2.096896955503513, + "grad_norm": 0.9679784774780273, + "learning_rate": 3.6608373750154104e-06, + "loss": 0.5887, + "step": 7163 + }, + { + "epoch": 2.0971896955503513, + "grad_norm": 0.9852205514907837, + "learning_rate": 3.6604963161983587e-06, + "loss": 0.5559, + "step": 7164 + }, + { + "epoch": 2.0974824355971897, + "grad_norm": 0.9667488932609558, + "learning_rate": 3.6601552298483097e-06, + "loss": 0.5627, + "step": 7165 + }, + { + "epoch": 2.097775175644028, + "grad_norm": 0.9677368402481079, + "learning_rate": 3.6598141159733545e-06, + "loss": 0.5937, + "step": 7166 + }, + { + "epoch": 2.0980679156908666, + "grad_norm": 1.0104990005493164, + "learning_rate": 3.659472974581587e-06, + "loss": 0.5587, + "step": 7167 + }, + { + "epoch": 2.098360655737705, + "grad_norm": 1.0240932703018188, + "learning_rate": 3.6591318056811004e-06, + "loss": 0.5767, + "step": 7168 + }, + { + "epoch": 2.0986533957845435, + "grad_norm": 0.9763134121894836, + "learning_rate": 3.658790609279989e-06, + "loss": 0.605, + "step": 7169 + }, + { + "epoch": 2.098946135831382, + "grad_norm": 0.9381488561630249, + "learning_rate": 3.6584493853863477e-06, + "loss": 0.5646, + "step": 7170 + }, + { + "epoch": 2.0992388758782203, + "grad_norm": 1.0381834506988525, + "learning_rate": 3.658108134008272e-06, + "loss": 0.5153, + "step": 7171 + }, + { + "epoch": 2.0995316159250583, + "grad_norm": 1.0130345821380615, + "learning_rate": 3.6577668551538585e-06, + "loss": 0.572, + "step": 7172 + }, + { + "epoch": 2.0998243559718968, + "grad_norm": 0.9841039180755615, + "learning_rate": 3.657425548831204e-06, + "loss": 0.6169, + "step": 7173 + }, + { + "epoch": 2.100117096018735, + "grad_norm": 0.984039843082428, + "learning_rate": 3.657084215048406e-06, + "loss": 0.6071, + "step": 7174 + }, + { + "epoch": 2.1004098360655736, + "grad_norm": 0.9869380593299866, + "learning_rate": 3.6567428538135624e-06, + "loss": 0.5852, + "step": 7175 + }, + { + "epoch": 2.100702576112412, + "grad_norm": 0.9955151081085205, + "learning_rate": 3.6564014651347722e-06, + "loss": 0.5826, + "step": 7176 + }, + { + "epoch": 2.1009953161592505, + "grad_norm": 0.9823437333106995, + "learning_rate": 3.656060049020136e-06, + "loss": 0.6045, + "step": 7177 + }, + { + "epoch": 2.101288056206089, + "grad_norm": 1.2912143468856812, + "learning_rate": 3.655718605477752e-06, + "loss": 0.5904, + "step": 7178 + }, + { + "epoch": 2.1015807962529274, + "grad_norm": 0.9861872792243958, + "learning_rate": 3.6553771345157224e-06, + "loss": 0.6165, + "step": 7179 + }, + { + "epoch": 2.101873536299766, + "grad_norm": 0.9834899306297302, + "learning_rate": 3.6550356361421485e-06, + "loss": 0.5508, + "step": 7180 + }, + { + "epoch": 2.1021662763466042, + "grad_norm": 0.9914081692695618, + "learning_rate": 3.6546941103651317e-06, + "loss": 0.5893, + "step": 7181 + }, + { + "epoch": 2.1024590163934427, + "grad_norm": 0.9511587619781494, + "learning_rate": 3.654352557192776e-06, + "loss": 0.5804, + "step": 7182 + }, + { + "epoch": 2.102751756440281, + "grad_norm": 0.986050009727478, + "learning_rate": 3.654010976633184e-06, + "loss": 0.5858, + "step": 7183 + }, + { + "epoch": 2.1030444964871196, + "grad_norm": 0.940452516078949, + "learning_rate": 3.653669368694459e-06, + "loss": 0.5914, + "step": 7184 + }, + { + "epoch": 2.103337236533958, + "grad_norm": 1.0081020593643188, + "learning_rate": 3.653327733384707e-06, + "loss": 0.5829, + "step": 7185 + }, + { + "epoch": 2.1036299765807964, + "grad_norm": 0.9730998277664185, + "learning_rate": 3.652986070712034e-06, + "loss": 0.5669, + "step": 7186 + }, + { + "epoch": 2.103922716627635, + "grad_norm": 0.9415112733840942, + "learning_rate": 3.652644380684544e-06, + "loss": 0.5209, + "step": 7187 + }, + { + "epoch": 2.1042154566744733, + "grad_norm": 0.9903334975242615, + "learning_rate": 3.6523026633103454e-06, + "loss": 0.5736, + "step": 7188 + }, + { + "epoch": 2.1045081967213113, + "grad_norm": 1.004311203956604, + "learning_rate": 3.651960918597544e-06, + "loss": 0.5976, + "step": 7189 + }, + { + "epoch": 2.1048009367681497, + "grad_norm": 0.9642108082771301, + "learning_rate": 3.6516191465542485e-06, + "loss": 0.5738, + "step": 7190 + }, + { + "epoch": 2.105093676814988, + "grad_norm": 1.0266457796096802, + "learning_rate": 3.6512773471885675e-06, + "loss": 0.5516, + "step": 7191 + }, + { + "epoch": 2.1053864168618266, + "grad_norm": 1.0319854021072388, + "learning_rate": 3.6509355205086104e-06, + "loss": 0.5852, + "step": 7192 + }, + { + "epoch": 2.105679156908665, + "grad_norm": 0.9910504817962646, + "learning_rate": 3.650593666522487e-06, + "loss": 0.5737, + "step": 7193 + }, + { + "epoch": 2.1059718969555035, + "grad_norm": 0.9813051819801331, + "learning_rate": 3.650251785238308e-06, + "loss": 0.5977, + "step": 7194 + }, + { + "epoch": 2.106264637002342, + "grad_norm": 0.9496029615402222, + "learning_rate": 3.6499098766641838e-06, + "loss": 0.5877, + "step": 7195 + }, + { + "epoch": 2.1065573770491803, + "grad_norm": 0.9747510552406311, + "learning_rate": 3.649567940808227e-06, + "loss": 0.606, + "step": 7196 + }, + { + "epoch": 2.1068501170960188, + "grad_norm": 0.9518154859542847, + "learning_rate": 3.6492259776785503e-06, + "loss": 0.5405, + "step": 7197 + }, + { + "epoch": 2.107142857142857, + "grad_norm": 0.9315703511238098, + "learning_rate": 3.648883987283266e-06, + "loss": 0.5685, + "step": 7198 + }, + { + "epoch": 2.1074355971896956, + "grad_norm": 1.0257079601287842, + "learning_rate": 3.648541969630489e-06, + "loss": 0.5801, + "step": 7199 + }, + { + "epoch": 2.107728337236534, + "grad_norm": 0.9913100600242615, + "learning_rate": 3.648199924728332e-06, + "loss": 0.6135, + "step": 7200 + }, + { + "epoch": 2.1080210772833725, + "grad_norm": 1.0125303268432617, + "learning_rate": 3.6478578525849125e-06, + "loss": 0.6136, + "step": 7201 + }, + { + "epoch": 2.108313817330211, + "grad_norm": 0.9639840126037598, + "learning_rate": 3.6475157532083448e-06, + "loss": 0.6021, + "step": 7202 + }, + { + "epoch": 2.1086065573770494, + "grad_norm": 1.0029587745666504, + "learning_rate": 3.6471736266067448e-06, + "loss": 0.5727, + "step": 7203 + }, + { + "epoch": 2.1088992974238874, + "grad_norm": 1.0313969850540161, + "learning_rate": 3.6468314727882304e-06, + "loss": 0.6001, + "step": 7204 + }, + { + "epoch": 2.109192037470726, + "grad_norm": 1.0454492568969727, + "learning_rate": 3.646489291760919e-06, + "loss": 0.5996, + "step": 7205 + }, + { + "epoch": 2.1094847775175642, + "grad_norm": 0.9598633646965027, + "learning_rate": 3.646147083532929e-06, + "loss": 0.5991, + "step": 7206 + }, + { + "epoch": 2.1097775175644027, + "grad_norm": 0.9479102492332458, + "learning_rate": 3.6458048481123794e-06, + "loss": 0.5987, + "step": 7207 + }, + { + "epoch": 2.110070257611241, + "grad_norm": 1.0341954231262207, + "learning_rate": 3.645462585507389e-06, + "loss": 0.5805, + "step": 7208 + }, + { + "epoch": 2.1103629976580796, + "grad_norm": 0.9701064229011536, + "learning_rate": 3.645120295726079e-06, + "loss": 0.5738, + "step": 7209 + }, + { + "epoch": 2.110655737704918, + "grad_norm": 1.0036945343017578, + "learning_rate": 3.6447779787765693e-06, + "loss": 0.5452, + "step": 7210 + }, + { + "epoch": 2.1109484777517564, + "grad_norm": 0.9730355739593506, + "learning_rate": 3.644435634666983e-06, + "loss": 0.5819, + "step": 7211 + }, + { + "epoch": 2.111241217798595, + "grad_norm": 0.9788405895233154, + "learning_rate": 3.6440932634054414e-06, + "loss": 0.5728, + "step": 7212 + }, + { + "epoch": 2.1115339578454333, + "grad_norm": 0.9745509624481201, + "learning_rate": 3.643750865000067e-06, + "loss": 0.5752, + "step": 7213 + }, + { + "epoch": 2.1118266978922717, + "grad_norm": 0.9683546423912048, + "learning_rate": 3.643408439458984e-06, + "loss": 0.5925, + "step": 7214 + }, + { + "epoch": 2.11211943793911, + "grad_norm": 1.012787938117981, + "learning_rate": 3.643065986790315e-06, + "loss": 0.5858, + "step": 7215 + }, + { + "epoch": 2.1124121779859486, + "grad_norm": 0.9988299608230591, + "learning_rate": 3.6427235070021867e-06, + "loss": 0.6182, + "step": 7216 + }, + { + "epoch": 2.112704918032787, + "grad_norm": 1.325468897819519, + "learning_rate": 3.6423810001027237e-06, + "loss": 0.5613, + "step": 7217 + }, + { + "epoch": 2.1129976580796255, + "grad_norm": 0.972466766834259, + "learning_rate": 3.642038466100052e-06, + "loss": 0.548, + "step": 7218 + }, + { + "epoch": 2.113290398126464, + "grad_norm": 0.9656795263290405, + "learning_rate": 3.641695905002298e-06, + "loss": 0.5787, + "step": 7219 + }, + { + "epoch": 2.113583138173302, + "grad_norm": 0.9745625853538513, + "learning_rate": 3.6413533168175893e-06, + "loss": 0.5697, + "step": 7220 + }, + { + "epoch": 2.1138758782201403, + "grad_norm": 1.0319740772247314, + "learning_rate": 3.641010701554054e-06, + "loss": 0.5585, + "step": 7221 + }, + { + "epoch": 2.1141686182669788, + "grad_norm": 0.9387715458869934, + "learning_rate": 3.6406680592198207e-06, + "loss": 0.5747, + "step": 7222 + }, + { + "epoch": 2.114461358313817, + "grad_norm": 0.9600690007209778, + "learning_rate": 3.6403253898230185e-06, + "loss": 0.5909, + "step": 7223 + }, + { + "epoch": 2.1147540983606556, + "grad_norm": 1.037073016166687, + "learning_rate": 3.6399826933717773e-06, + "loss": 0.6123, + "step": 7224 + }, + { + "epoch": 2.115046838407494, + "grad_norm": 1.0331727266311646, + "learning_rate": 3.639639969874228e-06, + "loss": 0.5496, + "step": 7225 + }, + { + "epoch": 2.1153395784543325, + "grad_norm": 1.018640398979187, + "learning_rate": 3.6392972193385013e-06, + "loss": 0.5616, + "step": 7226 + }, + { + "epoch": 2.115632318501171, + "grad_norm": 0.9906607270240784, + "learning_rate": 3.638954441772729e-06, + "loss": 0.5802, + "step": 7227 + }, + { + "epoch": 2.1159250585480094, + "grad_norm": 1.0753989219665527, + "learning_rate": 3.638611637185044e-06, + "loss": 0.6214, + "step": 7228 + }, + { + "epoch": 2.116217798594848, + "grad_norm": 1.0800228118896484, + "learning_rate": 3.6382688055835784e-06, + "loss": 0.5897, + "step": 7229 + }, + { + "epoch": 2.1165105386416863, + "grad_norm": 0.999691903591156, + "learning_rate": 3.637925946976467e-06, + "loss": 0.6053, + "step": 7230 + }, + { + "epoch": 2.1168032786885247, + "grad_norm": 0.9878100156784058, + "learning_rate": 3.6375830613718445e-06, + "loss": 0.5404, + "step": 7231 + }, + { + "epoch": 2.117096018735363, + "grad_norm": 0.9849600791931152, + "learning_rate": 3.637240148777845e-06, + "loss": 0.6077, + "step": 7232 + }, + { + "epoch": 2.1173887587822016, + "grad_norm": 1.1101908683776855, + "learning_rate": 3.6368972092026046e-06, + "loss": 0.583, + "step": 7233 + }, + { + "epoch": 2.11768149882904, + "grad_norm": 0.9903858304023743, + "learning_rate": 3.6365542426542595e-06, + "loss": 0.6099, + "step": 7234 + }, + { + "epoch": 2.1179742388758784, + "grad_norm": 1.0008820295333862, + "learning_rate": 3.6362112491409463e-06, + "loss": 0.5857, + "step": 7235 + }, + { + "epoch": 2.1182669789227164, + "grad_norm": 0.9992914199829102, + "learning_rate": 3.6358682286708032e-06, + "loss": 0.5412, + "step": 7236 + }, + { + "epoch": 2.118559718969555, + "grad_norm": 0.984912097454071, + "learning_rate": 3.6355251812519677e-06, + "loss": 0.5908, + "step": 7237 + }, + { + "epoch": 2.1188524590163933, + "grad_norm": 1.0168715715408325, + "learning_rate": 3.6351821068925796e-06, + "loss": 0.5811, + "step": 7238 + }, + { + "epoch": 2.1191451990632317, + "grad_norm": 1.0513423681259155, + "learning_rate": 3.6348390056007774e-06, + "loss": 0.5395, + "step": 7239 + }, + { + "epoch": 2.11943793911007, + "grad_norm": 1.0031956434249878, + "learning_rate": 3.6344958773847027e-06, + "loss": 0.5767, + "step": 7240 + }, + { + "epoch": 2.1197306791569086, + "grad_norm": 0.9595091342926025, + "learning_rate": 3.634152722252495e-06, + "loss": 0.594, + "step": 7241 + }, + { + "epoch": 2.120023419203747, + "grad_norm": 0.9878096580505371, + "learning_rate": 3.633809540212296e-06, + "loss": 0.5624, + "step": 7242 + }, + { + "epoch": 2.1203161592505855, + "grad_norm": 0.9656365513801575, + "learning_rate": 3.633466331272248e-06, + "loss": 0.5619, + "step": 7243 + }, + { + "epoch": 2.120608899297424, + "grad_norm": 1.0184600353240967, + "learning_rate": 3.6331230954404927e-06, + "loss": 0.5701, + "step": 7244 + }, + { + "epoch": 2.1209016393442623, + "grad_norm": 0.9573809504508972, + "learning_rate": 3.6327798327251753e-06, + "loss": 0.5485, + "step": 7245 + }, + { + "epoch": 2.121194379391101, + "grad_norm": 0.9872257709503174, + "learning_rate": 3.6324365431344375e-06, + "loss": 0.6051, + "step": 7246 + }, + { + "epoch": 2.121487119437939, + "grad_norm": 1.0416460037231445, + "learning_rate": 3.632093226676426e-06, + "loss": 0.5884, + "step": 7247 + }, + { + "epoch": 2.1217798594847777, + "grad_norm": 1.018636703491211, + "learning_rate": 3.6317498833592856e-06, + "loss": 0.5772, + "step": 7248 + }, + { + "epoch": 2.122072599531616, + "grad_norm": 1.045314073562622, + "learning_rate": 3.631406513191161e-06, + "loss": 0.586, + "step": 7249 + }, + { + "epoch": 2.1223653395784545, + "grad_norm": 1.0034180879592896, + "learning_rate": 3.6310631161801998e-06, + "loss": 0.5928, + "step": 7250 + }, + { + "epoch": 2.1226580796252925, + "grad_norm": 0.9977627396583557, + "learning_rate": 3.6307196923345485e-06, + "loss": 0.5849, + "step": 7251 + }, + { + "epoch": 2.122950819672131, + "grad_norm": 0.9674866795539856, + "learning_rate": 3.6303762416623557e-06, + "loss": 0.5636, + "step": 7252 + }, + { + "epoch": 2.1232435597189694, + "grad_norm": 1.0290703773498535, + "learning_rate": 3.630032764171769e-06, + "loss": 0.5681, + "step": 7253 + }, + { + "epoch": 2.123536299765808, + "grad_norm": 1.0101569890975952, + "learning_rate": 3.6296892598709386e-06, + "loss": 0.5787, + "step": 7254 + }, + { + "epoch": 2.1238290398126463, + "grad_norm": 0.977291464805603, + "learning_rate": 3.629345728768013e-06, + "loss": 0.5883, + "step": 7255 + }, + { + "epoch": 2.1241217798594847, + "grad_norm": 1.0005043745040894, + "learning_rate": 3.6290021708711424e-06, + "loss": 0.6105, + "step": 7256 + }, + { + "epoch": 2.124414519906323, + "grad_norm": 0.9827474355697632, + "learning_rate": 3.62865858618848e-06, + "loss": 0.5918, + "step": 7257 + }, + { + "epoch": 2.1247072599531616, + "grad_norm": 1.004011631011963, + "learning_rate": 3.6283149747281743e-06, + "loss": 0.5937, + "step": 7258 + }, + { + "epoch": 2.125, + "grad_norm": 0.9997082948684692, + "learning_rate": 3.6279713364983802e-06, + "loss": 0.5663, + "step": 7259 + }, + { + "epoch": 2.1252927400468384, + "grad_norm": 0.9803541898727417, + "learning_rate": 3.627627671507249e-06, + "loss": 0.5402, + "step": 7260 + }, + { + "epoch": 2.125585480093677, + "grad_norm": 0.9939888119697571, + "learning_rate": 3.627283979762934e-06, + "loss": 0.5688, + "step": 7261 + }, + { + "epoch": 2.1258782201405153, + "grad_norm": 0.9526798129081726, + "learning_rate": 3.6269402612735906e-06, + "loss": 0.5622, + "step": 7262 + }, + { + "epoch": 2.1261709601873537, + "grad_norm": 1.014593243598938, + "learning_rate": 3.626596516047373e-06, + "loss": 0.6049, + "step": 7263 + }, + { + "epoch": 2.126463700234192, + "grad_norm": 1.03151535987854, + "learning_rate": 3.6262527440924367e-06, + "loss": 0.5268, + "step": 7264 + }, + { + "epoch": 2.1267564402810306, + "grad_norm": 1.0645532608032227, + "learning_rate": 3.625908945416937e-06, + "loss": 0.566, + "step": 7265 + }, + { + "epoch": 2.127049180327869, + "grad_norm": 0.9672757387161255, + "learning_rate": 3.6255651200290314e-06, + "loss": 0.5491, + "step": 7266 + }, + { + "epoch": 2.1273419203747075, + "grad_norm": 0.9956492781639099, + "learning_rate": 3.6252212679368775e-06, + "loss": 0.5848, + "step": 7267 + }, + { + "epoch": 2.1276346604215455, + "grad_norm": 1.0512298345565796, + "learning_rate": 3.6248773891486334e-06, + "loss": 0.5918, + "step": 7268 + }, + { + "epoch": 2.127927400468384, + "grad_norm": 0.9751816987991333, + "learning_rate": 3.624533483672456e-06, + "loss": 0.6102, + "step": 7269 + }, + { + "epoch": 2.1282201405152223, + "grad_norm": 0.9454845190048218, + "learning_rate": 3.624189551516506e-06, + "loss": 0.5894, + "step": 7270 + }, + { + "epoch": 2.128512880562061, + "grad_norm": 0.9854128956794739, + "learning_rate": 3.6238455926889424e-06, + "loss": 0.5662, + "step": 7271 + }, + { + "epoch": 2.128805620608899, + "grad_norm": 1.0049911737442017, + "learning_rate": 3.623501607197927e-06, + "loss": 0.5886, + "step": 7272 + }, + { + "epoch": 2.1290983606557377, + "grad_norm": 0.9539970755577087, + "learning_rate": 3.6231575950516206e-06, + "loss": 0.571, + "step": 7273 + }, + { + "epoch": 2.129391100702576, + "grad_norm": 0.9568235278129578, + "learning_rate": 3.6228135562581836e-06, + "loss": 0.51, + "step": 7274 + }, + { + "epoch": 2.1296838407494145, + "grad_norm": 0.981055498123169, + "learning_rate": 3.6224694908257797e-06, + "loss": 0.5694, + "step": 7275 + }, + { + "epoch": 2.129976580796253, + "grad_norm": 1.0257797241210938, + "learning_rate": 3.6221253987625715e-06, + "loss": 0.5681, + "step": 7276 + }, + { + "epoch": 2.1302693208430914, + "grad_norm": 1.0219618082046509, + "learning_rate": 3.621781280076722e-06, + "loss": 0.65, + "step": 7277 + }, + { + "epoch": 2.13056206088993, + "grad_norm": 0.9622423052787781, + "learning_rate": 3.621437134776397e-06, + "loss": 0.5881, + "step": 7278 + }, + { + "epoch": 2.1308548009367683, + "grad_norm": 1.0410014390945435, + "learning_rate": 3.62109296286976e-06, + "loss": 0.5583, + "step": 7279 + }, + { + "epoch": 2.1311475409836067, + "grad_norm": 1.0247738361358643, + "learning_rate": 3.620748764364977e-06, + "loss": 0.5632, + "step": 7280 + }, + { + "epoch": 2.131440281030445, + "grad_norm": 0.9819615483283997, + "learning_rate": 3.6204045392702143e-06, + "loss": 0.5431, + "step": 7281 + }, + { + "epoch": 2.1317330210772836, + "grad_norm": 0.9656389951705933, + "learning_rate": 3.6200602875936387e-06, + "loss": 0.566, + "step": 7282 + }, + { + "epoch": 2.1320257611241216, + "grad_norm": 0.9591242074966431, + "learning_rate": 3.6197160093434174e-06, + "loss": 0.5413, + "step": 7283 + }, + { + "epoch": 2.13231850117096, + "grad_norm": 1.0451970100402832, + "learning_rate": 3.6193717045277188e-06, + "loss": 0.5625, + "step": 7284 + }, + { + "epoch": 2.1326112412177984, + "grad_norm": 0.9786076545715332, + "learning_rate": 3.6190273731547115e-06, + "loss": 0.5413, + "step": 7285 + }, + { + "epoch": 2.132903981264637, + "grad_norm": 1.0232231616973877, + "learning_rate": 3.618683015232565e-06, + "loss": 0.5833, + "step": 7286 + }, + { + "epoch": 2.1331967213114753, + "grad_norm": 1.000644326210022, + "learning_rate": 3.6183386307694484e-06, + "loss": 0.5599, + "step": 7287 + }, + { + "epoch": 2.1334894613583137, + "grad_norm": 1.0563980340957642, + "learning_rate": 3.6179942197735336e-06, + "loss": 0.5585, + "step": 7288 + }, + { + "epoch": 2.133782201405152, + "grad_norm": 1.0435330867767334, + "learning_rate": 3.6176497822529912e-06, + "loss": 0.5767, + "step": 7289 + }, + { + "epoch": 2.1340749414519906, + "grad_norm": 0.9631320834159851, + "learning_rate": 3.6173053182159922e-06, + "loss": 0.5282, + "step": 7290 + }, + { + "epoch": 2.134367681498829, + "grad_norm": 0.9727579355239868, + "learning_rate": 3.61696082767071e-06, + "loss": 0.5808, + "step": 7291 + }, + { + "epoch": 2.1346604215456675, + "grad_norm": 1.0440027713775635, + "learning_rate": 3.616616310625318e-06, + "loss": 0.6093, + "step": 7292 + }, + { + "epoch": 2.134953161592506, + "grad_norm": 1.0002707242965698, + "learning_rate": 3.61627176708799e-06, + "loss": 0.5653, + "step": 7293 + }, + { + "epoch": 2.1352459016393444, + "grad_norm": 0.9853366017341614, + "learning_rate": 3.615927197066899e-06, + "loss": 0.5711, + "step": 7294 + }, + { + "epoch": 2.135538641686183, + "grad_norm": 1.0017521381378174, + "learning_rate": 3.6155826005702215e-06, + "loss": 0.5928, + "step": 7295 + }, + { + "epoch": 2.1358313817330212, + "grad_norm": 1.0134588479995728, + "learning_rate": 3.615237977606132e-06, + "loss": 0.6271, + "step": 7296 + }, + { + "epoch": 2.1361241217798597, + "grad_norm": 1.0603445768356323, + "learning_rate": 3.614893328182807e-06, + "loss": 0.6046, + "step": 7297 + }, + { + "epoch": 2.136416861826698, + "grad_norm": 0.9630924463272095, + "learning_rate": 3.6145486523084243e-06, + "loss": 0.534, + "step": 7298 + }, + { + "epoch": 2.1367096018735365, + "grad_norm": 1.0095176696777344, + "learning_rate": 3.6142039499911606e-06, + "loss": 0.5491, + "step": 7299 + }, + { + "epoch": 2.1370023419203745, + "grad_norm": 0.9546595215797424, + "learning_rate": 3.6138592212391937e-06, + "loss": 0.5677, + "step": 7300 + }, + { + "epoch": 2.137295081967213, + "grad_norm": 0.9142847657203674, + "learning_rate": 3.6135144660607026e-06, + "loss": 0.5726, + "step": 7301 + }, + { + "epoch": 2.1375878220140514, + "grad_norm": 0.9979089498519897, + "learning_rate": 3.613169684463867e-06, + "loss": 0.5845, + "step": 7302 + }, + { + "epoch": 2.13788056206089, + "grad_norm": 1.032721757888794, + "learning_rate": 3.6128248764568675e-06, + "loss": 0.5719, + "step": 7303 + }, + { + "epoch": 2.1381733021077283, + "grad_norm": 1.0350817441940308, + "learning_rate": 3.6124800420478836e-06, + "loss": 0.5633, + "step": 7304 + }, + { + "epoch": 2.1384660421545667, + "grad_norm": 1.0558626651763916, + "learning_rate": 3.6121351812450968e-06, + "loss": 0.5929, + "step": 7305 + }, + { + "epoch": 2.138758782201405, + "grad_norm": 0.9465398192405701, + "learning_rate": 3.6117902940566886e-06, + "loss": 0.5844, + "step": 7306 + }, + { + "epoch": 2.1390515222482436, + "grad_norm": 1.085519552230835, + "learning_rate": 3.6114453804908426e-06, + "loss": 0.6017, + "step": 7307 + }, + { + "epoch": 2.139344262295082, + "grad_norm": 0.9458562135696411, + "learning_rate": 3.611100440555742e-06, + "loss": 0.5349, + "step": 7308 + }, + { + "epoch": 2.1396370023419204, + "grad_norm": 0.9493037462234497, + "learning_rate": 3.6107554742595697e-06, + "loss": 0.5538, + "step": 7309 + }, + { + "epoch": 2.139929742388759, + "grad_norm": 1.015548586845398, + "learning_rate": 3.6104104816105096e-06, + "loss": 0.6007, + "step": 7310 + }, + { + "epoch": 2.1402224824355973, + "grad_norm": 1.0054351091384888, + "learning_rate": 3.6100654626167482e-06, + "loss": 0.5594, + "step": 7311 + }, + { + "epoch": 2.1405152224824358, + "grad_norm": 1.0543748140335083, + "learning_rate": 3.60972041728647e-06, + "loss": 0.5929, + "step": 7312 + }, + { + "epoch": 2.140807962529274, + "grad_norm": 1.068104863166809, + "learning_rate": 3.6093753456278623e-06, + "loss": 0.6264, + "step": 7313 + }, + { + "epoch": 2.1411007025761126, + "grad_norm": 0.9433171153068542, + "learning_rate": 3.6090302476491112e-06, + "loss": 0.5234, + "step": 7314 + }, + { + "epoch": 2.1413934426229506, + "grad_norm": 0.9666671752929688, + "learning_rate": 3.608685123358404e-06, + "loss": 0.5682, + "step": 7315 + }, + { + "epoch": 2.141686182669789, + "grad_norm": 0.9535567760467529, + "learning_rate": 3.60833997276393e-06, + "loss": 0.5634, + "step": 7316 + }, + { + "epoch": 2.1419789227166275, + "grad_norm": 0.9689370393753052, + "learning_rate": 3.6079947958738766e-06, + "loss": 0.5952, + "step": 7317 + }, + { + "epoch": 2.142271662763466, + "grad_norm": 0.9805169701576233, + "learning_rate": 3.6076495926964338e-06, + "loss": 0.5912, + "step": 7318 + }, + { + "epoch": 2.1425644028103044, + "grad_norm": 1.0007420778274536, + "learning_rate": 3.607304363239792e-06, + "loss": 0.5806, + "step": 7319 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 0.9823973178863525, + "learning_rate": 3.6069591075121406e-06, + "loss": 0.5509, + "step": 7320 + }, + { + "epoch": 2.1431498829039812, + "grad_norm": 1.0526632070541382, + "learning_rate": 3.6066138255216727e-06, + "loss": 0.5436, + "step": 7321 + }, + { + "epoch": 2.1434426229508197, + "grad_norm": 1.0381447076797485, + "learning_rate": 3.6062685172765784e-06, + "loss": 0.5553, + "step": 7322 + }, + { + "epoch": 2.143735362997658, + "grad_norm": 1.0463775396347046, + "learning_rate": 3.6059231827850514e-06, + "loss": 0.5687, + "step": 7323 + }, + { + "epoch": 2.1440281030444965, + "grad_norm": 0.9609130620956421, + "learning_rate": 3.6055778220552846e-06, + "loss": 0.5432, + "step": 7324 + }, + { + "epoch": 2.144320843091335, + "grad_norm": 0.951931893825531, + "learning_rate": 3.6052324350954715e-06, + "loss": 0.5564, + "step": 7325 + }, + { + "epoch": 2.1446135831381734, + "grad_norm": 0.9842849373817444, + "learning_rate": 3.604887021913806e-06, + "loss": 0.6142, + "step": 7326 + }, + { + "epoch": 2.144906323185012, + "grad_norm": 0.9450511336326599, + "learning_rate": 3.604541582518484e-06, + "loss": 0.6054, + "step": 7327 + }, + { + "epoch": 2.1451990632318503, + "grad_norm": 0.9450120329856873, + "learning_rate": 3.604196116917701e-06, + "loss": 0.548, + "step": 7328 + }, + { + "epoch": 2.1454918032786887, + "grad_norm": 0.9666523337364197, + "learning_rate": 3.6038506251196526e-06, + "loss": 0.5315, + "step": 7329 + }, + { + "epoch": 2.1457845433255267, + "grad_norm": 0.9134855270385742, + "learning_rate": 3.603505107132536e-06, + "loss": 0.5503, + "step": 7330 + }, + { + "epoch": 2.146077283372365, + "grad_norm": 0.9951311349868774, + "learning_rate": 3.6031595629645496e-06, + "loss": 0.6091, + "step": 7331 + }, + { + "epoch": 2.1463700234192036, + "grad_norm": 1.0273849964141846, + "learning_rate": 3.6028139926238897e-06, + "loss": 0.5947, + "step": 7332 + }, + { + "epoch": 2.146662763466042, + "grad_norm": 0.9734192490577698, + "learning_rate": 3.6024683961187557e-06, + "loss": 0.5701, + "step": 7333 + }, + { + "epoch": 2.1469555035128804, + "grad_norm": 0.990711510181427, + "learning_rate": 3.6021227734573484e-06, + "loss": 0.5709, + "step": 7334 + }, + { + "epoch": 2.147248243559719, + "grad_norm": 0.9775105118751526, + "learning_rate": 3.6017771246478654e-06, + "loss": 0.5775, + "step": 7335 + }, + { + "epoch": 2.1475409836065573, + "grad_norm": 0.9815149903297424, + "learning_rate": 3.601431449698509e-06, + "loss": 0.5791, + "step": 7336 + }, + { + "epoch": 2.1478337236533958, + "grad_norm": 1.0007522106170654, + "learning_rate": 3.60108574861748e-06, + "loss": 0.5457, + "step": 7337 + }, + { + "epoch": 2.148126463700234, + "grad_norm": 1.0358264446258545, + "learning_rate": 3.6007400214129797e-06, + "loss": 0.5948, + "step": 7338 + }, + { + "epoch": 2.1484192037470726, + "grad_norm": 0.9740053415298462, + "learning_rate": 3.6003942680932107e-06, + "loss": 0.5638, + "step": 7339 + }, + { + "epoch": 2.148711943793911, + "grad_norm": 1.0439279079437256, + "learning_rate": 3.600048488666377e-06, + "loss": 0.5709, + "step": 7340 + }, + { + "epoch": 2.1490046838407495, + "grad_norm": 0.9962313175201416, + "learning_rate": 3.5997026831406813e-06, + "loss": 0.5572, + "step": 7341 + }, + { + "epoch": 2.149297423887588, + "grad_norm": 0.9629747867584229, + "learning_rate": 3.599356851524327e-06, + "loss": 0.5445, + "step": 7342 + }, + { + "epoch": 2.1495901639344264, + "grad_norm": 0.9213441610336304, + "learning_rate": 3.5990109938255214e-06, + "loss": 0.5016, + "step": 7343 + }, + { + "epoch": 2.149882903981265, + "grad_norm": 1.0061954259872437, + "learning_rate": 3.5986651100524693e-06, + "loss": 0.5626, + "step": 7344 + }, + { + "epoch": 2.1501756440281032, + "grad_norm": 0.9716992974281311, + "learning_rate": 3.5983192002133753e-06, + "loss": 0.5819, + "step": 7345 + }, + { + "epoch": 2.1504683840749417, + "grad_norm": 0.9984675645828247, + "learning_rate": 3.597973264316448e-06, + "loss": 0.5713, + "step": 7346 + }, + { + "epoch": 2.1507611241217797, + "grad_norm": 1.0068068504333496, + "learning_rate": 3.5976273023698936e-06, + "loss": 0.6015, + "step": 7347 + }, + { + "epoch": 2.151053864168618, + "grad_norm": 0.9558974504470825, + "learning_rate": 3.597281314381921e-06, + "loss": 0.5873, + "step": 7348 + }, + { + "epoch": 2.1513466042154565, + "grad_norm": 1.1112838983535767, + "learning_rate": 3.596935300360738e-06, + "loss": 0.571, + "step": 7349 + }, + { + "epoch": 2.151639344262295, + "grad_norm": 0.9784814715385437, + "learning_rate": 3.5965892603145546e-06, + "loss": 0.5145, + "step": 7350 + }, + { + "epoch": 2.1519320843091334, + "grad_norm": 1.033717393875122, + "learning_rate": 3.5962431942515804e-06, + "loss": 0.5948, + "step": 7351 + }, + { + "epoch": 2.152224824355972, + "grad_norm": 0.9950569868087769, + "learning_rate": 3.595897102180025e-06, + "loss": 0.6185, + "step": 7352 + }, + { + "epoch": 2.1525175644028103, + "grad_norm": 0.9905176162719727, + "learning_rate": 3.5955509841081005e-06, + "loss": 0.5662, + "step": 7353 + }, + { + "epoch": 2.1528103044496487, + "grad_norm": 0.9511165618896484, + "learning_rate": 3.595204840044019e-06, + "loss": 0.5955, + "step": 7354 + }, + { + "epoch": 2.153103044496487, + "grad_norm": 0.928272545337677, + "learning_rate": 3.594858669995992e-06, + "loss": 0.5653, + "step": 7355 + }, + { + "epoch": 2.1533957845433256, + "grad_norm": 1.1080414056777954, + "learning_rate": 3.5945124739722328e-06, + "loss": 0.5993, + "step": 7356 + }, + { + "epoch": 2.153688524590164, + "grad_norm": 0.982119619846344, + "learning_rate": 3.5941662519809547e-06, + "loss": 0.5976, + "step": 7357 + }, + { + "epoch": 2.1539812646370025, + "grad_norm": 0.996813952922821, + "learning_rate": 3.5938200040303716e-06, + "loss": 0.5769, + "step": 7358 + }, + { + "epoch": 2.154274004683841, + "grad_norm": 0.9529367089271545, + "learning_rate": 3.5934737301287005e-06, + "loss": 0.5907, + "step": 7359 + }, + { + "epoch": 2.1545667447306793, + "grad_norm": 0.9728191494941711, + "learning_rate": 3.5931274302841534e-06, + "loss": 0.589, + "step": 7360 + }, + { + "epoch": 2.1548594847775178, + "grad_norm": 0.9419677257537842, + "learning_rate": 3.5927811045049483e-06, + "loss": 0.5263, + "step": 7361 + }, + { + "epoch": 2.1551522248243558, + "grad_norm": 0.9930046796798706, + "learning_rate": 3.5924347527993013e-06, + "loss": 0.5874, + "step": 7362 + }, + { + "epoch": 2.155444964871194, + "grad_norm": 0.930590033531189, + "learning_rate": 3.5920883751754304e-06, + "loss": 0.5627, + "step": 7363 + }, + { + "epoch": 2.1557377049180326, + "grad_norm": 0.9749133586883545, + "learning_rate": 3.5917419716415535e-06, + "loss": 0.5784, + "step": 7364 + }, + { + "epoch": 2.156030444964871, + "grad_norm": 0.9595656991004944, + "learning_rate": 3.5913955422058876e-06, + "loss": 0.5537, + "step": 7365 + }, + { + "epoch": 2.1563231850117095, + "grad_norm": 0.9546632766723633, + "learning_rate": 3.5910490868766532e-06, + "loss": 0.5249, + "step": 7366 + }, + { + "epoch": 2.156615925058548, + "grad_norm": 0.9906547665596008, + "learning_rate": 3.5907026056620696e-06, + "loss": 0.5373, + "step": 7367 + }, + { + "epoch": 2.1569086651053864, + "grad_norm": 0.9474034905433655, + "learning_rate": 3.5903560985703574e-06, + "loss": 0.5557, + "step": 7368 + }, + { + "epoch": 2.157201405152225, + "grad_norm": 0.9695624113082886, + "learning_rate": 3.5900095656097373e-06, + "loss": 0.5383, + "step": 7369 + }, + { + "epoch": 2.1574941451990632, + "grad_norm": 1.0290474891662598, + "learning_rate": 3.58966300678843e-06, + "loss": 0.5831, + "step": 7370 + }, + { + "epoch": 2.1577868852459017, + "grad_norm": 1.0180538892745972, + "learning_rate": 3.589316422114659e-06, + "loss": 0.5448, + "step": 7371 + }, + { + "epoch": 2.15807962529274, + "grad_norm": 1.043686032295227, + "learning_rate": 3.588969811596647e-06, + "loss": 0.5893, + "step": 7372 + }, + { + "epoch": 2.1583723653395785, + "grad_norm": 0.9380199313163757, + "learning_rate": 3.5886231752426166e-06, + "loss": 0.5393, + "step": 7373 + }, + { + "epoch": 2.158665105386417, + "grad_norm": 1.0069375038146973, + "learning_rate": 3.5882765130607922e-06, + "loss": 0.5586, + "step": 7374 + }, + { + "epoch": 2.1589578454332554, + "grad_norm": 0.9630095958709717, + "learning_rate": 3.5879298250593987e-06, + "loss": 0.599, + "step": 7375 + }, + { + "epoch": 2.159250585480094, + "grad_norm": 0.9561642408370972, + "learning_rate": 3.5875831112466607e-06, + "loss": 0.5722, + "step": 7376 + }, + { + "epoch": 2.1595433255269323, + "grad_norm": 0.9704912304878235, + "learning_rate": 3.5872363716308045e-06, + "loss": 0.5754, + "step": 7377 + }, + { + "epoch": 2.1598360655737707, + "grad_norm": 1.033447265625, + "learning_rate": 3.5868896062200563e-06, + "loss": 0.5898, + "step": 7378 + }, + { + "epoch": 2.1601288056206087, + "grad_norm": 0.9782422780990601, + "learning_rate": 3.5865428150226444e-06, + "loss": 0.6124, + "step": 7379 + }, + { + "epoch": 2.160421545667447, + "grad_norm": 0.9284066557884216, + "learning_rate": 3.586195998046795e-06, + "loss": 0.5433, + "step": 7380 + }, + { + "epoch": 2.1607142857142856, + "grad_norm": 0.9883168935775757, + "learning_rate": 3.5858491553007362e-06, + "loss": 0.6109, + "step": 7381 + }, + { + "epoch": 2.161007025761124, + "grad_norm": 0.992535412311554, + "learning_rate": 3.585502286792698e-06, + "loss": 0.5865, + "step": 7382 + }, + { + "epoch": 2.1612997658079625, + "grad_norm": 0.9670044183731079, + "learning_rate": 3.5851553925309096e-06, + "loss": 0.5957, + "step": 7383 + }, + { + "epoch": 2.161592505854801, + "grad_norm": 1.033115267753601, + "learning_rate": 3.584808472523601e-06, + "loss": 0.5824, + "step": 7384 + }, + { + "epoch": 2.1618852459016393, + "grad_norm": 0.9304800629615784, + "learning_rate": 3.5844615267790032e-06, + "loss": 0.5317, + "step": 7385 + }, + { + "epoch": 2.1621779859484778, + "grad_norm": 0.9723150730133057, + "learning_rate": 3.584114555305347e-06, + "loss": 0.5294, + "step": 7386 + }, + { + "epoch": 2.162470725995316, + "grad_norm": 1.0051686763763428, + "learning_rate": 3.5837675581108645e-06, + "loss": 0.6299, + "step": 7387 + }, + { + "epoch": 2.1627634660421546, + "grad_norm": 0.987203061580658, + "learning_rate": 3.583420535203789e-06, + "loss": 0.5683, + "step": 7388 + }, + { + "epoch": 2.163056206088993, + "grad_norm": 0.9379174113273621, + "learning_rate": 3.5830734865923532e-06, + "loss": 0.5501, + "step": 7389 + }, + { + "epoch": 2.1633489461358315, + "grad_norm": 0.9152323007583618, + "learning_rate": 3.5827264122847904e-06, + "loss": 0.4894, + "step": 7390 + }, + { + "epoch": 2.16364168618267, + "grad_norm": 1.1138808727264404, + "learning_rate": 3.582379312289335e-06, + "loss": 0.5912, + "step": 7391 + }, + { + "epoch": 2.1639344262295084, + "grad_norm": 0.9629195332527161, + "learning_rate": 3.5820321866142232e-06, + "loss": 0.5826, + "step": 7392 + }, + { + "epoch": 2.164227166276347, + "grad_norm": 0.9455024003982544, + "learning_rate": 3.5816850352676895e-06, + "loss": 0.5255, + "step": 7393 + }, + { + "epoch": 2.164519906323185, + "grad_norm": 0.9407024383544922, + "learning_rate": 3.5813378582579705e-06, + "loss": 0.5748, + "step": 7394 + }, + { + "epoch": 2.1648126463700232, + "grad_norm": 0.9670169949531555, + "learning_rate": 3.580990655593304e-06, + "loss": 0.5828, + "step": 7395 + }, + { + "epoch": 2.1651053864168617, + "grad_norm": 0.9919554591178894, + "learning_rate": 3.5806434272819258e-06, + "loss": 0.5788, + "step": 7396 + }, + { + "epoch": 2.1653981264637, + "grad_norm": 0.9791115522384644, + "learning_rate": 3.5802961733320744e-06, + "loss": 0.5915, + "step": 7397 + }, + { + "epoch": 2.1656908665105385, + "grad_norm": 0.9290021657943726, + "learning_rate": 3.5799488937519887e-06, + "loss": 0.543, + "step": 7398 + }, + { + "epoch": 2.165983606557377, + "grad_norm": 0.9828358292579651, + "learning_rate": 3.5796015885499084e-06, + "loss": 0.5371, + "step": 7399 + }, + { + "epoch": 2.1662763466042154, + "grad_norm": 1.017682433128357, + "learning_rate": 3.5792542577340732e-06, + "loss": 0.6458, + "step": 7400 + }, + { + "epoch": 2.166569086651054, + "grad_norm": 1.0131369829177856, + "learning_rate": 3.578906901312723e-06, + "loss": 0.6136, + "step": 7401 + }, + { + "epoch": 2.1668618266978923, + "grad_norm": 0.9759924411773682, + "learning_rate": 3.5785595192941e-06, + "loss": 0.5627, + "step": 7402 + }, + { + "epoch": 2.1671545667447307, + "grad_norm": 1.0119348764419556, + "learning_rate": 3.5782121116864443e-06, + "loss": 0.5676, + "step": 7403 + }, + { + "epoch": 2.167447306791569, + "grad_norm": 1.027796983718872, + "learning_rate": 3.577864678498e-06, + "loss": 0.5819, + "step": 7404 + }, + { + "epoch": 2.1677400468384076, + "grad_norm": 0.9950904846191406, + "learning_rate": 3.5775172197370084e-06, + "loss": 0.5964, + "step": 7405 + }, + { + "epoch": 2.168032786885246, + "grad_norm": 1.0254303216934204, + "learning_rate": 3.577169735411714e-06, + "loss": 0.6043, + "step": 7406 + }, + { + "epoch": 2.1683255269320845, + "grad_norm": 1.0000373125076294, + "learning_rate": 3.57682222553036e-06, + "loss": 0.6223, + "step": 7407 + }, + { + "epoch": 2.168618266978923, + "grad_norm": 0.9482301473617554, + "learning_rate": 3.5764746901011928e-06, + "loss": 0.5542, + "step": 7408 + }, + { + "epoch": 2.168911007025761, + "grad_norm": 0.9112817645072937, + "learning_rate": 3.576127129132456e-06, + "loss": 0.5084, + "step": 7409 + }, + { + "epoch": 2.1692037470725993, + "grad_norm": 0.958792507648468, + "learning_rate": 3.575779542632397e-06, + "loss": 0.5555, + "step": 7410 + }, + { + "epoch": 2.1694964871194378, + "grad_norm": 0.9976941347122192, + "learning_rate": 3.575431930609261e-06, + "loss": 0.5938, + "step": 7411 + }, + { + "epoch": 2.169789227166276, + "grad_norm": 0.9715577960014343, + "learning_rate": 3.5750842930712964e-06, + "loss": 0.6264, + "step": 7412 + }, + { + "epoch": 2.1700819672131146, + "grad_norm": 0.9701750874519348, + "learning_rate": 3.57473663002675e-06, + "loss": 0.5783, + "step": 7413 + }, + { + "epoch": 2.170374707259953, + "grad_norm": 0.9344435930252075, + "learning_rate": 3.5743889414838706e-06, + "loss": 0.5455, + "step": 7414 + }, + { + "epoch": 2.1706674473067915, + "grad_norm": 0.9433133006095886, + "learning_rate": 3.574041227450908e-06, + "loss": 0.5691, + "step": 7415 + }, + { + "epoch": 2.17096018735363, + "grad_norm": 0.9418923854827881, + "learning_rate": 3.5736934879361094e-06, + "loss": 0.5363, + "step": 7416 + }, + { + "epoch": 2.1712529274004684, + "grad_norm": 1.0163589715957642, + "learning_rate": 3.5733457229477276e-06, + "loss": 0.5742, + "step": 7417 + }, + { + "epoch": 2.171545667447307, + "grad_norm": 1.015493631362915, + "learning_rate": 3.572997932494012e-06, + "loss": 0.581, + "step": 7418 + }, + { + "epoch": 2.1718384074941453, + "grad_norm": 1.014727234840393, + "learning_rate": 3.5726501165832152e-06, + "loss": 0.6253, + "step": 7419 + }, + { + "epoch": 2.1721311475409837, + "grad_norm": 0.965804934501648, + "learning_rate": 3.572302275223587e-06, + "loss": 0.6095, + "step": 7420 + }, + { + "epoch": 2.172423887587822, + "grad_norm": 0.9467878937721252, + "learning_rate": 3.571954408423382e-06, + "loss": 0.5336, + "step": 7421 + }, + { + "epoch": 2.1727166276346606, + "grad_norm": 0.9756520986557007, + "learning_rate": 3.5716065161908524e-06, + "loss": 0.5474, + "step": 7422 + }, + { + "epoch": 2.173009367681499, + "grad_norm": 0.9943487644195557, + "learning_rate": 3.5712585985342524e-06, + "loss": 0.5827, + "step": 7423 + }, + { + "epoch": 2.1733021077283374, + "grad_norm": 1.0164223909378052, + "learning_rate": 3.570910655461836e-06, + "loss": 0.5887, + "step": 7424 + }, + { + "epoch": 2.173594847775176, + "grad_norm": 0.9921031594276428, + "learning_rate": 3.5705626869818595e-06, + "loss": 0.5583, + "step": 7425 + }, + { + "epoch": 2.173887587822014, + "grad_norm": 0.9785749912261963, + "learning_rate": 3.5702146931025765e-06, + "loss": 0.5378, + "step": 7426 + }, + { + "epoch": 2.1741803278688523, + "grad_norm": 1.0047820806503296, + "learning_rate": 3.569866673832245e-06, + "loss": 0.5692, + "step": 7427 + }, + { + "epoch": 2.1744730679156907, + "grad_norm": 0.9376400709152222, + "learning_rate": 3.569518629179121e-06, + "loss": 0.546, + "step": 7428 + }, + { + "epoch": 2.174765807962529, + "grad_norm": 1.0050668716430664, + "learning_rate": 3.5691705591514614e-06, + "loss": 0.6028, + "step": 7429 + }, + { + "epoch": 2.1750585480093676, + "grad_norm": 1.0059369802474976, + "learning_rate": 3.568822463757526e-06, + "loss": 0.5569, + "step": 7430 + }, + { + "epoch": 2.175351288056206, + "grad_norm": 0.9671723246574402, + "learning_rate": 3.5684743430055714e-06, + "loss": 0.5846, + "step": 7431 + }, + { + "epoch": 2.1756440281030445, + "grad_norm": 0.9954736828804016, + "learning_rate": 3.568126196903858e-06, + "loss": 0.5587, + "step": 7432 + }, + { + "epoch": 2.175936768149883, + "grad_norm": 0.961014449596405, + "learning_rate": 3.5677780254606455e-06, + "loss": 0.5507, + "step": 7433 + }, + { + "epoch": 2.1762295081967213, + "grad_norm": 1.005854606628418, + "learning_rate": 3.5674298286841935e-06, + "loss": 0.5805, + "step": 7434 + }, + { + "epoch": 2.1765222482435598, + "grad_norm": 0.9575056433677673, + "learning_rate": 3.567081606582765e-06, + "loss": 0.5505, + "step": 7435 + }, + { + "epoch": 2.176814988290398, + "grad_norm": 0.9756168127059937, + "learning_rate": 3.5667333591646197e-06, + "loss": 0.5179, + "step": 7436 + }, + { + "epoch": 2.1771077283372366, + "grad_norm": 1.0055177211761475, + "learning_rate": 3.5663850864380212e-06, + "loss": 0.606, + "step": 7437 + }, + { + "epoch": 2.177400468384075, + "grad_norm": 0.9798503518104553, + "learning_rate": 3.5660367884112305e-06, + "loss": 0.5409, + "step": 7438 + }, + { + "epoch": 2.1776932084309135, + "grad_norm": 0.9815241098403931, + "learning_rate": 3.565688465092513e-06, + "loss": 0.6088, + "step": 7439 + }, + { + "epoch": 2.177985948477752, + "grad_norm": 0.9861990213394165, + "learning_rate": 3.565340116490131e-06, + "loss": 0.5729, + "step": 7440 + }, + { + "epoch": 2.17827868852459, + "grad_norm": 0.9997943043708801, + "learning_rate": 3.5649917426123515e-06, + "loss": 0.5648, + "step": 7441 + }, + { + "epoch": 2.1785714285714284, + "grad_norm": 0.9903982877731323, + "learning_rate": 3.564643343467437e-06, + "loss": 0.6021, + "step": 7442 + }, + { + "epoch": 2.178864168618267, + "grad_norm": 0.9226956963539124, + "learning_rate": 3.564294919063655e-06, + "loss": 0.5219, + "step": 7443 + }, + { + "epoch": 2.1791569086651053, + "grad_norm": 0.9554672837257385, + "learning_rate": 3.5639464694092717e-06, + "loss": 0.5718, + "step": 7444 + }, + { + "epoch": 2.1794496487119437, + "grad_norm": 0.9315884113311768, + "learning_rate": 3.5635979945125542e-06, + "loss": 0.551, + "step": 7445 + }, + { + "epoch": 2.179742388758782, + "grad_norm": 1.0146626234054565, + "learning_rate": 3.56324949438177e-06, + "loss": 0.5793, + "step": 7446 + }, + { + "epoch": 2.1800351288056206, + "grad_norm": 1.008334755897522, + "learning_rate": 3.5629009690251863e-06, + "loss": 0.5901, + "step": 7447 + }, + { + "epoch": 2.180327868852459, + "grad_norm": 0.9725808501243591, + "learning_rate": 3.5625524184510734e-06, + "loss": 0.5608, + "step": 7448 + }, + { + "epoch": 2.1806206088992974, + "grad_norm": 1.041192889213562, + "learning_rate": 3.562203842667701e-06, + "loss": 0.601, + "step": 7449 + }, + { + "epoch": 2.180913348946136, + "grad_norm": 1.0150103569030762, + "learning_rate": 3.5618552416833374e-06, + "loss": 0.563, + "step": 7450 + }, + { + "epoch": 2.1812060889929743, + "grad_norm": 0.9826220273971558, + "learning_rate": 3.561506615506255e-06, + "loss": 0.5696, + "step": 7451 + }, + { + "epoch": 2.1814988290398127, + "grad_norm": 0.9744132161140442, + "learning_rate": 3.5611579641447235e-06, + "loss": 0.5484, + "step": 7452 + }, + { + "epoch": 2.181791569086651, + "grad_norm": 0.985689640045166, + "learning_rate": 3.5608092876070153e-06, + "loss": 0.5803, + "step": 7453 + }, + { + "epoch": 2.1820843091334896, + "grad_norm": 0.9317989349365234, + "learning_rate": 3.560460585901404e-06, + "loss": 0.5602, + "step": 7454 + }, + { + "epoch": 2.182377049180328, + "grad_norm": 1.0095245838165283, + "learning_rate": 3.5601118590361606e-06, + "loss": 0.5623, + "step": 7455 + }, + { + "epoch": 2.1826697892271665, + "grad_norm": 0.9851545095443726, + "learning_rate": 3.5597631070195604e-06, + "loss": 0.5634, + "step": 7456 + }, + { + "epoch": 2.182962529274005, + "grad_norm": 1.0353549718856812, + "learning_rate": 3.5594143298598765e-06, + "loss": 0.6185, + "step": 7457 + }, + { + "epoch": 2.183255269320843, + "grad_norm": 0.9888607859611511, + "learning_rate": 3.559065527565384e-06, + "loss": 0.557, + "step": 7458 + }, + { + "epoch": 2.1835480093676813, + "grad_norm": 0.9629978537559509, + "learning_rate": 3.5587167001443585e-06, + "loss": 0.5803, + "step": 7459 + }, + { + "epoch": 2.1838407494145198, + "grad_norm": 1.0102789402008057, + "learning_rate": 3.558367847605076e-06, + "loss": 0.5923, + "step": 7460 + }, + { + "epoch": 2.184133489461358, + "grad_norm": 0.9811878800392151, + "learning_rate": 3.5580189699558125e-06, + "loss": 0.5545, + "step": 7461 + }, + { + "epoch": 2.1844262295081966, + "grad_norm": 1.0161464214324951, + "learning_rate": 3.557670067204846e-06, + "loss": 0.5909, + "step": 7462 + }, + { + "epoch": 2.184718969555035, + "grad_norm": 0.9684824347496033, + "learning_rate": 3.557321139360454e-06, + "loss": 0.5691, + "step": 7463 + }, + { + "epoch": 2.1850117096018735, + "grad_norm": 0.9948074817657471, + "learning_rate": 3.556972186430915e-06, + "loss": 0.5877, + "step": 7464 + }, + { + "epoch": 2.185304449648712, + "grad_norm": 0.9913234114646912, + "learning_rate": 3.5566232084245074e-06, + "loss": 0.5561, + "step": 7465 + }, + { + "epoch": 2.1855971896955504, + "grad_norm": 1.0163891315460205, + "learning_rate": 3.5562742053495113e-06, + "loss": 0.56, + "step": 7466 + }, + { + "epoch": 2.185889929742389, + "grad_norm": 1.0483078956604004, + "learning_rate": 3.555925177214207e-06, + "loss": 0.5574, + "step": 7467 + }, + { + "epoch": 2.1861826697892273, + "grad_norm": 1.3157286643981934, + "learning_rate": 3.5555761240268743e-06, + "loss": 0.5808, + "step": 7468 + }, + { + "epoch": 2.1864754098360657, + "grad_norm": 1.0156440734863281, + "learning_rate": 3.555227045795796e-06, + "loss": 0.584, + "step": 7469 + }, + { + "epoch": 2.186768149882904, + "grad_norm": 1.041218876838684, + "learning_rate": 3.5548779425292527e-06, + "loss": 0.6173, + "step": 7470 + }, + { + "epoch": 2.1870608899297426, + "grad_norm": 0.9891075491905212, + "learning_rate": 3.5545288142355282e-06, + "loss": 0.6148, + "step": 7471 + }, + { + "epoch": 2.187353629976581, + "grad_norm": 0.9519846439361572, + "learning_rate": 3.5541796609229048e-06, + "loss": 0.5954, + "step": 7472 + }, + { + "epoch": 2.187646370023419, + "grad_norm": 0.9829005002975464, + "learning_rate": 3.5538304825996667e-06, + "loss": 0.5642, + "step": 7473 + }, + { + "epoch": 2.1879391100702574, + "grad_norm": 0.9872539043426514, + "learning_rate": 3.5534812792740976e-06, + "loss": 0.5559, + "step": 7474 + }, + { + "epoch": 2.188231850117096, + "grad_norm": 0.9651653170585632, + "learning_rate": 3.5531320509544824e-06, + "loss": 0.5287, + "step": 7475 + }, + { + "epoch": 2.1885245901639343, + "grad_norm": 0.9833613038063049, + "learning_rate": 3.5527827976491075e-06, + "loss": 0.6297, + "step": 7476 + }, + { + "epoch": 2.1888173302107727, + "grad_norm": 0.9680971503257751, + "learning_rate": 3.5524335193662585e-06, + "loss": 0.59, + "step": 7477 + }, + { + "epoch": 2.189110070257611, + "grad_norm": 0.967336118221283, + "learning_rate": 3.5520842161142217e-06, + "loss": 0.5551, + "step": 7478 + }, + { + "epoch": 2.1894028103044496, + "grad_norm": 0.9452588558197021, + "learning_rate": 3.551734887901285e-06, + "loss": 0.5962, + "step": 7479 + }, + { + "epoch": 2.189695550351288, + "grad_norm": 0.9492132067680359, + "learning_rate": 3.551385534735735e-06, + "loss": 0.525, + "step": 7480 + }, + { + "epoch": 2.1899882903981265, + "grad_norm": 0.9939126968383789, + "learning_rate": 3.5510361566258627e-06, + "loss": 0.5907, + "step": 7481 + }, + { + "epoch": 2.190281030444965, + "grad_norm": 0.9879752993583679, + "learning_rate": 3.550686753579955e-06, + "loss": 0.5628, + "step": 7482 + }, + { + "epoch": 2.1905737704918034, + "grad_norm": 0.9825909733772278, + "learning_rate": 3.550337325606302e-06, + "loss": 0.5679, + "step": 7483 + }, + { + "epoch": 2.190866510538642, + "grad_norm": 1.044297456741333, + "learning_rate": 3.549987872713194e-06, + "loss": 0.5442, + "step": 7484 + }, + { + "epoch": 2.1911592505854802, + "grad_norm": 0.9838974475860596, + "learning_rate": 3.5496383949089227e-06, + "loss": 0.5823, + "step": 7485 + }, + { + "epoch": 2.1914519906323187, + "grad_norm": 0.9362035393714905, + "learning_rate": 3.549288892201779e-06, + "loss": 0.5653, + "step": 7486 + }, + { + "epoch": 2.191744730679157, + "grad_norm": 1.0087311267852783, + "learning_rate": 3.5489393646000535e-06, + "loss": 0.6076, + "step": 7487 + }, + { + "epoch": 2.192037470725995, + "grad_norm": 0.9786613583564758, + "learning_rate": 3.548589812112041e-06, + "loss": 0.6157, + "step": 7488 + }, + { + "epoch": 2.1923302107728335, + "grad_norm": 0.9596601724624634, + "learning_rate": 3.5482402347460334e-06, + "loss": 0.5844, + "step": 7489 + }, + { + "epoch": 2.192622950819672, + "grad_norm": 0.9748948812484741, + "learning_rate": 3.5478906325103253e-06, + "loss": 0.5821, + "step": 7490 + }, + { + "epoch": 2.1929156908665104, + "grad_norm": 1.032564640045166, + "learning_rate": 3.5475410054132105e-06, + "loss": 0.5908, + "step": 7491 + }, + { + "epoch": 2.193208430913349, + "grad_norm": 1.0044875144958496, + "learning_rate": 3.5471913534629844e-06, + "loss": 0.6033, + "step": 7492 + }, + { + "epoch": 2.1935011709601873, + "grad_norm": 1.05475914478302, + "learning_rate": 3.5468416766679414e-06, + "loss": 0.6201, + "step": 7493 + }, + { + "epoch": 2.1937939110070257, + "grad_norm": 1.0189627408981323, + "learning_rate": 3.5464919750363784e-06, + "loss": 0.5793, + "step": 7494 + }, + { + "epoch": 2.194086651053864, + "grad_norm": 1.049872875213623, + "learning_rate": 3.546142248576593e-06, + "loss": 0.5478, + "step": 7495 + }, + { + "epoch": 2.1943793911007026, + "grad_norm": 0.9916800260543823, + "learning_rate": 3.5457924972968814e-06, + "loss": 0.5437, + "step": 7496 + }, + { + "epoch": 2.194672131147541, + "grad_norm": 0.9798120260238647, + "learning_rate": 3.545442721205542e-06, + "loss": 0.5373, + "step": 7497 + }, + { + "epoch": 2.1949648711943794, + "grad_norm": 0.9930229783058167, + "learning_rate": 3.545092920310873e-06, + "loss": 0.5797, + "step": 7498 + }, + { + "epoch": 2.195257611241218, + "grad_norm": 1.0473533868789673, + "learning_rate": 3.5447430946211737e-06, + "loss": 0.567, + "step": 7499 + }, + { + "epoch": 2.1955503512880563, + "grad_norm": 0.9684088230133057, + "learning_rate": 3.544393244144744e-06, + "loss": 0.5283, + "step": 7500 + }, + { + "epoch": 2.1958430913348947, + "grad_norm": 0.960017740726471, + "learning_rate": 3.544043368889884e-06, + "loss": 0.5377, + "step": 7501 + }, + { + "epoch": 2.196135831381733, + "grad_norm": 0.9681464433670044, + "learning_rate": 3.543693468864894e-06, + "loss": 0.5729, + "step": 7502 + }, + { + "epoch": 2.1964285714285716, + "grad_norm": 0.9246691465377808, + "learning_rate": 3.543343544078076e-06, + "loss": 0.5421, + "step": 7503 + }, + { + "epoch": 2.19672131147541, + "grad_norm": 1.0302518606185913, + "learning_rate": 3.5429935945377326e-06, + "loss": 0.5794, + "step": 7504 + }, + { + "epoch": 2.197014051522248, + "grad_norm": 1.0393420457839966, + "learning_rate": 3.542643620252165e-06, + "loss": 0.6013, + "step": 7505 + }, + { + "epoch": 2.1973067915690865, + "grad_norm": 1.0706408023834229, + "learning_rate": 3.5422936212296776e-06, + "loss": 0.5771, + "step": 7506 + }, + { + "epoch": 2.197599531615925, + "grad_norm": 0.9771839380264282, + "learning_rate": 3.5419435974785745e-06, + "loss": 0.5847, + "step": 7507 + }, + { + "epoch": 2.1978922716627634, + "grad_norm": 0.9190829992294312, + "learning_rate": 3.5415935490071586e-06, + "loss": 0.5571, + "step": 7508 + }, + { + "epoch": 2.198185011709602, + "grad_norm": 0.9906396865844727, + "learning_rate": 3.5412434758237356e-06, + "loss": 0.5872, + "step": 7509 + }, + { + "epoch": 2.1984777517564402, + "grad_norm": 0.9733998775482178, + "learning_rate": 3.5408933779366108e-06, + "loss": 0.5873, + "step": 7510 + }, + { + "epoch": 2.1987704918032787, + "grad_norm": 1.031256079673767, + "learning_rate": 3.5405432553540912e-06, + "loss": 0.5716, + "step": 7511 + }, + { + "epoch": 2.199063231850117, + "grad_norm": 0.9462408423423767, + "learning_rate": 3.540193108084483e-06, + "loss": 0.5414, + "step": 7512 + }, + { + "epoch": 2.1993559718969555, + "grad_norm": 0.946650505065918, + "learning_rate": 3.539842936136093e-06, + "loss": 0.6033, + "step": 7513 + }, + { + "epoch": 2.199648711943794, + "grad_norm": 1.074123501777649, + "learning_rate": 3.5394927395172294e-06, + "loss": 0.6065, + "step": 7514 + }, + { + "epoch": 2.1999414519906324, + "grad_norm": 1.0423530340194702, + "learning_rate": 3.5391425182362015e-06, + "loss": 0.618, + "step": 7515 + }, + { + "epoch": 2.200234192037471, + "grad_norm": 0.9567924737930298, + "learning_rate": 3.538792272301318e-06, + "loss": 0.5467, + "step": 7516 + }, + { + "epoch": 2.2005269320843093, + "grad_norm": 1.0007404088974, + "learning_rate": 3.538442001720888e-06, + "loss": 0.5796, + "step": 7517 + }, + { + "epoch": 2.2008196721311477, + "grad_norm": 0.9458216428756714, + "learning_rate": 3.5380917065032213e-06, + "loss": 0.5771, + "step": 7518 + }, + { + "epoch": 2.201112412177986, + "grad_norm": 0.9615421295166016, + "learning_rate": 3.537741386656629e-06, + "loss": 0.5137, + "step": 7519 + }, + { + "epoch": 2.201405152224824, + "grad_norm": 1.0117393732070923, + "learning_rate": 3.5373910421894242e-06, + "loss": 0.5948, + "step": 7520 + }, + { + "epoch": 2.2016978922716626, + "grad_norm": 0.9470860362052917, + "learning_rate": 3.537040673109917e-06, + "loss": 0.5593, + "step": 7521 + }, + { + "epoch": 2.201990632318501, + "grad_norm": 0.986939549446106, + "learning_rate": 3.5366902794264213e-06, + "loss": 0.5524, + "step": 7522 + }, + { + "epoch": 2.2022833723653394, + "grad_norm": 0.9131543636322021, + "learning_rate": 3.5363398611472486e-06, + "loss": 0.5575, + "step": 7523 + }, + { + "epoch": 2.202576112412178, + "grad_norm": 0.9781677722930908, + "learning_rate": 3.535989418280714e-06, + "loss": 0.571, + "step": 7524 + }, + { + "epoch": 2.2028688524590163, + "grad_norm": 0.9749685525894165, + "learning_rate": 3.535638950835132e-06, + "loss": 0.5846, + "step": 7525 + }, + { + "epoch": 2.2031615925058547, + "grad_norm": 0.9821906089782715, + "learning_rate": 3.535288458818816e-06, + "loss": 0.5689, + "step": 7526 + }, + { + "epoch": 2.203454332552693, + "grad_norm": 0.9938517212867737, + "learning_rate": 3.534937942240083e-06, + "loss": 0.5866, + "step": 7527 + }, + { + "epoch": 2.2037470725995316, + "grad_norm": 1.0422176122665405, + "learning_rate": 3.534587401107248e-06, + "loss": 0.5877, + "step": 7528 + }, + { + "epoch": 2.20403981264637, + "grad_norm": 0.9893345832824707, + "learning_rate": 3.534236835428628e-06, + "loss": 0.5834, + "step": 7529 + }, + { + "epoch": 2.2043325526932085, + "grad_norm": 0.9737350940704346, + "learning_rate": 3.533886245212541e-06, + "loss": 0.5663, + "step": 7530 + }, + { + "epoch": 2.204625292740047, + "grad_norm": 0.9710397720336914, + "learning_rate": 3.5335356304673047e-06, + "loss": 0.5981, + "step": 7531 + }, + { + "epoch": 2.2049180327868854, + "grad_norm": 0.9842720031738281, + "learning_rate": 3.533184991201236e-06, + "loss": 0.6127, + "step": 7532 + }, + { + "epoch": 2.205210772833724, + "grad_norm": 0.9917120933532715, + "learning_rate": 3.532834327422655e-06, + "loss": 0.5928, + "step": 7533 + }, + { + "epoch": 2.2055035128805622, + "grad_norm": 0.9827395081520081, + "learning_rate": 3.532483639139881e-06, + "loss": 0.5544, + "step": 7534 + }, + { + "epoch": 2.2057962529274007, + "grad_norm": 0.9204158186912537, + "learning_rate": 3.532132926361235e-06, + "loss": 0.5537, + "step": 7535 + }, + { + "epoch": 2.206088992974239, + "grad_norm": 0.9975022673606873, + "learning_rate": 3.5317821890950366e-06, + "loss": 0.5594, + "step": 7536 + }, + { + "epoch": 2.206381733021077, + "grad_norm": 0.979434609413147, + "learning_rate": 3.5314314273496077e-06, + "loss": 0.5817, + "step": 7537 + }, + { + "epoch": 2.2066744730679155, + "grad_norm": 0.9651575088500977, + "learning_rate": 3.53108064113327e-06, + "loss": 0.5547, + "step": 7538 + }, + { + "epoch": 2.206967213114754, + "grad_norm": 0.9594716429710388, + "learning_rate": 3.5307298304543454e-06, + "loss": 0.5931, + "step": 7539 + }, + { + "epoch": 2.2072599531615924, + "grad_norm": 0.9215472936630249, + "learning_rate": 3.5303789953211583e-06, + "loss": 0.5269, + "step": 7540 + }, + { + "epoch": 2.207552693208431, + "grad_norm": 0.9969272613525391, + "learning_rate": 3.530028135742031e-06, + "loss": 0.5423, + "step": 7541 + }, + { + "epoch": 2.2078454332552693, + "grad_norm": 0.9787754416465759, + "learning_rate": 3.5296772517252885e-06, + "loss": 0.5647, + "step": 7542 + }, + { + "epoch": 2.2081381733021077, + "grad_norm": 0.9812600612640381, + "learning_rate": 3.5293263432792555e-06, + "loss": 0.5521, + "step": 7543 + }, + { + "epoch": 2.208430913348946, + "grad_norm": 0.9580171704292297, + "learning_rate": 3.5289754104122577e-06, + "loss": 0.5814, + "step": 7544 + }, + { + "epoch": 2.2087236533957846, + "grad_norm": 1.0292719602584839, + "learning_rate": 3.5286244531326204e-06, + "loss": 0.5525, + "step": 7545 + }, + { + "epoch": 2.209016393442623, + "grad_norm": 0.9934138059616089, + "learning_rate": 3.52827347144867e-06, + "loss": 0.6074, + "step": 7546 + }, + { + "epoch": 2.2093091334894615, + "grad_norm": 0.9647626280784607, + "learning_rate": 3.527922465368734e-06, + "loss": 0.5851, + "step": 7547 + }, + { + "epoch": 2.2096018735363, + "grad_norm": 0.9837586283683777, + "learning_rate": 3.5275714349011404e-06, + "loss": 0.5491, + "step": 7548 + }, + { + "epoch": 2.2098946135831383, + "grad_norm": 0.9753843545913696, + "learning_rate": 3.527220380054216e-06, + "loss": 0.5472, + "step": 7549 + }, + { + "epoch": 2.2101873536299768, + "grad_norm": 0.9428646564483643, + "learning_rate": 3.5268693008362914e-06, + "loss": 0.5153, + "step": 7550 + }, + { + "epoch": 2.210480093676815, + "grad_norm": 0.9611625671386719, + "learning_rate": 3.5265181972556954e-06, + "loss": 0.5768, + "step": 7551 + }, + { + "epoch": 2.210772833723653, + "grad_norm": 1.07571542263031, + "learning_rate": 3.5261670693207583e-06, + "loss": 0.55, + "step": 7552 + }, + { + "epoch": 2.2110655737704916, + "grad_norm": 0.9437528252601624, + "learning_rate": 3.52581591703981e-06, + "loss": 0.5313, + "step": 7553 + }, + { + "epoch": 2.21135831381733, + "grad_norm": 1.0162365436553955, + "learning_rate": 3.5254647404211816e-06, + "loss": 0.6031, + "step": 7554 + }, + { + "epoch": 2.2116510538641685, + "grad_norm": 0.9623863697052002, + "learning_rate": 3.5251135394732056e-06, + "loss": 0.5648, + "step": 7555 + }, + { + "epoch": 2.211943793911007, + "grad_norm": 0.9793046116828918, + "learning_rate": 3.5247623142042137e-06, + "loss": 0.5795, + "step": 7556 + }, + { + "epoch": 2.2122365339578454, + "grad_norm": 1.0735472440719604, + "learning_rate": 3.52441106462254e-06, + "loss": 0.5756, + "step": 7557 + }, + { + "epoch": 2.212529274004684, + "grad_norm": 0.9784407615661621, + "learning_rate": 3.524059790736516e-06, + "loss": 0.5593, + "step": 7558 + }, + { + "epoch": 2.2128220140515222, + "grad_norm": 0.93964022397995, + "learning_rate": 3.523708492554477e-06, + "loss": 0.5741, + "step": 7559 + }, + { + "epoch": 2.2131147540983607, + "grad_norm": 0.9579123854637146, + "learning_rate": 3.523357170084757e-06, + "loss": 0.5542, + "step": 7560 + }, + { + "epoch": 2.213407494145199, + "grad_norm": 0.9425256252288818, + "learning_rate": 3.5230058233356923e-06, + "loss": 0.5524, + "step": 7561 + }, + { + "epoch": 2.2137002341920375, + "grad_norm": 0.9682066440582275, + "learning_rate": 3.522654452315617e-06, + "loss": 0.5587, + "step": 7562 + }, + { + "epoch": 2.213992974238876, + "grad_norm": 0.9529055953025818, + "learning_rate": 3.5223030570328692e-06, + "loss": 0.5572, + "step": 7563 + }, + { + "epoch": 2.2142857142857144, + "grad_norm": 0.9606208205223083, + "learning_rate": 3.5219516374957846e-06, + "loss": 0.5868, + "step": 7564 + }, + { + "epoch": 2.214578454332553, + "grad_norm": 0.994426965713501, + "learning_rate": 3.5216001937127008e-06, + "loss": 0.5867, + "step": 7565 + }, + { + "epoch": 2.2148711943793913, + "grad_norm": 0.9505156874656677, + "learning_rate": 3.521248725691956e-06, + "loss": 0.5425, + "step": 7566 + }, + { + "epoch": 2.2151639344262293, + "grad_norm": 1.0219844579696655, + "learning_rate": 3.52089723344189e-06, + "loss": 0.5678, + "step": 7567 + }, + { + "epoch": 2.2154566744730677, + "grad_norm": 1.0006412267684937, + "learning_rate": 3.52054571697084e-06, + "loss": 0.5596, + "step": 7568 + }, + { + "epoch": 2.215749414519906, + "grad_norm": 1.0473531484603882, + "learning_rate": 3.5201941762871474e-06, + "loss": 0.5288, + "step": 7569 + }, + { + "epoch": 2.2160421545667446, + "grad_norm": 0.96757972240448, + "learning_rate": 3.5198426113991514e-06, + "loss": 0.5842, + "step": 7570 + }, + { + "epoch": 2.216334894613583, + "grad_norm": 1.028480052947998, + "learning_rate": 3.5194910223151934e-06, + "loss": 0.6015, + "step": 7571 + }, + { + "epoch": 2.2166276346604215, + "grad_norm": 0.9958032369613647, + "learning_rate": 3.519139409043616e-06, + "loss": 0.5707, + "step": 7572 + }, + { + "epoch": 2.21692037470726, + "grad_norm": 0.9631739258766174, + "learning_rate": 3.5187877715927595e-06, + "loss": 0.5654, + "step": 7573 + }, + { + "epoch": 2.2172131147540983, + "grad_norm": 1.0256153345108032, + "learning_rate": 3.5184361099709675e-06, + "loss": 0.5953, + "step": 7574 + }, + { + "epoch": 2.2175058548009368, + "grad_norm": 1.0079213380813599, + "learning_rate": 3.5180844241865832e-06, + "loss": 0.5515, + "step": 7575 + }, + { + "epoch": 2.217798594847775, + "grad_norm": 0.9283998012542725, + "learning_rate": 3.51773271424795e-06, + "loss": 0.5305, + "step": 7576 + }, + { + "epoch": 2.2180913348946136, + "grad_norm": 1.1463178396224976, + "learning_rate": 3.5173809801634134e-06, + "loss": 0.5989, + "step": 7577 + }, + { + "epoch": 2.218384074941452, + "grad_norm": 1.0120903253555298, + "learning_rate": 3.517029221941317e-06, + "loss": 0.5849, + "step": 7578 + }, + { + "epoch": 2.2186768149882905, + "grad_norm": 0.9626523852348328, + "learning_rate": 3.5166774395900073e-06, + "loss": 0.5859, + "step": 7579 + }, + { + "epoch": 2.218969555035129, + "grad_norm": 0.9626777172088623, + "learning_rate": 3.5163256331178297e-06, + "loss": 0.5693, + "step": 7580 + }, + { + "epoch": 2.2192622950819674, + "grad_norm": 0.9541419744491577, + "learning_rate": 3.5159738025331314e-06, + "loss": 0.5482, + "step": 7581 + }, + { + "epoch": 2.219555035128806, + "grad_norm": 0.9329999685287476, + "learning_rate": 3.5156219478442597e-06, + "loss": 0.5783, + "step": 7582 + }, + { + "epoch": 2.2198477751756442, + "grad_norm": 1.043998122215271, + "learning_rate": 3.515270069059562e-06, + "loss": 0.5981, + "step": 7583 + }, + { + "epoch": 2.2201405152224822, + "grad_norm": 1.0298635959625244, + "learning_rate": 3.5149181661873866e-06, + "loss": 0.5842, + "step": 7584 + }, + { + "epoch": 2.2204332552693207, + "grad_norm": 1.0247914791107178, + "learning_rate": 3.5145662392360823e-06, + "loss": 0.5891, + "step": 7585 + }, + { + "epoch": 2.220725995316159, + "grad_norm": 1.0479671955108643, + "learning_rate": 3.5142142882139994e-06, + "loss": 0.5389, + "step": 7586 + }, + { + "epoch": 2.2210187353629975, + "grad_norm": 0.994642436504364, + "learning_rate": 3.513862313129488e-06, + "loss": 0.6134, + "step": 7587 + }, + { + "epoch": 2.221311475409836, + "grad_norm": 0.9545265436172485, + "learning_rate": 3.5135103139908985e-06, + "loss": 0.5707, + "step": 7588 + }, + { + "epoch": 2.2216042154566744, + "grad_norm": 0.9730107188224792, + "learning_rate": 3.5131582908065814e-06, + "loss": 0.6159, + "step": 7589 + }, + { + "epoch": 2.221896955503513, + "grad_norm": 0.9619428515434265, + "learning_rate": 3.5128062435848897e-06, + "loss": 0.5433, + "step": 7590 + }, + { + "epoch": 2.2221896955503513, + "grad_norm": 0.9795038104057312, + "learning_rate": 3.5124541723341753e-06, + "loss": 0.5968, + "step": 7591 + }, + { + "epoch": 2.2224824355971897, + "grad_norm": 0.9935131072998047, + "learning_rate": 3.512102077062791e-06, + "loss": 0.5878, + "step": 7592 + }, + { + "epoch": 2.222775175644028, + "grad_norm": 1.0161811113357544, + "learning_rate": 3.5117499577790908e-06, + "loss": 0.5624, + "step": 7593 + }, + { + "epoch": 2.2230679156908666, + "grad_norm": 0.9740325212478638, + "learning_rate": 3.511397814491428e-06, + "loss": 0.5758, + "step": 7594 + }, + { + "epoch": 2.223360655737705, + "grad_norm": 1.0150675773620605, + "learning_rate": 3.511045647208158e-06, + "loss": 0.5763, + "step": 7595 + }, + { + "epoch": 2.2236533957845435, + "grad_norm": 0.9246149659156799, + "learning_rate": 3.510693455937636e-06, + "loss": 0.5247, + "step": 7596 + }, + { + "epoch": 2.223946135831382, + "grad_norm": 0.9835729598999023, + "learning_rate": 3.510341240688217e-06, + "loss": 0.6318, + "step": 7597 + }, + { + "epoch": 2.2242388758782203, + "grad_norm": 0.9840874671936035, + "learning_rate": 3.509989001468259e-06, + "loss": 0.5803, + "step": 7598 + }, + { + "epoch": 2.2245316159250583, + "grad_norm": 1.023445725440979, + "learning_rate": 3.509636738286117e-06, + "loss": 0.5957, + "step": 7599 + }, + { + "epoch": 2.2248243559718968, + "grad_norm": 0.9928865432739258, + "learning_rate": 3.5092844511501493e-06, + "loss": 0.5612, + "step": 7600 + }, + { + "epoch": 2.225117096018735, + "grad_norm": 0.9888659119606018, + "learning_rate": 3.5089321400687144e-06, + "loss": 0.5761, + "step": 7601 + }, + { + "epoch": 2.2254098360655736, + "grad_norm": 0.9866379499435425, + "learning_rate": 3.5085798050501707e-06, + "loss": 0.5887, + "step": 7602 + }, + { + "epoch": 2.225702576112412, + "grad_norm": 0.9842953085899353, + "learning_rate": 3.5082274461028776e-06, + "loss": 0.5742, + "step": 7603 + }, + { + "epoch": 2.2259953161592505, + "grad_norm": 0.9800955057144165, + "learning_rate": 3.5078750632351937e-06, + "loss": 0.5477, + "step": 7604 + }, + { + "epoch": 2.226288056206089, + "grad_norm": 0.9810753464698792, + "learning_rate": 3.507522656455481e-06, + "loss": 0.5886, + "step": 7605 + }, + { + "epoch": 2.2265807962529274, + "grad_norm": 0.9515209197998047, + "learning_rate": 3.5071702257720997e-06, + "loss": 0.5711, + "step": 7606 + }, + { + "epoch": 2.226873536299766, + "grad_norm": 1.04191255569458, + "learning_rate": 3.5068177711934113e-06, + "loss": 0.594, + "step": 7607 + }, + { + "epoch": 2.2271662763466042, + "grad_norm": 0.9597405791282654, + "learning_rate": 3.5064652927277776e-06, + "loss": 0.5675, + "step": 7608 + }, + { + "epoch": 2.2274590163934427, + "grad_norm": 0.9430240988731384, + "learning_rate": 3.5061127903835617e-06, + "loss": 0.5454, + "step": 7609 + }, + { + "epoch": 2.227751756440281, + "grad_norm": 0.9745681285858154, + "learning_rate": 3.5057602641691265e-06, + "loss": 0.5543, + "step": 7610 + }, + { + "epoch": 2.2280444964871196, + "grad_norm": 0.9263864755630493, + "learning_rate": 3.5054077140928354e-06, + "loss": 0.5547, + "step": 7611 + }, + { + "epoch": 2.228337236533958, + "grad_norm": 1.017497181892395, + "learning_rate": 3.505055140163054e-06, + "loss": 0.6132, + "step": 7612 + }, + { + "epoch": 2.2286299765807964, + "grad_norm": 0.9732674956321716, + "learning_rate": 3.5047025423881454e-06, + "loss": 0.5718, + "step": 7613 + }, + { + "epoch": 2.228922716627635, + "grad_norm": 0.974694550037384, + "learning_rate": 3.5043499207764763e-06, + "loss": 0.5811, + "step": 7614 + }, + { + "epoch": 2.2292154566744733, + "grad_norm": 0.9426451325416565, + "learning_rate": 3.5039972753364125e-06, + "loss": 0.5772, + "step": 7615 + }, + { + "epoch": 2.2295081967213113, + "grad_norm": 1.003250241279602, + "learning_rate": 3.5036446060763207e-06, + "loss": 0.5716, + "step": 7616 + }, + { + "epoch": 2.2298009367681497, + "grad_norm": 0.9797603487968445, + "learning_rate": 3.503291913004568e-06, + "loss": 0.5748, + "step": 7617 + }, + { + "epoch": 2.230093676814988, + "grad_norm": 0.9843968152999878, + "learning_rate": 3.5029391961295213e-06, + "loss": 0.5692, + "step": 7618 + }, + { + "epoch": 2.2303864168618266, + "grad_norm": 1.0205854177474976, + "learning_rate": 3.502586455459549e-06, + "loss": 0.583, + "step": 7619 + }, + { + "epoch": 2.230679156908665, + "grad_norm": 0.9711542129516602, + "learning_rate": 3.5022336910030214e-06, + "loss": 0.5633, + "step": 7620 + }, + { + "epoch": 2.2309718969555035, + "grad_norm": 0.9659913182258606, + "learning_rate": 3.5018809027683066e-06, + "loss": 0.5806, + "step": 7621 + }, + { + "epoch": 2.231264637002342, + "grad_norm": 0.9627401828765869, + "learning_rate": 3.501528090763775e-06, + "loss": 0.5505, + "step": 7622 + }, + { + "epoch": 2.2315573770491803, + "grad_norm": 1.0165250301361084, + "learning_rate": 3.5011752549977966e-06, + "loss": 0.6066, + "step": 7623 + }, + { + "epoch": 2.2318501170960188, + "grad_norm": 0.9519540071487427, + "learning_rate": 3.500822395478743e-06, + "loss": 0.5956, + "step": 7624 + }, + { + "epoch": 2.232142857142857, + "grad_norm": 1.0134050846099854, + "learning_rate": 3.5004695122149856e-06, + "loss": 0.5965, + "step": 7625 + }, + { + "epoch": 2.2324355971896956, + "grad_norm": 0.9568564891815186, + "learning_rate": 3.500116605214897e-06, + "loss": 0.5675, + "step": 7626 + }, + { + "epoch": 2.232728337236534, + "grad_norm": 0.9729264974594116, + "learning_rate": 3.499763674486851e-06, + "loss": 0.5836, + "step": 7627 + }, + { + "epoch": 2.2330210772833725, + "grad_norm": 0.972906231880188, + "learning_rate": 3.4994107200392187e-06, + "loss": 0.5905, + "step": 7628 + }, + { + "epoch": 2.233313817330211, + "grad_norm": 0.972825825214386, + "learning_rate": 3.4990577418803747e-06, + "loss": 0.5849, + "step": 7629 + }, + { + "epoch": 2.2336065573770494, + "grad_norm": 1.0217337608337402, + "learning_rate": 3.498704740018694e-06, + "loss": 0.6157, + "step": 7630 + }, + { + "epoch": 2.2338992974238874, + "grad_norm": 0.9707293510437012, + "learning_rate": 3.4983517144625518e-06, + "loss": 0.5824, + "step": 7631 + }, + { + "epoch": 2.234192037470726, + "grad_norm": 0.9839479923248291, + "learning_rate": 3.4979986652203234e-06, + "loss": 0.5937, + "step": 7632 + }, + { + "epoch": 2.2344847775175642, + "grad_norm": 1.0169748067855835, + "learning_rate": 3.497645592300385e-06, + "loss": 0.5875, + "step": 7633 + }, + { + "epoch": 2.2347775175644027, + "grad_norm": 1.0218712091445923, + "learning_rate": 3.4972924957111133e-06, + "loss": 0.6002, + "step": 7634 + }, + { + "epoch": 2.235070257611241, + "grad_norm": 0.9428950548171997, + "learning_rate": 3.4969393754608847e-06, + "loss": 0.5717, + "step": 7635 + }, + { + "epoch": 2.2353629976580796, + "grad_norm": 1.0359208583831787, + "learning_rate": 3.496586231558079e-06, + "loss": 0.5828, + "step": 7636 + }, + { + "epoch": 2.235655737704918, + "grad_norm": 0.992748498916626, + "learning_rate": 3.4962330640110726e-06, + "loss": 0.6072, + "step": 7637 + }, + { + "epoch": 2.2359484777517564, + "grad_norm": 0.9274013638496399, + "learning_rate": 3.4958798728282463e-06, + "loss": 0.5286, + "step": 7638 + }, + { + "epoch": 2.236241217798595, + "grad_norm": 0.9816086888313293, + "learning_rate": 3.495526658017978e-06, + "loss": 0.581, + "step": 7639 + }, + { + "epoch": 2.2365339578454333, + "grad_norm": 1.0049699544906616, + "learning_rate": 3.4951734195886483e-06, + "loss": 0.5856, + "step": 7640 + }, + { + "epoch": 2.2368266978922717, + "grad_norm": 0.9954342842102051, + "learning_rate": 3.494820157548638e-06, + "loss": 0.5864, + "step": 7641 + }, + { + "epoch": 2.23711943793911, + "grad_norm": 1.000514030456543, + "learning_rate": 3.4944668719063286e-06, + "loss": 0.5652, + "step": 7642 + }, + { + "epoch": 2.2374121779859486, + "grad_norm": 0.9795773029327393, + "learning_rate": 3.4941135626701017e-06, + "loss": 0.5535, + "step": 7643 + }, + { + "epoch": 2.237704918032787, + "grad_norm": 1.061907172203064, + "learning_rate": 3.49376022984834e-06, + "loss": 0.5495, + "step": 7644 + }, + { + "epoch": 2.2379976580796255, + "grad_norm": 0.9650762677192688, + "learning_rate": 3.4934068734494252e-06, + "loss": 0.5696, + "step": 7645 + }, + { + "epoch": 2.2382903981264635, + "grad_norm": 1.0028785467147827, + "learning_rate": 3.4930534934817413e-06, + "loss": 0.6004, + "step": 7646 + }, + { + "epoch": 2.238583138173302, + "grad_norm": 0.9899524450302124, + "learning_rate": 3.492700089953673e-06, + "loss": 0.5738, + "step": 7647 + }, + { + "epoch": 2.2388758782201403, + "grad_norm": 0.9803001880645752, + "learning_rate": 3.492346662873605e-06, + "loss": 0.6026, + "step": 7648 + }, + { + "epoch": 2.2391686182669788, + "grad_norm": 1.0184578895568848, + "learning_rate": 3.49199321224992e-06, + "loss": 0.5976, + "step": 7649 + }, + { + "epoch": 2.239461358313817, + "grad_norm": 0.9590060710906982, + "learning_rate": 3.4916397380910066e-06, + "loss": 0.5763, + "step": 7650 + }, + { + "epoch": 2.2397540983606556, + "grad_norm": 1.0113821029663086, + "learning_rate": 3.49128624040525e-06, + "loss": 0.5812, + "step": 7651 + }, + { + "epoch": 2.240046838407494, + "grad_norm": 1.0068260431289673, + "learning_rate": 3.4909327192010366e-06, + "loss": 0.5995, + "step": 7652 + }, + { + "epoch": 2.2403395784543325, + "grad_norm": 0.9971571564674377, + "learning_rate": 3.4905791744867544e-06, + "loss": 0.5879, + "step": 7653 + }, + { + "epoch": 2.240632318501171, + "grad_norm": 1.011510968208313, + "learning_rate": 3.4902256062707905e-06, + "loss": 0.5903, + "step": 7654 + }, + { + "epoch": 2.2409250585480094, + "grad_norm": 0.9840376973152161, + "learning_rate": 3.489872014561534e-06, + "loss": 0.6052, + "step": 7655 + }, + { + "epoch": 2.241217798594848, + "grad_norm": 1.113525629043579, + "learning_rate": 3.4895183993673736e-06, + "loss": 0.5699, + "step": 7656 + }, + { + "epoch": 2.2415105386416863, + "grad_norm": 0.9758414030075073, + "learning_rate": 3.4891647606966995e-06, + "loss": 0.5585, + "step": 7657 + }, + { + "epoch": 2.2418032786885247, + "grad_norm": 1.0177394151687622, + "learning_rate": 3.4888110985579014e-06, + "loss": 0.5959, + "step": 7658 + }, + { + "epoch": 2.242096018735363, + "grad_norm": 0.9796207547187805, + "learning_rate": 3.4884574129593697e-06, + "loss": 0.5711, + "step": 7659 + }, + { + "epoch": 2.2423887587822016, + "grad_norm": 1.039845585823059, + "learning_rate": 3.488103703909496e-06, + "loss": 0.5607, + "step": 7660 + }, + { + "epoch": 2.24268149882904, + "grad_norm": 1.0048811435699463, + "learning_rate": 3.487749971416672e-06, + "loss": 0.5893, + "step": 7661 + }, + { + "epoch": 2.2429742388758784, + "grad_norm": 1.0295323133468628, + "learning_rate": 3.48739621548929e-06, + "loss": 0.5713, + "step": 7662 + }, + { + "epoch": 2.2432669789227164, + "grad_norm": 0.9891787171363831, + "learning_rate": 3.4870424361357434e-06, + "loss": 0.5641, + "step": 7663 + }, + { + "epoch": 2.243559718969555, + "grad_norm": 1.0139708518981934, + "learning_rate": 3.4866886333644255e-06, + "loss": 0.581, + "step": 7664 + }, + { + "epoch": 2.2438524590163933, + "grad_norm": 0.9864017963409424, + "learning_rate": 3.48633480718373e-06, + "loss": 0.5306, + "step": 7665 + }, + { + "epoch": 2.2441451990632317, + "grad_norm": 1.018236756324768, + "learning_rate": 3.485980957602052e-06, + "loss": 0.5719, + "step": 7666 + }, + { + "epoch": 2.24443793911007, + "grad_norm": 0.9673397541046143, + "learning_rate": 3.4856270846277862e-06, + "loss": 0.5579, + "step": 7667 + }, + { + "epoch": 2.2447306791569086, + "grad_norm": 0.9797884225845337, + "learning_rate": 3.485273188269328e-06, + "loss": 0.5489, + "step": 7668 + }, + { + "epoch": 2.245023419203747, + "grad_norm": 1.0114858150482178, + "learning_rate": 3.484919268535075e-06, + "loss": 0.5964, + "step": 7669 + }, + { + "epoch": 2.2453161592505855, + "grad_norm": 1.0355982780456543, + "learning_rate": 3.484565325433423e-06, + "loss": 0.6036, + "step": 7670 + }, + { + "epoch": 2.245608899297424, + "grad_norm": 1.0538426637649536, + "learning_rate": 3.484211358972769e-06, + "loss": 0.534, + "step": 7671 + }, + { + "epoch": 2.2459016393442623, + "grad_norm": 0.9869254231452942, + "learning_rate": 3.4838573691615114e-06, + "loss": 0.5417, + "step": 7672 + }, + { + "epoch": 2.246194379391101, + "grad_norm": 1.0391573905944824, + "learning_rate": 3.483503356008049e-06, + "loss": 0.5803, + "step": 7673 + }, + { + "epoch": 2.246487119437939, + "grad_norm": 0.9654414653778076, + "learning_rate": 3.4831493195207804e-06, + "loss": 0.583, + "step": 7674 + }, + { + "epoch": 2.2467798594847777, + "grad_norm": 0.9895104169845581, + "learning_rate": 3.482795259708105e-06, + "loss": 0.5761, + "step": 7675 + }, + { + "epoch": 2.247072599531616, + "grad_norm": 1.0356069803237915, + "learning_rate": 3.4824411765784233e-06, + "loss": 0.6149, + "step": 7676 + }, + { + "epoch": 2.2473653395784545, + "grad_norm": 0.9602106213569641, + "learning_rate": 3.482087070140136e-06, + "loss": 0.5637, + "step": 7677 + }, + { + "epoch": 2.2476580796252925, + "grad_norm": 0.9765624403953552, + "learning_rate": 3.4817329404016453e-06, + "loss": 0.585, + "step": 7678 + }, + { + "epoch": 2.247950819672131, + "grad_norm": 1.0587588548660278, + "learning_rate": 3.4813787873713507e-06, + "loss": 0.587, + "step": 7679 + }, + { + "epoch": 2.2482435597189694, + "grad_norm": 1.04042649269104, + "learning_rate": 3.481024611057656e-06, + "loss": 0.5633, + "step": 7680 + }, + { + "epoch": 2.248536299765808, + "grad_norm": 1.0997140407562256, + "learning_rate": 3.4806704114689644e-06, + "loss": 0.5769, + "step": 7681 + }, + { + "epoch": 2.2488290398126463, + "grad_norm": 0.9855808019638062, + "learning_rate": 3.480316188613678e-06, + "loss": 0.5961, + "step": 7682 + }, + { + "epoch": 2.2491217798594847, + "grad_norm": 1.010391116142273, + "learning_rate": 3.479961942500203e-06, + "loss": 0.5603, + "step": 7683 + }, + { + "epoch": 2.249414519906323, + "grad_norm": 0.9865923523902893, + "learning_rate": 3.4796076731369422e-06, + "loss": 0.5484, + "step": 7684 + }, + { + "epoch": 2.2497072599531616, + "grad_norm": 0.9677320122718811, + "learning_rate": 3.4792533805323002e-06, + "loss": 0.5514, + "step": 7685 + }, + { + "epoch": 2.25, + "grad_norm": 1.0941565036773682, + "learning_rate": 3.478899064694685e-06, + "loss": 0.5918, + "step": 7686 + }, + { + "epoch": 2.2502927400468384, + "grad_norm": 1.0260366201400757, + "learning_rate": 3.4785447256325013e-06, + "loss": 0.6003, + "step": 7687 + }, + { + "epoch": 2.250585480093677, + "grad_norm": 1.025589108467102, + "learning_rate": 3.4781903633541554e-06, + "loss": 0.5714, + "step": 7688 + }, + { + "epoch": 2.2508782201405153, + "grad_norm": 1.0157017707824707, + "learning_rate": 3.477835977868055e-06, + "loss": 0.6071, + "step": 7689 + }, + { + "epoch": 2.2511709601873537, + "grad_norm": 0.9787343144416809, + "learning_rate": 3.477481569182608e-06, + "loss": 0.5731, + "step": 7690 + }, + { + "epoch": 2.251463700234192, + "grad_norm": 1.058779001235962, + "learning_rate": 3.477127137306224e-06, + "loss": 0.5924, + "step": 7691 + }, + { + "epoch": 2.2517564402810306, + "grad_norm": 1.0351614952087402, + "learning_rate": 3.4767726822473096e-06, + "loss": 0.5811, + "step": 7692 + }, + { + "epoch": 2.2520491803278686, + "grad_norm": 0.9983100891113281, + "learning_rate": 3.476418204014277e-06, + "loss": 0.5868, + "step": 7693 + }, + { + "epoch": 2.2523419203747075, + "grad_norm": 1.0485458374023438, + "learning_rate": 3.476063702615534e-06, + "loss": 0.5911, + "step": 7694 + }, + { + "epoch": 2.2526346604215455, + "grad_norm": 1.0381702184677124, + "learning_rate": 3.4757091780594927e-06, + "loss": 0.593, + "step": 7695 + }, + { + "epoch": 2.252927400468384, + "grad_norm": 0.9705541729927063, + "learning_rate": 3.475354630354563e-06, + "loss": 0.5643, + "step": 7696 + }, + { + "epoch": 2.2532201405152223, + "grad_norm": 0.9825903177261353, + "learning_rate": 3.4750000595091575e-06, + "loss": 0.5788, + "step": 7697 + }, + { + "epoch": 2.253512880562061, + "grad_norm": 0.992865800857544, + "learning_rate": 3.474645465531689e-06, + "loss": 0.5864, + "step": 7698 + }, + { + "epoch": 2.253805620608899, + "grad_norm": 1.0006290674209595, + "learning_rate": 3.4742908484305695e-06, + "loss": 0.5533, + "step": 7699 + }, + { + "epoch": 2.2540983606557377, + "grad_norm": 0.9651439189910889, + "learning_rate": 3.4739362082142116e-06, + "loss": 0.5791, + "step": 7700 + }, + { + "epoch": 2.254391100702576, + "grad_norm": 0.9553345441818237, + "learning_rate": 3.4735815448910304e-06, + "loss": 0.5641, + "step": 7701 + }, + { + "epoch": 2.2546838407494145, + "grad_norm": 0.9901625514030457, + "learning_rate": 3.4732268584694395e-06, + "loss": 0.5516, + "step": 7702 + }, + { + "epoch": 2.254976580796253, + "grad_norm": 1.0245915651321411, + "learning_rate": 3.4728721489578554e-06, + "loss": 0.5695, + "step": 7703 + }, + { + "epoch": 2.2552693208430914, + "grad_norm": 1.0015031099319458, + "learning_rate": 3.4725174163646924e-06, + "loss": 0.591, + "step": 7704 + }, + { + "epoch": 2.25556206088993, + "grad_norm": 1.002822756767273, + "learning_rate": 3.4721626606983673e-06, + "loss": 0.58, + "step": 7705 + }, + { + "epoch": 2.2558548009367683, + "grad_norm": 0.98483806848526, + "learning_rate": 3.471807881967295e-06, + "loss": 0.571, + "step": 7706 + }, + { + "epoch": 2.2561475409836067, + "grad_norm": 1.0134549140930176, + "learning_rate": 3.471453080179895e-06, + "loss": 0.6045, + "step": 7707 + }, + { + "epoch": 2.256440281030445, + "grad_norm": 1.0283383131027222, + "learning_rate": 3.471098255344584e-06, + "loss": 0.592, + "step": 7708 + }, + { + "epoch": 2.2567330210772836, + "grad_norm": 1.0276507139205933, + "learning_rate": 3.47074340746978e-06, + "loss": 0.5974, + "step": 7709 + }, + { + "epoch": 2.2570257611241216, + "grad_norm": 0.9999250769615173, + "learning_rate": 3.470388536563902e-06, + "loss": 0.6039, + "step": 7710 + }, + { + "epoch": 2.25731850117096, + "grad_norm": 0.9628573060035706, + "learning_rate": 3.4700336426353697e-06, + "loss": 0.5562, + "step": 7711 + }, + { + "epoch": 2.2576112412177984, + "grad_norm": 1.093026041984558, + "learning_rate": 3.469678725692603e-06, + "loss": 0.5847, + "step": 7712 + }, + { + "epoch": 2.257903981264637, + "grad_norm": 0.9539257884025574, + "learning_rate": 3.469323785744022e-06, + "loss": 0.6026, + "step": 7713 + }, + { + "epoch": 2.2581967213114753, + "grad_norm": 1.0111548900604248, + "learning_rate": 3.4689688227980485e-06, + "loss": 0.567, + "step": 7714 + }, + { + "epoch": 2.2584894613583137, + "grad_norm": 0.9934272170066833, + "learning_rate": 3.4686138368631028e-06, + "loss": 0.5594, + "step": 7715 + }, + { + "epoch": 2.258782201405152, + "grad_norm": 0.9677048921585083, + "learning_rate": 3.468258827947608e-06, + "loss": 0.5712, + "step": 7716 + }, + { + "epoch": 2.2590749414519906, + "grad_norm": 0.9571603536605835, + "learning_rate": 3.4679037960599865e-06, + "loss": 0.5727, + "step": 7717 + }, + { + "epoch": 2.259367681498829, + "grad_norm": 1.0174154043197632, + "learning_rate": 3.467548741208661e-06, + "loss": 0.5959, + "step": 7718 + }, + { + "epoch": 2.2596604215456675, + "grad_norm": 0.9668945074081421, + "learning_rate": 3.467193663402057e-06, + "loss": 0.5361, + "step": 7719 + }, + { + "epoch": 2.259953161592506, + "grad_norm": 0.9689031839370728, + "learning_rate": 3.4668385626485968e-06, + "loss": 0.5317, + "step": 7720 + }, + { + "epoch": 2.2602459016393444, + "grad_norm": 0.9548676013946533, + "learning_rate": 3.4664834389567055e-06, + "loss": 0.5154, + "step": 7721 + }, + { + "epoch": 2.260538641686183, + "grad_norm": 1.014752984046936, + "learning_rate": 3.4661282923348095e-06, + "loss": 0.5891, + "step": 7722 + }, + { + "epoch": 2.2608313817330212, + "grad_norm": 0.9766749143600464, + "learning_rate": 3.465773122791334e-06, + "loss": 0.556, + "step": 7723 + }, + { + "epoch": 2.2611241217798597, + "grad_norm": 1.0084501504898071, + "learning_rate": 3.465417930334706e-06, + "loss": 0.5549, + "step": 7724 + }, + { + "epoch": 2.2614168618266977, + "grad_norm": 0.9771736264228821, + "learning_rate": 3.465062714973352e-06, + "loss": 0.5797, + "step": 7725 + }, + { + "epoch": 2.2617096018735365, + "grad_norm": 1.0448120832443237, + "learning_rate": 3.4647074767156993e-06, + "loss": 0.5424, + "step": 7726 + }, + { + "epoch": 2.2620023419203745, + "grad_norm": 0.9873598217964172, + "learning_rate": 3.464352215570176e-06, + "loss": 0.57, + "step": 7727 + }, + { + "epoch": 2.262295081967213, + "grad_norm": 1.0347990989685059, + "learning_rate": 3.4639969315452116e-06, + "loss": 0.5621, + "step": 7728 + }, + { + "epoch": 2.2625878220140514, + "grad_norm": 0.992853581905365, + "learning_rate": 3.463641624649235e-06, + "loss": 0.5596, + "step": 7729 + }, + { + "epoch": 2.26288056206089, + "grad_norm": 1.019025206565857, + "learning_rate": 3.4632862948906755e-06, + "loss": 0.5861, + "step": 7730 + }, + { + "epoch": 2.2631733021077283, + "grad_norm": 0.9693008065223694, + "learning_rate": 3.4629309422779633e-06, + "loss": 0.5456, + "step": 7731 + }, + { + "epoch": 2.2634660421545667, + "grad_norm": 0.9993401765823364, + "learning_rate": 3.46257556681953e-06, + "loss": 0.5902, + "step": 7732 + }, + { + "epoch": 2.263758782201405, + "grad_norm": 0.9665911793708801, + "learning_rate": 3.462220168523806e-06, + "loss": 0.5573, + "step": 7733 + }, + { + "epoch": 2.2640515222482436, + "grad_norm": 0.9588876962661743, + "learning_rate": 3.461864747399224e-06, + "loss": 0.5998, + "step": 7734 + }, + { + "epoch": 2.264344262295082, + "grad_norm": 0.9408324360847473, + "learning_rate": 3.4615093034542157e-06, + "loss": 0.5457, + "step": 7735 + }, + { + "epoch": 2.2646370023419204, + "grad_norm": 0.9427828192710876, + "learning_rate": 3.4611538366972147e-06, + "loss": 0.5406, + "step": 7736 + }, + { + "epoch": 2.264929742388759, + "grad_norm": 1.0319315195083618, + "learning_rate": 3.460798347136655e-06, + "loss": 0.5957, + "step": 7737 + }, + { + "epoch": 2.2652224824355973, + "grad_norm": 1.0052441358566284, + "learning_rate": 3.4604428347809687e-06, + "loss": 0.5711, + "step": 7738 + }, + { + "epoch": 2.2655152224824358, + "grad_norm": 1.0489681959152222, + "learning_rate": 3.460087299638592e-06, + "loss": 0.5947, + "step": 7739 + }, + { + "epoch": 2.265807962529274, + "grad_norm": 0.9785502552986145, + "learning_rate": 3.45973174171796e-06, + "loss": 0.5505, + "step": 7740 + }, + { + "epoch": 2.2661007025761126, + "grad_norm": 0.9931334853172302, + "learning_rate": 3.459376161027508e-06, + "loss": 0.5781, + "step": 7741 + }, + { + "epoch": 2.2663934426229506, + "grad_norm": 0.9627598524093628, + "learning_rate": 3.4590205575756717e-06, + "loss": 0.5533, + "step": 7742 + }, + { + "epoch": 2.266686182669789, + "grad_norm": 1.025359034538269, + "learning_rate": 3.4586649313708886e-06, + "loss": 0.5597, + "step": 7743 + }, + { + "epoch": 2.2669789227166275, + "grad_norm": 1.0238384008407593, + "learning_rate": 3.458309282421597e-06, + "loss": 0.6237, + "step": 7744 + }, + { + "epoch": 2.267271662763466, + "grad_norm": 0.9840860962867737, + "learning_rate": 3.457953610736232e-06, + "loss": 0.5522, + "step": 7745 + }, + { + "epoch": 2.2675644028103044, + "grad_norm": 0.9864637851715088, + "learning_rate": 3.4575979163232343e-06, + "loss": 0.5775, + "step": 7746 + }, + { + "epoch": 2.267857142857143, + "grad_norm": 1.163703203201294, + "learning_rate": 3.4572421991910415e-06, + "loss": 0.5963, + "step": 7747 + }, + { + "epoch": 2.2681498829039812, + "grad_norm": 1.0976139307022095, + "learning_rate": 3.4568864593480937e-06, + "loss": 0.6108, + "step": 7748 + }, + { + "epoch": 2.2684426229508197, + "grad_norm": 1.0447310209274292, + "learning_rate": 3.4565306968028316e-06, + "loss": 0.6147, + "step": 7749 + }, + { + "epoch": 2.268735362997658, + "grad_norm": 0.9625208973884583, + "learning_rate": 3.4561749115636943e-06, + "loss": 0.5723, + "step": 7750 + }, + { + "epoch": 2.2690281030444965, + "grad_norm": 1.0266544818878174, + "learning_rate": 3.4558191036391236e-06, + "loss": 0.5911, + "step": 7751 + }, + { + "epoch": 2.269320843091335, + "grad_norm": 0.9642965793609619, + "learning_rate": 3.4554632730375615e-06, + "loss": 0.5827, + "step": 7752 + }, + { + "epoch": 2.2696135831381734, + "grad_norm": 0.972535252571106, + "learning_rate": 3.4551074197674488e-06, + "loss": 0.5665, + "step": 7753 + }, + { + "epoch": 2.269906323185012, + "grad_norm": 0.9919689893722534, + "learning_rate": 3.4547515438372303e-06, + "loss": 0.4978, + "step": 7754 + }, + { + "epoch": 2.2701990632318503, + "grad_norm": 0.9810467958450317, + "learning_rate": 3.4543956452553467e-06, + "loss": 0.5458, + "step": 7755 + }, + { + "epoch": 2.2704918032786887, + "grad_norm": 0.937999963760376, + "learning_rate": 3.4540397240302437e-06, + "loss": 0.538, + "step": 7756 + }, + { + "epoch": 2.2707845433255267, + "grad_norm": 0.9992679357528687, + "learning_rate": 3.4536837801703655e-06, + "loss": 0.564, + "step": 7757 + }, + { + "epoch": 2.2710772833723656, + "grad_norm": 0.9627020359039307, + "learning_rate": 3.4533278136841553e-06, + "loss": 0.5589, + "step": 7758 + }, + { + "epoch": 2.2713700234192036, + "grad_norm": 1.0101454257965088, + "learning_rate": 3.45297182458006e-06, + "loss": 0.5685, + "step": 7759 + }, + { + "epoch": 2.271662763466042, + "grad_norm": 0.9635245203971863, + "learning_rate": 3.4526158128665257e-06, + "loss": 0.5467, + "step": 7760 + }, + { + "epoch": 2.2719555035128804, + "grad_norm": 0.9948962330818176, + "learning_rate": 3.4522597785519974e-06, + "loss": 0.5666, + "step": 7761 + }, + { + "epoch": 2.272248243559719, + "grad_norm": 1.0332120656967163, + "learning_rate": 3.451903721644923e-06, + "loss": 0.5923, + "step": 7762 + }, + { + "epoch": 2.2725409836065573, + "grad_norm": 1.0510797500610352, + "learning_rate": 3.4515476421537498e-06, + "loss": 0.6067, + "step": 7763 + }, + { + "epoch": 2.2728337236533958, + "grad_norm": 0.9516385793685913, + "learning_rate": 3.4511915400869267e-06, + "loss": 0.5843, + "step": 7764 + }, + { + "epoch": 2.273126463700234, + "grad_norm": 1.006912350654602, + "learning_rate": 3.450835415452901e-06, + "loss": 0.5469, + "step": 7765 + }, + { + "epoch": 2.2734192037470726, + "grad_norm": 1.0265464782714844, + "learning_rate": 3.4504792682601226e-06, + "loss": 0.6104, + "step": 7766 + }, + { + "epoch": 2.273711943793911, + "grad_norm": 1.0236740112304688, + "learning_rate": 3.4501230985170403e-06, + "loss": 0.6022, + "step": 7767 + }, + { + "epoch": 2.2740046838407495, + "grad_norm": 0.9867193698883057, + "learning_rate": 3.4497669062321052e-06, + "loss": 0.5397, + "step": 7768 + }, + { + "epoch": 2.274297423887588, + "grad_norm": 0.9544323682785034, + "learning_rate": 3.4494106914137687e-06, + "loss": 0.5743, + "step": 7769 + }, + { + "epoch": 2.2745901639344264, + "grad_norm": 0.9558359980583191, + "learning_rate": 3.4490544540704807e-06, + "loss": 0.5773, + "step": 7770 + }, + { + "epoch": 2.274882903981265, + "grad_norm": 0.9867073893547058, + "learning_rate": 3.448698194210693e-06, + "loss": 0.5598, + "step": 7771 + }, + { + "epoch": 2.275175644028103, + "grad_norm": 0.9912944436073303, + "learning_rate": 3.4483419118428585e-06, + "loss": 0.589, + "step": 7772 + }, + { + "epoch": 2.2754683840749417, + "grad_norm": 1.0448533296585083, + "learning_rate": 3.4479856069754295e-06, + "loss": 0.5704, + "step": 7773 + }, + { + "epoch": 2.2757611241217797, + "grad_norm": 0.9688596725463867, + "learning_rate": 3.4476292796168613e-06, + "loss": 0.552, + "step": 7774 + }, + { + "epoch": 2.276053864168618, + "grad_norm": 0.977154016494751, + "learning_rate": 3.447272929775605e-06, + "loss": 0.5834, + "step": 7775 + }, + { + "epoch": 2.2763466042154565, + "grad_norm": 1.006469964981079, + "learning_rate": 3.446916557460118e-06, + "loss": 0.5959, + "step": 7776 + }, + { + "epoch": 2.276639344262295, + "grad_norm": 0.9726776480674744, + "learning_rate": 3.4465601626788527e-06, + "loss": 0.566, + "step": 7777 + }, + { + "epoch": 2.2769320843091334, + "grad_norm": 0.9851052165031433, + "learning_rate": 3.4462037454402653e-06, + "loss": 0.5695, + "step": 7778 + }, + { + "epoch": 2.277224824355972, + "grad_norm": 0.9817856550216675, + "learning_rate": 3.445847305752813e-06, + "loss": 0.5622, + "step": 7779 + }, + { + "epoch": 2.2775175644028103, + "grad_norm": 0.9484641551971436, + "learning_rate": 3.4454908436249513e-06, + "loss": 0.5368, + "step": 7780 + }, + { + "epoch": 2.2778103044496487, + "grad_norm": 0.9701506495475769, + "learning_rate": 3.4451343590651378e-06, + "loss": 0.5567, + "step": 7781 + }, + { + "epoch": 2.278103044496487, + "grad_norm": 0.9823960661888123, + "learning_rate": 3.444777852081829e-06, + "loss": 0.5728, + "step": 7782 + }, + { + "epoch": 2.2783957845433256, + "grad_norm": 1.0070308446884155, + "learning_rate": 3.4444213226834853e-06, + "loss": 0.6032, + "step": 7783 + }, + { + "epoch": 2.278688524590164, + "grad_norm": 0.9836553335189819, + "learning_rate": 3.4440647708785647e-06, + "loss": 0.6091, + "step": 7784 + }, + { + "epoch": 2.2789812646370025, + "grad_norm": 0.9726074934005737, + "learning_rate": 3.4437081966755247e-06, + "loss": 0.5397, + "step": 7785 + }, + { + "epoch": 2.279274004683841, + "grad_norm": 0.989581286907196, + "learning_rate": 3.4433516000828272e-06, + "loss": 0.5931, + "step": 7786 + }, + { + "epoch": 2.2795667447306793, + "grad_norm": 0.9802401065826416, + "learning_rate": 3.442994981108931e-06, + "loss": 0.6232, + "step": 7787 + }, + { + "epoch": 2.2798594847775178, + "grad_norm": 0.9552510976791382, + "learning_rate": 3.442638339762299e-06, + "loss": 0.5414, + "step": 7788 + }, + { + "epoch": 2.2801522248243558, + "grad_norm": 0.9811908006668091, + "learning_rate": 3.4422816760513904e-06, + "loss": 0.5376, + "step": 7789 + }, + { + "epoch": 2.280444964871194, + "grad_norm": 0.9683527946472168, + "learning_rate": 3.4419249899846685e-06, + "loss": 0.5286, + "step": 7790 + }, + { + "epoch": 2.2807377049180326, + "grad_norm": 1.034493088722229, + "learning_rate": 3.441568281570595e-06, + "loss": 0.5666, + "step": 7791 + }, + { + "epoch": 2.281030444964871, + "grad_norm": 0.9812235236167908, + "learning_rate": 3.4412115508176332e-06, + "loss": 0.538, + "step": 7792 + }, + { + "epoch": 2.2813231850117095, + "grad_norm": 0.965851902961731, + "learning_rate": 3.4408547977342465e-06, + "loss": 0.5995, + "step": 7793 + }, + { + "epoch": 2.281615925058548, + "grad_norm": 0.9736689925193787, + "learning_rate": 3.4404980223288992e-06, + "loss": 0.5544, + "step": 7794 + }, + { + "epoch": 2.2819086651053864, + "grad_norm": 1.0188462734222412, + "learning_rate": 3.440141224610056e-06, + "loss": 0.5799, + "step": 7795 + }, + { + "epoch": 2.282201405152225, + "grad_norm": 0.9889877438545227, + "learning_rate": 3.4397844045861805e-06, + "loss": 0.5929, + "step": 7796 + }, + { + "epoch": 2.2824941451990632, + "grad_norm": 0.9929053783416748, + "learning_rate": 3.4394275622657395e-06, + "loss": 0.5837, + "step": 7797 + }, + { + "epoch": 2.2827868852459017, + "grad_norm": 0.9707443714141846, + "learning_rate": 3.4390706976572e-06, + "loss": 0.6113, + "step": 7798 + }, + { + "epoch": 2.28307962529274, + "grad_norm": 0.9955633282661438, + "learning_rate": 3.4387138107690273e-06, + "loss": 0.5852, + "step": 7799 + }, + { + "epoch": 2.2833723653395785, + "grad_norm": 0.9914477467536926, + "learning_rate": 3.438356901609689e-06, + "loss": 0.624, + "step": 7800 + }, + { + "epoch": 2.283665105386417, + "grad_norm": 1.0268046855926514, + "learning_rate": 3.437999970187653e-06, + "loss": 0.5928, + "step": 7801 + }, + { + "epoch": 2.2839578454332554, + "grad_norm": 0.9523032307624817, + "learning_rate": 3.4376430165113874e-06, + "loss": 0.5399, + "step": 7802 + }, + { + "epoch": 2.284250585480094, + "grad_norm": 0.9603230953216553, + "learning_rate": 3.4372860405893616e-06, + "loss": 0.5589, + "step": 7803 + }, + { + "epoch": 2.284543325526932, + "grad_norm": 1.0038858652114868, + "learning_rate": 3.436929042430044e-06, + "loss": 0.5471, + "step": 7804 + }, + { + "epoch": 2.2848360655737707, + "grad_norm": 0.9403757452964783, + "learning_rate": 3.4365720220419046e-06, + "loss": 0.5564, + "step": 7805 + }, + { + "epoch": 2.2851288056206087, + "grad_norm": 0.9868103861808777, + "learning_rate": 3.4362149794334145e-06, + "loss": 0.5933, + "step": 7806 + }, + { + "epoch": 2.285421545667447, + "grad_norm": 0.9809015393257141, + "learning_rate": 3.435857914613044e-06, + "loss": 0.5693, + "step": 7807 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.9776702523231506, + "learning_rate": 3.4355008275892644e-06, + "loss": 0.5453, + "step": 7808 + }, + { + "epoch": 2.286007025761124, + "grad_norm": 0.9645871520042419, + "learning_rate": 3.4351437183705485e-06, + "loss": 0.5891, + "step": 7809 + }, + { + "epoch": 2.2862997658079625, + "grad_norm": 0.9682175517082214, + "learning_rate": 3.434786586965368e-06, + "loss": 0.5815, + "step": 7810 + }, + { + "epoch": 2.286592505854801, + "grad_norm": 1.021755337715149, + "learning_rate": 3.434429433382196e-06, + "loss": 0.5928, + "step": 7811 + }, + { + "epoch": 2.2868852459016393, + "grad_norm": 0.9625899791717529, + "learning_rate": 3.4340722576295068e-06, + "loss": 0.5529, + "step": 7812 + }, + { + "epoch": 2.2871779859484778, + "grad_norm": 0.989058792591095, + "learning_rate": 3.4337150597157733e-06, + "loss": 0.5802, + "step": 7813 + }, + { + "epoch": 2.287470725995316, + "grad_norm": 0.9693152904510498, + "learning_rate": 3.43335783964947e-06, + "loss": 0.6322, + "step": 7814 + }, + { + "epoch": 2.2877634660421546, + "grad_norm": 0.9039434790611267, + "learning_rate": 3.4330005974390734e-06, + "loss": 0.5324, + "step": 7815 + }, + { + "epoch": 2.288056206088993, + "grad_norm": 1.0237371921539307, + "learning_rate": 3.432643333093058e-06, + "loss": 0.5778, + "step": 7816 + }, + { + "epoch": 2.2883489461358315, + "grad_norm": 0.9385696649551392, + "learning_rate": 3.4322860466199004e-06, + "loss": 0.5433, + "step": 7817 + }, + { + "epoch": 2.28864168618267, + "grad_norm": 1.002858281135559, + "learning_rate": 3.4319287380280774e-06, + "loss": 0.6011, + "step": 7818 + }, + { + "epoch": 2.2889344262295084, + "grad_norm": 0.9907879829406738, + "learning_rate": 3.431571407326066e-06, + "loss": 0.5767, + "step": 7819 + }, + { + "epoch": 2.289227166276347, + "grad_norm": 0.9268401861190796, + "learning_rate": 3.4312140545223443e-06, + "loss": 0.5228, + "step": 7820 + }, + { + "epoch": 2.289519906323185, + "grad_norm": 1.0444380044937134, + "learning_rate": 3.43085667962539e-06, + "loss": 0.6115, + "step": 7821 + }, + { + "epoch": 2.2898126463700232, + "grad_norm": 1.03604257106781, + "learning_rate": 3.430499282643682e-06, + "loss": 0.5667, + "step": 7822 + }, + { + "epoch": 2.2901053864168617, + "grad_norm": 0.9551458954811096, + "learning_rate": 3.4301418635857e-06, + "loss": 0.5577, + "step": 7823 + }, + { + "epoch": 2.2903981264637, + "grad_norm": 0.9862896800041199, + "learning_rate": 3.429784422459923e-06, + "loss": 0.5663, + "step": 7824 + }, + { + "epoch": 2.2906908665105385, + "grad_norm": 1.0245469808578491, + "learning_rate": 3.429426959274833e-06, + "loss": 0.6218, + "step": 7825 + }, + { + "epoch": 2.290983606557377, + "grad_norm": 0.9806911945343018, + "learning_rate": 3.4290694740389084e-06, + "loss": 0.5667, + "step": 7826 + }, + { + "epoch": 2.2912763466042154, + "grad_norm": 1.0126289129257202, + "learning_rate": 3.428711966760633e-06, + "loss": 0.5236, + "step": 7827 + }, + { + "epoch": 2.291569086651054, + "grad_norm": 0.9768520593643188, + "learning_rate": 3.4283544374484877e-06, + "loss": 0.5741, + "step": 7828 + }, + { + "epoch": 2.2918618266978923, + "grad_norm": 0.9420350193977356, + "learning_rate": 3.4279968861109554e-06, + "loss": 0.5573, + "step": 7829 + }, + { + "epoch": 2.2921545667447307, + "grad_norm": 0.9490533471107483, + "learning_rate": 3.4276393127565176e-06, + "loss": 0.5655, + "step": 7830 + }, + { + "epoch": 2.292447306791569, + "grad_norm": 0.9763731360435486, + "learning_rate": 3.42728171739366e-06, + "loss": 0.5616, + "step": 7831 + }, + { + "epoch": 2.2927400468384076, + "grad_norm": 0.9764741659164429, + "learning_rate": 3.426924100030865e-06, + "loss": 0.5883, + "step": 7832 + }, + { + "epoch": 2.293032786885246, + "grad_norm": 0.9814183712005615, + "learning_rate": 3.4265664606766174e-06, + "loss": 0.6262, + "step": 7833 + }, + { + "epoch": 2.2933255269320845, + "grad_norm": 0.9692974090576172, + "learning_rate": 3.426208799339403e-06, + "loss": 0.5321, + "step": 7834 + }, + { + "epoch": 2.293618266978923, + "grad_norm": 0.9865909218788147, + "learning_rate": 3.425851116027707e-06, + "loss": 0.5922, + "step": 7835 + }, + { + "epoch": 2.293911007025761, + "grad_norm": 1.0511397123336792, + "learning_rate": 3.4254934107500147e-06, + "loss": 0.5705, + "step": 7836 + }, + { + "epoch": 2.2942037470725998, + "grad_norm": 1.01362144947052, + "learning_rate": 3.4251356835148135e-06, + "loss": 0.5875, + "step": 7837 + }, + { + "epoch": 2.2944964871194378, + "grad_norm": 0.9732205867767334, + "learning_rate": 3.424777934330591e-06, + "loss": 0.5895, + "step": 7838 + }, + { + "epoch": 2.294789227166276, + "grad_norm": 0.9971094727516174, + "learning_rate": 3.424420163205834e-06, + "loss": 0.5748, + "step": 7839 + }, + { + "epoch": 2.2950819672131146, + "grad_norm": 1.005722165107727, + "learning_rate": 3.4240623701490313e-06, + "loss": 0.5785, + "step": 7840 + }, + { + "epoch": 2.295374707259953, + "grad_norm": 1.0433924198150635, + "learning_rate": 3.423704555168671e-06, + "loss": 0.6115, + "step": 7841 + }, + { + "epoch": 2.2956674473067915, + "grad_norm": 1.012593388557434, + "learning_rate": 3.423346718273243e-06, + "loss": 0.5694, + "step": 7842 + }, + { + "epoch": 2.29596018735363, + "grad_norm": 1.1964446306228638, + "learning_rate": 3.4229888594712358e-06, + "loss": 0.5963, + "step": 7843 + }, + { + "epoch": 2.2962529274004684, + "grad_norm": 1.0150117874145508, + "learning_rate": 3.422630978771141e-06, + "loss": 0.5823, + "step": 7844 + }, + { + "epoch": 2.296545667447307, + "grad_norm": 1.0573155879974365, + "learning_rate": 3.4222730761814497e-06, + "loss": 0.5992, + "step": 7845 + }, + { + "epoch": 2.2968384074941453, + "grad_norm": 0.9918455481529236, + "learning_rate": 3.421915151710652e-06, + "loss": 0.5622, + "step": 7846 + }, + { + "epoch": 2.2971311475409837, + "grad_norm": 0.9794971346855164, + "learning_rate": 3.421557205367241e-06, + "loss": 0.6153, + "step": 7847 + }, + { + "epoch": 2.297423887587822, + "grad_norm": 0.9875562787055969, + "learning_rate": 3.421199237159707e-06, + "loss": 0.5981, + "step": 7848 + }, + { + "epoch": 2.2977166276346606, + "grad_norm": 1.0113582611083984, + "learning_rate": 3.4208412470965445e-06, + "loss": 0.6059, + "step": 7849 + }, + { + "epoch": 2.298009367681499, + "grad_norm": 0.9977047443389893, + "learning_rate": 3.4204832351862473e-06, + "loss": 0.59, + "step": 7850 + }, + { + "epoch": 2.2983021077283374, + "grad_norm": 1.0333322286605835, + "learning_rate": 3.4201252014373078e-06, + "loss": 0.5693, + "step": 7851 + }, + { + "epoch": 2.298594847775176, + "grad_norm": 0.9450567364692688, + "learning_rate": 3.4197671458582206e-06, + "loss": 0.6137, + "step": 7852 + }, + { + "epoch": 2.298887587822014, + "grad_norm": 1.0263017416000366, + "learning_rate": 3.4194090684574818e-06, + "loss": 0.5421, + "step": 7853 + }, + { + "epoch": 2.2991803278688523, + "grad_norm": 0.9845362305641174, + "learning_rate": 3.419050969243586e-06, + "loss": 0.5828, + "step": 7854 + }, + { + "epoch": 2.2994730679156907, + "grad_norm": 0.9697696566581726, + "learning_rate": 3.41869284822503e-06, + "loss": 0.5771, + "step": 7855 + }, + { + "epoch": 2.299765807962529, + "grad_norm": 1.0437546968460083, + "learning_rate": 3.418334705410309e-06, + "loss": 0.5594, + "step": 7856 + }, + { + "epoch": 2.3000585480093676, + "grad_norm": 1.0158309936523438, + "learning_rate": 3.41797654080792e-06, + "loss": 0.5811, + "step": 7857 + }, + { + "epoch": 2.300351288056206, + "grad_norm": 1.0232326984405518, + "learning_rate": 3.4176183544263625e-06, + "loss": 0.5503, + "step": 7858 + }, + { + "epoch": 2.3006440281030445, + "grad_norm": 0.9556655883789062, + "learning_rate": 3.417260146274133e-06, + "loss": 0.575, + "step": 7859 + }, + { + "epoch": 2.300936768149883, + "grad_norm": 1.0061633586883545, + "learning_rate": 3.41690191635973e-06, + "loss": 0.5591, + "step": 7860 + }, + { + "epoch": 2.3012295081967213, + "grad_norm": 1.0054433345794678, + "learning_rate": 3.4165436646916526e-06, + "loss": 0.5784, + "step": 7861 + }, + { + "epoch": 2.3015222482435598, + "grad_norm": 0.9344382286071777, + "learning_rate": 3.4161853912784016e-06, + "loss": 0.5529, + "step": 7862 + }, + { + "epoch": 2.301814988290398, + "grad_norm": 0.9465065002441406, + "learning_rate": 3.415827096128475e-06, + "loss": 0.5604, + "step": 7863 + }, + { + "epoch": 2.3021077283372366, + "grad_norm": 0.9729405641555786, + "learning_rate": 3.415468779250376e-06, + "loss": 0.5956, + "step": 7864 + }, + { + "epoch": 2.302400468384075, + "grad_norm": 0.9784372448921204, + "learning_rate": 3.415110440652603e-06, + "loss": 0.5322, + "step": 7865 + }, + { + "epoch": 2.3026932084309135, + "grad_norm": 0.9663918018341064, + "learning_rate": 3.41475208034366e-06, + "loss": 0.573, + "step": 7866 + }, + { + "epoch": 2.302985948477752, + "grad_norm": 0.9747358560562134, + "learning_rate": 3.4143936983320476e-06, + "loss": 0.5304, + "step": 7867 + }, + { + "epoch": 2.30327868852459, + "grad_norm": 1.0059560537338257, + "learning_rate": 3.414035294626269e-06, + "loss": 0.5787, + "step": 7868 + }, + { + "epoch": 2.3035714285714284, + "grad_norm": 0.9337498545646667, + "learning_rate": 3.4136768692348275e-06, + "loss": 0.5755, + "step": 7869 + }, + { + "epoch": 2.303864168618267, + "grad_norm": 0.9878649115562439, + "learning_rate": 3.4133184221662265e-06, + "loss": 0.5537, + "step": 7870 + }, + { + "epoch": 2.3041569086651053, + "grad_norm": 0.9773499369621277, + "learning_rate": 3.4129599534289714e-06, + "loss": 0.5508, + "step": 7871 + }, + { + "epoch": 2.3044496487119437, + "grad_norm": 1.0427337884902954, + "learning_rate": 3.412601463031565e-06, + "loss": 0.5571, + "step": 7872 + }, + { + "epoch": 2.304742388758782, + "grad_norm": 0.9451664686203003, + "learning_rate": 3.412242950982514e-06, + "loss": 0.5739, + "step": 7873 + }, + { + "epoch": 2.3050351288056206, + "grad_norm": 0.9465556740760803, + "learning_rate": 3.411884417290323e-06, + "loss": 0.5749, + "step": 7874 + }, + { + "epoch": 2.305327868852459, + "grad_norm": 0.997381329536438, + "learning_rate": 3.4115258619635e-06, + "loss": 0.581, + "step": 7875 + }, + { + "epoch": 2.3056206088992974, + "grad_norm": 0.9685149788856506, + "learning_rate": 3.4111672850105505e-06, + "loss": 0.5591, + "step": 7876 + }, + { + "epoch": 2.305913348946136, + "grad_norm": 0.9764621257781982, + "learning_rate": 3.410808686439982e-06, + "loss": 0.5755, + "step": 7877 + }, + { + "epoch": 2.3062060889929743, + "grad_norm": 1.0020663738250732, + "learning_rate": 3.4104500662603022e-06, + "loss": 0.5796, + "step": 7878 + }, + { + "epoch": 2.3064988290398127, + "grad_norm": 1.0099588632583618, + "learning_rate": 3.41009142448002e-06, + "loss": 0.5736, + "step": 7879 + }, + { + "epoch": 2.306791569086651, + "grad_norm": 0.99127596616745, + "learning_rate": 3.409732761107644e-06, + "loss": 0.554, + "step": 7880 + }, + { + "epoch": 2.3070843091334896, + "grad_norm": 0.9984381794929504, + "learning_rate": 3.4093740761516836e-06, + "loss": 0.6069, + "step": 7881 + }, + { + "epoch": 2.307377049180328, + "grad_norm": 1.009534478187561, + "learning_rate": 3.4090153696206478e-06, + "loss": 0.5514, + "step": 7882 + }, + { + "epoch": 2.307669789227166, + "grad_norm": 1.0367566347122192, + "learning_rate": 3.4086566415230486e-06, + "loss": 0.6039, + "step": 7883 + }, + { + "epoch": 2.307962529274005, + "grad_norm": 0.9837641716003418, + "learning_rate": 3.4082978918673957e-06, + "loss": 0.5885, + "step": 7884 + }, + { + "epoch": 2.308255269320843, + "grad_norm": 0.9728750586509705, + "learning_rate": 3.4079391206622002e-06, + "loss": 0.5909, + "step": 7885 + }, + { + "epoch": 2.3085480093676813, + "grad_norm": 1.0516384840011597, + "learning_rate": 3.407580327915976e-06, + "loss": 0.592, + "step": 7886 + }, + { + "epoch": 2.3088407494145198, + "grad_norm": 0.9902092814445496, + "learning_rate": 3.407221513637233e-06, + "loss": 0.5713, + "step": 7887 + }, + { + "epoch": 2.309133489461358, + "grad_norm": 1.0077672004699707, + "learning_rate": 3.4068626778344854e-06, + "loss": 0.6187, + "step": 7888 + }, + { + "epoch": 2.3094262295081966, + "grad_norm": 1.0126374959945679, + "learning_rate": 3.4065038205162465e-06, + "loss": 0.5846, + "step": 7889 + }, + { + "epoch": 2.309718969555035, + "grad_norm": 0.9964720606803894, + "learning_rate": 3.4061449416910304e-06, + "loss": 0.6044, + "step": 7890 + }, + { + "epoch": 2.3100117096018735, + "grad_norm": 1.031031847000122, + "learning_rate": 3.405786041367352e-06, + "loss": 0.6086, + "step": 7891 + }, + { + "epoch": 2.310304449648712, + "grad_norm": 1.0470452308654785, + "learning_rate": 3.405427119553725e-06, + "loss": 0.5711, + "step": 7892 + }, + { + "epoch": 2.3105971896955504, + "grad_norm": 0.9663960337638855, + "learning_rate": 3.4050681762586656e-06, + "loss": 0.5858, + "step": 7893 + }, + { + "epoch": 2.310889929742389, + "grad_norm": 0.9568478465080261, + "learning_rate": 3.4047092114906903e-06, + "loss": 0.5518, + "step": 7894 + }, + { + "epoch": 2.3111826697892273, + "grad_norm": 1.0026270151138306, + "learning_rate": 3.404350225258315e-06, + "loss": 0.5841, + "step": 7895 + }, + { + "epoch": 2.3114754098360657, + "grad_norm": 0.9254561066627502, + "learning_rate": 3.403991217570057e-06, + "loss": 0.5644, + "step": 7896 + }, + { + "epoch": 2.311768149882904, + "grad_norm": 1.041334629058838, + "learning_rate": 3.4036321884344337e-06, + "loss": 0.5207, + "step": 7897 + }, + { + "epoch": 2.3120608899297426, + "grad_norm": 0.9416597485542297, + "learning_rate": 3.4032731378599624e-06, + "loss": 0.5691, + "step": 7898 + }, + { + "epoch": 2.312353629976581, + "grad_norm": 0.9907670617103577, + "learning_rate": 3.4029140658551633e-06, + "loss": 0.5713, + "step": 7899 + }, + { + "epoch": 2.312646370023419, + "grad_norm": 0.9569364786148071, + "learning_rate": 3.4025549724285546e-06, + "loss": 0.5372, + "step": 7900 + }, + { + "epoch": 2.3129391100702574, + "grad_norm": 0.9919894337654114, + "learning_rate": 3.4021958575886548e-06, + "loss": 0.5776, + "step": 7901 + }, + { + "epoch": 2.313231850117096, + "grad_norm": 0.9765197038650513, + "learning_rate": 3.4018367213439863e-06, + "loss": 0.5666, + "step": 7902 + }, + { + "epoch": 2.3135245901639343, + "grad_norm": 0.9399380087852478, + "learning_rate": 3.4014775637030674e-06, + "loss": 0.5494, + "step": 7903 + }, + { + "epoch": 2.3138173302107727, + "grad_norm": 1.0076026916503906, + "learning_rate": 3.40111838467442e-06, + "loss": 0.5419, + "step": 7904 + }, + { + "epoch": 2.314110070257611, + "grad_norm": 0.9701331257820129, + "learning_rate": 3.4007591842665662e-06, + "loss": 0.5721, + "step": 7905 + }, + { + "epoch": 2.3144028103044496, + "grad_norm": 0.9616657495498657, + "learning_rate": 3.400399962488028e-06, + "loss": 0.5935, + "step": 7906 + }, + { + "epoch": 2.314695550351288, + "grad_norm": 1.0051639080047607, + "learning_rate": 3.4000407193473268e-06, + "loss": 0.6145, + "step": 7907 + }, + { + "epoch": 2.3149882903981265, + "grad_norm": 0.9954362511634827, + "learning_rate": 3.3996814548529873e-06, + "loss": 0.5653, + "step": 7908 + }, + { + "epoch": 2.315281030444965, + "grad_norm": 0.9709159731864929, + "learning_rate": 3.399322169013532e-06, + "loss": 0.5679, + "step": 7909 + }, + { + "epoch": 2.3155737704918034, + "grad_norm": 1.011351227760315, + "learning_rate": 3.3989628618374854e-06, + "loss": 0.6039, + "step": 7910 + }, + { + "epoch": 2.315866510538642, + "grad_norm": 1.0143165588378906, + "learning_rate": 3.3986035333333726e-06, + "loss": 0.5801, + "step": 7911 + }, + { + "epoch": 2.3161592505854802, + "grad_norm": 0.9814159870147705, + "learning_rate": 3.398244183509718e-06, + "loss": 0.5482, + "step": 7912 + }, + { + "epoch": 2.3164519906323187, + "grad_norm": 0.9343315958976746, + "learning_rate": 3.397884812375048e-06, + "loss": 0.5559, + "step": 7913 + }, + { + "epoch": 2.316744730679157, + "grad_norm": 0.9790879487991333, + "learning_rate": 3.3975254199378875e-06, + "loss": 0.5542, + "step": 7914 + }, + { + "epoch": 2.317037470725995, + "grad_norm": 1.0368434190750122, + "learning_rate": 3.3971660062067645e-06, + "loss": 0.616, + "step": 7915 + }, + { + "epoch": 2.317330210772834, + "grad_norm": 0.9793683290481567, + "learning_rate": 3.3968065711902064e-06, + "loss": 0.5321, + "step": 7916 + }, + { + "epoch": 2.317622950819672, + "grad_norm": 0.9890643358230591, + "learning_rate": 3.396447114896739e-06, + "loss": 0.5883, + "step": 7917 + }, + { + "epoch": 2.3179156908665104, + "grad_norm": 0.9414679408073425, + "learning_rate": 3.3960876373348926e-06, + "loss": 0.551, + "step": 7918 + }, + { + "epoch": 2.318208430913349, + "grad_norm": 0.987114667892456, + "learning_rate": 3.395728138513194e-06, + "loss": 0.5538, + "step": 7919 + }, + { + "epoch": 2.3185011709601873, + "grad_norm": 0.9472190141677856, + "learning_rate": 3.3953686184401734e-06, + "loss": 0.5518, + "step": 7920 + }, + { + "epoch": 2.3187939110070257, + "grad_norm": 0.9745451807975769, + "learning_rate": 3.3950090771243604e-06, + "loss": 0.5596, + "step": 7921 + }, + { + "epoch": 2.319086651053864, + "grad_norm": 1.062178134918213, + "learning_rate": 3.3946495145742853e-06, + "loss": 0.5608, + "step": 7922 + }, + { + "epoch": 2.3193793911007026, + "grad_norm": 1.0291049480438232, + "learning_rate": 3.394289930798478e-06, + "loss": 0.5568, + "step": 7923 + }, + { + "epoch": 2.319672131147541, + "grad_norm": 0.9881428480148315, + "learning_rate": 3.39393032580547e-06, + "loss": 0.5712, + "step": 7924 + }, + { + "epoch": 2.3199648711943794, + "grad_norm": 1.008080244064331, + "learning_rate": 3.3935706996037944e-06, + "loss": 0.5852, + "step": 7925 + }, + { + "epoch": 2.320257611241218, + "grad_norm": 0.9598578214645386, + "learning_rate": 3.3932110522019816e-06, + "loss": 0.5806, + "step": 7926 + }, + { + "epoch": 2.3205503512880563, + "grad_norm": 0.9590674638748169, + "learning_rate": 3.3928513836085653e-06, + "loss": 0.5581, + "step": 7927 + }, + { + "epoch": 2.3208430913348947, + "grad_norm": 1.006791353225708, + "learning_rate": 3.3924916938320785e-06, + "loss": 0.618, + "step": 7928 + }, + { + "epoch": 2.321135831381733, + "grad_norm": 0.95268315076828, + "learning_rate": 3.392131982881054e-06, + "loss": 0.5418, + "step": 7929 + }, + { + "epoch": 2.3214285714285716, + "grad_norm": 1.0577480792999268, + "learning_rate": 3.3917722507640273e-06, + "loss": 0.5879, + "step": 7930 + }, + { + "epoch": 2.32172131147541, + "grad_norm": 1.0098378658294678, + "learning_rate": 3.391412497489533e-06, + "loss": 0.5879, + "step": 7931 + }, + { + "epoch": 2.322014051522248, + "grad_norm": 0.9565346240997314, + "learning_rate": 3.3910527230661056e-06, + "loss": 0.5517, + "step": 7932 + }, + { + "epoch": 2.3223067915690865, + "grad_norm": 1.0453803539276123, + "learning_rate": 3.390692927502281e-06, + "loss": 0.6261, + "step": 7933 + }, + { + "epoch": 2.322599531615925, + "grad_norm": 0.9619380831718445, + "learning_rate": 3.390333110806595e-06, + "loss": 0.591, + "step": 7934 + }, + { + "epoch": 2.3228922716627634, + "grad_norm": 1.1454769372940063, + "learning_rate": 3.389973272987587e-06, + "loss": 0.5867, + "step": 7935 + }, + { + "epoch": 2.323185011709602, + "grad_norm": 1.0098568201065063, + "learning_rate": 3.3896134140537903e-06, + "loss": 0.612, + "step": 7936 + }, + { + "epoch": 2.3234777517564402, + "grad_norm": 0.9833013415336609, + "learning_rate": 3.389253534013745e-06, + "loss": 0.5759, + "step": 7937 + }, + { + "epoch": 2.3237704918032787, + "grad_norm": 0.9421290755271912, + "learning_rate": 3.3888936328759884e-06, + "loss": 0.5691, + "step": 7938 + }, + { + "epoch": 2.324063231850117, + "grad_norm": 0.9258983731269836, + "learning_rate": 3.3885337106490597e-06, + "loss": 0.5316, + "step": 7939 + }, + { + "epoch": 2.3243559718969555, + "grad_norm": 0.9870781898498535, + "learning_rate": 3.388173767341498e-06, + "loss": 0.4968, + "step": 7940 + }, + { + "epoch": 2.324648711943794, + "grad_norm": 1.0036499500274658, + "learning_rate": 3.3878138029618435e-06, + "loss": 0.5933, + "step": 7941 + }, + { + "epoch": 2.3249414519906324, + "grad_norm": 0.935213029384613, + "learning_rate": 3.3874538175186356e-06, + "loss": 0.5378, + "step": 7942 + }, + { + "epoch": 2.325234192037471, + "grad_norm": 0.9894752502441406, + "learning_rate": 3.3870938110204154e-06, + "loss": 0.524, + "step": 7943 + }, + { + "epoch": 2.3255269320843093, + "grad_norm": 1.0544668436050415, + "learning_rate": 3.3867337834757237e-06, + "loss": 0.6273, + "step": 7944 + }, + { + "epoch": 2.3258196721311477, + "grad_norm": 0.9731618762016296, + "learning_rate": 3.386373734893103e-06, + "loss": 0.5744, + "step": 7945 + }, + { + "epoch": 2.326112412177986, + "grad_norm": 1.2522753477096558, + "learning_rate": 3.3860136652810953e-06, + "loss": 0.5593, + "step": 7946 + }, + { + "epoch": 2.326405152224824, + "grad_norm": 0.9916512370109558, + "learning_rate": 3.385653574648243e-06, + "loss": 0.5346, + "step": 7947 + }, + { + "epoch": 2.3266978922716626, + "grad_norm": 0.9480562210083008, + "learning_rate": 3.3852934630030897e-06, + "loss": 0.5508, + "step": 7948 + }, + { + "epoch": 2.326990632318501, + "grad_norm": 0.9837144613265991, + "learning_rate": 3.3849333303541788e-06, + "loss": 0.6002, + "step": 7949 + }, + { + "epoch": 2.3272833723653394, + "grad_norm": 0.9513790011405945, + "learning_rate": 3.3845731767100544e-06, + "loss": 0.5398, + "step": 7950 + }, + { + "epoch": 2.327576112412178, + "grad_norm": 1.0017409324645996, + "learning_rate": 3.3842130020792608e-06, + "loss": 0.5526, + "step": 7951 + }, + { + "epoch": 2.3278688524590163, + "grad_norm": 0.9510372281074524, + "learning_rate": 3.383852806470345e-06, + "loss": 0.5464, + "step": 7952 + }, + { + "epoch": 2.3281615925058547, + "grad_norm": 0.9887937903404236, + "learning_rate": 3.3834925898918516e-06, + "loss": 0.5535, + "step": 7953 + }, + { + "epoch": 2.328454332552693, + "grad_norm": 0.9880775213241577, + "learning_rate": 3.383132352352326e-06, + "loss": 0.5748, + "step": 7954 + }, + { + "epoch": 2.3287470725995316, + "grad_norm": 0.9722622632980347, + "learning_rate": 3.3827720938603165e-06, + "loss": 0.5813, + "step": 7955 + }, + { + "epoch": 2.32903981264637, + "grad_norm": 0.9734702110290527, + "learning_rate": 3.382411814424369e-06, + "loss": 0.5363, + "step": 7956 + }, + { + "epoch": 2.3293325526932085, + "grad_norm": 1.09489107131958, + "learning_rate": 3.3820515140530317e-06, + "loss": 0.579, + "step": 7957 + }, + { + "epoch": 2.329625292740047, + "grad_norm": 1.0546607971191406, + "learning_rate": 3.381691192754853e-06, + "loss": 0.613, + "step": 7958 + }, + { + "epoch": 2.3299180327868854, + "grad_norm": 1.0198485851287842, + "learning_rate": 3.3813308505383805e-06, + "loss": 0.5897, + "step": 7959 + }, + { + "epoch": 2.330210772833724, + "grad_norm": 0.9862332940101624, + "learning_rate": 3.380970487412165e-06, + "loss": 0.5731, + "step": 7960 + }, + { + "epoch": 2.3305035128805622, + "grad_norm": 0.9992969036102295, + "learning_rate": 3.3806101033847548e-06, + "loss": 0.5793, + "step": 7961 + }, + { + "epoch": 2.3307962529274002, + "grad_norm": 0.9890265464782715, + "learning_rate": 3.380249698464702e-06, + "loss": 0.5535, + "step": 7962 + }, + { + "epoch": 2.331088992974239, + "grad_norm": 0.984207034111023, + "learning_rate": 3.379889272660555e-06, + "loss": 0.5436, + "step": 7963 + }, + { + "epoch": 2.331381733021077, + "grad_norm": 0.9627412557601929, + "learning_rate": 3.379528825980866e-06, + "loss": 0.5889, + "step": 7964 + }, + { + "epoch": 2.3316744730679155, + "grad_norm": 0.9904807806015015, + "learning_rate": 3.3791683584341865e-06, + "loss": 0.5577, + "step": 7965 + }, + { + "epoch": 2.331967213114754, + "grad_norm": 1.082182765007019, + "learning_rate": 3.3788078700290694e-06, + "loss": 0.5857, + "step": 7966 + }, + { + "epoch": 2.3322599531615924, + "grad_norm": 1.0101242065429688, + "learning_rate": 3.3784473607740665e-06, + "loss": 0.576, + "step": 7967 + }, + { + "epoch": 2.332552693208431, + "grad_norm": 1.0118894577026367, + "learning_rate": 3.3780868306777307e-06, + "loss": 0.5733, + "step": 7968 + }, + { + "epoch": 2.3328454332552693, + "grad_norm": 0.9827268123626709, + "learning_rate": 3.3777262797486166e-06, + "loss": 0.5744, + "step": 7969 + }, + { + "epoch": 2.3331381733021077, + "grad_norm": 0.9290745854377747, + "learning_rate": 3.3773657079952784e-06, + "loss": 0.5484, + "step": 7970 + }, + { + "epoch": 2.333430913348946, + "grad_norm": 0.9384550452232361, + "learning_rate": 3.3770051154262694e-06, + "loss": 0.5366, + "step": 7971 + }, + { + "epoch": 2.3337236533957846, + "grad_norm": 0.9573444724082947, + "learning_rate": 3.376644502050146e-06, + "loss": 0.5799, + "step": 7972 + }, + { + "epoch": 2.334016393442623, + "grad_norm": 1.0115141868591309, + "learning_rate": 3.3762838678754634e-06, + "loss": 0.5409, + "step": 7973 + }, + { + "epoch": 2.3343091334894615, + "grad_norm": 0.9599007368087769, + "learning_rate": 3.3759232129107773e-06, + "loss": 0.5443, + "step": 7974 + }, + { + "epoch": 2.3346018735363, + "grad_norm": 1.040575623512268, + "learning_rate": 3.3755625371646444e-06, + "loss": 0.5695, + "step": 7975 + }, + { + "epoch": 2.3348946135831383, + "grad_norm": 1.0504101514816284, + "learning_rate": 3.3752018406456227e-06, + "loss": 0.5851, + "step": 7976 + }, + { + "epoch": 2.3351873536299768, + "grad_norm": 1.0313323736190796, + "learning_rate": 3.3748411233622693e-06, + "loss": 0.5604, + "step": 7977 + }, + { + "epoch": 2.335480093676815, + "grad_norm": 0.9244895577430725, + "learning_rate": 3.3744803853231417e-06, + "loss": 0.5244, + "step": 7978 + }, + { + "epoch": 2.335772833723653, + "grad_norm": 0.9815071225166321, + "learning_rate": 3.3741196265367994e-06, + "loss": 0.5656, + "step": 7979 + }, + { + "epoch": 2.3360655737704916, + "grad_norm": 0.9928235411643982, + "learning_rate": 3.3737588470118005e-06, + "loss": 0.5969, + "step": 7980 + }, + { + "epoch": 2.33635831381733, + "grad_norm": 1.0206091403961182, + "learning_rate": 3.373398046756705e-06, + "loss": 0.5736, + "step": 7981 + }, + { + "epoch": 2.3366510538641685, + "grad_norm": 0.9161703586578369, + "learning_rate": 3.3730372257800737e-06, + "loss": 0.5714, + "step": 7982 + }, + { + "epoch": 2.336943793911007, + "grad_norm": 0.9737401604652405, + "learning_rate": 3.3726763840904663e-06, + "loss": 0.5889, + "step": 7983 + }, + { + "epoch": 2.3372365339578454, + "grad_norm": 0.9591588973999023, + "learning_rate": 3.372315521696444e-06, + "loss": 0.5877, + "step": 7984 + }, + { + "epoch": 2.337529274004684, + "grad_norm": 0.9524088501930237, + "learning_rate": 3.3719546386065676e-06, + "loss": 0.5499, + "step": 7985 + }, + { + "epoch": 2.3378220140515222, + "grad_norm": 1.0105690956115723, + "learning_rate": 3.3715937348294002e-06, + "loss": 0.5476, + "step": 7986 + }, + { + "epoch": 2.3381147540983607, + "grad_norm": 0.950194239616394, + "learning_rate": 3.371232810373505e-06, + "loss": 0.5543, + "step": 7987 + }, + { + "epoch": 2.338407494145199, + "grad_norm": 1.022520899772644, + "learning_rate": 3.370871865247443e-06, + "loss": 0.5935, + "step": 7988 + }, + { + "epoch": 2.3387002341920375, + "grad_norm": 0.9701055288314819, + "learning_rate": 3.3705108994597792e-06, + "loss": 0.5657, + "step": 7989 + }, + { + "epoch": 2.338992974238876, + "grad_norm": 1.0235676765441895, + "learning_rate": 3.3701499130190776e-06, + "loss": 0.565, + "step": 7990 + }, + { + "epoch": 2.3392857142857144, + "grad_norm": 1.0833141803741455, + "learning_rate": 3.3697889059339006e-06, + "loss": 0.5481, + "step": 7991 + }, + { + "epoch": 2.339578454332553, + "grad_norm": 0.9827982187271118, + "learning_rate": 3.3694278782128166e-06, + "loss": 0.5475, + "step": 7992 + }, + { + "epoch": 2.3398711943793913, + "grad_norm": 1.0198692083358765, + "learning_rate": 3.3690668298643873e-06, + "loss": 0.6012, + "step": 7993 + }, + { + "epoch": 2.3401639344262293, + "grad_norm": 0.9860365390777588, + "learning_rate": 3.3687057608971813e-06, + "loss": 0.5884, + "step": 7994 + }, + { + "epoch": 2.340456674473068, + "grad_norm": 0.9644603729248047, + "learning_rate": 3.368344671319764e-06, + "loss": 0.5497, + "step": 7995 + }, + { + "epoch": 2.340749414519906, + "grad_norm": 0.9335082769393921, + "learning_rate": 3.367983561140703e-06, + "loss": 0.5546, + "step": 7996 + }, + { + "epoch": 2.3410421545667446, + "grad_norm": 0.9674527645111084, + "learning_rate": 3.3676224303685652e-06, + "loss": 0.5852, + "step": 7997 + }, + { + "epoch": 2.341334894613583, + "grad_norm": 0.9761932492256165, + "learning_rate": 3.367261279011918e-06, + "loss": 0.5986, + "step": 7998 + }, + { + "epoch": 2.3416276346604215, + "grad_norm": 0.9550302028656006, + "learning_rate": 3.3669001070793305e-06, + "loss": 0.5034, + "step": 7999 + }, + { + "epoch": 2.34192037470726, + "grad_norm": 1.0057826042175293, + "learning_rate": 3.3665389145793716e-06, + "loss": 0.6185, + "step": 8000 + }, + { + "epoch": 2.3422131147540983, + "grad_norm": 0.9733731150627136, + "learning_rate": 3.3661777015206105e-06, + "loss": 0.5552, + "step": 8001 + }, + { + "epoch": 2.3425058548009368, + "grad_norm": 1.0102181434631348, + "learning_rate": 3.3658164679116172e-06, + "loss": 0.5839, + "step": 8002 + }, + { + "epoch": 2.342798594847775, + "grad_norm": 0.9342325925827026, + "learning_rate": 3.3654552137609615e-06, + "loss": 0.5536, + "step": 8003 + }, + { + "epoch": 2.3430913348946136, + "grad_norm": 0.9726691842079163, + "learning_rate": 3.3650939390772146e-06, + "loss": 0.5656, + "step": 8004 + }, + { + "epoch": 2.343384074941452, + "grad_norm": 0.9722458720207214, + "learning_rate": 3.3647326438689475e-06, + "loss": 0.597, + "step": 8005 + }, + { + "epoch": 2.3436768149882905, + "grad_norm": 0.9804683327674866, + "learning_rate": 3.364371328144733e-06, + "loss": 0.5531, + "step": 8006 + }, + { + "epoch": 2.343969555035129, + "grad_norm": 1.0425230264663696, + "learning_rate": 3.3640099919131424e-06, + "loss": 0.5711, + "step": 8007 + }, + { + "epoch": 2.3442622950819674, + "grad_norm": 0.9755222201347351, + "learning_rate": 3.3636486351827493e-06, + "loss": 0.5498, + "step": 8008 + }, + { + "epoch": 2.344555035128806, + "grad_norm": 0.9508012533187866, + "learning_rate": 3.3632872579621256e-06, + "loss": 0.5943, + "step": 8009 + }, + { + "epoch": 2.3448477751756442, + "grad_norm": 0.9702918529510498, + "learning_rate": 3.362925860259846e-06, + "loss": 0.5673, + "step": 8010 + }, + { + "epoch": 2.3451405152224822, + "grad_norm": 0.9799255728721619, + "learning_rate": 3.3625644420844845e-06, + "loss": 0.5754, + "step": 8011 + }, + { + "epoch": 2.3454332552693207, + "grad_norm": 0.9518281817436218, + "learning_rate": 3.3622030034446162e-06, + "loss": 0.547, + "step": 8012 + }, + { + "epoch": 2.345725995316159, + "grad_norm": 1.0403079986572266, + "learning_rate": 3.3618415443488163e-06, + "loss": 0.5585, + "step": 8013 + }, + { + "epoch": 2.3460187353629975, + "grad_norm": 0.9822577238082886, + "learning_rate": 3.361480064805659e-06, + "loss": 0.592, + "step": 8014 + }, + { + "epoch": 2.346311475409836, + "grad_norm": 1.0027341842651367, + "learning_rate": 3.361118564823722e-06, + "loss": 0.6009, + "step": 8015 + }, + { + "epoch": 2.3466042154566744, + "grad_norm": 0.9266039133071899, + "learning_rate": 3.3607570444115823e-06, + "loss": 0.5556, + "step": 8016 + }, + { + "epoch": 2.346896955503513, + "grad_norm": 0.9555667042732239, + "learning_rate": 3.360395503577816e-06, + "loss": 0.5696, + "step": 8017 + }, + { + "epoch": 2.3471896955503513, + "grad_norm": 0.9506762027740479, + "learning_rate": 3.3600339423310015e-06, + "loss": 0.5706, + "step": 8018 + }, + { + "epoch": 2.3474824355971897, + "grad_norm": 0.9995543956756592, + "learning_rate": 3.359672360679716e-06, + "loss": 0.5593, + "step": 8019 + }, + { + "epoch": 2.347775175644028, + "grad_norm": 0.9866055250167847, + "learning_rate": 3.3593107586325384e-06, + "loss": 0.5705, + "step": 8020 + }, + { + "epoch": 2.3480679156908666, + "grad_norm": 0.9725682139396667, + "learning_rate": 3.3589491361980482e-06, + "loss": 0.5604, + "step": 8021 + }, + { + "epoch": 2.348360655737705, + "grad_norm": 1.0162547826766968, + "learning_rate": 3.358587493384825e-06, + "loss": 0.6057, + "step": 8022 + }, + { + "epoch": 2.3486533957845435, + "grad_norm": 1.031523585319519, + "learning_rate": 3.358225830201448e-06, + "loss": 0.572, + "step": 8023 + }, + { + "epoch": 2.348946135831382, + "grad_norm": 0.9342032670974731, + "learning_rate": 3.3578641466564986e-06, + "loss": 0.5351, + "step": 8024 + }, + { + "epoch": 2.3492388758782203, + "grad_norm": 1.0270594358444214, + "learning_rate": 3.357502442758558e-06, + "loss": 0.5865, + "step": 8025 + }, + { + "epoch": 2.3495316159250583, + "grad_norm": 1.0337144136428833, + "learning_rate": 3.3571407185162066e-06, + "loss": 0.5813, + "step": 8026 + }, + { + "epoch": 2.349824355971897, + "grad_norm": 1.0012987852096558, + "learning_rate": 3.3567789739380276e-06, + "loss": 0.579, + "step": 8027 + }, + { + "epoch": 2.350117096018735, + "grad_norm": 1.000012755393982, + "learning_rate": 3.356417209032602e-06, + "loss": 0.5787, + "step": 8028 + }, + { + "epoch": 2.3504098360655736, + "grad_norm": 0.9115464091300964, + "learning_rate": 3.3560554238085142e-06, + "loss": 0.4702, + "step": 8029 + }, + { + "epoch": 2.350702576112412, + "grad_norm": 0.9515077471733093, + "learning_rate": 3.355693618274346e-06, + "loss": 0.565, + "step": 8030 + }, + { + "epoch": 2.3509953161592505, + "grad_norm": 0.970664918422699, + "learning_rate": 3.355331792438683e-06, + "loss": 0.5886, + "step": 8031 + }, + { + "epoch": 2.351288056206089, + "grad_norm": 0.9832165241241455, + "learning_rate": 3.3549699463101094e-06, + "loss": 0.5734, + "step": 8032 + }, + { + "epoch": 2.3515807962529274, + "grad_norm": 0.9655187726020813, + "learning_rate": 3.3546080798972093e-06, + "loss": 0.5693, + "step": 8033 + }, + { + "epoch": 2.351873536299766, + "grad_norm": 0.9823389053344727, + "learning_rate": 3.354246193208568e-06, + "loss": 0.5792, + "step": 8034 + }, + { + "epoch": 2.3521662763466042, + "grad_norm": 1.0032192468643188, + "learning_rate": 3.3538842862527713e-06, + "loss": 0.5747, + "step": 8035 + }, + { + "epoch": 2.3524590163934427, + "grad_norm": 0.972827672958374, + "learning_rate": 3.3535223590384062e-06, + "loss": 0.5652, + "step": 8036 + }, + { + "epoch": 2.352751756440281, + "grad_norm": 1.0045440196990967, + "learning_rate": 3.3531604115740592e-06, + "loss": 0.5852, + "step": 8037 + }, + { + "epoch": 2.3530444964871196, + "grad_norm": 1.018204689025879, + "learning_rate": 3.352798443868318e-06, + "loss": 0.5626, + "step": 8038 + }, + { + "epoch": 2.353337236533958, + "grad_norm": 1.0340232849121094, + "learning_rate": 3.3524364559297695e-06, + "loss": 0.6016, + "step": 8039 + }, + { + "epoch": 2.3536299765807964, + "grad_norm": 1.0124645233154297, + "learning_rate": 3.352074447767002e-06, + "loss": 0.5782, + "step": 8040 + }, + { + "epoch": 2.3539227166276344, + "grad_norm": 0.965022087097168, + "learning_rate": 3.3517124193886046e-06, + "loss": 0.5738, + "step": 8041 + }, + { + "epoch": 2.3542154566744733, + "grad_norm": 0.9855288863182068, + "learning_rate": 3.3513503708031668e-06, + "loss": 0.5591, + "step": 8042 + }, + { + "epoch": 2.3545081967213113, + "grad_norm": 0.946102499961853, + "learning_rate": 3.350988302019278e-06, + "loss": 0.5336, + "step": 8043 + }, + { + "epoch": 2.3548009367681497, + "grad_norm": 0.9856200218200684, + "learning_rate": 3.350626213045528e-06, + "loss": 0.5417, + "step": 8044 + }, + { + "epoch": 2.355093676814988, + "grad_norm": 0.9986181259155273, + "learning_rate": 3.3502641038905076e-06, + "loss": 0.6114, + "step": 8045 + }, + { + "epoch": 2.3553864168618266, + "grad_norm": 0.9950219988822937, + "learning_rate": 3.349901974562808e-06, + "loss": 0.5722, + "step": 8046 + }, + { + "epoch": 2.355679156908665, + "grad_norm": 0.9979133009910583, + "learning_rate": 3.349539825071021e-06, + "loss": 0.5767, + "step": 8047 + }, + { + "epoch": 2.3559718969555035, + "grad_norm": 1.0216248035430908, + "learning_rate": 3.3491776554237384e-06, + "loss": 0.5661, + "step": 8048 + }, + { + "epoch": 2.356264637002342, + "grad_norm": 0.9761382937431335, + "learning_rate": 3.3488154656295523e-06, + "loss": 0.6348, + "step": 8049 + }, + { + "epoch": 2.3565573770491803, + "grad_norm": 1.0475670099258423, + "learning_rate": 3.348453255697056e-06, + "loss": 0.5797, + "step": 8050 + }, + { + "epoch": 2.3568501170960188, + "grad_norm": 1.0468109846115112, + "learning_rate": 3.348091025634844e-06, + "loss": 0.623, + "step": 8051 + }, + { + "epoch": 2.357142857142857, + "grad_norm": 0.9872409701347351, + "learning_rate": 3.3477287754515092e-06, + "loss": 0.5298, + "step": 8052 + }, + { + "epoch": 2.3574355971896956, + "grad_norm": 0.90134197473526, + "learning_rate": 3.3473665051556464e-06, + "loss": 0.573, + "step": 8053 + }, + { + "epoch": 2.357728337236534, + "grad_norm": 0.9528799057006836, + "learning_rate": 3.34700421475585e-06, + "loss": 0.5585, + "step": 8054 + }, + { + "epoch": 2.3580210772833725, + "grad_norm": 1.0259519815444946, + "learning_rate": 3.346641904260717e-06, + "loss": 0.5871, + "step": 8055 + }, + { + "epoch": 2.358313817330211, + "grad_norm": 1.0075627565383911, + "learning_rate": 3.346279573678841e-06, + "loss": 0.5859, + "step": 8056 + }, + { + "epoch": 2.3586065573770494, + "grad_norm": 1.0017547607421875, + "learning_rate": 3.3459172230188197e-06, + "loss": 0.5956, + "step": 8057 + }, + { + "epoch": 2.3588992974238874, + "grad_norm": 0.9840825796127319, + "learning_rate": 3.3455548522892502e-06, + "loss": 0.5942, + "step": 8058 + }, + { + "epoch": 2.359192037470726, + "grad_norm": 0.9799880981445312, + "learning_rate": 3.345192461498729e-06, + "loss": 0.5531, + "step": 8059 + }, + { + "epoch": 2.3594847775175642, + "grad_norm": 1.0202990770339966, + "learning_rate": 3.3448300506558547e-06, + "loss": 0.5952, + "step": 8060 + }, + { + "epoch": 2.3597775175644027, + "grad_norm": 0.9517308473587036, + "learning_rate": 3.344467619769225e-06, + "loss": 0.5476, + "step": 8061 + }, + { + "epoch": 2.360070257611241, + "grad_norm": 0.9528408646583557, + "learning_rate": 3.3441051688474387e-06, + "loss": 0.6027, + "step": 8062 + }, + { + "epoch": 2.3603629976580796, + "grad_norm": 0.9500678777694702, + "learning_rate": 3.3437426978990954e-06, + "loss": 0.5249, + "step": 8063 + }, + { + "epoch": 2.360655737704918, + "grad_norm": 0.9936739802360535, + "learning_rate": 3.343380206932794e-06, + "loss": 0.5636, + "step": 8064 + }, + { + "epoch": 2.3609484777517564, + "grad_norm": 1.0080183744430542, + "learning_rate": 3.3430176959571353e-06, + "loss": 0.5868, + "step": 8065 + }, + { + "epoch": 2.361241217798595, + "grad_norm": 1.016668438911438, + "learning_rate": 3.34265516498072e-06, + "loss": 0.5831, + "step": 8066 + }, + { + "epoch": 2.3615339578454333, + "grad_norm": 1.0804213285446167, + "learning_rate": 3.342292614012148e-06, + "loss": 0.6053, + "step": 8067 + }, + { + "epoch": 2.3618266978922717, + "grad_norm": 0.971899688243866, + "learning_rate": 3.3419300430600237e-06, + "loss": 0.575, + "step": 8068 + }, + { + "epoch": 2.36211943793911, + "grad_norm": 0.9440717697143555, + "learning_rate": 3.341567452132946e-06, + "loss": 0.577, + "step": 8069 + }, + { + "epoch": 2.3624121779859486, + "grad_norm": 1.0497348308563232, + "learning_rate": 3.3412048412395193e-06, + "loss": 0.5995, + "step": 8070 + }, + { + "epoch": 2.362704918032787, + "grad_norm": 0.953342080116272, + "learning_rate": 3.340842210388346e-06, + "loss": 0.5753, + "step": 8071 + }, + { + "epoch": 2.3629976580796255, + "grad_norm": 1.034010648727417, + "learning_rate": 3.3404795595880294e-06, + "loss": 0.5667, + "step": 8072 + }, + { + "epoch": 2.3632903981264635, + "grad_norm": 0.9572766423225403, + "learning_rate": 3.3401168888471747e-06, + "loss": 0.5729, + "step": 8073 + }, + { + "epoch": 2.3635831381733023, + "grad_norm": 1.0037798881530762, + "learning_rate": 3.339754198174385e-06, + "loss": 0.5546, + "step": 8074 + }, + { + "epoch": 2.3638758782201403, + "grad_norm": 0.995297372341156, + "learning_rate": 3.339391487578265e-06, + "loss": 0.6003, + "step": 8075 + }, + { + "epoch": 2.3641686182669788, + "grad_norm": 0.9546201229095459, + "learning_rate": 3.3390287570674214e-06, + "loss": 0.5429, + "step": 8076 + }, + { + "epoch": 2.364461358313817, + "grad_norm": 1.2545454502105713, + "learning_rate": 3.3386660066504595e-06, + "loss": 0.5986, + "step": 8077 + }, + { + "epoch": 2.3647540983606556, + "grad_norm": 0.9430914521217346, + "learning_rate": 3.3383032363359847e-06, + "loss": 0.5525, + "step": 8078 + }, + { + "epoch": 2.365046838407494, + "grad_norm": 1.0038398504257202, + "learning_rate": 3.337940446132605e-06, + "loss": 0.5764, + "step": 8079 + }, + { + "epoch": 2.3653395784543325, + "grad_norm": 0.9454663395881653, + "learning_rate": 3.3375776360489274e-06, + "loss": 0.5071, + "step": 8080 + }, + { + "epoch": 2.365632318501171, + "grad_norm": 1.0108084678649902, + "learning_rate": 3.3372148060935596e-06, + "loss": 0.5342, + "step": 8081 + }, + { + "epoch": 2.3659250585480094, + "grad_norm": 1.0540952682495117, + "learning_rate": 3.336851956275109e-06, + "loss": 0.5599, + "step": 8082 + }, + { + "epoch": 2.366217798594848, + "grad_norm": 0.9811843633651733, + "learning_rate": 3.3364890866021855e-06, + "loss": 0.5297, + "step": 8083 + }, + { + "epoch": 2.3665105386416863, + "grad_norm": 1.0196460485458374, + "learning_rate": 3.3361261970833974e-06, + "loss": 0.5537, + "step": 8084 + }, + { + "epoch": 2.3668032786885247, + "grad_norm": 1.0299521684646606, + "learning_rate": 3.335763287727355e-06, + "loss": 0.6013, + "step": 8085 + }, + { + "epoch": 2.367096018735363, + "grad_norm": 1.0230530500411987, + "learning_rate": 3.3354003585426674e-06, + "loss": 0.5735, + "step": 8086 + }, + { + "epoch": 2.3673887587822016, + "grad_norm": 0.9690043330192566, + "learning_rate": 3.3350374095379454e-06, + "loss": 0.5873, + "step": 8087 + }, + { + "epoch": 2.36768149882904, + "grad_norm": 0.999502420425415, + "learning_rate": 3.334674440721802e-06, + "loss": 0.5461, + "step": 8088 + }, + { + "epoch": 2.3679742388758784, + "grad_norm": 1.0485353469848633, + "learning_rate": 3.3343114521028463e-06, + "loss": 0.5914, + "step": 8089 + }, + { + "epoch": 2.3682669789227164, + "grad_norm": 1.0146632194519043, + "learning_rate": 3.3339484436896907e-06, + "loss": 0.5897, + "step": 8090 + }, + { + "epoch": 2.368559718969555, + "grad_norm": 1.014729380607605, + "learning_rate": 3.3335854154909486e-06, + "loss": 0.6172, + "step": 8091 + }, + { + "epoch": 2.3688524590163933, + "grad_norm": 0.9857377409934998, + "learning_rate": 3.333222367515232e-06, + "loss": 0.5227, + "step": 8092 + }, + { + "epoch": 2.3691451990632317, + "grad_norm": 0.9746506810188293, + "learning_rate": 3.3328592997711555e-06, + "loss": 0.5525, + "step": 8093 + }, + { + "epoch": 2.36943793911007, + "grad_norm": 1.0052241086959839, + "learning_rate": 3.332496212267331e-06, + "loss": 0.586, + "step": 8094 + }, + { + "epoch": 2.3697306791569086, + "grad_norm": 0.992695152759552, + "learning_rate": 3.3321331050123747e-06, + "loss": 0.5384, + "step": 8095 + }, + { + "epoch": 2.370023419203747, + "grad_norm": 0.9868412017822266, + "learning_rate": 3.3317699780149003e-06, + "loss": 0.5515, + "step": 8096 + }, + { + "epoch": 2.3703161592505855, + "grad_norm": 0.9962422847747803, + "learning_rate": 3.3314068312835234e-06, + "loss": 0.5805, + "step": 8097 + }, + { + "epoch": 2.370608899297424, + "grad_norm": 1.0287505388259888, + "learning_rate": 3.33104366482686e-06, + "loss": 0.5763, + "step": 8098 + }, + { + "epoch": 2.3709016393442623, + "grad_norm": 0.9963488578796387, + "learning_rate": 3.3306804786535256e-06, + "loss": 0.5764, + "step": 8099 + }, + { + "epoch": 2.371194379391101, + "grad_norm": 0.9909058213233948, + "learning_rate": 3.330317272772138e-06, + "loss": 0.5911, + "step": 8100 + }, + { + "epoch": 2.371487119437939, + "grad_norm": 0.9385550022125244, + "learning_rate": 3.3299540471913123e-06, + "loss": 0.5824, + "step": 8101 + }, + { + "epoch": 2.3717798594847777, + "grad_norm": 0.9942654371261597, + "learning_rate": 3.3295908019196676e-06, + "loss": 0.5433, + "step": 8102 + }, + { + "epoch": 2.372072599531616, + "grad_norm": 1.0152716636657715, + "learning_rate": 3.3292275369658227e-06, + "loss": 0.5116, + "step": 8103 + }, + { + "epoch": 2.3723653395784545, + "grad_norm": 1.003165364265442, + "learning_rate": 3.3288642523383947e-06, + "loss": 0.5485, + "step": 8104 + }, + { + "epoch": 2.3726580796252925, + "grad_norm": 1.0480036735534668, + "learning_rate": 3.328500948046003e-06, + "loss": 0.6053, + "step": 8105 + }, + { + "epoch": 2.3729508196721314, + "grad_norm": 0.9792618751525879, + "learning_rate": 3.328137624097267e-06, + "loss": 0.5797, + "step": 8106 + }, + { + "epoch": 2.3732435597189694, + "grad_norm": 0.9728925228118896, + "learning_rate": 3.3277742805008065e-06, + "loss": 0.6308, + "step": 8107 + }, + { + "epoch": 2.373536299765808, + "grad_norm": 0.9448747634887695, + "learning_rate": 3.3274109172652427e-06, + "loss": 0.5381, + "step": 8108 + }, + { + "epoch": 2.3738290398126463, + "grad_norm": 1.0002667903900146, + "learning_rate": 3.327047534399196e-06, + "loss": 0.6115, + "step": 8109 + }, + { + "epoch": 2.3741217798594847, + "grad_norm": 0.9965856075286865, + "learning_rate": 3.3266841319112876e-06, + "loss": 0.6303, + "step": 8110 + }, + { + "epoch": 2.374414519906323, + "grad_norm": 1.056225061416626, + "learning_rate": 3.326320709810139e-06, + "loss": 0.5172, + "step": 8111 + }, + { + "epoch": 2.3747072599531616, + "grad_norm": 0.9739424586296082, + "learning_rate": 3.3259572681043727e-06, + "loss": 0.5276, + "step": 8112 + }, + { + "epoch": 2.375, + "grad_norm": 0.9827669262886047, + "learning_rate": 3.325593806802612e-06, + "loss": 0.5739, + "step": 8113 + }, + { + "epoch": 2.3752927400468384, + "grad_norm": 0.9944759011268616, + "learning_rate": 3.3252303259134794e-06, + "loss": 0.5606, + "step": 8114 + }, + { + "epoch": 2.375585480093677, + "grad_norm": 1.0187758207321167, + "learning_rate": 3.3248668254455994e-06, + "loss": 0.5785, + "step": 8115 + }, + { + "epoch": 2.3758782201405153, + "grad_norm": 0.9993666410446167, + "learning_rate": 3.324503305407595e-06, + "loss": 0.5831, + "step": 8116 + }, + { + "epoch": 2.3761709601873537, + "grad_norm": 0.9804121851921082, + "learning_rate": 3.324139765808091e-06, + "loss": 0.5471, + "step": 8117 + }, + { + "epoch": 2.376463700234192, + "grad_norm": 0.9295097589492798, + "learning_rate": 3.323776206655712e-06, + "loss": 0.5473, + "step": 8118 + }, + { + "epoch": 2.3767564402810306, + "grad_norm": 1.0188076496124268, + "learning_rate": 3.323412627959086e-06, + "loss": 0.5877, + "step": 8119 + }, + { + "epoch": 2.3770491803278686, + "grad_norm": 0.9302417635917664, + "learning_rate": 3.3230490297268365e-06, + "loss": 0.5415, + "step": 8120 + }, + { + "epoch": 2.3773419203747075, + "grad_norm": 1.0485081672668457, + "learning_rate": 3.32268541196759e-06, + "loss": 0.5429, + "step": 8121 + }, + { + "epoch": 2.3776346604215455, + "grad_norm": 1.0156241655349731, + "learning_rate": 3.3223217746899743e-06, + "loss": 0.5728, + "step": 8122 + }, + { + "epoch": 2.377927400468384, + "grad_norm": 0.9645369648933411, + "learning_rate": 3.3219581179026167e-06, + "loss": 0.5251, + "step": 8123 + }, + { + "epoch": 2.3782201405152223, + "grad_norm": 0.9753412008285522, + "learning_rate": 3.3215944416141456e-06, + "loss": 0.5385, + "step": 8124 + }, + { + "epoch": 2.378512880562061, + "grad_norm": 1.0052376985549927, + "learning_rate": 3.321230745833187e-06, + "loss": 0.5494, + "step": 8125 + }, + { + "epoch": 2.378805620608899, + "grad_norm": 0.9678220748901367, + "learning_rate": 3.320867030568372e-06, + "loss": 0.5453, + "step": 8126 + }, + { + "epoch": 2.3790983606557377, + "grad_norm": 1.071163535118103, + "learning_rate": 3.3205032958283288e-06, + "loss": 0.6119, + "step": 8127 + }, + { + "epoch": 2.379391100702576, + "grad_norm": 0.9957399368286133, + "learning_rate": 3.3201395416216874e-06, + "loss": 0.599, + "step": 8128 + }, + { + "epoch": 2.3796838407494145, + "grad_norm": 0.9993316531181335, + "learning_rate": 3.3197757679570785e-06, + "loss": 0.5573, + "step": 8129 + }, + { + "epoch": 2.379976580796253, + "grad_norm": 1.0317108631134033, + "learning_rate": 3.3194119748431315e-06, + "loss": 0.5901, + "step": 8130 + }, + { + "epoch": 2.3802693208430914, + "grad_norm": 0.9907061457633972, + "learning_rate": 3.3190481622884773e-06, + "loss": 0.5935, + "step": 8131 + }, + { + "epoch": 2.38056206088993, + "grad_norm": 0.94187331199646, + "learning_rate": 3.318684330301749e-06, + "loss": 0.5394, + "step": 8132 + }, + { + "epoch": 2.3808548009367683, + "grad_norm": 0.9790815114974976, + "learning_rate": 3.3183204788915776e-06, + "loss": 0.6088, + "step": 8133 + }, + { + "epoch": 2.3811475409836067, + "grad_norm": 0.9234967827796936, + "learning_rate": 3.3179566080665954e-06, + "loss": 0.5614, + "step": 8134 + }, + { + "epoch": 2.381440281030445, + "grad_norm": 0.9488685131072998, + "learning_rate": 3.3175927178354357e-06, + "loss": 0.5682, + "step": 8135 + }, + { + "epoch": 2.3817330210772836, + "grad_norm": 1.0049339532852173, + "learning_rate": 3.317228808206731e-06, + "loss": 0.5547, + "step": 8136 + }, + { + "epoch": 2.3820257611241216, + "grad_norm": 1.0141243934631348, + "learning_rate": 3.3168648791891163e-06, + "loss": 0.5929, + "step": 8137 + }, + { + "epoch": 2.38231850117096, + "grad_norm": 1.0097075700759888, + "learning_rate": 3.316500930791225e-06, + "loss": 0.5741, + "step": 8138 + }, + { + "epoch": 2.3826112412177984, + "grad_norm": 0.984799861907959, + "learning_rate": 3.316136963021693e-06, + "loss": 0.6044, + "step": 8139 + }, + { + "epoch": 2.382903981264637, + "grad_norm": 0.9926040172576904, + "learning_rate": 3.315772975889154e-06, + "loss": 0.5402, + "step": 8140 + }, + { + "epoch": 2.3831967213114753, + "grad_norm": 0.9493396878242493, + "learning_rate": 3.3154089694022452e-06, + "loss": 0.5841, + "step": 8141 + }, + { + "epoch": 2.3834894613583137, + "grad_norm": 0.9679313898086548, + "learning_rate": 3.3150449435696014e-06, + "loss": 0.595, + "step": 8142 + }, + { + "epoch": 2.383782201405152, + "grad_norm": 1.020658016204834, + "learning_rate": 3.3146808983998594e-06, + "loss": 0.5798, + "step": 8143 + }, + { + "epoch": 2.3840749414519906, + "grad_norm": 1.009507179260254, + "learning_rate": 3.3143168339016574e-06, + "loss": 0.6024, + "step": 8144 + }, + { + "epoch": 2.384367681498829, + "grad_norm": 1.0016778707504272, + "learning_rate": 3.3139527500836317e-06, + "loss": 0.5659, + "step": 8145 + }, + { + "epoch": 2.3846604215456675, + "grad_norm": 1.0038961172103882, + "learning_rate": 3.3135886469544202e-06, + "loss": 0.5827, + "step": 8146 + }, + { + "epoch": 2.384953161592506, + "grad_norm": 0.9691742658615112, + "learning_rate": 3.3132245245226624e-06, + "loss": 0.5541, + "step": 8147 + }, + { + "epoch": 2.3852459016393444, + "grad_norm": 1.0539155006408691, + "learning_rate": 3.3128603827969963e-06, + "loss": 0.5599, + "step": 8148 + }, + { + "epoch": 2.385538641686183, + "grad_norm": 0.9999356269836426, + "learning_rate": 3.3124962217860616e-06, + "loss": 0.5778, + "step": 8149 + }, + { + "epoch": 2.3858313817330212, + "grad_norm": 0.9691723585128784, + "learning_rate": 3.3121320414984976e-06, + "loss": 0.5311, + "step": 8150 + }, + { + "epoch": 2.3861241217798597, + "grad_norm": 1.035914421081543, + "learning_rate": 3.3117678419429454e-06, + "loss": 0.5801, + "step": 8151 + }, + { + "epoch": 2.3864168618266977, + "grad_norm": 1.0053391456604004, + "learning_rate": 3.311403623128045e-06, + "loss": 0.6006, + "step": 8152 + }, + { + "epoch": 2.3867096018735365, + "grad_norm": 0.9625900983810425, + "learning_rate": 3.311039385062437e-06, + "loss": 0.5585, + "step": 8153 + }, + { + "epoch": 2.3870023419203745, + "grad_norm": 1.026452660560608, + "learning_rate": 3.310675127754765e-06, + "loss": 0.6043, + "step": 8154 + }, + { + "epoch": 2.387295081967213, + "grad_norm": 0.9983715415000916, + "learning_rate": 3.3103108512136695e-06, + "loss": 0.6023, + "step": 8155 + }, + { + "epoch": 2.3875878220140514, + "grad_norm": 1.0027204751968384, + "learning_rate": 3.3099465554477927e-06, + "loss": 0.5615, + "step": 8156 + }, + { + "epoch": 2.38788056206089, + "grad_norm": 1.1009188890457153, + "learning_rate": 3.3095822404657783e-06, + "loss": 0.5811, + "step": 8157 + }, + { + "epoch": 2.3881733021077283, + "grad_norm": 0.9402661323547363, + "learning_rate": 3.3092179062762703e-06, + "loss": 0.5916, + "step": 8158 + }, + { + "epoch": 2.3884660421545667, + "grad_norm": 0.9511383771896362, + "learning_rate": 3.308853552887912e-06, + "loss": 0.5666, + "step": 8159 + }, + { + "epoch": 2.388758782201405, + "grad_norm": 0.9737733602523804, + "learning_rate": 3.3084891803093473e-06, + "loss": 0.6115, + "step": 8160 + }, + { + "epoch": 2.3890515222482436, + "grad_norm": 1.0306880474090576, + "learning_rate": 3.308124788549222e-06, + "loss": 0.6173, + "step": 8161 + }, + { + "epoch": 2.389344262295082, + "grad_norm": 0.9648563861846924, + "learning_rate": 3.3077603776161797e-06, + "loss": 0.5677, + "step": 8162 + }, + { + "epoch": 2.3896370023419204, + "grad_norm": 1.0009140968322754, + "learning_rate": 3.3073959475188683e-06, + "loss": 0.5722, + "step": 8163 + }, + { + "epoch": 2.389929742388759, + "grad_norm": 0.9878909587860107, + "learning_rate": 3.3070314982659327e-06, + "loss": 0.5919, + "step": 8164 + }, + { + "epoch": 2.3902224824355973, + "grad_norm": 0.9414121508598328, + "learning_rate": 3.30666702986602e-06, + "loss": 0.5277, + "step": 8165 + }, + { + "epoch": 2.3905152224824358, + "grad_norm": 0.9971603155136108, + "learning_rate": 3.306302542327777e-06, + "loss": 0.5736, + "step": 8166 + }, + { + "epoch": 2.390807962529274, + "grad_norm": 0.9795928001403809, + "learning_rate": 3.3059380356598514e-06, + "loss": 0.5575, + "step": 8167 + }, + { + "epoch": 2.3911007025761126, + "grad_norm": 1.066114068031311, + "learning_rate": 3.3055735098708906e-06, + "loss": 0.5399, + "step": 8168 + }, + { + "epoch": 2.3913934426229506, + "grad_norm": 1.0272760391235352, + "learning_rate": 3.3052089649695436e-06, + "loss": 0.5083, + "step": 8169 + }, + { + "epoch": 2.391686182669789, + "grad_norm": 0.9873193502426147, + "learning_rate": 3.304844400964459e-06, + "loss": 0.608, + "step": 8170 + }, + { + "epoch": 2.3919789227166275, + "grad_norm": 0.9318877458572388, + "learning_rate": 3.304479817864287e-06, + "loss": 0.5451, + "step": 8171 + }, + { + "epoch": 2.392271662763466, + "grad_norm": 1.0263928174972534, + "learning_rate": 3.3041152156776764e-06, + "loss": 0.5778, + "step": 8172 + }, + { + "epoch": 2.3925644028103044, + "grad_norm": 0.9506439566612244, + "learning_rate": 3.3037505944132777e-06, + "loss": 0.5462, + "step": 8173 + }, + { + "epoch": 2.392857142857143, + "grad_norm": 0.9732587933540344, + "learning_rate": 3.303385954079742e-06, + "loss": 0.6123, + "step": 8174 + }, + { + "epoch": 2.3931498829039812, + "grad_norm": 0.974742591381073, + "learning_rate": 3.3030212946857197e-06, + "loss": 0.5659, + "step": 8175 + }, + { + "epoch": 2.3934426229508197, + "grad_norm": 0.9726960062980652, + "learning_rate": 3.302656616239863e-06, + "loss": 0.5746, + "step": 8176 + }, + { + "epoch": 2.393735362997658, + "grad_norm": 0.938836395740509, + "learning_rate": 3.302291918750824e-06, + "loss": 0.6134, + "step": 8177 + }, + { + "epoch": 2.3940281030444965, + "grad_norm": 0.9479103088378906, + "learning_rate": 3.3019272022272554e-06, + "loss": 0.5567, + "step": 8178 + }, + { + "epoch": 2.394320843091335, + "grad_norm": 0.9420146346092224, + "learning_rate": 3.30156246667781e-06, + "loss": 0.5654, + "step": 8179 + }, + { + "epoch": 2.3946135831381734, + "grad_norm": 0.9861853122711182, + "learning_rate": 3.3011977121111405e-06, + "loss": 0.5889, + "step": 8180 + }, + { + "epoch": 2.394906323185012, + "grad_norm": 0.9521893858909607, + "learning_rate": 3.3008329385359017e-06, + "loss": 0.5774, + "step": 8181 + }, + { + "epoch": 2.3951990632318503, + "grad_norm": 0.9524110555648804, + "learning_rate": 3.3004681459607475e-06, + "loss": 0.5859, + "step": 8182 + }, + { + "epoch": 2.3954918032786887, + "grad_norm": 1.0034728050231934, + "learning_rate": 3.3001033343943324e-06, + "loss": 0.5635, + "step": 8183 + }, + { + "epoch": 2.3957845433255267, + "grad_norm": 1.0461962223052979, + "learning_rate": 3.299738503845312e-06, + "loss": 0.5987, + "step": 8184 + }, + { + "epoch": 2.3960772833723656, + "grad_norm": 0.9734176397323608, + "learning_rate": 3.2993736543223427e-06, + "loss": 0.5941, + "step": 8185 + }, + { + "epoch": 2.3963700234192036, + "grad_norm": 0.9920011162757874, + "learning_rate": 3.2990087858340798e-06, + "loss": 0.5589, + "step": 8186 + }, + { + "epoch": 2.396662763466042, + "grad_norm": 1.024135708808899, + "learning_rate": 3.2986438983891793e-06, + "loss": 0.614, + "step": 8187 + }, + { + "epoch": 2.3969555035128804, + "grad_norm": 1.003268837928772, + "learning_rate": 3.298278991996299e-06, + "loss": 0.5856, + "step": 8188 + }, + { + "epoch": 2.397248243559719, + "grad_norm": 0.998653769493103, + "learning_rate": 3.2979140666640965e-06, + "loss": 0.5946, + "step": 8189 + }, + { + "epoch": 2.3975409836065573, + "grad_norm": 1.0402742624282837, + "learning_rate": 3.2975491224012296e-06, + "loss": 0.5966, + "step": 8190 + }, + { + "epoch": 2.3978337236533958, + "grad_norm": 0.9683092832565308, + "learning_rate": 3.297184159216356e-06, + "loss": 0.5585, + "step": 8191 + }, + { + "epoch": 2.398126463700234, + "grad_norm": 0.9506076574325562, + "learning_rate": 3.2968191771181356e-06, + "loss": 0.5577, + "step": 8192 + }, + { + "epoch": 2.3984192037470726, + "grad_norm": 0.9533050656318665, + "learning_rate": 3.296454176115227e-06, + "loss": 0.5774, + "step": 8193 + }, + { + "epoch": 2.398711943793911, + "grad_norm": 0.9739707708358765, + "learning_rate": 3.2960891562162896e-06, + "loss": 0.5454, + "step": 8194 + }, + { + "epoch": 2.3990046838407495, + "grad_norm": 1.0385735034942627, + "learning_rate": 3.2957241174299846e-06, + "loss": 0.624, + "step": 8195 + }, + { + "epoch": 2.399297423887588, + "grad_norm": 1.0084000825881958, + "learning_rate": 3.295359059764972e-06, + "loss": 0.5853, + "step": 8196 + }, + { + "epoch": 2.3995901639344264, + "grad_norm": 0.934702455997467, + "learning_rate": 3.294993983229912e-06, + "loss": 0.6024, + "step": 8197 + }, + { + "epoch": 2.399882903981265, + "grad_norm": 0.9693577885627747, + "learning_rate": 3.294628887833468e-06, + "loss": 0.5626, + "step": 8198 + }, + { + "epoch": 2.400175644028103, + "grad_norm": 1.0090161561965942, + "learning_rate": 3.294263773584301e-06, + "loss": 0.5688, + "step": 8199 + }, + { + "epoch": 2.4004683840749417, + "grad_norm": 0.9909380674362183, + "learning_rate": 3.2938986404910734e-06, + "loss": 0.5795, + "step": 8200 + }, + { + "epoch": 2.4007611241217797, + "grad_norm": 0.985207736492157, + "learning_rate": 3.293533488562447e-06, + "loss": 0.6016, + "step": 8201 + }, + { + "epoch": 2.401053864168618, + "grad_norm": 1.0395801067352295, + "learning_rate": 3.293168317807087e-06, + "loss": 0.5553, + "step": 8202 + }, + { + "epoch": 2.4013466042154565, + "grad_norm": 0.9651257395744324, + "learning_rate": 3.2928031282336565e-06, + "loss": 0.5421, + "step": 8203 + }, + { + "epoch": 2.401639344262295, + "grad_norm": 1.0181257724761963, + "learning_rate": 3.292437919850819e-06, + "loss": 0.5811, + "step": 8204 + }, + { + "epoch": 2.4019320843091334, + "grad_norm": 0.979194700717926, + "learning_rate": 3.29207269266724e-06, + "loss": 0.5341, + "step": 8205 + }, + { + "epoch": 2.402224824355972, + "grad_norm": 0.9364225268363953, + "learning_rate": 3.2917074466915836e-06, + "loss": 0.5592, + "step": 8206 + }, + { + "epoch": 2.4025175644028103, + "grad_norm": 0.9514002203941345, + "learning_rate": 3.2913421819325165e-06, + "loss": 0.5362, + "step": 8207 + }, + { + "epoch": 2.4028103044496487, + "grad_norm": 0.9526752829551697, + "learning_rate": 3.2909768983987032e-06, + "loss": 0.5365, + "step": 8208 + }, + { + "epoch": 2.403103044496487, + "grad_norm": 0.9619974493980408, + "learning_rate": 3.2906115960988115e-06, + "loss": 0.5613, + "step": 8209 + }, + { + "epoch": 2.4033957845433256, + "grad_norm": 0.9572023153305054, + "learning_rate": 3.2902462750415085e-06, + "loss": 0.5501, + "step": 8210 + }, + { + "epoch": 2.403688524590164, + "grad_norm": 1.034271478652954, + "learning_rate": 3.28988093523546e-06, + "loss": 0.5752, + "step": 8211 + }, + { + "epoch": 2.4039812646370025, + "grad_norm": 1.007509469985962, + "learning_rate": 3.289515576689335e-06, + "loss": 0.5914, + "step": 8212 + }, + { + "epoch": 2.404274004683841, + "grad_norm": 1.0448912382125854, + "learning_rate": 3.289150199411801e-06, + "loss": 0.5542, + "step": 8213 + }, + { + "epoch": 2.4045667447306793, + "grad_norm": 0.9948554635047913, + "learning_rate": 3.2887848034115265e-06, + "loss": 0.5537, + "step": 8214 + }, + { + "epoch": 2.4048594847775178, + "grad_norm": 0.9615487456321716, + "learning_rate": 3.2884193886971816e-06, + "loss": 0.5476, + "step": 8215 + }, + { + "epoch": 2.4051522248243558, + "grad_norm": 1.019536018371582, + "learning_rate": 3.288053955277435e-06, + "loss": 0.5399, + "step": 8216 + }, + { + "epoch": 2.405444964871194, + "grad_norm": 0.9846368432044983, + "learning_rate": 3.287688503160957e-06, + "loss": 0.5879, + "step": 8217 + }, + { + "epoch": 2.4057377049180326, + "grad_norm": 1.0517654418945312, + "learning_rate": 3.2873230323564175e-06, + "loss": 0.6111, + "step": 8218 + }, + { + "epoch": 2.406030444964871, + "grad_norm": 0.9895825386047363, + "learning_rate": 3.286957542872488e-06, + "loss": 0.5746, + "step": 8219 + }, + { + "epoch": 2.4063231850117095, + "grad_norm": 0.9769183993339539, + "learning_rate": 3.2865920347178402e-06, + "loss": 0.5663, + "step": 8220 + }, + { + "epoch": 2.406615925058548, + "grad_norm": 0.9291290044784546, + "learning_rate": 3.2862265079011453e-06, + "loss": 0.5645, + "step": 8221 + }, + { + "epoch": 2.4069086651053864, + "grad_norm": 0.9783119559288025, + "learning_rate": 3.285860962431075e-06, + "loss": 0.5181, + "step": 8222 + }, + { + "epoch": 2.407201405152225, + "grad_norm": 0.9675142168998718, + "learning_rate": 3.285495398316302e-06, + "loss": 0.5384, + "step": 8223 + }, + { + "epoch": 2.4074941451990632, + "grad_norm": 0.9622794985771179, + "learning_rate": 3.2851298155655003e-06, + "loss": 0.5941, + "step": 8224 + }, + { + "epoch": 2.4077868852459017, + "grad_norm": 0.9537856578826904, + "learning_rate": 3.2847642141873432e-06, + "loss": 0.5646, + "step": 8225 + }, + { + "epoch": 2.40807962529274, + "grad_norm": 0.9895748496055603, + "learning_rate": 3.2843985941905037e-06, + "loss": 0.5764, + "step": 8226 + }, + { + "epoch": 2.4083723653395785, + "grad_norm": 1.0110787153244019, + "learning_rate": 3.284032955583657e-06, + "loss": 0.5861, + "step": 8227 + }, + { + "epoch": 2.408665105386417, + "grad_norm": 0.9296993017196655, + "learning_rate": 3.283667298375478e-06, + "loss": 0.5608, + "step": 8228 + }, + { + "epoch": 2.4089578454332554, + "grad_norm": 0.9434669017791748, + "learning_rate": 3.2833016225746413e-06, + "loss": 0.5434, + "step": 8229 + }, + { + "epoch": 2.409250585480094, + "grad_norm": 0.9952657222747803, + "learning_rate": 3.2829359281898237e-06, + "loss": 0.6076, + "step": 8230 + }, + { + "epoch": 2.409543325526932, + "grad_norm": 1.0261270999908447, + "learning_rate": 3.2825702152297007e-06, + "loss": 0.5877, + "step": 8231 + }, + { + "epoch": 2.4098360655737707, + "grad_norm": 0.9417607188224792, + "learning_rate": 3.2822044837029488e-06, + "loss": 0.5503, + "step": 8232 + }, + { + "epoch": 2.4101288056206087, + "grad_norm": 1.0252951383590698, + "learning_rate": 3.281838733618245e-06, + "loss": 0.56, + "step": 8233 + }, + { + "epoch": 2.410421545667447, + "grad_norm": 0.9586654901504517, + "learning_rate": 3.281472964984267e-06, + "loss": 0.5807, + "step": 8234 + }, + { + "epoch": 2.4107142857142856, + "grad_norm": 1.0235825777053833, + "learning_rate": 3.2811071778096935e-06, + "loss": 0.6157, + "step": 8235 + }, + { + "epoch": 2.411007025761124, + "grad_norm": 0.9347360730171204, + "learning_rate": 3.2807413721032016e-06, + "loss": 0.5277, + "step": 8236 + }, + { + "epoch": 2.4112997658079625, + "grad_norm": 1.0398297309875488, + "learning_rate": 3.2803755478734704e-06, + "loss": 0.5736, + "step": 8237 + }, + { + "epoch": 2.411592505854801, + "grad_norm": 0.9533687829971313, + "learning_rate": 3.28000970512918e-06, + "loss": 0.5683, + "step": 8238 + }, + { + "epoch": 2.4118852459016393, + "grad_norm": 1.0013916492462158, + "learning_rate": 3.2796438438790095e-06, + "loss": 0.537, + "step": 8239 + }, + { + "epoch": 2.4121779859484778, + "grad_norm": 1.001854658126831, + "learning_rate": 3.2792779641316384e-06, + "loss": 0.5508, + "step": 8240 + }, + { + "epoch": 2.412470725995316, + "grad_norm": 1.0048874616622925, + "learning_rate": 3.278912065895749e-06, + "loss": 0.5921, + "step": 8241 + }, + { + "epoch": 2.4127634660421546, + "grad_norm": 1.001360297203064, + "learning_rate": 3.2785461491800204e-06, + "loss": 0.5314, + "step": 8242 + }, + { + "epoch": 2.413056206088993, + "grad_norm": 1.0004886388778687, + "learning_rate": 3.2781802139931346e-06, + "loss": 0.6014, + "step": 8243 + }, + { + "epoch": 2.4133489461358315, + "grad_norm": 1.0372982025146484, + "learning_rate": 3.277814260343774e-06, + "loss": 0.5692, + "step": 8244 + }, + { + "epoch": 2.41364168618267, + "grad_norm": 0.9948405027389526, + "learning_rate": 3.2774482882406213e-06, + "loss": 0.5781, + "step": 8245 + }, + { + "epoch": 2.4139344262295084, + "grad_norm": 0.9956378936767578, + "learning_rate": 3.277082297692358e-06, + "loss": 0.5895, + "step": 8246 + }, + { + "epoch": 2.414227166276347, + "grad_norm": 0.9916720986366272, + "learning_rate": 3.276716288707667e-06, + "loss": 0.5702, + "step": 8247 + }, + { + "epoch": 2.414519906323185, + "grad_norm": 0.9829858541488647, + "learning_rate": 3.2763502612952337e-06, + "loss": 0.5661, + "step": 8248 + }, + { + "epoch": 2.4148126463700232, + "grad_norm": 0.9499753713607788, + "learning_rate": 3.2759842154637413e-06, + "loss": 0.5055, + "step": 8249 + }, + { + "epoch": 2.4151053864168617, + "grad_norm": 0.9393008947372437, + "learning_rate": 3.2756181512218743e-06, + "loss": 0.5547, + "step": 8250 + }, + { + "epoch": 2.4153981264637, + "grad_norm": 0.9338517189025879, + "learning_rate": 3.275252068578318e-06, + "loss": 0.5576, + "step": 8251 + }, + { + "epoch": 2.4156908665105385, + "grad_norm": 0.9631519317626953, + "learning_rate": 3.2748859675417566e-06, + "loss": 0.5585, + "step": 8252 + }, + { + "epoch": 2.415983606557377, + "grad_norm": 0.9919712543487549, + "learning_rate": 3.2745198481208773e-06, + "loss": 0.5683, + "step": 8253 + }, + { + "epoch": 2.4162763466042154, + "grad_norm": 1.0505605936050415, + "learning_rate": 3.2741537103243653e-06, + "loss": 0.572, + "step": 8254 + }, + { + "epoch": 2.416569086651054, + "grad_norm": 0.9831565618515015, + "learning_rate": 3.2737875541609086e-06, + "loss": 0.5243, + "step": 8255 + }, + { + "epoch": 2.4168618266978923, + "grad_norm": 1.0624027252197266, + "learning_rate": 3.2734213796391927e-06, + "loss": 0.5781, + "step": 8256 + }, + { + "epoch": 2.4171545667447307, + "grad_norm": 1.0172719955444336, + "learning_rate": 3.2730551867679073e-06, + "loss": 0.5748, + "step": 8257 + }, + { + "epoch": 2.417447306791569, + "grad_norm": 1.614904761314392, + "learning_rate": 3.272688975555738e-06, + "loss": 0.5342, + "step": 8258 + }, + { + "epoch": 2.4177400468384076, + "grad_norm": 0.9969791769981384, + "learning_rate": 3.2723227460113737e-06, + "loss": 0.5776, + "step": 8259 + }, + { + "epoch": 2.418032786885246, + "grad_norm": 1.0333141088485718, + "learning_rate": 3.2719564981435053e-06, + "loss": 0.6115, + "step": 8260 + }, + { + "epoch": 2.4183255269320845, + "grad_norm": 0.9421793222427368, + "learning_rate": 3.27159023196082e-06, + "loss": 0.5288, + "step": 8261 + }, + { + "epoch": 2.418618266978923, + "grad_norm": 0.992292582988739, + "learning_rate": 3.271223947472008e-06, + "loss": 0.5594, + "step": 8262 + }, + { + "epoch": 2.418911007025761, + "grad_norm": 1.0106185674667358, + "learning_rate": 3.27085764468576e-06, + "loss": 0.6158, + "step": 8263 + }, + { + "epoch": 2.4192037470725998, + "grad_norm": 0.9329013824462891, + "learning_rate": 3.270491323610766e-06, + "loss": 0.5239, + "step": 8264 + }, + { + "epoch": 2.4194964871194378, + "grad_norm": 1.0215264558792114, + "learning_rate": 3.2701249842557174e-06, + "loss": 0.6115, + "step": 8265 + }, + { + "epoch": 2.419789227166276, + "grad_norm": 0.9854496717453003, + "learning_rate": 3.2697586266293057e-06, + "loss": 0.4958, + "step": 8266 + }, + { + "epoch": 2.4200819672131146, + "grad_norm": 0.9832554459571838, + "learning_rate": 3.2693922507402226e-06, + "loss": 0.5651, + "step": 8267 + }, + { + "epoch": 2.420374707259953, + "grad_norm": 0.9455676078796387, + "learning_rate": 3.2690258565971605e-06, + "loss": 0.4938, + "step": 8268 + }, + { + "epoch": 2.4206674473067915, + "grad_norm": 1.0379362106323242, + "learning_rate": 3.2686594442088122e-06, + "loss": 0.5798, + "step": 8269 + }, + { + "epoch": 2.42096018735363, + "grad_norm": 0.9758011698722839, + "learning_rate": 3.268293013583871e-06, + "loss": 0.5507, + "step": 8270 + }, + { + "epoch": 2.4212529274004684, + "grad_norm": 1.0223267078399658, + "learning_rate": 3.267926564731031e-06, + "loss": 0.568, + "step": 8271 + }, + { + "epoch": 2.421545667447307, + "grad_norm": 0.9657965898513794, + "learning_rate": 3.267560097658985e-06, + "loss": 0.5507, + "step": 8272 + }, + { + "epoch": 2.4218384074941453, + "grad_norm": 0.9784999489784241, + "learning_rate": 3.267193612376428e-06, + "loss": 0.5639, + "step": 8273 + }, + { + "epoch": 2.4221311475409837, + "grad_norm": 1.0127464532852173, + "learning_rate": 3.266827108892056e-06, + "loss": 0.5783, + "step": 8274 + }, + { + "epoch": 2.422423887587822, + "grad_norm": 1.060477614402771, + "learning_rate": 3.2664605872145626e-06, + "loss": 0.5898, + "step": 8275 + }, + { + "epoch": 2.4227166276346606, + "grad_norm": 0.982363760471344, + "learning_rate": 3.2660940473526457e-06, + "loss": 0.586, + "step": 8276 + }, + { + "epoch": 2.423009367681499, + "grad_norm": 1.0047773122787476, + "learning_rate": 3.2657274893149993e-06, + "loss": 0.5818, + "step": 8277 + }, + { + "epoch": 2.4233021077283374, + "grad_norm": 0.9629112482070923, + "learning_rate": 3.2653609131103215e-06, + "loss": 0.5913, + "step": 8278 + }, + { + "epoch": 2.423594847775176, + "grad_norm": 0.9780883193016052, + "learning_rate": 3.264994318747309e-06, + "loss": 0.5818, + "step": 8279 + }, + { + "epoch": 2.423887587822014, + "grad_norm": 0.997565507888794, + "learning_rate": 3.2646277062346586e-06, + "loss": 0.5383, + "step": 8280 + }, + { + "epoch": 2.4241803278688523, + "grad_norm": 0.9842807054519653, + "learning_rate": 3.2642610755810707e-06, + "loss": 0.5764, + "step": 8281 + }, + { + "epoch": 2.4244730679156907, + "grad_norm": 0.9939247965812683, + "learning_rate": 3.2638944267952404e-06, + "loss": 0.565, + "step": 8282 + }, + { + "epoch": 2.424765807962529, + "grad_norm": 0.9443012475967407, + "learning_rate": 3.2635277598858683e-06, + "loss": 0.5533, + "step": 8283 + }, + { + "epoch": 2.4250585480093676, + "grad_norm": 0.974319338798523, + "learning_rate": 3.263161074861654e-06, + "loss": 0.5584, + "step": 8284 + }, + { + "epoch": 2.425351288056206, + "grad_norm": 1.0360455513000488, + "learning_rate": 3.262794371731296e-06, + "loss": 0.5563, + "step": 8285 + }, + { + "epoch": 2.4256440281030445, + "grad_norm": 0.9843451976776123, + "learning_rate": 3.262427650503495e-06, + "loss": 0.6012, + "step": 8286 + }, + { + "epoch": 2.425936768149883, + "grad_norm": 0.9997661113739014, + "learning_rate": 3.262060911186952e-06, + "loss": 0.6201, + "step": 8287 + }, + { + "epoch": 2.4262295081967213, + "grad_norm": 0.9521046280860901, + "learning_rate": 3.2616941537903668e-06, + "loss": 0.5842, + "step": 8288 + }, + { + "epoch": 2.4265222482435598, + "grad_norm": 1.003819465637207, + "learning_rate": 3.2613273783224417e-06, + "loss": 0.5784, + "step": 8289 + }, + { + "epoch": 2.426814988290398, + "grad_norm": 0.95973140001297, + "learning_rate": 3.2609605847918783e-06, + "loss": 0.5263, + "step": 8290 + }, + { + "epoch": 2.4271077283372366, + "grad_norm": 1.0043903589248657, + "learning_rate": 3.2605937732073794e-06, + "loss": 0.5742, + "step": 8291 + }, + { + "epoch": 2.427400468384075, + "grad_norm": 1.0280847549438477, + "learning_rate": 3.260226943577647e-06, + "loss": 0.6218, + "step": 8292 + }, + { + "epoch": 2.4276932084309135, + "grad_norm": 0.9658049941062927, + "learning_rate": 3.2598600959113843e-06, + "loss": 0.5388, + "step": 8293 + }, + { + "epoch": 2.427985948477752, + "grad_norm": 0.9843700528144836, + "learning_rate": 3.2594932302172937e-06, + "loss": 0.5874, + "step": 8294 + }, + { + "epoch": 2.42827868852459, + "grad_norm": 0.936115026473999, + "learning_rate": 3.259126346504081e-06, + "loss": 0.5497, + "step": 8295 + }, + { + "epoch": 2.4285714285714284, + "grad_norm": 0.9683148860931396, + "learning_rate": 3.2587594447804505e-06, + "loss": 0.5554, + "step": 8296 + }, + { + "epoch": 2.428864168618267, + "grad_norm": 0.971315860748291, + "learning_rate": 3.2583925250551057e-06, + "loss": 0.5916, + "step": 8297 + }, + { + "epoch": 2.4291569086651053, + "grad_norm": 1.0083078145980835, + "learning_rate": 3.258025587336753e-06, + "loss": 0.5674, + "step": 8298 + }, + { + "epoch": 2.4294496487119437, + "grad_norm": 1.0292905569076538, + "learning_rate": 3.257658631634097e-06, + "loss": 0.6022, + "step": 8299 + }, + { + "epoch": 2.429742388758782, + "grad_norm": 1.0149699449539185, + "learning_rate": 3.2572916579558443e-06, + "loss": 0.58, + "step": 8300 + }, + { + "epoch": 2.4300351288056206, + "grad_norm": 0.9605231285095215, + "learning_rate": 3.2569246663107023e-06, + "loss": 0.5397, + "step": 8301 + }, + { + "epoch": 2.430327868852459, + "grad_norm": 1.0025033950805664, + "learning_rate": 3.256557656707377e-06, + "loss": 0.5984, + "step": 8302 + }, + { + "epoch": 2.4306206088992974, + "grad_norm": 0.9835594892501831, + "learning_rate": 3.256190629154576e-06, + "loss": 0.5996, + "step": 8303 + }, + { + "epoch": 2.430913348946136, + "grad_norm": 1.0029226541519165, + "learning_rate": 3.2558235836610062e-06, + "loss": 0.5971, + "step": 8304 + }, + { + "epoch": 2.4312060889929743, + "grad_norm": 0.9761295914649963, + "learning_rate": 3.255456520235377e-06, + "loss": 0.5451, + "step": 8305 + }, + { + "epoch": 2.4314988290398127, + "grad_norm": 0.975797176361084, + "learning_rate": 3.2550894388863975e-06, + "loss": 0.5551, + "step": 8306 + }, + { + "epoch": 2.431791569086651, + "grad_norm": 0.9480080604553223, + "learning_rate": 3.2547223396227757e-06, + "loss": 0.571, + "step": 8307 + }, + { + "epoch": 2.4320843091334896, + "grad_norm": 1.0144805908203125, + "learning_rate": 3.254355222453221e-06, + "loss": 0.6137, + "step": 8308 + }, + { + "epoch": 2.432377049180328, + "grad_norm": 1.0753586292266846, + "learning_rate": 3.2539880873864437e-06, + "loss": 0.589, + "step": 8309 + }, + { + "epoch": 2.432669789227166, + "grad_norm": 0.9863920211791992, + "learning_rate": 3.2536209344311546e-06, + "loss": 0.5425, + "step": 8310 + }, + { + "epoch": 2.432962529274005, + "grad_norm": 0.9986311793327332, + "learning_rate": 3.253253763596064e-06, + "loss": 0.5846, + "step": 8311 + }, + { + "epoch": 2.433255269320843, + "grad_norm": 0.974120020866394, + "learning_rate": 3.252886574889883e-06, + "loss": 0.5982, + "step": 8312 + }, + { + "epoch": 2.4335480093676813, + "grad_norm": 1.0097498893737793, + "learning_rate": 3.2525193683213234e-06, + "loss": 0.5395, + "step": 8313 + }, + { + "epoch": 2.4338407494145198, + "grad_norm": 0.9613322615623474, + "learning_rate": 3.2521521438990968e-06, + "loss": 0.5629, + "step": 8314 + }, + { + "epoch": 2.434133489461358, + "grad_norm": 1.00404953956604, + "learning_rate": 3.2517849016319163e-06, + "loss": 0.5481, + "step": 8315 + }, + { + "epoch": 2.4344262295081966, + "grad_norm": 0.9699861407279968, + "learning_rate": 3.2514176415284955e-06, + "loss": 0.6108, + "step": 8316 + }, + { + "epoch": 2.434718969555035, + "grad_norm": 1.0912405252456665, + "learning_rate": 3.2510503635975455e-06, + "loss": 0.5435, + "step": 8317 + }, + { + "epoch": 2.4350117096018735, + "grad_norm": 0.9551235437393188, + "learning_rate": 3.250683067847782e-06, + "loss": 0.592, + "step": 8318 + }, + { + "epoch": 2.435304449648712, + "grad_norm": 0.9639994502067566, + "learning_rate": 3.250315754287918e-06, + "loss": 0.5576, + "step": 8319 + }, + { + "epoch": 2.4355971896955504, + "grad_norm": 0.9964255690574646, + "learning_rate": 3.2499484229266686e-06, + "loss": 0.5839, + "step": 8320 + }, + { + "epoch": 2.435889929742389, + "grad_norm": 0.9582764506340027, + "learning_rate": 3.2495810737727495e-06, + "loss": 0.5565, + "step": 8321 + }, + { + "epoch": 2.4361826697892273, + "grad_norm": 0.9848564267158508, + "learning_rate": 3.249213706834875e-06, + "loss": 0.5453, + "step": 8322 + }, + { + "epoch": 2.4364754098360657, + "grad_norm": 1.0160887241363525, + "learning_rate": 3.2488463221217612e-06, + "loss": 0.5747, + "step": 8323 + }, + { + "epoch": 2.436768149882904, + "grad_norm": 1.0066529512405396, + "learning_rate": 3.2484789196421244e-06, + "loss": 0.5855, + "step": 8324 + }, + { + "epoch": 2.4370608899297426, + "grad_norm": 1.0264664888381958, + "learning_rate": 3.2481114994046816e-06, + "loss": 0.614, + "step": 8325 + }, + { + "epoch": 2.437353629976581, + "grad_norm": 1.0138272047042847, + "learning_rate": 3.2477440614181504e-06, + "loss": 0.5894, + "step": 8326 + }, + { + "epoch": 2.437646370023419, + "grad_norm": 0.9691046476364136, + "learning_rate": 3.247376605691247e-06, + "loss": 0.576, + "step": 8327 + }, + { + "epoch": 2.4379391100702574, + "grad_norm": 1.0320780277252197, + "learning_rate": 3.24700913223269e-06, + "loss": 0.6, + "step": 8328 + }, + { + "epoch": 2.438231850117096, + "grad_norm": 0.9805174469947815, + "learning_rate": 3.2466416410511986e-06, + "loss": 0.6119, + "step": 8329 + }, + { + "epoch": 2.4385245901639343, + "grad_norm": 0.9382761716842651, + "learning_rate": 3.24627413215549e-06, + "loss": 0.5156, + "step": 8330 + }, + { + "epoch": 2.4388173302107727, + "grad_norm": 1.0106409788131714, + "learning_rate": 3.2459066055542846e-06, + "loss": 0.5181, + "step": 8331 + }, + { + "epoch": 2.439110070257611, + "grad_norm": 1.0040942430496216, + "learning_rate": 3.2455390612563014e-06, + "loss": 0.6178, + "step": 8332 + }, + { + "epoch": 2.4394028103044496, + "grad_norm": 0.9920803308486938, + "learning_rate": 3.245171499270261e-06, + "loss": 0.6201, + "step": 8333 + }, + { + "epoch": 2.439695550351288, + "grad_norm": 0.9550942778587341, + "learning_rate": 3.244803919604883e-06, + "loss": 0.5533, + "step": 8334 + }, + { + "epoch": 2.4399882903981265, + "grad_norm": 1.0149141550064087, + "learning_rate": 3.2444363222688894e-06, + "loss": 0.5541, + "step": 8335 + }, + { + "epoch": 2.440281030444965, + "grad_norm": 0.9883396029472351, + "learning_rate": 3.244068707271002e-06, + "loss": 0.5488, + "step": 8336 + }, + { + "epoch": 2.4405737704918034, + "grad_norm": 1.0209537744522095, + "learning_rate": 3.2437010746199403e-06, + "loss": 0.5874, + "step": 8337 + }, + { + "epoch": 2.440866510538642, + "grad_norm": 1.0156292915344238, + "learning_rate": 3.2433334243244276e-06, + "loss": 0.6009, + "step": 8338 + }, + { + "epoch": 2.4411592505854802, + "grad_norm": 1.0382535457611084, + "learning_rate": 3.242965756393187e-06, + "loss": 0.6, + "step": 8339 + }, + { + "epoch": 2.4414519906323187, + "grad_norm": 0.9695711135864258, + "learning_rate": 3.242598070834941e-06, + "loss": 0.5541, + "step": 8340 + }, + { + "epoch": 2.441744730679157, + "grad_norm": 0.9706944227218628, + "learning_rate": 3.2422303676584137e-06, + "loss": 0.575, + "step": 8341 + }, + { + "epoch": 2.442037470725995, + "grad_norm": 0.9732181429862976, + "learning_rate": 3.241862646872328e-06, + "loss": 0.545, + "step": 8342 + }, + { + "epoch": 2.442330210772834, + "grad_norm": 1.0121486186981201, + "learning_rate": 3.241494908485408e-06, + "loss": 0.5722, + "step": 8343 + }, + { + "epoch": 2.442622950819672, + "grad_norm": 0.9677677750587463, + "learning_rate": 3.241127152506379e-06, + "loss": 0.5495, + "step": 8344 + }, + { + "epoch": 2.4429156908665104, + "grad_norm": 1.0382310152053833, + "learning_rate": 3.2407593789439663e-06, + "loss": 0.6032, + "step": 8345 + }, + { + "epoch": 2.443208430913349, + "grad_norm": 0.9805883765220642, + "learning_rate": 3.2403915878068946e-06, + "loss": 0.5819, + "step": 8346 + }, + { + "epoch": 2.4435011709601873, + "grad_norm": 0.9653574228286743, + "learning_rate": 3.2400237791038914e-06, + "loss": 0.6119, + "step": 8347 + }, + { + "epoch": 2.4437939110070257, + "grad_norm": 0.987451434135437, + "learning_rate": 3.2396559528436804e-06, + "loss": 0.5995, + "step": 8348 + }, + { + "epoch": 2.444086651053864, + "grad_norm": 1.0393164157867432, + "learning_rate": 3.2392881090349905e-06, + "loss": 0.6138, + "step": 8349 + }, + { + "epoch": 2.4443793911007026, + "grad_norm": 1.0613399744033813, + "learning_rate": 3.2389202476865476e-06, + "loss": 0.633, + "step": 8350 + }, + { + "epoch": 2.444672131147541, + "grad_norm": 0.931509256362915, + "learning_rate": 3.2385523688070803e-06, + "loss": 0.5758, + "step": 8351 + }, + { + "epoch": 2.4449648711943794, + "grad_norm": 0.926037073135376, + "learning_rate": 3.2381844724053165e-06, + "loss": 0.57, + "step": 8352 + }, + { + "epoch": 2.445257611241218, + "grad_norm": 0.9929760694503784, + "learning_rate": 3.2378165584899834e-06, + "loss": 0.5829, + "step": 8353 + }, + { + "epoch": 2.4455503512880563, + "grad_norm": 1.0144745111465454, + "learning_rate": 3.237448627069811e-06, + "loss": 0.539, + "step": 8354 + }, + { + "epoch": 2.4458430913348947, + "grad_norm": 0.9787104725837708, + "learning_rate": 3.2370806781535284e-06, + "loss": 0.5744, + "step": 8355 + }, + { + "epoch": 2.446135831381733, + "grad_norm": 0.9567342400550842, + "learning_rate": 3.2367127117498655e-06, + "loss": 0.5836, + "step": 8356 + }, + { + "epoch": 2.4464285714285716, + "grad_norm": 1.083642840385437, + "learning_rate": 3.236344727867552e-06, + "loss": 0.5673, + "step": 8357 + }, + { + "epoch": 2.44672131147541, + "grad_norm": 1.0271590948104858, + "learning_rate": 3.2359767265153176e-06, + "loss": 0.5938, + "step": 8358 + }, + { + "epoch": 2.447014051522248, + "grad_norm": 1.0241382122039795, + "learning_rate": 3.235608707701894e-06, + "loss": 0.5667, + "step": 8359 + }, + { + "epoch": 2.4473067915690865, + "grad_norm": 1.0180085897445679, + "learning_rate": 3.235240671436013e-06, + "loss": 0.5653, + "step": 8360 + }, + { + "epoch": 2.447599531615925, + "grad_norm": 1.0300925970077515, + "learning_rate": 3.2348726177264055e-06, + "loss": 0.5795, + "step": 8361 + }, + { + "epoch": 2.4478922716627634, + "grad_norm": 1.017876148223877, + "learning_rate": 3.2345045465818048e-06, + "loss": 0.5927, + "step": 8362 + }, + { + "epoch": 2.448185011709602, + "grad_norm": 1.0411696434020996, + "learning_rate": 3.2341364580109415e-06, + "loss": 0.5875, + "step": 8363 + }, + { + "epoch": 2.4484777517564402, + "grad_norm": 0.9343277215957642, + "learning_rate": 3.2337683520225504e-06, + "loss": 0.5256, + "step": 8364 + }, + { + "epoch": 2.4487704918032787, + "grad_norm": 0.988804042339325, + "learning_rate": 3.2334002286253642e-06, + "loss": 0.5816, + "step": 8365 + }, + { + "epoch": 2.449063231850117, + "grad_norm": 1.0997767448425293, + "learning_rate": 3.2330320878281156e-06, + "loss": 0.5937, + "step": 8366 + }, + { + "epoch": 2.4493559718969555, + "grad_norm": 1.0451582670211792, + "learning_rate": 3.232663929639541e-06, + "loss": 0.5965, + "step": 8367 + }, + { + "epoch": 2.449648711943794, + "grad_norm": 0.9629536867141724, + "learning_rate": 3.2322957540683738e-06, + "loss": 0.566, + "step": 8368 + }, + { + "epoch": 2.4499414519906324, + "grad_norm": 1.0270518064498901, + "learning_rate": 3.2319275611233486e-06, + "loss": 0.5843, + "step": 8369 + }, + { + "epoch": 2.450234192037471, + "grad_norm": 0.9404316544532776, + "learning_rate": 3.2315593508132014e-06, + "loss": 0.5414, + "step": 8370 + }, + { + "epoch": 2.4505269320843093, + "grad_norm": 1.0387685298919678, + "learning_rate": 3.2311911231466686e-06, + "loss": 0.5671, + "step": 8371 + }, + { + "epoch": 2.4508196721311477, + "grad_norm": 1.0718995332717896, + "learning_rate": 3.230822878132486e-06, + "loss": 0.5743, + "step": 8372 + }, + { + "epoch": 2.451112412177986, + "grad_norm": 0.9575822949409485, + "learning_rate": 3.2304546157793898e-06, + "loss": 0.6049, + "step": 8373 + }, + { + "epoch": 2.451405152224824, + "grad_norm": 0.912695050239563, + "learning_rate": 3.2300863360961177e-06, + "loss": 0.5789, + "step": 8374 + }, + { + "epoch": 2.4516978922716626, + "grad_norm": 1.0545886754989624, + "learning_rate": 3.2297180390914064e-06, + "loss": 0.5911, + "step": 8375 + }, + { + "epoch": 2.451990632318501, + "grad_norm": 0.9418044090270996, + "learning_rate": 3.2293497247739947e-06, + "loss": 0.5469, + "step": 8376 + }, + { + "epoch": 2.4522833723653394, + "grad_norm": 0.9678881168365479, + "learning_rate": 3.2289813931526214e-06, + "loss": 0.5498, + "step": 8377 + }, + { + "epoch": 2.452576112412178, + "grad_norm": 1.0203630924224854, + "learning_rate": 3.2286130442360242e-06, + "loss": 0.6013, + "step": 8378 + }, + { + "epoch": 2.4528688524590163, + "grad_norm": 1.0100808143615723, + "learning_rate": 3.228244678032942e-06, + "loss": 0.5684, + "step": 8379 + }, + { + "epoch": 2.4531615925058547, + "grad_norm": 0.9995232820510864, + "learning_rate": 3.2278762945521145e-06, + "loss": 0.5531, + "step": 8380 + }, + { + "epoch": 2.453454332552693, + "grad_norm": 0.9702305793762207, + "learning_rate": 3.2275078938022835e-06, + "loss": 0.5286, + "step": 8381 + }, + { + "epoch": 2.4537470725995316, + "grad_norm": 1.025972843170166, + "learning_rate": 3.227139475792187e-06, + "loss": 0.589, + "step": 8382 + }, + { + "epoch": 2.45403981264637, + "grad_norm": 0.9899046421051025, + "learning_rate": 3.2267710405305676e-06, + "loss": 0.6159, + "step": 8383 + }, + { + "epoch": 2.4543325526932085, + "grad_norm": 0.9778925180435181, + "learning_rate": 3.226402588026165e-06, + "loss": 0.5436, + "step": 8384 + }, + { + "epoch": 2.454625292740047, + "grad_norm": 0.963078498840332, + "learning_rate": 3.2260341182877214e-06, + "loss": 0.5782, + "step": 8385 + }, + { + "epoch": 2.4549180327868854, + "grad_norm": 0.9888131618499756, + "learning_rate": 3.2256656313239785e-06, + "loss": 0.5817, + "step": 8386 + }, + { + "epoch": 2.455210772833724, + "grad_norm": 0.9734917879104614, + "learning_rate": 3.22529712714368e-06, + "loss": 0.5623, + "step": 8387 + }, + { + "epoch": 2.4555035128805622, + "grad_norm": 0.9927727580070496, + "learning_rate": 3.2249286057555675e-06, + "loss": 0.5764, + "step": 8388 + }, + { + "epoch": 2.4557962529274002, + "grad_norm": 0.9451578259468079, + "learning_rate": 3.224560067168384e-06, + "loss": 0.5562, + "step": 8389 + }, + { + "epoch": 2.456088992974239, + "grad_norm": 1.006401777267456, + "learning_rate": 3.224191511390874e-06, + "loss": 0.5498, + "step": 8390 + }, + { + "epoch": 2.456381733021077, + "grad_norm": 0.9841427803039551, + "learning_rate": 3.223822938431781e-06, + "loss": 0.5407, + "step": 8391 + }, + { + "epoch": 2.4566744730679155, + "grad_norm": 0.9864225387573242, + "learning_rate": 3.2234543482998502e-06, + "loss": 0.5772, + "step": 8392 + }, + { + "epoch": 2.456967213114754, + "grad_norm": 0.9557392597198486, + "learning_rate": 3.2230857410038264e-06, + "loss": 0.5838, + "step": 8393 + }, + { + "epoch": 2.4572599531615924, + "grad_norm": 1.0130184888839722, + "learning_rate": 3.2227171165524534e-06, + "loss": 0.5465, + "step": 8394 + }, + { + "epoch": 2.457552693208431, + "grad_norm": 0.9472577571868896, + "learning_rate": 3.2223484749544786e-06, + "loss": 0.5514, + "step": 8395 + }, + { + "epoch": 2.4578454332552693, + "grad_norm": 0.997222363948822, + "learning_rate": 3.221979816218647e-06, + "loss": 0.5779, + "step": 8396 + }, + { + "epoch": 2.4581381733021077, + "grad_norm": 0.9963099360466003, + "learning_rate": 3.2216111403537065e-06, + "loss": 0.5519, + "step": 8397 + }, + { + "epoch": 2.458430913348946, + "grad_norm": 1.0038349628448486, + "learning_rate": 3.2212424473684023e-06, + "loss": 0.5965, + "step": 8398 + }, + { + "epoch": 2.4587236533957846, + "grad_norm": 0.9708025455474854, + "learning_rate": 3.220873737271482e-06, + "loss": 0.5737, + "step": 8399 + }, + { + "epoch": 2.459016393442623, + "grad_norm": 0.9788583517074585, + "learning_rate": 3.220505010071695e-06, + "loss": 0.5672, + "step": 8400 + }, + { + "epoch": 2.4593091334894615, + "grad_norm": 0.9924021363258362, + "learning_rate": 3.220136265777787e-06, + "loss": 0.5754, + "step": 8401 + }, + { + "epoch": 2.4596018735363, + "grad_norm": 0.9661285281181335, + "learning_rate": 3.2197675043985095e-06, + "loss": 0.592, + "step": 8402 + }, + { + "epoch": 2.4598946135831383, + "grad_norm": 0.9825810790061951, + "learning_rate": 3.2193987259426083e-06, + "loss": 0.5708, + "step": 8403 + }, + { + "epoch": 2.4601873536299768, + "grad_norm": 1.0098730325698853, + "learning_rate": 3.2190299304188345e-06, + "loss": 0.5696, + "step": 8404 + }, + { + "epoch": 2.460480093676815, + "grad_norm": 1.006103277206421, + "learning_rate": 3.218661117835937e-06, + "loss": 0.6152, + "step": 8405 + }, + { + "epoch": 2.460772833723653, + "grad_norm": 0.9668680429458618, + "learning_rate": 3.2182922882026663e-06, + "loss": 0.5314, + "step": 8406 + }, + { + "epoch": 2.4610655737704916, + "grad_norm": 1.0397429466247559, + "learning_rate": 3.217923441527774e-06, + "loss": 0.61, + "step": 8407 + }, + { + "epoch": 2.46135831381733, + "grad_norm": 1.0634502172470093, + "learning_rate": 3.2175545778200094e-06, + "loss": 0.5452, + "step": 8408 + }, + { + "epoch": 2.4616510538641685, + "grad_norm": 0.9544469118118286, + "learning_rate": 3.2171856970881244e-06, + "loss": 0.5917, + "step": 8409 + }, + { + "epoch": 2.461943793911007, + "grad_norm": 1.0178593397140503, + "learning_rate": 3.216816799340872e-06, + "loss": 0.5811, + "step": 8410 + }, + { + "epoch": 2.4622365339578454, + "grad_norm": 0.9735410213470459, + "learning_rate": 3.2164478845870018e-06, + "loss": 0.5669, + "step": 8411 + }, + { + "epoch": 2.462529274004684, + "grad_norm": 0.8973817229270935, + "learning_rate": 3.2160789528352692e-06, + "loss": 0.5259, + "step": 8412 + }, + { + "epoch": 2.4628220140515222, + "grad_norm": 0.9978737831115723, + "learning_rate": 3.2157100040944254e-06, + "loss": 0.5686, + "step": 8413 + }, + { + "epoch": 2.4631147540983607, + "grad_norm": 1.0427287817001343, + "learning_rate": 3.2153410383732243e-06, + "loss": 0.5576, + "step": 8414 + }, + { + "epoch": 2.463407494145199, + "grad_norm": 1.0155625343322754, + "learning_rate": 3.214972055680419e-06, + "loss": 0.564, + "step": 8415 + }, + { + "epoch": 2.4637002341920375, + "grad_norm": 0.9645121693611145, + "learning_rate": 3.2146030560247653e-06, + "loss": 0.5759, + "step": 8416 + }, + { + "epoch": 2.463992974238876, + "grad_norm": 1.0029692649841309, + "learning_rate": 3.214234039415016e-06, + "loss": 0.6019, + "step": 8417 + }, + { + "epoch": 2.4642857142857144, + "grad_norm": 0.9474651217460632, + "learning_rate": 3.2138650058599276e-06, + "loss": 0.5563, + "step": 8418 + }, + { + "epoch": 2.464578454332553, + "grad_norm": 0.9896385073661804, + "learning_rate": 3.213495955368255e-06, + "loss": 0.5903, + "step": 8419 + }, + { + "epoch": 2.4648711943793913, + "grad_norm": 0.9917644262313843, + "learning_rate": 3.213126887948753e-06, + "loss": 0.5604, + "step": 8420 + }, + { + "epoch": 2.4651639344262293, + "grad_norm": 0.9820916056632996, + "learning_rate": 3.212757803610179e-06, + "loss": 0.5893, + "step": 8421 + }, + { + "epoch": 2.465456674473068, + "grad_norm": 1.0119341611862183, + "learning_rate": 3.2123887023612894e-06, + "loss": 0.5876, + "step": 8422 + }, + { + "epoch": 2.465749414519906, + "grad_norm": 1.023709774017334, + "learning_rate": 3.2120195842108403e-06, + "loss": 0.5999, + "step": 8423 + }, + { + "epoch": 2.4660421545667446, + "grad_norm": 1.014430284500122, + "learning_rate": 3.21165044916759e-06, + "loss": 0.5462, + "step": 8424 + }, + { + "epoch": 2.466334894613583, + "grad_norm": 1.0028942823410034, + "learning_rate": 3.2112812972402964e-06, + "loss": 0.5945, + "step": 8425 + }, + { + "epoch": 2.4666276346604215, + "grad_norm": 0.955335795879364, + "learning_rate": 3.2109121284377165e-06, + "loss": 0.5601, + "step": 8426 + }, + { + "epoch": 2.46692037470726, + "grad_norm": 0.9630577564239502, + "learning_rate": 3.21054294276861e-06, + "loss": 0.5397, + "step": 8427 + }, + { + "epoch": 2.4672131147540983, + "grad_norm": 1.0228619575500488, + "learning_rate": 3.2101737402417367e-06, + "loss": 0.5984, + "step": 8428 + }, + { + "epoch": 2.4675058548009368, + "grad_norm": 1.0201832056045532, + "learning_rate": 3.2098045208658545e-06, + "loss": 0.5913, + "step": 8429 + }, + { + "epoch": 2.467798594847775, + "grad_norm": 1.0120502710342407, + "learning_rate": 3.2094352846497236e-06, + "loss": 0.6017, + "step": 8430 + }, + { + "epoch": 2.4680913348946136, + "grad_norm": 0.9797620177268982, + "learning_rate": 3.2090660316021037e-06, + "loss": 0.5798, + "step": 8431 + }, + { + "epoch": 2.468384074941452, + "grad_norm": 0.9737810492515564, + "learning_rate": 3.208696761731756e-06, + "loss": 0.5394, + "step": 8432 + }, + { + "epoch": 2.4686768149882905, + "grad_norm": 0.9581058621406555, + "learning_rate": 3.2083274750474424e-06, + "loss": 0.579, + "step": 8433 + }, + { + "epoch": 2.468969555035129, + "grad_norm": 0.9561278820037842, + "learning_rate": 3.207958171557923e-06, + "loss": 0.5423, + "step": 8434 + }, + { + "epoch": 2.4692622950819674, + "grad_norm": 0.9810628890991211, + "learning_rate": 3.20758885127196e-06, + "loss": 0.5836, + "step": 8435 + }, + { + "epoch": 2.469555035128806, + "grad_norm": 0.9567561745643616, + "learning_rate": 3.2072195141983152e-06, + "loss": 0.5568, + "step": 8436 + }, + { + "epoch": 2.4698477751756442, + "grad_norm": 0.9605334997177124, + "learning_rate": 3.206850160345752e-06, + "loss": 0.5994, + "step": 8437 + }, + { + "epoch": 2.4701405152224822, + "grad_norm": 1.0196784734725952, + "learning_rate": 3.206480789723033e-06, + "loss": 0.6002, + "step": 8438 + }, + { + "epoch": 2.4704332552693207, + "grad_norm": 1.1545056104660034, + "learning_rate": 3.2061114023389213e-06, + "loss": 0.5611, + "step": 8439 + }, + { + "epoch": 2.470725995316159, + "grad_norm": 0.9630811214447021, + "learning_rate": 3.20574199820218e-06, + "loss": 0.5315, + "step": 8440 + }, + { + "epoch": 2.4710187353629975, + "grad_norm": 0.9645237922668457, + "learning_rate": 3.2053725773215754e-06, + "loss": 0.5573, + "step": 8441 + }, + { + "epoch": 2.471311475409836, + "grad_norm": 0.976962685585022, + "learning_rate": 3.2050031397058705e-06, + "loss": 0.5546, + "step": 8442 + }, + { + "epoch": 2.4716042154566744, + "grad_norm": 0.9711899161338806, + "learning_rate": 3.204633685363831e-06, + "loss": 0.5816, + "step": 8443 + }, + { + "epoch": 2.471896955503513, + "grad_norm": 1.0427842140197754, + "learning_rate": 3.204264214304222e-06, + "loss": 0.5726, + "step": 8444 + }, + { + "epoch": 2.4721896955503513, + "grad_norm": 1.0089126825332642, + "learning_rate": 3.2038947265358086e-06, + "loss": 0.5686, + "step": 8445 + }, + { + "epoch": 2.4724824355971897, + "grad_norm": 0.9719372987747192, + "learning_rate": 3.2035252220673575e-06, + "loss": 0.5848, + "step": 8446 + }, + { + "epoch": 2.472775175644028, + "grad_norm": 0.9827622771263123, + "learning_rate": 3.203155700907635e-06, + "loss": 0.5654, + "step": 8447 + }, + { + "epoch": 2.4730679156908666, + "grad_norm": 1.01365065574646, + "learning_rate": 3.20278616306541e-06, + "loss": 0.5896, + "step": 8448 + }, + { + "epoch": 2.473360655737705, + "grad_norm": 0.9750059247016907, + "learning_rate": 3.2024166085494475e-06, + "loss": 0.5779, + "step": 8449 + }, + { + "epoch": 2.4736533957845435, + "grad_norm": 1.0260155200958252, + "learning_rate": 3.2020470373685155e-06, + "loss": 0.6041, + "step": 8450 + }, + { + "epoch": 2.473946135831382, + "grad_norm": 1.0423083305358887, + "learning_rate": 3.2016774495313828e-06, + "loss": 0.5773, + "step": 8451 + }, + { + "epoch": 2.4742388758782203, + "grad_norm": 0.910812497138977, + "learning_rate": 3.201307845046818e-06, + "loss": 0.5859, + "step": 8452 + }, + { + "epoch": 2.4745316159250583, + "grad_norm": 0.992379367351532, + "learning_rate": 3.2009382239235903e-06, + "loss": 0.581, + "step": 8453 + }, + { + "epoch": 2.474824355971897, + "grad_norm": 1.0056205987930298, + "learning_rate": 3.2005685861704684e-06, + "loss": 0.5695, + "step": 8454 + }, + { + "epoch": 2.475117096018735, + "grad_norm": 1.0303982496261597, + "learning_rate": 3.2001989317962223e-06, + "loss": 0.5749, + "step": 8455 + }, + { + "epoch": 2.4754098360655736, + "grad_norm": 0.9829675555229187, + "learning_rate": 3.1998292608096217e-06, + "loss": 0.5473, + "step": 8456 + }, + { + "epoch": 2.475702576112412, + "grad_norm": 0.9691380858421326, + "learning_rate": 3.199459573219437e-06, + "loss": 0.5383, + "step": 8457 + }, + { + "epoch": 2.4759953161592505, + "grad_norm": 1.004169225692749, + "learning_rate": 3.1990898690344403e-06, + "loss": 0.598, + "step": 8458 + }, + { + "epoch": 2.476288056206089, + "grad_norm": 1.0358755588531494, + "learning_rate": 3.198720148263401e-06, + "loss": 0.5693, + "step": 8459 + }, + { + "epoch": 2.4765807962529274, + "grad_norm": 0.9833822846412659, + "learning_rate": 3.198350410915093e-06, + "loss": 0.5472, + "step": 8460 + }, + { + "epoch": 2.476873536299766, + "grad_norm": 1.0099372863769531, + "learning_rate": 3.197980656998287e-06, + "loss": 0.5505, + "step": 8461 + }, + { + "epoch": 2.4771662763466042, + "grad_norm": 1.0023140907287598, + "learning_rate": 3.1976108865217558e-06, + "loss": 0.6237, + "step": 8462 + }, + { + "epoch": 2.4774590163934427, + "grad_norm": 0.9669394493103027, + "learning_rate": 3.1972410994942727e-06, + "loss": 0.5383, + "step": 8463 + }, + { + "epoch": 2.477751756440281, + "grad_norm": 1.0484448671340942, + "learning_rate": 3.19687129592461e-06, + "loss": 0.5848, + "step": 8464 + }, + { + "epoch": 2.4780444964871196, + "grad_norm": 1.0499433279037476, + "learning_rate": 3.1965014758215425e-06, + "loss": 0.6019, + "step": 8465 + }, + { + "epoch": 2.478337236533958, + "grad_norm": 0.951470136642456, + "learning_rate": 3.196131639193843e-06, + "loss": 0.5965, + "step": 8466 + }, + { + "epoch": 2.4786299765807964, + "grad_norm": 1.0628681182861328, + "learning_rate": 3.1957617860502867e-06, + "loss": 0.5972, + "step": 8467 + }, + { + "epoch": 2.4789227166276344, + "grad_norm": 0.9993698000907898, + "learning_rate": 3.195391916399649e-06, + "loss": 0.5947, + "step": 8468 + }, + { + "epoch": 2.4792154566744733, + "grad_norm": 1.0327318906784058, + "learning_rate": 3.195022030250704e-06, + "loss": 0.6079, + "step": 8469 + }, + { + "epoch": 2.4795081967213113, + "grad_norm": 0.9316467642784119, + "learning_rate": 3.194652127612228e-06, + "loss": 0.5364, + "step": 8470 + }, + { + "epoch": 2.4798009367681497, + "grad_norm": 1.012568473815918, + "learning_rate": 3.194282208492997e-06, + "loss": 0.574, + "step": 8471 + }, + { + "epoch": 2.480093676814988, + "grad_norm": 0.9624841213226318, + "learning_rate": 3.193912272901787e-06, + "loss": 0.5527, + "step": 8472 + }, + { + "epoch": 2.4803864168618266, + "grad_norm": 1.003438115119934, + "learning_rate": 3.193542320847375e-06, + "loss": 0.5527, + "step": 8473 + }, + { + "epoch": 2.480679156908665, + "grad_norm": 1.0073264837265015, + "learning_rate": 3.1931723523385377e-06, + "loss": 0.5826, + "step": 8474 + }, + { + "epoch": 2.4809718969555035, + "grad_norm": 0.9411847591400146, + "learning_rate": 3.1928023673840536e-06, + "loss": 0.5529, + "step": 8475 + }, + { + "epoch": 2.481264637002342, + "grad_norm": 0.9661310315132141, + "learning_rate": 3.1924323659927003e-06, + "loss": 0.5464, + "step": 8476 + }, + { + "epoch": 2.4815573770491803, + "grad_norm": 0.9712892174720764, + "learning_rate": 3.192062348173256e-06, + "loss": 0.5507, + "step": 8477 + }, + { + "epoch": 2.4818501170960188, + "grad_norm": 0.9953422546386719, + "learning_rate": 3.1916923139344995e-06, + "loss": 0.5404, + "step": 8478 + }, + { + "epoch": 2.482142857142857, + "grad_norm": 0.9730989336967468, + "learning_rate": 3.19132226328521e-06, + "loss": 0.5643, + "step": 8479 + }, + { + "epoch": 2.4824355971896956, + "grad_norm": 0.9777683019638062, + "learning_rate": 3.190952196234167e-06, + "loss": 0.5383, + "step": 8480 + }, + { + "epoch": 2.482728337236534, + "grad_norm": 0.9974219799041748, + "learning_rate": 3.1905821127901504e-06, + "loss": 0.5966, + "step": 8481 + }, + { + "epoch": 2.4830210772833725, + "grad_norm": 1.0436924695968628, + "learning_rate": 3.19021201296194e-06, + "loss": 0.5928, + "step": 8482 + }, + { + "epoch": 2.483313817330211, + "grad_norm": 0.9788968563079834, + "learning_rate": 3.189841896758318e-06, + "loss": 0.5683, + "step": 8483 + }, + { + "epoch": 2.4836065573770494, + "grad_norm": 0.9108318090438843, + "learning_rate": 3.1894717641880635e-06, + "loss": 0.5417, + "step": 8484 + }, + { + "epoch": 2.4838992974238874, + "grad_norm": 0.9680758118629456, + "learning_rate": 3.1891016152599596e-06, + "loss": 0.5881, + "step": 8485 + }, + { + "epoch": 2.484192037470726, + "grad_norm": 1.0460073947906494, + "learning_rate": 3.188731449982787e-06, + "loss": 0.5919, + "step": 8486 + }, + { + "epoch": 2.4844847775175642, + "grad_norm": 1.0418542623519897, + "learning_rate": 3.1883612683653286e-06, + "loss": 0.5381, + "step": 8487 + }, + { + "epoch": 2.4847775175644027, + "grad_norm": 0.993129312992096, + "learning_rate": 3.1879910704163673e-06, + "loss": 0.5386, + "step": 8488 + }, + { + "epoch": 2.485070257611241, + "grad_norm": 0.9633527994155884, + "learning_rate": 3.187620856144685e-06, + "loss": 0.5617, + "step": 8489 + }, + { + "epoch": 2.4853629976580796, + "grad_norm": 1.0061993598937988, + "learning_rate": 3.1872506255590666e-06, + "loss": 0.5865, + "step": 8490 + }, + { + "epoch": 2.485655737704918, + "grad_norm": 0.9840618968009949, + "learning_rate": 3.1868803786682943e-06, + "loss": 0.5553, + "step": 8491 + }, + { + "epoch": 2.4859484777517564, + "grad_norm": 0.9738926887512207, + "learning_rate": 3.1865101154811535e-06, + "loss": 0.5662, + "step": 8492 + }, + { + "epoch": 2.486241217798595, + "grad_norm": 1.0121805667877197, + "learning_rate": 3.186139836006428e-06, + "loss": 0.5612, + "step": 8493 + }, + { + "epoch": 2.4865339578454333, + "grad_norm": 0.9510138630867004, + "learning_rate": 3.1857695402529034e-06, + "loss": 0.562, + "step": 8494 + }, + { + "epoch": 2.4868266978922717, + "grad_norm": 1.0032466650009155, + "learning_rate": 3.1853992282293644e-06, + "loss": 0.5901, + "step": 8495 + }, + { + "epoch": 2.48711943793911, + "grad_norm": 1.0039101839065552, + "learning_rate": 3.185028899944597e-06, + "loss": 0.5947, + "step": 8496 + }, + { + "epoch": 2.4874121779859486, + "grad_norm": 1.0345734357833862, + "learning_rate": 3.184658555407387e-06, + "loss": 0.605, + "step": 8497 + }, + { + "epoch": 2.487704918032787, + "grad_norm": 1.00588059425354, + "learning_rate": 3.1842881946265226e-06, + "loss": 0.5901, + "step": 8498 + }, + { + "epoch": 2.4879976580796255, + "grad_norm": 0.9747905731201172, + "learning_rate": 3.183917817610788e-06, + "loss": 0.5318, + "step": 8499 + }, + { + "epoch": 2.4882903981264635, + "grad_norm": 0.999417781829834, + "learning_rate": 3.1835474243689724e-06, + "loss": 0.5762, + "step": 8500 + }, + { + "epoch": 2.4885831381733023, + "grad_norm": 0.9718658924102783, + "learning_rate": 3.183177014909862e-06, + "loss": 0.5743, + "step": 8501 + }, + { + "epoch": 2.4888758782201403, + "grad_norm": 0.9797403216362, + "learning_rate": 3.1828065892422467e-06, + "loss": 0.5868, + "step": 8502 + }, + { + "epoch": 2.4891686182669788, + "grad_norm": 0.9508155584335327, + "learning_rate": 3.182436147374914e-06, + "loss": 0.5442, + "step": 8503 + }, + { + "epoch": 2.489461358313817, + "grad_norm": 1.0183136463165283, + "learning_rate": 3.1820656893166518e-06, + "loss": 0.551, + "step": 8504 + }, + { + "epoch": 2.4897540983606556, + "grad_norm": 0.9652585387229919, + "learning_rate": 3.1816952150762504e-06, + "loss": 0.5872, + "step": 8505 + }, + { + "epoch": 2.490046838407494, + "grad_norm": 0.9864011406898499, + "learning_rate": 3.1813247246624995e-06, + "loss": 0.5786, + "step": 8506 + }, + { + "epoch": 2.4903395784543325, + "grad_norm": 0.9886832237243652, + "learning_rate": 3.1809542180841877e-06, + "loss": 0.579, + "step": 8507 + }, + { + "epoch": 2.490632318501171, + "grad_norm": 0.9923737049102783, + "learning_rate": 3.180583695350107e-06, + "loss": 0.5567, + "step": 8508 + }, + { + "epoch": 2.4909250585480094, + "grad_norm": 1.0336837768554688, + "learning_rate": 3.1802131564690474e-06, + "loss": 0.5707, + "step": 8509 + }, + { + "epoch": 2.491217798594848, + "grad_norm": 0.9992858171463013, + "learning_rate": 3.1798426014497998e-06, + "loss": 0.5738, + "step": 8510 + }, + { + "epoch": 2.4915105386416863, + "grad_norm": 1.0506426095962524, + "learning_rate": 3.179472030301155e-06, + "loss": 0.6088, + "step": 8511 + }, + { + "epoch": 2.4918032786885247, + "grad_norm": 0.9882312417030334, + "learning_rate": 3.179101443031907e-06, + "loss": 0.5825, + "step": 8512 + }, + { + "epoch": 2.492096018735363, + "grad_norm": 1.0270415544509888, + "learning_rate": 3.178730839650845e-06, + "loss": 0.5642, + "step": 8513 + }, + { + "epoch": 2.4923887587822016, + "grad_norm": 1.0187088251113892, + "learning_rate": 3.1783602201667656e-06, + "loss": 0.5546, + "step": 8514 + }, + { + "epoch": 2.49268149882904, + "grad_norm": 1.0019376277923584, + "learning_rate": 3.177989584588459e-06, + "loss": 0.5609, + "step": 8515 + }, + { + "epoch": 2.4929742388758784, + "grad_norm": 0.9769514203071594, + "learning_rate": 3.1776189329247185e-06, + "loss": 0.557, + "step": 8516 + }, + { + "epoch": 2.4932669789227164, + "grad_norm": 0.9779874086380005, + "learning_rate": 3.177248265184339e-06, + "loss": 0.5456, + "step": 8517 + }, + { + "epoch": 2.493559718969555, + "grad_norm": 0.9586823582649231, + "learning_rate": 3.1768775813761143e-06, + "loss": 0.5515, + "step": 8518 + }, + { + "epoch": 2.4938524590163933, + "grad_norm": 0.9887434840202332, + "learning_rate": 3.1765068815088397e-06, + "loss": 0.5714, + "step": 8519 + }, + { + "epoch": 2.4941451990632317, + "grad_norm": 0.9581917524337769, + "learning_rate": 3.176136165591308e-06, + "loss": 0.5406, + "step": 8520 + }, + { + "epoch": 2.49443793911007, + "grad_norm": 0.9222782850265503, + "learning_rate": 3.175765433632317e-06, + "loss": 0.5194, + "step": 8521 + }, + { + "epoch": 2.4947306791569086, + "grad_norm": 1.0043301582336426, + "learning_rate": 3.1753946856406604e-06, + "loss": 0.5745, + "step": 8522 + }, + { + "epoch": 2.495023419203747, + "grad_norm": 0.946382462978363, + "learning_rate": 3.175023921625136e-06, + "loss": 0.5565, + "step": 8523 + }, + { + "epoch": 2.4953161592505855, + "grad_norm": 0.9516444802284241, + "learning_rate": 3.174653141594539e-06, + "loss": 0.5411, + "step": 8524 + }, + { + "epoch": 2.495608899297424, + "grad_norm": 1.0164711475372314, + "learning_rate": 3.1742823455576666e-06, + "loss": 0.5618, + "step": 8525 + }, + { + "epoch": 2.4959016393442623, + "grad_norm": 0.9761397838592529, + "learning_rate": 3.173911533523316e-06, + "loss": 0.5432, + "step": 8526 + }, + { + "epoch": 2.496194379391101, + "grad_norm": 1.0298237800598145, + "learning_rate": 3.1735407055002843e-06, + "loss": 0.6178, + "step": 8527 + }, + { + "epoch": 2.496487119437939, + "grad_norm": 1.0328218936920166, + "learning_rate": 3.1731698614973703e-06, + "loss": 0.5654, + "step": 8528 + }, + { + "epoch": 2.4967798594847777, + "grad_norm": 0.9744928479194641, + "learning_rate": 3.1727990015233728e-06, + "loss": 0.5675, + "step": 8529 + }, + { + "epoch": 2.497072599531616, + "grad_norm": 1.0139483213424683, + "learning_rate": 3.172428125587089e-06, + "loss": 0.5599, + "step": 8530 + }, + { + "epoch": 2.4973653395784545, + "grad_norm": 0.9833117723464966, + "learning_rate": 3.172057233697318e-06, + "loss": 0.5718, + "step": 8531 + }, + { + "epoch": 2.4976580796252925, + "grad_norm": 0.9930822849273682, + "learning_rate": 3.171686325862861e-06, + "loss": 0.5345, + "step": 8532 + }, + { + "epoch": 2.4979508196721314, + "grad_norm": 1.0498443841934204, + "learning_rate": 3.171315402092516e-06, + "loss": 0.5698, + "step": 8533 + }, + { + "epoch": 2.4982435597189694, + "grad_norm": 1.0185546875, + "learning_rate": 3.170944462395086e-06, + "loss": 0.5669, + "step": 8534 + }, + { + "epoch": 2.498536299765808, + "grad_norm": 1.0140048265457153, + "learning_rate": 3.170573506779368e-06, + "loss": 0.5662, + "step": 8535 + }, + { + "epoch": 2.4988290398126463, + "grad_norm": 1.0010110139846802, + "learning_rate": 3.1702025352541655e-06, + "loss": 0.5656, + "step": 8536 + }, + { + "epoch": 2.4991217798594847, + "grad_norm": 1.0174888372421265, + "learning_rate": 3.1698315478282783e-06, + "loss": 0.5658, + "step": 8537 + }, + { + "epoch": 2.499414519906323, + "grad_norm": 0.9952043890953064, + "learning_rate": 3.1694605445105092e-06, + "loss": 0.5979, + "step": 8538 + }, + { + "epoch": 2.4997072599531616, + "grad_norm": 1.0683293342590332, + "learning_rate": 3.169089525309661e-06, + "loss": 0.5224, + "step": 8539 + }, + { + "epoch": 2.5, + "grad_norm": 0.9861575961112976, + "learning_rate": 3.1687184902345343e-06, + "loss": 0.5711, + "step": 8540 + }, + { + "epoch": 2.5002927400468384, + "grad_norm": 0.9792702794075012, + "learning_rate": 3.1683474392939328e-06, + "loss": 0.5306, + "step": 8541 + }, + { + "epoch": 2.500585480093677, + "grad_norm": 1.0699180364608765, + "learning_rate": 3.167976372496661e-06, + "loss": 0.6116, + "step": 8542 + }, + { + "epoch": 2.5008782201405153, + "grad_norm": 0.9919843077659607, + "learning_rate": 3.167605289851521e-06, + "loss": 0.5629, + "step": 8543 + }, + { + "epoch": 2.5011709601873537, + "grad_norm": 0.9504328966140747, + "learning_rate": 3.167234191367317e-06, + "loss": 0.5621, + "step": 8544 + }, + { + "epoch": 2.501463700234192, + "grad_norm": 0.9750006794929504, + "learning_rate": 3.1668630770528535e-06, + "loss": 0.5718, + "step": 8545 + }, + { + "epoch": 2.5017564402810306, + "grad_norm": 0.9542937874794006, + "learning_rate": 3.1664919469169355e-06, + "loss": 0.5893, + "step": 8546 + }, + { + "epoch": 2.5020491803278686, + "grad_norm": 0.9311875700950623, + "learning_rate": 3.1661208009683676e-06, + "loss": 0.5414, + "step": 8547 + }, + { + "epoch": 2.5023419203747075, + "grad_norm": 0.9331035017967224, + "learning_rate": 3.165749639215956e-06, + "loss": 0.5327, + "step": 8548 + }, + { + "epoch": 2.5026346604215455, + "grad_norm": 0.9902022480964661, + "learning_rate": 3.1653784616685067e-06, + "loss": 0.5763, + "step": 8549 + }, + { + "epoch": 2.502927400468384, + "grad_norm": 0.9684141874313354, + "learning_rate": 3.1650072683348252e-06, + "loss": 0.5209, + "step": 8550 + }, + { + "epoch": 2.5032201405152223, + "grad_norm": 0.9963178038597107, + "learning_rate": 3.1646360592237185e-06, + "loss": 0.5571, + "step": 8551 + }, + { + "epoch": 2.503512880562061, + "grad_norm": 0.9368748068809509, + "learning_rate": 3.1642648343439937e-06, + "loss": 0.5389, + "step": 8552 + }, + { + "epoch": 2.503805620608899, + "grad_norm": 0.9842060804367065, + "learning_rate": 3.163893593704458e-06, + "loss": 0.5951, + "step": 8553 + }, + { + "epoch": 2.5040983606557377, + "grad_norm": 1.0142284631729126, + "learning_rate": 3.1635223373139196e-06, + "loss": 0.5595, + "step": 8554 + }, + { + "epoch": 2.504391100702576, + "grad_norm": 1.0103951692581177, + "learning_rate": 3.1631510651811857e-06, + "loss": 0.5442, + "step": 8555 + }, + { + "epoch": 2.5046838407494145, + "grad_norm": 0.9061046838760376, + "learning_rate": 3.162779777315065e-06, + "loss": 0.5238, + "step": 8556 + }, + { + "epoch": 2.504976580796253, + "grad_norm": 0.9936938285827637, + "learning_rate": 3.1624084737243675e-06, + "loss": 0.5497, + "step": 8557 + }, + { + "epoch": 2.5052693208430914, + "grad_norm": 1.003836750984192, + "learning_rate": 3.1620371544179018e-06, + "loss": 0.5928, + "step": 8558 + }, + { + "epoch": 2.50556206088993, + "grad_norm": 1.0595965385437012, + "learning_rate": 3.1616658194044776e-06, + "loss": 0.5692, + "step": 8559 + }, + { + "epoch": 2.5058548009367683, + "grad_norm": 0.9824920892715454, + "learning_rate": 3.1612944686929043e-06, + "loss": 0.5651, + "step": 8560 + }, + { + "epoch": 2.5061475409836067, + "grad_norm": 1.0222305059432983, + "learning_rate": 3.160923102291993e-06, + "loss": 0.5524, + "step": 8561 + }, + { + "epoch": 2.5064402810304447, + "grad_norm": 1.0365296602249146, + "learning_rate": 3.1605517202105534e-06, + "loss": 0.5694, + "step": 8562 + }, + { + "epoch": 2.5067330210772836, + "grad_norm": 0.9430036544799805, + "learning_rate": 3.160180322457398e-06, + "loss": 0.5394, + "step": 8563 + }, + { + "epoch": 2.5070257611241216, + "grad_norm": 0.9920965433120728, + "learning_rate": 3.1598089090413376e-06, + "loss": 0.5627, + "step": 8564 + }, + { + "epoch": 2.5073185011709604, + "grad_norm": 0.9692972302436829, + "learning_rate": 3.1594374799711837e-06, + "loss": 0.5973, + "step": 8565 + }, + { + "epoch": 2.5076112412177984, + "grad_norm": 1.017171859741211, + "learning_rate": 3.159066035255749e-06, + "loss": 0.613, + "step": 8566 + }, + { + "epoch": 2.507903981264637, + "grad_norm": 1.100646734237671, + "learning_rate": 3.1586945749038455e-06, + "loss": 0.618, + "step": 8567 + }, + { + "epoch": 2.5081967213114753, + "grad_norm": 1.0123745203018188, + "learning_rate": 3.1583230989242875e-06, + "loss": 0.5684, + "step": 8568 + }, + { + "epoch": 2.5084894613583137, + "grad_norm": 0.9355922937393188, + "learning_rate": 3.1579516073258874e-06, + "loss": 0.5618, + "step": 8569 + }, + { + "epoch": 2.508782201405152, + "grad_norm": 0.9367645978927612, + "learning_rate": 3.1575801001174588e-06, + "loss": 0.5824, + "step": 8570 + }, + { + "epoch": 2.5090749414519906, + "grad_norm": 0.9412235021591187, + "learning_rate": 3.157208577307816e-06, + "loss": 0.5689, + "step": 8571 + }, + { + "epoch": 2.509367681498829, + "grad_norm": 0.952691912651062, + "learning_rate": 3.1568370389057733e-06, + "loss": 0.523, + "step": 8572 + }, + { + "epoch": 2.5096604215456675, + "grad_norm": 1.0625890493392944, + "learning_rate": 3.1564654849201454e-06, + "loss": 0.5491, + "step": 8573 + }, + { + "epoch": 2.509953161592506, + "grad_norm": 0.9526789784431458, + "learning_rate": 3.1560939153597487e-06, + "loss": 0.5497, + "step": 8574 + }, + { + "epoch": 2.5102459016393444, + "grad_norm": 0.9848624467849731, + "learning_rate": 3.1557223302333974e-06, + "loss": 0.5825, + "step": 8575 + }, + { + "epoch": 2.510538641686183, + "grad_norm": 0.9608529806137085, + "learning_rate": 3.1553507295499075e-06, + "loss": 0.5676, + "step": 8576 + }, + { + "epoch": 2.5108313817330212, + "grad_norm": 1.1355825662612915, + "learning_rate": 3.154979113318095e-06, + "loss": 0.6014, + "step": 8577 + }, + { + "epoch": 2.5111241217798597, + "grad_norm": 1.0257573127746582, + "learning_rate": 3.1546074815467786e-06, + "loss": 0.5793, + "step": 8578 + }, + { + "epoch": 2.5114168618266977, + "grad_norm": 0.9542521238327026, + "learning_rate": 3.154235834244773e-06, + "loss": 0.5377, + "step": 8579 + }, + { + "epoch": 2.5117096018735365, + "grad_norm": 0.9715947508811951, + "learning_rate": 3.153864171420898e-06, + "loss": 0.5245, + "step": 8580 + }, + { + "epoch": 2.5120023419203745, + "grad_norm": 0.999555766582489, + "learning_rate": 3.1534924930839682e-06, + "loss": 0.5895, + "step": 8581 + }, + { + "epoch": 2.512295081967213, + "grad_norm": 0.9542290568351746, + "learning_rate": 3.153120799242804e-06, + "loss": 0.5542, + "step": 8582 + }, + { + "epoch": 2.5125878220140514, + "grad_norm": 1.0555391311645508, + "learning_rate": 3.1527490899062236e-06, + "loss": 0.5832, + "step": 8583 + }, + { + "epoch": 2.51288056206089, + "grad_norm": 1.0257068872451782, + "learning_rate": 3.1523773650830457e-06, + "loss": 0.5862, + "step": 8584 + }, + { + "epoch": 2.5131733021077283, + "grad_norm": 1.0060551166534424, + "learning_rate": 3.152005624782089e-06, + "loss": 0.5888, + "step": 8585 + }, + { + "epoch": 2.5134660421545667, + "grad_norm": 0.9646438956260681, + "learning_rate": 3.151633869012174e-06, + "loss": 0.5541, + "step": 8586 + }, + { + "epoch": 2.513758782201405, + "grad_norm": 1.0169105529785156, + "learning_rate": 3.1512620977821196e-06, + "loss": 0.5482, + "step": 8587 + }, + { + "epoch": 2.5140515222482436, + "grad_norm": 0.9705615043640137, + "learning_rate": 3.1508903111007472e-06, + "loss": 0.5226, + "step": 8588 + }, + { + "epoch": 2.514344262295082, + "grad_norm": 0.9832040667533875, + "learning_rate": 3.1505185089768776e-06, + "loss": 0.597, + "step": 8589 + }, + { + "epoch": 2.5146370023419204, + "grad_norm": 0.9710444211959839, + "learning_rate": 3.150146691419331e-06, + "loss": 0.5506, + "step": 8590 + }, + { + "epoch": 2.514929742388759, + "grad_norm": 0.9893074631690979, + "learning_rate": 3.149774858436929e-06, + "loss": 0.5689, + "step": 8591 + }, + { + "epoch": 2.5152224824355973, + "grad_norm": 1.006612777709961, + "learning_rate": 3.149403010038494e-06, + "loss": 0.6078, + "step": 8592 + }, + { + "epoch": 2.5155152224824358, + "grad_norm": 0.9827683568000793, + "learning_rate": 3.149031146232847e-06, + "loss": 0.598, + "step": 8593 + }, + { + "epoch": 2.5158079625292737, + "grad_norm": 1.036858081817627, + "learning_rate": 3.148659267028813e-06, + "loss": 0.5719, + "step": 8594 + }, + { + "epoch": 2.5161007025761126, + "grad_norm": 0.9681839346885681, + "learning_rate": 3.148287372435212e-06, + "loss": 0.5485, + "step": 8595 + }, + { + "epoch": 2.5163934426229506, + "grad_norm": 1.1981080770492554, + "learning_rate": 3.1479154624608693e-06, + "loss": 0.5722, + "step": 8596 + }, + { + "epoch": 2.5166861826697895, + "grad_norm": 1.0090739727020264, + "learning_rate": 3.147543537114607e-06, + "loss": 0.582, + "step": 8597 + }, + { + "epoch": 2.5169789227166275, + "grad_norm": 0.9557317495346069, + "learning_rate": 3.1471715964052506e-06, + "loss": 0.5656, + "step": 8598 + }, + { + "epoch": 2.517271662763466, + "grad_norm": 0.9972891807556152, + "learning_rate": 3.1467996403416233e-06, + "loss": 0.5619, + "step": 8599 + }, + { + "epoch": 2.5175644028103044, + "grad_norm": 1.0399980545043945, + "learning_rate": 3.1464276689325507e-06, + "loss": 0.5566, + "step": 8600 + }, + { + "epoch": 2.517857142857143, + "grad_norm": 1.0204464197158813, + "learning_rate": 3.146055682186857e-06, + "loss": 0.5932, + "step": 8601 + }, + { + "epoch": 2.5181498829039812, + "grad_norm": 0.9957433342933655, + "learning_rate": 3.1456836801133676e-06, + "loss": 0.6156, + "step": 8602 + }, + { + "epoch": 2.5184426229508197, + "grad_norm": 1.0196166038513184, + "learning_rate": 3.1453116627209097e-06, + "loss": 0.5964, + "step": 8603 + }, + { + "epoch": 2.518735362997658, + "grad_norm": 0.9926854968070984, + "learning_rate": 3.1449396300183083e-06, + "loss": 0.5479, + "step": 8604 + }, + { + "epoch": 2.5190281030444965, + "grad_norm": 1.0232324600219727, + "learning_rate": 3.1445675820143905e-06, + "loss": 0.5842, + "step": 8605 + }, + { + "epoch": 2.519320843091335, + "grad_norm": 0.9848755598068237, + "learning_rate": 3.1441955187179828e-06, + "loss": 0.5646, + "step": 8606 + }, + { + "epoch": 2.5196135831381734, + "grad_norm": 1.031462550163269, + "learning_rate": 3.1438234401379124e-06, + "loss": 0.599, + "step": 8607 + }, + { + "epoch": 2.519906323185012, + "grad_norm": 1.0846112966537476, + "learning_rate": 3.1434513462830072e-06, + "loss": 0.589, + "step": 8608 + }, + { + "epoch": 2.5201990632318503, + "grad_norm": 0.9492926597595215, + "learning_rate": 3.1430792371620954e-06, + "loss": 0.5667, + "step": 8609 + }, + { + "epoch": 2.5204918032786887, + "grad_norm": 1.0113887786865234, + "learning_rate": 3.142707112784006e-06, + "loss": 0.6038, + "step": 8610 + }, + { + "epoch": 2.5207845433255267, + "grad_norm": 0.9866676926612854, + "learning_rate": 3.142334973157566e-06, + "loss": 0.5765, + "step": 8611 + }, + { + "epoch": 2.5210772833723656, + "grad_norm": 0.9961693286895752, + "learning_rate": 3.1419628182916052e-06, + "loss": 0.5873, + "step": 8612 + }, + { + "epoch": 2.5213700234192036, + "grad_norm": 0.9355848431587219, + "learning_rate": 3.1415906481949536e-06, + "loss": 0.5787, + "step": 8613 + }, + { + "epoch": 2.521662763466042, + "grad_norm": 0.9354445338249207, + "learning_rate": 3.141218462876441e-06, + "loss": 0.5625, + "step": 8614 + }, + { + "epoch": 2.5219555035128804, + "grad_norm": 0.978055477142334, + "learning_rate": 3.1408462623448965e-06, + "loss": 0.5972, + "step": 8615 + }, + { + "epoch": 2.522248243559719, + "grad_norm": 0.9968634247779846, + "learning_rate": 3.1404740466091517e-06, + "loss": 0.582, + "step": 8616 + }, + { + "epoch": 2.5225409836065573, + "grad_norm": 0.986438512802124, + "learning_rate": 3.1401018156780365e-06, + "loss": 0.5679, + "step": 8617 + }, + { + "epoch": 2.5228337236533958, + "grad_norm": 1.0213932991027832, + "learning_rate": 3.139729569560383e-06, + "loss": 0.5561, + "step": 8618 + }, + { + "epoch": 2.523126463700234, + "grad_norm": 1.011160969734192, + "learning_rate": 3.1393573082650223e-06, + "loss": 0.5721, + "step": 8619 + }, + { + "epoch": 2.5234192037470726, + "grad_norm": 1.0018857717514038, + "learning_rate": 3.1389850318007874e-06, + "loss": 0.6022, + "step": 8620 + }, + { + "epoch": 2.523711943793911, + "grad_norm": 0.9732922911643982, + "learning_rate": 3.1386127401765093e-06, + "loss": 0.5585, + "step": 8621 + }, + { + "epoch": 2.5240046838407495, + "grad_norm": 0.9695212841033936, + "learning_rate": 3.1382404334010213e-06, + "loss": 0.6004, + "step": 8622 + }, + { + "epoch": 2.524297423887588, + "grad_norm": 0.9451719522476196, + "learning_rate": 3.1378681114831566e-06, + "loss": 0.5551, + "step": 8623 + }, + { + "epoch": 2.5245901639344264, + "grad_norm": 0.9449236392974854, + "learning_rate": 3.137495774431748e-06, + "loss": 0.5628, + "step": 8624 + }, + { + "epoch": 2.524882903981265, + "grad_norm": 1.0393506288528442, + "learning_rate": 3.1371234222556304e-06, + "loss": 0.5384, + "step": 8625 + }, + { + "epoch": 2.525175644028103, + "grad_norm": 1.0238311290740967, + "learning_rate": 3.1367510549636367e-06, + "loss": 0.6143, + "step": 8626 + }, + { + "epoch": 2.5254683840749417, + "grad_norm": 0.980523943901062, + "learning_rate": 3.1363786725646015e-06, + "loss": 0.5571, + "step": 8627 + }, + { + "epoch": 2.5257611241217797, + "grad_norm": 0.9342960715293884, + "learning_rate": 3.1360062750673604e-06, + "loss": 0.5744, + "step": 8628 + }, + { + "epoch": 2.526053864168618, + "grad_norm": 0.9604939818382263, + "learning_rate": 3.1356338624807483e-06, + "loss": 0.5824, + "step": 8629 + }, + { + "epoch": 2.5263466042154565, + "grad_norm": 0.9685224294662476, + "learning_rate": 3.1352614348136013e-06, + "loss": 0.5564, + "step": 8630 + }, + { + "epoch": 2.526639344262295, + "grad_norm": 1.0117759704589844, + "learning_rate": 3.1348889920747538e-06, + "loss": 0.5822, + "step": 8631 + }, + { + "epoch": 2.5269320843091334, + "grad_norm": 0.9857194423675537, + "learning_rate": 3.1345165342730435e-06, + "loss": 0.6008, + "step": 8632 + }, + { + "epoch": 2.527224824355972, + "grad_norm": 1.0674018859863281, + "learning_rate": 3.1341440614173063e-06, + "loss": 0.5808, + "step": 8633 + }, + { + "epoch": 2.5275175644028103, + "grad_norm": 0.9618774056434631, + "learning_rate": 3.1337715735163793e-06, + "loss": 0.5565, + "step": 8634 + }, + { + "epoch": 2.5278103044496487, + "grad_norm": 0.975678026676178, + "learning_rate": 3.1333990705791005e-06, + "loss": 0.5818, + "step": 8635 + }, + { + "epoch": 2.528103044496487, + "grad_norm": 0.9787783622741699, + "learning_rate": 3.1330265526143065e-06, + "loss": 0.5694, + "step": 8636 + }, + { + "epoch": 2.5283957845433256, + "grad_norm": 0.9917454123497009, + "learning_rate": 3.1326540196308357e-06, + "loss": 0.566, + "step": 8637 + }, + { + "epoch": 2.528688524590164, + "grad_norm": 0.957866907119751, + "learning_rate": 3.1322814716375273e-06, + "loss": 0.5291, + "step": 8638 + }, + { + "epoch": 2.5289812646370025, + "grad_norm": 1.0390745401382446, + "learning_rate": 3.131908908643219e-06, + "loss": 0.581, + "step": 8639 + }, + { + "epoch": 2.529274004683841, + "grad_norm": 0.9825888872146606, + "learning_rate": 3.1315363306567515e-06, + "loss": 0.5767, + "step": 8640 + }, + { + "epoch": 2.529566744730679, + "grad_norm": 0.981236457824707, + "learning_rate": 3.1311637376869618e-06, + "loss": 0.5592, + "step": 8641 + }, + { + "epoch": 2.5298594847775178, + "grad_norm": 1.0203359127044678, + "learning_rate": 3.1307911297426917e-06, + "loss": 0.5796, + "step": 8642 + }, + { + "epoch": 2.5301522248243558, + "grad_norm": 1.0030028820037842, + "learning_rate": 3.130418506832781e-06, + "loss": 0.5592, + "step": 8643 + }, + { + "epoch": 2.5304449648711946, + "grad_norm": 1.0124156475067139, + "learning_rate": 3.13004586896607e-06, + "loss": 0.5806, + "step": 8644 + }, + { + "epoch": 2.5307377049180326, + "grad_norm": 1.0079138278961182, + "learning_rate": 3.1296732161514005e-06, + "loss": 0.5395, + "step": 8645 + }, + { + "epoch": 2.531030444964871, + "grad_norm": 0.9832847118377686, + "learning_rate": 3.129300548397612e-06, + "loss": 0.575, + "step": 8646 + }, + { + "epoch": 2.5313231850117095, + "grad_norm": 0.9585812091827393, + "learning_rate": 3.128927865713547e-06, + "loss": 0.5637, + "step": 8647 + }, + { + "epoch": 2.531615925058548, + "grad_norm": 0.9883798360824585, + "learning_rate": 3.128555168108048e-06, + "loss": 0.5884, + "step": 8648 + }, + { + "epoch": 2.5319086651053864, + "grad_norm": 1.0160468816757202, + "learning_rate": 3.128182455589957e-06, + "loss": 0.5625, + "step": 8649 + }, + { + "epoch": 2.532201405152225, + "grad_norm": 1.025691270828247, + "learning_rate": 3.127809728168116e-06, + "loss": 0.571, + "step": 8650 + }, + { + "epoch": 2.5324941451990632, + "grad_norm": 0.9729428291320801, + "learning_rate": 3.12743698585137e-06, + "loss": 0.5686, + "step": 8651 + }, + { + "epoch": 2.5327868852459017, + "grad_norm": 1.048387050628662, + "learning_rate": 3.12706422864856e-06, + "loss": 0.6061, + "step": 8652 + }, + { + "epoch": 2.53307962529274, + "grad_norm": 0.9711080193519592, + "learning_rate": 3.12669145656853e-06, + "loss": 0.5619, + "step": 8653 + }, + { + "epoch": 2.5333723653395785, + "grad_norm": 0.9596917033195496, + "learning_rate": 3.1263186696201254e-06, + "loss": 0.5909, + "step": 8654 + }, + { + "epoch": 2.533665105386417, + "grad_norm": 0.929180920124054, + "learning_rate": 3.1259458678121903e-06, + "loss": 0.551, + "step": 8655 + }, + { + "epoch": 2.5339578454332554, + "grad_norm": 0.9634730815887451, + "learning_rate": 3.1255730511535687e-06, + "loss": 0.5896, + "step": 8656 + }, + { + "epoch": 2.534250585480094, + "grad_norm": 1.0295422077178955, + "learning_rate": 3.1252002196531067e-06, + "loss": 0.5845, + "step": 8657 + }, + { + "epoch": 2.534543325526932, + "grad_norm": 1.0135667324066162, + "learning_rate": 3.124827373319649e-06, + "loss": 0.588, + "step": 8658 + }, + { + "epoch": 2.5348360655737707, + "grad_norm": 0.9913381338119507, + "learning_rate": 3.1244545121620417e-06, + "loss": 0.5679, + "step": 8659 + }, + { + "epoch": 2.5351288056206087, + "grad_norm": 1.0022324323654175, + "learning_rate": 3.1240816361891325e-06, + "loss": 0.5842, + "step": 8660 + }, + { + "epoch": 2.535421545667447, + "grad_norm": 0.996712327003479, + "learning_rate": 3.1237087454097646e-06, + "loss": 0.6044, + "step": 8661 + }, + { + "epoch": 2.5357142857142856, + "grad_norm": 0.9974764585494995, + "learning_rate": 3.1233358398327883e-06, + "loss": 0.5628, + "step": 8662 + }, + { + "epoch": 2.536007025761124, + "grad_norm": 0.9852837920188904, + "learning_rate": 3.1229629194670487e-06, + "loss": 0.537, + "step": 8663 + }, + { + "epoch": 2.5362997658079625, + "grad_norm": 0.988946795463562, + "learning_rate": 3.122589984321394e-06, + "loss": 0.5623, + "step": 8664 + }, + { + "epoch": 2.536592505854801, + "grad_norm": 1.0119143724441528, + "learning_rate": 3.122217034404673e-06, + "loss": 0.576, + "step": 8665 + }, + { + "epoch": 2.5368852459016393, + "grad_norm": 1.0095630884170532, + "learning_rate": 3.1218440697257336e-06, + "loss": 0.5119, + "step": 8666 + }, + { + "epoch": 2.5371779859484778, + "grad_norm": 0.9789679646492004, + "learning_rate": 3.1214710902934233e-06, + "loss": 0.5724, + "step": 8667 + }, + { + "epoch": 2.537470725995316, + "grad_norm": 1.0160714387893677, + "learning_rate": 3.1210980961165928e-06, + "loss": 0.592, + "step": 8668 + }, + { + "epoch": 2.5377634660421546, + "grad_norm": 1.0188028812408447, + "learning_rate": 3.12072508720409e-06, + "loss": 0.5812, + "step": 8669 + }, + { + "epoch": 2.538056206088993, + "grad_norm": 1.0611103773117065, + "learning_rate": 3.1203520635647657e-06, + "loss": 0.571, + "step": 8670 + }, + { + "epoch": 2.5383489461358315, + "grad_norm": 0.9282464385032654, + "learning_rate": 3.119979025207469e-06, + "loss": 0.5788, + "step": 8671 + }, + { + "epoch": 2.53864168618267, + "grad_norm": 0.9934621453285217, + "learning_rate": 3.119605972141051e-06, + "loss": 0.5189, + "step": 8672 + }, + { + "epoch": 2.538934426229508, + "grad_norm": 1.0330168008804321, + "learning_rate": 3.1192329043743624e-06, + "loss": 0.594, + "step": 8673 + }, + { + "epoch": 2.539227166276347, + "grad_norm": 0.9417957663536072, + "learning_rate": 3.1188598219162535e-06, + "loss": 0.5884, + "step": 8674 + }, + { + "epoch": 2.539519906323185, + "grad_norm": 0.987267255783081, + "learning_rate": 3.1184867247755772e-06, + "loss": 0.5623, + "step": 8675 + }, + { + "epoch": 2.5398126463700237, + "grad_norm": 0.9481030702590942, + "learning_rate": 3.118113612961185e-06, + "loss": 0.5553, + "step": 8676 + }, + { + "epoch": 2.5401053864168617, + "grad_norm": 0.9633530378341675, + "learning_rate": 3.117740486481928e-06, + "loss": 0.551, + "step": 8677 + }, + { + "epoch": 2.5403981264637, + "grad_norm": 1.0311509370803833, + "learning_rate": 3.1173673453466584e-06, + "loss": 0.623, + "step": 8678 + }, + { + "epoch": 2.5406908665105385, + "grad_norm": 0.9523317217826843, + "learning_rate": 3.1169941895642306e-06, + "loss": 0.5351, + "step": 8679 + }, + { + "epoch": 2.540983606557377, + "grad_norm": 1.0217530727386475, + "learning_rate": 3.1166210191434976e-06, + "loss": 0.5898, + "step": 8680 + }, + { + "epoch": 2.5412763466042154, + "grad_norm": 0.9516586661338806, + "learning_rate": 3.116247834093312e-06, + "loss": 0.5764, + "step": 8681 + }, + { + "epoch": 2.541569086651054, + "grad_norm": 0.9512243866920471, + "learning_rate": 3.115874634422528e-06, + "loss": 0.5839, + "step": 8682 + }, + { + "epoch": 2.5418618266978923, + "grad_norm": 0.9796275496482849, + "learning_rate": 3.1155014201400003e-06, + "loss": 0.5593, + "step": 8683 + }, + { + "epoch": 2.5421545667447307, + "grad_norm": 1.026648759841919, + "learning_rate": 3.1151281912545837e-06, + "loss": 0.5764, + "step": 8684 + }, + { + "epoch": 2.542447306791569, + "grad_norm": 1.0081977844238281, + "learning_rate": 3.1147549477751317e-06, + "loss": 0.5443, + "step": 8685 + }, + { + "epoch": 2.5427400468384076, + "grad_norm": 1.0204651355743408, + "learning_rate": 3.114381689710501e-06, + "loss": 0.6009, + "step": 8686 + }, + { + "epoch": 2.543032786885246, + "grad_norm": 1.0240899324417114, + "learning_rate": 3.114008417069546e-06, + "loss": 0.6044, + "step": 8687 + }, + { + "epoch": 2.5433255269320845, + "grad_norm": 1.0055129528045654, + "learning_rate": 3.1136351298611238e-06, + "loss": 0.5565, + "step": 8688 + }, + { + "epoch": 2.543618266978923, + "grad_norm": 1.0073418617248535, + "learning_rate": 3.11326182809409e-06, + "loss": 0.5872, + "step": 8689 + }, + { + "epoch": 2.543911007025761, + "grad_norm": 0.9632744193077087, + "learning_rate": 3.1128885117773016e-06, + "loss": 0.5404, + "step": 8690 + }, + { + "epoch": 2.5442037470725998, + "grad_norm": 1.0449888706207275, + "learning_rate": 3.1125151809196163e-06, + "loss": 0.5929, + "step": 8691 + }, + { + "epoch": 2.5444964871194378, + "grad_norm": 1.0128663778305054, + "learning_rate": 3.11214183552989e-06, + "loss": 0.5896, + "step": 8692 + }, + { + "epoch": 2.544789227166276, + "grad_norm": 1.0371545553207397, + "learning_rate": 3.111768475616981e-06, + "loss": 0.5649, + "step": 8693 + }, + { + "epoch": 2.5450819672131146, + "grad_norm": 0.9597974419593811, + "learning_rate": 3.111395101189747e-06, + "loss": 0.5507, + "step": 8694 + }, + { + "epoch": 2.545374707259953, + "grad_norm": 1.0593973398208618, + "learning_rate": 3.111021712257047e-06, + "loss": 0.5583, + "step": 8695 + }, + { + "epoch": 2.5456674473067915, + "grad_norm": 1.0240877866744995, + "learning_rate": 3.11064830882774e-06, + "loss": 0.5517, + "step": 8696 + }, + { + "epoch": 2.54596018735363, + "grad_norm": 0.989727795124054, + "learning_rate": 3.110274890910684e-06, + "loss": 0.6083, + "step": 8697 + }, + { + "epoch": 2.5462529274004684, + "grad_norm": 0.971893846988678, + "learning_rate": 3.109901458514739e-06, + "loss": 0.5649, + "step": 8698 + }, + { + "epoch": 2.546545667447307, + "grad_norm": 0.9033228754997253, + "learning_rate": 3.1095280116487647e-06, + "loss": 0.5351, + "step": 8699 + }, + { + "epoch": 2.5468384074941453, + "grad_norm": 1.0145328044891357, + "learning_rate": 3.1091545503216213e-06, + "loss": 0.5745, + "step": 8700 + }, + { + "epoch": 2.5471311475409837, + "grad_norm": 0.990061342716217, + "learning_rate": 3.1087810745421696e-06, + "loss": 0.6226, + "step": 8701 + }, + { + "epoch": 2.547423887587822, + "grad_norm": 0.9807667136192322, + "learning_rate": 3.1084075843192697e-06, + "loss": 0.5585, + "step": 8702 + }, + { + "epoch": 2.5477166276346606, + "grad_norm": 0.9663503766059875, + "learning_rate": 3.1080340796617832e-06, + "loss": 0.5626, + "step": 8703 + }, + { + "epoch": 2.548009367681499, + "grad_norm": 1.020571231842041, + "learning_rate": 3.107660560578571e-06, + "loss": 0.5873, + "step": 8704 + }, + { + "epoch": 2.548302107728337, + "grad_norm": 1.0254011154174805, + "learning_rate": 3.107287027078495e-06, + "loss": 0.567, + "step": 8705 + }, + { + "epoch": 2.548594847775176, + "grad_norm": 0.9497807025909424, + "learning_rate": 3.106913479170418e-06, + "loss": 0.5387, + "step": 8706 + }, + { + "epoch": 2.548887587822014, + "grad_norm": 0.9873712062835693, + "learning_rate": 3.106539916863202e-06, + "loss": 0.5713, + "step": 8707 + }, + { + "epoch": 2.5491803278688527, + "grad_norm": 0.9722688794136047, + "learning_rate": 3.10616634016571e-06, + "loss": 0.5796, + "step": 8708 + }, + { + "epoch": 2.5494730679156907, + "grad_norm": 1.0063121318817139, + "learning_rate": 3.1057927490868046e-06, + "loss": 0.5705, + "step": 8709 + }, + { + "epoch": 2.549765807962529, + "grad_norm": 0.9641340970993042, + "learning_rate": 3.1054191436353506e-06, + "loss": 0.5501, + "step": 8710 + }, + { + "epoch": 2.5500585480093676, + "grad_norm": 1.0011228322982788, + "learning_rate": 3.105045523820211e-06, + "loss": 0.535, + "step": 8711 + }, + { + "epoch": 2.550351288056206, + "grad_norm": 0.9924905896186829, + "learning_rate": 3.10467188965025e-06, + "loss": 0.5931, + "step": 8712 + }, + { + "epoch": 2.5506440281030445, + "grad_norm": 0.9625056982040405, + "learning_rate": 3.1042982411343313e-06, + "loss": 0.5609, + "step": 8713 + }, + { + "epoch": 2.550936768149883, + "grad_norm": 0.9833942651748657, + "learning_rate": 3.103924578281322e-06, + "loss": 0.5743, + "step": 8714 + }, + { + "epoch": 2.5512295081967213, + "grad_norm": 0.9990136623382568, + "learning_rate": 3.1035509011000853e-06, + "loss": 0.5652, + "step": 8715 + }, + { + "epoch": 2.5515222482435598, + "grad_norm": 0.9781345725059509, + "learning_rate": 3.1031772095994884e-06, + "loss": 0.569, + "step": 8716 + }, + { + "epoch": 2.551814988290398, + "grad_norm": 1.137799620628357, + "learning_rate": 3.1028035037883953e-06, + "loss": 0.5312, + "step": 8717 + }, + { + "epoch": 2.5521077283372366, + "grad_norm": 0.9675992131233215, + "learning_rate": 3.102429783675674e-06, + "loss": 0.5498, + "step": 8718 + }, + { + "epoch": 2.552400468384075, + "grad_norm": 0.9518214464187622, + "learning_rate": 3.1020560492701896e-06, + "loss": 0.6078, + "step": 8719 + }, + { + "epoch": 2.552693208430913, + "grad_norm": 0.9747027158737183, + "learning_rate": 3.1016823005808105e-06, + "loss": 0.5968, + "step": 8720 + }, + { + "epoch": 2.552985948477752, + "grad_norm": 1.025085210800171, + "learning_rate": 3.101308537616403e-06, + "loss": 0.6019, + "step": 8721 + }, + { + "epoch": 2.55327868852459, + "grad_norm": 1.0157952308654785, + "learning_rate": 3.1009347603858355e-06, + "loss": 0.5511, + "step": 8722 + }, + { + "epoch": 2.553571428571429, + "grad_norm": 1.0010557174682617, + "learning_rate": 3.1005609688979742e-06, + "loss": 0.5962, + "step": 8723 + }, + { + "epoch": 2.553864168618267, + "grad_norm": 0.9807507395744324, + "learning_rate": 3.1001871631616894e-06, + "loss": 0.564, + "step": 8724 + }, + { + "epoch": 2.5541569086651053, + "grad_norm": 0.9725373387336731, + "learning_rate": 3.099813343185848e-06, + "loss": 0.5102, + "step": 8725 + }, + { + "epoch": 2.5544496487119437, + "grad_norm": 1.0521634817123413, + "learning_rate": 3.099439508979321e-06, + "loss": 0.5642, + "step": 8726 + }, + { + "epoch": 2.554742388758782, + "grad_norm": 0.9941195845603943, + "learning_rate": 3.0990656605509755e-06, + "loss": 0.5464, + "step": 8727 + }, + { + "epoch": 2.5550351288056206, + "grad_norm": 1.0480064153671265, + "learning_rate": 3.0986917979096828e-06, + "loss": 0.57, + "step": 8728 + }, + { + "epoch": 2.555327868852459, + "grad_norm": 0.9493387341499329, + "learning_rate": 3.098317921064311e-06, + "loss": 0.5791, + "step": 8729 + }, + { + "epoch": 2.5556206088992974, + "grad_norm": 0.9764227271080017, + "learning_rate": 3.0979440300237325e-06, + "loss": 0.5805, + "step": 8730 + }, + { + "epoch": 2.555913348946136, + "grad_norm": 1.009484887123108, + "learning_rate": 3.0975701247968173e-06, + "loss": 0.5717, + "step": 8731 + }, + { + "epoch": 2.5562060889929743, + "grad_norm": 0.9822878837585449, + "learning_rate": 3.0971962053924353e-06, + "loss": 0.5992, + "step": 8732 + }, + { + "epoch": 2.5564988290398127, + "grad_norm": 1.0184303522109985, + "learning_rate": 3.096822271819459e-06, + "loss": 0.6146, + "step": 8733 + }, + { + "epoch": 2.556791569086651, + "grad_norm": 0.9899926781654358, + "learning_rate": 3.096448324086759e-06, + "loss": 0.5795, + "step": 8734 + }, + { + "epoch": 2.5570843091334896, + "grad_norm": 1.0336273908615112, + "learning_rate": 3.0960743622032086e-06, + "loss": 0.5678, + "step": 8735 + }, + { + "epoch": 2.557377049180328, + "grad_norm": 1.0283071994781494, + "learning_rate": 3.0957003861776798e-06, + "loss": 0.525, + "step": 8736 + }, + { + "epoch": 2.557669789227166, + "grad_norm": 1.0307480096817017, + "learning_rate": 3.0953263960190445e-06, + "loss": 0.5503, + "step": 8737 + }, + { + "epoch": 2.557962529274005, + "grad_norm": 0.996604859828949, + "learning_rate": 3.0949523917361756e-06, + "loss": 0.578, + "step": 8738 + }, + { + "epoch": 2.558255269320843, + "grad_norm": 0.9594244956970215, + "learning_rate": 3.0945783733379477e-06, + "loss": 0.5333, + "step": 8739 + }, + { + "epoch": 2.5585480093676813, + "grad_norm": 0.9828516244888306, + "learning_rate": 3.0942043408332335e-06, + "loss": 0.534, + "step": 8740 + }, + { + "epoch": 2.5588407494145198, + "grad_norm": 0.9878639578819275, + "learning_rate": 3.0938302942309074e-06, + "loss": 0.5663, + "step": 8741 + }, + { + "epoch": 2.559133489461358, + "grad_norm": 0.9612206816673279, + "learning_rate": 3.0934562335398425e-06, + "loss": 0.553, + "step": 8742 + }, + { + "epoch": 2.5594262295081966, + "grad_norm": 1.0429412126541138, + "learning_rate": 3.093082158768915e-06, + "loss": 0.5947, + "step": 8743 + }, + { + "epoch": 2.559718969555035, + "grad_norm": 1.0352234840393066, + "learning_rate": 3.092708069926999e-06, + "loss": 0.6161, + "step": 8744 + }, + { + "epoch": 2.5600117096018735, + "grad_norm": 1.0157663822174072, + "learning_rate": 3.0923339670229704e-06, + "loss": 0.5494, + "step": 8745 + }, + { + "epoch": 2.560304449648712, + "grad_norm": 1.0442285537719727, + "learning_rate": 3.0919598500657044e-06, + "loss": 0.6245, + "step": 8746 + }, + { + "epoch": 2.5605971896955504, + "grad_norm": 1.026260256767273, + "learning_rate": 3.091585719064078e-06, + "loss": 0.5614, + "step": 8747 + }, + { + "epoch": 2.560889929742389, + "grad_norm": 1.0030226707458496, + "learning_rate": 3.091211574026966e-06, + "loss": 0.5849, + "step": 8748 + }, + { + "epoch": 2.5611826697892273, + "grad_norm": 0.9865468144416809, + "learning_rate": 3.090837414963246e-06, + "loss": 0.6134, + "step": 8749 + }, + { + "epoch": 2.5614754098360657, + "grad_norm": 1.018223524093628, + "learning_rate": 3.0904632418817946e-06, + "loss": 0.5697, + "step": 8750 + }, + { + "epoch": 2.561768149882904, + "grad_norm": 1.047448992729187, + "learning_rate": 3.0900890547914898e-06, + "loss": 0.588, + "step": 8751 + }, + { + "epoch": 2.562060889929742, + "grad_norm": 0.9364010691642761, + "learning_rate": 3.089714853701209e-06, + "loss": 0.5272, + "step": 8752 + }, + { + "epoch": 2.562353629976581, + "grad_norm": 1.04702627658844, + "learning_rate": 3.0893406386198295e-06, + "loss": 0.5889, + "step": 8753 + }, + { + "epoch": 2.562646370023419, + "grad_norm": 0.9542171955108643, + "learning_rate": 3.08896640955623e-06, + "loss": 0.5715, + "step": 8754 + }, + { + "epoch": 2.562939110070258, + "grad_norm": 0.9657012224197388, + "learning_rate": 3.08859216651929e-06, + "loss": 0.6002, + "step": 8755 + }, + { + "epoch": 2.563231850117096, + "grad_norm": 0.9173330664634705, + "learning_rate": 3.088217909517887e-06, + "loss": 0.5649, + "step": 8756 + }, + { + "epoch": 2.5635245901639343, + "grad_norm": 1.0278964042663574, + "learning_rate": 3.087843638560902e-06, + "loss": 0.6003, + "step": 8757 + }, + { + "epoch": 2.5638173302107727, + "grad_norm": 0.9737646579742432, + "learning_rate": 3.087469353657213e-06, + "loss": 0.6006, + "step": 8758 + }, + { + "epoch": 2.564110070257611, + "grad_norm": 0.9559490084648132, + "learning_rate": 3.087095054815701e-06, + "loss": 0.5639, + "step": 8759 + }, + { + "epoch": 2.5644028103044496, + "grad_norm": 0.97186279296875, + "learning_rate": 3.0867207420452456e-06, + "loss": 0.5603, + "step": 8760 + }, + { + "epoch": 2.564695550351288, + "grad_norm": 0.9833230972290039, + "learning_rate": 3.0863464153547284e-06, + "loss": 0.5198, + "step": 8761 + }, + { + "epoch": 2.5649882903981265, + "grad_norm": 0.9892507195472717, + "learning_rate": 3.0859720747530295e-06, + "loss": 0.5981, + "step": 8762 + }, + { + "epoch": 2.565281030444965, + "grad_norm": 1.008919596672058, + "learning_rate": 3.0855977202490306e-06, + "loss": 0.6063, + "step": 8763 + }, + { + "epoch": 2.5655737704918034, + "grad_norm": 0.954751193523407, + "learning_rate": 3.0852233518516135e-06, + "loss": 0.5338, + "step": 8764 + }, + { + "epoch": 2.565866510538642, + "grad_norm": 0.9927802085876465, + "learning_rate": 3.08484896956966e-06, + "loss": 0.5496, + "step": 8765 + }, + { + "epoch": 2.5661592505854802, + "grad_norm": 0.9912638068199158, + "learning_rate": 3.0844745734120517e-06, + "loss": 0.5755, + "step": 8766 + }, + { + "epoch": 2.5664519906323187, + "grad_norm": 1.0266685485839844, + "learning_rate": 3.0841001633876727e-06, + "loss": 0.5579, + "step": 8767 + }, + { + "epoch": 2.566744730679157, + "grad_norm": 1.0493155717849731, + "learning_rate": 3.0837257395054043e-06, + "loss": 0.5452, + "step": 8768 + }, + { + "epoch": 2.567037470725995, + "grad_norm": 1.0433067083358765, + "learning_rate": 3.083351301774131e-06, + "loss": 0.5946, + "step": 8769 + }, + { + "epoch": 2.567330210772834, + "grad_norm": 1.031766414642334, + "learning_rate": 3.0829768502027357e-06, + "loss": 0.5503, + "step": 8770 + }, + { + "epoch": 2.567622950819672, + "grad_norm": 0.949423611164093, + "learning_rate": 3.082602384800103e-06, + "loss": 0.5475, + "step": 8771 + }, + { + "epoch": 2.5679156908665104, + "grad_norm": 0.9824818968772888, + "learning_rate": 3.082227905575117e-06, + "loss": 0.5483, + "step": 8772 + }, + { + "epoch": 2.568208430913349, + "grad_norm": 0.9938435554504395, + "learning_rate": 3.0818534125366615e-06, + "loss": 0.5776, + "step": 8773 + }, + { + "epoch": 2.5685011709601873, + "grad_norm": 0.973746120929718, + "learning_rate": 3.0814789056936227e-06, + "loss": 0.617, + "step": 8774 + }, + { + "epoch": 2.5687939110070257, + "grad_norm": 0.9515296816825867, + "learning_rate": 3.081104385054885e-06, + "loss": 0.5647, + "step": 8775 + }, + { + "epoch": 2.569086651053864, + "grad_norm": 0.9683330655097961, + "learning_rate": 3.0807298506293337e-06, + "loss": 0.5383, + "step": 8776 + }, + { + "epoch": 2.5693793911007026, + "grad_norm": 0.9332090020179749, + "learning_rate": 3.080355302425855e-06, + "loss": 0.5572, + "step": 8777 + }, + { + "epoch": 2.569672131147541, + "grad_norm": 1.1263948678970337, + "learning_rate": 3.079980740453336e-06, + "loss": 0.5753, + "step": 8778 + }, + { + "epoch": 2.5699648711943794, + "grad_norm": 1.0521093606948853, + "learning_rate": 3.0796061647206616e-06, + "loss": 0.5911, + "step": 8779 + }, + { + "epoch": 2.570257611241218, + "grad_norm": 0.9793274402618408, + "learning_rate": 3.0792315752367203e-06, + "loss": 0.5354, + "step": 8780 + }, + { + "epoch": 2.5705503512880563, + "grad_norm": 1.0058249235153198, + "learning_rate": 3.078856972010398e-06, + "loss": 0.5546, + "step": 8781 + }, + { + "epoch": 2.5708430913348947, + "grad_norm": 0.9878349304199219, + "learning_rate": 3.0784823550505834e-06, + "loss": 0.505, + "step": 8782 + }, + { + "epoch": 2.571135831381733, + "grad_norm": 0.9632161855697632, + "learning_rate": 3.0781077243661633e-06, + "loss": 0.5362, + "step": 8783 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 1.0025891065597534, + "learning_rate": 3.0777330799660267e-06, + "loss": 0.5858, + "step": 8784 + }, + { + "epoch": 2.57172131147541, + "grad_norm": 1.0270357131958008, + "learning_rate": 3.0773584218590615e-06, + "loss": 0.5837, + "step": 8785 + }, + { + "epoch": 2.572014051522248, + "grad_norm": 0.9976455569267273, + "learning_rate": 3.076983750054157e-06, + "loss": 0.5735, + "step": 8786 + }, + { + "epoch": 2.572306791569087, + "grad_norm": 0.982944130897522, + "learning_rate": 3.076609064560202e-06, + "loss": 0.591, + "step": 8787 + }, + { + "epoch": 2.572599531615925, + "grad_norm": 0.978323221206665, + "learning_rate": 3.0762343653860864e-06, + "loss": 0.57, + "step": 8788 + }, + { + "epoch": 2.5728922716627634, + "grad_norm": 0.9939210414886475, + "learning_rate": 3.0758596525407e-06, + "loss": 0.5785, + "step": 8789 + }, + { + "epoch": 2.573185011709602, + "grad_norm": 1.0732451677322388, + "learning_rate": 3.0754849260329324e-06, + "loss": 0.6015, + "step": 8790 + }, + { + "epoch": 2.5734777517564402, + "grad_norm": 1.033509612083435, + "learning_rate": 3.0751101858716744e-06, + "loss": 0.5362, + "step": 8791 + }, + { + "epoch": 2.5737704918032787, + "grad_norm": 1.0292835235595703, + "learning_rate": 3.0747354320658163e-06, + "loss": 0.5911, + "step": 8792 + }, + { + "epoch": 2.574063231850117, + "grad_norm": 0.9678041338920593, + "learning_rate": 3.0743606646242508e-06, + "loss": 0.5335, + "step": 8793 + }, + { + "epoch": 2.5743559718969555, + "grad_norm": 0.9644010066986084, + "learning_rate": 3.0739858835558673e-06, + "loss": 0.5567, + "step": 8794 + }, + { + "epoch": 2.574648711943794, + "grad_norm": 1.025457501411438, + "learning_rate": 3.0736110888695586e-06, + "loss": 0.5899, + "step": 8795 + }, + { + "epoch": 2.5749414519906324, + "grad_norm": 1.0104283094406128, + "learning_rate": 3.0732362805742167e-06, + "loss": 0.5716, + "step": 8796 + }, + { + "epoch": 2.575234192037471, + "grad_norm": 1.0042833089828491, + "learning_rate": 3.0728614586787343e-06, + "loss": 0.5867, + "step": 8797 + }, + { + "epoch": 2.5755269320843093, + "grad_norm": 0.967310905456543, + "learning_rate": 3.0724866231920026e-06, + "loss": 0.6037, + "step": 8798 + }, + { + "epoch": 2.5758196721311473, + "grad_norm": 0.9910795092582703, + "learning_rate": 3.0721117741229167e-06, + "loss": 0.5777, + "step": 8799 + }, + { + "epoch": 2.576112412177986, + "grad_norm": 1.0063129663467407, + "learning_rate": 3.0717369114803684e-06, + "loss": 0.5791, + "step": 8800 + }, + { + "epoch": 2.576405152224824, + "grad_norm": 0.9305808544158936, + "learning_rate": 3.071362035273252e-06, + "loss": 0.5221, + "step": 8801 + }, + { + "epoch": 2.576697892271663, + "grad_norm": 0.9643677473068237, + "learning_rate": 3.070987145510463e-06, + "loss": 0.557, + "step": 8802 + }, + { + "epoch": 2.576990632318501, + "grad_norm": 1.040871262550354, + "learning_rate": 3.0706122422008925e-06, + "loss": 0.5888, + "step": 8803 + }, + { + "epoch": 2.5772833723653394, + "grad_norm": 0.9774057269096375, + "learning_rate": 3.070237325353438e-06, + "loss": 0.5802, + "step": 8804 + }, + { + "epoch": 2.577576112412178, + "grad_norm": 1.0124714374542236, + "learning_rate": 3.069862394976993e-06, + "loss": 0.5522, + "step": 8805 + }, + { + "epoch": 2.5778688524590163, + "grad_norm": 0.9655002355575562, + "learning_rate": 3.0694874510804528e-06, + "loss": 0.5785, + "step": 8806 + }, + { + "epoch": 2.5781615925058547, + "grad_norm": 0.9940184950828552, + "learning_rate": 3.0691124936727147e-06, + "loss": 0.5699, + "step": 8807 + }, + { + "epoch": 2.578454332552693, + "grad_norm": 0.9602165818214417, + "learning_rate": 3.068737522762672e-06, + "loss": 0.5476, + "step": 8808 + }, + { + "epoch": 2.5787470725995316, + "grad_norm": 1.0069019794464111, + "learning_rate": 3.068362538359223e-06, + "loss": 0.5737, + "step": 8809 + }, + { + "epoch": 2.57903981264637, + "grad_norm": 0.98065185546875, + "learning_rate": 3.0679875404712635e-06, + "loss": 0.5516, + "step": 8810 + }, + { + "epoch": 2.5793325526932085, + "grad_norm": 1.0042803287506104, + "learning_rate": 3.06761252910769e-06, + "loss": 0.5485, + "step": 8811 + }, + { + "epoch": 2.579625292740047, + "grad_norm": 1.005139946937561, + "learning_rate": 3.0672375042774017e-06, + "loss": 0.6072, + "step": 8812 + }, + { + "epoch": 2.5799180327868854, + "grad_norm": 1.0240448713302612, + "learning_rate": 3.0668624659892933e-06, + "loss": 0.6132, + "step": 8813 + }, + { + "epoch": 2.580210772833724, + "grad_norm": 0.9552462100982666, + "learning_rate": 3.066487414252264e-06, + "loss": 0.5587, + "step": 8814 + }, + { + "epoch": 2.5805035128805622, + "grad_norm": 0.9713073372840881, + "learning_rate": 3.066112349075212e-06, + "loss": 0.5589, + "step": 8815 + }, + { + "epoch": 2.5807962529274002, + "grad_norm": 0.9902871251106262, + "learning_rate": 3.0657372704670358e-06, + "loss": 0.5599, + "step": 8816 + }, + { + "epoch": 2.581088992974239, + "grad_norm": 1.0057636499404907, + "learning_rate": 3.0653621784366346e-06, + "loss": 0.5783, + "step": 8817 + }, + { + "epoch": 2.581381733021077, + "grad_norm": 0.9962286353111267, + "learning_rate": 3.0649870729929064e-06, + "loss": 0.5594, + "step": 8818 + }, + { + "epoch": 2.5816744730679155, + "grad_norm": 1.0140166282653809, + "learning_rate": 3.064611954144752e-06, + "loss": 0.573, + "step": 8819 + }, + { + "epoch": 2.581967213114754, + "grad_norm": 0.9808467626571655, + "learning_rate": 3.0642368219010697e-06, + "loss": 0.5746, + "step": 8820 + }, + { + "epoch": 2.5822599531615924, + "grad_norm": 0.9838092923164368, + "learning_rate": 3.063861676270761e-06, + "loss": 0.5652, + "step": 8821 + }, + { + "epoch": 2.582552693208431, + "grad_norm": 1.0560661554336548, + "learning_rate": 3.063486517262726e-06, + "loss": 0.5562, + "step": 8822 + }, + { + "epoch": 2.5828454332552693, + "grad_norm": 1.0104674100875854, + "learning_rate": 3.0631113448858648e-06, + "loss": 0.5724, + "step": 8823 + }, + { + "epoch": 2.5831381733021077, + "grad_norm": 1.0238564014434814, + "learning_rate": 3.0627361591490785e-06, + "loss": 0.5838, + "step": 8824 + }, + { + "epoch": 2.583430913348946, + "grad_norm": 0.9723584651947021, + "learning_rate": 3.0623609600612692e-06, + "loss": 0.5713, + "step": 8825 + }, + { + "epoch": 2.5837236533957846, + "grad_norm": 0.9779043793678284, + "learning_rate": 3.0619857476313374e-06, + "loss": 0.5721, + "step": 8826 + }, + { + "epoch": 2.584016393442623, + "grad_norm": 0.9741715788841248, + "learning_rate": 3.0616105218681863e-06, + "loss": 0.5839, + "step": 8827 + }, + { + "epoch": 2.5843091334894615, + "grad_norm": 1.040805697441101, + "learning_rate": 3.0612352827807178e-06, + "loss": 0.608, + "step": 8828 + }, + { + "epoch": 2.5846018735363, + "grad_norm": 1.037559986114502, + "learning_rate": 3.0608600303778346e-06, + "loss": 0.5415, + "step": 8829 + }, + { + "epoch": 2.5848946135831383, + "grad_norm": 1.0259300470352173, + "learning_rate": 3.060484764668439e-06, + "loss": 0.5517, + "step": 8830 + }, + { + "epoch": 2.5851873536299763, + "grad_norm": 1.0209320783615112, + "learning_rate": 3.060109485661434e-06, + "loss": 0.5796, + "step": 8831 + }, + { + "epoch": 2.585480093676815, + "grad_norm": 1.0185773372650146, + "learning_rate": 3.0597341933657242e-06, + "loss": 0.5752, + "step": 8832 + }, + { + "epoch": 2.585772833723653, + "grad_norm": 0.9706973433494568, + "learning_rate": 3.0593588877902143e-06, + "loss": 0.5759, + "step": 8833 + }, + { + "epoch": 2.586065573770492, + "grad_norm": 0.9830930233001709, + "learning_rate": 3.0589835689438064e-06, + "loss": 0.5591, + "step": 8834 + }, + { + "epoch": 2.58635831381733, + "grad_norm": 1.0009740591049194, + "learning_rate": 3.0586082368354058e-06, + "loss": 0.5438, + "step": 8835 + }, + { + "epoch": 2.5866510538641685, + "grad_norm": 1.0069061517715454, + "learning_rate": 3.0582328914739173e-06, + "loss": 0.574, + "step": 8836 + }, + { + "epoch": 2.586943793911007, + "grad_norm": 0.9547997713088989, + "learning_rate": 3.0578575328682466e-06, + "loss": 0.5621, + "step": 8837 + }, + { + "epoch": 2.5872365339578454, + "grad_norm": 1.0094574689865112, + "learning_rate": 3.0574821610272985e-06, + "loss": 0.5923, + "step": 8838 + }, + { + "epoch": 2.587529274004684, + "grad_norm": 0.9649577736854553, + "learning_rate": 3.0571067759599793e-06, + "loss": 0.5565, + "step": 8839 + }, + { + "epoch": 2.5878220140515222, + "grad_norm": 1.0484646558761597, + "learning_rate": 3.056731377675195e-06, + "loss": 0.5808, + "step": 8840 + }, + { + "epoch": 2.5881147540983607, + "grad_norm": 1.0261176824569702, + "learning_rate": 3.0563559661818514e-06, + "loss": 0.5824, + "step": 8841 + }, + { + "epoch": 2.588407494145199, + "grad_norm": 0.9893553256988525, + "learning_rate": 3.0559805414888554e-06, + "loss": 0.5894, + "step": 8842 + }, + { + "epoch": 2.5887002341920375, + "grad_norm": 1.0235592126846313, + "learning_rate": 3.055605103605115e-06, + "loss": 0.562, + "step": 8843 + }, + { + "epoch": 2.588992974238876, + "grad_norm": 0.9996101260185242, + "learning_rate": 3.0552296525395362e-06, + "loss": 0.5696, + "step": 8844 + }, + { + "epoch": 2.5892857142857144, + "grad_norm": 1.0314780473709106, + "learning_rate": 3.054854188301028e-06, + "loss": 0.5782, + "step": 8845 + }, + { + "epoch": 2.589578454332553, + "grad_norm": 1.0336835384368896, + "learning_rate": 3.0544787108984968e-06, + "loss": 0.5678, + "step": 8846 + }, + { + "epoch": 2.5898711943793913, + "grad_norm": 0.9642943739891052, + "learning_rate": 3.0541032203408515e-06, + "loss": 0.5558, + "step": 8847 + }, + { + "epoch": 2.5901639344262293, + "grad_norm": 0.9720951318740845, + "learning_rate": 3.0537277166370016e-06, + "loss": 0.5992, + "step": 8848 + }, + { + "epoch": 2.590456674473068, + "grad_norm": 1.0307961702346802, + "learning_rate": 3.053352199795855e-06, + "loss": 0.5364, + "step": 8849 + }, + { + "epoch": 2.590749414519906, + "grad_norm": 0.9437359571456909, + "learning_rate": 3.0529766698263207e-06, + "loss": 0.5586, + "step": 8850 + }, + { + "epoch": 2.5910421545667446, + "grad_norm": 1.0005760192871094, + "learning_rate": 3.0526011267373086e-06, + "loss": 0.6246, + "step": 8851 + }, + { + "epoch": 2.591334894613583, + "grad_norm": 0.9308120012283325, + "learning_rate": 3.052225570537729e-06, + "loss": 0.5628, + "step": 8852 + }, + { + "epoch": 2.5916276346604215, + "grad_norm": 0.9941227436065674, + "learning_rate": 3.0518500012364918e-06, + "loss": 0.5703, + "step": 8853 + }, + { + "epoch": 2.59192037470726, + "grad_norm": 1.0027600526809692, + "learning_rate": 3.0514744188425073e-06, + "loss": 0.5754, + "step": 8854 + }, + { + "epoch": 2.5922131147540983, + "grad_norm": 0.9550033211708069, + "learning_rate": 3.0510988233646853e-06, + "loss": 0.5427, + "step": 8855 + }, + { + "epoch": 2.5925058548009368, + "grad_norm": 1.2804654836654663, + "learning_rate": 3.0507232148119387e-06, + "loss": 0.547, + "step": 8856 + }, + { + "epoch": 2.592798594847775, + "grad_norm": 0.9035466313362122, + "learning_rate": 3.0503475931931777e-06, + "loss": 0.5281, + "step": 8857 + }, + { + "epoch": 2.5930913348946136, + "grad_norm": 1.014677882194519, + "learning_rate": 3.049971958517315e-06, + "loss": 0.5945, + "step": 8858 + }, + { + "epoch": 2.593384074941452, + "grad_norm": 0.9914320707321167, + "learning_rate": 3.0495963107932615e-06, + "loss": 0.5693, + "step": 8859 + }, + { + "epoch": 2.5936768149882905, + "grad_norm": 0.9341211318969727, + "learning_rate": 3.049220650029929e-06, + "loss": 0.5267, + "step": 8860 + }, + { + "epoch": 2.593969555035129, + "grad_norm": 0.9793946146965027, + "learning_rate": 3.048844976236232e-06, + "loss": 0.531, + "step": 8861 + }, + { + "epoch": 2.5942622950819674, + "grad_norm": 0.9765534996986389, + "learning_rate": 3.0484692894210834e-06, + "loss": 0.5326, + "step": 8862 + }, + { + "epoch": 2.5945550351288054, + "grad_norm": 0.9666818976402283, + "learning_rate": 3.048093589593394e-06, + "loss": 0.5417, + "step": 8863 + }, + { + "epoch": 2.5948477751756442, + "grad_norm": 0.9685017466545105, + "learning_rate": 3.0477178767620803e-06, + "loss": 0.5586, + "step": 8864 + }, + { + "epoch": 2.5951405152224822, + "grad_norm": 1.0256953239440918, + "learning_rate": 3.0473421509360534e-06, + "loss": 0.6106, + "step": 8865 + }, + { + "epoch": 2.595433255269321, + "grad_norm": 0.9676437377929688, + "learning_rate": 3.046966412124229e-06, + "loss": 0.5896, + "step": 8866 + }, + { + "epoch": 2.595725995316159, + "grad_norm": 1.0064831972122192, + "learning_rate": 3.046590660335522e-06, + "loss": 0.557, + "step": 8867 + }, + { + "epoch": 2.5960187353629975, + "grad_norm": 0.9529656171798706, + "learning_rate": 3.0462148955788463e-06, + "loss": 0.5302, + "step": 8868 + }, + { + "epoch": 2.596311475409836, + "grad_norm": 0.9723898768424988, + "learning_rate": 3.0458391178631173e-06, + "loss": 0.5351, + "step": 8869 + }, + { + "epoch": 2.5966042154566744, + "grad_norm": 1.0469632148742676, + "learning_rate": 3.04546332719725e-06, + "loss": 0.6133, + "step": 8870 + }, + { + "epoch": 2.596896955503513, + "grad_norm": 0.9801945686340332, + "learning_rate": 3.0450875235901605e-06, + "loss": 0.5466, + "step": 8871 + }, + { + "epoch": 2.5971896955503513, + "grad_norm": 0.9759947061538696, + "learning_rate": 3.044711707050765e-06, + "loss": 0.5213, + "step": 8872 + }, + { + "epoch": 2.5974824355971897, + "grad_norm": 0.9813037514686584, + "learning_rate": 3.04433587758798e-06, + "loss": 0.5722, + "step": 8873 + }, + { + "epoch": 2.597775175644028, + "grad_norm": 1.0558151006698608, + "learning_rate": 3.0439600352107214e-06, + "loss": 0.5808, + "step": 8874 + }, + { + "epoch": 2.5980679156908666, + "grad_norm": 1.0322299003601074, + "learning_rate": 3.0435841799279062e-06, + "loss": 0.5968, + "step": 8875 + }, + { + "epoch": 2.598360655737705, + "grad_norm": 0.9857988357543945, + "learning_rate": 3.043208311748452e-06, + "loss": 0.5516, + "step": 8876 + }, + { + "epoch": 2.5986533957845435, + "grad_norm": 1.0133029222488403, + "learning_rate": 3.0428324306812762e-06, + "loss": 0.57, + "step": 8877 + }, + { + "epoch": 2.598946135831382, + "grad_norm": 1.032208800315857, + "learning_rate": 3.042456536735297e-06, + "loss": 0.5672, + "step": 8878 + }, + { + "epoch": 2.5992388758782203, + "grad_norm": 0.9672929048538208, + "learning_rate": 3.0420806299194322e-06, + "loss": 0.5791, + "step": 8879 + }, + { + "epoch": 2.5995316159250583, + "grad_norm": 1.0056226253509521, + "learning_rate": 3.0417047102426e-06, + "loss": 0.5959, + "step": 8880 + }, + { + "epoch": 2.599824355971897, + "grad_norm": 1.0052978992462158, + "learning_rate": 3.0413287777137202e-06, + "loss": 0.5534, + "step": 8881 + }, + { + "epoch": 2.600117096018735, + "grad_norm": 1.0140728950500488, + "learning_rate": 3.0409528323417102e-06, + "loss": 0.5726, + "step": 8882 + }, + { + "epoch": 2.6004098360655736, + "grad_norm": 0.9917723536491394, + "learning_rate": 3.0405768741354913e-06, + "loss": 0.5926, + "step": 8883 + }, + { + "epoch": 2.600702576112412, + "grad_norm": 0.9970232248306274, + "learning_rate": 3.0402009031039815e-06, + "loss": 0.5934, + "step": 8884 + }, + { + "epoch": 2.6009953161592505, + "grad_norm": 0.966766893863678, + "learning_rate": 3.039824919256102e-06, + "loss": 0.5545, + "step": 8885 + }, + { + "epoch": 2.601288056206089, + "grad_norm": 1.003678321838379, + "learning_rate": 3.0394489226007715e-06, + "loss": 0.5767, + "step": 8886 + }, + { + "epoch": 2.6015807962529274, + "grad_norm": 1.011904239654541, + "learning_rate": 3.0390729131469125e-06, + "loss": 0.5776, + "step": 8887 + }, + { + "epoch": 2.601873536299766, + "grad_norm": 1.0250532627105713, + "learning_rate": 3.0386968909034455e-06, + "loss": 0.5888, + "step": 8888 + }, + { + "epoch": 2.6021662763466042, + "grad_norm": 0.932141125202179, + "learning_rate": 3.0383208558792902e-06, + "loss": 0.5456, + "step": 8889 + }, + { + "epoch": 2.6024590163934427, + "grad_norm": 0.991124153137207, + "learning_rate": 3.03794480808337e-06, + "loss": 0.5514, + "step": 8890 + }, + { + "epoch": 2.602751756440281, + "grad_norm": 0.9640399217605591, + "learning_rate": 3.0375687475246054e-06, + "loss": 0.5592, + "step": 8891 + }, + { + "epoch": 2.6030444964871196, + "grad_norm": 0.9749506711959839, + "learning_rate": 3.037192674211919e-06, + "loss": 0.556, + "step": 8892 + }, + { + "epoch": 2.603337236533958, + "grad_norm": 1.0401298999786377, + "learning_rate": 3.036816588154234e-06, + "loss": 0.5733, + "step": 8893 + }, + { + "epoch": 2.6036299765807964, + "grad_norm": 0.9755192399024963, + "learning_rate": 3.0364404893604715e-06, + "loss": 0.5918, + "step": 8894 + }, + { + "epoch": 2.6039227166276344, + "grad_norm": 0.9817084074020386, + "learning_rate": 3.036064377839555e-06, + "loss": 0.5612, + "step": 8895 + }, + { + "epoch": 2.6042154566744733, + "grad_norm": 1.0165388584136963, + "learning_rate": 3.0356882536004084e-06, + "loss": 0.5767, + "step": 8896 + }, + { + "epoch": 2.6045081967213113, + "grad_norm": 0.9460651278495789, + "learning_rate": 3.0353121166519556e-06, + "loss": 0.5643, + "step": 8897 + }, + { + "epoch": 2.6048009367681497, + "grad_norm": 1.033281922340393, + "learning_rate": 3.0349359670031197e-06, + "loss": 0.5107, + "step": 8898 + }, + { + "epoch": 2.605093676814988, + "grad_norm": 1.0338764190673828, + "learning_rate": 3.0345598046628253e-06, + "loss": 0.5803, + "step": 8899 + }, + { + "epoch": 2.6053864168618266, + "grad_norm": 1.0041859149932861, + "learning_rate": 3.0341836296399972e-06, + "loss": 0.5447, + "step": 8900 + }, + { + "epoch": 2.605679156908665, + "grad_norm": 0.9554330706596375, + "learning_rate": 3.0338074419435594e-06, + "loss": 0.5233, + "step": 8901 + }, + { + "epoch": 2.6059718969555035, + "grad_norm": 0.9955936074256897, + "learning_rate": 3.0334312415824374e-06, + "loss": 0.5768, + "step": 8902 + }, + { + "epoch": 2.606264637002342, + "grad_norm": 1.0046824216842651, + "learning_rate": 3.033055028565557e-06, + "loss": 0.5654, + "step": 8903 + }, + { + "epoch": 2.6065573770491803, + "grad_norm": 1.0852317810058594, + "learning_rate": 3.0326788029018434e-06, + "loss": 0.5724, + "step": 8904 + }, + { + "epoch": 2.6068501170960188, + "grad_norm": 0.9938488006591797, + "learning_rate": 3.0323025646002225e-06, + "loss": 0.5752, + "step": 8905 + }, + { + "epoch": 2.607142857142857, + "grad_norm": 1.020703673362732, + "learning_rate": 3.0319263136696212e-06, + "loss": 0.5674, + "step": 8906 + }, + { + "epoch": 2.6074355971896956, + "grad_norm": 1.0864896774291992, + "learning_rate": 3.0315500501189664e-06, + "loss": 0.5889, + "step": 8907 + }, + { + "epoch": 2.607728337236534, + "grad_norm": 1.0004661083221436, + "learning_rate": 3.031173773957184e-06, + "loss": 0.5956, + "step": 8908 + }, + { + "epoch": 2.6080210772833725, + "grad_norm": 0.9960981011390686, + "learning_rate": 3.0307974851932026e-06, + "loss": 0.5836, + "step": 8909 + }, + { + "epoch": 2.6083138173302105, + "grad_norm": 1.0102083683013916, + "learning_rate": 3.0304211838359475e-06, + "loss": 0.5926, + "step": 8910 + }, + { + "epoch": 2.6086065573770494, + "grad_norm": 1.0085666179656982, + "learning_rate": 3.0300448698943493e-06, + "loss": 0.5471, + "step": 8911 + }, + { + "epoch": 2.6088992974238874, + "grad_norm": 0.9803964495658875, + "learning_rate": 3.0296685433773336e-06, + "loss": 0.5677, + "step": 8912 + }, + { + "epoch": 2.6091920374707263, + "grad_norm": 0.9660468697547913, + "learning_rate": 3.0292922042938307e-06, + "loss": 0.5503, + "step": 8913 + }, + { + "epoch": 2.6094847775175642, + "grad_norm": 0.9908324480056763, + "learning_rate": 3.0289158526527683e-06, + "loss": 0.5654, + "step": 8914 + }, + { + "epoch": 2.6097775175644027, + "grad_norm": 0.993557870388031, + "learning_rate": 3.028539488463076e-06, + "loss": 0.5729, + "step": 8915 + }, + { + "epoch": 2.610070257611241, + "grad_norm": 1.051450252532959, + "learning_rate": 3.028163111733682e-06, + "loss": 0.5589, + "step": 8916 + }, + { + "epoch": 2.6103629976580796, + "grad_norm": 0.9387568831443787, + "learning_rate": 3.027786722473518e-06, + "loss": 0.5207, + "step": 8917 + }, + { + "epoch": 2.610655737704918, + "grad_norm": 0.9817445874214172, + "learning_rate": 3.027410320691512e-06, + "loss": 0.5328, + "step": 8918 + }, + { + "epoch": 2.6109484777517564, + "grad_norm": 0.9793021082878113, + "learning_rate": 3.0270339063965948e-06, + "loss": 0.5646, + "step": 8919 + }, + { + "epoch": 2.611241217798595, + "grad_norm": 0.9441813826560974, + "learning_rate": 3.026657479597696e-06, + "loss": 0.5476, + "step": 8920 + }, + { + "epoch": 2.6115339578454333, + "grad_norm": 0.9944534301757812, + "learning_rate": 3.0262810403037485e-06, + "loss": 0.6034, + "step": 8921 + }, + { + "epoch": 2.6118266978922717, + "grad_norm": 1.3542896509170532, + "learning_rate": 3.0259045885236813e-06, + "loss": 0.6031, + "step": 8922 + }, + { + "epoch": 2.61211943793911, + "grad_norm": 1.014948844909668, + "learning_rate": 3.025528124266427e-06, + "loss": 0.596, + "step": 8923 + }, + { + "epoch": 2.6124121779859486, + "grad_norm": 0.9870669841766357, + "learning_rate": 3.0251516475409177e-06, + "loss": 0.5946, + "step": 8924 + }, + { + "epoch": 2.612704918032787, + "grad_norm": 0.9580053091049194, + "learning_rate": 3.0247751583560843e-06, + "loss": 0.5393, + "step": 8925 + }, + { + "epoch": 2.6129976580796255, + "grad_norm": 1.012477993965149, + "learning_rate": 3.024398656720859e-06, + "loss": 0.5898, + "step": 8926 + }, + { + "epoch": 2.6132903981264635, + "grad_norm": 0.9684709310531616, + "learning_rate": 3.0240221426441747e-06, + "loss": 0.5949, + "step": 8927 + }, + { + "epoch": 2.6135831381733023, + "grad_norm": 0.9753859043121338, + "learning_rate": 3.0236456161349646e-06, + "loss": 0.6028, + "step": 8928 + }, + { + "epoch": 2.6138758782201403, + "grad_norm": 1.001165509223938, + "learning_rate": 3.023269077202162e-06, + "loss": 0.5551, + "step": 8929 + }, + { + "epoch": 2.6141686182669788, + "grad_norm": 1.0483896732330322, + "learning_rate": 3.0228925258547005e-06, + "loss": 0.574, + "step": 8930 + }, + { + "epoch": 2.614461358313817, + "grad_norm": 0.9754256010055542, + "learning_rate": 3.022515962101512e-06, + "loss": 0.5431, + "step": 8931 + }, + { + "epoch": 2.6147540983606556, + "grad_norm": 1.0132639408111572, + "learning_rate": 3.0221393859515325e-06, + "loss": 0.5784, + "step": 8932 + }, + { + "epoch": 2.615046838407494, + "grad_norm": 1.043914794921875, + "learning_rate": 3.0217627974136965e-06, + "loss": 0.5758, + "step": 8933 + }, + { + "epoch": 2.6153395784543325, + "grad_norm": 0.977583110332489, + "learning_rate": 3.021386196496937e-06, + "loss": 0.5664, + "step": 8934 + }, + { + "epoch": 2.615632318501171, + "grad_norm": 0.9805905818939209, + "learning_rate": 3.0210095832101905e-06, + "loss": 0.5376, + "step": 8935 + }, + { + "epoch": 2.6159250585480094, + "grad_norm": 0.9591331481933594, + "learning_rate": 3.020632957562391e-06, + "loss": 0.5614, + "step": 8936 + }, + { + "epoch": 2.616217798594848, + "grad_norm": 0.9965227246284485, + "learning_rate": 3.0202563195624745e-06, + "loss": 0.5733, + "step": 8937 + }, + { + "epoch": 2.6165105386416863, + "grad_norm": 0.9861810803413391, + "learning_rate": 3.0198796692193772e-06, + "loss": 0.5782, + "step": 8938 + }, + { + "epoch": 2.6168032786885247, + "grad_norm": 0.9649412631988525, + "learning_rate": 3.019503006542035e-06, + "loss": 0.5577, + "step": 8939 + }, + { + "epoch": 2.617096018735363, + "grad_norm": 0.9661535024642944, + "learning_rate": 3.0191263315393837e-06, + "loss": 0.5668, + "step": 8940 + }, + { + "epoch": 2.6173887587822016, + "grad_norm": 0.9671904444694519, + "learning_rate": 3.018749644220361e-06, + "loss": 0.5724, + "step": 8941 + }, + { + "epoch": 2.6176814988290396, + "grad_norm": 0.9927760362625122, + "learning_rate": 3.0183729445939025e-06, + "loss": 0.5333, + "step": 8942 + }, + { + "epoch": 2.6179742388758784, + "grad_norm": 1.002966046333313, + "learning_rate": 3.0179962326689464e-06, + "loss": 0.6106, + "step": 8943 + }, + { + "epoch": 2.6182669789227164, + "grad_norm": 0.9376605749130249, + "learning_rate": 3.0176195084544312e-06, + "loss": 0.5681, + "step": 8944 + }, + { + "epoch": 2.6185597189695553, + "grad_norm": 1.0496728420257568, + "learning_rate": 3.0172427719592927e-06, + "loss": 0.585, + "step": 8945 + }, + { + "epoch": 2.6188524590163933, + "grad_norm": 0.9762730598449707, + "learning_rate": 3.016866023192471e-06, + "loss": 0.5312, + "step": 8946 + }, + { + "epoch": 2.6191451990632317, + "grad_norm": 1.040940761566162, + "learning_rate": 3.0164892621629023e-06, + "loss": 0.589, + "step": 8947 + }, + { + "epoch": 2.61943793911007, + "grad_norm": 1.0123320817947388, + "learning_rate": 3.0161124888795275e-06, + "loss": 0.5422, + "step": 8948 + }, + { + "epoch": 2.6197306791569086, + "grad_norm": 0.9619187712669373, + "learning_rate": 3.0157357033512847e-06, + "loss": 0.5721, + "step": 8949 + }, + { + "epoch": 2.620023419203747, + "grad_norm": 0.9941492676734924, + "learning_rate": 3.015358905587113e-06, + "loss": 0.5934, + "step": 8950 + }, + { + "epoch": 2.6203161592505855, + "grad_norm": 0.9813873171806335, + "learning_rate": 3.0149820955959524e-06, + "loss": 0.5461, + "step": 8951 + }, + { + "epoch": 2.620608899297424, + "grad_norm": 1.0671931505203247, + "learning_rate": 3.0146052733867424e-06, + "loss": 0.5788, + "step": 8952 + }, + { + "epoch": 2.6209016393442623, + "grad_norm": 0.9855492115020752, + "learning_rate": 3.014228438968424e-06, + "loss": 0.5636, + "step": 8953 + }, + { + "epoch": 2.621194379391101, + "grad_norm": 1.0244359970092773, + "learning_rate": 3.0138515923499375e-06, + "loss": 0.6019, + "step": 8954 + }, + { + "epoch": 2.621487119437939, + "grad_norm": 0.9319297075271606, + "learning_rate": 3.013474733540222e-06, + "loss": 0.5526, + "step": 8955 + }, + { + "epoch": 2.6217798594847777, + "grad_norm": 0.9955439567565918, + "learning_rate": 3.0130978625482197e-06, + "loss": 0.6051, + "step": 8956 + }, + { + "epoch": 2.622072599531616, + "grad_norm": 0.9844621419906616, + "learning_rate": 3.012720979382873e-06, + "loss": 0.5766, + "step": 8957 + }, + { + "epoch": 2.6223653395784545, + "grad_norm": 1.0012179613113403, + "learning_rate": 3.012344084053122e-06, + "loss": 0.5288, + "step": 8958 + }, + { + "epoch": 2.6226580796252925, + "grad_norm": 1.0152148008346558, + "learning_rate": 3.01196717656791e-06, + "loss": 0.5887, + "step": 8959 + }, + { + "epoch": 2.6229508196721314, + "grad_norm": 0.9476361870765686, + "learning_rate": 3.011590256936177e-06, + "loss": 0.5237, + "step": 8960 + }, + { + "epoch": 2.6232435597189694, + "grad_norm": 0.9695540070533752, + "learning_rate": 3.011213325166868e-06, + "loss": 0.5656, + "step": 8961 + }, + { + "epoch": 2.623536299765808, + "grad_norm": 1.0560855865478516, + "learning_rate": 3.010836381268924e-06, + "loss": 0.5881, + "step": 8962 + }, + { + "epoch": 2.6238290398126463, + "grad_norm": 1.0283780097961426, + "learning_rate": 3.0104594252512887e-06, + "loss": 0.568, + "step": 8963 + }, + { + "epoch": 2.6241217798594847, + "grad_norm": 1.2951215505599976, + "learning_rate": 3.0100824571229057e-06, + "loss": 0.5875, + "step": 8964 + }, + { + "epoch": 2.624414519906323, + "grad_norm": 0.9580673575401306, + "learning_rate": 3.0097054768927186e-06, + "loss": 0.5783, + "step": 8965 + }, + { + "epoch": 2.6247072599531616, + "grad_norm": 1.0278098583221436, + "learning_rate": 3.009328484569671e-06, + "loss": 0.5638, + "step": 8966 + }, + { + "epoch": 2.625, + "grad_norm": 0.9347093105316162, + "learning_rate": 3.0089514801627066e-06, + "loss": 0.5445, + "step": 8967 + }, + { + "epoch": 2.6252927400468384, + "grad_norm": 1.0207276344299316, + "learning_rate": 3.008574463680771e-06, + "loss": 0.5611, + "step": 8968 + }, + { + "epoch": 2.625585480093677, + "grad_norm": 1.0518099069595337, + "learning_rate": 3.0081974351328096e-06, + "loss": 0.5675, + "step": 8969 + }, + { + "epoch": 2.6258782201405153, + "grad_norm": 0.9497443437576294, + "learning_rate": 3.007820394527765e-06, + "loss": 0.5939, + "step": 8970 + }, + { + "epoch": 2.6261709601873537, + "grad_norm": 0.9966674447059631, + "learning_rate": 3.007443341874585e-06, + "loss": 0.5848, + "step": 8971 + }, + { + "epoch": 2.626463700234192, + "grad_norm": 1.0021837949752808, + "learning_rate": 3.007066277182214e-06, + "loss": 0.5381, + "step": 8972 + }, + { + "epoch": 2.6267564402810306, + "grad_norm": 1.0296164751052856, + "learning_rate": 3.0066892004595977e-06, + "loss": 0.5293, + "step": 8973 + }, + { + "epoch": 2.6270491803278686, + "grad_norm": 1.0193402767181396, + "learning_rate": 3.0063121117156835e-06, + "loss": 0.6051, + "step": 8974 + }, + { + "epoch": 2.6273419203747075, + "grad_norm": 1.0137133598327637, + "learning_rate": 3.005935010959416e-06, + "loss": 0.5376, + "step": 8975 + }, + { + "epoch": 2.6276346604215455, + "grad_norm": 1.03030264377594, + "learning_rate": 3.005557898199744e-06, + "loss": 0.6074, + "step": 8976 + }, + { + "epoch": 2.627927400468384, + "grad_norm": 0.9868131875991821, + "learning_rate": 3.0051807734456135e-06, + "loss": 0.5613, + "step": 8977 + }, + { + "epoch": 2.6282201405152223, + "grad_norm": 0.9459651708602905, + "learning_rate": 3.004803636705972e-06, + "loss": 0.5373, + "step": 8978 + }, + { + "epoch": 2.628512880562061, + "grad_norm": 0.9987647533416748, + "learning_rate": 3.0044264879897678e-06, + "loss": 0.5616, + "step": 8979 + }, + { + "epoch": 2.628805620608899, + "grad_norm": 0.9882397055625916, + "learning_rate": 3.0040493273059473e-06, + "loss": 0.595, + "step": 8980 + }, + { + "epoch": 2.6290983606557377, + "grad_norm": 0.9911173582077026, + "learning_rate": 3.00367215466346e-06, + "loss": 0.5818, + "step": 8981 + }, + { + "epoch": 2.629391100702576, + "grad_norm": 0.9766349792480469, + "learning_rate": 3.0032949700712544e-06, + "loss": 0.5599, + "step": 8982 + }, + { + "epoch": 2.6296838407494145, + "grad_norm": 0.9808452129364014, + "learning_rate": 3.0029177735382785e-06, + "loss": 0.5548, + "step": 8983 + }, + { + "epoch": 2.629976580796253, + "grad_norm": 1.0265967845916748, + "learning_rate": 3.002540565073482e-06, + "loss": 0.5959, + "step": 8984 + }, + { + "epoch": 2.6302693208430914, + "grad_norm": 1.0148651599884033, + "learning_rate": 3.0021633446858145e-06, + "loss": 0.5667, + "step": 8985 + }, + { + "epoch": 2.63056206088993, + "grad_norm": 1.0204864740371704, + "learning_rate": 3.0017861123842245e-06, + "loss": 0.5584, + "step": 8986 + }, + { + "epoch": 2.6308548009367683, + "grad_norm": 0.9807088971138, + "learning_rate": 3.001408868177663e-06, + "loss": 0.5328, + "step": 8987 + }, + { + "epoch": 2.6311475409836067, + "grad_norm": 1.0256966352462769, + "learning_rate": 3.0010316120750798e-06, + "loss": 0.6147, + "step": 8988 + }, + { + "epoch": 2.6314402810304447, + "grad_norm": 0.9986637234687805, + "learning_rate": 3.0006543440854253e-06, + "loss": 0.5341, + "step": 8989 + }, + { + "epoch": 2.6317330210772836, + "grad_norm": 1.0495115518569946, + "learning_rate": 3.0002770642176503e-06, + "loss": 0.6129, + "step": 8990 + }, + { + "epoch": 2.6320257611241216, + "grad_norm": 1.0133894681930542, + "learning_rate": 2.999899772480706e-06, + "loss": 0.5769, + "step": 8991 + }, + { + "epoch": 2.6323185011709604, + "grad_norm": 1.0312459468841553, + "learning_rate": 2.9995224688835426e-06, + "loss": 0.5869, + "step": 8992 + }, + { + "epoch": 2.6326112412177984, + "grad_norm": 0.9871283173561096, + "learning_rate": 2.9991451534351136e-06, + "loss": 0.6102, + "step": 8993 + }, + { + "epoch": 2.632903981264637, + "grad_norm": 0.9547732472419739, + "learning_rate": 2.99876782614437e-06, + "loss": 0.5649, + "step": 8994 + }, + { + "epoch": 2.6331967213114753, + "grad_norm": 0.9529850482940674, + "learning_rate": 2.9983904870202634e-06, + "loss": 0.5454, + "step": 8995 + }, + { + "epoch": 2.6334894613583137, + "grad_norm": 0.9800135493278503, + "learning_rate": 2.998013136071747e-06, + "loss": 0.5661, + "step": 8996 + }, + { + "epoch": 2.633782201405152, + "grad_norm": 0.9891607761383057, + "learning_rate": 2.997635773307773e-06, + "loss": 0.5747, + "step": 8997 + }, + { + "epoch": 2.6340749414519906, + "grad_norm": 1.0199462175369263, + "learning_rate": 2.997258398737295e-06, + "loss": 0.5658, + "step": 8998 + }, + { + "epoch": 2.634367681498829, + "grad_norm": 1.0183769464492798, + "learning_rate": 2.9968810123692655e-06, + "loss": 0.5432, + "step": 8999 + }, + { + "epoch": 2.6346604215456675, + "grad_norm": 0.998275876045227, + "learning_rate": 2.996503614212639e-06, + "loss": 0.5746, + "step": 9000 + }, + { + "epoch": 2.634953161592506, + "grad_norm": 0.9560033679008484, + "learning_rate": 2.996126204276369e-06, + "loss": 0.5813, + "step": 9001 + }, + { + "epoch": 2.6352459016393444, + "grad_norm": 1.0025479793548584, + "learning_rate": 2.9957487825694087e-06, + "loss": 0.5697, + "step": 9002 + }, + { + "epoch": 2.635538641686183, + "grad_norm": 0.9958920478820801, + "learning_rate": 2.9953713491007142e-06, + "loss": 0.5572, + "step": 9003 + }, + { + "epoch": 2.6358313817330212, + "grad_norm": 0.9541734457015991, + "learning_rate": 2.994993903879239e-06, + "loss": 0.5189, + "step": 9004 + }, + { + "epoch": 2.6361241217798597, + "grad_norm": 0.9313660264015198, + "learning_rate": 2.994616446913938e-06, + "loss": 0.5453, + "step": 9005 + }, + { + "epoch": 2.6364168618266977, + "grad_norm": 1.0112884044647217, + "learning_rate": 2.994238978213767e-06, + "loss": 0.5789, + "step": 9006 + }, + { + "epoch": 2.6367096018735365, + "grad_norm": 1.0448447465896606, + "learning_rate": 2.9938614977876816e-06, + "loss": 0.6077, + "step": 9007 + }, + { + "epoch": 2.6370023419203745, + "grad_norm": 1.038905382156372, + "learning_rate": 2.9934840056446367e-06, + "loss": 0.5294, + "step": 9008 + }, + { + "epoch": 2.637295081967213, + "grad_norm": 1.0365638732910156, + "learning_rate": 2.993106501793589e-06, + "loss": 0.5568, + "step": 9009 + }, + { + "epoch": 2.6375878220140514, + "grad_norm": 1.0616137981414795, + "learning_rate": 2.9927289862434954e-06, + "loss": 0.5717, + "step": 9010 + }, + { + "epoch": 2.63788056206089, + "grad_norm": 1.0167906284332275, + "learning_rate": 2.9923514590033116e-06, + "loss": 0.6048, + "step": 9011 + }, + { + "epoch": 2.6381733021077283, + "grad_norm": 1.0507031679153442, + "learning_rate": 2.9919739200819946e-06, + "loss": 0.5753, + "step": 9012 + }, + { + "epoch": 2.6384660421545667, + "grad_norm": 0.9674553871154785, + "learning_rate": 2.991596369488502e-06, + "loss": 0.5447, + "step": 9013 + }, + { + "epoch": 2.638758782201405, + "grad_norm": 1.0240648984909058, + "learning_rate": 2.9912188072317905e-06, + "loss": 0.5698, + "step": 9014 + }, + { + "epoch": 2.6390515222482436, + "grad_norm": 1.0085475444793701, + "learning_rate": 2.99084123332082e-06, + "loss": 0.5654, + "step": 9015 + }, + { + "epoch": 2.639344262295082, + "grad_norm": 0.9303801655769348, + "learning_rate": 2.9904636477645454e-06, + "loss": 0.5289, + "step": 9016 + }, + { + "epoch": 2.6396370023419204, + "grad_norm": 1.041739583015442, + "learning_rate": 2.990086050571927e-06, + "loss": 0.5652, + "step": 9017 + }, + { + "epoch": 2.639929742388759, + "grad_norm": 1.0073609352111816, + "learning_rate": 2.989708441751923e-06, + "loss": 0.6008, + "step": 9018 + }, + { + "epoch": 2.6402224824355973, + "grad_norm": 0.9561459422111511, + "learning_rate": 2.9893308213134918e-06, + "loss": 0.5212, + "step": 9019 + }, + { + "epoch": 2.6405152224824358, + "grad_norm": 0.9489405155181885, + "learning_rate": 2.9889531892655935e-06, + "loss": 0.5617, + "step": 9020 + }, + { + "epoch": 2.6408079625292737, + "grad_norm": 0.9724708795547485, + "learning_rate": 2.9885755456171865e-06, + "loss": 0.6016, + "step": 9021 + }, + { + "epoch": 2.6411007025761126, + "grad_norm": 1.0392524003982544, + "learning_rate": 2.9881978903772306e-06, + "loss": 0.5767, + "step": 9022 + }, + { + "epoch": 2.6413934426229506, + "grad_norm": 0.9744355082511902, + "learning_rate": 2.9878202235546865e-06, + "loss": 0.5778, + "step": 9023 + }, + { + "epoch": 2.6416861826697895, + "grad_norm": 0.9476162791252136, + "learning_rate": 2.9874425451585138e-06, + "loss": 0.5513, + "step": 9024 + }, + { + "epoch": 2.6419789227166275, + "grad_norm": 0.9934871196746826, + "learning_rate": 2.987064855197673e-06, + "loss": 0.538, + "step": 9025 + }, + { + "epoch": 2.642271662763466, + "grad_norm": 1.055871605873108, + "learning_rate": 2.9866871536811246e-06, + "loss": 0.63, + "step": 9026 + }, + { + "epoch": 2.6425644028103044, + "grad_norm": 1.0412415266036987, + "learning_rate": 2.98630944061783e-06, + "loss": 0.5708, + "step": 9027 + }, + { + "epoch": 2.642857142857143, + "grad_norm": 1.0695880651474, + "learning_rate": 2.9859317160167504e-06, + "loss": 0.5545, + "step": 9028 + }, + { + "epoch": 2.6431498829039812, + "grad_norm": 0.9344189763069153, + "learning_rate": 2.9855539798868476e-06, + "loss": 0.5704, + "step": 9029 + }, + { + "epoch": 2.6434426229508197, + "grad_norm": 1.0133874416351318, + "learning_rate": 2.985176232237084e-06, + "loss": 0.564, + "step": 9030 + }, + { + "epoch": 2.643735362997658, + "grad_norm": 0.9917844533920288, + "learning_rate": 2.9847984730764206e-06, + "loss": 0.5758, + "step": 9031 + }, + { + "epoch": 2.6440281030444965, + "grad_norm": 1.0503493547439575, + "learning_rate": 2.98442070241382e-06, + "loss": 0.5944, + "step": 9032 + }, + { + "epoch": 2.644320843091335, + "grad_norm": 1.026772141456604, + "learning_rate": 2.9840429202582455e-06, + "loss": 0.5603, + "step": 9033 + }, + { + "epoch": 2.6446135831381734, + "grad_norm": 1.0312632322311401, + "learning_rate": 2.9836651266186596e-06, + "loss": 0.5792, + "step": 9034 + }, + { + "epoch": 2.644906323185012, + "grad_norm": 1.0371558666229248, + "learning_rate": 2.983287321504026e-06, + "loss": 0.5679, + "step": 9035 + }, + { + "epoch": 2.6451990632318503, + "grad_norm": 1.0971440076828003, + "learning_rate": 2.982909504923307e-06, + "loss": 0.5307, + "step": 9036 + }, + { + "epoch": 2.6454918032786887, + "grad_norm": 0.9856106042861938, + "learning_rate": 2.982531676885468e-06, + "loss": 0.5844, + "step": 9037 + }, + { + "epoch": 2.6457845433255267, + "grad_norm": 0.9969475269317627, + "learning_rate": 2.982153837399472e-06, + "loss": 0.5807, + "step": 9038 + }, + { + "epoch": 2.6460772833723656, + "grad_norm": 1.0150203704833984, + "learning_rate": 2.981775986474283e-06, + "loss": 0.5791, + "step": 9039 + }, + { + "epoch": 2.6463700234192036, + "grad_norm": 0.9508613348007202, + "learning_rate": 2.981398124118867e-06, + "loss": 0.5565, + "step": 9040 + }, + { + "epoch": 2.646662763466042, + "grad_norm": 1.0040369033813477, + "learning_rate": 2.9810202503421877e-06, + "loss": 0.5879, + "step": 9041 + }, + { + "epoch": 2.6469555035128804, + "grad_norm": 0.946316659450531, + "learning_rate": 2.9806423651532115e-06, + "loss": 0.5709, + "step": 9042 + }, + { + "epoch": 2.647248243559719, + "grad_norm": 0.9901151657104492, + "learning_rate": 2.980264468560901e-06, + "loss": 0.5518, + "step": 9043 + }, + { + "epoch": 2.6475409836065573, + "grad_norm": 1.0470384359359741, + "learning_rate": 2.979886560574225e-06, + "loss": 0.6071, + "step": 9044 + }, + { + "epoch": 2.6478337236533958, + "grad_norm": 0.9524545669555664, + "learning_rate": 2.9795086412021477e-06, + "loss": 0.5729, + "step": 9045 + }, + { + "epoch": 2.648126463700234, + "grad_norm": 0.914930522441864, + "learning_rate": 2.979130710453636e-06, + "loss": 0.5381, + "step": 9046 + }, + { + "epoch": 2.6484192037470726, + "grad_norm": 1.011607050895691, + "learning_rate": 2.978752768337656e-06, + "loss": 0.5708, + "step": 9047 + }, + { + "epoch": 2.648711943793911, + "grad_norm": 0.9369107484817505, + "learning_rate": 2.978374814863174e-06, + "loss": 0.5599, + "step": 9048 + }, + { + "epoch": 2.6490046838407495, + "grad_norm": 1.0017813444137573, + "learning_rate": 2.9779968500391587e-06, + "loss": 0.5657, + "step": 9049 + }, + { + "epoch": 2.649297423887588, + "grad_norm": 0.9899217486381531, + "learning_rate": 2.9776188738745758e-06, + "loss": 0.5367, + "step": 9050 + }, + { + "epoch": 2.6495901639344264, + "grad_norm": 1.0488379001617432, + "learning_rate": 2.9772408863783937e-06, + "loss": 0.6011, + "step": 9051 + }, + { + "epoch": 2.649882903981265, + "grad_norm": 0.9221716523170471, + "learning_rate": 2.976862887559579e-06, + "loss": 0.541, + "step": 9052 + }, + { + "epoch": 2.650175644028103, + "grad_norm": 1.0039937496185303, + "learning_rate": 2.976484877427101e-06, + "loss": 0.5445, + "step": 9053 + }, + { + "epoch": 2.6504683840749417, + "grad_norm": 1.0260590314865112, + "learning_rate": 2.976106855989928e-06, + "loss": 0.5838, + "step": 9054 + }, + { + "epoch": 2.6507611241217797, + "grad_norm": 0.967918336391449, + "learning_rate": 2.9757288232570285e-06, + "loss": 0.5385, + "step": 9055 + }, + { + "epoch": 2.651053864168618, + "grad_norm": 1.0353442430496216, + "learning_rate": 2.9753507792373714e-06, + "loss": 0.573, + "step": 9056 + }, + { + "epoch": 2.6513466042154565, + "grad_norm": 0.9752504825592041, + "learning_rate": 2.974972723939925e-06, + "loss": 0.535, + "step": 9057 + }, + { + "epoch": 2.651639344262295, + "grad_norm": 1.0406041145324707, + "learning_rate": 2.97459465737366e-06, + "loss": 0.5595, + "step": 9058 + }, + { + "epoch": 2.6519320843091334, + "grad_norm": 1.0728487968444824, + "learning_rate": 2.9742165795475458e-06, + "loss": 0.5794, + "step": 9059 + }, + { + "epoch": 2.652224824355972, + "grad_norm": 1.0113639831542969, + "learning_rate": 2.973838490470552e-06, + "loss": 0.601, + "step": 9060 + }, + { + "epoch": 2.6525175644028103, + "grad_norm": 1.032416820526123, + "learning_rate": 2.973460390151649e-06, + "loss": 0.5679, + "step": 9061 + }, + { + "epoch": 2.6528103044496487, + "grad_norm": 1.1432147026062012, + "learning_rate": 2.973082278599807e-06, + "loss": 0.5206, + "step": 9062 + }, + { + "epoch": 2.653103044496487, + "grad_norm": 1.0136127471923828, + "learning_rate": 2.9727041558239965e-06, + "loss": 0.6004, + "step": 9063 + }, + { + "epoch": 2.6533957845433256, + "grad_norm": 1.0025831460952759, + "learning_rate": 2.97232602183319e-06, + "loss": 0.6297, + "step": 9064 + }, + { + "epoch": 2.653688524590164, + "grad_norm": 1.0049501657485962, + "learning_rate": 2.9719478766363575e-06, + "loss": 0.5931, + "step": 9065 + }, + { + "epoch": 2.6539812646370025, + "grad_norm": 1.0539844036102295, + "learning_rate": 2.9715697202424716e-06, + "loss": 0.5513, + "step": 9066 + }, + { + "epoch": 2.654274004683841, + "grad_norm": 1.013666033744812, + "learning_rate": 2.971191552660503e-06, + "loss": 0.6105, + "step": 9067 + }, + { + "epoch": 2.654566744730679, + "grad_norm": 0.9513922929763794, + "learning_rate": 2.9708133738994238e-06, + "loss": 0.5825, + "step": 9068 + }, + { + "epoch": 2.6548594847775178, + "grad_norm": 1.0136618614196777, + "learning_rate": 2.970435183968207e-06, + "loss": 0.6076, + "step": 9069 + }, + { + "epoch": 2.6551522248243558, + "grad_norm": 0.9857422709465027, + "learning_rate": 2.970056982875825e-06, + "loss": 0.5479, + "step": 9070 + }, + { + "epoch": 2.6554449648711946, + "grad_norm": 0.9564558267593384, + "learning_rate": 2.9696787706312517e-06, + "loss": 0.5973, + "step": 9071 + }, + { + "epoch": 2.6557377049180326, + "grad_norm": 1.0292887687683105, + "learning_rate": 2.969300547243459e-06, + "loss": 0.5709, + "step": 9072 + }, + { + "epoch": 2.656030444964871, + "grad_norm": 0.9910536408424377, + "learning_rate": 2.96892231272142e-06, + "loss": 0.5921, + "step": 9073 + }, + { + "epoch": 2.6563231850117095, + "grad_norm": 0.9451072216033936, + "learning_rate": 2.9685440670741094e-06, + "loss": 0.5729, + "step": 9074 + }, + { + "epoch": 2.656615925058548, + "grad_norm": 0.9720368385314941, + "learning_rate": 2.9681658103105016e-06, + "loss": 0.59, + "step": 9075 + }, + { + "epoch": 2.6569086651053864, + "grad_norm": 0.9665109515190125, + "learning_rate": 2.967787542439569e-06, + "loss": 0.5669, + "step": 9076 + }, + { + "epoch": 2.657201405152225, + "grad_norm": 0.9976322054862976, + "learning_rate": 2.967409263470287e-06, + "loss": 0.5494, + "step": 9077 + }, + { + "epoch": 2.6574941451990632, + "grad_norm": 0.9801317453384399, + "learning_rate": 2.9670309734116315e-06, + "loss": 0.5573, + "step": 9078 + }, + { + "epoch": 2.6577868852459017, + "grad_norm": 1.010622262954712, + "learning_rate": 2.9666526722725754e-06, + "loss": 0.5654, + "step": 9079 + }, + { + "epoch": 2.65807962529274, + "grad_norm": 0.9336455464363098, + "learning_rate": 2.966274360062095e-06, + "loss": 0.5837, + "step": 9080 + }, + { + "epoch": 2.6583723653395785, + "grad_norm": 1.0383944511413574, + "learning_rate": 2.965896036789167e-06, + "loss": 0.5789, + "step": 9081 + }, + { + "epoch": 2.658665105386417, + "grad_norm": 0.970444917678833, + "learning_rate": 2.9655177024627645e-06, + "loss": 0.5661, + "step": 9082 + }, + { + "epoch": 2.6589578454332554, + "grad_norm": 1.0302573442459106, + "learning_rate": 2.9651393570918655e-06, + "loss": 0.6191, + "step": 9083 + }, + { + "epoch": 2.659250585480094, + "grad_norm": 1.0429987907409668, + "learning_rate": 2.9647610006854465e-06, + "loss": 0.5785, + "step": 9084 + }, + { + "epoch": 2.659543325526932, + "grad_norm": 1.0778322219848633, + "learning_rate": 2.9643826332524827e-06, + "loss": 0.6073, + "step": 9085 + }, + { + "epoch": 2.6598360655737707, + "grad_norm": 1.0397459268569946, + "learning_rate": 2.9640042548019525e-06, + "loss": 0.5889, + "step": 9086 + }, + { + "epoch": 2.6601288056206087, + "grad_norm": 0.9800072312355042, + "learning_rate": 2.963625865342832e-06, + "loss": 0.5707, + "step": 9087 + }, + { + "epoch": 2.660421545667447, + "grad_norm": 1.0009901523590088, + "learning_rate": 2.963247464884098e-06, + "loss": 0.57, + "step": 9088 + }, + { + "epoch": 2.6607142857142856, + "grad_norm": 0.9830644726753235, + "learning_rate": 2.9628690534347294e-06, + "loss": 0.5617, + "step": 9089 + }, + { + "epoch": 2.661007025761124, + "grad_norm": 0.97222501039505, + "learning_rate": 2.9624906310037037e-06, + "loss": 0.5899, + "step": 9090 + }, + { + "epoch": 2.6612997658079625, + "grad_norm": 1.0176020860671997, + "learning_rate": 2.9621121975999993e-06, + "loss": 0.6032, + "step": 9091 + }, + { + "epoch": 2.661592505854801, + "grad_norm": 0.9744598269462585, + "learning_rate": 2.9617337532325934e-06, + "loss": 0.5527, + "step": 9092 + }, + { + "epoch": 2.6618852459016393, + "grad_norm": 0.9759353399276733, + "learning_rate": 2.961355297910466e-06, + "loss": 0.5383, + "step": 9093 + }, + { + "epoch": 2.6621779859484778, + "grad_norm": 0.986294686794281, + "learning_rate": 2.9609768316425956e-06, + "loss": 0.5468, + "step": 9094 + }, + { + "epoch": 2.662470725995316, + "grad_norm": 1.0319645404815674, + "learning_rate": 2.960598354437961e-06, + "loss": 0.5068, + "step": 9095 + }, + { + "epoch": 2.6627634660421546, + "grad_norm": 0.9853067398071289, + "learning_rate": 2.960219866305542e-06, + "loss": 0.6029, + "step": 9096 + }, + { + "epoch": 2.663056206088993, + "grad_norm": 1.0035723447799683, + "learning_rate": 2.959841367254319e-06, + "loss": 0.5813, + "step": 9097 + }, + { + "epoch": 2.6633489461358315, + "grad_norm": 1.0231704711914062, + "learning_rate": 2.9594628572932704e-06, + "loss": 0.5836, + "step": 9098 + }, + { + "epoch": 2.66364168618267, + "grad_norm": 0.9906663298606873, + "learning_rate": 2.959084336431377e-06, + "loss": 0.5399, + "step": 9099 + }, + { + "epoch": 2.663934426229508, + "grad_norm": 1.0099643468856812, + "learning_rate": 2.95870580467762e-06, + "loss": 0.5892, + "step": 9100 + }, + { + "epoch": 2.664227166276347, + "grad_norm": 0.9647420644760132, + "learning_rate": 2.9583272620409796e-06, + "loss": 0.5584, + "step": 9101 + }, + { + "epoch": 2.664519906323185, + "grad_norm": 0.9867638349533081, + "learning_rate": 2.957948708530437e-06, + "loss": 0.5897, + "step": 9102 + }, + { + "epoch": 2.6648126463700237, + "grad_norm": 1.0641660690307617, + "learning_rate": 2.957570144154973e-06, + "loss": 0.5108, + "step": 9103 + }, + { + "epoch": 2.6651053864168617, + "grad_norm": 0.923679769039154, + "learning_rate": 2.9571915689235693e-06, + "loss": 0.5504, + "step": 9104 + }, + { + "epoch": 2.6653981264637, + "grad_norm": 0.9493889808654785, + "learning_rate": 2.956812982845208e-06, + "loss": 0.5532, + "step": 9105 + }, + { + "epoch": 2.6656908665105385, + "grad_norm": 0.9466227293014526, + "learning_rate": 2.9564343859288715e-06, + "loss": 0.5675, + "step": 9106 + }, + { + "epoch": 2.665983606557377, + "grad_norm": 0.9505672454833984, + "learning_rate": 2.956055778183541e-06, + "loss": 0.5621, + "step": 9107 + }, + { + "epoch": 2.6662763466042154, + "grad_norm": 0.9896240830421448, + "learning_rate": 2.9556771596181987e-06, + "loss": 0.6132, + "step": 9108 + }, + { + "epoch": 2.666569086651054, + "grad_norm": 0.9585578441619873, + "learning_rate": 2.955298530241829e-06, + "loss": 0.5755, + "step": 9109 + }, + { + "epoch": 2.6668618266978923, + "grad_norm": 1.0560003519058228, + "learning_rate": 2.9549198900634146e-06, + "loss": 0.5739, + "step": 9110 + }, + { + "epoch": 2.6671545667447307, + "grad_norm": 1.0192612409591675, + "learning_rate": 2.9545412390919383e-06, + "loss": 0.5421, + "step": 9111 + }, + { + "epoch": 2.667447306791569, + "grad_norm": 1.0126538276672363, + "learning_rate": 2.9541625773363836e-06, + "loss": 0.5392, + "step": 9112 + }, + { + "epoch": 2.6677400468384076, + "grad_norm": 1.029043436050415, + "learning_rate": 2.953783904805735e-06, + "loss": 0.5734, + "step": 9113 + }, + { + "epoch": 2.668032786885246, + "grad_norm": 1.0106563568115234, + "learning_rate": 2.9534052215089746e-06, + "loss": 0.569, + "step": 9114 + }, + { + "epoch": 2.6683255269320845, + "grad_norm": 0.9626703262329102, + "learning_rate": 2.9530265274550898e-06, + "loss": 0.5781, + "step": 9115 + }, + { + "epoch": 2.668618266978923, + "grad_norm": 1.0406019687652588, + "learning_rate": 2.952647822653063e-06, + "loss": 0.5713, + "step": 9116 + }, + { + "epoch": 2.668911007025761, + "grad_norm": 0.9898436665534973, + "learning_rate": 2.952269107111879e-06, + "loss": 0.5543, + "step": 9117 + }, + { + "epoch": 2.6692037470725998, + "grad_norm": 0.998477041721344, + "learning_rate": 2.9518903808405242e-06, + "loss": 0.5767, + "step": 9118 + }, + { + "epoch": 2.6694964871194378, + "grad_norm": 1.000301718711853, + "learning_rate": 2.951511643847983e-06, + "loss": 0.5824, + "step": 9119 + }, + { + "epoch": 2.669789227166276, + "grad_norm": 0.9949687719345093, + "learning_rate": 2.9511328961432412e-06, + "loss": 0.5612, + "step": 9120 + }, + { + "epoch": 2.6700819672131146, + "grad_norm": 0.971519947052002, + "learning_rate": 2.9507541377352853e-06, + "loss": 0.5517, + "step": 9121 + }, + { + "epoch": 2.670374707259953, + "grad_norm": 1.0216031074523926, + "learning_rate": 2.950375368633101e-06, + "loss": 0.5637, + "step": 9122 + }, + { + "epoch": 2.6706674473067915, + "grad_norm": 0.9751573801040649, + "learning_rate": 2.949996588845674e-06, + "loss": 0.5675, + "step": 9123 + }, + { + "epoch": 2.67096018735363, + "grad_norm": 1.0197206735610962, + "learning_rate": 2.949617798381991e-06, + "loss": 0.6071, + "step": 9124 + }, + { + "epoch": 2.6712529274004684, + "grad_norm": 1.0333843231201172, + "learning_rate": 2.9492389972510406e-06, + "loss": 0.503, + "step": 9125 + }, + { + "epoch": 2.671545667447307, + "grad_norm": 1.0426474809646606, + "learning_rate": 2.948860185461808e-06, + "loss": 0.5765, + "step": 9126 + }, + { + "epoch": 2.6718384074941453, + "grad_norm": 0.9806601405143738, + "learning_rate": 2.9484813630232816e-06, + "loss": 0.5621, + "step": 9127 + }, + { + "epoch": 2.6721311475409837, + "grad_norm": 0.9633013606071472, + "learning_rate": 2.948102529944448e-06, + "loss": 0.5153, + "step": 9128 + }, + { + "epoch": 2.672423887587822, + "grad_norm": 1.003114104270935, + "learning_rate": 2.947723686234296e-06, + "loss": 0.5574, + "step": 9129 + }, + { + "epoch": 2.6727166276346606, + "grad_norm": 1.0040240287780762, + "learning_rate": 2.9473448319018146e-06, + "loss": 0.573, + "step": 9130 + }, + { + "epoch": 2.673009367681499, + "grad_norm": 0.9586438536643982, + "learning_rate": 2.94696596695599e-06, + "loss": 0.535, + "step": 9131 + }, + { + "epoch": 2.673302107728337, + "grad_norm": 1.009002447128296, + "learning_rate": 2.9465870914058124e-06, + "loss": 0.5412, + "step": 9132 + }, + { + "epoch": 2.673594847775176, + "grad_norm": 0.9381882548332214, + "learning_rate": 2.9462082052602702e-06, + "loss": 0.5668, + "step": 9133 + }, + { + "epoch": 2.673887587822014, + "grad_norm": 0.9728730320930481, + "learning_rate": 2.9458293085283527e-06, + "loss": 0.5406, + "step": 9134 + }, + { + "epoch": 2.6741803278688527, + "grad_norm": 1.0010651350021362, + "learning_rate": 2.9454504012190486e-06, + "loss": 0.5749, + "step": 9135 + }, + { + "epoch": 2.6744730679156907, + "grad_norm": 1.0113050937652588, + "learning_rate": 2.945071483341349e-06, + "loss": 0.5363, + "step": 9136 + }, + { + "epoch": 2.674765807962529, + "grad_norm": 0.9981377124786377, + "learning_rate": 2.9446925549042427e-06, + "loss": 0.562, + "step": 9137 + }, + { + "epoch": 2.6750585480093676, + "grad_norm": 0.999559223651886, + "learning_rate": 2.9443136159167195e-06, + "loss": 0.5555, + "step": 9138 + }, + { + "epoch": 2.675351288056206, + "grad_norm": 1.0083905458450317, + "learning_rate": 2.943934666387771e-06, + "loss": 0.5971, + "step": 9139 + }, + { + "epoch": 2.6756440281030445, + "grad_norm": 1.0419975519180298, + "learning_rate": 2.943555706326387e-06, + "loss": 0.576, + "step": 9140 + }, + { + "epoch": 2.675936768149883, + "grad_norm": 0.9773483872413635, + "learning_rate": 2.943176735741559e-06, + "loss": 0.5503, + "step": 9141 + }, + { + "epoch": 2.6762295081967213, + "grad_norm": 1.0870299339294434, + "learning_rate": 2.9427977546422777e-06, + "loss": 0.5672, + "step": 9142 + }, + { + "epoch": 2.6765222482435598, + "grad_norm": 0.978019654750824, + "learning_rate": 2.942418763037534e-06, + "loss": 0.5478, + "step": 9143 + }, + { + "epoch": 2.676814988290398, + "grad_norm": 1.0218827724456787, + "learning_rate": 2.9420397609363206e-06, + "loss": 0.6076, + "step": 9144 + }, + { + "epoch": 2.6771077283372366, + "grad_norm": 1.1642351150512695, + "learning_rate": 2.9416607483476283e-06, + "loss": 0.5857, + "step": 9145 + }, + { + "epoch": 2.677400468384075, + "grad_norm": 1.00273859500885, + "learning_rate": 2.9412817252804503e-06, + "loss": 0.5717, + "step": 9146 + }, + { + "epoch": 2.677693208430913, + "grad_norm": 0.964859127998352, + "learning_rate": 2.9409026917437794e-06, + "loss": 0.5352, + "step": 9147 + }, + { + "epoch": 2.677985948477752, + "grad_norm": 1.0221779346466064, + "learning_rate": 2.940523647746607e-06, + "loss": 0.5683, + "step": 9148 + }, + { + "epoch": 2.67827868852459, + "grad_norm": 1.0165715217590332, + "learning_rate": 2.940144593297927e-06, + "loss": 0.5718, + "step": 9149 + }, + { + "epoch": 2.678571428571429, + "grad_norm": 1.1075454950332642, + "learning_rate": 2.9397655284067305e-06, + "loss": 0.5949, + "step": 9150 + }, + { + "epoch": 2.678864168618267, + "grad_norm": 0.9813345074653625, + "learning_rate": 2.939386453082013e-06, + "loss": 0.5745, + "step": 9151 + }, + { + "epoch": 2.6791569086651053, + "grad_norm": 0.9729172587394714, + "learning_rate": 2.9390073673327678e-06, + "loss": 0.5859, + "step": 9152 + }, + { + "epoch": 2.6794496487119437, + "grad_norm": 0.9713660478591919, + "learning_rate": 2.938628271167988e-06, + "loss": 0.5801, + "step": 9153 + }, + { + "epoch": 2.679742388758782, + "grad_norm": 1.0314770936965942, + "learning_rate": 2.9382491645966688e-06, + "loss": 0.5261, + "step": 9154 + }, + { + "epoch": 2.6800351288056206, + "grad_norm": 0.9761993288993835, + "learning_rate": 2.937870047627803e-06, + "loss": 0.6208, + "step": 9155 + }, + { + "epoch": 2.680327868852459, + "grad_norm": 0.9531440734863281, + "learning_rate": 2.9374909202703872e-06, + "loss": 0.5773, + "step": 9156 + }, + { + "epoch": 2.6806206088992974, + "grad_norm": 0.9728958606719971, + "learning_rate": 2.937111782533415e-06, + "loss": 0.5545, + "step": 9157 + }, + { + "epoch": 2.680913348946136, + "grad_norm": 1.053299069404602, + "learning_rate": 2.9367326344258816e-06, + "loss": 0.5739, + "step": 9158 + }, + { + "epoch": 2.6812060889929743, + "grad_norm": 0.968621551990509, + "learning_rate": 2.9363534759567824e-06, + "loss": 0.5432, + "step": 9159 + }, + { + "epoch": 2.6814988290398127, + "grad_norm": 0.923976480960846, + "learning_rate": 2.935974307135113e-06, + "loss": 0.5673, + "step": 9160 + }, + { + "epoch": 2.681791569086651, + "grad_norm": 1.0247396230697632, + "learning_rate": 2.9355951279698698e-06, + "loss": 0.6142, + "step": 9161 + }, + { + "epoch": 2.6820843091334896, + "grad_norm": 1.0095657110214233, + "learning_rate": 2.935215938470049e-06, + "loss": 0.552, + "step": 9162 + }, + { + "epoch": 2.682377049180328, + "grad_norm": 1.010560393333435, + "learning_rate": 2.9348367386446458e-06, + "loss": 0.5562, + "step": 9163 + }, + { + "epoch": 2.682669789227166, + "grad_norm": 0.9983615875244141, + "learning_rate": 2.934457528502657e-06, + "loss": 0.5668, + "step": 9164 + }, + { + "epoch": 2.682962529274005, + "grad_norm": 1.0157965421676636, + "learning_rate": 2.934078308053081e-06, + "loss": 0.5979, + "step": 9165 + }, + { + "epoch": 2.683255269320843, + "grad_norm": 1.0645016431808472, + "learning_rate": 2.9336990773049127e-06, + "loss": 0.5951, + "step": 9166 + }, + { + "epoch": 2.6835480093676813, + "grad_norm": 1.0379799604415894, + "learning_rate": 2.933319836267151e-06, + "loss": 0.5515, + "step": 9167 + }, + { + "epoch": 2.6838407494145198, + "grad_norm": 0.9173418283462524, + "learning_rate": 2.9329405849487935e-06, + "loss": 0.5708, + "step": 9168 + }, + { + "epoch": 2.684133489461358, + "grad_norm": 1.0265225172042847, + "learning_rate": 2.9325613233588363e-06, + "loss": 0.5639, + "step": 9169 + }, + { + "epoch": 2.6844262295081966, + "grad_norm": 0.9944242835044861, + "learning_rate": 2.9321820515062787e-06, + "loss": 0.5597, + "step": 9170 + }, + { + "epoch": 2.684718969555035, + "grad_norm": 1.020339846611023, + "learning_rate": 2.9318027694001195e-06, + "loss": 0.5846, + "step": 9171 + }, + { + "epoch": 2.6850117096018735, + "grad_norm": 1.0383141040802002, + "learning_rate": 2.9314234770493565e-06, + "loss": 0.5826, + "step": 9172 + }, + { + "epoch": 2.685304449648712, + "grad_norm": 0.9325937032699585, + "learning_rate": 2.9310441744629885e-06, + "loss": 0.535, + "step": 9173 + }, + { + "epoch": 2.6855971896955504, + "grad_norm": 0.962796688079834, + "learning_rate": 2.9306648616500144e-06, + "loss": 0.5723, + "step": 9174 + }, + { + "epoch": 2.685889929742389, + "grad_norm": 1.225831389427185, + "learning_rate": 2.930285538619434e-06, + "loss": 0.5874, + "step": 9175 + }, + { + "epoch": 2.6861826697892273, + "grad_norm": 1.0244771242141724, + "learning_rate": 2.9299062053802463e-06, + "loss": 0.6131, + "step": 9176 + }, + { + "epoch": 2.6864754098360657, + "grad_norm": 1.0907642841339111, + "learning_rate": 2.9295268619414517e-06, + "loss": 0.5725, + "step": 9177 + }, + { + "epoch": 2.686768149882904, + "grad_norm": 1.021169900894165, + "learning_rate": 2.9291475083120503e-06, + "loss": 0.5793, + "step": 9178 + }, + { + "epoch": 2.687060889929742, + "grad_norm": 1.022048830986023, + "learning_rate": 2.9287681445010406e-06, + "loss": 0.5533, + "step": 9179 + }, + { + "epoch": 2.687353629976581, + "grad_norm": 0.9575668573379517, + "learning_rate": 2.928388770517425e-06, + "loss": 0.5813, + "step": 9180 + }, + { + "epoch": 2.687646370023419, + "grad_norm": 0.9516888856887817, + "learning_rate": 2.928009386370203e-06, + "loss": 0.5653, + "step": 9181 + }, + { + "epoch": 2.687939110070258, + "grad_norm": 0.9822300672531128, + "learning_rate": 2.9276299920683764e-06, + "loss": 0.5359, + "step": 9182 + }, + { + "epoch": 2.688231850117096, + "grad_norm": 0.961766242980957, + "learning_rate": 2.9272505876209462e-06, + "loss": 0.5909, + "step": 9183 + }, + { + "epoch": 2.6885245901639343, + "grad_norm": 0.9958710074424744, + "learning_rate": 2.9268711730369135e-06, + "loss": 0.5589, + "step": 9184 + }, + { + "epoch": 2.6888173302107727, + "grad_norm": 0.9932858943939209, + "learning_rate": 2.926491748325281e-06, + "loss": 0.5261, + "step": 9185 + }, + { + "epoch": 2.689110070257611, + "grad_norm": 0.9520835876464844, + "learning_rate": 2.9261123134950487e-06, + "loss": 0.5356, + "step": 9186 + }, + { + "epoch": 2.6894028103044496, + "grad_norm": 1.0448414087295532, + "learning_rate": 2.9257328685552204e-06, + "loss": 0.5995, + "step": 9187 + }, + { + "epoch": 2.689695550351288, + "grad_norm": 0.9934876561164856, + "learning_rate": 2.925353413514798e-06, + "loss": 0.5727, + "step": 9188 + }, + { + "epoch": 2.6899882903981265, + "grad_norm": 0.9177548289299011, + "learning_rate": 2.9249739483827842e-06, + "loss": 0.5327, + "step": 9189 + }, + { + "epoch": 2.690281030444965, + "grad_norm": 1.020206093788147, + "learning_rate": 2.9245944731681814e-06, + "loss": 0.5337, + "step": 9190 + }, + { + "epoch": 2.6905737704918034, + "grad_norm": 0.9656126499176025, + "learning_rate": 2.9242149878799937e-06, + "loss": 0.564, + "step": 9191 + }, + { + "epoch": 2.690866510538642, + "grad_norm": 0.9627676010131836, + "learning_rate": 2.923835492527224e-06, + "loss": 0.5613, + "step": 9192 + }, + { + "epoch": 2.6911592505854802, + "grad_norm": 1.0334272384643555, + "learning_rate": 2.923455987118875e-06, + "loss": 0.5673, + "step": 9193 + }, + { + "epoch": 2.6914519906323187, + "grad_norm": 0.9638808369636536, + "learning_rate": 2.923076471663952e-06, + "loss": 0.5838, + "step": 9194 + }, + { + "epoch": 2.691744730679157, + "grad_norm": 1.025681734085083, + "learning_rate": 2.922696946171458e-06, + "loss": 0.5819, + "step": 9195 + }, + { + "epoch": 2.692037470725995, + "grad_norm": 0.9606029391288757, + "learning_rate": 2.922317410650398e-06, + "loss": 0.514, + "step": 9196 + }, + { + "epoch": 2.692330210772834, + "grad_norm": 0.9904552698135376, + "learning_rate": 2.9219378651097763e-06, + "loss": 0.5757, + "step": 9197 + }, + { + "epoch": 2.692622950819672, + "grad_norm": 0.970820426940918, + "learning_rate": 2.9215583095585974e-06, + "loss": 0.5507, + "step": 9198 + }, + { + "epoch": 2.6929156908665104, + "grad_norm": 1.011412262916565, + "learning_rate": 2.921178744005867e-06, + "loss": 0.5737, + "step": 9199 + }, + { + "epoch": 2.693208430913349, + "grad_norm": 1.0018311738967896, + "learning_rate": 2.92079916846059e-06, + "loss": 0.589, + "step": 9200 + }, + { + "epoch": 2.6935011709601873, + "grad_norm": 0.936931848526001, + "learning_rate": 2.9204195829317724e-06, + "loss": 0.5711, + "step": 9201 + }, + { + "epoch": 2.6937939110070257, + "grad_norm": 0.9686232805252075, + "learning_rate": 2.9200399874284186e-06, + "loss": 0.578, + "step": 9202 + }, + { + "epoch": 2.694086651053864, + "grad_norm": 1.0031660795211792, + "learning_rate": 2.919660381959536e-06, + "loss": 0.5794, + "step": 9203 + }, + { + "epoch": 2.6943793911007026, + "grad_norm": 1.00238037109375, + "learning_rate": 2.9192807665341293e-06, + "loss": 0.5833, + "step": 9204 + }, + { + "epoch": 2.694672131147541, + "grad_norm": 1.0103989839553833, + "learning_rate": 2.918901141161206e-06, + "loss": 0.5951, + "step": 9205 + }, + { + "epoch": 2.6949648711943794, + "grad_norm": 1.0865660905838013, + "learning_rate": 2.9185215058497727e-06, + "loss": 0.6089, + "step": 9206 + }, + { + "epoch": 2.695257611241218, + "grad_norm": 1.0203584432601929, + "learning_rate": 2.9181418606088374e-06, + "loss": 0.5381, + "step": 9207 + }, + { + "epoch": 2.6955503512880563, + "grad_norm": 0.8705118298530579, + "learning_rate": 2.917762205447405e-06, + "loss": 0.4878, + "step": 9208 + }, + { + "epoch": 2.6958430913348947, + "grad_norm": 1.1263411045074463, + "learning_rate": 2.9173825403744843e-06, + "loss": 0.5935, + "step": 9209 + }, + { + "epoch": 2.696135831381733, + "grad_norm": 0.9334210753440857, + "learning_rate": 2.917002865399083e-06, + "loss": 0.5528, + "step": 9210 + }, + { + "epoch": 2.696428571428571, + "grad_norm": 1.0208781957626343, + "learning_rate": 2.9166231805302074e-06, + "loss": 0.5814, + "step": 9211 + }, + { + "epoch": 2.69672131147541, + "grad_norm": 1.0129528045654297, + "learning_rate": 2.9162434857768683e-06, + "loss": 0.59, + "step": 9212 + }, + { + "epoch": 2.697014051522248, + "grad_norm": 1.0480375289916992, + "learning_rate": 2.9158637811480716e-06, + "loss": 0.5717, + "step": 9213 + }, + { + "epoch": 2.697306791569087, + "grad_norm": 0.9881630539894104, + "learning_rate": 2.9154840666528272e-06, + "loss": 0.5689, + "step": 9214 + }, + { + "epoch": 2.697599531615925, + "grad_norm": 0.993236243724823, + "learning_rate": 2.9151043423001433e-06, + "loss": 0.5711, + "step": 9215 + }, + { + "epoch": 2.6978922716627634, + "grad_norm": 1.038394570350647, + "learning_rate": 2.914724608099029e-06, + "loss": 0.5865, + "step": 9216 + }, + { + "epoch": 2.698185011709602, + "grad_norm": 0.9695250391960144, + "learning_rate": 2.9143448640584943e-06, + "loss": 0.5823, + "step": 9217 + }, + { + "epoch": 2.6984777517564402, + "grad_norm": 0.9413284063339233, + "learning_rate": 2.9139651101875477e-06, + "loss": 0.5227, + "step": 9218 + }, + { + "epoch": 2.6987704918032787, + "grad_norm": 0.9988759160041809, + "learning_rate": 2.913585346495199e-06, + "loss": 0.5871, + "step": 9219 + }, + { + "epoch": 2.699063231850117, + "grad_norm": 1.0703753232955933, + "learning_rate": 2.9132055729904596e-06, + "loss": 0.5443, + "step": 9220 + }, + { + "epoch": 2.6993559718969555, + "grad_norm": 0.9899178147315979, + "learning_rate": 2.9128257896823377e-06, + "loss": 0.5473, + "step": 9221 + }, + { + "epoch": 2.699648711943794, + "grad_norm": 1.0169011354446411, + "learning_rate": 2.912445996579845e-06, + "loss": 0.5875, + "step": 9222 + }, + { + "epoch": 2.6999414519906324, + "grad_norm": 0.9762726426124573, + "learning_rate": 2.9120661936919913e-06, + "loss": 0.5566, + "step": 9223 + }, + { + "epoch": 2.700234192037471, + "grad_norm": 0.9695413112640381, + "learning_rate": 2.9116863810277882e-06, + "loss": 0.5605, + "step": 9224 + }, + { + "epoch": 2.7005269320843093, + "grad_norm": 0.9651323556900024, + "learning_rate": 2.9113065585962462e-06, + "loss": 0.5808, + "step": 9225 + }, + { + "epoch": 2.7008196721311473, + "grad_norm": 0.9592257738113403, + "learning_rate": 2.9109267264063777e-06, + "loss": 0.5821, + "step": 9226 + }, + { + "epoch": 2.701112412177986, + "grad_norm": 0.9876719117164612, + "learning_rate": 2.9105468844671935e-06, + "loss": 0.5758, + "step": 9227 + }, + { + "epoch": 2.701405152224824, + "grad_norm": 0.9420945644378662, + "learning_rate": 2.910167032787706e-06, + "loss": 0.5453, + "step": 9228 + }, + { + "epoch": 2.701697892271663, + "grad_norm": 0.9887692332267761, + "learning_rate": 2.909787171376926e-06, + "loss": 0.5673, + "step": 9229 + }, + { + "epoch": 2.701990632318501, + "grad_norm": 0.9679926037788391, + "learning_rate": 2.909407300243867e-06, + "loss": 0.5581, + "step": 9230 + }, + { + "epoch": 2.7022833723653394, + "grad_norm": 0.97652667760849, + "learning_rate": 2.909027419397541e-06, + "loss": 0.5849, + "step": 9231 + }, + { + "epoch": 2.702576112412178, + "grad_norm": 1.0022228956222534, + "learning_rate": 2.9086475288469614e-06, + "loss": 0.5945, + "step": 9232 + }, + { + "epoch": 2.7028688524590163, + "grad_norm": 0.9811041355133057, + "learning_rate": 2.90826762860114e-06, + "loss": 0.5167, + "step": 9233 + }, + { + "epoch": 2.7031615925058547, + "grad_norm": 0.9887670874595642, + "learning_rate": 2.907887718669091e-06, + "loss": 0.5389, + "step": 9234 + }, + { + "epoch": 2.703454332552693, + "grad_norm": 0.9925669431686401, + "learning_rate": 2.9075077990598278e-06, + "loss": 0.5753, + "step": 9235 + }, + { + "epoch": 2.7037470725995316, + "grad_norm": 1.0033528804779053, + "learning_rate": 2.9071278697823635e-06, + "loss": 0.5742, + "step": 9236 + }, + { + "epoch": 2.70403981264637, + "grad_norm": 0.9821708798408508, + "learning_rate": 2.9067479308457125e-06, + "loss": 0.5915, + "step": 9237 + }, + { + "epoch": 2.7043325526932085, + "grad_norm": 0.9448722004890442, + "learning_rate": 2.9063679822588888e-06, + "loss": 0.5658, + "step": 9238 + }, + { + "epoch": 2.704625292740047, + "grad_norm": 1.0141515731811523, + "learning_rate": 2.9059880240309065e-06, + "loss": 0.5386, + "step": 9239 + }, + { + "epoch": 2.7049180327868854, + "grad_norm": 0.9276725053787231, + "learning_rate": 2.90560805617078e-06, + "loss": 0.483, + "step": 9240 + }, + { + "epoch": 2.705210772833724, + "grad_norm": 1.0266813039779663, + "learning_rate": 2.905228078687524e-06, + "loss": 0.5707, + "step": 9241 + }, + { + "epoch": 2.7055035128805622, + "grad_norm": 0.97702556848526, + "learning_rate": 2.9048480915901543e-06, + "loss": 0.6042, + "step": 9242 + }, + { + "epoch": 2.7057962529274002, + "grad_norm": 0.9851671457290649, + "learning_rate": 2.9044680948876867e-06, + "loss": 0.5778, + "step": 9243 + }, + { + "epoch": 2.706088992974239, + "grad_norm": 0.9532384872436523, + "learning_rate": 2.9040880885891354e-06, + "loss": 0.5593, + "step": 9244 + }, + { + "epoch": 2.706381733021077, + "grad_norm": 0.964082658290863, + "learning_rate": 2.9037080727035162e-06, + "loss": 0.5456, + "step": 9245 + }, + { + "epoch": 2.7066744730679155, + "grad_norm": 1.004249095916748, + "learning_rate": 2.9033280472398454e-06, + "loss": 0.5688, + "step": 9246 + }, + { + "epoch": 2.706967213114754, + "grad_norm": 0.9276480078697205, + "learning_rate": 2.9029480122071397e-06, + "loss": 0.5382, + "step": 9247 + }, + { + "epoch": 2.7072599531615924, + "grad_norm": 1.001429557800293, + "learning_rate": 2.9025679676144147e-06, + "loss": 0.5756, + "step": 9248 + }, + { + "epoch": 2.707552693208431, + "grad_norm": 1.0741630792617798, + "learning_rate": 2.9021879134706878e-06, + "loss": 0.5631, + "step": 9249 + }, + { + "epoch": 2.7078454332552693, + "grad_norm": 0.9851979613304138, + "learning_rate": 2.9018078497849745e-06, + "loss": 0.5728, + "step": 9250 + }, + { + "epoch": 2.7081381733021077, + "grad_norm": 0.9735644459724426, + "learning_rate": 2.901427776566293e-06, + "loss": 0.5795, + "step": 9251 + }, + { + "epoch": 2.708430913348946, + "grad_norm": 0.9832671880722046, + "learning_rate": 2.9010476938236603e-06, + "loss": 0.5505, + "step": 9252 + }, + { + "epoch": 2.7087236533957846, + "grad_norm": 1.1758975982666016, + "learning_rate": 2.900667601566095e-06, + "loss": 0.5555, + "step": 9253 + }, + { + "epoch": 2.709016393442623, + "grad_norm": 1.0011868476867676, + "learning_rate": 2.9002874998026133e-06, + "loss": 0.5786, + "step": 9254 + }, + { + "epoch": 2.7093091334894615, + "grad_norm": 0.9826376438140869, + "learning_rate": 2.899907388542234e-06, + "loss": 0.561, + "step": 9255 + }, + { + "epoch": 2.7096018735363, + "grad_norm": 0.9967426657676697, + "learning_rate": 2.8995272677939747e-06, + "loss": 0.5806, + "step": 9256 + }, + { + "epoch": 2.7098946135831383, + "grad_norm": 1.0074445009231567, + "learning_rate": 2.899147137566854e-06, + "loss": 0.5475, + "step": 9257 + }, + { + "epoch": 2.7101873536299763, + "grad_norm": 1.058425784111023, + "learning_rate": 2.898766997869891e-06, + "loss": 0.6025, + "step": 9258 + }, + { + "epoch": 2.710480093676815, + "grad_norm": 0.9503589868545532, + "learning_rate": 2.8983868487121046e-06, + "loss": 0.5506, + "step": 9259 + }, + { + "epoch": 2.710772833723653, + "grad_norm": 0.9918925762176514, + "learning_rate": 2.8980066901025134e-06, + "loss": 0.6072, + "step": 9260 + }, + { + "epoch": 2.711065573770492, + "grad_norm": 1.0237574577331543, + "learning_rate": 2.8976265220501366e-06, + "loss": 0.5984, + "step": 9261 + }, + { + "epoch": 2.71135831381733, + "grad_norm": 0.9806030988693237, + "learning_rate": 2.8972463445639947e-06, + "loss": 0.5481, + "step": 9262 + }, + { + "epoch": 2.7116510538641685, + "grad_norm": 1.0041518211364746, + "learning_rate": 2.896866157653107e-06, + "loss": 0.5761, + "step": 9263 + }, + { + "epoch": 2.711943793911007, + "grad_norm": 0.9784849286079407, + "learning_rate": 2.8964859613264934e-06, + "loss": 0.5485, + "step": 9264 + }, + { + "epoch": 2.7122365339578454, + "grad_norm": 0.9716160297393799, + "learning_rate": 2.896105755593174e-06, + "loss": 0.5631, + "step": 9265 + }, + { + "epoch": 2.712529274004684, + "grad_norm": 0.9641571640968323, + "learning_rate": 2.89572554046217e-06, + "loss": 0.5578, + "step": 9266 + }, + { + "epoch": 2.7128220140515222, + "grad_norm": 1.0254888534545898, + "learning_rate": 2.8953453159425005e-06, + "loss": 0.6057, + "step": 9267 + }, + { + "epoch": 2.7131147540983607, + "grad_norm": 1.0220038890838623, + "learning_rate": 2.8949650820431885e-06, + "loss": 0.5951, + "step": 9268 + }, + { + "epoch": 2.713407494145199, + "grad_norm": 0.9940744042396545, + "learning_rate": 2.8945848387732535e-06, + "loss": 0.553, + "step": 9269 + }, + { + "epoch": 2.7137002341920375, + "grad_norm": 1.001781702041626, + "learning_rate": 2.894204586141717e-06, + "loss": 0.6022, + "step": 9270 + }, + { + "epoch": 2.713992974238876, + "grad_norm": 1.0133202075958252, + "learning_rate": 2.8938243241576014e-06, + "loss": 0.5684, + "step": 9271 + }, + { + "epoch": 2.7142857142857144, + "grad_norm": 1.0719590187072754, + "learning_rate": 2.893444052829929e-06, + "loss": 0.5868, + "step": 9272 + }, + { + "epoch": 2.714578454332553, + "grad_norm": 0.9601916670799255, + "learning_rate": 2.8930637721677195e-06, + "loss": 0.5979, + "step": 9273 + }, + { + "epoch": 2.7148711943793913, + "grad_norm": 0.9278573989868164, + "learning_rate": 2.8926834821799977e-06, + "loss": 0.5129, + "step": 9274 + }, + { + "epoch": 2.7151639344262293, + "grad_norm": 0.975287139415741, + "learning_rate": 2.8923031828757835e-06, + "loss": 0.5408, + "step": 9275 + }, + { + "epoch": 2.715456674473068, + "grad_norm": 0.9999167919158936, + "learning_rate": 2.8919228742641015e-06, + "loss": 0.5461, + "step": 9276 + }, + { + "epoch": 2.715749414519906, + "grad_norm": 0.9960411787033081, + "learning_rate": 2.8915425563539738e-06, + "loss": 0.5756, + "step": 9277 + }, + { + "epoch": 2.7160421545667446, + "grad_norm": 1.0919861793518066, + "learning_rate": 2.8911622291544243e-06, + "loss": 0.578, + "step": 9278 + }, + { + "epoch": 2.716334894613583, + "grad_norm": 0.9930003881454468, + "learning_rate": 2.8907818926744757e-06, + "loss": 0.5931, + "step": 9279 + }, + { + "epoch": 2.7166276346604215, + "grad_norm": 1.0478003025054932, + "learning_rate": 2.890401546923151e-06, + "loss": 0.5552, + "step": 9280 + }, + { + "epoch": 2.71692037470726, + "grad_norm": 0.9913460612297058, + "learning_rate": 2.8900211919094747e-06, + "loss": 0.5787, + "step": 9281 + }, + { + "epoch": 2.7172131147540983, + "grad_norm": 1.0350557565689087, + "learning_rate": 2.889640827642471e-06, + "loss": 0.5718, + "step": 9282 + }, + { + "epoch": 2.7175058548009368, + "grad_norm": 0.9143216013908386, + "learning_rate": 2.8892604541311642e-06, + "loss": 0.5512, + "step": 9283 + }, + { + "epoch": 2.717798594847775, + "grad_norm": 0.9694494009017944, + "learning_rate": 2.8888800713845777e-06, + "loss": 0.575, + "step": 9284 + }, + { + "epoch": 2.7180913348946136, + "grad_norm": 0.9172192215919495, + "learning_rate": 2.8884996794117365e-06, + "loss": 0.5514, + "step": 9285 + }, + { + "epoch": 2.718384074941452, + "grad_norm": 1.037321925163269, + "learning_rate": 2.888119278221666e-06, + "loss": 0.5669, + "step": 9286 + }, + { + "epoch": 2.7186768149882905, + "grad_norm": 0.9267193078994751, + "learning_rate": 2.8877388678233914e-06, + "loss": 0.5459, + "step": 9287 + }, + { + "epoch": 2.718969555035129, + "grad_norm": 0.970159649848938, + "learning_rate": 2.8873584482259377e-06, + "loss": 0.5756, + "step": 9288 + }, + { + "epoch": 2.7192622950819674, + "grad_norm": 0.9905170798301697, + "learning_rate": 2.8869780194383303e-06, + "loss": 0.5672, + "step": 9289 + }, + { + "epoch": 2.7195550351288054, + "grad_norm": 0.9652527570724487, + "learning_rate": 2.8865975814695947e-06, + "loss": 0.5536, + "step": 9290 + }, + { + "epoch": 2.7198477751756442, + "grad_norm": 0.9914873242378235, + "learning_rate": 2.886217134328758e-06, + "loss": 0.6055, + "step": 9291 + }, + { + "epoch": 2.7201405152224822, + "grad_norm": 0.9535242319107056, + "learning_rate": 2.885836678024845e-06, + "loss": 0.5798, + "step": 9292 + }, + { + "epoch": 2.720433255269321, + "grad_norm": 0.99223393201828, + "learning_rate": 2.885456212566883e-06, + "loss": 0.5734, + "step": 9293 + }, + { + "epoch": 2.720725995316159, + "grad_norm": 1.043760895729065, + "learning_rate": 2.885075737963898e-06, + "loss": 0.5612, + "step": 9294 + }, + { + "epoch": 2.7210187353629975, + "grad_norm": 0.9784605503082275, + "learning_rate": 2.884695254224917e-06, + "loss": 0.569, + "step": 9295 + }, + { + "epoch": 2.721311475409836, + "grad_norm": 0.9853714108467102, + "learning_rate": 2.884314761358967e-06, + "loss": 0.567, + "step": 9296 + }, + { + "epoch": 2.7216042154566744, + "grad_norm": 1.0569777488708496, + "learning_rate": 2.883934259375076e-06, + "loss": 0.5968, + "step": 9297 + }, + { + "epoch": 2.721896955503513, + "grad_norm": 1.0302157402038574, + "learning_rate": 2.8835537482822713e-06, + "loss": 0.6033, + "step": 9298 + }, + { + "epoch": 2.7221896955503513, + "grad_norm": 0.9944224953651428, + "learning_rate": 2.8831732280895797e-06, + "loss": 0.5866, + "step": 9299 + }, + { + "epoch": 2.7224824355971897, + "grad_norm": 0.9671976566314697, + "learning_rate": 2.8827926988060295e-06, + "loss": 0.5621, + "step": 9300 + }, + { + "epoch": 2.722775175644028, + "grad_norm": 1.008946180343628, + "learning_rate": 2.8824121604406483e-06, + "loss": 0.5678, + "step": 9301 + }, + { + "epoch": 2.7230679156908666, + "grad_norm": 0.9634215831756592, + "learning_rate": 2.882031613002466e-06, + "loss": 0.5521, + "step": 9302 + }, + { + "epoch": 2.723360655737705, + "grad_norm": 0.9564135670661926, + "learning_rate": 2.8816510565005103e-06, + "loss": 0.5471, + "step": 9303 + }, + { + "epoch": 2.7236533957845435, + "grad_norm": 1.002044677734375, + "learning_rate": 2.8812704909438104e-06, + "loss": 0.5644, + "step": 9304 + }, + { + "epoch": 2.723946135831382, + "grad_norm": 0.9627578258514404, + "learning_rate": 2.8808899163413937e-06, + "loss": 0.5942, + "step": 9305 + }, + { + "epoch": 2.7242388758782203, + "grad_norm": 1.003683090209961, + "learning_rate": 2.8805093327022913e-06, + "loss": 0.5742, + "step": 9306 + }, + { + "epoch": 2.7245316159250583, + "grad_norm": 0.9915255904197693, + "learning_rate": 2.880128740035532e-06, + "loss": 0.5597, + "step": 9307 + }, + { + "epoch": 2.724824355971897, + "grad_norm": 1.0005927085876465, + "learning_rate": 2.8797481383501453e-06, + "loss": 0.5852, + "step": 9308 + }, + { + "epoch": 2.725117096018735, + "grad_norm": 0.9431834816932678, + "learning_rate": 2.879367527655161e-06, + "loss": 0.5753, + "step": 9309 + }, + { + "epoch": 2.7254098360655736, + "grad_norm": 1.0190460681915283, + "learning_rate": 2.8789869079596095e-06, + "loss": 0.5923, + "step": 9310 + }, + { + "epoch": 2.725702576112412, + "grad_norm": 1.0663928985595703, + "learning_rate": 2.87860627927252e-06, + "loss": 0.5798, + "step": 9311 + }, + { + "epoch": 2.7259953161592505, + "grad_norm": 0.9888255000114441, + "learning_rate": 2.8782256416029244e-06, + "loss": 0.6014, + "step": 9312 + }, + { + "epoch": 2.726288056206089, + "grad_norm": 0.9654275178909302, + "learning_rate": 2.877844994959852e-06, + "loss": 0.5893, + "step": 9313 + }, + { + "epoch": 2.7265807962529274, + "grad_norm": 0.9862688779830933, + "learning_rate": 2.8774643393523355e-06, + "loss": 0.5668, + "step": 9314 + }, + { + "epoch": 2.726873536299766, + "grad_norm": 1.0346291065216064, + "learning_rate": 2.8770836747894044e-06, + "loss": 0.6008, + "step": 9315 + }, + { + "epoch": 2.7271662763466042, + "grad_norm": 1.0179028511047363, + "learning_rate": 2.8767030012800906e-06, + "loss": 0.5749, + "step": 9316 + }, + { + "epoch": 2.7274590163934427, + "grad_norm": 0.9852107167243958, + "learning_rate": 2.876322318833426e-06, + "loss": 0.5678, + "step": 9317 + }, + { + "epoch": 2.727751756440281, + "grad_norm": 0.9778124094009399, + "learning_rate": 2.875941627458442e-06, + "loss": 0.5779, + "step": 9318 + }, + { + "epoch": 2.7280444964871196, + "grad_norm": 1.0434967279434204, + "learning_rate": 2.875560927164171e-06, + "loss": 0.5801, + "step": 9319 + }, + { + "epoch": 2.728337236533958, + "grad_norm": 0.9523108601570129, + "learning_rate": 2.8751802179596444e-06, + "loss": 0.5283, + "step": 9320 + }, + { + "epoch": 2.7286299765807964, + "grad_norm": 0.9378182888031006, + "learning_rate": 2.874799499853895e-06, + "loss": 0.5428, + "step": 9321 + }, + { + "epoch": 2.7289227166276344, + "grad_norm": 1.0248326063156128, + "learning_rate": 2.874418772855956e-06, + "loss": 0.5622, + "step": 9322 + }, + { + "epoch": 2.7292154566744733, + "grad_norm": 0.9984749555587769, + "learning_rate": 2.874038036974859e-06, + "loss": 0.5892, + "step": 9323 + }, + { + "epoch": 2.7295081967213113, + "grad_norm": 0.9753093719482422, + "learning_rate": 2.8736572922196384e-06, + "loss": 0.5832, + "step": 9324 + }, + { + "epoch": 2.7298009367681497, + "grad_norm": 0.9836376905441284, + "learning_rate": 2.873276538599326e-06, + "loss": 0.5496, + "step": 9325 + }, + { + "epoch": 2.730093676814988, + "grad_norm": 1.2120203971862793, + "learning_rate": 2.8728957761229566e-06, + "loss": 0.5603, + "step": 9326 + }, + { + "epoch": 2.7303864168618266, + "grad_norm": 0.9947241544723511, + "learning_rate": 2.8725150047995638e-06, + "loss": 0.5657, + "step": 9327 + }, + { + "epoch": 2.730679156908665, + "grad_norm": 1.0190016031265259, + "learning_rate": 2.87213422463818e-06, + "loss": 0.5779, + "step": 9328 + }, + { + "epoch": 2.7309718969555035, + "grad_norm": 0.9584509134292603, + "learning_rate": 2.8717534356478404e-06, + "loss": 0.524, + "step": 9329 + }, + { + "epoch": 2.731264637002342, + "grad_norm": 0.9749506115913391, + "learning_rate": 2.8713726378375795e-06, + "loss": 0.5761, + "step": 9330 + }, + { + "epoch": 2.7315573770491803, + "grad_norm": 0.9915590882301331, + "learning_rate": 2.8709918312164307e-06, + "loss": 0.5689, + "step": 9331 + }, + { + "epoch": 2.7318501170960188, + "grad_norm": 1.00376296043396, + "learning_rate": 2.8706110157934297e-06, + "loss": 0.6091, + "step": 9332 + }, + { + "epoch": 2.732142857142857, + "grad_norm": 1.0402039289474487, + "learning_rate": 2.870230191577611e-06, + "loss": 0.5912, + "step": 9333 + }, + { + "epoch": 2.7324355971896956, + "grad_norm": 0.951949954032898, + "learning_rate": 2.8698493585780108e-06, + "loss": 0.6018, + "step": 9334 + }, + { + "epoch": 2.732728337236534, + "grad_norm": 1.033478021621704, + "learning_rate": 2.8694685168036627e-06, + "loss": 0.5431, + "step": 9335 + }, + { + "epoch": 2.7330210772833725, + "grad_norm": 1.0480574369430542, + "learning_rate": 2.869087666263603e-06, + "loss": 0.5841, + "step": 9336 + }, + { + "epoch": 2.7333138173302105, + "grad_norm": 0.9993564486503601, + "learning_rate": 2.868706806966868e-06, + "loss": 0.5721, + "step": 9337 + }, + { + "epoch": 2.7336065573770494, + "grad_norm": 1.0220905542373657, + "learning_rate": 2.8683259389224926e-06, + "loss": 0.5764, + "step": 9338 + }, + { + "epoch": 2.7338992974238874, + "grad_norm": 0.9694909453392029, + "learning_rate": 2.8679450621395143e-06, + "loss": 0.5566, + "step": 9339 + }, + { + "epoch": 2.7341920374707263, + "grad_norm": 0.9429411292076111, + "learning_rate": 2.867564176626968e-06, + "loss": 0.564, + "step": 9340 + }, + { + "epoch": 2.7344847775175642, + "grad_norm": 1.009530782699585, + "learning_rate": 2.867183282393891e-06, + "loss": 0.5692, + "step": 9341 + }, + { + "epoch": 2.7347775175644027, + "grad_norm": 0.9713295102119446, + "learning_rate": 2.8668023794493206e-06, + "loss": 0.5891, + "step": 9342 + }, + { + "epoch": 2.735070257611241, + "grad_norm": 1.0831478834152222, + "learning_rate": 2.866421467802293e-06, + "loss": 0.6103, + "step": 9343 + }, + { + "epoch": 2.7353629976580796, + "grad_norm": 0.9643791317939758, + "learning_rate": 2.866040547461846e-06, + "loss": 0.5616, + "step": 9344 + }, + { + "epoch": 2.735655737704918, + "grad_norm": 0.9793335795402527, + "learning_rate": 2.8656596184370163e-06, + "loss": 0.5924, + "step": 9345 + }, + { + "epoch": 2.7359484777517564, + "grad_norm": 1.0199253559112549, + "learning_rate": 2.8652786807368417e-06, + "loss": 0.6047, + "step": 9346 + }, + { + "epoch": 2.736241217798595, + "grad_norm": 0.9925402402877808, + "learning_rate": 2.8648977343703604e-06, + "loss": 0.602, + "step": 9347 + }, + { + "epoch": 2.7365339578454333, + "grad_norm": 0.9715761542320251, + "learning_rate": 2.8645167793466104e-06, + "loss": 0.556, + "step": 9348 + }, + { + "epoch": 2.7368266978922717, + "grad_norm": 0.9834302663803101, + "learning_rate": 2.8641358156746302e-06, + "loss": 0.538, + "step": 9349 + }, + { + "epoch": 2.73711943793911, + "grad_norm": 1.0307087898254395, + "learning_rate": 2.863754843363457e-06, + "loss": 0.5578, + "step": 9350 + }, + { + "epoch": 2.7374121779859486, + "grad_norm": 0.9553983211517334, + "learning_rate": 2.86337386242213e-06, + "loss": 0.5397, + "step": 9351 + }, + { + "epoch": 2.737704918032787, + "grad_norm": 0.9624847173690796, + "learning_rate": 2.862992872859689e-06, + "loss": 0.6192, + "step": 9352 + }, + { + "epoch": 2.7379976580796255, + "grad_norm": 0.9845491051673889, + "learning_rate": 2.862611874685172e-06, + "loss": 0.5702, + "step": 9353 + }, + { + "epoch": 2.7382903981264635, + "grad_norm": 0.9749117493629456, + "learning_rate": 2.8622308679076193e-06, + "loss": 0.5598, + "step": 9354 + }, + { + "epoch": 2.7385831381733023, + "grad_norm": 1.0251368284225464, + "learning_rate": 2.8618498525360692e-06, + "loss": 0.5689, + "step": 9355 + }, + { + "epoch": 2.7388758782201403, + "grad_norm": 1.0235164165496826, + "learning_rate": 2.8614688285795616e-06, + "loss": 0.6235, + "step": 9356 + }, + { + "epoch": 2.7391686182669788, + "grad_norm": 0.9869339466094971, + "learning_rate": 2.8610877960471368e-06, + "loss": 0.5498, + "step": 9357 + }, + { + "epoch": 2.739461358313817, + "grad_norm": 1.006537675857544, + "learning_rate": 2.8607067549478347e-06, + "loss": 0.5557, + "step": 9358 + }, + { + "epoch": 2.7397540983606556, + "grad_norm": 1.01195228099823, + "learning_rate": 2.8603257052906957e-06, + "loss": 0.5936, + "step": 9359 + }, + { + "epoch": 2.740046838407494, + "grad_norm": 0.9620962738990784, + "learning_rate": 2.8599446470847603e-06, + "loss": 0.5816, + "step": 9360 + }, + { + "epoch": 2.7403395784543325, + "grad_norm": 0.9630130529403687, + "learning_rate": 2.8595635803390683e-06, + "loss": 0.553, + "step": 9361 + }, + { + "epoch": 2.740632318501171, + "grad_norm": 1.0300894975662231, + "learning_rate": 2.859182505062662e-06, + "loss": 0.5933, + "step": 9362 + }, + { + "epoch": 2.7409250585480094, + "grad_norm": 0.9793286323547363, + "learning_rate": 2.8588014212645814e-06, + "loss": 0.5658, + "step": 9363 + }, + { + "epoch": 2.741217798594848, + "grad_norm": 1.022901177406311, + "learning_rate": 2.858420328953869e-06, + "loss": 0.6001, + "step": 9364 + }, + { + "epoch": 2.7415105386416863, + "grad_norm": 0.9858667254447937, + "learning_rate": 2.8580392281395643e-06, + "loss": 0.5788, + "step": 9365 + }, + { + "epoch": 2.7418032786885247, + "grad_norm": 0.9610768556594849, + "learning_rate": 2.8576581188307094e-06, + "loss": 0.5874, + "step": 9366 + }, + { + "epoch": 2.742096018735363, + "grad_norm": 0.9952544569969177, + "learning_rate": 2.8572770010363484e-06, + "loss": 0.5489, + "step": 9367 + }, + { + "epoch": 2.7423887587822016, + "grad_norm": 0.9828405380249023, + "learning_rate": 2.856895874765521e-06, + "loss": 0.5913, + "step": 9368 + }, + { + "epoch": 2.7426814988290396, + "grad_norm": 0.9891294240951538, + "learning_rate": 2.8565147400272713e-06, + "loss": 0.5835, + "step": 9369 + }, + { + "epoch": 2.7429742388758784, + "grad_norm": 0.9676374197006226, + "learning_rate": 2.8561335968306403e-06, + "loss": 0.5895, + "step": 9370 + }, + { + "epoch": 2.7432669789227164, + "grad_norm": 1.015315055847168, + "learning_rate": 2.8557524451846714e-06, + "loss": 0.5459, + "step": 9371 + }, + { + "epoch": 2.7435597189695553, + "grad_norm": 0.9716100096702576, + "learning_rate": 2.855371285098407e-06, + "loss": 0.5828, + "step": 9372 + }, + { + "epoch": 2.7438524590163933, + "grad_norm": 1.013184666633606, + "learning_rate": 2.8549901165808913e-06, + "loss": 0.5801, + "step": 9373 + }, + { + "epoch": 2.7441451990632317, + "grad_norm": 0.9428216814994812, + "learning_rate": 2.8546089396411668e-06, + "loss": 0.5353, + "step": 9374 + }, + { + "epoch": 2.74443793911007, + "grad_norm": 0.9874728322029114, + "learning_rate": 2.854227754288277e-06, + "loss": 0.5893, + "step": 9375 + }, + { + "epoch": 2.7447306791569086, + "grad_norm": 0.9649765491485596, + "learning_rate": 2.853846560531265e-06, + "loss": 0.5464, + "step": 9376 + }, + { + "epoch": 2.745023419203747, + "grad_norm": 0.9833037257194519, + "learning_rate": 2.853465358379176e-06, + "loss": 0.5561, + "step": 9377 + }, + { + "epoch": 2.7453161592505855, + "grad_norm": 0.9868829846382141, + "learning_rate": 2.853084147841053e-06, + "loss": 0.55, + "step": 9378 + }, + { + "epoch": 2.745608899297424, + "grad_norm": 1.0290818214416504, + "learning_rate": 2.8527029289259415e-06, + "loss": 0.5694, + "step": 9379 + }, + { + "epoch": 2.7459016393442623, + "grad_norm": 1.0578100681304932, + "learning_rate": 2.852321701642885e-06, + "loss": 0.5785, + "step": 9380 + }, + { + "epoch": 2.746194379391101, + "grad_norm": 0.958702564239502, + "learning_rate": 2.8519404660009286e-06, + "loss": 0.5911, + "step": 9381 + }, + { + "epoch": 2.746487119437939, + "grad_norm": 0.9808756709098816, + "learning_rate": 2.8515592220091167e-06, + "loss": 0.5224, + "step": 9382 + }, + { + "epoch": 2.7467798594847777, + "grad_norm": 0.9737194180488586, + "learning_rate": 2.8511779696764947e-06, + "loss": 0.5978, + "step": 9383 + }, + { + "epoch": 2.747072599531616, + "grad_norm": 1.0254168510437012, + "learning_rate": 2.8507967090121076e-06, + "loss": 0.616, + "step": 9384 + }, + { + "epoch": 2.7473653395784545, + "grad_norm": 1.0173166990280151, + "learning_rate": 2.850415440025002e-06, + "loss": 0.5943, + "step": 9385 + }, + { + "epoch": 2.7476580796252925, + "grad_norm": 1.0211327075958252, + "learning_rate": 2.8500341627242223e-06, + "loss": 0.5933, + "step": 9386 + }, + { + "epoch": 2.7479508196721314, + "grad_norm": 0.9976787567138672, + "learning_rate": 2.8496528771188146e-06, + "loss": 0.5872, + "step": 9387 + }, + { + "epoch": 2.7482435597189694, + "grad_norm": 0.9575583338737488, + "learning_rate": 2.8492715832178254e-06, + "loss": 0.5256, + "step": 9388 + }, + { + "epoch": 2.748536299765808, + "grad_norm": 0.9139519333839417, + "learning_rate": 2.8488902810303002e-06, + "loss": 0.539, + "step": 9389 + }, + { + "epoch": 2.7488290398126463, + "grad_norm": 0.978006899356842, + "learning_rate": 2.8485089705652874e-06, + "loss": 0.5569, + "step": 9390 + }, + { + "epoch": 2.7491217798594847, + "grad_norm": 1.005049467086792, + "learning_rate": 2.848127651831831e-06, + "loss": 0.5618, + "step": 9391 + }, + { + "epoch": 2.749414519906323, + "grad_norm": 1.0341637134552002, + "learning_rate": 2.8477463248389804e-06, + "loss": 0.6078, + "step": 9392 + }, + { + "epoch": 2.7497072599531616, + "grad_norm": 1.014853596687317, + "learning_rate": 2.8473649895957805e-06, + "loss": 0.5753, + "step": 9393 + }, + { + "epoch": 2.75, + "grad_norm": 0.9601976871490479, + "learning_rate": 2.84698364611128e-06, + "loss": 0.5469, + "step": 9394 + }, + { + "epoch": 2.7502927400468384, + "grad_norm": 0.9662564992904663, + "learning_rate": 2.846602294394526e-06, + "loss": 0.5305, + "step": 9395 + }, + { + "epoch": 2.750585480093677, + "grad_norm": 1.0423386096954346, + "learning_rate": 2.8462209344545656e-06, + "loss": 0.5793, + "step": 9396 + }, + { + "epoch": 2.7508782201405153, + "grad_norm": 0.9963483214378357, + "learning_rate": 2.845839566300447e-06, + "loss": 0.588, + "step": 9397 + }, + { + "epoch": 2.7511709601873537, + "grad_norm": 0.9956042170524597, + "learning_rate": 2.8454581899412183e-06, + "loss": 0.57, + "step": 9398 + }, + { + "epoch": 2.751463700234192, + "grad_norm": 1.0129120349884033, + "learning_rate": 2.8450768053859274e-06, + "loss": 0.5747, + "step": 9399 + }, + { + "epoch": 2.7517564402810306, + "grad_norm": 0.9846058487892151, + "learning_rate": 2.844695412643624e-06, + "loss": 0.597, + "step": 9400 + }, + { + "epoch": 2.7520491803278686, + "grad_norm": 0.9449270963668823, + "learning_rate": 2.844314011723355e-06, + "loss": 0.5528, + "step": 9401 + }, + { + "epoch": 2.7523419203747075, + "grad_norm": 0.9712116718292236, + "learning_rate": 2.8439326026341696e-06, + "loss": 0.5544, + "step": 9402 + }, + { + "epoch": 2.7526346604215455, + "grad_norm": 0.9297698140144348, + "learning_rate": 2.843551185385117e-06, + "loss": 0.5205, + "step": 9403 + }, + { + "epoch": 2.752927400468384, + "grad_norm": 0.9825588464736938, + "learning_rate": 2.843169759985247e-06, + "loss": 0.5507, + "step": 9404 + }, + { + "epoch": 2.7532201405152223, + "grad_norm": 0.9558621048927307, + "learning_rate": 2.8427883264436084e-06, + "loss": 0.5579, + "step": 9405 + }, + { + "epoch": 2.753512880562061, + "grad_norm": 1.1054574251174927, + "learning_rate": 2.842406884769251e-06, + "loss": 0.5488, + "step": 9406 + }, + { + "epoch": 2.753805620608899, + "grad_norm": 1.0290380716323853, + "learning_rate": 2.8420254349712247e-06, + "loss": 0.577, + "step": 9407 + }, + { + "epoch": 2.7540983606557377, + "grad_norm": 1.0099371671676636, + "learning_rate": 2.8416439770585785e-06, + "loss": 0.5594, + "step": 9408 + }, + { + "epoch": 2.754391100702576, + "grad_norm": 1.0485857725143433, + "learning_rate": 2.8412625110403635e-06, + "loss": 0.5787, + "step": 9409 + }, + { + "epoch": 2.7546838407494145, + "grad_norm": 0.9962441921234131, + "learning_rate": 2.840881036925631e-06, + "loss": 0.5556, + "step": 9410 + }, + { + "epoch": 2.754976580796253, + "grad_norm": 1.030652403831482, + "learning_rate": 2.8404995547234293e-06, + "loss": 0.5281, + "step": 9411 + }, + { + "epoch": 2.7552693208430914, + "grad_norm": 1.036529302597046, + "learning_rate": 2.84011806444281e-06, + "loss": 0.5732, + "step": 9412 + }, + { + "epoch": 2.75556206088993, + "grad_norm": 1.028822660446167, + "learning_rate": 2.8397365660928246e-06, + "loss": 0.5746, + "step": 9413 + }, + { + "epoch": 2.7558548009367683, + "grad_norm": 0.9132595062255859, + "learning_rate": 2.8393550596825248e-06, + "loss": 0.5287, + "step": 9414 + }, + { + "epoch": 2.7561475409836067, + "grad_norm": 0.981693685054779, + "learning_rate": 2.83897354522096e-06, + "loss": 0.5854, + "step": 9415 + }, + { + "epoch": 2.7564402810304447, + "grad_norm": 0.9946855902671814, + "learning_rate": 2.8385920227171827e-06, + "loss": 0.5999, + "step": 9416 + }, + { + "epoch": 2.7567330210772836, + "grad_norm": 0.9942695498466492, + "learning_rate": 2.8382104921802454e-06, + "loss": 0.5558, + "step": 9417 + }, + { + "epoch": 2.7570257611241216, + "grad_norm": 0.9995521903038025, + "learning_rate": 2.8378289536191985e-06, + "loss": 0.5845, + "step": 9418 + }, + { + "epoch": 2.7573185011709604, + "grad_norm": 1.0025780200958252, + "learning_rate": 2.8374474070430948e-06, + "loss": 0.5353, + "step": 9419 + }, + { + "epoch": 2.7576112412177984, + "grad_norm": 0.9748710989952087, + "learning_rate": 2.8370658524609872e-06, + "loss": 0.559, + "step": 9420 + }, + { + "epoch": 2.757903981264637, + "grad_norm": 1.075530767440796, + "learning_rate": 2.836684289881927e-06, + "loss": 0.6059, + "step": 9421 + }, + { + "epoch": 2.7581967213114753, + "grad_norm": 1.0012534856796265, + "learning_rate": 2.836302719314967e-06, + "loss": 0.5646, + "step": 9422 + }, + { + "epoch": 2.7584894613583137, + "grad_norm": 1.0531114339828491, + "learning_rate": 2.8359211407691604e-06, + "loss": 0.5376, + "step": 9423 + }, + { + "epoch": 2.758782201405152, + "grad_norm": 1.0124820470809937, + "learning_rate": 2.835539554253561e-06, + "loss": 0.5262, + "step": 9424 + }, + { + "epoch": 2.7590749414519906, + "grad_norm": 0.9872561097145081, + "learning_rate": 2.8351579597772205e-06, + "loss": 0.5368, + "step": 9425 + }, + { + "epoch": 2.759367681498829, + "grad_norm": 0.9837691783905029, + "learning_rate": 2.834776357349193e-06, + "loss": 0.5774, + "step": 9426 + }, + { + "epoch": 2.7596604215456675, + "grad_norm": 1.0122126340866089, + "learning_rate": 2.834394746978533e-06, + "loss": 0.5378, + "step": 9427 + }, + { + "epoch": 2.759953161592506, + "grad_norm": 1.0065420866012573, + "learning_rate": 2.834013128674292e-06, + "loss": 0.5516, + "step": 9428 + }, + { + "epoch": 2.7602459016393444, + "grad_norm": 0.9343027472496033, + "learning_rate": 2.833631502445526e-06, + "loss": 0.5086, + "step": 9429 + }, + { + "epoch": 2.760538641686183, + "grad_norm": 0.9522389769554138, + "learning_rate": 2.8332498683012887e-06, + "loss": 0.5423, + "step": 9430 + }, + { + "epoch": 2.7608313817330212, + "grad_norm": 0.9828049540519714, + "learning_rate": 2.8328682262506342e-06, + "loss": 0.5877, + "step": 9431 + }, + { + "epoch": 2.7611241217798597, + "grad_norm": 1.0762742757797241, + "learning_rate": 2.8324865763026173e-06, + "loss": 0.545, + "step": 9432 + }, + { + "epoch": 2.7614168618266977, + "grad_norm": 1.0463359355926514, + "learning_rate": 2.8321049184662923e-06, + "loss": 0.5418, + "step": 9433 + }, + { + "epoch": 2.7617096018735365, + "grad_norm": 1.0040371417999268, + "learning_rate": 2.8317232527507143e-06, + "loss": 0.5367, + "step": 9434 + }, + { + "epoch": 2.7620023419203745, + "grad_norm": 1.0365276336669922, + "learning_rate": 2.8313415791649385e-06, + "loss": 0.5749, + "step": 9435 + }, + { + "epoch": 2.762295081967213, + "grad_norm": 0.9813699126243591, + "learning_rate": 2.83095989771802e-06, + "loss": 0.5838, + "step": 9436 + }, + { + "epoch": 2.7625878220140514, + "grad_norm": 1.0067254304885864, + "learning_rate": 2.830578208419014e-06, + "loss": 0.594, + "step": 9437 + }, + { + "epoch": 2.76288056206089, + "grad_norm": 0.9810924530029297, + "learning_rate": 2.8301965112769766e-06, + "loss": 0.5576, + "step": 9438 + }, + { + "epoch": 2.7631733021077283, + "grad_norm": 1.0145649909973145, + "learning_rate": 2.8298148063009632e-06, + "loss": 0.5302, + "step": 9439 + }, + { + "epoch": 2.7634660421545667, + "grad_norm": 1.0351791381835938, + "learning_rate": 2.8294330935000315e-06, + "loss": 0.5434, + "step": 9440 + }, + { + "epoch": 2.763758782201405, + "grad_norm": 0.9896538257598877, + "learning_rate": 2.8290513728832347e-06, + "loss": 0.5568, + "step": 9441 + }, + { + "epoch": 2.7640515222482436, + "grad_norm": 0.9387458562850952, + "learning_rate": 2.828669644459632e-06, + "loss": 0.5441, + "step": 9442 + }, + { + "epoch": 2.764344262295082, + "grad_norm": 0.9837558269500732, + "learning_rate": 2.828287908238278e-06, + "loss": 0.5454, + "step": 9443 + }, + { + "epoch": 2.7646370023419204, + "grad_norm": 1.0680006742477417, + "learning_rate": 2.827906164228231e-06, + "loss": 0.5689, + "step": 9444 + }, + { + "epoch": 2.764929742388759, + "grad_norm": 0.9783828258514404, + "learning_rate": 2.827524412438547e-06, + "loss": 0.5577, + "step": 9445 + }, + { + "epoch": 2.7652224824355973, + "grad_norm": 1.0061050653457642, + "learning_rate": 2.8271426528782835e-06, + "loss": 0.5735, + "step": 9446 + }, + { + "epoch": 2.7655152224824358, + "grad_norm": 0.9677219986915588, + "learning_rate": 2.8267608855564978e-06, + "loss": 0.5748, + "step": 9447 + }, + { + "epoch": 2.7658079625292737, + "grad_norm": 0.956000566482544, + "learning_rate": 2.8263791104822468e-06, + "loss": 0.5723, + "step": 9448 + }, + { + "epoch": 2.7661007025761126, + "grad_norm": 1.0693317651748657, + "learning_rate": 2.825997327664589e-06, + "loss": 0.5538, + "step": 9449 + }, + { + "epoch": 2.7663934426229506, + "grad_norm": 1.0203648805618286, + "learning_rate": 2.825615537112582e-06, + "loss": 0.5872, + "step": 9450 + }, + { + "epoch": 2.7666861826697895, + "grad_norm": 0.965609610080719, + "learning_rate": 2.825233738835284e-06, + "loss": 0.5783, + "step": 9451 + }, + { + "epoch": 2.7669789227166275, + "grad_norm": 1.03472900390625, + "learning_rate": 2.8248519328417534e-06, + "loss": 0.5497, + "step": 9452 + }, + { + "epoch": 2.767271662763466, + "grad_norm": 1.0349180698394775, + "learning_rate": 2.8244701191410474e-06, + "loss": 0.5595, + "step": 9453 + }, + { + "epoch": 2.7675644028103044, + "grad_norm": 0.9880517721176147, + "learning_rate": 2.8240882977422257e-06, + "loss": 0.5629, + "step": 9454 + }, + { + "epoch": 2.767857142857143, + "grad_norm": 0.96617591381073, + "learning_rate": 2.8237064686543477e-06, + "loss": 0.5763, + "step": 9455 + }, + { + "epoch": 2.7681498829039812, + "grad_norm": 1.031312346458435, + "learning_rate": 2.8233246318864706e-06, + "loss": 0.577, + "step": 9456 + }, + { + "epoch": 2.7684426229508197, + "grad_norm": 0.9933826923370361, + "learning_rate": 2.8229427874476544e-06, + "loss": 0.5402, + "step": 9457 + }, + { + "epoch": 2.768735362997658, + "grad_norm": 1.012389898300171, + "learning_rate": 2.8225609353469592e-06, + "loss": 0.5574, + "step": 9458 + }, + { + "epoch": 2.7690281030444965, + "grad_norm": 0.9482973217964172, + "learning_rate": 2.822179075593443e-06, + "loss": 0.5634, + "step": 9459 + }, + { + "epoch": 2.769320843091335, + "grad_norm": 1.1572954654693604, + "learning_rate": 2.8217972081961676e-06, + "loss": 0.5936, + "step": 9460 + }, + { + "epoch": 2.7696135831381734, + "grad_norm": 0.9779424667358398, + "learning_rate": 2.821415333164191e-06, + "loss": 0.5852, + "step": 9461 + }, + { + "epoch": 2.769906323185012, + "grad_norm": 1.0261083841323853, + "learning_rate": 2.8210334505065733e-06, + "loss": 0.5746, + "step": 9462 + }, + { + "epoch": 2.7701990632318503, + "grad_norm": 1.0825210809707642, + "learning_rate": 2.820651560232376e-06, + "loss": 0.6054, + "step": 9463 + }, + { + "epoch": 2.7704918032786887, + "grad_norm": 0.9692781567573547, + "learning_rate": 2.8202696623506583e-06, + "loss": 0.5881, + "step": 9464 + }, + { + "epoch": 2.7707845433255267, + "grad_norm": 0.9667243957519531, + "learning_rate": 2.8198877568704813e-06, + "loss": 0.5773, + "step": 9465 + }, + { + "epoch": 2.7710772833723656, + "grad_norm": 0.973293662071228, + "learning_rate": 2.8195058438009065e-06, + "loss": 0.5236, + "step": 9466 + }, + { + "epoch": 2.7713700234192036, + "grad_norm": 0.9819160103797913, + "learning_rate": 2.8191239231509937e-06, + "loss": 0.5512, + "step": 9467 + }, + { + "epoch": 2.771662763466042, + "grad_norm": 0.9985999464988708, + "learning_rate": 2.8187419949298045e-06, + "loss": 0.6031, + "step": 9468 + }, + { + "epoch": 2.7719555035128804, + "grad_norm": 0.9970507621765137, + "learning_rate": 2.8183600591464004e-06, + "loss": 0.5916, + "step": 9469 + }, + { + "epoch": 2.772248243559719, + "grad_norm": 0.9579191207885742, + "learning_rate": 2.817978115809843e-06, + "loss": 0.5436, + "step": 9470 + }, + { + "epoch": 2.7725409836065573, + "grad_norm": 1.0050179958343506, + "learning_rate": 2.817596164929194e-06, + "loss": 0.5775, + "step": 9471 + }, + { + "epoch": 2.7728337236533958, + "grad_norm": 1.00389564037323, + "learning_rate": 2.8172142065135137e-06, + "loss": 0.5431, + "step": 9472 + }, + { + "epoch": 2.773126463700234, + "grad_norm": 0.9871366620063782, + "learning_rate": 2.816832240571866e-06, + "loss": 0.5802, + "step": 9473 + }, + { + "epoch": 2.7734192037470726, + "grad_norm": 0.9896851181983948, + "learning_rate": 2.8164502671133124e-06, + "loss": 0.6004, + "step": 9474 + }, + { + "epoch": 2.773711943793911, + "grad_norm": 0.9655653238296509, + "learning_rate": 2.8160682861469157e-06, + "loss": 0.6039, + "step": 9475 + }, + { + "epoch": 2.7740046838407495, + "grad_norm": 0.9631032943725586, + "learning_rate": 2.8156862976817384e-06, + "loss": 0.5676, + "step": 9476 + }, + { + "epoch": 2.774297423887588, + "grad_norm": 1.0260127782821655, + "learning_rate": 2.8153043017268426e-06, + "loss": 0.5583, + "step": 9477 + }, + { + "epoch": 2.7745901639344264, + "grad_norm": 0.9774251580238342, + "learning_rate": 2.814922298291292e-06, + "loss": 0.5748, + "step": 9478 + }, + { + "epoch": 2.774882903981265, + "grad_norm": 0.9716874957084656, + "learning_rate": 2.814540287384149e-06, + "loss": 0.5741, + "step": 9479 + }, + { + "epoch": 2.775175644028103, + "grad_norm": 1.0029414892196655, + "learning_rate": 2.814158269014477e-06, + "loss": 0.5733, + "step": 9480 + }, + { + "epoch": 2.7754683840749417, + "grad_norm": 0.9680550694465637, + "learning_rate": 2.813776243191341e-06, + "loss": 0.5423, + "step": 9481 + }, + { + "epoch": 2.7757611241217797, + "grad_norm": 0.9897329211235046, + "learning_rate": 2.813394209923802e-06, + "loss": 0.5557, + "step": 9482 + }, + { + "epoch": 2.776053864168618, + "grad_norm": 0.9828308820724487, + "learning_rate": 2.8130121692209257e-06, + "loss": 0.5892, + "step": 9483 + }, + { + "epoch": 2.7763466042154565, + "grad_norm": 1.0151290893554688, + "learning_rate": 2.812630121091775e-06, + "loss": 0.5584, + "step": 9484 + }, + { + "epoch": 2.776639344262295, + "grad_norm": 0.9922875165939331, + "learning_rate": 2.812248065545415e-06, + "loss": 0.5926, + "step": 9485 + }, + { + "epoch": 2.7769320843091334, + "grad_norm": 1.0276262760162354, + "learning_rate": 2.81186600259091e-06, + "loss": 0.5329, + "step": 9486 + }, + { + "epoch": 2.777224824355972, + "grad_norm": 0.9978697896003723, + "learning_rate": 2.8114839322373236e-06, + "loss": 0.6227, + "step": 9487 + }, + { + "epoch": 2.7775175644028103, + "grad_norm": 0.9557158350944519, + "learning_rate": 2.8111018544937226e-06, + "loss": 0.5635, + "step": 9488 + }, + { + "epoch": 2.7778103044496487, + "grad_norm": 0.966530978679657, + "learning_rate": 2.8107197693691684e-06, + "loss": 0.5655, + "step": 9489 + }, + { + "epoch": 2.778103044496487, + "grad_norm": 0.9620060920715332, + "learning_rate": 2.8103376768727284e-06, + "loss": 0.521, + "step": 9490 + }, + { + "epoch": 2.7783957845433256, + "grad_norm": 1.0008882284164429, + "learning_rate": 2.8099555770134686e-06, + "loss": 0.5697, + "step": 9491 + }, + { + "epoch": 2.778688524590164, + "grad_norm": 1.0302172899246216, + "learning_rate": 2.8095734698004525e-06, + "loss": 0.5931, + "step": 9492 + }, + { + "epoch": 2.7789812646370025, + "grad_norm": 0.8945954442024231, + "learning_rate": 2.809191355242746e-06, + "loss": 0.4903, + "step": 9493 + }, + { + "epoch": 2.779274004683841, + "grad_norm": 0.9621415734291077, + "learning_rate": 2.808809233349415e-06, + "loss": 0.5788, + "step": 9494 + }, + { + "epoch": 2.779566744730679, + "grad_norm": 0.9909876585006714, + "learning_rate": 2.808427104129526e-06, + "loss": 0.602, + "step": 9495 + }, + { + "epoch": 2.7798594847775178, + "grad_norm": 1.0128012895584106, + "learning_rate": 2.8080449675921444e-06, + "loss": 0.5349, + "step": 9496 + }, + { + "epoch": 2.7801522248243558, + "grad_norm": 0.9762925505638123, + "learning_rate": 2.807662823746337e-06, + "loss": 0.5829, + "step": 9497 + }, + { + "epoch": 2.7804449648711946, + "grad_norm": 0.9943733215332031, + "learning_rate": 2.8072806726011698e-06, + "loss": 0.5995, + "step": 9498 + }, + { + "epoch": 2.7807377049180326, + "grad_norm": 1.0559827089309692, + "learning_rate": 2.8068985141657094e-06, + "loss": 0.591, + "step": 9499 + }, + { + "epoch": 2.781030444964871, + "grad_norm": 0.989127516746521, + "learning_rate": 2.8065163484490233e-06, + "loss": 0.5892, + "step": 9500 + }, + { + "epoch": 2.7813231850117095, + "grad_norm": 0.9671430587768555, + "learning_rate": 2.8061341754601783e-06, + "loss": 0.5665, + "step": 9501 + }, + { + "epoch": 2.781615925058548, + "grad_norm": 0.9461710453033447, + "learning_rate": 2.805751995208241e-06, + "loss": 0.56, + "step": 9502 + }, + { + "epoch": 2.7819086651053864, + "grad_norm": 1.0082274675369263, + "learning_rate": 2.8053698077022785e-06, + "loss": 0.5934, + "step": 9503 + }, + { + "epoch": 2.782201405152225, + "grad_norm": 1.0775409936904907, + "learning_rate": 2.804987612951359e-06, + "loss": 0.5722, + "step": 9504 + }, + { + "epoch": 2.7824941451990632, + "grad_norm": 0.9944618344306946, + "learning_rate": 2.804605410964549e-06, + "loss": 0.5528, + "step": 9505 + }, + { + "epoch": 2.7827868852459017, + "grad_norm": 0.9499923586845398, + "learning_rate": 2.8042232017509175e-06, + "loss": 0.556, + "step": 9506 + }, + { + "epoch": 2.78307962529274, + "grad_norm": 0.9886543154716492, + "learning_rate": 2.8038409853195327e-06, + "loss": 0.5739, + "step": 9507 + }, + { + "epoch": 2.7833723653395785, + "grad_norm": 0.9644238948822021, + "learning_rate": 2.8034587616794616e-06, + "loss": 0.562, + "step": 9508 + }, + { + "epoch": 2.783665105386417, + "grad_norm": 0.9924982190132141, + "learning_rate": 2.8030765308397727e-06, + "loss": 0.5714, + "step": 9509 + }, + { + "epoch": 2.7839578454332554, + "grad_norm": 1.003706455230713, + "learning_rate": 2.8026942928095356e-06, + "loss": 0.5327, + "step": 9510 + }, + { + "epoch": 2.784250585480094, + "grad_norm": 0.9837749004364014, + "learning_rate": 2.802312047597818e-06, + "loss": 0.5362, + "step": 9511 + }, + { + "epoch": 2.784543325526932, + "grad_norm": 1.0201160907745361, + "learning_rate": 2.801929795213689e-06, + "loss": 0.5255, + "step": 9512 + }, + { + "epoch": 2.7848360655737707, + "grad_norm": 0.9992729425430298, + "learning_rate": 2.8015475356662176e-06, + "loss": 0.6217, + "step": 9513 + }, + { + "epoch": 2.7851288056206087, + "grad_norm": 1.048880696296692, + "learning_rate": 2.8011652689644726e-06, + "loss": 0.6343, + "step": 9514 + }, + { + "epoch": 2.785421545667447, + "grad_norm": 1.0560355186462402, + "learning_rate": 2.8007829951175243e-06, + "loss": 0.5548, + "step": 9515 + }, + { + "epoch": 2.7857142857142856, + "grad_norm": 0.9610493779182434, + "learning_rate": 2.8004007141344412e-06, + "loss": 0.515, + "step": 9516 + }, + { + "epoch": 2.786007025761124, + "grad_norm": 1.0075844526290894, + "learning_rate": 2.800018426024294e-06, + "loss": 0.5954, + "step": 9517 + }, + { + "epoch": 2.7862997658079625, + "grad_norm": 0.9575924277305603, + "learning_rate": 2.7996361307961515e-06, + "loss": 0.5788, + "step": 9518 + }, + { + "epoch": 2.786592505854801, + "grad_norm": 1.0046026706695557, + "learning_rate": 2.799253828459084e-06, + "loss": 0.5867, + "step": 9519 + }, + { + "epoch": 2.7868852459016393, + "grad_norm": 0.9901940226554871, + "learning_rate": 2.7988715190221623e-06, + "loss": 0.5269, + "step": 9520 + }, + { + "epoch": 2.7871779859484778, + "grad_norm": 0.9665742516517639, + "learning_rate": 2.798489202494457e-06, + "loss": 0.5659, + "step": 9521 + }, + { + "epoch": 2.787470725995316, + "grad_norm": 1.0749585628509521, + "learning_rate": 2.7981068788850373e-06, + "loss": 0.6043, + "step": 9522 + }, + { + "epoch": 2.7877634660421546, + "grad_norm": 0.9778940677642822, + "learning_rate": 2.7977245482029754e-06, + "loss": 0.5739, + "step": 9523 + }, + { + "epoch": 2.788056206088993, + "grad_norm": 0.9759072065353394, + "learning_rate": 2.7973422104573405e-06, + "loss": 0.5769, + "step": 9524 + }, + { + "epoch": 2.7883489461358315, + "grad_norm": 0.9985911250114441, + "learning_rate": 2.7969598656572045e-06, + "loss": 0.5515, + "step": 9525 + }, + { + "epoch": 2.78864168618267, + "grad_norm": 0.9675553441047668, + "learning_rate": 2.796577513811639e-06, + "loss": 0.5515, + "step": 9526 + }, + { + "epoch": 2.788934426229508, + "grad_norm": 0.9811699390411377, + "learning_rate": 2.7961951549297152e-06, + "loss": 0.571, + "step": 9527 + }, + { + "epoch": 2.789227166276347, + "grad_norm": 0.9930775761604309, + "learning_rate": 2.795812789020504e-06, + "loss": 0.5578, + "step": 9528 + }, + { + "epoch": 2.789519906323185, + "grad_norm": 0.9935011863708496, + "learning_rate": 2.7954304160930777e-06, + "loss": 0.5735, + "step": 9529 + }, + { + "epoch": 2.7898126463700237, + "grad_norm": 0.9726369380950928, + "learning_rate": 2.795048036156508e-06, + "loss": 0.5463, + "step": 9530 + }, + { + "epoch": 2.7901053864168617, + "grad_norm": 1.0262537002563477, + "learning_rate": 2.7946656492198672e-06, + "loss": 0.5594, + "step": 9531 + }, + { + "epoch": 2.7903981264637, + "grad_norm": 1.0105459690093994, + "learning_rate": 2.7942832552922272e-06, + "loss": 0.5352, + "step": 9532 + }, + { + "epoch": 2.7906908665105385, + "grad_norm": 1.0174109935760498, + "learning_rate": 2.7939008543826602e-06, + "loss": 0.5588, + "step": 9533 + }, + { + "epoch": 2.790983606557377, + "grad_norm": 1.1178425550460815, + "learning_rate": 2.7935184465002384e-06, + "loss": 0.5757, + "step": 9534 + }, + { + "epoch": 2.7912763466042154, + "grad_norm": 1.0018393993377686, + "learning_rate": 2.7931360316540356e-06, + "loss": 0.6018, + "step": 9535 + }, + { + "epoch": 2.791569086651054, + "grad_norm": 1.0283631086349487, + "learning_rate": 2.7927536098531247e-06, + "loss": 0.5864, + "step": 9536 + }, + { + "epoch": 2.7918618266978923, + "grad_norm": 1.0225685834884644, + "learning_rate": 2.792371181106578e-06, + "loss": 0.5482, + "step": 9537 + }, + { + "epoch": 2.7921545667447307, + "grad_norm": 1.0082563161849976, + "learning_rate": 2.7919887454234685e-06, + "loss": 0.5427, + "step": 9538 + }, + { + "epoch": 2.792447306791569, + "grad_norm": 0.9972245097160339, + "learning_rate": 2.79160630281287e-06, + "loss": 0.5566, + "step": 9539 + }, + { + "epoch": 2.7927400468384076, + "grad_norm": 0.9661069512367249, + "learning_rate": 2.791223853283856e-06, + "loss": 0.5801, + "step": 9540 + }, + { + "epoch": 2.793032786885246, + "grad_norm": 0.9853939414024353, + "learning_rate": 2.7908413968455e-06, + "loss": 0.5568, + "step": 9541 + }, + { + "epoch": 2.7933255269320845, + "grad_norm": 1.0296343564987183, + "learning_rate": 2.7904589335068766e-06, + "loss": 0.5919, + "step": 9542 + }, + { + "epoch": 2.793618266978923, + "grad_norm": 0.9911004900932312, + "learning_rate": 2.790076463277058e-06, + "loss": 0.5522, + "step": 9543 + }, + { + "epoch": 2.793911007025761, + "grad_norm": 0.9857498407363892, + "learning_rate": 2.78969398616512e-06, + "loss": 0.5701, + "step": 9544 + }, + { + "epoch": 2.7942037470725998, + "grad_norm": 0.9571513533592224, + "learning_rate": 2.789311502180137e-06, + "loss": 0.5581, + "step": 9545 + }, + { + "epoch": 2.7944964871194378, + "grad_norm": 0.9535316228866577, + "learning_rate": 2.7889290113311822e-06, + "loss": 0.5275, + "step": 9546 + }, + { + "epoch": 2.794789227166276, + "grad_norm": 0.9649606943130493, + "learning_rate": 2.7885465136273326e-06, + "loss": 0.5566, + "step": 9547 + }, + { + "epoch": 2.7950819672131146, + "grad_norm": 0.9861120581626892, + "learning_rate": 2.7881640090776603e-06, + "loss": 0.5573, + "step": 9548 + }, + { + "epoch": 2.795374707259953, + "grad_norm": 0.9615885019302368, + "learning_rate": 2.7877814976912415e-06, + "loss": 0.6149, + "step": 9549 + }, + { + "epoch": 2.7956674473067915, + "grad_norm": 0.9744978547096252, + "learning_rate": 2.787398979477152e-06, + "loss": 0.5631, + "step": 9550 + }, + { + "epoch": 2.79596018735363, + "grad_norm": 1.0075618028640747, + "learning_rate": 2.7870164544444656e-06, + "loss": 0.6165, + "step": 9551 + }, + { + "epoch": 2.7962529274004684, + "grad_norm": 1.0143351554870605, + "learning_rate": 2.7866339226022596e-06, + "loss": 0.598, + "step": 9552 + }, + { + "epoch": 2.796545667447307, + "grad_norm": 1.0261744260787964, + "learning_rate": 2.7862513839596085e-06, + "loss": 0.6078, + "step": 9553 + }, + { + "epoch": 2.7968384074941453, + "grad_norm": 1.076375961303711, + "learning_rate": 2.7858688385255883e-06, + "loss": 0.5824, + "step": 9554 + }, + { + "epoch": 2.7971311475409837, + "grad_norm": 1.0011662244796753, + "learning_rate": 2.7854862863092745e-06, + "loss": 0.5822, + "step": 9555 + }, + { + "epoch": 2.797423887587822, + "grad_norm": 0.9573985934257507, + "learning_rate": 2.785103727319744e-06, + "loss": 0.5835, + "step": 9556 + }, + { + "epoch": 2.7977166276346606, + "grad_norm": 0.9663135409355164, + "learning_rate": 2.7847211615660736e-06, + "loss": 0.5901, + "step": 9557 + }, + { + "epoch": 2.798009367681499, + "grad_norm": 1.0339508056640625, + "learning_rate": 2.7843385890573376e-06, + "loss": 0.5546, + "step": 9558 + }, + { + "epoch": 2.798302107728337, + "grad_norm": 0.9851021766662598, + "learning_rate": 2.7839560098026153e-06, + "loss": 0.552, + "step": 9559 + }, + { + "epoch": 2.798594847775176, + "grad_norm": 1.5920500755310059, + "learning_rate": 2.7835734238109813e-06, + "loss": 0.6203, + "step": 9560 + }, + { + "epoch": 2.798887587822014, + "grad_norm": 0.9599549174308777, + "learning_rate": 2.7831908310915136e-06, + "loss": 0.5738, + "step": 9561 + }, + { + "epoch": 2.7991803278688527, + "grad_norm": 0.9989436864852905, + "learning_rate": 2.782808231653289e-06, + "loss": 0.5944, + "step": 9562 + }, + { + "epoch": 2.7994730679156907, + "grad_norm": 1.0039925575256348, + "learning_rate": 2.7824256255053842e-06, + "loss": 0.5686, + "step": 9563 + }, + { + "epoch": 2.799765807962529, + "grad_norm": 0.9025194644927979, + "learning_rate": 2.7820430126568774e-06, + "loss": 0.5143, + "step": 9564 + }, + { + "epoch": 2.8000585480093676, + "grad_norm": 0.9852994084358215, + "learning_rate": 2.7816603931168463e-06, + "loss": 0.5922, + "step": 9565 + }, + { + "epoch": 2.800351288056206, + "grad_norm": 0.9786555767059326, + "learning_rate": 2.781277766894368e-06, + "loss": 0.5629, + "step": 9566 + }, + { + "epoch": 2.8006440281030445, + "grad_norm": 1.0132288932800293, + "learning_rate": 2.7808951339985213e-06, + "loss": 0.5523, + "step": 9567 + }, + { + "epoch": 2.800936768149883, + "grad_norm": 1.0194716453552246, + "learning_rate": 2.780512494438383e-06, + "loss": 0.553, + "step": 9568 + }, + { + "epoch": 2.8012295081967213, + "grad_norm": 1.0739185810089111, + "learning_rate": 2.7801298482230315e-06, + "loss": 0.601, + "step": 9569 + }, + { + "epoch": 2.8015222482435598, + "grad_norm": 0.9777877330780029, + "learning_rate": 2.779747195361546e-06, + "loss": 0.5476, + "step": 9570 + }, + { + "epoch": 2.801814988290398, + "grad_norm": 0.9889852404594421, + "learning_rate": 2.7793645358630044e-06, + "loss": 0.5666, + "step": 9571 + }, + { + "epoch": 2.8021077283372366, + "grad_norm": 1.0658154487609863, + "learning_rate": 2.7789818697364856e-06, + "loss": 0.5752, + "step": 9572 + }, + { + "epoch": 2.802400468384075, + "grad_norm": 0.9775821566581726, + "learning_rate": 2.778599196991069e-06, + "loss": 0.5281, + "step": 9573 + }, + { + "epoch": 2.802693208430913, + "grad_norm": 1.01515793800354, + "learning_rate": 2.7782165176358317e-06, + "loss": 0.565, + "step": 9574 + }, + { + "epoch": 2.802985948477752, + "grad_norm": 1.0223830938339233, + "learning_rate": 2.7778338316798544e-06, + "loss": 0.585, + "step": 9575 + }, + { + "epoch": 2.80327868852459, + "grad_norm": 0.9825272560119629, + "learning_rate": 2.7774511391322167e-06, + "loss": 0.5495, + "step": 9576 + }, + { + "epoch": 2.803571428571429, + "grad_norm": 1.0317846536636353, + "learning_rate": 2.7770684400019972e-06, + "loss": 0.5502, + "step": 9577 + }, + { + "epoch": 2.803864168618267, + "grad_norm": 0.9979701042175293, + "learning_rate": 2.7766857342982756e-06, + "loss": 0.599, + "step": 9578 + }, + { + "epoch": 2.8041569086651053, + "grad_norm": 1.010117530822754, + "learning_rate": 2.7763030220301317e-06, + "loss": 0.6071, + "step": 9579 + }, + { + "epoch": 2.8044496487119437, + "grad_norm": 1.046674132347107, + "learning_rate": 2.7759203032066456e-06, + "loss": 0.557, + "step": 9580 + }, + { + "epoch": 2.804742388758782, + "grad_norm": 1.0182186365127563, + "learning_rate": 2.775537577836897e-06, + "loss": 0.5914, + "step": 9581 + }, + { + "epoch": 2.8050351288056206, + "grad_norm": 1.0078990459442139, + "learning_rate": 2.775154845929967e-06, + "loss": 0.5342, + "step": 9582 + }, + { + "epoch": 2.805327868852459, + "grad_norm": 1.0407699346542358, + "learning_rate": 2.774772107494935e-06, + "loss": 0.5509, + "step": 9583 + }, + { + "epoch": 2.8056206088992974, + "grad_norm": 1.014309048652649, + "learning_rate": 2.774389362540882e-06, + "loss": 0.5385, + "step": 9584 + }, + { + "epoch": 2.805913348946136, + "grad_norm": 0.9574797749519348, + "learning_rate": 2.7740066110768886e-06, + "loss": 0.574, + "step": 9585 + }, + { + "epoch": 2.8062060889929743, + "grad_norm": 0.9355307221412659, + "learning_rate": 2.773623853112036e-06, + "loss": 0.5713, + "step": 9586 + }, + { + "epoch": 2.8064988290398127, + "grad_norm": 0.9434797167778015, + "learning_rate": 2.7732410886554052e-06, + "loss": 0.5478, + "step": 9587 + }, + { + "epoch": 2.806791569086651, + "grad_norm": 1.0274487733840942, + "learning_rate": 2.772858317716076e-06, + "loss": 0.5829, + "step": 9588 + }, + { + "epoch": 2.8070843091334896, + "grad_norm": 1.0055100917816162, + "learning_rate": 2.772475540303131e-06, + "loss": 0.566, + "step": 9589 + }, + { + "epoch": 2.807377049180328, + "grad_norm": 1.012559413909912, + "learning_rate": 2.7720927564256524e-06, + "loss": 0.5645, + "step": 9590 + }, + { + "epoch": 2.807669789227166, + "grad_norm": 0.9936415553092957, + "learning_rate": 2.77170996609272e-06, + "loss": 0.5733, + "step": 9591 + }, + { + "epoch": 2.807962529274005, + "grad_norm": 0.9923223853111267, + "learning_rate": 2.771327169313417e-06, + "loss": 0.5876, + "step": 9592 + }, + { + "epoch": 2.808255269320843, + "grad_norm": 1.0130362510681152, + "learning_rate": 2.7709443660968246e-06, + "loss": 0.6015, + "step": 9593 + }, + { + "epoch": 2.8085480093676813, + "grad_norm": 0.9698089957237244, + "learning_rate": 2.7705615564520256e-06, + "loss": 0.5356, + "step": 9594 + }, + { + "epoch": 2.8088407494145198, + "grad_norm": 1.0277875661849976, + "learning_rate": 2.770178740388101e-06, + "loss": 0.5679, + "step": 9595 + }, + { + "epoch": 2.809133489461358, + "grad_norm": 1.034165620803833, + "learning_rate": 2.7697959179141337e-06, + "loss": 0.5759, + "step": 9596 + }, + { + "epoch": 2.8094262295081966, + "grad_norm": 0.9874408841133118, + "learning_rate": 2.7694130890392075e-06, + "loss": 0.5438, + "step": 9597 + }, + { + "epoch": 2.809718969555035, + "grad_norm": 1.0270856618881226, + "learning_rate": 2.769030253772403e-06, + "loss": 0.5832, + "step": 9598 + }, + { + "epoch": 2.8100117096018735, + "grad_norm": 1.0401335954666138, + "learning_rate": 2.768647412122804e-06, + "loss": 0.5897, + "step": 9599 + }, + { + "epoch": 2.810304449648712, + "grad_norm": 0.9912053346633911, + "learning_rate": 2.7682645640994938e-06, + "loss": 0.5498, + "step": 9600 + }, + { + "epoch": 2.8105971896955504, + "grad_norm": 0.9625949263572693, + "learning_rate": 2.767881709711555e-06, + "loss": 0.5755, + "step": 9601 + }, + { + "epoch": 2.810889929742389, + "grad_norm": 0.9606577754020691, + "learning_rate": 2.767498848968072e-06, + "loss": 0.5676, + "step": 9602 + }, + { + "epoch": 2.8111826697892273, + "grad_norm": 1.005812644958496, + "learning_rate": 2.7671159818781267e-06, + "loss": 0.5609, + "step": 9603 + }, + { + "epoch": 2.8114754098360657, + "grad_norm": 0.9562460780143738, + "learning_rate": 2.766733108450803e-06, + "loss": 0.5929, + "step": 9604 + }, + { + "epoch": 2.811768149882904, + "grad_norm": 1.0383306741714478, + "learning_rate": 2.766350228695186e-06, + "loss": 0.5538, + "step": 9605 + }, + { + "epoch": 2.812060889929742, + "grad_norm": 0.9647553563117981, + "learning_rate": 2.7659673426203582e-06, + "loss": 0.5321, + "step": 9606 + }, + { + "epoch": 2.812353629976581, + "grad_norm": 1.005774736404419, + "learning_rate": 2.7655844502354044e-06, + "loss": 0.5592, + "step": 9607 + }, + { + "epoch": 2.812646370023419, + "grad_norm": 0.9983596205711365, + "learning_rate": 2.765201551549408e-06, + "loss": 0.626, + "step": 9608 + }, + { + "epoch": 2.812939110070258, + "grad_norm": 1.0307517051696777, + "learning_rate": 2.7648186465714544e-06, + "loss": 0.5706, + "step": 9609 + }, + { + "epoch": 2.813231850117096, + "grad_norm": 0.968100905418396, + "learning_rate": 2.764435735310627e-06, + "loss": 0.5757, + "step": 9610 + }, + { + "epoch": 2.8135245901639343, + "grad_norm": 1.0171308517456055, + "learning_rate": 2.7640528177760115e-06, + "loss": 0.5903, + "step": 9611 + }, + { + "epoch": 2.8138173302107727, + "grad_norm": 0.991359531879425, + "learning_rate": 2.763669893976692e-06, + "loss": 0.5422, + "step": 9612 + }, + { + "epoch": 2.814110070257611, + "grad_norm": 1.003724455833435, + "learning_rate": 2.7632869639217537e-06, + "loss": 0.5976, + "step": 9613 + }, + { + "epoch": 2.8144028103044496, + "grad_norm": 0.9284337759017944, + "learning_rate": 2.762904027620281e-06, + "loss": 0.5591, + "step": 9614 + }, + { + "epoch": 2.814695550351288, + "grad_norm": 0.9323634505271912, + "learning_rate": 2.7625210850813604e-06, + "loss": 0.5497, + "step": 9615 + }, + { + "epoch": 2.8149882903981265, + "grad_norm": 0.9594340920448303, + "learning_rate": 2.762138136314076e-06, + "loss": 0.5955, + "step": 9616 + }, + { + "epoch": 2.815281030444965, + "grad_norm": 1.0066087245941162, + "learning_rate": 2.761755181327515e-06, + "loss": 0.5856, + "step": 9617 + }, + { + "epoch": 2.8155737704918034, + "grad_norm": 0.9519250392913818, + "learning_rate": 2.7613722201307615e-06, + "loss": 0.5789, + "step": 9618 + }, + { + "epoch": 2.815866510538642, + "grad_norm": 0.930357038974762, + "learning_rate": 2.7609892527329015e-06, + "loss": 0.5739, + "step": 9619 + }, + { + "epoch": 2.8161592505854802, + "grad_norm": 0.9429950714111328, + "learning_rate": 2.760606279143021e-06, + "loss": 0.5681, + "step": 9620 + }, + { + "epoch": 2.8164519906323187, + "grad_norm": 0.9408476948738098, + "learning_rate": 2.7602232993702074e-06, + "loss": 0.5503, + "step": 9621 + }, + { + "epoch": 2.816744730679157, + "grad_norm": 0.956651508808136, + "learning_rate": 2.759840313423545e-06, + "loss": 0.5427, + "step": 9622 + }, + { + "epoch": 2.817037470725995, + "grad_norm": 0.9880919456481934, + "learning_rate": 2.7594573213121224e-06, + "loss": 0.5344, + "step": 9623 + }, + { + "epoch": 2.817330210772834, + "grad_norm": 0.9622296094894409, + "learning_rate": 2.7590743230450246e-06, + "loss": 0.5431, + "step": 9624 + }, + { + "epoch": 2.817622950819672, + "grad_norm": 1.042024850845337, + "learning_rate": 2.758691318631338e-06, + "loss": 0.5705, + "step": 9625 + }, + { + "epoch": 2.8179156908665104, + "grad_norm": 1.0329926013946533, + "learning_rate": 2.758308308080151e-06, + "loss": 0.5939, + "step": 9626 + }, + { + "epoch": 2.818208430913349, + "grad_norm": 1.0321455001831055, + "learning_rate": 2.7579252914005494e-06, + "loss": 0.5705, + "step": 9627 + }, + { + "epoch": 2.8185011709601873, + "grad_norm": 1.0098841190338135, + "learning_rate": 2.757542268601621e-06, + "loss": 0.5431, + "step": 9628 + }, + { + "epoch": 2.8187939110070257, + "grad_norm": 0.9536676406860352, + "learning_rate": 2.7571592396924525e-06, + "loss": 0.5185, + "step": 9629 + }, + { + "epoch": 2.819086651053864, + "grad_norm": 0.9970709085464478, + "learning_rate": 2.7567762046821322e-06, + "loss": 0.5593, + "step": 9630 + }, + { + "epoch": 2.8193793911007026, + "grad_norm": 0.9817735552787781, + "learning_rate": 2.7563931635797465e-06, + "loss": 0.5815, + "step": 9631 + }, + { + "epoch": 2.819672131147541, + "grad_norm": 0.977287232875824, + "learning_rate": 2.756010116394384e-06, + "loss": 0.5415, + "step": 9632 + }, + { + "epoch": 2.8199648711943794, + "grad_norm": 0.9331138134002686, + "learning_rate": 2.755627063135132e-06, + "loss": 0.5129, + "step": 9633 + }, + { + "epoch": 2.820257611241218, + "grad_norm": 1.0171006917953491, + "learning_rate": 2.7552440038110793e-06, + "loss": 0.5841, + "step": 9634 + }, + { + "epoch": 2.8205503512880563, + "grad_norm": 1.1429412364959717, + "learning_rate": 2.7548609384313135e-06, + "loss": 0.5545, + "step": 9635 + }, + { + "epoch": 2.8208430913348947, + "grad_norm": 0.9853453636169434, + "learning_rate": 2.754477867004922e-06, + "loss": 0.5648, + "step": 9636 + }, + { + "epoch": 2.821135831381733, + "grad_norm": 0.9474877715110779, + "learning_rate": 2.7540947895409954e-06, + "loss": 0.5769, + "step": 9637 + }, + { + "epoch": 2.821428571428571, + "grad_norm": 1.0095223188400269, + "learning_rate": 2.753711706048621e-06, + "loss": 0.5774, + "step": 9638 + }, + { + "epoch": 2.82172131147541, + "grad_norm": 0.9853947758674622, + "learning_rate": 2.753328616536887e-06, + "loss": 0.5196, + "step": 9639 + }, + { + "epoch": 2.822014051522248, + "grad_norm": 1.0689109563827515, + "learning_rate": 2.752945521014883e-06, + "loss": 0.574, + "step": 9640 + }, + { + "epoch": 2.822306791569087, + "grad_norm": 0.9906177520751953, + "learning_rate": 2.7525624194916982e-06, + "loss": 0.5744, + "step": 9641 + }, + { + "epoch": 2.822599531615925, + "grad_norm": 0.9631854295730591, + "learning_rate": 2.752179311976421e-06, + "loss": 0.5259, + "step": 9642 + }, + { + "epoch": 2.8228922716627634, + "grad_norm": 1.038849949836731, + "learning_rate": 2.751796198478142e-06, + "loss": 0.5591, + "step": 9643 + }, + { + "epoch": 2.823185011709602, + "grad_norm": 1.0225108861923218, + "learning_rate": 2.7514130790059497e-06, + "loss": 0.5918, + "step": 9644 + }, + { + "epoch": 2.8234777517564402, + "grad_norm": 1.026627540588379, + "learning_rate": 2.7510299535689335e-06, + "loss": 0.5785, + "step": 9645 + }, + { + "epoch": 2.8237704918032787, + "grad_norm": 0.9789816737174988, + "learning_rate": 2.7506468221761826e-06, + "loss": 0.5756, + "step": 9646 + }, + { + "epoch": 2.824063231850117, + "grad_norm": 1.0064470767974854, + "learning_rate": 2.750263684836789e-06, + "loss": 0.5808, + "step": 9647 + }, + { + "epoch": 2.8243559718969555, + "grad_norm": 1.0004035234451294, + "learning_rate": 2.749880541559841e-06, + "loss": 0.5831, + "step": 9648 + }, + { + "epoch": 2.824648711943794, + "grad_norm": 1.0074292421340942, + "learning_rate": 2.74949739235443e-06, + "loss": 0.5746, + "step": 9649 + }, + { + "epoch": 2.8249414519906324, + "grad_norm": 0.9613202810287476, + "learning_rate": 2.749114237229644e-06, + "loss": 0.5729, + "step": 9650 + }, + { + "epoch": 2.825234192037471, + "grad_norm": 0.9460597634315491, + "learning_rate": 2.748731076194575e-06, + "loss": 0.5771, + "step": 9651 + }, + { + "epoch": 2.8255269320843093, + "grad_norm": 1.0288726091384888, + "learning_rate": 2.7483479092583138e-06, + "loss": 0.5703, + "step": 9652 + }, + { + "epoch": 2.8258196721311473, + "grad_norm": 1.0043730735778809, + "learning_rate": 2.747964736429951e-06, + "loss": 0.5506, + "step": 9653 + }, + { + "epoch": 2.826112412177986, + "grad_norm": 0.9536246657371521, + "learning_rate": 2.747581557718577e-06, + "loss": 0.5237, + "step": 9654 + }, + { + "epoch": 2.826405152224824, + "grad_norm": 1.0039262771606445, + "learning_rate": 2.747198373133283e-06, + "loss": 0.5658, + "step": 9655 + }, + { + "epoch": 2.826697892271663, + "grad_norm": 0.9626429677009583, + "learning_rate": 2.7468151826831595e-06, + "loss": 0.5311, + "step": 9656 + }, + { + "epoch": 2.826990632318501, + "grad_norm": 0.963931679725647, + "learning_rate": 2.746431986377299e-06, + "loss": 0.5842, + "step": 9657 + }, + { + "epoch": 2.8272833723653394, + "grad_norm": 0.972872793674469, + "learning_rate": 2.7460487842247924e-06, + "loss": 0.5483, + "step": 9658 + }, + { + "epoch": 2.827576112412178, + "grad_norm": 1.0072828531265259, + "learning_rate": 2.7456655762347305e-06, + "loss": 0.5742, + "step": 9659 + }, + { + "epoch": 2.8278688524590163, + "grad_norm": 0.9883870482444763, + "learning_rate": 2.745282362416206e-06, + "loss": 0.5667, + "step": 9660 + }, + { + "epoch": 2.8281615925058547, + "grad_norm": 0.9894766807556152, + "learning_rate": 2.74489914277831e-06, + "loss": 0.5806, + "step": 9661 + }, + { + "epoch": 2.828454332552693, + "grad_norm": 0.973296046257019, + "learning_rate": 2.744515917330135e-06, + "loss": 0.5301, + "step": 9662 + }, + { + "epoch": 2.8287470725995316, + "grad_norm": 0.9652460217475891, + "learning_rate": 2.744132686080774e-06, + "loss": 0.5343, + "step": 9663 + }, + { + "epoch": 2.82903981264637, + "grad_norm": 1.0130680799484253, + "learning_rate": 2.743749449039317e-06, + "loss": 0.5862, + "step": 9664 + }, + { + "epoch": 2.8293325526932085, + "grad_norm": 1.013751745223999, + "learning_rate": 2.743366206214857e-06, + "loss": 0.5866, + "step": 9665 + }, + { + "epoch": 2.829625292740047, + "grad_norm": 0.980457603931427, + "learning_rate": 2.742982957616488e-06, + "loss": 0.575, + "step": 9666 + }, + { + "epoch": 2.8299180327868854, + "grad_norm": 0.9774943590164185, + "learning_rate": 2.7425997032533015e-06, + "loss": 0.5976, + "step": 9667 + }, + { + "epoch": 2.830210772833724, + "grad_norm": 0.9826992750167847, + "learning_rate": 2.74221644313439e-06, + "loss": 0.5715, + "step": 9668 + }, + { + "epoch": 2.8305035128805622, + "grad_norm": 1.020749807357788, + "learning_rate": 2.741833177268847e-06, + "loss": 0.5531, + "step": 9669 + }, + { + "epoch": 2.8307962529274002, + "grad_norm": 1.0123463869094849, + "learning_rate": 2.7414499056657656e-06, + "loss": 0.5484, + "step": 9670 + }, + { + "epoch": 2.831088992974239, + "grad_norm": 1.0452022552490234, + "learning_rate": 2.7410666283342387e-06, + "loss": 0.571, + "step": 9671 + }, + { + "epoch": 2.831381733021077, + "grad_norm": 0.9652131795883179, + "learning_rate": 2.740683345283359e-06, + "loss": 0.5615, + "step": 9672 + }, + { + "epoch": 2.8316744730679155, + "grad_norm": 0.9638946056365967, + "learning_rate": 2.740300056522222e-06, + "loss": 0.5414, + "step": 9673 + }, + { + "epoch": 2.831967213114754, + "grad_norm": 0.9580747485160828, + "learning_rate": 2.7399167620599194e-06, + "loss": 0.5325, + "step": 9674 + }, + { + "epoch": 2.8322599531615924, + "grad_norm": 0.9697766900062561, + "learning_rate": 2.739533461905545e-06, + "loss": 0.5174, + "step": 9675 + }, + { + "epoch": 2.832552693208431, + "grad_norm": 0.9825441241264343, + "learning_rate": 2.7391501560681938e-06, + "loss": 0.5483, + "step": 9676 + }, + { + "epoch": 2.8328454332552693, + "grad_norm": 0.9520642161369324, + "learning_rate": 2.738766844556959e-06, + "loss": 0.5772, + "step": 9677 + }, + { + "epoch": 2.8331381733021077, + "grad_norm": 0.9826327562332153, + "learning_rate": 2.7383835273809353e-06, + "loss": 0.5764, + "step": 9678 + }, + { + "epoch": 2.833430913348946, + "grad_norm": 0.9889461994171143, + "learning_rate": 2.738000204549216e-06, + "loss": 0.5655, + "step": 9679 + }, + { + "epoch": 2.8337236533957846, + "grad_norm": 0.9477822780609131, + "learning_rate": 2.7376168760708967e-06, + "loss": 0.492, + "step": 9680 + }, + { + "epoch": 2.834016393442623, + "grad_norm": 1.0146512985229492, + "learning_rate": 2.7372335419550706e-06, + "loss": 0.562, + "step": 9681 + }, + { + "epoch": 2.8343091334894615, + "grad_norm": 0.9389151334762573, + "learning_rate": 2.736850202210834e-06, + "loss": 0.5098, + "step": 9682 + }, + { + "epoch": 2.8346018735363, + "grad_norm": 0.961119532585144, + "learning_rate": 2.7364668568472803e-06, + "loss": 0.5636, + "step": 9683 + }, + { + "epoch": 2.8348946135831383, + "grad_norm": 1.017683982849121, + "learning_rate": 2.736083505873506e-06, + "loss": 0.5702, + "step": 9684 + }, + { + "epoch": 2.8351873536299763, + "grad_norm": 1.008049488067627, + "learning_rate": 2.735700149298604e-06, + "loss": 0.547, + "step": 9685 + }, + { + "epoch": 2.835480093676815, + "grad_norm": 1.0007820129394531, + "learning_rate": 2.735316787131671e-06, + "loss": 0.6111, + "step": 9686 + }, + { + "epoch": 2.835772833723653, + "grad_norm": 0.9997431635856628, + "learning_rate": 2.734933419381802e-06, + "loss": 0.5732, + "step": 9687 + }, + { + "epoch": 2.836065573770492, + "grad_norm": 0.9631885290145874, + "learning_rate": 2.734550046058093e-06, + "loss": 0.5789, + "step": 9688 + }, + { + "epoch": 2.83635831381733, + "grad_norm": 1.0048521757125854, + "learning_rate": 2.734166667169638e-06, + "loss": 0.538, + "step": 9689 + }, + { + "epoch": 2.8366510538641685, + "grad_norm": 1.3064653873443604, + "learning_rate": 2.7337832827255347e-06, + "loss": 0.5372, + "step": 9690 + }, + { + "epoch": 2.836943793911007, + "grad_norm": 0.9367465376853943, + "learning_rate": 2.7333998927348777e-06, + "loss": 0.4984, + "step": 9691 + }, + { + "epoch": 2.8372365339578454, + "grad_norm": 0.9837944507598877, + "learning_rate": 2.733016497206763e-06, + "loss": 0.5445, + "step": 9692 + }, + { + "epoch": 2.837529274004684, + "grad_norm": 1.0032575130462646, + "learning_rate": 2.7326330961502883e-06, + "loss": 0.5479, + "step": 9693 + }, + { + "epoch": 2.8378220140515222, + "grad_norm": 0.9862685799598694, + "learning_rate": 2.732249689574548e-06, + "loss": 0.6059, + "step": 9694 + }, + { + "epoch": 2.8381147540983607, + "grad_norm": 1.0275819301605225, + "learning_rate": 2.731866277488639e-06, + "loss": 0.5917, + "step": 9695 + }, + { + "epoch": 2.838407494145199, + "grad_norm": 1.0462684631347656, + "learning_rate": 2.7314828599016583e-06, + "loss": 0.5664, + "step": 9696 + }, + { + "epoch": 2.8387002341920375, + "grad_norm": 1.0090311765670776, + "learning_rate": 2.7310994368227024e-06, + "loss": 0.5693, + "step": 9697 + }, + { + "epoch": 2.838992974238876, + "grad_norm": 0.9910580515861511, + "learning_rate": 2.730716008260868e-06, + "loss": 0.558, + "step": 9698 + }, + { + "epoch": 2.8392857142857144, + "grad_norm": 0.9676010608673096, + "learning_rate": 2.7303325742252516e-06, + "loss": 0.5209, + "step": 9699 + }, + { + "epoch": 2.839578454332553, + "grad_norm": 0.9530277848243713, + "learning_rate": 2.7299491347249506e-06, + "loss": 0.5496, + "step": 9700 + }, + { + "epoch": 2.8398711943793913, + "grad_norm": 0.9594235420227051, + "learning_rate": 2.729565689769063e-06, + "loss": 0.5656, + "step": 9701 + }, + { + "epoch": 2.8401639344262293, + "grad_norm": 1.0399049520492554, + "learning_rate": 2.729182239366685e-06, + "loss": 0.5523, + "step": 9702 + }, + { + "epoch": 2.840456674473068, + "grad_norm": 0.9998798966407776, + "learning_rate": 2.7287987835269137e-06, + "loss": 0.5847, + "step": 9703 + }, + { + "epoch": 2.840749414519906, + "grad_norm": 0.9349467754364014, + "learning_rate": 2.7284153222588485e-06, + "loss": 0.5409, + "step": 9704 + }, + { + "epoch": 2.8410421545667446, + "grad_norm": 1.0203386545181274, + "learning_rate": 2.7280318555715846e-06, + "loss": 0.5714, + "step": 9705 + }, + { + "epoch": 2.841334894613583, + "grad_norm": 0.9881662130355835, + "learning_rate": 2.7276483834742218e-06, + "loss": 0.5856, + "step": 9706 + }, + { + "epoch": 2.8416276346604215, + "grad_norm": 1.0067416429519653, + "learning_rate": 2.7272649059758567e-06, + "loss": 0.5823, + "step": 9707 + }, + { + "epoch": 2.84192037470726, + "grad_norm": 1.011810064315796, + "learning_rate": 2.726881423085588e-06, + "loss": 0.5838, + "step": 9708 + }, + { + "epoch": 2.8422131147540983, + "grad_norm": 1.0297894477844238, + "learning_rate": 2.7264979348125155e-06, + "loss": 0.5567, + "step": 9709 + }, + { + "epoch": 2.8425058548009368, + "grad_norm": 0.9843021035194397, + "learning_rate": 2.726114441165735e-06, + "loss": 0.5639, + "step": 9710 + }, + { + "epoch": 2.842798594847775, + "grad_norm": 0.989463746547699, + "learning_rate": 2.7257309421543453e-06, + "loss": 0.5759, + "step": 9711 + }, + { + "epoch": 2.8430913348946136, + "grad_norm": 1.012300968170166, + "learning_rate": 2.7253474377874463e-06, + "loss": 0.5602, + "step": 9712 + }, + { + "epoch": 2.843384074941452, + "grad_norm": 1.0314866304397583, + "learning_rate": 2.724963928074136e-06, + "loss": 0.5693, + "step": 9713 + }, + { + "epoch": 2.8436768149882905, + "grad_norm": 0.9413531422615051, + "learning_rate": 2.7245804130235138e-06, + "loss": 0.6117, + "step": 9714 + }, + { + "epoch": 2.843969555035129, + "grad_norm": 0.9351473450660706, + "learning_rate": 2.724196892644677e-06, + "loss": 0.5234, + "step": 9715 + }, + { + "epoch": 2.8442622950819674, + "grad_norm": 1.0261683464050293, + "learning_rate": 2.7238133669467267e-06, + "loss": 0.5809, + "step": 9716 + }, + { + "epoch": 2.8445550351288054, + "grad_norm": 0.9738840460777283, + "learning_rate": 2.7234298359387606e-06, + "loss": 0.5674, + "step": 9717 + }, + { + "epoch": 2.8448477751756442, + "grad_norm": 1.0150110721588135, + "learning_rate": 2.7230462996298795e-06, + "loss": 0.5231, + "step": 9718 + }, + { + "epoch": 2.8451405152224822, + "grad_norm": 0.9898948669433594, + "learning_rate": 2.722662758029182e-06, + "loss": 0.5835, + "step": 9719 + }, + { + "epoch": 2.845433255269321, + "grad_norm": 0.950889527797699, + "learning_rate": 2.722279211145768e-06, + "loss": 0.5724, + "step": 9720 + }, + { + "epoch": 2.845725995316159, + "grad_norm": 0.9517435431480408, + "learning_rate": 2.7218956589887363e-06, + "loss": 0.5758, + "step": 9721 + }, + { + "epoch": 2.8460187353629975, + "grad_norm": 1.0110315084457397, + "learning_rate": 2.7215121015671875e-06, + "loss": 0.5642, + "step": 9722 + }, + { + "epoch": 2.846311475409836, + "grad_norm": 0.9413420557975769, + "learning_rate": 2.7211285388902215e-06, + "loss": 0.5658, + "step": 9723 + }, + { + "epoch": 2.8466042154566744, + "grad_norm": 0.9686469435691833, + "learning_rate": 2.7207449709669393e-06, + "loss": 0.5458, + "step": 9724 + }, + { + "epoch": 2.846896955503513, + "grad_norm": 0.991482138633728, + "learning_rate": 2.720361397806439e-06, + "loss": 0.5846, + "step": 9725 + }, + { + "epoch": 2.8471896955503513, + "grad_norm": 0.9852102398872375, + "learning_rate": 2.7199778194178226e-06, + "loss": 0.5465, + "step": 9726 + }, + { + "epoch": 2.8474824355971897, + "grad_norm": 0.992392897605896, + "learning_rate": 2.7195942358101905e-06, + "loss": 0.5491, + "step": 9727 + }, + { + "epoch": 2.847775175644028, + "grad_norm": 0.9845281839370728, + "learning_rate": 2.7192106469926426e-06, + "loss": 0.5892, + "step": 9728 + }, + { + "epoch": 2.8480679156908666, + "grad_norm": 0.9483802914619446, + "learning_rate": 2.718827052974281e-06, + "loss": 0.5437, + "step": 9729 + }, + { + "epoch": 2.848360655737705, + "grad_norm": 1.0167860984802246, + "learning_rate": 2.7184434537642046e-06, + "loss": 0.5836, + "step": 9730 + }, + { + "epoch": 2.8486533957845435, + "grad_norm": 0.9463897347450256, + "learning_rate": 2.7180598493715156e-06, + "loss": 0.5652, + "step": 9731 + }, + { + "epoch": 2.848946135831382, + "grad_norm": 0.9755455851554871, + "learning_rate": 2.717676239805314e-06, + "loss": 0.57, + "step": 9732 + }, + { + "epoch": 2.8492388758782203, + "grad_norm": 0.987817645072937, + "learning_rate": 2.7172926250747024e-06, + "loss": 0.5696, + "step": 9733 + }, + { + "epoch": 2.8495316159250583, + "grad_norm": 0.9576829075813293, + "learning_rate": 2.716909005188782e-06, + "loss": 0.5438, + "step": 9734 + }, + { + "epoch": 2.849824355971897, + "grad_norm": 1.0438566207885742, + "learning_rate": 2.716525380156653e-06, + "loss": 0.5784, + "step": 9735 + }, + { + "epoch": 2.850117096018735, + "grad_norm": 1.0146479606628418, + "learning_rate": 2.7161417499874183e-06, + "loss": 0.5629, + "step": 9736 + }, + { + "epoch": 2.8504098360655736, + "grad_norm": 1.0118911266326904, + "learning_rate": 2.715758114690179e-06, + "loss": 0.5942, + "step": 9737 + }, + { + "epoch": 2.850702576112412, + "grad_norm": 0.9698836207389832, + "learning_rate": 2.7153744742740366e-06, + "loss": 0.5207, + "step": 9738 + }, + { + "epoch": 2.8509953161592505, + "grad_norm": 0.9909166693687439, + "learning_rate": 2.714990828748094e-06, + "loss": 0.5606, + "step": 9739 + }, + { + "epoch": 2.851288056206089, + "grad_norm": 1.0275561809539795, + "learning_rate": 2.714607178121452e-06, + "loss": 0.5772, + "step": 9740 + }, + { + "epoch": 2.8515807962529274, + "grad_norm": 0.939954400062561, + "learning_rate": 2.714223522403214e-06, + "loss": 0.5238, + "step": 9741 + }, + { + "epoch": 2.851873536299766, + "grad_norm": 0.994583010673523, + "learning_rate": 2.7138398616024812e-06, + "loss": 0.5542, + "step": 9742 + }, + { + "epoch": 2.8521662763466042, + "grad_norm": 1.0163180828094482, + "learning_rate": 2.7134561957283572e-06, + "loss": 0.5596, + "step": 9743 + }, + { + "epoch": 2.8524590163934427, + "grad_norm": 0.9763860106468201, + "learning_rate": 2.7130725247899444e-06, + "loss": 0.5556, + "step": 9744 + }, + { + "epoch": 2.852751756440281, + "grad_norm": 0.9922687411308289, + "learning_rate": 2.7126888487963436e-06, + "loss": 0.5605, + "step": 9745 + }, + { + "epoch": 2.8530444964871196, + "grad_norm": 0.9893204569816589, + "learning_rate": 2.7123051677566597e-06, + "loss": 0.5888, + "step": 9746 + }, + { + "epoch": 2.853337236533958, + "grad_norm": 0.9366058111190796, + "learning_rate": 2.711921481679995e-06, + "loss": 0.5106, + "step": 9747 + }, + { + "epoch": 2.8536299765807964, + "grad_norm": 0.9724178910255432, + "learning_rate": 2.7115377905754524e-06, + "loss": 0.5818, + "step": 9748 + }, + { + "epoch": 2.8539227166276344, + "grad_norm": 0.9769547581672668, + "learning_rate": 2.7111540944521353e-06, + "loss": 0.5647, + "step": 9749 + }, + { + "epoch": 2.8542154566744733, + "grad_norm": 1.0170665979385376, + "learning_rate": 2.7107703933191464e-06, + "loss": 0.6026, + "step": 9750 + }, + { + "epoch": 2.8545081967213113, + "grad_norm": 0.9894102811813354, + "learning_rate": 2.7103866871855896e-06, + "loss": 0.56, + "step": 9751 + }, + { + "epoch": 2.8548009367681497, + "grad_norm": 0.9483039379119873, + "learning_rate": 2.710002976060568e-06, + "loss": 0.5537, + "step": 9752 + }, + { + "epoch": 2.855093676814988, + "grad_norm": 1.0400643348693848, + "learning_rate": 2.7096192599531856e-06, + "loss": 0.5306, + "step": 9753 + }, + { + "epoch": 2.8553864168618266, + "grad_norm": 1.0179321765899658, + "learning_rate": 2.709235538872546e-06, + "loss": 0.5841, + "step": 9754 + }, + { + "epoch": 2.855679156908665, + "grad_norm": 1.0244969129562378, + "learning_rate": 2.708851812827753e-06, + "loss": 0.5271, + "step": 9755 + }, + { + "epoch": 2.8559718969555035, + "grad_norm": 0.9699947237968445, + "learning_rate": 2.708468081827911e-06, + "loss": 0.5565, + "step": 9756 + }, + { + "epoch": 2.856264637002342, + "grad_norm": 0.9824523329734802, + "learning_rate": 2.7080843458821232e-06, + "loss": 0.5795, + "step": 9757 + }, + { + "epoch": 2.8565573770491803, + "grad_norm": 1.0335627794265747, + "learning_rate": 2.7077006049994942e-06, + "loss": 0.6125, + "step": 9758 + }, + { + "epoch": 2.8568501170960188, + "grad_norm": 1.0217262506484985, + "learning_rate": 2.707316859189129e-06, + "loss": 0.6162, + "step": 9759 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 1.0156996250152588, + "learning_rate": 2.7069331084601313e-06, + "loss": 0.5953, + "step": 9760 + }, + { + "epoch": 2.8574355971896956, + "grad_norm": 0.9765334725379944, + "learning_rate": 2.706549352821605e-06, + "loss": 0.5562, + "step": 9761 + }, + { + "epoch": 2.857728337236534, + "grad_norm": 1.0143921375274658, + "learning_rate": 2.7061655922826567e-06, + "loss": 0.5482, + "step": 9762 + }, + { + "epoch": 2.8580210772833725, + "grad_norm": 1.0191739797592163, + "learning_rate": 2.7057818268523895e-06, + "loss": 0.5552, + "step": 9763 + }, + { + "epoch": 2.8583138173302105, + "grad_norm": 1.0163123607635498, + "learning_rate": 2.7053980565399095e-06, + "loss": 0.6195, + "step": 9764 + }, + { + "epoch": 2.8586065573770494, + "grad_norm": 0.987171471118927, + "learning_rate": 2.705014281354321e-06, + "loss": 0.5868, + "step": 9765 + }, + { + "epoch": 2.8588992974238874, + "grad_norm": 0.9934576153755188, + "learning_rate": 2.7046305013047293e-06, + "loss": 0.6115, + "step": 9766 + }, + { + "epoch": 2.8591920374707263, + "grad_norm": 0.9686475992202759, + "learning_rate": 2.704246716400239e-06, + "loss": 0.5864, + "step": 9767 + }, + { + "epoch": 2.8594847775175642, + "grad_norm": 1.0076932907104492, + "learning_rate": 2.703862926649956e-06, + "loss": 0.5655, + "step": 9768 + }, + { + "epoch": 2.8597775175644027, + "grad_norm": 1.0092746019363403, + "learning_rate": 2.703479132062987e-06, + "loss": 0.587, + "step": 9769 + }, + { + "epoch": 2.860070257611241, + "grad_norm": 0.9555947780609131, + "learning_rate": 2.703095332648436e-06, + "loss": 0.5375, + "step": 9770 + }, + { + "epoch": 2.8603629976580796, + "grad_norm": 1.0386419296264648, + "learning_rate": 2.7027115284154094e-06, + "loss": 0.6021, + "step": 9771 + }, + { + "epoch": 2.860655737704918, + "grad_norm": 0.9498079419136047, + "learning_rate": 2.7023277193730128e-06, + "loss": 0.5323, + "step": 9772 + }, + { + "epoch": 2.8609484777517564, + "grad_norm": 0.998831033706665, + "learning_rate": 2.701943905530352e-06, + "loss": 0.5585, + "step": 9773 + }, + { + "epoch": 2.861241217798595, + "grad_norm": 0.9911845922470093, + "learning_rate": 2.7015600868965335e-06, + "loss": 0.5377, + "step": 9774 + }, + { + "epoch": 2.8615339578454333, + "grad_norm": 0.9858494997024536, + "learning_rate": 2.7011762634806624e-06, + "loss": 0.6003, + "step": 9775 + }, + { + "epoch": 2.8618266978922717, + "grad_norm": 1.020404577255249, + "learning_rate": 2.7007924352918467e-06, + "loss": 0.5658, + "step": 9776 + }, + { + "epoch": 2.86211943793911, + "grad_norm": 1.063209056854248, + "learning_rate": 2.700408602339191e-06, + "loss": 0.5423, + "step": 9777 + }, + { + "epoch": 2.8624121779859486, + "grad_norm": 0.9585698843002319, + "learning_rate": 2.7000247646318036e-06, + "loss": 0.5762, + "step": 9778 + }, + { + "epoch": 2.862704918032787, + "grad_norm": 0.9831405282020569, + "learning_rate": 2.69964092217879e-06, + "loss": 0.5753, + "step": 9779 + }, + { + "epoch": 2.8629976580796255, + "grad_norm": 1.0189769268035889, + "learning_rate": 2.699257074989257e-06, + "loss": 0.5974, + "step": 9780 + }, + { + "epoch": 2.8632903981264635, + "grad_norm": 1.004219889640808, + "learning_rate": 2.698873223072312e-06, + "loss": 0.5693, + "step": 9781 + }, + { + "epoch": 2.8635831381733023, + "grad_norm": 1.0033254623413086, + "learning_rate": 2.698489366437061e-06, + "loss": 0.5506, + "step": 9782 + }, + { + "epoch": 2.8638758782201403, + "grad_norm": 1.0136319398880005, + "learning_rate": 2.698105505092612e-06, + "loss": 0.5818, + "step": 9783 + }, + { + "epoch": 2.8641686182669788, + "grad_norm": 0.9948752522468567, + "learning_rate": 2.6977216390480715e-06, + "loss": 0.5722, + "step": 9784 + }, + { + "epoch": 2.864461358313817, + "grad_norm": 0.9254619479179382, + "learning_rate": 2.6973377683125473e-06, + "loss": 0.5109, + "step": 9785 + }, + { + "epoch": 2.8647540983606556, + "grad_norm": 1.0793007612228394, + "learning_rate": 2.6969538928951468e-06, + "loss": 0.5597, + "step": 9786 + }, + { + "epoch": 2.865046838407494, + "grad_norm": 1.054333209991455, + "learning_rate": 2.696570012804977e-06, + "loss": 0.6011, + "step": 9787 + }, + { + "epoch": 2.8653395784543325, + "grad_norm": 0.9983278512954712, + "learning_rate": 2.6961861280511457e-06, + "loss": 0.5575, + "step": 9788 + }, + { + "epoch": 2.865632318501171, + "grad_norm": 1.0405802726745605, + "learning_rate": 2.6958022386427624e-06, + "loss": 0.5961, + "step": 9789 + }, + { + "epoch": 2.8659250585480094, + "grad_norm": 0.9273172616958618, + "learning_rate": 2.695418344588932e-06, + "loss": 0.5462, + "step": 9790 + }, + { + "epoch": 2.866217798594848, + "grad_norm": 0.9439367651939392, + "learning_rate": 2.695034445898764e-06, + "loss": 0.5832, + "step": 9791 + }, + { + "epoch": 2.8665105386416863, + "grad_norm": 1.092673659324646, + "learning_rate": 2.6946505425813663e-06, + "loss": 0.5219, + "step": 9792 + }, + { + "epoch": 2.8668032786885247, + "grad_norm": 0.9938117265701294, + "learning_rate": 2.6942666346458474e-06, + "loss": 0.581, + "step": 9793 + }, + { + "epoch": 2.867096018735363, + "grad_norm": 0.9605545997619629, + "learning_rate": 2.693882722101315e-06, + "loss": 0.5466, + "step": 9794 + }, + { + "epoch": 2.8673887587822016, + "grad_norm": 1.0306203365325928, + "learning_rate": 2.693498804956878e-06, + "loss": 0.5818, + "step": 9795 + }, + { + "epoch": 2.8676814988290396, + "grad_norm": 1.0248464345932007, + "learning_rate": 2.6931148832216444e-06, + "loss": 0.5845, + "step": 9796 + }, + { + "epoch": 2.8679742388758784, + "grad_norm": 0.9933881759643555, + "learning_rate": 2.692730956904723e-06, + "loss": 0.576, + "step": 9797 + }, + { + "epoch": 2.8682669789227164, + "grad_norm": 0.9763699769973755, + "learning_rate": 2.692347026015222e-06, + "loss": 0.5595, + "step": 9798 + }, + { + "epoch": 2.8685597189695553, + "grad_norm": 1.0251960754394531, + "learning_rate": 2.691963090562252e-06, + "loss": 0.545, + "step": 9799 + }, + { + "epoch": 2.8688524590163933, + "grad_norm": 1.01806640625, + "learning_rate": 2.6915791505549207e-06, + "loss": 0.6, + "step": 9800 + }, + { + "epoch": 2.8691451990632317, + "grad_norm": 0.9584134817123413, + "learning_rate": 2.6911952060023365e-06, + "loss": 0.58, + "step": 9801 + }, + { + "epoch": 2.86943793911007, + "grad_norm": 0.9749621152877808, + "learning_rate": 2.69081125691361e-06, + "loss": 0.5295, + "step": 9802 + }, + { + "epoch": 2.8697306791569086, + "grad_norm": 1.0248795747756958, + "learning_rate": 2.6904273032978495e-06, + "loss": 0.5878, + "step": 9803 + }, + { + "epoch": 2.870023419203747, + "grad_norm": 0.9857308864593506, + "learning_rate": 2.6900433451641644e-06, + "loss": 0.5621, + "step": 9804 + }, + { + "epoch": 2.8703161592505855, + "grad_norm": 1.0073063373565674, + "learning_rate": 2.689659382521665e-06, + "loss": 0.5974, + "step": 9805 + }, + { + "epoch": 2.870608899297424, + "grad_norm": 0.9669982194900513, + "learning_rate": 2.68927541537946e-06, + "loss": 0.549, + "step": 9806 + }, + { + "epoch": 2.8709016393442623, + "grad_norm": 0.9899983406066895, + "learning_rate": 2.6888914437466597e-06, + "loss": 0.5824, + "step": 9807 + }, + { + "epoch": 2.871194379391101, + "grad_norm": 0.9724266529083252, + "learning_rate": 2.6885074676323735e-06, + "loss": 0.5617, + "step": 9808 + }, + { + "epoch": 2.871487119437939, + "grad_norm": 1.0090984106063843, + "learning_rate": 2.6881234870457113e-06, + "loss": 0.5842, + "step": 9809 + }, + { + "epoch": 2.8717798594847777, + "grad_norm": 0.9587587118148804, + "learning_rate": 2.687739501995783e-06, + "loss": 0.5116, + "step": 9810 + }, + { + "epoch": 2.872072599531616, + "grad_norm": 1.0285629034042358, + "learning_rate": 2.6873555124916995e-06, + "loss": 0.5297, + "step": 9811 + }, + { + "epoch": 2.8723653395784545, + "grad_norm": 1.0207188129425049, + "learning_rate": 2.68697151854257e-06, + "loss": 0.5689, + "step": 9812 + }, + { + "epoch": 2.8726580796252925, + "grad_norm": 1.0467908382415771, + "learning_rate": 2.686587520157505e-06, + "loss": 0.5813, + "step": 9813 + }, + { + "epoch": 2.8729508196721314, + "grad_norm": 1.0312845706939697, + "learning_rate": 2.6862035173456154e-06, + "loss": 0.6019, + "step": 9814 + }, + { + "epoch": 2.8732435597189694, + "grad_norm": 1.0173486471176147, + "learning_rate": 2.685819510116012e-06, + "loss": 0.545, + "step": 9815 + }, + { + "epoch": 2.873536299765808, + "grad_norm": 1.0011918544769287, + "learning_rate": 2.6854354984778053e-06, + "loss": 0.5955, + "step": 9816 + }, + { + "epoch": 2.8738290398126463, + "grad_norm": 0.9573125243186951, + "learning_rate": 2.6850514824401047e-06, + "loss": 0.5813, + "step": 9817 + }, + { + "epoch": 2.8741217798594847, + "grad_norm": 0.9375141859054565, + "learning_rate": 2.6846674620120226e-06, + "loss": 0.5521, + "step": 9818 + }, + { + "epoch": 2.874414519906323, + "grad_norm": 1.0128872394561768, + "learning_rate": 2.6842834372026694e-06, + "loss": 0.584, + "step": 9819 + }, + { + "epoch": 2.8747072599531616, + "grad_norm": 1.0038564205169678, + "learning_rate": 2.6838994080211574e-06, + "loss": 0.5916, + "step": 9820 + }, + { + "epoch": 2.875, + "grad_norm": 0.94096440076828, + "learning_rate": 2.6835153744765952e-06, + "loss": 0.5488, + "step": 9821 + }, + { + "epoch": 2.8752927400468384, + "grad_norm": 1.0202410221099854, + "learning_rate": 2.6831313365780957e-06, + "loss": 0.5673, + "step": 9822 + }, + { + "epoch": 2.875585480093677, + "grad_norm": 0.9390025734901428, + "learning_rate": 2.6827472943347703e-06, + "loss": 0.5063, + "step": 9823 + }, + { + "epoch": 2.8758782201405153, + "grad_norm": 0.9363316297531128, + "learning_rate": 2.6823632477557305e-06, + "loss": 0.5606, + "step": 9824 + }, + { + "epoch": 2.8761709601873537, + "grad_norm": 0.9657983779907227, + "learning_rate": 2.6819791968500876e-06, + "loss": 0.5758, + "step": 9825 + }, + { + "epoch": 2.876463700234192, + "grad_norm": 0.9885620474815369, + "learning_rate": 2.6815951416269525e-06, + "loss": 0.5995, + "step": 9826 + }, + { + "epoch": 2.8767564402810306, + "grad_norm": 0.9855531454086304, + "learning_rate": 2.681211082095439e-06, + "loss": 0.6141, + "step": 9827 + }, + { + "epoch": 2.8770491803278686, + "grad_norm": 0.9767968058586121, + "learning_rate": 2.680827018264657e-06, + "loss": 0.5659, + "step": 9828 + }, + { + "epoch": 2.8773419203747075, + "grad_norm": 0.975405752658844, + "learning_rate": 2.680442950143719e-06, + "loss": 0.5581, + "step": 9829 + }, + { + "epoch": 2.8776346604215455, + "grad_norm": 1.0493026971817017, + "learning_rate": 2.680058877741738e-06, + "loss": 0.5762, + "step": 9830 + }, + { + "epoch": 2.877927400468384, + "grad_norm": 0.9897744059562683, + "learning_rate": 2.6796748010678252e-06, + "loss": 0.5836, + "step": 9831 + }, + { + "epoch": 2.8782201405152223, + "grad_norm": 0.9783439040184021, + "learning_rate": 2.679290720131093e-06, + "loss": 0.6024, + "step": 9832 + }, + { + "epoch": 2.878512880562061, + "grad_norm": 0.981504499912262, + "learning_rate": 2.6789066349406547e-06, + "loss": 0.5644, + "step": 9833 + }, + { + "epoch": 2.878805620608899, + "grad_norm": 1.0260565280914307, + "learning_rate": 2.6785225455056213e-06, + "loss": 0.5639, + "step": 9834 + }, + { + "epoch": 2.8790983606557377, + "grad_norm": 0.9829539656639099, + "learning_rate": 2.678138451835107e-06, + "loss": 0.5922, + "step": 9835 + }, + { + "epoch": 2.879391100702576, + "grad_norm": 0.9748877286911011, + "learning_rate": 2.6777543539382227e-06, + "loss": 0.5701, + "step": 9836 + }, + { + "epoch": 2.8796838407494145, + "grad_norm": 1.0615137815475464, + "learning_rate": 2.6773702518240828e-06, + "loss": 0.5431, + "step": 9837 + }, + { + "epoch": 2.879976580796253, + "grad_norm": 0.9775130748748779, + "learning_rate": 2.6769861455017997e-06, + "loss": 0.5732, + "step": 9838 + }, + { + "epoch": 2.8802693208430914, + "grad_norm": 0.9317847490310669, + "learning_rate": 2.6766020349804863e-06, + "loss": 0.5337, + "step": 9839 + }, + { + "epoch": 2.88056206088993, + "grad_norm": 0.9936427474021912, + "learning_rate": 2.6762179202692558e-06, + "loss": 0.5737, + "step": 9840 + }, + { + "epoch": 2.8808548009367683, + "grad_norm": 0.9770551323890686, + "learning_rate": 2.6758338013772206e-06, + "loss": 0.5513, + "step": 9841 + }, + { + "epoch": 2.8811475409836067, + "grad_norm": 0.9823822379112244, + "learning_rate": 2.675449678313495e-06, + "loss": 0.5589, + "step": 9842 + }, + { + "epoch": 2.8814402810304447, + "grad_norm": 0.9537256360054016, + "learning_rate": 2.6750655510871924e-06, + "loss": 0.5554, + "step": 9843 + }, + { + "epoch": 2.8817330210772836, + "grad_norm": 0.9937271475791931, + "learning_rate": 2.6746814197074263e-06, + "loss": 0.5677, + "step": 9844 + }, + { + "epoch": 2.8820257611241216, + "grad_norm": 1.000256896018982, + "learning_rate": 2.6742972841833092e-06, + "loss": 0.5818, + "step": 9845 + }, + { + "epoch": 2.8823185011709604, + "grad_norm": 1.0226863622665405, + "learning_rate": 2.6739131445239565e-06, + "loss": 0.5671, + "step": 9846 + }, + { + "epoch": 2.8826112412177984, + "grad_norm": 1.021185040473938, + "learning_rate": 2.6735290007384794e-06, + "loss": 0.5586, + "step": 9847 + }, + { + "epoch": 2.882903981264637, + "grad_norm": 0.9508678317070007, + "learning_rate": 2.6731448528359944e-06, + "loss": 0.5367, + "step": 9848 + }, + { + "epoch": 2.8831967213114753, + "grad_norm": 0.974570095539093, + "learning_rate": 2.6727607008256147e-06, + "loss": 0.5393, + "step": 9849 + }, + { + "epoch": 2.8834894613583137, + "grad_norm": 0.9661222100257874, + "learning_rate": 2.672376544716454e-06, + "loss": 0.5794, + "step": 9850 + }, + { + "epoch": 2.883782201405152, + "grad_norm": 1.0779931545257568, + "learning_rate": 2.6719923845176264e-06, + "loss": 0.6097, + "step": 9851 + }, + { + "epoch": 2.8840749414519906, + "grad_norm": 0.9608885049819946, + "learning_rate": 2.6716082202382464e-06, + "loss": 0.5614, + "step": 9852 + }, + { + "epoch": 2.884367681498829, + "grad_norm": 1.0085233449935913, + "learning_rate": 2.671224051887429e-06, + "loss": 0.5584, + "step": 9853 + }, + { + "epoch": 2.8846604215456675, + "grad_norm": 0.952269971370697, + "learning_rate": 2.6708398794742875e-06, + "loss": 0.5493, + "step": 9854 + }, + { + "epoch": 2.884953161592506, + "grad_norm": 0.9176225066184998, + "learning_rate": 2.6704557030079376e-06, + "loss": 0.5298, + "step": 9855 + }, + { + "epoch": 2.8852459016393444, + "grad_norm": 1.0003095865249634, + "learning_rate": 2.670071522497493e-06, + "loss": 0.5815, + "step": 9856 + }, + { + "epoch": 2.885538641686183, + "grad_norm": 0.982055127620697, + "learning_rate": 2.669687337952068e-06, + "loss": 0.5669, + "step": 9857 + }, + { + "epoch": 2.8858313817330212, + "grad_norm": 0.9915335774421692, + "learning_rate": 2.6693031493807797e-06, + "loss": 0.5614, + "step": 9858 + }, + { + "epoch": 2.8861241217798597, + "grad_norm": 0.9446676969528198, + "learning_rate": 2.668918956792741e-06, + "loss": 0.5594, + "step": 9859 + }, + { + "epoch": 2.8864168618266977, + "grad_norm": 1.0374022722244263, + "learning_rate": 2.668534760197068e-06, + "loss": 0.5777, + "step": 9860 + }, + { + "epoch": 2.8867096018735365, + "grad_norm": 1.0211526155471802, + "learning_rate": 2.668150559602875e-06, + "loss": 0.5671, + "step": 9861 + }, + { + "epoch": 2.8870023419203745, + "grad_norm": 1.059348464012146, + "learning_rate": 2.667766355019278e-06, + "loss": 0.5965, + "step": 9862 + }, + { + "epoch": 2.887295081967213, + "grad_norm": 1.0226088762283325, + "learning_rate": 2.667382146455392e-06, + "loss": 0.575, + "step": 9863 + }, + { + "epoch": 2.8875878220140514, + "grad_norm": 1.0004295110702515, + "learning_rate": 2.6669979339203323e-06, + "loss": 0.552, + "step": 9864 + }, + { + "epoch": 2.88788056206089, + "grad_norm": 1.0447949171066284, + "learning_rate": 2.666613717423214e-06, + "loss": 0.5557, + "step": 9865 + }, + { + "epoch": 2.8881733021077283, + "grad_norm": 1.0018258094787598, + "learning_rate": 2.6662294969731543e-06, + "loss": 0.5953, + "step": 9866 + }, + { + "epoch": 2.8884660421545667, + "grad_norm": 1.055825114250183, + "learning_rate": 2.665845272579267e-06, + "loss": 0.5665, + "step": 9867 + }, + { + "epoch": 2.888758782201405, + "grad_norm": 1.03768789768219, + "learning_rate": 2.665461044250669e-06, + "loss": 0.5456, + "step": 9868 + }, + { + "epoch": 2.8890515222482436, + "grad_norm": 0.9693235754966736, + "learning_rate": 2.6650768119964758e-06, + "loss": 0.5335, + "step": 9869 + }, + { + "epoch": 2.889344262295082, + "grad_norm": 0.9848907589912415, + "learning_rate": 2.6646925758258037e-06, + "loss": 0.5984, + "step": 9870 + }, + { + "epoch": 2.8896370023419204, + "grad_norm": 0.9643967151641846, + "learning_rate": 2.664308335747769e-06, + "loss": 0.5607, + "step": 9871 + }, + { + "epoch": 2.889929742388759, + "grad_norm": 0.9844010472297668, + "learning_rate": 2.663924091771487e-06, + "loss": 0.57, + "step": 9872 + }, + { + "epoch": 2.8902224824355973, + "grad_norm": 1.0020513534545898, + "learning_rate": 2.6635398439060743e-06, + "loss": 0.5803, + "step": 9873 + }, + { + "epoch": 2.8905152224824358, + "grad_norm": 1.0052539110183716, + "learning_rate": 2.663155592160647e-06, + "loss": 0.6018, + "step": 9874 + }, + { + "epoch": 2.8908079625292737, + "grad_norm": 0.999758780002594, + "learning_rate": 2.6627713365443224e-06, + "loss": 0.5232, + "step": 9875 + }, + { + "epoch": 2.8911007025761126, + "grad_norm": 0.9861293435096741, + "learning_rate": 2.6623870770662174e-06, + "loss": 0.5335, + "step": 9876 + }, + { + "epoch": 2.8913934426229506, + "grad_norm": 0.9275851249694824, + "learning_rate": 2.662002813735447e-06, + "loss": 0.5509, + "step": 9877 + }, + { + "epoch": 2.8916861826697895, + "grad_norm": 0.9664603471755981, + "learning_rate": 2.661618546561129e-06, + "loss": 0.5425, + "step": 9878 + }, + { + "epoch": 2.8919789227166275, + "grad_norm": 1.0227230787277222, + "learning_rate": 2.6612342755523794e-06, + "loss": 0.5921, + "step": 9879 + }, + { + "epoch": 2.892271662763466, + "grad_norm": 1.0235340595245361, + "learning_rate": 2.6608500007183162e-06, + "loss": 0.5698, + "step": 9880 + }, + { + "epoch": 2.8925644028103044, + "grad_norm": 0.9856959581375122, + "learning_rate": 2.660465722068056e-06, + "loss": 0.547, + "step": 9881 + }, + { + "epoch": 2.892857142857143, + "grad_norm": 0.9606420993804932, + "learning_rate": 2.660081439610715e-06, + "loss": 0.5547, + "step": 9882 + }, + { + "epoch": 2.8931498829039812, + "grad_norm": 0.9877397418022156, + "learning_rate": 2.659697153355411e-06, + "loss": 0.5488, + "step": 9883 + }, + { + "epoch": 2.8934426229508197, + "grad_norm": 1.017397403717041, + "learning_rate": 2.659312863311262e-06, + "loss": 0.5855, + "step": 9884 + }, + { + "epoch": 2.893735362997658, + "grad_norm": 0.9937192797660828, + "learning_rate": 2.658928569487384e-06, + "loss": 0.5817, + "step": 9885 + }, + { + "epoch": 2.8940281030444965, + "grad_norm": 0.9753185510635376, + "learning_rate": 2.658544271892896e-06, + "loss": 0.6126, + "step": 9886 + }, + { + "epoch": 2.894320843091335, + "grad_norm": 0.9917641282081604, + "learning_rate": 2.6581599705369144e-06, + "loss": 0.5637, + "step": 9887 + }, + { + "epoch": 2.8946135831381734, + "grad_norm": 0.9804947972297668, + "learning_rate": 2.657775665428557e-06, + "loss": 0.5456, + "step": 9888 + }, + { + "epoch": 2.894906323185012, + "grad_norm": 0.99666428565979, + "learning_rate": 2.6573913565769414e-06, + "loss": 0.5809, + "step": 9889 + }, + { + "epoch": 2.8951990632318503, + "grad_norm": 0.9949003458023071, + "learning_rate": 2.6570070439911856e-06, + "loss": 0.5901, + "step": 9890 + }, + { + "epoch": 2.8954918032786887, + "grad_norm": 0.9823651909828186, + "learning_rate": 2.6566227276804085e-06, + "loss": 0.5787, + "step": 9891 + }, + { + "epoch": 2.8957845433255267, + "grad_norm": 1.041821002960205, + "learning_rate": 2.6562384076537262e-06, + "loss": 0.5507, + "step": 9892 + }, + { + "epoch": 2.8960772833723656, + "grad_norm": 1.0816609859466553, + "learning_rate": 2.6558540839202575e-06, + "loss": 0.5575, + "step": 9893 + }, + { + "epoch": 2.8963700234192036, + "grad_norm": 0.9905228614807129, + "learning_rate": 2.6554697564891207e-06, + "loss": 0.547, + "step": 9894 + }, + { + "epoch": 2.896662763466042, + "grad_norm": 1.0232818126678467, + "learning_rate": 2.655085425369435e-06, + "loss": 0.5898, + "step": 9895 + }, + { + "epoch": 2.8969555035128804, + "grad_norm": 0.9378979802131653, + "learning_rate": 2.6547010905703167e-06, + "loss": 0.5484, + "step": 9896 + }, + { + "epoch": 2.897248243559719, + "grad_norm": 0.9960646033287048, + "learning_rate": 2.6543167521008862e-06, + "loss": 0.5491, + "step": 9897 + }, + { + "epoch": 2.8975409836065573, + "grad_norm": 0.9998040199279785, + "learning_rate": 2.6539324099702613e-06, + "loss": 0.5894, + "step": 9898 + }, + { + "epoch": 2.8978337236533958, + "grad_norm": 1.0375581979751587, + "learning_rate": 2.6535480641875593e-06, + "loss": 0.5755, + "step": 9899 + }, + { + "epoch": 2.898126463700234, + "grad_norm": 0.9286021590232849, + "learning_rate": 2.6531637147619004e-06, + "loss": 0.5501, + "step": 9900 + }, + { + "epoch": 2.8984192037470726, + "grad_norm": 1.0603398084640503, + "learning_rate": 2.652779361702403e-06, + "loss": 0.6023, + "step": 9901 + }, + { + "epoch": 2.898711943793911, + "grad_norm": 1.0221811532974243, + "learning_rate": 2.6523950050181864e-06, + "loss": 0.5435, + "step": 9902 + }, + { + "epoch": 2.8990046838407495, + "grad_norm": 0.9842182397842407, + "learning_rate": 2.6520106447183685e-06, + "loss": 0.5477, + "step": 9903 + }, + { + "epoch": 2.899297423887588, + "grad_norm": 0.983978807926178, + "learning_rate": 2.6516262808120685e-06, + "loss": 0.5882, + "step": 9904 + }, + { + "epoch": 2.8995901639344264, + "grad_norm": 1.0145071744918823, + "learning_rate": 2.651241913308406e-06, + "loss": 0.5613, + "step": 9905 + }, + { + "epoch": 2.899882903981265, + "grad_norm": 1.0149650573730469, + "learning_rate": 2.650857542216501e-06, + "loss": 0.5772, + "step": 9906 + }, + { + "epoch": 2.900175644028103, + "grad_norm": 1.0438017845153809, + "learning_rate": 2.650473167545471e-06, + "loss": 0.5445, + "step": 9907 + }, + { + "epoch": 2.9004683840749417, + "grad_norm": 0.9786389470100403, + "learning_rate": 2.6500887893044365e-06, + "loss": 0.5829, + "step": 9908 + }, + { + "epoch": 2.9007611241217797, + "grad_norm": 1.1099216938018799, + "learning_rate": 2.6497044075025157e-06, + "loss": 0.5958, + "step": 9909 + }, + { + "epoch": 2.901053864168618, + "grad_norm": 0.9910482168197632, + "learning_rate": 2.64932002214883e-06, + "loss": 0.5887, + "step": 9910 + }, + { + "epoch": 2.9013466042154565, + "grad_norm": 1.0581802129745483, + "learning_rate": 2.6489356332524986e-06, + "loss": 0.5676, + "step": 9911 + }, + { + "epoch": 2.901639344262295, + "grad_norm": 0.9760097861289978, + "learning_rate": 2.6485512408226393e-06, + "loss": 0.5192, + "step": 9912 + }, + { + "epoch": 2.9019320843091334, + "grad_norm": 0.9891105890274048, + "learning_rate": 2.648166844868374e-06, + "loss": 0.5896, + "step": 9913 + }, + { + "epoch": 2.902224824355972, + "grad_norm": 0.9684768915176392, + "learning_rate": 2.6477824453988217e-06, + "loss": 0.5498, + "step": 9914 + }, + { + "epoch": 2.9025175644028103, + "grad_norm": 0.9785380363464355, + "learning_rate": 2.6473980424231033e-06, + "loss": 0.6134, + "step": 9915 + }, + { + "epoch": 2.9028103044496487, + "grad_norm": 1.0237152576446533, + "learning_rate": 2.647013635950337e-06, + "loss": 0.5573, + "step": 9916 + }, + { + "epoch": 2.903103044496487, + "grad_norm": 1.010901689529419, + "learning_rate": 2.6466292259896447e-06, + "loss": 0.5712, + "step": 9917 + }, + { + "epoch": 2.9033957845433256, + "grad_norm": 0.9737251996994019, + "learning_rate": 2.646244812550145e-06, + "loss": 0.5516, + "step": 9918 + }, + { + "epoch": 2.903688524590164, + "grad_norm": 0.96632981300354, + "learning_rate": 2.6458603956409595e-06, + "loss": 0.4967, + "step": 9919 + }, + { + "epoch": 2.9039812646370025, + "grad_norm": 1.0565180778503418, + "learning_rate": 2.6454759752712078e-06, + "loss": 0.5756, + "step": 9920 + }, + { + "epoch": 2.904274004683841, + "grad_norm": 0.964795708656311, + "learning_rate": 2.6450915514500115e-06, + "loss": 0.5721, + "step": 9921 + }, + { + "epoch": 2.904566744730679, + "grad_norm": 0.9916386008262634, + "learning_rate": 2.64470712418649e-06, + "loss": 0.5847, + "step": 9922 + }, + { + "epoch": 2.9048594847775178, + "grad_norm": 0.9828813672065735, + "learning_rate": 2.6443226934897632e-06, + "loss": 0.5453, + "step": 9923 + }, + { + "epoch": 2.9051522248243558, + "grad_norm": 1.084175944328308, + "learning_rate": 2.6439382593689533e-06, + "loss": 0.5577, + "step": 9924 + }, + { + "epoch": 2.9054449648711946, + "grad_norm": 1.0259548425674438, + "learning_rate": 2.6435538218331802e-06, + "loss": 0.6026, + "step": 9925 + }, + { + "epoch": 2.9057377049180326, + "grad_norm": 0.9882990121841431, + "learning_rate": 2.643169380891566e-06, + "loss": 0.5826, + "step": 9926 + }, + { + "epoch": 2.906030444964871, + "grad_norm": 0.9854710102081299, + "learning_rate": 2.6427849365532306e-06, + "loss": 0.5851, + "step": 9927 + }, + { + "epoch": 2.9063231850117095, + "grad_norm": 1.0033305883407593, + "learning_rate": 2.642400488827294e-06, + "loss": 0.5918, + "step": 9928 + }, + { + "epoch": 2.906615925058548, + "grad_norm": 0.9737502336502075, + "learning_rate": 2.6420160377228794e-06, + "loss": 0.558, + "step": 9929 + }, + { + "epoch": 2.9069086651053864, + "grad_norm": 0.9834559559822083, + "learning_rate": 2.641631583249107e-06, + "loss": 0.5963, + "step": 9930 + }, + { + "epoch": 2.907201405152225, + "grad_norm": 0.9902862906455994, + "learning_rate": 2.6412471254150978e-06, + "loss": 0.6051, + "step": 9931 + }, + { + "epoch": 2.9074941451990632, + "grad_norm": 1.0124973058700562, + "learning_rate": 2.6408626642299738e-06, + "loss": 0.5842, + "step": 9932 + }, + { + "epoch": 2.9077868852459017, + "grad_norm": 0.9865565896034241, + "learning_rate": 2.6404781997028563e-06, + "loss": 0.5775, + "step": 9933 + }, + { + "epoch": 2.90807962529274, + "grad_norm": 1.0051566362380981, + "learning_rate": 2.640093731842866e-06, + "loss": 0.5945, + "step": 9934 + }, + { + "epoch": 2.9083723653395785, + "grad_norm": 1.0050538778305054, + "learning_rate": 2.6397092606591247e-06, + "loss": 0.5836, + "step": 9935 + }, + { + "epoch": 2.908665105386417, + "grad_norm": 0.9983928203582764, + "learning_rate": 2.639324786160755e-06, + "loss": 0.5338, + "step": 9936 + }, + { + "epoch": 2.9089578454332554, + "grad_norm": 1.2999451160430908, + "learning_rate": 2.638940308356878e-06, + "loss": 0.5287, + "step": 9937 + }, + { + "epoch": 2.909250585480094, + "grad_norm": 0.9982926249504089, + "learning_rate": 2.6385558272566145e-06, + "loss": 0.5351, + "step": 9938 + }, + { + "epoch": 2.909543325526932, + "grad_norm": 1.0099663734436035, + "learning_rate": 2.638171342869088e-06, + "loss": 0.5636, + "step": 9939 + }, + { + "epoch": 2.9098360655737707, + "grad_norm": 0.9898994565010071, + "learning_rate": 2.63778685520342e-06, + "loss": 0.5511, + "step": 9940 + }, + { + "epoch": 2.9101288056206087, + "grad_norm": 0.9701278209686279, + "learning_rate": 2.637402364268732e-06, + "loss": 0.577, + "step": 9941 + }, + { + "epoch": 2.910421545667447, + "grad_norm": 0.9528393149375916, + "learning_rate": 2.637017870074147e-06, + "loss": 0.5623, + "step": 9942 + }, + { + "epoch": 2.9107142857142856, + "grad_norm": 1.0412545204162598, + "learning_rate": 2.6366333726287862e-06, + "loss": 0.5761, + "step": 9943 + }, + { + "epoch": 2.911007025761124, + "grad_norm": 1.0074620246887207, + "learning_rate": 2.636248871941772e-06, + "loss": 0.5796, + "step": 9944 + }, + { + "epoch": 2.9112997658079625, + "grad_norm": 1.0090821981430054, + "learning_rate": 2.635864368022228e-06, + "loss": 0.5368, + "step": 9945 + }, + { + "epoch": 2.911592505854801, + "grad_norm": 1.017452359199524, + "learning_rate": 2.6354798608792755e-06, + "loss": 0.5552, + "step": 9946 + }, + { + "epoch": 2.9118852459016393, + "grad_norm": 1.0181223154067993, + "learning_rate": 2.6350953505220377e-06, + "loss": 0.5509, + "step": 9947 + }, + { + "epoch": 2.9121779859484778, + "grad_norm": 1.003537893295288, + "learning_rate": 2.6347108369596365e-06, + "loss": 0.5893, + "step": 9948 + }, + { + "epoch": 2.912470725995316, + "grad_norm": 1.0258468389511108, + "learning_rate": 2.634326320201194e-06, + "loss": 0.5719, + "step": 9949 + }, + { + "epoch": 2.9127634660421546, + "grad_norm": 0.9915449023246765, + "learning_rate": 2.633941800255835e-06, + "loss": 0.5693, + "step": 9950 + }, + { + "epoch": 2.913056206088993, + "grad_norm": 1.1051099300384521, + "learning_rate": 2.6335572771326805e-06, + "loss": 0.5215, + "step": 9951 + }, + { + "epoch": 2.9133489461358315, + "grad_norm": 0.9933915138244629, + "learning_rate": 2.6331727508408544e-06, + "loss": 0.5308, + "step": 9952 + }, + { + "epoch": 2.91364168618267, + "grad_norm": 0.9717495441436768, + "learning_rate": 2.6327882213894784e-06, + "loss": 0.5234, + "step": 9953 + }, + { + "epoch": 2.913934426229508, + "grad_norm": 1.0100027322769165, + "learning_rate": 2.632403688787677e-06, + "loss": 0.5731, + "step": 9954 + }, + { + "epoch": 2.914227166276347, + "grad_norm": 1.0273844003677368, + "learning_rate": 2.632019153044572e-06, + "loss": 0.5666, + "step": 9955 + }, + { + "epoch": 2.914519906323185, + "grad_norm": 0.9632961750030518, + "learning_rate": 2.631634614169287e-06, + "loss": 0.5474, + "step": 9956 + }, + { + "epoch": 2.9148126463700237, + "grad_norm": 0.9477797746658325, + "learning_rate": 2.6312500721709468e-06, + "loss": 0.5536, + "step": 9957 + }, + { + "epoch": 2.9151053864168617, + "grad_norm": 1.0507324934005737, + "learning_rate": 2.6308655270586724e-06, + "loss": 0.5466, + "step": 9958 + }, + { + "epoch": 2.9153981264637, + "grad_norm": 1.0040477514266968, + "learning_rate": 2.6304809788415885e-06, + "loss": 0.5672, + "step": 9959 + }, + { + "epoch": 2.9156908665105385, + "grad_norm": 0.9673066139221191, + "learning_rate": 2.6300964275288182e-06, + "loss": 0.6035, + "step": 9960 + }, + { + "epoch": 2.915983606557377, + "grad_norm": 1.0081510543823242, + "learning_rate": 2.6297118731294853e-06, + "loss": 0.5497, + "step": 9961 + }, + { + "epoch": 2.9162763466042154, + "grad_norm": 0.9652444124221802, + "learning_rate": 2.629327315652713e-06, + "loss": 0.5616, + "step": 9962 + }, + { + "epoch": 2.916569086651054, + "grad_norm": 0.9795066714286804, + "learning_rate": 2.628942755107626e-06, + "loss": 0.5577, + "step": 9963 + }, + { + "epoch": 2.9168618266978923, + "grad_norm": 0.9661463499069214, + "learning_rate": 2.6285581915033466e-06, + "loss": 0.5734, + "step": 9964 + }, + { + "epoch": 2.9171545667447307, + "grad_norm": 1.0287375450134277, + "learning_rate": 2.628173624849e-06, + "loss": 0.574, + "step": 9965 + }, + { + "epoch": 2.917447306791569, + "grad_norm": 0.9852139353752136, + "learning_rate": 2.627789055153709e-06, + "loss": 0.5837, + "step": 9966 + }, + { + "epoch": 2.9177400468384076, + "grad_norm": 0.9645309448242188, + "learning_rate": 2.6274044824265984e-06, + "loss": 0.5548, + "step": 9967 + }, + { + "epoch": 2.918032786885246, + "grad_norm": 1.0071197748184204, + "learning_rate": 2.6270199066767916e-06, + "loss": 0.6078, + "step": 9968 + }, + { + "epoch": 2.9183255269320845, + "grad_norm": 0.9899149537086487, + "learning_rate": 2.626635327913414e-06, + "loss": 0.5449, + "step": 9969 + }, + { + "epoch": 2.918618266978923, + "grad_norm": 0.9480745196342468, + "learning_rate": 2.6262507461455885e-06, + "loss": 0.5431, + "step": 9970 + }, + { + "epoch": 2.918911007025761, + "grad_norm": 0.9373776912689209, + "learning_rate": 2.6258661613824393e-06, + "loss": 0.5293, + "step": 9971 + }, + { + "epoch": 2.9192037470725998, + "grad_norm": 1.0076605081558228, + "learning_rate": 2.625481573633092e-06, + "loss": 0.5436, + "step": 9972 + }, + { + "epoch": 2.9194964871194378, + "grad_norm": 1.0137319564819336, + "learning_rate": 2.6250969829066695e-06, + "loss": 0.5369, + "step": 9973 + }, + { + "epoch": 2.919789227166276, + "grad_norm": 0.9932499527931213, + "learning_rate": 2.6247123892122974e-06, + "loss": 0.5844, + "step": 9974 + }, + { + "epoch": 2.9200819672131146, + "grad_norm": 0.9715107083320618, + "learning_rate": 2.6243277925591e-06, + "loss": 0.563, + "step": 9975 + }, + { + "epoch": 2.920374707259953, + "grad_norm": 1.7234936952590942, + "learning_rate": 2.6239431929562015e-06, + "loss": 0.5704, + "step": 9976 + }, + { + "epoch": 2.9206674473067915, + "grad_norm": 0.9674422740936279, + "learning_rate": 2.6235585904127275e-06, + "loss": 0.5879, + "step": 9977 + }, + { + "epoch": 2.92096018735363, + "grad_norm": 0.9710869193077087, + "learning_rate": 2.6231739849378016e-06, + "loss": 0.5531, + "step": 9978 + }, + { + "epoch": 2.9212529274004684, + "grad_norm": 1.0098940134048462, + "learning_rate": 2.6227893765405494e-06, + "loss": 0.5801, + "step": 9979 + }, + { + "epoch": 2.921545667447307, + "grad_norm": 0.9797923564910889, + "learning_rate": 2.6224047652300956e-06, + "loss": 0.5965, + "step": 9980 + }, + { + "epoch": 2.9218384074941453, + "grad_norm": 1.0257765054702759, + "learning_rate": 2.622020151015565e-06, + "loss": 0.5534, + "step": 9981 + }, + { + "epoch": 2.9221311475409837, + "grad_norm": 1.0363632440567017, + "learning_rate": 2.621635533906084e-06, + "loss": 0.5946, + "step": 9982 + }, + { + "epoch": 2.922423887587822, + "grad_norm": 1.0044482946395874, + "learning_rate": 2.6212509139107755e-06, + "loss": 0.5163, + "step": 9983 + }, + { + "epoch": 2.9227166276346606, + "grad_norm": 1.0214349031448364, + "learning_rate": 2.620866291038766e-06, + "loss": 0.5735, + "step": 9984 + }, + { + "epoch": 2.923009367681499, + "grad_norm": 1.0144801139831543, + "learning_rate": 2.6204816652991805e-06, + "loss": 0.572, + "step": 9985 + }, + { + "epoch": 2.923302107728337, + "grad_norm": 1.8927607536315918, + "learning_rate": 2.620097036701145e-06, + "loss": 0.6003, + "step": 9986 + }, + { + "epoch": 2.923594847775176, + "grad_norm": 0.9664320349693298, + "learning_rate": 2.619712405253784e-06, + "loss": 0.5678, + "step": 9987 + }, + { + "epoch": 2.923887587822014, + "grad_norm": 1.012929081916809, + "learning_rate": 2.619327770966223e-06, + "loss": 0.5414, + "step": 9988 + }, + { + "epoch": 2.9241803278688527, + "grad_norm": 0.9633643627166748, + "learning_rate": 2.6189431338475874e-06, + "loss": 0.5711, + "step": 9989 + }, + { + "epoch": 2.9244730679156907, + "grad_norm": 1.0105806589126587, + "learning_rate": 2.618558493907003e-06, + "loss": 0.5883, + "step": 9990 + }, + { + "epoch": 2.924765807962529, + "grad_norm": 0.981605052947998, + "learning_rate": 2.6181738511535958e-06, + "loss": 0.578, + "step": 9991 + }, + { + "epoch": 2.9250585480093676, + "grad_norm": 0.9997009038925171, + "learning_rate": 2.617789205596492e-06, + "loss": 0.5515, + "step": 9992 + }, + { + "epoch": 2.925351288056206, + "grad_norm": 0.9580520391464233, + "learning_rate": 2.617404557244816e-06, + "loss": 0.5459, + "step": 9993 + }, + { + "epoch": 2.9256440281030445, + "grad_norm": 0.9858453273773193, + "learning_rate": 2.617019906107694e-06, + "loss": 0.5349, + "step": 9994 + }, + { + "epoch": 2.925936768149883, + "grad_norm": 0.9898223280906677, + "learning_rate": 2.6166352521942522e-06, + "loss": 0.5605, + "step": 9995 + }, + { + "epoch": 2.9262295081967213, + "grad_norm": 1.0405662059783936, + "learning_rate": 2.616250595513617e-06, + "loss": 0.5874, + "step": 9996 + }, + { + "epoch": 2.9265222482435598, + "grad_norm": 1.0571022033691406, + "learning_rate": 2.6158659360749145e-06, + "loss": 0.5637, + "step": 9997 + }, + { + "epoch": 2.926814988290398, + "grad_norm": 0.9616442322731018, + "learning_rate": 2.61548127388727e-06, + "loss": 0.5889, + "step": 9998 + }, + { + "epoch": 2.9271077283372366, + "grad_norm": 1.0143259763717651, + "learning_rate": 2.61509660895981e-06, + "loss": 0.5732, + "step": 9999 + }, + { + "epoch": 2.927400468384075, + "grad_norm": 1.059718370437622, + "learning_rate": 2.6147119413016608e-06, + "loss": 0.5556, + "step": 10000 + }, + { + "epoch": 2.927693208430913, + "grad_norm": 0.9164896607398987, + "learning_rate": 2.6143272709219485e-06, + "loss": 0.4945, + "step": 10001 + }, + { + "epoch": 2.927985948477752, + "grad_norm": 1.0087686777114868, + "learning_rate": 2.6139425978298004e-06, + "loss": 0.5929, + "step": 10002 + }, + { + "epoch": 2.92827868852459, + "grad_norm": 1.0189886093139648, + "learning_rate": 2.613557922034342e-06, + "loss": 0.5621, + "step": 10003 + }, + { + "epoch": 2.928571428571429, + "grad_norm": 0.9937826991081238, + "learning_rate": 2.6131732435447e-06, + "loss": 0.602, + "step": 10004 + }, + { + "epoch": 2.928864168618267, + "grad_norm": 0.9663675427436829, + "learning_rate": 2.612788562370002e-06, + "loss": 0.5774, + "step": 10005 + }, + { + "epoch": 2.9291569086651053, + "grad_norm": 0.9506199359893799, + "learning_rate": 2.6124038785193723e-06, + "loss": 0.549, + "step": 10006 + }, + { + "epoch": 2.9294496487119437, + "grad_norm": 0.9789804220199585, + "learning_rate": 2.61201919200194e-06, + "loss": 0.5636, + "step": 10007 + }, + { + "epoch": 2.929742388758782, + "grad_norm": 0.9798746705055237, + "learning_rate": 2.6116345028268305e-06, + "loss": 0.5754, + "step": 10008 + }, + { + "epoch": 2.9300351288056206, + "grad_norm": 1.010386347770691, + "learning_rate": 2.6112498110031705e-06, + "loss": 0.5329, + "step": 10009 + }, + { + "epoch": 2.930327868852459, + "grad_norm": 0.9782123565673828, + "learning_rate": 2.6108651165400876e-06, + "loss": 0.5661, + "step": 10010 + }, + { + "epoch": 2.9306206088992974, + "grad_norm": 0.9611077308654785, + "learning_rate": 2.6104804194467083e-06, + "loss": 0.5467, + "step": 10011 + }, + { + "epoch": 2.930913348946136, + "grad_norm": 1.0300558805465698, + "learning_rate": 2.610095719732161e-06, + "loss": 0.5728, + "step": 10012 + }, + { + "epoch": 2.9312060889929743, + "grad_norm": 0.9943877458572388, + "learning_rate": 2.60971101740557e-06, + "loss": 0.551, + "step": 10013 + }, + { + "epoch": 2.9314988290398127, + "grad_norm": 0.9972922801971436, + "learning_rate": 2.609326312476065e-06, + "loss": 0.5455, + "step": 10014 + }, + { + "epoch": 2.931791569086651, + "grad_norm": 0.9283608794212341, + "learning_rate": 2.6089416049527717e-06, + "loss": 0.5075, + "step": 10015 + }, + { + "epoch": 2.9320843091334896, + "grad_norm": 0.987540602684021, + "learning_rate": 2.608556894844818e-06, + "loss": 0.5206, + "step": 10016 + }, + { + "epoch": 2.932377049180328, + "grad_norm": 1.0338423252105713, + "learning_rate": 2.608172182161332e-06, + "loss": 0.5757, + "step": 10017 + }, + { + "epoch": 2.932669789227166, + "grad_norm": 0.9676620364189148, + "learning_rate": 2.607787466911439e-06, + "loss": 0.6078, + "step": 10018 + }, + { + "epoch": 2.932962529274005, + "grad_norm": 0.9747098088264465, + "learning_rate": 2.607402749104268e-06, + "loss": 0.5461, + "step": 10019 + }, + { + "epoch": 2.933255269320843, + "grad_norm": 0.9478684663772583, + "learning_rate": 2.6070180287489465e-06, + "loss": 0.5388, + "step": 10020 + }, + { + "epoch": 2.9335480093676813, + "grad_norm": 1.034559726715088, + "learning_rate": 2.6066333058546018e-06, + "loss": 0.6355, + "step": 10021 + }, + { + "epoch": 2.9338407494145198, + "grad_norm": 1.038272738456726, + "learning_rate": 2.606248580430361e-06, + "loss": 0.5559, + "step": 10022 + }, + { + "epoch": 2.934133489461358, + "grad_norm": 1.0246496200561523, + "learning_rate": 2.6058638524853525e-06, + "loss": 0.5501, + "step": 10023 + }, + { + "epoch": 2.9344262295081966, + "grad_norm": 1.0285816192626953, + "learning_rate": 2.6054791220287036e-06, + "loss": 0.574, + "step": 10024 + }, + { + "epoch": 2.934718969555035, + "grad_norm": 1.0151768922805786, + "learning_rate": 2.6050943890695424e-06, + "loss": 0.5543, + "step": 10025 + }, + { + "epoch": 2.9350117096018735, + "grad_norm": 0.9570237994194031, + "learning_rate": 2.604709653616996e-06, + "loss": 0.5589, + "step": 10026 + }, + { + "epoch": 2.935304449648712, + "grad_norm": 0.9810967445373535, + "learning_rate": 2.6043249156801938e-06, + "loss": 0.5988, + "step": 10027 + }, + { + "epoch": 2.9355971896955504, + "grad_norm": 0.9626652002334595, + "learning_rate": 2.6039401752682624e-06, + "loss": 0.5624, + "step": 10028 + }, + { + "epoch": 2.935889929742389, + "grad_norm": 0.9516878128051758, + "learning_rate": 2.6035554323903306e-06, + "loss": 0.5767, + "step": 10029 + }, + { + "epoch": 2.9361826697892273, + "grad_norm": 0.9645724296569824, + "learning_rate": 2.6031706870555264e-06, + "loss": 0.5828, + "step": 10030 + }, + { + "epoch": 2.9364754098360657, + "grad_norm": 0.9641596674919128, + "learning_rate": 2.602785939272977e-06, + "loss": 0.5521, + "step": 10031 + }, + { + "epoch": 2.936768149882904, + "grad_norm": 0.9857109189033508, + "learning_rate": 2.602401189051812e-06, + "loss": 0.5561, + "step": 10032 + }, + { + "epoch": 2.937060889929742, + "grad_norm": 0.9745016694068909, + "learning_rate": 2.6020164364011596e-06, + "loss": 0.583, + "step": 10033 + }, + { + "epoch": 2.937353629976581, + "grad_norm": 0.9551525712013245, + "learning_rate": 2.6016316813301474e-06, + "loss": 0.5428, + "step": 10034 + }, + { + "epoch": 2.937646370023419, + "grad_norm": 1.0611048936843872, + "learning_rate": 2.601246923847904e-06, + "loss": 0.5569, + "step": 10035 + }, + { + "epoch": 2.937939110070258, + "grad_norm": 1.0037412643432617, + "learning_rate": 2.600862163963557e-06, + "loss": 0.5931, + "step": 10036 + }, + { + "epoch": 2.938231850117096, + "grad_norm": 0.9677786827087402, + "learning_rate": 2.6004774016862364e-06, + "loss": 0.5717, + "step": 10037 + }, + { + "epoch": 2.9385245901639343, + "grad_norm": 0.9833011031150818, + "learning_rate": 2.600092637025071e-06, + "loss": 0.5814, + "step": 10038 + }, + { + "epoch": 2.9388173302107727, + "grad_norm": 0.9984132647514343, + "learning_rate": 2.5997078699891874e-06, + "loss": 0.5545, + "step": 10039 + }, + { + "epoch": 2.939110070257611, + "grad_norm": 0.9759519100189209, + "learning_rate": 2.5993231005877165e-06, + "loss": 0.5769, + "step": 10040 + }, + { + "epoch": 2.9394028103044496, + "grad_norm": 0.9688433408737183, + "learning_rate": 2.598938328829785e-06, + "loss": 0.5801, + "step": 10041 + }, + { + "epoch": 2.939695550351288, + "grad_norm": 0.9999233484268188, + "learning_rate": 2.5985535547245235e-06, + "loss": 0.5924, + "step": 10042 + }, + { + "epoch": 2.9399882903981265, + "grad_norm": 1.0006941556930542, + "learning_rate": 2.5981687782810595e-06, + "loss": 0.579, + "step": 10043 + }, + { + "epoch": 2.940281030444965, + "grad_norm": 0.9880650043487549, + "learning_rate": 2.5977839995085226e-06, + "loss": 0.5958, + "step": 10044 + }, + { + "epoch": 2.9405737704918034, + "grad_norm": 1.00563383102417, + "learning_rate": 2.597399218416041e-06, + "loss": 0.6204, + "step": 10045 + }, + { + "epoch": 2.940866510538642, + "grad_norm": 0.9694665670394897, + "learning_rate": 2.5970144350127443e-06, + "loss": 0.5205, + "step": 10046 + }, + { + "epoch": 2.9411592505854802, + "grad_norm": 1.0156255960464478, + "learning_rate": 2.5966296493077615e-06, + "loss": 0.5861, + "step": 10047 + }, + { + "epoch": 2.9414519906323187, + "grad_norm": 1.0056363344192505, + "learning_rate": 2.596244861310222e-06, + "loss": 0.5263, + "step": 10048 + }, + { + "epoch": 2.941744730679157, + "grad_norm": 1.009000301361084, + "learning_rate": 2.595860071029255e-06, + "loss": 0.595, + "step": 10049 + }, + { + "epoch": 2.942037470725995, + "grad_norm": 1.020891547203064, + "learning_rate": 2.5954752784739885e-06, + "loss": 0.61, + "step": 10050 + }, + { + "epoch": 2.942330210772834, + "grad_norm": 0.9596639275550842, + "learning_rate": 2.595090483653553e-06, + "loss": 0.5648, + "step": 10051 + }, + { + "epoch": 2.942622950819672, + "grad_norm": 0.9673138856887817, + "learning_rate": 2.5947056865770776e-06, + "loss": 0.5782, + "step": 10052 + }, + { + "epoch": 2.9429156908665104, + "grad_norm": 0.9929797649383545, + "learning_rate": 2.594320887253692e-06, + "loss": 0.5525, + "step": 10053 + }, + { + "epoch": 2.943208430913349, + "grad_norm": 0.9723708033561707, + "learning_rate": 2.5939360856925246e-06, + "loss": 0.543, + "step": 10054 + }, + { + "epoch": 2.9435011709601873, + "grad_norm": 0.9696510434150696, + "learning_rate": 2.5935512819027053e-06, + "loss": 0.5169, + "step": 10055 + }, + { + "epoch": 2.9437939110070257, + "grad_norm": 1.0004442930221558, + "learning_rate": 2.5931664758933644e-06, + "loss": 0.5879, + "step": 10056 + }, + { + "epoch": 2.944086651053864, + "grad_norm": 0.9458190202713013, + "learning_rate": 2.592781667673631e-06, + "loss": 0.5639, + "step": 10057 + }, + { + "epoch": 2.9443793911007026, + "grad_norm": 0.9802045822143555, + "learning_rate": 2.5923968572526347e-06, + "loss": 0.5926, + "step": 10058 + }, + { + "epoch": 2.944672131147541, + "grad_norm": 0.9025931358337402, + "learning_rate": 2.592012044639506e-06, + "loss": 0.5229, + "step": 10059 + }, + { + "epoch": 2.9449648711943794, + "grad_norm": 0.9420754909515381, + "learning_rate": 2.5916272298433724e-06, + "loss": 0.5286, + "step": 10060 + }, + { + "epoch": 2.945257611241218, + "grad_norm": 0.9139617085456848, + "learning_rate": 2.5912424128733655e-06, + "loss": 0.5181, + "step": 10061 + }, + { + "epoch": 2.9455503512880563, + "grad_norm": 0.9536256194114685, + "learning_rate": 2.5908575937386147e-06, + "loss": 0.544, + "step": 10062 + }, + { + "epoch": 2.9458430913348947, + "grad_norm": 1.0246326923370361, + "learning_rate": 2.5904727724482515e-06, + "loss": 0.5424, + "step": 10063 + }, + { + "epoch": 2.946135831381733, + "grad_norm": 1.0021089315414429, + "learning_rate": 2.590087949011403e-06, + "loss": 0.5507, + "step": 10064 + }, + { + "epoch": 2.946428571428571, + "grad_norm": 1.046675443649292, + "learning_rate": 2.5897031234372007e-06, + "loss": 0.5902, + "step": 10065 + }, + { + "epoch": 2.94672131147541, + "grad_norm": 0.9549639821052551, + "learning_rate": 2.5893182957347747e-06, + "loss": 0.5592, + "step": 10066 + }, + { + "epoch": 2.947014051522248, + "grad_norm": 0.9875115752220154, + "learning_rate": 2.588933465913255e-06, + "loss": 0.584, + "step": 10067 + }, + { + "epoch": 2.947306791569087, + "grad_norm": 1.0662988424301147, + "learning_rate": 2.588548633981772e-06, + "loss": 0.5713, + "step": 10068 + }, + { + "epoch": 2.947599531615925, + "grad_norm": 1.015260100364685, + "learning_rate": 2.588163799949455e-06, + "loss": 0.5443, + "step": 10069 + }, + { + "epoch": 2.9478922716627634, + "grad_norm": 0.9878665804862976, + "learning_rate": 2.5877789638254354e-06, + "loss": 0.5967, + "step": 10070 + }, + { + "epoch": 2.948185011709602, + "grad_norm": 0.9332766532897949, + "learning_rate": 2.587394125618843e-06, + "loss": 0.544, + "step": 10071 + }, + { + "epoch": 2.9484777517564402, + "grad_norm": 0.9491848349571228, + "learning_rate": 2.5870092853388078e-06, + "loss": 0.5673, + "step": 10072 + }, + { + "epoch": 2.9487704918032787, + "grad_norm": 1.0232242345809937, + "learning_rate": 2.5866244429944614e-06, + "loss": 0.5746, + "step": 10073 + }, + { + "epoch": 2.949063231850117, + "grad_norm": 0.9673404693603516, + "learning_rate": 2.586239598594933e-06, + "loss": 0.5686, + "step": 10074 + }, + { + "epoch": 2.9493559718969555, + "grad_norm": 1.0178145170211792, + "learning_rate": 2.5858547521493534e-06, + "loss": 0.5197, + "step": 10075 + }, + { + "epoch": 2.949648711943794, + "grad_norm": 0.9979034066200256, + "learning_rate": 2.5854699036668536e-06, + "loss": 0.5844, + "step": 10076 + }, + { + "epoch": 2.9499414519906324, + "grad_norm": 0.993906557559967, + "learning_rate": 2.5850850531565634e-06, + "loss": 0.5658, + "step": 10077 + }, + { + "epoch": 2.950234192037471, + "grad_norm": 1.0166525840759277, + "learning_rate": 2.584700200627615e-06, + "loss": 0.6159, + "step": 10078 + }, + { + "epoch": 2.9505269320843093, + "grad_norm": 1.0451226234436035, + "learning_rate": 2.5843153460891375e-06, + "loss": 0.6192, + "step": 10079 + }, + { + "epoch": 2.9508196721311473, + "grad_norm": 0.9609853625297546, + "learning_rate": 2.5839304895502613e-06, + "loss": 0.5447, + "step": 10080 + }, + { + "epoch": 2.951112412177986, + "grad_norm": 0.9573104977607727, + "learning_rate": 2.583545631020119e-06, + "loss": 0.5584, + "step": 10081 + }, + { + "epoch": 2.951405152224824, + "grad_norm": 0.975617527961731, + "learning_rate": 2.5831607705078406e-06, + "loss": 0.5508, + "step": 10082 + }, + { + "epoch": 2.951697892271663, + "grad_norm": 0.9541507363319397, + "learning_rate": 2.5827759080225573e-06, + "loss": 0.5375, + "step": 10083 + }, + { + "epoch": 2.951990632318501, + "grad_norm": 0.9903622269630432, + "learning_rate": 2.582391043573399e-06, + "loss": 0.6172, + "step": 10084 + }, + { + "epoch": 2.9522833723653394, + "grad_norm": 1.0134152173995972, + "learning_rate": 2.582006177169497e-06, + "loss": 0.595, + "step": 10085 + }, + { + "epoch": 2.952576112412178, + "grad_norm": 0.95745450258255, + "learning_rate": 2.581621308819983e-06, + "loss": 0.5367, + "step": 10086 + }, + { + "epoch": 2.9528688524590163, + "grad_norm": 0.9496130347251892, + "learning_rate": 2.5812364385339884e-06, + "loss": 0.5843, + "step": 10087 + }, + { + "epoch": 2.9531615925058547, + "grad_norm": 1.0012109279632568, + "learning_rate": 2.5808515663206435e-06, + "loss": 0.5599, + "step": 10088 + }, + { + "epoch": 2.953454332552693, + "grad_norm": 0.9747013449668884, + "learning_rate": 2.5804666921890793e-06, + "loss": 0.5576, + "step": 10089 + }, + { + "epoch": 2.9537470725995316, + "grad_norm": 1.0288690328598022, + "learning_rate": 2.5800818161484275e-06, + "loss": 0.5396, + "step": 10090 + }, + { + "epoch": 2.95403981264637, + "grad_norm": 1.1096160411834717, + "learning_rate": 2.5796969382078193e-06, + "loss": 0.5887, + "step": 10091 + }, + { + "epoch": 2.9543325526932085, + "grad_norm": 0.9693667888641357, + "learning_rate": 2.579312058376386e-06, + "loss": 0.5806, + "step": 10092 + }, + { + "epoch": 2.954625292740047, + "grad_norm": 1.0004332065582275, + "learning_rate": 2.5789271766632585e-06, + "loss": 0.5168, + "step": 10093 + }, + { + "epoch": 2.9549180327868854, + "grad_norm": 1.006118655204773, + "learning_rate": 2.5785422930775687e-06, + "loss": 0.5936, + "step": 10094 + }, + { + "epoch": 2.955210772833724, + "grad_norm": 0.9602951407432556, + "learning_rate": 2.5781574076284484e-06, + "loss": 0.5494, + "step": 10095 + }, + { + "epoch": 2.9555035128805622, + "grad_norm": 1.003527045249939, + "learning_rate": 2.577772520325028e-06, + "loss": 0.5716, + "step": 10096 + }, + { + "epoch": 2.9557962529274002, + "grad_norm": 0.9840759634971619, + "learning_rate": 2.5773876311764397e-06, + "loss": 0.5547, + "step": 10097 + }, + { + "epoch": 2.956088992974239, + "grad_norm": 0.9578275680541992, + "learning_rate": 2.5770027401918155e-06, + "loss": 0.5631, + "step": 10098 + }, + { + "epoch": 2.956381733021077, + "grad_norm": 0.9360936284065247, + "learning_rate": 2.576617847380286e-06, + "loss": 0.5096, + "step": 10099 + }, + { + "epoch": 2.9566744730679155, + "grad_norm": 1.0073481798171997, + "learning_rate": 2.5762329527509833e-06, + "loss": 0.5997, + "step": 10100 + }, + { + "epoch": 2.956967213114754, + "grad_norm": 0.9746791124343872, + "learning_rate": 2.5758480563130395e-06, + "loss": 0.5277, + "step": 10101 + }, + { + "epoch": 2.9572599531615924, + "grad_norm": 0.9762089252471924, + "learning_rate": 2.575463158075585e-06, + "loss": 0.5378, + "step": 10102 + }, + { + "epoch": 2.957552693208431, + "grad_norm": 1.0262727737426758, + "learning_rate": 2.5750782580477534e-06, + "loss": 0.5781, + "step": 10103 + }, + { + "epoch": 2.9578454332552693, + "grad_norm": 0.9664428234100342, + "learning_rate": 2.5746933562386762e-06, + "loss": 0.5981, + "step": 10104 + }, + { + "epoch": 2.9581381733021077, + "grad_norm": 0.989524245262146, + "learning_rate": 2.574308452657484e-06, + "loss": 0.5531, + "step": 10105 + }, + { + "epoch": 2.958430913348946, + "grad_norm": 0.9945396184921265, + "learning_rate": 2.57392354731331e-06, + "loss": 0.543, + "step": 10106 + }, + { + "epoch": 2.9587236533957846, + "grad_norm": 0.9965120553970337, + "learning_rate": 2.5735386402152853e-06, + "loss": 0.5626, + "step": 10107 + }, + { + "epoch": 2.959016393442623, + "grad_norm": 0.9782065749168396, + "learning_rate": 2.5731537313725418e-06, + "loss": 0.5513, + "step": 10108 + }, + { + "epoch": 2.9593091334894615, + "grad_norm": 0.9571029543876648, + "learning_rate": 2.572768820794213e-06, + "loss": 0.5557, + "step": 10109 + }, + { + "epoch": 2.9596018735363, + "grad_norm": 1.0453517436981201, + "learning_rate": 2.5723839084894294e-06, + "loss": 0.5461, + "step": 10110 + }, + { + "epoch": 2.9598946135831383, + "grad_norm": 0.9859747290611267, + "learning_rate": 2.5719989944673242e-06, + "loss": 0.5614, + "step": 10111 + }, + { + "epoch": 2.9601873536299763, + "grad_norm": 1.0266543626785278, + "learning_rate": 2.571614078737028e-06, + "loss": 0.5984, + "step": 10112 + }, + { + "epoch": 2.960480093676815, + "grad_norm": 1.014898657798767, + "learning_rate": 2.5712291613076744e-06, + "loss": 0.5343, + "step": 10113 + }, + { + "epoch": 2.960772833723653, + "grad_norm": 0.9235020279884338, + "learning_rate": 2.570844242188396e-06, + "loss": 0.5059, + "step": 10114 + }, + { + "epoch": 2.961065573770492, + "grad_norm": 0.9500049948692322, + "learning_rate": 2.5704593213883234e-06, + "loss": 0.5055, + "step": 10115 + }, + { + "epoch": 2.96135831381733, + "grad_norm": 1.0002983808517456, + "learning_rate": 2.57007439891659e-06, + "loss": 0.5617, + "step": 10116 + }, + { + "epoch": 2.9616510538641685, + "grad_norm": 0.9688754677772522, + "learning_rate": 2.569689474782328e-06, + "loss": 0.5344, + "step": 10117 + }, + { + "epoch": 2.961943793911007, + "grad_norm": 0.9425867199897766, + "learning_rate": 2.56930454899467e-06, + "loss": 0.5395, + "step": 10118 + }, + { + "epoch": 2.9622365339578454, + "grad_norm": 0.9224328398704529, + "learning_rate": 2.568919621562749e-06, + "loss": 0.5215, + "step": 10119 + }, + { + "epoch": 2.962529274004684, + "grad_norm": 1.0588762760162354, + "learning_rate": 2.5685346924956957e-06, + "loss": 0.5706, + "step": 10120 + }, + { + "epoch": 2.9628220140515222, + "grad_norm": 1.039132833480835, + "learning_rate": 2.568149761802644e-06, + "loss": 0.548, + "step": 10121 + }, + { + "epoch": 2.9631147540983607, + "grad_norm": 1.0130723714828491, + "learning_rate": 2.567764829492726e-06, + "loss": 0.5736, + "step": 10122 + }, + { + "epoch": 2.963407494145199, + "grad_norm": 1.0364254713058472, + "learning_rate": 2.5673798955750746e-06, + "loss": 0.5779, + "step": 10123 + }, + { + "epoch": 2.9637002341920375, + "grad_norm": 0.972813069820404, + "learning_rate": 2.5669949600588223e-06, + "loss": 0.5432, + "step": 10124 + }, + { + "epoch": 2.963992974238876, + "grad_norm": 0.9890536069869995, + "learning_rate": 2.5666100229531016e-06, + "loss": 0.6095, + "step": 10125 + }, + { + "epoch": 2.9642857142857144, + "grad_norm": 1.0114766359329224, + "learning_rate": 2.566225084267045e-06, + "loss": 0.6053, + "step": 10126 + }, + { + "epoch": 2.964578454332553, + "grad_norm": 0.9850433468818665, + "learning_rate": 2.5658401440097858e-06, + "loss": 0.5538, + "step": 10127 + }, + { + "epoch": 2.9648711943793913, + "grad_norm": 0.9322547316551208, + "learning_rate": 2.565455202190457e-06, + "loss": 0.5583, + "step": 10128 + }, + { + "epoch": 2.9651639344262293, + "grad_norm": 0.99313884973526, + "learning_rate": 2.5650702588181903e-06, + "loss": 0.5577, + "step": 10129 + }, + { + "epoch": 2.965456674473068, + "grad_norm": 0.9785377979278564, + "learning_rate": 2.5646853139021195e-06, + "loss": 0.5921, + "step": 10130 + }, + { + "epoch": 2.965749414519906, + "grad_norm": 0.9990673065185547, + "learning_rate": 2.5643003674513774e-06, + "loss": 0.5702, + "step": 10131 + }, + { + "epoch": 2.9660421545667446, + "grad_norm": 0.891956627368927, + "learning_rate": 2.563915419475096e-06, + "loss": 0.5178, + "step": 10132 + }, + { + "epoch": 2.966334894613583, + "grad_norm": 0.973667323589325, + "learning_rate": 2.563530469982409e-06, + "loss": 0.5571, + "step": 10133 + }, + { + "epoch": 2.9666276346604215, + "grad_norm": 0.957231342792511, + "learning_rate": 2.5631455189824505e-06, + "loss": 0.5691, + "step": 10134 + }, + { + "epoch": 2.96692037470726, + "grad_norm": 0.959494411945343, + "learning_rate": 2.5627605664843514e-06, + "loss": 0.5567, + "step": 10135 + }, + { + "epoch": 2.9672131147540983, + "grad_norm": 0.9650396108627319, + "learning_rate": 2.562375612497246e-06, + "loss": 0.5519, + "step": 10136 + }, + { + "epoch": 2.9675058548009368, + "grad_norm": 0.9933462142944336, + "learning_rate": 2.561990657030267e-06, + "loss": 0.5566, + "step": 10137 + }, + { + "epoch": 2.967798594847775, + "grad_norm": 0.9800896048545837, + "learning_rate": 2.561605700092548e-06, + "loss": 0.5637, + "step": 10138 + }, + { + "epoch": 2.9680913348946136, + "grad_norm": 1.0274094343185425, + "learning_rate": 2.5612207416932227e-06, + "loss": 0.5278, + "step": 10139 + }, + { + "epoch": 2.968384074941452, + "grad_norm": 1.0173791646957397, + "learning_rate": 2.5608357818414224e-06, + "loss": 0.5795, + "step": 10140 + }, + { + "epoch": 2.9686768149882905, + "grad_norm": 0.9875310659408569, + "learning_rate": 2.560450820546282e-06, + "loss": 0.5856, + "step": 10141 + }, + { + "epoch": 2.968969555035129, + "grad_norm": 0.9736294746398926, + "learning_rate": 2.560065857816934e-06, + "loss": 0.555, + "step": 10142 + }, + { + "epoch": 2.9692622950819674, + "grad_norm": 1.029537320137024, + "learning_rate": 2.559680893662512e-06, + "loss": 0.5829, + "step": 10143 + }, + { + "epoch": 2.9695550351288054, + "grad_norm": 0.9812964200973511, + "learning_rate": 2.5592959280921502e-06, + "loss": 0.5092, + "step": 10144 + }, + { + "epoch": 2.9698477751756442, + "grad_norm": 0.9556853771209717, + "learning_rate": 2.5589109611149805e-06, + "loss": 0.5372, + "step": 10145 + }, + { + "epoch": 2.9701405152224822, + "grad_norm": 0.9855762720108032, + "learning_rate": 2.558525992740137e-06, + "loss": 0.5456, + "step": 10146 + }, + { + "epoch": 2.970433255269321, + "grad_norm": 0.9653757810592651, + "learning_rate": 2.5581410229767527e-06, + "loss": 0.5283, + "step": 10147 + }, + { + "epoch": 2.970725995316159, + "grad_norm": 0.9545133709907532, + "learning_rate": 2.5577560518339618e-06, + "loss": 0.5454, + "step": 10148 + }, + { + "epoch": 2.9710187353629975, + "grad_norm": 1.0174356698989868, + "learning_rate": 2.5573710793208977e-06, + "loss": 0.5873, + "step": 10149 + }, + { + "epoch": 2.971311475409836, + "grad_norm": 0.9905725717544556, + "learning_rate": 2.556986105446694e-06, + "loss": 0.5903, + "step": 10150 + }, + { + "epoch": 2.9716042154566744, + "grad_norm": 0.9642060399055481, + "learning_rate": 2.556601130220483e-06, + "loss": 0.5676, + "step": 10151 + }, + { + "epoch": 2.971896955503513, + "grad_norm": 1.0196517705917358, + "learning_rate": 2.5562161536513995e-06, + "loss": 0.5425, + "step": 10152 + }, + { + "epoch": 2.9721896955503513, + "grad_norm": 1.0072453022003174, + "learning_rate": 2.555831175748577e-06, + "loss": 0.5425, + "step": 10153 + }, + { + "epoch": 2.9724824355971897, + "grad_norm": 1.0424411296844482, + "learning_rate": 2.5554461965211497e-06, + "loss": 0.5939, + "step": 10154 + }, + { + "epoch": 2.972775175644028, + "grad_norm": 0.9519059658050537, + "learning_rate": 2.55506121597825e-06, + "loss": 0.5705, + "step": 10155 + }, + { + "epoch": 2.9730679156908666, + "grad_norm": 1.0013854503631592, + "learning_rate": 2.5546762341290127e-06, + "loss": 0.6183, + "step": 10156 + }, + { + "epoch": 2.973360655737705, + "grad_norm": 0.9788804650306702, + "learning_rate": 2.5542912509825714e-06, + "loss": 0.6009, + "step": 10157 + }, + { + "epoch": 2.9736533957845435, + "grad_norm": 0.982871949672699, + "learning_rate": 2.553906266548059e-06, + "loss": 0.5723, + "step": 10158 + }, + { + "epoch": 2.973946135831382, + "grad_norm": 0.9497590065002441, + "learning_rate": 2.5535212808346115e-06, + "loss": 0.53, + "step": 10159 + }, + { + "epoch": 2.9742388758782203, + "grad_norm": 0.9524624347686768, + "learning_rate": 2.5531362938513604e-06, + "loss": 0.5617, + "step": 10160 + }, + { + "epoch": 2.9745316159250583, + "grad_norm": 0.956714928150177, + "learning_rate": 2.5527513056074406e-06, + "loss": 0.5458, + "step": 10161 + }, + { + "epoch": 2.974824355971897, + "grad_norm": 1.004198670387268, + "learning_rate": 2.5523663161119855e-06, + "loss": 0.5781, + "step": 10162 + }, + { + "epoch": 2.975117096018735, + "grad_norm": 1.0231038331985474, + "learning_rate": 2.5519813253741304e-06, + "loss": 0.5727, + "step": 10163 + }, + { + "epoch": 2.9754098360655736, + "grad_norm": 0.9572821259498596, + "learning_rate": 2.551596333403008e-06, + "loss": 0.5413, + "step": 10164 + }, + { + "epoch": 2.975702576112412, + "grad_norm": 0.9725139737129211, + "learning_rate": 2.551211340207752e-06, + "loss": 0.5754, + "step": 10165 + }, + { + "epoch": 2.9759953161592505, + "grad_norm": 0.9667413830757141, + "learning_rate": 2.5508263457974984e-06, + "loss": 0.5825, + "step": 10166 + }, + { + "epoch": 2.976288056206089, + "grad_norm": 1.05428147315979, + "learning_rate": 2.550441350181379e-06, + "loss": 0.5734, + "step": 10167 + }, + { + "epoch": 2.9765807962529274, + "grad_norm": 1.017099380493164, + "learning_rate": 2.550056353368529e-06, + "loss": 0.5486, + "step": 10168 + }, + { + "epoch": 2.976873536299766, + "grad_norm": 0.9789232611656189, + "learning_rate": 2.549671355368083e-06, + "loss": 0.5715, + "step": 10169 + }, + { + "epoch": 2.9771662763466042, + "grad_norm": 0.9914844632148743, + "learning_rate": 2.549286356189174e-06, + "loss": 0.5608, + "step": 10170 + }, + { + "epoch": 2.9774590163934427, + "grad_norm": 1.0093212127685547, + "learning_rate": 2.548901355840937e-06, + "loss": 0.6061, + "step": 10171 + }, + { + "epoch": 2.977751756440281, + "grad_norm": 0.956893265247345, + "learning_rate": 2.5485163543325054e-06, + "loss": 0.5756, + "step": 10172 + }, + { + "epoch": 2.9780444964871196, + "grad_norm": 1.1242519617080688, + "learning_rate": 2.5481313516730144e-06, + "loss": 0.6011, + "step": 10173 + }, + { + "epoch": 2.978337236533958, + "grad_norm": 0.9936127662658691, + "learning_rate": 2.547746347871598e-06, + "loss": 0.5978, + "step": 10174 + }, + { + "epoch": 2.9786299765807964, + "grad_norm": 0.9700965285301208, + "learning_rate": 2.54736134293739e-06, + "loss": 0.5581, + "step": 10175 + }, + { + "epoch": 2.9789227166276344, + "grad_norm": 1.0018045902252197, + "learning_rate": 2.546976336879525e-06, + "loss": 0.5717, + "step": 10176 + }, + { + "epoch": 2.9792154566744733, + "grad_norm": 1.023258924484253, + "learning_rate": 2.5465913297071375e-06, + "loss": 0.5538, + "step": 10177 + }, + { + "epoch": 2.9795081967213113, + "grad_norm": 1.0026612281799316, + "learning_rate": 2.546206321429362e-06, + "loss": 0.5927, + "step": 10178 + }, + { + "epoch": 2.9798009367681497, + "grad_norm": 1.1144144535064697, + "learning_rate": 2.545821312055332e-06, + "loss": 0.6179, + "step": 10179 + }, + { + "epoch": 2.980093676814988, + "grad_norm": 1.0298751592636108, + "learning_rate": 2.5454363015941835e-06, + "loss": 0.5765, + "step": 10180 + }, + { + "epoch": 2.9803864168618266, + "grad_norm": 1.001114845275879, + "learning_rate": 2.5450512900550496e-06, + "loss": 0.5648, + "step": 10181 + }, + { + "epoch": 2.980679156908665, + "grad_norm": 0.9549432396888733, + "learning_rate": 2.5446662774470656e-06, + "loss": 0.5763, + "step": 10182 + }, + { + "epoch": 2.9809718969555035, + "grad_norm": 1.0080718994140625, + "learning_rate": 2.5442812637793646e-06, + "loss": 0.5677, + "step": 10183 + }, + { + "epoch": 2.981264637002342, + "grad_norm": 0.9793052673339844, + "learning_rate": 2.5438962490610823e-06, + "loss": 0.5398, + "step": 10184 + }, + { + "epoch": 2.9815573770491803, + "grad_norm": 1.0213853120803833, + "learning_rate": 2.543511233301354e-06, + "loss": 0.5828, + "step": 10185 + }, + { + "epoch": 2.9818501170960188, + "grad_norm": 1.0107041597366333, + "learning_rate": 2.5431262165093125e-06, + "loss": 0.5614, + "step": 10186 + }, + { + "epoch": 2.982142857142857, + "grad_norm": 1.0308723449707031, + "learning_rate": 2.5427411986940935e-06, + "loss": 0.5397, + "step": 10187 + }, + { + "epoch": 2.9824355971896956, + "grad_norm": 1.0443565845489502, + "learning_rate": 2.542356179864831e-06, + "loss": 0.5652, + "step": 10188 + }, + { + "epoch": 2.982728337236534, + "grad_norm": 1.0145683288574219, + "learning_rate": 2.54197116003066e-06, + "loss": 0.548, + "step": 10189 + }, + { + "epoch": 2.9830210772833725, + "grad_norm": 0.9559600949287415, + "learning_rate": 2.5415861392007156e-06, + "loss": 0.5568, + "step": 10190 + }, + { + "epoch": 2.9833138173302105, + "grad_norm": 0.9691932797431946, + "learning_rate": 2.541201117384132e-06, + "loss": 0.5694, + "step": 10191 + }, + { + "epoch": 2.9836065573770494, + "grad_norm": 1.0129698514938354, + "learning_rate": 2.540816094590044e-06, + "loss": 0.5558, + "step": 10192 + }, + { + "epoch": 2.9838992974238874, + "grad_norm": 0.9904354810714722, + "learning_rate": 2.540431070827586e-06, + "loss": 0.5408, + "step": 10193 + }, + { + "epoch": 2.9841920374707263, + "grad_norm": 0.9854150414466858, + "learning_rate": 2.540046046105893e-06, + "loss": 0.5888, + "step": 10194 + }, + { + "epoch": 2.9844847775175642, + "grad_norm": 0.941913366317749, + "learning_rate": 2.5396610204341006e-06, + "loss": 0.5367, + "step": 10195 + }, + { + "epoch": 2.9847775175644027, + "grad_norm": 1.0297328233718872, + "learning_rate": 2.5392759938213425e-06, + "loss": 0.5587, + "step": 10196 + }, + { + "epoch": 2.985070257611241, + "grad_norm": 0.9482709765434265, + "learning_rate": 2.5388909662767536e-06, + "loss": 0.5202, + "step": 10197 + }, + { + "epoch": 2.9853629976580796, + "grad_norm": 0.992103636264801, + "learning_rate": 2.538505937809469e-06, + "loss": 0.5509, + "step": 10198 + }, + { + "epoch": 2.985655737704918, + "grad_norm": 1.007818341255188, + "learning_rate": 2.538120908428624e-06, + "loss": 0.5273, + "step": 10199 + }, + { + "epoch": 2.9859484777517564, + "grad_norm": 1.0285131931304932, + "learning_rate": 2.537735878143353e-06, + "loss": 0.602, + "step": 10200 + }, + { + "epoch": 2.986241217798595, + "grad_norm": 0.9661919474601746, + "learning_rate": 2.5373508469627917e-06, + "loss": 0.5887, + "step": 10201 + }, + { + "epoch": 2.9865339578454333, + "grad_norm": 0.950613796710968, + "learning_rate": 2.5369658148960735e-06, + "loss": 0.5701, + "step": 10202 + }, + { + "epoch": 2.9868266978922717, + "grad_norm": 1.0172460079193115, + "learning_rate": 2.536580781952334e-06, + "loss": 0.5992, + "step": 10203 + }, + { + "epoch": 2.98711943793911, + "grad_norm": 1.010460615158081, + "learning_rate": 2.536195748140709e-06, + "loss": 0.5647, + "step": 10204 + }, + { + "epoch": 2.9874121779859486, + "grad_norm": 1.0193984508514404, + "learning_rate": 2.535810713470333e-06, + "loss": 0.5567, + "step": 10205 + }, + { + "epoch": 2.987704918032787, + "grad_norm": 1.0027703046798706, + "learning_rate": 2.5354256779503407e-06, + "loss": 0.5669, + "step": 10206 + }, + { + "epoch": 2.9879976580796255, + "grad_norm": 1.3034329414367676, + "learning_rate": 2.5350406415898676e-06, + "loss": 0.5534, + "step": 10207 + }, + { + "epoch": 2.9882903981264635, + "grad_norm": 1.0150972604751587, + "learning_rate": 2.534655604398048e-06, + "loss": 0.5221, + "step": 10208 + }, + { + "epoch": 2.9885831381733023, + "grad_norm": 0.9722839593887329, + "learning_rate": 2.534270566384018e-06, + "loss": 0.5704, + "step": 10209 + }, + { + "epoch": 2.9888758782201403, + "grad_norm": 1.0512923002243042, + "learning_rate": 2.5338855275569124e-06, + "loss": 0.5833, + "step": 10210 + }, + { + "epoch": 2.9891686182669788, + "grad_norm": 1.0249638557434082, + "learning_rate": 2.5335004879258656e-06, + "loss": 0.5216, + "step": 10211 + }, + { + "epoch": 2.989461358313817, + "grad_norm": 0.9941748380661011, + "learning_rate": 2.5331154475000135e-06, + "loss": 0.5481, + "step": 10212 + }, + { + "epoch": 2.9897540983606556, + "grad_norm": 0.9580910205841064, + "learning_rate": 2.5327304062884913e-06, + "loss": 0.5392, + "step": 10213 + }, + { + "epoch": 2.990046838407494, + "grad_norm": 0.9658277630805969, + "learning_rate": 2.532345364300433e-06, + "loss": 0.5941, + "step": 10214 + }, + { + "epoch": 2.9903395784543325, + "grad_norm": 0.9911201596260071, + "learning_rate": 2.5319603215449764e-06, + "loss": 0.5793, + "step": 10215 + }, + { + "epoch": 2.990632318501171, + "grad_norm": 0.950863242149353, + "learning_rate": 2.5315752780312538e-06, + "loss": 0.5857, + "step": 10216 + }, + { + "epoch": 2.9909250585480094, + "grad_norm": 0.9816730618476868, + "learning_rate": 2.531190233768402e-06, + "loss": 0.5505, + "step": 10217 + }, + { + "epoch": 2.991217798594848, + "grad_norm": 1.008878231048584, + "learning_rate": 2.530805188765556e-06, + "loss": 0.5532, + "step": 10218 + }, + { + "epoch": 2.9915105386416863, + "grad_norm": 0.9973474740982056, + "learning_rate": 2.5304201430318504e-06, + "loss": 0.5938, + "step": 10219 + }, + { + "epoch": 2.9918032786885247, + "grad_norm": 0.9599733948707581, + "learning_rate": 2.530035096576422e-06, + "loss": 0.5625, + "step": 10220 + }, + { + "epoch": 2.992096018735363, + "grad_norm": 0.9454092979431152, + "learning_rate": 2.5296500494084046e-06, + "loss": 0.4939, + "step": 10221 + }, + { + "epoch": 2.9923887587822016, + "grad_norm": 0.9957608580589294, + "learning_rate": 2.5292650015369343e-06, + "loss": 0.5421, + "step": 10222 + }, + { + "epoch": 2.9926814988290396, + "grad_norm": 1.0264177322387695, + "learning_rate": 2.5288799529711455e-06, + "loss": 0.5478, + "step": 10223 + }, + { + "epoch": 2.9929742388758784, + "grad_norm": 0.9494912624359131, + "learning_rate": 2.528494903720175e-06, + "loss": 0.569, + "step": 10224 + }, + { + "epoch": 2.9932669789227164, + "grad_norm": 0.9076371192932129, + "learning_rate": 2.5281098537931575e-06, + "loss": 0.5198, + "step": 10225 + }, + { + "epoch": 2.9935597189695553, + "grad_norm": 0.9957818984985352, + "learning_rate": 2.527724803199228e-06, + "loss": 0.5609, + "step": 10226 + }, + { + "epoch": 2.9938524590163933, + "grad_norm": 0.9998932480812073, + "learning_rate": 2.527339751947522e-06, + "loss": 0.5488, + "step": 10227 + }, + { + "epoch": 2.9941451990632317, + "grad_norm": 0.9863314032554626, + "learning_rate": 2.5269547000471756e-06, + "loss": 0.5615, + "step": 10228 + }, + { + "epoch": 2.99443793911007, + "grad_norm": 0.9639276266098022, + "learning_rate": 2.526569647507323e-06, + "loss": 0.5468, + "step": 10229 + }, + { + "epoch": 2.9947306791569086, + "grad_norm": 1.008310317993164, + "learning_rate": 2.5261845943371016e-06, + "loss": 0.547, + "step": 10230 + }, + { + "epoch": 2.995023419203747, + "grad_norm": 0.957706868648529, + "learning_rate": 2.5257995405456447e-06, + "loss": 0.5451, + "step": 10231 + }, + { + "epoch": 2.9953161592505855, + "grad_norm": 0.9738366603851318, + "learning_rate": 2.525414486142089e-06, + "loss": 0.5921, + "step": 10232 + }, + { + "epoch": 2.995608899297424, + "grad_norm": 0.9973214268684387, + "learning_rate": 2.52502943113557e-06, + "loss": 0.5896, + "step": 10233 + }, + { + "epoch": 2.9959016393442623, + "grad_norm": 0.9456358551979065, + "learning_rate": 2.524644375535223e-06, + "loss": 0.5213, + "step": 10234 + }, + { + "epoch": 2.996194379391101, + "grad_norm": 0.9891716837882996, + "learning_rate": 2.5242593193501834e-06, + "loss": 0.5893, + "step": 10235 + }, + { + "epoch": 2.996487119437939, + "grad_norm": 0.9880731105804443, + "learning_rate": 2.5238742625895863e-06, + "loss": 0.5725, + "step": 10236 + }, + { + "epoch": 2.9967798594847777, + "grad_norm": 0.9198263883590698, + "learning_rate": 2.5234892052625685e-06, + "loss": 0.5638, + "step": 10237 + }, + { + "epoch": 2.997072599531616, + "grad_norm": 0.9794508218765259, + "learning_rate": 2.523104147378264e-06, + "loss": 0.5406, + "step": 10238 + }, + { + "epoch": 2.9973653395784545, + "grad_norm": 1.0483037233352661, + "learning_rate": 2.52271908894581e-06, + "loss": 0.5244, + "step": 10239 + }, + { + "epoch": 2.9976580796252925, + "grad_norm": 0.9647294878959656, + "learning_rate": 2.522334029974341e-06, + "loss": 0.5558, + "step": 10240 + }, + { + "epoch": 2.9979508196721314, + "grad_norm": 1.060628056526184, + "learning_rate": 2.521948970472992e-06, + "loss": 0.5708, + "step": 10241 + }, + { + "epoch": 2.9982435597189694, + "grad_norm": 0.9826286435127258, + "learning_rate": 2.5215639104509005e-06, + "loss": 0.5529, + "step": 10242 + }, + { + "epoch": 2.998536299765808, + "grad_norm": 0.9928288459777832, + "learning_rate": 2.5211788499172002e-06, + "loss": 0.5806, + "step": 10243 + }, + { + "epoch": 2.9988290398126463, + "grad_norm": 1.0649229288101196, + "learning_rate": 2.520793788881028e-06, + "loss": 0.5837, + "step": 10244 + }, + { + "epoch": 2.9991217798594847, + "grad_norm": 0.9793468117713928, + "learning_rate": 2.52040872735152e-06, + "loss": 0.5803, + "step": 10245 + }, + { + "epoch": 2.999414519906323, + "grad_norm": 0.9808663129806519, + "learning_rate": 2.5200236653378096e-06, + "loss": 0.5571, + "step": 10246 + }, + { + "epoch": 2.9997072599531616, + "grad_norm": 0.9776644110679626, + "learning_rate": 2.5196386028490345e-06, + "loss": 0.5842, + "step": 10247 + }, + { + "epoch": 3.0, + "grad_norm": 0.9497858881950378, + "learning_rate": 2.5192535398943295e-06, + "loss": 0.5834, + "step": 10248 + } + ], + "logging_steps": 1, + "max_steps": 20496, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 3416, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.5886403062283633e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}